github.com/moses-smt/mosesdecoder.git
author     Hieu Hoang <hieu@hoang.co.uk>  2013-05-29 21:16:15 +0400
committer  Hieu Hoang <hieu@hoang.co.uk>  2013-05-29 21:16:15 +0400
commit     6249432407af8730c10bccc7894c0725fcaf5e47 (patch)
tree       3ac1f094b9fdc199b04bc5ef209ce00e3596e37d
parent     59bd7deb4b6b9c4f7b3b7dbb055783528fbc31ca (diff)
beautify
-rw-r--r--  OnDiskPt/Main.cpp  46
-rw-r--r--  OnDiskPt/Main.h  10
-rw-r--r--  OnDiskPt/OnDiskQuery.cpp  38
-rw-r--r--  OnDiskPt/OnDiskQuery.h  17
-rw-r--r--  OnDiskPt/OnDiskWrapper.cpp  8
-rw-r--r--  OnDiskPt/OnDiskWrapper.h  2
-rw-r--r--  OnDiskPt/PhraseNode.cpp  8
-rw-r--r--  OnDiskPt/PhraseNode.h  2
-rw-r--r--  OnDiskPt/TargetPhrase.cpp  45
-rw-r--r--  OnDiskPt/TargetPhrase.h  8
-rw-r--r--  OnDiskPt/TargetPhraseCollection.cpp  12
-rw-r--r--  OnDiskPt/TargetPhraseCollection.h  4
-rw-r--r--  OnDiskPt/Vocab.cpp  2
-rw-r--r--  OnDiskPt/Word.cpp  13
-rw-r--r--  OnDiskPt/Word.h  7
-rw-r--r--  OnDiskPt/queryOnDiskPt.cpp  22
-rw-r--r--  biconcor/Alignment.cpp  11
-rw-r--r--  biconcor/Mismatch.cpp  443
-rw-r--r--  biconcor/Mismatch.h  4
-rw-r--r--  biconcor/PhrasePair.cpp  123
-rw-r--r--  biconcor/PhrasePairCollection.cpp  175
-rw-r--r--  biconcor/SuffixArray.cpp  21
-rw-r--r--  biconcor/TargetCorpus.cpp  13
-rw-r--r--  biconcor/Vocabulary.cpp  3
-rw-r--r--  biconcor/base64.cpp  25
-rw-r--r--  biconcor/biconcor.cpp  11
-rw-r--r--  defer/PhraseDictionaryInterpolated.cpp  269
-rw-r--r--  defer/PhraseDictionaryInterpolated.h  10
-rw-r--r--  defer/PhraseLengthFeatureTest.cpp  8
-rw-r--r--  defer/TargetBigramFeatureTest.cpp  55
-rw-r--r--  mert/BleuScorer.cpp  145
-rw-r--r--  mert/BleuScorer.h  18
-rw-r--r--  mert/BleuScorerTest.cpp  42
-rw-r--r--  mert/CderScorer.cpp  21
-rw-r--r--  mert/CderScorer.h  13
-rw-r--r--  mert/Data.cpp  25
-rw-r--r--  mert/Data.h  20
-rw-r--r--  mert/DataTest.cpp  9
-rw-r--r--  mert/Fdstream.h  72
-rw-r--r--  mert/FeatureArray.cpp  6
-rw-r--r--  mert/FeatureArray.h  44
-rw-r--r--  mert/FeatureData.cpp  8
-rw-r--r--  mert/FeatureData.h  24
-rw-r--r--  mert/FeatureDataIterator.cpp  35
-rw-r--r--  mert/FeatureDataIterator.h  63
-rw-r--r--  mert/FeatureDataTest.cpp  9
-rw-r--r--  mert/FeatureStats.cpp  78
-rw-r--r--  mert/FeatureStats.h  43
-rw-r--r--  mert/FileStream.cpp  12
-rw-r--r--  mert/FileStream.h  8
-rw-r--r--  mert/GzFileBuf.cpp  23
-rw-r--r--  mert/GzFileBuf.h  4
-rw-r--r--  mert/HypPackEnumerator.cpp  79
-rw-r--r--  mert/HypPackEnumerator.h  11
-rw-r--r--  mert/InterpolatedScorer.cpp  24
-rw-r--r--  mert/InterpolatedScorer.h  2
-rw-r--r--  mert/MiraFeatureVector.cpp  28
-rw-r--r--  mert/MiraFeatureVector.h  5
-rw-r--r--  mert/MiraWeightVector.cpp  46
-rw-r--r--  mert/MiraWeightVector.h  8
-rw-r--r--  mert/Ngram.h  59
-rw-r--r--  mert/NgramTest.cpp  9
-rw-r--r--  mert/Optimizer.cpp  13
-rw-r--r--  mert/Optimizer.h  14
-rw-r--r--  mert/OptimizerFactory.cpp  36
-rw-r--r--  mert/OptimizerFactory.h  6
-rw-r--r--  mert/OptimizerFactoryTest.cpp  12
-rw-r--r--  mert/PerScorer.cpp  2
-rw-r--r--  mert/PerScorer.h  6
-rw-r--r--  mert/Permutation.cpp  6
-rw-r--r--  mert/Permutation.h  2
-rw-r--r--  mert/PermutationScorer.cpp  4
-rw-r--r--  mert/PermutationScorer.h  2
-rw-r--r--  mert/Point.cpp  2
-rw-r--r--  mert/Point.h  30
-rw-r--r--  mert/PointTest.cpp  3
-rw-r--r--  mert/PreProcessFilter.cpp  196
-rw-r--r--  mert/PreProcessFilter.h  6
-rw-r--r--  mert/Reference.h  56
-rw-r--r--  mert/ReferenceTest.cpp  15
-rw-r--r--  mert/ScopedVector.h  73
-rw-r--r--  mert/ScoreArray.cpp  9
-rw-r--r--  mert/ScoreArray.h  46
-rw-r--r--  mert/ScoreData.cpp  3
-rw-r--r--  mert/ScoreData.h  16
-rw-r--r--  mert/ScoreDataIterator.cpp  21
-rw-r--r--  mert/ScoreDataIterator.h  41
-rw-r--r--  mert/ScoreStats.cpp  19
-rw-r--r--  mert/ScoreStats.h  30
-rw-r--r--  mert/Scorer.cpp  49
-rw-r--r--  mert/Scorer.h  61
-rw-r--r--  mert/ScorerFactory.cpp  11
-rw-r--r--  mert/ScorerFactory.h  2
-rw-r--r--  mert/SemposOverlapping.cpp  19
-rw-r--r--  mert/SemposOverlapping.h  21
-rw-r--r--  mert/SemposScorer.cpp  60
-rw-r--r--  mert/SemposScorer.h  10
-rw-r--r--  mert/SentenceLevelScorer.cpp  76
-rw-r--r--  mert/Singleton.h  9
-rw-r--r--  mert/SingletonTest.cpp  15
-rw-r--r--  mert/StatisticsBasedScorer.cpp  23
-rw-r--r--  mert/StatisticsBasedScorer.h  10
-rw-r--r--  mert/TerScorer.cpp  4
-rw-r--r--  mert/TerScorer.h  2
-rw-r--r--  mert/Timer.cpp  31
-rw-r--r--  mert/Timer.h  19
-rw-r--r--  mert/TimerTest.cpp  3
-rw-r--r--  mert/Util.cpp  9
-rw-r--r--  mert/Util.h  6
-rw-r--r--  mert/UtilTest.cpp  9
-rw-r--r--  mert/Vocabulary.cpp  48
-rw-r--r--  mert/Vocabulary.h  69
-rw-r--r--  mert/VocabularyTest.cpp  15
-rw-r--r--  mert/evaluator.cpp  133
-rw-r--r--  mert/extractor.cpp  116
-rw-r--r--  mert/kbmira.cpp  71
-rw-r--r--  mert/mert.cpp  170
-rw-r--r--  mert/pro.cpp  44
-rw-r--r--  mert/sentence-bleu.cpp  2
-rw-r--r--  mira/Decoder.cpp  670
-rw-r--r--  mira/Decoder.h  190
-rw-r--r--  mira/Hildreth.cpp  283
-rw-r--r--  mira/Hildreth.h  14
-rw-r--r--  mira/HildrethTest.cpp  1283
-rw-r--r--  mira/HypothesisQueue.cpp  18
-rw-r--r--  mira/HypothesisQueue.h  24
-rw-r--r--  mira/Main.cpp  2317
-rw-r--r--  mira/Main.h  10
-rw-r--r--  mira/MiraOptimiser.cpp  811
-rw-r--r--  mira/Optimiser.h  288
-rw-r--r--  mira/Perceptron.cpp  43
-rw-r--r--  misc/processLexicalTableMin.cpp  57
-rw-r--r--  misc/processPhraseTableMin.cpp  91
-rw-r--r--  misc/queryPhraseTable.cpp  3
-rw-r--r--  misc/queryPhraseTableMin.cpp  28
-rw-r--r--  moses-chart-cmd/IOWrapper.cpp  68
-rw-r--r--  moses-chart-cmd/IOWrapper.h  8
-rw-r--r--  moses-chart-cmd/Main.cpp  18
-rw-r--r--  moses-chart-cmd/Main.h  5
-rw-r--r--  moses-cmd/IOWrapper.cpp  108
-rw-r--r--  moses-cmd/IOWrapper.h  10
-rw-r--r--  moses-cmd/Main.cpp  228
-rw-r--r--  moses-cmd/TranslationAnalysis.cpp  30
-rw-r--r--  moses/AlignmentInfo.cpp  55
-rw-r--r--  moses/AlignmentInfo.h  56
-rw-r--r--  moses/AlignmentInfoCollection.cpp  8
-rw-r--r--  moses/AlignmentInfoCollection.h  16
-rw-r--r--  moses/AlignmentInfoTest.cpp  3
-rw-r--r--  moses/BitmapContainer.cpp  8
-rw-r--r--  moses/ChartCell.cpp  24
-rw-r--r--  moses/ChartCell.h  41
-rw-r--r--  moses/ChartCellCollection.cpp  25
-rw-r--r--  moses/ChartCellCollection.h  68
-rw-r--r--  moses/ChartCellLabel.h  34
-rw-r--r--  moses/ChartCellLabelSet.h  47
-rw-r--r--  moses/ChartHypothesis.cpp  42
-rw-r--r--  moses/ChartHypothesis.h  62
-rw-r--r--  moses/ChartHypothesisCollection.cpp  7
-rw-r--r--  moses/ChartHypothesisCollection.h  6
-rw-r--r--  moses/ChartManager.cpp  117
-rw-r--r--  moses/ChartManager.h  14
-rw-r--r--  moses/ChartParser.cpp  43
-rw-r--r--  moses/ChartParser.h  40
-rw-r--r--  moses/ChartParserCallback.h  16
-rw-r--r--  moses/ChartRuleLookupManager.h  2
-rw-r--r--  moses/ChartTranslationOptionList.cpp  14
-rw-r--r--  moses/ChartTranslationOptionList.h  19
-rw-r--r--  moses/ChartTranslationOptions.cpp  4
-rw-r--r--  moses/ChartTranslationOptions.h  28
-rw-r--r--  moses/ChartTrellisDetour.cpp  14
-rw-r--r--  moses/ChartTrellisDetour.h  18
-rw-r--r--  moses/ChartTrellisDetourQueue.cpp  18
-rw-r--r--  moses/ChartTrellisDetourQueue.h  20
-rw-r--r--  moses/ChartTrellisNode.cpp  18
-rw-r--r--  moses/ChartTrellisNode.h  16
-rw-r--r--  moses/ChartTrellisPath.cpp  14
-rw-r--r--  moses/ChartTrellisPath.h  16
-rw-r--r--  moses/ConfusionNet.h  2
-rw-r--r--  moses/DecodeFeature.cpp  25
-rw-r--r--  moses/DecodeFeature.h  50
-rw-r--r--  moses/DecodeStepTranslation.cpp  6
-rw-r--r--  moses/FF/BleuScoreFeature.cpp  977
-rw-r--r--  moses/FF/BleuScoreFeature.h  203
-rw-r--r--  moses/FF/ChartBasedFeatureContext.cpp  6
-rw-r--r--  moses/FF/ChartBasedFeatureContext.h  12
-rw-r--r--  moses/FF/DistortionScoreProducer.cpp  11
-rw-r--r--  moses/FF/DistortionScoreProducer.h  12
-rw-r--r--  moses/FF/FFState.h  5
-rw-r--r--  moses/FF/FeatureFunction.cpp  17
-rw-r--r--  moses/FF/FeatureFunction.h  34
-rw-r--r--  moses/FF/GlobalLexicalModel.cpp  19
-rw-r--r--  moses/FF/GlobalLexicalModel.h  10
-rw-r--r--  moses/FF/GlobalLexicalModelUnlimited.cpp  453
-rw-r--r--  moses/FF/GlobalLexicalModelUnlimited.h  21
-rw-r--r--  moses/FF/InputFeature.cpp  16
-rw-r--r--  moses/FF/PhraseBasedFeatureContext.cpp  2
-rw-r--r--  moses/FF/PhraseBasedFeatureContext.h  12
-rw-r--r--  moses/FF/PhraseBoundaryFeature.cpp  56
-rw-r--r--  moses/FF/PhraseBoundaryFeature.h  20
-rw-r--r--  moses/FF/PhraseLengthFeature.cpp  11
-rw-r--r--  moses/FF/PhraseLengthFeature.h  9
-rw-r--r--  moses/FF/PhrasePairFeature.cpp  189
-rw-r--r--  moses/FF/PhrasePairFeature.h  60
-rw-r--r--  moses/FF/SourceWordDeletionFeature.cpp  69
-rw-r--r--  moses/FF/SourceWordDeletionFeature.h  17
-rw-r--r--  moses/FF/StatefulFeatureFunction.cpp  4
-rw-r--r--  moses/FF/StatefulFeatureFunction.h  11
-rw-r--r--  moses/FF/StatelessFeatureFunction.cpp  4
-rw-r--r--  moses/FF/StatelessFeatureFunction.h  11
-rw-r--r--  moses/FF/TargetBigramFeature.cpp  19
-rw-r--r--  moses/FF/TargetBigramFeature.h  39
-rw-r--r--  moses/FF/TargetNgramFeature.cpp  472
-rw-r--r--  moses/FF/TargetNgramFeature.h  74
-rw-r--r--  moses/FF/TargetWordInsertionFeature.cpp  51
-rw-r--r--  moses/FF/TargetWordInsertionFeature.h  17
-rw-r--r--  moses/FF/UnknownWordPenaltyProducer.h  7
-rw-r--r--  moses/FF/WordPenaltyProducer.cpp  6
-rw-r--r--  moses/FF/WordPenaltyProducer.h  8
-rw-r--r--  moses/FF/WordTranslationFeature.cpp  619
-rw-r--r--  moses/FF/WordTranslationFeature.h  17
-rw-r--r--  moses/Factor.h  8
-rw-r--r--  moses/FactorCollection.cpp  9
-rw-r--r--  moses/FactorCollection.h  2
-rw-r--r--  moses/FeatureVector.cpp  1361
-rw-r--r--  moses/FeatureVector.h  635
-rw-r--r--  moses/FeatureVectorTest.cpp  107
-rw-r--r--  moses/GenerationDictionary.cpp  5
-rw-r--r--  moses/GenerationDictionary.h  31
-rw-r--r--  moses/Hypothesis.cpp  26
-rw-r--r--  moses/HypothesisStack.h  2
-rw-r--r--  moses/Incremental.cpp  170
-rw-r--r--  moses/Incremental.h  57
-rw-r--r--  moses/InputType.cpp  2
-rw-r--r--  moses/InputType.h  8
-rw-r--r--  moses/LM/Backward.cpp  509
-rw-r--r--  moses/LM/Backward.h  80
-rw-r--r--  moses/LM/BackwardLMState.cpp  12
-rw-r--r--  moses/LM/BackwardLMState.h  20
-rw-r--r--  moses/LM/BackwardTest.cpp  507
-rw-r--r--  moses/LM/Base.cpp  68
-rw-r--r--  moses/LM/Base.h  16
-rw-r--r--  moses/LM/ChartState.h  42
-rw-r--r--  moses/LM/IRST.cpp  34
-rw-r--r--  moses/LM/Implementation.cpp  56
-rw-r--r--  moses/LM/Implementation.h  7
-rw-r--r--  moses/LM/Joint.h  3
-rw-r--r--  moses/LM/Ken.cpp  217
-rw-r--r--  moses/LM/Ken.h  3
-rw-r--r--  moses/LM/LDHT.cpp  551
-rw-r--r--  moses/LM/LDHT.h  3
-rw-r--r--  moses/LM/MultiFactor.h  10
-rw-r--r--  moses/LM/ORLM.cpp  30
-rw-r--r--  moses/LM/ORLM.h  9
-rw-r--r--  moses/LM/ParallelBackoff.cpp  2
-rw-r--r--  moses/LM/Rand.cpp  8
-rw-r--r--  moses/LM/SRI.cpp  31
-rw-r--r--  moses/LM/SingleFactor.cpp  2
-rw-r--r--  moses/LM/SingleFactor.h  42
-rw-r--r--  moses/LexicalReordering.cpp  50
-rw-r--r--  moses/LexicalReordering.h  41
-rw-r--r--  moses/LexicalReorderingState.cpp  2
-rw-r--r--  moses/LexicalReorderingTable.cpp  2
-rw-r--r--  moses/Manager.cpp  247
-rw-r--r--  moses/Manager.h  6
-rw-r--r--  moses/MockHypothesis.cpp  34
-rw-r--r--  moses/MockHypothesis.h  81
-rw-r--r--  moses/OutputCollector.h  22
-rw-r--r--  moses/PCNTools.h  2
-rw-r--r--  moses/PDTAimp.h  14
-rw-r--r--  moses/Parameter.cpp  219
-rw-r--r--  moses/Parameter.h  56
-rw-r--r--  moses/PartialTranslOptColl.h  2
-rw-r--r--  moses/Phrase.cpp  32
-rw-r--r--  moses/Phrase.h  89
-rw-r--r--  moses/PrefixTree.h  2
-rw-r--r--  moses/PrefixTreeMap.h  2
-rw-r--r--  moses/RuleCube.h  28
-rw-r--r--  moses/RuleCubeItem.h  20
-rw-r--r--  moses/RuleCubeQueue.h  12
-rw-r--r--  moses/ScoreComponentCollection.cpp  61
-rw-r--r--  moses/ScoreComponentCollection.h  353
-rw-r--r--  moses/ScoreComponentCollectionTest.cpp  44
-rw-r--r--  moses/SearchNormalBatch.cpp  136
-rw-r--r--  moses/SearchNormalBatch.h  4
-rw-r--r--  moses/Sentence.cpp  3
-rw-r--r--  moses/StaticData.cpp  150
-rw-r--r--  moses/StaticData.h  69
-rw-r--r--  moses/SyntacticLanguageModel.cpp  227
-rw-r--r--  moses/SyntacticLanguageModel.h  47
-rw-r--r--  moses/SyntacticLanguageModelFiles.h  46
-rw-r--r--  moses/SyntacticLanguageModelState.h  190
-rw-r--r--  moses/TargetPhrase.cpp  68
-rw-r--r--  moses/TargetPhrase.h  76
-rw-r--r--  moses/TargetPhraseCollection.cpp  4
-rw-r--r--  moses/TargetPhraseCollection.h  4
-rw-r--r--  moses/Terminal.h  10
-rw-r--r--  moses/ThreadPool.h  15
-rw-r--r--  moses/Timer.h  2
-rw-r--r--  moses/TranslationModel/BilingualDynSuffixArray.cpp  887
-rw-r--r--  moses/TranslationModel/BilingualDynSuffixArray.h  194
-rw-r--r--  moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.cpp  6
-rw-r--r--  moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.h  10
-rw-r--r--  moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.cpp  28
-rw-r--r--  moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.h  4
-rw-r--r--  moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp  28
-rw-r--r--  moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h  8
-rw-r--r--  moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.h  4
-rw-r--r--  moses/TranslationModel/CYKPlusParser/DotChart.h  38
-rw-r--r--  moses/TranslationModel/CYKPlusParser/DotChartInMemory.cpp  6
-rw-r--r--  moses/TranslationModel/CYKPlusParser/DotChartInMemory.h  18
-rw-r--r--  moses/TranslationModel/CYKPlusParser/DotChartOnDisk.h  28
-rw-r--r--  moses/TranslationModel/CompactPT/BlockHashIndex.cpp  178
-rw-r--r--  moses/TranslationModel/CompactPT/BlockHashIndex.h  285
-rw-r--r--  moses/TranslationModel/CompactPT/CanonicalHuffman.h  599
-rw-r--r--  moses/TranslationModel/CompactPT/CmphStringVectorAdapter.cpp  168
-rw-r--r--  moses/TranslationModel/CompactPT/CmphStringVectorAdapter.h  171
-rw-r--r--  moses/TranslationModel/CompactPT/ConsistentPhrases.h  197
-rw-r--r--  moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp  114
-rw-r--r--  moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.h  123
-rw-r--r--  moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp  267
-rw-r--r--  moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h  230
-rw-r--r--  moses/TranslationModel/CompactPT/ListCoders.h  603
-rw-r--r--  moses/TranslationModel/CompactPT/MmapAllocator.h  358
-rw-r--r--  moses/TranslationModel/CompactPT/MonotonicVector.h  399
-rw-r--r--  moses/TranslationModel/CompactPT/MurmurHash3.cpp  248
-rw-r--r--  moses/TranslationModel/CompactPT/PackedArray.h  320
-rw-r--r--  moses/TranslationModel/CompactPT/PhraseDecoder.cpp  323
-rw-r--r--  moses/TranslationModel/CompactPT/PhraseDecoder.h  214
-rw-r--r--  moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp  133
-rw-r--r--  moses/TranslationModel/CompactPT/PhraseDictionaryCompact.h  66
-rw-r--r--  moses/TranslationModel/CompactPT/PhraseTableCreator.cpp  785
-rw-r--r--  moses/TranslationModel/CompactPT/PhraseTableCreator.h  685
-rw-r--r--  moses/TranslationModel/CompactPT/StringVector.h  654
-rw-r--r--  moses/TranslationModel/CompactPT/TargetPhraseCollectionCache.h  238
-rw-r--r--  moses/TranslationModel/CompactPT/ThrowingFwrite.cpp  45
-rw-r--r--  moses/TranslationModel/CompactPT/ThrowingFwrite.h  42
-rw-r--r--  moses/TranslationModel/DynSAInclude/RandLMCache.h  321
-rw-r--r--  moses/TranslationModel/DynSAInclude/RandLMFilter.h  557
-rw-r--r--  moses/TranslationModel/DynSAInclude/hash.h  301
-rw-r--r--  moses/TranslationModel/DynSAInclude/onlineRLM.h  247
-rw-r--r--  moses/TranslationModel/DynSAInclude/params.cpp  122
-rw-r--r--  moses/TranslationModel/DynSAInclude/params.h  20
-rw-r--r--  moses/TranslationModel/DynSAInclude/perfectHash.h  173
-rw-r--r--  moses/TranslationModel/DynSAInclude/quantizer.h  25
-rw-r--r--  moses/TranslationModel/DynSAInclude/vocab.cpp  5
-rw-r--r--  moses/TranslationModel/DynSAInclude/vocab.h  2
-rw-r--r--  moses/TranslationModel/DynSuffixArray.cpp  14
-rw-r--r--  moses/TranslationModel/PhraseDictionary.cpp  17
-rw-r--r--  moses/TranslationModel/PhraseDictionary.h  4
-rw-r--r--  moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp  4
-rw-r--r--  moses/TranslationModel/PhraseDictionaryMemory.cpp  17
-rw-r--r--  moses/TranslationModel/PhraseDictionaryMemory.h  22
-rw-r--r--  moses/TranslationModel/PhraseDictionaryMultiModel.cpp  314
-rw-r--r--  moses/TranslationModel/PhraseDictionaryMultiModel.h  49
-rw-r--r--  moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp  477
-rw-r--r--  moses/TranslationModel/PhraseDictionaryMultiModelCounts.h  70
-rw-r--r--  moses/TranslationModel/PhraseDictionaryNodeMemory.cpp  11
-rw-r--r--  moses/TranslationModel/PhraseDictionaryNodeMemory.h  6
-rw-r--r--  moses/TranslationModel/PhraseDictionaryTree.cpp  13
-rw-r--r--  moses/TranslationModel/PhraseDictionaryTree.h  5
-rw-r--r--  moses/TranslationModel/RuleTable/Loader.h  18
-rw-r--r--  moses/TranslationModel/RuleTable/LoaderCompact.cpp  49
-rw-r--r--  moses/TranslationModel/RuleTable/LoaderCompact.h  13
-rw-r--r--  moses/TranslationModel/RuleTable/LoaderFactory.cpp  24
-rw-r--r--  moses/TranslationModel/RuleTable/LoaderFactory.h  8
-rw-r--r--  moses/TranslationModel/RuleTable/LoaderHiero.cpp  21
-rw-r--r--  moses/TranslationModel/RuleTable/LoaderHiero.h  3
-rw-r--r--  moses/TranslationModel/RuleTable/LoaderStandard.cpp  78
-rw-r--r--  moses/TranslationModel/RuleTable/LoaderStandard.h  8
-rw-r--r--  moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.cpp  14
-rw-r--r--  moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.h  16
-rw-r--r--  moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp  588
-rw-r--r--  moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.h  119
-rw-r--r--  moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp  2
-rw-r--r--  moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.h  2
-rw-r--r--  moses/TranslationModel/RuleTable/Trie.cpp  8
-rw-r--r--  moses/TranslationModel/RuleTable/Trie.h  21
-rw-r--r--  moses/TranslationModel/RuleTable/UTrie.cpp  6
-rw-r--r--  moses/TranslationModel/RuleTable/UTrie.h  20
-rw-r--r--  moses/TranslationModel/RuleTable/UTrieNode.cpp  8
-rw-r--r--  moses/TranslationModel/RuleTable/UTrieNode.h  51
-rw-r--r--  moses/TranslationModel/Scope3Parser/ApplicableRuleTrie.cpp  8
-rw-r--r--  moses/TranslationModel/Scope3Parser/ApplicableRuleTrie.h  13
-rw-r--r--  moses/TranslationModel/Scope3Parser/IntermediateVarSpanNode.h  13
-rw-r--r--  moses/TranslationModel/Scope3Parser/Parser.cpp  22
-rw-r--r--  moses/TranslationModel/Scope3Parser/Parser.h  47
-rw-r--r--  moses/TranslationModel/Scope3Parser/SentenceMap.h  6
-rw-r--r--  moses/TranslationModel/Scope3Parser/StackLatticeBuilder.cpp  16
-rw-r--r--  moses/TranslationModel/Scope3Parser/StackLatticeBuilder.h  2
-rw-r--r--  moses/TranslationModel/Scope3Parser/StackLatticeSearcher.h  20
-rw-r--r--  moses/TranslationModel/Scope3Parser/VarSpanNode.h  18
-rw-r--r--  moses/TranslationModel/Scope3Parser/VarSpanTrieBuilder.cpp  2
-rw-r--r--  moses/TranslationModel/Scope3Parser/VarSpanTrieBuilder.h  7
-rw-r--r--  moses/TranslationModel/fuzzy-match/Alignments.cpp  21
-rw-r--r--  moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp  1853
-rw-r--r--  moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h  23
-rw-r--r--  moses/TranslationModel/fuzzy-match/Match.h  21
-rw-r--r--  moses/TranslationModel/fuzzy-match/SentenceAlignment.h  16
-rw-r--r--  moses/TranslationModel/fuzzy-match/SuffixArray.cpp  360
-rw-r--r--  moses/TranslationModel/fuzzy-match/SuffixArray.h  84
-rw-r--r--  moses/TranslationModel/fuzzy-match/Vocabulary.cpp  19
-rw-r--r--  moses/TranslationModel/fuzzy-match/Vocabulary.h  12
-rw-r--r--  moses/TranslationModel/fuzzy-match/create_xml.cpp  25
-rw-r--r--  moses/TranslationOption.cpp  8
-rw-r--r--  moses/TranslationOption.h  12
-rw-r--r--  moses/TranslationOptionCollection.cpp  155
-rw-r--r--  moses/TranslationOptionCollection.h  2
-rw-r--r--  moses/TranslationOptionCollectionConfusionNet.cpp  4
-rw-r--r--  moses/TreeInput.cpp  9
-rw-r--r--  moses/TrellisPath.cpp  7
-rw-r--r--  moses/TrellisPath.h  60
-rw-r--r--  moses/TypeDef.h  5
-rw-r--r--  moses/Util.cpp  6
-rw-r--r--  moses/Util.h  40
-rw-r--r--  moses/Word.cpp  3
-rw-r--r--  moses/Word.h  5
-rw-r--r--  moses/XmlOption.cpp  10
-rw-r--r--  moses/XmlOption.h  4
-rw-r--r--  phrase-extract/AlignmentPhrase.h  2
-rw-r--r--  phrase-extract/ExtractedRule.cpp  9
-rw-r--r--  phrase-extract/ExtractedRule.h  9
-rw-r--r--  phrase-extract/Hole.h  2
-rw-r--r--  phrase-extract/HoleCollection.cpp  2
-rw-r--r--  phrase-extract/OutputFileStream.cpp  8
-rw-r--r--  phrase-extract/PhraseAlignment.cpp  51
-rw-r--r--  phrase-extract/PhraseAlignment.h  33
-rw-r--r--  phrase-extract/PhraseExtractionOptions.h  203
-rw-r--r--  phrase-extract/RuleExtractionOptions.h  2
-rw-r--r--  phrase-extract/ScoreFeature.cpp  129
-rw-r--r--  phrase-extract/ScoreFeature.h  110
-rw-r--r--  phrase-extract/ScoreFeatureTest.cpp  30
-rw-r--r--  phrase-extract/SentenceAlignment.cpp  12
-rw-r--r--  phrase-extract/SentenceAlignment.h  2
-rw-r--r--  phrase-extract/XmlTree.cpp  2
-rw-r--r--  phrase-extract/consolidate-direct-main.cpp  41
-rw-r--r--  phrase-extract/consolidate-main.cpp  71
-rw-r--r--  phrase-extract/consolidate-reverse-main.cpp  5
-rw-r--r--  phrase-extract/domain.cpp  60
-rw-r--r--  phrase-extract/domain.h  134
-rw-r--r--  phrase-extract/extract-ghkm/Alignment.cpp  12
-rw-r--r--  phrase-extract/extract-ghkm/Alignment.h  12
-rw-r--r--  phrase-extract/extract-ghkm/AlignmentGraph.cpp  18
-rw-r--r--  phrase-extract/extract-ghkm/AlignmentGraph.h  28
-rw-r--r--  phrase-extract/extract-ghkm/ComposedRule.cpp  34
-rw-r--r--  phrase-extract/extract-ghkm/ComposedRule.h  16
-rw-r--r--  phrase-extract/extract-ghkm/Exception.h  20
-rw-r--r--  phrase-extract/extract-ghkm/ExtractGHKM.cpp  144
-rw-r--r--  phrase-extract/extract-ghkm/ExtractGHKM.h  22
-rw-r--r--  phrase-extract/extract-ghkm/Main.cpp  6
-rw-r--r--  phrase-extract/extract-ghkm/Node.cpp  12
-rw-r--r--  phrase-extract/extract-ghkm/Node.h  96
-rw-r--r--  phrase-extract/extract-ghkm/Options.h  40
-rw-r--r--  phrase-extract/extract-ghkm/ParseTree.cpp  12
-rw-r--r--  phrase-extract/extract-ghkm/ParseTree.h  42
-rw-r--r--  phrase-extract/extract-ghkm/ScfgRule.cpp  18
-rw-r--r--  phrase-extract/extract-ghkm/ScfgRule.h  55
-rw-r--r--  phrase-extract/extract-ghkm/ScfgRuleWriter.cpp  20
-rw-r--r--  phrase-extract/extract-ghkm/ScfgRuleWriter.h  22
-rw-r--r--  phrase-extract/extract-ghkm/Span.cpp  12
-rw-r--r--  phrase-extract/extract-ghkm/Span.h  12
-rw-r--r--  phrase-extract/extract-ghkm/Subgraph.cpp  12
-rw-r--r--  phrase-extract/extract-ghkm/Subgraph.h  67
-rw-r--r--  phrase-extract/extract-ghkm/XmlTreeParser.cpp  20
-rw-r--r--  phrase-extract/extract-ghkm/XmlTreeParser.h  19
-rw-r--r--  phrase-extract/extract-lex-main.cpp  74
-rw-r--r--  phrase-extract/extract-lex.h  46
-rw-r--r--  phrase-extract/extract-main.cpp  122
-rw-r--r--  phrase-extract/extract-rules-main.cpp  84
-rw-r--r--  phrase-extract/lexical-reordering/reordering_classes.cpp  2
-rw-r--r--  phrase-extract/lexical-reordering/score.cpp  21
-rw-r--r--  phrase-extract/pcfg-common/exception.h  23
-rw-r--r--  phrase-extract/pcfg-common/numbered_set.h  55
-rw-r--r--  phrase-extract/pcfg-common/pcfg.h  35
-rw-r--r--  phrase-extract/pcfg-common/pcfg_tree.h  37
-rw-r--r--  phrase-extract/pcfg-common/syntax_tree.h  81
-rw-r--r--  phrase-extract/pcfg-common/tool.h  25
-rw-r--r--  phrase-extract/pcfg-common/typedef.h  12
-rw-r--r--  phrase-extract/pcfg-common/xml_tree_parser.h  19
-rw-r--r--  phrase-extract/pcfg-common/xml_tree_writer.h  30
-rw-r--r--  phrase-extract/pcfg-extract/options.h  12
-rw-r--r--  phrase-extract/pcfg-extract/pcfg_extract.h  19
-rw-r--r--  phrase-extract/pcfg-extract/rule_collection.h  35
-rw-r--r--  phrase-extract/pcfg-extract/rule_extractor.h  19
-rw-r--r--  phrase-extract/pcfg-score/options.h  12
-rw-r--r--  phrase-extract/pcfg-score/pcfg_score.h  19
-rw-r--r--  phrase-extract/pcfg-score/tree_scorer.h  19
-rw-r--r--  phrase-extract/score-main.cpp  251
-rw-r--r--  phrase-extract/score.h  2
-rw-r--r--  phrase-extract/tables-core.cpp  7
-rwxr-xr-x  scripts/other/beautify.perl  51
-rw-r--r--  symal/symal.cpp  2
-rw-r--r--  util/double-conversion/bignum-dtoa.h  3
-rw-r--r--  util/double-conversion/bignum.h  18
-rw-r--r--  util/double-conversion/cached-powers.h  18
-rw-r--r--  util/double-conversion/diy-fp.h  26
-rw-r--r--  util/double-conversion/double-conversion.h  49
-rw-r--r--  util/double-conversion/fast-dtoa.h  3
-rw-r--r--  util/double-conversion/fixed-dtoa.h  3
-rw-r--r--  util/double-conversion/ieee.h  59
-rw-r--r--  util/double-conversion/strtod.h  3
-rw-r--r--  util/double-conversion/utils.h  70
501 files changed, 21014 insertions, 20127 deletions
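
The hunks that follow are a formatting-only pass, in line with the commit message "beautify" (note the repository's own scripts/other/beautify.perl among the files listed above): code is re-indented to two spaces, control-flow braces are attached K&R-style, and trailing whitespace is stripped. As a minimal, hypothetical C++ sketch of the before/after pattern repeated throughout the diff (illustrative only, not verbatim from the repository):

#include <cstddef>

// Hypothetical fragment for illustration. Before beautification, the same
// logic was typically written with Allman braces and wider indentation:
//
//   if (nodes[pos] == NULL)
//   {
//       break;
//   }
//
// After beautification it follows K&R style with two-space indents:
std::size_t CountUntilNull(const int *const *nodes, std::size_t size)
{
  std::size_t pos = 0;
  for (; pos < size; ++pos) {
    if (nodes[pos] == NULL) {
      break;  // brace cuddled on the same line, two-space indentation
    }
  }
  return pos;
}

The same shift can be seen directly in, e.g., the OnDiskPt/OnDiskQuery.cpp hunk below, where function-opening braces stay on their own line but if/for braces are cuddled.
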
diff --git a/OnDiskPt/Main.cpp b/OnDiskPt/Main.cpp
index 5d4e0be8d..c3cda2a75 100644
--- a/OnDiskPt/Main.cpp
+++ b/OnDiskPt/Main.cpp
@@ -50,14 +50,14 @@ int main (int argc, char * const argv[])
}
int numSourceFactors = Moses::Scan<int>(argv[1])
- , numTargetFactors = Moses::Scan<int>(argv[2])
- , numScores = Moses::Scan<int>(argv[3])
- , tableLimit = Moses::Scan<int>(argv[4]);
+ , numTargetFactors = Moses::Scan<int>(argv[2])
+ , numScores = Moses::Scan<int>(argv[3])
+ , tableLimit = Moses::Scan<int>(argv[4]);
TargetPhraseCollection::s_sortScoreInd = Moses::Scan<int>(argv[5]);
assert(TargetPhraseCollection::s_sortScoreInd < numScores);
-
+
const string filePath = argv[6]
- ,destPath = argv[7];
+ ,destPath = argv[7];
Moses::InputFileStream inStream(filePath);
@@ -128,10 +128,10 @@ OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhr
} else {
switch (stage) {
case 0: {
- WordPtr w = Tokenize(sourcePhrase, tok, true, true, onDiskWrapper);
- if (w != NULL)
- out->AddWord(w);
-
+ WordPtr w = Tokenize(sourcePhrase, tok, true, true, onDiskWrapper);
+ if (w != NULL)
+ out->AddWord(w);
+
break;
}
case 1: {
@@ -146,19 +146,19 @@ OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhr
}
case 3: {
//targetPhrase.Create1AlignFromString(tok);
- targetPhrase.CreateAlignFromString(tok);
+ targetPhrase.CreateAlignFromString(tok);
break;
}
case 4:
++stage;
break;
- /* case 5: {
- // count info. Only store the 2nd one
- float val = Moses::Scan<float>(tok);
- misc[0] = val;
- ++stage;
- break;
- }*/
+ /* case 5: {
+ // count info. Only store the 2nd one
+ float val = Moses::Scan<float>(tok);
+ misc[0] = val;
+ ++stage;
+ break;
+ }*/
case 5: {
// count info. Only store the 2nd one
//float val = Moses::Scan<float>(tok);
@@ -167,12 +167,12 @@ OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhr
break;
}
case 6: {
- // store only the 3rd one (rule count)
+ // store only the 3rd one (rule count)
float val = Moses::Scan<float>(tok);
misc[0] = val;
++stage;
break;
- }
+ }
default:
cerr << "ERROR in line " << line << endl;
assert(false);
@@ -189,8 +189,8 @@ OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhr
} // Tokenize()
OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
- , const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
- , OnDiskPt::OnDiskWrapper &onDiskWrapper)
+ , const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
+ , OnDiskPt::OnDiskWrapper &onDiskWrapper)
{
bool nonTerm = false;
@@ -218,7 +218,7 @@ OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
if (addSourceNonTerm) {
WordPtr word(new Word());
word->CreateFromString(wordStr, onDiskWrapper.GetVocab());
- phrase.AddWord(word);
+ phrase.AddWord(word);
}
wordStr = token.substr(splitPos, tokSize - splitPos);
@@ -237,7 +237,7 @@ OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
phrase.AddWord(word);
out = word;
}
-
+
return out;
}
diff --git a/OnDiskPt/Main.h b/OnDiskPt/Main.h
index b79827589..5c7efa43c 100644
--- a/OnDiskPt/Main.h
+++ b/OnDiskPt/Main.h
@@ -26,12 +26,12 @@ typedef std::pair<size_t, size_t> AlignPair;
typedef std::vector<AlignPair> AlignType;
OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
- , const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
- , OnDiskPt::OnDiskWrapper &onDiskWrapper);
+ , const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
+ , OnDiskPt::OnDiskWrapper &onDiskWrapper);
OnDiskPt::PhrasePtr Tokenize(OnDiskPt::SourcePhrase &sourcePhrase, OnDiskPt::TargetPhrase &targetPhrase
- , char *line, OnDiskPt::OnDiskWrapper &onDiskWrapper
- , int numScores
- , std::vector<float> &misc);
+ , char *line, OnDiskPt::OnDiskWrapper &onDiskWrapper
+ , int numScores
+ , std::vector<float> &misc);
void InsertTargetNonTerminals(std::vector<std::string> &sourceToks, const std::vector<std::string> &targetToks, const AlignType &alignments);
void SortAlign(AlignType &alignments);
diff --git a/OnDiskPt/OnDiskQuery.cpp b/OnDiskPt/OnDiskQuery.cpp
index c39697d04..2cc7380db 100644
--- a/OnDiskPt/OnDiskQuery.cpp
+++ b/OnDiskPt/OnDiskQuery.cpp
@@ -3,10 +3,10 @@
namespace OnDiskPt
{
-void OnDiskQuery::Tokenize(Phrase &phrase,
- const std::string &token,
- bool addSourceNonTerm,
- bool addTargetNonTerm)
+void OnDiskQuery::Tokenize(Phrase &phrase,
+ const std::string &token,
+ bool addSourceNonTerm,
+ bool addTargetNonTerm)
{
bool nonTerm = false;
size_t tokSize = token.size();
@@ -50,13 +50,13 @@ void OnDiskQuery::Tokenize(Phrase &phrase,
phrase.AddWord(word);
}
}
-
+
SourcePhrase OnDiskQuery::Tokenize(const std::vector<std::string>& tokens)
{
SourcePhrase sourcePhrase;
- if (tokens.size() > 0){
+ if (tokens.size() > 0) {
std::vector<std::string>::const_iterator token = tokens.begin();
- for (; token + 1 != tokens.end(); ++token){
+ for (; token + 1 != tokens.end(); ++token) {
Tokenize(sourcePhrase, *token, true, true);
}
// last position. LHS non-term
@@ -64,22 +64,20 @@ SourcePhrase OnDiskQuery::Tokenize(const std::vector<std::string>& tokens)
}
return sourcePhrase;
}
-
+
const PhraseNode* OnDiskQuery::Query(const SourcePhrase& sourcePhrase)
{
- const PhraseNode *node = &m_wrapper.GetRootSourceNode();
- assert(node);
-
- for (size_t pos = 0; pos < sourcePhrase.GetSize(); ++pos)
- {
- const Word &word = sourcePhrase.GetWord(pos);
- node = node->GetChild(word, m_wrapper);
- if (node == NULL)
- {
- break;
- }
+ const PhraseNode *node = &m_wrapper.GetRootSourceNode();
+ assert(node);
+
+ for (size_t pos = 0; pos < sourcePhrase.GetSize(); ++pos) {
+ const Word &word = sourcePhrase.GetWord(pos);
+ node = node->GetChild(word, m_wrapper);
+ if (node == NULL) {
+ break;
}
- return node;
+ }
+ return node;
}
}
diff --git a/OnDiskPt/OnDiskQuery.h b/OnDiskPt/OnDiskQuery.h
index 679f545fa..233603c6c 100644
--- a/OnDiskPt/OnDiskQuery.h
+++ b/OnDiskPt/OnDiskQuery.h
@@ -18,22 +18,21 @@ private:
public:
- OnDiskQuery(OnDiskWrapper &wrapper):m_wrapper(wrapper){}
+ OnDiskQuery(OnDiskWrapper &wrapper):m_wrapper(wrapper) {}
+
+ void Tokenize(Phrase &phrase,
+ const std::string &token,
+ bool addSourceNonTerm,
+ bool addTargetNonTerm);
- void Tokenize(Phrase &phrase,
- const std::string &token,
- bool addSourceNonTerm,
- bool addTargetNonTerm);
-
SourcePhrase Tokenize(const std::vector<std::string>& tokens);
const PhraseNode *Query(const SourcePhrase& sourcePhrase);
- inline const PhraseNode *Query(const std::vector<std::string>& tokens)
- {
+ inline const PhraseNode *Query(const std::vector<std::string>& tokens) {
return Query(Tokenize(tokens));
}
-
+
};
diff --git a/OnDiskPt/OnDiskWrapper.cpp b/OnDiskPt/OnDiskWrapper.cpp
index 3a1773c0a..8f90862be 100644
--- a/OnDiskPt/OnDiskWrapper.cpp
+++ b/OnDiskPt/OnDiskWrapper.cpp
@@ -204,16 +204,16 @@ Word *OnDiskWrapper::ConvertFromMoses(Moses::FactorDirection /* direction */
Word *newWord = new Word(isNonTerminal);
stringstream strme;
- size_t factorType = factorsVec[0];
+ size_t factorType = factorsVec[0];
const Moses::Factor *factor = origWord.GetFactor(factorType);
- CHECK(factor);
+ CHECK(factor);
strme << factor->GetString();
for (size_t ind = 1 ; ind < factorsVec.size() ; ++ind) {
size_t factorType = factorsVec[ind];
const Moses::Factor *factor = origWord.GetFactor(factorType);
- if (factor == NULL)
- { // can have less factors than factorType.size()
+ if (factor == NULL) {
+ // can have less factors than factorType.size()
break;
}
CHECK(factor);
diff --git a/OnDiskPt/OnDiskWrapper.h b/OnDiskPt/OnDiskWrapper.h
index f763194c1..8b786d346 100644
--- a/OnDiskPt/OnDiskWrapper.h
+++ b/OnDiskPt/OnDiskWrapper.h
@@ -28,7 +28,7 @@ namespace OnDiskPt
{
const float DEFAULT_COUNT = 66666;
-/** Global class with misc information need to create and use the on-disk rule table.
+/** Global class with misc information need to create and use the on-disk rule table.
* 1 object of this class should be instantiated per rule table.
* Currently only hierarchical/syntax models use this, but can & should be used with pb models too
*/
diff --git a/OnDiskPt/PhraseNode.cpp b/OnDiskPt/PhraseNode.cpp
index c3f2ebdc4..c259aa077 100644
--- a/OnDiskPt/PhraseNode.cpp
+++ b/OnDiskPt/PhraseNode.cpp
@@ -38,7 +38,7 @@ size_t PhraseNode::GetNodeSize(size_t numChildren, size_t wordSize, size_t count
}
PhraseNode::PhraseNode()
- : m_value(0)
+ : m_value(0)
,m_currChild(NULL)
,m_saved(false)
,m_memLoad(NULL)
@@ -58,7 +58,7 @@ PhraseNode::PhraseNode(UINT64 filePos, OnDiskWrapper &onDiskWrapper)
CHECK(filePos == (UINT64)file.tellg());
file.read((char*) &m_numChildrenLoad, sizeof(UINT64));
-
+
size_t memAlloc = GetNodeSize(m_numChildrenLoad, onDiskWrapper.GetSourceWordSize(), countSize);
m_memLoad = (char*) malloc(memAlloc);
@@ -168,7 +168,7 @@ void PhraseNode::AddTargetPhrase(const SourcePhrase &sourcePhrase, TargetPhrase
void PhraseNode::AddTargetPhrase(size_t pos, const SourcePhrase &sourcePhrase
, TargetPhrase *targetPhrase, OnDiskWrapper &onDiskWrapper
, size_t tableLimit, const std::vector<float> &counts, OnDiskPt::PhrasePtr spShort)
-{
+{
size_t phraseSize = sourcePhrase.GetSize();
if (pos < phraseSize) {
const Word &word = sourcePhrase.GetWord(pos);
@@ -185,7 +185,7 @@ void PhraseNode::AddTargetPhrase(size_t pos, const SourcePhrase &sourcePhrase
m_currChild = &node;
}
- // keep searching for target phrase node..
+ // keep searching for target phrase node..
node.AddTargetPhrase(pos + 1, sourcePhrase, targetPhrase, onDiskWrapper, tableLimit, counts, spShort);
} else {
// drilled down to the right node
diff --git a/OnDiskPt/PhraseNode.h b/OnDiskPt/PhraseNode.h
index fbd20ce36..6b629a401 100644
--- a/OnDiskPt/PhraseNode.h
+++ b/OnDiskPt/PhraseNode.h
@@ -53,7 +53,7 @@ protected:
void AddTargetPhrase(size_t pos, const SourcePhrase &sourcePhrase
, TargetPhrase *targetPhrase, OnDiskWrapper &onDiskWrapper
, size_t tableLimit, const std::vector<float> &counts, OnDiskPt::PhrasePtr spShort);
- size_t ReadChild(Word &wordFound, UINT64 &childFilePos, const char *mem) const;
+ size_t ReadChild(Word &wordFound, UINT64 &childFilePos, const char *mem) const;
void GetChild(Word &wordFound, UINT64 &childFilePos, size_t ind, OnDiskWrapper &onDiskWrapper) const;
public:
diff --git a/OnDiskPt/TargetPhrase.cpp b/OnDiskPt/TargetPhrase.cpp
index 6ca2ef5f9..2e3e3511b 100644
--- a/OnDiskPt/TargetPhrase.cpp
+++ b/OnDiskPt/TargetPhrase.cpp
@@ -64,13 +64,13 @@ void TargetPhrase::Create1AlignFromString(const std::string &align1Str)
void TargetPhrase::CreateAlignFromString(const std::string &alignStr)
{
- vector<std::string> alignPairs;
- boost::split(alignPairs, alignStr, boost::is_any_of("\t "));
- for (size_t i = 0; i < alignPairs.size(); ++i) {
- vector<size_t> alignPoints;
- Moses::Tokenize<size_t>(alignPoints, alignPairs[i], "-");
- m_align.push_back(pair<size_t, size_t>(alignPoints[0], alignPoints[1]) );
- }
+ vector<std::string> alignPairs;
+ boost::split(alignPairs, alignStr, boost::is_any_of("\t "));
+ for (size_t i = 0; i < alignPairs.size(); ++i) {
+ vector<size_t> alignPoints;
+ Moses::Tokenize<size_t>(alignPoints, alignPairs[i], "-");
+ m_align.push_back(pair<size_t, size_t>(alignPoints[0], alignPoints[1]) );
+ }
}
@@ -97,16 +97,16 @@ char *TargetPhrase::WriteToMemory(OnDiskWrapper &onDiskWrapper, size_t &memUsed)
{
size_t phraseSize = GetSize();
size_t targetWordSize = onDiskWrapper.GetTargetWordSize();
-
+
const PhrasePtr sp = GetSourcePhrase();
size_t spSize = sp->GetSize();
size_t sourceWordSize = onDiskWrapper.GetSourceWordSize();
-
+
size_t memNeeded = sizeof(UINT64) // num of words
+ targetWordSize * phraseSize // actual words. lhs as last words
- + sizeof(UINT64) // num source words
- + sourceWordSize * spSize; // actual source words
-
+ + sizeof(UINT64) // num source words
+ + sourceWordSize * spSize; // actual source words
+
memUsed = 0;
UINT64 *mem = (UINT64*) malloc(memNeeded);
@@ -125,13 +125,13 @@ char *TargetPhrase::WriteToMemory(OnDiskWrapper &onDiskWrapper, size_t &memUsed)
char *currPtr = (char*)mem + memUsed;
UINT64 *memTmp = (UINT64*) currPtr;
memTmp[0] = spSize;
- memUsed += sizeof(UINT64);
+ memUsed += sizeof(UINT64);
for (size_t pos = 0; pos < spSize; ++pos) {
const Word &word = sp->GetWord(pos);
char *currPtr = (char*)mem + memUsed;
memUsed += word.WriteToMemory((char*) currPtr);
}
-
+
CHECK(memUsed == memNeeded);
return (char *) mem;
}
@@ -174,7 +174,7 @@ char *TargetPhrase::WriteOtherInfoToMemory(OnDiskWrapper &onDiskWrapper, size_t
// phrase id
memcpy(mem, &m_filePos, sizeof(UINT64));
memUsed += sizeof(UINT64);
-
+
// align
size_t tmp = WriteAlignToMemory(mem + memUsed);
memUsed += tmp;
@@ -223,7 +223,7 @@ size_t TargetPhrase::WriteScoresToMemory(char *mem) const
}
-Moses::TargetPhrase *TargetPhrase::ConvertToMoses(const std::vector<Moses::FactorType> & inputFactors
+Moses::TargetPhrase *TargetPhrase::ConvertToMoses(const std::vector<Moses::FactorType> & inputFactors
, const std::vector<Moses::FactorType> &outputFactors
, const Vocab &vocab
, const Moses::PhraseDictionary &phraseDict
@@ -244,7 +244,7 @@ Moses::TargetPhrase *TargetPhrase::ConvertToMoses(const std::vector<Moses::Facto
int index = 0;
Moses::AlignmentInfo::CollType alignTerm, alignNonTerm;
std::set<std::pair<size_t, size_t> > alignmentInfo;
- const PhrasePtr sp = GetSourcePhrase();
+ const PhrasePtr sp = GetSourcePhrase();
for (size_t ind = 0; ind < m_align.size(); ++ind) {
const std::pair<size_t, size_t> &entry = m_align[ind];
alignmentInfo.insert(entry);
@@ -252,11 +252,10 @@ Moses::TargetPhrase *TargetPhrase::ConvertToMoses(const std::vector<Moses::Facto
size_t targetPos = entry.second;
if (GetWord(targetPos).IsNonTerminal()) {
- alignNonTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
+ alignNonTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
+ } else {
+ alignTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
}
- else {
- alignTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
- }
}
ret->SetAlignTerm(alignTerm);
@@ -313,7 +312,7 @@ UINT64 TargetPhrase::ReadFromFile(std::fstream &fileTP)
bytesRead += word->ReadFromFile(fileTP);
AddWord(word);
}
-
+
// read source words
UINT64 numSourceWords;
fileTP.read((char*) &numSourceWords, sizeof(UINT64));
@@ -371,7 +370,7 @@ UINT64 TargetPhrase::ReadScoresFromFile(std::fstream &fileTPColl)
void TargetPhrase::DebugPrint(ostream &out, const Vocab &vocab) const
{
Phrase::DebugPrint(out, vocab);
-
+
for (size_t ind = 0; ind < m_align.size(); ++ind) {
const AlignPair &alignPair = m_align[ind];
out << alignPair.first << "-" << alignPair.second << " ";
diff --git a/OnDiskPt/TargetPhrase.h b/OnDiskPt/TargetPhrase.h
index 5510ddd11..c4bb40454 100644
--- a/OnDiskPt/TargetPhrase.h
+++ b/OnDiskPt/TargetPhrase.h
@@ -49,7 +49,7 @@ class TargetPhrase: public Phrase
friend std::ostream& operator<<(std::ostream&, const TargetPhrase&);
protected:
AlignType m_align;
- PhrasePtr m_sourcePhrase;
+ PhrasePtr m_sourcePhrase;
std::vector<float> m_scores;
UINT64 m_filePos;
@@ -73,10 +73,10 @@ public:
const PhrasePtr GetSourcePhrase() const {
return m_sourcePhrase;
}
- const std::vector<float> &GetScores() const{
+ const std::vector<float> &GetScores() const {
return m_scores;
}
-
+
void SetLHS(WordPtr lhs);
void Create1AlignFromString(const std::string &align1Str);
@@ -107,7 +107,7 @@ public:
UINT64 ReadOtherInfoFromFile(UINT64 filePos, std::fstream &fileTPColl);
UINT64 ReadFromFile(std::fstream &fileTP);
- virtual void DebugPrint(std::ostream &out, const Vocab &vocab) const;
+ virtual void DebugPrint(std::ostream &out, const Vocab &vocab) const;
};
diff --git a/OnDiskPt/TargetPhraseCollection.cpp b/OnDiskPt/TargetPhraseCollection.cpp
index f29bea9cf..c865c2df7 100644
--- a/OnDiskPt/TargetPhraseCollection.cpp
+++ b/OnDiskPt/TargetPhraseCollection.cpp
@@ -82,7 +82,7 @@ void TargetPhraseCollection::Save(OnDiskWrapper &onDiskWrapper)
CollType::iterator iter;
for (iter = m_coll.begin(); iter != m_coll.end(); ++iter) {
// save phrase
- TargetPhrase &targetPhrase = **iter;
+ TargetPhrase &targetPhrase = **iter;
targetPhrase.Save(onDiskWrapper);
// save coll
@@ -150,9 +150,9 @@ void TargetPhraseCollection::ReadFromFile(size_t tableLimit, UINT64 filePos, OnD
{
fstream &fileTPColl = onDiskWrapper.GetFileTargetColl();
fstream &fileTP = onDiskWrapper.GetFileTargetInd();
-
+
size_t numScores = onDiskWrapper.GetNumScores();
-
+
UINT64 numPhrases;
@@ -164,9 +164,9 @@ void TargetPhraseCollection::ReadFromFile(size_t tableLimit, UINT64 filePos, OnD
numPhrases = std::min(numPhrases, (UINT64) tableLimit);
currFilePos += sizeof(UINT64);
-
+
for (size_t ind = 0; ind < numPhrases; ++ind) {
- TargetPhrase *tp = new TargetPhrase(numScores);
+ TargetPhrase *tp = new TargetPhrase(numScores);
UINT64 sizeOtherInfo = tp->ReadOtherInfoFromFile(currFilePos, fileTPColl);
tp->ReadFromFile(fileTP);
@@ -197,7 +197,7 @@ const TargetPhrase &TargetPhraseCollection::GetTargetPhrase(size_t ind) const
assert(ind < GetSize());
return *m_coll[ind];
}
-
+
}
diff --git a/OnDiskPt/TargetPhraseCollection.h b/OnDiskPt/TargetPhraseCollection.h
index e0d5d1599..d6086850f 100644
--- a/OnDiskPt/TargetPhraseCollection.h
+++ b/OnDiskPt/TargetPhraseCollection.h
@@ -64,9 +64,9 @@ public:
size_t GetSize() const {
return m_coll.size();
}
-
+
const TargetPhrase &GetTargetPhrase(size_t ind) const;
-
+
UINT64 GetFilePos() const;
Moses::TargetPhraseCollection *ConvertToMoses(const std::vector<Moses::FactorType> &inputFactors
diff --git a/OnDiskPt/Vocab.cpp b/OnDiskPt/Vocab.cpp
index 5de620b75..03af2d886 100644
--- a/OnDiskPt/Vocab.cpp
+++ b/OnDiskPt/Vocab.cpp
@@ -44,7 +44,7 @@ bool Vocab::Load(OnDiskWrapper &onDiskWrapper)
// assume contiguous vocab id
m_lookup.resize(m_vocabColl.size() + 1);
m_nextId = m_lookup.size();
-
+
CollType::const_iterator iter;
for (iter = m_vocabColl.begin(); iter != m_vocabColl.end(); ++iter) {
UINT32 vocabId = iter->second;
diff --git a/OnDiskPt/Word.cpp b/OnDiskPt/Word.cpp
index 13c77f739..1664571c5 100644
--- a/OnDiskPt/Word.cpp
+++ b/OnDiskPt/Word.cpp
@@ -97,13 +97,14 @@ size_t Word::ReadFromFile(std::fstream &file)
}
void Word::ConvertToMoses(
- const std::vector<Moses::FactorType> &outputFactorsVec,
- const Vocab &vocab,
- Moses::Word &overwrite) const {
+ const std::vector<Moses::FactorType> &outputFactorsVec,
+ const Vocab &vocab,
+ Moses::Word &overwrite) const
+{
Moses::FactorCollection &factorColl = Moses::FactorCollection::Instance();
overwrite = Moses::Word(m_isNonTerminal);
- // TODO: this conversion should have been done at load time.
+ // TODO: this conversion should have been done at load time.
util::TokenIter<util::SingleCharacter> tok(vocab.GetString(m_vocabId), '|');
for (std::vector<Moses::FactorType>::const_iterator t = outputFactorsVec.begin(); t != outputFactorsVec.end(); ++t, ++tok) {
@@ -144,14 +145,14 @@ bool Word::operator==(const Word &compare) const
void Word::DebugPrint(ostream &out, const Vocab &vocab) const
{
- const string &str = vocab.GetString(m_vocabId);
+ const string &str = vocab.GetString(m_vocabId);
out << str;
}
std::ostream& operator<<(std::ostream &out, const Word &word)
{
out << "(";
- out << word.m_vocabId;
+ out << word.m_vocabId;
out << (word.m_isNonTerminal ? "n" : "t");
out << ")";
diff --git a/OnDiskPt/Word.h b/OnDiskPt/Word.h
index 64be6f148..254959737 100644
--- a/OnDiskPt/Word.h
+++ b/OnDiskPt/Word.h
@@ -50,8 +50,8 @@ public:
{}
explicit Word(bool isNonTerminal)
- :m_isNonTerminal(isNonTerminal)
- ,m_vocabId(0)
+ :m_isNonTerminal(isNonTerminal)
+ ,m_vocabId(0)
{}
Word(const Word &copy);
@@ -77,8 +77,7 @@ public:
Moses::Word &overwrite) const;
void DebugPrint(std::ostream &out, const Vocab &vocab) const;
- inline const std::string &GetString(const Vocab &vocab) const
- {
+ inline const std::string &GetString(const Vocab &vocab) const {
return vocab.GetString(m_vocabId);
}
diff --git a/OnDiskPt/queryOnDiskPt.cpp b/OnDiskPt/queryOnDiskPt.cpp
index 8126a2b75..776dd8a2c 100644
--- a/OnDiskPt/queryOnDiskPt.cpp
+++ b/OnDiskPt/queryOnDiskPt.cpp
@@ -33,8 +33,7 @@ int main(int argc, char **argv)
if(i + 1 == argc)
usage();
ttable = argv[++i];
- }
- else
+ } else
usage();
}
@@ -55,30 +54,27 @@ int main(int argc, char **argv)
cerr << "line: " << line << endl;
const PhraseNode* node = onDiskQuery.Query(tokens);
-
- if (node)
- { // source phrase points to a bunch of rules
+
+ if (node) {
+ // source phrase points to a bunch of rules
const TargetPhraseCollection *coll = node->GetTargetPhraseCollection(tableLimit, onDiskWrapper);
string str = coll->GetDebugStr();
cout << "Found " << coll->GetSize() << endl;
-
- for (size_t ind = 0; ind < coll->GetSize(); ++ind)
- {
+
+ for (size_t ind = 0; ind < coll->GetSize(); ++ind) {
const TargetPhrase &targetPhrase = coll->GetTargetPhrase(ind);
cerr << " ";
targetPhrase.DebugPrint(cerr, onDiskWrapper.GetVocab());
cerr << endl;
}
- }
- else
- {
+ } else {
cout << "Not found" << endl;
}
-
+
std::cout << '\n';
std::cout.flush();
}
-
+
cerr << "Finished." << endl;
}
diff --git a/biconcor/Alignment.cpp b/biconcor/Alignment.cpp
index e73e18840..814802531 100644
--- a/biconcor/Alignment.cpp
+++ b/biconcor/Alignment.cpp
@@ -5,7 +5,8 @@
#include <stdlib.h>
#include <cstring>
-namespace {
+namespace
+{
const int LINE_MAX_LENGTH = 10000;
@@ -84,10 +85,10 @@ void Alignment::Create(const string& fileName)
}
Alignment::Alignment()
- : m_array(NULL),
- m_sentenceEnd(NULL),
- m_size(0),
- m_sentenceCount(0) {}
+ : m_array(NULL),
+ m_sentenceEnd(NULL),
+ m_size(0),
+ m_sentenceCount(0) {}
Alignment::~Alignment()
{
diff --git a/biconcor/Mismatch.cpp b/biconcor/Mismatch.cpp
index 31140b200..c3afec781 100644
--- a/biconcor/Mismatch.cpp
+++ b/biconcor/Mismatch.cpp
@@ -23,16 +23,16 @@ enum {
};
Mismatch::Mismatch( SuffixArray *sa, TargetCorpus *tc, Alignment *a, INDEX sentence_id, INDEX position, int source_length, int target_length, int source_start, int source_end )
- :m_suffixArray(sa)
- ,m_targetCorpus(tc)
- ,m_alignment(a)
- ,m_sentence_id(sentence_id)
- ,m_source_length(source_length)
- ,m_target_length(target_length)
- ,m_source_position(position)
- ,m_source_start(source_start)
- ,m_source_end(source_end)
- ,m_unaligned(true)
+ :m_suffixArray(sa)
+ ,m_targetCorpus(tc)
+ ,m_alignment(a)
+ ,m_sentence_id(sentence_id)
+ ,m_source_length(source_length)
+ ,m_target_length(target_length)
+ ,m_source_position(position)
+ ,m_source_start(source_start)
+ ,m_source_end(source_end)
+ ,m_unaligned(true)
{
// initialize unaligned indexes
for (int i = 0; i < m_source_length; i++) {
@@ -42,7 +42,7 @@ Mismatch::Mismatch( SuffixArray *sa, TargetCorpus *tc, Alignment *a, INDEX sente
m_target_unaligned[i] = true;
}
m_num_alignment_points =
- m_alignment->GetNumberOfAlignmentPoints( sentence_id );
+ m_alignment->GetNumberOfAlignmentPoints( sentence_id );
for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
m_source_unaligned[ (int)m_alignment->GetSourceWord( sentence_id, ap ) ] = false;
m_target_unaligned[ (int)m_alignment->GetTargetWord( sentence_id, ap ) ] = false;
@@ -58,234 +58,235 @@ Mismatch::~Mismatch () {}
void Mismatch::PrintClippedHTML( ostream* out, int width )
{
- int source_annotation[256], target_annotation[256];
- vector< string > label_class;
- label_class.push_back( "" );
- label_class.push_back( "mismatch_pre_aligned" );
- label_class.push_back( "mismatch_post_aligned" );
- label_class.push_back( "null_aligned" );
- label_class.push_back( "mismatch_misaligned" );
- label_class.push_back( "mismatch_aligned" );
+ int source_annotation[256], target_annotation[256];
+ vector< string > label_class;
+ label_class.push_back( "" );
+ label_class.push_back( "mismatch_pre_aligned" );
+ label_class.push_back( "mismatch_post_aligned" );
+ label_class.push_back( "null_aligned" );
+ label_class.push_back( "mismatch_misaligned" );
+ label_class.push_back( "mismatch_aligned" );
- for(int i=0; i<m_source_length;i++) source_annotation[i] = UNANNOTATED;
- for(int i=0; i<m_target_length;i++) target_annotation[i] = UNANNOTATED;
-
- if (m_unaligned) {
- // find alignment points for prior and next word(s) and
- // center target phrase around those.
- bool found_aligned = false;
- for(int i=1; i<m_source_length && !found_aligned; i++) {
- if (m_source_start-i >= 0) {
- int word_id = m_source_start-i;
- source_annotation[ word_id ] = UNALIGNED;
- if (!m_source_unaligned[ word_id ]) {
- found_aligned = true;
- LabelSourceMatches( source_annotation, target_annotation, word_id, PRE_ALIGNED );
- }
- }
+ for(int i=0; i<m_source_length; i++) source_annotation[i] = UNANNOTATED;
+ for(int i=0; i<m_target_length; i++) target_annotation[i] = UNANNOTATED;
- if (m_source_end+i < m_source_length) {
- int word_id = m_source_end+i;
- source_annotation[ word_id ] = UNALIGNED;
- if (!m_source_unaligned[ word_id ]) {
- found_aligned = true;
- LabelSourceMatches( source_annotation, target_annotation, word_id, POST_ALIGNED );
- }
- }
- }
-
- }
- // misalignment
- else {
- // label aligned output words
- for(int i=m_source_start; i<=m_source_end; i++)
- LabelSourceMatches( source_annotation, target_annotation, i, ALIGNED );
+ if (m_unaligned) {
+ // find alignment points for prior and next word(s) and
+ // center target phrase around those.
+ bool found_aligned = false;
+ for(int i=1; i<m_source_length && !found_aligned; i++) {
+ if (m_source_start-i >= 0) {
+ int word_id = m_source_start-i;
+ source_annotation[ word_id ] = UNALIGNED;
+ if (!m_source_unaligned[ word_id ]) {
+ found_aligned = true;
+ LabelSourceMatches( source_annotation, target_annotation, word_id, PRE_ALIGNED );
+ }
+ }
- // find first and last
- int target_start = -1;
- int target_end;
- for(int i=0; i<m_target_length; i++)
- if (target_annotation[i] == ALIGNED) {
- if (target_start == -1)
- target_start = i;
- target_end = i;
- }
- // go over all enclosed target words
- for(int i=target_start; i<=target_end; i++) {
- // label other target words as unaligned or misaligned
- if (m_target_unaligned[ i ])
- target_annotation[ i ] = UNALIGNED;
- else {
- if (target_annotation[ i ] != ALIGNED)
- target_annotation[ i ] = MISALIGNED;
- // loop over aligned source words
- for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
- if (m_alignment->GetTargetWord( m_sentence_id, ap ) == i) {
- int source_word = m_alignment->GetSourceWord( m_sentence_id, ap );
- // if not part of the source phrase -> also misaligned
- if (source_word < m_source_start || source_word > m_source_end)
- source_annotation[ source_word ] = MISALIGNED;
- }
- }
- }
- }
- // closure
- bool change = true;
- while(change) {
- change = false;
- for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
- int source_word = m_alignment->GetSourceWord( m_sentence_id, ap );
- int target_word = m_alignment->GetTargetWord( m_sentence_id, ap );
- if (source_annotation[source_word] != UNANNOTATED &&
- target_annotation[target_word] == UNANNOTATED) {
- target_annotation[target_word] = MISALIGNED;
- change = true;
- }
- if (source_annotation[source_word] == UNANNOTATED &&
- target_annotation[target_word] != UNANNOTATED) {
- source_annotation[source_word] = MISALIGNED;
- change = true;
- }
- }
- }
- }
-
- // print source
- // shorten source context if too long
+ if (m_source_end+i < m_source_length) {
+ int word_id = m_source_end+i;
+ source_annotation[ word_id ] = UNALIGNED;
+ if (!m_source_unaligned[ word_id ]) {
+ found_aligned = true;
+ LabelSourceMatches( source_annotation, target_annotation, word_id, POST_ALIGNED );
+ }
+ }
+ }
+
+ }
+ // misalignment
+ else {
+ // label aligned output words
+ for(int i=m_source_start; i<=m_source_end; i++)
+ LabelSourceMatches( source_annotation, target_annotation, i, ALIGNED );
+
+ // find first and last
+ int target_start = -1;
+ int target_end;
+ for(int i=0; i<m_target_length; i++)
+ if (target_annotation[i] == ALIGNED) {
+ if (target_start == -1)
+ target_start = i;
+ target_end = i;
+ }
+ // go over all enclosed target words
+ for(int i=target_start; i<=target_end; i++) {
+ // label other target words as unaligned or misaligned
+ if (m_target_unaligned[ i ])
+ target_annotation[ i ] = UNALIGNED;
+ else {
+ if (target_annotation[ i ] != ALIGNED)
+ target_annotation[ i ] = MISALIGNED;
+ // loop over aligned source words
+ for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
+ if (m_alignment->GetTargetWord( m_sentence_id, ap ) == i) {
+ int source_word = m_alignment->GetSourceWord( m_sentence_id, ap );
+ // if not part of the source phrase -> also misaligned
+ if (source_word < m_source_start || source_word > m_source_end)
+ source_annotation[ source_word ] = MISALIGNED;
+ }
+ }
+ }
+ }
+ // closure
+ bool change = true;
+ while(change) {
+ change = false;
+ for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
+ int source_word = m_alignment->GetSourceWord( m_sentence_id, ap );
+ int target_word = m_alignment->GetTargetWord( m_sentence_id, ap );
+ if (source_annotation[source_word] != UNANNOTATED &&
+ target_annotation[target_word] == UNANNOTATED) {
+ target_annotation[target_word] = MISALIGNED;
+ change = true;
+ }
+ if (source_annotation[source_word] == UNANNOTATED &&
+ target_annotation[target_word] != UNANNOTATED) {
+ source_annotation[source_word] = MISALIGNED;
+ change = true;
+ }
+ }
+ }
+ }
+
+ // print source
+ // shorten source context if too long
int sentence_start = m_source_position - m_source_start;
- int context_space = width/2;
- for(int i=m_source_start;i<=m_source_end;i++)
- context_space -= m_suffixArray->GetWord( sentence_start + i ).size() + 1;
- context_space /= 2;
+ int context_space = width/2;
+ for(int i=m_source_start; i<=m_source_end; i++)
+ context_space -= m_suffixArray->GetWord( sentence_start + i ).size() + 1;
+ context_space /= 2;
- int remaining = context_space;
- int start_word = m_source_start;
- for(;start_word>0 && remaining>0; start_word--)
- remaining -= m_suffixArray->GetWord( sentence_start + start_word-1 ).size() + 1;
- if (remaining<0 || start_word == -1) start_word++;
+ int remaining = context_space;
+ int start_word = m_source_start;
+ for(; start_word>0 && remaining>0; start_word--)
+ remaining -= m_suffixArray->GetWord( sentence_start + start_word-1 ).size() + 1;
+ if (remaining<0 || start_word == -1) start_word++;
- remaining = context_space;
- int end_word = m_source_end;
- for(;end_word<m_source_length && remaining>0; end_word++)
- remaining -= m_suffixArray->GetWord( sentence_start + end_word ).size() + 1;
- end_word--;
+ remaining = context_space;
+ int end_word = m_source_end;
+ for(; end_word<m_source_length && remaining>0; end_word++)
+ remaining -= m_suffixArray->GetWord( sentence_start + end_word ).size() + 1;
+ end_word--;
- // output with markup
- *out << "<tr><td class=\"pp_source_left\">";
- char current_label = UNANNOTATED;
- if (start_word>0) {
- current_label = source_annotation[start_word-1];
- *out << "... ";
- }
- for(int i=start_word; i<=end_word; i++) {
- // change to phrase block
- if (i == m_source_start) {
- if (current_label != UNANNOTATED && i!=start_word)
- *out << "</span>";
- *out << "</td><td class=\"pp_source\">";
- current_label = UNANNOTATED;
- }
+ // output with markup
+ *out << "<tr><td class=\"pp_source_left\">";
+ char current_label = UNANNOTATED;
+ if (start_word>0) {
+ current_label = source_annotation[start_word-1];
+ *out << "... ";
+ }
+ for(int i=start_word; i<=end_word; i++) {
+ // change to phrase block
+ if (i == m_source_start) {
+ if (current_label != UNANNOTATED && i!=start_word)
+ *out << "</span>";
+ *out << "</td><td class=\"pp_source\">";
+ current_label = UNANNOTATED;
+ }
- // change to labeled word
- else if (source_annotation[i] != current_label &&
- source_annotation[i] != ALIGNED) {
- if (current_label != UNANNOTATED && i!=start_word)
- *out << "</span>";
- if (source_annotation[i] != UNANNOTATED)
- *out << "<span class=\""
- << label_class[ source_annotation[i] ]
- << "\">";
- current_label = source_annotation[i];
- }
+ // change to labeled word
+ else if (source_annotation[i] != current_label &&
+ source_annotation[i] != ALIGNED) {
+ if (current_label != UNANNOTATED && i!=start_word)
+ *out << "</span>";
+ if (source_annotation[i] != UNANNOTATED)
+ *out << "<span class=\""
+ << label_class[ source_annotation[i] ]
+ << "\">";
+ current_label = source_annotation[i];
+ }
- // output word
- *out << m_suffixArray->GetWord( sentence_start + i ) << " ";
+ // output word
+ *out << m_suffixArray->GetWord( sentence_start + i ) << " ";
- // change to right context block
- if (i == m_source_end) {
- *out << "</td><td class=\"pp_source_right\">";
- current_label = UNANNOTATED;
- }
- }
+ // change to right context block
+ if (i == m_source_end) {
+ *out << "</td><td class=\"pp_source_right\">";
+ current_label = UNANNOTATED;
+ }
+ }
- if (current_label != UNANNOTATED && end_word>m_source_end)
- *out << "</span>";
- if (end_word<m_source_length-1)
- *out << "... ";
+ if (current_label != UNANNOTATED && end_word>m_source_end)
+ *out << "</span>";
+ if (end_word<m_source_length-1)
+ *out << "... ";
- // print target
- // shorten target context if too long
- int target_start = -1;
- int target_end;
- for(int i=0; i<m_target_length; i++)
- if (target_annotation[i] != UNANNOTATED) {
- if (target_start == -1)
- target_start = i;
- target_end = i;
- }
+ // print target
+ // shorten target context if too long
+ int target_start = -1;
+ int target_end;
+ for(int i=0; i<m_target_length; i++)
+ if (target_annotation[i] != UNANNOTATED) {
+ if (target_start == -1)
+ target_start = i;
+ target_end = i;
+ }
- context_space = width/2;
- for(int i=target_start;i<=target_end;i++)
- context_space -= m_targetCorpus->GetWord( m_sentence_id, i ).size() + 1;
- while (context_space < 0) { // shorten matched part, if too long
- context_space +=
- m_targetCorpus->GetWord( m_sentence_id, target_start ).size() +
- m_targetCorpus->GetWord( m_sentence_id, target_end ).size() + 2;
- target_start++;
- target_end--;
- }
- context_space /= 2;
+ context_space = width/2;
+ for(int i=target_start; i<=target_end; i++)
+ context_space -= m_targetCorpus->GetWord( m_sentence_id, i ).size() + 1;
+ while (context_space < 0) { // shorten matched part, if too long
+ context_space +=
+ m_targetCorpus->GetWord( m_sentence_id, target_start ).size() +
+ m_targetCorpus->GetWord( m_sentence_id, target_end ).size() + 2;
+ target_start++;
+ target_end--;
+ }
+ context_space /= 2;
- remaining = context_space;
- start_word = target_start;
- for(;start_word>0 && remaining>0; start_word--) {
- //cerr << "remaining: " << remaining << ", start_word: " << start_word << endl;
- remaining -= m_targetCorpus->GetWord( m_sentence_id, start_word-1 ).size() + 1;
- }
- if (remaining<0 || start_word == -1) start_word++;
+ remaining = context_space;
+ start_word = target_start;
+ for(; start_word>0 && remaining>0; start_word--) {
+ //cerr << "remaining: " << remaining << ", start_word: " << start_word << endl;
+ remaining -= m_targetCorpus->GetWord( m_sentence_id, start_word-1 ).size() + 1;
+ }
+ if (remaining<0 || start_word == -1) start_word++;
- remaining = context_space;
- end_word = target_end;
- for(;end_word<m_target_length && remaining>0; end_word++) {
- //cerr << "remaining: " << remaining << ", end_word: " << end_word << endl;
- remaining -= m_targetCorpus->GetWord( m_sentence_id, end_word ).size() + 1;
- }
- end_word--;
+ remaining = context_space;
+ end_word = target_end;
+ for(; end_word<m_target_length && remaining>0; end_word++) {
+ //cerr << "remaining: " << remaining << ", end_word: " << end_word << endl;
+ remaining -= m_targetCorpus->GetWord( m_sentence_id, end_word ).size() + 1;
+ }
+ end_word--;
- // output with markup
- *out << "</td><td class=\"mismatch_target\">";
- current_label = UNANNOTATED;
- if (start_word>0) {
- current_label = target_annotation[start_word-1];
- *out << "... ";
- }
- for(int i=start_word; i<=end_word; i++) {
- if (target_annotation[i] != current_label) {
- if (current_label != UNANNOTATED && i!=start_word)
- *out << "</span>";
- if (target_annotation[i] != UNANNOTATED)
- *out << "<span class=\""
- << label_class[ target_annotation[i] ]
- << "\">";
- current_label = target_annotation[i];
- }
+ // output with markup
+ *out << "</td><td class=\"mismatch_target\">";
+ current_label = UNANNOTATED;
+ if (start_word>0) {
+ current_label = target_annotation[start_word-1];
+ *out << "... ";
+ }
+ for(int i=start_word; i<=end_word; i++) {
+ if (target_annotation[i] != current_label) {
+ if (current_label != UNANNOTATED && i!=start_word)
+ *out << "</span>";
+ if (target_annotation[i] != UNANNOTATED)
+ *out << "<span class=\""
+ << label_class[ target_annotation[i] ]
+ << "\">";
+ current_label = target_annotation[i];
+ }
- // output word
- *out << m_targetCorpus->GetWord( m_sentence_id, i ) << " ";
- }
+ // output word
+ *out << m_targetCorpus->GetWord( m_sentence_id, i ) << " ";
+ }
- if (current_label != UNANNOTATED && end_word>target_end)
- *out << "</span>";
- if (end_word<m_target_length-1)
- *out << "... ";
- *out << "</td></tr>";
+ if (current_label != UNANNOTATED && end_word>target_end)
+ *out << "</span>";
+ if (end_word<m_target_length-1)
+ *out << "... ";
+ *out << "</td></tr>";
}
-void Mismatch::LabelSourceMatches(int *source_annotation, int *target_annotation, int source_id, int label ) {
- for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
- if (m_alignment->GetSourceWord( m_sentence_id, ap ) == source_id) {
- source_annotation[ source_id ] = label;
- target_annotation[ m_alignment->GetTargetWord( m_sentence_id, ap ) ] = label;
- }
- }
+void Mismatch::LabelSourceMatches(int *source_annotation, int *target_annotation, int source_id, int label )
+{
+ for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
+ if (m_alignment->GetSourceWord( m_sentence_id, ap ) == source_id) {
+ source_annotation[ source_id ] = label;
+ target_annotation[ m_alignment->GetTargetWord( m_sentence_id, ap ) ] = label;
+ }
+ }
}
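
The clipping logic reformatted above budgets width/2 characters per side, charges the matched span against that budget, shrinks the span from both ends while the budget is negative, and then splits what is left between the left and right context. Below is a minimal standalone sketch of that arithmetic; the identifiers (words, match_start, budget) are illustrative and do not appear in the Moses sources.

#include <iostream>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> words = {"the", "quick", "brown", "fox", "jumps",
                                    "over", "the", "lazy", "dog"};
  int match_start = 3, match_end = 5;   // matched span: "fox jumps over"
  int budget = 20 / 2;                  // width/2, as in PrintClippedHTML

  // Charge the matched span against the budget (word length + one space).
  for (int i = match_start; i <= match_end; i++)
    budget -= (int)words[i].size() + 1;
  // If the match alone overflows, trim it from both ends until it fits.
  while (budget < 0) {
    budget += (int)(words[match_start].size() + words[match_end].size()) + 2;
    match_start++;
    match_end--;
  }
  budget /= 2;  // split the remainder between left and right context

  // Grow the left context while the half-budget lasts (the right side is symmetric).
  int remaining = budget, start = match_start;
  for (; start > 0 && remaining > 0; start--)
    remaining -= (int)words[start - 1].size() + 1;
  if (remaining < 0 || start == -1) start++;

  std::cout << "clipped output starts at word " << start << "\n";
  return 0;
}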
diff --git a/biconcor/Mismatch.h b/biconcor/Mismatch.h
index c0063d049..1277ed95a 100644
--- a/biconcor/Mismatch.h
+++ b/biconcor/Mismatch.h
@@ -34,7 +34,9 @@ public:
Mismatch( SuffixArray *sa, TargetCorpus *tc, Alignment *a, INDEX sentence_id, INDEX position, int source_length, int target_length, int source_start, int source_end );
~Mismatch();
- bool Unaligned() const { return m_unaligned; }
+ bool Unaligned() const {
+ return m_unaligned;
+ }
void PrintClippedHTML(std::ostream* out, int width );
void LabelSourceMatches(int *source_annotation, int *target_annotation, int source_id, int label );
};
diff --git a/biconcor/PhrasePair.cpp b/biconcor/PhrasePair.cpp
index 038fa3a31..b6409258b 100644
--- a/biconcor/PhrasePair.cpp
+++ b/biconcor/PhrasePair.cpp
@@ -37,7 +37,7 @@ void PhrasePair::Print( ostream* out ) const
INDEX ap_points = m_alignment->GetNumberOfAlignmentPoints( m_sentence_id );
for( INDEX i=0; i<ap_points; i++) {
*out << " " << m_alignment->GetSourceWord( m_sentence_id, i )
- << "-" << m_alignment->GetTargetWord( m_sentence_id, i );
+ << "-" << m_alignment->GetTargetWord( m_sentence_id, i );
}
*out << endl;
@@ -185,27 +185,27 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width ) const
size_t source_pre_width = (source_width-source.size())/2;
size_t source_post_width = (source_width-source.size()+1)/2;
- // if phrase is too long, don't show any context
+ // if phrase is too long, don't show any context
if (source.size() > (size_t)width) {
source_pre_width = 0;
source_post_width = 0;
}
- // too long -> truncate and add "..."
+ // too long -> truncate and add "..."
if (source_pre.size() > source_pre_width) {
- // first skip up to a space
- while(source_pre_width>0 &&
- source_pre.substr(source_pre.size()-source_pre_width,1) != " ") {
- source_pre_width--;
- }
+ // first skip up to a space
+ while(source_pre_width>0 &&
+ source_pre.substr(source_pre.size()-source_pre_width,1) != " ") {
+ source_pre_width--;
+ }
source_pre = "..." + source_pre.substr( source_pre.size()-source_pre_width, source_pre_width );
- }
+ }
if (source_post.size() > source_post_width) {
- while(source_post_width>0 &&
- source_post.substr(source_post_width-1,1) != " ") {
- source_post_width--;
- }
+ while(source_post_width>0 &&
+ source_post.substr(source_post_width-1,1) != " ") {
+ source_post_width--;
+ }
source_post = source_post.substr( 0, source_post_width ) + "...";
- }
+ }
*out << "<tr><td class=\"pp_source_left\">"
<< source_pre
@@ -220,13 +220,13 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width ) const
string target_pre = "";
string target = "";
string target_post = "";
- size_t target_pre_null_width = 0;
- size_t target_post_null_width = 0;
+ size_t target_pre_null_width = 0;
+ size_t target_post_null_width = 0;
for( char i=0; i<m_target_start; i++ ) {
- WORD word = m_targetCorpus->GetWord( m_sentence_id, i);
+ WORD word = m_targetCorpus->GetWord( m_sentence_id, i);
target_pre += " " + word;
- if (i >= m_target_start-m_pre_null)
- target_pre_null_width += word.size() + 1;
+ if (i >= m_target_start-m_pre_null)
+ target_pre_null_width += word.size() + 1;
}
for( char i=m_target_start; i<=m_target_end; i++ ) {
if (i>m_target_start) target += " ";
@@ -234,11 +234,11 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width ) const
}
for( char i=m_target_end+1; i<m_target_length; i++ ) {
if (i>m_target_end+1) target_post += " ";
- WORD word = m_targetCorpus->GetWord( m_sentence_id, i);
+ WORD word = m_targetCorpus->GetWord( m_sentence_id, i);
target_post += word;
- if (i-(m_target_end+1) < m_post_null) {
- target_post_null_width += word.size() + 1;
- }
+ if (i-(m_target_end+1) < m_post_null) {
+ target_post_null_width += word.size() + 1;
+ }
}
size_t target_pre_width = (target_width-target.size())/2;
@@ -249,46 +249,45 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width ) const
target_post_width = 0;
}
- if (target_pre.size() < target_pre_width)
- target_pre_width = target_pre.size();
- else {
- while(target_pre_width>0 &&
- target_pre.substr(target_pre.size()-target_pre_width,1) != " ") {
- target_pre_width--;
- }
+ if (target_pre.size() < target_pre_width)
+ target_pre_width = target_pre.size();
+ else {
+ while(target_pre_width>0 &&
+ target_pre.substr(target_pre.size()-target_pre_width,1) != " ") {
+ target_pre_width--;
+ }
target_pre = "..." + target_pre.substr( target_pre.size()-target_pre_width, target_pre_width );
- }
-
- if (target_post.size() < target_post_width) {
- target_post_width = target_post.size();
- }
- else {
- while(target_post_width>0 &&
- target_post.substr(target_post_width-1,1) != " ") {
- target_post_width--;
- }
- target_post = target_post.substr( 0, target_post_width ) + "...";
- }
-
- if (m_pre_null) {
- //cerr << endl << "target_pre_width=" << target_pre_width << ", target_pre_null_width=" << target_pre_null_width << ", target_pre.size()=" << target_pre.size() << endl;
- if (target_pre_width < target_pre.size())
- target_pre_null_width -= target_pre.size()-target_pre_width;
- target_pre = target_pre.substr(0,target_pre_width-target_pre_null_width)
- + "<span class=\"null_aligned\">"
- + target_pre.substr(target_pre_width-target_pre_null_width)
- + "</span>";
- }
- if (m_post_null) {
- //cerr << endl << "target_post_width=" << target_post_width << ", target_post_null_width=" << target_post_null_width << ", target_post.size()=" << target_post.size() << endl;
- if (target_post_null_width > target_post.size()) {
- target_post_null_width = target_post.size();
- }
- target_post = "<span class=\"null_aligned\">"
- + target_post.substr(0,target_post_null_width)
- + "</span>"
- + target_post.substr(target_post_null_width);
- }
+ }
+
+ if (target_post.size() < target_post_width) {
+ target_post_width = target_post.size();
+ } else {
+ while(target_post_width>0 &&
+ target_post.substr(target_post_width-1,1) != " ") {
+ target_post_width--;
+ }
+ target_post = target_post.substr( 0, target_post_width ) + "...";
+ }
+
+ if (m_pre_null) {
+ //cerr << endl << "target_pre_width=" << target_pre_width << ", target_pre_null_width=" << target_pre_null_width << ", target_pre.size()=" << target_pre.size() << endl;
+ if (target_pre_width < target_pre.size())
+ target_pre_null_width -= target_pre.size()-target_pre_width;
+ target_pre = target_pre.substr(0,target_pre_width-target_pre_null_width)
+ + "<span class=\"null_aligned\">"
+ + target_pre.substr(target_pre_width-target_pre_null_width)
+ + "</span>";
+ }
+ if (m_post_null) {
+ //cerr << endl << "target_post_width=" << target_post_width << ", target_post_null_width=" << target_post_null_width << ", target_post.size()=" << target_post.size() << endl;
+ if (target_post_null_width > target_post.size()) {
+ target_post_null_width = target_post.size();
+ }
+ target_post = "<span class=\"null_aligned\">"
+ + target_post.substr(0,target_post_null_width)
+ + "</span>"
+ + target_post.substr(target_post_null_width);
+ }
*out << "<td class=\"pp_target_left\">"
<< target_pre
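
The truncation re-indented above never cuts mid-word: it backs the cut position off until it lands on a space, then prepends or appends "...". A self-contained sketch of the left-hand case; the helper name clip_left is invented, and widths are counted in characters as in the source.

#include <iostream>
#include <string>

std::string clip_left(const std::string& pre, size_t width) {
  if (pre.size() <= width) return pre;
  // Back the cut off until it falls on a space (or the width reaches zero).
  while (width > 0 && pre.substr(pre.size() - width, 1) != " ")
    width--;
  return "..." + pre.substr(pre.size() - width, width);
}

int main() {
  std::cout << clip_left("the quick brown fox", 10) << "\n"; // "... brown fox"
  return 0;
}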
diff --git a/biconcor/PhrasePairCollection.cpp b/biconcor/PhrasePairCollection.cpp
index 7497b2af8..dd21faad3 100644
--- a/biconcor/PhrasePairCollection.cpp
+++ b/biconcor/PhrasePairCollection.cpp
@@ -47,15 +47,15 @@ int PhrasePairCollection::GetCollection( const vector< string >& sourceString )
int sentence_length = m_suffixArray->GetSentenceLength( sentence_id );
int target_length = m_targetCorpus->GetSentenceLength( sentence_id );
//cerr << "match " << (i-first_match)
- //<< " in sentence " << sentence_id
- //<< ", starting at word " << source_start
- //<< " of " << sentence_length
- //<< ". target sentence has " << target_length << " words.";
+ //<< " in sentence " << sentence_id
+ //<< ", starting at word " << source_start
+ //<< " of " << sentence_length
+ //<< ". target sentence has " << target_length << " words.";
int target_start, target_end, pre_null, post_null;
if (m_alignment->PhraseAlignment( sentence_id, target_length, source_start, source_end, target_start, target_end, pre_null, post_null)) {
//cerr << " aligned to [" << (int)target_start << "," << (int)target_end << "]";
//cerr << " +(" << (int)pre_null << "," << (int)post_null << ")";
- bool null_boundary_words = false;
+ bool null_boundary_words = false;
for (int pre = 0; pre <= pre_null && (pre == 0 || null_boundary_words); pre++ ) {
for (int post = 0; post <= post_null && (post == 0 || null_boundary_words); post++ ) {
vector< WORD_ID > targetString;
@@ -75,19 +75,18 @@ int PhrasePairCollection::GetCollection( const vector< string >& sourceString )
m_size++;
}
}
+ } else {
+ //cerr << "mismatch " << (i-first_match)
+ // << " in sentence " << sentence_id
+ // << ", starting at word " << source_start
+ // << " of " << sentence_length
+ // << ". target sentence has " << target_length << " words.";
+ Mismatch *mismatch = new Mismatch( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, position, sentence_length, target_length, source_start, source_end );
+ if (mismatch->Unaligned())
+ m_unaligned.push_back( mismatch );
+ else
+ m_mismatch.push_back( mismatch );
}
- else {
- //cerr << "mismatch " << (i-first_match)
- // << " in sentence " << sentence_id
- // << ", starting at word " << source_start
- // << " of " << sentence_length
- // << ". target sentence has " << target_length << " words.";
- Mismatch *mismatch = new Mismatch( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, position, sentence_length, target_length, source_start, source_end );
- if (mismatch->Unaligned())
- m_unaligned.push_back( mismatch );
- else
- m_mismatch.push_back( mismatch );
- }
//cerr << endl;
if (found > (INDEX)m_max_lookup) {
@@ -111,8 +110,7 @@ void PhrasePairCollection::Print(bool pretty) const
for(int j=0; j<ppWithSameTarget->size() && j<m_max_example; j++, p++ ) {
if (pretty) {
(*p)->PrintPretty( &cout, 100 );
- }
- else {
+ } else {
(*p)->Print( &cout );
}
if (ppWithSameTarget->size() > m_max_example) {
@@ -125,33 +123,32 @@ void PhrasePairCollection::Print(bool pretty) const
void PhrasePairCollection::PrintHTML() const
{
int pp_target = 0;
- bool singleton = false;
- // loop over all translations
+ bool singleton = false;
+ // loop over all translations
vector< vector<PhrasePair*> >::const_iterator ppWithSameTarget;
for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end() && pp_target<m_max_translation; ppWithSameTarget++, pp_target++ ) {
- int count = ppWithSameTarget->size();
- if (!singleton) {
- if (count == 1) {
- singleton = true;
- cout << "<p class=\"pp_singleton_header\">singleton"
- << (m_collection.end() - ppWithSameTarget==1?"":"s") << " ("
- << (m_collection.end() - ppWithSameTarget)
- << "/" << m_size << ")</p>";
- }
- else {
- cout << "<p class=\"pp_target_header\">";
- (*(ppWithSameTarget->begin()))->PrintTarget( &cout );
- cout << " (" << count << "/" << m_size << ")" << endl;
- cout << "<p><div id=\"pp_" << pp_target << "\">";
- }
- cout << "<table align=\"center\">";
- }
+ int count = ppWithSameTarget->size();
+ if (!singleton) {
+ if (count == 1) {
+ singleton = true;
+ cout << "<p class=\"pp_singleton_header\">singleton"
+ << (m_collection.end() - ppWithSameTarget==1?"":"s") << " ("
+ << (m_collection.end() - ppWithSameTarget)
+ << "/" << m_size << ")</p>";
+ } else {
+ cout << "<p class=\"pp_target_header\">";
+ (*(ppWithSameTarget->begin()))->PrintTarget( &cout );
+ cout << " (" << count << "/" << m_size << ")" << endl;
+ cout << "<p><div id=\"pp_" << pp_target << "\">";
+ }
+ cout << "<table align=\"center\">";
+ }
vector< PhrasePair* >::const_iterator p;
- // loop over all sentences where translation occurs
+ // loop over all sentences where translation occurs
int pp=0;
- int i=0;
+ int i=0;
for(p = ppWithSameTarget->begin(); i<10 && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) {
(*p)->PrintClippedHTML( &cout, 160 );
if (count > m_max_example) {
@@ -159,54 +156,54 @@ void PhrasePairCollection::PrintHTML() const
pp += count/m_max_example-1;
}
}
- if (i == 10 && pp < count) {
- // extended table
- cout << "<tr><td colspan=7 align=center class=\"pp_more\" onclick=\"javascript:document.getElementById('pp_" << pp_target << "').style.display = 'none'; document.getElementById('pp_ext_" << pp_target << "').style.display = 'block';\">(more)</td></tr></table></div>";
- cout << "<div id=\"pp_ext_" << pp_target << "\" style=\"display:none;\";\">";
- cout << "<table align=\"center\">";
- for(i=0, pp=0, p = ppWithSameTarget->begin(); i<m_max_example && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) {
- (*p)->PrintClippedHTML( &cout, 160 );
- if (count > m_max_example) {
- p += count/m_max_example-1;
- pp += count/m_max_example-1;
- }
- }
- }
- if (!singleton) cout << "</table></div>\n";
-
- if (!singleton && pp_target == 9) {
- cout << "<div id=\"pp_toggle\" onclick=\"javascript:document.getElementById('pp_toggle').style.display = 'none'; document.getElementById('pp_additional').style.display = 'block';\">";
- cout << "<p class=\"pp_target_header\">(more)</p></div>";
- cout << "<div id=\"pp_additional\" style=\"display:none;\";\">";
- }
+ if (i == 10 && pp < count) {
+ // extended table
+ cout << "<tr><td colspan=7 align=center class=\"pp_more\" onclick=\"javascript:document.getElementById('pp_" << pp_target << "').style.display = 'none'; document.getElementById('pp_ext_" << pp_target << "').style.display = 'block';\">(more)</td></tr></table></div>";
+      cout << "<div id=\"pp_ext_" << pp_target << "\" style=\"display:none;\">";
+ cout << "<table align=\"center\">";
+ for(i=0, pp=0, p = ppWithSameTarget->begin(); i<m_max_example && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) {
+ (*p)->PrintClippedHTML( &cout, 160 );
+ if (count > m_max_example) {
+ p += count/m_max_example-1;
+ pp += count/m_max_example-1;
+ }
+ }
+ }
+ if (!singleton) cout << "</table></div>\n";
+
+ if (!singleton && pp_target == 9) {
+ cout << "<div id=\"pp_toggle\" onclick=\"javascript:document.getElementById('pp_toggle').style.display = 'none'; document.getElementById('pp_additional').style.display = 'block';\">";
+ cout << "<p class=\"pp_target_header\">(more)</p></div>";
+      cout << "<div id=\"pp_additional\" style=\"display:none;\">";
+ }
+ }
+ if (singleton) cout << "</table></div>\n";
+ else if (pp_target > 9) cout << "</div>";
+
+ size_t max_mismatch = m_max_example/3;
+ // unaligned phrases
+ if (m_unaligned.size() > 0) {
+ cout << "<p class=\"pp_singleton_header\">unaligned"
+ << " (" << (m_unaligned.size()) << ")</p>";
+ cout << "<table align=\"center\">";
+ int step_size = 1;
+ if (m_unaligned.size() > max_mismatch)
+ step_size = (m_unaligned.size()+max_mismatch-1) / max_mismatch;
+ for(size_t i=0; i<m_unaligned.size(); i+=step_size)
+ m_unaligned[i]->PrintClippedHTML( &cout, 160 );
+ cout << "</table>";
+ }
+
+ // mismatched phrases
+ if (m_mismatch.size() > 0) {
+ cout << "<p class=\"pp_singleton_header\">mismatched"
+ << " (" << (m_mismatch.size()) << ")</p>";
+ cout << "<table align=\"center\">";
+ int step_size = 1;
+ if (m_mismatch.size() > max_mismatch)
+ step_size = (m_mismatch.size()+max_mismatch-1) / max_mismatch;
+ for(size_t i=0; i<m_mismatch.size(); i+=step_size)
+ m_mismatch[i]->PrintClippedHTML( &cout, 160 );
+ cout << "</table>";
}
- if (singleton) cout << "</table></div>\n";
- else if (pp_target > 9) cout << "</div>";
-
- size_t max_mismatch = m_max_example/3;
- // unaligned phrases
- if (m_unaligned.size() > 0) {
- cout << "<p class=\"pp_singleton_header\">unaligned"
- << " (" << (m_unaligned.size()) << ")</p>";
- cout << "<table align=\"center\">";
- int step_size = 1;
- if (m_unaligned.size() > max_mismatch)
- step_size = (m_unaligned.size()+max_mismatch-1) / max_mismatch;
- for(size_t i=0;i<m_unaligned.size();i+=step_size)
- m_unaligned[i]->PrintClippedHTML( &cout, 160 );
- cout << "</table>";
- }
-
- // mismatched phrases
- if (m_mismatch.size() > 0) {
- cout << "<p class=\"pp_singleton_header\">mismatched"
- << " (" << (m_mismatch.size()) << ")</p>";
- cout << "<table align=\"center\">";
- int step_size = 1;
- if (m_mismatch.size() > max_mismatch)
- step_size = (m_mismatch.size()+max_mismatch-1) / max_mismatch;
- for(size_t i=0;i<m_mismatch.size();i+=step_size)
- m_mismatch[i]->PrintClippedHTML( &cout, 160 );
- cout << "</table>";
- }
}
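
The sampling moved above relies on ceiling division, step_size = (n + k - 1) / k, so at most k examples are printed and they are spread evenly over the list rather than taken from the front. A small sketch with invented numbers:

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  std::vector<int> items(25);           // stand-in for m_mismatch
  const std::size_t max_examples = 10;  // plays the role of max_mismatch

  std::size_t step = 1;
  if (items.size() > max_examples)
    step = (items.size() + max_examples - 1) / max_examples;  // ceil(25/10) = 3

  std::size_t printed = 0;
  for (std::size_t i = 0; i < items.size(); i += step)
    printed++;                          // would call PrintClippedHTML here
  std::cout << "printed " << printed << " of " << items.size() << "\n";  // 9 of 25
  return 0;
}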
diff --git a/biconcor/SuffixArray.cpp b/biconcor/SuffixArray.cpp
index 15e6b47b0..f4122a2d8 100644
--- a/biconcor/SuffixArray.cpp
+++ b/biconcor/SuffixArray.cpp
@@ -5,7 +5,8 @@
#include <stdlib.h>
#include <cstring>
-namespace {
+namespace
+{
const int LINE_MAX_LENGTH = 10000;
@@ -14,15 +15,15 @@ const int LINE_MAX_LENGTH = 10000;
using namespace std;
SuffixArray::SuffixArray()
- : m_array(NULL),
- m_index(NULL),
- m_buffer(NULL),
- m_wordInSentence(NULL),
- m_sentence(NULL),
- m_sentenceLength(NULL),
- m_vcb(),
- m_size(0),
- m_sentenceCount(0) { }
+ : m_array(NULL),
+ m_index(NULL),
+ m_buffer(NULL),
+ m_wordInSentence(NULL),
+ m_sentence(NULL),
+ m_sentenceLength(NULL),
+ m_vcb(),
+ m_size(0),
+ m_sentenceCount(0) { }
SuffixArray::~SuffixArray()
{
diff --git a/biconcor/TargetCorpus.cpp b/biconcor/TargetCorpus.cpp
index d331a548a..06468007f 100644
--- a/biconcor/TargetCorpus.cpp
+++ b/biconcor/TargetCorpus.cpp
@@ -5,7 +5,8 @@
#include <stdlib.h>
#include <cstring>
-namespace {
+namespace
+{
const int LINE_MAX_LENGTH = 10000;
@@ -14,11 +15,11 @@ const int LINE_MAX_LENGTH = 10000;
using namespace std;
TargetCorpus::TargetCorpus()
- : m_array(NULL),
- m_sentenceEnd(NULL),
- m_vcb(),
- m_size(0),
- m_sentenceCount(0) {}
+ : m_array(NULL),
+ m_sentenceEnd(NULL),
+ m_vcb(),
+ m_size(0),
+ m_sentenceCount(0) {}
TargetCorpus::~TargetCorpus()
{
diff --git a/biconcor/Vocabulary.cpp b/biconcor/Vocabulary.cpp
index 9c35b3feb..9d52ee44e 100644
--- a/biconcor/Vocabulary.cpp
+++ b/biconcor/Vocabulary.cpp
@@ -2,7 +2,8 @@
#include "Vocabulary.h"
#include <fstream>
-namespace {
+namespace
+{
const int MAX_LENGTH = 10000;
diff --git a/biconcor/base64.cpp b/biconcor/base64.cpp
index 2a863d161..8032399b5 100644
--- a/biconcor/base64.cpp
+++ b/biconcor/base64.cpp
@@ -1,4 +1,4 @@
-/*
+/*
base64.cpp and base64.h
Copyright (C) 2004-2008 René Nyffenegger
@@ -28,17 +28,19 @@
#include "base64.h"
#include <iostream>
-static const std::string base64_chars =
- "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
- "abcdefghijklmnopqrstuvwxyz"
- "0123456789+/";
+static const std::string base64_chars =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ "abcdefghijklmnopqrstuvwxyz"
+ "0123456789+/";
-static inline bool is_base64(unsigned char c) {
+static inline bool is_base64(unsigned char c)
+{
return (isalnum(c) || (c == '+') || (c == '/'));
}
-std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_len) {
+std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_len)
+{
std::string ret;
int i = 0;
int j = 0;
@@ -59,8 +61,7 @@ std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_
}
}
- if (i)
- {
+ if (i) {
for(j = i; j < 3; j++)
char_array_3[j] = '\0';
@@ -81,7 +82,8 @@ std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_
}
-std::string base64_decode(std::string const& encoded_string) {
+std::string base64_decode(std::string const& encoded_string)
+{
int in_len = encoded_string.size();
int i = 0;
int j = 0;
@@ -90,7 +92,8 @@ std::string base64_decode(std::string const& encoded_string) {
std::string ret;
while (in_len-- && ( encoded_string[in_] != '=') && is_base64(encoded_string[in_])) {
- char_array_4[i++] = encoded_string[in_]; in_++;
+ char_array_4[i++] = encoded_string[in_];
+ in_++;
if (i ==4) {
for (i = 0; i <4; i++)
char_array_4[i] = base64_chars.find(char_array_4[i]);
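
For reference, a usage sketch of the two helpers whose brace style changes above, assuming only the declarations in biconcor/base64.h; the round trip should reproduce the input.

#include <iostream>
#include <string>
#include "base64.h"

int main() {
  const std::string input = "biconcor";
  const std::string encoded = base64_encode(
      reinterpret_cast<const unsigned char*>(input.data()),
      static_cast<unsigned int>(input.size()));
  std::cout << encoded << "\n";                 // Ymljb25jb3I=
  std::cout << base64_decode(encoded) << "\n";  // biconcor
  return 0;
}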
diff --git a/biconcor/biconcor.cpp b/biconcor/biconcor.cpp
index f4e7c03fb..cb63e855d 100644
--- a/biconcor/biconcor.cpp
+++ b/biconcor/biconcor.cpp
@@ -150,22 +150,19 @@ int main(int argc, char* argv[])
cout << "TOTAL: " << total << endl;
if (htmlFlag) {
ppCollection.PrintHTML();
- }
- else {
- ppCollection.Print(prettyFlag);
+ } else {
+ ppCollection.Print(prettyFlag);
}
cout << "-|||- BICONCOR END -|||-" << endl << flush;
}
- }
- else if (queryFlag) {
+ } else if (queryFlag) {
cerr << "query is " << query << endl;
vector< string > queryString = alignment.Tokenize( query.c_str() );
PhrasePairCollection ppCollection( &suffixArray, &targetCorpus, &alignment, max_translation, max_example );
ppCollection.GetCollection( queryString );
if (htmlFlag) {
ppCollection.PrintHTML();
- }
- else {
+ } else {
ppCollection.Print(prettyFlag);
}
}
diff --git a/defer/PhraseDictionaryInterpolated.cpp b/defer/PhraseDictionaryInterpolated.cpp
index 764927081..93c74d956 100644
--- a/defer/PhraseDictionaryInterpolated.cpp
+++ b/defer/PhraseDictionaryInterpolated.cpp
@@ -29,155 +29,158 @@ using namespace std;
namespace Moses
{
- PhraseDictionaryInterpolated::PhraseDictionaryInterpolated
- (size_t numScoreComponent,size_t numInputScores,const PhraseDictionaryFeature* feature):
- PhraseDictionary(numScoreComponent,feature),
- m_targetPhrases(NULL),
- m_languageModels(NULL) {}
-
- bool PhraseDictionaryInterpolated::Load(
- const std::vector<FactorType> &input
- , const std::vector<FactorType> &output
- , const std::vector<std::string>& config
- , const std::vector<float> &weightT
- , size_t tableLimit
- , const LMList &languageModels
- , float weightWP) {
-
- m_languageModels = &languageModels;
- m_weightT = weightT;
- m_tableLimit = tableLimit;
- m_weightWP = weightWP;
-
- //The config should be as follows:
- //0-3: type factor factor num-components (as usual)
- //4: combination mode (e.g. naive)
- //5-(length-2): List of phrase-table files
- //length-1: Weight string, in the same format as used for tmcombine
-
- UTIL_THROW_IF(config.size() < 7, util::Exception, "Missing fields from phrase table configuration: expected at least 7");
- UTIL_THROW_IF(config[4] != "naive", util::Exception, "Unsupported combination mode: '" << config[4] << "'");
-
- // Create the dictionaries
- for (size_t i = 5; i < config.size()-1; ++i) {
- m_dictionaries.push_back(DictionaryHandle(new PhraseDictionaryTreeAdaptor(
- GetFeature()->GetNumScoreComponents(),
- GetFeature()->GetNumInputScores(),
- GetFeature())));
- bool ret = m_dictionaries.back()->Load(
- input,
- output,
- config[i],
- weightT,
- 0,
- languageModels,
- weightWP);
- if (!ret) return ret;
- }
-
- //Parse the weight strings
- for (util::TokenIter<util::SingleCharacter, false> featureWeights(config.back(), util::SingleCharacter(';')); featureWeights; ++featureWeights) {
- m_weights.push_back(vector<float>());
- float sum = 0;
- for (util::TokenIter<util::SingleCharacter, false> tableWeights(*featureWeights, util::SingleCharacter(',')); tableWeights; ++tableWeights) {
- const float weight = boost::lexical_cast<float>(*tableWeights);
- m_weights.back().push_back(weight);
- sum += weight;
- }
- UTIL_THROW_IF(m_weights.back().size() != m_dictionaries.size(), util::Exception,
- "Number of weights (" << m_weights.back().size() <<
- ") does not match number of dictionaries to combine (" << m_dictionaries.size() << ")");
- UTIL_THROW_IF(abs(sum - 1) > 0.01, util::Exception, "Weights not normalised");
+PhraseDictionaryInterpolated::PhraseDictionaryInterpolated
+(size_t numScoreComponent,size_t numInputScores,const PhraseDictionaryFeature* feature):
+ PhraseDictionary(numScoreComponent,feature),
+ m_targetPhrases(NULL),
+ m_languageModels(NULL) {}
+
+bool PhraseDictionaryInterpolated::Load(
+ const std::vector<FactorType> &input
+ , const std::vector<FactorType> &output
+ , const std::vector<std::string>& config
+ , const std::vector<float> &weightT
+ , size_t tableLimit
+ , const LMList &languageModels
+ , float weightWP)
+{
- }
+ m_languageModels = &languageModels;
+ m_weightT = weightT;
+ m_tableLimit = tableLimit;
+ m_weightWP = weightWP;
+
+ //The config should be as follows:
+ //0-3: type factor factor num-components (as usual)
+ //4: combination mode (e.g. naive)
+ //5-(length-2): List of phrase-table files
+ //length-1: Weight string, in the same format as used for tmcombine
+
+ UTIL_THROW_IF(config.size() < 7, util::Exception, "Missing fields from phrase table configuration: expected at least 7");
+ UTIL_THROW_IF(config[4] != "naive", util::Exception, "Unsupported combination mode: '" << config[4] << "'");
+
+ // Create the dictionaries
+ for (size_t i = 5; i < config.size()-1; ++i) {
+ m_dictionaries.push_back(DictionaryHandle(new PhraseDictionaryTreeAdaptor(
+ GetFeature()->GetNumScoreComponents(),
+ GetFeature()->GetNumInputScores(),
+ GetFeature())));
+ bool ret = m_dictionaries.back()->Load(
+ input,
+ output,
+ config[i],
+ weightT,
+ 0,
+ languageModels,
+ weightWP);
+ if (!ret) return ret;
+ }
- //check number of weight sets. Make sure there is a weight for every score component
- //except for the last - which is assumed to be the phrase penalty.
- UTIL_THROW_IF(m_weights.size() != 1 && m_weights.size() != GetFeature()->GetNumScoreComponents()-1, util::Exception, "Unexpected number of weight sets");
- //if 1 weight set, then repeat
- if (m_weights.size() == 1) {
- while(m_weights.size() < GetFeature()->GetNumScoreComponents()-1) {
- m_weights.push_back(m_weights[0]);
- }
+ //Parse the weight strings
+ for (util::TokenIter<util::SingleCharacter, false> featureWeights(config.back(), util::SingleCharacter(';')); featureWeights; ++featureWeights) {
+ m_weights.push_back(vector<float>());
+ float sum = 0;
+ for (util::TokenIter<util::SingleCharacter, false> tableWeights(*featureWeights, util::SingleCharacter(',')); tableWeights; ++tableWeights) {
+ const float weight = boost::lexical_cast<float>(*tableWeights);
+ m_weights.back().push_back(weight);
+ sum += weight;
}
+ UTIL_THROW_IF(m_weights.back().size() != m_dictionaries.size(), util::Exception,
+ "Number of weights (" << m_weights.back().size() <<
+ ") does not match number of dictionaries to combine (" << m_dictionaries.size() << ")");
+ UTIL_THROW_IF(abs(sum - 1) > 0.01, util::Exception, "Weights not normalised");
- return true;
}
- void PhraseDictionaryInterpolated::InitializeForInput(InputType const& source) {
- for (size_t i = 0; i < m_dictionaries.size(); ++i) {
- m_dictionaries[i]->InitializeForInput(source);
+ //check number of weight sets. Make sure there is a weight for every score component
+ //except for the last - which is assumed to be the phrase penalty.
+ UTIL_THROW_IF(m_weights.size() != 1 && m_weights.size() != GetFeature()->GetNumScoreComponents()-1, util::Exception, "Unexpected number of weight sets");
+ //if 1 weight set, then repeat
+ if (m_weights.size() == 1) {
+ while(m_weights.size() < GetFeature()->GetNumScoreComponents()-1) {
+ m_weights.push_back(m_weights[0]);
}
}
- typedef
- boost::unordered_set<TargetPhrase*,PhrasePtrHasher,PhrasePtrComparator> PhraseSet;
-
-
- const TargetPhraseCollection*
- PhraseDictionaryInterpolated::GetTargetPhraseCollection(const Phrase& src) const {
-
- delete m_targetPhrases;
- m_targetPhrases = new TargetPhraseCollection();
- PhraseSet allPhrases;
- vector<PhraseSet> phrasesByTable(m_dictionaries.size());
- for (size_t i = 0; i < m_dictionaries.size(); ++i) {
- const TargetPhraseCollection* phrases = m_dictionaries[i]->GetTargetPhraseCollection(src);
- if (phrases) {
- for (TargetPhraseCollection::const_iterator j = phrases->begin();
- j != phrases->end(); ++j) {
- allPhrases.insert(*j);
- phrasesByTable[i].insert(*j);
- }
+ return true;
+}
+
+void PhraseDictionaryInterpolated::InitializeForInput(InputType const& source)
+{
+ for (size_t i = 0; i < m_dictionaries.size(); ++i) {
+ m_dictionaries[i]->InitializeForInput(source);
+ }
+}
+
+typedef
+boost::unordered_set<TargetPhrase*,PhrasePtrHasher,PhrasePtrComparator> PhraseSet;
+
+
+const TargetPhraseCollection*
+PhraseDictionaryInterpolated::GetTargetPhraseCollection(const Phrase& src) const
+{
+
+ delete m_targetPhrases;
+ m_targetPhrases = new TargetPhraseCollection();
+ PhraseSet allPhrases;
+ vector<PhraseSet> phrasesByTable(m_dictionaries.size());
+ for (size_t i = 0; i < m_dictionaries.size(); ++i) {
+ const TargetPhraseCollection* phrases = m_dictionaries[i]->GetTargetPhraseCollection(src);
+ if (phrases) {
+ for (TargetPhraseCollection::const_iterator j = phrases->begin();
+ j != phrases->end(); ++j) {
+ allPhrases.insert(*j);
+ phrasesByTable[i].insert(*j);
}
}
- ScoreComponentCollection sparseVector;
- for (PhraseSet::const_iterator i = allPhrases.begin(); i != allPhrases.end(); ++i) {
- TargetPhrase* combinedPhrase = new TargetPhrase((Phrase)**i);
- //combinedPhrase->ResetScore();
- //cerr << *combinedPhrase << " " << combinedPhrase->GetScoreBreakdown() << endl;
- combinedPhrase->SetSourcePhrase((*i)->GetSourcePhrase());
- combinedPhrase->SetAlignTerm(&((*i)->GetAlignTerm()));
- combinedPhrase->SetAlignNonTerm(&((*i)->GetAlignTerm()));
- Scores combinedScores(GetFeature()->GetNumScoreComponents());
- for (size_t j = 0; j < phrasesByTable.size(); ++j) {
- PhraseSet::const_iterator tablePhrase = phrasesByTable[j].find(combinedPhrase);
- if (tablePhrase != phrasesByTable[j].end()) {
- Scores tableScores = (*tablePhrase)->GetScoreBreakdown()
- .GetScoresForProducer(GetFeature());
- //cerr << "Scores from " << j << " table: ";
- for (size_t k = 0; k < tableScores.size()-1; ++k) {
- //cerr << tableScores[k] << "(" << exp(tableScores[k]) << ") ";
- combinedScores[k] += m_weights[k][j] * exp(tableScores[k]);
- //cerr << m_weights[k][j] * exp(tableScores[k]) << " ";
- }
- //cerr << endl;
+ }
+ ScoreComponentCollection sparseVector;
+ for (PhraseSet::const_iterator i = allPhrases.begin(); i != allPhrases.end(); ++i) {
+ TargetPhrase* combinedPhrase = new TargetPhrase((Phrase)**i);
+ //combinedPhrase->ResetScore();
+ //cerr << *combinedPhrase << " " << combinedPhrase->GetScoreBreakdown() << endl;
+ combinedPhrase->SetSourcePhrase((*i)->GetSourcePhrase());
+ combinedPhrase->SetAlignTerm(&((*i)->GetAlignTerm()));
+ combinedPhrase->SetAlignNonTerm(&((*i)->GetAlignTerm()));
+ Scores combinedScores(GetFeature()->GetNumScoreComponents());
+ for (size_t j = 0; j < phrasesByTable.size(); ++j) {
+ PhraseSet::const_iterator tablePhrase = phrasesByTable[j].find(combinedPhrase);
+ if (tablePhrase != phrasesByTable[j].end()) {
+ Scores tableScores = (*tablePhrase)->GetScoreBreakdown()
+ .GetScoresForProducer(GetFeature());
+ //cerr << "Scores from " << j << " table: ";
+ for (size_t k = 0; k < tableScores.size()-1; ++k) {
+ //cerr << tableScores[k] << "(" << exp(tableScores[k]) << ") ";
+ combinedScores[k] += m_weights[k][j] * exp(tableScores[k]);
+ //cerr << m_weights[k][j] * exp(tableScores[k]) << " ";
}
+ //cerr << endl;
}
- //map back to log space
- //cerr << "Combined ";
- for (size_t k = 0; k < combinedScores.size()-1; ++k) {
- //cerr << combinedScores[k] << " ";
- combinedScores[k] = log(combinedScores[k]);
- //cerr << combinedScores[k] << " ";
- }
- //cerr << endl;
- combinedScores.back() = 1; //assume last is penalty
- combinedPhrase->SetScore(
- GetFeature(),
- combinedScores,
- sparseVector,
- m_weightT,
- m_weightWP,
- *m_languageModels);
- //cerr << *combinedPhrase << " " << combinedPhrase->GetScoreBreakdown() << endl;
- m_targetPhrases->Add(combinedPhrase);
}
+ //map back to log space
+ //cerr << "Combined ";
+ for (size_t k = 0; k < combinedScores.size()-1; ++k) {
+ //cerr << combinedScores[k] << " ";
+ combinedScores[k] = log(combinedScores[k]);
+ //cerr << combinedScores[k] << " ";
+ }
+ //cerr << endl;
+ combinedScores.back() = 1; //assume last is penalty
+ combinedPhrase->SetScore(
+ GetFeature(),
+ combinedScores,
+ sparseVector,
+ m_weightT,
+ m_weightWP,
+ *m_languageModels);
+ //cerr << *combinedPhrase << " " << combinedPhrase->GetScoreBreakdown() << endl;
+ m_targetPhrases->Add(combinedPhrase);
+ }
- m_targetPhrases->Prune(true,m_tableLimit);
+ m_targetPhrases->Prune(true,m_tableLimit);
- return m_targetPhrases;
- }
+ return m_targetPhrases;
+}
}
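
The "naive" combination mode re-indented above stores per-table scores as log probabilities, so it interpolates in probability space (weight times exp(score), summed over tables) and maps the result back with log(). A numeric sketch with made-up weights and scores:

#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  // One phrase pair seen in two tables, one score component each.
  std::vector<double> log_scores = {std::log(0.4), std::log(0.1)};
  std::vector<double> weights    = {0.7, 0.3};  // checked to sum to 1 in Load()

  double combined = 0.0;
  for (std::size_t j = 0; j < log_scores.size(); ++j)
    combined += weights[j] * std::exp(log_scores[j]);

  // log(0.7*0.4 + 0.3*0.1) = log(0.31)
  std::cout << std::log(combined) << "\n";
  return 0;
}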
diff --git a/defer/PhraseDictionaryInterpolated.h b/defer/PhraseDictionaryInterpolated.h
index 74add1833..9bb4dcc3c 100644
--- a/defer/PhraseDictionaryInterpolated.h
+++ b/defer/PhraseDictionaryInterpolated.h
@@ -34,12 +34,14 @@ namespace Moses
**/
class PhraseDictionaryInterpolated : public PhraseDictionary
{
- public:
+public:
PhraseDictionaryInterpolated
- (size_t numScoreComponent,size_t numInputScores,const PhraseDictionaryFeature* feature);
+ (size_t numScoreComponent,size_t numInputScores,const PhraseDictionaryFeature* feature);
- virtual ~PhraseDictionaryInterpolated() {delete m_targetPhrases;}
+ virtual ~PhraseDictionaryInterpolated() {
+ delete m_targetPhrases;
+ }
// initialize ...
bool Load(const std::vector<FactorType> &input
@@ -58,7 +60,7 @@ class PhraseDictionaryInterpolated : public PhraseDictionary
throw std::logic_error("PhraseDictionaryInterpolated.CreateRuleLookupManager() Not implemented");
}
- private:
+private:
typedef boost::shared_ptr<PhraseDictionaryTreeAdaptor> DictionaryHandle;
std::vector<DictionaryHandle> m_dictionaries;
diff --git a/defer/PhraseLengthFeatureTest.cpp b/defer/PhraseLengthFeatureTest.cpp
index 42026e805..6fb15e71e 100644
--- a/defer/PhraseLengthFeatureTest.cpp
+++ b/defer/PhraseLengthFeatureTest.cpp
@@ -31,7 +31,8 @@ BOOST_AUTO_TEST_SUITE(phrase_length_feature)
//TODO: Factor out setup code so that it can be reused
-static Word MakeWord(string text) {
+static Word MakeWord(string text)
+{
FactorCollection &factorCollection = FactorCollection::Instance();
const Factor* f = factorCollection.AddFactor(Input,0,text);
Word w;
@@ -40,7 +41,8 @@ static Word MakeWord(string text) {
}
-BOOST_AUTO_TEST_CASE(evaluate) {
+BOOST_AUTO_TEST_CASE(evaluate)
+{
Word w1 = MakeWord("w1");
Word w2 = MakeWord("y2");
Word w3 = MakeWord("x3");
@@ -78,7 +80,7 @@ BOOST_AUTO_TEST_CASE(evaluate) {
PhraseBasedFeatureContext context1(topt1,sentence);
PhraseBasedFeatureContext context2(topt2,sentence);
PhraseBasedFeatureContext context3(topt3,sentence);
-
+
PhraseLengthFeature plf;
ScoreComponentCollection acc1,acc2,acc3;
diff --git a/defer/TargetBigramFeatureTest.cpp b/defer/TargetBigramFeatureTest.cpp
index 4b8d00800..c651c8ed9 100644
--- a/defer/TargetBigramFeatureTest.cpp
+++ b/defer/TargetBigramFeatureTest.cpp
@@ -34,12 +34,13 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
using namespace std;
using namespace Moses;
-namespace MosesTest
+namespace MosesTest
{
BOOST_AUTO_TEST_SUITE(target_bigram)
-static Word MakeWord(string text) {
+static Word MakeWord(string text)
+{
FactorCollection &factorCollection = FactorCollection::Instance();
const Factor* f = factorCollection.AddFactor(Input,0,text);
Word w;
@@ -47,34 +48,32 @@ static Word MakeWord(string text) {
return w;
}
-class VocabFileFixture {
- public:
- template<class I>
- VocabFileFixture(I begin, I end)
- {
- char name[] = "TargetBigramXXXXXX";
- int fd = mkstemp(name);
- BOOST_CHECK(fd != -1);
- BOOST_CHECK(!close(fd));
- filename = name;
- ofstream out(name);
- for (I i = begin; i != end; ++i)
- {
- out << *i << endl;
- }
- out.close();
+class VocabFileFixture
+{
+public:
+ template<class I>
+ VocabFileFixture(I begin, I end) {
+ char name[] = "TargetBigramXXXXXX";
+ int fd = mkstemp(name);
+ BOOST_CHECK(fd != -1);
+ BOOST_CHECK(!close(fd));
+ filename = name;
+ ofstream out(name);
+ for (I i = begin; i != end; ++i) {
+ out << *i << endl;
}
+ out.close();
+ }
- ~VocabFileFixture()
- {
- BOOST_CHECK(!remove(filename.c_str()));
- }
+ ~VocabFileFixture() {
+ BOOST_CHECK(!remove(filename.c_str()));
+ }
- string filename;
+ string filename;
};
/*
-BOOST_AUTO_TEST_CASE(Test2)
+BOOST_AUTO_TEST_CASE(Test2)
{
HypothesisFixture hypos;
cerr << hypos.empty() << ", " << *hypos.empty() << endl;
@@ -113,7 +112,7 @@ BOOST_AUTO_TEST_CASE(score_components)
ScoreProducer::unlimited);
}
-BOOST_AUTO_TEST_CASE(empty_hypo)
+BOOST_AUTO_TEST_CASE(empty_hypo)
{
Sentence s;
TargetBigramFeature tbf;
@@ -124,7 +123,7 @@ BOOST_AUTO_TEST_CASE(empty_hypo)
}
//Test of evaluate() where a vocab is specified
-BOOST_AUTO_TEST_CASE(evaluate_vocab)
+BOOST_AUTO_TEST_CASE(evaluate_vocab)
{
string vocab[] = {"i", "do"};
VocabFileFixture vocabFile(vocab,vocab+2);
@@ -156,7 +155,7 @@ BOOST_AUTO_TEST_CASE(evaluate_all)
BOOST_CHECK_EQUAL(scc.GetScoreForProducer(&tbf, "do:not"),1);
BOOST_CHECK_EQUAL(scc.GetScoreForProducer(&tbf, "not:</s>"),0);
BOOST_CHECK(! currState->Compare(TargetBigramState(MakeWord("not"))));
-
+
}
BOOST_AUTO_TEST_CASE(evaluate_empty)
@@ -171,7 +170,7 @@ BOOST_AUTO_TEST_CASE(evaluate_empty)
BOOST_CHECK(! currState->Compare(*prevState));
}
-BOOST_AUTO_TEST_CASE(evaluate_eos)
+BOOST_AUTO_TEST_CASE(evaluate_eos)
{
HypothesisFixture hypos;
TargetBigramFeature tbf;
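
The VocabFileFixture rewritten above is an RAII temp-file pattern: mkstemp creates and opens a uniquely named file, the constructor writes the vocabulary into it, and the destructor removes it. A minimal version outside Boost.Test; the class name TempFile is invented.

#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <string>
#include <unistd.h>

class TempFile {
public:
  TempFile() {
    char name[] = "TargetBigramXXXXXX";  // mkstemp requires a XXXXXX suffix
    int fd = mkstemp(name);              // creates and opens a unique file
    if (fd != -1) close(fd);             // only the name is needed here
    filename = name;
  }
  ~TempFile() {
    std::remove(filename.c_str());       // clean up on scope exit
  }
  std::string filename;
};

int main() {
  TempFile tmp;
  std::ofstream(tmp.filename) << "i\ndo\n";  // write a two-word vocabulary
  return 0;
}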
diff --git a/mert/BleuScorer.cpp b/mert/BleuScorer.cpp
index 26723d36b..1d5caa394 100644
--- a/mert/BleuScorer.cpp
+++ b/mert/BleuScorer.cpp
@@ -18,7 +18,8 @@
using namespace std;
-namespace {
+namespace
+{
// configure regularisation
const char KEY_REFLEN[] = "reflen";
@@ -33,8 +34,9 @@ namespace MosesTuning
BleuScorer::BleuScorer(const string& config)
- : StatisticsBasedScorer("BLEU", config),
- m_ref_length_type(CLOSEST) {
+ : StatisticsBasedScorer("BLEU", config),
+ m_ref_length_type(CLOSEST)
+{
const string reflen = getConfig(KEY_REFLEN, REFLEN_CLOSEST);
if (reflen == REFLEN_AVERAGE) {
m_ref_length_type = AVERAGE;
@@ -101,7 +103,8 @@ void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
}
}
-bool BleuScorer::OpenReference(const char* filename, size_t file_id) {
+bool BleuScorer::OpenReference(const char* filename, size_t file_id)
+{
ifstream ifs(filename);
if (!ifs) {
cerr << "Cannot open " << filename << endl;
@@ -110,7 +113,8 @@ bool BleuScorer::OpenReference(const char* filename, size_t file_id) {
return OpenReferenceStream(&ifs, file_id);
}
-bool BleuScorer::OpenReferenceStream(istream* is, size_t file_id) {
+bool BleuScorer::OpenReferenceStream(istream* is, size_t file_id)
+{
if (is == NULL) return false;
string line;
@@ -203,25 +207,27 @@ statscore_t BleuScorer::calculateScore(const vector<int>& comps) const
return exp(logbleu);
}
-int BleuScorer::CalcReferenceLength(size_t sentence_id, size_t length) {
+int BleuScorer::CalcReferenceLength(size_t sentence_id, size_t length)
+{
switch (m_ref_length_type) {
- case AVERAGE:
- return m_references[sentence_id]->CalcAverage();
- break;
- case CLOSEST:
- return m_references[sentence_id]->CalcClosest(length);
- break;
- case SHORTEST:
- return m_references[sentence_id]->CalcShortest();
- break;
- default:
- cerr << "unknown reference types." << endl;
- exit(1);
+ case AVERAGE:
+ return m_references[sentence_id]->CalcAverage();
+ break;
+ case CLOSEST:
+ return m_references[sentence_id]->CalcClosest(length);
+ break;
+ case SHORTEST:
+ return m_references[sentence_id]->CalcShortest();
+ break;
+ default:
+    cerr << "unknown reference type." << endl;
+ exit(1);
}
}
void BleuScorer::DumpCounts(ostream* os,
- const NgramCounts& counts) const {
+ const NgramCounts& counts) const
+{
for (NgramCounts::const_iterator it = counts.begin();
it != counts.end(); ++it) {
*os << "(";
@@ -238,7 +244,8 @@ void BleuScorer::DumpCounts(ostream* os,
}
float smoothedSentenceBleu
- (const std::vector<float>& stats, float smoothing, bool smoothBP) {
+(const std::vector<float>& stats, float smoothing, bool smoothBP)
+{
CHECK(stats.size() == kBleuNgramOrder * 2 + 1);
@@ -247,8 +254,8 @@ float smoothedSentenceBleu
logbleu += log(stats[2 * j] + smoothing) - log(stats[2 * j + 1] + smoothing);
}
logbleu /= kBleuNgramOrder;
- const float reflength = stats[(kBleuNgramOrder * 2)] +
- (smoothBP ? smoothing : 0.0f);
+ const float reflength = stats[(kBleuNgramOrder * 2)] +
+ (smoothBP ? smoothing : 0.0f);
const float brevity = 1.0 - reflength / stats[1];
if (brevity < 0.0) {
@@ -263,7 +270,7 @@ float sentenceLevelBackgroundBleu(const std::vector<float>& sent, const std::vec
std::vector<float> stats;
CHECK(sent.size()==bg.size());
CHECK(sent.size()==kBleuNgramOrder*2+1);
- for(size_t i=0;i<sent.size();i++)
+ for(size_t i=0; i<sent.size(); i++)
stats.push_back(sent[i]+bg[i]);
// Calculate BLEU
@@ -282,7 +289,8 @@ float sentenceLevelBackgroundBleu(const std::vector<float>& sent, const std::vec
return exp(logbleu) * stats[kBleuNgramOrder*2];
}
-float unsmoothedBleu(const std::vector<float>& stats) {
+float unsmoothedBleu(const std::vector<float>& stats)
+{
CHECK(stats.size() == kBleuNgramOrder * 2 + 1);
float logbleu = 0.0;
@@ -298,50 +306,51 @@ float unsmoothedBleu(const std::vector<float>& stats) {
return exp(logbleu);
}
-vector<float> BleuScorer::ScoreNbestList(const string& scoreFile, const string& featureFile) {
- vector<string> scoreFiles;
- vector<string> featureFiles;
- scoreFiles.push_back(scoreFile);
- featureFiles.push_back(featureFile);
-
- vector<FeatureDataIterator> featureDataIters;
- vector<ScoreDataIterator> scoreDataIters;
- for (size_t i = 0; i < featureFiles.size(); ++i) {
- featureDataIters.push_back(FeatureDataIterator(featureFiles[i]));
- scoreDataIters.push_back(ScoreDataIterator(scoreFiles[i]));
- }
-
- vector<pair<size_t,size_t> > hypotheses;
- if (featureDataIters[0] == FeatureDataIterator::end()) {
- cerr << "Error: at the end of feature data iterator" << endl;
- exit(1);
- }
- for (size_t i = 0; i < featureFiles.size(); ++i) {
- if (featureDataIters[i] == FeatureDataIterator::end()) {
- cerr << "Error: Feature file " << i << " ended prematurely" << endl;
- exit(1);
- }
- if (scoreDataIters[i] == ScoreDataIterator::end()) {
- cerr << "Error: Score file " << i << " ended prematurely" << endl;
- exit(1);
- }
- if (featureDataIters[i]->size() != scoreDataIters[i]->size()) {
- cerr << "Error: features and scores have different size" << endl;
- exit(1);
- }
- for (size_t j = 0; j < featureDataIters[i]->size(); ++j) {
- hypotheses.push_back(pair<size_t,size_t>(i,j));
- }
- }
-
- // score the nbest list
- vector<float> bleuScores;
- for (size_t i=0; i < hypotheses.size(); ++i) {
- pair<size_t,size_t> translation = hypotheses[i];
- float bleu = smoothedSentenceBleu(scoreDataIters[translation.first]->operator[](translation.second));
- bleuScores.push_back(bleu);
- }
- return bleuScores;
+vector<float> BleuScorer::ScoreNbestList(const string& scoreFile, const string& featureFile)
+{
+ vector<string> scoreFiles;
+ vector<string> featureFiles;
+ scoreFiles.push_back(scoreFile);
+ featureFiles.push_back(featureFile);
+
+ vector<FeatureDataIterator> featureDataIters;
+ vector<ScoreDataIterator> scoreDataIters;
+ for (size_t i = 0; i < featureFiles.size(); ++i) {
+ featureDataIters.push_back(FeatureDataIterator(featureFiles[i]));
+ scoreDataIters.push_back(ScoreDataIterator(scoreFiles[i]));
+ }
+
+ vector<pair<size_t,size_t> > hypotheses;
+ if (featureDataIters[0] == FeatureDataIterator::end()) {
+ cerr << "Error: at the end of feature data iterator" << endl;
+ exit(1);
+ }
+ for (size_t i = 0; i < featureFiles.size(); ++i) {
+ if (featureDataIters[i] == FeatureDataIterator::end()) {
+ cerr << "Error: Feature file " << i << " ended prematurely" << endl;
+ exit(1);
+ }
+ if (scoreDataIters[i] == ScoreDataIterator::end()) {
+ cerr << "Error: Score file " << i << " ended prematurely" << endl;
+ exit(1);
+ }
+ if (featureDataIters[i]->size() != scoreDataIters[i]->size()) {
+ cerr << "Error: features and scores have different size" << endl;
+ exit(1);
+ }
+ for (size_t j = 0; j < featureDataIters[i]->size(); ++j) {
+ hypotheses.push_back(pair<size_t,size_t>(i,j));
+ }
+ }
+
+ // score the nbest list
+ vector<float> bleuScores;
+ for (size_t i=0; i < hypotheses.size(); ++i) {
+ pair<size_t,size_t> translation = hypotheses[i];
+ float bleu = smoothedSentenceBleu(scoreDataIters[translation.first]->operator[](translation.second));
+ bleuScores.push_back(bleu);
+ }
+ return bleuScores;
}
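
In smoothedSentenceBleu above, stats holds (matched, total) pairs for n = 1..kBleuNgramOrder followed by the reference length, so stats[1] is the hypothesis length; adding the smoothing constant to both counts keeps log() finite when some n-gram order has zero matches. A sketch of the same computation with invented counts:

#include <cmath>
#include <iostream>
#include <vector>

int main() {
  const int order = 4;
  // matched/total for 1..4-grams, then the reference length.
  std::vector<float> stats = {5, 6, 3, 5, 2, 4, 1, 3, 7};
  const float smoothing = 1.0f;

  float logbleu = 0.0f;
  for (int j = 0; j < order; ++j)
    logbleu += std::log(stats[2 * j] + smoothing)
               - std::log(stats[2 * j + 1] + smoothing);
  logbleu /= order;

  // Brevity penalty applies only when the hypothesis is shorter than the reference.
  const float brevity = 1.0f - stats[2 * order] / stats[1];
  if (brevity < 0.0f) logbleu += brevity;

  std::cout << std::exp(logbleu) << "\n";
  return 0;
}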
diff --git a/mert/BleuScorer.h b/mert/BleuScorer.h
index af889b13e..248b3e1d1 100644
--- a/mert/BleuScorer.h
+++ b/mert/BleuScorer.h
@@ -38,14 +38,22 @@ public:
virtual void setReferenceFiles(const std::vector<std::string>& referenceFiles);
virtual void prepareStats(std::size_t sid, const std::string& text, ScoreStats& entry);
virtual statscore_t calculateScore(const std::vector<int>& comps) const;
- virtual std::size_t NumberOfScores() const { return 2 * kBleuNgramOrder + 1; }
+ virtual std::size_t NumberOfScores() const {
+ return 2 * kBleuNgramOrder + 1;
+ }
int CalcReferenceLength(std::size_t sentence_id, std::size_t length);
- ReferenceLengthType GetReferenceLengthType() const { return m_ref_length_type; }
- void SetReferenceLengthType(ReferenceLengthType type) { m_ref_length_type = type; }
+ ReferenceLengthType GetReferenceLengthType() const {
+ return m_ref_length_type;
+ }
+ void SetReferenceLengthType(ReferenceLengthType type) {
+ m_ref_length_type = type;
+ }
- const std::vector<Reference*>& GetReferences() const { return m_references.get(); }
+ const std::vector<Reference*>& GetReferences() const {
+ return m_references.get();
+ }
/**
* Count the ngrams of each type, up to the given length in the input line.
@@ -74,7 +82,7 @@ private:
* This function is used in PRO.
*/
float smoothedSentenceBleu
- (const std::vector<float>& stats, float smoothing=1.0, bool smoothBP=false);
+(const std::vector<float>& stats, float smoothing=1.0, bool smoothBP=false);
/** Computes sentence-level BLEU score given a background corpus.
* This function is used in batch MIRA.
diff --git a/mert/BleuScorerTest.cpp b/mert/BleuScorerTest.cpp
index 136f134eb..a63196a3b 100644
--- a/mert/BleuScorerTest.cpp
+++ b/mert/BleuScorerTest.cpp
@@ -10,16 +10,19 @@
using namespace MosesTuning;
-namespace {
+namespace
+{
NgramCounts* g_counts = NULL;
-NgramCounts* GetNgramCounts() {
+NgramCounts* GetNgramCounts()
+{
assert(g_counts);
return g_counts;
}
-void SetNgramCounts(NgramCounts* counts) {
+void SetNgramCounts(NgramCounts* counts)
+{
g_counts = counts;
}
@@ -58,33 +61,38 @@ struct Fourgram {
NgramCounts::Key instance;
};
-bool CheckUnigram(const std::string& str) {
+bool CheckUnigram(const std::string& str)
+{
Unigram unigram(str);
NgramCounts::Value v;
return GetNgramCounts()->Lookup(unigram.instance, &v);
}
-bool CheckBigram(const std::string& a, const std::string& b) {
+bool CheckBigram(const std::string& a, const std::string& b)
+{
Bigram bigram(a, b);
NgramCounts::Value v;
return GetNgramCounts()->Lookup(bigram.instance, &v);
}
bool CheckTrigram(const std::string& a, const std::string& b,
- const std::string& c) {
+ const std::string& c)
+{
Trigram trigram(a, b, c);
NgramCounts::Value v;
return GetNgramCounts()->Lookup(trigram.instance, &v);
}
bool CheckFourgram(const std::string& a, const std::string& b,
- const std::string& c, const std::string& d) {
+ const std::string& c, const std::string& d)
+{
Fourgram fourgram(a, b, c, d);
NgramCounts::Value v;
return GetNgramCounts()->Lookup(fourgram.instance, &v);
}
-void SetUpReferences(BleuScorer& scorer) {
+void SetUpReferences(BleuScorer& scorer)
+{
// The following examples are taken from Koehn, "Statistical Machine Translation",
// Cambridge University Press, 2010.
{
@@ -115,7 +123,8 @@ void SetUpReferences(BleuScorer& scorer) {
} // namespace
-BOOST_AUTO_TEST_CASE(bleu_reference_type) {
+BOOST_AUTO_TEST_CASE(bleu_reference_type)
+{
BleuScorer scorer;
// BleuScorer will use "closest" by default.
BOOST_CHECK_EQUAL(scorer.GetReferenceLengthType(), BleuScorer::CLOSEST);
@@ -127,7 +136,8 @@ BOOST_AUTO_TEST_CASE(bleu_reference_type) {
BOOST_CHECK_EQUAL(scorer.GetReferenceLengthType(), BleuScorer::SHORTEST);
}
-BOOST_AUTO_TEST_CASE(bleu_reference_type_with_config) {
+BOOST_AUTO_TEST_CASE(bleu_reference_type_with_config)
+{
{
BleuScorer scorer("reflen:average");
BOOST_CHECK_EQUAL(scorer.GetReferenceLengthType(), BleuScorer::AVERAGE);
@@ -139,7 +149,8 @@ BOOST_AUTO_TEST_CASE(bleu_reference_type_with_config) {
}
}
-BOOST_AUTO_TEST_CASE(bleu_count_ngrams) {
+BOOST_AUTO_TEST_CASE(bleu_count_ngrams)
+{
BleuScorer scorer;
std::string line = "I saw a girl with a telescope .";
@@ -198,7 +209,8 @@ BOOST_AUTO_TEST_CASE(bleu_count_ngrams) {
BOOST_CHECK(CheckFourgram("with", "a", "telescope", "."));
}
-BOOST_AUTO_TEST_CASE(bleu_clipped_counts) {
+BOOST_AUTO_TEST_CASE(bleu_clipped_counts)
+{
BleuScorer scorer;
SetUpReferences(scorer);
std::string line("israeli officials responsibility of airport safety");
@@ -220,7 +232,8 @@ BOOST_AUTO_TEST_CASE(bleu_clipped_counts) {
BOOST_CHECK_EQUAL(entry.get(7), 3); // fourgram
}
-BOOST_AUTO_TEST_CASE(calculate_actual_score) {
+BOOST_AUTO_TEST_CASE(calculate_actual_score)
+{
BOOST_REQUIRE(4 == kBleuNgramOrder);
std::vector<int> stats(2 * kBleuNgramOrder + 1);
BleuScorer scorer;
@@ -247,7 +260,8 @@ BOOST_AUTO_TEST_CASE(calculate_actual_score) {
BOOST_CHECK_CLOSE(0.5115f, scorer.calculateScore(stats), 0.01);
}
-BOOST_AUTO_TEST_CASE(sentence_level_bleu) {
+BOOST_AUTO_TEST_CASE(sentence_level_bleu)
+{
BOOST_REQUIRE(4 == kBleuNgramOrder);
std::vector<float> stats(2 * kBleuNgramOrder + 1);
diff --git a/mert/CderScorer.cpp b/mert/CderScorer.cpp
index cece29034..21a80ad52 100644
--- a/mert/CderScorer.cpp
+++ b/mert/CderScorer.cpp
@@ -6,9 +6,11 @@
using namespace std;
-namespace {
+namespace
+{
-inline int CalcDistance(int word1, int word2) {
+inline int CalcDistance(int word1, int word2)
+{
return word1 == word2 ? 0 : 1;
}
@@ -16,11 +18,11 @@ inline int CalcDistance(int word1, int word2) {
namespace MosesTuning
{
-
+
CderScorer::CderScorer(const string& config, bool allowed_long_jumps)
- : StatisticsBasedScorer(allowed_long_jumps ? "CDER" : "WER", config),
- m_allowed_long_jumps(allowed_long_jumps) {}
+ : StatisticsBasedScorer(allowed_long_jumps ? "CDER" : "WER", config),
+ m_allowed_long_jumps(allowed_long_jumps) {}
CderScorer::~CderScorer() {}
@@ -82,7 +84,8 @@ float CderScorer::calculateScore(const vector<int>& comps) const
}
void CderScorer::computeCD(const sent_t& cand, const sent_t& ref,
- vector<int>& stats) const {
+ vector<int>& stats) const
+{
  int I = cand.size() + 1; // Number of inter-word positions in candidate sentence
  int L = ref.size() + 1; // Number of inter-word positions in reference sentence
@@ -95,11 +98,9 @@ void CderScorer::computeCD(const sent_t& cand, const sent_t& ref,
for (int i = 1; i < I; ++i) (*row)[i] = 1;
// Calculating costs for next row using costs from the previous row.
- while (++l < L)
- {
+ while (++l < L) {
vector<int>* nextRow = new vector<int>(I);
- for (int i = 0; i < I; ++i)
- {
+ for (int i = 0; i < I; ++i) {
vector<int> possibleCosts;
if (i > 0) {
possibleCosts.push_back((*nextRow)[i-1] + 1); // Deletion
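
computeCD above keeps only one DP row alive at a time; the recurrence in this hunk (deletion, insertion, substitution) is plain Levenshtein distance, which CDER then extends with long jumps. A standalone character-level sketch of the row-by-row recurrence, with word IDs and CalcDistance replaced by a char comparison:

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

int edit_distance(const std::string& cand, const std::string& ref) {
  std::vector<int> row(cand.size() + 1);
  for (std::size_t i = 0; i <= cand.size(); ++i) row[i] = i;  // first row
  for (std::size_t l = 1; l <= ref.size(); ++l) {
    std::vector<int> next(cand.size() + 1);
    next[0] = row[0] + 1;                        // leading insertions
    for (std::size_t i = 1; i <= cand.size(); ++i) {
      int sub = row[i - 1] + (cand[i - 1] == ref[l - 1] ? 0 : 1);
      next[i] = std::min({next[i - 1] + 1,       // deletion
                          row[i] + 1,            // insertion
                          sub});                 // substitution or match
    }
    row.swap(next);                              // keep only the latest row
  }
  return row.back();
}

int main() {
  std::cout << edit_distance("kitten", "sitting") << "\n";  // 3
  return 0;
}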
diff --git a/mert/CderScorer.h b/mert/CderScorer.h
index 60b6ad125..bd43ec0d8 100644
--- a/mert/CderScorer.h
+++ b/mert/CderScorer.h
@@ -8,13 +8,14 @@
namespace MosesTuning
{
-
+
/**
* CderScorer class can compute both CDER and WER metric.
*/
-class CderScorer: public StatisticsBasedScorer {
- public:
+class CderScorer: public StatisticsBasedScorer
+{
+public:
explicit CderScorer(const std::string& config, bool allowed_long_jumps = true);
~CderScorer();
@@ -24,11 +25,13 @@ class CderScorer: public StatisticsBasedScorer {
virtual void prepareStatsVector(std::size_t sid, const std::string& text, std::vector<int>& stats);
- virtual std::size_t NumberOfScores() const { return 2; }
+ virtual std::size_t NumberOfScores() const {
+ return 2;
+ }
virtual float calculateScore(const std::vector<int>& comps) const;
- private:
+private:
bool m_allowed_long_jumps;
typedef std::vector<int> sent_t;
diff --git a/mert/Data.cpp b/mert/Data.cpp
index 1efa080a2..613ce419b 100644
--- a/mert/Data.cpp
+++ b/mert/Data.cpp
@@ -27,11 +27,11 @@ namespace MosesTuning
{
Data::Data(Scorer* scorer, const string& sparse_weights_file)
- : m_scorer(scorer),
- m_score_type(m_scorer->getName()),
- m_num_scores(0),
- m_score_data(new ScoreData(m_scorer)),
- m_feature_data(new FeatureData)
+ : m_scorer(scorer),
+ m_score_type(m_scorer->getName()),
+ m_num_scores(0),
+ m_score_data(new ScoreData(m_scorer)),
+ m_feature_data(new FeatureData)
{
TRACE_ERR("Data::m_score_type " << m_score_type << endl);
TRACE_ERR("Data::Scorer type from Scorer: " << m_scorer->getName() << endl);
@@ -48,7 +48,8 @@ Data::Data(Scorer* scorer, const string& sparse_weights_file)
//ADDED BY TS
// TODO: This is too long; consider creating additional functions to
// reduce the lines of this function.
-void Data::removeDuplicates() {
+void Data::removeDuplicates()
+{
size_t nSentences = m_feature_data->size();
assert(m_score_data->size() == nSentences);
@@ -128,7 +129,8 @@ void Data::removeDuplicates() {
}
//END_ADDED
-void Data::load(const std::string &featfile, const std::string &scorefile) {
+void Data::load(const std::string &featfile, const std::string &scorefile)
+{
m_feature_data->load(featfile, m_sparse_weights);
m_score_data->load(scorefile);
}
@@ -192,7 +194,8 @@ void Data::loadNBest(const string &file)
}
}
-void Data::save(const std::string &featfile, const std::string &scorefile, bool bin) {
+void Data::save(const std::string &featfile, const std::string &scorefile, bool bin)
+{
if (bin)
cerr << "Binary write mode is selected" << endl;
else
@@ -202,7 +205,8 @@ void Data::save(const std::string &featfile, const std::string &scorefile, bool
m_score_data->save(scorefile, bin);
}
-void Data::InitFeatureMap(const string& str) {
+void Data::InitFeatureMap(const string& str)
+{
string buf = str;
string substr;
string features = "";
@@ -231,7 +235,8 @@ void Data::InitFeatureMap(const string& str) {
}
void Data::AddFeatures(const string& str,
- int sentence_index) {
+ int sentence_index)
+{
string buf = str;
string substr;
FeatureStats feature_entry;
diff --git a/mert/Data.h b/mert/Data.h
index e17ac0239..cd090bad3 100644
--- a/mert/Data.h
+++ b/mert/Data.h
@@ -44,18 +44,28 @@ public:
m_feature_data->clear();
}
- ScoreDataHandle getScoreData() { return m_score_data; }
+ ScoreDataHandle getScoreData() {
+ return m_score_data;
+ }
- FeatureDataHandle getFeatureData() { return m_feature_data; }
+ FeatureDataHandle getFeatureData() {
+ return m_feature_data;
+ }
- Scorer* getScorer() { return m_scorer; }
+ Scorer* getScorer() {
+ return m_scorer;
+ }
std::size_t NumberOfFeatures() const {
return m_feature_data->NumberOfFeatures();
}
- std::string Features() const { return m_feature_data->Features(); }
- void Features(const std::string &f) { m_feature_data->Features(f); }
+ std::string Features() const {
+ return m_feature_data->Features();
+ }
+ void Features(const std::string &f) {
+ m_feature_data->Features(f);
+ }
void loadNBest(const std::string &file);
diff --git a/mert/DataTest.cpp b/mert/DataTest.cpp
index 189d8ccda..911171e0b 100644
--- a/mert/DataTest.cpp
+++ b/mert/DataTest.cpp
@@ -10,7 +10,8 @@
using namespace MosesTuning;
//very basic test of sharding
-BOOST_AUTO_TEST_CASE(shard_basic) {
+BOOST_AUTO_TEST_CASE(shard_basic)
+{
boost::scoped_ptr<Scorer> scorer(ScorerFactory::getScorer("BLEU", ""));
Data data(scorer.get());
FeatureArray fa1, fa2, fa3, fa4;
@@ -39,7 +40,8 @@ BOOST_AUTO_TEST_CASE(shard_basic) {
BOOST_CHECK_EQUAL(shards[1].getFeatureData()->size(),(std::size_t)2);
}
-BOOST_AUTO_TEST_CASE(init_feature_map_test) {
+BOOST_AUTO_TEST_CASE(init_feature_map_test)
+{
boost::scoped_ptr<Scorer> scorer(ScorerFactory::getScorer("BLEU", ""));
Data data(scorer.get());
@@ -49,7 +51,8 @@ BOOST_AUTO_TEST_CASE(init_feature_map_test) {
BOOST_CHECK_EQUAL(expected, data.Features());
}
-BOOST_AUTO_TEST_CASE(add_features_test) {
+BOOST_AUTO_TEST_CASE(add_features_test)
+{
boost::scoped_ptr<Scorer> scorer(ScorerFactory::getScorer("BLEU", ""));
Data data(scorer.get());
diff --git a/mert/Fdstream.h b/mert/Fdstream.h
index 6dbdb40a6..c59052f02 100644
--- a/mert/Fdstream.h
+++ b/mert/Fdstream.h
@@ -13,27 +13,27 @@
#define BUFFER_SIZE (32768)
-namespace MosesTuning
+namespace MosesTuning
{
class _fdstream
{
protected:
_fdstream() :
- _file_descriptor(-1), _filebuf(NULL)
+ _file_descriptor(-1), _filebuf(NULL)
{ }
_fdstream(int file_descriptor, std::ios_base::openmode openmode) :
- _file_descriptor(file_descriptor), _openmode(openmode)
- {
+ _file_descriptor(file_descriptor), _openmode(openmode) {
_filebuf = NULL;
open(file_descriptor, openmode);
}
- std::ios_base::openmode openmode() const { return _openmode; }
+ std::ios_base::openmode openmode() const {
+ return _openmode;
+ }
- void open(int file_descriptor, std::ios_base::openmode openmode)
- {
+ void open(int file_descriptor, std::ios_base::openmode openmode) {
if (!_filebuf)
// We create a C++ stream from a file descriptor
// stdio_filebuf is not synced with stdio.
@@ -41,11 +41,10 @@ protected:
// You can also create the filebuf from a FILE* with
// FILE* f = fdopen(file_descriptor, mode);
_filebuf = new __gnu_cxx::stdio_filebuf<char> (file_descriptor,
- openmode);
+ openmode);
}
- virtual ~_fdstream()
- {
+ virtual ~_fdstream() {
close(_file_descriptor);
delete _filebuf;
_filebuf = NULL;
@@ -60,59 +59,51 @@ class ifdstream : public _fdstream
{
public:
ifdstream() :
- _fdstream(), _stream(NULL)
+ _fdstream(), _stream(NULL)
{ }
ifdstream(int file_descriptor) :
- _fdstream(file_descriptor, std::ios_base::in)
- {
+ _fdstream(file_descriptor, std::ios_base::in) {
_stream = new std::istream(_filebuf);
}
- void open(int file_descriptor)
- {
- if (!_stream)
- {
- _fdstream::open(file_descriptor, std::ios_base::in);
- _stream = new std::istream(_filebuf);
- }
+ void open(int file_descriptor) {
+ if (!_stream) {
+ _fdstream::open(file_descriptor, std::ios_base::in);
+ _stream = new std::istream(_filebuf);
+ }
}
- ifdstream& operator>> (std::string& str)
- {
+ ifdstream& operator>> (std::string& str) {
(*_stream) >> str;
return *this;
}
- std::size_t getline(std::string& str)
- {
+ std::size_t getline(std::string& str) {
char tmp[BUFFER_SIZE];
std::size_t ret = getline(tmp, BUFFER_SIZE);
str = tmp;
return ret;
}
- std::size_t getline(char* s, std::streamsize n)
- {
+ std::size_t getline(char* s, std::streamsize n) {
return (getline(s, n, '\n'));
}
- std::size_t getline(char* s, std::streamsize n, char delim)
- {
+ std::size_t getline(char* s, std::streamsize n, char delim) {
int i = 0;
- do{
+ do {
s[i] = _stream->get();
i++;
- }while(i < n-1 && s[i-1] != delim && s[i-1] != '\0');
+ } while(i < n-1 && s[i-1] != delim && s[i-1] != '\0');
    s[i-1] = '\0'; // overwrite the delimiter with the string terminator
return i-1;
}
- ~ifdstream()
- {
+ ~ifdstream() {
//this->~_fdstream();
delete _stream;
}
@@ -125,27 +116,23 @@ class ofdstream : public _fdstream
{
public:
ofdstream() :
- _fdstream(), _stream(NULL)
+ _fdstream(), _stream(NULL)
{ }
ofdstream(int file_descriptor) :
- _fdstream(file_descriptor, std::ios_base::out)
- {
+ _fdstream(file_descriptor, std::ios_base::out) {
_stream = new std::ostream(_filebuf);
}
- void open(int file_descriptor)
- {
- if (!_stream)
- {
+ void open(int file_descriptor) {
+ if (!_stream) {
_fdstream::open(file_descriptor, std::ios_base::out);
_stream = new std::ostream(_filebuf);
}
}
- ofdstream& operator<< (const std::string& str)
- {
+ ofdstream& operator<< (const std::string& str) {
if (_stream->good())
(*_stream) << str;
@@ -153,8 +140,7 @@ public:
return *this;
}
- ~ofdstream()
- {
+ ~ofdstream() {
//this->~_fdstream();
delete _stream;
}
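
ifdstream and ofdstream above wrap a raw POSIX file descriptor in a C++ stream via GNU's stdio_filebuf. A small usage sketch, assuming mert/Fdstream.h is on the include path; the pipe is only for illustration:

#include <unistd.h>
#include <cstring>
#include <string>
#include "Fdstream.h"
using namespace MosesTuning;

int main()
{
  int fds[2];
  if (pipe(fds) != 0) return 1;
  const char msg[] = "hello\n";
  write(fds[1], msg, strlen(msg));
  close(fds[1]);

  ifdstream in(fds[0]);   // adopts the read end; closed in the destructor
  std::string line;
  in.getline(line);       // reads up to the '\n' delimiter
  return line == "hello" ? 0 : 1;
}
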
diff --git a/mert/FeatureArray.cpp b/mert/FeatureArray.cpp
index fd5fea200..d49b53b96 100644
--- a/mert/FeatureArray.cpp
+++ b/mert/FeatureArray.cpp
@@ -19,14 +19,14 @@ namespace MosesTuning
FeatureArray::FeatureArray()
- : m_index(0), m_num_features(0){}
+ : m_index(0), m_num_features(0) {}
FeatureArray::~FeatureArray() {}
void FeatureArray::savetxt(ostream* os)
{
*os << FEATURES_TXT_BEGIN << " " << m_index << " " << m_array.size()
- << " " << m_num_features << " " << m_features << endl;
+ << " " << m_num_features << " " << m_features << endl;
for (featarray_t::iterator i = m_array.begin(); i != m_array.end(); ++i) {
i->savetxt(os);
*os << endl;
@@ -37,7 +37,7 @@ void FeatureArray::savetxt(ostream* os)
void FeatureArray::savebin(ostream* os)
{
*os << FEATURES_BIN_BEGIN << " " << m_index << " " << m_array.size()
- << " " << m_num_features << " " << m_features << endl;
+ << " " << m_num_features << " " << m_features << endl;
for (featarray_t::iterator i = m_array.begin(); i != m_array.end(); ++i)
i->savebin(os);
diff --git a/mert/FeatureArray.h b/mert/FeatureArray.h
index 03fe6b40c..f5fc489a2 100644
--- a/mert/FeatureArray.h
+++ b/mert/FeatureArray.h
@@ -36,16 +36,28 @@ public:
FeatureArray();
~FeatureArray();
- void clear() { m_array.clear(); }
+ void clear() {
+ m_array.clear();
+ }
- int getIndex() const { return m_index; }
- void setIndex(const int value) { m_index = value; }
+ int getIndex() const {
+ return m_index;
+ }
+ void setIndex(const int value) {
+ m_index = value;
+ }
- FeatureStats& get(std::size_t i) { return m_array.at(i); }
- const FeatureStats& get(std::size_t i) const { return m_array.at(i); }
+ FeatureStats& get(std::size_t i) {
+ return m_array.at(i);
+ }
+ const FeatureStats& get(std::size_t i) const {
+ return m_array.at(i);
+ }
- void add(FeatureStats& e) { m_array.push_back(e); }
+ void add(FeatureStats& e) {
+ m_array.push_back(e);
+ }
//ADDED BY TS
void swap(std::size_t i, std::size_t j) {
@@ -59,13 +71,23 @@ public:
void merge(FeatureArray& e);
- std::size_t size() const { return m_array.size(); }
+ std::size_t size() const {
+ return m_array.size();
+ }
- std::size_t NumberOfFeatures() const { return m_num_features; }
- void NumberOfFeatures(std::size_t v) { m_num_features = v; }
+ std::size_t NumberOfFeatures() const {
+ return m_num_features;
+ }
+ void NumberOfFeatures(std::size_t v) {
+ m_num_features = v;
+ }
- std::string Features() const { return m_features; }
- void Features(const std::string& f) { m_features = f; }
+ std::string Features() const {
+ return m_features;
+ }
+ void Features(const std::string& f) {
+ m_features = f;
+ }
void savetxt(std::ostream* os);
void savebin(std::ostream* os);
diff --git a/mert/FeatureData.cpp b/mert/FeatureData.cpp
index 75888ef6d..13b9b3a96 100644
--- a/mert/FeatureData.cpp
+++ b/mert/FeatureData.cpp
@@ -20,7 +20,7 @@ namespace MosesTuning
FeatureData::FeatureData()
- : m_num_features(0) {}
+ : m_num_features(0) {}
void FeatureData::save(ostream* os, bool bin)
{
@@ -38,7 +38,8 @@ void FeatureData::save(const string &file, bool bin)
ofs.close();
}
-void FeatureData::save(bool bin) {
+void FeatureData::save(bool bin)
+{
save(&cout, bin);
}
@@ -145,7 +146,8 @@ void FeatureData::setFeatureMap(const string& feat)
}
}
-string FeatureData::ToString() const {
+string FeatureData::ToString() const
+{
string res;
{
diff --git a/mert/FeatureData.h b/mert/FeatureData.h
index 79e52b330..2510b3aee 100644
--- a/mert/FeatureData.h
+++ b/mert/FeatureData.h
@@ -33,7 +33,9 @@ public:
FeatureData();
~FeatureData() {}
- void clear() { m_array.clear(); }
+ void clear() {
+ m_array.clear();
+ }
FeatureArray& get(size_t idx) {
return m_array.at(idx);
@@ -61,13 +63,23 @@ public:
void add(FeatureArray& e);
void add(FeatureStats& e, int sent_idx);
- std::size_t size() const { return m_array.size(); }
+ std::size_t size() const {
+ return m_array.size();
+ }
- std::size_t NumberOfFeatures() const { return m_num_features; }
- void NumberOfFeatures(std::size_t v) { m_num_features = v; }
+ std::size_t NumberOfFeatures() const {
+ return m_num_features;
+ }
+ void NumberOfFeatures(std::size_t v) {
+ m_num_features = v;
+ }
- std::string Features() const { return m_features; }
- void Features(const std::string& f) { m_features = f; }
+ std::string Features() const {
+ return m_features;
+ }
+ void Features(const std::string& f) {
+ m_features = f;
+ }
void save(const std::string &file, bool bin=false);
void save(std::ostream* os, bool bin=false);
diff --git a/mert/FeatureDataIterator.cpp b/mert/FeatureDataIterator.cpp
index 471da07ee..9deb0ac50 100644
--- a/mert/FeatureDataIterator.cpp
+++ b/mert/FeatureDataIterator.cpp
@@ -32,9 +32,10 @@ using namespace util;
namespace MosesTuning
{
-
-int ParseInt(const StringPiece& str ) {
+
+int ParseInt(const StringPiece& str )
+{
char* errIndex;
//could wrap?
int value = static_cast<int>(strtol(str.data(), &errIndex,10));
@@ -44,7 +45,8 @@ int ParseInt(const StringPiece& str ) {
return value;
}
-float ParseFloat(const StringPiece& str) {
+float ParseFloat(const StringPiece& str)
+{
char* errIndex;
float value = static_cast<float>(strtod(str.data(), &errIndex));
if (errIndex == str.data()) {
@@ -53,11 +55,13 @@ float ParseFloat(const StringPiece& str) {
return value;
}
-bool operator==(FeatureDataItem const& item1, FeatureDataItem const& item2) {
+bool operator==(FeatureDataItem const& item1, FeatureDataItem const& item2)
+{
  return item1.dense==item2.dense && item1.sparse==item2.sparse;
}
-size_t hash_value(FeatureDataItem const& item) {
+size_t hash_value(FeatureDataItem const& item)
+{
size_t seed = 0;
boost::hash_combine(seed,item.dense);
boost::hash_combine(seed,item.sparse);
@@ -67,14 +71,16 @@ size_t hash_value(FeatureDataItem const& item) {
FeatureDataIterator::FeatureDataIterator() {}
-FeatureDataIterator::FeatureDataIterator(const string& filename) {
+FeatureDataIterator::FeatureDataIterator(const string& filename)
+{
m_in.reset(new FilePiece(filename.c_str()));
readNext();
}
FeatureDataIterator::~FeatureDataIterator() {}
-void FeatureDataIterator::readNext() {
+void FeatureDataIterator::readNext()
+{
m_next.clear();
try {
StringPiece marker = m_in->ReadDelimited();
@@ -101,7 +107,7 @@ void FeatureDataIterator::readNext() {
//sparse feature
StringPiece second = *value;
float floatValue = ParseFloat(second);
- m_next.back().sparse.set(first.as_string(),floatValue);
+ m_next.back().sparse.set(first.as_string(),floatValue);
}
}
if (length != m_next.back().dense.size()) {
@@ -117,11 +123,13 @@ void FeatureDataIterator::readNext() {
}
}
-void FeatureDataIterator::increment() {
+void FeatureDataIterator::increment()
+{
readNext();
}
-bool FeatureDataIterator::equal(const FeatureDataIterator& rhs) const {
+bool FeatureDataIterator::equal(const FeatureDataIterator& rhs) const
+{
if (!m_in && !rhs.m_in) {
return true;
} else if (!m_in) {
@@ -129,12 +137,13 @@ bool FeatureDataIterator::equal(const FeatureDataIterator& rhs) const {
} else if (!rhs.m_in) {
return false;
} else {
- return m_in->FileName() == rhs.m_in->FileName() &&
- m_in->Offset() == rhs.m_in->Offset();
+ return m_in->FileName() == rhs.m_in->FileName() &&
+ m_in->Offset() == rhs.m_in->Offset();
}
}
-const vector<FeatureDataItem>& FeatureDataIterator::dereference() const {
+const vector<FeatureDataItem>& FeatureDataIterator::dereference() const
+{
return m_next;
}
diff --git a/mert/FeatureDataIterator.h b/mert/FeatureDataIterator.h
index 8bbb8d497..15a654182 100644
--- a/mert/FeatureDataIterator.h
+++ b/mert/FeatureDataIterator.h
@@ -37,18 +37,21 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "FeatureStats.h"
-namespace util { class FilePiece; }
+namespace util
+{
+class FilePiece;
+}
namespace MosesTuning
{
-
-class FileFormatException : public util::Exception
+
+class FileFormatException : public util::Exception
{
- public:
- explicit FileFormatException(const std::string& filename, const std::string& line) {
- *this << "Error in line \"" << line << "\" of " << filename;
- }
+public:
+ explicit FileFormatException(const std::string& filename, const std::string& line) {
+ *this << "Error in line \"" << line << "\" of " << filename;
+ }
};
@@ -56,45 +59,45 @@ class FileFormatException : public util::Exception
int ParseInt(const StringPiece& str );
/** Assumes a delimiter, so only apply to tokens */
-float ParseFloat(const StringPiece& str);
+float ParseFloat(const StringPiece& str);
-class FeatureDataItem
+class FeatureDataItem
{
- public:
- std::vector<float> dense;
- SparseVector sparse;
+public:
+ std::vector<float> dense;
+ SparseVector sparse;
};
bool operator==(FeatureDataItem const& item1, FeatureDataItem const& item2);
std::size_t hash_value(FeatureDataItem const& item);
-class FeatureDataIterator :
+class FeatureDataIterator :
public boost::iterator_facade<FeatureDataIterator,
- const std::vector<FeatureDataItem>,
- boost::forward_traversal_tag>
+ const std::vector<FeatureDataItem>,
+ boost::forward_traversal_tag>
{
- public:
- FeatureDataIterator();
- explicit FeatureDataIterator(const std::string& filename);
- ~FeatureDataIterator();
+public:
+ FeatureDataIterator();
+ explicit FeatureDataIterator(const std::string& filename);
+ ~FeatureDataIterator();
- static FeatureDataIterator end() {
- return FeatureDataIterator();
- }
+ static FeatureDataIterator end() {
+ return FeatureDataIterator();
+ }
- private:
- friend class boost::iterator_core_access;
+private:
+ friend class boost::iterator_core_access;
- void increment();
- bool equal(const FeatureDataIterator& rhs) const;
- const std::vector<FeatureDataItem>& dereference() const;
+ void increment();
+ bool equal(const FeatureDataIterator& rhs) const;
+ const std::vector<FeatureDataItem>& dereference() const;
- void readNext();
+ void readNext();
- boost::shared_ptr<util::FilePiece> m_in;
- std::vector<FeatureDataItem> m_next;
+ boost::shared_ptr<util::FilePiece> m_in;
+ std::vector<FeatureDataItem> m_next;
};
}
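
FeatureDataIterator is a boost::iterator_facade forward iterator, so the usual iterator idioms apply; a default-constructed instance, as returned by end(), marks exhaustion. A sketch of scanning one feature file, with an illustrative file name:

#include <iostream>
#include <vector>
#include "FeatureDataIterator.h"
using namespace MosesTuning;

int main()
{
  for (FeatureDataIterator it("features.dat");      // illustrative path
       it != FeatureDataIterator::end(); ++it) {
    const std::vector<FeatureDataItem>& nbest = *it; // one sentence's n-best list
    std::cout << nbest.size() << " hypotheses\n";
  }
  return 0;
}
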
diff --git a/mert/FeatureDataTest.cpp b/mert/FeatureDataTest.cpp
index 0f3d6a596..916203592 100644
--- a/mert/FeatureDataTest.cpp
+++ b/mert/FeatureDataTest.cpp
@@ -7,10 +7,12 @@
using namespace MosesTuning;
-namespace {
+namespace
+{
void CheckFeatureMap(const FeatureData* feature_data,
- const char* str, int num_feature, int* cnt) {
+ const char* str, int num_feature, int* cnt)
+{
for (int i = 0; i < num_feature; ++i) {
std::stringstream ss;
ss << str << "_" << i;
@@ -23,7 +25,8 @@ void CheckFeatureMap(const FeatureData* feature_data,
} // namespace
-BOOST_AUTO_TEST_CASE(set_feature_map) {
+BOOST_AUTO_TEST_CASE(set_feature_map)
+{
std::string str("d_0 d_1 d_2 d_3 d_4 d_5 d_6 lm_0 lm_1 tm_0 tm_1 tm_2 tm_3 tm_4 w_0 ");
FeatureData feature_data;
diff --git a/mert/FeatureStats.cpp b/mert/FeatureStats.cpp
index 242d3fbd0..aa32e1fef 100644
--- a/mert/FeatureStats.cpp
+++ b/mert/FeatureStats.cpp
@@ -18,31 +18,35 @@
using namespace std;
-namespace {
+namespace
+{
const int kAvailableSize = 8;
} // namespace
namespace MosesTuning
{
-
+
SparseVector::name2id_t SparseVector::m_name_to_id;
SparseVector::id2name_t SparseVector::m_id_to_name;
-FeatureStatsType SparseVector::get(const string& name) const {
+FeatureStatsType SparseVector::get(const string& name) const
+{
name2id_t::const_iterator name2id_iter = m_name_to_id.find(name);
if (name2id_iter == m_name_to_id.end()) return 0;
size_t id = name2id_iter->second;
return get(id);
}
-FeatureStatsType SparseVector::get(size_t id) const {
+FeatureStatsType SparseVector::get(size_t id) const
+{
fvector_t::const_iterator fvector_iter = m_fvector.find(id);
if (fvector_iter == m_fvector.end()) return 0;
return fvector_iter->second;
}
-void SparseVector::set(const string& name, FeatureStatsType value) {
+void SparseVector::set(const string& name, FeatureStatsType value)
+{
name2id_t::const_iterator name2id_iter = m_name_to_id.find(name);
size_t id = 0;
if (name2id_iter == m_name_to_id.end()) {
@@ -55,7 +59,8 @@ void SparseVector::set(const string& name, FeatureStatsType value) {
m_fvector[id] = value;
}
-void SparseVector::write(ostream& out, const string& sep) const {
+void SparseVector::write(ostream& out, const string& sep) const
+{
for (fvector_t::const_iterator i = m_fvector.begin(); i != m_fvector.end(); ++i) {
if (abs(i->second) < 0.00001) continue;
string name = m_id_to_name[i->first];
@@ -63,11 +68,13 @@ void SparseVector::write(ostream& out, const string& sep) const {
}
}
-void SparseVector::clear() {
+void SparseVector::clear()
+{
m_fvector.clear();
}
-void SparseVector::load(const string& file) {
+void SparseVector::load(const string& file)
+{
ifstream in(file.c_str());
if (!in) {
throw runtime_error("Failed to open sparse weights file: " + file);
@@ -84,39 +91,44 @@ void SparseVector::load(const string& file) {
}
}
-SparseVector& SparseVector::operator-=(const SparseVector& rhs) {
+SparseVector& SparseVector::operator-=(const SparseVector& rhs)
+{
for (fvector_t::const_iterator i = rhs.m_fvector.begin();
- i != rhs.m_fvector.end(); ++i) {
+ i != rhs.m_fvector.end(); ++i) {
m_fvector[i->first] = get(i->first) - (i->second);
}
return *this;
}
-FeatureStatsType SparseVector::inner_product(const SparseVector& rhs) const {
+FeatureStatsType SparseVector::inner_product(const SparseVector& rhs) const
+{
FeatureStatsType product = 0.0;
for (fvector_t::const_iterator i = m_fvector.begin();
- i != m_fvector.end(); ++i) {
+ i != m_fvector.end(); ++i) {
product += ((i->second) * (rhs.get(i->first)));
}
return product;
}
-SparseVector operator-(const SparseVector& lhs, const SparseVector& rhs) {
+SparseVector operator-(const SparseVector& lhs, const SparseVector& rhs)
+{
SparseVector res(lhs);
res -= rhs;
return res;
}
-FeatureStatsType inner_product(const SparseVector& lhs, const SparseVector& rhs) {
- if (lhs.size() >= rhs.size()) {
- return rhs.inner_product(lhs);
- } else {
- return lhs.inner_product(rhs);
- }
+FeatureStatsType inner_product(const SparseVector& lhs, const SparseVector& rhs)
+{
+ if (lhs.size() >= rhs.size()) {
+ return rhs.inner_product(lhs);
+ } else {
+ return lhs.inner_product(rhs);
+ }
}
-std::vector<std::size_t> SparseVector::feats() const {
+std::vector<std::size_t> SparseVector::feats() const
+{
std::vector<std::size_t> toRet;
for(fvector_t::const_iterator iter = m_fvector.begin();
iter!=m_fvector.end();
@@ -126,7 +138,8 @@ std::vector<std::size_t> SparseVector::feats() const {
return toRet;
}
-std::size_t SparseVector::encode(const std::string& name) {
+std::size_t SparseVector::encode(const std::string& name)
+{
name2id_t::const_iterator name2id_iter = m_name_to_id.find(name);
size_t id = 0;
if (name2id_iter == m_name_to_id.end()) {
@@ -139,26 +152,29 @@ std::size_t SparseVector::encode(const std::string& name) {
return id;
}
-std::string SparseVector::decode(std::size_t id) {
+std::string SparseVector::decode(std::size_t id)
+{
return m_id_to_name[id];
}
-bool operator==(SparseVector const& item1, SparseVector const& item2) {
+bool operator==(SparseVector const& item1, SparseVector const& item2)
+{
return item1.m_fvector==item2.m_fvector;
}
-std::size_t hash_value(SparseVector const& item) {
+std::size_t hash_value(SparseVector const& item)
+{
boost::hash<SparseVector::fvector_t> hasher;
return hasher(item.m_fvector);
}
FeatureStats::FeatureStats()
- : m_available_size(kAvailableSize), m_entries(0),
- m_array(new FeatureStatsType[m_available_size]) {}
+ : m_available_size(kAvailableSize), m_entries(0),
+ m_array(new FeatureStatsType[m_available_size]) {}
FeatureStats::FeatureStats(const size_t size)
- : m_available_size(size), m_entries(size),
- m_array(new FeatureStatsType[m_available_size])
+ : m_available_size(size), m_entries(size),
+ m_array(new FeatureStatsType[m_available_size])
{
memset(m_array, 0, GetArraySizeWithBytes());
}
@@ -276,7 +292,8 @@ void FeatureStats::savetxt(ostream* os)
*os << *this;
}
-void FeatureStats::savetxt() {
+void FeatureStats::savetxt()
+{
savetxt(&cout);
}
@@ -298,7 +315,8 @@ ostream& operator<<(ostream& o, const FeatureStats& e)
return o;
}
-bool operator==(const FeatureStats& f1, const FeatureStats& f2) {
+bool operator==(const FeatureStats& f1, const FeatureStats& f2)
+{
size_t size = f1.size();
if (size != f2.size())
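
The free inner_product above iterates the smaller of the two maps and probes the other, keeping the cost proportional to the sparser operand. A minimal usage sketch with made-up feature names:

#include <iostream>
#include "FeatureStats.h"
using namespace MosesTuning;

int main()
{
  SparseVector weights, feats;
  weights.set("lm", 0.5f);
  weights.set("tm", 1.0f);
  feats.set("lm", 2.0f);                       // only one shared feature
  std::cout << inner_product(weights, feats)   // 0.5 * 2.0 = 1.0
            << std::endl;
  return 0;
}
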
diff --git a/mert/FeatureStats.h b/mert/FeatureStats.h
index 883a89b97..a882e7358 100644
--- a/mert/FeatureStats.h
+++ b/mert/FeatureStats.h
@@ -18,10 +18,11 @@
namespace MosesTuning
{
-
+
// Minimal sparse vector
-class SparseVector {
+class SparseVector
+{
public:
typedef std::map<std::size_t,FeatureStatsType> fvector_t;
typedef std::map<std::string, std::size_t> name2id_t;
@@ -32,8 +33,10 @@ public:
void set(const std::string& name, FeatureStatsType value);
void clear();
void load(const std::string& file);
- std::size_t size() const { return m_fvector.size(); }
-
+ std::size_t size() const {
+ return m_fvector.size();
+ }
+
void write(std::ostream& out, const std::string& sep = " ") const;
SparseVector& operator-=(const SparseVector& rhs);
@@ -78,7 +81,9 @@ public:
void Copy(const FeatureStats &stats);
- bool isfull() const { return (m_entries < m_available_size) ? 0 : 1; }
+ bool isfull() const {
+ return (m_entries < m_available_size) ? 0 : 1;
+ }
void expand();
void add(FeatureStatsType v);
void addSparse(const std::string& name, FeatureStatsType v);
@@ -93,23 +98,37 @@ public:
clear();
}
- FeatureStatsType get(std::size_t i) { return m_array[i]; }
- FeatureStatsType get(std::size_t i)const { return m_array[i]; }
- featstats_t getArray() const { return m_array; }
+ FeatureStatsType get(std::size_t i) {
+ return m_array[i];
+ }
+ FeatureStatsType get(std::size_t i)const {
+ return m_array[i];
+ }
+ featstats_t getArray() const {
+ return m_array;
+ }
- const SparseVector& getSparse() const { return m_map; }
+ const SparseVector& getSparse() const {
+ return m_map;
+ }
void set(std::string &theString, const SparseVector& sparseWeights);
- inline std::size_t bytes() const { return GetArraySizeWithBytes(); }
+ inline std::size_t bytes() const {
+ return GetArraySizeWithBytes();
+ }
std::size_t GetArraySizeWithBytes() const {
return m_entries * sizeof(FeatureStatsType);
}
- std::size_t size() const { return m_entries; }
+ std::size_t size() const {
+ return m_entries;
+ }
- std::size_t available() const { return m_available_size; }
+ std::size_t available() const {
+ return m_available_size;
+ }
void savetxt(const std::string &file);
void savetxt(std::ostream* os);
diff --git a/mert/FileStream.cpp b/mert/FileStream.cpp
index 1a52e53fa..800ce1bfe 100644
--- a/mert/FileStream.cpp
+++ b/mert/FileStream.cpp
@@ -5,15 +5,17 @@
using namespace std;
-namespace {
-bool IsGzipFile(const std::string &filename) {
+namespace
+{
+bool IsGzipFile(const std::string &filename)
+{
return filename.size() > 3 &&
- filename.substr(filename.size() - 3, 3) == ".gz";
+ filename.substr(filename.size() - 3, 3) == ".gz";
}
} // namespace
inputfilestream::inputfilestream(const std::string &filePath)
- : std::istream(0), m_streambuf(0), m_is_good(false)
+ : std::istream(0), m_streambuf(0), m_is_good(false)
{
// check if file is readable
std::filebuf* fb = new std::filebuf();
@@ -40,7 +42,7 @@ void inputfilestream::close()
}
outputfilestream::outputfilestream(const std::string &filePath)
- : std::ostream(0), m_streambuf(0), m_is_good(false)
+ : std::ostream(0), m_streambuf(0), m_is_good(false)
{
// check if file is readable
std::filebuf* fb = new std::filebuf();
diff --git a/mert/FileStream.h b/mert/FileStream.h
index 3fd489cd7..582cbcb59 100644
--- a/mert/FileStream.h
+++ b/mert/FileStream.h
@@ -16,7 +16,9 @@ public:
explicit inputfilestream(const std::string &filePath);
virtual ~inputfilestream();
- bool good() const { return m_is_good; }
+ bool good() const {
+ return m_is_good;
+ }
void close();
};
@@ -30,7 +32,9 @@ public:
explicit outputfilestream(const std::string &filePath);
virtual ~outputfilestream();
- bool good() const { return m_is_good; }
+ bool good() const {
+ return m_is_good;
+ }
void close();
};
diff --git a/mert/GzFileBuf.cpp b/mert/GzFileBuf.cpp
index 9d3ccb588..d61a22525 100644
--- a/mert/GzFileBuf.cpp
+++ b/mert/GzFileBuf.cpp
@@ -5,7 +5,8 @@
#include <cstdio>
#include <iostream>
-GzFileBuf::GzFileBuf(const char* filename) {
+GzFileBuf::GzFileBuf(const char* filename)
+{
m_gz_file = gzopen(filename, "rb");
if (m_gz_file == NULL) {
std::cerr << "ERROR: Failed to open " << filename << std::endl;
@@ -16,16 +17,19 @@ GzFileBuf::GzFileBuf(const char* filename) {
m_buf + sizeof(int)); // end position
}
-GzFileBuf::~GzFileBuf() {
+GzFileBuf::~GzFileBuf()
+{
gzclose(m_gz_file);
}
-int GzFileBuf::overflow(int_type c) {
+int GzFileBuf::overflow(int_type c)
+{
throw;
}
// read one character
-int GzFileBuf::underflow() {
+int GzFileBuf::underflow()
+{
// is read position before end of m_buf?
if (gptr() < egptr()) {
return traits_type::to_int_type(*gptr());
@@ -64,17 +68,20 @@ int GzFileBuf::underflow() {
}
std::streampos GzFileBuf::seekpos(
- std::streampos sp,
- std::ios_base::openmode which) {
+ std::streampos sp,
+ std::ios_base::openmode which)
+{
throw;
}
std::streamsize GzFileBuf::xsgetn(char* s,
- std::streamsize num) {
+ std::streamsize num)
+{
return static_cast<std::streamsize>(gzread(m_gz_file,s,num));
}
std::streamsize GzFileBuf::xsputn(const char* s,
- std::streamsize num) {
+ std::streamsize num)
+{
throw;
}
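
Because GzFileBuf is a std::streambuf, it can back any istream directly; underflow() and xsgetn() above do the buffered gzread calls. A usage sketch, with an illustrative file name:

#include <iostream>
#include <string>
#include "GzFileBuf.h"

int main()
{
  GzFileBuf buf("corpus.gz");   // the constructor exits if the open fails
  std::istream in(&buf);
  std::string line;
  while (std::getline(in, line))
    std::cout << line << '\n';
  return 0;
}
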
diff --git a/mert/GzFileBuf.h b/mert/GzFileBuf.h
index 729523e0e..fb57fcfe7 100644
--- a/mert/GzFileBuf.h
+++ b/mert/GzFileBuf.h
@@ -17,8 +17,8 @@ protected:
virtual int_type underflow();
virtual std::streampos seekpos(
- std::streampos sp,
- std::ios_base::openmode which = std::ios_base::in | std::ios_base::out);
+ std::streampos sp,
+ std::ios_base::openmode which = std::ios_base::in | std::ios_base::out);
virtual std::streamsize xsgetn(char* s, std::streamsize num);
diff --git a/mert/HypPackEnumerator.cpp b/mert/HypPackEnumerator.cpp
index 776c02857..1cdd1cb7d 100644
--- a/mert/HypPackEnumerator.cpp
+++ b/mert/HypPackEnumerator.cpp
@@ -8,13 +8,13 @@ using namespace std;
namespace MosesTuning
{
-
+
StreamingHypPackEnumerator::StreamingHypPackEnumerator
(
- vector<std::string> const& featureFiles,
- vector<std::string> const& scoreFiles
- )
+ vector<std::string> const& featureFiles,
+ vector<std::string> const& scoreFiles
+)
: m_featureFiles(featureFiles),
m_scoreFiles(scoreFiles)
{
@@ -22,19 +22,20 @@ StreamingHypPackEnumerator::StreamingHypPackEnumerator
cerr << "No data to process" << endl;
exit(0);
}
-
+
if (featureFiles.size() != scoreFiles.size()) {
cerr << "Error: Number of feature files (" << featureFiles.size() <<
- ") does not match number of score files (" << scoreFiles.size() << ")" << endl;
+ ") does not match number of score files (" << scoreFiles.size() << ")" << endl;
exit(1);
}
-
+
m_num_lists = scoreFiles.size();
m_primed = false;
m_iNumDense = -1;
}
-size_t StreamingHypPackEnumerator::num_dense() const {
+size_t StreamingHypPackEnumerator::num_dense() const
+{
if(m_iNumDense<0) {
cerr << "Error: Requested num_dense() for an unprimed StreamingHypPackEnumerator" << endl;
exit(1);
@@ -42,12 +43,13 @@ size_t StreamingHypPackEnumerator::num_dense() const {
return (size_t) m_iNumDense;
}
-void StreamingHypPackEnumerator::prime(){
+void StreamingHypPackEnumerator::prime()
+{
m_current_indexes.clear();
m_current_featureVectors.clear();
boost::unordered_set<FeatureDataItem> seen;
m_primed = true;
-
+
for (size_t i = 0; i < m_num_lists; ++i) {
if (m_featureDataIters[i] == FeatureDataIterator::end()) {
cerr << "Error: Feature file " << i << " ended prematurely" << endl;
@@ -78,13 +80,14 @@ void StreamingHypPackEnumerator::prime(){
}
// Store item for retrieval
m_current_indexes.push_back(pair<size_t,size_t>(i,j));
- m_current_featureVectors.push_back(MiraFeatureVector(item));
+ m_current_featureVectors.push_back(MiraFeatureVector(item));
}
}
}
}
-void StreamingHypPackEnumerator::reset(){
+void StreamingHypPackEnumerator::reset()
+{
m_featureDataIters.clear();
m_scoreDataIters.clear();
for (size_t i = 0; i < m_num_lists; ++i) {
@@ -95,11 +98,13 @@ void StreamingHypPackEnumerator::reset(){
prime();
}
-bool StreamingHypPackEnumerator::finished(){
+bool StreamingHypPackEnumerator::finished()
+{
return m_featureDataIters[0]==FeatureDataIterator::end();
}
-void StreamingHypPackEnumerator::next(){
+void StreamingHypPackEnumerator::next()
+{
if(!m_primed) {
cerr << "Enumerating an unprimed HypPackEnumerator" << endl;
exit(1);
@@ -113,7 +118,8 @@ void StreamingHypPackEnumerator::next(){
if(!finished()) prime();
}
-size_t StreamingHypPackEnumerator::cur_size(){
+size_t StreamingHypPackEnumerator::cur_size()
+{
if(!m_primed) {
cerr << "Querying size from an unprimed HypPackEnumerator" << endl;
exit(1);
@@ -121,7 +127,8 @@ size_t StreamingHypPackEnumerator::cur_size(){
return m_current_indexes.size();
}
-const MiraFeatureVector& StreamingHypPackEnumerator::featuresAt(size_t index){
+const MiraFeatureVector& StreamingHypPackEnumerator::featuresAt(size_t index)
+{
if(!m_primed) {
cerr << "Querying features from an unprimed HypPackEnumerator" << endl;
exit(1);
@@ -129,7 +136,8 @@ const MiraFeatureVector& StreamingHypPackEnumerator::featuresAt(size_t index){
return m_current_featureVectors[index];
}
-const ScoreDataItem& StreamingHypPackEnumerator::scoresAt(size_t index) {
+const ScoreDataItem& StreamingHypPackEnumerator::scoresAt(size_t index)
+{
if(!m_primed) {
cerr << "Querying scores from an unprimed HypPackEnumerator" << endl;
exit(1);
@@ -138,22 +146,23 @@ const ScoreDataItem& StreamingHypPackEnumerator::scoresAt(size_t index) {
return m_scoreDataIters[pij.first]->operator[](pij.second);
}
-size_t StreamingHypPackEnumerator::cur_id() {
+size_t StreamingHypPackEnumerator::cur_id()
+{
return m_sentenceId;
}
/* --------- RandomAccessHypPackEnumerator ------------- */
RandomAccessHypPackEnumerator::RandomAccessHypPackEnumerator(vector<string> const& featureFiles,
- vector<string> const& scoreFiles,
- bool no_shuffle)
+ vector<string> const& scoreFiles,
+ bool no_shuffle)
{
StreamingHypPackEnumerator train(featureFiles,scoreFiles);
size_t index=0;
for(train.reset(); !train.finished(); train.next()) {
m_features.push_back(vector<MiraFeatureVector>());
m_scores.push_back(vector<ScoreDataItem>());
- for(size_t j=0;j<train.cur_size();j++) {
+ for(size_t j=0; j<train.cur_size(); j++) {
m_features.back().push_back(train.featuresAt(j));
m_scores.back().push_back(train.scoresAt(j));
}
@@ -165,35 +174,43 @@ RandomAccessHypPackEnumerator::RandomAccessHypPackEnumerator(vector<string> cons
m_num_dense = train.num_dense();
}
-size_t RandomAccessHypPackEnumerator::num_dense() const {
+size_t RandomAccessHypPackEnumerator::num_dense() const
+{
return m_num_dense;
}
-
-void RandomAccessHypPackEnumerator::reset() {
+
+void RandomAccessHypPackEnumerator::reset()
+{
m_cur_index = 0;
if(!m_no_shuffle) random_shuffle(m_indexes.begin(),m_indexes.end());
}
-bool RandomAccessHypPackEnumerator::finished() {
+bool RandomAccessHypPackEnumerator::finished()
+{
return m_cur_index >= m_indexes.size();
}
-void RandomAccessHypPackEnumerator::next() {
+void RandomAccessHypPackEnumerator::next()
+{
m_cur_index++;
}
-size_t RandomAccessHypPackEnumerator::cur_size() {
+size_t RandomAccessHypPackEnumerator::cur_size()
+{
assert(m_features[m_indexes[m_cur_index]].size()==m_scores[m_indexes[m_cur_index]].size());
return m_features[m_indexes[m_cur_index]].size();
}
-const MiraFeatureVector& RandomAccessHypPackEnumerator::featuresAt(size_t i) {
+const MiraFeatureVector& RandomAccessHypPackEnumerator::featuresAt(size_t i)
+{
return m_features[m_indexes[m_cur_index]][i];
}
-const ScoreDataItem& RandomAccessHypPackEnumerator::scoresAt(size_t i) {
+const ScoreDataItem& RandomAccessHypPackEnumerator::scoresAt(size_t i)
+{
return m_scores[m_indexes[m_cur_index]][i];
}
-size_t RandomAccessHypPackEnumerator::cur_id() {
+size_t RandomAccessHypPackEnumerator::cur_id()
+{
return m_indexes[m_cur_index];
-}
+}
// --Emacs trickery--
// Local Variables:
// mode:c++
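
Both enumerators share the reset()/finished()/next() protocol that the RandomAccessHypPackEnumerator constructor above already demonstrates. A sketch of the consuming loop, with illustrative file names:

#include <cstddef>
#include <iostream>
#include <string>
#include <vector>
#include "HypPackEnumerator.h"
using namespace MosesTuning;

int main()
{
  std::vector<std::string> feats(1, "features.dat");   // illustrative paths
  std::vector<std::string> scores(1, "scores.dat");
  StreamingHypPackEnumerator e(feats, scores);
  for (e.reset(); !e.finished(); e.next()) {
    for (size_t j = 0; j < e.cur_size(); j++) {
      const MiraFeatureVector& fv = e.featuresAt(j);   // one vector per hypothesis
      (void)fv;                                        // score/update the model here
    }
    std::cout << "sentence " << e.cur_id() << '\n';
  }
  return 0;
}
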
diff --git a/mert/HypPackEnumerator.h b/mert/HypPackEnumerator.h
index 690e53103..957c6d408 100644
--- a/mert/HypPackEnumerator.h
+++ b/mert/HypPackEnumerator.h
@@ -20,11 +20,12 @@
namespace MosesTuning
{
-
+
// Start with these abstract classes
-class HypPackEnumerator {
+class HypPackEnumerator
+{
public:
virtual ~HypPackEnumerator() {}
@@ -41,7 +42,8 @@ public:
// Instantiation that streams from disk
// Low-memory, low-speed, sequential access
-class StreamingHypPackEnumerator : public HypPackEnumerator {
+class StreamingHypPackEnumerator : public HypPackEnumerator
+{
public:
StreamingHypPackEnumerator(std::vector<std::string> const& featureFiles,
std::vector<std::string> const& scoreFiles);
@@ -75,7 +77,8 @@ private:
// Instantiation that reads into memory
// High-memory, high-speed, random access
// (Actually randomizes with each call to reset)
-class RandomAccessHypPackEnumerator : public HypPackEnumerator {
+class RandomAccessHypPackEnumerator : public HypPackEnumerator
+{
public:
RandomAccessHypPackEnumerator(std::vector<std::string> const& featureFiles,
std::vector<std::string> const& scoreFiles,
diff --git a/mert/InterpolatedScorer.cpp b/mert/InterpolatedScorer.cpp
index af3f26bf2..87cec9211 100644
--- a/mert/InterpolatedScorer.cpp
+++ b/mert/InterpolatedScorer.cpp
@@ -11,7 +11,7 @@ namespace MosesTuning
// TODO: This is too long. Consider creating a function for
// initialization such as Init().
InterpolatedScorer::InterpolatedScorer(const string& name, const string& config)
- : Scorer(name,config)
+ : Scorer(name,config)
{
// name would be: HAMMING,BLEU or similar
string scorers = name;
@@ -66,7 +66,8 @@ InterpolatedScorer::InterpolatedScorer(const string& name, const string& config)
cerr <<endl;
}
-bool InterpolatedScorer::useAlignment() const {
+bool InterpolatedScorer::useAlignment() const
+{
//cout << "InterpolatedScorer::useAlignment" << endl;
for (vector<Scorer*>::const_iterator itsc = m_scorers.begin(); itsc < m_scorers.end(); itsc++) {
if ((*itsc)->useAlignment()) {
@@ -176,8 +177,7 @@ void InterpolatedScorer::prepareStats(size_t sid, const string& text, ScoreStats
ScoreStats tempEntry;
if ((*itsc)->useAlignment()) {
(*itsc)->prepareStats(sid, text, tempEntry);
- }
- else {
+ } else {
(*itsc)->prepareStats(sid, sentence, tempEntry);
}
if (i > 0) buff << " ";
@@ -206,17 +206,17 @@ void InterpolatedScorer::setFactors(const string& factors)
void InterpolatedScorer::setFilter(const string& filterCommand)
{
- if (filterCommand.empty()) return;
+ if (filterCommand.empty()) return;
- vector<string> csplit;
- split(filterCommand, ',', csplit);
+ vector<string> csplit;
+ split(filterCommand, ',', csplit);
- if (csplit.size() != m_scorers.size())
- throw runtime_error("Number of command specifications does not equal number of interpolated scorers.");
+ if (csplit.size() != m_scorers.size())
+ throw runtime_error("Number of command specifications does not equal number of interpolated scorers.");
- for (size_t i = 0; i < m_scorers.size(); ++i) {
- m_scorers[i]->setFilter(csplit[i]);
- }
+ for (size_t i = 0; i < m_scorers.size(); ++i) {
+ m_scorers[i]->setFilter(csplit[i]);
+ }
}
}
diff --git a/mert/InterpolatedScorer.h b/mert/InterpolatedScorer.h
index 49c065d27..d1078e9e1 100644
--- a/mert/InterpolatedScorer.h
+++ b/mert/InterpolatedScorer.h
@@ -10,7 +10,7 @@
namespace MosesTuning
{
-
+
/**
 * Class that includes other scorers, e.g.
diff --git a/mert/MiraFeatureVector.cpp b/mert/MiraFeatureVector.cpp
index 95805c295..dea9b9b83 100644
--- a/mert/MiraFeatureVector.cpp
+++ b/mert/MiraFeatureVector.cpp
@@ -7,7 +7,7 @@ using namespace std;
namespace MosesTuning
{
-
+
MiraFeatureVector::MiraFeatureVector(const FeatureDataItem& vec)
: m_dense(vec.dense)
@@ -17,8 +17,7 @@ MiraFeatureVector::MiraFeatureVector(const FeatureDataItem& vec)
size_t lastFeat = 0;
m_sparseFeats.reserve(sparseFeats.size());
m_sparseVals.reserve(sparseFeats.size());
- for(size_t i=0;i<sparseFeats.size();i++)
- {
+ for(size_t i=0; i<sparseFeats.size(); i++) {
size_t feat = m_dense.size() + sparseFeats[i];
m_sparseFeats.push_back(feat);
m_sparseVals.push_back(vec.sparse.get(sparseFeats[i]));
@@ -26,8 +25,7 @@ MiraFeatureVector::MiraFeatureVector(const FeatureDataItem& vec)
// Check ordered property
if(bFirst) {
bFirst = false;
- }
- else {
+ } else {
if(lastFeat>=feat) {
cerr << "Error: Feature indeces must be strictly ascending coming out of SparseVector" << endl;
exit(1);
@@ -61,29 +59,33 @@ MiraFeatureVector::MiraFeatureVector(const vector<ValType>& dense,
}
}
-ValType MiraFeatureVector::val(size_t index) const {
+ValType MiraFeatureVector::val(size_t index) const
+{
if(index < m_dense.size())
return m_dense[index];
else
return m_sparseVals[index-m_dense.size()];
}
-size_t MiraFeatureVector::feat(size_t index) const {
+size_t MiraFeatureVector::feat(size_t index) const
+{
if(index < m_dense.size())
return index;
else
return m_sparseFeats[index-m_dense.size()];
}
-size_t MiraFeatureVector::size() const {
+size_t MiraFeatureVector::size() const
+{
return m_dense.size() + m_sparseVals.size();
}
-ValType MiraFeatureVector::sqrNorm() const {
+ValType MiraFeatureVector::sqrNorm() const
+{
ValType toRet = 0.0;
- for(size_t i=0;i<m_dense.size();i++)
+ for(size_t i=0; i<m_dense.size(); i++)
toRet += m_dense[i]*m_dense[i];
- for(size_t i=0;i<m_sparseVals.size();i++)
+ for(size_t i=0; i<m_sparseVals.size(); i++)
toRet += m_sparseVals[i] * m_sparseVals[i];
return toRet;
}
@@ -96,7 +98,7 @@ MiraFeatureVector operator-(const MiraFeatureVector& a, const MiraFeatureVector&
cerr << "Mismatching dense vectors passed to MiraFeatureVector subtraction" << endl;
exit(1);
}
- for(size_t i=0;i<a.m_dense.size();i++) {
+ for(size_t i=0; i<a.m_dense.size(); i++) {
dense.push_back(a.m_dense[i] - b.m_dense[i]);
}
@@ -148,7 +150,7 @@ MiraFeatureVector operator-(const MiraFeatureVector& a, const MiraFeatureVector&
ostream& operator<<(ostream& o, const MiraFeatureVector& e)
{
- for(size_t i=0;i<e.size();i++) {
+ for(size_t i=0; i<e.size(); i++) {
if(i>0) o << " ";
o << e.feat(i) << ":" << e.val(i);
}
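
val() and feat() above implement a single flat index space: positions below dense.size() are the dense weights, and the rest index the parallel sparse arrays after subtracting that offset (the constructor stores sparse ids already shifted by dense.size()). The same convention in isolation, with illustrative names:

#include <cstddef>
#include <vector>

struct FlatView {                        // illustrative stand-in
  std::vector<float> dense, sparseVals;
  std::vector<std::size_t> sparseFeats;  // shifted ids, strictly ascending

  std::size_t feat(std::size_t i) const {
    return i < dense.size() ? i : sparseFeats[i - dense.size()];
  }
  float val(std::size_t i) const {
    return i < dense.size() ? dense[i] : sparseVals[i - dense.size()];
  }
  std::size_t size() const { return dense.size() + sparseVals.size(); }
};
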
diff --git a/mert/MiraFeatureVector.h b/mert/MiraFeatureVector.h
index 60e765605..cb2b1c87d 100644
--- a/mert/MiraFeatureVector.h
+++ b/mert/MiraFeatureVector.h
@@ -19,11 +19,12 @@
namespace MosesTuning
{
-
+
typedef FeatureStatsType ValType;
-class MiraFeatureVector {
+class MiraFeatureVector
+{
public:
MiraFeatureVector(const FeatureDataItem& vec);
MiraFeatureVector(const MiraFeatureVector& other);
diff --git a/mert/MiraWeightVector.cpp b/mert/MiraWeightVector.cpp
index c6f0261dc..e23804cbf 100644
--- a/mert/MiraWeightVector.cpp
+++ b/mert/MiraWeightVector.cpp
@@ -6,7 +6,7 @@ using namespace std;
namespace MosesTuning
{
-
+
/**
* Constructor, initializes to the zero vector
@@ -36,9 +36,10 @@ MiraWeightVector::MiraWeightVector(const vector<ValType>& init)
* \param fv Feature vector to be added to the weights
* \param tau FV will be scaled by this value before update
*/
-void MiraWeightVector::update(const MiraFeatureVector& fv, float tau) {
+void MiraWeightVector::update(const MiraFeatureVector& fv, float tau)
+{
m_numUpdates++;
- for(size_t i=0;i<fv.size();i++) {
+ for(size_t i=0; i<fv.size(); i++) {
update(fv.feat(i), fv.val(i)*tau);
}
}
@@ -46,7 +47,8 @@ void MiraWeightVector::update(const MiraFeatureVector& fv, float tau) {
/**
* Perform an empty update (affects averaging)
*/
-void MiraWeightVector::tick() {
+void MiraWeightVector::tick()
+{
m_numUpdates++;
}
@@ -54,7 +56,8 @@ void MiraWeightVector::tick() {
* Score a feature vector according to the model
* \param fv Feature vector to be scored
*/
-ValType MiraWeightVector::score(const MiraFeatureVector& fv) const {
+ValType MiraWeightVector::score(const MiraFeatureVector& fv) const
+{
ValType toRet = 0.0;
for(size_t i=0; i<fv.size(); i++) {
toRet += weight(fv.feat(i)) * fv.val(i);
@@ -65,7 +68,8 @@ ValType MiraWeightVector::score(const MiraFeatureVector& fv) const {
/**
* Return an averaged view of this weight vector
*/
-AvgWeightVector MiraWeightVector::avg() {
+AvgWeightVector MiraWeightVector::avg()
+{
this->fixTotals();
return AvgWeightVector(*this);
}
@@ -73,7 +77,8 @@ AvgWeightVector MiraWeightVector::avg() {
/**
* Updates a weight and lazily updates its total
*/
-void MiraWeightVector::update(size_t index, ValType delta) {
+void MiraWeightVector::update(size_t index, ValType delta)
+{
// Handle previously unseen weights
while(index>=m_weights.size()) {
@@ -91,25 +96,27 @@ void MiraWeightVector::update(size_t index, ValType delta) {
/**
* Make sure everyone's total is up-to-date
*/
-void MiraWeightVector::fixTotals() {
+void MiraWeightVector::fixTotals()
+{
for(size_t i=0; i<m_weights.size(); i++) update(i,0);
}
/**
* Helper to handle out of range weights
*/
-ValType MiraWeightVector::weight(size_t index) const {
+ValType MiraWeightVector::weight(size_t index) const
+{
if(index < m_weights.size()) {
return m_weights[index];
- }
- else {
+ } else {
return 0;
}
}
-ValType MiraWeightVector::sqrNorm() const {
+ValType MiraWeightVector::sqrNorm() const
+{
ValType toRet = 0;
- for(size_t i=0;i<m_weights.size();i++) {
+ for(size_t i=0; i<m_weights.size(); i++) {
toRet += weight(i) * weight(i);
}
return toRet;
@@ -119,9 +126,9 @@ AvgWeightVector::AvgWeightVector(const MiraWeightVector& wv)
:m_wv(wv)
{}
-ostream& operator<<(ostream& o, const MiraWeightVector& e)
+ostream& operator<<(ostream& o, const MiraWeightVector& e)
{
- for(size_t i=0;i<e.m_weights.size();i++) {
+ for(size_t i=0; i<e.m_weights.size(); i++) {
if(abs(e.m_weights[i])>1e-8) {
if(i>0) o << " ";
cerr << i << ":" << e.m_weights[i];
@@ -136,14 +143,14 @@ ValType AvgWeightVector::weight(size_t index) const
else {
if(index < m_wv.m_totals.size()) {
return m_wv.m_totals[index] / m_wv.m_numUpdates;
- }
- else {
+ } else {
return 0;
}
}
}
-ValType AvgWeightVector::score(const MiraFeatureVector& fv) const {
+ValType AvgWeightVector::score(const MiraFeatureVector& fv) const
+{
ValType toRet = 0.0;
for(size_t i=0; i<fv.size(); i++) {
toRet += weight(fv.feat(i)) * fv.val(i);
@@ -151,7 +158,8 @@ ValType AvgWeightVector::score(const MiraFeatureVector& fv) const {
return toRet;
}
-size_t AvgWeightVector::size() const {
+size_t AvgWeightVector::size() const
+{
return m_wv.m_weights.size();
}
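
update(fv, tau) above is the additive MIRA-style step w <- w + tau * fv, applied feature by feature with lazily maintained totals so that avg() can later return each total divided by the update count. The core step written out for a plain dense vector, as a sketch only:

#include <cstddef>
#include <vector>

void miraStep(std::vector<double>& w,
              const std::vector<double>& fv, double tau)
{
  if (fv.size() > w.size())
    w.resize(fv.size(), 0.0);   // grow for previously unseen features
  for (std::size_t i = 0; i < fv.size(); i++)
    w[i] += tau * fv[i];        // w <- w + tau * fv
}
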
diff --git a/mert/MiraWeightVector.h b/mert/MiraWeightVector.h
index 30f8adfa4..eb27e8a6d 100644
--- a/mert/MiraWeightVector.h
+++ b/mert/MiraWeightVector.h
@@ -17,11 +17,12 @@
namespace MosesTuning
{
-
+
class AvgWeightVector;
-class MiraWeightVector {
+class MiraWeightVector
+{
public:
/**
* Constructor, initializes to the zero vector
@@ -91,7 +92,8 @@ private:
/**
* Averaged view of a weight vector
*/
-class AvgWeightVector {
+class AvgWeightVector
+{
public:
AvgWeightVector(const MiraWeightVector& wv);
ValType score(const MiraFeatureVector& fv) const;
diff --git a/mert/Ngram.h b/mert/Ngram.h
index 6363c847c..521dc4928 100644
--- a/mert/Ngram.h
+++ b/mert/Ngram.h
@@ -13,8 +13,9 @@ namespace MosesTuning
 * typical accessors and mutators, but we intentionally do not allow
 * erasing elements.
*/
-class NgramCounts {
- public:
+class NgramCounts
+{
+public:
// Used to construct the ngram map
struct NgramComparator {
bool operator()(const std::vector<int>& a, const std::vector<int>& b) const {
@@ -45,7 +46,9 @@ class NgramCounts {
/**
* If the specified "ngram" is found, we add counts.
* If not, we insert the default count in the container. */
- inline void Add(const Key& ngram) { m_counts[ngram]++; }
+ inline void Add(const Key& ngram) {
+ m_counts[ngram]++;
+ }
/**
* Return true iff the specified "ngram" is found in the container.
@@ -60,34 +63,58 @@ class NgramCounts {
/**
   * Clear all elements in the container.
*/
- void clear() { m_counts.clear(); }
+ void clear() {
+ m_counts.clear();
+ }
/**
* Return true iff the container is empty.
*/
- bool empty() const { return m_counts.empty(); }
+ bool empty() const {
+ return m_counts.empty();
+ }
/**
   * Return the number of elements in the container.
*/
- std::size_t size() const { return m_counts.size(); }
+ std::size_t size() const {
+ return m_counts.size();
+ }
- std::size_t max_size() const { return m_counts.max_size(); }
+ std::size_t max_size() const {
+ return m_counts.max_size();
+ }
// Note: This is mainly used by unit tests.
- int get_default_count() const { return kDefaultCount; }
+ int get_default_count() const {
+ return kDefaultCount;
+ }
- iterator find(const Key& ngram) { return m_counts.find(ngram); }
- const_iterator find(const Key& ngram) const { return m_counts.find(ngram); }
+ iterator find(const Key& ngram) {
+ return m_counts.find(ngram);
+ }
+ const_iterator find(const Key& ngram) const {
+ return m_counts.find(ngram);
+ }
- Value& operator[](const Key& ngram) { return m_counts[ngram]; }
+ Value& operator[](const Key& ngram) {
+ return m_counts[ngram];
+ }
- iterator begin() { return m_counts.begin(); }
- const_iterator begin() const { return m_counts.begin(); }
- iterator end() { return m_counts.end(); }
- const_iterator end() const { return m_counts.end(); }
+ iterator begin() {
+ return m_counts.begin();
+ }
+ const_iterator begin() const {
+ return m_counts.begin();
+ }
+ iterator end() {
+ return m_counts.end();
+ }
+ const_iterator end() const {
+ return m_counts.end();
+ }
- private:
+private:
const int kDefaultCount;
boost::unordered_map<Key, Value> m_counts;
};
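
Add() above relies on the map's default-constructed value: a new key starts at the default count, an existing key is simply incremented. In short:

#include <iostream>
#include "Ngram.h"
using namespace MosesTuning;

int main()
{
  NgramCounts counts;
  NgramCounts::Key bigram;      // an n-gram is a vector of word ids
  bigram.push_back(1);
  bigram.push_back(2);
  counts.Add(bigram);           // new key: inserted with the default count
  counts.Add(bigram);           // existing key: incremented
  std::cout << counts[bigram] << std::endl;
  return 0;
}
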
diff --git a/mert/NgramTest.cpp b/mert/NgramTest.cpp
index e6218206f..87f36860b 100644
--- a/mert/NgramTest.cpp
+++ b/mert/NgramTest.cpp
@@ -5,7 +5,8 @@
using namespace MosesTuning;
-BOOST_AUTO_TEST_CASE(ngram_basic) {
+BOOST_AUTO_TEST_CASE(ngram_basic)
+{
NgramCounts counts;
NgramCounts::Key key;
key.push_back(1);
@@ -25,7 +26,8 @@ BOOST_AUTO_TEST_CASE(ngram_basic) {
BOOST_CHECK_EQUAL(it->second, 1);
}
-BOOST_AUTO_TEST_CASE(ngram_Add) {
+BOOST_AUTO_TEST_CASE(ngram_Add)
+{
NgramCounts counts;
NgramCounts::Key key;
key.push_back(1);
@@ -49,7 +51,8 @@ BOOST_AUTO_TEST_CASE(ngram_Add) {
BOOST_CHECK_EQUAL(counts[key3], counts.get_default_count());
}
-BOOST_AUTO_TEST_CASE(ngram_lookup) {
+BOOST_AUTO_TEST_CASE(ngram_lookup)
+{
NgramCounts counts;
NgramCounts::Key key;
key.push_back(1);
diff --git a/mert/Optimizer.cpp b/mert/Optimizer.cpp
index e5f5854b2..6afbd6241 100644
--- a/mert/Optimizer.cpp
+++ b/mert/Optimizer.cpp
@@ -17,7 +17,8 @@ using namespace std;
static const float MIN_FLOAT = -1.0 * numeric_limits<float>::max();
static const float MAX_FLOAT = numeric_limits<float>::max();
-namespace {
+namespace
+{
/**
* Compute the intersection of 2 lines.
@@ -35,7 +36,7 @@ inline float intersect(float m1, float b1, float m2, float b2)
namespace MosesTuning
{
-
+
Optimizer::Optimizer(unsigned Pd, const vector<unsigned>& i2O, const vector<bool>& pos, const vector<parameter_t>& start, unsigned int nrandom)
: m_scorer(NULL), m_feature_data(), m_num_random_directions(nrandom), m_positive(pos)
@@ -198,7 +199,7 @@ statscore_t Optimizer::LineOptimize(const Point& origin, const Point& direction,
thresholdmap.erase(previnserted); // erase old previnsert
previnserted = thresholdmap.find(leftmostx); // point previnsert to the new threshold
previnserted->second.back()=newd; // We update the diff for sentence S
- // Threshold already exists but is not the previous one.
+ // Threshold already exists but is not the previous one.
} else {
// We append the diffs in previnsert to tit before destroying previnsert.
tit->second.insert(tit->second.end(),previnserted->second.begin(),previnserted->second.end());
@@ -405,8 +406,7 @@ statscore_t SimpleOptimizer::TrueRun(Point& P) const
for (unsigned int i = 0; i < Point::getdim(); i++)
direction[i]=0.0;
direction[d]=1.0;
- }
- else { // random direction update
+ } else { // random direction update
direction.Randomize();
}
statscore_t curscore = LineOptimize(P, direction, linebest);//find the minimum on the line
@@ -443,8 +443,7 @@ statscore_t RandomDirectionOptimizer::TrueRun(Point& P) const
// do specified number of random direction optimizations
unsigned int nrun = 0;
unsigned int nrun_no_change = 0;
- for (; nrun_no_change < m_num_random_directions; nrun++, nrun_no_change++)
- {
+ for (; nrun_no_change < m_num_random_directions; nrun++, nrun_no_change++) {
// choose a random direction in which to optimize
Point direction;
direction.Randomize();
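
The intersect() helper named in a hunk header above feeds LineOptimize's threshold map: two score lines m1*x + b1 and m2*x + b2 cross at x = (b2 - b1) / (m1 - m2). A presumed body, guarding the parallel case; the real helper may differ in detail:

#include <limits>

inline float intersect_sketch(float m1, float b1, float m2, float b2)
{
  if (m1 == m2)                 // parallel lines never intersect
    return std::numeric_limits<float>::max();
  return (b2 - b1) / (m1 - m2);
}
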
diff --git a/mert/Optimizer.h b/mert/Optimizer.h
index f81d59d96..80d6d10cc 100644
--- a/mert/Optimizer.h
+++ b/mert/Optimizer.h
@@ -12,7 +12,7 @@ static const float kMaxFloat = std::numeric_limits<float>::max();
namespace MosesTuning
{
-
+
class Point;
@@ -31,8 +31,12 @@ protected:
public:
Optimizer(unsigned Pd, const std::vector<unsigned>& i2O, const std::vector<bool>& positive, const std::vector<parameter_t>& start, unsigned int nrandom);
- void SetScorer(Scorer *scorer) { m_scorer = scorer; }
- void SetFeatureData(FeatureDataHandle feature_data) { m_feature_data = feature_data; }
+ void SetScorer(Scorer *scorer) {
+ m_scorer = scorer;
+ }
+ void SetFeatureData(FeatureDataHandle feature_data) {
+ m_feature_data = feature_data;
+ }
virtual ~Optimizer();
unsigned size() const {
@@ -97,7 +101,7 @@ private:
public:
RandomDirectionOptimizer(unsigned dim, const std::vector<unsigned>& i2O, const std::vector<bool>& positive,
const std::vector<parameter_t>& start, unsigned int nrandom)
- : Optimizer(dim, i2O, positive, start, nrandom), kEPS(0.0001f) {}
+ : Optimizer(dim, i2O, positive, start, nrandom), kEPS(0.0001f) {}
virtual statscore_t TrueRun(Point&) const;
};
@@ -109,7 +113,7 @@ class RandomOptimizer : public Optimizer
public:
RandomOptimizer(unsigned dim, const std::vector<unsigned>& i2O, const std::vector<bool>& positive,
const std::vector<parameter_t>& start, unsigned int nrandom)
- : Optimizer(dim, i2O, positive, start, nrandom) {}
+ : Optimizer(dim, i2O, positive, start, nrandom) {}
virtual statscore_t TrueRun(Point&) const;
};
diff --git a/mert/OptimizerFactory.cpp b/mert/OptimizerFactory.cpp
index b33194f33..97288f9a8 100644
--- a/mert/OptimizerFactory.cpp
+++ b/mert/OptimizerFactory.cpp
@@ -5,7 +5,7 @@ using namespace std;
namespace MosesTuning
{
-
+
vector<string> OptimizerFactory::m_type_names;
@@ -38,11 +38,11 @@ OptimizerFactory::OptimizerType OptimizerFactory::GetOptimizerType(const string&
}
Optimizer* OptimizerFactory::BuildOptimizer(unsigned dim,
- const vector<unsigned>& i2o,
- const std::vector<bool>& positive,
- const vector<parameter_t>& start,
- const string& type,
- unsigned int nrandom)
+ const vector<unsigned>& i2o,
+ const std::vector<bool>& positive,
+ const vector<parameter_t>& start,
+ const string& type,
+ unsigned int nrandom)
{
OptimizerType opt_type = GetOptimizerType(type);
if (opt_type == NOPTIMIZER) {
@@ -55,18 +55,18 @@ Optimizer* OptimizerFactory::BuildOptimizer(unsigned dim,
}
switch (opt_type) {
- case POWELL:
- return new SimpleOptimizer(dim, i2o, positive, start, nrandom);
- break;
- case RANDOM_DIRECTION:
- return new RandomDirectionOptimizer(dim, i2o, positive, start, nrandom);
- break;
- case RANDOM:
- return new RandomOptimizer(dim, i2o, positive, start, nrandom);
- break;
- default:
- cerr << "Error: unknown optimizer" << type << endl;
- return NULL;
+ case POWELL:
+ return new SimpleOptimizer(dim, i2o, positive, start, nrandom);
+ break;
+ case RANDOM_DIRECTION:
+ return new RandomDirectionOptimizer(dim, i2o, positive, start, nrandom);
+ break;
+ case RANDOM:
+ return new RandomOptimizer(dim, i2o, positive, start, nrandom);
+ break;
+ default:
+ cerr << "Error: unknown optimizer" << type << endl;
+ return NULL;
}
}
diff --git a/mert/OptimizerFactory.h b/mert/OptimizerFactory.h
index ae34bcb00..fc0fea65a 100644
--- a/mert/OptimizerFactory.h
+++ b/mert/OptimizerFactory.h
@@ -6,13 +6,13 @@
namespace MosesTuning
{
-
+
class Optimizer;
class OptimizerFactory
{
- public:
+public:
// NOTE: Add new optimizer here BEFORE NOPTIMZER
enum OptimizerType {
POWELL = 0,
@@ -36,7 +36,7 @@ class OptimizerFactory
const std::string& type,
unsigned int nrandom);
- private:
+private:
OptimizerFactory() {}
~OptimizerFactory() {}
diff --git a/mert/OptimizerFactoryTest.cpp b/mert/OptimizerFactoryTest.cpp
index 4d259c68d..56f894904 100644
--- a/mert/OptimizerFactoryTest.cpp
+++ b/mert/OptimizerFactoryTest.cpp
@@ -7,21 +7,24 @@
using namespace MosesTuning;
-namespace {
+namespace
+{
inline bool CheckBuildOptimizer(unsigned dim,
const std::vector<unsigned>& to_optimize,
const std::vector<bool>& positive,
const std::vector<parameter_t>& start,
const std::string& type,
- unsigned int num_random) {
+ unsigned int num_random)
+{
boost::scoped_ptr<Optimizer> optimizer(OptimizerFactory::BuildOptimizer(dim, to_optimize, positive, start, type, num_random));
return optimizer.get() != NULL;
}
} // namespace
-BOOST_AUTO_TEST_CASE(optimizer_type) {
+BOOST_AUTO_TEST_CASE(optimizer_type)
+{
BOOST_CHECK_EQUAL(OptimizerFactory::GetOptimizerType("powell"),
OptimizerFactory::POWELL);
BOOST_CHECK_EQUAL(OptimizerFactory::GetOptimizerType("random"),
@@ -30,7 +33,8 @@ BOOST_AUTO_TEST_CASE(optimizer_type) {
OptimizerFactory::RANDOM_DIRECTION);
}
-BOOST_AUTO_TEST_CASE(optimizer_build) {
+BOOST_AUTO_TEST_CASE(optimizer_build)
+{
const unsigned dim = 3;
std::vector<unsigned> to_optimize;
to_optimize.push_back(1);
diff --git a/mert/PerScorer.cpp b/mert/PerScorer.cpp
index 06b53436f..3e157a55e 100644
--- a/mert/PerScorer.cpp
+++ b/mert/PerScorer.cpp
@@ -10,7 +10,7 @@ using namespace std;
namespace MosesTuning
{
-
+
PerScorer::PerScorer(const string& config)
: StatisticsBasedScorer("PER",config) {}
diff --git a/mert/PerScorer.h b/mert/PerScorer.h
index 76ea9bfd7..ffb869942 100644
--- a/mert/PerScorer.h
+++ b/mert/PerScorer.h
@@ -9,7 +9,7 @@
namespace MosesTuning
{
-
+
class ScoreStats;
@@ -27,7 +27,9 @@ public:
virtual void setReferenceFiles(const std::vector<std::string>& referenceFiles);
virtual void prepareStats(std::size_t sid, const std::string& text, ScoreStats& entry);
- virtual std::size_t NumberOfScores() const { return 3; }
+ virtual std::size_t NumberOfScores() const {
+ return 3;
+ }
virtual float calculateScore(const std::vector<int>& comps) const;
private:
diff --git a/mert/Permutation.cpp b/mert/Permutation.cpp
index 5f3102f26..a4c74b0d2 100644
--- a/mert/Permutation.cpp
+++ b/mert/Permutation.cpp
@@ -16,7 +16,7 @@ using namespace std;
namespace MosesTuning
{
-
+
Permutation::Permutation(const string &alignment, const int sourceLength, const int targetLength )
{
@@ -86,7 +86,7 @@ void Permutation::set(const string & alignment,const int sourceLength)
//cout << "SP:" << sourcePos << " TP:" << targetPos << endl;
if (sourcePos > sourceLength) {
cerr << "Source sentence length:" << sourceLength << " is smaller than alignment source position:" << sourcePos << endl;
- cerr << "******** Permutation::set :" << alignment << ": len : " << sourceLength <<endl;
+ cerr << "******** Permutation::set :" << alignment << ": len : " << sourceLength <<endl;
exit(1);
}
//If have multiple target pos aligned to one source,
@@ -187,7 +187,7 @@ float Permutation::distance(const Permutation &permCompare, const distanceMetric
float score=0;
//bool debug= (verboselevel()>3); // TODO: fix verboselevel()
- bool debug=false;
+ bool debug=false;
if (debug) {
cout << "*****Permutation::distance" <<endl;
cout << "Hypo:" << endl;
diff --git a/mert/Permutation.h b/mert/Permutation.h
index 2c47487b6..b8be86a1b 100644
--- a/mert/Permutation.h
+++ b/mert/Permutation.h
@@ -19,7 +19,7 @@
namespace MosesTuning
{
-
+
class Permutation
{
diff --git a/mert/PermutationScorer.cpp b/mert/PermutationScorer.cpp
index 12025a77e..aec389c27 100644
--- a/mert/PermutationScorer.cpp
+++ b/mert/PermutationScorer.cpp
@@ -5,7 +5,7 @@ using namespace std;
namespace MosesTuning
{
-
+
const int PermutationScorer::SCORE_PRECISION = 5;
const int PermutationScorer::SCORE_MULTFACT = 100000; // 100000=10^SCORE_PRECISION
@@ -147,7 +147,7 @@ int PermutationScorer::getNumberWords (const string& text) const
void PermutationScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
//bool debug= (verboselevel()>3); // TODO: fix verboselevel()
- bool debug=false;
+ bool debug=false;
if (debug) {
cout << "*******prepareStats" ;
cout << text << endl;
diff --git a/mert/PermutationScorer.h b/mert/PermutationScorer.h
index 4d5c144ce..c3d0cc960 100644
--- a/mert/PermutationScorer.h
+++ b/mert/PermutationScorer.h
@@ -19,7 +19,7 @@
namespace MosesTuning
{
-
+
/**
* Permutation
**/
diff --git a/mert/Point.cpp b/mert/Point.cpp
index 5c446aa8b..2219749bd 100644
--- a/mert/Point.cpp
+++ b/mert/Point.cpp
@@ -29,7 +29,7 @@ Point::Point() : vector<parameter_t>(m_dim), m_score(0.0) {}
Point::Point(const vector<parameter_t>& init,
const vector<parameter_t>& min,
const vector<parameter_t>& max)
- : vector<parameter_t>(Point::m_dim), m_score(0.0)
+ : vector<parameter_t>(Point::m_dim), m_score(0.0)
{
m_min.resize(Point::m_dim);
m_max.resize(Point::m_dim);
diff --git a/mert/Point.h b/mert/Point.h
index 92cb832dd..f53f5f982 100644
--- a/mert/Point.h
+++ b/mert/Point.h
@@ -8,7 +8,7 @@
namespace MosesTuning
{
-
+
class FeatureStats;
class Optimizer;
@@ -53,11 +53,19 @@ private:
statscore_t m_score;
public:
- static unsigned int getdim() { return m_dim; }
- static void setdim(std::size_t d) { m_dim = d; }
+ static unsigned int getdim() {
+ return m_dim;
+ }
+ static void setdim(std::size_t d) {
+ m_dim = d;
+ }
- static unsigned int getpdim() { return m_pdim; }
- static void setpdim(std::size_t pd) { m_pdim = pd; }
+ static unsigned int getpdim() {
+ return m_pdim;
+ }
+ static void setpdim(std::size_t pd) {
+ m_pdim = pd;
+ }
static void set_optindices(const std::vector<unsigned int>& indices) {
m_opt_indices = indices;
@@ -90,7 +98,9 @@ public:
*/
friend std::ostream& operator<<(std::ostream& o,const Point& P);
- void Normalize() { NormalizeL2(); }
+ void Normalize() {
+ NormalizeL2();
+ }
void NormalizeL2();
void NormalizeL1();
@@ -100,8 +110,12 @@ public:
*/
void GetAllWeights(std::vector<parameter_t>& w) const;
- statscore_t GetScore() const { return m_score; }
- void SetScore(statscore_t score) { m_score = score; }
+ statscore_t GetScore() const {
+ return m_score;
+ }
+ void SetScore(statscore_t score) {
+ m_score = score;
+ }
};
}
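Note on the accessors above: Point is both a std::vector of weights and the carrier of its own stat score, and Normalize() simply delegates to NormalizeL2(). A minimal self-contained sketch of what L2 normalization does to a weight vector (plain std::vector<float> assumed; this is not the Moses implementation itself):

#include <cmath>
#include <vector>

// Scale a weight vector to unit Euclidean length, as NormalizeL2() does.
void NormalizeL2(std::vector<float>& w) {
  float norm = 0.0f;
  for (std::size_t i = 0; i < w.size(); ++i) norm += w[i] * w[i];
  norm = std::sqrt(norm);
  if (norm == 0.0f) return;   // leave an all-zero vector untouched
  for (std::size_t i = 0; i < w.size(); ++i) w[i] /= norm;
}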
diff --git a/mert/PointTest.cpp b/mert/PointTest.cpp
index df270dec9..f9e8e8bb2 100644
--- a/mert/PointTest.cpp
+++ b/mert/PointTest.cpp
@@ -9,7 +9,8 @@
using namespace std;
using namespace MosesTuning;
-BOOST_AUTO_TEST_CASE(point_operators) {
+BOOST_AUTO_TEST_CASE(point_operators)
+{
const unsigned int dim = 5;
vector<float> init(dim);
init[0] = 1.0f;
diff --git a/mert/PreProcessFilter.cpp b/mert/PreProcessFilter.cpp
index da26177f7..4fbcc0c89 100644
--- a/mert/PreProcessFilter.cpp
+++ b/mert/PreProcessFilter.cpp
@@ -18,7 +18,7 @@ using namespace std;
namespace MosesTuning
{
-
+
// Child exec error signal
void exec_failed (int sig)
@@ -28,116 +28,108 @@ void exec_failed (int sig)
}
PreProcessFilter::PreProcessFilter(const string& filterCommand)
- : m_toFilter(NULL),
- m_fromFilter(NULL)
+ : m_toFilter(NULL),
+ m_fromFilter(NULL)
{
- // Child error signal install
- // sigaction is the replacement for the traditional signal() method
- struct sigaction action;
- action.sa_handler = exec_failed;
- sigemptyset(&action.sa_mask);
- action.sa_flags = 0;
- if (sigaction(SIGUSR1, &action, NULL) < 0)
- {
- perror("SIGUSR1 install error");
- exit(EXIT_FAILURE);
- }
-
- int pipe_status;
- int pipefds_input[2];
- int pipefds_output[2];
- // int pipefds_error[2];
-
- // Create the pipes
- // We do this before the fork so both processes will know about
- // the same pipe and they can communicate.
-
- pipe_status = pipe(pipefds_input);
- if (pipe_status == -1)
- {
- perror("Error creating the pipe");
- exit(EXIT_FAILURE);
- }
-
- pipe_status = pipe(pipefds_output);
- if (pipe_status == -1)
- {
- perror("Error creating the pipe");
- exit(EXIT_FAILURE);
- }
-
- /*
- pipe_status = pipe(pipefds_error);
- if (pipe_status == -1)
- {
- perror("Error creating the pipe");
- exit(EXIT_FAILURE);
- }
- */
-
- pid_t pid;
- // Create child process; both processes continue from here
- pid = fork();
-
- if (pid == pid_t(0))
- {
- // Child process
-
- // When the child process finishes sends a SIGCHLD signal
- // to the parent
-
- // Tie the standard input, output and error streams to the
- // appropiate pipe ends
- // The file descriptor 0 is the standard input
- // We tie it to the read end of the pipe as we will use
- // this end of the pipe to read from it
- dup2 (CHILD_STDIN_READ,0);
- dup2 (CHILD_STDOUT_WRITE,1);
- // dup2 (CHILD_STDERR_WRITE,2);
- // Close in the child the unused ends of the pipes
- close(CHILD_STDIN_WRITE);
- close(CHILD_STDOUT_READ);
- //close(CHILD_STDERR_READ);
-
- // Execute the program
- execl("/bin/bash", "bash", "-c", filterCommand.c_str() , (char*)NULL);
-
- // We should never reach this point
- // Tell the parent the exec failed
- kill(getppid(), SIGUSR1);
- exit(EXIT_FAILURE);
- }
- else if (pid > pid_t(0))
- {
- // Parent
-
- // Close in the parent the unused ends of the pipes
- close(CHILD_STDIN_READ);
- close(CHILD_STDOUT_WRITE);
- // close(CHILD_STDERR_WRITE);
-
- m_toFilter = new ofdstream(CHILD_STDIN_WRITE);
- m_fromFilter = new ifdstream(CHILD_STDOUT_READ);
- }
- else
- {
- perror("Error: fork failed");
- exit(EXIT_FAILURE);
- }
+ // Child error signal install
+ // sigaction is the replacement for the traditional signal() method
+ struct sigaction action;
+ action.sa_handler = exec_failed;
+ sigemptyset(&action.sa_mask);
+ action.sa_flags = 0;
+ if (sigaction(SIGUSR1, &action, NULL) < 0) {
+ perror("SIGUSR1 install error");
+ exit(EXIT_FAILURE);
+ }
+
+ int pipe_status;
+ int pipefds_input[2];
+ int pipefds_output[2];
+ // int pipefds_error[2];
+
+ // Create the pipes
+ // We do this before the fork so both processes will know about
+ // the same pipe and they can communicate.
+
+ pipe_status = pipe(pipefds_input);
+ if (pipe_status == -1) {
+ perror("Error creating the pipe");
+ exit(EXIT_FAILURE);
+ }
+
+ pipe_status = pipe(pipefds_output);
+ if (pipe_status == -1) {
+ perror("Error creating the pipe");
+ exit(EXIT_FAILURE);
+ }
+
+ /*
+ pipe_status = pipe(pipefds_error);
+ if (pipe_status == -1)
+ {
+ perror("Error creating the pipe");
+ exit(EXIT_FAILURE);
+ }
+ */
+
+ pid_t pid;
+ // Create child process; both processes continue from here
+ pid = fork();
+
+ if (pid == pid_t(0)) {
+ // Child process
+
+      // When the child process finishes, it sends a SIGCHLD signal
+ // to the parent
+
+ // Tie the standard input, output and error streams to the
+      // appropriate pipe ends
+ // The file descriptor 0 is the standard input
+ // We tie it to the read end of the pipe as we will use
+ // this end of the pipe to read from it
+ dup2 (CHILD_STDIN_READ,0);
+ dup2 (CHILD_STDOUT_WRITE,1);
+ // dup2 (CHILD_STDERR_WRITE,2);
+ // Close in the child the unused ends of the pipes
+ close(CHILD_STDIN_WRITE);
+ close(CHILD_STDOUT_READ);
+ //close(CHILD_STDERR_READ);
+
+ // Execute the program
+ execl("/bin/bash", "bash", "-c", filterCommand.c_str() , (char*)NULL);
+
+ // We should never reach this point
+ // Tell the parent the exec failed
+ kill(getppid(), SIGUSR1);
+ exit(EXIT_FAILURE);
+ } else if (pid > pid_t(0)) {
+ // Parent
+
+ // Close in the parent the unused ends of the pipes
+ close(CHILD_STDIN_READ);
+ close(CHILD_STDOUT_WRITE);
+ // close(CHILD_STDERR_WRITE);
+
+ m_toFilter = new ofdstream(CHILD_STDIN_WRITE);
+ m_fromFilter = new ifdstream(CHILD_STDOUT_READ);
+ } else {
+ perror("Error: fork failed");
+ exit(EXIT_FAILURE);
+ }
}
string PreProcessFilter::ProcessSentence(const string& sentence)
{
- *m_toFilter << sentence << "\n";
- string processedSentence;
- m_fromFilter->getline(processedSentence);
- return processedSentence;
+ *m_toFilter << sentence << "\n";
+ string processedSentence;
+ m_fromFilter->getline(processedSentence);
+ return processedSentence;
}
PreProcessFilter::~PreProcessFilter()
{
- delete m_toFilter;
- delete m_fromFilter;
+ delete m_toFilter;
+ delete m_fromFilter;
}
}
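The constructor above wires an external filter command to the parent process through two pipes: the child's stdin/stdout are rebound with dup2(), the unused pipe ends are closed on each side, and exec failure is reported back via SIGUSR1. A stripped-down sketch of the same pipe/fork/dup2/exec pattern (signal handling omitted; not the Moses code itself):

#include <cstdlib>
#include <sys/types.h>
#include <unistd.h>

// Spawn `command` under /bin/sh with its stdin/stdout attached to pipes.
// On success, *to_child is a writable fd and *from_child a readable one.
int spawn_filter(const char* command, int* to_child, int* from_child) {
  int in[2], out[2];
  if (pipe(in) == -1 || pipe(out) == -1) return -1;
  pid_t pid = fork();
  if (pid == 0) {                  // child
    dup2(in[0], 0);                // read end of "in" becomes stdin
    dup2(out[1], 1);               // write end of "out" becomes stdout
    close(in[1]); close(out[0]);   // close the ends the child won't use
    execl("/bin/sh", "sh", "-c", command, (char*)NULL);
    _exit(EXIT_FAILURE);           // only reached if exec failed
  }
  if (pid < 0) return -1;          // fork failed
  close(in[0]); close(out[1]);     // parent closes the child's ends
  *to_child = in[1];
  *from_child = out[0];
  return 0;
}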
diff --git a/mert/PreProcessFilter.h b/mert/PreProcessFilter.h
index 25e627f6d..9946ddccb 100644
--- a/mert/PreProcessFilter.h
+++ b/mert/PreProcessFilter.h
@@ -5,7 +5,7 @@
namespace MosesTuning
{
-
+
class ofdstream;
class ifdstream;
@@ -22,8 +22,8 @@ public:
~PreProcessFilter();
private:
- ofdstream* m_toFilter;
- ifdstream* m_fromFilter;
+ ofdstream* m_toFilter;
+ ifdstream* m_fromFilter;
};
}
diff --git a/mert/Reference.h b/mert/Reference.h
index 1d6869a12..2c12f2ed7 100644
--- a/mert/Reference.h
+++ b/mert/Reference.h
@@ -9,38 +9,57 @@
namespace MosesTuning
{
-
+
/**
* Reference class represents reference translations for an output
* translation used in calculating BLEU score.
*/
-class Reference {
- public:
+class Reference
+{
+public:
// for m_length
typedef std::vector<std::size_t>::iterator iterator;
typedef std::vector<std::size_t>::const_iterator const_iterator;
Reference() : m_counts(new NgramCounts) { }
- ~Reference() { delete m_counts; }
+ ~Reference() {
+ delete m_counts;
+ }
- NgramCounts* get_counts() { return m_counts; }
- const NgramCounts* get_counts() const { return m_counts; }
+ NgramCounts* get_counts() {
+ return m_counts;
+ }
+ const NgramCounts* get_counts() const {
+ return m_counts;
+ }
- iterator begin() { return m_length.begin(); }
- const_iterator begin() const { return m_length.begin(); }
- iterator end() { return m_length.end(); }
- const_iterator end() const { return m_length.end(); }
+ iterator begin() {
+ return m_length.begin();
+ }
+ const_iterator begin() const {
+ return m_length.begin();
+ }
+ iterator end() {
+ return m_length.end();
+ }
+ const_iterator end() const {
+ return m_length.end();
+ }
- void push_back(std::size_t len) { m_length.push_back(len); }
+ void push_back(std::size_t len) {
+ m_length.push_back(len);
+ }
- std::size_t num_references() const { return m_length.size(); }
+ std::size_t num_references() const {
+ return m_length.size();
+ }
int CalcAverage() const;
int CalcClosest(std::size_t length) const;
int CalcShortest() const;
- private:
+private:
NgramCounts* m_counts;
// multiple reference lengths
@@ -49,16 +68,18 @@ class Reference {
// TODO(tetsuok): fix this function and related stuff.
// "average" reference length should not be calculated at sentence-level unlike "closest".
-inline int Reference::CalcAverage() const {
+inline int Reference::CalcAverage() const
+{
int total = 0;
for (std::size_t i = 0; i < m_length.size(); ++i) {
total += m_length[i];
}
return static_cast<int>(
- static_cast<float>(total) / m_length.size());
+ static_cast<float>(total) / m_length.size());
}
-inline int Reference::CalcClosest(std::size_t length) const {
+inline int Reference::CalcClosest(std::size_t length) const
+{
int min_diff = INT_MAX;
int closest_ref_id = 0; // an index of the closest reference translation
for (std::size_t i = 0; i < m_length.size(); ++i) {
@@ -79,7 +100,8 @@ inline int Reference::CalcClosest(std::size_t length) const {
return static_cast<int>(m_length[closest_ref_id]);
}
-inline int Reference::CalcShortest() const {
+inline int Reference::CalcShortest() const
+{
return *std::min_element(m_length.begin(), m_length.end());
}
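Reference stores one length per reference translation plus shared n-gram counts; CalcClosest() picks the length with the smallest absolute gap to the hypothesis (the "closest" brevity-penalty variant of BLEU), while CalcAverage() truncates the mean. A hypothetical usage sketch mirroring the three calculators above:

Reference ref;
ref.push_back(4); ref.push_back(2); ref.push_back(7);
ref.CalcClosest(5);   // -> 4: |4-5| = 1 is the smallest gap
ref.CalcAverage();    // -> 4: int truncation of 13/3
ref.CalcShortest();   // -> 2: plain minimum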
diff --git a/mert/ReferenceTest.cpp b/mert/ReferenceTest.cpp
index ad76de1f7..c33321227 100644
--- a/mert/ReferenceTest.cpp
+++ b/mert/ReferenceTest.cpp
@@ -5,12 +5,14 @@
using namespace MosesTuning;
-BOOST_AUTO_TEST_CASE(refernece_count) {
+BOOST_AUTO_TEST_CASE(refernece_count)
+{
Reference ref;
BOOST_CHECK(ref.get_counts() != NULL);
}
-BOOST_AUTO_TEST_CASE(refernece_length_iterator) {
+BOOST_AUTO_TEST_CASE(refernece_length_iterator)
+{
Reference ref;
ref.push_back(4);
ref.push_back(2);
@@ -24,7 +26,8 @@ BOOST_AUTO_TEST_CASE(refernece_length_iterator) {
BOOST_CHECK(it == ref.end());
}
-BOOST_AUTO_TEST_CASE(refernece_length_average) {
+BOOST_AUTO_TEST_CASE(refernece_length_average)
+{
{
Reference ref;
ref.push_back(4);
@@ -49,7 +52,8 @@ BOOST_AUTO_TEST_CASE(refernece_length_average) {
}
}
-BOOST_AUTO_TEST_CASE(refernece_length_closest) {
+BOOST_AUTO_TEST_CASE(refernece_length_closest)
+{
{
Reference ref;
ref.push_back(4);
@@ -92,7 +96,8 @@ BOOST_AUTO_TEST_CASE(refernece_length_closest) {
}
}
-BOOST_AUTO_TEST_CASE(refernece_length_shortest) {
+BOOST_AUTO_TEST_CASE(refernece_length_shortest)
+{
{
Reference ref;
ref.push_back(4);
diff --git a/mert/ScopedVector.h b/mert/ScopedVector.h
index c87f07071..bd9251a7c 100644
--- a/mert/ScopedVector.h
+++ b/mert/ScopedVector.h
@@ -5,19 +5,26 @@
namespace MosesTuning
{
-
+
template <class T>
-class ScopedVector {
- public:
+class ScopedVector
+{
+public:
typedef typename std::vector<T*>::iterator iterator;
typedef typename std::vector<T*>::const_iterator const_iterator;
ScopedVector() {}
- virtual ~ScopedVector() { reset(); }
+ virtual ~ScopedVector() {
+ reset();
+ }
- bool empty() const { return m_vec.empty(); }
+ bool empty() const {
+ return m_vec.empty();
+ }
- void push_back(T *e) { m_vec.push_back(e); }
+ void push_back(T *e) {
+ m_vec.push_back(e);
+ }
void reset() {
for (iterator it = m_vec.begin(); it != m_vec.end(); ++it) {
@@ -26,27 +33,53 @@ class ScopedVector {
m_vec.clear();
}
- void reserve(std::size_t capacity) { m_vec.reserve(capacity); }
- void resize(std::size_t size) { m_vec.resize(size); }
+ void reserve(std::size_t capacity) {
+ m_vec.reserve(capacity);
+ }
+ void resize(std::size_t size) {
+ m_vec.resize(size);
+ }
- std::size_t size() const {return m_vec.size(); }
+ std::size_t size() const {
+ return m_vec.size();
+ }
- iterator begin() { return m_vec.begin(); }
- const_iterator begin() const { return m_vec.begin(); }
+ iterator begin() {
+ return m_vec.begin();
+ }
+ const_iterator begin() const {
+ return m_vec.begin();
+ }
- iterator end() { return m_vec.end(); }
- const_iterator end() const { return m_vec.end(); }
+ iterator end() {
+ return m_vec.end();
+ }
+ const_iterator end() const {
+ return m_vec.end();
+ }
- std::vector<T*>& get() { return m_vec; }
- const std::vector<T*>& get() const { return m_vec; }
+ std::vector<T*>& get() {
+ return m_vec;
+ }
+ const std::vector<T*>& get() const {
+ return m_vec;
+ }
- std::vector<T*>* operator->() { return &m_vec; }
- const std::vector<T*>* operator->() const { return &m_vec; }
+ std::vector<T*>* operator->() {
+ return &m_vec;
+ }
+ const std::vector<T*>* operator->() const {
+ return &m_vec;
+ }
- T*& operator[](std::size_t i) { return m_vec[i]; }
- const T* operator[](std::size_t i) const { return m_vec[i]; }
+ T*& operator[](std::size_t i) {
+ return m_vec[i];
+ }
+ const T* operator[](std::size_t i) const {
+ return m_vec[i];
+ }
- private:
+private:
std::vector<T*> m_vec;
// no copying allowed.
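ScopedVector is a small RAII owner for a vector of raw pointers: the destructor calls reset(), which deletes each element before clearing the vector, and copying is forbidden so two owners can never double-delete. A hypothetical usage sketch (values invented):

ScopedVector<int> v;
v.push_back(new int(42));      // ownership passes to v
v.push_back(new int(7));
std::size_t n = v.size();      // n == 2
int first = *v[0];             // operator[] hands back the raw pointer
// no manual delete: ~ScopedVector() -> reset() frees both ints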
diff --git a/mert/ScoreArray.cpp b/mert/ScoreArray.cpp
index dcd0f7680..dd9aa5b07 100644
--- a/mert/ScoreArray.cpp
+++ b/mert/ScoreArray.cpp
@@ -17,12 +17,12 @@ namespace MosesTuning
ScoreArray::ScoreArray()
- : m_num_scores(0), m_index(0) {}
+ : m_num_scores(0), m_index(0) {}
void ScoreArray::savetxt(ostream* os, const string& sctype)
{
*os << SCORES_TXT_BEGIN << " " << m_index << " " << m_array.size()
- << " " << m_num_scores << " " << sctype << endl;
+ << " " << m_num_scores << " " << sctype << endl;
for (scorearray_t::iterator i = m_array.begin(); i !=m_array.end(); i++) {
i->savetxt(os);
*os << endl;
@@ -33,7 +33,7 @@ void ScoreArray::savetxt(ostream* os, const string& sctype)
void ScoreArray::savebin(ostream* os, const string& score_type)
{
*os << SCORES_BIN_BEGIN << " " << m_index << " " << m_array.size()
- << " " << m_num_scores << " " << score_type << endl;
+ << " " << m_num_scores << " " << score_type << endl;
for (scorearray_t::iterator i = m_array.begin();
i != m_array.end(); i++) {
i->savebin(os);
@@ -63,7 +63,8 @@ void ScoreArray::save(const string &file, const string& score_type, bool bin)
ofs.close();
}
-void ScoreArray::save(const string& score_type, bool bin) {
+void ScoreArray::save(const string& score_type, bool bin)
+{
save(&cout, score_type, bin);
}
diff --git a/mert/ScoreArray.h b/mert/ScoreArray.h
index 5b6c748cb..438b57e3f 100644
--- a/mert/ScoreArray.h
+++ b/mert/ScoreArray.h
@@ -25,7 +25,7 @@ const char SCORES_BIN_END[] = "SCORES_BIN_END_0";
class ScoreArray
{
- private:
+private:
scorearray_t m_array;
std::string m_score_type;
std::size_t m_num_scores;
@@ -38,17 +38,29 @@ public:
ScoreArray();
~ScoreArray() {}
- void clear() { m_array.clear(); }
+ void clear() {
+ m_array.clear();
+ }
- int getIndex() const { return m_index; }
+ int getIndex() const {
+ return m_index;
+ }
- void setIndex(int value) { m_index = value; }
+ void setIndex(int value) {
+ m_index = value;
+ }
- ScoreStats& get(std::size_t i) { return m_array.at(i); }
+ ScoreStats& get(std::size_t i) {
+ return m_array.at(i);
+ }
- const ScoreStats& get(std::size_t i) const { return m_array.at(i); }
+ const ScoreStats& get(std::size_t i) const {
+ return m_array.at(i);
+ }
- void add(const ScoreStats& e) { m_array.push_back(e); }
+ void add(const ScoreStats& e) {
+ m_array.push_back(e);
+ }
//ADDED BY TS
void swap(std::size_t i, std::size_t j) {
@@ -62,15 +74,25 @@ public:
void merge(ScoreArray& e);
- std::string name() const { return m_score_type; }
+ std::string name() const {
+ return m_score_type;
+ }
- void name(std::string &score_type) { m_score_type = score_type; }
+ void name(std::string &score_type) {
+ m_score_type = score_type;
+ }
- std::size_t size() const { return m_array.size(); }
+ std::size_t size() const {
+ return m_array.size();
+ }
- std::size_t NumberOfScores() const { return m_num_scores; }
+ std::size_t NumberOfScores() const {
+ return m_num_scores;
+ }
- void NumberOfScores(std::size_t v) { m_num_scores = v; }
+ void NumberOfScores(std::size_t v) {
+ m_num_scores = v;
+ }
void savetxt(std::ostream* os, const std::string& score_type);
void savebin(std::ostream* os, const std::string& score_type);
diff --git a/mert/ScoreData.cpp b/mert/ScoreData.cpp
index d02a4d700..0906b3459 100644
--- a/mert/ScoreData.cpp
+++ b/mert/ScoreData.cpp
@@ -50,7 +50,8 @@ void ScoreData::save(const string &file, bool bin)
ofs.close();
}
-void ScoreData::save(bool bin) {
+void ScoreData::save(bool bin)
+{
save(&cout, bin);
}
diff --git a/mert/ScoreData.h b/mert/ScoreData.h
index ac3a6faa6..9159e029f 100644
--- a/mert/ScoreData.h
+++ b/mert/ScoreData.h
@@ -40,7 +40,9 @@ public:
ScoreData(Scorer* scorer);
~ScoreData() {}
- void clear() { m_array.clear(); }
+ void clear() {
+ m_array.clear();
+ }
inline ScoreArray& get(std::size_t idx) {
return m_array.at(idx);
@@ -66,7 +68,9 @@ public:
return m_array.at(i).get(j);
}
- std::string name() const { return m_score_type; }
+ std::string name() const {
+ return m_score_type;
+ }
std::string name(const std::string &score_type) {
return m_score_type = score_type;
@@ -75,8 +79,12 @@ public:
void add(ScoreArray& e);
void add(const ScoreStats& e, int sent_idx);
- std::size_t NumberOfScores() const { return m_num_scores; }
- std::size_t size() const { return m_array.size(); }
+ std::size_t NumberOfScores() const {
+ return m_num_scores;
+ }
+ std::size_t size() const {
+ return m_array.size();
+ }
void save(const std::string &file, bool bin=false);
void save(std::ostream* os, bool bin=false);
diff --git a/mert/ScoreDataIterator.cpp b/mert/ScoreDataIterator.cpp
index 80568b810..71e05ab0b 100644
--- a/mert/ScoreDataIterator.cpp
+++ b/mert/ScoreDataIterator.cpp
@@ -29,18 +29,20 @@ using namespace util;
namespace MosesTuning
{
-
+
ScoreDataIterator::ScoreDataIterator() {}
-ScoreDataIterator::ScoreDataIterator(const string& filename) {
+ScoreDataIterator::ScoreDataIterator(const string& filename)
+{
m_in.reset(new FilePiece(filename.c_str()));
readNext();
}
ScoreDataIterator::~ScoreDataIterator() {}
-void ScoreDataIterator::readNext() {
+void ScoreDataIterator::readNext()
+{
m_next.clear();
try {
StringPiece marker = m_in->ReadDelimited();
@@ -71,12 +73,14 @@ void ScoreDataIterator::readNext() {
}
}
-void ScoreDataIterator::increment() {
+void ScoreDataIterator::increment()
+{
readNext();
}
-bool ScoreDataIterator::equal(const ScoreDataIterator& rhs) const {
+bool ScoreDataIterator::equal(const ScoreDataIterator& rhs) const
+{
if (!m_in && !rhs.m_in) {
return true;
} else if (!m_in) {
@@ -84,13 +88,14 @@ bool ScoreDataIterator::equal(const ScoreDataIterator& rhs) const {
} else if (!rhs.m_in) {
return false;
} else {
- return m_in->FileName() == rhs.m_in->FileName() &&
- m_in->Offset() == rhs.m_in->Offset();
+ return m_in->FileName() == rhs.m_in->FileName() &&
+ m_in->Offset() == rhs.m_in->Offset();
}
}
-const vector<ScoreDataItem>& ScoreDataIterator::dereference() const {
+const vector<ScoreDataItem>& ScoreDataIterator::dereference() const
+{
return m_next;
}
diff --git a/mert/ScoreDataIterator.h b/mert/ScoreDataIterator.h
index 50640c158..e5968a8f7 100644
--- a/mert/ScoreDataIterator.h
+++ b/mert/ScoreDataIterator.h
@@ -33,40 +33,43 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "FeatureDataIterator.h"
-namespace util { class FilePiece; }
+namespace util
+{
+class FilePiece;
+}
namespace MosesTuning
{
-
+
typedef std::vector<float> ScoreDataItem;
class ScoreDataIterator :
public boost::iterator_facade<ScoreDataIterator,
- const std::vector<ScoreDataItem>,
- boost::forward_traversal_tag>
+ const std::vector<ScoreDataItem>,
+ boost::forward_traversal_tag>
{
- public:
- ScoreDataIterator();
- explicit ScoreDataIterator(const std::string& filename);
+public:
+ ScoreDataIterator();
+ explicit ScoreDataIterator(const std::string& filename);
- ~ScoreDataIterator();
+ ~ScoreDataIterator();
- static ScoreDataIterator end() {
- return ScoreDataIterator();
- }
+ static ScoreDataIterator end() {
+ return ScoreDataIterator();
+ }
- private:
- friend class boost::iterator_core_access;
+private:
+ friend class boost::iterator_core_access;
- void increment();
- bool equal(const ScoreDataIterator& rhs) const;
- const std::vector<ScoreDataItem>& dereference() const;
+ void increment();
+ bool equal(const ScoreDataIterator& rhs) const;
+ const std::vector<ScoreDataItem>& dereference() const;
- void readNext();
+ void readNext();
- boost::shared_ptr<util::FilePiece> m_in;
- std::vector<ScoreDataItem> m_next;
+ boost::shared_ptr<util::FilePiece> m_in;
+ std::vector<ScoreDataItem> m_next;
};
}
diff --git a/mert/ScoreStats.cpp b/mert/ScoreStats.cpp
index 20e707005..1c66cdb5f 100644
--- a/mert/ScoreStats.cpp
+++ b/mert/ScoreStats.cpp
@@ -13,21 +13,22 @@
using namespace std;
-namespace {
+namespace
+{
const int kAvailableSize = 8;
} // namespace
namespace MosesTuning
{
-
+
ScoreStats::ScoreStats()
- : m_available_size(kAvailableSize), m_entries(0),
- m_array(new ScoreStatsType[m_available_size]) {}
+ : m_available_size(kAvailableSize), m_entries(0),
+ m_array(new ScoreStatsType[m_available_size]) {}
ScoreStats::ScoreStats(const size_t size)
- : m_available_size(size), m_entries(size),
- m_array(new ScoreStatsType[m_available_size])
+ : m_available_size(size), m_entries(size),
+ m_array(new ScoreStatsType[m_available_size])
{
memset(m_array, 0, GetArraySizeWithBytes());
}
@@ -123,7 +124,8 @@ void ScoreStats::savetxt(ostream* os)
*os << *this;
}
-void ScoreStats::savetxt() {
+void ScoreStats::savetxt()
+{
savetxt(&cout);
}
@@ -140,7 +142,8 @@ ostream& operator<<(ostream& o, const ScoreStats& e)
return o;
}
-bool operator==(const ScoreStats& s1, const ScoreStats& s2) {
+bool operator==(const ScoreStats& s1, const ScoreStats& s2)
+{
size_t size = s1.size();
if (size != s2.size())
diff --git a/mert/ScoreStats.h b/mert/ScoreStats.h
index 4088b655e..930b03cea 100644
--- a/mert/ScoreStats.h
+++ b/mert/ScoreStats.h
@@ -18,7 +18,7 @@
namespace MosesTuning
{
-
+
class ScoreStats
{
@@ -41,7 +41,9 @@ public:
void Copy(const ScoreStats &stats);
- bool isfull() const { return (m_entries < m_available_size) ? 0 : 1; }
+ bool isfull() const {
+ return (m_entries < m_available_size) ? 0 : 1;
+ }
void expand();
void add(ScoreStatsType v);
@@ -55,9 +57,15 @@ public:
clear();
}
- ScoreStatsType get(std::size_t i) { return m_array[i]; }
- ScoreStatsType get(std::size_t i) const { return m_array[i]; }
- scorestats_t getArray() const { return m_array; }
+ ScoreStatsType get(std::size_t i) {
+ return m_array[i];
+ }
+ ScoreStatsType get(std::size_t i) const {
+ return m_array[i];
+ }
+ scorestats_t getArray() const {
+ return m_array;
+ }
void set(const std::string& str);
@@ -69,15 +77,21 @@ public:
}
}
- std::size_t bytes() const { return GetArraySizeWithBytes(); }
+ std::size_t bytes() const {
+ return GetArraySizeWithBytes();
+ }
std::size_t GetArraySizeWithBytes() const {
return m_entries * sizeof(ScoreStatsType);
}
- std::size_t size() const { return m_entries; }
+ std::size_t size() const {
+ return m_entries;
+ }
- std::size_t available() const { return m_available_size; }
+ std::size_t available() const {
+ return m_available_size;
+ }
void savetxt(const std::string &file);
void savetxt(std::ostream* os);
diff --git a/mert/Scorer.cpp b/mert/Scorer.cpp
index 1a7136dc0..e3cc3d7e6 100644
--- a/mert/Scorer.cpp
+++ b/mert/Scorer.cpp
@@ -12,27 +12,31 @@ using namespace std;
namespace MosesTuning
{
-namespace {
+namespace
+{
// For tokenizing a hypothesis translation, we may encounter unknown tokens which
// do not exist in the corresponding reference translations.
const int kUnknownToken = -1;
} // namespace
Scorer::Scorer(const string& name, const string& config)
- : m_name(name),
- m_vocab(mert::VocabularyFactory::GetVocabulary()),
- m_filter(NULL),
- m_score_data(NULL),
- m_enable_preserve_case(true) {
+ : m_name(name),
+ m_vocab(mert::VocabularyFactory::GetVocabulary()),
+ m_filter(NULL),
+ m_score_data(NULL),
+ m_enable_preserve_case(true)
+{
InitConfig(config);
}
-Scorer::~Scorer() {
+Scorer::~Scorer()
+{
Singleton<mert::Vocabulary>::Delete();
delete m_filter;
}
-void Scorer::InitConfig(const string& config) {
+void Scorer::InitConfig(const string& config)
+{
// cerr << "Scorer config string: " << config << endl;
size_t start = 0;
while (start < config.size()) {
@@ -53,7 +57,8 @@ void Scorer::InitConfig(const string& config) {
}
}
-void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded) {
+void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded)
+{
for (util::TokenIter<util::AnyCharacter, true> it(line, util::AnyCharacter(" "));
it; ++it) {
if (!m_enable_preserve_case) {
@@ -69,7 +74,8 @@ void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded) {
}
}
-void Scorer::TokenizeAndEncodeTesting(const string& line, vector<int>& encoded) {
+void Scorer::TokenizeAndEncodeTesting(const string& line, vector<int>& encoded)
+{
for (util::TokenIter<util::AnyCharacter, true> it(line, util::AnyCharacter(" "));
it; ++it) {
if (!m_enable_preserve_case) {
@@ -103,8 +109,7 @@ void Scorer::setFactors(const string& factors)
if (factors.empty()) return;
vector<string> factors_vec;
split(factors, '|', factors_vec);
- for(vector<string>::iterator it = factors_vec.begin(); it != factors_vec.end(); ++it)
- {
+ for(vector<string>::iterator it = factors_vec.begin(); it != factors_vec.end(); ++it) {
int factor = atoi(it->c_str());
m_factors.push_back(factor);
}
@@ -115,8 +120,8 @@ void Scorer::setFactors(const string& factors)
*/
void Scorer::setFilter(const string& filterCommand)
{
- if (filterCommand.empty()) return;
- m_filter = new PreProcessFilter(filterCommand);
+ if (filterCommand.empty()) return;
+ m_filter = new PreProcessFilter(filterCommand);
}
/**
@@ -130,8 +135,7 @@ string Scorer::applyFactors(const string& sentence) const
split(sentence, ' ', tokens);
stringstream sstream;
- for (size_t i = 0; i < tokens.size(); ++i)
- {
+ for (size_t i = 0; i < tokens.size(); ++i) {
if (tokens[i] == "") continue;
vector<string> factors;
@@ -141,8 +145,7 @@ string Scorer::applyFactors(const string& sentence) const
if (i > 0) sstream << " ";
- for (size_t j = 0; j < m_factors.size(); ++j)
- {
+ for (size_t j = 0; j < m_factors.size(); ++j) {
int findex = m_factors[j];
if (findex < 0 || findex >= fsize) throw runtime_error("Factor index is out of range.");
@@ -158,17 +161,15 @@ string Scorer::applyFactors(const string& sentence) const
*/
string Scorer::applyFilter(const string& sentence) const
{
- if (m_filter)
- {
+ if (m_filter) {
return m_filter->ProcessSentence(sentence);
- }
- else
- {
+ } else {
return sentence;
}
}
-float Scorer::score(const candidates_t& candidates) const {
+float Scorer::score(const candidates_t& candidates) const
+{
diffs_t diffs;
statscores_t scores;
score(candidates, diffs, scores);
diff --git a/mert/Scorer.h b/mert/Scorer.h
index 0a090d1c4..4a1f4476a 100644
--- a/mert/Scorer.h
+++ b/mert/Scorer.h
@@ -10,7 +10,8 @@
#include "Types.h"
#include "ScoreData.h"
-namespace mert {
+namespace mert
+{
class Vocabulary;
@@ -32,7 +33,7 @@ enum ScorerRegularisationStrategy {REG_NONE, REG_AVERAGE, REG_MINIMUM};
*/
class Scorer
{
- public:
+public:
Scorer(const std::string& name, const std::string& config);
virtual ~Scorer();
@@ -117,14 +118,16 @@ class Scorer
*/
virtual void setFactors(const std::string& factors);
- mert::Vocabulary* GetVocab() const { return m_vocab; }
+ mert::Vocabulary* GetVocab() const {
+ return m_vocab;
+ }
/**
* Set unix filter, which will be used to preprocess the sentences
*/
virtual void setFilter(const std::string& filterCommand);
- private:
+private:
void InitConfig(const std::string& config);
/**
@@ -143,7 +146,7 @@ class Scorer
std::vector<int> m_factors;
PreProcessFilter* m_filter;
- protected:
+protected:
ScoreData* m_score_data;
bool m_enable_preserve_case;
@@ -173,41 +176,41 @@ class Scorer
/**
* Every inherited scorer should call this function for each sentence
*/
- std::string preprocessSentence(const std::string& sentence) const
- {
+ std::string preprocessSentence(const std::string& sentence) const {
return applyFactors(applyFilter(sentence));
}
};
-namespace {
+namespace
+{
- //regularisation strategies
- inline float score_min(const statscores_t& scores, size_t start, size_t end)
- {
- float min = std::numeric_limits<float>::max();
- for (size_t i = start; i < end; ++i) {
- if (scores[i] < min) {
- min = scores[i];
- }
+//regularisation strategies
+inline float score_min(const statscores_t& scores, size_t start, size_t end)
+{
+ float min = std::numeric_limits<float>::max();
+ for (size_t i = start; i < end; ++i) {
+ if (scores[i] < min) {
+ min = scores[i];
}
- return min;
}
+ return min;
+}
- inline float score_average(const statscores_t& scores, size_t start, size_t end)
- {
- if ((end - start) < 1) {
- // this shouldn't happen
- return 0;
- }
- float total = 0;
- for (size_t j = start; j < end; ++j) {
- total += scores[j];
- }
-
- return total / (end - start);
+inline float score_average(const statscores_t& scores, size_t start, size_t end)
+{
+ if ((end - start) < 1) {
+ // this shouldn't happen
+ return 0;
+ }
+ float total = 0;
+ for (size_t j = start; j < end; ++j) {
+ total += scores[j];
}
+ return total / (end - start);
+}
+
} // namespace
}
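The two helpers above implement the regularisation strategies behind REG_MINIMUM and REG_AVERAGE: take the minimum or the mean of the scores in a window [start, end). A worked example with invented values (statscores_t is assumed to hold float-like scores):

statscores_t s;
s.push_back(0.3f); s.push_back(0.5f); s.push_back(0.1f);
score_min(s, 0, 3);       // -> 0.1f, the window minimum
score_average(s, 0, 3);   // -> 0.3f, i.e. (0.3 + 0.5 + 0.1) / 3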
diff --git a/mert/ScorerFactory.cpp b/mert/ScorerFactory.cpp
index 126218b65..02000c1bc 100644
--- a/mert/ScorerFactory.cpp
+++ b/mert/ScorerFactory.cpp
@@ -14,9 +14,10 @@ using namespace std;
namespace MosesTuning
{
-
-vector<string> ScorerFactory::getTypes() {
+
+vector<string> ScorerFactory::getTypes()
+{
vector<string> types;
types.push_back(string("BLEU"));
types.push_back(string("PER"));
@@ -29,7 +30,8 @@ vector<string> ScorerFactory::getTypes() {
return types;
}
-Scorer* ScorerFactory::getScorer(const string& type, const string& config) {
+Scorer* ScorerFactory::getScorer(const string& type, const string& config)
+{
if (type == "BLEU") {
return new BleuScorer(config);
} else if (type == "PER") {
@@ -48,8 +50,7 @@ Scorer* ScorerFactory::getScorer(const string& type, const string& config) {
} else {
if (type.find(',') != string::npos) {
return new InterpolatedScorer(type, config);
- }
- else {
+ } else {
throw runtime_error("Unknown scorer type: " + type);
}
}
diff --git a/mert/ScorerFactory.h b/mert/ScorerFactory.h
index e8b33d87c..b93db3024 100644
--- a/mert/ScorerFactory.h
+++ b/mert/ScorerFactory.h
@@ -6,7 +6,7 @@
namespace MosesTuning
{
-
+
class Scorer;
diff --git a/mert/SemposOverlapping.cpp b/mert/SemposOverlapping.cpp
index ffcabaab2..718bc7f26 100644
--- a/mert/SemposOverlapping.cpp
+++ b/mert/SemposOverlapping.cpp
@@ -6,7 +6,8 @@
using namespace std;
-namespace {
+namespace
+{
MosesTuning::SemposOverlapping* g_overlapping = NULL;
@@ -14,9 +15,10 @@ MosesTuning::SemposOverlapping* g_overlapping = NULL;
namespace MosesTuning
{
-
-SemposOverlapping* SemposOverlappingFactory::GetOverlapping(const string& str, const SemposScorer* sempos) {
+
+SemposOverlapping* SemposOverlappingFactory::GetOverlapping(const string& str, const SemposScorer* sempos)
+{
if (str == "cap-micro") {
return new CapMicroOverlapping(sempos);
} else if (str == "cap-macro") {
@@ -26,7 +28,8 @@ SemposOverlapping* SemposOverlappingFactory::GetOverlapping(const string& str, c
}
}
-void SemposOverlappingFactory::SetOverlapping(SemposOverlapping* ovr) {
+void SemposOverlappingFactory::SetOverlapping(SemposOverlapping* ovr)
+{
g_overlapping = ovr;
}
@@ -41,15 +44,13 @@ vector<int> CapMicroOverlapping::prepareStats(const sentence_t& cand, const sent
int multCoeff = 1000;
float interSum = 0;
- for (sentence_t::iterator it = intersection.begin(); it != intersection.end(); it++)
- {
+ for (sentence_t::iterator it = intersection.begin(); it != intersection.end(); it++) {
interSum += semposScorer->weight(it->first);
}
float refSum = 0;
- for (sentence_t::iterator it = ref.begin(); it != ref.end(); it++)
- {
- refSum += semposScorer->weight(it->first);
+ for (sentence_t::iterator it = ref.begin(); it != ref.end(); it++) {
+ refSum += semposScorer->weight(it->first);
}
stats[0] = (int)(multCoeff * interSum);
diff --git a/mert/SemposOverlapping.h b/mert/SemposOverlapping.h
index 3b5a99f7f..5eddbaef3 100644
--- a/mert/SemposOverlapping.h
+++ b/mert/SemposOverlapping.h
@@ -9,7 +9,7 @@
namespace MosesTuning
{
-
+
class SemposScorer;
@@ -36,14 +36,15 @@ public:
virtual std::size_t NumberOfScores() const = 0;
};
-class SemposOverlappingFactory {
- public:
+class SemposOverlappingFactory
+{
+public:
static SemposOverlapping* GetOverlapping(const std::string& str, const SemposScorer* sempos);
// dependency injection for unit testing.
static void SetOverlapping(SemposOverlapping* ovr);
- private:
+private:
SemposOverlappingFactory() {}
~SemposOverlappingFactory() {}
};
@@ -62,9 +63,11 @@ public:
virtual std::vector<int> prepareStats(const sentence_t& cand, const sentence_t& ref);
virtual float calculateScore(const std::vector<int>& stats) const;
- virtual std::size_t NumberOfScores() const { return 2; }
+ virtual std::size_t NumberOfScores() const {
+ return 2;
+ }
- private:
+private:
// no copying allowed.
CapMicroOverlapping(const CapMicroOverlapping&);
CapMicroOverlapping& operator=(const CapMicroOverlapping&);
@@ -82,9 +85,11 @@ public:
virtual std::vector<int> prepareStats(const sentence_t& cand, const sentence_t& ref);
virtual float calculateScore(const std::vector<int>& stats) const;
- virtual std::size_t NumberOfScores() const { return kMaxNOC * 2; }
+ virtual std::size_t NumberOfScores() const {
+ return kMaxNOC * 2;
+ }
- private:
+private:
// no copying allowed.
CapMacroOverlapping(const CapMacroOverlapping&);
CapMacroOverlapping& operator=(const CapMacroOverlapping&);
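For orientation, cap-micro's prepareStats() above weights each sempos tag in the candidate/reference intersection and scales by multCoeff = 1000 before truncating to int; the hunk ends before the reference-side statistic, but by symmetry it presumably stores the scaled reference weight sum alongside. Under that assumption, with unit weights, 3 shared tags against 5 reference tags would yield stats of 3000 and 5000, for an overlap score of 0.6.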
diff --git a/mert/SemposScorer.cpp b/mert/SemposScorer.cpp
index 8dd1fc8ee..235f73fbf 100644
--- a/mert/SemposScorer.cpp
+++ b/mert/SemposScorer.cpp
@@ -12,7 +12,7 @@ using namespace std;
namespace MosesTuning
{
-
+
SemposScorer::SemposScorer(const string& config)
: StatisticsBasedScorer("SEMPOS", config),
@@ -25,8 +25,7 @@ SemposScorer::SemposScorer(const string& config)
m_semposMap.clear();
string weightsfile = getConfig("weightsfile", "");
- if (weightsfile != "")
- {
+ if (weightsfile != "") {
loadWeights(weightsfile);
}
}
@@ -144,42 +143,35 @@ int SemposScorer::encodeSempos(const string& sempos)
float SemposScorer::weight(int item) const
{
- std::map<int,float>::const_iterator it = weightsMap.find(item);
- if (it == weightsMap.end())
- {
- return 1.0f;
- }
- else
- {
- return it->second;
- }
+ std::map<int,float>::const_iterator it = weightsMap.find(item);
+ if (it == weightsMap.end()) {
+ return 1.0f;
+ } else {
+ return it->second;
+ }
}
void SemposScorer::loadWeights(const string& weightsfile)
{
- string line;
- ifstream myfile;
- myfile.open(weightsfile.c_str(), ifstream::in);
- if (myfile.is_open())
- {
- while ( myfile.good() )
- {
- getline (myfile,line);
- vector<string> fields;
- if (line == "") continue;
- split(line, '\t', fields);
- if (fields.size() != 2) throw std::runtime_error("Bad format of a row in weights file.");
- int encoded = encodeString(fields[0]);
- float weight = atof(fields[1].c_str());
- weightsMap[encoded] = weight;
- }
- myfile.close();
- }
- else
- {
- cerr << "Unable to open file "<< weightsfile << endl;
- exit(1);
+ string line;
+ ifstream myfile;
+ myfile.open(weightsfile.c_str(), ifstream::in);
+ if (myfile.is_open()) {
+ while ( myfile.good() ) {
+ getline (myfile,line);
+ vector<string> fields;
+ if (line == "") continue;
+ split(line, '\t', fields);
+ if (fields.size() != 2) throw std::runtime_error("Bad format of a row in weights file.");
+ int encoded = encodeString(fields[0]);
+ float weight = atof(fields[1].c_str());
+ weightsMap[encoded] = weight;
}
+ myfile.close();
+ } else {
+ cerr << "Unable to open file "<< weightsfile << endl;
+ exit(1);
+ }
}
diff --git a/mert/SemposScorer.h b/mert/SemposScorer.h
index bde064349..b6c735bbe 100644
--- a/mert/SemposScorer.h
+++ b/mert/SemposScorer.h
@@ -19,7 +19,7 @@
namespace MosesTuning
{
-
+
/**
* This class represents sempos based metrics.
@@ -32,12 +32,16 @@ public:
virtual void setReferenceFiles(const std::vector<std::string>& referenceFiles);
virtual void prepareStats(std::size_t sindex, const std::string& text, ScoreStats& entry);
- virtual std::size_t NumberOfScores() const { return m_ovr->NumberOfScores(); }
+ virtual std::size_t NumberOfScores() const {
+ return m_ovr->NumberOfScores();
+ }
virtual float calculateScore(const std::vector<int>& comps) const {
return m_ovr->calculateScore(comps);
}
- bool EnableDebug() const { return m_enable_debug; }
+ bool EnableDebug() const {
+ return m_enable_debug;
+ }
float weight(int item) const;
diff --git a/mert/SentenceLevelScorer.cpp b/mert/SentenceLevelScorer.cpp
index 0b159f0b7..0d1c15140 100644
--- a/mert/SentenceLevelScorer.cpp
+++ b/mert/SentenceLevelScorer.cpp
@@ -17,48 +17,50 @@ namespace MosesTuning
{
SentenceLevelScorer::SentenceLevelScorer(const string& name, const string& config)
- : Scorer(name, config),
- m_regularisationStrategy(REG_NONE),
- m_regularisationWindow(0) {
+ : Scorer(name, config),
+ m_regularisationStrategy(REG_NONE),
+ m_regularisationWindow(0)
+{
Init();
}
SentenceLevelScorer::~SentenceLevelScorer() {}
-void SentenceLevelScorer::Init() {
- // Configure regularisation.
- static string KEY_TYPE = "regtype";
- static string KEY_WINDOW = "regwin";
- static string KEY_CASE = "case";
- static string TYPE_NONE = "none";
- static string TYPE_AVERAGE = "average";
- static string TYPE_MINIMUM = "min";
- static string TRUE = "true";
- static string FALSE = "false";
-
- const string type = getConfig(KEY_TYPE, TYPE_NONE);
- if (type == TYPE_NONE) {
- m_regularisationStrategy = REG_NONE;
- } else if (type == TYPE_AVERAGE) {
- m_regularisationStrategy = REG_AVERAGE;
- } else if (type == TYPE_MINIMUM) {
- m_regularisationStrategy = REG_MINIMUM;
- } else {
- throw boost::lexer::runtime_error("Unknown scorer regularisation strategy: " + type);
- }
- cerr << "Using scorer regularisation strategy: " << type << endl;
+void SentenceLevelScorer::Init()
+{
+ // Configure regularisation.
+ static string KEY_TYPE = "regtype";
+ static string KEY_WINDOW = "regwin";
+ static string KEY_CASE = "case";
+ static string TYPE_NONE = "none";
+ static string TYPE_AVERAGE = "average";
+ static string TYPE_MINIMUM = "min";
+ static string TRUE = "true";
+ static string FALSE = "false";
+
+ const string type = getConfig(KEY_TYPE, TYPE_NONE);
+ if (type == TYPE_NONE) {
+ m_regularisationStrategy = REG_NONE;
+ } else if (type == TYPE_AVERAGE) {
+ m_regularisationStrategy = REG_AVERAGE;
+ } else if (type == TYPE_MINIMUM) {
+ m_regularisationStrategy = REG_MINIMUM;
+ } else {
+ throw boost::lexer::runtime_error("Unknown scorer regularisation strategy: " + type);
+ }
+ cerr << "Using scorer regularisation strategy: " << type << endl;
- const string window = getConfig(KEY_WINDOW, "0");
- m_regularisationWindow = atoi(window.c_str());
- cerr << "Using scorer regularisation window: " << m_regularisationWindow << endl;
+ const string window = getConfig(KEY_WINDOW, "0");
+ m_regularisationWindow = atoi(window.c_str());
+ cerr << "Using scorer regularisation window: " << m_regularisationWindow << endl;
- const string preservecase = getConfig(KEY_CASE, TRUE);
- if (preservecase == TRUE) {
- m_enable_preserve_case = true;
- } else if (preservecase == FALSE) {
- m_enable_preserve_case = false;
- }
- cerr << "Using case preservation: " << m_enable_preserve_case << endl;
+ const string preservecase = getConfig(KEY_CASE, TRUE);
+ if (preservecase == TRUE) {
+ m_enable_preserve_case = true;
+ } else if (preservecase == FALSE) {
+ m_enable_preserve_case = false;
+ }
+ cerr << "Using case preservation: " << m_enable_preserve_case << endl;
}
void SentenceLevelScorer::score(const candidates_t& candidates, const diffs_t& diffs,
@@ -83,8 +85,8 @@ void SentenceLevelScorer::score(const candidates_t& candidates, const diffs_t&
if (stats.size() != totals.size()) {
stringstream msg;
msg << "Statistics for (" << "," << candidates[i] << ") have incorrect "
- << "number of fields. Found: " << stats.size() << " Expected: "
- << totals.size();
+ << "number of fields. Found: " << stats.size() << " Expected: "
+ << totals.size();
throw runtime_error(msg.str());
}
//Add up scores for all sentences, would normally be just one score
diff --git a/mert/Singleton.h b/mert/Singleton.h
index 473517170..f50925fa4 100644
--- a/mert/Singleton.h
+++ b/mert/Singleton.h
@@ -5,13 +5,14 @@
namespace MosesTuning
{
-
+
// thread *un*safe singleton.
// TODO: replace this with thread-safe singleton.
template <typename T>
-class Singleton {
- public:
+class Singleton
+{
+public:
static T* GetInstance() {
if (m_instance == NULL) {
m_instance = new T;
@@ -26,7 +27,7 @@ class Singleton {
}
}
- private:
+private:
Singleton();
static T* m_instance;
};
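As the comment above says, this lazy singleton is not thread-safe: two threads can both observe m_instance == NULL and allocate twice. A hedged sketch of the usual C++11 alternative, the "Meyers singleton" (function-local statics are initialized exactly once even under concurrent first calls; note it gives up the explicit Delete() that the Moses version offers and its tests rely on):

template <typename T>
T& GetInstanceSafe() {
  static T instance;   // C++11 guarantees one-time construction
  return instance;
}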
diff --git a/mert/SingletonTest.cpp b/mert/SingletonTest.cpp
index a74ce7c6b..36acbeec2 100644
--- a/mert/SingletonTest.cpp
+++ b/mert/SingletonTest.cpp
@@ -5,19 +5,24 @@
using namespace MosesTuning;
-namespace {
+namespace
+{
static int g_count = 0;
-class Instance {
- public:
- Instance() { ++g_count; }
+class Instance
+{
+public:
+ Instance() {
+ ++g_count;
+ }
~Instance() {}
};
} // namespace
-BOOST_AUTO_TEST_CASE(singleton_basic) {
+BOOST_AUTO_TEST_CASE(singleton_basic)
+{
Instance* instance1 = Singleton<Instance>::GetInstance();
Instance* instance2 = Singleton<Instance>::GetInstance();
Instance* instance3 = Singleton<Instance>::GetInstance();
diff --git a/mert/StatisticsBasedScorer.cpp b/mert/StatisticsBasedScorer.cpp
index 05dd95939..869e2f55a 100644
--- a/mert/StatisticsBasedScorer.cpp
+++ b/mert/StatisticsBasedScorer.cpp
@@ -13,10 +13,11 @@ using namespace std;
namespace MosesTuning
{
-
+
StatisticsBasedScorer::StatisticsBasedScorer(const string& name, const string& config)
-: Scorer(name,config) {
+ : Scorer(name,config)
+{
//configure regularisation
static string KEY_TYPE = "regtype";
static string KEY_WINDOW = "regwin";
@@ -26,7 +27,7 @@ StatisticsBasedScorer::StatisticsBasedScorer(const string& name, const string& c
static string TYPE_MINIMUM = "min";
static string TRUE = "true";
static string FALSE = "false";
-
+
string type = getConfig(KEY_TYPE,TYPE_NONE);
if (type == TYPE_NONE) {
m_regularization_type = NONE;
@@ -38,11 +39,11 @@ StatisticsBasedScorer::StatisticsBasedScorer(const string& name, const string& c
throw runtime_error("Unknown scorer regularisation strategy: " + type);
}
// cerr << "Using scorer regularisation strategy: " << type << endl;
-
+
const string& window = getConfig(KEY_WINDOW, "0");
m_regularization_window = atoi(window.c_str());
// cerr << "Using scorer regularisation window: " << m_regularization_window << endl;
-
+
const string& preserve_case = getConfig(KEY_CASE,TRUE);
if (preserve_case == TRUE) {
m_enable_preserve_case = true;
@@ -72,8 +73,8 @@ void StatisticsBasedScorer::score(const candidates_t& candidates, const diffs_t
if (stats.size() != totals.size()) {
stringstream msg;
msg << "Statistics for (" << "," << candidates[i] << ") have incorrect "
- << "number of fields. Found: " << stats.size() << " Expected: "
- << totals.size();
+ << "number of fields. Found: " << stats.size() << " Expected: "
+ << totals.size();
throw runtime_error(msg.str());
}
for (size_t k = 0; k < totals.size(); ++k) {
@@ -81,7 +82,7 @@ void StatisticsBasedScorer::score(const candidates_t& candidates, const diffs_t
}
}
scores.push_back(calculateScore(totals));
-
+
candidates_t last_candidates(candidates);
// apply each of the diffs, and get new scores
for (size_t i = 0; i < diffs.size(); ++i) {
@@ -91,21 +92,21 @@ void StatisticsBasedScorer::score(const candidates_t& candidates, const diffs_t
size_t last_nid = last_candidates[sid];
for (size_t k = 0; k < totals.size(); ++k) {
int diff = m_score_data->get(sid,nid).get(k)
- - m_score_data->get(sid,last_nid).get(k);
+ - m_score_data->get(sid,last_nid).get(k);
totals[k] += diff;
}
last_candidates[sid] = nid;
}
scores.push_back(calculateScore(totals));
}
-
+
// Regularisation. This can either be none, or the min or average as described in
// Cer, Jurafsky and Manning at WMT08.
if (m_regularization_type == NONE || m_regularization_window <= 0) {
// no regularisation
return;
}
-
+
// window size specifies the +/- in each direction
statscores_t raw_scores(scores); // copy scores
for (size_t i = 0; i < scores.size(); ++i) {
diff --git a/mert/StatisticsBasedScorer.h b/mert/StatisticsBasedScorer.h
index ca32535ad..644873b60 100644
--- a/mert/StatisticsBasedScorer.h
+++ b/mert/StatisticsBasedScorer.h
@@ -13,7 +13,7 @@
namespace MosesTuning
{
-
+
/**
* Abstract base class for Scorers that work by adding statistics across all
@@ -26,20 +26,20 @@ public:
virtual ~StatisticsBasedScorer() {}
virtual void score(const candidates_t& candidates, const diffs_t& diffs,
statscores_t& scores) const;
-
+
protected:
-
+
enum RegularisationType {
NONE,
AVERAGE,
MINIMUM
};
-
+
/**
* Calculate the actual score.
*/
virtual statscore_t calculateScore(const std::vector<int>& totals) const = 0;
-
+
// regularisation
RegularisationType m_regularization_type;
std::size_t m_regularization_window;
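Concretely, the window regularisation dampens per-candidate score jitter: each score is replaced by the minimum or average of the raw scores within m_regularization_window positions on either side (clipped at the ends of the list, as the implementation above suggests). For example, with a window of 1 and raw scores 0.2, 0.8, 0.4, AVERAGE turns the middle score into (0.2 + 0.8 + 0.4) / 3 ≈ 0.467, while MINIMUM turns it into 0.2.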
diff --git a/mert/TerScorer.cpp b/mert/TerScorer.cpp
index cc7cf1630..7c11ea66b 100644
--- a/mert/TerScorer.cpp
+++ b/mert/TerScorer.cpp
@@ -14,10 +14,10 @@ using namespace TERCpp;
namespace MosesTuning
{
-
+
TerScorer::TerScorer(const string& config)
- : StatisticsBasedScorer("TER",config), kLENGTH(2) {}
+ : StatisticsBasedScorer("TER",config), kLENGTH(2) {}
TerScorer::~TerScorer() {}
diff --git a/mert/TerScorer.h b/mert/TerScorer.h
index 0229f5e8c..5e9fed46f 100644
--- a/mert/TerScorer.h
+++ b/mert/TerScorer.h
@@ -10,7 +10,7 @@
namespace MosesTuning
{
-
+
class ScoreStats;
diff --git a/mert/Timer.cpp b/mert/Timer.cpp
index 088be93a5..47fa5c750 100644
--- a/mert/Timer.cpp
+++ b/mert/Timer.cpp
@@ -6,14 +6,17 @@
#include <sys/time.h>
#endif
-namespace {
+namespace
+{
#if !defined(_WIN32) && !defined(_WIN64)
-uint64_t GetMicroSeconds(const struct timeval& tv) {
+uint64_t GetMicroSeconds(const struct timeval& tv)
+{
return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
}
-uint64_t GetTimeOfDayMicroSeconds() {
+uint64_t GetTimeOfDayMicroSeconds()
+{
struct timeval tv;
gettimeofday(&tv, NULL);
return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
@@ -24,9 +27,10 @@ uint64_t GetTimeOfDayMicroSeconds() {
namespace MosesTuning
{
-
-void Timer::GetCPUTimeMicroSeconds(Timer::CPUTime* cpu_time) const {
+
+void Timer::GetCPUTimeMicroSeconds(Timer::CPUTime* cpu_time) const
+{
#if !defined(_WIN32) && !defined(_WIN64)
struct rusage usage;
if (getrusage(RUSAGE_SELF, &usage)) {
@@ -41,22 +45,26 @@ void Timer::GetCPUTimeMicroSeconds(Timer::CPUTime* cpu_time) const {
#endif
}
-double Timer::get_elapsed_cpu_time() const {
+double Timer::get_elapsed_cpu_time() const
+{
return static_cast<double>(get_elapsed_cpu_time_microseconds()) * 1e-6;
}
-uint64_t Timer::get_elapsed_cpu_time_microseconds() const {
+uint64_t Timer::get_elapsed_cpu_time_microseconds() const
+{
CPUTime e;
GetCPUTimeMicroSeconds(&e);
return (e.user_time - m_start_time.user_time) +
- (e.sys_time - m_start_time.sys_time);
+ (e.sys_time - m_start_time.sys_time);
}
-double Timer::get_elapsed_wall_time() const {
+double Timer::get_elapsed_wall_time() const
+{
return static_cast<double>(get_elapsed_wall_time_microseconds()) * 1e-6;
}
-uint64_t Timer::get_elapsed_wall_time_microseconds() const {
+uint64_t Timer::get_elapsed_wall_time_microseconds() const
+{
return GetTimeOfDayMicroSeconds() - m_wall;
}
@@ -92,7 +100,8 @@ void Timer::check(const char* msg)
}
}
-std::string Timer::ToString() const {
+std::string Timer::ToString() const
+{
std::string res;
const double wall = get_elapsed_wall_time();
CPUTime e;
diff --git a/mert/Timer.h b/mert/Timer.h
index bae4ab6b3..2adb86412 100644
--- a/mert/Timer.h
+++ b/mert/Timer.h
@@ -7,11 +7,11 @@
namespace MosesTuning
{
-
+
class Timer
{
- private:
+private:
// Time values are stored in microseconds.
struct CPUTime {
uint64_t user_time; // user CPU time
@@ -30,15 +30,15 @@ class Timer
Timer(const Timer&);
void operator=(const Timer&);
- public:
+public:
/**
* 'm_is_running' is initially false. A timer needs to be explicitly started
* using 'start'.
*/
Timer()
- : m_is_running(false),
- m_wall(0),
- m_start_time() {}
+ : m_is_running(false),
+ m_wall(0),
+ m_start_time() {}
~Timer() {}
@@ -61,7 +61,9 @@ class Timer
/**
*/
- bool is_running() const { return m_is_running; }
+ bool is_running() const {
+ return m_is_running;
+ }
/**
* Return the total time in seconds that the timer has been in the
@@ -97,7 +99,8 @@ class Timer
* for an ostream 'os' and a timer 't'. For example, "cout << t" will
* print out the total amount of time 't' has been "running".
*/
-inline std::ostream& operator<<(std::ostream& os, const Timer& t) {
+inline std::ostream& operator<<(std::ostream& os, const Timer& t)
+{
if (t.is_running()) {
os << t.ToString();
} else {
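Timer keeps CPU time and wall time separate: user/system CPU time comes from getrusage() and wall-clock time from gettimeofday(), both reduced to microseconds. A self-contained sketch of the POSIX calls it wraps (not the Moses code itself):

#include <stdint.h>
#include <sys/resource.h>
#include <sys/time.h>

// User CPU time of this process, in microseconds (0 on failure).
uint64_t CpuUserMicros() {
  struct rusage usage;
  if (getrusage(RUSAGE_SELF, &usage) != 0) return 0;
  return (uint64_t)usage.ru_utime.tv_sec * 1000000 + usage.ru_utime.tv_usec;
}

// Wall-clock time since the epoch, in microseconds.
uint64_t WallMicros() {
  struct timeval tv;
  gettimeofday(&tv, NULL);
  return (uint64_t)tv.tv_sec * 1000000 + tv.tv_usec;
}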
diff --git a/mert/TimerTest.cpp b/mert/TimerTest.cpp
index 3bf0e5573..d72b1c312 100644
--- a/mert/TimerTest.cpp
+++ b/mert/TimerTest.cpp
@@ -8,7 +8,8 @@
using namespace MosesTuning;
-BOOST_AUTO_TEST_CASE(timer_basic_test) {
+BOOST_AUTO_TEST_CASE(timer_basic_test)
+{
Timer timer;
const int sleep_time_microsec = 40; // ad-hoc microseconds to pass unit tests.
diff --git a/mert/Util.cpp b/mert/Util.cpp
index ac7d1803b..67448292f 100644
--- a/mert/Util.cpp
+++ b/mert/Util.cpp
@@ -11,7 +11,8 @@
using namespace std;
-namespace {
+namespace
+{
MosesTuning::Timer g_timer;
int g_verbose = 0;
@@ -56,7 +57,8 @@ size_t getNextPound(std::string &str, std::string &substr,
return pos;
}
-void split(const std::string &s, char delim, std::vector<std::string> &elems) {
+void split(const std::string &s, char delim, std::vector<std::string> &elems)
+{
std::stringstream ss(s);
std::string item;
while(std::getline(ss, item, delim)) {
@@ -65,7 +67,8 @@ void split(const std::string &s, char delim, std::vector<std::string> &elems) {
}
void Tokenize(const char *str, const char delim,
- std::vector<std::string> *res) {
+ std::vector<std::string> *res)
+{
while (1) {
const char *begin = str;
while (*str != delim && *str) str++;
diff --git a/mert/Util.h b/mert/Util.h
index e2071bf1f..5c9c635ab 100644
--- a/mert/Util.h
+++ b/mert/Util.h
@@ -40,7 +40,8 @@ int setverboselevel(int v);
const float kEPS = 0.0001f;
template <typename T>
-bool IsAlmostEqual(T expected, T actual, float round=kEPS) {
+bool IsAlmostEqual(T expected, T actual, float round=kEPS)
+{
if (std::abs(expected - actual) < round) {
return true;
} else {
@@ -86,7 +87,8 @@ inline T Scan(const std::string &input)
* Returns true iff "str" ends with "suffix".
* e.g., Given str = "abc:" and suffix = ":", this function returns true.
*/
-inline bool EndsWith(const std::string& str, const char* suffix) {
+inline bool EndsWith(const std::string& str, const char* suffix)
+{
return str.find_last_of(suffix) == str.size() - 1;
}
diff --git a/mert/UtilTest.cpp b/mert/UtilTest.cpp
index f3ca6ca80..6f86d5144 100644
--- a/mert/UtilTest.cpp
+++ b/mert/UtilTest.cpp
@@ -5,7 +5,8 @@
using namespace MosesTuning;
-BOOST_AUTO_TEST_CASE(util_get_next_pound_test) {
+BOOST_AUTO_TEST_CASE(util_get_next_pound_test)
+{
{
std::string str("9 9 7 ");
std::string substr;
@@ -38,7 +39,8 @@ BOOST_AUTO_TEST_CASE(util_get_next_pound_test) {
}
}
-BOOST_AUTO_TEST_CASE(util_tokenize_test) {
+BOOST_AUTO_TEST_CASE(util_tokenize_test)
+{
{
std::vector<std::string> res;
Tokenize("9 9 7", ' ', &res);
@@ -66,7 +68,8 @@ BOOST_AUTO_TEST_CASE(util_tokenize_test) {
}
}
-BOOST_AUTO_TEST_CASE(util_ends_with_test) {
+BOOST_AUTO_TEST_CASE(util_ends_with_test)
+{
BOOST_CHECK(EndsWith("abc:", ":"));
BOOST_CHECK(EndsWith("a b c:", ":"));
BOOST_CHECK(!EndsWith("a", ":"));
diff --git a/mert/Vocabulary.cpp b/mert/Vocabulary.cpp
index 458024ce1..5a17c2c6e 100644
--- a/mert/Vocabulary.cpp
+++ b/mert/Vocabulary.cpp
@@ -1,34 +1,39 @@
#include "Vocabulary.h"
#include "Singleton.h"
-namespace mert {
-namespace {
+namespace mert
+{
+namespace
+{
Vocabulary* g_vocab = NULL;
} // namespace
-int Vocabulary::Encode(const std::string& token) {
- iterator it = m_vocab.find(token);
- int encoded_token;
- if (it == m_vocab.end()) {
- // Add an new entry to the vocaburary.
- encoded_token = static_cast<int>(m_vocab.size());
-
- m_vocab[token] = encoded_token;
- } else {
- encoded_token = it->second;
- }
- return encoded_token;
+int Vocabulary::Encode(const std::string& token)
+{
+ iterator it = m_vocab.find(token);
+ int encoded_token;
+ if (it == m_vocab.end()) {
+    // Add a new entry to the vocabulary.
+ encoded_token = static_cast<int>(m_vocab.size());
+
+ m_vocab[token] = encoded_token;
+ } else {
+ encoded_token = it->second;
+ }
+ return encoded_token;
}
-bool Vocabulary::Lookup(const std::string&str , int* v) const {
+bool Vocabulary::Lookup(const std::string&str , int* v) const
+{
- const_iterator it = m_vocab.find(str);
- if (it == m_vocab.end()) return false;
- *v = it->second;
- return true;
+ const_iterator it = m_vocab.find(str);
+ if (it == m_vocab.end()) return false;
+ *v = it->second;
+ return true;
}
-Vocabulary* VocabularyFactory::GetVocabulary() {
+Vocabulary* VocabularyFactory::GetVocabulary()
+{
if (g_vocab == NULL) {
return MosesTuning::Singleton<Vocabulary>::GetInstance();
} else {
@@ -36,7 +41,8 @@ Vocabulary* VocabularyFactory::GetVocabulary() {
}
}
-void VocabularyFactory::SetVocabulary(Vocabulary* vocab) {
+void VocabularyFactory::SetVocabulary(Vocabulary* vocab)
+{
g_vocab = vocab;
}
diff --git a/mert/Vocabulary.h b/mert/Vocabulary.h
index 3ad42f46f..16c8698c6 100644
--- a/mert/Vocabulary.h
+++ b/mert/Vocabulary.h
@@ -4,7 +4,8 @@
#include <boost/unordered_map.hpp>
#include <string>
-namespace mert {
+namespace mert
+{
/**
* A map to handle vocabularies to calculate
@@ -12,8 +13,9 @@ namespace mert {
*
* TODO: replace this with more efficient data structure.
*/
-class Vocabulary {
- public:
+class Vocabulary
+{
+public:
typedef boost::unordered_map<std::string, int>::iterator iterator;
typedef boost::unordered_map<std::string, int>::const_iterator const_iterator;
@@ -28,32 +30,53 @@ class Vocabulary {
*/
bool Lookup(const std::string&str , int* v) const;
- void clear() { m_vocab.clear(); }
-
- bool empty() const { return m_vocab.empty(); }
-
- std::size_t size() const { return m_vocab.size(); }
-
- iterator find(const std::string& str) { return m_vocab.find(str); }
- const_iterator find(const std::string& str) const { return m_vocab.find(str); }
-
- int& operator[](const std::string& str) { return m_vocab[str]; }
-
- iterator begin() { return m_vocab.begin(); }
- const_iterator begin() const { return m_vocab.begin(); }
- iterator end() { return m_vocab.end(); }
- const_iterator end() const { return m_vocab.end(); }
-
- private:
+ void clear() {
+ m_vocab.clear();
+ }
+
+ bool empty() const {
+ return m_vocab.empty();
+ }
+
+ std::size_t size() const {
+ return m_vocab.size();
+ }
+
+ iterator find(const std::string& str) {
+ return m_vocab.find(str);
+ }
+ const_iterator find(const std::string& str) const {
+ return m_vocab.find(str);
+ }
+
+ int& operator[](const std::string& str) {
+ return m_vocab[str];
+ }
+
+ iterator begin() {
+ return m_vocab.begin();
+ }
+ const_iterator begin() const {
+ return m_vocab.begin();
+ }
+ iterator end() {
+ return m_vocab.end();
+ }
+ const_iterator end() const {
+ return m_vocab.end();
+ }
+
+private:
boost::unordered_map<std::string, int> m_vocab;
};
-class VocabularyFactory {
- public:
+class VocabularyFactory
+{
+public:
static Vocabulary* GetVocabulary();
static void SetVocabulary(Vocabulary* vocab);
- private:
+private:
VocabularyFactory() {}
virtual ~VocabularyFactory() {}
};
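
A quick usage sketch makes the Encode/Lookup contract above concrete. This is illustrative only and assumes nothing beyond the members declared in this header:

  #include <iostream>
  #include "Vocabulary.h"

  int main() {
    mert::Vocabulary vocab;

    // Encode inserts unseen tokens and hands out ids in insertion
    // order, starting at 0; repeated tokens return the existing id.
    int a = vocab.Encode("hello");   // 0
    int b = vocab.Encode("world");   // 1
    int c = vocab.Encode("hello");   // 0 again

    // Lookup reports presence without inserting.
    int v = -1;
    bool hit  = vocab.Lookup("world", &v);   // true, v == 1
    bool miss = vocab.Lookup("moses", &v);   // false, v unchanged

    std::cout << a << " " << b << " " << c << " " << hit << " " << v
              << " " << miss << " size=" << vocab.size() << std::endl;
    return 0;
  }
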
diff --git a/mert/VocabularyTest.cpp b/mert/VocabularyTest.cpp
index 5b453fcda..002b6a64f 100644
--- a/mert/VocabularyTest.cpp
+++ b/mert/VocabularyTest.cpp
@@ -6,16 +6,20 @@
using namespace MosesTuning;
-namespace mert {
-namespace {
+namespace mert
+{
+namespace
+{
-void TearDown() {
+void TearDown()
+{
Singleton<Vocabulary>::Delete();
}
} // namespace
-BOOST_AUTO_TEST_CASE(vocab_basic) {
+BOOST_AUTO_TEST_CASE(vocab_basic)
+{
Vocabulary vocab;
BOOST_REQUIRE(vocab.empty());
vocab.clear();
@@ -39,7 +43,8 @@ BOOST_AUTO_TEST_CASE(vocab_basic) {
BOOST_CHECK(!vocab.Lookup("world", &v));
}
-BOOST_AUTO_TEST_CASE(vocab_factory_test) {
+BOOST_AUTO_TEST_CASE(vocab_factory_test)
+{
Vocabulary* vocab1 = VocabularyFactory::GetVocabulary();
Vocabulary* vocab2 = VocabularyFactory::GetVocabulary();
Vocabulary* vocab3 = VocabularyFactory::GetVocabulary();
diff --git a/mert/evaluator.cpp b/mert/evaluator.cpp
index 58a66b12d..f976f39b3 100644
--- a/mert/evaluator.cpp
+++ b/mert/evaluator.cpp
@@ -14,7 +14,8 @@
using namespace std;
using namespace MosesTuning;
-namespace {
+namespace
+{
Scorer* g_scorer = NULL;
bool g_has_more_files = false;
@@ -22,13 +23,14 @@ bool g_has_more_scorers = false;
const float g_alpha = 0.05;
-class EvaluatorUtil {
- public:
+class EvaluatorUtil
+{
+public:
static void evaluate(const string& candFile, int bootstrap);
static float average(const vector<float>& list);
static string int2string(int n);
- private:
+private:
EvaluatorUtil() {}
~EvaluatorUtil() {}
};
@@ -43,22 +45,18 @@ void EvaluatorUtil::evaluate(const string& candFile, int bootstrap)
// Loading sentences and preparing statistics
ScoreStats scoreentry;
string line;
- while (getline(cand, line))
- {
+ while (getline(cand, line)) {
g_scorer->prepareStats(entries.size(), line, scoreentry);
entries.push_back(scoreentry);
}
int n = entries.size();
- if (bootstrap)
- {
+ if (bootstrap) {
vector<float> scores;
- for (int i = 0; i < bootstrap; ++i)
- {
+ for (int i = 0; i < bootstrap; ++i) {
      // TODO: Use smart pointer for exception safety.
ScoreData* scoredata = new ScoreData(g_scorer);
- for (int j = 0; j < n; ++j)
- {
+ for (int j = 0; j < n; ++j) {
int randomIndex = random() % n;
scoredata->add(entries[randomIndex], j);
}
@@ -85,13 +83,10 @@ void EvaluatorUtil::evaluate(const string& candFile, int bootstrap)
cout.setf(ios::fixed, ios::floatfield);
cout.precision(4);
cout << avg << "\t[" << lb << "," << rb << "]" << endl;
- }
- else
- {
+ } else {
    // TODO: Use smart pointer for exception safety.
ScoreData* scoredata = new ScoreData(g_scorer);
- for (int sid = 0; sid < n; ++sid)
- {
+ for (int sid = 0; sid < n; ++sid) {
scoredata->add(entries[sid], sid);
}
g_scorer->setScoreData(scoredata);
@@ -184,56 +179,56 @@ struct ProgramOption {
bool has_seed;
ProgramOption()
- : reference(""),
- candidate(""),
- bootstrap(0),
- seed(0),
- has_seed(false) { }
+ : reference(""),
+ candidate(""),
+ bootstrap(0),
+ seed(0),
+ has_seed(false) { }
};
-void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
+void ParseCommandOptions(int argc, char** argv, ProgramOption* opt)
+{
int c;
int option_index;
int last_scorer_index = -1;
while ((c = getopt_long(argc, argv, "s:c:R:C:b:r:f:l:h", long_options, &option_index)) != -1) {
switch(c) {
- case 's':
- opt->scorer_types.push_back(string(optarg));
- opt->scorer_configs.push_back(string(""));
- opt->scorer_factors.push_back(string(""));
- opt->scorer_filter.push_back(string(""));
- last_scorer_index++;
- break;
- case 'c':
- opt->scorer_configs[last_scorer_index] = string(optarg);
- break;
- case 'R':
- opt->reference = string(optarg);
- break;
- case 'C':
- opt->candidate = string(optarg);
- break;
- case 'b':
- opt->bootstrap = atoi(optarg);
- break;
- case 'r':
- opt->seed = strtol(optarg, NULL, 10);
- opt->has_seed = true;
- break;
- case 'f':
- opt->scorer_factors[last_scorer_index] = string(optarg);
- break;
- case 'l':
- opt->scorer_filter[last_scorer_index] = string(optarg);
- break;
- default:
- usage();
+ case 's':
+ opt->scorer_types.push_back(string(optarg));
+ opt->scorer_configs.push_back(string(""));
+ opt->scorer_factors.push_back(string(""));
+ opt->scorer_filter.push_back(string(""));
+ last_scorer_index++;
+ break;
+ case 'c':
+ opt->scorer_configs[last_scorer_index] = string(optarg);
+ break;
+ case 'R':
+ opt->reference = string(optarg);
+ break;
+ case 'C':
+ opt->candidate = string(optarg);
+ break;
+ case 'b':
+ opt->bootstrap = atoi(optarg);
+ break;
+ case 'r':
+ opt->seed = strtol(optarg, NULL, 10);
+ opt->has_seed = true;
+ break;
+ case 'f':
+ opt->scorer_factors[last_scorer_index] = string(optarg);
+ break;
+ case 'l':
+ opt->scorer_filter[last_scorer_index] = string(optarg);
+ break;
+ default:
+ usage();
}
}
// Add default scorer if no scorer provided
- if (opt->scorer_types.size() == 0)
- {
+ if (opt->scorer_types.size() == 0) {
opt->scorer_types.push_back(string("BLEU"));
opt->scorer_configs.push_back(string(""));
opt->scorer_factors.push_back(string(""));
@@ -241,7 +236,8 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
}
}
-void InitSeed(const ProgramOption *opt) {
+void InitSeed(const ProgramOption *opt)
+{
if (opt->has_seed) {
cerr << "Seeding random numbers with " << opt->seed << endl;
srandom(opt->seed);
@@ -260,8 +256,7 @@ int main(int argc, char** argv)
ProgramOption option;
ParseCommandOptions(argc, argv, &option);
- if (option.bootstrap)
- {
+ if (option.bootstrap) {
InitSeed(&option);
}
@@ -278,17 +273,15 @@ int main(int argc, char** argv)
if (candFiles.size() > 1) g_has_more_files = true;
if (option.scorer_types.size() > 1) g_has_more_scorers = true;
- for (vector<string>::const_iterator fileIt = candFiles.begin(); fileIt != candFiles.end(); ++fileIt)
- {
- for (size_t i = 0; i < option.scorer_types.size(); i++)
- {
- g_scorer = ScorerFactory::getScorer(option.scorer_types[i], option.scorer_configs[i]);
- g_scorer->setFactors(option.scorer_factors[i]);
- g_scorer->setFilter(option.scorer_filter[i]);
- g_scorer->setReferenceFiles(refFiles);
- EvaluatorUtil::evaluate(*fileIt, option.bootstrap);
- delete g_scorer;
- }
+ for (vector<string>::const_iterator fileIt = candFiles.begin(); fileIt != candFiles.end(); ++fileIt) {
+ for (size_t i = 0; i < option.scorer_types.size(); i++) {
+ g_scorer = ScorerFactory::getScorer(option.scorer_types[i], option.scorer_configs[i]);
+ g_scorer->setFactors(option.scorer_factors[i]);
+ g_scorer->setFilter(option.scorer_filter[i]);
+ g_scorer->setReferenceFiles(refFiles);
+ EvaluatorUtil::evaluate(*fileIt, option.bootstrap);
+ delete g_scorer;
+ }
}
return EXIT_SUCCESS;
} catch (const exception& e) {
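
The bootstrap branch of EvaluatorUtil::evaluate above resamples per-sentence statistics and reads a (1 - g_alpha) confidence interval off the sorted resampled scores. A self-contained sketch of the same percentile idea, resampling scalar scores rather than ScoreStats for brevity (values are made up):

  #include <algorithm>
  #include <cstdlib>
  #include <iostream>
  #include <vector>

  // Percentile bootstrap: resample with replacement, sort the resampled
  // statistics, and read the bounds off the alpha/2 and 1-alpha/2 quantiles.
  int main() {
    std::vector<float> scores = {0.271f, 0.265f, 0.280f, 0.269f, 0.275f};
    const int bootstrap = 1000;
    const float alpha = 0.05f;   // mirrors g_alpha above

    std::vector<float> means;
    for (int i = 0; i < bootstrap; ++i) {
      float sum = 0.0f;
      for (std::size_t j = 0; j < scores.size(); ++j)
        sum += scores[random() % scores.size()];   // random(), as in evaluator.cpp
      means.push_back(sum / scores.size());
    }
    std::sort(means.begin(), means.end());

    const std::size_t lb = static_cast<std::size_t>(bootstrap * (alpha / 2));
    const std::size_t rb = static_cast<std::size_t>(bootstrap * (1 - alpha / 2)) - 1;
    std::cout << "[" << means[lb] << "," << means[rb] << "]" << std::endl;
    return 0;
  }
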
diff --git a/mert/extractor.cpp b/mert/extractor.cpp
index 077d9b94c..38652296e 100644
--- a/mert/extractor.cpp
+++ b/mert/extractor.cpp
@@ -20,7 +20,8 @@
using namespace std;
using namespace MosesTuning;
-namespace {
+namespace
+{
void usage()
{
@@ -78,68 +79,69 @@ struct ProgramOption {
int verbosity;
ProgramOption()
- : scorerType("BLEU"),
- scorerConfig(""),
- scorerFactors(""),
- scorerFilter(""),
- referenceFile(""),
- nbestFile(""),
- scoreDataFile("statscore.data"),
- featureDataFile("features.data"),
- prevScoreDataFile(""),
- prevFeatureDataFile(""),
- binmode(false),
- allowDuplicates(false),
- verbosity(0) { }
+ : scorerType("BLEU"),
+ scorerConfig(""),
+ scorerFactors(""),
+ scorerFilter(""),
+ referenceFile(""),
+ nbestFile(""),
+ scoreDataFile("statscore.data"),
+ featureDataFile("features.data"),
+ prevScoreDataFile(""),
+ prevFeatureDataFile(""),
+ binmode(false),
+ allowDuplicates(false),
+ verbosity(0) { }
};
-void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
+void ParseCommandOptions(int argc, char** argv, ProgramOption* opt)
+{
int c;
int option_index;
while ((c = getopt_long(argc, argv, "s:r:f:l:n:S:F:R:E:v:hbd", long_options, &option_index)) != -1) {
switch (c) {
- case 's':
- opt->scorerType = string(optarg);
- break;
- case 'c':
- opt->scorerConfig = string(optarg);
- break;
- case 'f':
- opt->scorerFactors = string(optarg);
- break;
- case 'l':
- opt->scorerFilter = string(optarg);
- break;
- case 'r':
- opt->referenceFile = string(optarg);
- break;
- case 'b':
- opt->binmode = true;
- break;
- case 'n':
- opt->nbestFile = string(optarg);
- break;
- case 'S':
- opt->scoreDataFile = string(optarg);
- break;
- case 'F':
- opt->featureDataFile = string(optarg);
- break;
- case 'E':
- opt->prevFeatureDataFile = string(optarg);
- break;
- case 'R':
- opt->prevScoreDataFile = string(optarg);
- break;
- case 'v':
- opt->verbosity = atoi(optarg);
- break;
- case 'd':
- opt->allowDuplicates = true;
- break;
- default:
- usage();
+ case 's':
+ opt->scorerType = string(optarg);
+ break;
+ case 'c':
+ opt->scorerConfig = string(optarg);
+ break;
+ case 'f':
+ opt->scorerFactors = string(optarg);
+ break;
+ case 'l':
+ opt->scorerFilter = string(optarg);
+ break;
+ case 'r':
+ opt->referenceFile = string(optarg);
+ break;
+ case 'b':
+ opt->binmode = true;
+ break;
+ case 'n':
+ opt->nbestFile = string(optarg);
+ break;
+ case 'S':
+ opt->scoreDataFile = string(optarg);
+ break;
+ case 'F':
+ opt->featureDataFile = string(optarg);
+ break;
+ case 'E':
+ opt->prevFeatureDataFile = string(optarg);
+ break;
+ case 'R':
+ opt->prevScoreDataFile = string(optarg);
+ break;
+ case 'v':
+ opt->verbosity = atoi(optarg);
+ break;
+ case 'd':
+ opt->allowDuplicates = true;
+ break;
+ default:
+ usage();
}
}
}
@@ -202,7 +204,7 @@ int main(int argc, char** argv)
TRACE_ERR("Scorer type: " << option.scorerType << endl);
boost::scoped_ptr<Scorer> scorer(
- ScorerFactory::getScorer(option.scorerType, option.scorerConfig));
+ ScorerFactory::getScorer(option.scorerType, option.scorerConfig));
// set Factors and Filter used to preprocess the sentences
scorer->setFactors(option.scorerFactors);
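
evaluator.cpp, extractor.cpp and mert.cpp above all share the same getopt_long switch idiom that this re-indentation targets. Stripped to two options it looks like the sketch below; the option names are illustrative, not the real tools' full sets:

  #include <cstdio>
  #include <cstdlib>
  #include <getopt.h>
  #include <string>

  int main(int argc, char** argv) {
    static struct option long_options[] = {
      {"sctype", required_argument, 0, 's'},
      {"nbest",  required_argument, 0, 'n'},
      {"help",   no_argument,       0, 'h'},
      {0, 0, 0, 0}
    };

    std::string scorerType("BLEU"), nbestFile;
    int c;
    int option_index;
    // getopt_long walks argv, returning the short-option code for each
    // recognised flag and exposing its argument through optarg.
    while ((c = getopt_long(argc, argv, "s:n:h", long_options, &option_index)) != -1) {
      switch (c) {
      case 's':
        scorerType = std::string(optarg);
        break;
      case 'n':
        nbestFile = std::string(optarg);
        break;
      case 'h':
      default:
        std::fprintf(stderr, "usage: %s [--sctype TYPE] [--nbest FILE]\n", argv[0]);
        std::exit(c == 'h' ? 0 : 1);
      }
    }
    std::printf("scorer=%s nbest=%s\n", scorerType.c_str(), nbestFile.c_str());
    return 0;
  }
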
diff --git a/mert/kbmira.cpp b/mert/kbmira.cpp
index f0d1624e6..a2665ac13 100644
--- a/mert/kbmira.cpp
+++ b/mert/kbmira.cpp
@@ -2,7 +2,7 @@
// vim:tabstop=2
/***********************************************************************
K-best Batch MIRA for Moses
-Copyright (C) 2012, National Research Council Canada / Conseil national
+Copyright (C) 2012, National Research Council Canada / Conseil national
de recherches du Canada
***********************************************************************/
@@ -49,13 +49,14 @@ using namespace MosesTuning;
namespace po = boost::program_options;
-ValType evaluate(HypPackEnumerator* train, const AvgWeightVector& wv) {
+ValType evaluate(HypPackEnumerator* train, const AvgWeightVector& wv)
+{
vector<ValType> stats(kBleuNgramOrder*2+1,0);
for(train->reset(); !train->finished(); train->next()) {
// Find max model
size_t max_index=0;
ValType max_score=0;
- for(size_t i=0;i<train->cur_size();i++) {
+ for(size_t i=0; i<train->cur_size(); i++) {
MiraFeatureVector vec(train->featuresAt(i));
ValType score = wv.score(vec);
if(i==0 || score > max_score) {
@@ -64,8 +65,8 @@ ValType evaluate(HypPackEnumerator* train, const AvgWeightVector& wv) {
}
}
// Update stats
- const vector<float>& sent = train->scoresAt(max_index);
- for(size_t i=0;i<sent.size();i++) {
+ const vector<float>& sent = train->scoresAt(max_index);
+ for(size_t i=0; i<sent.size(); i++) {
stats[i]+=sent[i];
}
}
@@ -90,26 +91,26 @@ int main(int argc, char** argv)
bool model_bg = false; // Use model for background corpus
bool verbose = false; // Verbose updates
bool safe_hope = false; // Model score cannot have more than BLEU_RATIO times more influence than BLEU
-
+
// Command-line processing follows pro.cpp
po::options_description desc("Allowed options");
desc.add_options()
- ("help,h", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
- ("scfile,S", po::value<vector<string> >(&scoreFiles), "Scorer data files")
- ("ffile,F", po::value<vector<string> > (&featureFiles), "Feature data files")
- ("random-seed,r", po::value<int>(&seed), "Seed for random number generation")
- ("output-file,o", po::value<string>(&outputFile), "Output file")
- ("cparam,C", po::value<float>(&c), "MIRA C-parameter, lower for more regularization (default 0.01)")
- ("decay,D", po::value<float>(&decay), "BLEU background corpus decay rate (default 0.999)")
- ("iters,J", po::value<int>(&n_iters), "Number of MIRA iterations to run (default 60)")
- ("dense-init,d", po::value<string>(&denseInitFile), "Weight file for dense features")
- ("sparse-init,s", po::value<string>(&sparseInitFile), "Weight file for sparse features")
- ("streaming", po::value(&streaming)->zero_tokens()->default_value(false), "Stream n-best lists to save memory, implies --no-shuffle")
- ("no-shuffle", po::value(&no_shuffle)->zero_tokens()->default_value(false), "Don't shuffle hypotheses before each epoch")
- ("model-bg", po::value(&model_bg)->zero_tokens()->default_value(false), "Use model instead of hope for BLEU background")
- ("verbose", po::value(&verbose)->zero_tokens()->default_value(false), "Verbose updates")
- ("safe-hope", po::value(&safe_hope)->zero_tokens()->default_value(false), "Mode score's influence on hope decoding is limited")
- ;
+ ("help,h", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
+ ("scfile,S", po::value<vector<string> >(&scoreFiles), "Scorer data files")
+ ("ffile,F", po::value<vector<string> > (&featureFiles), "Feature data files")
+ ("random-seed,r", po::value<int>(&seed), "Seed for random number generation")
+ ("output-file,o", po::value<string>(&outputFile), "Output file")
+ ("cparam,C", po::value<float>(&c), "MIRA C-parameter, lower for more regularization (default 0.01)")
+ ("decay,D", po::value<float>(&decay), "BLEU background corpus decay rate (default 0.999)")
+ ("iters,J", po::value<int>(&n_iters), "Number of MIRA iterations to run (default 60)")
+ ("dense-init,d", po::value<string>(&denseInitFile), "Weight file for dense features")
+ ("sparse-init,s", po::value<string>(&sparseInitFile), "Weight file for sparse features")
+ ("streaming", po::value(&streaming)->zero_tokens()->default_value(false), "Stream n-best lists to save memory, implies --no-shuffle")
+ ("no-shuffle", po::value(&no_shuffle)->zero_tokens()->default_value(false), "Don't shuffle hypotheses before each epoch")
+ ("model-bg", po::value(&model_bg)->zero_tokens()->default_value(false), "Use model instead of hope for BLEU background")
+ ("verbose", po::value(&verbose)->zero_tokens()->default_value(false), "Verbose updates")
+  ("safe-hope", po::value(&safe_hope)->zero_tokens()->default_value(false), "Model score's influence on hope decoding is limited")
+ ;
po::options_description cmdline_options;
cmdline_options.add(desc);
@@ -118,9 +119,9 @@ int main(int argc, char** argv)
options(cmdline_options).run(), vm);
po::notify(vm);
if (help) {
- cout << "Usage: " + string(argv[0]) + " [options]" << endl;
- cout << desc << endl;
- exit(0);
+ cout << "Usage: " + string(argv[0]) + " [options]" << endl;
+ cout << desc << endl;
+ exit(0);
}
cerr << "kbmira with c=" << c << " decay=" << decay << " no_shuffle=" << no_shuffle << endl;
@@ -165,7 +166,8 @@ int main(int argc, char** argv)
exit(3);
}
int sparseCount=0;
- parameter_t val; std::string name;
+ parameter_t val;
+ std::string name;
while(opt >> name >> val) {
size_t id = SparseVector::encode(name) + initDenseSize;
while(initParams.size()<=id) initParams.push_back(0.0);
@@ -175,17 +177,17 @@ int main(int argc, char** argv)
cerr << "Found " << sparseCount << " initial sparse features" << endl;
opt.close();
}
-
+
MiraWeightVector wv(initParams);
// Initialize background corpus
vector<ValType> bg;
- for(int j=0;j<kBleuNgramOrder;j++){
+ for(int j=0; j<kBleuNgramOrder; j++) {
bg.push_back(kBleuNgramOrder-j);
bg.push_back(kBleuNgramOrder-j);
}
bg.push_back(kBleuNgramOrder);
-
+
// Training loop
boost::scoped_ptr<HypPackEnumerator> train;
if(streaming)
@@ -194,8 +196,7 @@ int main(int argc, char** argv)
train.reset(new RandomAccessHypPackEnumerator(featureFiles, scoreFiles, no_shuffle));
cerr << "Initial BLEU = " << evaluate(train.get(), wv.avg()) << endl;
ValType bestBleu = 0;
- for(int j=0;j<n_iters;j++)
- {
+ for(int j=0; j<n_iters; j++) {
// MIRA train for one epoch
int iNumHyps = 0;
int iNumExamples = 0;
@@ -273,7 +274,7 @@ int main(int argc, char** argv)
}
// Update BLEU statistics
const vector<float>& model_stats = train->scoresAt(model_index);
- for(size_t k=0;k<bg.size();k++) {
+ for(size_t k=0; k<bg.size(); k++) {
bg[k]*=decay;
if(model_bg)
bg[k]+=model_stats[k];
@@ -286,7 +287,7 @@ int main(int argc, char** argv)
// Training Epoch summary
cerr << iNumUpdates << "/" << iNumExamples << " updates"
<< ", avg loss = " << (totalLoss / iNumExamples);
-
+
// Evaluate current average weights
AvgWeightVector avg = wv.avg();
@@ -312,11 +313,11 @@ int main(int argc, char** argv)
} else {
out = &cout;
}
- for(size_t i=0;i<avg.size();i++) {
+ for(size_t i=0; i<avg.size(); i++) {
if(i<num_dense)
*out << "F" << i << " " << avg.weight(i) << endl;
else {
- if(abs(avg.weight(i))>1e-8)
+ if(abs(avg.weight(i))>1e-8)
*out << SparseVector::decode(i-num_dense) << " " << avg.weight(i) << endl;
}
}
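
The background-corpus update reformatted above (bg[k] *= decay; bg[k] += stats[k]) maintains an exponentially decayed pseudo-corpus for BLEU, so recent hypotheses dominate. A toy sketch, assuming kBleuNgramOrder == 4 so the initialisation matches main()'s pattern; the sentence statistics are made up:

  #include <iostream>
  #include <vector>

  int main() {
    const float decay = 0.999f;                        // kbmira's default
    std::vector<float> bg = {4,4,3,3,2,2,1,1,4};       // init as in main() above
    std::vector<float> sentStats = {9,10,7,9,5,8,3,7,10};

    for (std::size_t k = 0; k < bg.size(); ++k) {
      bg[k] *= decay;          // forget a little of the history
      bg[k] += sentStats[k];   // fold in the newest hypothesis stats
    }
    std::cout << "bg[0] = " << bg[0] << std::endl;
    return 0;
  }
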
diff --git a/mert/mert.cpp b/mert/mert.cpp
index e53c86be2..b73c536d1 100644
--- a/mert/mert.cpp
+++ b/mert/mert.cpp
@@ -30,7 +30,8 @@
using namespace std;
using namespace MosesTuning;
-namespace {
+namespace
+{
const char kDefaultOptimizer[] = "powell";
const char kDefaultScorer[] = "BLEU";
@@ -46,10 +47,11 @@ const char kOutputFile[] = "weights.txt";
/**
* Runs an optimisation, or a random restart.
*/
-class OptimizationTask : public Moses::Task {
- public:
+class OptimizationTask : public Moses::Task
+{
+public:
OptimizationTask(Optimizer* optimizer, const Point& point)
- : m_optimizer(optimizer), m_point(point) {}
+ : m_optimizer(optimizer), m_point(point) {}
~OptimizationTask() {}
@@ -76,7 +78,7 @@ class OptimizationTask : public Moses::Task {
return m_point;
}
- private:
+private:
  // Do not allow the user to instantiate without arguments.
OptimizationTask() {}
@@ -85,7 +87,8 @@ class OptimizationTask : public Moses::Task {
statscore_t m_score;
};
-bool WriteFinalWeights(const char* filename, const Point& point) {
+bool WriteFinalWeights(const char* filename, const Point& point)
+{
ofstream ofs(filename);
if (!ofs) {
cerr << "Cannot open " << filename << endl;
@@ -165,91 +168,92 @@ struct ProgramOption {
size_t shard_count;
ProgramOption()
- : to_optimize_str(""),
- pdim(-1),
- ntry(1),
- nrandom(0),
- seed(0),
- has_seed(false),
- optimize_type(kDefaultOptimizer),
- scorer_type(kDefaultScorer),
- scorer_config(""),
- scorer_file(kDefaultScorerFile),
- feature_file(kDefaultFeatureFile),
- init_file(kDefaultInitFile),
- positive_string(kDefaultPositiveString),
- sparse_weights_file(kDefaultSparseWeightsFile),
- num_threads(1),
- shard_size(0),
- shard_count(0) { }
+ : to_optimize_str(""),
+ pdim(-1),
+ ntry(1),
+ nrandom(0),
+ seed(0),
+ has_seed(false),
+ optimize_type(kDefaultOptimizer),
+ scorer_type(kDefaultScorer),
+ scorer_config(""),
+ scorer_file(kDefaultScorerFile),
+ feature_file(kDefaultFeatureFile),
+ init_file(kDefaultInitFile),
+ positive_string(kDefaultPositiveString),
+ sparse_weights_file(kDefaultSparseWeightsFile),
+ num_threads(1),
+ shard_size(0),
+ shard_count(0) { }
};
-void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
+void ParseCommandOptions(int argc, char** argv, ProgramOption* opt)
+{
int c;
int option_index;
while ((c = getopt_long(argc, argv, "o:r:d:n:m:t:s:S:F:v:p:P:", long_options, &option_index)) != -1) {
switch (c) {
- case 'o':
- opt->to_optimize_str = string(optarg);
- break;
- case 'd':
- opt->pdim = strtol(optarg, NULL, 10);
- break;
- case 'n':
- opt->ntry = strtol(optarg, NULL, 10);
- break;
- case 'm':
- opt->nrandom = strtol(optarg, NULL, 10);
- break;
- case 'r':
- opt->seed = strtol(optarg, NULL, 10);
- opt->has_seed = true;
- break;
- case 't':
- opt->optimize_type = string(optarg);
- break;
- case's':
- opt->scorer_type = string(optarg);
- break;
- case 'c':
- opt->scorer_config = string(optarg);
- break;
- case 'S':
- opt->scorer_file = string(optarg);
- break;
- case 'F':
- opt->feature_file = string(optarg);
- break;
- case 'i':
- opt->init_file = string(optarg);
- break;
- case 'p':
- opt->sparse_weights_file=string(optarg);
- break;
- case 'v':
- setverboselevel(strtol(optarg, NULL, 10));
- break;
+ case 'o':
+ opt->to_optimize_str = string(optarg);
+ break;
+ case 'd':
+ opt->pdim = strtol(optarg, NULL, 10);
+ break;
+ case 'n':
+ opt->ntry = strtol(optarg, NULL, 10);
+ break;
+ case 'm':
+ opt->nrandom = strtol(optarg, NULL, 10);
+ break;
+ case 'r':
+ opt->seed = strtol(optarg, NULL, 10);
+ opt->has_seed = true;
+ break;
+ case 't':
+ opt->optimize_type = string(optarg);
+ break;
+    case 's':
+ opt->scorer_type = string(optarg);
+ break;
+ case 'c':
+ opt->scorer_config = string(optarg);
+ break;
+ case 'S':
+ opt->scorer_file = string(optarg);
+ break;
+ case 'F':
+ opt->feature_file = string(optarg);
+ break;
+ case 'i':
+ opt->init_file = string(optarg);
+ break;
+ case 'p':
+ opt->sparse_weights_file=string(optarg);
+ break;
+ case 'v':
+ setverboselevel(strtol(optarg, NULL, 10));
+ break;
#ifdef WITH_THREADS
- case 'T':
- opt->num_threads = strtol(optarg, NULL, 10);
- if (opt->num_threads < 1) opt->num_threads = 1;
- break;
+ case 'T':
+ opt->num_threads = strtol(optarg, NULL, 10);
+ if (opt->num_threads < 1) opt->num_threads = 1;
+ break;
#endif
- case 'a':
- opt->shard_count = strtof(optarg, NULL);
- break;
- case 'b':
- opt->shard_size = strtof(optarg, NULL);
- break;
- case 'h':
- usage(0);
- break;
- case 'P':
- opt->positive_string = string(optarg);
- break;
- default:
- usage(1);
+ case 'a':
+ opt->shard_count = strtof(optarg, NULL);
+ break;
+ case 'b':
+ opt->shard_size = strtof(optarg, NULL);
+ break;
+ case 'h':
+ usage(0);
+ break;
+ case 'P':
+ opt->positive_string = string(optarg);
+ break;
+ default:
+ usage(1);
}
}
}
@@ -353,7 +357,7 @@ int main(int argc, char **argv)
// it make sense to know what parameter set were used to generate the nbest
boost::scoped_ptr<Scorer> scorer(
- ScorerFactory::getScorer(option.scorer_type, option.scorer_config));
+ ScorerFactory::getScorer(option.scorer_type, option.scorer_config));
//load data
Data data(scorer.get(), option.sparse_weights_file);
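
mert.cpp now holds its Scorer in a boost::scoped_ptr, which is also the idiom the "use smart pointer for exception safety" TODOs in evaluator.cpp ask for. A minimal sketch, with an illustrative Widget standing in for Scorer:

  #include <boost/scoped_ptr.hpp>
  #include <iostream>

  struct Widget {
    void run() { std::cout << "running" << std::endl; }
    ~Widget() { std::cout << "destroyed" << std::endl; }
  };

  int main() {
    // The owned object is deleted on every exit path, exceptions included.
    boost::scoped_ptr<Widget> w(new Widget());
    w->run();     // used like a raw pointer
    // w.get() hands a raw pointer to APIs that need one without
    // transferring ownership -- the way Data(scorer.get(), ...) is called.
    return 0;     // Widget destroyed here automatically
  }
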
diff --git a/mert/pro.cpp b/mert/pro.cpp
index 3777d0470..b8cf81ca3 100644
--- a/mert/pro.cpp
+++ b/mert/pro.cpp
@@ -51,7 +51,8 @@ namespace po = boost::program_options;
namespace MosesTuning
{
-class SampledPair {
+class SampledPair
+{
private:
pair<size_t,size_t> m_translation1;
pair<size_t,size_t> m_translation2;
@@ -70,12 +71,19 @@ public:
}
}
- float getDiff() const { return m_score_diff; }
- const pair<size_t,size_t>& getTranslation1() const { return m_translation1; }
- const pair<size_t,size_t>& getTranslation2() const { return m_translation2; }
+ float getDiff() const {
+ return m_score_diff;
+ }
+ const pair<size_t,size_t>& getTranslation1() const {
+ return m_translation1;
+ }
+ const pair<size_t,size_t>& getTranslation2() const {
+ return m_translation2;
+ }
};
-static void outputSample(ostream& out, const FeatureDataItem& f1, const FeatureDataItem& f2) {
+static void outputSample(ostream& out, const FeatureDataItem& f1, const FeatureDataItem& f2)
+{
// difference in score in regular features
for(unsigned int j=0; j<f1.dense.size(); j++)
if (abs(f1.dense[j]-f2.dense[j]) > 0.00001)
@@ -110,13 +118,13 @@ int main(int argc, char** argv)
po::options_description desc("Allowed options");
desc.add_options()
- ("help,h", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
- ("scfile,S", po::value<vector<string> >(&scoreFiles), "Scorer data files")
- ("ffile,F", po::value<vector<string> > (&featureFiles), "Feature data files")
- ("random-seed,r", po::value<int>(&seed), "Seed for random number generation")
- ("output-file,o", po::value<string>(&outputFile), "Output file")
- ("smooth-brevity-penalty,b", po::value(&smoothBP)->zero_tokens()->default_value(false), "Smooth the brevity penalty, as in Nakov et al. (Coling 2012)")
- ;
+ ("help,h", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
+ ("scfile,S", po::value<vector<string> >(&scoreFiles), "Scorer data files")
+ ("ffile,F", po::value<vector<string> > (&featureFiles), "Feature data files")
+ ("random-seed,r", po::value<int>(&seed), "Seed for random number generation")
+ ("output-file,o", po::value<string>(&outputFile), "Output file")
+ ("smooth-brevity-penalty,b", po::value(&smoothBP)->zero_tokens()->default_value(false), "Smooth the brevity penalty, as in Nakov et al. (Coling 2012)")
+ ;
po::options_description cmdline_options;
cmdline_options.add(desc);
@@ -125,9 +133,9 @@ int main(int argc, char** argv)
options(cmdline_options).run(), vm);
po::notify(vm);
if (help) {
- cout << "Usage: " + string(argv[0]) + " [options]" << endl;
- cout << desc << endl;
- exit(0);
+ cout << "Usage: " + string(argv[0]) + " [options]" << endl;
+ cout << desc << endl;
+ exit(0);
}
if (vm.count("random-seed")) {
@@ -145,7 +153,7 @@ int main(int argc, char** argv)
if (featureFiles.size() != scoreFiles.size()) {
cerr << "Error: Number of feature files (" << featureFiles.size() <<
- ") does not match number of score files (" << scoreFiles.size() << ")" << endl;
+ ") does not match number of score files (" << scoreFiles.size() << ")" << endl;
exit(1);
}
@@ -238,11 +246,11 @@ int main(int argc, char** argv)
size_t hypo_id2 = samples[i].getTranslation2().second;
*out << "1";
outputSample(*out, featureDataIters[file_id1]->operator[](hypo_id1),
- featureDataIters[file_id2]->operator[](hypo_id2));
+ featureDataIters[file_id2]->operator[](hypo_id2));
*out << endl;
*out << "0";
outputSample(*out, featureDataIters[file_id2]->operator[](hypo_id2),
- featureDataIters[file_id1]->operator[](hypo_id1));
+ featureDataIters[file_id1]->operator[](hypo_id1));
*out << endl;
}
//advance all iterators
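
The mirrored "1" and "0" rows written above are the heart of PRO's reduction to binary classification: each sampled hypothesis pair yields two training rows with the feature difference reversed. A standalone sketch of that output, using the same 1e-5 threshold as outputSample; the feature vectors and "F" names are illustrative:

  #include <cmath>
  #include <iostream>
  #include <vector>

  static void outputDiff(std::ostream& out, const std::vector<float>& a,
                         const std::vector<float>& b) {
    // Emit only features whose difference is meaningfully nonzero.
    for (unsigned int j = 0; j < a.size(); ++j)
      if (std::abs(a[j] - b[j]) > 0.00001f)
        out << " F" << j << " " << (a[j] - b[j]);
  }

  int main() {
    std::vector<float> f1 = {0.5f, -1.2f, 3.0f};
    std::vector<float> f2 = {0.5f, -0.7f, 2.4f};

    std::cout << "1"; outputDiff(std::cout, f1, f2); std::cout << std::endl;
    std::cout << "0"; outputDiff(std::cout, f2, f1); std::cout << std::endl;
    return 0;
  }
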
diff --git a/mert/sentence-bleu.cpp b/mert/sentence-bleu.cpp
index 17a9737f2..5269d37cd 100644
--- a/mert/sentence-bleu.cpp
+++ b/mert/sentence-bleu.cpp
@@ -18,7 +18,7 @@ int main(int argc, char **argv)
// TODO all of these are empty for now
string config;
string factors;
- string filter;
+ string filter;
BleuScorer scorer(config);
scorer.setFactors(factors);
diff --git a/mira/Decoder.cpp b/mira/Decoder.cpp
index dea6699f6..3dea97f24 100644
--- a/mira/Decoder.cpp
+++ b/mira/Decoder.cpp
@@ -31,357 +31,371 @@ using namespace std;
using namespace Moses;
-namespace Mira {
-
- /**
- * Allocates a char* and copies string into it.
- **/
- static char* strToChar(const string& s) {
- char* c = new char[s.size()+1];
- strcpy(c,s.c_str());
- return c;
+namespace Mira
+{
+
+/**
+ * Allocates a char* and copies string into it.
+**/
+static char* strToChar(const string& s)
+{
+ char* c = new char[s.size()+1];
+ strcpy(c,s.c_str());
+ return c;
+}
+
+MosesDecoder::MosesDecoder(const string& inifile, int debuglevel, int argc, vector<string> decoder_params)
+ : m_manager(NULL)
+{
+ static int BASE_ARGC = 8;
+ Parameter* params = new Parameter();
+ char ** mosesargv = new char*[BASE_ARGC + argc];
+ mosesargv[0] = strToChar("-f");
+ mosesargv[1] = strToChar(inifile);
+ mosesargv[2] = strToChar("-v");
+ stringstream dbgin;
+ dbgin << debuglevel;
+ mosesargv[3] = strToChar(dbgin.str());
+ mosesargv[4] = strToChar("-use-persistent-cache");
+ mosesargv[5] = strToChar("0");
+ mosesargv[6] = strToChar("-persistent-cache-size");
+ mosesargv[7] = strToChar("0");
+
+ for (int i = 0; i < argc; ++i) {
+ char *cstr = &(decoder_params[i])[0];
+ mosesargv[BASE_ARGC + i] = cstr;
}
- MosesDecoder::MosesDecoder(const string& inifile, int debuglevel, int argc, vector<string> decoder_params)
- : m_manager(NULL) {
- static int BASE_ARGC = 8;
- Parameter* params = new Parameter();
- char ** mosesargv = new char*[BASE_ARGC + argc];
- mosesargv[0] = strToChar("-f");
- mosesargv[1] = strToChar(inifile);
- mosesargv[2] = strToChar("-v");
- stringstream dbgin;
- dbgin << debuglevel;
- mosesargv[3] = strToChar(dbgin.str());
- mosesargv[4] = strToChar("-use-persistent-cache");
- mosesargv[5] = strToChar("0");
- mosesargv[6] = strToChar("-persistent-cache-size");
- mosesargv[7] = strToChar("0");
-
- for (int i = 0; i < argc; ++i) {
- char *cstr = &(decoder_params[i])[0];
- mosesargv[BASE_ARGC + i] = cstr;
- }
-
- if (!params->LoadParam(BASE_ARGC + argc,mosesargv)) {
- cerr << "Loading static data failed, exit." << endl;
- exit(1);
- }
- StaticData::LoadDataStatic(params, "mira");
- for (int i = 0; i < BASE_ARGC; ++i) {
- delete[] mosesargv[i];
- }
- delete[] mosesargv;
-
- //m_bleuScoreFeature = staticData.GetBleuScoreFeature(); TODO
- assert(false);
+ if (!params->LoadParam(BASE_ARGC + argc,mosesargv)) {
+ cerr << "Loading static data failed, exit." << endl;
+ exit(1);
}
-
- void MosesDecoder::cleanup(bool chartDecoding) {
- delete m_manager;
- if (chartDecoding)
- delete m_chartManager;
- else
- delete m_sentence;
+ StaticData::LoadDataStatic(params, "mira");
+ for (int i = 0; i < BASE_ARGC; ++i) {
+ delete[] mosesargv[i];
+ }
+ delete[] mosesargv;
+
+ //m_bleuScoreFeature = staticData.GetBleuScoreFeature(); TODO
+ assert(false);
+}
+
+void MosesDecoder::cleanup(bool chartDecoding)
+{
+ delete m_manager;
+ if (chartDecoding)
+ delete m_chartManager;
+ else
+ delete m_sentence;
+}
+
+vector< vector<const Word*> > MosesDecoder::getNBest(const std::string& source,
+ size_t sentenceid,
+ size_t nBestSize,
+ float bleuObjectiveWeight,
+ float bleuScoreWeight,
+ vector< ScoreComponentCollection>& featureValues,
+ vector< float>& bleuScores,
+ vector< float>& modelScores,
+ size_t numReturnedTranslations,
+ bool realBleu,
+ bool distinct,
+ bool avgRefLength,
+ size_t rank,
+ size_t epoch,
+ string filename)
+{
+ StaticData &staticData = StaticData::InstanceNonConst();
+ bool chartDecoding = (staticData.GetSearchAlgorithm() == ChartDecoding);
+ initialize(staticData, source, sentenceid, bleuObjectiveWeight, bleuScoreWeight, avgRefLength, chartDecoding);
+
+ // run the decoder
+ if (chartDecoding) {
+ return runChartDecoder(source, sentenceid, nBestSize, bleuObjectiveWeight, bleuScoreWeight,
+ featureValues, bleuScores, modelScores, numReturnedTranslations, realBleu, distinct, rank, epoch);
+ } else {
+ SearchAlgorithm search = staticData.GetSearchAlgorithm();
+ return runDecoder(source, sentenceid, nBestSize, bleuObjectiveWeight, bleuScoreWeight,
+ featureValues, bleuScores, modelScores, numReturnedTranslations, realBleu, distinct, rank, epoch,
+ search, filename);
+ }
+}
+
+vector< vector<const Word*> > MosesDecoder::runDecoder(const std::string& source,
+ size_t sentenceid,
+ size_t nBestSize,
+ float bleuObjectiveWeight,
+ float bleuScoreWeight,
+ vector< ScoreComponentCollection>& featureValues,
+ vector< float>& bleuScores,
+ vector< float>& modelScores,
+ size_t numReturnedTranslations,
+ bool realBleu,
+ bool distinct,
+ size_t rank,
+ size_t epoch,
+ SearchAlgorithm& search,
+ string filename)
+{
+ // run the decoder
+ m_manager = new Moses::Manager(0,*m_sentence, search);
+ m_manager->ProcessSentence();
+ TrellisPathList nBestList;
+ m_manager->CalcNBest(nBestSize, nBestList, distinct);
+
+ // optionally print nbest to file (to extract scores and features.. currently just for sentence bleu scoring)
+ if (filename != "") {
+ ofstream out(filename.c_str());
+ if (!out) {
+ ostringstream msg;
+ msg << "Unable to open " << filename;
+ throw runtime_error(msg.str());
+ }
+ // TODO: handle sentence id (for now always 0)
+ //OutputNBest(out, nBestList, StaticData::Instance().GetOutputFactorOrder(), 0, false);
+ out.close();
}
- vector< vector<const Word*> > MosesDecoder::getNBest(const std::string& source,
- size_t sentenceid,
- size_t nBestSize,
- float bleuObjectiveWeight,
- float bleuScoreWeight,
- vector< ScoreComponentCollection>& featureValues,
- vector< float>& bleuScores,
- vector< float>& modelScores,
- size_t numReturnedTranslations,
- bool realBleu,
- bool distinct,
- bool avgRefLength,
- size_t rank,
- size_t epoch,
- string filename)
- {
- StaticData &staticData = StaticData::InstanceNonConst();
- bool chartDecoding = (staticData.GetSearchAlgorithm() == ChartDecoding);
- initialize(staticData, source, sentenceid, bleuObjectiveWeight, bleuScoreWeight, avgRefLength, chartDecoding);
+ // read off the feature values and bleu scores for each sentence in the nbest list
+ Moses::TrellisPathList::const_iterator iter;
+ for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
+ const Moses::TrellisPath &path = **iter;
+ featureValues.push_back(path.GetScoreBreakdown());
+ float bleuScore, dynBleuScore, realBleuScore;
+ if (realBleu) realBleuScore = m_bleuScoreFeature->CalculateBleu(path.GetTargetPhrase());
+ else dynBleuScore = getBleuScore(featureValues.back());
+ bleuScore = realBleu ? realBleuScore : dynBleuScore;
+ bleuScores.push_back(bleuScore);
+
+ //std::cout << "Score breakdown: " << path.GetScoreBreakdown() << endl;
+ float scoreWithoutBleu = path.GetTotalScore() - (bleuObjectiveWeight * bleuScoreWeight * bleuScore);
+ modelScores.push_back(scoreWithoutBleu);
+
+ if (iter != nBestList.begin())
+ cerr << endl;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", \"" << path.GetTargetPhrase() << "\", score: "
+ << scoreWithoutBleu << ", Bleu: " << bleuScore << ", total: " << path.GetTotalScore();
+ if (m_bleuScoreFeature->Enabled() && realBleu)
+ cerr << " (d-bleu: " << dynBleuScore << ", r-bleu: " << realBleuScore << ") ";
+
+ // set bleu score to zero in the feature vector since we do not want to optimise its weight
+ setBleuScore(featureValues.back(), 0);
+ }
- // run the decoder
- if (chartDecoding) {
- return runChartDecoder(source, sentenceid, nBestSize, bleuObjectiveWeight, bleuScoreWeight,
- featureValues, bleuScores, modelScores, numReturnedTranslations, realBleu, distinct, rank, epoch);
- }
- else {
- SearchAlgorithm search = staticData.GetSearchAlgorithm();
- return runDecoder(source, sentenceid, nBestSize, bleuObjectiveWeight, bleuScoreWeight,
- featureValues, bleuScores, modelScores, numReturnedTranslations, realBleu, distinct, rank, epoch,
- search, filename);
+ // prepare translations to return
+ vector< vector<const Word*> > translations;
+ for (size_t i=0; i < numReturnedTranslations && i < nBestList.GetSize(); ++i) {
+ const TrellisPath &path = nBestList.at(i);
+ Phrase phrase = path.GetTargetPhrase();
+
+ vector<const Word*> translation;
+ for (size_t pos = 0; pos < phrase.GetSize(); ++pos) {
+ const Word &word = phrase.GetWord(pos);
+ Word *newWord = new Word(word);
+ translation.push_back(newWord);
}
+ translations.push_back(translation);
}
- vector< vector<const Word*> > MosesDecoder::runDecoder(const std::string& source,
- size_t sentenceid,
- size_t nBestSize,
- float bleuObjectiveWeight,
- float bleuScoreWeight,
- vector< ScoreComponentCollection>& featureValues,
- vector< float>& bleuScores,
- vector< float>& modelScores,
- size_t numReturnedTranslations,
- bool realBleu,
- bool distinct,
- size_t rank,
- size_t epoch,
- SearchAlgorithm& search,
- string filename) {
- // run the decoder
- m_manager = new Moses::Manager(0,*m_sentence, search);
- m_manager->ProcessSentence();
- TrellisPathList nBestList;
- m_manager->CalcNBest(nBestSize, nBestList, distinct);
-
- // optionally print nbest to file (to extract scores and features.. currently just for sentence bleu scoring)
- if (filename != "") {
- ofstream out(filename.c_str());
- if (!out) {
- ostringstream msg;
- msg << "Unable to open " << filename;
- throw runtime_error(msg.str());
- }
- // TODO: handle sentence id (for now always 0)
- //OutputNBest(out, nBestList, StaticData::Instance().GetOutputFactorOrder(), 0, false);
- out.close();
- }
+ return translations;
+}
+
+vector< vector<const Word*> > MosesDecoder::runChartDecoder(const std::string& source,
+ size_t sentenceid,
+ size_t nBestSize,
+ float bleuObjectiveWeight,
+ float bleuScoreWeight,
+ vector< ScoreComponentCollection>& featureValues,
+ vector< float>& bleuScores,
+ vector< float>& modelScores,
+ size_t numReturnedTranslations,
+ bool realBleu,
+ bool distinct,
+ size_t rank,
+ size_t epoch)
+{
+ // run the decoder
+ m_chartManager = new ChartManager(*m_sentence);
+ m_chartManager->ProcessSentence();
+ ChartTrellisPathList nBestList;
+ m_chartManager->CalcNBest(nBestSize, nBestList, distinct);
+
+ // read off the feature values and bleu scores for each sentence in the nbest list
+ ChartTrellisPathList::const_iterator iter;
+ for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
+ const Moses::ChartTrellisPath &path = **iter;
+ featureValues.push_back(path.GetScoreBreakdown());
+ float bleuScore, dynBleuScore, realBleuScore;
+ dynBleuScore = getBleuScore(featureValues.back());
+ realBleuScore = m_bleuScoreFeature->CalculateBleu(path.GetOutputPhrase());
+ bleuScore = realBleu ? realBleuScore : dynBleuScore;
+ bleuScores.push_back(bleuScore);
+
+ //std::cout << "Score breakdown: " << path.GetScoreBreakdown() << endl;
+ float scoreWithoutBleu = path.GetTotalScore() - (bleuObjectiveWeight * bleuScoreWeight * bleuScore);
+ modelScores.push_back(scoreWithoutBleu);
+
+ if (iter != nBestList.begin())
+ cerr << endl;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", \"" << path.GetOutputPhrase() << "\", score: "
+ << scoreWithoutBleu << ", Bleu: " << bleuScore << ", total: " << path.GetTotalScore();
+ if (m_bleuScoreFeature->Enabled() && realBleu)
+ cerr << " (d-bleu: " << dynBleuScore << ", r-bleu: " << realBleuScore << ") ";
+
+ // set bleu score to zero in the feature vector since we do not want to optimise its weight
+ setBleuScore(featureValues.back(), 0);
+ }
- // read off the feature values and bleu scores for each sentence in the nbest list
- Moses::TrellisPathList::const_iterator iter;
- for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
- const Moses::TrellisPath &path = **iter;
- featureValues.push_back(path.GetScoreBreakdown());
- float bleuScore, dynBleuScore, realBleuScore;
- if (realBleu) realBleuScore = m_bleuScoreFeature->CalculateBleu(path.GetTargetPhrase());
- else dynBleuScore = getBleuScore(featureValues.back());
- bleuScore = realBleu ? realBleuScore : dynBleuScore;
- bleuScores.push_back(bleuScore);
-
- //std::cout << "Score breakdown: " << path.GetScoreBreakdown() << endl;
- float scoreWithoutBleu = path.GetTotalScore() - (bleuObjectiveWeight * bleuScoreWeight * bleuScore);
- modelScores.push_back(scoreWithoutBleu);
-
- if (iter != nBestList.begin())
- cerr << endl;
- cerr << "Rank " << rank << ", epoch " << epoch << ", \"" << path.GetTargetPhrase() << "\", score: "
- << scoreWithoutBleu << ", Bleu: " << bleuScore << ", total: " << path.GetTotalScore();
- if (m_bleuScoreFeature->Enabled() && realBleu)
- cerr << " (d-bleu: " << dynBleuScore << ", r-bleu: " << realBleuScore << ") ";
-
- // set bleu score to zero in the feature vector since we do not want to optimise its weight
- setBleuScore(featureValues.back(), 0);
+ // prepare translations to return
+ vector< vector<const Word*> > translations;
+ for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
+ const ChartTrellisPath &path = **iter;
+ Phrase phrase = path.GetOutputPhrase();
+
+ vector<const Word*> translation;
+ for (size_t pos = 0; pos < phrase.GetSize(); ++pos) {
+ const Word &word = phrase.GetWord(pos);
+ Word *newWord = new Word(word);
+ translation.push_back(newWord);
}
+ translations.push_back(translation);
+ }
- // prepare translations to return
- vector< vector<const Word*> > translations;
- for (size_t i=0; i < numReturnedTranslations && i < nBestList.GetSize(); ++i) {
- const TrellisPath &path = nBestList.at(i);
- Phrase phrase = path.GetTargetPhrase();
-
- vector<const Word*> translation;
- for (size_t pos = 0; pos < phrase.GetSize(); ++pos) {
- const Word &word = phrase.GetWord(pos);
- Word *newWord = new Word(word);
- translation.push_back(newWord);
- }
- translations.push_back(translation);
- }
+ return translations;
+}
- return translations;
- }
+void MosesDecoder::outputNBestList(const std::string& source, size_t sentenceid,
+ size_t nBestSize, float bleuObjectiveWeight, float bleuScoreWeight,
+ bool distinctNbest, bool avgRefLength, string filename, ofstream& streamOut)
+{
+ StaticData &staticData = StaticData::InstanceNonConst();
+ bool chartDecoding = (staticData.GetSearchAlgorithm() == ChartDecoding);
+ initialize(staticData, source, sentenceid, bleuObjectiveWeight, bleuScoreWeight, avgRefLength, chartDecoding);
- vector< vector<const Word*> > MosesDecoder::runChartDecoder(const std::string& source,
- size_t sentenceid,
- size_t nBestSize,
- float bleuObjectiveWeight,
- float bleuScoreWeight,
- vector< ScoreComponentCollection>& featureValues,
- vector< float>& bleuScores,
- vector< float>& modelScores,
- size_t numReturnedTranslations,
- bool realBleu,
- bool distinct,
- size_t rank,
- size_t epoch) {
- // run the decoder
+ if (chartDecoding) {
m_chartManager = new ChartManager(*m_sentence);
m_chartManager->ProcessSentence();
ChartTrellisPathList nBestList;
- m_chartManager->CalcNBest(nBestSize, nBestList, distinct);
-
- // read off the feature values and bleu scores for each sentence in the nbest list
- ChartTrellisPathList::const_iterator iter;
- for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
- const Moses::ChartTrellisPath &path = **iter;
- featureValues.push_back(path.GetScoreBreakdown());
- float bleuScore, dynBleuScore, realBleuScore;
- dynBleuScore = getBleuScore(featureValues.back());
- realBleuScore = m_bleuScoreFeature->CalculateBleu(path.GetOutputPhrase());
- bleuScore = realBleu ? realBleuScore : dynBleuScore;
- bleuScores.push_back(bleuScore);
-
- //std::cout << "Score breakdown: " << path.GetScoreBreakdown() << endl;
- float scoreWithoutBleu = path.GetTotalScore() - (bleuObjectiveWeight * bleuScoreWeight * bleuScore);
- modelScores.push_back(scoreWithoutBleu);
-
- if (iter != nBestList.begin())
- cerr << endl;
- cerr << "Rank " << rank << ", epoch " << epoch << ", \"" << path.GetOutputPhrase() << "\", score: "
- << scoreWithoutBleu << ", Bleu: " << bleuScore << ", total: " << path.GetTotalScore();
- if (m_bleuScoreFeature->Enabled() && realBleu)
- cerr << " (d-bleu: " << dynBleuScore << ", r-bleu: " << realBleuScore << ") ";
-
- // set bleu score to zero in the feature vector since we do not want to optimise its weight
- setBleuScore(featureValues.back(), 0);
- }
-
- // prepare translations to return
- vector< vector<const Word*> > translations;
- for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
- const ChartTrellisPath &path = **iter;
- Phrase phrase = path.GetOutputPhrase();
-
- vector<const Word*> translation;
- for (size_t pos = 0; pos < phrase.GetSize(); ++pos) {
- const Word &word = phrase.GetWord(pos);
- Word *newWord = new Word(word);
- translation.push_back(newWord);
- }
- translations.push_back(translation);
- }
-
- return translations;
- }
+ m_chartManager->CalcNBest(nBestSize, nBestList, distinctNbest);
- void MosesDecoder::outputNBestList(const std::string& source, size_t sentenceid,
- size_t nBestSize, float bleuObjectiveWeight, float bleuScoreWeight,
- bool distinctNbest, bool avgRefLength, string filename, ofstream& streamOut) {
- StaticData &staticData = StaticData::InstanceNonConst();
- bool chartDecoding = (staticData.GetSearchAlgorithm() == ChartDecoding);
- initialize(staticData, source, sentenceid, bleuObjectiveWeight, bleuScoreWeight, avgRefLength, chartDecoding);
-
- if (chartDecoding) {
- m_chartManager = new ChartManager(*m_sentence);
- m_chartManager->ProcessSentence();
- ChartTrellisPathList nBestList;
- m_chartManager->CalcNBest(nBestSize, nBestList, distinctNbest);
-
- cerr << "generate nbest list " << filename << endl;
- cerr << "not implemented.." << endl;
- exit(1);
- if (filename != "") {
- ofstream out(filename.c_str());
- if (!out) {
- ostringstream msg;
- msg << "Unable to open " << filename;
- throw runtime_error(msg.str());
- }
- // TODO: handle sentence id (for now always 0)
+ cerr << "generate nbest list " << filename << endl;
+ cerr << "not implemented.." << endl;
+ exit(1);
+ if (filename != "") {
+ ofstream out(filename.c_str());
+ if (!out) {
+ ostringstream msg;
+ msg << "Unable to open " << filename;
+ throw runtime_error(msg.str());
+ }
+ // TODO: handle sentence id (for now always 0)
// OutputNBestList(const ChartTrellisPathList &nBestList, const ChartHypothesis *bestHypo, const TranslationSystem* system, long translationId, false)
// OutputNBest(out, nBestList, StaticData::Instance().GetOutputFactorOrder(),m_manager->GetTranslationSystem(), 0, false);
- out.close();
- }
- else {
+ out.close();
+ } else {
// OutputNBest(streamOut, nBestList, StaticData::Instance().GetOutputFactorOrder(),m_manager->GetTranslationSystem(), sentenceid, false);
- }
}
- else {
- // run the decoder
- m_manager = new Moses::Manager(0,*m_sentence, staticData.GetSearchAlgorithm());
- m_manager->ProcessSentence();
- TrellisPathList nBestList;
- m_manager->CalcNBest(nBestSize, nBestList, distinctNbest);
-
- if (filename != "") {
- ofstream out(filename.c_str());
- if (!out) {
- ostringstream msg;
- msg << "Unable to open " << filename;
- throw runtime_error(msg.str());
- }
- // TODO: handle sentence id (for now always 0)
- //OutputNBest(out, nBestList, StaticData::Instance().GetOutputFactorOrder(),m_manager->GetTranslationSystem(), 0, false);
- out.close();
- }
- else {
- //OutputNBest(streamOut, nBestList, StaticData::Instance().GetOutputFactorOrder(),m_manager->GetTranslationSystem(), sentenceid, false);
- streamOut.flush();
+ } else {
+ // run the decoder
+ m_manager = new Moses::Manager(0,*m_sentence, staticData.GetSearchAlgorithm());
+ m_manager->ProcessSentence();
+ TrellisPathList nBestList;
+ m_manager->CalcNBest(nBestSize, nBestList, distinctNbest);
+
+ if (filename != "") {
+ ofstream out(filename.c_str());
+ if (!out) {
+ ostringstream msg;
+ msg << "Unable to open " << filename;
+ throw runtime_error(msg.str());
}
+ // TODO: handle sentence id (for now always 0)
+ //OutputNBest(out, nBestList, StaticData::Instance().GetOutputFactorOrder(),m_manager->GetTranslationSystem(), 0, false);
+ out.close();
+ } else {
+ //OutputNBest(streamOut, nBestList, StaticData::Instance().GetOutputFactorOrder(),m_manager->GetTranslationSystem(), sentenceid, false);
+ streamOut.flush();
}
}
-
- void MosesDecoder::initialize(StaticData& staticData, const std::string& source, size_t sentenceid,
- float bleuObjectiveWeight, float bleuScoreWeight, bool avgRefLength, bool chartDecoding) {
- m_sentence = new Sentence();
- stringstream in(source + "\n");
- const std::vector<FactorType> &inputFactorOrder = staticData.GetInputFactorOrder();
- m_sentence->Read(in,inputFactorOrder);
-
- // set weight of BleuScoreFeature
- //cerr << "Reload Bleu feature weight: " << bleuObjectiveWeight*bleuScoreWeight << " (" << bleuObjectiveWeight << "*" << bleuScoreWeight << ")" << endl;
- staticData.ReLoadBleuScoreFeatureParameter(bleuObjectiveWeight*bleuScoreWeight);
-
- m_bleuScoreFeature->SetCurrSourceLength((*m_sentence).GetSize());
- if (chartDecoding)
- m_bleuScoreFeature->SetCurrNormSourceLength((*m_sentence).GetSize()-2);
- else
- m_bleuScoreFeature->SetCurrNormSourceLength((*m_sentence).GetSize());
-
- if (avgRefLength)
- m_bleuScoreFeature->SetCurrAvgRefLength(sentenceid);
- else
- m_bleuScoreFeature->SetCurrShortestRefLength(sentenceid);
- m_bleuScoreFeature->SetCurrReferenceNgrams(sentenceid);
- }
-
- float MosesDecoder::getBleuScore(const ScoreComponentCollection& scores) {
- return scores.GetScoreForProducer(m_bleuScoreFeature);
- }
-
- void MosesDecoder::setBleuScore(ScoreComponentCollection& scores, float bleu) {
- scores.Assign(m_bleuScoreFeature, bleu);
- }
-
- ScoreComponentCollection MosesDecoder::getWeights() {
- return StaticData::Instance().GetAllWeights();
- }
-
- void MosesDecoder::setWeights(const ScoreComponentCollection& weights) {
- StaticData::InstanceNonConst().SetAllWeights(weights);
- }
-
- void MosesDecoder::updateHistory(const vector<const Word*>& words) {
- m_bleuScoreFeature->UpdateHistory(words);
- }
-
- void MosesDecoder::updateHistory(const vector< vector< const Word*> >& words, vector<size_t>& sourceLengths, vector<size_t>& ref_ids, size_t rank, size_t epoch) {
- m_bleuScoreFeature->UpdateHistory(words, sourceLengths, ref_ids, rank, epoch);
- }
-
- void MosesDecoder::printBleuFeatureHistory(std::ostream& out) {
- m_bleuScoreFeature->PrintHistory(out);
- }
-
- size_t MosesDecoder::getClosestReferenceLength(size_t ref_id, int hypoLength) {
- return m_bleuScoreFeature->GetClosestRefLength(ref_id, hypoLength);
- }
-
- size_t MosesDecoder::getShortestReferenceIndex(size_t ref_id) {
- return m_bleuScoreFeature->GetShortestRefIndex(ref_id);
- }
-
- void MosesDecoder::setBleuParameters(bool disable, bool sentenceBleu, bool scaleByInputLength, bool scaleByAvgInputLength,
- bool scaleByInverseLength, bool scaleByAvgInverseLength,
- float scaleByX, float historySmoothing, size_t scheme, bool simpleHistoryBleu) {
- m_bleuScoreFeature->SetBleuParameters(disable, sentenceBleu, scaleByInputLength, scaleByAvgInputLength,
- scaleByInverseLength, scaleByAvgInverseLength,
- scaleByX, historySmoothing, scheme, simpleHistoryBleu);
- }
-}
+}
+
+void MosesDecoder::initialize(StaticData& staticData, const std::string& source, size_t sentenceid,
+ float bleuObjectiveWeight, float bleuScoreWeight, bool avgRefLength, bool chartDecoding)
+{
+ m_sentence = new Sentence();
+ stringstream in(source + "\n");
+ const std::vector<FactorType> &inputFactorOrder = staticData.GetInputFactorOrder();
+ m_sentence->Read(in,inputFactorOrder);
+
+ // set weight of BleuScoreFeature
+ //cerr << "Reload Bleu feature weight: " << bleuObjectiveWeight*bleuScoreWeight << " (" << bleuObjectiveWeight << "*" << bleuScoreWeight << ")" << endl;
+ staticData.ReLoadBleuScoreFeatureParameter(bleuObjectiveWeight*bleuScoreWeight);
+
+ m_bleuScoreFeature->SetCurrSourceLength((*m_sentence).GetSize());
+ if (chartDecoding)
+ m_bleuScoreFeature->SetCurrNormSourceLength((*m_sentence).GetSize()-2);
+ else
+ m_bleuScoreFeature->SetCurrNormSourceLength((*m_sentence).GetSize());
+
+ if (avgRefLength)
+ m_bleuScoreFeature->SetCurrAvgRefLength(sentenceid);
+ else
+ m_bleuScoreFeature->SetCurrShortestRefLength(sentenceid);
+ m_bleuScoreFeature->SetCurrReferenceNgrams(sentenceid);
+}
+
+float MosesDecoder::getBleuScore(const ScoreComponentCollection& scores)
+{
+ return scores.GetScoreForProducer(m_bleuScoreFeature);
+}
+
+void MosesDecoder::setBleuScore(ScoreComponentCollection& scores, float bleu)
+{
+ scores.Assign(m_bleuScoreFeature, bleu);
+}
+
+ScoreComponentCollection MosesDecoder::getWeights()
+{
+ return StaticData::Instance().GetAllWeights();
+}
+
+void MosesDecoder::setWeights(const ScoreComponentCollection& weights)
+{
+ StaticData::InstanceNonConst().SetAllWeights(weights);
+}
+
+void MosesDecoder::updateHistory(const vector<const Word*>& words)
+{
+ m_bleuScoreFeature->UpdateHistory(words);
+}
+
+void MosesDecoder::updateHistory(const vector< vector< const Word*> >& words, vector<size_t>& sourceLengths, vector<size_t>& ref_ids, size_t rank, size_t epoch)
+{
+ m_bleuScoreFeature->UpdateHistory(words, sourceLengths, ref_ids, rank, epoch);
+}
+
+void MosesDecoder::printBleuFeatureHistory(std::ostream& out)
+{
+ m_bleuScoreFeature->PrintHistory(out);
+}
+
+size_t MosesDecoder::getClosestReferenceLength(size_t ref_id, int hypoLength)
+{
+ return m_bleuScoreFeature->GetClosestRefLength(ref_id, hypoLength);
+}
+
+size_t MosesDecoder::getShortestReferenceIndex(size_t ref_id)
+{
+ return m_bleuScoreFeature->GetShortestRefIndex(ref_id);
+}
+
+void MosesDecoder::setBleuParameters(bool disable, bool sentenceBleu, bool scaleByInputLength, bool scaleByAvgInputLength,
+ bool scaleByInverseLength, bool scaleByAvgInverseLength,
+ float scaleByX, float historySmoothing, size_t scheme, bool simpleHistoryBleu)
+{
+ m_bleuScoreFeature->SetBleuParameters(disable, sentenceBleu, scaleByInputLength, scaleByAvgInputLength,
+ scaleByInverseLength, scaleByAvgInverseLength,
+ scaleByX, historySmoothing, scheme, simpleHistoryBleu);
+}
+}
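
The constructor reflowed above hand-builds an argv for Parameter::LoadParam: the first BASE_ARGC entries are deep copies made by strToChar, the trailing ones borrow the caller's strings, and only the deep copies are deleted afterwards. The same ownership pattern in isolation (the flag names and printing consumer here are hypothetical):

  #include <cstring>
  #include <iostream>
  #include <string>
  #include <vector>

  static char* strToChar(const std::string& s) {
    char* c = new char[s.size() + 1];
    std::strcpy(c, s.c_str());
    return c;
  }

  int main() {
    const int BASE_ARGC = 2;
    std::vector<std::string> extra = {"-threads", "4"};   // hypothetical params

    char** argvBuf = new char*[BASE_ARGC + extra.size()];
    argvBuf[0] = strToChar("-f");            // deep copies, owned here
    argvBuf[1] = strToChar("moses.ini");
    for (std::size_t i = 0; i < extra.size(); ++i)
      argvBuf[BASE_ARGC + i] = &extra[i][0]; // borrowed, not owned

    for (std::size_t i = 0; i < BASE_ARGC + extra.size(); ++i)
      std::cout << argvBuf[i] << std::endl;

    for (int i = 0; i < BASE_ARGC; ++i)
      delete[] argvBuf[i];                   // free only the deep copies
    delete[] argvBuf;
    return 0;
  }
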
diff --git a/mira/Decoder.h b/mira/Decoder.h
index 49a33d4d0..ac8acc26b 100644
--- a/mira/Decoder.h
+++ b/mira/Decoder.h
@@ -36,100 +36,110 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
// Wrapper functions and objects for the decoder.
//
-namespace Mira {
-
+namespace Mira
+{
+
/**
* Wraps moses decoder.
**/
-class MosesDecoder {
- public:
- /**
- * Initialise moses (including StaticData) using the given ini file and debuglevel, passing through any
- * other command line arguments.
- **/
- MosesDecoder(const std::string& inifile, int debuglevel, int argc, std::vector<std::string> decoder_params);
-
- //returns the best sentence
- std::vector< std::vector<const Moses::Word*> > getNBest(const std::string& source,
- size_t sentenceid,
- size_t nbestSize,
- float bleuObjectiveweight, //weight of bleu in objective
- float bleuScoreWeight, //weight of bleu in score
- std::vector< Moses::ScoreComponentCollection>& featureValues,
- std::vector< float>& bleuScores,
- std::vector< float>& modelScores,
- size_t numReturnedTranslations,
- bool realBleu,
- bool distinct,
- bool avgRefLength,
- size_t rank,
- size_t epoch,
- std::string filename);
- std::vector< std::vector<const Moses::Word*> > runDecoder(const std::string& source,
- size_t sentenceid,
- size_t nbestSize,
- float bleuObjectiveweight, //weight of bleu in objective
- float bleuScoreWeight, //weight of bleu in score
- std::vector< Moses::ScoreComponentCollection>& featureValues,
- std::vector< float>& bleuScores,
- std::vector< float>& modelScores,
- size_t numReturnedTranslations,
- bool realBleu,
- bool distinct,
- size_t rank,
- size_t epoch,
- Moses::SearchAlgorithm& seach,
- std::string filename);
- std::vector< std::vector<const Moses::Word*> > runChartDecoder(const std::string& source,
- size_t sentenceid,
- size_t nbestSize,
- float bleuObjectiveweight, //weight of bleu in objective
- float bleuScoreWeight, //weight of bleu in score
- std::vector< Moses::ScoreComponentCollection>& featureValues,
- std::vector< float>& bleuScores,
- std::vector< float>& modelScores,
- size_t numReturnedTranslations,
- bool realBleu,
- bool distinct,
- size_t rank,
- size_t epoch);
- void outputNBestList(const std::string& source,
- size_t sentenceid,
- size_t nBestSize,
- float bleuObjectiveWeight,
- float bleuScoreWeight,
- bool distinctNbest,
- bool avgRefLength,
- std::string filename,
- std::ofstream& streamOut);
- void initialize(Moses::StaticData& staticData, const std::string& source, size_t sentenceid,
- float bleuObjectiveWeight, float bleuScoreWeight, bool avgRefLength, bool chartDecoding);
- void updateHistory(const std::vector<const Moses::Word*>& words);
- void updateHistory(const std::vector< std::vector< const Moses::Word*> >& words, std::vector<size_t>& sourceLengths, std::vector<size_t>& ref_ids, size_t rank, size_t epoch);
- void printBleuFeatureHistory(std::ostream& out);
- void printReferenceLength(const std::vector<size_t>& ref_ids);
- size_t getReferenceLength(size_t ref_id);
- size_t getClosestReferenceLength(size_t ref_id, int hypoLength);
- size_t getShortestReferenceIndex(size_t ref_id);
- void setBleuParameters(bool disable, bool sentenceBleu, bool scaleByInputLength, bool scaleByAvgInputLength,
- bool scaleByInverseLength, bool scaleByAvgInverseLength,
- float scaleByX, float historySmoothing, size_t scheme, bool simpleHistoryBleu);
- void setAvgInputLength (float l) { m_bleuScoreFeature->SetAvgInputLength(l); }
- Moses::ScoreComponentCollection getWeights();
- void setWeights(const Moses::ScoreComponentCollection& weights);
- void cleanup(bool chartDecoding);
-
- float getSourceLengthHistory() { return m_bleuScoreFeature->GetSourceLengthHistory(); }
- float getTargetLengthHistory() { return m_bleuScoreFeature->GetTargetLengthHistory(); }
- float getAverageInputLength() { return m_bleuScoreFeature->GetAverageInputLength(); }
+class MosesDecoder
+{
+public:
+ /**
+ * Initialise moses (including StaticData) using the given ini file and debuglevel, passing through any
+ * other command line arguments.
+ **/
+ MosesDecoder(const std::string& inifile, int debuglevel, int argc, std::vector<std::string> decoder_params);
+
+  // returns the n-best translations for the given source sentence
+ std::vector< std::vector<const Moses::Word*> > getNBest(const std::string& source,
+ size_t sentenceid,
+ size_t nbestSize,
+ float bleuObjectiveweight, //weight of bleu in objective
+ float bleuScoreWeight, //weight of bleu in score
+ std::vector< Moses::ScoreComponentCollection>& featureValues,
+ std::vector< float>& bleuScores,
+ std::vector< float>& modelScores,
+ size_t numReturnedTranslations,
+ bool realBleu,
+ bool distinct,
+ bool avgRefLength,
+ size_t rank,
+ size_t epoch,
+ std::string filename);
+ std::vector< std::vector<const Moses::Word*> > runDecoder(const std::string& source,
+ size_t sentenceid,
+ size_t nbestSize,
+ float bleuObjectiveweight, //weight of bleu in objective
+ float bleuScoreWeight, //weight of bleu in score
+ std::vector< Moses::ScoreComponentCollection>& featureValues,
+ std::vector< float>& bleuScores,
+ std::vector< float>& modelScores,
+ size_t numReturnedTranslations,
+ bool realBleu,
+ bool distinct,
+ size_t rank,
+ size_t epoch,
+      Moses::SearchAlgorithm& search,
+ std::string filename);
+ std::vector< std::vector<const Moses::Word*> > runChartDecoder(const std::string& source,
+ size_t sentenceid,
+ size_t nbestSize,
+ float bleuObjectiveweight, //weight of bleu in objective
+ float bleuScoreWeight, //weight of bleu in score
+ std::vector< Moses::ScoreComponentCollection>& featureValues,
+ std::vector< float>& bleuScores,
+ std::vector< float>& modelScores,
+ size_t numReturnedTranslations,
+ bool realBleu,
+ bool distinct,
+ size_t rank,
+ size_t epoch);
+ void outputNBestList(const std::string& source,
+ size_t sentenceid,
+ size_t nBestSize,
+ float bleuObjectiveWeight,
+ float bleuScoreWeight,
+ bool distinctNbest,
+ bool avgRefLength,
+ std::string filename,
+ std::ofstream& streamOut);
+ void initialize(Moses::StaticData& staticData, const std::string& source, size_t sentenceid,
+ float bleuObjectiveWeight, float bleuScoreWeight, bool avgRefLength, bool chartDecoding);
+ void updateHistory(const std::vector<const Moses::Word*>& words);
+ void updateHistory(const std::vector< std::vector< const Moses::Word*> >& words, std::vector<size_t>& sourceLengths, std::vector<size_t>& ref_ids, size_t rank, size_t epoch);
+ void printBleuFeatureHistory(std::ostream& out);
+ void printReferenceLength(const std::vector<size_t>& ref_ids);
+ size_t getReferenceLength(size_t ref_id);
+ size_t getClosestReferenceLength(size_t ref_id, int hypoLength);
+ size_t getShortestReferenceIndex(size_t ref_id);
+ void setBleuParameters(bool disable, bool sentenceBleu, bool scaleByInputLength, bool scaleByAvgInputLength,
+ bool scaleByInverseLength, bool scaleByAvgInverseLength,
+ float scaleByX, float historySmoothing, size_t scheme, bool simpleHistoryBleu);
+ void setAvgInputLength (float l) {
+ m_bleuScoreFeature->SetAvgInputLength(l);
+ }
+ Moses::ScoreComponentCollection getWeights();
+ void setWeights(const Moses::ScoreComponentCollection& weights);
+ void cleanup(bool chartDecoding);
+
+ float getSourceLengthHistory() {
+ return m_bleuScoreFeature->GetSourceLengthHistory();
+ }
+ float getTargetLengthHistory() {
+ return m_bleuScoreFeature->GetTargetLengthHistory();
+ }
+ float getAverageInputLength() {
+ return m_bleuScoreFeature->GetAverageInputLength();
+ }
- private:
- float getBleuScore(const Moses::ScoreComponentCollection& scores);
- void setBleuScore(Moses::ScoreComponentCollection& scores, float bleu);
- Moses::Manager *m_manager;
- Moses::ChartManager *m_chartManager;
- Moses::Sentence *m_sentence;
- Moses::BleuScoreFeature *m_bleuScoreFeature;
+private:
+ float getBleuScore(const Moses::ScoreComponentCollection& scores);
+ void setBleuScore(Moses::ScoreComponentCollection& scores, float bleu);
+ Moses::Manager *m_manager;
+ Moses::ChartManager *m_chartManager;
+ Moses::Sentence *m_sentence;
+ Moses::BleuScoreFeature *m_bleuScoreFeature;
};
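
For orientation: the class above is the MIRA training wrapper around the Moses decoder. A minimal, hypothetical driver against this interface might look like the sketch below. Only the constructor and getNBest signatures come from the declaration above; the header name, the moses.ini path, and every argument value are assumptions made purely for illustration.

// Hypothetical MIRA driver (illustration only; nothing here is confirmed
// by this commit beyond the MosesDecoder signatures shown above).
#include <string>
#include <vector>
#include "Decoder.h"   // assumed header for the MosesDecoder wrapper

int main(int argc, char** argv) {
  std::vector<std::string> decoderParams;          // extra flags forwarded to moses
  MosesDecoder decoder("moses.ini", /*debuglevel=*/0, argc, decoderParams);

  std::vector<Moses::ScoreComponentCollection> featureValues;
  std::vector<float> bleuScores, modelScores;

  // Distinct 100-best list for sentence 0; BLEU enters the objective with
  // weight 5 and the reported score with weight 1 (all values made up).
  std::vector<std::vector<const Moses::Word*> > nbest = decoder.getNBest(
      "das ist ein haus", /*sentenceid=*/0, /*nbestSize=*/100,
      /*bleuObjectiveweight=*/5.0f, /*bleuScoreWeight=*/1.0f,
      featureValues, bleuScores, modelScores,
      /*numReturnedTranslations=*/1, /*realBleu=*/false, /*distinct=*/true,
      /*avgRefLength=*/false, /*rank=*/0, /*epoch=*/0, /*filename=*/"");

  decoder.cleanup(/*chartDecoding=*/false);
  return 0;
}

After the call, featureValues, bleuScores and modelScores presumably hold one entry per returned hypothesis; those per-hypothesis differences are what the Hildreth optimiser in the next file consumes.
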
diff --git a/mira/Hildreth.cpp b/mira/Hildreth.cpp
index 53d1e0881..03076e767 100644
--- a/mira/Hildreth.cpp
+++ b/mira/Hildreth.cpp
@@ -3,186 +3,173 @@
using namespace Moses;
using namespace std;
-namespace Mira {
+namespace Mira
+{
- vector<float> Hildreth::optimise (const vector<ScoreComponentCollection>& a, const vector<float>& b) {
+vector<float> Hildreth::optimise (const vector<ScoreComponentCollection>& a, const vector<float>& b)
+{
- size_t i;
- int max_iter = 10000;
- float eps = 0.00000001;
- float zero = 0.000000000001;
+ size_t i;
+ int max_iter = 10000;
+ float eps = 0.00000001;
+ float zero = 0.000000000001;
- vector<float> alpha ( b.size() );
- vector<float> F ( b.size() );
- vector<float> kkt ( b.size() );
+ vector<float> alpha ( b.size() );
+ vector<float> F ( b.size() );
+ vector<float> kkt ( b.size() );
- float max_kkt = -1e100;
+ float max_kkt = -1e100;
- size_t K = b.size();
+ size_t K = b.size();
- float A[K][K];
- bool is_computed[K];
- for ( i = 0; i < K; i++ )
- {
- A[i][i] = a[i].InnerProduct(a[i]);
- is_computed[i] = false;
+ float A[K][K];
+ bool is_computed[K];
+ for ( i = 0; i < K; i++ ) {
+ A[i][i] = a[i].InnerProduct(a[i]);
+ is_computed[i] = false;
+ }
+
+ int max_kkt_i = -1;
+
+
+ for ( i = 0; i < b.size(); i++ ) {
+ F[i] = b[i];
+ kkt[i] = F[i];
+ if ( kkt[i] > max_kkt ) {
+ max_kkt = kkt[i];
+ max_kkt_i = i;
}
+ }
- int max_kkt_i = -1;
+ int iter = 0;
+ float diff_alpha;
+ float try_alpha;
+ float add_alpha;
+ while ( max_kkt >= eps && iter < max_iter ) {
- for ( i = 0; i < b.size(); i++ )
- {
- F[i] = b[i];
+ diff_alpha = A[max_kkt_i][max_kkt_i] <= zero ? 0.0 : F[max_kkt_i]/A[max_kkt_i][max_kkt_i];
+ try_alpha = alpha[max_kkt_i] + diff_alpha;
+ add_alpha = 0.0;
+
+ if ( try_alpha < 0.0 )
+ add_alpha = -1.0 * alpha[max_kkt_i];
+ else
+ add_alpha = diff_alpha;
+
+ alpha[max_kkt_i] = alpha[max_kkt_i] + add_alpha;
+
+ if ( !is_computed[max_kkt_i] ) {
+ for ( i = 0; i < K; i++ ) {
+ A[i][max_kkt_i] = a[i].InnerProduct(a[max_kkt_i] ); // for version 1
+ //A[i][max_kkt_i] = 0; // for version 1
+ is_computed[max_kkt_i] = true;
+ }
+ }
+
+ for ( i = 0; i < F.size(); i++ ) {
+ F[i] -= add_alpha * A[i][max_kkt_i];
kkt[i] = F[i];
- if ( kkt[i] > max_kkt )
- {
+ if ( alpha[i] > zero )
+ kkt[i] = abs ( F[i] );
+ }
+ max_kkt = -1e100;
+ max_kkt_i = -1;
+ for ( i = 0; i < F.size(); i++ )
+ if ( kkt[i] > max_kkt ) {
max_kkt = kkt[i];
max_kkt_i = i;
}
- }
- int iter = 0;
- float diff_alpha;
- float try_alpha;
- float add_alpha;
-
- while ( max_kkt >= eps && iter < max_iter )
- {
-
- diff_alpha = A[max_kkt_i][max_kkt_i] <= zero ? 0.0 : F[max_kkt_i]/A[max_kkt_i][max_kkt_i];
- try_alpha = alpha[max_kkt_i] + diff_alpha;
- add_alpha = 0.0;
-
- if ( try_alpha < 0.0 )
- add_alpha = -1.0 * alpha[max_kkt_i];
- else
- add_alpha = diff_alpha;
-
- alpha[max_kkt_i] = alpha[max_kkt_i] + add_alpha;
-
- if ( !is_computed[max_kkt_i] )
- {
- for ( i = 0; i < K; i++ )
- {
- A[i][max_kkt_i] = a[i].InnerProduct(a[max_kkt_i] ); // for version 1
- //A[i][max_kkt_i] = 0; // for version 1
- is_computed[max_kkt_i] = true;
- }
- }
+ iter++;
+ }
- for ( i = 0; i < F.size(); i++ )
- {
- F[i] -= add_alpha * A[i][max_kkt_i];
- kkt[i] = F[i];
- if ( alpha[i] > zero )
- kkt[i] = abs ( F[i] );
- }
- max_kkt = -1e100;
- max_kkt_i = -1;
- for ( i = 0; i < F.size(); i++ )
- if ( kkt[i] > max_kkt )
- {
- max_kkt = kkt[i];
- max_kkt_i = i;
- }
-
- iter++;
- }
+ return alpha;
+}
- return alpha;
- }
+vector<float> Hildreth::optimise (const vector<ScoreComponentCollection>& a, const vector<float>& b, float C)
+{
- vector<float> Hildreth::optimise (const vector<ScoreComponentCollection>& a, const vector<float>& b, float C) {
+ size_t i;
+ int max_iter = 10000;
+ float eps = 0.00000001;
+ float zero = 0.000000000001;
- size_t i;
- int max_iter = 10000;
- float eps = 0.00000001;
- float zero = 0.000000000001;
+ vector<float> alpha ( b.size() );
+ vector<float> F ( b.size() );
+ vector<float> kkt ( b.size() );
- vector<float> alpha ( b.size() );
- vector<float> F ( b.size() );
- vector<float> kkt ( b.size() );
+ float max_kkt = -1e100;
- float max_kkt = -1e100;
+ size_t K = b.size();
+
+ float A[K][K];
+ bool is_computed[K];
+ for ( i = 0; i < K; i++ ) {
+ A[i][i] = a[i].InnerProduct(a[i]);
+ is_computed[i] = false;
+ }
+
+ int max_kkt_i = -1;
- size_t K = b.size();
- float A[K][K];
- bool is_computed[K];
- for ( i = 0; i < K; i++ )
- {
- A[i][i] = a[i].InnerProduct(a[i]);
- is_computed[i] = false;
+ for ( i = 0; i < b.size(); i++ ) {
+ F[i] = b[i];
+ kkt[i] = F[i];
+ if ( kkt[i] > max_kkt ) {
+ max_kkt = kkt[i];
+ max_kkt_i = i;
}
+ }
- int max_kkt_i = -1;
+ int iter = 0;
+ float diff_alpha;
+ float try_alpha;
+ float add_alpha;
+ while ( max_kkt >= eps && iter < max_iter ) {
- for ( i = 0; i < b.size(); i++ )
- {
- F[i] = b[i];
- kkt[i] = F[i];
- if ( kkt[i] > max_kkt )
- {
- max_kkt = kkt[i];
- max_kkt_i = i;
- }
- }
+ diff_alpha = A[max_kkt_i][max_kkt_i] <= zero ? 0.0 : F[max_kkt_i]/A[max_kkt_i][max_kkt_i];
+ try_alpha = alpha[max_kkt_i] + diff_alpha;
+ add_alpha = 0.0;
- int iter = 0;
- float diff_alpha;
- float try_alpha;
- float add_alpha;
-
- while ( max_kkt >= eps && iter < max_iter )
- {
-
- diff_alpha = A[max_kkt_i][max_kkt_i] <= zero ? 0.0 : F[max_kkt_i]/A[max_kkt_i][max_kkt_i];
- try_alpha = alpha[max_kkt_i] + diff_alpha;
- add_alpha = 0.0;
-
- if ( try_alpha < 0.0 )
- add_alpha = -1.0 * alpha[max_kkt_i];
- else if (try_alpha > C)
- add_alpha = C - alpha[max_kkt_i];
- else
- add_alpha = diff_alpha;
-
- alpha[max_kkt_i] = alpha[max_kkt_i] + add_alpha;
-
- if ( !is_computed[max_kkt_i] )
- {
- for ( i = 0; i < K; i++ )
- {
- A[i][max_kkt_i] = a[i].InnerProduct(a[max_kkt_i] ); // for version 1
- //A[i][max_kkt_i] = 0; // for version 1
- is_computed[max_kkt_i] = true;
- }
- }
+ if ( try_alpha < 0.0 )
+ add_alpha = -1.0 * alpha[max_kkt_i];
+ else if (try_alpha > C)
+ add_alpha = C - alpha[max_kkt_i];
+ else
+ add_alpha = diff_alpha;
- for ( i = 0; i < F.size(); i++ )
- {
- F[i] -= add_alpha * A[i][max_kkt_i];
- kkt[i] = F[i];
- if (alpha[i] > C - zero)
- kkt[i]=-kkt[i];
- else if (alpha[i] > zero)
- kkt[i] = abs(F[i]);
+ alpha[max_kkt_i] = alpha[max_kkt_i] + add_alpha;
+ if ( !is_computed[max_kkt_i] ) {
+ for ( i = 0; i < K; i++ ) {
+ A[i][max_kkt_i] = a[i].InnerProduct(a[max_kkt_i] ); // for version 1
+ //A[i][max_kkt_i] = 0; // for version 1
+ is_computed[max_kkt_i] = true;
}
- max_kkt = -1e100;
- max_kkt_i = -1;
- for ( i = 0; i < F.size(); i++ )
- if ( kkt[i] > max_kkt )
- {
- max_kkt = kkt[i];
- max_kkt_i = i;
- }
-
- iter++;
}
- return alpha;
+ for ( i = 0; i < F.size(); i++ ) {
+ F[i] -= add_alpha * A[i][max_kkt_i];
+ kkt[i] = F[i];
+ if (alpha[i] > C - zero)
+ kkt[i]=-kkt[i];
+ else if (alpha[i] > zero)
+ kkt[i] = abs(F[i]);
+
+ }
+ max_kkt = -1e100;
+ max_kkt_i = -1;
+ for ( i = 0; i < F.size(); i++ )
+ if ( kkt[i] > max_kkt ) {
+ max_kkt = kkt[i];
+ max_kkt_i = i;
+ }
+
+ iter++;
}
+
+ return alpha;
+}
}
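
The two functions above implement Hildreth's iterative procedure for small box-constrained quadratic programs: each step picks the constraint with the largest KKT violation, takes an exact minimising step along that single dual coordinate, clips it into the feasible box, and lazily fills the corresponding column of the Gram matrix A. One caveat worth noting: the code calls plain abs(F[i]) on a float, which may resolve to the integer overload depending on which headers are in scope; the sketch below uses std::fabs instead. What follows is a self-contained re-statement over plain std::vector<float>, an illustration under those assumptions rather than a drop-in replacement; passing a very large C reproduces the first, uncapped overload.

// Self-contained sketch of the same Hildreth iteration over plain vectors
// (no Moses types). Illustration only.
#include <cmath>
#include <cstdio>
#include <vector>

static float dot(const std::vector<float>& x, const std::vector<float>& y) {
  float s = 0.0f;
  for (size_t d = 0; d < x.size(); ++d) s += x[d] * y[d];
  return s;
}

// Maximises sum_i b[i]*alpha[i] - 0.5*||sum_i alpha[i]*a[i]||^2
// subject to 0 <= alpha[i] <= C; pass a huge C to mimic the uncapped overload.
static std::vector<float> hildreth(const std::vector<std::vector<float> >& a,
                                   const std::vector<float>& b, float C) {
  const size_t K = b.size();
  const float eps = 1e-8f;
  const float zero = 1e-12f;
  std::vector<float> alpha(K, 0.0f);
  std::vector<float> F(b);    // gradient: F[i] = b[i] - sum_j alpha[j]*A[i][j]
  std::vector<float> kkt(b);  // KKT violation of each constraint
  std::vector<std::vector<float> > A(K, std::vector<float>(K, 0.0f));
  std::vector<bool> computed(K, false);
  for (size_t i = 0; i < K; ++i) A[i][i] = dot(a[i], a[i]);

  int maxI = -1;
  float maxKkt = -1e30f;
  for (size_t i = 0; i < K; ++i)
    if (kkt[i] > maxKkt) { maxKkt = kkt[i]; maxI = (int)i; }

  for (int iter = 0; iter < 10000 && maxKkt >= eps; ++iter) {
    // Exact step along the most violated coordinate, clipped to [0, C].
    float diff = A[maxI][maxI] <= zero ? 0.0f : F[maxI] / A[maxI][maxI];
    float tryAlpha = alpha[maxI] + diff;
    float add = tryAlpha < 0.0f ? -alpha[maxI]
              : tryAlpha > C    ? C - alpha[maxI]
              : diff;
    alpha[maxI] += add;

    if (!computed[maxI]) {  // fill one Gram column lazily, as the original does
      for (size_t i = 0; i < K; ++i) A[i][maxI] = dot(a[i], a[maxI]);
      computed[maxI] = true;
    }

    for (size_t i = 0; i < K; ++i) {
      F[i] -= add * A[i][maxI];
      kkt[i] = F[i];
      if (alpha[i] > C - zero)  kkt[i] = -kkt[i];          // at the upper bound
      else if (alpha[i] > zero) kkt[i] = std::fabs(F[i]);  // strictly interior
    }
    maxKkt = -1e30f; maxI = -1;
    for (size_t i = 0; i < K; ++i)
      if (kkt[i] > maxKkt) { maxKkt = kkt[i]; maxI = (int)i; }
  }
  return alpha;
}

int main() {
  // Two toy constraints in 2-D, loosely modelled on test_hildreth_5 below.
  std::vector<std::vector<float> > a(2, std::vector<float>(2, 0.0f));
  a[0][0] = -2.0672f;
  a[1][0] = 4.4283f;
  std::vector<float> b;
  b.push_back(2.73485f);
  b.push_back(3.64118f);
  std::vector<float> alpha = hildreth(a, b, 0.01f);
  for (size_t i = 0; i < alpha.size(); ++i)
    std::printf("alpha %zu: %g\n", i, alpha[i]);
  return 0;
}

Run on two toy constraints loosely modelled on test_hildreth_5 further down, it prints the resulting dual alphas for the 0.01-slack case.
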
diff --git a/mira/Hildreth.h b/mira/Hildreth.h
index da52995aa..373f2ac43 100644
--- a/mira/Hildreth.h
+++ b/mira/Hildreth.h
@@ -1,11 +1,13 @@
#include "moses/FeatureVector.h"
#include "moses/ScoreComponentCollection.h"
-namespace Mira {
+namespace Mira
+{
- class Hildreth {
- public :
- static std::vector<float> optimise (const std::vector<Moses::ScoreComponentCollection>& a, const std::vector<float>& b );
- static std::vector<float> optimise (const std::vector<Moses::ScoreComponentCollection>& a, const std::vector<float>& b, float C);
- };
+class Hildreth
+{
+public:
+ static std::vector<float> optimise (const std::vector<Moses::ScoreComponentCollection>& a, const std::vector<float>& b );
+ static std::vector<float> optimise (const std::vector<Moses::ScoreComponentCollection>& a, const std::vector<float>& b, float C);
+};
}
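
In optimisation terms, the two overloads declared above appear to solve the dual of the MIRA large-margin step, with a_i the feature-difference vectors and b_i the loss-minus-model-score margins; this is a reconstruction from the code, not a documented contract:

\max_{\alpha}\; \sum_{i=1}^{K} b_i\,\alpha_i \;-\; \frac{1}{2}\,\Bigl\lVert \sum_{i=1}^{K} \alpha_i\,a_i \Bigr\rVert^{2}
\qquad \text{s.t.}\quad 0 \le \alpha_i,\ \text{and additionally } \alpha_i \le C \text{ in the second overload.}

The partial derivative with respect to \alpha_i is b_i - \sum_j \alpha_j \langle a_i, a_j \rangle, which is precisely the quantity F[i] maintained incrementally in Hildreth.cpp, and the induced primal update w \leftarrow w + \sum_i \alpha_i a_i is what the test cases below assemble as totalUpdate1 and totalUpdate2.
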
diff --git a/mira/HildrethTest.cpp b/mira/HildrethTest.cpp
index a32dcd1d3..43e4403e4 100644
--- a/mira/HildrethTest.cpp
+++ b/mira/HildrethTest.cpp
@@ -34,22 +34,31 @@ using namespace Mira;
namespace MosesTest
{
-class MockSingleFeature : public StatelessFeatureFunction {
- public:
- MockSingleFeature(): StatelessFeatureFunction("MockSingle",1) {}
- std::string GetScoreProducerWeightShortName(unsigned) const {return "sf";}
+class MockSingleFeature : public StatelessFeatureFunction
+{
+public:
+ MockSingleFeature(): StatelessFeatureFunction("MockSingle",1) {}
+ std::string GetScoreProducerWeightShortName(unsigned) const {
+ return "sf";
+ }
};
-class MockMultiFeature : public StatelessFeatureFunction {
- public:
- MockMultiFeature(): StatelessFeatureFunction("MockMulti",5) {}
- std::string GetScoreProducerWeightShortName(unsigned) const {return "mf";}
+class MockMultiFeature : public StatelessFeatureFunction
+{
+public:
+ MockMultiFeature(): StatelessFeatureFunction("MockMulti",5) {}
+ std::string GetScoreProducerWeightShortName(unsigned) const {
+ return "mf";
+ }
};
-class MockSparseFeature : public StatelessFeatureFunction {
- public:
- MockSparseFeature(): StatelessFeatureFunction("MockSparse", ScoreProducer::unlimited) {}
- std::string GetScoreProducerWeightShortName(unsigned) const {return "sf";}
+class MockSparseFeature : public StatelessFeatureFunction
+{
+public:
+ MockSparseFeature(): StatelessFeatureFunction("MockSparse", ScoreProducer::unlimited) {}
+ std::string GetScoreProducerWeightShortName(unsigned) const {
+ return "sf";
+ }
};
struct MockProducers {
@@ -66,716 +75,716 @@ BOOST_AUTO_TEST_SUITE(hildreth_test)
BOOST_FIXTURE_TEST_CASE(test_hildreth_1, MockProducers)
{
- // Feasible example with 2 constraints
- cerr << "\n>>>>>Hildreth test, without slack and with 0.01 slack" << endl << endl;
- vector< ScoreComponentCollection> featureValueDiffs;
- vector< float> lossMinusModelScoreDiff;
-
- // initial weights
- float w[] = { 1, 1, 1, 1, 0 };
- vector<float> vec(w,w+5);
- ScoreComponentCollection weights;
- weights.PlusEquals(&multi, vec);
-
- // feature values (second is oracle)
- //float arr1[] = {0, -5, -27.0908, -1.83258, 0 };
- //float arr2[] = {0, -5, -29.158, -1.83258, 0 };
- //float arr3[] = {0, -5, -27.0908, -1.83258, 0 };
-
- // feature value differences (to oracle)
- ScoreComponentCollection s1, s2, s3;
- float arr1[] = { 0, 0, -2.0672, 0, 0 };
- float arr2[] = { 0, 0, 0, 0, 0 };
- float arr3[] = { 0, 0, -2.0672, 0, 0 };
-
- float loss1 = 2.34085;
- float loss2 = 0;
- float loss3 = 2.34085;
-
- vector<float> vec1(arr1,arr1+5);
- vector<float> vec2(arr2,arr2+5);
- vector<float> vec3(arr3,arr3+5);
-
- s1.PlusEquals(&multi,vec1);
- s2.PlusEquals(&multi,vec2);
- s3.PlusEquals(&multi,vec3);
-
- featureValueDiffs.push_back(s1);
- featureValueDiffs.push_back(s2);
- featureValueDiffs.push_back(s3);
-
- cerr << "feature value diff: " << featureValueDiffs[0] << endl;
- cerr << "feature value diff: " << featureValueDiffs[1] << endl;
- cerr << "feature value diff: " << featureValueDiffs[2] << endl << endl;
-
- float oldModelScoreDiff1 = featureValueDiffs[0].InnerProduct(weights);
- float oldModelScoreDiff2 = featureValueDiffs[1].InnerProduct(weights);
- float oldModelScoreDiff3 = featureValueDiffs[2].InnerProduct(weights);
-
- cerr << "model score diff: " << oldModelScoreDiff1 << ", loss: " << loss1 << endl;
- cerr << "model score diff: " << oldModelScoreDiff2 << ", loss: " << loss2 << endl;
- cerr << "model score diff: " << oldModelScoreDiff3 << ", loss: " << loss3 << endl << endl;
-
- lossMinusModelScoreDiff.push_back(loss1 - oldModelScoreDiff1);
- lossMinusModelScoreDiff.push_back(loss2 - oldModelScoreDiff2);
- lossMinusModelScoreDiff.push_back(loss3 - oldModelScoreDiff3);
-
- vector< float> alphas1 = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff);
- vector< float> alphas2 = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff, 0.01);
-
- cerr << "\nalphas without slack:" << endl;
- for (size_t i = 0; i < alphas1.size(); ++i) {
- cerr << "alpha " << i << ": " << alphas1[i] << endl;
- }
- cerr << endl;
-
- cerr << "partial updates:" << endl;
- vector< ScoreComponentCollection> featureValueDiffs1(featureValueDiffs);
- FVector totalUpdate1 = ScoreComponentCollection::CreateFVector();
- for (size_t k = 0; k < featureValueDiffs1.size(); ++k) {
- featureValueDiffs1[k].MultiplyEquals(alphas1[k]);
- cerr << k << ": " << featureValueDiffs1[k].GetScoresVector() << endl;
- FVector update = featureValueDiffs1[k].GetScoresVector();
- totalUpdate1 += update;
- }
- cerr << endl;
- cerr << "total update: " << totalUpdate1 << endl << endl;
-
- ScoreComponentCollection weightsUpdate1(weights);
- weightsUpdate1.PlusEquals(totalUpdate1);
- cerr << "new weights: " << weightsUpdate1 << endl << endl;
-
- float newModelScoreDiff1 = featureValueDiffs[0].InnerProduct(weightsUpdate1);
- float newModelScoreDiff2 = featureValueDiffs[1].InnerProduct(weightsUpdate1);
- float newModelScoreDiff3 = featureValueDiffs[2].InnerProduct(weightsUpdate1);
-
- cerr << "new model score diff: " << newModelScoreDiff1 << ", loss: " << loss1 << endl;
- cerr << "new model score diff: " << newModelScoreDiff2 << ", loss: " << loss2 << endl;
- cerr << "new model score diff: " << newModelScoreDiff3 << ", loss: " << loss3 << endl;
-
- cerr << "\n\nalphas with slack 0.01:" << endl;
- for (size_t i = 0; i < alphas2.size(); ++i) {
- cerr << "alpha " << i << ": " << alphas2[i] << endl;
- }
- cerr << endl;
-
- cerr << "partial updates:" << endl;
- vector< ScoreComponentCollection> featureValueDiffs2(featureValueDiffs);
- FVector totalUpdate2 = ScoreComponentCollection::CreateFVector();
- for (size_t k = 0; k < featureValueDiffs2.size(); ++k) {
- featureValueDiffs2[k].MultiplyEquals(alphas2[k]);
- cerr << k << ": " << featureValueDiffs2[k].GetScoresVector() << endl;
- FVector update = featureValueDiffs2[k].GetScoresVector();
- totalUpdate2 += update;
- }
- cerr << endl;
- cerr << "total update: " << totalUpdate2 << endl << endl;
-
- ScoreComponentCollection weightsUpdate2(weights);
- weightsUpdate2.PlusEquals(totalUpdate2);
- cerr << "new weights: " << weightsUpdate2 << endl << endl;
-
- float newModelScoreDiff4 = featureValueDiffs[0].InnerProduct(weightsUpdate2);
- float newModelScoreDiff5 = featureValueDiffs[1].InnerProduct(weightsUpdate2);
- float newModelScoreDiff6 = featureValueDiffs[2].InnerProduct(weightsUpdate2);
-
- cerr << "new model score diff: " << newModelScoreDiff4 << ", loss: " << loss1 << endl;
- cerr << "new model score diff: " << newModelScoreDiff5 << ", loss: " << loss2 << endl;
- cerr << "new model score diff: " << newModelScoreDiff6 << ", loss: " << loss3 << endl;
+ // Feasible example with 2 constraints
+ cerr << "\n>>>>>Hildreth test, without slack and with 0.01 slack" << endl << endl;
+ vector< ScoreComponentCollection> featureValueDiffs;
+ vector< float> lossMinusModelScoreDiff;
+
+ // initial weights
+ float w[] = { 1, 1, 1, 1, 0 };
+ vector<float> vec(w,w+5);
+ ScoreComponentCollection weights;
+ weights.PlusEquals(&multi, vec);
+
+ // feature values (second is oracle)
+ //float arr1[] = {0, -5, -27.0908, -1.83258, 0 };
+ //float arr2[] = {0, -5, -29.158, -1.83258, 0 };
+ //float arr3[] = {0, -5, -27.0908, -1.83258, 0 };
+
+ // feature value differences (to oracle)
+ ScoreComponentCollection s1, s2, s3;
+ float arr1[] = { 0, 0, -2.0672, 0, 0 };
+ float arr2[] = { 0, 0, 0, 0, 0 };
+ float arr3[] = { 0, 0, -2.0672, 0, 0 };
+
+ float loss1 = 2.34085;
+ float loss2 = 0;
+ float loss3 = 2.34085;
+
+ vector<float> vec1(arr1,arr1+5);
+ vector<float> vec2(arr2,arr2+5);
+ vector<float> vec3(arr3,arr3+5);
+
+ s1.PlusEquals(&multi,vec1);
+ s2.PlusEquals(&multi,vec2);
+ s3.PlusEquals(&multi,vec3);
+
+ featureValueDiffs.push_back(s1);
+ featureValueDiffs.push_back(s2);
+ featureValueDiffs.push_back(s3);
+
+ cerr << "feature value diff: " << featureValueDiffs[0] << endl;
+ cerr << "feature value diff: " << featureValueDiffs[1] << endl;
+ cerr << "feature value diff: " << featureValueDiffs[2] << endl << endl;
+
+ float oldModelScoreDiff1 = featureValueDiffs[0].InnerProduct(weights);
+ float oldModelScoreDiff2 = featureValueDiffs[1].InnerProduct(weights);
+ float oldModelScoreDiff3 = featureValueDiffs[2].InnerProduct(weights);
+
+ cerr << "model score diff: " << oldModelScoreDiff1 << ", loss: " << loss1 << endl;
+ cerr << "model score diff: " << oldModelScoreDiff2 << ", loss: " << loss2 << endl;
+ cerr << "model score diff: " << oldModelScoreDiff3 << ", loss: " << loss3 << endl << endl;
+
+ lossMinusModelScoreDiff.push_back(loss1 - oldModelScoreDiff1);
+ lossMinusModelScoreDiff.push_back(loss2 - oldModelScoreDiff2);
+ lossMinusModelScoreDiff.push_back(loss3 - oldModelScoreDiff3);
+
+ vector< float> alphas1 = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff);
+ vector< float> alphas2 = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff, 0.01);
+
+ cerr << "\nalphas without slack:" << endl;
+ for (size_t i = 0; i < alphas1.size(); ++i) {
+ cerr << "alpha " << i << ": " << alphas1[i] << endl;
+ }
+ cerr << endl;
+
+ cerr << "partial updates:" << endl;
+ vector< ScoreComponentCollection> featureValueDiffs1(featureValueDiffs);
+ FVector totalUpdate1 = ScoreComponentCollection::CreateFVector();
+ for (size_t k = 0; k < featureValueDiffs1.size(); ++k) {
+ featureValueDiffs1[k].MultiplyEquals(alphas1[k]);
+ cerr << k << ": " << featureValueDiffs1[k].GetScoresVector() << endl;
+ FVector update = featureValueDiffs1[k].GetScoresVector();
+ totalUpdate1 += update;
+ }
+ cerr << endl;
+ cerr << "total update: " << totalUpdate1 << endl << endl;
+
+ ScoreComponentCollection weightsUpdate1(weights);
+ weightsUpdate1.PlusEquals(totalUpdate1);
+ cerr << "new weights: " << weightsUpdate1 << endl << endl;
+
+ float newModelScoreDiff1 = featureValueDiffs[0].InnerProduct(weightsUpdate1);
+ float newModelScoreDiff2 = featureValueDiffs[1].InnerProduct(weightsUpdate1);
+ float newModelScoreDiff3 = featureValueDiffs[2].InnerProduct(weightsUpdate1);
+
+ cerr << "new model score diff: " << newModelScoreDiff1 << ", loss: " << loss1 << endl;
+ cerr << "new model score diff: " << newModelScoreDiff2 << ", loss: " << loss2 << endl;
+ cerr << "new model score diff: " << newModelScoreDiff3 << ", loss: " << loss3 << endl;
+
+ cerr << "\n\nalphas with slack 0.01:" << endl;
+ for (size_t i = 0; i < alphas2.size(); ++i) {
+ cerr << "alpha " << i << ": " << alphas2[i] << endl;
+ }
+ cerr << endl;
+
+ cerr << "partial updates:" << endl;
+ vector< ScoreComponentCollection> featureValueDiffs2(featureValueDiffs);
+ FVector totalUpdate2 = ScoreComponentCollection::CreateFVector();
+ for (size_t k = 0; k < featureValueDiffs2.size(); ++k) {
+ featureValueDiffs2[k].MultiplyEquals(alphas2[k]);
+ cerr << k << ": " << featureValueDiffs2[k].GetScoresVector() << endl;
+ FVector update = featureValueDiffs2[k].GetScoresVector();
+ totalUpdate2 += update;
+ }
+ cerr << endl;
+ cerr << "total update: " << totalUpdate2 << endl << endl;
+
+ ScoreComponentCollection weightsUpdate2(weights);
+ weightsUpdate2.PlusEquals(totalUpdate2);
+ cerr << "new weights: " << weightsUpdate2 << endl << endl;
+
+ float newModelScoreDiff4 = featureValueDiffs[0].InnerProduct(weightsUpdate2);
+ float newModelScoreDiff5 = featureValueDiffs[1].InnerProduct(weightsUpdate2);
+ float newModelScoreDiff6 = featureValueDiffs[2].InnerProduct(weightsUpdate2);
+
+ cerr << "new model score diff: " << newModelScoreDiff4 << ", loss: " << loss1 << endl;
+ cerr << "new model score diff: " << newModelScoreDiff5 << ", loss: " << loss2 << endl;
+ cerr << "new model score diff: " << newModelScoreDiff6 << ", loss: " << loss3 << endl;
}
BOOST_FIXTURE_TEST_CASE(test_hildreth_3, MockProducers)
{
- // Unfeasible example with 21 constraints
- cerr << "\n>>>>>Hildreth test, without slack and with 0.01 slack" << endl << endl;
- vector< ScoreComponentCollection> featureValueDiffs;
- vector< float> lossMinusModelScoreDiff;
-
- // initial weights
- float w[] = { 1, 1, 0.638672, 1, 0 };
- vector<float> vec(w,w+5);
- ScoreComponentCollection weights;
- weights.PlusEquals(&multi, vec);
-
- int numberOfConstraints = 21;
-
- // feature value differences (to oracle)
- // NOTE: these feature values are only approximations
- ScoreComponentCollection s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15, s16, s17, s18, s19, s20, s21;
- float arr1[] = { 0, 0, -2.0672, 0, 0 };
- float arr2[] = { 0, 0, 0, 0, 0 };
- float arr3[] = { 0, 0, -2.08436, 1.38629, 0 };
- float arr4[] = { 0, 0, -0.0171661, 1.38629, 0 };
- float arr5[] = { 0, 0, 4.4283, 0, 0 };
- float arr6[] = { 0, 0, 3.84829, 1.38629, 0 };
- float arr7[] = { 0, 0, 6.83689, 0, 0 };
- float arr8[] = { 0, 0, 0, 0, 0 };
- float arr9[] = { 0, 0, -2.0672, 0, 0 };
- float arr10[] = { 0, 0, -0.0171661, 1.38629, 0 };
- float arr11[] = { 0, 0, -2.08436, 1.38629, 0 };
- float arr12[] = { 0, 0, 4.4283, 0, 0 };
- float arr13[] = { 3, 0, 2.41089, 0, 0 };
- float arr14[] = { 3, 0, 2.32709, 0, 0 };
- float arr15[] = { 0, 0, -2.0672, 0, 0 };
- float arr16[] = { 0, 0, -2.08436, 1.38629, 0 };
- float arr17[] = { 0, 0, 4.4283, 0, 0 };
- float arr18[] = { 0, 0, 3.84829, 1.38629, 0 };
- float arr19[] = { 0, 0, -0.0171661, 1.38629, 0 };
- float arr20[] = { 0, 0, 0, 0, 0 };
- float arr21[] = { 0, 0, 6.83689, 0, 0 };
-
- vector<float> losses;
- losses.push_back(2.73485);
- losses.push_back(0);
- losses.push_back(3.64118);
- losses.push_back(1.47347);
- losses.push_back(3.64118);
- losses.push_back(4.16278);
- losses.push_back(3.13952);
- losses.push_back(0);
- losses.push_back(2.73485);
- losses.push_back(1.47347);
- losses.push_back(3.64118);
- losses.push_back(3.64118);
- losses.push_back(2.51662);
- losses.push_back(2.73485);
- losses.push_back(2.73485);
- losses.push_back(3.64118);
- losses.push_back(3.64118);
- losses.push_back(4.16278);
- losses.push_back(1.47347);
- losses.push_back(0);
- losses.push_back(3.13952);
-
- vector<float> vec1(arr1,arr1+5);
- vector<float> vec2(arr2,arr2+5);
- vector<float> vec3(arr3,arr3+5);
- vector<float> vec4(arr4,arr4+5);
- vector<float> vec5(arr5,arr5+5);
- vector<float> vec6(arr6,arr6+5);
- vector<float> vec7(arr7,arr7+5);
- vector<float> vec8(arr8,arr8+5);
- vector<float> vec9(arr9,arr9+5);
- vector<float> vec10(arr10,arr10+5);
- vector<float> vec11(arr11,arr11+5);
- vector<float> vec12(arr12,arr12+5);
- vector<float> vec13(arr13,arr13+5);
- vector<float> vec14(arr14,arr14+5);
- vector<float> vec15(arr15,arr15+5);
- vector<float> vec16(arr16,arr16+5);
- vector<float> vec17(arr17,arr17+5);
- vector<float> vec18(arr18,arr18+5);
- vector<float> vec19(arr19,arr19+5);
- vector<float> vec20(arr20,arr20+5);
- vector<float> vec21(arr21,arr21+5);
-
- s1.PlusEquals(&multi,vec1);
- s2.PlusEquals(&multi,vec2);
- s3.PlusEquals(&multi,vec3);
- s4.PlusEquals(&multi,vec4);
- s5.PlusEquals(&multi,vec5);
- s6.PlusEquals(&multi,vec6);
- s7.PlusEquals(&multi,vec7);
- s8.PlusEquals(&multi,vec8);
- s9.PlusEquals(&multi,vec9);
- s10.PlusEquals(&multi,vec10);
- s11.PlusEquals(&multi,vec11);
- s12.PlusEquals(&multi,vec12);
- s13.PlusEquals(&multi,vec13);
- s14.PlusEquals(&multi,vec14);
- s15.PlusEquals(&multi,vec15);
- s16.PlusEquals(&multi,vec16);
- s17.PlusEquals(&multi,vec17);
- s18.PlusEquals(&multi,vec18);
- s19.PlusEquals(&multi,vec19);
- s20.PlusEquals(&multi,vec20);
- s21.PlusEquals(&multi,vec21);
-
- featureValueDiffs.push_back(s1);
- featureValueDiffs.push_back(s2);
- featureValueDiffs.push_back(s3);
- featureValueDiffs.push_back(s4);
- featureValueDiffs.push_back(s5);
- featureValueDiffs.push_back(s6);
- featureValueDiffs.push_back(s7);
- featureValueDiffs.push_back(s8);
- featureValueDiffs.push_back(s9);
- featureValueDiffs.push_back(s10);
- featureValueDiffs.push_back(s11);
- featureValueDiffs.push_back(s12);
- featureValueDiffs.push_back(s13);
- featureValueDiffs.push_back(s14);
- featureValueDiffs.push_back(s15);
- featureValueDiffs.push_back(s16);
- featureValueDiffs.push_back(s17);
- featureValueDiffs.push_back(s18);
- featureValueDiffs.push_back(s19);
- featureValueDiffs.push_back(s20);
- featureValueDiffs.push_back(s21);
-
- vector<float> oldModelScoreDiff;
- for (int i = 0; i < numberOfConstraints; ++i) {
- oldModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weights));
- }
-
- for (int i = 0; i < numberOfConstraints; ++i) {
- cerr << "old model score diff: " << oldModelScoreDiff[i] << ", loss: " << losses[i] << "\t" << (oldModelScoreDiff[i] >= losses[i] ? 1 : 0) << endl;
- }
-
- for (int i = 0; i < numberOfConstraints; ++i) {
- lossMinusModelScoreDiff.push_back(losses[i] - oldModelScoreDiff[i]);
- }
-
- for (int i = 0; i < numberOfConstraints; ++i) {
- cerr << "A: " << featureValueDiffs[i] << ", b: " << lossMinusModelScoreDiff[i] << endl;
- }
-
- vector< float> alphas1 = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff);
- vector< float> alphas2 = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff, 0.01);
-
- cerr << "\nalphas without slack:" << endl;
- for (size_t i = 0; i < alphas1.size(); ++i) {
- cerr << "alpha " << i << ": " << alphas1[i] << endl;
- }
- cerr << endl;
-
- cerr << "partial updates:" << endl;
- vector< ScoreComponentCollection> featureValueDiffs1(featureValueDiffs);
- FVector totalUpdate1 = ScoreComponentCollection::CreateFVector();
- for (size_t k = 0; k < featureValueDiffs1.size(); ++k) {
- featureValueDiffs1[k].MultiplyEquals(alphas1[k]);
- cerr << k << ": " << featureValueDiffs1[k].GetScoresVector() << endl;
- FVector update = featureValueDiffs1[k].GetScoresVector();
- totalUpdate1 += update;
- }
- cerr << endl;
- cerr << "total update: " << totalUpdate1 << endl << endl;
-
- ScoreComponentCollection weightsUpdate1(weights);
- weightsUpdate1.PlusEquals(totalUpdate1);
- cerr << "old weights: " << weights << endl;
- cerr << "new weights: " << weightsUpdate1 << endl << endl;
-
- vector<float> newModelScoreDiff;
- for (int i = 0; i < numberOfConstraints; ++i) {
- newModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weightsUpdate1));
- }
-
- for (int i = 0; i < numberOfConstraints; ++i) {
- cerr << "new model score diff: " << newModelScoreDiff[i] << ", loss: " << losses[i] << "\t" << (newModelScoreDiff[i] >= losses[i] ? 1 : 0) << endl;
- }
-
- cerr << "\n\nalphas with slack 0.01:" << endl;
- for (size_t i = 0; i < alphas2.size(); ++i) {
- cerr << "alpha " << i << ": " << alphas2[i] << endl;
- }
- cerr << endl;
-
- cerr << "partial updates:" << endl;
- vector< ScoreComponentCollection> featureValueDiffs2(featureValueDiffs);
- FVector totalUpdate2 = ScoreComponentCollection::CreateFVector();
- for (size_t k = 0; k < featureValueDiffs2.size(); ++k) {
- featureValueDiffs2[k].MultiplyEquals(alphas2[k]);
- cerr << k << ": " << featureValueDiffs2[k].GetScoresVector() << endl;
- FVector update = featureValueDiffs2[k].GetScoresVector();
- totalUpdate2 += update;
- }
- cerr << endl;
- cerr << "total update: " << totalUpdate2 << endl << endl;
-
- ScoreComponentCollection weightsUpdate2(weights);
- weightsUpdate2.PlusEquals(totalUpdate2);
- cerr << "old weights: " << weights << endl;
- cerr << "new weights: " << weightsUpdate2 << endl << endl;
-
- newModelScoreDiff.clear();
- for (int i = 0; i < numberOfConstraints; ++i) {
- newModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weightsUpdate2));
- }
-
- for (int i = 0; i < numberOfConstraints; ++i) {
- cerr << "new model score diff: " << newModelScoreDiff[i] << ", loss: " << losses[i] << endl;
- }
+ // Unfeasible example with 21 constraints
+ cerr << "\n>>>>>Hildreth test, without slack and with 0.01 slack" << endl << endl;
+ vector< ScoreComponentCollection> featureValueDiffs;
+ vector< float> lossMinusModelScoreDiff;
+
+ // initial weights
+ float w[] = { 1, 1, 0.638672, 1, 0 };
+ vector<float> vec(w,w+5);
+ ScoreComponentCollection weights;
+ weights.PlusEquals(&multi, vec);
+
+ int numberOfConstraints = 21;
+
+ // feature value differences (to oracle)
+ // NOTE: these feature values are only approximations
+ ScoreComponentCollection s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15, s16, s17, s18, s19, s20, s21;
+ float arr1[] = { 0, 0, -2.0672, 0, 0 };
+ float arr2[] = { 0, 0, 0, 0, 0 };
+ float arr3[] = { 0, 0, -2.08436, 1.38629, 0 };
+ float arr4[] = { 0, 0, -0.0171661, 1.38629, 0 };
+ float arr5[] = { 0, 0, 4.4283, 0, 0 };
+ float arr6[] = { 0, 0, 3.84829, 1.38629, 0 };
+ float arr7[] = { 0, 0, 6.83689, 0, 0 };
+ float arr8[] = { 0, 0, 0, 0, 0 };
+ float arr9[] = { 0, 0, -2.0672, 0, 0 };
+ float arr10[] = { 0, 0, -0.0171661, 1.38629, 0 };
+ float arr11[] = { 0, 0, -2.08436, 1.38629, 0 };
+ float arr12[] = { 0, 0, 4.4283, 0, 0 };
+ float arr13[] = { 3, 0, 2.41089, 0, 0 };
+ float arr14[] = { 3, 0, 2.32709, 0, 0 };
+ float arr15[] = { 0, 0, -2.0672, 0, 0 };
+ float arr16[] = { 0, 0, -2.08436, 1.38629, 0 };
+ float arr17[] = { 0, 0, 4.4283, 0, 0 };
+ float arr18[] = { 0, 0, 3.84829, 1.38629, 0 };
+ float arr19[] = { 0, 0, -0.0171661, 1.38629, 0 };
+ float arr20[] = { 0, 0, 0, 0, 0 };
+ float arr21[] = { 0, 0, 6.83689, 0, 0 };
+
+ vector<float> losses;
+ losses.push_back(2.73485);
+ losses.push_back(0);
+ losses.push_back(3.64118);
+ losses.push_back(1.47347);
+ losses.push_back(3.64118);
+ losses.push_back(4.16278);
+ losses.push_back(3.13952);
+ losses.push_back(0);
+ losses.push_back(2.73485);
+ losses.push_back(1.47347);
+ losses.push_back(3.64118);
+ losses.push_back(3.64118);
+ losses.push_back(2.51662);
+ losses.push_back(2.73485);
+ losses.push_back(2.73485);
+ losses.push_back(3.64118);
+ losses.push_back(3.64118);
+ losses.push_back(4.16278);
+ losses.push_back(1.47347);
+ losses.push_back(0);
+ losses.push_back(3.13952);
+
+ vector<float> vec1(arr1,arr1+5);
+ vector<float> vec2(arr2,arr2+5);
+ vector<float> vec3(arr3,arr3+5);
+ vector<float> vec4(arr4,arr4+5);
+ vector<float> vec5(arr5,arr5+5);
+ vector<float> vec6(arr6,arr6+5);
+ vector<float> vec7(arr7,arr7+5);
+ vector<float> vec8(arr8,arr8+5);
+ vector<float> vec9(arr9,arr9+5);
+ vector<float> vec10(arr10,arr10+5);
+ vector<float> vec11(arr11,arr11+5);
+ vector<float> vec12(arr12,arr12+5);
+ vector<float> vec13(arr13,arr13+5);
+ vector<float> vec14(arr14,arr14+5);
+ vector<float> vec15(arr15,arr15+5);
+ vector<float> vec16(arr16,arr16+5);
+ vector<float> vec17(arr17,arr17+5);
+ vector<float> vec18(arr18,arr18+5);
+ vector<float> vec19(arr19,arr19+5);
+ vector<float> vec20(arr20,arr20+5);
+ vector<float> vec21(arr21,arr21+5);
+
+ s1.PlusEquals(&multi,vec1);
+ s2.PlusEquals(&multi,vec2);
+ s3.PlusEquals(&multi,vec3);
+ s4.PlusEquals(&multi,vec4);
+ s5.PlusEquals(&multi,vec5);
+ s6.PlusEquals(&multi,vec6);
+ s7.PlusEquals(&multi,vec7);
+ s8.PlusEquals(&multi,vec8);
+ s9.PlusEquals(&multi,vec9);
+ s10.PlusEquals(&multi,vec10);
+ s11.PlusEquals(&multi,vec11);
+ s12.PlusEquals(&multi,vec12);
+ s13.PlusEquals(&multi,vec13);
+ s14.PlusEquals(&multi,vec14);
+ s15.PlusEquals(&multi,vec15);
+ s16.PlusEquals(&multi,vec16);
+ s17.PlusEquals(&multi,vec17);
+ s18.PlusEquals(&multi,vec18);
+ s19.PlusEquals(&multi,vec19);
+ s20.PlusEquals(&multi,vec20);
+ s21.PlusEquals(&multi,vec21);
+
+ featureValueDiffs.push_back(s1);
+ featureValueDiffs.push_back(s2);
+ featureValueDiffs.push_back(s3);
+ featureValueDiffs.push_back(s4);
+ featureValueDiffs.push_back(s5);
+ featureValueDiffs.push_back(s6);
+ featureValueDiffs.push_back(s7);
+ featureValueDiffs.push_back(s8);
+ featureValueDiffs.push_back(s9);
+ featureValueDiffs.push_back(s10);
+ featureValueDiffs.push_back(s11);
+ featureValueDiffs.push_back(s12);
+ featureValueDiffs.push_back(s13);
+ featureValueDiffs.push_back(s14);
+ featureValueDiffs.push_back(s15);
+ featureValueDiffs.push_back(s16);
+ featureValueDiffs.push_back(s17);
+ featureValueDiffs.push_back(s18);
+ featureValueDiffs.push_back(s19);
+ featureValueDiffs.push_back(s20);
+ featureValueDiffs.push_back(s21);
+
+ vector<float> oldModelScoreDiff;
+ for (int i = 0; i < numberOfConstraints; ++i) {
+ oldModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weights));
+ }
+
+ for (int i = 0; i < numberOfConstraints; ++i) {
+ cerr << "old model score diff: " << oldModelScoreDiff[i] << ", loss: " << losses[i] << "\t" << (oldModelScoreDiff[i] >= losses[i] ? 1 : 0) << endl;
+ }
+
+ for (int i = 0; i < numberOfConstraints; ++i) {
+ lossMinusModelScoreDiff.push_back(losses[i] - oldModelScoreDiff[i]);
+ }
+
+ for (int i = 0; i < numberOfConstraints; ++i) {
+ cerr << "A: " << featureValueDiffs[i] << ", b: " << lossMinusModelScoreDiff[i] << endl;
+ }
+
+ vector< float> alphas1 = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff);
+ vector< float> alphas2 = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff, 0.01);
+
+ cerr << "\nalphas without slack:" << endl;
+ for (size_t i = 0; i < alphas1.size(); ++i) {
+ cerr << "alpha " << i << ": " << alphas1[i] << endl;
+ }
+ cerr << endl;
+
+ cerr << "partial updates:" << endl;
+ vector< ScoreComponentCollection> featureValueDiffs1(featureValueDiffs);
+ FVector totalUpdate1 = ScoreComponentCollection::CreateFVector();
+ for (size_t k = 0; k < featureValueDiffs1.size(); ++k) {
+ featureValueDiffs1[k].MultiplyEquals(alphas1[k]);
+ cerr << k << ": " << featureValueDiffs1[k].GetScoresVector() << endl;
+ FVector update = featureValueDiffs1[k].GetScoresVector();
+ totalUpdate1 += update;
+ }
+ cerr << endl;
+ cerr << "total update: " << totalUpdate1 << endl << endl;
+
+ ScoreComponentCollection weightsUpdate1(weights);
+ weightsUpdate1.PlusEquals(totalUpdate1);
+ cerr << "old weights: " << weights << endl;
+ cerr << "new weights: " << weightsUpdate1 << endl << endl;
+
+ vector<float> newModelScoreDiff;
+ for (int i = 0; i < numberOfConstraints; ++i) {
+ newModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weightsUpdate1));
+ }
+
+ for (int i = 0; i < numberOfConstraints; ++i) {
+ cerr << "new model score diff: " << newModelScoreDiff[i] << ", loss: " << losses[i] << "\t" << (newModelScoreDiff[i] >= losses[i] ? 1 : 0) << endl;
+ }
+
+ cerr << "\n\nalphas with slack 0.01:" << endl;
+ for (size_t i = 0; i < alphas2.size(); ++i) {
+ cerr << "alpha " << i << ": " << alphas2[i] << endl;
+ }
+ cerr << endl;
+
+ cerr << "partial updates:" << endl;
+ vector< ScoreComponentCollection> featureValueDiffs2(featureValueDiffs);
+ FVector totalUpdate2 = ScoreComponentCollection::CreateFVector();
+ for (size_t k = 0; k < featureValueDiffs2.size(); ++k) {
+ featureValueDiffs2[k].MultiplyEquals(alphas2[k]);
+ cerr << k << ": " << featureValueDiffs2[k].GetScoresVector() << endl;
+ FVector update = featureValueDiffs2[k].GetScoresVector();
+ totalUpdate2 += update;
+ }
+ cerr << endl;
+ cerr << "total update: " << totalUpdate2 << endl << endl;
+
+ ScoreComponentCollection weightsUpdate2(weights);
+ weightsUpdate2.PlusEquals(totalUpdate2);
+ cerr << "old weights: " << weights << endl;
+ cerr << "new weights: " << weightsUpdate2 << endl << endl;
+
+ newModelScoreDiff.clear();
+ for (int i = 0; i < numberOfConstraints; ++i) {
+ newModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weightsUpdate2));
+ }
+
+ for (int i = 0; i < numberOfConstraints; ++i) {
+ cerr << "new model score diff: " << newModelScoreDiff[i] << ", loss: " << losses[i] << endl;
+ }
}
BOOST_FIXTURE_TEST_CASE(test_hildreth_4, MockProducers)
{
- // Feasible example with 8 constraints
- cerr << "\n>>>>>Hildreth test, without slack and with 0.01 slack" << endl << endl;
- vector< ScoreComponentCollection> featureValueDiffs;
- vector< float> lossMinusModelScoreDiff;
-
- // initial weights
- float w[] = { 1, 1, 0.638672, 1, 0 };
- vector<float> vec(w,w+5);
- ScoreComponentCollection weights;
- weights.PlusEquals(&multi, vec);
-
- int numberOfConstraints = 8;
-
- // feature value differences (to oracle)
- // NOTE: these feature values are only approximations
- ScoreComponentCollection s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15, s16, s17, s18, s19, s20, s21;
- float arr1[] = { 0, 0, -2.0672, 0, 0 };
- float arr2[] = { 0, 0, 0, 0, 0 };
- float arr3[] = { 0, 0, -2.08436, 1.38629, 0 };
- float arr4[] = { 0, 0, -0.0171661, 1.38629, 0 };
+ // Feasible example with 8 constraints
+ cerr << "\n>>>>>Hildreth test, without slack and with 0.01 slack" << endl << endl;
+ vector< ScoreComponentCollection> featureValueDiffs;
+ vector< float> lossMinusModelScoreDiff;
+
+ // initial weights
+ float w[] = { 1, 1, 0.638672, 1, 0 };
+ vector<float> vec(w,w+5);
+ ScoreComponentCollection weights;
+ weights.PlusEquals(&multi, vec);
+
+ int numberOfConstraints = 8;
+
+ // feature value differences (to oracle)
+ // NOTE: these feature values are only approximations
+ ScoreComponentCollection s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15, s16, s17, s18, s19, s20, s21;
+ float arr1[] = { 0, 0, -2.0672, 0, 0 };
+ float arr2[] = { 0, 0, 0, 0, 0 };
+ float arr3[] = { 0, 0, -2.08436, 1.38629, 0 };
+ float arr4[] = { 0, 0, -0.0171661, 1.38629, 0 };
// float arr5[] = { 0, 0, 4.4283, 0, 0 };
// float arr6[] = { 0, 0, 3.84829, 1.38629, 0 };
// float arr7[] = { 0, 0, 6.83689, 0, 0 };
- float arr8[] = { 0, 0, 0, 0, 0 };
- float arr9[] = { 0, 0, -2.0672, 0, 0 };
+ float arr8[] = { 0, 0, 0, 0, 0 };
+ float arr9[] = { 0, 0, -2.0672, 0, 0 };
// float arr10[] = { 0, 0, -0.0171661, 1.38629, 0 };
// float arr11[] = { 0, 0, -2.08436, 1.38629, 0 };
// float arr12[] = { 0, 0, 4.4283, 0, 0 };
// float arr13[] = { 3, 0, 2.41089, 0, 0 };
// float arr14[] = { 3, 0, 2.32709, 0, 0 };
- float arr15[] = { 0, 0, -2.0672, 0, 0 };
- float arr16[] = { 0, 0, -2.08436, 1.38629, 0 };
+ float arr15[] = { 0, 0, -2.0672, 0, 0 };
+ float arr16[] = { 0, 0, -2.08436, 1.38629, 0 };
// float arr17[] = { 0, 0, 4.4283, 0, 0 };
// float arr18[] = { 0, 0, 3.84829, 1.38629, 0 };
// float arr19[] = { 0, 0, -0.0171661, 1.38629, 0 };
// float arr20[] = { 0, 0, 0, 0, 0 };
// float arr21[] = { 0, 0, 6.83689, 0, 0 };
- vector<float> losses;
- losses.push_back(2.73485);
- losses.push_back(0);
- losses.push_back(3.64118);
- losses.push_back(1.47347);
+ vector<float> losses;
+ losses.push_back(2.73485);
+ losses.push_back(0);
+ losses.push_back(3.64118);
+ losses.push_back(1.47347);
// losses.push_back(3.64118);
// losses.push_back(4.16278);
// losses.push_back(3.13952);
- losses.push_back(0);
- losses.push_back(2.73485);
+ losses.push_back(0);
+ losses.push_back(2.73485);
// losses.push_back(1.47347);
// losses.push_back(3.64118);
// losses.push_back(3.64118);
// losses.push_back(2.51662);
// losses.push_back(2.73485);
- losses.push_back(2.73485);
- losses.push_back(3.64118);
+ losses.push_back(2.73485);
+ losses.push_back(3.64118);
// losses.push_back(3.64118);
// losses.push_back(4.16278);
// losses.push_back(1.47347);
// losses.push_back(0);
// losses.push_back(3.13952);
- vector<float> vec1(arr1,arr1+5);
- vector<float> vec2(arr2,arr2+5);
- vector<float> vec3(arr3,arr3+5);
- vector<float> vec4(arr4,arr4+5);
+ vector<float> vec1(arr1,arr1+5);
+ vector<float> vec2(arr2,arr2+5);
+ vector<float> vec3(arr3,arr3+5);
+ vector<float> vec4(arr4,arr4+5);
// vector<float> vec5(arr5,arr5+5);
// vector<float> vec6(arr6,arr6+5);
// vector<float> vec7(arr7,arr7+5);
- vector<float> vec8(arr8,arr8+5);
- vector<float> vec9(arr9,arr9+5);
+ vector<float> vec8(arr8,arr8+5);
+ vector<float> vec9(arr9,arr9+5);
// vector<float> vec10(arr10,arr10+5);
// vector<float> vec11(arr11,arr11+5);
// vector<float> vec12(arr12,arr12+5);
// vector<float> vec13(arr13,arr13+5);
// vector<float> vec14(arr14,arr14+5);
- vector<float> vec15(arr15,arr15+5);
- vector<float> vec16(arr16,arr16+5);
+ vector<float> vec15(arr15,arr15+5);
+ vector<float> vec16(arr16,arr16+5);
// vector<float> vec17(arr17,arr17+5);
// vector<float> vec18(arr18,arr18+5);
// vector<float> vec19(arr19,arr19+5);
// vector<float> vec20(arr20,arr20+5);
// vector<float> vec21(arr21,arr21+5);
- s1.PlusEquals(&multi,vec1);
- s2.PlusEquals(&multi,vec2);
- s3.PlusEquals(&multi,vec3);
- s4.PlusEquals(&multi,vec4);
+ s1.PlusEquals(&multi,vec1);
+ s2.PlusEquals(&multi,vec2);
+ s3.PlusEquals(&multi,vec3);
+ s4.PlusEquals(&multi,vec4);
// s5.PlusEquals(&multi,vec5);
// s6.PlusEquals(&multi,vec6);
// s7.PlusEquals(&multi,vec7);
- s8.PlusEquals(&multi,vec8);
- s9.PlusEquals(&multi,vec9);
+ s8.PlusEquals(&multi,vec8);
+ s9.PlusEquals(&multi,vec9);
// s10.PlusEquals(&multi,vec10);
// s11.PlusEquals(&multi,vec11);
// s12.PlusEquals(&multi,vec12);
// s13.PlusEquals(&multi,vec13);
// s14.PlusEquals(&multi,vec14);
- s15.PlusEquals(&multi,vec15);
- s16.PlusEquals(&multi,vec16);
+ s15.PlusEquals(&multi,vec15);
+ s16.PlusEquals(&multi,vec16);
// s17.PlusEquals(&multi,vec17);
// s18.PlusEquals(&multi,vec18);
// s19.PlusEquals(&multi,vec19);
// s20.PlusEquals(&multi,vec20);
// s21.PlusEquals(&multi,vec21);
- featureValueDiffs.push_back(s1);
- featureValueDiffs.push_back(s2);
- featureValueDiffs.push_back(s3);
- featureValueDiffs.push_back(s4);
+ featureValueDiffs.push_back(s1);
+ featureValueDiffs.push_back(s2);
+ featureValueDiffs.push_back(s3);
+ featureValueDiffs.push_back(s4);
// featureValueDiffs.push_back(s5);
// featureValueDiffs.push_back(s6);
// featureValueDiffs.push_back(s7);
- featureValueDiffs.push_back(s8);
- featureValueDiffs.push_back(s9);
+ featureValueDiffs.push_back(s8);
+ featureValueDiffs.push_back(s9);
// featureValueDiffs.push_back(s10);
// featureValueDiffs.push_back(s11);
// featureValueDiffs.push_back(s12);
// featureValueDiffs.push_back(s13);
// featureValueDiffs.push_back(s14);
- featureValueDiffs.push_back(s15);
- featureValueDiffs.push_back(s16);
+ featureValueDiffs.push_back(s15);
+ featureValueDiffs.push_back(s16);
// featureValueDiffs.push_back(s17);
// featureValueDiffs.push_back(s18);
// featureValueDiffs.push_back(s19);
// featureValueDiffs.push_back(s20);
// featureValueDiffs.push_back(s21);
- vector<float> oldModelScoreDiff;
- for (int i = 0; i < numberOfConstraints; ++i) {
- oldModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weights));
- }
-
- for (int i = 0; i < numberOfConstraints; ++i) {
- cerr << "old model score diff: " << oldModelScoreDiff[i] << ", loss: " << losses[i] << "\t" << (oldModelScoreDiff[i] >= losses[i] ? 1 : 0) << endl;
- }
-
- for (int i = 0; i < numberOfConstraints; ++i) {
- lossMinusModelScoreDiff.push_back(losses[i] - oldModelScoreDiff[i]);
- }
-
- for (int i = 0; i < numberOfConstraints; ++i) {
- cerr << "A: " << featureValueDiffs[i] << ", b: " << lossMinusModelScoreDiff[i] << endl;
- }
-
- vector< float> alphas1 = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff);
- vector< float> alphas2 = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff, 0.01);
-
- cerr << "\nalphas without slack:" << endl;
- for (size_t i = 0; i < alphas1.size(); ++i) {
- cerr << "alpha " << i << ": " << alphas1[i] << endl;
- }
- cerr << endl;
-
- cerr << "partial updates:" << endl;
- vector< ScoreComponentCollection> featureValueDiffs1(featureValueDiffs);
- FVector totalUpdate1 = ScoreComponentCollection::CreateFVector();
- for (size_t k = 0; k < featureValueDiffs1.size(); ++k) {
- featureValueDiffs1[k].MultiplyEquals(alphas1[k]);
- cerr << k << ": " << featureValueDiffs1[k].GetScoresVector() << endl;
- FVector update = featureValueDiffs1[k].GetScoresVector();
- totalUpdate1 += update;
- }
- cerr << endl;
- cerr << "total update: " << totalUpdate1 << endl << endl;
-
- ScoreComponentCollection weightsUpdate1(weights);
- weightsUpdate1.PlusEquals(totalUpdate1);
- cerr << "old weights: " << weights << endl;
- cerr << "new weights: " << weightsUpdate1 << endl << endl;
-
- vector<float> newModelScoreDiff;
- for (int i = 0; i < numberOfConstraints; ++i) {
- newModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weightsUpdate1));
- }
-
- for (int i = 0; i < numberOfConstraints; ++i) {
- cerr << "new model score diff: " << newModelScoreDiff[i] << ", loss: " << losses[i] << "\t" << (newModelScoreDiff[i] >= losses[i] ? 1 : 0) << endl;
- }
-
- cerr << "\n\nalphas with slack 0.01:" << endl;
- for (size_t i = 0; i < alphas2.size(); ++i) {
- cerr << "alpha " << i << ": " << alphas2[i] << endl;
- }
- cerr << endl;
-
- cerr << "partial updates:" << endl;
- vector< ScoreComponentCollection> featureValueDiffs2(featureValueDiffs);
- FVector totalUpdate2 = ScoreComponentCollection::CreateFVector();
- for (size_t k = 0; k < featureValueDiffs2.size(); ++k) {
- featureValueDiffs2[k].MultiplyEquals(alphas2[k]);
- cerr << k << ": " << featureValueDiffs2[k].GetScoresVector() << endl;
- FVector update = featureValueDiffs2[k].GetScoresVector();
- totalUpdate2 += update;
- }
- cerr << endl;
- cerr << "total update: " << totalUpdate2 << endl << endl;
-
- ScoreComponentCollection weightsUpdate2(weights);
- weightsUpdate2.PlusEquals(totalUpdate2);
- cerr << "old weights: " << weights << endl;
- cerr << "new weights: " << weightsUpdate2 << endl << endl;
-
- newModelScoreDiff.clear();
- for (int i = 0; i < numberOfConstraints; ++i) {
- newModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weightsUpdate2));
- }
-
- for (int i = 0; i < numberOfConstraints; ++i) {
- cerr << "new model score diff: " << newModelScoreDiff[i] << ", loss: " << losses[i] << endl;
- }
+ vector<float> oldModelScoreDiff;
+ for (int i = 0; i < numberOfConstraints; ++i) {
+ oldModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weights));
+ }
+
+ for (int i = 0; i < numberOfConstraints; ++i) {
+ cerr << "old model score diff: " << oldModelScoreDiff[i] << ", loss: " << losses[i] << "\t" << (oldModelScoreDiff[i] >= losses[i] ? 1 : 0) << endl;
+ }
+
+ for (int i = 0; i < numberOfConstraints; ++i) {
+ lossMinusModelScoreDiff.push_back(losses[i] - oldModelScoreDiff[i]);
+ }
+
+ for (int i = 0; i < numberOfConstraints; ++i) {
+ cerr << "A: " << featureValueDiffs[i] << ", b: " << lossMinusModelScoreDiff[i] << endl;
+ }
+
+ vector< float> alphas1 = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff);
+ vector< float> alphas2 = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff, 0.01);
+
+ cerr << "\nalphas without slack:" << endl;
+ for (size_t i = 0; i < alphas1.size(); ++i) {
+ cerr << "alpha " << i << ": " << alphas1[i] << endl;
+ }
+ cerr << endl;
+
+ cerr << "partial updates:" << endl;
+ vector< ScoreComponentCollection> featureValueDiffs1(featureValueDiffs);
+ FVector totalUpdate1 = ScoreComponentCollection::CreateFVector();
+ for (size_t k = 0; k < featureValueDiffs1.size(); ++k) {
+ featureValueDiffs1[k].MultiplyEquals(alphas1[k]);
+ cerr << k << ": " << featureValueDiffs1[k].GetScoresVector() << endl;
+ FVector update = featureValueDiffs1[k].GetScoresVector();
+ totalUpdate1 += update;
+ }
+ cerr << endl;
+ cerr << "total update: " << totalUpdate1 << endl << endl;
+
+ ScoreComponentCollection weightsUpdate1(weights);
+ weightsUpdate1.PlusEquals(totalUpdate1);
+ cerr << "old weights: " << weights << endl;
+ cerr << "new weights: " << weightsUpdate1 << endl << endl;
+
+ vector<float> newModelScoreDiff;
+ for (int i = 0; i < numberOfConstraints; ++i) {
+ newModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weightsUpdate1));
+ }
+
+ for (int i = 0; i < numberOfConstraints; ++i) {
+ cerr << "new model score diff: " << newModelScoreDiff[i] << ", loss: " << losses[i] << "\t" << (newModelScoreDiff[i] >= losses[i] ? 1 : 0) << endl;
+ }
+
+ cerr << "\n\nalphas with slack 0.01:" << endl;
+ for (size_t i = 0; i < alphas2.size(); ++i) {
+ cerr << "alpha " << i << ": " << alphas2[i] << endl;
+ }
+ cerr << endl;
+
+ cerr << "partial updates:" << endl;
+ vector< ScoreComponentCollection> featureValueDiffs2(featureValueDiffs);
+ FVector totalUpdate2 = ScoreComponentCollection::CreateFVector();
+ for (size_t k = 0; k < featureValueDiffs2.size(); ++k) {
+ featureValueDiffs2[k].MultiplyEquals(alphas2[k]);
+ cerr << k << ": " << featureValueDiffs2[k].GetScoresVector() << endl;
+ FVector update = featureValueDiffs2[k].GetScoresVector();
+ totalUpdate2 += update;
+ }
+ cerr << endl;
+ cerr << "total update: " << totalUpdate2 << endl << endl;
+
+ ScoreComponentCollection weightsUpdate2(weights);
+ weightsUpdate2.PlusEquals(totalUpdate2);
+ cerr << "old weights: " << weights << endl;
+ cerr << "new weights: " << weightsUpdate2 << endl << endl;
+
+ newModelScoreDiff.clear();
+ for (int i = 0; i < numberOfConstraints; ++i) {
+ newModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weightsUpdate2));
+ }
+
+ for (int i = 0; i < numberOfConstraints; ++i) {
+ cerr << "new model score diff: " << newModelScoreDiff[i] << ", loss: " << losses[i] << endl;
+ }
}
BOOST_FIXTURE_TEST_CASE(test_hildreth_5, MockProducers)
{
- // Unfeasible example with 2 constraints
- cerr << "\n>>>>>Hildreth test, without slack and with 0.01 slack" << endl << endl;
- vector< ScoreComponentCollection> featureValueDiffs;
- vector< float> lossMinusModelScoreDiff;
-
- // initial weights
- float w[] = { 1, 1, 0.638672, 1, 0 };
- vector<float> vec(w,w+5);
- ScoreComponentCollection weights;
- weights.PlusEquals(&multi, vec);
-
- int numberOfConstraints = 2;
-
- // feature value differences (to oracle)
- // NOTE: these feature values are only approximations
- ScoreComponentCollection s1, s17;
- float arr1[] = { 0, 0, -2.0672, 0, 0 };
- float arr17[] = { 0, 0, 4.4283, 0, 0 };
- vector<float> losses;
- losses.push_back(2.73485);
- losses.push_back(3.64118);
-
- vector<float> vec1(arr1,arr1+5);
- vector<float> vec17(arr17,arr17+5);
-
- s1.PlusEquals(&multi,vec1);
- s17.PlusEquals(&multi,vec17);
-
- featureValueDiffs.push_back(s1);
- featureValueDiffs.push_back(s17);
-
- vector<float> oldModelScoreDiff;
- for (int i = 0; i < numberOfConstraints; ++i) {
- oldModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weights));
- }
-
- float sumOfOldError = 0;
- for (int i = 0; i < numberOfConstraints; ++i) {
- cerr << "old model score diff: " << oldModelScoreDiff[i] << ", loss: " << losses[i] << "\t" << (oldModelScoreDiff[i] >= losses[i] ? 1 : 0) << endl;
- sumOfOldError += (losses[i] - oldModelScoreDiff[i]);
- }
- cerr << "sum of old error: " << sumOfOldError << endl;
-
- for (int i = 0; i < numberOfConstraints; ++i) {
- lossMinusModelScoreDiff.push_back(losses[i] - oldModelScoreDiff[i]);
- }
-
- for (int i = 0; i < numberOfConstraints; ++i) {
- cerr << "A: " << featureValueDiffs[i] << ", b: " << lossMinusModelScoreDiff[i] << endl;
- }
-
- vector< float> alphas1 = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff);
- vector< float> alphas2 = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff, 0.01);
- vector< float> alphas3 = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff, 0.1);
-
- cerr << "\nalphas without slack:" << endl;
- for (size_t i = 0; i < alphas1.size(); ++i) {
- cerr << "alpha " << i << ": " << alphas1[i] << endl;
- }
- cerr << endl;
-
- cerr << "partial updates:" << endl;
- vector< ScoreComponentCollection> featureValueDiffs1(featureValueDiffs);
- FVector totalUpdate1 = ScoreComponentCollection::CreateFVector();
- for (size_t k = 0; k < featureValueDiffs1.size(); ++k) {
- featureValueDiffs1[k].MultiplyEquals(alphas1[k]);
- cerr << k << ": " << featureValueDiffs1[k].GetScoresVector() << endl;
- FVector update = featureValueDiffs1[k].GetScoresVector();
- totalUpdate1 += update;
- }
- cerr << endl;
- cerr << "total update: " << totalUpdate1 << endl << endl;
-
- ScoreComponentCollection weightsUpdate1(weights);
- weightsUpdate1.PlusEquals(totalUpdate1);
- cerr << "old weights: " << weights << endl;
- cerr << "new weights: " << weightsUpdate1 << endl << endl;
-
- vector<float> newModelScoreDiff;
- for (int i = 0; i < numberOfConstraints; ++i) {
- newModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weightsUpdate1));
- }
-
- float sumOfNewError = 0;
- for (int i = 0; i < numberOfConstraints; ++i) {
- cerr << "new model score diff: " << newModelScoreDiff[i] << ", loss: " << losses[i] << "\t" << (newModelScoreDiff[i] >= losses[i] ? 1 : 0) << endl;
- sumOfNewError += (losses[i] - newModelScoreDiff[i]);
- }
- cerr << "sum of new error: " << sumOfNewError << endl;
-
- cerr << "\n\nalphas with slack 0.01:" << endl;
- for (size_t i = 0; i < alphas2.size(); ++i) {
- cerr << "alpha " << i << ": " << alphas2[i] << endl;
- }
- cerr << endl;
-
- cerr << "partial updates:" << endl;
- vector< ScoreComponentCollection> featureValueDiffs2(featureValueDiffs);
- FVector totalUpdate2 = ScoreComponentCollection::CreateFVector();
- for (size_t k = 0; k < featureValueDiffs2.size(); ++k) {
- featureValueDiffs2[k].MultiplyEquals(alphas2[k]);
- cerr << k << ": " << featureValueDiffs2[k].GetScoresVector() << endl;
- FVector update = featureValueDiffs2[k].GetScoresVector();
- totalUpdate2 += update;
- }
- cerr << endl;
- cerr << "total update: " << totalUpdate2 << endl << endl;
-
- ScoreComponentCollection weightsUpdate2(weights);
- weightsUpdate2.PlusEquals(totalUpdate2);
- cerr << "old weights: " << weights << endl;
- cerr << "new weights: " << weightsUpdate2 << endl << endl;
-
- newModelScoreDiff.clear();
- for (int i = 0; i < numberOfConstraints; ++i) {
- newModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weightsUpdate2));
- }
-
- sumOfNewError = 0;
- for (int i = 0; i < numberOfConstraints; ++i) {
- cerr << "new model score diff: " << newModelScoreDiff[i] << ", loss: " << losses[i] << "\t" << (newModelScoreDiff[i] >= losses[i] ? 1 : 0) << endl;
- sumOfNewError += (losses[i] - newModelScoreDiff[i]);
- }
- cerr << "sum of new error: " << sumOfNewError << endl;
-
- cerr << "\n\nalphas with slack 0.1:" << endl;
- for (size_t i = 0; i < alphas3.size(); ++i) {
- cerr << "alpha " << i << ": " << alphas3[i] << endl;
- }
- cerr << endl;
-
- cerr << "partial updates:" << endl;
- vector< ScoreComponentCollection> featureValueDiffs3(featureValueDiffs);
- FVector totalUpdate3 = ScoreComponentCollection::CreateFVector();
- for (size_t k = 0; k < featureValueDiffs3.size(); ++k) {
- featureValueDiffs3[k].MultiplyEquals(alphas3[k]);
- cerr << k << ": " << featureValueDiffs3[k].GetScoresVector() << endl;
- FVector update = featureValueDiffs3[k].GetScoresVector();
- totalUpdate3 += update;
- }
- cerr << endl;
- cerr << "total update: " << totalUpdate3 << endl << endl;
-
- ScoreComponentCollection weightsUpdate3(weights);
- weightsUpdate3.PlusEquals(totalUpdate3);
- cerr << "old weights: " << weights << endl;
- cerr << "new weights: " << weightsUpdate3 << endl << endl;
-
- newModelScoreDiff.clear();
- for (int i = 0; i < numberOfConstraints; ++i) {
- newModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weightsUpdate3));
- }
-
- sumOfNewError = 0;
- for (int i = 0; i < numberOfConstraints; ++i) {
- cerr << "new model score diff: " << newModelScoreDiff[i] << ", loss: " << losses[i] << "\t" << (newModelScoreDiff[i] >= losses[i] ? 1 : 0) << endl;
- sumOfNewError += (losses[i] - newModelScoreDiff[i]);
- }
- cerr << "sum of new error: " << sumOfNewError << endl;
+ // Unfeasible example with 2 constraints
+ cerr << "\n>>>>>Hildreth test, without slack and with 0.01 slack" << endl << endl;
+ vector< ScoreComponentCollection> featureValueDiffs;
+ vector< float> lossMinusModelScoreDiff;
+
+ // initial weights
+ float w[] = { 1, 1, 0.638672, 1, 0 };
+ vector<float> vec(w,w+5);
+ ScoreComponentCollection weights;
+ weights.PlusEquals(&multi, vec);
+
+ int numberOfConstraints = 2;
+
+ // feature value differences (to oracle)
+ // NOTE: these feature values are only approximations
+ ScoreComponentCollection s1, s17;
+ float arr1[] = { 0, 0, -2.0672, 0, 0 };
+ float arr17[] = { 0, 0, 4.4283, 0, 0 };
+ vector<float> losses;
+ losses.push_back(2.73485);
+ losses.push_back(3.64118);
+
+ vector<float> vec1(arr1,arr1+5);
+ vector<float> vec17(arr17,arr17+5);
+
+ s1.PlusEquals(&multi,vec1);
+ s17.PlusEquals(&multi,vec17);
+
+ featureValueDiffs.push_back(s1);
+ featureValueDiffs.push_back(s17);
+
+ vector<float> oldModelScoreDiff;
+ for (int i = 0; i < numberOfConstraints; ++i) {
+ oldModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weights));
+ }
+
+ float sumOfOldError = 0;
+ for (int i = 0; i < numberOfConstraints; ++i) {
+ cerr << "old model score diff: " << oldModelScoreDiff[i] << ", loss: " << losses[i] << "\t" << (oldModelScoreDiff[i] >= losses[i] ? 1 : 0) << endl;
+ sumOfOldError += (losses[i] - oldModelScoreDiff[i]);
+ }
+ cerr << "sum of old error: " << sumOfOldError << endl;
+
+ for (int i = 0; i < numberOfConstraints; ++i) {
+ lossMinusModelScoreDiff.push_back(losses[i] - oldModelScoreDiff[i]);
+ }
+
+ for (int i = 0; i < numberOfConstraints; ++i) {
+ cerr << "A: " << featureValueDiffs[i] << ", b: " << lossMinusModelScoreDiff[i] << endl;
+ }
+
+ vector< float> alphas1 = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff);
+ vector< float> alphas2 = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff, 0.01);
+ vector< float> alphas3 = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiff, 0.1);
+
+ cerr << "\nalphas without slack:" << endl;
+ for (size_t i = 0; i < alphas1.size(); ++i) {
+ cerr << "alpha " << i << ": " << alphas1[i] << endl;
+ }
+ cerr << endl;
+
+ cerr << "partial updates:" << endl;
+ vector< ScoreComponentCollection> featureValueDiffs1(featureValueDiffs);
+ FVector totalUpdate1 = ScoreComponentCollection::CreateFVector();
+ for (size_t k = 0; k < featureValueDiffs1.size(); ++k) {
+ featureValueDiffs1[k].MultiplyEquals(alphas1[k]);
+ cerr << k << ": " << featureValueDiffs1[k].GetScoresVector() << endl;
+ FVector update = featureValueDiffs1[k].GetScoresVector();
+ totalUpdate1 += update;
+ }
+ cerr << endl;
+ cerr << "total update: " << totalUpdate1 << endl << endl;
+
+ ScoreComponentCollection weightsUpdate1(weights);
+ weightsUpdate1.PlusEquals(totalUpdate1);
+ cerr << "old weights: " << weights << endl;
+ cerr << "new weights: " << weightsUpdate1 << endl << endl;
+
+ vector<float> newModelScoreDiff;
+ for (int i = 0; i < numberOfConstraints; ++i) {
+ newModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weightsUpdate1));
+ }
+
+ float sumOfNewError = 0;
+ for (int i = 0; i < numberOfConstraints; ++i) {
+ cerr << "new model score diff: " << newModelScoreDiff[i] << ", loss: " << losses[i] << "\t" << (newModelScoreDiff[i] >= losses[i] ? 1 : 0) << endl;
+ sumOfNewError += (losses[i] - newModelScoreDiff[i]);
+ }
+ cerr << "sum of new error: " << sumOfNewError << endl;
+
+ cerr << "\n\nalphas with slack 0.01:" << endl;
+ for (size_t i = 0; i < alphas2.size(); ++i) {
+ cerr << "alpha " << i << ": " << alphas2[i] << endl;
+ }
+ cerr << endl;
+
+ cerr << "partial updates:" << endl;
+ vector< ScoreComponentCollection> featureValueDiffs2(featureValueDiffs);
+ FVector totalUpdate2 = ScoreComponentCollection::CreateFVector();
+ for (size_t k = 0; k < featureValueDiffs2.size(); ++k) {
+ featureValueDiffs2[k].MultiplyEquals(alphas2[k]);
+ cerr << k << ": " << featureValueDiffs2[k].GetScoresVector() << endl;
+ FVector update = featureValueDiffs2[k].GetScoresVector();
+ totalUpdate2 += update;
+ }
+ cerr << endl;
+ cerr << "total update: " << totalUpdate2 << endl << endl;
+
+ ScoreComponentCollection weightsUpdate2(weights);
+ weightsUpdate2.PlusEquals(totalUpdate2);
+ cerr << "old weights: " << weights << endl;
+ cerr << "new weights: " << weightsUpdate2 << endl << endl;
+
+ newModelScoreDiff.clear();
+ for (int i = 0; i < numberOfConstraints; ++i) {
+ newModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weightsUpdate2));
+ }
+
+ sumOfNewError = 0;
+ for (int i = 0; i < numberOfConstraints; ++i) {
+ cerr << "new model score diff: " << newModelScoreDiff[i] << ", loss: " << losses[i] << "\t" << (newModelScoreDiff[i] >= losses[i] ? 1 : 0) << endl;
+ sumOfNewError += (losses[i] - newModelScoreDiff[i]);
+ }
+ cerr << "sum of new error: " << sumOfNewError << endl;
+
+ cerr << "\n\nalphas with slack 0.1:" << endl;
+ for (size_t i = 0; i < alphas3.size(); ++i) {
+ cerr << "alpha " << i << ": " << alphas3[i] << endl;
+ }
+ cerr << endl;
+
+ cerr << "partial updates:" << endl;
+ vector< ScoreComponentCollection> featureValueDiffs3(featureValueDiffs);
+ FVector totalUpdate3 = ScoreComponentCollection::CreateFVector();
+ for (size_t k = 0; k < featureValueDiffs3.size(); ++k) {
+ featureValueDiffs3[k].MultiplyEquals(alphas3[k]);
+ cerr << k << ": " << featureValueDiffs3[k].GetScoresVector() << endl;
+ FVector update = featureValueDiffs3[k].GetScoresVector();
+ totalUpdate3 += update;
+ }
+ cerr << endl;
+ cerr << "total update: " << totalUpdate3 << endl << endl;
+
+ ScoreComponentCollection weightsUpdate3(weights);
+ weightsUpdate3.PlusEquals(totalUpdate3);
+ cerr << "old weights: " << weights << endl;
+ cerr << "new weights: " << weightsUpdate3 << endl << endl;
+
+ newModelScoreDiff.clear();
+ for (int i = 0; i < numberOfConstraints; ++i) {
+ newModelScoreDiff.push_back(featureValueDiffs[i].InnerProduct(weightsUpdate3));
+ }
+
+ sumOfNewError = 0;
+ for (int i = 0; i < numberOfConstraints; ++i) {
+ cerr << "new model score diff: " << newModelScoreDiff[i] << ", loss: " << losses[i] << "\t" << (newModelScoreDiff[i] >= losses[i] ? 1 : 0) << endl;
+ sumOfNewError += (losses[i] - newModelScoreDiff[i]);
+ }
+ cerr << "sum of new error: " << sumOfNewError << endl;
}
BOOST_AUTO_TEST_SUITE_END()
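
Note: the test above feeds Hildreth::optimise a constraint matrix A of feature-value differences and a vector b of loss-minus-model-score differences, once without a slack cap and twice with caps of 0.01 and 0.1. For orientation, here is a minimal, self-contained sketch of a Hildreth-style dual coordinate ascent over plain float vectors; the function name, signature and stopping rule are illustrative assumptions, not the actual Moses implementation.

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Dual coordinate ascent for  max_a  b.a - 1/2 ||sum_i a_i A_i||^2
    // subject to a_i >= 0 (and a_i <= C when a slack cap C >= 0 is given).
    static float dot(const std::vector<float>& x, const std::vector<float>& y) {
      float s = 0;
      for (std::size_t i = 0; i < x.size(); ++i) s += x[i] * y[i];
      return s;
    }

    std::vector<float> hildreth(const std::vector<std::vector<float> >& A,
                                const std::vector<float>& b,
                                float C = -1,   // C < 0: no slack cap
                                std::size_t maxIter = 1000, float eps = 1e-5f) {
      const std::size_t m = A.size();
      std::vector<float> alpha(m, 0.0f), F(b);  // F[i]: remaining violation
      std::vector<std::vector<float> > K(m, std::vector<float>(m, 0.0f));
      for (std::size_t i = 0; i < m; ++i)
        for (std::size_t j = 0; j < m; ++j)
          K[i][j] = dot(A[i], A[j]);            // Gram matrix of constraints

      for (std::size_t iter = 0; iter < maxIter; ++iter) {
        std::size_t best = m;
        float bestDelta = 0;
        for (std::size_t i = 0; i < m; ++i) {   // most effective single step
          if (K[i][i] <= 0) continue;
          float next = alpha[i] + F[i] / K[i][i];
          if (next < 0) next = 0;
          if (C >= 0 && next > C) next = C;     // clip to the slack cap
          float delta = next - alpha[i];
          if (std::fabs(delta) > std::fabs(bestDelta)) {
            bestDelta = delta;
            best = i;
          }
        }
        if (best == m || std::fabs(bestDelta) < eps) break;  // converged
        alpha[best] += bestDelta;
        for (std::size_t i = 0; i < m; ++i)     // update all violations
          F[i] -= bestDelta * K[best][i];
      }
      return alpha;
    }

Called as hildreth(A, b), hildreth(A, b, 0.01f) and hildreth(A, b, 0.1f), this mirrors the three optimise calls exercised by the test.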
diff --git a/mira/HypothesisQueue.cpp b/mira/HypothesisQueue.cpp
index 43e082b92..8c8daa4da 100644
--- a/mira/HypothesisQueue.cpp
+++ b/mira/HypothesisQueue.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -22,13 +22,16 @@
using namespace std;
-namespace Moses {
+namespace Moses
+{
-HypothesisQueue::~HypothesisQueue() {
+HypothesisQueue::~HypothesisQueue()
+{
m_queue.clear();
}
-void HypothesisQueue::Push(BleuIndexPair hypo) {
+void HypothesisQueue::Push(BleuIndexPair hypo)
+{
//pair<set<BleuIndexPair>::iterator,bool> ret;
if (m_capacity == 0 || m_queue.size() < m_capacity) {
@@ -52,7 +55,8 @@ void HypothesisQueue::Push(BleuIndexPair hypo) {
}
}
-BleuIndexPair HypothesisQueue::Pop() {
+BleuIndexPair HypothesisQueue::Pop()
+{
HypoQueueType::iterator p = m_queue.begin();
BleuIndexPair top = *p;
m_queue.erase(p);
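
Note: the header comment (see the HypothesisQueue.h diff below) specifies the intended Push behaviour: insert while there is room, otherwise replace the current worst entry only if the new score beats it. A compact sketch of that bounded best-first container, reconstructed from the visible fragments and therefore only an approximation of the real class:

    #include <cstddef>
    #include <set>
    #include <utility>

    typedef std::pair<float, std::size_t> BleuIndexPair;  // (Bleu, n-best index)

    struct BestFirst {
      bool operator()(const BleuIndexPair& a, const BleuIndexPair& b) const {
        return a.first > b.first;               // higher Bleu sorts earlier
      }
    };

    class BoundedQueue {
    public:
      explicit BoundedQueue(std::size_t capacity) : m_capacity(capacity) {}
      void Push(BleuIndexPair hypo) {
        if (m_capacity == 0 || m_queue.size() < m_capacity) {
          m_queue.insert(hypo);                 // room left (0 = unbounded)
        } else if (hypo.first > (--m_queue.end())->first) {
          m_queue.erase(--m_queue.end());       // evict the current worst
          m_queue.insert(hypo);
        }                                       // otherwise drop the new one
      }
      BleuIndexPair Pop() {                     // remove and return the best
        std::multiset<BleuIndexPair, BestFirst>::iterator p = m_queue.begin();
        BleuIndexPair top = *p;
        m_queue.erase(p);
        return top;
      }
      bool Empty() const { return m_queue.empty(); }
    private:
      std::multiset<BleuIndexPair, BestFirst> m_queue;
      std::size_t m_capacity;
    };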
diff --git a/mira/HypothesisQueue.h b/mira/HypothesisQueue.h
index a926a40da..63cabbd0f 100644
--- a/mira/HypothesisQueue.h
+++ b/mira/HypothesisQueue.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -21,7 +21,8 @@
#include <set>
-namespace Moses {
+namespace Moses
+{
// pair of Bleu score and index
typedef std::pair<float, size_t> BleuIndexPair;
@@ -30,14 +31,17 @@ typedef std::pair<float, size_t> BleuIndexPair;
// the best scoring hypothesis. The queue assumes ownership of pushed items and
// relinquishes ownership when they are popped. Any remaining items at the
// time of the queue's destruction are deleted.
-class HypothesisQueue {
+class HypothesisQueue
+{
- public:
+public:
// Create empty queue with fixed capacity of c. Capacity 0 means unbounded.
HypothesisQueue(size_t c) : m_capacity(c) {}
~HypothesisQueue();
- bool Empty() { return m_queue.empty(); }
+ bool Empty() {
+ return m_queue.empty();
+ }
// Add the hypo to the queue or delete it if the queue is full and the
// score is no better than the queue's worst score.
@@ -47,17 +51,17 @@ class HypothesisQueue {
// caller is responsible for deleting the object.
BleuIndexPair Pop();
- private:
+private:
struct HypothesisOrderer {
bool operator()(BleuIndexPair a,
- BleuIndexPair b) {
+ BleuIndexPair b) {
return (a.first > b.first);
}
};
typedef std::multiset<BleuIndexPair, HypothesisOrderer> HypoQueueType;
//typedef std::set<BleuIndexPair, HypothesisOrderer> HypoQueueType;
-
+
HypoQueueType m_queue;
const size_t m_capacity;
};
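
Note: a hypothetical usage sketch of the class as declared above (scores and indices invented for illustration):

    #include <iostream>
    #include "HypothesisQueue.h"   // the header shown above
    using namespace Moses;

    int main() {
      HypothesisQueue q(2);             // keep only the 2 best; 0 = unbounded
      q.Push(BleuIndexPair(0.31f, 0));  // (Bleu score, n-best index)
      q.Push(BleuIndexPair(0.54f, 1));
      q.Push(BleuIndexPair(0.48f, 2));  // queue full: evicts the 0.31 entry
      while (!q.Empty()) {
        BleuIndexPair best = q.Pop();   // best-first: 0.54, then 0.48
        std::cout << "index " << best.second << ", Bleu " << best.first << std::endl;
      }
      return 0;
    }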
diff --git a/mira/Main.cpp b/mira/Main.cpp
index 2c62256d9..0dbc9be43 100644
--- a/mira/Main.cpp
+++ b/mira/Main.cpp
@@ -54,7 +54,8 @@ using namespace std;
using namespace Moses;
namespace po = boost::program_options;
-int main(int argc, char** argv) {
+int main(int argc, char** argv)
+{
size_t rank = 0;
size_t size = 1;
#ifdef MPI_ENABLE
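
Note: the MPI_ENABLE guard above implies the usual Boost.MPI bootstrap; a minimal sketch of that pattern follows (the actual wiring in Main.cpp may differ):

    #include <cstddef>
    #ifdef MPI_ENABLE
    #include <boost/mpi.hpp>
    namespace mpi = boost::mpi;
    #endif

    int main(int argc, char** argv) {
      std::size_t rank = 0, size = 1;   // single-process defaults
    #ifdef MPI_ENABLE
      mpi::environment env(argc, argv); // initialises/finalises MPI
      mpi::communicator world;
      rank = world.rank();              // this process, 0..size-1
      size = world.size();              // number of cooperating processes
    #endif
      // ... each process then decodes and updates on its own shard
      (void)rank; (void)size;
      return 0;
    }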
@@ -141,113 +142,113 @@ int main(int argc, char** argv) {
bool modelPlusBleu, simpleHistoryBleu;
po::options_description desc("Allowed options");
desc.add_options()
- ("continue-epoch", po::value<size_t>(&continue_epoch)->default_value(0), "Continue an interrupted experiment from this epoch on")
- ("freq-reg", po::value<bool>(&reg_on_every_mix)->default_value(false), "Regularize after every weight mixing")
- ("l1sparse", po::value<bool>(&l1_reg_sparse)->default_value(true), "L1-regularization for sparse weights only")
- ("l2sparse", po::value<bool>(&l2_reg_sparse)->default_value(true), "L2-regularization for sparse weights only")
- ("mv-reg", po::value<bool>(&most_violated_reg)->default_value(false), "Regularize most violated constraint")
- ("dbg", po::value<bool>(&debug)->default_value(true), "More debug output")
- ("make-pairs", po::value<bool>(&makePairs)->default_value(true), "Make pairs of hypotheses for 1slack")
- ("debug", po::value<bool>(&debug)->default_value(true), "More debug output")
- ("rescale-slack", po::value<bool>(&rescaleSlack)->default_value(false), "Rescale slack in 1-slack formulation")
- ("disable-bleu-feature", po::value<bool>(&disableBleuFeature)->default_value(false), "Disable the Bleu feature")
- ("real-bleu", po::value<bool>(&realBleu)->default_value(false), "Compute real sentence Bleu on complete translations")
- ("add2lm", po::value<float>(&add2lm)->default_value(0.0), "Add the specified amount to all LM weights")
- ("hildreth", po::value<bool>(&hildreth)->default_value(false), "Prefer Hildreth over analytical update")
- ("selective", po::value<bool>(&selective)->default_value(false), "Build constraints for every feature")
- ("summed", po::value<bool>(&summed)->default_value(false), "Sum up all constraints")
- ("model-plus-bleu", po::value<bool>(&modelPlusBleu)->default_value(false), "Use the sum of model score and +/- bleu to select hope and fear translations")
- ("simple-history-bleu", po::value<bool>(&simpleHistoryBleu)->default_value(false), "Simple history Bleu")
-
- ("bleu-weight", po::value<float>(&bleuWeight)->default_value(1.0), "Bleu weight used in decoder objective")
- ("bw-hope", po::value<float>(&bleuWeight_hope)->default_value(-1.0), "Bleu weight used in decoder objective for hope")
- ("bw-fear", po::value<float>(&bleuWeight_fear)->default_value(-1.0), "Bleu weight used in decoder objective for fear")
-
- ("core-r0", po::value<float>(&core_r0)->default_value(1.0), "Start learning rate for core features")
- ("sparse-r0", po::value<float>(&sparse_r0)->default_value(1.0), "Start learning rate for sparse features")
-
- ("tie-bw-to-lm", po::value<bool>(&bleu_weight_lm)->default_value(false), "Make bleu weight depend on lm weight")
- ("adjust-bw", po::value<bool>(&bleu_weight_lm_adjust)->default_value(false), "Adjust bleu weight when lm weight changes")
- ("bw-lm-factor", po::value<float>(&bleu_weight_lm_factor)->default_value(2.0), "Make bleu weight depend on lm weight by this factor")
- ("bw-factor-fear", po::value<float>(&bleu_weight_fear_factor)->default_value(1.0), "Multiply fear weight by this factor")
- ("accumulate-weights", po::value<bool>(&accumulateWeights)->default_value(false), "Accumulate and average weights over all epochs")
- ("average-weights", po::value<bool>(&averageWeights)->default_value(false), "Set decoder weights to average weights after each update")
- ("avg-ref-length", po::value<bool>(&avgRefLength)->default_value(false), "Use average reference length instead of shortest for BLEU score feature")
- ("batch-equals-shard", po::value<bool>(&batchEqualsShard)->default_value(false), "Batch size is equal to shard size (purely batch)")
- ("batch-size,b", po::value<size_t>(&batchSize)->default_value(1), "Size of batch that is send to optimiser for weight adjustments")
- ("bleu-smoothing-scheme", po::value<size_t>(&bleu_smoothing_scheme)->default_value(1), "Set a smoothing scheme for sentence-Bleu: +1 (1), +0.1 (2), papineni (3) (default:1)")
- ("boost", po::value<bool>(&boost)->default_value(false), "Apply boosting factor to updates on misranked candidates")
- ("config,f", po::value<string>(&mosesConfigFile), "Moses ini-file")
- ("configs-folds", po::value<vector<string> >(&mosesConfigFilesFolds), "Moses ini-files, one for each fold")
- ("debug-model", po::value<bool>(&debug_model)->default_value(false), "Get best model translation for debugging purposes")
- ("decode-hope", po::value<bool>(&decode_hope)->default_value(false), "Decode dev input set according to hope objective")
- ("decode-fear", po::value<bool>(&decode_fear)->default_value(false), "Decode dev input set according to fear objective")
- ("decode-model", po::value<bool>(&decode_model)->default_value(false), "Decode dev input set according to normal objective")
- ("decode-filename", po::value<string>(&decode_filename), "Filename for Bleu objective translations")
- ("decoder-settings", po::value<string>(&decoder_settings)->default_value(""), "Decoder settings for tuning runs")
- ("distinct-nbest", po::value<bool>(&distinctNbest)->default_value(true), "Use n-best list with distinct translations in inference step")
- ("dump-mixed-weights", po::value<bool>(&dumpMixedWeights)->default_value(false), "Dump mixed weights instead of averaged weights")
- ("epochs,e", po::value<size_t>(&epochs)->default_value(10), "Number of epochs")
- ("feature-cutoff", po::value<int>(&featureCutoff)->default_value(-1), "Feature cutoff as additional regularization for sparse features")
- ("fear-n", po::value<int>(&fear_n)->default_value(1), "Number of fear translations used")
- ("help", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
- ("history-bleu", po::value<bool>(&historyBleu)->default_value(false), "Use 1best translations to update the history")
- ("history-smoothing", po::value<float>(&historySmoothing)->default_value(0.9), "Adjust the factor for history smoothing")
- ("hope-fear", po::value<bool>(&hope_fear)->default_value(true), "Use only hope and fear translations for optimisation (not model)")
- ("hope-n", po::value<int>(&hope_n)->default_value(2), "Number of hope translations used")
- ("input-file,i", po::value<string>(&inputFile), "Input file containing tokenised source")
- ("input-files-folds", po::value<vector<string> >(&inputFilesFolds), "Input files containing tokenised source, one for each fold")
- ("learner,l", po::value<string>(&learner)->default_value("mira"), "Learning algorithm")
- ("l1-lambda", po::value<float>(&l1_lambda)->default_value(0.0001), "Lambda for l1-regularization (w_i +/- lambda)")
- ("l2-lambda", po::value<float>(&l2_lambda)->default_value(0.01), "Lambda for l2-regularization (w_i * (1 - lambda))")
- ("l1-reg", po::value<bool>(&l1_regularize)->default_value(false), "L1-regularization")
- ("l2-reg", po::value<bool>(&l2_regularize)->default_value(false), "L2-regularization")
- ("min-bleu-ratio", po::value<float>(&minBleuRatio)->default_value(-1), "Set a minimum BLEU ratio between hope and fear")
- ("max-bleu-ratio", po::value<float>(&maxBleuRatio)->default_value(-1), "Set a maximum BLEU ratio between hope and fear")
- ("max-bleu-diff", po::value<bool>(&max_bleu_diff)->default_value(true), "Select hope/fear with maximum Bleu difference")
- ("min-oracle-bleu", po::value<float>(&min_oracle_bleu)->default_value(0), "Set a minimum oracle BLEU score")
- ("min-weight-change", po::value<float>(&min_weight_change)->default_value(0.0001), "Set minimum weight change for stopping criterion")
- ("mira-learning-rate", po::value<float>(&mira_learning_rate)->default_value(1), "Learning rate for MIRA (fixed or flexible)")
- ("mixing-frequency", po::value<size_t>(&mixingFrequency)->default_value(1), "How often per epoch to mix weights, when using mpi")
- ("model-hope-fear", po::value<bool>(&model_hope_fear)->default_value(false), "Use model, hope and fear translations for optimisation")
- ("moses-src", po::value<string>(&moses_src)->default_value(""), "Moses source directory")
- ("nbest,n", po::value<size_t>(&n)->default_value(1), "Number of translations in n-best list")
- ("normalise-weights", po::value<bool>(&normaliseWeights)->default_value(false), "Whether to normalise the updated weights before passing them to the decoder")
- ("normalise-margin", po::value<bool>(&normaliseMargin)->default_value(false), "Normalise the margin: squash between 0 and 1")
- ("perceptron-learning-rate", po::value<float>(&perceptron_learning_rate)->default_value(0.01), "Perceptron learning rate")
- ("print-feature-values", po::value<bool>(&print_feature_values)->default_value(false), "Print out feature values")
- ("print-feature-counts", po::value<bool>(&printFeatureCounts)->default_value(false), "Print out feature values, print feature list with hope counts after 1st epoch")
- ("print-nbest-with-features", po::value<bool>(&printNbestWithFeatures)->default_value(false), "Print out feature values, print feature list with hope counts after 1st epoch")
- ("print-weights", po::value<bool>(&print_weights)->default_value(false), "Print out current weights")
- ("print-core-weights", po::value<bool>(&print_core_weights)->default_value(true), "Print out current core weights")
- ("prune-zero-weights", po::value<bool>(&pruneZeroWeights)->default_value(false), "Prune zero-valued sparse feature weights")
- ("reference-files,r", po::value<vector<string> >(&referenceFiles), "Reference translation files for training")
- ("reference-files-folds", po::value<vector<string> >(&referenceFilesFolds), "Reference translation files for training, one for each fold")
- ("kbest", po::value<bool>(&kbest)->default_value(false), "Select hope/fear pairs from a list of nbest translations")
-
- ("scale-by-inverse-length", po::value<bool>(&scaleByInverseLength)->default_value(false), "Scale BLEU by (history of) inverse input length")
- ("scale-by-input-length", po::value<bool>(&scaleByInputLength)->default_value(false), "Scale BLEU by (history of) input length")
- ("scale-by-avg-input-length", po::value<bool>(&scaleByAvgInputLength)->default_value(false), "Scale BLEU by average input length")
- ("scale-by-avg-inverse-length", po::value<bool>(&scaleByAvgInverseLength)->default_value(false), "Scale BLEU by average inverse input length")
- ("scale-by-x", po::value<float>(&scaleByX)->default_value(1), "Scale the BLEU score by value x")
- ("scale-lm", po::value<bool>(&scale_lm)->default_value(false), "Scale the language model feature")
- ("scale-factor-lm", po::value<float>(&scale_lm_factor)->default_value(2), "Scale the language model feature by this factor")
- ("scale-wp", po::value<bool>(&scale_wp)->default_value(false), "Scale the word penalty feature")
- ("scale-factor-wp", po::value<float>(&scale_wp_factor)->default_value(2), "Scale the word penalty feature by this factor")
- ("scale-margin", po::value<bool>(&scale_margin)->default_value(0), "Scale the margin by the Bleu score of the oracle translation")
- ("scale-margin-precision", po::value<bool>(&scale_margin_precision)->default_value(0), "Scale margin by precision of oracle")
- ("scale-update", po::value<bool>(&scale_update)->default_value(0), "Scale update by Bleu score of oracle")
- ("scale-update-precision", po::value<bool>(&scale_update_precision)->default_value(0), "Scale update by precision of oracle")
- ("sentence-level-bleu", po::value<bool>(&sentenceBleu)->default_value(true), "Use a sentences level Bleu scoring function")
- ("shuffle", po::value<bool>(&shuffle)->default_value(false), "Shuffle input sentences before processing")
- ("sigmoid-param", po::value<float>(&sigmoidParam)->default_value(1), "y=sigmoidParam is the axis that this sigmoid approaches")
- ("slack", po::value<float>(&slack)->default_value(0.01), "Use slack in optimiser")
- ("sparse-average", po::value<bool>(&sparseAverage)->default_value(false), "Average weights by the number of processes")
- ("sparse-no-average", po::value<bool>(&sparseNoAverage)->default_value(false), "Don't average sparse weights, just sum")
- ("stop-weights", po::value<bool>(&weightConvergence)->default_value(true), "Stop when weights converge")
- ("verbosity,v", po::value<int>(&verbosity)->default_value(0), "Verbosity level")
- ("weight-dump-frequency", po::value<size_t>(&weightDumpFrequency)->default_value(1), "How often per epoch to dump weights (mpi)")
- ("weight-dump-stem", po::value<string>(&weightDumpStem)->default_value("weights"), "Stem of filename to use for dumping weights");
+ ("continue-epoch", po::value<size_t>(&continue_epoch)->default_value(0), "Continue an interrupted experiment from this epoch on")
+ ("freq-reg", po::value<bool>(&reg_on_every_mix)->default_value(false), "Regularize after every weight mixing")
+ ("l1sparse", po::value<bool>(&l1_reg_sparse)->default_value(true), "L1-regularization for sparse weights only")
+ ("l2sparse", po::value<bool>(&l2_reg_sparse)->default_value(true), "L2-regularization for sparse weights only")
+ ("mv-reg", po::value<bool>(&most_violated_reg)->default_value(false), "Regularize most violated constraint")
+ ("dbg", po::value<bool>(&debug)->default_value(true), "More debug output")
+ ("make-pairs", po::value<bool>(&makePairs)->default_value(true), "Make pairs of hypotheses for 1slack")
+ ("debug", po::value<bool>(&debug)->default_value(true), "More debug output")
+ ("rescale-slack", po::value<bool>(&rescaleSlack)->default_value(false), "Rescale slack in 1-slack formulation")
+ ("disable-bleu-feature", po::value<bool>(&disableBleuFeature)->default_value(false), "Disable the Bleu feature")
+ ("real-bleu", po::value<bool>(&realBleu)->default_value(false), "Compute real sentence Bleu on complete translations")
+ ("add2lm", po::value<float>(&add2lm)->default_value(0.0), "Add the specified amount to all LM weights")
+ ("hildreth", po::value<bool>(&hildreth)->default_value(false), "Prefer Hildreth over analytical update")
+ ("selective", po::value<bool>(&selective)->default_value(false), "Build constraints for every feature")
+ ("summed", po::value<bool>(&summed)->default_value(false), "Sum up all constraints")
+ ("model-plus-bleu", po::value<bool>(&modelPlusBleu)->default_value(false), "Use the sum of model score and +/- bleu to select hope and fear translations")
+ ("simple-history-bleu", po::value<bool>(&simpleHistoryBleu)->default_value(false), "Simple history Bleu")
+
+ ("bleu-weight", po::value<float>(&bleuWeight)->default_value(1.0), "Bleu weight used in decoder objective")
+ ("bw-hope", po::value<float>(&bleuWeight_hope)->default_value(-1.0), "Bleu weight used in decoder objective for hope")
+ ("bw-fear", po::value<float>(&bleuWeight_fear)->default_value(-1.0), "Bleu weight used in decoder objective for fear")
+
+ ("core-r0", po::value<float>(&core_r0)->default_value(1.0), "Start learning rate for core features")
+ ("sparse-r0", po::value<float>(&sparse_r0)->default_value(1.0), "Start learning rate for sparse features")
+
+ ("tie-bw-to-lm", po::value<bool>(&bleu_weight_lm)->default_value(false), "Make bleu weight depend on lm weight")
+ ("adjust-bw", po::value<bool>(&bleu_weight_lm_adjust)->default_value(false), "Adjust bleu weight when lm weight changes")
+ ("bw-lm-factor", po::value<float>(&bleu_weight_lm_factor)->default_value(2.0), "Make bleu weight depend on lm weight by this factor")
+ ("bw-factor-fear", po::value<float>(&bleu_weight_fear_factor)->default_value(1.0), "Multiply fear weight by this factor")
+ ("accumulate-weights", po::value<bool>(&accumulateWeights)->default_value(false), "Accumulate and average weights over all epochs")
+ ("average-weights", po::value<bool>(&averageWeights)->default_value(false), "Set decoder weights to average weights after each update")
+ ("avg-ref-length", po::value<bool>(&avgRefLength)->default_value(false), "Use average reference length instead of shortest for BLEU score feature")
+ ("batch-equals-shard", po::value<bool>(&batchEqualsShard)->default_value(false), "Batch size is equal to shard size (purely batch)")
+ ("batch-size,b", po::value<size_t>(&batchSize)->default_value(1), "Size of batch that is send to optimiser for weight adjustments")
+ ("bleu-smoothing-scheme", po::value<size_t>(&bleu_smoothing_scheme)->default_value(1), "Set a smoothing scheme for sentence-Bleu: +1 (1), +0.1 (2), papineni (3) (default:1)")
+ ("boost", po::value<bool>(&boost)->default_value(false), "Apply boosting factor to updates on misranked candidates")
+ ("config,f", po::value<string>(&mosesConfigFile), "Moses ini-file")
+ ("configs-folds", po::value<vector<string> >(&mosesConfigFilesFolds), "Moses ini-files, one for each fold")
+ ("debug-model", po::value<bool>(&debug_model)->default_value(false), "Get best model translation for debugging purposes")
+ ("decode-hope", po::value<bool>(&decode_hope)->default_value(false), "Decode dev input set according to hope objective")
+ ("decode-fear", po::value<bool>(&decode_fear)->default_value(false), "Decode dev input set according to fear objective")
+ ("decode-model", po::value<bool>(&decode_model)->default_value(false), "Decode dev input set according to normal objective")
+ ("decode-filename", po::value<string>(&decode_filename), "Filename for Bleu objective translations")
+ ("decoder-settings", po::value<string>(&decoder_settings)->default_value(""), "Decoder settings for tuning runs")
+ ("distinct-nbest", po::value<bool>(&distinctNbest)->default_value(true), "Use n-best list with distinct translations in inference step")
+ ("dump-mixed-weights", po::value<bool>(&dumpMixedWeights)->default_value(false), "Dump mixed weights instead of averaged weights")
+ ("epochs,e", po::value<size_t>(&epochs)->default_value(10), "Number of epochs")
+ ("feature-cutoff", po::value<int>(&featureCutoff)->default_value(-1), "Feature cutoff as additional regularization for sparse features")
+ ("fear-n", po::value<int>(&fear_n)->default_value(1), "Number of fear translations used")
+ ("help", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
+ ("history-bleu", po::value<bool>(&historyBleu)->default_value(false), "Use 1best translations to update the history")
+ ("history-smoothing", po::value<float>(&historySmoothing)->default_value(0.9), "Adjust the factor for history smoothing")
+ ("hope-fear", po::value<bool>(&hope_fear)->default_value(true), "Use only hope and fear translations for optimisation (not model)")
+ ("hope-n", po::value<int>(&hope_n)->default_value(2), "Number of hope translations used")
+ ("input-file,i", po::value<string>(&inputFile), "Input file containing tokenised source")
+ ("input-files-folds", po::value<vector<string> >(&inputFilesFolds), "Input files containing tokenised source, one for each fold")
+ ("learner,l", po::value<string>(&learner)->default_value("mira"), "Learning algorithm")
+ ("l1-lambda", po::value<float>(&l1_lambda)->default_value(0.0001), "Lambda for l1-regularization (w_i +/- lambda)")
+ ("l2-lambda", po::value<float>(&l2_lambda)->default_value(0.01), "Lambda for l2-regularization (w_i * (1 - lambda))")
+ ("l1-reg", po::value<bool>(&l1_regularize)->default_value(false), "L1-regularization")
+ ("l2-reg", po::value<bool>(&l2_regularize)->default_value(false), "L2-regularization")
+ ("min-bleu-ratio", po::value<float>(&minBleuRatio)->default_value(-1), "Set a minimum BLEU ratio between hope and fear")
+ ("max-bleu-ratio", po::value<float>(&maxBleuRatio)->default_value(-1), "Set a maximum BLEU ratio between hope and fear")
+ ("max-bleu-diff", po::value<bool>(&max_bleu_diff)->default_value(true), "Select hope/fear with maximum Bleu difference")
+ ("min-oracle-bleu", po::value<float>(&min_oracle_bleu)->default_value(0), "Set a minimum oracle BLEU score")
+ ("min-weight-change", po::value<float>(&min_weight_change)->default_value(0.0001), "Set minimum weight change for stopping criterion")
+ ("mira-learning-rate", po::value<float>(&mira_learning_rate)->default_value(1), "Learning rate for MIRA (fixed or flexible)")
+ ("mixing-frequency", po::value<size_t>(&mixingFrequency)->default_value(1), "How often per epoch to mix weights, when using mpi")
+ ("model-hope-fear", po::value<bool>(&model_hope_fear)->default_value(false), "Use model, hope and fear translations for optimisation")
+ ("moses-src", po::value<string>(&moses_src)->default_value(""), "Moses source directory")
+ ("nbest,n", po::value<size_t>(&n)->default_value(1), "Number of translations in n-best list")
+ ("normalise-weights", po::value<bool>(&normaliseWeights)->default_value(false), "Whether to normalise the updated weights before passing them to the decoder")
+ ("normalise-margin", po::value<bool>(&normaliseMargin)->default_value(false), "Normalise the margin: squash between 0 and 1")
+ ("perceptron-learning-rate", po::value<float>(&perceptron_learning_rate)->default_value(0.01), "Perceptron learning rate")
+ ("print-feature-values", po::value<bool>(&print_feature_values)->default_value(false), "Print out feature values")
+ ("print-feature-counts", po::value<bool>(&printFeatureCounts)->default_value(false), "Print out feature values, print feature list with hope counts after 1st epoch")
+ ("print-nbest-with-features", po::value<bool>(&printNbestWithFeatures)->default_value(false), "Print out feature values, print feature list with hope counts after 1st epoch")
+ ("print-weights", po::value<bool>(&print_weights)->default_value(false), "Print out current weights")
+ ("print-core-weights", po::value<bool>(&print_core_weights)->default_value(true), "Print out current core weights")
+ ("prune-zero-weights", po::value<bool>(&pruneZeroWeights)->default_value(false), "Prune zero-valued sparse feature weights")
+ ("reference-files,r", po::value<vector<string> >(&referenceFiles), "Reference translation files for training")
+ ("reference-files-folds", po::value<vector<string> >(&referenceFilesFolds), "Reference translation files for training, one for each fold")
+ ("kbest", po::value<bool>(&kbest)->default_value(false), "Select hope/fear pairs from a list of nbest translations")
+
+ ("scale-by-inverse-length", po::value<bool>(&scaleByInverseLength)->default_value(false), "Scale BLEU by (history of) inverse input length")
+ ("scale-by-input-length", po::value<bool>(&scaleByInputLength)->default_value(false), "Scale BLEU by (history of) input length")
+ ("scale-by-avg-input-length", po::value<bool>(&scaleByAvgInputLength)->default_value(false), "Scale BLEU by average input length")
+ ("scale-by-avg-inverse-length", po::value<bool>(&scaleByAvgInverseLength)->default_value(false), "Scale BLEU by average inverse input length")
+ ("scale-by-x", po::value<float>(&scaleByX)->default_value(1), "Scale the BLEU score by value x")
+ ("scale-lm", po::value<bool>(&scale_lm)->default_value(false), "Scale the language model feature")
+ ("scale-factor-lm", po::value<float>(&scale_lm_factor)->default_value(2), "Scale the language model feature by this factor")
+ ("scale-wp", po::value<bool>(&scale_wp)->default_value(false), "Scale the word penalty feature")
+ ("scale-factor-wp", po::value<float>(&scale_wp_factor)->default_value(2), "Scale the word penalty feature by this factor")
+ ("scale-margin", po::value<bool>(&scale_margin)->default_value(0), "Scale the margin by the Bleu score of the oracle translation")
+ ("scale-margin-precision", po::value<bool>(&scale_margin_precision)->default_value(0), "Scale margin by precision of oracle")
+ ("scale-update", po::value<bool>(&scale_update)->default_value(0), "Scale update by Bleu score of oracle")
+ ("scale-update-precision", po::value<bool>(&scale_update_precision)->default_value(0), "Scale update by precision of oracle")
+ ("sentence-level-bleu", po::value<bool>(&sentenceBleu)->default_value(true), "Use a sentences level Bleu scoring function")
+ ("shuffle", po::value<bool>(&shuffle)->default_value(false), "Shuffle input sentences before processing")
+ ("sigmoid-param", po::value<float>(&sigmoidParam)->default_value(1), "y=sigmoidParam is the axis that this sigmoid approaches")
+ ("slack", po::value<float>(&slack)->default_value(0.01), "Use slack in optimiser")
+ ("sparse-average", po::value<bool>(&sparseAverage)->default_value(false), "Average weights by the number of processes")
+ ("sparse-no-average", po::value<bool>(&sparseNoAverage)->default_value(false), "Don't average sparse weights, just sum")
+ ("stop-weights", po::value<bool>(&weightConvergence)->default_value(true), "Stop when weights converge")
+ ("verbosity,v", po::value<int>(&verbosity)->default_value(0), "Verbosity level")
+ ("weight-dump-frequency", po::value<size_t>(&weightDumpFrequency)->default_value(1), "How often per epoch to dump weights (mpi)")
+ ("weight-dump-stem", po::value<string>(&weightDumpStem)->default_value("weights"), "Stem of filename to use for dumping weights");
po::options_description cmdline_options;
cmdline_options.add(desc);
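
Note: the option table above follows the standard boost::program_options pattern: each flag is bound to a variable, given a default, and documented in one chained add_options() call. A trimmed, runnable sketch of that pattern with three of the options:

    #include <boost/program_options.hpp>
    #include <iostream>
    #include <string>
    namespace po = boost::program_options;

    int main(int argc, char** argv) {
      std::string mosesConfigFile;
      size_t epochs;
      float slack;
      po::options_description desc("Allowed options");
      desc.add_options()
      ("config,f", po::value<std::string>(&mosesConfigFile), "Moses ini-file")
      ("epochs,e", po::value<size_t>(&epochs)->default_value(10), "Number of epochs")
      ("slack", po::value<float>(&slack)->default_value(0.01), "Use slack in optimiser");

      po::variables_map vm;
      po::store(po::parse_command_line(argc, argv, desc), vm);
      po::notify(vm);   // writes parsed values into the bound variables
      std::cout << "epochs: " << epochs << ", slack: " << slack << std::endl;
      return 0;
    }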
@@ -257,7 +258,7 @@ int main(int argc, char** argv) {
if (help) {
std::cout << "Usage: " + string(argv[0])
- + " -f mosesini-file -i input-file -r reference-file(s) [options]" << std::endl;
+ + " -f mosesini-file -i input-file -r reference-file(s) [options]" << std::endl;
std::cout << desc << std::endl;
return 0;
}
@@ -296,8 +297,7 @@ int main(int argc, char** argv) {
cerr << "Error: No reference files specified for training with folds" << endl;
exit(1);
}
- }
- else {
+ } else {
if (mosesConfigFile.empty()) {
cerr << "Error: No moses ini file specified" << endl;
return 1;
@@ -354,12 +354,11 @@ int main(int argc, char** argv) {
}
if (referenceSentences[myFold].size() != inputSentencesFolds[myFold].size()) {
cerr << "Error: Input file length (" << inputSentencesFolds[myFold].size() << ") != ("
- << referenceSentences[myFold].size() << ") reference file length (rank " << rank << ")" << endl;
+ << referenceSentences[myFold].size() << ") reference file length (rank " << rank << ")" << endl;
exit(1);
}
VERBOSE(1, "Rank " << rank << " reading references from " << referenceFilesFolds[myFold] << endl);
- }
- else {
+ } else {
if (!loadSentences(inputFile, inputSentences)) {
cerr << "Error: Failed to load input sentences from " << inputFile << endl;
return 1;
@@ -367,15 +366,15 @@ int main(int argc, char** argv) {
for (size_t i = 0; i < referenceFiles.size(); ++i) {
if (!loadSentences(referenceFiles[i], referenceSentences[i])) {
- cerr << "Error: Failed to load reference sentences from "
- << referenceFiles[i] << endl;
- return 1;
+ cerr << "Error: Failed to load reference sentences from "
+ << referenceFiles[i] << endl;
+ return 1;
}
if (referenceSentences[i].size() != inputSentences.size()) {
- cerr << "Error: Input file length (" << inputSentences.size() << ") != ("
- << referenceSentences[i].size() << ") length of reference file " << i
- << endl;
- return 1;
+ cerr << "Error: Input file length (" << inputSentences.size() << ") != ("
+ << referenceSentences[i].size() << ") length of reference file " << i
+ << endl;
+ return 1;
}
}
}
@@ -401,8 +400,7 @@ int main(int argc, char** argv) {
if (trainWithMultipleFolds) {
decoder_settings += " ";
decoder_settings += referenceFilesFolds[myFold];
- }
- else {
+ } else {
for (size_t i=0; i < referenceFiles.size(); ++i) {
decoder_settings += " ";
decoder_settings += referenceFiles[i];
@@ -416,8 +414,8 @@ int main(int argc, char** argv) {
VERBOSE(1, "Rank " << rank << " reading config file from " << configFile << endl);
MosesDecoder* decoder = new MosesDecoder(configFile, verbosity, decoder_params.size(), decoder_params);
decoder->setBleuParameters(disableBleuFeature, sentenceBleu, scaleByInputLength, scaleByAvgInputLength,
- scaleByInverseLength, scaleByAvgInverseLength,
- scaleByX, historySmoothing, bleu_smoothing_scheme, simpleHistoryBleu);
+ scaleByInverseLength, scaleByAvgInverseLength,
+ scaleByX, historySmoothing, bleu_smoothing_scheme, simpleHistoryBleu);
SearchAlgorithm searchAlgorithm = staticData.GetSearchAlgorithm();
bool chartDecoding = (searchAlgorithm == ChartDecoding);
@@ -427,11 +425,10 @@ int main(int argc, char** argv) {
for (size_t i = 0; i < inputSentencesFolds[myFold].size(); ++i) {
order.push_back(i);
}
- }
- else {
+ } else {
if (rank == 0) {
for (size_t i = 0; i < inputSentences.size(); ++i) {
- order.push_back(i);
+ order.push_back(i);
}
}
}
@@ -444,10 +441,10 @@ int main(int argc, char** argv) {
cerr << "slack: " << slack << ", learning rate: " << mira_learning_rate << endl;
cerr << "selective: " << selective << endl;
if (normaliseMargin)
- cerr << "sigmoid parameter: " << sigmoidParam << endl;
+ cerr << "sigmoid parameter: " << sigmoidParam << endl;
}
optimiser = new MiraOptimiser(slack, scale_margin, scale_margin_precision,
- scale_update, scale_update_precision, boost, normaliseMargin, sigmoidParam);
+ scale_update, scale_update_precision, boost, normaliseMargin, sigmoidParam);
learning_rate = mira_learning_rate;
perceptron_update = false;
} else if (learner == "perceptron") {
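
Note, for context on the --hildreth flag handled above: with a single constraint, MIRA has a closed-form ("analytical") step, and Hildreth's method is only needed when several constraints interact. A sketch of the textbook single-constraint step, clipped to the slack C (names hypothetical; MiraOptimiser's internals may differ):

    #include <algorithm>

    // alpha* = clip_[0,C]( (loss - modelScoreDiff) / ||featureDiff||^2 )
    float analyticalAlpha(float loss, float modelScoreDiff,
                          float featureDiffSqNorm, float C) {
      if (featureDiffSqNorm <= 0) return 0.0f;  // degenerate constraint
      float alpha = (loss - modelScoreDiff) / featureDiffSqNorm;
      return std::max(0.0f, std::min(alpha, C));
    }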
@@ -466,30 +463,30 @@ int main(int argc, char** argv) {
cerr << "Error: Unknown optimiser: " << learner << endl;
return 1;
}
-
+
// resolve parameter dependencies
if (batchSize > 1 && perceptron_update) {
batchSize = 1;
cerr << "Info: Setting batch size to 1 for perceptron update" << endl;
}
-
+
if (hope_n == -1)
hope_n = n;
if (fear_n == -1)
fear_n = n;
-
+
if (model_hope_fear || kbest)
hope_fear = false; // is true by default
if (learner == "mira" && !(hope_fear || model_hope_fear || kbest)) {
cerr << "Error: Need to select one of parameters --hope-fear/--model-hope-fear/--kbest for mira update." << endl;
return 1;
}
-
+
#ifdef MPI_ENABLE
if (!trainWithMultipleFolds)
mpi::broadcast(world, order, 0);
#endif
-
+
// Create shards according to the number of processes used
vector<size_t> shard;
if (trainWithMultipleFolds) {
@@ -505,8 +502,7 @@ int main(int argc, char** argv) {
shard.resize(shardSize);
copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
batchSize = 1;
- }
- else {
+ } else {
size_t shardSize = order.size() / size;
size_t shardStart = (size_t) (shardSize * rank);
size_t shardEnd = (size_t) (shardSize * (rank + 1));
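
Note: the shard arithmetic above, factored into a helper (hypothetical name) to make the even-split-plus-remainder rule explicit:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    std::vector<std::size_t> makeShard(const std::vector<std::size_t>& order,
                                       std::size_t rank, std::size_t size) {
      std::size_t shardSize = order.size() / size;
      std::size_t shardStart = shardSize * rank;
      std::size_t shardEnd = shardSize * (rank + 1);
      if (rank == size - 1) {          // the last rank absorbs the remainder
        shardEnd = order.size();
        shardSize = shardEnd - shardStart;
      }
      std::vector<std::size_t> shard(shardSize);
      std::copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
      return shard;
    }

With 10 sentences over 3 processes this yields shards of 3, 3 and 4.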
@@ -521,49 +517,49 @@ int main(int argc, char** argv) {
if (batchEqualsShard)
batchSize = shardSize;
}
-
+
// get reference to feature functions
const vector<FeatureFunction*> &featureFunctions = FeatureFunction::GetFeatureFunctions();
ScoreComponentCollection initialWeights = decoder->getWeights();
-
+
if (add2lm != 0) {
- const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions();
- for (size_t i = 0; i < statefulFFs.size(); ++i) {
- const StatefulFeatureFunction *ff = statefulFFs[i];
- const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff);
-
- if (lm) {
- float lmWeight = initialWeights.GetScoreForProducer(lm) + add2lm;
- initialWeights.Assign(lm, lmWeight);
- cerr << "Rank " << rank << ", add " << add2lm << " to lm weight." << endl;
- }
- }
+ const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions();
+ for (size_t i = 0; i < statefulFFs.size(); ++i) {
+ const StatefulFeatureFunction *ff = statefulFFs[i];
+ const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff);
+
+ if (lm) {
+ float lmWeight = initialWeights.GetScoreForProducer(lm) + add2lm;
+ initialWeights.Assign(lm, lmWeight);
+ cerr << "Rank " << rank << ", add " << add2lm << " to lm weight." << endl;
+ }
+ }
}
-
+
if (normaliseWeights) {
initialWeights.L1Normalise();
cerr << "Rank " << rank << ", normalised initial weights: " << initialWeights << endl;
}
decoder->setWeights(initialWeights);
-
+
// set bleu weight to twice the size of the language model weight(s)
if (bleu_weight_lm) {
float lmSum = 0;
- const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions();
- for (size_t i = 0; i < statefulFFs.size(); ++i) {
- const StatefulFeatureFunction *ff = statefulFFs[i];
- const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff);
+ const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions();
+ for (size_t i = 0; i < statefulFFs.size(); ++i) {
+ const StatefulFeatureFunction *ff = statefulFFs[i];
+ const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff);
- if (lm) {
+ if (lm) {
lmSum += abs(initialWeights.GetScoreForProducer(lm));
- }
- }
+ }
+ }
bleuWeight = lmSum * bleu_weight_lm_factor;
cerr << "Set bleu weight to lm weight * " << bleu_weight_lm_factor << endl;
}
-
+
if (bleuWeight_hope == -1) {
bleuWeight_hope = bleuWeight;
}
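
Note: a worked example of the tie-bw-to-lm arithmetic above, with a hypothetical single LM weight of 0.5 and the option defaults (--bw-lm-factor 2.0, --bw-factor-fear 1.0):

    #include <cmath>
    #include <iostream>

    int main() {
      float lmSum = std::fabs(0.5f);        // sum of |LM weights|, one LM here
      float bleuWeight = lmSum * 2.0f;      // * bleu_weight_lm_factor -> 1.0
      float bleuWeight_hope = -1, bleuWeight_fear = -1;  // -1 means "unset"
      if (bleuWeight_hope == -1) bleuWeight_hope = bleuWeight;
      if (bleuWeight_fear == -1) bleuWeight_fear = bleuWeight;
      bleuWeight_fear *= 1.0f;              // * bleu_weight_fear_factor
      std::cout << bleuWeight << " " << bleuWeight_hope << " "
                << bleuWeight_fear << std::endl;  // prints: 1 1 1
      return 0;
    }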
@@ -573,35 +569,35 @@ int main(int argc, char** argv) {
bleuWeight_fear *= bleu_weight_fear_factor;
cerr << "Bleu weight: " << bleuWeight << endl;
cerr << "Bleu weight fear: " << bleuWeight_fear << endl;
-
+
if (decode_hope || decode_fear || decode_model) {
size_t decode = 1;
if (decode_fear) decode = 2;
if (decode_model) decode = 3;
decodeHopeOrFear(rank, size, decode, decode_filename, inputSentences, decoder, n, bleuWeight);
}
-
+
//Main loop:
ScoreComponentCollection cumulativeWeights; // collect weights per epoch to produce an average
ScoreComponentCollection cumulativeWeightsBinary;
size_t numberOfUpdates = 0;
size_t numberOfUpdatesThisEpoch = 0;
-
+
time_t now;
time(&now);
cerr << "Rank " << rank << ", " << ctime(&now);
-
+
float avgInputLength = 0;
float sumOfInputs = 0;
size_t numberOfInputs = 0;
-
+
ScoreComponentCollection mixedWeights;
ScoreComponentCollection mixedWeightsPrevious;
ScoreComponentCollection mixedWeightsBeforePrevious;
ScoreComponentCollection mixedAverageWeights;
ScoreComponentCollection mixedAverageWeightsPrevious;
ScoreComponentCollection mixedAverageWeightsBeforePrevious;
-
+
bool stop = false;
// int sumStillViolatedConstraints;
float epsilon = 0.0001;
@@ -610,66 +606,65 @@ int main(int argc, char** argv) {
ScoreComponentCollection confidenceCounts, mixedConfidenceCounts, featureLearningRates;
featureLearningRates.UpdateLearningRates(decay_core, decay_sparse, confidenceCounts, core_r0, sparse_r0); //initialise core learning rates
cerr << "Initial learning rates, core: " << core_r0 << ", sparse: " << sparse_r0 << endl;
-
+
for (size_t epoch = continue_epoch; epoch < epochs && !stop; ++epoch) {
if (shuffle) {
if (trainWithMultipleFolds || rank == 0) {
- cerr << "Rank " << rank << ", epoch " << epoch << ", shuffling input sentences.." << endl;
- RandomIndex rindex;
- random_shuffle(order.begin(), order.end(), rindex);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", shuffling input sentences.." << endl;
+ RandomIndex rindex;
+ random_shuffle(order.begin(), order.end(), rindex);
}
-
+
#ifdef MPI_ENABLE
if (!trainWithMultipleFolds)
- mpi::broadcast(world, order, 0);
+ mpi::broadcast(world, order, 0);
#endif
-
+
// redo shards
if (trainWithMultipleFolds) {
- size_t shardSize = order.size()/coresPerFold;
- size_t shardStart = (size_t) (shardSize * (rank % coresPerFold));
- size_t shardEnd = shardStart + shardSize;
- if (rank % coresPerFold == coresPerFold - 1) { // last rank of each fold
- shardEnd = order.size();
- shardSize = shardEnd - shardStart;
- }
- VERBOSE(1, "Rank: " << rank << ", shard size: " << shardSize << endl);
- VERBOSE(1, "Rank: " << rank << ", shard start: " << shardStart << " shard end: " << shardEnd << endl);
- shard.resize(shardSize);
- copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
- batchSize = 1;
- }
- else {
- size_t shardSize = order.size()/size;
- size_t shardStart = (size_t) (shardSize * rank);
- size_t shardEnd = (size_t) (shardSize * (rank + 1));
- if (rank == size - 1) {
- shardEnd = order.size();
- shardSize = shardEnd - shardStart;
- }
- VERBOSE(1, "Shard size: " << shardSize << endl);
- VERBOSE(1, "Rank: " << rank << " Shard start: " << shardStart << " Shard end: " << shardEnd << endl);
- shard.resize(shardSize);
- copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
- if (batchEqualsShard)
- batchSize = shardSize;
+ size_t shardSize = order.size()/coresPerFold;
+ size_t shardStart = (size_t) (shardSize * (rank % coresPerFold));
+ size_t shardEnd = shardStart + shardSize;
+ if (rank % coresPerFold == coresPerFold - 1) { // last rank of each fold
+ shardEnd = order.size();
+ shardSize = shardEnd - shardStart;
+ }
+ VERBOSE(1, "Rank: " << rank << ", shard size: " << shardSize << endl);
+ VERBOSE(1, "Rank: " << rank << ", shard start: " << shardStart << " shard end: " << shardEnd << endl);
+ shard.resize(shardSize);
+ copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
+ batchSize = 1;
+ } else {
+ size_t shardSize = order.size()/size;
+ size_t shardStart = (size_t) (shardSize * rank);
+ size_t shardEnd = (size_t) (shardSize * (rank + 1));
+ if (rank == size - 1) {
+ shardEnd = order.size();
+ shardSize = shardEnd - shardStart;
+ }
+ VERBOSE(1, "Shard size: " << shardSize << endl);
+ VERBOSE(1, "Rank: " << rank << " Shard start: " << shardStart << " Shard end: " << shardEnd << endl);
+ shard.resize(shardSize);
+ copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
+ if (batchEqualsShard)
+ batchSize = shardSize;
}
}
-
+
// sum of violated constraints in an epoch
// sumStillViolatedConstraints = 0;
-
+
numberOfUpdatesThisEpoch = 0;
// Sum up weights over one epoch, final average uses weights from last epoch
if (!accumulateWeights) {
cumulativeWeights.ZeroAll();
cumulativeWeightsBinary.ZeroAll();
}
-
+
// number of weight dumps this epoch
size_t weightMixingThisEpoch = 0;
size_t weightEpochDump = 0;
-
+
size_t shardPosition = 0;
vector<size_t>::const_iterator sid = shard.begin();
while (sid != shard.end()) {
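
Note: the shuffling step at the top of this hunk follows a broadcast-the-permutation pattern: rank 0 shuffles, then all ranks re-slice the identical order into shards. A sketch, with RandomIndex reconstructed as a hypothetical rand-based functor:

    #include <algorithm>
    #include <cstddef>
    #include <cstdlib>
    #include <vector>

    struct RandomIndex {                    // hypothetical stand-in
      std::ptrdiff_t operator()(std::ptrdiff_t max) { return std::rand() % max; }
    };

    void shuffleOrder(std::vector<std::size_t>& order, std::size_t rank) {
      if (rank == 0) {                      // only the master permutes
        RandomIndex rindex;
        std::random_shuffle(order.begin(), order.end(), rindex);
      }
      // Under MPI_ENABLE the permutation is then broadcast from rank 0,
      //   mpi::broadcast(world, order, 0);
      // so that every process re-slices the same order into its shard.
    }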
@@ -677,7 +672,7 @@ int main(int argc, char** argv) {
vector<vector<ScoreComponentCollection> > featureValues;
vector<vector<float> > bleuScores;
vector<vector<float> > modelScores;
-
+
// variables for hope-fear/perceptron setting
vector<vector<ScoreComponentCollection> > featureValuesHope;
vector<vector<ScoreComponentCollection> > featureValuesFear;
@@ -685,15 +680,15 @@ int main(int argc, char** argv) {
vector<vector<float> > bleuScoresFear;
vector<vector<float> > modelScoresHope;
vector<vector<float> > modelScoresFear;
-
+
// get moses weights
ScoreComponentCollection mosesWeights = decoder->getWeights();
VERBOSE(1, "\nRank " << rank << ", epoch " << epoch << ", weights: " << mosesWeights << endl);
-
+
if (historyBleu || simpleHistoryBleu) {
- decoder->printBleuFeatureHistory(cerr);
+ decoder->printBleuFeatureHistory(cerr);
}
-
+
// BATCHING: produce nbest lists for all input sentences in batch
vector<float> oracleBleuScores;
vector<float> oracleModelScores;
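
Note, for orientation before the long decoding hunk that follows: each input is decoded with the same n-best search under up to three objectives, selected by the Bleu multiplier passed to getNBest (+1.0 for hope, 0.0 for model, -1.0 for fear). A one-line sketch of that combined objective (helper name hypothetical):

    // hope:  modelScore + bleuWeight * Bleu   (multiplier = +1.0)
    // model: modelScore                       (multiplier =  0.0)
    // fear:  modelScore - bleuWeight * Bleu   (multiplier = -1.0)
    float decoderObjective(float modelScore, float sentenceBleu,
                           float bleuMultiplier, float bleuWeight) {
      return modelScore + bleuMultiplier * bleuWeight * sentenceBleu;
    }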
@@ -702,926 +697,903 @@ int main(int argc, char** argv) {
vector<size_t> inputLengths;
vector<size_t> ref_ids;
size_t actualBatchSize = 0;
-
+
vector<size_t>::const_iterator current_sid_start = sid;
size_t examples_in_batch = 0;
bool skip_example = false;
for (size_t batchPosition = 0; batchPosition < batchSize && sid
- != shard.end(); ++batchPosition) {
- string input;
- if (trainWithMultipleFolds)
- input = inputSentencesFolds[myFold][*sid];
- else
- input = inputSentences[*sid];
-
- Moses::Sentence *sentence = new Sentence();
- stringstream in(input + "\n");
- const vector<FactorType> inputFactorOrder = staticData.GetInputFactorOrder();
- sentence->Read(in,inputFactorOrder);
- cerr << "\nRank " << rank << ", epoch " << epoch << ", input sentence " << *sid << ": \"";
- sentence->Print(cerr);
- cerr << "\"" << " (batch pos " << batchPosition << ")" << endl;
- size_t current_input_length = (*sentence).GetSize();
-
- if (epoch == 0 && (scaleByAvgInputLength || scaleByAvgInverseLength)) {
- sumOfInputs += current_input_length;
- ++numberOfInputs;
- avgInputLength = sumOfInputs/numberOfInputs;
- decoder->setAvgInputLength(avgInputLength);
- cerr << "Rank " << rank << ", epoch 0, average input length: " << avgInputLength << endl;
- }
-
- vector<ScoreComponentCollection> newFeatureValues;
- vector<float> newScores;
- if (model_hope_fear) {
- featureValues.push_back(newFeatureValues);
- bleuScores.push_back(newScores);
- modelScores.push_back(newScores);
- }
- if (hope_fear || perceptron_update) {
- featureValuesHope.push_back(newFeatureValues);
- featureValuesFear.push_back(newFeatureValues);
- bleuScoresHope.push_back(newScores);
- bleuScoresFear.push_back(newScores);
- modelScoresHope.push_back(newScores);
- modelScoresFear.push_back(newScores);
- if (historyBleu || simpleHistoryBleu || debug_model) {
- featureValues.push_back(newFeatureValues);
- bleuScores.push_back(newScores);
- modelScores.push_back(newScores);
- }
- }
- if (kbest) {
- // for decoding
- featureValues.push_back(newFeatureValues);
- bleuScores.push_back(newScores);
- modelScores.push_back(newScores);
-
- // for storing selected examples
- featureValuesHope.push_back(newFeatureValues);
- featureValuesFear.push_back(newFeatureValues);
- bleuScoresHope.push_back(newScores);
- bleuScoresFear.push_back(newScores);
- modelScoresHope.push_back(newScores);
- modelScoresFear.push_back(newScores);
- }
-
- size_t ref_length;
- float avg_ref_length;
-
- if (print_weights)
- cerr << "Rank " << rank << ", epoch " << epoch << ", current weights: " << mosesWeights << endl;
- if (print_core_weights) {
- cerr << "Rank " << rank << ", epoch " << epoch << ", current weights: ";
- mosesWeights.PrintCoreFeatures();
- cerr << endl;
- }
-
- // check LM weight
- const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions();
- for (size_t i = 0; i < statefulFFs.size(); ++i) {
- const StatefulFeatureFunction *ff = statefulFFs[i];
- const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff);
-
- if (lm) {
- float lmWeight = mosesWeights.GetScoreForProducer(lm);
- cerr << "Rank " << rank << ", epoch " << epoch << ", lm weight: " << lmWeight << endl;
- if (lmWeight <= 0) {
- cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: language model weight should never be <= 0." << endl;
- mosesWeights.Assign(lm, 0.1);
- cerr << "Rank " << rank << ", epoch " << epoch << ", assign lm weights of 0.1" << endl;
- }
- }
- }
-
- // select inference scheme
- cerr << "Rank " << rank << ", epoch " << epoch << ", real Bleu? " << realBleu << endl;
- if (hope_fear || perceptron_update) {
- // HOPE
- cerr << "Rank " << rank << ", epoch " << epoch << ", " << hope_n <<
- "best hope translations" << endl;
- vector< vector<const Word*> > outputHope = decoder->getNBest(input, *sid, hope_n, 1.0, bleuWeight_hope,
- featureValuesHope[batchPosition], bleuScoresHope[batchPosition], modelScoresHope[batchPosition],
- 1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
- vector<const Word*> oracle = outputHope[0];
- decoder->cleanup(chartDecoding);
- ref_length = decoder->getClosestReferenceLength(*sid, oracle.size());
- avg_ref_length = ref_length;
- float hope_length_ratio = (float)oracle.size()/ref_length;
- int oracleSize = (int)oracle.size();
- cerr << endl;
-
- // count sparse features occurring in hope translation
- featureValuesHope[batchPosition][0].IncrementSparseHopeFeatures();
-
- float precision = bleuScoresHope[batchPosition][0];
- if (historyBleu || simpleHistoryBleu) {
- precision /= decoder->getTargetLengthHistory();
- }
- else {
- if (scaleByAvgInputLength) precision /= decoder->getAverageInputLength();
- else if (scaleByAvgInverseLength) precision /= (100/decoder->getAverageInputLength());
- precision /= scaleByX;
- }
- if (scale_margin_precision || scale_update_precision) {
- if (historyBleu || simpleHistoryBleu || scaleByAvgInputLength || scaleByAvgInverseLength) {
- cerr << "Rank " << rank << ", epoch " << epoch << ", set hope precision: " << precision << endl;
- ((MiraOptimiser*) optimiser)->setPrecision(precision);
- }
- }
-
- vector<const Word*> bestModel;
- if (debug_model || historyBleu || simpleHistoryBleu) {
- // MODEL (for updating the history only, using dummy vectors)
- cerr << "Rank " << rank << ", epoch " << epoch << ", 1best wrt model score (debug or history)" << endl;
- vector< vector<const Word*> > outputModel = decoder->getNBest(input, *sid, n, 0.0, bleuWeight,
- featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
- 1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
- bestModel = outputModel[0];
- decoder->cleanup(chartDecoding);
- cerr << endl;
- ref_length = decoder->getClosestReferenceLength(*sid, bestModel.size());
- }
-
- // FEAR
- float fear_length_ratio = 0;
- float bleuRatioHopeFear = 0;
- int fearSize = 0;
- cerr << "Rank " << rank << ", epoch " << epoch << ", " << fear_n << "best fear translations" << endl;
- vector< vector<const Word*> > outputFear = decoder->getNBest(input, *sid, fear_n, -1.0, bleuWeight_fear,
- featureValuesFear[batchPosition], bleuScoresFear[batchPosition], modelScoresFear[batchPosition],
- 1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
- vector<const Word*> fear = outputFear[0];
- decoder->cleanup(chartDecoding);
- ref_length = decoder->getClosestReferenceLength(*sid, fear.size());
- avg_ref_length += ref_length;
- avg_ref_length /= 2;
- fear_length_ratio = (float)fear.size()/ref_length;
- fearSize = (int)fear.size();
- cerr << endl;
- for (size_t i = 0; i < fear.size(); ++i)
- delete fear[i];
-
- // count sparse features occurring in fear translation
- featureValuesFear[batchPosition][0].IncrementSparseFearFeatures();
-
- // Bleu-related example selection
- bool skip = false;
- bleuRatioHopeFear = bleuScoresHope[batchPosition][0] / bleuScoresFear[batchPosition][0];
- if (minBleuRatio != -1 && bleuRatioHopeFear < minBleuRatio)
- skip = true;
- if(maxBleuRatio != -1 && bleuRatioHopeFear > maxBleuRatio)
- skip = true;
-
- // sanity check
- if (historyBleu || simpleHistoryBleu) {
- if (bleuScores[batchPosition][0] > bleuScoresHope[batchPosition][0] &&
- modelScores[batchPosition][0] > modelScoresHope[batchPosition][0]) {
- if (abs(bleuScores[batchPosition][0] - bleuScoresHope[batchPosition][0]) > epsilon &&
- abs(modelScores[batchPosition][0] - modelScoresHope[batchPosition][0]) > epsilon) {
- cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: MODEL translation better than HOPE translation." << endl;
- skip = true;
- }
- }
- if (bleuScoresFear[batchPosition][0] > bleuScores[batchPosition][0] &&
- modelScoresFear[batchPosition][0] > modelScores[batchPosition][0]) {
- if (abs(bleuScoresFear[batchPosition][0] - bleuScores[batchPosition][0]) > epsilon &&
- abs(modelScoresFear[batchPosition][0] - modelScores[batchPosition][0]) > epsilon) {
- cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: FEAR translation better than MODEL translation." << endl;
- skip = true;
- }
- }
- }
- if (bleuScoresFear[batchPosition][0] > bleuScoresHope[batchPosition][0]) {
- if (abs(bleuScoresFear[batchPosition][0] - bleuScoresHope[batchPosition][0]) > epsilon) {
- // check if it's an error or a warning
- skip = true;
- if (modelScoresFear[batchPosition][0] > modelScoresHope[batchPosition][0] && abs(modelScoresFear[batchPosition][0] - modelScoresHope[batchPosition][0]) > epsilon) {
- cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: FEAR translation better than HOPE translation. (abs-diff: " << abs(bleuScoresFear[batchPosition][0] - bleuScoresHope[batchPosition][0]) << ")" <<endl;
- }
- else {
- cerr << "Rank " << rank << ", epoch " << epoch << ", WARNING: FEAR translation has better Bleu than HOPE translation. (abs-diff: " << abs(bleuScoresFear[batchPosition][0] - bleuScoresHope[batchPosition][0]) << ")" <<endl;
- }
- }
- }
-
- if (skip) {
- cerr << "Rank " << rank << ", epoch " << epoch << ", skip example (" << hope_length_ratio << ", " << bleuRatioHopeFear << ").. " << endl;
- featureValuesHope[batchPosition].clear();
- featureValuesFear[batchPosition].clear();
- bleuScoresHope[batchPosition].clear();
- bleuScoresFear[batchPosition].clear();
- if (historyBleu || simpleHistoryBleu || debug_model) {
- featureValues[batchPosition].clear();
- bleuScores[batchPosition].clear();
- }
- }
- else {
- examples_in_batch++;
-
- // needed for history
- if (historyBleu || simpleHistoryBleu) {
- inputLengths.push_back(current_input_length);
- ref_ids.push_back(*sid);
- oneBests.push_back(bestModel);
- }
- }
- }
- if (model_hope_fear) {
- cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best hope translations" << endl;
- size_t oraclePos = featureValues[batchPosition].size();
- decoder->getNBest(input, *sid, n, 1.0, bleuWeight_hope,
- featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
- 0, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
- //vector<const Word*> oracle = outputHope[0];
- // needed for history
- inputLengths.push_back(current_input_length);
- ref_ids.push_back(*sid);
- decoder->cleanup(chartDecoding);
- //ref_length = decoder->getClosestReferenceLength(*sid, oracle.size());
- //float hope_length_ratio = (float)oracle.size()/ref_length;
- cerr << endl;
-
- oracleFeatureValues.push_back(featureValues[batchPosition][oraclePos]);
- oracleBleuScores.push_back(bleuScores[batchPosition][oraclePos]);
- oracleModelScores.push_back(modelScores[batchPosition][oraclePos]);
-
- // MODEL
- cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best wrt model score" << endl;
- if (historyBleu || simpleHistoryBleu) {
- vector< vector<const Word*> > outputModel = decoder->getNBest(input, *sid, n, 0.0,
- bleuWeight, featureValues[batchPosition], bleuScores[batchPosition],
- modelScores[batchPosition], 1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
- vector<const Word*> bestModel = outputModel[0];
- oneBests.push_back(bestModel);
- inputLengths.push_back(current_input_length);
- ref_ids.push_back(*sid);
- }
- else {
- decoder->getNBest(input, *sid, n, 0.0, bleuWeight,
- featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
- 0, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
- }
- decoder->cleanup(chartDecoding);
- //ref_length = decoder->getClosestReferenceLength(*sid, bestModel.size());
- //float model_length_ratio = (float)bestModel.size()/ref_length;
- cerr << endl;
-
- // FEAR
- cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best fear translations" << endl;
- decoder->getNBest(input, *sid, n, -1.0, bleuWeight_fear,
- featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
- 0, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
- decoder->cleanup(chartDecoding);
- //ref_length = decoder->getClosestReferenceLength(*sid, fear.size());
- //float fear_length_ratio = (float)fear.size()/ref_length;
-
- examples_in_batch++;
- }
- if (kbest) {
- // MODEL
- cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best wrt model score" << endl;
- if (historyBleu || simpleHistoryBleu) {
- vector< vector<const Word*> > outputModel = decoder->getNBest(input, *sid, n, 0.0,
- bleuWeight, featureValues[batchPosition], bleuScores[batchPosition],
- modelScores[batchPosition], 1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
- vector<const Word*> bestModel = outputModel[0];
- oneBests.push_back(bestModel);
- inputLengths.push_back(current_input_length);
- ref_ids.push_back(*sid);
- }
- else {
- decoder->getNBest(input, *sid, n, 0.0, bleuWeight,
- featureValues[batchPosition], bleuScores[batchPosition],
- modelScores[batchPosition], 0, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
- }
- decoder->cleanup(chartDecoding);
- //ref_length = decoder->getClosestReferenceLength(*sid, bestModel.size());
- //float model_length_ratio = (float)bestModel.size()/ref_length;
- cerr << endl;
-
- examples_in_batch++;
-
- HypothesisQueue queueHope(hope_n);
- HypothesisQueue queueFear(fear_n);
- cerr << endl;
- if (most_violated || all_violated || one_against_all) {
- float bleuHope = -1000;
- float bleuFear = 1000;
- size_t indexHope = -1;
- size_t indexFear = -1;
-
- vector<float> bleuHopeList;
- vector<float> bleuFearList;
- vector<float> indexHopeList;
- vector<float> indexFearList;
-
- if (most_violated)
- cerr << "Rank " << rank << ", epoch " << epoch << ", pick pair with most violated constraint" << endl;
- else if (all_violated)
- cerr << "Rank " << rank << ", epoch " << epoch << ", pick all pairs with violated constraints";
- else
- cerr << "Rank " << rank << ", epoch " << epoch << ", pick all pairs with hope";
-
- // find best hope, then find fear that violates our constraint most
- for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) {
- if (abs(bleuScores[batchPosition][i] - bleuHope) < epsilon) { // equal bleu scores
- if (modelScores[batchPosition][i] > modelScores[batchPosition][indexHope]) {
- if (abs(modelScores[batchPosition][i] - modelScores[batchPosition][indexHope]) > epsilon) {
- // better model score
- bleuHope = bleuScores[batchPosition][i];
- indexHope = i;
- }
- }
- }
- else if (bleuScores[batchPosition][i] > bleuHope) { // better than current best
- bleuHope = bleuScores[batchPosition][i];
- indexHope = i;
- }
- }
-
- float currentViolation = 0;
- float minimum_bleu_diff = 0.01;
- for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) {
- float bleuDiff = bleuHope - bleuScores[batchPosition][i];
- float modelDiff = modelScores[batchPosition][indexHope] - modelScores[batchPosition][i];
- if (bleuDiff > epsilon) {
- if (one_against_all && bleuDiff > minimum_bleu_diff) {
- cerr << ".. adding pair";
- bleuHopeList.push_back(bleuHope);
- bleuFearList.push_back(bleuScores[batchPosition][i]);
- indexHopeList.push_back(indexHope);
- indexFearList.push_back(i);
- }
- else if (modelDiff < bleuDiff) {
- float diff = bleuDiff - modelDiff;
- if (diff > epsilon) {
- if (all_violated) {
- cerr << ".. adding pair";
- bleuHopeList.push_back(bleuHope);
- bleuFearList.push_back(bleuScores[batchPosition][i]);
- indexHopeList.push_back(indexHope);
- indexFearList.push_back(i);
- }
- else if (most_violated && diff > currentViolation) {
- currentViolation = diff;
- bleuFear = bleuScores[batchPosition][i];
- indexFear = i;
- cerr << "Rank " << rank << ", epoch " << epoch << ", current violation: " << currentViolation << " (violated constraint: " << modelDiff << " >= " << bleuDiff << ")" << endl;
- }
- }
- }
- }
- }
-
- if (most_violated) {
- if (currentViolation > 0) {
- cerr << "Rank " << rank << ", epoch " << epoch << ", adding pair with violation " << currentViolation << endl;
- cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << bleuHope << " (" << indexHope << "), fear: " << bleuFear << " (" << indexFear << ")" << endl;
- bleuScoresHope[batchPosition].push_back(bleuHope);
- bleuScoresFear[batchPosition].push_back(bleuFear);
- featureValuesHope[batchPosition].push_back(featureValues[batchPosition][indexHope]);
- featureValuesFear[batchPosition].push_back(featureValues[batchPosition][indexFear]);
- float modelScoreHope = modelScores[batchPosition][indexHope];
- float modelScoreFear = modelScores[batchPosition][indexFear];
- if (most_violated_reg) {
- // halve the violation by pushing the hope and fear model scores further apart
- float reg = currentViolation/4;
- modelScoreHope += abs(reg);
- modelScoreFear -= abs(reg);
- float newViolation = (bleuHope - bleuFear) - (modelScoreHope - modelScoreFear);
- cerr << "Rank " << rank << ", epoch " << epoch << ", regularized violation: " << newViolation << endl;
- }
- modelScoresHope[batchPosition].push_back(modelScoreHope);
- modelScoresFear[batchPosition].push_back(modelScoreFear);
-
- featureValues[batchPosition][indexHope].IncrementSparseHopeFeatures();
- featureValues[batchPosition][indexFear].IncrementSparseFearFeatures();
- }
- else {
- cerr << "Rank " << rank << ", epoch " << epoch << ", no violated constraint found." << endl;
- skip_example = 1;
- }
- }
- else cerr << endl;
- }
- if (max_bleu_diff) {
- cerr << "Rank " << rank << ", epoch " << epoch << ", pick pair with max Bleu diff from list: " << bleuScores[batchPosition].size() << endl;
- for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) {
- float hopeScore = bleuScores[batchPosition][i];
- if (modelPlusBleu) hopeScore += modelScores[batchPosition][i];
- BleuIndexPair hope(hopeScore, i);
- queueHope.Push(hope);
-
- float fearScore = -1*(bleuScores[batchPosition][i]);
- if (modelPlusBleu) fearScore += modelScores[batchPosition][i];
- BleuIndexPair fear(fearScore, i);
- queueFear.Push(fear);
- }
- skip_example = 0;
- }
- cerr << endl;
-
- vector<BleuIndexPair> hopeList, fearList;
- for (size_t i=0; i<hope_n && !queueHope.Empty(); ++i) hopeList.push_back(queueHope.Pop());
- for (size_t i=0; i<fear_n && !queueFear.Empty(); ++i) fearList.push_back(queueFear.Pop());
- for (size_t i=0; i<hopeList.size(); ++i) {
- //float bleuHope = hopeList[i].first;
- size_t indexHope = hopeList[i].second;
- float bleuHope = bleuScores[batchPosition][indexHope];
- for (size_t j=0; j<fearList.size(); ++j) {
- //float bleuFear = -1*(fearList[j].first);
- size_t indexFear = fearList[j].second;
- float bleuFear = bleuScores[batchPosition][indexFear];
- cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << bleuHope << " (" << indexHope << "), fear: " << bleuFear << " (" << indexFear << ")" << endl;
- bleuScoresHope[batchPosition].push_back(bleuHope);
- bleuScoresFear[batchPosition].push_back(bleuFear);
- featureValuesHope[batchPosition].push_back(featureValues[batchPosition][indexHope]);
- featureValuesFear[batchPosition].push_back(featureValues[batchPosition][indexFear]);
- float modelScoreHope = modelScores[batchPosition][indexHope];
- float modelScoreFear = modelScores[batchPosition][indexFear];
-
- modelScoresHope[batchPosition].push_back(modelScoreHope);
- modelScoresFear[batchPosition].push_back(modelScoreFear);
-
- featureValues[batchPosition][indexHope].IncrementSparseHopeFeatures();
- featureValues[batchPosition][indexFear].IncrementSparseFearFeatures();
- }
- }
- if (!makePairs)
- cerr << "Rank " << rank << ", epoch " << epoch << ", summing up hope and fear vectors, no pairs" << endl;
- }
-
- // next input sentence
- ++sid;
- ++actualBatchSize;
- ++shardPosition;
+ != shard.end(); ++batchPosition) {
+ string input;
+ if (trainWithMultipleFolds)
+ input = inputSentencesFolds[myFold][*sid];
+ else
+ input = inputSentences[*sid];
+
+ Moses::Sentence *sentence = new Sentence();
+ stringstream in(input + "\n");
+ const vector<FactorType> inputFactorOrder = staticData.GetInputFactorOrder();
+ sentence->Read(in,inputFactorOrder);
+ cerr << "\nRank " << rank << ", epoch " << epoch << ", input sentence " << *sid << ": \"";
+ sentence->Print(cerr);
+ cerr << "\"" << " (batch pos " << batchPosition << ")" << endl;
+ size_t current_input_length = (*sentence).GetSize();
+
+ if (epoch == 0 && (scaleByAvgInputLength || scaleByAvgInverseLength)) {
+ sumOfInputs += current_input_length;
+ ++numberOfInputs;
+ avgInputLength = sumOfInputs/numberOfInputs;
+ decoder->setAvgInputLength(avgInputLength);
+ cerr << "Rank " << rank << ", epoch 0, average input length: " << avgInputLength << endl;
+ }
+
+ vector<ScoreComponentCollection> newFeatureValues;
+ vector<float> newScores;
+ if (model_hope_fear) {
+ featureValues.push_back(newFeatureValues);
+ bleuScores.push_back(newScores);
+ modelScores.push_back(newScores);
+ }
+ if (hope_fear || perceptron_update) {
+ featureValuesHope.push_back(newFeatureValues);
+ featureValuesFear.push_back(newFeatureValues);
+ bleuScoresHope.push_back(newScores);
+ bleuScoresFear.push_back(newScores);
+ modelScoresHope.push_back(newScores);
+ modelScoresFear.push_back(newScores);
+ if (historyBleu || simpleHistoryBleu || debug_model) {
+ featureValues.push_back(newFeatureValues);
+ bleuScores.push_back(newScores);
+ modelScores.push_back(newScores);
+ }
+ }
+ if (kbest) {
+ // for decoding
+ featureValues.push_back(newFeatureValues);
+ bleuScores.push_back(newScores);
+ modelScores.push_back(newScores);
+
+ // for storing selected examples
+ featureValuesHope.push_back(newFeatureValues);
+ featureValuesFear.push_back(newFeatureValues);
+ bleuScoresHope.push_back(newScores);
+ bleuScoresFear.push_back(newScores);
+ modelScoresHope.push_back(newScores);
+ modelScoresFear.push_back(newScores);
+ }
+
+ size_t ref_length;
+ float avg_ref_length;
+
+ if (print_weights)
+ cerr << "Rank " << rank << ", epoch " << epoch << ", current weights: " << mosesWeights << endl;
+ if (print_core_weights) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", current weights: ";
+ mosesWeights.PrintCoreFeatures();
+ cerr << endl;
+ }
+
+ // check LM weight
+ const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions();
+ for (size_t i = 0; i < statefulFFs.size(); ++i) {
+ const StatefulFeatureFunction *ff = statefulFFs[i];
+ const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff);
+
+ if (lm) {
+ float lmWeight = mosesWeights.GetScoreForProducer(lm);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", lm weight: " << lmWeight << endl;
+ if (lmWeight <= 0) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: language model weight should never be <= 0." << endl;
+ mosesWeights.Assign(lm, 0.1);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", setting lm weight to 0.1" << endl;
+ }
+ }
+ }
+
+ // select inference scheme
+ cerr << "Rank " << rank << ", epoch " << epoch << ", real Bleu? " << realBleu << endl;
+ if (hope_fear || perceptron_update) {
+ // HOPE
+ cerr << "Rank " << rank << ", epoch " << epoch << ", " << hope_n <<
+ "best hope translations" << endl;
+ vector< vector<const Word*> > outputHope = decoder->getNBest(input, *sid, hope_n, 1.0, bleuWeight_hope,
+ featureValuesHope[batchPosition], bleuScoresHope[batchPosition], modelScoresHope[batchPosition],
+ 1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
+ vector<const Word*> oracle = outputHope[0];
+ decoder->cleanup(chartDecoding);
+ ref_length = decoder->getClosestReferenceLength(*sid, oracle.size());
+ avg_ref_length = ref_length;
+ float hope_length_ratio = (float)oracle.size()/ref_length;
+ int oracleSize = (int)oracle.size();
+ cerr << endl;
+
+ // count sparse features occurring in hope translation
+ featureValuesHope[batchPosition][0].IncrementSparseHopeFeatures();
+
+ float precision = bleuScoresHope[batchPosition][0];
+ if (historyBleu || simpleHistoryBleu) {
+ precision /= decoder->getTargetLengthHistory();
+ } else {
+ if (scaleByAvgInputLength) precision /= decoder->getAverageInputLength();
+ else if (scaleByAvgInverseLength) precision /= (100/decoder->getAverageInputLength());
+ precision /= scaleByX;
+ }
+ if (scale_margin_precision || scale_update_precision) {
+ if (historyBleu || simpleHistoryBleu || scaleByAvgInputLength || scaleByAvgInverseLength) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", set hope precision: " << precision << endl;
+ ((MiraOptimiser*) optimiser)->setPrecision(precision);
+ }
+ }
+
+ vector<const Word*> bestModel;
+ if (debug_model || historyBleu || simpleHistoryBleu) {
+ // MODEL (for updating the history only, using dummy vectors)
+ cerr << "Rank " << rank << ", epoch " << epoch << ", 1best wrt model score (debug or history)" << endl;
+ vector< vector<const Word*> > outputModel = decoder->getNBest(input, *sid, n, 0.0, bleuWeight,
+ featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
+ 1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
+ bestModel = outputModel[0];
+ decoder->cleanup(chartDecoding);
+ cerr << endl;
+ ref_length = decoder->getClosestReferenceLength(*sid, bestModel.size());
+ }
+
+ // FEAR
+ float fear_length_ratio = 0;
+ float bleuRatioHopeFear = 0;
+ int fearSize = 0;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", " << fear_n << "best fear translations" << endl;
+ vector< vector<const Word*> > outputFear = decoder->getNBest(input, *sid, fear_n, -1.0, bleuWeight_fear,
+ featureValuesFear[batchPosition], bleuScoresFear[batchPosition], modelScoresFear[batchPosition],
+ 1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
+ vector<const Word*> fear = outputFear[0];
+ decoder->cleanup(chartDecoding);
+ ref_length = decoder->getClosestReferenceLength(*sid, fear.size());
+ avg_ref_length += ref_length;
+ avg_ref_length /= 2;
+ fear_length_ratio = (float)fear.size()/ref_length;
+ fearSize = (int)fear.size();
+ cerr << endl;
+ for (size_t i = 0; i < fear.size(); ++i)
+ delete fear[i];
+
+ // count sparse features occurring in fear translation
+ featureValuesFear[batchPosition][0].IncrementSparseFearFeatures();
+
+ // Bleu-related example selection
+ bool skip = false;
+ bleuRatioHopeFear = bleuScoresHope[batchPosition][0] / bleuScoresFear[batchPosition][0];
+ if (minBleuRatio != -1 && bleuRatioHopeFear < minBleuRatio)
+ skip = true;
+ if(maxBleuRatio != -1 && bleuRatioHopeFear > maxBleuRatio)
+ skip = true;
+
+ // sanity check
+ if (historyBleu || simpleHistoryBleu) {
+ if (bleuScores[batchPosition][0] > bleuScoresHope[batchPosition][0] &&
+ modelScores[batchPosition][0] > modelScoresHope[batchPosition][0]) {
+ if (abs(bleuScores[batchPosition][0] - bleuScoresHope[batchPosition][0]) > epsilon &&
+ abs(modelScores[batchPosition][0] - modelScoresHope[batchPosition][0]) > epsilon) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: MODEL translation better than HOPE translation." << endl;
+ skip = true;
+ }
+ }
+ if (bleuScoresFear[batchPosition][0] > bleuScores[batchPosition][0] &&
+ modelScoresFear[batchPosition][0] > modelScores[batchPosition][0]) {
+ if (abs(bleuScoresFear[batchPosition][0] - bleuScores[batchPosition][0]) > epsilon &&
+ abs(modelScoresFear[batchPosition][0] - modelScores[batchPosition][0]) > epsilon) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: FEAR translation better than MODEL translation." << endl;
+ skip = true;
+ }
+ }
+ }
+ if (bleuScoresFear[batchPosition][0] > bleuScoresHope[batchPosition][0]) {
+ if (abs(bleuScoresFear[batchPosition][0] - bleuScoresHope[batchPosition][0]) > epsilon) {
+ // check if it's an error or a warning
+ skip = true;
+ if (modelScoresFear[batchPosition][0] > modelScoresHope[batchPosition][0] && abs(modelScoresFear[batchPosition][0] - modelScoresHope[batchPosition][0]) > epsilon) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: FEAR translation better than HOPE translation. (abs-diff: " << abs(bleuScoresFear[batchPosition][0] - bleuScoresHope[batchPosition][0]) << ")" <<endl;
+ } else {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", WARNING: FEAR translation has better Bleu than HOPE translation. (abs-diff: " << abs(bleuScoresFear[batchPosition][0] - bleuScoresHope[batchPosition][0]) << ")" <<endl;
+ }
+ }
+ }
+
+ if (skip) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", skip example (" << hope_length_ratio << ", " << bleuRatioHopeFear << ").. " << endl;
+ featureValuesHope[batchPosition].clear();
+ featureValuesFear[batchPosition].clear();
+ bleuScoresHope[batchPosition].clear();
+ bleuScoresFear[batchPosition].clear();
+ if (historyBleu || simpleHistoryBleu || debug_model) {
+ featureValues[batchPosition].clear();
+ bleuScores[batchPosition].clear();
+ }
+ } else {
+ examples_in_batch++;
+
+ // needed for history
+ if (historyBleu || simpleHistoryBleu) {
+ inputLengths.push_back(current_input_length);
+ ref_ids.push_back(*sid);
+ oneBests.push_back(bestModel);
+ }
+ }
+ }
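For orientation: the hope/fear scheme above calls the decoder with the (scaled) sentence BLEU added to the model score at weight +1.0 (hope), 0.0 (plain model best) and -1.0 (fear). A minimal standalone sketch of that re-ranking idea, with a hypothetical Hypothesis struct standing in for the real decoder structures:

#include <cstddef>
#include <iostream>
#include <vector>

// Hypothetical stand-in for one decoder hypothesis; the real code reads
// these values out of the Moses search graph instead.
struct Hypothesis {
  float modelScore; // weighted feature score
  float bleuScore;  // (approximate) sentence-level BLEU
};

// Index of the best hypothesis under modelScore + w * bleuScore:
// w = +1.0 gives the hope translation, w = -1.0 the fear translation,
// w = 0.0 the plain model best.
std::size_t argmax(const std::vector<Hypothesis> &nbest, float w) {
  std::size_t best = 0;
  for (std::size_t i = 1; i < nbest.size(); ++i)
    if (nbest[i].modelScore + w * nbest[i].bleuScore >
        nbest[best].modelScore + w * nbest[best].bleuScore)
      best = i;
  return best;
}

int main() {
  std::vector<Hypothesis> nbest;
  Hypothesis h1 = {-3.0f, 0.9f}, h2 = {-2.5f, 0.2f}, h3 = {-2.6f, 0.8f};
  nbest.push_back(h1); nbest.push_back(h2); nbest.push_back(h3);
  std::cout << "hope: "  << argmax(nbest,  1.0f) << "\n"
            << "model: " << argmax(nbest,  0.0f) << "\n"
            << "fear: "  << argmax(nbest, -1.0f) << "\n";
  return 0;
}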
+ if (model_hope_fear) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best hope translations" << endl;
+ size_t oraclePos = featureValues[batchPosition].size();
+ decoder->getNBest(input, *sid, n, 1.0, bleuWeight_hope,
+ featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
+ 0, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
+ //vector<const Word*> oracle = outputHope[0];
+ // needed for history
+ inputLengths.push_back(current_input_length);
+ ref_ids.push_back(*sid);
+ decoder->cleanup(chartDecoding);
+ //ref_length = decoder->getClosestReferenceLength(*sid, oracle.size());
+ //float hope_length_ratio = (float)oracle.size()/ref_length;
+ cerr << endl;
+
+ oracleFeatureValues.push_back(featureValues[batchPosition][oraclePos]);
+ oracleBleuScores.push_back(bleuScores[batchPosition][oraclePos]);
+ oracleModelScores.push_back(modelScores[batchPosition][oraclePos]);
+
+ // MODEL
+ cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best wrt model score" << endl;
+ if (historyBleu || simpleHistoryBleu) {
+ vector< vector<const Word*> > outputModel = decoder->getNBest(input, *sid, n, 0.0,
+ bleuWeight, featureValues[batchPosition], bleuScores[batchPosition],
+ modelScores[batchPosition], 1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
+ vector<const Word*> bestModel = outputModel[0];
+ oneBests.push_back(bestModel);
+ inputLengths.push_back(current_input_length);
+ ref_ids.push_back(*sid);
+ } else {
+ decoder->getNBest(input, *sid, n, 0.0, bleuWeight,
+ featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
+ 0, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
+ }
+ decoder->cleanup(chartDecoding);
+ //ref_length = decoder->getClosestReferenceLength(*sid, bestModel.size());
+ //float model_length_ratio = (float)bestModel.size()/ref_length;
+ cerr << endl;
+
+ // FEAR
+ cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best fear translations" << endl;
+ decoder->getNBest(input, *sid, n, -1.0, bleuWeight_fear,
+ featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
+ 0, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
+ decoder->cleanup(chartDecoding);
+ //ref_length = decoder->getClosestReferenceLength(*sid, fear.size());
+ //float fear_length_ratio = (float)fear.size()/ref_length;
+
+ examples_in_batch++;
+ }
+ if (kbest) {
+ // MODEL
+ cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best wrt model score" << endl;
+ if (historyBleu || simpleHistoryBleu) {
+ vector< vector<const Word*> > outputModel = decoder->getNBest(input, *sid, n, 0.0,
+ bleuWeight, featureValues[batchPosition], bleuScores[batchPosition],
+ modelScores[batchPosition], 1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
+ vector<const Word*> bestModel = outputModel[0];
+ oneBests.push_back(bestModel);
+ inputLengths.push_back(current_input_length);
+ ref_ids.push_back(*sid);
+ } else {
+ decoder->getNBest(input, *sid, n, 0.0, bleuWeight,
+ featureValues[batchPosition], bleuScores[batchPosition],
+ modelScores[batchPosition], 0, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
+ }
+ decoder->cleanup(chartDecoding);
+ //ref_length = decoder->getClosestReferenceLength(*sid, bestModel.size());
+ //float model_length_ratio = (float)bestModel.size()/ref_length;
+ cerr << endl;
+
+ examples_in_batch++;
+
+ HypothesisQueue queueHope(hope_n);
+ HypothesisQueue queueFear(fear_n);
+ cerr << endl;
+ if (most_violated || all_violated || one_against_all) {
+ float bleuHope = -1000;
+ float bleuFear = 1000;
+ size_t indexHope = -1;
+ size_t indexFear = -1;
+
+ vector<float> bleuHopeList;
+ vector<float> bleuFearList;
+ vector<float> indexHopeList;
+ vector<float> indexFearList;
+
+ if (most_violated)
+ cerr << "Rank " << rank << ", epoch " << epoch << ", pick pair with most violated constraint" << endl;
+ else if (all_violated)
+ cerr << "Rank " << rank << ", epoch " << epoch << ", pick all pairs with violated constraints";
+ else
+ cerr << "Rank " << rank << ", epoch " << epoch << ", pick all pairs with hope";
+
+ // find best hope, then find fear that violates our constraint most
+ for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) {
+ if (abs(bleuScores[batchPosition][i] - bleuHope) < epsilon) { // equal bleu scores
+ if (modelScores[batchPosition][i] > modelScores[batchPosition][indexHope]) {
+ if (abs(modelScores[batchPosition][i] - modelScores[batchPosition][indexHope]) > epsilon) {
+ // better model score
+ bleuHope = bleuScores[batchPosition][i];
+ indexHope = i;
+ }
+ }
+ } else if (bleuScores[batchPosition][i] > bleuHope) { // better than current best
+ bleuHope = bleuScores[batchPosition][i];
+ indexHope = i;
+ }
+ }
+
+ float currentViolation = 0;
+ float minimum_bleu_diff = 0.01;
+ for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) {
+ float bleuDiff = bleuHope - bleuScores[batchPosition][i];
+ float modelDiff = modelScores[batchPosition][indexHope] - modelScores[batchPosition][i];
+ if (bleuDiff > epsilon) {
+ if (one_against_all && bleuDiff > minimum_bleu_diff) {
+ cerr << ".. adding pair";
+ bleuHopeList.push_back(bleuHope);
+ bleuFearList.push_back(bleuScores[batchPosition][i]);
+ indexHopeList.push_back(indexHope);
+ indexFearList.push_back(i);
+ } else if (modelDiff < bleuDiff) {
+ float diff = bleuDiff - modelDiff;
+ if (diff > epsilon) {
+ if (all_violated) {
+ cerr << ".. adding pair";
+ bleuHopeList.push_back(bleuHope);
+ bleuFearList.push_back(bleuScores[batchPosition][i]);
+ indexHopeList.push_back(indexHope);
+ indexFearList.push_back(i);
+ } else if (most_violated && diff > currentViolation) {
+ currentViolation = diff;
+ bleuFear = bleuScores[batchPosition][i];
+ indexFear = i;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", current violation: " << currentViolation << " (violated constraint: " << modelDiff << " >= " << bleuDiff << ")" << endl;
+ }
+ }
+ }
+ }
+ }
+
+ if (most_violated) {
+ if (currentViolation > 0) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", adding pair with violation " << currentViolation << endl;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << bleuHope << " (" << indexHope << "), fear: " << bleuFear << " (" << indexFear << ")" << endl;
+ bleuScoresHope[batchPosition].push_back(bleuHope);
+ bleuScoresFear[batchPosition].push_back(bleuFear);
+ featureValuesHope[batchPosition].push_back(featureValues[batchPosition][indexHope]);
+ featureValuesFear[batchPosition].push_back(featureValues[batchPosition][indexFear]);
+ float modelScoreHope = modelScores[batchPosition][indexHope];
+ float modelScoreFear = modelScores[batchPosition][indexFear];
+ if (most_violated_reg) {
+ // halve the violation by pushing the hope and fear model scores further apart
+ float reg = currentViolation/4;
+ modelScoreHope += abs(reg);
+ modelScoreFear -= abs(reg);
+ float newViolation = (bleuHope - bleuFear) - (modelScoreHope - modelScoreFear);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", regularized violation: " << newViolation << endl;
+ }
+ modelScoresHope[batchPosition].push_back(modelScoreHope);
+ modelScoresFear[batchPosition].push_back(modelScoreFear);
+
+ featureValues[batchPosition][indexHope].IncrementSparseHopeFeatures();
+ featureValues[batchPosition][indexFear].IncrementSparseFearFeatures();
+ } else {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", no violated constraint found." << endl;
+ skip_example = 1;
+ }
+ } else cerr << endl;
+ }
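The loop above implements the constraint selection: for a fixed hope hypothesis it searches the n-best list for the candidate whose margin constraint is violated most. A self-contained sketch of the same search, under the assumption that the hope index has already been fixed to the highest-BLEU entry:

#include <cstddef>
#include <iostream>
#include <vector>

// The margin constraint for a pair (hope, i) is
//   model[hope] - model[i] >= bleu[hope] - bleu[i],
// so its violation is (bleuDiff - modelDiff) where positive.
std::size_t mostViolated(const std::vector<float> &bleu,
                         const std::vector<float> &model,
                         std::size_t hope, float epsilon) {
  std::size_t fear = hope; // stays at hope if no constraint is violated
  float maxViolation = 0.0f;
  for (std::size_t i = 0; i < bleu.size(); ++i) {
    float bleuDiff  = bleu[hope] - bleu[i];
    float modelDiff = model[hope] - model[i];
    float violation = bleuDiff - modelDiff;
    if (bleuDiff > epsilon && violation > maxViolation) {
      maxViolation = violation;
      fear = i;
    }
  }
  return fear;
}

int main() {
  float b[] = {0.6f, 0.4f, 0.1f};    // sentence BLEU per hypothesis
  float m[] = {-3.0f, -2.8f, -2.5f}; // model score per hypothesis
  std::vector<float> bleu(b, b + 3), model(m, m + 3);
  std::cout << "fear index: " << mostViolated(bleu, model, 0, 0.0001f) << "\n";
  return 0;
}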
+ if (max_bleu_diff) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", pick pair with max Bleu diff from list: " << bleuScores[batchPosition].size() << endl;
+ for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) {
+ float hopeScore = bleuScores[batchPosition][i];
+ if (modelPlusBleu) hopeScore += modelScores[batchPosition][i];
+ BleuIndexPair hope(hopeScore, i);
+ queueHope.Push(hope);
+
+ float fearScore = -1*(bleuScores[batchPosition][i]);
+ if (modelPlusBleu) fearScore += modelScores[batchPosition][i];
+ BleuIndexPair fear(fearScore, i);
+ queueFear.Push(fear);
+ }
+ skip_example = 0;
+ }
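HypothesisQueue above is used as a bounded container of BleuIndexPair entries; the sketch below assumes it keeps only the k best-scoring pairs, implemented as a min-heap that evicts its worst element when full (an assumption about HypothesisQueue's behaviour, for illustration only):

#include <cstddef>
#include <functional>
#include <iostream>
#include <queue>
#include <utility>
#include <vector>

typedef std::pair<float, std::size_t> BleuIndexPair; // (score, n-best index)

class TopK {
public:
  explicit TopK(std::size_t k) : k_(k) {}
  void Push(const BleuIndexPair &p) {
    heap_.push(p);
    if (heap_.size() > k_) heap_.pop(); // evict the worst of the k+1
  }
  bool Empty() const { return heap_.empty(); }
  BleuIndexPair Pop() { BleuIndexPair p = heap_.top(); heap_.pop(); return p; }
private:
  std::size_t k_;
  std::priority_queue<BleuIndexPair, std::vector<BleuIndexPair>,
                      std::greater<BleuIndexPair> > heap_; // min-heap
};

int main() {
  TopK hope(2); // keep the two best hope candidates
  float bleu[] = {0.3f, 0.7f, 0.5f, 0.1f};
  for (std::size_t i = 0; i < 4; ++i)
    hope.Push(std::make_pair(bleu[i], i)); // hope score: plain BLEU here
  while (!hope.Empty()) {
    BleuIndexPair p = hope.Pop(); // popped in ascending order of score
    std::cout << "kept index " << p.second << " (score " << p.first << ")\n";
  }
  return 0;
}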
+ cerr << endl;
+
+ vector<BleuIndexPair> hopeList, fearList;
+ for (size_t i=0; i<hope_n && !queueHope.Empty(); ++i) hopeList.push_back(queueHope.Pop());
+ for (size_t i=0; i<fear_n && !queueFear.Empty(); ++i) fearList.push_back(queueFear.Pop());
+ for (size_t i=0; i<hopeList.size(); ++i) {
+ //float bleuHope = hopeList[i].first;
+ size_t indexHope = hopeList[i].second;
+ float bleuHope = bleuScores[batchPosition][indexHope];
+ for (size_t j=0; j<fearList.size(); ++j) {
+ //float bleuFear = -1*(fearList[j].first);
+ size_t indexFear = fearList[j].second;
+ float bleuFear = bleuScores[batchPosition][indexFear];
+ cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << bleuHope << " (" << indexHope << "), fear: " << bleuFear << " (" << indexFear << ")" << endl;
+ bleuScoresHope[batchPosition].push_back(bleuHope);
+ bleuScoresFear[batchPosition].push_back(bleuFear);
+ featureValuesHope[batchPosition].push_back(featureValues[batchPosition][indexHope]);
+ featureValuesFear[batchPosition].push_back(featureValues[batchPosition][indexFear]);
+ float modelScoreHope = modelScores[batchPosition][indexHope];
+ float modelScoreFear = modelScores[batchPosition][indexFear];
+
+ modelScoresHope[batchPosition].push_back(modelScoreHope);
+ modelScoresFear[batchPosition].push_back(modelScoreFear);
+
+ featureValues[batchPosition][indexHope].IncrementSparseHopeFeatures();
+ featureValues[batchPosition][indexFear].IncrementSparseFearFeatures();
+ }
+ }
+ if (!makePairs)
+ cerr << "Rank " << rank << ", epoch " << epoch << ", summing up hope and fear vectors, no pairs" << endl;
+ }
+
+ // next input sentence
+ ++sid;
+ ++actualBatchSize;
+ ++shardPosition;
} // end of batch loop
-
+
if (examples_in_batch == 0 || (kbest && skip_example)) {
- cerr << "Rank " << rank << ", epoch " << epoch << ", batch is empty." << endl;
- }
- else {
- vector<vector<float> > losses(actualBatchSize);
- if (model_hope_fear) {
- // Set loss for each sentence as BLEU(oracle) - BLEU(hypothesis)
- for (size_t batchPosition = 0; batchPosition < actualBatchSize; ++batchPosition) {
- for (size_t j = 0; j < bleuScores[batchPosition].size(); ++j) {
- losses[batchPosition].push_back(oracleBleuScores[batchPosition] - bleuScores[batchPosition][j]);
- }
- }
- }
-
- // set weight for bleu feature to 0 before optimizing
- vector<FeatureFunction*>::const_iterator iter;
- const vector<FeatureFunction*> &featureFunctions2 = FeatureFunction::GetFeatureFunctions();
- for (iter = featureFunctions2.begin(); iter != featureFunctions2.end(); ++iter) {
- if ((*iter)->GetScoreProducerDescription() == "BleuScoreFeature") {
- mosesWeights.Assign(*iter, 0);
- break;
- }
- }
-
- // scale LM feature (to avoid rapid changes)
- if (scale_lm) {
- cerr << "scale lm" << endl;
- const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions();
- for (size_t i = 0; i < statefulFFs.size(); ++i) {
- const StatefulFeatureFunction *ff = statefulFFs[i];
- const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff);
-
- if (lm) {
- // scale down score
- if (model_hope_fear) {
- scaleFeatureScore(lm, scale_lm_factor, featureValues, rank, epoch);
- }
- else {
- scaleFeatureScore(lm, scale_lm_factor, featureValuesHope, rank, epoch);
- scaleFeatureScore(lm, scale_lm_factor, featureValuesFear, rank, epoch);
- }
- }
- }
- }
-
- // scale WP
- if (scale_wp) {
- // scale up weight
- WordPenaltyProducer *wp = StaticData::InstanceNonConst().GetWordPenaltyProducer();
-
- // scale down score
- if (model_hope_fear) {
- scaleFeatureScore(wp, scale_wp_factor, featureValues, rank, epoch);
- }
- else {
- scaleFeatureScore(wp, scale_wp_factor, featureValuesHope, rank, epoch);
- scaleFeatureScore(wp, scale_wp_factor, featureValuesFear, rank, epoch);
- }
- }
-
- // print out the feature values
- if (print_feature_values) {
- cerr << "\nRank " << rank << ", epoch " << epoch << ", feature values: " << endl;
- if (model_hope_fear) printFeatureValues(featureValues);
- else {
- cerr << "hope: " << endl;
- printFeatureValues(featureValuesHope);
- cerr << "fear: " << endl;
- printFeatureValues(featureValuesFear);
- }
- }
-
- // apply learning rates to feature vectors before optimization
- if (feature_confidence) {
- cerr << "Rank " << rank << ", epoch " << epoch << ", apply feature learning rates with decays " << decay_core << "/" << decay_sparse << ": " << featureLearningRates << endl;
- if (model_hope_fear) {
- applyPerFeatureLearningRates(featureValues, featureLearningRates, sparse_r0);
- }
- else {
- applyPerFeatureLearningRates(featureValuesHope, featureLearningRates, sparse_r0);
- applyPerFeatureLearningRates(featureValuesFear, featureLearningRates, sparse_r0);
- }
- }
- else {
- // apply fixed learning rates
- cerr << "Rank " << rank << ", epoch " << epoch << ", apply fixed learning rates, core: " << core_r0 << ", sparse: " << sparse_r0 << endl;
- if (core_r0 != 1.0 || sparse_r0 != 1.0) {
- if (model_hope_fear) {
- applyLearningRates(featureValues, core_r0, sparse_r0);
- }
- else {
- applyLearningRates(featureValuesHope, core_r0, sparse_r0);
- applyLearningRates(featureValuesFear, core_r0, sparse_r0);
- }
- }
- }
-
- // Run optimiser on batch:
- VERBOSE(1, "\nRank " << rank << ", epoch " << epoch << ", run optimiser:" << endl);
- size_t update_status = 1;
- ScoreComponentCollection weightUpdate;
- if (perceptron_update) {
- vector<vector<float> > dummy1;
- update_status = optimiser->updateWeightsHopeFear( weightUpdate, featureValuesHope,
- featureValuesFear, dummy1, dummy1, dummy1, dummy1, learning_rate, rank, epoch);
- }
- else if (hope_fear) {
- if (bleuScoresHope[0][0] >= min_oracle_bleu) {
- if (hope_n == 1 && fear_n == 1 && batchSize == 1 && !hildreth) {
- update_status = ((MiraOptimiser*) optimiser)->updateWeightsAnalytically(weightUpdate,
- featureValuesHope[0][0], featureValuesFear[0][0], bleuScoresHope[0][0],
- bleuScoresFear[0][0], modelScoresHope[0][0], modelScoresFear[0][0], learning_rate, rank, epoch);
- }
- else
- update_status = optimiser->updateWeightsHopeFear(weightUpdate, featureValuesHope,
- featureValuesFear, bleuScoresHope, bleuScoresFear, modelScoresHope,
- modelScoresFear, learning_rate, rank, epoch);
- }
- else
- update_status = 1;
- }
- else if (kbest) {
- if (selective)
- update_status = ((MiraOptimiser*)optimiser)->updateWeightsHopeFearSelective(
- weightUpdate, featureValuesHope, featureValuesFear, bleuScoresHope, bleuScoresFear,
- modelScoresHope, modelScoresFear, learning_rate, rank, epoch);
- else if (summed)
- update_status = ((MiraOptimiser*)optimiser)->updateWeightsHopeFearSummed(
- weightUpdate, featureValuesHope, featureValuesFear, bleuScoresHope, bleuScoresFear,
- modelScoresHope, modelScoresFear, learning_rate, rank, epoch, rescaleSlack, makePairs);
- else {
- if (batchSize == 1 && featureValuesHope[0].size() == 1 && !hildreth) {
- cerr << "Rank " << rank << ", epoch " << epoch << ", model score hope: " << modelScoresHope[0][0] << endl;
- cerr << "Rank " << rank << ", epoch " << epoch << ", model score fear: " << modelScoresFear[0][0] << endl;
- update_status = ((MiraOptimiser*) optimiser)->updateWeightsAnalytically(
- weightUpdate, featureValuesHope[0][0], featureValuesFear[0][0],
- bleuScoresHope[0][0], bleuScoresFear[0][0], modelScoresHope[0][0],
- modelScoresFear[0][0], learning_rate, rank, epoch);
- }
- else {
- cerr << "Rank " << rank << ", epoch " << epoch << ", model score hope: " << modelScoresHope[0][0] << endl;
- cerr << "Rank " << rank << ", epoch " << epoch << ", model score fear: " << modelScoresFear[0][0] << endl;
- update_status = optimiser->updateWeightsHopeFear(weightUpdate, featureValuesHope,
- featureValuesFear, bleuScoresHope, bleuScoresFear, modelScoresHope,
- modelScoresFear, learning_rate, rank, epoch);
- }
- }
- }
- else {
- // model_hope_fear
- update_status = ((MiraOptimiser*) optimiser)->updateWeights(weightUpdate,
- featureValues, losses, bleuScores, modelScores, oracleFeatureValues,
- oracleBleuScores, oracleModelScores, learning_rate, rank, epoch);
- }
-
- // sumStillViolatedConstraints += update_status;
-
- if (update_status == 0) { // if weights were updated
- // apply weight update
- if (debug)
- cerr << "Rank " << rank << ", epoch " << epoch << ", update: " << weightUpdate << endl;
-
- if (feature_confidence) {
- // update confidence counts based on weight update
- confidenceCounts.UpdateConfidenceCounts(weightUpdate, signed_counts);
-
- // update feature learning rates
- featureLearningRates.UpdateLearningRates(decay_core, decay_sparse, confidenceCounts, core_r0, sparse_r0);
- }
-
- // apply weight update to Moses weights
- mosesWeights.PlusEquals(weightUpdate);
-
- if (normaliseWeights)
- mosesWeights.L1Normalise();
-
- cumulativeWeights.PlusEquals(mosesWeights);
- if (sparseAverage) {
- ScoreComponentCollection binary;
- binary.SetToBinaryOf(mosesWeights);
- cumulativeWeightsBinary.PlusEquals(binary);
- }
-
- ++numberOfUpdates;
- ++numberOfUpdatesThisEpoch;
- if (averageWeights) {
- ScoreComponentCollection averageWeights(cumulativeWeights);
- if (accumulateWeights) {
- averageWeights.DivideEquals(numberOfUpdates);
- } else {
- averageWeights.DivideEquals(numberOfUpdatesThisEpoch);
- }
-
- mosesWeights = averageWeights;
- }
-
- // set new Moses weights
- decoder->setWeights(mosesWeights);
- //cerr << "Rank " << rank << ", epoch " << epoch << ", new weights: " << mosesWeights << endl;
- }
-
- // update history (for approximate document Bleu)
- if (historyBleu || simpleHistoryBleu) {
- for (size_t i = 0; i < oneBests.size(); ++i)
- cerr << "Rank " << rank << ", epoch " << epoch << ", update history with 1best length: " << oneBests[i].size() << " ";
- decoder->updateHistory(oneBests, inputLengths, ref_ids, rank, epoch);
- deleteTranslations(oneBests);
- }
+ cerr << "Rank " << rank << ", epoch " << epoch << ", batch is empty." << endl;
+ } else {
+ vector<vector<float> > losses(actualBatchSize);
+ if (model_hope_fear) {
+ // Set loss for each sentence as BLEU(oracle) - BLEU(hypothesis)
+ for (size_t batchPosition = 0; batchPosition < actualBatchSize; ++batchPosition) {
+ for (size_t j = 0; j < bleuScores[batchPosition].size(); ++j) {
+ losses[batchPosition].push_back(oracleBleuScores[batchPosition] - bleuScores[batchPosition][j]);
+ }
+ }
+ }
+
+ // set weight for bleu feature to 0 before optimizing
+ vector<FeatureFunction*>::const_iterator iter;
+ const vector<FeatureFunction*> &featureFunctions2 = FeatureFunction::GetFeatureFunctions();
+ for (iter = featureFunctions2.begin(); iter != featureFunctions2.end(); ++iter) {
+ if ((*iter)->GetScoreProducerDescription() == "BleuScoreFeature") {
+ mosesWeights.Assign(*iter, 0);
+ break;
+ }
+ }
+
+ // scale LM feature (to avoid rapid changes)
+ if (scale_lm) {
+ cerr << "scale lm" << endl;
+ const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions();
+ for (size_t i = 0; i < statefulFFs.size(); ++i) {
+ const StatefulFeatureFunction *ff = statefulFFs[i];
+ const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff);
+
+ if (lm) {
+ // scale down score
+ if (model_hope_fear) {
+ scaleFeatureScore(lm, scale_lm_factor, featureValues, rank, epoch);
+ } else {
+ scaleFeatureScore(lm, scale_lm_factor, featureValuesHope, rank, epoch);
+ scaleFeatureScore(lm, scale_lm_factor, featureValuesFear, rank, epoch);
+ }
+ }
+ }
+ }
+
+ // scale WP
+ if (scale_wp) {
+ // scale up weight
+ WordPenaltyProducer *wp = StaticData::InstanceNonConst().GetWordPenaltyProducer();
+
+ // scale down score
+ if (model_hope_fear) {
+ scaleFeatureScore(wp, scale_wp_factor, featureValues, rank, epoch);
+ } else {
+ scaleFeatureScore(wp, scale_wp_factor, featureValuesHope, rank, epoch);
+ scaleFeatureScore(wp, scale_wp_factor, featureValuesFear, rank, epoch);
+ }
+ }
+
+ // print out the feature values
+ if (print_feature_values) {
+ cerr << "\nRank " << rank << ", epoch " << epoch << ", feature values: " << endl;
+ if (model_hope_fear) printFeatureValues(featureValues);
+ else {
+ cerr << "hope: " << endl;
+ printFeatureValues(featureValuesHope);
+ cerr << "fear: " << endl;
+ printFeatureValues(featureValuesFear);
+ }
+ }
+
+ // apply learning rates to feature vectors before optimization
+ if (feature_confidence) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", apply feature learning rates with decays " << decay_core << "/" << decay_sparse << ": " << featureLearningRates << endl;
+ if (model_hope_fear) {
+ applyPerFeatureLearningRates(featureValues, featureLearningRates, sparse_r0);
+ } else {
+ applyPerFeatureLearningRates(featureValuesHope, featureLearningRates, sparse_r0);
+ applyPerFeatureLearningRates(featureValuesFear, featureLearningRates, sparse_r0);
+ }
+ } else {
+ // apply fixed learning rates
+ cerr << "Rank " << rank << ", epoch " << epoch << ", apply fixed learning rates, core: " << core_r0 << ", sparse: " << sparse_r0 << endl;
+ if (core_r0 != 1.0 || sparse_r0 != 1.0) {
+ if (model_hope_fear) {
+ applyLearningRates(featureValues, core_r0, sparse_r0);
+ } else {
+ applyLearningRates(featureValuesHope, core_r0, sparse_r0);
+ applyLearningRates(featureValuesFear, core_r0, sparse_r0);
+ }
+ }
+ }
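With feature_confidence, every feature gets its own learning rate derived from how much it has been updated so far; rates shrink for features with a lot of accumulated evidence. The decay rule below is hypothetical (the real one is inside ScoreComponentCollection::UpdateLearningRates) and only illustrates the confidence-weighted idea:

#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  const std::size_t numFeatures = 3;
  std::vector<float> confidenceCounts(numFeatures, 0.0f);
  std::vector<float> rates(numFeatures, 1.0f);
  const float r0 = 1.0f;    // hypothetical base rate
  const float decay = 0.5f; // hypothetical decay constant

  // two fake weight updates; feature 0 moves a lot, feature 1 never moves
  float updates[2][3] = {{0.5f, 0.0f, 0.1f}, {0.4f, 0.0f, -0.2f}};
  for (int t = 0; t < 2; ++t) {
    for (std::size_t i = 0; i < numFeatures; ++i) {
      confidenceCounts[i] += std::fabs(updates[t][i]);       // accumulate evidence
      rates[i] = r0 / (1.0f + decay * confidenceCounts[i]);  // shrink rate
    }
  }
  for (std::size_t i = 0; i < numFeatures; ++i)
    std::cout << "rate[" << i << "] = " << rates[i] << "\n";
  return 0;
}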
+
+ // Run optimiser on batch:
+ VERBOSE(1, "\nRank " << rank << ", epoch " << epoch << ", run optimiser:" << endl);
+ size_t update_status = 1;
+ ScoreComponentCollection weightUpdate;
+ if (perceptron_update) {
+ vector<vector<float> > dummy1;
+ update_status = optimiser->updateWeightsHopeFear( weightUpdate, featureValuesHope,
+ featureValuesFear, dummy1, dummy1, dummy1, dummy1, learning_rate, rank, epoch);
+ } else if (hope_fear) {
+ if (bleuScoresHope[0][0] >= min_oracle_bleu) {
+ if (hope_n == 1 && fear_n == 1 && batchSize == 1 && !hildreth) {
+ update_status = ((MiraOptimiser*) optimiser)->updateWeightsAnalytically(weightUpdate,
+ featureValuesHope[0][0], featureValuesFear[0][0], bleuScoresHope[0][0],
+ bleuScoresFear[0][0], modelScoresHope[0][0], modelScoresFear[0][0], learning_rate, rank, epoch);
+ } else
+ update_status = optimiser->updateWeightsHopeFear(weightUpdate, featureValuesHope,
+ featureValuesFear, bleuScoresHope, bleuScoresFear, modelScoresHope,
+ modelScoresFear, learning_rate, rank, epoch);
+ } else
+ update_status = 1;
+ } else if (kbest) {
+ if (selective)
+ update_status = ((MiraOptimiser*)optimiser)->updateWeightsHopeFearSelective(
+ weightUpdate, featureValuesHope, featureValuesFear, bleuScoresHope, bleuScoresFear,
+ modelScoresHope, modelScoresFear, learning_rate, rank, epoch);
+ else if (summed)
+ update_status = ((MiraOptimiser*)optimiser)->updateWeightsHopeFearSummed(
+ weightUpdate, featureValuesHope, featureValuesFear, bleuScoresHope, bleuScoresFear,
+ modelScoresHope, modelScoresFear, learning_rate, rank, epoch, rescaleSlack, makePairs);
+ else {
+ if (batchSize == 1 && featureValuesHope[0].size() == 1 && !hildreth) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", model score hope: " << modelScoresHope[0][0] << endl;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", model score fear: " << modelScoresFear[0][0] << endl;
+ update_status = ((MiraOptimiser*) optimiser)->updateWeightsAnalytically(
+ weightUpdate, featureValuesHope[0][0], featureValuesFear[0][0],
+ bleuScoresHope[0][0], bleuScoresFear[0][0], modelScoresHope[0][0],
+ modelScoresFear[0][0], learning_rate, rank, epoch);
+ } else {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", model score hope: " << modelScoresHope[0][0] << endl;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", model score fear: " << modelScoresFear[0][0] << endl;
+ update_status = optimiser->updateWeightsHopeFear(weightUpdate, featureValuesHope,
+ featureValuesFear, bleuScoresHope, bleuScoresFear, modelScoresHope,
+ modelScoresFear, learning_rate, rank, epoch);
+ }
+ }
+ } else {
+ // model_hope_fear
+ update_status = ((MiraOptimiser*) optimiser)->updateWeights(weightUpdate,
+ featureValues, losses, bleuScores, modelScores, oracleFeatureValues,
+ oracleBleuScores, oracleModelScores, learning_rate, rank, epoch);
+ }
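When exactly one hope/fear pair is available (batch size 1, no Hildreth), the code takes the closed-form route. A hedged sketch of the passive-aggressive/MIRA step that updateWeightsAnalytically corresponds to, assuming a margin-rescaled hinge loss with the BLEU difference as loss and a slack cap C (the real optimiser additionally handles learning-rate scaling and Moses' feature containers):

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

void miraUpdate(std::vector<float> &w,
                const std::vector<float> &hopeFeat,
                const std::vector<float> &fearFeat,
                float bleuHope, float bleuFear,
                float modelHope, float modelFear, float C) {
  // hinge loss: how far the pair violates its margin constraint
  float loss = (bleuHope - bleuFear) - (modelHope - modelFear);
  if (loss <= 0.0f) return; // constraint satisfied, no update

  // squared norm of the hope-minus-fear feature difference
  float sqNorm = 0.0f;
  for (std::size_t i = 0; i < w.size(); ++i) {
    float d = hopeFeat[i] - fearFeat[i];
    sqNorm += d * d;
  }
  if (sqNorm == 0.0f) return;

  float alpha = std::min(C, loss / sqNorm); // analytical step size
  for (std::size_t i = 0; i < w.size(); ++i)
    w[i] += alpha * (hopeFeat[i] - fearFeat[i]);
}

int main() {
  std::vector<float> w(2, 0.0f);
  std::vector<float> hope(2), fear(2);
  hope[0] = 1.0f; hope[1] = 0.0f;
  fear[0] = 0.0f; fear[1] = 1.0f;
  miraUpdate(w, hope, fear, 0.5f, 0.2f, -3.0f, -2.9f, 0.01f);
  std::cout << w[0] << " " << w[1] << "\n"; // hope features up, fear down
  return 0;
}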
+
+ // sumStillViolatedConstraints += update_status;
+
+ if (update_status == 0) { // if weights were updated
+ // apply weight update
+ if (debug)
+ cerr << "Rank " << rank << ", epoch " << epoch << ", update: " << weightUpdate << endl;
+
+ if (feature_confidence) {
+ // update confidence counts based on weight update
+ confidenceCounts.UpdateConfidenceCounts(weightUpdate, signed_counts);
+
+ // update feature learning rates
+ featureLearningRates.UpdateLearningRates(decay_core, decay_sparse, confidenceCounts, core_r0, sparse_r0);
+ }
+
+ // apply weight update to Moses weights
+ mosesWeights.PlusEquals(weightUpdate);
+
+ if (normaliseWeights)
+ mosesWeights.L1Normalise();
+
+ cumulativeWeights.PlusEquals(mosesWeights);
+ if (sparseAverage) {
+ ScoreComponentCollection binary;
+ binary.SetToBinaryOf(mosesWeights);
+ cumulativeWeightsBinary.PlusEquals(binary);
+ }
+
+ ++numberOfUpdates;
+ ++numberOfUpdatesThisEpoch;
+ if (averageWeights) {
+ ScoreComponentCollection averageWeights(cumulativeWeights);
+ if (accumulateWeights) {
+ averageWeights.DivideEquals(numberOfUpdates);
+ } else {
+ averageWeights.DivideEquals(numberOfUpdatesThisEpoch);
+ }
+
+ mosesWeights = averageWeights;
+ }
+
+ // set new Moses weights
+ decoder->setWeights(mosesWeights);
+ //cerr << "Rank " << rank << ", epoch " << epoch << ", new weights: " << mosesWeights << endl;
+ }
+
+ // update history (for approximate document Bleu)
+ if (historyBleu || simpleHistoryBleu) {
+ for (size_t i = 0; i < oneBests.size(); ++i)
+ cerr << "Rank " << rank << ", epoch " << epoch << ", update history with 1best length: " << oneBests[i].size() << " ";
+ decoder->updateHistory(oneBests, inputLengths, ref_ids, rank, epoch);
+ deleteTranslations(oneBests);
+ }
} // END TRANSLATE AND UPDATE BATCH
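The update path above is an averaged perceptron/MIRA: after every accepted update the current weights are added into cumulativeWeights, and the averaged weights are that sum divided by the number of updates (per epoch, or overall when accumulateWeights is set). A minimal sketch of the bookkeeping:

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  std::vector<float> w(2, 0.0f), cumulative(2, 0.0f);
  std::size_t numberOfUpdates = 0;

  // three fake weight updates
  float steps[3][2] = {{0.1f, -0.2f}, {0.05f, 0.0f}, {-0.02f, 0.1f}};
  for (int t = 0; t < 3; ++t) {
    for (std::size_t i = 0; i < 2; ++i) w[i] += steps[t][i];   // apply update
    for (std::size_t i = 0; i < 2; ++i) cumulative[i] += w[i]; // accumulate
    ++numberOfUpdates;
  }

  // averaging damps the oscillation of the online updates
  for (std::size_t i = 0; i < 2; ++i)
    std::cout << "avg w[" << i << "] = " << cumulative[i] / numberOfUpdates
              << "\n";
  return 0;
}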
-
+
// size of all shards except for the last one
size_t generalShardSize;
if (trainWithMultipleFolds)
- generalShardSize = order.size()/coresPerFold;
+ generalShardSize = order.size()/coresPerFold;
else
- generalShardSize = order.size()/size;
-
+ generalShardSize = order.size()/size;
+
size_t mixing_base = mixingFrequency == 0 ? 0 : generalShardSize / mixingFrequency;
size_t dumping_base = weightDumpFrequency == 0 ? 0 : generalShardSize / weightDumpFrequency;
bool mix = evaluateModulo(shardPosition, mixing_base, actualBatchSize);
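A note on the schedule: mixing_base and dumping_base convert a per-epoch frequency into a stride over shard positions, and evaluateModulo then decides whether the batch that just finished crosses such a stride boundary. A plausible reading, stated as an assumption rather than the verified Moses implementation:

#include <cstddef>
#include <iostream>

// Assumed semantics: return true when any sentence position covered by the
// last batch (positions shardPosition-batchSize+1 .. shardPosition) is a
// multiple of the base, so the event fires roughly `frequency` times per
// shard regardless of batch size.
bool evaluateModuloSketch(std::size_t shardPosition, std::size_t base,
                          std::size_t batchSize) {
  if (base == 0) return false;
  std::size_t first =
      shardPosition + 1 >= batchSize ? shardPosition + 1 - batchSize : 0;
  for (std::size_t pos = first; pos <= shardPosition; ++pos)
    if (pos % base == 0) return true;
  return false;
}

int main() {
  // with base 5 and batches of 2, a batch fires whenever it covers a
  // multiple of 5 (including position 0)
  for (std::size_t p = 1; p <= 10; ++p)
    std::cout << p << ": " << evaluateModuloSketch(p, 5, 2) << "\n";
  return 0;
}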
-
+
// mix weights?
if (mix) {
#ifdef MPI_ENABLE
- cerr << "Rank " << rank << ", epoch " << epoch << ", mixing weights.. " << endl;
- // collect all weights in mixedWeights and divide by number of processes
- mpi::reduce(world, mosesWeights, mixedWeights, SCCPlus(), 0);
-
- // mix confidence counts
- //mpi::reduce(world, confidenceCounts, mixedConfidenceCounts, SCCPlus(), 0);
- ScoreComponentCollection totalBinary;
- if (sparseAverage) {
- ScoreComponentCollection binary;
- binary.SetToBinaryOf(mosesWeights);
- mpi::reduce(world, binary, totalBinary, SCCPlus(), 0);
- }
- if (rank == 0) {
- // divide by number of processes
- if (sparseNoAverage)
- mixedWeights.CoreDivideEquals(size); // average only core weights
- else if (sparseAverage)
- mixedWeights.DivideEquals(totalBinary);
- else
- mixedWeights.DivideEquals(size);
-
- // divide confidence counts
- //mixedConfidenceCounts.DivideEquals(size);
-
- // normalise weights after averaging
- if (normaliseWeights) {
- mixedWeights.L1Normalise();
- }
-
- ++weightMixingThisEpoch;
-
- if (pruneZeroWeights) {
- size_t pruned = mixedWeights.PruneZeroWeightFeatures();
- cerr << "Rank " << rank << ", epoch " << epoch << ", "
- << pruned << " zero-weighted features pruned from mixedWeights." << endl;
-
- pruned = cumulativeWeights.PruneZeroWeightFeatures();
- cerr << "Rank " << rank << ", epoch " << epoch << ", "
- << pruned << " zero-weighted features pruned from cumulativeWeights." << endl;
- }
-
- if (featureCutoff != -1 && weightMixingThisEpoch == mixingFrequency) {
- size_t pruned = mixedWeights.PruneSparseFeatures(featureCutoff);
- cerr << "Rank " << rank << ", epoch " << epoch << ", "
- << pruned << " features pruned from mixedWeights." << endl;
-
- pruned = cumulativeWeights.PruneSparseFeatures(featureCutoff);
- cerr << "Rank " << rank << ", epoch " << epoch << ", "
- << pruned << " features pruned from cumulativeWeights." << endl;
- }
-
- if (weightMixingThisEpoch == mixingFrequency || reg_on_every_mix) {
- if (l1_regularize) {
- size_t pruned;
- if (l1_reg_sparse)
- pruned = mixedWeights.SparseL1Regularize(l1_lambda);
- else
- pruned = mixedWeights.L1Regularize(l1_lambda);
- cerr << "Rank " << rank << ", epoch " << epoch << ", "
- << "l1-reg. on mixedWeights with lambda=" << l1_lambda << ", pruned: " << pruned << endl;
- }
- if (l2_regularize) {
- if (l2_reg_sparse)
- mixedWeights.SparseL2Regularize(l2_lambda);
- else
- mixedWeights.L2Regularize(l2_lambda);
- cerr << "Rank " << rank << ", epoch " << epoch << ", "
- << "l2-reg. on mixedWeights with lambda=" << l2_lambda << endl;
- }
- }
- }
-
- // broadcast average weights from process 0
- mpi::broadcast(world, mixedWeights, 0);
- decoder->setWeights(mixedWeights);
- mosesWeights = mixedWeights;
-
- // broadcast summed confidence counts
- //mpi::broadcast(world, mixedConfidenceCounts, 0);
- //confidenceCounts = mixedConfidenceCounts;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", mixing weights.. " << endl;
+ // collect all weights in mixedWeights and divide by number of processes
+ mpi::reduce(world, mosesWeights, mixedWeights, SCCPlus(), 0);
+
+ // mix confidence counts
+ //mpi::reduce(world, confidenceCounts, mixedConfidenceCounts, SCCPlus(), 0);
+ ScoreComponentCollection totalBinary;
+ if (sparseAverage) {
+ ScoreComponentCollection binary;
+ binary.SetToBinaryOf(mosesWeights);
+ mpi::reduce(world, binary, totalBinary, SCCPlus(), 0);
+ }
+ if (rank == 0) {
+ // divide by number of processes
+ if (sparseNoAverage)
+ mixedWeights.CoreDivideEquals(size); // average only core weights
+ else if (sparseAverage)
+ mixedWeights.DivideEquals(totalBinary);
+ else
+ mixedWeights.DivideEquals(size);
+
+ // divide confidence counts
+ //mixedConfidenceCounts.DivideEquals(size);
+
+ // normalise weights after averaging
+ if (normaliseWeights) {
+ mixedWeights.L1Normalise();
+ }
+
+ ++weightMixingThisEpoch;
+
+ if (pruneZeroWeights) {
+ size_t pruned = mixedWeights.PruneZeroWeightFeatures();
+ cerr << "Rank " << rank << ", epoch " << epoch << ", "
+ << pruned << " zero-weighted features pruned from mixedWeights." << endl;
+
+ pruned = cumulativeWeights.PruneZeroWeightFeatures();
+ cerr << "Rank " << rank << ", epoch " << epoch << ", "
+ << pruned << " zero-weighted features pruned from cumulativeWeights." << endl;
+ }
+
+ if (featureCutoff != -1 && weightMixingThisEpoch == mixingFrequency) {
+ size_t pruned = mixedWeights.PruneSparseFeatures(featureCutoff);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", "
+ << pruned << " features pruned from mixedWeights." << endl;
+
+ pruned = cumulativeWeights.PruneSparseFeatures(featureCutoff);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", "
+ << pruned << " features pruned from cumulativeWeights." << endl;
+ }
+
+ if (weightMixingThisEpoch == mixingFrequency || reg_on_every_mix) {
+ if (l1_regularize) {
+ size_t pruned;
+ if (l1_reg_sparse)
+ pruned = mixedWeights.SparseL1Regularize(l1_lambda);
+ else
+ pruned = mixedWeights.L1Regularize(l1_lambda);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", "
+ << "l1-reg. on mixedWeights with lambda=" << l1_lambda << ", pruned: " << pruned << endl;
+ }
+ if (l2_regularize) {
+ if (l2_reg_sparse)
+ mixedWeights.SparseL2Regularize(l2_lambda);
+ else
+ mixedWeights.L2Regularize(l2_lambda);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", "
+ << "l2-reg. on mixedWeights with lambda=" << l2_lambda << endl;
+ }
+ }
+ }
+
+ // broadcast average weights from process 0
+ mpi::broadcast(world, mixedWeights, 0);
+ decoder->setWeights(mixedWeights);
+ mosesWeights = mixedWeights;
+
+ // broadcast summed confidence counts
+ //mpi::broadcast(world, mixedConfidenceCounts, 0);
+ //confidenceCounts = mixedConfidenceCounts;
#endif
#ifndef MPI_ENABLE
- //cerr << "\nRank " << rank << ", no mixing, weights: " << mosesWeights << endl;
- mixedWeights = mosesWeights;
+ //cerr << "\nRank " << rank << ", no mixing, weights: " << mosesWeights << endl;
+ mixedWeights = mosesWeights;
#endif
} // end mixing
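The mixing step is a reduce / divide / broadcast pattern: rank 0 collects the element-wise sum of all processes' weights, averages them, and sends the result back. The same pattern on a plain std::vector<float> with Boost.MPI (the real code reduces a ScoreComponentCollection with Moses' SCCPlus functor):

#include <boost/mpi.hpp>
#include <cstddef>
#include <functional>
#include <iostream>
#include <vector>

namespace mpi = boost::mpi;

int main(int argc, char *argv[]) {
  mpi::environment env(argc, argv);
  mpi::communicator world;

  // every process contributes its own weight vector
  std::vector<float> w(2, static_cast<float>(world.rank() + 1));
  std::vector<float> mixed(w.size(), 0.0f);

  // element-wise sum of all processes' weights, collected on rank 0
  mpi::reduce(world, &w[0], static_cast<int>(w.size()), &mixed[0],
              std::plus<float>(), 0);

  if (world.rank() == 0)
    for (std::size_t i = 0; i < mixed.size(); ++i)
      mixed[i] /= world.size(); // average across processes

  // every process continues with the same averaged weights
  mpi::broadcast(world, &mixed[0], static_cast<int>(mixed.size()), 0);
  w = mixed;

  std::cout << "rank " << world.rank() << ": w[0] = " << w[0] << std::endl;
  return 0;
}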
-
+
// Dump weights?
if (trainWithMultipleFolds || weightEpochDump == weightDumpFrequency) {
- // dump mixed weights at the end of every epoch so that a crashed experiment can be resumed
- // (for jackknifing: every time the weights are mixed)
- ostringstream filename;
- if (epoch < 10)
- filename << weightDumpStem << "_mixed_0" << epoch;
- else
- filename << weightDumpStem << "_mixed_" << epoch;
-
- if (weightDumpFrequency > 1)
- filename << "_" << weightEpochDump;
-
- mixedWeights.Save(filename.str());
- cerr << "Dumping mixed weights during epoch " << epoch << " to " << filename.str() << endl << endl;
+ // dump mixed weights at the end of every epoch so that a crashed experiment can be resumed
+ // (for jackknifing: every time the weights are mixed)
+ ostringstream filename;
+ if (epoch < 10)
+ filename << weightDumpStem << "_mixed_0" << epoch;
+ else
+ filename << weightDumpStem << "_mixed_" << epoch;
+
+ if (weightDumpFrequency > 1)
+ filename << "_" << weightEpochDump;
+
+ mixedWeights.Save(filename.str());
+ cerr << "Dumping mixed weights during epoch " << epoch << " to " << filename.str() << endl << endl;
}
if (dumpMixedWeights) {
- if (mix && rank == 0 && !weightDumpStem.empty()) {
- // dump mixed weights instead of average weights
- ostringstream filename;
- if (epoch < 10)
- filename << weightDumpStem << "_0" << epoch;
- else
- filename << weightDumpStem << "_" << epoch;
-
- if (weightDumpFrequency > 1)
- filename << "_" << weightEpochDump;
-
- cerr << "Dumping mixed weights during epoch " << epoch << " to " << filename.str() << endl << endl;
- mixedWeights.Save(filename.str());
- ++weightEpochDump;
- }
- }
- else {
- if (evaluateModulo(shardPosition, dumping_base, actualBatchSize)) {
- cerr << "Rank " << rank << ", epoch " << epoch << ", dump weights.. (pos: " << shardPosition << ", base: " << dumping_base << ")" << endl;
- ScoreComponentCollection tmpAverageWeights(cumulativeWeights);
- bool proceed = false;
- if (accumulateWeights) {
- if (numberOfUpdates > 0) {
- tmpAverageWeights.DivideEquals(numberOfUpdates);
- proceed = true;
- }
- } else {
- if (numberOfUpdatesThisEpoch > 0) {
- if (sparseNoAverage) // average only core weights
- tmpAverageWeights.CoreDivideEquals(numberOfUpdatesThisEpoch);
- else if (sparseAverage)
- tmpAverageWeights.DivideEquals(cumulativeWeightsBinary);
- else
- tmpAverageWeights.DivideEquals(numberOfUpdatesThisEpoch);
- proceed = true;
- }
- }
-
- if (proceed) {
+ if (mix && rank == 0 && !weightDumpStem.empty()) {
+ // dump mixed weights instead of average weights
+ ostringstream filename;
+ if (epoch < 10)
+ filename << weightDumpStem << "_0" << epoch;
+ else
+ filename << weightDumpStem << "_" << epoch;
+
+ if (weightDumpFrequency > 1)
+ filename << "_" << weightEpochDump;
+
+ cerr << "Dumping mixed weights during epoch " << epoch << " to " << filename.str() << endl << endl;
+ mixedWeights.Save(filename.str());
+ ++weightEpochDump;
+ }
+ } else {
+ if (evaluateModulo(shardPosition, dumping_base, actualBatchSize)) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", dump weights.. (pos: " << shardPosition << ", base: " << dumping_base << ")" << endl;
+ ScoreComponentCollection tmpAverageWeights(cumulativeWeights);
+ bool proceed = false;
+ if (accumulateWeights) {
+ if (numberOfUpdates > 0) {
+ tmpAverageWeights.DivideEquals(numberOfUpdates);
+ proceed = true;
+ }
+ } else {
+ if (numberOfUpdatesThisEpoch > 0) {
+ if (sparseNoAverage) // average only core weights
+ tmpAverageWeights.CoreDivideEquals(numberOfUpdatesThisEpoch);
+ else if (sparseAverage)
+ tmpAverageWeights.DivideEquals(cumulativeWeightsBinary);
+ else
+ tmpAverageWeights.DivideEquals(numberOfUpdatesThisEpoch);
+ proceed = true;
+ }
+ }
+
+ if (proceed) {
#ifdef MPI_ENABLE
- // average across processes
- mpi::reduce(world, tmpAverageWeights, mixedAverageWeights, SCCPlus(), 0);
- ScoreComponentCollection totalBinary;
- if (sparseAverage) {
- ScoreComponentCollection binary;
- binary.SetToBinaryOf(mosesWeights);
- mpi::reduce(world, binary, totalBinary, SCCPlus(), 0);
- }
+ // average across processes
+ mpi::reduce(world, tmpAverageWeights, mixedAverageWeights, SCCPlus(), 0);
+ ScoreComponentCollection totalBinary;
+ if (sparseAverage) {
+ ScoreComponentCollection binary;
+ binary.SetToBinaryOf(mosesWeights);
+ mpi::reduce(world, binary, totalBinary, SCCPlus(), 0);
+ }
#endif
#ifndef MPI_ENABLE
- mixedAverageWeights = tmpAverageWeights;
- //FIXME: What do to for non-mpi version
- ScoreComponentCollection totalBinary;
+ mixedAverageWeights = tmpAverageWeights;
+      //FIXME: What to do for the non-MPI version?
+ ScoreComponentCollection totalBinary;
#endif
- if (rank == 0 && !weightDumpStem.empty()) {
- // divide by number of processes
- if (sparseNoAverage)
- mixedAverageWeights.CoreDivideEquals(size); // average only core weights
- else if (sparseAverage)
- mixedAverageWeights.DivideEquals(totalBinary);
- else
- mixedAverageWeights.DivideEquals(size);
-
- // normalise weights after averaging
- if (normaliseWeights) {
- mixedAverageWeights.L1Normalise();
- }
-
- // dump final average weights
- ostringstream filename;
- if (epoch < 10) {
- filename << weightDumpStem << "_0" << epoch;
- } else {
- filename << weightDumpStem << "_" << epoch;
- }
-
- if (weightDumpFrequency > 1) {
- filename << "_" << weightEpochDump;
- }
-
- /*if (accumulateWeights) {
- cerr << "\nMixed average weights (cumulative) during epoch " << epoch << ": " << mixedAverageWeights << endl;
- } else {
- cerr << "\nMixed average weights during epoch " << epoch << ": " << mixedAverageWeights << endl;
- }*/
-
- cerr << "Dumping mixed average weights during epoch " << epoch << " to " << filename.str() << endl << endl;
- mixedAverageWeights.Save(filename.str());
- ++weightEpochDump;
-
- if (weightEpochDump == weightDumpFrequency) {
- if (l1_regularize) {
- size_t pruned = mixedAverageWeights.SparseL1Regularize(l1_lambda);
- cerr << "Rank " << rank << ", epoch " << epoch << ", "
- << "l1-reg. on mixedAverageWeights with lambda=" << l1_lambda << ", pruned: " << pruned << endl;
-
- }
- if (l2_regularize) {
- mixedAverageWeights.SparseL2Regularize(l2_lambda);
- cerr << "Rank " << rank << ", epoch " << epoch << ", "
- << "l2-reg. on mixedAverageWeights with lambda=" << l2_lambda << endl;
- }
-
- if (l1_regularize || l2_regularize) {
- filename << "_reg";
- cerr << "Dumping regularized mixed average weights during epoch " << epoch << " to " << filename.str() << endl << endl;
- mixedAverageWeights.Save(filename.str());
- }
- }
-
- if (weightEpochDump == weightDumpFrequency && printFeatureCounts) {
- // print out all features with counts
- stringstream s1, s2;
- s1 << "sparse_feature_hope_counts" << "_" << epoch;
- s2 << "sparse_feature_fear_counts" << "_" << epoch;
- ofstream sparseFeatureCountsHope(s1.str().c_str());
- ofstream sparseFeatureCountsFear(s2.str().c_str());
-
- mixedAverageWeights.PrintSparseHopeFeatureCounts(sparseFeatureCountsHope);
- mixedAverageWeights.PrintSparseFearFeatureCounts(sparseFeatureCountsFear);
- sparseFeatureCountsHope.close();
- sparseFeatureCountsFear.close();
- }
- }
- }
- }// end dumping
+ if (rank == 0 && !weightDumpStem.empty()) {
+ // divide by number of processes
+ if (sparseNoAverage)
+ mixedAverageWeights.CoreDivideEquals(size); // average only core weights
+ else if (sparseAverage)
+ mixedAverageWeights.DivideEquals(totalBinary);
+ else
+ mixedAverageWeights.DivideEquals(size);
+
+ // normalise weights after averaging
+ if (normaliseWeights) {
+ mixedAverageWeights.L1Normalise();
+ }
+
+ // dump final average weights
+ ostringstream filename;
+ if (epoch < 10) {
+ filename << weightDumpStem << "_0" << epoch;
+ } else {
+ filename << weightDumpStem << "_" << epoch;
+ }
+
+ if (weightDumpFrequency > 1) {
+ filename << "_" << weightEpochDump;
+ }
+
+ /*if (accumulateWeights) {
+ cerr << "\nMixed average weights (cumulative) during epoch " << epoch << ": " << mixedAverageWeights << endl;
+ } else {
+ cerr << "\nMixed average weights during epoch " << epoch << ": " << mixedAverageWeights << endl;
+ }*/
+
+ cerr << "Dumping mixed average weights during epoch " << epoch << " to " << filename.str() << endl << endl;
+ mixedAverageWeights.Save(filename.str());
+ ++weightEpochDump;
+
+ if (weightEpochDump == weightDumpFrequency) {
+ if (l1_regularize) {
+ size_t pruned = mixedAverageWeights.SparseL1Regularize(l1_lambda);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", "
+ << "l1-reg. on mixedAverageWeights with lambda=" << l1_lambda << ", pruned: " << pruned << endl;
+
+ }
+ if (l2_regularize) {
+ mixedAverageWeights.SparseL2Regularize(l2_lambda);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", "
+ << "l2-reg. on mixedAverageWeights with lambda=" << l2_lambda << endl;
+ }
+
+ if (l1_regularize || l2_regularize) {
+ filename << "_reg";
+ cerr << "Dumping regularized mixed average weights during epoch " << epoch << " to " << filename.str() << endl << endl;
+ mixedAverageWeights.Save(filename.str());
+ }
+ }
+
+ if (weightEpochDump == weightDumpFrequency && printFeatureCounts) {
+ // print out all features with counts
+ stringstream s1, s2;
+ s1 << "sparse_feature_hope_counts" << "_" << epoch;
+ s2 << "sparse_feature_fear_counts" << "_" << epoch;
+ ofstream sparseFeatureCountsHope(s1.str().c_str());
+ ofstream sparseFeatureCountsFear(s2.str().c_str());
+
+ mixedAverageWeights.PrintSparseHopeFeatureCounts(sparseFeatureCountsHope);
+ mixedAverageWeights.PrintSparseFearFeatureCounts(sparseFeatureCountsFear);
+ sparseFeatureCountsHope.close();
+ sparseFeatureCountsFear.close();
+ }
+ }
+ }
+ }// end dumping
} // end if dump
} // end of shard loop, end of this epoch
cerr << "Rank " << rank << ", epoch " << epoch << ", end of epoch.." << endl;
-
+
if (historyBleu || simpleHistoryBleu) {
cerr << "Bleu feature history after epoch " << epoch << endl;
decoder->printBleuFeatureHistory(cerr);
}
// cerr << "Rank " << rank << ", epoch " << epoch << ", sum of violated constraints: " << sumStillViolatedConstraints << endl;
-
+
// Check whether there were any weight updates during this epoch
size_t sumUpdates;
size_t *sendbuf_uint, *recvbuf_uint;
@@ -1643,63 +1615,62 @@ int main(int argc, char** argv) {
mpi::broadcast(world, stop, 0);
#endif
}
-
+
if (!stop) {
// Test if weights have converged
if (weightConvergence) {
- bool reached = true;
- if (rank == 0 && (epoch >= 2)) {
- ScoreComponentCollection firstDiff, secondDiff;
- if (dumpMixedWeights) {
- firstDiff = mixedWeights;
- firstDiff.MinusEquals(mixedWeightsPrevious);
- secondDiff = mixedWeights;
- secondDiff.MinusEquals(mixedWeightsBeforePrevious);
- }
- else {
- firstDiff = mixedAverageWeights;
- firstDiff.MinusEquals(mixedAverageWeightsPrevious);
- secondDiff = mixedAverageWeights;
- secondDiff.MinusEquals(mixedAverageWeightsBeforePrevious);
- }
- VERBOSE(1, "Average weight changes since previous epoch: " << firstDiff << " (max: " << firstDiff.GetLInfNorm() << ")" << endl);
- VERBOSE(1, "Average weight changes since before previous epoch: " << secondDiff << " (max: " << secondDiff.GetLInfNorm() << ")" << endl << endl);
-
- // check whether stopping criterion has been reached
- // (both difference vectors must have all weight changes smaller than min_weight_change)
- if (firstDiff.GetLInfNorm() >= min_weight_change)
- reached = false;
- if (secondDiff.GetLInfNorm() >= min_weight_change)
- reached = false;
- if (reached) {
- // stop MIRA
- stop = true;
- cerr << "\nWeights have converged after epoch " << epoch << ".. stopping MIRA." << endl;
- ScoreComponentCollection dummy;
- ostringstream endfilename;
- endfilename << "stopping";
- dummy.Save(endfilename.str());
- }
- }
-
- mixedWeightsBeforePrevious = mixedWeightsPrevious;
- mixedWeightsPrevious = mixedWeights;
- mixedAverageWeightsBeforePrevious = mixedAverageWeightsPrevious;
- mixedAverageWeightsPrevious = mixedAverageWeights;
+ bool reached = true;
+ if (rank == 0 && (epoch >= 2)) {
+ ScoreComponentCollection firstDiff, secondDiff;
+ if (dumpMixedWeights) {
+ firstDiff = mixedWeights;
+ firstDiff.MinusEquals(mixedWeightsPrevious);
+ secondDiff = mixedWeights;
+ secondDiff.MinusEquals(mixedWeightsBeforePrevious);
+ } else {
+ firstDiff = mixedAverageWeights;
+ firstDiff.MinusEquals(mixedAverageWeightsPrevious);
+ secondDiff = mixedAverageWeights;
+ secondDiff.MinusEquals(mixedAverageWeightsBeforePrevious);
+ }
+ VERBOSE(1, "Average weight changes since previous epoch: " << firstDiff << " (max: " << firstDiff.GetLInfNorm() << ")" << endl);
+ VERBOSE(1, "Average weight changes since before previous epoch: " << secondDiff << " (max: " << secondDiff.GetLInfNorm() << ")" << endl << endl);
+
+ // check whether stopping criterion has been reached
+ // (both difference vectors must have all weight changes smaller than min_weight_change)
+ if (firstDiff.GetLInfNorm() >= min_weight_change)
+ reached = false;
+ if (secondDiff.GetLInfNorm() >= min_weight_change)
+ reached = false;
+ if (reached) {
+ // stop MIRA
+ stop = true;
+ cerr << "\nWeights have converged after epoch " << epoch << ".. stopping MIRA." << endl;
+ ScoreComponentCollection dummy;
+ ostringstream endfilename;
+ endfilename << "stopping";
+ dummy.Save(endfilename.str());
+ }
+ }
+
+ mixedWeightsBeforePrevious = mixedWeightsPrevious;
+ mixedWeightsPrevious = mixedWeights;
+ mixedAverageWeightsBeforePrevious = mixedAverageWeightsPrevious;
+ mixedAverageWeightsPrevious = mixedAverageWeights;
#ifdef MPI_ENABLE
- mpi::broadcast(world, stop, 0);
+ mpi::broadcast(world, stop, 0);
#endif
} //end if (weightConvergence)
}
} // end of epoch loop
-
+
#ifdef MPI_ENABLE
MPI_Finalize();
#endif
-
+
time(&now);
cerr << "Rank " << rank << ", " << ctime(&now);
-
+
if (rank == 0) {
ScoreComponentCollection dummy;
ostringstream endfilename;
@@ -1711,7 +1682,8 @@ int main(int argc, char** argv) {
exit(0);
}
-bool loadSentences(const string& filename, vector<string>& sentences) {
+bool loadSentences(const string& filename, vector<string>& sentences)
+{
ifstream in(filename.c_str());
if (!in)
return false;
@@ -1721,27 +1693,28 @@ bool loadSentences(const string& filename, vector<string>& sentences) {
return true;
}
-bool evaluateModulo(size_t shard_position, size_t mix_or_dump_base, size_t actual_batch_size) {
+bool evaluateModulo(size_t shard_position, size_t mix_or_dump_base, size_t actual_batch_size)
+{
  if (mix_or_dump_base == 0) return false;
if (actual_batch_size > 1) {
bool mix_or_dump = false;
size_t numberSubtracts = actual_batch_size;
do {
if (shard_position % mix_or_dump_base == 0) {
- mix_or_dump = true;
- break;
+ mix_or_dump = true;
+ break;
}
--shard_position;
--numberSubtracts;
} while (numberSubtracts > 0);
return mix_or_dump;
- }
- else {
+ } else {
return ((shard_position % mix_or_dump_base) == 0);
}
}
-void printFeatureValues(vector<vector<ScoreComponentCollection> > &featureValues) {
+void printFeatureValues(vector<vector<ScoreComponentCollection> > &featureValues)
+{
for (size_t i = 0; i < featureValues.size(); ++i) {
for (size_t j = 0; j < featureValues[i].size(); ++j) {
cerr << featureValues[i][j] << endl;
@@ -1750,7 +1723,8 @@ void printFeatureValues(vector<vector<ScoreComponentCollection> > &featureValues
cerr << endl;
}
-void deleteTranslations(vector<vector<const Word*> > &translations) {
+void deleteTranslations(vector<vector<const Word*> > &translations)
+{
for (size_t i = 0; i < translations.size(); ++i) {
for (size_t j = 0; j < translations[i].size(); ++j) {
delete translations[i][j];
@@ -1758,19 +1732,20 @@ void deleteTranslations(vector<vector<const Word*> > &translations) {
}
}
-void decodeHopeOrFear(size_t rank, size_t size, size_t decode, string filename, vector<string> &inputSentences, MosesDecoder* decoder, size_t n, float bleuWeight) {
+void decodeHopeOrFear(size_t rank, size_t size, size_t decode, string filename, vector<string> &inputSentences, MosesDecoder* decoder, size_t n, float bleuWeight)
+{
if (decode == 1)
cerr << "Rank " << rank << ", decoding dev input set according to hope objective.. " << endl;
else if (decode == 2)
cerr << "Rank " << rank << ", decoding dev input set according to fear objective.. " << endl;
else
cerr << "Rank " << rank << ", decoding dev input set according to normal objective.. " << endl;
-
+
// Create shards according to the number of processes used
vector<size_t> order;
for (size_t i = 0; i < inputSentences.size(); ++i)
order.push_back(i);
-
+
vector<size_t> shard;
float shardSize = (float) (order.size()) / size;
size_t shardStart = (size_t) (shardSize * rank);
@@ -1783,7 +1758,7 @@ void decodeHopeOrFear(size_t rank, size_t size, size_t decode, string filename,
VERBOSE(1, "Rank " << rank << ", shard size: " << shardSize << endl);
shard.resize(shardSize);
copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
-
+
// open files for writing
stringstream fname;
fname << filename << ".rank" << rank;
@@ -1802,76 +1777,79 @@ void decodeHopeOrFear(size_t rank, size_t size, size_t decode, string filename,
msg << "Unable to open " << filename_nbest;
throw runtime_error(msg.str());
}
-
+
for (size_t i = 0; i < shard.size(); ++i) {
size_t sid = shard[i];
string& input = inputSentences[sid];
-
+
vector<vector<ScoreComponentCollection> > dummyFeatureValues;
vector<vector<float> > dummyBleuScores;
vector<vector<float> > dummyModelScores;
-
+
vector<ScoreComponentCollection> newFeatureValues;
vector<float> newScores;
dummyFeatureValues.push_back(newFeatureValues);
dummyBleuScores.push_back(newScores);
dummyModelScores.push_back(newScores);
-
+
float factor = 0.0;
if (decode == 1) factor = 1.0;
if (decode == 2) factor = -1.0;
cerr << "Rank " << rank << ", translating sentence " << sid << endl;
bool realBleu = false;
vector< vector<const Word*> > nbestOutput = decoder->getNBest(input, sid, n, factor, bleuWeight, dummyFeatureValues[0],
- dummyBleuScores[0], dummyModelScores[0], n, realBleu, true, false, rank, 0, "");
+ dummyBleuScores[0], dummyModelScores[0], n, realBleu, true, false, rank, 0, "");
cerr << endl;
decoder->cleanup(StaticData::Instance().GetSearchAlgorithm() == ChartDecoding);
-
+
for (size_t i = 0; i < nbestOutput.size(); ++i) {
vector<const Word*> output = nbestOutput[i];
stringstream translation;
for (size_t k = 0; k < output.size(); ++k) {
- Word* w = const_cast<Word*>(output[k]);
- translation << w->GetString(0);
- translation << " ";
+ Word* w = const_cast<Word*>(output[k]);
+ translation << w->GetString(0);
+ translation << " ";
}
-
+
if (i == 0)
- out << translation.str() << endl;
+ out << translation.str() << endl;
nbest_out << sid << " ||| " << translation.str() << " ||| " << dummyFeatureValues[0][i] <<
- " ||| " << dummyModelScores[0][i] << " ||| sBleu=" << dummyBleuScores[0][i] << endl;
+ " ||| " << dummyModelScores[0][i] << " ||| sBleu=" << dummyBleuScores[0][i] << endl;
}
}
-
+
out.close();
nbest_out.close();
cerr << "Closing files " << filename << " and " << filename_nbest.str() << endl;
-
+
#ifdef MPI_ENABLE
MPI_Finalize();
#endif
-
+
time_t now;
time(&now);
cerr << "Rank " << rank << ", " << ctime(&now);
-
+
delete decoder;
exit(0);
}
-void applyLearningRates(vector<vector<ScoreComponentCollection> > &featureValues, float core_r0, float sparse_r0) {
+void applyLearningRates(vector<vector<ScoreComponentCollection> > &featureValues, float core_r0, float sparse_r0)
+{
for (size_t i=0; i<featureValues.size(); ++i) // each item in batch
for (size_t j=0; j<featureValues[i].size(); ++j) // each item in nbest
featureValues[i][j].MultiplyEquals(core_r0, sparse_r0);
}
-void applyPerFeatureLearningRates(vector<vector<ScoreComponentCollection> > &featureValues, ScoreComponentCollection featureLearningRates, float sparse_r0) {
+void applyPerFeatureLearningRates(vector<vector<ScoreComponentCollection> > &featureValues, ScoreComponentCollection featureLearningRates, float sparse_r0)
+{
for (size_t i=0; i<featureValues.size(); ++i) // each item in batch
for (size_t j=0; j<featureValues[i].size(); ++j) // each item in nbest
featureValues[i][j].MultiplyEqualsBackoff(featureLearningRates, sparse_r0);
}
-void scaleFeatureScore(const FeatureFunction *sp, float scaling_factor, vector<vector<ScoreComponentCollection> > &featureValues, size_t rank, size_t epoch) {
+void scaleFeatureScore(const FeatureFunction *sp, float scaling_factor, vector<vector<ScoreComponentCollection> > &featureValues, size_t rank, size_t epoch)
+{
string name = sp->GetScoreProducerDescription();
// scale down score
@@ -1885,7 +1863,8 @@ void scaleFeatureScore(const FeatureFunction *sp, float scaling_factor, vector<v
}
}
-void scaleFeatureScores(const FeatureFunction *sp, float scaling_factor, vector<vector<ScoreComponentCollection> > &featureValues, size_t rank, size_t epoch) {
+void scaleFeatureScores(const FeatureFunction *sp, float scaling_factor, vector<vector<ScoreComponentCollection> > &featureValues, size_t rank, size_t epoch)
+{
string name = sp->GetScoreProducerDescription();
// scale down score
@@ -1893,7 +1872,7 @@ void scaleFeatureScores(const FeatureFunction *sp, float scaling_factor, vector<
for (size_t j=0; j<featureValues[i].size(); ++j) { // each item in nbest
vector<float> featureScores = featureValues[i][j].GetScoresForProducer(sp);
for (size_t k=0; k<featureScores.size(); ++k)
- featureScores[k] *= scaling_factor;
+ featureScores[k] *= scaling_factor;
featureValues[i][j].Assign(sp, featureScores);
//cerr << "Rank " << rank << ", epoch " << epoch << ", " << name << " score scaled from " << featureScore << " to " << featureScore/scaling_factor << endl;
}
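
Note on the control flow reformatted above: evaluateModulo decides whether any position covered by the current batch hits a mix/dump boundary. A minimal standalone sketch of that decision, assuming the same semantics as the diff (hitsBase and the main() driver are illustrative names, not part of the source):

#include <cstddef>
#include <iostream>

// Restates the evaluateModulo logic from mira/Main.cpp: with batching, the
// check succeeds if any of the actual_batch_size positions ending at
// shard_position is a multiple of mix_or_dump_base.
static bool hitsBase(std::size_t shard_position, std::size_t mix_or_dump_base,
                     std::size_t actual_batch_size) {
  if (mix_or_dump_base == 0) return false;
  if (actual_batch_size > 1) {
    std::size_t remaining = actual_batch_size;
    do {
      if (shard_position % mix_or_dump_base == 0) return true;
      --shard_position;
      --remaining;
    } while (remaining > 0);
    return false;
  }
  return shard_position % mix_or_dump_base == 0;
}

int main() {
  // With base 4 and batch size 2, positions 0, 1, 4 and 5 trigger a mix/dump.
  for (std::size_t pos = 0; pos < 8; ++pos)
    std::cout << pos << " -> " << hitsBase(pos, 4, 2) << '\n';
}
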
diff --git a/mira/Main.h b/mira/Main.h
index 6ba375c2c..8736257f6 100644
--- a/mira/Main.h
+++ b/mira/Main.h
@@ -36,10 +36,10 @@ template <class T> bool from_string(T& t, const std::string& s, std::ios_base& (
}
struct RandomIndex {
- ptrdiff_t operator()(ptrdiff_t max) {
- srand(time(0)); // Initialize random number generator with current time.
- return static_cast<ptrdiff_t> (rand() % max);
- }
+ ptrdiff_t operator()(ptrdiff_t max) {
+ srand(time(0)); // Initialize random number generator with current time.
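+    // NB: re-seeding with time(0) on every call restarts rand() from the same state within a given second, so repeated draws can return identical indices.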
+ return static_cast<ptrdiff_t> (rand() % max);
+ }
};
//void OutputNBestList(const MosesChart::TrellisPathList &nBestList, const TranslationSystem* system, long translationId);
@@ -50,7 +50,7 @@ void ignoreCoreFeatures(std::vector<std::vector<Moses::ScoreComponentCollection>
void takeLogs(std::vector<std::vector<Moses::ScoreComponentCollection> > &featureValues, size_t base);
void deleteTranslations(std::vector<std::vector<const Moses::Word*> > &translations);
void decodeHopeOrFear(size_t rank, size_t size, size_t decode, std::string decode_filename, std::vector<std::string> &inputSentences, Mira::MosesDecoder* decoder, size_t n, float bleuWeight);
-void applyLearningRates(std::vector<std::vector<Moses::ScoreComponentCollection> > &featureValues, float core_r0, float sparse_r0);
+void applyLearningRates(std::vector<std::vector<Moses::ScoreComponentCollection> > &featureValues, float core_r0, float sparse_r0);
void applyPerFeatureLearningRates(std::vector<std::vector<Moses::ScoreComponentCollection> > &featureValues, Moses::ScoreComponentCollection featureLearningRates, float sparse_r0);
void scaleFeatureScore(const Moses::FeatureFunction *sp, float scaling_factor, std::vector<std::vector<Moses::ScoreComponentCollection> > &featureValues, size_t rank, size_t epoch);
void scaleFeatureScores(const Moses::FeatureFunction *sp, float scaling_factor, std::vector<std::vector<Moses::ScoreComponentCollection> > &featureValues, size_t rank, size_t epoch);
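
The MiraOptimiser.cpp hunks below are again largely re-indentation, but they carry the core update rule: w' = w + SUM_k alpha_k * (h(hope_k) - h(fear_k)), with the alphas computed by Hildreth's method for a batch of constraints, or analytically for a single constraint as alpha = (loss - modelScoreDiff) / ||h(hope) - h(fear)||^2, clipped to [-slack, slack]. A hedged sketch of that analytical step, using plain float vectors as stand-ins for ScoreComponentCollection (miraAnalyticalStep is an illustrative name, not from the source):

#include <algorithm>
#include <cstddef>
#include <vector>

// Sketch of the clipped analytical MIRA step (cf. updateWeightsAnalytically):
// applies w' = w + alpha * (hope - fear) when the margin constraint is violated.
std::vector<float> miraAnalyticalStep(std::vector<float> weights,
                                      const std::vector<float>& hope,
                                      const std::vector<float>& fear,
                                      float loss,            // bleu(hope) - bleu(fear)
                                      float modelScoreDiff,  // model(hope) - model(fear)
                                      float slack) {
  const float epsilon = 0.0001f;  // same tolerance as the diff
  std::vector<float> diff(weights.size());
  float squaredNorm = 0.0f;
  for (std::size_t k = 0; k < weights.size(); ++k) {
    diff[k] = hope[k] - fear[k];
    squaredNorm += diff[k] * diff[k];
  }
  float violation = loss - modelScoreDiff;      // positive when violated
  if (violation <= epsilon || squaredNorm == 0.0f)
    return weights;                             // constraint already satisfied
  float alpha = violation / squaredNorm;
  if (slack > 0.0f)                             // clip to [-slack, slack]
    alpha = std::min(slack, std::max(-slack, alpha));
  for (std::size_t k = 0; k < weights.size(); ++k)
    weights[k] += alpha * diff[k];
  return weights;
}

int main() {
  std::vector<float> w(2, 0.0f), hope{1.0f, 0.0f}, fear{0.0f, 1.0f};
  // violation 0.5 over ||hope - fear||^2 = 2 gives alpha 0.25, clipped to 0.1
  w = miraAnalyticalStep(w, hope, fear, 0.5f, 0.0f, 0.1f);
}

Margin scaling (m_scale_margin), sigmoid normalisation (m_normaliseMargin) and boosting (m_boost) in the hunks below all adjust this same violation or alpha before the weights are updated.
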
diff --git a/mira/MiraOptimiser.cpp b/mira/MiraOptimiser.cpp
index 4cc7f3fc3..82e9d85fb 100644
--- a/mira/MiraOptimiser.cpp
+++ b/mira/MiraOptimiser.cpp
@@ -5,234 +5,236 @@
using namespace Moses;
using namespace std;
-namespace Mira {
+namespace Mira
+{
size_t MiraOptimiser::updateWeights(
- ScoreComponentCollection& weightUpdate,
- const vector<vector<ScoreComponentCollection> >& featureValues,
- const vector<vector<float> >& losses,
- const vector<vector<float> >& bleuScores,
- const vector<vector<float> >& modelScores,
- const vector<ScoreComponentCollection>& oracleFeatureValues,
- const vector<float> oracleBleuScores,
- const vector<float> oracleModelScores,
- float learning_rate,
- size_t rank,
- size_t epoch) {
-
- // vector of feature values differences for all created constraints
- vector<ScoreComponentCollection> featureValueDiffs;
- vector<float> lossMinusModelScoreDiffs;
- vector<float> all_losses;
-
- // most violated constraint in batch
- ScoreComponentCollection max_batch_featureValueDiff;
-
- // Make constraints for new hypothesis translations
- float epsilon = 0.0001;
- int violatedConstraintsBefore = 0;
- float oldDistanceFromOptimum = 0;
- // iterate over input sentences (1 (online) or more (batch))
- for (size_t i = 0; i < featureValues.size(); ++i) {
- //size_t sentenceId = sentenceIds[i];
- // iterate over hypothesis translations for one input sentence
- for (size_t j = 0; j < featureValues[i].size(); ++j) {
- ScoreComponentCollection featureValueDiff = oracleFeatureValues[i];
- featureValueDiff.MinusEquals(featureValues[i][j]);
-
- // cerr << "Rank " << rank << ", epoch " << epoch << ", feature value diff: " << featureValueDiff << endl;
- if (featureValueDiff.GetL1Norm() == 0) {
- cerr << "Rank " << rank << ", epoch " << epoch << ", features equal --> skip" << endl;
- continue;
- }
-
- float loss = losses[i][j];
-
- // check if constraint is violated
- bool violated = false;
+ ScoreComponentCollection& weightUpdate,
+ const vector<vector<ScoreComponentCollection> >& featureValues,
+ const vector<vector<float> >& losses,
+ const vector<vector<float> >& bleuScores,
+ const vector<vector<float> >& modelScores,
+ const vector<ScoreComponentCollection>& oracleFeatureValues,
+ const vector<float> oracleBleuScores,
+ const vector<float> oracleModelScores,
+ float learning_rate,
+ size_t rank,
+ size_t epoch)
+{
+
+ // vector of feature values differences for all created constraints
+ vector<ScoreComponentCollection> featureValueDiffs;
+ vector<float> lossMinusModelScoreDiffs;
+ vector<float> all_losses;
+
+ // most violated constraint in batch
+ ScoreComponentCollection max_batch_featureValueDiff;
+
+ // Make constraints for new hypothesis translations
+ float epsilon = 0.0001;
+ int violatedConstraintsBefore = 0;
+ float oldDistanceFromOptimum = 0;
+ // iterate over input sentences (1 (online) or more (batch))
+ for (size_t i = 0; i < featureValues.size(); ++i) {
+ //size_t sentenceId = sentenceIds[i];
+ // iterate over hypothesis translations for one input sentence
+ for (size_t j = 0; j < featureValues[i].size(); ++j) {
+ ScoreComponentCollection featureValueDiff = oracleFeatureValues[i];
+ featureValueDiff.MinusEquals(featureValues[i][j]);
+
+ // cerr << "Rank " << rank << ", epoch " << epoch << ", feature value diff: " << featureValueDiff << endl;
+ if (featureValueDiff.GetL1Norm() == 0) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", features equal --> skip" << endl;
+ continue;
+ }
+
+ float loss = losses[i][j];
+
+ // check if constraint is violated
+ bool violated = false;
// float modelScoreDiff = featureValueDiff.InnerProduct(currWeights);
- float modelScoreDiff = oracleModelScores[i] - modelScores[i][j];
- float diff = 0;
-
- if (loss > modelScoreDiff)
- diff = loss - modelScoreDiff;
- cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " >= " << loss << " (current violation: " << diff << ")" << endl;
- if (diff > epsilon)
- violated = true;
-
- if (m_normaliseMargin) {
- modelScoreDiff = (2*m_sigmoidParam/(1 + exp(-modelScoreDiff))) - m_sigmoidParam;
- loss = (2*m_sigmoidParam/(1 + exp(-loss))) - m_sigmoidParam;
- diff = 0;
- if (loss > modelScoreDiff) {
- diff = loss - modelScoreDiff;
- }
- cerr << "Rank " << rank << ", epoch " << epoch << ", normalised constraint: " << modelScoreDiff << " >= " << loss << " (current violation: " << diff << ")" << endl;
- }
-
- if (m_scale_margin) {
- diff *= oracleBleuScores[i];
- cerr << "Rank " << rank << ", epoch " << epoch << ", scaling margin with oracle bleu score " << oracleBleuScores[i] << endl;
- }
-
- featureValueDiffs.push_back(featureValueDiff);
- lossMinusModelScoreDiffs.push_back(diff);
- all_losses.push_back(loss);
- if (violated) {
- ++violatedConstraintsBefore;
- oldDistanceFromOptimum += diff;
- }
- }
- }
-
- // run optimisation: compute alphas for all given constraints
- vector<float> alphas;
- ScoreComponentCollection summedUpdate;
- if (violatedConstraintsBefore > 0) {
- cerr << "Rank " << rank << ", epoch " << epoch << ", number of constraints passed to optimizer: " <<
- featureValueDiffs.size() << " (of which violated: " << violatedConstraintsBefore << ")" << endl;
- if (m_slack != 0) {
- alphas = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiffs, m_slack);
- } else {
- alphas = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiffs);
- }
-
- // Update the weight vector according to the alphas and the feature value differences
- // * w' = w' + SUM alpha_i * (h_i(oracle) - h_i(hypothesis))
- for (size_t k = 0; k < featureValueDiffs.size(); ++k) {
- float alpha = alphas[k];
- cerr << "Rank " << rank << ", epoch " << epoch << ", alpha: " << alpha << endl;
- ScoreComponentCollection update(featureValueDiffs[k]);
- update.MultiplyEquals(alpha);
-
- // sum updates
- summedUpdate.PlusEquals(update);
- }
- }
- else {
- cerr << "Rank " << rank << ", epoch " << epoch << ", no constraint violated for this batch" << endl;
+ float modelScoreDiff = oracleModelScores[i] - modelScores[i][j];
+ float diff = 0;
+
+ if (loss > modelScoreDiff)
+ diff = loss - modelScoreDiff;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " >= " << loss << " (current violation: " << diff << ")" << endl;
+ if (diff > epsilon)
+ violated = true;
+
+ if (m_normaliseMargin) {
+ modelScoreDiff = (2*m_sigmoidParam/(1 + exp(-modelScoreDiff))) - m_sigmoidParam;
+ loss = (2*m_sigmoidParam/(1 + exp(-loss))) - m_sigmoidParam;
+ diff = 0;
+ if (loss > modelScoreDiff) {
+ diff = loss - modelScoreDiff;
+ }
+ cerr << "Rank " << rank << ", epoch " << epoch << ", normalised constraint: " << modelScoreDiff << " >= " << loss << " (current violation: " << diff << ")" << endl;
+ }
+
+ if (m_scale_margin) {
+ diff *= oracleBleuScores[i];
+ cerr << "Rank " << rank << ", epoch " << epoch << ", scaling margin with oracle bleu score " << oracleBleuScores[i] << endl;
+ }
+
+ featureValueDiffs.push_back(featureValueDiff);
+ lossMinusModelScoreDiffs.push_back(diff);
+ all_losses.push_back(loss);
+ if (violated) {
+ ++violatedConstraintsBefore;
+ oldDistanceFromOptimum += diff;
+ }
+ }
+ }
+
+ // run optimisation: compute alphas for all given constraints
+ vector<float> alphas;
+ ScoreComponentCollection summedUpdate;
+ if (violatedConstraintsBefore > 0) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", number of constraints passed to optimizer: " <<
+ featureValueDiffs.size() << " (of which violated: " << violatedConstraintsBefore << ")" << endl;
+ if (m_slack != 0) {
+ alphas = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiffs, m_slack);
+ } else {
+ alphas = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiffs);
+ }
+
+ // Update the weight vector according to the alphas and the feature value differences
+ // * w' = w' + SUM alpha_i * (h_i(oracle) - h_i(hypothesis))
+ for (size_t k = 0; k < featureValueDiffs.size(); ++k) {
+ float alpha = alphas[k];
+ cerr << "Rank " << rank << ", epoch " << epoch << ", alpha: " << alpha << endl;
+ ScoreComponentCollection update(featureValueDiffs[k]);
+ update.MultiplyEquals(alpha);
+
+ // sum updates
+ summedUpdate.PlusEquals(update);
+ }
+ } else {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", no constraint violated for this batch" << endl;
// return 0;
- return 1;
- }
-
- // apply learning rate
- if (learning_rate != 1) {
- cerr << "Rank " << rank << ", epoch " << epoch << ", apply learning rate " << learning_rate << " to update." << endl;
- summedUpdate.MultiplyEquals(learning_rate);
- }
-
- // scale update by BLEU of oracle (for batch size 1 only)
- if (oracleBleuScores.size() == 1) {
- if (m_scale_update) {
- cerr << "Rank " << rank << ", epoch " << epoch << ", scaling summed update with oracle bleu score " << oracleBleuScores[0] << endl;
- summedUpdate.MultiplyEquals(oracleBleuScores[0]);
- }
- }
-
- // cerr << "Rank " << rank << ", epoch " << epoch << ", update: " << summedUpdate << endl;
- weightUpdate.PlusEquals(summedUpdate);
-
- // Sanity check: are there still violated constraints after optimisation?
-/* int violatedConstraintsAfter = 0;
- float newDistanceFromOptimum = 0;
- for (size_t i = 0; i < featureValueDiffs.size(); ++i) {
- float modelScoreDiff = featureValueDiffs[i].InnerProduct(currWeights);
- float loss = all_losses[i];
- float diff = loss - modelScoreDiff;
- if (diff > epsilon) {
- ++violatedConstraintsAfter;
- newDistanceFromOptimum += diff;
- }
- }
- VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", violated constraint before: " << violatedConstraintsBefore << ", after: " << violatedConstraintsAfter << ", change: " << violatedConstraintsBefore - violatedConstraintsAfter << endl);
- VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl);*/
+ return 1;
+ }
+
+ // apply learning rate
+ if (learning_rate != 1) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", apply learning rate " << learning_rate << " to update." << endl;
+ summedUpdate.MultiplyEquals(learning_rate);
+ }
+
+ // scale update by BLEU of oracle (for batch size 1 only)
+ if (oracleBleuScores.size() == 1) {
+ if (m_scale_update) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", scaling summed update with oracle bleu score " << oracleBleuScores[0] << endl;
+ summedUpdate.MultiplyEquals(oracleBleuScores[0]);
+ }
+ }
+
+ // cerr << "Rank " << rank << ", epoch " << epoch << ", update: " << summedUpdate << endl;
+ weightUpdate.PlusEquals(summedUpdate);
+
+ // Sanity check: are there still violated constraints after optimisation?
+ /* int violatedConstraintsAfter = 0;
+ float newDistanceFromOptimum = 0;
+ for (size_t i = 0; i < featureValueDiffs.size(); ++i) {
+ float modelScoreDiff = featureValueDiffs[i].InnerProduct(currWeights);
+ float loss = all_losses[i];
+ float diff = loss - modelScoreDiff;
+ if (diff > epsilon) {
+ ++violatedConstraintsAfter;
+ newDistanceFromOptimum += diff;
+ }
+ }
+ VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", violated constraint before: " << violatedConstraintsBefore << ", after: " << violatedConstraintsAfter << ", change: " << violatedConstraintsBefore - violatedConstraintsAfter << endl);
+ VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl);*/
// return violatedConstraintsAfter;
- return 0;
+ return 0;
}
size_t MiraOptimiser::updateWeightsHopeFear(
- Moses::ScoreComponentCollection& weightUpdate,
- const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
- const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
- const std::vector<std::vector<float> >& bleuScoresHope,
- const std::vector<std::vector<float> >& bleuScoresFear,
- const std::vector<std::vector<float> >& modelScoresHope,
- const std::vector<std::vector<float> >& modelScoresFear,
- float learning_rate,
- size_t rank,
- size_t epoch,
- int updatePosition) {
+ Moses::ScoreComponentCollection& weightUpdate,
+ const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
+ const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
+ const std::vector<std::vector<float> >& bleuScoresHope,
+ const std::vector<std::vector<float> >& bleuScoresFear,
+ const std::vector<std::vector<float> >& modelScoresHope,
+ const std::vector<std::vector<float> >& modelScoresFear,
+ float learning_rate,
+ size_t rank,
+ size_t epoch,
+ int updatePosition)
+{
// vector of feature values differences for all created constraints
vector<ScoreComponentCollection> featureValueDiffs;
vector<float> lossMinusModelScoreDiffs;
vector<float> modelScoreDiffs;
vector<float> all_losses;
-
+
// most violated constraint in batch
ScoreComponentCollection max_batch_featureValueDiff;
-
+
// Make constraints for new hypothesis translations
float epsilon = 0.0001;
int violatedConstraintsBefore = 0;
float oldDistanceFromOptimum = 0;
-
+
// iterate over input sentences (1 (online) or more (batch))
for (size_t i = 0; i < featureValuesHope.size(); ++i) {
if (updatePosition != -1) {
if (i < updatePosition)
- continue;
+ continue;
else if (i > updatePosition)
- break;
+ break;
}
-
+
// Pick all pairs[j,j] of hope and fear translations for one input sentence
for (size_t j = 0; j < featureValuesHope[i].size(); ++j) {
ScoreComponentCollection featureValueDiff = featureValuesHope[i][j];
featureValueDiff.MinusEquals(featureValuesFear[i][j]);
//cerr << "Rank " << rank << ", epoch " << epoch << ", feature value diff: " << featureValueDiff << endl;
if (featureValueDiff.GetL1Norm() == 0) {
- cerr << "Rank " << rank << ", epoch " << epoch << ", features equal --> skip" << endl;
- continue;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", features equal --> skip" << endl;
+ continue;
}
-
+
float loss = bleuScoresHope[i][j] - bleuScoresFear[i][j];
-
+
// check if constraint is violated
bool violated = false;
//float modelScoreDiff = featureValueDiff.InnerProduct(currWeights);
float modelScoreDiff = modelScoresHope[i][j] - modelScoresFear[i][j];
float diff = 0;
- if (loss > modelScoreDiff)
- diff = loss - modelScoreDiff;
- cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " >= " << loss << " (current violation: " << diff << ")" << endl;
-
- if (diff > epsilon)
- violated = true;
-
+ if (loss > modelScoreDiff)
+ diff = loss - modelScoreDiff;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " >= " << loss << " (current violation: " << diff << ")" << endl;
+
+ if (diff > epsilon)
+ violated = true;
+
if (m_normaliseMargin) {
- modelScoreDiff = (2*m_sigmoidParam/(1 + exp(-modelScoreDiff))) - m_sigmoidParam;
- loss = (2*m_sigmoidParam/(1 + exp(-loss))) - m_sigmoidParam;
- diff = 0;
- if (loss > modelScoreDiff) {
- diff = loss - modelScoreDiff;
- }
- cerr << "Rank " << rank << ", epoch " << epoch << ", normalised constraint: " << modelScoreDiff << " >= " << loss << " (current violation: " << diff << ")" << endl;
+ modelScoreDiff = (2*m_sigmoidParam/(1 + exp(-modelScoreDiff))) - m_sigmoidParam;
+ loss = (2*m_sigmoidParam/(1 + exp(-loss))) - m_sigmoidParam;
+ diff = 0;
+ if (loss > modelScoreDiff) {
+ diff = loss - modelScoreDiff;
+ }
+ cerr << "Rank " << rank << ", epoch " << epoch << ", normalised constraint: " << modelScoreDiff << " >= " << loss << " (current violation: " << diff << ")" << endl;
}
-
+
if (m_scale_margin) {
- diff *= bleuScoresHope[i][j];
- cerr << "Rank " << rank << ", epoch " << epoch << ", scaling margin with oracle bleu score " << bleuScoresHope[i][j] << endl;
+ diff *= bleuScoresHope[i][j];
+ cerr << "Rank " << rank << ", epoch " << epoch << ", scaling margin with oracle bleu score " << bleuScoresHope[i][j] << endl;
}
-
+
featureValueDiffs.push_back(featureValueDiff);
lossMinusModelScoreDiffs.push_back(diff);
modelScoreDiffs.push_back(modelScoreDiff);
all_losses.push_back(loss);
if (violated) {
- ++violatedConstraintsBefore;
- oldDistanceFromOptimum += diff;
- }
+ ++violatedConstraintsBefore;
+ oldDistanceFromOptimum += diff;
+ }
}
}
@@ -241,48 +243,47 @@ size_t MiraOptimiser::updateWeightsHopeFear(
ScoreComponentCollection summedUpdate;
if (violatedConstraintsBefore > 0) {
cerr << "Rank " << rank << ", epoch " << epoch << ", number of constraints passed to optimizer: " <<
- featureValueDiffs.size() << " (of which violated: " << violatedConstraintsBefore << ")" << endl;
+ featureValueDiffs.size() << " (of which violated: " << violatedConstraintsBefore << ")" << endl;
if (m_slack != 0) {
alphas = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiffs, m_slack);
} else {
alphas = Hildreth::optimise(featureValueDiffs, lossMinusModelScoreDiffs);
}
-
+
// Update the weight vector according to the alphas and the feature value differences
// * w' = w' + SUM alpha_i * (h_i(oracle) - h_i(hypothesis))
for (size_t k = 0; k < featureValueDiffs.size(); ++k) {
float alpha = alphas[k];
cerr << "Rank " << rank << ", epoch " << epoch << ", alpha: " << alpha << endl;
if (alpha != 0) {
- // apply boosting factor
- if (m_boost && modelScoreDiffs[k] <= 0) {
- // factor between 1.5 and 3 (for Bleu scores between 5 and 20, the factor is within the boundaries)
- float factor = min(1.5, log2(bleuScoresHope[0][0])); // TODO: make independent of number of oracles!!
- factor = min(3.0f, factor);
- alpha = alpha * factor;
- cerr << "Rank " << rank << ", epoch " << epoch << ", apply boosting factor " << factor << " to update." << endl;
- }
-
- ScoreComponentCollection update(featureValueDiffs[k]);
- update.MultiplyEquals(alpha);
-
- // sum updates
- summedUpdate.PlusEquals(update);
+ // apply boosting factor
+ if (m_boost && modelScoreDiffs[k] <= 0) {
+        // boost factor intended to lie between 1.5 and 3 (for BLEU scores between 5 and 20)
+ float factor = min(1.5, log2(bleuScoresHope[0][0])); // TODO: make independent of number of oracles!!
+ factor = min(3.0f, factor);
+ alpha = alpha * factor;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", apply boosting factor " << factor << " to update." << endl;
+ }
+
+ ScoreComponentCollection update(featureValueDiffs[k]);
+ update.MultiplyEquals(alpha);
+
+ // sum updates
+ summedUpdate.PlusEquals(update);
}
}
- }
- else {
+ } else {
cerr << "Rank " << rank << ", epoch " << epoch << ", no constraint violated for this batch" << endl;
// return 0;
return 1;
}
-
+
// apply learning rate
if (learning_rate != 1) {
cerr << "Rank " << rank << ", epoch " << epoch << ", apply learning rate " << learning_rate << " to update." << endl;
summedUpdate.MultiplyEquals(learning_rate);
}
-
+
// scale update by BLEU of oracle (for batch size 1 only)
if (featureValuesHope.size() == 1) {
if (m_scale_update) {
@@ -290,46 +291,47 @@ size_t MiraOptimiser::updateWeightsHopeFear(
summedUpdate.MultiplyEquals(bleuScoresHope[0][0]);
}
}
-
+
//cerr << "Rank " << rank << ", epoch " << epoch << ", update: " << summedUpdate << endl;
weightUpdate.PlusEquals(summedUpdate);
-
+
// Sanity check: are there still violated constraints after optimisation?
-/* int violatedConstraintsAfter = 0;
- float newDistanceFromOptimum = 0;
- for (size_t i = 0; i < featureValueDiffs.size(); ++i) {
- float modelScoreDiff = featureValueDiffs[i].InnerProduct(currWeights);
- float loss = all_losses[i];
- float diff = loss - modelScoreDiff;
- if (diff > epsilon) {
- ++violatedConstraintsAfter;
- newDistanceFromOptimum += diff;
- }
- }
- VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", violated constraint before: " << violatedConstraintsBefore << ", after: " << violatedConstraintsAfter << ", change: " << violatedConstraintsBefore - violatedConstraintsAfter << endl);
- VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl);*/
+ /* int violatedConstraintsAfter = 0;
+ float newDistanceFromOptimum = 0;
+ for (size_t i = 0; i < featureValueDiffs.size(); ++i) {
+ float modelScoreDiff = featureValueDiffs[i].InnerProduct(currWeights);
+ float loss = all_losses[i];
+ float diff = loss - modelScoreDiff;
+ if (diff > epsilon) {
+ ++violatedConstraintsAfter;
+ newDistanceFromOptimum += diff;
+ }
+ }
+ VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", violated constraint before: " << violatedConstraintsBefore << ", after: " << violatedConstraintsAfter << ", change: " << violatedConstraintsBefore - violatedConstraintsAfter << endl);
+ VERBOSE(1, "Rank " << rank << ", epoch " << epoch << ", error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl);*/
// return violatedConstraintsAfter;
- return 0;
+ return 0;
}
size_t MiraOptimiser::updateWeightsAnalytically(
- ScoreComponentCollection& weightUpdate,
- ScoreComponentCollection& featureValuesHope,
- ScoreComponentCollection& featureValuesFear,
- float bleuScoreHope,
- float bleuScoreFear,
- float modelScoreHope,
- float modelScoreFear,
- float learning_rate,
- size_t rank,
- size_t epoch) {
+ ScoreComponentCollection& weightUpdate,
+ ScoreComponentCollection& featureValuesHope,
+ ScoreComponentCollection& featureValuesFear,
+ float bleuScoreHope,
+ float bleuScoreFear,
+ float modelScoreHope,
+ float modelScoreFear,
+ float learning_rate,
+ size_t rank,
+ size_t epoch)
+{
float epsilon = 0.0001;
float oldDistanceFromOptimum = 0;
bool constraintViolatedBefore = false;
- // cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << featureValuesHope << endl;
- // cerr << "Rank " << rank << ", epoch " << epoch << ", fear: " << featureValuesFear << endl;
+// cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << featureValuesHope << endl;
+// cerr << "Rank " << rank << ", epoch " << epoch << ", fear: " << featureValuesFear << endl;
ScoreComponentCollection featureValueDiff = featureValuesHope;
featureValueDiff.MinusEquals(featureValuesFear);
if (featureValueDiff.GetL1Norm() == 0) {
@@ -342,35 +344,35 @@ size_t MiraOptimiser::updateWeightsAnalytically(
float modelScoreDiff = modelScoreHope - modelScoreFear;
float loss = bleuScoreHope - bleuScoreFear;
float diff = 0;
- if (loss > modelScoreDiff)
- diff = loss - modelScoreDiff;
+ if (loss > modelScoreDiff)
+ diff = loss - modelScoreDiff;
cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " >= " << loss << " (current violation: " << diff << ")" << endl;
if (m_normaliseMargin) {
modelScoreDiff = (2*m_sigmoidParam/(1 + exp(-modelScoreDiff))) - m_sigmoidParam;
loss = (2*m_sigmoidParam/(1 + exp(-loss))) - m_sigmoidParam;
- if (loss > modelScoreDiff)
+ if (loss > modelScoreDiff)
diff = loss - modelScoreDiff;
cerr << "Rank " << rank << ", epoch " << epoch << ", normalised constraint: " << modelScoreDiff << " >= " << loss << " (current violation: " << diff << ")" << endl;
}
-
+
if (m_scale_margin) {
- diff *= bleuScoreHope;
- cerr << "Rank " << rank << ", epoch " << epoch << ", scaling margin with oracle bleu score " << bleuScoreHope << endl;
+ diff *= bleuScoreHope;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", scaling margin with oracle bleu score " << bleuScoreHope << endl;
}
if (m_scale_margin_precision) {
- diff *= (1+m_precision);
- cerr << "Rank " << rank << ", epoch " << epoch << ", scaling margin with 1+precision: " << (1+m_precision) << endl;
+ diff *= (1+m_precision);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", scaling margin with 1+precision: " << (1+m_precision) << endl;
}
if (diff > epsilon) {
- // squash it between 0 and 1
- //diff = tanh(diff);
- //diff = (2/(1 + pow(2,-diff))) - 1;
+ // squash it between 0 and 1
+ //diff = tanh(diff);
+ //diff = (2/(1 + pow(2,-diff))) - 1;
/* if (m_normaliseMargin) {
- diff = (2/(1 + exp(-diff))) - 1;
- cerr << "Rank " << rank << ", epoch " << epoch << ", new margin: " << diff << endl;
- }*/
+ diff = (2/(1 + exp(-diff))) - 1;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", new margin: " << diff << endl;
+ }*/
// constraint violated
oldDistanceFromOptimum += diff;
@@ -384,134 +386,134 @@ size_t MiraOptimiser::updateWeightsAnalytically(
float alpha = diff / squaredNorm;
cerr << "Rank " << rank << ", epoch " << epoch << ", unclipped alpha: " << alpha << endl;
    if (m_slack > 0) {
- if (alpha > m_slack) {
- alpha = m_slack;
- }
- else if (alpha < m_slack*(-1)) {
- alpha = m_slack*(-1);
- }
+ if (alpha > m_slack) {
+ alpha = m_slack;
+ } else if (alpha < m_slack*(-1)) {
+ alpha = m_slack*(-1);
+ }
}
// apply learning rate
if (learning_rate != 1)
- alpha = alpha * learning_rate;
-
+ alpha = alpha * learning_rate;
+
if (m_scale_update) {
- cerr << "Rank " << rank << ", epoch " << epoch << ", scaling update with oracle bleu score " << bleuScoreHope << endl;
- alpha *= bleuScoreHope;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", scaling update with oracle bleu score " << bleuScoreHope << endl;
+ alpha *= bleuScoreHope;
}
if (m_scale_update_precision) {
- cerr << "Rank " << rank << ", epoch " << epoch << ", scaling update with 1+precision: " << (1+m_precision) << endl;
- alpha *= (1+m_precision);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", scaling update with 1+precision: " << (1+m_precision) << endl;
+ alpha *= (1+m_precision);
}
-
+
cerr << "Rank " << rank << ", epoch " << epoch << ", clipped/scaled alpha: " << alpha << endl;
// apply boosting factor
if (m_boost && modelScoreDiff <= 0) {
- // factor between 1.5 and 3 (for Bleu scores between 5 and 20, the factor is within the boundaries)
- float factor = min(1.5, log2(bleuScoreHope));
- factor = min(3.0f, factor);
- alpha = alpha * factor;
- cerr << "Rank " << rank << ", epoch " << epoch << ", boosted alpha: " << alpha << endl;
+      // boost factor intended to lie between 1.5 and 3 (for BLEU scores between 5 and 20)
+ float factor = min(1.5, log2(bleuScoreHope));
+ factor = min(3.0f, factor);
+ alpha = alpha * factor;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", boosted alpha: " << alpha << endl;
}
featureValueDiff.MultiplyEquals(alpha);
weightUpdate.PlusEquals(featureValueDiff);
// cerr << "Rank " << rank << ", epoch " << epoch << ", update: " << weightUpdate << endl;
}
-
+
if (!constraintViolatedBefore) {
// constraint satisfied, nothing to do
- cerr << "Rank " << rank << ", epoch " << epoch << ", constraint already satisfied" << endl;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", constraint already satisfied" << endl;
return 1;
}
// sanity check: constraint still violated after optimisation?
-/* ScoreComponentCollection newWeights(currWeights);
- newWeights.PlusEquals(weightUpdate);
- bool constraintViolatedAfter = false;
- float newDistanceFromOptimum = 0;
- featureValueDiff = featureValuesHope;
- featureValueDiff.MinusEquals(featureValuesFear);
- modelScoreDiff = featureValueDiff.InnerProduct(newWeights);
- diff = loss - modelScoreDiff;
- // approximate comparison between floats!
- if (diff > epsilon) {
- constraintViolatedAfter = true;
- newDistanceFromOptimum += (loss - modelScoreDiff);
- }
+ /* ScoreComponentCollection newWeights(currWeights);
+ newWeights.PlusEquals(weightUpdate);
+ bool constraintViolatedAfter = false;
+ float newDistanceFromOptimum = 0;
+ featureValueDiff = featureValuesHope;
+ featureValueDiff.MinusEquals(featureValuesFear);
+ modelScoreDiff = featureValueDiff.InnerProduct(newWeights);
+ diff = loss - modelScoreDiff;
+ // approximate comparison between floats!
+ if (diff > epsilon) {
+ constraintViolatedAfter = true;
+ newDistanceFromOptimum += (loss - modelScoreDiff);
+ }
- float hopeScore = featureValuesHope.InnerProduct(newWeights);
- float fearScore = featureValuesFear.InnerProduct(newWeights);
- cerr << "New hope score: " << hopeScore << endl;
- cerr << "New fear score: " << fearScore << endl;
+ float hopeScore = featureValuesHope.InnerProduct(newWeights);
+ float fearScore = featureValuesFear.InnerProduct(newWeights);
+ cerr << "New hope score: " << hopeScore << endl;
+ cerr << "New fear score: " << fearScore << endl;
- VERBOSE(0, "Rank " << rank << ", epoch " << epoch << ", check, constraint violated before? " << constraintViolatedBefore << ", after? " << constraintViolatedAfter << endl);
- VERBOSE(0, "Rank " << rank << ", epoch " << epoch << ", check, error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl);
-*/
+ VERBOSE(0, "Rank " << rank << ", epoch " << epoch << ", check, constraint violated before? " << constraintViolatedBefore << ", after? " << constraintViolatedAfter << endl);
+ VERBOSE(0, "Rank " << rank << ", epoch " << epoch << ", check, error before: " << oldDistanceFromOptimum << ", after: " << newDistanceFromOptimum << ", change: " << oldDistanceFromOptimum - newDistanceFromOptimum << endl);
+ */
return 0;
}
size_t MiraOptimiser::updateWeightsHopeFearSelective(
- Moses::ScoreComponentCollection& weightUpdate,
- const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
- const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
- const std::vector<std::vector<float> >& bleuScoresHope,
- const std::vector<std::vector<float> >& bleuScoresFear,
- const std::vector<std::vector<float> >& modelScoresHope,
- const std::vector<std::vector<float> >& modelScoresFear,
- float learning_rate,
- size_t rank,
- size_t epoch,
- int updatePosition) {
+ Moses::ScoreComponentCollection& weightUpdate,
+ const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
+ const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
+ const std::vector<std::vector<float> >& bleuScoresHope,
+ const std::vector<std::vector<float> >& bleuScoresFear,
+ const std::vector<std::vector<float> >& modelScoresHope,
+ const std::vector<std::vector<float> >& modelScoresFear,
+ float learning_rate,
+ size_t rank,
+ size_t epoch,
+ int updatePosition)
+{
// vector of feature values differences for all created constraints
vector<ScoreComponentCollection> nonZeroFeatures;
vector<float> lossMinusModelScoreDiffs;
-
+
// Make constraints for new hypothesis translations
float epsilon = 0.0001;
int violatedConstraintsBefore = 0;
-
+
// iterate over input sentences (1 (online) or more (batch))
for (size_t i = 0; i < featureValuesHope.size(); ++i) {
if (updatePosition != -1) {
if (i < updatePosition)
- continue;
+ continue;
else if (i > updatePosition)
- break;
+ break;
}
-
+
// Pick all pairs[j,j] of hope and fear translations for one input sentence
for (size_t j = 0; j < featureValuesHope[i].size(); ++j) {
ScoreComponentCollection featureValueDiff = featureValuesHope[i][j];
featureValueDiff.MinusEquals(featureValuesFear[i][j]);
if (featureValueDiff.GetL1Norm() == 0) {
- cerr << "Rank " << rank << ", epoch " << epoch << ", features equal --> skip" << endl;
- continue;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", features equal --> skip" << endl;
+ continue;
}
-
+
// check if constraint is violated
float loss = bleuScoresHope[i][j] - bleuScoresFear[i][j];
float modelScoreDiff = modelScoresHope[i][j] - modelScoresFear[i][j];
float diff = 0;
- if (loss > modelScoreDiff)
- diff = loss - modelScoreDiff;
- if (diff > epsilon)
- ++violatedConstraintsBefore;
- cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " >= " << loss << " (current violation: " << diff << ")" << endl;
-
+ if (loss > modelScoreDiff)
+ diff = loss - modelScoreDiff;
+ if (diff > epsilon)
+ ++violatedConstraintsBefore;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " >= " << loss << " (current violation: " << diff << ")" << endl;
+
// iterate over difference vector and add a constraint for every non-zero feature
FVector features = featureValueDiff.GetScoresVector();
size_t n_core = 0, n_sparse = 0, n_sparse_hope = 0, n_sparse_fear = 0;
for (size_t i=0; i<features.coreSize(); ++i) {
- if (features[i] != 0.0) {
- ++n_core;
- ScoreComponentCollection f;
- f.Assign(i, features[i]);
- nonZeroFeatures.push_back(f);
- }
+ if (features[i] != 0.0) {
+ ++n_core;
+ ScoreComponentCollection f;
+ f.Assign(i, features[i]);
+ nonZeroFeatures.push_back(f);
+ }
}
vector<ScoreComponentCollection> nonZeroFeaturesHope;
@@ -522,27 +524,26 @@ size_t MiraOptimiser::updateWeightsHopeFearSelective(
f.Assign((i->first).name(), i->second);
cerr << "Rank " << rank << ", epoch " << epoch << ", f: " << f << endl;
- if (i->second > 0.0) {
- ++n_sparse_hope;
- nonZeroFeaturesHope.push_back(f);
- }
- else {
- ++n_sparse_fear;
- nonZeroFeaturesFear.push_back(f);
- }
+ if (i->second > 0.0) {
+ ++n_sparse_hope;
+ nonZeroFeaturesHope.push_back(f);
+ } else {
+ ++n_sparse_fear;
+ nonZeroFeaturesFear.push_back(f);
+ }
}
}
float n = n_core + n_sparse_hope + n_sparse_fear;
for (size_t i=0; i<n_core; ++i)
- lossMinusModelScoreDiffs.push_back(diff/n);
+ lossMinusModelScoreDiffs.push_back(diff/n);
for (size_t i=0; i<n_sparse_hope; ++i) {
- nonZeroFeatures.push_back(nonZeroFeaturesHope[i]);
+ nonZeroFeatures.push_back(nonZeroFeaturesHope[i]);
lossMinusModelScoreDiffs.push_back((diff/n)*1.1);
}
for (size_t i=0; i<n_sparse_fear; ++i) {
- nonZeroFeatures.push_back(nonZeroFeaturesFear[i]);
- lossMinusModelScoreDiffs.push_back(diff/n);
+ nonZeroFeatures.push_back(nonZeroFeaturesFear[i]);
+ lossMinusModelScoreDiffs.push_back(diff/n);
}
cerr << "Rank " << rank << ", epoch " << epoch << ", core diff: " << diff/n << endl;
cerr << "Rank " << rank << ", epoch " << epoch << ", hope diff: " << ((diff/n)*1.1) << endl;
@@ -558,33 +559,32 @@ size_t MiraOptimiser::updateWeightsHopeFearSelective(
if (violatedConstraintsBefore > 0) {
cerr << "Rank " << rank << ", epoch " << epoch << ", number of constraints passed to optimizer: " << nonZeroFeatures.size() << endl;
alphas = Hildreth::optimise(nonZeroFeatures, lossMinusModelScoreDiffs, m_slack);
-
+
// Update the weight vector according to the alphas and the feature value differences
// * w' = w' + SUM alpha_i * (h_i(oracle) - h_i(hypothesis))
for (size_t k = 0; k < nonZeroFeatures.size(); ++k) {
float alpha = alphas[k];
cerr << "Rank " << rank << ", epoch " << epoch << ", alpha: " << alpha << endl;
if (alpha != 0) {
- ScoreComponentCollection update(nonZeroFeatures[k]);
- update.MultiplyEquals(alpha);
-
- // sum updates
- summedUpdate.PlusEquals(update);
+ ScoreComponentCollection update(nonZeroFeatures[k]);
+ update.MultiplyEquals(alpha);
+
+ // sum updates
+ summedUpdate.PlusEquals(update);
}
}
- }
- else {
+ } else {
cerr << "Rank " << rank << ", epoch " << epoch << ", no constraint violated for this batch" << endl;
// return 0;
return 1;
}
-
+
// apply learning rate
if (learning_rate != 1) {
cerr << "Rank " << rank << ", epoch " << epoch << ", apply learning rate " << learning_rate << " to update." << endl;
summedUpdate.MultiplyEquals(learning_rate);
}
-
+
// scale update by BLEU of oracle (for batch size 1 only)
if (featureValuesHope.size() == 1) {
if (m_scale_update) {
@@ -592,56 +592,57 @@ size_t MiraOptimiser::updateWeightsHopeFearSelective(
summedUpdate.MultiplyEquals(bleuScoresHope[0][0]);
}
}
-
+
//cerr << "Rank " << rank << ", epoch " << epoch << ", update: " << summedUpdate << endl;
weightUpdate.PlusEquals(summedUpdate);
return 0;
}
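For reference, the final step above is the weighted-sum update w' = w + SUM_k alpha_k * d_k, where each d_k is one non-zero feature-difference constraint and alpha_k is its dual weight from Hildreth::optimise. A minimal sketch of just that step in plain C++ (FeatureVec and ApplyWeightedSumUpdate are hypothetical stand-ins, not the Moses API; folding the learning rate into the loop is equivalent to scaling the summed update afterwards, as the code above does):

    #include <cstddef>
    #include <vector>

    typedef std::vector<float> FeatureVec;   // stand-in for ScoreComponentCollection

    void ApplyWeightedSumUpdate(FeatureVec &weights,
                                const std::vector<FeatureVec> &constraints,
                                const std::vector<float> &alphas,
                                float learningRate)
    {
      for (std::size_t k = 0; k < constraints.size(); ++k) {
        if (alphas[k] == 0.0f) continue;                 // inactive constraint
        for (std::size_t i = 0; i < weights.size(); ++i)
          weights[i] += learningRate * alphas[k] * constraints[k][i];
      }
    }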
size_t MiraOptimiser::updateWeightsHopeFearSummed(
- Moses::ScoreComponentCollection& weightUpdate,
- const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
- const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
- const std::vector<std::vector<float> >& bleuScoresHope,
- const std::vector<std::vector<float> >& bleuScoresFear,
- const std::vector<std::vector<float> >& modelScoresHope,
- const std::vector<std::vector<float> >& modelScoresFear,
- float learning_rate,
- size_t rank,
- size_t epoch,
- bool rescaleSlack,
- bool makePairs) {
+ Moses::ScoreComponentCollection& weightUpdate,
+ const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
+ const std::vector< std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
+ const std::vector<std::vector<float> >& bleuScoresHope,
+ const std::vector<std::vector<float> >& bleuScoresFear,
+ const std::vector<std::vector<float> >& modelScoresHope,
+ const std::vector<std::vector<float> >& modelScoresFear,
+ float learning_rate,
+ size_t rank,
+ size_t epoch,
+ bool rescaleSlack,
+ bool makePairs)
+{
// vector of feature values differences for all created constraints
ScoreComponentCollection averagedFeatureDiffs;
float averagedViolations = 0;
-
+
// Make constraints for new hypothesis translations
float epsilon = 0.0001;
int violatedConstraintsBefore = 0;
-
+
if (!makePairs) {
ScoreComponentCollection featureValueDiff;
float lossHope = 0, lossFear = 0, modelScoreHope = 0, modelScoreFear = 0, hopeCount = 0, fearCount = 0;
// add all hope vectors
for (size_t i = 0; i < featureValuesHope.size(); ++i) {
for (size_t j = 0; j < featureValuesHope[i].size(); ++j) {
- featureValueDiff.PlusEquals(featureValuesHope[i][j]);
- lossHope += bleuScoresHope[i][j];
- modelScoreHope += modelScoresHope[i][j];
- ++hopeCount;
+ featureValueDiff.PlusEquals(featureValuesHope[i][j]);
+ lossHope += bleuScoresHope[i][j];
+ modelScoreHope += modelScoresHope[i][j];
+ ++hopeCount;
}
}
lossHope /= hopeCount;
modelScoreHope /= hopeCount;
-
+
// subtract all fear vectors
for (size_t i = 0; i < featureValuesFear.size(); ++i) {
for (size_t j = 0; j < featureValuesFear[i].size(); ++j) {
featureValueDiff.MinusEquals(featureValuesFear[i][j]);
- lossFear += bleuScoresFear[i][j];
+ lossFear += bleuScoresFear[i][j];
modelScoreFear += modelScoresFear[i][j];
- ++fearCount;
+ ++fearCount;
}
}
lossFear /= fearCount;
@@ -653,7 +654,7 @@ size_t MiraOptimiser::updateWeightsHopeFearSummed(
return 1;
}
- // check if constraint is violated
+ // check if constraint is violated
float lossDiff = lossHope - lossFear;
float modelScoreDiff = modelScoreHope - modelScoreFear;
float diff = 0;
@@ -662,54 +663,52 @@ size_t MiraOptimiser::updateWeightsHopeFearSummed(
if (diff > epsilon)
++violatedConstraintsBefore;
cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " >= " << lossDiff << " (current violation: " <<\
- diff << ")" << endl;
+ diff << ")" << endl;
- // add constraint
+ // add constraint
averagedFeatureDiffs = featureValueDiff;
averagedViolations = diff;
- }
- else {
- // iterate over input sentences (1 (online) or more (batch))
- for (size_t i = 0; i < featureValuesHope.size(); ++i) {
- // Pick all pairs[j,j] of hope and fear translations for one input sentence and add them up
- for (size_t j = 0; j < featureValuesHope[i].size(); ++j) {
- ScoreComponentCollection featureValueDiff = featureValuesHope[i][j];
- featureValueDiff.MinusEquals(featureValuesFear[i][j]);
- if (featureValueDiff.GetL1Norm() == 0) {
- cerr << "Rank " << rank << ", epoch " << epoch << ", features equal --> skip" << endl;
- continue;
- }
-
- // check if constraint is violated
- float lossDiff = bleuScoresHope[i][j] - bleuScoresFear[i][j];
- float modelScoreDiff = modelScoresHope[i][j] - modelScoresFear[i][j];
- if (rescaleSlack) {
- cerr << "Rank " << rank << ", epoch " << epoch << ", modelScoreDiff scaled by lossDiff: " << modelScoreDiff << " --> " << modelScoreDiff*lossDiff << endl;
- modelScoreDiff *= lossDiff;
- }
- float diff = 0;
- if (lossDiff > modelScoreDiff)
- diff = lossDiff - modelScoreDiff;
- if (diff > epsilon)
- ++violatedConstraintsBefore;
- cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " >= " << lossDiff << " (current violation: " << diff << ")" << endl;
-
- // add constraint
- if (rescaleSlack) {
- averagedFeatureDiffs.MultiplyEquals(lossDiff);
- cerr << "Rank " << rank << ", epoch " << epoch << ", featureValueDiff scaled by lossDiff." << endl;
+ } else {
+    // iterate over input sentences (one in online mode, several in batch mode)
+ for (size_t i = 0; i < featureValuesHope.size(); ++i) {
+ // Pick all pairs[j,j] of hope and fear translations for one input sentence and add them up
+ for (size_t j = 0; j < featureValuesHope[i].size(); ++j) {
+ ScoreComponentCollection featureValueDiff = featureValuesHope[i][j];
+ featureValueDiff.MinusEquals(featureValuesFear[i][j]);
+ if (featureValueDiff.GetL1Norm() == 0) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", features equal --> skip" << endl;
+ continue;
+ }
+
+ // check if constraint is violated
+ float lossDiff = bleuScoresHope[i][j] - bleuScoresFear[i][j];
+ float modelScoreDiff = modelScoresHope[i][j] - modelScoresFear[i][j];
+ if (rescaleSlack) {
+ cerr << "Rank " << rank << ", epoch " << epoch << ", modelScoreDiff scaled by lossDiff: " << modelScoreDiff << " --> " << modelScoreDiff*lossDiff << endl;
+ modelScoreDiff *= lossDiff;
+ }
+ float diff = 0;
+ if (lossDiff > modelScoreDiff)
+ diff = lossDiff - modelScoreDiff;
+ if (diff > epsilon)
+ ++violatedConstraintsBefore;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", constraint: " << modelScoreDiff << " >= " << lossDiff << " (current violation: " << diff << ")" << endl;
+
+ // add constraint
+ if (rescaleSlack) {
+ averagedFeatureDiffs.MultiplyEquals(lossDiff);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", featureValueDiff scaled by lossDiff." << endl;
+ }
+ averagedFeatureDiffs.PlusEquals(featureValueDiff);
+ averagedViolations += diff;
}
- averagedFeatureDiffs.PlusEquals(featureValueDiff);
- averagedViolations += diff;
- }
- }
+ }
}
// divide by number of constraints (1/n)
if (!makePairs) {
averagedFeatureDiffs.DivideEquals(featureValuesHope[0].size());
- }
- else {
+ } else {
averagedFeatureDiffs.DivideEquals(featureValuesHope[0].size());
averagedViolations /= featureValuesHope[0].size();
}
@@ -717,29 +716,27 @@ size_t MiraOptimiser::updateWeightsHopeFearSummed(
cerr << "Rank " << rank << ", epoch " << epoch << ", averaged violations: " << averagedViolations << endl;
if (violatedConstraintsBefore > 0) {
- // compute alpha for given constraint: (loss diff - model score diff) / || feature value diff ||^2
- // featureValueDiff.GetL2Norm() * featureValueDiff.GetL2Norm() == featureValueDiff.InnerProduct(featureValueDiff)
- // from Crammer&Singer 2006: alpha = min {C , l_t/ ||x||^2}
+ // compute alpha for given constraint: (loss diff - model score diff) / || feature value diff ||^2
+ // featureValueDiff.GetL2Norm() * featureValueDiff.GetL2Norm() == featureValueDiff.InnerProduct(featureValueDiff)
+ // from Crammer&Singer 2006: alpha = min {C , l_t/ ||x||^2}
// adjusted for 1 slack according to Joachims 2009, OP4 (margin rescaling), OP5 (slack rescaling)
float squaredNorm = averagedFeatureDiffs.GetL2Norm() * averagedFeatureDiffs.GetL2Norm();
float alpha = averagedViolations / squaredNorm;
cerr << "Rank " << rank << ", epoch " << epoch << ", unclipped alpha: " << alpha << endl;
if (m_slack > 0 ) {
if (alpha > m_slack) {
- alpha = m_slack;
- }
- else if (alpha < m_slack*(-1)) {
- alpha = m_slack*(-1);
+ alpha = m_slack;
+ } else if (alpha < m_slack*(-1)) {
+ alpha = m_slack*(-1);
}
}
cerr << "Rank " << rank << ", epoch " << epoch << ", clipped alpha: " << alpha << endl;
-
+
// compute update
averagedFeatureDiffs.MultiplyEquals(alpha);
weightUpdate.PlusEquals(averagedFeatureDiffs);
return 0;
- }
- else {
+ } else {
cerr << "Rank " << rank << ", epoch " << epoch << ", no constraint violated for this batch" << endl;
return 1;
}
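The step size computed above follows Crammer & Singer (2006), alpha = min{C, loss / ||x||^2}, with x the averaged feature difference and C the slack; the code additionally clips symmetrically at -C. A self-contained sketch (FeatureVec is a hypothetical stand-in, not the Moses API):

    #include <cstddef>
    #include <vector>

    typedef std::vector<float> FeatureVec;

    // alpha = min{ C, violation / ||featureDiff||^2 }, clipped to [-C, C].
    float ClippedAlpha(const FeatureVec &featureDiff, float violation, float slack)
    {
      float squaredNorm = 0.0f;
      for (std::size_t i = 0; i < featureDiff.size(); ++i)
        squaredNorm += featureDiff[i] * featureDiff[i];
      if (squaredNorm == 0.0f) return 0.0f;   // degenerate constraint, no update
      float alpha = violation / squaredNorm;
      if (slack > 0) {                        // clipping only when slack is set
        if (alpha > slack) alpha = slack;
        else if (alpha < -slack) alpha = -slack;
      }
      return alpha;
    }

For example, a violation of 0.5 with ||x||^2 = 0.1 gives an unclipped alpha of 5.0; with m_slack = 0.01 it is clipped to 0.01.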
diff --git a/mira/Optimiser.h b/mira/Optimiser.h
index a610268f3..d8afb8a0a 100644
--- a/mira/Optimiser.h
+++ b/mira/Optimiser.h
@@ -24,151 +24,155 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "moses/ScoreComponentCollection.h"
-namespace Mira {
-
- class Optimiser {
- public:
- Optimiser() {}
-
- virtual size_t updateWeightsHopeFear(
- Moses::ScoreComponentCollection& weightUpdate,
- const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
- const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
- const std::vector<std::vector<float> >& bleuScoresHope,
- const std::vector<std::vector<float> >& bleuScoresFear,
- const std::vector<std::vector<float> >& modelScoresHope,
- const std::vector<std::vector<float> >& modelScoresFear,
- float learning_rate,
- size_t rank,
- size_t epoch,
- int updatePosition = -1) = 0;
- };
-
- class Perceptron : public Optimiser {
- public:
- virtual size_t updateWeightsHopeFear(
- Moses::ScoreComponentCollection& weightUpdate,
- const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
- const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
- const std::vector<std::vector<float> >& bleuScoresHope,
- const std::vector<std::vector<float> >& bleuScoresFear,
- const std::vector<std::vector<float> >& modelScoresHope,
- const std::vector<std::vector<float> >& modelScoresFear,
- float learning_rate,
- size_t rank,
- size_t epoch,
- int updatePosition = -1);
- };
-
- class MiraOptimiser : public Optimiser {
- public:
+namespace Mira
+{
+
+class Optimiser
+{
+public:
+ Optimiser() {}
+
+ virtual size_t updateWeightsHopeFear(
+ Moses::ScoreComponentCollection& weightUpdate,
+ const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
+ const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
+ const std::vector<std::vector<float> >& bleuScoresHope,
+ const std::vector<std::vector<float> >& bleuScoresFear,
+ const std::vector<std::vector<float> >& modelScoresHope,
+ const std::vector<std::vector<float> >& modelScoresFear,
+ float learning_rate,
+ size_t rank,
+ size_t epoch,
+ int updatePosition = -1) = 0;
+};
+
+class Perceptron : public Optimiser
+{
+public:
+ virtual size_t updateWeightsHopeFear(
+ Moses::ScoreComponentCollection& weightUpdate,
+ const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
+ const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
+ const std::vector<std::vector<float> >& bleuScoresHope,
+ const std::vector<std::vector<float> >& bleuScoresFear,
+ const std::vector<std::vector<float> >& modelScoresHope,
+ const std::vector<std::vector<float> >& modelScoresFear,
+ float learning_rate,
+ size_t rank,
+ size_t epoch,
+ int updatePosition = -1);
+};
+
+class MiraOptimiser : public Optimiser
+{
+public:
MiraOptimiser() :
Optimiser() { }
-
+
MiraOptimiser(
- float slack, bool scale_margin, bool scale_margin_precision,
- bool scale_update, bool scale_update_precision, bool boost, bool normaliseMargin, float sigmoidParam) :
- Optimiser(),
- m_slack(slack),
- m_scale_margin(scale_margin),
- m_scale_margin_precision(scale_margin_precision),
- m_scale_update(scale_update),
- m_scale_update_precision(scale_update_precision),
- m_precision(1),
- m_boost(boost),
- m_normaliseMargin(normaliseMargin),
- m_sigmoidParam(sigmoidParam) { }
-
- size_t updateWeights(
- Moses::ScoreComponentCollection& weightUpdate,
- const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValues,
- const std::vector<std::vector<float> >& losses,
- const std::vector<std::vector<float> >& bleuScores,
- const std::vector<std::vector<float> >& modelScores,
- const std::vector< Moses::ScoreComponentCollection>& oracleFeatureValues,
- const std::vector< float> oracleBleuScores,
- const std::vector< float> oracleModelScores,
- float learning_rate,
- size_t rank,
- size_t epoch);
- virtual size_t updateWeightsHopeFear(
- Moses::ScoreComponentCollection& weightUpdate,
- const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
- const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
- const std::vector<std::vector<float> >& bleuScoresHope,
- const std::vector<std::vector<float> >& bleuScoresFear,
- const std::vector<std::vector<float> >& modelScoresHope,
- const std::vector<std::vector<float> >& modelScoresFear,
- float learning_rate,
- size_t rank,
- size_t epoch,
- int updatePosition = -1);
- size_t updateWeightsHopeFearSelective(
- Moses::ScoreComponentCollection& weightUpdate,
- const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
- const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
- const std::vector<std::vector<float> >& bleuScoresHope,
- const std::vector<std::vector<float> >& bleuScoresFear,
- const std::vector<std::vector<float> >& modelScoresHope,
- const std::vector<std::vector<float> >& modelScoresFear,
- float learning_rate,
- size_t rank,
- size_t epoch,
- int updatePosition = -1);
- size_t updateWeightsHopeFearSummed(
- Moses::ScoreComponentCollection& weightUpdate,
- const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
- const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
- const std::vector<std::vector<float> >& bleuScoresHope,
- const std::vector<std::vector<float> >& bleuScoresFear,
- const std::vector<std::vector<float> >& modelScoresHope,
- const std::vector<std::vector<float> >& modelScoresFear,
- float learning_rate,
- size_t rank,
- size_t epoch,
- bool rescaleSlack,
- bool makePairs);
- size_t updateWeightsAnalytically(
- Moses::ScoreComponentCollection& weightUpdate,
- Moses::ScoreComponentCollection& featureValuesHope,
- Moses::ScoreComponentCollection& featureValuesFear,
- float bleuScoreHope,
- float bleuScoreFear,
- float modelScoreHope,
- float modelScoreFear,
- float learning_rate,
- size_t rank,
- size_t epoch);
-
- void setSlack(float slack) {
- m_slack = slack;
- }
-
- void setPrecision(float precision) {
- m_precision = precision;
- }
-
- private:
- // regularise Hildreth updates
- float m_slack;
-
- // scale margin with BLEU score or precision
- bool m_scale_margin, m_scale_margin_precision;
-
- // scale update with oracle BLEU score or precision
- bool m_scale_update, m_scale_update_precision;
-
- float m_precision;
-
- // boosting of updates on misranked candidates
- bool m_boost;
-
- // squash margin between 0 and 1 (or depending on m_sigmoidParam)
- bool m_normaliseMargin;
-
- // y=sigmoidParam is the axis that this sigmoid approaches
- float m_sigmoidParam ;
- };
+ float slack, bool scale_margin, bool scale_margin_precision,
+ bool scale_update, bool scale_update_precision, bool boost, bool normaliseMargin, float sigmoidParam) :
+ Optimiser(),
+ m_slack(slack),
+ m_scale_margin(scale_margin),
+ m_scale_margin_precision(scale_margin_precision),
+ m_scale_update(scale_update),
+ m_scale_update_precision(scale_update_precision),
+ m_precision(1),
+ m_boost(boost),
+ m_normaliseMargin(normaliseMargin),
+ m_sigmoidParam(sigmoidParam) { }
+
+ size_t updateWeights(
+ Moses::ScoreComponentCollection& weightUpdate,
+ const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValues,
+ const std::vector<std::vector<float> >& losses,
+ const std::vector<std::vector<float> >& bleuScores,
+ const std::vector<std::vector<float> >& modelScores,
+ const std::vector< Moses::ScoreComponentCollection>& oracleFeatureValues,
+ const std::vector< float> oracleBleuScores,
+ const std::vector< float> oracleModelScores,
+ float learning_rate,
+ size_t rank,
+ size_t epoch);
+ virtual size_t updateWeightsHopeFear(
+ Moses::ScoreComponentCollection& weightUpdate,
+ const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
+ const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
+ const std::vector<std::vector<float> >& bleuScoresHope,
+ const std::vector<std::vector<float> >& bleuScoresFear,
+ const std::vector<std::vector<float> >& modelScoresHope,
+ const std::vector<std::vector<float> >& modelScoresFear,
+ float learning_rate,
+ size_t rank,
+ size_t epoch,
+ int updatePosition = -1);
+ size_t updateWeightsHopeFearSelective(
+ Moses::ScoreComponentCollection& weightUpdate,
+ const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
+ const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
+ const std::vector<std::vector<float> >& bleuScoresHope,
+ const std::vector<std::vector<float> >& bleuScoresFear,
+ const std::vector<std::vector<float> >& modelScoresHope,
+ const std::vector<std::vector<float> >& modelScoresFear,
+ float learning_rate,
+ size_t rank,
+ size_t epoch,
+ int updatePosition = -1);
+ size_t updateWeightsHopeFearSummed(
+ Moses::ScoreComponentCollection& weightUpdate,
+ const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesHope,
+ const std::vector<std::vector<Moses::ScoreComponentCollection> >& featureValuesFear,
+ const std::vector<std::vector<float> >& bleuScoresHope,
+ const std::vector<std::vector<float> >& bleuScoresFear,
+ const std::vector<std::vector<float> >& modelScoresHope,
+ const std::vector<std::vector<float> >& modelScoresFear,
+ float learning_rate,
+ size_t rank,
+ size_t epoch,
+ bool rescaleSlack,
+ bool makePairs);
+ size_t updateWeightsAnalytically(
+ Moses::ScoreComponentCollection& weightUpdate,
+ Moses::ScoreComponentCollection& featureValuesHope,
+ Moses::ScoreComponentCollection& featureValuesFear,
+ float bleuScoreHope,
+ float bleuScoreFear,
+ float modelScoreHope,
+ float modelScoreFear,
+ float learning_rate,
+ size_t rank,
+ size_t epoch);
+
+ void setSlack(float slack) {
+ m_slack = slack;
+ }
+
+ void setPrecision(float precision) {
+ m_precision = precision;
+ }
+
+private:
+ // regularise Hildreth updates
+ float m_slack;
+
+ // scale margin with BLEU score or precision
+ bool m_scale_margin, m_scale_margin_precision;
+
+ // scale update with oracle BLEU score or precision
+ bool m_scale_update, m_scale_update_precision;
+
+ float m_precision;
+
+ // boosting of updates on misranked candidates
+ bool m_boost;
+
+ // squash margin between 0 and 1 (or depending on m_sigmoidParam)
+ bool m_normaliseMargin;
+
+ // y=sigmoidParam is the axis that this sigmoid approaches
+  float m_sigmoidParam;
+};
}
#endif
diff --git a/mira/Perceptron.cpp b/mira/Perceptron.cpp
index 569a83216..af61c28a9 100644
--- a/mira/Perceptron.cpp
+++ b/mira/Perceptron.cpp
@@ -22,30 +22,31 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
using namespace Moses;
using namespace std;
-namespace Mira {
+namespace Mira
+{
size_t Perceptron::updateWeightsHopeFear(
- ScoreComponentCollection& weightUpdate,
- const vector< vector<ScoreComponentCollection> >& featureValuesHope,
- const vector< vector<ScoreComponentCollection> >& featureValuesFear,
- const vector< vector<float> >& dummy1,
- const vector< vector<float> >& dummy2,
- const vector< vector<float> >& dummy3,
- const vector< vector<float> >& dummy4,
- float perceptron_learning_rate,
- size_t rank,
- size_t epoch,
- int updatePosition)
+ ScoreComponentCollection& weightUpdate,
+ const vector< vector<ScoreComponentCollection> >& featureValuesHope,
+ const vector< vector<ScoreComponentCollection> >& featureValuesFear,
+ const vector< vector<float> >& dummy1,
+ const vector< vector<float> >& dummy2,
+ const vector< vector<float> >& dummy3,
+ const vector< vector<float> >& dummy4,
+ float perceptron_learning_rate,
+ size_t rank,
+ size_t epoch,
+ int updatePosition)
{
- cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << featureValuesHope[0][0] << endl;
- cerr << "Rank " << rank << ", epoch " << epoch << ", fear: " << featureValuesFear[0][0] << endl;
- ScoreComponentCollection featureValueDiff = featureValuesHope[0][0];
- featureValueDiff.MinusEquals(featureValuesFear[0][0]);
- cerr << "Rank " << rank << ", epoch " << epoch << ", hope - fear: " << featureValueDiff << endl;
- featureValueDiff.MultiplyEquals(perceptron_learning_rate);
- weightUpdate.PlusEquals(featureValueDiff);
- cerr << "Rank " << rank << ", epoch " << epoch << ", update: " << featureValueDiff << endl;
- return 0;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << featureValuesHope[0][0] << endl;
+ cerr << "Rank " << rank << ", epoch " << epoch << ", fear: " << featureValuesFear[0][0] << endl;
+ ScoreComponentCollection featureValueDiff = featureValuesHope[0][0];
+ featureValueDiff.MinusEquals(featureValuesFear[0][0]);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", hope - fear: " << featureValueDiff << endl;
+ featureValueDiff.MultiplyEquals(perceptron_learning_rate);
+ weightUpdate.PlusEquals(featureValueDiff);
+ cerr << "Rank " << rank << ", epoch " << epoch << ", update: " << featureValueDiff << endl;
+ return 0;
}
}
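The perceptron update above reduces to w += eta * (h(hope) - h(fear)). The same step with a hypothetical stand-in vector type (not the Moses API):

    #include <cstddef>
    #include <vector>

    typedef std::vector<float> FeatureVec;

    void PerceptronUpdate(FeatureVec &weights, const FeatureVec &hope,
                          const FeatureVec &fear, float eta)
    {
      for (std::size_t i = 0; i < weights.size(); ++i)
        weights[i] += eta * (hope[i] - fear[i]);   // towards hope, away from fear
    }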
diff --git a/misc/processLexicalTableMin.cpp b/misc/processLexicalTableMin.cpp
index 3f2887e55..04f6590b1 100644
--- a/misc/processLexicalTableMin.cpp
+++ b/misc/processLexicalTableMin.cpp
@@ -20,7 +20,7 @@ void printHelp(char **argv)
"\t-T string -- path to temporary directory (uses /tmp by default)\n"
#ifdef WITH_THREADS
"\t-threads int|all -- number of threads used for conversion\n"
-#endif
+#endif
"\n advanced:\n"
"\t-landmark int -- use landmark phrase every 2^n phrases\n"
"\t-fingerprint int -- number of bits used for phrase fingerprints\n"
@@ -44,11 +44,11 @@ void printHelp(char **argv)
int main(int argc, char** argv)
{
-
+
std::string inFilePath;
std::string outFilePath("out");
std::string tempfilePath;
-
+
size_t orderBits = 10;
size_t fingerPrintBits = 16;
bool multipleScoreTrees = true;
@@ -56,52 +56,36 @@ int main(int argc, char** argv)
#ifdef WITH_THREADS
size_t threads = 1;
-#endif
+#endif
- if(1 >= argc)
- {
+ if(1 >= argc) {
printHelp(argv);
return 1;
}
- for(int i = 1; i < argc; ++i)
- {
+ for(int i = 1; i < argc; ++i) {
std::string arg(argv[i]);
- if("-in" == arg && i+1 < argc)
- {
+ if("-in" == arg && i+1 < argc) {
++i;
inFilePath = argv[i];
- }
- else if("-out" == arg && i+1 < argc)
- {
+ } else if("-out" == arg && i+1 < argc) {
++i;
outFilePath = argv[i];
- }
- else if("-T" == arg && i+1 < argc) {
+ } else if("-T" == arg && i+1 < argc) {
++i;
tempfilePath = argv[i];
util::NormalizeTempPrefix(tempfilePath);
- }
- else if("-landmark" == arg && i+1 < argc)
- {
+ } else if("-landmark" == arg && i+1 < argc) {
++i;
orderBits = atoi(argv[i]);
- }
- else if("-fingerprint" == arg && i+1 < argc)
- {
+ } else if("-fingerprint" == arg && i+1 < argc) {
++i;
fingerPrintBits = atoi(argv[i]);
- }
- else if("-join-scores" == arg)
- {
+ } else if("-join-scores" == arg) {
multipleScoreTrees = false;
- }
- else if("-quantize" == arg && i+1 < argc)
- {
+ } else if("-quantize" == arg && i+1 < argc) {
++i;
quantize = atoi(argv[i]);
- }
- else if("-threads" == arg && i+1 < argc)
- {
+ } else if("-threads" == arg && i+1 < argc) {
#ifdef WITH_THREADS
++i;
if(std::string(argv[i]) == "all") {
@@ -109,23 +93,20 @@ int main(int argc, char** argv)
if(!threads) {
std::cerr << "Could not determine number of hardware threads, setting to 1" << std::endl;
threads = 1;
- }
- }
- else
+ }
+ } else
threads = atoi(argv[i]);
#else
std::cerr << "Thread support not compiled in" << std::endl;
exit(1);
#endif
- }
- else
- {
+ } else {
//something's wrong... print help
printHelp(argv);
return 1;
}
}
-
+
if(outFilePath.rfind(".minlexr") != outFilePath.size() - 8)
outFilePath += ".minlexr";
@@ -135,6 +116,6 @@ int main(int argc, char** argv)
multipleScoreTrees, quantize
#ifdef WITH_THREADS
, threads
-#endif
+#endif
);
}
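The option loop above corresponds to invocations such as (file names illustrative):

    processLexicalTableMin -in lex.f2e -out lex.f2e -T /tmp -threads all

With -threads all the count is taken from the hardware, falling back to 1 if it cannot be determined, and the suffix .minlexr is appended to the output name when missing.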
diff --git a/misc/processPhraseTableMin.cpp b/misc/processPhraseTableMin.cpp
index 5c25262b8..1ea42252c 100644
--- a/misc/processPhraseTableMin.cpp
+++ b/misc/processPhraseTableMin.cpp
@@ -2,7 +2,7 @@
#ifdef WITH_THREADS
#include <boost/thread/thread.hpp>
-#endif
+#endif
#include "moses/TypeDef.h"
#include "moses/TranslationModel/CompactPT/PhraseTableCreator.h"
@@ -11,7 +11,8 @@
using namespace Moses;
-void printHelp(char **argv) {
+void printHelp(char **argv)
+{
std::cerr << "Usage " << argv[0] << ":\n"
" options: \n"
"\t-in string -- input table file name\n"
@@ -21,7 +22,7 @@ void printHelp(char **argv) {
"\t-no-alignment-info -- do not include alignment info in the binary phrase table\n"
#ifdef WITH_THREADS
"\t-threads int|all -- number of threads used for conversion\n"
-#endif
+#endif
"\n advanced:\n"
"\t-encoding string -- encoding type: PREnc REnc None (default PREnc)\n"
"\t-rankscore int -- score index of P(t|s) (default 2)\n"
@@ -48,14 +49,15 @@ void printHelp(char **argv) {
}
-int main(int argc, char **argv) {
-
+int main(int argc, char **argv)
+{
+
std::string inFilePath;
std::string outFilePath("out");
std::string tempfilePath;
PhraseTableCreator::Coding coding = PhraseTableCreator::PREnc;
-
- size_t numScoreComponent = 5;
+
+ size_t numScoreComponent = 5;
size_t orderBits = 10;
size_t fingerprintBits = 16;
bool useAlignmentInfo = true;
@@ -63,10 +65,10 @@ int main(int argc, char **argv) {
size_t quantize = 0;
size_t maxRank = 100;
bool sortScoreIndexSet = false;
- size_t sortScoreIndex = 2;
+ size_t sortScoreIndex = 2;
bool warnMe = true;
size_t threads = 1;
-
+
if(1 >= argc) {
printHelp(argv);
return 1;
@@ -76,64 +78,49 @@ int main(int argc, char **argv) {
if("-in" == arg && i+1 < argc) {
++i;
inFilePath = argv[i];
- }
- else if("-out" == arg && i+1 < argc) {
+ } else if("-out" == arg && i+1 < argc) {
++i;
outFilePath = argv[i];
- }
- else if("-T" == arg && i+1 < argc) {
+ } else if("-T" == arg && i+1 < argc) {
++i;
tempfilePath = argv[i];
util::NormalizeTempPrefix(tempfilePath);
- }
- else if("-encoding" == arg && i+1 < argc) {
+ } else if("-encoding" == arg && i+1 < argc) {
++i;
std::string val(argv[i]);
if(val == "None" || val == "none") {
coding = PhraseTableCreator::None;
- }
- else if(val == "REnc" || val == "renc") {
+ } else if(val == "REnc" || val == "renc") {
coding = PhraseTableCreator::REnc;
- }
- else if(val == "PREnc" || val == "prenc") {
+ } else if(val == "PREnc" || val == "prenc") {
coding = PhraseTableCreator::PREnc;
}
- }
- else if("-maxrank" == arg && i+1 < argc) {
+ } else if("-maxrank" == arg && i+1 < argc) {
++i;
maxRank = atoi(argv[i]);
- }
- else if("-nscores" == arg && i+1 < argc) {
+ } else if("-nscores" == arg && i+1 < argc) {
++i;
numScoreComponent = atoi(argv[i]);
- }
- else if("-rankscore" == arg && i+1 < argc) {
+ } else if("-rankscore" == arg && i+1 < argc) {
++i;
sortScoreIndex = atoi(argv[i]);
sortScoreIndexSet = true;
- }
- else if("-no-alignment-info" == arg) {
+ } else if("-no-alignment-info" == arg) {
useAlignmentInfo = false;
- }
- else if("-landmark" == arg && i+1 < argc) {
+ } else if("-landmark" == arg && i+1 < argc) {
++i;
orderBits = atoi(argv[i]);
- }
- else if("-fingerprint" == arg && i+1 < argc) {
+ } else if("-fingerprint" == arg && i+1 < argc) {
++i;
fingerprintBits = atoi(argv[i]);
- }
- else if("-join-scores" == arg) {
+ } else if("-join-scores" == arg) {
multipleScoreTrees = false;
- }
- else if("-quantize" == arg && i+1 < argc) {
+ } else if("-quantize" == arg && i+1 < argc) {
++i;
quantize = atoi(argv[i]);
- }
- else if("-no-warnings" == arg) {
+ } else if("-no-warnings" == arg) {
warnMe = false;
- }
- else if("-threads" == arg && i+1 < argc) {
+ } else if("-threads" == arg && i+1 < argc) {
#ifdef WITH_THREADS
++i;
if(std::string(argv[i]) == "all") {
@@ -141,40 +128,36 @@ int main(int argc, char **argv) {
if(!threads) {
std::cerr << "Could not determine number of hardware threads, setting to 1" << std::endl;
threads = 1;
- }
- }
- else
+ }
+ } else
threads = atoi(argv[i]);
#else
std::cerr << "Thread support not compiled in" << std::endl;
exit(1);
#endif
- }
- else {
+ } else {
//something's wrong... print help
printHelp(argv);
return 1;
}
}
-
- if(!sortScoreIndexSet && numScoreComponent != 5 && coding == PhraseTableCreator::PREnc)
- {
+
+ if(!sortScoreIndexSet && numScoreComponent != 5 && coding == PhraseTableCreator::PREnc) {
std::cerr << "WARNING: You are using a nonstandard number of scores ("
<< numScoreComponent << ") with PREnc. Set the index of P(t|s) "
"with -rankscore int if it is not "
<< sortScoreIndex << "." << std::endl;
}
-
- if(sortScoreIndex >= numScoreComponent)
- {
+
+ if(sortScoreIndex >= numScoreComponent) {
std::cerr << "ERROR: -rankscore " << sortScoreIndex << " is out of range (0 ... "
<< (numScoreComponent-1) << ")" << std::endl;
abort();
}
-
+
if(outFilePath.rfind(".minphr") != outFilePath.size() - 7)
outFilePath += ".minphr";
-
+
PhraseTableCreator(inFilePath, outFilePath, tempfilePath,
numScoreComponent, sortScoreIndex,
coding, orderBits, fingerprintBits,
@@ -182,6 +165,6 @@ int main(int argc, char **argv) {
quantize, maxRank, warnMe
#ifdef WITH_THREADS
, threads
-#endif
- );
+#endif
+ );
}
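As with the lexical tool, a typical invocation is (file names illustrative):

    processPhraseTableMin -in phrase-table.gz -out phrase-table -nscores 5 -threads all

Per the checks above, -rankscore (the score index of P(t|s)) must be below -nscores or the tool aborts; a warning is printed when PREnc is used with a nonstandard score count and no -rankscore; and the suffix .minphr is appended to the output name when missing.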
diff --git a/misc/queryPhraseTable.cpp b/misc/queryPhraseTable.cpp
index d8103f371..5e4f7755a 100644
--- a/misc/queryPhraseTable.cpp
+++ b/misc/queryPhraseTable.cpp
@@ -33,8 +33,7 @@ int main(int argc, char **argv)
needAlignments = true;
} else if (!strcmp(argv[i], "-c")) {
reportCounts = true;
- }
- else
+ } else
usage();
}
diff --git a/misc/queryPhraseTableMin.cpp b/misc/queryPhraseTableMin.cpp
index f4dca8b6b..6b6f9beaf 100644
--- a/misc/queryPhraseTableMin.cpp
+++ b/misc/queryPhraseTableMin.cpp
@@ -36,8 +36,7 @@ int main(int argc, char **argv)
useAlignments = true;
} else if (!strcmp(argv[i], "-c")) {
reportCounts = true;
- }
- else
+ } else
usage();
}
@@ -47,28 +46,28 @@ int main(int argc, char **argv)
std::vector<FactorType> input(1, 0);
std::vector<FactorType> output(1, 0);
std::vector<float> weight(nscores, 0);
-
+
Parameter *parameter = new Parameter();
const_cast<std::vector<std::string>&>(parameter->GetParam("factor-delimiter")).resize(1, "||dummy_string||");
const_cast<std::vector<std::string>&>(parameter->GetParam("input-factors")).resize(1, "0");
const_cast<std::vector<std::string>&>(parameter->GetParam("verbose")).resize(1, "0");
const_cast<std::vector<std::string>&>(parameter->GetParam("weight-w")).resize(1, "0");
const_cast<std::vector<std::string>&>(parameter->GetParam("weight-d")).resize(1, "0");
-
+
StaticData::InstanceNonConst().LoadData(parameter);
PhraseDictionaryCompact pdc("input-factor=0 output-factor=0 num-features=5 path=" + ttable);
- bool ret = pdc.InitDictionary();
+ bool ret = pdc.InitDictionary();
assert(ret);
-
+
std::string line;
while(getline(std::cin, line)) {
Phrase sourcePhrase;
sourcePhrase.CreateFromString(Input, input, line, "||dummy_string||", NULL);
-
+
TargetPhraseVectorPtr decodedPhraseColl
- = pdc.GetTargetPhraseCollectionRaw(sourcePhrase);
-
+ = pdc.GetTargetPhraseCollectionRaw(sourcePhrase);
+
if(decodedPhraseColl != NULL) {
if(reportCounts)
std::cout << sourcePhrase << decodedPhraseColl->size() << std::endl;
@@ -77,19 +76,18 @@ int main(int argc, char **argv)
TargetPhrase &tp = *it;
std::cout << sourcePhrase << "||| ";
std::cout << static_cast<const Phrase&>(tp) << "|||";
-
+
if(useAlignments)
- std::cout << " " << tp.GetAlignTerm() << "|||";
-
+ std::cout << " " << tp.GetAlignTerm() << "|||";
+
std::vector<float> scores = tp.GetScoreBreakdown().GetScoresForProducer(&pdc);
for(size_t i = 0; i < scores.size(); i++)
std::cout << " " << exp(scores[i]);
std::cout << std::endl;
}
- }
- else if(reportCounts)
+ } else if(reportCounts)
std::cout << sourcePhrase << 0 << std::endl;
-
+
std::cout.flush();
}
}
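Note on the query loop above: scores come back from GetScoreBreakdown() in log space, so each one is printed through exp(), and with -c only the number of matching target phrases is reported per source phrase.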
diff --git a/moses-chart-cmd/IOWrapper.cpp b/moses-chart-cmd/IOWrapper.cpp
index 92f68a84d..b130943a1 100644
--- a/moses-chart-cmd/IOWrapper.cpp
+++ b/moses-chart-cmd/IOWrapper.cpp
@@ -138,7 +138,8 @@ IOWrapper::~IOWrapper()
delete m_alignmentInfoCollector;
}
-void IOWrapper::ResetTranslationId() {
+void IOWrapper::ResetTranslationId()
+{
m_translationId = StaticData::Instance().GetStartTranslationId();
}
@@ -174,7 +175,7 @@ void OutputSurface(std::ostream &out, const Phrase &phrase, const std::vector<Fa
for (size_t i = 1 ; i < outputFactorOrder.size() ; i++) {
const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[i]);
- CHECK(factor);
+ CHECK(factor);
out << "|" << *factor;
}
@@ -245,8 +246,8 @@ void OutputInput(std::ostream& os, const ChartHypothesis* hypo)
// Given a hypothesis and sentence, reconstructs the 'application context' --
// the source RHS symbols of the SCFG rule that was applied, plus their spans.
void IOWrapper::ReconstructApplicationContext(const ChartHypothesis &hypo,
- const Sentence &sentence,
- ApplicationContext &context)
+ const Sentence &sentence,
+ ApplicationContext &context)
{
context.clear();
const std::vector<const ChartHypothesis*> &prevHypos = hypo.GetPrevHypos();
@@ -276,7 +277,7 @@ void IOWrapper::ReconstructApplicationContext(const ChartHypothesis &hypo,
// but there are scripts and tools that expect the output of -T to look like
// that.
void IOWrapper::WriteApplicationContext(std::ostream &out,
- const ApplicationContext &context)
+ const ApplicationContext &context)
{
assert(!context.empty());
ApplicationContext::const_reverse_iterator p = context.rbegin();
@@ -327,7 +328,7 @@ void IOWrapper::OutputDetailedTranslationReport(
CHECK(m_detailOutputCollector);
m_detailOutputCollector->Write(translationId, out.str());
}
-
+
void IOWrapper::OutputBestHypo(const ChartHypothesis *hypo, long translationId)
{
@@ -344,18 +345,18 @@ void IOWrapper::OutputBestHypo(const ChartHypothesis *hypo, long translationId)
if (StaticData::Instance().GetOutputHypoScore()) {
out << hypo->GetTotalScore() << " ";
}
-
+
if (StaticData::Instance().IsPathRecoveryEnabled()) {
out << "||| ";
}
Phrase outPhrase(ARRAY_SIZE_INCR);
hypo->CreateOutputPhrase(outPhrase);
-
+
// delete 1st & last
CHECK(outPhrase.GetSize() >= 2);
outPhrase.RemoveWord(0);
outPhrase.RemoveWord(outPhrase.GetSize() - 1);
-
+
const std::vector<FactorType> outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
string output = outPhrase.GetStringRep(outputFactorOrder);
out << output << endl;
@@ -371,7 +372,8 @@ void IOWrapper::OutputBestHypo(const ChartHypothesis *hypo, long translationId)
m_singleBestOutputCollector->Write(translationId, out.str());
}
-void IOWrapper::OutputBestHypo(search::Applied applied, long translationId) {
+void IOWrapper::OutputBestHypo(search::Applied applied, long translationId)
+{
if (!m_singleBestOutputCollector) return;
std::ostringstream out;
IOWrapper::FixPrecision(out);
@@ -389,7 +391,8 @@ void IOWrapper::OutputBestHypo(search::Applied applied, long translationId) {
m_singleBestOutputCollector->Write(translationId, out.str());
}
-void IOWrapper::OutputBestNone(long translationId) {
+void IOWrapper::OutputBestNone(long translationId)
+{
if (!m_singleBestOutputCollector) return;
if (StaticData::Instance().GetOutputHypoScore()) {
m_singleBestOutputCollector->Write(translationId, "0 \n");
@@ -443,7 +446,8 @@ void IOWrapper::OutputFeatureScores( std::ostream& out, const ScoreComponentColl
}
}
-void IOWrapper::OutputNBestList(const ChartTrellisPathList &nBestList, long translationId) {
+void IOWrapper::OutputNBestList(const ChartTrellisPathList &nBestList, long translationId)
+{
std::ostringstream out;
// Check if we're writing to std::cout.
@@ -452,7 +456,7 @@ void IOWrapper::OutputNBestList(const ChartTrellisPathList &nBestList, long tran
// preserve existing behaviour, but should probably be done either way.
IOWrapper::FixPrecision(out);
- // Used to check StaticData's GetOutputHypoScore(), but it makes no sense with nbest output.
+ // Used to check StaticData's GetOutputHypoScore(), but it makes no sense with nbest output.
}
//bool includeAlignment = StaticData::Instance().NBestIncludesAlignment();
@@ -528,7 +532,8 @@ void IOWrapper::OutputNBestList(const ChartTrellisPathList &nBestList, long tran
m_nBestOutputCollector->Write(translationId, out.str());
}
-void IOWrapper::OutputNBestList(const std::vector<search::Applied> &nbest, long translationId) {
+void IOWrapper::OutputNBestList(const std::vector<search::Applied> &nbest, long translationId)
+{
std::ostringstream out;
// wtf? copied from the original OutputNBestList
if (m_nBestOutputCollector->OutputIsCout()) {
@@ -565,12 +570,11 @@ void ShiftOffsets(vector<T> &offsets, T shift)
T currPos = shift;
for (size_t i = 0; i < offsets.size(); ++i) {
if (offsets[i] == 0) {
- offsets[i] = currPos;
- ++currPos;
- }
- else {
- currPos += offsets[i];
- }
+ offsets[i] = currPos;
+ ++currPos;
+ } else {
+ currPos += offsets[i];
+ }
}
}
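A worked example of ShiftOffsets: with shift = 2 and offsets {0, 3, 0}, the loop produces {2, 3, 6}; each zero entry is replaced by the running position, and each non-zero entry advances that position by its own value.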
@@ -630,8 +634,7 @@ size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartT
totalTargetSize += targetSize;
++targetInd;
- }
- else {
+ } else {
++totalTargetSize;
}
}
@@ -666,15 +669,15 @@ void IOWrapper::OutputAlignment(size_t translationId , const Moses::ChartHypothe
ostringstream out;
if (hypo) {
- Alignments retAlign;
- OutputAlignment(retAlign, hypo, 0);
-
- // output alignments
- Alignments::const_iterator iter;
- for (iter = retAlign.begin(); iter != retAlign.end(); ++iter) {
- const pair<size_t, size_t> &alignPoint = *iter;
- out << alignPoint.first << "-" << alignPoint.second << " ";
- }
+ Alignments retAlign;
+ OutputAlignment(retAlign, hypo, 0);
+
+ // output alignments
+ Alignments::const_iterator iter;
+ for (iter = retAlign.begin(); iter != retAlign.end(); ++iter) {
+ const pair<size_t, size_t> &alignPoint = *iter;
+ out << alignPoint.first << "-" << alignPoint.second << " ";
+ }
}
out << endl;
@@ -724,8 +727,7 @@ size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypoth
totalTargetSize += targetSize;
++targetInd;
- }
- else {
+ } else {
++totalTargetSize;
}
}
diff --git a/moses-chart-cmd/IOWrapper.h b/moses-chart-cmd/IOWrapper.h
index 3178b6507..aba73e9a6 100644
--- a/moses-chart-cmd/IOWrapper.h
+++ b/moses-chart-cmd/IOWrapper.h
@@ -91,11 +91,11 @@ protected:
const ApplicationContext &context);
void OutputAllFeatureScores(const Moses::ScoreComponentCollection &features
- , std::ostream &out);
+ , std::ostream &out);
void OutputFeatureScores( std::ostream& out
- , const Moses::ScoreComponentCollection &features
- , const Moses::FeatureFunction *ff
- , std::string &lastName );
+ , const Moses::ScoreComponentCollection &features
+ , const Moses::FeatureFunction *ff
+ , std::string &lastName );
public:
IOWrapper(const std::vector<Moses::FactorType> &inputFactorOrder
diff --git a/moses-chart-cmd/Main.cpp b/moses-chart-cmd/Main.cpp
index a7568f5fb..61b8b9f5e 100644
--- a/moses-chart-cmd/Main.cpp
+++ b/moses-chart-cmd/Main.cpp
@@ -190,7 +190,7 @@ static void PrintFeatureWeight(const FeatureFunction* ff)
size_t numScoreComps = ff->GetNumScoreComponents();
vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
for (size_t i = 0; i < numScoreComps; ++i) {
- cout << " " << values[i];
+ cout << " " << values[i];
}
cout << endl;
@@ -244,12 +244,12 @@ int main(int argc, char* argv[])
ShowWeights();
exit(0);
}
-
+
CHECK(staticData.IsChart());
-
+
// set up read/writing class
IOWrapper *ioWrapper = GetIOWrapper(staticData);
-
+
// check on weights
const ScoreComponentCollection& weights = staticData.GetAllWeights();
IFVERBOSE(2) {
@@ -264,7 +264,7 @@ int main(int argc, char* argv[])
#ifdef WITH_THREADS
ThreadPool pool(staticData.ThreadCount());
#endif
-
+
// read each sentence & decode
InputType *source=0;
while(ReadInput(*ioWrapper,staticData.GetInputType(),source)) {
@@ -279,16 +279,16 @@ int main(int argc, char* argv[])
delete task;
#endif
}
-
+
#ifdef WITH_THREADS
pool.Stop(true); // flush remaining jobs
#endif
-
+
delete ioWrapper;
-
+
IFVERBOSE(1)
PrintUserTime("End.");
-
+
} catch (const std::exception &e) {
std::cerr << "Exception: " << e.what() << std::endl;
return EXIT_FAILURE;
diff --git a/moses-chart-cmd/Main.h b/moses-chart-cmd/Main.h
index 4f2765695..319e3889c 100644
--- a/moses-chart-cmd/Main.h
+++ b/moses-chart-cmd/Main.h
@@ -36,8 +36,9 @@ POSSIBILITY OF SUCH DAMAGE.
#include "moses/StaticData.h"
-namespace MosesChartCmd {
- class IOWrapper;
+namespace MosesChartCmd
+{
+class IOWrapper;
}
int main(int argc, char* argv[]);
diff --git a/moses-cmd/IOWrapper.cpp b/moses-cmd/IOWrapper.cpp
index e5346852f..44e60ddf3 100644
--- a/moses-cmd/IOWrapper.cpp
+++ b/moses-cmd/IOWrapper.cpp
@@ -53,47 +53,47 @@ namespace MosesCmd
{
IOWrapper::IOWrapper(
- const vector<FactorType> &inputFactorOrder
- , const vector<FactorType> &outputFactorOrder
- , const FactorMask &inputFactorUsed
- , size_t nBestSize
- , const string &nBestFilePath)
-:m_inputFactorOrder(inputFactorOrder)
-,m_outputFactorOrder(outputFactorOrder)
-,m_inputFactorUsed(inputFactorUsed)
-,m_inputFile(NULL)
-,m_inputStream(&std::cin)
-,m_nBestStream(NULL)
-,m_outputWordGraphStream(NULL)
-,m_outputSearchGraphStream(NULL)
-,m_detailedTranslationReportingStream(NULL)
-,m_alignmentOutputStream(NULL)
+ const vector<FactorType> &inputFactorOrder
+ , const vector<FactorType> &outputFactorOrder
+ , const FactorMask &inputFactorUsed
+ , size_t nBestSize
+ , const string &nBestFilePath)
+ :m_inputFactorOrder(inputFactorOrder)
+ ,m_outputFactorOrder(outputFactorOrder)
+ ,m_inputFactorUsed(inputFactorUsed)
+ ,m_inputFile(NULL)
+ ,m_inputStream(&std::cin)
+ ,m_nBestStream(NULL)
+ ,m_outputWordGraphStream(NULL)
+ ,m_outputSearchGraphStream(NULL)
+ ,m_detailedTranslationReportingStream(NULL)
+ ,m_alignmentOutputStream(NULL)
{
Initialization(inputFactorOrder, outputFactorOrder
- , inputFactorUsed
- , nBestSize, nBestFilePath);
+ , inputFactorUsed
+ , nBestSize, nBestFilePath);
}
IOWrapper::IOWrapper(const std::vector<FactorType> &inputFactorOrder
- , const std::vector<FactorType> &outputFactorOrder
- , const FactorMask &inputFactorUsed
- , size_t nBestSize
- , const std::string &nBestFilePath
- , const std::string &inputFilePath)
-:m_inputFactorOrder(inputFactorOrder)
-,m_outputFactorOrder(outputFactorOrder)
-,m_inputFactorUsed(inputFactorUsed)
-,m_inputFilePath(inputFilePath)
-,m_inputFile(new InputFileStream(inputFilePath))
-,m_nBestStream(NULL)
-,m_outputWordGraphStream(NULL)
-,m_outputSearchGraphStream(NULL)
-,m_detailedTranslationReportingStream(NULL)
-,m_alignmentOutputStream(NULL)
+ , const std::vector<FactorType> &outputFactorOrder
+ , const FactorMask &inputFactorUsed
+ , size_t nBestSize
+ , const std::string &nBestFilePath
+ , const std::string &inputFilePath)
+ :m_inputFactorOrder(inputFactorOrder)
+ ,m_outputFactorOrder(outputFactorOrder)
+ ,m_inputFactorUsed(inputFactorUsed)
+ ,m_inputFilePath(inputFilePath)
+ ,m_inputFile(new InputFileStream(inputFilePath))
+ ,m_nBestStream(NULL)
+ ,m_outputWordGraphStream(NULL)
+ ,m_outputSearchGraphStream(NULL)
+ ,m_detailedTranslationReportingStream(NULL)
+ ,m_alignmentOutputStream(NULL)
{
Initialization(inputFactorOrder, outputFactorOrder
- , inputFactorUsed
- , nBestSize, nBestFilePath);
+ , inputFactorUsed
+ , nBestSize, nBestFilePath);
m_inputStream = m_inputFile;
}
@@ -117,10 +117,10 @@ IOWrapper::~IOWrapper()
}
void IOWrapper::Initialization(const std::vector<FactorType> &/*inputFactorOrder*/
- , const std::vector<FactorType> &/*outputFactorOrder*/
- , const FactorMask &/*inputFactorUsed*/
- , size_t nBestSize
- , const std::string &nBestFilePath)
+ , const std::vector<FactorType> &/*outputFactorOrder*/
+ , const FactorMask &/*inputFactorUsed*/
+ , size_t nBestSize
+ , const std::string &nBestFilePath)
{
const StaticData &staticData = StaticData::Instance();
@@ -192,7 +192,7 @@ InputType*IOWrapper::GetInput(InputType* inputType)
* print surface factor only for the given phrase
*/
void OutputSurface(std::ostream &out, const Hypothesis &edge, const std::vector<FactorType> &outputFactorOrder,
- bool reportSegmentation, bool reportAllFactors)
+ bool reportSegmentation, bool reportAllFactors)
{
CHECK(outputFactorOrder.size() > 0);
const Phrase& phrase = edge.GetCurrTargetPhrase();
@@ -218,12 +218,12 @@ void OutputSurface(std::ostream &out, const Hypothesis &edge, const std::vector<
// trace option "-t"
if (reportSegmentation == true && phrase.GetSize() > 0) {
out << "|" << edge.GetCurrSourceWordsRange().GetStartPos()
- << "-" << edge.GetCurrSourceWordsRange().GetEndPos() << "| ";
+ << "-" << edge.GetCurrSourceWordsRange().GetEndPos() << "| ";
}
}
void OutputBestSurface(std::ostream &out, const Hypothesis *hypo, const std::vector<FactorType> &outputFactorOrder,
- bool reportSegmentation, bool reportAllFactors)
+ bool reportSegmentation, bool reportAllFactors)
{
if (hypo != NULL) {
// recursively retrace this best path through the lattice, starting from the end of the hypothesis sentence
@@ -377,10 +377,10 @@ void IOWrapper::OutputBestHypo(const Hypothesis *hypo, long /*translationId*/, b
}
void OutputNBest(std::ostream& out
- , const Moses::TrellisPathList &nBestList
- , const std::vector<Moses::FactorType>& outputFactorOrder
- , long translationId
- , bool reportSegmentation)
+ , const Moses::TrellisPathList &nBestList
+ , const std::vector<Moses::FactorType>& outputFactorOrder
+ , long translationId
+ , bool reportSegmentation)
{
const StaticData &staticData = StaticData::Instance();
bool labeledOutput = staticData.IsLabeledNBestList();
@@ -473,9 +473,9 @@ void OutputAllFeatureScores(const Moses::ScoreComponentCollection &features
}
void OutputFeatureScores( std::ostream& out
- , const ScoreComponentCollection &features
- , const FeatureFunction *ff
- , std::string &lastName )
+ , const ScoreComponentCollection &features
+ , const FeatureFunction *ff
+ , std::string &lastName )
{
const StaticData &staticData = StaticData::Instance();
bool labeledOutput = staticData.IsLabeledNBestList();
@@ -556,7 +556,7 @@ IOWrapper *GetIOWrapper(const StaticData &staticData)
{
IOWrapper *ioWrapper;
const std::vector<FactorType> &inputFactorOrder = staticData.GetInputFactorOrder()
- ,&outputFactorOrder = staticData.GetOutputFactorOrder();
+ ,&outputFactorOrder = staticData.GetOutputFactorOrder();
FactorMask inputFactorUsed(inputFactorOrder);
// io
@@ -565,14 +565,14 @@ IOWrapper *GetIOWrapper(const StaticData &staticData)
string filePath = staticData.GetParam("input-file")[0];
ioWrapper = new IOWrapper(inputFactorOrder, outputFactorOrder, inputFactorUsed
- , staticData.GetNBestSize()
- , staticData.GetNBestFilePath()
- , filePath);
+ , staticData.GetNBestSize()
+ , staticData.GetNBestFilePath()
+ , filePath);
} else {
VERBOSE(1,"IO from STDOUT/STDIN" << endl);
ioWrapper = new IOWrapper(inputFactorOrder, outputFactorOrder, inputFactorUsed
- , staticData.GetNBestSize()
- , staticData.GetNBestFilePath());
+ , staticData.GetNBestSize()
+ , staticData.GetNBestFilePath());
}
ioWrapper->ResetTranslationId();
diff --git a/moses-cmd/IOWrapper.h b/moses-cmd/IOWrapper.h
index fac9ca307..66b9377dc 100644
--- a/moses-cmd/IOWrapper.h
+++ b/moses-cmd/IOWrapper.h
@@ -142,12 +142,12 @@ void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Mo
void OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo);
void OutputNBest(std::ostream& out
- , const Moses::TrellisPathList &nBestList
- , const std::vector<Moses::FactorType>& outputFactorOrder
- , long translationId
- , bool reportSegmentation);
+ , const Moses::TrellisPathList &nBestList
+ , const std::vector<Moses::FactorType>& outputFactorOrder
+ , long translationId
+ , bool reportSegmentation);
void OutputAllFeatureScores(const Moses::ScoreComponentCollection &features
- , std::ostream &out);
+ , std::ostream &out);
void OutputFeatureScores( std::ostream& out
, const Moses::ScoreComponentCollection &features
, const Moses::FeatureFunction *ff
diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp
index 3a6f2856e..d70b64536 100644
--- a/moses-cmd/Main.cpp
+++ b/moses-cmd/Main.cpp
@@ -92,7 +92,7 @@ public:
OutputCollector* alignmentInfoCollector,
OutputCollector* unknownsCollector,
bool outputSearchGraphSLF,
- bool outputSearchGraphHypergraph) :
+ bool outputSearchGraphHypergraph) :
m_source(source), m_lineNumber(lineNumber),
m_outputCollector(outputCollector), m_nbestCollector(nbestCollector),
m_latticeSamplesCollector(latticeSamplesCollector),
@@ -103,7 +103,7 @@ public:
m_outputSearchGraphSLF(outputSearchGraphSLF),
m_outputSearchGraphHypergraph(outputSearchGraphHypergraph) {}
- /** Translate one sentence
+ /** Translate one sentence
* gets called by main function implemented at end of this source file */
void Run() {
@@ -150,7 +150,7 @@ public:
manager.SerializeSearchGraphPB(m_lineNumber, output);
}
#endif
- }
+ }
// Output search graph in HTK standard lattice format (SLF)
if (m_outputSearchGraphSLF) {
@@ -159,13 +159,13 @@ public:
std::ofstream *file = new std::ofstream;
file->open(fileName.str().c_str());
if (file->is_open() && file->good()) {
- ostringstream out;
- fix(out,PRECISION);
- manager.OutputSearchGraphAsSLF(m_lineNumber, out);
- *file << out.str();
- file -> flush();
+ ostringstream out;
+ fix(out,PRECISION);
+ manager.OutputSearchGraphAsSLF(m_lineNumber, out);
+ *file << out.str();
+ file -> flush();
} else {
- TRACE_ERR("Cannot output HTK standard lattice for line " << m_lineNumber << " because the output file is not open or not ready for writing" << std::endl);
+ TRACE_ERR("Cannot output HTK standard lattice for line " << m_lineNumber << " because the output file is not open or not ready for writing" << std::endl);
}
}
@@ -176,91 +176,91 @@ public:
bool appendSuffix;
if (hypergraphParameters.size() > 0 && hypergraphParameters[0] == "true") {
- appendSuffix = true;
+ appendSuffix = true;
} else {
- appendSuffix = false;
+ appendSuffix = false;
}
string compression;
if (hypergraphParameters.size() > 1) {
- compression = hypergraphParameters[1];
+ compression = hypergraphParameters[1];
} else {
- compression = "txt";
+ compression = "txt";
}
string hypergraphDir;
if ( hypergraphParameters.size() > 2 ) {
hypergraphDir = hypergraphParameters[2];
} else {
- string nbestFile = staticData.GetNBestFilePath();
- if ( ! nbestFile.empty() && nbestFile!="-" && !boost::starts_with(nbestFile,"/dev/stdout") ) {
- boost::filesystem::path nbestPath(nbestFile);
-
- // In the Boost filesystem API version 2,
- // which was the default prior to Boost 1.46,
- // the filename() method returned a string.
- //
- // In the Boost filesystem API version 3,
- // which is the default starting with Boost 1.46,
- // the filename() method returns a path object.
- //
- // To get a string from the path object,
- // the native() method must be called.
- // hypergraphDir = nbestPath.parent_path().filename()
- //#if BOOST_VERSION >= 104600
- // .native()
- //#endif
- //;
-
- // Hopefully the following compiles under all versions of Boost.
- //
- // If this line gives you compile errors,
- // contact Lane Schwartz on the Moses mailing list
- hypergraphDir = nbestPath.parent_path().string();
-
- } else {
- stringstream hypergraphDirName;
- hypergraphDirName << boost::filesystem::current_path() << "/hypergraph";
- hypergraphDir = hypergraphDirName.str();
- }
+ string nbestFile = staticData.GetNBestFilePath();
+ if ( ! nbestFile.empty() && nbestFile!="-" && !boost::starts_with(nbestFile,"/dev/stdout") ) {
+ boost::filesystem::path nbestPath(nbestFile);
+
+ // In the Boost filesystem API version 2,
+ // which was the default prior to Boost 1.46,
+ // the filename() method returned a string.
+ //
+ // In the Boost filesystem API version 3,
+ // which is the default starting with Boost 1.46,
+ // the filename() method returns a path object.
+ //
+ // To get a string from the path object,
+ // the native() method must be called.
+ // hypergraphDir = nbestPath.parent_path().filename()
+ //#if BOOST_VERSION >= 104600
+ // .native()
+ //#endif
+ //;
+
+ // Hopefully the following compiles under all versions of Boost.
+ //
+ // If this line gives you compile errors,
+ // contact Lane Schwartz on the Moses mailing list
+ hypergraphDir = nbestPath.parent_path().string();
+
+ } else {
+ stringstream hypergraphDirName;
+ hypergraphDirName << boost::filesystem::current_path() << "/hypergraph";
+ hypergraphDir = hypergraphDirName.str();
+ }
}
if ( ! boost::filesystem::exists(hypergraphDir) ) {
- boost::filesystem::create_directory(hypergraphDir);
- }
+ boost::filesystem::create_directory(hypergraphDir);
+ }
if ( ! boost::filesystem::exists(hypergraphDir) ) {
- TRACE_ERR("Cannot output hypergraphs to " << hypergraphDir << " because the directory does not exist" << std::endl);
+ TRACE_ERR("Cannot output hypergraphs to " << hypergraphDir << " because the directory does not exist" << std::endl);
} else if ( ! boost::filesystem::is_directory(hypergraphDir) ) {
- TRACE_ERR("Cannot output hypergraphs to " << hypergraphDir << " because that path exists, but is not a directory" << std::endl);
+ TRACE_ERR("Cannot output hypergraphs to " << hypergraphDir << " because that path exists, but is not a directory" << std::endl);
} else {
- stringstream fileName;
- fileName << hypergraphDir << "/" << m_lineNumber;
- if ( appendSuffix ) {
- fileName << "." << compression;
- }
- boost::iostreams::filtering_ostream *file = new boost::iostreams::filtering_ostream;
-
- if ( compression == "gz" ) {
- file->push( boost::iostreams::gzip_compressor() );
- } else if ( compression == "bz2" ) {
- file->push( boost::iostreams::bzip2_compressor() );
- } else if ( compression != "txt" ) {
- TRACE_ERR("Unrecognized hypergraph compression format (" << compression << ") - using uncompressed plain txt" << std::endl);
- compression = "txt";
- }
-
- file->push( boost::iostreams::file_sink(fileName.str(), ios_base::out) );
-
- if (file->is_complete() && file->good()) {
- fix(*file,PRECISION);
- manager.OutputSearchGraphAsHypergraph(m_lineNumber, *file);
- file -> flush();
- } else {
- TRACE_ERR("Cannot output hypergraph for line " << m_lineNumber << " because the output file " << fileName.str() << " is not open or not ready for writing" << std::endl);
- }
- file -> pop();
- delete file;
+ stringstream fileName;
+ fileName << hypergraphDir << "/" << m_lineNumber;
+ if ( appendSuffix ) {
+ fileName << "." << compression;
+ }
+ boost::iostreams::filtering_ostream *file = new boost::iostreams::filtering_ostream;
+
+ if ( compression == "gz" ) {
+ file->push( boost::iostreams::gzip_compressor() );
+ } else if ( compression == "bz2" ) {
+ file->push( boost::iostreams::bzip2_compressor() );
+ } else if ( compression != "txt" ) {
+ TRACE_ERR("Unrecognized hypergraph compression format (" << compression << ") - using uncompressed plain txt" << std::endl);
+ compression = "txt";
+ }
+
+ file->push( boost::iostreams::file_sink(fileName.str(), ios_base::out) );
+
+ if (file->is_complete() && file->good()) {
+ fix(*file,PRECISION);
+ manager.OutputSearchGraphAsHypergraph(m_lineNumber, *file);
+ file -> flush();
+ } else {
+ TRACE_ERR("Cannot output hypergraph for line " << m_lineNumber << " because the output file " << fileName.str() << " is not open or not ready for writing" << std::endl);
+ }
+ file -> pop();
+ delete file;
}
}
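The hypergraph writer above uses the usual Boost.Iostreams pattern: push an optional compressor filter onto a filtering_ostream, then the terminal file sink, and write through the chain. A minimal self-contained sketch (path and payload are illustrative):

    #include <string>
    #include <boost/iostreams/filtering_stream.hpp>
    #include <boost/iostreams/filter/gzip.hpp>
    #include <boost/iostreams/device/file.hpp>

    void WriteCompressed(const std::string &path)
    {
      boost::iostreams::filtering_ostream out;
      out.push(boost::iostreams::gzip_compressor());   // optional filter stage
      out.push(boost::iostreams::file_sink(path));     // terminal sink
      out << "one line of hypergraph text\n";          // compressed on the way out
    }   // destructor flushes and closes the chain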
@@ -277,8 +277,7 @@ public:
// MAP decoding: best hypothesis
const Hypothesis* bestHypo = NULL;
- if (!staticData.UseMBR())
- {
+ if (!staticData.UseMBR()) {
bestHypo = manager.GetBestHypothesis();
if (bestHypo) {
if (staticData.IsPathRecoveryEnabled()) {
@@ -296,7 +295,7 @@ public:
staticData.GetReportSegmentation(),
staticData.GetReportAllFactors());
if (staticData.PrintAlignmentInfo()) {
- out << "||| ";
+ out << "||| ";
OutputAlignment(out, bestHypo);
}
@@ -306,11 +305,10 @@ public:
}
}
out << endl;
- }
+ }
// MBR decoding (n-best MBR, lattice MBR, consensus)
- else
- {
+ else {
// we first need the n-best translations
size_t nBestSize = staticData.GetMBRSize();
if (nBestSize <= 0) {
@@ -346,7 +344,7 @@ public:
}
// consensus decoding
- else if (staticData.UseConsensusDecoding()) {
+ else if (staticData.UseConsensusDecoding()) {
const TrellisPath &conBestHypo = doConsensusDecoding(manager,nBestList);
OutputBestHypo(conBestHypo, m_lineNumber,
staticData.GetReportSegmentation(),
@@ -355,8 +353,8 @@ public:
IFVERBOSE(2) {
PrintUserTime("finished Consensus decoding");
}
- }
-
+ }
+
// n-best MBR decoding
else {
const Moses::TrellisPath &mbrBestHypo = doMBR(nBestList);
@@ -380,7 +378,7 @@ public:
ostringstream out;
manager.CalcNBest(staticData.GetNBestSize(), nBestList,staticData.GetDistinctNBest());
OutputNBest(out, nBestList, staticData.GetOutputFactorOrder(), m_lineNumber,
- staticData.GetReportSegmentation());
+ staticData.GetReportSegmentation());
m_nbestCollector->Write(m_lineNumber, out.str());
}
@@ -390,7 +388,7 @@ public:
ostringstream out;
manager.CalcLatticeSamples(staticData.GetLatticeSamplesSize(), latticeSamples);
OutputNBest(out,latticeSamples, staticData.GetOutputFactorOrder(), m_lineNumber,
- staticData.GetReportSegmentation());
+ staticData.GetReportSegmentation());
m_latticeSamplesCollector->Write(m_lineNumber, out.str());
}
@@ -450,7 +448,7 @@ static void PrintFeatureWeight(const FeatureFunction* ff)
size_t numScoreComps = ff->GetNumScoreComponents();
vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
for (size_t i = 0; i < numScoreComps; ++i) {
- cout << " " << values[i];
+ cout << " " << values[i];
}
cout << endl;
}
@@ -484,13 +482,13 @@ size_t OutputFeatureWeightsForHypergraph(size_t index, const FeatureFunction* ff
vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
if (numScoreComps > 1) {
for (size_t i = 0; i < numScoreComps; ++i) {
- outputSearchGraphStream << ff->GetScoreProducerDescription()
- << i
- << "=" << values[i] << endl;
+ outputSearchGraphStream << ff->GetScoreProducerDescription()
+ << i
+ << "=" << values[i] << endl;
}
} else {
- outputSearchGraphStream << ff->GetScoreProducerDescription()
- << "=" << values[0] << endl;
+ outputSearchGraphStream << ff->GetScoreProducerDescription()
+ << "=" << values[0] << endl;
}
return index+numScoreComps;
} else {
@@ -541,7 +539,7 @@ void OutputFeatureWeightsForHypergraph(std::ostream &outputSearchGraphStream)
int main(int argc, char** argv)
{
try {
-
+
#ifdef HAVE_PROTOBUF
GOOGLE_PROTOBUF_VERIFY_VERSION;
#endif
@@ -601,20 +599,20 @@ int main(int argc, char** argv)
if (staticData.GetOutputSearchGraphHypergraph()) {
ofstream* weightsOut = new std::ofstream;
stringstream weightsFilename;
- if (staticData.GetParam("output-search-graph-hypergraph").size() > 3) {
- weightsFilename << staticData.GetParam("output-search-graph-hypergraph")[3];
+ if (staticData.GetParam("output-search-graph-hypergraph").size() > 3) {
+ weightsFilename << staticData.GetParam("output-search-graph-hypergraph")[3];
} else {
- string nbestFile = staticData.GetNBestFilePath();
- if ( ! nbestFile.empty() && nbestFile!="-" && !boost::starts_with(nbestFile,"/dev/stdout") ) {
- boost::filesystem::path nbestPath(nbestFile);
- weightsFilename << nbestPath.parent_path().filename() << "/weights";
- } else {
- weightsFilename << boost::filesystem::current_path() << "/hypergraph/weights";
- }
+ string nbestFile = staticData.GetNBestFilePath();
+ if ( ! nbestFile.empty() && nbestFile!="-" && !boost::starts_with(nbestFile,"/dev/stdout") ) {
+ boost::filesystem::path nbestPath(nbestFile);
+ weightsFilename << nbestPath.parent_path().filename() << "/weights";
+ } else {
+ weightsFilename << boost::filesystem::current_path() << "/hypergraph/weights";
+ }
}
boost::filesystem::path weightsFilePath(weightsFilename.str());
if ( ! boost::filesystem::exists(weightsFilePath.parent_path()) ) {
- boost::filesystem::create_directory(weightsFilePath.parent_path());
+ boost::filesystem::create_directory(weightsFilePath.parent_path());
}
TRACE_ERR("The weights file is " << weightsFilename.str() << "\n");
weightsOut->open(weightsFilename.str().c_str());
@@ -669,26 +667,26 @@ int main(int argc, char** argv)
if (output1best) {
outputCollector.reset(new OutputCollector());
}
-
+
// initialize stream for word graph (aka: output lattice)
auto_ptr<OutputCollector> wordGraphCollector;
if (staticData.GetOutputWordGraph()) {
wordGraphCollector.reset(new OutputCollector(&(ioWrapper->GetOutputWordGraphStream())));
}
-
+
// initialize stream for search graph
// note: this is essentially the same as above, but in a different format
auto_ptr<OutputCollector> searchGraphCollector;
if (staticData.GetOutputSearchGraph()) {
searchGraphCollector.reset(new OutputCollector(&(ioWrapper->GetOutputSearchGraphStream())));
}
-
+
  // initialize stream for details about the decoder run
auto_ptr<OutputCollector> detailedTranslationCollector;
if (staticData.IsDetailedTranslationReportingEnabled()) {
detailedTranslationCollector.reset(new OutputCollector(&(ioWrapper->GetDetailedTranslationReportingStream())));
}
-
+
  // initialize stream for word alignment between input and output
auto_ptr<OutputCollector> alignmentInfoCollector;
if (!staticData.GetAlignmentOutputFile().empty()) {
@@ -706,11 +704,11 @@ int main(int argc, char** argv)
}
unknownsCollector.reset(new OutputCollector(unknownsStream.get()));
}
-
+
#ifdef WITH_THREADS
ThreadPool pool(staticData.ThreadCount());
#endif
-
+
// main loop over set of input sentences
InputType* source = NULL;
size_t lineCount = staticData.GetStartTranslationId();
@@ -728,21 +726,21 @@ int main(int argc, char** argv)
detailedTranslationCollector.get(),
alignmentInfoCollector.get(),
unknownsCollector.get(),
- staticData.GetOutputSearchGraphSLF(),
- staticData.GetOutputSearchGraphHypergraph());
+ staticData.GetOutputSearchGraphSLF(),
+ staticData.GetOutputSearchGraphHypergraph());
// execute task
#ifdef WITH_THREADS
- pool.Submit(task);
+ pool.Submit(task);
#else
task->Run();
delete task;
#endif
-
+
source = NULL; //make sure it doesn't get deleted
++lineCount;
}
-
- // we are done, finishing up
+
+ // we are done, finishing up
#ifdef WITH_THREADS
pool.Stop(true); //flush remaining jobs
#endif
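
The hunk above wires compression into the hypergraph writer by pushing a gzip or bzip2 filter ahead of the file sink. A minimal self-contained sketch of that Boost.Iostreams pattern, covering only the gzip case, with an illustrative WriteCompressed helper:

#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/filter/gzip.hpp>
#include <boost/iostreams/device/file.hpp>
#include <string>

void WriteCompressed(const std::string &path, const std::string &payload)
{
  boost::iostreams::filtering_ostream out;
  out.push(boost::iostreams::gzip_compressor());  // filters go in first
  out.push(boost::iostreams::file_sink(path));    // the sink completes the chain
  if (out.is_complete() && out.good()) {
    out << payload;
    out.flush();
  }
  out.pop();  // popping the sink flushes remaining data through the filter
}
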
diff --git a/moses-cmd/TranslationAnalysis.cpp b/moses-cmd/TranslationAnalysis.cpp
index 4231001e9..bd7113096 100644
--- a/moses-cmd/TranslationAnalysis.cpp
+++ b/moses-cmd/TranslationAnalysis.cpp
@@ -57,7 +57,7 @@ void PrintTranslationAnalysis(std::ostream &os, const Hypothesis* hypo)
}
}
}
-
+
bool epsilon = false;
if (target == "") {
target="<EPSILON>";
@@ -101,21 +101,21 @@ void PrintTranslationAnalysis(std::ostream &os, const Hypothesis* hypo)
}
os << std::endl << std::endl;
if (doLMStats && lmCalls > 0) {
- std::vector<unsigned int>::iterator acc = lmAcc.begin();
+ std::vector<unsigned int>::iterator acc = lmAcc.begin();
- const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions();
- for (size_t i = 0; i < statefulFFs.size(); ++i) {
- const StatefulFeatureFunction *ff = statefulFFs[i];
- const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff);
+ const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions();
+ for (size_t i = 0; i < statefulFFs.size(); ++i) {
+ const StatefulFeatureFunction *ff = statefulFFs[i];
+ const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff);
- if (lm) {
- char buf[256];
- sprintf(buf, "%.4f", (float)(*acc)/(float)lmCalls);
- os << lm->GetScoreProducerDescription() <<", AVG N-GRAM LENGTH: " << buf << std::endl;
+ if (lm) {
+ char buf[256];
+ sprintf(buf, "%.4f", (float)(*acc)/(float)lmCalls);
+ os << lm->GetScoreProducerDescription() <<", AVG N-GRAM LENGTH: " << buf << std::endl;
- ++acc;
- }
- }
+ ++acc;
+ }
+ }
}
if (droppedWords.size() > 0) {
@@ -125,10 +125,10 @@ void PrintTranslationAnalysis(std::ostream &os, const Hypothesis* hypo)
os << "\tdropped=" << *dwi << std::endl;
}
}
- os << std::endl << "SCORES (UNWEIGHTED/WEIGHTED): ";
+ os << std::endl << "SCORES (UNWEIGHTED/WEIGHTED): ";
os << translationPath.back()->GetScoreBreakdown();
os << " weighted(TODO)";
- os << std::endl;
+ os << std::endl;
}
}
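
The re-indented block above averages one accumulator per language model over a shared call count and prints it to four decimals. A hedged stand-alone sketch of that computation; PrintAvgNgramLengths and the bounded snprintf are illustrative (the original writes with sprintf into a 256-byte buffer):

#include <cstddef>
#include <cstdio>
#include <vector>

void PrintAvgNgramLengths(const std::vector<unsigned int> &lmAcc, unsigned int lmCalls)
{
  if (lmCalls == 0) return;  // guard mirrors the lmCalls > 0 test above
  for (size_t i = 0; i < lmAcc.size(); ++i) {
    char buf[32];
    snprintf(buf, sizeof(buf), "%.4f", (float)lmAcc[i] / (float)lmCalls);
    printf("LM %u, AVG N-GRAM LENGTH: %s\n", (unsigned)i, buf);
  }
}
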
diff --git a/moses/AlignmentInfo.cpp b/moses/AlignmentInfo.cpp
index 97eff59b5..178f3438a 100644
--- a/moses/AlignmentInfo.cpp
+++ b/moses/AlignmentInfo.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -46,17 +46,18 @@ void AlignmentInfo::BuildNonTermIndexMap()
m_nonTermIndexMap.resize(maxIndex+1, NOT_FOUND);
size_t i = 0;
for (p = begin(); p != end(); ++p) {
- if (m_nonTermIndexMap[p->second] != NOT_FOUND) {
- // 1-to-many. Definitely a set of terminals. Don't bother storing 1-to-1 index map
- m_nonTermIndexMap.clear();
- return;
- }
+ if (m_nonTermIndexMap[p->second] != NOT_FOUND) {
+ // 1-to-many. Definitely a set of terminals. Don't bother storing 1-to-1 index map
+ m_nonTermIndexMap.clear();
+ return;
+ }
m_nonTermIndexMap[p->second] = i++;
}
-
+
}
-bool compare_target(const std::pair<size_t,size_t> *a, const std::pair<size_t,size_t> *b) {
+bool compare_target(const std::pair<size_t,size_t> *a, const std::pair<size_t,size_t> *b)
+{
if(a->second < b->second) return true;
if(a->second == b->second) return (a->first < b->first);
return false;
@@ -66,32 +67,30 @@ bool compare_target(const std::pair<size_t,size_t> *a, const std::pair<size_t,si
std::vector< const std::pair<size_t,size_t>* > AlignmentInfo::GetSortedAlignments() const
{
std::vector< const std::pair<size_t,size_t>* > ret;
-
+
CollType::const_iterator iter;
- for (iter = m_collection.begin(); iter != m_collection.end(); ++iter)
- {
+ for (iter = m_collection.begin(); iter != m_collection.end(); ++iter) {
const std::pair<size_t,size_t> &alignPair = *iter;
ret.push_back(&alignPair);
}
-
+
const StaticData &staticData = StaticData::Instance();
WordAlignmentSort wordAlignmentSort = staticData.GetWordAlignmentSort();
-
- switch (wordAlignmentSort)
- {
- case NoSort:
- break;
-
- case TargetOrder:
- std::sort(ret.begin(), ret.end(), compare_target);
- break;
-
- default:
- CHECK(false);
+
+ switch (wordAlignmentSort) {
+ case NoSort:
+ break;
+
+ case TargetOrder:
+ std::sort(ret.begin(), ret.end(), compare_target);
+ break;
+
+ default:
+ CHECK(false);
}
-
+
return ret;
-
+
}
std::vector<size_t> AlignmentInfo::GetSourceIndex2PosMap() const
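
compare_target above defines a strict weak ordering over alignment pairs: target index first, source index as tie-breaker. A self-contained sketch of the TargetOrder sort, with AlignPair standing in for std::pair<size_t,size_t>:

#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

typedef std::pair<size_t, size_t> AlignPair;  // (source position, target position)

static bool CompareTarget(const AlignPair *a, const AlignPair *b)
{
  if (a->second < b->second) return true;                  // target order first
  if (a->second == b->second) return a->first < b->first;  // then source order
  return false;
}

void SortByTargetOrder(std::vector<const AlignPair*> &alignments)
{
  std::sort(alignments.begin(), alignments.end(), CompareTarget);
}
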
diff --git a/moses/AlignmentInfo.h b/moses/AlignmentInfo.h
index db92791aa..76d4d918a 100644
--- a/moses/AlignmentInfo.h
+++ b/moses/AlignmentInfo.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -42,16 +42,20 @@ class AlignmentInfo
friend struct AlignmentInfoHasher;
friend class AlignmentInfoCollection;
- public:
+public:
typedef std::set<std::pair<size_t,size_t> > CollType;
typedef std::vector<size_t> NonTermIndexMap;
typedef CollType::const_iterator const_iterator;
- const_iterator begin() const { return m_collection.begin(); }
- const_iterator end() const { return m_collection.end(); }
+ const_iterator begin() const {
+ return m_collection.begin();
+ }
+ const_iterator end() const {
+ return m_collection.end();
+ }
void Add(size_t sourcePos, size_t targetPos) {
- m_collection.insert(std::pair<size_t, size_t>(sourcePos, targetPos));
+ m_collection.insert(std::pair<size_t, size_t>(sourcePos, targetPos));
}
/** Provides a map from target-side to source-side non-terminal indices.
* The target-side index should be the rule symbol index (COUNTING terminals).
@@ -64,20 +68,21 @@ class AlignmentInfo
const CollType &GetAlignments() const {
return m_collection;
}
-
- size_t GetSize() const { return m_collection.size(); }
+
+ size_t GetSize() const {
+ return m_collection.size();
+ }
std::vector< const std::pair<size_t,size_t>* > GetSortedAlignments() const;
std::vector<size_t> GetSourceIndex2PosMap() const;
- bool operator==(const AlignmentInfo& rhs) const
- {
+ bool operator==(const AlignmentInfo& rhs) const {
return m_collection == rhs.m_collection &&
m_nonTermIndexMap == rhs.m_nonTermIndexMap;
}
-
- private:
+
+private:
//! AlignmentInfo objects should only be created by an AlignmentInfoCollection
explicit AlignmentInfo(const std::set<std::pair<size_t,size_t> > &pairs);
@@ -90,25 +95,21 @@ class AlignmentInfo
/** Define an arbitrary strict weak ordering between AlignmentInfo objects
* for use by AlignmentInfoCollection.
*/
-struct AlignmentInfoOrderer
-{
+struct AlignmentInfoOrderer {
bool operator()(const AlignmentInfo &a, const AlignmentInfo &b) const {
- if (a.m_collection == b.m_collection) {
- return a.m_nonTermIndexMap < b.m_nonTermIndexMap;
- }
- else {
- return a.m_collection < b.m_collection;
- }
+ if (a.m_collection == b.m_collection) {
+ return a.m_nonTermIndexMap < b.m_nonTermIndexMap;
+ } else {
+ return a.m_collection < b.m_collection;
+ }
}
};
-/**
+/**
 * Hashing functor
**/
-struct AlignmentInfoHasher
-{
- size_t operator()(const AlignmentInfo& a) const
- {
+struct AlignmentInfoHasher {
+ size_t operator()(const AlignmentInfo& a) const {
size_t seed = 0;
boost::hash_combine(seed,a.m_collection);
boost::hash_combine(seed,a.m_nonTermIndexMap);
@@ -117,7 +118,8 @@ struct AlignmentInfoHasher
};
-inline size_t hash_value(const AlignmentInfo& a) {
+inline size_t hash_value(const AlignmentInfo& a)
+{
static AlignmentInfoHasher hasher;
return hasher(a);
}
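
The hasher above follows the usual Boost idiom: fold each member into a seed with boost::hash_combine and expose a free hash_value() so boost::hash finds it by argument-dependent lookup. A minimal sketch with an illustrative Example struct:

#include <boost/functional/hash.hpp>
#include <cstddef>
#include <set>
#include <utility>
#include <vector>

struct Example {
  std::set<std::pair<size_t, size_t> > pairs;
  std::vector<size_t> indexMap;
};

inline size_t hash_value(const Example &e)
{
  size_t seed = 0;
  boost::hash_combine(seed, e.pairs);     // fold each member into the seed
  boost::hash_combine(seed, e.indexMap);
  return seed;
}
// boost::hash<Example> now works, because hash_value() is found by ADL.
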
diff --git a/moses/AlignmentInfoCollection.cpp b/moses/AlignmentInfoCollection.cpp
index 53b83d8cd..ef6e62eb3 100644
--- a/moses/AlignmentInfoCollection.cpp
+++ b/moses/AlignmentInfoCollection.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -39,7 +39,7 @@ const AlignmentInfo &AlignmentInfoCollection::GetEmptyAlignmentInfo() const
}
const AlignmentInfo *AlignmentInfoCollection::Add(
- const std::set<std::pair<size_t,size_t> > &pairs)
+ const std::set<std::pair<size_t,size_t> > &pairs)
{
AlignmentInfo pairsAlignmentInfo(pairs);
#ifdef WITH_THREADS
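
AlignmentInfoCollection::Add (continued past this excerpt) interns alignment sets: identical alignments share one canonical object, guarded by the reader-writer lock declared in the header. A hedged sketch of that pooling pattern, using a plain unique lock where the real code may take a shared lock first:

#include <boost/thread/locks.hpp>
#include <boost/thread/shared_mutex.hpp>
#include <cstddef>
#include <set>
#include <utility>

typedef std::set<std::pair<size_t, size_t> > PairSet;

class InternPool
{
public:
  // Returns a pointer to the canonical copy; identical inputs share storage.
  const PairSet *Add(const PairSet &pairs) {
    boost::unique_lock<boost::shared_mutex> lock(m_mutex);
    return &*m_pool.insert(pairs).first;  // set elements are never relocated
  }
private:
  boost::shared_mutex m_mutex;
  std::set<PairSet> m_pool;
};
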
diff --git a/moses/AlignmentInfoCollection.h b/moses/AlignmentInfoCollection.h
index 6185b32a9..37d717b0f 100644
--- a/moses/AlignmentInfoCollection.h
+++ b/moses/AlignmentInfoCollection.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -36,8 +36,10 @@ namespace Moses
*/
class AlignmentInfoCollection
{
- public:
- static AlignmentInfoCollection &Instance() { return s_instance; }
+public:
+ static AlignmentInfoCollection &Instance() {
+ return s_instance;
+ }
/** Returns a pointer to an AlignmentInfo object with the same source-target
* alignment pairs as given in the argument. If the collection already
@@ -49,7 +51,7 @@ class AlignmentInfoCollection
//! Returns a pointer to an empty AlignmentInfo object.
const AlignmentInfo &GetEmptyAlignmentInfo() const;
- private:
+private:
typedef std::set<AlignmentInfo, AlignmentInfoOrderer> AlignmentInfoSet;
//! Only a single static variable should be created.
@@ -62,7 +64,7 @@ class AlignmentInfoCollection
//reader-writer lock
mutable boost::shared_mutex m_accessLock;
#endif
-
+
AlignmentInfoSet m_collection;
const AlignmentInfo *m_emptyAlignmentInfo;
};
diff --git a/moses/AlignmentInfoTest.cpp b/moses/AlignmentInfoTest.cpp
index 48c88db65..26127f3cf 100644
--- a/moses/AlignmentInfoTest.cpp
+++ b/moses/AlignmentInfoTest.cpp
@@ -35,8 +35,7 @@ struct AlignmentInfoFixture {
const AlignmentInfo* ai2;
const AlignmentInfo* ai3;
- AlignmentInfoFixture()
- {
+ AlignmentInfoFixture() {
AlignmentInfoCollection& collection = AlignmentInfoCollection::Instance();
IndexSet aligns1,aligns2,aligns3;
aligns1.insert(IndexPair(1,1));
diff --git a/moses/BitmapContainer.cpp b/moses/BitmapContainer.cpp
index 7e8d470ee..64dd9081b 100644
--- a/moses/BitmapContainer.cpp
+++ b/moses/BitmapContainer.cpp
@@ -275,11 +275,11 @@ BitmapContainer::~BitmapContainer()
// As we have created the square position objects we clean up now.
while (!m_queue.empty()) {
- HypothesisQueueItem *item = m_queue.top();
- m_queue.pop();
+ HypothesisQueueItem *item = m_queue.top();
+ m_queue.pop();
- FREEHYPO( item->GetHypothesis() );
- delete item;
+ FREEHYPO( item->GetHypothesis() );
+ delete item;
}
// Delete all edges.
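
The destructor above drains the pending queue and frees what each item owns before the container goes away. A trivial sketch of the same drain loop, with a plain std::queue and an illustrative Item standing in for the hypothesis queue and FREEHYPO:

#include <queue>

struct Item {
  int *payload;  // stands in for the hypothesis owned by the queue item
};

void DrainAndFree(std::queue<Item> &q)
{
  while (!q.empty()) {
    Item item = q.front();
    q.pop();
    delete item.payload;  // free owned resources before dropping the item
  }
}
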
diff --git a/moses/ChartCell.cpp b/moses/ChartCell.cpp
index fd163450e..b57a4ab36 100644
--- a/moses/ChartCell.cpp
+++ b/moses/ChartCell.cpp
@@ -45,17 +45,18 @@ ChartCellBase::~ChartCellBase() {}
/** Constructor
* \param startPos endPos range of this cell
- * \param manager pointer back to the manager
+ * \param manager pointer back to the manager
*/
ChartCell::ChartCell(size_t startPos, size_t endPos, ChartManager &manager) :
- ChartCellBase(startPos, endPos), m_manager(manager) {
+ ChartCellBase(startPos, endPos), m_manager(manager)
+{
const StaticData &staticData = StaticData::Instance();
m_nBestIsEnabled = staticData.IsNBestEnabled();
}
ChartCell::~ChartCell() {}
-/** Add the given hypothesis to the cell.
+/** Add the given hypothesis to the cell.
 * Returns true if added, false if not, e.g. because it already exists in the collection or its score falls below the threshold.
 * This function just calls the corresponding AddHypothesis() in ChartHypothesisCollection
* \param hypo Hypothesis to be added
@@ -98,8 +99,7 @@ void ChartCell::ProcessSentence(const ChartTranslationOptionList &transOptList
// pluck things out of queue and add to hypo collection
const size_t popLimit = staticData.GetCubePruningPopLimit();
- for (size_t numPops = 0; numPops < popLimit && !queue.IsEmpty(); ++numPops)
- {
+ for (size_t numPops = 0; numPops < popLimit && !queue.IsEmpty(); ++numPops) {
ChartHypothesis *hypo = queue.Pop();
AddHypothesis(hypo);
}
@@ -179,15 +179,15 @@ size_t ChartCell::GetSize() const
const HypoList *ChartCell::GetAllSortedHypotheses() const
{
- HypoList *ret = new HypoList();
+ HypoList *ret = new HypoList();
- MapType::const_iterator iter;
- for (iter = m_hypoColl.begin(); iter != m_hypoColl.end(); ++iter) {
- const ChartHypothesisCollection &coll = iter->second;
- const HypoList &list = coll.GetSortedHypotheses();
+ MapType::const_iterator iter;
+ for (iter = m_hypoColl.begin(); iter != m_hypoColl.end(); ++iter) {
+ const ChartHypothesisCollection &coll = iter->second;
+ const HypoList &list = coll.GetSortedHypotheses();
std::copy(list.begin(), list.end(), std::inserter(*ret, ret->end()));
- }
- return ret;
+ }
+ return ret;
}
//! call GetSearchGraph() for each hypo collection
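
ProcessSentence above pops at most popLimit items best-first from the cube-pruning queue. The shape of that loop, sketched with a std::priority_queue of ints in place of the hypothesis queue:

#include <cstddef>
#include <queue>
#include <vector>

std::vector<int> PopBest(std::priority_queue<int> &queue, size_t popLimit)
{
  std::vector<int> kept;
  for (size_t numPops = 0; numPops < popLimit && !queue.empty(); ++numPops) {
    kept.push_back(queue.top());  // best-first: top() is the highest-scoring item
    queue.pop();
  }
  return kept;
}
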
diff --git a/moses/ChartCell.h b/moses/ChartCell.h
index 14ac8e3b4..1fed695ac 100644
--- a/moses/ChartCell.h
+++ b/moses/ChartCell.h
@@ -44,35 +44,43 @@ class ChartTranslationOptionList;
class ChartCellCollection;
class ChartManager;
-class ChartCellBase {
- public:
- ChartCellBase(size_t startPos, size_t endPos);
+class ChartCellBase
+{
+public:
+ ChartCellBase(size_t startPos, size_t endPos);
- virtual ~ChartCellBase();
+ virtual ~ChartCellBase();
- const ChartCellLabelSet &GetTargetLabelSet() const { return m_targetLabelSet; }
+ const ChartCellLabelSet &GetTargetLabelSet() const {
+ return m_targetLabelSet;
+ }
- ChartCellLabelSet &MutableTargetLabelSet() { return m_targetLabelSet; }
+ ChartCellLabelSet &MutableTargetLabelSet() {
+ return m_targetLabelSet;
+ }
- const WordsRange &GetCoverage() const { return m_coverage; }
+ const WordsRange &GetCoverage() const {
+ return m_coverage;
+ }
- protected:
- const WordsRange m_coverage;
- ChartCellLabelSet m_targetLabelSet;
+protected:
+ const WordsRange m_coverage;
+ ChartCellLabelSet m_targetLabelSet;
};
/** 1 cell in chart decoder.
 * Doesn't directly hold hypotheses. Each cell contains a map of ChartHypothesisCollection objects with different constituent labels
*/
-class ChartCell : public ChartCellBase {
+class ChartCell : public ChartCellBase
+{
friend std::ostream& operator<<(std::ostream&, const ChartCell&);
public:
#if defined(BOOST_VERSION) && (BOOST_VERSION >= 104200)
typedef boost::unordered_map<Word,
- ChartHypothesisCollection,
- NonTerminalHasher,
- NonTerminalEqualityPred
- > MapType;
+ ChartHypothesisCollection,
+ NonTerminalHasher,
+ NonTerminalEqualityPred
+ > MapType;
#else
typedef std::map<Word, ChartHypothesisCollection> MapType;
#endif
@@ -91,8 +99,7 @@ public:
,const ChartCellCollection &allChartCells);
//! Get all hypotheses in the cell that have the specified constituent label
- const HypoList *GetSortedHypotheses(const Word &constituentLabel) const
- {
+ const HypoList *GetSortedHypotheses(const Word &constituentLabel) const {
MapType::const_iterator p = m_hypoColl.find(constituentLabel);
return (p == m_hypoColl.end()) ? NULL : &(p->second.GetSortedHypotheses());
}
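
MapType above selects boost::unordered_map with a custom hasher and equality predicate when Boost is new enough, falling back to std::map otherwise. A self-contained sketch of the custom-hash form, with Key/KeyHasher/KeyEqual as illustrative stand-ins for Word and NonTerminalHasher:

#include <boost/functional/hash.hpp>
#include <boost/unordered_map.hpp>
#include <cstddef>
#include <string>

struct Key {
  std::string label;
};

struct KeyHasher {
  size_t operator()(const Key &k) const {
    return boost::hash<std::string>()(k.label);
  }
};

struct KeyEqual {
  bool operator()(const Key &a, const Key &b) const {
    return a.label == b.label;
  }
};

typedef boost::unordered_map<Key, int, KeyHasher, KeyEqual> LabelMap;
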
diff --git a/moses/ChartCellCollection.cpp b/moses/ChartCellCollection.cpp
index a34687f59..46392261d 100644
--- a/moses/ChartCellCollection.cpp
+++ b/moses/ChartCellCollection.cpp
@@ -23,24 +23,27 @@
#include "InputType.h"
#include "WordsRange.h"
-namespace Moses {
+namespace Moses
+{
-ChartCellCollectionBase::~ChartCellCollectionBase() {
+ChartCellCollectionBase::~ChartCellCollectionBase()
+{
m_source.clear();
- for (std::vector<std::vector<ChartCellBase*> >::iterator i = m_cells.begin(); i != m_cells.end(); ++i)
+ for (std::vector<std::vector<ChartCellBase*> >::iterator i = m_cells.begin(); i != m_cells.end(); ++i)
RemoveAllInColl(*i);
}
-class CubeCellFactory {
- public:
- explicit CubeCellFactory(ChartManager &manager) : m_manager(manager) {}
+class CubeCellFactory
+{
+public:
+ explicit CubeCellFactory(ChartManager &manager) : m_manager(manager) {}
- ChartCell *operator()(size_t start, size_t end) const {
- return new ChartCell(start, end, m_manager);
- }
+ ChartCell *operator()(size_t start, size_t end) const {
+ return new ChartCell(start, end, m_manager);
+ }
- private:
- ChartManager &m_manager;
+private:
+ ChartManager &m_manager;
};
/** Constructor
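
CubeCellFactory above is a small factory functor: it captures the manager and lets ChartCellCollectionBase build cells per span without knowing the concrete type. A sketch of the pattern with illustrative Cell and CellFactory types:

#include <cstddef>
#include <vector>

struct Cell {
  size_t start, end;
};

class CellFactory
{
public:
  Cell *operator()(size_t start, size_t end) const {
    Cell *c = new Cell;
    c->start = start;
    c->end = end;
    return c;
  }
};

template <class Factory>
std::vector<Cell*> BuildRow(size_t size, const Factory &factory)
{
  std::vector<Cell*> row;
  for (size_t end = 0; end < size; ++end)
    row.push_back(factory(0, end));  // one cell per span starting at 0
  return row;
}
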
diff --git a/moses/ChartCellCollection.h b/moses/ChartCellCollection.h
index 7532503d7..d0423b0b2 100644
--- a/moses/ChartCellCollection.h
+++ b/moses/ChartCellCollection.h
@@ -31,57 +31,59 @@ namespace Moses
class InputType;
class ChartManager;
-class ChartCellCollectionBase {
- public:
- template <class Factory> ChartCellCollectionBase(const InputType &input, const Factory &factory) :
- m_cells(input.GetSize()) {
- size_t size = input.GetSize();
- for (size_t startPos = 0; startPos < size; ++startPos) {
- std::vector<ChartCellBase*> &inner = m_cells[startPos];
- inner.reserve(size - startPos);
- for (size_t endPos = startPos; endPos < size; ++endPos) {
- inner.push_back(factory(startPos, endPos));
- }
- /* Hack: ChartCellLabel shouldn't need to know its span, but the parser
- * gets it from there :-(. The span is actually stored as a reference,
- * which needs to point somewhere, so I have it refer to the ChartCell.
- */
- m_source.push_back(new ChartCellLabel(inner[0]->GetCoverage(), input.GetWord(startPos)));
+class ChartCellCollectionBase
+{
+public:
+ template <class Factory> ChartCellCollectionBase(const InputType &input, const Factory &factory) :
+ m_cells(input.GetSize()) {
+ size_t size = input.GetSize();
+ for (size_t startPos = 0; startPos < size; ++startPos) {
+ std::vector<ChartCellBase*> &inner = m_cells[startPos];
+ inner.reserve(size - startPos);
+ for (size_t endPos = startPos; endPos < size; ++endPos) {
+ inner.push_back(factory(startPos, endPos));
}
+ /* Hack: ChartCellLabel shouldn't need to know its span, but the parser
+ * gets it from there :-(. The span is actually stored as a reference,
+ * which needs to point somewhere, so I have it refer to the ChartCell.
+ */
+ m_source.push_back(new ChartCellLabel(inner[0]->GetCoverage(), input.GetWord(startPos)));
}
+ }
- virtual ~ChartCellCollectionBase();
+ virtual ~ChartCellCollectionBase();
- const ChartCellBase &GetBase(const WordsRange &coverage) const {
- return *m_cells[coverage.GetStartPos()][coverage.GetEndPos() - coverage.GetStartPos()];
- }
+ const ChartCellBase &GetBase(const WordsRange &coverage) const {
+ return *m_cells[coverage.GetStartPos()][coverage.GetEndPos() - coverage.GetStartPos()];
+ }
- ChartCellBase &MutableBase(const WordsRange &coverage) {
- return *m_cells[coverage.GetStartPos()][coverage.GetEndPos() - coverage.GetStartPos()];
- }
+ ChartCellBase &MutableBase(const WordsRange &coverage) {
+ return *m_cells[coverage.GetStartPos()][coverage.GetEndPos() - coverage.GetStartPos()];
+ }
- const ChartCellLabel &GetSourceWordLabel(size_t at) const {
- return m_source[at];
- }
+ const ChartCellLabel &GetSourceWordLabel(size_t at) const {
+ return m_source[at];
+ }
- private:
- std::vector<std::vector<ChartCellBase*> > m_cells;
+private:
+ std::vector<std::vector<ChartCellBase*> > m_cells;
- boost::ptr_vector<ChartCellLabel> m_source;
+ boost::ptr_vector<ChartCellLabel> m_source;
};
/** Hold all the chart cells for 1 input sentence. A variable of this type is held by the ChartManager
*/
-class ChartCellCollection : public ChartCellCollectionBase {
- public:
- ChartCellCollection(const InputType &input, ChartManager &manager);
+class ChartCellCollection : public ChartCellCollectionBase
+{
+public:
+ ChartCellCollection(const InputType &input, ChartManager &manager);
//! get a chart cell for a particular range
ChartCell &Get(const WordsRange &coverage) {
return static_cast<ChartCell&>(MutableBase(coverage));
}
-
+
//! get a chart cell for a particular range
const ChartCell &Get(const WordsRange &coverage) const {
return static_cast<const ChartCell&>(GetBase(coverage));
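
GetBase and MutableBase above address a triangular array: row = start position, column = span length minus one, so no storage is wasted on invalid spans. A compact sketch, assuming start <= end < size:

#include <cstddef>
#include <vector>

struct Chart {
  std::vector<std::vector<int> > cells;  // cells[start] holds size-start spans

  explicit Chart(size_t size) : cells(size) {
    for (size_t start = 0; start < size; ++start)
      cells[start].resize(size - start);
  }
  int &At(size_t start, size_t end) {  // caller guarantees start <= end < size
    return cells[start][end - start];
  }
};
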
diff --git a/moses/ChartCellLabel.h b/moses/ChartCellLabel.h
index 218a512c0..ad6e3565d 100644
--- a/moses/ChartCellLabel.h
+++ b/moses/ChartCellLabel.h
@@ -23,7 +23,10 @@
#include "Word.h"
#include "WordsRange.h"
-namespace search { class Vertex; }
+namespace search
+{
+class Vertex;
+}
namespace Moses
{
@@ -31,17 +34,17 @@ namespace Moses
class Word;
/** Contains a range, word (non-terms?) and a vector of hypotheses.
- * @todo This is probably incompatible with lattice decoding when the word that spans
+ * @todo This is probably incompatible with lattice decoding when the word that spans
* a position (or positions) can vary.
* @todo is this to hold sorted hypotheses that are in the queue for creating the next hypos?
*/
class ChartCellLabel
{
- public:
+public:
union Stack {
const HypoList *cube; // cube pruning
- search::Vertex *incr; // incremental search after filling.
- void *incr_generator; // incremental search during filling.
+ search::Vertex *incr; // incremental search after filling.
+ void *incr_generator; // incremental search during filling.
};
@@ -52,13 +55,20 @@ class ChartCellLabel
, m_stack(stack)
{}
- const WordsRange &GetCoverage() const { return m_coverage; }
- const Word &GetLabel() const { return m_label; }
- Stack GetStack() const { return m_stack; }
- Stack &MutableStack() { return m_stack; }
+ const WordsRange &GetCoverage() const {
+ return m_coverage;
+ }
+ const Word &GetLabel() const {
+ return m_label;
+ }
+ Stack GetStack() const {
+ return m_stack;
+ }
+ Stack &MutableStack() {
+ return m_stack;
+ }
- bool operator<(const ChartCellLabel &other) const
- {
+ bool operator<(const ChartCellLabel &other) const {
// m_coverage and m_label uniquely identify a ChartCellLabel, so don't
// need to compare m_stack.
if (m_coverage == other.m_coverage) {
@@ -67,7 +77,7 @@ class ChartCellLabel
return m_coverage < other.m_coverage;
}
- private:
+private:
const WordsRange &m_coverage;
const Word &m_label;
Stack m_stack;
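
ChartCellLabel::Stack above is an untagged union: one slot reused per search mode, with the surrounding code responsible for knowing which member is live. A stripped-down sketch, modeling HypoList as a vector of ints:

#include <vector>

typedef std::vector<int> HypoList;  // stands in for the decoder's HypoList

union Stack {
  const HypoList *cube;  // cube pruning
  void *incr_generator;  // incremental search during filling
};

// The caller sets exactly one member and must read back only that member:
//   Stack s; s.cube = &hypos; const HypoList *h = s.cube;
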
diff --git a/moses/ChartCellLabelSet.h b/moses/ChartCellLabelSet.h
index 5ea192e51..68c8b4263 100644
--- a/moses/ChartCellLabelSet.h
+++ b/moses/ChartCellLabelSet.h
@@ -35,46 +35,55 @@ class ChartHypothesisCollection;
*/
class ChartCellLabelSet
{
- private:
+private:
#if defined(BOOST_VERSION) && (BOOST_VERSION >= 104200)
typedef boost::unordered_map<Word, ChartCellLabel,
- NonTerminalHasher, NonTerminalEqualityPred
- > MapType;
+ NonTerminalHasher, NonTerminalEqualityPred
+ > MapType;
#else
typedef std::map<Word, ChartCellLabel> MapType;
#endif
- public:
+public:
typedef MapType::const_iterator const_iterator;
typedef MapType::iterator iterator;
ChartCellLabelSet(const WordsRange &coverage) : m_coverage(coverage) {}
- const_iterator begin() const { return m_map.begin(); }
- const_iterator end() const { return m_map.end(); }
-
- iterator mutable_begin() { return m_map.begin(); }
- iterator mutable_end() { return m_map.end(); }
+ const_iterator begin() const {
+ return m_map.begin();
+ }
+ const_iterator end() const {
+ return m_map.end();
+ }
- void AddWord(const Word &w)
- {
+ iterator mutable_begin() {
+ return m_map.begin();
+ }
+ iterator mutable_end() {
+ return m_map.end();
+ }
+
+ void AddWord(const Word &w) {
m_map.insert(std::make_pair(w, ChartCellLabel(m_coverage, w)));
}
- // Stack is a HypoList or whatever the search algorithm uses.
- void AddConstituent(const Word &w, const HypoList *stack)
- {
+ // Stack is a HypoList or whatever the search algorithm uses.
+ void AddConstituent(const Word &w, const HypoList *stack) {
ChartCellLabel::Stack s;
s.cube = stack;
m_map.insert(std::make_pair(w, ChartCellLabel(m_coverage, w, s)));
}
- bool Empty() const { return m_map.empty(); }
+ bool Empty() const {
+ return m_map.empty();
+ }
- size_t GetSize() const { return m_map.size(); }
+ size_t GetSize() const {
+ return m_map.size();
+ }
- const ChartCellLabel *Find(const Word &w) const
- {
+ const ChartCellLabel *Find(const Word &w) const {
MapType::const_iterator p = m_map.find(w);
return p == m_map.end() ? 0 : &(p->second);
}
@@ -83,7 +92,7 @@ class ChartCellLabelSet
return m_map.insert(std::make_pair(w, ChartCellLabel(m_coverage, w))).first->second.MutableStack();
}
- private:
+private:
const WordsRange &m_coverage;
MapType m_map;
};
diff --git a/moses/ChartHypothesis.cpp b/moses/ChartHypothesis.cpp
index c7c1047f1..ce5a318ac 100644
--- a/moses/ChartHypothesis.cpp
+++ b/moses/ChartHypothesis.cpp
@@ -39,7 +39,7 @@ namespace Moses
ObjectPool<ChartHypothesis> ChartHypothesis::s_objectPool("ChartHypothesis", 300000);
#endif
-/** Create a hypothesis from a rule
+/** Create a hypothesis from a rule
* \param transOpt wrapper around the rule
* \param item @todo dunno
* \param manager reference back to manager
@@ -59,15 +59,14 @@ ChartHypothesis::ChartHypothesis(const ChartTranslationOptions &transOpt,
const std::vector<HypothesisDimension> &childEntries = item.GetHypothesisDimensions();
m_prevHypos.reserve(childEntries.size());
std::vector<HypothesisDimension>::const_iterator iter;
- for (iter = childEntries.begin(); iter != childEntries.end(); ++iter)
- {
+ for (iter = childEntries.begin(); iter != childEntries.end(); ++iter) {
m_prevHypos.push_back(iter->GetHypothesis());
}
}
ChartHypothesis::~ChartHypothesis()
{
- // delete feature function states
+ // delete feature function states
for (unsigned i = 0; i < m_ffStates.size(); ++i) {
delete m_ffStates[i];
}
@@ -98,8 +97,7 @@ void ChartHypothesis::CreateOutputPhrase(Phrase &outPhrase) const
size_t nonTermInd = GetCurrTargetPhrase().GetAlignNonTerm().GetNonTermIndexMap()[pos];
const ChartHypothesis *prevHypo = m_prevHypos[nonTermInd];
prevHypo->CreateOutputPhrase(outPhrase);
- }
- else {
+ } else {
outPhrase.AddWord(word);
}
}
@@ -124,17 +122,16 @@ Phrase ChartHypothesis::GetOutputPhrase() const
*/
int ChartHypothesis::RecombineCompare(const ChartHypothesis &compare) const
{
- int comp = 0;
+ int comp = 0;
- for (unsigned i = 0; i < m_ffStates.size(); ++i)
- {
- if (m_ffStates[i] == NULL || compare.m_ffStates[i] == NULL)
+ for (unsigned i = 0; i < m_ffStates.size(); ++i) {
+ if (m_ffStates[i] == NULL || compare.m_ffStates[i] == NULL)
comp = m_ffStates[i] - compare.m_ffStates[i];
- else
+ else
comp = m_ffStates[i]->Compare(*compare.m_ffStates[i]);
- if (comp != 0)
- return comp;
+ if (comp != 0)
+ return comp;
}
return 0;
@@ -161,16 +158,16 @@ void ChartHypothesis::CalcScore()
//Add pre-computed features
m_manager.InsertPreCalculatedScores(GetCurrTargetPhrase(), &m_scoreBreakdown);
- // compute values of stateless feature functions that were not
+ // compute values of stateless feature functions that were not
// cached in the translation option-- there is no principled distinction
const std::vector<const StatelessFeatureFunction*>& sfs =
- StatelessFeatureFunction::GetStatelessFeatureFunctions();
+ StatelessFeatureFunction::GetStatelessFeatureFunctions();
for (unsigned i = 0; i < sfs.size(); ++i) {
sfs[i]->EvaluateChart(ChartBasedFeatureContext(this),&m_scoreBreakdown);
}
const std::vector<const StatefulFeatureFunction*>& ffs =
- StatefulFeatureFunction::GetStatefulFeatureFunctions();
+ StatefulFeatureFunction::GetStatefulFeatureFunctions();
for (unsigned i = 0; i < ffs.size(); ++i)
m_ffStates[i] = ffs[i]->EvaluateChart(*this,i,&m_scoreBreakdown);
@@ -262,13 +259,12 @@ std::ostream& operator<<(std::ostream& out, const ChartHypothesis& hypo)
{
out << hypo.GetId();
-
- // recombination
- if (hypo.GetWinningHypothesis() != NULL &&
- hypo.GetWinningHypothesis() != &hypo)
- {
- out << "->" << hypo.GetWinningHypothesis()->GetId();
- }
+
+ // recombination
+ if (hypo.GetWinningHypothesis() != NULL &&
+ hypo.GetWinningHypothesis() != &hypo) {
+ out << "->" << hypo.GetWinningHypothesis()->GetId();
+ }
if (StaticData::Instance().GetIncludeLHSInSearchGraph()) {
out << " " << hypo.GetTargetLHS() << "=>";
diff --git a/moses/ChartHypothesis.h b/moses/ChartHypothesis.h
index 9dc1cba92..61c2faae1 100644
--- a/moses/ChartHypothesis.h
+++ b/moses/ChartHypothesis.h
@@ -52,7 +52,7 @@ protected:
const TargetPhrase &m_targetPhrase;
WordsRange m_currSourceWordsRange;
- std::vector<const FFState*> m_ffStates; /*! stateful feature function states */
+ std::vector<const FFState*> m_ffStates; /*! stateful feature function states */
ScoreComponentCollection m_scoreBreakdown /*! detailed score break-down by components (for instance language model, word penalty, etc) */
,m_lmNGram
,m_lmPrefix;
@@ -68,8 +68,8 @@ protected:
unsigned m_id; /* pkoehn wants to log the order in which hypotheses were generated */
//! not implemented
- ChartHypothesis();
-
+ ChartHypothesis();
+
//! not implemented
ChartHypothesis(const ChartHypothesis &copy);
@@ -96,35 +96,39 @@ public:
~ChartHypothesis();
- unsigned GetId() const { return m_id; }
+ unsigned GetId() const {
+ return m_id;
+ }
//! Get the rule that created this hypothesis
const TargetPhrase &GetCurrTargetPhrase()const {
return m_targetPhrase;
}
-
+
//! the source range that this hypothesis spans
const WordsRange &GetCurrSourceRange()const {
return m_currSourceWordsRange;
}
-
+
//! the arc list when creating n-best lists
inline const ChartArcList* GetArcList() const {
return m_arcList;
}
-
+
//! the feature function states for a particular feature \param featureID
- inline const FFState* GetFFState( size_t featureID ) const {
- return m_ffStates[ featureID ];
- }
-
+ inline const FFState* GetFFState( size_t featureID ) const {
+ return m_ffStates[ featureID ];
+ }
+
//! reference back to the manager
- inline const ChartManager& GetManager() const { return m_manager; }
+ inline const ChartManager& GetManager() const {
+ return m_manager;
+ }
void CreateOutputPhrase(Phrase &outPhrase) const;
Phrase GetOutputPhrase() const;
- int RecombineCompare(const ChartHypothesis &compare) const;
+ int RecombineCompare(const ChartHypothesis &compare) const;
void CalcScore();
@@ -133,30 +137,34 @@ public:
void SetWinningHypo(const ChartHypothesis *hypo);
//! get the unweighted score for each feature function
- const ScoreComponentCollection &GetScoreBreakdown() const
- { return m_scoreBreakdown; }
-
+ const ScoreComponentCollection &GetScoreBreakdown() const {
+ return m_scoreBreakdown;
+ }
+
//! Get the weighted total score
- float GetTotalScore() const
- { return m_totalScore; }
+ float GetTotalScore() const {
+ return m_totalScore;
+ }
- //! vector of previous hypotheses this hypo is built on
- const std::vector<const ChartHypothesis*> &GetPrevHypos() const
- { return m_prevHypos; }
+ //! vector of previous hypotheses this hypo is built on
+ const std::vector<const ChartHypothesis*> &GetPrevHypos() const {
+ return m_prevHypos;
+ }
  //! get a particular previous hypo
- const ChartHypothesis* GetPrevHypo(size_t pos) const {
- return m_prevHypos[pos];
- }
-
+ const ChartHypothesis* GetPrevHypo(size_t pos) const {
+ return m_prevHypos[pos];
+ }
+
//! get the constituency label that covers this hypo
const Word &GetTargetLHS() const {
return GetCurrTargetPhrase().GetTargetLHS();
}
  //! get the best hypo in the arc list when doing n-best list creation. It's either this hypothesis, or the best hypo in the arc list if this hypo has been recombined
- const ChartHypothesis* GetWinningHypothesis() const
- { return m_winningHypo; }
+ const ChartHypothesis* GetWinningHypothesis() const {
+ return m_winningHypo;
+ }
TO_STRING();
diff --git a/moses/ChartHypothesisCollection.cpp b/moses/ChartHypothesisCollection.cpp
index 752bb7f6c..3b80f68dc 100644
--- a/moses/ChartHypothesisCollection.cpp
+++ b/moses/ChartHypothesisCollection.cpp
@@ -51,7 +51,7 @@ ChartHypothesisCollection::~ChartHypothesisCollection()
//RemoveAllInColl(m_hypos);
}
-/** public function to add hypothesis to this collection.
+/** public function to add hypothesis to this collection.
* Returns false if equiv hypo exists in collection, otherwise returns true.
 * Takes care of updating the arc list for n-best list creation.
 * Will delete hypo if it is rejected - once this function is called, don't delete the hypothesis yourself.
@@ -108,8 +108,7 @@ bool ChartHypothesisCollection::AddHypothesis(ChartHypothesis *hypo, ChartManage
VERBOSE(3,"worse than matching hyp " << hypoExisting->GetId() << ", recombining" << std::endl)
if (m_nBestIsEnabled) {
hypoExisting->AddArc(hypo);
- }
- else {
+ } else {
ChartHypothesis::Delete(hypo);
}
return false;
@@ -146,7 +145,7 @@ pair<ChartHypothesisCollection::HCType::iterator, bool> ChartHypothesisCollectio
return ret;
}
-/** Remove hypothesis pointed to by iterator but DOES NOT delete the object.
+/** Remove hypothesis pointed to by iterator but DOES NOT delete the object.
* \param iter iterator to delete
*/
void ChartHypothesisCollection::Detach(const HCType::iterator &iter)
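
The recombination branch above either keeps a losing hypothesis as an arc (so n-best extraction can still reach it) or deletes it immediately. A hedged sketch of that ownership decision with an illustrative Hypo type:

#include <vector>

struct Hypo {
  std::vector<Hypo*> arcs;
  void AddArc(Hypo *h) { arcs.push_back(h); }
};

// Returns false in both branches: the loser is never added as a new entry.
bool RecombineLoser(Hypo *existing, Hypo *loser, bool nBestIsEnabled)
{
  if (nBestIsEnabled) {
    existing->AddArc(loser);  // keep it reachable for n-best extraction
  } else {
    delete loser;             // no n-best list: discard immediately
  }
  return false;
}
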
diff --git a/moses/ChartHypothesisCollection.h b/moses/ChartHypothesisCollection.h
index f88cb8302..fa707b46d 100644
--- a/moses/ChartHypothesisCollection.h
+++ b/moses/ChartHypothesisCollection.h
@@ -46,7 +46,7 @@ public:
bool operator()(const ChartHypothesis* hypoA, const ChartHypothesis* hypoB) const {
// assert in same cell
const WordsRange &rangeA = hypoA->GetCurrSourceRange()
- , &rangeB = hypoB->GetCurrSourceRange();
+ , &rangeB = hypoB->GetCurrSourceRange();
CHECK(rangeA == rangeB);
// shouldn't be mixing hypos with different lhs
@@ -115,7 +115,9 @@ public:
}
//! return the best total score of all hypos in this collection
- float GetBestScore() const { return m_bestScore; }
+ float GetBestScore() const {
+ return m_bestScore;
+ }
void GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream, const std::map<unsigned,bool> &reachable) const;
diff --git a/moses/ChartManager.cpp b/moses/ChartManager.cpp
index fc4865af7..98f0e17f3 100644
--- a/moses/ChartManager.cpp
+++ b/moses/ChartManager.cpp
@@ -124,10 +124,13 @@ void ChartManager::ProcessSentence()
* Doesn't seem to do anything about walls and zones.
* @todo check walls & zones. Check that the implementation doesn't leak, xml options sometimes does if you're not careful
*/
-void ChartManager::AddXmlChartOptions() {
+void ChartManager::AddXmlChartOptions()
+{
const StaticData &staticData = StaticData::Instance();
const std::vector <ChartTranslationOptions*> xmlChartOptionsList = m_source.GetXmlChartTranslationOptions();
- IFVERBOSE(2) { cerr << "AddXmlChartOptions " << xmlChartOptionsList.size() << endl; }
+ IFVERBOSE(2) {
+ cerr << "AddXmlChartOptions " << xmlChartOptionsList.size() << endl;
+ }
if (xmlChartOptionsList.size() == 0) return;
for(std::vector<ChartTranslationOptions*>::const_iterator i = xmlChartOptionsList.begin();
@@ -160,12 +163,12 @@ const ChartHypothesis *ChartManager::GetBestHypothesis() const
}
}
- /** Calculate the n-best paths through the output hypergraph.
- * Return the list of paths with the variable ret
- * \param count how many paths to return
- * \param ret return argument
- * \param onlyDistinct whether to check for distinct output sentence or not (default - don't check, just return top n-paths)
- */
+/** Calculate the n-best paths through the output hypergraph.
+ * Return the list of paths with the variable ret
+ * \param count how may paths to return
+ * \param ret return argument
+ * \param onlyDistinct whether to check for distinct output sentence or not (default - don't check, just return top n-paths)
+ */
void ChartManager::CalcNBest(size_t count, ChartTrellisPathList &ret,bool onlyDistinct) const
{
size_t size = m_source.GetSize();
@@ -184,7 +187,7 @@ void ChartManager::CalcNBest(size_t count, ChartTrellisPathList &ret,bool onlyDi
// Add it to the n-best list.
if (count == 1) {
- ret.Add(basePath);
+ ret.Add(basePath);
return;
}
@@ -210,21 +213,21 @@ void ChartManager::CalcNBest(size_t count, ChartTrellisPathList &ret,bool onlyDi
// Get all complete translations
const HypoList *topHypos = lastCell.GetAllSortedHypotheses();
-
+
// Create a ChartTrellisDetour for each complete translation and add it to the queue
HypoList::const_iterator iter;
for (iter = topHypos->begin(); iter != topHypos->end(); ++iter) {
- const ChartHypothesis &hypo = **iter;
- boost::shared_ptr<ChartTrellisPath> basePath(new ChartTrellisPath(hypo));
- ChartTrellisDetour *detour = new ChartTrellisDetour(basePath, basePath->GetFinalNode(), hypo);
- contenders.Push(detour);
+ const ChartHypothesis &hypo = **iter;
+ boost::shared_ptr<ChartTrellisPath> basePath(new ChartTrellisPath(hypo));
+ ChartTrellisDetour *detour = new ChartTrellisDetour(basePath, basePath->GetFinalNode(), hypo);
+ contenders.Push(detour);
}
-
+
delete topHypos;
// Record the output phrase if distinct translations are required.
set<Phrase> distinctHyps;
-
+
// MAIN loop
for (size_t i = 0; ret.GetSize() < count && !contenders.Empty() && i < popLimit; ++i) {
// Get the best detour from the queue.
@@ -234,7 +237,7 @@ void ChartManager::CalcNBest(size_t count, ChartTrellisPathList &ret,bool onlyDi
// Create a full base path from the chosen detour.
//basePath.reset(new ChartTrellisPath(*detour));
boost::shared_ptr<ChartTrellisPath> path(new ChartTrellisPath(*detour));
-
+
// Generate new detours from this base path and add them to the queue of
// contenders. The new detours deviate from the base path by a single
// replacement along the previous detour sub-path.
@@ -259,17 +262,17 @@ void ChartManager::GetSearchGraph(long translationId, std::ostream &outputSearch
{
size_t size = m_source.GetSize();
- // which hypotheses are reachable?
- std::map<unsigned,bool> reachable;
- WordsRange fullRange(0, size-1);
- const ChartCell &lastCell = m_hypoStackColl.Get(fullRange);
+ // which hypotheses are reachable?
+ std::map<unsigned,bool> reachable;
+ WordsRange fullRange(0, size-1);
+ const ChartCell &lastCell = m_hypoStackColl.Get(fullRange);
const ChartHypothesis *hypo = lastCell.GetBestHypothesis();
if (hypo == NULL) {
// no hypothesis
return;
}
- FindReachableHypotheses( hypo, reachable);
+ FindReachableHypotheses( hypo, reachable);
for (size_t width = 1; width <= size; ++width) {
for (size_t startPos = 0; startPos <= size-width; ++startPos) {
@@ -285,42 +288,40 @@ void ChartManager::GetSearchGraph(long translationId, std::ostream &outputSearch
void ChartManager::FindReachableHypotheses( const ChartHypothesis *hypo, std::map<unsigned,bool> &reachable ) const
{
- // do not recurse, if already visited
- if (reachable.find(hypo->GetId()) != reachable.end())
- {
- return;
- }
-
- // recurse
- reachable[ hypo->GetId() ] = true;
- const std::vector<const ChartHypothesis*> &previous = hypo->GetPrevHypos();
- for(std::vector<const ChartHypothesis*>::const_iterator i = previous.begin(); i != previous.end(); ++i)
- {
- FindReachableHypotheses( *i, reachable );
- }
-
- // also loop over recombined hypotheses (arcs)
- const ChartArcList *arcList = hypo->GetArcList();
- if (arcList) {
- ChartArcList::const_iterator iterArc;
- for (iterArc = arcList->begin(); iterArc != arcList->end(); ++iterArc) {
- const ChartHypothesis &arc = **iterArc;
- FindReachableHypotheses( &arc, reachable );
- }
- }
+ // do not recurse, if already visited
+ if (reachable.find(hypo->GetId()) != reachable.end()) {
+ return;
+ }
+
+ // recurse
+ reachable[ hypo->GetId() ] = true;
+ const std::vector<const ChartHypothesis*> &previous = hypo->GetPrevHypos();
+ for(std::vector<const ChartHypothesis*>::const_iterator i = previous.begin(); i != previous.end(); ++i) {
+ FindReachableHypotheses( *i, reachable );
+ }
+
+ // also loop over recombined hypotheses (arcs)
+ const ChartArcList *arcList = hypo->GetArcList();
+ if (arcList) {
+ ChartArcList::const_iterator iterArc;
+ for (iterArc = arcList->begin(); iterArc != arcList->end(); ++iterArc) {
+ const ChartHypothesis &arc = **iterArc;
+ FindReachableHypotheses( &arc, reachable );
+ }
+ }
}
void ChartManager::CreateDeviantPaths(
- boost::shared_ptr<const ChartTrellisPath> basePath,
- ChartTrellisDetourQueue &q)
+ boost::shared_ptr<const ChartTrellisPath> basePath,
+ ChartTrellisDetourQueue &q)
{
CreateDeviantPaths(basePath, basePath->GetFinalNode(), q);
}
void ChartManager::CreateDeviantPaths(
- boost::shared_ptr<const ChartTrellisPath> basePath,
- const ChartTrellisNode &substitutedNode,
- ChartTrellisDetourQueue &queue)
+ boost::shared_ptr<const ChartTrellisPath> basePath,
+ const ChartTrellisNode &substitutedNode,
+ ChartTrellisDetourQueue &queue)
{
const ChartArcList *arcList = substitutedNode.GetHypothesis().GetArcList();
if (arcList) {
@@ -340,18 +341,18 @@ void ChartManager::CreateDeviantPaths(
}
}
-
-void ChartManager::PreCalculateScores()
+
+void ChartManager::PreCalculateScores()
{
for (size_t i = 0; i < m_translationOptionList.GetSize(); ++i) {
const ChartTranslationOptions& cto = m_translationOptionList.Get(i);
for (TargetPhraseCollection::const_iterator j = cto.GetTargetPhraseCollection().begin();
- j != cto.GetTargetPhraseCollection().end(); ++j) {
+ j != cto.GetTargetPhraseCollection().end(); ++j) {
const TargetPhrase* targetPhrase = *j;
if (m_precalculatedScores.find(*targetPhrase) == m_precalculatedScores.end()) {
ChartBasedFeatureContext context(*targetPhrase,m_source);
const vector<const StatelessFeatureFunction*>& sfs =
- StatelessFeatureFunction::GetStatelessFeatureFunctions();
+ StatelessFeatureFunction::GetStatelessFeatureFunctions();
ScoreComponentCollection& breakdown = m_precalculatedScores[*targetPhrase];
for (size_t k = 0; k < sfs.size(); ++k) {
sfs[k]->EvaluateChart(context,&breakdown);
@@ -362,18 +363,18 @@ void ChartManager::PreCalculateScores()
}
void ChartManager::InsertPreCalculatedScores(
- const TargetPhrase& targetPhrase, ScoreComponentCollection* scoreBreakdown) const
+ const TargetPhrase& targetPhrase, ScoreComponentCollection* scoreBreakdown) const
{
- boost::unordered_map<TargetPhrase,ScoreComponentCollection>::const_iterator scoreIter =
+ boost::unordered_map<TargetPhrase,ScoreComponentCollection>::const_iterator scoreIter =
m_precalculatedScores.find(targetPhrase);
if (scoreIter != m_precalculatedScores.end()) {
scoreBreakdown->PlusEquals(scoreIter->second);
} else {
TRACE_ERR("ERROR: " << targetPhrase << " missing from precalculation cache" << endl);
- assert(0);
+ assert(0);
}
}
-
+
} // namespace Moses
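
FindReachableHypotheses above is a depth-first walk over predecessor hypotheses and recombination arcs, with a visited map to stop repeated work. A self-contained sketch with an illustrative Node type:

#include <cstddef>
#include <map>
#include <vector>

struct Node {
  unsigned id;
  std::vector<const Node*> prev;  // predecessor hypotheses
  std::vector<const Node*> arcs;  // recombined alternatives
};

void FindReachable(const Node *n, std::map<unsigned, bool> &reachable)
{
  if (reachable.find(n->id) != reachable.end()) return;  // already visited
  reachable[n->id] = true;
  for (size_t i = 0; i < n->prev.size(); ++i)
    FindReachable(n->prev[i], reachable);
  for (size_t i = 0; i < n->arcs.size(); ++i)
    FindReachable(n->arcs[i], reachable);
}
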
diff --git a/moses/ChartManager.h b/moses/ChartManager.h
index 7f3f24a0b..736986e05 100644
--- a/moses/ChartManager.h
+++ b/moses/ChartManager.h
@@ -79,35 +79,37 @@ public:
void CalcNBest(size_t count, ChartTrellisPathList &ret, bool onlyDistinct=0) const;
void GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream) const;
- void FindReachableHypotheses( const ChartHypothesis *hypo, std::map<unsigned,bool> &reachable ) const; /* auxiliary function for GetSearchGraph */
+ void FindReachableHypotheses( const ChartHypothesis *hypo, std::map<unsigned,bool> &reachable ) const; /* auxiliary function for GetSearchGraph */
//! the input sentence being decoded
const InputType& GetSource() const {
return m_source;
}
-
+
//! debug data collected when decoding sentence
SentenceStats& GetSentenceStats() const {
return *m_sentenceStats;
}
-
+
/***
* to be called after processing a sentence (which may consist of more than just calling ProcessSentence() )
* currently an empty function
*/
void CalcDecoderStatistics() const
{ }
-
+
void ResetSentenceStats(const InputType& source) {
m_sentenceStats = std::auto_ptr<SentenceStats>(new SentenceStats(source));
}
  //! contiguous hypo id for each input sentence. For debugging purposes
- unsigned GetNextHypoId() { return m_hypothesisId++; }
+ unsigned GetNextHypoId() {
+ return m_hypothesisId++;
+ }
//! Access the pre-calculated values
void InsertPreCalculatedScores(const TargetPhrase& targetPhrase,
- ScoreComponentCollection* scoreBreakdown) const;
+ ScoreComponentCollection* scoreBreakdown) const;
};
diff --git a/moses/ChartParser.cpp b/moses/ChartParser.cpp
index 805bec7ab..0dba600e1 100644
--- a/moses/ChartParser.cpp
+++ b/moses/ChartParser.cpp
@@ -35,16 +35,18 @@ extern bool g_debug;
ChartParserUnknown::ChartParserUnknown() {}
-ChartParserUnknown::~ChartParserUnknown() {
+ChartParserUnknown::~ChartParserUnknown()
+{
RemoveAllInColl(m_unksrcs);
RemoveAllInColl(m_cacheTargetPhraseCollection);
}
-void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range, ChartParserCallback &to) {
+void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range, ChartParserCallback &to)
+{
// unknown word, add as trans opt
const StaticData &staticData = StaticData::Instance();
const UnknownWordPenaltyProducer *unknownWordPenaltyProducer = staticData.GetUnknownWordPenaltyProducer();
-
+
size_t isDigit = 0;
if (staticData.GetDropUnknown()) {
const Factor *f = sourceWord[0]; // TODO hack. shouldn't know which factor is surface
@@ -56,11 +58,11 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range
isDigit = 1;
// modify the starting bitmap
}
-
+
Phrase* unksrc = new Phrase(1);
unksrc->AddWord() = sourceWord;
m_unksrcs.push_back(unksrc);
-
+
//TranslationOption *transOpt;
if (! staticData.GetDropUnknown() || isDigit) {
// loop
@@ -69,19 +71,19 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range
for (iterLHS = lhsList.begin(); iterLHS != lhsList.end(); ++iterLHS) {
const string &targetLHSStr = iterLHS->first;
float prob = iterLHS->second;
-
+
// lhs
//const Word &sourceLHS = staticData.GetInputDefaultNonTerminal();
Word *targetLHS = new Word(true);
-
+
targetLHS->CreateFromString(Output, staticData.GetOutputFactorOrder(), targetLHSStr, true);
CHECK(targetLHS->GetFactor(0) != NULL);
-
+
// add to dictionary
TargetPhrase *targetPhrase = new TargetPhrase();
Word &targetWord = targetPhrase->AddWord();
targetWord.CreateUnknownWord(sourceWord);
-
+
// scores
float unknownScore = FloorScore(TransformScore(prob));
@@ -98,7 +100,7 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range
} else {
// drop source word. create blank trans opt
float unknownScore = FloorScore(-numeric_limits<float>::infinity());
-
+
TargetPhrase *targetPhrase = new TargetPhrase();
// loop
const UnknownLHSList &lhsList = staticData.GetUnknownLHS();
@@ -106,11 +108,11 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range
for (iterLHS = lhsList.begin(); iterLHS != lhsList.end(); ++iterLHS) {
const string &targetLHSStr = iterLHS->first;
//float prob = iterLHS->second;
-
+
Word *targetLHS = new Word(true);
targetLHS->CreateFromString(Output, staticData.GetOutputFactorOrder(), targetLHSStr, true);
CHECK(targetLHS->GetFactor(0) != NULL);
-
+
targetPhrase->GetScoreBreakdown().Assign(unknownWordPenaltyProducer, unknownScore);
targetPhrase->Evaluate(*unksrc);
@@ -125,7 +127,8 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range
ChartParser::ChartParser(InputType const &source, ChartCellCollectionBase &cells) :
m_decodeGraphList(StaticData::Instance().GetDecodeGraphs()),
- m_source(source) {
+ m_source(source)
+{
const StaticData &staticData = StaticData::Instance();
staticData.InitializeForInput(source);
@@ -139,14 +142,16 @@ ChartParser::ChartParser(InputType const &source, ChartCellCollectionBase &cells
}
}
-ChartParser::~ChartParser() {
+ChartParser::~ChartParser()
+{
RemoveAllInColl(m_ruleLookupManagers);
StaticData::Instance().CleanUpAfterSentenceProcessing(m_source);
}
-void ChartParser::Create(const WordsRange &wordsRange, ChartParserCallback &to) {
+void ChartParser::Create(const WordsRange &wordsRange, ChartParserCallback &to)
+{
assert(m_decodeGraphList.size() == m_ruleLookupManagers.size());
-
+
std::vector <DecodeGraph*>::const_iterator iterDecodeGraph;
std::vector <ChartRuleLookupManager*>::const_iterator iterRuleLookupManagers = m_ruleLookupManagers.begin();
for (iterDecodeGraph = m_decodeGraphList.begin(); iterDecodeGraph != m_decodeGraphList.end(); ++iterDecodeGraph, ++iterRuleLookupManagers) {
@@ -158,7 +163,7 @@ void ChartParser::Create(const WordsRange &wordsRange, ChartParserCallback &to)
ruleLookupManager.GetChartRuleCollection(wordsRange, to);
}
}
-
+
if (wordsRange.GetNumWordsCovered() == 1 && wordsRange.GetStartPos() != 0 && wordsRange.GetStartPos() != m_source.GetSize()-1) {
bool alwaysCreateDirectTranslationOption = StaticData::Instance().IsAlwaysCreateDirectTranslationOption();
if (to.Empty() || alwaysCreateDirectTranslationOption) {
@@ -166,7 +171,7 @@ void ChartParser::Create(const WordsRange &wordsRange, ChartParserCallback &to)
const Word &sourceWord = m_source.GetWord(wordsRange.GetStartPos());
m_unknown.Process(sourceWord, wordsRange, to);
}
- }
+ }
}
-
+
} // namespace Moses
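
The unknown-word path above scores via FloorScore(TransformScore(prob)): log-transform the probability, then clamp it so dropped words get a large negative penalty instead of -inf propagating through the model score. A sketch under the assumption that the floor is a constant; the -100 here is illustrative, Moses defines its own LOWEST_SCORE:

#include <algorithm>
#include <cmath>

inline float TransformScore(float prob)
{
  return std::log(prob);  // probabilities become non-positive log scores
}

inline float FloorScore(float score)
{
  const float LOWEST_SCORE = -100.0f;  // assumed floor constant
  return std::max(score, LOWEST_SCORE);
}

// float unknownScore = FloorScore(TransformScore(prob));
// In the drop-unknown branch above, FloorScore of -infinity clamps to the floor.
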
diff --git a/moses/ChartParser.h b/moses/ChartParser.h
index 9d8baa649..1ff99480d 100644
--- a/moses/ChartParser.h
+++ b/moses/ChartParser.h
@@ -39,31 +39,33 @@ class Phrase;
class TargetPhraseCollection;
class DecodeGraph;
-class ChartParserUnknown {
- public:
- ChartParserUnknown();
- ~ChartParserUnknown();
+class ChartParserUnknown
+{
+public:
+ ChartParserUnknown();
+ ~ChartParserUnknown();
- void Process(const Word &sourceWord, const WordsRange &range, ChartParserCallback &to);
+ void Process(const Word &sourceWord, const WordsRange &range, ChartParserCallback &to);
- private:
- std::vector<Phrase*> m_unksrcs;
- std::list<TargetPhraseCollection*> m_cacheTargetPhraseCollection;
- StackVec m_emptyStackVec;
+private:
+ std::vector<Phrase*> m_unksrcs;
+ std::list<TargetPhraseCollection*> m_cacheTargetPhraseCollection;
+ StackVec m_emptyStackVec;
};
-class ChartParser {
- public:
- ChartParser(const InputType &source, ChartCellCollectionBase &cells);
- ~ChartParser();
+class ChartParser
+{
+public:
+ ChartParser(const InputType &source, ChartCellCollectionBase &cells);
+ ~ChartParser();
- void Create(const WordsRange &range, ChartParserCallback &to);
+ void Create(const WordsRange &range, ChartParserCallback &to);
- private:
- ChartParserUnknown m_unknown;
- std::vector <DecodeGraph*> m_decodeGraphList;
- std::vector<ChartRuleLookupManager*> m_ruleLookupManagers;
- InputType const& m_source; /**< source sentence to be translated */
+private:
+ ChartParserUnknown m_unknown;
+ std::vector <DecodeGraph*> m_decodeGraphList;
+ std::vector<ChartRuleLookupManager*> m_ruleLookupManagers;
+ InputType const& m_source; /**< source sentence to be translated */
};
}
diff --git a/moses/ChartParserCallback.h b/moses/ChartParserCallback.h
index 797a57156..84ddb8b75 100644
--- a/moses/ChartParserCallback.h
+++ b/moses/ChartParserCallback.h
@@ -4,21 +4,23 @@
#include <list>
-namespace Moses {
+namespace Moses
+{
class TargetPhraseCollection;
class WordsRange;
class TargetPhrase;
-class ChartParserCallback {
- public:
- virtual ~ChartParserCallback() {}
+class ChartParserCallback
+{
+public:
+ virtual ~ChartParserCallback() {}
- virtual void Add(const TargetPhraseCollection &, const StackVec &, const WordsRange &) = 0;
+ virtual void Add(const TargetPhraseCollection &, const StackVec &, const WordsRange &) = 0;
- virtual bool Empty() const = 0;
+ virtual bool Empty() const = 0;
- virtual void AddPhraseOOV(TargetPhrase &phrase, std::list<TargetPhraseCollection*> &waste_memory, const WordsRange &range) = 0;
+ virtual void AddPhraseOOV(TargetPhrase &phrase, std::list<TargetPhraseCollection*> &waste_memory, const WordsRange &range) = 0;
};
} // namespace Moses
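
Note: ChartParserCallback is a pure interface with the three members shown, and ChartTranslationOptionList (further down) is its main implementer. As a sketch of the contract only, with hypothetical forward-declared argument types, a collector that merely tracks whether anything was added could look like:

#include <cstddef>
#include <list>

// Hypothetical simplified types; the real signatures take Moses classes.
struct TargetPhraseCollection;
struct StackVec;
struct WordsRange;
struct TargetPhrase;

// Minimal implementer sketch: Empty() must report whether any option
// has been added so far.
struct CountingCallback {
  std::size_t added = 0;
  void Add(const TargetPhraseCollection &, const StackVec &, const WordsRange &) {
    ++added;
  }
  bool Empty() const {
    return added == 0;
  }
  void AddPhraseOOV(TargetPhrase &, std::list<TargetPhraseCollection*> &, const WordsRange &) {
    ++added;
  }
};
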
diff --git a/moses/ChartRuleLookupManager.h b/moses/ChartRuleLookupManager.h
index da8c98cb4..ad936ff9c 100644
--- a/moses/ChartRuleLookupManager.h
+++ b/moses/ChartRuleLookupManager.h
@@ -50,7 +50,7 @@ public:
const InputType &GetSentence() const {
return m_sentence;
}
-
+
const ChartCellLabelSet &GetTargetLabelSet(size_t begin, size_t end) const {
return m_cellCollection.GetBase(WordsRange(begin, end)).GetTargetLabelSet();
}
diff --git a/moses/ChartTranslationOptionList.cpp b/moses/ChartTranslationOptionList.cpp
index 8f4422e23..5b72ea7a3 100644
--- a/moses/ChartTranslationOptionList.cpp
+++ b/moses/ChartTranslationOptionList.cpp
@@ -74,11 +74,11 @@ void ChartTranslationOptionList::Add(const TargetPhraseCollection &tpc,
if (m_size == m_collection.size()) {
// m_collection has reached capacity: create a new object.
m_collection.push_back(new ChartTranslationOptions(tpc, stackVec,
- range, score));
+ range, score));
} else {
// Overwrite an unused object.
*(m_collection[m_size]) = ChartTranslationOptions(tpc, stackVec,
- range, score);
+ range, score);
}
++m_size;
@@ -98,7 +98,8 @@ void ChartTranslationOptionList::Add(const TargetPhraseCollection &tpc,
}
}
-void ChartTranslationOptionList::AddPhraseOOV(TargetPhrase &phrase, std::list<TargetPhraseCollection*> &waste_memory, const WordsRange &range) {
+void ChartTranslationOptionList::AddPhraseOOV(TargetPhrase &phrase, std::list<TargetPhraseCollection*> &waste_memory, const WordsRange &range)
+{
TargetPhraseCollection *tpc = new TargetPhraseCollection();
tpc->Add(&phrase);
waste_memory.push_back(tpc);
@@ -106,7 +107,8 @@ void ChartTranslationOptionList::AddPhraseOOV(TargetPhrase &phrase, std::list<Ta
Add(*tpc, empty, range);
}
-void ChartTranslationOptionList::ApplyThreshold() {
+void ChartTranslationOptionList::ApplyThreshold()
+{
if (m_size > m_ruleLimit) {
// Something's gone wrong if the list has grown to m_ruleLimit * 2
// without being pruned.
@@ -134,8 +136,8 @@ void ChartTranslationOptionList::ApplyThreshold() {
scoreThreshold += StaticData::Instance().GetTranslationOptionThreshold();
CollType::iterator bound = std::partition(m_collection.begin(),
- m_collection.begin()+m_size,
- ScoreThresholdPred(scoreThreshold));
+ m_collection.begin()+m_size,
+ ScoreThresholdPred(scoreThreshold));
m_size = std::distance(m_collection.begin(), bound);
}
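
Note on ApplyThreshold() above: it prunes with std::partition, moving options that score at least (best estimate + translation-option threshold) to the front and shrinking m_size to the partition point, so pruned objects stay allocated for reuse. A self-contained illustration of the idiom over plain floats; the real predicate, ScoreThresholdPred, compares ChartTranslationOptions* by estimated score, and the comparison direction and threshold value here are assumptions:

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <iterator>
#include <vector>

int main()
{
  std::vector<float> scores = { -1.0f, -7.5f, -0.5f, -9.0f, -2.0f };
  float best = *std::max_element(scores.begin(), scores.end());
  float scoreThreshold = best + -3.0f;  // e.g. keep within 3.0 of the best
  // survivors are moved to the front; order within each side is unspecified
  std::vector<float>::iterator bound =
    std::partition(scores.begin(), scores.end(),
                   [scoreThreshold](float s) { return s >= scoreThreshold; });
  std::size_t kept = std::distance(scores.begin(), bound);
  std::printf("kept %zu of %zu options\n", kept, scores.size());  // kept 3 of 5
  return 0;
}
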
diff --git a/moses/ChartTranslationOptionList.h b/moses/ChartTranslationOptionList.h
index 0b56b1f61..a2979fcbc 100644
--- a/moses/ChartTranslationOptionList.h
+++ b/moses/ChartTranslationOptionList.h
@@ -32,27 +32,34 @@ class TargetPhraseCollection;
class WordsRange;
//! a vector of translation options for a specific range, in a specific sentence
-class ChartTranslationOptionList : public ChartParserCallback {
- public:
+class ChartTranslationOptionList : public ChartParserCallback
+{
+public:
ChartTranslationOptionList(size_t);
~ChartTranslationOptionList();
- const ChartTranslationOptions &Get(size_t i) const { return *m_collection[i]; }
+ const ChartTranslationOptions &Get(size_t i) const {
+ return *m_collection[i];
+ }
//! number of translation options
- size_t GetSize() const { return m_size; }
+ size_t GetSize() const {
+ return m_size;
+ }
void Add(const TargetPhraseCollection &, const StackVec &,
const WordsRange &);
void AddPhraseOOV(TargetPhrase &phrase, std::list<TargetPhraseCollection*> &waste_memory, const WordsRange &range);
- bool Empty() const { return m_size == 0; }
+ bool Empty() const {
+ return m_size == 0;
+ }
void Clear();
void ApplyThreshold();
- private:
+private:
typedef std::vector<ChartTranslationOptions*> CollType;
struct ScoreThresholdPred {
diff --git a/moses/ChartTranslationOptions.cpp b/moses/ChartTranslationOptions.cpp
index c55948a82..5ba88a0db 100644
--- a/moses/ChartTranslationOptions.cpp
+++ b/moses/ChartTranslationOptions.cpp
@@ -27,8 +27,8 @@ namespace Moses
{
float ChartTranslationOptions::CalcEstimateOfBestScore(
- const TargetPhraseCollection &tpc,
- const StackVec &stackVec)
+ const TargetPhraseCollection &tpc,
+ const StackVec &stackVec)
{
const TargetPhrase &targetPhrase = **(tpc.begin());
float estimateOfBestScore = targetPhrase.GetFutureScore();
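
Note: CalcEstimateOfBestScore starts from the future score of the collection's first (best) target phrase and, per the doc comment in ChartTranslationOptions.h below, adds the best score from each child hypothesis stack. A sketch of that arithmetic, with plain floats standing in for TargetPhraseCollection and StackVec:

#include <cstddef>
#include <vector>

// estimate = best phrase's future score + sum of best child hypo scores,
// one entry per non-terminal in the rule (illustrative stand-in types).
float EstimateBestScore(float bestPhraseFutureScore,
                        const std::vector<float> &bestChildScores)
{
  float estimate = bestPhraseFutureScore;
  for (std::size_t i = 0; i < bestChildScores.size(); ++i)
    estimate += bestChildScores[i];
  return estimate;
}
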
diff --git a/moses/ChartTranslationOptions.h b/moses/ChartTranslationOptions.h
index 4910723f7..459c91659 100644
--- a/moses/ChartTranslationOptions.h
+++ b/moses/ChartTranslationOptions.h
@@ -35,7 +35,7 @@ namespace Moses
*/
class ChartTranslationOptions
{
- public:
+public:
/** Constructor
\param targetPhraseColl @todo dunno
\param stackVec @todo dunno
@@ -43,13 +43,13 @@ class ChartTranslationOptions
\param score @todo dunno
*/
ChartTranslationOptions(const TargetPhraseCollection &targetPhraseColl,
- const StackVec &stackVec,
- const WordsRange &wordsRange,
- float score)
- : m_stackVec(stackVec)
- , m_targetPhraseCollection(&targetPhraseColl)
- , m_wordsRange(&wordsRange)
- , m_estimateOfBestScore(score)
+ const StackVec &stackVec,
+ const WordsRange &wordsRange,
+ float score)
+ : m_stackVec(stackVec)
+ , m_targetPhraseCollection(&targetPhraseColl)
+ , m_wordsRange(&wordsRange)
+ , m_estimateOfBestScore(score)
{}
~ChartTranslationOptions() {}
@@ -58,10 +58,12 @@ class ChartTranslationOptions
const StackVec &);
//! @todo dunno
- const StackVec &GetStackVec() const { return m_stackVec; }
+ const StackVec &GetStackVec() const {
+ return m_stackVec;
+ }
//! @todo isn't the translation supposed to just contain 1 target phrase, not a whole collection of them?
- const TargetPhraseCollection &GetTargetPhraseCollection() const {
+ const TargetPhraseCollection &GetTargetPhraseCollection() const {
return *m_targetPhraseCollection;
}
@@ -74,9 +76,11 @@ class ChartTranslationOptions
* the estimate is the sum of the top target phrase's estimated score plus the
* scores of the best child hypotheses.
*/
- inline float GetEstimateOfBestScore() const { return m_estimateOfBestScore; }
+ inline float GetEstimateOfBestScore() const {
+ return m_estimateOfBestScore;
+ }
- private:
+private:
StackVec m_stackVec; //! vector of hypothesis lists
const TargetPhraseCollection *m_targetPhraseCollection;
diff --git a/moses/ChartTrellisDetour.cpp b/moses/ChartTrellisDetour.cpp
index 550a44a2c..1a187396c 100644
--- a/moses/ChartTrellisDetour.cpp
+++ b/moses/ChartTrellisDetour.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -27,15 +27,15 @@ namespace Moses
{
ChartTrellisDetour::ChartTrellisDetour(
- boost::shared_ptr<const ChartTrellisPath> basePath,
- const ChartTrellisNode &substitutedNode,
- const ChartHypothesis &replacementHypo)
+ boost::shared_ptr<const ChartTrellisPath> basePath,
+ const ChartTrellisNode &substitutedNode,
+ const ChartHypothesis &replacementHypo)
: m_basePath(basePath)
, m_substitutedNode(substitutedNode)
, m_replacementHypo(replacementHypo)
{
float diff = replacementHypo.GetTotalScore()
- - substitutedNode.GetHypothesis().GetTotalScore();
+ - substitutedNode.GetHypothesis().GetTotalScore();
m_totalScore = basePath->GetTotalScore() + diff;
}
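
Worked example for the score computed in this constructor, with illustrative numbers: a base path at total score -10.0 whose substituted node's hypothesis scored -4.0, replaced by a hypothesis at -4.5, gives diff = -4.5 - (-4.0) = -0.5 and a detour total of -10.0 + (-0.5) = -10.5. Since hypothesis total scores already include their subtrees, the delta is exact rather than an estimate.
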
diff --git a/moses/ChartTrellisDetour.h b/moses/ChartTrellisDetour.h
index 977ccb67d..26c98bef8 100644
--- a/moses/ChartTrellisDetour.h
+++ b/moses/ChartTrellisDetour.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -31,20 +31,24 @@ class ChartTrellisPath;
*/
class ChartTrellisDetour
{
- public:
+public:
ChartTrellisDetour(boost::shared_ptr<const ChartTrellisPath>,
const ChartTrellisNode &, const ChartHypothesis &);
- const ChartTrellisPath &GetBasePath() const { return *m_basePath; }
+ const ChartTrellisPath &GetBasePath() const {
+ return *m_basePath;
+ }
const ChartTrellisNode &GetSubstitutedNode() const {
return m_substitutedNode;
}
const ChartHypothesis &GetReplacementHypo() const {
return m_replacementHypo;
}
- float GetTotalScore() const { return m_totalScore; }
+ float GetTotalScore() const {
+ return m_totalScore;
+ }
- private:
+private:
boost::shared_ptr<const ChartTrellisPath> m_basePath;
const ChartTrellisNode &m_substitutedNode;
const ChartHypothesis &m_replacementHypo;
diff --git a/moses/ChartTrellisDetourQueue.cpp b/moses/ChartTrellisDetourQueue.cpp
index 9b359ca43..4bb81d20b 100644
--- a/moses/ChartTrellisDetourQueue.cpp
+++ b/moses/ChartTrellisDetourQueue.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -21,13 +21,16 @@
#include "Util.h"
-namespace Moses {
+namespace Moses
+{
-ChartTrellisDetourQueue::~ChartTrellisDetourQueue() {
+ChartTrellisDetourQueue::~ChartTrellisDetourQueue()
+{
RemoveAllInColl(m_queue);
}
-void ChartTrellisDetourQueue::Push(const ChartTrellisDetour *detour) {
+void ChartTrellisDetourQueue::Push(const ChartTrellisDetour *detour)
+{
if (m_capacity == 0 || m_queue.size() < m_capacity) {
m_queue.insert(detour);
} else if (detour->GetTotalScore() > (*m_queue.rbegin())->GetTotalScore()) {
@@ -43,7 +46,8 @@ void ChartTrellisDetourQueue::Push(const ChartTrellisDetour *detour) {
}
}
-const ChartTrellisDetour *ChartTrellisDetourQueue::Pop() {
+const ChartTrellisDetour *ChartTrellisDetourQueue::Pop()
+{
QueueType::iterator p = m_queue.begin();
const ChartTrellisDetour *top = *p;
m_queue.erase(p);
diff --git a/moses/ChartTrellisDetourQueue.h b/moses/ChartTrellisDetourQueue.h
index d6505d8a2..2406a69f5 100644
--- a/moses/ChartTrellisDetourQueue.h
+++ b/moses/ChartTrellisDetourQueue.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -23,20 +23,24 @@
#include <set>
-namespace Moses {
+namespace Moses
+{
/** A bounded priority queue of ChartTrellisDetour pointers. The top item is
* the best scoring detour. The queue assumes ownership of pushed items and
* relinquishes ownership when they are popped. Any remaining items at the
* time of the queue's destruction are deleted.
*/
-class ChartTrellisDetourQueue {
- public:
+class ChartTrellisDetourQueue
+{
+public:
// Create empty queue with fixed capacity of c. Capacity 0 means unbounded.
ChartTrellisDetourQueue(size_t c) : m_capacity(c) {}
~ChartTrellisDetourQueue();
- bool Empty() const { return m_queue.empty(); }
+ bool Empty() const {
+ return m_queue.empty();
+ }
// Add the detour to the queue or delete it if the queue is full and the
// score is no better than the queue's worst score.
@@ -46,7 +50,7 @@ class ChartTrellisDetourQueue {
// caller is responsible for deleting the object.
const ChartTrellisDetour *Pop();
- private:
+private:
struct DetourOrderer {
bool operator()(const ChartTrellisDetour* a,
const ChartTrellisDetour* b) const {
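
Note on Push() in the .cpp hunk above: it inserts unconditionally while under capacity; once full, it admits only detours strictly better than the current worst and evicts that worst. A standalone sketch of the same bounded best-k idiom over floats; the real queue stores ChartTrellisDetour* ordered best-first by DetourOrderer, so its worst element sits at rbegin():

#include <cstddef>
#include <set>

// Bounded "keep the k best scores" set; capacity 0 means unbounded,
// matching the comment on ChartTrellisDetourQueue's constructor.
class BoundedBest
{
public:
  explicit BoundedBest(std::size_t c) : m_capacity(c) {}
  void Push(float score) {
    if (m_capacity == 0 || m_set.size() < m_capacity) {
      m_set.insert(score);
    } else if (score > *m_set.begin()) { // strictly better than the worst
      m_set.erase(m_set.begin());        // evict the worst ...
      m_set.insert(score);               // ... and keep the newcomer
    }                                    // otherwise drop the candidate
  }
  bool Empty() const {
    return m_set.empty();
  }
private:
  std::size_t m_capacity;
  std::multiset<float> m_set; // ascending, so begin() is the worst score
};
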
diff --git a/moses/ChartTrellisNode.cpp b/moses/ChartTrellisNode.cpp
index e55d4b1ab..73651f507 100644
--- a/moses/ChartTrellisNode.cpp
+++ b/moses/ChartTrellisNode.cpp
@@ -29,16 +29,16 @@ namespace Moses
{
ChartTrellisNode::ChartTrellisNode(const ChartHypothesis &hypo)
- : m_hypo(hypo)
+ : m_hypo(hypo)
{
CreateChildren();
}
ChartTrellisNode::ChartTrellisNode(const ChartTrellisDetour &detour,
ChartTrellisNode *&deviationPoint)
- : m_hypo((&detour.GetBasePath().GetFinalNode() == &detour.GetSubstitutedNode())
- ? detour.GetReplacementHypo()
- : detour.GetBasePath().GetFinalNode().GetHypothesis())
+ : m_hypo((&detour.GetBasePath().GetFinalNode() == &detour.GetSubstitutedNode())
+ ? detour.GetReplacementHypo()
+ : detour.GetBasePath().GetFinalNode().GetHypothesis())
{
if (&m_hypo == &detour.GetReplacementHypo()) {
deviationPoint = this;
@@ -54,9 +54,9 @@ ChartTrellisNode::ChartTrellisNode(const ChartTrellisNode &root,
const ChartTrellisNode &substitutedNode,
const ChartHypothesis &replacementHypo,
ChartTrellisNode *&deviationPoint)
- : m_hypo((&root == &substitutedNode)
- ? replacementHypo
- : root.GetHypothesis())
+ : m_hypo((&root == &substitutedNode)
+ ? replacementHypo
+ : root.GetHypothesis())
{
if (&root == &substitutedNode) {
deviationPoint = this;
@@ -118,8 +118,8 @@ void ChartTrellisNode::CreateChildren(const ChartTrellisNode &rootNode,
for (size_t ind = 0; ind < children.size(); ++ind) {
const ChartTrellisNode *origChild = children[ind];
ChartTrellisNode *child = new ChartTrellisNode(*origChild, substitutedNode,
- replacementHypo,
- deviationPoint);
+ replacementHypo,
+ deviationPoint);
m_children.push_back(child);
}
}
diff --git a/moses/ChartTrellisNode.h b/moses/ChartTrellisNode.h
index 58203677e..643809728 100644
--- a/moses/ChartTrellisNode.h
+++ b/moses/ChartTrellisNode.h
@@ -34,7 +34,7 @@ class ChartTrellisDetour;
*/
class ChartTrellisNode
{
- public:
+public:
typedef std::vector<ChartTrellisNode*> NodeChildren;
ChartTrellisNode(const ChartHypothesis &hypo);
@@ -42,15 +42,21 @@ class ChartTrellisNode
~ChartTrellisNode();
- const ChartHypothesis &GetHypothesis() const { return m_hypo; }
+ const ChartHypothesis &GetHypothesis() const {
+ return m_hypo;
+ }
- const NodeChildren &GetChildren() const { return m_children; }
+ const NodeChildren &GetChildren() const {
+ return m_children;
+ }
- const ChartTrellisNode &GetChild(size_t i) const { return *m_children[i]; }
+ const ChartTrellisNode &GetChild(size_t i) const {
+ return *m_children[i];
+ }
Phrase GetOutputPhrase() const;
- private:
+private:
ChartTrellisNode(const ChartTrellisNode &); // Not implemented
ChartTrellisNode& operator=(const ChartTrellisNode &); // Not implemented
diff --git a/moses/ChartTrellisPath.cpp b/moses/ChartTrellisPath.cpp
index 231d4237a..c53e636e9 100644
--- a/moses/ChartTrellisPath.cpp
+++ b/moses/ChartTrellisPath.cpp
@@ -30,17 +30,17 @@ namespace Moses
{
ChartTrellisPath::ChartTrellisPath(const ChartHypothesis &hypo)
- : m_finalNode(new ChartTrellisNode(hypo))
- , m_deviationPoint(NULL)
- , m_scoreBreakdown(hypo.GetScoreBreakdown())
- , m_totalScore(hypo.GetTotalScore())
+ : m_finalNode(new ChartTrellisNode(hypo))
+ , m_deviationPoint(NULL)
+ , m_scoreBreakdown(hypo.GetScoreBreakdown())
+ , m_totalScore(hypo.GetTotalScore())
{
}
ChartTrellisPath::ChartTrellisPath(const ChartTrellisDetour &detour)
- : m_finalNode(new ChartTrellisNode(detour, m_deviationPoint))
- , m_scoreBreakdown(detour.GetBasePath().m_scoreBreakdown)
- , m_totalScore(0)
+ : m_finalNode(new ChartTrellisNode(detour, m_deviationPoint))
+ , m_scoreBreakdown(detour.GetBasePath().m_scoreBreakdown)
+ , m_totalScore(0)
{
CHECK(m_deviationPoint);
ScoreComponentCollection scoreChange;
diff --git a/moses/ChartTrellisPath.h b/moses/ChartTrellisPath.h
index 6e5d50e0c..1023ad7b4 100644
--- a/moses/ChartTrellisPath.h
+++ b/moses/ChartTrellisPath.h
@@ -41,18 +41,24 @@ class ChartTrellisNode;
*/
class ChartTrellisPath
{
- public:
+public:
ChartTrellisPath(const ChartHypothesis &hypo);
ChartTrellisPath(const ChartTrellisDetour &detour);
~ChartTrellisPath();
- const ChartTrellisNode &GetFinalNode() const { return *m_finalNode; }
+ const ChartTrellisNode &GetFinalNode() const {
+ return *m_finalNode;
+ }
- const ChartTrellisNode *GetDeviationPoint() const { return m_deviationPoint; }
+ const ChartTrellisNode *GetDeviationPoint() const {
+ return m_deviationPoint;
+ }
//! get score for this path through the trellis
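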
- float GetTotalScore() const { return m_totalScore; }
+ float GetTotalScore() const {
+ return m_totalScore;
+ }
Phrase GetOutputPhrase() const;
@@ -61,7 +67,7 @@ class ChartTrellisPath
return m_scoreBreakdown;
}
- private:
+private:
ChartTrellisPath(const ChartTrellisPath &); // Not implemented
ChartTrellisPath &operator=(const ChartTrellisPath &); // Not implemented
diff --git a/moses/ConfusionNet.h b/moses/ConfusionNet.h
index 55fa0c8bf..c9c83e154 100644
--- a/moses/ConfusionNet.h
+++ b/moses/ConfusionNet.h
@@ -15,7 +15,7 @@ class FactorCollection;
class TranslationOptionCollection;
class Sentence;
-/** An input to the decoder where each position can be 1 of a number of words,
+/** An input to the decoder where each position can be 1 of a number of words,
* each with an associated probability. Compared with a sentence, where each position is a word
*/
class ConfusionNet : public InputType
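
Note on the doc comment in this hunk: it describes the input shape, where each source position carries several word alternatives, each with a probability, rather than a single token. A minimal data-layout sketch under that description; the real ConfusionNet stores Moses Word objects with factors, not strings:

#include <string>
#include <utility>
#include <vector>

// One column per source position; each column lists alternative words with
// probabilities (ideally summing to about 1 per column).
typedef std::vector<std::pair<std::string, float> > Column;
typedef std::vector<Column> ConfusionNetSketch;

// Example: position 0 is ambiguous between two recognizer outputs.
ConfusionNetSketch cn = { { {"flower", 0.7f}, {"flour", 0.3f} },
                          { {"power", 1.0f} } };
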
diff --git a/moses/DecodeFeature.cpp b/moses/DecodeFeature.cpp
index ebec7a7e3..57137170e 100644
--- a/moses/DecodeFeature.cpp
+++ b/moses/DecodeFeature.cpp
@@ -30,8 +30,8 @@ using namespace std;
namespace Moses
{
DecodeFeature::DecodeFeature( const std::string& description
- , const std::string &line)
-: StatelessFeatureFunction(description, line)
+ , const std::string &line)
+ : StatelessFeatureFunction(description, line)
{
VERBOSE(2,"DecodeFeature:" << std::endl);
for (size_t i = 0; i < m_args.size(); ++i) {
@@ -40,8 +40,7 @@ DecodeFeature::DecodeFeature( const std::string& description
if (args[0] == "input-factor") {
m_input =Tokenize<FactorType>(args[1], ",");
m_inputFactors = FactorMask(m_input);
- }
- else if (args[0] == "output-factor") {
+ } else if (args[0] == "output-factor") {
m_output =Tokenize<FactorType>(args[1], ",");
m_outputFactors = FactorMask(m_output);
}
@@ -50,20 +49,20 @@ DecodeFeature::DecodeFeature( const std::string& description
}
DecodeFeature::DecodeFeature( const std::string& description
- , size_t numScoreComponents
- , const std::string &line)
-: StatelessFeatureFunction(description,numScoreComponents, line)
+ , size_t numScoreComponents
+ , const std::string &line)
+ : StatelessFeatureFunction(description,numScoreComponents, line)
{
VERBOSE(2,"DecodeFeature: no factors yet" << std::endl);
}
DecodeFeature::DecodeFeature(const std::string& description
- , size_t numScoreComponents
- , const std::vector<FactorType> &input
- , const std::vector<FactorType> &output
- , const std::string &line)
-: StatelessFeatureFunction(description,numScoreComponents, line)
-, m_input(input), m_output(output)
+ , size_t numScoreComponents
+ , const std::vector<FactorType> &input
+ , const std::vector<FactorType> &output
+ , const std::string &line)
+ : StatelessFeatureFunction(description,numScoreComponents, line)
+ , m_input(input), m_output(output)
{
m_inputFactors = FactorMask(input);
m_outputFactors = FactorMask(output);
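
Note on the constructor hunks above: they parse "input-factor" and "output-factor" arguments by tokenizing a comma-separated list of factor indices into a vector, then building a FactorMask from it. A self-contained sketch of the parsing step; Tokenize<FactorType> and FactorMask are Moses utilities, so plain standard-library code stands in here:

#include <cstddef>
#include <sstream>
#include <string>
#include <vector>

// "0,2" -> {0, 2}, like Tokenize<FactorType>(args[1], ",").
std::vector<std::size_t> ParseFactors(const std::string &value)
{
  std::vector<std::size_t> out;
  std::stringstream ss(value);
  std::string tok;
  while (std::getline(ss, tok, ','))
    out.push_back(static_cast<std::size_t>(std::stoul(tok)));
  return out;
}
// FactorMask(m_input) is then essentially "set a bit for each listed factor".
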
diff --git a/moses/DecodeFeature.h b/moses/DecodeFeature.h
index b6352b181..d6cf3a323 100644
--- a/moses/DecodeFeature.h
+++ b/moses/DecodeFeature.h
@@ -34,9 +34,10 @@ namespace Moses
/**
* Baseclass for phrase-table or generation table feature function
**/
-class DecodeFeature : public StatelessFeatureFunction {
+class DecodeFeature : public StatelessFeatureFunction
+{
- public:
+public:
DecodeFeature( const std::string& description
, const std::string &line);
@@ -45,28 +46,29 @@ class DecodeFeature : public StatelessFeatureFunction {
, const std::string &line);
DecodeFeature( const std::string& description
- , size_t numScoreComponents
- , const std::vector<FactorType> &input
- , const std::vector<FactorType> &output
- , const std::string &line);
-
- //! returns output factor types as specified by the ini file
- const FactorMask& GetOutputFactorMask() const;
-
- //! returns input factor types as specified by the ini file
- const FactorMask& GetInputFactorMask() const;
-
- const std::vector<FactorType>& GetInput() const;
- const std::vector<FactorType>& GetOutput() const;
-
- bool IsDecodeFeature() const
- { return true; }
-
- protected:
- std::vector<FactorType> m_input;
- std::vector<FactorType> m_output;
- FactorMask m_inputFactors;
- FactorMask m_outputFactors;
+ , size_t numScoreComponents
+ , const std::vector<FactorType> &input
+ , const std::vector<FactorType> &output
+ , const std::string &line);
+
+ //! returns output factor types as specified by the ini file
+ const FactorMask& GetOutputFactorMask() const;
+
+ //! returns input factor types as specified by the ini file
+ const FactorMask& GetInputFactorMask() const;
+
+ const std::vector<FactorType>& GetInput() const;
+ const std::vector<FactorType>& GetOutput() const;
+
+ bool IsDecodeFeature() const {
+ return true;
+ }
+
+protected:
+ std::vector<FactorType> m_input;
+ std::vector<FactorType> m_output;
+ FactorMask m_inputFactors;
+ FactorMask m_outputFactors;
};
}
diff --git a/moses/DecodeStepTranslation.cpp b/moses/DecodeStepTranslation.cpp
index 0acd3479f..e4dbb673d 100644
--- a/moses/DecodeStepTranslation.cpp
+++ b/moses/DecodeStepTranslation.cpp
@@ -94,9 +94,9 @@ void DecodeStepTranslation::Process(const TranslationOption &inputPartialTranslO
void DecodeStepTranslation::ProcessInitialTranslation(
- const InputType &source
- ,PartialTranslOptColl &outputPartialTranslOptColl
- , size_t startPos, size_t endPos, bool adhereTableLimit) const
+ const InputType &source
+ ,PartialTranslOptColl &outputPartialTranslOptColl
+ , size_t startPos, size_t endPos, bool adhereTableLimit) const
{
const PhraseDictionary* phraseDictionary = GetPhraseDictionaryFeature();
const size_t tableLimit = phraseDictionary->GetTableLimit();
diff --git a/moses/FF/BleuScoreFeature.cpp b/moses/FF/BleuScoreFeature.cpp
index 091035b0f..7808d6012 100644
--- a/moses/FF/BleuScoreFeature.cpp
+++ b/moses/FF/BleuScoreFeature.cpp
@@ -5,90 +5,94 @@
using namespace std;
-namespace Moses {
+namespace Moses
+{
size_t BleuScoreState::bleu_order = 4;
BleuScoreState::BleuScoreState(): m_words(1),
- m_source_length(0),
- m_target_length(0),
- m_scaled_ref_length(0),
- m_ngram_counts(bleu_order),
- m_ngram_matches(bleu_order)
+ m_source_length(0),
+ m_target_length(0),
+ m_scaled_ref_length(0),
+ m_ngram_counts(bleu_order),
+ m_ngram_matches(bleu_order)
{
}
int BleuScoreState::Compare(const FFState& o) const
{
- if (&o == this)
- return 0;
-
- const StaticData &staticData = StaticData::Instance();
- SearchAlgorithm searchAlgorithm = staticData.GetSearchAlgorithm();
- bool chartDecoding = (searchAlgorithm == ChartDecoding);
- if (chartDecoding)
- return 0;
-
- const BleuScoreState& other = dynamic_cast<const BleuScoreState&>(o);
- int c = m_words.Compare(other.m_words);
- if (c != 0)
- return c;
-
- /*for(size_t i = 0; i < m_ngram_counts.size(); i++) {
- if (m_ngram_counts[i] < other.m_ngram_counts[i])
- return -1;
- if (m_ngram_counts[i] > other.m_ngram_counts[i])
- return 1;
- if (m_ngram_matches[i] < other.m_ngram_matches[i])
- return -1;
- if (m_ngram_matches[i] > other.m_ngram_matches[i])
- return 1;
- }*/
+ if (&o == this)
+ return 0;
+ const StaticData &staticData = StaticData::Instance();
+ SearchAlgorithm searchAlgorithm = staticData.GetSearchAlgorithm();
+ bool chartDecoding = (searchAlgorithm == ChartDecoding);
+ if (chartDecoding)
return 0;
+
+ const BleuScoreState& other = dynamic_cast<const BleuScoreState&>(o);
+ int c = m_words.Compare(other.m_words);
+ if (c != 0)
+ return c;
+
+ /*for(size_t i = 0; i < m_ngram_counts.size(); i++) {
+ if (m_ngram_counts[i] < other.m_ngram_counts[i])
+ return -1;
+ if (m_ngram_counts[i] > other.m_ngram_counts[i])
+ return 1;
+ if (m_ngram_matches[i] < other.m_ngram_matches[i])
+ return -1;
+ if (m_ngram_matches[i] > other.m_ngram_matches[i])
+ return 1;
+ }*/
+
+ return 0;
}
-std::ostream& operator<<(std::ostream& out, const BleuScoreState& state) {
+std::ostream& operator<<(std::ostream& out, const BleuScoreState& state)
+{
state.print(out);
return out;
}
-void BleuScoreState::print(std::ostream& out) const {
+void BleuScoreState::print(std::ostream& out) const
+{
out << "ref=" << m_scaled_ref_length
- << ";source=" << m_source_length
- << ";target=" << m_target_length << ";counts=";
+ << ";source=" << m_source_length
+ << ";target=" << m_target_length << ";counts=";
for (size_t i = 0; i < bleu_order; ++i) {
out << m_ngram_matches[i] << "/" << m_ngram_counts[i] << ",";
}
out << "ctxt=" << m_words;
-
+
}
void BleuScoreState::AddNgramCountAndMatches(std::vector< size_t >& counts,
- std::vector< size_t >& matches) {
- for (size_t order = 0; order < BleuScoreState::bleu_order; ++order) {
- m_ngram_counts[order] += counts[order];
- m_ngram_matches[order] += matches[order];
- }
+ std::vector< size_t >& matches)
+{
+ for (size_t order = 0; order < BleuScoreState::bleu_order; ++order) {
+ m_ngram_counts[order] += counts[order];
+ m_ngram_matches[order] += matches[order];
+ }
}
BleuScoreFeature::BleuScoreFeature(const std::string &line)
-:StatefulFeatureFunction("BleuScoreFeature",1, line),
-m_enabled(true),
-m_sentence_bleu(true),
-m_simple_history_bleu(false),
-m_count_history(BleuScoreState::bleu_order),
-m_match_history(BleuScoreState::bleu_order),
-m_source_length_history(0),
-m_target_length_history(0),
-m_ref_length_history(0),
-m_scale_by_input_length(true),
-m_scale_by_avg_input_length(false),
-m_scale_by_inverse_length(false),
-m_scale_by_avg_inverse_length(false),
-m_scale_by_x(1),
-m_historySmoothing(0.9),
-m_smoothing_scheme(PLUS_POINT_ONE)
+ :StatefulFeatureFunction("BleuScoreFeature",1, line),
+ m_enabled(true),
+ m_sentence_bleu(true),
+ m_simple_history_bleu(false),
+ m_count_history(BleuScoreState::bleu_order),
+ m_match_history(BleuScoreState::bleu_order),
+ m_source_length_history(0),
+ m_target_length_history(0),
+ m_ref_length_history(0),
+ m_scale_by_input_length(true),
+ m_scale_by_avg_input_length(false),
+ m_scale_by_inverse_length(false),
+ m_scale_by_avg_inverse_length(false),
+ m_scale_by_x(1),
+ m_historySmoothing(0.9),
+ m_smoothing_scheme(PLUS_POINT_ONE)
{
for (size_t i = 0; i < m_args.size(); ++i) {
const vector<string> &args = m_args[i];
@@ -131,10 +135,11 @@ m_smoothing_scheme(PLUS_POINT_ONE)
} // for (size_t i = 0; i < toks.size(); ++i) {
}
-void BleuScoreFeature::PrintHistory(std::ostream& out) const {
- out << "source length history=" << m_source_length_history << endl;
- out << "target length history=" << m_target_length_history << endl;
- out << "ref length history=" << m_ref_length_history << endl;
+void BleuScoreFeature::PrintHistory(std::ostream& out) const
+{
+ out << "source length history=" << m_source_length_history << endl;
+ out << "target length history=" << m_target_length_history << endl;
+ out << "ref length history=" << m_ref_length_history << endl;
for (size_t i = 0; i < BleuScoreState::bleu_order; ++i) {
out << "match history/count history (" << i << "):" << m_match_history[i] << "/" << m_count_history[i] << endl;
@@ -142,48 +147,49 @@ void BleuScoreFeature::PrintHistory(std::ostream& out) const {
}
void BleuScoreFeature::SetBleuParameters(bool disable, bool sentenceBleu, bool scaleByInputLength, bool scaleByAvgInputLength,
- bool scaleByInverseLength, bool scaleByAvgInverseLength,
- float scaleByX, float historySmoothing, size_t scheme, bool simpleHistoryBleu) {
- m_enabled = !disable;
- m_sentence_bleu = sentenceBleu;
- m_simple_history_bleu = simpleHistoryBleu;
- m_scale_by_input_length = scaleByInputLength;
- m_scale_by_avg_input_length = scaleByAvgInputLength;
- m_scale_by_inverse_length = scaleByInverseLength;
- m_scale_by_avg_inverse_length = scaleByAvgInverseLength;
- m_scale_by_x = scaleByX;
- m_historySmoothing = historySmoothing;
- m_smoothing_scheme = (SmoothingScheme)scheme;
+ bool scaleByInverseLength, bool scaleByAvgInverseLength,
+ float scaleByX, float historySmoothing, size_t scheme, bool simpleHistoryBleu)
+{
+ m_enabled = !disable;
+ m_sentence_bleu = sentenceBleu;
+ m_simple_history_bleu = simpleHistoryBleu;
+ m_scale_by_input_length = scaleByInputLength;
+ m_scale_by_avg_input_length = scaleByAvgInputLength;
+ m_scale_by_inverse_length = scaleByInverseLength;
+ m_scale_by_avg_inverse_length = scaleByAvgInverseLength;
+ m_scale_by_x = scaleByX;
+ m_historySmoothing = historySmoothing;
+ m_smoothing_scheme = (SmoothingScheme)scheme;
}
// Incoming references (refs) are stored as refs[file_id][[sent_id][reference]]
// This data structure: m_refs[sent_id][[vector<length>][ngrams]]
void BleuScoreFeature::LoadReferences(const std::vector< std::vector< std::string > >& refs)
{
- m_refs.clear();
- FactorCollection& fc = FactorCollection::Instance();
- for (size_t file_id = 0; file_id < refs.size(); file_id++) {
- for (size_t sent_id = 0; sent_id < refs[file_id].size(); sent_id++) {
- const string& ref = refs[file_id][sent_id];
- vector<string> refTokens = Tokenize(ref);
- if (file_id == 0)
- m_refs[sent_id] = RefValue();
- pair<vector<size_t>,NGrams>& ref_pair = m_refs[sent_id];
- (ref_pair.first).push_back(refTokens.size());
- for (size_t order = 1; order <= BleuScoreState::bleu_order; order++) {
- for (size_t end_idx = order; end_idx <= refTokens.size(); end_idx++) {
- Phrase ngram(1);
- for (size_t s_idx = end_idx - order; s_idx < end_idx; s_idx++) {
- const Factor* f = fc.AddFactor(Output, 0, refTokens[s_idx]);
- Word w;
- w.SetFactor(0, f);
- ngram.AddWord(w);
- }
- ref_pair.second[ngram] += 1;
- }
- }
- }
- }
+ m_refs.clear();
+ FactorCollection& fc = FactorCollection::Instance();
+ for (size_t file_id = 0; file_id < refs.size(); file_id++) {
+ for (size_t sent_id = 0; sent_id < refs[file_id].size(); sent_id++) {
+ const string& ref = refs[file_id][sent_id];
+ vector<string> refTokens = Tokenize(ref);
+ if (file_id == 0)
+ m_refs[sent_id] = RefValue();
+ pair<vector<size_t>,NGrams>& ref_pair = m_refs[sent_id];
+ (ref_pair.first).push_back(refTokens.size());
+ for (size_t order = 1; order <= BleuScoreState::bleu_order; order++) {
+ for (size_t end_idx = order; end_idx <= refTokens.size(); end_idx++) {
+ Phrase ngram(1);
+ for (size_t s_idx = end_idx - order; s_idx < end_idx; s_idx++) {
+ const Factor* f = fc.AddFactor(Output, 0, refTokens[s_idx]);
+ Word w;
+ w.SetFactor(0, f);
+ ngram.AddWord(w);
+ }
+ ref_pair.second[ngram] += 1;
+ }
+ }
+ }
+ }
// cerr << "Number of ref files: " << refs.size() << endl;
// for (size_t i = 0; i < m_refs.size(); ++i) {
@@ -191,51 +197,57 @@ void BleuScoreFeature::LoadReferences(const std::vector< std::vector< std::strin
// }
}
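
Note: the reindented LoadReferences stores, per sentence id, the vector of reference lengths plus pooled counts of every n-gram up to bleu_order across all reference files, as the data-structure comment above the function says. A simplified rebuild of that table over plain strings; the real code interns tokens as Factors and uses Phrase as the n-gram key:

#include <cstddef>
#include <map>
#include <sstream>
#include <string>
#include <vector>

typedef std::vector<std::string> Toks;
typedef std::map<Toks, std::size_t> NGramCounts;

// Lengths of each reference plus pooled n-gram counts for one sentence id.
struct RefEntry {
  std::vector<std::size_t> lengths;
  NGramCounts ngrams;
};

void AddReference(RefEntry &entry, const std::string &ref, std::size_t maxOrder)
{
  Toks toks;
  std::stringstream ss(ref);
  std::string t;
  while (ss >> t) toks.push_back(t);
  entry.lengths.push_back(toks.size());
  // same loop bounds as the hunk: order 1..maxOrder, windows ending at end_idx
  for (std::size_t order = 1; order <= maxOrder; ++order)
    for (std::size_t end = order; end <= toks.size(); ++end)
      entry.ngrams[Toks(toks.begin() + (end - order), toks.begin() + end)] += 1;
}
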
-void BleuScoreFeature::SetCurrSourceLength(size_t source_length) {
- m_cur_source_length = source_length;
+void BleuScoreFeature::SetCurrSourceLength(size_t source_length)
+{
+ m_cur_source_length = source_length;
}
-void BleuScoreFeature::SetCurrNormSourceLength(size_t source_length) {
- m_cur_norm_source_length = source_length;
+void BleuScoreFeature::SetCurrNormSourceLength(size_t source_length)
+{
+ m_cur_norm_source_length = source_length;
}
// m_refs[sent_id][[vector<length>][ngrams]]
-void BleuScoreFeature::SetCurrShortestRefLength(size_t sent_id) {
- // look for shortest reference
- int shortestRef = -1;
- for (size_t i = 0; i < (m_refs[sent_id].first).size(); ++i) {
- if (shortestRef == -1 || (m_refs[sent_id].first)[i] < shortestRef)
- shortestRef = (m_refs[sent_id].first)[i];
- }
- m_cur_ref_length = shortestRef;
+void BleuScoreFeature::SetCurrShortestRefLength(size_t sent_id)
+{
+ // look for shortest reference
+ int shortestRef = -1;
+ for (size_t i = 0; i < (m_refs[sent_id].first).size(); ++i) {
+ if (shortestRef == -1 || (m_refs[sent_id].first)[i] < shortestRef)
+ shortestRef = (m_refs[sent_id].first)[i];
+ }
+ m_cur_ref_length = shortestRef;
// cerr << "Set shortest cur_ref_length: " << m_cur_ref_length << endl;
}
-void BleuScoreFeature::SetCurrAvgRefLength(size_t sent_id) {
- // compute average reference length
- size_t sum = 0;
- size_t numberRefs = (m_refs[sent_id].first).size();
- for (size_t i = 0; i < numberRefs; ++i) {
- sum += (m_refs[sent_id].first)[i];
- }
- m_cur_ref_length = (float)sum/numberRefs;
+void BleuScoreFeature::SetCurrAvgRefLength(size_t sent_id)
+{
+ // compute average reference length
+ size_t sum = 0;
+ size_t numberRefs = (m_refs[sent_id].first).size();
+ for (size_t i = 0; i < numberRefs; ++i) {
+ sum += (m_refs[sent_id].first)[i];
+ }
+ m_cur_ref_length = (float)sum/numberRefs;
// cerr << "Set average cur_ref_length: " << m_cur_ref_length << endl;
}
-void BleuScoreFeature::SetCurrReferenceNgrams(size_t sent_id) {
- m_cur_ref_ngrams = m_refs[sent_id].second;
+void BleuScoreFeature::SetCurrReferenceNgrams(size_t sent_id)
+{
+ m_cur_ref_ngrams = m_refs[sent_id].second;
}
-size_t BleuScoreFeature::GetShortestRefIndex(size_t ref_id) {
- // look for shortest reference
- int shortestRef = -1;
- size_t shortestRefIndex = 0;
- for (size_t i = 0; i < (m_refs[ref_id].first).size(); ++i) {
- if (shortestRef == -1 || (m_refs[ref_id].first)[i] < shortestRef) {
- shortestRef = (m_refs[ref_id].first)[i];
- shortestRefIndex = i;
- }
- }
- return shortestRefIndex;
+size_t BleuScoreFeature::GetShortestRefIndex(size_t ref_id)
+{
+ // look for shortest reference
+ int shortestRef = -1;
+ size_t shortestRefIndex = 0;
+ for (size_t i = 0; i < (m_refs[ref_id].first).size(); ++i) {
+ if (shortestRef == -1 || (m_refs[ref_id].first)[i] < shortestRef) {
+ shortestRef = (m_refs[ref_id].first)[i];
+ shortestRefIndex = i;
+ }
+ }
+ return shortestRefIndex;
}
/*
@@ -244,73 +256,75 @@ size_t BleuScoreFeature::GetShortestRefIndex(size_t ref_id) {
* O = m_historySmoothing * (O + c(e_oracle))
* O_f = m_historySmoothing * (O_f + |f|) input length of pseudo-document
*/
-void BleuScoreFeature::UpdateHistory(const vector< const Word* >& hypo) {
- Phrase phrase(hypo);
+void BleuScoreFeature::UpdateHistory(const vector< const Word* >& hypo)
+{
+ Phrase phrase(hypo);
+ std::vector< size_t > ngram_counts(BleuScoreState::bleu_order);
+ std::vector< size_t > ngram_matches(BleuScoreState::bleu_order);
+
+ // compute vector c(e;{r_k}):
+ // vector of effective reference length, number of ngrams in e, number of ngram matches between e and r_k
+ GetNgramMatchCounts(phrase, m_cur_ref_ngrams, ngram_counts, ngram_matches, 0);
+
+ // update counts and matches for every ngram length with counts from hypo
+ for (size_t i = 0; i < BleuScoreState::bleu_order; i++) {
+ m_count_history[i] = m_historySmoothing * (m_count_history[i] + ngram_counts[i]);
+ m_match_history[i] = m_historySmoothing * (m_match_history[i] + ngram_matches[i]);
+ }
+
+ // update counts for reference and target length
+ m_source_length_history = m_historySmoothing * (m_source_length_history + m_cur_source_length);
+ m_target_length_history = m_historySmoothing * (m_target_length_history + hypo.size());
+ m_ref_length_history = m_historySmoothing * (m_ref_length_history + m_cur_ref_length);
+}
+
+/*
+ * Update history with a batch of translations
+ */
+void BleuScoreFeature::UpdateHistory(const vector< vector< const Word* > >& hypos, vector<size_t>& sourceLengths, vector<size_t>& ref_ids, size_t rank, size_t epoch)
+{
+ for (size_t ref_id = 0; ref_id < hypos.size(); ++ref_id) {
+ Phrase phrase(hypos[ref_id]);
std::vector< size_t > ngram_counts(BleuScoreState::bleu_order);
std::vector< size_t > ngram_matches(BleuScoreState::bleu_order);
+ // set current source and reference information for each oracle in the batch
+ size_t cur_source_length = sourceLengths[ref_id];
+ size_t hypo_length = hypos[ref_id].size();
+ size_t cur_ref_length = GetClosestRefLength(ref_ids[ref_id], hypo_length);
+ NGrams cur_ref_ngrams = m_refs[ref_ids[ref_id]].second;
+ cerr << "reference length: " << cur_ref_length << endl;
+
// compute vector c(e;{r_k}):
// vector of effective reference length, number of ngrams in e, number of ngram matches between e and r_k
- GetNgramMatchCounts(phrase, m_cur_ref_ngrams, ngram_counts, ngram_matches, 0);
+ GetNgramMatchCounts(phrase, cur_ref_ngrams, ngram_counts, ngram_matches, 0);
// update counts and matches for every ngram length with counts from hypo
for (size_t i = 0; i < BleuScoreState::bleu_order; i++) {
- m_count_history[i] = m_historySmoothing * (m_count_history[i] + ngram_counts[i]);
- m_match_history[i] = m_historySmoothing * (m_match_history[i] + ngram_matches[i]);
+ m_count_history[i] += ngram_counts[i];
+ m_match_history[i] += ngram_matches[i];
+
+ // do this for last position in batch
+ if (ref_id == hypos.size() - 1) {
+ m_count_history[i] *= m_historySmoothing;
+ m_match_history[i] *= m_historySmoothing;
+ }
}
// update counts for reference and target length
- m_source_length_history = m_historySmoothing * (m_source_length_history + m_cur_source_length);
- m_target_length_history = m_historySmoothing * (m_target_length_history + hypo.size());
- m_ref_length_history = m_historySmoothing * (m_ref_length_history + m_cur_ref_length);
-}
-
-/*
- * Update history with a batch of translations
- */
-void BleuScoreFeature::UpdateHistory(const vector< vector< const Word* > >& hypos, vector<size_t>& sourceLengths, vector<size_t>& ref_ids, size_t rank, size_t epoch) {
- for (size_t ref_id = 0; ref_id < hypos.size(); ++ref_id){
- Phrase phrase(hypos[ref_id]);
- std::vector< size_t > ngram_counts(BleuScoreState::bleu_order);
- std::vector< size_t > ngram_matches(BleuScoreState::bleu_order);
-
- // set current source and reference information for each oracle in the batch
- size_t cur_source_length = sourceLengths[ref_id];
- size_t hypo_length = hypos[ref_id].size();
- size_t cur_ref_length = GetClosestRefLength(ref_ids[ref_id], hypo_length);
- NGrams cur_ref_ngrams = m_refs[ref_ids[ref_id]].second;
- cerr << "reference length: " << cur_ref_length << endl;
-
- // compute vector c(e;{r_k}):
- // vector of effective reference length, number of ngrams in e, number of ngram matches between e and r_k
- GetNgramMatchCounts(phrase, cur_ref_ngrams, ngram_counts, ngram_matches, 0);
-
- // update counts and matches for every ngram length with counts from hypo
- for (size_t i = 0; i < BleuScoreState::bleu_order; i++) {
- m_count_history[i] += ngram_counts[i];
- m_match_history[i] += ngram_matches[i];
-
- // do this for last position in batch
- if (ref_id == hypos.size() - 1) {
- m_count_history[i] *= m_historySmoothing;
- m_match_history[i] *= m_historySmoothing;
- }
- }
-
- // update counts for reference and target length
- m_source_length_history += cur_source_length;
- m_target_length_history += hypos[ref_id].size();
- m_ref_length_history += cur_ref_length;
-
- // do this for last position in batch
- if (ref_id == hypos.size() - 1) {
- cerr << "Rank " << rank << ", epoch " << epoch << " ,source length history: " << m_source_length_history << " --> " << m_source_length_history * m_historySmoothing << endl;
- cerr << "Rank " << rank << ", epoch " << epoch << " ,target length history: " << m_target_length_history << " --> " << m_target_length_history * m_historySmoothing << endl;
- m_source_length_history *= m_historySmoothing;
- m_target_length_history *= m_historySmoothing;
- m_ref_length_history *= m_historySmoothing;
- }
- }
+ m_source_length_history += cur_source_length;
+ m_target_length_history += hypos[ref_id].size();
+ m_ref_length_history += cur_ref_length;
+
+ // do this for last position in batch
+ if (ref_id == hypos.size() - 1) {
+ cerr << "Rank " << rank << ", epoch " << epoch << " ,source length history: " << m_source_length_history << " --> " << m_source_length_history * m_historySmoothing << endl;
+ cerr << "Rank " << rank << ", epoch " << epoch << " ,target length history: " << m_target_length_history << " --> " << m_target_length_history * m_historySmoothing << endl;
+ m_source_length_history *= m_historySmoothing;
+ m_target_length_history *= m_historySmoothing;
+ m_ref_length_history *= m_historySmoothing;
+ }
+ }
}
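
Note: both UpdateHistory variants implement the smoothing recurrence quoted before the first one, O = m_historySmoothing * (O + c(e_oracle)); the batch version accumulates raw counts and applies the decay once at the last item, which reduces to the single-hypothesis version for a batch of size one. Worked example with illustrative numbers: smoothing 0.9 (the constructor's default), a match history of 10, and 3 new matches give 0.9 * (10 + 3) = 11.7.
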
/*
@@ -323,17 +337,18 @@ void BleuScoreFeature::UpdateHistory(const vector< vector< const Word* > >& hypo
}
}*/
-size_t BleuScoreFeature::GetClosestRefLength(size_t ref_id, int hypoLength) {
- // look for closest reference
- int currentDist = -1;
- int closestRefLength = -1;
- for (size_t i = 0; i < (m_refs[ref_id].first).size(); ++i) {
- if (closestRefLength == -1 || abs(hypoLength - (int)(m_refs[ref_id].first)[i]) < currentDist) {
- closestRefLength = (m_refs[ref_id].first)[i];
- currentDist = abs(hypoLength - (int)(m_refs[ref_id].first)[i]);
- }
- }
- return (size_t)closestRefLength;
+size_t BleuScoreFeature::GetClosestRefLength(size_t ref_id, int hypoLength)
+{
+ // look for closest reference
+ int currentDist = -1;
+ int closestRefLength = -1;
+ for (size_t i = 0; i < (m_refs[ref_id].first).size(); ++i) {
+ if (closestRefLength == -1 || abs(hypoLength - (int)(m_refs[ref_id].first)[i]) < currentDist) {
+ closestRefLength = (m_refs[ref_id].first)[i];
+ currentDist = abs(hypoLength - (int)(m_refs[ref_id].first)[i]);
+ }
+ }
+ return (size_t)closestRefLength;
}
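
Worked example for GetClosestRefLength, with illustrative numbers: reference lengths {9, 12, 20} and a 13-word hypothesis give distances {4, 1, 7}, so 12 is returned; on a distance tie the earlier reference wins, because the comparison is strict.
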
/*
@@ -341,206 +356,206 @@ size_t BleuScoreFeature::GetClosestRefLength(size_t ref_id, int hypoLength) {
* its ngram matches against the ngrams in the reference translation
*/
void BleuScoreFeature::GetNgramMatchCounts(Phrase& phrase,
- const NGrams& ref_ngram_counts,
- std::vector< size_t >& ret_counts,
- std::vector< size_t >& ret_matches,
- size_t skip_first) const
+ const NGrams& ref_ngram_counts,
+ std::vector< size_t >& ret_counts,
+ std::vector< size_t >& ret_matches,
+ size_t skip_first) const
{
- NGrams::const_iterator ref_ngram_counts_iter;
- size_t ngram_start_idx, ngram_end_idx;
+ NGrams::const_iterator ref_ngram_counts_iter;
+ size_t ngram_start_idx, ngram_end_idx;
- // Chiang et al (2008) use unclipped counts of ngram matches
- for (size_t end_idx = skip_first; end_idx < phrase.GetSize(); end_idx++) {
- for (size_t order = 0; order < BleuScoreState::bleu_order; order++) {
- if (order > end_idx) break;
+ // Chiang et al (2008) use unclipped counts of ngram matches
+ for (size_t end_idx = skip_first; end_idx < phrase.GetSize(); end_idx++) {
+ for (size_t order = 0; order < BleuScoreState::bleu_order; order++) {
+ if (order > end_idx) break;
- ngram_end_idx = end_idx;
- ngram_start_idx = end_idx - order;
+ ngram_end_idx = end_idx;
+ ngram_start_idx = end_idx - order;
- Phrase ngram = phrase.GetSubString(WordsRange(ngram_start_idx, ngram_end_idx), 0);
- ret_counts[order]++;
+ Phrase ngram = phrase.GetSubString(WordsRange(ngram_start_idx, ngram_end_idx), 0);
+ ret_counts[order]++;
- ref_ngram_counts_iter = ref_ngram_counts.find(ngram);
- if (ref_ngram_counts_iter != ref_ngram_counts.end())
- ret_matches[order]++;
- }
+ ref_ngram_counts_iter = ref_ngram_counts.find(ngram);
+ if (ref_ngram_counts_iter != ref_ngram_counts.end())
+ ret_matches[order]++;
}
+ }
}
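
Note: the reindented loop enumerates, for every end position at or past skip_first, all n-grams of order 1..bleu_order ending there, increments that order's count, and increments the match count when the n-gram occurs in the reference table; as the Chiang et al (2008) comment says, matches are not clipped here. A compact standalone version over token vectors, with stand-ins for Phrase and NGrams:

#include <cstddef>
#include <map>
#include <string>
#include <vector>

typedef std::vector<std::string> Toks;
typedef std::map<Toks, std::size_t> NGramCounts;

// counts[k] and matches[k] hold (k+1)-gram totals, like ret_counts/ret_matches.
void CountMatches(const Toks &hyp, const NGramCounts &refNgrams,
                  std::vector<std::size_t> &counts,
                  std::vector<std::size_t> &matches,
                  std::size_t skipFirst, std::size_t maxOrder)
{
  for (std::size_t end = skipFirst; end < hyp.size(); ++end) {
    for (std::size_t order = 0; order < maxOrder; ++order) {
      if (order > end) break; // n-gram would start before position 0
      Toks ngram(hyp.begin() + (end - order), hyp.begin() + end + 1);
      ++counts[order];
      if (refNgrams.count(ngram))
        ++matches[order]; // unclipped: every occurrence counts
    }
  }
}
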
// score ngrams of words that have been added before the previous word span
void BleuScoreFeature::GetNgramMatchCounts_prefix(Phrase& phrase,
- const NGrams& ref_ngram_counts,
- std::vector< size_t >& ret_counts,
- std::vector< size_t >& ret_matches,
- size_t new_start_indices,
- size_t last_end_index) const
+ const NGrams& ref_ngram_counts,
+ std::vector< size_t >& ret_counts,
+ std::vector< size_t >& ret_matches,
+ size_t new_start_indices,
+ size_t last_end_index) const
{
- NGrams::const_iterator ref_ngram_counts_iter;
- size_t ngram_start_idx, ngram_end_idx;
-
- // Chiang et al (2008) use unclipped counts of ngram matches
- for (size_t start_idx = 0; start_idx < new_start_indices; start_idx++) {
- for (size_t order = 0; order < BleuScoreState::bleu_order; order++) {
- ngram_start_idx = start_idx;
- ngram_end_idx = start_idx + order;
- if (order > ngram_end_idx) break;
- if (ngram_end_idx > last_end_index) break;
-
- Phrase ngram = phrase.GetSubString(WordsRange(ngram_start_idx, ngram_end_idx), 0);
- ret_counts[order]++;
-
- ref_ngram_counts_iter = ref_ngram_counts.find(ngram);
- if (ref_ngram_counts_iter != ref_ngram_counts.end())
- ret_matches[order]++;
- }
+ NGrams::const_iterator ref_ngram_counts_iter;
+ size_t ngram_start_idx, ngram_end_idx;
+
+ // Chiang et al (2008) use unclipped counts of ngram matches
+ for (size_t start_idx = 0; start_idx < new_start_indices; start_idx++) {
+ for (size_t order = 0; order < BleuScoreState::bleu_order; order++) {
+ ngram_start_idx = start_idx;
+ ngram_end_idx = start_idx + order;
+ if (order > ngram_end_idx) break;
+ if (ngram_end_idx > last_end_index) break;
+
+ Phrase ngram = phrase.GetSubString(WordsRange(ngram_start_idx, ngram_end_idx), 0);
+ ret_counts[order]++;
+
+ ref_ngram_counts_iter = ref_ngram_counts.find(ngram);
+ if (ref_ngram_counts_iter != ref_ngram_counts.end())
+ ret_matches[order]++;
}
+ }
}
// score ngrams around the overlap of two previously scored phrases
void BleuScoreFeature::GetNgramMatchCounts_overlap(Phrase& phrase,
- const NGrams& ref_ngram_counts,
- std::vector< size_t >& ret_counts,
- std::vector< size_t >& ret_matches,
- size_t overlap_index) const
+ const NGrams& ref_ngram_counts,
+ std::vector< size_t >& ret_counts,
+ std::vector< size_t >& ret_matches,
+ size_t overlap_index) const
{
- NGrams::const_iterator ref_ngram_counts_iter;
- size_t ngram_start_idx, ngram_end_idx;
+ NGrams::const_iterator ref_ngram_counts_iter;
+ size_t ngram_start_idx, ngram_end_idx;
- // Chiang et al (2008) use unclipped counts of ngram matches
- for (size_t end_idx = overlap_index; end_idx < phrase.GetSize(); end_idx++) {
- if (end_idx >= (overlap_index+BleuScoreState::bleu_order-1)) break;
- for (size_t order = 0; order < BleuScoreState::bleu_order; order++) {
- if (order > end_idx) break;
+ // Chiang et al (2008) use unclipped counts of ngram matches
+ for (size_t end_idx = overlap_index; end_idx < phrase.GetSize(); end_idx++) {
+ if (end_idx >= (overlap_index+BleuScoreState::bleu_order-1)) break;
+ for (size_t order = 0; order < BleuScoreState::bleu_order; order++) {
+ if (order > end_idx) break;
- ngram_end_idx = end_idx;
- ngram_start_idx = end_idx - order;
- if (ngram_start_idx >= overlap_index) continue; // only score ngrams that span the overlap point
+ ngram_end_idx = end_idx;
+ ngram_start_idx = end_idx - order;
+ if (ngram_start_idx >= overlap_index) continue; // only score ngrams that span the overlap point
- Phrase ngram = phrase.GetSubString(WordsRange(ngram_start_idx, ngram_end_idx), 0);
- ret_counts[order]++;
+ Phrase ngram = phrase.GetSubString(WordsRange(ngram_start_idx, ngram_end_idx), 0);
+ ret_counts[order]++;
- ref_ngram_counts_iter = ref_ngram_counts.find(ngram);
- if (ref_ngram_counts_iter != ref_ngram_counts.end())
- ret_matches[order]++;
- }
+ ref_ngram_counts_iter = ref_ngram_counts.find(ngram);
+ if (ref_ngram_counts_iter != ref_ngram_counts.end())
+ ret_matches[order]++;
}
+ }
}
void BleuScoreFeature::GetClippedNgramMatchesAndCounts(Phrase& phrase,
- const NGrams& ref_ngram_counts,
- std::vector< size_t >& ret_counts,
- std::vector< size_t >& ret_matches,
- size_t skip_first) const
+ const NGrams& ref_ngram_counts,
+ std::vector< size_t >& ret_counts,
+ std::vector< size_t >& ret_matches,
+ size_t skip_first) const
{
- NGrams::const_iterator ref_ngram_counts_iter;
- size_t ngram_start_idx, ngram_end_idx;
+ NGrams::const_iterator ref_ngram_counts_iter;
+ size_t ngram_start_idx, ngram_end_idx;
- Matches ngram_matches;
- for (size_t end_idx = skip_first; end_idx < phrase.GetSize(); end_idx++) {
- for (size_t order = 0; order < BleuScoreState::bleu_order; order++) {
- if (order > end_idx) break;
+ Matches ngram_matches;
+ for (size_t end_idx = skip_first; end_idx < phrase.GetSize(); end_idx++) {
+ for (size_t order = 0; order < BleuScoreState::bleu_order; order++) {
+ if (order > end_idx) break;
- ngram_end_idx = end_idx;
- ngram_start_idx = end_idx - order;
+ ngram_end_idx = end_idx;
+ ngram_start_idx = end_idx - order;
- Phrase ngram = phrase.GetSubString(WordsRange(ngram_start_idx, ngram_end_idx), 0);
- ret_counts[order]++;
+ Phrase ngram = phrase.GetSubString(WordsRange(ngram_start_idx, ngram_end_idx), 0);
+ ret_counts[order]++;
- ref_ngram_counts_iter = ref_ngram_counts.find(ngram);
- if (ref_ngram_counts_iter != ref_ngram_counts.end()) {
- ngram_matches[order][ngram]++;
- }
- }
- }
+ ref_ngram_counts_iter = ref_ngram_counts.find(ngram);
+ if (ref_ngram_counts_iter != ref_ngram_counts.end()) {
+ ngram_matches[order][ngram]++;
+ }
+ }
+ }
- // clip ngram matches
- for (size_t order = 0; order < BleuScoreState::bleu_order; order++) {
- NGrams::const_iterator iter;
-
- // iterate over ngram counts for every ngram order
- for (iter=ngram_matches[order].begin(); iter != ngram_matches[order].end(); ++iter) {
- ref_ngram_counts_iter = ref_ngram_counts.find(iter->first);
- if (iter->second > ref_ngram_counts_iter->second) {
- ret_matches[order] += ref_ngram_counts_iter->second;
- }
- else {
- ret_matches[order] += iter->second;
- }
+ // clip ngram matches
+ for (size_t order = 0; order < BleuScoreState::bleu_order; order++) {
+ NGrams::const_iterator iter;
+
+ // iterate over ngram counts for every ngram order
+ for (iter=ngram_matches[order].begin(); iter != ngram_matches[order].end(); ++iter) {
+ ref_ngram_counts_iter = ref_ngram_counts.find(iter->first);
+ if (iter->second > ref_ngram_counts_iter->second) {
+ ret_matches[order] += ref_ngram_counts_iter->second;
+ } else {
+ ret_matches[order] += iter->second;
+ }
}
- }
+ }
}
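
Note: the clipped variant above first tallies matches per distinct n-gram, then caps each by its reference count, i.e. ret_matches[order] += min(hypCount, refCount). Worked example: a hypothesis containing "the" five times against a reference containing it twice contributes 2, not 5, to the unigram matches. The dereference of ref_ngram_counts_iter in the clipping loop is safe only because ngram_matches was populated exclusively with n-grams already found in the reference.
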
/*
* Given a previous state, compute Bleu score for the updated state with an additional target
* phrase translated.
*/
-FFState* BleuScoreFeature::Evaluate(const Hypothesis& cur_hypo,
- const FFState* prev_state,
+FFState* BleuScoreFeature::Evaluate(const Hypothesis& cur_hypo,
+ const FFState* prev_state,
ScoreComponentCollection* accumulator) const
{
- if (!m_enabled) return new BleuScoreState();
-
- NGrams::const_iterator reference_ngrams_iter;
- const BleuScoreState& ps = dynamic_cast<const BleuScoreState&>(*prev_state);
- BleuScoreState* new_state = new BleuScoreState(ps);
-
- float old_bleu, new_bleu;
- size_t num_new_words, ctx_start_idx, ctx_end_idx;
-
- // Calculate old bleu;
- old_bleu = CalculateBleu(new_state);
-
- // Get context and append new words.
- num_new_words = cur_hypo.GetCurrTargetLength();
- if (num_new_words == 0) {
- return new_state;
- }
-
- Phrase new_words = ps.m_words;
- new_words.Append(cur_hypo.GetCurrTargetPhrase());
- //cerr << "NW: " << new_words << endl;
+ if (!m_enabled) return new BleuScoreState();
- // get ngram matches for new words
- GetNgramMatchCounts(new_words,
- m_cur_ref_ngrams,
- new_state->m_ngram_counts,
- new_state->m_ngram_matches,
- new_state->m_words.GetSize()); // number of words in previous states
+ NGrams::const_iterator reference_ngrams_iter;
+ const BleuScoreState& ps = dynamic_cast<const BleuScoreState&>(*prev_state);
+ BleuScoreState* new_state = new BleuScoreState(ps);
- // Update state variables
- ctx_end_idx = new_words.GetSize()-1;
- size_t bleu_context_length = BleuScoreState::bleu_order -1;
- if (ctx_end_idx > bleu_context_length) {
- ctx_start_idx = ctx_end_idx - bleu_context_length;
- } else {
- ctx_start_idx = 0;
- }
+ float old_bleu, new_bleu;
+ size_t num_new_words, ctx_start_idx, ctx_end_idx;
- WordsBitmap coverageVector = cur_hypo.GetWordsBitmap();
- new_state->m_source_length = coverageVector.GetNumWordsCovered();
+ // Calculate old bleu;
+ old_bleu = CalculateBleu(new_state);
- new_state->m_words = new_words.GetSubString(WordsRange(ctx_start_idx,
- ctx_end_idx));
- new_state->m_target_length += cur_hypo.GetCurrTargetLength();
+ // Get context and append new words.
+ num_new_words = cur_hypo.GetCurrTargetLength();
+ if (num_new_words == 0) {
+ return new_state;
+ }
- // we need a scaled reference length to compare the current target phrase to the corresponding reference phrase
- new_state->m_scaled_ref_length = m_cur_ref_length *
- ((float)coverageVector.GetNumWordsCovered()/coverageVector.GetSize());
+ Phrase new_words = ps.m_words;
+ new_words.Append(cur_hypo.GetCurrTargetPhrase());
+ //cerr << "NW: " << new_words << endl;
- // Calculate new bleu.
- new_bleu = CalculateBleu(new_state);
+ // get ngram matches for new words
+ GetNgramMatchCounts(new_words,
+ m_cur_ref_ngrams,
+ new_state->m_ngram_counts,
+ new_state->m_ngram_matches,
+ new_state->m_words.GetSize()); // number of words in previous states
- // Set score to new Bleu score
- accumulator->PlusEquals(this, new_bleu - old_bleu);
- return new_state;
+ // Update state variables
+ ctx_end_idx = new_words.GetSize()-1;
+ size_t bleu_context_length = BleuScoreState::bleu_order -1;
+ if (ctx_end_idx > bleu_context_length) {
+ ctx_start_idx = ctx_end_idx - bleu_context_length;
+ } else {
+ ctx_start_idx = 0;
+ }
+
+ WordsBitmap coverageVector = cur_hypo.GetWordsBitmap();
+ new_state->m_source_length = coverageVector.GetNumWordsCovered();
+
+ new_state->m_words = new_words.GetSubString(WordsRange(ctx_start_idx,
+ ctx_end_idx));
+ new_state->m_target_length += cur_hypo.GetCurrTargetLength();
+
+ // we need a scaled reference length to compare the current target phrase to the corresponding reference phrase
+ new_state->m_scaled_ref_length = m_cur_ref_length *
+ ((float)coverageVector.GetNumWordsCovered()/coverageVector.GetSize());
+
+ // Calculate new bleu.
+ new_bleu = CalculateBleu(new_state);
+
+ // Set score to new Bleu score
+ accumulator->PlusEquals(this, new_bleu - old_bleu);
+ return new_state;
}
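
Note: the restructured Evaluate scores incrementally. It computes the pseudo-BLEU of the previous state, appends the new target phrase to the carried context, recounts n-grams only for the new words (skip_first is the carried context's size), trims the context to bleu_order - 1 = 3 words (the furthest a future 4-gram can reach back), rescales the reference length by source coverage, and adds new_bleu - old_bleu to the accumulator. Summing these deltas along a complete hypothesis therefore reproduces its full pseudo-BLEU; with illustrative numbers, a state at 0.42 that moves to 0.45 after the phrase contributes +0.03.
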
FFState* BleuScoreFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int featureID,
- ScoreComponentCollection* accumulator ) const {
+ ScoreComponentCollection* accumulator ) const
+{
if (!m_enabled) return new BleuScoreState();
-
+
NGrams::const_iterator reference_ngrams_iter;
-
+
const Phrase& curr_target_phrase = static_cast<const Phrase&>(cur_hypo.GetCurrTargetPhrase());
// cerr << "\nCur target phrase: " << cur_hypo.GetTargetLHS() << " --> " << curr_target_phrase << endl;
@@ -553,35 +568,35 @@ FFState* BleuScoreFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int fe
assert(cur_hypo.GetPrevHypos().size() <= 2);
BleuScoreState* new_state;
if (cur_hypo.GetPrevHypos().size() == 0)
- new_state = new BleuScoreState();
+ new_state = new BleuScoreState();
else {
- const FFState* prev_state_zero = cur_hypo.GetPrevHypo(0)->GetFFState(featureID);
- const BleuScoreState& ps_zero = dynamic_cast<const BleuScoreState&>(*prev_state_zero);
- new_state = new BleuScoreState(ps_zero);
- num_words_first_prev = ps_zero.m_target_length;
-
- for (size_t i = 0; i < cur_hypo.GetPrevHypos().size(); ++i) {
- const FFState* prev_state = cur_hypo.GetPrevHypo(i)->GetFFState(featureID);
- const BleuScoreState* ps = dynamic_cast<const BleuScoreState*>(prev_state);
- BleuScoreState* ps_nonConst = const_cast<BleuScoreState*>(ps);
+ const FFState* prev_state_zero = cur_hypo.GetPrevHypo(0)->GetFFState(featureID);
+ const BleuScoreState& ps_zero = dynamic_cast<const BleuScoreState&>(*prev_state_zero);
+ new_state = new BleuScoreState(ps_zero);
+ num_words_first_prev = ps_zero.m_target_length;
+
+ for (size_t i = 0; i < cur_hypo.GetPrevHypos().size(); ++i) {
+ const FFState* prev_state = cur_hypo.GetPrevHypo(i)->GetFFState(featureID);
+ const BleuScoreState* ps = dynamic_cast<const BleuScoreState*>(prev_state);
+ BleuScoreState* ps_nonConst = const_cast<BleuScoreState*>(ps);
// cerr << "prev phrase: " << cur_hypo.GetPrevHypo(i)->GetOutputPhrase()
// << " ( " << cur_hypo.GetPrevHypo(i)->GetTargetLHS() << ")" << endl;
- old_bleu += CalculateBleu(ps_nonConst);
- num_old_words += ps->m_target_length;
+ old_bleu += CalculateBleu(ps_nonConst);
+ num_old_words += ps->m_target_length;
- if (i > 0)
- // add ngram matches from other previous states
- new_state->AddNgramCountAndMatches(ps_nonConst->m_ngram_counts, ps_nonConst->m_ngram_matches);
- }
+ if (i > 0)
+ // add ngram matches from other previous states
+ new_state->AddNgramCountAndMatches(ps_nonConst->m_ngram_counts, ps_nonConst->m_ngram_matches);
+ }
}
-
+
// check if we are already done (don't add <s> and </s>)
size_t numWordsCovered = cur_hypo.GetCurrSourceRange().GetNumWordsCovered();
if (numWordsCovered == m_cur_source_length) {
- // Bleu score stays the same, do not need to add anything
- //accumulator->PlusEquals(this, 0);
- return new_state;
+ // Bleu score stays the same, no need to add anything
+ //accumulator->PlusEquals(this, 0);
+ return new_state;
}
// set new context
@@ -592,55 +607,52 @@ FFState* BleuScoreFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int fe
// get ngram matches for new words
if (num_old_words == 0) {
// cerr << "compute right ngram context" << endl;
- GetNgramMatchCounts(new_words,
- m_cur_ref_ngrams,
- new_state->m_ngram_counts,
- new_state->m_ngram_matches,
- 0);
- }
- else if (new_words.GetSize() == num_old_words) {
- // two hypotheses were glued together, compute new ngrams on the basis of first hypothesis
- num_words_added_right = num_curr_words - num_words_first_prev;
- // score around overlap point
+ GetNgramMatchCounts(new_words,
+ m_cur_ref_ngrams,
+ new_state->m_ngram_counts,
+ new_state->m_ngram_matches,
+ 0);
+ } else if (new_words.GetSize() == num_old_words) {
+ // two hypotheses were glued together, compute new ngrams on the basis of the first hypothesis
+ num_words_added_right = num_curr_words - num_words_first_prev;
+ // score around overlap point
// cerr << "compute overlap ngram context (" << (num_words_first_prev) << ")" << endl;
- GetNgramMatchCounts_overlap(new_words,
- m_cur_ref_ngrams,
- new_state->m_ngram_counts,
- new_state->m_ngram_matches,
- num_words_first_prev);
- }
- else if (num_old_words + curr_target_phrase.GetNumTerminals() == num_curr_words) {
- assert(curr_target_phrase.GetSize() == curr_target_phrase.GetNumTerminals()+1);
- // previous hypothesis + rule with 1 non-terminal were combined (NT substituted by Ts)
- for (size_t i = 0; i < curr_target_phrase.GetSize(); ++i)
- if (curr_target_phrase.GetWord(i).IsNonTerminal()) {
- num_words_added_left = i;
- num_words_added_right = curr_target_phrase.GetSize() - (i+1);
- break;
- }
-
- // left context
+ GetNgramMatchCounts_overlap(new_words,
+ m_cur_ref_ngrams,
+ new_state->m_ngram_counts,
+ new_state->m_ngram_matches,
+ num_words_first_prev);
+ } else if (num_old_words + curr_target_phrase.GetNumTerminals() == num_curr_words) {
+ assert(curr_target_phrase.GetSize() == curr_target_phrase.GetNumTerminals()+1);
+ // previous hypothesis + rule with 1 non-terminal were combined (NT substituted by Ts)
+ for (size_t i = 0; i < curr_target_phrase.GetSize(); ++i)
+ if (curr_target_phrase.GetWord(i).IsNonTerminal()) {
+ num_words_added_left = i;
+ num_words_added_right = curr_target_phrase.GetSize() - (i+1);
+ break;
+ }
+
+ // left context
// cerr << "compute left ngram context" << endl;
- if (num_words_added_left > 0)
- GetNgramMatchCounts_prefix(new_words,
- m_cur_ref_ngrams,
- new_state->m_ngram_counts,
- new_state->m_ngram_matches,
- num_words_added_left,
- num_curr_words - num_words_added_right - 1);
-
- // right context
+ if (num_words_added_left > 0)
+ GetNgramMatchCounts_prefix(new_words,
+ m_cur_ref_ngrams,
+ new_state->m_ngram_counts,
+ new_state->m_ngram_matches,
+ num_words_added_left,
+ num_curr_words - num_words_added_right - 1);
+
+ // right context
// cerr << "compute right ngram context" << endl;
- if (num_words_added_right > 0)
- GetNgramMatchCounts(new_words,
- m_cur_ref_ngrams,
- new_state->m_ngram_counts,
- new_state->m_ngram_matches,
- num_words_added_left + num_old_words);
- }
- else {
- cerr << "undefined state.. " << endl;
- exit(1);
+ if (num_words_added_right > 0)
+ GetNgramMatchCounts(new_words,
+ m_cur_ref_ngrams,
+ new_state->m_ngram_counts,
+ new_state->m_ngram_matches,
+ num_words_added_left + num_old_words);
+ } else {
+ cerr << "undefined state..." << endl;
+ exit(1);
}
// Update state variables
@@ -659,7 +671,7 @@ FFState* BleuScoreFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int fe
// reference phrase
size_t cur_source_length = m_cur_source_length;
new_state->m_scaled_ref_length = m_cur_ref_length * (float(new_state->m_source_length)/cur_source_length);
-
+
// Calculate new bleu.
new_bleu = CalculateBleu(new_state);
@@ -675,28 +687,28 @@ float BleuScoreFeature::CalculateBleu(Phrase translation) const
{
if (translation.GetSize() == 0)
return 0.0;
-
+
Phrase normTranslation = translation;
// remove start and end symbol for chart decoding
if (m_cur_source_length != m_cur_norm_source_length) {
WordsRange* range = new WordsRange(1, translation.GetSize()-2);
normTranslation = translation.GetSubString(*range);
}
-
+
// get ngram matches for translation
BleuScoreState* state = new BleuScoreState();
GetClippedNgramMatchesAndCounts(normTranslation,
- m_cur_ref_ngrams,
- state->m_ngram_counts,
- state->m_ngram_matches,
- 0); // number of words in previous states
+ m_cur_ref_ngrams,
+ state->m_ngram_counts,
+ state->m_ngram_matches,
+ 0); // number of words in previous states
// set state variables
state->m_words = normTranslation;
state->m_source_length = m_cur_norm_source_length;
state->m_target_length = normTranslation.GetSize();
state->m_scaled_ref_length = m_cur_ref_length;
-
+
// Calculate bleu.
return CalculateBleu(state);
}
@@ -704,52 +716,53 @@ float BleuScoreFeature::CalculateBleu(Phrase translation) const
/*
* Calculate Bleu score for a partial hypothesis given as state.
*/
-float BleuScoreFeature::CalculateBleu(BleuScoreState* state) const {
+float BleuScoreFeature::CalculateBleu(BleuScoreState* state) const
+{
if (!state->m_ngram_counts[0]) return 0;
if (!state->m_ngram_matches[0]) return 0; // if we have no unigram matches, score should be 0
-
+
float precision = 1.0;
float smooth = 1;
float smoothed_count, smoothed_matches;
-
+
if (m_sentence_bleu || m_simple_history_bleu) {
// Calculate geometric mean of modified ngram precisions
// BLEU = BP * exp(SUM_1_4 1/4 * log p_n)
// = BP * 4th root(PRODUCT_1_4 p_n)
for (size_t i = 0; i < BleuScoreState::bleu_order; i++) {
if (state->m_ngram_counts[i]) {
- smoothed_matches = state->m_ngram_matches[i];
- smoothed_count = state->m_ngram_counts[i];
-
- switch (m_smoothing_scheme) {
- case PLUS_ONE:
- default:
- if (i > 0) {
- // smoothing for all n > 1
- smoothed_matches += 1;
- smoothed_count += 1;
- }
- break;
- case PLUS_POINT_ONE:
- if (i > 0) {
- // smoothing for all n > 1
- smoothed_matches += 0.1;
- smoothed_count += 0.1;
- }
- break;
- case PAPINENI:
- if (state->m_ngram_matches[i] == 0) {
- smooth *= 0.5;
- smoothed_matches += smooth;
- smoothed_count += smooth;
- }
- break;
- }
-
- if (m_simple_history_bleu) {
- smoothed_matches += m_match_history[i];
- smoothed_count += m_count_history[i];
- }
+ smoothed_matches = state->m_ngram_matches[i];
+ smoothed_count = state->m_ngram_counts[i];
+
+ switch (m_smoothing_scheme) {
+ case PLUS_ONE:
+ default:
+ if (i > 0) {
+ // smoothing for all n > 1
+ smoothed_matches += 1;
+ smoothed_count += 1;
+ }
+ break;
+ case PLUS_POINT_ONE:
+ if (i > 0) {
+ // smoothing for all n > 1
+ smoothed_matches += 0.1;
+ smoothed_count += 0.1;
+ }
+ break;
+ case PAPINENI:
+ if (state->m_ngram_matches[i] == 0) {
+ smooth *= 0.5;
+ smoothed_matches += smooth;
+ smoothed_count += smooth;
+ }
+ break;
+ }
+
+ if (m_simple_history_bleu) {
+ smoothed_matches += m_match_history[i];
+ smoothed_count += m_count_history[i];
+ }
precision *= smoothed_matches/smoothed_count;
}
@@ -766,40 +779,35 @@ float BleuScoreFeature::CalculateBleu(BleuScoreState* state) const {
// r: effective reference length (sum of best match lengths for each candidate sentence)
if (m_simple_history_bleu) {
if ((m_target_length_history + state->m_target_length) < (m_ref_length_history + state->m_scaled_ref_length)) {
- float smoothed_target_length = m_target_length_history + state->m_target_length;
- float smoothed_ref_length = m_ref_length_history + state->m_scaled_ref_length;
- precision *= exp(1 - (smoothed_ref_length/smoothed_target_length));
+ float smoothed_target_length = m_target_length_history + state->m_target_length;
+ float smoothed_ref_length = m_ref_length_history + state->m_scaled_ref_length;
+ precision *= exp(1 - (smoothed_ref_length/smoothed_target_length));
}
- }
- else {
+ } else {
if (state->m_target_length < state->m_scaled_ref_length) {
- float target_length = state->m_target_length;
- float ref_length = state->m_scaled_ref_length;
- precision *= exp(1 - (ref_length/target_length));
+ float target_length = state->m_target_length;
+ float ref_length = state->m_scaled_ref_length;
+ precision *= exp(1 - (ref_length/target_length));
}
}
-
+
//cerr << "precision: " << precision << endl;
-
+
  // Approximate bleu score as in Chiang/Resnik is scaled by the size of the input:
// B(e;f,{r_k}) = (O_f + |f|) * BLEU(O + c(e;{r_k}))
// where c(e;) is a vector of reference length, ngram counts and ngram matches
if (m_scale_by_input_length) {
precision *= m_cur_norm_source_length;
- }
- else if (m_scale_by_avg_input_length) {
+ } else if (m_scale_by_avg_input_length) {
precision *= m_avg_input_length;
- }
- else if (m_scale_by_inverse_length) {
+ } else if (m_scale_by_inverse_length) {
    precision *= (100.0f/m_cur_norm_source_length);
- }
- else if (m_scale_by_avg_inverse_length) {
+ } else if (m_scale_by_avg_inverse_length) {
precision *= (100/m_avg_input_length);
}
-
+
return precision * m_scale_by_x;
- }
- else {
+ } else {
// Revised history BLEU: compute Bleu in the context of the pseudo-document
// B(b) = size_of_oracle_doc * (Bleu(B_hist + b) - Bleu(B_hist))
// Calculate geometric mean of modified ngram precisions
@@ -807,12 +815,12 @@ float BleuScoreFeature::CalculateBleu(BleuScoreState* state) const {
// = BP * 4th root(PRODUCT_1_4 p_n)
for (size_t i = 0; i < BleuScoreState::bleu_order; i++) {
if (state->m_ngram_counts[i]) {
- smoothed_matches = m_match_history[i] + state->m_ngram_matches[i] + 0.1;
- smoothed_count = m_count_history[i] + state->m_ngram_counts[i] + 0.1;
- precision *= smoothed_matches/smoothed_count;
+ smoothed_matches = m_match_history[i] + state->m_ngram_matches[i] + 0.1;
+ smoothed_count = m_count_history[i] + state->m_ngram_counts[i] + 0.1;
+ precision *= smoothed_matches/smoothed_count;
}
}
-
+
// take geometric mean
precision = pow(precision, (float)1/4);
@@ -826,25 +834,24 @@ float BleuScoreFeature::CalculateBleu(BleuScoreState* state) const {
float precision_pd = 1.0;
if (m_target_length_history > 0) {
for (size_t i = 0; i < BleuScoreState::bleu_order; i++)
- if (m_count_history[i] != 0)
- precision_pd *= (m_match_history[i] + 0.1)/(m_count_history[i] + 0.1);
-
+ if (m_count_history[i] != 0)
+ precision_pd *= (m_match_history[i] + 0.1)/(m_count_history[i] + 0.1);
+
// take geometric mean
precision_pd = pow(precision_pd, (float)1/4);
// Apply brevity penalty if applicable.
if (m_target_length_history < m_ref_length_history)
- precision_pd *= exp(1 - (m_ref_length_history/m_target_length_history));
- }
- else
+ precision_pd *= exp(1 - (m_ref_length_history/m_target_length_history));
+ } else
precision_pd = 0;
// **end BLEU of pseudo-document**
cerr << "precision pd: " << precision_pd << endl;
float sentence_impact;
- if (m_target_length_history > 0)
- sentence_impact = m_target_length_history * (precision - precision_pd);
+ if (m_target_length_history > 0)
+ sentence_impact = m_target_length_history * (precision - precision_pd);
else
sentence_impact = precision;
@@ -855,7 +862,7 @@ float BleuScoreFeature::CalculateBleu(BleuScoreState* state) const {
const FFState* BleuScoreFeature::EmptyHypothesisState(const InputType& input) const
{
- return new BleuScoreState();
+ return new BleuScoreState();
}
} // namespace.
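
Collecting the formulas from the comments in CalculateBleu above: with clipped matches m_n and counts c_n, candidate length c, and scaled reference length r, the sentence-level score under the default PLUS_ONE smoothing works out to (in LaTeX):

\[
\mathrm{BLEU} \;=\; \mathrm{BP}\cdot\exp\Bigl(\sum_{n=1}^{4}\tfrac{1}{4}\log p_n\Bigr)
\;=\; \mathrm{BP}\cdot\Bigl(\prod_{n=1}^{4} p_n\Bigr)^{1/4},
\qquad
p_1 = \frac{m_1}{c_1},\quad
p_n = \frac{m_n + 1}{c_n + 1}\ (n > 1),
\]
\[
\mathrm{BP} =
\begin{cases}
e^{\,1 - r/c} & \text{if } c < r,\\
1 & \text{otherwise.}
\end{cases}
\]

The scale_by_* options then multiply this by the (average, or inverse) input length, matching the approximate BLEU of Chiang/Resnik quoted in the comments, \(B(e;f,\{r_k\}) = (O_f + |f|)\cdot\mathrm{BLEU}(O + c(e;\{r_k\}))\).
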
diff --git a/moses/FF/BleuScoreFeature.h b/moses/FF/BleuScoreFeature.h
index dc4495506..96e273672 100644
--- a/moses/FF/BleuScoreFeature.h
+++ b/moses/FF/BleuScoreFeature.h
@@ -13,31 +13,33 @@
#include "moses/Phrase.h"
#include "moses/ChartHypothesis.h"
-namespace Moses {
+namespace Moses
+{
class BleuScoreFeature;
-class BleuScoreState : public FFState {
+class BleuScoreState : public FFState
+{
public:
- friend class BleuScoreFeature;
- static size_t bleu_order;
+ friend class BleuScoreFeature;
+ static size_t bleu_order;
- BleuScoreState();
- virtual int Compare(const FFState& other) const;
- void print(std::ostream& out) const;
+ BleuScoreState();
+ virtual int Compare(const FFState& other) const;
+ void print(std::ostream& out) const;
private:
- Phrase m_words;
- size_t m_source_length;
- size_t m_target_length;
+ Phrase m_words;
+ size_t m_source_length;
+ size_t m_target_length;
- // scaled reference length is needed for scoring incomplete hypotheses against reference translation
- float m_scaled_ref_length;
+ // scaled reference length is needed for scoring incomplete hypotheses against reference translation
+ float m_scaled_ref_length;
- std::vector< size_t > m_ngram_counts;
- std::vector< size_t > m_ngram_matches;
+ std::vector< size_t > m_ngram_counts;
+ std::vector< size_t > m_ngram_matches;
- void AddNgramCountAndMatches(std::vector< size_t >& counts, std::vector< size_t >& matches);
+ void AddNgramCountAndMatches(std::vector< size_t >& counts, std::vector< size_t >& matches);
};
@@ -56,7 +58,8 @@ public:
};
-class BleuScoreFeature : public StatefulFeatureFunction {
+class BleuScoreFeature : public StatefulFeatureFunction
+{
public:
typedef boost::unordered_map<size_t, RefValue > RefCounts;
@@ -64,95 +67,105 @@ public:
BleuScoreFeature(const std::string &line);
- void PrintHistory(std::ostream& out) const;
- void LoadReferences(const std::vector< std::vector< std::string > > &);
- void SetCurrSourceLength(size_t);
- void SetCurrNormSourceLength(size_t);
- void SetCurrShortestRefLength(size_t);
- void SetCurrAvgRefLength(size_t sent_id);
- void SetAvgInputLength (float l) { m_avg_input_length = l; }
- void SetCurrReferenceNgrams(size_t sent_id);
- size_t GetShortestRefIndex(size_t ref_id);
- size_t GetClosestRefLength(size_t ref_id, int hypoLength);
- void UpdateHistory(const std::vector< const Word* >&);
- void UpdateHistory(const std::vector< std::vector< const Word* > >& hypos, std::vector<size_t>& sourceLengths, std::vector<size_t>& ref_ids, size_t rank, size_t epoch);
- void PrintRefLength(const std::vector<size_t>& ref_ids);
- void SetBleuParameters(bool disable, bool sentenceBleu, bool scaleByInputLength, bool scaleByAvgInputLength,
- bool scaleByInverseLength, bool scaleByAvgInverseLength,
- float scaleByX, float historySmoothing, size_t scheme, bool simpleHistoryBleu);
-
- void GetNgramMatchCounts(Phrase&,
- const NGrams&,
- std::vector< size_t >&,
- std::vector< size_t >&,
- size_t skip = 0) const;
- void GetNgramMatchCounts_prefix(Phrase&,
- const NGrams&,
- std::vector< size_t >&,
- std::vector< size_t >&,
- size_t new_start_indices,
- size_t last_end_index) const;
- void GetNgramMatchCounts_overlap(Phrase& phrase,
- const NGrams& ref_ngram_counts,
- std::vector< size_t >& ret_counts,
- std::vector< size_t >& ret_matches,
- size_t overlap_index) const;
- void GetClippedNgramMatchesAndCounts(Phrase&,
- const NGrams&,
- std::vector< size_t >&,
- std::vector< size_t >&,
- size_t skip = 0) const;
-
- FFState* Evaluate( const Hypothesis& cur_hypo,
- const FFState* prev_state,
- ScoreComponentCollection* accumulator) const;
- FFState* EvaluateChart(const ChartHypothesis& cur_hypo,
- int featureID,
- ScoreComponentCollection* accumulator) const;
- bool Enabled() const { return m_enabled; }
- float CalculateBleu(BleuScoreState*) const;
- float CalculateBleu(Phrase translation) const;
- const FFState* EmptyHypothesisState(const InputType&) const;
-
- float GetSourceLengthHistory() { return m_source_length_history; }
- float GetTargetLengthHistory() { return m_target_length_history; }
- float GetAverageInputLength() { return m_avg_input_length; }
+ void PrintHistory(std::ostream& out) const;
+ void LoadReferences(const std::vector< std::vector< std::string > > &);
+ void SetCurrSourceLength(size_t);
+ void SetCurrNormSourceLength(size_t);
+ void SetCurrShortestRefLength(size_t);
+ void SetCurrAvgRefLength(size_t sent_id);
+ void SetAvgInputLength (float l) {
+ m_avg_input_length = l;
+ }
+ void SetCurrReferenceNgrams(size_t sent_id);
+ size_t GetShortestRefIndex(size_t ref_id);
+ size_t GetClosestRefLength(size_t ref_id, int hypoLength);
+ void UpdateHistory(const std::vector< const Word* >&);
+ void UpdateHistory(const std::vector< std::vector< const Word* > >& hypos, std::vector<size_t>& sourceLengths, std::vector<size_t>& ref_ids, size_t rank, size_t epoch);
+ void PrintRefLength(const std::vector<size_t>& ref_ids);
+ void SetBleuParameters(bool disable, bool sentenceBleu, bool scaleByInputLength, bool scaleByAvgInputLength,
+ bool scaleByInverseLength, bool scaleByAvgInverseLength,
+ float scaleByX, float historySmoothing, size_t scheme, bool simpleHistoryBleu);
+
+ void GetNgramMatchCounts(Phrase&,
+ const NGrams&,
+ std::vector< size_t >&,
+ std::vector< size_t >&,
+ size_t skip = 0) const;
+ void GetNgramMatchCounts_prefix(Phrase&,
+ const NGrams&,
+ std::vector< size_t >&,
+ std::vector< size_t >&,
+ size_t new_start_indices,
+ size_t last_end_index) const;
+ void GetNgramMatchCounts_overlap(Phrase& phrase,
+ const NGrams& ref_ngram_counts,
+ std::vector< size_t >& ret_counts,
+ std::vector< size_t >& ret_matches,
+ size_t overlap_index) const;
+ void GetClippedNgramMatchesAndCounts(Phrase&,
+ const NGrams&,
+ std::vector< size_t >&,
+ std::vector< size_t >&,
+ size_t skip = 0) const;
+
+ FFState* Evaluate( const Hypothesis& cur_hypo,
+ const FFState* prev_state,
+ ScoreComponentCollection* accumulator) const;
+ FFState* EvaluateChart(const ChartHypothesis& cur_hypo,
+ int featureID,
+ ScoreComponentCollection* accumulator) const;
+ bool Enabled() const {
+ return m_enabled;
+ }
+ float CalculateBleu(BleuScoreState*) const;
+ float CalculateBleu(Phrase translation) const;
+ const FFState* EmptyHypothesisState(const InputType&) const;
+
+ float GetSourceLengthHistory() {
+ return m_source_length_history;
+ }
+ float GetTargetLengthHistory() {
+ return m_target_length_history;
+ }
+ float GetAverageInputLength() {
+ return m_avg_input_length;
+ }
private:
- bool m_enabled;
- bool m_sentence_bleu;
- bool m_simple_history_bleu;
+ bool m_enabled;
+ bool m_sentence_bleu;
+ bool m_simple_history_bleu;
- // counts for pseudo-document
- std::vector< float > m_count_history;
- std::vector< float > m_match_history;
- float m_source_length_history;
- float m_target_length_history;
- float m_ref_length_history;
+ // counts for pseudo-document
+ std::vector< float > m_count_history;
+ std::vector< float > m_match_history;
+ float m_source_length_history;
+ float m_target_length_history;
+ float m_ref_length_history;
- size_t m_cur_source_length;
- size_t m_cur_norm_source_length; // length without <s>, </s>
- RefCounts m_refs;
- NGrams m_cur_ref_ngrams;
- float m_cur_ref_length;
+ size_t m_cur_source_length;
+ size_t m_cur_norm_source_length; // length without <s>, </s>
+ RefCounts m_refs;
+ NGrams m_cur_ref_ngrams;
+ float m_cur_ref_length;
- // scale BLEU score by history of input length
- bool m_scale_by_input_length;
- bool m_scale_by_avg_input_length;
+ // scale BLEU score by history of input length
+ bool m_scale_by_input_length;
+ bool m_scale_by_avg_input_length;
- // scale by the inverse of the input length * 100
- bool m_scale_by_inverse_length;
- bool m_scale_by_avg_inverse_length;
+ // scale by the inverse of the input length * 100
+ bool m_scale_by_inverse_length;
+ bool m_scale_by_avg_inverse_length;
- float m_avg_input_length;
+ float m_avg_input_length;
- float m_scale_by_x;
+ float m_scale_by_x;
- // smoothing factor for history counts
- float m_historySmoothing;
+ // smoothing factor for history counts
+ float m_historySmoothing;
- enum SmoothingScheme { PLUS_ONE = 1, PLUS_POINT_ONE = 2, PAPINENI = 3 };
- SmoothingScheme m_smoothing_scheme;
+ enum SmoothingScheme { PLUS_ONE = 1, PLUS_POINT_ONE = 2, PAPINENI = 3 };
+ SmoothingScheme m_smoothing_scheme;
};
} // Namespace.
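
The revised-history ("pseudo-document") branch of CalculateBleu earlier in this diff rewards a hypothesis by how much it moves the smoothed document-level precision, per the comment B(b) = size_of_oracle_doc * (Bleu(B_hist + b) - Bleu(B_hist)). A condensed sketch of that computation, with the brevity penalties omitted and the parameters mirroring the m_match_history / m_count_history members:

#include <cmath>
#include <vector>

// Smoothed 4-gram precision of the pseudo-document with and without the
// current sentence's statistics; returns the sentence impact
// target_length_history * (Bleu(hist + b) - Bleu(hist)).
float SentenceImpact(const std::vector<float>& match_history,
                     const std::vector<float>& count_history,
                     const std::vector<size_t>& ngram_matches,
                     const std::vector<size_t>& ngram_counts,
                     float target_length_history)
{
  float precision = 1.0f;     // pseudo-document plus current sentence
  float precision_pd = 1.0f;  // pseudo-document alone
  for (size_t i = 0; i < 4; ++i) {
    if (ngram_counts[i])
      precision *= (match_history[i] + ngram_matches[i] + 0.1f)
                   / (count_history[i] + ngram_counts[i] + 0.1f);
    if (count_history[i] != 0)
      precision_pd *= (match_history[i] + 0.1f) / (count_history[i] + 0.1f);
  }
  precision = std::pow(precision, 0.25f);  // geometric mean
  if (target_length_history > 0) {
    precision_pd = std::pow(precision_pd, 0.25f);
    return target_length_history * (precision - precision_pd);
  }
  return precision;  // empty history: fall back to the sentence precision
}
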
diff --git a/moses/FF/ChartBasedFeatureContext.cpp b/moses/FF/ChartBasedFeatureContext.cpp
index 803f81deb..a74cce50c 100644
--- a/moses/FF/ChartBasedFeatureContext.cpp
+++ b/moses/FF/ChartBasedFeatureContext.cpp
@@ -5,15 +5,15 @@
namespace Moses
{
ChartBasedFeatureContext::ChartBasedFeatureContext
- (const ChartHypothesis* hypothesis):
+(const ChartHypothesis* hypothesis):
m_hypothesis(hypothesis),
m_targetPhrase(hypothesis->GetCurrTargetPhrase()),
m_source(hypothesis->GetManager().GetSource())
{}
ChartBasedFeatureContext::ChartBasedFeatureContext(
- const TargetPhrase& targetPhrase,
- const InputType& source):
+ const TargetPhrase& targetPhrase,
+ const InputType& source):
m_hypothesis(NULL),
m_targetPhrase(targetPhrase),
m_source(source)
diff --git a/moses/FF/ChartBasedFeatureContext.h b/moses/FF/ChartBasedFeatureContext.h
index 7649effde..a204f7c77 100644
--- a/moses/FF/ChartBasedFeatureContext.h
+++ b/moses/FF/ChartBasedFeatureContext.h
@@ -11,7 +11,7 @@ class TargetPhrase;
**/
class ChartBasedFeatureContext
{
- //The context either has a hypothesis (during search) or a
+ //The context either has a hypothesis (during search) or a
//TargetPhrase and source sentence (during pre-calculation)
//TODO: should the context also include some info on where the TargetPhrase
//is anchored (assuming it's lexicalised), which is available at pre-calc?
@@ -24,11 +24,13 @@ public:
ChartBasedFeatureContext(const TargetPhrase& targetPhrase,
const InputType& source);
- const InputType& GetSource() const
- { return m_source; }
+ const InputType& GetSource() const {
+ return m_source;
+ }
- const TargetPhrase& GetTargetPhrase() const
- { return m_targetPhrase; }
+ const TargetPhrase& GetTargetPhrase() const {
+ return m_targetPhrase;
+ }
};
diff --git a/moses/FF/DistortionScoreProducer.cpp b/moses/FF/DistortionScoreProducer.cpp
index 413679779..328c833c8 100644
--- a/moses/FF/DistortionScoreProducer.cpp
+++ b/moses/FF/DistortionScoreProducer.cpp
@@ -39,8 +39,7 @@ float DistortionScoreProducer::CalculateDistortionScore(const Hypothesis& hypo,
{
if(!StaticData::Instance().UseEarlyDistortionCost()) {
return - (float) hypo.GetInput().ComputeDistortionDistance(prev, curr);
- }
- else {
+ } else {
/* Pay distortion score as soon as possible, from Moore and Quirk MT Summit 2007
Definitions:
S : current source range
@@ -50,23 +49,23 @@ float DistortionScoreProducer::CalculateDistortionScore(const Hypothesis& hypo,
int prefixEndPos = (int)FirstGap-1;
if((int)FirstGap==-1)
- prefixEndPos = -1;
+ prefixEndPos = -1;
// case1: S is adjacent to S'' => return 0
if ((int) curr.GetStartPos() == prefixEndPos+1) {
- IFVERBOSE(4) std::cerr<< "MQ07disto:case1" << std::endl;
+ IFVERBOSE(4) std::cerr<< "MQ07disto:case1" << std::endl;
return 0;
}
// case2: S is to the left of S' => return 2(length(S))
if ((int) curr.GetEndPos() < (int) prev.GetEndPos()) {
- IFVERBOSE(4) std::cerr<< "MQ07disto:case2" << std::endl;
+ IFVERBOSE(4) std::cerr<< "MQ07disto:case2" << std::endl;
return (float) -2*(int)curr.GetNumWordsCovered();
}
// case3: S' is a subsequence of S'' => return 2(nbWordBetween(S,S'')+length(S))
if ((int) prev.GetEndPos() <= prefixEndPos) {
- IFVERBOSE(4) std::cerr<< "MQ07disto:case3" << std::endl;
+ IFVERBOSE(4) std::cerr<< "MQ07disto:case3" << std::endl;
int z = (int)curr.GetStartPos()-prefixEndPos - 1;
return (float) -2*(z + (int)curr.GetNumWordsCovered());
}
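
The three numbered cases above implement the early distortion cost of Moore and Quirk (MT Summit 2007): reordering is charged as soon as it becomes detectable rather than when a coverage gap is finally closed. A condensed restatement as a free function (positions are word indices; the remaining case, handled by code outside this hunk, is left as a placeholder):

// S: current range [currStart, currEnd] of length currLen; S': previously
// translated range ending at prevEnd; S'': covered prefix ending at prefixEnd.
float EarlyDistortionCost(int currStart, int currEnd, int currLen,
                          int prevEnd, int prefixEnd)
{
  if (currStart == prefixEnd + 1)
    return 0.0f;                        // case 1: S adjacent to S''
  if (currEnd < prevEnd)
    return -2.0f * currLen;             // case 2: S entirely left of S'
  if (prevEnd <= prefixEnd) {           // case 3: S' inside S''
    int z = currStart - prefixEnd - 1;  // words between S'' and S
    return -2.0f * (z + currLen);
  }
  return 0.0f;  // remaining case: see the code beyond this hunk
}
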
diff --git a/moses/FF/DistortionScoreProducer.h b/moses/FF/DistortionScoreProducer.h
index 394e7f2e1..2601e6398 100644
--- a/moses/FF/DistortionScoreProducer.h
+++ b/moses/FF/DistortionScoreProducer.h
@@ -17,12 +17,12 @@ class WordsRange;
class DistortionScoreProducer : public StatefulFeatureFunction
{
public:
- DistortionScoreProducer(const std::string &line)
- : StatefulFeatureFunction("Distortion", 1, line)
- {}
+ DistortionScoreProducer(const std::string &line)
+ : StatefulFeatureFunction("Distortion", 1, line)
+ {}
static float CalculateDistortionScore(const Hypothesis& hypo,
- const WordsRange &prev, const WordsRange &curr, const int FirstGapPosition);
+ const WordsRange &prev, const WordsRange &curr, const int FirstGapPosition);
virtual const FFState* EmptyHypothesisState(const InputType &input) const;
@@ -35,8 +35,8 @@ public:
const ChartHypothesis& /* cur_hypo */,
int /* featureID - used to index the state in the previous hypotheses */,
ScoreComponentCollection*) const {
- throw std::logic_error("DistortionScoreProducer not supported in chart decoder, yet");
- }
+ throw std::logic_error("DistortionScoreProducer not supported in chart decoder, yet");
+ }
};
}
diff --git a/moses/FF/FFState.h b/moses/FF/FFState.h
index 49b0e55a8..bb3a119ef 100644
--- a/moses/FF/FFState.h
+++ b/moses/FF/FFState.h
@@ -15,11 +15,12 @@ public:
virtual int Compare(const FFState& other) const = 0;
};
-class DummyState : public FFState {
+class DummyState : public FFState
+{
public:
DummyState() {}
int Compare(const FFState& other) const {
- return 0;
+ return 0;
}
};
diff --git a/moses/FF/FeatureFunction.cpp b/moses/FF/FeatureFunction.cpp
index d1a73e1a9..ea4441522 100644
--- a/moses/FF/FeatureFunction.cpp
+++ b/moses/FF/FeatureFunction.cpp
@@ -19,7 +19,7 @@ std::vector<const StatelessFeatureFunction*> StatelessFeatureFunction::m_statele
std::vector<const StatefulFeatureFunction*> StatefulFeatureFunction::m_statefulFFs;
FeatureFunction::FeatureFunction(const std::string& description, const std::string &line)
-: m_tuneable(true)
+ : m_tuneable(true)
{
ParseLine(description, line);
@@ -35,13 +35,13 @@ FeatureFunction::FeatureFunction(const std::string& description, const std::stri
m_description = dstream.str();
}
- ScoreComponentCollection::RegisterScoreProducer(this);
+ ScoreComponentCollection::RegisterScoreProducer(this);
m_producers.push_back(this);
}
FeatureFunction::FeatureFunction(const std::string& description, size_t numScoreComponents, const std::string &line)
-: m_numScoreComponents(numScoreComponents)
-, m_tuneable(true)
+ : m_numScoreComponents(numScoreComponents)
+ , m_tuneable(true)
{
ParseLine(description, line);
@@ -75,14 +75,11 @@ void FeatureFunction::ParseLine(const std::string& description, const std::strin
if (args[0] == "num-features") {
m_numScoreComponents = Scan<size_t>(args[1]);
- }
- else if (args[0] == "name") {
+ } else if (args[0] == "name") {
m_description = args[1];
- }
- else if (args[0] == "tuneable") {
+ } else if (args[0] == "tuneable") {
m_tuneable = Scan<bool>(args[1]);
- }
- else {
+ } else {
m_args.push_back(args);
}
}
diff --git a/moses/FF/FeatureFunction.h b/moses/FF/FeatureFunction.h
index 6e1fa67a8..97e7d754d 100644
--- a/moses/FF/FeatureFunction.h
+++ b/moses/FF/FeatureFunction.h
@@ -42,26 +42,33 @@ protected:
void ParseLine(const std::string& description, const std::string &line);
public:
- static const std::vector<FeatureFunction*>& GetFeatureFunctions() { return m_producers; }
+ static const std::vector<FeatureFunction*>& GetFeatureFunctions() {
+ return m_producers;
+ }
FeatureFunction(const std::string& description, const std::string &line);
FeatureFunction(const std::string& description, size_t numScoreComponents, const std::string &line);
- virtual bool IsStateless() const = 0;
+ virtual bool IsStateless() const = 0;
virtual ~FeatureFunction();
-
+
static void ResetDescriptionCounts() {
description_counts.clear();
}
//! returns the number of scores that a subclass produces.
//! For example, a language model conventionally produces 1, a translation table some arbitrary number, etc
- size_t GetNumScoreComponents() const {return m_numScoreComponents;}
+ size_t GetNumScoreComponents() const {
+ return m_numScoreComponents;
+ }
//! returns a string description of this producer
- const std::string& GetScoreProducerDescription() const
- { return m_description; }
+ const std::string& GetScoreProducerDescription() const {
+ return m_description;
+ }
- virtual bool IsTuneable() const { return m_tuneable; }
+ virtual bool IsTuneable() const {
+ return m_tuneable;
+ }
//!
virtual void InitializeForInput(InputType const& source)
@@ -71,17 +78,18 @@ public:
virtual void CleanUpAfterSentenceProcessing(const InputType& source)
{}
- const std::string &GetArgLine() const
- { return m_argLine; }
+ const std::string &GetArgLine() const {
+ return m_argLine;
+ }
virtual void Evaluate(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const
{}
virtual void Evaluate(const InputType &source
- , ScoreComponentCollection &scoreBreakdown) const
+ , ScoreComponentCollection &scoreBreakdown) const
{}
};
diff --git a/moses/FF/GlobalLexicalModel.cpp b/moses/FF/GlobalLexicalModel.cpp
index 5724f6598..cbc6811ee 100644
--- a/moses/FF/GlobalLexicalModel.cpp
+++ b/moses/FF/GlobalLexicalModel.cpp
@@ -10,7 +10,7 @@ using namespace std;
namespace Moses
{
GlobalLexicalModel::GlobalLexicalModel(const std::string &line)
-: StatelessFeatureFunction("GlobalLexicalModel",1, line)
+ : StatelessFeatureFunction("GlobalLexicalModel",1, line)
{
std::cerr << "Creating global lexical model...\n";
@@ -23,14 +23,11 @@ GlobalLexicalModel::GlobalLexicalModel(const std::string &line)
if (args[0] == "file") {
CHECK(args.size() == 2);
filePath = args[1];
- }
- else if (args[0] == "inputFactors") {
+ } else if (args[0] == "inputFactors") {
inputFactors = Tokenize<FactorType>(args[1],",");
- }
- else if (args[0] == "outputFactors") {
+ } else if (args[0] == "outputFactors") {
outputFactors = Tokenize<FactorType>(args[1],",");
- }
- else {
+ } else {
throw "Unknown argument " + args[0];
}
}
@@ -179,11 +176,11 @@ float GlobalLexicalModel::GetFromCacheOrScorePhrase( const TargetPhrase& targetP
}
void GlobalLexicalModel::Evaluate
- (const PhraseBasedFeatureContext& context,
- ScoreComponentCollection* accumulator) const
+(const PhraseBasedFeatureContext& context,
+ ScoreComponentCollection* accumulator) const
{
- accumulator->PlusEquals( this,
- GetFromCacheOrScorePhrase(context.GetTargetPhrase()) );
+ accumulator->PlusEquals( this,
+ GetFromCacheOrScorePhrase(context.GetTargetPhrase()) );
}
}
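
Evaluate above defers to GetFromCacheOrScorePhrase, whose body lies outside this hunk; judging from the thread-local LexiconCache declared in the header below, it memoizes the score per target phrase for the current input sentence. A generic sketch of that memoization pattern, with a placeholder ScorePhrase standing in for the real lookup:

#include <map>
#include <utility>

// Hypothetical stand-ins; in Moses the cache is keyed by TargetPhrase pointer.
struct TargetPhrase {};
typedef std::map<const TargetPhrase*, float> LexiconCache;

float ScorePhrase(const TargetPhrase&) { return 0.0f; }  // placeholder

float GetFromCacheOrScore(LexiconCache& cache, const TargetPhrase& tp)
{
  LexiconCache::const_iterator it = cache.find(&tp);
  if (it != cache.end())
    return it->second;                       // hit: reuse the cached score
  float score = ScorePhrase(tp);             // miss: compute once ...
  cache.insert(std::make_pair(&tp, score));  // ... and remember it
  return score;
}
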
diff --git a/moses/FF/GlobalLexicalModel.h b/moses/FF/GlobalLexicalModel.h
index 03659b7f2..b3bf79b53 100644
--- a/moses/FF/GlobalLexicalModel.h
+++ b/moses/FF/GlobalLexicalModel.h
@@ -37,8 +37,7 @@ class GlobalLexicalModel : public StatelessFeatureFunction
typedef std::map< const Word*, float, WordComparer > SingleHash;
typedef std::map< const TargetPhrase*, float > LexiconCache;
- struct ThreadLocalStorage
- {
+ struct ThreadLocalStorage {
LexiconCache cache;
const Sentence *input;
};
@@ -64,18 +63,17 @@ private:
public:
GlobalLexicalModel(const std::string &line);
- virtual ~GlobalLexicalModel();
+ virtual ~GlobalLexicalModel();
void InitializeForInput( Sentence const& in );
void Evaluate(const PhraseBasedFeatureContext& context,
- ScoreComponentCollection* accumulator) const;
+ ScoreComponentCollection* accumulator) const;
void EvaluateChart(
const ChartBasedFeatureContext& context,
- ScoreComponentCollection* accumulator) const
- {
+ ScoreComponentCollection* accumulator) const {
throw std::logic_error("GlobalLexicalModel not supported in chart decoder, yet");
}
diff --git a/moses/FF/GlobalLexicalModelUnlimited.cpp b/moses/FF/GlobalLexicalModelUnlimited.cpp
index d4b1aeb37..5c096e43f 100644
--- a/moses/FF/GlobalLexicalModelUnlimited.cpp
+++ b/moses/FF/GlobalLexicalModelUnlimited.cpp
@@ -10,7 +10,7 @@ using namespace std;
namespace Moses
{
GlobalLexicalModelUnlimited::GlobalLexicalModelUnlimited(const std::string &line)
-:StatelessFeatureFunction("GlobalLexicalModelUnlimited", 0, line)
+ :StatelessFeatureFunction("GlobalLexicalModelUnlimited", 0, line)
{
const vector<string> modelSpec = Tokenize(line);
@@ -25,7 +25,7 @@ GlobalLexicalModelUnlimited::GlobalLexicalModelUnlimited(const std::string &line
if (spec.size() > 0) {
if (spec.size() != 2 && spec.size() != 3 && spec.size() != 4 && spec.size() != 6) {
UserMessage::Add("Format of glm feature is <factor-src>-<factor-tgt> [ignore-punct] [use-bias] "
- "[context-type] [filename-src filename-tgt]");
+ "[context-type] [filename-src filename-tgt]");
//return false;
}
@@ -41,8 +41,7 @@ GlobalLexicalModelUnlimited::GlobalLexicalModelUnlimited(const std::string &line
filenameTarget = spec[5];
restricted = true;
}
- }
- else
+ } else
factors = Tokenize(modelSpec[i],"-");
if ( factors.size() != 2 ) {
@@ -66,14 +65,13 @@ GlobalLexicalModelUnlimited::GlobalLexicalModelUnlimited(const std::string &line
}
bool GlobalLexicalModelUnlimited::Load(const std::string &filePathSource,
- const std::string &filePathTarget)
+ const std::string &filePathTarget)
{
// restricted source word vocabulary
ifstream inFileSource(filePathSource.c_str());
- if (!inFileSource)
- {
- cerr << "could not open file " << filePathSource << endl;
- return false;
+ if (!inFileSource) {
+ cerr << "could not open file " << filePathSource << endl;
+ return false;
}
std::string line;
@@ -85,10 +83,9 @@ bool GlobalLexicalModelUnlimited::Load(const std::string &filePathSource,
// restricted target word vocabulary
ifstream inFileTarget(filePathTarget.c_str());
- if (!inFileTarget)
- {
- cerr << "could not open file " << filePathTarget << endl;
- return false;
+ if (!inFileTarget) {
+ cerr << "could not open file " << filePathTarget << endl;
+ return false;
}
while (getline(inFileTarget, line)) {
@@ -109,228 +106,222 @@ void GlobalLexicalModelUnlimited::InitializeForInput( Sentence const& in )
void GlobalLexicalModelUnlimited::Evaluate(const Hypothesis& cur_hypo, ScoreComponentCollection* accumulator) const
{
- const Sentence& input = *(m_local->input);
- const TargetPhrase& targetPhrase = cur_hypo.GetCurrTargetPhrase();
-
- for(int targetIndex = 0; targetIndex < targetPhrase.GetSize(); targetIndex++ ) {
- StringPiece targetString = targetPhrase.GetWord(targetIndex).GetString(0); // TODO: change for other factors
-
- if (m_ignorePunctuation) {
- // check if first char is punctuation
- char firstChar = targetString[0];
- CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
- if(charIterator != m_punctuationHash.end())
- continue;
- }
-
- if (m_biasFeature) {
- stringstream feature;
- feature << "glm_";
- feature << targetString;
- feature << "~";
- feature << "**BIAS**";
- accumulator->SparsePlusEquals(feature.str(), 1);
- }
-
- boost::unordered_set<uint64_t> alreadyScored;
- for(int sourceIndex = 0; sourceIndex < input.GetSize(); sourceIndex++ ) {
- const StringPiece sourceString = input.GetWord(sourceIndex).GetString(0); // TODO: change for other factors
-
- if (m_ignorePunctuation) {
- // check if first char is punctuation
- char firstChar = sourceString[0];
- CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
- if(charIterator != m_punctuationHash.end())
- continue;
- }
- const uint64_t sourceHash = util::MurmurHashNative(sourceString.data(), sourceString.size());
-
- if ( alreadyScored.find(sourceHash) == alreadyScored.end()) {
- bool sourceExists, targetExists;
- if (!m_unrestricted) {
- sourceExists = FindStringPiece(m_vocabSource, sourceString ) != m_vocabSource.end();
- targetExists = FindStringPiece(m_vocabTarget, targetString) != m_vocabTarget.end();
- }
-
- // no feature if vocab is in use and both words are not in restricted vocabularies
- if (m_unrestricted || (sourceExists && targetExists)) {
- if (m_sourceContext) {
- if (sourceIndex == 0) {
- // add <s> trigger feature for source
- stringstream feature;
- feature << "glm_";
- feature << targetString;
- feature << "~";
- feature << "<s>,";
- feature << sourceString;
- accumulator->SparsePlusEquals(feature.str(), 1);
- alreadyScored.insert(sourceHash);
- }
-
- // add source words to the right of current source word as context
- for(int contextIndex = sourceIndex+1; contextIndex < input.GetSize(); contextIndex++ ) {
- StringPiece contextString = input.GetWord(contextIndex).GetString(0); // TODO: change for other factors
- bool contextExists;
- if (!m_unrestricted)
- contextExists = FindStringPiece(m_vocabSource, contextString ) != m_vocabSource.end();
-
- if (m_unrestricted || contextExists) {
- stringstream feature;
- feature << "glm_";
- feature << targetString;
- feature << "~";
- feature << sourceString;
- feature << ",";
- feature << contextString;
- accumulator->SparsePlusEquals(feature.str(), 1);
- alreadyScored.insert(sourceHash);
- }
- }
- }
- else if (m_biphrase) {
- // --> look backwards for constructing context
- int globalTargetIndex = cur_hypo.GetSize() - targetPhrase.GetSize() + targetIndex;
-
- // 1) source-target pair, trigger source word (can be discont.) and adjacent target word (bigram)
- StringPiece targetContext;
- if (globalTargetIndex > 0)
- targetContext = cur_hypo.GetWord(globalTargetIndex-1).GetString(0); // TODO: change for other factors
- else
- targetContext = "<s>";
-
- if (sourceIndex == 0) {
- StringPiece sourceTrigger = "<s>";
- AddFeature(accumulator, sourceTrigger, sourceString,
- targetContext, targetString);
- }
- else
- for(int contextIndex = sourceIndex-1; contextIndex >= 0; contextIndex-- ) {
- StringPiece sourceTrigger = input.GetWord(contextIndex).GetString(0); // TODO: change for other factors
- bool sourceTriggerExists = false;
- if (!m_unrestricted)
- sourceTriggerExists = FindStringPiece(m_vocabSource, sourceTrigger ) != m_vocabSource.end();
-
- if (m_unrestricted || sourceTriggerExists)
- AddFeature(accumulator, sourceTrigger, sourceString,
- targetContext, targetString);
- }
-
- // 2) source-target pair, adjacent source word (bigram) and trigger target word (can be discont.)
- StringPiece sourceContext;
- if (sourceIndex-1 >= 0)
- sourceContext = input.GetWord(sourceIndex-1).GetString(0); // TODO: change for other factors
- else
- sourceContext = "<s>";
-
- if (globalTargetIndex == 0) {
- string targetTrigger = "<s>";
- AddFeature(accumulator, sourceContext, sourceString,
- targetTrigger, targetString);
- }
- else
- for(int globalContextIndex = globalTargetIndex-1; globalContextIndex >= 0; globalContextIndex-- ) {
- StringPiece targetTrigger = cur_hypo.GetWord(globalContextIndex).GetString(0); // TODO: change for other factors
- bool targetTriggerExists = false;
- if (!m_unrestricted)
- targetTriggerExists = FindStringPiece(m_vocabTarget, targetTrigger ) != m_vocabTarget.end();
-
- if (m_unrestricted || targetTriggerExists)
- AddFeature(accumulator, sourceContext, sourceString,
- targetTrigger, targetString);
- }
- }
- else if (m_bitrigger) {
- // allow additional discont. triggers on both sides
- int globalTargetIndex = cur_hypo.GetSize() - targetPhrase.GetSize() + targetIndex;
-
- if (sourceIndex == 0) {
- StringPiece sourceTrigger = "<s>";
- bool sourceTriggerExists = true;
-
- if (globalTargetIndex == 0) {
- string targetTrigger = "<s>";
- bool targetTriggerExists = true;
-
- if (m_unrestricted || (sourceTriggerExists && targetTriggerExists))
- AddFeature(accumulator, sourceTrigger, sourceString,
- targetTrigger, targetString);
- }
- else {
- // iterate backwards over target
- for(int globalContextIndex = globalTargetIndex-1; globalContextIndex >= 0; globalContextIndex-- ) {
- StringPiece targetTrigger = cur_hypo.GetWord(globalContextIndex).GetString(0); // TODO: change for other factors
- bool targetTriggerExists = false;
- if (!m_unrestricted)
- targetTriggerExists = FindStringPiece(m_vocabTarget, targetTrigger ) != m_vocabTarget.end();
-
- if (m_unrestricted || (sourceTriggerExists && targetTriggerExists))
- AddFeature(accumulator, sourceTrigger, sourceString,
- targetTrigger, targetString);
- }
- }
- }
- // iterate over both source and target
- else {
- // iterate backwards over source
- for(int contextIndex = sourceIndex-1; contextIndex >= 0; contextIndex-- ) {
- StringPiece sourceTrigger = input.GetWord(contextIndex).GetString(0); // TODO: change for other factors
- bool sourceTriggerExists = false;
- if (!m_unrestricted)
- sourceTriggerExists = FindStringPiece(m_vocabSource, sourceTrigger ) != m_vocabSource.end();
-
- if (globalTargetIndex == 0) {
- string targetTrigger = "<s>";
- bool targetTriggerExists = true;
-
- if (m_unrestricted || (sourceTriggerExists && targetTriggerExists))
- AddFeature(accumulator, sourceTrigger, sourceString,
- targetTrigger, targetString);
- }
- else {
- // iterate backwards over target
- for(int globalContextIndex = globalTargetIndex-1; globalContextIndex >= 0; globalContextIndex-- ) {
- StringPiece targetTrigger = cur_hypo.GetWord(globalContextIndex).GetString(0); // TODO: change for other factors
- bool targetTriggerExists = false;
- if (!m_unrestricted)
- targetTriggerExists = FindStringPiece(m_vocabTarget, targetTrigger ) != m_vocabTarget.end();
-
- if (m_unrestricted || (sourceTriggerExists && targetTriggerExists))
- AddFeature(accumulator, sourceTrigger, sourceString,
- targetTrigger, targetString);
- }
- }
- }
- }
- }
- else {
- stringstream feature;
- feature << "glm_";
- feature << targetString;
- feature << "~";
- feature << sourceString;
- accumulator->SparsePlusEquals(feature.str(), 1);
- alreadyScored.insert(sourceHash);
-
- }
- }
- }
- }
+ const Sentence& input = *(m_local->input);
+ const TargetPhrase& targetPhrase = cur_hypo.GetCurrTargetPhrase();
+
+ for(int targetIndex = 0; targetIndex < targetPhrase.GetSize(); targetIndex++ ) {
+ StringPiece targetString = targetPhrase.GetWord(targetIndex).GetString(0); // TODO: change for other factors
+
+ if (m_ignorePunctuation) {
+ // check if first char is punctuation
+ char firstChar = targetString[0];
+ CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
+ if(charIterator != m_punctuationHash.end())
+ continue;
+ }
+
+ if (m_biasFeature) {
+ stringstream feature;
+ feature << "glm_";
+ feature << targetString;
+ feature << "~";
+ feature << "**BIAS**";
+ accumulator->SparsePlusEquals(feature.str(), 1);
+ }
+
+ boost::unordered_set<uint64_t> alreadyScored;
+ for(int sourceIndex = 0; sourceIndex < input.GetSize(); sourceIndex++ ) {
+ const StringPiece sourceString = input.GetWord(sourceIndex).GetString(0); // TODO: change for other factors
+
+ if (m_ignorePunctuation) {
+ // check if first char is punctuation
+ char firstChar = sourceString[0];
+ CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
+ if(charIterator != m_punctuationHash.end())
+ continue;
+ }
+ const uint64_t sourceHash = util::MurmurHashNative(sourceString.data(), sourceString.size());
+
+ if ( alreadyScored.find(sourceHash) == alreadyScored.end()) {
+ bool sourceExists, targetExists;
+ if (!m_unrestricted) {
+ sourceExists = FindStringPiece(m_vocabSource, sourceString ) != m_vocabSource.end();
+ targetExists = FindStringPiece(m_vocabTarget, targetString) != m_vocabTarget.end();
+ }
+
+ // no feature if vocab is in use and both words are not in restricted vocabularies
+ if (m_unrestricted || (sourceExists && targetExists)) {
+ if (m_sourceContext) {
+ if (sourceIndex == 0) {
+ // add <s> trigger feature for source
+ stringstream feature;
+ feature << "glm_";
+ feature << targetString;
+ feature << "~";
+ feature << "<s>,";
+ feature << sourceString;
+ accumulator->SparsePlusEquals(feature.str(), 1);
+ alreadyScored.insert(sourceHash);
+ }
+
+ // add source words to the right of current source word as context
+ for(int contextIndex = sourceIndex+1; contextIndex < input.GetSize(); contextIndex++ ) {
+ StringPiece contextString = input.GetWord(contextIndex).GetString(0); // TODO: change for other factors
+ bool contextExists;
+ if (!m_unrestricted)
+ contextExists = FindStringPiece(m_vocabSource, contextString ) != m_vocabSource.end();
+
+ if (m_unrestricted || contextExists) {
+ stringstream feature;
+ feature << "glm_";
+ feature << targetString;
+ feature << "~";
+ feature << sourceString;
+ feature << ",";
+ feature << contextString;
+ accumulator->SparsePlusEquals(feature.str(), 1);
+ alreadyScored.insert(sourceHash);
+ }
+ }
+ } else if (m_biphrase) {
+ // --> look backwards for constructing context
+ int globalTargetIndex = cur_hypo.GetSize() - targetPhrase.GetSize() + targetIndex;
+
+ // 1) source-target pair, trigger source word (can be discont.) and adjacent target word (bigram)
+ StringPiece targetContext;
+ if (globalTargetIndex > 0)
+ targetContext = cur_hypo.GetWord(globalTargetIndex-1).GetString(0); // TODO: change for other factors
+ else
+ targetContext = "<s>";
+
+ if (sourceIndex == 0) {
+ StringPiece sourceTrigger = "<s>";
+ AddFeature(accumulator, sourceTrigger, sourceString,
+ targetContext, targetString);
+ } else
+ for(int contextIndex = sourceIndex-1; contextIndex >= 0; contextIndex-- ) {
+ StringPiece sourceTrigger = input.GetWord(contextIndex).GetString(0); // TODO: change for other factors
+ bool sourceTriggerExists = false;
+ if (!m_unrestricted)
+ sourceTriggerExists = FindStringPiece(m_vocabSource, sourceTrigger ) != m_vocabSource.end();
+
+ if (m_unrestricted || sourceTriggerExists)
+ AddFeature(accumulator, sourceTrigger, sourceString,
+ targetContext, targetString);
+ }
+
+ // 2) source-target pair, adjacent source word (bigram) and trigger target word (can be discont.)
+ StringPiece sourceContext;
+ if (sourceIndex-1 >= 0)
+ sourceContext = input.GetWord(sourceIndex-1).GetString(0); // TODO: change for other factors
+ else
+ sourceContext = "<s>";
+
+ if (globalTargetIndex == 0) {
+ string targetTrigger = "<s>";
+ AddFeature(accumulator, sourceContext, sourceString,
+ targetTrigger, targetString);
+ } else
+ for(int globalContextIndex = globalTargetIndex-1; globalContextIndex >= 0; globalContextIndex-- ) {
+ StringPiece targetTrigger = cur_hypo.GetWord(globalContextIndex).GetString(0); // TODO: change for other factors
+ bool targetTriggerExists = false;
+ if (!m_unrestricted)
+ targetTriggerExists = FindStringPiece(m_vocabTarget, targetTrigger ) != m_vocabTarget.end();
+
+ if (m_unrestricted || targetTriggerExists)
+ AddFeature(accumulator, sourceContext, sourceString,
+ targetTrigger, targetString);
+ }
+ } else if (m_bitrigger) {
+ // allow additional discont. triggers on both sides
+ int globalTargetIndex = cur_hypo.GetSize() - targetPhrase.GetSize() + targetIndex;
+
+ if (sourceIndex == 0) {
+ StringPiece sourceTrigger = "<s>";
+ bool sourceTriggerExists = true;
+
+ if (globalTargetIndex == 0) {
+ string targetTrigger = "<s>";
+ bool targetTriggerExists = true;
+
+ if (m_unrestricted || (sourceTriggerExists && targetTriggerExists))
+ AddFeature(accumulator, sourceTrigger, sourceString,
+ targetTrigger, targetString);
+ } else {
+ // iterate backwards over target
+ for(int globalContextIndex = globalTargetIndex-1; globalContextIndex >= 0; globalContextIndex-- ) {
+ StringPiece targetTrigger = cur_hypo.GetWord(globalContextIndex).GetString(0); // TODO: change for other factors
+ bool targetTriggerExists = false;
+ if (!m_unrestricted)
+ targetTriggerExists = FindStringPiece(m_vocabTarget, targetTrigger ) != m_vocabTarget.end();
+
+ if (m_unrestricted || (sourceTriggerExists && targetTriggerExists))
+ AddFeature(accumulator, sourceTrigger, sourceString,
+ targetTrigger, targetString);
+ }
+ }
+ }
+ // iterate over both source and target
+ else {
+ // iterate backwards over source
+ for(int contextIndex = sourceIndex-1; contextIndex >= 0; contextIndex-- ) {
+ StringPiece sourceTrigger = input.GetWord(contextIndex).GetString(0); // TODO: change for other factors
+ bool sourceTriggerExists = false;
+ if (!m_unrestricted)
+ sourceTriggerExists = FindStringPiece(m_vocabSource, sourceTrigger ) != m_vocabSource.end();
+
+ if (globalTargetIndex == 0) {
+ string targetTrigger = "<s>";
+ bool targetTriggerExists = true;
+
+ if (m_unrestricted || (sourceTriggerExists && targetTriggerExists))
+ AddFeature(accumulator, sourceTrigger, sourceString,
+ targetTrigger, targetString);
+ } else {
+ // iterate backwards over target
+ for(int globalContextIndex = globalTargetIndex-1; globalContextIndex >= 0; globalContextIndex-- ) {
+ StringPiece targetTrigger = cur_hypo.GetWord(globalContextIndex).GetString(0); // TODO: change for other factors
+ bool targetTriggerExists = false;
+ if (!m_unrestricted)
+ targetTriggerExists = FindStringPiece(m_vocabTarget, targetTrigger ) != m_vocabTarget.end();
+
+ if (m_unrestricted || (sourceTriggerExists && targetTriggerExists))
+ AddFeature(accumulator, sourceTrigger, sourceString,
+ targetTrigger, targetString);
+ }
+ }
+ }
+ }
+ } else {
+ stringstream feature;
+ feature << "glm_";
+ feature << targetString;
+ feature << "~";
+ feature << sourceString;
+ accumulator->SparsePlusEquals(feature.str(), 1);
+ alreadyScored.insert(sourceHash);
+
+ }
+ }
+ }
+ }
}
}
void GlobalLexicalModelUnlimited::AddFeature(ScoreComponentCollection* accumulator,
- StringPiece sourceTrigger, StringPiece sourceWord,
- StringPiece targetTrigger, StringPiece targetWord) const {
- stringstream feature;
- feature << "glm_";
- feature << targetTrigger;
- feature << ",";
- feature << targetWord;
- feature << "~";
- feature << sourceTrigger;
- feature << ",";
- feature << sourceWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
+ StringPiece sourceTrigger, StringPiece sourceWord,
+ StringPiece targetTrigger, StringPiece targetWord) const
+{
+ stringstream feature;
+ feature << "glm_";
+ feature << targetTrigger;
+ feature << ",";
+ feature << targetWord;
+ feature << "~";
+ feature << sourceTrigger;
+ feature << ",";
+ feature << sourceWord;
+ accumulator->SparsePlusEquals(feature.str(), 1);
}
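
AddFeature above fires one sparse feature per trigger/word pair, and the feature's identity is nothing more than this concatenated string key. A toy rendering of the key (the words are illustrative only):

#include <iostream>
#include <sstream>
#include <string>

int main()
{
  // Illustrative values; in Moses these come from the input and hypothesis.
  std::string sourceTrigger = "<s>", sourceWord = "maison";
  std::string targetTrigger = "the",  targetWord = "house";

  std::stringstream feature;
  feature << "glm_" << targetTrigger << "," << targetWord
          << "~" << sourceTrigger << "," << sourceWord;

  std::cout << feature.str() << std::endl;  // prints: glm_the,house~<s>,maison
  return 0;
}
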
diff --git a/moses/FF/GlobalLexicalModelUnlimited.h b/moses/FF/GlobalLexicalModelUnlimited.h
index 42b7abae9..28579f55c 100644
--- a/moses/FF/GlobalLexicalModelUnlimited.h
+++ b/moses/FF/GlobalLexicalModelUnlimited.h
@@ -38,11 +38,10 @@ class InputType;
class GlobalLexicalModelUnlimited : public StatelessFeatureFunction
{
- typedef std::map< char, short > CharHash;
- typedef std::map< std::string, short > StringHash;
+ typedef std::map< char, short > CharHash;
+ typedef std::map< std::string, short > StringHash;
- struct ThreadLocalStorage
- {
+ struct ThreadLocalStorage {
const Sentence *input;
};
@@ -77,23 +76,23 @@ public:
void InitializeForInput( Sentence const& in );
const FFState* EmptyHypothesisState(const InputType &) const {
- return new DummyState();
+ return new DummyState();
}
//TODO: This implements the old interface, but cannot be updated because
//it appears to be stateful
void Evaluate(const Hypothesis& cur_hypo,
- ScoreComponentCollection* accumulator) const;
+ ScoreComponentCollection* accumulator) const;
void EvaluateChart(const ChartHypothesis& /* cur_hypo */,
- int /* featureID */,
- ScoreComponentCollection* ) const {
+ int /* featureID */,
+ ScoreComponentCollection* ) const {
throw std::logic_error("GlobalLexicalModelUnlimited not supported in chart decoder, yet");
}
- void AddFeature(ScoreComponentCollection* accumulator,
- StringPiece sourceTrigger, StringPiece sourceWord, StringPiece targetTrigger,
- StringPiece targetWord) const;
+ void AddFeature(ScoreComponentCollection* accumulator,
+ StringPiece sourceTrigger, StringPiece sourceWord, StringPiece targetTrigger,
+ StringPiece targetWord) const;
};
diff --git a/moses/FF/InputFeature.cpp b/moses/FF/InputFeature.cpp
index 6dc60f94a..1ef394f9f 100644
--- a/moses/FF/InputFeature.cpp
+++ b/moses/FF/InputFeature.cpp
@@ -7,7 +7,7 @@ using namespace std;
namespace Moses
{
InputFeature::InputFeature(const std::string &line)
-:StatelessFeatureFunction("InputFeature", line)
+ :StatelessFeatureFunction("InputFeature", line)
{
}
@@ -17,19 +17,19 @@ const InputFeature &InputFeature::GetInputFeature()
static const InputFeature *staticObj = NULL;
if (staticObj) {
- return *staticObj;
+ return *staticObj;
}
// 1st time looking up the feature
const std::vector<const StatelessFeatureFunction*> &statefulFFs = StatelessFeatureFunction::GetStatelessFeatureFunctions();
for (size_t i = 0; i < statefulFFs.size(); ++i) {
- const StatelessFeatureFunction *ff = statefulFFs[i];
- const InputFeature *lm = dynamic_cast<const InputFeature*>(ff);
+ const StatelessFeatureFunction *ff = statefulFFs[i];
+ const InputFeature *lm = dynamic_cast<const InputFeature*>(ff);
- if (lm) {
- staticObj = lm;
- return *staticObj;
- }
+ if (lm) {
+ staticObj = lm;
+ return *staticObj;
+ }
}
throw std::logic_error("No input feature.");
diff --git a/moses/FF/PhraseBasedFeatureContext.cpp b/moses/FF/PhraseBasedFeatureContext.cpp
index 46e754801..4127a587c 100644
--- a/moses/FF/PhraseBasedFeatureContext.cpp
+++ b/moses/FF/PhraseBasedFeatureContext.cpp
@@ -11,7 +11,7 @@ PhraseBasedFeatureContext::PhraseBasedFeatureContext(const Hypothesis* hypothesi
m_source(m_hypothesis->GetManager().GetSource()) {}
PhraseBasedFeatureContext::PhraseBasedFeatureContext
- (const TranslationOption& translationOption, const InputType& source) :
+(const TranslationOption& translationOption, const InputType& source) :
m_hypothesis(NULL),
m_translationOption(translationOption),
m_source(source)
diff --git a/moses/FF/PhraseBasedFeatureContext.h b/moses/FF/PhraseBasedFeatureContext.h
index b2c7052f6..0c41712ca 100644
--- a/moses/FF/PhraseBasedFeatureContext.h
+++ b/moses/FF/PhraseBasedFeatureContext.h
@@ -17,7 +17,7 @@ class WordsBitmap;
**/
class PhraseBasedFeatureContext
{
- // The context either has a hypothesis (during search), or a TranslationOption and
+ // The context either has a hypothesis (during search), or a TranslationOption and
// source sentence (during pre-calculation).
const Hypothesis* m_hypothesis;
const TranslationOption& m_translationOption;
@@ -28,10 +28,12 @@ public:
PhraseBasedFeatureContext(const TranslationOption& translationOption,
const InputType& source);
- const TranslationOption& GetTranslationOption() const
- { return m_translationOption; }
- const InputType& GetSource() const
- { return m_source; }
+ const TranslationOption& GetTranslationOption() const {
+ return m_translationOption;
+ }
+ const InputType& GetSource() const {
+ return m_source;
+ }
const TargetPhrase& GetTargetPhrase() const; //convenience method
const WordsBitmap& GetWordsBitmap() const;
diff --git a/moses/FF/PhraseBoundaryFeature.cpp b/moses/FF/PhraseBoundaryFeature.cpp
index 671cc903e..ff73c760e 100644
--- a/moses/FF/PhraseBoundaryFeature.cpp
+++ b/moses/FF/PhraseBoundaryFeature.cpp
@@ -4,9 +4,10 @@
using namespace std;
-namespace Moses {
+namespace Moses
+{
-int PhraseBoundaryState::Compare(const FFState& other) const
+int PhraseBoundaryState::Compare(const FFState& other) const
{
const PhraseBoundaryState& rhs = dynamic_cast<const PhraseBoundaryState&>(other);
int tgt = Word::Compare(*m_targetWord,*(rhs.m_targetWord));
@@ -15,7 +16,7 @@ int PhraseBoundaryState::Compare(const FFState& other) const
}
PhraseBoundaryFeature::PhraseBoundaryFeature(const std::string &line)
-: StatefulFeatureFunction("PhraseBoundaryFeature", 0, line)
+ : StatefulFeatureFunction("PhraseBoundaryFeature", 0, line)
{
std::cerr << "Initializing source word deletion feature.." << std::endl;
@@ -24,17 +25,15 @@ PhraseBoundaryFeature::PhraseBoundaryFeature(const std::string &line)
if (args[0] == "source") {
m_sourceFactors = Tokenize<FactorType>(args[1], ",");
- }
- else if (args[0] == "target") {
+ } else if (args[0] == "target") {
m_targetFactors = Tokenize<FactorType>(args[1], ",");
- }
- else {
+ } else {
throw "Unknown argument " + args[0];
}
}
}
-const FFState* PhraseBoundaryFeature::EmptyHypothesisState(const InputType &) const
+const FFState* PhraseBoundaryFeature::EmptyHypothesisState(const InputType &) const
{
return new PhraseBoundaryState(NULL,NULL);
}
@@ -42,31 +41,32 @@ const FFState* PhraseBoundaryFeature::EmptyHypothesisState(const InputType &) co
void PhraseBoundaryFeature::AddFeatures(
const Word* leftWord, const Word* rightWord, const FactorList& factors, const string& side,
- ScoreComponentCollection* scores) const {
- for (size_t i = 0; i < factors.size(); ++i) {
- ostringstream name;
- name << side << ":";
- name << factors[i];
- name << ":";
- if (leftWord) {
- name << leftWord->GetFactor(factors[i])->GetString();
- } else {
- name << BOS_;
- }
- name << ":";
- if (rightWord) {
- name << rightWord->GetFactor(factors[i])->GetString();
- } else {
- name << EOS_;
- }
- scores->PlusEquals(this,name.str(),1);
+ ScoreComponentCollection* scores) const
+{
+ for (size_t i = 0; i < factors.size(); ++i) {
+ ostringstream name;
+ name << side << ":";
+ name << factors[i];
+ name << ":";
+ if (leftWord) {
+ name << leftWord->GetFactor(factors[i])->GetString();
+ } else {
+ name << BOS_;
+ }
+ name << ":";
+ if (rightWord) {
+ name << rightWord->GetFactor(factors[i])->GetString();
+ } else {
+ name << EOS_;
}
+ scores->PlusEquals(this,name.str(),1);
+ }
}
FFState* PhraseBoundaryFeature::Evaluate
- (const Hypothesis& cur_hypo, const FFState* prev_state,
- ScoreComponentCollection* scores) const
+(const Hypothesis& cur_hypo, const FFState* prev_state,
+ ScoreComponentCollection* scores) const
{
const PhraseBoundaryState* pbState = dynamic_cast<const PhraseBoundaryState*>(prev_state);
const Phrase& targetPhrase = cur_hypo.GetCurrTargetPhrase();
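
Two notes on the hunk above. First, the constructor's startup message ("Initializing source word deletion feature..") is evidently a copy-and-paste from SourceWordDeletionFeature; the class being constructed here is the phrase boundary feature. Second, AddFeatures() builds one sparse feature name per factor of the form side:factor:leftWord:rightWord, falling back to the BOS_/EOS_ markers (conventionally <s> and </s>) when a boundary has no word on one side. A standalone sketch of that naming scheme, with a helper name and plain-string words of our own rather than the Moses Word/Factor types:

#include <cstddef>
#include <iostream>
#include <sstream>
#include <string>

// NULL on a side stands in for a missing boundary word, as when the
// hypothesis touches the start or end of the sentence.
std::string BoundaryFeatureName(const std::string& side, int factor,
                                const std::string* left,
                                const std::string* right) {
  std::ostringstream name;
  name << side << ":" << factor << ":";
  name << (left ? *left : "<s>") << ":";
  name << (right ? *right : "</s>");
  return name.str();
}

int main() {
  const std::string house = "house";
  // boundary between sentence start and "house", factor 0
  std::cout << BoundaryFeatureName("tgt", 0, NULL, &house) << std::endl;
  // prints: tgt:0:<s>:house
  return 0;
}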
diff --git a/moses/FF/PhraseBoundaryFeature.h b/moses/FF/PhraseBoundaryFeature.h
index 34b12abf6..b06e66eea 100644
--- a/moses/FF/PhraseBoundaryFeature.h
+++ b/moses/FF/PhraseBoundaryFeature.h
@@ -12,12 +12,17 @@
namespace Moses
{
-class PhraseBoundaryState : public FFState {
+class PhraseBoundaryState : public FFState
+{
public:
PhraseBoundaryState(const Word* sourceWord, const Word* targetWord) :
- m_sourceWord(sourceWord), m_targetWord(targetWord) {}
- const Word* GetSourceWord() const {return m_sourceWord;}
- const Word* GetTargetWord() const {return m_targetWord;}
+ m_sourceWord(sourceWord), m_targetWord(targetWord) {}
+ const Word* GetSourceWord() const {
+ return m_sourceWord;
+ }
+ const Word* GetTargetWord() const {
+ return m_targetWord;
+ }
virtual int Compare(const FFState& other) const;
@@ -30,7 +35,8 @@ private:
/**
* Concatenations of factors on boundaries of phrases.
**/
-class PhraseBoundaryFeature : public StatefulFeatureFunction {
+class PhraseBoundaryFeature : public StatefulFeatureFunction
+{
public:
PhraseBoundaryFeature(const std::string &line);
@@ -39,7 +45,7 @@ public:
virtual const FFState* EmptyHypothesisState(const InputType &) const;
virtual FFState* Evaluate(const Hypothesis& cur_hypo, const FFState* prev_state,
- ScoreComponentCollection* accumulator) const;
+ ScoreComponentCollection* accumulator) const;
virtual FFState* EvaluateChart( const ChartHypothesis& /* cur_hypo */,
int /* featureID */,
@@ -49,7 +55,7 @@ public:
private:
void AddFeatures(
- const Word* leftWord, const Word* rightWord, const FactorList& factors,
+ const Word* leftWord, const Word* rightWord, const FactorList& factors,
const std::string& side, ScoreComponentCollection* scores) const ;
FactorList m_sourceFactors;
FactorList m_targetFactors;
diff --git a/moses/FF/PhraseLengthFeature.cpp b/moses/FF/PhraseLengthFeature.cpp
index b9e8e9e1d..2efeb07d2 100644
--- a/moses/FF/PhraseLengthFeature.cpp
+++ b/moses/FF/PhraseLengthFeature.cpp
@@ -4,20 +4,21 @@
#include "moses/ScoreComponentCollection.h"
#include "moses/TranslationOption.h"
-namespace Moses {
+namespace Moses
+{
using namespace std;
PhraseLengthFeature::PhraseLengthFeature(const std::string &line)
-:StatelessFeatureFunction("PhraseLengthFeature", 0, line)
+ :StatelessFeatureFunction("PhraseLengthFeature", 0, line)
{
}
void PhraseLengthFeature::Evaluate(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const
{
// get length of source and target phrase
size_t targetLength = targetPhrase.GetSize();
diff --git a/moses/FF/PhraseLengthFeature.h b/moses/FF/PhraseLengthFeature.h
index 327865558..23c168417 100644
--- a/moses/FF/PhraseLengthFeature.h
+++ b/moses/FF/PhraseLengthFeature.h
@@ -15,7 +15,8 @@ namespace Moses
/** Sets the features for length of source phrase, target phrase, both.
*/
-class PhraseLengthFeature : public StatelessFeatureFunction {
+class PhraseLengthFeature : public StatelessFeatureFunction
+{
public:
PhraseLengthFeature(const std::string &line);
@@ -25,9 +26,9 @@ public:
}
virtual void Evaluate(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const;
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const;
};
diff --git a/moses/FF/PhrasePairFeature.cpp b/moses/FF/PhrasePairFeature.cpp
index 58f71271f..9fce7ff4e 100644
--- a/moses/FF/PhrasePairFeature.cpp
+++ b/moses/FF/PhrasePairFeature.cpp
@@ -9,10 +9,11 @@
using namespace std;
-namespace Moses {
+namespace Moses
+{
PhrasePairFeature::PhrasePairFeature(const std::string &line)
-:StatelessFeatureFunction("PhrasePairFeature", 0, line)
+ :StatelessFeatureFunction("PhrasePairFeature", 0, line)
{
std::cerr << "Initializing PhrasePairFeature.." << std::endl;
@@ -44,47 +45,44 @@ PhrasePairFeature::PhrasePairFeature(const std::string &line)
Load(filePathSource);
}
-bool PhrasePairFeature::Load(const std::string &filePathSource/*, const std::string &filePathTarget*/)
+bool PhrasePairFeature::Load(const std::string &filePathSource/*, const std::string &filePathTarget*/)
{
if (m_domainTrigger) {
// domain trigger terms for each input document
ifstream inFileSource(filePathSource.c_str());
- if (!inFileSource)
- {
- cerr << "could not open file " << filePathSource << endl;
- return false;
- }
-
+ if (!inFileSource) {
+ cerr << "could not open file " << filePathSource << endl;
+ return false;
+ }
+
std::string line;
while (getline(inFileSource, line)) {
std::set<std::string> terms;
vector<string> termVector;
boost::split(termVector, line, boost::is_any_of("\t "));
- for (size_t i=0; i < termVector.size(); ++i)
+ for (size_t i=0; i < termVector.size(); ++i)
terms.insert(termVector[i]);
-
+
// add term set for current document
m_vocabDomain.push_back(terms);
}
-
+
inFileSource.close();
- }
- else {
+ } else {
// restricted source word vocabulary
ifstream inFileSource(filePathSource.c_str());
- if (!inFileSource)
- {
- cerr << "could not open file " << filePathSource << endl;
- return false;
- }
-
+ if (!inFileSource) {
+ cerr << "could not open file " << filePathSource << endl;
+ return false;
+ }
+
std::string line;
while (getline(inFileSource, line)) {
m_vocabSource.insert(line);
}
-
+
inFileSource.close();
-
+
/* // restricted target word vocabulary
ifstream inFileTarget(filePathTarget.c_str());
if (!inFileTarget)
@@ -105,11 +103,11 @@ bool PhrasePairFeature::Load(const std::string &filePathSource/*, const std::str
}
void PhrasePairFeature::Evaluate(
- const PhraseBasedFeatureContext& context,
- ScoreComponentCollection* accumulator) const
+ const PhraseBasedFeatureContext& context,
+ ScoreComponentCollection* accumulator) const
{
const TargetPhrase& target = context.GetTargetPhrase();
- const Phrase& source = *(context.GetTranslationOption().GetSourcePhrase());
+ const Phrase& source = *(context.GetTranslationOption().GetSourcePhrase());
if (m_simple) {
ostringstream namestr;
namestr << "pp_";
@@ -126,11 +124,11 @@ void PhrasePairFeature::Evaluate(
namestr << ",";
namestr << targetFactor->GetString();
}
-
+
accumulator->SparsePlusEquals(namestr.str(),1);
}
if (m_domainTrigger) {
- const Sentence& input = static_cast<const Sentence&>(context.GetSource());
+ const Sentence& input = static_cast<const Sentence&>(context.GetSource());
const bool use_topicid = input.GetUseTopicId();
const bool use_topicid_prob = input.GetUseTopicIdAndProb();
@@ -149,95 +147,92 @@ void PhrasePairFeature::Evaluate(
pair << ",";
pair << targetFactor->GetString();
}
-
+
if (use_topicid || use_topicid_prob) {
if(use_topicid) {
- // use topicid as trigger
- const long topicid = input.GetTopicId();
- stringstream feature;
- feature << "pp_";
- if (topicid == -1)
- feature << "unk";
- else
- feature << topicid;
-
- feature << "_";
- feature << pair.str();
- accumulator->SparsePlusEquals(feature.str(), 1);
+ // use topicid as trigger
+ const long topicid = input.GetTopicId();
+ stringstream feature;
+ feature << "pp_";
+ if (topicid == -1)
+ feature << "unk";
+ else
+ feature << topicid;
+
+ feature << "_";
+ feature << pair.str();
+ accumulator->SparsePlusEquals(feature.str(), 1);
+ } else {
+ // use topic probabilities
+ const vector<string> &topicid_prob = *(input.GetTopicIdAndProb());
+ if (atol(topicid_prob[0].c_str()) == -1) {
+ stringstream feature;
+ feature << "pp_unk_";
+ feature << pair.str();
+ accumulator->SparsePlusEquals(feature.str(), 1);
+ } else {
+ for (size_t i=0; i+1 < topicid_prob.size(); i+=2) {
+ stringstream feature;
+ feature << "pp_";
+ feature << topicid_prob[i];
+ feature << "_";
+ feature << pair.str();
+ accumulator->SparsePlusEquals(feature.str(), atof((topicid_prob[i+1]).c_str()));
+ }
+ }
}
- else {
- // use topic probabilities
- const vector<string> &topicid_prob = *(input.GetTopicIdAndProb());
- if (atol(topicid_prob[0].c_str()) == -1) {
- stringstream feature;
- feature << "pp_unk_";
- feature << pair.str();
- accumulator->SparsePlusEquals(feature.str(), 1);
- }
- else {
- for (size_t i=0; i+1 < topicid_prob.size(); i+=2) {
- stringstream feature;
- feature << "pp_";
- feature << topicid_prob[i];
- feature << "_";
- feature << pair.str();
- accumulator->SparsePlusEquals(feature.str(), atof((topicid_prob[i+1]).c_str()));
- }
- }
- }
- }
- else {
+ } else {
// range over domain trigger words
const long docid = input.GetDocumentId();
for (set<string>::const_iterator p = m_vocabDomain[docid].begin(); p != m_vocabDomain[docid].end(); ++p) {
- string sourceTrigger = *p;
- ostringstream namestr;
- namestr << "pp_";
- namestr << sourceTrigger;
- namestr << "_";
- namestr << pair.str();
- accumulator->SparsePlusEquals(namestr.str(),1);
+ string sourceTrigger = *p;
+ ostringstream namestr;
+ namestr << "pp_";
+ namestr << sourceTrigger;
+ namestr << "_";
+ namestr << pair.str();
+ accumulator->SparsePlusEquals(namestr.str(),1);
}
}
}
if (m_sourceContext) {
const Sentence& input = static_cast<const Sentence&>(context.GetSource());
-
+
// range over source words to get context
for(size_t contextIndex = 0; contextIndex < input.GetSize(); contextIndex++ ) {
StringPiece sourceTrigger = input.GetWord(contextIndex).GetFactor(m_sourceFactorId)->GetString();
if (m_ignorePunctuation) {
- // check if trigger is punctuation
- char firstChar = sourceTrigger[0];
- CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
- if(charIterator != m_punctuationHash.end())
- continue;
+ // check if trigger is punctuation
+ char firstChar = sourceTrigger[0];
+ CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
+ if(charIterator != m_punctuationHash.end())
+ continue;
}
-
+
bool sourceTriggerExists = false;
if (!m_unrestricted)
- sourceTriggerExists = FindStringPiece(m_vocabSource, sourceTrigger ) != m_vocabSource.end();
-
+ sourceTriggerExists = FindStringPiece(m_vocabSource, sourceTrigger ) != m_vocabSource.end();
+
if (m_unrestricted || sourceTriggerExists) {
- ostringstream namestr;
- namestr << "pp_";
- namestr << sourceTrigger;
- namestr << "~";
- namestr << source.GetWord(0).GetFactor(m_sourceFactorId)->GetString();
- for (size_t i = 1; i < source.GetSize(); ++i) {
- const Factor* sourceFactor = source.GetWord(i).GetFactor(m_sourceFactorId);
- namestr << ",";
- namestr << sourceFactor->GetString();
- }
- namestr << "~";
- namestr << target.GetWord(0).GetFactor(m_targetFactorId)->GetString();
- for (size_t i = 1; i < target.GetSize(); ++i) {
- const Factor* targetFactor = target.GetWord(i).GetFactor(m_targetFactorId);
- namestr << ",";
- namestr << targetFactor->GetString();
- }
-
- accumulator->SparsePlusEquals(namestr.str(),1);
+ ostringstream namestr;
+ namestr << "pp_";
+ namestr << sourceTrigger;
+ namestr << "~";
+ namestr << source.GetWord(0).GetFactor(m_sourceFactorId)->GetString();
+ for (size_t i = 1; i < source.GetSize(); ++i) {
+ const Factor* sourceFactor = source.GetWord(i).GetFactor(m_sourceFactorId);
+ namestr << ",";
+ namestr << sourceFactor->GetString();
+ }
+ namestr << "~";
+ namestr << target.GetWord(0).GetFactor(m_targetFactorId)->GetString();
+ for (size_t i = 1; i < target.GetSize(); ++i) {
+ const Factor* targetFactor = target.GetWord(i).GetFactor(m_targetFactorId);
+ namestr << ",";
+ namestr << targetFactor->GetString();
+ }
+
+ accumulator->SparsePlusEquals(namestr.str(),1);
}
}
}
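
The source-context branch in the hunk above keys each sparse feature as "pp_" plus the trigger word, then "~", the source factors joined with commas, another "~", and the target factors joined with commas; the simple and domain-trigger branches assemble the same kind of key with different lead elements. A standalone sketch of the comma/tilde joining (function name is ours, not the Moses API):

#include <cstddef>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

std::string PhrasePairKey(const std::string& trigger,
                          const std::vector<std::string>& src,
                          const std::vector<std::string>& tgt) {
  std::ostringstream name;
  name << "pp_" << trigger << "~";
  for (size_t i = 0; i < src.size(); ++i)
    name << (i ? "," : "") << src[i];  // source factors, comma-joined
  name << "~";
  for (size_t i = 0; i < tgt.size(); ++i)
    name << (i ? "," : "") << tgt[i];  // target factors, comma-joined
  return name.str();
}

int main() {
  std::vector<std::string> src, tgt;
  src.push_back("das"); src.push_back("Haus");
  tgt.push_back("the"); tgt.push_back("house");
  std::cout << PhrasePairKey("heute", src, tgt) << std::endl;
  // prints: pp_heute~das,Haus~the,house
  return 0;
}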
diff --git a/moses/FF/PhrasePairFeature.h b/moses/FF/PhrasePairFeature.h
index e895110f8..d7aa80be7 100644
--- a/moses/FF/PhrasePairFeature.h
+++ b/moses/FF/PhrasePairFeature.h
@@ -8,39 +8,41 @@
#include "moses/Factor.h"
#include "moses/Sentence.h"
-namespace Moses {
+namespace Moses
+{
/**
* Phrase pair feature: complete source/target phrase pair
**/
-class PhrasePairFeature: public StatelessFeatureFunction {
-
- typedef std::map< char, short > CharHash;
- typedef std::vector< std::set<std::string> > DocumentVector;
-
- boost::unordered_set<std::string> m_vocabSource;
- DocumentVector m_vocabDomain;
- FactorType m_sourceFactorId;
- FactorType m_targetFactorId;
- bool m_unrestricted;
- bool m_simple;
- bool m_sourceContext;
- bool m_domainTrigger;
- bool m_ignorePunctuation;
- CharHash m_punctuationHash;
-
- public:
- PhrasePairFeature(const std::string &line);
-
- void Evaluate(const PhraseBasedFeatureContext& context,
- ScoreComponentCollection* accumulator) const;
-
- void EvaluateChart(const ChartBasedFeatureContext& context,
- ScoreComponentCollection*) const {
- throw std::logic_error("PhrasePairFeature not valid in chart decoder");
- }
-
- bool Load(const std::string &filePathSource/*, const std::string &filePathTarget*/);
+class PhrasePairFeature: public StatelessFeatureFunction
+{
+
+ typedef std::map< char, short > CharHash;
+ typedef std::vector< std::set<std::string> > DocumentVector;
+
+ boost::unordered_set<std::string> m_vocabSource;
+ DocumentVector m_vocabDomain;
+ FactorType m_sourceFactorId;
+ FactorType m_targetFactorId;
+ bool m_unrestricted;
+ bool m_simple;
+ bool m_sourceContext;
+ bool m_domainTrigger;
+ bool m_ignorePunctuation;
+ CharHash m_punctuationHash;
+
+public:
+ PhrasePairFeature(const std::string &line);
+
+ void Evaluate(const PhraseBasedFeatureContext& context,
+ ScoreComponentCollection* accumulator) const;
+
+ void EvaluateChart(const ChartBasedFeatureContext& context,
+ ScoreComponentCollection*) const {
+ throw std::logic_error("PhrasePairFeature not valid in chart decoder");
+ }
+
+ bool Load(const std::string &filePathSource/*, const std::string &filePathTarget*/);
};
diff --git a/moses/FF/SourceWordDeletionFeature.cpp b/moses/FF/SourceWordDeletionFeature.cpp
index 085dbbeea..693812105 100644
--- a/moses/FF/SourceWordDeletionFeature.cpp
+++ b/moses/FF/SourceWordDeletionFeature.cpp
@@ -11,13 +11,14 @@
#include "util/string_piece_hash.hh"
-namespace Moses {
+namespace Moses
+{
using namespace std;
SourceWordDeletionFeature::SourceWordDeletionFeature(const std::string &line)
-:StatelessFeatureFunction("SourceWordDeletionFeature", 0, line),
-m_unrestricted(true)
+ :StatelessFeatureFunction("SourceWordDeletionFeature", 0, line),
+ m_unrestricted(true)
{
std::cerr << "Initializing source word deletion feature.." << std::endl;
@@ -27,11 +28,9 @@ m_unrestricted(true)
if (args[0] == "factor") {
m_factorType = Scan<FactorType>(args[1]);
- }
- else if (args[0] == "path") {
+ } else if (args[0] == "path") {
filename = args[1];
- }
- else {
+ } else {
throw "Unknown argument " + args[0];
}
}
@@ -40,19 +39,18 @@ m_unrestricted(true)
if (filename != "") {
cerr << "loading source word deletion word list from " << filename << endl;
if (!Load(filename)) {
- UserMessage::Add("Unable to load word list for source word deletion feature from file " + filename);
- //return false;
+ UserMessage::Add("Unable to load word list for source word deletion feature from file " + filename);
+ //return false;
}
}
}
-bool SourceWordDeletionFeature::Load(const std::string &filePath)
+bool SourceWordDeletionFeature::Load(const std::string &filePath)
{
ifstream inFile(filePath.c_str());
- if (!inFile)
- {
- cerr << "could not open file " << filePath << endl;
- return false;
+ if (!inFile) {
+ cerr << "could not open file " << filePath << endl;
+ return false;
}
std::string line;
@@ -67,23 +65,23 @@ bool SourceWordDeletionFeature::Load(const std::string &filePath)
}
void SourceWordDeletionFeature::Evaluate(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const
{
const AlignmentInfo &alignmentInfo = targetPhrase.GetAlignTerm();
ComputeFeatures(source, targetPhrase, &scoreBreakdown, alignmentInfo);
}
void SourceWordDeletionFeature::ComputeFeatures(const Phrase &source,
- const TargetPhrase& targetPhrase,
- ScoreComponentCollection* accumulator,
- const AlignmentInfo &alignmentInfo) const
+ const TargetPhrase& targetPhrase,
+ ScoreComponentCollection* accumulator,
+ const AlignmentInfo &alignmentInfo) const
{
// handle special case: unknown words (they have no word alignment)
- size_t targetLength = targetPhrase.GetSize();
- size_t sourceLength = source.GetSize();
- if (targetLength == 1 && sourceLength == 1 && !alignmentInfo.GetSize()) return;
+ size_t targetLength = targetPhrase.GetSize();
+ size_t sourceLength = source.GetSize();
+ if (targetLength == 1 && sourceLength == 1 && !alignmentInfo.GetSize()) return;
// flag aligned words
bool aligned[16];
@@ -92,22 +90,21 @@ void SourceWordDeletionFeature::ComputeFeatures(const Phrase &source,
aligned[i] = false;
for (AlignmentInfo::const_iterator alignmentPoint = alignmentInfo.begin(); alignmentPoint != alignmentInfo.end(); alignmentPoint++)
aligned[ alignmentPoint->first ] = true;
-
+
// process unaligned source words
for(size_t i=0; i<sourceLength; i++) {
if (!aligned[i]) {
- const Word &w = source.GetWord(i);
- if (!w.IsNonTerminal()) {
- const StringPiece word = w.GetFactor(m_factorType)->GetString();
- if (word != "<s>" && word != "</s>") {
- if (!m_unrestricted && FindStringPiece(m_vocab, word ) == m_vocab.end()) {
- accumulator->PlusEquals(this, StringPiece("OTHER"),1);
- }
- else {
- accumulator->PlusEquals(this,word,1);
- }
- }
- }
+ const Word &w = source.GetWord(i);
+ if (!w.IsNonTerminal()) {
+ const StringPiece word = w.GetFactor(m_factorType)->GetString();
+ if (word != "<s>" && word != "</s>") {
+ if (!m_unrestricted && FindStringPiece(m_vocab, word ) == m_vocab.end()) {
+ accumulator->PlusEquals(this, StringPiece("OTHER"),1);
+ } else {
+ accumulator->PlusEquals(this,word,1);
+ }
+ }
+ }
}
}
}
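
The reformatted ComputeFeatures() above flags aligned source positions in a fixed bool aligned[16] and then fires one count per unaligned, non-boundary source word, collapsing out-of-vocabulary words to OTHER when a restricted word list was loaded. The sketch below reproduces that logic standalone (the hunk's special case for unknown words, a 1:1 phrase with no alignment points, is omitted), except that it sizes the flag array from the source length instead of assuming at most 16 words:

#include <cstddef>
#include <iostream>
#include <set>
#include <string>
#include <utility>
#include <vector>

std::vector<std::string> UnalignedSourceFeatures(
    const std::vector<std::string>& source,
    const std::vector<std::pair<size_t, size_t> >& align, // (source, target) points
    const std::set<std::string>& vocab,
    bool unrestricted) {
  std::vector<bool> aligned(source.size(), false);
  for (size_t i = 0; i < align.size(); ++i)
    aligned[align[i].first] = true;

  std::vector<std::string> fired;
  for (size_t i = 0; i < source.size(); ++i) {
    if (aligned[i]) continue;
    const std::string& w = source[i];
    if (w == "<s>" || w == "</s>") continue;        // never fire on boundary markers
    if (!unrestricted && vocab.find(w) == vocab.end())
      fired.push_back("OTHER");                     // OOV under a restricted word list
    else
      fired.push_back(w);
  }
  return fired;
}

int main() {
  std::vector<std::string> src;
  src.push_back("das"); src.push_back("ja"); src.push_back("Haus");
  std::vector<std::pair<size_t, size_t> > align;
  align.push_back(std::make_pair(0, 0));
  align.push_back(std::make_pair(2, 1));
  std::set<std::string> vocab; // unrestricted: fire the word itself
  std::vector<std::string> f = UnalignedSourceFeatures(src, align, vocab, true);
  std::cout << f[0] << std::endl; // "ja", the only unaligned word
  return 0;
}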
diff --git a/moses/FF/SourceWordDeletionFeature.h b/moses/FF/SourceWordDeletionFeature.h
index 1bf6323be..7a25ee6e1 100644
--- a/moses/FF/SourceWordDeletionFeature.h
+++ b/moses/FF/SourceWordDeletionFeature.h
@@ -13,7 +13,8 @@ namespace Moses
/** Sets the features for source word deletion
*/
-class SourceWordDeletionFeature : public StatelessFeatureFunction {
+class SourceWordDeletionFeature : public StatelessFeatureFunction
+{
private:
boost::unordered_set<std::string> m_vocab;
FactorType m_factorType;
@@ -21,18 +22,18 @@ private:
public:
SourceWordDeletionFeature(const std::string &line);
-
+
bool Load(const std::string &filePath);
virtual void Evaluate(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const;
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const;
void ComputeFeatures(const Phrase &source,
- const TargetPhrase& targetPhrase,
- ScoreComponentCollection* accumulator,
- const AlignmentInfo &alignmentInfo) const;
+ const TargetPhrase& targetPhrase,
+ ScoreComponentCollection* accumulator,
+ const AlignmentInfo &alignmentInfo) const;
};
}
diff --git a/moses/FF/StatefulFeatureFunction.cpp b/moses/FF/StatefulFeatureFunction.cpp
index a97846311..0aeeed62c 100644
--- a/moses/FF/StatefulFeatureFunction.cpp
+++ b/moses/FF/StatefulFeatureFunction.cpp
@@ -4,13 +4,13 @@ namespace Moses
{
StatefulFeatureFunction::StatefulFeatureFunction(const std::string& description, const std::string &line)
-: FeatureFunction(description, line)
+ : FeatureFunction(description, line)
{
m_statefulFFs.push_back(this);
}
StatefulFeatureFunction::StatefulFeatureFunction(const std::string& description, size_t numScoreComponents, const std::string &line)
-: FeatureFunction(description,numScoreComponents, line)
+ : FeatureFunction(description,numScoreComponents, line)
{
m_statefulFFs.push_back(this);
}
diff --git a/moses/FF/StatefulFeatureFunction.h b/moses/FF/StatefulFeatureFunction.h
index d2721d4ae..fc5cd4faf 100644
--- a/moses/FF/StatefulFeatureFunction.h
+++ b/moses/FF/StatefulFeatureFunction.h
@@ -6,7 +6,7 @@ namespace Moses
{
/** base class for all stateful feature functions.
- * eg. LM, distortion penalty
+ * eg. LM, distortion penalty
*/
class StatefulFeatureFunction: public FeatureFunction
{
@@ -14,7 +14,9 @@ class StatefulFeatureFunction: public FeatureFunction
static std::vector<const StatefulFeatureFunction*> m_statefulFFs;
public:
- static const std::vector<const StatefulFeatureFunction*>& GetStatefulFeatureFunctions() {return m_statefulFFs;}
+ static const std::vector<const StatefulFeatureFunction*>& GetStatefulFeatureFunctions() {
+ return m_statefulFFs;
+ }
StatefulFeatureFunction(const std::string& description, const std::string &line);
StatefulFeatureFunction(const std::string& description, size_t numScoreComponents, const std::string &line);
@@ -39,8 +41,9 @@ public:
//! return the state associated with the empty hypothesis for a given sentence
virtual const FFState* EmptyHypothesisState(const InputType &input) const = 0;
- bool IsStateless() const
- { return false; }
+ bool IsStateless() const {
+ return false;
+ }
};
diff --git a/moses/FF/StatelessFeatureFunction.cpp b/moses/FF/StatelessFeatureFunction.cpp
index 1c5e604de..278a90c54 100644
--- a/moses/FF/StatelessFeatureFunction.cpp
+++ b/moses/FF/StatelessFeatureFunction.cpp
@@ -4,13 +4,13 @@ namespace Moses
{
StatelessFeatureFunction::StatelessFeatureFunction(const std::string& description, const std::string &line)
-:FeatureFunction(description, line)
+ :FeatureFunction(description, line)
{
m_statelessFFs.push_back(this);
}
StatelessFeatureFunction::StatelessFeatureFunction(const std::string& description, size_t numScoreComponents, const std::string &line)
-:FeatureFunction(description, numScoreComponents, line)
+ :FeatureFunction(description, numScoreComponents, line)
{
m_statelessFFs.push_back(this);
}
diff --git a/moses/FF/StatelessFeatureFunction.h b/moses/FF/StatelessFeatureFunction.h
index d8db7f514..3f120a1de 100644
--- a/moses/FF/StatelessFeatureFunction.h
+++ b/moses/FF/StatelessFeatureFunction.h
@@ -14,7 +14,9 @@ class StatelessFeatureFunction: public FeatureFunction
static std::vector<const StatelessFeatureFunction*> m_statelessFFs;
public:
- static const std::vector<const StatelessFeatureFunction*>& GetStatelessFeatureFunctions() {return m_statelessFFs;}
+ static const std::vector<const StatelessFeatureFunction*>& GetStatelessFeatureFunctions() {
+ return m_statelessFFs;
+ }
StatelessFeatureFunction(const std::string& description, const std::string &line);
StatelessFeatureFunction(const std::string& description, size_t numScoreComponents, const std::string &line);
@@ -22,7 +24,7 @@ public:
* This should be implemented for features that apply to phrase-based models.
**/
virtual void Evaluate(const PhraseBasedFeatureContext& context,
- ScoreComponentCollection* accumulator) const
+ ScoreComponentCollection* accumulator) const
{}
/**
@@ -32,8 +34,9 @@ public:
ScoreComponentCollection* accumulator) const
{}
- virtual bool IsStateless() const
- { return true; }
+ virtual bool IsStateless() const {
+ return true;
+ }
};
diff --git a/moses/FF/TargetBigramFeature.cpp b/moses/FF/TargetBigramFeature.cpp
index 441cf9e15..fc30a737f 100644
--- a/moses/FF/TargetBigramFeature.cpp
+++ b/moses/FF/TargetBigramFeature.cpp
@@ -7,15 +7,17 @@
using namespace std;
-namespace Moses {
+namespace Moses
+{
-int TargetBigramState::Compare(const FFState& other) const {
+int TargetBigramState::Compare(const FFState& other) const
+{
const TargetBigramState& rhs = dynamic_cast<const TargetBigramState&>(other);
return Word::Compare(m_word,rhs.m_word);
}
TargetBigramFeature::TargetBigramFeature(const std::string &line)
-:StatefulFeatureFunction("TargetBigramFeature", 0, line)
+ :StatefulFeatureFunction("TargetBigramFeature", 0, line)
{
std::cerr << "Initializing target bigram feature.." << std::endl;
@@ -27,7 +29,7 @@ TargetBigramFeature::TargetBigramFeature(const std::string &line)
FactorCollection& factorCollection = FactorCollection::Instance();
const Factor* bosFactor =
- factorCollection.AddFactor(Output,m_factorType,BOS_);
+ factorCollection.AddFactor(Output,m_factorType,BOS_);
m_bos.SetFactor(m_factorType,bosFactor);
const string &filePath = tokens[2];
@@ -35,13 +37,12 @@ TargetBigramFeature::TargetBigramFeature(const std::string &line)
}
-bool TargetBigramFeature::Load(const std::string &filePath)
+bool TargetBigramFeature::Load(const std::string &filePath)
{
if (filePath == "*") return true; //allow all
ifstream inFile(filePath.c_str());
- if (!inFile)
- {
- return false;
+ if (!inFile) {
+ return false;
}
std::string line;
@@ -87,7 +88,7 @@ FFState* TargetBigramFeature::Evaluate(const Hypothesis& cur_hypo,
const StringPiece w2 = f2->GetString();
// skip bigrams if they don't belong to a given restricted vocabulary
- if (m_vocab.size() &&
+ if (m_vocab.size() &&
(FindStringPiece(m_vocab, w1) == m_vocab.end() || FindStringPiece(m_vocab, w2) == m_vocab.end())) {
continue;
}
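
The hunk above also shows the vocabulary gate in TargetBigramFeature: Load() with path "*" loads nothing, and Evaluate() skips a bigram only when the vocabulary is non-empty and either word is missing from it, so an empty vocabulary admits everything. A standalone restatement of that gate:

#include <set>
#include <string>

// Empty vocabulary (path "*") admits every bigram; otherwise both
// words must be in-vocabulary for the feature to fire.
bool AdmitBigram(const std::set<std::string>& vocab,
                 const std::string& w1, const std::string& w2) {
  if (vocab.empty()) return true;
  return vocab.count(w1) != 0 && vocab.count(w2) != 0;
}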
diff --git a/moses/FF/TargetBigramFeature.h b/moses/FF/TargetBigramFeature.h
index f514f2405..e29eace14 100644
--- a/moses/FF/TargetBigramFeature.h
+++ b/moses/FF/TargetBigramFeature.h
@@ -13,35 +13,38 @@
namespace Moses
{
-class TargetBigramState : public FFState {
- public:
- TargetBigramState(const Word& word): m_word(word) {}
- const Word& GetWord() const {return m_word;}
- virtual int Compare(const FFState& other) const;
-
- private:
- Word m_word;
+class TargetBigramState : public FFState
+{
+public:
+ TargetBigramState(const Word& word): m_word(word) {}
+ const Word& GetWord() const {
+ return m_word;
+ }
+ virtual int Compare(const FFState& other) const;
+
+private:
+ Word m_word;
};
/** Sets the features of observed bigrams.
*/
-class TargetBigramFeature : public StatefulFeatureFunction {
+class TargetBigramFeature : public StatefulFeatureFunction
+{
public:
- TargetBigramFeature(const std::string &line);
+ TargetBigramFeature(const std::string &line);
- bool Load(const std::string &filePath);
+ bool Load(const std::string &filePath);
- virtual const FFState* EmptyHypothesisState(const InputType &input) const;
+ virtual const FFState* EmptyHypothesisState(const InputType &input) const;
- virtual FFState* Evaluate(const Hypothesis& cur_hypo, const FFState* prev_state,
- ScoreComponentCollection* accumulator) const;
+ virtual FFState* Evaluate(const Hypothesis& cur_hypo, const FFState* prev_state,
+ ScoreComponentCollection* accumulator) const;
virtual FFState* EvaluateChart( const ChartHypothesis& /* cur_hypo */,
int /* featureID */,
- ScoreComponentCollection* ) const
- {
- abort();
- }
+ ScoreComponentCollection* ) const {
+ abort();
+ }
private:
FactorType m_factorType;
diff --git a/moses/FF/TargetNgramFeature.cpp b/moses/FF/TargetNgramFeature.cpp
index 174fcfa1a..3c36aef0e 100644
--- a/moses/FF/TargetNgramFeature.cpp
+++ b/moses/FF/TargetNgramFeature.cpp
@@ -7,38 +7,38 @@
#include "util/string_piece_hash.hh"
-namespace Moses {
+namespace Moses
+{
using namespace std;
-int TargetNgramState::Compare(const FFState& other) const {
+int TargetNgramState::Compare(const FFState& other) const
+{
const TargetNgramState& rhs = dynamic_cast<const TargetNgramState&>(other);
int result;
if (m_words.size() == rhs.m_words.size()) {
- for (size_t i = 0; i < m_words.size(); ++i) {
- result = Word::Compare(m_words[i],rhs.m_words[i]);
- if (result != 0) return result;
- }
+ for (size_t i = 0; i < m_words.size(); ++i) {
+ result = Word::Compare(m_words[i],rhs.m_words[i]);
+ if (result != 0) return result;
+ }
return 0;
- }
- else if (m_words.size() < rhs.m_words.size()) {
- for (size_t i = 0; i < m_words.size(); ++i) {
- result = Word::Compare(m_words[i],rhs.m_words[i]);
- if (result != 0) return result;
- }
- return -1;
- }
- else {
- for (size_t i = 0; i < rhs.m_words.size(); ++i) {
- result = Word::Compare(m_words[i],rhs.m_words[i]);
- if (result != 0) return result;
- }
- return 1;
+ } else if (m_words.size() < rhs.m_words.size()) {
+ for (size_t i = 0; i < m_words.size(); ++i) {
+ result = Word::Compare(m_words[i],rhs.m_words[i]);
+ if (result != 0) return result;
+ }
+ return -1;
+ } else {
+ for (size_t i = 0; i < rhs.m_words.size(); ++i) {
+ result = Word::Compare(m_words[i],rhs.m_words[i]);
+ if (result != 0) return result;
+ }
+ return 1;
}
}
TargetNgramFeature::TargetNgramFeature(const std::string &line)
-:StatefulFeatureFunction("TargetNgramFeature", 0, line)
+ :StatefulFeatureFunction("TargetNgramFeature", 0, line)
{
std::cerr << "Initializing target ngram feature.." << std::endl;
@@ -56,9 +56,8 @@ bool TargetNgramFeature::Load(const std::string &filePath)
{
if (filePath == "*") return true; //allow all
ifstream inFile(filePath.c_str());
- if (!inFile)
- {
- return false;
+ if (!inFile) {
+ return false;
}
std::string line;
@@ -74,13 +73,13 @@ bool TargetNgramFeature::Load(const std::string &filePath)
const FFState* TargetNgramFeature::EmptyHypothesisState(const InputType &/*input*/) const
{
- vector<Word> bos(1,m_bos);
+ vector<Word> bos(1,m_bos);
return new TargetNgramState(bos);
}
FFState* TargetNgramFeature::Evaluate(const Hypothesis& cur_hypo,
- const FFState* prev_state,
- ScoreComponentCollection* accumulator) const
+ const FFState* prev_state,
+ ScoreComponentCollection* accumulator) const
{
const TargetNgramState* tnState = static_cast<const TargetNgramState*>(prev_state);
assert(tnState);
@@ -99,92 +98,92 @@ FFState* TargetNgramFeature::Evaluate(const Hypothesis& cur_hypo,
if (m_lower_ngrams) smallest_n = 1;
for (size_t n = m_n; n >= smallest_n; --n) { // iterate over ngram size
- for (size_t i = 0; i < targetPhrase.GetSize(); ++i) {
+ for (size_t i = 0; i < targetPhrase.GetSize(); ++i) {
// const string& curr_w = targetPhrase.GetWord(i).GetFactor(m_factorType)->GetString();
- const StringPiece& curr_w = targetPhrase.GetWord(i).GetString(m_factorType);
-
- if (m_vocab.size() && (FindStringPiece(m_vocab, curr_w) == m_vocab.end())) continue; // skip ngrams
-
- if (n > 1) {
- // can we build an ngram at this position? ("<s> this" --> cannot build 3gram at this position)
- size_t pos_in_translation = cur_hypo.GetSize() - targetPhrase.GetSize() + i;
- if (pos_in_translation < n - 2) continue; // need at least m_n - 1 words
-
- // how many words needed from previous state?
- int from_prev_state = n - (i+1);
- skip = false;
- if (from_prev_state > 0) {
- if (prev_words.size() < from_prev_state) {
- // context is too short, make new state from previous state and target phrase
- vector<Word> new_prev_words;
- for (size_t i = 0; i < prev_words.size(); ++i)
- new_prev_words.push_back(prev_words[i]);
- for (size_t i = 0; i < targetPhrase.GetSize(); ++i)
- new_prev_words.push_back(targetPhrase.GetWord(i));
- return new TargetNgramState(new_prev_words);
- }
-
- // add words from previous state
- for (size_t j = prev_words.size()-from_prev_state; j < prev_words.size() && !skip; ++j)
- appendNgram(prev_words[j], skip, curr_ngram);
+ const StringPiece& curr_w = targetPhrase.GetWord(i).GetString(m_factorType);
+
+ if (m_vocab.size() && (FindStringPiece(m_vocab, curr_w) == m_vocab.end())) continue; // skip ngrams
+
+ if (n > 1) {
+ // can we build an ngram at this position? ("<s> this" --> cannot build 3gram at this position)
+ size_t pos_in_translation = cur_hypo.GetSize() - targetPhrase.GetSize() + i;
+ if (pos_in_translation < n - 2) continue; // need at least m_n - 1 words
+
+ // how many words needed from previous state?
+ int from_prev_state = n - (i+1);
+ skip = false;
+ if (from_prev_state > 0) {
+ if (prev_words.size() < from_prev_state) {
+ // context is too short, make new state from previous state and target phrase
+ vector<Word> new_prev_words;
+ for (size_t i = 0; i < prev_words.size(); ++i)
+ new_prev_words.push_back(prev_words[i]);
+ for (size_t i = 0; i < targetPhrase.GetSize(); ++i)
+ new_prev_words.push_back(targetPhrase.GetWord(i));
+ return new TargetNgramState(new_prev_words);
+ }
+
+ // add words from previous state
+ for (size_t j = prev_words.size()-from_prev_state; j < prev_words.size() && !skip; ++j)
+ appendNgram(prev_words[j], skip, curr_ngram);
}
- // add words from current target phrase
- int start = i - n + 1; // add m_n-1 previous words
- if (start < 0) start = 0; // or less
- for (size_t j = start; j < i && !skip; ++j)
- appendNgram(targetPhrase.GetWord(j), skip, curr_ngram);
+ // add words from current target phrase
+ int start = i - n + 1; // add m_n-1 previous words
+ if (start < 0) start = 0; // or less
+ for (size_t j = start; j < i && !skip; ++j)
+ appendNgram(targetPhrase.GetWord(j), skip, curr_ngram);
}
- if (!skip) {
- curr_ngram << curr_w;
- accumulator->PlusEquals(this,curr_ngram.str(),1);
+ if (!skip) {
+ curr_ngram << curr_w;
+ accumulator->PlusEquals(this,curr_ngram.str(),1);
}
- curr_ngram.str("");
- }
+ curr_ngram.str("");
+ }
}
if (cur_hypo.GetWordsBitmap().IsComplete()) {
- for (size_t n = m_n; n >= smallest_n; --n) {
- stringstream last_ngram;
- skip = false;
- for (size_t i = cur_hypo.GetSize() - n + 1; i < cur_hypo.GetSize() && !skip; ++i)
- appendNgram(cur_hypo.GetWord(i), skip, last_ngram);
-
- if (n > 1 && !skip) {
- last_ngram << EOS_;
- accumulator->PlusEquals(this, last_ngram.str(), 1);
- }
- }
- return NULL;
+ for (size_t n = m_n; n >= smallest_n; --n) {
+ stringstream last_ngram;
+ skip = false;
+ for (size_t i = cur_hypo.GetSize() - n + 1; i < cur_hypo.GetSize() && !skip; ++i)
+ appendNgram(cur_hypo.GetWord(i), skip, last_ngram);
+
+ if (n > 1 && !skip) {
+ last_ngram << EOS_;
+ accumulator->PlusEquals(this, last_ngram.str(), 1);
+ }
+ }
+ return NULL;
}
// prepare new state
vector<Word> new_prev_words;
if (targetPhrase.GetSize() >= m_n-1) {
- // take subset of target words
- for (size_t i = targetPhrase.GetSize() - m_n + 1; i < targetPhrase.GetSize(); ++i)
- new_prev_words.push_back(targetPhrase.GetWord(i));
- }
- else {
- // take words from previous state and from target phrase
- int from_prev_state = m_n - 1 - targetPhrase.GetSize();
- for (size_t i = prev_words.size()-from_prev_state; i < prev_words.size(); ++i)
- new_prev_words.push_back(prev_words[i]);
- for (size_t i = 0; i < targetPhrase.GetSize(); ++i)
- new_prev_words.push_back(targetPhrase.GetWord(i));
+ // take subset of target words
+ for (size_t i = targetPhrase.GetSize() - m_n + 1; i < targetPhrase.GetSize(); ++i)
+ new_prev_words.push_back(targetPhrase.GetWord(i));
+ } else {
+ // take words from previous state and from target phrase
+ int from_prev_state = m_n - 1 - targetPhrase.GetSize();
+ for (size_t i = prev_words.size()-from_prev_state; i < prev_words.size(); ++i)
+ new_prev_words.push_back(prev_words[i]);
+ for (size_t i = 0; i < targetPhrase.GetSize(); ++i)
+ new_prev_words.push_back(targetPhrase.GetWord(i));
}
return new TargetNgramState(new_prev_words);
}
-void TargetNgramFeature::appendNgram(const Word& word, bool& skip, stringstream &ngram) const {
+void TargetNgramFeature::appendNgram(const Word& word, bool& skip, stringstream &ngram) const
+{
// const string& w = word.GetFactor(m_factorType)->GetString();
- const StringPiece& w = word.GetString(m_factorType);
- if (m_vocab.size() && (FindStringPiece(m_vocab, w) == m_vocab.end())) skip = true;
- else {
- ngram << w;
- ngram << ":";
- }
+ const StringPiece& w = word.GetString(m_factorType);
+ if (m_vocab.size() && (FindStringPiece(m_vocab, w) == m_vocab.end())) skip = true;
+ else {
+ ngram << w;
+ ngram << ":";
+ }
}
FFState* TargetNgramFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int featureId, ScoreComponentCollection* accumulator) const
@@ -205,159 +204,149 @@ FFState* TargetNgramFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int
bool onlyTerminals = true;
bool prev_is_NT = false;
size_t prev_subPhraseLength = 0;
- for (size_t phrasePos = 0; phrasePos < cur_hypo.GetCurrTargetPhrase().GetSize(); phrasePos++)
- {
+ for (size_t phrasePos = 0; phrasePos < cur_hypo.GetCurrTargetPhrase().GetSize(); phrasePos++) {
// consult rule for either word or non-terminal
const Word &word = cur_hypo.GetCurrTargetPhrase().GetWord(phrasePos);
// cerr << "word: " << word << endl;
// regular word
if (!word.IsNonTerminal()) {
- contextFactor.push_back(&word);
- prev_is_NT = false;
+ contextFactor.push_back(&word);
+ prev_is_NT = false;
if (phrasePos==0)
- makePrefix = true;
+ makePrefix = true;
if (phrasePos==cur_hypo.GetCurrTargetPhrase().GetSize()-1 || prev_is_NT)
- makeSuffix = true;
-
+ makeSuffix = true;
+
// beginning/end of sentence symbol <s>,</s>?
StringPiece factorZero = word.GetString(0);
if (factorZero.compare("<s>") == 0)
- prefixTerminals++;
+ prefixTerminals++;
// end of sentence symbol </s>?
else if (factorZero.compare("</s>") == 0)
- suffixTerminals++;
+ suffixTerminals++;
// everything else
else {
- stringstream ngram;
- ngram << m_baseName;
- if (m_factorType == 0)
- ngram << factorZero;
- else
- ngram << word.GetString(m_factorType);
- accumulator->SparsePlusEquals(ngram.str(), 1);
-
- if (collectForPrefix)
- prefixTerminals++;
- else
- suffixTerminals++;
+ stringstream ngram;
+ ngram << m_baseName;
+ if (m_factorType == 0)
+ ngram << factorZero;
+ else
+ ngram << word.GetString(m_factorType);
+ accumulator->SparsePlusEquals(ngram.str(), 1);
+
+ if (collectForPrefix)
+ prefixTerminals++;
+ else
+ suffixTerminals++;
}
}
// non-terminal, add phrase from underlying hypothesis
- else if (m_n > 1)
- {
+ else if (m_n > 1) {
// look up underlying hypothesis
size_t nonTermIndex = nonTermIndexMap[phrasePos];
const ChartHypothesis *prevHypo = cur_hypo.GetPrevHypo(nonTermIndex);
const TargetNgramChartState* prevState =
- static_cast<const TargetNgramChartState*>(prevHypo->GetFFState(featureId));
+ static_cast<const TargetNgramChartState*>(prevHypo->GetFFState(featureId));
size_t subPhraseLength = prevState->GetNumTargetTerminals();
// special case: rule starts with non-terminal
if (phrasePos == 0) {
- if (subPhraseLength == 1) {
- makePrefix = true;
- ++prefixTerminals;
+ if (subPhraseLength == 1) {
+ makePrefix = true;
+ ++prefixTerminals;
- const Word &word = prevState->GetSuffix().GetWord(0);
+ const Word &word = prevState->GetSuffix().GetWord(0);
// cerr << "NT0 --> : " << word << endl;
- contextFactor.push_back(&word);
- }
- else {
- onlyTerminals = false;
- collectForPrefix = false;
- int suffixPos = prevState->GetSuffix().GetSize() - (m_n-1);
- if (suffixPos < 0) suffixPos = 0; // push all words if less than order
- for(;(size_t)suffixPos < prevState->GetSuffix().GetSize(); suffixPos++)
- {
- const Word &word = prevState->GetSuffix().GetWord(suffixPos);
+ contextFactor.push_back(&word);
+ } else {
+ onlyTerminals = false;
+ collectForPrefix = false;
+ int suffixPos = prevState->GetSuffix().GetSize() - (m_n-1);
+ if (suffixPos < 0) suffixPos = 0; // push all words if less than order
+ for(; (size_t)suffixPos < prevState->GetSuffix().GetSize(); suffixPos++) {
+ const Word &word = prevState->GetSuffix().GetWord(suffixPos);
// cerr << "NT0 --> : " << word << endl;
- contextFactor.push_back(&word);
- }
- }
+ contextFactor.push_back(&word);
+ }
+ }
}
// internal non-terminal
- else
- {
- // push its prefix
- for(size_t prefixPos = 0; prefixPos < m_n-1
- && prefixPos < subPhraseLength; prefixPos++)
- {
+ else {
+ // push its prefix
+ for(size_t prefixPos = 0; prefixPos < m_n-1
+ && prefixPos < subPhraseLength; prefixPos++) {
const Word &word = prevState->GetPrefix().GetWord(prefixPos);
// cerr << "NT --> " << word << endl;
contextFactor.push_back(&word);
}
- if (subPhraseLength==1) {
- if (collectForPrefix)
- ++prefixTerminals;
- else
- ++suffixTerminals;
-
- if (phrasePos == cur_hypo.GetCurrTargetPhrase().GetSize()-1)
- makeSuffix = true;
- }
- else {
- onlyTerminals = false;
- collectForPrefix = true;
-
- // check if something follows this NT
- bool wordFollowing = (phrasePos < cur_hypo.GetCurrTargetPhrase().GetSize() - 1)? true : false;
-
- // check if we are dealing with a large sub-phrase
- if (wordFollowing && subPhraseLength > m_n - 1)
- {
- // clear up pending ngrams
- MakePrefixNgrams(contextFactor, accumulator, prefixTerminals);
- contextFactor.clear();
- makePrefix = false;
- makeSuffix = true;
- collectForPrefix = false;
- prefixTerminals = 0;
- suffixTerminals = 0;
-
- // push its suffix
- size_t remainingWords = (remainingWords > m_n-1) ? m_n-1 : subPhraseLength - (m_n-1);
- for(size_t suffixPos = 0; suffixPos < prevState->GetSuffix().GetSize(); suffixPos++) {
- const Word &word = prevState->GetSuffix().GetWord(suffixPos);
+ if (subPhraseLength==1) {
+ if (collectForPrefix)
+ ++prefixTerminals;
+ else
+ ++suffixTerminals;
+
+ if (phrasePos == cur_hypo.GetCurrTargetPhrase().GetSize()-1)
+ makeSuffix = true;
+ } else {
+ onlyTerminals = false;
+ collectForPrefix = true;
+
+ // check if something follows this NT
+ bool wordFollowing = (phrasePos < cur_hypo.GetCurrTargetPhrase().GetSize() - 1)? true : false;
+
+ // check if we are dealing with a large sub-phrase
+ if (wordFollowing && subPhraseLength > m_n - 1) {
+ // clear up pending ngrams
+ MakePrefixNgrams(contextFactor, accumulator, prefixTerminals);
+ contextFactor.clear();
+ makePrefix = false;
+ makeSuffix = true;
+ collectForPrefix = false;
+ prefixTerminals = 0;
+ suffixTerminals = 0;
+
+ // push its suffix
+ size_t remainingWords = (remainingWords > m_n-1) ? m_n-1 : subPhraseLength - (m_n-1);
+ for(size_t suffixPos = 0; suffixPos < prevState->GetSuffix().GetSize(); suffixPos++) {
+ const Word &word = prevState->GetSuffix().GetWord(suffixPos);
// cerr << "NT --> : " << word << endl;
- contextFactor.push_back(&word);
- }
- }
- // subphrase can be used as suffix and as prefix for the next part
- else if (wordFollowing && subPhraseLength == m_n - 1)
- {
- // clear up pending ngrams
- MakePrefixNgrams(contextFactor, accumulator, prefixTerminals);
- makePrefix = false;
- makeSuffix = true;
- collectForPrefix = false;
- prefixTerminals = 0;
- suffixTerminals = 0;
- }
- else if (prev_is_NT && prev_subPhraseLength > 1 && subPhraseLength > 1) {
- // two NTs in a row: make transition
- MakePrefixNgrams(contextFactor, accumulator, 1, m_n-2);
- MakeSuffixNgrams(contextFactor, accumulator, 1, m_n-2);
- makePrefix = false;
- makeSuffix = false;
- collectForPrefix = false;
- prefixTerminals = 0;
- suffixTerminals = 0;
-
- // remove duplicates
- stringstream curr_ngram;
- curr_ngram << m_baseName;
- curr_ngram << (*contextFactor[m_n-2]).GetString(m_factorType);
- curr_ngram << ":";
- curr_ngram << (*contextFactor[m_n-1]).GetString(m_factorType);
- accumulator->SparseMinusEquals(curr_ngram.str(),1);
- }
- }
+ contextFactor.push_back(&word);
+ }
+ }
+ // subphrase can be used as suffix and as prefix for the next part
+ else if (wordFollowing && subPhraseLength == m_n - 1) {
+ // clear up pending ngrams
+ MakePrefixNgrams(contextFactor, accumulator, prefixTerminals);
+ makePrefix = false;
+ makeSuffix = true;
+ collectForPrefix = false;
+ prefixTerminals = 0;
+ suffixTerminals = 0;
+ } else if (prev_is_NT && prev_subPhraseLength > 1 && subPhraseLength > 1) {
+ // two NTs in a row: make transition
+ MakePrefixNgrams(contextFactor, accumulator, 1, m_n-2);
+ MakeSuffixNgrams(contextFactor, accumulator, 1, m_n-2);
+ makePrefix = false;
+ makeSuffix = false;
+ collectForPrefix = false;
+ prefixTerminals = 0;
+ suffixTerminals = 0;
+
+ // remove duplicates
+ stringstream curr_ngram;
+ curr_ngram << m_baseName;
+ curr_ngram << (*contextFactor[m_n-2]).GetString(m_factorType);
+ curr_ngram << ":";
+ curr_ngram << (*contextFactor[m_n-1]).GetString(m_factorType);
+ accumulator->SparseMinusEquals(curr_ngram.str(),1);
+ }
+ }
}
prev_is_NT = true;
prev_subPhraseLength = subPhraseLength;
@@ -366,25 +355,24 @@ FFState* TargetNgramFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int
if (m_n > 1) {
if (onlyTerminals) {
- MakePrefixNgrams(contextFactor, accumulator, prefixTerminals-1);
- }
- else {
+ MakePrefixNgrams(contextFactor, accumulator, prefixTerminals-1);
+ } else {
if (makePrefix)
- MakePrefixNgrams(contextFactor, accumulator, prefixTerminals);
+ MakePrefixNgrams(contextFactor, accumulator, prefixTerminals);
if (makeSuffix)
- MakeSuffixNgrams(contextFactor, accumulator, suffixTerminals);
+ MakeSuffixNgrams(contextFactor, accumulator, suffixTerminals);
// remove duplicates
size_t size = contextFactor.size();
if (makePrefix && makeSuffix && (size <= m_n)) {
- stringstream curr_ngram;
- curr_ngram << m_baseName;
- for (size_t i = 0; i < size; ++i) {
- curr_ngram << (*contextFactor[i]).GetString(m_factorType);
- if (i < size-1)
- curr_ngram << ":";
- }
- accumulator->SparseMinusEquals(curr_ngram.str(), 1);
+ stringstream curr_ngram;
+ curr_ngram << m_baseName;
+ for (size_t i = 0; i < size; ++i) {
+ curr_ngram << (*contextFactor[i]).GetString(m_factorType);
+ if (i < size-1)
+ curr_ngram << ":";
+ }
+ accumulator->SparseMinusEquals(curr_ngram.str(), 1);
}
}
}
@@ -393,22 +381,23 @@ FFState* TargetNgramFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int
return new TargetNgramChartState(cur_hypo, featureId, m_n);
}
-void TargetNgramFeature::MakePrefixNgrams(std::vector<const Word*> &contextFactor, ScoreComponentCollection* accumulator, size_t numberOfStartPos, size_t offset) const {
- stringstream ngram;
- size_t size = contextFactor.size();
+void TargetNgramFeature::MakePrefixNgrams(std::vector<const Word*> &contextFactor, ScoreComponentCollection* accumulator, size_t numberOfStartPos, size_t offset) const
+{
+ stringstream ngram;
+ size_t size = contextFactor.size();
for (size_t k = 0; k < numberOfStartPos; ++k) {
size_t max_end = (size < m_n+k+offset)? size: m_n+k+offset;
for (size_t end_pos = 1+k+offset; end_pos < max_end; ++end_pos) {
ngram << m_baseName;
- for (size_t i=k+offset; i <= end_pos; ++i) {
- if (i > k+offset)
- ngram << ":";
+ for (size_t i=k+offset; i <= end_pos; ++i) {
+ if (i > k+offset)
+ ngram << ":";
StringPiece factorZero = (*contextFactor[i]).GetString(0);
if (m_factorType == 0 || factorZero.compare("<s>") == 0 || factorZero.compare("</s>") == 0)
- ngram << factorZero;
- else
- ngram << (*contextFactor[i]).GetString(m_factorType);
- const Word w = *contextFactor[i];
+ ngram << factorZero;
+ else
+ ngram << (*contextFactor[i]).GetString(m_factorType);
+ const Word w = *contextFactor[i];
}
// cerr << "p-ngram: " << ngram.str() << endl;
accumulator->SparsePlusEquals(ngram.str(), 1);
@@ -417,21 +406,22 @@ void TargetNgramFeature::MakePrefixNgrams(std::vector<const Word*> &contextFacto
}
}
-void TargetNgramFeature::MakeSuffixNgrams(std::vector<const Word*> &contextFactor, ScoreComponentCollection* accumulator, size_t numberOfEndPos, size_t offset) const {
- stringstream ngram;
+void TargetNgramFeature::MakeSuffixNgrams(std::vector<const Word*> &contextFactor, ScoreComponentCollection* accumulator, size_t numberOfEndPos, size_t offset) const
+{
+ stringstream ngram;
for (size_t k = 0; k < numberOfEndPos; ++k) {
size_t end_pos = contextFactor.size()-1-k-offset;
for (int start_pos=end_pos-1; (start_pos >= 0) && (end_pos-start_pos < m_n); --start_pos) {
- ngram << m_baseName;
- for (size_t j=start_pos; j <= end_pos; ++j){
- StringPiece factorZero = (*contextFactor[j]).GetString(0);
- if (m_factorType == 0 || factorZero.compare("<s>") == 0 || factorZero.compare("</s>") == 0)
- ngram << factorZero;
- else
- ngram << (*contextFactor[j]).GetString(m_factorType);
- if (j < end_pos)
- ngram << ":";
- }
+ ngram << m_baseName;
+ for (size_t j=start_pos; j <= end_pos; ++j) {
+ StringPiece factorZero = (*contextFactor[j]).GetString(0);
+ if (m_factorType == 0 || factorZero.compare("<s>") == 0 || factorZero.compare("</s>") == 0)
+ ngram << factorZero;
+ else
+ ngram << (*contextFactor[j]).GetString(m_factorType);
+ if (j < end_pos)
+ ngram << ":";
+ }
// cerr << "s-ngram: " << ngram.str() << endl;
accumulator->SparsePlusEquals(ngram.str(), 1);
ngram.str("");
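
Two notes on the hunk above. First, TargetNgramState::Compare() orders states by comparing words pairwise over the shared prefix and, when one word sequence is a prefix of the other, ranking the shorter state first; this is the ordering used when recombining hypotheses. Second, the reformatting preserves a pre-existing quirk: "size_t remainingWords = (remainingWords > m_n-1) ? ..." initializes the variable from its own uninitialized value. The comparison logic, restated on plain strings (Word::Compare swapped for std::string::compare in this sketch):

#include <cstddef>
#include <string>
#include <vector>

int CompareNgramStates(const std::vector<std::string>& a,
                       const std::vector<std::string>& b) {
  const size_t n = a.size() < b.size() ? a.size() : b.size();
  for (size_t i = 0; i < n; ++i) {
    const int c = a[i].compare(b[i]);
    if (c != 0) return c;                 // first differing word decides
  }
  if (a.size() == b.size()) return 0;
  return a.size() < b.size() ? -1 : 1;    // shorter state sorts first
}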
diff --git a/moses/FF/TargetNgramFeature.h b/moses/FF/TargetNgramFeature.h
index b50391d43..8001f2f87 100644
--- a/moses/FF/TargetNgramFeature.h
+++ b/moses/FF/TargetNgramFeature.h
@@ -16,14 +16,17 @@
namespace Moses
{
-class TargetNgramState : public FFState {
- public:
- TargetNgramState(std::vector<Word> &words): m_words(words) {}
- const std::vector<Word> GetWords() const {return m_words;}
- virtual int Compare(const FFState& other) const;
-
- private:
- std::vector<Word> m_words;
+class TargetNgramState : public FFState
+{
+public:
+ TargetNgramState(std::vector<Word> &words): m_words(words) {}
+ const std::vector<Word> GetWords() const {
+ return m_words;
+ }
+ virtual int Compare(const FFState& other) const;
+
+private:
+ std::vector<Word> m_words;
};
class TargetNgramChartState : public FFState
@@ -39,8 +42,7 @@ private:
* \param ret prefix string
* \param size maximum size (typically max lm context window)
*/
- size_t CalcPrefix(const ChartHypothesis &hypo, const int featureId, Phrase &ret, size_t size) const
- {
+ size_t CalcPrefix(const ChartHypothesis &hypo, const int featureId, Phrase &ret, size_t size) const {
const TargetPhrase &target = hypo.GetCurrTargetPhrase();
const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
target.GetAlignNonTerm().GetNonTermIndexMap();
@@ -76,9 +78,8 @@ private:
* \param ret suffix phrase
* \param size maximum size of suffix
*/
- size_t CalcSuffix(const ChartHypothesis &hypo, int featureId, Phrase &ret, size_t size) const
- {
- size_t prefixSize = m_contextPrefix.GetSize();
+ size_t CalcSuffix(const ChartHypothesis &hypo, int featureId, Phrase &ret, size_t size) const {
+ size_t prefixSize = m_contextPrefix.GetSize();
assert(prefixSize <= m_numTargetTerminals);
// special handling for small hypotheses
@@ -98,9 +99,9 @@ private:
}
// construct suffix analogous to prefix
else {
- const TargetPhrase targetPhrase = hypo.GetCurrTargetPhrase();
+ const TargetPhrase targetPhrase = hypo.GetCurrTargetPhrase();
const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
- targetPhrase.GetAlignTerm().GetNonTermIndexMap();
+ targetPhrase.GetAlignTerm().GetNonTermIndexMap();
for (int pos = (int) targetPhrase.GetSize() - 1; pos >= 0 ; --pos) {
const Word &word = targetPhrase.GetWord(pos);
@@ -108,8 +109,7 @@ private:
size_t nonTermInd = nonTermIndexMap[pos];
const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermInd);
size = static_cast<const TargetNgramChartState*>(prevHypo->GetFFState(featureId))->CalcSuffix(*prevHypo, featureId, ret, size);
- }
- else {
+ } else {
ret.PrependWord(word);
size--;
}
@@ -124,9 +124,8 @@ private:
public:
TargetNgramChartState(const ChartHypothesis &hypo, int featureId, size_t order)
- :m_contextPrefix(order - 1),
- m_contextSuffix(order - 1)
- {
+ :m_contextPrefix(order - 1),
+ m_contextSuffix(order - 1) {
m_numTargetTerminals = hypo.GetCurrTargetPhrase().GetNumTerminals();
const WordsRange range = hypo.GetCurrSourceRange();
m_startPos = range.GetStartPos();
@@ -159,15 +158,13 @@ public:
static_cast<const TargetNgramChartState &>( o );
// prefix
- if (m_startPos > 0) // not for "<s> ..."
- {
+ if (m_startPos > 0) { // not for "<s> ..."
int ret = GetPrefix().Compare(other.GetPrefix());
if (ret != 0)
return ret;
}
- if (m_endPos < m_inputSize - 1)// not for "... </s>"
- {
+ if (m_endPos < m_inputSize - 1) { // not for "... </s>"
int ret = GetSuffix().Compare(other.GetSuffix());
if (ret != 0)
return ret;
@@ -178,34 +175,35 @@ public:
/** Sets the features of observed ngrams.
*/
-class TargetNgramFeature : public StatefulFeatureFunction {
+class TargetNgramFeature : public StatefulFeatureFunction
+{
public:
TargetNgramFeature(const std::string &line);
- bool Load(const std::string &filePath);
+ bool Load(const std::string &filePath);
- virtual const FFState* EmptyHypothesisState(const InputType &input) const;
+ virtual const FFState* EmptyHypothesisState(const InputType &input) const;
- virtual FFState* Evaluate(const Hypothesis& cur_hypo, const FFState* prev_state,
- ScoreComponentCollection* accumulator) const;
+ virtual FFState* Evaluate(const Hypothesis& cur_hypo, const FFState* prev_state,
+ ScoreComponentCollection* accumulator) const;
virtual FFState* EvaluateChart(const ChartHypothesis& cur_hypo, int featureId,
- ScoreComponentCollection* accumulator) const;
+ ScoreComponentCollection* accumulator) const;
private:
FactorType m_factorType;
Word m_bos;
boost::unordered_set<std::string> m_vocab;
- size_t m_n;
- bool m_lower_ngrams;
+ size_t m_n;
+ bool m_lower_ngrams;
- std::string m_baseName;
+ std::string m_baseName;
- void appendNgram(const Word& word, bool& skip, std::stringstream& ngram) const;
- void MakePrefixNgrams(std::vector<const Word*> &contextFactor, ScoreComponentCollection* accumulator,
- size_t numberOfStartPos = 1, size_t offset = 0) const;
- void MakeSuffixNgrams(std::vector<const Word*> &contextFactor, ScoreComponentCollection* accumulator,
- size_t numberOfEndPos = 1, size_t offset = 0) const;
+ void appendNgram(const Word& word, bool& skip, std::stringstream& ngram) const;
+ void MakePrefixNgrams(std::vector<const Word*> &contextFactor, ScoreComponentCollection* accumulator,
+ size_t numberOfStartPos = 1, size_t offset = 0) const;
+ void MakeSuffixNgrams(std::vector<const Word*> &contextFactor, ScoreComponentCollection* accumulator,
+ size_t numberOfEndPos = 1, size_t offset = 0) const;
};
}
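
For the single-start case (numberOfStartPos = 1, offset = 0), MakePrefixNgrams() in the hunk above emits every n-gram anchored at the start of the context, from bigrams up to the model order, joining words with ':' behind the feature's base name. A standalone sketch under those assumptions (the function name and the "tn_" base are illustrative, and factor selection is abstracted to plain words):

#include <cstddef>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

std::vector<std::string> PrefixNgrams(const std::vector<std::string>& context,
                                      size_t order,
                                      const std::string& baseName) {
  std::vector<std::string> out;
  const size_t maxLen = context.size() < order ? context.size() : order;
  for (size_t len = 2; len <= maxLen; ++len) { // bigrams up to the order
    std::ostringstream ngram;
    ngram << baseName;
    for (size_t i = 0; i < len; ++i) {
      if (i) ngram << ":";
      ngram << context[i];
    }
    out.push_back(ngram.str());
  }
  return out;
}

int main() {
  std::vector<std::string> ctx;
  ctx.push_back("<s>"); ctx.push_back("the"); ctx.push_back("house");
  const std::vector<std::string> grams = PrefixNgrams(ctx, 3, "tn_");
  for (size_t i = 0; i < grams.size(); ++i)
    std::cout << grams[i] << std::endl; // tn_<s>:the  then  tn_<s>:the:house
  return 0;
}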
diff --git a/moses/FF/TargetWordInsertionFeature.cpp b/moses/FF/TargetWordInsertionFeature.cpp
index 386e943be..f20a652e4 100644
--- a/moses/FF/TargetWordInsertionFeature.cpp
+++ b/moses/FF/TargetWordInsertionFeature.cpp
@@ -9,13 +9,14 @@
#include "moses/UserMessage.h"
#include "util/string_piece_hash.hh"
-namespace Moses {
+namespace Moses
+{
using namespace std;
TargetWordInsertionFeature::TargetWordInsertionFeature(const std::string &line)
-:StatelessFeatureFunction("TargetWordInsertionFeature", 0, line),
-m_unrestricted(true)
+ :StatelessFeatureFunction("TargetWordInsertionFeature", 0, line),
+ m_unrestricted(true)
{
std::cerr << "Initializing target word insertion feature.." << std::endl;
@@ -26,11 +27,9 @@ m_unrestricted(true)
if (args[0] == "factor") {
m_factorType = Scan<FactorType>(args[1]);
- }
- else if (args[0] == "path") {
+ } else if (args[0] == "path") {
filename = args[1];
- }
- else {
+ } else {
throw "Unknown argument " + args[0];
}
}
@@ -46,13 +45,12 @@ m_unrestricted(true)
}
-bool TargetWordInsertionFeature::Load(const std::string &filePath)
+bool TargetWordInsertionFeature::Load(const std::string &filePath)
{
ifstream inFile(filePath.c_str());
- if (!inFile)
- {
- cerr << "could not open file " << filePath << endl;
- return false;
+ if (!inFile) {
+ cerr << "could not open file " << filePath << endl;
+ return false;
}
std::string line;
@@ -67,18 +65,18 @@ bool TargetWordInsertionFeature::Load(const std::string &filePath)
}
void TargetWordInsertionFeature::Evaluate(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const
{
const AlignmentInfo &alignmentInfo = targetPhrase.GetAlignTerm();
ComputeFeatures(source, targetPhrase, &scoreBreakdown, alignmentInfo);
}
void TargetWordInsertionFeature::ComputeFeatures(const Phrase &source,
- const TargetPhrase& targetPhrase,
- ScoreComponentCollection* accumulator,
- const AlignmentInfo &alignmentInfo) const
+ const TargetPhrase& targetPhrase,
+ ScoreComponentCollection* accumulator,
+ const AlignmentInfo &alignmentInfo) const
{
// handle special case: unknown words (they have no word alignment)
size_t targetLength = targetPhrase.GetSize();
@@ -100,15 +98,14 @@ void TargetWordInsertionFeature::ComputeFeatures(const Phrase &source,
if (!aligned[i]) {
Word w = targetPhrase.GetWord(i);
if (!w.IsNonTerminal()) {
- const StringPiece word = w.GetFactor(m_factorType)->GetString();
- if (word != "<s>" && word != "</s>") {
- if (!m_unrestricted && FindStringPiece(m_vocab, word ) == m_vocab.end()) {
- accumulator->PlusEquals(this,StringPiece("OTHER"),1);
- }
- else {
- accumulator->PlusEquals(this,word,1);
- }
- }
+ const StringPiece word = w.GetFactor(m_factorType)->GetString();
+ if (word != "<s>" && word != "</s>") {
+ if (!m_unrestricted && FindStringPiece(m_vocab, word ) == m_vocab.end()) {
+ accumulator->PlusEquals(this,StringPiece("OTHER"),1);
+ } else {
+ accumulator->PlusEquals(this,word,1);
+ }
+ }
}
}
}
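
Note: ComputeFeatures above fires one sparse feature per unaligned target word, skipping sentence markers and backing off to "OTHER" when a restricted vocabulary is loaded. A self-contained sketch of that accumulation, with STL containers standing in for StringPiece and ScoreComponentCollection and all names hypothetical:

// Sketch: count target words that carry no alignment point.
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> target = {"the", "green", "house"};
  std::set<size_t> aligned = {0, 2};        // target indices with alignment points
  std::set<std::string> vocab = {"green"};  // restricted vocabulary
  bool unrestricted = false;

  std::map<std::string, float> sparse;      // stands in for the accumulator
  for (size_t i = 0; i < target.size(); ++i) {
    if (aligned.count(i)) continue;         // word was aligned, no insertion
    const std::string &w = target[i];
    if (w == "<s>" || w == "</s>") continue;
    if (!unrestricted && !vocab.count(w))
      sparse["OTHER"] += 1;                 // out-of-vocabulary back-off
    else
      sparse[w] += 1;
  }
  for (std::map<std::string, float>::const_iterator it = sparse.begin();
       it != sparse.end(); ++it)
    std::cout << it->first << " " << it->second << std::endl;   // green 1
  return 0;
}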
diff --git a/moses/FF/TargetWordInsertionFeature.h b/moses/FF/TargetWordInsertionFeature.h
index aabc4cffc..50f7e5f88 100644
--- a/moses/FF/TargetWordInsertionFeature.h
+++ b/moses/FF/TargetWordInsertionFeature.h
@@ -13,7 +13,8 @@ namespace Moses
/** Sets the features for target words inserted without any aligned source word.
 */
-class TargetWordInsertionFeature : public StatelessFeatureFunction {
+class TargetWordInsertionFeature : public StatelessFeatureFunction
+{
private:
boost::unordered_set<std::string> m_vocab;
FactorType m_factorType;
@@ -21,18 +22,18 @@ private:
public:
TargetWordInsertionFeature(const std::string &line);
-
+
bool Load(const std::string &filePath);
virtual void Evaluate(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const;
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const;
void ComputeFeatures(const Phrase &source,
- const TargetPhrase& targetPhrase,
- ScoreComponentCollection* accumulator,
- const AlignmentInfo &alignmentInfo) const;
+ const TargetPhrase& targetPhrase,
+ ScoreComponentCollection* accumulator,
+ const AlignmentInfo &alignmentInfo) const;
};
diff --git a/moses/FF/UnknownWordPenaltyProducer.h b/moses/FF/UnknownWordPenaltyProducer.h
index b60967746..200033cfc 100644
--- a/moses/FF/UnknownWordPenaltyProducer.h
+++ b/moses/FF/UnknownWordPenaltyProducer.h
@@ -14,10 +14,9 @@ class WordsRange;
class UnknownWordPenaltyProducer : public StatelessFeatureFunction
{
public:
- UnknownWordPenaltyProducer(const std::string &line)
- : StatelessFeatureFunction("UnknownWordPenalty",1, line)
- {
- m_tuneable = false;
+ UnknownWordPenaltyProducer(const std::string &line)
+ : StatelessFeatureFunction("UnknownWordPenalty",1, line) {
+ m_tuneable = false;
}
};
diff --git a/moses/FF/WordPenaltyProducer.cpp b/moses/FF/WordPenaltyProducer.cpp
index ba97852e4..1dc425742 100644
--- a/moses/FF/WordPenaltyProducer.cpp
+++ b/moses/FF/WordPenaltyProducer.cpp
@@ -5,9 +5,9 @@
namespace Moses
{
void WordPenaltyProducer::Evaluate(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const
{
float score = - (float) targetPhrase.GetNumTerminals();
scoreBreakdown.Assign(this, score);
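
Note: since GetNumTerminals() counts only terminal words, a three-word target phrase contributes -3 to the dense word-penalty component. As a quick arithmetic sketch:

// Sketch: the word-penalty score for a 3-word target phrase.
#include <cstdio>

int main() {
  size_t numTerminals = 3;                 // e.g. "the green house"
  float score = -(float) numTerminals;     // value assigned to the dense component
  std::printf("%f\n", score);              // -3.000000
  return 0;
}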
diff --git a/moses/FF/WordPenaltyProducer.h b/moses/FF/WordPenaltyProducer.h
index fc824dd84..1892c459c 100644
--- a/moses/FF/WordPenaltyProducer.h
+++ b/moses/FF/WordPenaltyProducer.h
@@ -14,12 +14,12 @@ class ScoreComponentCollection;
class WordPenaltyProducer : public StatelessFeatureFunction
{
public:
- WordPenaltyProducer(const std::string &line) : StatelessFeatureFunction("WordPenalty",1, line) {}
+ WordPenaltyProducer(const std::string &line) : StatelessFeatureFunction("WordPenalty",1, line) {}
virtual void Evaluate(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const;
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const;
};
diff --git a/moses/FF/WordTranslationFeature.cpp b/moses/FF/WordTranslationFeature.cpp
index 2648ac9f1..3f282609f 100644
--- a/moses/FF/WordTranslationFeature.cpp
+++ b/moses/FF/WordTranslationFeature.cpp
@@ -10,18 +10,19 @@
#include "moses/UserMessage.h"
#include "util/string_piece_hash.hh"
-namespace Moses {
+namespace Moses
+{
using namespace std;
WordTranslationFeature::WordTranslationFeature(const std::string &line)
-:StatelessFeatureFunction("WordTranslationFeature", 0, line)
-,m_unrestricted(true)
-,m_simple(true)
-,m_sourceContext(false)
-,m_targetContext(false)
-,m_ignorePunctuation(false)
-,m_domainTrigger(false)
+ :StatelessFeatureFunction("WordTranslationFeature", 0, line)
+ ,m_unrestricted(true)
+ ,m_simple(true)
+ ,m_sourceContext(false)
+ ,m_targetContext(false)
+ ,m_ignorePunctuation(false)
+ ,m_domainTrigger(false)
{
std::cerr << "Initializing word translation feature.. " << endl;
@@ -34,35 +35,25 @@ WordTranslationFeature::WordTranslationFeature(const std::string &line)
if (args[0] == "input-factor") {
m_factorTypeSource = Scan<FactorType>(args[1]);
- }
- else if (args[0] == "output-factor") {
+ } else if (args[0] == "output-factor") {
m_factorTypeTarget = Scan<FactorType>(args[1]);
- }
- else if (args[0] == "simple") {
+ } else if (args[0] == "simple") {
m_simple = Scan<bool>(args[1]);
- }
- else if (args[0] == "source-context") {
+ } else if (args[0] == "source-context") {
m_sourceContext = Scan<bool>(args[1]);
- }
- else if (args[0] == "target-context") {
+ } else if (args[0] == "target-context") {
m_targetContext = Scan<bool>(args[1]);
- }
- else if (args[0] == "ignore-punctuation") {
+ } else if (args[0] == "ignore-punctuation") {
m_ignorePunctuation = Scan<bool>(args[1]);
- }
- else if (args[0] == "domain-trigger") {
+ } else if (args[0] == "domain-trigger") {
m_domainTrigger = Scan<bool>(args[1]);
- }
- else if (args[0] == "texttype") {
+ } else if (args[0] == "texttype") {
texttype = args[1];
- }
- else if (args[0] == "source-path") {
+ } else if (args[0] == "source-path") {
filenameSource = args[1];
- }
- else if (args[0] == "target-path") {
+ } else if (args[0] == "target-path") {
filenameTarget = args[1];
- }
- else {
+ } else {
throw "Unknown argument " + args[0];
}
}
@@ -108,65 +99,62 @@ WordTranslationFeature::WordTranslationFeature(const std::string &line)
}
-bool WordTranslationFeature::Load(const std::string &filePathSource, const std::string &filePathTarget)
+bool WordTranslationFeature::Load(const std::string &filePathSource, const std::string &filePathTarget)
{
if (m_domainTrigger) {
// domain trigger terms for each input document
ifstream inFileSource(filePathSource.c_str());
- if (!inFileSource){
+ if (!inFileSource) {
cerr << "could not open file " << filePathSource << endl;
return false;
}
-
+
std::string line;
while (getline(inFileSource, line)) {
- m_vocabDomain.resize(m_vocabDomain.size() + 1);
- vector<string> termVector;
- boost::split(termVector, line, boost::is_any_of("\t "));
- for (size_t i=0; i < termVector.size(); ++i)
- m_vocabDomain.back().insert(termVector[i]);
+ m_vocabDomain.resize(m_vocabDomain.size() + 1);
+ vector<string> termVector;
+ boost::split(termVector, line, boost::is_any_of("\t "));
+ for (size_t i=0; i < termVector.size(); ++i)
+ m_vocabDomain.back().insert(termVector[i]);
}
-
+
inFileSource.close();
- }
- else {
+ } else {
// restricted source word vocabulary
ifstream inFileSource(filePathSource.c_str());
- if (!inFileSource)
- {
- cerr << "could not open file " << filePathSource << endl;
- return false;
- }
-
+ if (!inFileSource) {
+ cerr << "could not open file " << filePathSource << endl;
+ return false;
+ }
+
std::string line;
while (getline(inFileSource, line)) {
m_vocabSource.insert(line);
}
-
+
inFileSource.close();
-
+
// restricted target word vocabulary
ifstream inFileTarget(filePathTarget.c_str());
- if (!inFileTarget)
- {
- cerr << "could not open file " << filePathTarget << endl;
- return false;
- }
-
+ if (!inFileTarget) {
+ cerr << "could not open file " << filePathTarget << endl;
+ return false;
+ }
+
while (getline(inFileTarget, line)) {
m_vocabTarget.insert(line);
}
-
+
inFileTarget.close();
-
+
m_unrestricted = false;
}
return true;
}
void WordTranslationFeature::Evaluate
- (const PhraseBasedFeatureContext& context,
- ScoreComponentCollection* accumulator) const
+(const PhraseBasedFeatureContext& context,
+ ScoreComponentCollection* accumulator) const
{
const Sentence& input = static_cast<const Sentence&>(context.GetSource());
const TargetPhrase& targetPhrase = context.GetTargetPhrase();
@@ -188,7 +176,7 @@ void WordTranslationFeature::Evaluate
char firstChar = sourceWord[0];
CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
if(charIterator != m_punctuationHash.end())
- continue;
+ continue;
firstChar = targetWord[0];
charIterator = m_punctuationHash.find( firstChar );
if(charIterator != m_punctuationHash.end())
@@ -197,9 +185,9 @@ void WordTranslationFeature::Evaluate
if (!m_unrestricted) {
if (FindStringPiece(m_vocabSource, sourceWord) == m_vocabSource.end())
- sourceWord = "OTHER";
+ sourceWord = "OTHER";
if (FindStringPiece(m_vocabTarget, targetWord) == m_vocabTarget.end())
- targetWord = "OTHER";
+ targetWord = "OTHER";
}
if (m_simple) {
@@ -215,174 +203,169 @@ void WordTranslationFeature::Evaluate
const bool use_topicid = input.GetUseTopicId();
const bool use_topicid_prob = input.GetUseTopicIdAndProb();
if (use_topicid || use_topicid_prob) {
- if(use_topicid) {
- // use topicid as trigger
- const long topicid = input.GetTopicId();
- stringstream feature;
- feature << m_description << "_";
- if (topicid == -1)
- feature << "unk";
- else
- feature << topicid;
-
- feature << "_";
- feature << sourceWord;
- feature << "~";
- feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
- }
- else {
- // use topic probabilities
- const vector<string> &topicid_prob = *(input.GetTopicIdAndProb());
- if (atol(topicid_prob[0].c_str()) == -1) {
- stringstream feature;
- feature << m_description << "_unk_";
- feature << sourceWord;
- feature << "~";
- feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
- }
- else {
- for (size_t i=0; i+1 < topicid_prob.size(); i+=2) {
- stringstream feature;
- feature << m_description << "_";
- feature << topicid_prob[i];
- feature << "_";
- feature << sourceWord;
- feature << "~";
- feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), atof((topicid_prob[i+1]).c_str()));
- }
- }
- }
- }
- else {
- // range over domain trigger words (keywords)
- const long docid = input.GetDocumentId();
- for (boost::unordered_set<std::string>::const_iterator p = m_vocabDomain[docid].begin(); p != m_vocabDomain[docid].end(); ++p) {
- string sourceTrigger = *p;
- stringstream feature;
- feature << m_description << "_";
- feature << sourceTrigger;
- feature << "_";
- feature << sourceWord;
- feature << "~";
- feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
- }
+ if(use_topicid) {
+ // use topicid as trigger
+ const long topicid = input.GetTopicId();
+ stringstream feature;
+ feature << m_description << "_";
+ if (topicid == -1)
+ feature << "unk";
+ else
+ feature << topicid;
+
+ feature << "_";
+ feature << sourceWord;
+ feature << "~";
+ feature << targetWord;
+ accumulator->SparsePlusEquals(feature.str(), 1);
+ } else {
+ // use topic probabilities
+ const vector<string> &topicid_prob = *(input.GetTopicIdAndProb());
+ if (atol(topicid_prob[0].c_str()) == -1) {
+ stringstream feature;
+ feature << m_description << "_unk_";
+ feature << sourceWord;
+ feature << "~";
+ feature << targetWord;
+ accumulator->SparsePlusEquals(feature.str(), 1);
+ } else {
+ for (size_t i=0; i+1 < topicid_prob.size(); i+=2) {
+ stringstream feature;
+ feature << m_description << "_";
+ feature << topicid_prob[i];
+ feature << "_";
+ feature << sourceWord;
+ feature << "~";
+ feature << targetWord;
+ accumulator->SparsePlusEquals(feature.str(), atof((topicid_prob[i+1]).c_str()));
+ }
+ }
+ }
+ } else {
+ // range over domain trigger words (keywords)
+ const long docid = input.GetDocumentId();
+ for (boost::unordered_set<std::string>::const_iterator p = m_vocabDomain[docid].begin(); p != m_vocabDomain[docid].end(); ++p) {
+ string sourceTrigger = *p;
+ stringstream feature;
+ feature << m_description << "_";
+ feature << sourceTrigger;
+ feature << "_";
+ feature << sourceWord;
+ feature << "~";
+ feature << targetWord;
+ accumulator->SparsePlusEquals(feature.str(), 1);
+ }
}
}
if (m_sourceContext) {
size_t globalSourceIndex = context.GetTranslationOption().GetStartPos() + sourceIndex;
if (!m_domainTrigger && globalSourceIndex == 0) {
- // add <s> trigger feature for source
- stringstream feature;
- feature << m_description << "_";
- feature << "<s>,";
- feature << sourceWord;
- feature << "~";
- feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
+ // add <s> trigger feature for source
+ stringstream feature;
+ feature << m_description << "_";
+ feature << "<s>,";
+ feature << sourceWord;
+ feature << "~";
+ feature << targetWord;
+ accumulator->SparsePlusEquals(feature.str(), 1);
}
// range over source words to get context
for(size_t contextIndex = 0; contextIndex < input.GetSize(); contextIndex++ ) {
- if (contextIndex == globalSourceIndex) continue;
- StringPiece sourceTrigger = input.GetWord(contextIndex).GetFactor(m_factorTypeSource)->GetString();
- if (m_ignorePunctuation) {
- // check if trigger is punctuation
- char firstChar = sourceTrigger[0];
- CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
- if(charIterator != m_punctuationHash.end())
- continue;
- }
-
- const long docid = input.GetDocumentId();
- bool sourceTriggerExists = false;
- if (m_domainTrigger)
- sourceTriggerExists = FindStringPiece(m_vocabDomain[docid], sourceTrigger ) != m_vocabDomain[docid].end();
- else if (!m_unrestricted)
- sourceTriggerExists = FindStringPiece(m_vocabSource, sourceTrigger ) != m_vocabSource.end();
-
- if (m_domainTrigger) {
- if (sourceTriggerExists) {
- stringstream feature;
- feature << m_description << "_";
- feature << sourceTrigger;
- feature << "_";
- feature << sourceWord;
- feature << "~";
- feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
- }
- }
- else if (m_unrestricted || sourceTriggerExists) {
- stringstream feature;
- feature << m_description << "_";
- if (contextIndex < globalSourceIndex) {
- feature << sourceTrigger;
- feature << ",";
- feature << sourceWord;
- }
- else {
- feature << sourceWord;
- feature << ",";
- feature << sourceTrigger;
- }
- feature << "~";
- feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
- }
+ if (contextIndex == globalSourceIndex) continue;
+ StringPiece sourceTrigger = input.GetWord(contextIndex).GetFactor(m_factorTypeSource)->GetString();
+ if (m_ignorePunctuation) {
+ // check if trigger is punctuation
+ char firstChar = sourceTrigger[0];
+ CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
+ if(charIterator != m_punctuationHash.end())
+ continue;
+ }
+
+ const long docid = input.GetDocumentId();
+ bool sourceTriggerExists = false;
+ if (m_domainTrigger)
+ sourceTriggerExists = FindStringPiece(m_vocabDomain[docid], sourceTrigger ) != m_vocabDomain[docid].end();
+ else if (!m_unrestricted)
+ sourceTriggerExists = FindStringPiece(m_vocabSource, sourceTrigger ) != m_vocabSource.end();
+
+ if (m_domainTrigger) {
+ if (sourceTriggerExists) {
+ stringstream feature;
+ feature << m_description << "_";
+ feature << sourceTrigger;
+ feature << "_";
+ feature << sourceWord;
+ feature << "~";
+ feature << targetWord;
+ accumulator->SparsePlusEquals(feature.str(), 1);
+ }
+ } else if (m_unrestricted || sourceTriggerExists) {
+ stringstream feature;
+ feature << m_description << "_";
+ if (contextIndex < globalSourceIndex) {
+ feature << sourceTrigger;
+ feature << ",";
+ feature << sourceWord;
+ } else {
+ feature << sourceWord;
+ feature << ",";
+ feature << sourceTrigger;
+ }
+ feature << "~";
+ feature << targetWord;
+ accumulator->SparsePlusEquals(feature.str(), 1);
+ }
}
}
if (m_targetContext) {
throw runtime_error("Can't use target words outside current translation option in a stateless feature");
/*
- size_t globalTargetIndex = cur_hypo.GetCurrTargetWordsRange().GetStartPos() + targetIndex;
- if (globalTargetIndex == 0) {
- // add <s> trigger feature for source
- stringstream feature;
- feature << "wt_";
- feature << sourceWord;
- feature << "~";
- feature << "<s>,";
- feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
- }
-
- // range over target words (up to current position) to get context
- for(size_t contextIndex = 0; contextIndex < globalTargetIndex; contextIndex++ ) {
- string targetTrigger = cur_hypo.GetWord(contextIndex).GetFactor(m_factorTypeTarget)->GetString();
- if (m_ignorePunctuation) {
- // check if trigger is punctuation
- char firstChar = targetTrigger.at(0);
- CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
- if(charIterator != m_punctuationHash.end())
- continue;
- }
-
- bool targetTriggerExists = false;
- if (!m_unrestricted)
- targetTriggerExists = m_vocabTarget.find( targetTrigger ) != m_vocabTarget.end();
-
- if (m_unrestricted || targetTriggerExists) {
- stringstream feature;
- feature << "wt_";
- feature << sourceWord;
- feature << "~";
- feature << targetTrigger;
- feature << ",";
- feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
- }
- }*/
+ size_t globalTargetIndex = cur_hypo.GetCurrTargetWordsRange().GetStartPos() + targetIndex;
+ if (globalTargetIndex == 0) {
+ // add <s> trigger feature for source
+ stringstream feature;
+ feature << "wt_";
+ feature << sourceWord;
+ feature << "~";
+ feature << "<s>,";
+ feature << targetWord;
+ accumulator->SparsePlusEquals(feature.str(), 1);
+ }
+
+ // range over target words (up to current position) to get context
+ for(size_t contextIndex = 0; contextIndex < globalTargetIndex; contextIndex++ ) {
+ string targetTrigger = cur_hypo.GetWord(contextIndex).GetFactor(m_factorTypeTarget)->GetString();
+ if (m_ignorePunctuation) {
+ // check if trigger is punctuation
+ char firstChar = targetTrigger.at(0);
+ CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
+ if(charIterator != m_punctuationHash.end())
+ continue;
+ }
+
+ bool targetTriggerExists = false;
+ if (!m_unrestricted)
+ targetTriggerExists = m_vocabTarget.find( targetTrigger ) != m_vocabTarget.end();
+
+ if (m_unrestricted || targetTriggerExists) {
+ stringstream feature;
+ feature << "wt_";
+ feature << sourceWord;
+ feature << "~";
+ feature << targetTrigger;
+ feature << ",";
+ feature << targetWord;
+ accumulator->SparsePlusEquals(feature.str(), 1);
+ }
+ }*/
}
}
}
void WordTranslationFeature::EvaluateChart(
- const ChartBasedFeatureContext& context,
- ScoreComponentCollection* accumulator) const
+ const ChartBasedFeatureContext& context,
+ ScoreComponentCollection* accumulator) const
{
const TargetPhrase& targetPhrase = context.GetTargetPhrase();
const AlignmentInfo &alignmentInfo = targetPhrase.GetAlignTerm();
@@ -403,7 +386,7 @@ void WordTranslationFeature::EvaluateChart(
char firstChar = sourceWord[0];
CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
if(charIterator != m_punctuationHash.end())
- continue;
+ continue;
firstChar = targetWord[0];
charIterator = m_punctuationHash.find( firstChar );
if(charIterator != m_punctuationHash.end())
@@ -411,118 +394,118 @@ void WordTranslationFeature::EvaluateChart(
}
if (!m_unrestricted) {
- if (FindStringPiece(m_vocabSource, sourceWord) == m_vocabSource.end())
- sourceWord = "OTHER";
- if (FindStringPiece(m_vocabTarget, targetWord) == m_vocabTarget.end())
- targetWord = "OTHER";
+ if (FindStringPiece(m_vocabSource, sourceWord) == m_vocabSource.end())
+ sourceWord = "OTHER";
+ if (FindStringPiece(m_vocabTarget, targetWord) == m_vocabTarget.end())
+ targetWord = "OTHER";
}
-
+
if (m_simple) {
- // construct feature name
- stringstream featureName;
- featureName << m_description << "_";
- //featureName << ((sourceExists||m_unrestricted) ? sourceWord : "OTHER");
- featureName << sourceWord;
- featureName << "~";
- //featureName << ((targetExists||m_unrestricted) ? targetWord : "OTHER");
- featureName << targetWord;
- accumulator->SparsePlusEquals(featureName.str(), 1);
+ // construct feature name
+ stringstream featureName;
+ featureName << m_description << "_";
+ //featureName << ((sourceExists||m_unrestricted) ? sourceWord : "OTHER");
+ featureName << sourceWord;
+ featureName << "~";
+ //featureName << ((targetExists||m_unrestricted) ? targetWord : "OTHER");
+ featureName << targetWord;
+ accumulator->SparsePlusEquals(featureName.str(), 1);
}
- /* if (m_sourceContext) {
- size_t globalSourceIndex = cur_hypo.GetCurrSourceRange().GetStartPos() + sourceIndex;
- if (globalSourceIndex == 0) {
- // add <s> trigger feature for source
- stringstream feature;
- feature << "wt_";
- feature << "<s>,";
- feature << sourceWord;
- feature << "~";
- feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
- cerr << feature.str() << endl;
- }
-
- // range over source words to get context
- for(size_t contextIndex = 0; contextIndex < input.GetSize(); contextIndex++ ) {
- if (contextIndex == globalSourceIndex) continue;
- string sourceTrigger = input.GetWord(contextIndex).GetFactor(m_factorTypeSource)->GetString();
- if (m_ignorePunctuation) {
- // check if trigger is punctuation
- char firstChar = sourceTrigger.at(0);
- CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
- if(charIterator != m_punctuationHash.end())
- continue;
- }
-
- bool sourceTriggerExists = false;
- if (!m_unrestricted)
- sourceTriggerExists = m_vocabSource.find( sourceTrigger ) != m_vocabSource.end();
-
- if (m_unrestricted || sourceTriggerExists) {
- stringstream feature;
- feature << "wt_";
- if (contextIndex < globalSourceIndex) {
- feature << sourceTrigger;
- feature << ",";
- feature << sourceWord;
- }
- else {
- feature << sourceWord;
- feature << ",";
- feature << sourceTrigger;
- }
- feature << "~";
- feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
- cerr << feature.str() << endl;
- }
- }
- }*/
-/* if (m_targetContext) {
- size_t globalTargetIndex = 0; // TODO
-// size_t globalTargetIndex = cur_hypo.GetCurrTargetWordsRange().GetStartPos() + targetIndex;
- if (globalTargetIndex == 0) {
- // add <s> trigger feature for source
- stringstream feature;
- feature << "wt_";
- feature << sourceWord;
- feature << "~";
- feature << "<s>,";
- feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
- cerr << feature.str() << endl;
- }
-
- // range over target words (up to current position) to get context
- for(size_t contextIndex = 0; contextIndex < globalTargetIndex; contextIndex++ ) {
- Phrase outputPhrase = cur_hypo.GetOutputPhrase();
- string targetTrigger = outputPhrase.GetWord(contextIndex).GetFactor(m_factorTypeTarget)->GetString();
- //string targetTrigger = cur_hypo.GetWord(contextIndex).GetFactor(m_factorTypeTarget)->GetString();
- if (m_ignorePunctuation) {
- // check if trigger is punctuation
- char firstChar = targetTrigger.at(0);
- CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
- if(charIterator != m_punctuationHash.end())
- continue;
- }
-
- bool targetTriggerExists = false;
- if (!m_unrestricted)
- targetTriggerExists = m_vocabTarget.find( targetTrigger ) != m_vocabTarget.end();
-
- if (m_unrestricted || targetTriggerExists) {
- stringstream feature;
- feature << "wt_";
- feature << sourceWord;
- feature << "~";
- feature << targetTrigger;
- feature << ",";
- feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
- cerr << feature.str() << endl;
- }
- }
+ /* if (m_sourceContext) {
+ size_t globalSourceIndex = cur_hypo.GetCurrSourceRange().GetStartPos() + sourceIndex;
+ if (globalSourceIndex == 0) {
+ // add <s> trigger feature for source
+ stringstream feature;
+ feature << "wt_";
+ feature << "<s>,";
+ feature << sourceWord;
+ feature << "~";
+ feature << targetWord;
+ accumulator->SparsePlusEquals(feature.str(), 1);
+ cerr << feature.str() << endl;
+ }
+
+ // range over source words to get context
+ for(size_t contextIndex = 0; contextIndex < input.GetSize(); contextIndex++ ) {
+ if (contextIndex == globalSourceIndex) continue;
+ string sourceTrigger = input.GetWord(contextIndex).GetFactor(m_factorTypeSource)->GetString();
+ if (m_ignorePunctuation) {
+ // check if trigger is punctuation
+ char firstChar = sourceTrigger.at(0);
+ CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
+ if(charIterator != m_punctuationHash.end())
+ continue;
+ }
+
+ bool sourceTriggerExists = false;
+ if (!m_unrestricted)
+ sourceTriggerExists = m_vocabSource.find( sourceTrigger ) != m_vocabSource.end();
+
+ if (m_unrestricted || sourceTriggerExists) {
+ stringstream feature;
+ feature << "wt_";
+ if (contextIndex < globalSourceIndex) {
+ feature << sourceTrigger;
+ feature << ",";
+ feature << sourceWord;
+ }
+ else {
+ feature << sourceWord;
+ feature << ",";
+ feature << sourceTrigger;
+ }
+ feature << "~";
+ feature << targetWord;
+ accumulator->SparsePlusEquals(feature.str(), 1);
+ cerr << feature.str() << endl;
+ }
+ }
}*/
+ /* if (m_targetContext) {
+ size_t globalTargetIndex = 0; // TODO
+ // size_t globalTargetIndex = cur_hypo.GetCurrTargetWordsRange().GetStartPos() + targetIndex;
+ if (globalTargetIndex == 0) {
+ // add <s> trigger feature for source
+ stringstream feature;
+ feature << "wt_";
+ feature << sourceWord;
+ feature << "~";
+ feature << "<s>,";
+ feature << targetWord;
+ accumulator->SparsePlusEquals(feature.str(), 1);
+ cerr << feature.str() << endl;
+ }
+
+ // range over target words (up to current position) to get context
+ for(size_t contextIndex = 0; contextIndex < globalTargetIndex; contextIndex++ ) {
+ Phrase outputPhrase = cur_hypo.GetOutputPhrase();
+ string targetTrigger = outputPhrase.GetWord(contextIndex).GetFactor(m_factorTypeTarget)->GetString();
+ //string targetTrigger = cur_hypo.GetWord(contextIndex).GetFactor(m_factorTypeTarget)->GetString();
+ if (m_ignorePunctuation) {
+ // check if trigger is punctuation
+ char firstChar = targetTrigger.at(0);
+ CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
+ if(charIterator != m_punctuationHash.end())
+ continue;
+ }
+
+ bool targetTriggerExists = false;
+ if (!m_unrestricted)
+ targetTriggerExists = m_vocabTarget.find( targetTrigger ) != m_vocabTarget.end();
+
+ if (m_unrestricted || targetTriggerExists) {
+ stringstream feature;
+ feature << "wt_";
+ feature << sourceWord;
+ feature << "~";
+ feature << targetTrigger;
+ feature << ",";
+ feature << targetWord;
+ accumulator->SparsePlusEquals(feature.str(), 1);
+ cerr << feature.str() << endl;
+ }
+ }
+ }*/
}
}
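
Note: the stringstream code above assembles sparse feature keys from the feature description, the source word and the target word. A sketch of the two key shapes exercised most often (simple, and sentence-initial source context), with "WordTranslationFeature" as an illustrative stand-in for m_description:

// Sketch: sparse key construction, mirroring the accumulator calls above.
#include <iostream>
#include <sstream>
#include <string>

int main() {
  std::string description = "WordTranslationFeature";
  std::string sourceWord = "maison", targetWord = "house";

  std::stringstream simple;                 // <description>_<source>~<target>
  simple << description << "_" << sourceWord << "~" << targetWord;

  std::stringstream context;                // <description>_<s>,<source>~<target>
  context << description << "_" << "<s>," << sourceWord << "~" << targetWord;

  std::cout << simple.str() << std::endl;   // WordTranslationFeature_maison~house
  std::cout << context.str() << std::endl;  // WordTranslationFeature_<s>,maison~house
  return 0;
}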
diff --git a/moses/FF/WordTranslationFeature.h b/moses/FF/WordTranslationFeature.h
index 3379e8c84..b3a434325 100644
--- a/moses/FF/WordTranslationFeature.h
+++ b/moses/FF/WordTranslationFeature.h
@@ -14,11 +14,12 @@ namespace Moses
/** Sets the features for word translation
*/
-class WordTranslationFeature : public StatelessFeatureFunction {
+class WordTranslationFeature : public StatelessFeatureFunction
+{
typedef std::map< char, short > CharHash;
typedef std::vector< boost::unordered_set<std::string> > DocumentVector;
-
+
private:
boost::unordered_set<std::string> m_vocabSource;
boost::unordered_set<std::string> m_vocabTarget;
@@ -32,18 +33,18 @@ private:
bool m_domainTrigger;
bool m_ignorePunctuation;
CharHash m_punctuationHash;
-
+
public:
WordTranslationFeature(const std::string &line);
-
+
bool Load(const std::string &filePathSource, const std::string &filePathTarget);
-
+
const FFState* EmptyHypothesisState(const InputType &) const {
return new DummyState();
}
-
- void Evaluate(const PhraseBasedFeatureContext& context,
- ScoreComponentCollection* accumulator) const;
+
+ void Evaluate(const PhraseBasedFeatureContext& context,
+ ScoreComponentCollection* accumulator) const;
void EvaluateChart(const ChartBasedFeatureContext& context,
ScoreComponentCollection* accumulator) const;
diff --git a/moses/Factor.h b/moses/Factor.h
index 87e8f8028..f4bb2074d 100644
--- a/moses/Factor.h
+++ b/moses/Factor.h
@@ -34,8 +34,8 @@ namespace Moses
struct FactorFriend;
class FactorCollection;
-/** Represents a factor (word, POS, etc).
- * A Factor has a contiguous identifier and string value.
+/** Represents a factor (word, POS, etc).
+ * A Factor has a contiguous identifier and string value.
*/
class Factor
{
@@ -53,10 +53,10 @@ class Factor
//! protected constructor. only friend class, FactorCollection, is allowed to create Factor objects
Factor() {}
- // Needed for STL containers. They'll delegate through FactorFriend, which is never exposed publicly.
+ // Needed for STL containers. They'll delegate through FactorFriend, which is never exposed publicly.
Factor(const Factor &factor) : m_string(factor.m_string), m_id(factor.m_id) {}
- // Not implemented. Shouldn't be called.
+ // Not implemented. Shouldn't be called.
Factor &operator=(const Factor &factor);
public:
diff --git a/moses/FactorCollection.cpp b/moses/FactorCollection.cpp
index 969bb39d1..5d6eb1c53 100644
--- a/moses/FactorCollection.cpp
+++ b/moses/FactorCollection.cpp
@@ -38,11 +38,12 @@ FactorCollection FactorCollection::s_instance;
const Factor *FactorCollection::AddFactor(const StringPiece &factorString)
{
FactorFriend to_ins;
- to_ins.in.m_string = factorString;
+ to_ins.in.m_string = factorString;
to_ins.in.m_id = m_factorId;
// If we're threaded, hope a read-only lock is sufficient.
#ifdef WITH_THREADS
- { // read=lock scope
+ {
+    // read-lock scope
boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
Set::const_iterator i = m_set.find(to_ins);
if (i != m_set.end()) return &i->in;
@@ -52,8 +53,8 @@ const Factor *FactorCollection::AddFactor(const StringPiece &factorString)
std::pair<Set::iterator, bool> ret(m_set.insert(to_ins));
if (ret.second) {
ret.first->in.m_string.set(
- memcpy(m_string_backing.Allocate(factorString.size()), factorString.data(), factorString.size()),
- factorString.size());
+ memcpy(m_string_backing.Allocate(factorString.size()), factorString.data(), factorString.size()),
+ factorString.size());
m_factorId++;
}
return &ret.first->in;
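
Note: AddFactor probes the set under a shared (reader) lock and takes the exclusive lock only to insert, letting set insertion report whether another thread won the race in between. A minimal sketch of the same probe-then-insert pattern over a hypothetical string interner, not Moses' actual FactorCollection:

// Sketch: double-probe interning with boost reader/writer locks.
#include <boost/thread/locks.hpp>
#include <boost/thread/shared_mutex.hpp>
#include <set>
#include <string>

class Interner {
  boost::shared_mutex m_lock;
  std::set<std::string> m_set;
public:
  const std::string *Add(const std::string &s) {
    {
      // read-lock scope: the common hit path stays cheap
      boost::shared_lock<boost::shared_mutex> read_lock(m_lock);
      std::set<std::string>::const_iterator i = m_set.find(s);
      if (i != m_set.end()) return &*i;
    }
    // exclusive lock; insert() is a no-op if another thread got here first
    boost::unique_lock<boost::shared_mutex> write_lock(m_lock);
    std::pair<std::set<std::string>::iterator, bool> ret = m_set.insert(s);
    return &*ret.first;
  }
};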
diff --git a/moses/FactorCollection.h b/moses/FactorCollection.h
index e7749244f..8c3db5da9 100644
--- a/moses/FactorCollection.h
+++ b/moses/FactorCollection.h
@@ -44,7 +44,7 @@ namespace Moses
* private and friended to FactorFriend. The STL containers can delegate
* copying, so friending the container isn't sufficient. STL containers see
* FactorFriend's public copy constructor and everybody else sees Factor's
- * private copy constructor.
+ * private copy constructor.
*/
struct FactorFriend {
Factor in;
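
Note: the comment above on FactorFriend describes a friendship idiom: the wrapped class keeps its copy constructor private, and STL containers copy through the wrapper, whose implicitly defined members are granted access. A minimal sketch with hypothetical names:

// Sketch: the wrapper-friend idiom behind FactorFriend.
#include <vector>

class Inner {
  friend struct Wrapper;
  Inner() {}
  Inner(const Inner &) {}           // private: nobody else can copy Inner
  Inner &operator=(const Inner &);  // not implemented, shouldn't be called
};

struct Wrapper {
  Inner in;                         // Wrapper's implicit ctors may touch Inner's
};

int main() {
  std::vector<Wrapper> v;
  v.push_back(Wrapper());           // ok: copies Wrapper, and through it Inner
  // Inner a; Inner b(a);           // would not compile: private members
  return 0;
}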
diff --git a/moses/FeatureVector.cpp b/moses/FeatureVector.cpp
index f58bb5cab..96dd9a0ce 100644
--- a/moses/FeatureVector.cpp
+++ b/moses/FeatureVector.cpp
@@ -1,22 +1,22 @@
/*
Moses - factored phrase-based language decoder
Copyright (C) 2010 University of Edinburgh
-
-
+
+
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
-
+
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
-
+
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
+
*/
#include <algorithm>
@@ -31,744 +31,815 @@
using namespace std;
-namespace Moses {
-
- const string FName::SEP = "_";
- FName::Name2Id FName::name2id;
- vector<string> FName::id2name;
- FName::Id2Count FName::id2hopeCount;
- FName::Id2Count FName::id2fearCount;
+namespace Moses
+{
+
+const string FName::SEP = "_";
+FName::Name2Id FName::name2id;
+vector<string> FName::id2name;
+FName::Id2Count FName::id2hopeCount;
+FName::Id2Count FName::id2fearCount;
#ifdef WITH_THREADS
- boost::shared_mutex FName::m_idLock;
+boost::shared_mutex FName::m_idLock;
#endif
-
- void FName::init(const StringPiece &name) {
+
+void FName::init(const StringPiece &name)
+{
#ifdef WITH_THREADS
- //reader lock
- boost::shared_lock<boost::shared_mutex> lock(m_idLock);
+ //reader lock
+ boost::shared_lock<boost::shared_mutex> lock(m_idLock);
#endif
- Name2Id::iterator i = FindStringPiece(name2id, name);
- if (i != name2id.end()) {
- m_id = i->second;
- } else {
+ Name2Id::iterator i = FindStringPiece(name2id, name);
+ if (i != name2id.end()) {
+ m_id = i->second;
+ } else {
#ifdef WITH_THREADS
- //release the reader lock, and upgrade to writer lock
- lock.unlock();
- boost::unique_lock<boost::shared_mutex> write_lock(m_idLock);
+ //release the reader lock, and upgrade to writer lock
+ lock.unlock();
+ boost::unique_lock<boost::shared_mutex> write_lock(m_idLock);
#endif
- std::pair<std::string, size_t> to_ins;
- to_ins.first.assign(name.data(), name.size());
- to_ins.second = name2id.size();
- std::pair<Name2Id::iterator, bool> res(name2id.insert(to_ins));
- if (res.second) {
- // TODO this should be string pointers backed by the hash table.
- id2name.push_back(to_ins.first);
- }
- m_id = res.first->second;
+ std::pair<std::string, size_t> to_ins;
+ to_ins.first.assign(name.data(), name.size());
+ to_ins.second = name2id.size();
+ std::pair<Name2Id::iterator, bool> res(name2id.insert(to_ins));
+ if (res.second) {
+ // TODO this should be string pointers backed by the hash table.
+ id2name.push_back(to_ins.first);
}
+ m_id = res.first->second;
+ }
+}
+
+size_t FName::getId(const string& name)
+{
+ Name2Id::iterator i = name2id.find(name);
+ assert (i != name2id.end());
+ return i->second;
+}
+
+size_t FName::getHopeIdCount(const string& name)
+{
+ Name2Id::iterator i = name2id.find(name);
+ if (i != name2id.end()) {
+    size_t id = i->second;
+ return id2hopeCount[id];
+ }
+ return 0;
+}
+
+size_t FName::getFearIdCount(const string& name)
+{
+ Name2Id::iterator i = name2id.find(name);
+ if (i != name2id.end()) {
+    size_t id = i->second;
+ return id2fearCount[id];
}
-
- size_t FName::getId(const string& name) {
- Name2Id::iterator i = name2id.find(name);
- assert (i != name2id.end());
- return i->second;
- }
-
- size_t FName::getHopeIdCount(const string& name) {
- Name2Id::iterator i = name2id.find(name);
- if (i != name2id.end()) {
- float id = i->second;
- return id2hopeCount[id];
- }
- return 0;
- }
-
- size_t FName::getFearIdCount(const string& name) {
- Name2Id::iterator i = name2id.find(name);
- if (i != name2id.end()) {
- float id = i->second;
- return id2fearCount[id];
- }
- return 0;
- }
-
- void FName::incrementHopeId(const string& name) {
- Name2Id::iterator i = name2id.find(name);
- assert(i != name2id.end());
+ return 0;
+}
+
+void FName::incrementHopeId(const string& name)
+{
+ Name2Id::iterator i = name2id.find(name);
+ assert(i != name2id.end());
#ifdef WITH_THREADS
- // get upgradable lock and upgrade to writer lock
- boost::upgrade_lock<boost::shared_mutex> upgradeLock(m_idLock);
- boost::upgrade_to_unique_lock<boost::shared_mutex> uniqueLock(upgradeLock);
+ // get upgradable lock and upgrade to writer lock
+ boost::upgrade_lock<boost::shared_mutex> upgradeLock(m_idLock);
+ boost::upgrade_to_unique_lock<boost::shared_mutex> uniqueLock(upgradeLock);
#endif
- id2hopeCount[i->second] += 1;
- }
+ id2hopeCount[i->second] += 1;
+}
- void FName::incrementFearId(const string& name) {
- Name2Id::iterator i = name2id.find(name);
- assert(i != name2id.end());
+void FName::incrementFearId(const string& name)
+{
+ Name2Id::iterator i = name2id.find(name);
+ assert(i != name2id.end());
#ifdef WITH_THREADS
- // get upgradable lock and upgrade to writer lock
- boost::upgrade_lock<boost::shared_mutex> upgradeLock(m_idLock);
- boost::upgrade_to_unique_lock<boost::shared_mutex> uniqueLock(upgradeLock);
+ // get upgradable lock and upgrade to writer lock
+ boost::upgrade_lock<boost::shared_mutex> upgradeLock(m_idLock);
+ boost::upgrade_to_unique_lock<boost::shared_mutex> uniqueLock(upgradeLock);
#endif
- id2fearCount[i->second] += 1;
- }
-
- void FName::eraseId(size_t id) {
+ id2fearCount[i->second] += 1;
+}
+
+void FName::eraseId(size_t id)
+{
#ifdef WITH_THREADS
- // get upgradable lock and upgrade to writer lock
- boost::upgrade_lock<boost::shared_mutex> upgradeLock(m_idLock);
- boost::upgrade_to_unique_lock<boost::shared_mutex> uniqueLock(upgradeLock);
+ // get upgradable lock and upgrade to writer lock
+ boost::upgrade_lock<boost::shared_mutex> upgradeLock(m_idLock);
+ boost::upgrade_to_unique_lock<boost::shared_mutex> uniqueLock(upgradeLock);
#endif
- id2hopeCount.erase(id);
- id2fearCount.erase(id);
- }
-
- std::ostream& operator<<( std::ostream& out, const FName& name) {
- out << name.name();
- return out;
- }
-
- size_t FName::hash() const {
- return boost::hash_value(m_id);
- }
-
- const std::string& FName::name() const {
- return id2name[m_id];
- }
-
-
- bool FName::operator==(const FName& rhs) const {
- return m_id == rhs.m_id;
- }
-
- bool FName::operator!=(const FName& rhs) const {
- return ! (*this == rhs);
- }
-
- FVector::FVector(size_t coreFeatures) : m_coreFeatures(coreFeatures) {}
-
- void FVector::resize(size_t newsize) {
- valarray<FValue> oldValues(m_coreFeatures);
- m_coreFeatures.resize(newsize);
- for (size_t i = 0; i < min(m_coreFeatures.size(), oldValues.size()); ++i) {
- m_coreFeatures[i] = oldValues[i];
- }
- }
-
- void FVector::clear() {
- m_coreFeatures.resize(0);
- m_features.clear();
- }
-
- bool FVector::load(const std::string& filename) {
- clear();
- ifstream in (filename.c_str());
- if (!in) {
- return false;
- }
- string line;
- while(getline(in,line)) {
- if (line[0] == '#') continue;
- istringstream linestream(line);
- string namestring;
- FValue value;
- linestream >> namestring;
- linestream >> value;
- FName fname(namestring);
- //cerr << "Setting sparse weight " << fname << " to value " << value << "." << endl;
- set(fname,value);
- }
- return true;
- }
+ id2hopeCount.erase(id);
+ id2fearCount.erase(id);
+}
- void FVector::save(const string& filename) const {
- ofstream out(filename.c_str());
- if (!out) {
- ostringstream msg;
- msg << "Unable to open " << filename;
- throw runtime_error(msg.str());
- }
- write(out);
- out.close();
+std::ostream& operator<<( std::ostream& out, const FName& name)
+{
+ out << name.name();
+ return out;
+}
+
+size_t FName::hash() const
+{
+ return boost::hash_value(m_id);
+}
+
+const std::string& FName::name() const
+{
+ return id2name[m_id];
+}
+
+
+bool FName::operator==(const FName& rhs) const
+{
+ return m_id == rhs.m_id;
+}
+
+bool FName::operator!=(const FName& rhs) const
+{
+ return ! (*this == rhs);
+}
+
+FVector::FVector(size_t coreFeatures) : m_coreFeatures(coreFeatures) {}
+
+void FVector::resize(size_t newsize)
+{
+ valarray<FValue> oldValues(m_coreFeatures);
+ m_coreFeatures.resize(newsize);
+ for (size_t i = 0; i < min(m_coreFeatures.size(), oldValues.size()); ++i) {
+ m_coreFeatures[i] = oldValues[i];
}
+}
- void FVector::write(ostream& out) const {
- for (const_iterator i = cbegin(); i != cend(); ++i) {
- out << i->first << " " << i->second << endl;
- }
+void FVector::clear()
+{
+ m_coreFeatures.resize(0);
+ m_features.clear();
+}
+
+bool FVector::load(const std::string& filename)
+{
+ clear();
+ ifstream in (filename.c_str());
+ if (!in) {
+ return false;
+ }
+ string line;
+ while(getline(in,line)) {
+ if (line[0] == '#') continue;
+ istringstream linestream(line);
+ string namestring;
+ FValue value;
+ linestream >> namestring;
+ linestream >> value;
+ FName fname(namestring);
+ //cerr << "Setting sparse weight " << fname << " to value " << value << "." << endl;
+ set(fname,value);
+ }
+ return true;
+}
+
+void FVector::save(const string& filename) const
+{
+ ofstream out(filename.c_str());
+ if (!out) {
+ ostringstream msg;
+ msg << "Unable to open " << filename;
+ throw runtime_error(msg.str());
}
+ write(out);
+ out.close();
+}
- static bool equalsTolerance(FValue lhs, FValue rhs) {
- if (lhs == rhs) return true;
- static const FValue TOLERANCE = 1e-4;
- FValue diff = abs(lhs-rhs);
- FValue mean = (abs(lhs)+abs(rhs))/2;
- //cerr << "ET " << lhs << " " << rhs << " " << diff << " " << mean << " " << endl;
- return diff/mean < TOLERANCE ;
+void FVector::write(ostream& out) const
+{
+ for (const_iterator i = cbegin(); i != cend(); ++i) {
+ out << i->first << " " << i->second << endl;
}
-
- bool FVector::operator== (const FVector& rhs) const {
- if (this == &rhs) {
- return true;
- }
- if (m_coreFeatures.size() != rhs.m_coreFeatures.size()) {
- return false;
- }
- for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
- if (!equalsTolerance(m_coreFeatures[i], rhs.m_coreFeatures[i])) return false;
- }
- for (const_iterator i = cbegin(); i != cend(); ++i) {
- if (!equalsTolerance(i->second,rhs.get(i->first))) return false;
- }
- for (const_iterator i = rhs.cbegin(); i != rhs.cend(); ++i) {
- if (!equalsTolerance(i->second, get(i->first))) return false;
- }
+}
+
+static bool equalsTolerance(FValue lhs, FValue rhs)
+{
+ if (lhs == rhs) return true;
+ static const FValue TOLERANCE = 1e-4;
+ FValue diff = abs(lhs-rhs);
+ FValue mean = (abs(lhs)+abs(rhs))/2;
+ //cerr << "ET " << lhs << " " << rhs << " " << diff << " " << mean << " " << endl;
+ return diff/mean < TOLERANCE ;
+}
+
+bool FVector::operator== (const FVector& rhs) const
+{
+ if (this == &rhs) {
return true;
}
-
- bool FVector::operator!= (const FVector& rhs) const {
- return ! (*this == rhs);
+ if (m_coreFeatures.size() != rhs.m_coreFeatures.size()) {
+ return false;
}
-
- ProxyFVector FVector::operator[](const FName& name) {
- // At this point, we don't know whether operator[] was called, so we return
- // a proxy object and defer the decision until later
- return ProxyFVector(this, name);
+ for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+ if (!equalsTolerance(m_coreFeatures[i], rhs.m_coreFeatures[i])) return false;
}
+ for (const_iterator i = cbegin(); i != cend(); ++i) {
+ if (!equalsTolerance(i->second,rhs.get(i->first))) return false;
+ }
+ for (const_iterator i = rhs.cbegin(); i != rhs.cend(); ++i) {
+ if (!equalsTolerance(i->second, get(i->first))) return false;
+ }
+ return true;
+}
+
+bool FVector::operator!= (const FVector& rhs) const
+{
+ return ! (*this == rhs);
+}
+
+ProxyFVector FVector::operator[](const FName& name)
+{
+ // At this point, we don't know whether operator[] was called, so we return
+ // a proxy object and defer the decision until later
+ return ProxyFVector(this, name);
+}
- /** Equivalent for core features. */
- FValue& FVector::operator[](size_t index) {
- return m_coreFeatures[index];
+/** Equivalent for core features. */
+FValue& FVector::operator[](size_t index)
+{
+ return m_coreFeatures[index];
+}
+
+
+FValue FVector::operator[](const FName& name) const
+{
+ return get(name);
+}
+
+FValue FVector::operator[](size_t index) const
+{
+ return m_coreFeatures[index];
+}
+
+ostream& FVector::print(ostream& out) const
+{
+ out << "core=(";
+ for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+ out << m_coreFeatures[i];
+ if (i + 1 < m_coreFeatures.size()) {
+ out << ",";
+ }
}
+ out << ") ";
+ for (const_iterator i = cbegin(); i != cend(); ++i) {
+ if (i != cbegin())
+ out << " ";
+ out << i->first << "=" << i->second;
+ }
+ return out;
+}
+
+ostream& operator<<(ostream& out, const FVector& fv)
+{
+ return fv.print(out);
+}
-
- FValue FVector::operator[](const FName& name) const {
- return get(name);
+const FValue& FVector::get(const FName& name) const
+{
+ static const FValue DEFAULT = 0;
+ const_iterator fi = m_features.find(name);
+ if (fi == m_features.end()) {
+ return DEFAULT;
+ } else {
+ return fi->second;
}
+}
- FValue FVector::operator[](size_t index) const {
- return m_coreFeatures[index];
+FValue FVector::getBackoff(const FName& name, float backoff) const
+{
+ const_iterator fi = m_features.find(name);
+ if (fi == m_features.end()) {
+ return backoff;
+ } else {
+ return fi->second;
}
+}
- ostream& FVector::print(ostream& out) const {
- out << "core=(";
- for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
- out << m_coreFeatures[i];
- if (i + 1 < m_coreFeatures.size()) {
- out << ",";
- }
- }
- out << ") ";
- for (const_iterator i = cbegin(); i != cend(); ++i) {
- if (i != cbegin())
- out << " ";
- out << i->first << "=" << i->second;
- }
- return out;
- }
-
- ostream& operator<<(ostream& out, const FVector& fv) {
- return fv.print(out);
- }
-
- const FValue& FVector::get(const FName& name) const {
- static const FValue DEFAULT = 0;
- const_iterator fi = m_features.find(name);
- if (fi == m_features.end()) {
- return DEFAULT;
- } else {
- return fi->second;
+void FVector::thresholdScale(FValue maxValue )
+{
+ FValue factor = 1.0;
+ for (const_iterator i = cbegin(); i != cend(); ++i) {
+ FValue value = i->second;
+ if (abs(value)*factor > maxValue) {
+ factor = abs(value) / maxValue;
}
}
+ operator*=(factor);
+}
- FValue FVector::getBackoff(const FName& name, float backoff) const {
- const_iterator fi = m_features.find(name);
- if (fi == m_features.end()) {
- return backoff;
- } else {
- return fi->second;
+void FVector::capMax(FValue maxValue)
+{
+ for (const_iterator i = cbegin(); i != cend(); ++i)
+ if (i->second > maxValue)
+ set(i->first, maxValue);
+}
+
+void FVector::capMin(FValue minValue)
+{
+ for (const_iterator i = cbegin(); i != cend(); ++i)
+ if (i->second < minValue)
+ set(i->first, minValue);
+}
+
+void FVector::set(const FName& name, const FValue& value)
+{
+ m_features[name] = value;
+}
+
+void FVector::printCoreFeatures()
+{
+ cerr << "core=(";
+ for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+ cerr << m_coreFeatures[i];
+ if (i + 1 < m_coreFeatures.size()) {
+ cerr << ",";
}
}
+ cerr << ") ";
+}
- void FVector::thresholdScale(FValue maxValue ) {
- FValue factor = 1.0;
- for (const_iterator i = cbegin(); i != cend(); ++i) {
- FValue value = i->second;
- if (abs(value)*factor > maxValue) {
- factor = abs(value) / maxValue;
- }
+FVector& FVector::operator+= (const FVector& rhs)
+{
+ if (rhs.m_coreFeatures.size() > m_coreFeatures.size())
+ resize(rhs.m_coreFeatures.size());
+ for (const_iterator i = rhs.cbegin(); i != rhs.cend(); ++i)
+ set(i->first, get(i->first) + i->second);
+ for (size_t i = 0; i < rhs.m_coreFeatures.size(); ++i)
+ m_coreFeatures[i] += rhs.m_coreFeatures[i];
+ return *this;
+}
+
+// add only sparse features
+void FVector::sparsePlusEquals(const FVector& rhs)
+{
+ for (const_iterator i = rhs.cbegin(); i != rhs.cend(); ++i)
+ set(i->first, get(i->first) + i->second);
+}
+
+// assign only core features
+void FVector::coreAssign(const FVector& rhs)
+{
+ for (size_t i = 0; i < rhs.m_coreFeatures.size(); ++i)
+ m_coreFeatures[i] = rhs.m_coreFeatures[i];
+}
+
+void FVector::incrementSparseHopeFeatures()
+{
+ for (const_iterator i = cbegin(); i != cend(); ++i)
+ FName::incrementHopeId((i->first).name());
+}
+
+void FVector::incrementSparseFearFeatures()
+{
+ for (const_iterator i = cbegin(); i != cend(); ++i)
+ FName::incrementFearId((i->first).name());
+}
+
+void FVector::printSparseHopeFeatureCounts(std::ofstream& out)
+{
+ for (const_iterator i = cbegin(); i != cend(); ++i)
+ out << (i->first).name() << ": " << FName::getHopeIdCount((i->first).name()) << std::endl;
+}
+
+void FVector::printSparseFearFeatureCounts(std::ofstream& out)
+{
+ for (const_iterator i = cbegin(); i != cend(); ++i)
+ out << (i->first).name() << ": " << FName::getFearIdCount((i->first).name()) << std::endl;
+}
+
+void FVector::printSparseHopeFeatureCounts()
+{
+ for (const_iterator i = cbegin(); i != cend(); ++i)
+ std::cerr << (i->first).name() << ": " << FName::getHopeIdCount((i->first).name()) << std::endl;
+}
+
+void FVector::printSparseFearFeatureCounts()
+{
+ for (const_iterator i = cbegin(); i != cend(); ++i)
+ std::cerr << (i->first).name() << ": " << FName::getFearIdCount((i->first).name()) << std::endl;
+}
+
+size_t FVector::pruneSparseFeatures(size_t threshold)
+{
+ size_t count = 0;
+ vector<FName> toErase;
+ for (const_iterator i = cbegin(); i != cend(); ++i) {
+ const std::string& fname = (i->first).name();
+ if (FName::getHopeIdCount(fname) < threshold && FName::getFearIdCount(fname) < threshold) {
+ toErase.push_back(i->first);
+ std::cerr << "pruning: " << fname << " (" << FName::getHopeIdCount(fname) << ", " << FName::getFearIdCount(fname) << ")" << std::endl;
+ FName::eraseId(FName::getId(fname));
+ ++count;
}
- operator*=(factor);
}
- void FVector::capMax(FValue maxValue) {
- for (const_iterator i = cbegin(); i != cend(); ++i)
- if (i->second > maxValue)
- set(i->first, maxValue);
- }
+ for (size_t i = 0; i < toErase.size(); ++i)
+ m_features.erase(toErase[i]);
- void FVector::capMin(FValue minValue) {
- for (const_iterator i = cbegin(); i != cend(); ++i)
- if (i->second < minValue)
- set(i->first, minValue);
- }
+ return count;
+}
- void FVector::set(const FName& name, const FValue& value) {
- m_features[name] = value;
- }
-
- void FVector::printCoreFeatures() {
- cerr << "core=(";
- for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
- cerr << m_coreFeatures[i];
- if (i + 1 < m_coreFeatures.size()) {
- cerr << ",";
- }
+size_t FVector::pruneZeroWeightFeatures()
+{
+ size_t count = 0;
+ vector<FName> toErase;
+ for (const_iterator i = cbegin(); i != cend(); ++i) {
+ const std::string& fname = (i->first).name();
+ if (i->second == 0) {
+ toErase.push_back(i->first);
+ //std::cerr << "prune: " << fname << std::endl;
+ FName::eraseId(FName::getId(fname));
+ ++count;
}
- cerr << ") ";
}
- FVector& FVector::operator+= (const FVector& rhs) {
- if (rhs.m_coreFeatures.size() > m_coreFeatures.size())
- resize(rhs.m_coreFeatures.size());
- for (const_iterator i = rhs.cbegin(); i != rhs.cend(); ++i)
- set(i->first, get(i->first) + i->second);
- for (size_t i = 0; i < rhs.m_coreFeatures.size(); ++i)
- m_coreFeatures[i] += rhs.m_coreFeatures[i];
- return *this;
- }
-
- // add only sparse features
- void FVector::sparsePlusEquals(const FVector& rhs) {
- for (const_iterator i = rhs.cbegin(); i != rhs.cend(); ++i)
- set(i->first, get(i->first) + i->second);
- }
-
- // assign only core features
- void FVector::coreAssign(const FVector& rhs) {
- for (size_t i = 0; i < rhs.m_coreFeatures.size(); ++i)
- m_coreFeatures[i] = rhs.m_coreFeatures[i];
+ for (size_t i = 0; i < toErase.size(); ++i)
+ m_features.erase(toErase[i]);
+
+ return count;
+}
+
+void FVector::updateConfidenceCounts(const FVector& weightUpdate, bool signedCounts)
+{
+ for (size_t i = 0; i < weightUpdate.m_coreFeatures.size(); ++i) {
+ if (signedCounts) {
+ //int sign = weightUpdate.m_coreFeatures[i] >= 0 ? 1 : -1;
+ //m_coreFeatures[i] += (weightUpdate.m_coreFeatures[i] * weightUpdate.m_coreFeatures[i]) * sign;
+ m_coreFeatures[i] += weightUpdate.m_coreFeatures[i];
+ } else
+ //m_coreFeatures[i] += (weightUpdate.m_coreFeatures[i] * weightUpdate.m_coreFeatures[i]);
+ m_coreFeatures[i] += abs(weightUpdate.m_coreFeatures[i]);
+ }
+
+ for (const_iterator i = weightUpdate.cbegin(); i != weightUpdate.cend(); ++i) {
+ if (weightUpdate[i->first] == 0)
+ continue;
+ float value = get(i->first);
+ if (signedCounts) {
+ //int sign = weightUpdate[i->first] >= 0 ? 1 : -1;
+ //value += (weightUpdate[i->first] * weightUpdate[i->first]) * sign;
+ value += weightUpdate[i->first];
+ } else
+ //value += (weightUpdate[i->first] * weightUpdate[i->first]);
+ value += abs(weightUpdate[i->first]);
+ set(i->first, value);
}
-
- void FVector::incrementSparseHopeFeatures() {
- for (const_iterator i = cbegin(); i != cend(); ++i)
- FName::incrementHopeId((i->first).name());
+}
+
+void FVector::updateLearningRates(float decay_core, float decay_sparse, const FVector &confidenceCounts, float core_r0, float sparse_r0)
+{
+ for (size_t i = 0; i < confidenceCounts.m_coreFeatures.size(); ++i) {
+ m_coreFeatures[i] = 1.0/(1.0/core_r0 + decay_core * abs(confidenceCounts.m_coreFeatures[i]));
}
- void FVector::incrementSparseFearFeatures() {
- for (const_iterator i = cbegin(); i != cend(); ++i)
- FName::incrementFearId((i->first).name());
+ for (const_iterator i = confidenceCounts.cbegin(); i != confidenceCounts.cend(); ++i) {
+ float value = 1.0/(1.0/sparse_r0 + decay_sparse * abs(i->second));
+ set(i->first, value);
}
-
- void FVector::printSparseHopeFeatureCounts(std::ofstream& out) {
- for (const_iterator i = cbegin(); i != cend(); ++i)
- out << (i->first).name() << ": " << FName::getHopeIdCount((i->first).name()) << std::endl;
+}
+
+// count non-zero occurrences for all sparse features
+void FVector::setToBinaryOf(const FVector& rhs)
+{
+ for (const_iterator i = rhs.cbegin(); i != rhs.cend(); ++i)
+ if (rhs.get(i->first) != 0)
+ set(i->first, 1);
+ for (size_t i = 0; i < rhs.m_coreFeatures.size(); ++i)
+ m_coreFeatures[i] = 1;
+}
+
+// divide only core features by scalar
+FVector& FVector::coreDivideEquals(float scalar)
+{
+ for (size_t i = 0; i < m_coreFeatures.size(); ++i)
+ m_coreFeatures[i] /= scalar;
+ return *this;
+}
+
+// lhs vector is a sum of vectors, rhs vector holds number of non-zero summands
+FVector& FVector::divideEquals(const FVector& rhs)
+{
+ assert(m_coreFeatures.size() == rhs.m_coreFeatures.size());
+ for (const_iterator i = rhs.cbegin(); i != rhs.cend(); ++i)
+ set(i->first, get(i->first)/rhs.get(i->first)); // divide by number of summands
+ for (size_t i = 0; i < rhs.m_coreFeatures.size(); ++i)
+ m_coreFeatures[i] /= rhs.m_coreFeatures[i]; // divide by number of summands
+ return *this;
+}
+
+FVector& FVector::operator-= (const FVector& rhs)
+{
+ if (rhs.m_coreFeatures.size() > m_coreFeatures.size())
+ resize(rhs.m_coreFeatures.size());
+ for (const_iterator i = rhs.cbegin(); i != rhs.cend(); ++i)
+ set(i->first, get(i->first) -(i->second));
+ for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+ if (i < rhs.m_coreFeatures.size()) {
+ m_coreFeatures[i] -= rhs.m_coreFeatures[i];
+ }
}
+ return *this;
+}
- void FVector::printSparseFearFeatureCounts(std::ofstream& out) {
- for (const_iterator i = cbegin(); i != cend(); ++i)
- out << (i->first).name() << ": " << FName::getFearIdCount((i->first).name()) << std::endl;
+FVector& FVector::operator*= (const FVector& rhs)
+{
+ if (rhs.m_coreFeatures.size() > m_coreFeatures.size()) {
+ resize(rhs.m_coreFeatures.size());
}
-
- void FVector::printSparseHopeFeatureCounts() {
- for (const_iterator i = cbegin(); i != cend(); ++i)
- std::cerr << (i->first).name() << ": " << FName::getHopeIdCount((i->first).name()) << std::endl;
+ for (iterator i = begin(); i != end(); ++i) {
+ FValue lhsValue = i->second;
+ FValue rhsValue = rhs.get(i->first);
+ set(i->first,lhsValue*rhsValue);
}
-
- void FVector::printSparseFearFeatureCounts() {
- for (const_iterator i = cbegin(); i != cend(); ++i)
- std::cerr << (i->first).name() << ": " << FName::getFearIdCount((i->first).name()) << std::endl;
+ for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+ if (i < rhs.m_coreFeatures.size()) {
+ m_coreFeatures[i] *= rhs.m_coreFeatures[i];
+ } else {
+ m_coreFeatures[i] = 0;
+ }
}
+ return *this;
+}
- size_t FVector::pruneSparseFeatures(size_t threshold) {
- size_t count = 0;
- vector<FName> toErase;
- for (const_iterator i = cbegin(); i != cend(); ++i) {
- const std::string& fname = (i->first).name();
- if (FName::getHopeIdCount(fname) < threshold && FName::getFearIdCount(fname) < threshold) {
- toErase.push_back(i->first);
- std::cerr << "pruning: " << fname << " (" << FName::getHopeIdCount(fname) << ", " << FName::getFearIdCount(fname) << ")" << std::endl;
- FName::eraseId(FName::getId(fname));
- ++count;
- }
- }
-
- for (size_t i = 0; i < toErase.size(); ++i)
- m_features.erase(toErase[i]);
-
- return count;
- }
-
- size_t FVector::pruneZeroWeightFeatures() {
- size_t count = 0;
- vector<FName> toErase;
- for (const_iterator i = cbegin(); i != cend(); ++i) {
- const std::string& fname = (i->first).name();
- if (i->second == 0) {
- toErase.push_back(i->first);
- //std::cerr << "prune: " << fname << std::endl;
- FName::eraseId(FName::getId(fname));
- ++count;
- }
- }
-
- for (size_t i = 0; i < toErase.size(); ++i)
- m_features.erase(toErase[i]);
-
- return count;
- }
-
- void FVector::updateConfidenceCounts(const FVector& weightUpdate, bool signedCounts) {
- for (size_t i = 0; i < weightUpdate.m_coreFeatures.size(); ++i) {
- if (signedCounts) {
- //int sign = weightUpdate.m_coreFeatures[i] >= 0 ? 1 : -1;
- //m_coreFeatures[i] += (weightUpdate.m_coreFeatures[i] * weightUpdate.m_coreFeatures[i]) * sign;
- m_coreFeatures[i] += weightUpdate.m_coreFeatures[i];
- }
- else
- //m_coreFeatures[i] += (weightUpdate.m_coreFeatures[i] * weightUpdate.m_coreFeatures[i]);
- m_coreFeatures[i] += abs(weightUpdate.m_coreFeatures[i]);
- }
-
- for (const_iterator i = weightUpdate.cbegin(); i != weightUpdate.cend(); ++i) {
- if (weightUpdate[i->first] == 0)
- continue;
- float value = get(i->first);
- if (signedCounts) {
- //int sign = weightUpdate[i->first] >= 0 ? 1 : -1;
- //value += (weightUpdate[i->first] * weightUpdate[i->first]) * sign;
- value += weightUpdate[i->first];
+FVector& FVector::operator/= (const FVector& rhs)
+{
+ if (rhs.m_coreFeatures.size() > m_coreFeatures.size()) {
+ resize(rhs.m_coreFeatures.size());
+ }
+ for (iterator i = begin(); i != end(); ++i) {
+ FValue lhsValue = i->second;
+ FValue rhsValue = rhs.get(i->first);
+ set(i->first, lhsValue / rhsValue);
+ }
+ for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+ if (i < rhs.m_coreFeatures.size()) {
+ m_coreFeatures[i] /= rhs.m_coreFeatures[i];
+ } else {
+ if (m_coreFeatures[i] < 0) {
+ m_coreFeatures[i] = -numeric_limits<FValue>::infinity();
+ } else if (m_coreFeatures[i] > 0) {
+ m_coreFeatures[i] = numeric_limits<FValue>::infinity();
}
- else
- //value += (weightUpdate[i->first] * weightUpdate[i->first]);
- value += abs(weightUpdate[i->first]);
- set(i->first, value);
}
}
+ return *this;
+}
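
Note the asymmetry in the core-feature loop above: where rhs is shorter, the missing divisor is conceptually 0, and rather than performing the division the code assigns a signed infinity directly, leaving an lhs value of exactly 0 untouched (so no NaN from 0/0). A tiny sketch of those semantics, mirroring just that branch:

    #include <limits>
    #include <cstdio>

    // Mirrors the short-rhs branch of operator/= above: a missing divisor
    // is treated as 0, so a non-zero quotient becomes a signed infinity.
    float divideByMissing(float lhs) {
      if (lhs < 0) return -std::numeric_limits<float>::infinity();
      if (lhs > 0) return  std::numeric_limits<float>::infinity();
      return lhs;  // exactly 0 stays 0, matching the code above
    }

    int main() {
      std::printf("%f %f\n", divideByMissing(2.0f), divideByMissing(-2.0f));
    }
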
- void FVector::updateLearningRates(float decay_core, float decay_sparse, const FVector &confidenceCounts, float core_r0, float sparse_r0) {
- for (size_t i = 0; i < confidenceCounts.m_coreFeatures.size(); ++i) {
- m_coreFeatures[i] = 1.0/(1.0/core_r0 + decay_core * abs(confidenceCounts.m_coreFeatures[i]));
- }
-
- for (const_iterator i = confidenceCounts.cbegin(); i != confidenceCounts.cend(); ++i) {
- float value = 1.0/(1.0/sparse_r0 + decay_sparse * abs(i->second));
- set(i->first, value);
- }
+FVector& FVector::operator*= (const FValue& rhs)
+{
+ // NB: could do this with boost::bind?
+ for (iterator i = begin(); i != end(); ++i) {
+ i->second *= rhs;
}
+ m_coreFeatures *= rhs;
+ return *this;
+}
- // count non-zero occurrences for all sparse features
- void FVector::setToBinaryOf(const FVector& rhs) {
- for (const_iterator i = rhs.cbegin(); i != rhs.cend(); ++i)
- if (rhs.get(i->first) != 0)
- set(i->first, 1);
- for (size_t i = 0; i < rhs.m_coreFeatures.size(); ++i)
- m_coreFeatures[i] = 1;
- }
-
- // divide only core features by scalar
- FVector& FVector::coreDivideEquals(float scalar) {
- for (size_t i = 0; i < m_coreFeatures.size(); ++i)
- m_coreFeatures[i] /= scalar;
- return *this;
- }
-
- // lhs vector is a sum of vectors, rhs vector holds number of non-zero summands
- FVector& FVector::divideEquals(const FVector& rhs) {
- assert(m_coreFeatures.size() == rhs.m_coreFeatures.size());
- for (const_iterator i = rhs.cbegin(); i != rhs.cend(); ++i)
- set(i->first, get(i->first)/rhs.get(i->first)); // divide by number of summands
- for (size_t i = 0; i < rhs.m_coreFeatures.size(); ++i)
- m_coreFeatures[i] /= rhs.m_coreFeatures[i]; // divide by number of summands
- return *this;
- }
-
- FVector& FVector::operator-= (const FVector& rhs) {
- if (rhs.m_coreFeatures.size() > m_coreFeatures.size())
- resize(rhs.m_coreFeatures.size());
- for (const_iterator i = rhs.cbegin(); i != rhs.cend(); ++i)
- set(i->first, get(i->first) -(i->second));
- for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
- if (i < rhs.m_coreFeatures.size()) {
- m_coreFeatures[i] -= rhs.m_coreFeatures[i];
- }
- }
- return *this;
+FVector& FVector::operator/= (const FValue& rhs)
+{
+ for (iterator i = begin(); i != end(); ++i) {
+ i->second /= rhs;
}
-
- FVector& FVector::operator*= (const FVector& rhs) {
- if (rhs.m_coreFeatures.size() > m_coreFeatures.size()) {
- resize(rhs.m_coreFeatures.size());
- }
- for (iterator i = begin(); i != end(); ++i) {
- FValue lhsValue = i->second;
- FValue rhsValue = rhs.get(i->first);
- set(i->first,lhsValue*rhsValue);
- }
- for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
- if (i < rhs.m_coreFeatures.size()) {
- m_coreFeatures[i] *= rhs.m_coreFeatures[i];
- } else {
- m_coreFeatures[i] = 0;
- }
- }
- return *this;
+ m_coreFeatures /= rhs;
+ return *this;
+}
+
+FVector& FVector::multiplyEqualsBackoff(const FVector& rhs, float backoff)
+{
+ if (rhs.m_coreFeatures.size() > m_coreFeatures.size()) {
+ resize(rhs.m_coreFeatures.size());
}
-
- FVector& FVector::operator/= (const FVector& rhs) {
- if (rhs.m_coreFeatures.size() > m_coreFeatures.size()) {
- resize(rhs.m_coreFeatures.size());
- }
- for (iterator i = begin(); i != end(); ++i) {
- FValue lhsValue = i->second;
- FValue rhsValue = rhs.get(i->first);
- set(i->first, lhsValue / rhsValue) ;
- }
- for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
- if (i < rhs.m_coreFeatures.size()) {
- m_coreFeatures[i] /= rhs.m_coreFeatures[i];
- } else {
- if (m_coreFeatures[i] < 0) {
- m_coreFeatures[i] = -numeric_limits<FValue>::infinity();
- } else if (m_coreFeatures[i] > 0) {
- m_coreFeatures[i] = numeric_limits<FValue>::infinity();
- }
- }
- }
- return *this;
+ for (iterator i = begin(); i != end(); ++i) {
+ FValue lhsValue = i->second;
+ FValue rhsValue = rhs.getBackoff(i->first, backoff);
+ set(i->first,lhsValue*rhsValue);
}
-
- FVector& FVector::operator*= (const FValue& rhs) {
- //NB Could do this with boost::bind ?
- for (iterator i = begin(); i != end(); ++i) {
- i->second *= rhs;
+ for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+ if (i < rhs.m_coreFeatures.size()) {
+ m_coreFeatures[i] *= rhs.m_coreFeatures[i];
+ } else {
+ m_coreFeatures[i] = 0;
}
- m_coreFeatures *= rhs;
- return *this;
}
-
- FVector& FVector::operator/= (const FValue& rhs) {
- for (iterator i = begin(); i != end(); ++i) {
- i->second /= rhs;
- }
- m_coreFeatures /= rhs;
- return *this;
+ return *this;
+}
+
+FVector& FVector::multiplyEquals(float core_r0, float sparse_r0)
+{
+ for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+ m_coreFeatures[i] *= core_r0;
}
+ for (iterator i = begin(); i != end(); ++i)
+ set(i->first,(i->second)*sparse_r0);
+ return *this;
+}
- FVector& FVector::multiplyEqualsBackoff(const FVector& rhs, float backoff) {
- if (rhs.m_coreFeatures.size() > m_coreFeatures.size()) {
- resize(rhs.m_coreFeatures.size());
- }
- for (iterator i = begin(); i != end(); ++i) {
- FValue lhsValue = i->second;
- FValue rhsValue = rhs.getBackoff(i->first, backoff);
- set(i->first,lhsValue*rhsValue);
- }
- for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
- if (i < rhs.m_coreFeatures.size()) {
- m_coreFeatures[i] *= rhs.m_coreFeatures[i];
- } else {
- m_coreFeatures[i] = 0;
- }
- }
- return *this;
+FValue FVector::l1norm() const
+{
+ FValue norm = 0;
+ for (const_iterator i = cbegin(); i != cend(); ++i) {
+ norm += abs(i->second);
}
-
- FVector& FVector::multiplyEquals(float core_r0, float sparse_r0) {
- for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
- m_coreFeatures[i] *= core_r0;
- }
- for (iterator i = begin(); i != end(); ++i)
- set(i->first,(i->second)*sparse_r0);
- return *this;
- }
-
- FValue FVector::l1norm() const {
- FValue norm = 0;
- for (const_iterator i = cbegin(); i != cend(); ++i) {
- norm += abs(i->second);
- }
- for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
- norm += abs(m_coreFeatures[i]);
- }
- return norm;
+ for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+ norm += abs(m_coreFeatures[i]);
}
+ return norm;
+}
+
+FValue FVector::l1norm_coreFeatures() const
+{
+ FValue norm = 0;
+ // ignore Bleu score feature (last feature)
+ for (size_t i = 0; i < m_coreFeatures.size()-1; ++i)
+ norm += abs(m_coreFeatures[i]);
+ return norm;
+}
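
One caveat worth flagging: m_coreFeatures.size() is unsigned, so size()-1 wraps to a huge value if the core vector is ever empty; the code presumably relies on the Bleu feature always being present. A defensive sketch of the same loop, with the wrap-around avoided:

    #include <valarray>
    #include <cmath>

    typedef float FValue;

    // Defensive sketch: skip the last (Bleu) entry, but guard the empty case
    // so the unsigned size()-1 cannot underflow.
    FValue l1normCoreExceptLast(const std::valarray<FValue>& core) {
      FValue norm = 0;
      if (core.size() == 0) return norm;
      for (size_t i = 0; i + 1 < core.size(); ++i)  // i+1 < size avoids wrap
        norm += std::fabs(core[i]);
      return norm;
    }
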
+
+FValue FVector::l2norm() const
+{
+ return sqrt(inner_product(*this));
+}
- FValue FVector::l1norm_coreFeatures() const {
- FValue norm = 0;
- // ignore Bleu score feature (last feature)
- for (size_t i = 0; i < m_coreFeatures.size()-1; ++i)
- norm += abs(m_coreFeatures[i]);
- return norm;
+FValue FVector::linfnorm() const
+{
+ FValue norm = 0;
+ for (const_iterator i = cbegin(); i != cend(); ++i) {
+ float absValue = abs(i->second);
+ if (absValue > norm)
+ norm = absValue;
}
-
- FValue FVector::l2norm() const {
- return sqrt(inner_product(*this));
+ for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+ float absValue = abs(m_coreFeatures[i]);
+ if (absValue > norm)
+ norm = absValue;
}
+ return norm;
+}
- FValue FVector::linfnorm() const {
- FValue norm = 0;
- for (const_iterator i = cbegin(); i != cend(); ++i) {
- float absValue = abs(i->second);
- if (absValue > norm)
- norm = absValue;
- }
- for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
- float absValue = abs(m_coreFeatures[i]);
- if (absValue > norm)
- norm = absValue;
+size_t FVector::l1regularize(float lambda)
+{
+ for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+ float value = m_coreFeatures[i];
+ if (value > 0) {
+ m_coreFeatures[i] = max(0.0f, value - lambda);
+ } else {
+ m_coreFeatures[i] = min(0.0f, value + lambda);
}
- return norm;
}
- size_t FVector::l1regularize(float lambda) {
- for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
- float value = m_coreFeatures[i];
- if (value > 0) {
- m_coreFeatures[i] = max(0.0f, value - lambda);
- }
- else {
- m_coreFeatures[i] = min(0.0f, value + lambda);
- }
- }
+ size_t numberPruned = size();
+ vector<FName> toErase;
+ for (iterator i = begin(); i != end(); ++i) {
+ float value = i->second;
+ if (value != 0.0f) {
+ if (value > 0)
+ value = max(0.0f, value - lambda);
+ else
+ value = min(0.0f, value + lambda);
- size_t numberPruned = size();
- vector<FName> toErase;
- for (iterator i = begin(); i != end(); ++i) {
- float value = i->second;
- if (value != 0.0f) {
- if (value > 0)
- value = max(0.0f, value - lambda);
- else
- value = min(0.0f, value + lambda);
-
- if (value != 0.0f)
- i->second = value;
- else {
- toErase.push_back(i->first);
- const std::string& fname = (i->first).name();
- FName::eraseId(FName::getId(fname));
- }
+ if (value != 0.0f)
+ i->second = value;
+ else {
+ toErase.push_back(i->first);
+ const std::string& fname = (i->first).name();
+ FName::eraseId(FName::getId(fname));
}
}
-
- // erase features that have become zero
- for (size_t i = 0; i < toErase.size(); ++i)
- m_features.erase(toErase[i]);
- numberPruned -= size();
- return numberPruned;
}
- void FVector::l2regularize(float lambda) {
- for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
- m_coreFeatures[i] *= (1 - lambda);
- }
+ // erase features that have become zero
+ for (size_t i = 0; i < toErase.size(); ++i)
+ m_features.erase(toErase[i]);
+ numberPruned -= size();
+ return numberPruned;
+}
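
l1regularize is the standard soft-thresholding (proximal) step for an L1 penalty: shrink each weight toward zero by lambda, clip at zero, and prune sparse features that land exactly on zero. The scalar core of it as a standalone sketch:

    #include <algorithm>
    #include <cstdio>

    // Soft-threshold operator: argmin over w of 0.5*(w - v)^2 + lambda*|w|
    float softThreshold(float v, float lambda) {
      if (v > 0) return std::max(0.0f, v - lambda);
      return std::min(0.0f, v + lambda);
    }

    int main() {
      // Weights inside [-lambda, lambda] are zeroed (and would be pruned).
      std::printf("%f %f %f\n",
                  softThreshold(0.8f, 0.5f),    //  0.3
                  softThreshold(-0.8f, 0.5f),   // -0.3
                  softThreshold(0.3f, 0.5f));   //  0.0
    }
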
- for (iterator i = begin(); i != end(); ++i) {
- i->second *= (1 - lambda);
- }
+void FVector::l2regularize(float lambda)
+{
+ for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+ m_coreFeatures[i] *= (1 - lambda);
}
- size_t FVector::sparseL1regularize(float lambda) {
- /*for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
- float value = m_coreFeatures[i];
- if (value > 0) {
- m_coreFeatures[i] = max(0.0f, value - lambda);
- }
+ for (iterator i = begin(); i != end(); ++i) {
+ i->second *= (1 - lambda);
+ }
+}
+
+size_t FVector::sparseL1regularize(float lambda)
+{
+ /*for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+ float value = m_coreFeatures[i];
+ if (value > 0) {
+ m_coreFeatures[i] = max(0.0f, value - lambda);
+ }
+ else {
+ m_coreFeatures[i] = min(0.0f, value + lambda);
+ }
+ }*/
+
+ size_t numberPruned = size();
+ vector<FName> toErase;
+ for (iterator i = begin(); i != end(); ++i) {
+ float value = i->second;
+ if (value != 0.0f) {
+ if (value > 0)
+ value = max(0.0f, value - lambda);
+ else
+ value = min(0.0f, value + lambda);
+
+ if (value != 0.0f)
+ i->second = value;
else {
- m_coreFeatures[i] = min(0.0f, value + lambda);
- }
- }*/
-
- size_t numberPruned = size();
- vector<FName> toErase;
- for (iterator i = begin(); i != end(); ++i) {
- float value = i->second;
- if (value != 0.0f) {
- if (value > 0)
- value = max(0.0f, value - lambda);
- else
- value = min(0.0f, value + lambda);
-
- if (value != 0.0f)
- i->second = value;
- else {
- toErase.push_back(i->first);
- const std::string& fname = (i->first).name();
- FName::eraseId(FName::getId(fname));
- }
+ toErase.push_back(i->first);
+ const std::string& fname = (i->first).name();
+ FName::eraseId(FName::getId(fname));
}
}
-
- // erase features that have become zero
- for (size_t i = 0; i < toErase.size(); ++i)
- m_features.erase(toErase[i]);
- numberPruned -= size();
- return numberPruned;
}
- void FVector::sparseL2regularize(float lambda) {
- /*for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
- m_coreFeatures[i] *= (1 - lambda);
- }*/
+ // erase features that have become zero
+ for (size_t i = 0; i < toErase.size(); ++i)
+ m_features.erase(toErase[i]);
+ numberPruned -= size();
+ return numberPruned;
+}
- for (iterator i = begin(); i != end(); ++i) {
- i->second *= (1 - lambda);
- }
- }
+void FVector::sparseL2regularize(float lambda)
+{
+ /*for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+ m_coreFeatures[i] *= (1 - lambda);
+ }*/
- FValue FVector::sum() const {
- FValue sum = 0;
- for (const_iterator i = cbegin(); i != cend(); ++i) {
- sum += i->second;
- }
- sum += m_coreFeatures.sum();
- return sum;
- }
-
- FValue FVector::inner_product(const FVector& rhs) const {
- CHECK(m_coreFeatures.size() == rhs.m_coreFeatures.size());
- FValue product = 0.0;
- for (const_iterator i = cbegin(); i != cend(); ++i) {
- product += ((i->second)*(rhs.get(i->first)));
- }
- for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
- product += m_coreFeatures[i]*rhs.m_coreFeatures[i];
- }
- return product;
+ for (iterator i = begin(); i != end(); ++i) {
+ i->second *= (1 - lambda);
}
+}
- const FVector operator+(const FVector& lhs, const FVector& rhs) {
- return FVector(lhs) += rhs;
- }
-
- const FVector operator-(const FVector& lhs, const FVector& rhs) {
- return FVector(lhs) -= rhs;
+FValue FVector::sum() const
+{
+ FValue sum = 0;
+ for (const_iterator i = cbegin(); i != cend(); ++i) {
+ sum += i->second;
}
-
- const FVector operator*(const FVector& lhs, const FVector& rhs) {
- return FVector(lhs) *= rhs;
- }
-
- const FVector operator/(const FVector& lhs, const FVector& rhs) {
- return FVector(lhs) /= rhs;
- }
-
-
- const FVector operator*(const FVector& lhs, const FValue& rhs) {
- return FVector(lhs) *= rhs;
+ sum += m_coreFeatures.sum();
+ return sum;
+}
+
+FValue FVector::inner_product(const FVector& rhs) const
+{
+ CHECK(m_coreFeatures.size() == rhs.m_coreFeatures.size());
+ FValue product = 0.0;
+ for (const_iterator i = cbegin(); i != cend(); ++i) {
+ product += ((i->second)*(rhs.get(i->first)));
}
-
- const FVector operator/(const FVector& lhs, const FValue& rhs) {
- return FVector(lhs) /= rhs;
+ for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+ product += m_coreFeatures[i]*rhs.m_coreFeatures[i];
}
+ return product;
+}
- FValue inner_product(const FVector& lhs, const FVector& rhs) {
- if (lhs.size() >= rhs.size()) {
- return rhs.inner_product(lhs);
- } else {
- return lhs.inner_product(rhs);
- }
+const FVector operator+(const FVector& lhs, const FVector& rhs)
+{
+ return FVector(lhs) += rhs;
+}
+
+const FVector operator-(const FVector& lhs, const FVector& rhs)
+{
+ return FVector(lhs) -= rhs;
+}
+
+const FVector operator*(const FVector& lhs, const FVector& rhs)
+{
+ return FVector(lhs) *= rhs;
+}
+
+const FVector operator/(const FVector& lhs, const FVector& rhs)
+{
+ return FVector(lhs) /= rhs;
+}
+
+
+const FVector operator*(const FVector& lhs, const FValue& rhs)
+{
+ return FVector(lhs) *= rhs;
+}
+
+const FVector operator/(const FVector& lhs, const FValue& rhs)
+{
+ return FVector(lhs) /= rhs;
+}
+
+FValue inner_product(const FVector& lhs, const FVector& rhs)
+{
+ if (lhs.size() >= rhs.size()) {
+ return rhs.inner_product(lhs);
+ } else {
+ return lhs.inner_product(rhs);
}
}
+}
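
The free inner_product above dispatches so that iteration runs over the vector with fewer stored entries, because get() on the other operand is a hash lookup; for two sparse vectors the cost is then proportional to the smaller one. The same idea as a generic sketch:

    #include <unordered_map>
    #include <string>
    #include <cstdio>

    typedef std::unordered_map<std::string, float> Sparse;

    // Iterate the smaller map, probe the larger: O(min(m, n)) lookups.
    float innerProduct(const Sparse& a, const Sparse& b) {
      const Sparse& small = a.size() <= b.size() ? a : b;
      const Sparse& large = a.size() <= b.size() ? b : a;
      float product = 0;
      for (Sparse::const_iterator i = small.begin(); i != small.end(); ++i) {
        Sparse::const_iterator j = large.find(i->first);
        if (j != large.end()) product += i->second * j->second;
      }
      return product;
    }

    int main() {
      Sparse a, b;
      a["x"] = 2; a["y"] = 3;
      b["y"] = 4; b["z"] = 5;
      std::printf("%f\n", innerProduct(a, b));  // 12
    }
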
diff --git a/moses/FeatureVector.h b/moses/FeatureVector.h
index 9c15ba4f7..f4261b520 100644
--- a/moses/FeatureVector.h
+++ b/moses/FeatureVector.h
@@ -1,21 +1,21 @@
/*
Moses - factored phrase-based language decoder
Copyright (C) 2010 University of Edinburgh
-
+
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
-
+
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
-
+
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
+
*/
#pragma once
@@ -47,325 +47,336 @@
#include "util/check.hh"
#include "util/string_piece.hh"
-namespace Moses {
-
- typedef float FValue;
-
- /**
- * Feature name
- **/
- struct FName {
-
- static const std::string SEP;
-
- typedef boost::unordered_map<std::string,size_t> Name2Id;
- typedef boost::unordered_map<size_t,size_t> Id2Count;
- //typedef std::map<std::string, size_t> Name2Id;
- static Name2Id name2id;
- static std::vector<std::string> id2name;
- static Id2Count id2hopeCount;
- static Id2Count id2fearCount;
-
- //A feature name can either be initialised as a pair of strings,
- //which will be concatenated with a SEP between them, or as
- //a single string, which will be used as-is.
- FName(const StringPiece &root, const StringPiece &name) {
- std::string assembled(root.data(), root.size());
- assembled += SEP;
- assembled.append(name.data(), name.size());
- init(assembled);
- }
- explicit FName(const StringPiece &name)
- {init(name);}
-
- const std::string& name() const;
- //const std::string& root() const {return m_root;}
-
- size_t hash() const;
-
- bool operator==(const FName& rhs) const ;
- bool operator!=(const FName& rhs) const ;
-
- static size_t getId(const std::string& name);
- static size_t getHopeIdCount(const std::string& name);
- static size_t getFearIdCount(const std::string& name);
- static void incrementHopeId(const std::string& name);
- static void incrementFearId(const std::string& name);
- static void eraseId(size_t id);
-
- private:
- void init(const StringPiece& name);
- size_t m_id;
+namespace Moses
+{
+
+typedef float FValue;
+
+/**
+ * Feature name
+ **/
+struct FName {
+
+ static const std::string SEP;
+
+ typedef boost::unordered_map<std::string,size_t> Name2Id;
+ typedef boost::unordered_map<size_t,size_t> Id2Count;
+ //typedef std::map<std::string, size_t> Name2Id;
+ static Name2Id name2id;
+ static std::vector<std::string> id2name;
+ static Id2Count id2hopeCount;
+ static Id2Count id2fearCount;
+
+ //A feature name can either be initialised as a pair of strings,
+ //which will be concatenated with a SEP between them, or as
+ //a single string, which will be used as-is.
+ FName(const StringPiece &root, const StringPiece &name) {
+ std::string assembled(root.data(), root.size());
+ assembled += SEP;
+ assembled.append(name.data(), name.size());
+ init(assembled);
+ }
+ explicit FName(const StringPiece &name) {
+ init(name);
+ }
+
+ const std::string& name() const;
+ //const std::string& root() const {return m_root;}
+
+ size_t hash() const;
+
+ bool operator==(const FName& rhs) const ;
+ bool operator!=(const FName& rhs) const ;
+
+ static size_t getId(const std::string& name);
+ static size_t getHopeIdCount(const std::string& name);
+ static size_t getFearIdCount(const std::string& name);
+ static void incrementHopeId(const std::string& name);
+ static void incrementFearId(const std::string& name);
+ static void eraseId(size_t id);
+
+private:
+ void init(const StringPiece& name);
+ size_t m_id;
#ifdef WITH_THREADS
- //reader-writer lock
- static boost::shared_mutex m_idLock;
+ //reader-writer lock
+ static boost::shared_mutex m_idLock;
#endif
- };
-
- std::ostream& operator<<(std::ostream& out,const FName& name);
-
- struct FNameEquals {
- inline bool operator() (const FName& lhs, const FName& rhs) const {
- return (lhs == rhs);
- }
- };
-
- struct FNameHash
- : std::unary_function<FName, std::size_t>
- {
- std::size_t operator()(const FName& x) const
- {
- return x.hash();
- }
- };
-
- class ProxyFVector;
-
- /**
- * A sparse feature (or weight) vector.
- **/
- class FVector
- {
- public:
- /** Empty feature vector */
- FVector(size_t coreFeatures = 0);
-
- FVector& operator=( const FVector& rhs ) {
- m_features = rhs.m_features;
- m_coreFeatures = rhs.m_coreFeatures;
- return *this;
- }
+};
- /*
- * Change the number of core features
- **/
- void resize(size_t newsize);
-
- typedef boost::unordered_map<FName,FValue,FNameHash, FNameEquals> FNVmap;
- /** Iterators */
- typedef FNVmap::iterator iterator;
- typedef FNVmap::const_iterator const_iterator;
- iterator begin() {return m_features.begin();}
- iterator end() {return m_features.end();}
- const_iterator cbegin() const {return m_features.cbegin();}
- const_iterator cend() const {return m_features.cend();}
-
- bool hasNonDefaultValue(FName name) const { return m_features.find(name) != m_features.end();}
- void clear();
-
-
- /** Load from file - each line should be 'root[_name] value' */
- bool load(const std::string& filename);
- void save(const std::string& filename) const;
- void write(std::ostream& out) const ;
-
- /** Element access */
- ProxyFVector operator[](const FName& name);
- FValue& operator[](size_t index);
- FValue operator[](const FName& name) const;
- FValue operator[](size_t index) const;
-
- /** Size */
- size_t size() const {
- return m_features.size() + m_coreFeatures.size();
- }
+std::ostream& operator<<(std::ostream& out,const FName& name);
- size_t coreSize() const {
- return m_coreFeatures.size();
- }
-
- const std::valarray<FValue> &getCoreFeatures() const {
- return m_coreFeatures;
- }
-
- /** Equality */
- bool operator== (const FVector& rhs) const;
- bool operator!= (const FVector& rhs) const;
-
- FValue inner_product(const FVector& rhs) const;
-
- friend class ProxyFVector;
-
- /**arithmetic */
- //Element-wise
- //If one side has fewer core features, take the missing ones to be 0.
- FVector& operator+= (const FVector& rhs);
- FVector& operator-= (const FVector& rhs);
- FVector& operator*= (const FVector& rhs);
- FVector& operator/= (const FVector& rhs);
- //Scalar
- FVector& operator*= (const FValue& rhs);
- FVector& operator/= (const FValue& rhs);
-
- FVector& multiplyEqualsBackoff(const FVector& rhs, float backoff);
- FVector& multiplyEquals(float core_r0, float sparse_r0);
-
- FVector& max_equals(const FVector& rhs);
-
- /** norms and sums */
- FValue l1norm() const;
- FValue l1norm_coreFeatures() const;
- FValue l2norm() const;
- FValue linfnorm() const;
- size_t l1regularize(float lambda);
- void l2regularize(float lambda);
- size_t sparseL1regularize(float lambda);
- void sparseL2regularize(float lambda);
- FValue sum() const;
-
- /** pretty printing */
- std::ostream& print(std::ostream& out) const;
-
- /** additional */
- void printCoreFeatures();
- //scale so that abs. value is less than maxvalue
- void thresholdScale(float maxValue );
-
- void capMax(FValue maxValue);
- void capMin(FValue minValue);
-
- void sparsePlusEquals(const FVector& rhs);
- void coreAssign(const FVector& rhs);
-
- void incrementSparseHopeFeatures();
- void incrementSparseFearFeatures();
- void printSparseHopeFeatureCounts(std::ofstream& out);
- void printSparseFearFeatureCounts(std::ofstream& out);
- void printSparseHopeFeatureCounts();
- void printSparseFearFeatureCounts();
- size_t pruneSparseFeatures(size_t threshold);
- size_t pruneZeroWeightFeatures();
- void updateConfidenceCounts(const FVector& weightUpdate, bool signedCounts);
- void updateLearningRates(float decay_core, float decay_sparse, const FVector& confidence_counts, float core_r0, float sparse_r0);
-
- // vector which, for each element of the original vector, reflects whether an element is zero or non-zero
- void setToBinaryOf(const FVector& rhs);
-
- // divide only core features by scalar
- FVector& coreDivideEquals(float scalar);
-
- // divide each element by the number given in the rhs vector
- FVector& divideEquals(const FVector& rhs);
+struct FNameEquals {
+ inline bool operator() (const FName& lhs, const FName& rhs) const {
+ return (lhs == rhs);
+ }
+};
+
+struct FNameHash
+ : std::unary_function<FName, std::size_t> {
+ std::size_t operator()(const FName& x) const {
+ return x.hash();
+ }
+};
+
+class ProxyFVector;
+
+/**
+ * A sparse feature (or weight) vector.
+ **/
+class FVector
+{
+public:
+ /** Empty feature vector */
+ FVector(size_t coreFeatures = 0);
+
+ FVector& operator=( const FVector& rhs ) {
+ m_features = rhs.m_features;
+ m_coreFeatures = rhs.m_coreFeatures;
+ return *this;
+ }
+
+ /*
+ * Change the number of core features
+ **/
+ void resize(size_t newsize);
+
+ typedef boost::unordered_map<FName,FValue,FNameHash, FNameEquals> FNVmap;
+ /** Iterators */
+ typedef FNVmap::iterator iterator;
+ typedef FNVmap::const_iterator const_iterator;
+ iterator begin() {
+ return m_features.begin();
+ }
+ iterator end() {
+ return m_features.end();
+ }
+ const_iterator cbegin() const {
+ return m_features.cbegin();
+ }
+ const_iterator cend() const {
+ return m_features.cend();
+ }
+
+ bool hasNonDefaultValue(FName name) const {
+ return m_features.find(name) != m_features.end();
+ }
+ void clear();
+
+
+ /** Load from file - each line should be 'root[_name] value' */
+ bool load(const std::string& filename);
+ void save(const std::string& filename) const;
+ void write(std::ostream& out) const ;
+
+ /** Element access */
+ ProxyFVector operator[](const FName& name);
+ FValue& operator[](size_t index);
+ FValue operator[](const FName& name) const;
+ FValue operator[](size_t index) const;
+
+ /** Size */
+ size_t size() const {
+ return m_features.size() + m_coreFeatures.size();
+ }
+
+ size_t coreSize() const {
+ return m_coreFeatures.size();
+ }
+
+ const std::valarray<FValue> &getCoreFeatures() const {
+ return m_coreFeatures;
+ }
+
+ /** Equality */
+ bool operator== (const FVector& rhs) const;
+ bool operator!= (const FVector& rhs) const;
+
+ FValue inner_product(const FVector& rhs) const;
+
+ friend class ProxyFVector;
+
+ /**arithmetic */
+ //Element-wise
+ //If one side has fewer core features, take the missing ones to be 0.
+ FVector& operator+= (const FVector& rhs);
+ FVector& operator-= (const FVector& rhs);
+ FVector& operator*= (const FVector& rhs);
+ FVector& operator/= (const FVector& rhs);
+ //Scalar
+ FVector& operator*= (const FValue& rhs);
+ FVector& operator/= (const FValue& rhs);
+
+ FVector& multiplyEqualsBackoff(const FVector& rhs, float backoff);
+ FVector& multiplyEquals(float core_r0, float sparse_r0);
+
+ FVector& max_equals(const FVector& rhs);
+
+ /** norms and sums */
+ FValue l1norm() const;
+ FValue l1norm_coreFeatures() const;
+ FValue l2norm() const;
+ FValue linfnorm() const;
+ size_t l1regularize(float lambda);
+ void l2regularize(float lambda);
+ size_t sparseL1regularize(float lambda);
+ void sparseL2regularize(float lambda);
+ FValue sum() const;
+
+ /** pretty printing */
+ std::ostream& print(std::ostream& out) const;
+
+ /** additional */
+ void printCoreFeatures();
+ //scale so that abs. value is less than maxvalue
+ void thresholdScale(float maxValue );
+
+ void capMax(FValue maxValue);
+ void capMin(FValue minValue);
+
+ void sparsePlusEquals(const FVector& rhs);
+ void coreAssign(const FVector& rhs);
+
+ void incrementSparseHopeFeatures();
+ void incrementSparseFearFeatures();
+ void printSparseHopeFeatureCounts(std::ofstream& out);
+ void printSparseFearFeatureCounts(std::ofstream& out);
+ void printSparseHopeFeatureCounts();
+ void printSparseFearFeatureCounts();
+ size_t pruneSparseFeatures(size_t threshold);
+ size_t pruneZeroWeightFeatures();
+ void updateConfidenceCounts(const FVector& weightUpdate, bool signedCounts);
+ void updateLearningRates(float decay_core, float decay_sparse, const FVector& confidence_counts, float core_r0, float sparse_r0);
+
+ // vector which, for each element of the original vector, reflects whether an element is zero or non-zero
+ void setToBinaryOf(const FVector& rhs);
+
+ // divide only core features by scalar
+ FVector& coreDivideEquals(float scalar);
+
+ // divide each element by the number given in the rhs vector
+ FVector& divideEquals(const FVector& rhs);
#ifdef MPI_ENABLE
- friend class boost::serialization::access;
-#endif
-
- private:
-
- /** Internal get and set. */
- const FValue& get(const FName& name) const;
- FValue getBackoff(const FName& name, float backoff) const;
- void set(const FName& name, const FValue& value);
-
- FNVmap m_features;
- std::valarray<FValue> m_coreFeatures;
-
+ friend class boost::serialization::access;
+#endif
+
+private:
+
+ /** Internal get and set. */
+ const FValue& get(const FName& name) const;
+ FValue getBackoff(const FName& name, float backoff) const;
+ void set(const FName& name, const FValue& value);
+
+ FNVmap m_features;
+ std::valarray<FValue> m_coreFeatures;
+
#ifdef MPI_ENABLE
- //serialization
- template<class Archive>
- void save(Archive &ar, const unsigned int version) const {
- std::vector<std::string> names;
- std::vector<FValue> values;
- for (const_iterator i = cbegin(); i != cend(); ++i) {
- std::ostringstream ostr;
- ostr << i->first;
- names.push_back(ostr.str());
- values.push_back(i->second);
- }
- ar << names;
- ar << values;
- ar << m_coreFeatures;
+ //serialization
+ template<class Archive>
+ void save(Archive &ar, const unsigned int version) const {
+ std::vector<std::string> names;
+ std::vector<FValue> values;
+ for (const_iterator i = cbegin(); i != cend(); ++i) {
+ std::ostringstream ostr;
+ ostr << i->first;
+ names.push_back(ostr.str());
+ values.push_back(i->second);
}
-
- template<class Archive>
- void load(Archive &ar, const unsigned int version) {
- clear();
- std::vector<std::string> names;
- std::vector<FValue> values;
- ar >> names;
- ar >> values;
- ar >> m_coreFeatures;
- CHECK(names.size() == values.size());
- for (size_t i = 0; i < names.size(); ++i) {
- set(FName(names[i]), values[i]);
- }
+ ar << names;
+ ar << values;
+ ar << m_coreFeatures;
+ }
+
+ template<class Archive>
+ void load(Archive &ar, const unsigned int version) {
+ clear();
+ std::vector<std::string> names;
+ std::vector<FValue> values;
+ ar >> names;
+ ar >> values;
+ ar >> m_coreFeatures;
+ CHECK(names.size() == values.size());
+ for (size_t i = 0; i < names.size(); ++i) {
+ set(FName(names[i]), values[i]);
}
-
- BOOST_SERIALIZATION_SPLIT_MEMBER()
-
+ }
+
+ BOOST_SERIALIZATION_SPLIT_MEMBER()
+
#endif
-
- };
-
- std::ostream& operator<<( std::ostream& out, const FVector& fv);
- //Element-wise operations
- const FVector operator+(const FVector& lhs, const FVector& rhs);
- const FVector operator-(const FVector& lhs, const FVector& rhs);
- const FVector operator*(const FVector& lhs, const FVector& rhs);
- const FVector operator/(const FVector& lhs, const FVector& rhs);
-
- //Scalar operations
- const FVector operator*(const FVector& lhs, const FValue& rhs);
- const FVector operator/(const FVector& lhs, const FValue& rhs);
-
- const FVector fvmax(const FVector& lhs, const FVector& rhs);
-
- FValue inner_product(const FVector& lhs, const FVector& rhs);
-
- struct FVectorPlus {
- FVector operator()(const FVector& lhs, const FVector& rhs) const {
- return lhs + rhs;
- }
- };
-
- /**
- * Used to help with subscript operator overloading.
- * See http://stackoverflow.com/questions/1386075/overloading-operator-for-a-sparse-vector
- **/
- class ProxyFVector {
- public:
- ProxyFVector(FVector *fv, const FName& name ) : m_fv(fv), m_name(name) {}
- ProxyFVector &operator=(const FValue& value) {
- // If we get here, we know that operator[] was called to perform a write access,
- // so we can insert an item in the vector if needed
- //std::cerr << "Inserting " << value << " into " << m_name << std::endl;
- m_fv->set(m_name,value);
- return *this;
-
- }
-
- operator FValue() {
- // If we get here, we know that operator[] was called to perform a read access,
- // so we can simply return the value from the vector
- return m_fv->get(m_name);
- }
-
- /*operator FValue&() {
- return m_fv->m_features[m_name];
- }*/
-
- FValue operator++() {
- return ++m_fv->m_features[m_name];
- }
-
- FValue operator +=(FValue lhs) {
- return (m_fv->m_features[m_name] += lhs);
- }
-
- FValue operator -=(FValue lhs) {
- return (m_fv->m_features[m_name] -= lhs);
- }
- private:
- FValue m_tmp;
-
- private:
- FVector* m_fv;
- const FName& m_name;
-
- };
-
+};
+
+std::ostream& operator<<( std::ostream& out, const FVector& fv);
+//Element-wise operations
+const FVector operator+(const FVector& lhs, const FVector& rhs);
+const FVector operator-(const FVector& lhs, const FVector& rhs);
+const FVector operator*(const FVector& lhs, const FVector& rhs);
+const FVector operator/(const FVector& lhs, const FVector& rhs);
+
+//Scalar operations
+const FVector operator*(const FVector& lhs, const FValue& rhs);
+const FVector operator/(const FVector& lhs, const FValue& rhs);
+
+const FVector fvmax(const FVector& lhs, const FVector& rhs);
+
+FValue inner_product(const FVector& lhs, const FVector& rhs);
+
+struct FVectorPlus {
+ FVector operator()(const FVector& lhs, const FVector& rhs) const {
+ return lhs + rhs;
+ }
+};
+
+/**
+ * Used to help with subscript operator overloading.
+ * See http://stackoverflow.com/questions/1386075/overloading-operator-for-a-sparse-vector
+ **/
+class ProxyFVector
+{
+public:
+ ProxyFVector(FVector *fv, const FName& name ) : m_fv(fv), m_name(name) {}
+ ProxyFVector &operator=(const FValue& value) {
+ // If we get here, we know that operator[] was called to perform a write access,
+ // so we can insert an item in the vector if needed
+ //std::cerr << "Inserting " << value << " into " << m_name << std::endl;
+ m_fv->set(m_name,value);
+ return *this;
+
+ }
+
+ operator FValue() {
+ // If we get here, we know that operator[] was called to perform a read access,
+ // so we can simply return the value from the vector
+ return m_fv->get(m_name);
+ }
+
+ /*operator FValue&() {
+ return m_fv->m_features[m_name];
+ }*/
+
+ FValue operator++() {
+ return ++m_fv->m_features[m_name];
+ }
+
+ FValue operator +=(FValue lhs) {
+ return (m_fv->m_features[m_name] += lhs);
+ }
+
+ FValue operator -=(FValue lhs) {
+ return (m_fv->m_features[m_name] -= lhs);
+ }
+
+private:
+ FValue m_tmp;
+
+private:
+ FVector* m_fv;
+ const FName& m_name;
+
+};
+
}
#endif
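
As the stackoverflow link in the header notes, ProxyFVector exists because operator[] on a sparse map cannot distinguish reads from writes: handing out a real reference would insert a default entry on every read. The proxy defers the decision, converting to FValue on reads and calling set() only on writes. A stripped-down sketch of the pattern:

    #include <map>
    #include <cstdio>

    class SparseVec {
    public:
      class Proxy {
      public:
        Proxy(SparseVec* v, int key) : v_(v), key_(key) {}
        // Write access: only now is an entry inserted into the map.
        Proxy& operator=(float value) { v_->data_[key_] = value; return *this; }
        // Read access: look up without inserting a default entry.
        operator float() const {
          std::map<int, float>::const_iterator i = v_->data_.find(key_);
          return i == v_->data_.end() ? 0.0f : i->second;
        }
      private:
        SparseVec* v_;
        int key_;
      };

      Proxy operator[](int key) { return Proxy(this, key); }
      size_t storedEntries() const { return data_.size(); }

    private:
      std::map<int, float> data_;
    };

    int main() {
      SparseVec v;
      float x = v[7];        // read: no entry created
      v[3] = 1.5f;           // write: entry created
      std::printf("%f %zu\n", x + float(v[3]), v.storedEntries());  // 1.5 1
    }
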
diff --git a/moses/FeatureVectorTest.cpp b/moses/FeatureVectorTest.cpp
index af1829e62..2e00b276e 100644
--- a/moses/FeatureVectorTest.cpp
+++ b/moses/FeatureVectorTest.cpp
@@ -28,41 +28,49 @@ static const float TOL = 0.00001;
BOOST_AUTO_TEST_SUITE(fv)
-BOOST_AUTO_TEST_CASE(vector_sum_diff)
+BOOST_AUTO_TEST_CASE(vector_sum_diff)
{
FVector f1,f2,f3;
FName n1("a");
FName n2("b");
FName n3("c");
FName n4("d");
- f1[n1] = 1.2; f1[n2] = 1.4; f1[n3] = -0.1;
- f2[n1] = 0.01; f2[n3] = 5.6; f2[n4] = 0.6;
+ f1[n1] = 1.2;
+ f1[n2] = 1.4;
+ f1[n3] = -0.1;
+ f2[n1] = 0.01;
+ f2[n3] = 5.6;
+ f2[n4] = 0.6;
f3[n1] =1.2;
FVector sum = f1 + f2;
FVector diff = f1 - f2;
- BOOST_CHECK_CLOSE((FValue)sum[n1], 1.21, TOL);
- BOOST_CHECK_CLOSE((FValue)sum[n2], 1.4, TOL);
- BOOST_CHECK_CLOSE((FValue)sum[n3], 5.5, TOL);
- BOOST_CHECK_CLOSE((FValue)sum[n4], 0.6, TOL);
- BOOST_CHECK_CLOSE((FValue)diff[n1], 1.19, TOL);
- BOOST_CHECK_CLOSE((FValue)diff[n2], 1.4, TOL);
- BOOST_CHECK_CLOSE((FValue)diff[n3], -5.7, TOL);
- BOOST_CHECK_CLOSE((FValue)diff[n4], -0.6, TOL);
+ BOOST_CHECK_CLOSE((FValue)sum[n1], 1.21, TOL);
+ BOOST_CHECK_CLOSE((FValue)sum[n2], 1.4, TOL);
+ BOOST_CHECK_CLOSE((FValue)sum[n3], 5.5, TOL);
+ BOOST_CHECK_CLOSE((FValue)sum[n4], 0.6, TOL);
+ BOOST_CHECK_CLOSE((FValue)diff[n1], 1.19, TOL);
+ BOOST_CHECK_CLOSE((FValue)diff[n2], 1.4, TOL);
+ BOOST_CHECK_CLOSE((FValue)diff[n3], -5.7, TOL);
+ BOOST_CHECK_CLOSE((FValue)diff[n4], -0.6, TOL);
f1 -= f3;
cerr << f1 << endl << f3 << endl ;
BOOST_CHECK_CLOSE((FValue)f1[n1],0,TOL);
}
-BOOST_AUTO_TEST_CASE(scalar)
+BOOST_AUTO_TEST_CASE(scalar)
{
FVector f1,f2;
FName n1("a");
FName n2("b");
FName n3("c");
FName n4("d");
- f1[n1] = 0.2; f1[n2] = 9.178; f1[n3] = -0.1;
- f2[n1] = 0.01; f2[n3] = 5.6; f2[n4] = 0.6;
+ f1[n1] = 0.2;
+ f1[n2] = 9.178;
+ f1[n3] = -0.1;
+ f2[n1] = 0.01;
+ f2[n3] = 5.6;
+ f2[n4] = 0.6;
FVector prod1 = f1 * 2;
FVector prod2 = f1 * -0.1;
FVector quot = f2 / 2;
@@ -80,12 +88,13 @@ BOOST_AUTO_TEST_CASE(scalar)
BOOST_CHECK_CLOSE((FValue)quot[n4], 0.3, TOL);
}
-BOOST_AUTO_TEST_CASE(inc)
+BOOST_AUTO_TEST_CASE(inc)
{
FVector f1;
FName n1("a");
FName n2("b");
- f1[n1] = 2.3; f1[n2] = -0.4;
+ f1[n1] = 2.3;
+ f1[n2] = -0.4;
f1[n1]+=2;
BOOST_CHECK_CLOSE((FValue)f1[n1], 4.3, TOL);
BOOST_CHECK_CLOSE((FValue)f1[n2], -0.4, TOL);
@@ -103,8 +112,13 @@ BOOST_AUTO_TEST_CASE(vector_mult)
FName n2("b");
FName n3("c");
FName n4("d");
- f1[n1] = 0.2; f1[n2] = 9.178; f1[n3] = -0.1;
- f2[n1] = 0.01; f2[n2] = 5.6; f2[n3] = 1; f2[n4] = 0.6;
+ f1[n1] = 0.2;
+ f1[n2] = 9.178;
+ f1[n3] = -0.1;
+ f2[n1] = 0.01;
+ f2[n2] = 5.6;
+ f2[n3] = 1;
+ f2[n4] = 0.6;
FVector prod = f1 * f2;
FVector quot = f1/f2;
BOOST_CHECK_CLOSE((FValue)prod[n1], 0.002, TOL);
@@ -118,7 +132,7 @@ BOOST_AUTO_TEST_CASE(vector_mult)
BOOST_CHECK_CLOSE((FValue)quot[n4], 0, TOL);
}
-BOOST_AUTO_TEST_CASE(core)
+BOOST_AUTO_TEST_CASE(core)
{
FVector f1(2);
f1[0] = 1.3;
@@ -127,7 +141,7 @@ BOOST_AUTO_TEST_CASE(core)
BOOST_CHECK_CLOSE(f1[1],-1.9,TOL);
f1[1] = 0.1;
BOOST_CHECK_CLOSE(f1[1],0.1,TOL);
-
+
BOOST_CHECK_EQUAL(f1.size(),2);
f1[FName("a")] = 1.2;
@@ -140,8 +154,13 @@ BOOST_AUTO_TEST_CASE(core_arith)
FVector f2(2);
FName n1("a");
FName n2("b");
- f1[0] = 1.1; f1[1] = 0.25; f1[n1] = 3.6; f1[n2] = -1.5;
- f2[0] = 0.5; f2[1] = -0.1; f2[n1] = 1;
+ f1[0] = 1.1;
+ f1[1] = 0.25;
+ f1[n1] = 3.6;
+ f1[n2] = -1.5;
+ f2[0] = 0.5;
+ f2[1] = -0.1;
+ f2[n1] = 1;
//vector ops
FVector sum = f1+f2;
@@ -172,9 +191,10 @@ BOOST_AUTO_TEST_CASE(core_arith)
//with different length vectors
FVector f3(2);
FVector f4(1);
- f3[0] = 2; f3[1] = -1;
+ f3[0] = 2;
+ f3[1] = -1;
f4[0] = 5;
-
+
FVector sum1 = f3 + f4;
FVector sum2 = f4 + f3;
BOOST_CHECK_EQUAL(sum1,sum2);
@@ -200,14 +220,17 @@ BOOST_AUTO_TEST_CASE(core_arith)
BOOST_CHECK_EQUAL(quot1[1], -numeric_limits<float>::infinity());
BOOST_CHECK_CLOSE(quot2[0], 2.5, TOL);
BOOST_CHECK_CLOSE(quot2[1], 0, TOL);
-
+
}
BOOST_AUTO_TEST_CASE(core_scalar)
{
FVector f1(3);
FName n1("a");
- f1[0] = 1.5; f1[1] = 2.1; f1[2] = 4; f1[n1] = -0.5;
+ f1[0] = 1.5;
+ f1[1] = 2.1;
+ f1[2] = 4;
+ f1[n1] = -0.5;
FVector prod = f1*2;
FVector quot = f1/5;
@@ -224,31 +247,41 @@ BOOST_AUTO_TEST_CASE(core_scalar)
}
-BOOST_AUTO_TEST_CASE(l1norm)
+BOOST_AUTO_TEST_CASE(l1norm)
{
FVector f1(3);
FName n1("a");
- f1[0] = 1.5; f1[1] = 2.1; f1[2] = 4; f1[n1] = -0.5;
+ f1[0] = 1.5;
+ f1[1] = 2.1;
+ f1[2] = 4;
+ f1[n1] = -0.5;
FValue n = f1.l1norm();
BOOST_CHECK_CLOSE((FValue)n, abs(1.5)+abs(2.1)+abs(4)+abs(-0.5), TOL);
}
-BOOST_AUTO_TEST_CASE(sum)
+BOOST_AUTO_TEST_CASE(sum)
{
FVector f1(3);
FName n1("a");
FName n2("b");
- f1[0] = 1.5; f1[1] = 2.1; f1[2] = 4; f1[n1] = -0.5; f1[n2] = 2.7;
+ f1[0] = 1.5;
+ f1[1] = 2.1;
+ f1[2] = 4;
+ f1[n1] = -0.5;
+ f1[n2] = 2.7;
FValue n = f1.sum();
BOOST_CHECK_CLOSE((FValue)n, 1.5+2.1+4-0.5+2.7, TOL);
}
-BOOST_AUTO_TEST_CASE(l2norm)
+BOOST_AUTO_TEST_CASE(l2norm)
{
FVector f1(3);
FName n1("a");
- f1[0] = 1.5; f1[1] = 2.1; f1[2] = 4; f1[n1] = -0.5;
+ f1[0] = 1.5;
+ f1[1] = 2.1;
+ f1[2] = 4;
+ f1[n1] = -0.5;
FValue n = f1.l2norm();
BOOST_CHECK_CLOSE((FValue)n, sqrt((1.5*1.5)+(2.1*2.1)+(4*4)+(-0.5*-0.5)), TOL);
}
@@ -260,8 +293,14 @@ BOOST_AUTO_TEST_CASE(ip)
FName n1("a");
FName n2("b");
FName n3("c");
- f1[0] = 1.1; f1[1] = -0.1; ; f1[n2] = -1.5; f1[n3] = 2.2;
- f2[0] = 0.5; f2[1] = 0.25; f2[n1] = 1; f2[n3] = 2.4;
+ f1[0] = 1.1;
+ f1[1] = -0.1;
+ f1[n2] = -1.5;
+ f1[n3] = 2.2;
+ f2[0] = 0.5;
+ f2[1] = 0.25;
+ f2[n1] = 1;
+ f2[n3] = 2.4;
FValue p1 = inner_product(f1,f2);
FValue p2 = inner_product(f2,f1);
BOOST_CHECK_CLOSE(p1,p2,TOL);
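
A detail that helps when reading these tests: BOOST_CHECK_CLOSE takes its tolerance as a percentage, so TOL = 0.00001 means the values may differ by 0.00001 percent of their magnitude, not by an absolute epsilon. A minimal Boost.Test illustration (standalone header-only build assumed):

    #define BOOST_TEST_MODULE close_tol
    #include <boost/test/included/unit_test.hpp>

    BOOST_AUTO_TEST_CASE(percent_tolerance)
    {
      // Third argument is a percentage: 1.0 means the values may differ
      // by up to 1% of their magnitude, not by an absolute 1.0.
      BOOST_CHECK_CLOSE(100.0f, 100.9f, 1.0f);  // passes: 0.9% apart
    }
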
diff --git a/moses/GenerationDictionary.cpp b/moses/GenerationDictionary.cpp
index f9f418197..dbc0eedb3 100644
--- a/moses/GenerationDictionary.cpp
+++ b/moses/GenerationDictionary.cpp
@@ -35,7 +35,7 @@ namespace Moses
{
GenerationDictionary::GenerationDictionary(const std::string &line)
-: DecodeFeature("Generation", line)
+ : DecodeFeature("Generation", line)
{
string filePath;
@@ -44,8 +44,7 @@ GenerationDictionary::GenerationDictionary(const std::string &line)
if (args[0] == "path") {
filePath = args[1];
- }
- else {
+ } else {
//UserMessage::Add("Unknown argument " + args[0]);
//abort();
}
diff --git a/moses/GenerationDictionary.h b/moses/GenerationDictionary.h
index 6a1e4de9a..b2aeb0d96 100644
--- a/moses/GenerationDictionary.h
+++ b/moses/GenerationDictionary.h
@@ -53,22 +53,21 @@ protected:
public:
GenerationDictionary(const std::string &line);
- virtual ~GenerationDictionary();
-
- //! load data file
- bool Load(const std::string &filePath, FactorDirection direction);
-
- /** number of unique input entries in the generation table.
- * NOT the number of lines in the generation table
- */
- size_t GetSize() const
- {
- return m_collection.size();
- }
- /** returns a bag of output words, OutputWordCollection, for a particular input word.
- * Or NULL if the input word isn't found. The search function used is the WordComparer functor
- */
- const OutputWordCollection *FindWord(const Word &word) const;
+ virtual ~GenerationDictionary();
+
+ //! load data file
+ bool Load(const std::string &filePath, FactorDirection direction);
+
+ /** number of unique input entries in the generation table.
+ * NOT the number of lines in the generation table
+ */
+ size_t GetSize() const {
+ return m_collection.size();
+ }
+ /** returns a bag of output words, OutputWordCollection, for a particular input word.
+ * Or NULL if the input word isn't found. The search function used is the WordComparer functor
+ */
+ const OutputWordCollection *FindWord(const Word &word) const;
};
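
As the comment above says, FindWord returns NULL when a word has no generation entries, so every caller needs a pointer guard. A self-contained analogue of that lookup contract, with std::map standing in for the real Moses collection types:

    #include <map>
    #include <string>
    #include <cstdio>

    typedef std::map<std::string, float> OutputBag;       // output word -> score
    typedef std::map<std::string, OutputBag> Generation;  // input word -> bag

    // NULL-returning lookup, analogous to GenerationDictionary::FindWord.
    const OutputBag* findWord(const Generation& g, const std::string& w) {
      Generation::const_iterator i = g.find(w);
      return i == g.end() ? NULL : &i->second;
    }

    int main() {
      Generation g;
      g["haus"]["house|NN"] = 0.9f;
      const OutputBag* bag = findWord(g, "haus");
      if (bag != NULL)
        std::printf("entries: %zu\n", bag->size());
      if (findWord(g, "unseen") == NULL)
        std::printf("no generation entries\n");
    }
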
diff --git a/moses/Hypothesis.cpp b/moses/Hypothesis.cpp
index 7bc3e6a75..50443904c 100644
--- a/moses/Hypothesis.cpp
+++ b/moses/Hypothesis.cpp
@@ -60,8 +60,8 @@ Hypothesis::Hypothesis(Manager& manager, InputType const& source, const TargetPh
, m_arcList(NULL)
, m_transOpt(NULL)
, m_manager(manager)
-, m_totalScore(0.0f)
-, m_futureScore(0.0f)
+ , m_totalScore(0.0f)
+ , m_futureScore(0.0f)
, m_id(m_manager.GetNextHypoId())
{
@@ -248,20 +248,22 @@ int Hypothesis::RecombineCompare(const Hypothesis &compare) const
}
if (comp != 0) return comp;
}
-
+
return 0;
}
void Hypothesis::EvaluateWith(const StatefulFeatureFunction &sfff,
- int state_idx) {
+ int state_idx)
+{
m_ffStates[state_idx] = sfff.Evaluate(
- *this,
- m_prevHypo ? m_prevHypo->m_ffStates[state_idx] : NULL,
- &m_scoreBreakdown);
-
+ *this,
+ m_prevHypo ? m_prevHypo->m_ffStates[state_idx] : NULL,
+ &m_scoreBreakdown);
+
}
-void Hypothesis::EvaluateWith(const StatelessFeatureFunction& slff) {
+void Hypothesis::EvaluateWith(const StatelessFeatureFunction& slff)
+{
slff.Evaluate(PhraseBasedFeatureContext(this), &m_scoreBreakdown);
}
@@ -280,14 +282,14 @@ void Hypothesis::CalcScore(const SquareMatrix &futureScore)
// compute values of stateless feature functions that were not
// cached in the translation option
const vector<const StatelessFeatureFunction*>& sfs =
- StatelessFeatureFunction::GetStatelessFeatureFunctions();
+ StatelessFeatureFunction::GetStatelessFeatureFunctions();
for (unsigned i = 0; i < sfs.size(); ++i) {
- const StatelessFeatureFunction &ff = *sfs[i];
+ const StatelessFeatureFunction &ff = *sfs[i];
EvaluateWith(ff);
}
const vector<const StatefulFeatureFunction*>& ffs =
- StatefulFeatureFunction::GetStatefulFeatureFunctions();
+ StatefulFeatureFunction::GetStatefulFeatureFunctions();
for (unsigned i = 0; i < ffs.size(); ++i) {
const StatefulFeatureFunction &ff = *ffs[i];
m_ffStates[i] = ff.Evaluate(
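
EvaluateWith above threads feature-function state through the hypothesis chain: each stateful feature reads the predecessor hypothesis' state at its slot index and stores a fresh state at the same index. The bookkeeping, reduced to a toy sketch (a word-count state standing in for real LM state; names are illustrative only):

    #include <vector>
    #include <cstdio>

    // Toy stateful feature: state is a running word count; the score
    // contribution is omitted to keep the sketch short.
    struct CountState { int words; };

    CountState evaluate(const CountState* prev, int phraseLen) {
      CountState next;
      next.words = (prev ? prev->words : 0) + phraseLen;
      return next;
    }

    int main() {
      // One state slot per stateful feature; here a single feature (index 0).
      std::vector<CountState> h1(1), h2(1);
      h1[0] = evaluate(NULL, 3);         // first hypothesis: no predecessor
      h2[0] = evaluate(&h1[0], 2);       // extension reads predecessor's state
      std::printf("%d\n", h2[0].words);  // 5
    }
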
diff --git a/moses/HypothesisStack.h b/moses/HypothesisStack.h
index 26e6ed21b..0c3d4198f 100644
--- a/moses/HypothesisStack.h
+++ b/moses/HypothesisStack.h
@@ -11,7 +11,7 @@ namespace Moses
class Manager;
-/** abstract unique set of hypotheses that cover a certain number of words,
+/** abstract unique set of hypotheses that cover a certain number of words,
* ie. a stack in phrase-based decoding
*/
class HypothesisStack
diff --git a/moses/Incremental.cpp b/moses/Incremental.cpp
index 3eb66fb0e..e4159063c 100644
--- a/moses/Incremental.cpp
+++ b/moses/Incremental.cpp
@@ -19,90 +19,98 @@
#include <boost/lexical_cast.hpp>
-namespace Moses {
-namespace Incremental {
-namespace {
+namespace Moses
+{
+namespace Incremental
+{
+namespace
+{
// This is called by EdgeGenerator. Route hypotheses to separate vertices for
-// each left hand side label, populating ChartCellLabelSet out.
-template <class Best> class HypothesisCallback {
- private:
- typedef search::VertexGenerator<Best> Gen;
- public:
- HypothesisCallback(search::ContextBase &context, Best &best, ChartCellLabelSet &out, boost::object_pool<search::Vertex> &vertex_pool)
- : context_(context), best_(best), out_(out), vertex_pool_(vertex_pool) {}
-
- void NewHypothesis(search::PartialEdge partial) {
- // Get the LHS, look it up in the output ChartCellLabel, and upcast it.
- // It's not part of the union because it would have been ugly to expose template types in ChartCellLabel.
- ChartCellLabel::Stack &stack = out_.FindOrInsert(static_cast<const TargetPhrase *>(partial.GetNote().vp)->GetTargetLHS());
- Gen *entry = static_cast<Gen*>(stack.incr_generator);
- if (!entry) {
- entry = generator_pool_.construct(context_, *vertex_pool_.construct(), best_);
- stack.incr_generator = entry;
- }
- entry->NewHypothesis(partial);
+// each left hand side label, populating ChartCellLabelSet out.
+template <class Best> class HypothesisCallback
+{
+private:
+ typedef search::VertexGenerator<Best> Gen;
+public:
+ HypothesisCallback(search::ContextBase &context, Best &best, ChartCellLabelSet &out, boost::object_pool<search::Vertex> &vertex_pool)
+ : context_(context), best_(best), out_(out), vertex_pool_(vertex_pool) {}
+
+ void NewHypothesis(search::PartialEdge partial) {
+ // Get the LHS, look it up in the output ChartCellLabel, and upcast it.
+ // It's not part of the union because it would have been ugly to expose template types in ChartCellLabel.
+ ChartCellLabel::Stack &stack = out_.FindOrInsert(static_cast<const TargetPhrase *>(partial.GetNote().vp)->GetTargetLHS());
+ Gen *entry = static_cast<Gen*>(stack.incr_generator);
+ if (!entry) {
+ entry = generator_pool_.construct(context_, *vertex_pool_.construct(), best_);
+ stack.incr_generator = entry;
}
+ entry->NewHypothesis(partial);
+ }
- void FinishedSearch() {
- for (ChartCellLabelSet::iterator i(out_.mutable_begin()); i != out_.mutable_end(); ++i) {
- ChartCellLabel::Stack &stack = i->second.MutableStack();
- Gen *gen = static_cast<Gen*>(stack.incr_generator);
- gen->FinishedSearch();
- stack.incr = &gen->Generating();
- }
+ void FinishedSearch() {
+ for (ChartCellLabelSet::iterator i(out_.mutable_begin()); i != out_.mutable_end(); ++i) {
+ ChartCellLabel::Stack &stack = i->second.MutableStack();
+ Gen *gen = static_cast<Gen*>(stack.incr_generator);
+ gen->FinishedSearch();
+ stack.incr = &gen->Generating();
}
+ }
- private:
- search::ContextBase &context_;
+private:
+ search::ContextBase &context_;
- Best &best_;
+ Best &best_;
- ChartCellLabelSet &out_;
+ ChartCellLabelSet &out_;
- boost::object_pool<search::Vertex> &vertex_pool_;
- boost::object_pool<Gen> generator_pool_;
+ boost::object_pool<search::Vertex> &vertex_pool_;
+ boost::object_pool<Gen> generator_pool_;
};
// This is called by the moses parser to collect hypotheses. It converts to my
-// edges (search::PartialEdge).
-template <class Model> class Fill : public ChartParserCallback {
- public:
- Fill(search::Context<Model> &context, const std::vector<lm::WordIndex> &vocab_mapping, search::Score oov_weight)
- : context_(context), vocab_mapping_(vocab_mapping), oov_weight_(oov_weight) {}
+// edges (search::PartialEdge).
+template <class Model> class Fill : public ChartParserCallback
+{
+public:
+ Fill(search::Context<Model> &context, const std::vector<lm::WordIndex> &vocab_mapping, search::Score oov_weight)
+ : context_(context), vocab_mapping_(vocab_mapping), oov_weight_(oov_weight) {}
- void Add(const TargetPhraseCollection &targets, const StackVec &nts, const WordsRange &ignored);
+ void Add(const TargetPhraseCollection &targets, const StackVec &nts, const WordsRange &ignored);
- void AddPhraseOOV(TargetPhrase &phrase, std::list<TargetPhraseCollection*> &waste_memory, const WordsRange &range);
+ void AddPhraseOOV(TargetPhrase &phrase, std::list<TargetPhraseCollection*> &waste_memory, const WordsRange &range);
- bool Empty() const { return edges_.Empty(); }
+ bool Empty() const {
+ return edges_.Empty();
+ }
- template <class Best> void Search(Best &best, ChartCellLabelSet &out, boost::object_pool<search::Vertex> &vertex_pool) {
- HypothesisCallback<Best> callback(context_, best, out, vertex_pool);
- edges_.Search(context_, callback);
- }
+ template <class Best> void Search(Best &best, ChartCellLabelSet &out, boost::object_pool<search::Vertex> &vertex_pool) {
+ HypothesisCallback<Best> callback(context_, best, out, vertex_pool);
+ edges_.Search(context_, callback);
+ }
- // Root: everything into one vertex.
- template <class Best> search::History RootSearch(Best &best) {
- search::Vertex vertex;
- search::RootVertexGenerator<Best> gen(vertex, best);
- edges_.Search(context_, gen);
- return vertex.BestChild();
- }
+ // Root: everything into one vertex.
+ template <class Best> search::History RootSearch(Best &best) {
+ search::Vertex vertex;
+ search::RootVertexGenerator<Best> gen(vertex, best);
+ edges_.Search(context_, gen);
+ return vertex.BestChild();
+ }
- private:
- lm::WordIndex Convert(const Word &word) const;
+private:
+ lm::WordIndex Convert(const Word &word) const;
- search::Context<Model> &context_;
+ search::Context<Model> &context_;
- const std::vector<lm::WordIndex> &vocab_mapping_;
+ const std::vector<lm::WordIndex> &vocab_mapping_;
- search::EdgeGenerator edges_;
+ search::EdgeGenerator edges_;
- const search::Score oov_weight_;
+ const search::Score oov_weight_;
};
-template <class Model> void Fill<Model>::Add(const TargetPhraseCollection &targets, const StackVec &nts, const WordsRange &) {
+template <class Model> void Fill<Model>::Add(const TargetPhraseCollection &targets, const StackVec &nts, const WordsRange &)
+{
std::vector<search::PartialVertex> vertices;
vertices.reserve(nts.size());
float below_score = 0.0;
@@ -131,7 +139,7 @@ template <class Model> void Fill<Model>::Add(const TargetPhraseCollection &targe
}
edge.SetScore(phrase.GetFutureScore() + below_score);
- // prob and oov were already accounted for.
+ // prob and oov were already accounted for.
search::ScoreRule(context_.LanguageModel(), words, edge.Between());
search::Note note;
@@ -142,14 +150,15 @@ template <class Model> void Fill<Model>::Add(const TargetPhraseCollection &targe
}
}
-template <class Model> void Fill<Model>::AddPhraseOOV(TargetPhrase &phrase, std::list<TargetPhraseCollection*> &, const WordsRange &) {
+template <class Model> void Fill<Model>::AddPhraseOOV(TargetPhrase &phrase, std::list<TargetPhraseCollection*> &, const WordsRange &)
+{
std::vector<lm::WordIndex> words;
CHECK(phrase.GetSize() <= 1);
if (phrase.GetSize())
words.push_back(Convert(phrase.GetWord(0)));
search::PartialEdge edge(edges_.AllocateEdge(0));
- // Appears to be a bug that FutureScore does not already include language model.
+ // Appears to be a bug that FutureScore does not already include language model.
search::ScoreRuleRet scored(search::ScoreRule(context_.LanguageModel(), words, edge.Between()));
edge.SetScore(phrase.GetFutureScore() + scored.prob * context_.LMWeight() + static_cast<search::Score>(scored.oov) * oov_weight_);
@@ -160,8 +169,9 @@ template <class Model> void Fill<Model>::AddPhraseOOV(TargetPhrase &phrase, std:
edges_.AddEdge(edge);
}
-// TODO: factors (but chart doesn't seem to support factors anyway).
-template <class Model> lm::WordIndex Fill<Model>::Convert(const Word &word) const {
+// TODO: factors (but chart doesn't seem to support factors anyway).
+template <class Model> lm::WordIndex Fill<Model>::Convert(const Word &word) const
+{
std::size_t factor = word.GetFactor(0)->GetId();
return (factor >= vocab_mapping_.size() ? 0 : vocab_mapping_[factor]);
}
@@ -180,10 +190,12 @@ Manager::Manager(const InputType &source) :
parser_(source, cells_),
n_best_(search::NBestConfig(StaticData::Instance().GetNBestSize())) {}
-Manager::~Manager() {
+Manager::~Manager()
+{
}
-template <class Model, class Best> search::History Manager::PopulateBest(const Model &model, const std::vector<lm::WordIndex> &words, Best &out) {
+template <class Model, class Best> search::History Manager::PopulateBest(const Model &model, const std::vector<lm::WordIndex> &words, Best &out)
+{
const LanguageModel &abstract = LanguageModel::GetFirstLM();
const float oov_weight = abstract.OOVFeatureEnabled() ? abstract.GetOOVWeight() : 0.0;
const StaticData &data = StaticData::Instance();
@@ -192,7 +204,7 @@ template <class Model, class Best> search::History Manager::PopulateBest(const M
size_t size = source_.GetSize();
boost::object_pool<search::Vertex> vertex_pool(std::max<size_t>(size * size / 2, 32));
-
+
for (size_t width = 1; width < size; ++width) {
for (size_t startPos = 0; startPos <= size-width; ++startPos) {
WordsRange range(startPos, startPos + width - 1);
@@ -208,7 +220,8 @@ template <class Model, class Best> search::History Manager::PopulateBest(const M
return filler.RootSearch(out);
}
-template <class Model> void Manager::LMCallback(const Model &model, const std::vector<lm::WordIndex> &words) {
+template <class Model> void Manager::LMCallback(const Model &model, const std::vector<lm::WordIndex> &words)
+{
std::size_t nbest = StaticData::Instance().GetNBestSize();
if (nbest <= 1) {
search::History ret = PopulateBest(model, words, single_best_);
@@ -237,12 +250,14 @@ template void Manager::LMCallback<lm::ngram::QuantTrieModel>(const lm::ngram::Qu
template void Manager::LMCallback<lm::ngram::ArrayTrieModel>(const lm::ngram::ArrayTrieModel &model, const std::vector<lm::WordIndex> &words);
template void Manager::LMCallback<lm::ngram::QuantArrayTrieModel>(const lm::ngram::QuantArrayTrieModel &model, const std::vector<lm::WordIndex> &words);
-const std::vector<search::Applied> &Manager::ProcessSentence() {
+const std::vector<search::Applied> &Manager::ProcessSentence()
+{
LanguageModel::GetFirstLM().IncrementalCallback(*this);
return *completed_nbest_;
}
-namespace {
+namespace
+{
struct NoOp {
void operator()(const TargetPhrase &) const {}
@@ -254,7 +269,8 @@ struct AccumScore {
}
ScoreComponentCollection *out_;
};
-template <class Action> void AppendToPhrase(const search::Applied final, Phrase &out, Action action) {
+template <class Action> void AppendToPhrase(const search::Applied final, Phrase &out, Action action)
+{
assert(final.Valid());
const TargetPhrase &phrase = *static_cast<const TargetPhrase*>(final.GetNote().vp);
action(phrase);
@@ -271,23 +287,25 @@ template <class Action> void AppendToPhrase(const search::Applied final, Phrase
} // namespace
-void ToPhrase(const search::Applied final, Phrase &out) {
+void ToPhrase(const search::Applied final, Phrase &out)
+{
out.Clear();
AppendToPhrase(final, out, NoOp());
}
-void PhraseAndFeatures(const search::Applied final, Phrase &phrase, ScoreComponentCollection &features) {
+void PhraseAndFeatures(const search::Applied final, Phrase &phrase, ScoreComponentCollection &features)
+{
phrase.Clear();
features.ZeroAll();
AppendToPhrase(final, phrase, AccumScore(features));
- // If we made it this far, there is only one language model.
+ // If we made it this far, there is only one language model.
float full, ignored_ngram;
std::size_t ignored_oov;
const LanguageModel &model = LanguageModel::GetFirstLM();
model.CalcScore(phrase, full, ignored_ngram, ignored_oov);
- // CalcScore transforms, but EvaluateChart doesn't.
+ // CalcScore transforms, but EvaluateChart doesn't.
features.Assign(&model, full);
}
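
A side note on Fill<Model>::Convert above: any factor id outside the cached vocabulary mapping collapses to index 0, which is KenLM's unknown word. A minimal standalone sketch of that convention (names here are illustrative, not from the patch):

// Sketch only: factor id -> LM word index, with out-of-range ids
// collapsing to 0 (<unk>), mirroring Fill<Model>::Convert.
#include <cstddef>
#include <vector>

typedef unsigned int WordIndex;  // stand-in for lm::WordIndex

WordIndex LookupVocab(const std::vector<WordIndex> &mapping, std::size_t factor) {
  // Ids the language model never saw are scored as the unknown word.
  return factor >= mapping.size() ? 0 : mapping[factor];
}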
diff --git a/moses/Incremental.h b/moses/Incremental.h
index 30f7c588c..20040bf45 100644
--- a/moses/Incremental.h
+++ b/moses/Incremental.h
@@ -10,49 +10,52 @@
#include <vector>
#include <string>
-namespace Moses {
+namespace Moses
+{
class ScoreComponentCollection;
class InputType;
class LanguageModel;
-namespace Incremental {
+namespace Incremental
+{
-class Manager {
- public:
- Manager(const InputType &source);
+class Manager
+{
+public:
+ Manager(const InputType &source);
- ~Manager();
+ ~Manager();
- template <class Model> void LMCallback(const Model &model, const std::vector<lm::WordIndex> &words);
-
- const std::vector<search::Applied> &ProcessSentence();
+ template <class Model> void LMCallback(const Model &model, const std::vector<lm::WordIndex> &words);
- // Call to get the same value as ProcessSentence returned.
- const std::vector<search::Applied> &Completed() const {
- return *completed_nbest_;
- }
+ const std::vector<search::Applied> &ProcessSentence();
- private:
- template <class Model, class Best> search::History PopulateBest(const Model &model, const std::vector<lm::WordIndex> &words, Best &out);
+ // Call to get the same value as ProcessSentence returned.
+ const std::vector<search::Applied> &Completed() const {
+ return *completed_nbest_;
+ }
- const InputType &source_;
- ChartCellCollectionBase cells_;
- ChartParser parser_;
+private:
+ template <class Model, class Best> search::History PopulateBest(const Model &model, const std::vector<lm::WordIndex> &words, Best &out);
- // Only one of single_best_ or n_best_ will be used, but it was easier to do this than a template.
- search::SingleBest single_best_;
- // ProcessSentence returns a reference to a vector. ProcessSentence
- // doesn't have one, so this is populated and returned.
- std::vector<search::Applied> backing_for_single_;
+ const InputType &source_;
+ ChartCellCollectionBase cells_;
+ ChartParser parser_;
- search::NBest n_best_;
-
- const std::vector<search::Applied> *completed_nbest_;
+ // Only one of single_best_ or n_best_ will be used, but it was easier to do this than a template.
+ search::SingleBest single_best_;
+  // ProcessSentence returns a reference to a vector, but single-best
+  // search doesn't produce one itself, so this vector is populated and returned.
+ std::vector<search::Applied> backing_for_single_;
+
+ search::NBest n_best_;
+
+ const std::vector<search::Applied> *completed_nbest_;
};
// Just get the phrase.
void ToPhrase(const search::Applied final, Phrase &out);
-// Get the phrase and the features.
+// Get the phrase and the features.
void PhraseAndFeatures(const search::Applied final, Phrase &phrase, ScoreComponentCollection &features);
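
As a usage note for the interface above, here is a hedged sketch of how a caller is expected to drive incremental search; it assumes StaticData and the KenLM language model are already initialized, and uses only functions declared in this header:

// Sketch only, not part of the patch.
Moses::Incremental::Manager manager(source);   // source : const InputType &
const std::vector<search::Applied> &nbest = manager.ProcessSentence();
if (!nbest.empty()) {
  Moses::Phrase best;
  Moses::Incremental::ToPhrase(nbest[0], best); // surface form of the 1-best
}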
diff --git a/moses/InputType.cpp b/moses/InputType.cpp
index 64dc9a7fd..1ca3da63b 100644
--- a/moses/InputType.cpp
+++ b/moses/InputType.cpp
@@ -67,7 +67,7 @@ std::vector <ChartTranslationOptions*> InputType::GetXmlChartTranslationOptions(
std::vector <ChartTranslationOptions*> ret;
return ret;
}
-
+
}
diff --git a/moses/InputType.h b/moses/InputType.h
index a065c0bf0..d0106e5ca 100644
--- a/moses/InputType.h
+++ b/moses/InputType.h
@@ -38,9 +38,9 @@ class Factor;
class PhraseDictionary;
class TranslationOptionCollection;
class ChartTranslationOptions;
-
+
/** base class for all types of inputs to the decoder,
- * eg. sentences, confusion networks, lattices and tree
+ * e.g. sentences, confusion networks, lattices and trees
*/
class InputType
{
@@ -81,7 +81,7 @@ public:
}
void SetDocumentId(long documentId) {
m_documentId = documentId;
- }
+ }
long GetTopicId() const {
return m_topicId;
}
@@ -111,7 +111,7 @@ public:
}
void SetTextType(std::string type) {
m_textType = type;
- }
+ }
std::string GetPassthroughInformation() const {
return m_passthrough;
}
diff --git a/moses/LM/Backward.cpp b/moses/LM/Backward.cpp
index a9fca1c75..263c90fec 100644
--- a/moses/LM/Backward.cpp
+++ b/moses/LM/Backward.cpp
@@ -35,281 +35,288 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
//#include "moses/StaticData.h"
//#include <iostream>
-namespace Moses {
-
- /** Constructs a new backward language model. */
- template <class Model> BackwardLanguageModel<Model>::BackwardLanguageModel(const std::string &file, FactorType factorType, bool lazy) : LanguageModelKen<Model>(file,factorType,lazy) {
- //
- // This space intentionally left blank
- //
- }
-
- /**
- * Constructs an empty backward language model state.
- *
- * This state will correspond with a translation hypothesis
- * where no source words have been translated.
- *
- * In a forward language model, the language model state of an empty hypothesis
- * would store the beginning of sentence marker <s>.
- *
- * Because this is a backward language model, the language model state returned by this method
- * instead stores the end of sentence marker </s>.
- */
- template <class Model> const FFState *BackwardLanguageModel<Model>::EmptyHypothesisState(const InputType &/*input*/) const {
- BackwardLMState *ret = new BackwardLMState();
- lm::ngram::RuleScore<Model> ruleScore(*m_ngram, ret->state);
- ruleScore.Terminal(m_ngram->GetVocabulary().EndSentence());
- // float score =
- ruleScore.Finish();
- // VERBOSE(1, "BackwardLM EmptyHypothesisState has score " << score);
- return ret;
- }
- /*
- template <class Model> double BackwardLanguageModel<Model>::Score(FFState *ffState) {
- BackwardLMState *lmState = static_cast< BackwardLMState* >(ffState);
- lm::ngram::ChartState &state = lmState->state;
- lm::ngram::RuleScore<Model> ruleScore(*m_ngram, lmState);
- return ruleScore.Finish();
- }
+namespace Moses
+{
+
+/** Constructs a new backward language model. */
+template <class Model> BackwardLanguageModel<Model>::BackwardLanguageModel(const std::string &file, FactorType factorType, bool lazy) : LanguageModelKen<Model>(file,factorType,lazy)
+{
+ //
+ // This space intentionally left blank
+ //
+}
+
+/**
+ * Constructs an empty backward language model state.
+ *
+ * This state will correspond with a translation hypothesis
+ * where no source words have been translated.
+ *
+ * In a forward language model, the language model state of an empty hypothesis
+ * would store the beginning of sentence marker <s>.
+ *
+ * Because this is a backward language model, the language model state returned by this method
+ * instead stores the end of sentence marker </s>.
+ */
+template <class Model> const FFState *BackwardLanguageModel<Model>::EmptyHypothesisState(const InputType &/*input*/) const
+{
+ BackwardLMState *ret = new BackwardLMState();
+ lm::ngram::RuleScore<Model> ruleScore(*m_ngram, ret->state);
+ ruleScore.Terminal(m_ngram->GetVocabulary().EndSentence());
+ // float score =
+ ruleScore.Finish();
+ // VERBOSE(1, "BackwardLM EmptyHypothesisState has score " << score);
+ return ret;
+}
+/*
+template <class Model> double BackwardLanguageModel<Model>::Score(FFState *ffState) {
+ BackwardLMState *lmState = static_cast< BackwardLMState* >(ffState);
+ lm::ngram::ChartState &state = lmState->state;
+ lm::ngram::RuleScore<Model> ruleScore(*m_ngram, lmState);
+ return ruleScore.Finish();
+}
*/
- /**
- * Pre-calculate the n-gram probabilities for the words in the specified phrase.
- *
- * Note that when this method is called, we do not have access to the context
- * in which this phrase will eventually be applied.
- *
- * In other words, we know what words are in this phrase,
- * but we do not know what words will come before or after this phrase.
- *
- * The parameters fullScore, ngramScore, and oovCount are all output parameters.
- *
- * The value stored in oovCount is the number of words in the phrase
- * that are not in the language model's vocabulary.
- *
- * The sum of the ngram scores for all words in this phrase are stored in fullScore.
- *
- * The value stored in ngramScore is similar, but only full-order ngram scores are included.
- *
- * This is best shown by example:
- *
- * Assume a trigram backward language model and a phrase "a b c d e f g"
- *
- * fullScore would represent the sum of the logprob scores for the following values:
- *
- * p(g)
- * p(f | g)
- * p(e | g f)
- * p(d | f e)
- * p(c | e d)
- * p(b | d c)
- * p(a | c b)
- *
- * ngramScore would represent the sum of the logprob scores for the following values:
- *
- * p(g)
- * p(f | g)
- * p(e | g f)
- * p(d | f e)
- * p(c | e d)
- * p(b | d c)
- * p(a | c b)
- */
- template <class Model> void BackwardLanguageModel<Model>::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const {
- fullScore = 0;
- ngramScore = 0;
- oovCount = 0;
-
- if (!phrase.GetSize()) return;
-
- lm::ngram::ChartState discarded_sadly;
- lm::ngram::RuleScore<Model> scorer(*m_ngram, discarded_sadly);
-
+/**
+ * Pre-calculate the n-gram probabilities for the words in the specified phrase.
+ *
+ * Note that when this method is called, we do not have access to the context
+ * in which this phrase will eventually be applied.
+ *
+ * In other words, we know what words are in this phrase,
+ * but we do not know what words will come before or after this phrase.
+ *
+ * The parameters fullScore, ngramScore, and oovCount are all output parameters.
+ *
+ * The value stored in oovCount is the number of words in the phrase
+ * that are not in the language model's vocabulary.
+ *
+ * The sum of the ngram scores for all words in this phrase are stored in fullScore.
+ *
+ * The value stored in ngramScore is similar, but only full-order ngram scores are included.
+ *
+ * This is best shown by example:
+ *
+ * Assume a trigram backward language model and a phrase "a b c d e f g"
+ *
+ * fullScore would represent the sum of the logprob scores for the following values:
+ *
+ * p(g)
+ * p(f | g)
+ * p(e | g f)
+ * p(d | f e)
+ * p(c | e d)
+ * p(b | d c)
+ * p(a | c b)
+ *
+ * ngramScore would represent the sum of only the full-order logprob scores:
+ *
+ * p(e | g f)
+ * p(d | f e)
+ * p(c | e d)
+ * p(b | d c)
+ * p(a | c b)
+ */
+template <class Model> void BackwardLanguageModel<Model>::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const
+{
+ fullScore = 0;
+ ngramScore = 0;
+ oovCount = 0;
+
+ if (!phrase.GetSize()) return;
+
+ lm::ngram::ChartState discarded_sadly;
+ lm::ngram::RuleScore<Model> scorer(*m_ngram, discarded_sadly);
+
+ UTIL_THROW_IF(
+ (m_beginSentenceFactor == phrase.GetWord(0).GetFactor(m_factorType)),
+ util::Exception,
+ "BackwardLanguageModel does not currently support rules that include <s>"
+ );
+
+ float before_boundary = 0.0f;
+
+ int lastWord = phrase.GetSize() - 1;
+ int ngramBoundary = m_ngram->Order() - 1;
+ int boundary = ( lastWord < ngramBoundary ) ? 0 : ngramBoundary;
+
+ int position;
+ for (position = lastWord; position >= 0; position-=1) {
+ const Word &word = phrase.GetWord(position);
UTIL_THROW_IF(
- (m_beginSentenceFactor == phrase.GetWord(0).GetFactor(m_factorType)),
- util::Exception,
- "BackwardLanguageModel does not currently support rules that include <s>"
- );
-
- float before_boundary = 0.0f;
-
- int lastWord = phrase.GetSize() - 1;
- int ngramBoundary = m_ngram->Order() - 1;
- int boundary = ( lastWord < ngramBoundary ) ? 0 : ngramBoundary;
-
- int position;
- for (position = lastWord; position >= 0; position-=1) {
- const Word &word = phrase.GetWord(position);
- UTIL_THROW_IF(
- (word.IsNonTerminal()),
- util::Exception,
- "BackwardLanguageModel does not currently support rules that include non-terminals "
- );
-
- lm::WordIndex index = TranslateID(word);
- scorer.Terminal(index);
- if (!index) ++oovCount;
-
- if (position==boundary) {
- before_boundary = scorer.Finish();
- }
+ (word.IsNonTerminal()),
+ util::Exception,
+ "BackwardLanguageModel does not currently support rules that include non-terminals "
+ );
- }
+ lm::WordIndex index = TranslateID(word);
+ scorer.Terminal(index);
+ if (!index) ++oovCount;
- fullScore = scorer.Finish();
-
- ngramScore = TransformLMScore(fullScore - before_boundary);
- fullScore = TransformLMScore(fullScore);
+ if (position==boundary) {
+ before_boundary = scorer.Finish();
+ }
}
- /**
- * Calculate the ngram probabilities for the words at the beginning
- * (and under some circumstances, also at the end)
- * of the phrase represented by the provided hypothesis.
- *
- * Additionally, calculate a new language model state.
- *
- * This is best shown by example:
- *
- * Assume a trigram language model.
- *
- * Assume the previous phrase was "a b c d e f g",
- * which means the previous language model state is "g f".
- *
- * When the phrase corresponding to "a b c d e f g" was previously processed by CalcScore
- * the following full-order ngrams would have been calculated:
- *
- * p(a | c b)
- * p(b | d c)
- * p(c | e d)
- * p(d | f e)
- * p(e | g f)
- *
- * The following less-than-full-order ngrams would also have been calculated by CalcScore:
- *
- * p(f | g)
- * p(g)
- *
- * In this method, we now have access to additional context which may allow
- * us to compute the full-order ngrams for f and g.
- *
- * Assume the new provided hypothesis contains the new phrase "h i j k"
- *
- * Given these assumptions, this method is responsible
- * for calculating the scores for the following:
- *
- * p(f | h g)
- * p(g | i h)
- *
- * This method must also calculate and return a new language model state.
- *
- * In this example, the returned language model state would be "k j"
- *
- * If the provided hypothesis represents the end of a completed translation
- * (all source words have been translated)
- * then this method is additionally responsible for calculating the following:
- *
- * p(j | <s> k)
- * p(k | <s>)
- *
- */
- template <class Model> FFState *BackwardLanguageModel<Model>::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const {
-
- // If the current hypothesis contains zero target words
- if (!hypo.GetCurrTargetLength()) {
-
- // reuse and return the previous state
- std::auto_ptr<BackwardLMState> ret(new BackwardLMState());
- ret->state = static_cast<const BackwardLMState&>(*ps).state;
- return ret.release();
+ fullScore = scorer.Finish();
+
+ ngramScore = TransformLMScore(fullScore - before_boundary);
+ fullScore = TransformLMScore(fullScore);
+
+}
+
+/**
+ * Calculate the ngram probabilities for the words at the beginning
+ * (and under some circumstances, also at the end)
+ * of the phrase represented by the provided hypothesis.
+ *
+ * Additionally, calculate a new language model state.
+ *
+ * This is best shown by example:
+ *
+ * Assume a trigram language model.
+ *
+ * Assume the previous phrase was "a b c d e f g",
+ * which means the previous language model state is "g f".
+ *
+ * When the phrase corresponding to "a b c d e f g" was previously processed by CalcScore
+ * the following full-order ngrams would have been calculated:
+ *
+ * p(a | c b)
+ * p(b | d c)
+ * p(c | e d)
+ * p(d | f e)
+ * p(e | g f)
+ *
+ * The following less-than-full-order ngrams would also have been calculated by CalcScore:
+ *
+ * p(f | g)
+ * p(g)
+ *
+ * In this method, we now have access to additional context which may allow
+ * us to compute the full-order ngrams for f and g.
+ *
+ * Assume the new provided hypothesis contains the new phrase "h i j k"
+ *
+ * Given these assumptions, this method is responsible
+ * for calculating the scores for the following:
+ *
+ * p(f | h g)
+ * p(g | i h)
+ *
+ * This method must also calculate and return a new language model state.
+ *
+ * In this example, the returned language model state would be "k j"
+ *
+ * If the provided hypothesis represents the end of a completed translation
+ * (all source words have been translated)
+ * then this method is additionally responsible for calculating the following:
+ *
+ * p(j | <s> k)
+ * p(k | <s>)
+ *
+ */
+template <class Model> FFState *BackwardLanguageModel<Model>::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const
+{
+
+ // If the current hypothesis contains zero target words
+ if (!hypo.GetCurrTargetLength()) {
+
+ // reuse and return the previous state
+ std::auto_ptr<BackwardLMState> ret(new BackwardLMState());
+ ret->state = static_cast<const BackwardLMState&>(*ps).state;
+ return ret.release();
- } else {
+ } else {
- float returnedScore;
+ float returnedScore;
- FFState *returnedState = this->Evaluate(hypo.GetCurrTargetPhrase(), ps, returnedScore);
+ FFState *returnedState = this->Evaluate(hypo.GetCurrTargetPhrase(), ps, returnedScore);
- out->PlusEquals(this, returnedScore);
+ out->PlusEquals(this, returnedScore);
- return returnedState;
+ return returnedState;
- }
}
+}
- template <class Model> FFState *BackwardLanguageModel<Model>::Evaluate(const Phrase &phrase, const FFState *ps, float &returnedScore) const {
-
- returnedScore = 0.0f;
+template <class Model> FFState *BackwardLanguageModel<Model>::Evaluate(const Phrase &phrase, const FFState *ps, float &returnedScore) const
+{
- const lm::ngram::ChartState &previous = static_cast<const BackwardLMState&>(*ps).state;
+ returnedScore = 0.0f;
- std::auto_ptr<BackwardLMState> ret(new BackwardLMState());
-
- lm::ngram::RuleScore<Model> scorer(*m_ngram, ret->state);
-
- int ngramBoundary = m_ngram->Order() - 1;
- int lastWord = phrase.GetSize() - 1;
-
- // Get scores for words at the end of the previous phrase
- // that are now adjacent to words at the the beginning of this phrase
- for (int position=std::min( lastWord, ngramBoundary - 1); position >= 0; position-=1) {
- const Word &word = phrase.GetWord(position);
- UTIL_THROW_IF(
- (word.IsNonTerminal()),
- util::Exception,
- "BackwardLanguageModel does not currently support rules that include non-terminals "
- );
-
- lm::WordIndex index = TranslateID(word);
- scorer.Terminal(index);
- }
- scorer.NonTerminal(previous);
- returnedScore = scorer.Finish();
- /*
- out->PlusEquals(this, score);
-
-
- UTIL_THROW_IF(
- (1==1),
- util::Exception,
- "This method (BackwardLanguageModel<Model>::Evaluate) is not yet fully implemented"
- );
- */
- return ret.release();
+ const lm::ngram::ChartState &previous = static_cast<const BackwardLMState&>(*ps).state;
+
+ std::auto_ptr<BackwardLMState> ret(new BackwardLMState());
+
+ lm::ngram::RuleScore<Model> scorer(*m_ngram, ret->state);
-
+ int ngramBoundary = m_ngram->Order() - 1;
+ int lastWord = phrase.GetSize() - 1;
+ // Get scores for words at the end of the previous phrase
+  // that are now adjacent to words at the beginning of this phrase
+ for (int position=std::min( lastWord, ngramBoundary - 1); position >= 0; position-=1) {
+ const Word &word = phrase.GetWord(position);
+ UTIL_THROW_IF(
+ (word.IsNonTerminal()),
+ util::Exception,
+ "BackwardLanguageModel does not currently support rules that include non-terminals "
+ );
+
+ lm::WordIndex index = TranslateID(word);
+ scorer.Terminal(index);
}
+ scorer.NonTerminal(previous);
+ returnedScore = scorer.Finish();
+ /*
+ out->PlusEquals(this, score);
- LanguageModel *ConstructBackwardLM(const std::string &file, FactorType factorType, bool lazy) {
- try {
- lm::ngram::ModelType model_type;
- if (lm::ngram::RecognizeBinary(file.c_str(), model_type)) {
- switch(model_type) {
- case lm::ngram::PROBING:
- return new BackwardLanguageModel<lm::ngram::ProbingModel>(file, factorType, lazy);
- case lm::ngram::REST_PROBING:
- return new BackwardLanguageModel<lm::ngram::RestProbingModel>(file, factorType, lazy);
- case lm::ngram::TRIE:
- return new BackwardLanguageModel<lm::ngram::TrieModel>(file, factorType, lazy);
- case lm::ngram::QUANT_TRIE:
- return new BackwardLanguageModel<lm::ngram::QuantTrieModel>(file, factorType, lazy);
- case lm::ngram::ARRAY_TRIE:
- return new BackwardLanguageModel<lm::ngram::ArrayTrieModel>(file, factorType, lazy);
- case lm::ngram::QUANT_ARRAY_TRIE:
- return new BackwardLanguageModel<lm::ngram::QuantArrayTrieModel>(file, factorType, lazy);
- default:
- std::cerr << "Unrecognized kenlm model type " << model_type << std::endl;
- abort();
- }
- } else {
- return new BackwardLanguageModel<lm::ngram::ProbingModel>(file, factorType, lazy);
+
+ UTIL_THROW_IF(
+ (1==1),
+ util::Exception,
+ "This method (BackwardLanguageModel<Model>::Evaluate) is not yet fully implemented"
+ );
+ */
+ return ret.release();
+
+
+
+}
+
+LanguageModel *ConstructBackwardLM(const std::string &file, FactorType factorType, bool lazy)
+{
+ try {
+ lm::ngram::ModelType model_type;
+ if (lm::ngram::RecognizeBinary(file.c_str(), model_type)) {
+ switch(model_type) {
+ case lm::ngram::PROBING:
+ return new BackwardLanguageModel<lm::ngram::ProbingModel>(file, factorType, lazy);
+ case lm::ngram::REST_PROBING:
+ return new BackwardLanguageModel<lm::ngram::RestProbingModel>(file, factorType, lazy);
+ case lm::ngram::TRIE:
+ return new BackwardLanguageModel<lm::ngram::TrieModel>(file, factorType, lazy);
+ case lm::ngram::QUANT_TRIE:
+ return new BackwardLanguageModel<lm::ngram::QuantTrieModel>(file, factorType, lazy);
+ case lm::ngram::ARRAY_TRIE:
+ return new BackwardLanguageModel<lm::ngram::ArrayTrieModel>(file, factorType, lazy);
+ case lm::ngram::QUANT_ARRAY_TRIE:
+ return new BackwardLanguageModel<lm::ngram::QuantArrayTrieModel>(file, factorType, lazy);
+ default:
+ std::cerr << "Unrecognized kenlm model type " << model_type << std::endl;
+ abort();
}
- } catch (std::exception &e) {
- std::cerr << e.what() << std::endl;
- abort();
+ } else {
+ return new BackwardLanguageModel<lm::ngram::ProbingModel>(file, factorType, lazy);
}
+ } catch (std::exception &e) {
+ std::cerr << e.what() << std::endl;
+ abort();
}
+}
} // namespace Moses
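
For orientation, a hedged sketch of calling the factory above; the file name follows the SRILM recipe mentioned in Backward.h, and the signature matches the declaration there. RecognizeBinary dispatches on KenLM binary formats, and a plain ARPA file falls through to the ProbingModel branch:

// Sketch only: build a backward LM over factor 0 with eager loading.
Moses::LanguageModel *backward =
    Moses::ConstructBackwardLM("lm/backward.arpa", /*factorType=*/0, /*lazy=*/false);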
diff --git a/moses/LM/Backward.h b/moses/LM/Backward.h
index 1bf6b560c..c81c0633d 100644
--- a/moses/LM/Backward.h
+++ b/moses/LM/Backward.h
@@ -29,53 +29,55 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "lm/state.hh"
-namespace Moses {
+namespace Moses
+{
//! This will also load. Returns a templated backward LM.
LanguageModel *ConstructBackwardLM(const std::string &file, FactorType factorType, bool lazy);
- class FFState;
- // template<typename M> class BackwardLanguageModelTest;
- class BackwardLanguageModelTest;
+class FFState;
+// template<typename M> class BackwardLanguageModelTest;
+class BackwardLanguageModelTest;
/*
* An implementation of single factor backward LM using Kenneth's code.
*/
-template <class Model> class BackwardLanguageModel : public LanguageModelKen<Model> {
- public:
- BackwardLanguageModel(const std::string &file, FactorType factorType, bool lazy);
-
- virtual const FFState *EmptyHypothesisState(const InputType &/*input*/) const;
-
- virtual void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const;
-
- virtual FFState *Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const;
-
- FFState *Evaluate(const Phrase &phrase, const FFState *ps, float &returnedScore) const;
-
- private:
-
- // These lines are required to make the parent class's protected members visible to this class
- using LanguageModelKen<Model>::m_ngram;
- using LanguageModelKen<Model>::m_beginSentenceFactor;
- using LanguageModelKen<Model>::m_factorType;
- using LanguageModelKen<Model>::TranslateID;
-
- // friend class Moses::BackwardLanguageModelTest<Model>;
- friend class Moses::BackwardLanguageModelTest;
- /*
- lm::ngram::ChartState* GetState(FFState *ffState) {
- return NULL;
- }
- */
- /*
- double Score(FFState *ffState) {
- BackwardLMState *lmState = static_cast< BackwardLMState* >(ffState);
- lm::ngram::ChartState &state = lmState->state;
- lm::ngram::RuleScore<Model> ruleScore(*m_ngram, lmState);
- return ruleScore.Finish();
+template <class Model> class BackwardLanguageModel : public LanguageModelKen<Model>
+{
+public:
+ BackwardLanguageModel(const std::string &file, FactorType factorType, bool lazy);
+
+ virtual const FFState *EmptyHypothesisState(const InputType &/*input*/) const;
+
+ virtual void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const;
+
+ virtual FFState *Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const;
+
+ FFState *Evaluate(const Phrase &phrase, const FFState *ps, float &returnedScore) const;
+
+private:
+
+ // These lines are required to make the parent class's protected members visible to this class
+ using LanguageModelKen<Model>::m_ngram;
+ using LanguageModelKen<Model>::m_beginSentenceFactor;
+ using LanguageModelKen<Model>::m_factorType;
+ using LanguageModelKen<Model>::TranslateID;
+
+ // friend class Moses::BackwardLanguageModelTest<Model>;
+ friend class Moses::BackwardLanguageModelTest;
+ /*
+ lm::ngram::ChartState* GetState(FFState *ffState) {
+ return NULL;
}
- */
+ */
+ /*
+ double Score(FFState *ffState) {
+ BackwardLMState *lmState = static_cast< BackwardLMState* >(ffState);
+ lm::ngram::ChartState &state = lmState->state;
+ lm::ngram::RuleScore<Model> ruleScore(*m_ngram, lmState);
+ return ruleScore.Finish();
+ }
+ */
};
} // namespace Moses
@@ -83,7 +85,7 @@ template <class Model> class BackwardLanguageModel : public LanguageModelKen<Mod
#endif
// To create a sample backward language model using SRILM:
-//
+//
// (ngram-count and reverse-text are SRILM programs)
//
// head -n 49 ./contrib/synlm/hhmm/LICENSE | tail -n 45 | tr '\n' ' ' | ./scripts/ems/support/split-sentences.perl | ./scripts/tokenizer/lowercase.perl | ./scripts/tokenizer/tokenizer.perl | reverse-text | ngram-count -order 3 -text - -lm - > lm/backward.arpa
diff --git a/moses/LM/BackwardLMState.cpp b/moses/LM/BackwardLMState.cpp
index 37a3ab7da..466c4b655 100644
--- a/moses/LM/BackwardLMState.cpp
+++ b/moses/LM/BackwardLMState.cpp
@@ -22,11 +22,13 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "moses/LM/BackwardLMState.h"
#include "lm/state.hh"
-namespace Moses {
+namespace Moses
+{
- int BackwardLMState::Compare(const FFState &o) const {
- const BackwardLMState &other = static_cast<const BackwardLMState &>(o);
- return state.left.Compare(other.state.left);
- }
+int BackwardLMState::Compare(const FFState &o) const
+{
+ const BackwardLMState &other = static_cast<const BackwardLMState &>(o);
+ return state.left.Compare(other.state.left);
+}
}
diff --git a/moses/LM/BackwardLMState.h b/moses/LM/BackwardLMState.h
index 7c6ebff62..e6d1f325a 100644
--- a/moses/LM/BackwardLMState.h
+++ b/moses/LM/BackwardLMState.h
@@ -36,14 +36,16 @@ namespace lm {
//#include "lm/state.hh"
-namespace Moses {
+namespace Moses
+{
- //template<typename M>
+//template<typename M>
class BackwardLanguageModelTest;
-class BackwardLMState : public FFState {
+class BackwardLMState : public FFState
+{
- public:
+public:
/*
int Compare(const FFState &o) const {
@@ -53,14 +55,14 @@ class BackwardLMState : public FFState {
*/
int Compare(const FFState &o) const;
- // Allow BackwardLanguageModel to access the private members of this class
- template <class Model> friend class BackwardLanguageModel;
+ // Allow BackwardLanguageModel to access the private members of this class
+ template <class Model> friend class BackwardLanguageModel;
// template <class Model> friend class Moses::BackwardLanguageModelTest;
- friend class Moses::BackwardLanguageModelTest;
+ friend class Moses::BackwardLanguageModelTest;
- private:
- lm::ngram::ChartState state;
+private:
+ lm::ngram::ChartState state;
};
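
A brief note on the class above: Compare delegates to the left part of the wrapped KenLM ChartState, so two hypotheses whose backward contexts compare equal are interchangeable for this feature. A hedged sketch of the intended use:

// Sketch only: how a decoder is expected to consult Compare
// when deciding whether two hypotheses can be recombined.
int cmp = stateA->Compare(*stateB);  // stateA, stateB : const FFState *
if (cmp == 0) {
  // identical backward LM context; the hypotheses may be merged
}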
diff --git a/moses/LM/BackwardTest.cpp b/moses/LM/BackwardTest.cpp
index 5f58c9f32..dc5de32bd 100644
--- a/moses/LM/BackwardTest.cpp
+++ b/moses/LM/BackwardTest.cpp
@@ -47,7 +47,7 @@ template <class M> void Foo() {
Moses::BackwardLanguageModel<M> *backwardLM;
// = new Moses::BackwardLanguageModel<M>( filename, factorType, lazy );
-
+
}
template <class M> void Everything() {
@@ -55,159 +55,160 @@ template <class M> void Everything() {
}
*/
-namespace Moses {
+namespace Moses
+{
-// Apparently some Boost versions use templates and are pretty strict about types matching.
+// Apparently some Boost versions use templates and are pretty strict about types matching.
#define SLOPPY_CHECK_CLOSE(ref, value, tol) BOOST_CHECK_CLOSE(static_cast<double>(ref), static_cast<double>(value), static_cast<double>(tol));
-class BackwardLanguageModelTest {
-
- public:
- BackwardLanguageModelTest() :
- dummyInput(new Sentence()),
- backwardLM(
- static_cast< BackwardLanguageModel<lm::ngram::ProbingModel> * >(
- ConstructBackwardLM(
- boost::unit_test::framework::master_test_suite().argv[1],
- 0,
- false)
- )
- )
+class BackwardLanguageModelTest
+{
+
+public:
+ BackwardLanguageModelTest() :
+ dummyInput(new Sentence()),
+ backwardLM(
+ static_cast< BackwardLanguageModel<lm::ngram::ProbingModel> * >(
+ ConstructBackwardLM(
+ boost::unit_test::framework::master_test_suite().argv[1],
+ 0,
+ false)
+ )
+ ) {
+ // This space intentionally left blank
+ }
+
+ ~BackwardLanguageModelTest() {
+ delete dummyInput;
+ delete backwardLM;
+ }
+
+ void testEmptyHypothesis() {
+ FFState *ffState = const_cast< FFState * >(backwardLM->EmptyHypothesisState( *dummyInput ));
+
+ BOOST_CHECK( ffState != NULL );
+
+ delete ffState;
+ }
+
+ void testCalcScore() {
+
+ double p_the = -1.383059;
+ double p_licenses = -2.360783;
+ double p_for = -1.661813;
+ double p_most = -2.360783;
+ // double p_software = -1.62042;
+
+ double p_the_licenses = -0.9625873;
+ double p_licenses_for = -1.661557;
+ double p_for_most = -0.4526253;
+ // double p_most_software = -1.70295;
+
+ double p_the_licenses_for = p_the_licenses + p_licenses_for;
+ // double p_licenses_for_most = p_licenses_for + p_for_most;
+
+ // the
{
- // This space intentionally left blank
+ Phrase phrase;
+ BOOST_CHECK( phrase.GetSize() == 0 );
+
+ std::vector<FactorType> outputFactorOrder;
+ outputFactorOrder.push_back(0);
+
+ phrase.CreateFromString(
+ outputFactorOrder,
+ "the",
+ StaticData::Instance().GetFactorDelimiter());
+
+ BOOST_CHECK( phrase.GetSize() == 1 );
+
+ float fullScore;
+ float ngramScore;
+ size_t oovCount;
+ backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount);
+
+ BOOST_CHECK( oovCount == 0 );
+ SLOPPY_CHECK_CLOSE( TransformLMScore(p_the), fullScore, 0.01);
+ SLOPPY_CHECK_CLOSE( TransformLMScore( 0.0 ), ngramScore, 0.01);
}
- ~BackwardLanguageModelTest() {
- delete dummyInput;
- delete backwardLM;
+ // the licenses
+ {
+ Phrase phrase;
+ BOOST_CHECK( phrase.GetSize() == 0 );
+
+ std::vector<FactorType> outputFactorOrder;
+ outputFactorOrder.push_back(0);
+
+ phrase.CreateFromString(
+ outputFactorOrder,
+ "the licenses",
+ StaticData::Instance().GetFactorDelimiter());
+
+ BOOST_CHECK( phrase.GetSize() == 2 );
+
+ float fullScore;
+ float ngramScore;
+ size_t oovCount;
+ backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount);
+
+ BOOST_CHECK( oovCount == 0 );
+ SLOPPY_CHECK_CLOSE( TransformLMScore(p_licenses + p_the_licenses), fullScore, 0.01);
+ SLOPPY_CHECK_CLOSE( TransformLMScore( 0.0 ), ngramScore, 0.01);
}
- void testEmptyHypothesis() {
- FFState *ffState = const_cast< FFState * >(backwardLM->EmptyHypothesisState( *dummyInput ));
+ // the licenses for
+ {
+ Phrase phrase;
+ BOOST_CHECK( phrase.GetSize() == 0 );
+
+ std::vector<FactorType> outputFactorOrder;
+ outputFactorOrder.push_back(0);
+
+ phrase.CreateFromString(
+ outputFactorOrder,
+ "the licenses for",
+ StaticData::Instance().GetFactorDelimiter());
- BOOST_CHECK( ffState != NULL );
+ BOOST_CHECK( phrase.GetSize() == 3 );
- delete ffState;
+ float fullScore;
+ float ngramScore;
+ size_t oovCount;
+ backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount);
+
+ BOOST_CHECK( oovCount == 0 );
+ SLOPPY_CHECK_CLOSE( TransformLMScore( p_the_licenses_for ), ngramScore, 0.01);
+ SLOPPY_CHECK_CLOSE( TransformLMScore(p_for + p_licenses_for + p_the_licenses), fullScore, 0.01);
}
- void testCalcScore() {
-
- double p_the = -1.383059;
- double p_licenses = -2.360783;
- double p_for = -1.661813;
- double p_most = -2.360783;
- // double p_software = -1.62042;
-
- double p_the_licenses = -0.9625873;
- double p_licenses_for = -1.661557;
- double p_for_most = -0.4526253;
- // double p_most_software = -1.70295;
-
- double p_the_licenses_for = p_the_licenses + p_licenses_for;
- // double p_licenses_for_most = p_licenses_for + p_for_most;
-
- // the
- {
- Phrase phrase;
- BOOST_CHECK( phrase.GetSize() == 0 );
-
- std::vector<FactorType> outputFactorOrder;
- outputFactorOrder.push_back(0);
-
- phrase.CreateFromString(
- outputFactorOrder,
- "the",
- StaticData::Instance().GetFactorDelimiter());
-
- BOOST_CHECK( phrase.GetSize() == 1 );
-
- float fullScore;
- float ngramScore;
- size_t oovCount;
- backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount);
-
- BOOST_CHECK( oovCount == 0 );
- SLOPPY_CHECK_CLOSE( TransformLMScore(p_the), fullScore, 0.01);
- SLOPPY_CHECK_CLOSE( TransformLMScore( 0.0 ), ngramScore, 0.01);
- }
-
- // the licenses
- {
- Phrase phrase;
- BOOST_CHECK( phrase.GetSize() == 0 );
-
- std::vector<FactorType> outputFactorOrder;
- outputFactorOrder.push_back(0);
-
- phrase.CreateFromString(
- outputFactorOrder,
- "the licenses",
- StaticData::Instance().GetFactorDelimiter());
-
- BOOST_CHECK( phrase.GetSize() == 2 );
-
- float fullScore;
- float ngramScore;
- size_t oovCount;
- backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount);
-
- BOOST_CHECK( oovCount == 0 );
- SLOPPY_CHECK_CLOSE( TransformLMScore(p_licenses + p_the_licenses), fullScore, 0.01);
- SLOPPY_CHECK_CLOSE( TransformLMScore( 0.0 ), ngramScore, 0.01);
- }
-
- // the licenses for
- {
- Phrase phrase;
- BOOST_CHECK( phrase.GetSize() == 0 );
-
- std::vector<FactorType> outputFactorOrder;
- outputFactorOrder.push_back(0);
-
- phrase.CreateFromString(
- outputFactorOrder,
- "the licenses for",
- StaticData::Instance().GetFactorDelimiter());
-
- BOOST_CHECK( phrase.GetSize() == 3 );
-
- float fullScore;
- float ngramScore;
- size_t oovCount;
- backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount);
-
- BOOST_CHECK( oovCount == 0 );
- SLOPPY_CHECK_CLOSE( TransformLMScore( p_the_licenses_for ), ngramScore, 0.01);
- SLOPPY_CHECK_CLOSE( TransformLMScore(p_for + p_licenses_for + p_the_licenses), fullScore, 0.01);
- }
-
- // the licenses for most
- {
- Phrase phrase;
- BOOST_CHECK( phrase.GetSize() == 0 );
-
- std::vector<FactorType> outputFactorOrder;
- outputFactorOrder.push_back(0);
-
- phrase.CreateFromString(
- outputFactorOrder,
- "the licenses for most",
- StaticData::Instance().GetFactorDelimiter());
-
- BOOST_CHECK( phrase.GetSize() == 4 );
-
- float fullScore;
- float ngramScore;
- size_t oovCount;
- backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount);
-
- BOOST_CHECK( oovCount == 0 );
- SLOPPY_CHECK_CLOSE( TransformLMScore( p_the_licenses + p_licenses_for ), ngramScore, 0.01);
- SLOPPY_CHECK_CLOSE( TransformLMScore(p_most + p_for_most + p_licenses_for + p_the_licenses), fullScore, 0.01);
- }
-
+ // the licenses for most
+ {
+ Phrase phrase;
+ BOOST_CHECK( phrase.GetSize() == 0 );
+
+ std::vector<FactorType> outputFactorOrder;
+ outputFactorOrder.push_back(0);
+
+ phrase.CreateFromString(
+ outputFactorOrder,
+ "the licenses for most",
+ StaticData::Instance().GetFactorDelimiter());
+
+ BOOST_CHECK( phrase.GetSize() == 4 );
+
+ float fullScore;
+ float ngramScore;
+ size_t oovCount;
+ backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount);
+
+ BOOST_CHECK( oovCount == 0 );
+ SLOPPY_CHECK_CLOSE( TransformLMScore( p_the_licenses + p_licenses_for ), ngramScore, 0.01);
+ SLOPPY_CHECK_CLOSE( TransformLMScore(p_most + p_for_most + p_licenses_for + p_the_licenses), fullScore, 0.01);
}
-
+
+ }
+
void testEvaluate() {
FFState *nextState;
@@ -223,132 +224,134 @@ class BackwardLanguageModelTest {
double p_for_licenses = -1.661557;
double p_licenses_the = -0.9625873;
double p_the_eos = -1.940311;
-
-
- // the
- {
- Phrase phrase;
- BOOST_CHECK( phrase.GetSize() == 0 );
-
- std::vector<FactorType> outputFactorOrder;
- outputFactorOrder.push_back(0);
-
- phrase.CreateFromString(
- outputFactorOrder,
- "the",
- StaticData::Instance().GetFactorDelimiter());
-
- BOOST_CHECK( phrase.GetSize() == 1 );
-
- float score;
- nextState = backwardLM->Evaluate(phrase, prevState, score);
-
- // p(the) * p(</s> | the) / p(</s>)
- SLOPPY_CHECK_CLOSE( (p_the + p_the_eos - p_eos), score, 0.01);
-
- delete prevState;
- prevState = nextState;
-
- }
-
- // the licenses
- {
- Phrase phrase;
- BOOST_CHECK( phrase.GetSize() == 0 );
-
- std::vector<FactorType> outputFactorOrder;
- outputFactorOrder.push_back(0);
-
- phrase.CreateFromString(
- outputFactorOrder,
- "licenses",
- StaticData::Instance().GetFactorDelimiter());
-
- BOOST_CHECK( phrase.GetSize() == 1 );
-
- float score;
- nextState = backwardLM->Evaluate(phrase, prevState, score);
-
- // p(licenses) * p(licenses | the) / p(the)
- SLOPPY_CHECK_CLOSE( (p_licenses + p_licenses_the - p_the), score, 0.01);
-
- delete prevState;
- prevState = nextState;
-
- }
-
- // the licenses for
- {
- Phrase phrase;
- BOOST_CHECK( phrase.GetSize() == 0 );
-
- std::vector<FactorType> outputFactorOrder;
- outputFactorOrder.push_back(0);
-
- phrase.CreateFromString(
- outputFactorOrder,
- "for",
- StaticData::Instance().GetFactorDelimiter());
-
- BOOST_CHECK( phrase.GetSize() == 1 );
-
- float score;
- nextState = backwardLM->Evaluate(phrase, prevState, score);
-
- // p(for) * p(for | licenses) / p(licenses)
- SLOPPY_CHECK_CLOSE( (p_for + p_for_licenses - p_licenses), score, 0.01);
-
- delete prevState;
- prevState = nextState;
-
- }
-
- // the licenses for most
- {
- Phrase phrase;
- BOOST_CHECK( phrase.GetSize() == 0 );
-
- std::vector<FactorType> outputFactorOrder;
- outputFactorOrder.push_back(0);
-
- phrase.CreateFromString(
- outputFactorOrder,
- "most",
- StaticData::Instance().GetFactorDelimiter());
-
- BOOST_CHECK( phrase.GetSize() == 1 );
-
- float score;
- nextState = backwardLM->Evaluate(phrase, prevState, score);
-
- // p(most) * p(most | for) / p(for)
- SLOPPY_CHECK_CLOSE( (p_most + p_most_for - p_for), score, 0.01);
-
- delete prevState;
- prevState = nextState;
-
- }
+
+
+ // the
+ {
+ Phrase phrase;
+ BOOST_CHECK( phrase.GetSize() == 0 );
+
+ std::vector<FactorType> outputFactorOrder;
+ outputFactorOrder.push_back(0);
+
+ phrase.CreateFromString(
+ outputFactorOrder,
+ "the",
+ StaticData::Instance().GetFactorDelimiter());
+
+ BOOST_CHECK( phrase.GetSize() == 1 );
+
+ float score;
+ nextState = backwardLM->Evaluate(phrase, prevState, score);
+
+ // p(the) * p(</s> | the) / p(</s>)
+ SLOPPY_CHECK_CLOSE( (p_the + p_the_eos - p_eos), score, 0.01);
+
+ delete prevState;
+ prevState = nextState;
+
+ }
+
+ // the licenses
+ {
+ Phrase phrase;
+ BOOST_CHECK( phrase.GetSize() == 0 );
+
+ std::vector<FactorType> outputFactorOrder;
+ outputFactorOrder.push_back(0);
+
+ phrase.CreateFromString(
+ outputFactorOrder,
+ "licenses",
+ StaticData::Instance().GetFactorDelimiter());
+
+ BOOST_CHECK( phrase.GetSize() == 1 );
+
+ float score;
+ nextState = backwardLM->Evaluate(phrase, prevState, score);
+
+ // p(licenses) * p(licenses | the) / p(the)
+ SLOPPY_CHECK_CLOSE( (p_licenses + p_licenses_the - p_the), score, 0.01);
delete prevState;
+ prevState = nextState;
+
+ }
+
+ // the licenses for
+ {
+ Phrase phrase;
+ BOOST_CHECK( phrase.GetSize() == 0 );
+
+ std::vector<FactorType> outputFactorOrder;
+ outputFactorOrder.push_back(0);
+
+ phrase.CreateFromString(
+ outputFactorOrder,
+ "for",
+ StaticData::Instance().GetFactorDelimiter());
+
+ BOOST_CHECK( phrase.GetSize() == 1 );
+
+ float score;
+ nextState = backwardLM->Evaluate(phrase, prevState, score);
+
+ // p(for) * p(for | licenses) / p(licenses)
+ SLOPPY_CHECK_CLOSE( (p_for + p_for_licenses - p_licenses), score, 0.01);
+
+ delete prevState;
+ prevState = nextState;
+
+ }
+
+ // the licenses for most
+ {
+ Phrase phrase;
+ BOOST_CHECK( phrase.GetSize() == 0 );
+
+ std::vector<FactorType> outputFactorOrder;
+ outputFactorOrder.push_back(0);
+
+ phrase.CreateFromString(
+ outputFactorOrder,
+ "most",
+ StaticData::Instance().GetFactorDelimiter());
+
+ BOOST_CHECK( phrase.GetSize() == 1 );
+
+ float score;
+ nextState = backwardLM->Evaluate(phrase, prevState, score);
+
+ // p(most) * p(most | for) / p(for)
+ SLOPPY_CHECK_CLOSE( (p_most + p_most_for - p_for), score, 0.01);
+
+ delete prevState;
+ prevState = nextState;
+
+ }
+
+ delete prevState;
}
-
- private:
- const Sentence *dummyInput;
- BackwardLanguageModel<lm::ngram::ProbingModel> *backwardLM;
+
+private:
+ const Sentence *dummyInput;
+ BackwardLanguageModel<lm::ngram::ProbingModel> *backwardLM;
};
}
-const char *FileLocation() {
+const char *FileLocation()
+{
if (boost::unit_test::framework::master_test_suite().argc < 2) {
BOOST_FAIL("Jamfile must specify arpa file for this test, but did not");
}
return boost::unit_test::framework::master_test_suite().argv[1];
}
-BOOST_AUTO_TEST_CASE(ProbingAll) {
+BOOST_AUTO_TEST_CASE(ProbingAll)
+{
BackwardLanguageModelTest test;
test.testEmptyHypothesis();
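
The checks in testEvaluate all instantiate the same log-space identity: extending the state by one word adds that word's provisional unigram score plus the bigram that now covers the previous word, and retracts the previous word's provisional unigram. Restating the second check with the constants from the test (a worked example, not new behavior):

// score("licenses" given state "the")
//   = p(licenses) + p(licenses | the) - p(the)
//   = -2.360783  + (-0.9625873)      - (-1.383059)
//   = -1.9403113
// which is the value SLOPPY_CHECK_CLOSE verifies above.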
diff --git a/moses/LM/Base.cpp b/moses/LM/Base.cpp
index fe35604b0..37dc704de 100644
--- a/moses/LM/Base.cpp
+++ b/moses/LM/Base.cpp
@@ -31,63 +31,67 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
using namespace std;
-namespace Moses {
+namespace Moses
+{
LanguageModel::LanguageModel(const std::string& description, const std::string &line) :
StatefulFeatureFunction(description, StaticData::Instance().GetLMEnableOOVFeature() ? 2 : 1, line )
{
- m_enableOOVFeature = StaticData::Instance().GetLMEnableOOVFeature();
+ m_enableOOVFeature = StaticData::Instance().GetLMEnableOOVFeature();
}
LanguageModel::~LanguageModel() {}
-float LanguageModel::GetWeight() const {
+float LanguageModel::GetWeight() const
+{
//return StaticData::Instance().GetAllWeights().GetScoresForProducer(this)[0];
return StaticData::Instance().GetWeights(this)[0];
}
-float LanguageModel::GetOOVWeight() const {
+float LanguageModel::GetOOVWeight() const
+{
if (m_enableOOVFeature) {
//return StaticData::Instance().GetAllWeights().GetScoresForProducer(this)[1];
- return StaticData::Instance().GetWeights(this)[1];
+ return StaticData::Instance().GetWeights(this)[1];
} else {
return 0;
}
}
-void LanguageModel::IncrementalCallback(Incremental::Manager &manager) const {
+void LanguageModel::IncrementalCallback(Incremental::Manager &manager) const
+{
UTIL_THROW(util::Exception, "Incremental search is only supported by KenLM.");
}
void LanguageModel::Evaluate(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const
{
- if (Useable(targetPhrase)) {
- // contains factors used by this LM
- float fullScore, nGramScore;
- size_t oovCount;
-
- CalcScore(targetPhrase, fullScore, nGramScore, oovCount);
- float estimateScore = fullScore - nGramScore;
-
- if (StaticData::Instance().GetLMEnableOOVFeature()) {
- vector<float> scores(2), estimateScores(2);
- scores[0] = nGramScore;
- scores[1] = oovCount;
- scoreBreakdown.Assign(this, scores);
-
- estimateScores[0] = estimateScore;
- estimateScores[1] = 0;
- estimatedFutureScore.Assign(this, estimateScores);
- } else {
- scoreBreakdown.Assign(this, nGramScore);
- estimatedFutureScore.Assign(this, estimateScore);
- }
-
- }
+ if (Useable(targetPhrase)) {
+ // contains factors used by this LM
+ float fullScore, nGramScore;
+ size_t oovCount;
+
+ CalcScore(targetPhrase, fullScore, nGramScore, oovCount);
+ float estimateScore = fullScore - nGramScore;
+
+ if (StaticData::Instance().GetLMEnableOOVFeature()) {
+ vector<float> scores(2), estimateScores(2);
+ scores[0] = nGramScore;
+ scores[1] = oovCount;
+ scoreBreakdown.Assign(this, scores);
+
+ estimateScores[0] = estimateScore;
+ estimateScores[1] = 0;
+ estimatedFutureScore.Assign(this, estimateScores);
+ } else {
+ scoreBreakdown.Assign(this, nGramScore);
+ estimatedFutureScore.Assign(this, estimateScore);
+ }
+
+ }
}
const LanguageModel &LanguageModel::GetFirstLM()
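
To make the split in LanguageModel::Evaluate above concrete: only the full-order part of the phrase-internal score is final, and the remainder is booked as a future-cost estimate to be replaced once real context is known. An illustrative breakdown (numbers invented for the example):

// Illustrative numbers only.
//   fullScore     = -4.0  // all n-grams inside the phrase
//   nGramScore    = -3.0  // full-order n-grams -> scoreBreakdown
//   estimateScore = -1.0  // fullScore - nGramScore
//                         // -> estimatedFutureScore, rescored in context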
diff --git a/moses/LM/Base.h b/moses/LM/Base.h
index 961fead5f..1f976ee53 100644
--- a/moses/LM/Base.h
+++ b/moses/LM/Base.h
@@ -30,21 +30,25 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
namespace Moses
{
-namespace Incremental { class Manager; }
+namespace Incremental
+{
+class Manager;
+}
class FactorCollection;
class Factor;
class Phrase;
//! Abstract base class which represent a language model on a contiguous phrase
-class LanguageModel : public StatefulFeatureFunction {
+class LanguageModel : public StatefulFeatureFunction
+{
protected:
LanguageModel(const std::string& description, const std::string &line);
  // This can't be in the constructor for virtual function dispatch reasons
bool m_enableOOVFeature;
-
+
public:
static const LanguageModel &GetFirstLM();
@@ -89,9 +93,9 @@ public:
virtual void IncrementalCallback(Incremental::Manager &manager) const;
virtual void Evaluate(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const;
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const;
};
diff --git a/moses/LM/ChartState.h b/moses/LM/ChartState.h
index b6bdd8f7c..186694927 100644
--- a/moses/LM/ChartState.h
+++ b/moses/LM/ChartState.h
@@ -19,16 +19,15 @@ private:
const ChartHypothesis &m_hypo;
- /** Construct the prefix string of up to specified size
+  /** Construct the prefix string of up to the specified size
* \param ret prefix string
* \param size maximum size (typically max lm context window)
*/
- size_t CalcPrefix(const ChartHypothesis &hypo, int featureID, Phrase &ret, size_t size) const
- {
+ size_t CalcPrefix(const ChartHypothesis &hypo, int featureID, Phrase &ret, size_t size) const {
const TargetPhrase &target = hypo.GetCurrTargetPhrase();
const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
- target.GetAlignNonTerm().GetNonTermIndexMap();
-
+ target.GetAlignNonTerm().GetNonTermIndexMap();
+
// loop over the rule that is being applied
for (size_t pos = 0; pos < target.GetSize(); ++pos) {
const Word &word = target.GetWord(pos);
@@ -53,13 +52,12 @@ private:
return size;
}
- /** Construct the suffix phrase of up to specified size
+  /** Construct the suffix phrase of up to the specified size
* will always be called after the construction of prefix phrase
* \param ret suffix phrase
* \param size maximum size of suffix
*/
- size_t CalcSuffix(const ChartHypothesis &hypo, int featureID, Phrase &ret, size_t size) const
- {
+ size_t CalcSuffix(const ChartHypothesis &hypo, int featureID, Phrase &ret, size_t size) const {
CHECK(m_contextPrefix.GetSize() <= m_numTargetTerminals);
// special handling for small hypotheses
@@ -81,7 +79,7 @@ private:
else {
const TargetPhrase& target = hypo.GetCurrTargetPhrase();
const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
- target.GetAlignNonTerm().GetNonTermIndexMap();
+ target.GetAlignNonTerm().GetNonTermIndexMap();
for (int pos = (int) target.GetSize() - 1; pos >= 0 ; --pos) {
const Word &word = target.GetWord(pos);
@@ -89,8 +87,7 @@ private:
size_t nonTermInd = nonTermIndexMap[pos];
const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermInd);
size = static_cast<const LanguageModelChartState*>(prevHypo->GetFFState(featureID))->CalcSuffix(*prevHypo, featureID, ret, size);
- }
- else {
+ } else {
ret.PrependWord(hypo.GetCurrTargetPhrase().GetWord(pos));
size--;
}
@@ -106,11 +103,10 @@ private:
public:
LanguageModelChartState(const ChartHypothesis &hypo, int featureID, size_t order)
- :m_lmRightContext(NULL)
- ,m_contextPrefix(order - 1)
- ,m_contextSuffix( order - 1)
- ,m_hypo(hypo)
- {
+ :m_lmRightContext(NULL)
+ ,m_contextPrefix(order - 1)
+ ,m_contextSuffix( order - 1)
+ ,m_hypo(hypo) {
m_numTargetTerminals = hypo.GetCurrTargetPhrase().GetNumTerminals();
for (std::vector<const ChartHypothesis*>::const_iterator i = hypo.GetPrevHypos().begin(); i != hypo.GetPrevHypos().end(); ++i) {
@@ -131,8 +127,12 @@ public:
m_lmRightContext = rightState;
}
- float GetPrefixScore() const { return m_prefixScore; }
- FFState* GetRightContext() const { return m_lmRightContext; }
+ float GetPrefixScore() const {
+ return m_prefixScore;
+ }
+ FFState* GetRightContext() const {
+ return m_lmRightContext;
+ }
size_t GetNumTargetTerminals() const {
return m_numTargetTerminals;
@@ -150,8 +150,7 @@ public:
dynamic_cast<const LanguageModelChartState &>( o );
// prefix
- if (m_hypo.GetCurrSourceRange().GetStartPos() > 0) // not for "<s> ..."
- {
+ if (m_hypo.GetCurrSourceRange().GetStartPos() > 0) { // not for "<s> ..."
int ret = GetPrefix().Compare(other.GetPrefix());
if (ret != 0)
return ret;
@@ -159,8 +158,7 @@ public:
// suffix
size_t inputSize = m_hypo.GetManager().GetSource().GetSize();
- if (m_hypo.GetCurrSourceRange().GetEndPos() < inputSize - 1)// not for "... </s>"
- {
+ if (m_hypo.GetCurrSourceRange().GetEndPos() < inputSize - 1) { // not for "... </s>"
int ret = other.GetRightContext()->Compare(*m_lmRightContext);
if (ret != 0)
return ret;
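
For context on CalcPrefix and CalcSuffix above: with an order-n model the chart state only needs the outermost n-1 words on each side of a hypothesis's yield, so state comparison can confine itself to those windows. A hedged illustration:

// Illustrative: order = 3, hypothesis yield = "the licenses for most".
//   prefix window: "the licenses"  (leftmost order-1 words; their scores
//                                   may still change given left context)
//   suffix window: "for most"      (rightmost order-1 words; all that a
//                                   future n-gram can see of this span)
// Hypotheses agreeing on both windows are equivalent to the LM
// and can be recombined.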
diff --git a/moses/LM/IRST.cpp b/moses/LM/IRST.cpp
index 2d58bd310..ae1bb677d 100644
--- a/moses/LM/IRST.cpp
+++ b/moses/LM/IRST.cpp
@@ -40,25 +40,22 @@ using namespace std;
namespace Moses
{
LanguageModelIRST::LanguageModelIRST(const std::string &line)
-:LanguageModelSingleFactor("IRSTLM", line)
+ :LanguageModelSingleFactor("IRSTLM", line)
{
FactorType factorType;
size_t nGramOrder;
string filePath;
for (size_t i = 0; i < m_args.size(); ++i) {
- const vector<string> &args = m_args[i];
+ const vector<string> &args = m_args[i];
if (args[0] == "factor") {
factorType = Scan<FactorType>(args[1]);
- }
- else if (args[0] == "order") {
+ } else if (args[0] == "order") {
nGramOrder = Scan<size_t>(args[1]);
- }
- else if (args[0] == "path") {
+ } else if (args[0] == "path") {
filePath = args[1];
- }
- else {
+ } else {
throw "Unknown argument " + args[0];
}
}
@@ -86,8 +83,7 @@ bool LanguageModelIRST::Load(const std::string &filePath,
const StaticData &staticData = StaticData::Instance();
int threadCount = staticData.ThreadCount();
- if (threadCount != 1)
- {
+ if (threadCount != 1) {
    UserMessage::Add(SPrint(threadCount) + " threads specified but IRST LM is not threadsafe.");
return false;
}
@@ -99,7 +95,7 @@ bool LanguageModelIRST::Load(const std::string &filePath,
m_filePath = filePath;
- m_lmtb = m_lmtb->CreateLanguageModel(m_filePath);
+ m_lmtb = m_lmtb->CreateLanguageModel(m_filePath);
m_lmtb->setMaxLoadedLevel(1000);
m_lmtb->load(m_filePath);
d=m_lmtb->getDict();
@@ -170,7 +166,7 @@ int LanguageModelIRST::GetLmID( const std::string &str ) const
}
int LanguageModelIRST::GetLmID( const Factor *factor ) const
-{
+{
size_t factorId = factor->GetId();
if ((factorId >= m_lmIdLookup.size()) || (m_lmIdLookup[factorId] == m_empty)) {
@@ -180,12 +176,12 @@ int LanguageModelIRST::GetLmID( const Factor *factor ) const
//////////
///poiche' non c'e' distinzione tra i factorIDs delle parole sorgenti
- ///e delle parole target in Moses, puo' accadere che una parola target
+ ///e delle parole target in Moses, puo' accadere che una parola target
///di cui non sia stato ancora calcolato il suo codice target abbia
///comunque un factorID noto (e quindi minore di m_lmIdLookup.size())
///E' necessario dunque identificare questi casi di indeterminatezza
///del codice target. Attualamente, questo controllo e' stato implementato
- ///impostando a m_empty tutti i termini che non hanno ancora
+ ///impostando a m_empty tutti i termini che non hanno ancora
//ricevuto un codice target effettivo
///////////
@@ -197,7 +193,7 @@ int LanguageModelIRST::GetLmID( const Factor *factor ) const
/// AT POSITION (factorID-1) instead of at position factorID where we later read it (see case C
/// That way it works ....
/// I have a doubt about what is in the first positions of m_lmIdLookup
-/// so
+/// so
/// and I find that one entry out of every two stays empty
/// because factorID grows by twos (since it encodes both source and target), which "empties" position (factorID-1)
/// no correctness problem, just a "waste" of memory
@@ -207,10 +203,10 @@ int LanguageModelIRST::GetLmID( const Factor *factor ) const
////////////////
- if (factorId >= m_lmIdLookup.size()){
- //resize and fill with m_empty
- //increment the array more than needed to avoid too many resizing operation.
- m_lmIdLookup.resize(factorId+10, m_empty);
+ if (factorId >= m_lmIdLookup.size()) {
+ //resize and fill with m_empty
+    //increment the array more than needed to avoid too many resizing operations.
+ m_lmIdLookup.resize(factorId+10, m_empty);
}
//insert new code
diff --git a/moses/LM/Implementation.cpp b/moses/LM/Implementation.cpp
index 798a12775..e9c651089 100644
--- a/moses/LM/Implementation.cpp
+++ b/moses/LM/Implementation.cpp
@@ -69,8 +69,9 @@ void LanguageModelImplementation::GetState(
GetValueForgotState(contextFactor, state);
}
-// Calculate score of a phrase.
-void LanguageModelImplementation::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const {
+// Calculate score of a phrase.
+void LanguageModelImplementation::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const
+{
fullScore = 0;
ngramScore = 0;
@@ -82,7 +83,7 @@ void LanguageModelImplementation::CalcScore(const Phrase &phrase, float &fullSco
vector<const Word*> contextFactor;
contextFactor.reserve(GetNGramOrder());
std::auto_ptr<FFState> state(NewState((phrase.GetWord(0) == GetSentenceStartWord()) ?
- GetBeginSentenceState() : GetNullContextState()));
+ GetBeginSentenceState() : GetNullContextState()));
size_t currPos = 0;
while (currPos < phraseSize) {
const Word &word = phrase.GetWord(currPos);
@@ -109,7 +110,7 @@ void LanguageModelImplementation::CalcScore(const Phrase &phrase, float &fullSco
fullScore += result.score;
if (contextFactor.size() == GetNGramOrder())
ngramScore += result.score;
- if (result.unknown) ++oovCount;
+ if (result.unknown) ++oovCount;
}
}
@@ -117,7 +118,8 @@ void LanguageModelImplementation::CalcScore(const Phrase &phrase, float &fullSco
}
}
-FFState *LanguageModelImplementation::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const {
+FFState *LanguageModelImplementation::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const
+{
// In this function, we only compute the LM scores of n-grams that overlap a
// phrase boundary. Phrase-internal scores are taken directly from the
// translation option.
@@ -179,9 +181,7 @@ FFState *LanguageModelImplementation::Evaluate(const Hypothesis &hypo, const FFS
contextFactor[i] = &hypo.GetWord((size_t)currPos);
}
lmScore += GetValueForgotState(contextFactor, *res).score;
- }
- else
- {
+ } else {
if (endPos < currEndPos) {
//need to get the LM state (otherwise the last LM state is fine)
for (size_t currPos = endPos+1; currPos <= currEndPos; currPos++) {
@@ -208,7 +208,8 @@ FFState *LanguageModelImplementation::Evaluate(const Hypothesis &hypo, const FFS
return res;
}
-FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection* out) const {
+FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection* out) const
+{
LanguageModelChartState *ret = new LanguageModelChartState(hypo, featureID, GetNGramOrder());
// data structure for factored context phrase (history and predicted word)
vector<const Word*> contextFactor;
@@ -223,38 +224,33 @@ FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo,
// get index map for underlying hypotheses
const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
- hypo.GetCurrTargetPhrase().GetAlignNonTerm().GetNonTermIndexMap();
+ hypo.GetCurrTargetPhrase().GetAlignNonTerm().GetNonTermIndexMap();
// loop over rule
for (size_t phrasePos = 0, wordPos = 0;
phrasePos < hypo.GetCurrTargetPhrase().GetSize();
- phrasePos++)
- {
+ phrasePos++) {
// consult rule for either word or non-terminal
const Word &word = hypo.GetCurrTargetPhrase().GetWord(phrasePos);
// regular word
- if (!word.IsNonTerminal())
- {
+ if (!word.IsNonTerminal()) {
ShiftOrPush(contextFactor, word);
// beginning of sentence symbol <s>? -> just update state
- if (word == GetSentenceStartWord())
- {
+ if (word == GetSentenceStartWord()) {
CHECK(phrasePos == 0);
delete lmState;
lmState = NewState( GetBeginSentenceState() );
}
// score a regular word added by the rule
- else
- {
+ else {
updateChartScore( &prefixScore, &finalizedScore, GetValueGivenState(contextFactor, *lmState).score, ++wordPos );
}
}
// non-terminal, add phrase from underlying hypothesis
- else
- {
+ else {
// look up underlying hypothesis
size_t nonTermIndex = nonTermIndexMap[phrasePos];
const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndex);
@@ -278,8 +274,7 @@ FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo,
// push suffix
int suffixPos = prevState->GetSuffix().GetSize() - (GetNGramOrder()-1);
if (suffixPos < 0) suffixPos = 0; // push all words if less than order
- for(;(size_t)suffixPos < prevState->GetSuffix().GetSize(); suffixPos++)
- {
+ for(; (size_t)suffixPos < prevState->GetSuffix().GetSize(); suffixPos++) {
const Word &word = prevState->GetSuffix().GetWord(suffixPos);
ShiftOrPush(contextFactor, word);
wordPos++;
@@ -287,22 +282,19 @@ FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo,
}
// internal non-terminal
- else
- {
+ else {
// score its prefix
for(size_t prefixPos = 0;
prefixPos < GetNGramOrder()-1 // up to LM order window
- && prefixPos < subPhraseLength; // up to length
- prefixPos++)
- {
+ && prefixPos < subPhraseLength; // up to length
+ prefixPos++) {
const Word &word = prevState->GetPrefix().GetWord(prefixPos);
ShiftOrPush(contextFactor, word);
updateChartScore( &prefixScore, &finalizedScore, GetValueGivenState(contextFactor, *lmState).score, ++wordPos );
}
// check if we are dealing with a large sub-phrase
- if (subPhraseLength > GetNGramOrder() - 1)
- {
+ if (subPhraseLength > GetNGramOrder() - 1) {
// add its finalized language model score
finalizedScore +=
prevHypo->GetScoreBreakdown().GetScoresForProducer(this)[0] // full score
@@ -337,11 +329,11 @@ FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo,
return ret;
}
-void LanguageModelImplementation::updateChartScore(float *prefixScore, float *finalizedScore, float score, size_t wordPos) const {
+void LanguageModelImplementation::updateChartScore(float *prefixScore, float *finalizedScore, float score, size_t wordPos) const
+{
if (wordPos < GetNGramOrder()) {
*prefixScore += score;
- }
- else {
+ } else {
*finalizedScore += score;
}
}
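
The updateChartScore() change above is cosmetic, but the function itself carries the key invariant of chart decoding: the first order-1 words of a span can still acquire left context higher up in the derivation, so their scores accumulate in a revisable prefix total, while later words are final. A minimal self-contained sketch of that split (the word scores and the trigram order are invented for illustration):

#include <cstddef>
#include <iostream>

// Same accumulation rule as updateChartScore(): 1-based positions below
// the LM order feed the prefix score, the rest the finalized score.
static void updateChartScore(float *prefixScore, float *finalizedScore,
                             float score, std::size_t wordPos, std::size_t order) {
  if (wordPos < order) {
    *prefixScore += score;     // may be re-scored with more left context
  } else {
    *finalizedScore += score;  // safe to freeze
  }
}

int main() {
  const std::size_t order = 3;                              // trigram, for illustration
  float prefix = 0, finalized = 0;
  const float wordScores[] = {-1.2f, -0.7f, -2.1f, -0.4f};  // made-up log-probs
  for (std::size_t pos = 0; pos < 4; ++pos)
    updateChartScore(&prefix, &finalized, wordScores[pos], pos + 1, order);
  std::cout << "prefix=" << prefix << " finalized=" << finalized << "\n";
  // Prints prefix=-1.9 finalized=-2.5: the first order-1 words stay revisable.
}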
diff --git a/moses/LM/Implementation.h b/moses/LM/Implementation.h
index d3f83dfe1..fa6619208 100644
--- a/moses/LM/Implementation.h
+++ b/moses/LM/Implementation.h
@@ -44,7 +44,7 @@ class Phrase;
struct LMResult {
// log probability
float score;
- // Is the word unknown?
+ // Is the word unknown?
bool unknown;
};
@@ -62,7 +62,7 @@ protected:
//! Usually <s> and </s>
LanguageModelImplementation(const std::string& description, const std::string &line)
- :LanguageModel(description, line)
+ :LanguageModel(description, line)
{}
public:
@@ -108,8 +108,7 @@ public:
return m_sentenceEndWord;
}
- const FFState* EmptyHypothesisState(const InputType &/*input*/) const
- {
+ const FFState* EmptyHypothesisState(const InputType &/*input*/) const {
return NewState(GetBeginSentenceState());
}
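
Since LMResult is the unit every scorer above returns, it may help to see the folding pattern CalcScore() applies to it in isolation. A hedged sketch with a stub lookup in place of GetValueForgotState(); the word ids and scores are invented:

#include <cstddef>
#include <vector>

struct LMResult {
  float score;   // log probability
  bool unknown;  // is the word unknown?
};

// Stub lookup; the real code queries the LM with a factored context window.
static LMResult lookup(std::size_t wordId) {
  LMResult r;
  r.unknown = (wordId == 0);             // id 0 plays the OOV here
  r.score = r.unknown ? -10.0f : -1.0f;
  return r;
}

// Mirrors CalcScore(): every n-gram feeds fullScore, only n-grams with a
// full-order context feed ngramScore, and unknown words bump oovCount.
static void calcScore(const std::vector<std::size_t> &words, std::size_t order,
                      float &fullScore, float &ngramScore, std::size_t &oovCount) {
  fullScore = 0; ngramScore = 0; oovCount = 0;
  for (std::size_t i = 0; i < words.size(); ++i) {
    const LMResult result = lookup(words[i]);
    fullScore += result.score;
    if (i + 1 >= order) ngramScore += result.score;  // context window is full
    if (result.unknown) ++oovCount;
  }
}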
diff --git a/moses/LM/Joint.h b/moses/LM/Joint.h
index 5bc52e2da..3a675cbd6 100644
--- a/moses/LM/Joint.h
+++ b/moses/LM/Joint.h
@@ -50,8 +50,7 @@ protected:
size_t m_implFactor;
public:
LanguageModelJoint(const std::string &line, LanguageModelSingleFactor *lmImpl)
- :LanguageModelMultiFactor("JointLM", line)
- {
+ :LanguageModelMultiFactor("JointLM", line) {
m_lmImpl = lmImpl;
}
diff --git a/moses/LM/Ken.cpp b/moses/LM/Ken.cpp
index e251661c3..af24ad858 100644
--- a/moses/LM/Ken.cpp
+++ b/moses/LM/Ken.cpp
@@ -45,8 +45,10 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
using namespace std;
-namespace Moses {
-namespace {
+namespace Moses
+{
+namespace
+{
struct KenLMState : public FFState {
lm::ngram::State state;
@@ -61,63 +63,65 @@ struct KenLMState : public FFState {
/*
* An implementation of single factor LM using Ken's code.
*/
-template <class Model> class LanguageModelKen : public LanguageModel {
- public:
- LanguageModelKen(const std::string &description, const std::string &line, const std::string &file, FactorType factorType, bool lazy);
+template <class Model> class LanguageModelKen : public LanguageModel
+{
+public:
+ LanguageModelKen(const std::string &description, const std::string &line, const std::string &file, FactorType factorType, bool lazy);
- bool Useable(const Phrase &phrase) const {
- return (phrase.GetSize()>0 && phrase.GetFactor(0, m_factorType) != NULL);
- }
+ bool Useable(const Phrase &phrase) const {
+ return (phrase.GetSize()>0 && phrase.GetFactor(0, m_factorType) != NULL);
+ }
- const FFState *EmptyHypothesisState(const InputType &/*input*/) const {
- KenLMState *ret = new KenLMState();
- ret->state = m_ngram->BeginSentenceState();
- return ret;
- }
+ const FFState *EmptyHypothesisState(const InputType &/*input*/) const {
+ KenLMState *ret = new KenLMState();
+ ret->state = m_ngram->BeginSentenceState();
+ return ret;
+ }
- void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const;
+ void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const;
- FFState *Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const;
+ FFState *Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const;
- FFState *EvaluateChart(const ChartHypothesis& cur_hypo, int featureID, ScoreComponentCollection *accumulator) const;
+ FFState *EvaluateChart(const ChartHypothesis& cur_hypo, int featureID, ScoreComponentCollection *accumulator) const;
- void IncrementalCallback(Incremental::Manager &manager) const {
- manager.LMCallback(*m_ngram, m_lmIdLookup);
- }
+ void IncrementalCallback(Incremental::Manager &manager) const {
+ manager.LMCallback(*m_ngram, m_lmIdLookup);
+ }
- private:
- LanguageModelKen(const LanguageModelKen<Model> &copy_from);
+private:
+ LanguageModelKen(const LanguageModelKen<Model> &copy_from);
- lm::WordIndex TranslateID(const Word &word) const {
- std::size_t factor = word.GetFactor(m_factorType)->GetId();
- return (factor >= m_lmIdLookup.size() ? 0 : m_lmIdLookup[factor]);
- }
+ lm::WordIndex TranslateID(const Word &word) const {
+ std::size_t factor = word.GetFactor(m_factorType)->GetId();
+ return (factor >= m_lmIdLookup.size() ? 0 : m_lmIdLookup[factor]);
+ }
- // Convert last words of hypothesis into vocab ids, returning an end pointer.
- lm::WordIndex *LastIDs(const Hypothesis &hypo, lm::WordIndex *indices) const {
- lm::WordIndex *index = indices;
- lm::WordIndex *end = indices + m_ngram->Order() - 1;
- int position = hypo.GetCurrTargetWordsRange().GetEndPos();
- for (; ; ++index, --position) {
- if (index == end) return index;
- if (position == -1) {
- *index = m_ngram->GetVocabulary().BeginSentence();
- return index + 1;
- }
- *index = TranslateID(hypo.GetWord(position));
+ // Convert last words of hypothesis into vocab ids, returning an end pointer.
+ lm::WordIndex *LastIDs(const Hypothesis &hypo, lm::WordIndex *indices) const {
+ lm::WordIndex *index = indices;
+ lm::WordIndex *end = indices + m_ngram->Order() - 1;
+ int position = hypo.GetCurrTargetWordsRange().GetEndPos();
+ for (; ; ++index, --position) {
+ if (index == end) return index;
+ if (position == -1) {
+ *index = m_ngram->GetVocabulary().BeginSentence();
+ return index + 1;
}
+ *index = TranslateID(hypo.GetWord(position));
}
+ }
- boost::shared_ptr<Model> m_ngram;
-
- std::vector<lm::WordIndex> m_lmIdLookup;
+ boost::shared_ptr<Model> m_ngram;
- FactorType m_factorType;
+ std::vector<lm::WordIndex> m_lmIdLookup;
- const Factor *m_beginSentenceFactor;
+ FactorType m_factorType;
+
+ const Factor *m_beginSentenceFactor;
};
-class MappingBuilder : public lm::EnumerateVocab {
+class MappingBuilder : public lm::EnumerateVocab
+{
public:
MappingBuilder(FactorCollection &factorCollection, std::vector<lm::WordIndex> &mapping)
: m_factorCollection(factorCollection), m_mapping(mapping) {}
@@ -137,13 +141,14 @@ private:
};
template <class Model> LanguageModelKen<Model>::LanguageModelKen(const std::string &description, const std::string &line, const std::string &file, FactorType factorType, bool lazy)
-:LanguageModel(description, line)
-,m_factorType(factorType)
+ :LanguageModel(description, line)
+ ,m_factorType(factorType)
{
lm::ngram::Config config;
IFVERBOSE(1) {
config.messages = &std::cerr;
- } else {
+ }
+ else {
config.messages = NULL;
}
FactorCollection &collection = FactorCollection::Instance();
@@ -157,15 +162,17 @@ template <class Model> LanguageModelKen<Model>::LanguageModelKen(const std::stri
}
template <class Model> LanguageModelKen<Model>::LanguageModelKen(const LanguageModelKen<Model> &copy_from)
-:LanguageModel(copy_from.GetScoreProducerDescription(), copy_from.GetArgLine()),
-m_ngram(copy_from.m_ngram),
+ :LanguageModel(copy_from.GetScoreProducerDescription(), copy_from.GetArgLine()),
+ m_ngram(copy_from.m_ngram),
// TODO: don't copy this.
-m_lmIdLookup(copy_from.m_lmIdLookup),
-m_factorType(copy_from.m_factorType),
-m_beginSentenceFactor(copy_from.m_beginSentenceFactor) {
+ m_lmIdLookup(copy_from.m_lmIdLookup),
+ m_factorType(copy_from.m_factorType),
+ m_beginSentenceFactor(copy_from.m_beginSentenceFactor)
+{
}
-template <class Model> void LanguageModelKen<Model>::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const {
+template <class Model> void LanguageModelKen<Model>::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const
+{
fullScore = 0;
ngramScore = 0;
oovCount = 0;
@@ -174,7 +181,7 @@ template <class Model> void LanguageModelKen<Model>::CalcScore(const Phrase &phr
lm::ngram::ChartState discarded_sadly;
lm::ngram::RuleScore<Model> scorer(*m_ngram, discarded_sadly);
-
+
size_t position;
if (m_beginSentenceFactor == phrase.GetWord(0).GetFactor(m_factorType)) {
scorer.BeginSentence();
@@ -182,7 +189,7 @@ template <class Model> void LanguageModelKen<Model>::CalcScore(const Phrase &phr
} else {
position = 0;
}
-
+
size_t ngramBoundary = m_ngram->Order() - 1;
size_t end_loop = std::min(ngramBoundary, phrase.GetSize());
@@ -199,7 +206,7 @@ template <class Model> void LanguageModelKen<Model>::CalcScore(const Phrase &phr
}
float before_boundary = fullScore + scorer.Finish();
for (; position < phrase.GetSize(); ++position) {
- const Word &word = phrase.GetWord(position);
+ const Word &word = phrase.GetWord(position);
if (word.IsNonTerminal()) {
fullScore += scorer.Finish();
scorer.Reset();
@@ -207,7 +214,7 @@ template <class Model> void LanguageModelKen<Model>::CalcScore(const Phrase &phr
lm::WordIndex index = TranslateID(word);
scorer.Terminal(index);
if (!index) ++oovCount;
- }
+ }
}
fullScore += scorer.Finish();
@@ -215,11 +222,12 @@ template <class Model> void LanguageModelKen<Model>::CalcScore(const Phrase &phr
fullScore = TransformLMScore(fullScore);
}
-template <class Model> FFState *LanguageModelKen<Model>::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const {
+template <class Model> FFState *LanguageModelKen<Model>::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const
+{
const lm::ngram::State &in_state = static_cast<const KenLMState&>(*ps).state;
std::auto_ptr<KenLMState> ret(new KenLMState());
-
+
if (!hypo.GetCurrTargetLength()) {
ret->state = in_state;
return ret.release();
@@ -242,17 +250,17 @@ template <class Model> FFState *LanguageModelKen<Model>::Evaluate(const Hypothes
}
if (hypo.IsSourceCompleted()) {
- // Score end of sentence.
+ // Score end of sentence.
std::vector<lm::WordIndex> indices(m_ngram->Order() - 1);
const lm::WordIndex *last = LastIDs(hypo, &indices.front());
score += m_ngram->FullScoreForgotState(&indices.front(), last, m_ngram->GetVocabulary().EndSentence(), ret->state).prob;
} else if (adjust_end < end) {
- // Get state after adding a long phrase.
+ // Get state after adding a long phrase.
std::vector<lm::WordIndex> indices(m_ngram->Order() - 1);
const lm::WordIndex *last = LastIDs(hypo, &indices.front());
m_ngram->GetState(&indices.front(), last, ret->state);
} else if (state0 != &ret->state) {
- // Short enough phrase that we can just reuse the state.
+ // Short enough phrase that we can just reuse the state.
ret->state = *state0;
}
@@ -270,34 +278,39 @@ template <class Model> FFState *LanguageModelKen<Model>::Evaluate(const Hypothes
return ret.release();
}
-class LanguageModelChartStateKenLM : public FFState {
- public:
- LanguageModelChartStateKenLM() {}
+class LanguageModelChartStateKenLM : public FFState
+{
+public:
+ LanguageModelChartStateKenLM() {}
- const lm::ngram::ChartState &GetChartState() const { return m_state; }
- lm::ngram::ChartState &GetChartState() { return m_state; }
+ const lm::ngram::ChartState &GetChartState() const {
+ return m_state;
+ }
+ lm::ngram::ChartState &GetChartState() {
+ return m_state;
+ }
- int Compare(const FFState& o) const
- {
- const LanguageModelChartStateKenLM &other = static_cast<const LanguageModelChartStateKenLM&>(o);
- int ret = m_state.Compare(other.m_state);
- return ret;
- }
+ int Compare(const FFState& o) const {
+ const LanguageModelChartStateKenLM &other = static_cast<const LanguageModelChartStateKenLM&>(o);
+ int ret = m_state.Compare(other.m_state);
+ return ret;
+ }
- private:
- lm::ngram::ChartState m_state;
+private:
+ lm::ngram::ChartState m_state;
};
-template <class Model> FFState *LanguageModelKen<Model>::EvaluateChart(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection *accumulator) const {
+template <class Model> FFState *LanguageModelKen<Model>::EvaluateChart(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection *accumulator) const
+{
LanguageModelChartStateKenLM *newState = new LanguageModelChartStateKenLM();
lm::ngram::RuleScore<Model> ruleScore(*m_ngram, newState->GetChartState());
const TargetPhrase &target = hypo.GetCurrTargetPhrase();
const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
- target.GetAlignNonTerm().GetNonTermIndexMap();
+ target.GetAlignNonTerm().GetNonTermIndexMap();
const size_t size = hypo.GetCurrTargetPhrase().GetSize();
size_t phrasePos = 0;
- // Special cases for first word.
+ // Special cases for first word.
if (size) {
const Word &word = hypo.GetCurrTargetPhrase().GetWord(0);
if (word.GetFactor(m_factorType) == m_beginSentenceFactor) {
@@ -305,7 +318,7 @@ template <class Model> FFState *LanguageModelKen<Model>::EvaluateChart(const Cha
ruleScore.BeginSentence();
phrasePos++;
} else if (word.IsNonTerminal()) {
- // Non-terminal is first so we can copy instead of rescoring.
+ // Non-terminal is first so we can copy instead of rescoring.
const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndexMap[phrasePos]);
const lm::ngram::ChartState &prevState = static_cast<const LanguageModelChartStateKenLM*>(prevHypo->GetFFState(featureID))->GetChartState();
float prob = UntransformLMScore(prevHypo->GetScoreBreakdown().GetScoresForProducer(this)[0]);
@@ -347,20 +360,15 @@ LanguageModel *ConstructKenLM(const std::string &description, const std::string
if (args[0] == "factor") {
factorType = Scan<FactorType>(args[1]);
- }
- else if (args[0] == "order") {
+ } else if (args[0] == "order") {
//nGramOrder = Scan<size_t>(args[1]);
- }
- else if (args[0] == "path") {
+ } else if (args[0] == "path") {
filePath = args[1];
- }
- else if (args[0] == "lazyken") {
+ } else if (args[0] == "lazyken") {
lazy = Scan<bool>(args[1]);
- }
- else if (args[0] == "name") {
+ } else if (args[0] == "name") {
// that's ok. do nothing, passes onto LM constructor
- }
- else {
+ } else {
throw "Unknown argument " + args[0];
}
}
@@ -368,26 +376,27 @@ LanguageModel *ConstructKenLM(const std::string &description, const std::string
return ConstructKenLM(description, line, filePath, factorType, lazy);
}
-LanguageModel *ConstructKenLM(const std::string &description, const std::string &line, const std::string &file, FactorType factorType, bool lazy) {
+LanguageModel *ConstructKenLM(const std::string &description, const std::string &line, const std::string &file, FactorType factorType, bool lazy)
+{
try {
lm::ngram::ModelType model_type;
if (lm::ngram::RecognizeBinary(file.c_str(), model_type)) {
switch(model_type) {
- case lm::ngram::PROBING:
- return new LanguageModelKen<lm::ngram::ProbingModel>(description, line, file, factorType, lazy);
- case lm::ngram::REST_PROBING:
- return new LanguageModelKen<lm::ngram::RestProbingModel>(description, line, file, factorType, lazy);
- case lm::ngram::TRIE:
- return new LanguageModelKen<lm::ngram::TrieModel>(description, line, file, factorType, lazy);
- case lm::ngram::QUANT_TRIE:
- return new LanguageModelKen<lm::ngram::QuantTrieModel>(description, line, file, factorType, lazy);
- case lm::ngram::ARRAY_TRIE:
- return new LanguageModelKen<lm::ngram::ArrayTrieModel>(description, line, file, factorType, lazy);
- case lm::ngram::QUANT_ARRAY_TRIE:
- return new LanguageModelKen<lm::ngram::QuantArrayTrieModel>(description, line, file, factorType, lazy);
- default:
- std::cerr << "Unrecognized kenlm model type " << model_type << std::endl;
- abort();
+ case lm::ngram::PROBING:
+ return new LanguageModelKen<lm::ngram::ProbingModel>(description, line, file, factorType, lazy);
+ case lm::ngram::REST_PROBING:
+ return new LanguageModelKen<lm::ngram::RestProbingModel>(description, line, file, factorType, lazy);
+ case lm::ngram::TRIE:
+ return new LanguageModelKen<lm::ngram::TrieModel>(description, line, file, factorType, lazy);
+ case lm::ngram::QUANT_TRIE:
+ return new LanguageModelKen<lm::ngram::QuantTrieModel>(description, line, file, factorType, lazy);
+ case lm::ngram::ARRAY_TRIE:
+ return new LanguageModelKen<lm::ngram::ArrayTrieModel>(description, line, file, factorType, lazy);
+ case lm::ngram::QUANT_ARRAY_TRIE:
+ return new LanguageModelKen<lm::ngram::QuantArrayTrieModel>(description, line, file, factorType, lazy);
+ default:
+ std::cerr << "Unrecognized kenlm model type " << model_type << std::endl;
+ abort();
}
} else {
return new LanguageModelKen<lm::ngram::ProbingModel>(description, line, file, factorType, lazy);
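
ConstructKenLM() above dispatches on the model type recorded in a KenLM binary file and falls back to ProbingModel for anything RecognizeBinary() does not accept, such as an ARPA text file. A hedged usage sketch; the description string, feature line, and file name are placeholders, not values from a real configuration:

#include "moses/LM/Ken.h"  // assumed to declare ConstructKenLM

Moses::LanguageModel *BuildLM() {
  return Moses::ConstructKenLM(
      "KENLM",                       // score producer description
      "KENLM factor=0 path=lm.bin",  // feature line (format assumed)
      "lm.bin",                      // binary or ARPA model file
      0,                             // FactorType 0, i.e. the surface form
      /*lazy=*/false);               // the 'lazyken' flag parsed above
}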
diff --git a/moses/LM/Ken.h b/moses/LM/Ken.h
index 3c2ceb774..360ac7be8 100644
--- a/moses/LM/Ken.h
+++ b/moses/LM/Ken.h
@@ -26,7 +26,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "moses/TypeDef.h"
-namespace Moses {
+namespace Moses
+{
class LanguageModel;
diff --git a/moses/LM/LDHT.cpp b/moses/LM/LDHT.cpp
index 5f52b2514..1b4e70661 100644
--- a/moses/LM/LDHT.cpp
+++ b/moses/LM/LDHT.cpp
@@ -16,7 +16,8 @@
#include <boost/thread/tss.hpp>
-namespace Moses {
+namespace Moses
+{
struct LDHTLMState : public FFState {
LDHT::NewNgram gram_fingerprints;
@@ -60,338 +61,358 @@ struct LDHTLMState : public FFState {
}
};
-class LanguageModelLDHT : public LanguageModel {
+class LanguageModelLDHT : public LanguageModel
+{
public:
- LanguageModelLDHT();
- LanguageModelLDHT(const std::string& path,
- ScoreIndexManager& manager,
- FactorType factorType);
- LanguageModelLDHT(ScoreIndexManager& manager,
- LanguageModelLDHT& copyFrom);
-
- LDHT::Client* getClientUnsafe() const;
- LDHT::Client* getClientSafe();
- LDHT::Client* initTSSClient();
- virtual ~LanguageModelLDHT();
- virtual void InitializeForInput(InputType const& source);
- virtual void CleanUpAfterSentenceProcessing(const InputType &source);
- virtual const FFState* EmptyHypothesisState(const InputType& input) const;
- virtual bool Useable(const Phrase& phrase) const;
- virtual void CalcScore(const Phrase& phrase,
- float& fullScore,
- float& ngramScore,
- std::size_t& oovCount) const;
- virtual void CalcScoreFromCache(const Phrase& phrase,
- float& fullScore,
- float& ngramScore,
- std::size_t& oovCount) const;
- FFState* Evaluate(const Hypothesis& hypo,
- const FFState* input_state,
- ScoreComponentCollection* score_output) const;
- FFState* EvaluateChart(const ChartHypothesis& hypo,
- int featureID,
- ScoreComponentCollection* accumulator) const;
-
- virtual void IssueRequestsFor(Hypothesis& hypo,
- const FFState* input_state);
- float calcScoreFromState(LDHTLMState* hypo) const;
- void sync();
- void SetFFStateIdx(int state_idx);
+ LanguageModelLDHT();
+ LanguageModelLDHT(const std::string& path,
+ ScoreIndexManager& manager,
+ FactorType factorType);
+ LanguageModelLDHT(ScoreIndexManager& manager,
+ LanguageModelLDHT& copyFrom);
+
+ LDHT::Client* getClientUnsafe() const;
+ LDHT::Client* getClientSafe();
+ LDHT::Client* initTSSClient();
+ virtual ~LanguageModelLDHT();
+ virtual void InitializeForInput(InputType const& source);
+ virtual void CleanUpAfterSentenceProcessing(const InputType &source);
+ virtual const FFState* EmptyHypothesisState(const InputType& input) const;
+ virtual bool Useable(const Phrase& phrase) const;
+ virtual void CalcScore(const Phrase& phrase,
+ float& fullScore,
+ float& ngramScore,
+ std::size_t& oovCount) const;
+ virtual void CalcScoreFromCache(const Phrase& phrase,
+ float& fullScore,
+ float& ngramScore,
+ std::size_t& oovCount) const;
+ FFState* Evaluate(const Hypothesis& hypo,
+ const FFState* input_state,
+ ScoreComponentCollection* score_output) const;
+ FFState* EvaluateChart(const ChartHypothesis& hypo,
+ int featureID,
+ ScoreComponentCollection* accumulator) const;
+
+ virtual void IssueRequestsFor(Hypothesis& hypo,
+ const FFState* input_state);
+ float calcScoreFromState(LDHTLMState* hypo) const;
+ void sync();
+ void SetFFStateIdx(int state_idx);
protected:
- boost::thread_specific_ptr<LDHT::Client> m_client;
- std::string m_configPath;
- FactorType m_factorType;
- int m_state_idx;
- int m_calc_score_count;
- uint64_t m_start_tick;
+ boost::thread_specific_ptr<LDHT::Client> m_client;
+ std::string m_configPath;
+ FactorType m_factorType;
+ int m_state_idx;
+ int m_calc_score_count;
+ uint64_t m_start_tick;
};
LanguageModel* ConstructLDHTLM(const std::string& path,
ScoreIndexManager& manager,
- FactorType factorType) {
- return new LanguageModelLDHT(path, manager, factorType);
+ FactorType factorType)
+{
+ return new LanguageModelLDHT(path, manager, factorType);
}
-LanguageModelLDHT::LanguageModelLDHT() : LanguageModel(), m_client(NULL) {
- m_enableOOVFeature = false;
+LanguageModelLDHT::LanguageModelLDHT() : LanguageModel(), m_client(NULL)
+{
+ m_enableOOVFeature = false;
}
LanguageModelLDHT::LanguageModelLDHT(ScoreIndexManager& manager,
- LanguageModelLDHT& copyFrom) {
- m_calc_score_count = 0;
- //m_client = copyFrom.m_client;
- m_factorType = copyFrom.m_factorType;
- m_configPath = copyFrom.m_configPath;
- Init(manager);
+ LanguageModelLDHT& copyFrom)
+{
+ m_calc_score_count = 0;
+ //m_client = copyFrom.m_client;
+ m_factorType = copyFrom.m_factorType;
+ m_configPath = copyFrom.m_configPath;
+ Init(manager);
}
LanguageModelLDHT::LanguageModelLDHT(const std::string& path,
ScoreIndexManager& manager,
FactorType factorType)
- : m_factorType(factorType) {
- m_configPath = path;
- Init(manager);
+ : m_factorType(factorType)
+{
+ m_configPath = path;
+ Init(manager);
}
-LanguageModelLDHT::~LanguageModelLDHT() {
- // TODO(wilson): should cleanup for each individual thread.
- //delete getClientSafe();
+LanguageModelLDHT::~LanguageModelLDHT()
+{
+ // TODO(wilson): should cleanup for each individual thread.
+ //delete getClientSafe();
}
// Check that there is a TSS Client instance, and instantiate one if
// there isn't.
-LDHT::Client* LanguageModelLDHT::getClientSafe() {
- if (m_client.get() == NULL)
- m_client.reset(initTSSClient());
- return m_client.get();
+LDHT::Client* LanguageModelLDHT::getClientSafe()
+{
+ if (m_client.get() == NULL)
+ m_client.reset(initTSSClient());
+ return m_client.get();
}
// Do not check that there is a TSS Client instance.
-LDHT::Client* LanguageModelLDHT::getClientUnsafe() const {
- return m_client.get();
+LDHT::Client* LanguageModelLDHT::getClientUnsafe() const
+{
+ return m_client.get();
}
-LDHT::Client* LanguageModelLDHT::initTSSClient() {
- std::ifstream config_file(m_configPath.c_str());
- std::string ldht_config_path;
- getline(config_file, ldht_config_path);
- std::string ldhtlm_config_path;
- getline(config_file, ldhtlm_config_path);
-
- LDHT::FactoryCollection* factory_collection =
- LDHT::FactoryCollection::createDefaultFactoryCollection();
-
- LDHT::Client* client;
- //client = new LDHT::ClientLocal();
- client = new LDHT::Client();
- client->fromXmlFiles(*factory_collection,
- ldht_config_path,
- ldhtlm_config_path);
- return client;
+LDHT::Client* LanguageModelLDHT::initTSSClient()
+{
+ std::ifstream config_file(m_configPath.c_str());
+ std::string ldht_config_path;
+ getline(config_file, ldht_config_path);
+ std::string ldhtlm_config_path;
+ getline(config_file, ldhtlm_config_path);
+
+ LDHT::FactoryCollection* factory_collection =
+ LDHT::FactoryCollection::createDefaultFactoryCollection();
+
+ LDHT::Client* client;
+ //client = new LDHT::ClientLocal();
+ client = new LDHT::Client();
+ client->fromXmlFiles(*factory_collection,
+ ldht_config_path,
+ ldhtlm_config_path);
+ return client;
}
-void LanguageModelLDHT::InitializeForInput(InputType const& source) {
- getClientSafe()->clearCache();
- m_start_tick = LDHT::Util::rdtsc();
+void LanguageModelLDHT::InitializeForInput(InputType const& source)
+{
+ getClientSafe()->clearCache();
+ m_start_tick = LDHT::Util::rdtsc();
}
-void LanguageModelLDHT::CleanUpAfterSentenceProcessing(const InputType &source) {
- LDHT::Client* client = getClientSafe();
-
- std::cerr << "LDHT sentence stats:" << std::endl;
- std::cerr << " ngrams submitted: " << client->getNumNgramsSubmitted() << std::endl
- << " ngrams requested: " << client->getNumNgramsRequested() << std::endl
- << " ngrams not found: " << client->getKeyNotFoundCount() << std::endl
- << " cache hits: " << client->getCacheHitCount() << std::endl
- << " inferences: " << client->getInferenceCount() << std::endl
- << " pcnt latency: " << (float)client->getLatencyTicks() / (float)(LDHT::Util::rdtsc() - m_start_tick) * 100.0 << std::endl;
- m_start_tick = 0;
- client->resetLatencyTicks();
- client->resetNumNgramsSubmitted();
- client->resetNumNgramsRequested();
- client->resetInferenceCount();
- client->resetCacheHitCount();
- client->resetKeyNotFoundCount();
+void LanguageModelLDHT::CleanUpAfterSentenceProcessing(const InputType &source)
+{
+ LDHT::Client* client = getClientSafe();
+
+ std::cerr << "LDHT sentence stats:" << std::endl;
+ std::cerr << " ngrams submitted: " << client->getNumNgramsSubmitted() << std::endl
+ << " ngrams requested: " << client->getNumNgramsRequested() << std::endl
+ << " ngrams not found: " << client->getKeyNotFoundCount() << std::endl
+ << " cache hits: " << client->getCacheHitCount() << std::endl
+ << " inferences: " << client->getInferenceCount() << std::endl
+ << " pcnt latency: " << (float)client->getLatencyTicks() / (float)(LDHT::Util::rdtsc() - m_start_tick) * 100.0 << std::endl;
+ m_start_tick = 0;
+ client->resetLatencyTicks();
+ client->resetNumNgramsSubmitted();
+ client->resetNumNgramsRequested();
+ client->resetInferenceCount();
+ client->resetCacheHitCount();
+ client->resetKeyNotFoundCount();
}
const FFState* LanguageModelLDHT::EmptyHypothesisState(
- const InputType& input) const {
- return NULL;
+ const InputType& input) const
+{
+ return NULL;
}
-bool LanguageModelLDHT::Useable(const Phrase& phrase) const {
- return (phrase.GetSize() > 0 && phrase.GetFactor(0, m_factorType) != NULL);
+bool LanguageModelLDHT::Useable(const Phrase& phrase) const
+{
+ return (phrase.GetSize() > 0 && phrase.GetFactor(0, m_factorType) != NULL);
}
void LanguageModelLDHT::CalcScore(const Phrase& phrase,
float& fullScore,
float& ngramScore,
- std::size_t& oovCount) const {
- const_cast<LanguageModelLDHT*>(this)->m_calc_score_count++;
- if (m_calc_score_count > 10000) {
- const_cast<LanguageModelLDHT*>(this)->m_calc_score_count = 0;
- const_cast<LanguageModelLDHT*>(this)->sync();
- }
+ std::size_t& oovCount) const
+{
+ const_cast<LanguageModelLDHT*>(this)->m_calc_score_count++;
+ if (m_calc_score_count > 10000) {
+ const_cast<LanguageModelLDHT*>(this)->m_calc_score_count = 0;
+ const_cast<LanguageModelLDHT*>(this)->sync();
+ }
- // TODO(wilson): handle nonterminal words.
- LDHT::Client* client = getClientUnsafe();
- // Score the first order - 1 words of the phrase.
- int order = LDHT::NewNgram::k_max_order;
- int prefix_start = 0;
- int prefix_end = std::min(phrase.GetSize(), static_cast<size_t>(order - 1));
- LDHT::NewNgram ngram;
- for (int word_idx = prefix_start; word_idx < prefix_end; ++word_idx) {
- ngram.appendGram(phrase.GetWord(word_idx)
- .GetFactor(m_factorType)->GetString().c_str());
- client->requestNgram(ngram);
- }
- // Now score all subsequent ngrams to end of phrase.
- int internal_start = prefix_end;
- int internal_end = phrase.GetSize();
- for (int word_idx = internal_start; word_idx < internal_end; ++word_idx) {
- ngram.appendGram(phrase.GetWord(word_idx)
- .GetFactor(m_factorType)->GetString().c_str());
- client->requestNgram(ngram);
- }
+ // TODO(wilson): handle nonterminal words.
+ LDHT::Client* client = getClientUnsafe();
+ // Score the first order - 1 words of the phrase.
+ int order = LDHT::NewNgram::k_max_order;
+ int prefix_start = 0;
+ int prefix_end = std::min(phrase.GetSize(), static_cast<size_t>(order - 1));
+ LDHT::NewNgram ngram;
+ for (int word_idx = prefix_start; word_idx < prefix_end; ++word_idx) {
+ ngram.appendGram(phrase.GetWord(word_idx)
+ .GetFactor(m_factorType)->GetString().c_str());
+ client->requestNgram(ngram);
+ }
+ // Now score all subsequent ngrams to end of phrase.
+ int internal_start = prefix_end;
+ int internal_end = phrase.GetSize();
+ for (int word_idx = internal_start; word_idx < internal_end; ++word_idx) {
+ ngram.appendGram(phrase.GetWord(word_idx)
+ .GetFactor(m_factorType)->GetString().c_str());
+ client->requestNgram(ngram);
+ }
- fullScore = 0;
- ngramScore = 0;
- oovCount = 0;
+ fullScore = 0;
+ ngramScore = 0;
+ oovCount = 0;
}
void LanguageModelLDHT::CalcScoreFromCache(const Phrase& phrase,
- float& fullScore,
- float& ngramScore,
- std::size_t& oovCount) const {
- // Issue requests for phrase internal ngrams.
-    // Sync if necessary (or autosync).
- const_cast<LanguageModelLDHT*>(this)->sync();
-
- // TODO(wilson): handle nonterminal words.
- LDHT::Client* client = getClientUnsafe();
- // Score the first order - 1 words of the phrase.
- int order = LDHT::NewNgram::k_max_order;
- int prefix_start = 0;
- int prefix_end = std::min(phrase.GetSize(), static_cast<size_t>(order - 1));
- LDHT::NewNgram ngram;
- std::deque<int> full_score_tags;
- for (int word_idx = prefix_start; word_idx < prefix_end; ++word_idx) {
- ngram.appendGram(phrase.GetWord(word_idx)
- .GetFactor(m_factorType)->GetString().c_str());
- full_score_tags.push_back(client->requestNgram(ngram));
- }
- // Now score all subsequent ngrams to end of phrase.
- int internal_start = prefix_end;
- int internal_end = phrase.GetSize();
- std::deque<int> internal_score_tags;
- for (int word_idx = internal_start; word_idx < internal_end; ++word_idx) {
- ngram.appendGram(phrase.GetWord(word_idx)
- .GetFactor(m_factorType)->GetString().c_str());
- internal_score_tags.push_back(client->requestNgram(ngram));
- }
+ float& fullScore,
+ float& ngramScore,
+ std::size_t& oovCount) const
+{
+ // Issue requests for phrase internal ngrams.
+  // Sync if necessary (or autosync).
+ const_cast<LanguageModelLDHT*>(this)->sync();
+
+ // TODO(wilson): handle nonterminal words.
+ LDHT::Client* client = getClientUnsafe();
+ // Score the first order - 1 words of the phrase.
+ int order = LDHT::NewNgram::k_max_order;
+ int prefix_start = 0;
+ int prefix_end = std::min(phrase.GetSize(), static_cast<size_t>(order - 1));
+ LDHT::NewNgram ngram;
+ std::deque<int> full_score_tags;
+ for (int word_idx = prefix_start; word_idx < prefix_end; ++word_idx) {
+ ngram.appendGram(phrase.GetWord(word_idx)
+ .GetFactor(m_factorType)->GetString().c_str());
+ full_score_tags.push_back(client->requestNgram(ngram));
+ }
+ // Now score all subsequent ngrams to end of phrase.
+ int internal_start = prefix_end;
+ int internal_end = phrase.GetSize();
+ std::deque<int> internal_score_tags;
+ for (int word_idx = internal_start; word_idx < internal_end; ++word_idx) {
+ ngram.appendGram(phrase.GetWord(word_idx)
+ .GetFactor(m_factorType)->GetString().c_str());
+ internal_score_tags.push_back(client->requestNgram(ngram));
+ }
-    // Wait for responses from the servers.
-    //client->awaitResponses();
+  // Wait for responses from the servers.
+  //client->awaitResponses();
- // Calculate the full phrase score, and the internal score.
- fullScore = 0.0;
- while (!full_score_tags.empty()) {
- fullScore += client->getNgramScore(full_score_tags.front());
- full_score_tags.pop_front();
- }
- ngramScore = 0.0;
- while (!internal_score_tags.empty()) {
- float score = client->getNgramScore(internal_score_tags.front());
- internal_score_tags.pop_front();
- fullScore += score;
- ngramScore += score;
- }
- fullScore = TransformLMScore(fullScore);
- ngramScore = TransformLMScore(ngramScore);
- oovCount = 0;
+ // Calculate the full phrase score, and the internal score.
+ fullScore = 0.0;
+ while (!full_score_tags.empty()) {
+ fullScore += client->getNgramScore(full_score_tags.front());
+ full_score_tags.pop_front();
+ }
+ ngramScore = 0.0;
+ while (!internal_score_tags.empty()) {
+ float score = client->getNgramScore(internal_score_tags.front());
+ internal_score_tags.pop_front();
+ fullScore += score;
+ ngramScore += score;
+ }
+ fullScore = TransformLMScore(fullScore);
+ ngramScore = TransformLMScore(ngramScore);
+ oovCount = 0;
}
void LanguageModelLDHT::IssueRequestsFor(Hypothesis& hypo,
- const FFState* input_state) {
- // TODO(wilson): handle nonterminal words.
- LDHT::Client* client = getClientUnsafe();
-
- // Create a new state and copy the contents of the input_state if
- // supplied.
- LDHTLMState* new_state = new LDHTLMState();
- if (input_state == NULL) {
- if (hypo.GetCurrTargetWordsRange().GetStartPos() != 0) {
- V("got a null state but not at start of sentence");
- abort();
- }
- new_state->gram_fingerprints.appendGram(BOS_);
+ const FFState* input_state)
+{
+ // TODO(wilson): handle nonterminal words.
+ LDHT::Client* client = getClientUnsafe();
+
+ // Create a new state and copy the contents of the input_state if
+ // supplied.
+ LDHTLMState* new_state = new LDHTLMState();
+ if (input_state == NULL) {
+ if (hypo.GetCurrTargetWordsRange().GetStartPos() != 0) {
+ V("got a null state but not at start of sentence");
+ abort();
}
- else {
- if (hypo.GetCurrTargetWordsRange().GetStartPos() == 0) {
- V("got a non null state but at start of sentence");
- abort();
- }
- new_state->copyFrom(static_cast<const LDHTLMState&>(*input_state));
+ new_state->gram_fingerprints.appendGram(BOS_);
+ } else {
+ if (hypo.GetCurrTargetWordsRange().GetStartPos() == 0) {
+ V("got a non null state but at start of sentence");
+ abort();
}
+ new_state->copyFrom(static_cast<const LDHTLMState&>(*input_state));
+ }
- // Score ngrams that overlap with the previous phrase.
- int order = LDHT::NewNgram::k_max_order;
- int phrase_start = hypo.GetCurrTargetWordsRange().GetStartPos();
- int phrase_end = hypo.GetCurrTargetWordsRange().GetEndPos() + 1;
- int overlap_start = phrase_start;
- int overlap_end = std::min(phrase_end, phrase_start + order - 1);
- int word_idx = overlap_start;
- LDHT::NewNgram& ngram = new_state->gram_fingerprints;
- for (; word_idx < overlap_end; ++word_idx) {
- ngram.appendGram(
- hypo.GetFactor(word_idx, m_factorType)->GetString().c_str());
- new_state->appendRequestTag(client->requestNgram(ngram));
- }
- // No need to score phrase internal ngrams, but keep track of them
- // in the state (which in this case is the NewNgram containing the
- // hashes of the individual grams).
- for (; word_idx < phrase_end; ++word_idx) {
- ngram.appendGram(
- hypo.GetFactor(word_idx, m_factorType)->GetString().c_str());
- }
- // If this is the last phrase in the sentence, score the last ngram
- // with the end of sentence marker on it.
- if (hypo.IsSourceCompleted()) {
- ngram.appendGram(EOS_);
- //request_tags.push_back(client->requestNgram(ngram));
- new_state->appendRequestTag(client->requestNgram(ngram));
- }
- hypo.SetFFState(m_state_idx, new_state);
+ // Score ngrams that overlap with the previous phrase.
+ int order = LDHT::NewNgram::k_max_order;
+ int phrase_start = hypo.GetCurrTargetWordsRange().GetStartPos();
+ int phrase_end = hypo.GetCurrTargetWordsRange().GetEndPos() + 1;
+ int overlap_start = phrase_start;
+ int overlap_end = std::min(phrase_end, phrase_start + order - 1);
+ int word_idx = overlap_start;
+ LDHT::NewNgram& ngram = new_state->gram_fingerprints;
+ for (; word_idx < overlap_end; ++word_idx) {
+ ngram.appendGram(
+ hypo.GetFactor(word_idx, m_factorType)->GetString().c_str());
+ new_state->appendRequestTag(client->requestNgram(ngram));
+ }
+ // No need to score phrase internal ngrams, but keep track of them
+ // in the state (which in this case is the NewNgram containing the
+ // hashes of the individual grams).
+ for (; word_idx < phrase_end; ++word_idx) {
+ ngram.appendGram(
+ hypo.GetFactor(word_idx, m_factorType)->GetString().c_str());
+ }
+ // If this is the last phrase in the sentence, score the last ngram
+ // with the end of sentence marker on it.
+ if (hypo.IsSourceCompleted()) {
+ ngram.appendGram(EOS_);
+ //request_tags.push_back(client->requestNgram(ngram));
+ new_state->appendRequestTag(client->requestNgram(ngram));
+ }
+ hypo.SetFFState(m_state_idx, new_state);
}
-void LanguageModelLDHT::sync() {
- m_calc_score_count = 0;
- getClientUnsafe()->awaitResponses();
+void LanguageModelLDHT::sync()
+{
+ m_calc_score_count = 0;
+ getClientUnsafe()->awaitResponses();
}
-void LanguageModelLDHT::SetFFStateIdx(int state_idx) {
- m_state_idx = state_idx;
+void LanguageModelLDHT::SetFFStateIdx(int state_idx)
+{
+ m_state_idx = state_idx;
}
FFState* LanguageModelLDHT::Evaluate(
- const Hypothesis& hypo,
- const FFState* input_state_ignored,
- ScoreComponentCollection* score_output) const {
- // Input state is the state from the previous hypothesis, which
- // we are not interested in. The requests for this hypo should
- // already have been issued via IssueRequestsFor() and the LM then
- // synced and all responses processed, and the tags placed in our
- // FFState of hypo.
- LDHTLMState* state = const_cast<LDHTLMState*>(static_cast<const LDHTLMState*>(hypo.GetFFState(m_state_idx)));
-
- float score = calcScoreFromState(state);
- score = FloorScore(TransformLMScore(score));
- score_output->PlusEquals(this, score);
-
- return state;
+ const Hypothesis& hypo,
+ const FFState* input_state_ignored,
+ ScoreComponentCollection* score_output) const
+{
+ // Input state is the state from the previous hypothesis, which
+ // we are not interested in. The requests for this hypo should
+ // already have been issued via IssueRequestsFor() and the LM then
+ // synced and all responses processed, and the tags placed in our
+ // FFState of hypo.
+ LDHTLMState* state = const_cast<LDHTLMState*>(static_cast<const LDHTLMState*>(hypo.GetFFState(m_state_idx)));
+
+ float score = calcScoreFromState(state);
+ score = FloorScore(TransformLMScore(score));
+ score_output->PlusEquals(this, score);
+
+ return state;
}
FFState* LanguageModelLDHT::EvaluateChart(
- const ChartHypothesis& hypo,
- int featureID,
- ScoreComponentCollection* accumulator) const {
- return NULL;
+ const ChartHypothesis& hypo,
+ int featureID,
+ ScoreComponentCollection* accumulator) const
+{
+ return NULL;
}
-float LanguageModelLDHT::calcScoreFromState(LDHTLMState* state) const {
- float score = 0.0;
- std::vector<int>::iterator tag_iter;
- LDHT::Client* client = getClientUnsafe();
- for (tag_iter = state->requestTagsBegin();
- tag_iter != state->requestTagsEnd();
- ++tag_iter) {
- score += client->getNgramScore(*tag_iter);
- }
- state->clearRequestTags();
- state->setFinalised();
- return score;
+float LanguageModelLDHT::calcScoreFromState(LDHTLMState* state) const
+{
+ float score = 0.0;
+ std::vector<int>::iterator tag_iter;
+ LDHT::Client* client = getClientUnsafe();
+ for (tag_iter = state->requestTagsBegin();
+ tag_iter != state->requestTagsEnd();
+ ++tag_iter) {
+ score += client->getNgramScore(*tag_iter);
+ }
+ state->clearRequestTags();
+ state->setFinalised();
+ return score;
}
} // namespace Moses.
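
The LDHT model is easier to follow once the tag protocol is explicit: IssueRequestsFor() fires requestNgram() calls and stores the returned tags in the hypothesis state, sync() awaits the batched responses, and calcScoreFromState() redeems the tags through getNgramScore(). A toy mock of that round-trip, with a pretend client standing in for LDHT::Client:

#include <map>
#include <vector>

// Stand-in for LDHT::Client: requestNgram() returns a tag immediately,
// awaitResponses() would block on the network, getNgramScore() redeems tags.
class MockClient {
public:
  int requestNgram(float pretendScore) {  // the real call hashes an n-gram
    scores_[nextTag_] = pretendScore;
    return nextTag_++;
  }
  void awaitResponses() { /* network round-trip elided */ }
  float getNgramScore(int tag) { return scores_[tag]; }
private:
  int nextTag_ = 0;
  std::map<int, float> scores_;
};

// Request phase, sync phase, consume phase: the same three-step shape as
// IssueRequestsFor() + sync() + calcScoreFromState() above.
float scoreHypothesis(MockClient &client, const std::vector<float> &grams) {
  std::vector<int> tags;  // what the FFState's request tags hold
  for (float g : grams) tags.push_back(client.requestNgram(g));
  client.awaitResponses();
  float score = 0;
  for (int tag : tags) score += client.getNgramScore(tag);
  return score;
}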
diff --git a/moses/LM/LDHT.h b/moses/LM/LDHT.h
index a8489c0e3..8c5c3c36b 100644
--- a/moses/LM/LDHT.h
+++ b/moses/LM/LDHT.h
@@ -7,7 +7,8 @@
#include "moses/TypeDef.h"
-namespace Moses {
+namespace Moses
+{
class ScoreIndexManager;
class LanguageModel;
diff --git a/moses/LM/MultiFactor.h b/moses/LM/MultiFactor.h
index 491da4abe..21a9d493b 100644
--- a/moses/LM/MultiFactor.h
+++ b/moses/LM/MultiFactor.h
@@ -33,7 +33,7 @@ namespace Moses
class Phrase;
-/* Abstract class for multi-factor LMs. Only inherited by the JointLM at the moment.
+/* Abstract class for multi-factor LMs. Only inherited by the JointLM at the moment.
* Could use this when factored LM are implemented
*/
class LanguageModelMultiFactor : public LanguageModelImplementation
@@ -41,16 +41,16 @@ class LanguageModelMultiFactor : public LanguageModelImplementation
protected:
FactorMask m_factorTypes;
- LanguageModelMultiFactor(const std::string& description, const std::string &line)
- :LanguageModelImplementation(description, line)
+ LanguageModelMultiFactor(const std::string& description, const std::string &line)
+ :LanguageModelImplementation(description, line)
{}
-
+
public:
virtual bool Load(const std::string &filePath
, const std::vector<FactorType> &factorTypes
, size_t nGramOrder) = 0;
- bool Useable(const Phrase &phrase) const;
+ bool Useable(const Phrase &phrase) const;
};
}
diff --git a/moses/LM/ORLM.cpp b/moses/LM/ORLM.cpp
index 226267ee2..44fd64efb 100644
--- a/moses/LM/ORLM.cpp
+++ b/moses/LM/ORLM.cpp
@@ -9,10 +9,11 @@
#include "ORLM.h"
using std::map;
-namespace Moses
+namespace Moses
+{
+bool LanguageModelORLM::Load(const std::string &filePath, FactorType factorType,
+ size_t nGramOrder)
{
-bool LanguageModelORLM::Load(const std::string &filePath, FactorType factorType,
- size_t nGramOrder) {
cerr << "Loading LanguageModelORLM..." << endl;
m_filePath = filePath;
m_factorType = factorType;
@@ -26,13 +27,14 @@ bool LanguageModelORLM::Load(const std::string &filePath, FactorType factorType,
CreateFactors();
return true;
}
-void LanguageModelORLM::CreateFactors() {
+void LanguageModelORLM::CreateFactors()
+{
FactorCollection &factorCollection = FactorCollection::Instance();
size_t maxFactorId = 0; // to create lookup vector later on
std::map<size_t, wordID_t> m_lmids_map; // map from factor id -> word id
for(std::map<Word, wordID_t>::const_iterator vIter = m_lm->vocab_->VocabStart();
- vIter != m_lm->vocab_->VocabEnd(); vIter++){
+ vIter != m_lm->vocab_->VocabEnd(); vIter++) {
// get word from ORLM vocab and associate with (new) factor id
size_t factorId = factorCollection.AddFactor(Output,m_factorType,vIter->first.ToString())->GetId();
m_lmids_map[factorId] = vIter->second;
@@ -50,7 +52,7 @@ void LanguageModelORLM::CreateFactors() {
maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
m_sentenceEndWord[m_factorType] = m_sentenceEnd;
// add to lookup vector in object
- lm_ids_vec_.resize(maxFactorId+1);
+ lm_ids_vec_.resize(maxFactorId+1);
// fill with OOV code
fill(lm_ids_vec_.begin(), lm_ids_vec_.end(), m_oov_id);
@@ -58,15 +60,18 @@ void LanguageModelORLM::CreateFactors() {
iter != m_lmids_map.end() ; ++iter)
lm_ids_vec_[iter->first] = iter->second;
}
-wordID_t LanguageModelORLM::GetLmID(const std::string& str) const {
+wordID_t LanguageModelORLM::GetLmID(const std::string& str) const
+{
return m_lm->vocab_->GetWordID(str);
}
-wordID_t LanguageModelORLM::GetLmID(const Factor* factor) const {
+wordID_t LanguageModelORLM::GetLmID(const Factor* factor) const
+{
size_t factorId = factor->GetId();
return (factorId >= lm_ids_vec_.size()) ? m_oov_id : lm_ids_vec_[factorId];
}
-LMResult LanguageModelORLM::GetValue(const std::vector<const Word*> &contextFactor,
- State* finalState) const {
+LMResult LanguageModelORLM::GetValue(const std::vector<const Word*> &contextFactor,
+ State* finalState) const
+{
FactorType factorType = GetFactorType();
// set up context
//std::vector<long unsigned int> factor(1,0);
@@ -88,13 +93,14 @@ LMResult LanguageModelORLM::GetValue(const std::vector<const Word*> &contextFact
*/
return ret;
}
-bool LanguageModelORLM::UpdateORLM(const std::vector<string>& ngram, const int value) {
+bool LanguageModelORLM::UpdateORLM(const std::vector<string>& ngram, const int value)
+{
/*cerr << "Inserting into ORLM: \"";
iterate(ngram, nit)
cerr << *nit << " ";
cerr << "\"\t" << value << endl; */
m_lm->vocab_->MakeOpen();
- bool res = m_lm->update(ngram, value);
+ bool res = m_lm->update(ngram, value);
m_lm->vocab_->MakeClosed();
return res;
}
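
The two GetLmID() overloads above rest on the lookup table CreateFactors() builds: a vector indexed by factor id, pre-filled with the OOV code, where out-of-range ids also map to OOV. A compact sketch of that fallback behaviour (types simplified):

#include <cstddef>
#include <vector>

typedef unsigned wordID_t;

struct IdLookup {
  std::vector<wordID_t> table;  // factor id -> LM word id, OOV-prefilled
  wordID_t oov;

  IdLookup(std::size_t maxFactorId, wordID_t oovId)
    : table(maxFactorId + 1, oovId), oov(oovId) {}

  // Same guard as GetLmID(const Factor*): unseen or out-of-range -> OOV.
  wordID_t get(std::size_t factorId) const {
    return (factorId >= table.size()) ? oov : table[factorId];
  }
};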
diff --git a/moses/LM/ORLM.h b/moses/LM/ORLM.h
index 48909191e..d7a8b5d35 100644
--- a/moses/LM/ORLM.h
+++ b/moses/LM/ORLM.h
@@ -17,7 +17,8 @@ class Phrase;
/** @todo ask ollie
*/
-class LanguageModelORLM : public LanguageModelSingleFactor {
+class LanguageModelORLM : public LanguageModelSingleFactor
+{
public:
typedef count_t T; // type for ORLM filter
LanguageModelORLM(const std::string &line)
@@ -34,10 +35,12 @@ public:
fout.close();
delete m_lm;
}
- void CleanUpAfterSentenceProcessing() {m_lm->clearCache();} // clear caches
+ void CleanUpAfterSentenceProcessing() {
+ m_lm->clearCache(); // clear caches
+ }
bool UpdateORLM(const std::vector<string>& ngram, const int value);
- protected:
+protected:
OnlineRLM<T>* m_lm;
//MultiOnlineRLM<T>* m_lm;
wordID_t m_oov_id;
diff --git a/moses/LM/ParallelBackoff.cpp b/moses/LM/ParallelBackoff.cpp
index cf8c1509b..0b996de2b 100644
--- a/moses/LM/ParallelBackoff.cpp
+++ b/moses/LM/ParallelBackoff.cpp
@@ -70,7 +70,7 @@ private:
public:
LanguageModelParallelBackoff(const std::string &line)
- :LanguageModelMultiFactor("ParallelBackoffLM", line)
+ :LanguageModelMultiFactor("ParallelBackoffLM", line)
{}
~LanguageModelParallelBackoff();
diff --git a/moses/LM/Rand.cpp b/moses/LM/Rand.cpp
index 8e3e37a1f..5e31029d5 100644
--- a/moses/LM/Rand.cpp
+++ b/moses/LM/Rand.cpp
@@ -37,7 +37,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
namespace Moses
{
-namespace
+namespace
{
using namespace std;
@@ -45,8 +45,8 @@ class LanguageModelRandLM : public LanguageModelSingleFactor
{
public:
LanguageModelRandLM(const std::string &line)
- :LanguageModelSingleFactor("RandLM", line)
- , m_lm(0)
+ :LanguageModelSingleFactor("RandLM", line)
+ , m_lm(0)
{}
bool Load(const std::string &filePath, FactorType factorType, size_t nGramOrder);
virtual LMResult GetValue(const std::vector<const Word*> &contextFactor, State* finalState = NULL) const;
@@ -133,7 +133,7 @@ randlm::WordID LanguageModelRandLM::GetLmID( const std::string &str ) const
}
LMResult LanguageModelRandLM::GetValue(const vector<const Word*> &contextFactor,
- State* finalState) const
+ State* finalState) const
{
FactorType factorType = GetFactorType();
// set up context
diff --git a/moses/LM/SRI.cpp b/moses/LM/SRI.cpp
index b6281512c..54e6f93b9 100644
--- a/moses/LM/SRI.cpp
+++ b/moses/LM/SRI.cpp
@@ -39,29 +39,26 @@ using namespace std;
namespace Moses
{
LanguageModelSRI::LanguageModelSRI(const std::string &line)
-:LanguageModelSingleFactor("SRILM", line)
-,m_srilmVocab(0)
-,m_srilmModel(0)
+ :LanguageModelSingleFactor("SRILM", line)
+ ,m_srilmVocab(0)
+ ,m_srilmModel(0)
{
FactorType factorType;
size_t nGramOrder;
string filePath;
for (size_t i = 0; i < m_args.size(); ++i) {
- const vector<string> &args = m_args[i];
-
- if (args[0] == "factor") {
- factorType = Scan<FactorType>(args[1]);
- }
- else if (args[0] == "order") {
- nGramOrder = Scan<size_t>(args[1]);
- }
- else if (args[0] == "path") {
- filePath = args[1];
- }
- else {
- throw "Unknown argument " + args[0];
- }
+ const vector<string> &args = m_args[i];
+
+ if (args[0] == "factor") {
+ factorType = Scan<FactorType>(args[1]);
+ } else if (args[0] == "order") {
+ nGramOrder = Scan<size_t>(args[1]);
+ } else if (args[0] == "path") {
+ filePath = args[1];
+ } else {
+ throw "Unknown argument " + args[0];
+ }
}
Load(filePath, factorType, nGramOrder);
diff --git a/moses/LM/SingleFactor.cpp b/moses/LM/SingleFactor.cpp
index 031fa38ac..abd8aca51 100644
--- a/moses/LM/SingleFactor.cpp
+++ b/moses/LM/SingleFactor.cpp
@@ -38,7 +38,7 @@ namespace Moses
{
LanguageModelSingleFactor::LanguageModelSingleFactor(const std::string& description, const std::string &line)
-:LanguageModelImplementation(description, line)
+ :LanguageModelImplementation(description, line)
{
m_nullContextState = new PointerState(NULL);
m_beginSentenceState = new PointerState(NULL);
diff --git a/moses/LM/SingleFactor.h b/moses/LM/SingleFactor.h
index cb51808ac..9a1f30216 100644
--- a/moses/LM/SingleFactor.h
+++ b/moses/LM/SingleFactor.h
@@ -43,31 +43,27 @@ protected:
FFState *m_nullContextState;
FFState *m_beginSentenceState;
- LanguageModelSingleFactor(const std::string& description, const std::string &line);
+ LanguageModelSingleFactor(const std::string& description, const std::string &line);
public:
- virtual ~LanguageModelSingleFactor();
- virtual bool Load(const std::string &filePath
- , FactorType factorType
- , size_t nGramOrder) = 0;
-
- bool Useable(const Phrase &phrase) const
- {
- return (phrase.GetSize()>0 && phrase.GetFactor(0, m_factorType) != NULL);
- }
-
- const Factor *GetSentenceStart() const
- {
- return m_sentenceStart;
- }
- const Factor *GetSentenceEnd() const
- {
- return m_sentenceEnd;
- }
- FactorType GetFactorType() const
- {
- return m_factorType;
- }
+ virtual ~LanguageModelSingleFactor();
+ virtual bool Load(const std::string &filePath
+ , FactorType factorType
+ , size_t nGramOrder) = 0;
+
+ bool Useable(const Phrase &phrase) const {
+ return (phrase.GetSize()>0 && phrase.GetFactor(0, m_factorType) != NULL);
+ }
+
+ const Factor *GetSentenceStart() const {
+ return m_sentenceStart;
+ }
+ const Factor *GetSentenceEnd() const {
+ return m_sentenceEnd;
+ }
+ FactorType GetFactorType() const {
+ return m_factorType;
+ }
virtual const FFState *GetNullContextState() const;
virtual const FFState *GetBeginSentenceState() const;
diff --git a/moses/LexicalReordering.cpp b/moses/LexicalReordering.cpp
index 71c8fb2b8..98dca7b5f 100644
--- a/moses/LexicalReordering.cpp
+++ b/moses/LexicalReordering.cpp
@@ -10,7 +10,7 @@ using namespace std;
namespace Moses
{
LexicalReordering::LexicalReordering(const std::string &line)
-: StatefulFeatureFunction("LexicalReordering", line)
+ : StatefulFeatureFunction("LexicalReordering", line)
{
std::cerr << "Initializing LexicalReordering.." << std::endl;
@@ -24,41 +24,37 @@ LexicalReordering::LexicalReordering(const std::string &line)
m_configuration = new LexicalReorderingConfiguration(args[1]);
m_configuration->SetScoreProducer(this);
m_modelTypeString = m_configuration->GetModelString();
- }
- else if (args[0] == "input-factor") {
+ } else if (args[0] == "input-factor") {
f_factors =Tokenize<FactorType>(args[1]);
- }
- else if (args[0] == "output-factor") {
+ } else if (args[0] == "output-factor") {
e_factors =Tokenize<FactorType>(args[1]);
- }
- else if (args[0] == "path") {
+ } else if (args[0] == "path") {
filePath = args[1];
- }
- else {
+ } else {
throw "Unknown argument " + args[0];
}
}
switch(m_configuration->GetCondition()) {
- case LexicalReorderingConfiguration::FE:
- case LexicalReorderingConfiguration::E:
- m_factorsE = e_factors;
- if(m_factorsE.empty()) {
- UserMessage::Add("TL factor mask for lexical reordering is unexpectedly empty");
- exit(1);
- }
- if(m_configuration->GetCondition() == LexicalReorderingConfiguration::E)
- break; // else fall through
- case LexicalReorderingConfiguration::F:
- m_factorsF = f_factors;
- if(m_factorsF.empty()) {
- UserMessage::Add("SL factor mask for lexical reordering is unexpectedly empty");
- exit(1);
- }
- break;
- default:
- UserMessage::Add("Unknown conditioning option!");
+ case LexicalReorderingConfiguration::FE:
+ case LexicalReorderingConfiguration::E:
+ m_factorsE = e_factors;
+ if(m_factorsE.empty()) {
+ UserMessage::Add("TL factor mask for lexical reordering is unexpectedly empty");
exit(1);
+ }
+ if(m_configuration->GetCondition() == LexicalReorderingConfiguration::E)
+ break; // else fall through
+ case LexicalReorderingConfiguration::F:
+ m_factorsF = f_factors;
+ if(m_factorsF.empty()) {
+ UserMessage::Add("SL factor mask for lexical reordering is unexpectedly empty");
+ exit(1);
+ }
+ break;
+ default:
+ UserMessage::Add("Unknown conditioning option!");
+ exit(1);
}
m_table = LexicalReorderingTable::LoadAvailable(filePath, m_factorsF, m_factorsE, std::vector<FactorType>());
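
One detail the reindentation above preserves: the FE case intentionally falls through into F unless the condition is exactly E, so FE configurations validate both the target- and the source-side factor masks. Compressed to its control flow:

enum Condition { FE, E, F };

// FE runs both checks, E only the first, F only the second.
void checkFactorMasks(Condition c) {
  switch (c) {
  case FE:
  case E:
    /* validate target-side (E) factor mask here */
    if (c == E) break;
    // else fall through
  case F:
    /* validate source-side (F) factor mask here */
    break;
  }
}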
diff --git a/moses/LexicalReordering.h b/moses/LexicalReordering.h
index 51cf797f0..abaa31c25 100644
--- a/moses/LexicalReordering.h
+++ b/moses/LexicalReordering.h
@@ -24,15 +24,16 @@ class InputType;
/** implementation of lexical reordering (Tilman ...) for phrase-based decoding
*/
-class LexicalReordering : public StatefulFeatureFunction {
-public:
+class LexicalReordering : public StatefulFeatureFunction
+{
+public:
LexicalReordering(const std::string &line);
virtual ~LexicalReordering();
virtual const FFState* EmptyHypothesisState(const InputType &input) const;
- void InitializeForInput(const InputType& i){
- m_table->InitializeForInput(i);
+ void InitializeForInput(const InputType& i) {
+ m_table->InitializeForInput(i);
}
Scores GetProb(const Phrase& f, const Phrase& e) const;
@@ -43,25 +44,25 @@ public:
virtual FFState* EvaluateChart(const ChartHypothesis&,
int /* featureID */,
- ScoreComponentCollection*) const {
- CHECK(0); // not valid for chart decoder
- return NULL;
- }
+ ScoreComponentCollection*) const {
+ CHECK(0); // not valid for chart decoder
+ return NULL;
+ }
private:
- bool DecodeCondition(std::string s);
- bool DecodeDirection(std::string s);
- bool DecodeNumFeatureFunctions(std::string s);
+ bool DecodeCondition(std::string s);
+ bool DecodeDirection(std::string s);
+ bool DecodeNumFeatureFunctions(std::string s);
- LexicalReorderingConfiguration *m_configuration;
- std::string m_modelTypeString;
- std::vector<std::string> m_modelType;
- LexicalReorderingTable* m_table;
- //std::vector<Direction> m_direction;
- std::vector<LexicalReorderingConfiguration::Condition> m_condition;
- //std::vector<size_t> m_scoreOffset;
- //bool m_oneScorePerDirection;
- std::vector<FactorType> m_factorsE, m_factorsF;
+ LexicalReorderingConfiguration *m_configuration;
+ std::string m_modelTypeString;
+ std::vector<std::string> m_modelType;
+ LexicalReorderingTable* m_table;
+ //std::vector<Direction> m_direction;
+ std::vector<LexicalReorderingConfiguration::Condition> m_condition;
+ //std::vector<size_t> m_scoreOffset;
+ //bool m_oneScorePerDirection;
+ std::vector<FactorType> m_factorsE, m_factorsF;
};
}
diff --git a/moses/LexicalReorderingState.cpp b/moses/LexicalReorderingState.cpp
index ddb089055..3165e447f 100644
--- a/moses/LexicalReorderingState.cpp
+++ b/moses/LexicalReorderingState.cpp
@@ -212,7 +212,7 @@ LexicalReorderingState* PhraseBasedReorderingState::Expand(const TranslationOpti
if (m_direction == LexicalReorderingConfiguration::Forward && m_first) {
ClearScores(scores);
} else {
- if (!m_first || m_useFirstBackwardScore){
+ if (!m_first || m_useFirstBackwardScore) {
if (modelType == LexicalReorderingConfiguration::MSD) {
reoType = GetOrientationTypeMSD(currWordsRange);
} else if (modelType == LexicalReorderingConfiguration::MSLR) {
diff --git a/moses/LexicalReorderingTable.cpp b/moses/LexicalReorderingTable.cpp
index c0da31402..65ba66047 100644
--- a/moses/LexicalReorderingTable.cpp
+++ b/moses/LexicalReorderingTable.cpp
@@ -9,7 +9,7 @@
#include "TargetPhraseCollection.h"
#ifndef WIN32
-#include "TranslationModel/CompactPT/LexicalReorderingTableCompact.h"
+#include "TranslationModel/CompactPT/LexicalReorderingTableCompact.h"
#endif
namespace Moses
diff --git a/moses/Manager.cpp b/moses/Manager.cpp
index b8e958d04..76809f224 100644
--- a/moses/Manager.cpp
+++ b/moses/Manager.cpp
@@ -80,7 +80,7 @@ void Manager::ProcessSentence()
{
// reset statistics
ResetSentenceStats(m_source);
-
+
Timer getOptionsTime;
getOptionsTime.start();
m_transOptColl->CreateTranslationOptions();
@@ -262,8 +262,9 @@ struct SGNReverseCompare {
/**
 * Implements lattice sampling, as in Chatterjee & Cancedda, EMNLP 2010
**/
-void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
-
+void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const
+{
+
vector<SearchGraphNode> searchGraph;
GetSearchGraph(searchGraph);
@@ -277,15 +278,15 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
map<int,const Hypothesis*> idToHyp;
map<int,float> fscores;
- //Iterating through the hypos in reverse order of id gives a reverse
- //topological order. We rely on the fact that hypo ids are given out
+ //Iterating through the hypos in reverse order of id gives a reverse
+ //topological order. We rely on the fact that hypo ids are given out
//sequentially, as the search proceeds.
- //NB: Could just sort by stack.
+ //NB: Could just sort by stack.
sort(searchGraph.begin(), searchGraph.end(), SGNReverseCompare());
//first task is to fill in the outgoing hypos and edge scores.
for (vector<SearchGraphNode>::const_iterator i = searchGraph.begin();
- i != searchGraph.end(); ++i) {
+ i != searchGraph.end(); ++i) {
const Hypothesis* hypo = i->hypo;
idToHyp[hypo->GetId()] = hypo;
fscores[hypo->GetId()] = i->fscore;
@@ -293,7 +294,7 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
//back to current
const Hypothesis* prevHypo = i->hypo->GetPrevHypo();
outgoingHyps[prevHypo].insert(hypo);
- edgeScores[Edge(prevHypo->GetId(),hypo->GetId())] =
+ edgeScores[Edge(prevHypo->GetId(),hypo->GetId())] =
hypo->GetScore() - prevHypo->GetScore();
}
//forward from current
@@ -304,7 +305,7 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
outgoingHyps[hypo].insert(nextHypo);
map<int,float>::const_iterator fscoreIter = fscores.find(nextHypo->GetId());
CHECK(fscoreIter != fscores.end());
- edgeScores[Edge(hypo->GetId(),nextHypo->GetId())] =
+ edgeScores[Edge(hypo->GetId(),nextHypo->GetId())] =
i->fscore - fscoreIter->second;
}
}
@@ -312,26 +313,26 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
//then run through again to calculate sigmas
for (vector<SearchGraphNode>::const_iterator i = searchGraph.begin();
- i != searchGraph.end(); ++i) {
+ i != searchGraph.end(); ++i) {
if (i->forward == -1) {
sigmas[i->hypo] = 0;
} else {
- map<const Hypothesis*, set<const Hypothesis*> >::const_iterator outIter =
+ map<const Hypothesis*, set<const Hypothesis*> >::const_iterator outIter =
outgoingHyps.find(i->hypo);
-
+
CHECK(outIter != outgoingHyps.end());
float sigma = 0;
for (set<const Hypothesis*>::const_iterator j = outIter->second.begin();
- j != outIter->second.end(); ++j) {
+ j != outIter->second.end(); ++j) {
map<const Hypothesis*, float>::const_iterator succIter = sigmas.find(*j);
CHECK(succIter != sigmas.end());
- map<Edge,float>::const_iterator edgeScoreIter =
+ map<Edge,float>::const_iterator edgeScoreIter =
edgeScores.find(Edge(i->hypo->GetId(),(*j)->GetId()));
CHECK(edgeScoreIter != edgeScores.end());
float term = edgeScoreIter->second + succIter->second; // Add sigma(*j)
if (sigma == 0) {
- sigma = term;
+ sigma = term;
} else {
sigma = log_sum(sigma,term);
}
@@ -347,7 +348,7 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
vector<const Hypothesis*> path;
path.push_back(startHypo);
while(1) {
- map<const Hypothesis*, set<const Hypothesis*> >::const_iterator outIter =
+ map<const Hypothesis*, set<const Hypothesis*> >::const_iterator outIter =
outgoingHyps.find(path.back());
if (outIter == outgoingHyps.end() || !outIter->second.size()) {
//end of the path
@@ -358,7 +359,7 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
vector<float> candidateScores;
float scoreTotal = 0;
for (set<const Hypothesis*>::const_iterator j = outIter->second.begin();
- j != outIter->second.end(); ++j) {
+ j != outIter->second.end(); ++j) {
candidates.push_back(*j);
CHECK(sigmas.find(*j) != sigmas.end());
Edge edge(path.back()->GetId(),(*j)->GetId());
@@ -385,18 +386,18 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
}
//cerr << "Random: " << random << " Chose " << position-1 << endl;
const Hypothesis* chosen = candidates[position-1];
- path.push_back(chosen);
+ path.push_back(chosen);
}
//cerr << "Path: " << endl;
//for (size_t j = 0; j < path.size(); ++j) {
- // cerr << path[j]->GetId() << " " << path[j]->GetScoreBreakdown() << endl;
+ // cerr << path[j]->GetId() << " " << path[j]->GetScoreBreakdown() << endl;
//}
//cerr << endl;
//Convert the hypos to TrellisPath
ret.Add(new TrellisPath(path));
//cerr << ret.at(ret.GetSize()-1).GetScoreBreakdown() << endl;
- }
+ }
}
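
The CalcLatticeSamples hunks above accumulate a sigma score per node in log space: sigma(n) is the log-sum, over the node's outgoing edges, of the edge score plus the successor's sigma. A minimal sketch of that accumulation, assuming Moses's log_sum is the usual numerically stable log(exp(a)+exp(b)); the helper names are illustrative, and an explicit first-element flag stands in for the sigma==0 sentinel used in the code above:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Stable log(exp(a) + exp(b)): factor out the larger exponent.
    static float LogSum(float a, float b) {
      const float hi = std::max(a, b);
      const float lo = std::min(a, b);
      return hi + std::log1p(std::exp(lo - hi));
    }

    // sigma = log-sum of (edge score + successor sigma) over all successors.
    float Sigma(const std::vector<float>& terms) {
      float sigma = 0.0f;
      bool first = true;
      for (std::size_t i = 0; i < terms.size(); ++i) {
        sigma = first ? terms[i] : LogSum(sigma, terms[i]);
        first = false;
      }
      return sigma;
    }
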
@@ -680,7 +681,7 @@ void Manager::OutputFeatureValuesForSLF(const Hypothesis* hypo, bool zeros, std:
// outputSearchGraphStream << endl;
// outputSearchGraphStream << (*hypo) << endl;
- // const ScoreComponentCollection& scoreCollection = hypo->GetScoreBreakdown();
+ // const ScoreComponentCollection& scoreCollection = hypo->GetScoreBreakdown();
// outputSearchGraphStream << scoreCollection << endl;
const StaticData& staticData = StaticData::Instance();
@@ -753,10 +754,10 @@ size_t Manager::OutputFeatureWeightsForSLF(size_t index, const FeatureFunction*
if (numScoreComps != 0) {
vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
for (size_t i = 0; i < numScoreComps; ++i) {
- outputSearchGraphStream << "# " << ff->GetScoreProducerDescription()
- << " " << ff->GetScoreProducerDescription()
- << " " << (i+1) << " of " << numScoreComps << endl
- << "x" << (index+i) << "scale=" << values[i] << endl;
+ outputSearchGraphStream << "# " << ff->GetScoreProducerDescription()
+ << " " << ff->GetScoreProducerDescription()
+ << " " << (i+1) << " of " << numScoreComps << endl
+ << "x" << (index+i) << "scale=" << values[i] << endl;
}
return index+numScoreComps;
} else {
@@ -779,28 +780,28 @@ size_t Manager::OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypoth
// // cout << "m_scores.coreSize()==" << m_scores.coreSize() << endl;
// // cout << "m_scores.cbegin() ?= m_scores.cend()\t" << (m_scores.cbegin() == m_scores.cend()) << endl;
-
+
// // for(FVector::FNVmap::const_iterator i = m_scores.cbegin(); i != m_scores.cend(); i++) {
// // std::cout<<prefix << "\t" << (i->first) << "\t" << (i->second) << std::endl;
// // }
// for(int i=0, n=v.size(); i<n; i+=1) {
// // outputSearchGraphStream << prefix << i << "==" << v[i] << std::endl;
-
+
// }
// }
// FVector featureValues = scoreCollection.GetVectorForProducer(ff);
// outputSearchGraphStream << featureValues << endl;
- const ScoreComponentCollection& scoreCollection = hypo->GetScoreBreakdown();
+ const ScoreComponentCollection& scoreCollection = hypo->GetScoreBreakdown();
vector<float> featureValues = scoreCollection.GetScoresForProducer(ff);
size_t numScoreComps = featureValues.size();//featureValues.coreSize();
// if (numScoreComps != ScoreProducer::unlimited) {
- // vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
+ // vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
for (size_t i = 0; i < numScoreComps; ++i) {
outputSearchGraphStream << "x" << (index+i) << "=" << ((zeros) ? 0.0 : featureValues[i]) << " ";
- }
- return index+numScoreComps;
+ }
+ return index+numScoreComps;
// } else {
// cerr << "Sparse features are not supported when outputting HTK standard lattice format" << endl;
// assert(false);
@@ -810,7 +811,7 @@ size_t Manager::OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypoth
size_t Manager::OutputFeatureValuesForHypergraph(size_t index, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const
{
- ScoreComponentCollection scoreCollection = hypo->GetScoreBreakdown();
+ ScoreComponentCollection scoreCollection = hypo->GetScoreBreakdown();
const Hypothesis *prevHypo = hypo->GetPrevHypo();
if (prevHypo) {
scoreCollection.MinusEquals( prevHypo->GetScoreBreakdown() );
@@ -851,60 +852,60 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou
{
long hypergraphHypothesisID = 0;
for (size_t arcNumber = 0, size=searchGraph.size(); arcNumber < size; ++arcNumber) {
-
+
// Get an id number for the previous hypothesis
const Hypothesis *prevHypo = searchGraph[arcNumber].hypo->GetPrevHypo();
if (prevHypo!=NULL) {
- int mosesPrevHypothesisID = prevHypo->GetId();
- if (mosesIDToHypergraphID.count(mosesPrevHypothesisID) == 0) {
- mosesIDToHypergraphID[mosesPrevHypothesisID] = hypergraphHypothesisID;
- // hypergraphIDToMosesID[hypergraphHypothesisID] = mosesPrevHypothesisID;
- hypergraphHypothesisID += 1;
- }
+ int mosesPrevHypothesisID = prevHypo->GetId();
+ if (mosesIDToHypergraphID.count(mosesPrevHypothesisID) == 0) {
+ mosesIDToHypergraphID[mosesPrevHypothesisID] = hypergraphHypothesisID;
+ // hypergraphIDToMosesID[hypergraphHypothesisID] = mosesPrevHypothesisID;
+ hypergraphHypothesisID += 1;
+ }
}
// Get an id number for this hypothesis
int mosesHypothesisID;
if (searchGraph[arcNumber].recombinationHypo) {
- mosesHypothesisID = searchGraph[arcNumber].recombinationHypo->GetId();
+ mosesHypothesisID = searchGraph[arcNumber].recombinationHypo->GetId();
} else {
- mosesHypothesisID = searchGraph[arcNumber].hypo->GetId();
+ mosesHypothesisID = searchGraph[arcNumber].hypo->GetId();
}
if (mosesIDToHypergraphID.count(mosesHypothesisID) == 0) {
-
- mosesIDToHypergraphID[mosesHypothesisID] = hypergraphHypothesisID;
- // hypergraphIDToMosesID[hypergraphHypothesisID] = mosesHypothesisID;
- bool terminalNode = (searchGraph[arcNumber].forward == -1);
- if (terminalNode) {
- // Final arc to end node, representing the end of the sentence </s>
- terminalNodes.insert(hypergraphHypothesisID);
- }
- hypergraphHypothesisID += 1;
+ mosesIDToHypergraphID[mosesHypothesisID] = hypergraphHypothesisID;
+ // hypergraphIDToMosesID[hypergraphHypothesisID] = mosesHypothesisID;
+ bool terminalNode = (searchGraph[arcNumber].forward == -1);
+ if (terminalNode) {
+ // Final arc to end node, representing the end of the sentence </s>
+ terminalNodes.insert(hypergraphHypothesisID);
+ }
+
+ hypergraphHypothesisID += 1;
}
// Record that this arc ends at this node
hypergraphIDToArcs.insert(pair<int,int>(mosesIDToHypergraphID[mosesHypothesisID],arcNumber));
}
-
+
// Unique end node
endNode = hypergraphHypothesisID;
// mosesIDToHypergraphID[hypergraphHypothesisID] = hypergraphHypothesisID;
numNodes = endNode + 1;
}
-
+
long numArcs = searchGraph.size() + terminalNodes.size();
// Print number of nodes and arcs
outputSearchGraphStream << numNodes << " " << numArcs << endl;
- VERBOSE(2,"Search graph to output as hypergraph for sentence " << translationId
- << " contains " << numArcs << " arcs and " << numNodes << " nodes" << std::endl)
+ VERBOSE(2,"Search graph to output as hypergraph for sentence " << translationId
+ << " contains " << numArcs << " arcs and " << numNodes << " nodes" << std::endl)
VERBOSE(2,"Outputting search graph to output as hypergraph for sentence " << translationId << std::endl)
@@ -920,51 +921,51 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou
outputSearchGraphStream << count << "\n";
pair<multimap<int,int>::iterator, multimap<int,int>::iterator> range =
- hypergraphIDToArcs.equal_range(hypergraphHypothesisID);
+ hypergraphIDToArcs.equal_range(hypergraphHypothesisID);
for (multimap<int,int>::iterator it=range.first; it!=range.second; ++it) {
- int lineNumber = (*it).second;
- const Hypothesis *thisHypo = searchGraph[lineNumber].hypo;
- int mosesHypothesisID;// = thisHypo->GetId();
- if (searchGraph[lineNumber].recombinationHypo) {
- mosesHypothesisID = searchGraph[lineNumber].recombinationHypo->GetId();
- } else {
- mosesHypothesisID = searchGraph[lineNumber].hypo->GetId();
- }
- // int actualHypergraphHypothesisID = mosesIDToHypergraphID[mosesHypothesisID];
- UTIL_THROW_IF(
- (hypergraphHypothesisID != mosesIDToHypergraphID[mosesHypothesisID]),
- util::Exception,
- "Error while writing search lattice as hypergraph for sentence " << translationId << ". " <<
- "Moses node " << mosesHypothesisID << " was expected to have hypergraph id " << hypergraphHypothesisID <<
- ", but actually had hypergraph id " << mosesIDToHypergraphID[mosesHypothesisID] <<
- ". There are " << numNodes << " nodes in the search lattice."
- );
-
- const Hypothesis *prevHypo = thisHypo->GetPrevHypo();
- if (prevHypo==NULL) {
- // VERBOSE(2,"Hypergraph node " << hypergraphHypothesisID << " start of sentence" << std::endl)
- outputSearchGraphStream << "<s> ||| \n";
- } else {
- int startNode = mosesIDToHypergraphID[prevHypo->GetId()];
- // VERBOSE(2,"Hypergraph node " << hypergraphHypothesisID << " has parent node " << startNode << std::endl)
- UTIL_THROW_IF(
- (startNode >= hypergraphHypothesisID),
- util::Exception,
- "Error while writing search lattice as hypergraph for sentence" << translationId << ". " <<
- "The nodes must be output in topological order. The code attempted to violate this restriction."
- );
-
- const TargetPhrase &targetPhrase = thisHypo->GetCurrTargetPhrase();
- int targetWordCount = targetPhrase.GetSize();
-
- outputSearchGraphStream << "[" << startNode << "]";
- for (int targetWordIndex=0; targetWordIndex<targetWordCount; targetWordIndex+=1) {
- outputSearchGraphStream << " " << targetPhrase.GetWord(targetWordIndex);
- }
- outputSearchGraphStream << " ||| ";
- OutputFeatureValuesForHypergraph(thisHypo, outputSearchGraphStream);
- outputSearchGraphStream << "\n";
- }
+ int lineNumber = (*it).second;
+ const Hypothesis *thisHypo = searchGraph[lineNumber].hypo;
+ int mosesHypothesisID;// = thisHypo->GetId();
+ if (searchGraph[lineNumber].recombinationHypo) {
+ mosesHypothesisID = searchGraph[lineNumber].recombinationHypo->GetId();
+ } else {
+ mosesHypothesisID = searchGraph[lineNumber].hypo->GetId();
+ }
+ // int actualHypergraphHypothesisID = mosesIDToHypergraphID[mosesHypothesisID];
+ UTIL_THROW_IF(
+ (hypergraphHypothesisID != mosesIDToHypergraphID[mosesHypothesisID]),
+ util::Exception,
+ "Error while writing search lattice as hypergraph for sentence " << translationId << ". " <<
+ "Moses node " << mosesHypothesisID << " was expected to have hypergraph id " << hypergraphHypothesisID <<
+ ", but actually had hypergraph id " << mosesIDToHypergraphID[mosesHypothesisID] <<
+ ". There are " << numNodes << " nodes in the search lattice."
+ );
+
+ const Hypothesis *prevHypo = thisHypo->GetPrevHypo();
+ if (prevHypo==NULL) {
+ // VERBOSE(2,"Hypergraph node " << hypergraphHypothesisID << " start of sentence" << std::endl)
+ outputSearchGraphStream << "<s> ||| \n";
+ } else {
+ int startNode = mosesIDToHypergraphID[prevHypo->GetId()];
+ // VERBOSE(2,"Hypergraph node " << hypergraphHypothesisID << " has parent node " << startNode << std::endl)
+ UTIL_THROW_IF(
+ (startNode >= hypergraphHypothesisID),
+ util::Exception,
+ "Error while writing search lattice as hypergraph for sentence" << translationId << ". " <<
+ "The nodes must be output in topological order. The code attempted to violate this restriction."
+ );
+
+ const TargetPhrase &targetPhrase = thisHypo->GetCurrTargetPhrase();
+ int targetWordCount = targetPhrase.GetSize();
+
+ outputSearchGraphStream << "[" << startNode << "]";
+ for (int targetWordIndex=0; targetWordIndex<targetWordCount; targetWordIndex+=1) {
+ outputSearchGraphStream << " " << targetPhrase.GetWord(targetWordIndex);
+ }
+ outputSearchGraphStream << " ||| ";
+ OutputFeatureValuesForHypergraph(thisHypo, outputSearchGraphStream);
+ outputSearchGraphStream << "\n";
+ }
}
}
}
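
The renumbering that OutputSearchGraphAsHypergraph re-indents above maps sparse Moses hypothesis ids onto contiguous hypergraph node ids in first-seen order, which is topological given that hypothesis ids are assigned as the search proceeds. The idiom, reduced to a hypothetical helper:

    #include <map>

    // Return the dense id for mosesId, assigning the next free id on first sight.
    int HypergraphId(std::map<int, int>& mosesToHg, int& nextId, int mosesId) {
      std::map<int, int>::const_iterator it = mosesToHg.find(mosesId);
      if (it != mosesToHg.end()) return it->second;
      mosesToHg[mosesId] = nextId;
      return nextId++;
    }
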
@@ -1001,14 +1002,14 @@ void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSea
int hypothesisID = searchGraph[arcNumber].hypo->GetId();
if (nodes.count(hypothesisID) == 0) {
-
+
numNodes += targetWordCount;
nodes[hypothesisID] = numNodes;
//numNodes += 1;
bool terminalNode = (searchGraph[arcNumber].forward == -1);
if (terminalNode) {
- numArcs += 1;
+ numArcs += 1;
}
}
@@ -1038,35 +1039,35 @@ void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSea
int targetWordCount = targetPhrase.GetSize();
for (int targetWordIndex=0; targetWordIndex<targetWordCount; targetWordIndex+=1) {
- int x = (targetWordCount-targetWordIndex);
+ int x = (targetWordCount-targetWordIndex);
- outputSearchGraphStream << "J=" << arcNumber;
+ outputSearchGraphStream << "J=" << arcNumber;
- if (targetWordIndex==0) {
- outputSearchGraphStream << " S=" << startNode;
- } else {
- outputSearchGraphStream << " S=" << endNode - x;
- }
+ if (targetWordIndex==0) {
+ outputSearchGraphStream << " S=" << startNode;
+ } else {
+ outputSearchGraphStream << " S=" << endNode - x;
+ }
- outputSearchGraphStream << " E=" << endNode - (x-1)
- << " W=" << targetPhrase.GetWord(targetWordIndex);
+ outputSearchGraphStream << " E=" << endNode - (x-1)
+ << " W=" << targetPhrase.GetWord(targetWordIndex);
- OutputFeatureValuesForSLF(thisHypo, (targetWordIndex>0), outputSearchGraphStream);
+ OutputFeatureValuesForSLF(thisHypo, (targetWordIndex>0), outputSearchGraphStream);
- outputSearchGraphStream << endl;
+ outputSearchGraphStream << endl;
- arcNumber += 1;
+ arcNumber += 1;
}
if (terminalNode && terminalNodes.count(endNode) == 0) {
- terminalNodes.insert(endNode);
- outputSearchGraphStream << "J=" << arcNumber
- << " S=" << endNode
- << " E=" << numNodes
- << endl;
- arcNumber += 1;
+ terminalNodes.insert(endNode);
+ outputSearchGraphStream << "J=" << arcNumber
+ << " S=" << endNode
+ << " E=" << numNodes
+ << endl;
+ arcNumber += 1;
}
- }
+ }
}
}
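
The SLF writer above turns each multi-word target phrase into a chain of single-word arcs: the first arc leaves startNode, intermediate arcs pass through nodes counted back from endNode, and the final word lands on endNode. A condensed sketch of the same J=/S=/E=/W= bookkeeping, with illustrative names and plain strings in place of Moses words:

    #include <iostream>
    #include <string>
    #include <vector>

    // Emit one SLF arc per target word, chaining through intermediate nodes.
    void EmitSlfArcs(std::ostream& out, int& arcNumber, int startNode, int endNode,
                     const std::vector<std::string>& words) {
      const int n = static_cast<int>(words.size());
      for (int i = 0; i < n; ++i) {
        const int x = n - i;  // words still to emit, counted from the end
        out << "J=" << arcNumber
            << " S=" << (i == 0 ? startNode : endNode - x)
            << " E=" << (endNode - (x - 1))
            << " W=" << words[i] << "\n";
        arcNumber += 1;
      }
    }
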
@@ -1124,17 +1125,17 @@ void OutputSearchNode(long translationId, std::ostream &outputSearchGraphStream,
outputSearchGraphStream << " recombined=" << searchNode.recombinationHypo->GetId();
outputSearchGraphStream << " forward=" << searchNode.forward << " fscore=" << searchNode.fscore
- << " covered=" << searchNode.hypo->GetCurrSourceWordsRange().GetStartPos()
- << "-" << searchNode.hypo->GetCurrSourceWordsRange().GetEndPos();
+ << " covered=" << searchNode.hypo->GetCurrSourceWordsRange().GetStartPos()
+ << "-" << searchNode.hypo->GetCurrSourceWordsRange().GetEndPos();
// Modified so that -osgx is a superset of -osg (GST Oct 2011)
ScoreComponentCollection scoreBreakdown = searchNode.hypo->GetScoreBreakdown();
scoreBreakdown.MinusEquals( prevHypo->GetScoreBreakdown() );
//outputSearchGraphStream << " scores = [ " << StaticData::Instance().GetAllWeights();
- outputSearchGraphStream << " scores=\"" << scoreBreakdown << "\"";
+ outputSearchGraphStream << " scores=\"" << scoreBreakdown << "\"";
outputSearchGraphStream << " out=\"" << searchNode.hypo->GetSourcePhraseStringRep() << "|" <<
- searchNode.hypo->GetCurrTargetPhrase().GetStringRep(outputFactorOrder) << "\"" << endl;
+ searchNode.hypo->GetCurrTargetPhrase().GetStringRep(outputFactorOrder) << "\"" << endl;
// outputSearchGraphStream << " out=" << searchNode.hypo->GetCurrTargetPhrase().GetStringRep(outputFactorOrder) << endl;
}
diff --git a/moses/Manager.h b/moses/Manager.h
index 687d8dbeb..fd329c309 100644
--- a/moses/Manager.h
+++ b/moses/Manager.h
@@ -56,9 +56,9 @@ struct SearchGraphNode {
hypo(theHypo), recombinationHypo(theRecombinationHypo),
forward(theForward), fscore(theFscore) {}
- bool operator<(const SearchGraphNode& sgn) const {
- return this->hypo->GetId() < sgn.hypo->GetId();
- }
+ bool operator<(const SearchGraphNode& sgn) const {
+ return this->hypo->GetId() < sgn.hypo->GetId();
+ }
};
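
SearchGraphNode's operator< above orders nodes by hypothesis id; CalcLatticeSamples pairs the same idea with a greater-than comparator (SGNReverseCompare) to get reverse topological order. The pattern on a toy node type:

    #include <algorithm>
    #include <vector>

    struct Node {  // stand-in for SearchGraphNode
      int id;
      bool operator<(const Node& other) const { return id < other.id; }
    };

    // Greater-than comparator, like SGNReverseCompare: sorts ids descending.
    static bool ReverseById(const Node& a, const Node& b) { return b < a; }

    void SortReverseTopological(std::vector<Node>& nodes) {
      std::sort(nodes.begin(), nodes.end(), ReverseById);
    }
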
diff --git a/moses/MockHypothesis.cpp b/moses/MockHypothesis.cpp
index e98794cb7..826104565 100644
--- a/moses/MockHypothesis.cpp
+++ b/moses/MockHypothesis.cpp
@@ -19,7 +19,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-#include "MockHypothesis.h"
+#include "MockHypothesis.h"
#include <boost/test/unit_test.hpp>
@@ -28,19 +28,20 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
using namespace Moses;
using namespace std;
-namespace MosesTest {
+namespace MosesTest
+{
MockHypothesisGuard::MockHypothesisGuard(
- const string& sourceSentence,
- const vector<Alignment>& alignments,
- const vector<string>& targetSegments)
-: m_emptyTarget(),
- m_sentence(),
- m_wp("WordPenalty"),
- m_uwp("UnknownWordPenalty"),
- m_dist("Distortion"),
- m_manager(0,m_sentence,Normal)
+ const string& sourceSentence,
+ const vector<Alignment>& alignments,
+ const vector<string>& targetSegments)
+ : m_emptyTarget(),
+ m_sentence(),
+ m_wp("WordPenalty"),
+ m_uwp("UnknownWordPenalty"),
+ m_dist("Distortion"),
+ m_manager(0,m_sentence,Normal)
{
BOOST_CHECK_EQUAL(alignments.size(), targetSegments.size());
@@ -49,7 +50,7 @@ MockHypothesisGuard::MockHypothesisGuard(
stringstream in(sourceSentence + "\n");
m_sentence.Read(in,factors);
-
+
//Initial empty hypothesis
m_manager.ResetSentenceStats(m_sentence);
@@ -58,21 +59,20 @@ MockHypothesisGuard::MockHypothesisGuard(
//create the chain
vector<Alignment>::const_iterator ai = alignments.begin();
vector<string>::const_iterator ti = targetSegments.begin();
- for (; ti != targetSegments.end() && ai != alignments.end(); ++ti,++ai)
- {
+ for (; ti != targetSegments.end() && ai != alignments.end(); ++ti,++ai) {
Hypothesis* prevHypo = m_hypothesis;
WordsRange wordsRange(ai->first,ai->second);
m_targetPhrases.push_back(TargetPhrase());
m_targetPhrases.back().CreateFromString(Input, factors, *ti, "|", NULL);
m_toptions.push_back(new TranslationOption
- (wordsRange,m_targetPhrases.back()));
- m_hypothesis = Hypothesis::Create(*prevHypo,*m_toptions.back(),NULL);
+ (wordsRange,m_targetPhrases.back()));
+ m_hypothesis = Hypothesis::Create(*prevHypo,*m_toptions.back(),NULL);
}
}
-MockHypothesisGuard::~MockHypothesisGuard()
+MockHypothesisGuard::~MockHypothesisGuard()
{
RemoveAllInColl(m_toptions);
while (m_hypothesis) {
diff --git a/moses/MockHypothesis.h b/moses/MockHypothesis.h
index 2490dd5a6..67182ad56 100644
--- a/moses/MockHypothesis.h
+++ b/moses/MockHypothesis.h
@@ -29,7 +29,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "Hypothesis.h"
#include "Manager.h"
-namespace MosesTest {
+namespace MosesTest
+{
//
// Construct a hypothesis with arbitrary source and target phrase
@@ -38,42 +39,52 @@ namespace MosesTest {
typedef std::pair<size_t,size_t> Alignment; //(first,last) in source
-class MockHypothesisGuard {
- public:
- /** Creates a phrase-based hypothesis.
- */
- MockHypothesisGuard(
- const std::string& sourceSentence,
- const std::vector<Alignment>& alignments,
- const std::vector<std::string>& targetSegments);
- Moses::Hypothesis* operator*() const {return m_hypothesis;}
-
- /** Destroy the hypothesis chain */
- ~MockHypothesisGuard();
-
- private:
- Moses::TargetPhrase m_emptyTarget;
- Moses::Sentence m_sentence;
- Moses::WordPenaltyProducer m_wp;
- Moses::UnknownWordPenaltyProducer m_uwp;
- Moses::DistortionScoreProducer m_dist;
- Moses::Manager m_manager;
- Moses::Hypothesis* m_hypothesis;
- std::vector<Moses::TargetPhrase> m_targetPhrases;
- std::vector<Moses::TranslationOption*> m_toptions;
+class MockHypothesisGuard
+{
+public:
+ /** Creates a phrase-based hypothesis.
+ */
+ MockHypothesisGuard(
+ const std::string& sourceSentence,
+ const std::vector<Alignment>& alignments,
+ const std::vector<std::string>& targetSegments);
+ Moses::Hypothesis* operator*() const {
+ return m_hypothesis;
+ }
+
+ /** Destroy the hypothesis chain */
+ ~MockHypothesisGuard();
+
+private:
+ Moses::TargetPhrase m_emptyTarget;
+ Moses::Sentence m_sentence;
+ Moses::WordPenaltyProducer m_wp;
+ Moses::UnknownWordPenaltyProducer m_uwp;
+ Moses::DistortionScoreProducer m_dist;
+ Moses::Manager m_manager;
+ Moses::Hypothesis* m_hypothesis;
+ std::vector<Moses::TargetPhrase> m_targetPhrases;
+ std::vector<Moses::TranslationOption*> m_toptions;
};
-class HypothesisFixture {
- public:
- HypothesisFixture();
- const Moses::Hypothesis* empty() {return **m_empty;}
- const Moses::Hypothesis* partial() {return **m_partial;}
- const Moses::Hypothesis* full() {return **m_full;}
-
- private:
- std::auto_ptr<MockHypothesisGuard> m_empty;
- std::auto_ptr<MockHypothesisGuard> m_partial;
- std::auto_ptr<MockHypothesisGuard> m_full;
+class HypothesisFixture
+{
+public:
+ HypothesisFixture();
+ const Moses::Hypothesis* empty() {
+ return **m_empty;
+ }
+ const Moses::Hypothesis* partial() {
+ return **m_partial;
+ }
+ const Moses::Hypothesis* full() {
+ return **m_full;
+ }
+
+private:
+ std::auto_ptr<MockHypothesisGuard> m_empty;
+ std::auto_ptr<MockHypothesisGuard> m_partial;
+ std::auto_ptr<MockHypothesisGuard> m_full;
};
diff --git a/moses/OutputCollector.h b/moses/OutputCollector.h
index 96353934e..5f72433d8 100644
--- a/moses/OutputCollector.h
+++ b/moses/OutputCollector.h
@@ -45,27 +45,23 @@ public:
OutputCollector(std::ostream* outStream= &std::cout, std::ostream* debugStream=&std::cerr) :
m_nextOutput(0),m_outStream(outStream),m_debugStream(debugStream),
m_isHoldingOutputStream(false), m_isHoldingDebugStream(false) {}
-
- ~OutputCollector()
- {
+
+ ~OutputCollector() {
if (m_isHoldingOutputStream)
delete m_outStream;
if (m_isHoldingDebugStream)
delete m_debugStream;
}
-
- void HoldOutputStream()
- {
+
+ void HoldOutputStream() {
m_isHoldingOutputStream = true;
}
-
- void HoldDebugStream()
- {
+
+ void HoldDebugStream() {
m_isHoldingDebugStream = true;
}
-
- bool OutputIsCout() const
- {
+
+ bool OutputIsCout() const {
return (m_outStream == std::cout);
}
@@ -87,7 +83,7 @@ public:
*m_outStream << iter->second << std::flush;
++m_nextOutput;
std::map<int,std::string>::iterator debugIter = m_debugs.find(iter->first);
- m_outputs.erase(iter);
+ m_outputs.erase(iter);
if (debugIter != m_debugs.end()) {
*m_debugStream << debugIter->second << std::flush;
m_debugs.erase(debugIter);
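
OutputCollector's Write above is the usual in-order flushing pattern for multithreaded decoding: results that arrive out of order are buffered by sentence id and emitted as soon as the next expected id is present. A stripped-down, single-stream sketch (hypothetical class name; the real class also buffers a debug stream and must be locked under threading):

    #include <iostream>
    #include <map>
    #include <string>

    class OrderedWriter {
    public:
      OrderedWriter() : m_next(0) {}
      void Write(int id, const std::string& text) {
        m_pending[id] = text;
        std::map<int, std::string>::iterator it;
        // Flush the contiguous prefix starting at the next expected id.
        while ((it = m_pending.find(m_next)) != m_pending.end()) {
          std::cout << it->second << std::flush;
          m_pending.erase(it);
          ++m_next;
        }
      }
    private:
      int m_next;
      std::map<int, std::string> m_pending;
    };
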
diff --git a/moses/PCNTools.h b/moses/PCNTools.h
index 8a31e99ad..ea43df838 100644
--- a/moses/PCNTools.h
+++ b/moses/PCNTools.h
@@ -36,7 +36,7 @@ namespace PCN
typedef std::pair<std::pair<std::string, std::vector<float> >, size_t> CNAlt;
typedef std::vector<CNAlt> CNCol;
typedef std::vector<CNCol> CN;
-
+
/** Given a string ((('foo',0.1,1),('bar',0.9,2)),...) representation of a
* word lattice in PCN format, return a CN object representing the lattice
*/
diff --git a/moses/PDTAimp.h b/moses/PDTAimp.h
index 835cd6895..735179217 100644
--- a/moses/PDTAimp.h
+++ b/moses/PDTAimp.h
@@ -38,7 +38,7 @@ protected:
: m_dict(0),
m_obj(p),useCache(1),m_numInputScores(nis),totalE(0),distinctE(0) {}
- public:
+public:
std::vector<FactorType> m_input,m_output;
PhraseDictionaryTree *m_dict;
typedef std::vector<TargetPhraseCollection const*> vTPC;
@@ -185,7 +185,7 @@ protected:
void Create(const std::vector<FactorType> &input
, const std::vector<FactorType> &output
, const std::string &filePath
- , const std::vector<float> &weight
+ , const std::vector<float> &weight
) {
// set my members
@@ -267,10 +267,10 @@ protected:
StringTgtCand::Tokens const& factorStrings,
Scores const& scoreVector,
const ScoreComponentCollection& sparseFeatures,
- std::vector<float> &weights,
- float weightWP,
+ std::vector<float> &weights,
+ float weightWP,
Phrase const* srcPtr) const {
- FactorCollection &factorCollection = FactorCollection::Instance();
+ FactorCollection &factorCollection = FactorCollection::Instance();
for(size_t k=0; k<factorStrings.size(); ++k) {
util::TokenIter<util::MultiCharacter, false> word(*factorStrings[k], StaticData::Instance().GetFactorDelimiter());
@@ -438,8 +438,8 @@ protected:
//put in phrase table scores, logging as we insert
std::transform(tcands[i].scores.begin(),tcands[i].scores.end(),nscores.begin() + m_numInputScores,TransformScore);
-
- CHECK(nscores.size()==weightT.size());
+
+ CHECK(nscores.size()==weightT.size());
//tally up
float score=std::inner_product(nscores.begin(), nscores.end(), weightT.begin(), 0.0f);
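
That last line of the PDTAimp.h hunk scores a candidate as the dot product of log-transformed phrase-table scores with the tuned weights. A sketch of the computation, assuming TransformScore amounts to taking logs:

    #include <cmath>
    #include <cstddef>
    #include <numeric>
    #include <vector>

    // Log-transform raw probabilities, then combine linearly with the weights.
    float WeightedScore(std::vector<float> scores, const std::vector<float>& weights) {
      for (std::size_t i = 0; i < scores.size(); ++i)
        scores[i] = std::log(scores[i]);  // stand-in for TransformScore
      return std::inner_product(scores.begin(), scores.end(), weights.begin(), 0.0f);
    }
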
diff --git a/moses/Parameter.cpp b/moses/Parameter.cpp
index 4264010cd..e16b6d08f 100644
--- a/moses/Parameter.cpp
+++ b/moses/Parameter.cpp
@@ -69,30 +69,30 @@ Parameter::Parameter()
AddParam("report-all-factors", "report all factors in output, not just first");
AddParam("report-all-factors-in-n-best", "Report all factors in n-best-lists. Default is false");
#ifdef HAVE_SYNLM
- AddParam("slmodel-file", "location of the syntactic language model file(s)");
- AddParam("slmodel-factor", "factor to use with syntactic language model");
- AddParam("slmodel-beam", "beam width to use with syntactic language model's parser");
+ AddParam("slmodel-file", "location of the syntactic language model file(s)");
+ AddParam("slmodel-factor", "factor to use with syntactic language model");
+ AddParam("slmodel-beam", "beam width to use with syntactic language model's parser");
#endif
AddParam("stack", "s", "maximum stack size for histogram pruning");
AddParam("stack-diversity", "sd", "minimum number of hypothesis of each coverage in stack (default 0)");
AddParam("threads","th", "number of threads to use in decoding (defaults to single-threaded)");
- AddParam("translation-details", "T", "for each best hypothesis, report translation details to the given file");
- AddParam("ttable-file", "location and properties of the translation tables");
- AddParam("ttable-limit", "ttl", "maximum number of translation table entries per input phrase");
- AddParam("translation-option-threshold", "tot", "threshold for translation options relative to best for input phrase");
- AddParam("early-discarding-threshold", "edt", "threshold for constructing hypotheses based on estimate cost");
- AddParam("verbose", "v", "verbosity level of the logging");
+ AddParam("translation-details", "T", "for each best hypothesis, report translation details to the given file");
+ AddParam("ttable-file", "location and properties of the translation tables");
+ AddParam("ttable-limit", "ttl", "maximum number of translation table entries per input phrase");
+ AddParam("translation-option-threshold", "tot", "threshold for translation options relative to best for input phrase");
+ AddParam("early-discarding-threshold", "edt", "threshold for constructing hypotheses based on estimate cost");
+ AddParam("verbose", "v", "verbosity level of the logging");
AddParam("references", "Reference file(s) - used for bleu score feature");
- AddParam("output-factors", "list if factors in the output");
- AddParam("cache-path", "?");
- AddParam("distortion-limit", "dl", "distortion (reordering) limit in maximum number of words (0 = monotone, -1 = unlimited)");
- AddParam("monotone-at-punctuation", "mp", "do not reorder over punctuation");
- AddParam("distortion-file", "source factors (0 if table independent of source), target factors, location of the factorized/lexicalized reordering tables");
- AddParam("distortion", "configurations for each factorized/lexicalized reordering model.");
- AddParam("early-distortion-cost", "edc", "include estimate of distortion cost yet to be incurred in the score [Moore & Quirk 2007]. Default is no");
- AddParam("xml-input", "xi", "allows markup of input with desired translations and probabilities. values can be 'pass-through' (default), 'inclusive', 'exclusive', 'ignore'");
+ AddParam("output-factors", "list if factors in the output");
+ AddParam("cache-path", "?");
+ AddParam("distortion-limit", "dl", "distortion (reordering) limit in maximum number of words (0 = monotone, -1 = unlimited)");
+ AddParam("monotone-at-punctuation", "mp", "do not reorder over punctuation");
+ AddParam("distortion-file", "source factors (0 if table independent of source), target factors, location of the factorized/lexicalized reordering tables");
+ AddParam("distortion", "configurations for each factorized/lexicalized reordering model.");
+ AddParam("early-distortion-cost", "edc", "include estimate of distortion cost yet to be incurred in the score [Moore & Quirk 2007]. Default is no");
+ AddParam("xml-input", "xi", "allows markup of input with desired translations and probabilities. values can be 'pass-through' (default), 'inclusive', 'exclusive', 'ignore'");
AddParam("xml-brackets", "xb", "specify strings to be used as xml tags opening and closing, e.g. \"{{ }}\" (default \"< >\"). Avoid square brackets because of configuration file format. Valid only with text input mode" );
- AddParam("minimum-bayes-risk", "mbr", "use miminum Bayes risk to determine best translation");
+ AddParam("minimum-bayes-risk", "mbr", "use miminum Bayes risk to determine best translation");
AddParam("lminimum-bayes-risk", "lmbr", "use lattice miminum Bayes risk to determine best translation");
AddParam("mira", "do mira training");
AddParam("consensus-decoding", "con", "use consensus decoding (De Nero et. al. 2009)");
@@ -119,17 +119,17 @@ Parameter::Parameter()
#ifdef HAVE_PROTOBUF
AddParam("output-search-graph-pb", "pb", "Write phrase lattice to protocol buffer objects in the specified path.");
#endif
- AddParam("cube-pruning-pop-limit", "cbp", "How many hypotheses should be popped for each stack. (default = 1000)");
- AddParam("cube-pruning-diversity", "cbd", "How many hypotheses should be created for each coverage. (default = 0)");
- AddParam("search-algorithm", "Which search algorithm to use. 0=normal stack, 1=cube pruning, 2=cube growing. (default = 0)");
- AddParam("constraint", "Location of the file with target sentences to produce constraining the search");
- AddParam("description", "Source language, target language, description");
- AddParam("max-chart-span", "maximum num. of source word chart rules can consume (default 10)");
- AddParam("non-terminals", "list of non-term symbols, space separated");
- AddParam("rule-limit", "a little like table limit. But for chart decoding rules. Default is DEFAULT_MAX_TRANS_OPT_SIZE");
- AddParam("source-label-overlap", "What happens if a span already has a label. 0=add more. 1=replace. 2=discard. Default is 0");
- AddParam("output-hypo-score", "Output the hypo score to stdout with the output string. For search error analysis. Default is false");
- AddParam("unknown-lhs", "file containing target lhs of unknown words. 1 per line: LHS prob");
+ AddParam("cube-pruning-pop-limit", "cbp", "How many hypotheses should be popped for each stack. (default = 1000)");
+ AddParam("cube-pruning-diversity", "cbd", "How many hypotheses should be created for each coverage. (default = 0)");
+ AddParam("search-algorithm", "Which search algorithm to use. 0=normal stack, 1=cube pruning, 2=cube growing. (default = 0)");
+ AddParam("constraint", "Location of the file with target sentences to produce constraining the search");
+ AddParam("description", "Source language, target language, description");
+ AddParam("max-chart-span", "maximum num. of source word chart rules can consume (default 10)");
+ AddParam("non-terminals", "list of non-term symbols, space separated");
+ AddParam("rule-limit", "a little like table limit. But for chart decoding rules. Default is DEFAULT_MAX_TRANS_OPT_SIZE");
+ AddParam("source-label-overlap", "What happens if a span already has a label. 0=add more. 1=replace. 2=discard. Default is 0");
+ AddParam("output-hypo-score", "Output the hypo score to stdout with the output string. For search error analysis. Default is false");
+ AddParam("unknown-lhs", "file containing target lhs of unknown words. 1 per line: LHS prob");
AddParam("phrase-pair-feature", "Source and target factors for phrase pair feature");
AddParam("phrase-boundary-source-feature", "Source factors for phrase boundary feature");
AddParam("phrase-boundary-target-feature", "Target factors for phrase boundary feature");
@@ -153,9 +153,9 @@ Parameter::Parameter()
AddParam("show-weights", "print feature weights and exit");
AddParam("start-translation-id", "Id of 1st input. Default = 0");
AddParam("output-unknowns", "Output the unknown (OOV) words to the given file, one line per sentence");
-
- // Compact phrase table and reordering table.
- AddParam("minlexr-memory", "Load lexical reordering table in minlexr format into memory");
+
+ // Compact phrase table and reordering table.
+ AddParam("minlexr-memory", "Load lexical reordering table in minlexr format into memory");
AddParam("minphr-memory", "Load phrase table in minphr format into memory");
AddParam("print-alignment-info", "Output word-to-word alignment to standard out, separated from translation by |||. Word-to-word alignments are takne from the phrase table if any. Default is false");
@@ -214,7 +214,7 @@ void Parameter::AddParam(const string &paramName, const string &abbrevName, cons
m_valid[paramName] = true;
m_valid[abbrevName] = true;
m_abbreviation[paramName] = abbrevName;
- m_fullname[abbrevName] = paramName;
+ m_fullname[abbrevName] = paramName;
m_description[paramName] = description;
}
@@ -263,7 +263,7 @@ bool Parameter::LoadParam(int argc, char* argv[])
PrintCredit();
Explain();
- cerr << endl;
+ cerr << endl;
UserMessage::Add("No configuration file was specified. Use -config or -f");
cerr << endl;
return false;
@@ -381,11 +381,9 @@ void Parameter::ConvertWeightArgsSingleWeight(const string &oldWeightName, const
PARAM_MAP::iterator iterMap;
iterMap = m_setting.find(oldWeightName);
- if (iterMap != m_setting.end())
- {
+ if (iterMap != m_setting.end()) {
const PARAM_VEC &weights = iterMap->second;
- for (size_t i = 0; i < weights.size(); ++i)
- {
+ for (size_t i = 0; i < weights.size(); ++i) {
SetWeight(newWeightName, ind, Scan<float>(weights[i]));
}
@@ -403,8 +401,7 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName)
CHECK(numInputScores.size() == 0);
numInputScores.push_back("1");
numInputScores.push_back("0");
- }
- else if (inputWeights.size() == 2) {
+ } else if (inputWeights.size() == 2) {
CHECK(numInputScores.size() == 0);
numInputScores.push_back("1");
numInputScores.push_back("1");
@@ -463,8 +460,7 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName)
PhraseTableImplementation implementation = (PhraseTableImplementation) Scan<int>(token[0]);
string ptType;
- switch (implementation)
- {
+ switch (implementation) {
case Memory:
ptType = "PhraseDictionaryMemory";
break;
@@ -488,8 +484,7 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName)
if (ptIndices.find(ptType) == ptIndices.end()) {
ptIndices[ptType] = 0;
ptInd = 0;
- }
- else {
+ } else {
ptInd = ++ptIndices[ptType];
}
@@ -516,7 +511,7 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName)
//characteristics of the phrase table
vector<FactorType> input = Tokenize<FactorType>(token[1], ",")
- ,output = Tokenize<FactorType>(token[2], ",");
+ ,output = Tokenize<FactorType>(token[2], ",");
size_t numScoreComponent = Scan<size_t>(token[3]);
string filePath= token[4];
@@ -561,14 +556,13 @@ void Parameter::ConvertWeightArgsDistortion()
// distortion / lex distortion
const PARAM_VEC &oldWeights = GetParam(oldWeightName);
- if (oldWeights.size() > 0)
- {
+ if (oldWeights.size() > 0) {
if (!isParamSpecified("search-algorithm") ||
- (GetParam("search-algorithm").size() > 0
- && (Trim(GetParam("search-algorithm")[0]) == "0"
+ (GetParam("search-algorithm").size() > 0
+ && (Trim(GetParam("search-algorithm")[0]) == "0"
||Trim(GetParam("search-algorithm")[0]) == "1"
- )
- )
+ )
+ )
) {
// phrase-based. Add distance distortion to list of features
AddFeature("Distortion");
@@ -587,8 +581,7 @@ void Parameter::ConvertWeightArgsDistortion()
size_t numFF = Scan<size_t>(toks[2]);
vector<float> weights(numFF);
- for (size_t currFF = 0; currFF < numFF; ++currFF)
- {
+ for (size_t currFF = 0; currFF < numFF; ++currFF) {
CHECK(currOldInd < oldWeights.size());
float weight = Scan<float>(oldWeights[currOldInd]);
weights[currFF] = weight;
@@ -625,12 +618,12 @@ void Parameter::ConvertWeightArgsLM()
bool isChartDecoding = true;
if (!isParamSpecified("search-algorithm") ||
- (GetParam("search-algorithm").size() > 0
- && (Trim(GetParam("search-algorithm")[0]) == "0"
- ||Trim(GetParam("search-algorithm")[0]) == "1"
- )
- )
- ) {
+ (GetParam("search-algorithm").size() > 0
+ && (Trim(GetParam("search-algorithm")[0]) == "0"
+ ||Trim(GetParam("search-algorithm")[0]) == "1"
+ )
+ )
+ ) {
isChartDecoding = false;
}
@@ -643,8 +636,7 @@ void Parameter::ConvertWeightArgsLM()
PARAM_MAP::iterator iterMap;
iterMap = m_setting.find(oldWeightName);
- if (iterMap != m_setting.end())
- {
+ if (iterMap != m_setting.end()) {
size_t currOldInd = 0;
const PARAM_VEC &weights = iterMap->second;
@@ -656,8 +648,7 @@ void Parameter::ConvertWeightArgsLM()
int lmType = Scan<int>(modelToks[0]);
string newFeatureName;
- switch (lmType)
- {
+ switch (lmType) {
case 0:
newFeatureName = "SRILM";
break;
@@ -677,12 +668,11 @@ void Parameter::ConvertWeightArgsLM()
numFF += oovWeights[lmIndex];
vector<float> weightsLM(numFF);
- for (size_t currFF = 0; currFF < numFF; ++currFF)
- {
+ for (size_t currFF = 0; currFF < numFF; ++currFF) {
CHECK(currOldInd < weights.size());
weightsLM[currFF] = Scan<float>(weights[currOldInd]);
if (isChartDecoding) {
- weightsLM[currFF] = UntransformLMScore(weightsLM[currFF]);
+ weightsLM[currFF] = UntransformLMScore(weightsLM[currFF]);
}
++currOldInd;
@@ -691,12 +681,11 @@ void Parameter::ConvertWeightArgsLM()
SetWeight(newFeatureName, ind, weightsLM);
string featureLine = newFeatureName + " "
- + "factor=" + modelToks[1] + " " // factor
- + "order=" + modelToks[2] + " "; // order
+ + "factor=" + modelToks[1] + " " // factor
+ + "order=" + modelToks[2] + " "; // order
if (lmType == 9) {
featureLine += "lazyken=1 ";
- }
- else if (lmType == 8) {
+ } else if (lmType == 8) {
featureLine += "lazyken=0 ";
}
@@ -718,8 +707,7 @@ void Parameter::ConvertWeightArgsGeneration(const std::string &oldWeightName, co
// distortion / lex distortion
PARAM_VEC &oldWeights = m_setting[oldWeightName];
- if (oldWeights.size() > 0)
- {
+ if (oldWeights.size() > 0) {
size_t currOldInd = 0;
PARAM_VEC &models = m_setting[oldFeatureName];
@@ -730,8 +718,7 @@ void Parameter::ConvertWeightArgsGeneration(const std::string &oldWeightName, co
size_t numFF = Scan<size_t>(modelToks[2]);
vector<float> weights(numFF);
- for (size_t currFF = 0; currFF < numFF; ++currFF)
- {
+ for (size_t currFF = 0; currFF < numFF; ++currFF) {
CHECK(currOldInd < oldWeights.size());
float weight = Scan<float>(oldWeights[currOldInd]);
weights[currFF] = weight;
@@ -742,10 +729,10 @@ void Parameter::ConvertWeightArgsGeneration(const std::string &oldWeightName, co
stringstream strme;
strme << "Generation "
- << "input-factor=" << modelToks[0] << " "
- << "output-factor=" << modelToks[1] << " "
- << "num-features=" << modelToks[2] << " "
- << "path=" << modelToks[3];
+ << "input-factor=" << modelToks[0] << " "
+ << "output-factor=" << modelToks[1] << " "
+ << "num-features=" << modelToks[2] << " "
+ << "path=" << modelToks[3];
AddFeature(strme.str());
}
}
@@ -761,23 +748,21 @@ void Parameter::ConvertWeightArgsWordPenalty()
bool isChartDecoding = true;
if (!isParamSpecified("search-algorithm") ||
- (GetParam("search-algorithm").size() > 0
- && (Trim(GetParam("search-algorithm")[0]) == "0"
- ||Trim(GetParam("search-algorithm")[0]) == "1"
- )
- )
- ) {
+ (GetParam("search-algorithm").size() > 0
+ && (Trim(GetParam("search-algorithm")[0]) == "0"
+ ||Trim(GetParam("search-algorithm")[0]) == "1"
+ )
+ )
+ ) {
isChartDecoding = false;
}
PARAM_MAP::iterator iterMap;
iterMap = m_setting.find(oldWeightName);
- if (iterMap != m_setting.end())
- {
+ if (iterMap != m_setting.end()) {
const PARAM_VEC &weights = iterMap->second;
- for (size_t i = 0; i < weights.size(); ++i)
- {
+ for (size_t i = 0; i < weights.size(); ++i) {
float weight = Scan<float>(weights[i]);
if (isChartDecoding) {
weight *= 0.434294482;
@@ -800,8 +785,7 @@ void Parameter::ConvertWeightArgs()
(m_setting.count("weight-i") || m_setting.count("weight-t") || m_setting.count("weight-w") ||
m_setting.count("weight-l") || m_setting.count("weight-u") || m_setting.count("weight-lex") ||
m_setting.count("weight-generation") || m_setting.count("weight-lr") || m_setting.count("weight-d")
- ))
- {
+ )) {
cerr << "Do not mix old and new format for specify weights";
}
@@ -833,8 +817,7 @@ void Parameter::ConvertWeightArgs()
void Parameter::CreateWeightsMap()
{
PARAM_VEC &vec = m_setting["weight"];
- for (size_t i = 0; i < vec.size(); ++i)
- {
+ for (size_t i = 0; i < vec.size(); ++i) {
const string &line = vec[i];
vector<string> toks = Tokenize(line);
CHECK(toks.size() >= 2);
@@ -865,8 +848,7 @@ void Parameter::WeightOverwrite()
string name("");
vector<float> weights;
vector<string> toks = Tokenize(vec[0]);
- for (size_t i = 0; i < toks.size(); ++i)
- {
+ for (size_t i = 0; i < toks.size(); ++i) {
const string &tok = toks[i];
if (tok.substr(tok.size() - 1, 1) == "=") {
@@ -879,8 +861,7 @@ void Parameter::WeightOverwrite()
}
name = tok.substr(0, tok.size() - 1);
- }
- else {
+ } else {
// a weight for curr ff
float weight = Scan<float>(toks[i]);
weights.push_back(weight);
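
WeightOverwrite above parses a flat token list of the form "Name= w1 w2 Name2= w3": a token ending in '=' opens a new feature entry, and every other token is read as a weight for the current one. A self-contained sketch of that parse, with an illustrative function name:

    #include <cstdlib>
    #include <map>
    #include <sstream>
    #include <string>
    #include <vector>

    // ParseWeights("Distortion= 0.3 LM= 0.5 0.2") -> {Distortion:[0.3], LM:[0.5,0.2]}
    std::map<std::string, std::vector<float> > ParseWeights(const std::string& line) {
      std::map<std::string, std::vector<float> > out;
      std::istringstream in(line);
      std::string tok, name;
      while (in >> tok) {
        if (!tok.empty() && tok[tok.size() - 1] == '=')
          name = tok.substr(0, tok.size() - 1);   // new feature name
        else if (!name.empty())
          out[name].push_back(static_cast<float>(std::atof(tok.c_str())));
      }
      return out;
    }
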
@@ -899,14 +880,13 @@ bool Parameter::Validate()
PARAM_MAP::const_iterator iterParams;
for (iterParams = m_setting.begin(); iterParams != m_setting.end(); ++iterParams) {
const std::string &key = iterParams->first;
-
- if (m_valid.find(key) == m_valid.end())
- {
+
+ if (m_valid.find(key) == m_valid.end()) {
UserMessage::Add("Unknown parameter " + key);
noErrorFlag = false;
}
}
-
+
if (m_setting["lmodel-dub"].size() > 0) {
if (m_setting["lmodel-file"].size() != m_setting["lmodel-dub"].size()) {
stringstream errorMsg("");
@@ -1082,8 +1062,7 @@ bool Parameter::ReadConfigFile(const string &filePath )
if (line.size() == 0) {
// blank line. do nothing.
- }
- else if (line[0]=='[') {
+ } else if (line[0]=='[') {
// new parameter
for (size_t currPos = 0 ; currPos < line.size() ; currPos++) {
if (line[currPos] == ']') {
@@ -1227,23 +1206,23 @@ void Parameter::PrintCredit()
* \param values new values for paramName */
void Parameter::OverwriteParam(const string &paramName, PARAM_VEC values)
{
- VERBOSE(2,"Overwriting parameter " << paramName);
-
- m_setting[paramName]; // defines the parameter, important for boolean switches
- if (m_setting[paramName].size() > 1){
- VERBOSE(2," (the parameter had " << m_setting[paramName].size() << " previous values)");
- CHECK(m_setting[paramName].size() == values.size());
- }else{
- VERBOSE(2," (the parameter does not have previous values)");
- m_setting[paramName].resize(values.size());
- }
- VERBOSE(2," with the following values:");
- int i=0;
- for (PARAM_VEC::iterator iter = values.begin(); iter != values.end() ; iter++, i++){
- m_setting[paramName][i] = *iter;
- VERBOSE(2, " " << *iter);
- }
- VERBOSE(2, std::endl);
+ VERBOSE(2,"Overwriting parameter " << paramName);
+
+ m_setting[paramName]; // defines the parameter, important for boolean switches
+ if (m_setting[paramName].size() > 1) {
+ VERBOSE(2," (the parameter had " << m_setting[paramName].size() << " previous values)");
+ CHECK(m_setting[paramName].size() == values.size());
+ } else {
+ VERBOSE(2," (the parameter does not have previous values)");
+ m_setting[paramName].resize(values.size());
+ }
+ VERBOSE(2," with the following values:");
+ int i=0;
+ for (PARAM_VEC::iterator iter = values.begin(); iter != values.end() ; iter++, i++) {
+ m_setting[paramName][i] = *iter;
+ VERBOSE(2, " " << *iter);
+ }
+ VERBOSE(2, std::endl);
}
std::set<std::string> Parameter::GetWeightNames() const
@@ -1256,7 +1235,7 @@ std::set<std::string> Parameter::GetWeightNames() const
}
return ret;
}
-
+
}
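
A subtle point in the OverwriteParam hunk above: the bare statement m_setting[paramName]; is not dead code. std::map::operator[] inserts a default-constructed value for a missing key, which is how a boolean switch with no values still counts as specified. A minimal demonstration:

    #include <cassert>
    #include <map>
    #include <string>
    #include <vector>

    int main() {
      std::map<std::string, std::vector<std::string> > setting;
      setting["drop-unknown"];  // defines the key with an empty value list
      assert(setting.find("drop-unknown") != setting.end());
      assert(setting["drop-unknown"].empty());
      return 0;
    }
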
diff --git a/moses/Parameter.h b/moses/Parameter.h
index a78314692..a0c372a53 100644
--- a/moses/Parameter.h
+++ b/moses/Parameter.h
@@ -38,16 +38,16 @@ typedef std::map<std::string, std::string > PARAM_STRING;
/** Handles parameter values set in config file or on command line.
* Process raw parameter data (names and values as strings) for StaticData
- * to parse; to get useful values, see StaticData.
+ * to parse; to get useful values, see StaticData.
*/
class Parameter
{
protected:
- PARAM_MAP m_setting;
- PARAM_BOOL m_valid;
- PARAM_STRING m_abbreviation;
- PARAM_STRING m_description;
- PARAM_STRING m_fullname;
+ PARAM_MAP m_setting;
+ PARAM_BOOL m_valid;
+ PARAM_STRING m_abbreviation;
+ PARAM_STRING m_description;
+ PARAM_STRING m_fullname;
std::map<std::string, std::vector<float> > m_weights;
@@ -93,32 +93,30 @@ public:
bool isParamSpecified(const std::string &paramName) {
return m_setting.find( paramName ) != m_setting.end();
}
-
- const std::string GetFullName(std::string abbr)
- {
- return m_fullname[abbr];
- }
-
- const std::string GetAbbreviation(std::string full)
- {
- return m_abbreviation[full];
- }
- const PARAM_VEC &GetParamShortName(const std::string &paramName)
- {
- return GetParam(GetFullName(paramName));
- }
-
- void OverwriteParam(const std::string &paramName, PARAM_VEC values);
-
- void OverwriteParamShortName(const std::string &paramShortName, PARAM_VEC values){
- OverwriteParam(GetFullName(paramShortName),values);
- }
-
+
+ const std::string GetFullName(std::string abbr) {
+ return m_fullname[abbr];
+ }
+
+ const std::string GetAbbreviation(std::string full) {
+ return m_abbreviation[full];
+ }
+ const PARAM_VEC &GetParamShortName(const std::string &paramName) {
+ return GetParam(GetFullName(paramName));
+ }
+
+ void OverwriteParam(const std::string &paramName, PARAM_VEC values);
+
+ void OverwriteParamShortName(const std::string &paramShortName, PARAM_VEC values) {
+ OverwriteParam(GetFullName(paramShortName),values);
+ }
+
std::vector<float> &GetWeights(const std::string &name);
std::set<std::string> GetWeightNames() const;
- const PARAM_MAP &GetParams() const
- { return m_setting; }
+ const PARAM_MAP &GetParams() const {
+ return m_setting;
+ }
};
diff --git a/moses/PartialTranslOptColl.h b/moses/PartialTranslOptColl.h
index f4f40d413..5a4e816de 100644
--- a/moses/PartialTranslOptColl.h
+++ b/moses/PartialTranslOptColl.h
@@ -39,7 +39,7 @@ namespace Moses
* The expansion process itself may still explode, so efficient handling
* of partial translation options during expansion is required.
* This class assists in this tasks by implementing pruning.
- * This implementation is similar to the one in HypothesisStack.
+ * This implementation is similar to the one in HypothesisStack.
*/
class PartialTranslOptColl
{
diff --git a/moses/Phrase.cpp b/moses/Phrase.cpp
index ef5d09b23..3fa607fb4 100644
--- a/moses/Phrase.cpp
+++ b/moses/Phrase.cpp
@@ -103,16 +103,15 @@ Phrase Phrase::GetSubString(const WordsRange &wordsRange) const
Phrase Phrase::GetSubString(const WordsRange &wordsRange, FactorType factorType) const
{
- Phrase retPhrase(wordsRange.GetNumWordsCovered());
+ Phrase retPhrase(wordsRange.GetNumWordsCovered());
- for (size_t currPos = wordsRange.GetStartPos() ; currPos <= wordsRange.GetEndPos() ; currPos++)
- {
- const Factor* f = GetFactor(currPos, factorType);
- Word &word = retPhrase.AddWord();
- word.SetFactor(factorType, f);
- }
+ for (size_t currPos = wordsRange.GetStartPos() ; currPos <= wordsRange.GetEndPos() ; currPos++) {
+ const Factor* f = GetFactor(currPos, factorType);
+ Word &word = retPhrase.AddWord();
+ word.SetFactor(factorType, f);
+ }
- return retPhrase;
+ return retPhrase;
}
std::string Phrase::GetStringRep(const vector<FactorType> factorsToPrint) const
@@ -153,10 +152,10 @@ void Phrase::PrependWord(const Word &newWord)
}
void Phrase::CreateFromString(FactorDirection direction
- ,const std::vector<FactorType> &factorOrder
- ,const StringPiece &phraseString
- ,const StringPiece &factorDelimiter
- ,Word **lhs)
+ ,const std::vector<FactorType> &factorOrder
+ ,const StringPiece &phraseString
+ ,const StringPiece &factorDelimiter
+ ,Word **lhs)
{
// parse
vector<StringPiece> annotatedWordVector;
@@ -165,9 +164,9 @@ void Phrase::CreateFromString(FactorDirection direction
}
if (annotatedWordVector.size() == 0) {
- if (lhs) {
- (*lhs) = NULL;
- }
+ if (lhs) {
+ (*lhs) = NULL;
+ }
return;
}
@@ -188,8 +187,7 @@ void Phrase::CreateFromString(FactorDirection direction
(*lhs) = new Word(true);
(*lhs)->CreateFromString(direction, factorOrder, annotatedWord.substr(1, annotatedWord.size() - 2), true);
assert((*lhs)->IsNonTerminal());
- }
- else {
+ } else {
numWords = annotatedWordVector.size();
//CHECK(lhs == NULL);
if (lhs) {
diff --git a/moses/Phrase.h b/moses/Phrase.h
index 196e403ac..209f92f9e 100644
--- a/moses/Phrase.h
+++ b/moses/Phrase.h
@@ -69,13 +69,13 @@ public:
/** Fills phrase with words from format string, typically from phrase table or sentence input
* \param factorOrder factor types of each element in 2D string vector
* \param phraseString formatted input string to parse
- * \param factorDelimiter delimiter between factors.
+ * \param factorDelimiter delimiter between factors.
*/
void CreateFromString(FactorDirection direction
, const std::vector<FactorType> &factorOrder
- , const StringPiece &phraseString
- , const StringPiece &factorDelimiter
- , Word **lhs);
+ , const StringPiece &phraseString
+ , const StringPiece &factorDelimiter
+ , Word **lhs);
/** copy factors from the other phrase to this phrase.
IsCompatible() must be run beforehand to ensure incompatible factors aren't overwritten
@@ -127,52 +127,49 @@ public:
void AddWord(const Word &newWord) {
AddWord() = newWord;
}
-
- /** appends a phrase at the end of current phrase **/
- void Append(const Phrase &endPhrase);
- void PrependWord(const Word &newWord);
-
- void Clear()
- {
- m_words.clear();
- }
-
- void RemoveWord(size_t pos)
- {
- CHECK(pos < m_words.size());
- m_words.erase(m_words.begin() + pos);
- }
-
- //! create new phrase class that is a substring of this phrase
- Phrase GetSubString(const WordsRange &wordsRange) const;
+
+ /** appends a phrase at the end of current phrase **/
+ void Append(const Phrase &endPhrase);
+ void PrependWord(const Word &newWord);
+
+ void Clear() {
+ m_words.clear();
+ }
+
+ void RemoveWord(size_t pos) {
+ CHECK(pos < m_words.size());
+ m_words.erase(m_words.begin() + pos);
+ }
+
+ //! create new phrase class that is a substring of this phrase
+ Phrase GetSubString(const WordsRange &wordsRange) const;
Phrase GetSubString(const WordsRange &wordsRange, FactorType factorType) const;
-
- //! return a string rep of the phrase. Each factor is separated by the factor delimiter as specified in StaticData class
- std::string GetStringRep(const std::vector<FactorType> factorsToPrint) const;
-
- TO_STRING();
-
-
- int Compare(const Phrase &other) const;
-
- /** transitive comparison between 2 phrases
- * used to insert & find phrase in dictionary
- */
- bool operator< (const Phrase &compare) const
- {
- return Compare(compare) < 0;
- }
-
- bool operator== (const Phrase &compare) const
- {
- return Compare(compare) == 0;
- }
-
- void OnlyTheseFactors(const FactorMask &factors);
+
+ //! return a string rep of the phrase. Each factor is separated by the factor delimiter as specified in StaticData class
+ std::string GetStringRep(const std::vector<FactorType> factorsToPrint) const;
+
+ TO_STRING();
+
+
+ int Compare(const Phrase &other) const;
+
+ /** transitive comparison between 2 phrases
+ * used to insert & find phrase in dictionary
+ */
+ bool operator< (const Phrase &compare) const {
+ return Compare(compare) < 0;
+ }
+
+ bool operator== (const Phrase &compare) const {
+ return Compare(compare) == 0;
+ }
+
+ void OnlyTheseFactors(const FactorMask &factors);
};
-inline size_t hash_value(const Phrase& phrase) {
+inline size_t hash_value(const Phrase& phrase)
+{
size_t seed = 0;
for (size_t i = 0; i < phrase.GetSize(); ++i) {
boost::hash_combine(seed, phrase.GetWord(i));
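
The hash_value overload above folds each word of the phrase into a running seed with boost::hash_combine, the standard Boost idiom for hashing a sequence. The same idiom on a plain vector of strings, assuming Boost is available as it is throughout Moses:

    #include <boost/functional/hash.hpp>
    #include <string>
    #include <vector>

    // Order-sensitive hash of a word sequence.
    std::size_t HashWords(const std::vector<std::string>& words) {
      std::size_t seed = 0;
      for (std::size_t i = 0; i < words.size(); ++i)
        boost::hash_combine(seed, words[i]);
      return seed;
    }
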
diff --git a/moses/PrefixTree.h b/moses/PrefixTree.h
index 9cf1360e6..5b81ea175 100644
--- a/moses/PrefixTree.h
+++ b/moses/PrefixTree.h
@@ -63,7 +63,7 @@ public:
keys.insert(i,*b);
data.insert(data.begin()+pos,def);
- Self *self = NULL;
+ Self *self = NULL;
ptr.insert(ptr.begin()+pos, self);
}
if(++b!=e) {
diff --git a/moses/PrefixTreeMap.h b/moses/PrefixTreeMap.h
index fae875bd4..06066878d 100644
--- a/moses/PrefixTreeMap.h
+++ b/moses/PrefixTreeMap.h
@@ -59,7 +59,7 @@ private:
ScoreList m_ScoreList;
};
-
+
/** @todo How is this used in the pb binary phrase table?
*/
struct PPimp {
diff --git a/moses/RuleCube.h b/moses/RuleCube.h
index 05f9f1a24..d0c6ea66a 100644
--- a/moses/RuleCube.h
+++ b/moses/RuleCube.h
@@ -44,7 +44,7 @@ class ChartTranslationOptions;
*/
class RuleCubeItemScoreOrderer
{
- public:
+public:
bool operator()(const RuleCubeItem *p, const RuleCubeItem *q) const {
return p->GetScore() < q->GetScore();
}
@@ -56,7 +56,7 @@ class RuleCubeItemScoreOrderer
*/
class RuleCubeItemPositionOrderer
{
- public:
+public:
bool operator()(const RuleCubeItem *p, const RuleCubeItem *q) const {
return *p < *q;
}
@@ -66,7 +66,7 @@ class RuleCubeItemPositionOrderer
*/
class RuleCubeItemHasher
{
- public:
+public:
size_t operator()(const RuleCubeItem *p) const {
size_t seed = 0;
boost::hash_combine(seed, p->GetHypothesisDimensions());
@@ -79,7 +79,7 @@ class RuleCubeItemHasher
*/
class RuleCubeItemEqualityPred
{
- public:
+public:
bool operator()(const RuleCubeItem *p, const RuleCubeItem *q) const {
return p->GetHypothesisDimensions() == q->GetHypothesisDimensions() &&
p->GetTranslationDimension() == q->GetTranslationDimension();
@@ -90,7 +90,7 @@ class RuleCubeItemEqualityPred
*/
class RuleCube
{
- public:
+public:
RuleCube(const ChartTranslationOptions &, const ChartCellCollection &,
ChartManager &);
@@ -104,26 +104,28 @@ class RuleCube
RuleCubeItem *Pop(ChartManager &);
- bool IsEmpty() const { return m_queue.empty(); }
+ bool IsEmpty() const {
+ return m_queue.empty();
+ }
const ChartTranslationOptions &GetTranslationOption() const {
return m_transOpt;
}
- private:
+private:
#if defined(BOOST_VERSION) && (BOOST_VERSION >= 104200)
typedef boost::unordered_set<RuleCubeItem*,
- RuleCubeItemHasher,
- RuleCubeItemEqualityPred
- > ItemSet;
+ RuleCubeItemHasher,
+ RuleCubeItemEqualityPred
+ > ItemSet;
#else
typedef std::set<RuleCubeItem*, RuleCubeItemPositionOrderer> ItemSet;
#endif
typedef std::priority_queue<RuleCubeItem*,
- std::vector<RuleCubeItem*>,
- RuleCubeItemScoreOrderer
- > Queue;
+ std::vector<RuleCubeItem*>,
+ RuleCubeItemScoreOrderer
+ > Queue;
RuleCube(const RuleCube &); // Not implemented
RuleCube &operator=(const RuleCube &); // Not implemented
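The reindented classes in this file are comparators, hashers, and equality predicates handed to standard containers. RuleCubeItemScoreOrderer in particular defines "less" as "lower score", so the std::priority_queue it parameterises pops the highest-scoring item first. A self-contained sketch of that comparator shape, with a hypothetical Item type in place of RuleCubeItem:

#include <cstdio>
#include <queue>
#include <vector>

struct Item {                // hypothetical stand-in for RuleCubeItem
  float score;
};

// Same shape as RuleCubeItemScoreOrderer: a strict weak ordering on score.
struct ScoreOrderer {
  bool operator()(const Item *p, const Item *q) const {
    return p->score < q->score;   // "less" = lower score, so top() is the best
  }
};

int main()
{
  std::priority_queue<Item*, std::vector<Item*>, ScoreOrderer> q;
  Item a = {0.2f}, b = {0.9f}, c = {0.5f};
  q.push(&a);
  q.push(&b);
  q.push(&c);
  std::printf("%.1f\n", q.top()->score);  // prints 0.9: highest score first
  return 0;
}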
diff --git a/moses/RuleCubeItem.h b/moses/RuleCubeItem.h
index 612079172..75669598b 100644
--- a/moses/RuleCubeItem.h
+++ b/moses/RuleCubeItem.h
@@ -39,14 +39,16 @@ typedef std::vector<const ChartHypothesis*> HypoList;
*/
class TranslationDimension
{
- public:
+public:
TranslationDimension(std::size_t pos,
const std::vector<TargetPhrase*> &orderedTargetPhrases)
: m_pos(pos)
, m_orderedTargetPhrases(&orderedTargetPhrases)
{}
- std::size_t IncrementPos() { return m_pos++; }
+ std::size_t IncrementPos() {
+ return m_pos++;
+ }
bool HasMoreTranslations() const {
return m_pos+1 < m_orderedTargetPhrases->size();
@@ -64,7 +66,7 @@ class TranslationDimension
return GetTargetPhrase() == compare.GetTargetPhrase();
}
- private:
+private:
std::size_t m_pos;
const std::vector<TargetPhrase*> *m_orderedTargetPhrases;
};
@@ -81,7 +83,9 @@ public:
, m_orderedHypos(&orderedHypos)
{}
- std::size_t IncrementPos() { return m_pos++; }
+ std::size_t IncrementPos() {
+ return m_pos++;
+ }
bool HasMoreHypo() const {
return m_pos+1 < m_orderedHypos->size();
@@ -109,7 +113,7 @@ std::size_t hash_value(const HypothesisDimension &);
/** @todo How is this used. Split out into separate source file */
class RuleCubeItem
{
- public:
+public:
RuleCubeItem(const ChartTranslationOptions &, const ChartCellCollection &);
RuleCubeItem(const RuleCubeItem &, int);
~RuleCubeItem();
@@ -122,7 +126,9 @@ class RuleCubeItem
return m_hypothesisDimensions;
}
- float GetScore() const { return m_score; }
+ float GetScore() const {
+ return m_score;
+ }
void EstimateScore();
@@ -132,7 +138,7 @@ class RuleCubeItem
bool operator<(const RuleCubeItem &) const;
- private:
+private:
RuleCubeItem(const RuleCubeItem &); // Not implemented
RuleCubeItem &operator=(const RuleCubeItem &); // Not implemented
diff --git a/moses/RuleCubeQueue.h b/moses/RuleCubeQueue.h
index 9763b3877..ae4d20be0 100644
--- a/moses/RuleCubeQueue.h
+++ b/moses/RuleCubeQueue.h
@@ -36,7 +36,7 @@ class ChartManager;
*/
class RuleCubeOrderer
{
- public:
+public:
bool operator()(const RuleCube *p, const RuleCube *q) const {
return p->GetTopScore() < q->GetTopScore();
}
@@ -45,17 +45,19 @@ class RuleCubeOrderer
/** @todo how is this used */
class RuleCubeQueue
{
- public:
+public:
RuleCubeQueue(ChartManager &manager) : m_manager(manager) {}
~RuleCubeQueue();
void Add(RuleCube *);
ChartHypothesis *Pop();
- bool IsEmpty() const { return m_queue.empty(); }
+ bool IsEmpty() const {
+ return m_queue.empty();
+ }
- private:
+private:
typedef std::priority_queue<RuleCube*, std::vector<RuleCube*>,
- RuleCubeOrderer > Queue;
+ RuleCubeOrderer > Queue;
Queue m_queue;
ChartManager &m_manager;
diff --git a/moses/ScoreComponentCollection.cpp b/moses/ScoreComponentCollection.cpp
index c836ea5b3..44f08b316 100644
--- a/moses/ScoreComponentCollection.cpp
+++ b/moses/ScoreComponentCollection.cpp
@@ -17,7 +17,7 @@ ScoreComponentCollection::ScoreComponentCollection() : m_scores(s_denseVectorSiz
void ScoreComponentCollection::RegisterScoreProducer
- (const FeatureFunction* scoreProducer)
+(const FeatureFunction* scoreProducer)
{
size_t start = s_denseVectorSize;
size_t end = start + scoreProducer->GetNumScoreComponents();
@@ -29,56 +29,58 @@ void ScoreComponentCollection::RegisterScoreProducer
float ScoreComponentCollection::GetWeightedScore() const
{
- return m_scores.inner_product(StaticData::Instance().GetAllWeights().m_scores);
+ return m_scores.inner_product(StaticData::Instance().GetAllWeights().m_scores);
}
void ScoreComponentCollection::MultiplyEquals(float scalar)
{
- m_scores *= scalar;
+ m_scores *= scalar;
}
// Multiply all weights of this sparse producer by a given scalar
-void ScoreComponentCollection::MultiplyEquals(const FeatureFunction* sp, float scalar) {
+void ScoreComponentCollection::MultiplyEquals(const FeatureFunction* sp, float scalar)
+{
std::string prefix = sp->GetScoreProducerDescription() + FName::SEP;
for(FVector::FNVmap::const_iterator i = m_scores.cbegin(); i != m_scores.cend(); i++) {
std::stringstream name;
name << i->first;
if (name.str().substr( 0, prefix.length() ).compare( prefix ) == 0)
- m_scores[i->first] = i->second * scalar;
+ m_scores[i->first] = i->second * scalar;
}
}
// Count weights belonging to this sparse producer
-size_t ScoreComponentCollection::GetNumberWeights(const FeatureFunction* sp) {
+size_t ScoreComponentCollection::GetNumberWeights(const FeatureFunction* sp)
+{
std::string prefix = sp->GetScoreProducerDescription() + FName::SEP;
size_t weights = 0;
for(FVector::FNVmap::const_iterator i = m_scores.cbegin(); i != m_scores.cend(); i++) {
std::stringstream name;
name << i->first;
if (name.str().substr( 0, prefix.length() ).compare( prefix ) == 0)
- weights++;
+ weights++;
}
return weights;
}
void ScoreComponentCollection::DivideEquals(float scalar)
{
- m_scores /= scalar;
+ m_scores /= scalar;
}
void ScoreComponentCollection::CoreDivideEquals(float scalar)
{
- m_scores.coreDivideEquals(scalar);
+ m_scores.coreDivideEquals(scalar);
}
void ScoreComponentCollection::DivideEquals(const ScoreComponentCollection& rhs)
{
- m_scores.divideEquals(rhs.m_scores);
+ m_scores.divideEquals(rhs.m_scores);
}
void ScoreComponentCollection::MultiplyEquals(const ScoreComponentCollection& rhs)
{
- m_scores *= rhs.m_scores;
+ m_scores *= rhs.m_scores;
}
void ScoreComponentCollection::MultiplyEqualsBackoff(const ScoreComponentCollection& rhs, float backoff)
@@ -93,42 +95,51 @@ void ScoreComponentCollection::MultiplyEquals(float core_r0, float sparse_r0)
std::ostream& operator<<(std::ostream& os, const ScoreComponentCollection& rhs)
{
- os << rhs.m_scores;
- return os;
+ os << rhs.m_scores;
+ return os;
}
-void ScoreComponentCollection::L1Normalise() {
+void ScoreComponentCollection::L1Normalise()
+{
m_scores /= m_scores.l1norm_coreFeatures();
}
-float ScoreComponentCollection::GetL1Norm() const {
+float ScoreComponentCollection::GetL1Norm() const
+{
return m_scores.l1norm();
}
-float ScoreComponentCollection::GetL2Norm() const {
+float ScoreComponentCollection::GetL2Norm() const
+{
return m_scores.l2norm();
}
-float ScoreComponentCollection::GetLInfNorm() const {
+float ScoreComponentCollection::GetLInfNorm() const
+{
return m_scores.linfnorm();
}
-size_t ScoreComponentCollection::L1Regularize(float lambda) {
+size_t ScoreComponentCollection::L1Regularize(float lambda)
+{
return m_scores.l1regularize(lambda);
}
-void ScoreComponentCollection::L2Regularize(float lambda) {
+void ScoreComponentCollection::L2Regularize(float lambda)
+{
m_scores.l2regularize(lambda);
}
-size_t ScoreComponentCollection::SparseL1Regularize(float lambda) {
+size_t ScoreComponentCollection::SparseL1Regularize(float lambda)
+{
return m_scores.sparseL1regularize(lambda);
}
-void ScoreComponentCollection::SparseL2Regularize(float lambda) {
+void ScoreComponentCollection::SparseL2Regularize(float lambda)
+{
m_scores.sparseL2regularize(lambda);
}
-void ScoreComponentCollection::Save(ostream& out) const {
+void ScoreComponentCollection::Save(ostream& out) const
+{
ScoreIndexMap::const_iterator iter = s_scoreIndexes.begin();
for (; iter != s_scoreIndexes.end(); ++iter ) {
string name = iter->first->GetScoreProducerDescription();
@@ -148,7 +159,8 @@ void ScoreComponentCollection::Save(ostream& out) const {
m_scores.write(out);
}
-void ScoreComponentCollection::Save(const string& filename) const {
+void ScoreComponentCollection::Save(const string& filename) const
+{
ofstream out(filename.c_str());
if (!out) {
ostringstream msg;
@@ -159,7 +171,8 @@ void ScoreComponentCollection::Save(const string& filename) const {
out.close();
}
-void ScoreComponentCollection::Assign(const FeatureFunction* sp, const string line) {
+void ScoreComponentCollection::Assign(const FeatureFunction* sp, const string line)
+{
istringstream istr(line);
while(istr) {
string namestring;
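The re-wrapped MultiplyEquals(sp, scalar) and GetNumberWeights above select a producer's sparse features purely by name: an entry belongs to a producer when its feature name begins with the producer description followed by the separator. A sketch of that prefix test, assuming a plain std::map and hypothetical names in place of FVector and FName::SEP:

#include <cstddef>
#include <iostream>
#include <map>
#include <string>

int main()
{
  const std::string SEP = "_";                // stand-in for FName::SEP
  const std::string producer = "WordTrans";   // stand-in for GetScoreProducerDescription()

  std::map<std::string, float> scores;
  scores["WordTrans_house|haus"] = 0.5f;
  scores["WordTrans_cat|katze"]  = 0.25f;
  scores["PhrasePair_x|y"]       = 1.0f;

  const std::string prefix = producer + SEP;
  std::size_t owned = 0;
  for (std::map<std::string, float>::const_iterator i = scores.begin();
       i != scores.end(); ++i) {
    // Same test as the diff: does the name start with "<producer><SEP>"?
    if (i->first.compare(0, prefix.length(), prefix) == 0)
      ++owned;
  }
  std::cout << owned << " features belong to " << producer << std::endl;  // 2
  return 0;
}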
diff --git a/moses/ScoreComponentCollection.h b/moses/ScoreComponentCollection.h
index e76c9d06b..70c2a05f1 100644
--- a/moses/ScoreComponentCollection.h
+++ b/moses/ScoreComponentCollection.h
@@ -64,19 +64,18 @@ class ScoreComponentCollection
{
friend std::ostream& operator<<(std::ostream& os, const ScoreComponentCollection& rhs);
private:
- FVector m_scores;
+ FVector m_scores;
typedef std::pair<size_t,size_t> IndexPair;
typedef std::map<const FeatureFunction*,IndexPair> ScoreIndexMap;
static ScoreIndexMap s_scoreIndexes;
static size_t s_denseVectorSize;
- static IndexPair GetIndexes(const FeatureFunction* sp)
- {
+ static IndexPair GetIndexes(const FeatureFunction* sp) {
ScoreIndexMap::const_iterator indexIter = s_scoreIndexes.find(sp);
if (indexIter == s_scoreIndexes.end()) {
std::cerr << "ERROR: FeatureFunction: " << sp->GetScoreProducerDescription() <<
- " not registered with ScoreIndexMap" << std::endl;
+ " not registered with ScoreIndexMap" << std::endl;
std::cerr << "You must call ScoreComponentCollection.RegisterScoreProducer() " <<
- " for every FeatureFunction" << std::endl;
+ " for every FeatureFunction" << std::endl;
abort();
}
return indexIter->second;
@@ -91,9 +90,9 @@ public:
ScoreComponentCollection();
//! Clone a score collection
- ScoreComponentCollection(const ScoreComponentCollection& rhs)
- : m_scores(rhs.m_scores)
- {}
+ ScoreComponentCollection(const ScoreComponentCollection& rhs)
+ : m_scores(rhs.m_scores)
+ {}
ScoreComponentCollection& operator=( const ScoreComponentCollection& rhs ) {
m_scores = rhs.m_scores;
@@ -101,124 +100,108 @@ public:
}
/**
- * Register a ScoreProducer with a fixed number of scores, so that it can
+ * Register a ScoreProducer with a fixed number of scores, so that it can
* be allocated space in the dense part of the feature vector.
**/
static void RegisterScoreProducer(const FeatureFunction* scoreProducer);
/** Load from file */
- bool Load(const std::string& filename)
- {
- return m_scores.load(filename);
+ bool Load(const std::string& filename) {
+ return m_scores.load(filename);
}
- const FVector& GetScoresVector() const
- {
- return m_scores;
+ const FVector& GetScoresVector() const {
+ return m_scores;
}
const std::valarray<FValue> &getCoreFeatures() const {
return m_scores.getCoreFeatures();
}
- size_t Size() const
- {
- return m_scores.size();
+ size_t Size() const {
+ return m_scores.size();
}
- void Resize()
- {
+ void Resize() {
if (m_scores.coreSize() != s_denseVectorSize) {
m_scores.resize(s_denseVectorSize);
}
}
/** Create and FVector with the right number of core features */
- static FVector CreateFVector()
- {
+ static FVector CreateFVector() {
return FVector(s_denseVectorSize);
}
- void SetToBinaryOf(const ScoreComponentCollection& rhs)
- {
- m_scores.setToBinaryOf(rhs.m_scores);
+ void SetToBinaryOf(const ScoreComponentCollection& rhs) {
+ m_scores.setToBinaryOf(rhs.m_scores);
}
//! Set all values to 0.0
- void ZeroAll()
- {
- m_scores.clear();
- }
-
- void MultiplyEquals(float scalar);
- void DivideEquals(float scalar);
- void CoreDivideEquals(float scalar);
- void DivideEquals(const ScoreComponentCollection& rhs);
- void MultiplyEquals(const ScoreComponentCollection& rhs);
- void MultiplyEqualsBackoff(const ScoreComponentCollection& rhs, float backoff);
- void MultiplyEquals(float core_r0, float sparse_r0);
- void MultiplyEquals(const FeatureFunction* sp, float scalar);
-
- size_t GetNumberWeights(const FeatureFunction* sp);
-
- void CoreAssign(const ScoreComponentCollection& rhs)
- {
- m_scores.coreAssign(rhs.m_scores);
- }
-
- //! add the score in rhs
- void PlusEquals(const ScoreComponentCollection& rhs)
- {
- m_scores += rhs.m_scores;
- }
-
- // add only sparse features
- void SparsePlusEquals(const ScoreComponentCollection& rhs)
- {
- m_scores.sparsePlusEquals(rhs.m_scores);
- }
-
- void PlusEquals(const FVector& scores)
- {
- m_scores += scores;
- }
-
- //! subtract the score in rhs
- void MinusEquals(const ScoreComponentCollection& rhs)
- {
- m_scores -= rhs.m_scores;
- }
+ void ZeroAll() {
+ m_scores.clear();
+ }
+
+ void MultiplyEquals(float scalar);
+ void DivideEquals(float scalar);
+ void CoreDivideEquals(float scalar);
+ void DivideEquals(const ScoreComponentCollection& rhs);
+ void MultiplyEquals(const ScoreComponentCollection& rhs);
+ void MultiplyEqualsBackoff(const ScoreComponentCollection& rhs, float backoff);
+ void MultiplyEquals(float core_r0, float sparse_r0);
+ void MultiplyEquals(const FeatureFunction* sp, float scalar);
+
+ size_t GetNumberWeights(const FeatureFunction* sp);
+
+ void CoreAssign(const ScoreComponentCollection& rhs) {
+ m_scores.coreAssign(rhs.m_scores);
+ }
+
+ //! add the score in rhs
+ void PlusEquals(const ScoreComponentCollection& rhs) {
+ m_scores += rhs.m_scores;
+ }
+
+ // add only sparse features
+ void SparsePlusEquals(const ScoreComponentCollection& rhs) {
+ m_scores.sparsePlusEquals(rhs.m_scores);
+ }
+
+ void PlusEquals(const FVector& scores) {
+ m_scores += scores;
+ }
+
+ //! subtract the score in rhs
+ void MinusEquals(const ScoreComponentCollection& rhs) {
+ m_scores -= rhs.m_scores;
+ }
//For features which have an unbounded number of components
- void MinusEquals(const FeatureFunction*sp, const std::string& name, float score)
- {
+ void MinusEquals(const FeatureFunction*sp, const std::string& name, float score) {
FName fname(sp->GetScoreProducerDescription(),name);
m_scores[fname] -= score;
}
//For features which have an unbounded number of components
- void SparseMinusEquals(const std::string& full_name, float score)
- {
+ void SparseMinusEquals(const std::string& full_name, float score) {
FName fname(full_name);
m_scores[fname] -= score;
}
- //! Add scores from a single ScoreProducer only
- //! The length of scores must be equal to the number of score components
- //! produced by sp
- void PlusEquals(const FeatureFunction* sp, const ScoreComponentCollection& scores)
- {
+ //! Add scores from a single ScoreProducer only
+ //! The length of scores must be equal to the number of score components
+ //! produced by sp
+ void PlusEquals(const FeatureFunction* sp, const ScoreComponentCollection& scores) {
IndexPair indexes = GetIndexes(sp);
for (size_t i = indexes.first; i < indexes.second; ++i) {
m_scores[i] += scores.m_scores[i];
}
- }
+ }
- //! Add scores from a single FeatureFunction only
- //! The length of scores must be equal to the number of score components
- //! produced by sp
- void PlusEquals(const FeatureFunction* sp, const std::vector<float>& scores)
- {
+ //! Add scores from a single FeatureFunction only
+ //! The length of scores must be equal to the number of score components
+ //! produced by sp
+ void PlusEquals(const FeatureFunction* sp, const std::vector<float>& scores) {
IndexPair indexes = GetIndexes(sp);
CHECK(scores.size() == indexes.second - indexes.first);
for (size_t i = 0; i < scores.size(); ++i) {
@@ -226,56 +209,50 @@ public:
}
}
- //! Special version PlusEquals(ScoreProducer, vector<float>)
- //! to add the score from a single ScoreProducer that produces
- //! a single value
- void PlusEquals(const FeatureFunction* sp, float score)
- {
+ //! Special version PlusEquals(ScoreProducer, vector<float>)
+ //! to add the score from a single ScoreProducer that produces
+ //! a single value
+ void PlusEquals(const FeatureFunction* sp, float score) {
IndexPair indexes = GetIndexes(sp);
CHECK(1 == indexes.second - indexes.first);
m_scores[indexes.first] += score;
- }
+ }
//For features which have an unbounded number of components
- void PlusEquals(const FeatureFunction*sp, const StringPiece& name, float score)
- {
+ void PlusEquals(const FeatureFunction*sp, const StringPiece& name, float score) {
FName fname(sp->GetScoreProducerDescription(),name);
m_scores[fname] += score;
}
//For features which have an unbounded number of components
- void SparsePlusEquals(const std::string& full_name, float score)
- {
- FName fname(full_name);
+ void SparsePlusEquals(const std::string& full_name, float score) {
+ FName fname(full_name);
m_scores[fname] += score;
}
- void Assign(const FeatureFunction* sp, const std::vector<float>& scores)
- {
+ void Assign(const FeatureFunction* sp, const std::vector<float>& scores) {
IndexPair indexes = GetIndexes(sp);
CHECK(scores.size() == indexes.second - indexes.first);
for (size_t i = 0; i < scores.size(); ++i) {
m_scores[i + indexes.first] = scores[i];
}
}
-
+
//! Special version Assign(ScoreProducer, vector<float>)
//! to add the score from a single ScoreProducer that produces
//! a single value
- void Assign(const FeatureFunction* sp, float score)
- {
+ void Assign(const FeatureFunction* sp, float score) {
IndexPair indexes = GetIndexes(sp);
CHECK(1 == indexes.second - indexes.first);
m_scores[indexes.first] = score;
}
-
+
// Assign core weight by index
void Assign(size_t index, float score) {
m_scores[index] = score;
}
- void Assign(const FeatureFunction*sp, const StringPiece &name, float score)
- {
+ void Assign(const FeatureFunction*sp, const StringPiece &name, float score) {
FName fname(sp->GetScoreProducerDescription(),name);
m_scores[fname] = score;
}
@@ -285,27 +262,23 @@ public:
void Assign(const FeatureFunction* sp, const std::string line);
// shortcut: setting the value directly using the feature name
- void Assign(const std::string name, float score)
- {
- FName fname(name);
- m_scores[fname] = score;
- }
-
- float InnerProduct(const ScoreComponentCollection& rhs) const
- {
- return m_scores.inner_product(rhs.m_scores);
- }
-
- float PartialInnerProduct(const FeatureFunction* sp, const std::vector<float>& rhs) const
- {
- std::vector<float> lhs = GetScoresForProducer(sp);
- CHECK(lhs.size() == rhs.size());
- return std::inner_product(lhs.begin(), lhs.end(), rhs.begin(), 0.0f);
- }
-
- //! return a vector of all the scores associated with a certain FeatureFunction
- std::vector<float> GetScoresForProducer(const FeatureFunction* sp) const
- {
+ void Assign(const std::string name, float score) {
+ FName fname(name);
+ m_scores[fname] = score;
+ }
+
+ float InnerProduct(const ScoreComponentCollection& rhs) const {
+ return m_scores.inner_product(rhs.m_scores);
+ }
+
+ float PartialInnerProduct(const FeatureFunction* sp, const std::vector<float>& rhs) const {
+ std::vector<float> lhs = GetScoresForProducer(sp);
+ CHECK(lhs.size() == rhs.size());
+ return std::inner_product(lhs.begin(), lhs.end(), rhs.begin(), 0.0f);
+ }
+
+ //! return a vector of all the scores associated with a certain FeatureFunction
+ std::vector<float> GetScoresForProducer(const FeatureFunction* sp) const {
size_t components = sp->GetNumScoreComponents();
std::vector<float> res(components);
@@ -314,58 +287,52 @@ public:
res[i] = m_scores[i + indexes.first];
}
return res;
- }
+ }
//! get subset of scores that belong to a certain sparse ScoreProducer
FVector GetVectorForProducer(const FeatureFunction* sp) const;
- float GetSparseWeight(const FName& featureName) const
- {
+ float GetSparseWeight(const FName& featureName) const {
return m_scores[featureName];
}
-
+
void PrintCoreFeatures() {
m_scores.printCoreFeatures();
}
- void ThresholdScaling(float maxValue)
- {
- // find (smallest) factor for which all weights are <= maxValue
- // 0.1 / 0.14 = 0.714285714
- // 0.1 / 0.17 = 0.588235294
+ void ThresholdScaling(float maxValue) {
+ // find (smallest) factor for which all weights are <= maxValue
+ // 0.1 / 0.14 = 0.714285714
+ // 0.1 / 0.17 = 0.588235294
m_scores.thresholdScale(maxValue);
- }
-
- void CapMax(float maxValue)
- {
- // cap all sparse features to maxValue
- m_scores.capMax(maxValue);
- }
-
- void CapMin(float minValue)
- {
- // cap all sparse features to minValue
- m_scores.capMin(minValue);
- }
-
- //! if a FeatureFunction produces a single score (for example, a language model score)
- //! this will return it. If not, this method will throw
- float GetScoreForProducer(const FeatureFunction* sp) const
- {
+ }
+
+ void CapMax(float maxValue) {
+ // cap all sparse features to maxValue
+ m_scores.capMax(maxValue);
+ }
+
+ void CapMin(float minValue) {
+ // cap all sparse features to minValue
+ m_scores.capMin(minValue);
+ }
+
+ //! if a FeatureFunction produces a single score (for example, a language model score)
+ //! this will return it. If not, this method will throw
+ float GetScoreForProducer(const FeatureFunction* sp) const {
IndexPair indexes = GetIndexes(sp);
CHECK(indexes.second - indexes.first == 1);
return m_scores[indexes.first];
- }
+ }
//For features which have an unbounded number of components
float GetScoreForProducer
- (const FeatureFunction* sp, const std::string& name) const
- {
+ (const FeatureFunction* sp, const std::string& name) const {
FName fname(sp->GetScoreProducerDescription(),name);
return m_scores[fname];
}
- float GetWeightedScore() const;
+ float GetWeightedScore() const;
void ZeroDenseFeatures(const FeatureFunction* sp);
void L1Normalise();
@@ -378,45 +345,65 @@ public:
void SparseL2Regularize(float lambda);
void Save(const std::string& filename) const;
void Save(std::ostream&) const;
-
- void IncrementSparseHopeFeatures() { m_scores.incrementSparseHopeFeatures(); }
- void IncrementSparseFearFeatures() { m_scores.incrementSparseFearFeatures(); }
- void PrintSparseHopeFeatureCounts(std::ofstream& out) { m_scores.printSparseHopeFeatureCounts(out); }
- void PrintSparseFearFeatureCounts(std::ofstream& out) { m_scores.printSparseFearFeatureCounts(out); }
- void PrintSparseHopeFeatureCounts() { m_scores.printSparseHopeFeatureCounts(); }
- void PrintSparseFearFeatureCounts() { m_scores.printSparseFearFeatureCounts(); }
- size_t PruneSparseFeatures(size_t threshold) { return m_scores.pruneSparseFeatures(threshold); }
- size_t PruneZeroWeightFeatures() { return m_scores.pruneZeroWeightFeatures(); }
- void UpdateConfidenceCounts(ScoreComponentCollection &weightUpdate, bool signedCounts) { m_scores.updateConfidenceCounts(weightUpdate.m_scores, signedCounts); }
- void UpdateLearningRates(float decay_core, float decay_sparse, ScoreComponentCollection &confidenceCounts, float core_r0, float sparse_r0) { m_scores.updateLearningRates(decay_core, decay_sparse, confidenceCounts.m_scores, core_r0, sparse_r0); }
+
+ void IncrementSparseHopeFeatures() {
+ m_scores.incrementSparseHopeFeatures();
+ }
+ void IncrementSparseFearFeatures() {
+ m_scores.incrementSparseFearFeatures();
+ }
+ void PrintSparseHopeFeatureCounts(std::ofstream& out) {
+ m_scores.printSparseHopeFeatureCounts(out);
+ }
+ void PrintSparseFearFeatureCounts(std::ofstream& out) {
+ m_scores.printSparseFearFeatureCounts(out);
+ }
+ void PrintSparseHopeFeatureCounts() {
+ m_scores.printSparseHopeFeatureCounts();
+ }
+ void PrintSparseFearFeatureCounts() {
+ m_scores.printSparseFearFeatureCounts();
+ }
+ size_t PruneSparseFeatures(size_t threshold) {
+ return m_scores.pruneSparseFeatures(threshold);
+ }
+ size_t PruneZeroWeightFeatures() {
+ return m_scores.pruneZeroWeightFeatures();
+ }
+ void UpdateConfidenceCounts(ScoreComponentCollection &weightUpdate, bool signedCounts) {
+ m_scores.updateConfidenceCounts(weightUpdate.m_scores, signedCounts);
+ }
+ void UpdateLearningRates(float decay_core, float decay_sparse, ScoreComponentCollection &confidenceCounts, float core_r0, float sparse_r0) {
+ m_scores.updateLearningRates(decay_core, decay_sparse, confidenceCounts.m_scores, core_r0, sparse_r0);
+ }
#ifdef MPI_ENABLE
- public:
- friend class boost::serialization::access;
-
- private:
- //serialization
- template<class Archive>
- void save(Archive &ar, const unsigned int version) const {
- ar << m_scores;
- }
-
- template<class Archive>
- void load(Archive &ar, const unsigned int version) {
- ar >> m_scores;
+public:
+ friend class boost::serialization::access;
- }
+private:
+ //serialization
+ template<class Archive>
+ void save(Archive &ar, const unsigned int version) const {
+ ar << m_scores;
+ }
+
+ template<class Archive>
+ void load(Archive &ar, const unsigned int version) {
+ ar >> m_scores;
+
+ }
+
+ BOOST_SERIALIZATION_SPLIT_MEMBER()
- BOOST_SERIALIZATION_SPLIT_MEMBER()
-
#endif
};
struct SCCPlus {
ScoreComponentCollection operator()
- (const ScoreComponentCollection& lhs,
- const ScoreComponentCollection& rhs) {
+ (const ScoreComponentCollection& lhs,
+ const ScoreComponentCollection& rhs) {
ScoreComponentCollection sum(lhs);
sum.PlusEquals(rhs);
return sum;
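The header reflow above leaves the dense/sparse split intact: RegisterScoreProducer hands each FeatureFunction a contiguous [start, end) slice of the dense vector, and GetIndexes (now with the opening brace on the signature line) looks that slice up again for PlusEquals and Assign. A minimal sketch of that allocation scheme, with a hypothetical Producer type standing in for FeatureFunction:

#include <cstddef>
#include <iostream>
#include <map>
#include <utility>
#include <vector>

struct Producer {                 // hypothetical stand-in for FeatureFunction
  const char *name;
  std::size_t numScores;
};

typedef std::pair<std::size_t, std::size_t> IndexPair;

static std::map<const Producer*, IndexPair> s_scoreIndexes;
static std::size_t s_denseVectorSize = 0;

// Same allocation scheme as RegisterScoreProducer: each producer gets the
// next contiguous [start, end) slice of the dense vector.
void Register(const Producer *p)
{
  std::size_t start = s_denseVectorSize;
  std::size_t end = start + p->numScores;
  s_scoreIndexes[p] = IndexPair(start, end);
  s_denseVectorSize = end;
}

int main()
{
  Producer lm = {"LM", 1}, dist = {"Distortion", 1}, tm = {"TM", 5};
  Register(&lm);
  Register(&dist);
  Register(&tm);

  std::vector<float> dense(s_denseVectorSize, 0.0f);
  IndexPair slice = s_scoreIndexes[&tm];
  std::cout << "TM owns [" << slice.first << ", " << slice.second
            << ") of " << dense.size() << " dense slots" << std::endl;  // [2, 7) of 7
  return 0;
}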
diff --git a/moses/ScoreComponentCollectionTest.cpp b/moses/ScoreComponentCollectionTest.cpp
index f0813f4e8..41fa6562f 100644
--- a/moses/ScoreComponentCollectionTest.cpp
+++ b/moses/ScoreComponentCollectionTest.cpp
@@ -29,31 +29,35 @@ using namespace std;
BOOST_AUTO_TEST_SUITE(scc)
-class MockStatelessFeatureFunction : public StatelessFeatureFunction {
- public:
- MockStatelessFeatureFunction(const string& desc, size_t n, const string &line) :
- StatelessFeatureFunction(desc,n, line) {}
- virtual void Evaluate(const PhraseBasedFeatureContext&, ScoreComponentCollection*) const {}
- virtual void EvaluateChart(const ChartBasedFeatureContext&, ScoreComponentCollection*) const {}
- virtual void Evaluate(const TargetPhrase &targetPhrase
+class MockStatelessFeatureFunction : public StatelessFeatureFunction
+{
+public:
+ MockStatelessFeatureFunction(const string& desc, size_t n, const string &line) :
+ StatelessFeatureFunction(desc,n, line) {}
+ virtual void Evaluate(const PhraseBasedFeatureContext&, ScoreComponentCollection*) const {}
+ virtual void EvaluateChart(const ChartBasedFeatureContext&, ScoreComponentCollection*) const {}
+ virtual void Evaluate(const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
- { }
+ { }
};
-class MockSingleFeature : public MockStatelessFeatureFunction {
- public:
- MockSingleFeature(): MockStatelessFeatureFunction("MockSingle",1, "MockSingle") {}
+class MockSingleFeature : public MockStatelessFeatureFunction
+{
+public:
+ MockSingleFeature(): MockStatelessFeatureFunction("MockSingle",1, "MockSingle") {}
};
-class MockMultiFeature : public MockStatelessFeatureFunction {
- public:
- MockMultiFeature(): MockStatelessFeatureFunction("MockMulti", 5, "MockMulti") {}
+class MockMultiFeature : public MockStatelessFeatureFunction
+{
+public:
+ MockMultiFeature(): MockStatelessFeatureFunction("MockMulti", 5, "MockMulti") {}
};
-class MockSparseFeature : public MockStatelessFeatureFunction {
- public:
- MockSparseFeature(): MockStatelessFeatureFunction("MockSparse", 0, "MockSparse") {}
+class MockSparseFeature : public MockStatelessFeatureFunction
+{
+public:
+ MockSparseFeature(): MockStatelessFeatureFunction("MockSparse", 0, "MockSparse") {}
};
@@ -66,7 +70,7 @@ struct MockProducers {
MockSparseFeature sparse;
};
-BOOST_FIXTURE_TEST_CASE(ctor, MockProducers)
+BOOST_FIXTURE_TEST_CASE(ctor, MockProducers)
{
ScoreComponentCollection scc;
BOOST_CHECK_EQUAL(scc.GetScoreForProducer(&single),0);
@@ -88,11 +92,11 @@ BOOST_FIXTURE_TEST_CASE(plusequals, MockProducers)
scc.PlusEquals(&multi,vec1);
std::vector<float> actual = scc.GetScoresForProducer(&multi);
BOOST_CHECK_EQUAL_COLLECTIONS(vec1.begin(),vec1.end()
- ,actual.begin(), actual.end());
+ ,actual.begin(), actual.end());
scc.PlusEquals(&multi,vec1);
actual = scc.GetScoresForProducer(&multi);
BOOST_CHECK_EQUAL_COLLECTIONS(vec2.begin(),vec2.end(),
- actual.begin(), actual.end());
+ actual.begin(), actual.end());
BOOST_CHECK_EQUAL(scc.GetScoreForProducer(&single), 3.4f);
}
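The test reflow keeps the Boost.Test fixture pattern: BOOST_FIXTURE_TEST_CASE constructs the fixture (here MockProducers) fresh for each case, and BOOST_CHECK_EQUAL_COLLECTIONS compares two ranges element by element. A self-contained sketch of the same pattern with a hypothetical fixture, using the header-only "included" runner:

#define BOOST_TEST_MODULE sketch
#include <boost/test/included/unit_test.hpp>
#include <vector>

struct Fixture {                    // hypothetical stand-in for MockProducers
  std::vector<int> expected;
  Fixture() : expected(3, 7) {}     // rebuilt before every test case
};

BOOST_FIXTURE_TEST_CASE(ranges_match, Fixture)
{
  std::vector<int> actual(3, 7);
  BOOST_CHECK_EQUAL_COLLECTIONS(expected.begin(), expected.end(),
                                actual.begin(), actual.end());
}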
diff --git a/moses/SearchNormalBatch.cpp b/moses/SearchNormalBatch.cpp
index ca72b3973..aa3aeb0a3 100644
--- a/moses/SearchNormalBatch.cpp
+++ b/moses/SearchNormalBatch.cpp
@@ -18,21 +18,21 @@ SearchNormalBatch::SearchNormalBatch(Manager& manager, const InputType &source,
// Split the feature functions into sets of stateless, stateful
// distributed lm, and stateful non-distributed.
const vector<const StatefulFeatureFunction*>& ffs =
- StatefulFeatureFunction::GetStatefulFeatureFunctions();
+ StatefulFeatureFunction::GetStatefulFeatureFunctions();
for (unsigned i = 0; i < ffs.size(); ++i) {
- if (ffs[i]->GetScoreProducerDescription() == "DLM_5gram") { // TODO WFT
- m_dlm_ffs[i] = const_cast<LanguageModel*>(static_cast<const LanguageModel* const>(ffs[i]));
- m_dlm_ffs[i]->SetFFStateIdx(i);
- }
- else {
- m_stateful_ffs[i] = const_cast<StatefulFeatureFunction*>(ffs[i]);
- }
+ if (ffs[i]->GetScoreProducerDescription() == "DLM_5gram") { // TODO WFT
+ m_dlm_ffs[i] = const_cast<LanguageModel*>(static_cast<const LanguageModel* const>(ffs[i]));
+ m_dlm_ffs[i]->SetFFStateIdx(i);
+ } else {
+ m_stateful_ffs[i] = const_cast<StatefulFeatureFunction*>(ffs[i]);
+ }
}
m_stateless_ffs = StatelessFeatureFunction::GetStatelessFeatureFunctions();
-
+
}
-SearchNormalBatch::~SearchNormalBatch() {
+SearchNormalBatch::~SearchNormalBatch()
+{
}
/**
@@ -138,79 +138,79 @@ void SearchNormalBatch::ExpandHypothesis(const Hypothesis &hypothesis, const Tra
for (dlm_iter = m_dlm_ffs.begin();
dlm_iter != m_dlm_ffs.end();
++dlm_iter) {
- const FFState* input_state = newHypo->GetPrevHypo() ? newHypo->GetPrevHypo()->GetFFState((*dlm_iter).first) : NULL;
- (*dlm_iter).second->IssueRequestsFor(*newHypo, input_state);
+ const FFState* input_state = newHypo->GetPrevHypo() ? newHypo->GetPrevHypo()->GetFFState((*dlm_iter).first) : NULL;
+ (*dlm_iter).second->IssueRequestsFor(*newHypo, input_state);
}
m_partial_hypos.push_back(newHypo);
- }
- else {
+ } else {
std::cerr << "can't use early discarding with batch decoding!" << std::endl;
abort();
}
}
-void SearchNormalBatch::EvalAndMergePartialHypos() {
- std::vector<Hypothesis*>::iterator partial_hypo_iter;
- for (partial_hypo_iter = m_partial_hypos.begin();
- partial_hypo_iter != m_partial_hypos.end();
- ++partial_hypo_iter) {
- Hypothesis* hypo = *partial_hypo_iter;
-
- // Evaluate with other ffs.
- std::map<int, StatefulFeatureFunction*>::iterator sfff_iter;
- for (sfff_iter = m_stateful_ffs.begin();
- sfff_iter != m_stateful_ffs.end();
- ++sfff_iter) {
- const StatefulFeatureFunction &ff = *(sfff_iter->second);
- int state_idx = sfff_iter->first;
- hypo->EvaluateWith(ff, state_idx);
- }
- std::vector<const StatelessFeatureFunction*>::iterator slff_iter;
- for (slff_iter = m_stateless_ffs.begin();
- slff_iter != m_stateless_ffs.end();
- ++slff_iter) {
- hypo->EvaluateWith(**slff_iter);
- }
+void SearchNormalBatch::EvalAndMergePartialHypos()
+{
+ std::vector<Hypothesis*>::iterator partial_hypo_iter;
+ for (partial_hypo_iter = m_partial_hypos.begin();
+ partial_hypo_iter != m_partial_hypos.end();
+ ++partial_hypo_iter) {
+ Hypothesis* hypo = *partial_hypo_iter;
+
+ // Evaluate with other ffs.
+ std::map<int, StatefulFeatureFunction*>::iterator sfff_iter;
+ for (sfff_iter = m_stateful_ffs.begin();
+ sfff_iter != m_stateful_ffs.end();
+ ++sfff_iter) {
+ const StatefulFeatureFunction &ff = *(sfff_iter->second);
+ int state_idx = sfff_iter->first;
+ hypo->EvaluateWith(ff, state_idx);
+ }
+ std::vector<const StatelessFeatureFunction*>::iterator slff_iter;
+ for (slff_iter = m_stateless_ffs.begin();
+ slff_iter != m_stateless_ffs.end();
+ ++slff_iter) {
+ hypo->EvaluateWith(**slff_iter);
}
+ }
- // Wait for all requests from the distributed LM to come back.
+ // Wait for all requests from the distributed LM to come back.
+ std::map<int, LanguageModel*>::iterator dlm_iter;
+ for (dlm_iter = m_dlm_ffs.begin();
+ dlm_iter != m_dlm_ffs.end();
+ ++dlm_iter) {
+ (*dlm_iter).second->sync();
+ }
+
+ // Incorporate the DLM scores into all hypotheses and put into their
+ // stacks.
+ for (partial_hypo_iter = m_partial_hypos.begin();
+ partial_hypo_iter != m_partial_hypos.end();
+ ++partial_hypo_iter) {
+ Hypothesis* hypo = *partial_hypo_iter;
+
+ // Calculate DLM scores.
std::map<int, LanguageModel*>::iterator dlm_iter;
for (dlm_iter = m_dlm_ffs.begin();
dlm_iter != m_dlm_ffs.end();
++dlm_iter) {
- (*dlm_iter).second->sync();
+ LanguageModel &lm = *(dlm_iter->second);
+ hypo->EvaluateWith(lm, (*dlm_iter).first);
}
- // Incorporate the DLM scores into all hypotheses and put into their
- // stacks.
- for (partial_hypo_iter = m_partial_hypos.begin();
- partial_hypo_iter != m_partial_hypos.end();
- ++partial_hypo_iter) {
- Hypothesis* hypo = *partial_hypo_iter;
-
- // Calculate DLM scores.
- std::map<int, LanguageModel*>::iterator dlm_iter;
- for (dlm_iter = m_dlm_ffs.begin();
- dlm_iter != m_dlm_ffs.end();
- ++dlm_iter) {
- LanguageModel &lm = *(dlm_iter->second);
- hypo->EvaluateWith(lm, (*dlm_iter).first);
- }
-
- // Put completed hypothesis onto its stack.
- size_t wordsTranslated = hypo->GetWordsBitmap().GetNumWordsCovered();
- m_hypoStackColl[wordsTranslated]->AddPrune(hypo);
- }
- m_partial_hypos.clear();
-
- std::vector < HypothesisStack* >::iterator stack_iter;
- HypothesisStackNormal* stack;
- for (stack_iter = m_hypoStackColl.begin();
- stack_iter != m_hypoStackColl.end();
- ++stack_iter) {
- stack = static_cast<HypothesisStackNormal*>(*stack_iter);
- stack->PruneToSize(m_max_stack_size);
- }
+ // Put completed hypothesis onto its stack.
+ size_t wordsTranslated = hypo->GetWordsBitmap().GetNumWordsCovered();
+ m_hypoStackColl[wordsTranslated]->AddPrune(hypo);
+ }
+ m_partial_hypos.clear();
+
+ std::vector < HypothesisStack* >::iterator stack_iter;
+ HypothesisStackNormal* stack;
+ for (stack_iter = m_hypoStackColl.begin();
+ stack_iter != m_hypoStackColl.end();
+ ++stack_iter) {
+ stack = static_cast<HypothesisStackNormal*>(*stack_iter);
+ stack->PruneToSize(m_max_stack_size);
+ }
}
}
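The rewrapped EvalAndMergePartialHypos keeps the original three-phase shape: score each parked hypothesis with the locally computable feature functions, block in sync() until every outstanding distributed-LM request has been answered, then fold the LM scores in, stack the finished hypotheses, and prune. A stripped-down sketch of that issue/sync/consume pattern, with hypothetical Backend/Hypo types in place of the LanguageModel plumbing:

#include <cstddef>
#include <iostream>
#include <vector>

// Hypothetical async scorer: requests queue up and are answered at Sync().
struct Backend {
  std::vector<float*> pending;
  void IssueRequest(float *slot) { pending.push_back(slot); }
  void Sync() {                       // blocks on the network in the real code
    for (std::size_t i = 0; i < pending.size(); ++i)
      *pending[i] = -1.5f;            // canned answer for the sketch
    pending.clear();
  }
};

struct Hypo {
  float localScore;
  float lmScore;
};

int main()
{
  Backend dlm;
  std::vector<Hypo> partial(3);

  // Phase 1 (ExpandHypothesis): score locally, issue one LM request each.
  for (std::size_t i = 0; i < partial.size(); ++i) {
    partial[i].localScore = 0.1f * i;
    dlm.IssueRequest(&partial[i].lmScore);
  }

  // Phase 2: wait for every outstanding answer at once (the sync() loop).
  dlm.Sync();

  // Phase 3: fold the LM scores in and "stack" the finished hypotheses.
  for (std::size_t i = 0; i < partial.size(); ++i)
    std::cout << "hypo " << i << " total "
              << partial[i].localScore + partial[i].lmScore << std::endl;
  return 0;
}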
diff --git a/moses/SearchNormalBatch.h b/moses/SearchNormalBatch.h
index fcfda7054..7f6764635 100644
--- a/moses/SearchNormalBatch.h
+++ b/moses/SearchNormalBatch.h
@@ -13,7 +13,7 @@ class TranslationOptionCollection;
/** Implements the phrase-based stack decoding algorithm (no cube pruning) with a twist...
* Language model requests are batched together, duplicate requests are removed, and requests are sent together.
* Useful for distributed LM where network latency is an issue.
- */
+ */
class SearchNormalBatch: public SearchNormal
{
protected:
@@ -21,7 +21,7 @@ protected:
// Added for asynclm decoding.
std::vector<const StatelessFeatureFunction*> m_stateless_ffs;
std::map<int, LanguageModel*> m_dlm_ffs;
- std::map<int, StatefulFeatureFunction*> m_stateful_ffs;
+ std::map<int, StatefulFeatureFunction*> m_stateful_ffs;
std::vector<Hypothesis*> m_partial_hypos;
int m_batch_size;
int m_max_stack_size;
diff --git a/moses/Sentence.cpp b/moses/Sentence.cpp
index b2e5a6633..8e76b0f03 100644
--- a/moses/Sentence.cpp
+++ b/moses/Sentence.cpp
@@ -104,8 +104,7 @@ int Sentence::Read(std::istream& in,const std::vector<FactorType>& factorOrder)
this->SetTopicId(atol(topic_params[0].c_str()));
this->SetUseTopicId(true);
this->SetUseTopicIdAndProb(false);
- }
- else {
+ } else {
this->SetTopicIdAndProb(topic_params);
this->SetUseTopicId(false);
this->SetUseTopicIdAndProb(true);
diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp
index df4c14cde..f822e4e13 100644
--- a/moses/StaticData.cpp
+++ b/moses/StaticData.cpp
@@ -109,15 +109,15 @@ StaticData::~StaticData()
typedef std::map<std::pair<size_t, Phrase>, std::pair<TranslationOptionList*,clock_t> > Coll;
Coll::iterator iter;
for (iter = m_transOptCache.begin(); iter != m_transOptCache.end(); ++iter) {
- std::pair<TranslationOptionList*,clock_t> &valuePair =iter->second;
- TranslationOptionList *transOptList = valuePair.first;
- delete transOptList;
+ std::pair<TranslationOptionList*,clock_t> &valuePair =iter->second;
+ TranslationOptionList *transOptList = valuePair.first;
+ delete transOptList;
}
/*
const std::vector<FeatureFunction*> &producers = FeatureFunction::GetFeatureFunctions();
for(size_t i=0;i<producers.size();++i) {
- FeatureFunction *ff = producers[i];
+ FeatureFunction *ff = producers[i];
delete ff;
}
*/
@@ -126,7 +126,8 @@ StaticData::~StaticData()
Phrase::FinalizeMemPool();
}
-bool StaticData::LoadDataStatic(Parameter *parameter, const std::string &execPath) {
+bool StaticData::LoadDataStatic(Parameter *parameter, const std::string &execPath)
+{
s_instance.SetExecPath(execPath);
return s_instance.LoadData(parameter);
}
@@ -143,7 +144,7 @@ bool StaticData::LoadData(Parameter *parameter)
}
m_parsingAlgorithm = (m_parameter->GetParam("parsing-algorithm").size() > 0) ?
- (ParsingAlgorithm) Scan<size_t>(m_parameter->GetParam("parsing-algorithm")[0]) : ParseCYKPlus;
+ (ParsingAlgorithm) Scan<size_t>(m_parameter->GetParam("parsing-algorithm")[0]) : ParseCYKPlus;
// to cube or not to cube
m_searchAlgorithm = (m_parameter->GetParam("search-algorithm").size() > 0) ?
@@ -217,7 +218,7 @@ bool StaticData::LoadData(Parameter *parameter)
} else {
m_nBestFactor = 20;
}
-
+
//lattice samples
if (m_parameter->GetParam("lattice-samples").size() ==2 ) {
m_latticeSamplesFilePath = m_parameter->GetParam("lattice-samples")[0];
@@ -276,11 +277,11 @@ bool StaticData::LoadData(Parameter *parameter)
#endif
SetBooleanParameter( &m_unprunedSearchGraph, "unpruned-search-graph", false );
SetBooleanParameter( &m_includeLHSInSearchGraph, "include-lhs-in-search-graph", false );
-
+
if (m_parameter->isParamSpecified("output-unknowns")) {
if (m_parameter->GetParam("output-unknowns").size() == 1) {
- m_outputUnknownsFile =Scan<string>(m_parameter->GetParam("output-unknowns")[0]);
+ m_outputUnknownsFile =Scan<string>(m_parameter->GetParam("output-unknowns")[0]);
} else {
UserMessage::Add(string("need to specify exactly one file name for unknowns"));
return false;
@@ -422,7 +423,7 @@ bool StaticData::LoadData(Parameter *parameter)
      cerr << "Error: Cannot use both n-best mbr and lattice mbr together" << endl;

exit(1);
}
-
+
//mira training
SetBooleanParameter( &m_mira, "mira", false );
@@ -446,7 +447,7 @@ bool StaticData::LoadData(Parameter *parameter)
exit(1);
}
if (m_useConsensusDecoding) m_mbr=true;
-
+
// Compact phrase table and reordering model
SetBooleanParameter( &m_minphrMemory, "minphr-memory", false );
SetBooleanParameter( &m_minlexrMemory, "minlexr-memory", false );
@@ -489,7 +490,7 @@ bool StaticData::LoadData(Parameter *parameter)
}
m_startTranslationId = (m_parameter->GetParam("start-translation-id").size() > 0) ?
- Scan<long>(m_parameter->GetParam("start-translation-id")[0]) : 0;
+ Scan<long>(m_parameter->GetParam("start-translation-id")[0]) : 0;
// Read in constraint decoding file, if provided
if(m_parameter->GetParam("constraint").size()) {
@@ -503,7 +504,7 @@ bool StaticData::LoadData(Parameter *parameter)
InputFileStream constraintFile(m_constraintFileName);
std::string line;
-
+
long sentenceID = GetStartTranslationId() - 1;
while (getline(constraintFile, line)) {
vector<string> vecStr = Tokenize(line, "\t");
@@ -546,14 +547,14 @@ bool StaticData::LoadData(Parameter *parameter)
// specify XML tags opening and closing brackets for XML option
if (m_parameter->GetParam("xml-brackets").size() > 0) {
- std::vector<std::string> brackets = Tokenize(m_parameter->GetParam("xml-brackets")[0]);
- if(brackets.size()!=2) {
- cerr << "invalid xml-brackets value, must specify exactly 2 blank-delimited strings for XML tags opening and closing brackets" << endl;
- exit(1);
- }
- m_xmlBrackets.first= brackets[0];
- m_xmlBrackets.second=brackets[1];
- cerr << "XML tags opening and closing brackets for XML input are: " << m_xmlBrackets.first << " and " << m_xmlBrackets.second << endl;
+ std::vector<std::string> brackets = Tokenize(m_parameter->GetParam("xml-brackets")[0]);
+ if(brackets.size()!=2) {
+ cerr << "invalid xml-brackets value, must specify exactly 2 blank-delimited strings for XML tags opening and closing brackets" << endl;
+ exit(1);
+ }
+ m_xmlBrackets.first= brackets[0];
+ m_xmlBrackets.second=brackets[1];
+ cerr << "XML tags opening and closing brackets for XML input are: " << m_xmlBrackets.first << " and " << m_xmlBrackets.second << endl;
}
// all features
@@ -574,58 +575,47 @@ bool StaticData::LoadData(Parameter *parameter)
GlobalLexicalModel *model = new GlobalLexicalModel(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
- }
- else if (feature == "GlobalLexicalModelUnlimited") {
+ } else if (feature == "GlobalLexicalModelUnlimited") {
GlobalLexicalModelUnlimited *model = NULL; //new GlobalLexicalModelUnlimited(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
- }
- else if (feature == "SourceWordDeletionFeature") {
+ } else if (feature == "SourceWordDeletionFeature") {
SourceWordDeletionFeature *model = new SourceWordDeletionFeature(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
//SetWeights(model, weights);
- }
- else if (feature == "TargetWordInsertionFeature") {
+ } else if (feature == "TargetWordInsertionFeature") {
TargetWordInsertionFeature *model = new TargetWordInsertionFeature(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
//SetWeights(model, weights);
- }
- else if (feature == "PhraseBoundaryFeature") {
+ } else if (feature == "PhraseBoundaryFeature") {
PhraseBoundaryFeature *model = new PhraseBoundaryFeature(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
//SetWeights(model, weights);
- }
- else if (feature == "PhraseLengthFeature") {
+ } else if (feature == "PhraseLengthFeature") {
PhraseLengthFeature *model = new PhraseLengthFeature(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
//SetWeights(model, weights);
- }
- else if (feature == "WordTranslationFeature") {
+ } else if (feature == "WordTranslationFeature") {
WordTranslationFeature *model = new WordTranslationFeature(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
//SetWeights(model, weights);
- }
- else if (feature == "TargetBigramFeature") {
+ } else if (feature == "TargetBigramFeature") {
TargetBigramFeature *model = new TargetBigramFeature(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
//SetWeights(model, weights);
- }
- else if (feature == "TargetNgramFeature") {
+ } else if (feature == "TargetNgramFeature") {
TargetNgramFeature *model = new TargetNgramFeature(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
//SetWeights(model, weights);
- }
- else if (feature == "PhrasePairFeature") {
+ } else if (feature == "PhrasePairFeature") {
PhrasePairFeature *model = new PhrasePairFeature(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
//SetWeights(model, weights);
- }
- else if (feature == "LexicalReordering") {
+ } else if (feature == "LexicalReordering") {
LexicalReordering *model = new LexicalReordering(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
- }
- else if (feature == "KENLM") {
+ } else if (feature == "KENLM") {
LanguageModel *model = ConstructKenLM(feature, line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
@@ -648,69 +638,58 @@ bool StaticData::LoadData(Parameter *parameter)
GenerationDictionary *model = new GenerationDictionary(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
- }
- else if (feature == "BleuScoreFeature") {
+ } else if (feature == "BleuScoreFeature") {
BleuScoreFeature *model = new BleuScoreFeature(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
- }
- else if (feature == "Distortion") {
+ } else if (feature == "Distortion") {
DistortionScoreProducer *model = new DistortionScoreProducer(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
- }
- else if (feature == "WordPenalty") {
+ } else if (feature == "WordPenalty") {
WordPenaltyProducer *model = new WordPenaltyProducer(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
m_wpProducer = model;
- }
- else if (feature == "UnknownWordPenalty") {
+ } else if (feature == "UnknownWordPenalty") {
UnknownWordPenaltyProducer *model = new UnknownWordPenaltyProducer(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
if (weights.size() == 0)
weights.push_back(1.0f);
SetWeights(model, weights);
m_unknownWordPenaltyProducer = model;
- }
- else if (feature == "PhraseDictionaryBinary") {
+ } else if (feature == "PhraseDictionaryBinary") {
PhraseDictionaryTreeAdaptor* model = new PhraseDictionaryTreeAdaptor(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
m_phraseDictionary.push_back(model);
- }
- else if (feature == "PhraseDictionaryOnDisk") {
+ } else if (feature == "PhraseDictionaryOnDisk") {
PhraseDictionaryOnDisk* model = new PhraseDictionaryOnDisk(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
m_phraseDictionary.push_back(model);
- }
- else if (feature == "PhraseDictionaryMemory") {
+ } else if (feature == "PhraseDictionaryMemory") {
PhraseDictionaryMemory* model = new PhraseDictionaryMemory(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
m_phraseDictionary.push_back(model);
- }
- else if (feature == "PhraseDictionaryCompact") {
+ } else if (feature == "PhraseDictionaryCompact") {
PhraseDictionaryCompact* model = new PhraseDictionaryCompact(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
m_phraseDictionary.push_back(model);
- }
- else if (feature == "PhraseDictionaryMultiModel") {
+ } else if (feature == "PhraseDictionaryMultiModel") {
PhraseDictionaryMultiModel* model = new PhraseDictionaryMultiModel(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
m_phraseDictionary.push_back(model);
- }
- else if (feature == "PhraseDictionaryMultiModelCounts") {
+ } else if (feature == "PhraseDictionaryMultiModelCounts") {
PhraseDictionaryMultiModelCounts* model = new PhraseDictionaryMultiModelCounts(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
m_phraseDictionary.push_back(model);
- }
- else if (feature == "PhraseDictionaryALSuffixArray") {
- PhraseDictionaryALSuffixArray* model = new PhraseDictionaryALSuffixArray(line);
+ } else if (feature == "PhraseDictionaryALSuffixArray") {
+ PhraseDictionaryALSuffixArray* model = new PhraseDictionaryALSuffixArray(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
m_phraseDictionary.push_back(model);
@@ -912,7 +891,7 @@ bool StaticData::LoadDecodeGraphs()
DecodeGraph *decodeGraph;
if (IsChart()) {
size_t maxChartSpan = (decodeGraphInd < maxChartSpans.size()) ? maxChartSpans[decodeGraphInd] : DEFAULT_MAX_CHART_SPAN;
- cerr << "max-chart-span: " << maxChartSpans[decodeGraphInd] << endl;
+ cerr << "max-chart-span: " << maxChartSpans[decodeGraphInd] << endl;
decodeGraph = new DecodeGraph(m_decodeGraphs.size(), maxChartSpan);
} else {
decodeGraph = new DecodeGraph(m_decodeGraphs.size());
@@ -947,7 +926,7 @@ const TranslationOptionList* StaticData::FindTransOptListInCache(const DecodeGra
boost::mutex::scoped_lock lock(m_transOptCacheMutex);
#endif
std::map<std::pair<size_t, Phrase>, std::pair<TranslationOptionList*,clock_t> >::iterator iter
- = m_transOptCache.find(key);
+ = m_transOptCache.find(key);
if (iter == m_transOptCache.end())
return NULL;
iter->second.second = clock(); // update last used time
@@ -994,7 +973,8 @@ void StaticData::AddTransOptListToCache(const DecodeGraph &decodeGraph, const Ph
m_transOptCache[key] = make_pair( storedTransOptList, clock() );
ReduceTransOptCache();
}
-void StaticData::ClearTransOptionCache() const {
+void StaticData::ClearTransOptionCache() const
+{
map<std::pair<size_t, Phrase>, std::pair< TranslationOptionList*, clock_t > >::iterator iterCache;
for (iterCache = m_transOptCache.begin() ; iterCache != m_transOptCache.end() ; ++iterCache) {
TranslationOptionList *transOptList = iterCache->second.first;
@@ -1091,20 +1071,19 @@ void StaticData::SetExecPath(const std::string &path)
{
/*
namespace fs = boost::filesystem;
-
+
fs::path full_path( fs::initial_path<fs::path>() );
-
+
full_path = fs::system_complete( fs::path( path ) );
-
+
//Without file name
m_binPath = full_path.parent_path().string();
*/
-
+
// NOT TESTED
size_t pos = path.rfind("/");
- if (pos != string::npos)
- {
- m_binPath = path.substr(0, pos);
+ if (pos != string::npos) {
+ m_binPath = path.substr(0, pos);
}
cerr << m_binPath << endl;
}
@@ -1114,27 +1093,31 @@ const string &StaticData::GetBinDirectory() const
return m_binPath;
}
-float StaticData::GetWeightWordPenalty() const {
+float StaticData::GetWeightWordPenalty() const
+{
float weightWP = GetWeight(m_wpProducer);
  //VERBOSE(1, "Read weightWP from translation system: " << weightWP << std::endl);
return weightWP;
}
-float StaticData::GetWeightUnknownWordPenalty() const {
+float StaticData::GetWeightUnknownWordPenalty() const
+{
return GetWeight(m_unknownWordPenaltyProducer);
}
-void StaticData::InitializeForInput(const InputType& source) const {
+void StaticData::InitializeForInput(const InputType& source) const
+{
const std::vector<FeatureFunction*> &producers = FeatureFunction::GetFeatureFunctions();
- for(size_t i=0;i<producers.size();++i) {
+ for(size_t i=0; i<producers.size(); ++i) {
FeatureFunction &ff = *producers[i];
ff.InitializeForInput(source);
}
}
-void StaticData::CleanUpAfterSentenceProcessing(const InputType& source) const {
+void StaticData::CleanUpAfterSentenceProcessing(const InputType& source) const
+{
const std::vector<FeatureFunction*> &producers = FeatureFunction::GetFeatureFunctions();
- for(size_t i=0;i<producers.size();++i) {
+ for(size_t i=0; i<producers.size(); ++i) {
FeatureFunction &ff = *producers[i];
ff.CleanUpAfterSentenceProcessing(source);
}
@@ -1172,8 +1155,7 @@ bool StaticData::CheckWeights() const
set<string>::iterator iter = weightNames.find(descr);
if (iter == weightNames.end()) {
cerr << "Can't find weights for feature function " << descr << endl;
- }
- else {
+ } else {
weightNames.erase(iter);
}
}
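The long else-if ladder reflowed above is a name-to-constructor dispatch: each branch builds one FeatureFunction subclass from its config line and registers its weights. This commit only compacts the braces; a common alternative shape for such ladders (not what Moses does here) is a registry of factory functions, sketched with hypothetical types:

#include <iostream>
#include <map>
#include <string>

struct Feature {                    // hypothetical stand-in for FeatureFunction
  virtual ~Feature() {}
  virtual const char *Name() const = 0;
};
struct Distortion : Feature {
  const char *Name() const { return "Distortion"; }
};
struct WordPenalty : Feature {
  const char *Name() const { return "WordPenalty"; }
};

typedef Feature *(*Factory)(const std::string &line);

template <class T>
Feature *Make(const std::string &) { return new T(); }

int main()
{
  std::map<std::string, Factory> registry;
  registry["Distortion"]  = &Make<Distortion>;
  registry["WordPenalty"] = &Make<WordPenalty>;

  // One lookup replaces the whole else-if ladder.
  std::string feature = "WordPenalty", line = "WordPenalty";
  std::map<std::string, Factory>::iterator it = registry.find(feature);
  if (it != registry.end()) {
    Feature *ff = it->second(line);
    std::cout << "constructed " << ff->Name() << std::endl;
    delete ff;
  }
  return 0;
}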
diff --git a/moses/StaticData.h b/moses/StaticData.h
index 01a0a19df..5a1cec213 100644
--- a/moses/StaticData.h
+++ b/moses/StaticData.h
@@ -87,7 +87,7 @@ protected:
m_translationOptionThreshold,
m_wordDeletionWeight;
-
+
// PhraseTrans, Generation & LanguageModelScore has multiple weights.
int m_maxDistortion;
// do it differently from old pharaoh
@@ -206,7 +206,7 @@ protected:
int m_threadCount;
long m_startTranslationId;
-
+
StaticData();
@@ -223,7 +223,7 @@ protected:
bool m_continuePartialTranslation;
std::string m_binPath;
-
+
public:
bool IsAlwaysCreateDirectTranslationOption() const {
@@ -363,15 +363,15 @@ public:
bool IsLabeledNBestList() const {
return m_labeledNBestList;
}
-
+
bool UseMinphrInMemory() const {
- return m_minphrMemory;
+ return m_minphrMemory;
}
bool UseMinlexrInMemory() const {
- return m_minlexrMemory;
+ return m_minlexrMemory;
}
-
+
size_t GetNumRealWordsInInput() const {
return m_numRealWordsInInput;
}
@@ -421,13 +421,16 @@ public:
bool IsChart() const {
return m_searchAlgorithm == ChartDecoding || m_searchAlgorithm == ChartIncremental;
}
- const WordPenaltyProducer *GetWordPenaltyProducer() const
- { return m_wpProducer; }
- WordPenaltyProducer *GetWordPenaltyProducer() // for mira
- { return m_wpProducer; }
+ const WordPenaltyProducer *GetWordPenaltyProducer() const {
+ return m_wpProducer;
+ }
+ WordPenaltyProducer *GetWordPenaltyProducer() { // for mira
+ return m_wpProducer;
+ }
- const UnknownWordPenaltyProducer *GetUnknownWordPenaltyProducer() const
- { return m_unknownWordPenaltyProducer; }
+ const UnknownWordPenaltyProducer *GetUnknownWordPenaltyProducer() const {
+ return m_unknownWordPenaltyProducer;
+ }
size_t GetNumInputScores() const {
return m_numInputScores;
@@ -458,7 +461,7 @@ public:
float GetSparseWeight(const FName& featureName) const {
return m_allWeights.GetSparseWeight(featureName);
}
-
+
//Weights for feature with fixed number of values
void SetWeights(const FeatureFunction* sp, const std::vector<float>& weights);
@@ -627,15 +630,17 @@ public:
int ThreadCount() const {
return m_threadCount;
}
-
- long GetStartTranslationId() const
- { return m_startTranslationId; }
-
+
+ long GetStartTranslationId() const {
+ return m_startTranslationId;
+ }
+
void SetExecPath(const std::string &path);
const std::string &GetBinDirectory() const;
bool NeedAlignmentInfo() const {
- return m_needAlignmentInfo; }
+ return m_needAlignmentInfo;
+ }
const std::string &GetAlignmentOutputFile() const {
return m_alignmentOutputFile;
}
@@ -656,19 +661,26 @@ public:
float GetWeightWordPenalty() const;
float GetWeightUnknownWordPenalty() const;
- const std::vector<PhraseDictionary*>& GetPhraseDictionaries() const
- { return m_phraseDictionary;}
- const std::vector<const GenerationDictionary*>& GetGenerationDictionaries() const
- { return m_generationDictionary;}
- const PhraseDictionary*GetTranslationScoreProducer(size_t index) const
- { return GetPhraseDictionaries().at(index); }
+ const std::vector<PhraseDictionary*>& GetPhraseDictionaries() const {
+ return m_phraseDictionary;
+ }
+ const std::vector<const GenerationDictionary*>& GetGenerationDictionaries() const {
+ return m_generationDictionary;
+ }
+ const PhraseDictionary*GetTranslationScoreProducer(size_t index) const {
+ return GetPhraseDictionaries().at(index);
+ }
std::vector<float> GetTranslationWeights(size_t index) const {
std::vector<float> weights = GetWeights(GetTranslationScoreProducer(index));
return weights;
}
- const std::vector<DecodeGraph*>& GetDecodeGraphs() const {return m_decodeGraphs;}
- const std::vector<size_t>& GetDecodeGraphBackoff() const {return m_decodeGraphBackoff;}
+ const std::vector<DecodeGraph*>& GetDecodeGraphs() const {
+ return m_decodeGraphs;
+ }
+ const std::vector<size_t>& GetDecodeGraphBackoff() const {
+ return m_decodeGraphBackoff;
+ }
  //sentence (and thread) specific initialisation and cleanup
void InitializeForInput(const InputType& source) const;
@@ -697,8 +709,7 @@ public:
#ifdef WITH_THREADS
if (m_multimodelweights_tmp.find(boost::this_thread::get_id()) != m_multimodelweights_tmp.end()) {
return &m_multimodelweights_tmp.find(boost::this_thread::get_id())->second;
- }
- else {
+ } else {
return NULL;
}
#else
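The WITH_THREADS hunk just above keys per-request multimodel weight overrides on boost::this_thread::get_id(), so concurrent translation threads never see each other's temporary weights. A minimal sketch of that lookup pattern, assuming a plain std::map guarded by the caller (the real member and its locking are outside this hunk):

#include <boost/thread.hpp>
#include <cstddef>
#include <iostream>
#include <map>
#include <vector>

typedef std::map<boost::thread::id, std::vector<float> > WeightMap;

// Same shape as the diff: return this thread's entry, or NULL if unset.
const std::vector<float> *Lookup(const WeightMap &m)
{
  WeightMap::const_iterator it = m.find(boost::this_thread::get_id());
  if (it != m.end()) {
    return &it->second;
  } else {
    return NULL;
  }
}

int main()
{
  WeightMap m;
  m[boost::this_thread::get_id()] = std::vector<float>(3, 0.5f);
  const std::vector<float> *w = Lookup(m);
  std::cout << (w ? w->size() : 0) << " weights for this thread" << std::endl;  // 3
  return 0;
}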
diff --git a/moses/SyntacticLanguageModel.cpp b/moses/SyntacticLanguageModel.cpp
index 4a3b26ff1..cde041fe7 100644
--- a/moses/SyntacticLanguageModel.cpp
+++ b/moses/SyntacticLanguageModel.cpp
@@ -10,154 +10,159 @@
namespace Moses
{
- SyntacticLanguageModel::SyntacticLanguageModel(const std::string &line)
- // Initialize member variables
- /*
- : m_NumScoreComponents(weights.size())
- , m_files(new SyntacticLanguageModelFiles<YModel,XModel>(filePath))
- , m_factorType(factorType)
- , m_beamWidth(beamWidth) {
- */
- {
- /* taken from StaticData::LoadSyntacticLanguageModel()
- cerr << "Loading syntactic language models..." << std::endl;
-
- const vector<float> weights = Scan<float>(m_parameter->GetParam("weight-slm"));
- const vector<string> files = m_parameter->GetParam("slmodel-file");
-
- const FactorType factorType = (m_parameter->GetParam("slmodel-factor").size() > 0) ?
- TransformScore(Scan<int>(m_parameter->GetParam("slmodel-factor")[0]))
- : 0;
-
- const size_t beamWidth = (m_parameter->GetParam("slmodel-beam").size() > 0) ?
- TransformScore(Scan<int>(m_parameter->GetParam("slmodel-beam")[0]))
- : 500;
-
- if (files.size() < 1) {
- cerr << "No syntactic language model files specified!" << std::endl;
- return false;
- }
+SyntacticLanguageModel::SyntacticLanguageModel(const std::string &line)
+// Initialize member variables
+/*
+: m_NumScoreComponents(weights.size())
+, m_files(new SyntacticLanguageModelFiles<YModel,XModel>(filePath))
+, m_factorType(factorType)
+, m_beamWidth(beamWidth) {
+*/
+{
+ /* taken from StaticData::LoadSyntacticLanguageModel()
+ cerr << "Loading syntactic language models..." << std::endl;
- // check if feature is used
- if (weights.size() >= 1) {
+ const vector<float> weights = Scan<float>(m_parameter->GetParam("weight-slm"));
+ const vector<string> files = m_parameter->GetParam("slmodel-file");
- //cout.setf(ios::scientific,ios::floatfield);
- //cerr.setf(ios::scientific,ios::floatfield);
+ const FactorType factorType = (m_parameter->GetParam("slmodel-factor").size() > 0) ?
+ TransformScore(Scan<int>(m_parameter->GetParam("slmodel-factor")[0]))
+ : 0;
- // create the feature
- m_syntacticLanguageModel = new SyntacticLanguageModel(files,weights,factorType,beamWidth);
+ const size_t beamWidth = (m_parameter->GetParam("slmodel-beam").size() > 0) ?
+ TransformScore(Scan<int>(m_parameter->GetParam("slmodel-beam")[0]))
+ : 500;
+ if (files.size() < 1) {
+ cerr << "No syntactic language model files specified!" << std::endl;
+ return false;
+ }
- /////////////////////////////////////////
- // BEGIN LANE's UNSTABLE EXPERIMENT :)
- //
+ // check if feature is used
+ if (weights.size() >= 1) {
- //double ppl = m_syntacticLanguageModel->perplexity();
- //cerr << "Probability is " << ppl << endl;
+ //cout.setf(ios::scientific,ios::floatfield);
+ //cerr.setf(ios::scientific,ios::floatfield);
+ // create the feature
+ m_syntacticLanguageModel = new SyntacticLanguageModel(files,weights,factorType,beamWidth);
- //
- // END LANE's UNSTABLE EXPERIMENT
- /////////////////////////////////////////
+ /////////////////////////////////////////
+ // BEGIN LANE's UNSTABLE EXPERIMENT :)
+ //
+ //double ppl = m_syntacticLanguageModel->perplexity();
+ //cerr << "Probability is " << ppl << endl;
- if (m_syntacticLanguageModel==NULL) {
- return false;
- }
- }
+ //
+ // END LANE's UNSTABLE EXPERIMENT
+ /////////////////////////////////////////
- return true;
- */
- }
- SyntacticLanguageModel::~SyntacticLanguageModel() {
- VERBOSE(3,"Destructing SyntacticLanguageModel" << std::endl);
- delete m_files;
- }
+ if (m_syntacticLanguageModel==NULL) {
+ return false;
+ }
- size_t SyntacticLanguageModel::GetNumScoreComponents() const {
- return m_NumScoreComponents;
}
- std::string SyntacticLanguageModel::GetScoreProducerDescription() const {
- return "SyntacticLM";
- }
+ return true;
- const FFState* SyntacticLanguageModel::EmptyHypothesisState(const InputType &input) const {
+ */
+}
- return new SyntacticLanguageModelState<YModel,XModel,S,R>(m_files,m_beamWidth);
+SyntacticLanguageModel::~SyntacticLanguageModel()
+{
+ VERBOSE(3,"Destructing SyntacticLanguageModel" << std::endl);
+ delete m_files;
+}
- }
+size_t SyntacticLanguageModel::GetNumScoreComponents() const
+{
+ return m_NumScoreComponents;
+}
- /*
- double SyntacticLanguageModel::perplexity() {
+std::string SyntacticLanguageModel::GetScoreProducerDescription() const
+{
+ return "SyntacticLM";
+}
+
+const FFState* SyntacticLanguageModel::EmptyHypothesisState(const InputType &input) const
+{
- SyntacticLanguageModelState<YModel,XModel,S,R> *prev =
- new SyntacticLanguageModelState<YModel,XModel,S,R>(m_files,m_beamWidth);
+ return new SyntacticLanguageModelState<YModel,XModel,S,R>(m_files,m_beamWidth);
- std::cerr << "Initial prob:" << "\t" << prev->getProb() <<std::endl;
+}
+/*
+double SyntacticLanguageModel::perplexity() {
- std::vector<std::string> words(3);
- words[0] = "no";
- words[1] = ",";
- words[2] = "zxvth";
+ SyntacticLanguageModelState<YModel,XModel,S,R> *prev =
+ new SyntacticLanguageModelState<YModel,XModel,S,R>(m_files,m_beamWidth);
+ std::cerr << "Initial prob:" << "\t" << prev->getProb() <<std::endl;
- for (std::vector<std::string>::iterator i=words.begin();
- i != words.end();
- i++) {
- prev = new SyntacticLanguageModelState<YModel,XModel,S,R>(prev, *i);
- std::cerr << *i << "\t" << prev->getProb() <<std::endl;
+ std::vector<std::string> words(3);
+ words[0] = "no";
+ words[1] = ",";
+ words[2] = "zxvth";
- }
- if (true) exit(-1);
+ for (std::vector<std::string>::iterator i=words.begin();
+ i != words.end();
+ i++) {
- return prev->getProb();
+ prev = new SyntacticLanguageModelState<YModel,XModel,S,R>(prev, *i);
+ std::cerr << *i << "\t" << prev->getProb() <<std::endl;
}
- */
- FFState* SyntacticLanguageModel::Evaluate(const Hypothesis& cur_hypo,
- const FFState* prev_state,
- ScoreComponentCollection* accumulator) const {
-
- VERBOSE(3,"Evaluating SyntacticLanguageModel for a hypothesis" << endl);
-
- SyntacticLanguageModelState<YModel,XModel,S,R>* tmpState = NULL;
- SyntacticLanguageModelState<YModel,XModel,S,R>* nextState = NULL;
-
-
- const TargetPhrase& targetPhrase = cur_hypo.GetCurrTargetPhrase();
-
- for (size_t i=0, n=targetPhrase.GetSize(); i<n; i++) {
-
- const Word& word = targetPhrase.GetWord(i);
- const Factor* factor = word.GetFactor(m_factorType);
-
- const std::string& string = factor->GetString();
-
- if (i==0) {
- nextState = new SyntacticLanguageModelState<YModel,XModel,S,R>((const SyntacticLanguageModelState<YModel,XModel,S,R>*)prev_state, string);
- } else {
- tmpState = nextState;
- nextState = new SyntacticLanguageModelState<YModel,XModel,S,R>(tmpState, string);
- delete tmpState;
- }
-
- double score = nextState->getScore();
- VERBOSE(3,"SynLM evaluated a score of " << score << endl);
- accumulator->Assign( this, score );
- }
-
+ if (true) exit(-1);
+
+ return prev->getProb();
+
+}
+*/
+FFState* SyntacticLanguageModel::Evaluate(const Hypothesis& cur_hypo,
+ const FFState* prev_state,
+ ScoreComponentCollection* accumulator) const
+{
+
+ VERBOSE(3,"Evaluating SyntacticLanguageModel for a hypothesis" << endl);
+
+ SyntacticLanguageModelState<YModel,XModel,S,R>* tmpState = NULL;
+ SyntacticLanguageModelState<YModel,XModel,S,R>* nextState = NULL;
+
- return nextState;
+ const TargetPhrase& targetPhrase = cur_hypo.GetCurrTargetPhrase();
+ for (size_t i=0, n=targetPhrase.GetSize(); i<n; i++) {
+
+ const Word& word = targetPhrase.GetWord(i);
+ const Factor* factor = word.GetFactor(m_factorType);
+
+ const std::string& string = factor->GetString();
+
+ if (i==0) {
+ nextState = new SyntacticLanguageModelState<YModel,XModel,S,R>((const SyntacticLanguageModelState<YModel,XModel,S,R>*)prev_state, string);
+ } else {
+ tmpState = nextState;
+ nextState = new SyntacticLanguageModelState<YModel,XModel,S,R>(tmpState, string);
+ delete tmpState;
+ }
+
+ double score = nextState->getScore();
+ VERBOSE(3,"SynLM evaluated a score of " << score << endl);
+ accumulator->Assign( this, score );
}
+
+
+ return nextState;
+
+}
+
}
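
The reflowed Evaluate() above is the usual stateful feature-function loop: extend the previous hypothesis state one target word at a time, delete each intermediate state, push the score into the accumulator, and return only the final state. A self-contained toy version of that chaining, with invented State/Extend stand-ins (the sample words echo the commented-out perplexity() test):

    #include <iostream>
    #include <string>
    #include <vector>

    // Toy stand-ins for SyntacticLanguageModelState and its construction.
    struct State {
      double score;
      explicit State(double s) : score(s) {}
    };

    // "Extend" a state by one word; here the score is just negative word length.
    State* Extend(const State* prev, const std::string& word)
    {
      return new State(prev->score - double(word.size()));
    }

    int main()
    {
      const char* raw[] = {"no", ",", "zxvth"};
      std::vector<std::string> words(raw, raw + 3);
      State* prevState = new State(0.0);   // plays the role of prev_state
      State* nextState = NULL;
      for (size_t i = 0; i < words.size(); ++i) {
        if (i == 0) {
          nextState = Extend(prevState, words[i]);
        } else {
          State* tmpState = nextState;     // hold the intermediate state
          nextState = Extend(tmpState, words[i]);
          delete tmpState;                 // free it, exactly as Evaluate() does
        }
        std::cout << words[i] << "\t" << nextState->score << std::endl;
      }
      delete prevState;
      delete nextState;
      return 0;
    }
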
diff --git a/moses/SyntacticLanguageModel.h b/moses/SyntacticLanguageModel.h
index 3cd4c58e9..6e88d85c1 100644
--- a/moses/SyntacticLanguageModel.h
+++ b/moses/SyntacticLanguageModel.h
@@ -12,40 +12,41 @@ class XModel; // observed model
namespace Moses
{
- template <class MH, class MO> class SyntacticLanguageModelFiles;
-
- class SyntacticLanguageModel : public StatefulFeatureFunction {
+template <class MH, class MO> class SyntacticLanguageModelFiles;
- public:
- SyntacticLanguageModel(const std::string &line);
+class SyntacticLanguageModel : public StatefulFeatureFunction
+{
+
+public:
+ SyntacticLanguageModel(const std::string &line);
- ~SyntacticLanguageModel();
+ ~SyntacticLanguageModel();
- size_t GetNumScoreComponents() const;
+ size_t GetNumScoreComponents() const;
- const FFState* EmptyHypothesisState(const InputType &input) const;
+ const FFState* EmptyHypothesisState(const InputType &input) const;
- FFState* Evaluate(const Hypothesis& cur_hypo,
- const FFState* prev_state,
- ScoreComponentCollection* accumulator) const;
+ FFState* Evaluate(const Hypothesis& cur_hypo,
+ const FFState* prev_state,
+ ScoreComponentCollection* accumulator) const;
- FFState* EvaluateChart(const ChartHypothesis& cur_hypo,
- int featureID,
- ScoreComponentCollection* accumulator) const {
- throw std::runtime_error("Syntactic LM can only be used with phrase-based decoder.");
- }
+ FFState* EvaluateChart(const ChartHypothesis& cur_hypo,
+ int featureID,
+ ScoreComponentCollection* accumulator) const {
+ throw std::runtime_error("Syntactic LM can only be used with phrase-based decoder.");
+ }
- // double perplexity();
+ // double perplexity();
- private:
+private:
- const size_t m_NumScoreComponents;
- SyntacticLanguageModelFiles<YModel,XModel>* m_files;
- const FactorType m_factorType;
- const size_t m_beamWidth;
+ const size_t m_NumScoreComponents;
+ SyntacticLanguageModelFiles<YModel,XModel>* m_files;
+ const FactorType m_factorType;
+ const size_t m_beamWidth;
- };
+};
}
diff --git a/moses/SyntacticLanguageModelFiles.h b/moses/SyntacticLanguageModelFiles.h
index 2e12e88c6..b91c0abfe 100644
--- a/moses/SyntacticLanguageModelFiles.h
+++ b/moses/SyntacticLanguageModelFiles.h
@@ -9,50 +9,55 @@
namespace Moses
{
-template <class MH, class MO>
-class SyntacticLanguageModelFiles {
+template <class MH, class MO>
+class SyntacticLanguageModelFiles
+{
- public:
+public:
SyntacticLanguageModelFiles(const std::vector<std::string>& filePaths);
~SyntacticLanguageModelFiles();
-
+
MH* getHiddenModel();
MO* getObservedModel();
- private:
+private:
MH* hiddenModel;
MO* observedModel;
-
+
};
template <class MH, class MO>
- SyntacticLanguageModelFiles<MH,MO>::SyntacticLanguageModelFiles(const std::vector<std::string>& filePaths) {
+SyntacticLanguageModelFiles<MH,MO>::SyntacticLanguageModelFiles(const std::vector<std::string>& filePaths)
+{
this->hiddenModel = new MH();
this->observedModel = new MO();
-
+
//// I. LOAD MODELS...
std::cerr << "Reading syntactic language model files...\n";
// For each model file...
for ( int a=0, n=filePaths.size(); a<n; a++ ) { // read models
FILE* pf = fopen(filePaths[a].c_str(),"r"); //CHECK(pf); // Read model file
- if(!pf){
+ if(!pf) {
std::cerr << "Error loading model file " << filePaths[a] << std::endl;
return;
}
std::cerr << "Loading model \'" << filePaths[a] << "\'...\n";
- int c=' '; int i=0; int line=1; String sBuff(1000); // Lookahead/ctrs/buffers
+ int c=' ';
+ int i=0;
+ int line=1;
+ String sBuff(1000); // Lookahead/ctrs/buffers
CONSUME_ALL ( pf, c, WHITESPACE(c), line); // Get to first record
while ( c!=-1 && c!='\0' && c!='\5' ) { // For each record
CONSUME_STR ( pf, c, (c!='\n' && c!='\0' && c!='\5'), sBuff, i, line ); // Consume line
StringInput si(sBuff.c_array());
if ( !( sBuff[0]=='#' // Accept comments/fields
- || si>>*(this->hiddenModel)>>"\0"!=NULL
- || si>>*(this->observedModel)>>"\0"!=NULL
- ))
- std::cerr<<"\nERROR: can't parse \'"<<sBuff<<"\' in line "<<line<<"\n\n";
+ || si>>*(this->hiddenModel)>>"\0"!=NULL
+ || si>>*(this->observedModel)>>"\0"!=NULL
+ ))
+ std::cerr<<"\nERROR: can't parse \'"<<sBuff<<"\' in line "<<line<<"\n\n";
CONSUME_ALL ( pf, c, WHITESPACE(c), line); // Consume whitespace
if ( line%100000==0 ) std::cerr<<" "<<line<<" lines read...\n"; // Progress for big models
}
@@ -66,7 +71,8 @@ template <class MH, class MO>
template <class MH, class MO>
- SyntacticLanguageModelFiles<MH,MO>::~SyntacticLanguageModelFiles() {
+SyntacticLanguageModelFiles<MH,MO>::~SyntacticLanguageModelFiles()
+{
VERBOSE(3,"Destructing syntactic language model files" << std::endl);
delete hiddenModel;
@@ -76,15 +82,17 @@ template <class MH, class MO>
template <class MH, class MO>
- MH* SyntacticLanguageModelFiles<MH,MO>::getHiddenModel() {
-
+MH* SyntacticLanguageModelFiles<MH,MO>::getHiddenModel()
+{
+
return this->hiddenModel;
}
template <class MH, class MO>
- MO* SyntacticLanguageModelFiles<MH,MO>::getObservedModel() {
-
+MO* SyntacticLanguageModelFiles<MH,MO>::getObservedModel()
+{
+
return this->observedModel;
}
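
The loader above leans on the project's CONSUME_ALL/CONSUME_STR macros and String buffers. For readers without those headers, a plain-C++ paraphrase of the same record loop, with comments accepted and progress printed every 100000 lines (the actual model-parsing step is elided):

    #include <cstdio>
    #include <iostream>

    int main(int argc, char** argv)
    {
      if (argc < 2) return 1;
      std::FILE* pf = std::fopen(argv[1], "r");
      if (!pf) {
        std::cerr << "Error loading model file " << argv[1] << std::endl;
        return 1;
      }
      char sBuff[1000];                                // line buffer, as above
      long line = 0;
      while (std::fgets(sBuff, sizeof(sBuff), pf)) {   // one record per line
        ++line;
        if (sBuff[0] == '#') continue;                 // accept comments, as above
        // ... hand the record to the hidden/observed model parsers here ...
        if (line % 100000 == 0)
          std::cerr << "  " << line << " lines read...\n"; // progress for big models
      }
      std::fclose(pf);
      return 0;
    }
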
diff --git a/moses/SyntacticLanguageModelState.h b/moses/SyntacticLanguageModelState.h
index 15828eedc..bf35616d9 100644
--- a/moses/SyntacticLanguageModelState.h
+++ b/moses/SyntacticLanguageModelState.h
@@ -15,8 +15,9 @@ namespace Moses
{
template <class MY, class MX, class YS=typename MY::RandVarType, class B=NullBackDat<typename MY::RandVarType> >
- class SyntacticLanguageModelState : public FFState {
- public:
+class SyntacticLanguageModelState : public FFState
+{
+public:
// Initialize an empty LM state
SyntacticLanguageModelState( SyntacticLanguageModelFiles<MY,MX>* modelData, int beamSize );
@@ -25,52 +26,53 @@ template <class MY, class MX, class YS=typename MY::RandVarType, class B=NullBac
SyntacticLanguageModelState( const SyntacticLanguageModelState* prev, std::string word );
- ~SyntacticLanguageModelState() {
- VERBOSE(3,"Destructing SyntacticLanguageModelState" << std::endl);
- delete randomVariableStore;
- }
+ ~SyntacticLanguageModelState() {
+ VERBOSE(3,"Destructing SyntacticLanguageModelState" << std::endl);
+ delete randomVariableStore;
+ }
- virtual int Compare(const FFState& other) const;
+ virtual int Compare(const FFState& other) const;
// Get the LM score from this LM state
double getScore() const;
- double getProb() const;
+ double getProb() const;
- private:
+private:
- void setScore(double score);
- void printRV();
+ void setScore(double score);
+ void printRV();
- SafeArray1D<Id<int>,pair<YS,LogProb> >* randomVariableStore;
- double prob;
- double score;
- int beamSize;
- SyntacticLanguageModelFiles<MY,MX>* modelData;
- bool sentenceStart;
+ SafeArray1D<Id<int>,pair<YS,LogProb> >* randomVariableStore;
+ double prob;
+ double score;
+ int beamSize;
+ SyntacticLanguageModelFiles<MY,MX>* modelData;
+ bool sentenceStart;
};
////////////////////////////////////////////////////////////////////////////////
-
- template <class MY, class MX, class YS, class B>
- void SyntacticLanguageModelState<MY,MX,YS,B>::printRV() {
- cerr << "*********** BEGIN printRV() ******************" << endl;
- int size=randomVariableStore->getSize();
- cerr << "randomVariableStore->getSize() == " << size << endl;
+template <class MY, class MX, class YS, class B>
+void SyntacticLanguageModelState<MY,MX,YS,B>::printRV()
+{
+
+ cerr << "*********** BEGIN printRV() ******************" << endl;
+ int size=randomVariableStore->getSize();
+ cerr << "randomVariableStore->getSize() == " << size << endl;
+
+ for (int depth=0; depth<size; depth+=1) {
- for (int depth=0; depth<size; depth+=1) {
-
- const pair<YS,LogProb> *data = &(randomVariableStore->get(depth));
- std::cerr << "randomVariableStore[" << depth << "]\t" << data->first << "\tprob = " << data->second.toProb() << "\tlogProb = " << double(data->second.toInt())/100 << std::endl;
+ const pair<YS,LogProb> *data = &(randomVariableStore->get(depth));
+ std::cerr << "randomVariableStore[" << depth << "]\t" << data->first << "\tprob = " << data->second.toProb() << "\tlogProb = " << double(data->second.toInt())/100 << std::endl;
- }
- cerr << "*********** END printRV() ******************" << endl;
+ }
+ cerr << "*********** END printRV() ******************" << endl;
- }
+}
// Initialize an empty LM state from grammar files
//
@@ -78,7 +80,8 @@ template <class MY, class MX, class YS=typename MY::RandVarType, class B=NullBac
// argv is the list of model file names
//
template <class MY, class MX, class YS, class B>
- SyntacticLanguageModelState<MY,MX,YS,B>::SyntacticLanguageModelState( SyntacticLanguageModelFiles<MY,MX>* modelData, int beamSize ) {
+SyntacticLanguageModelState<MY,MX,YS,B>::SyntacticLanguageModelState( SyntacticLanguageModelFiles<MY,MX>* modelData, int beamSize )
+{
this->randomVariableStore = new SafeArray1D<Id<int>,pair<YS,LogProb> >();
this->modelData = modelData;
@@ -89,7 +92,7 @@ template <class MY, class MX, class YS, class B>
StringInput(String(BEG_STATE).c_array())>>xBEG>>"\0";
cerr<<xBEG<<"\n";
- // cout << "Examining RV store just before RV init" << endl;
+ // cout << "Examining RV store just before RV init" << endl;
//printRV();
// Initialize the random variable store
@@ -107,16 +110,17 @@ template <class MY, class MX, class YS, class B>
//score = l.toDouble();
setScore(l.toDouble());
// MY::F_ROOT_OBS = true;
- // this->modelData->getHiddenModel()->setRootObs(true);
-
-
+// this->modelData->getHiddenModel()->setRootObs(true);
+
+
}
template <class MY, class MX, class YS, class B>
- int SyntacticLanguageModelState<MY,MX,YS,B>::Compare(const FFState& other) const {
+int SyntacticLanguageModelState<MY,MX,YS,B>::Compare(const FFState& other) const
+{
/*
- const SyntacticLanguageModelState<MY,MX,YS,B>& o =
+ const SyntacticLanguageModelState<MY,MX,YS,B>& o =
static_cast<const SyntacticLanguageModelState<MY,MX,YS,B>&>(other);
if (o.score > score) return 1;
@@ -124,13 +128,14 @@ template <class MY, class MX, class YS, class B>
else return 0;
*/
return 0;
- }
+}
template <class MY, class MX, class YS, class B>
- SyntacticLanguageModelState<MY,MX,YS,B>::SyntacticLanguageModelState( const SyntacticLanguageModelState* prev, std::string word ) {
+SyntacticLanguageModelState<MY,MX,YS,B>::SyntacticLanguageModelState( const SyntacticLanguageModelState* prev, std::string word )
+{
- // Initialize member variables
+ // Initialize member variables
this->randomVariableStore = new SafeArray1D<Id<int>,pair<YS,LogProb> >();
this->modelData = prev->modelData;
this->beamSize = prev->beamSize;
@@ -143,13 +148,13 @@ template <class MY, class MX, class YS, class B>
// Get HHMM model files
MY& mH = *(modelData->getHiddenModel());
MX& mO = *(modelData->getObservedModel());
-
+
// Initialize HHMM
- HMM<MY,MX,YS,B> hmm(mH,mO);
+ HMM<MY,MX,YS,B> hmm(mH,mO);
int MAX_WORDS = 2;
hmm.init(MAX_WORDS,this->beamSize,prev->randomVariableStore);
- typename MX::RandVarType x(word.c_str());
- // cout << "Examining HHMM just after hmm.init" << endl;
+ typename MX::RandVarType x(word.c_str());
+ // cout << "Examining HHMM just after hmm.init" << endl;
// hmm.debugPrint();
@@ -158,21 +163,21 @@ template <class MY, class MX, class YS, class B>
hmm.writeCurr(cout,1);
cerr << "*********** END writeCurr() ******************" << endl;
*/
-/*
- {
-
- int wnum=1;
- list<TrellNode<YS,B> > lys = hmm.getMLSnodes(ysEND); // get mls list
- for ( typename list<TrellNode<YS,B> >::iterator i=lys.begin(); i!=lys.end(); i++, wnum++ ) { // for each frame
- cout << "HYPOTH " << wnum
- << " " << i->getBackData()
- << " " << x
- << " " << i->getId()
- << " (" << i->getLogProb() << ")"
- << endl; // print RV val
- }
- }
- */
+ /*
+ {
+
+ int wnum=1;
+ list<TrellNode<YS,B> > lys = hmm.getMLSnodes(ysEND); // get mls list
+ for ( typename list<TrellNode<YS,B> >::iterator i=lys.begin(); i!=lys.end(); i++, wnum++ ) { // for each frame
+ cout << "HYPOTH " << wnum
+ << " " << i->getBackData()
+ << " " << x
+ << " " << i->getId()
+ << " (" << i->getLogProb() << ")"
+ << endl; // print RV val
+ }
+ }
+ */
/*
@@ -189,7 +194,7 @@ template <class MY, class MX, class YS, class B>
// typename MX::RandVarType ov;
// ov.set(word.c_str(),mO);
// MY::WORD = ov.getW();
- //bool endOfSentence = prev->sentenceStart;//true;
+ //bool endOfSentence = prev->sentenceStart;//true;
// std::cerr << "About to give HHMM a word of input:\t" << word << std::endl;
@@ -197,27 +202,27 @@ template <class MY, class MX, class YS, class B>
// cout << "Examining HHMM just after hmm.updateRanked(" << x << "," << prev->sentenceStart << ")" << endl;
// hmm.debugPrint();
-/*
- cerr << "*********** BEGIN writeCurr() ******************" << endl;
- hmm.writeCurr(cout,0);
- hmm.writeCurr(cout,1);
- cerr << "*********** END writeCurr() ******************" << endl;
- */
-/*
-{
+ /*
+ cerr << "*********** BEGIN writeCurr() ******************" << endl;
+ hmm.writeCurr(cout,0);
+ hmm.writeCurr(cout,1);
+ cerr << "*********** END writeCurr() ******************" << endl;
+ */
+ /*
+ {
- int wnum=1;
- list<TrellNode<YS,B> > lys = hmm.getMLSnodes(ysEND); // get mls list
- for ( typename list<TrellNode<YS,B> >::iterator i=lys.begin(); i!=lys.end(); i++, wnum++ ) { // for each frame
- cout << "HYPOTH " << wnum
- << " " << i->getBackData()
- << " " << x
- << " " << i->getId()
- << " (" << i->getLogProb() << ")"
- << endl; // print RV val
- }
- }
- */
+ int wnum=1;
+ list<TrellNode<YS,B> > lys = hmm.getMLSnodes(ysEND); // get mls list
+ for ( typename list<TrellNode<YS,B> >::iterator i=lys.begin(); i!=lys.end(); i++, wnum++ ) { // for each frame
+ cout << "HYPOTH " << wnum
+ << " " << i->getBackData()
+ << " " << x
+ << " " << i->getId()
+ << " (" << i->getLogProb() << ")"
+ << endl; // print RV val
+ }
+ }
+ */
// X ov(word.c_str());
//mH.setWord(ov);
// MY::WORD = ov;//ov.getW();
@@ -226,17 +231,17 @@ template <class MY, class MX, class YS, class B>
//hmm.updateRanked(ov);
//mH.setRootObs(true);
//MY::F_ROOT_OBS = false;
-
+
// Get the current score
- double currSum = hmm.getCurrSum();
- //VERBOSE(3,"Setting score using currSum for " << scientific << x << " = " << currSum << endl);
+ double currSum = hmm.getCurrSum();
+ //VERBOSE(3,"Setting score using currSum for " << scientific << x << " = " << currSum << endl);
setScore(currSum);
- // cout << "Examining RV store just before RV init via gatherElementsInBeam" << endl;
+ // cout << "Examining RV store just before RV init via gatherElementsInBeam" << endl;
// printRV();
// Get new hidden random variable store from HHMM
hmm.gatherElementsInBeam(randomVariableStore);
- // cout << "Examining RV store just after RV init via gatherElementsInBeam" << endl;
+ // cout << "Examining RV store just after RV init via gatherElementsInBeam" << endl;
// printRV();
/*
cerr << "Writing hmm.writeCurr..." << endl;
@@ -248,22 +253,25 @@ template <class MY, class MX, class YS, class B>
template <class MY, class MX, class YS, class B>
-double SyntacticLanguageModelState<MY,MX,YS,B>::getProb() const {
-
+double SyntacticLanguageModelState<MY,MX,YS,B>::getProb() const
+{
+
return prob;
}
template <class MY, class MX, class YS, class B>
-double SyntacticLanguageModelState<MY,MX,YS,B>::getScore() const {
-
+double SyntacticLanguageModelState<MY,MX,YS,B>::getScore() const
+{
+
return score;
}
template <class MY, class MX, class YS, class B>
- void SyntacticLanguageModelState<MY,MX,YS,B>::setScore(double score) {
+void SyntacticLanguageModelState<MY,MX,YS,B>::setScore(double score)
+{
+
-
this->prob = score;
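
Note that Compare() in this header returns 0 unconditionally, its score-based body being commented out, so all syntactic LM states are treated as equal for recombination. For reference, a sketch of the three-way contract that the commented-out body describes:

    #include <iostream>

    // Three-way ordering over a score, matching the commented-out body above:
    // 1 if the other state scores higher, -1 if lower, 0 on ties.
    int CompareScores(double mine, double theirs)
    {
      if (theirs > mine) return 1;
      if (theirs < mine) return -1;
      return 0;
    }

    int main()
    {
      std::cout << CompareScores(-1.5, -2.0) << std::endl; // prints -1
      return 0;
    }
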
diff --git a/moses/TargetPhrase.cpp b/moses/TargetPhrase.cpp
index 81b7adf44..f3cf9d1e1 100644
--- a/moses/TargetPhrase.cpp
+++ b/moses/TargetPhrase.cpp
@@ -38,10 +38,10 @@ using namespace std;
namespace Moses
{
TargetPhrase::TargetPhrase( std::string out_string)
-:Phrase(0), m_fullScore(0.0), m_sourcePhrase(0)
-, m_alignTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo())
-, m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo())
-, m_lhsTarget(NULL)
+ :Phrase(0), m_fullScore(0.0), m_sourcePhrase(0)
+ , m_alignTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo())
+ , m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo())
+ , m_lhsTarget(NULL)
{
//ACAT
@@ -50,37 +50,36 @@ TargetPhrase::TargetPhrase( std::string out_string)
}
TargetPhrase::TargetPhrase()
-:Phrase()
-, m_fullScore(0.0)
-,m_sourcePhrase()
-, m_alignTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo())
-, m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo())
-, m_lhsTarget(NULL)
+ :Phrase()
+ , m_fullScore(0.0)
+ ,m_sourcePhrase()
+ , m_alignTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo())
+ , m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo())
+ , m_lhsTarget(NULL)
{
}
TargetPhrase::TargetPhrase(const Phrase &phrase)
-: Phrase(phrase)
-, m_fullScore(0.0)
-, m_sourcePhrase()
-, m_alignTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo())
-, m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo())
-, m_lhsTarget(NULL)
+ : Phrase(phrase)
+ , m_fullScore(0.0)
+ , m_sourcePhrase()
+ , m_alignTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo())
+ , m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo())
+ , m_lhsTarget(NULL)
{
}
TargetPhrase::TargetPhrase(const TargetPhrase &copy)
-: Phrase(copy)
-, m_fullScore(copy.m_fullScore)
-, m_sourcePhrase(copy.m_sourcePhrase)
-, m_alignTerm(copy.m_alignTerm)
-, m_alignNonTerm(copy.m_alignNonTerm)
-, m_scoreBreakdown(copy.m_scoreBreakdown)
+ : Phrase(copy)
+ , m_fullScore(copy.m_fullScore)
+ , m_sourcePhrase(copy.m_sourcePhrase)
+ , m_alignTerm(copy.m_alignTerm)
+ , m_alignNonTerm(copy.m_alignNonTerm)
+ , m_scoreBreakdown(copy.m_scoreBreakdown)
{
if (copy.m_lhsTarget) {
m_lhsTarget = new Word(copy.m_lhsTarget);
- }
- else {
+ } else {
m_lhsTarget = NULL;
}
@@ -125,8 +124,8 @@ void TargetPhrase::Evaluate(const InputType &input)
const std::vector<FeatureFunction*> &ffs = FeatureFunction::GetFeatureFunctions();
for (size_t i = 0; i < ffs.size(); ++i) {
- const FeatureFunction &ff = *ffs[i];
- ff.Evaluate(input, m_scoreBreakdown);
+ const FeatureFunction &ff = *ffs[i];
+ ff.Evaluate(input, m_scoreBreakdown);
}
}
@@ -180,7 +179,7 @@ TargetPhrase *TargetPhrase::MergeNext(const TargetPhrase &inputPhrase) const
void TargetPhrase::SetAlignmentInfo(const StringPiece &alignString)
{
- AlignmentInfo::CollType alignTerm, alignNonTerm;
+ AlignmentInfo::CollType alignTerm, alignNonTerm;
for (util::TokenIter<util::AnyCharacter, true> token(alignString, util::AnyCharacter(" \t")); token; ++token) {
util::TokenIter<util::SingleCharacter, false> dash(*token, util::SingleCharacter('-'));
@@ -194,11 +193,10 @@ void TargetPhrase::SetAlignmentInfo(const StringPiece &alignString)
if (GetWord(targetPos).IsNonTerminal()) {
- alignNonTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
+ alignNonTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
+ } else {
+ alignTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
}
- else {
- alignTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
- }
}
SetAlignTerm(alignTerm);
SetAlignNonTerm(alignNonTerm);
@@ -207,15 +205,15 @@ void TargetPhrase::SetAlignmentInfo(const StringPiece &alignString)
void TargetPhrase::SetAlignTerm(const AlignmentInfo::CollType &coll)
{
- const AlignmentInfo *alignmentInfo = AlignmentInfoCollection::Instance().Add(coll);
- m_alignTerm = alignmentInfo;
+ const AlignmentInfo *alignmentInfo = AlignmentInfoCollection::Instance().Add(coll);
+ m_alignTerm = alignmentInfo;
}
void TargetPhrase::SetAlignNonTerm(const AlignmentInfo::CollType &coll)
{
- const AlignmentInfo *alignmentInfo = AlignmentInfoCollection::Instance().Add(coll);
- m_alignNonTerm = alignmentInfo;
+ const AlignmentInfo *alignmentInfo = AlignmentInfoCollection::Instance().Add(coll);
+ m_alignNonTerm = alignmentInfo;
}
void TargetPhrase::SetSparseScore(const FeatureFunction* translationScoreProducer, const StringPiece &sparseString)
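
SetAlignmentInfo() above tokenises strings such as "0-1 1-0" with util::TokenIter and routes each pair to alignTerm or alignNonTerm according to whether the target word is a non-terminal. A stand-alone sketch of just the parsing step, using iostreams in place of the util tokenisers:

    #include <iostream>
    #include <sstream>
    #include <string>
    #include <utility>
    #include <vector>

    int main()
    {
      std::string alignString = "0-1 1-0 2-2";
      std::vector<std::pair<size_t, size_t> > pairs;
      std::istringstream tokens(alignString);
      std::string token;
      while (tokens >> token) {              // each token is "sourcePos-targetPos"
        std::istringstream dash(token);
        size_t sourcePos, targetPos;
        char sep;
        if (dash >> sourcePos >> sep >> targetPos && sep == '-')
          pairs.push_back(std::make_pair(sourcePos, targetPos));
      }
      for (size_t i = 0; i < pairs.size(); ++i)
        std::cout << pairs[i].first << " -> " << pairs[i].second << std::endl;
      return 0;
    }
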
diff --git a/moses/TargetPhrase.h b/moses/TargetPhrase.h
index df876a00a..bb1c7c5a7 100644
--- a/moses/TargetPhrase.h
+++ b/moses/TargetPhrase.h
@@ -48,10 +48,10 @@ protected:
float m_fullScore;
ScoreComponentCollection m_scoreBreakdown;
- // in case of confusion net, ptr to source phrase
- Phrase m_sourcePhrase;
- const AlignmentInfo* m_alignTerm, *m_alignNonTerm;
- const Word *m_lhsTarget;
+ // in case of confusion net, ptr to source phrase
+ Phrase m_sourcePhrase;
+ const AlignmentInfo* m_alignTerm, *m_alignNonTerm;
+ const Word *m_lhsTarget;
public:
TargetPhrase();
@@ -86,26 +86,28 @@ public:
return m_fullScore;
}
- inline const ScoreComponentCollection &GetScoreBreakdown() const
- { return m_scoreBreakdown; }
- inline ScoreComponentCollection &GetScoreBreakdown()
- { return m_scoreBreakdown; }
+ inline const ScoreComponentCollection &GetScoreBreakdown() const {
+ return m_scoreBreakdown;
+ }
+ inline ScoreComponentCollection &GetScoreBreakdown() {
+ return m_scoreBreakdown;
+ }
//TODO: Probably shouldn't copy this, but otherwise ownership is unclear
- void SetSourcePhrase(const Phrase& p)
- {
- m_sourcePhrase=p;
- }
- const Phrase& GetSourcePhrase() const
- {
- return m_sourcePhrase;
- }
-
- void SetTargetLHS(const Word *lhs)
- { m_lhsTarget = lhs; }
- const Word &GetTargetLHS() const
- { return *m_lhsTarget; }
-
+ void SetSourcePhrase(const Phrase& p) {
+ m_sourcePhrase=p;
+ }
+ const Phrase& GetSourcePhrase() const {
+ return m_sourcePhrase;
+ }
+
+ void SetTargetLHS(const Word *lhs) {
+ m_lhsTarget = lhs;
+ }
+ const Word &GetTargetLHS() const {
+ return *m_lhsTarget;
+ }
+
void SetAlignmentInfo(const StringPiece &alignString);
void SetAlignTerm(const AlignmentInfo *alignTerm) {
m_alignTerm = alignTerm;
@@ -117,11 +119,13 @@ public:
void SetAlignTerm(const AlignmentInfo::CollType &coll);
void SetAlignNonTerm(const AlignmentInfo::CollType &coll);
- const AlignmentInfo &GetAlignTerm() const
- { return *m_alignTerm; }
- const AlignmentInfo &GetAlignNonTerm() const
- { return *m_alignNonTerm; }
-
+ const AlignmentInfo &GetAlignTerm() const {
+ return *m_alignTerm;
+ }
+ const AlignmentInfo &GetAlignNonTerm() const {
+ return *m_alignNonTerm;
+ }
+
TO_STRING();
};
@@ -131,10 +135,8 @@ std::ostream& operator<<(std::ostream&, const TargetPhrase&);
/**
* Hasher that looks at source and target phrase.
**/
-struct TargetPhraseHasher
-{
- inline size_t operator()(const TargetPhrase& targetPhrase) const
- {
+struct TargetPhraseHasher {
+ inline size_t operator()(const TargetPhrase& targetPhrase) const {
size_t seed = 0;
boost::hash_combine(seed, targetPhrase);
boost::hash_combine(seed, targetPhrase.GetSourcePhrase());
@@ -145,14 +147,12 @@ struct TargetPhraseHasher
}
};
-struct TargetPhraseComparator
-{
- inline bool operator()(const TargetPhrase& lhs, const TargetPhrase& rhs) const
- {
+struct TargetPhraseComparator {
+ inline bool operator()(const TargetPhrase& lhs, const TargetPhrase& rhs) const {
return lhs.Compare(rhs) == 0 &&
- lhs.GetSourcePhrase().Compare(rhs.GetSourcePhrase()) == 0 &&
- lhs.GetAlignTerm() == rhs.GetAlignTerm() &&
- lhs.GetAlignNonTerm() == rhs.GetAlignNonTerm();
+ lhs.GetSourcePhrase().Compare(rhs.GetSourcePhrase()) == 0 &&
+ lhs.GetAlignTerm() == rhs.GetAlignTerm() &&
+ lhs.GetAlignNonTerm() == rhs.GetAlignNonTerm();
}
};
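
The hasher above folds the target phrase, its source phrase, and both alignments into one hash value by repeated boost::hash_combine calls. The same idiom on toy fields:

    #include <boost/functional/hash.hpp>
    #include <iostream>
    #include <string>

    int main()
    {
      std::string target = "maison", source = "house";
      std::size_t seed = 0;
      boost::hash_combine(seed, target);   // order matters: fold in each key field
      boost::hash_combine(seed, source);
      std::cout << "combined hash: " << seed << std::endl;
      return 0;
    }
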
diff --git a/moses/TargetPhraseCollection.cpp b/moses/TargetPhraseCollection.cpp
index 78b63d852..88ce28eb6 100644
--- a/moses/TargetPhraseCollection.cpp
+++ b/moses/TargetPhraseCollection.cpp
@@ -59,8 +59,8 @@ void TargetPhraseCollection::Sort(bool adhereTableLimit, size_t tableLimit)
{
std::vector<TargetPhrase*>::iterator iterMiddle;
iterMiddle = (tableLimit == 0 || m_collection.size() < tableLimit)
- ? m_collection.end()
- : m_collection.begin()+tableLimit;
+ ? m_collection.end()
+ : m_collection.begin()+tableLimit;
std::partial_sort(m_collection.begin(), iterMiddle, m_collection.end(),
CompareTargetPhrase());
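
The reindented call above is the table-limit idiom: std::partial_sort fully orders only the first tableLimit entries and leaves the tail in unspecified order, which is cheaper than sorting the whole collection. A self-contained illustration with plain ints:

    #include <algorithm>
    #include <functional>
    #include <iostream>
    #include <vector>

    int main()
    {
      int raw[] = {3, 9, 1, 7, 5, 8};
      std::vector<int> scores(raw, raw + 6);
      size_t tableLimit = 3;
      std::vector<int>::iterator iterMiddle =
        (tableLimit == 0 || scores.size() < tableLimit)
        ? scores.end()
        : scores.begin() + tableLimit;
      std::partial_sort(scores.begin(), iterMiddle, scores.end(),
                        std::greater<int>());
      for (size_t i = 0; i < scores.size(); ++i)
        std::cout << scores[i] << " ";     // 9 8 7, then an unordered tail
      std::cout << std::endl;
      return 0;
    }
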
diff --git a/moses/TargetPhraseCollection.h b/moses/TargetPhraseCollection.h
index 4efb911fb..4207bccef 100644
--- a/moses/TargetPhraseCollection.h
+++ b/moses/TargetPhraseCollection.h
@@ -60,7 +60,9 @@ public:
RemoveAllInColl(m_collection);
}
- const std::vector<TargetPhrase*> &GetCollection() const { return m_collection; }
+ const std::vector<TargetPhrase*> &GetCollection() const {
+ return m_collection;
+ }
//! divide collection into 2 buckets using std::nth_element, the top & bottom according to table limit
void NthElement(size_t tableLimit);
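
For contrast with the partial_sort in Sort(): the NthElement() declared above promises only a two-bucket split via std::nth_element, so neither bucket ends up internally ordered. A minimal illustration:

    #include <algorithm>
    #include <functional>
    #include <iostream>
    #include <vector>

    int main()
    {
      int raw[] = {3, 9, 1, 7, 5, 8};
      std::vector<int> v(raw, raw + 6);
      size_t tableLimit = 3;
      // Afterwards v[0..2] hold the three largest values, in no particular order.
      std::nth_element(v.begin(), v.begin() + tableLimit, v.end(),
                       std::greater<int>());
      for (size_t i = 0; i < v.size(); ++i)
        std::cout << v[i] << " ";
      std::cout << std::endl;
      return 0;
    }
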
diff --git a/moses/Terminal.h b/moses/Terminal.h
index 6247d0b6c..e7d18676e 100644
--- a/moses/Terminal.h
+++ b/moses/Terminal.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -29,7 +29,7 @@ namespace Moses
class TerminalHasher
{
- public:
+public:
// Generate a hash value for a word representing a terminal. It's
// assumed that the same subset of factors will be active for all words
// that are hashed.
@@ -47,7 +47,7 @@ class TerminalHasher
class TerminalEqualityPred
{
- public:
+public:
// Equality predicate for comparing words representing terminals. As
// with the hasher, it's assumed that all words will have the same
// subset of active factors.
diff --git a/moses/ThreadPool.h b/moses/ThreadPool.h
index fad236a98..bf981a2da 100644
--- a/moses/ThreadPool.h
+++ b/moses/ThreadPool.h
@@ -42,7 +42,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
/**
* Classes to implement a ThreadPool.
**/
-namespace Moses {
+namespace Moses
+{
/** A task to be executed by the ThreadPool
*/
@@ -50,7 +51,9 @@ class Task
{
public:
virtual void Run() = 0;
- virtual bool DeleteAfterExecution() { return true; }
+ virtual bool DeleteAfterExecution() {
+ return true;
+ }
virtual ~Task() {}
};
@@ -58,7 +61,7 @@ public:
class ThreadPool
{
- public:
+public:
/**
* Construct a thread pool of a fixed size.
**/
@@ -82,7 +85,9 @@ class ThreadPool
/**
* Set maximum number of queued threads (otherwise Submit blocks)
**/
- void SetQueueLimit( size_t limit ) { m_queueLimit = limit; }
+ void SetQueueLimit( size_t limit ) {
+ m_queueLimit = limit;
+ }
private:
/**
@@ -109,7 +114,7 @@ public:
#ifdef BOOST_HAS_PTHREADS
pthread_t tid = pthread_self();
#else
- typedef void * pthread_t;
+ typedef void * pthread_t;
pthread_t tid = 0;
#endif
std::cerr << "Executing " << m_id << " in thread id " << tid << std::endl;
diff --git a/moses/Timer.h b/moses/Timer.h
index deefa4a71..a6bd0e91a 100644
--- a/moses/Timer.h
+++ b/moses/Timer.h
@@ -33,7 +33,7 @@ public:
* using 'start' or 'restart'
*/
Timer() : running(false) {
- start_time = 0;
+ start_time = 0;
}
void start(const char* msg = 0);
diff --git a/moses/TranslationModel/BilingualDynSuffixArray.cpp b/moses/TranslationModel/BilingualDynSuffixArray.cpp
index 824529b91..a47473de5 100644
--- a/moses/TranslationModel/BilingualDynSuffixArray.cpp
+++ b/moses/TranslationModel/BilingualDynSuffixArray.cpp
@@ -7,495 +7,494 @@
using namespace std;
-namespace Moses {
+namespace Moses
+{
BilingualDynSuffixArray::BilingualDynSuffixArray():
- m_maxPhraseLength(StaticData::Instance().GetMaxPhraseLength()),
- m_maxSampleSize(20)
-{
- m_srcSA = 0;
- m_trgSA = 0;
- m_srcCorpus = new std::vector<wordID_t>();
- m_trgCorpus = new std::vector<wordID_t>();
- m_srcVocab = new Vocab(false);
- m_trgVocab = new Vocab(false);
- m_scoreCmp = 0;
+ m_maxPhraseLength(StaticData::Instance().GetMaxPhraseLength()),
+ m_maxSampleSize(20)
+{
+ m_srcSA = 0;
+ m_trgSA = 0;
+ m_srcCorpus = new std::vector<wordID_t>();
+ m_trgCorpus = new std::vector<wordID_t>();
+ m_srcVocab = new Vocab(false);
+ m_trgVocab = new Vocab(false);
+ m_scoreCmp = 0;
}
-BilingualDynSuffixArray::~BilingualDynSuffixArray()
+BilingualDynSuffixArray::~BilingualDynSuffixArray()
{
- if(m_srcSA) delete m_srcSA;
- if(m_trgSA) delete m_trgSA;
- if(m_srcVocab) delete m_srcVocab;
- if(m_trgVocab) delete m_trgVocab;
- if(m_srcCorpus) delete m_srcCorpus;
- if(m_trgCorpus) delete m_trgCorpus;
- if(m_scoreCmp) delete m_scoreCmp;
+ if(m_srcSA) delete m_srcSA;
+ if(m_trgSA) delete m_trgSA;
+ if(m_srcVocab) delete m_srcVocab;
+ if(m_trgVocab) delete m_trgVocab;
+ if(m_srcCorpus) delete m_srcCorpus;
+ if(m_trgCorpus) delete m_trgCorpus;
+ if(m_scoreCmp) delete m_scoreCmp;
}
bool BilingualDynSuffixArray::Load(
- const std::vector<FactorType>& inputFactors,
- const std::vector<FactorType>& outputFactors,
- std::string source, std::string target, std::string alignments,
- const std::vector<float> &weight)
+ const std::vector<FactorType>& inputFactors,
+ const std::vector<FactorType>& outputFactors,
+ std::string source, std::string target, std::string alignments,
+ const std::vector<float> &weight)
{
m_inputFactors = inputFactors;
m_outputFactors = outputFactors;
- m_scoreCmp = new ScoresComp(weight);
- InputFileStream sourceStrme(source);
- InputFileStream targetStrme(target);
- cerr << "Loading source corpus...\n";
- LoadCorpus(Input, sourceStrme, m_inputFactors, *m_srcCorpus, m_srcSntBreaks, m_srcVocab);
- cerr << "Loading target corpus...\n";
- LoadCorpus(Output, targetStrme, m_outputFactors,*m_trgCorpus, m_trgSntBreaks, m_trgVocab);
- CHECK(m_srcSntBreaks.size() == m_trgSntBreaks.size());
-
- // build suffix arrays and auxiliary arrays
- cerr << "Building Source Suffix Array...\n";
- m_srcSA = new DynSuffixArray(m_srcCorpus);
- if(!m_srcSA) return false;
- cerr << "Building Target Suffix Array...\n";
- //m_trgSA = new DynSuffixArray(m_trgCorpus);
- //if(!m_trgSA) return false;
+ m_scoreCmp = new ScoresComp(weight);
+ InputFileStream sourceStrme(source);
+ InputFileStream targetStrme(target);
+ cerr << "Loading source corpus...\n";
+ LoadCorpus(Input, sourceStrme, m_inputFactors, *m_srcCorpus, m_srcSntBreaks, m_srcVocab);
+ cerr << "Loading target corpus...\n";
+ LoadCorpus(Output, targetStrme, m_outputFactors,*m_trgCorpus, m_trgSntBreaks, m_trgVocab);
+ CHECK(m_srcSntBreaks.size() == m_trgSntBreaks.size());
+
+ // build suffix arrays and auxiliary arrays
+ cerr << "Building Source Suffix Array...\n";
+ m_srcSA = new DynSuffixArray(m_srcCorpus);
+ if(!m_srcSA) return false;
+ cerr << "Building Target Suffix Array...\n";
+ //m_trgSA = new DynSuffixArray(m_trgCorpus);
+ //if(!m_trgSA) return false;
cerr << "\t(Skipped. Not used)\n";
-
- InputFileStream alignStrme(alignments);
- cerr << "Loading Alignment File...\n";
- LoadRawAlignments(alignStrme);
- //LoadAlignments(alignStrme);
+
+ InputFileStream alignStrme(alignments);
+ cerr << "Loading Alignment File...\n";
+ LoadRawAlignments(alignStrme);
+ //LoadAlignments(alignStrme);
cerr << "Building frequent word cache...\n";
CacheFreqWords();
- return true;
+ return true;
}
-
+
bool BilingualDynSuffixArray::LoadTM(
- const std::vector<FactorType>& inputFactors,
- const std::vector<FactorType>& outputFactors,
- std::string source, std::string target, std::string alignments,
- const std::vector<float> &weight)
+ const std::vector<FactorType>& inputFactors,
+ const std::vector<FactorType>& outputFactors,
+ std::string source, std::string target, std::string alignments,
+ const std::vector<float> &weight)
{
m_inputFactors = inputFactors;
m_outputFactors = outputFactors;
-
+
m_scoreCmp = new ScoresComp(weight);
InputFileStream sourceStrme(source);
InputFileStream targetStrme(target);
- cerr << "Loading target corpus...\n";
+ cerr << "Loading target corpus...\n";
LoadCorpus(Output, targetStrme, m_outputFactors,*m_trgCorpus, m_trgSntBreaks, m_trgVocab);
-
- cerr << "Loading source corpus...\n";
+
+ cerr << "Loading source corpus...\n";
LoadCorpus(Input, sourceStrme, m_inputFactors, *m_srcCorpus, m_srcSntBreaks, m_srcVocab);
-
+
CHECK(m_srcSntBreaks.size() == m_trgSntBreaks.size());
-
+
  // build suffix arrays and auxiliary arrays
- cerr << "Building Source Suffix Array...\n";
- m_srcSA = new DynSuffixArray(m_srcCorpus);
+ cerr << "Building Source Suffix Array...\n";
+ m_srcSA = new DynSuffixArray(m_srcCorpus);
if(!m_srcSA) return false;
- cerr << "Building Target Suffix Array...\n";
- //m_trgSA = new DynSuffixArray(m_trgCorpus);
+ cerr << "Building Target Suffix Array...\n";
+ //m_trgSA = new DynSuffixArray(m_trgCorpus);
//if(!m_trgSA) return false;
cerr << "\t(Skipped. Not used)\n";
-
+
InputFileStream alignStrme(alignments);
- cerr << "Loading Alignment File...\n";
+ cerr << "Loading Alignment File...\n";
LoadRawAlignments(alignStrme);
//LoadAlignments(alignStrme);
cerr << "Building frequent word cache...\n";
CacheFreqWords();
return true;
-
+
}
-int BilingualDynSuffixArray::LoadRawAlignments(InputFileStream& align)
+int BilingualDynSuffixArray::LoadRawAlignments(InputFileStream& align)
{
- // stores the alignments in the raw file format
- std::string line;
- std::vector<int> vtmp;
+ // stores the alignments in the raw file format
+ std::string line;
+ std::vector<int> vtmp;
int lineNum = 1;
- while(getline(align, line)) {
+ while(getline(align, line)) {
if (lineNum % 10000 == 0)
cerr << lineNum;
- Utils::splitToInt(line, vtmp, "- ");
- CHECK(vtmp.size() % 2 == 0);
- std::vector<short> vAlgn; // store as short ints for memory
- for (std::vector<int>::const_iterator itr = vtmp.begin();
- itr != vtmp.end(); ++itr) {
- vAlgn.push_back(short(*itr));
- }
- m_rawAlignments.push_back(vAlgn);
+ Utils::splitToInt(line, vtmp, "- ");
+ CHECK(vtmp.size() % 2 == 0);
+ std::vector<short> vAlgn; // store as short ints for memory
+ for (std::vector<int>::const_iterator itr = vtmp.begin();
+ itr != vtmp.end(); ++itr) {
+ vAlgn.push_back(short(*itr));
+ }
+ m_rawAlignments.push_back(vAlgn);
++lineNum;
- }
- return m_rawAlignments.size();
+ }
+ return m_rawAlignments.size();
}
-int BilingualDynSuffixArray::LoadRawAlignments(string& align) {
- // stores the alignments in the raw file format
+int BilingualDynSuffixArray::LoadRawAlignments(string& align)
+{
+ // stores the alignments in the raw file format
vector<int> vtmp;
Utils::splitToInt(align, vtmp, "- ");
CHECK(vtmp.size() % 2 == 0);
vector<short> vAlgn; // store as short ints for memory
for (std::vector<int>::const_iterator itr = vtmp.begin();
- itr != vtmp.end(); ++itr) {
- vAlgn.push_back(short(*itr));
+ itr != vtmp.end(); ++itr) {
+ vAlgn.push_back(short(*itr));
}
m_rawAlignments.push_back(vAlgn);
return m_rawAlignments.size();
}
-int BilingualDynSuffixArray::LoadAlignments(InputFileStream& align)
+int BilingualDynSuffixArray::LoadAlignments(InputFileStream& align)
{
- std::string line;
- std::vector<int> vtmp;
- int sntIndex(0);
-
- while(getline(align, line)) {
- Utils::splitToInt(line, vtmp, "- ");
- CHECK(vtmp.size() % 2 == 0);
-
- int sourceSize = GetSourceSentenceSize(sntIndex);
- int targetSize = GetTargetSentenceSize(sntIndex);
-
- SentenceAlignment curSnt(sntIndex, sourceSize, targetSize); // initialize empty sentence
- for(int i=0; i < (int)vtmp.size(); i+=2) {
- int sourcePos = vtmp[i];
- int targetPos = vtmp[i+1];
- CHECK(sourcePos < sourceSize);
- CHECK(targetPos < targetSize);
-
- curSnt.alignedList[sourcePos].push_back(targetPos); // list of target nodes for each source word
- curSnt.numberAligned[targetPos]++; // cnt of how many source words connect to this target word
- }
- curSnt.srcSnt = m_srcCorpus + sntIndex; // point source and target sentence
- curSnt.trgSnt = m_trgCorpus + sntIndex;
- m_alignments.push_back(curSnt);
-
- sntIndex++;
- }
- return m_alignments.size();
+ std::string line;
+ std::vector<int> vtmp;
+ int sntIndex(0);
+
+ while(getline(align, line)) {
+ Utils::splitToInt(line, vtmp, "- ");
+ CHECK(vtmp.size() % 2 == 0);
+
+ int sourceSize = GetSourceSentenceSize(sntIndex);
+ int targetSize = GetTargetSentenceSize(sntIndex);
+
+ SentenceAlignment curSnt(sntIndex, sourceSize, targetSize); // initialize empty sentence
+ for(int i=0; i < (int)vtmp.size(); i+=2) {
+ int sourcePos = vtmp[i];
+ int targetPos = vtmp[i+1];
+ CHECK(sourcePos < sourceSize);
+ CHECK(targetPos < targetSize);
+
+ curSnt.alignedList[sourcePos].push_back(targetPos); // list of target nodes for each source word
+ curSnt.numberAligned[targetPos]++; // cnt of how many source words connect to this target word
+ }
+ curSnt.srcSnt = m_srcCorpus + sntIndex; // point source and target sentence
+ curSnt.trgSnt = m_trgCorpus + sntIndex;
+ m_alignments.push_back(curSnt);
+
+ sntIndex++;
+ }
+ return m_alignments.size();
}
-SentenceAlignment BilingualDynSuffixArray::GetSentenceAlignment(const int sntIndex, bool trg2Src) const
+SentenceAlignment BilingualDynSuffixArray::GetSentenceAlignment(const int sntIndex, bool trg2Src) const
{
- // retrieves the alignments in the format used by SentenceAlignment.Extract()
- int sntGiven = trg2Src ? GetTargetSentenceSize(sntIndex) : GetSourceSentenceSize(sntIndex);
- int sntExtract = trg2Src ? GetSourceSentenceSize(sntIndex) : GetTargetSentenceSize(sntIndex);
- std::vector<short> alignment = m_rawAlignments.at(sntIndex);
- SentenceAlignment curSnt(sntIndex, sntGiven, sntExtract); // initialize empty sentence
- for(size_t i=0; i < alignment.size(); i+=2) {
- int sourcePos = alignment[i];
- int targetPos = alignment[i+1];
- if(trg2Src) {
- curSnt.alignedList[targetPos].push_back(sourcePos); // list of target nodes for each source word
- curSnt.numberAligned[sourcePos]++; // cnt of how many source words connect to this target word
- }
- else {
- curSnt.alignedList[sourcePos].push_back(targetPos); // list of target nodes for each source word
- curSnt.numberAligned[targetPos]++; // cnt of how many source words connect to this target word
- }
- }
- curSnt.srcSnt = m_srcCorpus + sntIndex; // point source and target sentence
- curSnt.trgSnt = m_trgCorpus + sntIndex;
-
- return curSnt;
+ // retrieves the alignments in the format used by SentenceAlignment.Extract()
+ int sntGiven = trg2Src ? GetTargetSentenceSize(sntIndex) : GetSourceSentenceSize(sntIndex);
+ int sntExtract = trg2Src ? GetSourceSentenceSize(sntIndex) : GetTargetSentenceSize(sntIndex);
+ std::vector<short> alignment = m_rawAlignments.at(sntIndex);
+ SentenceAlignment curSnt(sntIndex, sntGiven, sntExtract); // initialize empty sentence
+ for(size_t i=0; i < alignment.size(); i+=2) {
+ int sourcePos = alignment[i];
+ int targetPos = alignment[i+1];
+ if(trg2Src) {
+ curSnt.alignedList[targetPos].push_back(sourcePos); // list of target nodes for each source word
+ curSnt.numberAligned[sourcePos]++; // cnt of how many source words connect to this target word
+ } else {
+ curSnt.alignedList[sourcePos].push_back(targetPos); // list of target nodes for each source word
+ curSnt.numberAligned[targetPos]++; // cnt of how many source words connect to this target word
+ }
+ }
+ curSnt.srcSnt = m_srcCorpus + sntIndex; // point source and target sentence
+ curSnt.trgSnt = m_trgCorpus + sntIndex;
+
+ return curSnt;
}
-bool BilingualDynSuffixArray::ExtractPhrases(const int& sntIndex, const int& wordIndex,
- const int& sourceSize, std::vector<PhrasePair*>& phrasePairs, bool trg2Src) const
+bool BilingualDynSuffixArray::ExtractPhrases(const int& sntIndex, const int& wordIndex,
+ const int& sourceSize, std::vector<PhrasePair*>& phrasePairs, bool trg2Src) const
{
- /* ExtractPhrases() can extract the matching phrases for both directions by using the trg2Src
- * parameter */
- SentenceAlignment curSnt = GetSentenceAlignment(sntIndex, trg2Src);
- // get span of phrase in source sentence
- int beginSentence = m_srcSntBreaks[sntIndex];
- int rightIdx = wordIndex - beginSentence
- ,leftIdx = rightIdx - sourceSize + 1;
- return curSnt.Extract(m_maxPhraseLength, phrasePairs, leftIdx, rightIdx); // extract all phrase Alignments in sentence
+ /* ExtractPhrases() can extract the matching phrases for both directions by using the trg2Src
+ * parameter */
+ SentenceAlignment curSnt = GetSentenceAlignment(sntIndex, trg2Src);
+ // get span of phrase in source sentence
+ int beginSentence = m_srcSntBreaks[sntIndex];
+ int rightIdx = wordIndex - beginSentence
+ ,leftIdx = rightIdx - sourceSize + 1;
+ return curSnt.Extract(m_maxPhraseLength, phrasePairs, leftIdx, rightIdx); // extract all phrase Alignments in sentence
}
int BilingualDynSuffixArray::LoadCorpus(FactorDirection direction, InputFileStream& corpus, const FactorList& factors,
- std::vector<wordID_t>& cArray, std::vector<wordID_t>& sntArray,
- Vocab* vocab)
+ std::vector<wordID_t>& cArray, std::vector<wordID_t>& sntArray,
+ Vocab* vocab)
{
- std::string line, word;
- int sntIdx(0);
+ std::string line, word;
+ int sntIdx(0);
// corpus.seekg(0); Seems needless -> commented out to allow loading of gzipped corpora (gzfilebuf doesn't support seeking).
- const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
- while(getline(corpus, line)) {
- sntArray.push_back(sntIdx);
- Phrase phrase(ARRAY_SIZE_INCR);
- // parse phrase
- phrase.CreateFromString(direction, factors, line, factorDelimiter, NULL);
- // store words in vocabulary and corpus
- for( size_t i = 0; i < phrase.GetSize(); ++i) {
- cArray.push_back( vocab->GetWordID(phrase.GetWord(i)) );
- }
- sntIdx += phrase.GetSize();
- }
- //cArray.push_back(vocab->GetkOOVWordID); // signify end of corpus
+ const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
+ while(getline(corpus, line)) {
+ sntArray.push_back(sntIdx);
+ Phrase phrase(ARRAY_SIZE_INCR);
+ // parse phrase
+ phrase.CreateFromString(direction, factors, line, factorDelimiter, NULL);
+ // store words in vocabulary and corpus
+ for( size_t i = 0; i < phrase.GetSize(); ++i) {
+ cArray.push_back( vocab->GetWordID(phrase.GetWord(i)) );
+ }
+ sntIdx += phrase.GetSize();
+ }
+ //cArray.push_back(vocab->GetkOOVWordID); // signify end of corpus
vocab->MakeClosed(); // avoid adding words
- return cArray.size();
+ return cArray.size();
}
-bool BilingualDynSuffixArray::GetLocalVocabIDs(const Phrase& src, SAPhrase &output) const
+bool BilingualDynSuffixArray::GetLocalVocabIDs(const Phrase& src, SAPhrase &output) const
{
- // looks up the SA vocab ids for the current src phrase
- size_t phraseSize = src.GetSize();
- for (size_t pos = 0; pos < phraseSize; ++pos) {
- const Word &word = src.GetWord(pos);
- wordID_t arrayId = m_srcVocab->GetWordID(word);
- if (arrayId == m_srcVocab->GetkOOVWordID())
- { // oov
- return false;
- }
- else
- {
- output.SetId(pos, arrayId);
- //cerr << arrayId << " ";
- }
- }
- return true;
+ // looks up the SA vocab ids for the current src phrase
+ size_t phraseSize = src.GetSize();
+ for (size_t pos = 0; pos < phraseSize; ++pos) {
+ const Word &word = src.GetWord(pos);
+ wordID_t arrayId = m_srcVocab->GetWordID(word);
+ if (arrayId == m_srcVocab->GetkOOVWordID()) {
+ // oov
+ return false;
+ } else {
+ output.SetId(pos, arrayId);
+ //cerr << arrayId << " ";
+ }
+ }
+ return true;
}
-pair<float, float> BilingualDynSuffixArray::GetLexicalWeight(const PhrasePair& phrasepair) const
+pair<float, float> BilingualDynSuffixArray::GetLexicalWeight(const PhrasePair& phrasepair) const
{
- //return pair<float, float>(1, 1);
- float srcLexWeight(1.0), trgLexWeight(1.0);
- std::map<pair<wordID_t, wordID_t>, float> targetProbs; // collect sum of target probs given source words
- //const SentenceAlignment& alignment = m_alignments[phrasepair.m_sntIndex];
- const SentenceAlignment& alignment = GetSentenceAlignment(phrasepair.m_sntIndex);
- std::map<pair<wordID_t, wordID_t>, pair<float, float> >::const_iterator itrCache;
- // for each source word
- for(int srcIdx = phrasepair.m_startSource; srcIdx <= phrasepair.m_endSource; ++srcIdx) {
- float srcSumPairProbs(0);
- wordID_t srcWord = m_srcCorpus->at(srcIdx + m_srcSntBreaks[phrasepair.m_sntIndex]); // localIDs
- const std::vector<int>& srcWordAlignments = alignment.alignedList.at(srcIdx);
+ //return pair<float, float>(1, 1);
+ float srcLexWeight(1.0), trgLexWeight(1.0);
+ std::map<pair<wordID_t, wordID_t>, float> targetProbs; // collect sum of target probs given source words
+ //const SentenceAlignment& alignment = m_alignments[phrasepair.m_sntIndex];
+ const SentenceAlignment& alignment = GetSentenceAlignment(phrasepair.m_sntIndex);
+ std::map<pair<wordID_t, wordID_t>, pair<float, float> >::const_iterator itrCache;
+ // for each source word
+ for(int srcIdx = phrasepair.m_startSource; srcIdx <= phrasepair.m_endSource; ++srcIdx) {
+ float srcSumPairProbs(0);
+ wordID_t srcWord = m_srcCorpus->at(srcIdx + m_srcSntBreaks[phrasepair.m_sntIndex]); // localIDs
+ const std::vector<int>& srcWordAlignments = alignment.alignedList.at(srcIdx);
// for each target word aligned to this source word in this alignment
- if(srcWordAlignments.size() == 0) { // get p(NULL|src)
- pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, m_srcVocab->GetkOOVWordID());
- itrCache = m_wordPairCache.find(wordpair);
- if(itrCache == m_wordPairCache.end()) { // if not in cache
- CacheWordProbs(srcWord);
- itrCache = m_wordPairCache.find(wordpair); // search cache again
- }
- CHECK(itrCache != m_wordPairCache.end());
- srcSumPairProbs += itrCache->second.first;
- targetProbs[wordpair] = itrCache->second.second;
- }
- else { // extract p(trg|src)
- for(size_t i = 0; i < srcWordAlignments.size(); ++i) { // for each aligned word
- int trgIdx = srcWordAlignments[i];
- wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]);
- // get probability of this source->target word pair
- pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, trgWord);
- itrCache = m_wordPairCache.find(wordpair);
- if(itrCache == m_wordPairCache.end()) { // if not in cache
+ if(srcWordAlignments.size() == 0) { // get p(NULL|src)
+ pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, m_srcVocab->GetkOOVWordID());
+ itrCache = m_wordPairCache.find(wordpair);
+ if(itrCache == m_wordPairCache.end()) { // if not in cache
+ CacheWordProbs(srcWord);
+ itrCache = m_wordPairCache.find(wordpair); // search cache again
+ }
+ CHECK(itrCache != m_wordPairCache.end());
+ srcSumPairProbs += itrCache->second.first;
+ targetProbs[wordpair] = itrCache->second.second;
+ } else { // extract p(trg|src)
+ for(size_t i = 0; i < srcWordAlignments.size(); ++i) { // for each aligned word
+ int trgIdx = srcWordAlignments[i];
+ wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]);
+ // get probability of this source->target word pair
+ pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, trgWord);
+ itrCache = m_wordPairCache.find(wordpair);
+ if(itrCache == m_wordPairCache.end()) { // if not in cache
CacheWordProbs(srcWord);
- itrCache = m_wordPairCache.find(wordpair); // search cache again
- }
- CHECK(itrCache != m_wordPairCache.end());
- srcSumPairProbs += itrCache->second.first;
- targetProbs[wordpair] = itrCache->second.second;
- }
- }
- float srcNormalizer = srcWordAlignments.size() < 2 ? 1.0 : 1.0 / float(srcWordAlignments.size());
- srcLexWeight *= (srcNormalizer * srcSumPairProbs);
- } // end for each source word
- for(int trgIdx = phrasepair.m_startTarget; trgIdx <= phrasepair.m_endTarget; ++trgIdx) {
- float trgSumPairProbs(0);
- wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]);
- for (std::map<pair<wordID_t, wordID_t>, float>::const_iterator trgItr
- = targetProbs.begin(); trgItr != targetProbs.end(); ++trgItr) {
- if(trgItr->first.second == trgWord)
- trgSumPairProbs += trgItr->second;
+ itrCache = m_wordPairCache.find(wordpair); // search cache again
}
- if(trgSumPairProbs == 0) continue; // currently don't store target-side SA
- int noAligned = alignment.numberAligned.at(trgIdx);
- float trgNormalizer = noAligned < 2 ? 1.0 : 1.0 / float(noAligned);
- trgLexWeight *= (trgNormalizer * trgSumPairProbs);
- }
- // TODO::Need to get p(NULL|trg)
- return pair<float, float>(srcLexWeight, trgLexWeight);
+ CHECK(itrCache != m_wordPairCache.end());
+ srcSumPairProbs += itrCache->second.first;
+ targetProbs[wordpair] = itrCache->second.second;
+ }
+ }
+ float srcNormalizer = srcWordAlignments.size() < 2 ? 1.0 : 1.0 / float(srcWordAlignments.size());
+ srcLexWeight *= (srcNormalizer * srcSumPairProbs);
+ } // end for each source word
+ for(int trgIdx = phrasepair.m_startTarget; trgIdx <= phrasepair.m_endTarget; ++trgIdx) {
+ float trgSumPairProbs(0);
+ wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]);
+ for (std::map<pair<wordID_t, wordID_t>, float>::const_iterator trgItr
+ = targetProbs.begin(); trgItr != targetProbs.end(); ++trgItr) {
+ if(trgItr->first.second == trgWord)
+ trgSumPairProbs += trgItr->second;
+ }
+ if(trgSumPairProbs == 0) continue; // currently don't store target-side SA
+ int noAligned = alignment.numberAligned.at(trgIdx);
+ float trgNormalizer = noAligned < 2 ? 1.0 : 1.0 / float(noAligned);
+ trgLexWeight *= (trgNormalizer * trgSumPairProbs);
+ }
+ // TODO::Need to get p(NULL|trg)
+ return pair<float, float>(srcLexWeight, trgLexWeight);
}
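
The two running products above are the familiar lexical weights from phrase extraction: each source word contributes the alignment-normalised sum of its word-translation probabilities, with unaligned words paired against NULL, and the target direction mirrors it. In the usual notation, with a(i) the set of target positions aligned to source position i:

    srcLexWeight = \prod_i \frac{1}{\max(1, |a(i)|)} \sum_{j \in a(i)} w(t_j \mid s_i)

The trgLexWeight loop computes the symmetric product over target positions, skipping positions whose probability mass was never cached (the target-side suffix array is currently unused).
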
-void BilingualDynSuffixArray::CacheFreqWords() const {
+void BilingualDynSuffixArray::CacheFreqWords() const
+{
std::multimap<int, wordID_t> wordCnts;
// for each source word in vocab
- Vocab::Word2Id::const_iterator it;
+ Vocab::Word2Id::const_iterator it;
for(it = m_srcVocab->VocabStart(); it != m_srcVocab->VocabEnd(); ++it) {
// get its frequency
wordID_t srcWord = it->second;
std::vector<wordID_t> sword(1, srcWord), wrdIndices;
m_srcSA->GetCorpusIndex(&sword, &wrdIndices);
- if(wrdIndices.size() >= 1000) { // min count
+ if(wrdIndices.size() >= 1000) { // min count
wordCnts.insert(make_pair(wrdIndices.size(), srcWord));
}
}
int numSoFar(0);
- std::multimap<int, wordID_t>::reverse_iterator ritr;
- for(ritr = wordCnts.rbegin(); ritr != wordCnts.rend(); ++ritr) {
+ std::multimap<int, wordID_t>::reverse_iterator ritr;
+ for(ritr = wordCnts.rbegin(); ritr != wordCnts.rend(); ++ritr) {
m_freqWordsCached.insert(ritr->second);
CacheWordProbs(ritr->second);
if(++numSoFar == 50) break; // get top counts
}
cerr << "\tCached " << m_freqWordsCached.size() << " source words\n";
}
-void BilingualDynSuffixArray::CacheWordProbs(wordID_t srcWord) const
+void BilingualDynSuffixArray::CacheWordProbs(wordID_t srcWord) const
{
- std::map<wordID_t, int> counts;
- std::vector<wordID_t> sword(1, srcWord), wrdIndices;
- bool ret = m_srcSA->GetCorpusIndex(&sword, &wrdIndices);
- CHECK(ret);
- std::vector<int> sntIndexes = GetSntIndexes(wrdIndices, 1, m_srcSntBreaks);
- float denom(0);
- // for each occurrence of this word
- for(size_t snt = 0; snt < sntIndexes.size(); ++snt) {
- int sntIdx = sntIndexes.at(snt); // get corpus index for sentence
- CHECK(sntIdx != -1);
- int srcWrdSntIdx = wrdIndices.at(snt) - m_srcSntBreaks.at(sntIdx); // get word index in sentence
- const std::vector<int> srcAlg = GetSentenceAlignment(sntIdx).alignedList.at(srcWrdSntIdx); // list of target words for this source word
- if(srcAlg.size() == 0) {
- ++counts[m_srcVocab->GetkOOVWordID()]; // if not alligned then align to NULL word
- ++denom;
- }
- else { //get target words aligned to srcword in this sentence
- for(size_t i=0; i < srcAlg.size(); ++i) {
- wordID_t trgWord = m_trgCorpus->at(srcAlg[i] + m_trgSntBreaks[sntIdx]);
- ++counts[trgWord];
- ++denom;
- }
- }
- }
- // now we've gotten counts of all target words aligned to this source word
- // get probs and cache all pairs
- for(std::map<wordID_t, int>::const_iterator itrCnt = counts.begin();
- itrCnt != counts.end(); ++itrCnt) {
- pair<wordID_t, wordID_t> wordPair = make_pair(srcWord, itrCnt->first);
- float srcTrgPrb = float(itrCnt->second) / float(denom); // gives p(src->trg)
- float trgSrcPrb = float(itrCnt->second) / float(counts.size()); // gives p(trg->src)
- m_wordPairCache[wordPair] = pair<float, float>(srcTrgPrb, trgSrcPrb);
- }
+ std::map<wordID_t, int> counts;
+ std::vector<wordID_t> sword(1, srcWord), wrdIndices;
+ bool ret = m_srcSA->GetCorpusIndex(&sword, &wrdIndices);
+ CHECK(ret);
+ std::vector<int> sntIndexes = GetSntIndexes(wrdIndices, 1, m_srcSntBreaks);
+ float denom(0);
+ // for each occurrence of this word
+ for(size_t snt = 0; snt < sntIndexes.size(); ++snt) {
+ int sntIdx = sntIndexes.at(snt); // get corpus index for sentence
+ CHECK(sntIdx != -1);
+ int srcWrdSntIdx = wrdIndices.at(snt) - m_srcSntBreaks.at(sntIdx); // get word index in sentence
+ const std::vector<int> srcAlg = GetSentenceAlignment(sntIdx).alignedList.at(srcWrdSntIdx); // list of target words for this source word
+ if(srcAlg.size() == 0) {
+      ++counts[m_srcVocab->GetkOOVWordID()]; // if not aligned, align to the NULL word
+ ++denom;
+ } else { //get target words aligned to srcword in this sentence
+ for(size_t i=0; i < srcAlg.size(); ++i) {
+ wordID_t trgWord = m_trgCorpus->at(srcAlg[i] + m_trgSntBreaks[sntIdx]);
+ ++counts[trgWord];
+ ++denom;
+ }
+ }
+ }
+ // now we've gotten counts of all target words aligned to this source word
+ // get probs and cache all pairs
+ for(std::map<wordID_t, int>::const_iterator itrCnt = counts.begin();
+ itrCnt != counts.end(); ++itrCnt) {
+ pair<wordID_t, wordID_t> wordPair = make_pair(srcWord, itrCnt->first);
+ float srcTrgPrb = float(itrCnt->second) / float(denom); // gives p(src->trg)
+ float trgSrcPrb = float(itrCnt->second) / float(counts.size()); // gives p(trg->src)
+ m_wordPairCache[wordPair] = pair<float, float>(srcTrgPrb, trgSrcPrb);
+ }
}
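
The final loop of CacheWordProbs is a single normalization pass over the co-occurrence counts: p(src->trg) divides each count by the total number of alignment links (denom), while the second cached value divides by the number of distinct target types, exactly as written above. The pass in isolation, with illustrative names:

#include <map>
#include <utility>

typedef unsigned wordID_t;

// Turn raw alignment counts for one source word into the cached probability pairs.
void NormalizeCounts(wordID_t srcWord,
                     const std::map<wordID_t, int>& counts, float denom,
                     std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> >& cache)
{
  for (std::map<wordID_t, int>::const_iterator itrCnt = counts.begin();
       itrCnt != counts.end(); ++itrCnt) {
    float srcTrgPrb = float(itrCnt->second) / denom;                // p(trg|src)
    float trgSrcPrb = float(itrCnt->second) / float(counts.size()); // as in the code above
    cache[std::make_pair(srcWord, itrCnt->first)] = std::make_pair(srcTrgPrb, trgSrcPrb);
  }
}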
-SAPhrase BilingualDynSuffixArray::TrgPhraseFromSntIdx(const PhrasePair& phrasepair) const
+SAPhrase BilingualDynSuffixArray::TrgPhraseFromSntIdx(const PhrasePair& phrasepair) const
{
- // takes sentence indexes and looks up vocab IDs
- SAPhrase phraseIds(phrasepair.GetTargetSize());
- int sntIndex = phrasepair.m_sntIndex;
- int id(-1), pos(0);
- for(int i=phrasepair.m_startTarget; i <= phrasepair.m_endTarget; ++i) { // look up trg words
- id = m_trgCorpus->at(m_trgSntBreaks[sntIndex] + i);
- phraseIds.SetId(pos++, id);
- }
- return phraseIds;
+ // takes sentence indexes and looks up vocab IDs
+ SAPhrase phraseIds(phrasepair.GetTargetSize());
+ int sntIndex = phrasepair.m_sntIndex;
+ int id(-1), pos(0);
+ for(int i=phrasepair.m_startTarget; i <= phrasepair.m_endTarget; ++i) { // look up trg words
+ id = m_trgCorpus->at(m_trgSntBreaks[sntIndex] + i);
+ phraseIds.SetId(pos++, id);
+ }
+ return phraseIds;
}
-
+
TargetPhrase* BilingualDynSuffixArray::GetMosesFactorIDs(const SAPhrase& phrase, const Phrase& sourcePhrase) const
{
- TargetPhrase* targetPhrase = new TargetPhrase();
- for(size_t i=0; i < phrase.words.size(); ++i) { // look up trg words
- Word& word = m_trgVocab->GetWord( phrase.words[i]);
- CHECK(word != m_trgVocab->GetkOOVWord());
- targetPhrase->AddWord(word);
- }
- targetPhrase->SetSourcePhrase(sourcePhrase);
- // scoring
- return targetPhrase;
+ TargetPhrase* targetPhrase = new TargetPhrase();
+ for(size_t i=0; i < phrase.words.size(); ++i) { // look up trg words
+ Word& word = m_trgVocab->GetWord( phrase.words[i]);
+ CHECK(word != m_trgVocab->GetkOOVWord());
+ targetPhrase->AddWord(word);
+ }
+ targetPhrase->SetSourcePhrase(sourcePhrase);
+ // scoring
+ return targetPhrase;
}
-void BilingualDynSuffixArray::GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> > & target) const
+void BilingualDynSuffixArray::GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> > & target) const
{
//cerr << "phrase is \"" << src << endl;
- size_t sourceSize = src.GetSize();
- SAPhrase localIDs(sourceSize);
- if(!GetLocalVocabIDs(src, localIDs)) return;
- float totalTrgPhrases(0);
- std::map<SAPhrase, int> phraseCounts;
- //std::map<SAPhrase, PhrasePair> phraseColl; // (one of) the word indexes this phrase was taken from
- std::map<SAPhrase, pair<float, float> > lexicalWeights;
- std::map<SAPhrase, pair<float, float> >::iterator itrLexW;
- std::vector<unsigned> wrdIndices;
- // extract sentence IDs from SA and return rightmost index of phrases
- if(!m_srcSA->GetCorpusIndex(&(localIDs.words), &wrdIndices)) return;
+ size_t sourceSize = src.GetSize();
+ SAPhrase localIDs(sourceSize);
+ if(!GetLocalVocabIDs(src, localIDs)) return;
+ float totalTrgPhrases(0);
+ std::map<SAPhrase, int> phraseCounts;
+ //std::map<SAPhrase, PhrasePair> phraseColl; // (one of) the word indexes this phrase was taken from
+ std::map<SAPhrase, pair<float, float> > lexicalWeights;
+ std::map<SAPhrase, pair<float, float> >::iterator itrLexW;
+ std::vector<unsigned> wrdIndices;
+ // extract sentence IDs from SA and return rightmost index of phrases
+ if(!m_srcSA->GetCorpusIndex(&(localIDs.words), &wrdIndices)) return;
SampleSelection(wrdIndices);
- std::vector<int> sntIndexes = GetSntIndexes(wrdIndices, sourceSize, m_srcSntBreaks);
- // for each sentence with this phrase
- for(size_t snt = 0; snt < sntIndexes.size(); ++snt) {
- std::vector<PhrasePair*> phrasePairs; // to store all phrases possible from current sentence
- int sntIndex = sntIndexes.at(snt); // get corpus index for sentence
- if(sntIndex == -1) continue; // bad flag set by GetSntIndexes()
- ExtractPhrases(sntIndex, wrdIndices[snt], sourceSize, phrasePairs);
- //cerr << "extracted " << phrasePairs.size() << endl;
- totalTrgPhrases += phrasePairs.size(); // keep track of count of each extracted phrase pair
- std::vector<PhrasePair*>::iterator iterPhrasePair;
- for (iterPhrasePair = phrasePairs.begin(); iterPhrasePair != phrasePairs.end(); ++iterPhrasePair) {
- SAPhrase phrase = TrgPhraseFromSntIdx(**iterPhrasePair);
- phraseCounts[phrase]++; // count each unique phrase
- // NOTE::Correct but slow to extract lexical weight here. could do
+ std::vector<int> sntIndexes = GetSntIndexes(wrdIndices, sourceSize, m_srcSntBreaks);
+ // for each sentence with this phrase
+ for(size_t snt = 0; snt < sntIndexes.size(); ++snt) {
+ std::vector<PhrasePair*> phrasePairs; // to store all phrases possible from current sentence
+ int sntIndex = sntIndexes.at(snt); // get corpus index for sentence
+ if(sntIndex == -1) continue; // bad flag set by GetSntIndexes()
+ ExtractPhrases(sntIndex, wrdIndices[snt], sourceSize, phrasePairs);
+ //cerr << "extracted " << phrasePairs.size() << endl;
+ totalTrgPhrases += phrasePairs.size(); // keep track of count of each extracted phrase pair
+ std::vector<PhrasePair*>::iterator iterPhrasePair;
+ for (iterPhrasePair = phrasePairs.begin(); iterPhrasePair != phrasePairs.end(); ++iterPhrasePair) {
+ SAPhrase phrase = TrgPhraseFromSntIdx(**iterPhrasePair);
+ phraseCounts[phrase]++; // count each unique phrase
+      // NOTE: correct but slow to extract the lexical weight here; could do
// it later for only the top phrases chosen by phrase prob p(e|f)
- pair<float, float> lexWeight = GetLexicalWeight(**iterPhrasePair); // get lexical weighting for this phrase pair
- itrLexW = lexicalWeights.find(phrase); // check if phrase already has lexical weight attached
- if((itrLexW != lexicalWeights.end()) && (itrLexW->second.first < lexWeight.first))
- itrLexW->second = lexWeight; // if this lex weight is greater save it
- else lexicalWeights[phrase] = lexWeight; // else save
- }
- // done with sentence. delete SA phrase pairs
- RemoveAllInColl(phrasePairs);
- } // done with all sentences
- // convert to moses phrase pairs
- std::map<SAPhrase, int>::const_iterator iterPhrases;
- std::multimap<Scores, const SAPhrase*, ScoresComp> phraseScores (*m_scoreCmp);
- // get scores of all phrases
- for(iterPhrases = phraseCounts.begin(); iterPhrases != phraseCounts.end(); ++iterPhrases) {
- float trg2SrcMLE = float(iterPhrases->second) / totalTrgPhrases;
- itrLexW = lexicalWeights.find(iterPhrases->first);
- CHECK(itrLexW != lexicalWeights.end());
- Scores scoreVector(3);
- scoreVector[0] = trg2SrcMLE;
- scoreVector[1] = itrLexW->second.first;
- scoreVector[2] = 2.718; // exp(1);
- phraseScores.insert(make_pair(scoreVector, &iterPhrases->first));
- }
- // return top scoring phrases
- std::multimap<Scores, const SAPhrase*, ScoresComp>::reverse_iterator ritr;
- for(ritr = phraseScores.rbegin(); ritr != phraseScores.rend(); ++ritr) {
- Scores scoreVector = ritr->first;
- TargetPhrase *targetPhrase = GetMosesFactorIDs(*ritr->second, src);
- target.push_back(make_pair( scoreVector, targetPhrase));
- if(target.size() == m_maxSampleSize) break;
- }
+ pair<float, float> lexWeight = GetLexicalWeight(**iterPhrasePair); // get lexical weighting for this phrase pair
+ itrLexW = lexicalWeights.find(phrase); // check if phrase already has lexical weight attached
+ if((itrLexW != lexicalWeights.end()) && (itrLexW->second.first < lexWeight.first))
+ itrLexW->second = lexWeight; // if this lex weight is greater save it
+ else lexicalWeights[phrase] = lexWeight; // else save
+ }
+ // done with sentence. delete SA phrase pairs
+ RemoveAllInColl(phrasePairs);
+ } // done with all sentences
+ // convert to moses phrase pairs
+ std::map<SAPhrase, int>::const_iterator iterPhrases;
+ std::multimap<Scores, const SAPhrase*, ScoresComp> phraseScores (*m_scoreCmp);
+ // get scores of all phrases
+ for(iterPhrases = phraseCounts.begin(); iterPhrases != phraseCounts.end(); ++iterPhrases) {
+ float trg2SrcMLE = float(iterPhrases->second) / totalTrgPhrases;
+ itrLexW = lexicalWeights.find(iterPhrases->first);
+ CHECK(itrLexW != lexicalWeights.end());
+ Scores scoreVector(3);
+ scoreVector[0] = trg2SrcMLE;
+ scoreVector[1] = itrLexW->second.first;
+ scoreVector[2] = 2.718; // exp(1);
+ phraseScores.insert(make_pair(scoreVector, &iterPhrases->first));
+ }
+ // return top scoring phrases
+ std::multimap<Scores, const SAPhrase*, ScoresComp>::reverse_iterator ritr;
+ for(ritr = phraseScores.rbegin(); ritr != phraseScores.rend(); ++ritr) {
+ Scores scoreVector = ritr->first;
+ TargetPhrase *targetPhrase = GetMosesFactorIDs(*ritr->second, src);
+ target.push_back(make_pair( scoreVector, targetPhrase));
+ if(target.size() == m_maxSampleSize) break;
+ }
}
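
The ranking step at the end of GetTargetPhrasesByLexicalWeight relies on a std::multimap ordered by the score comparator and a reverse walk that stops at m_maxSampleSize. A reduced sketch of the same pattern, comparing only the first score component as ScoresComp does; the phrases and numbers are made up:

#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

typedef std::vector<float> Scores;

struct FirstScoreComp { // like ScoresComp: order by p(e|f) only
  bool operator()(const Scores& s1, const Scores& s2) const {
    return s1[0] < s2[0];
  }
};

int main()
{
  std::multimap<Scores, std::string, FirstScoreComp> phraseScores;
  Scores a(3); a[0] = 0.50f; a[1] = 0.10f; a[2] = 2.718f;
  Scores b(3); b[0] = 0.25f; b[1] = 0.40f; b[2] = 2.718f;
  phraseScores.insert(std::make_pair(a, "la maison"));
  phraseScores.insert(std::make_pair(b, "une maison"));

  size_t maxSampleSize = 1, emitted = 0;
  // reverse iteration returns the top-scoring phrases first
  for (std::multimap<Scores, std::string, FirstScoreComp>::reverse_iterator ritr =
         phraseScores.rbegin(); ritr != phraseScores.rend(); ++ritr) {
    std::cout << ritr->second << "\tp(e|f)=" << ritr->first[0] << "\n";
    if (++emitted == maxSampleSize) break;
  }
  return 0;
}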
-std::vector<int> BilingualDynSuffixArray::GetSntIndexes(std::vector<unsigned>& wrdIndices,
- const int sourceSize, const std::vector<unsigned>& sntBreaks) const
+std::vector<int> BilingualDynSuffixArray::GetSntIndexes(std::vector<unsigned>& wrdIndices,
+ const int sourceSize, const std::vector<unsigned>& sntBreaks) const
{
- std::vector<unsigned>::const_iterator vit;
- std::vector<int> sntIndexes;
- for(size_t i=0; i < wrdIndices.size(); ++i) {
- vit = std::upper_bound(sntBreaks.begin(), sntBreaks.end(), wrdIndices[i]);
- int index = int(vit - sntBreaks.begin()) - 1;
- // check for phrases that cross sentence boundaries
- if(wrdIndices[i] - sourceSize + 1 < sntBreaks.at(index))
- sntIndexes.push_back(-1); // set bad flag
- else
- sntIndexes.push_back(index); // store the index of the sentence in the corpus
- }
- return sntIndexes;
+ std::vector<unsigned>::const_iterator vit;
+ std::vector<int> sntIndexes;
+ for(size_t i=0; i < wrdIndices.size(); ++i) {
+ vit = std::upper_bound(sntBreaks.begin(), sntBreaks.end(), wrdIndices[i]);
+ int index = int(vit - sntBreaks.begin()) - 1;
+ // check for phrases that cross sentence boundaries
+ if(wrdIndices[i] - sourceSize + 1 < sntBreaks.at(index))
+ sntIndexes.push_back(-1); // set bad flag
+ else
+ sntIndexes.push_back(index); // store the index of the sentence in the corpus
+ }
+ return sntIndexes;
}
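
GetSntIndexes maps a corpus-wide word position back to its sentence by binary search: std::upper_bound finds the first sentence break past the position, and the preceding break is the containing sentence; positions whose phrase would start before that break get the -1 flag. The lookup in isolation, with toy offsets:

#include <algorithm>
#include <iostream>
#include <vector>

int main()
{
  // start offset of each sentence in the flat corpus
  std::vector<unsigned> sntBreaks;
  sntBreaks.push_back(0);   // sentence 0
  sntBreaks.push_back(12);  // sentence 1
  sntBreaks.push_back(20);  // sentence 2

  unsigned wordIndex = 15;
  std::vector<unsigned>::iterator vit =
    std::upper_bound(sntBreaks.begin(), sntBreaks.end(), wordIndex);
  int index = int(vit - sntBreaks.begin()) - 1; // sentence containing the word
  std::cout << "word " << wordIndex << " lies in sentence " << index << "\n"; // prints 1
  return 0;
}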
int BilingualDynSuffixArray::SampleSelection(std::vector<unsigned>& sample,
- int sampleSize) const
+ int sampleSize) const
{
// only use top 'sampleSize' number of samples
- if(sample.size() > (size_t)sampleSize)
- sample.erase(sample.begin()+sampleSize, sample.end());
- return sample.size();
+ if(sample.size() > (size_t)sampleSize)
+ sample.erase(sample.begin()+sampleSize, sample.end());
+ return sample.size();
}
-void BilingualDynSuffixArray::addSntPair(string& source, string& target, string& alignment) {
+void BilingualDynSuffixArray::addSntPair(string& source, string& target, string& alignment)
+{
vuint_t srcFactor, trgFactor;
cerr << "source, target, alignment = " << source << ", " << target << ", " << alignment << endl;
- const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
+ const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
const unsigned oldSrcCrpSize = m_srcCorpus->size(), oldTrgCrpSize = m_trgCorpus->size();
cerr << "old source corpus size = " << oldSrcCrpSize << "\told target size = " << oldTrgCrpSize << endl;
Phrase sphrase(ARRAY_SIZE_INCR);
@@ -511,7 +510,7 @@ void BilingualDynSuffixArray::addSntPair(string& source, string& target, string&
cerr << "srcFactor[" << (srcFactor.size() - 1) << "] = " << srcFactor.back() << endl;
m_srcCorpus->push_back(srcFactor.back()); // add word to corpus
}
- m_srcSntBreaks.push_back(oldSrcCrpSize); // former end of corpus is index of new sentence
+ m_srcSntBreaks.push_back(oldSrcCrpSize); // former end of corpus is index of new sentence
m_srcVocab->MakeClosed();
Phrase tphrase(ARRAY_SIZE_INCR);
tphrase.CreateFromString(Output, m_outputFactors, target, factorDelimiter, NULL);
@@ -534,16 +533,17 @@ void BilingualDynSuffixArray::addSntPair(string& source, string& target, string&
LoadRawAlignments(alignment);
m_trgVocab->MakeClosed();
//for(size_t i=0; i < sphrase.GetSize(); ++i)
- //ClearWordInCache(sIDs[i]);
-
+ //ClearWordInCache(sIDs[i]);
+
}
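
addSntPair grows the parallel corpora in place: every token of the new sentence is appended to the flat corpus vector, and the previous corpus size is recorded as the new sentence's break offset. The bookkeeping in isolation, with toy token ids and no Moses types:

#include <iostream>
#include <vector>

int main()
{
  std::vector<unsigned> corpus;    // flat stream of token ids
  std::vector<unsigned> sntBreaks; // start offset of each sentence

  unsigned sentence1[] = { 4, 8, 15 };
  unsigned sentence2[] = { 16, 23 };

  sntBreaks.push_back(corpus.size()); // former end of corpus = index of new sentence
  corpus.insert(corpus.end(), sentence1, sentence1 + 3);

  sntBreaks.push_back(corpus.size());
  corpus.insert(corpus.end(), sentence2, sentence2 + 2);

  std::cout << "sentence 1 starts at " << sntBreaks[1] << "\n"; // prints 3
  return 0;
}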
-void BilingualDynSuffixArray::ClearWordInCache(wordID_t srcWord) {
+void BilingualDynSuffixArray::ClearWordInCache(wordID_t srcWord)
+{
if(m_freqWordsCached.find(srcWord) != m_freqWordsCached.end())
return;
- std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> >::iterator it,
- first, last;
+ std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> >::iterator it,
+ first, last;
for(it = m_wordPairCache.begin(); it != m_wordPairCache.end(); ++it) {
- if(it->first.first == srcWord) { // all source words grouped
+ if(it->first.first == srcWord) { // all source words grouped
first = it; // copy first entry of srcWord
last = it++;
while(it != m_wordPairCache.end() && (it->first.first == srcWord)) {
@@ -553,80 +553,77 @@ void BilingualDynSuffixArray::ClearWordInCache(wordID_t srcWord) {
m_wordPairCache.erase(first, last);
}
}
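
Because m_wordPairCache is a std::map keyed by (src, trg) pairs and ordered lexicographically, all entries for one source word form a contiguous range, which is what the manual scan above exploits. An equivalent and arguably simpler range erase, sketched under the assumption that wordID_t is an unsigned integral type; this is a simplification, not the Moses code:

#include <limits>
#include <map>
#include <utility>

typedef unsigned wordID_t;
typedef std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> > PairCache;

void EraseSourceWord(PairCache& cache, wordID_t srcWord)
{
  // first entry for srcWord, and first entry past its last one
  PairCache::iterator first =
    cache.lower_bound(std::make_pair(srcWord, std::numeric_limits<wordID_t>::min()));
  PairCache::iterator last =
    cache.upper_bound(std::make_pair(srcWord, std::numeric_limits<wordID_t>::max()));
  cache.erase(first, last); // no-op if the word has no cached pairs
}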
-SentenceAlignment::SentenceAlignment(int sntIndex, int sourceSize, int targetSize)
- :m_sntIndex(sntIndex)
- ,numberAligned(targetSize, 0)
- ,alignedList(sourceSize)
+SentenceAlignment::SentenceAlignment(int sntIndex, int sourceSize, int targetSize)
+ :m_sntIndex(sntIndex)
+ ,numberAligned(targetSize, 0)
+ ,alignedList(sourceSize)
{
- for(int i=0; i < sourceSize; ++i) {
- std::vector<int> trgWrd;
- alignedList[i] = trgWrd;
- }
+ for(int i=0; i < sourceSize; ++i) {
+ std::vector<int> trgWrd;
+ alignedList[i] = trgWrd;
+ }
}
bool SentenceAlignment::Extract(int maxPhraseLength, std::vector<PhrasePair*> &ret, int startSource, int endSource) const
{
- // foreign = target, F=T
- // english = source, E=S
- int countTarget = numberAligned.size();
-
- int minTarget = 9999;
- int maxTarget = -1;
- std::vector< int > usedTarget = numberAligned;
- for(int sourcePos = startSource; sourcePos <= endSource; sourcePos++)
- {
- for(int ind=0; ind < (int)alignedList[sourcePos].size();ind++)
- {
- int targetPos = alignedList[sourcePos][ind];
- // cout << "point (" << targetPos << ", " << sourcePos << ")\n";
- if (targetPos<minTarget) { minTarget = targetPos; }
- if (targetPos>maxTarget) { maxTarget = targetPos; }
- usedTarget[ targetPos ]--;
- } // for(int ind=0;ind<sentence
- } // for(int sourcePos=startSource
-
- // cout << "f projected ( " << minTarget << "-" << maxTarget << ", " << startSource << "," << endSource << ")\n";
-
- if (maxTarget >= 0 && // aligned to any foreign words at all
- maxTarget-minTarget < maxPhraseLength)
- { // foreign phrase within limits
-
- // check if foreign words are aligned to out of bound english words
- bool out_of_bounds = false;
- for(int targetPos=minTarget; targetPos <= maxTarget && !out_of_bounds; targetPos++)
- {
- if (usedTarget[targetPos]>0)
- {
- // cout << "ouf of bounds: " << targetPos << "\n";
- out_of_bounds = true;
- }
- }
-
- // cout << "doing if for ( " << minTarget << "-" << maxTarget << ", " << startSource << "," << endSource << ")\n";
- if (!out_of_bounds)
- {
- // start point of foreign phrase may retreat over unaligned
- for(int startTarget = minTarget;
- (startTarget >= 0 &&
- startTarget > maxTarget-maxPhraseLength && // within length limit
- (startTarget==minTarget || numberAligned[startTarget]==0)); // unaligned
- startTarget--)
- {
- // end point of foreign phrase may advance over unaligned
- for (int endTarget=maxTarget;
- (endTarget<countTarget &&
- endTarget<startTarget+maxPhraseLength && // within length limit
- (endTarget==maxTarget || numberAligned[endTarget]==0)); // unaligned
- endTarget++)
- {
- PhrasePair *phrasePair = new PhrasePair(startTarget,endTarget,startSource,endSource, m_sntIndex);
- ret.push_back(phrasePair);
- } // for (int endTarget=maxTarget;
- } // for(int startTarget=minTarget;
- } // if (!out_of_bounds)
- } // if (maxTarget >= 0 &&
- return (ret.size() > 0);
-
+ // foreign = target, F=T
+ // english = source, E=S
+ int countTarget = numberAligned.size();
+
+ int minTarget = 9999;
+ int maxTarget = -1;
+ std::vector< int > usedTarget = numberAligned;
+ for(int sourcePos = startSource; sourcePos <= endSource; sourcePos++) {
+ for(int ind=0; ind < (int)alignedList[sourcePos].size(); ind++) {
+ int targetPos = alignedList[sourcePos][ind];
+ // cout << "point (" << targetPos << ", " << sourcePos << ")\n";
+ if (targetPos<minTarget) {
+ minTarget = targetPos;
+ }
+ if (targetPos>maxTarget) {
+ maxTarget = targetPos;
+ }
+ usedTarget[ targetPos ]--;
+ } // for(int ind=0;ind<sentence
+ } // for(int sourcePos=startSource
+
+ // cout << "f projected ( " << minTarget << "-" << maxTarget << ", " << startSource << "," << endSource << ")\n";
+
+ if (maxTarget >= 0 && // aligned to any foreign words at all
+ maxTarget-minTarget < maxPhraseLength) {
+ // foreign phrase within limits
+
+    // check if foreign words are aligned to out-of-bounds English words
+ bool out_of_bounds = false;
+ for(int targetPos=minTarget; targetPos <= maxTarget && !out_of_bounds; targetPos++) {
+ if (usedTarget[targetPos]>0) {
+        // cout << "out of bounds: " << targetPos << "\n";
+ out_of_bounds = true;
+ }
+ }
+
+ // cout << "doing if for ( " << minTarget << "-" << maxTarget << ", " << startSource << "," << endSource << ")\n";
+ if (!out_of_bounds) {
+ // start point of foreign phrase may retreat over unaligned
+ for(int startTarget = minTarget;
+ (startTarget >= 0 &&
+ startTarget > maxTarget-maxPhraseLength && // within length limit
+ (startTarget==minTarget || numberAligned[startTarget]==0)); // unaligned
+ startTarget--) {
+ // end point of foreign phrase may advance over unaligned
+ for (int endTarget=maxTarget;
+ (endTarget<countTarget &&
+ endTarget<startTarget+maxPhraseLength && // within length limit
+ (endTarget==maxTarget || numberAligned[endTarget]==0)); // unaligned
+ endTarget++) {
+ PhrasePair *phrasePair = new PhrasePair(startTarget,endTarget,startSource,endSource, m_sntIndex);
+ ret.push_back(phrasePair);
+ } // for (int endTarget=maxTarget;
+ } // for(int startTarget=minTarget;
+ } // if (!out_of_bounds)
+ } // if (maxTarget >= 0 &&
+ return (ret.size() > 0);
+
}
} // end namespace
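
SentenceAlignment::Extract is the standard phrase-pair extraction consistency check: project the source span onto the target side, verify that no covered target word is aligned outside the span, then grow the target boundaries over unaligned words. The projection-and-check core in isolation, on a toy alignment:

#include <iostream>
#include <vector>

int main()
{
  // alignedList[s] = target positions aligned to source position s
  std::vector<std::vector<int> > alignedList(3);
  alignedList[0].push_back(0);
  alignedList[1].push_back(1);
  alignedList[2].push_back(1);
  // numberAligned[t] = number of links touching target position t
  std::vector<int> numberAligned(3, 0);
  numberAligned[0] = 1;
  numberAligned[1] = 2;

  int startSource = 1, endSource = 2;
  int minTarget = 9999, maxTarget = -1;
  std::vector<int> usedTarget = numberAligned;
  for (int s = startSource; s <= endSource; ++s) {
    for (size_t i = 0; i < alignedList[s].size(); ++i) {
      int t = alignedList[s][i];
      if (t < minTarget) minTarget = t;
      if (t > maxTarget) maxTarget = t;
      usedTarget[t]--; // this link is consumed by the source span
    }
  }

  bool consistent = (maxTarget >= 0); // aligned to something at all
  for (int t = minTarget; t <= maxTarget && consistent; ++t)
    if (usedTarget[t] > 0) consistent = false; // aligned outside the span
  std::cout << "source [" << startSource << "," << endSource << "] -> target ["
            << minTarget << "," << maxTarget << "], consistent: " << consistent << "\n";
  return 0;
}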
diff --git a/moses/TranslationModel/BilingualDynSuffixArray.h b/moses/TranslationModel/BilingualDynSuffixArray.h
index 5dda1e274..08637d095 100644
--- a/moses/TranslationModel/BilingualDynSuffixArray.h
+++ b/moses/TranslationModel/BilingualDynSuffixArray.h
@@ -1,7 +1,7 @@
#ifndef moses_BilingualDynSuffixArray_h
#define moses_BilingualDynSuffixArray_h
-#include "DynSuffixArray.h"
+#include "DynSuffixArray.h"
#include "moses/TranslationModel/DynSAInclude/vocab.h"
#include "moses/TranslationModel/DynSAInclude/types.h"
#include "moses/TranslationModel/DynSAInclude/utils.h"
@@ -9,26 +9,27 @@
#include "moses/FactorTypeSet.h"
#include "moses/TargetPhrase.h"
-namespace Moses {
+namespace Moses
+{
/** @todo ask Abbey Levenberg
*/
class SAPhrase
{
public:
- std::vector<wordID_t> words;
-
- SAPhrase(size_t phraseSize)
- :words(phraseSize)
- {}
-
- void SetId(size_t pos, wordID_t id)
- {
+ std::vector<wordID_t> words;
+
+ SAPhrase(size_t phraseSize)
+ :words(phraseSize)
+ {}
+
+ void SetId(size_t pos, wordID_t id) {
CHECK(pos < words.size());
- words[pos] = id;
- }
- bool operator<(const SAPhrase& phr2) const
- { return words < phr2.words; }
+ words[pos] = id;
+ }
+ bool operator<(const SAPhrase& phr2) const {
+ return words < phr2.words;
+ }
};
/** @todo ask Abbey Levenberg
@@ -36,42 +37,44 @@ public:
class PhrasePair
{
public:
- int m_startTarget, m_endTarget, m_startSource, m_endSource, m_sntIndex;
- PhrasePair(int startTarget, int endTarget, int startSource, int endSource, int sntIndex)
- : m_startTarget(startTarget)
- , m_endTarget(endTarget)
- , m_startSource(startSource)
- , m_endSource(endSource)
- , m_sntIndex(sntIndex)
- {}
-
- size_t GetTargetSize() const
- { return m_endTarget - m_startTarget + 1; }
+ int m_startTarget, m_endTarget, m_startSource, m_endSource, m_sntIndex;
+ PhrasePair(int startTarget, int endTarget, int startSource, int endSource, int sntIndex)
+ : m_startTarget(startTarget)
+ , m_endTarget(endTarget)
+ , m_startSource(startSource)
+ , m_endSource(endSource)
+ , m_sntIndex(sntIndex)
+ {}
+
+ size_t GetTargetSize() const {
+ return m_endTarget - m_startTarget + 1;
+ }
};
-
+
/** @todo ask Abbey Levenberg
*/
-class SentenceAlignment
+class SentenceAlignment
{
public:
- SentenceAlignment(int sntIndex, int sourceSize, int targetSize);
- int m_sntIndex;
- std::vector<wordID_t>* trgSnt;
- std::vector<wordID_t>* srcSnt;
- std::vector<int> numberAligned;
- std::vector< std::vector<int> > alignedList;
- bool Extract(int maxPhraseLength, std::vector<PhrasePair*> &ret, int startSource, int endSource) const;
+ SentenceAlignment(int sntIndex, int sourceSize, int targetSize);
+ int m_sntIndex;
+ std::vector<wordID_t>* trgSnt;
+ std::vector<wordID_t>* srcSnt;
+ std::vector<int> numberAligned;
+ std::vector< std::vector<int> > alignedList;
+ bool Extract(int maxPhraseLength, std::vector<PhrasePair*> &ret, int startSource, int endSource) const;
};
-class ScoresComp {
-public:
+class ScoresComp
+{
+public:
ScoresComp(const std::vector<float>& weights): m_weights(weights) {}
- bool operator()(const Scores& s1, const Scores& s2) const {
+ bool operator()(const Scores& s1, const Scores& s2) const {
return s1[0] < s2[0]; // just p(e|f) as approximation
/*float score1(0), score2(0);
int idx1(0), idx2(0);
- for (Scores::const_iterator itr = s1.begin();
+ for (Scores::const_iterator itr = s1.begin();
itr != s1.end(); ++itr) {
- score1 += log(*itr * m_weights.at(idx1++));
+ score1 += log(*itr * m_weights.at(idx1++));
}
for (Scores::const_iterator itr = s2.begin();
itr != s2.end(); ++itr) {
@@ -79,78 +82,77 @@ public:
}
return score1 < score2;*/
}
-private:
+private:
const std::vector<float>& m_weights;
};
-
+
/** @todo ask Abbey Levenberg
*/
-class BilingualDynSuffixArray {
-public:
- BilingualDynSuffixArray();
- ~BilingualDynSuffixArray();
- bool Load( const std::vector<FactorType>& inputFactors,
- const std::vector<FactorType>& outputTactors,
- std::string source, std::string target, std::string alignments,
- const std::vector<float> &weight);
- bool LoadTM( const std::vector<FactorType>& inputFactors,
- const std::vector<FactorType>& outputTactors,
- std::string source, std::string target, std::string alignments,
- const std::vector<float> &weight);
- void GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> >& target) const;
+class BilingualDynSuffixArray
+{
+public:
+ BilingualDynSuffixArray();
+ ~BilingualDynSuffixArray();
+ bool Load( const std::vector<FactorType>& inputFactors,
+             const std::vector<FactorType>& outputFactors,
+ std::string source, std::string target, std::string alignments,
+ const std::vector<float> &weight);
+ bool LoadTM( const std::vector<FactorType>& inputFactors,
+               const std::vector<FactorType>& outputFactors,
+ std::string source, std::string target, std::string alignments,
+ const std::vector<float> &weight);
+ void GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> >& target) const;
void addSntPair(string& source, string& target, string& alignment);
private:
- DynSuffixArray* m_srcSA;
- DynSuffixArray* m_trgSA;
- std::vector<wordID_t>* m_srcCorpus;
- std::vector<wordID_t>* m_trgCorpus;
+ DynSuffixArray* m_srcSA;
+ DynSuffixArray* m_trgSA;
+ std::vector<wordID_t>* m_srcCorpus;
+ std::vector<wordID_t>* m_trgCorpus;
std::vector<FactorType> m_inputFactors;
std::vector<FactorType> m_outputFactors;
- std::vector<unsigned> m_srcSntBreaks, m_trgSntBreaks;
+ std::vector<unsigned> m_srcSntBreaks, m_trgSntBreaks;
- Vocab* m_srcVocab, *m_trgVocab;
- ScoresComp* m_scoreCmp;
+ Vocab* m_srcVocab, *m_trgVocab;
+ ScoresComp* m_scoreCmp;
- std::vector<SentenceAlignment> m_alignments;
- std::vector<std::vector<short> > m_rawAlignments;
+ std::vector<SentenceAlignment> m_alignments;
+ std::vector<std::vector<short> > m_rawAlignments;
- mutable std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> > m_wordPairCache;
+ mutable std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> > m_wordPairCache;
mutable std::set<wordID_t> m_freqWordsCached;
- const size_t m_maxPhraseLength, m_maxSampleSize;
-
- int LoadCorpus(FactorDirection direction, InputFileStream&, const std::vector<FactorType>& factors,
- std::vector<wordID_t>&, std::vector<wordID_t>&,
- Vocab*);
- int LoadAlignments(InputFileStream& aligs);
- int LoadRawAlignments(InputFileStream& aligs);
- int LoadRawAlignments(string& aligs);
-
- bool ExtractPhrases(const int&, const int&, const int&, std::vector<PhrasePair*>&, bool=false) const;
- SentenceAlignment GetSentenceAlignment(const int, bool=false) const;
- int SampleSelection(std::vector<unsigned>&, int = 300) const;
-
- std::vector<int> GetSntIndexes(std::vector<unsigned>&, int, const std::vector<unsigned>&) const;
- TargetPhrase* GetMosesFactorIDs(const SAPhrase&, const Phrase& sourcePhrase) const;
- SAPhrase TrgPhraseFromSntIdx(const PhrasePair&) const;
- bool GetLocalVocabIDs(const Phrase&, SAPhrase &) const;
- void CacheWordProbs(wordID_t) const;
+ const size_t m_maxPhraseLength, m_maxSampleSize;
+
+ int LoadCorpus(FactorDirection direction, InputFileStream&, const std::vector<FactorType>& factors,
+ std::vector<wordID_t>&, std::vector<wordID_t>&,
+ Vocab*);
+ int LoadAlignments(InputFileStream& aligs);
+ int LoadRawAlignments(InputFileStream& aligs);
+ int LoadRawAlignments(string& aligs);
+
+ bool ExtractPhrases(const int&, const int&, const int&, std::vector<PhrasePair*>&, bool=false) const;
+ SentenceAlignment GetSentenceAlignment(const int, bool=false) const;
+ int SampleSelection(std::vector<unsigned>&, int = 300) const;
+
+ std::vector<int> GetSntIndexes(std::vector<unsigned>&, int, const std::vector<unsigned>&) const;
+ TargetPhrase* GetMosesFactorIDs(const SAPhrase&, const Phrase& sourcePhrase) const;
+ SAPhrase TrgPhraseFromSntIdx(const PhrasePair&) const;
+ bool GetLocalVocabIDs(const Phrase&, SAPhrase &) const;
+ void CacheWordProbs(wordID_t) const;
void CacheFreqWords() const;
void ClearWordInCache(wordID_t);
- std::pair<float, float> GetLexicalWeight(const PhrasePair&) const;
-
- int GetSourceSentenceSize(size_t sentenceId) const
- {
- return (sentenceId==m_srcSntBreaks.size()-1) ?
- m_srcCorpus->size() - m_srcSntBreaks.at(sentenceId) :
- m_srcSntBreaks.at(sentenceId+1) - m_srcSntBreaks.at(sentenceId);
- }
- int GetTargetSentenceSize(size_t sentenceId) const
- {
- return (sentenceId==m_trgSntBreaks.size()-1) ?
- m_trgCorpus->size() - m_trgSntBreaks.at(sentenceId) :
- m_trgSntBreaks.at(sentenceId+1) - m_trgSntBreaks.at(sentenceId);
- }
+ std::pair<float, float> GetLexicalWeight(const PhrasePair&) const;
+
+ int GetSourceSentenceSize(size_t sentenceId) const {
+ return (sentenceId==m_srcSntBreaks.size()-1) ?
+ m_srcCorpus->size() - m_srcSntBreaks.at(sentenceId) :
+ m_srcSntBreaks.at(sentenceId+1) - m_srcSntBreaks.at(sentenceId);
+ }
+ int GetTargetSentenceSize(size_t sentenceId) const {
+ return (sentenceId==m_trgSntBreaks.size()-1) ?
+ m_trgCorpus->size() - m_trgSntBreaks.at(sentenceId) :
+ m_trgSntBreaks.at(sentenceId+1) - m_trgSntBreaks.at(sentenceId);
+ }
};
} // end namespace
#endif
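
The two inline size helpers at the bottom of the class show the usual trick of storing sentence lengths implicitly: only start offsets are kept, and a length is the difference between consecutive breaks, or the distance to the corpus end for the last sentence. In isolation:

#include <iostream>
#include <vector>

int SentenceSize(const std::vector<unsigned>& breaks, size_t corpusSize, size_t sentenceId)
{
  return (sentenceId == breaks.size() - 1)
         ? int(corpusSize - breaks[sentenceId])              // last sentence runs to corpus end
         : int(breaks[sentenceId + 1] - breaks[sentenceId]); // otherwise next break - this break
}

int main()
{
  std::vector<unsigned> breaks;
  breaks.push_back(0);
  breaks.push_back(12);
  breaks.push_back(20);
  std::cout << SentenceSize(breaks, 27, 1) << "\n"; // prints 8
  std::cout << SentenceSize(breaks, 27, 2) << "\n"; // prints 7
  return 0;
}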
diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.cpp b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.cpp
index 4f0f6c2cd..e8d2f734a 100644
--- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.cpp
+++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.h b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.h
index 61982299f..3c3f468c2 100644
--- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.h
+++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -35,12 +35,12 @@ class WordsRange;
*/
class ChartRuleLookupManagerCYKPlus : public ChartRuleLookupManager
{
- public:
+public:
ChartRuleLookupManagerCYKPlus(const InputType &sentence,
const ChartCellCollectionBase &cellColl)
: ChartRuleLookupManager(sentence, cellColl) {}
- protected:
+protected:
void AddCompletedRule(
const DottedRule &dottedRule,
const TargetPhraseCollection &tpc,
diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.cpp b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.cpp
index ce6a1d30d..c0c1986f4 100644
--- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.cpp
+++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.cpp
@@ -75,19 +75,19 @@ void ChartRuleLookupManagerMemory::GetChartRuleCollection(
// get list of all rules that apply to spans at same starting position
DottedRuleColl &dottedRuleCol = *m_dottedRuleColls[range.GetStartPos()];
const DottedRuleList &expandableDottedRuleList = dottedRuleCol.GetExpandableDottedRuleList();
-
+
const ChartCellLabel &sourceWordLabel = GetSourceAt(absEndPos);
// loop through the rules
- // (note that expandableDottedRuleList can be expanded as the loop runs
+ // (note that expandableDottedRuleList can be expanded as the loop runs
// through calls to ExtendPartialRuleApplication())
for (size_t ind = 0; ind < expandableDottedRuleList.size(); ++ind) {
// rule we are about to extend
const DottedRuleInMemory &prevDottedRule = *expandableDottedRuleList[ind];
// we will now try to extend it, starting after where it ended
size_t startPos = prevDottedRule.IsRoot()
- ? range.GetStartPos()
- : prevDottedRule.GetWordsRange().GetEndPos() + 1;
+ ? range.GetStartPos()
+ : prevDottedRule.GetWordsRange().GetEndPos() + 1;
// search for terminal symbol
// (if only one more word position needs to be covered)
@@ -100,15 +100,15 @@ void ChartRuleLookupManagerMemory::GetChartRuleCollection(
// if we found a new rule -> create it and add it to the list
if (node != NULL) {
- // create the rule
+ // create the rule
#ifdef USE_BOOST_POOL
DottedRuleInMemory *dottedRule = m_dottedRulePool.malloc();
new (dottedRule) DottedRuleInMemory(*node, sourceWordLabel,
prevDottedRule);
#else
DottedRuleInMemory *dottedRule = new DottedRuleInMemory(*node,
- sourceWordLabel,
- prevDottedRule);
+ sourceWordLabel,
+ prevDottedRule);
#endif
dottedRuleCol.Add(relEndPos+1, dottedRule);
}
@@ -134,9 +134,7 @@ void ChartRuleLookupManagerMemory::GetChartRuleCollection(
// word.
endPos = absEndPos - 1;
stackInd = relEndPos;
- }
- else
- {
+ } else {
endPos = absEndPos;
stackInd = relEndPos + 1;
}
@@ -208,7 +206,7 @@ void ChartRuleLookupManagerMemory::ExtendPartialRuleApplication(
// We'll do whichever minimises the number of lookups:
if (numCombinations <= numChildren*2) {
- // loop over possible source non-terminal labels (as found in input tree)
+ // loop over possible source non-terminal labels (as found in input tree)
NonTerminalSet::const_iterator p = sourceNonTerms.begin();
NonTerminalSet::const_iterator sEnd = sourceNonTerms.end();
for (; p != sEnd; ++p) {
@@ -235,14 +233,12 @@ void ChartRuleLookupManagerMemory::ExtendPartialRuleApplication(
new (rule) DottedRuleInMemory(*child, cellLabel, prevDottedRule);
#else
DottedRuleInMemory *rule = new DottedRuleInMemory(*child, cellLabel,
- prevDottedRule);
+ prevDottedRule);
#endif
dottedRuleColl.Add(stackInd, rule);
}
}
- }
- else
- {
+ } else {
// loop over possible expansions of the rule
PhraseDictionaryNodeMemory::NonTerminalMap::const_iterator p;
PhraseDictionaryNodeMemory::NonTerminalMap::const_iterator end =
@@ -267,7 +263,7 @@ void ChartRuleLookupManagerMemory::ExtendPartialRuleApplication(
new (rule) DottedRuleInMemory(child, *cellLabel, prevDottedRule);
#else
DottedRuleInMemory *rule = new DottedRuleInMemory(child, *cellLabel,
- prevDottedRule);
+ prevDottedRule);
#endif
dottedRuleColl.Add(stackInd, rule);
}
diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.h b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.h
index 784e1c70d..74bc7d253 100644
--- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.h
+++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.h
@@ -44,7 +44,7 @@ class WordsRange;
//! Implementation of ChartRuleLookupManager for in-memory rule tables.
class ChartRuleLookupManagerMemory : public ChartRuleLookupManagerCYKPlus
{
- public:
+public:
ChartRuleLookupManagerMemory(const InputType &sentence,
const ChartCellCollectionBase &cellColl,
const PhraseDictionaryMemory &ruleTable);
@@ -55,7 +55,7 @@ class ChartRuleLookupManagerMemory : public ChartRuleLookupManagerCYKPlus
const WordsRange &range,
ChartParserCallback &outColl);
- private:
+private:
void ExtendPartialRuleApplication(
const DottedRuleInMemory &prevDottedRule,
size_t startPos,
diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp
index 4ad60eb43..412840782 100644
--- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp
+++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp
@@ -75,17 +75,17 @@ void ChartRuleLookupManagerMemoryPerSentence::GetChartRuleCollection(
// get list of all rules that apply to spans at same starting position
DottedRuleColl &dottedRuleCol = *m_dottedRuleColls[range.GetStartPos()];
const DottedRuleList &expandableDottedRuleList = dottedRuleCol.GetExpandableDottedRuleList();
-
+
// loop through the rules
- // (note that expandableDottedRuleList can be expanded as the loop runs
+ // (note that expandableDottedRuleList can be expanded as the loop runs
// through calls to ExtendPartialRuleApplication())
for (size_t ind = 0; ind < expandableDottedRuleList.size(); ++ind) {
// rule we are about to extend
const DottedRuleInMemory &prevDottedRule = *expandableDottedRuleList[ind];
// we will now try to extend it, starting after where it ended
size_t startPos = prevDottedRule.IsRoot()
- ? range.GetStartPos()
- : prevDottedRule.GetWordsRange().GetEndPos() + 1;
+ ? range.GetStartPos()
+ : prevDottedRule.GetWordsRange().GetEndPos() + 1;
// search for terminal symbol
// (if only one more word position needs to be covered)
@@ -99,15 +99,15 @@ void ChartRuleLookupManagerMemoryPerSentence::GetChartRuleCollection(
// if we found a new rule -> create it and add it to the list
if (node != NULL) {
- // create the rule
+ // create the rule
#ifdef USE_BOOST_POOL
DottedRuleInMemory *dottedRule = m_dottedRulePool.malloc();
new (dottedRule) DottedRuleInMemory(*node, sourceWordLabel,
prevDottedRule);
#else
DottedRuleInMemory *dottedRule = new DottedRuleInMemory(*node,
- sourceWordLabel,
- prevDottedRule);
+ sourceWordLabel,
+ prevDottedRule);
#endif
dottedRuleCol.Add(relEndPos+1, dottedRule);
}
@@ -133,9 +133,7 @@ void ChartRuleLookupManagerMemoryPerSentence::GetChartRuleCollection(
// word.
endPos = absEndPos - 1;
stackInd = relEndPos;
- }
- else
- {
+ } else {
endPos = absEndPos;
stackInd = relEndPos + 1;
}
@@ -207,7 +205,7 @@ void ChartRuleLookupManagerMemoryPerSentence::ExtendPartialRuleApplication(
// We'll do whichever minimises the number of lookups:
if (numCombinations <= numChildren*2) {
- // loop over possible source non-terminal labels (as found in input tree)
+ // loop over possible source non-terminal labels (as found in input tree)
NonTerminalSet::const_iterator p = sourceNonTerms.begin();
NonTerminalSet::const_iterator sEnd = sourceNonTerms.end();
for (; p != sEnd; ++p) {
@@ -234,14 +232,12 @@ void ChartRuleLookupManagerMemoryPerSentence::ExtendPartialRuleApplication(
new (rule) DottedRuleInMemory(*child, cellLabel, prevDottedRule);
#else
DottedRuleInMemory *rule = new DottedRuleInMemory(*child, cellLabel,
- prevDottedRule);
+ prevDottedRule);
#endif
dottedRuleColl.Add(stackInd, rule);
}
}
- }
- else
- {
+ } else {
// loop over possible expansions of the rule
PhraseDictionaryNodeMemory::NonTerminalMap::const_iterator p;
PhraseDictionaryNodeMemory::NonTerminalMap::const_iterator end =
@@ -266,7 +262,7 @@ void ChartRuleLookupManagerMemoryPerSentence::ExtendPartialRuleApplication(
new (rule) DottedRuleInMemory(child, *cellLabel, prevDottedRule);
#else
DottedRuleInMemory *rule = new DottedRuleInMemory(child, *cellLabel,
- prevDottedRule);
+ prevDottedRule);
#endif
dottedRuleColl.Add(stackInd, rule);
}
diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h
index 6f2b209a7..ebb8cdd7c 100644
--- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h
+++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h
@@ -44,10 +44,10 @@ class WordsRange;
//! Implementation of ChartRuleLookupManager for in-memory rule tables.
class ChartRuleLookupManagerMemoryPerSentence : public ChartRuleLookupManagerCYKPlus
{
- public:
+public:
ChartRuleLookupManagerMemoryPerSentence(const InputType &sentence,
- const ChartCellCollectionBase &cellColl,
- const PhraseDictionaryFuzzyMatch &ruleTable);
+ const ChartCellCollectionBase &cellColl,
+ const PhraseDictionaryFuzzyMatch &ruleTable);
~ChartRuleLookupManagerMemoryPerSentence();
@@ -55,7 +55,7 @@ class ChartRuleLookupManagerMemoryPerSentence : public ChartRuleLookupManagerCYK
const WordsRange &range,
ChartParserCallback &outColl);
- private:
+private:
void ExtendPartialRuleApplication(
const DottedRuleInMemory &prevDottedRule,
size_t startPos,
diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.h b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.h
index 23f83623d..24d06270b 100644
--- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.h
+++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.h
@@ -35,7 +35,7 @@ namespace Moses
//! Implementation of ChartRuleLookupManager for on-disk rule tables.
class ChartRuleLookupManagerOnDisk : public ChartRuleLookupManagerCYKPlus
{
- public:
+public:
ChartRuleLookupManagerOnDisk(const InputType &sentence,
const ChartCellCollectionBase &cellColl,
const PhraseDictionaryOnDisk &dictionary,
@@ -49,7 +49,7 @@ class ChartRuleLookupManagerOnDisk : public ChartRuleLookupManagerCYKPlus
virtual void GetChartRuleCollection(const WordsRange &range,
ChartParserCallback &outColl);
- private:
+private:
const PhraseDictionaryOnDisk &m_dictionary;
OnDiskPt::OnDiskWrapper &m_dbWrapper;
const std::vector<FactorType> &m_inputFactorsVec;
diff --git a/moses/TranslationModel/CYKPlusParser/DotChart.h b/moses/TranslationModel/CYKPlusParser/DotChart.h
index 9dd34593f..946f36ff2 100644
--- a/moses/TranslationModel/CYKPlusParser/DotChart.h
+++ b/moses/TranslationModel/CYKPlusParser/DotChart.h
@@ -28,26 +28,38 @@ namespace Moses
*/
class DottedRule
{
- public:
+public:
// used only to init dot stack.
DottedRule()
- : m_cellLabel(NULL)
- , m_prev(NULL) {}
+ : m_cellLabel(NULL)
+ , m_prev(NULL) {}
DottedRule(const ChartCellLabel &ccl, const DottedRule &prev)
- : m_cellLabel(&ccl)
- , m_prev(&prev) {}
+ : m_cellLabel(&ccl)
+ , m_prev(&prev) {}
- const WordsRange &GetWordsRange() const { return m_cellLabel->GetCoverage(); }
- const Word &GetSourceWord() const { return m_cellLabel->GetLabel(); }
- bool IsNonTerminal() const { return m_cellLabel->GetLabel().IsNonTerminal(); }
- const DottedRule *GetPrev() const { return m_prev; }
- bool IsRoot() const { return m_prev == NULL; }
- const ChartCellLabel &GetChartCellLabel() const { return *m_cellLabel; }
+ const WordsRange &GetWordsRange() const {
+ return m_cellLabel->GetCoverage();
+ }
+ const Word &GetSourceWord() const {
+ return m_cellLabel->GetLabel();
+ }
+ bool IsNonTerminal() const {
+ return m_cellLabel->GetLabel().IsNonTerminal();
+ }
+ const DottedRule *GetPrev() const {
+ return m_prev;
+ }
+ bool IsRoot() const {
+ return m_prev == NULL;
+ }
+ const ChartCellLabel &GetChartCellLabel() const {
+ return *m_cellLabel;
+ }
- private:
+private:
const ChartCellLabel *m_cellLabel; // usually contains something, unless
- // it's the init processed rule
+ // it's the init processed rule
const DottedRule *m_prev;
};
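
A DottedRule is in effect a node in a backward-linked chain: each extension keeps a pointer to the rule state it extended (m_prev), so walking the chain recovers the covered symbols right to left until IsRoot(). A minimal stand-alone analogue of that structure; the class here is illustrative, not the Moses type:

#include <iostream>
#include <string>

struct Dotted {             // stand-in for DottedRule: a label plus the previous state
  std::string label;
  const Dotted* prev;
  Dotted(const std::string& l, const Dotted* p) : label(l), prev(p) {}
  bool IsRoot() const { return prev == 0; }
};

int main()
{
  Dotted root("", 0);       // init entry, like the default DottedRule()
  Dotted a("the", &root);   // covered a terminal
  Dotted b("NP", &a);       // then a nonterminal
  for (const Dotted* d = &b; !d->IsRoot(); d = d->prev)
    std::cout << d->label << " "; // prints the coverage right to left: NP the
  std::cout << "\n";
  return 0;
}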
diff --git a/moses/TranslationModel/CYKPlusParser/DotChartInMemory.cpp b/moses/TranslationModel/CYKPlusParser/DotChartInMemory.cpp
index a28387027..616a2907c 100644
--- a/moses/TranslationModel/CYKPlusParser/DotChartInMemory.cpp
+++ b/moses/TranslationModel/CYKPlusParser/DotChartInMemory.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
diff --git a/moses/TranslationModel/CYKPlusParser/DotChartInMemory.h b/moses/TranslationModel/CYKPlusParser/DotChartInMemory.h
index f0753a8f1..cfd986d7a 100644
--- a/moses/TranslationModel/CYKPlusParser/DotChartInMemory.h
+++ b/moses/TranslationModel/CYKPlusParser/DotChartInMemory.h
@@ -32,21 +32,23 @@ namespace Moses
*/
class DottedRuleInMemory : public DottedRule
{
- public:
+public:
// used only to init dot stack.
explicit DottedRuleInMemory(const PhraseDictionaryNodeMemory &node)
- : DottedRule()
- , m_node(node) {}
+ : DottedRule()
+ , m_node(node) {}
DottedRuleInMemory(const PhraseDictionaryNodeMemory &node,
const ChartCellLabel &cellLabel,
const DottedRuleInMemory &prev)
- : DottedRule(cellLabel, prev)
- , m_node(node) {}
-
- const PhraseDictionaryNodeMemory &GetLastNode() const { return m_node; }
+ : DottedRule(cellLabel, prev)
+ , m_node(node) {}
+
+ const PhraseDictionaryNodeMemory &GetLastNode() const {
+ return m_node;
+ }
- private:
+private:
const PhraseDictionaryNodeMemory &m_node;
};
diff --git a/moses/TranslationModel/CYKPlusParser/DotChartOnDisk.h b/moses/TranslationModel/CYKPlusParser/DotChartOnDisk.h
index 5b756ba8d..edd9f3a62 100644
--- a/moses/TranslationModel/CYKPlusParser/DotChartOnDisk.h
+++ b/moses/TranslationModel/CYKPlusParser/DotChartOnDisk.h
@@ -36,26 +36,32 @@ namespace Moses
*/
class DottedRuleOnDisk : public DottedRule
{
- public:
+public:
// used only to init dot stack.
explicit DottedRuleOnDisk(const OnDiskPt::PhraseNode &lastNode)
- : DottedRule()
- , m_lastNode(lastNode)
- , m_done(false) {}
+ : DottedRule()
+ , m_lastNode(lastNode)
+ , m_done(false) {}
DottedRuleOnDisk(const OnDiskPt::PhraseNode &lastNode,
const ChartCellLabel &cellLabel,
const DottedRuleOnDisk &prev)
- : DottedRule(cellLabel, prev)
- , m_lastNode(lastNode)
- , m_done(false) {}
+ : DottedRule(cellLabel, prev)
+ , m_lastNode(lastNode)
+ , m_done(false) {}
- const OnDiskPt::PhraseNode &GetLastNode() const { return m_lastNode; }
+ const OnDiskPt::PhraseNode &GetLastNode() const {
+ return m_lastNode;
+ }
- bool Done() const { return m_done; }
- void Done(bool value) const { m_done = value; }
+ bool Done() const {
+ return m_done;
+ }
+ void Done(bool value) const {
+ m_done = value;
+ }
- private:
+private:
const OnDiskPt::PhraseNode &m_lastNode;
mutable bool m_done;
};
diff --git a/moses/TranslationModel/CompactPT/BlockHashIndex.cpp b/moses/TranslationModel/CompactPT/BlockHashIndex.cpp
index 705493ab7..9afe474f7 100644
--- a/moses/TranslationModel/CompactPT/BlockHashIndex.cpp
+++ b/moses/TranslationModel/CompactPT/BlockHashIndex.cpp
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#include "ThrowingFwrite.h"
#include "BlockHashIndex.h"
@@ -32,25 +32,27 @@ namespace Moses
#ifdef WITH_THREADS
BlockHashIndex::BlockHashIndex(size_t orderBits, size_t fingerPrintBits,
size_t threadsNum)
-: m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits),
- m_fileHandle(0), m_fileHandleStart(0), m_size(0),
- m_lastSaved(-1), m_lastDropped(-1), m_numLoadedRanges(0),
- m_threadPool(threadsNum) {
+ : m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits),
+ m_fileHandle(0), m_fileHandleStart(0), m_size(0),
+ m_lastSaved(-1), m_lastDropped(-1), m_numLoadedRanges(0),
+ m_threadPool(threadsNum)
+{
#ifndef HAVE_CMPH
- std::cerr << "minphr: CMPH support not compiled in." << std::endl;
- exit(1);
-#endif
- }
+ std::cerr << "minphr: CMPH support not compiled in." << std::endl;
+ exit(1);
+#endif
+}
#else
BlockHashIndex::BlockHashIndex(size_t orderBits, size_t fingerPrintBits)
-: m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits),
- m_fileHandle(0), m_fileHandleStart(0), m_size(0),
- m_lastSaved(-1), m_lastDropped(-1), m_numLoadedRanges(0) {
+ : m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits),
+ m_fileHandle(0), m_fileHandleStart(0), m_size(0),
+ m_lastSaved(-1), m_lastDropped(-1), m_numLoadedRanges(0)
+{
#ifndef HAVE_CMPH
- std::cerr << "minphr: CMPH support not compiled in." << std::endl;
- exit(1);
-#endif
- }
+ std::cerr << "minphr: CMPH support not compiled in." << std::endl;
+ exit(1);
+#endif
+}
#endif
BlockHashIndex::~BlockHashIndex()
@@ -60,7 +62,7 @@ BlockHashIndex::~BlockHashIndex()
it != m_hashes.end(); it++)
if(*it != 0)
cmph_destroy((cmph_t*)*it);
-
+
for(std::vector<PairedPackedArray<>*>::iterator it = m_arrays.begin();
it != m_arrays.end(); it++)
if(*it != 0)
@@ -72,15 +74,15 @@ size_t BlockHashIndex::GetHash(const char* key)
{
std::string keyStr(key);
size_t i = std::distance(m_landmarks.begin(),
- std::upper_bound(m_landmarks.begin(),
- m_landmarks.end(), keyStr)) - 1;
-
+ std::upper_bound(m_landmarks.begin(),
+ m_landmarks.end(), keyStr)) - 1;
+
if(i == 0ul-1)
return GetSize();
-
+
size_t pos = GetHash(i, key);
if(pos != GetSize())
- return (1ul << m_orderBits) * i + pos;
+ return (1ul << m_orderBits) * i + pos;
else
return GetSize();
}
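
GetHash above resolves a key in two steps: a binary search over the landmark keys (the first key of each block) picks the block, and the block number is then combined with the within-block position as block * 2^orderBits + pos. The index arithmetic in isolation, with toy landmarks and an assumed in-block position:

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

int main()
{
  // first key of each block, sorted (toy data)
  std::vector<std::string> landmarks;
  landmarks.push_back("aardvark");
  landmarks.push_back("house");
  landmarks.push_back("zebra");

  std::string key = "mouse";
  size_t i = std::distance(landmarks.begin(),
                           std::upper_bound(landmarks.begin(), landmarks.end(), key)) - 1;

  size_t orderBits = 10; // each block holds up to 2^orderBits entries
  size_t pos = 17;       // would come from the block's perfect hash plus fingerprint check
  std::cout << ((1ul << orderBits) * i + pos) << "\n"; // block 1 -> prints 1041
  return 0;
}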
@@ -100,7 +102,7 @@ size_t BlockHashIndex::GetHash(size_t i, const char* key)
#endif
if(m_hashes[i] == 0)
LoadRange(i);
-#ifdef HAVE_CMPH
+#ifdef HAVE_CMPH
size_t idx = cmph_search((cmph_t*)m_hashes[i], key, (cmph_uint32) strlen(key));
#else
assert(0);
@@ -109,11 +111,11 @@ size_t BlockHashIndex::GetHash(size_t i, const char* key)
std::pair<size_t, size_t> orderPrint = m_arrays[i]->Get(idx, m_orderBits, m_fingerPrintBits);
m_clocks[i] = clock();
-
+
if(GetFprint(key) == orderPrint.second)
- return orderPrint.first;
+ return orderPrint.first;
else
- return GetSize();
+ return GetSize();
}
size_t BlockHashIndex::GetHash(std::string key)
@@ -144,11 +146,11 @@ void BlockHashIndex::BeginSave(std::FILE * mphf)
m_fileHandle = mphf;
ThrowingFwrite(&m_orderBits, sizeof(size_t), 1, m_fileHandle);
ThrowingFwrite(&m_fingerPrintBits, sizeof(size_t), 1, m_fileHandle);
-
+
m_fileHandleStart = std::ftell(m_fileHandle);
-
+
size_t relIndexPos = 0;
- ThrowingFwrite(&relIndexPos, sizeof(size_t), 1, m_fileHandle);
+ ThrowingFwrite(&relIndexPos, sizeof(size_t), 1, m_fileHandle);
}
void BlockHashIndex::SaveRange(size_t i)
@@ -168,25 +170,22 @@ void BlockHashIndex::SaveLastRange()
boost::mutex::scoped_lock lock(m_mutex);
#endif
- while(!m_queue.empty() && m_lastSaved + 1 == -m_queue.top())
- {
+ while(!m_queue.empty() && m_lastSaved + 1 == -m_queue.top()) {
size_t current = -m_queue.top();
m_queue.pop();
SaveRange(current);
m_lastSaved = current;
- }
+ }
}
void BlockHashIndex::DropRange(size_t i)
{
#ifdef HAVE_CMPH
- if(m_hashes[i] != 0)
- {
+ if(m_hashes[i] != 0) {
cmph_destroy((cmph_t*)m_hashes[i]);
m_hashes[i] = 0;
}
- if(m_arrays[i] != 0)
- {
+ if(m_arrays[i] != 0) {
delete m_arrays[i];
m_arrays[i] = 0;
m_clocks[i] = 0;
@@ -201,7 +200,7 @@ void BlockHashIndex::DropLastRange()
boost::mutex::scoped_lock lock(m_mutex);
#endif
- while(m_lastDropped != m_lastSaved)
+ while(m_lastDropped != m_lastSaved)
DropRange(++m_lastDropped);
}
@@ -219,24 +218,24 @@ size_t BlockHashIndex::FinalizeSave()
#endif
SaveLastRange();
-
+
size_t relIndexPos = std::ftell(m_fileHandle) - m_fileHandleStart;
-
+
std::fseek(m_fileHandle, m_fileHandleStart, SEEK_SET);
ThrowingFwrite(&relIndexPos, sizeof(size_t), 1, m_fileHandle);
-
+
std::fseek(m_fileHandle, m_fileHandleStart + relIndexPos, SEEK_SET);
m_landmarks.save(m_fileHandle);
-
+
size_t seekIndexSize = m_seekIndex.size();
ThrowingFwrite(&seekIndexSize, sizeof(size_t), 1, m_fileHandle);
ThrowingFwrite(&m_seekIndex[0], sizeof(size_t), seekIndexSize, m_fileHandle);
-
+
ThrowingFwrite(&m_size, sizeof(size_t), 1, m_fileHandle);
-
+
size_t fileHandleStop = std::ftell(m_fileHandle);
return fileHandleStop - m_fileHandleStart + sizeof(m_orderBits)
- + sizeof(m_fingerPrintBits);
+ + sizeof(m_fingerPrintBits);
}
size_t BlockHashIndex::Save(std::FILE * mphf)
@@ -251,14 +250,14 @@ size_t BlockHashIndex::Save(std::FILE * mphf)
size_t BlockHashIndex::LoadIndex(std::FILE* mphf)
{
m_fileHandle = mphf;
-
+
size_t beginning = std::ftell(mphf);
size_t read = 0;
read += std::fread(&m_orderBits, sizeof(size_t), 1, mphf);
read += std::fread(&m_fingerPrintBits, sizeof(size_t), 1, mphf);
m_fileHandleStart = std::ftell(m_fileHandle);
-
+
size_t relIndexPos;
read += std::fread(&relIndexPos, sizeof(size_t), 1, mphf);
std::fseek(m_fileHandle, m_fileHandleStart + relIndexPos, SEEK_SET);
@@ -272,12 +271,12 @@ size_t BlockHashIndex::LoadIndex(std::FILE* mphf)
m_hashes.resize(seekIndexSize, 0);
m_clocks.resize(seekIndexSize, 0);
m_arrays.resize(seekIndexSize, 0);
-
+
read += std::fread(&m_size, sizeof(size_t), 1, m_fileHandle);
size_t end = std::ftell(mphf);
- return end - beginning;
+ return end - beginning;
}
void BlockHashIndex::LoadRange(size_t i)
@@ -288,10 +287,10 @@ void BlockHashIndex::LoadRange(size_t i)
m_arrays[i] = new PairedPackedArray<>(0, m_orderBits,
m_fingerPrintBits);
m_arrays[i]->Load(m_fileHandle);
-
+
m_hashes[i] = (void*)hash;
m_clocks[i] = clock();
-
+
m_numLoadedRanges++;
#endif
}
@@ -308,9 +307,9 @@ size_t BlockHashIndex::Load(std::FILE * mphf)
{
size_t byteSize = LoadIndex(mphf);
size_t end = std::ftell(mphf);
-
+
for(size_t i = 0; i < m_seekIndex.size(); i++)
- LoadRange(i);
+ LoadRange(i);
std::fseek(m_fileHandle, end, SEEK_SET);
return byteSize;
}
@@ -327,14 +326,13 @@ void BlockHashIndex::KeepNLastRanges(float ratio, float tolerance)
#endif
size_t n = m_hashes.size() * ratio;
size_t max = n * (1 + tolerance);
- if(m_numLoadedRanges > max)
- {
+ if(m_numLoadedRanges > max) {
typedef std::vector<std::pair<clock_t, size_t> > LastLoaded;
LastLoaded lastLoaded;
for(size_t i = 0; i < m_hashes.size(); i++)
if(m_hashes[i] != 0)
lastLoaded.push_back(std::make_pair(m_clocks[i], i));
-
+
std::sort(lastLoaded.begin(), lastLoaded.end());
for(LastLoaded::reverse_iterator it = lastLoaded.rbegin() + size_t(n * (1 - tolerance));
it != lastLoaded.rend(); it++)
@@ -348,24 +346,23 @@ void BlockHashIndex::CalcHash(size_t current, void* source_void)
cmph_io_adapter_t* source = (cmph_io_adapter_t*) source_void;
cmph_config_t *config = cmph_config_new(source);
cmph_config_set_algo(config, CMPH_CHD);
-
+
cmph_t* hash = cmph_new(config);
PairedPackedArray<> *pv =
new PairedPackedArray<>(source->nkeys, m_orderBits, m_fingerPrintBits);
size_t i = 0;
-
+
source->rewind(source->data);
-
+
std::string lastKey = "";
- while(i < source->nkeys)
- {
+ while(i < source->nkeys) {
unsigned keylen;
char* key;
source->read(source->data, &key, &keylen);
std::string temp(key, keylen);
source->dispose(source->data, key, keylen);
-
+
if(lastKey > temp) {
if(source->nkeys != 2 || temp != "###DUMMY_KEY###") {
std::cerr << "ERROR: Input file does not appear to be sorted with LC_ALL=C sort" << std::endl;
@@ -375,41 +372,40 @@ void BlockHashIndex::CalcHash(size_t current, void* source_void)
}
}
lastKey = temp;
-
+
size_t fprint = GetFprint(temp.c_str());
size_t idx = cmph_search(hash, temp.c_str(),
(cmph_uint32) temp.size());
-
+
pv->Set(idx, i, fprint, m_orderBits, m_fingerPrintBits);
i++;
}
-
+
cmph_config_destroy(config);
-
+
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_mutex);
#endif
- if(m_hashes.size() <= current)
- {
- m_hashes.resize(current + 1, 0);
+ if(m_hashes.size() <= current) {
+ m_hashes.resize(current + 1, 0);
m_arrays.resize(current + 1, 0);
m_clocks.resize(current + 1, 0);
}
-
+
m_hashes[current] = (void*)hash;
m_arrays[current] = pv;
m_clocks[current] = clock();
- m_queue.push(-current);
+ m_queue.push(-current);
#endif
}
-#ifdef HAVE_CMPH
+#ifdef HAVE_CMPH
void* BlockHashIndex::vectorAdapter(std::vector<std::string>& v)
{
return (void*)CmphVectorAdapter(v);
}
-
+
void* BlockHashIndex::vectorAdapter(StringVector<unsigned, size_t, std::allocator>& sv)
{
return (void*)CmphStringVectorAdapter(sv);
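
Context for the BlockHashIndex.cpp changes above: the class builds one CMPH minimal perfect hash (CHD) per block of sorted keys and stores a small fingerprint per slot in a PairedPackedArray. A minimal perfect hash maps any query, including keys that were never inserted, to some slot in [0, n), so membership can only be decided by re-hashing the query and comparing fingerprints; a mismatch is reported as "not found". A self-contained toy illustrating just that check (FNV-1a stands in for GetFprint, and a plain modulus stands in for cmph_search; none of these names are the Moses API):

    #include <cstdint>
    #include <iostream>
    #include <string>
    #include <vector>

    // FNV-1a, standing in for BlockHashIndex::GetFprint().
    std::uint64_t Fprint(const std::string& s, unsigned bits) {
      std::uint64_t h = 14695981039346656037ull;
      for (std::size_t i = 0; i < s.size(); ++i) {
        h ^= static_cast<unsigned char>(s[i]);
        h *= 1099511628211ull;
      }
      return h & ((1ull << bits) - 1);   // keep only the fingerprint bits
    }

    int main() {
      const unsigned fprintBits = 16;    // cf. m_fingerPrintBits
      std::vector<std::string> keys;     // the block's stored keys
      keys.push_back("ein"); keys.push_back("haus"); keys.push_back("zu");

      // Pretend slot i came out of cmph_search() for keys[i].
      std::vector<std::uint64_t> slotFprint(keys.size());
      for (std::size_t i = 0; i < keys.size(); ++i)
        slotFprint[i] = Fprint(keys[i], fprintBits);

      // An unseen key still lands in *some* slot; only the fingerprint
      // comparison rejects it (false-positive rate about 2^-16 here).
      std::string probe = "katze";
      std::size_t slot = Fprint(probe, 63) % keys.size();  // fake MPH lookup
      bool accepted = (slotFprint[slot] == Fprint(probe, fprintBits));
      std::cout << probe << (accepted ? " accepted" : " rejected") << std::endl;
      return 0;
    }
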
diff --git a/moses/TranslationModel/CompactPT/BlockHashIndex.h b/moses/TranslationModel/CompactPT/BlockHashIndex.h
index 8541a2a19..c245d2d66 100644
--- a/moses/TranslationModel/CompactPT/BlockHashIndex.h
+++ b/moses/TranslationModel/CompactPT/BlockHashIndex.h
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#ifndef moses_BlockHashIndex_h
#define moses_BlockHashIndex_h
@@ -25,7 +25,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <iostream>
#include <string>
#include <vector>
-#include <queue>
+#include <queue>
#include <cstring>
#include <cstdio>
@@ -42,144 +42,139 @@ namespace Moses
class BlockHashIndex
{
- private:
- std::priority_queue<int> m_queue;
-
- size_t m_orderBits;
- size_t m_fingerPrintBits;
-
- std::FILE* m_fileHandle;
- size_t m_fileHandleStart;
-
- StringVector<unsigned char, unsigned long> m_landmarks;
-
- std::vector<void*> m_hashes;
- std::vector<clock_t> m_clocks;
- std::vector<PairedPackedArray<>*> m_arrays;
-
- std::vector<size_t> m_seekIndex;
-
- size_t m_size;
- int m_lastSaved;
- int m_lastDropped;
- size_t m_numLoadedRanges;
-
+private:
+ std::priority_queue<int> m_queue;
+
+ size_t m_orderBits;
+ size_t m_fingerPrintBits;
+
+ std::FILE* m_fileHandle;
+ size_t m_fileHandleStart;
+
+ StringVector<unsigned char, unsigned long> m_landmarks;
+
+ std::vector<void*> m_hashes;
+ std::vector<clock_t> m_clocks;
+ std::vector<PairedPackedArray<>*> m_arrays;
+
+ std::vector<size_t> m_seekIndex;
+
+ size_t m_size;
+ int m_lastSaved;
+ int m_lastDropped;
+ size_t m_numLoadedRanges;
+
#ifdef WITH_THREADS
- ThreadPool m_threadPool;
- boost::mutex m_mutex;
-
- template <typename Keys>
- class HashTask : public Task
- {
- public:
- HashTask(int id, BlockHashIndex& hash, Keys& keys)
- : m_id(id), m_hash(hash), m_keys(new Keys(keys)) {}
-
- virtual void Run()
- {
- m_hash.CalcHash(m_id, *m_keys);
- }
-
- virtual ~HashTask()
- {
- delete m_keys;
- }
-
- private:
- int m_id;
- BlockHashIndex& m_hash;
- Keys* m_keys;
- };
-#endif
-
- size_t GetFprint(const char* key) const;
- size_t GetHash(size_t i, const char* key);
-
+ ThreadPool m_threadPool;
+ boost::mutex m_mutex;
+
+ template <typename Keys>
+ class HashTask : public Task
+ {
public:
+ HashTask(int id, BlockHashIndex& hash, Keys& keys)
+ : m_id(id), m_hash(hash), m_keys(new Keys(keys)) {}
+
+ virtual void Run() {
+ m_hash.CalcHash(m_id, *m_keys);
+ }
+
+ virtual ~HashTask() {
+ delete m_keys;
+ }
+
+ private:
+ int m_id;
+ BlockHashIndex& m_hash;
+ Keys* m_keys;
+ };
+#endif
+
+ size_t GetFprint(const char* key) const;
+ size_t GetHash(size_t i, const char* key);
+
+public:
#ifdef WITH_THREADS
- BlockHashIndex(size_t orderBits, size_t fingerPrintBits,
- size_t threadsNum = 2);
+ BlockHashIndex(size_t orderBits, size_t fingerPrintBits,
+ size_t threadsNum = 2);
#else
- BlockHashIndex(size_t orderBits, size_t fingerPrintBits);
+ BlockHashIndex(size_t orderBits, size_t fingerPrintBits);
#endif
- ~BlockHashIndex();
-
- size_t GetHash(const char* key);
- size_t GetHash(std::string key);
-
- size_t operator[](std::string key);
- size_t operator[](char* key);
-
- void BeginSave(std::FILE* mphf);
- void SaveRange(size_t i);
- void SaveLastRange();
- size_t FinalizeSave();
+ ~BlockHashIndex();
+
+ size_t GetHash(const char* key);
+ size_t GetHash(std::string key);
+
+ size_t operator[](std::string key);
+ size_t operator[](char* key);
+
+ void BeginSave(std::FILE* mphf);
+ void SaveRange(size_t i);
+ void SaveLastRange();
+ size_t FinalizeSave();
#ifdef WITH_THREADS
- void WaitAll();
+ void WaitAll();
#endif
-
- void DropRange(size_t i);
- void DropLastRange();
-
- size_t LoadIndex(std::FILE* mphf);
- void LoadRange(size_t i);
-
- size_t Save(std::string filename);
- size_t Save(std::FILE * mphf);
-
- size_t Load(std::string filename);
- size_t Load(std::FILE * mphf);
-
- size_t GetSize() const;
-
- void KeepNLastRanges(float ratio = 0.1, float tolerance = 0.1);
-
- template <typename Keys>
- void AddRange(Keys &keys)
- {
- size_t current = m_landmarks.size();
-
- if(m_landmarks.size() && m_landmarks.back().str() >= keys[0])
- {
- std::cerr << "ERROR: Input file does not appear to be sorted with LC_ALL=C sort" << std::endl;
- std::cerr << "1: " << m_landmarks.back().str() << std::endl;
- std::cerr << "2: " << keys[0] << std::endl;
- abort();
- }
-
- m_landmarks.push_back(keys[0]);
- m_size += keys.size();
-
- if(keys.size() == 1) {
- // add dummy key to avoid null hash
- keys.push_back("###DUMMY_KEY###");
- }
-
+
+ void DropRange(size_t i);
+ void DropLastRange();
+
+ size_t LoadIndex(std::FILE* mphf);
+ void LoadRange(size_t i);
+
+ size_t Save(std::string filename);
+ size_t Save(std::FILE * mphf);
+
+ size_t Load(std::string filename);
+ size_t Load(std::FILE * mphf);
+
+ size_t GetSize() const;
+
+ void KeepNLastRanges(float ratio = 0.1, float tolerance = 0.1);
+
+ template <typename Keys>
+ void AddRange(Keys &keys) {
+ size_t current = m_landmarks.size();
+
+ if(m_landmarks.size() && m_landmarks.back().str() >= keys[0]) {
+ std::cerr << "ERROR: Input file does not appear to be sorted with LC_ALL=C sort" << std::endl;
+ std::cerr << "1: " << m_landmarks.back().str() << std::endl;
+ std::cerr << "2: " << keys[0] << std::endl;
+ abort();
+ }
+
+ m_landmarks.push_back(keys[0]);
+ m_size += keys.size();
+
+ if(keys.size() == 1) {
+ // add dummy key to avoid null hash
+ keys.push_back("###DUMMY_KEY###");
+ }
+
#ifdef WITH_THREADS
- HashTask<Keys>* ht = new HashTask<Keys>(current, *this, keys);
- m_threadPool.Submit(ht);
+ HashTask<Keys>* ht = new HashTask<Keys>(current, *this, keys);
+ m_threadPool.Submit(ht);
#else
- CalcHash(current, keys);
+ CalcHash(current, keys);
#endif
- }
-
- template <typename Keys>
- void CalcHash(size_t current, Keys &keys)
- {
-#ifdef HAVE_CMPH
- void* source = vectorAdapter(keys);
- CalcHash(current, source);
+ }
+
+ template <typename Keys>
+ void CalcHash(size_t current, Keys &keys) {
+#ifdef HAVE_CMPH
+ void* source = vectorAdapter(keys);
+ CalcHash(current, source);
#endif
-  }
-  void CalcHash(size_t current, void* source);
-
-#ifdef HAVE_CMPH
-  void* vectorAdapter(std::vector<std::string>& v);
-  void* vectorAdapter(StringVector<unsigned, size_t, std::allocator>& sv);
-  void* vectorAdapter(StringVector<unsigned, size_t, MmapAllocator>& sv);
+  }
+
+  void CalcHash(size_t current, void* source);
+#ifdef HAVE_CMPH
+  void* vectorAdapter(std::vector<std::string>& v);
+  void* vectorAdapter(StringVector<unsigned, size_t, std::allocator>& sv);
+  void* vectorAdapter(StringVector<unsigned, size_t, MmapAllocator>& sv);
#endif
};
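
Read together with the BlockHashIndex.cpp hunks earlier, the intended build flow is: BeginSave, a series of AddRange calls over pre-sorted key blocks (each block is hashed, possibly on the thread pool, queued, and flushed in order by SaveLastRange), then FinalizeSave, which seeks back and patches the index position into the header. A hedged usage sketch; the constructor arguments, block size, and helper name are illustrative, not prescribed by Moses:

    #include <algorithm>
    #include <cstdio>
    #include <string>
    #include <vector>
    // #include "moses/TranslationModel/CompactPT/BlockHashIndex.h"

    // Builds an on-disk index over keys that are already LC_ALL=C sorted;
    // AddRange() aborts otherwise, as the header above shows.
    void BuildIndex(const std::vector<std::string>& sortedKeys, std::FILE* out) {
      Moses::BlockHashIndex index(10, 16);       // orderBits, fingerPrintBits
      index.BeginSave(out);

      const std::size_t blockSize = 4096;        // illustrative choice
      for (std::size_t i = 0; i < sortedKeys.size(); i += blockSize) {
        std::size_t end = std::min(i + blockSize, sortedKeys.size());
        std::vector<std::string> block(sortedKeys.begin() + i,
                                       sortedKeys.begin() + end);
        index.AddRange(block);                   // may run CalcHash on a thread
      }
    #ifdef WITH_THREADS
      index.WaitAll();                           // drain the hash thread pool
    #endif
      index.FinalizeSave();                      // writes landmarks + seek index
    }
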
diff --git a/moses/TranslationModel/CompactPT/CanonicalHuffman.h b/moses/TranslationModel/CompactPT/CanonicalHuffman.h
index faf7ce411..8d6e1cbb1 100644
--- a/moses/TranslationModel/CompactPT/CanonicalHuffman.h
+++ b/moses/TranslationModel/CompactPT/CanonicalHuffman.h
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#ifndef moses_CanonicalHuffman_h
#define moses_CanonicalHuffman_h
@@ -29,320 +29,293 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "ThrowingFwrite.h"
-namespace Moses {
+namespace Moses
+{
template <typename Data>
class CanonicalHuffman
{
- private:
- std::vector<Data> m_symbols;
- std::vector<size_t> m_firstCodes;
- std::vector<size_t> m_lengthIndex;
-
- typedef boost::unordered_map<Data, boost::dynamic_bitset<> > EncodeMap;
- EncodeMap m_encodeMap;
-
- struct MinHeapSorter {
- std::vector<size_t>& m_vec;
-
- MinHeapSorter(std::vector<size_t>& vec) : m_vec(vec) { }
-
- bool operator()(size_t a, size_t b)
- {
- return m_vec[a] > m_vec[b];
- }
- };
-
- template <class Iterator>
- void CalcLengths(Iterator begin, Iterator end, std::vector<size_t>& lengths)
- {
- size_t n = std::distance(begin, end);
- std::vector<size_t> A(2 * n, 0);
-
- m_symbols.resize(n);
- size_t i = 0;
- for(Iterator it = begin; it != end; it++)
- {
- m_symbols[i] = it->first;
-
- A[i] = n + i;
- A[n + i] = it->second;
- i++;
- }
-
- if(n == 1)
- {
- lengths.push_back(1);
- return;
- }
-
- MinHeapSorter hs(A);
- std::make_heap(A.begin(), A.begin() + n, hs);
-
- size_t h = n;
- size_t m1, m2;
- while(h > 1)
- {
- m1 = A[0];
- std::pop_heap(A.begin(), A.begin() + h, hs);
-
- h--;
-
- m2 = A[0];
- std::pop_heap(A.begin(), A.begin() + h, hs);
-
- A[h] = A[m1] + A[m2];
- A[h-1] = h;
- A[m1] = A[m2] = h;
-
- std::push_heap(A.begin(), A.begin() + h, hs);
- }
-
- A[1] = 0;
- for(size_t i = 2; i < 2*n; i++)
- A[i] = A[A[i]] + 1;
-
- lengths.resize(n);
- for(size_t i = 0; i < n; i++)
- lengths[i] = A[i + n];
- }
+private:
+ std::vector<Data> m_symbols;
+ std::vector<size_t> m_firstCodes;
+ std::vector<size_t> m_lengthIndex;
- void CalcCodes(std::vector<size_t>& lengths)
- {
- std::vector<size_t> numLength;
- for(std::vector<size_t>::iterator it = lengths.begin();
- it != lengths.end(); it++) {
- size_t length = *it;
- if(numLength.size() <= length)
- numLength.resize(length + 1, 0);
- numLength[length]++;
- }
-
- m_lengthIndex.resize(numLength.size());
- m_lengthIndex[0] = 0;
- for(size_t l = 1; l < numLength.size(); l++)
- m_lengthIndex[l] = m_lengthIndex[l - 1] + numLength[l - 1];
-
- size_t maxLength = numLength.size() - 1;
-
- m_firstCodes.resize(maxLength + 1, 0);
- for(size_t l = maxLength - 1; l > 0; l--)
- m_firstCodes[l] = (m_firstCodes[l + 1] + numLength[l + 1]) / 2;
-
- std::vector<Data> t_symbols;
- t_symbols.resize(lengths.size());
-
- std::vector<size_t> nextCode = m_firstCodes;
- for(size_t i = 0; i < lengths.size(); i++)
- {
- Data data = m_symbols[i];
- size_t length = lengths[i];
-
- size_t pos = m_lengthIndex[length]
- + (nextCode[length] - m_firstCodes[length]);
- t_symbols[pos] = data;
-
- nextCode[length] = nextCode[length] + 1;
- }
-
- m_symbols.swap(t_symbols);
- }
-
- void CreateCodeMap()
- {
- for(size_t l = 1; l < m_lengthIndex.size(); l++)
- {
- size_t intCode = m_firstCodes[l];
- size_t num = ((l+1 < m_lengthIndex.size()) ? m_lengthIndex[l+1]
- : m_symbols.size()) - m_lengthIndex[l];
-
- for(size_t i = 0; i < num; i++)
- {
- Data data = m_symbols[m_lengthIndex[l] + i];
- boost::dynamic_bitset<> bitCode(l, intCode);
- m_encodeMap[data] = bitCode;
- intCode++;
- }
- }
- }
-
- boost::dynamic_bitset<>& Encode(Data data)
- {
- return m_encodeMap[data];
+ typedef boost::unordered_map<Data, boost::dynamic_bitset<> > EncodeMap;
+ EncodeMap m_encodeMap;
+
+ struct MinHeapSorter {
+ std::vector<size_t>& m_vec;
+
+ MinHeapSorter(std::vector<size_t>& vec) : m_vec(vec) { }
+
+ bool operator()(size_t a, size_t b) {
+ return m_vec[a] > m_vec[b];
}
-
- template <class BitWrapper>
- void PutCode(BitWrapper& bitWrapper, boost::dynamic_bitset<>& code)
- {
- for(int j = code.size()-1; j >= 0; j--)
- bitWrapper.Put(code[j]);
+ };
+
+ template <class Iterator>
+ void CalcLengths(Iterator begin, Iterator end, std::vector<size_t>& lengths) {
+ size_t n = std::distance(begin, end);
+ std::vector<size_t> A(2 * n, 0);
+
+ m_symbols.resize(n);
+ size_t i = 0;
+ for(Iterator it = begin; it != end; it++) {
+ m_symbols[i] = it->first;
+
+ A[i] = n + i;
+ A[n + i] = it->second;
+ i++;
}
-
- public:
-
- template <class Iterator>
- CanonicalHuffman(Iterator begin, Iterator end, bool forEncoding = true)
- {
- std::vector<size_t> lengths;
- CalcLengths(begin, end, lengths);
- CalcCodes(lengths);
-
- if(forEncoding)
- CreateCodeMap();
+
+ if(n == 1) {
+ lengths.push_back(1);
+ return;
}
-
- CanonicalHuffman(std::FILE* pFile, bool forEncoding = false)
- {
- Load(pFile);
-
- if(forEncoding)
- CreateCodeMap();
+
+ MinHeapSorter hs(A);
+ std::make_heap(A.begin(), A.begin() + n, hs);
+
+ size_t h = n;
+ size_t m1, m2;
+ while(h > 1) {
+ m1 = A[0];
+ std::pop_heap(A.begin(), A.begin() + h, hs);
+
+ h--;
+
+ m2 = A[0];
+ std::pop_heap(A.begin(), A.begin() + h, hs);
+
+ A[h] = A[m1] + A[m2];
+ A[h-1] = h;
+ A[m1] = A[m2] = h;
+
+ std::push_heap(A.begin(), A.begin() + h, hs);
}
-
- template <class BitWrapper>
- void Put(BitWrapper& bitWrapper, Data data)
- {
- PutCode(bitWrapper, Encode(data));
+
+ A[1] = 0;
+ for(size_t i = 2; i < 2*n; i++)
+ A[i] = A[A[i]] + 1;
+
+ lengths.resize(n);
+ for(size_t i = 0; i < n; i++)
+ lengths[i] = A[i + n];
+ }
+
+ void CalcCodes(std::vector<size_t>& lengths) {
+ std::vector<size_t> numLength;
+ for(std::vector<size_t>::iterator it = lengths.begin();
+ it != lengths.end(); it++) {
+ size_t length = *it;
+ if(numLength.size() <= length)
+ numLength.resize(length + 1, 0);
+ numLength[length]++;
}
-
- template <class BitWrapper>
- Data Read(BitWrapper& bitWrapper)
- {
- if(bitWrapper.TellFromEnd())
- {
- size_t intCode = bitWrapper.Read();
- size_t len = 1;
- while(intCode < m_firstCodes[len]) {
- intCode = 2 * intCode + bitWrapper.Read();
- len++;
- }
- return m_symbols[m_lengthIndex[len] + (intCode - m_firstCodes[len])];
- }
- return Data();
+
+ m_lengthIndex.resize(numLength.size());
+ m_lengthIndex[0] = 0;
+ for(size_t l = 1; l < numLength.size(); l++)
+ m_lengthIndex[l] = m_lengthIndex[l - 1] + numLength[l - 1];
+
+ size_t maxLength = numLength.size() - 1;
+
+ m_firstCodes.resize(maxLength + 1, 0);
+ for(size_t l = maxLength - 1; l > 0; l--)
+ m_firstCodes[l] = (m_firstCodes[l + 1] + numLength[l + 1]) / 2;
+
+ std::vector<Data> t_symbols;
+ t_symbols.resize(lengths.size());
+
+ std::vector<size_t> nextCode = m_firstCodes;
+ for(size_t i = 0; i < lengths.size(); i++) {
+ Data data = m_symbols[i];
+ size_t length = lengths[i];
+
+ size_t pos = m_lengthIndex[length]
+ + (nextCode[length] - m_firstCodes[length]);
+ t_symbols[pos] = data;
+
+ nextCode[length] = nextCode[length] + 1;
}
-
- size_t Load(std::FILE* pFile)
- {
- size_t start = std::ftell(pFile);
- size_t read = 0;
-
- size_t size;
- read += std::fread(&size, sizeof(size_t), 1, pFile);
- m_symbols.resize(size);
- read += std::fread(&m_symbols[0], sizeof(Data), size, pFile);
-
- read += std::fread(&size, sizeof(size_t), 1, pFile);
- m_firstCodes.resize(size);
- read += std::fread(&m_firstCodes[0], sizeof(size_t), size, pFile);
-
- read += std::fread(&size, sizeof(size_t), 1, pFile);
- m_lengthIndex.resize(size);
- read += std::fread(&m_lengthIndex[0], sizeof(size_t), size, pFile);
-
- return std::ftell(pFile) - start;
+
+ m_symbols.swap(t_symbols);
+ }
+
+ void CreateCodeMap() {
+ for(size_t l = 1; l < m_lengthIndex.size(); l++) {
+ size_t intCode = m_firstCodes[l];
+ size_t num = ((l+1 < m_lengthIndex.size()) ? m_lengthIndex[l+1]
+ : m_symbols.size()) - m_lengthIndex[l];
+
+ for(size_t i = 0; i < num; i++) {
+ Data data = m_symbols[m_lengthIndex[l] + i];
+ boost::dynamic_bitset<> bitCode(l, intCode);
+ m_encodeMap[data] = bitCode;
+ intCode++;
+ }
}
-
- size_t Save(std::FILE* pFile)
- {
- size_t start = std::ftell(pFile);
-
- size_t size = m_symbols.size();
- ThrowingFwrite(&size, sizeof(size_t), 1, pFile);
- ThrowingFwrite(&m_symbols[0], sizeof(Data), size, pFile);
-
- size = m_firstCodes.size();
- ThrowingFwrite(&size, sizeof(size_t), 1, pFile);
- ThrowingFwrite(&m_firstCodes[0], sizeof(size_t), size, pFile);
-
- size = m_lengthIndex.size();
- ThrowingFwrite(&size, sizeof(size_t), 1, pFile);
- ThrowingFwrite(&m_lengthIndex[0], sizeof(size_t), size, pFile);
-
- return std::ftell(pFile) - start;
+ }
+
+ boost::dynamic_bitset<>& Encode(Data data) {
+ return m_encodeMap[data];
+ }
+
+ template <class BitWrapper>
+ void PutCode(BitWrapper& bitWrapper, boost::dynamic_bitset<>& code) {
+ for(int j = code.size()-1; j >= 0; j--)
+ bitWrapper.Put(code[j]);
+ }
+
+public:
+
+ template <class Iterator>
+ CanonicalHuffman(Iterator begin, Iterator end, bool forEncoding = true) {
+ std::vector<size_t> lengths;
+ CalcLengths(begin, end, lengths);
+ CalcCodes(lengths);
+
+ if(forEncoding)
+ CreateCodeMap();
+ }
+
+ CanonicalHuffman(std::FILE* pFile, bool forEncoding = false) {
+ Load(pFile);
+
+ if(forEncoding)
+ CreateCodeMap();
+ }
+
+ template <class BitWrapper>
+ void Put(BitWrapper& bitWrapper, Data data) {
+ PutCode(bitWrapper, Encode(data));
+ }
+
+ template <class BitWrapper>
+ Data Read(BitWrapper& bitWrapper) {
+ if(bitWrapper.TellFromEnd()) {
+ size_t intCode = bitWrapper.Read();
+ size_t len = 1;
+ while(intCode < m_firstCodes[len]) {
+ intCode = 2 * intCode + bitWrapper.Read();
+ len++;
+ }
+ return m_symbols[m_lengthIndex[len] + (intCode - m_firstCodes[len])];
}
+ return Data();
+ }
+
+ size_t Load(std::FILE* pFile) {
+ size_t start = std::ftell(pFile);
+ size_t read = 0;
+
+ size_t size;
+ read += std::fread(&size, sizeof(size_t), 1, pFile);
+ m_symbols.resize(size);
+ read += std::fread(&m_symbols[0], sizeof(Data), size, pFile);
+
+ read += std::fread(&size, sizeof(size_t), 1, pFile);
+ m_firstCodes.resize(size);
+ read += std::fread(&m_firstCodes[0], sizeof(size_t), size, pFile);
+
+ read += std::fread(&size, sizeof(size_t), 1, pFile);
+ m_lengthIndex.resize(size);
+ read += std::fread(&m_lengthIndex[0], sizeof(size_t), size, pFile);
+
+ return std::ftell(pFile) - start;
+ }
+
+ size_t Save(std::FILE* pFile) {
+ size_t start = std::ftell(pFile);
+
+ size_t size = m_symbols.size();
+ ThrowingFwrite(&size, sizeof(size_t), 1, pFile);
+ ThrowingFwrite(&m_symbols[0], sizeof(Data), size, pFile);
+
+ size = m_firstCodes.size();
+ ThrowingFwrite(&size, sizeof(size_t), 1, pFile);
+ ThrowingFwrite(&m_firstCodes[0], sizeof(size_t), size, pFile);
+
+ size = m_lengthIndex.size();
+ ThrowingFwrite(&size, sizeof(size_t), 1, pFile);
+ ThrowingFwrite(&m_lengthIndex[0], sizeof(size_t), size, pFile);
+
+ return std::ftell(pFile) - start;
+ }
};
template <class Container = std::string>
class BitWrapper
{
- private:
- Container& m_data;
-
- typename Container::iterator m_iterator;
- typename Container::value_type m_currentValue;
-
- size_t m_valueBits;
- typename Container::value_type m_mask;
- size_t m_bitPos;
-
- public:
-
- BitWrapper(Container &data)
+private:
+ Container& m_data;
+
+ typename Container::iterator m_iterator;
+ typename Container::value_type m_currentValue;
+
+ size_t m_valueBits;
+ typename Container::value_type m_mask;
+ size_t m_bitPos;
+
+public:
+
+ BitWrapper(Container &data)
: m_data(data), m_iterator(m_data.begin()), m_currentValue(0),
m_valueBits(sizeof(typename Container::value_type) * 8),
m_mask(1), m_bitPos(0) { }
-
- bool Read()
- {
- if(m_bitPos % m_valueBits == 0)
- {
- if(m_iterator != m_data.end())
- m_currentValue = *m_iterator++;
- }
- else
- m_currentValue = m_currentValue >> 1;
-
- m_bitPos++;
- return (m_currentValue & m_mask);
- }
-
- void Put(bool bit) {
- if(m_bitPos % m_valueBits == 0)
- m_data.push_back(0);
-
- if(bit)
- m_data[m_data.size()-1] |= m_mask << (m_bitPos % m_valueBits);
-
- m_bitPos++;
- }
-
- size_t Tell()
- {
- return m_bitPos;
- }
-
- size_t TellFromEnd()
- {
- if(m_data.size() * m_valueBits < m_bitPos)
- return 0;
- return m_data.size() * m_valueBits - m_bitPos;
- }
-
- void Seek(size_t bitPos)
- {
- m_bitPos = bitPos;
- m_iterator = m_data.begin() + int((m_bitPos-1)/m_valueBits);
- m_currentValue = (*m_iterator) >> ((m_bitPos-1) % m_valueBits);
- m_iterator++;
- }
-
- void SeekFromEnd(size_t bitPosFromEnd)
- {
- size_t bitPos = m_data.size() * m_valueBits - bitPosFromEnd;
- Seek(bitPos);
- }
-
- void Reset()
- {
- m_iterator = m_data.begin();
- m_currentValue = 0;
- m_bitPos = 0;
- }
-
- Container& GetContainer()
- {
- return m_data;
- }
+
+ bool Read() {
+ if(m_bitPos % m_valueBits == 0) {
+ if(m_iterator != m_data.end())
+ m_currentValue = *m_iterator++;
+ } else
+ m_currentValue = m_currentValue >> 1;
+
+ m_bitPos++;
+ return (m_currentValue & m_mask);
+ }
+
+ void Put(bool bit) {
+ if(m_bitPos % m_valueBits == 0)
+ m_data.push_back(0);
+
+ if(bit)
+ m_data[m_data.size()-1] |= m_mask << (m_bitPos % m_valueBits);
+
+ m_bitPos++;
+ }
+
+ size_t Tell() {
+ return m_bitPos;
+ }
+
+ size_t TellFromEnd() {
+ if(m_data.size() * m_valueBits < m_bitPos)
+ return 0;
+ return m_data.size() * m_valueBits - m_bitPos;
+ }
+
+ void Seek(size_t bitPos) {
+ m_bitPos = bitPos;
+ m_iterator = m_data.begin() + int((m_bitPos-1)/m_valueBits);
+ m_currentValue = (*m_iterator) >> ((m_bitPos-1) % m_valueBits);
+ m_iterator++;
+ }
+
+ void SeekFromEnd(size_t bitPosFromEnd) {
+ size_t bitPos = m_data.size() * m_valueBits - bitPosFromEnd;
+ Seek(bitPos);
+ }
+
+ void Reset() {
+ m_iterator = m_data.begin();
+ m_currentValue = 0;
+ m_bitPos = 0;
+ }
+
+ Container& GetContainer() {
+ return m_data;
+ }
};
}
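
The reformatted class above is a textbook canonical Huffman coder: CalcLengths runs the in-place heap algorithm over symbol frequencies to get code lengths, CalcCodes derives m_firstCodes (the smallest integer code of each length) and m_lengthIndex (where each length's symbols start in m_symbols), and Read decodes by widening the code one bit at a time until it is no longer below the first code of its length. A self-contained demo of that decode loop on a hand-built three-symbol table; note it reads bits from a plain MSB-first string, whereas the real class pulls them from the end of a BitWrapper container:

    #include <cstddef>
    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
      // Symbols a (code length 1), b and c (length 2), laid out by length
      // as CalcCodes() would store them: codes are "1", "00", "01".
      std::vector<char> symbols;
      symbols.push_back('a'); symbols.push_back('b'); symbols.push_back('c');
      std::vector<std::size_t> firstCodes(3, 0);   // cf. m_firstCodes
      firstCodes[1] = 1;                           // length-1 codes start at 1
      firstCodes[2] = 0;                           // length-2 codes start at 0
      std::vector<std::size_t> lengthIndex(3, 0);  // cf. m_lengthIndex
      lengthIndex[1] = 0;                          // length-1 symbols at offset 0
      lengthIndex[2] = 1;                          // length-2 symbols at offset 1

      std::string bits = "100011";                 // "1","00","01","1" = a b c a
      std::size_t pos = 0;
      while (pos < bits.size()) {
        // Same loop as CanonicalHuffman::Read above.
        std::size_t intCode = bits[pos++] - '0';
        std::size_t len = 1;
        while (intCode < firstCodes[len]) {        // below this length's first
          intCode = 2 * intCode + (bits[pos++] - '0');  // code: widen by a bit
          ++len;
        }
        std::cout << symbols[lengthIndex[len] + (intCode - firstCodes[len])];
      }
      std::cout << std::endl;                      // prints: abca
      return 0;
    }
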
diff --git a/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.cpp b/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.cpp
index 40fff6690..8e4d1641f 100644
--- a/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.cpp
+++ b/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.cpp
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#ifdef HAVE_CMPH
@@ -25,70 +25,70 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
namespace Moses
{
-
- void CmphStringVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen)
- {
- delete[] key;
- }
-
- void CmphStringVectorAdapterRewind(void *data)
- {
- cmph_vector_t *cmph_vector = (cmph_vector_t *)data;
- cmph_vector->position = 0;
- }
-
- //************************************************************************//
-
- cmph_io_adapter_t *CmphVectorAdapterNew(std::vector<std::string>& v)
- {
- cmph_io_adapter_t * key_source = (cmph_io_adapter_t *)malloc(sizeof(cmph_io_adapter_t));
- cmph_vector_t * cmph_vector = (cmph_vector_t *)malloc(sizeof(cmph_vector_t));
- assert(key_source);
- assert(cmph_vector);
-
- cmph_vector->vector = (void *)&v;
- cmph_vector->position = 0;
- key_source->data = (void *)cmph_vector;
- key_source->nkeys = v.size();
-
- return key_source;
- }
-
- int CmphVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen)
- {
- cmph_vector_t *cmph_vector = (cmph_vector_t *)data;
- std::vector<std::string>* v = (std::vector<std::string>*)cmph_vector->vector;
- size_t size;
- *keylen = (*v)[cmph_vector->position].size();
- size = *keylen;
- *key = new char[size + 1];
- std::string temp = (*v)[cmph_vector->position];
- strcpy(*key, temp.c_str());
- cmph_vector->position = cmph_vector->position + 1;
- return (int)(*keylen);
- }
-
- void CmphVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen)
- {
- delete[] key;
- }
-
- void CmphVectorAdapterRewind(void *data)
- {
- cmph_vector_t *cmph_vector = (cmph_vector_t *)data;
- cmph_vector->position = 0;
- }
-
- cmph_io_adapter_t* CmphVectorAdapter(std::vector<std::string>& v)
- {
- cmph_io_adapter_t * key_source = CmphVectorAdapterNew(v);
-
- key_source->read = CmphVectorAdapterRead;
- key_source->dispose = CmphVectorAdapterDispose;
- key_source->rewind = CmphVectorAdapterRewind;
- return key_source;
- }
-
+
+void CmphStringVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen)
+{
+ delete[] key;
+}
+
+void CmphStringVectorAdapterRewind(void *data)
+{
+ cmph_vector_t *cmph_vector = (cmph_vector_t *)data;
+ cmph_vector->position = 0;
+}
+
+//************************************************************************//
+
+cmph_io_adapter_t *CmphVectorAdapterNew(std::vector<std::string>& v)
+{
+ cmph_io_adapter_t * key_source = (cmph_io_adapter_t *)malloc(sizeof(cmph_io_adapter_t));
+ cmph_vector_t * cmph_vector = (cmph_vector_t *)malloc(sizeof(cmph_vector_t));
+ assert(key_source);
+ assert(cmph_vector);
+
+ cmph_vector->vector = (void *)&v;
+ cmph_vector->position = 0;
+ key_source->data = (void *)cmph_vector;
+ key_source->nkeys = v.size();
+
+ return key_source;
+}
+
+int CmphVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen)
+{
+ cmph_vector_t *cmph_vector = (cmph_vector_t *)data;
+ std::vector<std::string>* v = (std::vector<std::string>*)cmph_vector->vector;
+ size_t size;
+ *keylen = (*v)[cmph_vector->position].size();
+ size = *keylen;
+ *key = new char[size + 1];
+ std::string temp = (*v)[cmph_vector->position];
+ strcpy(*key, temp.c_str());
+ cmph_vector->position = cmph_vector->position + 1;
+ return (int)(*keylen);
+}
+
+void CmphVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen)
+{
+ delete[] key;
+}
+
+void CmphVectorAdapterRewind(void *data)
+{
+ cmph_vector_t *cmph_vector = (cmph_vector_t *)data;
+ cmph_vector->position = 0;
+}
+
+cmph_io_adapter_t* CmphVectorAdapter(std::vector<std::string>& v)
+{
+ cmph_io_adapter_t * key_source = CmphVectorAdapterNew(v);
+
+ key_source->read = CmphVectorAdapterRead;
+ key_source->dispose = CmphVectorAdapterDispose;
+ key_source->rewind = CmphVectorAdapterRewind;
+ return key_source;
+}
+
}
#endif
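
These adapters exist because CMPH pulls keys through a cmph_io_adapter_t of callbacks (read, dispose, rewind) instead of requiring one contiguous key array. Note the pairing: the read callback allocates each key with new[], so the dispose callback correctly uses delete[] rather than free. A hedged sketch of driving CMPH through the vector adapter, mirroring the calls BlockHashIndex::CalcHash makes (requires linking libcmph; the helper names are ours, and freeing the malloc'd adapter structs is left aside):

    #ifdef HAVE_CMPH
    #include <string>
    #include <vector>
    #include "cmph.h"
    // #include "moses/TranslationModel/CompactPT/CmphStringVectorAdapter.h"

    // Build a CHD minimal perfect hash over a vector of keys.
    cmph_t* BuildChdHash(std::vector<std::string>& keys) {
      cmph_io_adapter_t* source = Moses::CmphVectorAdapter(keys);
      cmph_config_t* config = cmph_config_new(source);
      cmph_config_set_algo(config, CMPH_CHD);      // same algorithm as above
      cmph_t* hash = cmph_new(config);
      cmph_config_destroy(config);
      return hash;                                 // caller runs cmph_destroy()
    }

    // Lookup: any key maps into [0, nkeys); see the fingerprint note earlier.
    unsigned Slot(cmph_t* hash, const std::string& key) {
      return cmph_search(hash, key.c_str(), (cmph_uint32)key.size());
    }
    #endif
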
diff --git a/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.h b/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.h
index 5516d4f4d..4a532c289 100644
--- a/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.h
+++ b/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.h
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#ifndef moses_CmphStringVectorAdapterNew_h
#define moses_CmphStringVectorAdapterNew_h
@@ -33,72 +33,71 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
namespace Moses
{
- typedef struct
- {
- void *vector;
- cmph_uint32 position;
- }
- cmph_vector_t;
-
-
- template <typename ValueT, typename PosT, template <typename> class Allocator>
- cmph_io_adapter_t *CmphStringVectorAdapterNew(StringVector<ValueT, PosT, Allocator>& sv)
- {
- cmph_io_adapter_t * key_source = (cmph_io_adapter_t *)malloc(sizeof(cmph_io_adapter_t));
- cmph_vector_t * cmph_vector = (cmph_vector_t *)malloc(sizeof(cmph_vector_t));
- assert(key_source);
- assert(cmph_vector);
-
- cmph_vector->vector = (void *)&sv;
- cmph_vector->position = 0;
- key_source->data = (void *)cmph_vector;
- key_source->nkeys = sv.size();
-
- return key_source;
- }
-
- template <typename ValueT, typename PosT, template <typename> class Allocator>
- int CmphStringVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen)
- {
- cmph_vector_t *cmph_vector = (cmph_vector_t *)data;
- StringVector<ValueT, PosT, Allocator>* sv = (StringVector<ValueT, PosT, Allocator>*)cmph_vector->vector;
- size_t size;
- *keylen = (*sv)[cmph_vector->position].size();
- size = *keylen;
- *key = new char[size + 1];
- std::string temp = (*sv)[cmph_vector->position];
- std::strcpy(*key, temp.c_str());
- cmph_vector->position = cmph_vector->position + 1;
- return (int)(*keylen);
- }
-
- void CmphStringVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen);
-
- void CmphStringVectorAdapterRewind(void *data);
-
- template <typename ValueT, typename PosT, template <typename> class Allocator>
- cmph_io_adapter_t* CmphStringVectorAdapter(StringVector<ValueT, PosT, Allocator>& sv)
- {
- cmph_io_adapter_t * key_source = CmphStringVectorAdapterNew(sv);
-
- key_source->read = CmphStringVectorAdapterRead<ValueT, PosT, Allocator>;
- key_source->dispose = CmphStringVectorAdapterDispose;
- key_source->rewind = CmphStringVectorAdapterRewind;
- return key_source;
- }
-
- //************************************************************************//
-
- cmph_io_adapter_t *CmphVectorAdapterNew(std::vector<std::string>& v);
-
- int CmphVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen);
-
- void CmphVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen);
-
- void CmphVectorAdapterRewind(void *data);
-
- cmph_io_adapter_t* CmphVectorAdapter(std::vector<std::string>& v);
-
+typedef struct {
+ void *vector;
+ cmph_uint32 position;
+}
+cmph_vector_t;
+
+
+template <typename ValueT, typename PosT, template <typename> class Allocator>
+cmph_io_adapter_t *CmphStringVectorAdapterNew(StringVector<ValueT, PosT, Allocator>& sv)
+{
+ cmph_io_adapter_t * key_source = (cmph_io_adapter_t *)malloc(sizeof(cmph_io_adapter_t));
+ cmph_vector_t * cmph_vector = (cmph_vector_t *)malloc(sizeof(cmph_vector_t));
+ assert(key_source);
+ assert(cmph_vector);
+
+ cmph_vector->vector = (void *)&sv;
+ cmph_vector->position = 0;
+ key_source->data = (void *)cmph_vector;
+ key_source->nkeys = sv.size();
+
+ return key_source;
+}
+
+template <typename ValueT, typename PosT, template <typename> class Allocator>
+int CmphStringVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen)
+{
+ cmph_vector_t *cmph_vector = (cmph_vector_t *)data;
+ StringVector<ValueT, PosT, Allocator>* sv = (StringVector<ValueT, PosT, Allocator>*)cmph_vector->vector;
+ size_t size;
+ *keylen = (*sv)[cmph_vector->position].size();
+ size = *keylen;
+ *key = new char[size + 1];
+ std::string temp = (*sv)[cmph_vector->position];
+ std::strcpy(*key, temp.c_str());
+ cmph_vector->position = cmph_vector->position + 1;
+ return (int)(*keylen);
+}
+
+void CmphStringVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen);
+
+void CmphStringVectorAdapterRewind(void *data);
+
+template <typename ValueT, typename PosT, template <typename> class Allocator>
+cmph_io_adapter_t* CmphStringVectorAdapter(StringVector<ValueT, PosT, Allocator>& sv)
+{
+ cmph_io_adapter_t * key_source = CmphStringVectorAdapterNew(sv);
+
+ key_source->read = CmphStringVectorAdapterRead<ValueT, PosT, Allocator>;
+ key_source->dispose = CmphStringVectorAdapterDispose;
+ key_source->rewind = CmphStringVectorAdapterRewind;
+ return key_source;
+}
+
+//************************************************************************//
+
+cmph_io_adapter_t *CmphVectorAdapterNew(std::vector<std::string>& v);
+
+int CmphVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen);
+
+void CmphVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen);
+
+void CmphVectorAdapterRewind(void *data);
+
+cmph_io_adapter_t* CmphVectorAdapter(std::vector<std::string>& v);
+
}
#endif
diff --git a/moses/TranslationModel/CompactPT/ConsistentPhrases.h b/moses/TranslationModel/CompactPT/ConsistentPhrases.h
index 0ec86e1ac..c7b7c733b 100644
--- a/moses/TranslationModel/CompactPT/ConsistentPhrases.h
+++ b/moses/TranslationModel/CompactPT/ConsistentPhrases.h
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#ifndef moses_ConsistentPhrases_h
#define moses_ConsistentPhrases_h
@@ -29,97 +29,82 @@ namespace Moses
class ConsistentPhrases
{
- public:
- struct Phrase
- {
- int i, j, m, n;
- Phrase(int i_, int m_, int j_, int n_) : i(i_), j(j_), m(m_), n(n_) { }
- };
-
- struct PhraseSorter
- {
- bool operator()(Phrase a, Phrase b)
- {
- if(a.n > b.n)
- return true;
- if(a.n == b.n && a.j < b.j)
- return true;
- if(a.n == b.n && a.j == b.j && a.m > b.m)
- return true;
- if(a.n == b.n && a.j == b.j && a.m == b.m && a.i < b.i)
- return true;
- return false;
- }
- };
-
- private:
- typedef std::set<Phrase, PhraseSorter> PhraseQueue;
- PhraseQueue m_phraseQueue;
-
- typedef std::pair<unsigned char, unsigned char> AlignPoint;
- typedef std::set<AlignPoint> Alignment;
-
- public:
-
- ConsistentPhrases(int mmax, int nmax, Alignment& a)
- {
- for(int i = 0; i < mmax; i++)
- {
- for(int m = 1; m <= mmax-i; m++)
- {
- for(int j = 0; j < nmax; j++)
- {
- for(int n = 1; n <= nmax-j; n++)
- {
- bool consistant = true;
- for(Alignment::iterator it = a.begin(); it != a.end(); it++)
- {
- int ip = it->first;
- int jp = it->second;
- if((i <= ip && ip < i+m) != (j <= jp && jp < j+n))
- {
- consistant = false;
- break;
- }
+public:
+ struct Phrase {
+ int i, j, m, n;
+ Phrase(int i_, int m_, int j_, int n_) : i(i_), j(j_), m(m_), n(n_) { }
+ };
+
+ struct PhraseSorter {
+ bool operator()(Phrase a, Phrase b) {
+ if(a.n > b.n)
+ return true;
+ if(a.n == b.n && a.j < b.j)
+ return true;
+ if(a.n == b.n && a.j == b.j && a.m > b.m)
+ return true;
+ if(a.n == b.n && a.j == b.j && a.m == b.m && a.i < b.i)
+ return true;
+ return false;
+ }
+ };
+
+private:
+ typedef std::set<Phrase, PhraseSorter> PhraseQueue;
+ PhraseQueue m_phraseQueue;
+
+ typedef std::pair<unsigned char, unsigned char> AlignPoint;
+ typedef std::set<AlignPoint> Alignment;
+
+public:
+
+ ConsistentPhrases(int mmax, int nmax, Alignment& a) {
+ for(int i = 0; i < mmax; i++) {
+ for(int m = 1; m <= mmax-i; m++) {
+ for(int j = 0; j < nmax; j++) {
+ for(int n = 1; n <= nmax-j; n++) {
+ bool consistant = true;
+ for(Alignment::iterator it = a.begin(); it != a.end(); it++) {
+ int ip = it->first;
+ int jp = it->second;
+ if((i <= ip && ip < i+m) != (j <= jp && jp < j+n)) {
+ consistant = false;
+ break;
}
- if(consistant)
- m_phraseQueue.insert(Phrase(i, m, j, n));
- }
- }
- }
+ }
+ if(consistant)
+ m_phraseQueue.insert(Phrase(i, m, j, n));
+ }
+ }
}
- m_phraseQueue.erase(Phrase(0, mmax, 0, nmax));
- }
-
- size_t Empty()
- {
- return !m_phraseQueue.size();
}
-
- Phrase Pop()
- {
- if(m_phraseQueue.size())
- {
- Phrase p = *m_phraseQueue.begin();
- m_phraseQueue.erase(m_phraseQueue.begin());
- return p;
- }
- return Phrase(0,0,0,0);
+ m_phraseQueue.erase(Phrase(0, mmax, 0, nmax));
+ }
+
+ size_t Empty() {
+ return !m_phraseQueue.size();
+ }
+
+ Phrase Pop() {
+ if(m_phraseQueue.size()) {
+ Phrase p = *m_phraseQueue.begin();
+ m_phraseQueue.erase(m_phraseQueue.begin());
+ return p;
}
-
- void RemoveOverlap(Phrase p)
- {
- PhraseQueue ok;
- for(PhraseQueue::iterator it = m_phraseQueue.begin(); it != m_phraseQueue.end(); it++)
- {
- Phrase pp = *it;
- if(!((p.i <= pp.i && pp.i < p.i + p.m) || (pp.i <= p.i && p.i < pp.i + pp.m) ||
- (p.j <= pp.j && pp.j < p.j + p.n) || (pp.j <= p.j && p.j < pp.j + pp.n)))
- ok.insert(pp);
- }
- m_phraseQueue = ok;
+ return Phrase(0,0,0,0);
+ }
+
+ void RemoveOverlap(Phrase p) {
+ PhraseQueue ok;
+ for(PhraseQueue::iterator it = m_phraseQueue.begin(); it != m_phraseQueue.end(); it++) {
+ Phrase pp = *it;
+ if(!((p.i <= pp.i && pp.i < p.i + p.m) || (pp.i <= p.i && p.i < pp.i + pp.m) ||
+ (p.j <= pp.j && pp.j < p.j + p.n) || (pp.j <= p.j && p.j < pp.j + pp.n)))
+ ok.insert(pp);
}
-
+ m_phraseQueue = ok;
+ }
+
};
}
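
ConsistentPhrases enumerates all sub-phrase pairs of an aligned phrase pair that are consistent in the usual phrase-extraction sense: every alignment point must fall inside the source span exactly when it falls inside the target span. PhraseSorter then pops candidates widest-target-span first, and RemoveOverlap discards everything overlapping a chosen pair. The test in isolation, plus a two-point worked example (names ours, not the Moses API):

    #include <iostream>
    #include <set>
    #include <utility>

    typedef std::set<std::pair<int, int> > Alignment;

    // True iff no alignment point links the inside of [i, i+m) x [j, j+n)
    // to its outside; this is the loop body of the constructor above.
    bool Consistent(const Alignment& a, int i, int m, int j, int n) {
      for (Alignment::const_iterator it = a.begin(); it != a.end(); ++it) {
        bool inSrc = (i <= it->first  && it->first  < i + m);
        bool inTgt = (j <= it->second && it->second < j + n);
        if (inSrc != inTgt)
          return false;
      }
      return true;
    }

    int main() {
      Alignment a;                 // diagonal alignment: 0-0 and 1-1
      a.insert(std::make_pair(0, 0));
      a.insert(std::make_pair(1, 1));
      std::cout << Consistent(a, 0, 1, 0, 1)   // 1: spans cover (0,0) only
                << Consistent(a, 0, 1, 0, 2)   // 0: (1,1) in target span only
                << std::endl;
      return 0;
    }
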
diff --git a/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp b/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp
index ff1c663c9..ad7591a7b 100644
--- a/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp
+++ b/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp
@@ -1,27 +1,28 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#include "LexicalReorderingTableCompact.h"
-namespace Moses {
+namespace Moses
+{
LexicalReorderingTableCompact::LexicalReorderingTableCompact(
const std::string& filePath,
@@ -29,9 +30,9 @@ LexicalReorderingTableCompact::LexicalReorderingTableCompact(
const std::vector<FactorType>& e_factors,
const std::vector<FactorType>& c_factors)
: LexicalReorderingTable(f_factors, e_factors, c_factors),
- m_inMemory(StaticData::Instance().UseMinlexrInMemory()),
- m_numScoreComponent(6), m_multipleScoreTrees(true),
- m_hash(10, 16), m_scoreTrees(1)
+ m_inMemory(StaticData::Instance().UseMinlexrInMemory()),
+ m_numScoreComponent(6), m_multipleScoreTrees(true),
+ m_hash(10, 16), m_scoreTrees(1)
{
Load(filePath);
}
@@ -41,12 +42,13 @@ LexicalReorderingTableCompact::LexicalReorderingTableCompact(
const std::vector<FactorType>& e_factors,
const std::vector<FactorType>& c_factors)
: LexicalReorderingTable(f_factors, e_factors, c_factors),
- m_inMemory(StaticData::Instance().UseMinlexrInMemory()),
- m_numScoreComponent(6), m_multipleScoreTrees(true),
- m_hash(10, 16), m_scoreTrees(1)
+ m_inMemory(StaticData::Instance().UseMinlexrInMemory()),
+ m_numScoreComponent(6), m_multipleScoreTrees(true),
+ m_hash(10, 16), m_scoreTrees(1)
{ }
-LexicalReorderingTableCompact::~LexicalReorderingTableCompact() {
+LexicalReorderingTableCompact::~LexicalReorderingTableCompact()
+{
for(size_t i = 0; i < m_scoreTrees.size(); i++)
delete m_scoreTrees[i];
}
@@ -57,25 +59,23 @@ std::vector<float> LexicalReorderingTableCompact::GetScore(const Phrase& f,
{
std::string key;
Scores scores;
-
+
if(0 == c.GetSize())
key = MakeKey(f, e, c);
else
- for(size_t i = 0; i <= c.GetSize(); ++i)
- {
+ for(size_t i = 0; i <= c.GetSize(); ++i) {
Phrase sub_c(c.GetSubString(WordsRange(i,c.GetSize()-1)));
key = MakeKey(f,e,sub_c);
}
-
+
size_t index = m_hash[key];
- if(m_hash.GetSize() != index)
- {
+ if(m_hash.GetSize() != index) {
std::string scoresString;
if(m_inMemory)
scoresString = m_scoresMemory[index];
else
scoresString = m_scoresMapped[index];
-
+
BitWrapper<> bitStream(scoresString);
for(size_t i = 0; i < m_numScoreComponent; i++)
scores.push_back(m_scoreTrees[m_multipleScoreTrees ? i : 0]->Read(bitStream));
@@ -100,22 +100,17 @@ std::string LexicalReorderingTableCompact::MakeKey(const std::string& f,
const std::string& c) const
{
std::string key;
- if(!f.empty())
- {
+ if(!f.empty()) {
key += f;
}
- if(!m_FactorsE.empty())
- {
- if(!key.empty())
- {
+ if(!m_FactorsE.empty()) {
+ if(!key.empty()) {
key += " ||| ";
}
key += e;
}
- if(!m_FactorsC.empty())
- {
- if(!key.empty())
- {
+ if(!m_FactorsC.empty()) {
+ if(!key.empty()) {
key += " ||| ";
}
key += c;
@@ -133,48 +128,43 @@ LexicalReorderingTable* LexicalReorderingTableCompact::CheckAndLoad(
#ifdef HAVE_CMPH
std::string minlexr = ".minlexr";
// file name is specified without suffix
- if(FileExists(filePath + minlexr))
- {
+ if(FileExists(filePath + minlexr)) {
//there exists a compact binary version use that
- VERBOSE(2,"Using compact lexical reordering table" << std::endl);
- return new LexicalReorderingTableCompact(filePath + minlexr, f_factors, e_factors, c_factors);
+ VERBOSE(2,"Using compact lexical reordering table" << std::endl);
+ return new LexicalReorderingTableCompact(filePath + minlexr, f_factors, e_factors, c_factors);
}
// file name is specified with suffix
if(filePath.substr(filePath.length() - minlexr.length(), minlexr.length()) == minlexr
- && FileExists(filePath))
- {
+ && FileExists(filePath)) {
//there exists a compact binary version use that
- VERBOSE(2,"Using compact lexical reordering table" << std::endl);
- return new LexicalReorderingTableCompact(filePath, f_factors, e_factors, c_factors);
+ VERBOSE(2,"Using compact lexical reordering table" << std::endl);
+ return new LexicalReorderingTableCompact(filePath, f_factors, e_factors, c_factors);
}
#endif
return 0;
}
void LexicalReorderingTableCompact::Load(std::string filePath)
-{
+{
std::FILE* pFile = std::fopen(filePath.c_str(), "r");
if(m_inMemory)
m_hash.Load(pFile);
else
m_hash.LoadIndex(pFile);
-
+
size_t read = 0;
read += std::fread(&m_numScoreComponent, sizeof(m_numScoreComponent), 1, pFile);
read += std::fread(&m_multipleScoreTrees, sizeof(m_multipleScoreTrees), 1, pFile);
-
- if(m_multipleScoreTrees)
- {
+
+ if(m_multipleScoreTrees) {
m_scoreTrees.resize(m_numScoreComponent);
for(size_t i = 0; i < m_numScoreComponent; i++)
m_scoreTrees[i] = new CanonicalHuffman<float>(pFile);
- }
- else
- {
+ } else {
m_scoreTrees.resize(1);
m_scoreTrees[0] = new CanonicalHuffman<float>(pFile);
}
-
+
if(m_inMemory)
m_scoresMemory.load(pFile, false);
else
diff --git a/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.h b/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.h
index 849c61c08..46f2228c9 100644
--- a/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.h
+++ b/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.h
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#ifndef moses_LexicalReorderingTableCompact_h
#define moses_LexicalReorderingTableCompact_h
@@ -33,50 +33,51 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "CanonicalHuffman.h"
#include "StringVector.h"
-namespace Moses {
+namespace Moses
+{
class LexicalReorderingTableCompact: public LexicalReorderingTable
{
- private:
- bool m_inMemory;
-
- size_t m_numScoreComponent;
- bool m_multipleScoreTrees;
-
- BlockHashIndex m_hash;
-
- typedef CanonicalHuffman<float> ScoreTree;
- std::vector<ScoreTree*> m_scoreTrees;
-
- StringVector<unsigned char, unsigned long, MmapAllocator> m_scoresMapped;
- StringVector<unsigned char, unsigned long, std::allocator> m_scoresMemory;
-
- std::string MakeKey(const Phrase& f, const Phrase& e, const Phrase& c) const;
- std::string MakeKey(const std::string& f, const std::string& e, const std::string& c) const;
-
- public:
- LexicalReorderingTableCompact(
- const std::string& filePath,
- const std::vector<FactorType>& f_factors,
- const std::vector<FactorType>& e_factors,
- const std::vector<FactorType>& c_factors);
-
- LexicalReorderingTableCompact(
- const std::vector<FactorType>& f_factors,
- const std::vector<FactorType>& e_factors,
- const std::vector<FactorType>& c_factors);
-
- virtual ~LexicalReorderingTableCompact();
-
- virtual std::vector<float> GetScore(const Phrase& f, const Phrase& e, const Phrase& c);
-
- static LexicalReorderingTable* CheckAndLoad(
- const std::string& filePath,
- const std::vector<FactorType>& f_factors,
- const std::vector<FactorType>& e_factors,
- const std::vector<FactorType>& c_factors);
-
- void Load(std::string filePath);
+private:
+ bool m_inMemory;
+
+ size_t m_numScoreComponent;
+ bool m_multipleScoreTrees;
+
+ BlockHashIndex m_hash;
+
+ typedef CanonicalHuffman<float> ScoreTree;
+ std::vector<ScoreTree*> m_scoreTrees;
+
+ StringVector<unsigned char, unsigned long, MmapAllocator> m_scoresMapped;
+ StringVector<unsigned char, unsigned long, std::allocator> m_scoresMemory;
+
+ std::string MakeKey(const Phrase& f, const Phrase& e, const Phrase& c) const;
+ std::string MakeKey(const std::string& f, const std::string& e, const std::string& c) const;
+
+public:
+ LexicalReorderingTableCompact(
+ const std::string& filePath,
+ const std::vector<FactorType>& f_factors,
+ const std::vector<FactorType>& e_factors,
+ const std::vector<FactorType>& c_factors);
+
+ LexicalReorderingTableCompact(
+ const std::vector<FactorType>& f_factors,
+ const std::vector<FactorType>& e_factors,
+ const std::vector<FactorType>& c_factors);
+
+ virtual ~LexicalReorderingTableCompact();
+
+ virtual std::vector<float> GetScore(const Phrase& f, const Phrase& e, const Phrase& c);
+
+ static LexicalReorderingTable* CheckAndLoad(
+ const std::string& filePath,
+ const std::vector<FactorType>& f_factors,
+ const std::vector<FactorType>& e_factors,
+ const std::vector<FactorType>& c_factors);
+
+ void Load(std::string filePath);
};
}
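
The header above keys a (source, target, context) phrase triple into one string for the BlockHashIndex via the two MakeKey overloads. A minimal standalone sketch of that keying idea, assuming the phrases are joined with the same " ||| " separator used in the text-format tables (an illustration of the scheme, not the Moses implementation):

#include <iostream>
#include <string>

// Hypothetical stand-in for LexicalReorderingTableCompact::MakeKey.
static std::string MakeKeyDemo(const std::string& f,
                               const std::string& e,
                               const std::string& c) {
  // Join source, target and (optionally) context with " ||| " so a
  // single string can index the hash.
  std::string key = f;
  key += " ||| ";
  key += e;
  if (!c.empty()) {
    key += " ||| ";
    key += c;
  }
  return key;
}

int main() {
  std::cout << MakeKeyDemo("das Haus", "the house", "") << "\n";
  return 0;
}
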
diff --git a/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp b/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp
index a3eee1694..655ed01ca 100644
--- a/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp
+++ b/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#include "LexicalReorderingTableCreator.h"
#include "ThrowingFwrite.h"
@@ -25,7 +25,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "util/file.hh"
-namespace Moses {
+namespace Moses
+{
LexicalReorderingTableCreator::LexicalReorderingTableCreator(
std::string inPath, std::string outPath, std::string tempfilePath,
@@ -34,49 +35,47 @@ LexicalReorderingTableCreator::LexicalReorderingTableCreator(
#ifdef WITH_THREADS
, size_t threads
#endif
- )
+)
: m_inPath(inPath), m_outPath(outPath), m_tempfilePath(tempfilePath),
- m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits),
- m_numScoreComponent(0), m_multipleScoreTrees(multipleScoreTrees),
- m_quantize(quantize), m_separator(" ||| "),
- m_hash(m_orderBits, m_fingerPrintBits), m_lastFlushedLine(-1)
-#ifdef WITH_THREADS
- , m_threads(threads)
+ m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits),
+ m_numScoreComponent(0), m_multipleScoreTrees(multipleScoreTrees),
+ m_quantize(quantize), m_separator(" ||| "),
+ m_hash(m_orderBits, m_fingerPrintBits), m_lastFlushedLine(-1)
+#ifdef WITH_THREADS
+ , m_threads(threads)
#endif
-{
+{
PrintInfo();
-
+
m_outFile = std::fopen(m_outPath.c_str(), "w");
-
+
std::cerr << "Pass 1/2: Creating phrase index + Counting scores" << std::endl;
- m_hash.BeginSave(m_outFile);
+ m_hash.BeginSave(m_outFile);
if(tempfilePath.size()) {
MmapAllocator<unsigned char> allocEncoded(util::FMakeTemp(tempfilePath));
m_encodedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>(allocEncoded);
- }
- else {
+ } else {
m_encodedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>();
}
-
+
EncodeScores();
-
+
std::cerr << "Intermezzo: Calculating Huffman code sets" << std::endl;
CalcHuffmanCodes();
-
+
std::cerr << "Pass 2/2: Compressing scores" << std::endl;
-
-
- if(tempfilePath.size()) {
+
+
+ if(tempfilePath.size()) {
MmapAllocator<unsigned char> allocCompressed(util::FMakeTemp(tempfilePath));
m_compressedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>(allocCompressed);
- }
- else {
+ } else {
m_compressedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>();
}
CompressScores();
-
+
std::cerr << "Saving to " << m_outPath << std::endl;
Save();
std::cerr << "Done" << std::endl;
@@ -84,20 +83,20 @@ LexicalReorderingTableCreator::LexicalReorderingTableCreator(
}
void LexicalReorderingTableCreator::PrintInfo()
-{
+{
std::cerr << "Used options:" << std::endl;
std::cerr << "\tText reordering table will be read from: " << m_inPath << std::endl;
std::cerr << "\tOutput reordering table will be written to: " << m_outPath << std::endl;
std::cerr << "\tStep size for source landmark phrases: 2^" << m_orderBits << "=" << (1ul << m_orderBits) << std::endl;
std::cerr << "\tPhrase fingerprint size: " << m_fingerPrintBits << " bits / P(fp)=" << (float(1)/(1ul << m_fingerPrintBits)) << std::endl;
- std::cerr << "\tSingle Huffman code set for score components: " << (m_multipleScoreTrees ? "no" : "yes") << std::endl;
+ std::cerr << "\tSingle Huffman code set for score components: " << (m_multipleScoreTrees ? "no" : "yes") << std::endl;
std::cerr << "\tUsing score quantization: ";
if(m_quantize)
std::cerr << m_quantize << " best" << std::endl;
else
std::cerr << "no" << std::endl;
-
-#ifdef WITH_THREADS
+
+#ifdef WITH_THREADS
std::cerr << "\tRunning with " << m_threads << " threads" << std::endl;
#endif
std::cerr << std::endl;
@@ -109,7 +108,7 @@ LexicalReorderingTableCreator::~LexicalReorderingTableCreator()
delete m_scoreTrees[i];
delete m_scoreCounters[i];
}
-
+
delete m_encodedScores;
delete m_compressedScores;
}
@@ -121,9 +120,8 @@ void LexicalReorderingTableCreator::EncodeScores()
#ifdef WITH_THREADS
boost::thread_group threads;
- for (size_t i = 0; i < m_threads; ++i)
- {
- EncodingTaskReordering* et = new EncodingTaskReordering(inFile, *this);
+ for (size_t i = 0; i < m_threads; ++i) {
+ EncodingTaskReordering* et = new EncodingTaskReordering(inFile, *this);
threads.create_thread(*et);
}
threads.join_all();
@@ -136,17 +134,16 @@ void LexicalReorderingTableCreator::EncodeScores()
}
void LexicalReorderingTableCreator::CalcHuffmanCodes()
-{
+{
std::vector<ScoreTree*>::iterator treeIt = m_scoreTrees.begin();
for(std::vector<ScoreCounter*>::iterator it = m_scoreCounters.begin();
- it != m_scoreCounters.end(); it++)
- {
+ it != m_scoreCounters.end(); it++) {
if(m_quantize)
- (*it)->Quantize(m_quantize);
-
+ (*it)->Quantize(m_quantize);
+
std::cerr << "\tCreating Huffman codes for " << (*it)->Size()
- << " scores" << std::endl;
-
+ << " scores" << std::endl;
+
*treeIt = new ScoreTree((*it)->Begin(), (*it)->End());
treeIt++;
}
@@ -158,7 +155,7 @@ void LexicalReorderingTableCreator::CompressScores()
#ifdef WITH_THREADS
boost::thread_group threads;
for (size_t i = 0; i < m_threads; ++i) {
- CompressionTaskReordering* ct = new CompressionTaskReordering(*m_encodedScores, *this);
+ CompressionTaskReordering* ct = new CompressionTaskReordering(*m_encodedScores, *this);
threads.create_thread(*ct);
}
threads.join_all();
@@ -171,12 +168,12 @@ void LexicalReorderingTableCreator::CompressScores()
}
void LexicalReorderingTableCreator::Save()
-{
+{
ThrowingFwrite(&m_numScoreComponent, sizeof(m_numScoreComponent), 1, m_outFile);
ThrowingFwrite(&m_multipleScoreTrees, sizeof(m_multipleScoreTrees), 1, m_outFile);
for(size_t i = 0; i < m_scoreTrees.size(); i++)
m_scoreTrees[i]->Save(m_outFile);
-
+
m_compressedScores->save(m_outFile);
}
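
Save() above fixes the on-disk order: hash index (written incrementally via BeginSave), then the component count, the multiple-trees flag, the score trees, and finally the compressed scores. A small sketch of writing and reading the two header fields in that order; the field names come from the diff, everything else is illustrative, and the raw in-memory writes are exactly why the format is not portable across ABIs:

#include <cstddef>
#include <cstdio>

int main() {
  size_t numScoreComponent = 6;
  bool multipleScoreTrees = true;

  // Write the header fields in the same order as the ThrowingFwrite
  // calls in Save() above.
  std::FILE* out = std::fopen("lexreorder.demo", "wb");
  std::fwrite(&numScoreComponent, sizeof(numScoreComponent), 1, out);
  std::fwrite(&multipleScoreTrees, sizeof(multipleScoreTrees), 1, out);
  // ... the score trees and the compressed-score StringVector follow.
  std::fclose(out);

  // A loader must consume the fields in the same order and sizes.
  std::FILE* in = std::fopen("lexreorder.demo", "rb");
  size_t n = 0; bool multi = false;
  std::fread(&n, sizeof(n), 1, in);
  std::fread(&multi, sizeof(multi), 1, in);
  std::fclose(in);
  std::printf("%zu components, multiple trees: %d\n", n, (int)multi);
  return 0;
}
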
@@ -192,38 +189,37 @@ std::string LexicalReorderingTableCreator::EncodeLine(std::vector<std::string>&
{
std::string scoresString = tokens.back();
std::stringstream scoresStream;
-
+
std::vector<float> scores;
Tokenize<float>(scores, scoresString);
-
+
if(!m_numScoreComponent) {
m_numScoreComponent = scores.size();
m_scoreCounters.resize(m_multipleScoreTrees ? m_numScoreComponent : 1);
for(std::vector<ScoreCounter*>::iterator it = m_scoreCounters.begin();
it != m_scoreCounters.end(); it++)
- *it = new ScoreCounter();
+ *it = new ScoreCounter();
m_scoreTrees.resize(m_multipleScoreTrees ? m_numScoreComponent : 1);
}
-
+
if(m_numScoreComponent != scores.size()) {
std::cerr << "Error: Wrong number of scores detected ("
- << scores.size() << " != " << m_numScoreComponent << ") :" << std::endl;
+ << scores.size() << " != " << m_numScoreComponent << ") :" << std::endl;
std::cerr << "Line: " << tokens[0] << " ||| ... ||| " << scoresString << std::endl;
- abort();
+ abort();
}
-
+
size_t c = 0;
float score;
- while(c < m_numScoreComponent)
- {
+ while(c < m_numScoreComponent) {
score = scores[c];
score = FloorScore(TransformScore(score));
scoresStream.write((char*)&score, sizeof(score));
-
+
m_scoreCounters[m_multipleScoreTrees ? c : 0]->Increase(score);
c++;
}
-
+
return scoresStream.str();
}
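
EncodeLine above converts each textual score and packs the raw float bytes into a string for later Huffman coding. A sketch of that per-score step, assuming TransformScore is the usual Moses log() and FloorScore clips at a lowest permissible log score (both assumptions; the real functions live elsewhere in the tree):

#include <algorithm>
#include <cmath>
#include <iostream>
#include <sstream>
#include <vector>

static const float LOWEST_SCORE = -100.0f;  // assumed floor value

static float TransformScoreDemo(float prob) { return std::log(prob); }
static float FloorScoreDemo(float logScore) {
  return std::max(logScore, LOWEST_SCORE);
}

int main() {
  std::vector<float> scores;
  scores.push_back(0.3f); scores.push_back(0.2f); scores.push_back(0.5f);

  std::stringstream packed;
  for (size_t c = 0; c < scores.size(); c++) {
    float t = FloorScoreDemo(TransformScoreDemo(scores[c]));
    // Raw in-memory float bytes, exactly what the compression pass
    // later reads back with stream.read((char*)&score, sizeof(score)).
    packed.write(reinterpret_cast<const char*>(&t), sizeof(t));
  }
  std::cout << packed.str().size() << " bytes encoded\n";
  return 0;
}
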
@@ -232,25 +228,23 @@ void LexicalReorderingTableCreator::AddEncodedLine(PackedItem& pi)
m_queue.push(pi);
}
-void LexicalReorderingTableCreator::FlushEncodedQueue(bool force) {
- if(force || m_queue.size() > 10000)
- {
- while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine())
- {
+void LexicalReorderingTableCreator::FlushEncodedQueue(bool force)
+{
+ if(force || m_queue.size() > 10000) {
+ while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) {
PackedItem pi = m_queue.top();
m_queue.pop();
m_lastFlushedLine++;
-
- m_lastRange.push_back(pi.GetSrc());
+
+ m_lastRange.push_back(pi.GetSrc());
m_encodedScores->push_back(pi.GetTrg());
-
+
if((pi.GetLine()+1) % 100000 == 0)
- std::cerr << ".";
+ std::cerr << ".";
if((pi.GetLine()+1) % 5000000 == 0)
- std::cerr << "[" << (pi.GetLine()+1) << "]" << std::endl;
-
- if(m_lastRange.size() == (1ul << m_orderBits))
- {
+ std::cerr << "[" << (pi.GetLine()+1) << "]" << std::endl;
+
+ if(m_lastRange.size() == (1ul << m_orderBits)) {
m_hash.AddRange(m_lastRange);
m_hash.SaveLastRange();
m_hash.DropLastRange();
@@ -258,14 +252,13 @@ void LexicalReorderingTableCreator::FlushEncodedQueue(bool force) {
}
}
}
-
- if(force)
- {
+
+ if(force) {
m_lastFlushedLine = -1;
m_hash.AddRange(m_lastRange);
m_lastRange.clear();
-
+
#ifdef WITH_THREADS
m_hash.WaitAll();
#endif
@@ -278,56 +271,55 @@ void LexicalReorderingTableCreator::FlushEncodedQueue(bool force) {
}
}
-std::string LexicalReorderingTableCreator::CompressEncodedScores(std::string &encodedScores) {
+std::string LexicalReorderingTableCreator::CompressEncodedScores(std::string &encodedScores)
+{
std::stringstream encodedScoresStream(encodedScores);
encodedScoresStream.unsetf(std::ios::skipws);
-
+
std::string compressedScores;
BitWrapper<> compressedScoresStream(compressedScores);
-
+
size_t currScore = 0;
float score;
encodedScoresStream.read((char*) &score, sizeof(score));
-
+
while(encodedScoresStream) {
size_t index = currScore % m_scoreTrees.size();
-
+
if(m_quantize)
score = m_scoreCounters[index]->LowerBound(score);
-
+
m_scoreTrees[index]->Put(compressedScoresStream, score);
encodedScoresStream.read((char*) &score, sizeof(score));
currScore++;
}
-
+
return compressedScores;
}
-void LexicalReorderingTableCreator::AddCompressedScores(PackedItem& pi) {
- m_queue.push(pi);
+void LexicalReorderingTableCreator::AddCompressedScores(PackedItem& pi)
+{
+ m_queue.push(pi);
}
void LexicalReorderingTableCreator::FlushCompressedQueue(bool force)
-{
- if(force || m_queue.size() > 10000)
- {
- while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine())
- {
+{
+ if(force || m_queue.size() > 10000) {
+ while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) {
PackedItem pi = m_queue.top();
m_queue.pop();
m_lastFlushedLine++;
-
+
m_compressedScores->push_back(pi.GetTrg());
-
+
if((pi.GetLine()+1) % 100000 == 0)
- std::cerr << ".";
+ std::cerr << ".";
if((pi.GetLine()+1) % 5000000 == 0)
- std::cerr << "[" << (pi.GetLine()+1) << "]" << std::endl;
+ std::cerr << "[" << (pi.GetLine()+1) << "]" << std::endl;
}
}
-
- if(force)
- {
+
+ if(force) {
m_lastFlushedLine = -1;
std::cerr << std::endl << std::endl;
}
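
The two Flush*Queue methods above share one pattern: worker threads push (lineNumber, payload) items in arbitrary order, and the flusher pops only while the item on top of the priority queue is the next line expected, so output stays in input order. A self-contained sketch of that pattern with a simplified PackedItem:

#include <iostream>
#include <queue>
#include <string>

struct Item {
  long line;
  std::string payload;
  // std::priority_queue is a max-heap, so "greater line = lower
  // priority" puts the smallest line number on top, matching the
  // m_lastFlushedLine + 1 == m_queue.top().GetLine() check above.
  bool operator<(const Item& other) const { return line > other.line; }
};

int main() {
  std::priority_queue<Item> queue;
  long lastFlushed = -1;

  // Simulate out-of-order arrival from several encoder threads.
  long arrival[] = {2, 0, 3, 1, 4};
  for (int i = 0; i < 5; i++) {
    Item it = {arrival[i], "line " + std::to_string(arrival[i])};
    queue.push(it);
  }

  // Flush strictly in line order.
  while (!queue.empty() && lastFlushed + 1 == queue.top().line) {
    std::cout << "flush " << queue.top().payload << "\n";
    queue.pop();
    ++lastFlushed;
  }
  return 0;
}
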
@@ -343,63 +335,61 @@ boost::mutex EncodingTaskReordering::m_fileMutex;
EncodingTaskReordering::EncodingTaskReordering(InputFileStream& inFile, LexicalReorderingTableCreator& creator)
: m_inFile(inFile), m_creator(creator) {}
-
+
void EncodingTaskReordering::operator()()
{
size_t lineNum = 0;
-
+
std::vector<std::string> lines;
size_t max_lines = 1000;
lines.reserve(max_lines);
-
+
{
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_fileMutex);
#endif
std::string line;
while(lines.size() < max_lines && std::getline(m_inFile, line))
- lines.push_back(line);
+ lines.push_back(line);
lineNum = m_lineNum;
m_lineNum += lines.size();
}
-
+
std::vector<PackedItem> result;
result.reserve(max_lines);
-
- while(lines.size())
- {
- for(size_t i = 0; i < lines.size(); i++)
- {
+
+ while(lines.size()) {
+ for(size_t i = 0; i < lines.size(); i++) {
std::vector<std::string> tokens;
Moses::TokenizeMultiCharSeparator(tokens, lines[i], m_creator.m_separator);
-
+
std::string encodedLine = m_creator.EncodeLine(tokens);
-
+
std::string f = tokens[0];
-
+
std::string e;
if(tokens.size() > 2)
e = tokens[1];
-
+
PackedItem packedItem(lineNum + i, m_creator.MakeSourceTargetKey(f, e),
encodedLine, i);
result.push_back(packedItem);
}
lines.clear();
-
+
{
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_mutex);
#endif
- for(size_t i = 0; i < result.size(); i++)
+ for(size_t i = 0; i < result.size(); i++)
m_creator.AddEncodedLine(result[i]);
- m_creator.FlushEncodedQueue();
+ m_creator.FlushEncodedQueue();
}
-
+
result.clear();
lines.reserve(max_lines);
result.reserve(max_lines);
-
+
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_fileMutex);
#endif
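
The encoding task above batches its work: each thread grabs up to max_lines lines under a file mutex, encodes them without holding any lock, then publishes results under a second mutex. A condensed sketch of the same two-mutex batching with std::thread (the original uses boost::thread; the tiny batch size is only so the demo interleaves):

#include <iostream>
#include <mutex>
#include <sstream>
#include <string>
#include <thread>
#include <vector>

std::mutex fileMutex, resultMutex;
std::istringstream input("a\nb\nc\nd\ne\nf\n");
std::vector<std::string> results;

void worker() {
  for (;;) {
    std::vector<std::string> batch;
    {
      std::lock_guard<std::mutex> lock(fileMutex);  // like m_fileMutex
      std::string line;
      while (batch.size() < 2 && std::getline(input, line))
        batch.push_back(line);
    }
    if (batch.empty()) return;
    // "Encode" without locks; the real code builds PackedItems here.
    for (size_t i = 0; i < batch.size(); i++) batch[i] += "-encoded";
    std::lock_guard<std::mutex> lock(resultMutex);  // like m_mutex
    results.insert(results.end(), batch.begin(), batch.end());
  }
}

int main() {
  std::thread t1(worker), t2(worker);
  t1.join();
  t2.join();
  for (size_t i = 0; i < results.size(); i++)
    std::cout << results[i] << "\n";
  return 0;
}
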
@@ -419,11 +409,11 @@ boost::mutex CompressionTaskReordering::m_mutex;
#endif
CompressionTaskReordering::CompressionTaskReordering(StringVector<unsigned char, unsigned long,
- MmapAllocator>& encodedScores,
- LexicalReorderingTableCreator& creator)
+ MmapAllocator>& encodedScores,
+ LexicalReorderingTableCreator& creator)
: m_encodedScores(encodedScores), m_creator(creator)
{ }
-
+
void CompressionTaskReordering::operator()()
{
size_t scoresNum;
@@ -434,12 +424,11 @@ void CompressionTaskReordering::operator()()
scoresNum = m_scoresNum;
m_scoresNum++;
}
-
- while(scoresNum < m_encodedScores.size())
- {
+
+ while(scoresNum < m_encodedScores.size()) {
std::string scores = m_encodedScores[scoresNum];
std::string compressedScores
- = m_creator.CompressEncodedScores(scores);
+ = m_creator.CompressEncodedScores(scores);
std::string dummy;
PackedItem packedItem(scoresNum, dummy, compressedScores, 0);
@@ -449,9 +438,9 @@ void CompressionTaskReordering::operator()()
#endif
m_creator.AddCompressedScores(packedItem);
m_creator.FlushCompressedQueue();
-
- scoresNum = m_scoresNum;
- m_scoresNum++;
+
+ scoresNum = m_scoresNum;
+ m_scoresNum++;
}
}
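
CompressEncodedScores above walks the raw float bytes and selects a Huffman tree per score component via currScore % m_scoreTrees.size(). A sketch of that round-robin dispatch, with stand-in counters instead of real Huffman trees:

#include <iostream>
#include <sstream>
#include <vector>

int main() {
  // Pretend-encoded blob: 6 floats = 2 rows of 3 score components.
  std::vector<float> raw;
  for (int i = 1; i <= 6; i++) raw.push_back(0.1f * i);
  std::stringstream blob;
  blob.write(reinterpret_cast<const char*>(&raw[0]),
             raw.size() * sizeof(float));
  blob.unsetf(std::ios::skipws);  // mirrors the stream setup above

  const size_t numTrees = 3;  // one Huffman tree per score component
  std::vector<size_t> perTree(numTrees, 0);

  size_t currScore = 0;
  float score;
  blob.read(reinterpret_cast<char*>(&score), sizeof(score));
  while (blob) {
    size_t index = currScore % numTrees;  // which tree handles this score
    perTree[index]++;  // real code: m_scoreTrees[index]->Put(bits, score)
    blob.read(reinterpret_cast<char*>(&score), sizeof(score));
    currScore++;
  }
  for (size_t i = 0; i < numTrees; i++)
    std::cout << "tree " << i << ": " << perTree[i] << " scores\n";
  return 0;
}
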
diff --git a/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h b/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h
index 2e202ce9b..1bf8444fe 100644
--- a/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h
+++ b/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h
@@ -1,139 +1,141 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#ifndef moses_LexicalReorderingTableCreator_h
#define moses_LexicalReorderingTableCreator_h
#include "PhraseTableCreator.h"
-namespace Moses {
-
-class LexicalReorderingTableCreator {
- private:
- std::string m_inPath;
- std::string m_outPath;
- std::string m_tempfilePath;
-
- std::FILE* m_outFile;
-
- size_t m_orderBits;
- size_t m_fingerPrintBits;
-
- size_t m_numScoreComponent;
-
- bool m_multipleScoreTrees;
- bool m_quantize;
-
- std::string m_separator;
-
- BlockHashIndex m_hash;
-
- typedef Counter<float> ScoreCounter;
- typedef CanonicalHuffman<float> ScoreTree;
-
- std::vector<ScoreCounter*> m_scoreCounters;
- std::vector<ScoreTree*> m_scoreTrees;
-
- StringVector<unsigned char, unsigned long, MmapAllocator>* m_encodedScores;
- StringVector<unsigned char, unsigned long, MmapAllocator>* m_compressedScores;
-
- std::priority_queue<PackedItem> m_queue;
- long m_lastFlushedLine;
- long m_lastFlushedSourceNum;
- std::string m_lastFlushedSourcePhrase;
- std::vector<std::string> m_lastRange;
-
-#ifdef WITH_THREADS
- size_t m_threads;
+namespace Moses
+{
+
+class LexicalReorderingTableCreator
+{
+private:
+ std::string m_inPath;
+ std::string m_outPath;
+ std::string m_tempfilePath;
+
+ std::FILE* m_outFile;
+
+ size_t m_orderBits;
+ size_t m_fingerPrintBits;
+
+ size_t m_numScoreComponent;
+
+ bool m_multipleScoreTrees;
+ bool m_quantize;
+
+ std::string m_separator;
+
+ BlockHashIndex m_hash;
+
+ typedef Counter<float> ScoreCounter;
+ typedef CanonicalHuffman<float> ScoreTree;
+
+ std::vector<ScoreCounter*> m_scoreCounters;
+ std::vector<ScoreTree*> m_scoreTrees;
+
+ StringVector<unsigned char, unsigned long, MmapAllocator>* m_encodedScores;
+ StringVector<unsigned char, unsigned long, MmapAllocator>* m_compressedScores;
+
+ std::priority_queue<PackedItem> m_queue;
+ long m_lastFlushedLine;
+ long m_lastFlushedSourceNum;
+ std::string m_lastFlushedSourcePhrase;
+ std::vector<std::string> m_lastRange;
+
+#ifdef WITH_THREADS
+ size_t m_threads;
#endif
-
- void PrintInfo();
-
- void EncodeScores();
- void CalcHuffmanCodes();
- void CompressScores();
- void Save();
-
- std::string MakeSourceTargetKey(std::string&, std::string&);
-
- std::string EncodeLine(std::vector<std::string>& tokens);
- void AddEncodedLine(PackedItem& pi);
- void FlushEncodedQueue(bool force = false);
-
- std::string CompressEncodedScores(std::string &encodedScores);
- void AddCompressedScores(PackedItem& pi);
- void FlushCompressedQueue(bool force = false);
-
- public:
- LexicalReorderingTableCreator(std::string inPath,
- std::string outPath,
- std::string tempfilePath,
- size_t orderBits = 10,
- size_t fingerPrintBits = 16,
- bool multipleScoreTrees = true,
- size_t quantize = 0
+
+ void PrintInfo();
+
+ void EncodeScores();
+ void CalcHuffmanCodes();
+ void CompressScores();
+ void Save();
+
+ std::string MakeSourceTargetKey(std::string&, std::string&);
+
+ std::string EncodeLine(std::vector<std::string>& tokens);
+ void AddEncodedLine(PackedItem& pi);
+ void FlushEncodedQueue(bool force = false);
+
+ std::string CompressEncodedScores(std::string &encodedScores);
+ void AddCompressedScores(PackedItem& pi);
+ void FlushCompressedQueue(bool force = false);
+
+public:
+ LexicalReorderingTableCreator(std::string inPath,
+ std::string outPath,
+ std::string tempfilePath,
+ size_t orderBits = 10,
+ size_t fingerPrintBits = 16,
+ bool multipleScoreTrees = true,
+ size_t quantize = 0
#ifdef WITH_THREADS
- , size_t threads = 2
-#endif
- );
-
- ~LexicalReorderingTableCreator();
-
+ , size_t threads = 2
+#endif
+ );
+
+ ~LexicalReorderingTableCreator();
+
friend class EncodingTaskReordering;
friend class CompressionTaskReordering;
};
class EncodingTaskReordering
{
- private:
+private:
#ifdef WITH_THREADS
- static boost::mutex m_mutex;
- static boost::mutex m_fileMutex;
+ static boost::mutex m_mutex;
+ static boost::mutex m_fileMutex;
#endif
- static size_t m_lineNum;
- static size_t m_sourcePhraseNum;
- static std::string m_lastSourcePhrase;
-
- InputFileStream& m_inFile;
- LexicalReorderingTableCreator& m_creator;
-
- public:
- EncodingTaskReordering(InputFileStream& inFile, LexicalReorderingTableCreator& creator);
- void operator()();
+ static size_t m_lineNum;
+ static size_t m_sourcePhraseNum;
+ static std::string m_lastSourcePhrase;
+
+ InputFileStream& m_inFile;
+ LexicalReorderingTableCreator& m_creator;
+
+public:
+ EncodingTaskReordering(InputFileStream& inFile, LexicalReorderingTableCreator& creator);
+ void operator()();
};
class CompressionTaskReordering
{
- private:
+private:
#ifdef WITH_THREADS
- static boost::mutex m_mutex;
+ static boost::mutex m_mutex;
#endif
- static size_t m_scoresNum;
- StringVector<unsigned char, unsigned long, MmapAllocator> &m_encodedScores;
- LexicalReorderingTableCreator &m_creator;
-
- public:
- CompressionTaskReordering(StringVector<unsigned char, unsigned long, MmapAllocator>&
- m_encodedScores, LexicalReorderingTableCreator& creator);
- void operator()();
+ static size_t m_scoresNum;
+ StringVector<unsigned char, unsigned long, MmapAllocator> &m_encodedScores;
+ LexicalReorderingTableCreator &m_creator;
+
+public:
+ CompressionTaskReordering(StringVector<unsigned char, unsigned long, MmapAllocator>&
+ m_encodedScores, LexicalReorderingTableCreator& creator);
+ void operator()();
};
}
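
A hypothetical driver for the creator class declared above; the constructor signature and defaults are taken from the header, but the file names and parameter values are made up, and this only compiles inside the Moses tree:

#include "moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h"

int main() {
  // Defaults from the header: source landmark every 2^10 phrases,
  // 16-bit fingerprints, one Huffman code set per score component,
  // no quantization.
  Moses::LexicalReorderingTableCreator creator(
      "reordering-table",          // text reordering table (input)
      "reordering-table.minlexr",  // compact binary table (output)
      "/tmp",                      // scratch space for mmap'd temp files
      10, 16,
      true,                        // multipleScoreTrees
      0                            // quantize: keep all distinct scores
#ifdef WITH_THREADS
      , 4                          // worker threads
#endif
  );
  return 0;  // all work happens in the constructor, as the .cpp shows
}
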
diff --git a/moses/TranslationModel/CompactPT/ListCoders.h b/moses/TranslationModel/CompactPT/ListCoders.h
index 329e1297a..b41e183ce 100644
--- a/moses/TranslationModel/CompactPT/ListCoders.h
+++ b/moses/TranslationModel/CompactPT/ListCoders.h
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#ifndef moses_ListCoders_h
#define moses_ListCoders_h
@@ -31,94 +31,86 @@ namespace Moses
template <typename T = unsigned int>
class VarIntType
{
- private:
- template <typename IntType, typename OutIt>
- static void EncodeSymbol(IntType input, OutIt output)
- {
- if(input == 0)
- {
- *output = 0;
- output++;
- return;
- }
-
- T msb = 1 << (sizeof(T)*8-1);
- IntType mask = ~msb;
- IntType shift = (sizeof(T)*8-1);
-
- while(input)
- {
- T res = input & mask;
- input >>= shift;
- if(input)
- res |= msb;
- *output = res;
- output++;
- }
- };
-
- template <typename InIt, typename IntType>
- static void DecodeSymbol(InIt &it, InIt end, IntType &output)
- {
- T msb = 1 << (sizeof(T)*8-1);
- IntType shift = (sizeof(T)*8-1);
-
- output = 0;
- size_t i = 0;
- while(it != end && *it & msb) {
- IntType temp = *it & ~msb;
- temp <<= shift*i;
- output |= temp;
- it++; i++;
- }
- assert(it != end);
-
- IntType temp = *it;
+private:
+ template <typename IntType, typename OutIt>
+ static void EncodeSymbol(IntType input, OutIt output) {
+ if(input == 0) {
+ *output = 0;
+ output++;
+ return;
+ }
+
+ T msb = 1 << (sizeof(T)*8-1);
+ IntType mask = ~msb;
+ IntType shift = (sizeof(T)*8-1);
+
+ while(input) {
+ T res = input & mask;
+ input >>= shift;
+ if(input)
+ res |= msb;
+ *output = res;
+ output++;
+ }
+ };
+
+ template <typename InIt, typename IntType>
+ static void DecodeSymbol(InIt &it, InIt end, IntType &output) {
+ T msb = 1 << (sizeof(T)*8-1);
+ IntType shift = (sizeof(T)*8-1);
+
+ output = 0;
+ size_t i = 0;
+ while(it != end && *it & msb) {
+ IntType temp = *it & ~msb;
temp <<= shift*i;
output |= temp;
it++;
+ i++;
}
+ assert(it != end);
- public:
-
- template <typename InIt, typename OutIt>
- static void Encode(InIt it, InIt end, OutIt outIt)
- {
- while(it != end)
- {
- EncodeSymbol(*it, outIt);
- it++;
- }
+ IntType temp = *it;
+ temp <<= shift*i;
+ output |= temp;
+ it++;
+ }
+
+public:
+
+ template <typename InIt, typename OutIt>
+ static void Encode(InIt it, InIt end, OutIt outIt) {
+ while(it != end) {
+ EncodeSymbol(*it, outIt);
+ it++;
}
-
- template <typename InIt, typename OutIt>
- static void Decode(InIt &it, InIt end, OutIt outIt)
- {
- while(it != end)
- {
- size_t output;
- DecodeSymbol(it, end, output);
- *outIt = output;
- outIt++;
- }
+ }
+
+ template <typename InIt, typename OutIt>
+ static void Decode(InIt &it, InIt end, OutIt outIt) {
+ while(it != end) {
+ size_t output;
+ DecodeSymbol(it, end, output);
+ *outIt = output;
+ outIt++;
}
-
- template <typename InIt>
- static size_t DecodeAndSum(InIt &it, InIt end, size_t num)
- {
- size_t sum = 0;
- size_t curr = 0;
-
- while(it != end && curr < num)
- {
- size_t output;
- DecodeSymbol(it, end, output);
- sum += output; curr++;
- }
-
- return sum;
+ }
+
+ template <typename InIt>
+ static size_t DecodeAndSum(InIt &it, InIt end, size_t num) {
+ size_t sum = 0;
+ size_t curr = 0;
+
+ while(it != end && curr < num) {
+ size_t output;
+ DecodeSymbol(it, end, output);
+ sum += output;
+ curr++;
}
+ return sum;
+ }
+
};
typedef VarIntType<unsigned char> VarByte;
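
A standalone sketch of the VarByte instantiation above: each byte carries 7 payload bits and the high bit flags "more bytes follow", exactly the msb/mask scheme in EncodeSymbol/DecodeSymbol (the template generalizes this to wider T; the byte case is the common one):

#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

static void EncodeVarByte(uint64_t v, std::vector<uint8_t>& out) {
  if (v == 0) { out.push_back(0); return; }
  while (v) {
    uint8_t b = v & 0x7F;  // low 7 bits
    v >>= 7;
    if (v) b |= 0x80;      // continuation bit, the template's msb
    out.push_back(b);
  }
}

static uint64_t DecodeVarByte(const std::vector<uint8_t>& in, size_t& pos) {
  uint64_t v = 0;
  int shift = 0;
  while (in[pos] & 0x80) {  // continuation set: consume and keep going
    v |= uint64_t(in[pos] & 0x7F) << shift;
    shift += 7;
    pos++;
  }
  v |= uint64_t(in[pos]) << shift;  // final byte has the msb clear
  pos++;
  return v;
}

int main() {
  std::vector<uint8_t> buf;
  EncodeVarByte(300, buf);  // 300 needs two bytes
  size_t pos = 0;
  uint64_t back = DecodeVarByte(buf, pos);
  assert(back == 300);
  std::cout << buf.size() << " bytes for 300, decoded " << back << "\n";
  return 0;
}
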
@@ -129,179 +121,262 @@ typedef VarIntType<unsigned int> VarInt32;
class Simple9
{
- private:
- typedef unsigned int uint;
-
- template <typename InIt>
- inline static void EncodeSymbol(uint &output, InIt it, InIt end)
- {
- uint length = end - it;
-
- uint type = 0;
- uint bitlength = 0;
-
- switch(length)
- {
- case 1: type = 1; bitlength = 28; break;
- case 2: type = 2; bitlength = 14; break;
- case 3: type = 3; bitlength = 9; break;
- case 4: type = 4; bitlength = 7; break;
- case 5: type = 5; bitlength = 5; break;
- case 7: type = 6; bitlength = 4; break;
- case 9: type = 7; bitlength = 3; break;
- case 14: type = 8; bitlength = 2; break;
- case 28: type = 9; bitlength = 1; break;
- }
-
- output = 0;
- output |= (type << 28);
-
- uint i = 0;
- while(it != end)
- {
- uint l = bitlength * (length-i-1);
- output |= *it << l;
- it++;
- i++;
- }
+private:
+ typedef unsigned int uint;
+
+ template <typename InIt>
+ inline static void EncodeSymbol(uint &output, InIt it, InIt end) {
+ uint length = end - it;
+
+ uint type = 0;
+ uint bitlength = 0;
+
+ switch(length) {
+ case 1:
+ type = 1;
+ bitlength = 28;
+ break;
+ case 2:
+ type = 2;
+ bitlength = 14;
+ break;
+ case 3:
+ type = 3;
+ bitlength = 9;
+ break;
+ case 4:
+ type = 4;
+ bitlength = 7;
+ break;
+ case 5:
+ type = 5;
+ bitlength = 5;
+ break;
+ case 7:
+ type = 6;
+ bitlength = 4;
+ break;
+ case 9:
+ type = 7;
+ bitlength = 3;
+ break;
+ case 14:
+ type = 8;
+ bitlength = 2;
+ break;
+ case 28:
+ type = 9;
+ bitlength = 1;
+ break;
}
-
- template <typename OutIt>
- static inline void DecodeSymbol(uint input, OutIt outIt)
- {
- uint type = (input >> 28);
-
- uint bitlen = 0;
- uint shift = 0;
- uint mask = 0;
-
- switch(type)
- {
- case 1: bitlen = 28; shift = 0; mask = 268435455; break;
- case 2: bitlen = 14; shift = 14; mask = 16383; break;
- case 3: bitlen = 9; shift = 18; mask = 511; break;
- case 4: bitlen = 7; shift = 21; mask = 127; break;
- case 5: bitlen = 5; shift = 20; mask = 31; break;
- case 6: bitlen = 4; shift = 24; mask = 15; break;
- case 7: bitlen = 3; shift = 24; mask = 7; break;
- case 8: bitlen = 2; shift = 26; mask = 3; break;
- case 9: bitlen = 1; shift = 27; mask = 1; break;
- }
-
- while(shift > 0)
- {
- *outIt = (input >> shift) & mask;
- shift -= bitlen;
- outIt++;
- }
- *outIt = input & mask;
- outIt++;
+
+ output = 0;
+ output |= (type << 28);
+
+ uint i = 0;
+ while(it != end) {
+ uint l = bitlength * (length-i-1);
+ output |= *it << l;
+ it++;
+ i++;
}
-
- static inline size_t DecodeAndSumSymbol(uint input, size_t num, size_t &curr)
- {
- uint type = (input >> 28);
-
- uint bitlen = 0;
- uint shift = 0;
- uint mask = 0;
-
- switch(type)
- {
- case 1: bitlen = 28; shift = 0; mask = 268435455; break;
- case 2: bitlen = 14; shift = 14; mask = 16383; break;
- case 3: bitlen = 9; shift = 18; mask = 511; break;
- case 4: bitlen = 7; shift = 21; mask = 127; break;
- case 5: bitlen = 5; shift = 20; mask = 31; break;
- case 6: bitlen = 4; shift = 24; mask = 15; break;
- case 7: bitlen = 3; shift = 24; mask = 7; break;
- case 8: bitlen = 2; shift = 26; mask = 3; break;
- case 9: bitlen = 1; shift = 27; mask = 1; break;
- }
+ }
- size_t sum = 0;
- while(shift > 0)
- {
- sum += (input >> shift) & mask;
- shift -= bitlen;
- if(++curr == num)
- return sum;
- }
- sum += input & mask;
- curr++;
- return sum;
+ template <typename OutIt>
+ static inline void DecodeSymbol(uint input, OutIt outIt) {
+ uint type = (input >> 28);
+
+ uint bitlen = 0;
+ uint shift = 0;
+ uint mask = 0;
+
+ switch(type) {
+ case 1:
+ bitlen = 28;
+ shift = 0;
+ mask = 268435455;
+ break;
+ case 2:
+ bitlen = 14;
+ shift = 14;
+ mask = 16383;
+ break;
+ case 3:
+ bitlen = 9;
+ shift = 18;
+ mask = 511;
+ break;
+ case 4:
+ bitlen = 7;
+ shift = 21;
+ mask = 127;
+ break;
+ case 5:
+ bitlen = 5;
+ shift = 20;
+ mask = 31;
+ break;
+ case 6:
+ bitlen = 4;
+ shift = 24;
+ mask = 15;
+ break;
+ case 7:
+ bitlen = 3;
+ shift = 24;
+ mask = 7;
+ break;
+ case 8:
+ bitlen = 2;
+ shift = 26;
+ mask = 3;
+ break;
+ case 9:
+ bitlen = 1;
+ shift = 27;
+ mask = 1;
+ break;
}
-
- public:
- template <typename InIt, typename OutIt>
- static void Encode(InIt it, InIt end, OutIt outIt)
- {
- uint parts[] = { 1, 2, 3, 4, 5, 7, 9, 14, 28 };
-
- uint buffer[28];
- for(InIt i = it; i < end; i++)
- {
- uint lastbit = 1;
- uint lastpos = 0;
- uint lastyes = 0;
- uint j = 0;
-
- double log2 = log(2);
- while(j < 9 && lastpos < 28 && (i+lastpos) < end)
- {
- if(lastpos >= parts[j])
- j++;
-
- buffer[lastpos] = *(i + lastpos);
-
- uint reqbit = ceil(log(buffer[lastpos]+1)/log2);
- assert(reqbit <= 28);
-
- uint bit = 28/floor(28/reqbit);
- if(lastbit < bit)
- lastbit = bit;
-
- if(parts[j] > 28/lastbit)
- break;
- else if(lastpos == parts[j]-1)
- lastyes = lastpos;
-
- lastpos++;
- }
- i += lastyes;
-
- uint length = lastyes + 1;
- uint output;
- EncodeSymbol(output, buffer, buffer + length);
-
- *outIt = output;
- outIt++;
- }
+
+ while(shift > 0) {
+ *outIt = (input >> shift) & mask;
+ shift -= bitlen;
+ outIt++;
}
-
- template <typename InIt, typename OutIt>
- static void Decode(InIt &it, InIt end, OutIt outIt)
- {
- while(it != end)
- {
- DecodeSymbol(*it, outIt);
- it++;
- }
+ *outIt = input & mask;
+ outIt++;
+ }
+
+ static inline size_t DecodeAndSumSymbol(uint input, size_t num, size_t &curr) {
+ uint type = (input >> 28);
+
+ uint bitlen = 0;
+ uint shift = 0;
+ uint mask = 0;
+
+ switch(type) {
+ case 1:
+ bitlen = 28;
+ shift = 0;
+ mask = 268435455;
+ break;
+ case 2:
+ bitlen = 14;
+ shift = 14;
+ mask = 16383;
+ break;
+ case 3:
+ bitlen = 9;
+ shift = 18;
+ mask = 511;
+ break;
+ case 4:
+ bitlen = 7;
+ shift = 21;
+ mask = 127;
+ break;
+ case 5:
+ bitlen = 5;
+ shift = 20;
+ mask = 31;
+ break;
+ case 6:
+ bitlen = 4;
+ shift = 24;
+ mask = 15;
+ break;
+ case 7:
+ bitlen = 3;
+ shift = 24;
+ mask = 7;
+ break;
+ case 8:
+ bitlen = 2;
+ shift = 26;
+ mask = 3;
+ break;
+ case 9:
+ bitlen = 1;
+ shift = 27;
+ mask = 1;
+ break;
+ }
+
+ size_t sum = 0;
+ while(shift > 0) {
+ sum += (input >> shift) & mask;
+ shift -= bitlen;
+ if(++curr == num)
+ return sum;
}
-
- template <typename InIt>
- static size_t DecodeAndSum(InIt &it, InIt end, size_t num)
- {
- size_t sum = 0;
- size_t curr = 0;
- while(it != end && curr < num)
- {
- sum += DecodeAndSumSymbol(*it, num, curr);
- it++;
+ sum += input & mask;
+ curr++;
+ return sum;
+ }
+
+public:
+ template <typename InIt, typename OutIt>
+ static void Encode(InIt it, InIt end, OutIt outIt) {
+ uint parts[] = { 1, 2, 3, 4, 5, 7, 9, 14, 28 };
+
+ uint buffer[28];
+ for(InIt i = it; i < end; i++) {
+ uint lastbit = 1;
+ uint lastpos = 0;
+ uint lastyes = 0;
+ uint j = 0;
+
+ double log2 = log(2);
+ while(j < 9 && lastpos < 28 && (i+lastpos) < end) {
+ if(lastpos >= parts[j])
+ j++;
+
+ buffer[lastpos] = *(i + lastpos);
+
+ uint reqbit = ceil(log(buffer[lastpos]+1)/log2);
+ assert(reqbit <= 28);
+
+ uint bit = 28/floor(28/reqbit);
+ if(lastbit < bit)
+ lastbit = bit;
+
+ if(parts[j] > 28/lastbit)
+ break;
+ else if(lastpos == parts[j]-1)
+ lastyes = lastpos;
+
+ lastpos++;
}
- assert(curr == num);
- return sum;
+ i += lastyes;
+
+ uint length = lastyes + 1;
+ uint output;
+ EncodeSymbol(output, buffer, buffer + length);
+
+ *outIt = output;
+ outIt++;
+ }
+ }
+
+ template <typename InIt, typename OutIt>
+ static void Decode(InIt &it, InIt end, OutIt outIt) {
+ while(it != end) {
+ DecodeSymbol(*it, outIt);
+ it++;
+ }
+ }
+
+ template <typename InIt>
+ static size_t DecodeAndSum(InIt &it, InIt end, size_t num) {
+ size_t sum = 0;
+ size_t curr = 0;
+ while(it != end && curr < num) {
+ sum += DecodeAndSumSymbol(*it, num, curr);
+ it++;
}
+ assert(curr == num);
+ return sum;
+ }
};
}
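
Simple9, as implemented above, packs small integers into 32-bit words: a 4-bit selector says how the remaining 28 bits are split (28x1, 14x2, 9x3, 7x4, 5x5, 4x7, 3x9, 2x14 or 1x28 bits). A minimal sketch packing and unpacking one word in the 14x2-bit configuration, using the same shifts and masks as EncodeSymbol/DecodeSymbol:

#include <cassert>
#include <iostream>
#include <vector>

int main() {
  typedef unsigned int uint;
  // 14 values that each fit in 2 bits -> one word, selector/type 8.
  std::vector<uint> values(14);
  for (uint i = 0; i < 14; i++) values[i] = i % 4;

  const uint type = 8, bitlength = 2, length = 14;
  uint word = type << 28;
  for (uint i = 0; i < length; i++)  // as in EncodeSymbol above
    word |= values[i] << (bitlength * (length - i - 1));

  // Decode: type 8 -> bitlen 2, first shift 26, mask 3 (see DecodeSymbol).
  std::vector<uint> out;
  uint shift = 26, mask = 3;
  while (shift > 0) {
    out.push_back((word >> shift) & mask);
    shift -= bitlength;
  }
  out.push_back(word & mask);  // last value sits in the lowest bits

  assert(out == values);
  std::cout << "round-tripped " << out.size() << " values in one word\n";
  return 0;
}
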
diff --git a/moses/TranslationModel/CompactPT/MmapAllocator.h b/moses/TranslationModel/CompactPT/MmapAllocator.h
index 049c0149d..7cd6dd49e 100644
--- a/moses/TranslationModel/CompactPT/MmapAllocator.h
+++ b/moses/TranslationModel/CompactPT/MmapAllocator.h
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#ifndef moses_MmapAllocator_h
#define moses_MmapAllocator_h
@@ -30,175 +30,161 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
namespace Moses
{
- template <class T>
- class MmapAllocator
- {
- protected:
- std::FILE* m_file_ptr;
- size_t m_file_desc;
-
- size_t m_page_size;
- size_t m_map_size;
-
- char* m_data_ptr;
- size_t m_data_offset;
- bool m_fixed;
- size_t* m_count;
-
- public:
- typedef T value_type;
- typedef T* pointer;
- typedef const T* const_pointer;
- typedef T& reference;
- typedef const T& const_reference;
- typedef std::size_t size_type;
- typedef std::ptrdiff_t difference_type;
-
- MmapAllocator() throw()
- : m_file_ptr(std::tmpfile()), m_file_desc(fileno(m_file_ptr)),
- m_page_size(sysconf(_SC_PAGE_SIZE)), m_map_size(0), m_data_ptr(0),
- m_data_offset(0), m_fixed(false), m_count(new size_t(0))
- { }
-
- MmapAllocator(std::FILE* f_ptr) throw()
- : m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)),
- m_page_size(sysconf(_SC_PAGE_SIZE)), m_map_size(0), m_data_ptr(0),
- m_data_offset(0), m_fixed(false), m_count(new size_t(0))
- { }
-
- MmapAllocator(std::FILE* f_ptr, size_t data_offset) throw()
- : m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)),
- m_page_size(sysconf(_SC_PAGE_SIZE)), m_map_size(0), m_data_ptr(0),
- m_data_offset(data_offset), m_fixed(true), m_count(new size_t(0))
- { }
-
- MmapAllocator(std::string fileName) throw()
- : m_file_ptr(std::fopen(fileName.c_str(), "wb+")), m_file_desc(fileno(m_file_ptr)),
- m_page_size(sysconf(_SC_PAGE_SIZE)), m_map_size(0), m_data_ptr(0),
- m_data_offset(0), m_fixed(false), m_count(new size_t(0))
- { }
-
- MmapAllocator(const MmapAllocator& c) throw()
- : m_file_ptr(c.m_file_ptr), m_file_desc(c.m_file_desc),
- m_page_size(c.m_page_size), m_map_size(c.m_map_size),
- m_data_ptr(c.m_data_ptr), m_data_offset(c.m_data_offset),
- m_fixed(c.m_fixed), m_count(c.m_count)
- {
- (*m_count)++;
- }
-
- ~MmapAllocator() throw()
- {
- if(m_data_ptr && *m_count == 0)
- {
- munmap(m_data_ptr, m_map_size);
- if(!m_fixed && std::ftell(m_file_ptr) != -1)
- std::fclose(m_file_ptr);
- }
- (*m_count)--;
- }
-
- template <class U>
- struct rebind {
- typedef MmapAllocator<U> other;
- };
-
- pointer address (reference value) const
- {
- return &value;
- }
-
- const_pointer address (const_reference value) const
- {
- return &value;
- }
-
- size_type max_size () const throw()
- {
- return std::numeric_limits<size_t>::max() / sizeof(value_type);
- }
-
- pointer allocate (size_type num, const void* = 0)
- {
- m_map_size = num * sizeof(T);
-
- if(!m_fixed)
- {
- size_t read = 0;
- read += ftruncate(m_file_desc, m_map_size);
- m_data_ptr = (char*)mmap(0, m_map_size, PROT_READ|PROT_WRITE, MAP_SHARED,
- m_file_desc, 0);
- if(m_data_ptr == MAP_FAILED)
- std::cerr << "Error: mmapping" << std::endl;
- return (pointer)m_data_ptr;
- }
- else
- {
- size_t map_offset = (m_data_offset / m_page_size) * m_page_size;
- size_t relative_offset = m_data_offset - map_offset;
-
- size_t map_size = m_map_size + relative_offset;
-
- m_data_ptr = (char*)mmap(0, map_size, PROT_READ, MAP_SHARED,
- m_file_desc, map_offset);
-
- return (pointer)(m_data_ptr + relative_offset);
- }
- }
-
- void deallocate (pointer p, size_type num)
- {
- if(!m_fixed) {
- munmap(p, num * sizeof(T));
- }
- else {
- size_t map_offset = (m_data_offset / m_page_size) * m_page_size;
- size_t relative_offset = m_data_offset - map_offset;
- munmap((pointer)((char*)p - relative_offset), num * sizeof(T));
- }
-
- }
-
- void construct (pointer p, const T& value)
- {
- if(!m_fixed)
- new(p) value_type(value);
- }
- void destroy (pointer p)
- {
- if(!m_fixed)
- p->~T();
- }
-
- template <class T1, class T2>
- friend bool operator== (const MmapAllocator<T1>&, const MmapAllocator<T2>&) throw();
-
- template <class T1, class T2>
- friend bool operator!= (const MmapAllocator<T1>&, const MmapAllocator<T2>&) throw();
- };
-
- template <class T1, class T2>
- bool operator== (const MmapAllocator<T1>& a1,
- const MmapAllocator<T2>& a2) throw()
- {
- bool equal = true;
- equal &= a1.m_file_ptr == a2.m_file_ptr;
- equal &= a1.m_file_desc == a2.m_file_desc;
- equal &= a1.m_page_size == a2.m_page_size;
- equal &= a1.m_map_size == a2.m_map_size;
- equal &= a1.m_data_ptr == a2.m_data_ptr;
- equal &= a1.m_data_offset == a2.m_data_offset;
- equal &= a1.m_fixed == a2.m_fixed;
- return equal;
+template <class T>
+class MmapAllocator
+{
+protected:
+ std::FILE* m_file_ptr;
+ size_t m_file_desc;
+
+ size_t m_page_size;
+ size_t m_map_size;
+
+ char* m_data_ptr;
+ size_t m_data_offset;
+ bool m_fixed;
+ size_t* m_count;
+
+public:
+ typedef T value_type;
+ typedef T* pointer;
+ typedef const T* const_pointer;
+ typedef T& reference;
+ typedef const T& const_reference;
+ typedef std::size_t size_type;
+ typedef std::ptrdiff_t difference_type;
+
+ MmapAllocator() throw()
+ : m_file_ptr(std::tmpfile()), m_file_desc(fileno(m_file_ptr)),
+ m_page_size(sysconf(_SC_PAGE_SIZE)), m_map_size(0), m_data_ptr(0),
+ m_data_offset(0), m_fixed(false), m_count(new size_t(0))
+ { }
+
+ MmapAllocator(std::FILE* f_ptr) throw()
+ : m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)),
+ m_page_size(sysconf(_SC_PAGE_SIZE)), m_map_size(0), m_data_ptr(0),
+ m_data_offset(0), m_fixed(false), m_count(new size_t(0))
+ { }
+
+ MmapAllocator(std::FILE* f_ptr, size_t data_offset) throw()
+ : m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)),
+ m_page_size(sysconf(_SC_PAGE_SIZE)), m_map_size(0), m_data_ptr(0),
+ m_data_offset(data_offset), m_fixed(true), m_count(new size_t(0))
+ { }
+
+ MmapAllocator(std::string fileName) throw()
+ : m_file_ptr(std::fopen(fileName.c_str(), "wb+")), m_file_desc(fileno(m_file_ptr)),
+ m_page_size(sysconf(_SC_PAGE_SIZE)), m_map_size(0), m_data_ptr(0),
+ m_data_offset(0), m_fixed(false), m_count(new size_t(0))
+ { }
+
+ MmapAllocator(const MmapAllocator& c) throw()
+ : m_file_ptr(c.m_file_ptr), m_file_desc(c.m_file_desc),
+ m_page_size(c.m_page_size), m_map_size(c.m_map_size),
+ m_data_ptr(c.m_data_ptr), m_data_offset(c.m_data_offset),
+ m_fixed(c.m_fixed), m_count(c.m_count) {
+ (*m_count)++;
+ }
+
+ ~MmapAllocator() throw() {
+ if(m_data_ptr && *m_count == 0) {
+ munmap(m_data_ptr, m_map_size);
+ if(!m_fixed && std::ftell(m_file_ptr) != -1)
+ std::fclose(m_file_ptr);
+ }
+ (*m_count)--;
+ }
+
+ template <class U>
+ struct rebind {
+ typedef MmapAllocator<U> other;
+ };
+
+ pointer address (reference value) const {
+ return &value;
+ }
+
+ const_pointer address (const_reference value) const {
+ return &value;
+ }
+
+ size_type max_size () const throw() {
+ return std::numeric_limits<size_t>::max() / sizeof(value_type);
+ }
+
+ pointer allocate (size_type num, const void* = 0) {
+ m_map_size = num * sizeof(T);
+
+ if(!m_fixed) {
+ size_t read = 0;
+ read += ftruncate(m_file_desc, m_map_size);
+ m_data_ptr = (char*)mmap(0, m_map_size, PROT_READ|PROT_WRITE, MAP_SHARED,
+ m_file_desc, 0);
+ if(m_data_ptr == MAP_FAILED)
+      std::cerr << "Error: mmap failed" << std::endl;
+ return (pointer)m_data_ptr;
+ } else {
+ size_t map_offset = (m_data_offset / m_page_size) * m_page_size;
+ size_t relative_offset = m_data_offset - map_offset;
+
+ size_t map_size = m_map_size + relative_offset;
+
+ m_data_ptr = (char*)mmap(0, map_size, PROT_READ, MAP_SHARED,
+ m_file_desc, map_offset);
+
+ return (pointer)(m_data_ptr + relative_offset);
}
-
- template <class T1, class T2>
- bool operator!=(const MmapAllocator<T1>& a1,
- const MmapAllocator<T2>& a2) throw()
- {
- return !(a1 == a2);
+ }
+
+ void deallocate (pointer p, size_type num) {
+ if(!m_fixed) {
+ munmap(p, num * sizeof(T));
+ } else {
+ size_t map_offset = (m_data_offset / m_page_size) * m_page_size;
+ size_t relative_offset = m_data_offset - map_offset;
+ munmap((pointer)((char*)p - relative_offset), num * sizeof(T));
}
+ }
+
+ void construct (pointer p, const T& value) {
+ if(!m_fixed)
+ new(p) value_type(value);
+ }
+ void destroy (pointer p) {
+ if(!m_fixed)
+ p->~T();
+ }
+
+ template <class T1, class T2>
+ friend bool operator== (const MmapAllocator<T1>&, const MmapAllocator<T2>&) throw();
+
+ template <class T1, class T2>
+ friend bool operator!= (const MmapAllocator<T1>&, const MmapAllocator<T2>&) throw();
+};
+
+template <class T1, class T2>
+bool operator== (const MmapAllocator<T1>& a1,
+ const MmapAllocator<T2>& a2) throw()
+{
+ bool equal = true;
+ equal &= a1.m_file_ptr == a2.m_file_ptr;
+ equal &= a1.m_file_desc == a2.m_file_desc;
+ equal &= a1.m_page_size == a2.m_page_size;
+ equal &= a1.m_map_size == a2.m_map_size;
+ equal &= a1.m_data_ptr == a2.m_data_ptr;
+ equal &= a1.m_data_offset == a2.m_data_offset;
+ equal &= a1.m_fixed == a2.m_fixed;
+ return equal;
+}
+
+template <class T1, class T2>
+bool operator!=(const MmapAllocator<T1>& a1,
+ const MmapAllocator<T2>& a2) throw()
+{
+ return !(a1 == a2);
+}
+
}
#endif
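
A sketch of the allocator's intended use: a std::vector whose backing store lives in a memory-mapped temporary file rather than on the heap, so large score arrays can spill to disk. This only compiles inside the Moses tree, and the behaviour notes follow the code above:

#include <vector>
#include "moses/TranslationModel/CompactPT/MmapAllocator.h"

int main() {
  // Default constructor: tmpfile() + mmap; allocate() grows the file
  // with ftruncate and maps it read-write. The fixed-offset
  // constructors instead map a read-only window of an existing file.
  std::vector<int, Moses::MmapAllocator<int> > v;
  v.resize(1000000, 0);  // one million ints backed by the temp file
  v[999999] = 42;
  return v[999999] == 42 ? 0 : 1;
}
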
diff --git a/moses/TranslationModel/CompactPT/MonotonicVector.h b/moses/TranslationModel/CompactPT/MonotonicVector.h
index a4423c369..5e965d3e5 100644
--- a/moses/TranslationModel/CompactPT/MonotonicVector.h
+++ b/moses/TranslationModel/CompactPT/MonotonicVector.h
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#ifndef moses_MonotonicVector_h
#define moses_MonotonicVector_h
@@ -43,206 +43,187 @@ namespace Moses
{
template<typename PosT = size_t, typename NumT = size_t, PosT stepSize = 32,
-template <typename> class Allocator = std::allocator>
+ template <typename> class Allocator = std::allocator>
class MonotonicVector
{
- private:
- typedef std::vector<NumT, Allocator<NumT> > Anchors;
- typedef std::vector<unsigned int, Allocator<unsigned int> > Diffs;
-
- Anchors m_anchors;
- Diffs m_diffs;
- std::vector<unsigned int> m_tempDiffs;
-
- size_t m_size;
- PosT m_last;
- bool m_final;
-
- public:
- typedef PosT value_type;
-
- MonotonicVector() : m_size(0), m_last(0), m_final(false) {}
-
- size_t size() const
- {
- return m_size + m_tempDiffs.size();
- }
-
- PosT at(size_t i) const
- {
- PosT s = stepSize;
- PosT j = m_anchors[i / s];
- PosT r = i % s;
-
- typename Diffs::const_iterator it = m_diffs.begin() + j;
-
- PosT k = 0;
- k += VarInt32::DecodeAndSum(it, m_diffs.end(), 1);
- if(i < m_size)
- k += Simple9::DecodeAndSum(it, m_diffs.end(), r);
- else if(i < m_size + m_tempDiffs.size())
- for(size_t l = 0; l < r; l++)
- k += m_tempDiffs[l];
-
- return k;
- }
-
- PosT operator[](PosT i) const
- {
- return at(i);
- }
-
- PosT back() const
- {
- return at(size()-1);
- }
-
- void push_back(PosT i)
- {
- assert(m_final != true);
-
- if(m_anchors.size() == 0 && m_tempDiffs.size() == 0)
- {
- m_anchors.push_back(0);
- VarInt32::Encode(&i, &i+1, std::back_inserter(m_diffs));
- m_last = i;
- m_size++;
-
- return;
- }
-
- if(m_tempDiffs.size() == stepSize-1)
- {
- Simple9::Encode(m_tempDiffs.begin(), m_tempDiffs.end(),
- std::back_inserter(m_diffs));
- m_anchors.push_back(m_diffs.size());
- VarInt32::Encode(&i, &i+1, std::back_inserter(m_diffs));
-
- m_size += m_tempDiffs.size() + 1;
- m_tempDiffs.clear();
- }
- else
- {
- PosT last = m_last;
- PosT diff = i - last;
- m_tempDiffs.push_back(diff);
- }
+private:
+ typedef std::vector<NumT, Allocator<NumT> > Anchors;
+ typedef std::vector<unsigned int, Allocator<unsigned int> > Diffs;
+
+ Anchors m_anchors;
+ Diffs m_diffs;
+ std::vector<unsigned int> m_tempDiffs;
+
+ size_t m_size;
+ PosT m_last;
+ bool m_final;
+
+public:
+ typedef PosT value_type;
+
+ MonotonicVector() : m_size(0), m_last(0), m_final(false) {}
+
+ size_t size() const {
+ return m_size + m_tempDiffs.size();
+ }
+
+ PosT at(size_t i) const {
+ PosT s = stepSize;
+ PosT j = m_anchors[i / s];
+ PosT r = i % s;
+
+ typename Diffs::const_iterator it = m_diffs.begin() + j;
+
+ PosT k = 0;
+ k += VarInt32::DecodeAndSum(it, m_diffs.end(), 1);
+ if(i < m_size)
+ k += Simple9::DecodeAndSum(it, m_diffs.end(), r);
+ else if(i < m_size + m_tempDiffs.size())
+ for(size_t l = 0; l < r; l++)
+ k += m_tempDiffs[l];
+
+ return k;
+ }
+
+ PosT operator[](PosT i) const {
+ return at(i);
+ }
+
+ PosT back() const {
+ return at(size()-1);
+ }
+
+ void push_back(PosT i) {
+ assert(m_final != true);
+
+ if(m_anchors.size() == 0 && m_tempDiffs.size() == 0) {
+ m_anchors.push_back(0);
+ VarInt32::Encode(&i, &i+1, std::back_inserter(m_diffs));
m_last = i;
+ m_size++;
+
+ return;
}
-
- void commit()
- {
- assert(m_final != true);
+
+ if(m_tempDiffs.size() == stepSize-1) {
Simple9::Encode(m_tempDiffs.begin(), m_tempDiffs.end(),
std::back_inserter(m_diffs));
- m_size += m_tempDiffs.size();
+ m_anchors.push_back(m_diffs.size());
+ VarInt32::Encode(&i, &i+1, std::back_inserter(m_diffs));
+
+ m_size += m_tempDiffs.size() + 1;
m_tempDiffs.clear();
- m_final = true;
- }
-
- size_t usage()
- {
- return m_diffs.size() * sizeof(unsigned int)
- + m_anchors.size() * sizeof(NumT);
+ } else {
+ PosT last = m_last;
+ PosT diff = i - last;
+ m_tempDiffs.push_back(diff);
}
-
- size_t load(std::FILE* in, bool map = false)
- {
- size_t byteSize = 0;
-
- byteSize += fread(&m_final, sizeof(bool), 1, in) * sizeof(bool);
- byteSize += fread(&m_size, sizeof(size_t), 1, in) * sizeof(size_t);
- byteSize += fread(&m_last, sizeof(PosT), 1, in) * sizeof(PosT);
-
- byteSize += loadVector(m_diffs, in, map);
- byteSize += loadVector(m_anchors, in, map);
-
- return byteSize;
- }
-
- template <typename ValueT>
- size_t loadVector(std::vector<ValueT, std::allocator<ValueT> >& v,
- std::FILE* in, bool map = false)
- {
- // Can only be read into memory. Mapping not possible with std:allocator.
- assert(map == false);
-
- size_t byteSize = 0;
-
- size_t valSize;
- byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t);
-
+ m_last = i;
+ }
+
+ void commit() {
+ assert(m_final != true);
+ Simple9::Encode(m_tempDiffs.begin(), m_tempDiffs.end(),
+ std::back_inserter(m_diffs));
+ m_size += m_tempDiffs.size();
+ m_tempDiffs.clear();
+ m_final = true;
+ }
+
+ size_t usage() {
+ return m_diffs.size() * sizeof(unsigned int)
+ + m_anchors.size() * sizeof(NumT);
+ }
+
+ size_t load(std::FILE* in, bool map = false) {
+ size_t byteSize = 0;
+
+ byteSize += fread(&m_final, sizeof(bool), 1, in) * sizeof(bool);
+ byteSize += fread(&m_size, sizeof(size_t), 1, in) * sizeof(size_t);
+ byteSize += fread(&m_last, sizeof(PosT), 1, in) * sizeof(PosT);
+
+ byteSize += loadVector(m_diffs, in, map);
+ byteSize += loadVector(m_anchors, in, map);
+
+ return byteSize;
+ }
+
+ template <typename ValueT>
+ size_t loadVector(std::vector<ValueT, std::allocator<ValueT> >& v,
+ std::FILE* in, bool map = false) {
+    // Can only be read into memory. Mapping is not possible with std::allocator.
+ assert(map == false);
+
+ size_t byteSize = 0;
+
+ size_t valSize;
+ byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t);
+
+ v.resize(valSize, 0);
+ byteSize += std::fread(&v[0], sizeof(ValueT), valSize, in) * sizeof(ValueT);
+
+ return byteSize;
+ }
+
+ template <typename ValueT>
+ size_t loadVector(std::vector<ValueT, MmapAllocator<ValueT> >& v,
+ std::FILE* in, bool map = false) {
+ size_t byteSize = 0;
+
+ size_t valSize;
+ byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t);
+
+ if(map == false) {
+ // Read data into temporary file (default constructor of MmapAllocator)
+ // and map memory onto temporary file. Can be resized.
+
v.resize(valSize, 0);
byteSize += std::fread(&v[0], sizeof(ValueT), valSize, in) * sizeof(ValueT);
-
- return byteSize;
- }
-
- template <typename ValueT>
- size_t loadVector(std::vector<ValueT, MmapAllocator<ValueT> >& v,
- std::FILE* in, bool map = false)
- {
- size_t byteSize = 0;
-
- size_t valSize;
- byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t);
-
- if(map == false)
- {
- // Read data into temporary file (default constructor of MmapAllocator)
- // and map memory onto temporary file. Can be resized.
-
- v.resize(valSize, 0);
- byteSize += std::fread(&v[0], sizeof(ValueT), valSize, in) * sizeof(ValueT);
- }
- else
- {
- // Map it directly on specified region of file "in" starting at valPos
- // with length valSize * sizeof(ValueT). Mapped region cannot be resized.
-
- size_t valPos = std::ftell(in);
-
- Allocator<ValueT> alloc(in, valPos);
- std::vector<ValueT, Allocator<ValueT> > vTemp(alloc);
- vTemp.resize(valSize);
- v.swap(vTemp);
-
- std::fseek(in, valSize * sizeof(ValueT), SEEK_CUR);
- byteSize += valSize * sizeof(ValueT);
- }
-
- return byteSize;
- }
-
- size_t save(std::FILE* out)
- {
- if(!m_final)
- commit();
-
- bool byteSize = 0;
- byteSize += ThrowingFwrite(&m_final, sizeof(bool), 1, out) * sizeof(bool);
- byteSize += ThrowingFwrite(&m_size, sizeof(size_t), 1, out) * sizeof(size_t);
- byteSize += ThrowingFwrite(&m_last, sizeof(PosT), 1, out) * sizeof(PosT);
-
- size_t size = m_diffs.size();
- byteSize += ThrowingFwrite(&size, sizeof(size_t), 1, out) * sizeof(size_t);
- byteSize += ThrowingFwrite(&m_diffs[0], sizeof(unsigned int), size, out) * sizeof(unsigned int);
-
- size = m_anchors.size();
- byteSize += ThrowingFwrite(&size, sizeof(size_t), 1, out) * sizeof(size_t);
- byteSize += ThrowingFwrite(&m_anchors[0], sizeof(NumT), size, out) * sizeof(NumT);
-
- return byteSize;
- }
-
- void swap(MonotonicVector<PosT, NumT, stepSize, Allocator> &mv)
- {
- if(!m_final)
- commit();
-
- m_diffs.swap(mv.m_diffs);
- m_anchors.swap(mv.m_anchors);
+ } else {
+      // Map it directly onto the specified region of file "in", starting at valPos,
+      // with length valSize * sizeof(ValueT). The mapped region cannot be resized.
+
+ size_t valPos = std::ftell(in);
+
+ Allocator<ValueT> alloc(in, valPos);
+ std::vector<ValueT, Allocator<ValueT> > vTemp(alloc);
+ vTemp.resize(valSize);
+ v.swap(vTemp);
+
+ std::fseek(in, valSize * sizeof(ValueT), SEEK_CUR);
+ byteSize += valSize * sizeof(ValueT);
}
+
+ return byteSize;
+ }
+
+ size_t save(std::FILE* out) {
+ if(!m_final)
+ commit();
+
+    size_t byteSize = 0;
+ byteSize += ThrowingFwrite(&m_final, sizeof(bool), 1, out) * sizeof(bool);
+ byteSize += ThrowingFwrite(&m_size, sizeof(size_t), 1, out) * sizeof(size_t);
+ byteSize += ThrowingFwrite(&m_last, sizeof(PosT), 1, out) * sizeof(PosT);
+
+ size_t size = m_diffs.size();
+ byteSize += ThrowingFwrite(&size, sizeof(size_t), 1, out) * sizeof(size_t);
+ byteSize += ThrowingFwrite(&m_diffs[0], sizeof(unsigned int), size, out) * sizeof(unsigned int);
+
+ size = m_anchors.size();
+ byteSize += ThrowingFwrite(&size, sizeof(size_t), 1, out) * sizeof(size_t);
+ byteSize += ThrowingFwrite(&m_anchors[0], sizeof(NumT), size, out) * sizeof(NumT);
+
+ return byteSize;
+ }
+
+ void swap(MonotonicVector<PosT, NumT, stepSize, Allocator> &mv) {
+ if(!m_final)
+ commit();
+
+ m_diffs.swap(mv.m_diffs);
+ m_anchors.swap(mv.m_anchors);
+ }
};
}
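The save()/loadVector() pair above uses a simple length-prefixed layout: a size_t element count followed by the raw elements, with the return value accumulating bytes transferred. A minimal standalone sketch of that layout and its round trip (writeVec/readVec are illustrative names, not Moses code):

#include <cassert>
#include <cstdio>
#include <vector>

template <typename T>
size_t writeVec(const std::vector<T>& v, std::FILE* out) {
  // size_t element count, then the raw elements
  size_t n = v.size();
  size_t bytes = std::fwrite(&n, sizeof(size_t), 1, out) * sizeof(size_t);
  bytes += std::fwrite(&v[0], sizeof(T), n, out) * sizeof(T);
  return bytes;
}

template <typename T>
size_t readVec(std::vector<T>& v, std::FILE* in) {
  size_t n = 0;
  size_t bytes = std::fread(&n, sizeof(size_t), 1, in) * sizeof(size_t);
  v.resize(n);
  bytes += std::fread(&v[0], sizeof(T), n, in) * sizeof(T);
  return bytes;
}

int main() {
  std::vector<unsigned> v;
  v.push_back(1); v.push_back(2); v.push_back(3);
  std::FILE* f = std::tmpfile();
  writeVec(v, f);
  std::rewind(f);
  std::vector<unsigned> w;
  readVec(w, f);
  assert(w == v);
  std::fclose(f);
  return 0;
}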
diff --git a/moses/TranslationModel/CompactPT/MurmurHash3.cpp b/moses/TranslationModel/CompactPT/MurmurHash3.cpp
index 0bf738662..d16cd9502 100644
--- a/moses/TranslationModel/CompactPT/MurmurHash3.cpp
+++ b/moses/TranslationModel/CompactPT/MurmurHash3.cpp
@@ -107,16 +107,15 @@ void MurmurHash3_x86_32 ( const void * key, int len,
const uint32_t * blocks = (const uint32_t *)(data + nblocks*4);
- for(int i = -nblocks; i; i++)
- {
+ for(int i = -nblocks; i; i++) {
uint32_t k1 = getblock(blocks,i);
k1 *= c1;
k1 = ROTL32(k1,15);
k1 *= c2;
-
+
h1 ^= k1;
- h1 = ROTL32(h1,13);
+ h1 = ROTL32(h1,13);
h1 = h1*5+0xe6546b64;
}
@@ -127,12 +126,17 @@ void MurmurHash3_x86_32 ( const void * key, int len,
uint32_t k1 = 0;
- switch(len & 3)
- {
- case 3: k1 ^= tail[2] << 16;
- case 2: k1 ^= tail[1] << 8;
- case 1: k1 ^= tail[0];
- k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
+ switch(len & 3) {
+ case 3:
+ k1 ^= tail[2] << 16;
+ case 2:
+ k1 ^= tail[1] << 8;
+ case 1:
+ k1 ^= tail[0];
+ k1 *= c1;
+ k1 = ROTL32(k1,15);
+ k1 *= c2;
+ h1 ^= k1;
};
//----------
@@ -143,7 +147,7 @@ void MurmurHash3_x86_32 ( const void * key, int len,
h1 = fmix(h1);
*(uint32_t*)out = h1;
-}
+}
//-----------------------------------------------------------------------------
@@ -158,9 +162,9 @@ void MurmurHash3_x86_128 ( const void * key, const int len,
uint32_t h3 = seed;
uint32_t h4 = seed;
- uint32_t c1 = 0x239b961b;
+ uint32_t c1 = 0x239b961b;
uint32_t c2 = 0xab0e9789;
- uint32_t c3 = 0x38b34ae5;
+ uint32_t c3 = 0x38b34ae5;
uint32_t c4 = 0xa1e38b93;
//----------
@@ -168,28 +172,47 @@ void MurmurHash3_x86_128 ( const void * key, const int len,
const uint32_t * blocks = (const uint32_t *)(data + nblocks*16);
- for(int i = -nblocks; i; i++)
- {
+ for(int i = -nblocks; i; i++) {
uint32_t k1 = getblock(blocks,i*4+0);
uint32_t k2 = getblock(blocks,i*4+1);
uint32_t k3 = getblock(blocks,i*4+2);
uint32_t k4 = getblock(blocks,i*4+3);
- k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
+ k1 *= c1;
+ k1 = ROTL32(k1,15);
+ k1 *= c2;
+ h1 ^= k1;
- h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b;
+ h1 = ROTL32(h1,19);
+ h1 += h2;
+ h1 = h1*5+0x561ccd1b;
- k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
+ k2 *= c2;
+ k2 = ROTL32(k2,16);
+ k2 *= c3;
+ h2 ^= k2;
- h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747;
+ h2 = ROTL32(h2,17);
+ h2 += h3;
+ h2 = h2*5+0x0bcaa747;
- k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
+ k3 *= c3;
+ k3 = ROTL32(k3,17);
+ k3 *= c4;
+ h3 ^= k3;
- h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35;
+ h3 = ROTL32(h3,15);
+ h3 += h4;
+ h3 = h3*5+0x96cd1c35;
- k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
+ k4 *= c4;
+ k4 = ROTL32(k4,18);
+ k4 *= c1;
+ h4 ^= k4;
- h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17;
+ h4 = ROTL32(h4,13);
+ h4 += h1;
+ h4 = h4*5+0x32ac3b17;
}
//----------
@@ -202,47 +225,84 @@ void MurmurHash3_x86_128 ( const void * key, const int len,
uint32_t k3 = 0;
uint32_t k4 = 0;
- switch(len & 15)
- {
- case 15: k4 ^= tail[14] << 16;
- case 14: k4 ^= tail[13] << 8;
- case 13: k4 ^= tail[12] << 0;
- k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
-
- case 12: k3 ^= tail[11] << 24;
- case 11: k3 ^= tail[10] << 16;
- case 10: k3 ^= tail[ 9] << 8;
- case 9: k3 ^= tail[ 8] << 0;
- k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
-
- case 8: k2 ^= tail[ 7] << 24;
- case 7: k2 ^= tail[ 6] << 16;
- case 6: k2 ^= tail[ 5] << 8;
- case 5: k2 ^= tail[ 4] << 0;
- k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
-
- case 4: k1 ^= tail[ 3] << 24;
- case 3: k1 ^= tail[ 2] << 16;
- case 2: k1 ^= tail[ 1] << 8;
- case 1: k1 ^= tail[ 0] << 0;
- k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
+ switch(len & 15) {
+ case 15:
+ k4 ^= tail[14] << 16;
+ case 14:
+ k4 ^= tail[13] << 8;
+ case 13:
+ k4 ^= tail[12] << 0;
+ k4 *= c4;
+ k4 = ROTL32(k4,18);
+ k4 *= c1;
+ h4 ^= k4;
+
+ case 12:
+ k3 ^= tail[11] << 24;
+ case 11:
+ k3 ^= tail[10] << 16;
+ case 10:
+ k3 ^= tail[ 9] << 8;
+ case 9:
+ k3 ^= tail[ 8] << 0;
+ k3 *= c3;
+ k3 = ROTL32(k3,17);
+ k3 *= c4;
+ h3 ^= k3;
+
+ case 8:
+ k2 ^= tail[ 7] << 24;
+ case 7:
+ k2 ^= tail[ 6] << 16;
+ case 6:
+ k2 ^= tail[ 5] << 8;
+ case 5:
+ k2 ^= tail[ 4] << 0;
+ k2 *= c2;
+ k2 = ROTL32(k2,16);
+ k2 *= c3;
+ h2 ^= k2;
+
+ case 4:
+ k1 ^= tail[ 3] << 24;
+ case 3:
+ k1 ^= tail[ 2] << 16;
+ case 2:
+ k1 ^= tail[ 1] << 8;
+ case 1:
+ k1 ^= tail[ 0] << 0;
+ k1 *= c1;
+ k1 = ROTL32(k1,15);
+ k1 *= c2;
+ h1 ^= k1;
};
//----------
// finalization
- h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len;
+ h1 ^= len;
+ h2 ^= len;
+ h3 ^= len;
+ h4 ^= len;
- h1 += h2; h1 += h3; h1 += h4;
- h2 += h1; h3 += h1; h4 += h1;
+ h1 += h2;
+ h1 += h3;
+ h1 += h4;
+ h2 += h1;
+ h3 += h1;
+ h4 += h1;
h1 = fmix(h1);
h2 = fmix(h2);
h3 = fmix(h3);
h4 = fmix(h4);
- h1 += h2; h1 += h3; h1 += h4;
- h2 += h1; h3 += h1; h4 += h1;
+ h1 += h2;
+ h1 += h3;
+ h1 += h4;
+ h2 += h1;
+ h3 += h1;
+ h4 += h1;
((uint32_t*)out)[0] = h1;
((uint32_t*)out)[1] = h2;
@@ -269,18 +329,27 @@ void MurmurHash3_x64_128 ( const void * key, const int len,
const uint64_t * blocks = (const uint64_t *)(data);
- for(int i = 0; i < nblocks; i++)
- {
+ for(int i = 0; i < nblocks; i++) {
uint64_t k1 = getblock(blocks,i*2+0);
uint64_t k2 = getblock(blocks,i*2+1);
- k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
+ k1 *= c1;
+ k1 = ROTL64(k1,31);
+ k1 *= c2;
+ h1 ^= k1;
- h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729;
+ h1 = ROTL64(h1,27);
+ h1 += h2;
+ h1 = h1*5+0x52dce729;
- k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
+ k2 *= c2;
+ k2 = ROTL64(k2,33);
+ k2 *= c1;
+ h2 ^= k2;
- h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
+ h2 = ROTL64(h2,31);
+ h2 += h1;
+ h2 = h2*5+0x38495ab5;
}
//----------
@@ -291,32 +360,53 @@ void MurmurHash3_x64_128 ( const void * key, const int len,
uint64_t k1 = 0;
uint64_t k2 = 0;
- switch(len & 15)
- {
- case 15: k2 ^= uint64_t(tail[14]) << 48;
- case 14: k2 ^= uint64_t(tail[13]) << 40;
- case 13: k2 ^= uint64_t(tail[12]) << 32;
- case 12: k2 ^= uint64_t(tail[11]) << 24;
- case 11: k2 ^= uint64_t(tail[10]) << 16;
- case 10: k2 ^= uint64_t(tail[ 9]) << 8;
- case 9: k2 ^= uint64_t(tail[ 8]) << 0;
- k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
-
- case 8: k1 ^= uint64_t(tail[ 7]) << 56;
- case 7: k1 ^= uint64_t(tail[ 6]) << 48;
- case 6: k1 ^= uint64_t(tail[ 5]) << 40;
- case 5: k1 ^= uint64_t(tail[ 4]) << 32;
- case 4: k1 ^= uint64_t(tail[ 3]) << 24;
- case 3: k1 ^= uint64_t(tail[ 2]) << 16;
- case 2: k1 ^= uint64_t(tail[ 1]) << 8;
- case 1: k1 ^= uint64_t(tail[ 0]) << 0;
- k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
+ switch(len & 15) {
+ case 15:
+ k2 ^= uint64_t(tail[14]) << 48;
+ case 14:
+ k2 ^= uint64_t(tail[13]) << 40;
+ case 13:
+ k2 ^= uint64_t(tail[12]) << 32;
+ case 12:
+ k2 ^= uint64_t(tail[11]) << 24;
+ case 11:
+ k2 ^= uint64_t(tail[10]) << 16;
+ case 10:
+ k2 ^= uint64_t(tail[ 9]) << 8;
+ case 9:
+ k2 ^= uint64_t(tail[ 8]) << 0;
+ k2 *= c2;
+ k2 = ROTL64(k2,33);
+ k2 *= c1;
+ h2 ^= k2;
+
+ case 8:
+ k1 ^= uint64_t(tail[ 7]) << 56;
+ case 7:
+ k1 ^= uint64_t(tail[ 6]) << 48;
+ case 6:
+ k1 ^= uint64_t(tail[ 5]) << 40;
+ case 5:
+ k1 ^= uint64_t(tail[ 4]) << 32;
+ case 4:
+ k1 ^= uint64_t(tail[ 3]) << 24;
+ case 3:
+ k1 ^= uint64_t(tail[ 2]) << 16;
+ case 2:
+ k1 ^= uint64_t(tail[ 1]) << 8;
+ case 1:
+ k1 ^= uint64_t(tail[ 0]) << 0;
+ k1 *= c1;
+ k1 = ROTL64(k1,31);
+ k1 *= c2;
+ h1 ^= k1;
};
//----------
// finalization
- h1 ^= len; h2 ^= len;
+ h1 ^= len;
+ h2 ^= len;
h1 += h2;
h2 += h1;
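The tail switches in all three hash variants above rely on deliberate case fallthrough: each case folds in one trailing byte and drops into the next, so `len & 15` (or `len & 3`) leftover bytes are consumed without a loop. The reformatting keeps that behavior; only the layout changes. A runnable sketch of the same idiom, using an illustrative pack3 helper that mirrors the 32-bit tail handling:

#include <cassert>
#include <cstdint>

static uint32_t pack3(const uint8_t* tail, int len) {
  uint32_t k1 = 0;
  switch(len & 3) {
  case 3:
    k1 ^= tail[2] << 16; // fall through
  case 2:
    k1 ^= tail[1] << 8;  // fall through
  case 1:
    k1 ^= tail[0];
  }
  return k1;
}

int main() {
  const uint8_t t[3] = {0x01, 0x02, 0x03};
  assert(pack3(t, 3) == 0x030201u); // all three bytes folded in
  assert(pack3(t, 1) == 0x000001u); // only the first byte
  return 0;
}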
diff --git a/moses/TranslationModel/CompactPT/PackedArray.h b/moses/TranslationModel/CompactPT/PackedArray.h
index ad4596546..479c2cc79 100644
--- a/moses/TranslationModel/CompactPT/PackedArray.h
+++ b/moses/TranslationModel/CompactPT/PackedArray.h
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#ifndef moses_PackedArray_h
#define moses_PackedArray_h
@@ -35,128 +35,117 @@ namespace Moses
template <typename T = size_t, typename D = unsigned char>
class PackedArray
{
- protected:
- static size_t m_dataBits;
-
- size_t m_size;
- size_t m_storageSize;
- D* m_storage;
-
- public:
- PackedArray()
- {
- m_size = 0;
- m_storageSize = 0;
- m_storage = new D[0];
- }
-
- PackedArray(size_t size, size_t bits) : m_size(size)
- {
- m_storageSize = ceil(float(bits * size) / float(m_dataBits));
- m_storage = new D[m_storageSize];
- }
-
- PackedArray(const PackedArray<T, D> &c)
- {
- m_size = c.m_size;
-
- m_storageSize = c.m_storageSize;
- m_storage = new D[m_storageSize];
-
- std::memcpy(m_storage, c.m_storage, m_storageSize * sizeof(D));
- }
-
- virtual ~PackedArray()
- {
- delete [] m_storage;
- m_size = 0;
- m_storageSize = 0;
- m_storage = 0;
- }
-
- T Get(size_t i, size_t bits) const
- {
- T out = 0;
-
- size_t bitstart = (i * bits);
- size_t bitpos = bitstart;
-
- size_t zero = ((1ul << (bits)) - 1);
-
- while(bitpos - bitstart < bits) {
- size_t pos = bitpos / m_dataBits;
- size_t off = bitpos % m_dataBits;
-
- out |= (T(m_storage[pos]) << (bitpos - bitstart)) >> off;
-
- bitpos += (m_dataBits - off);
- }
-
- out &= zero;
- return out;
- }
-
- void Set(size_t i, T v, size_t bits)
- {
- size_t bitstart = (i * bits);
- size_t bitpos = bitstart;
-
- while(bitpos - bitstart < bits) {
- size_t pos = bitpos / m_dataBits;
- size_t off = bitpos % m_dataBits;
-
- size_t rest = bits - (bitpos - bitstart);
- D zero = ~((1ul << (rest + off)) - 1) | ((1ul << off) - 1);
-
- m_storage[pos] &= zero;
- m_storage[pos] |= v << off;
- v = v >> (m_dataBits - off);
- bitpos += (m_dataBits - off);
- }
- }
-
- virtual D*& GetStorage()
- {
- return m_storage;
- }
-
- virtual size_t GetStorageSize() const
- {
- return m_storageSize;
- }
-
- virtual size_t Size() const
- {
- return m_size;
- }
-
- virtual size_t Load(std::FILE* in)
- {
- size_t a1 = std::ftell(in);
-
- size_t read = 0;
- read += std::fread(&m_size, sizeof(m_size), 1, in);
- read += std::fread(&m_storageSize, sizeof(m_storageSize), 1, in);
- delete [] m_storage;
- m_storage = new D[m_storageSize];
- read += std::fread(m_storage, sizeof(D), m_storageSize, in);
-
- size_t a2 = std::ftell(in);
- return a2 - a1;
+protected:
+ static size_t m_dataBits;
+
+ size_t m_size;
+ size_t m_storageSize;
+ D* m_storage;
+
+public:
+ PackedArray() {
+ m_size = 0;
+ m_storageSize = 0;
+ m_storage = new D[0];
+ }
+
+ PackedArray(size_t size, size_t bits) : m_size(size) {
+ m_storageSize = ceil(float(bits * size) / float(m_dataBits));
+ m_storage = new D[m_storageSize];
+ }
+
+ PackedArray(const PackedArray<T, D> &c) {
+ m_size = c.m_size;
+
+ m_storageSize = c.m_storageSize;
+ m_storage = new D[m_storageSize];
+
+ std::memcpy(m_storage, c.m_storage, m_storageSize * sizeof(D));
+ }
+
+ virtual ~PackedArray() {
+ delete [] m_storage;
+ m_size = 0;
+ m_storageSize = 0;
+ m_storage = 0;
+ }
+
+ T Get(size_t i, size_t bits) const {
+ T out = 0;
+
+ size_t bitstart = (i * bits);
+ size_t bitpos = bitstart;
+
+ size_t zero = ((1ul << (bits)) - 1);
+
+ while(bitpos - bitstart < bits) {
+ size_t pos = bitpos / m_dataBits;
+ size_t off = bitpos % m_dataBits;
+
+ out |= (T(m_storage[pos]) << (bitpos - bitstart)) >> off;
+
+ bitpos += (m_dataBits - off);
}
-
- virtual size_t Save(std::FILE* out)
- {
- size_t a1 = std::ftell(out);
-
- ThrowingFwrite(&m_size, sizeof(m_size), 1, out);
- ThrowingFwrite(&m_storageSize, sizeof(m_storageSize), 1, out);
- ThrowingFwrite(m_storage, sizeof(D), m_storageSize, out);
-
- size_t a2 = std::ftell(out);
- return a2 - a1;
+
+ out &= zero;
+ return out;
+ }
+
+ void Set(size_t i, T v, size_t bits) {
+ size_t bitstart = (i * bits);
+ size_t bitpos = bitstart;
+
+ while(bitpos - bitstart < bits) {
+ size_t pos = bitpos / m_dataBits;
+ size_t off = bitpos % m_dataBits;
+
+ size_t rest = bits - (bitpos - bitstart);
+ D zero = ~((1ul << (rest + off)) - 1) | ((1ul << off) - 1);
+
+ m_storage[pos] &= zero;
+ m_storage[pos] |= v << off;
+ v = v >> (m_dataBits - off);
+ bitpos += (m_dataBits - off);
}
-
+ }
+
+ virtual D*& GetStorage() {
+ return m_storage;
+ }
+
+ virtual size_t GetStorageSize() const {
+ return m_storageSize;
+ }
+
+ virtual size_t Size() const {
+ return m_size;
+ }
+
+ virtual size_t Load(std::FILE* in) {
+ size_t a1 = std::ftell(in);
+
+ size_t read = 0;
+ read += std::fread(&m_size, sizeof(m_size), 1, in);
+ read += std::fread(&m_storageSize, sizeof(m_storageSize), 1, in);
+ delete [] m_storage;
+ m_storage = new D[m_storageSize];
+ read += std::fread(m_storage, sizeof(D), m_storageSize, in);
+
+ size_t a2 = std::ftell(in);
+ return a2 - a1;
+ }
+
+ virtual size_t Save(std::FILE* out) {
+ size_t a1 = std::ftell(out);
+
+ ThrowingFwrite(&m_size, sizeof(m_size), 1, out);
+ ThrowingFwrite(&m_storageSize, sizeof(m_storageSize), 1, out);
+ ThrowingFwrite(m_storage, sizeof(D), m_storageSize, out);
+
+ size_t a2 = std::ftell(out);
+ return a2 - a1;
+ }
+
};
template <typename T, typename D>
@@ -166,34 +155,31 @@ size_t PackedArray<T, D>::m_dataBits = sizeof(D)*8;
template <typename T = size_t, typename D = unsigned char>
class PairedPackedArray : public PackedArray<T,D>
-{
- public:
- PairedPackedArray() : PackedArray<T,D>() {}
-
- PairedPackedArray(size_t size, size_t bits1, size_t bits2)
+{
+public:
+ PairedPackedArray() : PackedArray<T,D>() {}
+
+ PairedPackedArray(size_t size, size_t bits1, size_t bits2)
: PackedArray<T, D>(size, bits1 + bits2) { }
-
- void Set(size_t i, T a, T b, size_t bits1, size_t bits2)
- {
- T c = 0;
- c = a | (b << bits1);
- PackedArray<T,D>::Set(i, c, bits1 + bits2);
- }
-
- void Set(size_t i, std::pair<T,T> p, size_t bits1, size_t bits2)
- {
- T c = 0;
- c = p.second | (p.first << bits1);
- PackedArray<T, D>::Set(i, c);
- }
-
- std::pair<T, T> Get(size_t i, size_t bits1, size_t bits2)
- {
- T v = PackedArray<T, D>::Get(i, bits1 + bits2);
- T a = v & ((1 << bits1) - 1);
- T b = v >> bits1;
- return std::pair<T, T>(a, b);
- }
+
+ void Set(size_t i, T a, T b, size_t bits1, size_t bits2) {
+ T c = 0;
+ c = a | (b << bits1);
+ PackedArray<T,D>::Set(i, c, bits1 + bits2);
+ }
+
+ void Set(size_t i, std::pair<T,T> p, size_t bits1, size_t bits2) {
+ T c = 0;
+ c = p.second | (p.first << bits1);
+ PackedArray<T, D>::Set(i, c);
+ }
+
+ std::pair<T, T> Get(size_t i, size_t bits1, size_t bits2) {
+ T v = PackedArray<T, D>::Get(i, bits1 + bits2);
+ T a = v & ((1 << bits1) - 1);
+ T b = v >> bits1;
+ return std::pair<T, T>(a, b);
+ }
};
}
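Get() and Set() above store values back-to-back at an arbitrary bit width, crossing byte boundaries of the underlying D array as needed. A hypothetical usage sketch, assuming the header is on the include path and that each stored value fits in the chosen bit width:

#include <cassert>
#include <cstddef>
#include "moses/TranslationModel/CompactPT/PackedArray.h"

int main() {
  const size_t bits = 5; // each value occupies 5 bits, not a full byte
  Moses::PackedArray<> a(10, bits);
  for(size_t i = 0; i < 10; i++)
    a.Set(i, (i * 3) % 32, bits); // values must fit into `bits` bits
  for(size_t i = 0; i < 10; i++)
    assert(a.Get(i, bits) == (i * 3) % 32);
  return 0;
}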
diff --git a/moses/TranslationModel/CompactPT/PhraseDecoder.cpp b/moses/TranslationModel/CompactPT/PhraseDecoder.cpp
index 03b3f6825..085a7337c 100644
--- a/moses/TranslationModel/CompactPT/PhraseDecoder.cpp
+++ b/moses/TranslationModel/CompactPT/PhraseDecoder.cpp
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#include <deque>
@@ -37,23 +37,23 @@ PhraseDecoder::PhraseDecoder(
const std::vector<float>* weight
)
: m_coding(None), m_numScoreComponent(numScoreComponent),
- m_containsAlignmentInfo(true), m_maxRank(0),
- m_symbolTree(0), m_multipleScoreTrees(false),
- m_scoreTrees(1), m_alignTree(0),
- m_phraseDictionary(phraseDictionary), m_input(input), m_output(output),
- m_weight(weight),
- m_separator(" ||| ")
+ m_containsAlignmentInfo(true), m_maxRank(0),
+ m_symbolTree(0), m_multipleScoreTrees(false),
+ m_scoreTrees(1), m_alignTree(0),
+ m_phraseDictionary(phraseDictionary), m_input(input), m_output(output),
+ m_weight(weight),
+ m_separator(" ||| ")
{ }
PhraseDecoder::~PhraseDecoder()
{
if(m_symbolTree)
delete m_symbolTree;
-
+
for(size_t i = 0; i < m_scoreTrees.size(); i++)
if(m_scoreTrees[i])
delete m_scoreTrees[i];
-
+
if(m_alignTree)
delete m_alignTree;
}
@@ -61,10 +61,10 @@ PhraseDecoder::~PhraseDecoder()
inline unsigned PhraseDecoder::GetSourceSymbolId(std::string& symbol)
{
boost::unordered_map<std::string, unsigned>::iterator it
- = m_sourceSymbolsMap.find(symbol);
+ = m_sourceSymbolsMap.find(symbol);
if(it != m_sourceSymbolsMap.end())
return it->second;
-
+
size_t idx = m_sourceSymbols.find(symbol);
m_sourceSymbolsMap[symbol] = idx;
return idx;
@@ -144,76 +144,70 @@ size_t PhraseDecoder::Load(std::FILE* in)
{
size_t start = std::ftell(in);
size_t read = 0;
-
+
read += std::fread(&m_coding, sizeof(m_coding), 1, in);
read += std::fread(&m_numScoreComponent, sizeof(m_numScoreComponent), 1, in);
read += std::fread(&m_containsAlignmentInfo, sizeof(m_containsAlignmentInfo), 1, in);
read += std::fread(&m_maxRank, sizeof(m_maxRank), 1, in);
read += std::fread(&m_maxPhraseLength, sizeof(m_maxPhraseLength), 1, in);
-
- if(m_coding == REnc)
- {
+
+ if(m_coding == REnc) {
m_sourceSymbols.load(in);
-
+
size_t size;
read += std::fread(&size, sizeof(size_t), 1, in);
m_lexicalTableIndex.resize(size);
read += std::fread(&m_lexicalTableIndex[0], sizeof(size_t), size, in);
-
+
read += std::fread(&size, sizeof(size_t), 1, in);
m_lexicalTable.resize(size);
read += std::fread(&m_lexicalTable[0], sizeof(SrcTrg), size, in);
}
-
+
m_targetSymbols.load(in);
-
+
m_symbolTree = new CanonicalHuffman<unsigned>(in);
-
+
read += std::fread(&m_multipleScoreTrees, sizeof(m_multipleScoreTrees), 1, in);
- if(m_multipleScoreTrees)
- {
+ if(m_multipleScoreTrees) {
m_scoreTrees.resize(m_numScoreComponent);
for(size_t i = 0; i < m_numScoreComponent; i++)
m_scoreTrees[i] = new CanonicalHuffman<float>(in);
- }
- else
- {
+ } else {
m_scoreTrees.resize(1);
m_scoreTrees[0] = new CanonicalHuffman<float>(in);
}
-
+
if(m_containsAlignmentInfo)
m_alignTree = new CanonicalHuffman<AlignPoint>(in);
-
+
size_t end = std::ftell(in);
return end - start;
}
-
+
std::string PhraseDecoder::MakeSourceKey(std::string &source)
{
- return source + m_separator;
+ return source + m_separator;
}
-
+
TargetPhraseVectorPtr PhraseDecoder::CreateTargetPhraseCollection(const Phrase &sourcePhrase, bool topLevel)
{
-
+
// Not using TargetPhraseCollection, to avoid the "new" operator,
// which can introduce heavy locking with multiple threads
TargetPhraseVectorPtr tpv(new TargetPhraseVector());
size_t bitsLeft = 0;
-
- if(m_coding == PREnc)
- {
+
+ if(m_coding == PREnc) {
std::pair<TargetPhraseVectorPtr, size_t> cachedPhraseColl
- = m_decodingCache.Retrieve(sourcePhrase);
-
+ = m_decodingCache.Retrieve(sourcePhrase);
+
// Has been cached and is complete or does not need to be completed
if(cachedPhraseColl.first != NULL && (!topLevel || cachedPhraseColl.second == 0))
return cachedPhraseColl.first;
-
+
// Has been cached, but is incomplete
- else if(cachedPhraseColl.first != NULL)
- {
+ else if(cachedPhraseColl.first != NULL) {
bitsLeft = cachedPhraseColl.second;
tpv->resize(cachedPhraseColl.first->size());
std::copy(cachedPhraseColl.first->begin(),
@@ -221,220 +215,187 @@ TargetPhraseVectorPtr PhraseDecoder::CreateTargetPhraseCollection(const Phrase &
tpv->begin());
}
}
-
+
// Retrieve source phrase identifier
std::string sourcePhraseString = sourcePhrase.GetStringRep(*m_input);
size_t sourcePhraseId = m_phraseDictionary.m_hash[MakeSourceKey(sourcePhraseString)];
-
- if(sourcePhraseId != m_phraseDictionary.m_hash.GetSize())
- {
- // Retrieve compressed and encoded target phrase collection
+
+ if(sourcePhraseId != m_phraseDictionary.m_hash.GetSize()) {
+ // Retrieve compressed and encoded target phrase collection
std::string encodedPhraseCollection;
if(m_phraseDictionary.m_inMemory)
encodedPhraseCollection = m_phraseDictionary.m_targetPhrasesMemory[sourcePhraseId];
else
encodedPhraseCollection = m_phraseDictionary.m_targetPhrasesMapped[sourcePhraseId];
-
+
BitWrapper<> encodedBitStream(encodedPhraseCollection);
if(m_coding == PREnc && bitsLeft)
encodedBitStream.SeekFromEnd(bitsLeft);
-
+
// Decompress and decode target phrase collection
TargetPhraseVectorPtr decodedPhraseColl =
DecodeCollection(tpv, encodedBitStream, sourcePhrase, topLevel);
-
+
return decodedPhraseColl;
- }
- else
- return TargetPhraseVectorPtr();
+ } else
+ return TargetPhraseVectorPtr();
}
-
+
TargetPhraseVectorPtr PhraseDecoder::DecodeCollection(
TargetPhraseVectorPtr tpv, BitWrapper<> &encodedBitStream,
const Phrase &sourcePhrase, bool topLevel)
{
-
+
bool extending = tpv->size();
size_t bitsLeft = encodedBitStream.TellFromEnd();
-
+
typedef std::pair<size_t, size_t> AlignPointSizeT;
-
+
std::vector<int> sourceWords;
- if(m_coding == REnc)
- {
- for(size_t i = 0; i < sourcePhrase.GetSize(); i++)
- {
+ if(m_coding == REnc) {
+ for(size_t i = 0; i < sourcePhrase.GetSize(); i++) {
std::string sourceWord
- = sourcePhrase.GetWord(i).GetString(*m_input, false);
+ = sourcePhrase.GetWord(i).GetString(*m_input, false);
unsigned idx = GetSourceSymbolId(sourceWord);
sourceWords.push_back(idx);
}
}
-
+
unsigned phraseStopSymbol = 0;
AlignPoint alignStopSymbol(-1, -1);
-
+
std::vector<float> scores;
std::set<AlignPointSizeT> alignment;
-
+
enum DecodeState { New, Symbol, Score, Alignment, Add } state = New;
-
+
size_t srcSize = sourcePhrase.GetSize();
-
+
TargetPhrase* targetPhrase = NULL;
- while(encodedBitStream.TellFromEnd())
- {
-
- if(state == New)
- {
+ while(encodedBitStream.TellFromEnd()) {
+
+ if(state == New) {
// Creating new TargetPhrase on the heap
tpv->push_back(TargetPhrase());
targetPhrase = &tpv->back();
-
+
targetPhrase->SetSourcePhrase(sourcePhrase);
alignment.clear();
scores.clear();
-
+
state = Symbol;
}
-
- if(state == Symbol)
- {
- unsigned symbol = m_symbolTree->Read(encodedBitStream);
- if(symbol == phraseStopSymbol)
- {
+
+ if(state == Symbol) {
+ unsigned symbol = m_symbolTree->Read(encodedBitStream);
+ if(symbol == phraseStopSymbol) {
state = Score;
- }
- else
- {
- if(m_coding == REnc)
- {
+ } else {
+ if(m_coding == REnc) {
std::string wordString;
size_t type = GetREncType(symbol);
-
- if(type == 1)
- {
+
+ if(type == 1) {
unsigned decodedSymbol = DecodeREncSymbol1(symbol);
wordString = GetTargetSymbol(decodedSymbol);
- }
- else if (type == 2)
- {
+ } else if (type == 2) {
size_t rank = DecodeREncSymbol2Rank(symbol);
size_t srcPos = DecodeREncSymbol2Position(symbol);
-
+
if(srcPos >= sourceWords.size())
- return TargetPhraseVectorPtr();
-
+ return TargetPhraseVectorPtr();
+
wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank));
- if(m_phraseDictionary.m_useAlignmentInfo)
- {
+ if(m_phraseDictionary.m_useAlignmentInfo) {
size_t trgPos = targetPhrase->GetSize();
alignment.insert(AlignPoint(srcPos, trgPos));
}
- }
- else if(type == 3)
- {
+ } else if(type == 3) {
size_t rank = DecodeREncSymbol3(symbol);
size_t srcPos = targetPhrase->GetSize();
-
+
if(srcPos >= sourceWords.size())
- return TargetPhraseVectorPtr();
-
- wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank));
- if(m_phraseDictionary.m_useAlignmentInfo)
- {
+ return TargetPhraseVectorPtr();
+
+ wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank));
+ if(m_phraseDictionary.m_useAlignmentInfo) {
size_t trgPos = srcPos;
alignment.insert(AlignPoint(srcPos, trgPos));
}
}
-
+
Word word;
word.CreateFromString(Output, *m_output, wordString, false);
targetPhrase->AddWord(word);
- }
- else if(m_coding == PREnc)
- {
+ } else if(m_coding == PREnc) {
// if the symbol is just a word
- if(GetPREncType(symbol) == 1)
- {
+ if(GetPREncType(symbol) == 1) {
unsigned decodedSymbol = DecodePREncSymbol1(symbol);
-
+
Word word;
word.CreateFromString(Output, *m_output,
GetTargetSymbol(decodedSymbol), false);
targetPhrase->AddWord(word);
}
// if the symbol is a subphrase pointer
- else
- {
+ else {
int left = DecodePREncSymbol2Left(symbol);
int right = DecodePREncSymbol2Right(symbol);
unsigned rank = DecodePREncSymbol2Rank(symbol);
-
+
int srcStart = left + targetPhrase->GetSize();
int srcEnd = srcSize - right - 1;
-
+
// false positive consistency check
if(0 > srcStart || srcStart > srcEnd || unsigned(srcEnd) >= srcSize)
return TargetPhraseVectorPtr();
-
+
// false positive consistency check
if(m_maxRank && rank > m_maxRank)
- return TargetPhraseVectorPtr();
-
+ return TargetPhraseVectorPtr();
+
// set subphrase by default to itself
TargetPhraseVectorPtr subTpv = tpv;
-
+
// if range smaller than source phrase retrieve subphrase
- if(unsigned(srcEnd - srcStart + 1) != srcSize)
- {
+ if(unsigned(srcEnd - srcStart + 1) != srcSize) {
Phrase subPhrase = sourcePhrase.GetSubString(WordsRange(srcStart, srcEnd));
subTpv = CreateTargetPhraseCollection(subPhrase, false);
- }
- else {
+ } else {
// false positive consistency check
if(rank >= tpv->size()-1)
return TargetPhraseVectorPtr();
}
-
+
// false positive consistency check
- if(subTpv != NULL && rank < subTpv->size())
- {
+ if(subTpv != NULL && rank < subTpv->size()) {
// insert the subphrase into the main target phrase
TargetPhrase& subTp = subTpv->at(rank);
- if(m_phraseDictionary.m_useAlignmentInfo)
- {
+ if(m_phraseDictionary.m_useAlignmentInfo) {
// reconstruct the alignment data based on the alignment of the subphrase
for(AlignmentInfo::const_iterator it = subTp.GetAlignTerm().begin();
- it != subTp.GetAlignTerm().end(); it++)
- {
+ it != subTp.GetAlignTerm().end(); it++) {
alignment.insert(AlignPointSizeT(srcStart + it->first,
targetPhrase->GetSize() + it->second));
}
}
targetPhrase->Append(subTp);
- }
- else
+ } else
return TargetPhraseVectorPtr();
}
- }
- else
- {
- Word word;
- word.CreateFromString(Output, *m_output,
- GetTargetSymbol(symbol), false);
- targetPhrase->AddWord(word);
+ } else {
+ Word word;
+ word.CreateFromString(Output, *m_output,
+ GetTargetSymbol(symbol), false);
+ targetPhrase->AddWord(word);
}
}
- }
- else if(state == Score)
- {
+ } else if(state == Score) {
size_t idx = m_multipleScoreTrees ? scores.size() : 0;
float score = m_scoreTrees[idx]->Read(encodedBitStream);
scores.push_back(score);
-
- if(scores.size() == m_numScoreComponent)
- {
+
+ if(scores.size() == m_numScoreComponent) {
targetPhrase->GetScoreBreakdown().Assign(&m_phraseDictionary, scores);
targetPhrase->Evaluate(sourcePhrase);
@@ -443,49 +404,41 @@ TargetPhraseVectorPtr PhraseDecoder::DecodeCollection(
else
state = Add;
}
- }
- else if(state == Alignment)
- {
+ } else if(state == Alignment) {
AlignPoint alignPoint = m_alignTree->Read(encodedBitStream);
- if(alignPoint == alignStopSymbol)
- {
+ if(alignPoint == alignStopSymbol) {
state = Add;
- }
- else
- {
- if(m_phraseDictionary.m_useAlignmentInfo)
+ } else {
+ if(m_phraseDictionary.m_useAlignmentInfo)
alignment.insert(AlignPointSizeT(alignPoint));
}
}
-
- if(state == Add)
- {
+
+ if(state == Add) {
if(m_phraseDictionary.m_useAlignmentInfo) {
targetPhrase->SetAlignTerm(alignment);
}
-
- if(m_coding == PREnc)
- {
+
+ if(m_coding == PREnc) {
if(!m_maxRank || tpv->size() <= m_maxRank)
bitsLeft = encodedBitStream.TellFromEnd();
-
+
if(!topLevel && m_maxRank && tpv->size() >= m_maxRank)
break;
}
-
+
if(encodedBitStream.TellFromEnd() <= 8)
break;
-
+
state = New;
- }
+ }
}
-
- if(m_coding == PREnc && !extending)
- {
+
+ if(m_coding == PREnc && !extending) {
bitsLeft = bitsLeft > 8 ? bitsLeft : 0;
m_decodingCache.Cache(sourcePhrase, tpv, bitsLeft, m_maxRank);
}
-
+
return tpv;
}
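DecodeCollection() above is driven by a small state machine (New -> Symbol -> Score -> Alignment -> Add) over the encoded bit stream: symbols are read until the phrase stop symbol, then one score per component, then alignment points, then the finished phrase is committed. A runnable toy reduction of that loop, with a flat int array standing in for the Huffman-coded stream and the Alignment state omitted:

#include <cstdio>
#include <vector>

enum State { New, Symbol, Score, Add };

int main() {
  const int stop = 0, nScores = 2;
  const int stream[] = {7, 3, stop, 51, 60, 9, stop, 40, 41};
  State state = New;
  std::vector<int> symbols, scores;
  size_t pos = 0, n = sizeof(stream) / sizeof(stream[0]);
  while(pos < n || state == Add) {
    if(state == New) {            // start a fresh phrase
      symbols.clear();
      scores.clear();
      state = Symbol;
    } else if(state == Symbol) {  // read symbols until the stop symbol
      int s = stream[pos++];
      if(s == stop) state = Score;
      else symbols.push_back(s);
    } else if(state == Score) {   // read one score per component
      scores.push_back(stream[pos++]);
      if((int)scores.size() == nScores) state = Add;
    } else {                      // Add: commit and reset
      std::printf("phrase with %zu symbols, %zu scores\n",
                  symbols.size(), scores.size());
      state = New;
    }
  }
  return 0;
}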
diff --git a/moses/TranslationModel/CompactPT/PhraseDecoder.h b/moses/TranslationModel/CompactPT/PhraseDecoder.h
index 13c8af300..85e9334da 100644
--- a/moses/TranslationModel/CompactPT/PhraseDecoder.h
+++ b/moses/TranslationModel/CompactPT/PhraseDecoder.h
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#ifndef moses_PhraseDecoder_h
#define moses_PhraseDecoder_h
@@ -52,93 +52,93 @@ class PhraseDictionaryCompact;
class PhraseDecoder
{
- protected:
-
- friend class PhraseDictionaryCompact;
-
- typedef std::pair<unsigned char, unsigned char> AlignPoint;
- typedef std::pair<unsigned, unsigned> SrcTrg;
-
- enum Coding { None, REnc, PREnc } m_coding;
-
- size_t m_numScoreComponent;
- bool m_containsAlignmentInfo;
- size_t m_maxRank;
- size_t m_maxPhraseLength;
-
- boost::unordered_map<std::string, unsigned> m_sourceSymbolsMap;
- StringVector<unsigned char, unsigned, std::allocator> m_sourceSymbols;
- StringVector<unsigned char, unsigned, std::allocator> m_targetSymbols;
-
- std::vector<size_t> m_lexicalTableIndex;
- std::vector<SrcTrg> m_lexicalTable;
-
- CanonicalHuffman<unsigned>* m_symbolTree;
-
- bool m_multipleScoreTrees;
- std::vector<CanonicalHuffman<float>*> m_scoreTrees;
-
- CanonicalHuffman<AlignPoint>* m_alignTree;
-
- TargetPhraseCollectionCache m_decodingCache;
-
- PhraseDictionaryCompact& m_phraseDictionary;
-
- // ***********************************************
-
- const std::vector<FactorType>* m_input;
- const std::vector<FactorType>* m_output;
- const std::vector<float>* m_weight;
-
- std::string m_separator;
-
- // ***********************************************
-
- unsigned GetSourceSymbolId(std::string& s);
- std::string GetTargetSymbol(unsigned id) const;
-
- size_t GetREncType(unsigned encodedSymbol);
- size_t GetPREncType(unsigned encodedSymbol);
-
- unsigned GetTranslation(unsigned srcIdx, size_t rank);
-
- size_t GetMaxSourcePhraseLength();
-
- unsigned DecodeREncSymbol1(unsigned encodedSymbol);
- unsigned DecodeREncSymbol2Rank(unsigned encodedSymbol);
- unsigned DecodeREncSymbol2Position(unsigned encodedSymbol);
- unsigned DecodeREncSymbol3(unsigned encodedSymbol);
-
- unsigned DecodePREncSymbol1(unsigned encodedSymbol);
- int DecodePREncSymbol2Left(unsigned encodedSymbol);
- int DecodePREncSymbol2Right(unsigned encodedSymbol);
- unsigned DecodePREncSymbol2Rank(unsigned encodedSymbol);
-
- std::string MakeSourceKey(std::string &);
-
- public:
-
- PhraseDecoder(
- PhraseDictionaryCompact &phraseDictionary,
- const std::vector<FactorType>* input,
- const std::vector<FactorType>* output,
- size_t numScoreComponent,
- const std::vector<float>* weight
- );
-
- ~PhraseDecoder();
-
- size_t Load(std::FILE* in);
-
- TargetPhraseVectorPtr CreateTargetPhraseCollection(const Phrase &sourcePhrase,
- bool topLevel = false);
-
- TargetPhraseVectorPtr DecodeCollection(TargetPhraseVectorPtr tpv,
- BitWrapper<> &encodedBitStream,
- const Phrase &sourcePhrase,
- bool topLevel);
-
- void PruneCache();
+protected:
+
+ friend class PhraseDictionaryCompact;
+
+ typedef std::pair<unsigned char, unsigned char> AlignPoint;
+ typedef std::pair<unsigned, unsigned> SrcTrg;
+
+ enum Coding { None, REnc, PREnc } m_coding;
+
+ size_t m_numScoreComponent;
+ bool m_containsAlignmentInfo;
+ size_t m_maxRank;
+ size_t m_maxPhraseLength;
+
+ boost::unordered_map<std::string, unsigned> m_sourceSymbolsMap;
+ StringVector<unsigned char, unsigned, std::allocator> m_sourceSymbols;
+ StringVector<unsigned char, unsigned, std::allocator> m_targetSymbols;
+
+ std::vector<size_t> m_lexicalTableIndex;
+ std::vector<SrcTrg> m_lexicalTable;
+
+ CanonicalHuffman<unsigned>* m_symbolTree;
+
+ bool m_multipleScoreTrees;
+ std::vector<CanonicalHuffman<float>*> m_scoreTrees;
+
+ CanonicalHuffman<AlignPoint>* m_alignTree;
+
+ TargetPhraseCollectionCache m_decodingCache;
+
+ PhraseDictionaryCompact& m_phraseDictionary;
+
+ // ***********************************************
+
+ const std::vector<FactorType>* m_input;
+ const std::vector<FactorType>* m_output;
+ const std::vector<float>* m_weight;
+
+ std::string m_separator;
+
+ // ***********************************************
+
+ unsigned GetSourceSymbolId(std::string& s);
+ std::string GetTargetSymbol(unsigned id) const;
+
+ size_t GetREncType(unsigned encodedSymbol);
+ size_t GetPREncType(unsigned encodedSymbol);
+
+ unsigned GetTranslation(unsigned srcIdx, size_t rank);
+
+ size_t GetMaxSourcePhraseLength();
+
+ unsigned DecodeREncSymbol1(unsigned encodedSymbol);
+ unsigned DecodeREncSymbol2Rank(unsigned encodedSymbol);
+ unsigned DecodeREncSymbol2Position(unsigned encodedSymbol);
+ unsigned DecodeREncSymbol3(unsigned encodedSymbol);
+
+ unsigned DecodePREncSymbol1(unsigned encodedSymbol);
+ int DecodePREncSymbol2Left(unsigned encodedSymbol);
+ int DecodePREncSymbol2Right(unsigned encodedSymbol);
+ unsigned DecodePREncSymbol2Rank(unsigned encodedSymbol);
+
+ std::string MakeSourceKey(std::string &);
+
+public:
+
+ PhraseDecoder(
+ PhraseDictionaryCompact &phraseDictionary,
+ const std::vector<FactorType>* input,
+ const std::vector<FactorType>* output,
+ size_t numScoreComponent,
+ const std::vector<float>* weight
+ );
+
+ ~PhraseDecoder();
+
+ size_t Load(std::FILE* in);
+
+ TargetPhraseVectorPtr CreateTargetPhraseCollection(const Phrase &sourcePhrase,
+ bool topLevel = false);
+
+ TargetPhraseVectorPtr DecodeCollection(TargetPhraseVectorPtr tpv,
+ BitWrapper<> &encodedBitStream,
+ const Phrase &sourcePhrase,
+ bool topLevel);
+
+ void PruneCache();
};
}
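The DecodeREncSymbol*/DecodePREncSymbol* getters declared above unpack typed fields from a single unsigned symbol. The actual bit layout is defined by the encoder and is not shown in this diff; the sketch below only illustrates the tag-plus-payload pattern, assuming a 2-bit type tag in the high bits (encode/typeOf/payloadOf are stand-ins, not Moses functions):

#include <cassert>
#include <cstdint>

static uint32_t encode(uint32_t type, uint32_t payload) {
  return (type << 30) | (payload & 0x3FFFFFFFu); // tag in top 2 bits
}
static uint32_t typeOf(uint32_t sym)    { return sym >> 30; }
static uint32_t payloadOf(uint32_t sym) { return sym & 0x3FFFFFFFu; }

int main() {
  uint32_t sym = encode(2, 12345);
  assert(typeOf(sym) == 2);
  assert(payloadOf(sym) == 12345);
  return 0;
}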
diff --git a/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp b/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp
index e863eb812..ff33f10a7 100644
--- a/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp
+++ b/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#include <fstream>
#include <string>
@@ -40,42 +40,35 @@ using namespace std;
namespace Moses
{
-
+
bool PhraseDictionaryCompact::InitDictionary()
{
const StaticData &staticData = StaticData::Instance();
m_weight = staticData.GetWeights(this);
-
+
std::string tFilePath = m_filePath;
-
+
std::string suffix = ".minphr";
- if(tFilePath.substr(tFilePath.length() - suffix.length(), suffix.length()) == suffix)
- {
- if(!FileExists(tFilePath))
- {
+ if(tFilePath.substr(tFilePath.length() - suffix.length(), suffix.length()) == suffix) {
+ if(!FileExists(tFilePath)) {
std::cerr << "Error: File " << tFilePath << " does not exit." << std::endl;
exit(1);
}
- }
- else
- {
- if(FileExists(tFilePath + suffix))
- {
+ } else {
+ if(FileExists(tFilePath + suffix)) {
tFilePath += suffix;
- }
- else
- {
- std::cerr << "Error: File " << tFilePath << ".minphr does not exit." << std::endl;
- exit(1);
+ } else {
+ std::cerr << "Error: File " << tFilePath << ".minphr does not exit." << std::endl;
+ exit(1);
}
}
m_phraseDecoder = new PhraseDecoder(*this, &m_input, &m_output,
- m_numScoreComponents, &m_weight);
+ m_numScoreComponents, &m_weight);
std::FILE* pFile = std::fopen(tFilePath.c_str() , "r");
-
+
size_t indexSize;
if(m_inMemory)
// Load source phrase index into memory
@@ -85,7 +78,7 @@ bool PhraseDictionaryCompact::InitDictionary()
indexSize = m_hash.LoadIndex(pFile);
size_t coderSize = m_phraseDecoder->Load(pFile);
-
+
size_t phraseSize;
if(m_inMemory)
// Load target phrase collections into memory
@@ -93,8 +86,8 @@ bool PhraseDictionaryCompact::InitDictionary()
else
// Keep target phrase collections on disk
phraseSize = m_targetPhrasesMapped.load(pFile, true);
-
- return indexSize && coderSize && phraseSize;
+
+ return indexSize && coderSize && phraseSize;
}
struct CompareTargetPhrase {
@@ -104,21 +97,22 @@ struct CompareTargetPhrase {
};
const TargetPhraseCollection*
-PhraseDictionaryCompact::GetTargetPhraseCollection(const Phrase &sourcePhrase) const {
-
+PhraseDictionaryCompact::GetTargetPhraseCollection(const Phrase &sourcePhrase) const
+{
+
// There is no such source phrase if the source phrase is longer than the longest
- // observed source phrase during compilation
+ // observed source phrase during compilation
if(sourcePhrase.GetSize() > m_phraseDecoder->GetMaxSourcePhraseLength())
return NULL;
// Retrieve target phrase collection from phrase table
TargetPhraseVectorPtr decodedPhraseColl
- = m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true);
-
+ = m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true);
+
if(decodedPhraseColl != NULL && decodedPhraseColl->size()) {
TargetPhraseVectorPtr tpv(new TargetPhraseVector(*decodedPhraseColl));
TargetPhraseCollection* phraseColl = new TargetPhraseCollection();
-
+
// Score phrases and if possible apply ttable_limit
TargetPhraseVector::iterator nth =
(m_tableLimit == 0 || tpv->size() < m_tableLimit) ?
@@ -129,21 +123,21 @@ PhraseDictionaryCompact::GetTargetPhraseCollection(const Phrase &sourcePhrase) c
cerr << *tp << endl;
phraseColl->Add(tp);
}
-
+
// Cache phrase pair for clean-up or retrieval with PREnc
const_cast<PhraseDictionaryCompact*>(this)->CacheForCleanup(phraseColl);
-
+
return phraseColl;
- }
- else
+ } else
return NULL;
}
TargetPhraseVectorPtr
-PhraseDictionaryCompact::GetTargetPhraseCollectionRaw(const Phrase &sourcePhrase) const {
+PhraseDictionaryCompact::GetTargetPhraseCollectionRaw(const Phrase &sourcePhrase) const
+{
// There is no such source phrase if the source phrase is longer than the longest
- // observed source phrase during compilation
+ // observed source phrase during compilation
if(sourcePhrase.GetSize() > m_phraseDecoder->GetMaxSourcePhraseLength())
return TargetPhraseVectorPtr();
@@ -151,42 +145,45 @@ PhraseDictionaryCompact::GetTargetPhraseCollectionRaw(const Phrase &sourcePhrase
return m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true);
}
-PhraseDictionaryCompact::~PhraseDictionaryCompact() {
+PhraseDictionaryCompact::~PhraseDictionaryCompact()
+{
if(m_phraseDecoder)
delete m_phraseDecoder;
}
//TO_STRING_BODY(PhraseDictionaryCompact)
-void PhraseDictionaryCompact::CacheForCleanup(TargetPhraseCollection* tpc) {
+void PhraseDictionaryCompact::CacheForCleanup(TargetPhraseCollection* tpc)
+{
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_sentenceMutex);
- PhraseCache &ref = m_sentenceCache[boost::this_thread::get_id()];
+ PhraseCache &ref = m_sentenceCache[boost::this_thread::get_id()];
#else
- PhraseCache &ref = m_sentenceCache;
+ PhraseCache &ref = m_sentenceCache;
#endif
ref.push_back(tpc);
}
void PhraseDictionaryCompact::AddEquivPhrase(const Phrase &source,
- const TargetPhrase &targetPhrase) { }
+ const TargetPhrase &targetPhrase) { }
-void PhraseDictionaryCompact::CleanUpAfterSentenceProcessing(const InputType &source) {
+void PhraseDictionaryCompact::CleanUpAfterSentenceProcessing(const InputType &source)
+{
if(!m_inMemory)
m_hash.KeepNLastRanges(0.01, 0.2);
-
+
m_phraseDecoder->PruneCache();
-
+
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_sentenceMutex);
- PhraseCache &ref = m_sentenceCache[boost::this_thread::get_id()];
+ PhraseCache &ref = m_sentenceCache[boost::this_thread::get_id()];
#else
- PhraseCache &ref = m_sentenceCache;
+ PhraseCache &ref = m_sentenceCache;
#endif
-
- for(PhraseCache::iterator it = ref.begin(); it != ref.end(); it++)
- delete *it;
-
+
+ for(PhraseCache::iterator it = ref.begin(); it != ref.end(); it++)
+ delete *it;
+
PhraseCache temp;
temp.swap(ref);
}
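The `PhraseCache temp; temp.swap(ref);` at the end of CleanUpAfterSentenceProcessing is the swap-with-empty idiom: unlike clear(), swapping with a temporary empty vector also releases the allocated storage when the temporary is destroyed. A small demonstration of the difference:

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> ref(1000, 42);
  ref.clear();
  std::printf("after clear(): size=%zu capacity=%zu\n",
              ref.size(), ref.capacity()); // capacity typically retained
  std::vector<int>().swap(ref); // swap with a temporary empty vector
  std::printf("after swap():  size=%zu capacity=%zu\n",
              ref.size(), ref.capacity()); // storage actually released
  return 0;
}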
diff --git a/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.h b/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.h
index 1eab58894..60969665a 100644
--- a/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.h
+++ b/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.h
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#ifndef moses_PhraseDictionaryCompact_h
#define moses_PhraseDictionaryCompact_h
@@ -50,7 +50,7 @@ protected:
bool m_inMemory;
bool m_useAlignmentInfo;
-
+
typedef std::vector<TargetPhraseCollection*> PhraseCache;
#ifdef WITH_THREADS
boost::mutex m_sentenceMutex;
@@ -59,23 +59,22 @@ protected:
typedef PhraseCache SentenceCache;
#endif
SentenceCache m_sentenceCache;
-
+
BlockHashIndex m_hash;
PhraseDecoder* m_phraseDecoder;
-
+
StringVector<unsigned char, size_t, MmapAllocator> m_targetPhrasesMapped;
StringVector<unsigned char, size_t, std::allocator> m_targetPhrasesMemory;
std::vector<float> m_weight;
public:
PhraseDictionaryCompact(const std::string &line)
- :PhraseDictionary("PhraseDictionaryCompact", line)
- ,m_inMemory(true)
- ,m_useAlignmentInfo(true)
- ,m_hash(10, 16)
- ,m_phraseDecoder(0)
- ,m_weight(0)
- {
+ :PhraseDictionary("PhraseDictionaryCompact", line)
+ ,m_inMemory(true)
+ ,m_useAlignmentInfo(true)
+ ,m_hash(10, 16)
+ ,m_phraseDecoder(0)
+ ,m_weight(0) {
}
~PhraseDictionaryCompact();
@@ -84,16 +83,15 @@ public:
const TargetPhraseCollection* GetTargetPhraseCollection(const Phrase &source) const;
TargetPhraseVectorPtr GetTargetPhraseCollectionRaw(const Phrase &source) const;
-
+
void AddEquivPhrase(const Phrase &source, const TargetPhrase &targetPhrase);
-
+
void CacheForCleanup(TargetPhraseCollection* tpc);
void CleanUpAfterSentenceProcessing(const InputType &source);
virtual ChartRuleLookupManager *CreateRuleLookupManager(
const InputType &,
- const ChartCellCollectionBase &)
- {
+ const ChartCellCollectionBase &) {
assert(false);
return 0;
}
diff --git a/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp b/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp
index c7bd81019..fc3b056c6 100644
--- a/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp
+++ b/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#include <cstdio>
@@ -29,17 +29,17 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
namespace Moses
{
-
+
bool operator<(const PackedItem &pi1, const PackedItem &pi2)
{
if(pi1.GetLine() < pi2.GetLine())
- return false;
+ return false;
return true;
}
-
+
std::string PhraseTableCreator::m_phraseStopSymbol = "__SPECIAL_STOP_SYMBOL__";
std::string PhraseTableCreator::m_separator = " ||| ";
-
+
PhraseTableCreator::PhraseTableCreator(std::string inPath,
std::string outPath,
std::string tempfilePath,
@@ -56,7 +56,7 @@ PhraseTableCreator::PhraseTableCreator(std::string inPath,
#ifdef WITH_THREADS
, size_t threads
#endif
- )
+ )
: m_inPath(inPath), m_outPath(outPath), m_tempfilePath(tempfilePath),
m_outFile(std::fopen(m_outPath.c_str(), "w")), m_numScoreComponent(numScoreComponent),
m_sortScoreIndex(sortScoreIndex), m_warnMe(warnMe),
@@ -64,81 +64,76 @@ PhraseTableCreator::PhraseTableCreator(std::string inPath,
m_useAlignmentInfo(useAlignmentInfo),
m_multipleScoreTrees(multipleScoreTrees),
m_quantize(quantize), m_maxRank(maxRank),
- #ifdef WITH_THREADS
+#ifdef WITH_THREADS
m_threads(threads),
m_srcHash(m_orderBits, m_fingerPrintBits, 1),
m_rnkHash(10, 24, m_threads),
- #else
+#else
m_srcHash(m_orderBits, m_fingerPrintBits),
m_rnkHash(m_orderBits, m_fingerPrintBits),
- #endif
+#endif
m_maxPhraseLength(0),
m_lastFlushedLine(-1), m_lastFlushedSourceNum(0),
m_lastFlushedSourcePhrase("")
{
PrintInfo();
-
+
AddTargetSymbolId(m_phraseStopSymbol);
-
+
size_t cur_pass = 1;
size_t all_passes = 2;
if(m_coding == PREnc)
all_passes = 3;
-
+
m_scoreCounters.resize(m_multipleScoreTrees ? m_numScoreComponent : 1);
for(std::vector<ScoreCounter*>::iterator it = m_scoreCounters.begin();
- it != m_scoreCounters.end(); it++)
+ it != m_scoreCounters.end(); it++)
*it = new ScoreCounter();
m_scoreTrees.resize(m_multipleScoreTrees ? m_numScoreComponent : 1);
-
+
// 0th pass
- if(m_coding == REnc)
- {
+ if(m_coding == REnc) {
size_t found = inPath.find_last_of("/\\");
std::string path;
- if(found != std::string::npos)
+ if(found != std::string::npos)
path = inPath.substr(0, found);
else
path = ".";
LoadLexicalTable(path + "/lex.f2e");
- }
- else if(m_coding == PREnc)
- {
+ } else if(m_coding == PREnc) {
std::cerr << "Pass " << cur_pass << "/" << all_passes << ": Creating hash function for rank assignment" << std::endl;
cur_pass++;
CreateRankHash();
}
-
+
// 1st pass
std::cerr << "Pass " << cur_pass << "/" << all_passes << ": Creating source phrase index + Encoding target phrases" << std::endl;
m_srcHash.BeginSave(m_outFile);
-
+
if(tempfilePath.size()) {
MmapAllocator<unsigned char> allocEncoded(util::FMakeTemp(tempfilePath));
m_encodedTargetPhrases = new StringVector<unsigned char, unsigned long, MmapAllocator>(allocEncoded);
- }
- else {
- m_encodedTargetPhrases = new StringVector<unsigned char, unsigned long, MmapAllocator>();
+ } else {
+ m_encodedTargetPhrases = new StringVector<unsigned char, unsigned long, MmapAllocator>();
}
EncodeTargetPhrases();
-
+
cur_pass++;
-
+
std::cerr << "Intermezzo: Calculating Huffman code sets" << std::endl;
CalcHuffmanCodes();
-
+
// 2nd pass
std::cerr << "Pass " << cur_pass << "/" << all_passes << ": Compressing target phrases" << std::endl;
-
+
if(tempfilePath.size()) {
MmapAllocator<unsigned char> allocCompressed(util::FMakeTemp(tempfilePath));
m_compressedTargetPhrases = new StringVector<unsigned char, unsigned long, MmapAllocator>(allocCompressed);
- }
- else {
+ } else {
m_compressedTargetPhrases = new StringVector<unsigned char, unsigned long, MmapAllocator>();
}
CompressTargetPhrases();
-
+
std::cerr << "Saving to " << m_outPath << std::endl;
Save();
std::cerr << "Done" << std::endl;
@@ -149,44 +144,43 @@ PhraseTableCreator::~PhraseTableCreator()
{
delete m_symbolTree;
if(m_useAlignmentInfo)
- delete m_alignTree;
+ delete m_alignTree;
for(size_t i = 0; i < m_scoreTrees.size(); i++) {
delete m_scoreTrees[i];
delete m_scoreCounters[i];
}
-
+
delete m_encodedTargetPhrases;
- delete m_compressedTargetPhrases;
+ delete m_compressedTargetPhrases;
}
void PhraseTableCreator::PrintInfo()
{
std::string encodings[3] = {"Huffman", "Huffman + REnc", "Huffman + PREnc"};
-
+
std::cerr << "Used options:" << std::endl;
std::cerr << "\tText phrase table will be read from: " << m_inPath << std::endl;
std::cerr << "\tOutput phrase table will be written to: " << m_outPath << std::endl;
std::cerr << "\tStep size for source landmark phrases: 2^" << m_orderBits << "=" << (1ul << m_orderBits) << std::endl;
std::cerr << "\tSource phrase fingerprint size: " << m_fingerPrintBits << " bits / P(fp)=" << (float(1)/(1ul << m_fingerPrintBits)) << std::endl;
std::cerr << "\tSelected target phrase encoding: " << encodings[m_coding] << std::endl;
- if(m_coding == PREnc)
- {
+ if(m_coding == PREnc) {
std::cerr << "\tMaxiumum allowed rank for PREnc: ";
if(!m_maxRank)
std::cerr << "unlimited" << std::endl;
else
- std::cerr << m_maxRank << std::endl;
+ std::cerr << m_maxRank << std::endl;
}
- std::cerr << "\tNumber of score components in phrase table: " << m_numScoreComponent << std::endl;
- std::cerr << "\tSingle Huffman code set for score components: " << (m_multipleScoreTrees ? "no" : "yes") << std::endl;
+ std::cerr << "\tNumber of score components in phrase table: " << m_numScoreComponent << std::endl;
+ std::cerr << "\tSingle Huffman code set for score components: " << (m_multipleScoreTrees ? "no" : "yes") << std::endl;
std::cerr << "\tUsing score quantization: ";
if(m_quantize)
std::cerr << m_quantize << " best" << std::endl;
else
std::cerr << "no" << std::endl;
- std::cerr << "\tExplicitly included alignment information: " << (m_useAlignmentInfo ? "yes" : "no") << std::endl;
-
-#ifdef WITH_THREADS
+ std::cerr << "\tExplicitly included alignment information: " << (m_useAlignmentInfo ? "yes" : "no") << std::endl;
+
+#ifdef WITH_THREADS
std::cerr << "\tRunning with " << m_threads << " threads" << std::endl;
#endif
std::cerr << std::endl;
@@ -200,22 +194,21 @@ void PhraseTableCreator::Save()
ThrowingFwrite(&m_useAlignmentInfo, sizeof(m_useAlignmentInfo), 1, m_outFile);
ThrowingFwrite(&m_maxRank, sizeof(m_maxRank), 1, m_outFile);
ThrowingFwrite(&m_maxPhraseLength, sizeof(m_maxPhraseLength), 1, m_outFile);
-
- if(m_coding == REnc)
- {
+
+ if(m_coding == REnc) {
// Save source language symbols for REnc
std::vector<std::string> temp1;
temp1.resize(m_sourceSymbolsMap.size());
for(boost::unordered_map<std::string, unsigned>::iterator it
= m_sourceSymbolsMap.begin(); it != m_sourceSymbolsMap.end(); it++)
- temp1[it->second] = it->first;
+ temp1[it->second] = it->first;
std::sort(temp1.begin(), temp1.end());
StringVector<unsigned char, unsigned, std::allocator> sourceSymbols;
for(std::vector<std::string>::iterator it = temp1.begin();
it != temp1.end(); it++)
- sourceSymbols.push_back(*it);
+ sourceSymbols.push_back(*it);
sourceSymbols.save(m_outFile);
-
+
// Save lexical translation table for REnc
size_t size = m_lexicalTableIndex.size();
ThrowingFwrite(&size, sizeof(size_t), 1, m_outFile);
@@ -224,95 +217,92 @@ void PhraseTableCreator::Save()
ThrowingFwrite(&size, sizeof(size_t), 1, m_outFile);
ThrowingFwrite(&m_lexicalTable[0], sizeof(SrcTrg), size, m_outFile);
}
-
+
// Save target language symbols
std::vector<std::string> temp2;
temp2.resize(m_targetSymbolsMap.size());
for(boost::unordered_map<std::string, unsigned>::iterator it
- = m_targetSymbolsMap.begin(); it != m_targetSymbolsMap.end(); it++)
+ = m_targetSymbolsMap.begin(); it != m_targetSymbolsMap.end(); it++)
temp2[it->second] = it->first;
StringVector<unsigned char, unsigned, std::allocator> targetSymbols;
for(std::vector<std::string>::iterator it = temp2.begin();
- it != temp2.end(); it++)
+ it != temp2.end(); it++)
targetSymbols.push_back(*it);
targetSymbols.save(m_outFile);
-
+
// Save Huffman codes for target language symbols
m_symbolTree->Save(m_outFile);
-
+
// Save number of Huffman code sets for scores and
// save Huffman code sets
ThrowingFwrite(&m_multipleScoreTrees, sizeof(m_multipleScoreTrees), 1, m_outFile);
size_t numScoreTrees = m_scoreTrees.size();
for(size_t i = 0; i < numScoreTrees; i++)
m_scoreTrees[i]->Save(m_outFile);
-
+
// Save Huffman codes for alignments
if(m_useAlignmentInfo)
m_alignTree->Save(m_outFile);
-
- // Save compressed target phrase collections
+
+ // Save compressed target phrase collections
m_compressedTargetPhrases->save(m_outFile);
}
-
+
void PhraseTableCreator::LoadLexicalTable(std::string filePath)
{
std::vector<SrcTrgProb> t_lexTable;
-
+
std::cerr << "Reading in lexical table for Rank Encoding" << std::endl;
std::ifstream lexIn(filePath.c_str(), std::ifstream::in);
std::string src, trg;
float prob;
-
+
// Reading in the translation probability lexicon
-
+
std::cerr << "\tLoading from " << filePath << std::endl;
- while(lexIn >> trg >> src >> prob)
- {
+ while(lexIn >> trg >> src >> prob) {
t_lexTable.push_back(SrcTrgProb(SrcTrgString(src, trg), prob));
AddSourceSymbolId(src);
AddTargetSymbolId(trg);
}
-
+
// Sorting lexicon by source words by lexicographical order, corresponding
// target words by decreasing probability.
-
+
std::cerr << "\tSorting according to translation rank" << std::endl;
std::sort(t_lexTable.begin(), t_lexTable.end(), SrcTrgProbSorter());
-
+
// Re-assigning source word ids in lexicographical order
-
+
std::vector<std::string> temp1;
temp1.resize(m_sourceSymbolsMap.size());
for(boost::unordered_map<std::string, unsigned>::iterator it
- = m_sourceSymbolsMap.begin(); it != m_sourceSymbolsMap.end(); it++)
+ = m_sourceSymbolsMap.begin(); it != m_sourceSymbolsMap.end(); it++)
temp1[it->second] = it->first;
-
+
std::sort(temp1.begin(), temp1.end());
-
+
for(size_t i = 0; i < temp1.size(); i++)
m_sourceSymbolsMap[temp1[i]] = i;
-
+
// Building the lexicon based on source and target word ids
-
+
std::string srcWord = "";
size_t srcIdx = 0;
for(std::vector<SrcTrgProb>::iterator it = t_lexTable.begin();
- it != t_lexTable.end(); it++)
- {
+ it != t_lexTable.end(); it++) {
// If we encounter a new source word
- if(it->first.first != srcWord)
- {
+ if(it->first.first != srcWord) {
srcIdx = GetSourceSymbolId(it->first.first);
-
+
// Store position of first translation
if(srcIdx >= m_lexicalTableIndex.size())
m_lexicalTableIndex.resize(srcIdx + 1);
m_lexicalTableIndex[srcIdx] = m_lexicalTable.size();
}
-
+
// Store pair of source word and target word
- size_t trgIdx = GetTargetSymbolId(it->first.second);
+ size_t trgIdx = GetTargetSymbolId(it->first.second);
m_lexicalTable.push_back(SrcTrg(srcIdx, trgIdx));
srcWord = it->first.first;
@@ -322,14 +312,13 @@ void PhraseTableCreator::LoadLexicalTable(std::string filePath)
}
void PhraseTableCreator::CreateRankHash()
-{
+{
InputFileStream inFile(m_inPath);
#ifdef WITH_THREADS
boost::thread_group threads;
- for (size_t i = 0; i < m_threads; ++i)
- {
- RankingTask* rt = new RankingTask(inFile, *this);
+ for (size_t i = 0; i < m_threads; ++i) {
+ RankingTask* rt = new RankingTask(inFile, *this);
threads.create_thread(*rt);
}
threads.join_all();
@@ -343,7 +332,7 @@ void PhraseTableCreator::CreateRankHash()
inline std::string PhraseTableCreator::MakeSourceKey(std::string &source)
{
- return source + m_separator;
+ return source + m_separator;
}
inline std::string PhraseTableCreator::MakeSourceTargetKey(std::string &source, std::string &target)
@@ -357,9 +346,8 @@ void PhraseTableCreator::EncodeTargetPhrases()
#ifdef WITH_THREADS
boost::thread_group threads;
- for (size_t i = 0; i < m_threads; ++i)
- {
- EncodingTask* et = new EncodingTask(inFile, *this);
+ for (size_t i = 0; i < m_threads; ++i) {
+ EncodingTask* et = new EncodingTask(inFile, *this);
threads.create_thread(*et);
}
threads.join_all();
@@ -368,17 +356,17 @@ void PhraseTableCreator::EncodeTargetPhrases()
(*et)();
delete et;
#endif
- FlushEncodedQueue(true);
+ FlushEncodedQueue(true);
}
void PhraseTableCreator::CompressTargetPhrases()
-{
+{
#ifdef WITH_THREADS
boost::thread_group threads;
for (size_t i = 0; i < m_threads; ++i) {
- CompressionTask* ct = new CompressionTask(*m_encodedTargetPhrases, *this);
- threads.create_thread(*ct);
+ CompressionTask* ct = new CompressionTask(*m_encodedTargetPhrases, *this);
+ threads.create_thread(*ct);
}
threads.join_all();
#else
@@ -392,29 +380,27 @@ void PhraseTableCreator::CompressTargetPhrases()
void PhraseTableCreator::CalcHuffmanCodes()
{
std::cerr << "\tCreating Huffman codes for " << m_symbolCounter.Size()
- << " target phrase symbols" << std::endl;
-
+ << " target phrase symbols" << std::endl;
+
m_symbolTree = new SymbolTree(m_symbolCounter.Begin(),
- m_symbolCounter.End());
-
+ m_symbolCounter.End());
+
std::vector<ScoreTree*>::iterator treeIt = m_scoreTrees.begin();
for(std::vector<ScoreCounter*>::iterator it = m_scoreCounters.begin();
- it != m_scoreCounters.end(); it++)
- {
+ it != m_scoreCounters.end(); it++) {
if(m_quantize)
- (*it)->Quantize(m_quantize);
-
+ (*it)->Quantize(m_quantize);
+
std::cerr << "\tCreating Huffman codes for " << (*it)->Size()
- << " scores" << std::endl;
-
+ << " scores" << std::endl;
+
*treeIt = new ScoreTree((*it)->Begin(), (*it)->End());
treeIt++;
}
-
- if(m_useAlignmentInfo)
- {
+
+ if(m_useAlignmentInfo) {
std::cerr << "\tCreating Huffman codes for " << m_alignCounter.Size()
- << " alignment points" << std::endl;
+ << " alignment points" << std::endl;
m_alignTree = new AlignTree(m_alignCounter.Begin(), m_alignCounter.End());
}
std::cerr << std::endl;
@@ -440,9 +426,9 @@ void PhraseTableCreator::AddTargetSymbolId(std::string& symbol)
unsigned PhraseTableCreator::GetSourceSymbolId(std::string& symbol)
{
boost::unordered_map<std::string, unsigned>::iterator it
- = m_sourceSymbolsMap.find(symbol);
-
- if(it != m_sourceSymbolsMap.end())
+ = m_sourceSymbolsMap.find(symbol);
+
+ if(it != m_sourceSymbolsMap.end())
return it->second;
else
return m_sourceSymbolsMap.size();
@@ -451,9 +437,9 @@ unsigned PhraseTableCreator::GetSourceSymbolId(std::string& symbol)
unsigned PhraseTableCreator::GetTargetSymbolId(std::string& symbol)
{
boost::unordered_map<std::string, unsigned>::iterator it
- = m_targetSymbolsMap.find(symbol);
-
- if(it != m_targetSymbolsMap.end())
+ = m_targetSymbolsMap.find(symbol);
+
+ if(it != m_targetSymbolsMap.end())
return it->second;
else
return m_targetSymbolsMap.size();
@@ -465,12 +451,11 @@ unsigned PhraseTableCreator::GetOrAddTargetSymbolId(std::string& symbol)
boost::mutex::scoped_lock lock(m_mutex);
#endif
boost::unordered_map<std::string, unsigned>::iterator it
- = m_targetSymbolsMap.find(symbol);
-
- if(it != m_targetSymbolsMap.end())
+ = m_targetSymbolsMap.find(symbol);
+
+ if(it != m_targetSymbolsMap.end())
return it->second;
- else
- {
+ else {
unsigned value = m_targetSymbolsMap.size();
m_targetSymbolsMap[symbol] = value;
return value;
@@ -481,12 +466,12 @@ unsigned PhraseTableCreator::GetRank(unsigned srcIdx, unsigned trgIdx)
{
size_t srcTrgIdx = m_lexicalTableIndex[srcIdx];
while(srcTrgIdx < m_lexicalTable.size()
- && srcIdx == m_lexicalTable[srcTrgIdx].first
- && m_lexicalTable[srcTrgIdx].second != trgIdx)
+ && srcIdx == m_lexicalTable[srcTrgIdx].first
+ && m_lexicalTable[srcTrgIdx].second != trgIdx)
srcTrgIdx++;
-
+
if(srcTrgIdx < m_lexicalTable.size()
- && m_lexicalTable[srcTrgIdx].second == trgIdx)
+ && m_lexicalTable[srcTrgIdx].second == trgIdx)
return srcTrgIdx - m_lexicalTableIndex[srcIdx];
else
return m_lexicalTable.size();
@@ -522,14 +507,14 @@ unsigned PhraseTableCreator::EncodePREncSymbol1(unsigned trgIdx)
unsigned PhraseTableCreator::EncodePREncSymbol2(int left, int right, unsigned rank)
{
// "left" and "right" must be smaller than 2^5
- // "rank" must be smaller than 2^19
+ // "rank" must be smaller than 2^19
left = left + 32;
right = right + 32;
-
+
assert(64 > left);
assert(64 > right);
assert(524288 > rank);
-
+
unsigned symbol = 0;
symbol |= 1 << 31;
symbol |= left << 25;
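  // A sketch of the 32-bit layout being assembled here, inferred from the
  // shifts and asserts above (the remaining fields follow the same pattern):
  //   bit  31     : 1 -- marks a rank-encoded (PREnc) symbol
  //   bits 25..30 : left offset,  stored with a +32 bias so [-32,31] fits
  //   bits 19..24 : right offset, same +32 bias
  //   bits  0..18 : rank, hence the rank < 2^19 (= 524288) assertion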
@@ -539,151 +524,135 @@ unsigned PhraseTableCreator::EncodePREncSymbol2(int left, int right, unsigned ra
}
void PhraseTableCreator::EncodeTargetPhraseNone(std::vector<std::string>& t,
- std::ostream& os)
+ std::ostream& os)
{
std::stringstream encodedTargetPhrase;
size_t j = 0;
- while(j < t.size())
- {
+ while(j < t.size()) {
unsigned targetSymbolId = GetOrAddTargetSymbolId(t[j]);
-
+
m_symbolCounter.Increase(targetSymbolId);
os.write((char*)&targetSymbolId, sizeof(targetSymbolId));
j++;
}
-
+
unsigned stopSymbolId = GetTargetSymbolId(m_phraseStopSymbol);
os.write((char*)&stopSymbolId, sizeof(stopSymbolId));
m_symbolCounter.Increase(stopSymbolId);
}
void PhraseTableCreator::EncodeTargetPhraseREnc(std::vector<std::string>& s,
- std::vector<std::string>& t,
- std::set<AlignPoint>& a,
- std::ostream& os)
-{
+ std::vector<std::string>& t,
+ std::set<AlignPoint>& a,
+ std::ostream& os)
+{
std::stringstream encodedTargetPhrase;
std::vector<std::vector<size_t> > a2(t.size());
for(std::set<AlignPoint>::iterator it = a.begin(); it != a.end(); it++)
a2[it->second].push_back(it->first);
- for(size_t i = 0; i < t.size(); i++)
- {
+ for(size_t i = 0; i < t.size(); i++) {
unsigned idxTarget = GetOrAddTargetSymbolId(t[i]);
unsigned encodedSymbol = -1;
-
+
unsigned bestSrcPos = s.size();
unsigned bestDiff = s.size();
unsigned bestRank = m_lexicalTable.size();
unsigned badRank = m_lexicalTable.size();
-
- for(std::vector<size_t>::iterator it = a2[i].begin(); it != a2[i].end(); it++)
- {
+
+ for(std::vector<size_t>::iterator it = a2[i].begin(); it != a2[i].end(); it++) {
unsigned idxSource = GetSourceSymbolId(s[*it]);
size_t r = GetRank(idxSource, idxTarget);
- if(r != badRank)
- {
- if(r < bestRank)
- {
+ if(r != badRank) {
+ if(r < bestRank) {
bestRank = r;
bestSrcPos = *it;
bestDiff = abs(*it-i);
- }
- else if(r == bestRank && unsigned(abs(*it-i)) < bestDiff)
- {
+ } else if(r == bestRank && unsigned(abs(*it-i)) < bestDiff) {
bestSrcPos = *it;
bestDiff = abs(*it-i);
}
}
}
-
- if(bestRank != badRank && bestSrcPos < s.size())
- {
+
+ if(bestRank != badRank && bestSrcPos < s.size()) {
if(bestSrcPos == i)
encodedSymbol = EncodeREncSymbol3(bestRank);
else
- encodedSymbol = EncodeREncSymbol2(bestSrcPos, bestRank);
+ encodedSymbol = EncodeREncSymbol2(bestSrcPos, bestRank);
a.erase(AlignPoint(bestSrcPos, i));
- }
- else
- {
+ } else {
encodedSymbol = EncodeREncSymbol1(idxTarget);
}
-
+
os.write((char*)&encodedSymbol, sizeof(encodedSymbol));
m_symbolCounter.Increase(encodedSymbol);
}
-
+
unsigned stopSymbolId = GetTargetSymbolId(m_phraseStopSymbol);
unsigned encodedSymbol = EncodeREncSymbol1(stopSymbolId);
os.write((char*)&encodedSymbol, sizeof(encodedSymbol));
- m_symbolCounter.Increase(encodedSymbol);
+ m_symbolCounter.Increase(encodedSymbol);
}
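// Per target word, REnc thus picks the cheapest of three symbol kinds:
// EncodeREncSymbol3(rank) when the best-ranked aligned source word sits at
// the same position i, EncodeREncSymbol2(pos, rank) when it sits elsewhere,
// and EncodeREncSymbol1(targetId) as the fallback when no ranked
// translation is known. Alignment points consumed by rank encoding are
// erased from a so EncodeAlignment() does not store them a second time.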
void PhraseTableCreator::EncodeTargetPhrasePREnc(std::vector<std::string>& s,
- std::vector<std::string>& t,
- std::set<AlignPoint>& a,
- size_t ownRank,
- std::ostream& os)
+ std::vector<std::string>& t,
+ std::set<AlignPoint>& a,
+ size_t ownRank,
+ std::ostream& os)
{
std::vector<unsigned> encodedSymbols(t.size());
std::vector<unsigned> encodedSymbolsLengths(t.size(), 0);
-
+
ConsistentPhrases cp(s.size(), t.size(), a);
while(!cp.Empty()) {
ConsistentPhrases::Phrase p = cp.Pop();
-
+
std::stringstream key1;
key1 << s[p.i];
for(int i = p.i+1; i < p.i+p.m; i++)
key1 << " " << s[i];
-
+
std::stringstream key2;
key2 << t[p.j];
for(int i = p.j+1; i < p.j+p.n; i++)
key2 << " " << t[i];
-
+
int rank = -1;
std::string key1Str = key1.str(), key2Str = key2.str();
size_t idx = m_rnkHash[MakeSourceTargetKey(key1Str, key2Str)];
if(idx != m_rnkHash.GetSize())
- rank = m_ranks[idx];
-
- if(rank >= 0 && (m_maxRank == 0 || unsigned(rank) < m_maxRank))
- {
- if(unsigned(p.m) != s.size() || unsigned(rank) < ownRank)
- {
+ rank = m_ranks[idx];
+
+ if(rank >= 0 && (m_maxRank == 0 || unsigned(rank) < m_maxRank)) {
+ if(unsigned(p.m) != s.size() || unsigned(rank) < ownRank) {
std::stringstream encodedSymbol;
encodedSymbols[p.j] = EncodePREncSymbol2(p.i-p.j, s.size()-(p.i+p.m), rank);
encodedSymbolsLengths[p.j] = p.n;
-
+
std::set<AlignPoint> tAlignment;
for(std::set<AlignPoint>::iterator it = a.begin();
- it != a.end(); it++)
+ it != a.end(); it++)
if(it->first < p.i || it->first >= p.i + p.m
- || it->second < p.j || it->second >= p.j + p.n)
- tAlignment.insert(*it);
+ || it->second < p.j || it->second >= p.j + p.n)
+ tAlignment.insert(*it);
a = tAlignment;
- cp.RemoveOverlap(p);
+ cp.RemoveOverlap(p);
}
}
}
-
+
std::stringstream encodedTargetPhrase;
-
+
size_t j = 0;
- while(j < t.size())
- {
- if(encodedSymbolsLengths[j] > 0)
- {
+ while(j < t.size()) {
+ if(encodedSymbolsLengths[j] > 0) {
unsigned encodedSymbol = encodedSymbols[j];
m_symbolCounter.Increase(encodedSymbol);
os.write((char*)&encodedSymbol, sizeof(encodedSymbol));
j += encodedSymbolsLengths[j];
- }
- else
- {
+ } else {
unsigned targetSymbolId = GetOrAddTargetSymbolId(t[j]);
unsigned encodedSymbol = EncodePREncSymbol1(targetSymbolId);
m_symbolCounter.Increase(encodedSymbol);
@@ -691,7 +660,7 @@ void PhraseTableCreator::EncodeTargetPhrasePREnc(std::vector<std::string>& s,
j++;
}
}
-
+
unsigned stopSymbolId = GetTargetSymbolId(m_phraseStopSymbol);
unsigned encodedSymbol = EncodePREncSymbol1(stopSymbolId);
os.write((char*)&encodedSymbol, sizeof(encodedSymbol));
@@ -702,9 +671,8 @@ void PhraseTableCreator::EncodeScores(std::vector<float>& scores, std::ostream&
{
size_t c = 0;
float score;
-
- while(c < scores.size())
- {
+
+ while(c < scores.size()) {
score = scores[c];
score = FloorScore(TransformScore(score));
os.write((char*)&score, sizeof(score));
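    // TransformScore()/FloorScore() are the Moses utility helpers (natural
    // log, clamped at a floor constant), so the score streams -- and the
    // Huffman score trees built over them -- hold log-domain values.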
@@ -714,11 +682,10 @@ void PhraseTableCreator::EncodeScores(std::vector<float>& scores, std::ostream&
}
void PhraseTableCreator::EncodeAlignment(std::set<AlignPoint>& alignment,
- std::ostream& os)
+ std::ostream& os)
{
for(std::set<AlignPoint>::iterator it = alignment.begin();
- it != alignment.end(); it++)
- {
+ it != alignment.end(); it++) {
os.write((char*)&(*it), sizeof(AlignPoint));
m_alignCounter.Increase(*it);
}
@@ -728,83 +695,77 @@ void PhraseTableCreator::EncodeAlignment(std::set<AlignPoint>& alignment,
}
std::string PhraseTableCreator::EncodeLine(std::vector<std::string>& tokens, size_t ownRank)
-{
+{
std::string sourcePhraseStr = tokens[0];
std::string targetPhraseStr = tokens[1];
std::string scoresStr = tokens[2];
-
+
std::string alignmentStr = "";
if(tokens.size() > 3)
alignmentStr = tokens[3];
-
+
std::vector<std::string> s = Tokenize(sourcePhraseStr);
-
+
size_t phraseLength = s.size();
if(m_maxPhraseLength < phraseLength)
m_maxPhraseLength = phraseLength;
-
+
std::vector<std::string> t = Tokenize(targetPhraseStr);
std::vector<float> scores = Tokenize<float>(scoresStr);
-
+
if(scores.size() != m_numScoreComponent) {
std::cerr << "Error: Wrong number of scores detected ("
- << scores.size() << " != " << m_numScoreComponent << ") :" << std::endl;
+ << scores.size() << " != " << m_numScoreComponent << ") :" << std::endl;
std::cerr << "Line: " << tokens[0] << " ||| " << tokens[1] << " ||| " << tokens[3] << " ..." << std::endl;
- abort();
+ abort();
}
-
+
std::set<AlignPoint> a;
- if(m_coding != None || m_useAlignmentInfo)
- {
+ if(m_coding != None || m_useAlignmentInfo) {
std::vector<size_t> positions = Tokenize<size_t>(alignmentStr, " \t-");
- for(size_t i = 0; i < positions.size(); i += 2)
- {
+ for(size_t i = 0; i < positions.size(); i += 2) {
a.insert(AlignPoint(positions[i], positions[i+1]));
}
}
-
+
std::stringstream encodedTargetPhrase;
-
- if(m_coding == PREnc)
- {
+
+ if(m_coding == PREnc) {
EncodeTargetPhrasePREnc(s, t, a, ownRank, encodedTargetPhrase);
+ } else if(m_coding == REnc) {
+ EncodeTargetPhraseREnc(s, t, a, encodedTargetPhrase);
+ } else {
+ EncodeTargetPhraseNone(t, encodedTargetPhrase);
}
- else if(m_coding == REnc)
- {
- EncodeTargetPhraseREnc(s, t, a, encodedTargetPhrase);
- }
- else
- {
- EncodeTargetPhraseNone(t, encodedTargetPhrase);
- }
-
+
EncodeScores(scores, encodedTargetPhrase);
-
+
if(m_useAlignmentInfo)
EncodeAlignment(a, encodedTargetPhrase);
-
+
return encodedTargetPhrase.str();
}
std::string PhraseTableCreator::CompressEncodedCollection(std::string encodedCollection)
-{
+{
enum EncodeState {
- ReadSymbol, ReadScore, ReadAlignment,
- EncodeSymbol, EncodeScore, EncodeAlignment };
+ ReadSymbol, ReadScore, ReadAlignment,
+ EncodeSymbol, EncodeScore, EncodeAlignment
+ };
EncodeState state = ReadSymbol;
unsigned phraseStopSymbolId;
if(m_coding == REnc)
phraseStopSymbolId = EncodeREncSymbol1(GetTargetSymbolId(m_phraseStopSymbol));
else if(m_coding == PREnc)
- phraseStopSymbolId = EncodePREncSymbol1(GetTargetSymbolId(m_phraseStopSymbol));
+ phraseStopSymbolId = EncodePREncSymbol1(GetTargetSymbolId(m_phraseStopSymbol));
else
phraseStopSymbolId = GetTargetSymbolId(m_phraseStopSymbol);
AlignPoint alignStopSymbol(-1, -1);
std::stringstream encodedStream(encodedCollection);
encodedStream.unsetf(std::ios::skipws);
-
+
std::string compressedEncodedCollection;
BitWrapper<> bitStream(compressedEncodedCollection);
@@ -812,56 +773,50 @@ std::string PhraseTableCreator::CompressEncodedCollection(std::string encodedCol
float score;
size_t currScore = 0;
AlignPoint alignPoint;
-
- while(encodedStream)
- {
- switch(state)
- {
- case ReadSymbol:
- encodedStream.read((char*) &symbol, sizeof(unsigned));
- state = EncodeSymbol;
- break;
- case ReadScore:
- if(currScore == m_numScoreComponent)
- {
- currScore = 0;
- if(m_useAlignmentInfo)
- state = ReadAlignment;
- else
- state = ReadSymbol;
- }
+
+ while(encodedStream) {
+ switch(state) {
+ case ReadSymbol:
+ encodedStream.read((char*) &symbol, sizeof(unsigned));
+ state = EncodeSymbol;
+ break;
+ case ReadScore:
+ if(currScore == m_numScoreComponent) {
+ currScore = 0;
+ if(m_useAlignmentInfo)
+ state = ReadAlignment;
else
- {
- encodedStream.read((char*) &score, sizeof(float));
- currScore++;
- state = EncodeScore;
- }
- break;
- case ReadAlignment:
- encodedStream.read((char*) &alignPoint, sizeof(AlignPoint));
- state = EncodeAlignment;
- break;
-
- case EncodeSymbol:
- state = (symbol == phraseStopSymbolId) ? ReadScore : ReadSymbol;
- m_symbolTree->Put(bitStream, symbol);
- break;
- case EncodeScore:
- {
- state = ReadScore;
- size_t idx = m_multipleScoreTrees ? currScore-1 : 0;
- if(m_quantize)
- score = m_scoreCounters[idx]->LowerBound(score);
- m_scoreTrees[idx]->Put(bitStream, score);
- }
- break;
- case EncodeAlignment:
- state = (alignPoint == alignStopSymbol) ? ReadSymbol : ReadAlignment;
- m_alignTree->Put(bitStream, alignPoint);
- break;
+ state = ReadSymbol;
+ } else {
+ encodedStream.read((char*) &score, sizeof(float));
+ currScore++;
+ state = EncodeScore;
+ }
+ break;
+ case ReadAlignment:
+ encodedStream.read((char*) &alignPoint, sizeof(AlignPoint));
+ state = EncodeAlignment;
+ break;
+
+ case EncodeSymbol:
+ state = (symbol == phraseStopSymbolId) ? ReadScore : ReadSymbol;
+ m_symbolTree->Put(bitStream, symbol);
+ break;
+ case EncodeScore: {
+ state = ReadScore;
+ size_t idx = m_multipleScoreTrees ? currScore-1 : 0;
+ if(m_quantize)
+ score = m_scoreCounters[idx]->LowerBound(score);
+ m_scoreTrees[idx]->Put(bitStream, score);
+ }
+ break;
+ case EncodeAlignment:
+ state = (alignPoint == alignStopSymbol) ? ReadSymbol : ReadAlignment;
+ m_alignTree->Put(bitStream, alignPoint);
+ break;
}
}
-
+
return compressedEncodedCollection;
}
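// For reference, the intermediate byte stream decoded by the state machine
// above is, per target phrase:
//
//   symbol* stopSymbol  score{m_numScoreComponent}  [alignPoint* (-1,-1)]
//
// with the alignment block present only when m_useAlignmentInfo is set.
// Each field is then re-emitted through its canonical Huffman tree into the
// output bit stream.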
@@ -873,32 +828,28 @@ void PhraseTableCreator::AddRankedLine(PackedItem& pi)
void PhraseTableCreator::FlushRankedQueue(bool force)
{
size_t step = 1ul << 10;
-
- while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine())
- {
+
+ while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) {
m_lastFlushedLine++;
PackedItem pi = m_queue.top();
m_queue.pop();
-
- if(m_lastSourceRange.size() == step)
- {
+
+ if(m_lastSourceRange.size() == step) {
m_rnkHash.AddRange(m_lastSourceRange);
m_lastSourceRange.clear();
}
-
- if(m_lastFlushedSourcePhrase != pi.GetSrc())
- {
- if(m_rankQueue.size()) {
+
+ if(m_lastFlushedSourcePhrase != pi.GetSrc()) {
+ if(m_rankQueue.size()) {
m_lastFlushedSourceNum++;
if(m_lastFlushedSourceNum % 100000 == 0) {
std::cerr << ".";
}
- if(m_lastFlushedSourceNum % 5000000 == 0)
- {
+ if(m_lastFlushedSourceNum % 5000000 == 0) {
std::cerr << "[" << m_lastFlushedSourceNum << "]" << std::endl;
}
-
+
m_ranks.resize(m_lastFlushedLine + 1);
int r = 0;
while(!m_rankQueue.empty()) {
@@ -907,33 +858,31 @@ void PhraseTableCreator::FlushRankedQueue(bool force)
}
}
}
-
+
m_lastSourceRange.push_back(pi.GetTrg());
-
+
m_rankQueue.push(std::make_pair(pi.GetScore(), pi.GetLine()));
m_lastFlushedSourcePhrase = pi.GetSrc();
}
-
- if(force)
- {
+
+ if(force) {
m_rnkHash.AddRange(m_lastSourceRange);
m_lastSourceRange.clear();
#ifdef WITH_THREADS
m_rnkHash.WaitAll();
#endif
-
+
m_ranks.resize(m_lastFlushedLine + 1);
int r = 0;
- while(!m_rankQueue.empty())
- {
+ while(!m_rankQueue.empty()) {
m_ranks[m_rankQueue.top().second] = r++;
m_rankQueue.pop();
}
m_lastFlushedLine = -1;
m_lastFlushedSourceNum = 0;
-
+
std::cerr << std::endl << std::endl;
}
}
@@ -946,74 +895,65 @@ void PhraseTableCreator::AddEncodedLine(PackedItem& pi)
void PhraseTableCreator::FlushEncodedQueue(bool force)
{
- while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine())
- {
+ while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) {
PackedItem pi = m_queue.top();
m_queue.pop();
m_lastFlushedLine++;
-
- if(m_lastFlushedSourcePhrase != pi.GetSrc())
- {
- if(m_lastCollection.size())
- {
+
+ if(m_lastFlushedSourcePhrase != pi.GetSrc()) {
+ if(m_lastCollection.size()) {
std::stringstream targetPhraseCollection;
for(std::vector<std::string>::iterator it =
- m_lastCollection.begin(); it != m_lastCollection.end(); it++)
+ m_lastCollection.begin(); it != m_lastCollection.end(); it++)
targetPhraseCollection << *it;
-
- m_lastSourceRange.push_back(MakeSourceKey(m_lastFlushedSourcePhrase));
+
+ m_lastSourceRange.push_back(MakeSourceKey(m_lastFlushedSourcePhrase));
m_encodedTargetPhrases->push_back(targetPhraseCollection.str());
-
+
m_lastFlushedSourceNum++;
if(m_lastFlushedSourceNum % 100000 == 0)
std::cerr << ".";
if(m_lastFlushedSourceNum % 5000000 == 0)
std::cerr << "[" << m_lastFlushedSourceNum << "]" << std::endl;
-
+
m_lastCollection.clear();
}
}
-
- if(m_lastSourceRange.size() == (1ul << m_orderBits))
- {
+
+ if(m_lastSourceRange.size() == (1ul << m_orderBits)) {
m_srcHash.AddRange(m_lastSourceRange);
m_srcHash.SaveLastRange();
m_srcHash.DropLastRange();
m_lastSourceRange.clear();
}
-
+
m_lastFlushedSourcePhrase = pi.GetSrc();
- if(m_coding == PREnc)
- {
+ if(m_coding == PREnc) {
if(m_lastCollection.size() <= pi.GetRank())
m_lastCollection.resize(pi.GetRank() + 1);
m_lastCollection[pi.GetRank()] = pi.GetTrg();
- }
- else
- {
+ } else {
m_lastCollection.push_back(pi.GetTrg());
}
}
-
- if(force)
- {
+
+ if(force) {
if(!m_lastSourceRange.size() || m_lastSourceRange.back() != m_lastFlushedSourcePhrase)
m_lastSourceRange.push_back(MakeSourceKey(m_lastFlushedSourcePhrase));
-
- if(m_lastCollection.size())
- {
+
+ if(m_lastCollection.size()) {
std::stringstream targetPhraseCollection;
for(std::vector<std::string>::iterator it =
- m_lastCollection.begin(); it != m_lastCollection.end(); it++)
+ m_lastCollection.begin(); it != m_lastCollection.end(); it++)
targetPhraseCollection << *it;
-
+
m_encodedTargetPhrases->push_back(targetPhraseCollection.str());
m_lastCollection.clear();
}
-
+
m_srcHash.AddRange(m_lastSourceRange);
m_lastSourceRange.clear();
-
+
#ifdef WITH_THREADS
m_srcHash.WaitAll();
#endif
@@ -1021,7 +961,7 @@ void PhraseTableCreator::FlushEncodedQueue(bool force)
m_srcHash.SaveLastRange();
m_srcHash.DropLastRange();
m_srcHash.FinalizeSave();
-
+
m_lastFlushedLine = -1;
m_lastFlushedSourceNum = 0;
@@ -1031,30 +971,27 @@ void PhraseTableCreator::FlushEncodedQueue(bool force)
void PhraseTableCreator::AddCompressedCollection(PackedItem& pi)
{
- m_queue.push(pi);
+ m_queue.push(pi);
}
void PhraseTableCreator::FlushCompressedQueue(bool force)
{
- if(force || m_queue.size() > 10000)
- {
- while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine())
- {
+ if(force || m_queue.size() > 10000) {
+ while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) {
PackedItem pi = m_queue.top();
m_queue.pop();
m_lastFlushedLine++;
-
+
m_compressedTargetPhrases->push_back(pi.GetTrg());
-
+
if((pi.GetLine()+1) % 100000 == 0)
std::cerr << ".";
if((pi.GetLine()+1) % 5000000 == 0)
std::cerr << "[" << (pi.GetLine()+1) << "]" << std::endl;
}
}
-
- if(force)
- {
+
+ if(force) {
m_lastFlushedLine = -1;
std::cerr << std::endl << std::endl;
}
@@ -1070,38 +1007,35 @@ boost::mutex RankingTask::m_fileMutex;
RankingTask::RankingTask(InputFileStream& inFile, PhraseTableCreator& creator)
: m_inFile(inFile), m_creator(creator) {}
-
+
void RankingTask::operator()()
{
size_t lineNum = 0;
-
+
std::vector<std::string> lines;
size_t max_lines = 1000;
lines.reserve(max_lines);
-
+
{
#ifdef WITH_THREADS
- boost::mutex::scoped_lock lock(m_fileMutex);
+ boost::mutex::scoped_lock lock(m_fileMutex);
#endif
- std::string line;
- while(lines.size() < max_lines && std::getline(m_inFile, line))
- lines.push_back(line);
- lineNum = m_lineNum;
- m_lineNum += lines.size();
+ std::string line;
+ while(lines.size() < max_lines && std::getline(m_inFile, line))
+ lines.push_back(line);
+ lineNum = m_lineNum;
+ m_lineNum += lines.size();
}
-
+
std::vector<PackedItem> result;
result.reserve(max_lines);
-
- while(lines.size())
- {
- for(size_t i = 0; i < lines.size(); i++)
- {
+
+ while(lines.size()) {
+ for(size_t i = 0; i < lines.size(); i++) {
std::vector<std::string> tokens;
Moses::TokenizeMultiCharSeparator(tokens, lines[i], m_creator.m_separator);
-
- if(tokens.size() < 3)
- {
+
+ if(tokens.size() < 3) {
std::cerr << "Error: It seems the following line has a wrong format:" << std::endl;
std::cerr << "Line " << i << ": " << lines[i] << std::endl;
abort();
@@ -1112,38 +1046,38 @@ void RankingTask::operator()()
std::cerr << "Better use -encoding None or disable this warning with -no-warnings ." << std::endl;
std::cerr << "Line " << i << ": " << lines[i] << std::endl;
}
-
+
std::vector<float> scores = Tokenize<float>(tokens[2]);
if(scores.size() != m_creator.m_numScoreComponent) {
std::cerr << "Error: It seems the following line has a wrong number of scores ("
- << scores.size() << " != " << m_creator.m_numScoreComponent << ") :" << std::endl;
+ << scores.size() << " != " << m_creator.m_numScoreComponent << ") :" << std::endl;
std::cerr << "Line " << i << ": " << lines[i] << std::endl;
- abort();
+ abort();
}
-
+
float sortScore = scores[m_creator.m_sortScoreIndex];
-
+
std::string key1 = m_creator.MakeSourceKey(tokens[0]);
std::string key2 = m_creator.MakeSourceTargetKey(tokens[0], tokens[1]);
-
+
PackedItem packedItem(lineNum + i, key1, key2, 0, sortScore);
result.push_back(packedItem);
}
lines.clear();
-
+
{
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_mutex);
#endif
- for(size_t i = 0; i < result.size(); i++)
+ for(size_t i = 0; i < result.size(); i++)
m_creator.AddRankedLine(result[i]);
- m_creator.FlushRankedQueue();
+ m_creator.FlushRankedQueue();
}
-
+
result.clear();
lines.reserve(max_lines);
result.reserve(max_lines);
-
+
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_fileMutex);
#endif
@@ -1163,15 +1097,15 @@ boost::mutex EncodingTask::m_fileMutex;
EncodingTask::EncodingTask(InputFileStream& inFile, PhraseTableCreator& creator)
: m_inFile(inFile), m_creator(creator) {}
-
+
void EncodingTask::operator()()
{
size_t lineNum = 0;
-
+
std::vector<std::string> lines;
size_t max_lines = 1000;
lines.reserve(max_lines);
-
+
{
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_fileMutex);
@@ -1182,19 +1116,16 @@ void EncodingTask::operator()()
lineNum = m_lineNum;
m_lineNum += lines.size();
}
-
+
std::vector<PackedItem> result;
result.reserve(max_lines);
-
- while(lines.size())
- {
- for(size_t i = 0; i < lines.size(); i++)
- {
+
+ while(lines.size()) {
+ for(size_t i = 0; i < lines.size(); i++) {
std::vector<std::string> tokens;
Moses::TokenizeMultiCharSeparator(tokens, lines[i], m_creator.m_separator);
-
- if(tokens.size() < 3)
- {
+
+ if(tokens.size() < 3) {
std::cerr << "Error: It seems the following line has a wrong format:" << std::endl;
std::cerr << "Line " << i << ": " << lines[i] << std::endl;
abort();
@@ -1207,31 +1138,31 @@ void EncodingTask::operator()()
std::cerr << "Better use -encoding None or disable this warning with -no-warnings." << std::endl;
std::cerr << "Line " << i << ": " << lines[i] << std::endl;
}
-
+
size_t ownRank = 0;
if(m_creator.m_coding == PhraseTableCreator::PREnc)
ownRank = m_creator.m_ranks[lineNum + i];
-
+
std::string encodedLine = m_creator.EncodeLine(tokens, ownRank);
-
+
PackedItem packedItem(lineNum + i, tokens[0], encodedLine, ownRank);
result.push_back(packedItem);
}
lines.clear();
-
+
{
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_mutex);
#endif
- for(size_t i = 0; i < result.size(); i++)
+ for(size_t i = 0; i < result.size(); i++)
m_creator.AddEncodedLine(result[i]);
- m_creator.FlushEncodedQueue();
+ m_creator.FlushEncodedQueue();
}
-
+
result.clear();
lines.reserve(max_lines);
result.reserve(max_lines);
-
+
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_fileMutex);
#endif
@@ -1251,10 +1182,10 @@ boost::mutex CompressionTask::m_mutex;
#endif
CompressionTask::CompressionTask(StringVector<unsigned char, unsigned long,
- MmapAllocator>& encodedCollections,
- PhraseTableCreator& creator)
+ MmapAllocator>& encodedCollections,
+ PhraseTableCreator& creator)
: m_encodedCollections(encodedCollections), m_creator(creator) {}
-
+
void CompressionTask::operator()()
{
size_t collectionNum;
@@ -1265,12 +1196,11 @@ void CompressionTask::operator()()
collectionNum = m_collectionNum;
m_collectionNum++;
}
-
- while(collectionNum < m_encodedCollections.size())
- {
+
+ while(collectionNum < m_encodedCollections.size()) {
std::string collection = m_encodedCollections[collectionNum];
std::string compressedCollection
- = m_creator.CompressEncodedCollection(collection);
+ = m_creator.CompressEncodedCollection(collection);
std::string dummy;
PackedItem packedItem(collectionNum, dummy, compressedCollection, 0);
@@ -1280,29 +1210,44 @@ void CompressionTask::operator()()
#endif
m_creator.AddCompressedCollection(packedItem);
m_creator.FlushCompressedQueue();
-
- collectionNum = m_collectionNum;
- m_collectionNum++;
+
+ collectionNum = m_collectionNum;
+ m_collectionNum++;
}
}
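// CompressionTask implements a simple shared-counter work queue: each worker
// claims the next collection index under m_mutex and loops until the static
// m_collectionNum runs past m_encodedCollections.size(). RankingTask and
// EncodingTask above use the same idea with m_lineNum, but grab lines in
// batches of 1000 under m_fileMutex.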
//****************************************************************************//
PackedItem::PackedItem(long line, std::string sourcePhrase,
- std::string packedTargetPhrase, size_t rank,
- float score)
+ std::string packedTargetPhrase, size_t rank,
+ float score)
: m_line(line), m_sourcePhrase(sourcePhrase),
m_packedTargetPhrase(packedTargetPhrase), m_rank(rank),
m_score(score) {}
-long PackedItem::GetLine() const { return m_line; }
+long PackedItem::GetLine() const
+{
+ return m_line;
+}
-const std::string& PackedItem::GetSrc() const { return m_sourcePhrase; }
+const std::string& PackedItem::GetSrc() const
+{
+ return m_sourcePhrase;
+}
-const std::string& PackedItem::GetTrg() const { return m_packedTargetPhrase; }
+const std::string& PackedItem::GetTrg() const
+{
+ return m_packedTargetPhrase;
+}
-size_t PackedItem::GetRank() const { return m_rank; }
+size_t PackedItem::GetRank() const
+{
+ return m_rank;
+}
-float PackedItem::GetScore() const { return m_score; }
+float PackedItem::GetScore() const
+{
+ return m_score;
+}
}
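For orientation, a minimal sketch of driving the class this file defines: the
constructor runs the entire pipeline (rank hash, encoding, Huffman codes,
compression) and finishes by calling Save(), so constructing the object is the
whole job. The file names and option values below are hypothetical; in the
Moses distribution the processPhraseTableMin tool is the usual entry point.

    #include "moses/TranslationModel/CompactPT/PhraseTableCreator.h"

    int main()
    {
      // "" for tempfilePath keeps the intermediate string vectors on the
      // default allocator instead of a named temp file.
      Moses::PhraseTableCreator creator(
        "phrase-table",                   // text phrase table (input)
        "phrase-table.minphr",            // compact phrase table (output)
        "",                               // tempfilePath
        5,                                // numScoreComponent
        2,                                // sortScoreIndex
        Moses::PhraseTableCreator::PREnc, // coding
        10, 16,                           // orderBits, fingerPrintBits
        true);                            // useAlignmentInfo
      return 0;                           // everything already ran in the ctor
    }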
diff --git a/moses/TranslationModel/CompactPT/PhraseTableCreator.h b/moses/TranslationModel/CompactPT/PhraseTableCreator.h
index ded3a84eb..fd5fc1581 100644
--- a/moses/TranslationModel/CompactPT/PhraseTableCreator.h
+++ b/moses/TranslationModel/CompactPT/PhraseTableCreator.h
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#ifndef moses_PhraseTableCreator_h
#define moses_PhraseTableCreator_h
@@ -40,386 +40,371 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
namespace Moses
{
-
+
typedef std::pair<unsigned char, unsigned char> AlignPoint;
-template <typename DataType>
+template <typename DataType>
class Counter
{
- public:
- typedef boost::unordered_map<DataType, size_t> FreqMap;
- typedef typename FreqMap::iterator iterator;
- typedef typename FreqMap::mapped_type mapped_type;
- typedef typename FreqMap::value_type value_type;
-
- private:
-#ifdef WITH_THREADS
- boost::mutex m_mutex;
+public:
+ typedef boost::unordered_map<DataType, size_t> FreqMap;
+ typedef typename FreqMap::iterator iterator;
+ typedef typename FreqMap::mapped_type mapped_type;
+ typedef typename FreqMap::value_type value_type;
+
+private:
+#ifdef WITH_THREADS
+ boost::mutex m_mutex;
#endif
- FreqMap m_freqMap;
- size_t m_maxSize;
- std::vector<DataType> m_bestVec;
-
- struct FreqSorter
- {
- bool operator()(const value_type& a, const value_type& b) const
- {
- if(a.second > b.second)
- return true;
- // Check impact on translation quality!
- if(a.second == b.second && a.first > b.first)
- return true;
- return false;
- }
- };
-
- public:
- Counter() : m_maxSize(0) {}
-
- iterator Begin()
- {
- return m_freqMap.begin();
- }
-
- iterator End()
- {
- return m_freqMap.end();
+ FreqMap m_freqMap;
+ size_t m_maxSize;
+ std::vector<DataType> m_bestVec;
+
+ struct FreqSorter {
+ bool operator()(const value_type& a, const value_type& b) const {
+ if(a.second > b.second)
+ return true;
+ // Check impact on translation quality!
+ if(a.second == b.second && a.first > b.first)
+ return true;
+ return false;
}
-
- void Increase(DataType data)
- {
+ };
+
+public:
+ Counter() : m_maxSize(0) {}
+
+ iterator Begin() {
+ return m_freqMap.begin();
+ }
+
+ iterator End() {
+ return m_freqMap.end();
+ }
+
+ void Increase(DataType data) {
#ifdef WITH_THREADS
- boost::mutex::scoped_lock lock(m_mutex);
+ boost::mutex::scoped_lock lock(m_mutex);
#endif
- m_freqMap[data]++;
- }
-
- void IncreaseBy(DataType data, size_t num)
- {
+ m_freqMap[data]++;
+ }
+
+ void IncreaseBy(DataType data, size_t num) {
#ifdef WITH_THREADS
- boost::mutex::scoped_lock lock(m_mutex);
+ boost::mutex::scoped_lock lock(m_mutex);
#endif
- m_freqMap[data] += num;
- }
-
- mapped_type& operator[](DataType data)
- {
- return m_freqMap[data];
- }
-
- size_t Size()
- {
+ m_freqMap[data] += num;
+ }
+
+ mapped_type& operator[](DataType data) {
+ return m_freqMap[data];
+ }
+
+ size_t Size() {
#ifdef WITH_THREADS
- boost::mutex::scoped_lock lock(m_mutex);
+ boost::mutex::scoped_lock lock(m_mutex);
#endif
- return m_freqMap.size();
- }
-
- void Quantize(size_t maxSize)
- {
+ return m_freqMap.size();
+ }
+
+ void Quantize(size_t maxSize) {
#ifdef WITH_THREADS
- boost::mutex::scoped_lock lock(m_mutex);
+ boost::mutex::scoped_lock lock(m_mutex);
#endif
- m_maxSize = maxSize;
- std::vector<std::pair<DataType, mapped_type> > freqVec;
- freqVec.insert(freqVec.begin(), m_freqMap.begin(), m_freqMap.end());
- std::sort(freqVec.begin(), freqVec.end(), FreqSorter());
-
- for(size_t i = 0; i < freqVec.size() && i < m_maxSize; i++)
- m_bestVec.push_back(freqVec[i].first);
-
- std::sort(m_bestVec.begin(), m_bestVec.end());
-
- FreqMap t_freqMap;
- for(typename std::vector<std::pair<DataType, mapped_type> >::iterator it
- = freqVec.begin(); it != freqVec.end(); it++)
- {
- DataType closest = LowerBound(it->first);
- t_freqMap[closest] += it->second;
- }
-
- m_freqMap.swap(t_freqMap);
+ m_maxSize = maxSize;
+ std::vector<std::pair<DataType, mapped_type> > freqVec;
+ freqVec.insert(freqVec.begin(), m_freqMap.begin(), m_freqMap.end());
+ std::sort(freqVec.begin(), freqVec.end(), FreqSorter());
+
+ for(size_t i = 0; i < freqVec.size() && i < m_maxSize; i++)
+ m_bestVec.push_back(freqVec[i].first);
+
+ std::sort(m_bestVec.begin(), m_bestVec.end());
+
+ FreqMap t_freqMap;
+ for(typename std::vector<std::pair<DataType, mapped_type> >::iterator it
+ = freqVec.begin(); it != freqVec.end(); it++) {
+ DataType closest = LowerBound(it->first);
+ t_freqMap[closest] += it->second;
}
-
- void Clear()
- {
+
+ m_freqMap.swap(t_freqMap);
+ }
+
+ void Clear() {
#ifdef WITH_THREADS
- boost::mutex::scoped_lock lock(m_mutex);
+ boost::mutex::scoped_lock lock(m_mutex);
#endif
- m_freqMap.clear();
- }
-
- DataType LowerBound(DataType data)
- {
- if(m_maxSize == 0 || m_bestVec.size() == 0)
- return data;
+ m_freqMap.clear();
+ }
+
+ DataType LowerBound(DataType data) {
+ if(m_maxSize == 0 || m_bestVec.size() == 0)
+ return data;
+ else {
+ typename std::vector<DataType>::iterator it
+ = std::lower_bound(m_bestVec.begin(), m_bestVec.end(), data);
+ if(it != m_bestVec.end())
+ return *it;
else
- {
- typename std::vector<DataType>::iterator it
- = std::lower_bound(m_bestVec.begin(), m_bestVec.end(), data);
- if(it != m_bestVec.end())
- return *it;
- else
- return m_bestVec.back();
- }
+ return m_bestVec.back();
}
+ }
};
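// A worked example of the quantization above, under hypothetical counts:
// after Increase() calls yielding {0.1f:5, 0.2f:3, 0.7f:1}, Quantize(2)
// keeps the two most frequent values {0.1f, 0.2f} (sorted ascending in
// m_bestVec) and folds 0.7f's count into LowerBound(0.7f). LowerBound()
// returns the first kept value >= data via std::lower_bound, or the largest
// kept value when data exceeds them all -- here 0.2f.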
-
+
class PackedItem
{
- private:
- long m_line;
- std::string m_sourcePhrase;
- std::string m_packedTargetPhrase;
- size_t m_rank;
- float m_score;
-
- public:
- PackedItem(long line, std::string sourcePhrase,
- std::string packedTargetPhrase, size_t rank,
- float m_score = 0);
-
- long GetLine() const;
- const std::string& GetSrc() const;
- const std::string& GetTrg() const;
- size_t GetRank() const;
- float GetScore() const;
+private:
+ long m_line;
+ std::string m_sourcePhrase;
+ std::string m_packedTargetPhrase;
+ size_t m_rank;
+ float m_score;
+
+public:
+ PackedItem(long line, std::string sourcePhrase,
+ std::string packedTargetPhrase, size_t rank,
+             float score = 0);
+
+ long GetLine() const;
+ const std::string& GetSrc() const;
+ const std::string& GetTrg() const;
+ size_t GetRank() const;
+ float GetScore() const;
};
bool operator<(const PackedItem &pi1, const PackedItem &pi2);
class PhraseTableCreator
{
- public:
- enum Coding { None, REnc, PREnc };
-
- private:
- std::string m_inPath;
- std::string m_outPath;
- std::string m_tempfilePath;
-
- std::FILE* m_outFile;
-
- size_t m_numScoreComponent;
- size_t m_sortScoreIndex;
- size_t m_warnMe;
-
- Coding m_coding;
- size_t m_orderBits;
- size_t m_fingerPrintBits;
- bool m_useAlignmentInfo;
- bool m_multipleScoreTrees;
- size_t m_quantize;
- size_t m_maxRank;
-
- static std::string m_phraseStopSymbol;
- static std::string m_separator;
-
+public:
+ enum Coding { None, REnc, PREnc };
+
+private:
+ std::string m_inPath;
+ std::string m_outPath;
+ std::string m_tempfilePath;
+
+ std::FILE* m_outFile;
+
+ size_t m_numScoreComponent;
+ size_t m_sortScoreIndex;
+ size_t m_warnMe;
+
+ Coding m_coding;
+ size_t m_orderBits;
+ size_t m_fingerPrintBits;
+ bool m_useAlignmentInfo;
+ bool m_multipleScoreTrees;
+ size_t m_quantize;
+ size_t m_maxRank;
+
+ static std::string m_phraseStopSymbol;
+ static std::string m_separator;
+
#ifdef WITH_THREADS
- size_t m_threads;
- boost::mutex m_mutex;
+ size_t m_threads;
+ boost::mutex m_mutex;
#endif
-
- BlockHashIndex m_srcHash;
- BlockHashIndex m_rnkHash;
-
- size_t m_maxPhraseLength;
-
- std::vector<unsigned> m_ranks;
-
- typedef std::pair<unsigned, unsigned> SrcTrg;
- typedef std::pair<std::string, std::string> SrcTrgString;
- typedef std::pair<SrcTrgString, float> SrcTrgProb;
-
- struct SrcTrgProbSorter
- {
- bool operator()(const SrcTrgProb& a, const SrcTrgProb& b) const
- {
- if(a.first.first < b.first.first)
- return true;
-
- if(a.first.first == b.first.first && a.second > b.second)
- return true;
-
- if(a.first.first == b.first.first
- && a.second == b.second
- && a.first.second < b.first.second)
- return true;
-
- return false;
- }
- };
-
- std::vector<size_t> m_lexicalTableIndex;
- std::vector<SrcTrg> m_lexicalTable;
-
- StringVector<unsigned char, unsigned long, MmapAllocator>*
- m_encodedTargetPhrases;
-
- StringVector<unsigned char, unsigned long, MmapAllocator>*
- m_compressedTargetPhrases;
-
- boost::unordered_map<std::string, unsigned> m_targetSymbolsMap;
- boost::unordered_map<std::string, unsigned> m_sourceSymbolsMap;
-
- typedef Counter<unsigned> SymbolCounter;
- typedef Counter<float> ScoreCounter;
- typedef Counter<AlignPoint> AlignCounter;
-
- typedef CanonicalHuffman<unsigned> SymbolTree;
- typedef CanonicalHuffman<float> ScoreTree;
- typedef CanonicalHuffman<AlignPoint> AlignTree;
-
- SymbolCounter m_symbolCounter;
- SymbolTree* m_symbolTree;
-
- AlignCounter m_alignCounter;
- AlignTree* m_alignTree;
-
- std::vector<ScoreCounter*> m_scoreCounters;
- std::vector<ScoreTree*> m_scoreTrees;
-
- std::priority_queue<PackedItem> m_queue;
- long m_lastFlushedLine;
- long m_lastFlushedSourceNum;
- std::string m_lastFlushedSourcePhrase;
- std::vector<std::string> m_lastSourceRange;
- std::priority_queue<std::pair<float, size_t> > m_rankQueue;
- std::vector<std::string> m_lastCollection;
-
- void Save();
- void PrintInfo();
-
- void AddSourceSymbolId(std::string& symbol);
- unsigned GetSourceSymbolId(std::string& symbol);
-
- void AddTargetSymbolId(std::string& symbol);
- unsigned GetTargetSymbolId(std::string& symbol);
- unsigned GetOrAddTargetSymbolId(std::string& symbol);
-
- unsigned GetRank(unsigned srcIdx, unsigned trgIdx);
-
- unsigned EncodeREncSymbol1(unsigned symbol);
- unsigned EncodeREncSymbol2(unsigned position, unsigned rank);
- unsigned EncodeREncSymbol3(unsigned rank);
-
- unsigned EncodePREncSymbol1(unsigned symbol);
- unsigned EncodePREncSymbol2(int lOff, int rOff, unsigned rank);
-
- void EncodeTargetPhraseNone(std::vector<std::string>& t,
- std::ostream& os);
-
- void EncodeTargetPhraseREnc(std::vector<std::string>& s,
- std::vector<std::string>& t,
- std::set<AlignPoint>& a,
- std::ostream& os);
-
- void EncodeTargetPhrasePREnc(std::vector<std::string>& s,
- std::vector<std::string>& t,
- std::set<AlignPoint>& a, size_t ownRank,
- std::ostream& os);
-
- void EncodeScores(std::vector<float>& scores, std::ostream& os);
- void EncodeAlignment(std::set<AlignPoint>& alignment, std::ostream& os);
-
- std::string MakeSourceKey(std::string&);
- std::string MakeSourceTargetKey(std::string&, std::string&);
-
- void LoadLexicalTable(std::string filePath);
-
- void CreateRankHash();
- void EncodeTargetPhrases();
- void CalcHuffmanCodes();
- void CompressTargetPhrases();
-
- void AddRankedLine(PackedItem& pi);
- void FlushRankedQueue(bool force = false);
-
- std::string EncodeLine(std::vector<std::string>& tokens, size_t ownRank);
- void AddEncodedLine(PackedItem& pi);
- void FlushEncodedQueue(bool force = false);
-
- std::string CompressEncodedCollection(std::string encodedCollection);
- void AddCompressedCollection(PackedItem& pi);
- void FlushCompressedQueue(bool force = false);
-
- public:
-
- PhraseTableCreator(std::string inPath,
- std::string outPath,
- std::string tempfilePath,
- size_t numScoreComponent = 5,
- size_t sortScoreIndex = 2,
- Coding coding = PREnc,
- size_t orderBits = 10,
- size_t fingerPrintBits = 16,
- bool useAlignmentInfo = false,
- bool multipleScoreTrees = true,
- size_t quantize = 0,
- size_t maxRank = 100,
- bool warnMe = true
+
+ BlockHashIndex m_srcHash;
+ BlockHashIndex m_rnkHash;
+
+ size_t m_maxPhraseLength;
+
+ std::vector<unsigned> m_ranks;
+
+ typedef std::pair<unsigned, unsigned> SrcTrg;
+ typedef std::pair<std::string, std::string> SrcTrgString;
+ typedef std::pair<SrcTrgString, float> SrcTrgProb;
+
+ struct SrcTrgProbSorter {
+ bool operator()(const SrcTrgProb& a, const SrcTrgProb& b) const {
+ if(a.first.first < b.first.first)
+ return true;
+
+ if(a.first.first == b.first.first && a.second > b.second)
+ return true;
+
+ if(a.first.first == b.first.first
+ && a.second == b.second
+ && a.first.second < b.first.second)
+ return true;
+
+ return false;
+ }
+ };
+
+ std::vector<size_t> m_lexicalTableIndex;
+ std::vector<SrcTrg> m_lexicalTable;
+
+ StringVector<unsigned char, unsigned long, MmapAllocator>*
+ m_encodedTargetPhrases;
+
+ StringVector<unsigned char, unsigned long, MmapAllocator>*
+ m_compressedTargetPhrases;
+
+ boost::unordered_map<std::string, unsigned> m_targetSymbolsMap;
+ boost::unordered_map<std::string, unsigned> m_sourceSymbolsMap;
+
+ typedef Counter<unsigned> SymbolCounter;
+ typedef Counter<float> ScoreCounter;
+ typedef Counter<AlignPoint> AlignCounter;
+
+ typedef CanonicalHuffman<unsigned> SymbolTree;
+ typedef CanonicalHuffman<float> ScoreTree;
+ typedef CanonicalHuffman<AlignPoint> AlignTree;
+
+ SymbolCounter m_symbolCounter;
+ SymbolTree* m_symbolTree;
+
+ AlignCounter m_alignCounter;
+ AlignTree* m_alignTree;
+
+ std::vector<ScoreCounter*> m_scoreCounters;
+ std::vector<ScoreTree*> m_scoreTrees;
+
+ std::priority_queue<PackedItem> m_queue;
+ long m_lastFlushedLine;
+ long m_lastFlushedSourceNum;
+ std::string m_lastFlushedSourcePhrase;
+ std::vector<std::string> m_lastSourceRange;
+ std::priority_queue<std::pair<float, size_t> > m_rankQueue;
+ std::vector<std::string> m_lastCollection;
+
+ void Save();
+ void PrintInfo();
+
+ void AddSourceSymbolId(std::string& symbol);
+ unsigned GetSourceSymbolId(std::string& symbol);
+
+ void AddTargetSymbolId(std::string& symbol);
+ unsigned GetTargetSymbolId(std::string& symbol);
+ unsigned GetOrAddTargetSymbolId(std::string& symbol);
+
+ unsigned GetRank(unsigned srcIdx, unsigned trgIdx);
+
+ unsigned EncodeREncSymbol1(unsigned symbol);
+ unsigned EncodeREncSymbol2(unsigned position, unsigned rank);
+ unsigned EncodeREncSymbol3(unsigned rank);
+
+ unsigned EncodePREncSymbol1(unsigned symbol);
+ unsigned EncodePREncSymbol2(int lOff, int rOff, unsigned rank);
+
+ void EncodeTargetPhraseNone(std::vector<std::string>& t,
+ std::ostream& os);
+
+ void EncodeTargetPhraseREnc(std::vector<std::string>& s,
+ std::vector<std::string>& t,
+ std::set<AlignPoint>& a,
+ std::ostream& os);
+
+ void EncodeTargetPhrasePREnc(std::vector<std::string>& s,
+ std::vector<std::string>& t,
+ std::set<AlignPoint>& a, size_t ownRank,
+ std::ostream& os);
+
+ void EncodeScores(std::vector<float>& scores, std::ostream& os);
+ void EncodeAlignment(std::set<AlignPoint>& alignment, std::ostream& os);
+
+ std::string MakeSourceKey(std::string&);
+ std::string MakeSourceTargetKey(std::string&, std::string&);
+
+ void LoadLexicalTable(std::string filePath);
+
+ void CreateRankHash();
+ void EncodeTargetPhrases();
+ void CalcHuffmanCodes();
+ void CompressTargetPhrases();
+
+ void AddRankedLine(PackedItem& pi);
+ void FlushRankedQueue(bool force = false);
+
+ std::string EncodeLine(std::vector<std::string>& tokens, size_t ownRank);
+ void AddEncodedLine(PackedItem& pi);
+ void FlushEncodedQueue(bool force = false);
+
+ std::string CompressEncodedCollection(std::string encodedCollection);
+ void AddCompressedCollection(PackedItem& pi);
+ void FlushCompressedQueue(bool force = false);
+
+public:
+
+ PhraseTableCreator(std::string inPath,
+ std::string outPath,
+ std::string tempfilePath,
+ size_t numScoreComponent = 5,
+ size_t sortScoreIndex = 2,
+ Coding coding = PREnc,
+ size_t orderBits = 10,
+ size_t fingerPrintBits = 16,
+ bool useAlignmentInfo = false,
+ bool multipleScoreTrees = true,
+ size_t quantize = 0,
+ size_t maxRank = 100,
+ bool warnMe = true
#ifdef WITH_THREADS
- , size_t threads = 2
+ , size_t threads = 2
#endif
- );
-
- ~PhraseTableCreator();
-
- friend class RankingTask;
- friend class EncodingTask;
- friend class CompressionTask;
+ );
+
+ ~PhraseTableCreator();
+
+ friend class RankingTask;
+ friend class EncodingTask;
+ friend class CompressionTask;
};
class RankingTask
{
- private:
+private:
#ifdef WITH_THREADS
- static boost::mutex m_mutex;
- static boost::mutex m_fileMutex;
+ static boost::mutex m_mutex;
+ static boost::mutex m_fileMutex;
#endif
- static size_t m_lineNum;
- InputFileStream& m_inFile;
- PhraseTableCreator& m_creator;
-
- public:
- RankingTask(InputFileStream& inFile, PhraseTableCreator& creator);
- void operator()();
+ static size_t m_lineNum;
+ InputFileStream& m_inFile;
+ PhraseTableCreator& m_creator;
+
+public:
+ RankingTask(InputFileStream& inFile, PhraseTableCreator& creator);
+ void operator()();
};
class EncodingTask
{
- private:
+private:
#ifdef WITH_THREADS
- static boost::mutex m_mutex;
- static boost::mutex m_fileMutex;
+ static boost::mutex m_mutex;
+ static boost::mutex m_fileMutex;
#endif
- static size_t m_lineNum;
- static size_t m_sourcePhraseNum;
- static std::string m_lastSourcePhrase;
-
- InputFileStream& m_inFile;
- PhraseTableCreator& m_creator;
-
- public:
- EncodingTask(InputFileStream& inFile, PhraseTableCreator& creator);
- void operator()();
+ static size_t m_lineNum;
+ static size_t m_sourcePhraseNum;
+ static std::string m_lastSourcePhrase;
+
+ InputFileStream& m_inFile;
+ PhraseTableCreator& m_creator;
+
+public:
+ EncodingTask(InputFileStream& inFile, PhraseTableCreator& creator);
+ void operator()();
};
class CompressionTask
{
- private:
+private:
#ifdef WITH_THREADS
- static boost::mutex m_mutex;
+ static boost::mutex m_mutex;
#endif
- static size_t m_collectionNum;
- StringVector<unsigned char, unsigned long, MmapAllocator>&
- m_encodedCollections;
- PhraseTableCreator& m_creator;
-
- public:
- CompressionTask(StringVector<unsigned char, unsigned long, MmapAllocator>&
- encodedCollections, PhraseTableCreator& creator);
- void operator()();
+ static size_t m_collectionNum;
+ StringVector<unsigned char, unsigned long, MmapAllocator>&
+ m_encodedCollections;
+ PhraseTableCreator& m_creator;
+
+public:
+ CompressionTask(StringVector<unsigned char, unsigned long, MmapAllocator>&
+ encodedCollections, PhraseTableCreator& creator);
+ void operator()();
};
}
diff --git a/moses/TranslationModel/CompactPT/StringVector.h b/moses/TranslationModel/CompactPT/StringVector.h
index fcc545a19..4545d61c6 100644
--- a/moses/TranslationModel/CompactPT/StringVector.h
+++ b/moses/TranslationModel/CompactPT/StringVector.h
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#ifndef moses_StringVector_h
#define moses_StringVector_h
@@ -43,255 +43,241 @@ namespace Moses
template <typename ValueIteratorT>
class ValueIteratorRange
{
- private:
- ValueIteratorT m_begin;
- ValueIteratorT m_end;
-
- public:
- ValueIteratorRange(ValueIteratorT begin, ValueIteratorT end);
-
- const ValueIteratorT& begin() const;
- const ValueIteratorT& end() const;
- const std::string str() const;
- operator const std::string()
- {
- return str();
- }
-
- size_t size()
- {
- return std::distance(m_begin, m_end);
- }
-
- template <typename StringT>
- bool operator==(const StringT& o) const;
- bool operator==(const char* c) const;
-
- template <typename StringT>
- bool operator<(const StringT& o) const;
- bool operator<(const char* c) const;
+private:
+ ValueIteratorT m_begin;
+ ValueIteratorT m_end;
+
+public:
+ ValueIteratorRange(ValueIteratorT begin, ValueIteratorT end);
+
+ const ValueIteratorT& begin() const;
+ const ValueIteratorT& end() const;
+ const std::string str() const;
+ operator const std::string() {
+ return str();
+ }
+
+ size_t size() {
+ return std::distance(m_begin, m_end);
+ }
+
+ template <typename StringT>
+ bool operator==(const StringT& o) const;
+ bool operator==(const char* c) const;
+
+ template <typename StringT>
+ bool operator<(const StringT& o) const;
+ bool operator<(const char* c) const;
};
// ********** StringVector **********
template <typename ValueT = unsigned char, typename PosT = unsigned int,
- template <typename> class Allocator = std::allocator>
+ template <typename> class Allocator = std::allocator>
class StringVector
-{
- protected:
- bool m_sorted;
- bool m_memoryMapped;
-
- std::vector<ValueT, Allocator<ValueT> >* m_charArray;
- MonotonicVector<PosT, unsigned int, 32> m_positions;
-
- virtual const ValueT* value_ptr(PosT i) const;
-
+{
+protected:
+ bool m_sorted;
+ bool m_memoryMapped;
+
+ std::vector<ValueT, Allocator<ValueT> >* m_charArray;
+ MonotonicVector<PosT, unsigned int, 32> m_positions;
+
+ virtual const ValueT* value_ptr(PosT i) const;
+
+public:
+ typedef ValueIteratorRange<typename std::vector<ValueT, Allocator<ValueT> >::const_iterator> range;
+
+ // ********** RangeIterator **********
+
+ class RangeIterator : public boost::iterator_facade<RangeIterator,
+ range, std::random_access_iterator_tag, range, PosT>
+ {
+
+ private:
+ PosT m_index;
+ StringVector<ValueT, PosT, Allocator>* m_container;
+
public:
- typedef ValueIteratorRange<typename std::vector<ValueT, Allocator<ValueT> >::const_iterator> range;
-
- // ********** RangeIterator **********
-
- class RangeIterator : public boost::iterator_facade<RangeIterator,
- range, std::random_access_iterator_tag, range, PosT>
- {
-
- private:
- PosT m_index;
- StringVector<ValueT, PosT, Allocator>* m_container;
-
- public:
- RangeIterator();
- RangeIterator(StringVector<ValueT, PosT, Allocator> &sv, PosT index=0);
-
- PosT get_index();
-
- private:
- friend class boost::iterator_core_access;
-
- range dereference() const;
- bool equal(RangeIterator const& other) const;
- void increment();
- void decrement();
- void advance(PosT n);
-
- PosT distance_to(RangeIterator const& other) const;
- };
-
- // ********** StringIterator **********
-
- class StringIterator : public boost::iterator_facade<StringIterator,
- std::string, std::random_access_iterator_tag, const std::string, PosT>
- {
-
- private:
- PosT m_index;
- StringVector<ValueT, PosT, Allocator>* m_container;
-
- public:
- StringIterator();
- StringIterator(StringVector<ValueT, PosT, Allocator> &sv, PosT index=0);
-
- PosT get_index();
-
- private:
- friend class boost::iterator_core_access;
-
- const std::string dereference() const;
- bool equal(StringIterator const& other) const;
- void increment();
- void decrement();
- void advance(PosT n);
- PosT distance_to(StringIterator const& other) const;
- };
-
- typedef RangeIterator iterator;
- typedef StringIterator string_iterator;
-
- StringVector();
- StringVector(Allocator<ValueT> alloc);
-
- virtual ~StringVector()
- {
- delete m_charArray;
- }
-
- void swap(StringVector<ValueT, PosT, Allocator> &c)
- {
- m_positions.commit();
- m_positions.swap(c.m_positions);
- m_charArray->swap(*c.m_charArray);
-
- bool temp = m_sorted;
- m_sorted = c.m_sorted;
- c.m_sorted = temp;
- }
-
- bool is_sorted() const;
- PosT size() const;
- virtual PosT size2() const;
-
- template<class Iterator> Iterator begin() const;
- template<class Iterator> Iterator end() const;
-
- iterator begin() const;
- iterator end() const;
-
- PosT length(PosT i) const;
- typename std::vector<ValueT, Allocator<ValueT> >::const_iterator begin(PosT i) const;
- typename std::vector<ValueT, Allocator<ValueT> >::const_iterator end(PosT i) const;
-
- void clear()
- {
- m_charArray->clear();
- m_sorted = true;
- m_positions = MonotonicVector<PosT, unsigned int, 32>();
- }
-
- range at(PosT i) const;
- range operator[](PosT i) const;
- range back() const;
-
- template <typename StringT>
- void push_back(StringT s);
- void push_back(const char* c);
-
- template <typename StringT>
- PosT find(StringT &s) const;
- PosT find(const char* c) const;
-
- virtual size_t load(std::FILE* in, bool memoryMapped = false)
- {
- size_t size = 0;
- m_memoryMapped = memoryMapped;
-
- size += std::fread(&m_sorted, sizeof(bool), 1, in) * sizeof(bool);
- size += m_positions.load(in, m_memoryMapped);
-
- size += loadCharArray(*m_charArray, in, m_memoryMapped);
- return size;
- }
-
- size_t loadCharArray(std::vector<ValueT, std::allocator<ValueT> >& c,
- std::FILE* in, bool map = false)
- {
- // Can only be read into memory. Mapping not possible with std:allocator.
- assert(map == false);
-
- size_t byteSize = 0;
-
- size_t valSize;
- byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t);
-
+ RangeIterator();
+ RangeIterator(StringVector<ValueT, PosT, Allocator> &sv, PosT index=0);
+
+ PosT get_index();
+
+ private:
+ friend class boost::iterator_core_access;
+
+ range dereference() const;
+ bool equal(RangeIterator const& other) const;
+ void increment();
+ void decrement();
+ void advance(PosT n);
+
+ PosT distance_to(RangeIterator const& other) const;
+ };
+
+ // ********** StringIterator **********
+
+ class StringIterator : public boost::iterator_facade<StringIterator,
+ std::string, std::random_access_iterator_tag, const std::string, PosT>
+ {
+
+ private:
+ PosT m_index;
+ StringVector<ValueT, PosT, Allocator>* m_container;
+
+ public:
+ StringIterator();
+ StringIterator(StringVector<ValueT, PosT, Allocator> &sv, PosT index=0);
+
+ PosT get_index();
+
+ private:
+ friend class boost::iterator_core_access;
+
+ const std::string dereference() const;
+ bool equal(StringIterator const& other) const;
+ void increment();
+ void decrement();
+ void advance(PosT n);
+ PosT distance_to(StringIterator const& other) const;
+ };
+
+ typedef RangeIterator iterator;
+ typedef StringIterator string_iterator;
+
+ StringVector();
+ StringVector(Allocator<ValueT> alloc);
+
+ virtual ~StringVector() {
+ delete m_charArray;
+ }
+
+ void swap(StringVector<ValueT, PosT, Allocator> &c) {
+ m_positions.commit();
+ m_positions.swap(c.m_positions);
+ m_charArray->swap(*c.m_charArray);
+
+ bool temp = m_sorted;
+ m_sorted = c.m_sorted;
+ c.m_sorted = temp;
+ }
+
+ bool is_sorted() const;
+ PosT size() const;
+ virtual PosT size2() const;
+
+ template<class Iterator> Iterator begin() const;
+ template<class Iterator> Iterator end() const;
+
+ iterator begin() const;
+ iterator end() const;
+
+ PosT length(PosT i) const;
+ typename std::vector<ValueT, Allocator<ValueT> >::const_iterator begin(PosT i) const;
+ typename std::vector<ValueT, Allocator<ValueT> >::const_iterator end(PosT i) const;
+
+ void clear() {
+ m_charArray->clear();
+ m_sorted = true;
+ m_positions = MonotonicVector<PosT, unsigned int, 32>();
+ }
+
+ range at(PosT i) const;
+ range operator[](PosT i) const;
+ range back() const;
+
+ template <typename StringT>
+ void push_back(StringT s);
+ void push_back(const char* c);
+
+ template <typename StringT>
+ PosT find(StringT &s) const;
+ PosT find(const char* c) const;
+
+ virtual size_t load(std::FILE* in, bool memoryMapped = false) {
+ size_t size = 0;
+ m_memoryMapped = memoryMapped;
+
+ size += std::fread(&m_sorted, sizeof(bool), 1, in) * sizeof(bool);
+ size += m_positions.load(in, m_memoryMapped);
+
+ size += loadCharArray(*m_charArray, in, m_memoryMapped);
+ return size;
+ }
+
+ size_t loadCharArray(std::vector<ValueT, std::allocator<ValueT> >& c,
+ std::FILE* in, bool map = false) {
+    // Can only be read into memory. Mapping is not possible with std::allocator.
+ assert(map == false);
+
+ size_t byteSize = 0;
+
+ size_t valSize;
+ byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t);
+
+ c.resize(valSize, 0);
+ byteSize += std::fread(&c[0], sizeof(ValueT), valSize, in) * sizeof(ValueT);
+
+ return byteSize;
+ }
+
+ size_t loadCharArray(std::vector<ValueT, MmapAllocator<ValueT> >& c,
+ std::FILE* in, bool map = false) {
+ size_t byteSize = 0;
+
+ size_t valSize;
+ byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t);
+
+ if(map == false) {
+ // Read data into temporary file (default constructor of MmapAllocator)
+ // and map memory onto temporary file. Can be resized.
+
c.resize(valSize, 0);
byteSize += std::fread(&c[0], sizeof(ValueT), valSize, in) * sizeof(ValueT);
-
- return byteSize;
- }
-
- size_t loadCharArray(std::vector<ValueT, MmapAllocator<ValueT> >& c,
- std::FILE* in, bool map = false)
- {
- size_t byteSize = 0;
-
- size_t valSize;
- byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t);
-
- if(map == false)
- {
- // Read data into temporary file (default constructor of MmapAllocator)
- // and map memory onto temporary file. Can be resized.
-
- c.resize(valSize, 0);
- byteSize += std::fread(&c[0], sizeof(ValueT), valSize, in) * sizeof(ValueT);
- }
- else
- {
- // Map it directly on specified region of file "in" starting at valPos
- // with length valSize * sizeof(ValueT). Mapped region cannot be resized.
-
- size_t valPos = std::ftell(in);
- Allocator<ValueT> alloc(in, valPos);
- std::vector<ValueT, Allocator<ValueT> > charArrayTemp(alloc);
- charArrayTemp.resize(valSize);
- c.swap(charArrayTemp);
-
- byteSize += valSize * sizeof(ValueT);
- }
-
- return byteSize;
- }
-
- size_t load(std::string filename, bool memoryMapped = false)
- {
- std::FILE* pFile = fopen(filename.c_str(), "r");
- size_t byteSize = load(pFile, memoryMapped);
- fclose(pFile);
- return byteSize;
- }
+ } else {
+ // Map it directly on specified region of file "in" starting at valPos
+ // with length valSize * sizeof(ValueT). Mapped region cannot be resized.
- size_t save(std::FILE* out)
- {
- size_t byteSize = 0;
- byteSize += ThrowingFwrite(&m_sorted, sizeof(bool), 1, out) * sizeof(bool);
-
- byteSize += m_positions.save(out);
-
- size_t valSize = size2();
- byteSize += ThrowingFwrite(&valSize, sizeof(size_t), 1, out) * sizeof(size_t);
- byteSize += ThrowingFwrite(&(*m_charArray)[0], sizeof(ValueT), valSize, out) * sizeof(ValueT);
-
- return byteSize;
- }
-
- size_t save(std::string filename)
- {
- std::FILE* pFile = fopen(filename.c_str(), "w");
- size_t byteSize = save(pFile);
- fclose(pFile);
- return byteSize;
+ size_t valPos = std::ftell(in);
+ Allocator<ValueT> alloc(in, valPos);
+ std::vector<ValueT, Allocator<ValueT> > charArrayTemp(alloc);
+ charArrayTemp.resize(valSize);
+ c.swap(charArrayTemp);
+
+ byteSize += valSize * sizeof(ValueT);
}
-
+
+ return byteSize;
+ }
+
+ size_t load(std::string filename, bool memoryMapped = false) {
+ std::FILE* pFile = fopen(filename.c_str(), "r");
+ size_t byteSize = load(pFile, memoryMapped);
+ fclose(pFile);
+ return byteSize;
+ }
+
+ size_t save(std::FILE* out) {
+ size_t byteSize = 0;
+ byteSize += ThrowingFwrite(&m_sorted, sizeof(bool), 1, out) * sizeof(bool);
+
+ byteSize += m_positions.save(out);
+
+ size_t valSize = size2();
+ byteSize += ThrowingFwrite(&valSize, sizeof(size_t), 1, out) * sizeof(size_t);
+ byteSize += ThrowingFwrite(&(*m_charArray)[0], sizeof(ValueT), valSize, out) * sizeof(ValueT);
+
+ return byteSize;
+ }
+
+ size_t save(std::string filename) {
+ std::FILE* pFile = fopen(filename.c_str(), "w");
+ size_t byteSize = save(pFile);
+ fclose(pFile);
+ return byteSize;
+ }
+
};
// ********** Implementation **********
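StringVector stores every string in one contiguous character array indexed by a monotonic position vector, which is what makes the memory-mapped load path possible. A minimal sketch of the declared API, assuming the header compiles standalone; with the default std::allocator the mapped load is disabled by the assert in loadCharArray:

    #include "moses/TranslationModel/CompactPT/StringVector.h"
    #include <iostream>

    int main()
    {
      Moses::StringVector<unsigned char, unsigned int> sv;
      sv.push_back("alpha");   // appending in order keeps m_sorted == true
      sv.push_back("beta");
      sv.push_back("gamma");

      // find() binary-searches (std::lower_bound) while the vector is
      // sorted and falls back to linear std::find otherwise.
      std::cout << sv.find("beta") << "\n";  // prints 1
      std::cout << sv[2].str() << "\n";      // ranges convert to std::string

      sv.save("sv.bin");                     // sorted flag + positions + chars
      Moses::StringVector<unsigned char, unsigned int> sv2;
      sv2.load("sv.bin");                    // memoryMapped defaults to false
      return 0;
    }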
@@ -300,214 +286,214 @@ class StringVector
template <typename ValueIteratorT>
ValueIteratorRange<ValueIteratorT>::ValueIteratorRange(ValueIteratorT begin,
- ValueIteratorT end) : m_begin(begin), m_end(end) { }
-
+ ValueIteratorT end) : m_begin(begin), m_end(end) { }
+
template <typename ValueIteratorT>
const ValueIteratorT& ValueIteratorRange<ValueIteratorT>::begin() const
{
- return m_begin;
+ return m_begin;
}
template <typename ValueIteratorT>
const ValueIteratorT& ValueIteratorRange<ValueIteratorT>::end() const
{
- return m_end;
+ return m_end;
}
template <typename ValueIteratorT>
const std::string ValueIteratorRange<ValueIteratorT>::str() const
{
- std::string dummy;
- for(ValueIteratorT it = m_begin; it != m_end; it++)
- dummy.push_back(*it);
- return dummy;
+ std::string dummy;
+ for(ValueIteratorT it = m_begin; it != m_end; it++)
+ dummy.push_back(*it);
+ return dummy;
}
template <typename ValueIteratorT>
template <typename StringT>
bool ValueIteratorRange<ValueIteratorT>::operator==(const StringT& o) const
{
- if(std::distance(m_begin, m_end) == std::distance(o.begin(), o.end()))
- return std::equal(m_begin, m_end, o.begin());
- else
- return false;
+ if(std::distance(m_begin, m_end) == std::distance(o.begin(), o.end()))
+ return std::equal(m_begin, m_end, o.begin());
+ else
+ return false;
}
-
+
template <typename ValueIteratorT>
bool ValueIteratorRange<ValueIteratorT>::operator==(const char* c) const
{
- return *this == std::string(c);
+ return *this == std::string(c);
}
template <typename ValueIteratorT>
template <typename StringT>
bool ValueIteratorRange<ValueIteratorT>::operator<(const StringT &s2) const
{
- return std::lexicographical_compare(m_begin, m_end, s2.begin(), s2.end(),
- std::less<typename std::iterator_traits<ValueIteratorT>::value_type>());
+ return std::lexicographical_compare(m_begin, m_end, s2.begin(), s2.end(),
+ std::less<typename std::iterator_traits<ValueIteratorT>::value_type>());
}
template <typename ValueIteratorT>
bool ValueIteratorRange<ValueIteratorT>::operator<(const char* c) const
{
- return *this < std::string(c);
+ return *this < std::string(c);
}
template <typename StringT, typename ValueIteratorT>
bool operator<(const StringT &s1, const ValueIteratorRange<ValueIteratorT> &s2)
{
- return std::lexicographical_compare(s1.begin(), s1.end(), s2.begin(), s2.end(),
- std::less<typename std::iterator_traits<ValueIteratorT>::value_type>());
+ return std::lexicographical_compare(s1.begin(), s1.end(), s2.begin(), s2.end(),
+ std::less<typename std::iterator_traits<ValueIteratorT>::value_type>());
}
template <typename ValueIteratorT>
bool operator<(const char* c, const ValueIteratorRange<ValueIteratorT> &s2)
{
- size_t len = std::char_traits<char>::length(c);
- return std::lexicographical_compare(c, c + len, s2.begin(), s2.end(),
- std::less<typename std::iterator_traits<ValueIteratorT>::value_type>());
+ size_t len = std::char_traits<char>::length(c);
+ return std::lexicographical_compare(c, c + len, s2.begin(), s2.end(),
+ std::less<typename std::iterator_traits<ValueIteratorT>::value_type>());
}
template <typename OStream, typename ValueIteratorT>
OStream& operator<<(OStream &os, ValueIteratorRange<ValueIteratorT> cr)
{
ValueIteratorT it = cr.begin();
- while(it != cr.end())
- os << *(it++);
- return os;
+ while(it != cr.end())
+ os << *(it++);
+ return os;
}
// StringVector
template<typename ValueT, typename PosT, template <typename> class Allocator>
StringVector<ValueT, PosT, Allocator>::StringVector()
- : m_sorted(true), m_memoryMapped(false), m_charArray(new std::vector<ValueT, Allocator<ValueT> >()) { }
+ : m_sorted(true), m_memoryMapped(false), m_charArray(new std::vector<ValueT, Allocator<ValueT> >()) { }
template<typename ValueT, typename PosT, template <typename> class Allocator>
StringVector<ValueT, PosT, Allocator>::StringVector(Allocator<ValueT> alloc)
- : m_sorted(true), m_memoryMapped(false), m_charArray(new std::vector<ValueT, Allocator<ValueT> >(alloc)) { }
+ : m_sorted(true), m_memoryMapped(false), m_charArray(new std::vector<ValueT, Allocator<ValueT> >(alloc)) { }
-template<typename ValueT, typename PosT, template <typename> class Allocator>
+template<typename ValueT, typename PosT, template <typename> class Allocator>
template <typename StringT>
void StringVector<ValueT, PosT, Allocator>::push_back(StringT s)
{
- if(is_sorted() && size() && !(back() < s))
- m_sorted = false;
+ if(is_sorted() && size() && !(back() < s))
+ m_sorted = false;
- m_positions.push_back(size2());
- std::copy(s.begin(), s.end(), std::back_inserter(*m_charArray));
+ m_positions.push_back(size2());
+ std::copy(s.begin(), s.end(), std::back_inserter(*m_charArray));
}
-template<typename ValueT, typename PosT, template <typename> class Allocator>
+template<typename ValueT, typename PosT, template <typename> class Allocator>
void StringVector<ValueT, PosT, Allocator>::push_back(const char* c)
{
- std::string dummy(c);
- push_back(dummy);
+ std::string dummy(c);
+ push_back(dummy);
}
-template<typename ValueT, typename PosT, template <typename> class Allocator>
+template<typename ValueT, typename PosT, template <typename> class Allocator>
template <typename Iterator>
Iterator StringVector<ValueT, PosT, Allocator>::begin() const
{
- return Iterator(const_cast<StringVector<ValueT, PosT, Allocator>&>(*this), 0);
+ return Iterator(const_cast<StringVector<ValueT, PosT, Allocator>&>(*this), 0);
}
-
-template<typename ValueT, typename PosT, template <typename> class Allocator>
+
+template<typename ValueT, typename PosT, template <typename> class Allocator>
template <typename Iterator>
Iterator StringVector<ValueT, PosT, Allocator>::end() const
{
- return Iterator(const_cast<StringVector<ValueT, PosT, Allocator>&>(*this), size());
+ return Iterator(const_cast<StringVector<ValueT, PosT, Allocator>&>(*this), size());
}
template<typename ValueT, typename PosT, template <typename> class Allocator>
typename StringVector<ValueT, PosT, Allocator>::iterator StringVector<ValueT, PosT, Allocator>::begin() const
{
- return begin<iterator>();
-};
+ return begin<iterator>();
+};
template<typename ValueT, typename PosT, template <typename> class Allocator>
typename StringVector<ValueT, PosT, Allocator>::iterator StringVector<ValueT, PosT, Allocator>::end() const
{
- return end<iterator>();
-};
+ return end<iterator>();
+};
template<typename ValueT, typename PosT, template <typename> class Allocator>
bool StringVector<ValueT, PosT, Allocator>::is_sorted() const
{
- return m_sorted;
+ return m_sorted;
}
template<typename ValueT, typename PosT, template <typename> class Allocator>
PosT StringVector<ValueT, PosT, Allocator>::size() const
{
- return m_positions.size();
+ return m_positions.size();
}
template<typename ValueT, typename PosT, template <typename> class Allocator>
PosT StringVector<ValueT, PosT, Allocator>::size2() const
{
- return m_charArray->size();
+ return m_charArray->size();
}
-
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
typename StringVector<ValueT, PosT, Allocator>::range StringVector<ValueT, PosT, Allocator>::at(PosT i) const
{
- return range(begin(i), end(i));
+ return range(begin(i), end(i));
}
-
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
typename StringVector<ValueT, PosT, Allocator>::range StringVector<ValueT, PosT, Allocator>::operator[](PosT i) const
{
- return at(i);
+ return at(i);
}
template<typename ValueT, typename PosT, template <typename> class Allocator>
typename StringVector<ValueT, PosT, Allocator>::range StringVector<ValueT, PosT, Allocator>::back() const
{
- return at(size()-1);
+ return at(size()-1);
}
template<typename ValueT, typename PosT, template <typename> class Allocator>
PosT StringVector<ValueT, PosT, Allocator>::length(PosT i) const
{
- if(i+1 < size())
- return m_positions[i+1] - m_positions[i];
- else
- return size2() - m_positions[i];
+ if(i+1 < size())
+ return m_positions[i+1] - m_positions[i];
+ else
+ return size2() - m_positions[i];
}
template<typename ValueT, typename PosT, template <typename> class Allocator>
const ValueT* StringVector<ValueT, PosT, Allocator>::value_ptr(PosT i) const
{
- return &(*m_charArray)[m_positions[i]];
+ return &(*m_charArray)[m_positions[i]];
}
template<typename ValueT, typename PosT, template <typename> class Allocator>
typename std::vector<ValueT, Allocator<ValueT> >::const_iterator StringVector<ValueT, PosT, Allocator>::begin(PosT i) const
{
- return typename std::vector<ValueT, Allocator<ValueT> >::const_iterator(value_ptr(i));
-}
+ return typename std::vector<ValueT, Allocator<ValueT> >::const_iterator(value_ptr(i));
+}
template<typename ValueT, typename PosT, template <typename> class Allocator>
typename std::vector<ValueT, Allocator<ValueT> >::const_iterator StringVector<ValueT, PosT, Allocator>::end(PosT i) const
{
- return typename std::vector<ValueT, Allocator<ValueT> >::const_iterator(value_ptr(i) + length(i));
-}
+ return typename std::vector<ValueT, Allocator<ValueT> >::const_iterator(value_ptr(i) + length(i));
+}
template<typename ValueT, typename PosT, template <typename> class Allocator>
template <typename StringT>
PosT StringVector<ValueT, PosT, Allocator>::find(StringT &s) const
{
- if(m_sorted)
- return std::distance(begin(), std::lower_bound(begin(), end(), s));
- return std::distance(begin(), std::find(begin(), end(), s));
+ if(m_sorted)
+ return std::distance(begin(), std::lower_bound(begin(), end(), s));
+ return std::distance(begin(), std::find(begin(), end(), s));
}
template<typename ValueT, typename PosT, template <typename> class Allocator>
PosT StringVector<ValueT, PosT, Allocator>::find(const char* c) const
{
- std::string s(c);
- return find(s);
+ std::string s(c);
+ return find(s);
}
// RangeIterator
@@ -518,21 +504,21 @@ StringVector<ValueT, PosT, Allocator>::RangeIterator::RangeIterator() : m_index(
template<typename ValueT, typename PosT, template <typename> class Allocator>
StringVector<ValueT, PosT, Allocator>::RangeIterator::RangeIterator(StringVector<ValueT, PosT, Allocator> &sv, PosT index)
: m_index(index), m_container(&sv) { }
-
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
PosT StringVector<ValueT, PosT, Allocator>::RangeIterator::get_index()
{
return m_index;
}
-
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
typename StringVector<ValueT, PosT, Allocator>::range
- StringVector<ValueT, PosT, Allocator>::RangeIterator::dereference() const
+StringVector<ValueT, PosT, Allocator>::RangeIterator::dereference() const
{
return typename StringVector<ValueT, PosT, Allocator>::range(
- m_container->begin(m_index),
- m_container->end(m_index)
- );
+ m_container->begin(m_index),
+ m_container->end(m_index)
+ );
}
template<typename ValueT, typename PosT, template <typename> class Allocator>
@@ -577,18 +563,18 @@ template<typename ValueT, typename PosT, template <typename> class Allocator>
StringVector<ValueT, PosT, Allocator>::StringIterator::StringIterator(
StringVector<ValueT, PosT, Allocator> &sv, PosT index) : m_index(index),
m_container(&sv) { }
-
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
PosT StringVector<ValueT, PosT, Allocator>::StringIterator::get_index()
{
return m_index;
}
-
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
const std::string StringVector<ValueT, PosT, Allocator>::StringIterator::dereference() const
{
return StringVector<ValueT, PosT, Allocator>::range(m_container->begin(m_index),
- m_container->end(m_index)).str();
+ m_container->end(m_index)).str();
}
template<typename ValueT, typename PosT, template <typename> class Allocator>
@@ -620,7 +606,7 @@ template<typename ValueT, typename PosT, template <typename> class Allocator>
PosT StringVector<ValueT, PosT, Allocator>::StringIterator::distance_to(
StringVector<ValueT, PosT, Allocator>::StringIterator const& other) const
{
- return other.m_index - m_index;
+ return other.m_index - m_index;
}
// ********** Some typedefs **********
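For the memory-mapped variant, the MmapAllocator overload of loadCharArray above swaps in a vector whose allocator maps the file region starting at the current ftell() offset, so the character data is never copied onto the heap (and the mapped region cannot be resized). A sketch under the assumption that MmapAllocator is declared in namespace Moses in MmapAllocator.h next to this header, matching the StringVector<unsigned char, unsigned long, MmapAllocator> instantiation used by the CompactPT creator code earlier in this diff:

    #include "moses/TranslationModel/CompactPT/StringVector.h"
    #include "moses/TranslationModel/CompactPT/MmapAllocator.h"

    int main()
    {
      Moses::StringVector<unsigned char, unsigned long, Moses::MmapAllocator> sv;
      sv.load("phrase-table.minphr", true);  // true => map the char array in place
      return 0;
    }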
diff --git a/moses/TranslationModel/CompactPT/TargetPhraseCollectionCache.h b/moses/TranslationModel/CompactPT/TargetPhraseCollectionCache.h
index 7687d1498..3eac0226a 100644
--- a/moses/TranslationModel/CompactPT/TargetPhraseCollectionCache.h
+++ b/moses/TranslationModel/CompactPT/TargetPhraseCollectionCache.h
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#ifndef moses_TargetPhraseCollectionCache_h
#define moses_TargetPhraseCollectionCache_h
@@ -46,135 +46,119 @@ typedef boost::shared_ptr<TargetPhraseVector> TargetPhraseVectorPtr;
class TargetPhraseCollectionCache
{
- private:
- size_t m_max;
- float m_tolerance;
-
- struct LastUsed {
- clock_t m_clock;
- TargetPhraseVectorPtr m_tpv;
- size_t m_bitsLeft;
-
- LastUsed() : m_clock(0), m_bitsLeft(0) {}
-
- LastUsed(clock_t clock, TargetPhraseVectorPtr tpv, size_t bitsLeft = 0)
+private:
+ size_t m_max;
+ float m_tolerance;
+
+ struct LastUsed {
+ clock_t m_clock;
+ TargetPhraseVectorPtr m_tpv;
+ size_t m_bitsLeft;
+
+ LastUsed() : m_clock(0), m_bitsLeft(0) {}
+
+ LastUsed(clock_t clock, TargetPhraseVectorPtr tpv, size_t bitsLeft = 0)
: m_clock(clock), m_tpv(tpv), m_bitsLeft(bitsLeft) {}
- };
-
- typedef std::map<Phrase, LastUsed> CacheMap;
-
- CacheMap m_phraseCache;
-
+ };
+
+ typedef std::map<Phrase, LastUsed> CacheMap;
+
+ CacheMap m_phraseCache;
+
#ifdef WITH_THREADS
- boost::mutex m_mutex;
+ boost::mutex m_mutex;
#endif
- public:
-
- typedef CacheMap::iterator iterator;
- typedef CacheMap::const_iterator const_iterator;
-
- TargetPhraseCollectionCache(size_t max = 5000, float tolerance = 0.2)
+public:
+
+ typedef CacheMap::iterator iterator;
+ typedef CacheMap::const_iterator const_iterator;
+
+ TargetPhraseCollectionCache(size_t max = 5000, float tolerance = 0.2)
: m_max(max), m_tolerance(tolerance)
- {}
-
- iterator Begin()
- {
- return m_phraseCache.begin();
- }
-
- const_iterator Begin() const
- {
- return m_phraseCache.begin();
- }
-
- iterator End()
- {
- return m_phraseCache.end();
- }
-
- const_iterator End() const
- {
- return m_phraseCache.end();
- }
-
- void Cache(const Phrase &sourcePhrase, TargetPhraseVectorPtr tpv,
- size_t bitsLeft = 0, size_t maxRank = 0)
- {
+ {}
+
+ iterator Begin() {
+ return m_phraseCache.begin();
+ }
+
+ const_iterator Begin() const {
+ return m_phraseCache.begin();
+ }
+
+ iterator End() {
+ return m_phraseCache.end();
+ }
+
+ const_iterator End() const {
+ return m_phraseCache.end();
+ }
+
+ void Cache(const Phrase &sourcePhrase, TargetPhraseVectorPtr tpv,
+ size_t bitsLeft = 0, size_t maxRank = 0) {
#ifdef WITH_THREADS
- boost::mutex::scoped_lock lock(m_mutex);
+ boost::mutex::scoped_lock lock(m_mutex);
#endif
- iterator it = m_phraseCache.find(sourcePhrase);
- if(it != m_phraseCache.end())
- it->second.m_clock = clock();
- else
- {
- if(maxRank && tpv->size() > maxRank)
- {
- TargetPhraseVectorPtr tpv_temp(new TargetPhraseVector());
- tpv_temp->resize(maxRank);
- std::copy(tpv->begin(), tpv->begin() + maxRank, tpv_temp->begin());
- m_phraseCache[sourcePhrase] = LastUsed(clock(), tpv_temp, bitsLeft);
- }
- else
- m_phraseCache[sourcePhrase] = LastUsed(clock(), tpv, bitsLeft);
- }
+ iterator it = m_phraseCache.find(sourcePhrase);
+ if(it != m_phraseCache.end())
+ it->second.m_clock = clock();
+ else {
+ if(maxRank && tpv->size() > maxRank) {
+ TargetPhraseVectorPtr tpv_temp(new TargetPhraseVector());
+ tpv_temp->resize(maxRank);
+ std::copy(tpv->begin(), tpv->begin() + maxRank, tpv_temp->begin());
+ m_phraseCache[sourcePhrase] = LastUsed(clock(), tpv_temp, bitsLeft);
+ } else
+ m_phraseCache[sourcePhrase] = LastUsed(clock(), tpv, bitsLeft);
}
+ }
- std::pair<TargetPhraseVectorPtr, size_t> Retrieve(const Phrase &sourcePhrase)
- {
+ std::pair<TargetPhraseVectorPtr, size_t> Retrieve(const Phrase &sourcePhrase) {
#ifdef WITH_THREADS
- boost::mutex::scoped_lock lock(m_mutex);
+ boost::mutex::scoped_lock lock(m_mutex);
#endif
- iterator it = m_phraseCache.find(sourcePhrase);
- if(it != m_phraseCache.end())
- {
- LastUsed &lu = it->second;
- lu.m_clock = clock();
- return std::make_pair(lu.m_tpv, lu.m_bitsLeft);
- }
- else
- return std::make_pair(TargetPhraseVectorPtr(), 0);
- }
+ iterator it = m_phraseCache.find(sourcePhrase);
+ if(it != m_phraseCache.end()) {
+ LastUsed &lu = it->second;
+ lu.m_clock = clock();
+ return std::make_pair(lu.m_tpv, lu.m_bitsLeft);
+ } else
+ return std::make_pair(TargetPhraseVectorPtr(), 0);
+ }
- void Prune()
- {
+ void Prune() {
#ifdef WITH_THREADS
- boost::mutex::scoped_lock lock(m_mutex);
+ boost::mutex::scoped_lock lock(m_mutex);
#endif
- if(m_phraseCache.size() > m_max * (1 + m_tolerance))
- {
- typedef std::set<std::pair<clock_t, Phrase> > Cands;
- Cands cands;
- for(CacheMap::iterator it = m_phraseCache.begin();
- it != m_phraseCache.end(); it++)
- {
- LastUsed &lu = it->second;
- cands.insert(std::make_pair(lu.m_clock, it->first));
- }
-
- for(Cands::iterator it = cands.begin(); it != cands.end(); it++)
- {
- const Phrase& p = it->second;
- m_phraseCache.erase(p);
-
- if(m_phraseCache.size() < (m_max * (1 - m_tolerance)))
- break;
- }
+ if(m_phraseCache.size() > m_max * (1 + m_tolerance)) {
+ typedef std::set<std::pair<clock_t, Phrase> > Cands;
+ Cands cands;
+ for(CacheMap::iterator it = m_phraseCache.begin();
+ it != m_phraseCache.end(); it++) {
+ LastUsed &lu = it->second;
+ cands.insert(std::make_pair(lu.m_clock, it->first));
+ }
+
+ for(Cands::iterator it = cands.begin(); it != cands.end(); it++) {
+ const Phrase& p = it->second;
+ m_phraseCache.erase(p);
+
+ if(m_phraseCache.size() < (m_max * (1 - m_tolerance)))
+ break;
}
}
+ }
- void CleanUp()
- {
+ void CleanUp() {
#ifdef WITH_THREADS
- boost::mutex::scoped_lock lock(m_mutex);
+ boost::mutex::scoped_lock lock(m_mutex);
#endif
- m_phraseCache.clear();
- }
-
+ m_phraseCache.clear();
+ }
+
};
}
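The cache above is a clock-stamped LRU: Cache() records clock() for each source phrase, Retrieve() refreshes the stamp on a hit, and Prune() evicts the oldest entries once the map grows past max * (1 + tolerance), stopping once it drops below max * (1 - tolerance). A hedged sketch of that contract; constructing Phrase and TargetPhraseVector instances is elided since it depends on the wider Moses factor configuration:

    // Assumes a populated Moses::Phrase and a decoded TargetPhraseVectorPtr
    // obtained elsewhere in the decoder.
    Moses::TargetPhraseCollectionCache cache(5000, 0.2);  // max size, 20% tolerance

    void Remember(const Moses::Phrase &source, Moses::TargetPhraseVectorPtr tpv)
    {
      cache.Cache(source, tpv);        // truncates to maxRank if one is given
      std::pair<Moses::TargetPhraseVectorPtr, size_t> hit = cache.Retrieve(source);
      if (hit.first) {
        // hit.second carries the bitsLeft bookkeeping stored with the entry;
        // a default-constructed pointer in hit.first signals a miss.
      }
      cache.Prune();                   // cheap until the size threshold is crossed
    }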
diff --git a/moses/TranslationModel/CompactPT/ThrowingFwrite.cpp b/moses/TranslationModel/CompactPT/ThrowingFwrite.cpp
index 35e8e3122..b231836f5 100644
--- a/moses/TranslationModel/CompactPT/ThrowingFwrite.cpp
+++ b/moses/TranslationModel/CompactPT/ThrowingFwrite.cpp
@@ -1,27 +1,28 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
-#include "ThrowingFwrite.h"
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
-size_t ThrowingFwrite(const void *ptr, size_t size, size_t count, FILE* stream) {
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "ThrowingFwrite.h"
+
+size_t ThrowingFwrite(const void *ptr, size_t size, size_t count, FILE* stream)
+{
assert(size);
size_t returnValue = std::fwrite(ptr, size, count, stream);
UTIL_THROW_IF(count != returnValue, util::ErrnoException, "Short fwrite; requested size " << size);
diff --git a/moses/TranslationModel/CompactPT/ThrowingFwrite.h b/moses/TranslationModel/CompactPT/ThrowingFwrite.h
index 4f45ae8f5..466d3973b 100644
--- a/moses/TranslationModel/CompactPT/ThrowingFwrite.h
+++ b/moses/TranslationModel/CompactPT/ThrowingFwrite.h
@@ -1,30 +1,30 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#ifndef moses_ThrowingFwrite_h
#define moses_ThrowingFwrite_h
#include <cassert>
#include <cstdio>
-#include "util/exception.hh"
+#include "util/exception.hh"
size_t ThrowingFwrite(const void *ptr, size_t size, size_t count, FILE* stream);
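ThrowingFwrite behaves exactly like std::fwrite except that a short write raises util::ErrnoException via UTIL_THROW_IF (see the .cpp above), so callers such as StringVector::save() never need to check the returned count. A small sketch:

    #include <cstdio>
    #include <iostream>
    #include "moses/TranslationModel/CompactPT/ThrowingFwrite.h"
    #include "util/exception.hh"

    int main()
    {
      std::FILE* out = std::fopen("out.bin", "wb");
      int data[4] = {1, 2, 3, 4};
      try {
        ThrowingFwrite(data, sizeof(int), 4, out);  // throws on a short write
      } catch (const util::ErrnoException &e) {
        std::cerr << e.what() << "\n";              // e.g. a full disk
      }
      std::fclose(out);
      return 0;
    }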
diff --git a/moses/TranslationModel/DynSAInclude/RandLMCache.h b/moses/TranslationModel/DynSAInclude/RandLMCache.h
index b92a2a164..06ce240a1 100644
--- a/moses/TranslationModel/DynSAInclude/RandLMCache.h
+++ b/moses/TranslationModel/DynSAInclude/RandLMCache.h
@@ -22,177 +22,180 @@
#include <ctime>
#include <iostream>
-namespace randlm {
-
- //! @todo ask abby2
- template<typename T>
- class CacheNode {
- public:
- typedef std::map<wordID_t, CacheNode<T>* > childMap;
- // initialise value to 'unknown' (i.e. not yet queried or cached).
- CacheNode(T unknown_value) : value_(unknown_value) {}
- childMap childs_; // child pointers
- T value_; // value stored
- const void* state_; // state pointer
- };
-
- template<typename T>
- class Cache {
- public:
- typedef typename std::map<wordID_t, CacheNode<T>* >::iterator childPtr;
- // unknown_value is used to indicate the ngram was not queried (yet)
- // null_value_ indicates it was queried but not found in model
- // space usage is handled by client.
- Cache(T unknown_value, T null_value) :
- cur_nodes_(0), unknown_value_(unknown_value), null_value_(null_value) {
- root_ = newNode();
+namespace randlm
+{
+
+//! @todo ask abby2
+template<typename T>
+class CacheNode
+{
+public:
+ typedef std::map<wordID_t, CacheNode<T>* > childMap;
+ // initialise value to 'unknown' (i.e. not yet queried or cached).
+ CacheNode(T unknown_value) : value_(unknown_value) {}
+ childMap childs_; // child pointers
+ T value_; // value stored
+ const void* state_; // state pointer
+};
+
+template<typename T>
+class Cache
+{
+public:
+ typedef typename std::map<wordID_t, CacheNode<T>* >::iterator childPtr;
+ // unknown_value is used to indicate the ngram was not queried (yet)
+ // null_value_ indicates it was queried but not found in model
+ // space usage is handled by client.
+ Cache(T unknown_value, T null_value) :
+ cur_nodes_(0), unknown_value_(unknown_value), null_value_(null_value) {
+ root_ = newNode();
+ }
+ ~Cache() {
+ if(clear()) {
+ delete root_;
+ root_ = NULL;
+ } else {
+ std::cerr << "Error freeing cache memory.\n";
}
- ~Cache() {
- if(clear()) {
- delete root_;
- root_ = NULL;
+ }
+ bool setCacheNgram(const wordID_t* ngram, int len, T value, const void* state) {
+ // inserts full ngram into cache
+ CacheNode<T>* node = root_;
+ for (int i = len - 1; i > -1; --i) {
+ childPtr child = node->childs_.find(ngram[i]);
+ if( child != node->childs_.end() ) {
+ // current node is already prefix. Go to child node
+ node = node->childs_[ngram[i]];
} else {
- std::cerr << "Error freeing cache memory.\n";
- }
- }
- bool setCacheNgram(const wordID_t* ngram, int len, T value, const void* state) {
- // inserts full ngram into cache
- CacheNode<T>* node = root_;
- for (int i = len - 1; i > -1; --i) {
- childPtr child = node->childs_.find(ngram[i]);
- if( child != node->childs_.end() ) {
- // current node is already prefix. Go to child node
- node = node->childs_[ngram[i]];
- } else {
- // no child for prefix. set new child link in current node
- CacheNode<T> * newChild = newNode(node);
- node->childs_[ngram[i]] = newChild;
- // go to new node
- node = newChild;
- }
+ // no child for prefix. set new child link in current node
+ CacheNode<T> * newChild = newNode(node);
+ node->childs_[ngram[i]] = newChild;
+ // go to new node
+ node = newChild;
}
- node->value_ = value;
- node->state_ = state;
- return true;
}
- bool checkCacheNgram(const wordID_t* ngram, int len, T* value, const void** state) {
- // finds value for this full ngram only (returns false if full ngram not in cache)
- CacheNode<T> * node = root_;
- for(int i = len - 1; i > -1; --i) {
- // go to deepest level node of ngram in cache
- childPtr child = node->childs_.find(ngram[i]);
- if( child != node->childs_.end() ) {
- // switch to child node
- node = node->childs_[ngram[i]];
- } else {
- // not cached
- return false;
- }
+ node->value_ = value;
+ node->state_ = state;
+ return true;
+ }
+ bool checkCacheNgram(const wordID_t* ngram, int len, T* value, const void** state) {
+ // finds value for this full ngram only (returns false if full ngram not in cache)
+ CacheNode<T> * node = root_;
+ for(int i = len - 1; i > -1; --i) {
+ // go to deepest level node of ngram in cache
+ childPtr child = node->childs_.find(ngram[i]);
+ if( child != node->childs_.end() ) {
+ // switch to child node
+ node = node->childs_[ngram[i]];
+ } else {
+ // not cached
+ return false;
}
- *value = node->value_;
- if(state) *state = node->state_;
- return *value != null_value_ && *value != unknown_value_;
}
- int getCache2(const wordID_t* ngram, int len, T** values, int* found) {
- // set values array to point to cache value nodes
- CacheNode<T> * node = root_;
- *found = 0;
- //values[0] = &node->value_; // pointer to root node's value
- bool all_found = true;
- for(int i = len - 1; i > -1; --i) {
- // go to deepest level node of ngram in cache
- childPtr child = node->childs_.find(ngram[i]);
- if( child != node->childs_.end() ) {
- // switch to child node
- node = node->childs_[ngram[i]];
- // get pointer to value (index by length - 1)
- values[i] = &node->value_;
- // if null_value then assume all extensions impossible
- if (node->value_ == null_value_) {
- return len - 1 - i; // max length posible
- }
- all_found = all_found && (node->value_ != unknown_value_);
- if (all_found)
- ++(*found);
- } else {
- // initialise uncached values
- CacheNode<T> * newChild = newNode(node);
- node->childs_[ngram[i]] = newChild;
- // go to new node
- node = newChild;
- values[i] = &node->value_;
- }
+ *value = node->value_;
+ if(state) *state = node->state_;
+ return *value != null_value_ && *value != unknown_value_;
+ }
+ int getCache2(const wordID_t* ngram, int len, T** values, int* found) {
+ // set values array to point to cache value nodes
+ CacheNode<T> * node = root_;
+ *found = 0;
+ //values[0] = &node->value_; // pointer to root node's value
+ bool all_found = true;
+ for(int i = len - 1; i > -1; --i) {
+ // go to deepest level node of ngram in cache
+ childPtr child = node->childs_.find(ngram[i]);
+ if( child != node->childs_.end() ) {
+ // switch to child node
+ node = node->childs_[ngram[i]];
+ // get pointer to value (index by length - 1)
+ values[i] = &node->value_;
+ // if null_value then assume all extensions impossible
+ if (node->value_ == null_value_) {
+          return len - 1 - i; // max length possible
+ }
+ all_found = all_found && (node->value_ != unknown_value_);
+ if (all_found)
+ ++(*found);
+ } else {
+ // initialise uncached values
+ CacheNode<T> * newChild = newNode(node);
+ node->childs_[ngram[i]] = newChild;
+ // go to new node
+ node = newChild;
+ values[i] = &node->value_;
}
- return len; // all possible
}
- int getCache(const wordID_t* ngram, int len, T** values, int* found) {
- // get pointers to values for ngram and constituents.
- // returns upper bound on longest subngram in model.
- // 'found' stores longest non-null and known value found.
- CacheNode<T> * node = root_;
- *found = 0;
- values[0] = &node->value_; // pointer to root node's value
- bool all_found = true;
- for(int i = len - 1; i > -1; --i) {
- // go to deepest level node of ngram in cache
- childPtr child = node->childs_.find(ngram[i]);
- if( child != node->childs_.end() ) {
- // switch to child node
- node = node->childs_[ngram[i]];
- // get pointer to value (index by length - 1)
- values[len - i] = &node->value_;
- // if null_value then assume all extensions impossible
- if (node->value_ == null_value_)
- return len - 1 - i; // max length posible
- all_found = all_found && (node->value_ != unknown_value_);
- if (all_found)
- ++(*found);
- } else {
- // initialise uncached values
- CacheNode<T> * newChild = newNode(node);
- node->childs_[ngram[i]] = newChild;
- // go to new node
- node = newChild;
- values[len - i] = &node->value_;
- }
+ return len; // all possible
+ }
+ int getCache(const wordID_t* ngram, int len, T** values, int* found) {
+ // get pointers to values for ngram and constituents.
+ // returns upper bound on longest subngram in model.
+ // 'found' stores longest non-null and known value found.
+ CacheNode<T> * node = root_;
+ *found = 0;
+ values[0] = &node->value_; // pointer to root node's value
+ bool all_found = true;
+ for(int i = len - 1; i > -1; --i) {
+ // go to deepest level node of ngram in cache
+ childPtr child = node->childs_.find(ngram[i]);
+ if( child != node->childs_.end() ) {
+ // switch to child node
+ node = node->childs_[ngram[i]];
+ // get pointer to value (index by length - 1)
+ values[len - i] = &node->value_;
+ // if null_value then assume all extensions impossible
+ if (node->value_ == null_value_)
+        return len - 1 - i; // max length possible
+ all_found = all_found && (node->value_ != unknown_value_);
+ if (all_found)
+ ++(*found);
+ } else {
+ // initialise uncached values
+ CacheNode<T> * newChild = newNode(node);
+ node->childs_[ngram[i]] = newChild;
+ // go to new node
+ node = newChild;
+ values[len - i] = &node->value_;
}
- return len; // all possible
- }
- bool clear() {
- std::cerr << "Clearing cache with " << static_cast<float>(cur_nodes_ * nodeSize())
- / static_cast<float>(1ull << 20) << "MB" << std::endl;
- return clearNodes(root_);
- }
- int nodes() {
- // returns number of nodes
- return cur_nodes_;
- }
- int nodeSize() {
- return sizeof(CacheNode<T>) + sizeof(root_->childs_);
- }
- private:
- CacheNode<T> * root_;
- count_t cur_nodes_;
- T unknown_value_; // Used to initialise data at each node
- T null_value_; // Indicates cached something not in model
- CacheNode<T>* newNode(CacheNode<T> * node = 0) {
- ++cur_nodes_;
- return new CacheNode<T>(unknown_value_);
}
- bool clearNodes(CacheNode<T> * node) {
- //delete children from this node
- if(!node->childs_.empty()) {
- iterate(node->childs_, itr) {
- if(!clearNodes(itr->second))
- std::cerr << "Error emptying cache\n";
- delete itr->second;
- --cur_nodes_;
- }
- node->childs_.clear();
+ return len; // all possible
+ }
+ bool clear() {
+ std::cerr << "Clearing cache with " << static_cast<float>(cur_nodes_ * nodeSize())
+ / static_cast<float>(1ull << 20) << "MB" << std::endl;
+ return clearNodes(root_);
+ }
+ int nodes() {
+ // returns number of nodes
+ return cur_nodes_;
+ }
+ int nodeSize() {
+ return sizeof(CacheNode<T>) + sizeof(root_->childs_);
+ }
+private:
+ CacheNode<T> * root_;
+ count_t cur_nodes_;
+ T unknown_value_; // Used to initialise data at each node
+ T null_value_; // Indicates cached something not in model
+ CacheNode<T>* newNode(CacheNode<T> * node = 0) {
+ ++cur_nodes_;
+ return new CacheNode<T>(unknown_value_);
+ }
+ bool clearNodes(CacheNode<T> * node) {
+ //delete children from this node
+ if(!node->childs_.empty()) {
+ iterate(node->childs_, itr) {
+ if(!clearNodes(itr->second))
+ std::cerr << "Error emptying cache\n";
+ delete itr->second;
+ --cur_nodes_;
}
- return true;
+ node->childs_.clear();
}
+ return true;
+ }
- };
+};
} //end namespace
#endif //INC_RANDLM_CACHE_H
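The cache trie above is keyed from the last word of the n-gram backwards (every loop runs i = len - 1 down to 0), so n-grams sharing a history share a path; unknown_value marks slots that were never queried and null_value marks n-grams confirmed absent from the model. A sketch of that contract, assuming wordID_t is the integer typedef from the surrounding DynSAInclude headers and using hypothetical sentinel scores:

    // Hypothetical sentinels: -1 = never queried, -2 = queried but absent.
    randlm::Cache<float> cache(-1.0f, -2.0f);

    float ScoreWithCache(const wordID_t* ngram, int len)
    {
      float value;
      const void* state;
      if (cache.checkCacheNgram(ngram, len, &value, &state))
        return value;                       // cached, known and non-null
      float score = 0.5f;                   // stand-in for a real model lookup
      cache.setCacheNgram(ngram, len, score, /*state=*/NULL);
      return score;
    }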
diff --git a/moses/TranslationModel/DynSAInclude/RandLMFilter.h b/moses/TranslationModel/DynSAInclude/RandLMFilter.h
index 298464693..0923f52af 100644
--- a/moses/TranslationModel/DynSAInclude/RandLMFilter.h
+++ b/moses/TranslationModel/DynSAInclude/RandLMFilter.h
@@ -24,296 +24,307 @@
#define log2(X) (log((double)X)/log((double)2))
#endif
-namespace randlm {
-
- /* Class Filter wraps a contiguous array of data. Filter and its subclasses
- * implement read/write/increment functionality on arrays with arbitrary sized addresses
- * (i.e. an address may not use a full number of bytes). When converting to byte-based
- * representation we assume "unused" bits are to left.
- * E.g. if the underlying data is stored in units T = uint16 and the 'width' = 11
- * to read 'address' = 3 we extract bits at indices [33,42] (i.e. [11*3, 11*4 - 1])
- * and store in a uint16 in positions 0000 0001 111111 where the first 7 bits have
- * been masked out.
- */
- template<typename T>
- class Filter {
- public:
- Filter(uint64_t addresses, int width) : addresses_(addresses), width_(width), data_(NULL) {
- // number of bits in T
- cell_width_ = sizeof(T) << 3;
- // current implementation has following constraints
- CHECK(cell_width_ > 0 && cell_width_ <= 64 && cell_width_ >= width);
- // used for >> division
- log_cell_width_ = static_cast<int>(floor(log((double)cell_width_)/log((double)2) + 0.000001));
- // size of underlying data in Ts
- cells_ = ((addresses * width) + cell_width_ - 1) >> log_cell_width_;
- // instantiate underlying data
- data_ = new T[cells_];
- CHECK(data_ != NULL);
- CHECK(reset());
- // 'first_bit' marks the first bit used by 'address' (left padded with zeros).
- first_bit_ = (width % cell_width_ == 0) ? 0 : cell_width_ - (width % cell_width_);
- // mask for full cell
- full_mask_ = static_cast<T>(0xffffffffffffffffull);
- // mask for bits that make up the address
- address_mask_ = full_mask_ >> first_bit_;
- }
- Filter(FileHandler* fin, bool loaddata = true) : data_(NULL) {
- CHECK(loadHeader(fin));
- if (loaddata)
- CHECK(loadData(fin));
- }
- virtual ~Filter() {
- delete[] data_;
- }
- bool reset() {
- for (uint64_t i = 0; i < cells_; ++i)
- data_[i] = 0;
+namespace randlm
+{
+
+/* Class Filter wraps a contiguous array of data. Filter and its subclasses
+ * implement read/write/increment functionality on arrays with arbitrary sized addresses
+ * (i.e. an address may not use a full number of bytes). When converting to byte-based
+ * representation we assume "unused" bits are to the left.
+ * E.g. if the underlying data is stored in units T = uint16 and the 'width' = 11,
+ * to read 'address' = 3 we extract the bits at indices [33,43] (i.e. [11*3, 11*4 - 1])
+ * and store them in a uint16 as 00000xxxxxxxxxxx, where the first 5 bits
+ * (16 - 11) have been masked out.
+ */
+template<typename T>
+class Filter
+{
+public:
+ Filter(uint64_t addresses, int width) : addresses_(addresses), width_(width), data_(NULL) {
+ // number of bits in T
+ cell_width_ = sizeof(T) << 3;
+ // current implementation has following constraints
+ CHECK(cell_width_ > 0 && cell_width_ <= 64 && cell_width_ >= width);
+ // used for >> division
+ log_cell_width_ = static_cast<int>(floor(log((double)cell_width_)/log((double)2) + 0.000001));
+ // size of underlying data in Ts
+ cells_ = ((addresses * width) + cell_width_ - 1) >> log_cell_width_;
+ // instantiate underlying data
+ data_ = new T[cells_];
+ CHECK(data_ != NULL);
+ CHECK(reset());
+ // 'first_bit' marks the first bit used by 'address' (left padded with zeros).
+ first_bit_ = (width % cell_width_ == 0) ? 0 : cell_width_ - (width % cell_width_);
+ // mask for full cell
+ full_mask_ = static_cast<T>(0xffffffffffffffffull);
+ // mask for bits that make up the address
+ address_mask_ = full_mask_ >> first_bit_;
+ }
+ Filter(FileHandler* fin, bool loaddata = true) : data_(NULL) {
+ CHECK(loadHeader(fin));
+ if (loaddata)
+ CHECK(loadData(fin));
+ }
+ virtual ~Filter() {
+ delete[] data_;
+ }
+ bool reset() {
+ for (uint64_t i = 0; i < cells_; ++i)
+ data_[i] = 0;
+ return true;
+ }
+ count_t size() {
+ // return approx size of filter in MBs
+ return cells_ * sizeof(T) >> 20;
+ }
+ // read / write functions
+ inline bool read(uint64_t address, T* value) {
+ CHECK(address <= addresses_);
+ // copy address to 'value'
+ uint64_t data_bit = address * width_;
+ uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
+ // 'offset' shows how address in 'data' and 'value' align
+ int offset = (data_bit % cell_width_) - first_bit_;
+ // they align so just copy across masking unneeded leading bits
+ if (offset == 0) {
+ *value = data_[data_cell] & address_mask_;
return true;
}
- count_t size() {
- // return approx size of filter in MBs
- return cells_ * sizeof(T) >> 20;
- }
- // read / write functions
- inline bool read(uint64_t address, T* value) {
- CHECK(address <= addresses_);
- // copy address to 'value'
- uint64_t data_bit = address * width_;
- uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
- // 'offset' shows how address in 'data' and 'value' align
- int offset = (data_bit % cell_width_) - first_bit_;
- // they align so just copy across masking unneeded leading bits
- if (offset == 0) {
- *value = data_[data_cell] & address_mask_;
- return true;
- }
- // data address starts to left so shift it right
- if (offset < 0) {
- *value = (data_[data_cell] >> -offset) & address_mask_;
- return true;
- }
- // data address is to right so shift it left and look at one more cell to right
- *value = ((data_[data_cell] << offset)
- | (data_[data_cell + 1] >> (cell_width_ - offset))) & address_mask_ ;
+ // data address starts to left so shift it right
+ if (offset < 0) {
+ *value = (data_[data_cell] >> -offset) & address_mask_;
return true;
}
- inline T read(uint64_t address) {
- CHECK(address <= addresses_);
- // return value at address
- T value = 0;
- uint64_t data_bit = address * width_;
- uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
- // 'offset' shows how address in 'data' and 'value' align
- int offset = (data_bit % cell_width_) - first_bit_;
- // they align so just copy across masking unneeded leading bits
- if (offset == 0) {
- value = data_[data_cell] & address_mask_;
- }
- // data address starts to left so shift it right
- else if (offset < 0) {
- value = (data_[data_cell] >> -offset) & address_mask_;
- }
- // data address is to right so shift it left and look at one more cell to right
- else
- value = ((data_[data_cell] << offset)
- | (data_[data_cell + 1] >> (cell_width_ - offset))) & address_mask_ ;
- return value;
- }
- inline bool write(uint64_t address, T value) {
- CHECK(address <= addresses_);
- CHECK(log2(value) <= width_);
- // write 'value' to address
- uint64_t data_bit = address * width_;
- uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
- // 'offset' shows how address in 'data' and 'value' align
- int offset = (data_bit % cell_width_) - first_bit_;
- // they align so just copy across masking unneeded leading zeros of value
- if (offset == 0) {
- data_[data_cell] = value | (data_[data_cell] & ~address_mask_);
- return true;
- }
- // address in data is to left so shift value left by -offset
- if (offset < 0) {
- data_[data_cell] = (value << -offset)
- | (data_[data_cell] & ~(address_mask_ << -offset));
- return true;
- }
- // address in data is to right so shift value right by offset
- data_[data_cell] = (value >> offset) |
- (data_[data_cell] & ~(address_mask_ >> offset));
- data_[data_cell + 1] = (value << (cell_width_ - offset)) |
- (data_[data_cell + 1] & (full_mask_ >> offset));
- return true;
- }
- inline bool readWithFingerprint(uint64_t address, T finger, T* value) {
- // copy 'address' ^ 'finger' to 'value'
- uint64_t data_bit = address * width_;
- uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
- // 'offset' shows how address in 'data' and 'value' align
- int offset = (data_bit % cell_width_) - first_bit_;
- // they align so just copy across masking unneeded leading bits
- if (offset == 0) {
- *value = (finger ^ data_[data_cell]) & address_mask_;
- return true;
- }
- // data address starts to left so shift it right
- if (offset < 0) {
- *value = ((data_[data_cell] >> -offset) ^ finger) & address_mask_;
- return true;
- }
- // data address is to right so shift it left and look at one more cell to right
- *value = (((data_[data_cell] << offset)
- | (data_[data_cell + 1] >> (cell_width_ - offset))) ^ finger)
- & address_mask_ ;
- return true;
+ // data address is to right so shift it left and look at one more cell to right
+ *value = ((data_[data_cell] << offset)
+ | (data_[data_cell + 1] >> (cell_width_ - offset))) & address_mask_ ;
+ return true;
+ }
+ inline T read(uint64_t address) {
+ CHECK(address <= addresses_);
+ // return value at address
+ T value = 0;
+ uint64_t data_bit = address * width_;
+ uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
+ // 'offset' shows how address in 'data' and 'value' align
+ int offset = (data_bit % cell_width_) - first_bit_;
+ // they align so just copy across masking unneeded leading bits
+ if (offset == 0) {
+ value = data_[data_cell] & address_mask_;
}
- inline bool writeWithFingerprint(uint64_t address, T finger, T value) {
- // write 'value' ^ 'finger' to address
- finger &= address_mask_; // make sure fingerprint is correct size
- uint64_t data_bit = address * width_;
- uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
- // 'offset' shows how address in 'data' and 'value' align
- int offset = (data_bit % cell_width_) - first_bit_;
- // they align so just copy across masking unneeded leading zeros of value
- if (offset == 0) {
- data_[data_cell] = (finger ^ value) | (data_[data_cell] & ~address_mask_);
- return true;
- }
- // address in data is to left so shift value left by -offset
- if (offset < 0) {
- data_[data_cell] = ((finger ^ value) << -offset)
- | (data_[data_cell] & ~(address_mask_ << -offset));
- return true;
- }
- // address in data is to right so shift value right by offset
- data_[data_cell] = ((finger ^ value) >> offset) |
- (data_[data_cell] & ~(address_mask_ >> offset));
- data_[data_cell + 1] = ((finger ^ value) << (cell_width_ - offset)) |
- (data_[data_cell + 1] & (full_mask_ >> offset));
- return true;
- }
- // debugging
- void printFilter(const std::string & prefix = "", uint32_t truncate = 64){
- std::cout << prefix;
- for (uint32_t i = 0; i < cells_ && i < truncate; ++i) {
- for (int j = cell_width_ - 1; j >= 0; --j)
- if (data_[i] & (1ull << j))
- std::cout << 1;
- else
- std::cout << 0;
- std::cout << "\n";
- }
- std::cout << std::endl;
+ // data address starts to left so shift it right
+ else if (offset < 0) {
+ value = (data_[data_cell] >> -offset) & address_mask_;
}
- // i/o
- uint64_t getAddresses() { return addresses_; }
- int getWidth() { return width_; }
- int getCellWidth() { return cell_width_; }
- uint32_t getCells() { return cells_; }
- virtual bool save(FileHandler* out) {
- CHECK(out != NULL);
- CHECK(out->write((char*)&cells_, sizeof(cells_)));
- CHECK(out->write((char*)&cell_width_, sizeof(cell_width_)));
- CHECK(out->write((char*)&log_cell_width_, sizeof(log_cell_width_)));
- CHECK(out->write((char*)&addresses_, sizeof(addresses_)));
- CHECK(out->write((char*)&width_, sizeof(width_)));
- CHECK(out->write((char*)&first_bit_, sizeof(first_bit_)));
- CHECK(out->write((char*)&full_mask_, sizeof(full_mask_)));
- CHECK(out->write((char*)&address_mask_, sizeof(address_mask_)));
- //CHECK(out->write((char*)data_, cells_ * sizeof(T)));
- const uint64_t jump = 524288032ul; //(uint64_t)pow(2, 29);
- if((width_ == 1) || cells_ < jump)
- CHECK(out->write((char*)data_, cells_ * sizeof(T)));
- else {
- uint64_t idx(0);
- while(idx + jump < cells_) {
- CHECK(out->write((char*)&data_[idx], jump * sizeof(T)));
- idx += jump;
- }
- CHECK(out->write((char*)&data_[idx], (cells_ - idx) * sizeof(T)));
- }
+ // data address is to right so shift it left and look at one more cell to right
+ else
+ value = ((data_[data_cell] << offset)
+ | (data_[data_cell + 1] >> (cell_width_ - offset))) & address_mask_ ;
+ return value;
+ }
+ inline bool write(uint64_t address, T value) {
+ CHECK(address <= addresses_);
+ CHECK(log2(value) <= width_);
+ // write 'value' to address
+ uint64_t data_bit = address * width_;
+ uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
+ // 'offset' shows how address in 'data' and 'value' align
+ int offset = (data_bit % cell_width_) - first_bit_;
+ // they align so just copy across masking unneeded leading zeros of value
+ if (offset == 0) {
+ data_[data_cell] = value | (data_[data_cell] & ~address_mask_);
return true;
}
- protected:
- bool loadHeader(FileHandler* fin) {
- CHECK(fin != NULL);
- CHECK(fin->read((char*)&cells_, sizeof(cells_)));
- CHECK(fin->read((char*)&cell_width_, sizeof(cell_width_)));
- CHECK(cell_width_ == sizeof(T) << 3); // make sure correct underlying data type
- CHECK(fin->read((char*)&log_cell_width_, sizeof(log_cell_width_)));
- CHECK(fin->read((char*)&addresses_, sizeof(addresses_)));
- CHECK(fin->read((char*)&width_, sizeof(width_)));
- CHECK(fin->read((char*)&first_bit_, sizeof(first_bit_)));
- CHECK(fin->read((char*)&full_mask_, sizeof(full_mask_)));
- CHECK(fin->read((char*)&address_mask_, sizeof(address_mask_)));
+ // address in data is to left so shift value left by -offset
+ if (offset < 0) {
+ data_[data_cell] = (value << -offset)
+ | (data_[data_cell] & ~(address_mask_ << -offset));
return true;
}
- bool loadData(FileHandler* fin) {
- // instantiate underlying array
- data_ = new T[cells_];
- CHECK(data_ != NULL);
- CHECK(fin->read((char*)data_, cells_ * sizeof(T)));
- //CHECK(fin->read((char*)&data_[0], ceil(float(cells_) / 2.0) * sizeof(T)));
- //CHECK(fin->read((char*)&data_[cells_ / 2], (cells_ / 2) * sizeof(T)));
+ // address in data is to right so shift value right by offset
+ data_[data_cell] = (value >> offset) |
+ (data_[data_cell] & ~(address_mask_ >> offset));
+ data_[data_cell + 1] = (value << (cell_width_ - offset)) |
+ (data_[data_cell + 1] & (full_mask_ >> offset));
+ return true;
+ }
+ inline bool readWithFingerprint(uint64_t address, T finger, T* value) {
+ // copy 'address' ^ 'finger' to 'value'
+ uint64_t data_bit = address * width_;
+ uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
+ // 'offset' shows how address in 'data' and 'value' align
+ int offset = (data_bit % cell_width_) - first_bit_;
+ // they align so just copy across masking unneeded leading bits
+ if (offset == 0) {
+ *value = (finger ^ data_[data_cell]) & address_mask_;
return true;
}
- uint64_t cells_; // number T making up 'data_'
- int cell_width_; // bits per cell (i.e. sizeof(T) << 3)
- int log_cell_width_; // log of bits used for >> division
- uint64_t addresses_; // number of addresses in the filter
- int width_; // width in bits of each address
- int first_bit_; // position of first bit in initial byte
- T full_mask_; // all 1s
- T address_mask_; // 1s in those positions that are part of address
- T* data_; // the raw data as bytes
- };
-
- // Extension with bit test/setter methods added
- class BitFilter : public Filter<uint8_t> {
- public:
- BitFilter(uint64_t bits) : Filter<uint8_t>(bits, 1) {}
- BitFilter(FileHandler* fin, bool loaddata = true)
- : Filter<uint8_t>(fin, loaddata) {
- if (loaddata)
- CHECK(load(fin));
- }
- // TODO: overload operator[]
- virtual bool testBit(uint64_t location) {
- // test bit referenced by location
- return data_[(location % addresses_) >> 3] & 1 << ((location % addresses_) % 8);
- }
- virtual bool setBit(uint64_t location) {
- // set bit referenced by location
- data_[(location % addresses_) >> 3] |= 1 << ((location % addresses_) % 8);
+ // data address starts to left so shift it right
+ if (offset < 0) {
+ *value = ((data_[data_cell] >> -offset) ^ finger) & address_mask_;
return true;
}
- virtual bool clearBit(uint64_t location) {
- // set bit referenced by location
- data_[(location % addresses_) >> 3] &= 0 << ((location % addresses_) % 8);
+ // data address is to right so shift it left and look at one more cell to right
+ *value = (((data_[data_cell] << offset)
+ | (data_[data_cell + 1] >> (cell_width_ - offset))) ^ finger)
+ & address_mask_ ;
+ return true;
+ }
+ inline bool writeWithFingerprint(uint64_t address, T finger, T value) {
+ // write 'value' ^ 'finger' to address
+ finger &= address_mask_; // make sure fingerprint is correct size
+ uint64_t data_bit = address * width_;
+ uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
+ // 'offset' shows how address in 'data' and 'value' align
+ int offset = (data_bit % cell_width_) - first_bit_;
+ // they align so just copy across masking unneeded leading zeros of value
+ if (offset == 0) {
+ data_[data_cell] = (finger ^ value) | (data_[data_cell] & ~address_mask_);
return true;
}
- bool save(FileHandler* fout) {
- CHECK(Filter<uint8_t>::save(fout));
- std::cerr << "Saved BitFilter. Rho = " << rho() << "." << std::endl;;
+ // address in data is to left so shift value left by -offset
+ if (offset < 0) {
+ data_[data_cell] = ((finger ^ value) << -offset)
+ | (data_[data_cell] & ~(address_mask_ << -offset));
return true;
}
- float rho(uint64_t limit = 0) {
- uint64_t ones = 0;
- uint64_t range = limit > 0 ? std::min(limit,cells_) : cells_;
- for (uint64_t i = 0; i < range; ++i)
- for (int j = 0; j < 8; ++j)
- if (data_[i] & (1 << j))
- ++ones;
- return static_cast<float>((range << 3) - ones)/static_cast<float>(range << 3);
+ // address in data is to right so shift value right by offset
+ data_[data_cell] = ((finger ^ value) >> offset) |
+ (data_[data_cell] & ~(address_mask_ >> offset));
+ data_[data_cell + 1] = ((finger ^ value) << (cell_width_ - offset)) |
+ (data_[data_cell + 1] & (full_mask_ >> offset));
+ return true;
+ }
+ // debugging
+ void printFilter(const std::string & prefix = "", uint32_t truncate = 64) {
+ std::cout << prefix;
+ for (uint32_t i = 0; i < cells_ && i < truncate; ++i) {
+ for (int j = cell_width_ - 1; j >= 0; --j)
+ if (data_[i] & (1ull << j))
+ std::cout << 1;
+ else
+ std::cout << 0;
+ std::cout << "\n";
}
- protected:
- bool load(FileHandler* fin) {
- std::cerr << "Loaded BitFilter. Rho = " << rho() << "." << std::endl;;
- return true;
+ std::cout << std::endl;
+ }
+ // i/o
+ uint64_t getAddresses() {
+ return addresses_;
+ }
+ int getWidth() {
+ return width_;
+ }
+ int getCellWidth() {
+ return cell_width_;
+ }
+ uint32_t getCells() {
+ return cells_;
+ }
+ virtual bool save(FileHandler* out) {
+ CHECK(out != NULL);
+ CHECK(out->write((char*)&cells_, sizeof(cells_)));
+ CHECK(out->write((char*)&cell_width_, sizeof(cell_width_)));
+ CHECK(out->write((char*)&log_cell_width_, sizeof(log_cell_width_)));
+ CHECK(out->write((char*)&addresses_, sizeof(addresses_)));
+ CHECK(out->write((char*)&width_, sizeof(width_)));
+ CHECK(out->write((char*)&first_bit_, sizeof(first_bit_)));
+ CHECK(out->write((char*)&full_mask_, sizeof(full_mask_)));
+ CHECK(out->write((char*)&address_mask_, sizeof(address_mask_)));
+ //CHECK(out->write((char*)data_, cells_ * sizeof(T)));
+ const uint64_t jump = 524288032ul; //(uint64_t)pow(2, 29);
+ if((width_ == 1) || cells_ < jump)
+ CHECK(out->write((char*)data_, cells_ * sizeof(T)));
+ else {
+ uint64_t idx(0);
+ while(idx + jump < cells_) {
+ CHECK(out->write((char*)&data_[idx], jump * sizeof(T)));
+ idx += jump;
+ }
+ CHECK(out->write((char*)&data_[idx], (cells_ - idx) * sizeof(T)));
}
- };
-/*
+ return true;
+ }
+protected:
+ bool loadHeader(FileHandler* fin) {
+ CHECK(fin != NULL);
+ CHECK(fin->read((char*)&cells_, sizeof(cells_)));
+ CHECK(fin->read((char*)&cell_width_, sizeof(cell_width_)));
+ CHECK(cell_width_ == sizeof(T) << 3); // make sure correct underlying data type
+ CHECK(fin->read((char*)&log_cell_width_, sizeof(log_cell_width_)));
+ CHECK(fin->read((char*)&addresses_, sizeof(addresses_)));
+ CHECK(fin->read((char*)&width_, sizeof(width_)));
+ CHECK(fin->read((char*)&first_bit_, sizeof(first_bit_)));
+ CHECK(fin->read((char*)&full_mask_, sizeof(full_mask_)));
+ CHECK(fin->read((char*)&address_mask_, sizeof(address_mask_)));
+ return true;
+ }
+ bool loadData(FileHandler* fin) {
+ // instantiate underlying array
+ data_ = new T[cells_];
+ CHECK(data_ != NULL);
+ CHECK(fin->read((char*)data_, cells_ * sizeof(T)));
+ //CHECK(fin->read((char*)&data_[0], ceil(float(cells_) / 2.0) * sizeof(T)));
+ //CHECK(fin->read((char*)&data_[cells_ / 2], (cells_ / 2) * sizeof(T)));
+ return true;
+ }
+ uint64_t cells_; // number of Ts making up 'data_'
+ int cell_width_; // bits per cell (i.e. sizeof(T) << 3)
+ int log_cell_width_; // log of bits used for >> division
+ uint64_t addresses_; // number of addresses in the filter
+ int width_; // width in bits of each address
+ int first_bit_; // position of first bit in initial byte
+ T full_mask_; // all 1s
+ T address_mask_; // 1s in those positions that are part of address
+ T* data_; // the raw data as bytes
+};
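
As a reference for the bit-packing logic above, here is a minimal standalone sketch (not part of the patch; all constants illustrative) of the cell/offset arithmetic behind Filter<T>::read() and write(), using uint8_t cells and 5-bit values:

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  const int cell_width = 8;                        // bits per uint8_t cell
  const int width = 5;                             // bits per stored value
  const int first_bit = cell_width - width;        // left padding inside a cell: 3
  const uint8_t address_mask = 0xFFu >> first_bit; // 0x1F, the low 5 bits

  std::vector<uint8_t> data(8, 0);
  uint8_t value = 0x16;                            // 10110 in binary
  uint64_t data_bit = 1 * width;                   // address 1 starts at bit 5
  uint32_t cell = uint32_t(data_bit / cell_width); // cell 0
  int offset = int(data_bit % cell_width) - first_bit; // 2 > 0: value straddles two cells
  // write: high bits into 'cell', low bits into 'cell + 1' (the offset > 0 branch)
  data[cell] = (value >> offset) | (data[cell] & ~(address_mask >> offset));
  data[cell + 1] = uint8_t(value << (cell_width - offset)) |
                   (data[cell + 1] & (0xFFu >> offset));
  // read: mirror the shifts and mask off the padding
  uint8_t got = ((data[cell] << offset) |
                 (data[cell + 1] >> (cell_width - offset))) & address_mask;
  assert(got == value);
  return 0;
}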
+
+// Extension with bit test/setter methods added
+class BitFilter : public Filter<uint8_t>
+{
+public:
+ BitFilter(uint64_t bits) : Filter<uint8_t>(bits, 1) {}
+ BitFilter(FileHandler* fin, bool loaddata = true)
+ : Filter<uint8_t>(fin, loaddata) {
+ if (loaddata)
+ CHECK(load(fin));
+ }
+ // TODO: overload operator[]
+ virtual bool testBit(uint64_t location) {
+ // test bit referenced by location
+ return data_[(location % addresses_) >> 3] & 1 << ((location % addresses_) % 8);
+ }
+ virtual bool setBit(uint64_t location) {
+ // set bit referenced by location
+ data_[(location % addresses_) >> 3] |= 1 << ((location % addresses_) % 8);
+ return true;
+ }
+ virtual bool clearBit(uint64_t location) {
+ // clear bit referenced by location
+ data_[(location % addresses_) >> 3] &= ~(1 << ((location % addresses_) % 8));
+ return true;
+ }
+ bool save(FileHandler* fout) {
+ CHECK(Filter<uint8_t>::save(fout));
+ std::cerr << "Saved BitFilter. Rho = " << rho() << "." << std::endl;;
+ return true;
+ }
+ float rho(uint64_t limit = 0) {
+ uint64_t ones = 0;
+ uint64_t range = limit > 0 ? std::min(limit,cells_) : cells_;
+ for (uint64_t i = 0; i < range; ++i)
+ for (int j = 0; j < 8; ++j)
+ if (data_[i] & (1 << j))
+ ++ones;
+ return static_cast<float>((range << 3) - ones)/static_cast<float>(range << 3);
+ }
+protected:
+ bool load(FileHandler* fin) {
+ std::cerr << "Loaded BitFilter. Rho = " << rho() << "." << std::endl;;
+ return true;
+ }
+};
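
A similar sketch (illustrative, not part of the patch) of BitFilter's byte-addressed bit operations and the rho() statistic, where rho is the fraction of zero bits; the clear step masks with ~(1 << k) so only the addressed bit is dropped:

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<uint8_t> bits(4, 0);                 // 32 addressable bits
  uint64_t loc = 13;
  bits[loc >> 3] |= uint8_t(1u << (loc % 8));      // setBit
  bool on = bits[loc >> 3] & (1u << (loc % 8));    // testBit -> true
  // rho: fraction of zero bits over the array (1 set bit out of 32 here)
  uint64_t ones = 0;
  for (uint8_t b : bits)
    for (int j = 0; j < 8; ++j)
      if (b & (1 << j)) ++ones;
  float rho = float(bits.size() * 8 - ones) / float(bits.size() * 8);
  bits[loc >> 3] &= uint8_t(~(1u << (loc % 8)));   // clearBit drops only that bit
  std::cout << on << " " << rho << std::endl;      // prints 1 0.96875
  return 0;
}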
+/*
// ResizedBitFilter deals with resizing to save memory
// whereas other filters should expect locations to be within range
// this filter will need to resize (and possibly rehash) locations
@@ -385,9 +396,9 @@ namespace randlm {
carry = incrementSubCell(data_bit, this->width_, &this->data_[data_cell]);
}
// last update must not have carried
- if (!carry)
+ if (!carry)
return true;
- // wrapped round so check whether need to reset to max count
+ // wrapped round so check whether need to reset to max count
if (!wrap_around_)
CHECK(this->write(address, this->address_mask_));
return false; // false to indicate that overflowed
@@ -402,7 +413,7 @@ namespace randlm {
}
inline bool incrementSubCell(int bit, int len, T* cell) {
// increment counter consisting of bits [startbit, startbit + len - 1] rest stays unchanged
- *cell = ((((*cell >> (this->cell_width_ - bit - len)) + 1)
+ *cell = ((((*cell >> (this->cell_width_ - bit - len)) + 1)
& (this->full_mask_ >> (this->cell_width_ - len))) << (this->cell_width_ - bit - len))
| (*cell & ~(((this->full_mask_ >> (this->cell_width_ - len)) << (this->cell_width_ - bit - len))));
// indicate overflow as true
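
The hunk above reindents incrementSubCell(), which bumps a counter packed into a sub-field of a cell while leaving the neighbouring bits intact. A standalone sketch of that update (field position and values are illustrative):

#include <cassert>
#include <cstdint>

int main() {
  const int cell_width = 8, bit = 2, len = 3;    // 3-bit counter starting 2 bits from the MSB
  uint8_t cell = 0xA5;                           // 1010 0101
  const uint8_t field_mask = uint8_t(0xFFu >> (cell_width - len)); // 0x07
  const int shift = cell_width - bit - len;      // 3
  uint8_t counter = (cell >> shift) & field_mask;          // current value: 4
  uint8_t next = (counter + 1) & field_mask;               // wraps at 2^len
  cell = uint8_t((next << shift) | (cell & ~(field_mask << shift)));
  assert(((cell >> shift) & field_mask) == 5);   // incremented in place
  bool carry = (next == 0);                      // overflow signal, as above
  assert(!carry);
  return 0;
}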
diff --git a/moses/TranslationModel/DynSAInclude/hash.h b/moses/TranslationModel/DynSAInclude/hash.h
index 03669845e..9e6cfe62a 100644
--- a/moses/TranslationModel/DynSAInclude/hash.h
+++ b/moses/TranslationModel/DynSAInclude/hash.h
@@ -11,60 +11,68 @@ typedef uint64_t P; // largest input range is 2^64
//! @todo ask abby2
template <typename T>
-class HashBase {
- protected:
- T m_; // range of hash output
- count_t H_; // number of hash functions to instantiate
- virtual void initSeeds()=0;
- virtual void freeSeeds()=0;
- public:
- HashBase(float m, count_t H=1):m_((T)m), H_(H) {
- //cerr << "range = (0..." << m_ << "]" << endl;
- }
- HashBase(FileHandler* fin) {
- load(fin);
- }
- virtual ~HashBase(){}
- virtual T hash(const char*s, count_t h)=0; // string hashing
- virtual T hash(const wordID_t* id, const int len, count_t h)=0; // vocab mapped hashing
- count_t size() { return H_;}
- virtual void save(FileHandler* fout) {
- CHECK(fout != 0);
- fout->write((char*)&m_, sizeof(m_));
- fout->write((char*)&H_, sizeof(H_));
- }
- virtual void load(FileHandler* fin) {
- CHECK(fin != 0);
- fin->read((char*)&m_, sizeof(m_));
- fin->read((char*)&H_, sizeof(H_));
- }
+class HashBase
+{
+protected:
+ T m_; // range of hash output
+ count_t H_; // number of hash functions to instantiate
+ virtual void initSeeds()=0;
+ virtual void freeSeeds()=0;
+public:
+ HashBase(float m, count_t H=1):m_((T)m), H_(H) {
+ //cerr << "range = (0..." << m_ << "]" << endl;
+ }
+ HashBase(FileHandler* fin) {
+ load(fin);
+ }
+ virtual ~HashBase() {}
+ virtual T hash(const char*s, count_t h)=0; // string hashing
+ virtual T hash(const wordID_t* id, const int len, count_t h)=0; // vocab mapped hashing
+ count_t size() {
+ return H_;
+ }
+ virtual void save(FileHandler* fout) {
+ CHECK(fout != 0);
+ fout->write((char*)&m_, sizeof(m_));
+ fout->write((char*)&H_, sizeof(H_));
+ }
+ virtual void load(FileHandler* fin) {
+ CHECK(fin != 0);
+ fin->read((char*)&m_, sizeof(m_));
+ fin->read((char*)&H_, sizeof(H_));
+ }
};
//! @todo ask abby2
template <typename T>
-class UnivHash_linear: public HashBase<T> {
- public:
- UnivHash_linear(float m, count_t H, P pr):
- HashBase<T>(m, H), pr_(pr) {
- //CHECK(isPrime(pr_));
- initSeeds();
- }
- UnivHash_linear(FileHandler* fin):
- HashBase<T>(fin) {
- load(fin);
- }
- ~UnivHash_linear() {freeSeeds();}
- T hash(const char* s, count_t h){return 0;} //not implemented
- T hash(const wordID_t* id, const int len, count_t h);
- T hash(const wordID_t id, const count_t pos,
- const T prevValue, count_t h);
- void save(FileHandler* fout);
- void load(FileHandler* fin);
- private:
- T** a_, **b_;
- P pr_;
- void initSeeds();
- void freeSeeds();
+class UnivHash_linear: public HashBase<T>
+{
+public:
+ UnivHash_linear(float m, count_t H, P pr):
+ HashBase<T>(m, H), pr_(pr) {
+ //CHECK(isPrime(pr_));
+ initSeeds();
+ }
+ UnivHash_linear(FileHandler* fin):
+ HashBase<T>(fin) {
+ load(fin);
+ }
+ ~UnivHash_linear() {
+ freeSeeds();
+ }
+ T hash(const char* s, count_t h) {
+ return 0; //not implemented
+ }
+ T hash(const wordID_t* id, const int len, count_t h);
+ T hash(const wordID_t id, const count_t pos,
+ const T prevValue, count_t h);
+ void save(FileHandler* fout);
+ void load(FileHandler* fin);
+private:
+ T** a_, **b_;
+ P pr_;
+ void initSeeds();
+ void freeSeeds();
};
/** UnivHash_noPrimes:
@@ -74,76 +82,91 @@ class UnivHash_linear: public HashBase<T> {
* # of hash function = 2^(l-1)
*/
template <typename T>
-class UnivHash_noPrimes: public HashBase<T> {
- public:
- UnivHash_noPrimes(float k, float l):
- HashBase<T>(k, 100), d_(count_t((l-k))) {
- if(((int)l >> 3) == sizeof(P)) p_ = (P) pow(2,l) - 1;
- else p_ = (P) pow(2,l);
- initSeeds();
- }
- UnivHash_noPrimes(FileHandler* fin):
- HashBase<T>(fin) {
- load(fin);
- }
- ~UnivHash_noPrimes() {freeSeeds();}
- T hash(const char* s, count_t h);
- T hash(const wordID_t* id, const int len, count_t h);
- T hash(const P x, count_t h);
- void save(FileHandler* fout);
- void load(FileHandler* fin);
- private:
- count_t d_; // l-k
- P p_, *a_; // real-valued input range, storage
- void initSeeds();
- void freeSeeds() {delete[] a_;}
+class UnivHash_noPrimes: public HashBase<T>
+{
+public:
+ UnivHash_noPrimes(float k, float l):
+ HashBase<T>(k, 100), d_(count_t((l-k))) {
+ if(((int)l >> 3) == sizeof(P)) p_ = (P) pow(2,l) - 1;
+ else p_ = (P) pow(2,l);
+ initSeeds();
+ }
+ UnivHash_noPrimes(FileHandler* fin):
+ HashBase<T>(fin) {
+ load(fin);
+ }
+ ~UnivHash_noPrimes() {
+ freeSeeds();
+ }
+ T hash(const char* s, count_t h);
+ T hash(const wordID_t* id, const int len, count_t h);
+ T hash(const P x, count_t h);
+ void save(FileHandler* fout);
+ void load(FileHandler* fin);
+private:
+ count_t d_; // l-k
+ P p_, *a_; // real-valued input range, storage
+ void initSeeds();
+ void freeSeeds() {
+ delete[] a_;
+ }
};
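
A standalone sketch (constants illustrative, not part of the patch) of the power-of-two universal hash this class computes, h_a(x) = (a*x mod 2^l) div 2^(l-k):

#include <cstdint>
#include <iostream>

int main() {
  const int l = 32, k = 20;                 // input range 2^l, output range 2^k
  const uint64_t p = 1ULL << l;             // p_ above (the modulus)
  const int d = l - k;                      // d_ above
  const uint64_t a = 2654435769ULL;         // stands in for a random seed a_[h]
  uint64_t x = 123456789ULL;
  uint64_t h = ((a * x) % p) >> d;          // keep the top k bits of a*x mod 2^l
  std::cout << h << std::endl;              // a value in [0, 2^k)
  return 0;
}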
//! @todo ask abby2
template <typename T>
-class Hash_shiftAddXOR: public HashBase<T> {
- public:
- Hash_shiftAddXOR(float m, count_t H=5): HashBase<T>(m,H),
- l_(5), r_(2) {
- initSeeds();
- }
- ~Hash_shiftAddXOR() {freeSeeds();}
- T hash(const char* s, count_t h);
- T hash(const wordID_t* id, const int len, count_t h) {} // empty
- private:
- T* v_; // random seed storage
- const unsigned short l_, r_; // left-shift bits, right-shift bits
- void initSeeds();
- void freeSeeds() {delete[] v_;}
+class Hash_shiftAddXOR: public HashBase<T>
+{
+public:
+ Hash_shiftAddXOR(float m, count_t H=5): HashBase<T>(m,H),
+ l_(5), r_(2) {
+ initSeeds();
+ }
+ ~Hash_shiftAddXOR() {
+ freeSeeds();
+ }
+ T hash(const char* s, count_t h);
+ T hash(const wordID_t* id, const int len, count_t h) {} // empty
+private:
+ T* v_; // random seed storage
+ const unsigned short l_, r_; // left-shift bits, right-shift bits
+ void initSeeds();
+ void freeSeeds() {
+ delete[] v_;
+ }
};
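
The per-character body of this hash is cut off by the hunk below; the recurrence conventionally used by shift-add-XOR hashing is v ^= (v << l) + (v >> r) + c, and this sketch assumes that form, with l = 5 and r = 2 as fixed in the class (seed and modulus are illustrative):

#include <cstdint>
#include <iostream>

int main() {
  const unsigned l = 5, r = 2;               // the shift amounts fixed above
  uint64_t v = 0x9E3779B9ULL;                // stands in for the random seed v_[h]
  for (const char* s = "moses"; *s; ++s)
    v ^= (v << l) + (v >> r) + (unsigned char)*s;
  std::cout << (v % 1000003) << std::endl;   // reduce into an output range m_
  return 0;
}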
//! @todo ask abby2
template <typename T>
-class UnivHash_tableXOR: public HashBase<T> {
- public:
- UnivHash_tableXOR(float m, count_t H=5): HashBase<T>(m, H),
- table_(NULL), tblLen_(255*MAX_STR_LEN) {
- initSeeds();
- }
- ~UnivHash_tableXOR() {freeSeeds();}
- T hash(const char* s, count_t h);
- T hash(const wordID_t* id, const int len, count_t h) {}
- private:
- T** table_; // storage for random numbers
- count_t tblLen_; // length of table
- void initSeeds();
- void freeSeeds();
+class UnivHash_tableXOR: public HashBase<T>
+{
+public:
+ UnivHash_tableXOR(float m, count_t H=5): HashBase<T>(m, H),
+ table_(NULL), tblLen_(255*MAX_STR_LEN) {
+ initSeeds();
+ }
+ ~UnivHash_tableXOR() {
+ freeSeeds();
+ }
+ T hash(const char* s, count_t h);
+ T hash(const wordID_t* id, const int len, count_t h) {}
+private:
+ T** table_; // storage for random numbers
+ count_t tblLen_; // length of table
+ void initSeeds();
+ void freeSeeds();
};
// ShiftAddXor
template <typename T>
-void Hash_shiftAddXOR<T>::initSeeds() {
+void Hash_shiftAddXOR<T>::initSeeds()
+{
v_ = new T[this->H_];
for(count_t i=0; i < this->H_; i++)
- v_[i] = Utils::rand<T>() + 1;
+ v_[i] = Utils::rand<T>() + 1;
}
template <typename T>
-T Hash_shiftAddXOR<T>::hash(const char* s, count_t h) {
+T Hash_shiftAddXOR<T>::hash(const char* s, count_t h)
+{
T value = v_[h];
int pos(0);
unsigned char c;
@@ -155,40 +178,44 @@ T Hash_shiftAddXOR<T>::hash(const char* s, count_t h) {
// UnivHash_tableXOR
template <typename T>
-void UnivHash_tableXOR<T>::initSeeds() {
+void UnivHash_tableXOR<T>::initSeeds()
+{
// delete any values in table
- if(table_) freeSeeds();
+ if(table_) freeSeeds();
// instance of new table
table_ = new T* [this->H_];
// fill with random values
for(count_t j=0; j < this->H_; j++) {
table_[j] = new T[tblLen_];
- for(count_t i=0; i < tblLen_; i++) {
- table_[j][i] = Utils::rand<T>(this->m_-1);
+ for(count_t i=0; i < tblLen_; i++) {
+ table_[j][i] = Utils::rand<T>(this->m_-1);
}
}
}
template <typename T>
-void UnivHash_tableXOR<T>::freeSeeds() {
+void UnivHash_tableXOR<T>::freeSeeds()
+{
for(count_t j = 0; j < this->H_; j++)
delete[] table_[j];
delete[] table_;
table_ = NULL;
}
template <typename T>
-T UnivHash_tableXOR<T>::hash(const char* s, count_t h) {
+T UnivHash_tableXOR<T>::hash(const char* s, count_t h)
+{
T value = 0;
count_t pos = 0, idx = 0;
unsigned char c;
while((c = *s++) && (++pos < MAX_STR_LEN))
value ^= table_[h][idx += c];
- CHECK(value < this->m_);
+ CHECK(value < this->m_);
return value;
}
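
A self-contained sketch of the same table-XOR scheme: XOR one pseudo-random table entry per input byte, with the index accumulating so repeated characters land on different rows (table fill and sizes are illustrative, not the class's random seeding):

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const uint32_t m = 1u << 20;                   // output range, like m_ above
  std::vector<uint32_t> table(255 * 100);
  for (size_t i = 0; i < table.size(); ++i)
    table[i] = (2654435761u * uint32_t(i + 1)) % m;  // fixed pseudo-random fill
  uint32_t value = 0;
  size_t idx = 0;
  for (const char* s = "moses"; *s; ++s)
    value ^= table[idx += (unsigned char)*s];    // idx accumulates, as above
  std::cout << value << std::endl;               // XOR of entries < m stays < m
  return 0;
}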
// UnivHash_noPrimes
template <typename T>
-void UnivHash_noPrimes<T>::initSeeds() {
+void UnivHash_noPrimes<T>::initSeeds()
+{
a_ = new P[this->H_];
for(T i=0; i < this->H_; i++) {
a_[i] = Utils::rand<P>();
@@ -196,14 +223,16 @@ void UnivHash_noPrimes<T>::initSeeds() {
}
}
template <typename T>
-T UnivHash_noPrimes<T>::hash(const P x, count_t h) {
+T UnivHash_noPrimes<T>::hash(const P x, count_t h)
+{
// h_a(x) = (ax mod 2^l) div 2^(l-k)
T value = ((a_[h] * x) % p_) >> d_;
return value % this->m_;
}
template <typename T>
-T UnivHash_noPrimes<T>::hash(const wordID_t* id, const int len,
- count_t h) {
+T UnivHash_noPrimes<T>::hash(const wordID_t* id, const int len,
+ count_t h)
+{
T value = 0;
int pos(0);
while(pos < len) {
@@ -213,39 +242,42 @@ T UnivHash_noPrimes<T>::hash(const wordID_t* id, const int len,
return value % this->m_;
}
template <typename T>
-T UnivHash_noPrimes<T>::hash(const char* s, count_t h) {
+T UnivHash_noPrimes<T>::hash(const char* s, count_t h)
+{
T value = 0;
int pos(0);
unsigned char c;
while((c = *s++) && (++pos < MAX_STR_LEN)) {
- value ^= hash((P)c, h);
+ value ^= hash((P)c, h);
}
return value % this->m_;
}
template <typename T>
-void UnivHash_noPrimes<T>::save(FileHandler* fout) {
+void UnivHash_noPrimes<T>::save(FileHandler* fout)
+{
HashBase<T>::save(fout);
fout->write((char*)&p_, sizeof(p_));
fout->write((char*)&d_, sizeof(d_));
- for(T i=0; i < this->H_; i++) {
+ for(T i=0; i < this->H_; i++) {
fout->write((char*)&a_[i], sizeof(a_[i]));
}
}
template <typename T>
-void UnivHash_noPrimes<T>::load(FileHandler* fin) {
+void UnivHash_noPrimes<T>::load(FileHandler* fin)
+{
a_ = new P[this->H_];
// HashBase<T>::load(fin) already done in constructor
fin->read((char*)&p_, sizeof(p_));
fin->read((char*)&d_, sizeof(d_));
- for(T i=0; i < this->H_; i++)
- {
+ for(T i=0; i < this->H_; i++) {
fin->read((char*)&a_[i], sizeof(a_[i]));
}
}
//UnivHash_linear
template <typename T>
-void UnivHash_linear<T>::initSeeds() {
+void UnivHash_linear<T>::initSeeds()
+{
a_ = new T*[this->H_];
b_ = new T*[this->H_];
for(count_t i=0; i < this->H_; i++) {
@@ -258,7 +290,8 @@ void UnivHash_linear<T>::initSeeds() {
}
}
template <typename T>
-void UnivHash_linear<T>::freeSeeds() {
+void UnivHash_linear<T>::freeSeeds()
+{
for(count_t i=0; i < this->H_; i++) {
delete[] a_[i];
delete[] b_[i];
@@ -268,8 +301,9 @@ void UnivHash_linear<T>::freeSeeds() {
a_ = b_ = NULL;
}
template <typename T>
-inline T UnivHash_linear<T>::hash(const wordID_t* id, const int len,
- count_t h) {
+inline T UnivHash_linear<T>::hash(const wordID_t* id, const int len,
+ count_t h)
+{
CHECK(h < this->H_);
T value = 0;
int pos(0);
@@ -281,19 +315,21 @@ inline T UnivHash_linear<T>::hash(const wordID_t* id, const int len,
}
template <typename T>
inline T UnivHash_linear<T>::hash(const wordID_t id, const count_t pos,
- const T prevValue, count_t h) {
+ const T prevValue, count_t h)
+{
CHECK(h < this->H_);
T value = prevValue + ((a_[h][pos] * id) + b_[h][pos]); // % pr_;
return value % this->m_;
}
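
A sketch of the position-wise linear n-gram hash these two overloads compute, h = sum over i of (a[h][i]*id_i + b[h][i]) mod m, built up one (id, pos) term at a time as the incremental overload above does (seeds and IDs are illustrative):

#include <cstdint>
#include <iostream>

int main() {
  const uint32_t m = 1000003;                       // output range m_
  const uint32_t a[3] = {2654435761u, 40503u, 2246822519u}; // per-position seeds
  const uint32_t b[3] = {97u, 1013u, 65537u};
  const uint32_t ids[3] = {42u, 7u, 99u};           // a 3-gram of word IDs
  uint32_t value = 0;
  for (int pos = 0; pos < 3; ++pos)
    value = value + (a[pos] * ids[pos] + b[pos]);   // the prevValue update above
  std::cout << (value % m) << std::endl;
  return 0;
}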
template <typename T>
-void UnivHash_linear<T>::save(FileHandler* fout) {
+void UnivHash_linear<T>::save(FileHandler* fout)
+{
// int bytes = sizeof(a_[0][0]);
HashBase<T>::save(fout);
fout->write((char*)&pr_, sizeof(pr_));
for(count_t i=0; i < this->H_; i++) {
for(count_t j=0; j < MAX_NGRAM_ORDER; j++) {
- fout->write((char*)&a_[i][j], sizeof(a_[i][j]));
+ fout->write((char*)&a_[i][j], sizeof(a_[i][j]));
fout->write((char*)&b_[i][j], sizeof(b_[i][j]));
//cout << "a[" << i << "][" << j << "]=" << a_[i][j] << endl;
//cout << "b[" << i << "][" << j << "]=" << b_[i][j] << endl;
@@ -301,7 +337,8 @@ void UnivHash_linear<T>::save(FileHandler* fout) {
}
}
template <typename T>
-void UnivHash_linear<T>::load(FileHandler* fin) {
+void UnivHash_linear<T>::load(FileHandler* fin)
+{
// HashBase<T>::load(fin) already done in constructor
fin->read((char*)&pr_, sizeof(pr_));
a_ = new T*[this->H_];
@@ -310,8 +347,8 @@ void UnivHash_linear<T>::load(FileHandler* fin) {
a_[i] = new T[MAX_NGRAM_ORDER];
b_[i] = new T[MAX_NGRAM_ORDER];
for(count_t j=0; j < MAX_NGRAM_ORDER; j++) {
- fin->read((char*)&a_[i][j], sizeof(a_[i][j]));
- fin->read((char*)&b_[i][j], sizeof(b_[i][j]));
+ fin->read((char*)&a_[i][j], sizeof(a_[i][j]));
+ fin->read((char*)&b_[i][j], sizeof(b_[i][j]));
//cout << "a[" << i << "][" << j << "]=" << a_[i][j] << endl;
//cout << "b[" << i << "][" << j << "]=" << b_[i][j] << endl;
}
diff --git a/moses/TranslationModel/DynSAInclude/onlineRLM.h b/moses/TranslationModel/DynSAInclude/onlineRLM.h
index b47cfdd0e..527f1e5d9 100644
--- a/moses/TranslationModel/DynSAInclude/onlineRLM.h
+++ b/moses/TranslationModel/DynSAInclude/onlineRLM.h
@@ -18,27 +18,28 @@ const bool strict_checks_ = false;
//! @todo ask abby2
template<typename T>
-class OnlineRLM: public PerfectHash<T> {
+class OnlineRLM: public PerfectHash<T>
+{
public:
- OnlineRLM(uint16_t MBs, int width, int bucketRange, count_t order,
- Moses::Vocab* v, float qBase = 8): PerfectHash<T>(MBs, width, bucketRange, qBase),
+ OnlineRLM(uint16_t MBs, int width, int bucketRange, count_t order,
+ Moses::Vocab* v, float qBase = 8): PerfectHash<T>(MBs, width, bucketRange, qBase),
vocab_(v), bAdapting_(false), order_(order), corpusSize_(0), alpha_(0) {
CHECK(vocab_ != 0);
//instantiate quantizer class here
cache_ = new Cache<float>(8888.8888, 9999.9999); // unknown_value, null_value
alpha_ = new float[order_ + 1];
- for(count_t i = 0; i <= order_; ++i)
+ for(count_t i = 0; i <= order_; ++i)
alpha_[i] = i * log10(0.4);
cerr << "Initialzing auxillary bit filters...\n";
bPrefix_ = new BitFilter(this->cells_);
bHit_ = new BitFilter(this->cells_);
}
- OnlineRLM(FileHandler* fin, count_t order):
+ OnlineRLM(FileHandler* fin, count_t order):
PerfectHash<T>(fin), bAdapting_(true), order_(order), corpusSize_(0) {
load(fin);
cache_ = new Cache<float>(8888.8888, 9999.9999); // unknown_value, null_value
alpha_ = new float[order_ + 1];
- for(count_t i = 0; i <= order_; ++i)
+ for(count_t i = 0; i <= order_; ++i)
alpha_[i] = i * log10(0.4);
}
~OnlineRLM() {
@@ -54,14 +55,18 @@ public:
bool insert(const std::vector<string>& ngram, const int value);
bool update(const std::vector<string>& ngram, const int value);
int query(const wordID_t* IDs, const int len);
- int sbsqQuery(const std::vector<string>& ngram, int* len,
- bool bStrict = false);
- int sbsqQuery(const wordID_t* IDs, const int len, int* codes,
- bool bStrict = false);
+ int sbsqQuery(const std::vector<string>& ngram, int* len,
+ bool bStrict = false);
+ int sbsqQuery(const wordID_t* IDs, const int len, int* codes,
+ bool bStrict = false);
void remove(const std::vector<string>& ngram);
count_t heurDelete(count_t num2del, count_t order = 5);
- uint64_t corpusSize() {return corpusSize_;}
- void corpusSize(uint64_t c) {corpusSize_ = c;}
+ uint64_t corpusSize() {
+ return corpusSize_;
+ }
+ void corpusSize(uint64_t c) {
+ corpusSize_ = c;
+ }
void clearCache() {
if(cache_) cache_->clear();
}
@@ -79,7 +84,7 @@ protected:
void markQueried(hpdEntry_t& value);
bool markPrefix(const wordID_t* IDs, const int len, bool bSet);
private:
- const void* getContext(const wordID_t* ngram, int len);
+ const void* getContext(const wordID_t* ngram, int len);
const bool bAdapting_; // used to signal adaptation of model
const count_t order_; // LM order
uint64_t corpusSize_; // total training corpus size
@@ -90,48 +95,50 @@ private:
};
template<typename T>
-bool OnlineRLM<T>::insert(const std::vector<string>& ngram, const int value) {
+bool OnlineRLM<T>::insert(const std::vector<string>& ngram, const int value)
+{
int len = ngram.size();
wordID_t wrdIDs[len];
uint64_t index(this->cells_ + 1);
- for(int i = 0; i < len; ++i)
+ for(int i = 0; i < len; ++i)
wrdIDs[i] = vocab_->GetWordID(ngram[i]);
index = PerfectHash<T>::insert(wrdIDs, len, value);
if(value > 1 && len < order_)
markPrefix(wrdIDs, ngram.size(), true); // mark context
// keep track of total items from training data minus "<s>"
- if(ngram.size() == 1 && (!bAdapting_)) // hack to not change corpusSize when adapting
+ if(ngram.size() == 1 && (!bAdapting_)) // hack to not change corpusSize when adapting
corpusSize_ += (wrdIDs[0] != vocab_->GetBOSWordID()) ? value : 0;
- if(bAdapting_ && (index < this->cells_)) // mark to keep while adapting
+ if(bAdapting_ && (index < this->cells_)) // mark to keep while adapting
markQueried(index);
return true;
}
template<typename T>
-bool OnlineRLM<T>::update(const std::vector<string>& ngram, const int value) {
+bool OnlineRLM<T>::update(const std::vector<string>& ngram, const int value)
+{
int len = ngram.size();
std::vector<wordID_t> wrdIDs(len);
uint64_t index(this->cells_ + 1);
hpdEntry_t hpdItr;
vocab_->MakeOpen();
- for(int i = 0; i < len; ++i)
+ for(int i = 0; i < len; ++i)
wrdIDs[i] = vocab_->GetWordID(ngram[i]);
- // if updating, minimize false positives by pre-checking if context already in model
- bool bIncluded(true);
+ // if updating, minimize false positives by pre-checking if context already in model
+ bool bIncluded(true);
if(value > 1 && len < (int)order_)
bIncluded = markPrefix(&wrdIDs[0], ngram.size(), true); // mark context
- if(bIncluded) { // if context found
+ if(bIncluded) { // if context found
bIncluded = PerfectHash<T>::update2(&wrdIDs[0], len, value, hpdItr, index);
if(index < this->cells_) {
markQueried(index);
- }
- else if(hpdItr != this->dict_.end()) markQueried(hpdItr);
+ } else if(hpdItr != this->dict_.end()) markQueried(hpdItr);
}
return bIncluded;
}
template<typename T>
-int OnlineRLM<T>::query(const wordID_t* IDs, int len) {
+int OnlineRLM<T>::query(const wordID_t* IDs, int len)
+{
uint64_t filterIdx = 0;
hpdEntry_t hpdItr;
int value(0);
@@ -140,8 +147,7 @@ int OnlineRLM<T>::query(const wordID_t* IDs, int len) {
if(hpdItr != this->dict_.end()) {
//markQueried(hpdItr); // mark this event as "hit"
value -= ((value & this->hitMask_) != 0) ? this->hitMask_ : 0; // check for previous hit marks
- }
- else {
+ } else {
CHECK(filterIdx < this->cells_);
//markQueried(filterIdx);
}
@@ -150,15 +156,16 @@ int OnlineRLM<T>::query(const wordID_t* IDs, int len) {
}
template<typename T>
-bool OnlineRLM<T>::markPrefix(const wordID_t* IDs, const int len, bool bSet) {
- if(len <= 1) return true; // only do this for for ngrams with context
- static Cache<int> pfCache(-1, -1); // local prefix cache
+bool OnlineRLM<T>::markPrefix(const wordID_t* IDs, const int len, bool bSet)
+{
+ if(len <= 1) return true; // only do this for ngrams with context
+ static Cache<int> pfCache(-1, -1); // local prefix cache
int code(0);
- if(!pfCache.checkCacheNgram(IDs, len - 1, &code, NULL)) {
- hpdEntry_t hpdItr;
+ if(!pfCache.checkCacheNgram(IDs, len - 1, &code, NULL)) {
+ hpdEntry_t hpdItr;
uint64_t filterIndex(0);
code = PerfectHash<T>::query(IDs, len - 1, hpdItr, filterIndex); // hash IDs[0..len-1]
- if(code == -1) { // encountered false positive in pipeline
+ if(code == -1) { // encountered false positive in pipeline
cerr << "WARNING: markPrefix(). The O-RLM is *not* well-formed.\n";
// add all prefixes or return false;
return false;
@@ -167,10 +174,9 @@ bool OnlineRLM<T>::markPrefix(const wordID_t* IDs, const int len, bool bSet) {
CHECK(hpdItr == this->dict_.end());
if(bSet) bPrefix_->setBit(filterIndex); // mark index
else bPrefix_->clearBit(filterIndex); // unset index
- }
- else {
+ } else {
CHECK(filterIndex == this->cells_ + 1);
- //how to handle hpd prefixes?
+ //how to handle hpd prefixes?
}
if(pfCache.nodes() > 10000) pfCache.clear();
pfCache.setCacheNgram(IDs, len - 1, code, NULL);
@@ -179,39 +185,43 @@ bool OnlineRLM<T>::markPrefix(const wordID_t* IDs, const int len, bool bSet) {
}
template<typename T>
-void OnlineRLM<T>::markQueried(const uint64_t& index) {
+void OnlineRLM<T>::markQueried(const uint64_t& index)
+{
bHit_->setBit(index);
//cerr << "filter[" << index << "] = " << this->filter_->read(index) << endl;
}
template<typename T>
-void OnlineRLM<T>::markQueried(hpdEntry_t& value) {
- // set high bit of counter to indicate "hit" status
+void OnlineRLM<T>::markQueried(hpdEntry_t& value)
+{
+ // set high bit of counter to indicate "hit" status
value->second |= this->hitMask_;
}
template<typename T>
-void OnlineRLM<T>::remove(const std::vector<string>& ngram) {
+void OnlineRLM<T>::remove(const std::vector<string>& ngram)
+{
wordID_t IDs[ngram.size()];
- for(count_t i = 0; i < ngram.size(); ++i)
+ for(count_t i = 0; i < ngram.size(); ++i)
IDs[i] = vocab_->GetWordID(ngram[i]);
PerfectHash<T>::remove(IDs, ngram.size());
}
template<typename T>
-count_t OnlineRLM<T>::heurDelete(count_t num2del, count_t order) {
+count_t OnlineRLM<T>::heurDelete(count_t num2del, count_t order)
+{
count_t deleted = 0;
cout << "Deleting " << num2del << " of order "<< order << endl;
// delete from filter first
- int full = *std::max_element(this->idxTracker_, this->idxTracker_
- + this->totBuckets_);
+ int full = *std::max_element(this->idxTracker_, this->idxTracker_
+ + this->totBuckets_);
for(; full > 0; --full) // delete from fullest buckets first
- for(int bk = 0; bk < this->totBuckets_; ++bk) {
+ for(int bk = 0; bk < this->totBuckets_; ++bk) {
if(deleted >= num2del) break;
if(this->idxTracker_[bk] == full) { // if full
uint64_t first = bk * this->bucketRange_,
- last = first + this->bucketRange_;
- for(uint64_t row = first; row < last; ++row) { // check each row
+ last = first + this->bucketRange_;
+ for(uint64_t row = first; row < last; ++row) { // check each row
if(!(bHit_->testBit(row) || bPrefix_->testBit(row) )) {
if(this->filter_->read(row) != 0) {
PerfectHash<T>::remove(row); // remove from filter
@@ -231,16 +241,18 @@ count_t OnlineRLM<T>::heurDelete(count_t num2del, count_t order) {
template<typename T>
int OnlineRLM<T>::sbsqQuery(const std::vector<string>& ngram, int* codes,
- bool bStrict) {
+ bool bStrict)
+{
wordID_t IDs[ngram.size()];
- for(count_t i = 0; i < ngram.size(); ++i)
+ for(count_t i = 0; i < ngram.size(); ++i)
IDs[i] = vocab_->GetWordID(ngram[i]);
return sbsqQuery(IDs, ngram.size(), codes, bStrict);
}
template<typename T>
-int OnlineRLM<T>::sbsqQuery(const wordID_t* IDs, const int len, int* codes,
- bool bStrict) {
+int OnlineRLM<T>::sbsqQuery(const wordID_t* IDs, const int len, int* codes,
+ bool bStrict)
+{
uint64_t filterIdx = 0;
int val(0), fnd(0);
hpdEntry_t hpdItr;
@@ -252,14 +264,13 @@ int OnlineRLM<T>::sbsqQuery(const wordID_t* IDs, const int len, int* codes,
if(hpdItr != this->dict_.end()) {
val -= ((val & this->hitMask_) != 0) ? this->hitMask_ : 0; // account for previous hit marks
}
- }
- else if(bStrict) {
- break;
+ } else if(bStrict) {
+ break;
}
// add to value array
codes[i] = val > 0 ? val : 0;
}
- while(bStrict && (fnd > 1)) { // do checks the other way
+ while(bStrict && (fnd > 1)) { // do checks the other way
val = PerfectHash<T>::query(&IDs[len - fnd], fnd - 1, hpdItr, filterIdx);
if(val != -1) break; // if anything found
else --fnd; // else decrement found
@@ -269,8 +280,9 @@ int OnlineRLM<T>::sbsqQuery(const wordID_t* IDs, const int len, int* codes,
}
template<typename T>
-float OnlineRLM<T>::getProb(const wordID_t* ngram, int len,
- const void** state) {
+float OnlineRLM<T>::getProb(const wordID_t* ngram, int len,
+ const void** state)
+{
static const float oovprob = log10(1.0 / (static_cast<float>(vocab_->Size()) - 1));
float logprob(0);
const void* context = (state) ? *state : 0;
@@ -278,66 +290,66 @@ float OnlineRLM<T>::getProb(const wordID_t* ngram, int len,
if(!cache_->checkCacheNgram(ngram, len, &logprob, &context)) {
// get full prob and put in cache
int num_fnd(0), den_val(0);
- int *in = new int[len]; // in[] keeps counts of increasing order numerator
+ int *in = new int[len]; // in[] keeps counts of increasing order numerator
for(int i = 0; i < len; ++i) in[i] = 0;
for(int i = len - 1; i >= 0; --i) {
if(ngram[i] == vocab_->GetkOOVWordID()) break; // no need to query if OOV
in[i] = query(&ngram[i], len - i);
if(in[i] > 0) {
num_fnd = len - i;
- }
- else if(strict_checks_) break;
+ } else if(strict_checks_) break;
}
while(num_fnd > 1) { // get lower order count
- //get sub-context of size one less than length found (exluding target)
+ //get sub-context of size one less than length found (excluding target)
if(((den_val = query(&ngram[len - num_fnd], num_fnd - 1)) > 0) &&
(den_val >= in[len - num_fnd]) && (in[len - num_fnd] > 0)) {
break;
- }
- else --num_fnd; // else backoff to lower ngram order
+ } else --num_fnd; // else backoff to lower ngram order
}
- if(num_fnd == 1 && (in[len - 1] < 1)) // sanity check for unigrams
+ if(num_fnd == 1 && (in[len - 1] < 1)) // sanity check for unigrams
num_fnd = 0;
switch(num_fnd) { // find prob (need to refactor into precomputation)
- case 0: // OOV
- logprob = alpha_[len] + oovprob;
- break;
- case 1: // unigram found only
- CHECK(in[len - 1] > 0);
- logprob = alpha_[len - 1] + (corpusSize_ > 0 ?
- log10(static_cast<float>(in[len - 1]) / static_cast<float>(corpusSize_)) : 0);
- //logprob = alpha_[len - 1] +
- //log10(static_cast<float>(in[len - 1]) / static_cast<float>(corpusSize_));
- break;
- default:
- CHECK(den_val > 0);
- //if(subgram == in[len - found]) ++subgram; // avoid returning zero probs????
- logprob = alpha_[len - num_fnd] +
- log10(static_cast<float>(in[len - num_fnd]) / static_cast<float>(den_val));
- break;
+ case 0: // OOV
+ logprob = alpha_[len] + oovprob;
+ break;
+ case 1: // unigram found only
+ CHECK(in[len - 1] > 0);
+ logprob = alpha_[len - 1] + (corpusSize_ > 0 ?
+ log10(static_cast<float>(in[len - 1]) / static_cast<float>(corpusSize_)) : 0);
+ //logprob = alpha_[len - 1] +
+ //log10(static_cast<float>(in[len - 1]) / static_cast<float>(corpusSize_));
+ break;
+ default:
+ CHECK(den_val > 0);
+ //if(subgram == in[len - found]) ++subgram; // avoid returning zero probs????
+ logprob = alpha_[len - num_fnd] +
+ log10(static_cast<float>(in[len - num_fnd]) / static_cast<float>(den_val));
+ break;
}
// need unique context
context = getContext(&ngram[len - num_fnd], num_fnd);
// put whatever was found in cache
cache_->setCacheNgram(ngram, len, logprob, context);
} // end checkCache
- return logprob;
+ return logprob;
}
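
getProb() above is stupid backoff: each backoff step multiplies the estimate by 0.4 (alpha_[i] = i * log10(0.4)), and the highest-order n-gram found contributes log10(count / context count). A worked instance with illustrative counts:

#include <cmath>
#include <cstdio>

int main() {
  int len = 3, num_fnd = 2;     // queried a trigram, only the bigram was found
  double in = 12.0;             // bigram count, in[len - num_fnd]
  double den = 30.0;            // its context (unigram) count, den_val
  double alpha = (len - num_fnd) * log10(0.4);   // one backoff step
  double logprob = alpha + log10(in / den);
  printf("%.6f\n", logprob);    // -0.795880: 0.4 * (12/30) as a log10 probability
  return 0;
}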
template<typename T>
-const void* OnlineRLM<T>::getContext(const wordID_t* ngram, int len) {
+const void* OnlineRLM<T>::getContext(const wordID_t* ngram, int len)
+{
int dummy(0);
float* *addresses = new float*[len]; // only interested in addresses of cache
CHECK(cache_->getCache2(ngram, len, &addresses[0], &dummy) == len);
// return address of cache node
-
+
float *addr0 = addresses[0];
free( addresses );
return (const void*)addr0;
}
template<typename T>
-void OnlineRLM<T>::randDelete(int num2del) {
+void OnlineRLM<T>::randDelete(int num2del)
+{
int deleted = 0;
for(uint64_t i = 0; i < this->cells_; i++) {
if(this->filter_->read(i) != 0) {
@@ -349,19 +361,21 @@ void OnlineRLM<T>::randDelete(int num2del) {
}
template<typename T>
-int OnlineRLM<T>::countHits() {
+int OnlineRLM<T>::countHits()
+{
int hit(0);
for(uint64_t i = 0; i < this->cells_; ++i)
if(bHit_->testBit(i)) ++hit;
iterate(this->dict_, itr)
- if((itr->second & this->hitMask_) != 0)
- ++hit;
+ if((itr->second & this->hitMask_) != 0)
+ ++hit;
cerr << "Hit count = " << hit << endl;
return hit;
}
template<typename T>
-int OnlineRLM<T>::countPrefixes() {
+int OnlineRLM<T>::countPrefixes()
+{
int pfx(0);
for(uint64_t i = 0; i < this->cells_; ++i)
if(bPrefix_->testBit(i)) ++pfx;
@@ -371,23 +385,25 @@ int OnlineRLM<T>::countPrefixes() {
}
template<typename T>
-int OnlineRLM<T>::cleanUpHPD() {
+int OnlineRLM<T>::cleanUpHPD()
+{
cerr << "HPD size before = " << this->dict_.size() << endl;
std::vector<string> vDel, vtmp;
iterate(this->dict_, itr) {
if(((itr->second & this->hitMask_) == 0) && // if not hit during testing
- (Utils::splitToStr(itr->first, vtmp, "¬") >= 3)) { // and higher order ngram
+ (Utils::splitToStr(itr->first, vtmp, "¬") >= 3)) { // and higher order ngram
vDel.push_back(itr->first);
}
}
- iterate(vDel, vitr)
- this->dict_.erase(*vitr);
+ iterate(vDel, vitr)
+ this->dict_.erase(*vitr);
cerr << "HPD size after = " << this->dict_.size() << endl;
return vDel.size();
}
template<typename T>
-void OnlineRLM<T>::clearMarkings() {
+void OnlineRLM<T>::clearMarkings()
+{
cerr << "clearing all event hits\n";
bHit_->reset();
count_t* value(0);
@@ -398,7 +414,8 @@ void OnlineRLM<T>::clearMarkings() {
}
template<typename T>
-void OnlineRLM<T>::save(FileHandler* fout) {
+void OnlineRLM<T>::save(FileHandler* fout)
+{
cerr << "Saving ORLM...\n";
// save vocab
vocab_->Save(fout);
@@ -412,7 +429,8 @@ void OnlineRLM<T>::save(FileHandler* fout) {
}
template<typename T>
-void OnlineRLM<T>::load(FileHandler* fin) {
+void OnlineRLM<T>::load(FileHandler* fin)
+{
cerr << "Loading ORLM...\n";
// load vocab first
vocab_ = new Moses::Vocab(fin);
@@ -428,12 +446,13 @@ void OnlineRLM<T>::load(FileHandler* fin) {
}
template<typename T>
-void OnlineRLM<T>::removeNonMarked() {
+void OnlineRLM<T>::removeNonMarked()
+{
cerr << "deleting all unused events\n";
int deleted(0);
for(uint64_t i = 0; i < this->cells_; ++i) {
- if(!(bHit_->testBit(i) || bPrefix_->testBit(i))
- && (this->filter_->read(i) != 0)) {
+ if(!(bHit_->testBit(i) || bPrefix_->testBit(i))
+ && (this->filter_->read(i) != 0)) {
PerfectHash<T>::remove(i);
++deleted;
}
@@ -456,36 +475,36 @@ float OnlineRLM<T>::getProb2(const wordID_t* ngram, int len, const void** state)
// constrain cache queries using model assumptions
int denom_len = cache_->getCache(ngram, len - 1, &denom_codes[0], &denom_found);
cerr << "denom_len = " << denom_len << endl;
- int num_len = cache_->getCache(&ngram[len - denom_len - 1], denom_len + 1,
+ int num_len = cache_->getCache(&ngram[len - denom_len - 1], denom_len + 1,
&num_codes[0], &found);
cerr << "num_len= " << num_len << endl;
// keep reducing ngram size until both denominator and numerator are found
// allowed to leave kUnknownCode in cache because we check for this.
found = num_len; // guaranteed to be <= denom_len + 1
// still check for OOV
- for (int i = len - found; i < len; ++i)
- if (ngram[i] == Vocab::kOOVWordID) {
+ for (int i = len - found; i < len; ++i)
+ if (ngram[i] == Vocab::kOOVWordID) {
found = len - i - 1;
}
// check for relative estimator
while(found > 1) {
- if(*denom_codes[found-1] == cache_unk_ &&
- ((*denom_codes[found-1] = query(&ngram[len-found], found-1)) == 0)) {
+ if(*denom_codes[found-1] == cache_unk_ &&
+ ((*denom_codes[found-1] = query(&ngram[len-found], found-1)) == 0)) {
//!struct_->query(&ngram[len-*found], *found-1, kMainEventIdx, denom_codes[*found-1])) {
*num_codes[found] = cache_unk_;
} else {
if(*num_codes[found] != cache_unk_ ||
((*num_codes[found] = query(&ngram[len-found], found)) <= *denom_codes[found-1]))
- // struct_->query(&ngram[len-*found], *found, kMainEventIdx,
+ // struct_->query(&ngram[len-*found], *found, kMainEventIdx,
// num_codes[*found], *denom_codes[*found-1]))
break;
- }
+ }
--found;
}
- // didn't find bigram numerator or unigram denominator
+ // didn't find bigram numerator or unigram denominator
if (found == 1)
- found = *num_codes[1] != cache_unk_
- || ((*num_codes[1] = query(&ngram[len - 1], 1)) != 0);
+ found = *num_codes[1] != cache_unk_
+ || ((*num_codes[1] = query(&ngram[len - 1], 1)) != 0);
//struct_->query(&ngram[len - 1], 1, kMainEventIdx, num_codes[1]);
// ....
// return estimate applying correct backoff score (precomputed)
@@ -496,20 +515,20 @@ float OnlineRLM<T>::getProb2(const wordID_t* ngram, int len, const void** state)
//log_prob = stupid_backoff_log10_[len] + uniform_log10prob_;
break;
case 1: // unigram over whole corpus
- log_prob = alpha_[len - 1] +
+ log_prob = alpha_[len - 1] +
log10(static_cast<float>(*num_codes[1]) / static_cast<float>(corpusSize_));
- //log_prob = log_quantiser_->getLog10Value(*num_codes[1]) - corpus_size_log10_
+ //log_prob = log_quantiser_->getLog10Value(*num_codes[1]) - corpus_size_log10_
// + stupid_backoff_log10_[len - 1]; // precomputed
break;
default: // otherwise use both statistics and (possibly zero) backoff weight
- log_prob = alpha_[len - found] +
+ log_prob = alpha_[len - found] +
log10(static_cast<float>(*num_codes[found]) / static_cast<float>(*denom_codes[found-1]));
- //log_prob = log_quantiser_->getLog10Value(*num_codes[*found ])
- // - log_quantiser_->getLog10Value(*denom_codes[*found - 1])
+ //log_prob = log_quantiser_->getLog10Value(*num_codes[*found ])
+ // - log_quantiser_->getLog10Value(*denom_codes[*found - 1])
// + stupid_backoff_log10_[len - *found];
}
context_state = (const void*)num_codes[found == len ? found - 1 : found];
- //probCache_->store(len, log_prob, context_state);
+ //probCache_->store(len, log_prob, context_state);
if (state)
*state = context_state;
return log_prob;
diff --git a/moses/TranslationModel/DynSAInclude/params.cpp b/moses/TranslationModel/DynSAInclude/params.cpp
index 4be3a1676..a4d51d5b2 100644
--- a/moses/TranslationModel/DynSAInclude/params.cpp
+++ b/moses/TranslationModel/DynSAInclude/params.cpp
@@ -1,10 +1,11 @@
#include "params.h"
-namespace Moses {
+namespace Moses
+{
// parameter constants
const std::string Parameters::kNotSetValue = "__NOT_SET__";
-const int Parameters::kBoolValue = 0;
+const int Parameters::kBoolValue = 0;
const int Parameters::kIntValue = 1;
const int Parameters::kFloatValue = 2;
const int Parameters::kStringValue = 3;
@@ -13,26 +14,30 @@ const int Parameters::kUndefinedValue = -1;
const std::string Parameters::kTrueValue = "1";
const std::string Parameters::kFalseValue = "0";
-Parameters::Parameters(const ParamDefs * paramdefs, const count_t paramNum) {
+Parameters::Parameters(const ParamDefs * paramdefs, const count_t paramNum)
+{
initialize(paramdefs, paramNum);
}
-Parameters::Parameters(int argc, char ** argv, const ParamDefs * paramdefs,
- const count_t paramNum) {
+Parameters::Parameters(int argc, char ** argv, const ParamDefs * paramdefs,
+ const count_t paramNum)
+{
initialize(paramdefs, paramNum);
loadParams(argc, argv);
}
-void Parameters::initialize(const ParamDefs * paramdefs, const count_t paramNum) {
+void Parameters::initialize(const ParamDefs * paramdefs, const count_t paramNum)
+{
for( count_t i = 0; i < paramNum; i++ ) {
params_[paramdefs[i].name] = paramdefs[i]; // assign name
}
cerr << "Default parameter values:\n";
- iterate(params_, itr)
- cerr << "\t" << itr->first << " --> " << itr->second.value << endl;
+ iterate(params_, itr)
+ cerr << "\t" << itr->first << " --> " << itr->second.value << endl;
}
-bool Parameters::loadParams(int argc, char ** argv) {
+bool Parameters::loadParams(int argc, char ** argv)
+{
// load params from commandline args
//if( argc < 3 ) {
// fprintf(stderr, "ERROR: No parameters. Use \"-config\" or \"-f\" to specify configuration file.\n");
@@ -66,7 +71,7 @@ bool Parameters::loadParams(int argc, char ** argv) {
std::string val = argv[i+1];
Utils::trim(val);
if( param == "config" )
- load_from_file = true;
+ load_from_file = true;
if(!setParamValue(param, val)) {
std::cerr << "Invalid Param name->value " << param << "->" << val << std::endl;
return false;
@@ -80,35 +85,40 @@ bool Parameters::loadParams(int argc, char ** argv) {
return success;
}
-std::string Parameters::normaliseParamName(const std::string & name) {
+std::string Parameters::normaliseParamName(const std::string & name)
+{
// Map valid abbreviations to long names. Retain other names.
if( params_.find(name) == params_.end() )
- iterate(params_, i)
- if( i->second.abbrev == name )
- return i->first;
+ iterate(params_, i)
+ if( i->second.abbrev == name )
+ return i->first;
return name;
}
-int Parameters::getValueType(const std::string& name) {
+int Parameters::getValueType(const std::string& name)
+{
if(params_.find(name) != params_.end())
return params_[name].type;
return Parameters::kUndefinedValue;
}
-bool Parameters::isValidParamName(const std::string & name) {
- return params_.find(name) != params_.end();
+bool Parameters::isValidParamName(const std::string & name)
+{
+ return params_.find(name) != params_.end();
}
-bool Parameters::setParamValue(const std::string& name, const std::string& val) {
- // TODO: Add basic type checking w verifyValueType()
- bool set = isValidParamName(name);
- if(set) {
- params_[name].value = val;
+bool Parameters::setParamValue(const std::string& name, const std::string& val)
+{
+ // TODO: Add basic type checking w verifyValueType()
+ bool set = isValidParamName(name);
+ if(set) {
+ params_[name].value = val;
std::cerr << "PARAM SET: "<< name << "=" << val << std::endl;
}
return( set );
}
-std::string Parameters::getParamValue(const std::string& name) {
+std::string Parameters::getParamValue(const std::string& name)
+{
std::string value = Parameters::kNotSetValue;
if(isValidParamName(name))
if(params_.find(name) != params_.end())
@@ -117,43 +127,46 @@ std::string Parameters::getParamValue(const std::string& name) {
value = kFalseValue;
return value;
}
-std::string Parameters::getParam(const std::string& name) {
+std::string Parameters::getParam(const std::string& name)
+{
return getParamValue(name);
-/*void* Parameters::getParam(const std::string& name) {
- void* paramVal = 0;
- int type = getValueType(name);
- const char* sval = getParamValue(name).c_str();
- switch(type) {
- case kIntValue: {
- int ival = atoi(sval);
- paramVal = (void*)&ival;
- break;
- }
- case kFloatValue: {
- float fval = atof(sval);
- paramVal = (void*)&fval;
- break;
- }
- case kStringValue: {
- paramVal = (void*)sval;
- break;
- }
- case kBoolValue: {
- bool bval = sval == Parameters::kTrueValue ? true : false;
- paramVal = (void*)&bval;
- break;
+ /*void* Parameters::getParam(const std::string& name) {
+ void* paramVal = 0;
+ int type = getValueType(name);
+ const char* sval = getParamValue(name).c_str();
+ switch(type) {
+ case kIntValue: {
+ int ival = atoi(sval);
+ paramVal = (void*)&ival;
+ break;
+ }
+ case kFloatValue: {
+ float fval = atof(sval);
+ paramVal = (void*)&fval;
+ break;
+ }
+ case kStringValue: {
+ paramVal = (void*)sval;
+ break;
+ }
+ case kBoolValue: {
+ bool bval = sval == Parameters::kTrueValue ? true : false;
+ paramVal = (void*)&bval;
+ break;
+ }
+ default: // --> Parameters::kUndefinedValue
+ paramVal = (void*)sval; // will set to Parameters::kNotSetValue
}
- default: // --> Parameters::kUndefinedValue
- paramVal = (void*)sval; // will set to Parameters::kNotSetValue
- }
- return paramVal;*/
+ return paramVal;*/
}
-bool Parameters::verifyValueType(const std::string& name, const std::string& val) {
+bool Parameters::verifyValueType(const std::string& name, const std::string& val)
+{
// Implement basic type checking
return true;
}
-int Parameters::getParamCount() const {
+int Parameters::getParamCount() const
+{
return params_.size();
}
@@ -161,7 +174,8 @@ int Parameters::getParamCount() const {
* HAVE TO CHANGE loadParams() from file to not overwrite command lines but
* override default if different*/
bool Parameters::loadParams(const std::string & file_path,
- std::set<std::string>& setParams) {
+ std::set<std::string>& setParams)
+{
// parameters loaded from file don't override cmd line parameters
/*std::set<std::string>::iterator end = setParams.end();
FileHandler file(file_path.c_str(), std::ios::in);
diff --git a/moses/TranslationModel/DynSAInclude/params.h b/moses/TranslationModel/DynSAInclude/params.h
index d5af6331d..efc0a6ba3 100644
--- a/moses/TranslationModel/DynSAInclude/params.h
+++ b/moses/TranslationModel/DynSAInclude/params.h
@@ -10,21 +10,23 @@
#include "utils.h"
#include "types.h"
-#define NumOfParams(paramArray) (sizeof(paramArray)/sizeof((paramArray)[0]))
+#define NumOfParams(paramArray) (sizeof(paramArray)/sizeof((paramArray)[0]))
-namespace Moses {
+namespace Moses
+{
typedef struct ParamDefs {
std::string name;
- std::string value;
+ std::string value;
std::string abbrev;
int type;
std::string description;
} ParamDefs;
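
A sketch of how the Parameters class uses this table: definitions keyed by long name, with abbreviations resolved by a scan, as normaliseParamName() in params.cpp above does (the entries here are made up, not real Moses options):

#include <iostream>
#include <map>
#include <string>

struct ParamDef { std::string name, value, abbrev; int type; };

int main() {
  const ParamDef defs[] = {{"order", "5", "o", 1}, {"memory", "64", "m", 1}};
  std::map<std::string, ParamDef> params;
  for (const ParamDef& d : defs) params[d.name] = d;  // what initialize() builds
  std::string name = "o";                             // user passed an abbreviation
  if (params.find(name) == params.end())              // normaliseParamName()
    for (const auto& kv : params)
      if (kv.second.abbrev == name) { name = kv.first; break; }
  std::cout << name << "=" << params[name].value << std::endl; // order=5
  return 0;
}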
- //! @todo ask abby2
-class Parameters {
+//! @todo ask abby2
+class Parameters
+{
public:
- static const std::string kNotSetValue;
+ static const std::string kNotSetValue;
static const int kBoolValue;
static const int kIntValue;
static const int kFloatValue;
@@ -32,15 +34,15 @@ public:
static const int kUndefinedValue;
static const std::string kFalseValue;
static const std::string kTrueValue;
-
+
Parameters(const ParamDefs * paramdefs, const count_t paramNum);
Parameters(int argc, char** argv, const ParamDefs * paramdefs, const count_t paramNum);
~Parameters() {}
bool loadParams(int argc, char ** argv);
bool loadParams(const std::string& param_file, std::set<std::string>&);
int getValueType(const std::string & name);
- bool setParamValue(const std::string& name, const std::string& value);
- bool verifyValueType(const std::string& name, const std::string& value);
+ bool setParamValue(const std::string& name, const std::string& value);
+ bool verifyValueType(const std::string& name, const std::string& value);
bool isValidParamName(const std::string & name);
std::string getParamValue(const std::string& name);
//void* getParam(const std::string& name);
diff --git a/moses/TranslationModel/DynSAInclude/perfectHash.h b/moses/TranslationModel/DynSAInclude/perfectHash.h
index f445e063a..8ea20fa06 100644
--- a/moses/TranslationModel/DynSAInclude/perfectHash.h
+++ b/moses/TranslationModel/DynSAInclude/perfectHash.h
@@ -9,18 +9,19 @@
#include "quantizer.h"
/**
- * PerfectHash handles setting up hash functions and storage
- * for LM data.
- */
+ * PerfectHash handles setting up hash functions and storage
+ * for LM data.
+ */
using randlm::Filter;
using randlm::BitFilter;
typedef std::map<string, count_t> hpDict_t;
typedef hpDict_t::iterator hpdEntry_t;
static count_t collisions_ = 0;
-/* Based on Mortenson et. al. 2006 */
+/* Based on Mortenson et al. 2006 */
template<typename T>
-class PerfectHash {
+class PerfectHash
+{
public:
PerfectHash(uint16_t MBs, int width, int bucketRange, float qBase);
PerfectHash(FileHandler* fin) {
@@ -41,11 +42,11 @@ protected:
uint8_t* idxTracker_;
uint64_t insert(const wordID_t* IDs, const int len, const count_t value);
bool update(const wordID_t* IDs, const int len, const count_t value,
- hpdEntry_t& hpdAddr, uint64_t& filterIdx);
+ hpdEntry_t& hpdAddr, uint64_t& filterIdx);
bool update2(const wordID_t* IDs, const int len, const count_t value,
- hpdEntry_t& hpdAddr, uint64_t& filterIdx);
- int query(const wordID_t* IDs, const int len,
- hpdEntry_t& hpdAddr, uint64_t& filterIdx);
+ hpdEntry_t& hpdAddr, uint64_t& filterIdx);
+ int query(const wordID_t* IDs, const int len,
+ hpdEntry_t& hpdAddr, uint64_t& filterIdx);
virtual void remove(const wordID_t* IDs, const int len);
void remove(uint64_t index);
void save(FileHandler* fout);
@@ -54,33 +55,34 @@ protected:
//pointer to a specific entry in a hpDict_t
virtual void markQueried(hpdEntry_t&)=0;
private:
- T nonZeroSignature(const wordID_t* IDs, const int len, count_t bucket);
+ T nonZeroSignature(const wordID_t* IDs, const int len, count_t bucket);
string hpDictKeyValue(const wordID_t* IDs, const int len);
uint64_t memBound_; // total memory bound in bytes
uint16_t cellWidth_; // in bits
- UnivHash_linear<count_t>* bucketHash_;
+ UnivHash_linear<count_t>* bucketHash_;
UnivHash_linear<T>* fingerHash_;
LogQtizer* qtizer_;
};
template<typename T>
-PerfectHash<T>::PerfectHash(uint16_t MBs, int width, int bucketRange,
- float qBase): hitMask_(1 << 31), memBound_(MBs * (1ULL << 20)),
- cellWidth_(width) {
+PerfectHash<T>::PerfectHash(uint16_t MBs, int width, int bucketRange,
+    float qBase): hitMask_(1U << 31), memBound_(MBs * (1ULL << 20)),
+ cellWidth_(width)
+{
bucketRange_ = static_cast<uint8_t>(bucketRange);
if(bucketRange > 255) {
- cerr << "ERROR: Max bucket range is > 2^8\n";
+    cerr << "ERROR: bucket range must be at most 2^8 - 1\n";
exit(1);
}
qtizer_ = new LogQtizer(qBase);
int valBits = (int)ceil(log2((float)qtizer_->maxcode()));
cerr << "BITS FOR VALUES ARRAY = " << valBits << endl;
uint64_t totalBits = memBound_ << 3;
- cells_ = (uint64_t) ceil((float)totalBits / (float)(cellWidth_ + valBits)); // upper bound on cells
+ cells_ = (uint64_t) ceil((float)totalBits / (float)(cellWidth_ + valBits)); // upper bound on cells
  cells_ += (bucketRange_ - cells_ % bucketRange_) % bucketRange_; // round cells up to a multiple of bucket range
totBuckets_ = (cells_ / bucketRange_) - 1; // minus 1 so totBuckets * bucksize + bucksize = cells
filter_ = new Filter<T>(cells_, cellWidth_);
- values_ = new Filter<T>(cells_, valBits);
+ values_ = new Filter<T>(cells_, valBits);
idxTracker_ = new uint8_t[totBuckets_];
for(int i=0; i < totBuckets_; ++i) idxTracker_[i] = 0;
// initialize ranges for each hash function
@@ -89,7 +91,8 @@ PerfectHash<T>::PerfectHash(uint16_t MBs, int width, int bucketRange,
}
template<typename T>
-PerfectHash<T>::~PerfectHash() {
+PerfectHash<T>::~PerfectHash()
+{
delete[] idxTracker_;
delete filter_;
filter_ = NULL;
@@ -99,22 +102,22 @@ PerfectHash<T>::~PerfectHash() {
delete values_;
}
-template<typename T>
-uint64_t PerfectHash<T>::insert(const wordID_t* IDs, const int len,
- const count_t value) {
+template<typename T>
+uint64_t PerfectHash<T>::insert(const wordID_t* IDs, const int len,
+ const count_t value)
+{
count_t bucket = (bucketHash_->size() > 1 ? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len, 0));
- if(idxTracker_[bucket] < (int)bucketRange_) { // if empty rows
+ if(idxTracker_[bucket] < (int)bucketRange_) { // if empty rows
// restriction on fprint value is non-zero
T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
uint64_t emptyidx = cells_ + 1;
uint64_t index = bucket * bucketRange_, // starting bucket row
- lastrow = index + bucketRange_; // ending row
- while(index < lastrow) { // unique so check each row for "matching" signature
+ lastrow = index + bucketRange_; // ending row
+ while(index < lastrow) { // unique so check each row for "matching" signature
T filterVal = filter_->read(index);
- if((filterVal == 0) && (emptyidx == cells_ + 1)) { // record first empty row
+ if((filterVal == 0) && (emptyidx == cells_ + 1)) { // record first empty row
emptyidx = index;
- }
- else if(filterVal == fp) {
+ } else if(filterVal == fp) {
++collisions_;
dict_[hpDictKeyValue(IDs, len)] = value; // store exact in hpd
return cells_ + 1; // finished
@@ -127,21 +130,21 @@ uint64_t PerfectHash<T>::insert(const wordID_t* IDs, const int len,
values_->write(emptyidx, code);
++idxTracker_[bucket]; // keep track of bucket size
return emptyidx;
- }
- else { // bucket is full
+ } else { // bucket is full
dict_[hpDictKeyValue(IDs, len)] = value; // add to hpd
return cells_ + 1;
}
}
-template<typename T>
-bool PerfectHash<T>::update(const wordID_t* IDs, const int len,
- const count_t value, hpdEntry_t& hpdAddr, uint64_t& filterIdx) {
+template<typename T>
+bool PerfectHash<T>::update(const wordID_t* IDs, const int len,
+ const count_t value, hpdEntry_t& hpdAddr, uint64_t& filterIdx)
+{
// check if key is in high perf. dictionary
filterIdx = cells_ + 1;
string skey = hpDictKeyValue(IDs, len);
if((hpdAddr = dict_.find(skey)) != dict_.end()) {
- hpdAddr->second = value;
+ hpdAddr->second = value;
return true;
}
// else hash ngram
@@ -150,45 +153,45 @@ bool PerfectHash<T>::update(const wordID_t* IDs, const int len,
// restriction on fprint value is non-zero
T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
uint64_t index = bucket * bucketRange_, // starting bucket row
- lastrow = index + bucketRange_;
+ lastrow = index + bucketRange_;
while(index < lastrow) { // must check each row for matching fp event
T filterVal = filter_->read(index);
if(filterVal == fp) { // found event w.h.p.
- values_->write(index, (T)qtizer_->code(value));
+ values_->write(index, (T)qtizer_->code(value));
filterIdx = index;
return true;
}
++index;
}
- // could add if it gets here.
+ // could add if it gets here.
return false;
}
-template<typename T>
-int PerfectHash<T>::query(const wordID_t* IDs, const int len,
- hpdEntry_t& hpdAddr, uint64_t& filterIdx) {
+template<typename T>
+int PerfectHash<T>::query(const wordID_t* IDs, const int len,
+ hpdEntry_t& hpdAddr, uint64_t& filterIdx)
+{
// check if key is in high perf. dictionary
string skey = hpDictKeyValue(IDs, len);
if((hpdAddr = dict_.find(skey)) != dict_.end()) {
filterIdx = cells_ + 1;
return(hpdAddr->second); // returns copy of value
- }
- else { // check if key is in filter
- // get bucket
+ } else { // check if key is in filter
+ // get bucket
//count_t bucket = bucketHash_->hash(IDs, len);
count_t bucket = (bucketHash_->size() > 1 ? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len, 0));
// restriction on fprint value is non-zero
T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
// return value if ngram is in filter
uint64_t index = bucket * bucketRange_,
- lastrow = index + bucketRange_;
+ lastrow = index + bucketRange_;
for(; index < lastrow; ++index) {
if(filter_->read(index) == fp) {
- //cout << "fp = " << fp << "\tbucket = " << bucket << "\tfilter =" <<
- //filter_->read(index) << "\tcode = " << code << endl;
+ //cout << "fp = " << fp << "\tbucket = " << bucket << "\tfilter =" <<
+ //filter_->read(index) << "\tcode = " << code << endl;
filterIdx = index;
hpdAddr = dict_.end();
- return (int)qtizer_->value(values_->read(index));
+ return (int)qtizer_->value(values_->read(index));
}
}
}
@@ -196,22 +199,23 @@ int PerfectHash<T>::query(const wordID_t* IDs, const int len,
}
template<typename T>
-void PerfectHash<T>::remove(const wordID_t* IDs, const int len) {
+void PerfectHash<T>::remove(const wordID_t* IDs, const int len)
+{
// delete key if in high perf. dictionary
string skey = hpDictKeyValue(IDs, len);
if(dict_.find(skey) != dict_.end())
dict_.erase(skey);
else { // check if key is in filter
- // get small representation for ngrams
+ // get small representation for ngrams
//count_t bucket = bucketHash_->hash(IDs, len);
count_t bucket = (bucketHash_->size() > 1? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len, 0));
// retrieve non zero fingerprint for ngram
- T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
+ T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
// return value if ngram is in filter
uint64_t index = bucket * bucketRange_,
- lastrow = index + bucketRange_;
+ lastrow = index + bucketRange_;
for(; index < lastrow; ++index) {
- if(filter_->read(index) == fp) {
+ if(filter_->read(index) == fp) {
filter_->write(index, 0);
values_->write(index, 0);
--idxTracker_[bucket]; // track bucket size reduction
@@ -222,7 +226,8 @@ void PerfectHash<T>::remove(const wordID_t* IDs, const int len) {
}
template<typename T> // clear filter index
-void PerfectHash<T>::remove(uint64_t index) {
+void PerfectHash<T>::remove(uint64_t index)
+{
CHECK(index < cells_);
CHECK(filter_->read(index) != 0); // slow
filter_->write(index, 0);
@@ -234,20 +239,22 @@ void PerfectHash<T>::remove(uint64_t index) {
template<typename T>
T PerfectHash<T>::nonZeroSignature(const wordID_t* IDs, const int len,
- count_t bucket) {
+ count_t bucket)
+{
count_t h = bucket;
T fingerprint(0);
do {
fingerprint = fingerHash_->hash(IDs, len, h);
- h += (h < fingerHash_->size() - 1 ? 1 : -h); // wrap around
+ h += (h < fingerHash_->size() - 1 ? 1 : -h); // wrap around
} while((fingerprint == 0) && (h != bucket));
- if(fingerprint == 0)
+ if(fingerprint == 0)
    cerr << "WARNING: Unable to find non-zero signature for ngram" << endl;
return fingerprint;
}
template<typename T>
-string PerfectHash<T>::hpDictKeyValue(const wordID_t* IDs, const int len) {
+string PerfectHash<T>::hpDictKeyValue(const wordID_t* IDs, const int len)
+{
string skey(" ");
for(int i = 0; i < len; ++i)
skey += Utils::IntToStr(IDs[i]) + "¬";
@@ -256,19 +263,22 @@ string PerfectHash<T>::hpDictKeyValue(const wordID_t* IDs, const int len) {
}
template<typename T>
-count_t PerfectHash<T>::hpDictMemUse() {
+count_t PerfectHash<T>::hpDictMemUse()
+{
// return hpDict memory usage in MBs
return (count_t) sizeof(hpDict_t::value_type)* dict_.size() >> 20;
}
template<typename T>
-count_t PerfectHash<T>::bucketsMemUse() {
+count_t PerfectHash<T>::bucketsMemUse()
+{
// return bucket memory usage in MBs
- return (count_t) (filter_->size() + values_->size());
+ return (count_t) (filter_->size() + values_->size());
}
template<typename T>
-void PerfectHash<T>::save(FileHandler* fout) {
+void PerfectHash<T>::save(FileHandler* fout)
+{
CHECK(fout != 0);
cerr << "\tSaving perfect hash parameters...\n";
fout->write((char*)&hitMask_, sizeof(hitMask_));
@@ -289,12 +299,13 @@ void PerfectHash<T>::save(FileHandler* fout) {
count_t size = dict_.size();
fout->write((char*)&size, sizeof(count_t));
*fout << endl;
- iterate(dict_, t)
- *fout << t->first << "\t" << t->second << "\n";
+ iterate(dict_, t)
+ *fout << t->first << "\t" << t->second << "\n";
}
template<typename T>
-void PerfectHash<T>::load(FileHandler* fin) {
+void PerfectHash<T>::load(FileHandler* fin)
+{
CHECK(fin != 0);
cerr << "\tLoading perfect hash parameters...\n";
fin->read((char*)&hitMask_, sizeof(hitMask_));
@@ -331,12 +342,13 @@ void PerfectHash<T>::load(FileHandler* fin) {
}
template<typename T>
-void PerfectHash<T>::analyze() {
+void PerfectHash<T>::analyze()
+{
cerr << "Analyzing Dynamic Bloomier Filter...\n";
// see how many items in each bucket
uint8_t* bucketCnt = new uint8_t[totBuckets_];
- unsigned largestBucket = 0, totalCellsSet = 0,
- smallestBucket = bucketRange_, totalZeroes = 0;
+ unsigned largestBucket = 0, totalCellsSet = 0,
+ smallestBucket = bucketRange_, totalZeroes = 0;
int curBucket = -1, fullBuckets(0);
for(int i = 0; i < totBuckets_; ++i) bucketCnt[i] = 0;
for(uint64_t i =0; i < cells_; ++i) {
@@ -344,16 +356,14 @@ void PerfectHash<T>::analyze() {
if(filter_->read(i) != 0) {
++bucketCnt[curBucket];
++totalCellsSet;
- }
- else ++totalZeroes;
+ } else ++totalZeroes;
}
count_t bi = 0, si = 0;
for(int i = 0; i < totBuckets_; ++i) {
if(bucketCnt[i] > largestBucket) {
largestBucket = bucketCnt[i];
bi = i;
- }
- else if(bucketCnt[i] < smallestBucket) {
+ } else if(bucketCnt[i] < smallestBucket) {
smallestBucket = bucketCnt[i];
si = i;
}
@@ -366,8 +376,8 @@ void PerfectHash<T>::analyze() {
}
for(int i = 0; i < totBuckets_; ++i) {
if(bucketCnt[i] != idxTracker_[i])
- cerr << "bucketCnt[" << i << "] = " << (int)bucketCnt[i] <<
- "\tidxTracker_[" << i << "] = " << (int)idxTracker_[i] << endl;
+ cerr << "bucketCnt[" << i << "] = " << (int)bucketCnt[i] <<
+ "\tidxTracker_[" << i << "] = " << (int)idxTracker_[i] << endl;
}
cerr << "total cells= " << cells_ << endl;
cerr << "total buckets= " << totBuckets_ << endl;
@@ -380,7 +390,7 @@ void PerfectHash<T>::analyze() {
cerr << "largest bucket (" << bi << ") size= " << largestBucket << endl;
cerr << "smallest bucket (" << si << ") size= " << smallestBucket << endl;
cerr << "last bucket size= " << (int)bucketCnt[totBuckets_ - 1] <<
- " (idxTracker last bucket size = " << (int)idxTracker_[totBuckets_ - 1] << ")" << endl;
+ " (idxTracker last bucket size = " << (int)idxTracker_[totBuckets_ - 1] << ")" << endl;
cerr << "total buckets full = " << fullBuckets << endl;
cerr << "total collision errors= " << collisions_ << endl;
cerr << "high performance dictionary size= " << dict_.size() << endl;
@@ -390,14 +400,15 @@ void PerfectHash<T>::analyze() {
delete[] bucketCnt;
}
-template<typename T>
-bool PerfectHash<T>::update2(const wordID_t* IDs, const int len,
- const count_t value, hpdEntry_t& hpdAddr, uint64_t& filterIdx) {
+template<typename T>
+bool PerfectHash<T>::update2(const wordID_t* IDs, const int len,
+ const count_t value, hpdEntry_t& hpdAddr, uint64_t& filterIdx)
+{
// check if key is in high perf. dictionary
filterIdx = cells_ + 1;
string skey = hpDictKeyValue(IDs, len);
if((hpdAddr = dict_.find(skey)) != dict_.end()) {
- hpdAddr->second += value;
+ hpdAddr->second += value;
return true;
}
// else hash ngram
@@ -406,18 +417,18 @@ bool PerfectHash<T>::update2(const wordID_t* IDs, const int len,
// restriction on fprint value is non-zero
T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
uint64_t index = bucket * bucketRange_, // starting bucket row
- lastrow = index + bucketRange_;
+ lastrow = index + bucketRange_;
while(index < lastrow) { // must check each row for matching fp event
T filterVal = filter_->read(index);
if(filterVal == fp) { // found event w.h.p.
- int oldval = (int)qtizer_->value(values_->read(index));
- values_->write(index, (T)qtizer_->code(oldval + value));
+ int oldval = (int)qtizer_->value(values_->read(index));
+ values_->write(index, (T)qtizer_->code(oldval + value));
filterIdx = index;
return true;
}
++index;
}
- // add if it gets here.
+ // add if it gets here.
insert(IDs, len, value);
return false;
}
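Stripped of the Moses types (Filter, UnivHash_linear, LogQtizer), the bucket-plus-fingerprint scheme that insert(), query(), and update2() implement above reduces to the sketch below. Everything here is a simplification for illustration: std::hash with a salt stands in for the universal hash family, values are stored unquantized, and string keys replace the wordID_t arrays:

#include <cstdint>
#include <functional>
#include <string>
#include <unordered_map>
#include <vector>

// Toy dynamic-Bloomier-filter sketch: each key hashes to a bucket of `range`
// consecutive cells; a non-zero fingerprint identifies the key inside the
// bucket, and exact collisions overflow into a small backing dictionary
// (the analogue of the "high performance dictionary" dict_ above).
struct TinyBloomier {
  size_t range, buckets;
  std::vector<uint16_t> fp;                      // fingerprint per cell, 0 = empty
  std::vector<int> val;                          // value per cell (unquantized here)
  std::unordered_map<std::string, int> overflow; // exact store for collisions

  TinyBloomier(size_t nBuckets, size_t bucketRange)
    : range(bucketRange), buckets(nBuckets),
      fp(nBuckets * bucketRange, 0), val(nBuckets * bucketRange, 0) {}

  // Placeholder hashes: the "#b" salt fakes a second, independent hash function.
  size_t bucketOf(const std::string& k) const {
    return std::hash<std::string>()(k + "#b") % buckets;
  }
  uint16_t fingerprint(const std::string& k) const {
    uint16_t f = std::hash<std::string>()(k) & 0xffff;
    return f ? f : 1;                            // fingerprints must be non-zero
  }

  void insert(const std::string& k, int v) {
    size_t row = bucketOf(k) * range, empty = row + range;
    uint16_t f = fingerprint(k);
    for (size_t i = row; i < row + range; ++i) {
      if (fp[i] == f) { overflow[k] = v; return; }       // fingerprint clash
      if (fp[i] == 0 && empty == row + range) empty = i; // record first free cell
    }
    if (empty < row + range) { fp[empty] = f; val[empty] = v; }
    else overflow[k] = v;                                // bucket full
  }

  bool query(const std::string& k, int& out) const {
    std::unordered_map<std::string, int>::const_iterator it = overflow.find(k);
    if (it != overflow.end()) { out = it->second; return true; }
    size_t row = bucketOf(k) * range;
    uint16_t f = fingerprint(k);
    for (size_t i = row; i < row + range; ++i)
      if (fp[i] == f) { out = val[i]; return true; }     // match w.h.p.
    return false;
  }
};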
diff --git a/moses/TranslationModel/DynSAInclude/quantizer.h b/moses/TranslationModel/DynSAInclude/quantizer.h
index 6c6850fa6..68d6a55a3 100644
--- a/moses/TranslationModel/DynSAInclude/quantizer.h
+++ b/moses/TranslationModel/DynSAInclude/quantizer.h
@@ -14,7 +14,8 @@ static const float kFloatErr = 0.00001f;
#endif
//! @todo ask abby2
-class LogQtizer {
+class LogQtizer
+{
public:
LogQtizer(float i): base_(pow(2, 1 / i)) {
CHECK(base_ > 1);
@@ -22,8 +23,8 @@ public:
float value = 1; // code = 1 -> value = 1 for any base
std::vector<float> code_to_value_vec;
while (log2(value) < 30) { // assume 2^30 is largest count
- code_to_value_vec.push_back(value);
- value = pow(base_, ++max_code_);
+ code_to_value_vec.push_back(value);
+ value = pow(base_, ++max_code_);
}
code_to_value_vec.push_back(value); // store max_code_ so in total [0, max_code_]
// get valid range
@@ -46,22 +47,22 @@ public:
int code(float value) {
// should just be: return log_b(value)
CHECK(!(value < min_value_ || value > max_value_));
- // but binary search removes errors due to floor operator above
- int code = static_cast<int>(std::lower_bound(code_to_value_, code_to_value_+ max_code_,
- value) - code_to_value_);
- // make sure not overestimating
+ // but binary search removes errors due to floor operator above
+ int code = static_cast<int>(std::lower_bound(code_to_value_, code_to_value_+ max_code_,
+ value) - code_to_value_);
+ // make sure not overestimating
code = code_to_value_[code] > value ? code - 1 : code;
return code;
}
inline float value(int code) {
- // table look up for values
+ // table look up for values
return code_to_value_[code];
}
inline int maxcode() {
return max_code_;
}
inline float logValue(int code) {
- // table look up for log of values
+ // table look up for log of values
return code_to_log_value_[code];
}
~LogQtizer() {
@@ -75,15 +76,15 @@ public:
fout->write((char*)&min_value_, sizeof(min_value_));
for (int j = 0; j <= max_code_; ++j)
fout->write((char*)&code_to_value_[j], sizeof(code_to_value_[j]));
- for (int j = 0; j <= max_code_; ++j)
+ for (int j = 0; j <= max_code_; ++j)
fout->write((char*)&code_to_log_value_[j], sizeof(code_to_log_value_[j]));
    std::cerr << "Saved log codebook with " << max_code_ + 1 << " codes." << std::endl;
}
private:
float base_;
- float* code_to_value_;
+ float* code_to_value_;
float* code_to_log_value_;
- int max_code_;
+ int max_code_;
float max_value_;
float min_value_;
void load(FileHandler* fin) {
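The LogQtizer above stores counts as small integer codes on a logarithmic scale: code k represents base^k with base = 2^(1/i), so i codes cover each doubling and the relative quantization error is bounded by the base. A toy sketch of that mapping (the codebook save/load and the log-value table are omitted):

#include <algorithm>
#include <cmath>
#include <vector>

// Toy log-quantizer sketch: table[k] = base^k; code() finds the largest k
// with table[k] <= value via binary search, value() is a table lookup.
struct LogQuantSketch {
  double base;
  std::vector<double> table;
  explicit LogQuantSketch(double stepsPerDoubling)
    : base(std::pow(2.0, 1.0 / stepsPerDoubling)) {
    for (double v = 1.0; std::log2(v) < 30; v *= base) // assume 2^30 is the largest count
      table.push_back(v);
  }
  int code(double value) const {
    int k = static_cast<int>(std::lower_bound(table.begin(), table.end(), value)
                             - table.begin());
    if (k == static_cast<int>(table.size()) || (k > 0 && table[k] > value))
      --k;                                             // never overestimate
    return k;
  }
  double value(int k) const { return table[k]; }
};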
diff --git a/moses/TranslationModel/DynSAInclude/vocab.cpp b/moses/TranslationModel/DynSAInclude/vocab.cpp
index 27e052260..da1443f66 100644
--- a/moses/TranslationModel/DynSAInclude/vocab.cpp
+++ b/moses/TranslationModel/DynSAInclude/vocab.cpp
@@ -103,10 +103,11 @@ bool Vocab::Load(const std::string & vocab_path, const FactorDirection& directio
std::cerr << "Loading vocab from " << vocab_path << std::endl;
return Load(&vcbin, direction, factors, closed);
}
-bool Vocab::Load(FileHandler* vcbin) {
+bool Vocab::Load(FileHandler* vcbin)
+{
FactorList factors;
factors.push_back(0);
- return Load(vcbin, Input, factors);
+ return Load(vcbin, Input, factors);
}
bool Vocab::Load(FileHandler* vcbin, const FactorDirection& direction,
const FactorList& factors, bool closed)
diff --git a/moses/TranslationModel/DynSAInclude/vocab.h b/moses/TranslationModel/DynSAInclude/vocab.h
index 0c0d50a07..18ff96c8d 100644
--- a/moses/TranslationModel/DynSAInclude/vocab.h
+++ b/moses/TranslationModel/DynSAInclude/vocab.h
@@ -11,7 +11,7 @@
namespace Moses
{
-
+
//! Vocab maps between strings and uint32 ids.
class Vocab
{
diff --git a/moses/TranslationModel/DynSuffixArray.cpp b/moses/TranslationModel/DynSuffixArray.cpp
index 7d5847cc6..c5fddf3f0 100644
--- a/moses/TranslationModel/DynSuffixArray.cpp
+++ b/moses/TranslationModel/DynSuffixArray.cpp
@@ -74,12 +74,12 @@ int DynSuffixArray::F_firstIdx(unsigned word)
// return index of first row where word is found in m_F
/*for(int i=0; i < m_F->size(); ++i) {
if(m_F->at(i) == word) {
- return i;
+ return i;
}
}
return -1;*/
- //NOTE: lower_bound is faster than linear search above but may cause issues
- // if ordering of vocab is not consecutive (ie..after deletions)
+ //NOTE: lower_bound is faster than linear search above but may cause issues
+  //      if ordering of vocab is not consecutive (i.e., after deletions)
int low = std::lower_bound(m_F->begin(), m_F->end(), word) - m_F->begin();
//cerr << "in F_firstIdx with word = " << word << " and low = " << low << " and F->size() =" << m_F->size() << endl;
if((size_t)low >= m_F->size())
@@ -146,8 +146,8 @@ void DynSuffixArray::Reorder(unsigned j, unsigned jprime)
{
set<pair<unsigned, unsigned> > seen;
while(j != jprime) {
- // this 'seenit' check added for data with many loops. will remove after double
- // checking.
+      // this 'seenit' check was added for data with many loops; it will be
+      // removed after double-checking.
bool seenit = seen.insert(std::make_pair(j, jprime)).second;
if(seenit) {
for(size_t i=1; i < m_SA->size(); ++i) {
@@ -163,9 +163,9 @@ void DynSuffixArray::Reorder(unsigned j, unsigned jprime)
int new_j = LastFirstFunc(j);
CHECK(j <= jprime);
// for SA and L, the element at pos j is moved to pos j'
- m_L->insert(m_L->begin() + jprime + 1, m_L->at(j));
+ m_L->insert(m_L->begin() + jprime + 1, m_L->at(j));
m_L->erase(m_L->begin() + j);
- m_SA->insert(m_SA->begin() + jprime + 1, m_SA->at(j));
+ m_SA->insert(m_SA->begin() + jprime + 1, m_SA->at(j));
m_SA->erase(m_SA->begin() + j);
// all ISA values between (j...j'] decremented
for(size_t i = 0; i < m_ISA->size(); ++i) {
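The F_firstIdx change above swaps a linear scan for std::lower_bound, which is valid only while m_F (the sorted first column used by the last-first mapping) stays sorted; hence the NOTE about non-consecutive vocab ids after deletions. The lookup it performs amounts to the following sketch:

#include <algorithm>
#include <vector>

// Sketch: index of the first row holding `word` in the sorted column F,
// or -1 if absent (mirrors the commented-out linear scan above).
int firstIdxSketch(const std::vector<unsigned>& F, unsigned word)
{
  std::vector<unsigned>::const_iterator it =
      std::lower_bound(F.begin(), F.end(), word);
  if (it == F.end() || *it != word) return -1;
  return static_cast<int>(it - F.begin());
}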
diff --git a/moses/TranslationModel/PhraseDictionary.cpp b/moses/TranslationModel/PhraseDictionary.cpp
index 808f7ce81..a0c94ccdc 100644
--- a/moses/TranslationModel/PhraseDictionary.cpp
+++ b/moses/TranslationModel/PhraseDictionary.cpp
@@ -31,7 +31,7 @@ namespace Moses
{
PhraseDictionary::PhraseDictionary(const std::string &description, const std::string &line)
-:DecodeFeature(description, line)
+ :DecodeFeature(description, line)
{
  m_tableLimit = 20; // TODO: default?
@@ -40,20 +40,15 @@ PhraseDictionary::PhraseDictionary(const std::string &description, const std::st
if (args[0] == "num-input-features") {
m_numInputScores = Scan<unsigned>(args[1]);
- }
- else if (args[0] == "path") {
+ } else if (args[0] == "path") {
m_filePath = args[1];
- }
- else if (args[0] == "table-limit") {
+ } else if (args[0] == "table-limit") {
m_tableLimit = Scan<size_t>(args[1]);
- }
- else if (args[0] == "target-path") {
+ } else if (args[0] == "target-path") {
m_targetFile = args[1];
- }
- else if (args[0] == "alignment-path") {
+ } else if (args[0] == "alignment-path") {
m_alignmentsFile = args[1];
- }
- else {
+ } else {
//throw "Unknown argument " + args[0];
}
} // for (size_t i = 0; i < toks.size(); ++i) {
diff --git a/moses/TranslationModel/PhraseDictionary.h b/moses/TranslationModel/PhraseDictionary.h
index 4c10c2e6b..1b1197eb1 100644
--- a/moses/TranslationModel/PhraseDictionary.h
+++ b/moses/TranslationModel/PhraseDictionary.h
@@ -88,7 +88,9 @@ public:
const PhraseDictionary* GetDictionary() const;
PhraseDictionary* GetDictionary();
- const std::string &GetFilePath() const { return m_filePath; }
+ const std::string &GetFilePath() const {
+ return m_filePath;
+ }
protected:
size_t m_tableLimit;
diff --git a/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp b/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp
index 126dd3365..afa1c4abc 100644
--- a/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp
+++ b/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp
@@ -9,7 +9,7 @@ using namespace std;
namespace Moses
{
PhraseDictionaryDynSuffixArray::PhraseDictionaryDynSuffixArray(const std::string &line)
-:PhraseDictionary("PhraseDictionaryDynSuffixArray", line)
+ :PhraseDictionary("PhraseDictionaryDynSuffixArray", line)
{
m_biSA = new BilingualDynSuffixArray();
}
@@ -63,7 +63,7 @@ const TargetPhraseCollection *PhraseDictionaryDynSuffixArray::GetTargetPhraseCol
void PhraseDictionaryDynSuffixArray::insertSnt(string& source, string& target, string& alignment)
{
m_biSA->addSntPair(source, target, alignment); // insert sentence pair into suffix arrays
- //StaticData::Instance().ClearTransOptionCache(); // clear translation option cache
+ //StaticData::Instance().ClearTransOptionCache(); // clear translation option cache
}
void PhraseDictionaryDynSuffixArray::deleteSnt(unsigned /* idx */, unsigned /* num2Del */)
{
diff --git a/moses/TranslationModel/PhraseDictionaryMemory.cpp b/moses/TranslationModel/PhraseDictionaryMemory.cpp
index 27cac9f5f..c43b919a4 100644
--- a/moses/TranslationModel/PhraseDictionaryMemory.cpp
+++ b/moses/TranslationModel/PhraseDictionaryMemory.cpp
@@ -41,9 +41,9 @@ namespace Moses
{
TargetPhraseCollection &PhraseDictionaryMemory::GetOrCreateTargetPhraseCollection(
- const Phrase &source
- , const TargetPhrase &target
- , const Word *sourceLHS)
+ const Phrase &source
+ , const TargetPhrase &target
+ , const Word *sourceLHS)
{
PhraseDictionaryNodeMemory &currNode = GetOrCreateNode(source, target, sourceLHS);
return currNode.GetOrCreateTargetPhraseCollection();
@@ -73,8 +73,8 @@ const TargetPhraseCollection *PhraseDictionaryMemory::GetTargetPhraseCollection(
}
PhraseDictionaryNodeMemory &PhraseDictionaryMemory::GetOrCreateNode(const Phrase &source
- , const TargetPhrase &target
- , const Word *sourceLHS)
+ , const TargetPhrase &target
+ , const Word *sourceLHS)
{
const size_t size = source.GetSize();
@@ -102,12 +102,12 @@ PhraseDictionaryNodeMemory &PhraseDictionaryMemory::GetOrCreateNode(const Phrase
CHECK(currNode != NULL);
}
-
+
// finally, the source LHS
//currNode = currNode->GetOrCreateChild(sourceLHS);
//CHECK(currNode != NULL);
-
+
return *currNode;
}
@@ -120,8 +120,7 @@ ChartRuleLookupManager *PhraseDictionaryMemory::CreateRuleLookupManager(
void PhraseDictionaryMemory::SortAndPrune()
{
- if (GetTableLimit())
- {
+ if (GetTableLimit()) {
m_collection.Sort(GetTableLimit());
}
}
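GetOrCreateNode above walks a trie keyed on the source words, creating nodes on demand; the phrase's target collection hangs off the final node. The pattern, reduced to a sketch with a hypothetical TrieNode type in place of PhraseDictionaryNodeMemory:

#include <map>
#include <string>
#include <vector>

// Hypothetical stand-in for PhraseDictionaryNodeMemory: children keyed by
// source word, plus a payload for the target phrases stored at this node.
struct TrieNode {
  std::map<std::string, TrieNode> children;
  std::vector<std::string> targets; // stand-in for TargetPhraseCollection
};

// Walk/extend the trie along the source phrase; operator[] creates any
// missing child, so the returned node always exists.
TrieNode& getOrCreateNode(TrieNode& root, const std::vector<std::string>& source)
{
  TrieNode* cur = &root;
  for (size_t i = 0; i < source.size(); ++i)
    cur = &cur->children[source[i]];
  return *cur;
}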
diff --git a/moses/TranslationModel/PhraseDictionaryMemory.h b/moses/TranslationModel/PhraseDictionaryMemory.h
index dad8b3bbd..d2a8d0ad3 100644
--- a/moses/TranslationModel/PhraseDictionaryMemory.h
+++ b/moses/TranslationModel/PhraseDictionaryMemory.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -38,15 +38,17 @@ class PhraseDictionaryMemory : public RuleTableTrie
protected:
PhraseDictionaryMemory(const std::string &description, const std::string &line)
- : RuleTableTrie(description, line)
+ : RuleTableTrie(description, line)
{}
public:
PhraseDictionaryMemory(const std::string &line)
- : RuleTableTrie("PhraseDictionaryMemory", line)
+ : RuleTableTrie("PhraseDictionaryMemory", line)
{}
- const PhraseDictionaryNodeMemory &GetRootNode() const { return m_collection; }
+ const PhraseDictionaryNodeMemory &GetRootNode() const {
+ return m_collection;
+ }
ChartRuleLookupManager *CreateRuleLookupManager(
const InputType &,
@@ -54,14 +56,14 @@ public:
TO_STRING();
- protected:
+protected:
TargetPhraseCollection &GetOrCreateTargetPhraseCollection(
- const Phrase &source, const TargetPhrase &target, const Word *sourceLHS);
+ const Phrase &source, const TargetPhrase &target, const Word *sourceLHS);
const TargetPhraseCollection *GetTargetPhraseCollection(const Phrase& source) const;
PhraseDictionaryNodeMemory &GetOrCreateNode(const Phrase &source
- , const TargetPhrase &target
- , const Word *sourceLHS);
+ , const TargetPhrase &target
+ , const Word *sourceLHS);
void SortAndPrune();
diff --git a/moses/TranslationModel/PhraseDictionaryMultiModel.cpp b/moses/TranslationModel/PhraseDictionaryMultiModel.cpp
index bf3f01a1e..e395cb5a3 100644
--- a/moses/TranslationModel/PhraseDictionaryMultiModel.cpp
+++ b/moses/TranslationModel/PhraseDictionaryMultiModel.cpp
@@ -26,7 +26,7 @@ namespace Moses
{
PhraseDictionaryMultiModel::PhraseDictionaryMultiModel(const std::string &line)
-:PhraseDictionary("PhraseDictionaryMultiModel", line)
+ :PhraseDictionary("PhraseDictionaryMultiModel", line)
{
for (size_t i = 0; i < m_args.size(); ++i) {
const vector<string> &args = m_args[i];
@@ -37,12 +37,10 @@ PhraseDictionaryMultiModel::PhraseDictionaryMultiModel(const std::string &line)
msg << "combination mode unknown: " << m_mode;
throw runtime_error(msg.str());
}
- }
- else if (args[0] == "components") {
+ } else if (args[0] == "components") {
m_pdStr = Tokenize(args[1], ",");
m_numModels = m_pdStr.size();
- }
- else if (args[0] == "lambda") {
+ } else if (args[0] == "lambda") {
m_multimodelweights = Tokenize<float>(args[1], ",");
}
} // for
@@ -55,15 +53,14 @@ PhraseDictionaryMultiModel::PhraseDictionaryMultiModel(const std::string &line)
}
PhraseDictionaryMultiModel::PhraseDictionaryMultiModel(const std::string &description, const std::string &line)
-:PhraseDictionary(description, line)
+ :PhraseDictionary(description, line)
{
for (size_t i = 0; i < m_args.size(); ++i) {
const vector<string> &args = m_args[i];
if (args[0] == "components") {
m_pdStr = Tokenize(args[1], ",");
m_numModels = m_pdStr.size();
- }
- else if (args[0] == "lambda") {
+ } else if (args[0] == "lambda") {
m_multimodelweights = Tokenize<float>(args[1], ",");
}
} // for
@@ -83,7 +80,7 @@ bool PhraseDictionaryMultiModel::InitDictionary()
// one could choose a higher value than tableLimit (or 0) here for maximal precision, at a cost of speed.
- for(size_t i = 0; i < m_numModels; ++i){
+ for(size_t i = 0; i < m_numModels; ++i) {
const string &ptName = m_pdStr[i];
PhraseDictionary *pt = FindPhraseDictionary(ptName);
@@ -144,7 +141,7 @@ const TargetPhraseCollection *PhraseDictionaryMultiModel::GetTargetPhraseCollect
void PhraseDictionaryMultiModel::CollectSufficientStatistics(const Phrase& src, std::map<std::string,multiModelStatistics*>* allStats) const
{
- for(size_t i = 0; i < m_numModels; ++i){
+ for(size_t i = 0; i < m_numModels; ++i) {
const PhraseDictionary &pd = *m_pd[i];
TargetPhraseCollection *ret_raw = (TargetPhraseCollection*) pd.GetTargetPhraseCollection( src);
@@ -152,10 +149,9 @@ void PhraseDictionaryMultiModel::CollectSufficientStatistics(const Phrase& src,
TargetPhraseCollection::iterator iterTargetPhrase, iterLast;
if (m_tableLimit != 0 && ret_raw->GetSize() > m_tableLimit) {
- iterLast = ret_raw->begin() + m_tableLimit;
- }
- else {
- iterLast = ret_raw->end();
+ iterLast = ret_raw->begin() + m_tableLimit;
+ } else {
+ iterLast = ret_raw->end();
}
for (iterTargetPhrase = ret_raw->begin(); iterTargetPhrase != iterLast; ++iterTargetPhrase) {
@@ -173,9 +169,9 @@ void PhraseDictionaryMultiModel::CollectSufficientStatistics(const Phrase& src,
Scores scoreVector(m_numScoreComponents);
statistics->p.resize(m_numScoreComponents);
- for(size_t j = 0; j < m_numScoreComponents; ++j){
- statistics->p[j].resize(m_numModels);
- scoreVector[j] = -raw_scores[j];
+ for(size_t j = 0; j < m_numScoreComponents; ++j) {
+ statistics->p[j].resize(m_numModels);
+ scoreVector[j] = -raw_scores[j];
}
statistics->targetPhrase->GetScoreBreakdown().Assign(this, scoreVector); // set scores to 0
@@ -186,8 +182,8 @@ void PhraseDictionaryMultiModel::CollectSufficientStatistics(const Phrase& src,
}
multiModelStatistics * statistics = (*allStats)[targetString];
- for(size_t j = 0; j < m_numScoreComponents; ++j){
- statistics->p[j][i] = UntransformScore(raw_scores[j]);
+ for(size_t j = 0; j < m_numScoreComponents; ++j) {
+ statistics->p[j][i] = UntransformScore(raw_scores[j]);
}
(*allStats)[targetString] = statistics;
@@ -199,26 +195,26 @@ void PhraseDictionaryMultiModel::CollectSufficientStatistics(const Phrase& src,
TargetPhraseCollection* PhraseDictionaryMultiModel::CreateTargetPhraseCollectionLinearInterpolation(const Phrase& src, std::map<std::string,multiModelStatistics*>* allStats, std::vector<std::vector<float> > &multimodelweights) const
{
- TargetPhraseCollection *ret = new TargetPhraseCollection();
- for ( std::map< std::string, multiModelStatistics*>::const_iterator iter = allStats->begin(); iter != allStats->end(); ++iter ) {
+ TargetPhraseCollection *ret = new TargetPhraseCollection();
+ for ( std::map< std::string, multiModelStatistics*>::const_iterator iter = allStats->begin(); iter != allStats->end(); ++iter ) {
- multiModelStatistics * statistics = iter->second;
+ multiModelStatistics * statistics = iter->second;
- Scores scoreVector(m_numScoreComponents);
+ Scores scoreVector(m_numScoreComponents);
- for(size_t i = 0; i < m_numScoreComponents-1; ++i){
- scoreVector[i] = TransformScore(std::inner_product(statistics->p[i].begin(), statistics->p[i].end(), multimodelweights[i].begin(), 0.0));
- }
+ for(size_t i = 0; i < m_numScoreComponents-1; ++i) {
+ scoreVector[i] = TransformScore(std::inner_product(statistics->p[i].begin(), statistics->p[i].end(), multimodelweights[i].begin(), 0.0));
+ }
- //assuming that last value is phrase penalty
- scoreVector[m_numScoreComponents-1] = 1.0;
+ //assuming that last value is phrase penalty
+ scoreVector[m_numScoreComponents-1] = 1.0;
- statistics->targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
- statistics->targetPhrase->Evaluate(src);
+ statistics->targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
+ statistics->targetPhrase->Evaluate(src);
- ret->Add(new TargetPhrase(*statistics->targetPhrase));
- }
- return ret;
+ ret->Add(new TargetPhrase(*statistics->targetPhrase));
+ }
+ return ret;
}
@@ -235,8 +231,7 @@ std::vector<std::vector<float> > PhraseDictionaryMultiModel::getWeights(size_t n
//checking weights passed to mosesserver; only valid for this sentence; *don't* raise exception if client weights are malformed
if (weights_ptr == NULL || weights_ptr->size() == 0) {
weights_ptr = &m_multimodelweights; //fall back to weights defined in config
- }
- else if(weights_ptr->size() != m_numModels && weights_ptr->size() != m_numModels * numWeights) {
+ } else if(weights_ptr->size() != m_numModels && weights_ptr->size() != m_numModels * numWeights) {
//TODO: can we pass error message to client if weights are malformed?
std::stringstream strme;
strme << "Must have either one multimodel weight per model (" << m_numModels << "), or one per weighted feature and model (" << numWeights << "*" << m_numModels << "). You have " << weights_ptr->size() << ". Reverting to weights in config";
@@ -246,34 +241,30 @@ std::vector<std::vector<float> > PhraseDictionaryMultiModel::getWeights(size_t n
//checking weights defined in config; only valid for this sentence; raise exception if config weights are malformed
if (weights_ptr == NULL || weights_ptr->size() == 0) {
- for (size_t i=0;i < m_numModels;i++) {
+ for (size_t i=0; i < m_numModels; i++) {
raw_weights.push_back(1.0/m_numModels); //uniform weights created online
}
- }
- else if(weights_ptr->size() != m_numModels && weights_ptr->size() != m_numModels * numWeights) {
+ } else if(weights_ptr->size() != m_numModels && weights_ptr->size() != m_numModels * numWeights) {
std::stringstream strme;
strme << "Must have either one multimodel weight per model (" << m_numModels << "), or one per weighted feature and model (" << numWeights << "*" << m_numModels << "). You have " << weights_ptr->size() << ".";
UTIL_THROW(util::Exception, strme.str());
- }
- else {
- raw_weights = *weights_ptr;
+ } else {
+ raw_weights = *weights_ptr;
}
std::vector<std::vector<float> > multimodelweights (numWeights);
- for (size_t i=0;i < numWeights;i++) {
+ for (size_t i=0; i < numWeights; i++) {
std::vector<float> weights_onefeature (m_numModels);
if(raw_weights.size() == m_numModels) {
- weights_onefeature = raw_weights;
- }
- else {
- copy ( raw_weights.begin()+i*m_numModels, raw_weights.begin()+(i+1)*m_numModels, weights_onefeature.begin() );
+ weights_onefeature = raw_weights;
+ } else {
+ copy ( raw_weights.begin()+i*m_numModels, raw_weights.begin()+(i+1)*m_numModels, weights_onefeature.begin() );
}
if(normalize) {
- multimodelweights[i] = normalizeWeights(weights_onefeature);
- }
- else {
- multimodelweights[i] = weights_onefeature;
+ multimodelweights[i] = normalizeWeights(weights_onefeature);
+ } else {
+ multimodelweights[i] = weights_onefeature;
}
}
@@ -282,12 +273,12 @@ std::vector<std::vector<float> > PhraseDictionaryMultiModel::getWeights(size_t n
std::vector<float> PhraseDictionaryMultiModel::normalizeWeights(std::vector<float> &weights) const
{
- std::vector<float> ret (m_numModels);
- float total = std::accumulate(weights.begin(),weights.end(),0.0);
- for (size_t i=0;i < weights.size();i++) {
- ret[i] = weights[i]/total;
- }
- return ret;
+ std::vector<float> ret (m_numModels);
+ float total = std::accumulate(weights.begin(),weights.end(),0.0);
+ for (size_t i=0; i < weights.size(); i++) {
+ ret[i] = weights[i]/total;
+ }
+ return ret;
}
@@ -298,7 +289,8 @@ ChartRuleLookupManager *PhraseDictionaryMultiModel::CreateRuleLookupManager(cons
//copied from PhraseDictionaryCompact; free memory allocated to TargetPhraseCollection (and each TargetPhrase) at end of sentence
-void PhraseDictionaryMultiModel::CacheForCleanup(TargetPhraseCollection* tpc) {
+void PhraseDictionaryMultiModel::CacheForCleanup(TargetPhraseCollection* tpc)
+{
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_sentenceMutex);
PhraseCache &ref = m_sentenceCache[boost::this_thread::get_id()];
@@ -309,7 +301,8 @@ void PhraseDictionaryMultiModel::CacheForCleanup(TargetPhraseCollection* tpc) {
}
-void PhraseDictionaryMultiModel::CleanUpAfterSentenceProcessing(const InputType &source) {
+void PhraseDictionaryMultiModel::CleanUpAfterSentenceProcessing(const InputType &source)
+{
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_sentenceMutex);
PhraseCache &ref = m_sentenceCache[boost::this_thread::get_id()];
@@ -317,7 +310,7 @@ void PhraseDictionaryMultiModel::CleanUpAfterSentenceProcessing(const InputType
PhraseCache &ref = m_sentenceCache;
#endif
for(PhraseCache::iterator it = ref.begin(); it != ref.end(); it++) {
- delete *it;
+ delete *it;
}
PhraseCache temp;
@@ -331,149 +324,150 @@ void PhraseDictionaryMultiModel::CleanUpAfterSentenceProcessing(const InputType
}
-void PhraseDictionaryMultiModel::CleanUpComponentModels(const InputType &source) {
- for(size_t i = 0; i < m_numModels; ++i){
+void PhraseDictionaryMultiModel::CleanUpComponentModels(const InputType &source)
+{
+ for(size_t i = 0; i < m_numModels; ++i) {
m_pd[i]->CleanUpAfterSentenceProcessing(source);
}
}
#ifdef WITH_DLIB
-vector<float> PhraseDictionaryMultiModel::MinimizePerplexity(vector<pair<string, string> > &phrase_pair_vector) {
+vector<float> PhraseDictionaryMultiModel::MinimizePerplexity(vector<pair<string, string> > &phrase_pair_vector)
+{
- const StaticData &staticData = StaticData::Instance();
- const string& factorDelimiter = staticData.GetFactorDelimiter();
+ const StaticData &staticData = StaticData::Instance();
+ const string& factorDelimiter = staticData.GetFactorDelimiter();
- map<pair<string, string>, size_t> phrase_pair_map;
+ map<pair<string, string>, size_t> phrase_pair_map;
- for ( vector<pair<string, string> >::const_iterator iter = phrase_pair_vector.begin(); iter != phrase_pair_vector.end(); ++iter ) {
- phrase_pair_map[*iter] += 1;
- }
+ for ( vector<pair<string, string> >::const_iterator iter = phrase_pair_vector.begin(); iter != phrase_pair_vector.end(); ++iter ) {
+ phrase_pair_map[*iter] += 1;
+ }
- vector<multiModelStatisticsOptimization*> optimizerStats;
+ vector<multiModelStatisticsOptimization*> optimizerStats;
- for ( map<pair<string, string>, size_t>::iterator iter = phrase_pair_map.begin(); iter != phrase_pair_map.end(); ++iter ) {
+ for ( map<pair<string, string>, size_t>::iterator iter = phrase_pair_map.begin(); iter != phrase_pair_map.end(); ++iter ) {
- pair<string, string> phrase_pair = iter->first;
- string source_string = phrase_pair.first;
- string target_string = phrase_pair.second;
+ pair<string, string> phrase_pair = iter->first;
+ string source_string = phrase_pair.first;
+ string target_string = phrase_pair.second;
- vector<float> fs(m_numModels);
- map<string,multiModelStatistics*>* allStats = new(map<string,multiModelStatistics*>);
+ vector<float> fs(m_numModels);
+ map<string,multiModelStatistics*>* allStats = new(map<string,multiModelStatistics*>);
- Phrase sourcePhrase(0);
- sourcePhrase.CreateFromString(Input, m_input, source_string, factorDelimiter, NULL);
+ Phrase sourcePhrase(0);
+ sourcePhrase.CreateFromString(Input, m_input, source_string, factorDelimiter, NULL);
- CollectSufficientStatistics(sourcePhrase, allStats); //optimization potential: only call this once per source phrase
+ CollectSufficientStatistics(sourcePhrase, allStats); //optimization potential: only call this once per source phrase
- //phrase pair not found; leave cache empty
- if (allStats->find(target_string) == allStats->end()) {
- RemoveAllInMap(*allStats);
- delete allStats;
- continue;
- }
+ //phrase pair not found; leave cache empty
+ if (allStats->find(target_string) == allStats->end()) {
+ RemoveAllInMap(*allStats);
+ delete allStats;
+ continue;
+ }
- multiModelStatisticsOptimization* targetStatistics = new multiModelStatisticsOptimization();
- targetStatistics->targetPhrase = new TargetPhrase(*(*allStats)[target_string]->targetPhrase);
- targetStatistics->p = (*allStats)[target_string]->p;
- targetStatistics->f = iter->second;
- optimizerStats.push_back(targetStatistics);
+ multiModelStatisticsOptimization* targetStatistics = new multiModelStatisticsOptimization();
+ targetStatistics->targetPhrase = new TargetPhrase(*(*allStats)[target_string]->targetPhrase);
+ targetStatistics->p = (*allStats)[target_string]->p;
+ targetStatistics->f = iter->second;
+ optimizerStats.push_back(targetStatistics);
- RemoveAllInMap(*allStats);
- delete allStats;
- }
+ RemoveAllInMap(*allStats);
+ delete allStats;
+ }
- Sentence sentence;
- CleanUpAfterSentenceProcessing(sentence); // free memory used by compact phrase tables
+ Sentence sentence;
+ CleanUpAfterSentenceProcessing(sentence); // free memory used by compact phrase tables
- size_t numWeights = m_numScoreComponents;
- if (m_mode == "interpolate") {
- //interpolation of phrase penalty is skipped, and fixed-value (2.718) is used instead. results will be screwed up if phrase penalty is not last feature
- numWeights = m_numScoreComponents-1;
- }
+ size_t numWeights = m_numScoreComponents;
+ if (m_mode == "interpolate") {
+    //interpolation of the phrase penalty is skipped and a fixed value (2.718) is used instead; results will be wrong if the phrase penalty is not the last feature
+ numWeights = m_numScoreComponents-1;
+ }
- vector<float> ret (m_numModels*numWeights);
- for (size_t iFeature=0; iFeature < numWeights; iFeature++) {
+ vector<float> ret (m_numModels*numWeights);
+ for (size_t iFeature=0; iFeature < numWeights; iFeature++) {
- CrossEntropy * ObjectiveFunction = new CrossEntropy(optimizerStats, this, iFeature);
+ CrossEntropy * ObjectiveFunction = new CrossEntropy(optimizerStats, this, iFeature);
- vector<float> weight_vector = Optimize(ObjectiveFunction, m_numModels);
+ vector<float> weight_vector = Optimize(ObjectiveFunction, m_numModels);
- if (m_mode == "interpolate") {
- weight_vector = normalizeWeights(weight_vector);
- }
+ if (m_mode == "interpolate") {
+ weight_vector = normalizeWeights(weight_vector);
+ }
- cerr << "Weight vector for feature " << iFeature << ": ";
- for (size_t i=0; i < m_numModels; i++) {
- ret[(iFeature*m_numModels)+i] = weight_vector[i];
- cerr << weight_vector[i] << " ";
- }
- cerr << endl;
- delete ObjectiveFunction;
+ cerr << "Weight vector for feature " << iFeature << ": ";
+ for (size_t i=0; i < m_numModels; i++) {
+ ret[(iFeature*m_numModels)+i] = weight_vector[i];
+ cerr << weight_vector[i] << " ";
}
+ cerr << endl;
+ delete ObjectiveFunction;
+ }
- RemoveAllInColl(optimizerStats);
- return ret;
+ RemoveAllInColl(optimizerStats);
+ return ret;
}
-vector<float> PhraseDictionaryMultiModel::Optimize(OptimizationObjective *ObjectiveFunction, size_t numModels) {
-
- dlib::matrix<double,0,1> starting_point;
- starting_point.set_size(numModels);
- starting_point = 1.0;
-
- try {
- dlib::find_min_bobyqa(*ObjectiveFunction,
- starting_point,
- 2*numModels+1, // number of interpolation points
- dlib::uniform_matrix<double>(numModels,1, 1e-09), // lower bound constraint
- dlib::uniform_matrix<double>(numModels,1, 1e100), // upper bound constraint
- 1.0, // initial trust region radius
- 1e-5, // stopping trust region radius
- 10000 // max number of objective function evaluations
- );
- }
- catch (dlib::bobyqa_failure& e)
- {
- cerr << e.what() << endl;
- }
+vector<float> PhraseDictionaryMultiModel::Optimize(OptimizationObjective *ObjectiveFunction, size_t numModels)
+{
- vector<float> weight_vector (numModels);
+ dlib::matrix<double,0,1> starting_point;
+ starting_point.set_size(numModels);
+ starting_point = 1.0;
+
+ try {
+ dlib::find_min_bobyqa(*ObjectiveFunction,
+ starting_point,
+ 2*numModels+1, // number of interpolation points
+ dlib::uniform_matrix<double>(numModels,1, 1e-09), // lower bound constraint
+ dlib::uniform_matrix<double>(numModels,1, 1e100), // upper bound constraint
+ 1.0, // initial trust region radius
+ 1e-5, // stopping trust region radius
+ 10000 // max number of objective function evaluations
+ );
+ } catch (dlib::bobyqa_failure& e) {
+ cerr << e.what() << endl;
+ }
- for (int i=0; i < starting_point.nr(); i++) {
- weight_vector[i] = starting_point(i);
- }
+ vector<float> weight_vector (numModels);
+
+ for (int i=0; i < starting_point.nr(); i++) {
+ weight_vector[i] = starting_point(i);
+ }
- cerr << "Cross-entropy: " << (*ObjectiveFunction)(starting_point) << endl;
- return weight_vector;
+ cerr << "Cross-entropy: " << (*ObjectiveFunction)(starting_point) << endl;
+ return weight_vector;
}
double CrossEntropy::operator() ( const dlib::matrix<double,0,1>& arg) const
{
- double total = 0.0;
- double n = 0.0;
- std::vector<float> weight_vector (m_model->m_numModels);
+ double total = 0.0;
+ double n = 0.0;
+ std::vector<float> weight_vector (m_model->m_numModels);
- for (int i=0; i < arg.nr(); i++) {
- weight_vector[i] = arg(i);
- }
- if (m_model->m_mode == "interpolate") {
- weight_vector = m_model->normalizeWeights(weight_vector);
- }
+ for (int i=0; i < arg.nr(); i++) {
+ weight_vector[i] = arg(i);
+ }
+ if (m_model->m_mode == "interpolate") {
+ weight_vector = m_model->normalizeWeights(weight_vector);
+ }
- for ( std::vector<multiModelStatisticsOptimization*>::const_iterator iter = m_optimizerStats.begin(); iter != m_optimizerStats.end(); ++iter ) {
- multiModelStatisticsOptimization* statistics = *iter;
- size_t f = statistics->f;
+ for ( std::vector<multiModelStatisticsOptimization*>::const_iterator iter = m_optimizerStats.begin(); iter != m_optimizerStats.end(); ++iter ) {
+ multiModelStatisticsOptimization* statistics = *iter;
+ size_t f = statistics->f;
- double score;
- score = std::inner_product(statistics->p[m_iFeature].begin(), statistics->p[m_iFeature].end(), weight_vector.begin(), 0.0);
+ double score;
+ score = std::inner_product(statistics->p[m_iFeature].begin(), statistics->p[m_iFeature].end(), weight_vector.begin(), 0.0);
- total -= (FloorScore(TransformScore(score))/TransformScore(2))*f;
- n += f;
- }
- return total/n;
+ total -= (FloorScore(TransformScore(score))/TransformScore(2))*f;
+ n += f;
+ }
+ return total/n;
}
#endif
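The CrossEntropy functor above scores a candidate weight vector as the frequency-weighted cross-entropy, in bits, of the linearly interpolated model over the sampled phrase pairs; the division by TransformScore(2) converts the log score to log2. Without the Moses types, and assuming TransformScore is a natural log while ignoring FloorScore's clamping, the objective reduces to this sketch:

#include <cmath>
#include <cstddef>
#include <numeric>
#include <vector>

struct PairStat {
  std::vector<double> p; // probability of this pair under each component model
  double f;              // frequency of the pair in the sample
};

// Sketch of the objective handed to the optimizer: lower is better; the Moses
// version additionally floors log(score) to avoid -inf on zero probabilities.
double crossEntropySketch(const std::vector<PairStat>& stats,
                          const std::vector<double>& w)
{
  double total = 0.0, n = 0.0;
  for (std::size_t k = 0; k < stats.size(); ++k) {
    double score = std::inner_product(stats[k].p.begin(), stats[k].p.end(),
                                      w.begin(), 0.0);
    total -= stats[k].f * (std::log(score) / std::log(2.0)); // log2(score)
    n += stats[k].f;
  }
  return total / n;
}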
diff --git a/moses/TranslationModel/PhraseDictionaryMultiModel.h b/moses/TranslationModel/PhraseDictionaryMultiModel.h
index 467333b0a..5feb4f373 100644
--- a/moses/TranslationModel/PhraseDictionaryMultiModel.h
+++ b/moses/TranslationModel/PhraseDictionaryMultiModel.h
@@ -36,15 +36,17 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
namespace Moses
{
- struct multiModelStatistics {
- TargetPhrase *targetPhrase;
- std::vector<std::vector<float> > p;
- ~multiModelStatistics() {delete targetPhrase;};
+struct multiModelStatistics {
+ TargetPhrase *targetPhrase;
+ std::vector<std::vector<float> > p;
+ ~multiModelStatistics() {
+ delete targetPhrase;
};
+};
- struct multiModelStatisticsOptimization: multiModelStatistics {
- size_t f;
- };
+struct multiModelStatisticsOptimization: multiModelStatistics {
+ size_t f;
+};
class OptimizationObjective;
@@ -53,7 +55,7 @@ class OptimizationObjective;
class PhraseDictionaryMultiModel: public PhraseDictionary
{
#ifdef WITH_DLIB
-friend class CrossEntropy;
+ friend class CrossEntropy;
#endif
public:
@@ -100,34 +102,33 @@ protected:
};
#ifdef WITH_DLIB
-class OptimizationObjective
+class OptimizationObjective
{
public:
- virtual double operator() ( const dlib::matrix<double,0,1>& arg) const = 0;
+ virtual double operator() ( const dlib::matrix<double,0,1>& arg) const = 0;
};
class CrossEntropy: public OptimizationObjective
{
public:
- CrossEntropy (
- std::vector<multiModelStatisticsOptimization*> &optimizerStats,
- PhraseDictionaryMultiModel * model,
- size_t iFeature
- )
- {
- m_optimizerStats = optimizerStats;
- m_model = model;
- m_iFeature = iFeature;
- }
+ CrossEntropy (
+ std::vector<multiModelStatisticsOptimization*> &optimizerStats,
+ PhraseDictionaryMultiModel * model,
+ size_t iFeature
+ ) {
+ m_optimizerStats = optimizerStats;
+ m_model = model;
+ m_iFeature = iFeature;
+ }
- double operator() ( const dlib::matrix<double,0,1>& arg) const;
+ double operator() ( const dlib::matrix<double,0,1>& arg) const;
protected:
- std::vector<multiModelStatisticsOptimization*> m_optimizerStats;
- PhraseDictionaryMultiModel * m_model;
- size_t m_iFeature;
+ std::vector<multiModelStatisticsOptimization*> m_optimizerStats;
+ PhraseDictionaryMultiModel * m_model;
+ size_t m_iFeature;
};
#endif
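For context, Optimize above drives dlib's derivative-free BOBYQA solver over that objective. The call pattern, reduced to a standalone toy with a simple quadratic in place of the cross-entropy (this assumes dlib is on the include path; the solver arguments mirror the ones used in the diff):

#include <dlib/optimization.h>
#include <iostream>

// Toy objective: minimize sum_i (x_i - 2)^2; stands in for CrossEntropy.
struct Quadratic {
  double operator()(const dlib::matrix<double, 0, 1>& x) const {
    double s = 0.0;
    for (long i = 0; i < x.nr(); ++i) s += (x(i) - 2.0) * (x(i) - 2.0);
    return s;
  }
};

int main()
{
  const long n = 3;
  dlib::matrix<double, 0, 1> x;
  x.set_size(n);
  x = 1.0; // starting point, as in Optimize above
  try {
    dlib::find_min_bobyqa(Quadratic(), x,
                          2 * n + 1,                                 // interpolation points
                          dlib::uniform_matrix<double>(n, 1, 1e-9),  // lower bound constraint
                          dlib::uniform_matrix<double>(n, 1, 1e100), // upper bound constraint
                          1.0,                                       // initial trust region radius
                          1e-6,                                      // stopping trust region radius
                          1000);                                     // max objective evaluations
  } catch (dlib::bobyqa_failure& e) {
    std::cerr << e.what() << std::endl;
  }
  std::cout << "argmin = " << dlib::trans(x);
  return 0;
}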
diff --git a/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp b/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp
index 4c61fba91..298e23a9b 100644
--- a/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp
+++ b/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp
@@ -61,59 +61,56 @@ namespace Moses
{
PhraseDictionaryMultiModelCounts::PhraseDictionaryMultiModelCounts(const std::string &line)
-:PhraseDictionaryMultiModel("PhraseDictionaryMultiModelCounts", line)
+ :PhraseDictionaryMultiModel("PhraseDictionaryMultiModelCounts", line)
{
- m_mode = "instance_weighting"; //TODO: set this in config; use m_mode to switch between interpolation and instance weighting
- m_combineFunction = InstanceWeighting;
- //m_mode = "interpolate";
- //m_combineFunction = LinearInterpolationFromCounts;
-
- for (size_t i = 0; i < m_args.size(); ++i) {
- const vector<string> &args = m_args[i];
- if (args[0] == "mode") {
- m_mode = args[1];
- if (m_mode == "instance_weighting")
- m_combineFunction = InstanceWeighting;
- else if (m_mode == "interpolate") {
- m_combineFunction = LinearInterpolationFromCounts;
- }
- else {
- ostringstream msg;
- msg << "combination mode unknown: " << m_mode;
- throw runtime_error(msg.str());
- }
-
- }
- else if (args[0] == "lex-e2f") {
- m_lexE2FStr = Tokenize(args[1], ",");
- CHECK(m_lexE2FStr.size() == m_pdStr.size());
- }
- else if (args[0] == "lex-f2e") {
- m_lexF2EStr = Tokenize(args[1], ",");
- CHECK(m_lexF2EStr.size() == m_pdStr.size());
+ m_mode = "instance_weighting"; //TODO: set this in config; use m_mode to switch between interpolation and instance weighting
+ m_combineFunction = InstanceWeighting;
+ //m_mode = "interpolate";
+ //m_combineFunction = LinearInterpolationFromCounts;
+
+ for (size_t i = 0; i < m_args.size(); ++i) {
+ const vector<string> &args = m_args[i];
+ if (args[0] == "mode") {
+ m_mode = args[1];
+ if (m_mode == "instance_weighting")
+ m_combineFunction = InstanceWeighting;
+ else if (m_mode == "interpolate") {
+ m_combineFunction = LinearInterpolationFromCounts;
+ } else {
+ ostringstream msg;
+ msg << "combination mode unknown: " << m_mode;
+ throw runtime_error(msg.str());
}
- else if (args[0] == "target-table") {
- m_targetTable = Tokenize(args[1], ",");
- CHECK(m_targetTable.size() == m_pdStr.size());
- }
+ } else if (args[0] == "lex-e2f") {
+ m_lexE2FStr = Tokenize(args[1], ",");
+ CHECK(m_lexE2FStr.size() == m_pdStr.size());
+ } else if (args[0] == "lex-f2e") {
+ m_lexF2EStr = Tokenize(args[1], ",");
+ CHECK(m_lexF2EStr.size() == m_pdStr.size());
+ }
+ else if (args[0] == "target-table") {
+ m_targetTable = Tokenize(args[1], ",");
+ CHECK(m_targetTable.size() == m_pdStr.size());
+ }
- } // for
+
+ } // for
}
PhraseDictionaryMultiModelCounts::~PhraseDictionaryMultiModelCounts()
{
- RemoveAllInColl(m_lexTable_e2f);
- RemoveAllInColl(m_lexTable_f2e);
+ RemoveAllInColl(m_lexTable_e2f);
+ RemoveAllInColl(m_lexTable_f2e);
}
bool PhraseDictionaryMultiModelCounts::InitDictionary()
{
- for(size_t i = 0; i < m_numModels; ++i){
+ for(size_t i = 0; i < m_numModels; ++i) {
// phrase table
const string &ptName = m_pdStr[i];
@@ -189,8 +186,8 @@ bool PhraseDictionaryMultiModelCounts::InitDictionary()
pdta_inverse->SetNumScoreComponentMultiModel(m_numScoreComponent);
pdta_inverse->Load(input, output, target_table, m_weight, m_componentTableLimit, languageModels, m_weightWP);
m_inverse_pd.push_back(pdta_inverse);
- } else if (implementation == Compact) {
-#ifndef WIN32
+ } else if (implementation == Compact) {
+ #ifndef WIN32
PhraseDictionaryCompact* pdc = new PhraseDictionaryCompact(m_numScoreComponent, implementation, m_feature_load);
pdc->SetNumScoreComponentMultiModel(m_numScoreComponent); //for compact models, we need to pass number of log-linear components to correctly resize the score vector
pdc->Load( input, output, main_table, m_weight, componentTableLimit, languageModels, m_weightWP);
@@ -200,9 +197,9 @@ bool PhraseDictionaryMultiModelCounts::InitDictionary()
pdc_inverse->SetNumScoreComponentMultiModel(m_numScoreComponent);
pdc_inverse->Load( input, output, target_table, m_weight, componentTableLimit, languageModels, m_weightWP);
m_inverse_pd.push_back(pdc_inverse);
-#else
- UTIL_THROW(util::Exception, "Compact phrase table not supported in windows");
-#endif
+ #else
+ UTIL_THROW(util::Exception, "Compact phrase table not supported in windows");
+ #endif
}
else {
UTIL_THROW(util::Exception,"PhraseDictionaryMultiModel does not support phrase table type " << implementation);
@@ -218,7 +215,7 @@ bool PhraseDictionaryMultiModelCounts::InitDictionary()
}
-*/
+ */
return true;
}
@@ -250,7 +247,7 @@ const TargetPhraseCollection *PhraseDictionaryMultiModelCounts::GetTargetPhraseC
void PhraseDictionaryMultiModelCounts::CollectSufficientStatistics(const Phrase& src, vector<float> &fs, map<string,multiModelCountsStatistics*>* allStats) const
//fill fs and allStats with statistics from models
{
- for(size_t i = 0; i < m_numModels; ++i){
+ for(size_t i = 0; i < m_numModels; ++i) {
const PhraseDictionary &pd = *m_pd[i];
TargetPhraseCollection *ret_raw = (TargetPhraseCollection*) pd.GetTargetPhraseCollection( src);
@@ -298,9 +295,9 @@ void PhraseDictionaryMultiModelCounts::CollectSufficientStatistics(const Phrase&
multiModelCountsStatistics * statistics = iter->second;
for (size_t i = 0; i < m_numModels; ++i) {
- if (!statistics->ft[i]) {
- statistics->ft[i] = GetTargetCount(static_cast<const Phrase&>(*statistics->targetPhrase), i);
- }
+ if (!statistics->ft[i]) {
+ statistics->ft[i] = GetTargetCount(static_cast<const Phrase&>(*statistics->targetPhrase), i);
+ }
}
}
}
@@ -313,28 +310,27 @@ TargetPhraseCollection* PhraseDictionaryMultiModelCounts::CreateTargetPhraseColl
multiModelCountsStatistics * statistics = iter->second;
if (statistics->targetPhrase->GetAlignTerm().GetSize() == 0) {
- UTIL_THROW(util::Exception, " alignment information empty\ncount-tables need to include alignment information for computation of lexical weights.\nUse --phrase-word-alignment during training; for on-disk tables, also set -alignment-info when creating on-disk tables.");
+ UTIL_THROW(util::Exception, " alignment information empty\ncount-tables need to include alignment information for computation of lexical weights.\nUse --phrase-word-alignment during training; for on-disk tables, also set -alignment-info when creating on-disk tables.");
}
try {
- pair<vector< set<size_t> >, vector< set<size_t> > > alignment = GetAlignmentsForLexWeights(src, static_cast<const Phrase&>(*statistics->targetPhrase), statistics->targetPhrase->GetAlignTerm());
- vector< set<size_t> > alignedToT = alignment.first;
- vector< set<size_t> > alignedToS = alignment.second;
- double lexst = ComputeWeightedLexicalTranslation(static_cast<const Phrase&>(*statistics->targetPhrase), src, alignedToS, m_lexTable_e2f, multimodelweights[1], m_output, m_input );
- double lexts = ComputeWeightedLexicalTranslation(src, static_cast<const Phrase&>(*statistics->targetPhrase), alignedToT, m_lexTable_f2e, multimodelweights[3], m_input, m_output );
-
- Scores scoreVector(5);
- scoreVector[0] = FloorScore(TransformScore(m_combineFunction(statistics->fst, statistics->ft, multimodelweights[0])));
- scoreVector[1] = FloorScore(TransformScore(lexst));
- scoreVector[2] = FloorScore(TransformScore(m_combineFunction(statistics->fst, fs, multimodelweights[2])));
- scoreVector[3] = FloorScore(TransformScore(lexts));
- scoreVector[4] = FloorScore(TransformScore(2.718));
-
- statistics->targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
- statistics->targetPhrase->Evaluate(src);
- }
- catch (AlignmentException& e) {
- continue;
+ pair<vector< set<size_t> >, vector< set<size_t> > > alignment = GetAlignmentsForLexWeights(src, static_cast<const Phrase&>(*statistics->targetPhrase), statistics->targetPhrase->GetAlignTerm());
+ vector< set<size_t> > alignedToT = alignment.first;
+ vector< set<size_t> > alignedToS = alignment.second;
+ double lexst = ComputeWeightedLexicalTranslation(static_cast<const Phrase&>(*statistics->targetPhrase), src, alignedToS, m_lexTable_e2f, multimodelweights[1], m_output, m_input );
+ double lexts = ComputeWeightedLexicalTranslation(src, static_cast<const Phrase&>(*statistics->targetPhrase), alignedToT, m_lexTable_f2e, multimodelweights[3], m_input, m_output );
+
+ Scores scoreVector(5);
+ scoreVector[0] = FloorScore(TransformScore(m_combineFunction(statistics->fst, statistics->ft, multimodelweights[0])));
+ scoreVector[1] = FloorScore(TransformScore(lexst));
+ scoreVector[2] = FloorScore(TransformScore(m_combineFunction(statistics->fst, fs, multimodelweights[2])));
+ scoreVector[3] = FloorScore(TransformScore(lexts));
+ scoreVector[4] = FloorScore(TransformScore(2.718));
+
+ statistics->targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
+ statistics->targetPhrase->Evaluate(src);
+ } catch (AlignmentException& e) {
+ continue;
}
ret->Add(new TargetPhrase(*statistics->targetPhrase));
@@ -346,47 +342,50 @@ TargetPhraseCollection* PhraseDictionaryMultiModelCounts::CreateTargetPhraseColl
}
-float PhraseDictionaryMultiModelCounts::GetTargetCount(const Phrase &target, size_t modelIndex) const {
+float PhraseDictionaryMultiModelCounts::GetTargetCount(const Phrase &target, size_t modelIndex) const
+{
- const PhraseDictionary &pd = *m_inverse_pd[modelIndex];
- TargetPhraseCollection *ret_raw = (TargetPhraseCollection*) pd.GetTargetPhraseCollection(target);
+ const PhraseDictionary &pd = *m_inverse_pd[modelIndex];
+ TargetPhraseCollection *ret_raw = (TargetPhraseCollection*) pd.GetTargetPhraseCollection(target);
- // in inverse mode, we want the first score of the first phrase pair (note: if we were to work with truly symmetric models, it would be the third score)
- if (ret_raw != NULL) {
- TargetPhrase * targetPhrase = *(ret_raw->begin());
- return UntransformScore(targetPhrase->GetScoreBreakdown().GetScoresForProducer(&pd)[0]);
- }
+ // in inverse mode, we want the first score of the first phrase pair (note: if we were to work with truly symmetric models, it would be the third score)
+ if (ret_raw != NULL) {
+ TargetPhrase * targetPhrase = *(ret_raw->begin());
+ return UntransformScore(targetPhrase->GetScoreBreakdown().GetScoresForProducer(&pd)[0]);
+ }
- // target phrase unknown
- else return 0;
+ // target phrase unknown
+ else return 0;
}
-pair<PhraseDictionaryMultiModelCounts::AlignVector,PhraseDictionaryMultiModelCounts::AlignVector> PhraseDictionaryMultiModelCounts::GetAlignmentsForLexWeights(const Phrase &phraseS, const Phrase &phraseT, const AlignmentInfo &alignment) const {
+pair<PhraseDictionaryMultiModelCounts::AlignVector,PhraseDictionaryMultiModelCounts::AlignVector> PhraseDictionaryMultiModelCounts::GetAlignmentsForLexWeights(const Phrase &phraseS, const Phrase &phraseT, const AlignmentInfo &alignment) const
+{
- size_t tsize = phraseT.GetSize();
- size_t ssize = phraseS.GetSize();
- AlignVector alignedToT (tsize);
- AlignVector alignedToS (ssize);
- AlignmentInfo::const_iterator iter;
+ size_t tsize = phraseT.GetSize();
+ size_t ssize = phraseS.GetSize();
+ AlignVector alignedToT (tsize);
+ AlignVector alignedToS (ssize);
+ AlignmentInfo::const_iterator iter;
- for (iter = alignment.begin(); iter != alignment.end(); ++iter) {
+ for (iter = alignment.begin(); iter != alignment.end(); ++iter) {
const pair<size_t,size_t> &alignPair = *iter;
- size_t s = alignPair.first;
- size_t t = alignPair.second;
- if (s >= ssize || t >= tsize) {
- cerr << "Error: inconsistent alignment for phrase pair: " << phraseS << " - " << phraseT << endl;
- cerr << "phrase pair will be discarded" << endl;
- throw AlignmentException();
- }
- alignedToT[t].insert( s );
- alignedToS[s].insert( t );
+ size_t s = alignPair.first;
+ size_t t = alignPair.second;
+ if (s >= ssize || t >= tsize) {
+ cerr << "Error: inconsistent alignment for phrase pair: " << phraseS << " - " << phraseT << endl;
+ cerr << "phrase pair will be discarded" << endl;
+ throw AlignmentException();
+ }
+ alignedToT[t].insert( s );
+ alignedToS[s].insert( t );
}
return make_pair(alignedToT,alignedToS);
}
-double PhraseDictionaryMultiModelCounts::ComputeWeightedLexicalTranslation( const Phrase &phraseS, const Phrase &phraseT, AlignVector &alignment, const vector<lexicalTable*> &tables, vector<float> &multimodelweights, const vector<FactorType> &input_factors, const vector<FactorType> &output_factors ) const {
+double PhraseDictionaryMultiModelCounts::ComputeWeightedLexicalTranslation( const Phrase &phraseS, const Phrase &phraseT, AlignVector &alignment, const vector<lexicalTable*> &tables, vector<float> &multimodelweights, const vector<FactorType> &input_factors, const vector<FactorType> &output_factors ) const
+{
// lexical translation probability
double lexScore = 1.0;
@@ -414,7 +413,8 @@ double PhraseDictionaryMultiModelCounts::ComputeWeightedLexicalTranslation( cons
}
-lexicalCache PhraseDictionaryMultiModelCounts::CacheLexicalStatistics( const Phrase &phraseS, const Phrase &phraseT, AlignVector &alignment, const vector<lexicalTable*> &tables, const vector<FactorType> &input_factors, const vector<FactorType> &output_factors ) {
+lexicalCache PhraseDictionaryMultiModelCounts::CacheLexicalStatistics( const Phrase &phraseS, const Phrase &phraseT, AlignVector &alignment, const vector<lexicalTable*> &tables, const vector<FactorType> &input_factors, const vector<FactorType> &output_factors )
+{
//do all the necessary lexical table lookups and get counts, but don't apply weights yet
string null = "NULL";
@@ -455,60 +455,65 @@ lexicalCache PhraseDictionaryMultiModelCounts::CacheLexicalStatistics( const Phr
}
-double PhraseDictionaryMultiModelCounts::ComputeWeightedLexicalTranslationFromCache( lexicalCache &cache, vector<float> &weights ) const {
+double PhraseDictionaryMultiModelCounts::ComputeWeightedLexicalTranslationFromCache( lexicalCache &cache, vector<float> &weights ) const
+{
// lexical translation probability
double lexScore = 1.0;
for (lexicalCache::const_iterator iter = cache.begin(); iter != cache.end(); ++iter) {
- vector<lexicalPair> t_vector = *iter;
- double thisWordScore = 0;
- for ( vector<lexicalPair>::const_iterator iter2 = t_vector.begin(); iter2 != t_vector.end(); ++iter2) {
- vector<float> joint_count = iter2->first;
- vector<float> marginal = iter2->second;
- thisWordScore += m_combineFunction(joint_count, marginal, weights);
- }
- lexScore *= thisWordScore / t_vector.size();
+ vector<lexicalPair> t_vector = *iter;
+ double thisWordScore = 0;
+ for ( vector<lexicalPair>::const_iterator iter2 = t_vector.begin(); iter2 != t_vector.end(); ++iter2) {
+ vector<float> joint_count = iter2->first;
+ vector<float> marginal = iter2->second;
+ thisWordScore += m_combineFunction(joint_count, marginal, weights);
+ }
+ lexScore *= thisWordScore / t_vector.size();
}
return lexScore;
}
// get lexical probability for single word alignment pair
-double PhraseDictionaryMultiModelCounts::GetLexicalProbability( string &wordS, string &wordT, const vector<lexicalTable*> &tables, vector<float> &multimodelweights ) const {
- vector<float> joint_count (m_numModels);
- vector<float> marginals (m_numModels);
+double PhraseDictionaryMultiModelCounts::GetLexicalProbability( string &wordS, string &wordT, const vector<lexicalTable*> &tables, vector<float> &multimodelweights ) const
+{
+ vector<float> joint_count (m_numModels);
+ vector<float> marginals (m_numModels);
- FillLexicalCountsJoint(wordS, wordT, joint_count, tables);
- FillLexicalCountsMarginal(wordS, marginals, tables);
+ FillLexicalCountsJoint(wordS, wordT, joint_count, tables);
+ FillLexicalCountsMarginal(wordS, marginals, tables);
- double lexProb = m_combineFunction(joint_count, marginals, multimodelweights);
+ double lexProb = m_combineFunction(joint_count, marginals, multimodelweights);
return lexProb;
}
-void PhraseDictionaryMultiModelCounts::FillLexicalCountsJoint(string &wordS, string &wordT, vector<float> &count, const vector<lexicalTable*> &tables) const {
- for (size_t i=0;i < m_numModels;i++) {
- lexicalMapJoint::iterator joint_s = tables[i]->joint.find( wordS );
- if (joint_s == tables[i]->joint.end()) count[i] = 0.0;
- else {
- lexicalMap::iterator joint_t = joint_s->second.find( wordT );
- if (joint_t == joint_s->second.end()) count[i] = 0.0;
- else count[i] = joint_t->second;
- }
+void PhraseDictionaryMultiModelCounts::FillLexicalCountsJoint(string &wordS, string &wordT, vector<float> &count, const vector<lexicalTable*> &tables) const
+{
+ for (size_t i=0; i < m_numModels; i++) {
+ lexicalMapJoint::iterator joint_s = tables[i]->joint.find( wordS );
+ if (joint_s == tables[i]->joint.end()) count[i] = 0.0;
+ else {
+ lexicalMap::iterator joint_t = joint_s->second.find( wordT );
+ if (joint_t == joint_s->second.end()) count[i] = 0.0;
+ else count[i] = joint_t->second;
}
+ }
}
-void PhraseDictionaryMultiModelCounts::FillLexicalCountsMarginal(string &wordS, vector<float> &count, const vector<lexicalTable*> &tables) const {
- for (size_t i=0;i < m_numModels;i++) {
- lexicalMap::iterator marginal_s = tables[i]->marginal.find( wordS );
- if (marginal_s == tables[i]->marginal.end()) count[i] = 0.0;
- else count[i] = marginal_s->second;
- }
+void PhraseDictionaryMultiModelCounts::FillLexicalCountsMarginal(string &wordS, vector<float> &count, const vector<lexicalTable*> &tables) const
+{
+ for (size_t i=0; i < m_numModels; i++) {
+ lexicalMap::iterator marginal_s = tables[i]->marginal.find( wordS );
+ if (marginal_s == tables[i]->marginal.end()) count[i] = 0.0;
+ else count[i] = marginal_s->second;
+ }
}
-void PhraseDictionaryMultiModelCounts::LoadLexicalTable( string &fileName, lexicalTable* ltable) {
+void PhraseDictionaryMultiModelCounts::LoadLexicalTable( string &fileName, lexicalTable* ltable)
+{
cerr << "Loading lexical translation table from " << fileName;
ifstream inFile;
@@ -549,165 +554,161 @@ void PhraseDictionaryMultiModelCounts::LoadLexicalTable( string &fileName, lexic
#ifdef WITH_DLIB
-vector<float> PhraseDictionaryMultiModelCounts::MinimizePerplexity(vector<pair<string, string> > &phrase_pair_vector) {
+vector<float> PhraseDictionaryMultiModelCounts::MinimizePerplexity(vector<pair<string, string> > &phrase_pair_vector)
+{
- const StaticData &staticData = StaticData::Instance();
- const string& factorDelimiter = staticData.GetFactorDelimiter();
+ const StaticData &staticData = StaticData::Instance();
+ const string& factorDelimiter = staticData.GetFactorDelimiter();
- map<pair<string, string>, size_t> phrase_pair_map;
+ map<pair<string, string>, size_t> phrase_pair_map;
- for ( vector<pair<string, string> >::const_iterator iter = phrase_pair_vector.begin(); iter != phrase_pair_vector.end(); ++iter ) {
- phrase_pair_map[*iter] += 1;
- }
+ for ( vector<pair<string, string> >::const_iterator iter = phrase_pair_vector.begin(); iter != phrase_pair_vector.end(); ++iter ) {
+ phrase_pair_map[*iter] += 1;
+ }
- vector<multiModelCountsStatisticsOptimization*> optimizerStats;
+ vector<multiModelCountsStatisticsOptimization*> optimizerStats;
- for ( map<pair<string, string>, size_t>::iterator iter = phrase_pair_map.begin(); iter != phrase_pair_map.end(); ++iter ) {
+ for ( map<pair<string, string>, size_t>::iterator iter = phrase_pair_map.begin(); iter != phrase_pair_map.end(); ++iter ) {
- pair<string, string> phrase_pair = iter->first;
- string source_string = phrase_pair.first;
- string target_string = phrase_pair.second;
+ pair<string, string> phrase_pair = iter->first;
+ string source_string = phrase_pair.first;
+ string target_string = phrase_pair.second;
- vector<float> fs(m_numModels);
- map<string,multiModelCountsStatistics*>* allStats = new(map<string,multiModelCountsStatistics*>);
+ vector<float> fs(m_numModels);
+ map<string,multiModelCountsStatistics*>* allStats = new(map<string,multiModelCountsStatistics*>);
- Phrase sourcePhrase(0);
- sourcePhrase.CreateFromString(Input, m_input, source_string, factorDelimiter, NULL);
+ Phrase sourcePhrase(0);
+ sourcePhrase.CreateFromString(Input, m_input, source_string, factorDelimiter, NULL);
- CollectSufficientStatistics(sourcePhrase, fs, allStats); //optimization potential: only call this once per source phrase
+ CollectSufficientStatistics(sourcePhrase, fs, allStats); //optimization potential: only call this once per source phrase
- //phrase pair not found; leave cache empty
- if (allStats->find(target_string) == allStats->end()) {
- RemoveAllInMap(*allStats);
- delete allStats;
- continue;
- }
+ //phrase pair not found; leave cache empty
+ if (allStats->find(target_string) == allStats->end()) {
+ RemoveAllInMap(*allStats);
+ delete allStats;
+ continue;
+ }
- multiModelCountsStatisticsOptimization * targetStatistics = new multiModelCountsStatisticsOptimization();
- targetStatistics->targetPhrase = new TargetPhrase(*(*allStats)[target_string]->targetPhrase);
- targetStatistics->fs = fs;
- targetStatistics->fst = (*allStats)[target_string]->fst;
- targetStatistics->ft = (*allStats)[target_string]->ft;
- targetStatistics->f = iter->second;
+ multiModelCountsStatisticsOptimization * targetStatistics = new multiModelCountsStatisticsOptimization();
+ targetStatistics->targetPhrase = new TargetPhrase(*(*allStats)[target_string]->targetPhrase);
+ targetStatistics->fs = fs;
+ targetStatistics->fst = (*allStats)[target_string]->fst;
+ targetStatistics->ft = (*allStats)[target_string]->ft;
+ targetStatistics->f = iter->second;
- try {
- pair<vector< set<size_t> >, vector< set<size_t> > > alignment = GetAlignmentsForLexWeights(sourcePhrase, static_cast<const Phrase&>(*targetStatistics->targetPhrase), targetStatistics->targetPhrase->GetAlignTerm());
- targetStatistics->lexCachee2f = CacheLexicalStatistics(static_cast<const Phrase&>(*targetStatistics->targetPhrase), sourcePhrase, alignment.second, m_lexTable_e2f, m_output, m_input );
- targetStatistics->lexCachef2e = CacheLexicalStatistics(sourcePhrase, static_cast<const Phrase&>(*targetStatistics->targetPhrase), alignment.first, m_lexTable_f2e, m_input, m_output );
+ try {
+ pair<vector< set<size_t> >, vector< set<size_t> > > alignment = GetAlignmentsForLexWeights(sourcePhrase, static_cast<const Phrase&>(*targetStatistics->targetPhrase), targetStatistics->targetPhrase->GetAlignTerm());
+ targetStatistics->lexCachee2f = CacheLexicalStatistics(static_cast<const Phrase&>(*targetStatistics->targetPhrase), sourcePhrase, alignment.second, m_lexTable_e2f, m_output, m_input );
+ targetStatistics->lexCachef2e = CacheLexicalStatistics(sourcePhrase, static_cast<const Phrase&>(*targetStatistics->targetPhrase), alignment.first, m_lexTable_f2e, m_input, m_output );
- optimizerStats.push_back(targetStatistics);
- }
- catch (AlignmentException& e) {}
+ optimizerStats.push_back(targetStatistics);
+ } catch (AlignmentException& e) {}
- RemoveAllInMap(*allStats);
- delete allStats;
- }
+ RemoveAllInMap(*allStats);
+ delete allStats;
+ }
- Sentence sentence;
- CleanUpAfterSentenceProcessing(sentence); // free memory used by compact phrase tables
+ Sentence sentence;
+ CleanUpAfterSentenceProcessing(sentence); // free memory used by compact phrase tables
- vector<float> ret (m_numModels*4);
- for (size_t iFeature=0; iFeature < 4; iFeature++) {
+ vector<float> ret (m_numModels*4);
+ for (size_t iFeature=0; iFeature < 4; iFeature++) {
- CrossEntropyCounts * ObjectiveFunction = new CrossEntropyCounts(optimizerStats, this, iFeature);
+ CrossEntropyCounts * ObjectiveFunction = new CrossEntropyCounts(optimizerStats, this, iFeature);
- vector<float> weight_vector = Optimize(ObjectiveFunction, m_numModels);
+ vector<float> weight_vector = Optimize(ObjectiveFunction, m_numModels);
- if (m_mode == "interpolate") {
- weight_vector = normalizeWeights(weight_vector);
- }
- else if (m_mode == "instance_weighting") {
- float first_value = weight_vector[0];
- for (size_t i=0; i < m_numModels; i++) {
- weight_vector[i] = weight_vector[i]/first_value;
- }
- }
- cerr << "Weight vector for feature " << iFeature << ": ";
- for (size_t i=0; i < m_numModels; i++) {
- ret[(iFeature*m_numModels)+i] = weight_vector[i];
- cerr << weight_vector[i] << " ";
- }
- cerr << endl;
- delete ObjectiveFunction;
+ if (m_mode == "interpolate") {
+ weight_vector = normalizeWeights(weight_vector);
+ } else if (m_mode == "instance_weighting") {
+ float first_value = weight_vector[0];
+ for (size_t i=0; i < m_numModels; i++) {
+ weight_vector[i] = weight_vector[i]/first_value;
+ }
+ }
+ cerr << "Weight vector for feature " << iFeature << ": ";
+ for (size_t i=0; i < m_numModels; i++) {
+ ret[(iFeature*m_numModels)+i] = weight_vector[i];
+ cerr << weight_vector[i] << " ";
}
+ cerr << endl;
+ delete ObjectiveFunction;
+ }
- RemoveAllInColl(optimizerStats);
- return ret;
+ RemoveAllInColl(optimizerStats);
+ return ret;
}
double CrossEntropyCounts::operator() ( const dlib::matrix<double,0,1>& arg) const
{
- double total = 0.0;
- double n = 0.0;
- std::vector<float> weight_vector (m_model->m_numModels);
+ double total = 0.0;
+ double n = 0.0;
+ std::vector<float> weight_vector (m_model->m_numModels);
- for (int i=0; i < arg.nr(); i++) {
- weight_vector[i] = arg(i);
- }
- if (m_model->m_mode == "interpolate") {
- weight_vector = m_model->normalizeWeights(weight_vector);
- }
-
- for ( std::vector<multiModelCountsStatisticsOptimization*>::const_iterator iter = m_optimizerStats.begin(); iter != m_optimizerStats.end(); ++iter ) {
- multiModelCountsStatisticsOptimization* statistics = *iter;
- size_t f = statistics->f;
+ for (int i=0; i < arg.nr(); i++) {
+ weight_vector[i] = arg(i);
+ }
+ if (m_model->m_mode == "interpolate") {
+ weight_vector = m_model->normalizeWeights(weight_vector);
+ }
- double score;
- if (m_iFeature == 0) {
- score = m_model->m_combineFunction(statistics->fst, statistics->ft, weight_vector);
- }
- else if (m_iFeature == 1) {
- score = m_model->ComputeWeightedLexicalTranslationFromCache(statistics->lexCachee2f, weight_vector);
- }
- else if (m_iFeature == 2) {
- score = m_model->m_combineFunction(statistics->fst, statistics->fs, weight_vector);
- }
- else if (m_iFeature == 3) {
- score = m_model->ComputeWeightedLexicalTranslationFromCache(statistics->lexCachef2e, weight_vector);
- }
- else {
- score = 0;
- UTIL_THROW(util::Exception, "Trying to optimize feature that I don't know. Aborting");
- }
- total -= (FloorScore(TransformScore(score))/TransformScore(2))*f;
- n += f;
+ for ( std::vector<multiModelCountsStatisticsOptimization*>::const_iterator iter = m_optimizerStats.begin(); iter != m_optimizerStats.end(); ++iter ) {
+ multiModelCountsStatisticsOptimization* statistics = *iter;
+ size_t f = statistics->f;
+
+ double score;
+ if (m_iFeature == 0) {
+ score = m_model->m_combineFunction(statistics->fst, statistics->ft, weight_vector);
+ } else if (m_iFeature == 1) {
+ score = m_model->ComputeWeightedLexicalTranslationFromCache(statistics->lexCachee2f, weight_vector);
+ } else if (m_iFeature == 2) {
+ score = m_model->m_combineFunction(statistics->fst, statistics->fs, weight_vector);
+ } else if (m_iFeature == 3) {
+ score = m_model->ComputeWeightedLexicalTranslationFromCache(statistics->lexCachef2e, weight_vector);
+ } else {
+ score = 0;
+ UTIL_THROW(util::Exception, "Trying to optimize feature that I don't know. Aborting");
}
- return total/n;
+ total -= (FloorScore(TransformScore(score))/TransformScore(2))*f;
+ n += f;
+ }
+ return total/n;
}
#endif
// calculate weighted probability based on instance weighting of joint counts and marginal counts
-double InstanceWeighting(vector<float> &joint_counts, vector<float> &marginals, vector<float> &multimodelweights) {
+double InstanceWeighting(vector<float> &joint_counts, vector<float> &marginals, vector<float> &multimodelweights)
+{
- double joint_counts_weighted = inner_product(joint_counts.begin(), joint_counts.end(), multimodelweights.begin(), 0.0);
- double marginals_weighted = inner_product(marginals.begin(), marginals.end(), multimodelweights.begin(), 0.0);
+ double joint_counts_weighted = inner_product(joint_counts.begin(), joint_counts.end(), multimodelweights.begin(), 0.0);
+ double marginals_weighted = inner_product(marginals.begin(), marginals.end(), multimodelweights.begin(), 0.0);
- if (marginals_weighted == 0) {
- return 0;
- }
- else {
- return joint_counts_weighted/marginals_weighted;
- }
+ if (marginals_weighted == 0) {
+ return 0;
+ } else {
+ return joint_counts_weighted/marginals_weighted;
+ }
}
// calculate linear interpolation of relative frequency estimates based on joint count and marginal counts
//unused for now; enable in config?
-double LinearInterpolationFromCounts(vector<float> &joint_counts, vector<float> &marginals, vector<float> &multimodelweights) {
+double LinearInterpolationFromCounts(vector<float> &joint_counts, vector<float> &marginals, vector<float> &multimodelweights)
+{
- vector<float> p(marginals.size());
+ vector<float> p(marginals.size());
- for (size_t i=0;i < marginals.size();i++) {
- if (marginals[i] != 0) {
- p[i] = joint_counts[i]/marginals[i];
- }
+ for (size_t i=0; i < marginals.size(); i++) {
+ if (marginals[i] != 0) {
+ p[i] = joint_counts[i]/marginals[i];
}
+ }
- double p_weighted = inner_product(p.begin(), p.end(), multimodelweights.begin(), 0.0);
+ double p_weighted = inner_product(p.begin(), p.end(), multimodelweights.begin(), 0.0);
- return p_weighted;
+ return p_weighted;
}
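
For illustration, the two combination functions above differ only in where the weights enter: InstanceWeighting weights the raw counts and divides once, while LinearInterpolationFromCounts converts each model's counts to a relative frequency first and then interpolates. A minimal standalone sketch with made-up counts (none of these values come from the repository):

// Toy comparison of the two combination strategies shown above.
// All counts and weights are illustrative only.
#include <iostream>
#include <numeric>
#include <vector>
using namespace std;

int main()
{
  vector<float> joint(2), marginal(2), weights(2);
  joint[0] = 8;  marginal[0] = 10;  weights[0] = 0.5;  // model 0: p = 0.8
  joint[1] = 1;  marginal[1] = 100; weights[1] = 0.5;  // model 1: p = 0.01

  // instance weighting: weight the counts, then divide once
  double jw = inner_product(joint.begin(), joint.end(), weights.begin(), 0.0);
  double mw = inner_product(marginal.begin(), marginal.end(), weights.begin(), 0.0);
  cout << (mw == 0 ? 0.0 : jw / mw) << endl;    // 4.5 / 55 ~= 0.082

  // linear interpolation: divide per model, then weight
  vector<float> p(2);
  for (size_t i = 0; i < p.size(); i++)
    if (marginal[i] != 0) p[i] = joint[i] / marginal[i];
  cout << inner_product(p.begin(), p.end(), weights.begin(), 0.0) << endl;  // 0.405

  return 0;
}

The model with the larger marginal dominates under instance weighting, which is exactly why the two estimates diverge on corpora of very different sizes.
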
diff --git a/moses/TranslationModel/PhraseDictionaryMultiModelCounts.h b/moses/TranslationModel/PhraseDictionaryMultiModelCounts.h
index ef89272c3..04be77dd6 100644
--- a/moses/TranslationModel/PhraseDictionaryMultiModelCounts.h
+++ b/moses/TranslationModel/PhraseDictionaryMultiModelCounts.h
@@ -35,32 +35,33 @@ extern std::vector<std::string> tokenize( const char*);
namespace Moses
{
- typedef boost::unordered_map<std::string, double > lexicalMap;
- typedef boost::unordered_map<std::string, lexicalMap > lexicalMapJoint;
- typedef std::pair<std::vector<float>, std::vector<float> > lexicalPair;
- typedef std::vector<std::vector<lexicalPair> > lexicalCache;
+typedef boost::unordered_map<std::string, double > lexicalMap;
+typedef boost::unordered_map<std::string, lexicalMap > lexicalMapJoint;
+typedef std::pair<std::vector<float>, std::vector<float> > lexicalPair;
+typedef std::vector<std::vector<lexicalPair> > lexicalCache;
- struct multiModelCountsStatistics : multiModelStatistics {
- std::vector<float> fst, ft;
- };
+struct multiModelCountsStatistics : multiModelStatistics {
+ std::vector<float> fst, ft;
+};
- struct multiModelCountsStatisticsOptimization: multiModelCountsStatistics {
- std::vector<float> fs;
- lexicalCache lexCachee2f, lexCachef2e;
- size_t f;
- };
+struct multiModelCountsStatisticsOptimization: multiModelCountsStatistics {
+ std::vector<float> fs;
+ lexicalCache lexCachee2f, lexCachef2e;
+ size_t f;
+};
- struct lexicalTable {
- lexicalMapJoint joint;
- lexicalMap marginal;
- };
+struct lexicalTable {
+ lexicalMapJoint joint;
+ lexicalMap marginal;
+};
- double InstanceWeighting(std::vector<float> &joint_counts, std::vector<float> &marginals, std::vector<float> &multimodelweights);
- double LinearInterpolationFromCounts(std::vector<float> &joint_counts, std::vector<float> &marginals, std::vector<float> &multimodelweights);
+double InstanceWeighting(std::vector<float> &joint_counts, std::vector<float> &marginals, std::vector<float> &multimodelweights);
+double LinearInterpolationFromCounts(std::vector<float> &joint_counts, std::vector<float> &marginals, std::vector<float> &multimodelweights);
//thrown if alignment information does not match phrase pair (out-of-bound alignment points)
-class AlignmentException : public std::runtime_error {
+class AlignmentException : public std::runtime_error
+{
public:
AlignmentException() : std::runtime_error("AlignmentException") { }
};
@@ -72,10 +73,10 @@ class PhraseDictionaryMultiModelCounts: public PhraseDictionaryMultiModel
{
#ifdef WITH_DLIB
-friend class CrossEntropyCounts;
+ friend class CrossEntropyCounts;
#endif
-typedef std::vector< std::set<size_t> > AlignVector;
+ typedef std::vector< std::set<size_t> > AlignVector;
public:
@@ -116,23 +117,22 @@ class CrossEntropyCounts: public OptimizationObjective
{
public:
- CrossEntropyCounts (
- std::vector<multiModelCountsStatisticsOptimization*> &optimizerStats,
- PhraseDictionaryMultiModelCounts * model,
- size_t iFeature
- )
- {
- m_optimizerStats = optimizerStats;
- m_model = model;
- m_iFeature = iFeature;
- }
+ CrossEntropyCounts (
+ std::vector<multiModelCountsStatisticsOptimization*> &optimizerStats,
+ PhraseDictionaryMultiModelCounts * model,
+ size_t iFeature
+ ) {
+ m_optimizerStats = optimizerStats;
+ m_model = model;
+ m_iFeature = iFeature;
+ }
- double operator() ( const dlib::matrix<double,0,1>& arg) const;
+ double operator() ( const dlib::matrix<double,0,1>& arg) const;
private:
- std::vector<multiModelCountsStatisticsOptimization*> m_optimizerStats;
- PhraseDictionaryMultiModelCounts * m_model;
- size_t m_iFeature;
+ std::vector<multiModelCountsStatisticsOptimization*> m_optimizerStats;
+ PhraseDictionaryMultiModelCounts * m_model;
+ size_t m_iFeature;
};
#endif
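
The lexicalTable layout declared above (a joint map keyed source then target, plus a marginal map keyed source) is what FillLexicalCountsJoint and FillLexicalCountsMarginal walk per model. A minimal single-model lookup under that layout; std::map stands in for boost::unordered_map and the counts are hypothetical:

// Single-model lexical lookup mirroring the lexicalTable typedefs above.
// std::map stands in for boost::unordered_map; counts are invented.
#include <iostream>
#include <map>
#include <string>

typedef std::map<std::string, double> lexicalMapToy;
typedef std::map<std::string, lexicalMapToy> lexicalMapJointToy;

struct lexicalTableToy {
  lexicalMapJointToy joint;  // joint[s][t] = c(s,t)
  lexicalMapToy marginal;    // marginal[s] = c(s)
};

int main()
{
  lexicalTableToy table;
  table.joint["maison"]["house"] = 70;
  table.joint["maison"]["home"]  = 30;
  table.marginal["maison"] = 100;

  // w(house|maison) = c(maison,house) / c(maison) = 0.7
  std::cout << table.joint["maison"]["house"] / table.marginal["maison"]
            << std::endl;
  return 0;
}
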
diff --git a/moses/TranslationModel/PhraseDictionaryNodeMemory.cpp b/moses/TranslationModel/PhraseDictionaryNodeMemory.cpp
index 465c076d5..389c74394 100644
--- a/moses/TranslationModel/PhraseDictionaryNodeMemory.cpp
+++ b/moses/TranslationModel/PhraseDictionaryNodeMemory.cpp
@@ -31,8 +31,8 @@ namespace Moses
PhraseDictionaryNodeMemory::~PhraseDictionaryNodeMemory()
{
for (TerminalMap::iterator iter = m_sourceTermMap.begin(); iter != m_sourceTermMap.end(); ++iter) {
- const PhraseDictionaryNodeMemory *node = iter->second;
- delete node;
+ const PhraseDictionaryNodeMemory *node = iter->second;
+ delete node;
}
for (NonTerminalMap::iterator iter = m_nonTermMap.begin(); iter != m_nonTermMap.end(); ++iter) {
const PhraseDictionaryNodeMemory *node = iter->second;
@@ -41,7 +41,8 @@ PhraseDictionaryNodeMemory::~PhraseDictionaryNodeMemory()
delete m_targetPhraseCollection;
}
-TargetPhraseCollection &PhraseDictionaryNodeMemory::GetOrCreateTargetPhraseCollection() {
+TargetPhraseCollection &PhraseDictionaryNodeMemory::GetOrCreateTargetPhraseCollection()
+{
if (m_targetPhraseCollection == NULL)
m_targetPhraseCollection = new TargetPhraseCollection();
return *m_targetPhraseCollection;
@@ -138,9 +139,9 @@ void PhraseDictionaryNodeMemory::Clear()
m_sourceTermMap.clear();
m_nonTermMap.clear();
delete m_targetPhraseCollection;
-
+
}
-
+
std::ostream& operator<<(std::ostream &out, const PhraseDictionaryNodeMemory &node)
{
out << node.GetTargetPhraseCollection();
diff --git a/moses/TranslationModel/PhraseDictionaryNodeMemory.h b/moses/TranslationModel/PhraseDictionaryNodeMemory.h
index 672196ba2..136e10c0a 100644
--- a/moses/TranslationModel/PhraseDictionaryNodeMemory.h
+++ b/moses/TranslationModel/PhraseDictionaryNodeMemory.h
@@ -39,8 +39,8 @@ namespace Moses
class PhraseDictionaryMemory;
class PhraseDictionaryFuzzyMatch;
-
- //! @todo why?
+
+//! @todo why?
class NonTerminalMapKeyHasher
{
public:
@@ -152,7 +152,7 @@ public:
}
void Clear();
-
+
TO_STRING();
};
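
GetOrCreateTargetPhraseCollection() in the node class above follows the usual lazy-allocation idiom: allocate the collection on first access, delete it in Clear() and the destructor. A compact sketch of the same pattern in isolation (all "Toy" names are invented for this sketch):

// Lazy get-or-create, as in PhraseDictionaryNodeMemory above (illustrative only).
#include <cstddef>
#include <vector>

struct TargetPhraseCollectionToy {
  std::vector<int> phrases;  // stand-in payload
};

class NodeToy
{
public:
  NodeToy() : m_coll(NULL) {}
  ~NodeToy() { delete m_coll; }            // owner deletes on teardown
  TargetPhraseCollectionToy &GetOrCreate() {
    if (m_coll == NULL)                    // allocate on first use only
      m_coll = new TargetPhraseCollectionToy();
    return *m_coll;
  }
private:
  TargetPhraseCollectionToy *m_coll;
};

int main()
{
  NodeToy node;
  node.GetOrCreate().phrases.push_back(1);  // first call allocates
  node.GetOrCreate().phrases.push_back(2);  // later calls reuse
  return 0;
}
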
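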
diff --git a/moses/TranslationModel/PhraseDictionaryTree.cpp b/moses/TranslationModel/PhraseDictionaryTree.cpp
index 321924dfe..c5eefc290 100644
--- a/moses/TranslationModel/PhraseDictionaryTree.cpp
+++ b/moses/TranslationModel/PhraseDictionaryTree.cpp
@@ -157,7 +157,8 @@ PhraseDictionaryTree::PrefixPtr::operator bool() const
typedef LVoc<std::string> WordVoc;
-class PDTimp {
+class PDTimp
+{
public:
typedef PrefixTreeF<LabelId,OFF_T> PTF;
typedef FilePtr<PTF> CPT;
@@ -481,7 +482,7 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out)
std::vector<OFF_T> vo;
size_t lnc=0;
size_t numElement = NOT_FOUND; // 3=old format, 5=async format which include word alignment info
- size_t missingAlignmentCount = 0;
+ size_t missingAlignmentCount = 0;
while(getline(inFile, line)) {
++lnc;
@@ -553,9 +554,9 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out)
if (!sparseFeatureString.empty()) {
std::vector<std::string> sparseTokens = Tokenize(sparseFeatureString);
if (sparseTokens.size() % 2 != 0) {
- TRACE_ERR("ERROR: incorrectly formatted sparse feature string: " <<
- sparseFeatureString << std::endl);
- abort();
+ TRACE_ERR("ERROR: incorrectly formatted sparse feature string: " <<
+ sparseFeatureString << std::endl);
+ abort();
}
for (size_t i = 0; i < sparseTokens.size(); i+=2) {
fnames.push_back(imp->tv.add(sparseTokens[i]));
@@ -624,7 +625,7 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out)
if ( PrintWordAlignment()) {
TRACE_ERR("Count of lines with missing alignments: " <<
- missingAlignmentCount << "/" << lnc << "\n");
+ missingAlignmentCount << "/" << lnc << "\n");
}
fClose(os);
diff --git a/moses/TranslationModel/PhraseDictionaryTree.h b/moses/TranslationModel/PhraseDictionaryTree.h
index 1b88637c3..6214c8194 100644
--- a/moses/TranslationModel/PhraseDictionaryTree.h
+++ b/moses/TranslationModel/PhraseDictionaryTree.h
@@ -31,8 +31,7 @@ class PDTimp;
typedef PrefixTreeF<LabelId,OFF_T> PTF;
//typedef std::pair<std::vector<std::string const*>,Scores > StringTgtCand;
-struct StringTgtCand
-{
+struct StringTgtCand {
typedef std::vector<std::string const*> Tokens;
Tokens tokens;
Scores scores;
@@ -86,7 +85,7 @@ public:
// get the target candidates for a given phrase
void GetTargetCandidates(const std::vector<std::string>& src,
std::vector<StringTgtCand>& rv) const;
-
+
// get the target candidates for a given phrase
void GetTargetCandidates(const std::vector<std::string>& src,
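
A StringTgtCand, as declared earlier in this header, holds pointers into an external vocabulary rather than owned string copies, plus a parallel score vector. A toy record filled by hand (all data here is illustrative):

// Toy use of a StringTgtCand-like record, mirroring the struct shown above.
#include <iostream>
#include <string>
#include <vector>

typedef std::vector<float> ScoresToy;

struct StringTgtCandToy {
  typedef std::vector<std::string const*> Tokens;
  Tokens tokens;     // pointers into an external vocabulary, not owned copies
  ScoresToy scores;
};

int main()
{
  std::vector<std::string> vocab;
  vocab.push_back("the");
  vocab.push_back("cat");

  StringTgtCandToy cand;
  cand.tokens.push_back(&vocab[0]);
  cand.tokens.push_back(&vocab[1]);
  cand.scores.push_back(0.5f);

  for (size_t i = 0; i < cand.tokens.size(); ++i)
    std::cout << *cand.tokens[i] << " ";
  std::cout << "||| " << cand.scores[0] << std::endl;
  return 0;
}

Storing pointers keeps candidate lists cheap when many rules share the same vocabulary entries; the vocabulary must simply outlive the candidates.
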
diff --git a/moses/TranslationModel/RuleTable/Loader.h b/moses/TranslationModel/RuleTable/Loader.h
index 4d3e03351..48390e37e 100644
--- a/moses/TranslationModel/RuleTable/Loader.h
+++ b/moses/TranslationModel/RuleTable/Loader.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -32,7 +32,7 @@ namespace Moses
*/
class RuleTableLoader
{
- public:
+public:
virtual ~RuleTableLoader() {}
virtual bool Load(const std::vector<FactorType> &input,
@@ -41,7 +41,7 @@ class RuleTableLoader
size_t tableLimit,
RuleTableTrie &) = 0;
- protected:
+protected:
// Provide access to RuleTableTrie's private SortAndPrune function.
void SortAndPrune(RuleTableTrie &ruleTable) {
ruleTable.SortAndPrune();
@@ -50,10 +50,10 @@ class RuleTableLoader
// Provide access to RuleTableTrie's private
// GetOrCreateTargetPhraseCollection function.
TargetPhraseCollection &GetOrCreateTargetPhraseCollection(
- RuleTableTrie &ruleTable
- , const Phrase &source
- , const TargetPhrase &target
- , const Word *sourceLHS) {
+ RuleTableTrie &ruleTable
+ , const Phrase &source
+ , const TargetPhrase &target
+ , const Word *sourceLHS) {
return ruleTable.GetOrCreateTargetPhraseCollection(source, target, sourceLHS);
}
};
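
RuleTableLoader above is an abstract base: subclasses implement Load() and reach RuleTableTrie's private helpers only through the protected wrappers. A skeletal subclass under that contract, with stand-in types so the sketch compiles on its own (everything named "Toy" or "MyLoader" is invented here):

// Skeleton of the loader pattern above, with stand-in types.
#include <string>

struct RuleTableTrieToy {            // stand-in for RuleTableTrie
  void SortAndPrune() {}
};

class LoaderToy                      // stand-in for RuleTableLoader
{
public:
  virtual ~LoaderToy() {}
  virtual bool Load(const std::string &inFile, RuleTableTrieToy &trie) = 0;
protected:
  void SortAndPrune(RuleTableTrieToy &trie) { trie.SortAndPrune(); }
};

class MyLoader : public LoaderToy
{
public:
  bool Load(const std::string & /*inFile*/, RuleTableTrieToy &trie) {
    // ... parse rules and insert them into the trie here ...
    SortAndPrune(trie);              // prune each node once loading is done
    return true;
  }
};

int main()
{
  MyLoader loader;
  RuleTableTrieToy trie;
  return loader.Load("rules.txt", trie) ? 0 : 1;
}
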
diff --git a/moses/TranslationModel/RuleTable/LoaderCompact.cpp b/moses/TranslationModel/RuleTable/LoaderCompact.cpp
index f235b3e79..2b4a6003a 100644
--- a/moses/TranslationModel/RuleTable/LoaderCompact.cpp
+++ b/moses/TranslationModel/RuleTable/LoaderCompact.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -84,9 +84,9 @@ bool RuleTableLoaderCompact::Load(const std::vector<FactorType> &input,
}
void RuleTableLoaderCompact::LoadVocabularySection(
- LineReader &reader,
- const std::vector<FactorType> &factorTypes,
- std::vector<Word> &vocabulary)
+ LineReader &reader,
+ const std::vector<FactorType> &factorTypes,
+ std::vector<Word> &vocabulary)
{
// Read symbol count.
reader.ReadLine();
@@ -106,10 +106,10 @@ void RuleTableLoaderCompact::LoadVocabularySection(
}
void RuleTableLoaderCompact::LoadPhraseSection(
- LineReader &reader,
- const std::vector<Word> &vocab,
- std::vector<Phrase> &rhsPhrases,
- std::vector<size_t> &lhsIds)
+ LineReader &reader,
+ const std::vector<Word> &vocab,
+ std::vector<Phrase> &rhsPhrases,
+ std::vector<size_t> &lhsIds)
{
// Read phrase count.
reader.ReadLine();
@@ -132,7 +132,7 @@ void RuleTableLoaderCompact::LoadPhraseSection(
}
void RuleTableLoaderCompact::LoadAlignmentSection(
- LineReader &reader, std::vector<const AlignmentInfo *> &alignmentSets, std::vector<Phrase> &sourcePhrases)
+ LineReader &reader, std::vector<const AlignmentInfo *> &alignmentSets, std::vector<Phrase> &sourcePhrases)
{
// Read alignment set count.
reader.ReadLine();
@@ -144,8 +144,8 @@ void RuleTableLoaderCompact::LoadAlignmentSection(
std::vector<size_t> points;
for (size_t i = 0; i < alignmentSetCount; ++i) {
// Read alignment set, lookup in collection, and store pointer.
- alignTerm.clear();
- alignNonTerm.clear();
+ alignTerm.clear();
+ alignNonTerm.clear();
tokens.clear();
reader.ReadLine();
@@ -157,11 +157,10 @@ void RuleTableLoaderCompact::LoadAlignmentSection(
std::pair<size_t, size_t> alignmentPair(points[0], points[1]);
if (sourcePhrases[i].GetWord(alignmentPair.first).IsNonTerminal()) {
- alignNonTerm.insert(alignmentPair);
+ alignNonTerm.insert(alignmentPair);
+ } else {
+ alignTerm.insert(alignmentPair);
}
- else {
- alignTerm.insert(alignmentPair);
- }
}
alignmentSets[i*2] = AlignmentInfoCollection::Instance().Add(alignNonTerm);
@@ -170,13 +169,13 @@ void RuleTableLoaderCompact::LoadAlignmentSection(
}
bool RuleTableLoaderCompact::LoadRuleSection(
- LineReader &reader,
- const std::vector<Word> &vocab,
- const std::vector<Phrase> &sourcePhrases,
- const std::vector<Phrase> &targetPhrases,
- const std::vector<size_t> &targetLhsIds,
- const std::vector<const AlignmentInfo *> &alignmentSets,
- RuleTableTrie &ruleTable)
+ LineReader &reader,
+ const std::vector<Word> &vocab,
+ const std::vector<Phrase> &sourcePhrases,
+ const std::vector<Phrase> &targetPhrases,
+ const std::vector<size_t> &targetLhsIds,
+ const std::vector<const AlignmentInfo *> &alignmentSets,
+ RuleTableTrie &ruleTable)
{
// Read rule count.
reader.ReadLine();
@@ -232,7 +231,7 @@ bool RuleTableLoaderCompact::LoadRuleSection(
// Insert rule into table.
TargetPhraseCollection &coll = GetOrCreateTargetPhraseCollection(
- ruleTable, sourcePhrase, *targetPhrase, &sourceLHS);
+ ruleTable, sourcePhrase, *targetPhrase, &sourceLHS);
coll.Add(targetPhrase);
}
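
LoadAlignmentSection above routes each alignment point into one of two sets, depending on whether the aligned source word is a nonterminal. A standalone sketch of that split; the IsNonTerminal() test is stubbed with a set of positions, and the points are invented:

// Splitting alignment points into terminal vs. nonterminal sets,
// as in LoadAlignmentSection above. The nonterminal test is stubbed.
#include <iostream>
#include <set>
#include <utility>
#include <vector>

int main()
{
  typedef std::pair<size_t, size_t> AlignPoint;
  std::vector<AlignPoint> points;
  points.push_back(AlignPoint(0, 1));
  points.push_back(AlignPoint(1, 0));

  // pretend source position 1 holds a nonterminal like [X]
  std::set<size_t> nonTermSourcePositions;
  nonTermSourcePositions.insert(1);

  std::set<AlignPoint> alignTerm, alignNonTerm;
  for (size_t i = 0; i < points.size(); ++i) {
    if (nonTermSourcePositions.count(points[i].first))
      alignNonTerm.insert(points[i]);   // nonterminal link
    else
      alignTerm.insert(points[i]);      // ordinary word link
  }
  std::cout << alignTerm.size() << " term, "
            << alignNonTerm.size() << " non-term" << std::endl;
  return 0;
}
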
diff --git a/moses/TranslationModel/RuleTable/LoaderCompact.h b/moses/TranslationModel/RuleTable/LoaderCompact.h
index 314cfca57..26e19fce6 100644
--- a/moses/TranslationModel/RuleTable/LoaderCompact.h
+++ b/moses/TranslationModel/RuleTable/LoaderCompact.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -35,14 +35,14 @@ class RuleTableTrie;
//! @todo ask phil williams
class RuleTableLoaderCompact : public RuleTableLoader
{
- public:
+public:
bool Load(const std::vector<FactorType> &input,
const std::vector<FactorType> &output,
const std::string &inFile,
size_t tableLimit,
RuleTableTrie &);
- private:
+private:
struct LineReader {
LineReader(std::istream &input) : m_input(input), m_lineNum(0) {}
void ReadLine() {
@@ -78,8 +78,7 @@ class RuleTableLoaderCompact : public RuleTableLoader
// Like Tokenize() but records starting positions of tokens (instead of
// copying substrings) and assumes delimiter is ASCII space character.
- void FindTokens(std::vector<size_t> &output, const std::string &str) const
- {
+ void FindTokens(std::vector<size_t> &output, const std::string &str) const {
// Skip delimiters at beginning.
size_t lastPos = str.find_first_not_of(' ', 0);
// Find first "non-delimiter".
diff --git a/moses/TranslationModel/RuleTable/LoaderFactory.cpp b/moses/TranslationModel/RuleTable/LoaderFactory.cpp
index b3bd00555..cdbfc965a 100644
--- a/moses/TranslationModel/RuleTable/LoaderFactory.cpp
+++ b/moses/TranslationModel/RuleTable/LoaderFactory.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -37,7 +37,7 @@ namespace Moses
// Determines the rule table type by peeking inside the file then creates
// a suitable RuleTableLoader object.
std::auto_ptr<RuleTableLoader> RuleTableLoaderFactory::Create(
- const std::string &path)
+ const std::string &path)
{
InputFileStream input(path);
std::string line;
@@ -54,17 +54,15 @@ std::auto_ptr<RuleTableLoader> RuleTableLoaderFactory::Create(
msg << "Unsupported compact rule table format: " << tokens[0];
UserMessage::Add(msg.str());
return std::auto_ptr<RuleTableLoader>();
+ } else if (tokens[0] == "[X]" && tokens[1] == "|||") {
+ return std::auto_ptr<RuleTableLoader>(new
+ RuleTableLoaderHiero());
+
}
- else if (tokens[0] == "[X]" && tokens[1] == "|||") {
- return std::auto_ptr<RuleTableLoader>(new
- RuleTableLoaderHiero());
-
- }
-
+
return std::auto_ptr<RuleTableLoader>(new RuleTableLoaderStandard());
- }
- else
- { // empty phrase table
+ } else {
+ // empty phrase table
return std::auto_ptr<RuleTableLoader>(new RuleTableLoaderStandard());
}
}
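
RuleTableLoaderFactory::Create() above picks a loader by peeking at the first line of the table: an unsupported compact header aborts, a line starting "[X] |||" selects the Hiero loader, and everything else (including an empty table) falls back to the standard loader. A minimal sniffing sketch; the compact branch's exact header token is outside the hunk shown, so the "version" prefix below is an assumption:

// Format sniffing in the spirit of RuleTableLoaderFactory::Create() (simplified;
// the "version" token is an assumption, only the other branches appear above).
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

std::string SniffFormat(std::istream &input)
{
  std::string line, tok;
  std::vector<std::string> tokens;
  if (std::getline(input, line)) {
    std::istringstream ss(line);
    while (ss >> tok) tokens.push_back(tok);
  }
  if (!tokens.empty() && tokens[0] == "version")
    return "compact";
  if (tokens.size() >= 2 && tokens[0] == "[X]" && tokens[1] == "|||")
    return "hiero";
  return "standard";                 // includes the empty-table case
}

int main()
{
  std::istringstream hiero("[X] ||| le chat ||| the cat ||| 0.5");
  std::cout << SniffFormat(hiero) << std::endl;  // prints "hiero"
  return 0;
}
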
diff --git a/moses/TranslationModel/RuleTable/LoaderFactory.h b/moses/TranslationModel/RuleTable/LoaderFactory.h
index 01c168680..c695738e4 100644
--- a/moses/TranslationModel/RuleTable/LoaderFactory.h
+++ b/moses/TranslationModel/RuleTable/LoaderFactory.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -30,7 +30,7 @@ class RuleTableLoader;
//! Creates a RuleTableLoader object suitable for loading the specified file.
class RuleTableLoaderFactory
{
- public:
+public:
static std::auto_ptr<RuleTableLoader> Create(const std::string &);
};
diff --git a/moses/TranslationModel/RuleTable/LoaderHiero.cpp b/moses/TranslationModel/RuleTable/LoaderHiero.cpp
index c0526be02..81289d9b2 100644
--- a/moses/TranslationModel/RuleTable/LoaderHiero.cpp
+++ b/moses/TranslationModel/RuleTable/LoaderHiero.cpp
@@ -11,19 +11,20 @@
using namespace std;
-namespace Moses {
-
+namespace Moses
+{
+
bool RuleTableLoaderHiero::Load(const std::vector<FactorType> &input,
- const std::vector<FactorType> &output,
- const std::string &inFile,
- size_t tableLimit,
- RuleTableTrie &ruleTable)
+ const std::vector<FactorType> &output,
+ const std::string &inFile,
+ size_t tableLimit,
+ RuleTableTrie &ruleTable)
{
bool ret = RuleTableLoaderStandard::Load(HieroFormat
- ,input, output
- ,inFile
- ,tableLimit
- ,ruleTable);
+ ,input, output
+ ,inFile
+ ,tableLimit
+ ,ruleTable);
return ret;
}
diff --git a/moses/TranslationModel/RuleTable/LoaderHiero.h b/moses/TranslationModel/RuleTable/LoaderHiero.h
index 1f6b66725..099787281 100644
--- a/moses/TranslationModel/RuleTable/LoaderHiero.h
+++ b/moses/TranslationModel/RuleTable/LoaderHiero.h
@@ -11,7 +11,8 @@
#include "LoaderStandard.h"
-namespace Moses {
+namespace Moses
+{
//! specific implementation of SCFG loader to load rule tables formatted in Hiero-style format
class RuleTableLoaderHiero : public RuleTableLoaderStandard
diff --git a/moses/TranslationModel/RuleTable/LoaderStandard.cpp b/moses/TranslationModel/RuleTable/LoaderStandard.cpp
index 566684775..fb5052c40 100644
--- a/moses/TranslationModel/RuleTable/LoaderStandard.cpp
+++ b/moses/TranslationModel/RuleTable/LoaderStandard.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -59,38 +59,34 @@ bool RuleTableLoaderStandard::Load(const std::vector<FactorType> &input
return ret;
}
-
+
void ReformatHieroRule(int sourceTarget, string &phrase, map<size_t, pair<size_t, size_t> > &ntAlign)
{
vector<string> toks;
Tokenize(toks, phrase, " ");
- for (size_t i = 0; i < toks.size(); ++i)
- {
+ for (size_t i = 0; i < toks.size(); ++i) {
string &tok = toks[i];
size_t tokLen = tok.size();
- if (tok.substr(0, 1) == "[" && tok.substr(tokLen - 1, 1) == "]")
-    { // non-term
+ if (tok.substr(0, 1) == "[" && tok.substr(tokLen - 1, 1) == "]") {
+      // non-term
vector<string> split = Tokenize(tok, ",");
CHECK(split.size() == 2);
-
+
tok = "[X]" + split[0] + "]";
size_t coIndex = Scan<size_t>(split[1]);
-
+
pair<size_t, size_t> &alignPoint = ntAlign[coIndex];
- if (sourceTarget == 0)
- {
+ if (sourceTarget == 0) {
alignPoint.first = i;
- }
- else
- {
+ } else {
alignPoint.second = i;
}
}
}
-
+
phrase = Join(" ", toks) + " [X]";
-
+
}
void ReformateHieroScore(string &scoreString)
@@ -98,8 +94,7 @@ void ReformateHieroScore(string &scoreString)
vector<string> toks;
Tokenize(toks, scoreString, " ");
- for (size_t i = 0; i < toks.size(); ++i)
- {
+ for (size_t i = 0; i < toks.size(); ++i) {
string &tok = toks[i];
vector<string> nameValue = Tokenize(tok, "=");
CHECK(nameValue.size() == 2);
@@ -108,49 +103,48 @@ void ReformateHieroScore(string &scoreString)
score = exp(-score);
tok = SPrint(score);
}
-
+
scoreString = Join(" ", toks);
}
-
+
void ReformatHieroRule(const string &lineOrig, string &out)
-{
+{
vector<string> tokens;
vector<float> scoreVector;
-
+
TokenizeMultiCharSeparator(tokens, lineOrig, "|||" );
string &sourcePhraseString = tokens[1]
- , &targetPhraseString = tokens[2]
- , &scoreString = tokens[3];
+ , &targetPhraseString = tokens[2]
+ , &scoreString = tokens[3];
map<size_t, pair<size_t, size_t> > ntAlign;
ReformatHieroRule(0, sourcePhraseString, ntAlign);
ReformatHieroRule(1, targetPhraseString, ntAlign);
ReformateHieroScore(scoreString);
-
+
stringstream align;
map<size_t, pair<size_t, size_t> >::const_iterator iterAlign;
- for (iterAlign = ntAlign.begin(); iterAlign != ntAlign.end(); ++iterAlign)
- {
+ for (iterAlign = ntAlign.begin(); iterAlign != ntAlign.end(); ++iterAlign) {
const pair<size_t, size_t> &alignPoint = iterAlign->second;
align << alignPoint.first << "-" << alignPoint.second << " ";
}
-
+
stringstream ret;
ret << sourcePhraseString << " ||| "
- << targetPhraseString << " ||| "
+ << targetPhraseString << " ||| "
<< scoreString << " ||| "
<< align.str();
-
+
out = ret.str();
}
-
+
bool RuleTableLoaderStandard::Load(FormatType format
- , const std::vector<FactorType> &input
- , const std::vector<FactorType> &output
- , const std::string &inFile
- , size_t /* tableLimit */
- , RuleTableTrie &ruleTable)
+ , const std::vector<FactorType> &input
+ , const std::vector<FactorType> &output
+ , const std::string &inFile
+ , size_t /* tableLimit */
+ , RuleTableTrie &ruleTable)
{
PrintUserTime(string("Start loading text SCFG phrase table. ") + (format==MosesFormat?"Moses ":"Hiero ") + " format");
@@ -174,7 +168,9 @@ bool RuleTableLoaderStandard::Load(FormatType format
while(true) {
try {
line = in.ReadLine();
- } catch (const util::EndOfFileException &e) { break; }
+ } catch (const util::EndOfFileException &e) {
+ break;
+ }
if (format == HieroFormat) { // inefficiently reformat line
hiero_before.assign(line.data(), line.size());
@@ -186,7 +182,7 @@ bool RuleTableLoaderStandard::Load(FormatType format
StringPiece sourcePhraseString(*pipes);
StringPiece targetPhraseString(*++pipes);
StringPiece scoreString(*++pipes);
-
+
StringPiece alignString;
if (++pipes) {
StringPiece temp(*pipes);
@@ -237,9 +233,9 @@ bool RuleTableLoaderStandard::Load(FormatType format
// rest of target phrase
targetPhrase->SetAlignmentInfo(alignString);
targetPhrase->SetTargetLHS(targetLHS);
-
+
//targetPhrase->SetDebugOutput(string("New Format pt ") + line);
-
+
if (++pipes) {
StringPiece sparseString(*pipes);
targetPhrase->SetSparseScore(&ruleTable, sparseString);
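
The two reformatting helpers above translate one Hiero rule line into Moses conventions: each nonterminal token "[NT,i]" becomes "[X][NT]" (with a trailing " [X]" appended to the whole phrase), scores are mapped through exp(-score), and the co-indexation map becomes explicit "s-t" alignment points. A toy rewrite of a single token and a single score, following the string manipulation shown above:

// Token and score rewrites from ReformatHieroRule/ReformateHieroScore above.
#include <cmath>
#include <iostream>
#include <string>

int main()
{
  // "[NP,2]" -> split at ',' into "[NP" and "2]" -> "[X]" + "[NP" + "]"
  std::string tok = "[NP,2]";
  size_t comma = tok.find(',');
  std::string rewritten = "[X]" + tok.substr(0, comma) + "]";
  std::cout << rewritten << std::endl;             // prints "[X][NP]"

  // Hiero stores negative log scores; Moses wants probabilities: exp(-score)
  double hieroScore = 0.693147;
  std::cout << std::exp(-hieroScore) << std::endl; // ~0.5
  return 0;
}
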
diff --git a/moses/TranslationModel/RuleTable/LoaderStandard.h b/moses/TranslationModel/RuleTable/LoaderStandard.h
index 4beefea39..b47f7c00b 100644
--- a/moses/TranslationModel/RuleTable/LoaderStandard.h
+++ b/moses/TranslationModel/RuleTable/LoaderStandard.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -35,7 +35,7 @@ protected:
const std::string &inFile,
size_t tableLimit,
RuleTableTrie &);
- public:
+public:
bool Load(const std::vector<FactorType> &input,
const std::vector<FactorType> &output,
const std::string &inFile,
diff --git a/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.cpp b/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.cpp
index 8f736af60..1f8ebab15 100644
--- a/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.cpp
+++ b/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.cpp
@@ -18,14 +18,14 @@
using namespace std;
-namespace Moses
+namespace Moses
{
PhraseDictionaryALSuffixArray::PhraseDictionaryALSuffixArray(const std::string &line)
-: PhraseDictionaryMemory("PhraseDictionaryALSuffixArray", line)
+ : PhraseDictionaryMemory("PhraseDictionaryALSuffixArray", line)
{
const StaticData &staticData = StaticData::Instance();
if (staticData.ThreadCount() > 1) {
- throw runtime_error("Suffix array implementation is not threadsafe");
+ throw runtime_error("Suffix array implementation is not threadsafe");
}
}
@@ -33,14 +33,14 @@ void PhraseDictionaryALSuffixArray::InitializeForInput(InputType const& source)
{
// populate with rules for this sentence
long translationId = source.GetTranslationId();
-
+
string grammarFile = GetFilePath() + "/grammar." + SPrint(translationId) + ".gz";
-
+
std::auto_ptr<RuleTableLoader> loader =
- RuleTableLoaderFactory::Create(grammarFile);
+ RuleTableLoaderFactory::Create(grammarFile);
bool ret = loader->Load(m_input, m_output, grammarFile, m_tableLimit,
*this);
-
+
CHECK(ret);
}
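
InitializeForInput() above implements per-sentence grammar loading: the grammar file path is derived from the sentence's translation id, and a loader chosen by the factory fills the in-memory table just before decoding. A sketch of the path-construction step (the directory layout matches the code above; the stream helper is a stand-in for Moses' SPrint):

// Per-sentence grammar path construction, as in InitializeForInput() above.
#include <iostream>
#include <sstream>
#include <string>

std::string GrammarPath(const std::string &dir, long translationId)
{
  std::ostringstream out;            // stand-in for Moses' SPrint()
  out << dir << "/grammar." << translationId << ".gz";
  return out.str();
}

int main()
{
  // sentence 42 reads its rules from <dir>/grammar.42.gz
  std::cout << GrammarPath("/path/to/grammars", 42) << std::endl;
  return 0;
}
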
diff --git a/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.h b/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.h
index 81e1e02cf..aa4c15258 100644
--- a/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.h
+++ b/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.h
@@ -11,26 +11,28 @@
#include "moses/TranslationModel/PhraseDictionaryMemory.h"
-namespace Moses {
-
+namespace Moses
+{
+
/** Implementation of in-memory phrase table for use with Adam Lopez's suffix array.
* Does 2 things that the normal in-memory pt doesn't do:
 * 1. Loads grammar for a sentence to be decoded only when the sentence is being decoded; unloads it afterwards.
 * 2. Format of the pt file follows Hiero, rather than Moses
- */
+ */
class PhraseDictionaryALSuffixArray : public PhraseDictionaryMemory
{
public:
PhraseDictionaryALSuffixArray(const std::string &line);
- bool InitDictionary()
- { return true; }
+ bool InitDictionary() {
+ return true;
+ }
void InitializeForInput(InputType const& source);
void CleanUpAfterSentenceProcessing(const InputType& source);
protected:
-
+
};
-
+
}
diff --git a/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp b/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp
index be6996399..669e7306b 100644
--- a/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp
+++ b/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp
@@ -3,17 +3,17 @@
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -49,316 +49,312 @@ using namespace std;
namespace Moses
{
- PhraseDictionaryFuzzyMatch::PhraseDictionaryFuzzyMatch(const std::string &line)
+PhraseDictionaryFuzzyMatch::PhraseDictionaryFuzzyMatch(const std::string &line)
: PhraseDictionary("PhraseDictionaryFuzzyMatch", line)
- {}
-
- bool PhraseDictionaryFuzzyMatch::Load(const std::vector<FactorType> &input
- , const std::vector<FactorType> &output
- , const std::string &initStr
- , size_t tableLimit)
- {
- m_tableLimit = tableLimit;
- m_input = &input;
- m_output = &output;
-
-
- cerr << "initStr=" << initStr << endl;
- m_config = Tokenize(initStr, ";");
- assert(m_config.size() == 3);
-
- m_FuzzyMatchWrapper = new tmmt::FuzzyMatchWrapper(m_config[0], m_config[1], m_config[2]);
-
- return true;
+{}
+
+bool PhraseDictionaryFuzzyMatch::Load(const std::vector<FactorType> &input
+ , const std::vector<FactorType> &output
+ , const std::string &initStr
+ , size_t tableLimit)
+{
+ m_tableLimit = tableLimit;
+ m_input = &input;
+ m_output = &output;
+
+
+ cerr << "initStr=" << initStr << endl;
+ m_config = Tokenize(initStr, ";");
+ assert(m_config.size() == 3);
+
+ m_FuzzyMatchWrapper = new tmmt::FuzzyMatchWrapper(m_config[0], m_config[1], m_config[2]);
+
+ return true;
+}
+
+ChartRuleLookupManager *PhraseDictionaryFuzzyMatch::CreateRuleLookupManager(
+ const InputType &sentence,
+ const ChartCellCollectionBase &cellCollection)
+{
+ return new ChartRuleLookupManagerMemoryPerSentence(sentence, cellCollection, *this);
+}
+
+
+int removedirectoryrecursively(const char *dirname)
+{
+ DIR *dir;
+ struct dirent *entry;
+ char path[PATH_MAX];
+
+ if (path == NULL) {
+ fprintf(stderr, "Out of memory error\n");
+ return 0;
}
-
- ChartRuleLookupManager *PhraseDictionaryFuzzyMatch::CreateRuleLookupManager(
- const InputType &sentence,
- const ChartCellCollectionBase &cellCollection)
- {
- return new ChartRuleLookupManagerMemoryPerSentence(sentence, cellCollection, *this);
+ dir = opendir(dirname);
+ if (dir == NULL) {
+ perror("Error opendir()");
+ return 0;
}
-
-
- int removedirectoryrecursively(const char *dirname)
- {
- DIR *dir;
- struct dirent *entry;
- char path[PATH_MAX];
-
- if (path == NULL) {
- fprintf(stderr, "Out of memory error\n");
- return 0;
- }
- dir = opendir(dirname);
- if (dir == NULL) {
- perror("Error opendir()");
- return 0;
- }
-
- while ((entry = readdir(dir)) != NULL) {
- if (strcmp(entry->d_name, ".") && strcmp(entry->d_name, "..")) {
- snprintf(path, (size_t) PATH_MAX, "%s/%s", dirname, entry->d_name);
- if (entry->d_type == DT_DIR) {
- removedirectoryrecursively(path);
- }
-
- remove(path);
- /*
- * Here, the actual deletion must be done. Beacuse this is
- * quite a dangerous thing to do, and this program is not very
- * well tested, we are just printing as if we are deleting.
- */
- //printf("(not really) Deleting: %s\n", path);
- /*
- * When you are finished testing this and feel you are ready to do the real
- * deleting, use this: remove*STUB*(path);
- * (see "man 3 remove")
- * Please note that I DONT TAKE RESPONSIBILITY for data you delete with this!
- */
+
+ while ((entry = readdir(dir)) != NULL) {
+ if (strcmp(entry->d_name, ".") && strcmp(entry->d_name, "..")) {
+ snprintf(path, (size_t) PATH_MAX, "%s/%s", dirname, entry->d_name);
+ if (entry->d_type == DT_DIR) {
+ removedirectoryrecursively(path);
}
-
+
+ remove(path);
+ /*
+       * Here, the actual deletion must be done. Because this is
+ * quite a dangerous thing to do, and this program is not very
+ * well tested, we are just printing as if we are deleting.
+ */
+ //printf("(not really) Deleting: %s\n", path);
+ /*
+ * When you are finished testing this and feel you are ready to do the real
+ * deleting, use this: remove*STUB*(path);
+ * (see "man 3 remove")
+       * Please note that I DON'T TAKE RESPONSIBILITY for data you delete with this!
+ */
}
- closedir(dir);
-
- rmdir(dirname);
- /*
- * Now the directory is emtpy, finally delete the directory itself. (Just
- * printing here, see above)
- */
- //printf("(not really) Deleting: %s\n", dirname);
-
- return 1;
+
+ }
+ closedir(dir);
+
+ rmdir(dirname);
+ /*
+   * Now the directory is empty, finally delete the directory itself. (Just
+ * printing here, see above)
+ */
+ //printf("(not really) Deleting: %s\n", dirname);
+
+ return 1;
+}
+
+void PhraseDictionaryFuzzyMatch::InitializeForInput(InputType const& inputSentence)
+{
+ char dirName[] = "/tmp/moses.XXXXXX";
+ char *temp = mkdtemp(dirName);
+ CHECK(temp);
+ string dirNameStr(dirName);
+
+ string inFileName(dirNameStr + "/in");
+
+ ofstream inFile(inFileName.c_str());
+
+ for (size_t i = 1; i < inputSentence.GetSize() - 1; ++i) {
+ inFile << inputSentence.GetWord(i);
}
+ inFile << endl;
+ inFile.close();
+
+ long translationId = inputSentence.GetTranslationId();
+ string ptFileName = m_FuzzyMatchWrapper->Extract(translationId, dirNameStr);
+
+ // populate with rules for this sentence
+ PhraseDictionaryNodeMemory &rootNode = m_collection[translationId];
+ FormatType format = MosesFormat;
- void PhraseDictionaryFuzzyMatch::InitializeForInput(InputType const& inputSentence)
- {
- char dirName[] = "/tmp/moses.XXXXXX";
- char *temp = mkdtemp(dirName);
- CHECK(temp);
- string dirNameStr(dirName);
-
- string inFileName(dirNameStr + "/in");
-
- ofstream inFile(inFileName.c_str());
-
- for (size_t i = 1; i < inputSentence.GetSize() - 1; ++i)
- {
- inFile << inputSentence.GetWord(i);
+ // data from file
+ InputFileStream inStream(ptFileName);
+
+ // copied from class LoaderStandard
+ PrintUserTime("Start loading fuzzy-match phrase model");
+
+ const StaticData &staticData = StaticData::Instance();
+ const std::string& factorDelimiter = staticData.GetFactorDelimiter();
+
+
+ string lineOrig;
+ size_t count = 0;
+
+ while(getline(inStream, lineOrig)) {
+ const string *line;
+ if (format == HieroFormat) { // reformat line
+ assert(false);
+ //line = ReformatHieroRule(lineOrig);
+ } else {
+ // do nothing to format of line
+ line = &lineOrig;
}
- inFile << endl;
- inFile.close();
-
- long translationId = inputSentence.GetTranslationId();
- string ptFileName = m_FuzzyMatchWrapper->Extract(translationId, dirNameStr);
-
- // populate with rules for this sentence
- PhraseDictionaryNodeMemory &rootNode = m_collection[translationId];
- FormatType format = MosesFormat;
-
- // data from file
- InputFileStream inStream(ptFileName);
-
- // copied from class LoaderStandard
- PrintUserTime("Start loading fuzzy-match phrase model");
-
- const StaticData &staticData = StaticData::Instance();
- const std::string& factorDelimiter = staticData.GetFactorDelimiter();
-
-
- string lineOrig;
- size_t count = 0;
-
- while(getline(inStream, lineOrig)) {
- const string *line;
- if (format == HieroFormat) { // reformat line
- assert(false);
- //line = ReformatHieroRule(lineOrig);
- }
- else
- { // do nothing to format of line
- line = &lineOrig;
- }
-
- vector<string> tokens;
- vector<float> scoreVector;
-
- TokenizeMultiCharSeparator(tokens, *line , "|||" );
-
- if (tokens.size() != 4 && tokens.size() != 5) {
- stringstream strme;
- strme << "Syntax error at " << ptFileName << ":" << count;
- UserMessage::Add(strme.str());
- abort();
- }
-
- const string &sourcePhraseString = tokens[0]
- , &targetPhraseString = tokens[1]
- , &scoreString = tokens[2]
- , &alignString = tokens[3];
-
- bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
- if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
- TRACE_ERR( ptFileName << ":" << count << ": pt entry contains empty target, skipping\n");
- continue;
- }
-
- Tokenize<float>(scoreVector, scoreString);
- const size_t numScoreComponents = GetNumScoreComponents();
- if (scoreVector.size() != numScoreComponents) {
- stringstream strme;
- strme << "Size of scoreVector != number (" << scoreVector.size() << "!="
- << numScoreComponents << ") of score components on line " << count;
- UserMessage::Add(strme.str());
- abort();
- }
- CHECK(scoreVector.size() == numScoreComponents);
-
- // parse source & find pt node
-
- // constituent labels
- Word *sourceLHS;
- Word *targetLHS;
-
- // source
- Phrase sourcePhrase( 0);
- sourcePhrase.CreateFromString(Input, *m_input, sourcePhraseString, factorDelimiter, &sourceLHS);
-
- // create target phrase obj
- TargetPhrase *targetPhrase = new TargetPhrase();
- targetPhrase->CreateFromString(Output, *m_output, targetPhraseString, factorDelimiter, &targetLHS);
-
- // rest of target phrase
- targetPhrase->SetAlignmentInfo(alignString);
- targetPhrase->SetTargetLHS(targetLHS);
- //targetPhrase->SetDebugOutput(string("New Format pt ") + line);
-
- // component score, for n-best output
- std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore);
- std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);
-
- targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
- targetPhrase->Evaluate(sourcePhrase);
-
- TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(rootNode, sourcePhrase, *targetPhrase, sourceLHS);
- phraseColl.Add(targetPhrase);
-
- count++;
-
- if (format == HieroFormat) { // reformat line
- delete line;
- }
- else
- { // do nothing
- }
-
+
+ vector<string> tokens;
+ vector<float> scoreVector;
+
+ TokenizeMultiCharSeparator(tokens, *line , "|||" );
+
+ if (tokens.size() != 4 && tokens.size() != 5) {
+ stringstream strme;
+ strme << "Syntax error at " << ptFileName << ":" << count;
+ UserMessage::Add(strme.str());
+ abort();
}
-
- // sort and prune each target phrase collection
- SortAndPrune(rootNode);
-
- //removedirectoryrecursively(dirName);
- }
-
- TargetPhraseCollection &PhraseDictionaryFuzzyMatch::GetOrCreateTargetPhraseCollection(PhraseDictionaryNodeMemory &rootNode
- , const Phrase &source
- , const TargetPhrase &target
- , const Word *sourceLHS)
- {
- PhraseDictionaryNodeMemory &currNode = GetOrCreateNode(rootNode, source, target, sourceLHS);
- return currNode.GetOrCreateTargetPhraseCollection();
- }
- PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetOrCreateNode(PhraseDictionaryNodeMemory &rootNode
- , const Phrase &source
- , const TargetPhrase &target
- , const Word *sourceLHS)
- {
- cerr << source << endl << target << endl;
- const size_t size = source.GetSize();
-
- const AlignmentInfo &alignmentInfo = target.GetAlignNonTerm();
- AlignmentInfo::const_iterator iterAlign = alignmentInfo.begin();
-
- PhraseDictionaryNodeMemory *currNode = &rootNode;
- for (size_t pos = 0 ; pos < size ; ++pos) {
- const Word& word = source.GetWord(pos);
-
- if (word.IsNonTerminal()) {
- // indexed by source label 1st
- const Word &sourceNonTerm = word;
-
- CHECK(iterAlign != alignmentInfo.end());
- CHECK(iterAlign->first == pos);
- size_t targetNonTermInd = iterAlign->second;
- ++iterAlign;
- const Word &targetNonTerm = target.GetWord(targetNonTermInd);
-
- currNode = currNode->GetOrCreateChild(sourceNonTerm, targetNonTerm);
- } else {
- currNode = currNode->GetOrCreateChild(word);
- }
-
- CHECK(currNode != NULL);
+ const string &sourcePhraseString = tokens[0]
+ , &targetPhraseString = tokens[1]
+ , &scoreString = tokens[2]
+ , &alignString = tokens[3];
+
+ bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
+ if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
+ TRACE_ERR( ptFileName << ":" << count << ": pt entry contains empty target, skipping\n");
+ continue;
}
-
- // finally, the source LHS
- //currNode = currNode->GetOrCreateChild(sourceLHS);
- //CHECK(currNode != NULL);
-
-
- return *currNode;
- }
- void PhraseDictionaryFuzzyMatch::SortAndPrune(PhraseDictionaryNodeMemory &rootNode)
- {
- if (GetTableLimit())
- {
- rootNode.Sort(GetTableLimit());
+ Tokenize<float>(scoreVector, scoreString);
+ const size_t numScoreComponents = GetNumScoreComponents();
+ if (scoreVector.size() != numScoreComponents) {
+ stringstream strme;
+ strme << "Size of scoreVector != number (" << scoreVector.size() << "!="
+ << numScoreComponents << ") of score components on line " << count;
+ UserMessage::Add(strme.str());
+ abort();
}
+ CHECK(scoreVector.size() == numScoreComponents);
+
+ // parse source & find pt node
+
+ // constituent labels
+ Word *sourceLHS;
+ Word *targetLHS;
+
+ // source
+ Phrase sourcePhrase( 0);
+ sourcePhrase.CreateFromString(Input, *m_input, sourcePhraseString, factorDelimiter, &sourceLHS);
+
+ // create target phrase obj
+ TargetPhrase *targetPhrase = new TargetPhrase();
+ targetPhrase->CreateFromString(Output, *m_output, targetPhraseString, factorDelimiter, &targetLHS);
+
+ // rest of target phrase
+ targetPhrase->SetAlignmentInfo(alignString);
+ targetPhrase->SetTargetLHS(targetLHS);
+ //targetPhrase->SetDebugOutput(string("New Format pt ") + line);
+
+ // component score, for n-best output
+ std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore);
+ std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);
+
+ targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
+ targetPhrase->Evaluate(sourcePhrase);
+
+ TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(rootNode, sourcePhrase, *targetPhrase, sourceLHS);
+ phraseColl.Add(targetPhrase);
+
+ count++;
+
+ if (format == HieroFormat) { // reformat line
+ delete line;
+ } else {
+ // do nothing
+ }
+
}
-
- void PhraseDictionaryFuzzyMatch::CleanUpAfterSentenceProcessing(const InputType &source)
- {
- m_collection.erase(source.GetTranslationId());
+
+ // sort and prune each target phrase collection
+ SortAndPrune(rootNode);
+
+ //removedirectoryrecursively(dirName);
+}
+
+TargetPhraseCollection &PhraseDictionaryFuzzyMatch::GetOrCreateTargetPhraseCollection(PhraseDictionaryNodeMemory &rootNode
+ , const Phrase &source
+ , const TargetPhrase &target
+ , const Word *sourceLHS)
+{
+ PhraseDictionaryNodeMemory &currNode = GetOrCreateNode(rootNode, source, target, sourceLHS);
+ return currNode.GetOrCreateTargetPhraseCollection();
+}
+
+PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetOrCreateNode(PhraseDictionaryNodeMemory &rootNode
+ , const Phrase &source
+ , const TargetPhrase &target
+ , const Word *sourceLHS)
+{
+ cerr << source << endl << target << endl;
+ const size_t size = source.GetSize();
+
+ const AlignmentInfo &alignmentInfo = target.GetAlignNonTerm();
+ AlignmentInfo::const_iterator iterAlign = alignmentInfo.begin();
+
+ PhraseDictionaryNodeMemory *currNode = &rootNode;
+ for (size_t pos = 0 ; pos < size ; ++pos) {
+ const Word& word = source.GetWord(pos);
+
+ if (word.IsNonTerminal()) {
+ // indexed by source label 1st
+ const Word &sourceNonTerm = word;
+
+ CHECK(iterAlign != alignmentInfo.end());
+ CHECK(iterAlign->first == pos);
+ size_t targetNonTermInd = iterAlign->second;
+ ++iterAlign;
+ const Word &targetNonTerm = target.GetWord(targetNonTermInd);
+
+ currNode = currNode->GetOrCreateChild(sourceNonTerm, targetNonTerm);
+ } else {
+ currNode = currNode->GetOrCreateChild(word);
+ }
+
+ CHECK(currNode != NULL);
}
- const PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetRootNode(const InputType &source) const
- {
- long transId = source.GetTranslationId();
- std::map<long, PhraseDictionaryNodeMemory>::const_iterator iter = m_collection.find(transId);
- CHECK(iter != m_collection.end());
- return iter->second;
+ // finally, the source LHS
+ //currNode = currNode->GetOrCreateChild(sourceLHS);
+ //CHECK(currNode != NULL);
+
+
+ return *currNode;
+}
+
+void PhraseDictionaryFuzzyMatch::SortAndPrune(PhraseDictionaryNodeMemory &rootNode)
+{
+ if (GetTableLimit()) {
+ rootNode.Sort(GetTableLimit());
}
- PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetRootNode(const InputType &source)
- {
- long transId = source.GetTranslationId();
- std::map<long, PhraseDictionaryNodeMemory>::iterator iter = m_collection.find(transId);
- CHECK(iter != m_collection.end());
- return iter->second;
+}
+
+void PhraseDictionaryFuzzyMatch::CleanUpAfterSentenceProcessing(const InputType &source)
+{
+ m_collection.erase(source.GetTranslationId());
+}
+
+const PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetRootNode(const InputType &source) const
+{
+ long transId = source.GetTranslationId();
+ std::map<long, PhraseDictionaryNodeMemory>::const_iterator iter = m_collection.find(transId);
+ CHECK(iter != m_collection.end());
+ return iter->second;
+}
+PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetRootNode(const InputType &source)
+{
+ long transId = source.GetTranslationId();
+ std::map<long, PhraseDictionaryNodeMemory>::iterator iter = m_collection.find(transId);
+ CHECK(iter != m_collection.end());
+ return iter->second;
+}
+
+TO_STRING_BODY(PhraseDictionaryFuzzyMatch);
+
+// friend
+ostream& operator<<(ostream& out, const PhraseDictionaryFuzzyMatch& phraseDict)
+{
+ typedef PhraseDictionaryNodeMemory::TerminalMap TermMap;
+ typedef PhraseDictionaryNodeMemory::NonTerminalMap NonTermMap;
+
+ /*
+ const PhraseDictionaryNodeMemory &coll = phraseDict.m_collection;
+ for (NonTermMap::const_iterator p = coll.m_nonTermMap.begin(); p != coll.m_nonTermMap.end(); ++p) {
+ const Word &sourceNonTerm = p->first.first;
+ out << sourceNonTerm;
}
-
- TO_STRING_BODY(PhraseDictionaryFuzzyMatch);
-
- // friend
- ostream& operator<<(ostream& out, const PhraseDictionaryFuzzyMatch& phraseDict)
- {
- typedef PhraseDictionaryNodeMemory::TerminalMap TermMap;
- typedef PhraseDictionaryNodeMemory::NonTerminalMap NonTermMap;
-
- /*
- const PhraseDictionaryNodeMemory &coll = phraseDict.m_collection;
- for (NonTermMap::const_iterator p = coll.m_nonTermMap.begin(); p != coll.m_nonTermMap.end(); ++p) {
- const Word &sourceNonTerm = p->first.first;
- out << sourceNonTerm;
- }
- for (TermMap::const_iterator p = coll.m_sourceTermMap.begin(); p != coll.m_sourceTermMap.end(); ++p) {
- const Word &sourceTerm = p->first;
- out << sourceTerm;
- }
- */
-
- return out;
+ for (TermMap::const_iterator p = coll.m_sourceTermMap.begin(); p != coll.m_sourceTermMap.end(); ++p) {
+ const Word &sourceTerm = p->first;
+ out << sourceTerm;
}
-
+ */
+
+ return out;
+}
+
}
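
An aside on removedirectoryrecursively() above: `path` is a stack array, so the `path == NULL` out-of-memory branch can never fire, and the comments still describe the function as print-only even though the committed code really calls remove() and rmdir(). A stand-alone alternative, offered here only as a sketch (not part of this commit) and assuming a POSIX system with nftw(3), performs the same post-order walk in one call:

    #define _XOPEN_SOURCE 500
    #include <ftw.h>
    #include <stdio.h>

    // Called once per entry. FTW_DEPTH guarantees children are visited
    // before their parent, so remove() works for files and directories.
    static int rm_entry(const char *fpath, const struct stat *sb,
                        int typeflag, struct FTW *ftwbuf)
    {
      (void)sb; (void)typeflag; (void)ftwbuf;
      if (remove(fpath) != 0) {
        perror(fpath);
        return -1;  // any non-zero return stops the walk
      }
      return 0;
    }

    int remove_directory_recursively(const char *dirname)
    {
      // FTW_DEPTH: post-order traversal; FTW_PHYS: do not follow symlinks
      return nftw(dirname, rm_entry, 64, FTW_DEPTH | FTW_PHYS) == 0;
    }

Because InitializeForInput() above leaves its call to removedirectoryrecursively() commented out, nothing in the committed code depends on either variant.
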
diff --git a/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.h b/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.h
index 8e4d20423..94966b175 100644
--- a/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.h
+++ b/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -29,61 +29,60 @@
namespace Moses
{
- class PhraseDictionaryNodeMemory;
-
- /** Implementation of a SCFG rule table in a trie. Looking up a rule of
- * length n symbols requires n look-ups to find the TargetPhraseCollection.
- */
- class PhraseDictionaryFuzzyMatch : public PhraseDictionary
- {
- friend std::ostream& operator<<(std::ostream&, const PhraseDictionaryFuzzyMatch&);
- friend class RuleTableLoader;
-
- public:
- PhraseDictionaryFuzzyMatch(const std::string &line);
- bool Load(const std::vector<FactorType> &input
- , const std::vector<FactorType> &output
- , const std::string &initStr
- , size_t tableLimit);
-
- const PhraseDictionaryNodeMemory &GetRootNode(const InputType &source) const;
-
- ChartRuleLookupManager *CreateRuleLookupManager(
- const InputType &,
- const ChartCellCollectionBase &);
- void InitializeForInput(InputType const& inputSentence);
- void CleanUpAfterSentenceProcessing(const InputType& source);
-
- virtual const TargetPhraseCollection *GetTargetPhraseCollection(const Phrase& src) const
- {
- assert(false);
- return NULL;
- }
-
- TO_STRING();
-
- protected:
- TargetPhraseCollection &GetOrCreateTargetPhraseCollection(PhraseDictionaryNodeMemory &rootNode
- , const Phrase &source
- , const TargetPhrase &target
- , const Word *sourceLHS);
-
- PhraseDictionaryNodeMemory &GetOrCreateNode(PhraseDictionaryNodeMemory &rootNode
- , const Phrase &source
- , const TargetPhrase &target
- , const Word *sourceLHS);
-
- void SortAndPrune(PhraseDictionaryNodeMemory &rootNode);
- PhraseDictionaryNodeMemory &GetRootNode(const InputType &source);
-
- std::map<long, PhraseDictionaryNodeMemory> m_collection;
- std::vector<std::string> m_config;
-
- const std::vector<FactorType> *m_input, *m_output;
- const std::vector<float> *m_weight;
-
- tmmt::FuzzyMatchWrapper *m_FuzzyMatchWrapper;
-
- };
-
+class PhraseDictionaryNodeMemory;
+
+/** Implementation of a SCFG rule table in a trie. Looking up a rule of
+ * length n symbols requires n look-ups to find the TargetPhraseCollection.
+ */
+class PhraseDictionaryFuzzyMatch : public PhraseDictionary
+{
+ friend std::ostream& operator<<(std::ostream&, const PhraseDictionaryFuzzyMatch&);
+ friend class RuleTableLoader;
+
+public:
+ PhraseDictionaryFuzzyMatch(const std::string &line);
+ bool Load(const std::vector<FactorType> &input
+ , const std::vector<FactorType> &output
+ , const std::string &initStr
+ , size_t tableLimit);
+
+ const PhraseDictionaryNodeMemory &GetRootNode(const InputType &source) const;
+
+ ChartRuleLookupManager *CreateRuleLookupManager(
+ const InputType &,
+ const ChartCellCollectionBase &);
+ void InitializeForInput(InputType const& inputSentence);
+ void CleanUpAfterSentenceProcessing(const InputType& source);
+
+ virtual const TargetPhraseCollection *GetTargetPhraseCollection(const Phrase& src) const {
+ assert(false);
+ return NULL;
+ }
+
+ TO_STRING();
+
+protected:
+ TargetPhraseCollection &GetOrCreateTargetPhraseCollection(PhraseDictionaryNodeMemory &rootNode
+ , const Phrase &source
+ , const TargetPhrase &target
+ , const Word *sourceLHS);
+
+ PhraseDictionaryNodeMemory &GetOrCreateNode(PhraseDictionaryNodeMemory &rootNode
+ , const Phrase &source
+ , const TargetPhrase &target
+ , const Word *sourceLHS);
+
+ void SortAndPrune(PhraseDictionaryNodeMemory &rootNode);
+ PhraseDictionaryNodeMemory &GetRootNode(const InputType &source);
+
+ std::map<long, PhraseDictionaryNodeMemory> m_collection;
+ std::vector<std::string> m_config;
+
+ const std::vector<FactorType> *m_input, *m_output;
+ const std::vector<float> *m_weight;
+
+ tmmt::FuzzyMatchWrapper *m_FuzzyMatchWrapper;
+
+};
+
} // namespace Moses
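
Each line of the extracted phrase table that InitializeForInput() parses in the .cpp above carries four or five fields separated by "|||": source phrase, target phrase, scores, alignment, and an optional fifth field. The following is a hypothetical, self-contained sketch of that tokenization, independent of Moses' own TokenizeMultiCharSeparator and Tokenize<float> helpers:

    #include <iostream>
    #include <sstream>
    #include <string>
    #include <vector>

    static std::vector<std::string> SplitOnSeparator(const std::string &line,
                                                     const std::string &sep)
    {
      std::vector<std::string> fields;
      std::string::size_type begin = 0, end;
      while ((end = line.find(sep, begin)) != std::string::npos) {
        fields.push_back(line.substr(begin, end - begin));
        begin = end + sep.size();
      }
      fields.push_back(line.substr(begin));  // trailing field
      return fields;
    }

    int main()
    {
      // source ||| target ||| scores ||| alignment
      std::string line = "das Haus [X] ||| the house [X] ||| 0.8 0.5 ||| 0-0 1-1";
      std::vector<std::string> tokens = SplitOnSeparator(line, "|||");
      if (tokens.size() != 4 && tokens.size() != 5) {
        std::cerr << "syntax error\n";
        return 1;
      }
      // The third field holds the translation scores as floats.
      std::istringstream scores(tokens[2]);
      std::vector<float> scoreVector;
      float f;
      while (scores >> f) scoreVector.push_back(f);
      std::cout << tokens.size() << " fields, "
                << scoreVector.size() << " scores\n";
      return 0;
    }

As the hunk above shows, the real loader additionally checks scoreVector.size() against GetNumScoreComponents() and aborts on a mismatch.
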
diff --git a/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp b/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp
index cd509f544..38cf247af 100644
--- a/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp
+++ b/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp
@@ -1,4 +1,4 @@
- // vim:tabstop=2
+// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2010 Hieu Hoang
diff --git a/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.h b/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.h
index 9b186def9..874478cdc 100644
--- a/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.h
+++ b/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.h
@@ -35,7 +35,7 @@ class TargetPhraseCollection;
class DottedRuleStackOnDisk;
/** Implementation of on-disk phrase table for hierarchical/syntax model.
- */
+ */
class PhraseDictionaryOnDisk : public PhraseDictionary
{
typedef PhraseDictionary MyBase;
diff --git a/moses/TranslationModel/RuleTable/Trie.cpp b/moses/TranslationModel/RuleTable/Trie.cpp
index c3590074d..950271d29 100644
--- a/moses/TranslationModel/RuleTable/Trie.cpp
+++ b/moses/TranslationModel/RuleTable/Trie.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -38,7 +38,7 @@ bool RuleTableTrie::InitDictionary()
{
std::auto_ptr<Moses::RuleTableLoader> loader =
- Moses::RuleTableLoaderFactory::Create(m_filePath);
+ Moses::RuleTableLoaderFactory::Create(m_filePath);
if (!loader.get()) {
return false;
}
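
RuleTableTrie::InitDictionary() above receives the loader through std::auto_ptr, which was deprecated in C++11 and removed in C++17 because its copy operations silently transfer ownership. A hypothetical modernized factory (names invented here, not the Moses API) would hand back std::unique_ptr instead:

    #include <memory>

    struct RuleTableLoader {
      virtual ~RuleTableLoader() {}
    };
    struct StandardLoader : RuleTableLoader {};

    // Factory returning exclusive ownership; callers test for null on failure,
    // just as the code above tests loader.get().
    std::unique_ptr<RuleTableLoader> CreateLoader(bool ok)
    {
      if (!ok) return nullptr;
      return std::unique_ptr<RuleTableLoader>(new StandardLoader());
    }

    int main()
    {
      std::unique_ptr<RuleTableLoader> loader = CreateLoader(true);
      return loader ? 0 : 1;
    }
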
diff --git a/moses/TranslationModel/RuleTable/Trie.h b/moses/TranslationModel/RuleTable/Trie.h
index 822ef8b92..c2f757ab8 100644
--- a/moses/TranslationModel/RuleTable/Trie.h
+++ b/moses/TranslationModel/RuleTable/Trie.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -39,28 +39,27 @@ class Word;
*/
class RuleTableTrie : public PhraseDictionary
{
- public:
+public:
RuleTableTrie(const std::string &description, const std::string &line)
- : PhraseDictionary(description, line)
- {}
+ : PhraseDictionary(description, line)
+ {}
virtual ~RuleTableTrie();
bool InitDictionary();
// Required by PhraseDictionary.
- virtual const TargetPhraseCollection *GetTargetPhraseCollection(const Phrase &) const
- {
+ virtual const TargetPhraseCollection *GetTargetPhraseCollection(const Phrase &) const {
CHECK(false);
return NULL;
}
- private:
+private:
friend class RuleTableLoader;
virtual TargetPhraseCollection &GetOrCreateTargetPhraseCollection(
- const Phrase &source, const TargetPhrase &target,
- const Word *sourceLHS) = 0;
+ const Phrase &source, const TargetPhrase &target,
+ const Word *sourceLHS) = 0;
virtual void SortAndPrune() = 0;
diff --git a/moses/TranslationModel/RuleTable/UTrie.cpp b/moses/TranslationModel/RuleTable/UTrie.cpp
index bcfc0d538..17f457f22 100644
--- a/moses/TranslationModel/RuleTable/UTrie.cpp
+++ b/moses/TranslationModel/RuleTable/UTrie.cpp
@@ -39,15 +39,15 @@ namespace Moses
{
TargetPhraseCollection &RuleTableUTrie::GetOrCreateTargetPhraseCollection(
- const Phrase &source, const TargetPhrase &target, const Word *sourceLHS)
+ const Phrase &source, const TargetPhrase &target, const Word *sourceLHS)
{
UTrieNode &currNode = GetOrCreateNode(source, target, sourceLHS);
return currNode.GetOrCreateTargetPhraseCollection(target);
}
UTrieNode &RuleTableUTrie::GetOrCreateNode(const Phrase &source,
- const TargetPhrase &target,
- const Word */*sourceLHS*/)
+ const TargetPhrase &target,
+ const Word */*sourceLHS*/)
{
const size_t size = source.GetSize();
diff --git a/moses/TranslationModel/RuleTable/UTrie.h b/moses/TranslationModel/RuleTable/UTrie.h
index d31e22cc7..a8f218158 100644
--- a/moses/TranslationModel/RuleTable/UTrie.h
+++ b/moses/TranslationModel/RuleTable/UTrie.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -43,21 +43,23 @@ class Word;
*/
class RuleTableUTrie : public RuleTableTrie
{
- public:
+public:
RuleTableUTrie(const std::string &line)
- : RuleTableTrie("RuleTableUTrie", line)
+ : RuleTableTrie("RuleTableUTrie", line)
{}
- const UTrieNode &GetRootNode() const { return m_root; }
+ const UTrieNode &GetRootNode() const {
+ return m_root;
+ }
ChartRuleLookupManager *CreateRuleLookupManager(const InputType &,
- const ChartCellCollectionBase &);
+ const ChartCellCollectionBase &);
- private:
+private:
const TargetPhraseCollection *GetTargetPhraseCollection(const Phrase &) const;
TargetPhraseCollection &GetOrCreateTargetPhraseCollection(
- const Phrase &source, const TargetPhrase &target, const Word *sourceLHS);
+ const Phrase &source, const TargetPhrase &target, const Word *sourceLHS);
UTrieNode &GetOrCreateNode(const Phrase &source, const TargetPhrase &target,
const Word *sourceLHS);
diff --git a/moses/TranslationModel/RuleTable/UTrieNode.cpp b/moses/TranslationModel/RuleTable/UTrieNode.cpp
index d2275422e..725f02c97 100644
--- a/moses/TranslationModel/RuleTable/UTrieNode.cpp
+++ b/moses/TranslationModel/RuleTable/UTrieNode.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -90,7 +90,7 @@ UTrieNode *UTrieNode::GetOrCreateNonTerminalChild(const Word &targetNonTerm)
}
TargetPhraseCollection &UTrieNode::GetOrCreateTargetPhraseCollection(
- const TargetPhrase &target)
+ const TargetPhrase &target)
{
const AlignmentInfo &alignmentInfo = target.GetAlignNonTerm();
const size_t rank = alignmentInfo.GetSize();
diff --git a/moses/TranslationModel/RuleTable/UTrieNode.h b/moses/TranslationModel/RuleTable/UTrieNode.h
index b3d82cddc..436bcbea1 100644
--- a/moses/TranslationModel/RuleTable/UTrieNode.h
+++ b/moses/TranslationModel/RuleTable/UTrieNode.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -42,49 +42,62 @@ class RuleTableUTrie;
//! @todo ask phil williams - whats the diff between this and phrasedictionaryNode
class UTrieNode
{
- public:
+public:
typedef std::vector<std::vector<Word> > LabelTable;
#if defined(BOOST_VERSION) && (BOOST_VERSION >= 104200)
typedef boost::unordered_map<Word,
- UTrieNode,
- TerminalHasher,
- TerminalEqualityPred> TerminalMap;
+ UTrieNode,
+ TerminalHasher,
+ TerminalEqualityPred> TerminalMap;
typedef boost::unordered_map<std::vector<int>,
- TargetPhraseCollection> LabelMap;
+ TargetPhraseCollection> LabelMap;
#else
typedef std::map<Word, UTrieNode> TerminalMap;
typedef std::map<std::vector<int>, TargetPhraseCollection> LabelMap;
#endif
- ~UTrieNode() { delete m_gapNode; }
+ ~UTrieNode() {
+ delete m_gapNode;
+ }
- const LabelTable &GetLabelTable() const { return m_labelTable; }
- const LabelMap &GetLabelMap() const { return m_labelMap; }
- const TerminalMap &GetTerminalMap() const { return m_terminalMap; }
+ const LabelTable &GetLabelTable() const {
+ return m_labelTable;
+ }
+ const LabelMap &GetLabelMap() const {
+ return m_labelMap;
+ }
+ const TerminalMap &GetTerminalMap() const {
+ return m_terminalMap;
+ }
- const UTrieNode *GetNonTerminalChild() const { return m_gapNode; }
+ const UTrieNode *GetNonTerminalChild() const {
+ return m_gapNode;
+ }
UTrieNode *GetOrCreateTerminalChild(const Word &sourceTerm);
UTrieNode *GetOrCreateNonTerminalChild(const Word &targetNonTerm);
TargetPhraseCollection &GetOrCreateTargetPhraseCollection(
- const TargetPhrase &);
+ const TargetPhrase &);
- bool IsLeaf() const { return m_terminalMap.empty() && m_gapNode == NULL; }
+ bool IsLeaf() const {
+ return m_terminalMap.empty() && m_gapNode == NULL;
+ }
- bool HasRules() const { return !m_labelMap.empty(); }
+ bool HasRules() const {
+ return !m_labelMap.empty();
+ }
void Prune(size_t tableLimit);
void Sort(size_t tableLimit);
- private:
+private:
friend class RuleTableUTrie;
UTrieNode() : m_gapNode(NULL) {}
- int InsertLabel(int i, const Word &w)
- {
+ int InsertLabel(int i, const Word &w) {
std::vector<Word> &inner = m_labelTable[i];
for (size_t j = 0; j < inner.size(); ++j) {
if (inner[j] == w) {
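
InsertLabel() above interns each distinct label word into m_labelTable[i] and returns its index, which is what lets LabelMap key target-phrase collections by plain std::vector<int>. A stand-alone sketch of the same find-or-append pattern, with hypothetical names and std::string standing in for Moses::Word:

    #include <cassert>
    #include <string>
    #include <vector>

    // Return the stable index of w in table, appending it on first sight.
    int InsertLabel(std::vector<std::string> &table, const std::string &w)
    {
      for (size_t j = 0; j < table.size(); ++j) {
        if (table[j] == w) return static_cast<int>(j);  // already interned
      }
      table.push_back(w);                               // first occurrence
      return static_cast<int>(table.size()) - 1;
    }

    int main()
    {
      std::vector<std::string> labels;
      assert(InsertLabel(labels, "NP") == 0);
      assert(InsertLabel(labels, "VP") == 1);
      assert(InsertLabel(labels, "NP") == 0);  // stable index on re-insert
      return 0;
    }
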
diff --git a/moses/TranslationModel/Scope3Parser/ApplicableRuleTrie.cpp b/moses/TranslationModel/Scope3Parser/ApplicableRuleTrie.cpp
index ec5c1d8f1..b635dc050 100644
--- a/moses/TranslationModel/Scope3Parser/ApplicableRuleTrie.cpp
+++ b/moses/TranslationModel/Scope3Parser/ApplicableRuleTrie.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -39,7 +39,7 @@ void ApplicableRuleTrie::Extend(const UTrieNode &root, int minPos,
size_t index = *r;
if (index == (size_t)minPos || (followsGap && index > (size_t)minPos) || minPos == -1) {
ApplicableRuleTrie *subTrie = new ApplicableRuleTrie(index, index,
- child);
+ child);
subTrie->Extend(child, index+1, sentMap, false);
m_children.push_back(subTrie);
}
diff --git a/moses/TranslationModel/Scope3Parser/ApplicableRuleTrie.h b/moses/TranslationModel/Scope3Parser/ApplicableRuleTrie.h
index 35243adde..9d2f2cda9 100644
--- a/moses/TranslationModel/Scope3Parser/ApplicableRuleTrie.h
+++ b/moses/TranslationModel/Scope3Parser/ApplicableRuleTrie.h
@@ -33,14 +33,13 @@ struct VarSpanNode;
/** @todo what is this?
*/
-struct ApplicableRuleTrie
-{
- public:
+struct ApplicableRuleTrie {
+public:
ApplicableRuleTrie(int start, int end, const UTrieNode &node)
- : m_start(start)
- , m_end(end)
- , m_node(&node)
- , m_vstNode(NULL) {}
+ : m_start(start)
+ , m_end(end)
+ , m_node(&node)
+ , m_vstNode(NULL) {}
~ApplicableRuleTrie() {
RemoveAllInColl(m_children);
diff --git a/moses/TranslationModel/Scope3Parser/IntermediateVarSpanNode.h b/moses/TranslationModel/Scope3Parser/IntermediateVarSpanNode.h
index 353fabf22..499085127 100644
--- a/moses/TranslationModel/Scope3Parser/IntermediateVarSpanNode.h
+++ b/moses/TranslationModel/Scope3Parser/IntermediateVarSpanNode.h
@@ -26,9 +26,8 @@ namespace Moses
/** @todo what is this?
*/
-struct IntermediateVarSpanNode
-{
- public:
+struct IntermediateVarSpanNode {
+public:
typedef std::pair<int, int> Range;
IntermediateVarSpanNode()
@@ -41,8 +40,12 @@ struct IntermediateVarSpanNode
, m_end(end)
, m_numSplitPoints(0) {}
- bool isOpen() { return m_end.second == -1; }
- bool isClosed() { return !isOpen(); }
+ bool isOpen() {
+ return m_end.second == -1;
+ }
+ bool isClosed() {
+ return !isOpen();
+ }
Range m_start;
Range m_end;
diff --git a/moses/TranslationModel/Scope3Parser/Parser.cpp b/moses/TranslationModel/Scope3Parser/Parser.cpp
index bfcacb1ed..81e156b3d 100644
--- a/moses/TranslationModel/Scope3Parser/Parser.cpp
+++ b/moses/TranslationModel/Scope3Parser/Parser.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -38,8 +38,8 @@ namespace Moses
{
void Scope3Parser::GetChartRuleCollection(
- const WordsRange &range,
- ChartParserCallback &outColl)
+ const WordsRange &range,
+ ChartParserCallback &outColl)
{
const size_t start = range.GetStartPos();
const size_t end = range.GetEndPos();
@@ -122,7 +122,7 @@ void Scope3Parser::InitRuleApplicationVector()
}
void Scope3Parser::FillSentenceMap(
- const Sentence &sent, SentenceMap &sentMap)
+ const Sentence &sent, SentenceMap &sentMap)
{
for (size_t i = 0; i < sent.GetSize(); ++i) {
sentMap[sent.GetWord(i)].push_back(i);
@@ -130,10 +130,10 @@ void Scope3Parser::FillSentenceMap(
}
void Scope3Parser::AddRulesToCells(
- const ApplicableRuleTrie &node,
- std::pair<int, int> start,
- int maxPos,
- int depth)
+ const ApplicableRuleTrie &node,
+ std::pair<int, int> start,
+ int maxPos,
+ int depth)
{
if (depth > 0) {
// Determine the start range for this path if not already known.
@@ -183,7 +183,7 @@ void Scope3Parser::AddRulesToCells(
break;
}
m_ruleApplications[i][span].push_back(std::make_pair(node.m_node,
- node.m_vstNode));
+ node.m_vstNode));
}
}
}
diff --git a/moses/TranslationModel/Scope3Parser/Parser.h b/moses/TranslationModel/Scope3Parser/Parser.h
index 0b5e63d95..2a46de9a8 100644
--- a/moses/TranslationModel/Scope3Parser/Parser.h
+++ b/moses/TranslationModel/Scope3Parser/Parser.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -46,15 +46,14 @@ class WordsRange;
*/
class Scope3Parser : public ChartRuleLookupManager
{
- public:
+public:
Scope3Parser(const InputType &sentence,
const ChartCellCollectionBase &cellColl,
const RuleTableUTrie &ruleTable,
size_t maxChartSpan)
- : ChartRuleLookupManager(sentence, cellColl)
- , m_ruleTable(ruleTable)
- , m_maxChartSpan(maxChartSpan)
- {
+ : ChartRuleLookupManager(sentence, cellColl)
+ , m_ruleTable(ruleTable)
+ , m_maxChartSpan(maxChartSpan) {
Init();
}
@@ -62,23 +61,21 @@ class Scope3Parser : public ChartRuleLookupManager
const WordsRange &range,
ChartParserCallback &outColl);
- private:
+private:
// Define a callback type for use by StackLatticeSearcher.
- struct MatchCallback
- {
- public:
- MatchCallback(const WordsRange &range,
- ChartParserCallback &out)
- : m_range(range)
- , m_out(out)
- , m_tpc(NULL) {}
- void operator()(const StackVec &stackVec)
- {
- m_out.Add(*m_tpc, stackVec, m_range);
- }
- const WordsRange &m_range;
- ChartParserCallback &m_out;
- const TargetPhraseCollection *m_tpc;
+ struct MatchCallback {
+ public:
+ MatchCallback(const WordsRange &range,
+ ChartParserCallback &out)
+ : m_range(range)
+ , m_out(out)
+ , m_tpc(NULL) {}
+ void operator()(const StackVec &stackVec) {
+ m_out.Add(*m_tpc, stackVec, m_range);
+ }
+ const WordsRange &m_range;
+ ChartParserCallback &m_out;
+ const TargetPhraseCollection *m_tpc;
};
void Init();
@@ -89,7 +86,7 @@ class Scope3Parser : public ChartRuleLookupManager
const RuleTableUTrie &m_ruleTable;
std::vector<std::vector<std::vector<
- std::pair<const UTrieNode *, const VarSpanNode *> > > > m_ruleApplications;
+ std::pair<const UTrieNode *, const VarSpanNode *> > > > m_ruleApplications;
std::auto_ptr<VarSpanNode> m_varSpanTrie;
StackVec m_emptyStackVec;
const size_t m_maxChartSpan;
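
MatchCallback above is the functor that StackLatticeSearcher is templated on: Search() invokes operator() once per lattice match, so the call can be inlined rather than dispatched through a virtual interface. A reduced, self-contained sketch of the pattern, with hypothetical types in place of the lattice machinery:

    #include <iostream>
    #include <string>
    #include <vector>

    template <typename MatchCallBackType>
    struct Searcher {
      void Search(const std::vector<std::string> &items,
                  MatchCallBackType &callback) {
        for (size_t i = 0; i < items.size(); ++i) {
          callback(items[i]);  // one call per "match"
        }
      }
    };

    struct PrintCallback {
      void operator()(const std::string &match) {
        std::cout << "match: " << match << "\n";
      }
    };

    int main()
    {
      Searcher<PrintCallback> searcher;
      PrintCallback cb;
      std::vector<std::string> items;
      items.push_back("a");
      items.push_back("b");
      searcher.Search(items, cb);
      return 0;
    }
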
diff --git a/moses/TranslationModel/Scope3Parser/SentenceMap.h b/moses/TranslationModel/Scope3Parser/SentenceMap.h
index 9bc46db93..a7a1fdad9 100644
--- a/moses/TranslationModel/Scope3Parser/SentenceMap.h
+++ b/moses/TranslationModel/Scope3Parser/SentenceMap.h
@@ -29,7 +29,7 @@
namespace Moses
{
typedef boost::unordered_map<Word,
- std::vector<size_t>,
- TerminalHasher,
- TerminalEqualityPred> SentenceMap;
+ std::vector<size_t>,
+ TerminalHasher,
+ TerminalEqualityPred> SentenceMap;
}
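
SentenceMap records, for every word, the list of positions at which it occurs; Scope3Parser::FillSentenceMap() in Parser.cpp above builds it with one push_back per token. The same idea with plain C++11 containers, std::string standing in for Moses::Word and its hasher:

    #include <iostream>
    #include <string>
    #include <unordered_map>
    #include <vector>

    typedef std::unordered_map<std::string, std::vector<size_t> > SentenceMap;

    int main()
    {
      std::vector<std::string> sent;
      sent.push_back("the"); sent.push_back("cat"); sent.push_back("the");

      SentenceMap sentMap;
      for (size_t i = 0; i < sent.size(); ++i) {
        sentMap[sent[i]].push_back(i);  // word -> positions in the sentence
      }
      std::cout << "\"the\" occurs " << sentMap["the"].size() << " times\n";
      return 0;
    }
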
diff --git a/moses/TranslationModel/Scope3Parser/StackLatticeBuilder.cpp b/moses/TranslationModel/Scope3Parser/StackLatticeBuilder.cpp
index bb553a116..26e4e6aca 100644
--- a/moses/TranslationModel/Scope3Parser/StackLatticeBuilder.cpp
+++ b/moses/TranslationModel/Scope3Parser/StackLatticeBuilder.cpp
@@ -28,14 +28,14 @@ namespace Moses
{
void StackLatticeBuilder::Build(
- int start,
- int end,
- const UTrieNode &ruleNode,
- const VarSpanNode &varSpanNode,
- const std::vector<VarSpanNode::NonTermRange> &ranges,
- const ChartRuleLookupManager &manager,
- StackLattice &lattice,
- std::vector<std::vector<bool> > &checkTable)
+ int start,
+ int end,
+ const UTrieNode &ruleNode,
+ const VarSpanNode &varSpanNode,
+ const std::vector<VarSpanNode::NonTermRange> &ranges,
+ const ChartRuleLookupManager &manager,
+ StackLattice &lattice,
+ std::vector<std::vector<bool> > &checkTable)
{
// Extend the lattice if necessary. Do not shrink it.
const size_t span = end - start + 1;
diff --git a/moses/TranslationModel/Scope3Parser/StackLatticeBuilder.h b/moses/TranslationModel/Scope3Parser/StackLatticeBuilder.h
index 7091e8f18..551655e30 100644
--- a/moses/TranslationModel/Scope3Parser/StackLatticeBuilder.h
+++ b/moses/TranslationModel/Scope3Parser/StackLatticeBuilder.h
@@ -32,7 +32,7 @@ class ChartCellCollection;
*/
class StackLatticeBuilder
{
- public:
+public:
StackLatticeBuilder() {}
void Build(int, int, const UTrieNode &, const VarSpanNode &,
diff --git a/moses/TranslationModel/Scope3Parser/StackLatticeSearcher.h b/moses/TranslationModel/Scope3Parser/StackLatticeSearcher.h
index 749a3a2c1..4deac31f8 100644
--- a/moses/TranslationModel/Scope3Parser/StackLatticeSearcher.h
+++ b/moses/TranslationModel/Scope3Parser/StackLatticeSearcher.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -33,22 +33,20 @@ class ChartHypothesisCollection;
template<typename MatchCallBackType>
class StackLatticeSearcher
{
- public:
+public:
StackLatticeSearcher(const StackLattice &lattice,
const std::vector<VarSpanNode::NonTermRange> &ranges)
- : m_lattice(lattice)
- , m_ranges(ranges) {}
+ : m_lattice(lattice)
+ , m_ranges(ranges) {}
- void Search(const std::vector<int> &labels, MatchCallBackType &callback)
- {
+ void Search(const std::vector<int> &labels, MatchCallBackType &callback) {
m_labels = &labels;
m_matchCB = &callback;
SearchInner(0, 0);
}
- private:
- void SearchInner(int start, size_t index)
- {
+private:
+ void SearchInner(int start, size_t index) {
assert(m_stackVec.size() == index);
const VarSpanNode::NonTermRange &range = m_ranges[index];
diff --git a/moses/TranslationModel/Scope3Parser/VarSpanNode.h b/moses/TranslationModel/Scope3Parser/VarSpanNode.h
index 52dc32382..0dda6a787 100644
--- a/moses/TranslationModel/Scope3Parser/VarSpanNode.h
+++ b/moses/TranslationModel/Scope3Parser/VarSpanNode.h
@@ -33,9 +33,8 @@ namespace Moses
/** @todo what is this?
*/
-struct VarSpanNode
-{
- public:
+struct VarSpanNode {
+public:
struct NonTermRange {
size_t s1;
size_t s2;
@@ -48,8 +47,7 @@ struct VarSpanNode
VarSpanNode() : m_parent(0), m_label(0), m_rank(0) {}
- VarSpanNode &Insert(const NodeVec &vec)
- {
+ VarSpanNode &Insert(const NodeVec &vec) {
if (vec.empty()) {
return *this;
}
@@ -59,8 +57,7 @@ struct VarSpanNode
// Given a span, determine the ranges of possible start and end offsets
// for each non-terminal.
void CalculateRanges(int start, int end,
- std::vector<NonTermRange> &ranges) const
- {
+ std::vector<NonTermRange> &ranges) const {
ranges.resize(m_rank);
const VarSpanNode *n = this;
size_t firstIndex = m_rank;
@@ -103,10 +100,9 @@ struct VarSpanNode
size_t m_rank;
MapType m_children;
- private:
+private:
VarSpanNode &Insert(NodeVec::const_iterator first,
- NodeVec::const_iterator last)
- {
+ NodeVec::const_iterator last) {
assert(first != last);
KeyType key;
@@ -117,7 +113,7 @@ struct VarSpanNode
key[4] = first->m_numSplitPoints;
std::pair<MapType::iterator, bool> result = m_children.insert(
- std::make_pair(key, VarSpanNode()));
+ std::make_pair(key, VarSpanNode()));
VarSpanNode &child = result.first->second;
if (result.second) {
child.m_parent = this;
diff --git a/moses/TranslationModel/Scope3Parser/VarSpanTrieBuilder.cpp b/moses/TranslationModel/Scope3Parser/VarSpanTrieBuilder.cpp
index 16b180ea5..35e66978b 100644
--- a/moses/TranslationModel/Scope3Parser/VarSpanTrieBuilder.cpp
+++ b/moses/TranslationModel/Scope3Parser/VarSpanTrieBuilder.cpp
@@ -30,7 +30,7 @@ namespace Moses
{
std::auto_ptr<VarSpanNode> VarSpanTrieBuilder::Build(
- ApplicableRuleTrie &root)
+ ApplicableRuleTrie &root)
{
std::auto_ptr<VarSpanNode> vstRoot(new VarSpanNode());
NodeVec vec;
diff --git a/moses/TranslationModel/Scope3Parser/VarSpanTrieBuilder.h b/moses/TranslationModel/Scope3Parser/VarSpanTrieBuilder.h
index 13c701b4f..2513a2878 100644
--- a/moses/TranslationModel/Scope3Parser/VarSpanTrieBuilder.h
+++ b/moses/TranslationModel/Scope3Parser/VarSpanTrieBuilder.h
@@ -34,13 +34,12 @@ struct VarSpanNode;
*/
class VarSpanTrieBuilder
{
- public:
+public:
std::auto_ptr<VarSpanNode> Build(ApplicableRuleTrie &);
- private:
+private:
typedef std::vector<IntermediateVarSpanNode> NodeVec;
- struct NodeVecState
- {
+ struct NodeVecState {
std::size_t m_size;
IntermediateVarSpanNode m_lastNode;
};
diff --git a/moses/TranslationModel/fuzzy-match/Alignments.cpp b/moses/TranslationModel/fuzzy-match/Alignments.cpp
index f15d82a5e..142aff251 100644
--- a/moses/TranslationModel/fuzzy-match/Alignments.cpp
+++ b/moses/TranslationModel/fuzzy-match/Alignments.cpp
@@ -8,12 +8,11 @@ using namespace std;
using namespace Moses;
Alignments::Alignments(const std::string &str, size_t sourceSize, size_t targetSize)
-:m_alignS2T(sourceSize)
-,m_alignT2S(targetSize)
+ :m_alignS2T(sourceSize)
+ ,m_alignT2S(targetSize)
{
vector<string> toks = Tokenize(str, " ");
- for (size_t i = 0; i < toks.size(); ++i)
- {
+ for (size_t i = 0; i < toks.size(); ++i) {
string &tok = toks[i];
vector<int> point = Tokenize<int>(tok, "-");
@@ -25,20 +24,18 @@ Alignments::Alignments(const std::string &str, size_t sourceSize, size_t targetS
std::map<int, int> &targets = m_alignS2T[ point[0] ];
iter = targets.find(point[1]);
if (iter == targets .end()) {
- targets[ point[1] ] = 0;
- }
- else {
- ++(iter->second);
+ targets[ point[1] ] = 0;
+ } else {
+ ++(iter->second);
}
// m_alignedToS
std::map<int, int> &sources = m_alignT2S[ point[1] ];
iter = sources.find(point[0]);
if (iter == targets .end()) {
- sources[ point[0] ] = 0;
- }
- else {
- ++(iter->second);
+ sources[ point[0] ] = 0;
+ } else {
+ ++(iter->second);
}
}
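
The constructor above parses alignment points of the form "s-t" and records each link in both the source-to-target and target-to-source maps. Note that the second lookup, visible in the context lines, compares the sources iterator against targets.end(), which looks like a copy-and-paste slip. A self-contained sketch of the same parse that sidesteps the find/insert branch by using operator[] (which value-initializes missing counts to 0):

    #include <iostream>
    #include <map>
    #include <sstream>
    #include <string>
    #include <vector>

    int main()
    {
      std::string str = "0-0 1-2 2-1";
      size_t sourceSize = 3, targetSize = 3;
      std::vector<std::map<int, int> > alignS2T(sourceSize), alignT2S(targetSize);

      std::istringstream toks(str);
      std::string tok;
      while (toks >> tok) {
        std::istringstream point(tok);
        int s, t;
        char dash;
        point >> s >> dash >> t;   // parse "s-t"
        ++alignS2T[s][t];          // missing entries start at 0
        ++alignT2S[t][s];
      }
      std::cout << "links from source word 1: " << alignS2T[1].size() << "\n";
      return 0;
    }
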
diff --git a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
index 065368ca7..a4264f6a4 100644
--- a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
+++ b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
@@ -17,10 +17,10 @@
using namespace std;
-namespace tmmt
+namespace tmmt
{
- FuzzyMatchWrapper::FuzzyMatchWrapper(const std::string &sourcePath, const std::string &targetPath, const std::string &alignmentPath)
+FuzzyMatchWrapper::FuzzyMatchWrapper(const std::string &sourcePath, const std::string &targetPath, const std::string &alignmentPath)
:basic_flag(false)
,lsed_flag(true)
,refined_flag(true)
@@ -30,790 +30,735 @@ namespace tmmt
,multiple_flag(true)
,multiple_slack(0)
,multiple_max(100)
- {
- cerr << "creating suffix array" << endl;
- suffixArray = new tmmt::SuffixArray( sourcePath );
+{
+ cerr << "creating suffix array" << endl;
+ suffixArray = new tmmt::SuffixArray( sourcePath );
- //cerr << "loading source data" << endl;
- //load_corpus(sourcePath, source);
+ //cerr << "loading source data" << endl;
+ //load_corpus(sourcePath, source);
- cerr << "loading target data" << endl;
- load_target(targetPath, targetAndAlignment);
+ cerr << "loading target data" << endl;
+ load_target(targetPath, targetAndAlignment);
- cerr << "loading alignment" << endl;
- load_alignment(alignmentPath, targetAndAlignment);
+ cerr << "loading alignment" << endl;
+ load_alignment(alignmentPath, targetAndAlignment);
- // create suffix array
- //load_corpus(m_config[0], input);
-
- cerr << "loading completed" << endl;
- }
+ // create suffix array
+ //load_corpus(m_config[0], input);
+
+ cerr << "loading completed" << endl;
+}
- string FuzzyMatchWrapper::Extract(long translationId, const string &dirNameStr)
- {
- const Moses::StaticData &staticData = Moses::StaticData::Instance();
+string FuzzyMatchWrapper::Extract(long translationId, const string &dirNameStr)
+{
+ const Moses::StaticData &staticData = Moses::StaticData::Instance();
+
+ WordIndex wordIndex;
- WordIndex wordIndex;
+ string fuzzyMatchFile = ExtractTM(wordIndex, translationId, dirNameStr);
- string fuzzyMatchFile = ExtractTM(wordIndex, translationId, dirNameStr);
-
- // create extrac files
- create_xml(fuzzyMatchFile);
+  // create extract files
+ create_xml(fuzzyMatchFile);
- // create phrase table with usual Moses scoring and consolidate programs
- string cmd;
- cmd = "LC_ALL=C sort " + fuzzyMatchFile + ".extract | gzip -c > "
- + fuzzyMatchFile + ".extract.sorted.gz";
- system(cmd.c_str());
- cmd = "LC_ALL=C sort " + fuzzyMatchFile + ".extract.inv | gzip -c > "
- + fuzzyMatchFile + ".extract.inv.sorted.gz";
- system(cmd.c_str());
+ // create phrase table with usual Moses scoring and consolidate programs
+ string cmd;
+ cmd = "LC_ALL=C sort " + fuzzyMatchFile + ".extract | gzip -c > "
+ + fuzzyMatchFile + ".extract.sorted.gz";
+ system(cmd.c_str());
+ cmd = "LC_ALL=C sort " + fuzzyMatchFile + ".extract.inv | gzip -c > "
+ + fuzzyMatchFile + ".extract.inv.sorted.gz";
+ system(cmd.c_str());
#ifdef IS_XCODE
- cmd = "/Users/hieuhoang/unison/workspace/github/moses-smt/bin";
+ cmd = "/Users/hieuhoang/unison/workspace/github/moses-smt/bin";
#elif IS_ECLIPSE
- cmd = "/home/hieu/workspace/github/moses-smt/bin";
+ cmd = "/home/hieu/workspace/github/moses-smt/bin";
#else
- cmd = staticData.GetBinDirectory();
+ cmd = staticData.GetBinDirectory();
#endif
- cmd += string("/../scripts/training/train-model.perl -dont-zip -first-step 6 -last-step 6 -f en -e fr -hierarchical ")
- + " -extract-file " + fuzzyMatchFile + ".extract -lexical-file - -score-options \"--NoLex\" "
- + " -phrase-translation-table " + fuzzyMatchFile + ".pt";
- system(cmd.c_str());
+ cmd += string("/../scripts/training/train-model.perl -dont-zip -first-step 6 -last-step 6 -f en -e fr -hierarchical ")
+ + " -extract-file " + fuzzyMatchFile + ".extract -lexical-file - -score-options \"--NoLex\" "
+ + " -phrase-translation-table " + fuzzyMatchFile + ".pt";
+ system(cmd.c_str());
+
+
+ return fuzzyMatchFile + ".pt.gz";
+}
+
+string FuzzyMatchWrapper::ExtractTM(WordIndex &wordIndex, long translationId, const string &dirNameStr)
+{
+ const std::vector< std::vector< WORD_ID > > &source = suffixArray->GetCorpus();
+
+ string inputPath = dirNameStr + "/in";
+ string fuzzyMatchFile = dirNameStr + "/fuzzyMatchFile";
+ ofstream fuzzyMatchStream(fuzzyMatchFile.c_str());
+
+ vector< vector< WORD_ID > > input;
+ load_corpus(inputPath, input);
+
+ assert(input.size() == 1);
+ size_t sentenceInd = 0;
+
+ clock_t start_clock = clock();
+ // if (i % 10 == 0) cerr << ".";
+
+ // establish some basic statistics
+ // int input_length = compute_length( input[i] );
+ int input_length = input[sentenceInd].size();
+ int best_cost = input_length * (100-min_match) / 100 + 1;
- return fuzzyMatchFile + ".pt.gz";
+ int match_count = 0; // how many substring matches to be considered
+ //cerr << endl << "sentence " << i << ", length " << input_length << ", best_cost " << best_cost << endl;
+
+ // find match ranges in suffix array
+ vector< vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > > match_range;
+ for(size_t start=0; start<input[sentenceInd].size(); start++) {
+ SuffixArray::INDEX prior_first_match = 0;
+ SuffixArray::INDEX prior_last_match = suffixArray->GetSize()-1;
+ vector< string > substring;
+ bool stillMatched = true;
+ vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > matchedAtThisStart;
+ //cerr << "start: " << start;
+ for(int word=start; stillMatched && word<input[sentenceInd].size(); word++) {
+ substring.push_back( GetVocabulary().GetWord( input[sentenceInd][word] ) );
+
+ // only look up, if needed (i.e. no unnecessary short gram lookups)
+ // if (! word-start+1 <= short_match_max_length( input_length ) )
+ // {
+ SuffixArray::INDEX first_match, last_match;
+ stillMatched = false;
+ if (suffixArray->FindMatches( substring, first_match, last_match, prior_first_match, prior_last_match ) ) {
+ stillMatched = true;
+ matchedAtThisStart.push_back( make_pair( first_match, last_match ) );
+ //cerr << " (" << first_match << "," << last_match << ")";
+ //cerr << " " << ( last_match - first_match + 1 );
+ prior_first_match = first_match;
+ prior_last_match = last_match;
+ }
+ //}
+ }
+ //cerr << endl;
+ match_range.push_back( matchedAtThisStart );
}
-
- string FuzzyMatchWrapper::ExtractTM(WordIndex &wordIndex, long translationId, const string &dirNameStr)
- {
- const std::vector< std::vector< WORD_ID > > &source = suffixArray->GetCorpus();
-
- string inputPath = dirNameStr + "/in";
- string fuzzyMatchFile = dirNameStr + "/fuzzyMatchFile";
- ofstream fuzzyMatchStream(fuzzyMatchFile.c_str());
-
- vector< vector< WORD_ID > > input;
- load_corpus(inputPath, input);
-
- assert(input.size() == 1);
- size_t sentenceInd = 0;
-
- clock_t start_clock = clock();
- // if (i % 10 == 0) cerr << ".";
-
- // establish some basic statistics
-
- // int input_length = compute_length( input[i] );
- int input_length = input[sentenceInd].size();
- int best_cost = input_length * (100-min_match) / 100 + 1;
-
- int match_count = 0; // how many substring matches to be considered
- //cerr << endl << "sentence " << i << ", length " << input_length << ", best_cost " << best_cost << endl;
-
- // find match ranges in suffix array
- vector< vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > > match_range;
- for(size_t start=0;start<input[sentenceInd].size();start++)
- {
- SuffixArray::INDEX prior_first_match = 0;
- SuffixArray::INDEX prior_last_match = suffixArray->GetSize()-1;
- vector< string > substring;
- bool stillMatched = true;
- vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > matchedAtThisStart;
- //cerr << "start: " << start;
- for(int word=start; stillMatched && word<input[sentenceInd].size(); word++)
- {
- substring.push_back( GetVocabulary().GetWord( input[sentenceInd][word] ) );
-
- // only look up, if needed (i.e. no unnecessary short gram lookups)
- // if (! word-start+1 <= short_match_max_length( input_length ) )
- // {
- SuffixArray::INDEX first_match, last_match;
- stillMatched = false;
- if (suffixArray->FindMatches( substring, first_match, last_match, prior_first_match, prior_last_match ) )
- {
- stillMatched = true;
- matchedAtThisStart.push_back( make_pair( first_match, last_match ) );
- //cerr << " (" << first_match << "," << last_match << ")";
- //cerr << " " << ( last_match - first_match + 1 );
- prior_first_match = first_match;
- prior_last_match = last_match;
- }
- //}
- }
- //cerr << endl;
- match_range.push_back( matchedAtThisStart );
- }
-
- clock_t clock_range = clock();
-
- map< int, vector< Match > > sentence_match;
- map< int, int > sentence_match_word_count;
-
- // go through all matches, longest first
- for(int length = input[sentenceInd].size(); length >= 1; length--)
- {
- // do not create matches, if these are handled by the short match function
- if (length <= short_match_max_length( input_length ) )
- {
- continue;
- }
-
- unsigned int count = 0;
- for(int start = 0; start <= input[sentenceInd].size() - length; start++)
- {
- if (match_range[start].size() >= length)
- {
- pair< SuffixArray::INDEX, SuffixArray::INDEX > &range = match_range[start][length-1];
- // cerr << " (" << range.first << "," << range.second << ")";
- count += range.second - range.first + 1;
-
- for(SuffixArray::INDEX i=range.first; i<=range.second; i++)
- {
- int position = suffixArray->GetPosition( i );
-
- // sentence length mismatch
- size_t sentence_id = suffixArray->GetSentence( position );
- int sentence_length = suffixArray->GetSentenceLength( sentence_id );
- int diff = abs( (int)sentence_length - (int)input_length );
- // cerr << endl << i << "\tsentence " << sentence_id << ", length " << sentence_length;
- //if (length <= 2 && input_length>=5 &&
- // sentence_match.find( sentence_id ) == sentence_match.end())
- // continue;
-
- if (diff > best_cost)
- continue;
-
- // compute minimal cost
- int start_pos = suffixArray->GetWordInSentence( position );
- int end_pos = start_pos + length-1;
- // cerr << endl << "\t" << start_pos << "-" << end_pos << " (" << sentence_length << ") vs. "
- // << start << "-" << (start+length-1) << " (" << input_length << ")";
- // different number of prior words -> cost is at least diff
- int min_cost = abs( start - start_pos );
-
- // same number of words, but not sent. start -> cost is at least 1
- if (start == start_pos && start>0)
- min_cost++;
-
- // different number of remaining words -> cost is at least diff
- min_cost += abs( ( sentence_length-1 - end_pos ) -
- ( input_length-1 - (start+length-1) ) );
-
- // same number of words, but not sent. end -> cost is at least 1
- if ( sentence_length-1 - end_pos ==
- input_length-1 - (start+length-1)
- && end_pos != sentence_length-1 )
- min_cost++;
-
- // cerr << " -> min_cost " << min_cost;
- if (min_cost > best_cost)
- continue;
-
- // valid match
- match_count++;
-
- // compute maximal cost
- int max_cost = max( start, start_pos )
- + max( sentence_length-1 - end_pos,
- input_length-1 - (start+length-1) );
- // cerr << ", max_cost " << max_cost;
-
- Match m = Match( start, start+length-1,
- start_pos, start_pos+length-1,
- min_cost, max_cost, 0);
- sentence_match[ sentence_id ].push_back( m );
- sentence_match_word_count[ sentence_id ] += length;
-
- if (max_cost < best_cost)
- {
- best_cost = max_cost;
- if (best_cost == 0) break;
- }
- //if (match_count >= MAX_MATCH_COUNT) break;
- }
- }
- // cerr << endl;
- if (best_cost == 0) break;
- //if (match_count >= MAX_MATCH_COUNT) break;
- }
- // cerr << count << " matches at length " << length << " in " << sentence_match.size() << " tm." << endl;
-
- if (best_cost == 0) break;
- //if (match_count >= MAX_MATCH_COUNT) break;
- }
- cerr << match_count << " matches in " << sentence_match.size() << " sentences." << endl;
-
- clock_t clock_matches = clock();
-
- // consider each sentence for which we have matches
- int old_best_cost = best_cost;
- int tm_count_word_match = 0;
- int tm_count_word_match2 = 0;
- int pruned_match_count = 0;
- if (short_match_max_length( input_length ))
- {
- init_short_matches(wordIndex, translationId, input[sentenceInd] );
- }
- vector< int > best_tm;
- typedef map< int, vector< Match > >::iterator I;
-
- clock_t clock_validation_sum = 0;
-
- for(I tm=sentence_match.begin(); tm!=sentence_match.end(); tm++)
- {
- int tmID = tm->first;
- int tm_length = suffixArray->GetSentenceLength(tmID);
- vector< Match > &match = tm->second;
- add_short_matches(wordIndex, translationId, match, source[tmID], input_length, best_cost );
-
- //cerr << "match in sentence " << tmID << ": " << match.size() << " [" << tm_length << "]" << endl;
-
- // quick look: how many words are matched
- int words_matched = 0;
- for(int m=0;m<match.size();m++) {
-
- if (match[m].min_cost <= best_cost) // makes no difference
- words_matched += match[m].input_end - match[m].input_start + 1;
- }
- if (max(input_length,tm_length) - words_matched > best_cost)
- {
- if (length_filter_flag) continue;
- }
- tm_count_word_match++;
-
- // prune, check again how many words are matched
- vector< Match > pruned = prune_matches( match, best_cost );
- words_matched = 0;
- for(int p=0;p<pruned.size();p++) {
- words_matched += pruned[p].input_end - pruned[p].input_start + 1;
- }
- if (max(input_length,tm_length) - words_matched > best_cost)
- {
- if (length_filter_flag) continue;
- }
- tm_count_word_match2++;
-
- pruned_match_count += pruned.size();
- int prior_best_cost = best_cost;
- int cost;
-
- clock_t clock_validation_start = clock();
- if (! parse_flag ||
- pruned.size()>=10) // to prevent worst cases
- {
- string path;
- cost = sed( input[sentenceInd], source[tmID], path, false );
- if (cost < best_cost)
- {
- best_cost = cost;
- }
- }
-
- else
- {
- cost = parse_matches( pruned, input_length, tm_length, best_cost );
- if (prior_best_cost != best_cost)
- {
- best_tm.clear();
- }
- }
- clock_validation_sum += clock() - clock_validation_start;
- if (cost == best_cost)
- {
- best_tm.push_back( tmID );
- }
- }
- cerr << "reduced best cost from " << old_best_cost << " to " << best_cost << endl;
- cerr << "tm considered: " << sentence_match.size()
- << " word-matched: " << tm_count_word_match
- << " word-matched2: " << tm_count_word_match2
- << " best: " << best_tm.size() << endl;
-
- cerr << "pruned matches: " << ((float)pruned_match_count/(float)tm_count_word_match2) << endl;
-
- // create xml and extract files
- string inputStr, sourceStr;
- for (size_t pos = 0; pos < input_length; ++pos) {
- inputStr += GetVocabulary().GetWord(input[sentenceInd][pos]) + " ";
+
+ clock_t clock_range = clock();
+
+ map< int, vector< Match > > sentence_match;
+ map< int, int > sentence_match_word_count;
+
+ // go through all matches, longest first
+ for(int length = input[sentenceInd].size(); length >= 1; length--) {
+ // do not create matches, if these are handled by the short match function
+ if (length <= short_match_max_length( input_length ) ) {
+ continue;
}
-
- // do not try to find the best ... report multiple matches
- if (multiple_flag) {
- int input_letter_length = compute_length( input[sentenceInd] );
- for(int si=0; si<best_tm.size(); si++) {
- int s = best_tm[si];
- string path;
- unsigned int letter_cost = sed( input[sentenceInd], source[s], path, true );
- // do not report multiple identical sentences, but just their count
- //cout << sentenceInd << " "; // sentence number
- //cout << letter_cost << "/" << input_letter_length << " ";
- //cout << "(" << best_cost <<"/" << input_length <<") ";
- //cout << "||| " << s << " ||| " << path << endl;
-
- const vector<WORD_ID> &sourceSentence = source[s];
- vector<SentenceAlignment> &targets = targetAndAlignment[s];
- create_extract(sentenceInd, best_cost, sourceSentence, targets, inputStr, path, fuzzyMatchStream);
-
- }
- } // if (multiple_flag)
- else {
-
- // find the best matches according to letter sed
- string best_path = "";
- int best_match = -1;
- int best_letter_cost;
- if (lsed_flag) {
- best_letter_cost = compute_length( input[sentenceInd] ) * min_match / 100 + 1;
- for(int si=0; si<best_tm.size(); si++)
- {
- int s = best_tm[si];
- string path;
- unsigned int letter_cost = sed( input[sentenceInd], source[s], path, true );
- if (letter_cost < best_letter_cost)
- {
- best_letter_cost = letter_cost;
- best_path = path;
- best_match = s;
+
+ unsigned int count = 0;
+ for(int start = 0; start <= input[sentenceInd].size() - length; start++) {
+ if (match_range[start].size() >= length) {
+ pair< SuffixArray::INDEX, SuffixArray::INDEX > &range = match_range[start][length-1];
+ // cerr << " (" << range.first << "," << range.second << ")";
+ count += range.second - range.first + 1;
+
+ for(SuffixArray::INDEX i=range.first; i<=range.second; i++) {
+ int position = suffixArray->GetPosition( i );
+
+ // sentence length mismatch
+ size_t sentence_id = suffixArray->GetSentence( position );
+ int sentence_length = suffixArray->GetSentenceLength( sentence_id );
+ int diff = abs( (int)sentence_length - (int)input_length );
+ // cerr << endl << i << "\tsentence " << sentence_id << ", length " << sentence_length;
+ //if (length <= 2 && input_length>=5 &&
+ // sentence_match.find( sentence_id ) == sentence_match.end())
+ // continue;
+
+ if (diff > best_cost)
+ continue;
+
+ // compute minimal cost
+ int start_pos = suffixArray->GetWordInSentence( position );
+ int end_pos = start_pos + length-1;
+ // cerr << endl << "\t" << start_pos << "-" << end_pos << " (" << sentence_length << ") vs. "
+ // << start << "-" << (start+length-1) << " (" << input_length << ")";
+ // different number of prior words -> cost is at least diff
+ int min_cost = abs( start - start_pos );
+
+ // same number of words, but not sent. start -> cost is at least 1
+ if (start == start_pos && start>0)
+ min_cost++;
+
+ // different number of remaining words -> cost is at least diff
+ min_cost += abs( ( sentence_length-1 - end_pos ) -
+ ( input_length-1 - (start+length-1) ) );
+
+ // same number of words, but not sent. end -> cost is at least 1
+ if ( sentence_length-1 - end_pos ==
+ input_length-1 - (start+length-1)
+ && end_pos != sentence_length-1 )
+ min_cost++;
+
+ // cerr << " -> min_cost " << min_cost;
+ if (min_cost > best_cost)
+ continue;
+
+ // valid match
+ match_count++;
+
+ // compute maximal cost
+ int max_cost = max( start, start_pos )
+ + max( sentence_length-1 - end_pos,
+ input_length-1 - (start+length-1) );
+ // cerr << ", max_cost " << max_cost;
+
+ Match m = Match( start, start+length-1,
+ start_pos, start_pos+length-1,
+ min_cost, max_cost, 0);
+ sentence_match[ sentence_id ].push_back( m );
+ sentence_match_word_count[ sentence_id ] += length;
+
+ if (max_cost < best_cost) {
+ best_cost = max_cost;
+ if (best_cost == 0) break;
}
+ //if (match_count >= MAX_MATCH_COUNT) break;
}
}
- // if letter sed turned off, just compute path for first match
- else {
- if (best_tm.size() > 0) {
- string path;
- sed( input[sentenceInd], source[best_tm[0]], path, false );
- best_path = path;
- best_match = best_tm[0];
- }
- }
- cerr << "elapsed: " << (1000 * (clock()-start_clock) / CLOCKS_PER_SEC)
- << " ( range: " << (1000 * (clock_range-start_clock) / CLOCKS_PER_SEC)
- << " match: " << (1000 * (clock_matches-clock_range) / CLOCKS_PER_SEC)
- << " tm: " << (1000 * (clock()-clock_matches) / CLOCKS_PER_SEC)
- << " (validation: " << (1000 * (clock_validation_sum) / CLOCKS_PER_SEC) << ")"
- << " )" << endl;
- if (lsed_flag) {
- //cout << best_letter_cost << "/" << compute_length( input[sentenceInd] ) << " (";
- }
- //cout << best_cost <<"/" << input_length;
- if (lsed_flag) {
- //cout << ")";
- }
- //cout << " ||| " << best_match << " ||| " << best_path << endl;
-
- if (best_match == -1) {
- CHECK(source.size());
- best_match = 0;
- }
-
- // creat xml & extracts
- const vector<WORD_ID> &sourceSentence = source[best_match];
- vector<SentenceAlignment> &targets = targetAndAlignment[best_match];
- create_extract(sentenceInd, best_cost, sourceSentence, targets, inputStr, best_path, fuzzyMatchStream);
-
- } // else if (multiple_flag)
-
- fuzzyMatchStream.close();
-
- return fuzzyMatchFile;
+ // cerr << endl;
+ if (best_cost == 0) break;
+ //if (match_count >= MAX_MATCH_COUNT) break;
+ }
+ // cerr << count << " matches at length " << length << " in " << sentence_match.size() << " tm." << endl;
+
+ if (best_cost == 0) break;
+ //if (match_count >= MAX_MATCH_COUNT) break;
}
+ cerr << match_count << " matches in " << sentence_match.size() << " sentences." << endl;
- void FuzzyMatchWrapper::load_corpus( const std::string &fileName, vector< vector< WORD_ID > > &corpus )
- { // source
- ifstream fileStream;
- fileStream.open(fileName.c_str());
- if (!fileStream) {
- cerr << "file not found: " << fileName << endl;
- exit(1);
+ clock_t clock_matches = clock();
+
+ // consider each sentence for which we have matches
+ int old_best_cost = best_cost;
+ int tm_count_word_match = 0;
+ int tm_count_word_match2 = 0;
+ int pruned_match_count = 0;
+ if (short_match_max_length( input_length )) {
+ init_short_matches(wordIndex, translationId, input[sentenceInd] );
+ }
+ vector< int > best_tm;
+ typedef map< int, vector< Match > >::iterator I;
+
+ clock_t clock_validation_sum = 0;
+
+ for(I tm=sentence_match.begin(); tm!=sentence_match.end(); tm++) {
+ int tmID = tm->first;
+ int tm_length = suffixArray->GetSentenceLength(tmID);
+ vector< Match > &match = tm->second;
+ add_short_matches(wordIndex, translationId, match, source[tmID], input_length, best_cost );
+
+ //cerr << "match in sentence " << tmID << ": " << match.size() << " [" << tm_length << "]" << endl;
+
+ // quick look: how many words are matched
+ int words_matched = 0;
+ for(int m=0; m<match.size(); m++) {
+
+ if (match[m].min_cost <= best_cost) // makes no difference
+ words_matched += match[m].input_end - match[m].input_start + 1;
}
- cerr << "loading " << fileName << endl;
-
- istream *fileStreamP = &fileStream;
-
- char line[LINE_MAX_LENGTH];
- while(true)
- {
- SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n');
- if (fileStreamP->eof()) break;
- corpus.push_back( GetVocabulary().Tokenize( line ) );
+ if (max(input_length,tm_length) - words_matched > best_cost) {
+ if (length_filter_flag) continue;
}
- }
-
- void FuzzyMatchWrapper::load_target(const std::string &fileName, vector< vector< SentenceAlignment > > &corpus)
- {
- ifstream fileStream;
- fileStream.open(fileName.c_str());
- if (!fileStream) {
- cerr << "file not found: " << fileName << endl;
- exit(1);
+ tm_count_word_match++;
+
+ // prune, check again how many words are matched
+ vector< Match > pruned = prune_matches( match, best_cost );
+ words_matched = 0;
+ for(int p=0; p<pruned.size(); p++) {
+ words_matched += pruned[p].input_end - pruned[p].input_start + 1;
}
- cerr << "loading " << fileName << endl;
-
- istream *fileStreamP = &fileStream;
-
- WORD_ID delimiter = GetVocabulary().StoreIfNew("|||");
-
- int lineNum = 0;
- char line[LINE_MAX_LENGTH];
- while(true)
- {
- SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n');
- if (fileStreamP->eof()) break;
-
- vector<WORD_ID> toks = GetVocabulary().Tokenize( line );
-
- corpus.push_back(vector< SentenceAlignment >());
- vector< SentenceAlignment > &vec = corpus.back();
-
- vec.push_back(SentenceAlignment());
- SentenceAlignment *sentence = &vec.back();
-
- const WORD &countStr = GetVocabulary().GetWord(toks[0]);
- sentence->count = atoi(countStr.c_str());
-
- for (size_t i = 1; i < toks.size(); ++i) {
- WORD_ID wordId = toks[i];
-
- if (wordId == delimiter) {
- // target and alignments can have multiple sentences.
- vec.push_back(SentenceAlignment());
- sentence = &vec.back();
-
- // count
- ++i;
-
- const WORD &countStr = GetVocabulary().GetWord(toks[i]);
- sentence->count = atoi(countStr.c_str());
- }
- else {
- // just a normal word, add
- sentence->target.push_back(wordId);
- }
+ if (max(input_length,tm_length) - words_matched > best_cost) {
+ if (length_filter_flag) continue;
+ }
+ tm_count_word_match2++;
+
+ pruned_match_count += pruned.size();
+ int prior_best_cost = best_cost;
+ int cost;
+
+ clock_t clock_validation_start = clock();
+ if (! parse_flag ||
+ pruned.size()>=10) { // to prevent worst cases
+ string path;
+ cost = sed( input[sentenceInd], source[tmID], path, false );
+ if (cost < best_cost) {
+ best_cost = cost;
+ }
+ }
+
+ else {
+ cost = parse_matches( pruned, input_length, tm_length, best_cost );
+ if (prior_best_cost != best_cost) {
+ best_tm.clear();
}
-
- ++lineNum;
-
}
-
+ clock_validation_sum += clock() - clock_validation_start;
+ if (cost == best_cost) {
+ best_tm.push_back( tmID );
+ }
+ }
+ cerr << "reduced best cost from " << old_best_cost << " to " << best_cost << endl;
+ cerr << "tm considered: " << sentence_match.size()
+ << " word-matched: " << tm_count_word_match
+ << " word-matched2: " << tm_count_word_match2
+ << " best: " << best_tm.size() << endl;
+
+ cerr << "pruned matches: " << ((float)pruned_match_count/(float)tm_count_word_match2) << endl;
+
+ // create xml and extract files
+ string inputStr, sourceStr;
+ for (size_t pos = 0; pos < input_length; ++pos) {
+ inputStr += GetVocabulary().GetWord(input[sentenceInd][pos]) + " ";
}
-
-
- void FuzzyMatchWrapper::load_alignment(const std::string &fileName, vector< vector< SentenceAlignment > > &corpus )
- {
- ifstream fileStream;
- fileStream.open(fileName.c_str());
- if (!fileStream) {
- cerr << "file not found: " << fileName << endl;
- exit(1);
+
+ // do not try to find the best ... report multiple matches
+ if (multiple_flag) {
+ int input_letter_length = compute_length( input[sentenceInd] );
+ for(int si=0; si<best_tm.size(); si++) {
+ int s = best_tm[si];
+ string path;
+ unsigned int letter_cost = sed( input[sentenceInd], source[s], path, true );
+ // do not report multiple identical sentences, but just their count
+ //cout << sentenceInd << " "; // sentence number
+ //cout << letter_cost << "/" << input_letter_length << " ";
+ //cout << "(" << best_cost <<"/" << input_length <<") ";
+ //cout << "||| " << s << " ||| " << path << endl;
+
+ const vector<WORD_ID> &sourceSentence = source[s];
+ vector<SentenceAlignment> &targets = targetAndAlignment[s];
+ create_extract(sentenceInd, best_cost, sourceSentence, targets, inputStr, path, fuzzyMatchStream);
+
}
- cerr << "loading " << fileName << endl;
-
- istream *fileStreamP = &fileStream;
-
- string delimiter = "|||";
-
- int lineNum = 0;
- char line[LINE_MAX_LENGTH];
- while(true)
- {
- SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n');
- if (fileStreamP->eof()) break;
-
- vector< SentenceAlignment > &vec = corpus[lineNum];
- size_t targetInd = 0;
- SentenceAlignment *sentence = &vec[targetInd];
-
- vector<string> toks = Moses::Tokenize(line);
-
- for (size_t i = 0; i < toks.size(); ++i) {
- string &tok = toks[i];
-
- if (tok == delimiter) {
- // target and alignments can have multiple sentences.
- ++targetInd;
- sentence = &vec[targetInd];
-
- ++i;
- }
- else {
- // just a normal alignment, add
- vector<int> alignPoint = Moses::Tokenize<int>(tok, "-");
- assert(alignPoint.size() == 2);
- sentence->alignment.push_back(pair<int,int>(alignPoint[0], alignPoint[1]));
+ } // if (multiple_flag)
+ else {
+
+ // find the best matches according to letter sed
+ string best_path = "";
+ int best_match = -1;
+ int best_letter_cost;
+ if (lsed_flag) {
+ best_letter_cost = compute_length( input[sentenceInd] ) * min_match / 100 + 1;
+ for(int si=0; si<best_tm.size(); si++) {
+ int s = best_tm[si];
+ string path;
+ unsigned int letter_cost = sed( input[sentenceInd], source[s], path, true );
+ if (letter_cost < best_letter_cost) {
+ best_letter_cost = letter_cost;
+ best_path = path;
+ best_match = s;
}
}
-
- ++lineNum;
-
}
+ // if letter sed turned off, just compute path for first match
+ else {
+ if (best_tm.size() > 0) {
+ string path;
+ sed( input[sentenceInd], source[best_tm[0]], path, false );
+ best_path = path;
+ best_match = best_tm[0];
+ }
+ }
+ cerr << "elapsed: " << (1000 * (clock()-start_clock) / CLOCKS_PER_SEC)
+ << " ( range: " << (1000 * (clock_range-start_clock) / CLOCKS_PER_SEC)
+ << " match: " << (1000 * (clock_matches-clock_range) / CLOCKS_PER_SEC)
+ << " tm: " << (1000 * (clock()-clock_matches) / CLOCKS_PER_SEC)
+ << " (validation: " << (1000 * (clock_validation_sum) / CLOCKS_PER_SEC) << ")"
+ << " )" << endl;
+ if (lsed_flag) {
+ //cout << best_letter_cost << "/" << compute_length( input[sentenceInd] ) << " (";
+ }
+ //cout << best_cost <<"/" << input_length;
+ if (lsed_flag) {
+ //cout << ")";
+ }
+ //cout << " ||| " << best_match << " ||| " << best_path << endl;
+
+ if (best_match == -1) {
+ CHECK(source.size());
+ best_match = 0;
+ }
+
+  // create xml & extract files
+ const vector<WORD_ID> &sourceSentence = source[best_match];
+ vector<SentenceAlignment> &targets = targetAndAlignment[best_match];
+ create_extract(sentenceInd, best_cost, sourceSentence, targets, inputStr, best_path, fuzzyMatchStream);
+
+ } // else if (multiple_flag)
+
+ fuzzyMatchStream.close();
+
+ return fuzzyMatchFile;
+}
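
The min_cost/max_cost arithmetic above is the filter that makes ExtractTM fast: from the boundary positions alone it derives a lower and an upper bound on the word edit distance of each suffix-array hit, so most translation-memory sentences are discarded before any full edit-distance run. A minimal standalone sketch of those bounds (illustrative only; helper name invented for the example):

#include <algorithm>
#include <cstdlib>

// Bounds on the full edit distance implied by a single n-gram match that
// covers input[start..start+len-1] and tm[start_pos..start_pos+len-1].
// Mirrors the min_cost/max_cost logic in ExtractTM above (sketch only).
void matchCostBounds(int start, int start_pos, int len,
                     int input_length, int tm_length,
                     int &min_cost, int &max_cost)
{
  int input_end = start + len - 1;
  int tm_end    = start_pos + len - 1;

  // words before the match: at least the difference, at most the larger count
  min_cost = std::abs(start - start_pos);
  if (start == start_pos && start > 0) min_cost++;  // same count, not sentence start

  // words after the match
  int in_tail = input_length - 1 - input_end;
  int tm_tail = tm_length - 1 - tm_end;
  min_cost += std::abs(in_tail - tm_tail);
  if (in_tail == tm_tail && tm_end != tm_length - 1) min_cost++;  // not sentence end

  max_cost = std::max(start, start_pos) + std::max(in_tail, tm_tail);
}
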
+
+void FuzzyMatchWrapper::load_corpus( const std::string &fileName, vector< vector< WORD_ID > > &corpus )
+{
+ // source
+ ifstream fileStream;
+ fileStream.open(fileName.c_str());
+ if (!fileStream) {
+ cerr << "file not found: " << fileName << endl;
+ exit(1);
}
-
- bool FuzzyMatchWrapper::GetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, unsigned int &value) const
- {
-#ifdef WITH_THREADS
- boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
-#endif
- map< pair< WORD_ID, WORD_ID >, unsigned int >::const_iterator lookup = m_lsed.find( key );
- if (lookup != m_lsed.end()) {
- value = lookup->second;
- return true;
+ cerr << "loading " << fileName << endl;
+
+ istream *fileStreamP = &fileStream;
+
+ char line[LINE_MAX_LENGTH];
+ while(true) {
+ SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n');
+ if (fileStreamP->eof()) break;
+ corpus.push_back( GetVocabulary().Tokenize( line ) );
+ }
+}
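
load_corpus reads one tokenized sentence per line through the fixed SAFE_GETLINE buffer. An equivalent sketch with std::getline, which drops the LINE_MAX_LENGTH cap (illustrative only, assuming the tmmt Vocabulary/WORD_ID types above and that Vocabulary::Tokenize accepts a C string, as the call above suggests):

#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

// Hypothetical std::getline variant of load_corpus (sketch, not the
// wrapper's actual code): no fixed-length line buffer required.
void loadCorpusSketch(const std::string &fileName, Vocabulary &vcb,
                      std::vector< std::vector< WORD_ID > > &corpus)
{
  std::ifstream in(fileName.c_str());
  if (!in) {
    std::cerr << "file not found: " << fileName << std::endl;
    std::exit(1);
  }
  std::string line;
  while (std::getline(in, line)) {
    corpus.push_back(vcb.Tokenize(line.c_str()));
  }
}
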
+
+void FuzzyMatchWrapper::load_target(const std::string &fileName, vector< vector< SentenceAlignment > > &corpus)
+{
+ ifstream fileStream;
+ fileStream.open(fileName.c_str());
+ if (!fileStream) {
+ cerr << "file not found: " << fileName << endl;
+ exit(1);
+ }
+ cerr << "loading " << fileName << endl;
+
+ istream *fileStreamP = &fileStream;
+
+ WORD_ID delimiter = GetVocabulary().StoreIfNew("|||");
+
+ int lineNum = 0;
+ char line[LINE_MAX_LENGTH];
+ while(true) {
+ SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n');
+ if (fileStreamP->eof()) break;
+
+ vector<WORD_ID> toks = GetVocabulary().Tokenize( line );
+
+ corpus.push_back(vector< SentenceAlignment >());
+ vector< SentenceAlignment > &vec = corpus.back();
+
+ vec.push_back(SentenceAlignment());
+ SentenceAlignment *sentence = &vec.back();
+
+ const WORD &countStr = GetVocabulary().GetWord(toks[0]);
+ sentence->count = atoi(countStr.c_str());
+
+ for (size_t i = 1; i < toks.size(); ++i) {
+ WORD_ID wordId = toks[i];
+
+ if (wordId == delimiter) {
+ // target and alignments can have multiple sentences.
+ vec.push_back(SentenceAlignment());
+ sentence = &vec.back();
+
+ // count
+ ++i;
+
+ const WORD &countStr = GetVocabulary().GetWord(toks[i]);
+ sentence->count = atoi(countStr.c_str());
+ } else {
+ // just a normal word, add
+ sentence->target.push_back(wordId);
+ }
+ }
+
+ ++lineNum;
+
+ }
+
+}
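
Working backwards from the parsing loop, each line of the target file holds one or more translation alternatives, each prefixed by its count and separated by |||. A hypothetical line and the structure it produces:

// Hypothetical target-file line, as load_target expects it:
//
//   3 das haus ist klein ||| 1 das haus ist winzig
//
// produces corpus.back() with two SentenceAlignment entries:
//   vec[0].count == 3, vec[0].target == { das, haus, ist, klein }
//   vec[1].count == 1, vec[1].target == { das, haus, ist, winzig }
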
+
+
+void FuzzyMatchWrapper::load_alignment(const std::string &fileName, vector< vector< SentenceAlignment > > &corpus )
+{
+ ifstream fileStream;
+ fileStream.open(fileName.c_str());
+ if (!fileStream) {
+ cerr << "file not found: " << fileName << endl;
+ exit(1);
+ }
+ cerr << "loading " << fileName << endl;
+
+ istream *fileStreamP = &fileStream;
+
+ string delimiter = "|||";
+
+ int lineNum = 0;
+ char line[LINE_MAX_LENGTH];
+ while(true) {
+ SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n');
+ if (fileStreamP->eof()) break;
+
+ vector< SentenceAlignment > &vec = corpus[lineNum];
+ size_t targetInd = 0;
+ SentenceAlignment *sentence = &vec[targetInd];
+
+ vector<string> toks = Moses::Tokenize(line);
+
+ for (size_t i = 0; i < toks.size(); ++i) {
+ string &tok = toks[i];
+
+ if (tok == delimiter) {
+ // target and alignments can have multiple sentences.
+ ++targetInd;
+ sentence = &vec[targetInd];
+
+ ++i;
+ } else {
+ // just a normal alignment, add
+ vector<int> alignPoint = Moses::Tokenize<int>(tok, "-");
+ assert(alignPoint.size() == 2);
+ sentence->alignment.push_back(pair<int,int>(alignPoint[0], alignPoint[1]));
+ }
}
- return false;
+ ++lineNum;
+
}
+}
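
load_alignment runs in lockstep with load_target: the i-th alignment line decorates the i-th target line, with ||| separating the alternatives and every token an "a-b" source-target index pair. Continuing the hypothetical example above:

// Matching hypothetical alignment line:
//
//   0-0 1-1 2-2 3-3 ||| 0-0 1-1 2-2 3-3
//
// each "a-b" token becomes pair<int,int>(a, b), so
//   vec[0].alignment == { (0,0), (1,1), (2,2), (3,3) }
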
- void FuzzyMatchWrapper::SetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, const unsigned int &value)
- {
+bool FuzzyMatchWrapper::GetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, unsigned int &value) const
+{
#ifdef WITH_THREADS
- boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
+ boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
#endif
- m_lsed[ key ] = value;
+ map< pair< WORD_ID, WORD_ID >, unsigned int >::const_iterator lookup = m_lsed.find( key );
+ if (lookup != m_lsed.end()) {
+ value = lookup->second;
+ return true;
}
+ return false;
+}
+
+void FuzzyMatchWrapper::SetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, const unsigned int &value)
+{
+#ifdef WITH_THREADS
+ boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
+#endif
+ m_lsed[ key ] = value;
+}
+
/* Letter string edit distance, e.g. sub 'their' to 'there' costs 2 */
unsigned int FuzzyMatchWrapper::letter_sed( WORD_ID aIdx, WORD_ID bIdx )
{
- // check if already computed -> lookup in cache
- pair< WORD_ID, WORD_ID > pIdx = make_pair( aIdx, bIdx );
- unsigned int value;
- bool ret = GetLSEDCache(pIdx, value);
- if (ret) {
- return value;
- }
-
- // get surface strings for word indices
- const string &a = GetVocabulary().GetWord( aIdx );
- const string &b = GetVocabulary().GetWord( bIdx );
-
- // initialize cost matrix
- unsigned int **cost = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 );
- for( unsigned int i=0; i<=a.size(); i++ ) {
- cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 );
- cost[i][0] = i;
- }
- for( unsigned int j=0; j<=b.size(); j++ ) {
- cost[0][j] = j;
- }
-
- // core string edit distance loop
- for( unsigned int i=1; i<=a.size(); i++ ) {
- for( unsigned int j=1; j<=b.size(); j++ ) {
-
- unsigned int ins = cost[i-1][j] + 1;
- unsigned int del = cost[i][j-1] + 1;
- bool match = (a.substr(i-1,1).compare( b.substr(j-1,1) ) == 0);
- unsigned int diag = cost[i-1][j-1] + (match ? 0 : 1);
-
- unsigned int min = (ins < del) ? ins : del;
- min = (diag < min) ? diag : min;
-
- cost[i][j] = min;
- }
- }
-
- // clear out memory
- unsigned int final = cost[a.size()][b.size()];
- for( unsigned int i=0; i<=a.size(); i++ ) {
- free( cost[i] );
- }
- free( cost );
-
- // cache and return result
- SetLSEDCache(pIdx, final);
- return final;
-}
-
- /* string edit distance implementation */
-
- unsigned int FuzzyMatchWrapper::sed( const vector< WORD_ID > &a, const vector< WORD_ID > &b, string &best_path, bool use_letter_sed ) {
-
- // initialize cost and path matrices
- unsigned int **cost = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 );
- char **path = (char**) calloc( sizeof( char* ), a.size()+1 );
-
- for( unsigned int i=0; i<=a.size(); i++ ) {
- cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 );
- path[i] = (char*) calloc( sizeof(char), b.size()+1 );
- if (i>0)
- {
- cost[i][0] = cost[i-1][0];
- if (use_letter_sed)
- {
- cost[i][0] += GetVocabulary().GetWord( a[i-1] ).size();
- }
- else
- {
- cost[i][0]++;
- }
- }
- else
- {
- cost[i][0] = 0;
- }
- path[i][0] = 'I';
- }
+ // check if already computed -> lookup in cache
+ pair< WORD_ID, WORD_ID > pIdx = make_pair( aIdx, bIdx );
+ unsigned int value;
+ bool ret = GetLSEDCache(pIdx, value);
+ if (ret) {
+ return value;
+ }
- for( unsigned int j=0; j<=b.size(); j++ ) {
- if (j>0)
- {
- cost[0][j] = cost[0][j-1];
- if (use_letter_sed)
- {
- cost[0][j] += GetVocabulary().GetWord( b[j-1] ).size();
- }
- else
- {
- cost[0][j]++;
- }
- }
- else
- {
- cost[0][j] = 0;
- }
- path[0][j] = 'D';
+ // get surface strings for word indices
+ const string &a = GetVocabulary().GetWord( aIdx );
+ const string &b = GetVocabulary().GetWord( bIdx );
+
+ // initialize cost matrix
+ unsigned int **cost = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 );
+ for( unsigned int i=0; i<=a.size(); i++ ) {
+ cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 );
+ cost[i][0] = i;
+ }
+ for( unsigned int j=0; j<=b.size(); j++ ) {
+ cost[0][j] = j;
+ }
+
+ // core string edit distance loop
+ for( unsigned int i=1; i<=a.size(); i++ ) {
+ for( unsigned int j=1; j<=b.size(); j++ ) {
+
+ unsigned int ins = cost[i-1][j] + 1;
+ unsigned int del = cost[i][j-1] + 1;
+ bool match = (a.substr(i-1,1).compare( b.substr(j-1,1) ) == 0);
+ unsigned int diag = cost[i-1][j-1] + (match ? 0 : 1);
+
+ unsigned int min = (ins < del) ? ins : del;
+ min = (diag < min) ? diag : min;
+
+ cost[i][j] = min;
}
+ }
- // core string edit distance algorithm
- for( unsigned int i=1; i<=a.size(); i++ ) {
- for( unsigned int j=1; j<=b.size(); j++ ) {
- unsigned int ins = cost[i-1][j];
- unsigned int del = cost[i][j-1];
- unsigned int match;
- if (use_letter_sed)
- {
- ins += GetVocabulary().GetWord( a[i-1] ).size();
- del += GetVocabulary().GetWord( b[j-1] ).size();
- match = letter_sed( a[i-1], b[j-1] );
- }
- else
- {
- ins++;
- del++;
- match = ( a[i-1] == b[j-1] ) ? 0 : 1;
- }
- unsigned int diag = cost[i-1][j-1] + match;
-
- char action = (ins < del) ? 'I' : 'D';
- unsigned int min = (ins < del) ? ins : del;
- if (diag < min)
- {
- action = (match>0) ? 'S' : 'M';
- min = diag;
- }
+ // clear out memory
+ unsigned int final = cost[a.size()][b.size()];
+ for( unsigned int i=0; i<=a.size(); i++ ) {
+ free( cost[i] );
+ }
+ free( cost );
+
+ // cache and return result
+ SetLSEDCache(pIdx, final);
+ return final;
+}
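
Tracing the example from the comment above letter_sed: aligning "t h e i r" against "t h e r e" keeps the first three letters and needs two edits for the tails, so the distance is 2. The same recurrence with std::vector bookkeeping instead of calloc (illustrative sketch, not the wrapper's code):

#include <algorithm>
#include <string>
#include <vector>

// Plain Levenshtein distance, equivalent to the cost recurrence above.
unsigned int levenshtein(const std::string &a, const std::string &b)
{
  std::vector< std::vector<unsigned int> > cost(a.size()+1,
      std::vector<unsigned int>(b.size()+1, 0));
  for (unsigned int i = 0; i <= a.size(); i++) cost[i][0] = i;
  for (unsigned int j = 0; j <= b.size(); j++) cost[0][j] = j;
  for (unsigned int i = 1; i <= a.size(); i++)
    for (unsigned int j = 1; j <= b.size(); j++)
      cost[i][j] = std::min(std::min(cost[i-1][j] + 1,    // insert
                                     cost[i][j-1] + 1),   // delete
                            cost[i-1][j-1] + (a[i-1] == b[j-1] ? 0 : 1));
  return cost[a.size()][b.size()];  // levenshtein("their", "there") == 2
}
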
+
+/* string edit distance implementation */
- cost[i][j] = min;
- path[i][j] = action;
+unsigned int FuzzyMatchWrapper::sed( const vector< WORD_ID > &a, const vector< WORD_ID > &b, string &best_path, bool use_letter_sed )
+{
+
+ // initialize cost and path matrices
+ unsigned int **cost = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 );
+ char **path = (char**) calloc( sizeof( char* ), a.size()+1 );
+
+ for( unsigned int i=0; i<=a.size(); i++ ) {
+ cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 );
+ path[i] = (char*) calloc( sizeof(char), b.size()+1 );
+ if (i>0) {
+ cost[i][0] = cost[i-1][0];
+ if (use_letter_sed) {
+ cost[i][0] += GetVocabulary().GetWord( a[i-1] ).size();
+ } else {
+ cost[i][0]++;
}
+ } else {
+ cost[i][0] = 0;
}
+ path[i][0] = 'I';
+ }
- // construct string for best path
- unsigned int i = a.size();
- unsigned int j = b.size();
- best_path = "";
- while( i>0 || j>0 )
- {
- best_path = path[i][j] + best_path;
- if (path[i][j] == 'I')
- {
- i--;
+ for( unsigned int j=0; j<=b.size(); j++ ) {
+ if (j>0) {
+ cost[0][j] = cost[0][j-1];
+ if (use_letter_sed) {
+ cost[0][j] += GetVocabulary().GetWord( b[j-1] ).size();
+ } else {
+ cost[0][j]++;
}
- else if (path[i][j] == 'D')
- {
- j--;
+ } else {
+ cost[0][j] = 0;
+ }
+ path[0][j] = 'D';
+ }
+
+ // core string edit distance algorithm
+ for( unsigned int i=1; i<=a.size(); i++ ) {
+ for( unsigned int j=1; j<=b.size(); j++ ) {
+ unsigned int ins = cost[i-1][j];
+ unsigned int del = cost[i][j-1];
+ unsigned int match;
+ if (use_letter_sed) {
+ ins += GetVocabulary().GetWord( a[i-1] ).size();
+ del += GetVocabulary().GetWord( b[j-1] ).size();
+ match = letter_sed( a[i-1], b[j-1] );
+ } else {
+ ins++;
+ del++;
+ match = ( a[i-1] == b[j-1] ) ? 0 : 1;
}
- else
- {
- i--;
- j--;
+ unsigned int diag = cost[i-1][j-1] + match;
+
+ char action = (ins < del) ? 'I' : 'D';
+ unsigned int min = (ins < del) ? ins : del;
+ if (diag < min) {
+ action = (match>0) ? 'S' : 'M';
+ min = diag;
}
+
+ cost[i][j] = min;
+ path[i][j] = action;
}
+ }
+ // construct string for best path
+ unsigned int i = a.size();
+ unsigned int j = b.size();
+ best_path = "";
+ while( i>0 || j>0 ) {
+ best_path = path[i][j] + best_path;
+ if (path[i][j] == 'I') {
+ i--;
+ } else if (path[i][j] == 'D') {
+ j--;
+ } else {
+ i--;
+ j--;
+ }
+ }
- // clear out memory
- unsigned int final = cost[a.size()][b.size()];
- for( unsigned int i=0; i<=a.size(); i++ ) {
- free( cost[i] );
- free( path[i] );
- }
- free( cost );
- free( path );
+ // clear out memory
+ unsigned int final = cost[a.size()][b.size()];
- // return result
- return final;
+ for( unsigned int i=0; i<=a.size(); i++ ) {
+ free( cost[i] );
+ free( path[i] );
}
+ free( cost );
+ free( path );
+
+ // return result
+ return final;
+}
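
Besides the cost, sed records the alignment itself: path[i][j] holds I(nsert), D(elete), S(ubstitute) or M(atch), and the backtrace concatenates these into best_path. A worked example with hypothetical word sequences:

// a = { the, house, is, small }          (input, 4 words)
// b = { the, red, house, is, tiny }      (tm, 5 words)
//
// sed(a, b, path, false) == 2 and path == "MDMMS":
//   M the/the, D red, M house/house, M is/is, S small/tiny
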
-/* utlility function: compute length of sentence in characters
+/* utility function: compute length of sentence in characters
(spaces do not count) */
unsigned int FuzzyMatchWrapper::compute_length( const vector< WORD_ID > &sentence )
{
- unsigned int length = 0; for( unsigned int i=0; i<sentence.size(); i++ )
- {
- length += GetVocabulary().GetWord( sentence[i] ).size();
- }
- return length;
+ unsigned int length = 0;
+ for( unsigned int i=0; i<sentence.size(); i++ ) {
+ length += GetVocabulary().GetWord( sentence[i] ).size();
+ }
+ return length;
}
/* brute force method: compare input to all corpus sentences */
- int FuzzyMatchWrapper::basic_fuzzy_match( vector< vector< WORD_ID > > source,
- vector< vector< WORD_ID > > input )
+int FuzzyMatchWrapper::basic_fuzzy_match( vector< vector< WORD_ID > > source,
+ vector< vector< WORD_ID > > input )
{
- // go through input set...
- for(unsigned int i=0;i<input.size();i++)
- {
- bool use_letter_sed = false;
-
- // compute sentence length and worst allowed cost
- unsigned int input_length;
- if (use_letter_sed)
- {
- input_length = compute_length( input[i] );
- }
- else
- {
- input_length = input[i].size();
- }
- unsigned int best_cost = input_length * (100-min_match) / 100 + 2;
- string best_path = "";
- int best_match = -1;
-
- // go through all corpus sentences
- for(unsigned int s=0;s<source.size();s++)
- {
- int source_length;
- if (use_letter_sed)
- {
- source_length = compute_length( source[s] );
- }
- else
- {
- source_length = source[s].size();
- }
- int diff = abs((int)source_length - (int)input_length);
- if (length_filter_flag && (diff >= best_cost))
- {
- continue;
- }
-
- // compute string edit distance
- string path;
- unsigned int cost = sed( input[i], source[s], path, use_letter_sed );
-
- // update if new best
- if (cost < best_cost)
- {
- best_cost = cost;
- best_path = path;
- best_match = s;
- }
- }
- //cout << best_cost << " ||| " << best_match << " ||| " << best_path << endl;
- }
+ // go through input set...
+ for(unsigned int i=0; i<input.size(); i++) {
+ bool use_letter_sed = false;
+
+ // compute sentence length and worst allowed cost
+ unsigned int input_length;
+ if (use_letter_sed) {
+ input_length = compute_length( input[i] );
+ } else {
+ input_length = input[i].size();
+ }
+ unsigned int best_cost = input_length * (100-min_match) / 100 + 2;
+ string best_path = "";
+ int best_match = -1;
+
+ // go through all corpus sentences
+ for(unsigned int s=0; s<source.size(); s++) {
+ int source_length;
+ if (use_letter_sed) {
+ source_length = compute_length( source[s] );
+ } else {
+ source_length = source[s].size();
+ }
+ int diff = abs((int)source_length - (int)input_length);
+ if (length_filter_flag && (diff >= best_cost)) {
+ continue;
+ }
+
+ // compute string edit distance
+ string path;
+ unsigned int cost = sed( input[i], source[s], path, use_letter_sed );
+
+ // update if new best
+ if (cost < best_cost) {
+ best_cost = cost;
+ best_path = path;
+ best_match = s;
+ }
+ }
+ //cout << best_cost << " ||| " << best_match << " ||| " << best_path << endl;
+ }
}
/* definition of short matches
@@ -823,274 +768,250 @@ unsigned int FuzzyMatchWrapper::compute_length( const vector< WORD_ID > &sentenc
int FuzzyMatchWrapper::short_match_max_length( int input_length )
{
- if ( ! refined_flag )
+ if ( ! refined_flag )
return 0;
if ( input_length >= 5 )
return 1;
- return 0;
+ return 0;
}
/* if we have non-short matches in a sentence, we need to
- take a closer look at it.
+ take a closer look at it.
this function creates a hash map for all input words and their positions
- (to be used by the next function)
+ (to be used by the next function)
(done here, because this has be done only once for an input sentence) */
void FuzzyMatchWrapper::init_short_matches(WordIndex &wordIndex, long translationId, const vector< WORD_ID > &input )
{
- int max_length = short_match_max_length( input.size() );
- if (max_length == 0)
- return;
-
- wordIndex.clear();
-
- // store input words and their positions in hash map
- for(int i=0; i<input.size(); i++)
- {
- if (wordIndex.find( input[i] ) == wordIndex.end())
- {
- vector< int > position_vector;
- wordIndex[ input[i] ] = position_vector;
- }
- wordIndex[ input[i] ].push_back( i );
- }
+ int max_length = short_match_max_length( input.size() );
+ if (max_length == 0)
+ return;
+
+ wordIndex.clear();
+
+ // store input words and their positions in hash map
+ for(int i=0; i<input.size(); i++) {
+ if (wordIndex.find( input[i] ) == wordIndex.end()) {
+ vector< int > position_vector;
+ wordIndex[ input[i] ] = position_vector;
+ }
+ wordIndex[ input[i] ].push_back( i );
+ }
}
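
The map built here simply inverts the input sentence: each word id points to every position where it occurs, so add_short_matches can resolve a translation-memory word against all candidate input positions with a single lookup. For a hypothetical three-word input:

// input = { a, b, a }                    (WORD_IDs)
// wordIndex after init_short_matches:
//   a -> [ 0, 2 ]
//   b -> [ 1 ]
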
/* add all short matches to list of matches for a sentence */
void FuzzyMatchWrapper::add_short_matches(WordIndex &wordIndex, long translationId, vector< Match > &match, const vector< WORD_ID > &tm, int input_length, int best_cost )
-{
- int max_length = short_match_max_length( input_length );
- if (max_length == 0)
- return;
-
- int tm_length = tm.size();
- map< WORD_ID,vector< int > >::iterator input_word_hit;
- for(int t_pos=0; t_pos<tm.size(); t_pos++)
- {
- input_word_hit = wordIndex.find( tm[t_pos] );
- if (input_word_hit != wordIndex.end())
- {
- vector< int > &position_vector = input_word_hit->second;
- for(int j=0; j<position_vector.size(); j++)
- {
- int &i_pos = position_vector[j];
-
- // before match
- int max_cost = max( i_pos , t_pos );
- int min_cost = abs( i_pos - t_pos );
- if ( i_pos>0 && i_pos == t_pos )
- min_cost++;
-
- // after match
- max_cost += max( (input_length-i_pos) , (tm_length-t_pos));
- min_cost += abs( (input_length-i_pos) - (tm_length-t_pos));
- if ( i_pos != input_length-1 && (input_length-i_pos) == (tm_length-t_pos))
- min_cost++;
-
- if (min_cost <= best_cost)
- {
- Match new_match( i_pos,i_pos, t_pos,t_pos, min_cost,max_cost,0 );
- match.push_back( new_match );
- }
- }
- }
- }
+{
+ int max_length = short_match_max_length( input_length );
+ if (max_length == 0)
+ return;
+
+ int tm_length = tm.size();
+ map< WORD_ID,vector< int > >::iterator input_word_hit;
+ for(int t_pos=0; t_pos<tm.size(); t_pos++) {
+ input_word_hit = wordIndex.find( tm[t_pos] );
+ if (input_word_hit != wordIndex.end()) {
+ vector< int > &position_vector = input_word_hit->second;
+ for(int j=0; j<position_vector.size(); j++) {
+ int &i_pos = position_vector[j];
+
+ // before match
+ int max_cost = max( i_pos , t_pos );
+ int min_cost = abs( i_pos - t_pos );
+ if ( i_pos>0 && i_pos == t_pos )
+ min_cost++;
+
+ // after match
+ max_cost += max( (input_length-i_pos) , (tm_length-t_pos));
+ min_cost += abs( (input_length-i_pos) - (tm_length-t_pos));
+ if ( i_pos != input_length-1 && (input_length-i_pos) == (tm_length-t_pos))
+ min_cost++;
+
+ if (min_cost <= best_cost) {
+ Match new_match( i_pos,i_pos, t_pos,t_pos, min_cost,max_cost,0 );
+ match.push_back( new_match );
+ }
+ }
+ }
+ }
}
/* remove matches that are subsumed by a larger match */
vector< Match > FuzzyMatchWrapper::prune_matches( const vector< Match > &match, int best_cost )
{
- //cerr << "\tpruning";
- vector< Match > pruned;
- for(int i=match.size()-1; i>=0; i--)
- {
- //cerr << " (" << match[i].input_start << "," << match[i].input_end
- // << " ; " << match[i].tm_start << "," << match[i].tm_end
- // << " * " << match[i].min_cost << ")";
-
- //if (match[i].min_cost > best_cost)
- // continue;
-
- bool subsumed = false;
- for(int j=match.size()-1; j>=0; j--)
- {
- if (i!=j // do not compare match with itself
- && ( match[i].input_end - match[i].input_start <=
- match[j].input_end - match[j].input_start ) // i shorter than j
- && ((match[i].input_start == match[j].input_start &&
- match[i].tm_start == match[j].tm_start ) ||
- (match[i].input_end == match[j].input_end &&
- match[i].tm_end == match[j].tm_end) ) )
- {
- subsumed = true;
- }
- }
- if (! subsumed && match[i].min_cost <= best_cost)
- {
- //cerr << "*";
- pruned.push_back( match[i] );
- }
- }
- //cerr << endl;
- return pruned;
+ //cerr << "\tpruning";
+ vector< Match > pruned;
+ for(int i=match.size()-1; i>=0; i--) {
+ //cerr << " (" << match[i].input_start << "," << match[i].input_end
+ // << " ; " << match[i].tm_start << "," << match[i].tm_end
+ // << " * " << match[i].min_cost << ")";
+
+ //if (match[i].min_cost > best_cost)
+ // continue;
+
+ bool subsumed = false;
+ for(int j=match.size()-1; j>=0; j--) {
+ if (i!=j // do not compare match with itself
+ && ( match[i].input_end - match[i].input_start <=
+ match[j].input_end - match[j].input_start ) // i shorter than j
+ && ((match[i].input_start == match[j].input_start &&
+ match[i].tm_start == match[j].tm_start ) ||
+ (match[i].input_end == match[j].input_end &&
+ match[i].tm_end == match[j].tm_end) ) ) {
+ subsumed = true;
+ }
+ }
+ if (! subsumed && match[i].min_cost <= best_cost) {
+ //cerr << "*";
+ pruned.push_back( match[i] );
+ }
+ }
+ //cerr << endl;
+ return pruned;
}
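
A match is subsumed when some match at least as long shares either its (input_start, tm_start) pair or its (input_end, tm_end) pair; only non-subsumed matches within the current cost bound survive. For instance, with hypothetical spans:

// match[j]: input 0-3 / tm 0-3    (length 4)
// match[i]: input 0-1 / tm 0-1    (length 2, same start pair)
//   -> match[i] is subsumed and dropped; match[j] is kept,
//      provided match[j].min_cost <= best_cost
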
/* A* parsing method to compute string edit distance */
int FuzzyMatchWrapper::parse_matches( vector< Match > &match, int input_length, int tm_length, int &best_cost )
-{
- // cerr << "sentence has " << match.size() << " matches, best cost: " << best_cost << ", lengths input: " << input_length << " tm: " << tm_length << endl;
-
- if (match.size() == 1)
- return match[0].max_cost;
- if (match.size() == 0)
- return input_length+tm_length;
-
- int this_best_cost = input_length + tm_length;
- for(int i=0;i<match.size();i++)
- {
- this_best_cost = min( this_best_cost, match[i].max_cost );
- }
- // cerr << "\tthis best cost: " << this_best_cost << endl;
-
- // bottom up combination of spans
- vector< vector< Match > > multi_match;
- multi_match.push_back( match );
-
- int match_level = 1;
- while(multi_match[ match_level-1 ].size()>0)
- {
- // init vector
- vector< Match > empty;
- multi_match.push_back( empty );
-
- for(int first_level = 0; first_level <= (match_level-1)/2; first_level++)
- {
- int second_level = match_level - first_level -1;
- //cerr << "\tcombining level " << first_level << " and " << second_level << endl;
-
- vector< Match > &first_match = multi_match[ first_level ];
- vector< Match > &second_match = multi_match[ second_level ];
-
- for(int i1 = 0; i1 < first_match.size(); i1++) {
- for(int i2 = 0; i2 < second_match.size(); i2++) {
-
- // do not combine the same pair twice
- if (first_level == second_level && i2 <= i1)
- {
- continue;
- }
-
- // get sorted matches (first is before second)
- Match *first, *second;
- if (first_match[i1].input_start < second_match[i2].input_start )
- {
- first = &first_match[i1];
- second = &second_match[i2];
- }
- else
- {
- second = &first_match[i1];
- first = &second_match[i2];
- }
-
- //cerr << "\tcombining "
- // << "(" << first->input_start << "," << first->input_end << "), "
- // << first->tm_start << " [" << first->internal_cost << "]"
- // << " with "
- // << "(" << second->input_start << "," << second->input_end << "), "
- // << second->tm_start<< " [" << second->internal_cost << "]"
- // << endl;
-
- // do not process overlapping matches
- if (first->input_end >= second->input_start)
- {
- continue;
- }
-
- // no overlap / mismatch in tm
- if (first->tm_end >= second->tm_start)
- {
- continue;
- }
-
- // compute cost
- int min_cost = 0;
- int max_cost = 0;
-
- // initial
- min_cost += abs( first->input_start - first->tm_start );
- max_cost += max( first->input_start, first->tm_start );
-
- // same number of words, but not sent. start -> cost is at least 1
- if (first->input_start == first->tm_start && first->input_start > 0)
- {
- min_cost++;
- }
-
- // in-between
- int skipped_words = second->input_start - first->input_end -1;
- int skipped_words_tm = second->tm_start - first->tm_end -1;
- int internal_cost = max( skipped_words, skipped_words_tm );
- internal_cost += first->internal_cost + second->internal_cost;
- min_cost += internal_cost;
- max_cost += internal_cost;
-
- // final
- min_cost += abs( (tm_length-1 - second->tm_end) -
- (input_length-1 - second->input_end) );
- max_cost += max( (tm_length-1 - second->tm_end),
- (input_length-1 - second->input_end) );
-
- // same number of words, but not sent. end -> cost is at least 1
- if ( ( input_length-1 - second->input_end
- == tm_length-1 - second->tm_end )
- && input_length-1 != second->input_end )
- {
- min_cost++;
- }
-
- // cerr << "\tcost: " << min_cost << "-" << max_cost << endl;
-
- // if worst than best cost, forget it
- if (min_cost > best_cost)
- {
- continue;
- }
-
- // add match
- Match new_match( first->input_start,
- second->input_end,
- first->tm_start,
- second->tm_end,
- min_cost,
- max_cost,
- internal_cost);
- multi_match[ match_level ].push_back( new_match );
- // cerr << "\tstored\n";
-
- // possibly updating this_best_cost
- if (max_cost < this_best_cost)
- {
- // cerr << "\tupdating this best cost to " << max_cost << "\n";
- this_best_cost = max_cost;
-
- // possibly updating best_cost
- if (max_cost < best_cost)
- {
- // cerr << "\tupdating best cost to " << max_cost << "\n";
- best_cost = max_cost;
- }
- }
- }
- }
- }
- match_level++;
- }
- return this_best_cost;
+{
+ // cerr << "sentence has " << match.size() << " matches, best cost: " << best_cost << ", lengths input: " << input_length << " tm: " << tm_length << endl;
+
+ if (match.size() == 1)
+ return match[0].max_cost;
+ if (match.size() == 0)
+ return input_length+tm_length;
+
+ int this_best_cost = input_length + tm_length;
+ for(int i=0; i<match.size(); i++) {
+ this_best_cost = min( this_best_cost, match[i].max_cost );
+ }
+ // cerr << "\tthis best cost: " << this_best_cost << endl;
+
+ // bottom up combination of spans
+ vector< vector< Match > > multi_match;
+ multi_match.push_back( match );
+
+ int match_level = 1;
+ while(multi_match[ match_level-1 ].size()>0) {
+ // init vector
+ vector< Match > empty;
+ multi_match.push_back( empty );
+
+ for(int first_level = 0; first_level <= (match_level-1)/2; first_level++) {
+ int second_level = match_level - first_level -1;
+ //cerr << "\tcombining level " << first_level << " and " << second_level << endl;
+
+ vector< Match > &first_match = multi_match[ first_level ];
+ vector< Match > &second_match = multi_match[ second_level ];
+
+ for(int i1 = 0; i1 < first_match.size(); i1++) {
+ for(int i2 = 0; i2 < second_match.size(); i2++) {
+
+ // do not combine the same pair twice
+ if (first_level == second_level && i2 <= i1) {
+ continue;
+ }
+
+ // get sorted matches (first is before second)
+ Match *first, *second;
+ if (first_match[i1].input_start < second_match[i2].input_start ) {
+ first = &first_match[i1];
+ second = &second_match[i2];
+ } else {
+ second = &first_match[i1];
+ first = &second_match[i2];
+ }
+
+ //cerr << "\tcombining "
+ // << "(" << first->input_start << "," << first->input_end << "), "
+ // << first->tm_start << " [" << first->internal_cost << "]"
+ // << " with "
+ // << "(" << second->input_start << "," << second->input_end << "), "
+ // << second->tm_start<< " [" << second->internal_cost << "]"
+ // << endl;
+
+ // do not process overlapping matches
+ if (first->input_end >= second->input_start) {
+ continue;
+ }
+
+ // no overlap / mismatch in tm
+ if (first->tm_end >= second->tm_start) {
+ continue;
+ }
+
+ // compute cost
+ int min_cost = 0;
+ int max_cost = 0;
+
+ // initial
+ min_cost += abs( first->input_start - first->tm_start );
+ max_cost += max( first->input_start, first->tm_start );
+
+ // same number of words, but not sent. start -> cost is at least 1
+ if (first->input_start == first->tm_start && first->input_start > 0) {
+ min_cost++;
+ }
+
+ // in-between
+ int skipped_words = second->input_start - first->input_end -1;
+ int skipped_words_tm = second->tm_start - first->tm_end -1;
+ int internal_cost = max( skipped_words, skipped_words_tm );
+ internal_cost += first->internal_cost + second->internal_cost;
+ min_cost += internal_cost;
+ max_cost += internal_cost;
+
+ // final
+ min_cost += abs( (tm_length-1 - second->tm_end) -
+ (input_length-1 - second->input_end) );
+ max_cost += max( (tm_length-1 - second->tm_end),
+ (input_length-1 - second->input_end) );
+
+ // same number of words, but not sent. end -> cost is at least 1
+ if ( ( input_length-1 - second->input_end
+ == tm_length-1 - second->tm_end )
+ && input_length-1 != second->input_end ) {
+ min_cost++;
+ }
+
+ // cerr << "\tcost: " << min_cost << "-" << max_cost << endl;
+
+          // if worse than best cost, forget it
+ if (min_cost > best_cost) {
+ continue;
+ }
+
+ // add match
+ Match new_match( first->input_start,
+ second->input_end,
+ first->tm_start,
+ second->tm_end,
+ min_cost,
+ max_cost,
+ internal_cost);
+ multi_match[ match_level ].push_back( new_match );
+ // cerr << "\tstored\n";
+
+ // possibly updating this_best_cost
+ if (max_cost < this_best_cost) {
+ // cerr << "\tupdating this best cost to " << max_cost << "\n";
+ this_best_cost = max_cost;
+
+ // possibly updating best_cost
+ if (max_cost < best_cost) {
+ // cerr << "\tupdating best cost to " << max_cost << "\n";
+ best_cost = max_cost;
+ }
+ }
+ }
+ }
+ }
+ match_level++;
+ }
+ return this_best_cost;
}
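
parse_matches grows non-overlapping spans bottom-up: level k of multi_match holds combinations of k+1 base matches, and every combination re-derives its cost band from the gap between the two spans plus the unmatched sentence borders, discarding anything whose lower bound already exceeds best_cost. One combination step with hypothetical numbers:

// first:  input 0-1 / tm 0-1, internal_cost 0
// second: input 3-4 / tm 4-5, internal_cost 0
// input_length = 5, tm_length = 6
//
// in-between: skipped_words = 3-1-1 = 1, skipped_words_tm = 4-1-1 = 2
//             internal_cost = max(1, 2) = 2
// initial:    min += |0-0| = 0,  max += max(0, 0) = 0
// final:      min += |(6-1-5) - (5-1-4)| = 0,  max += max(0, 0) = 0
//   -> combined Match(0, 4, 0, 5, /*min*/ 2, /*max*/ 2, /*internal*/ 2)
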
@@ -1101,22 +1022,22 @@ void FuzzyMatchWrapper::create_extract(int sentenceInd, int cost, const vector<
WORD_ID wordId = sourceSentence[pos];
sourceStr += GetVocabulary().GetWord(wordId) + " ";
}
-
+
for (size_t targetInd = 0; targetInd < targets.size(); ++targetInd) {
- const SentenceAlignment &sentenceAlignment = targets[targetInd];
+ const SentenceAlignment &sentenceAlignment = targets[targetInd];
string targetStr = sentenceAlignment.getTargetString(GetVocabulary());
string alignStr = sentenceAlignment.getAlignmentString();
-
+
outputFile
- << sentenceInd << endl
- << cost << endl
- << sourceStr << endl
- << inputStr << endl
- << targetStr << endl
- << alignStr << endl
- << path << endl
- << sentenceAlignment.count << endl;
-
+ << sentenceInd << endl
+ << cost << endl
+ << sourceStr << endl
+ << inputStr << endl
+ << targetStr << endl
+ << alignStr << endl
+ << path << endl
+ << sentenceAlignment.count << endl;
+
}
}
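
Each (input sentence, TM match, target alternative) triple therefore lands in the fuzzy-match file as a fixed eight-line record, one record per target alternative:

// Record layout written by create_extract:
//   1. sentenceInd   input sentence number
//   2. cost          best word edit distance
//   3. sourceStr     matched TM source sentence
//   4. inputStr      input sentence
//   5. targetStr     TM target sentence
//   6. alignStr      word alignment as "a-b" pairs
//   7. path          sed edit path, e.g. "MMSID"
//   8. count         frequency of this target alternative
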
diff --git a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h
index a6f772fb9..d8813a65c 100644
--- a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h
+++ b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h
@@ -20,18 +20,18 @@
#include "Match.h"
#include "moses/InputType.h"
-namespace tmmt
+namespace tmmt
{
class Match;
class SentenceAlignment;
-
+
class FuzzyMatchWrapper
{
public:
FuzzyMatchWrapper(const std::string &source, const std::string &target, const std::string &alignment);
std::string Extract(long translationId, const std::string &dirNameStr);
-
+
protected:
// tm-mt
std::vector< std::vector< tmmt::SentenceAlignment > > targetAndAlignment;
@@ -58,13 +58,13 @@ protected:
void load_corpus( const std::string &fileName, std::vector< std::vector< tmmt::WORD_ID > > &corpus );
void load_target( const std::string &fileName, std::vector< std::vector< tmmt::SentenceAlignment > > &corpus);
void load_alignment( const std::string &fileName, std::vector< std::vector< tmmt::SentenceAlignment > > &corpus );
-
+
/** brute force method: compare input to all corpus sentences */
- int basic_fuzzy_match( std::vector< std::vector< tmmt::WORD_ID > > source,
- std::vector< std::vector< tmmt::WORD_ID > > input ) ;
-
- /** utlility function: compute length of sentence in characters
- (spaces do not count) */
+ int basic_fuzzy_match( std::vector< std::vector< tmmt::WORD_ID > > source,
+ std::vector< std::vector< tmmt::WORD_ID > > input ) ;
+
+  /** utility function: compute length of sentence in characters
+ (spaces do not count) */
unsigned int compute_length( const std::vector< tmmt::WORD_ID > &sentence );
unsigned int letter_sed( WORD_ID aIdx, WORD_ID bIdx );
unsigned int sed( const std::vector< WORD_ID > &a, const std::vector< WORD_ID > &b, std::string &best_path, bool use_letter_sed );
@@ -77,8 +77,9 @@ protected:
void create_extract(int sentenceInd, int cost, const std::vector< WORD_ID > &sourceSentence, const std::vector<SentenceAlignment> &targets, const std::string &inputStr, const std::string &path, std::ofstream &outputFile);
std::string ExtractTM(WordIndex &wordIndex, long translationId, const std::string &inputPath);
- Vocabulary &GetVocabulary()
- { return suffixArray->GetVocabulary(); }
+ Vocabulary &GetVocabulary() {
+ return suffixArray->GetVocabulary();
+ }
bool GetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, unsigned int &value) const;
void SetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, const unsigned int &value);
diff --git a/moses/TranslationModel/fuzzy-match/Match.h b/moses/TranslationModel/fuzzy-match/Match.h
index 7feb25769..f2ba2c150 100644
--- a/moses/TranslationModel/fuzzy-match/Match.h
+++ b/moses/TranslationModel/fuzzy-match/Match.h
@@ -14,17 +14,18 @@ namespace tmmt
/* data structure for n-gram match between input and corpus */
-class Match {
+class Match
+{
public:
- int input_start;
- int input_end;
- int tm_start;
- int tm_end;
- int min_cost;
- int max_cost;
- int internal_cost;
- Match( int is, int ie, int ts, int te, int min, int max, int i )
- :input_start(is), input_end(ie), tm_start(ts), tm_end(te), min_cost(min), max_cost(max), internal_cost(i)
+ int input_start;
+ int input_end;
+ int tm_start;
+ int tm_end;
+ int min_cost;
+ int max_cost;
+ int internal_cost;
+ Match( int is, int ie, int ts, int te, int min, int max, int i )
+ :input_start(is), input_end(ie), tm_start(ts), tm_end(te), min_cost(min), max_cost(max), internal_cost(i)
{}
};
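
A Match is just a pair of spans plus a cost band, fixed at construction. A hypothetical unigram match of input position 2 to TM position 4, with an edit-distance band of [3, 7]:

// Match m(2, 2,    // input_start, input_end
//         4, 4,    // tm_start, tm_end
//         3, 7,    // min_cost, max_cost (band on the full edit distance)
//         0);      // internal_cost: no gaps inside a unigram
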
diff --git a/moses/TranslationModel/fuzzy-match/SentenceAlignment.h b/moses/TranslationModel/fuzzy-match/SentenceAlignment.h
index 30c887fc1..466baa149 100644
--- a/moses/TranslationModel/fuzzy-match/SentenceAlignment.h
+++ b/moses/TranslationModel/fuzzy-match/SentenceAlignment.h
@@ -15,20 +15,18 @@
namespace tmmt
{
-
-struct SentenceAlignment
-{
+
+struct SentenceAlignment {
int count;
std::vector< WORD_ID > target;
std::vector< std::pair<int,int> > alignment;
-
+
SentenceAlignment()
{}
-
+
std::string getTargetString(const Vocabulary &vocab) const;
-
- std::string getAlignmentString() const
- {
+
+ std::string getAlignmentString() const {
std::stringstream strme;
for (size_t i = 0; i < alignment.size(); ++i) {
const std::pair<int,int> &alignPair = alignment[i];
@@ -36,7 +34,7 @@ struct SentenceAlignment
}
return strme.str();
}
-
+
};
}
diff --git a/moses/TranslationModel/fuzzy-match/SuffixArray.cpp b/moses/TranslationModel/fuzzy-match/SuffixArray.cpp
index 8a67fd954..5f49952ce 100644
--- a/moses/TranslationModel/fuzzy-match/SuffixArray.cpp
+++ b/moses/TranslationModel/fuzzy-match/SuffixArray.cpp
@@ -8,247 +8,235 @@ using namespace std;
namespace tmmt
{
-SuffixArray::SuffixArray( string fileName )
+SuffixArray::SuffixArray( string fileName )
{
- m_vcb.StoreIfNew( "<uNk>" );
- m_endOfSentence = m_vcb.StoreIfNew( "<s>" );
-
- ifstream extractFile;
- char line[LINE_MAX_LENGTH];
-
- // count the number of words first;
- extractFile.open(fileName.c_str());
- istream *fileP = &extractFile;
- m_size = 0;
- size_t sentenceCount = 0;
- while(!fileP->eof()) {
- SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
- if (fileP->eof()) break;
- vector< WORD_ID > words = m_vcb.Tokenize( line );
- m_size += words.size() + 1;
- sentenceCount++;
- }
- extractFile.close();
- cerr << m_size << " words (incl. sentence boundaries)" << endl;
-
- // allocate memory
- m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size );
- m_index = (INDEX*) calloc( sizeof( INDEX ), m_size );
- m_wordInSentence = (char*) calloc( sizeof( char ), m_size );
- m_sentence = (size_t*) calloc( sizeof( size_t ), m_size );
- m_sentenceLength = (char*) calloc( sizeof( char ), sentenceCount );
-
- // fill the array
- int wordIndex = 0;
- int sentenceId = 0;
- extractFile.open(fileName.c_str());
- fileP = &extractFile;
- while(!fileP->eof()) {
- SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
- if (fileP->eof()) break;
- vector< WORD_ID > words = m_vcb.Tokenize( line );
-
- // add to corpus vector
- corpus.push_back(words);
-
- // create SA
-
- vector< WORD_ID >::const_iterator i;
- for( i=words.begin(); i!=words.end(); i++)
- {
- m_index[ wordIndex ] = wordIndex;
- m_sentence[ wordIndex ] = sentenceId;
- m_wordInSentence[ wordIndex ] = i-words.begin();
- m_array[ wordIndex++ ] = *i;
- }
- m_index[ wordIndex ] = wordIndex;
- m_array[ wordIndex++ ] = m_endOfSentence;
- m_sentenceLength[ sentenceId++ ] = words.size();
- }
- extractFile.close();
- cerr << "done reading " << wordIndex << " words, " << sentenceId << " sentences." << endl;
- // List(0,9);
-
- // sort
- m_buffer = (INDEX*) calloc( sizeof( INDEX ), m_size );
- Sort( 0, m_size-1 );
- free( m_buffer );
- cerr << "done sorting" << endl;
+ m_vcb.StoreIfNew( "<uNk>" );
+ m_endOfSentence = m_vcb.StoreIfNew( "<s>" );
+
+ ifstream extractFile;
+ char line[LINE_MAX_LENGTH];
+
+ // count the number of words first;
+ extractFile.open(fileName.c_str());
+ istream *fileP = &extractFile;
+ m_size = 0;
+ size_t sentenceCount = 0;
+ while(!fileP->eof()) {
+ SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
+ if (fileP->eof()) break;
+ vector< WORD_ID > words = m_vcb.Tokenize( line );
+ m_size += words.size() + 1;
+ sentenceCount++;
+ }
+ extractFile.close();
+ cerr << m_size << " words (incl. sentence boundaries)" << endl;
+
+ // allocate memory
+ m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size );
+ m_index = (INDEX*) calloc( sizeof( INDEX ), m_size );
+ m_wordInSentence = (char*) calloc( sizeof( char ), m_size );
+ m_sentence = (size_t*) calloc( sizeof( size_t ), m_size );
+ m_sentenceLength = (char*) calloc( sizeof( char ), sentenceCount );
+
+ // fill the array
+ int wordIndex = 0;
+ int sentenceId = 0;
+ extractFile.open(fileName.c_str());
+ fileP = &extractFile;
+ while(!fileP->eof()) {
+ SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
+ if (fileP->eof()) break;
+ vector< WORD_ID > words = m_vcb.Tokenize( line );
+
+ // add to corpus vector
+ corpus.push_back(words);
+
+ // create SA
+
+ vector< WORD_ID >::const_iterator i;
+ for( i=words.begin(); i!=words.end(); i++) {
+ m_index[ wordIndex ] = wordIndex;
+ m_sentence[ wordIndex ] = sentenceId;
+ m_wordInSentence[ wordIndex ] = i-words.begin();
+ m_array[ wordIndex++ ] = *i;
+ }
+ m_index[ wordIndex ] = wordIndex;
+ m_array[ wordIndex++ ] = m_endOfSentence;
+ m_sentenceLength[ sentenceId++ ] = words.size();
+ }
+ extractFile.close();
+ cerr << "done reading " << wordIndex << " words, " << sentenceId << " sentences." << endl;
+ // List(0,9);
+
+ // sort
+ m_buffer = (INDEX*) calloc( sizeof( INDEX ), m_size );
+ Sort( 0, m_size-1 );
+ free( m_buffer );
+ cerr << "done sorting" << endl;
}
// good ol' merge sort
-void SuffixArray::Sort(INDEX start, INDEX end) {
- if (start == end) return;
- INDEX mid = (start+end+1)/2;
- Sort( start, mid-1 );
- Sort( mid, end );
-
- // merge
- int i = start;
- int j = mid;
- int k = 0;
- int length = end-start+1;
- while( k<length )
- {
- if (i == mid )
- {
- m_buffer[ k++ ] = m_index[ j++ ];
- }
- else if (j > end )
- {
- m_buffer[ k++ ] = m_index[ i++ ];
- }
- else {
- if (CompareIndex( m_index[i], m_index[j] ) < 0)
- {
- m_buffer[ k++ ] = m_index[ i++ ];
- }
- else
- {
- m_buffer[ k++ ] = m_index[ j++ ];
- }
- }
- }
-
- memcpy( ((char*)m_index) + sizeof( INDEX ) * start,
- ((char*)m_buffer), sizeof( INDEX ) * (end-start+1) );
+void SuffixArray::Sort(INDEX start, INDEX end)
+{
+ if (start == end) return;
+ INDEX mid = (start+end+1)/2;
+ Sort( start, mid-1 );
+ Sort( mid, end );
+
+ // merge
+ int i = start;
+ int j = mid;
+ int k = 0;
+ int length = end-start+1;
+ while( k<length ) {
+ if (i == mid ) {
+ m_buffer[ k++ ] = m_index[ j++ ];
+ } else if (j > end ) {
+ m_buffer[ k++ ] = m_index[ i++ ];
+ } else {
+ if (CompareIndex( m_index[i], m_index[j] ) < 0) {
+ m_buffer[ k++ ] = m_index[ i++ ];
+ } else {
+ m_buffer[ k++ ] = m_index[ j++ ];
+ }
+ }
+ }
+
+ memcpy( ((char*)m_index) + sizeof( INDEX ) * start,
+ ((char*)m_buffer), sizeof( INDEX ) * (end-start+1) );
}
SuffixArray::~SuffixArray()
-{
- free(m_index);
- free(m_array);
+{
+ free(m_index);
+ free(m_array);
}
int SuffixArray::CompareIndex( INDEX a, INDEX b ) const
{
- // skip over identical words
- INDEX offset = 0;
- while( a+offset < m_size &&
- b+offset < m_size &&
- m_array[ a+offset ] == m_array[ b+offset ] )
- { offset++; }
-
- if( a+offset == m_size ) return -1;
- if( b+offset == m_size ) return 1;
- return CompareWord( m_array[ a+offset ], m_array[ b+offset ] );
+ // skip over identical words
+ INDEX offset = 0;
+ while( a+offset < m_size &&
+ b+offset < m_size &&
+ m_array[ a+offset ] == m_array[ b+offset ] ) {
+ offset++;
+ }
+
+ if( a+offset == m_size ) return -1;
+ if( b+offset == m_size ) return 1;
+ return CompareWord( m_array[ a+offset ], m_array[ b+offset ] );
}
inline int SuffixArray::CompareWord( WORD_ID a, WORD_ID b ) const
{
- // cerr << "c(" << m_vcb.GetWord(a) << ":" << m_vcb.GetWord(b) << ")=" << m_vcb.GetWord(a).compare( m_vcb.GetWord(b) ) << endl;
- return m_vcb.GetWord(a).compare( m_vcb.GetWord(b) );
+ // cerr << "c(" << m_vcb.GetWord(a) << ":" << m_vcb.GetWord(b) << ")=" << m_vcb.GetWord(a).compare( m_vcb.GetWord(b) ) << endl;
+ return m_vcb.GetWord(a).compare( m_vcb.GetWord(b) );
}
int SuffixArray::Count( const vector< WORD > &phrase )
{
- INDEX dummy;
- return LimitedCount( phrase, m_size, dummy, dummy, 0, m_size-1 );
+ INDEX dummy;
+ return LimitedCount( phrase, m_size, dummy, dummy, 0, m_size-1 );
}
bool SuffixArray::MinCount( const vector< WORD > &phrase, INDEX min )
{
- INDEX dummy;
- return LimitedCount( phrase, min, dummy, dummy, 0, m_size-1 ) >= min;
+ INDEX dummy;
+ return LimitedCount( phrase, min, dummy, dummy, 0, m_size-1 ) >= min;
}
bool SuffixArray::Exists( const vector< WORD > &phrase )
{
- INDEX dummy;
- return LimitedCount( phrase, 1, dummy, dummy, 0, m_size-1 ) == 1;
+ INDEX dummy;
+ return LimitedCount( phrase, 1, dummy, dummy, 0, m_size-1 ) == 1;
}
int SuffixArray::FindMatches( const vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start, INDEX search_end )
{
- return LimitedCount( phrase, m_size, firstMatch, lastMatch, search_start, search_end );
+ return LimitedCount( phrase, m_size, firstMatch, lastMatch, search_start, search_end );
}
int SuffixArray::LimitedCount( const vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start, INDEX search_end )
{
- // cerr << "FindFirst\n";
- INDEX start = search_start;
- INDEX end = (search_end == -1) ? (m_size-1) : search_end;
- INDEX mid = FindFirst( phrase, start, end );
- // cerr << "done\n";
- if (mid == m_size) return 0; // no matches
- if (min == 1) return 1; // only existance check
+ // cerr << "FindFirst\n";
+ INDEX start = search_start;
+ INDEX end = (search_end == -1) ? (m_size-1) : search_end;
+ INDEX mid = FindFirst( phrase, start, end );
+ // cerr << "done\n";
+ if (mid == m_size) return 0; // no matches
+ if (min == 1) return 1; // only existence check
- int matchCount = 1;
+ int matchCount = 1;
- //cerr << "before...\n";
- firstMatch = FindLast( phrase, mid, start, -1 );
- matchCount += mid - firstMatch;
+ //cerr << "before...\n";
+ firstMatch = FindLast( phrase, mid, start, -1 );
+ matchCount += mid - firstMatch;
- //cerr << "after...\n";
- lastMatch = FindLast( phrase, mid, end, 1 );
- matchCount += lastMatch - mid;
+ //cerr << "after...\n";
+ lastMatch = FindLast( phrase, mid, end, 1 );
+ matchCount += lastMatch - mid;
- return matchCount;
+ return matchCount;
}
SuffixArray::INDEX SuffixArray::FindLast( const vector< WORD > &phrase, INDEX start, INDEX end, int direction )
{
- end += direction;
- while(true)
- {
- INDEX mid = ( start + end + (direction>0 ? 0 : 1) )/2;
-
- int match = Match( phrase, mid );
- int matchNext = Match( phrase, mid+direction );
- //cerr << "\t" << start << ";" << mid << ";" << end << " -> " << match << "," << matchNext << endl;
-
- if (match == 0 && matchNext != 0) return mid;
-
- if (match == 0) // mid point is a match
- start = mid;
- else
- end = mid;
- }
+ end += direction;
+ while(true) {
+ INDEX mid = ( start + end + (direction>0 ? 0 : 1) )/2;
+
+ int match = Match( phrase, mid );
+ int matchNext = Match( phrase, mid+direction );
+ //cerr << "\t" << start << ";" << mid << ";" << end << " -> " << match << "," << matchNext << endl;
+
+ if (match == 0 && matchNext != 0) return mid;
+
+ if (match == 0) // mid point is a match
+ start = mid;
+ else
+ end = mid;
+ }
}
SuffixArray::INDEX SuffixArray::FindFirst( const vector< WORD > &phrase, INDEX &start, INDEX &end )
{
- while(true)
- {
- INDEX mid = ( start + end + 1 )/2;
- //cerr << "FindFirst(" << start << ";" << mid << ";" << end << ")\n";
- int match = Match( phrase, mid );
-
- if (match == 0) return mid;
- if (start >= end && match != 0 ) return m_size;
-
- if (match > 0)
- start = mid+1;
- else
- end = mid-1;
- }
+ while(true) {
+ INDEX mid = ( start + end + 1 )/2;
+ //cerr << "FindFirst(" << start << ";" << mid << ";" << end << ")\n";
+ int match = Match( phrase, mid );
+
+ if (match == 0) return mid;
+ if (start >= end && match != 0 ) return m_size;
+
+ if (match > 0)
+ start = mid+1;
+ else
+ end = mid-1;
+ }
}
int SuffixArray::Match( const vector< WORD > &phrase, INDEX index )
{
- INDEX pos = m_index[ index ];
- for(INDEX i=0; i<phrase.size() && i+pos<m_size; i++)
- {
- int match = CompareWord( m_vcb.GetWordID( phrase[i] ), m_array[ pos+i ] );
- // cerr << "{" << index << "+" << i << "," << pos+i << ":" << match << "}" << endl;
- if (match != 0)
- return match;
- }
- return 0;
+ INDEX pos = m_index[ index ];
+ for(INDEX i=0; i<phrase.size() && i+pos<m_size; i++) {
+ int match = CompareWord( m_vcb.GetWordID( phrase[i] ), m_array[ pos+i ] );
+ // cerr << "{" << index << "+" << i << "," << pos+i << ":" << match << "}" << endl;
+ if (match != 0)
+ return match;
+ }
+ return 0;
}
void SuffixArray::List(INDEX start, INDEX end)
{
- for(INDEX i=start; i<=end; i++)
- {
- INDEX pos = m_index[ i ];
- // cerr << i << ":" << pos << "\t";
- for(int j=0; j<5 && j+pos<m_size; j++)
- {
- //cout << " " << m_vcb.GetWord( m_array[ pos+j ] );
- }
- // cerr << "\n";
- }
+ for(INDEX i=start; i<=end; i++) {
+ INDEX pos = m_index[ i ];
+ // cerr << i << ":" << pos << "\t";
+ for(int j=0; j<5 && j+pos<m_size; j++) {
+ //cout << " " << m_vcb.GetWord( m_array[ pos+j ] );
+ }
+ // cerr << "\n";
+ }
}
}
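
SuffixArray::Sort above is a top-down merge sort over the index array: it splits the range at the midpoint, sorts each half recursively, merges into m_buffer, and copies the buffer back with memcpy. A self-contained sketch of the same recursion on a plain int array, with the suffix comparison replaced by operator< (illustrative only, not the repository's code):

    #include <cstring>
    #include <vector>

    // mirrors SuffixArray::Sort: split, recurse, merge into buf, copy back
    static void MergeSort(std::vector<int> &a, std::vector<int> &buf,
                          int start, int end)
    {
      if (start >= end) return;
      int mid = (start + end + 1) / 2;
      MergeSort(a, buf, start, mid - 1);
      MergeSort(a, buf, mid, end);
      int i = start, j = mid, k = 0;
      while (k < end - start + 1) {
        if (i == mid)         buf[k++] = a[j++];   // left half exhausted
        else if (j > end)     buf[k++] = a[i++];   // right half exhausted
        else if (a[i] < a[j]) buf[k++] = a[i++];   // stand-in for CompareIndex
        else                  buf[k++] = a[j++];
      }
      std::memcpy(&a[start], &buf[0], sizeof(int) * (end - start + 1));
    }

Call as MergeSort(a, buf, 0, (int)a.size() - 1) with buf sized to a.size(); the original allocates m_buffer once before sorting and frees it afterwards.
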
diff --git a/moses/TranslationModel/fuzzy-match/SuffixArray.h b/moses/TranslationModel/fuzzy-match/SuffixArray.h
index 5cfb120d6..a2dbf892c 100644
--- a/moses/TranslationModel/fuzzy-match/SuffixArray.h
+++ b/moses/TranslationModel/fuzzy-match/SuffixArray.h
@@ -6,51 +6,63 @@
namespace tmmt
{
-
-class SuffixArray
+
+class SuffixArray
{
public:
- typedef unsigned int INDEX;
+ typedef unsigned int INDEX;
private:
std::vector< std::vector< WORD_ID > > corpus;
WORD_ID *m_array;
- INDEX *m_index;
- INDEX *m_buffer;
- char *m_wordInSentence;
- size_t *m_sentence;
- char *m_sentenceLength;
- WORD_ID m_endOfSentence;
- Vocabulary m_vcb;
- INDEX m_size;
+ INDEX *m_index;
+ INDEX *m_buffer;
+ char *m_wordInSentence;
+ size_t *m_sentence;
+ char *m_sentenceLength;
+ WORD_ID m_endOfSentence;
+ Vocabulary m_vcb;
+ INDEX m_size;
public:
- SuffixArray( std::string fileName );
- ~SuffixArray();
-
- void Sort(INDEX start, INDEX end);
- int CompareIndex( INDEX a, INDEX b ) const;
- inline int CompareWord( WORD_ID a, WORD_ID b ) const;
- int Count( const std::vector< WORD > &phrase );
- bool MinCount( const std::vector< WORD > &phrase, INDEX min );
- bool Exists( const std::vector< WORD > &phrase );
- int FindMatches( const std::vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = 0, INDEX search_end = -1 );
- int LimitedCount( const std::vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = -1, INDEX search_end = 0 );
- INDEX FindFirst( const std::vector< WORD > &phrase, INDEX &start, INDEX &end );
- INDEX FindLast( const std::vector< WORD > &phrase, INDEX start, INDEX end, int direction );
- int Match( const std::vector< WORD > &phrase, INDEX index );
- void List( INDEX start, INDEX end );
- inline INDEX GetPosition( INDEX index ) { return m_index[ index ]; }
- inline size_t GetSentence( INDEX position ) { return m_sentence[position]; }
- inline char GetWordInSentence( INDEX position ) { return m_wordInSentence[position]; }
- inline char GetSentenceLength( size_t sentenceId ) { return m_sentenceLength[sentenceId]; }
- inline INDEX GetSize() { return m_size; }
-
- Vocabulary &GetVocabulary()
- { return m_vcb; }
- const std::vector< std::vector< WORD_ID > > &GetCorpus() const
- { return corpus; }
+ SuffixArray( std::string fileName );
+ ~SuffixArray();
+
+ void Sort(INDEX start, INDEX end);
+ int CompareIndex( INDEX a, INDEX b ) const;
+ inline int CompareWord( WORD_ID a, WORD_ID b ) const;
+ int Count( const std::vector< WORD > &phrase );
+ bool MinCount( const std::vector< WORD > &phrase, INDEX min );
+ bool Exists( const std::vector< WORD > &phrase );
+ int FindMatches( const std::vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = 0, INDEX search_end = -1 );
+ int LimitedCount( const std::vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = -1, INDEX search_end = 0 );
+ INDEX FindFirst( const std::vector< WORD > &phrase, INDEX &start, INDEX &end );
+ INDEX FindLast( const std::vector< WORD > &phrase, INDEX start, INDEX end, int direction );
+ int Match( const std::vector< WORD > &phrase, INDEX index );
+ void List( INDEX start, INDEX end );
+ inline INDEX GetPosition( INDEX index ) {
+ return m_index[ index ];
+ }
+ inline size_t GetSentence( INDEX position ) {
+ return m_sentence[position];
+ }
+ inline char GetWordInSentence( INDEX position ) {
+ return m_wordInSentence[position];
+ }
+ inline char GetSentenceLength( size_t sentenceId ) {
+ return m_sentenceLength[sentenceId];
+ }
+ inline INDEX GetSize() {
+ return m_size;
+ }
+
+ Vocabulary &GetVocabulary() {
+ return m_vcb;
+ }
+ const std::vector< std::vector< WORD_ID > > &GetCorpus() const {
+ return corpus;
+ }
};
}
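
FindFirst and FindLast together implement an equal_range-style lookup over the sorted suffix index: binary-search to any one matching row, then binary-search outward in each direction for the boundaries of the matching block; LimitedCount adds the two distances to get the occurrence count. A hedged usage sketch of the public API (the corpus file name and phrase are made up):

    #include <iostream>
    #include <string>
    #include <vector>
    #include "moses/TranslationModel/fuzzy-match/SuffixArray.h"

    int main()
    {
      tmmt::SuffixArray sa("corpus.tok");   // hypothetical tokenized corpus
      std::vector<tmmt::WORD> phrase;
      phrase.push_back("the");
      phrase.push_back("house");

      if (sa.Exists(phrase)) {
        tmmt::SuffixArray::INDEX first, last;
        int n = sa.FindMatches(phrase, first, last);
        std::cout << n << " occurrence(s), suffix rows "
                  << first << ".." << last << std::endl;
      }
      return 0;
    }
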
diff --git a/moses/TranslationModel/fuzzy-match/Vocabulary.cpp b/moses/TranslationModel/fuzzy-match/Vocabulary.cpp
index 0c833ff78..ab1439a29 100644
--- a/moses/TranslationModel/fuzzy-match/Vocabulary.cpp
+++ b/moses/TranslationModel/fuzzy-match/Vocabulary.cpp
@@ -10,7 +10,8 @@ namespace tmmt
{
// as in beamdecoder/tables.cpp
-vector<WORD_ID> Vocabulary::Tokenize( const char input[] ) {
+vector<WORD_ID> Vocabulary::Tokenize( const char input[] )
+{
vector< WORD_ID > token;
bool betweenWords = true;
int start=0;
@@ -21,8 +22,7 @@ vector<WORD_ID> Vocabulary::Tokenize( const char input[] ) {
if (!isSpace && betweenWords) {
start = i;
betweenWords = false;
- }
- else if (isSpace && !betweenWords) {
+ } else if (isSpace && !betweenWords) {
token.push_back( StoreIfNew ( string( input+start, i-start ) ) );
betweenWords = true;
}
@@ -32,9 +32,11 @@ vector<WORD_ID> Vocabulary::Tokenize( const char input[] ) {
return token;
}
-WORD_ID Vocabulary::StoreIfNew( const WORD& word ) {
+WORD_ID Vocabulary::StoreIfNew( const WORD& word )
+{
- { // read=lock scope
+ {
+ // read-lock scope
#ifdef WITH_THREADS
boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
#endif
@@ -43,17 +45,18 @@ WORD_ID Vocabulary::StoreIfNew( const WORD& word ) {
if( i != lookup.end() )
return i->second;
}
-
+
#ifdef WITH_THREADS
boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
#endif
WORD_ID id = vocab.size();
vocab.push_back( word );
lookup[ word ] = id;
- return id;
+ return id;
}
-WORD_ID Vocabulary::GetWordID( const WORD &word ) {
+WORD_ID Vocabulary::GetWordID( const WORD &word )
+{
#ifdef WITH_THREADS
boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
#endif
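
StoreIfNew above follows the classic read-mostly interning pattern: take a shared lock for the lookup that usually hits, drop it, then take an exclusive lock to insert. A self-contained sketch of the same pattern with boost::shared_mutex; note the sketch re-checks the map under the write lock, a guard against two threads racing past the read phase that the original omits (names are illustrative):

    #include <map>
    #include <string>
    #include <vector>
    #include <boost/thread/shared_mutex.hpp>
    #include <boost/thread/locks.hpp>

    class Interner
    {
      std::map<std::string, unsigned> m_lookup;
      std::vector<std::string> m_vocab;
      mutable boost::shared_mutex m_lock;
    public:
      unsigned StoreIfNew(const std::string &w) {
        {
          // fast path: any number of readers may hold the shared lock
          boost::shared_lock<boost::shared_mutex> read(m_lock);
          std::map<std::string, unsigned>::const_iterator i = m_lookup.find(w);
          if (i != m_lookup.end()) return i->second;
        }
        // slow path: exclusive lock, then re-check before inserting
        boost::unique_lock<boost::shared_mutex> write(m_lock);
        std::map<std::string, unsigned>::const_iterator i = m_lookup.find(w);
        if (i != m_lookup.end()) return i->second;
        unsigned id = (unsigned)m_vocab.size();
        m_vocab.push_back(w);
        m_lookup[w] = id;
        return id;
      }
    };
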
diff --git a/moses/TranslationModel/fuzzy-match/Vocabulary.h b/moses/TranslationModel/fuzzy-match/Vocabulary.h
index 7be82bcbe..dfa11c1db 100644
--- a/moses/TranslationModel/fuzzy-match/Vocabulary.h
+++ b/moses/TranslationModel/fuzzy-match/Vocabulary.h
@@ -34,16 +34,20 @@ namespace tmmt
typedef std::string WORD;
typedef unsigned int WORD_ID;
-class Vocabulary {
- public:
+class Vocabulary
+{
+public:
std::map<WORD, WORD_ID> lookup;
std::vector< WORD > vocab;
WORD_ID StoreIfNew( const WORD& );
WORD_ID GetWordID( const WORD& );
std::vector<WORD_ID> Tokenize( const char[] );
- inline WORD &GetWord( WORD_ID id ) const { WORD &i = (WORD&) vocab[ id ]; return i; }
+ inline WORD &GetWord( WORD_ID id ) const {
+ WORD &i = (WORD&) vocab[ id ];
+ return i;
+ }
- protected:
+protected:
#ifdef WITH_THREADS
//reader-writer lock
mutable boost::shared_mutex m_accessLock;
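
As a quick orientation to the API above: Tokenize splits a C string on whitespace and interns each token, StoreIfNew returns a stable WORD_ID per distinct word, and GetWord maps an id back to its string. A minimal usage sketch (assumes this header is on the include path):

    #include <iostream>
    #include <vector>
    #include "moses/TranslationModel/fuzzy-match/Vocabulary.h"

    int main()
    {
      tmmt::Vocabulary vcb;
      std::vector<tmmt::WORD_ID> ids = vcb.Tokenize("the quick brown the");
      // "the" is interned once, so ids.front() == ids.back()
      for (size_t i = 0; i < ids.size(); ++i)
        std::cout << ids[i] << ":" << vcb.GetWord(ids[i]) << " ";
      std::cout << std::endl;
      return 0;
    }
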
diff --git a/moses/TranslationModel/fuzzy-match/create_xml.cpp b/moses/TranslationModel/fuzzy-match/create_xml.cpp
index 783fb93eb..44c1efc9f 100644
--- a/moses/TranslationModel/fuzzy-match/create_xml.cpp
+++ b/moses/TranslationModel/fuzzy-match/create_xml.cpp
@@ -42,12 +42,10 @@ void create_xml(const string &inPath)
string inLine;
int step = 0;
- while (!inStrme.eof())
- {
+ while (!inStrme.eof()) {
getline(inStrme, inLine);
//cout << inLine << endl;
- switch (step)
- {
+ switch (step) {
case 0:
setenceId = Scan<int>(inLine);
++step;
@@ -63,8 +61,7 @@ void create_xml(const string &inPath)
case 3:
if (input == NULL) {
input = new string(inLine);
- }
- else {
+ } else {
assert(inLine == *input);
}
++step;
@@ -87,9 +84,9 @@ void create_xml(const string &inPath)
//print STDOUT $frame."\n";
rule << ret.ruleS << " [X] ||| " << ret.ruleT << " [X] ||| " << ret.ruleAlignment
- << " ||| " << count << endl;
+ << " ||| " << count << endl;
ruleInv << ret.ruleT << " [X] ||| " << ret.ruleS << " [X] ||| " << ret.ruleAlignmentInv
- << " ||| " << count << endl;
+ << " ||| " << count << endl;
//print STDOUT "$sentenceInd ||| $score ||| $count\n";
++ruleCount;
@@ -112,8 +109,8 @@ CreateXMLRetValues createXML(int ruleCount, const string &source, const string &
{
CreateXMLRetValues ret;
vector<string> sourceToks = Tokenize(source, " ")
- ,inputToks = Tokenize(input, " ")
- ,targetsToks = Tokenize(target, " ");
+ ,inputToks = Tokenize(input, " ")
+ ,targetsToks = Tokenize(target, " ");
Alignments alignments(align, sourceToks.size(), targetsToks.size());
map<int, string> frameInput;
map<int, int> alignI2S;
@@ -241,8 +238,7 @@ CreateXMLRetValues createXML(int ruleCount, const string &source, const string &
if (action == "M") {
inputBitmap.push_back(1);
- }
- else if (action == "I" || action == "S") {
+ } else if (action == "I" || action == "S") {
inputBitmap.push_back(0);
}
@@ -358,9 +354,8 @@ CreateXMLRetValues createXML(int ruleCount, const string &source, const string &
}
// end of tm target inclusion (not included word or inserted input)
else if (currently_included
- && ( targetBitmap[t] || frameInput.find(t) != frameInput.end() )
- )
- {
+ && ( targetBitmap[t] || frameInput.find(t) != frameInput.end() )
+ ) {
// add xml (unless change is at the beginning of the sentence)
if ( start_t >= 0 ) {
string target = "";
diff --git a/moses/TranslationOption.cpp b/moses/TranslationOption.cpp
index 87721bc52..dfe3312fc 100644
--- a/moses/TranslationOption.cpp
+++ b/moses/TranslationOption.cpp
@@ -42,11 +42,11 @@ TranslationOption::TranslationOption(const WordsRange &wordsRange
}
TranslationOption::TranslationOption(const TranslationOption &copy, const WordsRange &sourceWordsRange)
-: m_targetPhrase(copy.m_targetPhrase)
+ : m_targetPhrase(copy.m_targetPhrase)
//, m_sourcePhrase(new Phrase(*copy.m_sourcePhrase)) // TODO use when confusion network trans opt for confusion net properly implemented
-, m_sourceWordsRange(sourceWordsRange)
-, m_futureScore(copy.m_futureScore)
-, m_lexReorderingScores(copy.m_lexReorderingScores)
+ , m_sourceWordsRange(sourceWordsRange)
+ , m_futureScore(copy.m_futureScore)
+ , m_lexReorderingScores(copy.m_lexReorderingScores)
{}
bool TranslationOption::IsCompatible(const Phrase& phrase, const std::vector<FactorType>& featuresToCheck) const
diff --git a/moses/TranslationOption.h b/moses/TranslationOption.h
index 8e2064f83..b1de31eb1 100644
--- a/moses/TranslationOption.h
+++ b/moses/TranslationOption.h
@@ -146,18 +146,18 @@ public:
void CacheLexReorderingScores(const LexicalReordering &scoreProducer, const Scores &score);
TO_STRING();
-
- bool operator== (const TranslationOption &rhs) const
- {
+
+ bool operator== (const TranslationOption &rhs) const {
return m_sourceWordsRange == rhs.m_sourceWordsRange &&
- m_targetPhrase == rhs.m_targetPhrase;
- }
+ m_targetPhrase == rhs.m_targetPhrase;
+ }
};
//XXX: This doesn't look at the alignment. Is this correct?
-inline size_t hash_value(const TranslationOption& translationOption) {
+inline size_t hash_value(const TranslationOption& translationOption)
+{
size_t seed = 0;
boost::hash_combine(seed, translationOption.GetTargetPhrase());
boost::hash_combine(seed, translationOption.GetStartPos());
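
The hash_value free function above is the standard Boost idiom: start from a zero seed and fold each relevant field in with boost::hash_combine, so boost::hash<TranslationOption> (and hence Boost's unordered containers) finds the function by argument-dependent lookup. A minimal standalone sketch of the same idiom on a hypothetical type:

    #include <cstddef>
    #include <string>
    #include <boost/functional/hash.hpp>

    struct Span
    {
      std::string phrase;
      std::size_t start, end;
    };

    // picked up by ADL, exactly as for TranslationOption
    inline std::size_t hash_value(const Span &s)
    {
      std::size_t seed = 0;
      boost::hash_combine(seed, s.phrase);
      boost::hash_combine(seed, s.start);
      boost::hash_combine(seed, s.end);
      return seed;
    }
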
diff --git a/moses/TranslationOptionCollection.cpp b/moses/TranslationOptionCollection.cpp
index 16bcce791..2d7024c7a 100644
--- a/moses/TranslationOptionCollection.cpp
+++ b/moses/TranslationOptionCollection.cpp
@@ -48,11 +48,11 @@ bool CompareTranslationOption(const TranslationOption *a, const TranslationOptio
* This fn should be called by inherited classes
*/
TranslationOptionCollection::TranslationOptionCollection(
- InputType const& src, size_t maxNoTransOptPerCoverage, float translationOptionThreshold)
+ InputType const& src, size_t maxNoTransOptPerCoverage, float translationOptionThreshold)
: m_source(src)
- ,m_futureScore(src.GetSize())
- ,m_maxNoTransOptPerCoverage(maxNoTransOptPerCoverage)
- ,m_translationOptionThreshold(translationOptionThreshold)
+ ,m_futureScore(src.GetSize())
+ ,m_maxNoTransOptPerCoverage(maxNoTransOptPerCoverage)
+ ,m_translationOptionThreshold(translationOptionThreshold)
{
// create 2-d vector
size_t size = src.GetSize();
@@ -202,73 +202,68 @@ void TranslationOptionCollection::ProcessOneUnknownWord(const Word &sourceWord,s
const UnknownWordPenaltyProducer *unknownWordPenaltyProducer = staticData.GetUnknownWordPenaltyProducer();
float unknownScore = FloorScore(TransformScore(0));
- // unknown word, add as trans opt
- FactorCollection &factorCollection = FactorCollection::Instance();
-
- size_t isDigit = 0;
-
- const Factor *f = sourceWord[0]; // TODO hack. shouldn't know which factor is surface
- const StringPiece s = f->GetString();
- bool isEpsilon = (s=="" || s==EPSILON);
- if (StaticData::Instance().GetDropUnknown())
- {
-
-
- isDigit = s.find_first_of("0123456789");
- if (isDigit == 1)
- isDigit = 1;
- else
- isDigit = 0;
- // modify the starting bitmap
- }
-
- Phrase* m_unksrc = new Phrase(1);
+ // unknown word, add as trans opt
+ FactorCollection &factorCollection = FactorCollection::Instance();
+
+ size_t isDigit = 0;
+
+ const Factor *f = sourceWord[0]; // TODO hack. shouldn't know which factor is surface
+ const StringPiece s = f->GetString();
+ bool isEpsilon = (s=="" || s==EPSILON);
+ if (StaticData::Instance().GetDropUnknown()) {
+
+
+ isDigit = s.find_first_of("0123456789");
+ if (isDigit == 1)
+ isDigit = 1;
+ else
+ isDigit = 0;
+ // modify the starting bitmap
+ }
+
+ Phrase* m_unksrc = new Phrase(1);
m_unksrc->AddWord() = sourceWord;
- m_unksrcs.push_back(m_unksrc);
-
- TranslationOption *transOpt;
- TargetPhrase targetPhrase;
- targetPhrase.SetSourcePhrase(*m_unksrc);
-
- if (!(staticData.GetDropUnknown() || isEpsilon) || isDigit)
- {
- // add to dictionary
-
- Word &targetWord = targetPhrase.AddWord();
-
- for (unsigned int currFactor = 0 ; currFactor < MAX_NUM_FACTORS ; currFactor++)
- {
- FactorType factorType = static_cast<FactorType>(currFactor);
-
- const Factor *sourceFactor = sourceWord[currFactor];
- if (sourceFactor == NULL)
- targetWord[factorType] = factorCollection.AddFactor(UNKNOWN_FACTOR);
- else
- targetWord[factorType] = factorCollection.AddFactor(sourceFactor->GetString());
- }
- //create a one-to-one alignment between UNKNOWN_FACTOR and its verbatim translation
-
- targetPhrase.SetAlignmentInfo("0-0");
-
- }
- else
- {
- // drop source word. create blank trans opt
-
- //targetPhrase.SetAlignment();
-
- }
+ m_unksrcs.push_back(m_unksrc);
+
+ TranslationOption *transOpt;
+ TargetPhrase targetPhrase;
+ targetPhrase.SetSourcePhrase(*m_unksrc);
+
+ if (!(staticData.GetDropUnknown() || isEpsilon) || isDigit) {
+ // add to dictionary
+
+ Word &targetWord = targetPhrase.AddWord();
+
+ for (unsigned int currFactor = 0 ; currFactor < MAX_NUM_FACTORS ; currFactor++) {
+ FactorType factorType = static_cast<FactorType>(currFactor);
+
+ const Factor *sourceFactor = sourceWord[currFactor];
+ if (sourceFactor == NULL)
+ targetWord[factorType] = factorCollection.AddFactor(UNKNOWN_FACTOR);
+ else
+ targetWord[factorType] = factorCollection.AddFactor(sourceFactor->GetString());
+ }
+ //create a one-to-one alignment between UNKNOWN_FACTOR and its verbatim translation
+
+ targetPhrase.SetAlignmentInfo("0-0");
+
+ } else {
+ // drop source word. create blank trans opt
+
+ //targetPhrase.SetAlignment();
+
+ }
targetPhrase.GetScoreBreakdown().Assign(unknownWordPenaltyProducer, unknownScore);
- if (inputScores != NULL) {
- targetPhrase.SetInputScore(*inputScores);
- }
+ if (inputScores != NULL) {
+ targetPhrase.SetInputScore(*inputScores);
+ }
- targetPhrase.Evaluate(*m_unksrc);
+ targetPhrase.Evaluate(*m_unksrc);
- transOpt = new TranslationOption(WordsRange(sourcePos, sourcePos + length - 1), targetPhrase);
- Add(transOpt);
+ transOpt = new TranslationOption(WordsRange(sourcePos, sourcePos + length - 1), targetPhrase);
+ Add(transOpt);
}
@@ -426,19 +421,19 @@ void TranslationOptionCollection::EvaluateWithSource()
{
const size_t size = m_source.GetSize();
for (size_t startPos = 0 ; startPos < size ; ++startPos) {
- size_t maxSize = m_source.GetSize() - startPos;
- size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();
- maxSize = std::min(maxSize, maxSizePhrase);
-
- for (size_t endPos = startPos ; endPos < startPos + maxSize ; ++endPos) {
- TranslationOptionList &transOptList = GetTranslationOptionList(startPos, endPos);
-
- TranslationOptionList::const_iterator iterTransOpt;
- for(iterTransOpt = transOptList.begin() ; iterTransOpt != transOptList.end() ; ++iterTransOpt) {
- TranslationOption &transOpt = **iterTransOpt;
- transOpt.Evaluate(m_source);
- }
- }
+ size_t maxSize = m_source.GetSize() - startPos;
+ size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();
+ maxSize = std::min(maxSize, maxSizePhrase);
+
+ for (size_t endPos = startPos ; endPos < startPos + maxSize ; ++endPos) {
+ TranslationOptionList &transOptList = GetTranslationOptionList(startPos, endPos);
+
+ TranslationOptionList::const_iterator iterTransOpt;
+ for(iterTransOpt = transOptList.begin() ; iterTransOpt != transOptList.end() ; ++iterTransOpt) {
+ TranslationOption &transOpt = **iterTransOpt;
+ transOpt.Evaluate(m_source);
+ }
+ }
}
}
@@ -514,7 +509,7 @@ void TranslationOptionCollection::CreateTranslationOptionsForRange(
for (++iterStep ; iterStep != decodeGraph.end() ; ++iterStep) {
- const DecodeStep &decodeStep = **iterStep;
+ const DecodeStep &decodeStep = **iterStep;
PartialTranslOptColl* newPtoc = new PartialTranslOptColl;
// go thru each intermediate trans opt just created
@@ -634,7 +629,7 @@ std::ostream& operator<<(std::ostream& out, const TranslationOptionCollection& c
return out;
}
-const std::vector<Phrase*>& TranslationOptionCollection::GetUnknownSources() const
+const std::vector<Phrase*>& TranslationOptionCollection::GetUnknownSources() const
{
return m_unksrcs;
}
diff --git a/moses/TranslationOptionCollection.h b/moses/TranslationOptionCollection.h
index 36164f135..171a082e2 100644
--- a/moses/TranslationOptionCollection.h
+++ b/moses/TranslationOptionCollection.h
@@ -42,7 +42,7 @@ class InputType;
class FactorMask;
class Word;
class DecodeGraph;
-
+
/** Contains all phrase translations applicable to current input type (a sentence or confusion network).
* A key insight into efficient decoding is that various input
* conditions (trelliss, factored input, normal text, xml markup)
diff --git a/moses/TranslationOptionCollectionConfusionNet.cpp b/moses/TranslationOptionCollectionConfusionNet.cpp
index a25e8cffb..93953ba8a 100644
--- a/moses/TranslationOptionCollectionConfusionNet.cpp
+++ b/moses/TranslationOptionCollectionConfusionNet.cpp
@@ -10,8 +10,8 @@ namespace Moses
/** constructor; just initialize the base class */
TranslationOptionCollectionConfusionNet::TranslationOptionCollectionConfusionNet(
- const ConfusionNet &input
- , size_t maxNoTransOptPerCoverage, float translationOptionThreshold)
+ const ConfusionNet &input
+ , size_t maxNoTransOptPerCoverage, float translationOptionThreshold)
: TranslationOptionCollection(input, maxNoTransOptPerCoverage, translationOptionThreshold) {}
/* forcibly create translation option for a particular source word.
diff --git a/moses/TreeInput.cpp b/moses/TreeInput.cpp
index acae0bdb1..166445602 100644
--- a/moses/TreeInput.cpp
+++ b/moses/TreeInput.cpp
@@ -149,7 +149,7 @@ bool TreeInput::ProcessAndStripXMLTags(string &line, std::vector<XMLParseOutput>
return false;
}
- // may be either a input span label ("label"), or a specified output translation "translation"
+ // may be either an input span label ("label") or a specified output translation ("translation")
string label = ParseXmlTagAttribute(tagContent,"label");
string translation = ParseXmlTagAttribute(tagContent,"translation");
@@ -165,18 +165,17 @@ bool TreeInput::ProcessAndStripXMLTags(string &line, std::vector<XMLParseOutput>
vector<string> altTexts = TokenizeMultiCharSeparator(translation, "||");
vector<string> altLabel = TokenizeMultiCharSeparator(label, "||");
vector<string> altProbs = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"prob"), "||");
- //TRACE_ERR("number of translations: " << altTexts.size() << endl);
+ //TRACE_ERR("number of translations: " << altTexts.size() << endl);
for (size_t i=0; i<altTexts.size(); ++i) {
// set target phrase
TargetPhrase targetPhrase;
targetPhrase.CreateFromString(Output, outputFactorOrder,altTexts[i],factorDelimiter, NULL);
// set constituent label
- string targetLHSstr;
+ string targetLHSstr;
if (altLabel.size() > i && altLabel[i].size() > 0) {
targetLHSstr = altLabel[i];
- }
- else {
+ } else {
const UnknownLHSList &lhsList = StaticData::Instance().GetUnknownLHS();
UnknownLHSList::const_iterator iterLHS = lhsList.begin();
targetLHSstr = iterLHS->first;
diff --git a/moses/TrellisPath.cpp b/moses/TrellisPath.cpp
index c73575b2c..fed8f9658 100644
--- a/moses/TrellisPath.cpp
+++ b/moses/TrellisPath.cpp
@@ -41,7 +41,8 @@ TrellisPath::TrellisPath(const Hypothesis *hypo)
}
}
-void TrellisPath::InitScore() {
+void TrellisPath::InitScore()
+{
m_totalScore = m_path[0]->GetWinningHypo()->GetTotalScore();
m_scoreBreakdown= m_path[0]->GetWinningHypo()->GetScoreBreakdown();
@@ -82,8 +83,8 @@ TrellisPath::TrellisPath(const TrellisPath &copy, size_t edgeIndex, const Hypoth
InitScore();
}
-TrellisPath::TrellisPath(const vector<const Hypothesis*> edges)
-:m_prevEdgeChanged(NOT_FOUND)
+TrellisPath::TrellisPath(const vector<const Hypothesis*> edges)
+ :m_prevEdgeChanged(NOT_FOUND)
{
m_path.resize(edges.size());
copy(edges.rbegin(),edges.rend(),m_path.begin());
diff --git a/moses/TrellisPath.h b/moses/TrellisPath.h
index d8005435c..26e722696 100644
--- a/moses/TrellisPath.h
+++ b/moses/TrellisPath.h
@@ -59,36 +59,36 @@ protected:
void InitScore();
public:
- TrellisPath(); // not implemented
-
- //! create path OF pure hypo
- TrellisPath(const Hypothesis *hypo);
-
- /** create path from another path, deviate at edgeIndex by using arc instead,
- * which may change other hypo back from there
- */
- TrellisPath(const TrellisPath &copy, size_t edgeIndex, const Hypothesis *arc);
-
- //! get score for this path throught trellis
- inline float GetTotalScore() const { return m_totalScore; }
-
- /** list of each hypo/arcs in path. For anything other than the best hypo, it is not possible just to follow the
- * m_prevHypo variable in the hypothesis object
- */
- inline const std::vector<const Hypothesis *> &GetEdges() const
- {
- return m_path;
- }
-
- inline size_t GetSize() const
- {
- return m_path.size();
- }
-
- //! create a set of next best paths by wiggling 1 of the node at a time.
- void CreateDeviantPaths(TrellisPathCollection &pathColl) const;
-
- //! create a list of next best paths by wiggling 1 of the node at a time.
+ TrellisPath(); // not implemented
+
+ //! create path of pure hypo
+ TrellisPath(const Hypothesis *hypo);
+
+ /** create path from another path, deviate at edgeIndex by using arc instead,
+ * which may change other hypo back from there
+ */
+ TrellisPath(const TrellisPath &copy, size_t edgeIndex, const Hypothesis *arc);
+
+ //! get score for this path through the trellis
+ inline float GetTotalScore() const {
+ return m_totalScore;
+ }
+
+ /** list of each hypo/arcs in path. For anything other than the best hypo, it is not possible just to follow the
+ * m_prevHypo variable in the hypothesis object
+ */
+ inline const std::vector<const Hypothesis *> &GetEdges() const {
+ return m_path;
+ }
+
+ inline size_t GetSize() const {
+ return m_path.size();
+ }
+
+ //! create a set of next best paths by wiggling one of the nodes at a time.
+ void CreateDeviantPaths(TrellisPathCollection &pathColl) const;
+
+ //! create a list of next best paths by wiggling one of the nodes at a time.
void CreateDeviantPaths(TrellisPathList &pathColl) const;
inline const ScoreComponentCollection &GetScoreBreakdown() const {
diff --git a/moses/TypeDef.h b/moses/TypeDef.h
index e44946a2f..2b98b5bc3 100644
--- a/moses/TypeDef.h
+++ b/moses/TypeDef.h
@@ -129,7 +129,7 @@ enum InputTypeEnum {
,WordLatticeInput = 2
,TreeInputType = 3
,WordLatticeInput2 = 4
-
+
};
enum XmlInputType {
@@ -169,8 +169,7 @@ enum WordAlignmentSort {
,TargetOrder = 1
};
-enum FormatType
-{
+enum FormatType {
MosesFormat
,HieroFormat
};
diff --git a/moses/Util.cpp b/moses/Util.cpp
index 13cee27f9..f92c32dbb 100644
--- a/moses/Util.cpp
+++ b/moses/Util.cpp
@@ -182,8 +182,7 @@ std::string PassthroughSGML(std::string &line, const std::string tagName, const
size_t close = lline.find(rbrack, open);
//check whether the tag is closed with '/>'; if not return the empty string
- if (close == std::string::npos)
- {
+ if (close == std::string::npos) {
TRACE_ERR("PassthroughSGML error: the <passthrough info/> tag does not end properly\n");
return meta;
}
@@ -198,8 +197,7 @@ std::string PassthroughSGML(std::string &line, const std::string tagName, const
lline = ToLower(line);
open = lline.find(lbrack+tagName);
- if (open != std::string::npos)
- {
+ if (open != std::string::npos) {
TRACE_ERR("PassthroughSGML error: there are two <passthrough> tags\n");
}
return meta;
diff --git a/moses/Util.h b/moses/Util.h
index 9f43d9dc3..e5bdc820a 100644
--- a/moses/Util.h
+++ b/moses/Util.h
@@ -363,27 +363,27 @@ std::string PassthroughSGML(std::string &line, const std::string tagName,const s
*/
inline std::string GetFirstString(const std::string& str, int& first_pos, const std::string& delimiters = " \t")
{
-
- std::string first_str;
- // Skip delimiters at beginning.
- std::string::size_type lastPos = str.find_first_not_of(delimiters, first_pos);
-
- // Find first "non-delimiter".
- std::string::size_type pos = str.find_first_of(delimiters, lastPos);
-
- if (std::string::npos != pos || std::string::npos != lastPos){
-
- first_str = str.substr(lastPos, pos - lastPos);
-
- // Skip delimiters. Note the "not_of"
- lastPos = str.find_first_not_of(delimiters, pos);
-
- }
-
- first_pos = lastPos;
- return first_str;
+
+ std::string first_str;
+ // Skip delimiters at beginning.
+ std::string::size_type lastPos = str.find_first_not_of(delimiters, first_pos);
+
+ // Find first "non-delimiter".
+ std::string::size_type pos = str.find_first_of(delimiters, lastPos);
+
+ if (std::string::npos != pos || std::string::npos != lastPos) {
+
+ first_str = str.substr(lastPos, pos - lastPos);
+
+ // Skip delimiters. Note the "not_of"
+ lastPos = str.find_first_not_of(delimiters, pos);
+
+ }
+
+ first_pos = lastPos;
+ return first_str;
}
-
+
template<class T>
T log_sum (T log_a, T log_b)
{
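
GetFirstString is an incremental tokenizer: each call returns the next delimiter-separated token and advances first_pos past it, so callers can pull tokens one at a time instead of splitting the whole string up front. A hedged usage sketch (assumes moses/Util.h is included; std::string::npos converted to int reads as -1 on common platforms, which is what ends the loop):

    #include <iostream>
    #include <string>
    #include "moses/Util.h"

    int main()
    {
      std::string line = "a b\tc";
      int pos = 0;
      while (pos >= 0 && pos < (int)line.size()) {
        std::string tok = Moses::GetFirstString(line, pos);
        if (!tok.empty()) std::cout << "[" << tok << "]";
      }
      std::cout << std::endl;   // prints [a][b][c]
      return 0;
    }
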
diff --git a/moses/Word.cpp b/moses/Word.cpp
index 69d382c8a..41e5fae03 100644
--- a/moses/Word.cpp
+++ b/moses/Word.cpp
@@ -87,7 +87,8 @@ std::string Word::GetString(const vector<FactorType> factorType,bool endWithBlan
return strme.str();
}
-StringPiece Word::GetString(FactorType factorType) const {
+StringPiece Word::GetString(FactorType factorType) const
+{
return m_factorArray[factorType]->GetString();
}
diff --git a/moses/Word.h b/moses/Word.h
index d650fb67e..e88b0441b 100644
--- a/moses/Word.h
+++ b/moses/Word.h
@@ -152,8 +152,9 @@ struct WordComparer {
};
-inline size_t hash_value(const Word& word) {
- return word.hash();
+inline size_t hash_value(const Word& word)
+{
+ return word.hash();
}
}
diff --git a/moses/XmlOption.cpp b/moses/XmlOption.cpp
index c8d639e0a..4b703b247 100644
--- a/moses/XmlOption.cpp
+++ b/moses/XmlOption.cpp
@@ -83,8 +83,8 @@ string TrimXml(const string& str, const std::string& lbrackStr, const std::strin
*/
bool isXmlTag(const string& tag, const std::string& lbrackStr, const std::string& rbrackStr)
{
- return (tag.substr(0,lbrackStr.length()) == lbrackStr &&
- (tag[lbrackStr.length()] == '/' ||
+ return (tag.substr(0,lbrackStr.length()) == lbrackStr &&
+ (tag[lbrackStr.length()] == '/' ||
(tag[lbrackStr.length()] >= 'a' && tag[lbrackStr.length()] <= 'z') ||
(tag[lbrackStr.length()] >= 'A' && tag[lbrackStr.length()] <= 'Z')));
}
@@ -111,7 +111,7 @@ vector<string> TokenizeXml(const string& str, const std::string& lbrackStr, cons
// walk through the string (loop over cpos)
while (cpos != str.size()) {
// find the next opening "<" of an xml tag
- lpos = str.find(lbrack, cpos); // lpos = str.find_first_of(lbrack, cpos);
+ lpos = str.find(lbrack, cpos); // lpos = str.find_first_of(lbrack, cpos);
if (lpos != string::npos) {
// find the end of the xml tag
rpos = str.find(rbrack, lpos+lbrackStr.length()-1); // rpos = str.find_first_of(rbrack, lpos);
@@ -149,8 +149,8 @@ vector<string> TokenizeXml(const string& str, const std::string& lbrackStr, cons
* \param lbrackStr xml tag's left bracket string, typically "<"
* \param rbrackStr xml tag's right bracket string, typically ">"
*/
-bool ProcessAndStripXMLTags(string &line, vector<XmlOption*> &res, ReorderingConstraint &reorderingConstraint, vector< size_t > &walls,
- const std::string& lbrackStr, const std::string& rbrackStr)
+bool ProcessAndStripXMLTags(string &line, vector<XmlOption*> &res, ReorderingConstraint &reorderingConstraint, vector< size_t > &walls,
+ const std::string& lbrackStr, const std::string& rbrackStr)
{
//parse XML markup in translation line
diff --git a/moses/XmlOption.h b/moses/XmlOption.h
index 45989c841..942446b26 100644
--- a/moses/XmlOption.h
+++ b/moses/XmlOption.h
@@ -30,8 +30,8 @@ std::string TrimXml(const std::string& str, const std::string& lbrackStr="<", co
bool isXmlTag(const std::string& tag, const std::string& lbrackStr="<", const std::string& rbrackStr=">");
std::vector<std::string> TokenizeXml(const std::string& str, const std::string& lbrackStr="<", const std::string& rbrackStr=">");
-bool ProcessAndStripXMLTags(std::string &line, std::vector<XmlOption*> &res, ReorderingConstraint &reorderingConstraint, std::vector< size_t > &walls,
- const std::string& lbrackStr="<", const std::string& rbrackStr=">");
+bool ProcessAndStripXMLTags(std::string &line, std::vector<XmlOption*> &res, ReorderingConstraint &reorderingConstraint, std::vector< size_t > &walls,
+ const std::string& lbrackStr="<", const std::string& rbrackStr=">");
}
diff --git a/phrase-extract/AlignmentPhrase.h b/phrase-extract/AlignmentPhrase.h
index ec6431f18..52d9c85ea 100644
--- a/phrase-extract/AlignmentPhrase.h
+++ b/phrase-extract/AlignmentPhrase.h
@@ -25,7 +25,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
namespace MosesTraining
{
-
+
class WordsRange;
class AlignmentElement
diff --git a/phrase-extract/ExtractedRule.cpp b/phrase-extract/ExtractedRule.cpp
index 985f2f093..50d9085e6 100644
--- a/phrase-extract/ExtractedRule.cpp
+++ b/phrase-extract/ExtractedRule.cpp
@@ -23,20 +23,19 @@ void ExtractedRule::OutputNTLengths(std::ostream &out) const
void ExtractedRule::OutputNTLengths(std::ostringstream &outString) const
{
std::map<size_t, std::pair<size_t, size_t> >::const_iterator iter;
- for (iter = m_ntLengths.begin(); iter != m_ntLengths.end(); ++iter)
- {
+ for (iter = m_ntLengths.begin(); iter != m_ntLengths.end(); ++iter) {
size_t sourcePos = iter->first;
const std::pair<size_t, size_t> &spanLengths = iter->second;
- outString << sourcePos << "=" << spanLengths.first << "," <<spanLengths.second << " ";
+ outString << sourcePos << "=" << spanLengths.first << "," <<spanLengths.second << " ";
}
}
std::ostream& operator<<(std::ostream &out, const ExtractedRule &obj)
{
- out << obj.source << " ||| " << obj.target << " ||| "
+ out << obj.source << " ||| " << obj.target << " ||| "
<< obj.alignment << " ||| "
<< obj.alignmentInv << " ||| ";
-
+
obj.OutputNTLengths(out);
return out;
diff --git a/phrase-extract/ExtractedRule.h b/phrase-extract/ExtractedRule.h
index 992a807b3..c26de37ca 100644
--- a/phrase-extract/ExtractedRule.h
+++ b/phrase-extract/ExtractedRule.h
@@ -49,7 +49,7 @@ public:
double pcfgScore;
std::map<size_t, std::pair<size_t, size_t> > m_ntLengths;
-
+
ExtractedRule(int sT, int eT, int sS, int eS)
: source()
, target()
@@ -64,12 +64,11 @@ public:
, count(0)
, pcfgScore(0.0)
{}
-
- void SetSpanLength(size_t sourcePos, size_t sourceLength, size_t targetLength)
- {
+
+ void SetSpanLength(size_t sourcePos, size_t sourceLength, size_t targetLength) {
m_ntLengths[sourcePos] = std::pair<size_t, size_t>(sourceLength, targetLength);
}
-
+
void OutputNTLengths(std::ostream &out) const;
void OutputNTLengths(std::ostringstream &out) const;
};
diff --git a/phrase-extract/Hole.h b/phrase-extract/Hole.h
index c570ec7a1..efedf2f53 100644
--- a/phrase-extract/Hole.h
+++ b/phrase-extract/Hole.h
@@ -72,7 +72,7 @@ public:
int GetSize(size_t direction) const {
return m_end[direction] - m_start[direction] + 1;
}
-
+
void SetPos(int pos, size_t direction) {
m_pos[direction] = pos;
}
diff --git a/phrase-extract/HoleCollection.cpp b/phrase-extract/HoleCollection.cpp
index fba295993..e63e2eacc 100644
--- a/phrase-extract/HoleCollection.cpp
+++ b/phrase-extract/HoleCollection.cpp
@@ -64,7 +64,7 @@ int HoleCollection::Scope(const Hole &proposedHole) const
const int holeEnd = proposedHole.GetEnd(0);
int scope = m_scope.back();
if (holeStart == m_sourcePhraseStart.back() ||
- find(m_sourceHoleEndPoints.begin(), m_sourceHoleEndPoints.end(), holeStart-1) != m_sourceHoleEndPoints.end()) {
+ find(m_sourceHoleEndPoints.begin(), m_sourceHoleEndPoints.end(), holeStart-1) != m_sourceHoleEndPoints.end()) {
++scope; // Adding hole would introduce choice point at start of hole.
}
if (holeEnd == m_sourcePhraseEnd.back() ||
diff --git a/phrase-extract/OutputFileStream.cpp b/phrase-extract/OutputFileStream.cpp
index 2cad33bb9..a61ce1ab1 100644
--- a/phrase-extract/OutputFileStream.cpp
+++ b/phrase-extract/OutputFileStream.cpp
@@ -46,11 +46,11 @@ OutputFileStream::~OutputFileStream()
bool OutputFileStream::Open(const std::string &filePath)
{
- m_outFile = new ofstream(filePath.c_str(), ios_base::out | ios_base::binary);
+ m_outFile = new ofstream(filePath.c_str(), ios_base::out | ios_base::binary);
if (m_outFile->fail()) {
return false;
}
-
+
if (filePath.size() > 3 && filePath.substr(filePath.size() - 3, 3) == ".gz") {
this->push(boost::iostreams::gzip_compressor());
}
@@ -64,10 +64,10 @@ void OutputFileStream::Close()
if (m_outFile == NULL) {
return;
}
-
+
this->flush();
this->pop(); // file
-
+
m_outFile->close();
delete m_outFile;
m_outFile = NULL;
diff --git a/phrase-extract/PhraseAlignment.cpp b/phrase-extract/PhraseAlignment.cpp
index bdfead082..f830e411f 100644
--- a/phrase-extract/PhraseAlignment.cpp
+++ b/phrase-extract/PhraseAlignment.cpp
@@ -29,10 +29,10 @@ extern bool hierarchicalFlag;
template<typename T>
inline T Scan(const std::string &input)
{
- std::stringstream stream(input);
- T ret;
- stream >> ret;
- return ret;
+ std::stringstream stream(input);
+ T ret;
+ stream >> ret;
+ return ret;
}
@@ -40,11 +40,10 @@ inline T Scan(const std::string &input)
template<typename T>
inline void Scan(std::vector<T> &output, const std::vector< std::string > &input)
{
- output.resize(input.size());
- for (size_t i = 0 ; i < input.size() ; i++)
- {
- output[i] = Scan<T>( input[i] );
- }
+ output.resize(input.size());
+ for (size_t i = 0 ; i < input.size() ; i++) {
+ output[i] = Scan<T>( input[i] );
+ }
}
@@ -56,7 +55,7 @@ inline void Tokenize(std::vector<std::string> &output
std::string::size_type lastPos = str.find_first_not_of(delimiters, 0);
// Find first "non-delimiter".
std::string::size_type pos = str.find_first_of(delimiters, lastPos);
-
+
while (std::string::npos != pos || std::string::npos != lastPos) {
// Found a token, add it to the vector.
output.push_back(str.substr(lastPos, pos - lastPos));
@@ -70,12 +69,12 @@ inline void Tokenize(std::vector<std::string> &output
// speeded up version of above
template<typename T>
inline void Tokenize( std::vector<T> &output
- , const std::string &input
- , const std::string& delimiters = " \t")
+ , const std::string &input
+ , const std::string& delimiters = " \t")
{
- std::vector<std::string> stringVector;
- Tokenize(stringVector, input, delimiters);
- return Scan<T>(output, stringVector );
+ std::vector<std::string> stringVector;
+ Tokenize(stringVector, input, delimiters);
+ return Scan<T>(output, stringVector );
}
// read in a phrase pair and store it
@@ -94,8 +93,7 @@ void PhraseAlignment::create( char line[], int lineID, bool includeSentenceIdFla
else if (item == 2) { // target phrase
phraseT.push_back( vcbT.storeIfNew( token[j] ) );
- }
- else if (item == 3) { // alignment
+ } else if (item == 3) { // alignment
int s,t;
sscanf(token[j].c_str(), "%d-%d", &s, &t);
if ((size_t)t >= phraseT.size() || (size_t)s >= phraseS.size()) {
@@ -135,17 +133,17 @@ void PhraseAlignment::create( char line[], int lineID, bool includeSentenceIdFla
void PhraseAlignment::addNTLength(const std::string &tok)
{
vector< string > tokens;
-
+
Tokenize(tokens, tok, "=");
assert(tokens.size() == 2);
-
+
size_t sourcePos = Scan<size_t>(tokens[0]);
assert(sourcePos < phraseS.size());
-
+
vector< size_t > ntLengths;
Tokenize<size_t>(ntLengths, tokens[1], ",");
assert(ntLengths.size() == 2);
-
+
m_ntLengths[sourcePos] = std::pair<size_t, size_t>(ntLengths[0], ntLengths[1]);
}
@@ -211,13 +209,13 @@ int PhraseAlignment::Compare(const PhraseAlignment &other) const
if (this == &other) // comparing with itself
return 0;
- if (GetTarget() != other.GetTarget())
+ if (GetTarget() != other.GetTarget())
return ( GetTarget() < other.GetTarget() ) ? -1 : +1;
if (GetSource() != other.GetSource())
- return ( GetSource() < other.GetSource() ) ? -1 : +1;
+ return ( GetSource() < other.GetSource() ) ? -1 : +1;
- if (!hierarchicalFlag)
+ if (!hierarchicalFlag)
return 0;
// loop over all words (note: 0 = left hand side of rule)
@@ -228,15 +226,14 @@ int PhraseAlignment::Compare(const PhraseAlignment &other) const
if (alignedToT[i].size() != 1 ||
other.alignedToT[i].size() != 1 ||
- thisAlign != otherAlign)
- {
+ thisAlign != otherAlign) {
int ret = (thisAlign < otherAlign) ? -1 : +1;
return ret;
}
}
}
return 0;
-
+
}
}
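
The file-local Scan and Tokenize templates above form a small conversion pipeline: Tokenize splits on delimiters and the typed overload funnels every piece through a stringstream into the target type, which is exactly how addNTLength parses a span spec such as "3,5". A short sketch, assuming those templates are in scope (ParseSpanLengths is a hypothetical caller):

    #include <cassert>
    #include <string>
    #include <vector>

    void ParseSpanLengths(const std::string &tok)   // e.g. tok == "3,5"
    {
      std::vector<size_t> lengths;
      Tokenize<size_t>(lengths, tok, ",");   // split, then Scan<size_t> each piece
      assert(lengths.size() == 2);
      // lengths[0]: source span length, lengths[1]: target span length
    }
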
diff --git a/phrase-extract/PhraseAlignment.h b/phrase-extract/PhraseAlignment.h
index c0df2aa37..06d9cfad0 100644
--- a/phrase-extract/PhraseAlignment.h
+++ b/phrase-extract/PhraseAlignment.h
@@ -24,7 +24,7 @@ protected:
PHRASE phraseT;
std::map<size_t, std::pair<size_t, size_t> > m_ntLengths;
-
+
void createAlignVec(size_t sourceSize, size_t targetSize);
void addNTLength(const std::string &tok);
public:
@@ -41,11 +41,10 @@ public:
bool equals( const PhraseAlignment& );
bool match( const PhraseAlignment& );
- int Compare(const PhraseAlignment &compare) const;
- inline bool operator<(const PhraseAlignment &compare) const
- {
- return Compare(compare) < 0;
- }
+ int Compare(const PhraseAlignment &compare) const;
+ inline bool operator<(const PhraseAlignment &compare) const {
+ return Compare(compare) < 0;
+ }
const PHRASE &GetSource() const {
return phraseS;
@@ -53,9 +52,10 @@ public:
const PHRASE &GetTarget() const {
return phraseT;
}
-
- const std::map<size_t, std::pair<size_t, size_t> > &GetNTLengths() const
- { return m_ntLengths; }
+
+ const std::map<size_t, std::pair<size_t, size_t> > &GetNTLengths() const {
+ return m_ntLengths;
+ }
};
@@ -67,8 +67,7 @@ typedef std::vector<PhraseAlignment*> PhraseAlignmentCollection;
class PhraseAlignmentCollectionOrderer
{
public:
- bool operator()(const PhraseAlignmentCollection &collA, const PhraseAlignmentCollection &collB) const
- {
+ bool operator()(const PhraseAlignmentCollection &collA, const PhraseAlignmentCollection &collB) const {
assert(collA.size() > 0);
assert(collB.size() > 0);
@@ -77,7 +76,7 @@ public:
bool ret = objA < objB;
return ret;
- }
+ }
};
@@ -97,10 +96,12 @@ public:
std::pair<Coll::iterator,bool> insert ( const PhraseAlignmentCollection& obj );
- const SortedColl &GetSortedColl() const
- { return m_sortedColl; }
- size_t GetSize() const
- { return m_coll.size(); }
+ const SortedColl &GetSortedColl() const {
+ return m_sortedColl;
+ }
+ size_t GetSize() const {
+ return m_coll.size();
+ }
private:
SortedColl m_sortedColl;
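
Compare is a three-way comparator (-1/0/+1 over target, then source, then alignment), and operator< simply tests Compare(...) < 0; that yields the strict weak ordering std::set and std::sort require while keeping one canonical comparison routine. A self-contained sketch of the same pattern on a hypothetical key type:

    #include <set>
    #include <string>

    struct PhrasePairKey
    {
      std::string src, tgt;

      int Compare(const PhrasePairKey &o) const {
        if (tgt != o.tgt) return (tgt < o.tgt) ? -1 : +1;   // target first
        if (src != o.src) return (src < o.src) ? -1 : +1;   // then source
        return 0;                                           // equal on all keys
      }
      bool operator<(const PhrasePairKey &o) const {
        return Compare(o) < 0;
      }
    };

    // a std::set<PhrasePairKey> now deduplicates exactly the pairs
    // that Compare reports as equal
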
diff --git a/phrase-extract/PhraseExtractionOptions.h b/phrase-extract/PhraseExtractionOptions.h
index 2daeaf0ca..60e56b08c 100644
--- a/phrase-extract/PhraseExtractionOptions.h
+++ b/phrase-extract/PhraseExtractionOptions.h
@@ -29,11 +29,12 @@ enum REO_MODEL_TYPE {REO_MSD, REO_MSLR, REO_MONO};
enum REO_POS {LEFT, RIGHT, DLEFT, DRIGHT, UNKNOWN};
-class PhraseExtractionOptions {
-
- public:
- const int maxPhraseLength;
- private:
+class PhraseExtractionOptions
+{
+
+public:
+ const int maxPhraseLength;
+private:
bool allModelsOutputFlag;
bool wordModel;
REO_MODEL_TYPE wordType;
@@ -48,103 +49,103 @@ class PhraseExtractionOptions {
bool gzOutput;
std::string instanceWeightsFile; //weights for each sentence
-public:
+public:
PhraseExtractionOptions(const int initmaxPhraseLength):
- maxPhraseLength(initmaxPhraseLength),
- allModelsOutputFlag(false),
- wordModel(false),
- wordType(REO_MSD),
- phraseModel(false),
- phraseType(REO_MSD),
- hierModel(false),
- hierType(REO_MSD),
- orientationFlag(false),
- translationFlag(true),
- includeSentenceIdFlag(false),
- onlyOutputSpanInfo(false),
- gzOutput(false){}
-
- //functions for initialization of options
- void initAllModelsOutputFlag(const bool initallModelsOutputFlag){
- allModelsOutputFlag=initallModelsOutputFlag;
- }
- void initWordModel(const bool initwordModel){
- wordModel=initwordModel;
- }
- void initWordType(REO_MODEL_TYPE initwordType ){
- wordType=initwordType;
- }
- void initPhraseModel(const bool initphraseModel ){
- phraseModel=initphraseModel;
- }
- void initPhraseType(REO_MODEL_TYPE initphraseType){
- phraseType=initphraseType;
- }
- void initHierModel(const bool inithierModel){
- hierModel=inithierModel;
- }
- void initHierType(REO_MODEL_TYPE inithierType){
- hierType=inithierType;
- }
- void initOrientationFlag(const bool initorientationFlag){
- orientationFlag=initorientationFlag;
- }
- void initTranslationFlag(const bool inittranslationFlag){
- translationFlag=inittranslationFlag;
- }
- void initIncludeSentenceIdFlag(const bool initincludeSentenceIdFlag){
- includeSentenceIdFlag=initincludeSentenceIdFlag;
- }
- void initOnlyOutputSpanInfo(const bool initonlyOutputSpanInfo){
- onlyOutputSpanInfo= initonlyOutputSpanInfo;
- }
- void initGzOutput (const bool initgzOutput){
- gzOutput= initgzOutput;
- }
- void initInstanceWeightsFile(const char* initInstanceWeightsFile) {
- instanceWeightsFile = std::string(initInstanceWeightsFile);
- }
-
- // functions for getting values
- bool isAllModelsOutputFlag() const {
- return allModelsOutputFlag;
- }
- bool isWordModel() const {
- return wordModel;
- }
- REO_MODEL_TYPE isWordType() const {
- return wordType;
- }
- bool isPhraseModel() const {
- return phraseModel;
- }
- REO_MODEL_TYPE isPhraseType() const {
- return phraseType;
- }
- bool isHierModel() const {
- return hierModel;
- }
- REO_MODEL_TYPE isHierType() const {
- return hierType;
- }
- bool isOrientationFlag() const {
- return orientationFlag;
- }
- bool isTranslationFlag() const {
- return translationFlag;
- }
- bool isIncludeSentenceIdFlag() const {
- return includeSentenceIdFlag;
- }
- bool isOnlyOutputSpanInfo() const {
- return onlyOutputSpanInfo;
- }
- bool isGzOutput () const {
- return gzOutput;
- }
- std::string getInstanceWeightsFile() const {
- return instanceWeightsFile;
- }
+ maxPhraseLength(initmaxPhraseLength),
+ allModelsOutputFlag(false),
+ wordModel(false),
+ wordType(REO_MSD),
+ phraseModel(false),
+ phraseType(REO_MSD),
+ hierModel(false),
+ hierType(REO_MSD),
+ orientationFlag(false),
+ translationFlag(true),
+ includeSentenceIdFlag(false),
+ onlyOutputSpanInfo(false),
+ gzOutput(false) {}
+
+ //functions for initialization of options
+ void initAllModelsOutputFlag(const bool initallModelsOutputFlag) {
+ allModelsOutputFlag=initallModelsOutputFlag;
+ }
+ void initWordModel(const bool initwordModel) {
+ wordModel=initwordModel;
+ }
+ void initWordType(REO_MODEL_TYPE initwordType ) {
+ wordType=initwordType;
+ }
+ void initPhraseModel(const bool initphraseModel ) {
+ phraseModel=initphraseModel;
+ }
+ void initPhraseType(REO_MODEL_TYPE initphraseType) {
+ phraseType=initphraseType;
+ }
+ void initHierModel(const bool inithierModel) {
+ hierModel=inithierModel;
+ }
+ void initHierType(REO_MODEL_TYPE inithierType) {
+ hierType=inithierType;
+ }
+ void initOrientationFlag(const bool initorientationFlag) {
+ orientationFlag=initorientationFlag;
+ }
+ void initTranslationFlag(const bool inittranslationFlag) {
+ translationFlag=inittranslationFlag;
+ }
+ void initIncludeSentenceIdFlag(const bool initincludeSentenceIdFlag) {
+ includeSentenceIdFlag=initincludeSentenceIdFlag;
+ }
+ void initOnlyOutputSpanInfo(const bool initonlyOutputSpanInfo) {
+ onlyOutputSpanInfo= initonlyOutputSpanInfo;
+ }
+ void initGzOutput (const bool initgzOutput) {
+ gzOutput= initgzOutput;
+ }
+ void initInstanceWeightsFile(const char* initInstanceWeightsFile) {
+ instanceWeightsFile = std::string(initInstanceWeightsFile);
+ }
+
+ // functions for getting values
+ bool isAllModelsOutputFlag() const {
+ return allModelsOutputFlag;
+ }
+ bool isWordModel() const {
+ return wordModel;
+ }
+ REO_MODEL_TYPE isWordType() const {
+ return wordType;
+ }
+ bool isPhraseModel() const {
+ return phraseModel;
+ }
+ REO_MODEL_TYPE isPhraseType() const {
+ return phraseType;
+ }
+ bool isHierModel() const {
+ return hierModel;
+ }
+ REO_MODEL_TYPE isHierType() const {
+ return hierType;
+ }
+ bool isOrientationFlag() const {
+ return orientationFlag;
+ }
+ bool isTranslationFlag() const {
+ return translationFlag;
+ }
+ bool isIncludeSentenceIdFlag() const {
+ return includeSentenceIdFlag;
+ }
+ bool isOnlyOutputSpanInfo() const {
+ return onlyOutputSpanInfo;
+ }
+ bool isGzOutput () const {
+ return gzOutput;
+ }
+ std::string getInstanceWeightsFile() const {
+ return instanceWeightsFile;
+ }
};
}
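
PhraseExtractionOptions is a write-once options bag: the extract binary constructs it with the maximum phrase length, flips individual flags through the init* setters while parsing argv, and the extraction code reads them back through the is* getters. A hedged usage sketch (the flag choices are hypothetical; the MosesTraining namespace is assumed from the surrounding phrase-extract code):

    #include "phrase-extract/PhraseExtractionOptions.h"

    int main()
    {
      // 7 = max phrase length; every other option defaults to off
      MosesTraining::PhraseExtractionOptions options(7);
      options.initOrientationFlag(true);   // also emit reordering orientation
      options.initGzOutput(true);          // gzip the extract files
      return options.isOrientationFlag() ? 0 : 1;
    }
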
diff --git a/phrase-extract/RuleExtractionOptions.h b/phrase-extract/RuleExtractionOptions.h
index 431be58b0..772d803a4 100644
--- a/phrase-extract/RuleExtractionOptions.h
+++ b/phrase-extract/RuleExtractionOptions.h
@@ -54,7 +54,7 @@ public:
bool unpairedExtractFormat;
bool conditionOnTargetLhs;
bool boundaryRules;
-
+
RuleExtractionOptions()
: maxSpan(10)
, minHoleSource(2)
diff --git a/phrase-extract/ScoreFeature.cpp b/phrase-extract/ScoreFeature.cpp
index 5998c528c..25e497df2 100644
--- a/phrase-extract/ScoreFeature.cpp
+++ b/phrase-extract/ScoreFeature.cpp
@@ -22,82 +22,81 @@
using namespace std;
-namespace MosesTraining
+namespace MosesTraining
{
- const string& ScoreFeatureManager::usage() const
- {
- const static string& usage = "[--[Sparse]Domain[Indicator|Ratio|Subset|Bin] domain-file [bins]]" ;
- return usage;
- }
+const string& ScoreFeatureManager::usage() const
+{
+ const static string& usage = "[--[Sparse]Domain[Indicator|Ratio|Subset|Bin] domain-file [bins]]" ;
+ return usage;
+}
- void ScoreFeatureManager::configure(const std::vector<std::string> args)
- {
- bool domainAdded = false;
- bool sparseDomainAdded = false;
- for (size_t i = 0; i < args.size(); ++i) {
- if (args[i] == "--IgnoreSentenceId") {
- m_includeSentenceId = true;
+void ScoreFeatureManager::configure(const std::vector<std::string> args)
+{
+ bool domainAdded = false;
+ bool sparseDomainAdded = false;
+ for (size_t i = 0; i < args.size(); ++i) {
+ if (args[i] == "--IgnoreSentenceId") {
+ m_includeSentenceId = true;
+ } else if (args[i].substr(0,8) == "--Domain") {
+ string type = args[i].substr(8);
+ ++i;
+ UTIL_THROW_IF(i == args.size(), ScoreFeatureArgumentException, "Missing domain file");
+ string domainFile = args[i];
+ UTIL_THROW_IF(domainAdded, ScoreFeatureArgumentException,
+ "Only allowed one domain feature");
+ if (type == "Subset") {
+ m_features.push_back(ScoreFeaturePtr(new SubsetDomainFeature(domainFile)));
+ } else if (type == "Ratio") {
+ m_features.push_back(ScoreFeaturePtr(new RatioDomainFeature(domainFile)));
+ } else if (type == "Indicator") {
+ m_features.push_back(ScoreFeaturePtr(new IndicatorDomainFeature(domainFile)));
+ } else {
+ UTIL_THROW(ScoreFeatureArgumentException, "Unknown domain feature type " << type);
}
- else if (args[i].substr(0,8) == "--Domain") {
- string type = args[i].substr(8);
- ++i;
- UTIL_THROW_IF(i == args.size(), ScoreFeatureArgumentException, "Missing domain file");
- string domainFile = args[i];
- UTIL_THROW_IF(domainAdded, ScoreFeatureArgumentException,
- "Only allowed one domain feature");
- if (type == "Subset") {
- m_features.push_back(ScoreFeaturePtr(new SubsetDomainFeature(domainFile)));
- } else if (type == "Ratio") {
- m_features.push_back(ScoreFeaturePtr(new RatioDomainFeature(domainFile)));
- } else if (type == "Indicator") {
- m_features.push_back(ScoreFeaturePtr(new IndicatorDomainFeature(domainFile)));
- } else {
- UTIL_THROW(ScoreFeatureArgumentException, "Unknown domain feature type " << type);
- }
- domainAdded = true;
- m_includeSentenceId = true;
- } else if (args[i].substr(0,14) == "--SparseDomain") {
- string type = args[i].substr(14);
- ++i;
- UTIL_THROW_IF(i == args.size(), ScoreFeatureArgumentException, "Missing domain file");
- string domainFile = args[i];
- UTIL_THROW_IF(sparseDomainAdded, ScoreFeatureArgumentException,
- "Only allowed one sparse domain feature");
- if (type == "Subset") {
- m_features.push_back(ScoreFeaturePtr(new SparseSubsetDomainFeature(domainFile)));
- } else if (type == "Ratio") {
- m_features.push_back(ScoreFeaturePtr(new SparseRatioDomainFeature(domainFile)));
- } else if (type == "Indicator") {
- m_features.push_back(ScoreFeaturePtr(new SparseIndicatorDomainFeature(domainFile)));
- } else {
- UTIL_THROW(ScoreFeatureArgumentException, "Unknown domain feature type " << type);
- }
- sparseDomainAdded = true;
- m_includeSentenceId = true;
+ domainAdded = true;
+ m_includeSentenceId = true;
+ } else if (args[i].substr(0,14) == "--SparseDomain") {
+ string type = args[i].substr(14);
+ ++i;
+ UTIL_THROW_IF(i == args.size(), ScoreFeatureArgumentException, "Missing domain file");
+ string domainFile = args[i];
+ UTIL_THROW_IF(sparseDomainAdded, ScoreFeatureArgumentException,
+ "Only allowed one sparse domain feature");
+ if (type == "Subset") {
+ m_features.push_back(ScoreFeaturePtr(new SparseSubsetDomainFeature(domainFile)));
+ } else if (type == "Ratio") {
+ m_features.push_back(ScoreFeaturePtr(new SparseRatioDomainFeature(domainFile)));
+ } else if (type == "Indicator") {
+ m_features.push_back(ScoreFeaturePtr(new SparseIndicatorDomainFeature(domainFile)));
} else {
- UTIL_THROW(ScoreFeatureArgumentException,"Unknown score argument " << args[i]);
+ UTIL_THROW(ScoreFeatureArgumentException, "Unknown domain feature type " << type);
}
+ sparseDomainAdded = true;
+ m_includeSentenceId = true;
+ } else {
+ UTIL_THROW(ScoreFeatureArgumentException,"Unknown score argument " << args[i]);
}
-
}
- bool ScoreFeatureManager::equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const
- {
- for (size_t i = 0; i < m_features.size(); ++i) {
- if (!m_features[i]->equals(lhs,rhs)) return false;
- }
- return true;
+}
+
+bool ScoreFeatureManager::equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const
+{
+ for (size_t i = 0; i < m_features.size(); ++i) {
+ if (!m_features[i]->equals(lhs,rhs)) return false;
}
+ return true;
+}
- void ScoreFeatureManager::addFeatures(const ScoreFeatureContext& context,
- std::vector<float>& denseValues,
- std::map<std::string,float>& sparseValues) const
- {
- for (size_t i = 0; i < m_features.size(); ++i) {
- m_features[i]->add(context, denseValues, sparseValues);
- }
- }
+void ScoreFeatureManager::addFeatures(const ScoreFeatureContext& context,
+ std::vector<float>& denseValues,
+ std::map<std::string,float>& sparseValues) const
+{
+ for (size_t i = 0; i < m_features.size(); ++i) {
+ m_features[i]->add(context, denseValues, sparseValues);
+ }
+}
}
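
The re-indented configure() above dispatches on option prefixes: `--Domain<Type> file` and `--SparseDomain<Type> file` share one shape, distinguished by prefix length (8 vs 14) and by which one-shot guard flag they trip. A self-contained sketch of that dispatch, with hypothetical names and error handling reduced to exceptions:

    #include <cstddef>
    #include <iostream>
    #include <stdexcept>
    #include <string>
    #include <vector>

    // Sketch of the prefix dispatch in ScoreFeatureManager::configure().
    void configureSketch(const std::vector<std::string>& args) {
      for (std::size_t i = 0; i < args.size(); ++i) {
        std::string type;
        if (args[i].compare(0, 14, "--SparseDomain") == 0) {
          type = args[i].substr(14);           // Subset | Ratio | Indicator
        } else if (args[i].compare(0, 8, "--Domain") == 0) {
          type = args[i].substr(8);
        } else {
          throw std::runtime_error("Unknown score argument " + args[i]);
        }
        if (++i == args.size())
          throw std::runtime_error("Missing domain file");
        std::cout << "would add " << type << " feature over " << args[i] << "\n";
      }
    }

    int main() {
      std::vector<std::string> args;
      args.push_back("--DomainIndicator");
      args.push_back("/dev/null");
      configureSketch(args);
    }
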
diff --git a/phrase-extract/ScoreFeature.h b/phrase-extract/ScoreFeature.h
index c7d856bcf..76939436f 100644
--- a/phrase-extract/ScoreFeature.h
+++ b/phrase-extract/ScoreFeature.h
@@ -20,7 +20,7 @@
/**
* This contains extra features that can be added to the scorer. To add a new feature:
* 1. Implement a subclass of ScoreFeature
- * 2. Updated ScoreFeatureManager.configure() to configure your feature, and usage() to
+ * 2. Update ScoreFeatureManager.configure() to configure your feature, and usage() to
* display usage info.
* 3. Write unit tests (see ScoreFeatureTest.cpp) and regression tests
**/
@@ -37,35 +37,37 @@
#include "PhraseAlignment.h"
-namespace MosesTraining
+namespace MosesTraining
{
-struct MaybeLog{
+struct MaybeLog {
MaybeLog(bool useLog, float negativeLog):
m_useLog(useLog), m_negativeLog(negativeLog) {}
-
- inline float operator() (float a) const
- { return m_useLog ? m_negativeLog*log(a) : a; }
+
+ inline float operator() (float a) const {
+ return m_useLog ? m_negativeLog*log(a) : a;
+ }
float m_useLog;
float m_negativeLog;
};
-class ScoreFeatureArgumentException : public util::Exception
+class ScoreFeatureArgumentException : public util::Exception
{
- public:
- ScoreFeatureArgumentException() throw() {*this << "Unable to configure features: ";}
- ~ScoreFeatureArgumentException() throw() {}
+public:
+ ScoreFeatureArgumentException() throw() {
+ *this << "Unable to configure features: ";
+ }
+ ~ScoreFeatureArgumentException() throw() {}
};
/** Passed to each feature to be used to calculate its values */
-struct ScoreFeatureContext
-{
+struct ScoreFeatureContext {
ScoreFeatureContext(
const PhraseAlignmentCollection &thePhrasePair,
float theCount, /* Total counts of all phrase pairs*/
const MaybeLog& theMaybeLog
- ) :
+ ) :
phrasePair(thePhrasePair),
count(theCount),
maybeLog(theMaybeLog)
@@ -82,53 +84,57 @@ struct ScoreFeatureContext
**/
class ScoreFeature
{
- public:
- /** Add the values for this feature function. */
- virtual void add(const ScoreFeatureContext& context,
- std::vector<float>& denseValues,
- std::map<std::string,float>& sparseValues) const = 0;
+public:
+ /** Add the values for this feature function. */
+ virtual void add(const ScoreFeatureContext& context,
+ std::vector<float>& denseValues,
+ std::map<std::string,float>& sparseValues) const = 0;
- /** Return true if the two phrase pairs are equal from the point of this feature. Assume
- that they already compare true according to PhraseAlignment.equals()
- **/
- virtual bool equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const = 0;
+ /** Return true if the two phrase pairs are equal from the point of this feature. Assume
+ that they already compare true according to PhraseAlignment.equals()
+ **/
+ virtual bool equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const = 0;
- virtual ~ScoreFeature() {}
+ virtual ~ScoreFeature() {}
};
typedef boost::shared_ptr<ScoreFeature> ScoreFeaturePtr;
class ScoreFeatureManager
{
- public:
- ScoreFeatureManager():
- m_includeSentenceId(false) {}
-
- /** To be appended to the score usage message */
- const std::string& usage() const;
-
- /** Pass the unused command-line arguments to configure the extra features */
- void configure(const std::vector<std::string> args);
-
- /** Add all the features */
- void addFeatures(const ScoreFeatureContext& context,
- std::vector<float>& denseValues,
- std::map<std::string,float>& sparseValues) const;
-
- /**
- * Used to tell if the PhraseAlignment should be considered the same by all
- * extended features.
- **/
- bool equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const;
-
- const std::vector<ScoreFeaturePtr>& getFeatures() const {return m_features;}
-
- /** Do we need to include sentence ids in phrase pairs? */
- bool includeSentenceId() const {return m_includeSentenceId;}
-
- private:
- std::vector<ScoreFeaturePtr> m_features;
- bool m_includeSentenceId;
+public:
+ ScoreFeatureManager():
+ m_includeSentenceId(false) {}
+
+ /** To be appended to the score usage message */
+ const std::string& usage() const;
+
+ /** Pass the unused command-line arguments to configure the extra features */
+ void configure(const std::vector<std::string> args);
+
+ /** Add all the features */
+ void addFeatures(const ScoreFeatureContext& context,
+ std::vector<float>& denseValues,
+ std::map<std::string,float>& sparseValues) const;
+
+ /**
+ * Used to tell if the PhraseAlignment should be considered the same by all
+ * extended features.
+ **/
+ bool equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const;
+
+ const std::vector<ScoreFeaturePtr>& getFeatures() const {
+ return m_features;
+ }
+
+ /** Do we need to include sentence ids in phrase pairs? */
+ bool includeSentenceId() const {
+ return m_includeSentenceId;
+ }
+
+private:
+ std::vector<ScoreFeaturePtr> m_features;
+ bool m_includeSentenceId;
};
}
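
MaybeLog, restyled above, is the small functor that decides whether scores are emitted raw or as scaled logs; the dense count-bin values of 2.718 used elsewhere in this patch are chosen so that log output comes out near 1. A stand-alone rendering (keeping the original's oddity of declaring the flag as float):

    #include <cmath>
    #include <iostream>

    struct MaybeLogSketch {
      MaybeLogSketch(bool useLog, float negativeLog)
        : m_useLog(useLog), m_negativeLog(negativeLog) {}
      float operator()(float a) const {
        return m_useLog ? m_negativeLog * std::log(a) : a;
      }
      float m_useLog;      // float in the original, though used as a bool
      float m_negativeLog;
    };

    int main() {
      MaybeLogSketch maybeLog(true, 1.0f);
      std::cout << maybeLog(2.718f) << "\n";  // ~1.0, since ln(e) == 1
    }
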
diff --git a/phrase-extract/ScoreFeatureTest.cpp b/phrase-extract/ScoreFeatureTest.cpp
index fecde015a..f4570fe30 100644
--- a/phrase-extract/ScoreFeatureTest.cpp
+++ b/phrase-extract/ScoreFeatureTest.cpp
@@ -31,14 +31,16 @@ using namespace MosesTraining;
using namespace std;
//pesky global variables
-namespace MosesTraining {
- bool hierarchicalFlag = false;
- Vocabulary vcbT;
- Vocabulary vcbS;
+namespace MosesTraining
+{
+bool hierarchicalFlag = false;
+Vocabulary vcbT;
+Vocabulary vcbS;
}
-const char *DomainFileLocation() {
+const char *DomainFileLocation()
+{
if (boost::unit_test::framework::master_test_suite().argc < 2) {
return "test.domain";
}
@@ -62,7 +64,7 @@ BOOST_AUTO_TEST_CASE(manager_configure_domain_except)
template <class Expected>
static void checkDomainConfigured(
- const vector<string>& args)
+ const vector<string>& args)
{
ScoreFeatureManager manager;
manager.configure(args);
@@ -76,17 +78,17 @@ static void checkDomainConfigured(
BOOST_AUTO_TEST_CASE(manager_config_domain)
{
checkDomainConfigured<RatioDomainFeature>
- (boost::assign::list_of ("--DomainRatio")("/dev/null"));
+ (boost::assign::list_of ("--DomainRatio")("/dev/null"));
checkDomainConfigured<IndicatorDomainFeature>
- (boost::assign::list_of("--DomainIndicator")("/dev/null"));
+ (boost::assign::list_of("--DomainIndicator")("/dev/null"));
checkDomainConfigured<SubsetDomainFeature>
- (boost::assign::list_of("--DomainSubset")("/dev/null"));
+ (boost::assign::list_of("--DomainSubset")("/dev/null"));
checkDomainConfigured<SparseRatioDomainFeature>
- (boost::assign::list_of("--SparseDomainRatio")("/dev/null"));
+ (boost::assign::list_of("--SparseDomainRatio")("/dev/null"));
checkDomainConfigured<SparseIndicatorDomainFeature>
- (boost::assign::list_of("--SparseDomainIndicator")("/dev/null"));
+ (boost::assign::list_of("--SparseDomainIndicator")("/dev/null"));
checkDomainConfigured<SparseSubsetDomainFeature>
- (boost::assign::list_of("--SparseDomainSubset")("/dev/null"));
+ (boost::assign::list_of("--SparseDomainSubset")("/dev/null"));
}
@@ -98,8 +100,8 @@ BOOST_AUTO_TEST_CASE(domain_equals)
char buf2[] = "a ||| b ||| 0-0 ||| 2";
char buf3[] = "a ||| b ||| 0-0 ||| 3";
a1.create(buf1, 0, true); //domain a
- a2.create(buf2, 1, true); //domain c
- a3.create(buf3, 2, true); //domain c
+ a2.create(buf2, 1, true); //domain c
+ a3.create(buf3, 2, true); //domain c
BOOST_CHECK(feature.equals(a2,a3));
BOOST_CHECK(!feature.equals(a1,a3));
BOOST_CHECK(!feature.equals(a1,a3));
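
The test bodies above build their argument vectors with boost::assign::list_of: each chained () call appends one element, and the result converts implicitly to std::vector<std::string>. For instance:

    #include <boost/assign/list_of.hpp>
    #include <string>
    #include <vector>

    // Equivalent to two push_back calls, written inline.
    const std::vector<std::string> args =
        boost::assign::list_of("--DomainRatio")("/dev/null");
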
diff --git a/phrase-extract/SentenceAlignment.cpp b/phrase-extract/SentenceAlignment.cpp
index 96ef02865..b2d5520aa 100644
--- a/phrase-extract/SentenceAlignment.cpp
+++ b/phrase-extract/SentenceAlignment.cpp
@@ -94,12 +94,12 @@ bool SentenceAlignment::create( char targetString[], char sourceString[], char a
cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
return false;
}
-
+
if (boundaryRules) {
++s;
++t;
}
-
+
// cout << "alignmentSequence[i] " << alignmentSequence[i] << " is " << s << ", " << t << endl;
if ((size_t)t >= target.size() || (size_t)s >= source.size()) {
cerr << "WARNING: sentence " << sentenceID << " has alignment point (" << s << ", " << t << ") out of bounds (" << source.size() << ", " << target.size() << ")\n";
@@ -109,16 +109,16 @@ bool SentenceAlignment::create( char targetString[], char sourceString[], char a
alignedToT[t].push_back( s );
alignedCountS[s]++;
}
-
+
if (boundaryRules) {
alignedToT[0].push_back(0);
alignedCountS[0]++;
-
+
alignedToT.back().push_back(alignedCountS.size() - 1);
alignedCountS.back()++;
-
+
}
-
+
return true;
}
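
The boundaryRules branches above implement the convention that row/column 0 and the final row/column stand for implicit sentence-boundary tokens: every alignment point read from the string is shifted by one, and the two corner points are pinned afterwards. In miniature (sentence lengths here are placeholders):

    #include <cstdio>
    #include <vector>

    int main() {
      // Suppose |S| = 3 and |T| = 4 real words, plus two boundary slots each.
      std::vector<std::vector<int> > alignedToT(4 + 2);
      std::vector<int> alignedCountS(3 + 2, 0);
      int s = 1, t = 2;            // a point read as "1-2"
      ++s; ++t;                    // slot 0 is reserved for <s>
      alignedToT[t].push_back(s);
      alignedCountS[s]++;
      alignedToT[0].push_back(0);  // pin <s>-<s>
      alignedCountS[0]++;
      alignedToT.back().push_back((int)alignedCountS.size() - 1);  // </s>-</s>
      alignedCountS.back()++;
      std::printf("stored at (%d,%d)\n", s, t);  // (2,3)
    }
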
diff --git a/phrase-extract/SentenceAlignment.h b/phrase-extract/SentenceAlignment.h
index 76cf950d4..e215f5fef 100644
--- a/phrase-extract/SentenceAlignment.h
+++ b/phrase-extract/SentenceAlignment.h
@@ -45,7 +45,7 @@ public:
bool create(char targetString[], char sourceString[],
char alignmentString[], char weightString[], int sentenceID, bool boundaryRules);
-
+
};
}
diff --git a/phrase-extract/XmlTree.cpp b/phrase-extract/XmlTree.cpp
index 05bcefe20..eedb3b260 100644
--- a/phrase-extract/XmlTree.cpp
+++ b/phrase-extract/XmlTree.cpp
@@ -364,7 +364,7 @@ bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &label
string pcfgString = ParseXmlTagAttribute(tagContent,"pcfg");
float pcfgScore = pcfgString == "" ? 0.0f
- : std::atof(pcfgString.c_str());
+ : std::atof(pcfgString.c_str());
// report what we have processed so far
if (0) {
diff --git a/phrase-extract/consolidate-direct-main.cpp b/phrase-extract/consolidate-direct-main.cpp
index e7e68e977..3b38f741c 100644
--- a/phrase-extract/consolidate-direct-main.cpp
+++ b/phrase-extract/consolidate-direct-main.cpp
@@ -89,21 +89,20 @@ int main(int argc, char* argv[])
char* &fileNameConsolidated = argv[2];
ostream *fileConsolidated;
-
- if (strcmp(fileNameConsolidated, "-") == 0) {
- fileConsolidated = &cout;
- }
- else {
+
+ if (strcmp(fileNameConsolidated, "-") == 0) {
+ fileConsolidated = &cout;
+ } else {
Moses::OutputFileStream *outputFile = new Moses::OutputFileStream();
- bool success = outputFile->Open(fileNameConsolidated);
- if (!success) {
- cerr << "ERROR: could not open file phrase table file "
- << fileNameConsolidated << endl;
- exit(1);
- }
- fileConsolidated = outputFile;
- }
-
+ bool success = outputFile->Open(fileNameConsolidated);
+ if (!success) {
+ cerr << "ERROR: could not open file phrase table file "
+ << fileNameConsolidated << endl;
+ exit(1);
+ }
+ fileConsolidated = outputFile;
+ }
+
int i=0;
while(true) {
i++;
@@ -119,8 +118,8 @@ int main(int argc, char* argv[])
// output alignment and probabilities
(*fileConsolidated) << itemDirect[2] // prob direct
- << " 2.718" // phrase count feature
- << " ||| " << itemDirect[3]; // alignment
+ << " 2.718" // phrase count feature
+ << " ||| " << itemDirect[3]; // alignment
// counts
(*fileConsolidated) << "||| 0 " << itemDirect[4]; // indirect
@@ -128,11 +127,11 @@ int main(int argc, char* argv[])
}
- fileConsolidated->flush();
- if (fileConsolidated != &cout) {
- delete fileConsolidated;
- }
-
+ fileConsolidated->flush();
+ if (fileConsolidated != &cout) {
+ delete fileConsolidated;
+ }
+
cerr << "Finished" << endl;
}
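
The restyled block above uses the common "-" convention: write to stdout unless a real path is given. The same shape with a plain std::ofstream standing in for Moses::OutputFileStream, which also sidesteps the delete-if-not-cout cleanup at the end of main:

    #include <cstring>
    #include <fstream>
    #include <iostream>

    int main(int argc, char* argv[]) {
      std::ostream* out = &std::cout;
      std::ofstream file;
      if (argc > 1 && std::strcmp(argv[1], "-") != 0) {
        file.open(argv[1]);
        if (!file) {
          std::cerr << "ERROR: could not open " << argv[1] << std::endl;
          return 1;
        }
        out = &file;
      }
      *out << "consolidated line" << std::endl;
      return 0;  // the stream closes itself; no conditional delete needed
    }
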
diff --git a/phrase-extract/consolidate-main.cpp b/phrase-extract/consolidate-main.cpp
index fd33907de..67a097910 100644
--- a/phrase-extract/consolidate-main.cpp
+++ b/phrase-extract/consolidate-main.cpp
@@ -42,7 +42,10 @@ bool goodTuringFlag = false;
bool kneserNeyFlag = false;
bool logProbFlag = false;
bool outputNTLengths = false;
-inline float maybeLogProb( float a ) { return logProbFlag ? log(a) : a; }
+inline float maybeLogProb( float a )
+{
+ return logProbFlag ? log(a) : a;
+}
char line[LINE_MAX_LENGTH];
void processFiles( char*, char*, char*, char* );
@@ -79,7 +82,7 @@ int main(int argc, char* argv[])
cerr << "not including the phrase count feature\n";
} else if (strcmp(argv[i],"--GoodTuring") == 0) {
goodTuringFlag = true;
- if (i+1==argc) {
+ if (i+1==argc) {
cerr << "ERROR: specify count of count files for Good Turing discounting!\n";
exit(1);
}
@@ -87,7 +90,7 @@ int main(int argc, char* argv[])
cerr << "adjusting phrase translation probabilities with Good Turing discounting\n";
} else if (strcmp(argv[i],"--KneserNey") == 0) {
kneserNeyFlag = true;
- if (i+1==argc) {
+ if (i+1==argc) {
cerr << "ERROR: specify count of count files for Kneser Ney discounting!\n";
exit(1);
}
@@ -105,8 +108,11 @@ int main(int argc, char* argv[])
while(i+1<argc && argv[i+1][0]>='0' && argv[i+1][0]<='9') {
int binCount = atoi(argv[++i]);
countBin.push_back( binCount );
- if (prev+1 == binCount) { cerr << " " << binCount; }
- else { cerr << " " << (prev+1) << "-" << binCount; }
+ if (prev+1 == binCount) {
+ cerr << " " << binCount;
+ } else {
+ cerr << " " << (prev+1) << "-" << binCount;
+ }
prev = binCount;
}
cerr << " " << (prev+1) << "+\n";
@@ -152,7 +158,7 @@ void loadCountOfCounts( char* fileNameCountOfCounts )
if (goodTuringFlag) {
goodTuringDiscount.push_back(0.01); // floor value
for( size_t i=1; i<countOfCounts.size()-1; i++ ) {
- goodTuringDiscount.push_back(((float)i+1)/(float)i*((countOfCounts[i+1]+0.1) / ((float)countOfCounts[i]+0.1)));
+ goodTuringDiscount.push_back(((float)i+1)/(float)i*((countOfCounts[i+1]+0.1) / ((float)countOfCounts[i]+0.1)));
if (goodTuringDiscount[i]>1)
goodTuringDiscount[i] = 1;
if (goodTuringDiscount[i]<goodTuringDiscount[i-1])
@@ -253,21 +259,21 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
float adjustedCountEF_indirect = adjustedCountEF;
// Kneser Ney discounting [Foster et al, 2006]
- if (kneserNeyFlag) {
- float D = kneserNey_D3;
- if (countEF < 2) D = kneserNey_D1;
- else if (countEF < 3) D = kneserNey_D2;
- if (D > countEF) D = countEF - 0.01; // sanity constraint
-
- float p_b_E = n1_E / totalCount; // target phrase prob based on distinct
- float alpha_F = D * n1_F / countF; // available mass
- adjustedCountEF = countEF - D + countF * alpha_F * p_b_E;
-
- // for indirect
- float p_b_F = n1_F / totalCount; // target phrase prob based on distinct
- float alpha_E = D * n1_E / countE; // available mass
- adjustedCountEF_indirect = countEF - D + countE * alpha_E * p_b_F;
- }
+ if (kneserNeyFlag) {
+ float D = kneserNey_D3;
+ if (countEF < 2) D = kneserNey_D1;
+ else if (countEF < 3) D = kneserNey_D2;
+ if (D > countEF) D = countEF - 0.01; // sanity constraint
+
+ float p_b_E = n1_E / totalCount; // target phrase prob based on distinct
+ float alpha_F = D * n1_F / countF; // available mass
+ adjustedCountEF = countEF - D + countF * alpha_F * p_b_E;
+
+ // for indirect
+ float p_b_F = n1_F / totalCount; // target phrase prob based on distinct
+ float alpha_E = D * n1_E / countE; // available mass
+ adjustedCountEF_indirect = countEF - D + countE * alpha_E * p_b_F;
+ }
// prob indirect
if (!onlyDirectFlag) {
@@ -296,30 +302,27 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
if (!foundBin && countEF <= countBin[i]) {
fileConsolidated << " " << maybeLogProb(2.718);
foundBin = true;
- }
- else {
+ } else {
fileConsolidated << " " << maybeLogProb(1);
}
}
- fileConsolidated << " " << maybeLogProb( foundBin ? 1 : 2.718 );
+ fileConsolidated << " " << maybeLogProb( foundBin ? 1 : 2.718 );
}
// alignment
fileConsolidated << " ||| " << itemDirect[3];
// counts, for debugging
- fileConsolidated << "||| " << countE << " " << countF << " " << countEF;
+ fileConsolidated << "||| " << countE << " " << countF << " " << countEF;
- if (outputNTLengths)
- {
+ if (outputNTLengths) {
fileConsolidated << " ||| " << itemDirect[5];
}
-
+
// count bin feature (as a sparse feature)
- if (sparseCountBinFeatureFlag ||
- directSparseScores.compare("") != 0 ||
- indirectSparseScores.compare("") != 0)
- {
+ if (sparseCountBinFeatureFlag ||
+ directSparseScores.compare("") != 0 ||
+ indirectSparseScores.compare("") != 0) {
fileConsolidated << " |||";
if (directSparseScores.compare("") != 0)
fileConsolidated << " " << directSparseScores;
@@ -351,13 +354,13 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
fileConsolidated.Close();
}
-void breakdownCoreAndSparse( string combined, string &core, string &sparse )
+void breakdownCoreAndSparse( string combined, string &core, string &sparse )
{
core = "";
sparse = "";
vector<string> score = tokenize( combined.c_str() );
for(size_t i=0; i<score.size(); i++) {
- if ((score[i][0] >= '0' && score[i][0] <= '9') || i+1 == score.size())
+ if ((score[i][0] >= '0' && score[i][0] <= '9') || i+1 == score.size())
core += " " + score[i];
else {
sparse += " " + score[i];
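
The re-indented Kneser-Ney block is the one algorithmic centre of this file: a count-dependent discount D is subtracted from each pair count and the freed mass is redistributed through the "distinct translation" probabilities [Foster et al., 2006]. Extracted as a stand-alone function (the sample values below are placeholders; Moses derives kneserNey_D1..D3 from count-of-counts):

    #include <cstdio>

    float adjustedCount(float countEF, float countF, float n1_E, float n1_F,
                        float totalCount, float D1, float D2, float D3) {
      float D = D3;
      if (countEF < 2) D = D1;
      else if (countEF < 3) D = D2;
      if (D > countEF) D = countEF - 0.01f;  // sanity constraint
      float p_b_E = n1_E / totalCount;       // target prob based on distinct
      float alpha_F = D * n1_F / countF;     // available mass
      return countEF - D + countF * alpha_F * p_b_E;
    }

    int main() {
      std::printf("%f\n",
                  adjustedCount(5, 10, 1000, 8, 100000, 0.4f, 0.9f, 1.2f));
    }
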
diff --git a/phrase-extract/consolidate-reverse-main.cpp b/phrase-extract/consolidate-reverse-main.cpp
index c86d870c8..6843bf3aa 100644
--- a/phrase-extract/consolidate-reverse-main.cpp
+++ b/phrase-extract/consolidate-reverse-main.cpp
@@ -229,13 +229,12 @@ string reverseAlignment(const string &alignments)
vector<string> alignToks = tokenize(alignments.c_str());
- for (size_t i = 0; i < alignToks.size(); ++i)
- {
+ for (size_t i = 0; i < alignToks.size(); ++i) {
string &alignPair = alignToks[i];
vector<string> alignPoints;
Tokenize(alignPoints, alignPair, "-");
assert(alignPoints.size() == 2);
-
+
ret << alignPoints[1] << "-" << alignPoints[0] << " ";
}
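
reverseAlignment(), reformatted above, simply swaps each "s-t" token; a compact equivalent (assuming well-formed tokens, as the assert in the original does):

    #include <iostream>
    #include <sstream>
    #include <string>

    std::string reverseAlignmentSketch(const std::string& alignments) {
      std::istringstream in(alignments);
      std::ostringstream ret;
      std::string tok;
      while (in >> tok) {
        std::string::size_type dash = tok.find('-');
        ret << tok.substr(dash + 1) << "-" << tok.substr(0, dash) << " ";
      }
      return ret.str();
    }

    int main() {
      std::cout << reverseAlignmentSketch("0-1 2-0 3-3") << "\n";  // 1-0 0-2 3-3
    }
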
diff --git a/phrase-extract/domain.cpp b/phrase-extract/domain.cpp
index 29ba8ee64..67b4a13c3 100644
--- a/phrase-extract/domain.cpp
+++ b/phrase-extract/domain.cpp
@@ -13,7 +13,8 @@ namespace MosesTraining
{
// handling of domain names: load database with sentence-id / domain name info
-void Domain::load( const std::string &domainFileName ) {
+void Domain::load( const std::string &domainFileName )
+{
Moses::InputFileStream fileS( domainFileName );
istream *fileP = &fileS;
while(true) {
@@ -39,7 +40,8 @@ void Domain::load( const std::string &domainFileName ) {
}
// get domain name based on sentence number
-string Domain::getDomainOfSentence( int sentenceId ) const {
+string Domain::getDomainOfSentence( int sentenceId ) const
+{
for(size_t i=0; i<spec.size(); i++) {
if (sentenceId <= spec[i].first) {
return spec[i].second;
@@ -54,9 +56,9 @@ DomainFeature::DomainFeature(const string& domainFile)
m_domain.load(domainFile);
}
-void DomainFeature::add(const ScoreFeatureContext& context,
- std::vector<float>& denseValues,
- std::map<std::string,float>& sparseValues) const
+void DomainFeature::add(const ScoreFeatureContext& context,
+ std::vector<float>& denseValues,
+ std::map<std::string,float>& sparseValues) const
{
map< string, float > domainCount;
for(size_t i=0; i<context.phrasePair.size(); i++) {
@@ -71,13 +73,13 @@ void DomainFeature::add(const ScoreFeatureContext& context,
}
void SubsetDomainFeature::add(const map<string,float>& domainCount,float count,
- const MaybeLog& maybeLog,
- std::vector<float>& denseValues,
- std::map<std::string,float>& sparseValues) const
+ const MaybeLog& maybeLog,
+ std::vector<float>& denseValues,
+ std::map<std::string,float>& sparseValues) const
{
- if (m_domain.list.size() > 6) {
+ if (m_domain.list.size() > 6) {
UTIL_THROW_IF(m_domain.list.size() > 6, ScoreFeatureArgumentException,
- "too many domains for core domain subset features");
+ "too many domains for core domain subset features");
}
size_t bitmap = 0;
for(size_t bit = 0; bit < m_domain.list.size(); bit++) {
@@ -87,13 +89,13 @@ void SubsetDomainFeature::add(const map<string,float>& domainCount,float count,
}
for(size_t i = 1; i < (1 << m_domain.list.size()); i++) {
denseValues.push_back(maybeLog( (bitmap == i) ? 2.718 : 1 ));
- }
+ }
}
void SparseSubsetDomainFeature::add(const map<string,float>& domainCount,float count,
- const MaybeLog& maybeLog,
- std::vector<float>& denseValues,
- std::map<std::string,float>& sparseValues) const
+ const MaybeLog& maybeLog,
+ std::vector<float>& denseValues,
+ std::map<std::string,float>& sparseValues) const
{
typedef vector<string>::const_iterator I;
ostringstream key;
@@ -108,9 +110,9 @@ void SparseSubsetDomainFeature::add(const map<string,float>& domainCount,float c
void RatioDomainFeature::add(const map<string,float>& domainCount,float count,
- const MaybeLog& maybeLog,
- std::vector<float>& denseValues,
- std::map<std::string,float>& sparseValues) const
+ const MaybeLog& maybeLog,
+ std::vector<float>& denseValues,
+ std::map<std::string,float>& sparseValues) const
{
typedef vector< string >::const_iterator I;
for (I i = m_domain.list.begin(); i != m_domain.list.end(); i++ ) {
@@ -125,9 +127,9 @@ void RatioDomainFeature::add(const map<string,float>& domainCount,float count,
void SparseRatioDomainFeature::add(const map<string,float>& domainCount,float count,
- const MaybeLog& maybeLog,
- std::vector<float>& denseValues,
- std::map<std::string,float>& sparseValues) const
+ const MaybeLog& maybeLog,
+ std::vector<float>& denseValues,
+ std::map<std::string,float>& sparseValues) const
{
typedef map< string, float >::const_iterator I;
for (I i=domainCount.begin(); i != domainCount.end(); i++) {
@@ -137,9 +139,9 @@ void SparseRatioDomainFeature::add(const map<string,float>& domainCount,float co
void IndicatorDomainFeature::add(const map<string,float>& domainCount,float count,
- const MaybeLog& maybeLog,
- std::vector<float>& denseValues,
- std::map<std::string,float>& sparseValues) const
+ const MaybeLog& maybeLog,
+ std::vector<float>& denseValues,
+ std::map<std::string,float>& sparseValues) const
{
typedef vector< string >::const_iterator I;
for (I i = m_domain.list.begin(); i != m_domain.list.end(); i++ ) {
@@ -154,20 +156,20 @@ void IndicatorDomainFeature::add(const map<string,float>& domainCount,float coun
}
void SparseIndicatorDomainFeature::add(const map<string,float>& domainCount,float count,
- const MaybeLog& maybeLog,
- std::vector<float>& denseValues,
- std::map<std::string,float>& sparseValues) const
+ const MaybeLog& maybeLog,
+ std::vector<float>& denseValues,
+ std::map<std::string,float>& sparseValues) const
{
typedef map< string, float >::const_iterator I;
for (I i=domainCount.begin(); i != domainCount.end(); i++) {
- sparseValues["dom_" + i->first] = 1;
+ sparseValues["dom_" + i->first] = 1;
}
}
-bool DomainFeature::equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const
+bool DomainFeature::equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const
{
return m_domain.getDomainOfSentence(lhs.sentenceId) ==
- m_domain.getDomainOfSentence( rhs.sentenceId);
+ m_domain.getDomainOfSentence( rhs.sentenceId);
}
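
getDomainOfSentence(), restyled above, scans the (lastSentenceId, name) pairs loaded by Domain::load() and returns the first entry whose id bound covers the query. The hunk ends before the function's final return, so the fallback in the sketch below is an assumption. In miniature:

    #include <cstddef>
    #include <iostream>
    #include <string>
    #include <utility>
    #include <vector>

    std::string domainOf(const std::vector<std::pair<int, std::string> >& spec,
                         int sentenceId) {
      for (std::size_t i = 0; i < spec.size(); i++)
        if (sentenceId <= spec[i].first) return spec[i].second;
      return spec.back().second;  // assumed fallback: stay in the last domain
    }

    int main() {
      std::vector<std::pair<int, std::string> > spec;
      spec.push_back(std::make_pair(999, "news"));
      spec.push_back(std::make_pair(1999, "web"));
      std::cout << domainOf(spec, 1500) << "\n";  // web
    }
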
diff --git a/phrase-extract/domain.h b/phrase-extract/domain.h
index f3e1e92a3..279496e01 100644
--- a/phrase-extract/domain.h
+++ b/phrase-extract/domain.h
@@ -31,106 +31,106 @@ public:
class DomainFeature : public ScoreFeature
{
- public:
+public:
+
+ DomainFeature(const std::string& domainFile);
+ bool equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const;
+ void add(const ScoreFeatureContext& context,
+ std::vector<float>& denseValues,
+ std::map<std::string,float>& sparseValues) const;
- DomainFeature(const std::string& domainFile);
- bool equals(const PhraseAlignment& lhs, const PhraseAlignment& rhs) const;
- void add(const ScoreFeatureContext& context,
- std::vector<float>& denseValues,
- std::map<std::string,float>& sparseValues) const;
+protected:
+ /** Overridden in subclass */
+ virtual void add(const std::map<std::string,float>& domainCounts, float count,
+ const MaybeLog& maybeLog,
+ std::vector<float>& denseValues,
+ std::map<std::string,float>& sparseValues) const = 0;
- protected:
- /** Overriden in subclass */
- virtual void add(const std::map<std::string,float>& domainCounts, float count,
- const MaybeLog& maybeLog,
- std::vector<float>& denseValues,
- std::map<std::string,float>& sparseValues) const = 0;
-
- Domain m_domain;
+ Domain m_domain;
};
class SubsetDomainFeature : public DomainFeature
{
- public:
- SubsetDomainFeature(const std::string& domainFile) :
- DomainFeature(domainFile) {}
-
- protected:
- virtual void add(const std::map<std::string,float>& domainCounts, float count,
- const MaybeLog& maybeLog,
- std::vector<float>& denseValues,
- std::map<std::string,float>& sparseValues) const;
+public:
+ SubsetDomainFeature(const std::string& domainFile) :
+ DomainFeature(domainFile) {}
+
+protected:
+ virtual void add(const std::map<std::string,float>& domainCounts, float count,
+ const MaybeLog& maybeLog,
+ std::vector<float>& denseValues,
+ std::map<std::string,float>& sparseValues) const;
};
class SparseSubsetDomainFeature : public DomainFeature
{
- public:
- SparseSubsetDomainFeature(const std::string& domainFile) :
- DomainFeature(domainFile) {}
+public:
+ SparseSubsetDomainFeature(const std::string& domainFile) :
+ DomainFeature(domainFile) {}
- protected:
- virtual void add(const std::map<std::string,float>& domainCounts, float count,
- const MaybeLog& maybeLog,
- std::vector<float>& denseValues,
- std::map<std::string,float>& sparseValues) const;
+protected:
+ virtual void add(const std::map<std::string,float>& domainCounts, float count,
+ const MaybeLog& maybeLog,
+ std::vector<float>& denseValues,
+ std::map<std::string,float>& sparseValues) const;
};
class IndicatorDomainFeature : public DomainFeature
{
- public:
- IndicatorDomainFeature(const std::string& domainFile) :
- DomainFeature(domainFile) {}
-
- protected:
- virtual void add(const std::map<std::string,float>& domainCounts, float count,
- const MaybeLog& maybeLog,
- std::vector<float>& denseValues,
- std::map<std::string,float>& sparseValues) const;
+public:
+ IndicatorDomainFeature(const std::string& domainFile) :
+ DomainFeature(domainFile) {}
+
+protected:
+ virtual void add(const std::map<std::string,float>& domainCounts, float count,
+ const MaybeLog& maybeLog,
+ std::vector<float>& denseValues,
+ std::map<std::string,float>& sparseValues) const;
};
class SparseIndicatorDomainFeature : public DomainFeature
{
- public:
- SparseIndicatorDomainFeature(const std::string& domainFile) :
- DomainFeature(domainFile) {}
-
- protected:
- virtual void add(const std::map<std::string,float>& domainCounts, float count,
- const MaybeLog& maybeLog,
- std::vector<float>& denseValues,
- std::map<std::string,float>& sparseValues) const;
+public:
+ SparseIndicatorDomainFeature(const std::string& domainFile) :
+ DomainFeature(domainFile) {}
+
+protected:
+ virtual void add(const std::map<std::string,float>& domainCounts, float count,
+ const MaybeLog& maybeLog,
+ std::vector<float>& denseValues,
+ std::map<std::string,float>& sparseValues) const;
};
class RatioDomainFeature : public DomainFeature
{
- public:
- RatioDomainFeature(const std::string& domainFile) :
- DomainFeature(domainFile) {}
-
- protected:
- virtual void add(const std::map<std::string,float>& domainCounts, float count,
- const MaybeLog& maybeLog,
- std::vector<float>& denseValues,
- std::map<std::string,float>& sparseValues) const;
+public:
+ RatioDomainFeature(const std::string& domainFile) :
+ DomainFeature(domainFile) {}
+
+protected:
+ virtual void add(const std::map<std::string,float>& domainCounts, float count,
+ const MaybeLog& maybeLog,
+ std::vector<float>& denseValues,
+ std::map<std::string,float>& sparseValues) const;
};
class SparseRatioDomainFeature : public DomainFeature
{
- public:
- SparseRatioDomainFeature(const std::string& domainFile) :
- DomainFeature(domainFile) {}
-
- protected:
- virtual void add(const std::map<std::string,float>& domainCounts, float count,
- const MaybeLog& maybeLog,
- std::vector<float>& denseValues,
- std::map<std::string,float>& sparseValues) const;
+public:
+ SparseRatioDomainFeature(const std::string& domainFile) :
+ DomainFeature(domainFile) {}
+
+protected:
+ virtual void add(const std::map<std::string,float>& domainCounts, float count,
+ const MaybeLog& maybeLog,
+ std::vector<float>& denseValues,
+ std::map<std::string,float>& sparseValues) const;
};
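
All six classes above follow one template-method shape: DomainFeature's public add() aggregates per-domain counts, then defers to the protected virtual add() that each Subset/Ratio/Indicator variant (dense or sparse) overrides. A skeletal version with hypothetical names:

    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    class FeatureBase {
    public:
      void add(std::vector<float>& dense) const {
        std::map<std::string, float> domainCount;
        domainCount["news"] = 3;  // would really come from the phrase pair
        addImpl(domainCount, dense);
      }
      virtual ~FeatureBase() {}
    protected:
      virtual void addImpl(const std::map<std::string, float>& domainCount,
                           std::vector<float>& dense) const = 0;
    };

    class IndicatorFeatureSketch : public FeatureBase {
    protected:
      virtual void addImpl(const std::map<std::string, float>& domainCount,
                           std::vector<float>& dense) const {
        dense.push_back(domainCount.count("news") ? 2.718f : 1.0f);
      }
    };

    int main() {
      IndicatorFeatureSketch f;
      std::vector<float> dense;
      f.add(dense);
      std::cout << dense[0] << "\n";  // 2.718
    }
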
diff --git a/phrase-extract/extract-ghkm/Alignment.cpp b/phrase-extract/extract-ghkm/Alignment.cpp
index fcd5e14e1..744b4b1a2 100644
--- a/phrase-extract/extract-ghkm/Alignment.cpp
+++ b/phrase-extract/extract-ghkm/Alignment.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -24,8 +24,10 @@
#include <cassert>
#include <cstdlib>
-namespace Moses {
-namespace GHKM {
+namespace Moses
+{
+namespace GHKM
+{
Alignment ReadAlignment(const std::string &s)
{
diff --git a/phrase-extract/extract-ghkm/Alignment.h b/phrase-extract/extract-ghkm/Alignment.h
index bc42191e1..051d5ca92 100644
--- a/phrase-extract/extract-ghkm/Alignment.h
+++ b/phrase-extract/extract-ghkm/Alignment.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -25,8 +25,10 @@
#include <utility>
#include <vector>
-namespace Moses {
-namespace GHKM {
+namespace Moses
+{
+namespace GHKM
+{
typedef std::vector<std::pair<int, int> > Alignment;
diff --git a/phrase-extract/extract-ghkm/AlignmentGraph.cpp b/phrase-extract/extract-ghkm/AlignmentGraph.cpp
index 6bd32a13b..974188dbd 100644
--- a/phrase-extract/extract-ghkm/AlignmentGraph.cpp
+++ b/phrase-extract/extract-ghkm/AlignmentGraph.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -30,8 +30,10 @@
#include <memory>
#include <stack>
-namespace Moses {
-namespace GHKM {
+namespace Moses
+{
+namespace GHKM
+{
AlignmentGraph::AlignmentGraph(const ParseTree *t,
const std::vector<std::string> &s,
@@ -84,8 +86,8 @@ AlignmentGraph::~AlignmentGraph()
}
Subgraph AlignmentGraph::ComputeMinimalFrontierGraphFragment(
- Node *root,
- const std::set<Node *> &frontierSet)
+ Node *root,
+ const std::set<Node *> &frontierSet)
{
std::stack<Node *> expandableNodes;
std::set<const Node *> expandedNodes;
@@ -302,7 +304,7 @@ void AlignmentGraph::CalcComplementSpans(Node *root)
}
void AlignmentGraph::GetTargetTreeLeaves(Node *root,
- std::vector<Node *> &leaves)
+ std::vector<Node *> &leaves)
{
if (root->IsSink()) {
leaves.push_back(root);
diff --git a/phrase-extract/extract-ghkm/AlignmentGraph.h b/phrase-extract/extract-ghkm/AlignmentGraph.h
index 94948758a..cf26b8c27 100644
--- a/phrase-extract/extract-ghkm/AlignmentGraph.h
+++ b/phrase-extract/extract-ghkm/AlignmentGraph.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -28,8 +28,10 @@
#include <string>
#include <vector>
-namespace Moses {
-namespace GHKM {
+namespace Moses
+{
+namespace GHKM
+{
class Node;
class ParseTree;
@@ -37,20 +39,24 @@ class Subgraph;
class AlignmentGraph
{
- public:
+public:
AlignmentGraph(const ParseTree *,
const std::vector<std::string> &,
const Alignment &);
~AlignmentGraph();
- Node *GetRoot() { return m_root; }
- const std::vector<Node *> &GetTargetNodes() { return m_targetNodes; }
+ Node *GetRoot() {
+ return m_root;
+ }
+ const std::vector<Node *> &GetTargetNodes() {
+ return m_targetNodes;
+ }
void ExtractMinimalRules(const Options &);
void ExtractComposedRules(const Options &);
- private:
+private:
// Disallow copying
AlignmentGraph(const AlignmentGraph &);
AlignmentGraph &operator=(const AlignmentGraph &);
@@ -58,11 +64,11 @@ class AlignmentGraph
Node *CopyParseTree(const ParseTree *);
void ComputeFrontierSet(Node *, const Options &, std::set<Node *> &) const;
void CalcComplementSpans(Node *);
- void GetTargetTreeLeaves(Node *, std::vector<Node *> &);
+ void GetTargetTreeLeaves(Node *, std::vector<Node *> &);
void AttachUnalignedSourceWords();
Node *DetermineAttachmentPoint(int);
Subgraph ComputeMinimalFrontierGraphFragment(Node *,
- const std::set<Node *> &);
+ const std::set<Node *> &);
void ExtractComposedRules(Node *, const Options &);
Node *m_root;
diff --git a/phrase-extract/extract-ghkm/ComposedRule.cpp b/phrase-extract/extract-ghkm/ComposedRule.cpp
index 8bf3cfc72..e9fc826b7 100644
--- a/phrase-extract/extract-ghkm/ComposedRule.cpp
+++ b/phrase-extract/extract-ghkm/ComposedRule.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -27,14 +27,16 @@
#include <vector>
#include <queue>
-namespace Moses {
-namespace GHKM {
+namespace Moses
+{
+namespace GHKM
+{
ComposedRule::ComposedRule(const Subgraph &baseRule)
- : m_baseRule(baseRule)
- , m_depth(baseRule.GetDepth())
- , m_size(baseRule.GetSize())
- , m_nodeCount(baseRule.GetNodeCount())
+ : m_baseRule(baseRule)
+ , m_depth(baseRule.GetDepth())
+ , m_size(baseRule.GetSize())
+ , m_nodeCount(baseRule.GetNodeCount())
{
const std::set<const Node *> &leaves = baseRule.GetLeaves();
for (std::set<const Node *>::const_iterator p = leaves.begin();
@@ -47,12 +49,12 @@ ComposedRule::ComposedRule(const Subgraph &baseRule)
ComposedRule::ComposedRule(const ComposedRule &other, const Subgraph &rule,
int depth)
- : m_baseRule(other.m_baseRule)
- , m_attachedRules(other.m_attachedRules)
- , m_openAttachmentPoints(other.m_openAttachmentPoints)
- , m_depth(depth)
- , m_size(other.m_size+rule.GetSize())
- , m_nodeCount(other.m_nodeCount+rule.GetNodeCount()-1)
+ : m_baseRule(other.m_baseRule)
+ , m_attachedRules(other.m_attachedRules)
+ , m_openAttachmentPoints(other.m_openAttachmentPoints)
+ , m_depth(depth)
+ , m_size(other.m_size+rule.GetSize())
+ , m_nodeCount(other.m_nodeCount+rule.GetNodeCount()-1)
{
m_attachedRules.push_back(&rule);
m_openAttachmentPoints.pop();
@@ -71,7 +73,7 @@ void ComposedRule::CloseAttachmentPoint()
}
ComposedRule *ComposedRule::AttemptComposition(const Subgraph &rule,
- const Options &options) const
+ const Options &options) const
{
// The smallest possible rule fragment should be rooted at a tree node.
// Note that this differs from the original GHKM definition.
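
The reflowed initializer lists make the composition bookkeeping easier to follow: attaching a rule grows the composed size by the attached rule's size, but the node count by one less, because the attachment point is shared. As arithmetic:

    #include <iostream>

    struct RuleStats { int depth, size, nodeCount; };

    RuleStats compose(const RuleStats& base, const RuleStats& attached,
                      int newDepth) {
      RuleStats r;
      r.depth = newDepth;
      r.size = base.size + attached.size;
      r.nodeCount = base.nodeCount + attached.nodeCount - 1;  // shared node
      return r;
    }

    int main() {
      RuleStats base = {1, 2, 3}, attached = {1, 1, 2};
      RuleStats c = compose(base, attached, 2);
      std::cout << c.size << " " << c.nodeCount << "\n";  // 3 4
    }
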
diff --git a/phrase-extract/extract-ghkm/ComposedRule.h b/phrase-extract/extract-ghkm/ComposedRule.h
index 65ce9ac70..b5f72a492 100644
--- a/phrase-extract/extract-ghkm/ComposedRule.h
+++ b/phrase-extract/extract-ghkm/ComposedRule.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -26,15 +26,17 @@
#include <vector>
#include <queue>
-namespace Moses {
-namespace GHKM {
+namespace Moses
+{
+namespace GHKM
+{
class Node;
struct Options;
class ComposedRule
{
- public:
+public:
// Form a 'trivial' ComposedRule from a single existing rule.
ComposedRule(const Subgraph &baseRule);
@@ -53,7 +55,7 @@ class ComposedRule
// Constructs a Subgraph object corresponding to the composed rule.
Subgraph CreateSubgraph();
- private:
+private:
ComposedRule(const ComposedRule &, const Subgraph &, int);
const Subgraph &m_baseRule;
diff --git a/phrase-extract/extract-ghkm/Exception.h b/phrase-extract/extract-ghkm/Exception.h
index 9928785f0..a1e623cd1 100644
--- a/phrase-extract/extract-ghkm/Exception.h
+++ b/phrase-extract/extract-ghkm/Exception.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -23,16 +23,20 @@
#include <string>
-namespace Moses {
-namespace GHKM {
+namespace Moses
+{
+namespace GHKM
+{
class Exception
{
- public:
+public:
Exception(const char *msg) : m_msg(msg) {}
Exception(const std::string &msg) : m_msg(msg) {}
- const std::string &GetMsg() const { return m_msg; }
- private:
+ const std::string &GetMsg() const {
+ return m_msg;
+ }
+private:
std::string m_msg;
};
diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp
index e3b52943c..80568ccd5 100644
--- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp
+++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -43,8 +43,10 @@
#include <sstream>
#include <vector>
-namespace Moses {
-namespace GHKM {
+namespace Moses
+{
+namespace GHKM
+{
int ExtractGHKM::Main(int argc, char *argv[])
{
@@ -107,7 +109,7 @@ int ExtractGHKM::Main(int argc, char *argv[])
++lineNum;
// Parse target tree.
- if (targetLine.size() == 0) {
+ if (targetLine.size() == 0) {
std::cerr << "skipping line " << lineNum << " with empty target tree\n";
continue;
}
@@ -263,64 +265,64 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
// Declare the command line options that are visible to the user.
po::options_description visible(usageTop.str());
visible.add_options()
- //("help", "print this help message and exit")
- ("AllowUnary",
- "allow fully non-lexical unary rules")
- ("ConditionOnTargetLHS",
- "write target LHS instead of \"X\" as source LHS")
- ("GlueGrammar",
- po::value(&options.glueGrammarFile),
- "write glue grammar to named file")
- ("GZOutput",
- "write gzipped extract files")
- ("MaxNodes",
- po::value(&options.maxNodes)->default_value(options.maxNodes),
- "set maximum number of tree nodes for composed rules")
- ("MaxRuleDepth",
- po::value(&options.maxRuleDepth)->default_value(options.maxRuleDepth),
- "set maximum depth for composed rules")
- ("MaxRuleSize",
- po::value(&options.maxRuleSize)->default_value(options.maxRuleSize),
- "set maximum size for composed rules")
- ("MaxScope",
- po::value(&options.maxScope)->default_value(options.maxScope),
- "set maximum allowed scope")
- ("Minimal",
- "extract minimal rules only")
- ("PCFG",
- "include score based on PCFG scores in target corpus")
- ("SentenceOffset",
- po::value(&options.sentenceOffset)->default_value(options.sentenceOffset),
- "set sentence number offset if processing split corpus")
- ("UnknownWordLabel",
- po::value(&options.unknownWordFile),
- "write unknown word labels to named file")
- ("UnknownWordMinRelFreq",
- po::value(&options.unknownWordMinRelFreq)->default_value(
- options.unknownWordMinRelFreq),
- "set minimum relative frequency for unknown word labels")
- ("UnknownWordUniform",
- "write uniform weights to unknown word label file")
- ("UnpairedExtractFormat",
- "do not pair non-terminals in extract files")
+ //("help", "print this help message and exit")
+ ("AllowUnary",
+ "allow fully non-lexical unary rules")
+ ("ConditionOnTargetLHS",
+ "write target LHS instead of \"X\" as source LHS")
+ ("GlueGrammar",
+ po::value(&options.glueGrammarFile),
+ "write glue grammar to named file")
+ ("GZOutput",
+ "write gzipped extract files")
+ ("MaxNodes",
+ po::value(&options.maxNodes)->default_value(options.maxNodes),
+ "set maximum number of tree nodes for composed rules")
+ ("MaxRuleDepth",
+ po::value(&options.maxRuleDepth)->default_value(options.maxRuleDepth),
+ "set maximum depth for composed rules")
+ ("MaxRuleSize",
+ po::value(&options.maxRuleSize)->default_value(options.maxRuleSize),
+ "set maximum size for composed rules")
+ ("MaxScope",
+ po::value(&options.maxScope)->default_value(options.maxScope),
+ "set maximum allowed scope")
+ ("Minimal",
+ "extract minimal rules only")
+ ("PCFG",
+ "include score based on PCFG scores in target corpus")
+ ("SentenceOffset",
+ po::value(&options.sentenceOffset)->default_value(options.sentenceOffset),
+ "set sentence number offset if processing split corpus")
+ ("UnknownWordLabel",
+ po::value(&options.unknownWordFile),
+ "write unknown word labels to named file")
+ ("UnknownWordMinRelFreq",
+ po::value(&options.unknownWordMinRelFreq)->default_value(
+ options.unknownWordMinRelFreq),
+ "set minimum relative frequency for unknown word labels")
+ ("UnknownWordUniform",
+ "write uniform weights to unknown word label file")
+ ("UnpairedExtractFormat",
+ "do not pair non-terminals in extract files")
;
// Declare the command line options that are hidden from the user
// (these are used as positional options).
po::options_description hidden("Hidden options");
hidden.add_options()
- ("TargetFile",
- po::value(&options.targetFile),
- "target file")
- ("SourceFile",
- po::value(&options.sourceFile),
- "source file")
- ("AlignmentFile",
- po::value(&options.alignmentFile),
- "alignment file")
- ("ExtractFile",
- po::value(&options.extractFile),
- "extract file")
+ ("TargetFile",
+ po::value(&options.targetFile),
+ "target file")
+ ("SourceFile",
+ po::value(&options.sourceFile),
+ "source file")
+ ("AlignmentFile",
+ po::value(&options.alignmentFile),
+ "alignment file")
+ ("ExtractFile",
+ po::value(&options.extractFile),
+ "extract file")
;
// Compose the full set of command-line options.
@@ -337,8 +339,8 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
// Process the command-line.
po::variables_map vm;
const int optionStyle = cls::allow_long
- | cls::long_allow_adjacent
- | cls::long_allow_next;
+ | cls::long_allow_adjacent
+ | cls::long_allow_next;
try {
po::store(po::command_line_parser(argc, argv).style(optionStyle).
options(cmdLineOptions).positional(p).run(), vm);
@@ -424,9 +426,9 @@ std::vector<std::string> ExtractGHKM::ReadTokens(const std::string &s)
}
void ExtractGHKM::WriteGlueGrammar(
- const std::set<std::string> &labelSet,
- const std::map<std::string, int> &topLabelSet,
- std::ostream &out)
+ const std::set<std::string> &labelSet,
+ const std::map<std::string, int> &topLabelSet,
+ std::ostream &out)
{
// choose a top label that is not already a label
std::string topLabel = "QQQQQQ";
@@ -457,10 +459,10 @@ void ExtractGHKM::WriteGlueGrammar(
}
void ExtractGHKM::CollectWordLabelCounts(
- ParseTree &root,
- const Options &options,
- std::map<std::string, int> &wordCount,
- std::map<std::string, std::string> &wordLabel)
+ ParseTree &root,
+ const Options &options,
+ std::map<std::string, int> &wordCount,
+ std::map<std::string, std::string> &wordLabel)
{
std::vector<const ParseTree*> leaves;
root.GetLeaves(std::back_inserter(leaves));
@@ -486,10 +488,10 @@ void ExtractGHKM::CollectWordLabelCounts(
}
void ExtractGHKM::WriteUnknownWordLabel(
- const std::map<std::string, int> &wordCount,
- const std::map<std::string, std::string> &wordLabel,
- const Options &options,
- std::ostream &out)
+ const std::map<std::string, int> &wordCount,
+ const std::map<std::string, std::string> &wordLabel,
+ const Options &options,
+ std::ostream &out)
{
std::map<std::string, int> labelCount;
int total = 0;
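
ProcessOptions(), whose option tables are re-indented above, is standard boost::program_options layering: a visible description for the usage text, a hidden one bound to positional arguments, and a relaxed long-option parser style. The same skeleton, reduced (option names mirror the hunk; everything else is illustrative):

    #include <boost/program_options.hpp>
    #include <iostream>
    #include <string>

    namespace po = boost::program_options;
    namespace cls = boost::program_options::command_line_style;

    int main(int argc, char* argv[]) {
      std::string targetFile;
      int maxRuleDepth = 5;

      po::options_description visible("Allowed options");
      visible.add_options()
      ("MaxRuleDepth", po::value(&maxRuleDepth)->default_value(maxRuleDepth),
       "set maximum depth for composed rules");

      po::options_description hidden("Hidden options");
      hidden.add_options()
      ("TargetFile", po::value(&targetFile), "target file");

      po::options_description all;
      all.add(visible).add(hidden);
      po::positional_options_description p;
      p.add("TargetFile", 1);

      const int style = cls::allow_long | cls::long_allow_adjacent
                        | cls::long_allow_next;
      po::variables_map vm;
      po::store(po::command_line_parser(argc, argv).style(style)
                .options(all).positional(p).run(), vm);
      po::notify(vm);
      std::cout << "target=" << targetFile << " depth=" << maxRuleDepth << "\n";
    }
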
diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.h b/phrase-extract/extract-ghkm/ExtractGHKM.h
index 6519bf675..c78aea109 100644
--- a/phrase-extract/extract-ghkm/ExtractGHKM.h
+++ b/phrase-extract/extract-ghkm/ExtractGHKM.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -27,22 +27,26 @@
#include <string>
#include <vector>
-namespace Moses {
+namespace Moses
+{
class OutputFileStream;
-namespace GHKM {
+namespace GHKM
+{
struct Options;
class ParseTree;
class ExtractGHKM
{
- public:
+public:
ExtractGHKM() : m_name("extract-ghkm") {}
- const std::string &GetName() const { return m_name; }
+ const std::string &GetName() const {
+ return m_name;
+ }
int Main(int argc, char *argv[]);
- private:
+private:
void Error(const std::string &) const;
void OpenInputFileOrDie(const std::string &, std::ifstream &);
void OpenOutputFileOrDie(const std::string &, std::ofstream &);
@@ -60,7 +64,7 @@ class ExtractGHKM
const std::map<std::string, int> &,
std::ostream &);
std::vector<std::string> ReadTokens(const std::string &);
-
+
void ProcessOptions(int, char *[], Options &) const;
std::string m_name;
diff --git a/phrase-extract/extract-ghkm/Main.cpp b/phrase-extract/extract-ghkm/Main.cpp
index faf3230a6..14064406b 100644
--- a/phrase-extract/extract-ghkm/Main.cpp
+++ b/phrase-extract/extract-ghkm/Main.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
diff --git a/phrase-extract/extract-ghkm/Node.cpp b/phrase-extract/extract-ghkm/Node.cpp
index beb7470b8..e14d8c050 100644
--- a/phrase-extract/extract-ghkm/Node.cpp
+++ b/phrase-extract/extract-ghkm/Node.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -21,8 +21,10 @@
#include "Subgraph.h"
-namespace Moses {
-namespace GHKM {
+namespace Moses
+{
+namespace GHKM
+{
Node::~Node()
{
diff --git a/phrase-extract/extract-ghkm/Node.h b/phrase-extract/extract-ghkm/Node.h
index 775473362..2eed01311 100644
--- a/phrase-extract/extract-ghkm/Node.h
+++ b/phrase-extract/extract-ghkm/Node.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -28,8 +28,10 @@
#include <string>
#include <vector>
-namespace Moses {
-namespace GHKM {
+namespace Moses
+{
+namespace GHKM
+{
class Subgraph;
@@ -37,34 +39,68 @@ enum NodeType { SOURCE, TARGET, TREE };
class Node
{
- public:
+public:
Node(const std::string &label, NodeType type)
- : m_label(label)
- , m_type(type)
- , m_pcfgScore(0.0f) {}
+ : m_label(label)
+ , m_type(type)
+ , m_pcfgScore(0.0f) {}
~Node();
- const std::string &GetLabel() const { return m_label; }
- NodeType GetType() const { return m_type; }
- const std::vector<Node*> &GetChildren() const { return m_children; }
- const std::vector<Node*> &GetParents() const { return m_parents; }
- float GetPcfgScore() const { return m_pcfgScore; }
- const Span &GetSpan() const { return m_span; }
- const Span &GetComplementSpan() const { return m_complementSpan; }
- const std::vector<const Subgraph*> &GetRules() const { return m_rules; }
-
- void SetChildren(const std::vector<Node*> &c) { m_children = c; }
- void SetParents(const std::vector<Node*> &p) { m_parents = p; }
- void SetPcfgScore(float s) { m_pcfgScore = s; }
- void SetSpan(const Span &s) { m_span = s; }
- void SetComplementSpan(const Span &cs) { m_complementSpan = cs; }
-
- void AddChild(Node *c) { m_children.push_back(c); }
- void AddParent(Node *p) { m_parents.push_back(p); }
- void AddRule(const Subgraph *s) { m_rules.push_back(s); }
-
- bool IsSink() const { return m_children.empty(); }
+ const std::string &GetLabel() const {
+ return m_label;
+ }
+ NodeType GetType() const {
+ return m_type;
+ }
+ const std::vector<Node*> &GetChildren() const {
+ return m_children;
+ }
+ const std::vector<Node*> &GetParents() const {
+ return m_parents;
+ }
+ float GetPcfgScore() const {
+ return m_pcfgScore;
+ }
+ const Span &GetSpan() const {
+ return m_span;
+ }
+ const Span &GetComplementSpan() const {
+ return m_complementSpan;
+ }
+ const std::vector<const Subgraph*> &GetRules() const {
+ return m_rules;
+ }
+
+ void SetChildren(const std::vector<Node*> &c) {
+ m_children = c;
+ }
+ void SetParents(const std::vector<Node*> &p) {
+ m_parents = p;
+ }
+ void SetPcfgScore(float s) {
+ m_pcfgScore = s;
+ }
+ void SetSpan(const Span &s) {
+ m_span = s;
+ }
+ void SetComplementSpan(const Span &cs) {
+ m_complementSpan = cs;
+ }
+
+ void AddChild(Node *c) {
+ m_children.push_back(c);
+ }
+ void AddParent(Node *p) {
+ m_parents.push_back(p);
+ }
+ void AddRule(const Subgraph *s) {
+ m_rules.push_back(s);
+ }
+
+ bool IsSink() const {
+ return m_children.empty();
+ }
bool IsPreterminal() const;
void PropagateIndex(int);
@@ -82,7 +118,7 @@ class Node
template<typename InputIterator>
static Node *LowestCommonAncestor(InputIterator first, InputIterator last);
- private:
+private:
// Disallow copying
Node(const Node &);
Node &operator=(const Node &);
diff --git a/phrase-extract/extract-ghkm/Options.h b/phrase-extract/extract-ghkm/Options.h
index d348a57d8..e54a9ddae 100644
--- a/phrase-extract/extract-ghkm/Options.h
+++ b/phrase-extract/extract-ghkm/Options.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -23,25 +23,27 @@
#include <string>
-namespace Moses {
-namespace GHKM {
+namespace Moses
+{
+namespace GHKM
+{
struct Options {
- public:
+public:
Options()
- : allowUnary(false)
- , conditionOnTargetLhs(false)
- , gzOutput(false)
- , maxNodes(15)
- , maxRuleDepth(3)
- , maxRuleSize(3)
- , maxScope(3)
- , minimal(false)
- , pcfg(false)
- , sentenceOffset(0)
- , unpairedExtractFormat(false)
- , unknownWordMinRelFreq(0.03f)
- , unknownWordUniform(false) {}
+ : allowUnary(false)
+ , conditionOnTargetLhs(false)
+ , gzOutput(false)
+ , maxNodes(15)
+ , maxRuleDepth(3)
+ , maxRuleSize(3)
+ , maxScope(3)
+ , minimal(false)
+ , pcfg(false)
+ , sentenceOffset(0)
+ , unpairedExtractFormat(false)
+ , unknownWordMinRelFreq(0.03f)
+ , unknownWordUniform(false) {}
// Positional options
std::string targetFile;
diff --git a/phrase-extract/extract-ghkm/ParseTree.cpp b/phrase-extract/extract-ghkm/ParseTree.cpp
index 052b8dee1..f86486487 100644
--- a/phrase-extract/extract-ghkm/ParseTree.cpp
+++ b/phrase-extract/extract-ghkm/ParseTree.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -19,8 +19,10 @@
#include "ParseTree.h"
-namespace Moses {
-namespace GHKM {
+namespace Moses
+{
+namespace GHKM
+{
ParseTree::~ParseTree()
{
diff --git a/phrase-extract/extract-ghkm/ParseTree.h b/phrase-extract/extract-ghkm/ParseTree.h
index 273e2e04e..03da17735 100644
--- a/phrase-extract/extract-ghkm/ParseTree.h
+++ b/phrase-extract/extract-ghkm/ParseTree.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -24,27 +24,39 @@
#include <string>
#include <vector>
-namespace Moses {
-namespace GHKM {
+namespace Moses
+{
+namespace GHKM
+{
class ParseTree
{
- public:
+public:
ParseTree(const std::string &label)
- : m_label(label)
- , m_parent(0)
- , m_pcfgScore(0.0) {}
+ : m_label(label)
+ , m_parent(0)
+ , m_pcfgScore(0.0) {}
~ParseTree();
- const std::string &GetLabel() const { return m_label; }
- const std::vector<ParseTree*> &GetChildren() const { return m_children; }
- const ParseTree *GetParent() const { return m_parent; }
- float GetPcfgScore() const { return m_pcfgScore; }
+ const std::string &GetLabel() const {
+ return m_label;
+ }
+ const std::vector<ParseTree*> &GetChildren() const {
+ return m_children;
+ }
+ const ParseTree *GetParent() const {
+ return m_parent;
+ }
+ float GetPcfgScore() const {
+ return m_pcfgScore;
+ }
void SetParent(ParseTree *);
void SetChildren(const std::vector<ParseTree*> &);
- void SetPcfgScore(float score) { m_pcfgScore = score; }
+ void SetPcfgScore(float score) {
+ m_pcfgScore = score;
+ }
void AddChild(ParseTree *);
@@ -53,7 +65,7 @@ class ParseTree
template<typename OutputIterator>
void GetLeaves(OutputIterator);
- private:
+private:
// Disallow copying
ParseTree(const ParseTree &);
ParseTree &operator=(const ParseTree &);
diff --git a/phrase-extract/extract-ghkm/ScfgRule.cpp b/phrase-extract/extract-ghkm/ScfgRule.cpp
index 5dc70052c..2c901413d 100644
--- a/phrase-extract/extract-ghkm/ScfgRule.cpp
+++ b/phrase-extract/extract-ghkm/ScfgRule.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -24,13 +24,15 @@
#include <algorithm>
-namespace Moses {
-namespace GHKM {
+namespace Moses
+{
+namespace GHKM
+{
ScfgRule::ScfgRule(const Subgraph &fragment)
- : m_sourceLHS("X", NonTerminal)
- , m_targetLHS(fragment.GetRoot()->GetLabel(), NonTerminal)
- , m_pcfgScore(fragment.GetPcfgScore())
+ : m_sourceLHS("X", NonTerminal)
+ , m_targetLHS(fragment.GetRoot()->GetLabel(), NonTerminal)
+ , m_pcfgScore(fragment.GetPcfgScore())
{
// Source RHS
diff --git a/phrase-extract/extract-ghkm/ScfgRule.h b/phrase-extract/extract-ghkm/ScfgRule.h
index 2405d8fa3..21a9e9900 100644
--- a/phrase-extract/extract-ghkm/ScfgRule.h
+++ b/phrase-extract/extract-ghkm/ScfgRule.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -26,42 +26,59 @@
#include <string>
#include <vector>
-namespace Moses {
-namespace GHKM {
+namespace Moses
+{
+namespace GHKM
+{
class Node;
class Subgraph;
enum SymbolType { Terminal, NonTerminal };
-struct Symbol
-{
- public:
+struct Symbol {
+public:
Symbol(const std::string &v, SymbolType t) : m_value(v) , m_type(t) {}
- const std::string &GetValue() const { return m_value; }
- SymbolType GetType() const { return m_type; }
+ const std::string &GetValue() const {
+ return m_value;
+ }
+ SymbolType GetType() const {
+ return m_type;
+ }
- private:
+private:
std::string m_value;
SymbolType m_type;
};
class ScfgRule
{
- public:
+public:
ScfgRule(const Subgraph &fragment);
- const Symbol &GetSourceLHS() const { return m_sourceLHS; }
- const Symbol &GetTargetLHS() const { return m_targetLHS; }
- const std::vector<Symbol> &GetSourceRHS() const { return m_sourceRHS; }
- const std::vector<Symbol> &GetTargetRHS() const { return m_targetRHS; }
- const Alignment &GetAlignment() const { return m_alignment; }
- float GetPcfgScore() const { return m_pcfgScore; }
+ const Symbol &GetSourceLHS() const {
+ return m_sourceLHS;
+ }
+ const Symbol &GetTargetLHS() const {
+ return m_targetLHS;
+ }
+ const std::vector<Symbol> &GetSourceRHS() const {
+ return m_sourceRHS;
+ }
+ const std::vector<Symbol> &GetTargetRHS() const {
+ return m_targetRHS;
+ }
+ const Alignment &GetAlignment() const {
+ return m_alignment;
+ }
+ float GetPcfgScore() const {
+ return m_pcfgScore;
+ }
int Scope() const;
- private:
+private:
static bool PartitionOrderComp(const Node *, const Node *);
Symbol m_sourceLHS;
diff --git a/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp b/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp
index cd993d6e8..54b3978d1 100644
--- a/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp
+++ b/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -30,8 +30,10 @@
#include <sstream>
#include <vector>
-namespace Moses {
-namespace GHKM {
+namespace Moses
+{
+namespace GHKM
+{
void ScfgRuleWriter::Write(const ScfgRule &rule)
{
@@ -70,8 +72,8 @@ void ScfgRuleWriter::Write(const ScfgRule &rule)
}
void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule,
- std::ostream &sourceSS,
- std::ostream &targetSS)
+ std::ostream &sourceSS,
+ std::ostream &targetSS)
{
const std::vector<Symbol> &sourceRHS = rule.GetSourceRHS();
const std::vector<Symbol> &targetRHS = rule.GetTargetRHS();
@@ -122,8 +124,8 @@ void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule,
}
void ScfgRuleWriter::WriteUnpairedFormat(const ScfgRule &rule,
- std::ostream &sourceSS,
- std::ostream &targetSS)
+ std::ostream &sourceSS,
+ std::ostream &targetSS)
{
const std::vector<Symbol> &sourceRHS = rule.GetSourceRHS();
const std::vector<Symbol> &targetRHS = rule.GetTargetRHS();
diff --git a/phrase-extract/extract-ghkm/ScfgRuleWriter.h b/phrase-extract/extract-ghkm/ScfgRuleWriter.h
index b92a432a1..ee29e49e5 100644
--- a/phrase-extract/extract-ghkm/ScfgRuleWriter.h
+++ b/phrase-extract/extract-ghkm/ScfgRuleWriter.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -23,8 +23,10 @@
#include <ostream>
-namespace Moses {
-namespace GHKM {
+namespace Moses
+{
+namespace GHKM
+{
struct Options;
class ScfgRule;
@@ -32,15 +34,15 @@ struct Symbol;
class ScfgRuleWriter
{
- public:
+public:
ScfgRuleWriter(std::ostream &fwd, std::ostream &inv, const Options &options)
- : m_fwd(fwd)
- , m_inv(inv)
- , m_options(options) {}
+ : m_fwd(fwd)
+ , m_inv(inv)
+ , m_options(options) {}
void Write(const ScfgRule &);
- private:
+private:
// Disallow copying
ScfgRuleWriter(const ScfgRuleWriter &);
ScfgRuleWriter &operator=(const ScfgRuleWriter &);
diff --git a/phrase-extract/extract-ghkm/Span.cpp b/phrase-extract/extract-ghkm/Span.cpp
index f0eccbdf2..d637ec3d2 100644
--- a/phrase-extract/extract-ghkm/Span.cpp
+++ b/phrase-extract/extract-ghkm/Span.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -19,8 +19,10 @@
#include "Span.h"
-namespace Moses {
-namespace GHKM {
+namespace Moses
+{
+namespace GHKM
+{
bool SpansIntersect(const Span &a, const ContiguousSpan &b)
{
diff --git a/phrase-extract/extract-ghkm/Span.h b/phrase-extract/extract-ghkm/Span.h
index 003d1ef84..c4d146c4e 100644
--- a/phrase-extract/extract-ghkm/Span.h
+++ b/phrase-extract/extract-ghkm/Span.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -24,8 +24,10 @@
#include <map>
#include <set>
-namespace Moses {
-namespace GHKM {
+namespace Moses
+{
+namespace GHKM
+{
typedef std::set<int> Span;
typedef std::pair<int, int> ContiguousSpan;
diff --git a/phrase-extract/extract-ghkm/Subgraph.cpp b/phrase-extract/extract-ghkm/Subgraph.cpp
index e048f2c55..3c0503010 100644
--- a/phrase-extract/extract-ghkm/Subgraph.cpp
+++ b/phrase-extract/extract-ghkm/Subgraph.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -21,8 +21,10 @@
#include "Node.h"
-namespace Moses {
-namespace GHKM {
+namespace Moses
+{
+namespace GHKM
+{
void Subgraph::GetTargetLeaves(std::vector<const Node *> &result) const
{
diff --git a/phrase-extract/extract-ghkm/Subgraph.h b/phrase-extract/extract-ghkm/Subgraph.h
index ede1233e9..f4d1e0c8d 100644
--- a/phrase-extract/extract-ghkm/Subgraph.h
+++ b/phrase-extract/extract-ghkm/Subgraph.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -26,47 +26,62 @@
#include <set>
#include <vector>
-namespace Moses {
-namespace GHKM {
+namespace Moses
+{
+namespace GHKM
+{
class Node;
class Subgraph
{
- public:
+public:
Subgraph(const Node *root)
- : m_root(root)
- , m_depth(0)
- , m_size(root->GetType() == TREE ? 1 : 0)
- , m_nodeCount(1)
- , m_pcfgScore(0.0f) {}
+ : m_root(root)
+ , m_depth(0)
+ , m_size(root->GetType() == TREE ? 1 : 0)
+ , m_nodeCount(1)
+ , m_pcfgScore(0.0f) {}
Subgraph(const Node *root, const std::set<const Node *> &leaves)
- : m_root(root)
- , m_leaves(leaves)
- , m_depth(-1)
- , m_size(-1)
- , m_nodeCount(-1)
- , m_pcfgScore(0.0f)
- {
+ : m_root(root)
+ , m_leaves(leaves)
+ , m_depth(-1)
+ , m_size(-1)
+ , m_nodeCount(-1)
+ , m_pcfgScore(0.0f) {
m_depth = CalcDepth(m_root);
m_size = CalcSize(m_root);
m_nodeCount = CountNodes(m_root);
m_pcfgScore = CalcPcfgScore();
}
- const Node *GetRoot() const { return m_root; }
- const std::set<const Node *> &GetLeaves() const { return m_leaves; }
- int GetDepth() const { return m_depth; }
- int GetSize() const { return m_size; }
- int GetNodeCount() const { return m_nodeCount; }
- float GetPcfgScore() const { return m_pcfgScore; }
+ const Node *GetRoot() const {
+ return m_root;
+ }
+ const std::set<const Node *> &GetLeaves() const {
+ return m_leaves;
+ }
+ int GetDepth() const {
+ return m_depth;
+ }
+ int GetSize() const {
+ return m_size;
+ }
+ int GetNodeCount() const {
+ return m_nodeCount;
+ }
+ float GetPcfgScore() const {
+ return m_pcfgScore;
+ }
- bool IsTrivial() const { return m_leaves.empty(); }
+ bool IsTrivial() const {
+ return m_leaves.empty();
+ }
void GetTargetLeaves(std::vector<const Node *> &) const;
- private:
+private:
void GetTargetLeaves(const Node *, std::vector<const Node *> &) const;
int CalcDepth(const Node *) const;
int CalcSize(const Node *) const;
diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.cpp b/phrase-extract/extract-ghkm/XmlTreeParser.cpp
index 66024ff01..2f28c3244 100644
--- a/phrase-extract/extract-ghkm/XmlTreeParser.cpp
+++ b/phrase-extract/extract-ghkm/XmlTreeParser.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -29,13 +29,15 @@
using namespace MosesTraining;
-namespace Moses {
-namespace GHKM {
+namespace Moses
+{
+namespace GHKM
+{
XmlTreeParser::XmlTreeParser(std::set<std::string> &labelSet,
std::map<std::string, int> &topLabelSet)
- : m_labelSet(labelSet)
- , m_topLabelSet(topLabelSet)
+ : m_labelSet(labelSet)
+ , m_topLabelSet(topLabelSet)
{
}
@@ -60,8 +62,8 @@ std::auto_ptr<ParseTree> XmlTreeParser::Parse(const std::string &line)
// Converts a SyntaxNode tree to a Moses::GHKM::ParseTree.
std::auto_ptr<ParseTree> XmlTreeParser::ConvertTree(
- const SyntaxNode &tree,
- const std::vector<std::string> &words)
+ const SyntaxNode &tree,
+ const std::vector<std::string> &words)
{
std::auto_ptr<ParseTree> root(new ParseTree(tree.GetLabel()));
root->SetPcfgScore(tree.GetPcfgScore());
diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.h b/phrase-extract/extract-ghkm/XmlTreeParser.h
index 7b63ae1e4..d00fd7d9f 100644
--- a/phrase-extract/extract-ghkm/XmlTreeParser.h
+++ b/phrase-extract/extract-ghkm/XmlTreeParser.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -31,18 +31,21 @@
#include <string>
#include <vector>
-namespace Moses {
-namespace GHKM {
+namespace Moses
+{
+namespace GHKM
+{
class ParseTree;
// Parses a string in Moses' XML parse tree format and returns a ParseTree
// object.
-class XmlTreeParser {
- public:
+class XmlTreeParser
+{
+public:
XmlTreeParser(std::set<std::string> &, std::map<std::string, int> &);
std::auto_ptr<ParseTree> Parse(const std::string &);
- private:
+private:
std::auto_ptr<ParseTree> ConvertTree(const MosesTraining::SyntaxNode &,
const std::vector<std::string> &);
diff --git a/phrase-extract/extract-lex-main.cpp b/phrase-extract/extract-lex-main.cpp
index a59450da8..f63015a6a 100644
--- a/phrase-extract/extract-lex-main.cpp
+++ b/phrase-extract/extract-lex-main.cpp
@@ -10,16 +10,16 @@ using namespace MosesTraining;
float COUNT_INCR = 1;
-void fix(std::ostream& stream)
+void fix(std::ostream& stream)
{
- stream.setf(std::ios::fixed);
- stream.precision(7);
+ stream.setf(std::ios::fixed);
+ stream.precision(7);
}
int main(int argc, char* argv[])
{
cerr << "Starting...\n";
-
+
assert(argc == 6);
char* &filePathTarget = argv[1];
char* &filePathSource = argv[2];
@@ -43,8 +43,7 @@ int main(int argc, char* argv[])
size_t lineCount = 0;
string lineTarget, lineSource, lineAlign;
- while (getline(streamTarget, lineTarget))
- {
+ while (getline(streamTarget, lineTarget)) {
if (lineCount % 10000 == 0)
cerr << lineCount << " ";
@@ -52,7 +51,7 @@ int main(int argc, char* argv[])
assert(isSource);
istream &isAlign = getline(streamAlign, lineAlign);
assert(isAlign);
-
+
vector<string> toksTarget, toksSource, toksAlign;
Tokenize(toksTarget, lineTarget);
Tokenize(toksSource, lineSource);
@@ -61,13 +60,13 @@ int main(int argc, char* argv[])
/*
cerr << endl
<< toksTarget.size() << " " << lineTarget << endl
- << toksSource.size() << " " << lineSource << endl
+ << toksSource.size() << " " << lineSource << endl
<< toksAlign.size() << " " << lineAlign << endl;
*/
extractSingleton.Process(toksTarget, toksSource, toksAlign, lineCount);
-
- ++lineCount;
+
+ ++lineCount;
}
extractSingleton.Output(streamLexS2T, streamLexT2S);
@@ -86,35 +85,32 @@ namespace MosesTraining
const std::string *Vocab::GetOrAdd(const std::string &word)
{
- const string *ret = &(*m_coll.insert(word).first);
+ const string *ret = &(*m_coll.insert(word).first);
return ret;
}
void ExtractLex::Process(vector<string> &toksTarget, vector<string> &toksSource, vector<string> &toksAlign, size_t lineCount)
{
std::vector<bool> m_sourceAligned(toksSource.size(), false)
- , m_targetAligned(toksTarget.size(), false);
+ , m_targetAligned(toksTarget.size(), false);
vector<string>::const_iterator iterAlign;
- for (iterAlign = toksAlign.begin(); iterAlign != toksAlign.end(); ++iterAlign)
- {
+ for (iterAlign = toksAlign.begin(); iterAlign != toksAlign.end(); ++iterAlign) {
const string &alignTok = *iterAlign;
-
+
vector<size_t> alignPos;
Tokenize(alignPos, alignTok, "-");
assert(alignPos.size() == 2);
- if (alignPos[0] >= toksSource.size())
- {
- cerr << "ERROR: alignment over source length. Alignment " << alignPos[0] << " at line " << lineCount << endl;
- continue;
- }
- if (alignPos[1] >= toksTarget.size())
- {
- cerr << "ERROR: alignment over target length. Alignment " << alignPos[1] << " at line " << lineCount << endl;
- continue;
- }
-
+ if (alignPos[0] >= toksSource.size()) {
+ cerr << "ERROR: alignment over source length. Alignment " << alignPos[0] << " at line " << lineCount << endl;
+ continue;
+ }
+ if (alignPos[1] >= toksTarget.size()) {
+ cerr << "ERROR: alignment over target length. Alignment " << alignPos[1] << " at line " << lineCount << endl;
+ continue;
+ }
+
assert(alignPos[0] < toksSource.size());
assert(alignPos[1] < toksTarget.size());
@@ -123,12 +119,12 @@ void ExtractLex::Process(vector<string> &toksTarget, vector<string> &toksSource,
const string &tmpSource = toksSource[ alignPos[0] ];
const string &tmpTarget = toksTarget[ alignPos[1] ];
-
+
const string *source = m_vocab.GetOrAdd(tmpSource);
const string *target = m_vocab.GetOrAdd(tmpTarget);
Process(target, source);
-
+
}
ProcessUnaligned(toksTarget, toksSource, m_sourceAligned, m_targetAligned);
@@ -154,15 +150,13 @@ void ExtractLex::Process(WordCount &wcIn, const std::string *out)
}
void ExtractLex::ProcessUnaligned(vector<string> &toksTarget, vector<string> &toksSource
- , const std::vector<bool> &m_sourceAligned, const std::vector<bool> &m_targetAligned)
+ , const std::vector<bool> &m_sourceAligned, const std::vector<bool> &m_targetAligned)
{
- const string *nullWord = m_vocab.GetOrAdd("NULL");
+ const string *nullWord = m_vocab.GetOrAdd("NULL");
- for (size_t pos = 0; pos < m_sourceAligned.size(); ++pos)
- {
+ for (size_t pos = 0; pos < m_sourceAligned.size(); ++pos) {
bool isAlignedCurr = m_sourceAligned[pos];
- if (!isAlignedCurr)
- {
+ if (!isAlignedCurr) {
const string &tmpWord = toksSource[pos];
const string *sourceWord = m_vocab.GetOrAdd(tmpWord);
@@ -170,11 +164,9 @@ void ExtractLex::ProcessUnaligned(vector<string> &toksTarget, vector<string> &to
}
}
- for (size_t pos = 0; pos < m_targetAligned.size(); ++pos)
- {
+ for (size_t pos = 0; pos < m_targetAligned.size(); ++pos) {
bool isAlignedCurr = m_targetAligned[pos];
- if (!isAlignedCurr)
- {
+ if (!isAlignedCurr) {
const string &tmpWord = toksTarget[pos];
const string *targetWord = m_vocab.GetOrAdd(tmpWord);
@@ -193,16 +185,14 @@ void ExtractLex::Output(std::ofstream &streamLexS2T, std::ofstream &streamLexT2S
void ExtractLex::Output(const std::map<const std::string*, WordCount> &coll, std::ofstream &outStream)
{
std::map<const std::string*, WordCount>::const_iterator iterOuter;
- for (iterOuter = coll.begin(); iterOuter != coll.end(); ++iterOuter)
- {
+ for (iterOuter = coll.begin(); iterOuter != coll.end(); ++iterOuter) {
const string &inStr = *iterOuter->first;
const WordCount &inWC = iterOuter->second;
const std::map<const std::string*, WordCount> &outColl = inWC.GetColl();
std::map<const std::string*, WordCount>::const_iterator iterInner;
- for (iterInner = outColl.begin(); iterInner != outColl.end(); ++iterInner)
- {
+ for (iterInner = outColl.begin(); iterInner != outColl.end(); ++iterInner) {
const string &outStr = *iterInner->first;
const WordCount &outWC = iterInner->second;
diff --git a/phrase-extract/extract-lex.h b/phrase-extract/extract-lex.h
index d272cf6ff..d79038fc6 100644
--- a/phrase-extract/extract-lex.h
+++ b/phrase-extract/extract-lex.h
@@ -14,10 +14,10 @@ namespace MosesTraining
template<typename T>
inline T Scan(const std::string &input)
{
- std::stringstream stream(input);
- T ret;
- stream >> ret;
- return ret;
+ std::stringstream stream(input);
+ T ret;
+ stream >> ret;
+ return ret;
}
@@ -25,13 +25,12 @@ inline T Scan(const std::string &input)
template<typename T>
inline void Scan(std::vector<T> &output, const std::vector< std::string > &input)
{
- output.resize(input.size());
- for (size_t i = 0 ; i < input.size() ; i++)
- {
- output[i] = Scan<T>( input[i] );
- }
+ output.resize(input.size());
+ for (size_t i = 0 ; i < input.size() ; i++) {
+ output[i] = Scan<T>( input[i] );
+ }
}
-
+
inline void Tokenize(std::vector<std::string> &output
, const std::string& str
@@ -55,17 +54,17 @@ inline void Tokenize(std::vector<std::string> &output
// speeded up version of above
template<typename T>
inline void Tokenize( std::vector<T> &output
- , const std::string &input
- , const std::string& delimiters = " \t")
+ , const std::string &input
+ , const std::string& delimiters = " \t")
{
- std::vector<std::string> stringVector;
- Tokenize(stringVector, input, delimiters);
- return Scan<T>(output, stringVector );
+ std::vector<std::string> stringVector;
+ Tokenize(stringVector, input, delimiters);
+ return Scan<T>(output, stringVector );
}
class WordCount
{
- friend std::ostream& operator<<(std::ostream&, const WordCount&);
+ friend std::ostream& operator<<(std::ostream&, const WordCount&);
public:
float m_count;
@@ -83,13 +82,16 @@ public:
void AddCount(float incr);
- std::map<const std::string*, WordCount> &GetColl()
- { return m_coll; }
- const std::map<const std::string*, WordCount> &GetColl() const
- { return m_coll; }
+ std::map<const std::string*, WordCount> &GetColl() {
+ return m_coll;
+ }
+ const std::map<const std::string*, WordCount> &GetColl() const {
+ return m_coll;
+ }
- const float GetCount() const
- { return m_count; }
+ const float GetCount() const {
+ return m_count;
+ }
};
diff --git a/phrase-extract/extract-main.cpp b/phrase-extract/extract-main.cpp
index cab91e92d..a8edb298a 100644
--- a/phrase-extract/extract-main.cpp
+++ b/phrase-extract/extract-main.cpp
@@ -29,7 +29,8 @@
using namespace std;
using namespace MosesTraining;
-namespace MosesTraining {
+namespace MosesTraining
+{
const long int LINE_MAX_LENGTH = 500000 ;
@@ -49,37 +50,38 @@ typedef vector < HPhrase > HPhraseVector;
// The key of the map is the English index and the value is a set of the source ones
typedef map <int, set<int> > HSentenceVertices;
- REO_POS getOrientWordModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
+REO_POS getOrientWordModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
int, int, int, int, int, int, int,
bool (*)(int, int), bool (*)(int, int));
- REO_POS getOrientPhraseModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
+REO_POS getOrientPhraseModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
int, int, int, int, int, int, int,
bool (*)(int, int), bool (*)(int, int),
const HSentenceVertices &, const HSentenceVertices &);
- REO_POS getOrientHierModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
+REO_POS getOrientHierModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
int, int, int, int, int, int, int,
bool (*)(int, int), bool (*)(int, int),
const HSentenceVertices &, const HSentenceVertices &,
const HSentenceVertices &, const HSentenceVertices &,
REO_POS);
- void insertVertex(HSentenceVertices &, int, int);
- void insertPhraseVertices(HSentenceVertices &, HSentenceVertices &, HSentenceVertices &, HSentenceVertices &,
+void insertVertex(HSentenceVertices &, int, int);
+void insertPhraseVertices(HSentenceVertices &, HSentenceVertices &, HSentenceVertices &, HSentenceVertices &,
int, int, int, int);
- string getOrientString(REO_POS, REO_MODEL_TYPE);
+string getOrientString(REO_POS, REO_MODEL_TYPE);
- bool ge(int, int);
- bool le(int, int);
- bool lt(int, int);
+bool ge(int, int);
+bool le(int, int);
+bool lt(int, int);
- bool isAligned (SentenceAlignment &, int, int);
- int sentenceOffset = 0;
+bool isAligned (SentenceAlignment &, int, int);
+int sentenceOffset = 0;
}
-namespace MosesTraining{
+namespace MosesTraining
+{
-class ExtractTask
+class ExtractTask
{
public:
ExtractTask(size_t id, SentenceAlignment &sentence,PhraseExtractionOptions &initoptions, Moses::OutputFileStream &extractFile, Moses::OutputFileStream &extractFileInv,Moses::OutputFileStream &extractFileOrientation):
@@ -87,8 +89,8 @@ public:
m_options(initoptions),
m_extractFile(extractFile),
m_extractFileInv(extractFileInv),
- m_extractFileOrientation(extractFileOrientation){}
-void Run();
+ m_extractFileOrientation(extractFileOrientation) {}
+ void Run();
private:
vector< string > m_extractedPhrases;
vector< string > m_extractedPhrasesInv;
@@ -98,7 +100,7 @@ private:
void extract(SentenceAlignment &);
void addPhrase(SentenceAlignment &, int, int, int, int, string &);
void writePhrasesToFile();
-
+
SentenceAlignment &m_sentence;
const PhraseExtractionOptions &m_options;
Moses::OutputFileStream &m_extractFile;
@@ -112,7 +114,7 @@ int main(int argc, char* argv[])
cerr << "PhraseExtract v1.4, written by Philipp Koehn\n"
<< "phrase extraction from an aligned parallel corpus\n";
- if (argc < 6) {
+ if (argc < 6) {
cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] ";
cerr<<"| --OnlyOutputSpanInfo | --NoTTable | --GZOutput | --IncludeSentenceId | --SentenceOffset n | --InstanceWeights filename ]\n";
exit(1);
@@ -135,7 +137,7 @@ int main(int argc, char* argv[])
} else if (strcmp(argv[i],"--NoTTable") == 0) {
options.initTranslationFlag(false);
} else if (strcmp(argv[i], "--IncludeSentenceId") == 0) {
- options.initIncludeSentenceIdFlag(true);
+ options.initIncludeSentenceIdFlag(true);
} else if (strcmp(argv[i], "--SentenceOffset") == 0) {
if (i+1 >= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') {
cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl;
@@ -143,7 +145,7 @@ int main(int argc, char* argv[])
}
sentenceOffset = atoi(argv[++i]);
} else if (strcmp(argv[i], "--GZOutput") == 0) {
- options.initGzOutput(true);
+ options.initGzOutput(true);
} else if (strcmp(argv[i], "--InstanceWeights") == 0) {
if (i+1 >= argc) {
cerr << "extract: syntax error, used switch --InstanceWeights without file name" << endl;
@@ -260,7 +262,7 @@ int main(int argc, char* argv[])
SAFE_GETLINE((*iwFileP), weightString, LINE_MAX_LENGTH, '\n', __FILE__);
}
SentenceAlignment sentence;
- // cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
+ // cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
//az: output src, tgt, and alignment line
if (options.isOnlyOutputSpanInfo()) {
cout << "LOG: SRC: " << foreignString << endl;
@@ -268,8 +270,8 @@ int main(int argc, char* argv[])
cout << "LOG: ALT: " << alignmentString << endl;
cout << "LOG: PHRASES_BEGIN:" << endl;
}
- if (sentence.create( englishString, foreignString, alignmentString, weightString, i, false)) {
- ExtractTask *task = new ExtractTask(i-1, sentence, options, extractFile , extractFileInv, extractFileOrientation);
+ if (sentence.create( englishString, foreignString, alignmentString, weightString, i, false)) {
+ ExtractTask *task = new ExtractTask(i-1, sentence, options, extractFile , extractFileInv, extractFileOrientation);
task->Run();
delete task;
@@ -286,17 +288,18 @@ int main(int argc, char* argv[])
if (options.isTranslationFlag()) {
extractFile.Close();
extractFileInv.Close();
-
+
+ }
+ if (options.isOrientationFlag()) {
+ extractFileOrientation.Close();
}
- if (options.isOrientationFlag()){
- extractFileOrientation.Close();
- }
}
}
namespace MosesTraining
{
-void ExtractTask::Run() {
+void ExtractTask::Run()
+{
extract(m_sentence);
writePhrasesToFile();
m_extractedPhrases.clear();
@@ -665,16 +668,16 @@ void ExtractTask::addPhrase( SentenceAlignment &sentence, int startE, int endE,
{
// source
// // cout << "adding ( " << startF << "-" << endF << ", " << startE << "-" << endE << ")\n";
- ostringstream outextractstr;
- ostringstream outextractstrInv;
- ostringstream outextractstrOrientation;
+ ostringstream outextractstr;
+ ostringstream outextractstrInv;
+ ostringstream outextractstrOrientation;
if (m_options.isOnlyOutputSpanInfo()) {
cout << startF << " " << endF << " " << startE << " " << endE << endl;
return;
}
-for(int fi=startF; fi<=endF; fi++) {
+ for(int fi=startF; fi<=endF; fi++) {
if (m_options.isTranslationFlag()) outextractstr << sentence.source[fi] << " ";
if (m_options.isOrientationFlag()) outextractstrOrientation << sentence.source[fi] << " ";
}
@@ -693,13 +696,13 @@ for(int fi=startF; fi<=endF; fi++) {
// source (for inverse)
- if (m_options.isTranslationFlag()) {
+ if (m_options.isTranslationFlag()) {
for(int fi=startF; fi<=endF; fi++)
outextractstrInv << sentence.source[fi] << " ";
outextractstrInv << "|||";
}
// alignment
- if (m_options.isTranslationFlag()) {
+ if (m_options.isTranslationFlag()) {
for(int ei=startE; ei<=endE; ei++) {
for(unsigned int i=0; i<sentence.alignedToT[ei].size(); i++) {
int fi = sentence.alignedToT[ei][i];
@@ -732,39 +735,40 @@ for(int fi=startF; fi<=endF; fi++) {
if (m_options.isOrientationFlag()) outextractstrOrientation << "\n";
- m_extractedPhrases.push_back(outextractstr.str());
- m_extractedPhrasesInv.push_back(outextractstrInv.str());
- m_extractedPhrasesOri.push_back(outextractstrOrientation.str());
+ m_extractedPhrases.push_back(outextractstr.str());
+ m_extractedPhrasesInv.push_back(outextractstrInv.str());
+ m_extractedPhrasesOri.push_back(outextractstrOrientation.str());
}
-void ExtractTask::writePhrasesToFile(){
+void ExtractTask::writePhrasesToFile()
+{
- ostringstream outextractFile;
- ostringstream outextractFileInv;
- ostringstream outextractFileOrientation;
+ ostringstream outextractFile;
+ ostringstream outextractFileInv;
+ ostringstream outextractFileOrientation;
- for(vector<string>::const_iterator phrase=m_extractedPhrases.begin();phrase!=m_extractedPhrases.end();phrase++){
- outextractFile<<phrase->data();
- }
- for(vector<string>::const_iterator phrase=m_extractedPhrasesInv.begin();phrase!=m_extractedPhrasesInv.end();phrase++){
- outextractFileInv<<phrase->data();
- }
- for(vector<string>::const_iterator phrase=m_extractedPhrasesOri.begin();phrase!=m_extractedPhrasesOri.end();phrase++){
- outextractFileOrientation<<phrase->data();
- }
+ for(vector<string>::const_iterator phrase=m_extractedPhrases.begin(); phrase!=m_extractedPhrases.end(); phrase++) {
+ outextractFile<<phrase->data();
+ }
+ for(vector<string>::const_iterator phrase=m_extractedPhrasesInv.begin(); phrase!=m_extractedPhrasesInv.end(); phrase++) {
+ outextractFileInv<<phrase->data();
+ }
+ for(vector<string>::const_iterator phrase=m_extractedPhrasesOri.begin(); phrase!=m_extractedPhrasesOri.end(); phrase++) {
+ outextractFileOrientation<<phrase->data();
+ }
- m_extractFile << outextractFile.str();
- m_extractFileInv << outextractFileInv.str();
- m_extractFileOrientation << outextractFileOrientation.str();
+ m_extractFile << outextractFile.str();
+ m_extractFileInv << outextractFileInv.str();
+ m_extractFileOrientation << outextractFileOrientation.str();
}
// if proper conditioning, we need the number of times a source phrase occurred
void ExtractTask::extractBase( SentenceAlignment &sentence )
{
- ostringstream outextractFile;
- ostringstream outextractFileInv;
+ ostringstream outextractFile;
+ ostringstream outextractFileInv;
int countF = sentence.source.size();
for(int startF=0; startF<countF; startF++) {
@@ -772,8 +776,8 @@ void ExtractTask::extractBase( SentenceAlignment &sentence )
(endF<countF && endF<startF+m_options.maxPhraseLength);
endF++) {
for(int fi=startF; fi<=endF; fi++) {
- outextractFile << sentence.source[fi] << " ";
- }
+ outextractFile << sentence.source[fi] << " ";
+ }
outextractFile << "|||" << endl;
}
}
@@ -789,8 +793,8 @@ void ExtractTask::extractBase( SentenceAlignment &sentence )
outextractFileInv << "|||" << endl;
}
}
- m_extractFile << outextractFile.str();
- m_extractFileInv << outextractFileInv.str();
+ m_extractFile << outextractFile.str();
+ m_extractFileInv << outextractFileInv.str();
}
diff --git a/phrase-extract/extract-rules-main.cpp b/phrase-extract/extract-rules-main.cpp
index 368aae1f5..f8e315e2c 100644
--- a/phrase-extract/extract-rules-main.cpp
+++ b/phrase-extract/extract-rules-main.cpp
@@ -55,7 +55,7 @@ using namespace MosesTraining;
typedef vector< int > LabelIndex;
typedef map< int, int > WordIndex;
-class ExtractTask
+class ExtractTask
{
private:
SentenceAlignmentWithSyntax &m_sentence;
@@ -64,31 +64,30 @@ private:
Moses::OutputFileStream& m_extractFileInv;
vector< ExtractedRule > m_extractedRules;
-
+
// main functions
void extractRules();
void addRuleToCollection(ExtractedRule &rule);
void consolidateRules();
void writeRulesToFile();
-
+
// subs
void addRule( int, int, int, int, int, RuleExist &ruleExist);
void addHieroRule( int startT, int endT, int startS, int endS
- , RuleExist &ruleExist, HoleCollection &holeColl, int numHoles, int initStartF, int wordCountT, int wordCountS);
+ , RuleExist &ruleExist, HoleCollection &holeColl, int numHoles, int initStartF, int wordCountT, int wordCountS);
void saveHieroPhrase( int startT, int endT, int startS, int endS
, HoleCollection &holeColl, LabelIndex &labelIndex, int countS);
string saveTargetHieroPhrase( int startT, int endT, int startS, int endS
- , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore, int countS);
+ , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore, int countS);
string saveSourceHieroPhrase( int startT, int endT, int startS, int endS
, HoleCollection &holeColl, const LabelIndex &labelIndex);
void preprocessSourceHieroPhrase( int startT, int endT, int startS, int endS
- , WordIndex &indexS, HoleCollection &holeColl, const LabelIndex &labelIndex);
+ , WordIndex &indexS, HoleCollection &holeColl, const LabelIndex &labelIndex);
void saveHieroAlignment( int startT, int endT, int startS, int endS
- , const WordIndex &indexS, const WordIndex &indexT, HoleCollection &holeColl, ExtractedRule &rule);
+ , const WordIndex &indexS, const WordIndex &indexT, HoleCollection &holeColl, ExtractedRule &rule);
void saveAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl, int countS);
-
- inline string IntToString( int i )
- {
+
+ inline string IntToString( int i ) {
stringstream out;
out << i;
return out.str();
@@ -123,7 +122,7 @@ int main(int argc, char* argv[])
if (argc < 5) {
cerr << "syntax: extract-rules corpus.target corpus.source corpus.align extract ["
- << " --GlueGrammar FILE"
+ << " --GlueGrammar FILE"
<< " | --UnknownWordLabel FILE"
<< " | --OnlyDirect"
<< " | --OutputNTLengths"
@@ -139,8 +138,8 @@ int main(int argc, char* argv[])
<< " | --AllowOnlyUnalignedWords | --DisallowNonTermConsecTarget |--NonTermConsecSource | --NoNonTermFirstWord | --NoFractionalCounting"
<< " | --UnpairedExtractFormat"
<< " | --ConditionOnTargetLHS ]"
- << " | --BoundaryRules[" << options.boundaryRules << "]";
-
+ << " | --BoundaryRules[" << options.boundaryRules << "]";
+
exit(1);
}
char* &fileNameT = argv[1];
@@ -212,10 +211,9 @@ int main(int argc, char* argv[])
cerr << "extract error: --MaxScope should be at least 0" << endl;
exit(1);
}
+ } else if (strcmp(argv[i], "--GZOutput") == 0) {
+ options.gzOutput = true;
}
- else if (strcmp(argv[i], "--GZOutput") == 0) {
- options.gzOutput = true;
- }
// allow consecutive non-terminals (X Y | X Y)
else if (strcmp(argv[i],"--TargetSyntax") == 0) {
options.targetSyntax = true;
@@ -265,7 +263,7 @@ int main(int argc, char* argv[])
options.unpairedExtractFormat = true;
} else if (strcmp(argv[i],"--ConditionOnTargetLHS") == 0) {
options.conditionOnTargetLhs = true;
- } else if (strcmp(argv[i],"-threads") == 0 ||
+ } else if (strcmp(argv[i],"-threads") == 0 ||
strcmp(argv[i],"--threads") == 0 ||
strcmp(argv[i],"--Threads") == 0) {
#ifdef WITH_THREADS
@@ -327,8 +325,8 @@ int main(int argc, char* argv[])
SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
SentenceAlignmentWithSyntax sentence
- (targetLabelCollection, sourceLabelCollection,
- targetTopLabelCollection, sourceTopLabelCollection, options);
+ (targetLabelCollection, sourceLabelCollection,
+ targetTopLabelCollection, sourceTopLabelCollection, options);
//az: output src, tgt, and alignment line
if (options.onlyOutputSpanInfo) {
cout << "LOG: SRC: " << sourceString << endl;
@@ -364,7 +362,8 @@ int main(int argc, char* argv[])
writeUnknownWordLabel(fileNameUnknownWordLabel);
}
-void ExtractTask::Run() {
+void ExtractTask::Run()
+{
extractRules();
consolidateRules();
writeRulesToFile();
@@ -471,7 +470,7 @@ void ExtractTask::extractRules()
}
void ExtractTask::preprocessSourceHieroPhrase( int startT, int endT, int startS, int endS
- , WordIndex &indexS, HoleCollection &holeColl, const LabelIndex &labelIndex)
+ , WordIndex &indexS, HoleCollection &holeColl, const LabelIndex &labelIndex)
{
vector<Hole*>::iterator iterHoleList = holeColl.GetSortedSourceHoles().begin();
assert(iterHoleList != holeColl.GetSortedSourceHoles().end());
@@ -509,8 +508,8 @@ void ExtractTask::preprocessSourceHieroPhrase( int startT, int endT, int startS,
}
string ExtractTask::saveTargetHieroPhrase( int startT, int endT, int startS, int endS
- , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore
- , int countS)
+ , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore
+ , int countS)
{
HoleList::iterator iterHoleList = holeColl.GetHoles().begin();
assert(iterHoleList != holeColl.GetHoles().end());
@@ -536,11 +535,11 @@ string ExtractTask::saveTargetHieroPhrase( int startT, int endT, int startS, int
if (m_options.targetSyntax) {
targetLabel = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetLabel();
} else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
- targetLabel = "S";
+ targetLabel = "S";
} else {
targetLabel = "X";
}
-
+
hole.SetLabel(targetLabel, 1);
if (m_options.unpairedExtractFormat) {
@@ -571,7 +570,7 @@ string ExtractTask::saveTargetHieroPhrase( int startT, int endT, int startS, int
}
string ExtractTask::saveSourceHieroPhrase( int startT, int endT, int startS, int endS
- , HoleCollection &holeColl, const LabelIndex &labelIndex)
+ , HoleCollection &holeColl, const LabelIndex &labelIndex)
{
vector<Hole*>::iterator iterHoleList = holeColl.GetSortedSourceHoles().begin();
assert(iterHoleList != holeColl.GetSortedSourceHoles().end());
@@ -615,7 +614,7 @@ string ExtractTask::saveSourceHieroPhrase( int startT, int endT, int startS, int
}
void ExtractTask::saveHieroAlignment( int startT, int endT, int startS, int endS
- , const WordIndex &indexS, const WordIndex &indexT, HoleCollection &holeColl, ExtractedRule &rule)
+ , const WordIndex &indexS, const WordIndex &indexT, HoleCollection &holeColl, ExtractedRule &rule)
{
// print alignment of words
for(int ti=startT; ti<=endT; ti++) {
@@ -636,13 +635,13 @@ void ExtractTask::saveHieroAlignment( int startT, int endT, int startS, int endS
HoleList::const_iterator iterHole;
for (iterHole = holeColl.GetHoles().begin(); iterHole != holeColl.GetHoles().end(); ++iterHole) {
const Hole &hole = *iterHole;
-
+
std::string sourceSymbolIndex = IntToString(hole.GetPos(0));
std::string targetSymbolIndex = IntToString(hole.GetPos(1));
rule.alignment += sourceSymbolIndex + "-" + targetSymbolIndex + " ";
if (!m_options.onlyDirectFlag)
rule.alignmentInv += targetSymbolIndex + "-" + sourceSymbolIndex + " ";
-
+
rule.SetSpanLength(hole.GetPos(0), hole.GetSize(0), hole.GetSize(1) ) ;
}
@@ -654,7 +653,7 @@ void ExtractTask::saveHieroAlignment( int startT, int endT, int startS, int endS
}
void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS
- , HoleCollection &holeColl, LabelIndex &labelIndex, int countS)
+ , HoleCollection &holeColl, LabelIndex &labelIndex, int countS)
{
WordIndex indexS, indexT; // to keep track of word positions in rule
@@ -680,12 +679,12 @@ void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS
if (m_options.pcfgScore) {
double logPCFGScore = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0]]->GetPcfgScore();
rule.target = saveTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore, countS)
- + " [" + targetLabel + "]";
+ + " [" + targetLabel + "]";
rule.pcfgScore = std::exp(logPCFGScore);
} else {
double logPCFGScore = 0.0f;
rule.target = saveTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore, countS)
- + " [" + targetLabel + "]";
+ + " [" + targetLabel + "]";
}
// source
@@ -754,8 +753,8 @@ void ExtractTask::saveAllHieroPhrases( int startT, int endT, int startS, int end
// this function is called recursively
// it pokes a new hole into the phrase pair, and then calls itself for more holes
void ExtractTask::addHieroRule( int startT, int endT, int startS, int endS
- , RuleExist &ruleExist, HoleCollection &holeColl
- , int numHoles, int initStartT, int wordCountT, int wordCountS)
+ , RuleExist &ruleExist, HoleCollection &holeColl
+ , int numHoles, int initStartT, int wordCountT, int wordCountS)
{
// done, if already the maximum number of non-terminals in phrase pair
if (numHoles >= m_options.maxNonTerm)
@@ -862,7 +861,7 @@ void ExtractTask::addHieroRule( int startT, int endT, int startS, int endS
allowablePhrase = false;
// passed all checks...
- if (allowablePhrase)
+ if (allowablePhrase)
saveAllHieroPhrases(startT, endT, startS, endS, holeColl, wordCountS);
// recursively search for next hole
@@ -880,12 +879,12 @@ void ExtractTask::addHieroRule( int startT, int endT, int startS, int endS
void ExtractTask::addRule( int startT, int endT, int startS, int endS, int countS, RuleExist &ruleExist)
{
// contains only <s> or </s>. Don't output
- if (m_options.boundaryRules
- && ( (startS == 0 && endS == 0)
- || (startS == countS-1 && endS == countS-1))) {
+ if (m_options.boundaryRules
+ && ( (startS == 0 && endS == 0)
+ || (startS == countS-1 && endS == countS-1))) {
return;
}
-
+
if (m_options.onlyOutputSpanInfo) {
cout << startS << " " << endS << " " << startT << " " << endT << endl;
return;
@@ -897,11 +896,10 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, int count
string targetLabel,sourceLabel;
if (m_options.targetSyntax && m_options.conditionOnTargetLhs) {
sourceLabel = targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel();
- }
- else {
+ } else {
sourceLabel = m_options.sourceSyntax ?
m_sentence.sourceTree.GetNodes(startS,endS)[0]->GetLabel() : "X";
-
+
if (m_options.targetSyntax) {
targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel();
} else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
@@ -1008,7 +1006,7 @@ void ExtractTask::writeRulesToFile()
<< rule->alignment << " ||| "
<< rule->count << " ||| ";
if (m_options.outputNTLengths) {
- rule->OutputNTLengths(out);
+ rule->OutputNTLengths(out);
}
if (m_options.pcfgScore) {
out << " ||| " << rule->pcfgScore;
diff --git a/phrase-extract/lexical-reordering/reordering_classes.cpp b/phrase-extract/lexical-reordering/reordering_classes.cpp
index e5b3fe7cd..8c5163f9b 100644
--- a/phrase-extract/lexical-reordering/reordering_classes.cpp
+++ b/phrase-extract/lexical-reordering/reordering_classes.cpp
@@ -57,7 +57,7 @@ void ModelScore::reset_f()
}
void ModelScore::add_example
- (const StringPiece& previous, const StringPiece& next, float weight)
+(const StringPiece& previous, const StringPiece& next, float weight)
{
count_fe_prev[getType(previous)]+=weight;
count_f_prev[getType(previous)]+=weight;
diff --git a/phrase-extract/lexical-reordering/score.cpp b/phrase-extract/lexical-reordering/score.cpp
index 545abf303..d404822b8 100644
--- a/phrase-extract/lexical-reordering/score.cpp
+++ b/phrase-extract/lexical-reordering/score.cpp
@@ -29,11 +29,11 @@ void get_orientations(const StringPiece& pair, StringPiece& previous, StringPiec
class FileFormatException : public util::Exception
{
- public:
- FileFormatException() throw() {
- *this << "Invalid extract file format: ";
- }
- ~FileFormatException() throw() {}
+public:
+ FileFormatException() throw() {
+ *this << "Invalid extract file format: ";
+ }
+ ~FileFormatException() throw() {}
};
int main(int argc, char* argv[])
@@ -214,9 +214,10 @@ int main(int argc, char* argv[])
}
template <class It> StringPiece
-GrabOrDie(It &it, const StringPiece& line) {
- UTIL_THROW_IF(!it, FileFormatException, line.as_string());
- return *it++;
+GrabOrDie(It &it, const StringPiece& line)
+{
+ UTIL_THROW_IF(!it, FileFormatException, line.as_string());
+ return *it++;
}
@@ -236,12 +237,12 @@ void split_line(
| phrase | hier
| phrase | hier ||| weight
*/
-
+
util::TokenIter<util::MultiCharacter> pipes(line, util::MultiCharacter(" ||| "));
foreign = GrabOrDie(pipes,line);
english = GrabOrDie(pipes,line);
StringPiece next = GrabOrDie(pipes,line);
-
+
util::TokenIter<util::MultiCharacter> singlePipe(next, util::MultiCharacter(" | "));
wbe = GrabOrDie(singlePipe,line);
if (singlePipe) {
diff --git a/phrase-extract/pcfg-common/exception.h b/phrase-extract/pcfg-common/exception.h
index 3dbd59d0e..d9266ca36 100644
--- a/phrase-extract/pcfg-common/exception.h
+++ b/phrase-extract/pcfg-common/exception.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -23,15 +23,20 @@
#include <string>
-namespace Moses {
-namespace PCFG {
+namespace Moses
+{
+namespace PCFG
+{
-class Exception {
- public:
+class Exception
+{
+public:
Exception(const char *msg) : msg_(msg) {}
Exception(const std::string &msg) : msg_(msg) {}
- const std::string &msg() const { return msg_; }
- private:
+ const std::string &msg() const {
+ return msg_;
+ }
+private:
std::string msg_;
};
diff --git a/phrase-extract/pcfg-common/numbered_set.h b/phrase-extract/pcfg-common/numbered_set.h
index 15e768b4c..66e960404 100644
--- a/phrase-extract/pcfg-common/numbered_set.h
+++ b/phrase-extract/pcfg-common/numbered_set.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -29,32 +29,45 @@
#include <sstream>
#include <vector>
-namespace Moses {
-namespace PCFG {
+namespace Moses
+{
+namespace PCFG
+{
// Stores a set of elements of type T, each of which is allocated an integral
// ID of type I. IDs are contiguous starting at 0. Individual elements cannot
// be removed once inserted (but the whole set can be cleared).
template<typename T, typename I=std::size_t>
-class NumberedSet {
- private:
+class NumberedSet
+{
+private:
typedef boost::unordered_map<T, I> ElementToIdMap;
typedef std::vector<const T *> IdToElementMap;
- public:
+public:
typedef I IdType;
typedef typename IdToElementMap::const_iterator const_iterator;
NumberedSet() {}
- const_iterator begin() const { return id_to_element_.begin(); }
- const_iterator end() const { return id_to_element_.end(); }
+ const_iterator begin() const {
+ return id_to_element_.begin();
+ }
+ const_iterator end() const {
+ return id_to_element_.end();
+ }
// Static value
- static I NullId() { return std::numeric_limits<I>::max(); }
+ static I NullId() {
+ return std::numeric_limits<I>::max();
+ }
- bool Empty() const { return id_to_element_.empty(); }
- std::size_t Size() const { return id_to_element_.size(); }
+ bool Empty() const {
+ return id_to_element_.empty();
+ }
+ std::size_t Size() const {
+ return id_to_element_.size();
+ }
// Insert the given object and return its ID.
I Insert(const T &);
@@ -64,19 +77,21 @@ class NumberedSet {
void Clear();
- private:
+private:
ElementToIdMap element_to_id_;
IdToElementMap id_to_element_;
};
template<typename T, typename I>
-I NumberedSet<T, I>::Lookup(const T &s) const {
+I NumberedSet<T, I>::Lookup(const T &s) const
+{
typename ElementToIdMap::const_iterator p = element_to_id_.find(s);
return (p == element_to_id_.end()) ? NullId() : p->second;
}
template<typename T, typename I>
-const T &NumberedSet<T, I>::Lookup(I id) const {
+const T &NumberedSet<T, I>::Lookup(I id) const
+{
if (id < 0 || id >= id_to_element_.size()) {
std::ostringstream msg;
msg << "Value not found: " << id;
@@ -86,10 +101,11 @@ const T &NumberedSet<T, I>::Lookup(I id) const {
}
template<typename T, typename I>
-I NumberedSet<T, I>::Insert(const T &x) {
+I NumberedSet<T, I>::Insert(const T &x)
+{
std::pair<T, I> value(x, id_to_element_.size());
std::pair<typename ElementToIdMap::iterator, bool> result =
- element_to_id_.insert(value);
+ element_to_id_.insert(value);
if (result.second) {
// x is a new element.
id_to_element_.push_back(&result.first->first);
@@ -98,7 +114,8 @@ I NumberedSet<T, I>::Insert(const T &x) {
}
template<typename T, typename I>
-void NumberedSet<T, I>::Clear() {
+void NumberedSet<T, I>::Clear()
+{
element_to_id_.clear();
id_to_element_.clear();
}
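numbered_set.h above is the interning container behind the vocabularies (typedef.h later in this diff defines Vocabulary as NumberedSet<std::string>): Insert hands out contiguous IDs starting at 0, Lookup maps both directions, and NullId() signals a miss. A usage sketch, assuming the header and boost are on the include path:

  #include <cassert>
  #include <string>
  #include "pcfg-common/numbered_set.h"

  using Moses::PCFG::NumberedSet;

  int main() {
    NumberedSet<std::string> vocab;
    std::size_t np = vocab.Insert("NP");  // first element gets ID 0
    std::size_t vp = vocab.Insert("VP");  // next gets ID 1
    assert(vocab.Insert("NP") == np);     // duplicates return the existing ID
    assert(vocab.Lookup("VP") == vp);     // element -> ID
    assert(vocab.Lookup(np) == "NP");     // ID -> element
    assert(vocab.Lookup("PP") == NumberedSet<std::string>::NullId());
    assert(vocab.Size() == 2);
    return 0;
  }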
diff --git a/phrase-extract/pcfg-common/pcfg.h b/phrase-extract/pcfg-common/pcfg.h
index b87336584..5398cd97e 100644
--- a/phrase-extract/pcfg-common/pcfg.h
+++ b/phrase-extract/pcfg-common/pcfg.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -28,11 +28,14 @@
#include <ostream>
#include <vector>
-namespace Moses {
-namespace PCFG {
+namespace Moses
+{
+namespace PCFG
+{
-class Pcfg {
- public:
+class Pcfg
+{
+public:
typedef std::vector<std::size_t> Key;
typedef std::map<Key, double> Map;
typedef Map::iterator iterator;
@@ -40,18 +43,26 @@ class Pcfg {
Pcfg() {}
- iterator begin() { return rules_.begin(); }
- const_iterator begin() const { return rules_.begin(); }
+ iterator begin() {
+ return rules_.begin();
+ }
+ const_iterator begin() const {
+ return rules_.begin();
+ }
- iterator end() { return rules_.end(); }
- const_iterator end() const { return rules_.end(); }
+ iterator end() {
+ return rules_.end();
+ }
+ const_iterator end() const {
+ return rules_.end();
+ }
void Add(const Key &, double);
bool Lookup(const Key &, double &) const;
void Read(std::istream &, Vocabulary &);
void Write(const Vocabulary &, std::ostream &) const;
- private:
+private:
Map rules_;
};
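Pcfg above is a thin wrapper over std::map: a rule's Key is a vector of vocabulary IDs and the mapped double is its score. A sketch against the interface shown; how Read()/Write() pack the LHS and RHS into the Key is not visible in this hunk, so the layout below is illustrative only:

  #include <iostream>
  #include <vector>
  #include "pcfg-common/pcfg.h"

  int main() {
    Moses::PCFG::Pcfg grammar;
    Moses::PCFG::Pcfg::Key rule;
    rule.push_back(0);  // e.g. the ID of the LHS "S"
    rule.push_back(1);  // e.g. the ID of "NP"
    rule.push_back(2);  // e.g. the ID of "VP"
    grammar.Add(rule, 0.5);

    double score = 0.0;
    if (grammar.Lookup(rule, score)) {
      std::cout << "score = " << score << "\n";  // 0.5
    }
    return 0;
  }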
diff --git a/phrase-extract/pcfg-common/pcfg_tree.h b/phrase-extract/pcfg-common/pcfg_tree.h
index bdac64dfc..d125cad16 100644
--- a/phrase-extract/pcfg-common/pcfg_tree.h
+++ b/phrase-extract/pcfg-common/pcfg_tree.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -26,34 +26,43 @@
#include <string>
-namespace Moses {
-namespace PCFG {
+namespace Moses
+{
+namespace PCFG
+{
template<typename DerivedType>
-class PcfgTreeBase : public SyntaxTreeBase<std::string, DerivedType> {
- public:
+class PcfgTreeBase : public SyntaxTreeBase<std::string, DerivedType>
+{
+public:
typedef std::string LabelType;
typedef SyntaxTreeBase<LabelType, DerivedType> BaseType;
PcfgTreeBase(const LabelType &label) : BaseType(label), score_(0.0) {}
- double score() const { return score_; }
- void set_score(double s) { score_ = s; }
+ double score() const {
+ return score_;
+ }
+ void set_score(double s) {
+ score_ = s;
+ }
- private:
+private:
double score_;
};
-class PcfgTree : public PcfgTreeBase<PcfgTree> {
- public:
+class PcfgTree : public PcfgTreeBase<PcfgTree>
+{
+public:
typedef PcfgTreeBase<PcfgTree> BaseType;
PcfgTree(const BaseType::LabelType &label) : BaseType(label) {}
};
// Specialise XmlOutputHandler for PcfgTree.
template<>
-class XmlOutputHandler<PcfgTree> {
- public:
+class XmlOutputHandler<PcfgTree>
+{
+public:
typedef std::map<std::string, std::string> AttributeMap;
void GetLabel(const PcfgTree &tree, std::string &label) const {
diff --git a/phrase-extract/pcfg-common/syntax_tree.h b/phrase-extract/pcfg-common/syntax_tree.h
index 89c6ec0c3..93d9dbec9 100644
--- a/phrase-extract/pcfg-common/syntax_tree.h
+++ b/phrase-extract/pcfg-common/syntax_tree.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -24,62 +24,87 @@
#include <cassert>
#include <vector>
-namespace Moses {
-namespace PCFG {
+namespace Moses
+{
+namespace PCFG
+{
// Base class for SyntaxTree, AgreementTree, and friends.
template<typename T, typename DerivedType>
-class SyntaxTreeBase {
- public:
+class SyntaxTreeBase
+{
+public:
// Constructors
SyntaxTreeBase(const T &label)
- : label_(label)
- , children_()
- , parent_(0) {}
+ : label_(label)
+ , children_()
+ , parent_(0) {}
SyntaxTreeBase(const T &label, const std::vector<DerivedType *> &children)
- : label_(label)
- , children_(children)
- , parent_(0) {}
+ : label_(label)
+ , children_(children)
+ , parent_(0) {}
// Destructor
virtual ~SyntaxTreeBase();
- const T &label() const { return label_; }
- const DerivedType *parent() const { return parent_; }
- DerivedType *parent() { return parent_; }
- const std::vector<DerivedType *> &children() const { return children_; }
- std::vector<DerivedType *> &children() { return children_; }
+ const T &label() const {
+ return label_;
+ }
+ const DerivedType *parent() const {
+ return parent_;
+ }
+ DerivedType *parent() {
+ return parent_;
+ }
+ const std::vector<DerivedType *> &children() const {
+ return children_;
+ }
+ std::vector<DerivedType *> &children() {
+ return children_;
+ }
- void set_label(const T &label) { label_ = label; }
- void set_parent(DerivedType *parent) { parent_ = parent; }
- void set_children(const std::vector<DerivedType *> &c) { children_ = c; }
+ void set_label(const T &label) {
+ label_ = label;
+ }
+ void set_parent(DerivedType *parent) {
+ parent_ = parent;
+ }
+ void set_children(const std::vector<DerivedType *> &c) {
+ children_ = c;
+ }
- bool IsLeaf() const { return children_.empty(); }
+ bool IsLeaf() const {
+ return children_.empty();
+ }
bool IsPreterminal() const {
return children_.size() == 1 && children_[0]->IsLeaf();
}
- void AddChild(DerivedType *child) { children_.push_back(child); }
+ void AddChild(DerivedType *child) {
+ children_.push_back(child);
+ }
- private:
+private:
T label_;
std::vector<DerivedType *> children_;
DerivedType *parent_;
};
template<typename T>
-class SyntaxTree : public SyntaxTreeBase<T, SyntaxTree<T> > {
- public:
+class SyntaxTree : public SyntaxTreeBase<T, SyntaxTree<T> >
+{
+public:
typedef SyntaxTreeBase<T, SyntaxTree<T> > BaseType;
SyntaxTree(const T &label) : BaseType(label) {}
SyntaxTree(const T &label, const std::vector<SyntaxTree *> &children)
- : BaseType(label, children) {}
+ : BaseType(label, children) {}
};
template<typename T, typename DerivedType>
-SyntaxTreeBase<T, DerivedType>::~SyntaxTreeBase() {
+SyntaxTreeBase<T, DerivedType>::~SyntaxTreeBase()
+{
for (std::size_t i = 0; i < children_.size(); ++i) {
delete children_[i];
}
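One behavioural point worth keeping in mind when reading syntax_tree.h above: the destructor deletes every child, so nodes must be heap-allocated and the root owns the whole tree. A minimal sketch, assuming the header is available:

  #include <iostream>
  #include <string>
  #include "pcfg-common/syntax_tree.h"

  typedef Moses::PCFG::SyntaxTree<std::string> Tree;

  int main() {
    Tree *np = new Tree("NP");  // children must live on the heap:
    Tree *vp = new Tree("VP");  // ~SyntaxTreeBase deletes them
    Tree root("S");
    root.AddChild(np);
    root.AddChild(vp);
    np->set_parent(&root);      // parent links are the caller's job
    vp->set_parent(&root);
    std::cout << root.label() << " has " << root.children().size()
              << " children\n";
    return 0;                   // root's destructor frees np and vp
  }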
diff --git a/phrase-extract/pcfg-common/tool.h b/phrase-extract/pcfg-common/tool.h
index 0af342569..aada036e3 100644
--- a/phrase-extract/pcfg-common/tool.h
+++ b/phrase-extract/pcfg-common/tool.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -28,18 +28,23 @@
#include <iostream>
#include <string>
-namespace Moses {
-namespace PCFG {
+namespace Moses
+{
+namespace PCFG
+{
-class Tool {
- public:
+class Tool
+{
+public:
virtual ~Tool() {}
- const std::string &name() const { return name_; }
+ const std::string &name() const {
+ return name_;
+ }
virtual int Main(int argc, char *argv[]) = 0;
- protected:
+protected:
Tool(const std::string &name) : name_(name) {}
// Returns the boost::program_options style that should be used by all tools.
@@ -77,7 +82,7 @@ class Tool {
// the file cannot be opened for writing.
void OpenNamedOutputOrDie(const std::string &, std::ofstream &);
- private:
+private:
std::string name_;
std::istream *input_ptr_;
std::ifstream input_file_stream_;
diff --git a/phrase-extract/pcfg-common/typedef.h b/phrase-extract/pcfg-common/typedef.h
index 49a12d681..ce3e0423b 100644
--- a/phrase-extract/pcfg-common/typedef.h
+++ b/phrase-extract/pcfg-common/typedef.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -26,8 +26,10 @@
#include <string>
-namespace Moses {
-namespace PCFG {
+namespace Moses
+{
+namespace PCFG
+{
typedef NumberedSet<std::string> Vocabulary;
diff --git a/phrase-extract/pcfg-common/xml_tree_parser.h b/phrase-extract/pcfg-common/xml_tree_parser.h
index 7d01b0684..7eec14033 100644
--- a/phrase-extract/pcfg-common/xml_tree_parser.h
+++ b/phrase-extract/pcfg-common/xml_tree_parser.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -30,16 +30,19 @@
#include <string>
#include <vector>
-namespace Moses {
-namespace PCFG {
+namespace Moses
+{
+namespace PCFG
+{
// Parses a string in Moses' XML parse tree format and returns a PcfgTree
// object.
-class XmlTreeParser {
- public:
+class XmlTreeParser
+{
+public:
XmlTreeParser();
std::auto_ptr<PcfgTree> Parse(const std::string &);
- private:
+private:
std::auto_ptr<PcfgTree> ConvertTree(const MosesTraining::SyntaxNode &,
const std::vector<std::string> &);
diff --git a/phrase-extract/pcfg-common/xml_tree_writer.h b/phrase-extract/pcfg-common/xml_tree_writer.h
index 6a9a3de05..426efec17 100644
--- a/phrase-extract/pcfg-common/xml_tree_writer.h
+++ b/phrase-extract/pcfg-common/xml_tree_writer.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -32,12 +32,15 @@
#include <vector>
#include <string>
-namespace Moses {
-namespace PCFG {
+namespace Moses
+{
+namespace PCFG
+{
template<typename InputTree>
-class XmlOutputHandler {
- public:
+class XmlOutputHandler
+{
+public:
typedef std::map<std::string, std::string> AttributeMap;
void GetLabel(const InputTree &, std::string &) const;
@@ -45,17 +48,19 @@ class XmlOutputHandler {
};
template<typename InputTree>
-class XmlTreeWriter : public XmlOutputHandler<InputTree> {
- public:
+class XmlTreeWriter : public XmlOutputHandler<InputTree>
+{
+public:
typedef XmlOutputHandler<InputTree> Base;
void Write(const InputTree &, std::ostream &) const;
- private:
+private:
std::string Escape(const std::string &) const;
};
template<typename InputTree>
void XmlTreeWriter<InputTree>::Write(const InputTree &tree,
- std::ostream &out) const {
+ std::ostream &out) const
+{
assert(!tree.IsLeaf());
// Opening tag
@@ -99,7 +104,8 @@ void XmlTreeWriter<InputTree>::Write(const InputTree &tree,
// Escapes XML special characters.
template<typename InputTree>
-std::string XmlTreeWriter<InputTree>::Escape(const std::string &s) const {
+std::string XmlTreeWriter<InputTree>::Escape(const std::string &s) const
+{
std::string t;
std::size_t len = s.size();
t.reserve(len);
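XmlTreeWriter::Escape, whose opening lines close the hunk above, reserves the output up front and then appends character by character. Its full body is not part of this diff, so the following is a generic sketch of XML escaping in the same shape, not the Moses implementation:

  #include <iostream>
  #include <string>

  std::string EscapeXml(const std::string &s) {
    std::string t;
    t.reserve(s.size());  // same reserve-then-append pattern as Escape()
    for (std::string::size_type i = 0; i < s.size(); ++i) {
      switch (s[i]) {
      case '<':  t += "&lt;";   break;
      case '>':  t += "&gt;";   break;
      case '&':  t += "&amp;";  break;
      case '"':  t += "&quot;"; break;
      case '\'': t += "&apos;"; break;
      default:   t += s[i];     break;
      }
    }
    return t;
  }

  int main() {
    std::cout << EscapeXml("a<b & c") << "\n";  // a&lt;b &amp; c
    return 0;
  }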
diff --git a/phrase-extract/pcfg-extract/options.h b/phrase-extract/pcfg-extract/options.h
index 3acb31b58..2633f025a 100644
--- a/phrase-extract/pcfg-extract/options.h
+++ b/phrase-extract/pcfg-extract/options.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -23,8 +23,10 @@
#include <string>
-namespace Moses {
-namespace PCFG {
+namespace Moses
+{
+namespace PCFG
+{
struct Options {
std::string corpus_file;
diff --git a/phrase-extract/pcfg-extract/pcfg_extract.h b/phrase-extract/pcfg-extract/pcfg_extract.h
index 1af6cb4fe..e8c306876 100644
--- a/phrase-extract/pcfg-extract/pcfg_extract.h
+++ b/phrase-extract/pcfg-extract/pcfg_extract.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -23,16 +23,19 @@
#include "pcfg-common/tool.h"
-namespace Moses {
-namespace PCFG {
+namespace Moses
+{
+namespace PCFG
+{
class Options;
-class PcfgExtract : public Tool {
- public:
+class PcfgExtract : public Tool
+{
+public:
PcfgExtract() : Tool("pcfg-extract") {}
virtual int Main(int, char *[]);
- private:
+private:
void ProcessOptions(int, char *[], Options &) const;
};
diff --git a/phrase-extract/pcfg-extract/rule_collection.h b/phrase-extract/pcfg-extract/rule_collection.h
index 452fa0e97..32cb2dc05 100644
--- a/phrase-extract/pcfg-extract/rule_collection.h
+++ b/phrase-extract/pcfg-extract/rule_collection.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -27,12 +27,15 @@
#include <vector>
-namespace Moses {
-namespace PCFG {
+namespace Moses
+{
+namespace PCFG
+{
// Contains PCFG rules and their counts.
-class RuleCollection {
- public:
+class RuleCollection
+{
+public:
typedef boost::unordered_map<std::vector<std::size_t>, std::size_t> RhsCountMap;
typedef boost::unordered_map<std::size_t, RhsCountMap> Map;
typedef Map::iterator iterator;
@@ -40,16 +43,24 @@ class RuleCollection {
RuleCollection() {}
- iterator begin() { return collection_.begin(); }
- const_iterator begin() const { return collection_.begin(); }
+ iterator begin() {
+ return collection_.begin();
+ }
+ const_iterator begin() const {
+ return collection_.begin();
+ }
- iterator end() { return collection_.end(); }
- const_iterator end() const { return collection_.end(); }
+ iterator end() {
+ return collection_.end();
+ }
+ const_iterator end() const {
+ return collection_.end();
+ }
void Add(std::size_t, const std::vector<std::size_t> &);
void CreatePcfg(Pcfg &);
- private:
+private:
Map collection_;
};
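rule_collection.h above counts, per LHS ID, how often each RHS sequence was observed; CreatePcfg then turns these counts into the probabilities stored in a Pcfg, presumably as relative frequencies p(rhs | lhs) = count(lhs -> rhs) / sum over rhs' of count(lhs -> rhs'). A usage sketch for the counting side, with invented vocabulary IDs:

  #include <cstddef>
  #include <iostream>
  #include <vector>
  #include "pcfg-extract/rule_collection.h"

  using Moses::PCFG::RuleCollection;

  int main() {
    RuleCollection rules;
    std::size_t s = 0, np = 1, vp = 2;  // vocabulary IDs, illustrative only
    std::vector<std::size_t> rhs;
    rhs.push_back(np);
    rhs.push_back(vp);
    rules.Add(s, rhs);                  // observe S -> NP VP twice
    rules.Add(s, rhs);
    for (RuleCollection::iterator it = rules.begin(); it != rules.end(); ++it) {
      std::cout << "LHS " << it->first << ": " << it->second.size()
                << " distinct RHS\n";  // LHS 0: 1 distinct RHS
    }
    return 0;
  }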
diff --git a/phrase-extract/pcfg-extract/rule_extractor.h b/phrase-extract/pcfg-extract/rule_extractor.h
index 6bcffbc61..e4b411c01 100644
--- a/phrase-extract/pcfg-extract/rule_extractor.h
+++ b/phrase-extract/pcfg-extract/rule_extractor.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -25,17 +25,20 @@
#include "pcfg-common/typedef.h"
-namespace Moses {
-namespace PCFG {
+namespace Moses
+{
+namespace PCFG
+{
class PcfgTree;
// Extracts PCFG rules from syntax trees and adds them to a RuleCollection.
-class RuleExtractor {
- public:
+class RuleExtractor
+{
+public:
RuleExtractor(Vocabulary &);
void Extract(const PcfgTree &, RuleCollection &) const;
- private:
+private:
Vocabulary &non_term_vocab_;
};
diff --git a/phrase-extract/pcfg-score/options.h b/phrase-extract/pcfg-score/options.h
index e54b2a0b9..fd54b4b6b 100644
--- a/phrase-extract/pcfg-score/options.h
+++ b/phrase-extract/pcfg-score/options.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -23,8 +23,10 @@
#include <string>
-namespace Moses {
-namespace PCFG {
+namespace Moses
+{
+namespace PCFG
+{
struct Options {
std::string pcfg_file;
diff --git a/phrase-extract/pcfg-score/pcfg_score.h b/phrase-extract/pcfg-score/pcfg_score.h
index 5e506c39d..f49c9a0be 100644
--- a/phrase-extract/pcfg-score/pcfg_score.h
+++ b/phrase-extract/pcfg-score/pcfg_score.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -23,16 +23,19 @@
#include "pcfg-common/tool.h"
-namespace Moses {
-namespace PCFG {
+namespace Moses
+{
+namespace PCFG
+{
class Options;
-class PcfgScore : public Tool {
- public:
+class PcfgScore : public Tool
+{
+public:
PcfgScore() : Tool("pcfg-score") {}
virtual int Main(int, char *[]);
- private:
+private:
void ProcessOptions(int, char *[], Options &) const;
};
diff --git a/phrase-extract/pcfg-score/tree_scorer.h b/phrase-extract/pcfg-score/tree_scorer.h
index 36f4e1e99..8cb59c0c2 100644
--- a/phrase-extract/pcfg-score/tree_scorer.h
+++ b/phrase-extract/pcfg-score/tree_scorer.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -25,18 +25,21 @@
#include "pcfg-common/pcfg_tree.h"
#include "pcfg-common/typedef.h"
-namespace Moses {
-namespace PCFG {
+namespace Moses
+{
+namespace PCFG
+{
-class TreeScorer {
- public:
+class TreeScorer
+{
+public:
TreeScorer(const Pcfg &, const Vocabulary &);
// Score tree according to PCFG. Returns false if unsuccessful (due to
// missing rule).
bool Score(PcfgTree &) const;
- private:
+private:
const Pcfg &pcfg_;
const Vocabulary &non_term_vocab_;
};
diff --git a/phrase-extract/score-main.cpp b/phrase-extract/score-main.cpp
index 0e4ad57f4..3042cbe3e 100644
--- a/phrase-extract/score-main.cpp
+++ b/phrase-extract/score-main.cpp
@@ -68,7 +68,7 @@ float minCountHierarchical = 0;
Vocabulary vcbT;
Vocabulary vcbS;
-
+
} // namespace
vector<string> tokenize( const char [] );
@@ -130,18 +130,18 @@ int main(int argc, char* argv[])
cerr << "not computing lexical translation score\n";
} else if (strcmp(argv[i],"--GoodTuring") == 0) {
goodTuringFlag = true;
- fileNameCountOfCounts = string(fileNamePhraseTable) + ".coc";
+ fileNameCountOfCounts = string(fileNamePhraseTable) + ".coc";
cerr << "adjusting phrase translation probabilities with Good Turing discounting\n";
} else if (strcmp(argv[i],"--KneserNey") == 0) {
kneserNeyFlag = true;
- fileNameCountOfCounts = string(fileNamePhraseTable) + ".coc";
+ fileNameCountOfCounts = string(fileNamePhraseTable) + ".coc";
cerr << "adjusting phrase translation probabilities with Kneser Ney discounting\n";
} else if (strcmp(argv[i],"--UnalignedPenalty") == 0) {
unalignedFlag = true;
cerr << "using unaligned word penalty\n";
} else if (strcmp(argv[i],"--UnalignedFunctionWordPenalty") == 0) {
unalignedFWFlag = true;
- if (i+1==argc) {
+ if (i+1==argc) {
cerr << "ERROR: specify count of count files for Kneser Ney discounting!\n";
exit(1);
}
@@ -204,22 +204,21 @@ int main(int argc, char* argv[])
istream &extractFileP = extractFile;
// output file: phrase translation table
- ostream *phraseTableFile;
-
- if (fileNamePhraseTable == "-") {
- phraseTableFile = &cout;
- }
- else {
- Moses::OutputFileStream *outputFile = new Moses::OutputFileStream();
- bool success = outputFile->Open(fileNamePhraseTable);
- if (!success) {
- cerr << "ERROR: could not open file phrase table file "
- << fileNamePhraseTable << endl;
- exit(1);
- }
- phraseTableFile = outputFile;
- }
-
+ ostream *phraseTableFile;
+
+ if (fileNamePhraseTable == "-") {
+ phraseTableFile = &cout;
+ } else {
+ Moses::OutputFileStream *outputFile = new Moses::OutputFileStream();
+ bool success = outputFile->Open(fileNamePhraseTable);
+ if (!success) {
+ cerr << "ERROR: could not open file phrase table file "
+ << fileNamePhraseTable << endl;
+ exit(1);
+ }
+ phraseTableFile = outputFile;
+ }
+
// loop through all extracted phrase translations
float lastCount = 0.0f;
float lastPcfgSum = 0.0f;
@@ -250,25 +249,23 @@ int main(int argc, char* argv[])
lastPcfgSum = phrasePair.pcfgSum;
// only differs in count? just add count
- if (lastPhrasePair != NULL
- && lastPhrasePair->equals( phrasePair )
- && featureManager.equals(*lastPhrasePair, phrasePair)) {
+ if (lastPhrasePair != NULL
+ && lastPhrasePair->equals( phrasePair )
+ && featureManager.equals(*lastPhrasePair, phrasePair)) {
lastPhrasePair->count += phrasePair.count;
lastPhrasePair->pcfgSum += phrasePair.pcfgSum;
continue;
}
-
+
// if new source phrase, process last batch
if (lastPhrasePair != NULL &&
lastPhrasePair->GetSource() != phrasePair.GetSource()) {
processPhrasePairs( phrasePairsWithSameF, *phraseTableFile, isSingleton, featureManager, maybeLogProb );
-
+
phrasePairsWithSameF.clear();
isSingleton = false;
lastPhrasePair = NULL;
- }
- else
- {
+ } else {
isSingleton = true;
}
@@ -277,11 +274,11 @@ int main(int argc, char* argv[])
lastPhrasePair = &phrasePairsWithSameF.back();
}
processPhrasePairs( phrasePairsWithSameF, *phraseTableFile, isSingleton, featureManager, maybeLogProb );
-
- phraseTableFile->flush();
- if (phraseTableFile != &cout) {
- delete phraseTableFile;
- }
+
+ phraseTableFile->flush();
+ if (phraseTableFile != &cout) {
+ delete phraseTableFile;
+ }
// output count of count statistics
if (goodTuringFlag || kneserNeyFlag) {
@@ -292,13 +289,13 @@ int main(int argc, char* argv[])
void writeCountOfCounts( const string &fileNameCountOfCounts )
{
// open file
- Moses::OutputFileStream countOfCountsFile;
- bool success = countOfCountsFile.Open(fileNameCountOfCounts.c_str());
- if (!success) {
- cerr << "ERROR: could not open count-of-counts file "
- << fileNameCountOfCounts << endl;
+ Moses::OutputFileStream countOfCountsFile;
+ bool success = countOfCountsFile.Open(fileNameCountOfCounts.c_str());
+ if (!success) {
+ cerr << "ERROR: could not open count-of-counts file "
+ << fileNameCountOfCounts << endl;
return;
- }
+ }
// Kneser-Ney needs the total number of phrase pairs
countOfCountsFile << totalDistinct << endl;
@@ -307,7 +304,7 @@ void writeCountOfCounts( const string &fileNameCountOfCounts )
for(int i=1; i<=COC_MAX; i++) {
countOfCountsFile << countOfCounts[ i ] << endl;
}
- countOfCountsFile.Close();
+ countOfCountsFile.Close();
}
void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseTableFile, bool isSingleton, const ScoreFeatureManager& featureManager, const MaybeLog& maybeLogProb )
@@ -317,65 +314,63 @@ void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseT
// group phrase pairs based on alignments that matter
// (i.e. that re-arrange non-terminals)
PhrasePairGroup phrasePairGroup;
-
+
float totalSource = 0;
//cerr << "phrasePair.size() = " << phrasePair.size() << endl;
-
+
// loop through phrase pairs
for(size_t i=0; i<phrasePair.size(); i++) {
// add to total count
PhraseAlignment &currPhrasePair = phrasePair[i];
-
+
totalSource += phrasePair[i].count;
-
+
// check for matches
//cerr << "phrasePairGroup.size() = " << phrasePairGroup.size() << endl;
-
+
PhraseAlignmentCollection phraseAlignColl;
phraseAlignColl.push_back(&currPhrasePair);
pair<PhrasePairGroup::iterator, bool> retInsert;
retInsert = phrasePairGroup.insert(phraseAlignColl);
- if (!retInsert.second)
- { // already exist. Add to that collection instead
+ if (!retInsert.second) {
+      // already exists. Add to that collection instead
PhraseAlignmentCollection &existingColl = const_cast<PhraseAlignmentCollection&>(*retInsert.first);
existingColl.push_back(&currPhrasePair);
}
-
+
}
// output the distinct phrase pairs, one at a time
const PhrasePairGroup::SortedColl &sortedColl = phrasePairGroup.GetSortedColl();
PhrasePairGroup::SortedColl::const_iterator iter;
- for(iter = sortedColl.begin(); iter != sortedColl.end(); ++iter)
- {
+ for(iter = sortedColl.begin(); iter != sortedColl.end(); ++iter) {
const PhraseAlignmentCollection &group = **iter;
outputPhrasePair( group, totalSource, phrasePairGroup.GetSize(), phraseTableFile, isSingleton, featureManager, maybeLogProb );
}
-
+
}
const PhraseAlignment &findBestAlignment(const PhraseAlignmentCollection &phrasePair )
{
float bestAlignmentCount = -1;
PhraseAlignment* bestAlignment = NULL;
-
+
for(size_t i=0; i<phrasePair.size(); i++) {
size_t alignInd;
- if (inverseFlag)
- { // count backwards, so that alignments for ties will be the same for both normal & inverse scores
+ if (inverseFlag) {
+ // count backwards, so that alignments for ties will be the same for both normal & inverse scores
alignInd = phrasePair.size() - i - 1;
- }
- else {
+ } else {
alignInd = i;
}
-
+
if (phrasePair[alignInd]->count > bestAlignmentCount) {
bestAlignmentCount = phrasePair[alignInd]->count;
bestAlignment = phrasePair[alignInd];
}
- }
+ }
return *bestAlignment;
}
@@ -386,14 +381,12 @@ void calcNTLengthProb(const map<size_t, map<size_t, size_t> > &lengths
, map<size_t, map<size_t, float> > &probs)
{
map<size_t, map<size_t, size_t> >::const_iterator iterOuter;
- for (iterOuter = lengths.begin(); iterOuter != lengths.end(); ++iterOuter)
- {
+ for (iterOuter = lengths.begin(); iterOuter != lengths.end(); ++iterOuter) {
size_t sourcePos = iterOuter->first;
const map<size_t, size_t> &inner = iterOuter->second;
-
+
map<size_t, size_t>::const_iterator iterInner;
- for (iterInner = inner.begin(); iterInner != inner.end(); ++iterInner)
- {
+ for (iterInner = inner.begin(); iterInner != inner.end(); ++iterInner) {
size_t length = iterInner->first;
size_t count = iterInner->second;
float prob = (float) count / (float) total;
@@ -411,54 +404,49 @@ void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePairs
map<size_t, size_t> totals;
// 1st = position in source phrase, 2nd = total counts
// each source pos should have same count?
-
+
vector< PhraseAlignment* >::const_iterator iterOuter;
- for (iterOuter = phrasePairs.begin(); iterOuter != phrasePairs.end(); ++iterOuter)
- {
+ for (iterOuter = phrasePairs.begin(); iterOuter != phrasePairs.end(); ++iterOuter) {
const PhraseAlignment &phrasePair = **iterOuter;
const std::map<size_t, std::pair<size_t, size_t> > &ntLengths = phrasePair.GetNTLengths();
-
+
std::map<size_t, std::pair<size_t, size_t> >::const_iterator iterInner;
- for (iterInner = ntLengths.begin(); iterInner != ntLengths.end(); ++iterInner)
- {
+ for (iterInner = ntLengths.begin(); iterInner != ntLengths.end(); ++iterInner) {
size_t sourcePos = iterInner->first;
size_t sourceLength = iterInner->second.first;
size_t targetLength = iterInner->second.second;
-
+
sourceLengths[sourcePos][sourceLength]++;
targetLengths[sourcePos][targetLength]++;
totals[sourcePos]++;
}
}
-
- if (totals.size() == 0)
- { // no non-term. Don't bother
+
+ if (totals.size() == 0) {
+ // no non-term. Don't bother
return;
}
size_t total = totals.begin()->second;
- if (totals.size() > 1)
- {
+ if (totals.size() > 1) {
assert(total == (++totals.begin())->second );
}
-
+
calcNTLengthProb(sourceLengths, total, sourceProb);
calcNTLengthProb(targetLengths, total, targetProb);
-
+
}
void outputNTLengthProbs(ostream &phraseTableFile, const map<size_t, map<size_t, float> > &probs, const string &prefix)
{
map<size_t, map<size_t, float> >::const_iterator iterOuter;
- for (iterOuter = probs.begin(); iterOuter != probs.end(); ++iterOuter)
- {
+ for (iterOuter = probs.begin(); iterOuter != probs.end(); ++iterOuter) {
size_t sourcePos = iterOuter->first;
const map<size_t, float> &inner = iterOuter->second;
-
+
map<size_t, float>::const_iterator iterInner;
- for (iterInner = inner.begin(); iterInner != inner.end(); ++iterInner)
- {
+ for (iterInner = inner.begin(); iterInner != inner.end(); ++iterInner) {
size_t length = iterInner->first;
float prob = iterInner->second;
@@ -470,47 +458,40 @@ void outputNTLengthProbs(ostream &phraseTableFile, const map<size_t, map<size_t,
bool calcCrossedNonTerm(size_t sourcePos, size_t targetPos, const std::vector< std::set<size_t> > &alignedToS)
{
- for (size_t currSource = 0; currSource < alignedToS.size(); ++currSource)
- {
- if (currSource == sourcePos)
- { // skip
- }
- else
- {
+ for (size_t currSource = 0; currSource < alignedToS.size(); ++currSource) {
+ if (currSource == sourcePos) {
+ // skip
+ } else {
const std::set<size_t> &targetSet = alignedToS[currSource];
std::set<size_t>::const_iterator iter;
- for (iter = targetSet.begin(); iter != targetSet.end(); ++iter)
- {
+ for (iter = targetSet.begin(); iter != targetSet.end(); ++iter) {
size_t currTarget = *iter;
-
+
if ((currSource < sourcePos && currTarget > targetPos)
|| (currSource > sourcePos && currTarget < targetPos)
- )
- {
+ ) {
return true;
}
}
-
+
}
}
-
+
return false;
}
int calcCrossedNonTerm(const PHRASE &phraseS, const PhraseAlignment &bestAlignment)
{
const std::vector< std::set<size_t> > &alignedToS = bestAlignment.alignedToS;
-
- for (size_t sourcePos = 0; sourcePos < alignedToS.size(); ++sourcePos)
- {
+
+ for (size_t sourcePos = 0; sourcePos < alignedToS.size(); ++sourcePos) {
const std::set<size_t> &targetSet = alignedToS[sourcePos];
-
+
WORD_ID wordId = phraseS[sourcePos];
const WORD &word = vcbS.getWord(wordId);
bool isNonTerm = isNonTerminal(word);
-
- if (isNonTerm)
- {
+
+ if (isNonTerm) {
assert(targetSet.size() == 1);
size_t targetPos = *targetSet.begin();
bool ret = calcCrossedNonTerm(sourcePos, targetPos, alignedToS);
@@ -518,17 +499,17 @@ int calcCrossedNonTerm(const PHRASE &phraseS, const PhraseAlignment &bestAlignme
return 1;
}
}
-
+
return 0;
}
void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCount, int distinctCount, ostream &phraseTableFile, bool isSingleton, const ScoreFeatureManager& featureManager,
- const MaybeLog& maybeLogProb )
+ const MaybeLog& maybeLogProb )
{
if (phrasePair.size() == 0) return;
const PhraseAlignment &bestAlignment = findBestAlignment( phrasePair );
-
+
// compute count
float count = 0;
for(size_t i=0; i<phrasePair.size(); i++) {
@@ -550,7 +531,7 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
if (pcfgFlag && !inverseFlag) {
float pcfgSum = 0;
for(size_t i=0; i<phrasePair.size(); ++i) {
- pcfgSum += phrasePair[i]->pcfgSum;
+ pcfgSum += phrasePair[i]->pcfgSum;
}
pcfgScore = pcfgSum / count;
}
@@ -604,11 +585,11 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
if (singletonFeature) {
phraseTableFile << " " << (isSingleton ? 1 : 0);
}
-
+
if (crossedNonTerm && !inverseFlag) {
phraseTableFile << " " << calcCrossedNonTerm(phraseS, bestAlignment);
}
-
+
// target-side PCFG score
if (pcfgFlag && !inverseFlag) {
phraseTableFile << " " << maybeLogProb(pcfgScore );
@@ -624,7 +605,7 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
}
for (map<string,float>::const_iterator i = extraSparse.begin();
- i != extraSparse.end(); ++i) {
+ i != extraSparse.end(); ++i) {
phraseTableFile << " " << i->first << " " << i->second;
}
@@ -633,8 +614,8 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
// alignment info for non-terminals
if (! inverseFlag) {
if (hierarchicalFlag) {
- // always output alignment if hiero style, but only for non-terms
- // (eh: output all alignments, needed for some feature functions)
+ // always output alignment if hiero style, but only for non-terms
+ // (eh: output all alignments, needed for some feature functions)
assert(phraseT.size() == bestAlignment.alignedToT.size() + 1);
std::vector<std::string> alignment;
for(size_t j = 0; j < phraseT.size() - 1; j++) {
@@ -657,15 +638,15 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
std::stringstream point;
point << sourcePos << "-" << j;
alignment.push_back(point.str());
- }
- }
- }
- // now print all alignments, sorted by source index
- sort(alignment.begin(), alignment.end());
- for (size_t i = 0; i < alignment.size(); ++i) {
- phraseTableFile << alignment[i] << " ";
- }
- } else if (wordAlignmentFlag) {
+ }
+ }
+ }
+ // now print all alignments, sorted by source index
+ sort(alignment.begin(), alignment.end());
+ for (size_t i = 0; i < alignment.size(); ++i) {
+ phraseTableFile << alignment[i] << " ";
+ }
+ } else if (wordAlignmentFlag) {
// alignment info in pb model
for(size_t j=0; j<bestAlignment.alignedToT.size(); j++) {
const set< size_t > &aligned = bestAlignment.alignedToT[j];
@@ -678,28 +659,26 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
// counts
-
+
phraseTableFile << " ||| " << totalCount << " " << count;
- if (kneserNeyFlag)
+ if (kneserNeyFlag)
phraseTableFile << " " << distinctCount;
-
- // nt lengths
- if (outputNTLengths)
- {
+
+ // nt lengths
+ if (outputNTLengths) {
phraseTableFile << " ||| ";
- if (!inverseFlag)
- {
+ if (!inverseFlag) {
map<size_t, map<size_t, float> > sourceProb, targetProb;
// 1st sourcePos, 2nd = length, 3rd = prob
calcNTLengthProb(phrasePair, sourceProb, targetProb);
-
+
outputNTLengthProbs(phraseTableFile, sourceProb, "S");
outputNTLengthProbs(phraseTableFile, targetProb, "T");
- }
+ }
}
-
+
phraseTableFile << endl;
}
@@ -878,13 +857,13 @@ void printTargetPhrase(const PHRASE &phraseS, const PHRASE &phraseT,
std::pair<PhrasePairGroup::Coll::iterator,bool> PhrasePairGroup::insert ( const PhraseAlignmentCollection& obj )
{
std::pair<iterator,bool> ret = m_coll.insert(obj);
-
- if (ret.second)
- { // obj inserted. Also add to sorted vector
+
+ if (ret.second) {
+ // obj inserted. Also add to sorted vector
const PhraseAlignmentCollection &insertedObj = *ret.first;
m_sortedColl.push_back(&insertedObj);
}
-
+
return ret;
}
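For context on the .coc handling reformatted above: writeCountOfCounts dumps the number of distinct phrase pairs and then countOfCounts[1..COC_MAX], and the --GoodTuring/--KneserNey options read these to discount raw counts. A sketch of textbook Good-Turing discounting, c* = (c+1) * N(c+1) / N(c), on invented numbers; the scorer's exact smoothing may differ in detail:

  #include <iostream>
  #include <vector>

  int main() {
    // countOfCounts[c] = number of distinct phrase pairs seen exactly c times
    // (the quantity the .coc file stores); values invented for illustration.
    std::vector<double> countOfCounts;
    countOfCounts.push_back(0);     // unused slot 0
    countOfCounts.push_back(1000);  // N(1)
    countOfCounts.push_back(400);   // N(2)
    countOfCounts.push_back(180);   // N(3)
    countOfCounts.push_back(100);   // N(4)
    countOfCounts.push_back(60);    // N(5)

    for (int c = 1; c <= 4; ++c) {
      double adjusted = (c + 1) * countOfCounts[c + 1] / countOfCounts[c];
      std::cout << "raw count " << c << " -> discounted " << adjusted << "\n";
    }
    return 0;
  }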
diff --git a/phrase-extract/score.h b/phrase-extract/score.h
index 59d2cf58f..6a10536c1 100644
--- a/phrase-extract/score.h
+++ b/phrase-extract/score.h
@@ -32,6 +32,6 @@ inline bool isNonTerminal( const std::string &word )
return (word.length()>=3 && word[0] == '[' && word[word.length()-1] == ']');
}
-
+
}
diff --git a/phrase-extract/tables-core.cpp b/phrase-extract/tables-core.cpp
index 6b35f371b..30c1544e9 100644
--- a/phrase-extract/tables-core.cpp
+++ b/phrase-extract/tables-core.cpp
@@ -33,8 +33,9 @@ vector<string> tokenize( const char* input )
namespace MosesTraining
{
-bool isNonTerminal( const WORD &symbol ) {
- return symbol.substr(0, 1) == "[" && symbol.substr(symbol.size()-1, 1) == "]";
+bool isNonTerminal( const WORD &symbol )
+{
+ return symbol.substr(0, 1) == "[" && symbol.substr(symbol.size()-1, 1) == "]";
}
WORD_ID Vocabulary::storeIfNew( const WORD& word )
@@ -105,7 +106,7 @@ void DTable::load( const string& fileName )
std::cerr << "Error reading from " << fileName << std::endl;
abort();
}
-
+
vector<string> token = tokenize(line.c_str());
if (token.size() < 2) {
cerr << "line " << i << " in " << fileName << " too short, skipping\n";
diff --git a/scripts/other/beautify.perl b/scripts/other/beautify.perl
index 110539893..01afb1af2 100755
--- a/scripts/other/beautify.perl
+++ b/scripts/other/beautify.perl
@@ -1,31 +1,46 @@
-#!/usr/bin/perl
+#!/usr/bin/perl
use strict;
+use File::Basename;
-#my $cmd = "astyle --style='k&r' -s2 -v --recursive *.h *.cpp";
-#print STDERR "Executing: $cmd \n";
-#system($cmd);
+sub Beautify($);
-opendir(DIR,".") or die "Can't open the current directory: $!\n";
+Beautify("/home/hieu/workspace/github/mosesdecoder");
+
+sub Beautify($)
+{
+my $path = shift;
+opendir(DIR, $path) or die "Can't open directory $path: $!\n";
# read file/directory names in that directory into @names
my @names = readdir(DIR) or die "Unable to read current dir:$!\n";
foreach my $name (@names) {
- next if ($name eq "."); # skip the current directory entry
- next if ($name eq ".."); # skip the parent directory entry
- next if ($name eq "boost"); # skip the parent directory entry
- next if ($name eq "contrib"); # skip the parent directory entry
- next if ($name eq "jam-files"); # skip the parent directory entry
- next if ($name eq ".git"); # skip the parent directory entry
-
- if (-d $name){ # is this a directory?
- my $cmd = "astyle --style='k&r' -s2 -v --recursive $name/*.h $name/*.cpp";
- print STDERR "Executing: $cmd \n";
- system($cmd);
-
- next; # can skip to the next name in the for loop
+ next if ($name eq ".");
+ next if ($name eq "..");
+ next if ($name eq "boost");
+ next if ($name eq "contrib");
+ next if ($name eq "jam-files");
+ next if ($name eq ".git");
+ next if ($name eq "util");
+ next if ($name eq "lm");
+ next if ($name eq "search");
+
+ $name = $path ."/" .$name;
+ if (-d $name) {
+ print STDERR "Into: $name \n";
+ Beautify($name);
+ }
+  else { # otherwise it's a file
+ (my $nameOnly, my $pathOnly,my $suffix) = fileparse($name,qr"\..[^.]*$");
+ if ($suffix eq ".cpp" || $suffix eq ".h") {
+ my $cmd = "astyle --style='k&r' -s2 -v $name";
+ print STDERR "Executing: $cmd \n";
+ system($cmd);
+ }
}
}
closedir(DIR);
+}
+
diff --git a/symal/symal.cpp b/symal/symal.cpp
index da386d973..dbe68f1b9 100644
--- a/symal/symal.cpp
+++ b/symal/symal.cpp
@@ -411,7 +411,7 @@ int main(int argc, char** argv)
"o", CMDSTRINGTYPE, &output,
"v", CMDENUMTYPE, &verbose, BoolEnum,
"verbose", CMDENUMTYPE, &verbose, BoolEnum,
-
+
(char*)NULL);
GetParams(&argc, &argv, (char*)NULL);
diff --git a/util/double-conversion/bignum-dtoa.h b/util/double-conversion/bignum-dtoa.h
index 34b961992..652a4db9a 100644
--- a/util/double-conversion/bignum-dtoa.h
+++ b/util/double-conversion/bignum-dtoa.h
@@ -30,7 +30,8 @@
#include "utils.h"
-namespace double_conversion {
+namespace double_conversion
+{
enum BignumDtoaMode {
// Return the shortest correct representation.
diff --git a/util/double-conversion/bignum.h b/util/double-conversion/bignum.h
index 5ec3544f5..5deadbfbe 100644
--- a/util/double-conversion/bignum.h
+++ b/util/double-conversion/bignum.h
@@ -30,10 +30,12 @@
#include "utils.h"
-namespace double_conversion {
+namespace double_conversion
+{
-class Bignum {
- public:
+class Bignum
+{
+public:
// 3584 = 128 * 28. We can represent 2^3584 > 10^1000 accurately.
// This bignum can encode much bigger numbers, since it contains an
// exponent.
@@ -60,7 +62,9 @@ class Bignum {
void MultiplyByUInt32(uint32_t factor);
void MultiplyByUInt64(uint64_t factor);
void MultiplyByPowerOfTen(int exponent);
- void Times10() { return MultiplyByUInt32(10); }
+ void Times10() {
+ return MultiplyByUInt32(10);
+ }
// Pseudocode:
// int result = this / other;
// this = this % other;
@@ -97,7 +101,7 @@ class Bignum {
static bool PlusLess(const Bignum& a, const Bignum& b, const Bignum& c) {
return PlusCompare(a, b, c) < 0;
}
- private:
+private:
typedef uint32_t Chunk;
typedef uint64_t DoubleChunk;
@@ -125,7 +129,9 @@ class Bignum {
// shift_amount must be < kBigitSize.
void BigitsShiftLeft(int shift_amount);
// BigitLength includes the "hidden" digits encoded in the exponent.
- int BigitLength() const { return used_digits_ + exponent_; }
+ int BigitLength() const {
+ return used_digits_ + exponent_;
+ }
Chunk BigitAt(int index) const;
void SubtractTimes(const Bignum& other, int factor);
diff --git a/util/double-conversion/cached-powers.h b/util/double-conversion/cached-powers.h
index 61a50614c..3daf52d51 100644
--- a/util/double-conversion/cached-powers.h
+++ b/util/double-conversion/cached-powers.h
@@ -30,10 +30,12 @@
#include "diy-fp.h"
-namespace double_conversion {
+namespace double_conversion
+{
-class PowersOfTenCache {
- public:
+class PowersOfTenCache
+{
+public:
// Not all powers of ten are cached. The decimal exponent of two neighboring
// cached numbers will differ by kDecimalExponentDistance.
@@ -45,9 +47,9 @@ class PowersOfTenCache {
// Returns a cached power-of-ten with a binary exponent in the range
// [min_exponent; max_exponent] (boundaries included).
static void GetCachedPowerForBinaryExponentRange(int min_exponent,
- int max_exponent,
- DiyFp* power,
- int* decimal_exponent);
+ int max_exponent,
+ DiyFp* power,
+ int* decimal_exponent);
// Returns a cached power of ten x ~= 10^k such that
// k <= decimal_exponent < k + kCachedPowersDecimalDistance.
@@ -55,8 +57,8 @@ class PowersOfTenCache {
// kMinDecimalExponent <= requested_exponent, and
// requested_exponent < kMaxDecimalExponent + kDecimalExponentDistance.
static void GetCachedPowerForDecimalExponent(int requested_exponent,
- DiyFp* power,
- int* found_exponent);
+ DiyFp* power,
+ int* found_exponent);
};
} // namespace double_conversion
diff --git a/util/double-conversion/diy-fp.h b/util/double-conversion/diy-fp.h
index 9dcf8fbdb..39a6bd7dd 100644
--- a/util/double-conversion/diy-fp.h
+++ b/util/double-conversion/diy-fp.h
@@ -30,15 +30,17 @@
#include "utils.h"
-namespace double_conversion {
+namespace double_conversion
+{
// This "Do It Yourself Floating Point" class implements a floating-point number
// with a uint64 significand and an int exponent. Normalized DiyFp numbers will
// have the most significant bit of the significand set.
// Multiplication and Subtraction do not normalize their results.
// DiyFp are not designed to contain special doubles (NaN and Infinity).
-class DiyFp {
- public:
+class DiyFp
+{
+public:
static const int kSignificandSize = 64;
DiyFp() : f_(0), e_(0) {}
@@ -100,13 +102,21 @@ class DiyFp {
return result;
}
- uint64_t f() const { return f_; }
- int e() const { return e_; }
+ uint64_t f() const {
+ return f_;
+ }
+ int e() const {
+ return e_;
+ }
- void set_f(uint64_t new_value) { f_ = new_value; }
- void set_e(int new_value) { e_ = new_value; }
+ void set_f(uint64_t new_value) {
+ f_ = new_value;
+ }
+ void set_e(int new_value) {
+ e_ = new_value;
+ }
- private:
+private:
static const uint64_t kUint64MSB = UINT64_2PART_C(0x80000000, 00000000);
uint64_t f_;
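The class comment above says normalized DiyFp numbers have the most significant significand bit set. A hedged sketch of what normalization means, on raw (f, e) pairs rather than the DiyFp class itself: shift f left until its top bit is set, decrementing e per shift so the value f * 2^e is unchanged.

    #include <cassert>
    #include <cstdint>

    static const uint64_t kUint64MSB = 0x8000000000000000ULL;

    // Normalize (f, e) as described for DiyFp: value f * 2^e is preserved
    // while the significand gains a set most-significant bit.
    static void Normalize(uint64_t* f, int* e) {
      assert(*f != 0);
      while ((*f & kUint64MSB) == 0) {
        *f <<= 1;
        --*e;
      }
    }

    int main() {
      uint64_t f = 1;  // represents 1 * 2^10
      int e = 10;
      Normalize(&f, &e);
      assert(f == kUint64MSB && e == 10 - 63);  // still 2^10 overall
    }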
diff --git a/util/double-conversion/double-conversion.h b/util/double-conversion/double-conversion.h
index 1c3387d4f..b3e51bae8 100644
--- a/util/double-conversion/double-conversion.h
+++ b/util/double-conversion/double-conversion.h
@@ -30,10 +30,12 @@
#include "utils.h"
-namespace double_conversion {
+namespace double_conversion
+{
-class DoubleToStringConverter {
- public:
+class DoubleToStringConverter
+{
+public:
// When calling ToFixed with a double > 10^kMaxFixedDigitsBeforePoint
// or a requested_digits parameter > kMaxFixedDigitsAfterPoint then the
// function returns false.
@@ -112,20 +114,20 @@ class DoubleToStringConverter {
int decimal_in_shortest_high,
int max_leading_padding_zeroes_in_precision_mode,
int max_trailing_padding_zeroes_in_precision_mode)
- : flags_(flags),
- infinity_symbol_(infinity_symbol),
- nan_symbol_(nan_symbol),
- exponent_character_(exponent_character),
- decimal_in_shortest_low_(decimal_in_shortest_low),
- decimal_in_shortest_high_(decimal_in_shortest_high),
- max_leading_padding_zeroes_in_precision_mode_(
- max_leading_padding_zeroes_in_precision_mode),
- max_trailing_padding_zeroes_in_precision_mode_(
- max_trailing_padding_zeroes_in_precision_mode) {
+ : flags_(flags),
+ infinity_symbol_(infinity_symbol),
+ nan_symbol_(nan_symbol),
+ exponent_character_(exponent_character),
+ decimal_in_shortest_low_(decimal_in_shortest_low),
+ decimal_in_shortest_high_(decimal_in_shortest_high),
+ max_leading_padding_zeroes_in_precision_mode_(
+ max_leading_padding_zeroes_in_precision_mode),
+ max_trailing_padding_zeroes_in_precision_mode_(
+ max_trailing_padding_zeroes_in_precision_mode) {
// When 'trailing zero after the point' is set, then 'trailing point'
// must be set too.
ASSERT(((flags & EMIT_TRAILING_DECIMAL_POINT) != 0) ||
- !((flags & EMIT_TRAILING_ZERO_AFTER_POINT) != 0));
+ !((flags & EMIT_TRAILING_ZERO_AFTER_POINT) != 0));
}
// Returns a converter following the EcmaScript specification.
@@ -341,7 +343,7 @@ class DoubleToStringConverter {
int* length,
int* point);
- private:
+private:
// Implementation for ToShortest and ToShortestSingle.
bool ToShortestIeeeNumber(double value,
StringBuilder* result_builder,
@@ -378,8 +380,9 @@ class DoubleToStringConverter {
};
-class StringToDoubleConverter {
- public:
+class StringToDoubleConverter
+{
+public:
// Enumeration for allowing octals and ignoring junk when converting
// strings to numbers.
enum Flags {
@@ -488,11 +491,11 @@ class StringToDoubleConverter {
double junk_string_value,
const char* infinity_symbol,
const char* nan_symbol)
- : flags_(flags),
- empty_string_value_(empty_string_value),
- junk_string_value_(junk_string_value),
- infinity_symbol_(infinity_symbol),
- nan_symbol_(nan_symbol) {
+ : flags_(flags),
+ empty_string_value_(empty_string_value),
+ junk_string_value_(junk_string_value),
+ infinity_symbol_(infinity_symbol),
+ nan_symbol_(nan_symbol) {
}
// Performs the conversion.
@@ -516,7 +519,7 @@ class StringToDoubleConverter {
processed_characters_count, false));
}
- private:
+private:
const int flags_;
const double empty_string_value_;
const double junk_string_value_;
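The constructor bodies reindented above just wire flags and symbols into the converter; in practice most callers go through the EcmaScript-style converter the header comments mention. A hedged usage sketch, assuming the ToShortest(value, StringBuilder*) shape of the upstream double-conversion API:

    #include <cstdio>
    #include "double-conversion.h"

    using double_conversion::DoubleToStringConverter;
    using double_conversion::StringBuilder;

    int main() {
      char buffer[64];
      StringBuilder builder(buffer, sizeof(buffer));
      // EcmaScriptConverter() is the preconfigured converter referenced in
      // the header comments; ToShortest writes the shortest decimal string
      // that round-trips back to the same double.
      const DoubleToStringConverter& conv =
          DoubleToStringConverter::EcmaScriptConverter();
      conv.ToShortest(0.1, &builder);
      std::printf("%s\n", builder.Finalize());  // prints "0.1"
    }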
diff --git a/util/double-conversion/fast-dtoa.h b/util/double-conversion/fast-dtoa.h
index 5f1e8eee5..184f9cade 100644
--- a/util/double-conversion/fast-dtoa.h
+++ b/util/double-conversion/fast-dtoa.h
@@ -30,7 +30,8 @@
#include "utils.h"
-namespace double_conversion {
+namespace double_conversion
+{
enum FastDtoaMode {
// Computes the shortest representation of the given input. The returned
diff --git a/util/double-conversion/fixed-dtoa.h b/util/double-conversion/fixed-dtoa.h
index 3bdd08e21..9383cb936 100644
--- a/util/double-conversion/fixed-dtoa.h
+++ b/util/double-conversion/fixed-dtoa.h
@@ -30,7 +30,8 @@
#include "utils.h"
-namespace double_conversion {
+namespace double_conversion
+{
// Produces digits necessary to print a given number with
// 'fractional_count' digits after the decimal point.
diff --git a/util/double-conversion/ieee.h b/util/double-conversion/ieee.h
index 839dc47d4..0922129d5 100644
--- a/util/double-conversion/ieee.h
+++ b/util/double-conversion/ieee.h
@@ -30,17 +30,31 @@
#include "diy-fp.h"
-namespace double_conversion {
+namespace double_conversion
+{
// We assume that doubles and uint64_t have the same endianness.
-static uint64_t double_to_uint64(double d) { return BitCast<uint64_t>(d); }
-static double uint64_to_double(uint64_t d64) { return BitCast<double>(d64); }
-static uint32_t float_to_uint32(float f) { return BitCast<uint32_t>(f); }
-static float uint32_to_float(uint32_t d32) { return BitCast<float>(d32); }
+static uint64_t double_to_uint64(double d)
+{
+ return BitCast<uint64_t>(d);
+}
+static double uint64_to_double(uint64_t d64)
+{
+ return BitCast<double>(d64);
+}
+static uint32_t float_to_uint32(float f)
+{
+ return BitCast<uint32_t>(f);
+}
+static float uint32_to_float(uint32_t d32)
+{
+ return BitCast<float>(d32);
+}
// Helper functions for doubles.
-class Double {
- public:
+class Double
+{
+public:
static const uint64_t kSignMask = UINT64_2PART_C(0x80000000, 00000000);
static const uint64_t kExponentMask = UINT64_2PART_C(0x7FF00000, 00000000);
static const uint64_t kSignificandMask = UINT64_2PART_C(0x000FFFFF, FFFFFFFF);
@@ -113,7 +127,7 @@ class Double {
uint64_t d64 = AsUint64();
int biased_e =
- static_cast<int>((d64 & kExponentMask) >> kPhysicalSignificandSize);
+ static_cast<int>((d64 & kExponentMask) >> kPhysicalSignificandSize);
return biased_e - kExponentBias;
}
@@ -143,13 +157,13 @@ class Double {
bool IsNan() const {
uint64_t d64 = AsUint64();
return ((d64 & kExponentMask) == kExponentMask) &&
- ((d64 & kSignificandMask) != 0);
+ ((d64 & kSignificandMask) != 0);
}
bool IsInfinite() const {
uint64_t d64 = AsUint64();
return ((d64 & kExponentMask) == kExponentMask) &&
- ((d64 & kSignificandMask) == 0);
+ ((d64 & kSignificandMask) == 0);
}
int Sign() const {
@@ -197,7 +211,9 @@ class Double {
return physical_significand_is_zero && (Exponent() != kDenormalExponent);
}
- double value() const { return uint64_to_double(d64_); }
+ double value() const {
+ return uint64_to_double(d64_);
+ }
// Returns the significand size for a given order of magnitude.
// If v = f*2^e with 2^p-1 <= f <= 2^p then p+e is v's order of magnitude.
@@ -221,7 +237,7 @@ class Double {
return Double(kNaN).value();
}
- private:
+private:
static const int kExponentBias = 0x3FF + kPhysicalSignificandSize;
static const int kDenormalExponent = -kExponentBias + 1;
static const int kMaxExponent = 0x7FF - kExponentBias;
@@ -254,12 +270,13 @@ class Double {
biased_exponent = static_cast<uint64_t>(exponent + kExponentBias);
}
return (significand & kSignificandMask) |
- (biased_exponent << kPhysicalSignificandSize);
+ (biased_exponent << kPhysicalSignificandSize);
}
};
-class Single {
- public:
+class Single
+{
+public:
static const uint32_t kSignMask = 0x80000000;
static const uint32_t kExponentMask = 0x7F800000;
static const uint32_t kSignificandMask = 0x007FFFFF;
@@ -289,7 +306,7 @@ class Single {
uint32_t d32 = AsUint32();
int biased_e =
- static_cast<int>((d32 & kExponentMask) >> kPhysicalSignificandSize);
+ static_cast<int>((d32 & kExponentMask) >> kPhysicalSignificandSize);
return biased_e - kExponentBias;
}
@@ -319,13 +336,13 @@ class Single {
bool IsNan() const {
uint32_t d32 = AsUint32();
return ((d32 & kExponentMask) == kExponentMask) &&
- ((d32 & kSignificandMask) != 0);
+ ((d32 & kSignificandMask) != 0);
}
bool IsInfinite() const {
uint32_t d32 = AsUint32();
return ((d32 & kExponentMask) == kExponentMask) &&
- ((d32 & kSignificandMask) == 0);
+ ((d32 & kSignificandMask) == 0);
}
int Sign() const {
@@ -373,7 +390,9 @@ class Single {
return physical_significand_is_zero && (Exponent() != kDenormalExponent);
}
- float value() const { return uint32_to_float(d32_); }
+ float value() const {
+ return uint32_to_float(d32_);
+ }
static float Infinity() {
return Single(kInfinity).value();
@@ -383,7 +402,7 @@ class Single {
return Single(kNaN).value();
}
- private:
+private:
static const int kExponentBias = 0x7F + kPhysicalSignificandSize;
static const int kDenormalExponent = -kExponentBias + 1;
static const int kMaxExponent = 0xFF - kExponentBias;
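The IsNan/IsInfinite bodies reformatted above encode the IEEE-754 rule: all exponent bits set plus a nonzero significand is NaN, all exponent bits set plus a zero significand is infinity. A standalone restatement of the double case, using memcpy in place of the header's BitCast (same same-endianness assumption as ieee.h):

    #include <cassert>
    #include <cstdint>
    #include <cstring>
    #include <limits>

    static const uint64_t kExponentMask    = 0x7FF0000000000000ULL;
    static const uint64_t kSignificandMask = 0x000FFFFFFFFFFFFFULL;

    static uint64_t AsUint64(double d) {
      uint64_t u;
      std::memcpy(&u, &d, sizeof u);
      return u;
    }

    static bool IsNan(double d) {
      uint64_t d64 = AsUint64(d);
      return ((d64 & kExponentMask) == kExponentMask) &&
             ((d64 & kSignificandMask) != 0);
    }

    static bool IsInfinite(double d) {
      uint64_t d64 = AsUint64(d);
      return ((d64 & kExponentMask) == kExponentMask) &&
             ((d64 & kSignificandMask) == 0);
    }

    int main() {
      assert(IsNan(std::numeric_limits<double>::quiet_NaN()));
      assert(IsInfinite(std::numeric_limits<double>::infinity()));
      assert(!IsNan(1.0) && !IsInfinite(1.0));
    }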
diff --git a/util/double-conversion/strtod.h b/util/double-conversion/strtod.h
index ed0293b8f..1d81078d2 100644
--- a/util/double-conversion/strtod.h
+++ b/util/double-conversion/strtod.h
@@ -30,7 +30,8 @@
#include "utils.h"
-namespace double_conversion {
+namespace double_conversion
+{
// The buffer must only contain digits in the range [0-9]. It must not
// contain a dot or a sign. It must not start with '0', and must not be empty.
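The comment above states Strtod's precondition on its digit buffer: digits 0-9 only, no dot or sign, no leading '0', non-empty. A hedged caller sketch; the Strtod signature (a Vector<const char> of digits plus a decimal exponent) follows the upstream library and is an assumption here, since it is not shown in this hunk:

    #include <cstdio>
    #include "strtod.h"
    #include "utils.h"

    int main() {
      // "12345" with exponent -3 denotes 12.345: digits only, no dot or
      // sign, no leading '0', non-empty -- exactly the stated precondition.
      const char digits[] = "12345";
      double_conversion::Vector<const char> buffer(digits, 5);
      double v = double_conversion::Strtod(buffer, -3);
      std::printf("%.3f\n", v);  // prints 12.345
    }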
diff --git a/util/double-conversion/utils.h b/util/double-conversion/utils.h
index 2bd716050..91f1e6c48 100644
--- a/util/double-conversion/utils.h
+++ b/util/double-conversion/utils.h
@@ -126,25 +126,29 @@ typedef unsigned __int64 uint64_t;
DISALLOW_COPY_AND_ASSIGN(TypeName)
#endif
-namespace double_conversion {
+namespace double_conversion
+{
static const int kCharSize = sizeof(char);
// Returns the maximum of the two parameters.
template <typename T>
-static T Max(T a, T b) {
+static T Max(T a, T b)
+{
return a < b ? b : a;
}
// Returns the minimum of the two parameters.
template <typename T>
-static T Min(T a, T b) {
+static T Min(T a, T b)
+{
return a < b ? a : b;
}
-inline int StrLength(const char* string) {
+inline int StrLength(const char* string)
+{
size_t length = strlen(string);
ASSERT(length == static_cast<size_t>(static_cast<int>(length)));
return static_cast<int>(length);
@@ -152,8 +156,9 @@ inline int StrLength(const char* string) {
// This is a simplified version of V8's Vector class.
template <typename T>
-class Vector {
- public:
+class Vector
+{
+public:
Vector() : start_(NULL), length_(0) {}
Vector(T* data, int length) : start_(data), length_(length) {
ASSERT(length == 0 || (length > 0 && data != NULL));
@@ -169,13 +174,19 @@ class Vector {
}
// Returns the length of the vector.
- int length() const { return length_; }
+ int length() const {
+ return length_;
+ }
// Returns whether or not the vector is empty.
- bool is_empty() const { return length_ == 0; }
+ bool is_empty() const {
+ return length_ == 0;
+ }
// Returns the pointer to the start of the data in the vector.
- T* start() const { return start_; }
+ T* start() const {
+ return start_;
+ }
// Access individual vector elements - checks bounds in debug mode.
T& operator[](int index) const {
@@ -183,11 +194,15 @@ class Vector {
return start_[index];
}
- T& first() { return start_[0]; }
+ T& first() {
+ return start_[0];
+ }
- T& last() { return start_[length_ - 1]; }
+ T& last() {
+ return start_[length_ - 1];
+ }
- private:
+private:
T* start_;
int length_;
};
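The Vector accessors reformatted above are simple bounds-aware views over a raw pointer and length. A short hedged usage sketch exercising them:

    #include <cassert>
    #include "utils.h"

    int main() {
      char data[] = {'a', 'b', 'c'};
      double_conversion::Vector<char> v(data, 3);
      assert(v.length() == 3 && !v.is_empty());
      assert(v.first() == 'a' && v.last() == 'c');
      assert(v[1] == 'b' && v.start() == data);  // operator[] checks bounds
    }                                            // in debug mode only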
@@ -196,14 +211,19 @@ class Vector {
// Helper class for building result strings in a character buffer. The
// purpose of the class is to use safe operations that checks the
// buffer bounds on all operations in debug mode.
-class StringBuilder {
- public:
+class StringBuilder
+{
+public:
StringBuilder(char* buffer, int size)
- : buffer_(buffer, size), position_(0) { }
+ : buffer_(buffer, size), position_(0) { }
- ~StringBuilder() { if (!is_finalized()) Finalize(); }
+ ~StringBuilder() {
+ if (!is_finalized()) Finalize();
+ }
- int size() const { return buffer_.length(); }
+ int size() const {
+ return buffer_.length();
+ }
// Get the current position in the builder.
int position() const {
@@ -212,7 +232,9 @@ class StringBuilder {
}
// Reset the position.
- void Reset() { position_ = 0; }
+ void Reset() {
+ position_ = 0;
+ }
// Add a single character to the builder. It is not allowed to add
// 0-characters; use the Finalize() method to terminate the string
@@ -262,11 +284,13 @@ class StringBuilder {
return buffer_.start();
}
- private:
+private:
Vector<char> buffer_;
int position_;
- bool is_finalized() const { return position_ < 0; }
+ bool is_finalized() const {
+ return position_ < 0;
+ }
DISALLOW_IMPLICIT_CONSTRUCTORS(StringBuilder);
};
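The StringBuilder comment above describes bounds-checked assembly of a result string in a caller-supplied buffer. A hedged usage sketch; AddCharacter is documented in the hunk, while AddString's exact signature is taken from the upstream library and should be treated as an assumption:

    #include <cstdio>
    #include "utils.h"

    int main() {
      char buffer[32];
      double_conversion::StringBuilder builder(buffer, sizeof(buffer));
      builder.AddCharacter('p');      // per the "Add a single character" note
      builder.AddString("i = 3.14");  // AddString per upstream API (assumed)
      // Finalize() null-terminates and returns the buffer; afterwards the
      // builder is finalized (position_ < 0, as is_finalized() checks).
      std::printf("%s\n", builder.Finalize());  // prints "pi = 3.14"
    }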
@@ -296,7 +320,8 @@ class StringBuilder {
// enough that it can no longer see that you have cast one pointer type to
// another thus avoiding the warning.
template <class Dest, class Source>
-inline Dest BitCast(const Source& source) {
+inline Dest BitCast(const Source& source)
+{
// Compile time assertion: sizeof(Dest) == sizeof(Source)
// A compile error here means your Dest and Source have different sizes.
typedef char VerifySizesAreEqual[sizeof(Dest) == sizeof(Source) ? 1 : -1];
@@ -307,7 +332,8 @@ inline Dest BitCast(const Source& source) {
}
template <class Dest, class Source>
-inline Dest BitCast(Source* source) {
+inline Dest BitCast(Source* source)
+{
return BitCast<Dest>(reinterpret_cast<uintptr_t>(source));
}
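The comment block above BitCast explains the trick: memcpy the bytes to defeat type-based aliasing analysis, and use a typedef of char[sizeof(Dest) == sizeof(Source) ? 1 : -1] so that a size mismatch becomes an ill-formed negative-size array at compile time. A minimal standalone restatement, independent of the header:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // Same shape as the header's BitCast: a size mismatch makes the
    // typedef an array of size -1, which fails to compile.
    template <class Dest, class Source>
    inline Dest BitCast(const Source& source) {
      typedef char VerifySizesAreEqual[sizeof(Dest) == sizeof(Source) ? 1 : -1];
      (void)sizeof(VerifySizesAreEqual);  // silence unused-typedef warnings
      Dest dest;
      memcpy(&dest, &source, sizeof(dest));
      return dest;
    }

    int main() {
      uint64_t bits = BitCast<uint64_t>(1.0);
      assert(bits == 0x3FF0000000000000ULL);  // IEEE-754 pattern for 1.0
      // BitCast<uint32_t>(1.0) would not compile: the sizes differ.
    }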