Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/moses
diff options
context:
space:
mode:
authorHieu Hoang <hieu@hoang.co.uk>2013-05-29 21:16:15 +0400
committerHieu Hoang <hieu@hoang.co.uk>2013-05-29 21:16:15 +0400
commit6249432407af8730c10bccc7894c0725fcaf5e47 (patch)
tree3ac1f094b9fdc199b04bc5ef209ce00e3596e37d /moses
parent59bd7deb4b6b9c4f7b3b7dbb055783528fbc31ca (diff)
beautify
Diffstat (limited to 'moses')
-rw-r--r--moses/AlignmentInfo.cpp55
-rw-r--r--moses/AlignmentInfo.h56
-rw-r--r--moses/AlignmentInfoCollection.cpp8
-rw-r--r--moses/AlignmentInfoCollection.h16
-rw-r--r--moses/AlignmentInfoTest.cpp3
-rw-r--r--moses/BitmapContainer.cpp8
-rw-r--r--moses/ChartCell.cpp24
-rw-r--r--moses/ChartCell.h41
-rw-r--r--moses/ChartCellCollection.cpp25
-rw-r--r--moses/ChartCellCollection.h68
-rw-r--r--moses/ChartCellLabel.h34
-rw-r--r--moses/ChartCellLabelSet.h47
-rw-r--r--moses/ChartHypothesis.cpp42
-rw-r--r--moses/ChartHypothesis.h62
-rw-r--r--moses/ChartHypothesisCollection.cpp7
-rw-r--r--moses/ChartHypothesisCollection.h6
-rw-r--r--moses/ChartManager.cpp117
-rw-r--r--moses/ChartManager.h14
-rw-r--r--moses/ChartParser.cpp43
-rw-r--r--moses/ChartParser.h40
-rw-r--r--moses/ChartParserCallback.h16
-rw-r--r--moses/ChartRuleLookupManager.h2
-rw-r--r--moses/ChartTranslationOptionList.cpp14
-rw-r--r--moses/ChartTranslationOptionList.h19
-rw-r--r--moses/ChartTranslationOptions.cpp4
-rw-r--r--moses/ChartTranslationOptions.h28
-rw-r--r--moses/ChartTrellisDetour.cpp14
-rw-r--r--moses/ChartTrellisDetour.h18
-rw-r--r--moses/ChartTrellisDetourQueue.cpp18
-rw-r--r--moses/ChartTrellisDetourQueue.h20
-rw-r--r--moses/ChartTrellisNode.cpp18
-rw-r--r--moses/ChartTrellisNode.h16
-rw-r--r--moses/ChartTrellisPath.cpp14
-rw-r--r--moses/ChartTrellisPath.h16
-rw-r--r--moses/ConfusionNet.h2
-rw-r--r--moses/DecodeFeature.cpp25
-rw-r--r--moses/DecodeFeature.h50
-rw-r--r--moses/DecodeStepTranslation.cpp6
-rw-r--r--moses/FF/BleuScoreFeature.cpp977
-rw-r--r--moses/FF/BleuScoreFeature.h203
-rw-r--r--moses/FF/ChartBasedFeatureContext.cpp6
-rw-r--r--moses/FF/ChartBasedFeatureContext.h12
-rw-r--r--moses/FF/DistortionScoreProducer.cpp11
-rw-r--r--moses/FF/DistortionScoreProducer.h12
-rw-r--r--moses/FF/FFState.h5
-rw-r--r--moses/FF/FeatureFunction.cpp17
-rw-r--r--moses/FF/FeatureFunction.h34
-rw-r--r--moses/FF/GlobalLexicalModel.cpp19
-rw-r--r--moses/FF/GlobalLexicalModel.h10
-rw-r--r--moses/FF/GlobalLexicalModelUnlimited.cpp453
-rw-r--r--moses/FF/GlobalLexicalModelUnlimited.h21
-rw-r--r--moses/FF/InputFeature.cpp16
-rw-r--r--moses/FF/PhraseBasedFeatureContext.cpp2
-rw-r--r--moses/FF/PhraseBasedFeatureContext.h12
-rw-r--r--moses/FF/PhraseBoundaryFeature.cpp56
-rw-r--r--moses/FF/PhraseBoundaryFeature.h20
-rw-r--r--moses/FF/PhraseLengthFeature.cpp11
-rw-r--r--moses/FF/PhraseLengthFeature.h9
-rw-r--r--moses/FF/PhrasePairFeature.cpp189
-rw-r--r--moses/FF/PhrasePairFeature.h60
-rw-r--r--moses/FF/SourceWordDeletionFeature.cpp69
-rw-r--r--moses/FF/SourceWordDeletionFeature.h17
-rw-r--r--moses/FF/StatefulFeatureFunction.cpp4
-rw-r--r--moses/FF/StatefulFeatureFunction.h11
-rw-r--r--moses/FF/StatelessFeatureFunction.cpp4
-rw-r--r--moses/FF/StatelessFeatureFunction.h11
-rw-r--r--moses/FF/TargetBigramFeature.cpp19
-rw-r--r--moses/FF/TargetBigramFeature.h39
-rw-r--r--moses/FF/TargetNgramFeature.cpp472
-rw-r--r--moses/FF/TargetNgramFeature.h74
-rw-r--r--moses/FF/TargetWordInsertionFeature.cpp51
-rw-r--r--moses/FF/TargetWordInsertionFeature.h17
-rw-r--r--moses/FF/UnknownWordPenaltyProducer.h7
-rw-r--r--moses/FF/WordPenaltyProducer.cpp6
-rw-r--r--moses/FF/WordPenaltyProducer.h8
-rw-r--r--moses/FF/WordTranslationFeature.cpp619
-rw-r--r--moses/FF/WordTranslationFeature.h17
-rw-r--r--moses/Factor.h8
-rw-r--r--moses/FactorCollection.cpp9
-rw-r--r--moses/FactorCollection.h2
-rw-r--r--moses/FeatureVector.cpp1361
-rw-r--r--moses/FeatureVector.h635
-rw-r--r--moses/FeatureVectorTest.cpp107
-rw-r--r--moses/GenerationDictionary.cpp5
-rw-r--r--moses/GenerationDictionary.h31
-rw-r--r--moses/Hypothesis.cpp26
-rw-r--r--moses/HypothesisStack.h2
-rw-r--r--moses/Incremental.cpp170
-rw-r--r--moses/Incremental.h57
-rw-r--r--moses/InputType.cpp2
-rw-r--r--moses/InputType.h8
-rw-r--r--moses/LM/Backward.cpp509
-rw-r--r--moses/LM/Backward.h80
-rw-r--r--moses/LM/BackwardLMState.cpp12
-rw-r--r--moses/LM/BackwardLMState.h20
-rw-r--r--moses/LM/BackwardTest.cpp507
-rw-r--r--moses/LM/Base.cpp68
-rw-r--r--moses/LM/Base.h16
-rw-r--r--moses/LM/ChartState.h42
-rw-r--r--moses/LM/IRST.cpp34
-rw-r--r--moses/LM/Implementation.cpp56
-rw-r--r--moses/LM/Implementation.h7
-rw-r--r--moses/LM/Joint.h3
-rw-r--r--moses/LM/Ken.cpp217
-rw-r--r--moses/LM/Ken.h3
-rw-r--r--moses/LM/LDHT.cpp551
-rw-r--r--moses/LM/LDHT.h3
-rw-r--r--moses/LM/MultiFactor.h10
-rw-r--r--moses/LM/ORLM.cpp30
-rw-r--r--moses/LM/ORLM.h9
-rw-r--r--moses/LM/ParallelBackoff.cpp2
-rw-r--r--moses/LM/Rand.cpp8
-rw-r--r--moses/LM/SRI.cpp31
-rw-r--r--moses/LM/SingleFactor.cpp2
-rw-r--r--moses/LM/SingleFactor.h42
-rw-r--r--moses/LexicalReordering.cpp50
-rw-r--r--moses/LexicalReordering.h41
-rw-r--r--moses/LexicalReorderingState.cpp2
-rw-r--r--moses/LexicalReorderingTable.cpp2
-rw-r--r--moses/Manager.cpp247
-rw-r--r--moses/Manager.h6
-rw-r--r--moses/MockHypothesis.cpp34
-rw-r--r--moses/MockHypothesis.h81
-rw-r--r--moses/OutputCollector.h22
-rw-r--r--moses/PCNTools.h2
-rw-r--r--moses/PDTAimp.h14
-rw-r--r--moses/Parameter.cpp219
-rw-r--r--moses/Parameter.h56
-rw-r--r--moses/PartialTranslOptColl.h2
-rw-r--r--moses/Phrase.cpp32
-rw-r--r--moses/Phrase.h89
-rw-r--r--moses/PrefixTree.h2
-rw-r--r--moses/PrefixTreeMap.h2
-rw-r--r--moses/RuleCube.h28
-rw-r--r--moses/RuleCubeItem.h20
-rw-r--r--moses/RuleCubeQueue.h12
-rw-r--r--moses/ScoreComponentCollection.cpp61
-rw-r--r--moses/ScoreComponentCollection.h353
-rw-r--r--moses/ScoreComponentCollectionTest.cpp44
-rw-r--r--moses/SearchNormalBatch.cpp136
-rw-r--r--moses/SearchNormalBatch.h4
-rw-r--r--moses/Sentence.cpp3
-rw-r--r--moses/StaticData.cpp150
-rw-r--r--moses/StaticData.h69
-rw-r--r--moses/SyntacticLanguageModel.cpp227
-rw-r--r--moses/SyntacticLanguageModel.h47
-rw-r--r--moses/SyntacticLanguageModelFiles.h46
-rw-r--r--moses/SyntacticLanguageModelState.h190
-rw-r--r--moses/TargetPhrase.cpp68
-rw-r--r--moses/TargetPhrase.h76
-rw-r--r--moses/TargetPhraseCollection.cpp4
-rw-r--r--moses/TargetPhraseCollection.h4
-rw-r--r--moses/Terminal.h10
-rw-r--r--moses/ThreadPool.h15
-rw-r--r--moses/Timer.h2
-rw-r--r--moses/TranslationModel/BilingualDynSuffixArray.cpp887
-rw-r--r--moses/TranslationModel/BilingualDynSuffixArray.h194
-rw-r--r--moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.cpp6
-rw-r--r--moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.h10
-rw-r--r--moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.cpp28
-rw-r--r--moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.h4
-rw-r--r--moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp28
-rw-r--r--moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h8
-rw-r--r--moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.h4
-rw-r--r--moses/TranslationModel/CYKPlusParser/DotChart.h38
-rw-r--r--moses/TranslationModel/CYKPlusParser/DotChartInMemory.cpp6
-rw-r--r--moses/TranslationModel/CYKPlusParser/DotChartInMemory.h18
-rw-r--r--moses/TranslationModel/CYKPlusParser/DotChartOnDisk.h28
-rw-r--r--moses/TranslationModel/CompactPT/BlockHashIndex.cpp178
-rw-r--r--moses/TranslationModel/CompactPT/BlockHashIndex.h285
-rw-r--r--moses/TranslationModel/CompactPT/CanonicalHuffman.h599
-rw-r--r--moses/TranslationModel/CompactPT/CmphStringVectorAdapter.cpp168
-rw-r--r--moses/TranslationModel/CompactPT/CmphStringVectorAdapter.h171
-rw-r--r--moses/TranslationModel/CompactPT/ConsistentPhrases.h197
-rw-r--r--moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp114
-rw-r--r--moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.h123
-rw-r--r--moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp267
-rw-r--r--moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h230
-rw-r--r--moses/TranslationModel/CompactPT/ListCoders.h603
-rw-r--r--moses/TranslationModel/CompactPT/MmapAllocator.h358
-rw-r--r--moses/TranslationModel/CompactPT/MonotonicVector.h399
-rw-r--r--moses/TranslationModel/CompactPT/MurmurHash3.cpp248
-rw-r--r--moses/TranslationModel/CompactPT/PackedArray.h320
-rw-r--r--moses/TranslationModel/CompactPT/PhraseDecoder.cpp323
-rw-r--r--moses/TranslationModel/CompactPT/PhraseDecoder.h214
-rw-r--r--moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp133
-rw-r--r--moses/TranslationModel/CompactPT/PhraseDictionaryCompact.h66
-rw-r--r--moses/TranslationModel/CompactPT/PhraseTableCreator.cpp785
-rw-r--r--moses/TranslationModel/CompactPT/PhraseTableCreator.h685
-rw-r--r--moses/TranslationModel/CompactPT/StringVector.h654
-rw-r--r--moses/TranslationModel/CompactPT/TargetPhraseCollectionCache.h238
-rw-r--r--moses/TranslationModel/CompactPT/ThrowingFwrite.cpp45
-rw-r--r--moses/TranslationModel/CompactPT/ThrowingFwrite.h42
-rw-r--r--moses/TranslationModel/DynSAInclude/RandLMCache.h321
-rw-r--r--moses/TranslationModel/DynSAInclude/RandLMFilter.h557
-rw-r--r--moses/TranslationModel/DynSAInclude/hash.h301
-rw-r--r--moses/TranslationModel/DynSAInclude/onlineRLM.h247
-rw-r--r--moses/TranslationModel/DynSAInclude/params.cpp122
-rw-r--r--moses/TranslationModel/DynSAInclude/params.h20
-rw-r--r--moses/TranslationModel/DynSAInclude/perfectHash.h173
-rw-r--r--moses/TranslationModel/DynSAInclude/quantizer.h25
-rw-r--r--moses/TranslationModel/DynSAInclude/vocab.cpp5
-rw-r--r--moses/TranslationModel/DynSAInclude/vocab.h2
-rw-r--r--moses/TranslationModel/DynSuffixArray.cpp14
-rw-r--r--moses/TranslationModel/PhraseDictionary.cpp17
-rw-r--r--moses/TranslationModel/PhraseDictionary.h4
-rw-r--r--moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp4
-rw-r--r--moses/TranslationModel/PhraseDictionaryMemory.cpp17
-rw-r--r--moses/TranslationModel/PhraseDictionaryMemory.h22
-rw-r--r--moses/TranslationModel/PhraseDictionaryMultiModel.cpp314
-rw-r--r--moses/TranslationModel/PhraseDictionaryMultiModel.h49
-rw-r--r--moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp477
-rw-r--r--moses/TranslationModel/PhraseDictionaryMultiModelCounts.h70
-rw-r--r--moses/TranslationModel/PhraseDictionaryNodeMemory.cpp11
-rw-r--r--moses/TranslationModel/PhraseDictionaryNodeMemory.h6
-rw-r--r--moses/TranslationModel/PhraseDictionaryTree.cpp13
-rw-r--r--moses/TranslationModel/PhraseDictionaryTree.h5
-rw-r--r--moses/TranslationModel/RuleTable/Loader.h18
-rw-r--r--moses/TranslationModel/RuleTable/LoaderCompact.cpp49
-rw-r--r--moses/TranslationModel/RuleTable/LoaderCompact.h13
-rw-r--r--moses/TranslationModel/RuleTable/LoaderFactory.cpp24
-rw-r--r--moses/TranslationModel/RuleTable/LoaderFactory.h8
-rw-r--r--moses/TranslationModel/RuleTable/LoaderHiero.cpp21
-rw-r--r--moses/TranslationModel/RuleTable/LoaderHiero.h3
-rw-r--r--moses/TranslationModel/RuleTable/LoaderStandard.cpp78
-rw-r--r--moses/TranslationModel/RuleTable/LoaderStandard.h8
-rw-r--r--moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.cpp14
-rw-r--r--moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.h16
-rw-r--r--moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp588
-rw-r--r--moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.h119
-rw-r--r--moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp2
-rw-r--r--moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.h2
-rw-r--r--moses/TranslationModel/RuleTable/Trie.cpp8
-rw-r--r--moses/TranslationModel/RuleTable/Trie.h21
-rw-r--r--moses/TranslationModel/RuleTable/UTrie.cpp6
-rw-r--r--moses/TranslationModel/RuleTable/UTrie.h20
-rw-r--r--moses/TranslationModel/RuleTable/UTrieNode.cpp8
-rw-r--r--moses/TranslationModel/RuleTable/UTrieNode.h51
-rw-r--r--moses/TranslationModel/Scope3Parser/ApplicableRuleTrie.cpp8
-rw-r--r--moses/TranslationModel/Scope3Parser/ApplicableRuleTrie.h13
-rw-r--r--moses/TranslationModel/Scope3Parser/IntermediateVarSpanNode.h13
-rw-r--r--moses/TranslationModel/Scope3Parser/Parser.cpp22
-rw-r--r--moses/TranslationModel/Scope3Parser/Parser.h47
-rw-r--r--moses/TranslationModel/Scope3Parser/SentenceMap.h6
-rw-r--r--moses/TranslationModel/Scope3Parser/StackLatticeBuilder.cpp16
-rw-r--r--moses/TranslationModel/Scope3Parser/StackLatticeBuilder.h2
-rw-r--r--moses/TranslationModel/Scope3Parser/StackLatticeSearcher.h20
-rw-r--r--moses/TranslationModel/Scope3Parser/VarSpanNode.h18
-rw-r--r--moses/TranslationModel/Scope3Parser/VarSpanTrieBuilder.cpp2
-rw-r--r--moses/TranslationModel/Scope3Parser/VarSpanTrieBuilder.h7
-rw-r--r--moses/TranslationModel/fuzzy-match/Alignments.cpp21
-rw-r--r--moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp1853
-rw-r--r--moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h23
-rw-r--r--moses/TranslationModel/fuzzy-match/Match.h21
-rw-r--r--moses/TranslationModel/fuzzy-match/SentenceAlignment.h16
-rw-r--r--moses/TranslationModel/fuzzy-match/SuffixArray.cpp360
-rw-r--r--moses/TranslationModel/fuzzy-match/SuffixArray.h84
-rw-r--r--moses/TranslationModel/fuzzy-match/Vocabulary.cpp19
-rw-r--r--moses/TranslationModel/fuzzy-match/Vocabulary.h12
-rw-r--r--moses/TranslationModel/fuzzy-match/create_xml.cpp25
-rw-r--r--moses/TranslationOption.cpp8
-rw-r--r--moses/TranslationOption.h12
-rw-r--r--moses/TranslationOptionCollection.cpp155
-rw-r--r--moses/TranslationOptionCollection.h2
-rw-r--r--moses/TranslationOptionCollectionConfusionNet.cpp4
-rw-r--r--moses/TreeInput.cpp9
-rw-r--r--moses/TrellisPath.cpp7
-rw-r--r--moses/TrellisPath.h60
-rw-r--r--moses/TypeDef.h5
-rw-r--r--moses/Util.cpp6
-rw-r--r--moses/Util.h40
-rw-r--r--moses/Word.cpp3
-rw-r--r--moses/Word.h5
-rw-r--r--moses/XmlOption.cpp10
-rw-r--r--moses/XmlOption.h4
275 files changed, 13740 insertions, 13548 deletions
diff --git a/moses/AlignmentInfo.cpp b/moses/AlignmentInfo.cpp
index 97eff59b5..178f3438a 100644
--- a/moses/AlignmentInfo.cpp
+++ b/moses/AlignmentInfo.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -46,17 +46,18 @@ void AlignmentInfo::BuildNonTermIndexMap()
m_nonTermIndexMap.resize(maxIndex+1, NOT_FOUND);
size_t i = 0;
for (p = begin(); p != end(); ++p) {
- if (m_nonTermIndexMap[p->second] != NOT_FOUND) {
- // 1-to-many. Definitely a set of terminals. Don't bother storing 1-to-1 index map
- m_nonTermIndexMap.clear();
- return;
- }
+ if (m_nonTermIndexMap[p->second] != NOT_FOUND) {
+ // 1-to-many. Definitely a set of terminals. Don't bother storing 1-to-1 index map
+ m_nonTermIndexMap.clear();
+ return;
+ }
m_nonTermIndexMap[p->second] = i++;
}
-
+
}
-bool compare_target(const std::pair<size_t,size_t> *a, const std::pair<size_t,size_t> *b) {
+bool compare_target(const std::pair<size_t,size_t> *a, const std::pair<size_t,size_t> *b)
+{
if(a->second < b->second) return true;
if(a->second == b->second) return (a->first < b->first);
return false;
@@ -66,32 +67,30 @@ bool compare_target(const std::pair<size_t,size_t> *a, const std::pair<size_t,si
std::vector< const std::pair<size_t,size_t>* > AlignmentInfo::GetSortedAlignments() const
{
std::vector< const std::pair<size_t,size_t>* > ret;
-
+
CollType::const_iterator iter;
- for (iter = m_collection.begin(); iter != m_collection.end(); ++iter)
- {
+ for (iter = m_collection.begin(); iter != m_collection.end(); ++iter) {
const std::pair<size_t,size_t> &alignPair = *iter;
ret.push_back(&alignPair);
}
-
+
const StaticData &staticData = StaticData::Instance();
WordAlignmentSort wordAlignmentSort = staticData.GetWordAlignmentSort();
-
- switch (wordAlignmentSort)
- {
- case NoSort:
- break;
-
- case TargetOrder:
- std::sort(ret.begin(), ret.end(), compare_target);
- break;
-
- default:
- CHECK(false);
+
+ switch (wordAlignmentSort) {
+ case NoSort:
+ break;
+
+ case TargetOrder:
+ std::sort(ret.begin(), ret.end(), compare_target);
+ break;
+
+ default:
+ CHECK(false);
}
-
+
return ret;
-
+
}
std::vector<size_t> AlignmentInfo::GetSourceIndex2PosMap() const
diff --git a/moses/AlignmentInfo.h b/moses/AlignmentInfo.h
index db92791aa..76d4d918a 100644
--- a/moses/AlignmentInfo.h
+++ b/moses/AlignmentInfo.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -42,16 +42,20 @@ class AlignmentInfo
friend struct AlignmentInfoHasher;
friend class AlignmentInfoCollection;
- public:
+public:
typedef std::set<std::pair<size_t,size_t> > CollType;
typedef std::vector<size_t> NonTermIndexMap;
typedef CollType::const_iterator const_iterator;
- const_iterator begin() const { return m_collection.begin(); }
- const_iterator end() const { return m_collection.end(); }
+ const_iterator begin() const {
+ return m_collection.begin();
+ }
+ const_iterator end() const {
+ return m_collection.end();
+ }
void Add(size_t sourcePos, size_t targetPos) {
- m_collection.insert(std::pair<size_t, size_t>(sourcePos, targetPos));
+ m_collection.insert(std::pair<size_t, size_t>(sourcePos, targetPos));
}
/** Provides a map from target-side to source-side non-terminal indices.
* The target-side index should be the rule symbol index (COUNTING terminals).
@@ -64,20 +68,21 @@ class AlignmentInfo
const CollType &GetAlignments() const {
return m_collection;
}
-
- size_t GetSize() const { return m_collection.size(); }
+
+ size_t GetSize() const {
+ return m_collection.size();
+ }
std::vector< const std::pair<size_t,size_t>* > GetSortedAlignments() const;
std::vector<size_t> GetSourceIndex2PosMap() const;
- bool operator==(const AlignmentInfo& rhs) const
- {
+ bool operator==(const AlignmentInfo& rhs) const {
return m_collection == rhs.m_collection &&
m_nonTermIndexMap == rhs.m_nonTermIndexMap;
}
-
- private:
+
+private:
//! AlignmentInfo objects should only be created by an AlignmentInfoCollection
explicit AlignmentInfo(const std::set<std::pair<size_t,size_t> > &pairs);
@@ -90,25 +95,21 @@ class AlignmentInfo
/** Define an arbitrary strict weak ordering between AlignmentInfo objects
* for use by AlignmentInfoCollection.
*/
-struct AlignmentInfoOrderer
-{
+struct AlignmentInfoOrderer {
bool operator()(const AlignmentInfo &a, const AlignmentInfo &b) const {
- if (a.m_collection == b.m_collection) {
- return a.m_nonTermIndexMap < b.m_nonTermIndexMap;
- }
- else {
- return a.m_collection < b.m_collection;
- }
+ if (a.m_collection == b.m_collection) {
+ return a.m_nonTermIndexMap < b.m_nonTermIndexMap;
+ } else {
+ return a.m_collection < b.m_collection;
+ }
}
};
-/**
+/**
* Hashing functoid
**/
-struct AlignmentInfoHasher
-{
- size_t operator()(const AlignmentInfo& a) const
- {
+struct AlignmentInfoHasher {
+ size_t operator()(const AlignmentInfo& a) const {
size_t seed = 0;
boost::hash_combine(seed,a.m_collection);
boost::hash_combine(seed,a.m_nonTermIndexMap);
@@ -117,7 +118,8 @@ struct AlignmentInfoHasher
};
-inline size_t hash_value(const AlignmentInfo& a) {
+inline size_t hash_value(const AlignmentInfo& a)
+{
static AlignmentInfoHasher hasher;
return hasher(a);
}
diff --git a/moses/AlignmentInfoCollection.cpp b/moses/AlignmentInfoCollection.cpp
index 53b83d8cd..ef6e62eb3 100644
--- a/moses/AlignmentInfoCollection.cpp
+++ b/moses/AlignmentInfoCollection.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -39,7 +39,7 @@ const AlignmentInfo &AlignmentInfoCollection::GetEmptyAlignmentInfo() const
}
const AlignmentInfo *AlignmentInfoCollection::Add(
- const std::set<std::pair<size_t,size_t> > &pairs)
+ const std::set<std::pair<size_t,size_t> > &pairs)
{
AlignmentInfo pairsAlignmentInfo(pairs);
#ifdef WITH_THREADS
diff --git a/moses/AlignmentInfoCollection.h b/moses/AlignmentInfoCollection.h
index 6185b32a9..37d717b0f 100644
--- a/moses/AlignmentInfoCollection.h
+++ b/moses/AlignmentInfoCollection.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -36,8 +36,10 @@ namespace Moses
*/
class AlignmentInfoCollection
{
- public:
- static AlignmentInfoCollection &Instance() { return s_instance; }
+public:
+ static AlignmentInfoCollection &Instance() {
+ return s_instance;
+ }
/** Returns a pointer to an AlignmentInfo object with the same source-target
* alignment pairs as given in the argument. If the collection already
@@ -49,7 +51,7 @@ class AlignmentInfoCollection
//! Returns a pointer to an empty AlignmentInfo object.
const AlignmentInfo &GetEmptyAlignmentInfo() const;
- private:
+private:
typedef std::set<AlignmentInfo, AlignmentInfoOrderer> AlignmentInfoSet;
//! Only a single static variable should be created.
@@ -62,7 +64,7 @@ class AlignmentInfoCollection
//reader-writer lock
mutable boost::shared_mutex m_accessLock;
#endif
-
+
AlignmentInfoSet m_collection;
const AlignmentInfo *m_emptyAlignmentInfo;
};
diff --git a/moses/AlignmentInfoTest.cpp b/moses/AlignmentInfoTest.cpp
index 48c88db65..26127f3cf 100644
--- a/moses/AlignmentInfoTest.cpp
+++ b/moses/AlignmentInfoTest.cpp
@@ -35,8 +35,7 @@ struct AlignmentInfoFixture {
const AlignmentInfo* ai2;
const AlignmentInfo* ai3;
- AlignmentInfoFixture()
- {
+ AlignmentInfoFixture() {
AlignmentInfoCollection& collection = AlignmentInfoCollection::Instance();
IndexSet aligns1,aligns2,aligns3;
aligns1.insert(IndexPair(1,1));
diff --git a/moses/BitmapContainer.cpp b/moses/BitmapContainer.cpp
index 7e8d470ee..64dd9081b 100644
--- a/moses/BitmapContainer.cpp
+++ b/moses/BitmapContainer.cpp
@@ -275,11 +275,11 @@ BitmapContainer::~BitmapContainer()
// As we have created the square position objects we clean up now.
while (!m_queue.empty()) {
- HypothesisQueueItem *item = m_queue.top();
- m_queue.pop();
+ HypothesisQueueItem *item = m_queue.top();
+ m_queue.pop();
- FREEHYPO( item->GetHypothesis() );
- delete item;
+ FREEHYPO( item->GetHypothesis() );
+ delete item;
}
// Delete all edges.
diff --git a/moses/ChartCell.cpp b/moses/ChartCell.cpp
index fd163450e..b57a4ab36 100644
--- a/moses/ChartCell.cpp
+++ b/moses/ChartCell.cpp
@@ -45,17 +45,18 @@ ChartCellBase::~ChartCellBase() {}
/** Constructor
* \param startPos endPos range of this cell
- * \param manager pointer back to the manager
+ * \param manager pointer back to the manager
*/
ChartCell::ChartCell(size_t startPos, size_t endPos, ChartManager &manager) :
- ChartCellBase(startPos, endPos), m_manager(manager) {
+ ChartCellBase(startPos, endPos), m_manager(manager)
+{
const StaticData &staticData = StaticData::Instance();
m_nBestIsEnabled = staticData.IsNBestEnabled();
}
ChartCell::~ChartCell() {}
-/** Add the given hypothesis to the cell.
+/** Add the given hypothesis to the cell.
* Returns true if added, false if not. Maybe it already exists in the collection or score falls below threshold etc.
* This function just calls the correspondind AddHypothesis() in ChartHypothesisCollection
* \param hypo Hypothesis to be added
@@ -98,8 +99,7 @@ void ChartCell::ProcessSentence(const ChartTranslationOptionList &transOptList
// pluck things out of queue and add to hypo collection
const size_t popLimit = staticData.GetCubePruningPopLimit();
- for (size_t numPops = 0; numPops < popLimit && !queue.IsEmpty(); ++numPops)
- {
+ for (size_t numPops = 0; numPops < popLimit && !queue.IsEmpty(); ++numPops) {
ChartHypothesis *hypo = queue.Pop();
AddHypothesis(hypo);
}
@@ -179,15 +179,15 @@ size_t ChartCell::GetSize() const
const HypoList *ChartCell::GetAllSortedHypotheses() const
{
- HypoList *ret = new HypoList();
+ HypoList *ret = new HypoList();
- MapType::const_iterator iter;
- for (iter = m_hypoColl.begin(); iter != m_hypoColl.end(); ++iter) {
- const ChartHypothesisCollection &coll = iter->second;
- const HypoList &list = coll.GetSortedHypotheses();
+ MapType::const_iterator iter;
+ for (iter = m_hypoColl.begin(); iter != m_hypoColl.end(); ++iter) {
+ const ChartHypothesisCollection &coll = iter->second;
+ const HypoList &list = coll.GetSortedHypotheses();
std::copy(list.begin(), list.end(), std::inserter(*ret, ret->end()));
- }
- return ret;
+ }
+ return ret;
}
//! call GetSearchGraph() for each hypo collection
diff --git a/moses/ChartCell.h b/moses/ChartCell.h
index 14ac8e3b4..1fed695ac 100644
--- a/moses/ChartCell.h
+++ b/moses/ChartCell.h
@@ -44,35 +44,43 @@ class ChartTranslationOptionList;
class ChartCellCollection;
class ChartManager;
-class ChartCellBase {
- public:
- ChartCellBase(size_t startPos, size_t endPos);
+class ChartCellBase
+{
+public:
+ ChartCellBase(size_t startPos, size_t endPos);
- virtual ~ChartCellBase();
+ virtual ~ChartCellBase();
- const ChartCellLabelSet &GetTargetLabelSet() const { return m_targetLabelSet; }
+ const ChartCellLabelSet &GetTargetLabelSet() const {
+ return m_targetLabelSet;
+ }
- ChartCellLabelSet &MutableTargetLabelSet() { return m_targetLabelSet; }
+ ChartCellLabelSet &MutableTargetLabelSet() {
+ return m_targetLabelSet;
+ }
- const WordsRange &GetCoverage() const { return m_coverage; }
+ const WordsRange &GetCoverage() const {
+ return m_coverage;
+ }
- protected:
- const WordsRange m_coverage;
- ChartCellLabelSet m_targetLabelSet;
+protected:
+ const WordsRange m_coverage;
+ ChartCellLabelSet m_targetLabelSet;
};
/** 1 cell in chart decoder.
* Doesn't directly hold hypotheses. Each cell contain a map of ChartHypothesisCollection that have different constituent labels
*/
-class ChartCell : public ChartCellBase {
+class ChartCell : public ChartCellBase
+{
friend std::ostream& operator<<(std::ostream&, const ChartCell&);
public:
#if defined(BOOST_VERSION) && (BOOST_VERSION >= 104200)
typedef boost::unordered_map<Word,
- ChartHypothesisCollection,
- NonTerminalHasher,
- NonTerminalEqualityPred
- > MapType;
+ ChartHypothesisCollection,
+ NonTerminalHasher,
+ NonTerminalEqualityPred
+ > MapType;
#else
typedef std::map<Word, ChartHypothesisCollection> MapType;
#endif
@@ -91,8 +99,7 @@ public:
,const ChartCellCollection &allChartCells);
//! Get all hypotheses in the cell that have the specified constituent label
- const HypoList *GetSortedHypotheses(const Word &constituentLabel) const
- {
+ const HypoList *GetSortedHypotheses(const Word &constituentLabel) const {
MapType::const_iterator p = m_hypoColl.find(constituentLabel);
return (p == m_hypoColl.end()) ? NULL : &(p->second.GetSortedHypotheses());
}
diff --git a/moses/ChartCellCollection.cpp b/moses/ChartCellCollection.cpp
index a34687f59..46392261d 100644
--- a/moses/ChartCellCollection.cpp
+++ b/moses/ChartCellCollection.cpp
@@ -23,24 +23,27 @@
#include "InputType.h"
#include "WordsRange.h"
-namespace Moses {
+namespace Moses
+{
-ChartCellCollectionBase::~ChartCellCollectionBase() {
+ChartCellCollectionBase::~ChartCellCollectionBase()
+{
m_source.clear();
- for (std::vector<std::vector<ChartCellBase*> >::iterator i = m_cells.begin(); i != m_cells.end(); ++i)
+ for (std::vector<std::vector<ChartCellBase*> >::iterator i = m_cells.begin(); i != m_cells.end(); ++i)
RemoveAllInColl(*i);
}
-class CubeCellFactory {
- public:
- explicit CubeCellFactory(ChartManager &manager) : m_manager(manager) {}
+class CubeCellFactory
+{
+public:
+ explicit CubeCellFactory(ChartManager &manager) : m_manager(manager) {}
- ChartCell *operator()(size_t start, size_t end) const {
- return new ChartCell(start, end, m_manager);
- }
+ ChartCell *operator()(size_t start, size_t end) const {
+ return new ChartCell(start, end, m_manager);
+ }
- private:
- ChartManager &m_manager;
+private:
+ ChartManager &m_manager;
};
/** Costructor
diff --git a/moses/ChartCellCollection.h b/moses/ChartCellCollection.h
index 7532503d7..d0423b0b2 100644
--- a/moses/ChartCellCollection.h
+++ b/moses/ChartCellCollection.h
@@ -31,57 +31,59 @@ namespace Moses
class InputType;
class ChartManager;
-class ChartCellCollectionBase {
- public:
- template <class Factory> ChartCellCollectionBase(const InputType &input, const Factory &factory) :
- m_cells(input.GetSize()) {
- size_t size = input.GetSize();
- for (size_t startPos = 0; startPos < size; ++startPos) {
- std::vector<ChartCellBase*> &inner = m_cells[startPos];
- inner.reserve(size - startPos);
- for (size_t endPos = startPos; endPos < size; ++endPos) {
- inner.push_back(factory(startPos, endPos));
- }
- /* Hack: ChartCellLabel shouldn't need to know its span, but the parser
- * gets it from there :-(. The span is actually stored as a reference,
- * which needs to point somewhere, so I have it refer to the ChartCell.
- */
- m_source.push_back(new ChartCellLabel(inner[0]->GetCoverage(), input.GetWord(startPos)));
+class ChartCellCollectionBase
+{
+public:
+ template <class Factory> ChartCellCollectionBase(const InputType &input, const Factory &factory) :
+ m_cells(input.GetSize()) {
+ size_t size = input.GetSize();
+ for (size_t startPos = 0; startPos < size; ++startPos) {
+ std::vector<ChartCellBase*> &inner = m_cells[startPos];
+ inner.reserve(size - startPos);
+ for (size_t endPos = startPos; endPos < size; ++endPos) {
+ inner.push_back(factory(startPos, endPos));
}
+ /* Hack: ChartCellLabel shouldn't need to know its span, but the parser
+ * gets it from there :-(. The span is actually stored as a reference,
+ * which needs to point somewhere, so I have it refer to the ChartCell.
+ */
+ m_source.push_back(new ChartCellLabel(inner[0]->GetCoverage(), input.GetWord(startPos)));
}
+ }
- virtual ~ChartCellCollectionBase();
+ virtual ~ChartCellCollectionBase();
- const ChartCellBase &GetBase(const WordsRange &coverage) const {
- return *m_cells[coverage.GetStartPos()][coverage.GetEndPos() - coverage.GetStartPos()];
- }
+ const ChartCellBase &GetBase(const WordsRange &coverage) const {
+ return *m_cells[coverage.GetStartPos()][coverage.GetEndPos() - coverage.GetStartPos()];
+ }
- ChartCellBase &MutableBase(const WordsRange &coverage) {
- return *m_cells[coverage.GetStartPos()][coverage.GetEndPos() - coverage.GetStartPos()];
- }
+ ChartCellBase &MutableBase(const WordsRange &coverage) {
+ return *m_cells[coverage.GetStartPos()][coverage.GetEndPos() - coverage.GetStartPos()];
+ }
- const ChartCellLabel &GetSourceWordLabel(size_t at) const {
- return m_source[at];
- }
+ const ChartCellLabel &GetSourceWordLabel(size_t at) const {
+ return m_source[at];
+ }
- private:
- std::vector<std::vector<ChartCellBase*> > m_cells;
+private:
+ std::vector<std::vector<ChartCellBase*> > m_cells;
- boost::ptr_vector<ChartCellLabel> m_source;
+ boost::ptr_vector<ChartCellLabel> m_source;
};
/** Hold all the chart cells for 1 input sentence. A variable of this type is held by the ChartManager
*/
-class ChartCellCollection : public ChartCellCollectionBase {
- public:
- ChartCellCollection(const InputType &input, ChartManager &manager);
+class ChartCellCollection : public ChartCellCollectionBase
+{
+public:
+ ChartCellCollection(const InputType &input, ChartManager &manager);
//! get a chart cell for a particular range
ChartCell &Get(const WordsRange &coverage) {
return static_cast<ChartCell&>(MutableBase(coverage));
}
-
+
//! get a chart cell for a particular range
const ChartCell &Get(const WordsRange &coverage) const {
return static_cast<const ChartCell&>(GetBase(coverage));
diff --git a/moses/ChartCellLabel.h b/moses/ChartCellLabel.h
index 218a512c0..ad6e3565d 100644
--- a/moses/ChartCellLabel.h
+++ b/moses/ChartCellLabel.h
@@ -23,7 +23,10 @@
#include "Word.h"
#include "WordsRange.h"
-namespace search { class Vertex; }
+namespace search
+{
+class Vertex;
+}
namespace Moses
{
@@ -31,17 +34,17 @@ namespace Moses
class Word;
/** Contains a range, word (non-terms?) and a vector of hypotheses.
- * @todo This is probably incompatible with lattice decoding when the word that spans
+ * @todo This is probably incompatible with lattice decoding when the word that spans
* a position (or positions) can vary.
* @todo is this to hold sorted hypotheses that are in the queue for creating the next hypos?
*/
class ChartCellLabel
{
- public:
+public:
union Stack {
const HypoList *cube; // cube pruning
- search::Vertex *incr; // incremental search after filling.
- void *incr_generator; // incremental search during filling.
+ search::Vertex *incr; // incremental search after filling.
+ void *incr_generator; // incremental search during filling.
};
@@ -52,13 +55,20 @@ class ChartCellLabel
, m_stack(stack)
{}
- const WordsRange &GetCoverage() const { return m_coverage; }
- const Word &GetLabel() const { return m_label; }
- Stack GetStack() const { return m_stack; }
- Stack &MutableStack() { return m_stack; }
+ const WordsRange &GetCoverage() const {
+ return m_coverage;
+ }
+ const Word &GetLabel() const {
+ return m_label;
+ }
+ Stack GetStack() const {
+ return m_stack;
+ }
+ Stack &MutableStack() {
+ return m_stack;
+ }
- bool operator<(const ChartCellLabel &other) const
- {
+ bool operator<(const ChartCellLabel &other) const {
// m_coverage and m_label uniquely identify a ChartCellLabel, so don't
// need to compare m_stack.
if (m_coverage == other.m_coverage) {
@@ -67,7 +77,7 @@ class ChartCellLabel
return m_coverage < other.m_coverage;
}
- private:
+private:
const WordsRange &m_coverage;
const Word &m_label;
Stack m_stack;
diff --git a/moses/ChartCellLabelSet.h b/moses/ChartCellLabelSet.h
index 5ea192e51..68c8b4263 100644
--- a/moses/ChartCellLabelSet.h
+++ b/moses/ChartCellLabelSet.h
@@ -35,46 +35,55 @@ class ChartHypothesisCollection;
*/
class ChartCellLabelSet
{
- private:
+private:
#if defined(BOOST_VERSION) && (BOOST_VERSION >= 104200)
typedef boost::unordered_map<Word, ChartCellLabel,
- NonTerminalHasher, NonTerminalEqualityPred
- > MapType;
+ NonTerminalHasher, NonTerminalEqualityPred
+ > MapType;
#else
typedef std::map<Word, ChartCellLabel> MapType;
#endif
- public:
+public:
typedef MapType::const_iterator const_iterator;
typedef MapType::iterator iterator;
ChartCellLabelSet(const WordsRange &coverage) : m_coverage(coverage) {}
- const_iterator begin() const { return m_map.begin(); }
- const_iterator end() const { return m_map.end(); }
-
- iterator mutable_begin() { return m_map.begin(); }
- iterator mutable_end() { return m_map.end(); }
+ const_iterator begin() const {
+ return m_map.begin();
+ }
+ const_iterator end() const {
+ return m_map.end();
+ }
- void AddWord(const Word &w)
- {
+ iterator mutable_begin() {
+ return m_map.begin();
+ }
+ iterator mutable_end() {
+ return m_map.end();
+ }
+
+ void AddWord(const Word &w) {
m_map.insert(std::make_pair(w, ChartCellLabel(m_coverage, w)));
}
- // Stack is a HypoList or whatever the search algorithm uses.
- void AddConstituent(const Word &w, const HypoList *stack)
- {
+ // Stack is a HypoList or whatever the search algorithm uses.
+ void AddConstituent(const Word &w, const HypoList *stack) {
ChartCellLabel::Stack s;
s.cube = stack;
m_map.insert(std::make_pair(w, ChartCellLabel(m_coverage, w, s)));
}
- bool Empty() const { return m_map.empty(); }
+ bool Empty() const {
+ return m_map.empty();
+ }
- size_t GetSize() const { return m_map.size(); }
+ size_t GetSize() const {
+ return m_map.size();
+ }
- const ChartCellLabel *Find(const Word &w) const
- {
+ const ChartCellLabel *Find(const Word &w) const {
MapType::const_iterator p = m_map.find(w);
return p == m_map.end() ? 0 : &(p->second);
}
@@ -83,7 +92,7 @@ class ChartCellLabelSet
return m_map.insert(std::make_pair(w, ChartCellLabel(m_coverage, w))).first->second.MutableStack();
}
- private:
+private:
const WordsRange &m_coverage;
MapType m_map;
};
diff --git a/moses/ChartHypothesis.cpp b/moses/ChartHypothesis.cpp
index c7c1047f1..ce5a318ac 100644
--- a/moses/ChartHypothesis.cpp
+++ b/moses/ChartHypothesis.cpp
@@ -39,7 +39,7 @@ namespace Moses
ObjectPool<ChartHypothesis> ChartHypothesis::s_objectPool("ChartHypothesis", 300000);
#endif
-/** Create a hypothesis from a rule
+/** Create a hypothesis from a rule
* \param transOpt wrapper around the rule
* \param item @todo dunno
* \param manager reference back to manager
@@ -59,15 +59,14 @@ ChartHypothesis::ChartHypothesis(const ChartTranslationOptions &transOpt,
const std::vector<HypothesisDimension> &childEntries = item.GetHypothesisDimensions();
m_prevHypos.reserve(childEntries.size());
std::vector<HypothesisDimension>::const_iterator iter;
- for (iter = childEntries.begin(); iter != childEntries.end(); ++iter)
- {
+ for (iter = childEntries.begin(); iter != childEntries.end(); ++iter) {
m_prevHypos.push_back(iter->GetHypothesis());
}
}
ChartHypothesis::~ChartHypothesis()
{
- // delete feature function states
+ // delete feature function states
for (unsigned i = 0; i < m_ffStates.size(); ++i) {
delete m_ffStates[i];
}
@@ -98,8 +97,7 @@ void ChartHypothesis::CreateOutputPhrase(Phrase &outPhrase) const
size_t nonTermInd = GetCurrTargetPhrase().GetAlignNonTerm().GetNonTermIndexMap()[pos];
const ChartHypothesis *prevHypo = m_prevHypos[nonTermInd];
prevHypo->CreateOutputPhrase(outPhrase);
- }
- else {
+ } else {
outPhrase.AddWord(word);
}
}
@@ -124,17 +122,16 @@ Phrase ChartHypothesis::GetOutputPhrase() const
*/
int ChartHypothesis::RecombineCompare(const ChartHypothesis &compare) const
{
- int comp = 0;
+ int comp = 0;
- for (unsigned i = 0; i < m_ffStates.size(); ++i)
- {
- if (m_ffStates[i] == NULL || compare.m_ffStates[i] == NULL)
+ for (unsigned i = 0; i < m_ffStates.size(); ++i) {
+ if (m_ffStates[i] == NULL || compare.m_ffStates[i] == NULL)
comp = m_ffStates[i] - compare.m_ffStates[i];
- else
+ else
comp = m_ffStates[i]->Compare(*compare.m_ffStates[i]);
- if (comp != 0)
- return comp;
+ if (comp != 0)
+ return comp;
}
return 0;
@@ -161,16 +158,16 @@ void ChartHypothesis::CalcScore()
//Add pre-computed features
m_manager.InsertPreCalculatedScores(GetCurrTargetPhrase(), &m_scoreBreakdown);
- // compute values of stateless feature functions that were not
+ // compute values of stateless feature functions that were not
// cached in the translation option-- there is no principled distinction
const std::vector<const StatelessFeatureFunction*>& sfs =
- StatelessFeatureFunction::GetStatelessFeatureFunctions();
+ StatelessFeatureFunction::GetStatelessFeatureFunctions();
for (unsigned i = 0; i < sfs.size(); ++i) {
sfs[i]->EvaluateChart(ChartBasedFeatureContext(this),&m_scoreBreakdown);
}
const std::vector<const StatefulFeatureFunction*>& ffs =
- StatefulFeatureFunction::GetStatefulFeatureFunctions();
+ StatefulFeatureFunction::GetStatefulFeatureFunctions();
for (unsigned i = 0; i < ffs.size(); ++i)
m_ffStates[i] = ffs[i]->EvaluateChart(*this,i,&m_scoreBreakdown);
@@ -262,13 +259,12 @@ std::ostream& operator<<(std::ostream& out, const ChartHypothesis& hypo)
{
out << hypo.GetId();
-
- // recombination
- if (hypo.GetWinningHypothesis() != NULL &&
- hypo.GetWinningHypothesis() != &hypo)
- {
- out << "->" << hypo.GetWinningHypothesis()->GetId();
- }
+
+ // recombination
+ if (hypo.GetWinningHypothesis() != NULL &&
+ hypo.GetWinningHypothesis() != &hypo) {
+ out << "->" << hypo.GetWinningHypothesis()->GetId();
+ }
if (StaticData::Instance().GetIncludeLHSInSearchGraph()) {
out << " " << hypo.GetTargetLHS() << "=>";
diff --git a/moses/ChartHypothesis.h b/moses/ChartHypothesis.h
index 9dc1cba92..61c2faae1 100644
--- a/moses/ChartHypothesis.h
+++ b/moses/ChartHypothesis.h
@@ -52,7 +52,7 @@ protected:
const TargetPhrase &m_targetPhrase;
WordsRange m_currSourceWordsRange;
- std::vector<const FFState*> m_ffStates; /*! stateful feature function states */
+ std::vector<const FFState*> m_ffStates; /*! stateful feature function states */
ScoreComponentCollection m_scoreBreakdown /*! detailed score break-down by components (for instance language model, word penalty, etc) */
,m_lmNGram
,m_lmPrefix;
@@ -68,8 +68,8 @@ protected:
unsigned m_id; /* pkoehn wants to log the order in which hypotheses were generated */
//! not implemented
- ChartHypothesis();
-
+ ChartHypothesis();
+
//! not implemented
ChartHypothesis(const ChartHypothesis &copy);
@@ -96,35 +96,39 @@ public:
~ChartHypothesis();
- unsigned GetId() const { return m_id; }
+ unsigned GetId() const {
+ return m_id;
+ }
//! Get the rule that created this hypothesis
const TargetPhrase &GetCurrTargetPhrase()const {
return m_targetPhrase;
}
-
+
//! the source range that this hypothesis spans
const WordsRange &GetCurrSourceRange()const {
return m_currSourceWordsRange;
}
-
+
//! the arc list when creating n-best lists
inline const ChartArcList* GetArcList() const {
return m_arcList;
}
-
+
//! the feature function states for a particular feature \param featureID
- inline const FFState* GetFFState( size_t featureID ) const {
- return m_ffStates[ featureID ];
- }
-
+ inline const FFState* GetFFState( size_t featureID ) const {
+ return m_ffStates[ featureID ];
+ }
+
//! reference back to the manager
- inline const ChartManager& GetManager() const { return m_manager; }
+ inline const ChartManager& GetManager() const {
+ return m_manager;
+ }
void CreateOutputPhrase(Phrase &outPhrase) const;
Phrase GetOutputPhrase() const;
- int RecombineCompare(const ChartHypothesis &compare) const;
+ int RecombineCompare(const ChartHypothesis &compare) const;
void CalcScore();
@@ -133,30 +137,34 @@ public:
void SetWinningHypo(const ChartHypothesis *hypo);
//! get the unweighted score for each feature function
- const ScoreComponentCollection &GetScoreBreakdown() const
- { return m_scoreBreakdown; }
-
+ const ScoreComponentCollection &GetScoreBreakdown() const {
+ return m_scoreBreakdown;
+ }
+
//! Get the weighted total score
- float GetTotalScore() const
- { return m_totalScore; }
+ float GetTotalScore() const {
+ return m_totalScore;
+ }
- //! vector of previous hypotheses this hypo is built on
- const std::vector<const ChartHypothesis*> &GetPrevHypos() const
- { return m_prevHypos; }
+ //! vector of previous hypotheses this hypo is built on
+ const std::vector<const ChartHypothesis*> &GetPrevHypos() const {
+ return m_prevHypos;
+ }
//! get a particular previous hypos
- const ChartHypothesis* GetPrevHypo(size_t pos) const {
- return m_prevHypos[pos];
- }
-
+ const ChartHypothesis* GetPrevHypo(size_t pos) const {
+ return m_prevHypos[pos];
+ }
+
//! get the constituency label that covers this hypo
const Word &GetTargetLHS() const {
return GetCurrTargetPhrase().GetTargetLHS();
}
//! get the best hypo in the arc list when doing n-best list creation. It's either this hypothesis, or the best hypo is this hypo is in the arc list
- const ChartHypothesis* GetWinningHypothesis() const
- { return m_winningHypo; }
+ const ChartHypothesis* GetWinningHypothesis() const {
+ return m_winningHypo;
+ }
TO_STRING();
diff --git a/moses/ChartHypothesisCollection.cpp b/moses/ChartHypothesisCollection.cpp
index 752bb7f6c..3b80f68dc 100644
--- a/moses/ChartHypothesisCollection.cpp
+++ b/moses/ChartHypothesisCollection.cpp
@@ -51,7 +51,7 @@ ChartHypothesisCollection::~ChartHypothesisCollection()
//RemoveAllInColl(m_hypos);
}
-/** public function to add hypothesis to this collection.
+/** public function to add hypothesis to this collection.
* Returns false if equiv hypo exists in collection, otherwise returns true.
* Takes care of update arc list for n-best list creation.
* Will delete hypo is it exist - once this function is call don't delete hypothesis.
@@ -108,8 +108,7 @@ bool ChartHypothesisCollection::AddHypothesis(ChartHypothesis *hypo, ChartManage
VERBOSE(3,"worse than matching hyp " << hypoExisting->GetId() << ", recombining" << std::endl)
if (m_nBestIsEnabled) {
hypoExisting->AddArc(hypo);
- }
- else {
+ } else {
ChartHypothesis::Delete(hypo);
}
return false;
@@ -146,7 +145,7 @@ pair<ChartHypothesisCollection::HCType::iterator, bool> ChartHypothesisCollectio
return ret;
}
-/** Remove hypothesis pointed to by iterator but DOES NOT delete the object.
+/** Remove hypothesis pointed to by iterator but DOES NOT delete the object.
* \param iter iterator to delete
*/
void ChartHypothesisCollection::Detach(const HCType::iterator &iter)
diff --git a/moses/ChartHypothesisCollection.h b/moses/ChartHypothesisCollection.h
index f88cb8302..fa707b46d 100644
--- a/moses/ChartHypothesisCollection.h
+++ b/moses/ChartHypothesisCollection.h
@@ -46,7 +46,7 @@ public:
bool operator()(const ChartHypothesis* hypoA, const ChartHypothesis* hypoB) const {
// assert in same cell
const WordsRange &rangeA = hypoA->GetCurrSourceRange()
- , &rangeB = hypoB->GetCurrSourceRange();
+ , &rangeB = hypoB->GetCurrSourceRange();
CHECK(rangeA == rangeB);
// shouldn't be mixing hypos with different lhs
@@ -115,7 +115,9 @@ public:
}
//! return the best total score of all hypos in this collection
- float GetBestScore() const { return m_bestScore; }
+ float GetBestScore() const {
+ return m_bestScore;
+ }
void GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream, const std::map<unsigned,bool> &reachable) const;
diff --git a/moses/ChartManager.cpp b/moses/ChartManager.cpp
index fc4865af7..98f0e17f3 100644
--- a/moses/ChartManager.cpp
+++ b/moses/ChartManager.cpp
@@ -124,10 +124,13 @@ void ChartManager::ProcessSentence()
* Doesn't seem to do anything about walls and zones.
* @todo check walls & zones. Check that the implementation doesn't leak, xml options sometimes does if you're not careful
*/
-void ChartManager::AddXmlChartOptions() {
+void ChartManager::AddXmlChartOptions()
+{
const StaticData &staticData = StaticData::Instance();
const std::vector <ChartTranslationOptions*> xmlChartOptionsList = m_source.GetXmlChartTranslationOptions();
- IFVERBOSE(2) { cerr << "AddXmlChartOptions " << xmlChartOptionsList.size() << endl; }
+ IFVERBOSE(2) {
+ cerr << "AddXmlChartOptions " << xmlChartOptionsList.size() << endl;
+ }
if (xmlChartOptionsList.size() == 0) return;
for(std::vector<ChartTranslationOptions*>::const_iterator i = xmlChartOptionsList.begin();
@@ -160,12 +163,12 @@ const ChartHypothesis *ChartManager::GetBestHypothesis() const
}
}
- /** Calculate the n-best paths through the output hypergraph.
- * Return the list of paths with the variable ret
- * \param count how may paths to return
- * \param ret return argument
- * \param onlyDistinct whether to check for distinct output sentence or not (default - don't check, just return top n-paths)
- */
+/** Calculate the n-best paths through the output hypergraph.
+ * Return the list of paths with the variable ret
+ * \param count how may paths to return
+ * \param ret return argument
+ * \param onlyDistinct whether to check for distinct output sentence or not (default - don't check, just return top n-paths)
+ */
void ChartManager::CalcNBest(size_t count, ChartTrellisPathList &ret,bool onlyDistinct) const
{
size_t size = m_source.GetSize();
@@ -184,7 +187,7 @@ void ChartManager::CalcNBest(size_t count, ChartTrellisPathList &ret,bool onlyDi
// Add it to the n-best list.
if (count == 1) {
- ret.Add(basePath);
+ ret.Add(basePath);
return;
}
@@ -210,21 +213,21 @@ void ChartManager::CalcNBest(size_t count, ChartTrellisPathList &ret,bool onlyDi
// Get all complete translations
const HypoList *topHypos = lastCell.GetAllSortedHypotheses();
-
+
// Create a ChartTrellisDetour for each complete translation and add it to the queue
HypoList::const_iterator iter;
for (iter = topHypos->begin(); iter != topHypos->end(); ++iter) {
- const ChartHypothesis &hypo = **iter;
- boost::shared_ptr<ChartTrellisPath> basePath(new ChartTrellisPath(hypo));
- ChartTrellisDetour *detour = new ChartTrellisDetour(basePath, basePath->GetFinalNode(), hypo);
- contenders.Push(detour);
+ const ChartHypothesis &hypo = **iter;
+ boost::shared_ptr<ChartTrellisPath> basePath(new ChartTrellisPath(hypo));
+ ChartTrellisDetour *detour = new ChartTrellisDetour(basePath, basePath->GetFinalNode(), hypo);
+ contenders.Push(detour);
}
-
+
delete topHypos;
// Record the output phrase if distinct translations are required.
set<Phrase> distinctHyps;
-
+
// MAIN loop
for (size_t i = 0; ret.GetSize() < count && !contenders.Empty() && i < popLimit; ++i) {
// Get the best detour from the queue.
@@ -234,7 +237,7 @@ void ChartManager::CalcNBest(size_t count, ChartTrellisPathList &ret,bool onlyDi
// Create a full base path from the chosen detour.
//basePath.reset(new ChartTrellisPath(*detour));
boost::shared_ptr<ChartTrellisPath> path(new ChartTrellisPath(*detour));
-
+
// Generate new detours from this base path and add them to the queue of
// contenders. The new detours deviate from the base path by a single
// replacement along the previous detour sub-path.
@@ -259,17 +262,17 @@ void ChartManager::GetSearchGraph(long translationId, std::ostream &outputSearch
{
size_t size = m_source.GetSize();
- // which hypotheses are reachable?
- std::map<unsigned,bool> reachable;
- WordsRange fullRange(0, size-1);
- const ChartCell &lastCell = m_hypoStackColl.Get(fullRange);
+ // which hypotheses are reachable?
+ std::map<unsigned,bool> reachable;
+ WordsRange fullRange(0, size-1);
+ const ChartCell &lastCell = m_hypoStackColl.Get(fullRange);
const ChartHypothesis *hypo = lastCell.GetBestHypothesis();
if (hypo == NULL) {
// no hypothesis
return;
}
- FindReachableHypotheses( hypo, reachable);
+ FindReachableHypotheses( hypo, reachable);
for (size_t width = 1; width <= size; ++width) {
for (size_t startPos = 0; startPos <= size-width; ++startPos) {
@@ -285,42 +288,40 @@ void ChartManager::GetSearchGraph(long translationId, std::ostream &outputSearch
void ChartManager::FindReachableHypotheses( const ChartHypothesis *hypo, std::map<unsigned,bool> &reachable ) const
{
- // do not recurse, if already visited
- if (reachable.find(hypo->GetId()) != reachable.end())
- {
- return;
- }
-
- // recurse
- reachable[ hypo->GetId() ] = true;
- const std::vector<const ChartHypothesis*> &previous = hypo->GetPrevHypos();
- for(std::vector<const ChartHypothesis*>::const_iterator i = previous.begin(); i != previous.end(); ++i)
- {
- FindReachableHypotheses( *i, reachable );
- }
-
- // also loop over recombined hypotheses (arcs)
- const ChartArcList *arcList = hypo->GetArcList();
- if (arcList) {
- ChartArcList::const_iterator iterArc;
- for (iterArc = arcList->begin(); iterArc != arcList->end(); ++iterArc) {
- const ChartHypothesis &arc = **iterArc;
- FindReachableHypotheses( &arc, reachable );
- }
- }
+ // do not recurse, if already visited
+ if (reachable.find(hypo->GetId()) != reachable.end()) {
+ return;
+ }
+
+ // recurse
+ reachable[ hypo->GetId() ] = true;
+ const std::vector<const ChartHypothesis*> &previous = hypo->GetPrevHypos();
+ for(std::vector<const ChartHypothesis*>::const_iterator i = previous.begin(); i != previous.end(); ++i) {
+ FindReachableHypotheses( *i, reachable );
+ }
+
+ // also loop over recombined hypotheses (arcs)
+ const ChartArcList *arcList = hypo->GetArcList();
+ if (arcList) {
+ ChartArcList::const_iterator iterArc;
+ for (iterArc = arcList->begin(); iterArc != arcList->end(); ++iterArc) {
+ const ChartHypothesis &arc = **iterArc;
+ FindReachableHypotheses( &arc, reachable );
+ }
+ }
}
void ChartManager::CreateDeviantPaths(
- boost::shared_ptr<const ChartTrellisPath> basePath,
- ChartTrellisDetourQueue &q)
+ boost::shared_ptr<const ChartTrellisPath> basePath,
+ ChartTrellisDetourQueue &q)
{
CreateDeviantPaths(basePath, basePath->GetFinalNode(), q);
}
void ChartManager::CreateDeviantPaths(
- boost::shared_ptr<const ChartTrellisPath> basePath,
- const ChartTrellisNode &substitutedNode,
- ChartTrellisDetourQueue &queue)
+ boost::shared_ptr<const ChartTrellisPath> basePath,
+ const ChartTrellisNode &substitutedNode,
+ ChartTrellisDetourQueue &queue)
{
const ChartArcList *arcList = substitutedNode.GetHypothesis().GetArcList();
if (arcList) {
@@ -340,18 +341,18 @@ void ChartManager::CreateDeviantPaths(
}
}
-
-void ChartManager::PreCalculateScores()
+
+void ChartManager::PreCalculateScores()
{
for (size_t i = 0; i < m_translationOptionList.GetSize(); ++i) {
const ChartTranslationOptions& cto = m_translationOptionList.Get(i);
for (TargetPhraseCollection::const_iterator j = cto.GetTargetPhraseCollection().begin();
- j != cto.GetTargetPhraseCollection().end(); ++j) {
+ j != cto.GetTargetPhraseCollection().end(); ++j) {
const TargetPhrase* targetPhrase = *j;
if (m_precalculatedScores.find(*targetPhrase) == m_precalculatedScores.end()) {
ChartBasedFeatureContext context(*targetPhrase,m_source);
const vector<const StatelessFeatureFunction*>& sfs =
- StatelessFeatureFunction::GetStatelessFeatureFunctions();
+ StatelessFeatureFunction::GetStatelessFeatureFunctions();
ScoreComponentCollection& breakdown = m_precalculatedScores[*targetPhrase];
for (size_t k = 0; k < sfs.size(); ++k) {
sfs[k]->EvaluateChart(context,&breakdown);
@@ -362,18 +363,18 @@ void ChartManager::PreCalculateScores()
}
void ChartManager::InsertPreCalculatedScores(
- const TargetPhrase& targetPhrase, ScoreComponentCollection* scoreBreakdown) const
+ const TargetPhrase& targetPhrase, ScoreComponentCollection* scoreBreakdown) const
{
- boost::unordered_map<TargetPhrase,ScoreComponentCollection>::const_iterator scoreIter =
+ boost::unordered_map<TargetPhrase,ScoreComponentCollection>::const_iterator scoreIter =
m_precalculatedScores.find(targetPhrase);
if (scoreIter != m_precalculatedScores.end()) {
scoreBreakdown->PlusEquals(scoreIter->second);
} else {
TRACE_ERR("ERROR: " << targetPhrase << " missing from precalculation cache" << endl);
- assert(0);
+ assert(0);
}
}
-
+
} // namespace Moses
diff --git a/moses/ChartManager.h b/moses/ChartManager.h
index 7f3f24a0b..736986e05 100644
--- a/moses/ChartManager.h
+++ b/moses/ChartManager.h
@@ -79,35 +79,37 @@ public:
void CalcNBest(size_t count, ChartTrellisPathList &ret, bool onlyDistinct=0) const;
void GetSearchGraph(long translationId, std::ostream &outputSearchGraphStream) const;
- void FindReachableHypotheses( const ChartHypothesis *hypo, std::map<unsigned,bool> &reachable ) const; /* auxilliary function for GetSearchGraph */
+ void FindReachableHypotheses( const ChartHypothesis *hypo, std::map<unsigned,bool> &reachable ) const; /* auxilliary function for GetSearchGraph */
//! the input sentence being decoded
const InputType& GetSource() const {
return m_source;
}
-
+
//! debug data collected when decoding sentence
SentenceStats& GetSentenceStats() const {
return *m_sentenceStats;
}
-
+
/***
* to be called after processing a sentence (which may consist of more than just calling ProcessSentence() )
* currently an empty function
*/
void CalcDecoderStatistics() const
{ }
-
+
void ResetSentenceStats(const InputType& source) {
m_sentenceStats = std::auto_ptr<SentenceStats>(new SentenceStats(source));
}
//! contigious hypo id for each input sentence. For debugging purposes
- unsigned GetNextHypoId() { return m_hypothesisId++; }
+ unsigned GetNextHypoId() {
+ return m_hypothesisId++;
+ }
//! Access the pre-calculated values
void InsertPreCalculatedScores(const TargetPhrase& targetPhrase,
- ScoreComponentCollection* scoreBreakdown) const;
+ ScoreComponentCollection* scoreBreakdown) const;
};
diff --git a/moses/ChartParser.cpp b/moses/ChartParser.cpp
index 805bec7ab..0dba600e1 100644
--- a/moses/ChartParser.cpp
+++ b/moses/ChartParser.cpp
@@ -35,16 +35,18 @@ extern bool g_debug;
ChartParserUnknown::ChartParserUnknown() {}
-ChartParserUnknown::~ChartParserUnknown() {
+ChartParserUnknown::~ChartParserUnknown()
+{
RemoveAllInColl(m_unksrcs);
RemoveAllInColl(m_cacheTargetPhraseCollection);
}
-void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range, ChartParserCallback &to) {
+void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range, ChartParserCallback &to)
+{
// unknown word, add as trans opt
const StaticData &staticData = StaticData::Instance();
const UnknownWordPenaltyProducer *unknownWordPenaltyProducer = staticData.GetUnknownWordPenaltyProducer();
-
+
size_t isDigit = 0;
if (staticData.GetDropUnknown()) {
const Factor *f = sourceWord[0]; // TODO hack. shouldn't know which factor is surface
@@ -56,11 +58,11 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range
isDigit = 1;
// modify the starting bitmap
}
-
+
Phrase* unksrc = new Phrase(1);
unksrc->AddWord() = sourceWord;
m_unksrcs.push_back(unksrc);
-
+
//TranslationOption *transOpt;
if (! staticData.GetDropUnknown() || isDigit) {
// loop
@@ -69,19 +71,19 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range
for (iterLHS = lhsList.begin(); iterLHS != lhsList.end(); ++iterLHS) {
const string &targetLHSStr = iterLHS->first;
float prob = iterLHS->second;
-
+
// lhs
//const Word &sourceLHS = staticData.GetInputDefaultNonTerminal();
Word *targetLHS = new Word(true);
-
+
targetLHS->CreateFromString(Output, staticData.GetOutputFactorOrder(), targetLHSStr, true);
CHECK(targetLHS->GetFactor(0) != NULL);
-
+
// add to dictionary
TargetPhrase *targetPhrase = new TargetPhrase();
Word &targetWord = targetPhrase->AddWord();
targetWord.CreateUnknownWord(sourceWord);
-
+
// scores
float unknownScore = FloorScore(TransformScore(prob));
@@ -98,7 +100,7 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range
} else {
// drop source word. create blank trans opt
float unknownScore = FloorScore(-numeric_limits<float>::infinity());
-
+
TargetPhrase *targetPhrase = new TargetPhrase();
// loop
const UnknownLHSList &lhsList = staticData.GetUnknownLHS();
@@ -106,11 +108,11 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range
for (iterLHS = lhsList.begin(); iterLHS != lhsList.end(); ++iterLHS) {
const string &targetLHSStr = iterLHS->first;
//float prob = iterLHS->second;
-
+
Word *targetLHS = new Word(true);
targetLHS->CreateFromString(Output, staticData.GetOutputFactorOrder(), targetLHSStr, true);
CHECK(targetLHS->GetFactor(0) != NULL);
-
+
targetPhrase->GetScoreBreakdown().Assign(unknownWordPenaltyProducer, unknownScore);
targetPhrase->Evaluate(*unksrc);
@@ -125,7 +127,8 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range
ChartParser::ChartParser(InputType const &source, ChartCellCollectionBase &cells) :
m_decodeGraphList(StaticData::Instance().GetDecodeGraphs()),
- m_source(source) {
+ m_source(source)
+{
const StaticData &staticData = StaticData::Instance();
staticData.InitializeForInput(source);
@@ -139,14 +142,16 @@ ChartParser::ChartParser(InputType const &source, ChartCellCollectionBase &cells
}
}
-ChartParser::~ChartParser() {
+ChartParser::~ChartParser()
+{
RemoveAllInColl(m_ruleLookupManagers);
StaticData::Instance().CleanUpAfterSentenceProcessing(m_source);
}
-void ChartParser::Create(const WordsRange &wordsRange, ChartParserCallback &to) {
+void ChartParser::Create(const WordsRange &wordsRange, ChartParserCallback &to)
+{
assert(m_decodeGraphList.size() == m_ruleLookupManagers.size());
-
+
std::vector <DecodeGraph*>::const_iterator iterDecodeGraph;
std::vector <ChartRuleLookupManager*>::const_iterator iterRuleLookupManagers = m_ruleLookupManagers.begin();
for (iterDecodeGraph = m_decodeGraphList.begin(); iterDecodeGraph != m_decodeGraphList.end(); ++iterDecodeGraph, ++iterRuleLookupManagers) {
@@ -158,7 +163,7 @@ void ChartParser::Create(const WordsRange &wordsRange, ChartParserCallback &to)
ruleLookupManager.GetChartRuleCollection(wordsRange, to);
}
}
-
+
if (wordsRange.GetNumWordsCovered() == 1 && wordsRange.GetStartPos() != 0 && wordsRange.GetStartPos() != m_source.GetSize()-1) {
bool alwaysCreateDirectTranslationOption = StaticData::Instance().IsAlwaysCreateDirectTranslationOption();
if (to.Empty() || alwaysCreateDirectTranslationOption) {
@@ -166,7 +171,7 @@ void ChartParser::Create(const WordsRange &wordsRange, ChartParserCallback &to)
const Word &sourceWord = m_source.GetWord(wordsRange.GetStartPos());
m_unknown.Process(sourceWord, wordsRange, to);
}
- }
+ }
}
-
+
} // namespace Moses
diff --git a/moses/ChartParser.h b/moses/ChartParser.h
index 9d8baa649..1ff99480d 100644
--- a/moses/ChartParser.h
+++ b/moses/ChartParser.h
@@ -39,31 +39,33 @@ class Phrase;
class TargetPhraseCollection;
class DecodeGraph;
-class ChartParserUnknown {
- public:
- ChartParserUnknown();
- ~ChartParserUnknown();
+class ChartParserUnknown
+{
+public:
+ ChartParserUnknown();
+ ~ChartParserUnknown();
- void Process(const Word &sourceWord, const WordsRange &range, ChartParserCallback &to);
+ void Process(const Word &sourceWord, const WordsRange &range, ChartParserCallback &to);
- private:
- std::vector<Phrase*> m_unksrcs;
- std::list<TargetPhraseCollection*> m_cacheTargetPhraseCollection;
- StackVec m_emptyStackVec;
+private:
+ std::vector<Phrase*> m_unksrcs;
+ std::list<TargetPhraseCollection*> m_cacheTargetPhraseCollection;
+ StackVec m_emptyStackVec;
};
-class ChartParser {
- public:
- ChartParser(const InputType &source, ChartCellCollectionBase &cells);
- ~ChartParser();
+class ChartParser
+{
+public:
+ ChartParser(const InputType &source, ChartCellCollectionBase &cells);
+ ~ChartParser();
- void Create(const WordsRange &range, ChartParserCallback &to);
+ void Create(const WordsRange &range, ChartParserCallback &to);
- private:
- ChartParserUnknown m_unknown;
- std::vector <DecodeGraph*> m_decodeGraphList;
- std::vector<ChartRuleLookupManager*> m_ruleLookupManagers;
- InputType const& m_source; /**< source sentence to be translated */
+private:
+ ChartParserUnknown m_unknown;
+ std::vector <DecodeGraph*> m_decodeGraphList;
+ std::vector<ChartRuleLookupManager*> m_ruleLookupManagers;
+ InputType const& m_source; /**< source sentence to be translated */
};
}
diff --git a/moses/ChartParserCallback.h b/moses/ChartParserCallback.h
index 797a57156..84ddb8b75 100644
--- a/moses/ChartParserCallback.h
+++ b/moses/ChartParserCallback.h
@@ -4,21 +4,23 @@
#include <list>
-namespace Moses {
+namespace Moses
+{
class TargetPhraseCollection;
class WordsRange;
class TargetPhrase;
-class ChartParserCallback {
- public:
- virtual ~ChartParserCallback() {}
+class ChartParserCallback
+{
+public:
+ virtual ~ChartParserCallback() {}
- virtual void Add(const TargetPhraseCollection &, const StackVec &, const WordsRange &) = 0;
+ virtual void Add(const TargetPhraseCollection &, const StackVec &, const WordsRange &) = 0;
- virtual bool Empty() const = 0;
+ virtual bool Empty() const = 0;
- virtual void AddPhraseOOV(TargetPhrase &phrase, std::list<TargetPhraseCollection*> &waste_memory, const WordsRange &range) = 0;
+ virtual void AddPhraseOOV(TargetPhrase &phrase, std::list<TargetPhraseCollection*> &waste_memory, const WordsRange &range) = 0;
};
} // namespace Moses
diff --git a/moses/ChartRuleLookupManager.h b/moses/ChartRuleLookupManager.h
index da8c98cb4..ad936ff9c 100644
--- a/moses/ChartRuleLookupManager.h
+++ b/moses/ChartRuleLookupManager.h
@@ -50,7 +50,7 @@ public:
const InputType &GetSentence() const {
return m_sentence;
}
-
+
const ChartCellLabelSet &GetTargetLabelSet(size_t begin, size_t end) const {
return m_cellCollection.GetBase(WordsRange(begin, end)).GetTargetLabelSet();
}
diff --git a/moses/ChartTranslationOptionList.cpp b/moses/ChartTranslationOptionList.cpp
index 8f4422e23..5b72ea7a3 100644
--- a/moses/ChartTranslationOptionList.cpp
+++ b/moses/ChartTranslationOptionList.cpp
@@ -74,11 +74,11 @@ void ChartTranslationOptionList::Add(const TargetPhraseCollection &tpc,
if (m_size == m_collection.size()) {
// m_collection has reached capacity: create a new object.
m_collection.push_back(new ChartTranslationOptions(tpc, stackVec,
- range, score));
+ range, score));
} else {
// Overwrite an unused object.
*(m_collection[m_size]) = ChartTranslationOptions(tpc, stackVec,
- range, score);
+ range, score);
}
++m_size;
@@ -98,7 +98,8 @@ void ChartTranslationOptionList::Add(const TargetPhraseCollection &tpc,
}
}
-void ChartTranslationOptionList::AddPhraseOOV(TargetPhrase &phrase, std::list<TargetPhraseCollection*> &waste_memory, const WordsRange &range) {
+void ChartTranslationOptionList::AddPhraseOOV(TargetPhrase &phrase, std::list<TargetPhraseCollection*> &waste_memory, const WordsRange &range)
+{
TargetPhraseCollection *tpc = new TargetPhraseCollection();
tpc->Add(&phrase);
waste_memory.push_back(tpc);
@@ -106,7 +107,8 @@ void ChartTranslationOptionList::AddPhraseOOV(TargetPhrase &phrase, std::list<Ta
Add(*tpc, empty, range);
}
-void ChartTranslationOptionList::ApplyThreshold() {
+void ChartTranslationOptionList::ApplyThreshold()
+{
if (m_size > m_ruleLimit) {
// Something's gone wrong if the list has grown to m_ruleLimit * 2
// without being pruned.
@@ -134,8 +136,8 @@ void ChartTranslationOptionList::ApplyThreshold() {
scoreThreshold += StaticData::Instance().GetTranslationOptionThreshold();
CollType::iterator bound = std::partition(m_collection.begin(),
- m_collection.begin()+m_size,
- ScoreThresholdPred(scoreThreshold));
+ m_collection.begin()+m_size,
+ ScoreThresholdPred(scoreThreshold));
m_size = std::distance(m_collection.begin(), bound);
}
diff --git a/moses/ChartTranslationOptionList.h b/moses/ChartTranslationOptionList.h
index 0b56b1f61..a2979fcbc 100644
--- a/moses/ChartTranslationOptionList.h
+++ b/moses/ChartTranslationOptionList.h
@@ -32,27 +32,34 @@ class TargetPhraseCollection;
class WordsRange;
//! a vector of translations options for a specific range, in a specific sentence
-class ChartTranslationOptionList : public ChartParserCallback {
- public:
+class ChartTranslationOptionList : public ChartParserCallback
+{
+public:
ChartTranslationOptionList(size_t);
~ChartTranslationOptionList();
- const ChartTranslationOptions &Get(size_t i) const { return *m_collection[i]; }
+ const ChartTranslationOptions &Get(size_t i) const {
+ return *m_collection[i];
+ }
//! number of translation options
- size_t GetSize() const { return m_size; }
+ size_t GetSize() const {
+ return m_size;
+ }
void Add(const TargetPhraseCollection &, const StackVec &,
const WordsRange &);
void AddPhraseOOV(TargetPhrase &phrase, std::list<TargetPhraseCollection*> &waste_memory, const WordsRange &range);
- bool Empty() const { return m_size == 0; }
+ bool Empty() const {
+ return m_size == 0;
+ }
void Clear();
void ApplyThreshold();
- private:
+private:
typedef std::vector<ChartTranslationOptions*> CollType;
struct ScoreThresholdPred {
diff --git a/moses/ChartTranslationOptions.cpp b/moses/ChartTranslationOptions.cpp
index c55948a82..5ba88a0db 100644
--- a/moses/ChartTranslationOptions.cpp
+++ b/moses/ChartTranslationOptions.cpp
@@ -27,8 +27,8 @@ namespace Moses
{
float ChartTranslationOptions::CalcEstimateOfBestScore(
- const TargetPhraseCollection &tpc,
- const StackVec &stackVec)
+ const TargetPhraseCollection &tpc,
+ const StackVec &stackVec)
{
const TargetPhrase &targetPhrase = **(tpc.begin());
float estimateOfBestScore = targetPhrase.GetFutureScore();
diff --git a/moses/ChartTranslationOptions.h b/moses/ChartTranslationOptions.h
index 4910723f7..459c91659 100644
--- a/moses/ChartTranslationOptions.h
+++ b/moses/ChartTranslationOptions.h
@@ -35,7 +35,7 @@ namespace Moses
*/
class ChartTranslationOptions
{
- public:
+public:
/** Constructor
\param targetPhraseColl @todo dunno
\param stackVec @todo dunno
@@ -43,13 +43,13 @@ class ChartTranslationOptions
\param score @todo dunno
*/
ChartTranslationOptions(const TargetPhraseCollection &targetPhraseColl,
- const StackVec &stackVec,
- const WordsRange &wordsRange,
- float score)
- : m_stackVec(stackVec)
- , m_targetPhraseCollection(&targetPhraseColl)
- , m_wordsRange(&wordsRange)
- , m_estimateOfBestScore(score)
+ const StackVec &stackVec,
+ const WordsRange &wordsRange,
+ float score)
+ : m_stackVec(stackVec)
+ , m_targetPhraseCollection(&targetPhraseColl)
+ , m_wordsRange(&wordsRange)
+ , m_estimateOfBestScore(score)
{}
~ChartTranslationOptions() {}
@@ -58,10 +58,12 @@ class ChartTranslationOptions
const StackVec &);
//! @todo dunno
- const StackVec &GetStackVec() const { return m_stackVec; }
+ const StackVec &GetStackVec() const {
+ return m_stackVec;
+ }
//! @todo isn't the translation suppose to just contain 1 target phrase, not a whole collection of them?
- const TargetPhraseCollection &GetTargetPhraseCollection() const {
+ const TargetPhraseCollection &GetTargetPhraseCollection() const {
return *m_targetPhraseCollection;
}
@@ -74,9 +76,11 @@ class ChartTranslationOptions
* the estimate is the sum of the top target phrase's estimated score plus the
* scores of the best child hypotheses.
*/
- inline float GetEstimateOfBestScore() const { return m_estimateOfBestScore; }
+ inline float GetEstimateOfBestScore() const {
+ return m_estimateOfBestScore;
+ }
- private:
+private:
StackVec m_stackVec; //! vector of hypothesis list!
const TargetPhraseCollection *m_targetPhraseCollection;
diff --git a/moses/ChartTrellisDetour.cpp b/moses/ChartTrellisDetour.cpp
index 550a44a2c..1a187396c 100644
--- a/moses/ChartTrellisDetour.cpp
+++ b/moses/ChartTrellisDetour.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -27,15 +27,15 @@ namespace Moses
{
ChartTrellisDetour::ChartTrellisDetour(
- boost::shared_ptr<const ChartTrellisPath> basePath,
- const ChartTrellisNode &substitutedNode,
- const ChartHypothesis &replacementHypo)
+ boost::shared_ptr<const ChartTrellisPath> basePath,
+ const ChartTrellisNode &substitutedNode,
+ const ChartHypothesis &replacementHypo)
: m_basePath(basePath)
, m_substitutedNode(substitutedNode)
, m_replacementHypo(replacementHypo)
{
float diff = replacementHypo.GetTotalScore()
- - substitutedNode.GetHypothesis().GetTotalScore();
+ - substitutedNode.GetHypothesis().GetTotalScore();
m_totalScore = basePath->GetTotalScore() + diff;
}
diff --git a/moses/ChartTrellisDetour.h b/moses/ChartTrellisDetour.h
index 977ccb67d..26c98bef8 100644
--- a/moses/ChartTrellisDetour.h
+++ b/moses/ChartTrellisDetour.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -31,20 +31,24 @@ class ChartTrellisPath;
*/
class ChartTrellisDetour
{
- public:
+public:
ChartTrellisDetour(boost::shared_ptr<const ChartTrellisPath>,
const ChartTrellisNode &, const ChartHypothesis &);
- const ChartTrellisPath &GetBasePath() const { return *m_basePath; }
+ const ChartTrellisPath &GetBasePath() const {
+ return *m_basePath;
+ }
const ChartTrellisNode &GetSubstitutedNode() const {
return m_substitutedNode;
}
const ChartHypothesis &GetReplacementHypo() const {
return m_replacementHypo;
}
- float GetTotalScore() const { return m_totalScore; }
+ float GetTotalScore() const {
+ return m_totalScore;
+ }
- private:
+private:
boost::shared_ptr<const ChartTrellisPath> m_basePath;
const ChartTrellisNode &m_substitutedNode;
const ChartHypothesis &m_replacementHypo;
diff --git a/moses/ChartTrellisDetourQueue.cpp b/moses/ChartTrellisDetourQueue.cpp
index 9b359ca43..4bb81d20b 100644
--- a/moses/ChartTrellisDetourQueue.cpp
+++ b/moses/ChartTrellisDetourQueue.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -21,13 +21,16 @@
#include "Util.h"
-namespace Moses {
+namespace Moses
+{
-ChartTrellisDetourQueue::~ChartTrellisDetourQueue() {
+ChartTrellisDetourQueue::~ChartTrellisDetourQueue()
+{
RemoveAllInColl(m_queue);
}
-void ChartTrellisDetourQueue::Push(const ChartTrellisDetour *detour) {
+void ChartTrellisDetourQueue::Push(const ChartTrellisDetour *detour)
+{
if (m_capacity == 0 || m_queue.size() < m_capacity) {
m_queue.insert(detour);
} else if (detour->GetTotalScore() > (*m_queue.rbegin())->GetTotalScore()) {
@@ -43,7 +46,8 @@ void ChartTrellisDetourQueue::Push(const ChartTrellisDetour *detour) {
}
}
-const ChartTrellisDetour *ChartTrellisDetourQueue::Pop() {
+const ChartTrellisDetour *ChartTrellisDetourQueue::Pop()
+{
QueueType::iterator p = m_queue.begin();
const ChartTrellisDetour *top = *p;
m_queue.erase(p);
diff --git a/moses/ChartTrellisDetourQueue.h b/moses/ChartTrellisDetourQueue.h
index d6505d8a2..2406a69f5 100644
--- a/moses/ChartTrellisDetourQueue.h
+++ b/moses/ChartTrellisDetourQueue.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -23,20 +23,24 @@
#include <set>
-namespace Moses {
+namespace Moses
+{
/** A bounded priority queue of ChartTrellisDetour pointers. The top item is
* the best scoring detour. The queue assumes ownership of pushed items and
* relinquishes ownership when they are popped. Any remaining items at the
* time of the queue's destruction are deleted.
*/
-class ChartTrellisDetourQueue {
- public:
+class ChartTrellisDetourQueue
+{
+public:
// Create empty queue with fixed capacity of c. Capacity 0 means unbounded.
ChartTrellisDetourQueue(size_t c) : m_capacity(c) {}
~ChartTrellisDetourQueue();
- bool Empty() const { return m_queue.empty(); }
+ bool Empty() const {
+ return m_queue.empty();
+ }
// Add the detour to the queue or delete it if the queue is full and the
// score is no better than the queue's worst score.
@@ -46,7 +50,7 @@ class ChartTrellisDetourQueue {
// caller is responsible for deleting the object.
const ChartTrellisDetour *Pop();
- private:
+private:
struct DetourOrderer {
bool operator()(const ChartTrellisDetour* a,
const ChartTrellisDetour* b) const {
diff --git a/moses/ChartTrellisNode.cpp b/moses/ChartTrellisNode.cpp
index e55d4b1ab..73651f507 100644
--- a/moses/ChartTrellisNode.cpp
+++ b/moses/ChartTrellisNode.cpp
@@ -29,16 +29,16 @@ namespace Moses
{
ChartTrellisNode::ChartTrellisNode(const ChartHypothesis &hypo)
- : m_hypo(hypo)
+ : m_hypo(hypo)
{
CreateChildren();
}
ChartTrellisNode::ChartTrellisNode(const ChartTrellisDetour &detour,
ChartTrellisNode *&deviationPoint)
- : m_hypo((&detour.GetBasePath().GetFinalNode() == &detour.GetSubstitutedNode())
- ? detour.GetReplacementHypo()
- : detour.GetBasePath().GetFinalNode().GetHypothesis())
+ : m_hypo((&detour.GetBasePath().GetFinalNode() == &detour.GetSubstitutedNode())
+ ? detour.GetReplacementHypo()
+ : detour.GetBasePath().GetFinalNode().GetHypothesis())
{
if (&m_hypo == &detour.GetReplacementHypo()) {
deviationPoint = this;
@@ -54,9 +54,9 @@ ChartTrellisNode::ChartTrellisNode(const ChartTrellisNode &root,
const ChartTrellisNode &substitutedNode,
const ChartHypothesis &replacementHypo,
ChartTrellisNode *&deviationPoint)
- : m_hypo((&root == &substitutedNode)
- ? replacementHypo
- : root.GetHypothesis())
+ : m_hypo((&root == &substitutedNode)
+ ? replacementHypo
+ : root.GetHypothesis())
{
if (&root == &substitutedNode) {
deviationPoint = this;
@@ -118,8 +118,8 @@ void ChartTrellisNode::CreateChildren(const ChartTrellisNode &rootNode,
for (size_t ind = 0; ind < children.size(); ++ind) {
const ChartTrellisNode *origChild = children[ind];
ChartTrellisNode *child = new ChartTrellisNode(*origChild, substitutedNode,
- replacementHypo,
- deviationPoint);
+ replacementHypo,
+ deviationPoint);
m_children.push_back(child);
}
}
diff --git a/moses/ChartTrellisNode.h b/moses/ChartTrellisNode.h
index 58203677e..643809728 100644
--- a/moses/ChartTrellisNode.h
+++ b/moses/ChartTrellisNode.h
@@ -34,7 +34,7 @@ class ChartTrellisDetour;
*/
class ChartTrellisNode
{
- public:
+public:
typedef std::vector<ChartTrellisNode*> NodeChildren;
ChartTrellisNode(const ChartHypothesis &hypo);
@@ -42,15 +42,21 @@ class ChartTrellisNode
~ChartTrellisNode();
- const ChartHypothesis &GetHypothesis() const { return m_hypo; }
+ const ChartHypothesis &GetHypothesis() const {
+ return m_hypo;
+ }
- const NodeChildren &GetChildren() const { return m_children; }
+ const NodeChildren &GetChildren() const {
+ return m_children;
+ }
- const ChartTrellisNode &GetChild(size_t i) const { return *m_children[i]; }
+ const ChartTrellisNode &GetChild(size_t i) const {
+ return *m_children[i];
+ }
Phrase GetOutputPhrase() const;
- private:
+private:
ChartTrellisNode(const ChartTrellisNode &); // Not implemented
ChartTrellisNode& operator=(const ChartTrellisNode &); // Not implemented
diff --git a/moses/ChartTrellisPath.cpp b/moses/ChartTrellisPath.cpp
index 231d4237a..c53e636e9 100644
--- a/moses/ChartTrellisPath.cpp
+++ b/moses/ChartTrellisPath.cpp
@@ -30,17 +30,17 @@ namespace Moses
{
ChartTrellisPath::ChartTrellisPath(const ChartHypothesis &hypo)
- : m_finalNode(new ChartTrellisNode(hypo))
- , m_deviationPoint(NULL)
- , m_scoreBreakdown(hypo.GetScoreBreakdown())
- , m_totalScore(hypo.GetTotalScore())
+ : m_finalNode(new ChartTrellisNode(hypo))
+ , m_deviationPoint(NULL)
+ , m_scoreBreakdown(hypo.GetScoreBreakdown())
+ , m_totalScore(hypo.GetTotalScore())
{
}
ChartTrellisPath::ChartTrellisPath(const ChartTrellisDetour &detour)
- : m_finalNode(new ChartTrellisNode(detour, m_deviationPoint))
- , m_scoreBreakdown(detour.GetBasePath().m_scoreBreakdown)
- , m_totalScore(0)
+ : m_finalNode(new ChartTrellisNode(detour, m_deviationPoint))
+ , m_scoreBreakdown(detour.GetBasePath().m_scoreBreakdown)
+ , m_totalScore(0)
{
CHECK(m_deviationPoint);
ScoreComponentCollection scoreChange;
diff --git a/moses/ChartTrellisPath.h b/moses/ChartTrellisPath.h
index 6e5d50e0c..1023ad7b4 100644
--- a/moses/ChartTrellisPath.h
+++ b/moses/ChartTrellisPath.h
@@ -41,18 +41,24 @@ class ChartTrellisNode;
*/
class ChartTrellisPath
{
- public:
+public:
ChartTrellisPath(const ChartHypothesis &hypo);
ChartTrellisPath(const ChartTrellisDetour &detour);
~ChartTrellisPath();
- const ChartTrellisNode &GetFinalNode() const { return *m_finalNode; }
+ const ChartTrellisNode &GetFinalNode() const {
+ return *m_finalNode;
+ }
- const ChartTrellisNode *GetDeviationPoint() const { return m_deviationPoint; }
+ const ChartTrellisNode *GetDeviationPoint() const {
+ return m_deviationPoint;
+ }
//! get score for this path throught trellis
- float GetTotalScore() const { return m_totalScore; }
+ float GetTotalScore() const {
+ return m_totalScore;
+ }
Phrase GetOutputPhrase() const;
@@ -61,7 +67,7 @@ class ChartTrellisPath
return m_scoreBreakdown;
}
- private:
+private:
ChartTrellisPath(const ChartTrellisPath &); // Not implemented
ChartTrellisPath &operator=(const ChartTrellisPath &); // Not implemented
diff --git a/moses/ConfusionNet.h b/moses/ConfusionNet.h
index 55fa0c8bf..c9c83e154 100644
--- a/moses/ConfusionNet.h
+++ b/moses/ConfusionNet.h
@@ -15,7 +15,7 @@ class FactorCollection;
class TranslationOptionCollection;
class Sentence;
-/** An input to the decoder where each position can be 1 of a number of words,
+/** An input to the decoder where each position can be 1 of a number of words,
* each with an associated probability. Compared with a sentence, where each position is a word
*/
class ConfusionNet : public InputType
diff --git a/moses/DecodeFeature.cpp b/moses/DecodeFeature.cpp
index ebec7a7e3..57137170e 100644
--- a/moses/DecodeFeature.cpp
+++ b/moses/DecodeFeature.cpp
@@ -30,8 +30,8 @@ using namespace std;
namespace Moses
{
DecodeFeature::DecodeFeature( const std::string& description
- , const std::string &line)
-: StatelessFeatureFunction(description, line)
+ , const std::string &line)
+ : StatelessFeatureFunction(description, line)
{
VERBOSE(2,"DecodeFeature:" << std::endl);
for (size_t i = 0; i < m_args.size(); ++i) {
@@ -40,8 +40,7 @@ DecodeFeature::DecodeFeature( const std::string& description
if (args[0] == "input-factor") {
m_input =Tokenize<FactorType>(args[1], ",");
m_inputFactors = FactorMask(m_input);
- }
- else if (args[0] == "output-factor") {
+ } else if (args[0] == "output-factor") {
m_output =Tokenize<FactorType>(args[1], ",");
m_outputFactors = FactorMask(m_output);
}
@@ -50,20 +49,20 @@ DecodeFeature::DecodeFeature( const std::string& description
}
DecodeFeature::DecodeFeature( const std::string& description
- , size_t numScoreComponents
- , const std::string &line)
-: StatelessFeatureFunction(description,numScoreComponents, line)
+ , size_t numScoreComponents
+ , const std::string &line)
+ : StatelessFeatureFunction(description,numScoreComponents, line)
{
VERBOSE(2,"DecodeFeature: no factors yet" << std::endl);
}
DecodeFeature::DecodeFeature(const std::string& description
- , size_t numScoreComponents
- , const std::vector<FactorType> &input
- , const std::vector<FactorType> &output
- , const std::string &line)
-: StatelessFeatureFunction(description,numScoreComponents, line)
-, m_input(input), m_output(output)
+ , size_t numScoreComponents
+ , const std::vector<FactorType> &input
+ , const std::vector<FactorType> &output
+ , const std::string &line)
+ : StatelessFeatureFunction(description,numScoreComponents, line)
+ , m_input(input), m_output(output)
{
m_inputFactors = FactorMask(input);
m_outputFactors = FactorMask(output);
diff --git a/moses/DecodeFeature.h b/moses/DecodeFeature.h
index b6352b181..d6cf3a323 100644
--- a/moses/DecodeFeature.h
+++ b/moses/DecodeFeature.h
@@ -34,9 +34,10 @@ namespace Moses
/**
* Baseclass for phrase-table or generation table feature function
**/
-class DecodeFeature : public StatelessFeatureFunction {
+class DecodeFeature : public StatelessFeatureFunction
+{
- public:
+public:
DecodeFeature( const std::string& description
, const std::string &line);
@@ -45,28 +46,29 @@ class DecodeFeature : public StatelessFeatureFunction {
, const std::string &line);
DecodeFeature( const std::string& description
- , size_t numScoreComponents
- , const std::vector<FactorType> &input
- , const std::vector<FactorType> &output
- , const std::string &line);
-
- //! returns output factor types as specified by the ini file
- const FactorMask& GetOutputFactorMask() const;
-
- //! returns input factor types as specified by the ini file
- const FactorMask& GetInputFactorMask() const;
-
- const std::vector<FactorType>& GetInput() const;
- const std::vector<FactorType>& GetOutput() const;
-
- bool IsDecodeFeature() const
- { return true; }
-
- protected:
- std::vector<FactorType> m_input;
- std::vector<FactorType> m_output;
- FactorMask m_inputFactors;
- FactorMask m_outputFactors;
+ , size_t numScoreComponents
+ , const std::vector<FactorType> &input
+ , const std::vector<FactorType> &output
+ , const std::string &line);
+
+ //! returns output factor types as specified by the ini file
+ const FactorMask& GetOutputFactorMask() const;
+
+ //! returns input factor types as specified by the ini file
+ const FactorMask& GetInputFactorMask() const;
+
+ const std::vector<FactorType>& GetInput() const;
+ const std::vector<FactorType>& GetOutput() const;
+
+ bool IsDecodeFeature() const {
+ return true;
+ }
+
+protected:
+ std::vector<FactorType> m_input;
+ std::vector<FactorType> m_output;
+ FactorMask m_inputFactors;
+ FactorMask m_outputFactors;
};
}
diff --git a/moses/DecodeStepTranslation.cpp b/moses/DecodeStepTranslation.cpp
index 0acd3479f..e4dbb673d 100644
--- a/moses/DecodeStepTranslation.cpp
+++ b/moses/DecodeStepTranslation.cpp
@@ -94,9 +94,9 @@ void DecodeStepTranslation::Process(const TranslationOption &inputPartialTranslO
void DecodeStepTranslation::ProcessInitialTranslation(
- const InputType &source
- ,PartialTranslOptColl &outputPartialTranslOptColl
- , size_t startPos, size_t endPos, bool adhereTableLimit) const
+ const InputType &source
+ ,PartialTranslOptColl &outputPartialTranslOptColl
+ , size_t startPos, size_t endPos, bool adhereTableLimit) const
{
const PhraseDictionary* phraseDictionary = GetPhraseDictionaryFeature();
const size_t tableLimit = phraseDictionary->GetTableLimit();
diff --git a/moses/FF/BleuScoreFeature.cpp b/moses/FF/BleuScoreFeature.cpp
index 091035b0f..7808d6012 100644
--- a/moses/FF/BleuScoreFeature.cpp
+++ b/moses/FF/BleuScoreFeature.cpp
@@ -5,90 +5,94 @@
using namespace std;
-namespace Moses {
+namespace Moses
+{
size_t BleuScoreState::bleu_order = 4;
BleuScoreState::BleuScoreState(): m_words(1),
- m_source_length(0),
- m_target_length(0),
- m_scaled_ref_length(0),
- m_ngram_counts(bleu_order),
- m_ngram_matches(bleu_order)
+ m_source_length(0),
+ m_target_length(0),
+ m_scaled_ref_length(0),
+ m_ngram_counts(bleu_order),
+ m_ngram_matches(bleu_order)
{
}
int BleuScoreState::Compare(const FFState& o) const
{
- if (&o == this)
- return 0;
-
- const StaticData &staticData = StaticData::Instance();
- SearchAlgorithm searchAlgorithm = staticData.GetSearchAlgorithm();
- bool chartDecoding = (searchAlgorithm == ChartDecoding);
- if (chartDecoding)
- return 0;
-
- const BleuScoreState& other = dynamic_cast<const BleuScoreState&>(o);
- int c = m_words.Compare(other.m_words);
- if (c != 0)
- return c;
-
- /*for(size_t i = 0; i < m_ngram_counts.size(); i++) {
- if (m_ngram_counts[i] < other.m_ngram_counts[i])
- return -1;
- if (m_ngram_counts[i] > other.m_ngram_counts[i])
- return 1;
- if (m_ngram_matches[i] < other.m_ngram_matches[i])
- return -1;
- if (m_ngram_matches[i] > other.m_ngram_matches[i])
- return 1;
- }*/
+ if (&o == this)
+ return 0;
+ const StaticData &staticData = StaticData::Instance();
+ SearchAlgorithm searchAlgorithm = staticData.GetSearchAlgorithm();
+ bool chartDecoding = (searchAlgorithm == ChartDecoding);
+ if (chartDecoding)
return 0;
+
+ const BleuScoreState& other = dynamic_cast<const BleuScoreState&>(o);
+ int c = m_words.Compare(other.m_words);
+ if (c != 0)
+ return c;
+
+ /*for(size_t i = 0; i < m_ngram_counts.size(); i++) {
+ if (m_ngram_counts[i] < other.m_ngram_counts[i])
+ return -1;
+ if (m_ngram_counts[i] > other.m_ngram_counts[i])
+ return 1;
+ if (m_ngram_matches[i] < other.m_ngram_matches[i])
+ return -1;
+ if (m_ngram_matches[i] > other.m_ngram_matches[i])
+ return 1;
+ }*/
+
+ return 0;
}
-std::ostream& operator<<(std::ostream& out, const BleuScoreState& state) {
+std::ostream& operator<<(std::ostream& out, const BleuScoreState& state)
+{
state.print(out);
return out;
}
-void BleuScoreState::print(std::ostream& out) const {
+void BleuScoreState::print(std::ostream& out) const
+{
out << "ref=" << m_scaled_ref_length
- << ";source=" << m_source_length
- << ";target=" << m_target_length << ";counts=";
+ << ";source=" << m_source_length
+ << ";target=" << m_target_length << ";counts=";
for (size_t i = 0; i < bleu_order; ++i) {
out << m_ngram_matches[i] << "/" << m_ngram_counts[i] << ",";
}
out << "ctxt=" << m_words;
-
+
}
void BleuScoreState::AddNgramCountAndMatches(std::vector< size_t >& counts,
- std::vector< size_t >& matches) {
- for (size_t order = 0; order < BleuScoreState::bleu_order; ++order) {
- m_ngram_counts[order] += counts[order];
- m_ngram_matches[order] += matches[order];
- }
+ std::vector< size_t >& matches)
+{
+ for (size_t order = 0; order < BleuScoreState::bleu_order; ++order) {
+ m_ngram_counts[order] += counts[order];
+ m_ngram_matches[order] += matches[order];
+ }
}
BleuScoreFeature::BleuScoreFeature(const std::string &line)
-:StatefulFeatureFunction("BleuScoreFeature",1, line),
-m_enabled(true),
-m_sentence_bleu(true),
-m_simple_history_bleu(false),
-m_count_history(BleuScoreState::bleu_order),
-m_match_history(BleuScoreState::bleu_order),
-m_source_length_history(0),
-m_target_length_history(0),
-m_ref_length_history(0),
-m_scale_by_input_length(true),
-m_scale_by_avg_input_length(false),
-m_scale_by_inverse_length(false),
-m_scale_by_avg_inverse_length(false),
-m_scale_by_x(1),
-m_historySmoothing(0.9),
-m_smoothing_scheme(PLUS_POINT_ONE)
+ :StatefulFeatureFunction("BleuScoreFeature",1, line),
+ m_enabled(true),
+ m_sentence_bleu(true),
+ m_simple_history_bleu(false),
+ m_count_history(BleuScoreState::bleu_order),
+ m_match_history(BleuScoreState::bleu_order),
+ m_source_length_history(0),
+ m_target_length_history(0),
+ m_ref_length_history(0),
+ m_scale_by_input_length(true),
+ m_scale_by_avg_input_length(false),
+ m_scale_by_inverse_length(false),
+ m_scale_by_avg_inverse_length(false),
+ m_scale_by_x(1),
+ m_historySmoothing(0.9),
+ m_smoothing_scheme(PLUS_POINT_ONE)
{
for (size_t i = 0; i < m_args.size(); ++i) {
const vector<string> &args = m_args[i];
@@ -131,10 +135,11 @@ m_smoothing_scheme(PLUS_POINT_ONE)
} // for (size_t i = 0; i < toks.size(); ++i) {
}
-void BleuScoreFeature::PrintHistory(std::ostream& out) const {
- out << "source length history=" << m_source_length_history << endl;
- out << "target length history=" << m_target_length_history << endl;
- out << "ref length history=" << m_ref_length_history << endl;
+void BleuScoreFeature::PrintHistory(std::ostream& out) const
+{
+ out << "source length history=" << m_source_length_history << endl;
+ out << "target length history=" << m_target_length_history << endl;
+ out << "ref length history=" << m_ref_length_history << endl;
for (size_t i = 0; i < BleuScoreState::bleu_order; ++i) {
out << "match history/count history (" << i << "):" << m_match_history[i] << "/" << m_count_history[i] << endl;
@@ -142,48 +147,49 @@ void BleuScoreFeature::PrintHistory(std::ostream& out) const {
}
void BleuScoreFeature::SetBleuParameters(bool disable, bool sentenceBleu, bool scaleByInputLength, bool scaleByAvgInputLength,
- bool scaleByInverseLength, bool scaleByAvgInverseLength,
- float scaleByX, float historySmoothing, size_t scheme, bool simpleHistoryBleu) {
- m_enabled = !disable;
- m_sentence_bleu = sentenceBleu;
- m_simple_history_bleu = simpleHistoryBleu;
- m_scale_by_input_length = scaleByInputLength;
- m_scale_by_avg_input_length = scaleByAvgInputLength;
- m_scale_by_inverse_length = scaleByInverseLength;
- m_scale_by_avg_inverse_length = scaleByAvgInverseLength;
- m_scale_by_x = scaleByX;
- m_historySmoothing = historySmoothing;
- m_smoothing_scheme = (SmoothingScheme)scheme;
+ bool scaleByInverseLength, bool scaleByAvgInverseLength,
+ float scaleByX, float historySmoothing, size_t scheme, bool simpleHistoryBleu)
+{
+ m_enabled = !disable;
+ m_sentence_bleu = sentenceBleu;
+ m_simple_history_bleu = simpleHistoryBleu;
+ m_scale_by_input_length = scaleByInputLength;
+ m_scale_by_avg_input_length = scaleByAvgInputLength;
+ m_scale_by_inverse_length = scaleByInverseLength;
+ m_scale_by_avg_inverse_length = scaleByAvgInverseLength;
+ m_scale_by_x = scaleByX;
+ m_historySmoothing = historySmoothing;
+ m_smoothing_scheme = (SmoothingScheme)scheme;
}
// Incoming references (refs) are stored as refs[file_id][[sent_id][reference]]
// This data structure: m_refs[sent_id][[vector<length>][ngrams]]
void BleuScoreFeature::LoadReferences(const std::vector< std::vector< std::string > >& refs)
{
- m_refs.clear();
- FactorCollection& fc = FactorCollection::Instance();
- for (size_t file_id = 0; file_id < refs.size(); file_id++) {
- for (size_t sent_id = 0; sent_id < refs[file_id].size(); sent_id++) {
- const string& ref = refs[file_id][sent_id];
- vector<string> refTokens = Tokenize(ref);
- if (file_id == 0)
- m_refs[sent_id] = RefValue();
- pair<vector<size_t>,NGrams>& ref_pair = m_refs[sent_id];
- (ref_pair.first).push_back(refTokens.size());
- for (size_t order = 1; order <= BleuScoreState::bleu_order; order++) {
- for (size_t end_idx = order; end_idx <= refTokens.size(); end_idx++) {
- Phrase ngram(1);
- for (size_t s_idx = end_idx - order; s_idx < end_idx; s_idx++) {
- const Factor* f = fc.AddFactor(Output, 0, refTokens[s_idx]);
- Word w;
- w.SetFactor(0, f);
- ngram.AddWord(w);
- }
- ref_pair.second[ngram] += 1;
- }
- }
- }
- }
+ m_refs.clear();
+ FactorCollection& fc = FactorCollection::Instance();
+ for (size_t file_id = 0; file_id < refs.size(); file_id++) {
+ for (size_t sent_id = 0; sent_id < refs[file_id].size(); sent_id++) {
+ const string& ref = refs[file_id][sent_id];
+ vector<string> refTokens = Tokenize(ref);
+ if (file_id == 0)
+ m_refs[sent_id] = RefValue();
+ pair<vector<size_t>,NGrams>& ref_pair = m_refs[sent_id];
+ (ref_pair.first).push_back(refTokens.size());
+ for (size_t order = 1; order <= BleuScoreState::bleu_order; order++) {
+ for (size_t end_idx = order; end_idx <= refTokens.size(); end_idx++) {
+ Phrase ngram(1);
+ for (size_t s_idx = end_idx - order; s_idx < end_idx; s_idx++) {
+ const Factor* f = fc.AddFactor(Output, 0, refTokens[s_idx]);
+ Word w;
+ w.SetFactor(0, f);
+ ngram.AddWord(w);
+ }
+ ref_pair.second[ngram] += 1;
+ }
+ }
+ }
+ }
// cerr << "Number of ref files: " << refs.size() << endl;
// for (size_t i = 0; i < m_refs.size(); ++i) {
@@ -191,51 +197,57 @@ void BleuScoreFeature::LoadReferences(const std::vector< std::vector< std::strin
// }
}
-void BleuScoreFeature::SetCurrSourceLength(size_t source_length) {
- m_cur_source_length = source_length;
+void BleuScoreFeature::SetCurrSourceLength(size_t source_length)
+{
+ m_cur_source_length = source_length;
}
-void BleuScoreFeature::SetCurrNormSourceLength(size_t source_length) {
- m_cur_norm_source_length = source_length;
+void BleuScoreFeature::SetCurrNormSourceLength(size_t source_length)
+{
+ m_cur_norm_source_length = source_length;
}
// m_refs[sent_id][[vector<length>][ngrams]]
-void BleuScoreFeature::SetCurrShortestRefLength(size_t sent_id) {
- // look for shortest reference
- int shortestRef = -1;
- for (size_t i = 0; i < (m_refs[sent_id].first).size(); ++i) {
- if (shortestRef == -1 || (m_refs[sent_id].first)[i] < shortestRef)
- shortestRef = (m_refs[sent_id].first)[i];
- }
- m_cur_ref_length = shortestRef;
+void BleuScoreFeature::SetCurrShortestRefLength(size_t sent_id)
+{
+ // look for shortest reference
+ int shortestRef = -1;
+ for (size_t i = 0; i < (m_refs[sent_id].first).size(); ++i) {
+ if (shortestRef == -1 || (m_refs[sent_id].first)[i] < shortestRef)
+ shortestRef = (m_refs[sent_id].first)[i];
+ }
+ m_cur_ref_length = shortestRef;
// cerr << "Set shortest cur_ref_length: " << m_cur_ref_length << endl;
}
-void BleuScoreFeature::SetCurrAvgRefLength(size_t sent_id) {
- // compute average reference length
- size_t sum = 0;
- size_t numberRefs = (m_refs[sent_id].first).size();
- for (size_t i = 0; i < numberRefs; ++i) {
- sum += (m_refs[sent_id].first)[i];
- }
- m_cur_ref_length = (float)sum/numberRefs;
+void BleuScoreFeature::SetCurrAvgRefLength(size_t sent_id)
+{
+ // compute average reference length
+ size_t sum = 0;
+ size_t numberRefs = (m_refs[sent_id].first).size();
+ for (size_t i = 0; i < numberRefs; ++i) {
+ sum += (m_refs[sent_id].first)[i];
+ }
+ m_cur_ref_length = (float)sum/numberRefs;
// cerr << "Set average cur_ref_length: " << m_cur_ref_length << endl;
}
-void BleuScoreFeature::SetCurrReferenceNgrams(size_t sent_id) {
- m_cur_ref_ngrams = m_refs[sent_id].second;
+void BleuScoreFeature::SetCurrReferenceNgrams(size_t sent_id)
+{
+ m_cur_ref_ngrams = m_refs[sent_id].second;
}
-size_t BleuScoreFeature::GetShortestRefIndex(size_t ref_id) {
- // look for shortest reference
- int shortestRef = -1;
- size_t shortestRefIndex = 0;
- for (size_t i = 0; i < (m_refs[ref_id].first).size(); ++i) {
- if (shortestRef == -1 || (m_refs[ref_id].first)[i] < shortestRef) {
- shortestRef = (m_refs[ref_id].first)[i];
- shortestRefIndex = i;
- }
- }
- return shortestRefIndex;
+size_t BleuScoreFeature::GetShortestRefIndex(size_t ref_id)
+{
+ // look for shortest reference
+ int shortestRef = -1;
+ size_t shortestRefIndex = 0;
+ for (size_t i = 0; i < (m_refs[ref_id].first).size(); ++i) {
+ if (shortestRef == -1 || (m_refs[ref_id].first)[i] < shortestRef) {
+ shortestRef = (m_refs[ref_id].first)[i];
+ shortestRefIndex = i;
+ }
+ }
+ return shortestRefIndex;
}
/*
@@ -244,73 +256,75 @@ size_t BleuScoreFeature::GetShortestRefIndex(size_t ref_id) {
* O = m_historySmoothing * (O + c(e_oracle))
* O_f = m_historySmoothing * (O_f + |f|) input length of pseudo-document
*/
-void BleuScoreFeature::UpdateHistory(const vector< const Word* >& hypo) {
- Phrase phrase(hypo);
+void BleuScoreFeature::UpdateHistory(const vector< const Word* >& hypo)
+{
+ Phrase phrase(hypo);
+ std::vector< size_t > ngram_counts(BleuScoreState::bleu_order);
+ std::vector< size_t > ngram_matches(BleuScoreState::bleu_order);
+
+ // compute vector c(e;{r_k}):
+ // vector of effective reference length, number of ngrams in e, number of ngram matches between e and r_k
+ GetNgramMatchCounts(phrase, m_cur_ref_ngrams, ngram_counts, ngram_matches, 0);
+
+ // update counts and matches for every ngram length with counts from hypo
+ for (size_t i = 0; i < BleuScoreState::bleu_order; i++) {
+ m_count_history[i] = m_historySmoothing * (m_count_history[i] + ngram_counts[i]);
+ m_match_history[i] = m_historySmoothing * (m_match_history[i] + ngram_matches[i]);
+ }
+
+ // update counts for reference and target length
+ m_source_length_history = m_historySmoothing * (m_source_length_history + m_cur_source_length);
+ m_target_length_history = m_historySmoothing * (m_target_length_history + hypo.size());
+ m_ref_length_history = m_historySmoothing * (m_ref_length_history + m_cur_ref_length);
+}
+
+/*
+ * Update history with a batch of translations
+ */
+void BleuScoreFeature::UpdateHistory(const vector< vector< const Word* > >& hypos, vector<size_t>& sourceLengths, vector<size_t>& ref_ids, size_t rank, size_t epoch)
+{
+ for (size_t ref_id = 0; ref_id < hypos.size(); ++ref_id) {
+ Phrase phrase(hypos[ref_id]);
std::vector< size_t > ngram_counts(BleuScoreState::bleu_order);
std::vector< size_t > ngram_matches(BleuScoreState::bleu_order);
+ // set current source and reference information for each oracle in the batch
+ size_t cur_source_length = sourceLengths[ref_id];
+ size_t hypo_length = hypos[ref_id].size();
+ size_t cur_ref_length = GetClosestRefLength(ref_ids[ref_id], hypo_length);
+ NGrams cur_ref_ngrams = m_refs[ref_ids[ref_id]].second;
+ cerr << "reference length: " << cur_ref_length << endl;
+
// compute vector c(e;{r_k}):
// vector of effective reference length, number of ngrams in e, number of ngram matches between e and r_k
- GetNgramMatchCounts(phrase, m_cur_ref_ngrams, ngram_counts, ngram_matches, 0);
+ GetNgramMatchCounts(phrase, cur_ref_ngrams, ngram_counts, ngram_matches, 0);
// update counts and matches for every ngram length with counts from hypo
for (size_t i = 0; i < BleuScoreState::bleu_order; i++) {
- m_count_history[i] = m_historySmoothing * (m_count_history[i] + ngram_counts[i]);
- m_match_history[i] = m_historySmoothing * (m_match_history[i] + ngram_matches[i]);
+ m_count_history[i] += ngram_counts[i];
+ m_match_history[i] += ngram_matches[i];
+
+ // do this for last position in batch
+ if (ref_id == hypos.size() - 1) {
+ m_count_history[i] *= m_historySmoothing;
+ m_match_history[i] *= m_historySmoothing;
+ }
}
// update counts for reference and target length
- m_source_length_history = m_historySmoothing * (m_source_length_history + m_cur_source_length);
- m_target_length_history = m_historySmoothing * (m_target_length_history + hypo.size());
- m_ref_length_history = m_historySmoothing * (m_ref_length_history + m_cur_ref_length);
-}
-
-/*
- * Update history with a batch of translations
- */
-void BleuScoreFeature::UpdateHistory(const vector< vector< const Word* > >& hypos, vector<size_t>& sourceLengths, vector<size_t>& ref_ids, size_t rank, size_t epoch) {
- for (size_t ref_id = 0; ref_id < hypos.size(); ++ref_id){
- Phrase phrase(hypos[ref_id]);
- std::vector< size_t > ngram_counts(BleuScoreState::bleu_order);
- std::vector< size_t > ngram_matches(BleuScoreState::bleu_order);
-
- // set current source and reference information for each oracle in the batch
- size_t cur_source_length = sourceLengths[ref_id];
- size_t hypo_length = hypos[ref_id].size();
- size_t cur_ref_length = GetClosestRefLength(ref_ids[ref_id], hypo_length);
- NGrams cur_ref_ngrams = m_refs[ref_ids[ref_id]].second;
- cerr << "reference length: " << cur_ref_length << endl;
-
- // compute vector c(e;{r_k}):
- // vector of effective reference length, number of ngrams in e, number of ngram matches between e and r_k
- GetNgramMatchCounts(phrase, cur_ref_ngrams, ngram_counts, ngram_matches, 0);
-
- // update counts and matches for every ngram length with counts from hypo
- for (size_t i = 0; i < BleuScoreState::bleu_order; i++) {
- m_count_history[i] += ngram_counts[i];
- m_match_history[i] += ngram_matches[i];
-
- // do this for last position in batch
- if (ref_id == hypos.size() - 1) {
- m_count_history[i] *= m_historySmoothing;
- m_match_history[i] *= m_historySmoothing;
- }
- }
-
- // update counts for reference and target length
- m_source_length_history += cur_source_length;
- m_target_length_history += hypos[ref_id].size();
- m_ref_length_history += cur_ref_length;
-
- // do this for last position in batch
- if (ref_id == hypos.size() - 1) {
- cerr << "Rank " << rank << ", epoch " << epoch << " ,source length history: " << m_source_length_history << " --> " << m_source_length_history * m_historySmoothing << endl;
- cerr << "Rank " << rank << ", epoch " << epoch << " ,target length history: " << m_target_length_history << " --> " << m_target_length_history * m_historySmoothing << endl;
- m_source_length_history *= m_historySmoothing;
- m_target_length_history *= m_historySmoothing;
- m_ref_length_history *= m_historySmoothing;
- }
- }
+ m_source_length_history += cur_source_length;
+ m_target_length_history += hypos[ref_id].size();
+ m_ref_length_history += cur_ref_length;
+
+ // do this for last position in batch
+ if (ref_id == hypos.size() - 1) {
+ cerr << "Rank " << rank << ", epoch " << epoch << " ,source length history: " << m_source_length_history << " --> " << m_source_length_history * m_historySmoothing << endl;
+ cerr << "Rank " << rank << ", epoch " << epoch << " ,target length history: " << m_target_length_history << " --> " << m_target_length_history * m_historySmoothing << endl;
+ m_source_length_history *= m_historySmoothing;
+ m_target_length_history *= m_historySmoothing;
+ m_ref_length_history *= m_historySmoothing;
+ }
+ }
}
/*
@@ -323,17 +337,18 @@ void BleuScoreFeature::UpdateHistory(const vector< vector< const Word* > >& hypo
}
}*/
-size_t BleuScoreFeature::GetClosestRefLength(size_t ref_id, int hypoLength) {
- // look for closest reference
- int currentDist = -1;
- int closestRefLength = -1;
- for (size_t i = 0; i < (m_refs[ref_id].first).size(); ++i) {
- if (closestRefLength == -1 || abs(hypoLength - (int)(m_refs[ref_id].first)[i]) < currentDist) {
- closestRefLength = (m_refs[ref_id].first)[i];
- currentDist = abs(hypoLength - (int)(m_refs[ref_id].first)[i]);
- }
- }
- return (size_t)closestRefLength;
+size_t BleuScoreFeature::GetClosestRefLength(size_t ref_id, int hypoLength)
+{
+ // look for closest reference
+ int currentDist = -1;
+ int closestRefLength = -1;
+ for (size_t i = 0; i < (m_refs[ref_id].first).size(); ++i) {
+ if (closestRefLength == -1 || abs(hypoLength - (int)(m_refs[ref_id].first)[i]) < currentDist) {
+ closestRefLength = (m_refs[ref_id].first)[i];
+ currentDist = abs(hypoLength - (int)(m_refs[ref_id].first)[i]);
+ }
+ }
+ return (size_t)closestRefLength;
}
/*
@@ -341,206 +356,206 @@ size_t BleuScoreFeature::GetClosestRefLength(size_t ref_id, int hypoLength) {
* its ngram matches against the ngrams in the reference translation
*/
void BleuScoreFeature::GetNgramMatchCounts(Phrase& phrase,
- const NGrams& ref_ngram_counts,
- std::vector< size_t >& ret_counts,
- std::vector< size_t >& ret_matches,
- size_t skip_first) const
+ const NGrams& ref_ngram_counts,
+ std::vector< size_t >& ret_counts,
+ std::vector< size_t >& ret_matches,
+ size_t skip_first) const
{
- NGrams::const_iterator ref_ngram_counts_iter;
- size_t ngram_start_idx, ngram_end_idx;
+ NGrams::const_iterator ref_ngram_counts_iter;
+ size_t ngram_start_idx, ngram_end_idx;
- // Chiang et al (2008) use unclipped counts of ngram matches
- for (size_t end_idx = skip_first; end_idx < phrase.GetSize(); end_idx++) {
- for (size_t order = 0; order < BleuScoreState::bleu_order; order++) {
- if (order > end_idx) break;
+ // Chiang et al (2008) use unclipped counts of ngram matches
+ for (size_t end_idx = skip_first; end_idx < phrase.GetSize(); end_idx++) {
+ for (size_t order = 0; order < BleuScoreState::bleu_order; order++) {
+ if (order > end_idx) break;
- ngram_end_idx = end_idx;
- ngram_start_idx = end_idx - order;
+ ngram_end_idx = end_idx;
+ ngram_start_idx = end_idx - order;
- Phrase ngram = phrase.GetSubString(WordsRange(ngram_start_idx, ngram_end_idx), 0);
- ret_counts[order]++;
+ Phrase ngram = phrase.GetSubString(WordsRange(ngram_start_idx, ngram_end_idx), 0);
+ ret_counts[order]++;
- ref_ngram_counts_iter = ref_ngram_counts.find(ngram);
- if (ref_ngram_counts_iter != ref_ngram_counts.end())
- ret_matches[order]++;
- }
+ ref_ngram_counts_iter = ref_ngram_counts.find(ngram);
+ if (ref_ngram_counts_iter != ref_ngram_counts.end())
+ ret_matches[order]++;
}
+ }
}
// score ngrams of words that have been added before the previous word span
void BleuScoreFeature::GetNgramMatchCounts_prefix(Phrase& phrase,
- const NGrams& ref_ngram_counts,
- std::vector< size_t >& ret_counts,
- std::vector< size_t >& ret_matches,
- size_t new_start_indices,
- size_t last_end_index) const
+ const NGrams& ref_ngram_counts,
+ std::vector< size_t >& ret_counts,
+ std::vector< size_t >& ret_matches,
+ size_t new_start_indices,
+ size_t last_end_index) const
{
- NGrams::const_iterator ref_ngram_counts_iter;
- size_t ngram_start_idx, ngram_end_idx;
-
- // Chiang et al (2008) use unclipped counts of ngram matches
- for (size_t start_idx = 0; start_idx < new_start_indices; start_idx++) {
- for (size_t order = 0; order < BleuScoreState::bleu_order; order++) {
- ngram_start_idx = start_idx;
- ngram_end_idx = start_idx + order;
- if (order > ngram_end_idx) break;
- if (ngram_end_idx > last_end_index) break;
-
- Phrase ngram = phrase.GetSubString(WordsRange(ngram_start_idx, ngram_end_idx), 0);
- ret_counts[order]++;
-
- ref_ngram_counts_iter = ref_ngram_counts.find(ngram);
- if (ref_ngram_counts_iter != ref_ngram_counts.end())
- ret_matches[order]++;
- }
+ NGrams::const_iterator ref_ngram_counts_iter;
+ size_t ngram_start_idx, ngram_end_idx;
+
+ // Chiang et al (2008) use unclipped counts of ngram matches
+ for (size_t start_idx = 0; start_idx < new_start_indices; start_idx++) {
+ for (size_t order = 0; order < BleuScoreState::bleu_order; order++) {
+ ngram_start_idx = start_idx;
+ ngram_end_idx = start_idx + order;
+ if (order > ngram_end_idx) break;
+ if (ngram_end_idx > last_end_index) break;
+
+ Phrase ngram = phrase.GetSubString(WordsRange(ngram_start_idx, ngram_end_idx), 0);
+ ret_counts[order]++;
+
+ ref_ngram_counts_iter = ref_ngram_counts.find(ngram);
+ if (ref_ngram_counts_iter != ref_ngram_counts.end())
+ ret_matches[order]++;
}
+ }
}
// score ngrams around the overlap of two previously scored phrases
void BleuScoreFeature::GetNgramMatchCounts_overlap(Phrase& phrase,
- const NGrams& ref_ngram_counts,
- std::vector< size_t >& ret_counts,
- std::vector< size_t >& ret_matches,
- size_t overlap_index) const
+ const NGrams& ref_ngram_counts,
+ std::vector< size_t >& ret_counts,
+ std::vector< size_t >& ret_matches,
+ size_t overlap_index) const
{
- NGrams::const_iterator ref_ngram_counts_iter;
- size_t ngram_start_idx, ngram_end_idx;
+ NGrams::const_iterator ref_ngram_counts_iter;
+ size_t ngram_start_idx, ngram_end_idx;
- // Chiang et al (2008) use unclipped counts of ngram matches
- for (size_t end_idx = overlap_index; end_idx < phrase.GetSize(); end_idx++) {
- if (end_idx >= (overlap_index+BleuScoreState::bleu_order-1)) break;
- for (size_t order = 0; order < BleuScoreState::bleu_order; order++) {
- if (order > end_idx) break;
+ // Chiang et al (2008) use unclipped counts of ngram matches
+ for (size_t end_idx = overlap_index; end_idx < phrase.GetSize(); end_idx++) {
+ if (end_idx >= (overlap_index+BleuScoreState::bleu_order-1)) break;
+ for (size_t order = 0; order < BleuScoreState::bleu_order; order++) {
+ if (order > end_idx) break;
- ngram_end_idx = end_idx;
- ngram_start_idx = end_idx - order;
- if (ngram_start_idx >= overlap_index) continue; // only score ngrams that span the overlap point
+ ngram_end_idx = end_idx;
+ ngram_start_idx = end_idx - order;
+ if (ngram_start_idx >= overlap_index) continue; // only score ngrams that span the overlap point
- Phrase ngram = phrase.GetSubString(WordsRange(ngram_start_idx, ngram_end_idx), 0);
- ret_counts[order]++;
+ Phrase ngram = phrase.GetSubString(WordsRange(ngram_start_idx, ngram_end_idx), 0);
+ ret_counts[order]++;
- ref_ngram_counts_iter = ref_ngram_counts.find(ngram);
- if (ref_ngram_counts_iter != ref_ngram_counts.end())
- ret_matches[order]++;
- }
+ ref_ngram_counts_iter = ref_ngram_counts.find(ngram);
+ if (ref_ngram_counts_iter != ref_ngram_counts.end())
+ ret_matches[order]++;
}
+ }
}
void BleuScoreFeature::GetClippedNgramMatchesAndCounts(Phrase& phrase,
- const NGrams& ref_ngram_counts,
- std::vector< size_t >& ret_counts,
- std::vector< size_t >& ret_matches,
- size_t skip_first) const
+ const NGrams& ref_ngram_counts,
+ std::vector< size_t >& ret_counts,
+ std::vector< size_t >& ret_matches,
+ size_t skip_first) const
{
- NGrams::const_iterator ref_ngram_counts_iter;
- size_t ngram_start_idx, ngram_end_idx;
+ NGrams::const_iterator ref_ngram_counts_iter;
+ size_t ngram_start_idx, ngram_end_idx;
- Matches ngram_matches;
- for (size_t end_idx = skip_first; end_idx < phrase.GetSize(); end_idx++) {
- for (size_t order = 0; order < BleuScoreState::bleu_order; order++) {
- if (order > end_idx) break;
+ Matches ngram_matches;
+ for (size_t end_idx = skip_first; end_idx < phrase.GetSize(); end_idx++) {
+ for (size_t order = 0; order < BleuScoreState::bleu_order; order++) {
+ if (order > end_idx) break;
- ngram_end_idx = end_idx;
- ngram_start_idx = end_idx - order;
+ ngram_end_idx = end_idx;
+ ngram_start_idx = end_idx - order;
- Phrase ngram = phrase.GetSubString(WordsRange(ngram_start_idx, ngram_end_idx), 0);
- ret_counts[order]++;
+ Phrase ngram = phrase.GetSubString(WordsRange(ngram_start_idx, ngram_end_idx), 0);
+ ret_counts[order]++;
- ref_ngram_counts_iter = ref_ngram_counts.find(ngram);
- if (ref_ngram_counts_iter != ref_ngram_counts.end()) {
- ngram_matches[order][ngram]++;
- }
- }
- }
+ ref_ngram_counts_iter = ref_ngram_counts.find(ngram);
+ if (ref_ngram_counts_iter != ref_ngram_counts.end()) {
+ ngram_matches[order][ngram]++;
+ }
+ }
+ }
- // clip ngram matches
- for (size_t order = 0; order < BleuScoreState::bleu_order; order++) {
- NGrams::const_iterator iter;
-
- // iterate over ngram counts for every ngram order
- for (iter=ngram_matches[order].begin(); iter != ngram_matches[order].end(); ++iter) {
- ref_ngram_counts_iter = ref_ngram_counts.find(iter->first);
- if (iter->second > ref_ngram_counts_iter->second) {
- ret_matches[order] += ref_ngram_counts_iter->second;
- }
- else {
- ret_matches[order] += iter->second;
- }
+ // clip ngram matches
+ for (size_t order = 0; order < BleuScoreState::bleu_order; order++) {
+ NGrams::const_iterator iter;
+
+ // iterate over ngram counts for every ngram order
+ for (iter=ngram_matches[order].begin(); iter != ngram_matches[order].end(); ++iter) {
+ ref_ngram_counts_iter = ref_ngram_counts.find(iter->first);
+ if (iter->second > ref_ngram_counts_iter->second) {
+ ret_matches[order] += ref_ngram_counts_iter->second;
+ } else {
+ ret_matches[order] += iter->second;
+ }
}
- }
+ }
}
/*
* Given a previous state, compute Bleu score for the updated state with an additional target
* phrase translated.
*/
-FFState* BleuScoreFeature::Evaluate(const Hypothesis& cur_hypo,
- const FFState* prev_state,
+FFState* BleuScoreFeature::Evaluate(const Hypothesis& cur_hypo,
+ const FFState* prev_state,
ScoreComponentCollection* accumulator) const
{
- if (!m_enabled) return new BleuScoreState();
-
- NGrams::const_iterator reference_ngrams_iter;
- const BleuScoreState& ps = dynamic_cast<const BleuScoreState&>(*prev_state);
- BleuScoreState* new_state = new BleuScoreState(ps);
-
- float old_bleu, new_bleu;
- size_t num_new_words, ctx_start_idx, ctx_end_idx;
-
- // Calculate old bleu;
- old_bleu = CalculateBleu(new_state);
-
- // Get context and append new words.
- num_new_words = cur_hypo.GetCurrTargetLength();
- if (num_new_words == 0) {
- return new_state;
- }
-
- Phrase new_words = ps.m_words;
- new_words.Append(cur_hypo.GetCurrTargetPhrase());
- //cerr << "NW: " << new_words << endl;
+ if (!m_enabled) return new BleuScoreState();
- // get ngram matches for new words
- GetNgramMatchCounts(new_words,
- m_cur_ref_ngrams,
- new_state->m_ngram_counts,
- new_state->m_ngram_matches,
- new_state->m_words.GetSize()); // number of words in previous states
+ NGrams::const_iterator reference_ngrams_iter;
+ const BleuScoreState& ps = dynamic_cast<const BleuScoreState&>(*prev_state);
+ BleuScoreState* new_state = new BleuScoreState(ps);
- // Update state variables
- ctx_end_idx = new_words.GetSize()-1;
- size_t bleu_context_length = BleuScoreState::bleu_order -1;
- if (ctx_end_idx > bleu_context_length) {
- ctx_start_idx = ctx_end_idx - bleu_context_length;
- } else {
- ctx_start_idx = 0;
- }
+ float old_bleu, new_bleu;
+ size_t num_new_words, ctx_start_idx, ctx_end_idx;
- WordsBitmap coverageVector = cur_hypo.GetWordsBitmap();
- new_state->m_source_length = coverageVector.GetNumWordsCovered();
+ // Calculate old bleu;
+ old_bleu = CalculateBleu(new_state);
- new_state->m_words = new_words.GetSubString(WordsRange(ctx_start_idx,
- ctx_end_idx));
- new_state->m_target_length += cur_hypo.GetCurrTargetLength();
+ // Get context and append new words.
+ num_new_words = cur_hypo.GetCurrTargetLength();
+ if (num_new_words == 0) {
+ return new_state;
+ }
- // we need a scaled reference length to compare the current target phrase to the corresponding reference phrase
- new_state->m_scaled_ref_length = m_cur_ref_length *
- ((float)coverageVector.GetNumWordsCovered()/coverageVector.GetSize());
+ Phrase new_words = ps.m_words;
+ new_words.Append(cur_hypo.GetCurrTargetPhrase());
+ //cerr << "NW: " << new_words << endl;
- // Calculate new bleu.
- new_bleu = CalculateBleu(new_state);
+ // get ngram matches for new words
+ GetNgramMatchCounts(new_words,
+ m_cur_ref_ngrams,
+ new_state->m_ngram_counts,
+ new_state->m_ngram_matches,
+ new_state->m_words.GetSize()); // number of words in previous states
- // Set score to new Bleu score
- accumulator->PlusEquals(this, new_bleu - old_bleu);
- return new_state;
+ // Update state variables
+ ctx_end_idx = new_words.GetSize()-1;
+ size_t bleu_context_length = BleuScoreState::bleu_order -1;
+ if (ctx_end_idx > bleu_context_length) {
+ ctx_start_idx = ctx_end_idx - bleu_context_length;
+ } else {
+ ctx_start_idx = 0;
+ }
+
+ WordsBitmap coverageVector = cur_hypo.GetWordsBitmap();
+ new_state->m_source_length = coverageVector.GetNumWordsCovered();
+
+ new_state->m_words = new_words.GetSubString(WordsRange(ctx_start_idx,
+ ctx_end_idx));
+ new_state->m_target_length += cur_hypo.GetCurrTargetLength();
+
+ // we need a scaled reference length to compare the current target phrase to the corresponding reference phrase
+ new_state->m_scaled_ref_length = m_cur_ref_length *
+ ((float)coverageVector.GetNumWordsCovered()/coverageVector.GetSize());
+
+ // Calculate new bleu.
+ new_bleu = CalculateBleu(new_state);
+
+ // Set score to new Bleu score
+ accumulator->PlusEquals(this, new_bleu - old_bleu);
+ return new_state;
}
FFState* BleuScoreFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int featureID,
- ScoreComponentCollection* accumulator ) const {
+ ScoreComponentCollection* accumulator ) const
+{
if (!m_enabled) return new BleuScoreState();
-
+
NGrams::const_iterator reference_ngrams_iter;
-
+
const Phrase& curr_target_phrase = static_cast<const Phrase&>(cur_hypo.GetCurrTargetPhrase());
// cerr << "\nCur target phrase: " << cur_hypo.GetTargetLHS() << " --> " << curr_target_phrase << endl;
@@ -553,35 +568,35 @@ FFState* BleuScoreFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int fe
assert(cur_hypo.GetPrevHypos().size() <= 2);
BleuScoreState* new_state;
if (cur_hypo.GetPrevHypos().size() == 0)
- new_state = new BleuScoreState();
+ new_state = new BleuScoreState();
else {
- const FFState* prev_state_zero = cur_hypo.GetPrevHypo(0)->GetFFState(featureID);
- const BleuScoreState& ps_zero = dynamic_cast<const BleuScoreState&>(*prev_state_zero);
- new_state = new BleuScoreState(ps_zero);
- num_words_first_prev = ps_zero.m_target_length;
-
- for (size_t i = 0; i < cur_hypo.GetPrevHypos().size(); ++i) {
- const FFState* prev_state = cur_hypo.GetPrevHypo(i)->GetFFState(featureID);
- const BleuScoreState* ps = dynamic_cast<const BleuScoreState*>(prev_state);
- BleuScoreState* ps_nonConst = const_cast<BleuScoreState*>(ps);
+ const FFState* prev_state_zero = cur_hypo.GetPrevHypo(0)->GetFFState(featureID);
+ const BleuScoreState& ps_zero = dynamic_cast<const BleuScoreState&>(*prev_state_zero);
+ new_state = new BleuScoreState(ps_zero);
+ num_words_first_prev = ps_zero.m_target_length;
+
+ for (size_t i = 0; i < cur_hypo.GetPrevHypos().size(); ++i) {
+ const FFState* prev_state = cur_hypo.GetPrevHypo(i)->GetFFState(featureID);
+ const BleuScoreState* ps = dynamic_cast<const BleuScoreState*>(prev_state);
+ BleuScoreState* ps_nonConst = const_cast<BleuScoreState*>(ps);
// cerr << "prev phrase: " << cur_hypo.GetPrevHypo(i)->GetOutputPhrase()
// << " ( " << cur_hypo.GetPrevHypo(i)->GetTargetLHS() << ")" << endl;
- old_bleu += CalculateBleu(ps_nonConst);
- num_old_words += ps->m_target_length;
+ old_bleu += CalculateBleu(ps_nonConst);
+ num_old_words += ps->m_target_length;
- if (i > 0)
- // add ngram matches from other previous states
- new_state->AddNgramCountAndMatches(ps_nonConst->m_ngram_counts, ps_nonConst->m_ngram_matches);
- }
+ if (i > 0)
+ // add ngram matches from other previous states
+ new_state->AddNgramCountAndMatches(ps_nonConst->m_ngram_counts, ps_nonConst->m_ngram_matches);
+ }
}
-
+
// check if we are already done (don't add <s> and </s>)
size_t numWordsCovered = cur_hypo.GetCurrSourceRange().GetNumWordsCovered();
if (numWordsCovered == m_cur_source_length) {
- // Bleu score stays the same, do not need to add anything
- //accumulator->PlusEquals(this, 0);
- return new_state;
+ // Bleu score stays the same, do not need to add anything
+ //accumulator->PlusEquals(this, 0);
+ return new_state;
}
// set new context
@@ -592,55 +607,52 @@ FFState* BleuScoreFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int fe
// get ngram matches for new words
if (num_old_words == 0) {
// cerr << "compute right ngram context" << endl;
- GetNgramMatchCounts(new_words,
- m_cur_ref_ngrams,
- new_state->m_ngram_counts,
- new_state->m_ngram_matches,
- 0);
- }
- else if (new_words.GetSize() == num_old_words) {
- // two hypotheses were glued together, compute new ngrams on the basis of first hypothesis
- num_words_added_right = num_curr_words - num_words_first_prev;
- // score around overlap point
+ GetNgramMatchCounts(new_words,
+ m_cur_ref_ngrams,
+ new_state->m_ngram_counts,
+ new_state->m_ngram_matches,
+ 0);
+ } else if (new_words.GetSize() == num_old_words) {
+ // two hypotheses were glued together, compute new ngrams on the basis of first hypothesis
+ num_words_added_right = num_curr_words - num_words_first_prev;
+ // score around overlap point
// cerr << "compute overlap ngram context (" << (num_words_first_prev) << ")" << endl;
- GetNgramMatchCounts_overlap(new_words,
- m_cur_ref_ngrams,
- new_state->m_ngram_counts,
- new_state->m_ngram_matches,
- num_words_first_prev);
- }
- else if (num_old_words + curr_target_phrase.GetNumTerminals() == num_curr_words) {
- assert(curr_target_phrase.GetSize() == curr_target_phrase.GetNumTerminals()+1);
- // previous hypothesis + rule with 1 non-terminal were combined (NT substituted by Ts)
- for (size_t i = 0; i < curr_target_phrase.GetSize(); ++i)
- if (curr_target_phrase.GetWord(i).IsNonTerminal()) {
- num_words_added_left = i;
- num_words_added_right = curr_target_phrase.GetSize() - (i+1);
- break;
- }
-
- // left context
+ GetNgramMatchCounts_overlap(new_words,
+ m_cur_ref_ngrams,
+ new_state->m_ngram_counts,
+ new_state->m_ngram_matches,
+ num_words_first_prev);
+ } else if (num_old_words + curr_target_phrase.GetNumTerminals() == num_curr_words) {
+ assert(curr_target_phrase.GetSize() == curr_target_phrase.GetNumTerminals()+1);
+ // previous hypothesis + rule with 1 non-terminal were combined (NT substituted by Ts)
+ for (size_t i = 0; i < curr_target_phrase.GetSize(); ++i)
+ if (curr_target_phrase.GetWord(i).IsNonTerminal()) {
+ num_words_added_left = i;
+ num_words_added_right = curr_target_phrase.GetSize() - (i+1);
+ break;
+ }
+
+ // left context
// cerr << "compute left ngram context" << endl;
- if (num_words_added_left > 0)
- GetNgramMatchCounts_prefix(new_words,
- m_cur_ref_ngrams,
- new_state->m_ngram_counts,
- new_state->m_ngram_matches,
- num_words_added_left,
- num_curr_words - num_words_added_right - 1);
-
- // right context
+ if (num_words_added_left > 0)
+ GetNgramMatchCounts_prefix(new_words,
+ m_cur_ref_ngrams,
+ new_state->m_ngram_counts,
+ new_state->m_ngram_matches,
+ num_words_added_left,
+ num_curr_words - num_words_added_right - 1);
+
+ // right context
// cerr << "compute right ngram context" << endl;
- if (num_words_added_right > 0)
- GetNgramMatchCounts(new_words,
- m_cur_ref_ngrams,
- new_state->m_ngram_counts,
- new_state->m_ngram_matches,
- num_words_added_left + num_old_words);
- }
- else {
- cerr << "undefined state.. " << endl;
- exit(1);
+ if (num_words_added_right > 0)
+ GetNgramMatchCounts(new_words,
+ m_cur_ref_ngrams,
+ new_state->m_ngram_counts,
+ new_state->m_ngram_matches,
+ num_words_added_left + num_old_words);
+ } else {
+ cerr << "undefined state.. " << endl;
+ exit(1);
}
// Update state variables
@@ -659,7 +671,7 @@ FFState* BleuScoreFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int fe
// reference phrase
size_t cur_source_length = m_cur_source_length;
new_state->m_scaled_ref_length = m_cur_ref_length * (float(new_state->m_source_length)/cur_source_length);
-
+
// Calculate new bleu.
new_bleu = CalculateBleu(new_state);
@@ -675,28 +687,28 @@ float BleuScoreFeature::CalculateBleu(Phrase translation) const
{
if (translation.GetSize() == 0)
return 0.0;
-
+
Phrase normTranslation = translation;
// remove start and end symbol for chart decoding
if (m_cur_source_length != m_cur_norm_source_length) {
WordsRange* range = new WordsRange(1, translation.GetSize()-2);
normTranslation = translation.GetSubString(*range);
}
-
+
// get ngram matches for translation
BleuScoreState* state = new BleuScoreState();
GetClippedNgramMatchesAndCounts(normTranslation,
- m_cur_ref_ngrams,
- state->m_ngram_counts,
- state->m_ngram_matches,
- 0); // number of words in previous states
+ m_cur_ref_ngrams,
+ state->m_ngram_counts,
+ state->m_ngram_matches,
+ 0); // number of words in previous states
// set state variables
state->m_words = normTranslation;
state->m_source_length = m_cur_norm_source_length;
state->m_target_length = normTranslation.GetSize();
state->m_scaled_ref_length = m_cur_ref_length;
-
+
// Calculate bleu.
return CalculateBleu(state);
}
@@ -704,52 +716,53 @@ float BleuScoreFeature::CalculateBleu(Phrase translation) const
/*
* Calculate Bleu score for a partial hypothesis given as state.
*/
-float BleuScoreFeature::CalculateBleu(BleuScoreState* state) const {
+float BleuScoreFeature::CalculateBleu(BleuScoreState* state) const
+{
if (!state->m_ngram_counts[0]) return 0;
if (!state->m_ngram_matches[0]) return 0; // if we have no unigram matches, score should be 0
-
+
float precision = 1.0;
float smooth = 1;
float smoothed_count, smoothed_matches;
-
+
if (m_sentence_bleu || m_simple_history_bleu) {
// Calculate geometric mean of modified ngram precisions
// BLEU = BP * exp(SUM_1_4 1/4 * log p_n)
// = BP * 4th root(PRODUCT_1_4 p_n)
for (size_t i = 0; i < BleuScoreState::bleu_order; i++) {
if (state->m_ngram_counts[i]) {
- smoothed_matches = state->m_ngram_matches[i];
- smoothed_count = state->m_ngram_counts[i];
-
- switch (m_smoothing_scheme) {
- case PLUS_ONE:
- default:
- if (i > 0) {
- // smoothing for all n > 1
- smoothed_matches += 1;
- smoothed_count += 1;
- }
- break;
- case PLUS_POINT_ONE:
- if (i > 0) {
- // smoothing for all n > 1
- smoothed_matches += 0.1;
- smoothed_count += 0.1;
- }
- break;
- case PAPINENI:
- if (state->m_ngram_matches[i] == 0) {
- smooth *= 0.5;
- smoothed_matches += smooth;
- smoothed_count += smooth;
- }
- break;
- }
-
- if (m_simple_history_bleu) {
- smoothed_matches += m_match_history[i];
- smoothed_count += m_count_history[i];
- }
+ smoothed_matches = state->m_ngram_matches[i];
+ smoothed_count = state->m_ngram_counts[i];
+
+ switch (m_smoothing_scheme) {
+ case PLUS_ONE:
+ default:
+ if (i > 0) {
+ // smoothing for all n > 1
+ smoothed_matches += 1;
+ smoothed_count += 1;
+ }
+ break;
+ case PLUS_POINT_ONE:
+ if (i > 0) {
+ // smoothing for all n > 1
+ smoothed_matches += 0.1;
+ smoothed_count += 0.1;
+ }
+ break;
+ case PAPINENI:
+ if (state->m_ngram_matches[i] == 0) {
+ smooth *= 0.5;
+ smoothed_matches += smooth;
+ smoothed_count += smooth;
+ }
+ break;
+ }
+
+ if (m_simple_history_bleu) {
+ smoothed_matches += m_match_history[i];
+ smoothed_count += m_count_history[i];
+ }
precision *= smoothed_matches/smoothed_count;
}
@@ -766,40 +779,35 @@ float BleuScoreFeature::CalculateBleu(BleuScoreState* state) const {
// r: effective reference length (sum of best match lengths for each candidate sentence)
if (m_simple_history_bleu) {
if ((m_target_length_history + state->m_target_length) < (m_ref_length_history + state->m_scaled_ref_length)) {
- float smoothed_target_length = m_target_length_history + state->m_target_length;
- float smoothed_ref_length = m_ref_length_history + state->m_scaled_ref_length;
- precision *= exp(1 - (smoothed_ref_length/smoothed_target_length));
+ float smoothed_target_length = m_target_length_history + state->m_target_length;
+ float smoothed_ref_length = m_ref_length_history + state->m_scaled_ref_length;
+ precision *= exp(1 - (smoothed_ref_length/smoothed_target_length));
}
- }
- else {
+ } else {
if (state->m_target_length < state->m_scaled_ref_length) {
- float target_length = state->m_target_length;
- float ref_length = state->m_scaled_ref_length;
- precision *= exp(1 - (ref_length/target_length));
+ float target_length = state->m_target_length;
+ float ref_length = state->m_scaled_ref_length;
+ precision *= exp(1 - (ref_length/target_length));
}
}
-
+
//cerr << "precision: " << precision << endl;
-
+
// Approximate bleu score as of Chiang/Resnik is scaled by the size of the input:
// B(e;f,{r_k}) = (O_f + |f|) * BLEU(O + c(e;{r_k}))
// where c(e;) is a vector of reference length, ngram counts and ngram matches
if (m_scale_by_input_length) {
precision *= m_cur_norm_source_length;
- }
- else if (m_scale_by_avg_input_length) {
+ } else if (m_scale_by_avg_input_length) {
precision *= m_avg_input_length;
- }
- else if (m_scale_by_inverse_length) {
+ } else if (m_scale_by_inverse_length) {
precision *= (100/m_cur_norm_source_length);
- }
- else if (m_scale_by_avg_inverse_length) {
+ } else if (m_scale_by_avg_inverse_length) {
precision *= (100/m_avg_input_length);
}
-
+
return precision * m_scale_by_x;
- }
- else {
+ } else {
// Revised history BLEU: compute Bleu in the context of the pseudo-document
// B(b) = size_of_oracle_doc * (Bleu(B_hist + b) - Bleu(B_hist))
// Calculate geometric mean of modified ngram precisions
@@ -807,12 +815,12 @@ float BleuScoreFeature::CalculateBleu(BleuScoreState* state) const {
// = BP * 4th root(PRODUCT_1_4 p_n)
for (size_t i = 0; i < BleuScoreState::bleu_order; i++) {
if (state->m_ngram_counts[i]) {
- smoothed_matches = m_match_history[i] + state->m_ngram_matches[i] + 0.1;
- smoothed_count = m_count_history[i] + state->m_ngram_counts[i] + 0.1;
- precision *= smoothed_matches/smoothed_count;
+ smoothed_matches = m_match_history[i] + state->m_ngram_matches[i] + 0.1;
+ smoothed_count = m_count_history[i] + state->m_ngram_counts[i] + 0.1;
+ precision *= smoothed_matches/smoothed_count;
}
}
-
+
// take geometric mean
precision = pow(precision, (float)1/4);
@@ -826,25 +834,24 @@ float BleuScoreFeature::CalculateBleu(BleuScoreState* state) const {
float precision_pd = 1.0;
if (m_target_length_history > 0) {
for (size_t i = 0; i < BleuScoreState::bleu_order; i++)
- if (m_count_history[i] != 0)
- precision_pd *= (m_match_history[i] + 0.1)/(m_count_history[i] + 0.1);
-
+ if (m_count_history[i] != 0)
+ precision_pd *= (m_match_history[i] + 0.1)/(m_count_history[i] + 0.1);
+
// take geometric mean
precision_pd = pow(precision_pd, (float)1/4);
// Apply brevity penalty if applicable.
if (m_target_length_history < m_ref_length_history)
- precision_pd *= exp(1 - (m_ref_length_history/m_target_length_history));
- }
- else
+ precision_pd *= exp(1 - (m_ref_length_history/m_target_length_history));
+ } else
precision_pd = 0;
// **end BLEU of pseudo-document**
cerr << "precision pd: " << precision_pd << endl;
float sentence_impact;
- if (m_target_length_history > 0)
- sentence_impact = m_target_length_history * (precision - precision_pd);
+ if (m_target_length_history > 0)
+ sentence_impact = m_target_length_history * (precision - precision_pd);
else
sentence_impact = precision;
@@ -855,7 +862,7 @@ float BleuScoreFeature::CalculateBleu(BleuScoreState* state) const {
const FFState* BleuScoreFeature::EmptyHypothesisState(const InputType& input) const
{
- return new BleuScoreState();
+ return new BleuScoreState();
}
} // namespace.
diff --git a/moses/FF/BleuScoreFeature.h b/moses/FF/BleuScoreFeature.h
index dc4495506..96e273672 100644
--- a/moses/FF/BleuScoreFeature.h
+++ b/moses/FF/BleuScoreFeature.h
@@ -13,31 +13,33 @@
#include "moses/Phrase.h"
#include "moses/ChartHypothesis.h"
-namespace Moses {
+namespace Moses
+{
class BleuScoreFeature;
-class BleuScoreState : public FFState {
+class BleuScoreState : public FFState
+{
public:
- friend class BleuScoreFeature;
- static size_t bleu_order;
+ friend class BleuScoreFeature;
+ static size_t bleu_order;
- BleuScoreState();
- virtual int Compare(const FFState& other) const;
- void print(std::ostream& out) const;
+ BleuScoreState();
+ virtual int Compare(const FFState& other) const;
+ void print(std::ostream& out) const;
private:
- Phrase m_words;
- size_t m_source_length;
- size_t m_target_length;
+ Phrase m_words;
+ size_t m_source_length;
+ size_t m_target_length;
- // scaled reference length is needed for scoring incomplete hypotheses against reference translation
- float m_scaled_ref_length;
+ // scaled reference length is needed for scoring incomplete hypotheses against reference translation
+ float m_scaled_ref_length;
- std::vector< size_t > m_ngram_counts;
- std::vector< size_t > m_ngram_matches;
+ std::vector< size_t > m_ngram_counts;
+ std::vector< size_t > m_ngram_matches;
- void AddNgramCountAndMatches(std::vector< size_t >& counts, std::vector< size_t >& matches);
+ void AddNgramCountAndMatches(std::vector< size_t >& counts, std::vector< size_t >& matches);
};
@@ -56,7 +58,8 @@ public:
};
-class BleuScoreFeature : public StatefulFeatureFunction {
+class BleuScoreFeature : public StatefulFeatureFunction
+{
public:
typedef boost::unordered_map<size_t, RefValue > RefCounts;
@@ -64,95 +67,105 @@ public:
BleuScoreFeature(const std::string &line);
- void PrintHistory(std::ostream& out) const;
- void LoadReferences(const std::vector< std::vector< std::string > > &);
- void SetCurrSourceLength(size_t);
- void SetCurrNormSourceLength(size_t);
- void SetCurrShortestRefLength(size_t);
- void SetCurrAvgRefLength(size_t sent_id);
- void SetAvgInputLength (float l) { m_avg_input_length = l; }
- void SetCurrReferenceNgrams(size_t sent_id);
- size_t GetShortestRefIndex(size_t ref_id);
- size_t GetClosestRefLength(size_t ref_id, int hypoLength);
- void UpdateHistory(const std::vector< const Word* >&);
- void UpdateHistory(const std::vector< std::vector< const Word* > >& hypos, std::vector<size_t>& sourceLengths, std::vector<size_t>& ref_ids, size_t rank, size_t epoch);
- void PrintRefLength(const std::vector<size_t>& ref_ids);
- void SetBleuParameters(bool disable, bool sentenceBleu, bool scaleByInputLength, bool scaleByAvgInputLength,
- bool scaleByInverseLength, bool scaleByAvgInverseLength,
- float scaleByX, float historySmoothing, size_t scheme, bool simpleHistoryBleu);
-
- void GetNgramMatchCounts(Phrase&,
- const NGrams&,
- std::vector< size_t >&,
- std::vector< size_t >&,
- size_t skip = 0) const;
- void GetNgramMatchCounts_prefix(Phrase&,
- const NGrams&,
- std::vector< size_t >&,
- std::vector< size_t >&,
- size_t new_start_indices,
- size_t last_end_index) const;
- void GetNgramMatchCounts_overlap(Phrase& phrase,
- const NGrams& ref_ngram_counts,
- std::vector< size_t >& ret_counts,
- std::vector< size_t >& ret_matches,
- size_t overlap_index) const;
- void GetClippedNgramMatchesAndCounts(Phrase&,
- const NGrams&,
- std::vector< size_t >&,
- std::vector< size_t >&,
- size_t skip = 0) const;
-
- FFState* Evaluate( const Hypothesis& cur_hypo,
- const FFState* prev_state,
- ScoreComponentCollection* accumulator) const;
- FFState* EvaluateChart(const ChartHypothesis& cur_hypo,
- int featureID,
- ScoreComponentCollection* accumulator) const;
- bool Enabled() const { return m_enabled; }
- float CalculateBleu(BleuScoreState*) const;
- float CalculateBleu(Phrase translation) const;
- const FFState* EmptyHypothesisState(const InputType&) const;
-
- float GetSourceLengthHistory() { return m_source_length_history; }
- float GetTargetLengthHistory() { return m_target_length_history; }
- float GetAverageInputLength() { return m_avg_input_length; }
+ void PrintHistory(std::ostream& out) const;
+ void LoadReferences(const std::vector< std::vector< std::string > > &);
+ void SetCurrSourceLength(size_t);
+ void SetCurrNormSourceLength(size_t);
+ void SetCurrShortestRefLength(size_t);
+ void SetCurrAvgRefLength(size_t sent_id);
+ void SetAvgInputLength (float l) {
+ m_avg_input_length = l;
+ }
+ void SetCurrReferenceNgrams(size_t sent_id);
+ size_t GetShortestRefIndex(size_t ref_id);
+ size_t GetClosestRefLength(size_t ref_id, int hypoLength);
+ void UpdateHistory(const std::vector< const Word* >&);
+ void UpdateHistory(const std::vector< std::vector< const Word* > >& hypos, std::vector<size_t>& sourceLengths, std::vector<size_t>& ref_ids, size_t rank, size_t epoch);
+ void PrintRefLength(const std::vector<size_t>& ref_ids);
+ void SetBleuParameters(bool disable, bool sentenceBleu, bool scaleByInputLength, bool scaleByAvgInputLength,
+ bool scaleByInverseLength, bool scaleByAvgInverseLength,
+ float scaleByX, float historySmoothing, size_t scheme, bool simpleHistoryBleu);
+
+ void GetNgramMatchCounts(Phrase&,
+ const NGrams&,
+ std::vector< size_t >&,
+ std::vector< size_t >&,
+ size_t skip = 0) const;
+ void GetNgramMatchCounts_prefix(Phrase&,
+ const NGrams&,
+ std::vector< size_t >&,
+ std::vector< size_t >&,
+ size_t new_start_indices,
+ size_t last_end_index) const;
+ void GetNgramMatchCounts_overlap(Phrase& phrase,
+ const NGrams& ref_ngram_counts,
+ std::vector< size_t >& ret_counts,
+ std::vector< size_t >& ret_matches,
+ size_t overlap_index) const;
+ void GetClippedNgramMatchesAndCounts(Phrase&,
+ const NGrams&,
+ std::vector< size_t >&,
+ std::vector< size_t >&,
+ size_t skip = 0) const;
+
+ FFState* Evaluate( const Hypothesis& cur_hypo,
+ const FFState* prev_state,
+ ScoreComponentCollection* accumulator) const;
+ FFState* EvaluateChart(const ChartHypothesis& cur_hypo,
+ int featureID,
+ ScoreComponentCollection* accumulator) const;
+ bool Enabled() const {
+ return m_enabled;
+ }
+ float CalculateBleu(BleuScoreState*) const;
+ float CalculateBleu(Phrase translation) const;
+ const FFState* EmptyHypothesisState(const InputType&) const;
+
+ float GetSourceLengthHistory() {
+ return m_source_length_history;
+ }
+ float GetTargetLengthHistory() {
+ return m_target_length_history;
+ }
+ float GetAverageInputLength() {
+ return m_avg_input_length;
+ }
private:
- bool m_enabled;
- bool m_sentence_bleu;
- bool m_simple_history_bleu;
+ bool m_enabled;
+ bool m_sentence_bleu;
+ bool m_simple_history_bleu;
- // counts for pseudo-document
- std::vector< float > m_count_history;
- std::vector< float > m_match_history;
- float m_source_length_history;
- float m_target_length_history;
- float m_ref_length_history;
+ // counts for pseudo-document
+ std::vector< float > m_count_history;
+ std::vector< float > m_match_history;
+ float m_source_length_history;
+ float m_target_length_history;
+ float m_ref_length_history;
- size_t m_cur_source_length;
- size_t m_cur_norm_source_length; // length without <s>, </s>
- RefCounts m_refs;
- NGrams m_cur_ref_ngrams;
- float m_cur_ref_length;
+ size_t m_cur_source_length;
+ size_t m_cur_norm_source_length; // length without <s>, </s>
+ RefCounts m_refs;
+ NGrams m_cur_ref_ngrams;
+ float m_cur_ref_length;
- // scale BLEU score by history of input length
- bool m_scale_by_input_length;
- bool m_scale_by_avg_input_length;
+ // scale BLEU score by history of input length
+ bool m_scale_by_input_length;
+ bool m_scale_by_avg_input_length;
- // scale by the inverse of the input length * 100
- bool m_scale_by_inverse_length;
- bool m_scale_by_avg_inverse_length;
+ // scale by the inverse of the input length * 100
+ bool m_scale_by_inverse_length;
+ bool m_scale_by_avg_inverse_length;
- float m_avg_input_length;
+ float m_avg_input_length;
- float m_scale_by_x;
+ float m_scale_by_x;
- // smoothing factor for history counts
- float m_historySmoothing;
+ // smoothing factor for history counts
+ float m_historySmoothing;
- enum SmoothingScheme { PLUS_ONE = 1, PLUS_POINT_ONE = 2, PAPINENI = 3 };
- SmoothingScheme m_smoothing_scheme;
+ enum SmoothingScheme { PLUS_ONE = 1, PLUS_POINT_ONE = 2, PAPINENI = 3 };
+ SmoothingScheme m_smoothing_scheme;
};
} // Namespace.
diff --git a/moses/FF/ChartBasedFeatureContext.cpp b/moses/FF/ChartBasedFeatureContext.cpp
index 803f81deb..a74cce50c 100644
--- a/moses/FF/ChartBasedFeatureContext.cpp
+++ b/moses/FF/ChartBasedFeatureContext.cpp
@@ -5,15 +5,15 @@
namespace Moses
{
ChartBasedFeatureContext::ChartBasedFeatureContext
- (const ChartHypothesis* hypothesis):
+(const ChartHypothesis* hypothesis):
m_hypothesis(hypothesis),
m_targetPhrase(hypothesis->GetCurrTargetPhrase()),
m_source(hypothesis->GetManager().GetSource())
{}
ChartBasedFeatureContext::ChartBasedFeatureContext(
- const TargetPhrase& targetPhrase,
- const InputType& source):
+ const TargetPhrase& targetPhrase,
+ const InputType& source):
m_hypothesis(NULL),
m_targetPhrase(targetPhrase),
m_source(source)
diff --git a/moses/FF/ChartBasedFeatureContext.h b/moses/FF/ChartBasedFeatureContext.h
index 7649effde..a204f7c77 100644
--- a/moses/FF/ChartBasedFeatureContext.h
+++ b/moses/FF/ChartBasedFeatureContext.h
@@ -11,7 +11,7 @@ class TargetPhrase;
**/
class ChartBasedFeatureContext
{
- //The context either has a hypothesis (during search) or a
+ //The context either has a hypothesis (during search) or a
//TargetPhrase and source sentence (during pre-calculation)
//TODO: should the context also include some info on where the TargetPhrase
//is anchored (assuming it's lexicalised), which is available at pre-calc?
@@ -24,11 +24,13 @@ public:
ChartBasedFeatureContext(const TargetPhrase& targetPhrase,
const InputType& source);
- const InputType& GetSource() const
- { return m_source; }
+ const InputType& GetSource() const {
+ return m_source;
+ }
- const TargetPhrase& GetTargetPhrase() const
- { return m_targetPhrase; }
+ const TargetPhrase& GetTargetPhrase() const {
+ return m_targetPhrase;
+ }
};
diff --git a/moses/FF/DistortionScoreProducer.cpp b/moses/FF/DistortionScoreProducer.cpp
index 413679779..328c833c8 100644
--- a/moses/FF/DistortionScoreProducer.cpp
+++ b/moses/FF/DistortionScoreProducer.cpp
@@ -39,8 +39,7 @@ float DistortionScoreProducer::CalculateDistortionScore(const Hypothesis& hypo,
{
if(!StaticData::Instance().UseEarlyDistortionCost()) {
return - (float) hypo.GetInput().ComputeDistortionDistance(prev, curr);
- }
- else {
+ } else {
/* Pay distortion score as soon as possible, from Moore and Quirk MT Summit 2007
Definitions:
S : current source range
@@ -50,23 +49,23 @@ float DistortionScoreProducer::CalculateDistortionScore(const Hypothesis& hypo,
int prefixEndPos = (int)FirstGap-1;
if((int)FirstGap==-1)
- prefixEndPos = -1;
+ prefixEndPos = -1;
// case1: S is adjacent to S'' => return 0
if ((int) curr.GetStartPos() == prefixEndPos+1) {
- IFVERBOSE(4) std::cerr<< "MQ07disto:case1" << std::endl;
+ IFVERBOSE(4) std::cerr<< "MQ07disto:case1" << std::endl;
return 0;
}
// case2: S is to the left of S' => return 2(length(S))
if ((int) curr.GetEndPos() < (int) prev.GetEndPos()) {
- IFVERBOSE(4) std::cerr<< "MQ07disto:case2" << std::endl;
+ IFVERBOSE(4) std::cerr<< "MQ07disto:case2" << std::endl;
return (float) -2*(int)curr.GetNumWordsCovered();
}
// case3: S' is a subsequence of S'' => return 2(nbWordBetween(S,S'')+length(S))
if ((int) prev.GetEndPos() <= prefixEndPos) {
- IFVERBOSE(4) std::cerr<< "MQ07disto:case3" << std::endl;
+ IFVERBOSE(4) std::cerr<< "MQ07disto:case3" << std::endl;
int z = (int)curr.GetStartPos()-prefixEndPos - 1;
return (float) -2*(z + (int)curr.GetNumWordsCovered());
}
diff --git a/moses/FF/DistortionScoreProducer.h b/moses/FF/DistortionScoreProducer.h
index 394e7f2e1..2601e6398 100644
--- a/moses/FF/DistortionScoreProducer.h
+++ b/moses/FF/DistortionScoreProducer.h
@@ -17,12 +17,12 @@ class WordsRange;
class DistortionScoreProducer : public StatefulFeatureFunction
{
public:
- DistortionScoreProducer(const std::string &line)
- : StatefulFeatureFunction("Distortion", 1, line)
- {}
+ DistortionScoreProducer(const std::string &line)
+ : StatefulFeatureFunction("Distortion", 1, line)
+ {}
static float CalculateDistortionScore(const Hypothesis& hypo,
- const WordsRange &prev, const WordsRange &curr, const int FirstGapPosition);
+ const WordsRange &prev, const WordsRange &curr, const int FirstGapPosition);
virtual const FFState* EmptyHypothesisState(const InputType &input) const;
@@ -35,8 +35,8 @@ public:
const ChartHypothesis& /* cur_hypo */,
int /* featureID - used to index the state in the previous hypotheses */,
ScoreComponentCollection*) const {
- throw std::logic_error("DistortionScoreProducer not supported in chart decoder, yet");
- }
+ throw std::logic_error("DistortionScoreProducer not supported in chart decoder, yet");
+ }
};
}
diff --git a/moses/FF/FFState.h b/moses/FF/FFState.h
index 49b0e55a8..bb3a119ef 100644
--- a/moses/FF/FFState.h
+++ b/moses/FF/FFState.h
@@ -15,11 +15,12 @@ public:
virtual int Compare(const FFState& other) const = 0;
};
-class DummyState : public FFState {
+class DummyState : public FFState
+{
public:
DummyState() {}
int Compare(const FFState& other) const {
- return 0;
+ return 0;
}
};
diff --git a/moses/FF/FeatureFunction.cpp b/moses/FF/FeatureFunction.cpp
index d1a73e1a9..ea4441522 100644
--- a/moses/FF/FeatureFunction.cpp
+++ b/moses/FF/FeatureFunction.cpp
@@ -19,7 +19,7 @@ std::vector<const StatelessFeatureFunction*> StatelessFeatureFunction::m_statele
std::vector<const StatefulFeatureFunction*> StatefulFeatureFunction::m_statefulFFs;
FeatureFunction::FeatureFunction(const std::string& description, const std::string &line)
-: m_tuneable(true)
+ : m_tuneable(true)
{
ParseLine(description, line);
@@ -35,13 +35,13 @@ FeatureFunction::FeatureFunction(const std::string& description, const std::stri
m_description = dstream.str();
}
- ScoreComponentCollection::RegisterScoreProducer(this);
+ ScoreComponentCollection::RegisterScoreProducer(this);
m_producers.push_back(this);
}
FeatureFunction::FeatureFunction(const std::string& description, size_t numScoreComponents, const std::string &line)
-: m_numScoreComponents(numScoreComponents)
-, m_tuneable(true)
+ : m_numScoreComponents(numScoreComponents)
+ , m_tuneable(true)
{
ParseLine(description, line);
@@ -75,14 +75,11 @@ void FeatureFunction::ParseLine(const std::string& description, const std::strin
if (args[0] == "num-features") {
m_numScoreComponents = Scan<size_t>(args[1]);
- }
- else if (args[0] == "name") {
+ } else if (args[0] == "name") {
m_description = args[1];
- }
- else if (args[0] == "tuneable") {
+ } else if (args[0] == "tuneable") {
m_tuneable = Scan<bool>(args[1]);
- }
- else {
+ } else {
m_args.push_back(args);
}
}
diff --git a/moses/FF/FeatureFunction.h b/moses/FF/FeatureFunction.h
index 6e1fa67a8..97e7d754d 100644
--- a/moses/FF/FeatureFunction.h
+++ b/moses/FF/FeatureFunction.h
@@ -42,26 +42,33 @@ protected:
void ParseLine(const std::string& description, const std::string &line);
public:
- static const std::vector<FeatureFunction*>& GetFeatureFunctions() { return m_producers; }
+ static const std::vector<FeatureFunction*>& GetFeatureFunctions() {
+ return m_producers;
+ }
FeatureFunction(const std::string& description, const std::string &line);
FeatureFunction(const std::string& description, size_t numScoreComponents, const std::string &line);
- virtual bool IsStateless() const = 0;
+ virtual bool IsStateless() const = 0;
virtual ~FeatureFunction();
-
+
static void ResetDescriptionCounts() {
description_counts.clear();
}
//! returns the number of scores that a subclass produces.
//! For example, a language model conventionally produces 1, a translation table some arbitrary number, etc
- size_t GetNumScoreComponents() const {return m_numScoreComponents;}
+ size_t GetNumScoreComponents() const {
+ return m_numScoreComponents;
+ }
//! returns a string description of this producer
- const std::string& GetScoreProducerDescription() const
- { return m_description; }
+ const std::string& GetScoreProducerDescription() const {
+ return m_description;
+ }
- virtual bool IsTuneable() const { return m_tuneable; }
+ virtual bool IsTuneable() const {
+ return m_tuneable;
+ }
//!
virtual void InitializeForInput(InputType const& source)
@@ -71,17 +78,18 @@ public:
virtual void CleanUpAfterSentenceProcessing(const InputType& source)
{}
- const std::string &GetArgLine() const
- { return m_argLine; }
+ const std::string &GetArgLine() const {
+ return m_argLine;
+ }
virtual void Evaluate(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const
{}
virtual void Evaluate(const InputType &source
- , ScoreComponentCollection &scoreBreakdown) const
+ , ScoreComponentCollection &scoreBreakdown) const
{}
};
diff --git a/moses/FF/GlobalLexicalModel.cpp b/moses/FF/GlobalLexicalModel.cpp
index 5724f6598..cbc6811ee 100644
--- a/moses/FF/GlobalLexicalModel.cpp
+++ b/moses/FF/GlobalLexicalModel.cpp
@@ -10,7 +10,7 @@ using namespace std;
namespace Moses
{
GlobalLexicalModel::GlobalLexicalModel(const std::string &line)
-: StatelessFeatureFunction("GlobalLexicalModel",1, line)
+ : StatelessFeatureFunction("GlobalLexicalModel",1, line)
{
std::cerr << "Creating global lexical model...\n";
@@ -23,14 +23,11 @@ GlobalLexicalModel::GlobalLexicalModel(const std::string &line)
if (args[0] == "file") {
CHECK(args.size() == 2);
filePath = args[1];
- }
- else if (args[0] == "inputFactors") {
+ } else if (args[0] == "inputFactors") {
inputFactors = Tokenize<FactorType>(args[1],",");
- }
- else if (args[0] == "outputFactors") {
+ } else if (args[0] == "outputFactors") {
outputFactors = Tokenize<FactorType>(args[1],",");
- }
- else {
+ } else {
throw "Unknown argument " + args[0];
}
}
@@ -179,11 +176,11 @@ float GlobalLexicalModel::GetFromCacheOrScorePhrase( const TargetPhrase& targetP
}
void GlobalLexicalModel::Evaluate
- (const PhraseBasedFeatureContext& context,
- ScoreComponentCollection* accumulator) const
+(const PhraseBasedFeatureContext& context,
+ ScoreComponentCollection* accumulator) const
{
- accumulator->PlusEquals( this,
- GetFromCacheOrScorePhrase(context.GetTargetPhrase()) );
+ accumulator->PlusEquals( this,
+ GetFromCacheOrScorePhrase(context.GetTargetPhrase()) );
}
}
diff --git a/moses/FF/GlobalLexicalModel.h b/moses/FF/GlobalLexicalModel.h
index 03659b7f2..b3bf79b53 100644
--- a/moses/FF/GlobalLexicalModel.h
+++ b/moses/FF/GlobalLexicalModel.h
@@ -37,8 +37,7 @@ class GlobalLexicalModel : public StatelessFeatureFunction
typedef std::map< const Word*, float, WordComparer > SingleHash;
typedef std::map< const TargetPhrase*, float > LexiconCache;
- struct ThreadLocalStorage
- {
+ struct ThreadLocalStorage {
LexiconCache cache;
const Sentence *input;
};
@@ -64,18 +63,17 @@ private:
public:
GlobalLexicalModel(const std::string &line);
- virtual ~GlobalLexicalModel();
+ virtual ~GlobalLexicalModel();
void InitializeForInput( Sentence const& in );
void Evaluate(const PhraseBasedFeatureContext& context,
- ScoreComponentCollection* accumulator) const;
+ ScoreComponentCollection* accumulator) const;
void EvaluateChart(
const ChartBasedFeatureContext& context,
- ScoreComponentCollection* accumulator) const
- {
+ ScoreComponentCollection* accumulator) const {
throw std::logic_error("GlobalLexicalModel not supported in chart decoder, yet");
}
diff --git a/moses/FF/GlobalLexicalModelUnlimited.cpp b/moses/FF/GlobalLexicalModelUnlimited.cpp
index d4b1aeb37..5c096e43f 100644
--- a/moses/FF/GlobalLexicalModelUnlimited.cpp
+++ b/moses/FF/GlobalLexicalModelUnlimited.cpp
@@ -10,7 +10,7 @@ using namespace std;
namespace Moses
{
GlobalLexicalModelUnlimited::GlobalLexicalModelUnlimited(const std::string &line)
-:StatelessFeatureFunction("GlobalLexicalModelUnlimited", 0, line)
+ :StatelessFeatureFunction("GlobalLexicalModelUnlimited", 0, line)
{
const vector<string> modelSpec = Tokenize(line);
@@ -25,7 +25,7 @@ GlobalLexicalModelUnlimited::GlobalLexicalModelUnlimited(const std::string &line
if (spec.size() > 0) {
if (spec.size() != 2 && spec.size() != 3 && spec.size() != 4 && spec.size() != 6) {
UserMessage::Add("Format of glm feature is <factor-src>-<factor-tgt> [ignore-punct] [use-bias] "
- "[context-type] [filename-src filename-tgt]");
+ "[context-type] [filename-src filename-tgt]");
//return false;
}
@@ -41,8 +41,7 @@ GlobalLexicalModelUnlimited::GlobalLexicalModelUnlimited(const std::string &line
filenameTarget = spec[5];
restricted = true;
}
- }
- else
+ } else
factors = Tokenize(modelSpec[i],"-");
if ( factors.size() != 2 ) {
@@ -66,14 +65,13 @@ GlobalLexicalModelUnlimited::GlobalLexicalModelUnlimited(const std::string &line
}
bool GlobalLexicalModelUnlimited::Load(const std::string &filePathSource,
- const std::string &filePathTarget)
+ const std::string &filePathTarget)
{
// restricted source word vocabulary
ifstream inFileSource(filePathSource.c_str());
- if (!inFileSource)
- {
- cerr << "could not open file " << filePathSource << endl;
- return false;
+ if (!inFileSource) {
+ cerr << "could not open file " << filePathSource << endl;
+ return false;
}
std::string line;
@@ -85,10 +83,9 @@ bool GlobalLexicalModelUnlimited::Load(const std::string &filePathSource,
// restricted target word vocabulary
ifstream inFileTarget(filePathTarget.c_str());
- if (!inFileTarget)
- {
- cerr << "could not open file " << filePathTarget << endl;
- return false;
+ if (!inFileTarget) {
+ cerr << "could not open file " << filePathTarget << endl;
+ return false;
}
while (getline(inFileTarget, line)) {
@@ -109,228 +106,222 @@ void GlobalLexicalModelUnlimited::InitializeForInput( Sentence const& in )
void GlobalLexicalModelUnlimited::Evaluate(const Hypothesis& cur_hypo, ScoreComponentCollection* accumulator) const
{
- const Sentence& input = *(m_local->input);
- const TargetPhrase& targetPhrase = cur_hypo.GetCurrTargetPhrase();
-
- for(int targetIndex = 0; targetIndex < targetPhrase.GetSize(); targetIndex++ ) {
- StringPiece targetString = targetPhrase.GetWord(targetIndex).GetString(0); // TODO: change for other factors
-
- if (m_ignorePunctuation) {
- // check if first char is punctuation
- char firstChar = targetString[0];
- CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
- if(charIterator != m_punctuationHash.end())
- continue;
- }
-
- if (m_biasFeature) {
- stringstream feature;
- feature << "glm_";
- feature << targetString;
- feature << "~";
- feature << "**BIAS**";
- accumulator->SparsePlusEquals(feature.str(), 1);
- }
-
- boost::unordered_set<uint64_t> alreadyScored;
- for(int sourceIndex = 0; sourceIndex < input.GetSize(); sourceIndex++ ) {
- const StringPiece sourceString = input.GetWord(sourceIndex).GetString(0); // TODO: change for other factors
-
- if (m_ignorePunctuation) {
- // check if first char is punctuation
- char firstChar = sourceString[0];
- CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
- if(charIterator != m_punctuationHash.end())
- continue;
- }
- const uint64_t sourceHash = util::MurmurHashNative(sourceString.data(), sourceString.size());
-
- if ( alreadyScored.find(sourceHash) == alreadyScored.end()) {
- bool sourceExists, targetExists;
- if (!m_unrestricted) {
- sourceExists = FindStringPiece(m_vocabSource, sourceString ) != m_vocabSource.end();
- targetExists = FindStringPiece(m_vocabTarget, targetString) != m_vocabTarget.end();
- }
-
- // no feature if vocab is in use and both words are not in restricted vocabularies
- if (m_unrestricted || (sourceExists && targetExists)) {
- if (m_sourceContext) {
- if (sourceIndex == 0) {
- // add <s> trigger feature for source
- stringstream feature;
- feature << "glm_";
- feature << targetString;
- feature << "~";
- feature << "<s>,";
- feature << sourceString;
- accumulator->SparsePlusEquals(feature.str(), 1);
- alreadyScored.insert(sourceHash);
- }
-
- // add source words to the right of current source word as context
- for(int contextIndex = sourceIndex+1; contextIndex < input.GetSize(); contextIndex++ ) {
- StringPiece contextString = input.GetWord(contextIndex).GetString(0); // TODO: change for other factors
- bool contextExists;
- if (!m_unrestricted)
- contextExists = FindStringPiece(m_vocabSource, contextString ) != m_vocabSource.end();
-
- if (m_unrestricted || contextExists) {
- stringstream feature;
- feature << "glm_";
- feature << targetString;
- feature << "~";
- feature << sourceString;
- feature << ",";
- feature << contextString;
- accumulator->SparsePlusEquals(feature.str(), 1);
- alreadyScored.insert(sourceHash);
- }
- }
- }
- else if (m_biphrase) {
- // --> look backwards for constructing context
- int globalTargetIndex = cur_hypo.GetSize() - targetPhrase.GetSize() + targetIndex;
-
- // 1) source-target pair, trigger source word (can be discont.) and adjacent target word (bigram)
- StringPiece targetContext;
- if (globalTargetIndex > 0)
- targetContext = cur_hypo.GetWord(globalTargetIndex-1).GetString(0); // TODO: change for other factors
- else
- targetContext = "<s>";
-
- if (sourceIndex == 0) {
- StringPiece sourceTrigger = "<s>";
- AddFeature(accumulator, sourceTrigger, sourceString,
- targetContext, targetString);
- }
- else
- for(int contextIndex = sourceIndex-1; contextIndex >= 0; contextIndex-- ) {
- StringPiece sourceTrigger = input.GetWord(contextIndex).GetString(0); // TODO: change for other factors
- bool sourceTriggerExists = false;
- if (!m_unrestricted)
- sourceTriggerExists = FindStringPiece(m_vocabSource, sourceTrigger ) != m_vocabSource.end();
-
- if (m_unrestricted || sourceTriggerExists)
- AddFeature(accumulator, sourceTrigger, sourceString,
- targetContext, targetString);
- }
-
- // 2) source-target pair, adjacent source word (bigram) and trigger target word (can be discont.)
- StringPiece sourceContext;
- if (sourceIndex-1 >= 0)
- sourceContext = input.GetWord(sourceIndex-1).GetString(0); // TODO: change for other factors
- else
- sourceContext = "<s>";
-
- if (globalTargetIndex == 0) {
- string targetTrigger = "<s>";
- AddFeature(accumulator, sourceContext, sourceString,
- targetTrigger, targetString);
- }
- else
- for(int globalContextIndex = globalTargetIndex-1; globalContextIndex >= 0; globalContextIndex-- ) {
- StringPiece targetTrigger = cur_hypo.GetWord(globalContextIndex).GetString(0); // TODO: change for other factors
- bool targetTriggerExists = false;
- if (!m_unrestricted)
- targetTriggerExists = FindStringPiece(m_vocabTarget, targetTrigger ) != m_vocabTarget.end();
-
- if (m_unrestricted || targetTriggerExists)
- AddFeature(accumulator, sourceContext, sourceString,
- targetTrigger, targetString);
- }
- }
- else if (m_bitrigger) {
- // allow additional discont. triggers on both sides
- int globalTargetIndex = cur_hypo.GetSize() - targetPhrase.GetSize() + targetIndex;
-
- if (sourceIndex == 0) {
- StringPiece sourceTrigger = "<s>";
- bool sourceTriggerExists = true;
-
- if (globalTargetIndex == 0) {
- string targetTrigger = "<s>";
- bool targetTriggerExists = true;
-
- if (m_unrestricted || (sourceTriggerExists && targetTriggerExists))
- AddFeature(accumulator, sourceTrigger, sourceString,
- targetTrigger, targetString);
- }
- else {
- // iterate backwards over target
- for(int globalContextIndex = globalTargetIndex-1; globalContextIndex >= 0; globalContextIndex-- ) {
- StringPiece targetTrigger = cur_hypo.GetWord(globalContextIndex).GetString(0); // TODO: change for other factors
- bool targetTriggerExists = false;
- if (!m_unrestricted)
- targetTriggerExists = FindStringPiece(m_vocabTarget, targetTrigger ) != m_vocabTarget.end();
-
- if (m_unrestricted || (sourceTriggerExists && targetTriggerExists))
- AddFeature(accumulator, sourceTrigger, sourceString,
- targetTrigger, targetString);
- }
- }
- }
- // iterate over both source and target
- else {
- // iterate backwards over source
- for(int contextIndex = sourceIndex-1; contextIndex >= 0; contextIndex-- ) {
- StringPiece sourceTrigger = input.GetWord(contextIndex).GetString(0); // TODO: change for other factors
- bool sourceTriggerExists = false;
- if (!m_unrestricted)
- sourceTriggerExists = FindStringPiece(m_vocabSource, sourceTrigger ) != m_vocabSource.end();
-
- if (globalTargetIndex == 0) {
- string targetTrigger = "<s>";
- bool targetTriggerExists = true;
-
- if (m_unrestricted || (sourceTriggerExists && targetTriggerExists))
- AddFeature(accumulator, sourceTrigger, sourceString,
- targetTrigger, targetString);
- }
- else {
- // iterate backwards over target
- for(int globalContextIndex = globalTargetIndex-1; globalContextIndex >= 0; globalContextIndex-- ) {
- StringPiece targetTrigger = cur_hypo.GetWord(globalContextIndex).GetString(0); // TODO: change for other factors
- bool targetTriggerExists = false;
- if (!m_unrestricted)
- targetTriggerExists = FindStringPiece(m_vocabTarget, targetTrigger ) != m_vocabTarget.end();
-
- if (m_unrestricted || (sourceTriggerExists && targetTriggerExists))
- AddFeature(accumulator, sourceTrigger, sourceString,
- targetTrigger, targetString);
- }
- }
- }
- }
- }
- else {
- stringstream feature;
- feature << "glm_";
- feature << targetString;
- feature << "~";
- feature << sourceString;
- accumulator->SparsePlusEquals(feature.str(), 1);
- alreadyScored.insert(sourceHash);
-
- }
- }
- }
- }
+ const Sentence& input = *(m_local->input);
+ const TargetPhrase& targetPhrase = cur_hypo.GetCurrTargetPhrase();
+
+ for(int targetIndex = 0; targetIndex < targetPhrase.GetSize(); targetIndex++ ) {
+ StringPiece targetString = targetPhrase.GetWord(targetIndex).GetString(0); // TODO: change for other factors
+
+ if (m_ignorePunctuation) {
+ // check if first char is punctuation
+ char firstChar = targetString[0];
+ CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
+ if(charIterator != m_punctuationHash.end())
+ continue;
+ }
+
+ if (m_biasFeature) {
+ stringstream feature;
+ feature << "glm_";
+ feature << targetString;
+ feature << "~";
+ feature << "**BIAS**";
+ accumulator->SparsePlusEquals(feature.str(), 1);
+ }
+
+ boost::unordered_set<uint64_t> alreadyScored;
+ for(int sourceIndex = 0; sourceIndex < input.GetSize(); sourceIndex++ ) {
+ const StringPiece sourceString = input.GetWord(sourceIndex).GetString(0); // TODO: change for other factors
+
+ if (m_ignorePunctuation) {
+ // check if first char is punctuation
+ char firstChar = sourceString[0];
+ CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
+ if(charIterator != m_punctuationHash.end())
+ continue;
+ }
+ const uint64_t sourceHash = util::MurmurHashNative(sourceString.data(), sourceString.size());
+
+ if ( alreadyScored.find(sourceHash) == alreadyScored.end()) {
+ bool sourceExists, targetExists;
+ if (!m_unrestricted) {
+ sourceExists = FindStringPiece(m_vocabSource, sourceString ) != m_vocabSource.end();
+ targetExists = FindStringPiece(m_vocabTarget, targetString) != m_vocabTarget.end();
+ }
+
+ // no feature if vocab is in use and both words are not in restricted vocabularies
+ if (m_unrestricted || (sourceExists && targetExists)) {
+ if (m_sourceContext) {
+ if (sourceIndex == 0) {
+ // add <s> trigger feature for source
+ stringstream feature;
+ feature << "glm_";
+ feature << targetString;
+ feature << "~";
+ feature << "<s>,";
+ feature << sourceString;
+ accumulator->SparsePlusEquals(feature.str(), 1);
+ alreadyScored.insert(sourceHash);
+ }
+
+ // add source words to the right of current source word as context
+ for(int contextIndex = sourceIndex+1; contextIndex < input.GetSize(); contextIndex++ ) {
+ StringPiece contextString = input.GetWord(contextIndex).GetString(0); // TODO: change for other factors
+ bool contextExists;
+ if (!m_unrestricted)
+ contextExists = FindStringPiece(m_vocabSource, contextString ) != m_vocabSource.end();
+
+ if (m_unrestricted || contextExists) {
+ stringstream feature;
+ feature << "glm_";
+ feature << targetString;
+ feature << "~";
+ feature << sourceString;
+ feature << ",";
+ feature << contextString;
+ accumulator->SparsePlusEquals(feature.str(), 1);
+ alreadyScored.insert(sourceHash);
+ }
+ }
+ } else if (m_biphrase) {
+ // --> look backwards for constructing context
+ int globalTargetIndex = cur_hypo.GetSize() - targetPhrase.GetSize() + targetIndex;
+
+ // 1) source-target pair, trigger source word (can be discont.) and adjacent target word (bigram)
+ StringPiece targetContext;
+ if (globalTargetIndex > 0)
+ targetContext = cur_hypo.GetWord(globalTargetIndex-1).GetString(0); // TODO: change for other factors
+ else
+ targetContext = "<s>";
+
+ if (sourceIndex == 0) {
+ StringPiece sourceTrigger = "<s>";
+ AddFeature(accumulator, sourceTrigger, sourceString,
+ targetContext, targetString);
+ } else
+ for(int contextIndex = sourceIndex-1; contextIndex >= 0; contextIndex-- ) {
+ StringPiece sourceTrigger = input.GetWord(contextIndex).GetString(0); // TODO: change for other factors
+ bool sourceTriggerExists = false;
+ if (!m_unrestricted)
+ sourceTriggerExists = FindStringPiece(m_vocabSource, sourceTrigger ) != m_vocabSource.end();
+
+ if (m_unrestricted || sourceTriggerExists)
+ AddFeature(accumulator, sourceTrigger, sourceString,
+ targetContext, targetString);
+ }
+
+ // 2) source-target pair, adjacent source word (bigram) and trigger target word (can be discont.)
+ StringPiece sourceContext;
+ if (sourceIndex-1 >= 0)
+ sourceContext = input.GetWord(sourceIndex-1).GetString(0); // TODO: change for other factors
+ else
+ sourceContext = "<s>";
+
+ if (globalTargetIndex == 0) {
+ string targetTrigger = "<s>";
+ AddFeature(accumulator, sourceContext, sourceString,
+ targetTrigger, targetString);
+ } else
+ for(int globalContextIndex = globalTargetIndex-1; globalContextIndex >= 0; globalContextIndex-- ) {
+ StringPiece targetTrigger = cur_hypo.GetWord(globalContextIndex).GetString(0); // TODO: change for other factors
+ bool targetTriggerExists = false;
+ if (!m_unrestricted)
+ targetTriggerExists = FindStringPiece(m_vocabTarget, targetTrigger ) != m_vocabTarget.end();
+
+ if (m_unrestricted || targetTriggerExists)
+ AddFeature(accumulator, sourceContext, sourceString,
+ targetTrigger, targetString);
+ }
+ } else if (m_bitrigger) {
+ // allow additional discont. triggers on both sides
+ int globalTargetIndex = cur_hypo.GetSize() - targetPhrase.GetSize() + targetIndex;
+
+ if (sourceIndex == 0) {
+ StringPiece sourceTrigger = "<s>";
+ bool sourceTriggerExists = true;
+
+ if (globalTargetIndex == 0) {
+ string targetTrigger = "<s>";
+ bool targetTriggerExists = true;
+
+ if (m_unrestricted || (sourceTriggerExists && targetTriggerExists))
+ AddFeature(accumulator, sourceTrigger, sourceString,
+ targetTrigger, targetString);
+ } else {
+ // iterate backwards over target
+ for(int globalContextIndex = globalTargetIndex-1; globalContextIndex >= 0; globalContextIndex-- ) {
+ StringPiece targetTrigger = cur_hypo.GetWord(globalContextIndex).GetString(0); // TODO: change for other factors
+ bool targetTriggerExists = false;
+ if (!m_unrestricted)
+ targetTriggerExists = FindStringPiece(m_vocabTarget, targetTrigger ) != m_vocabTarget.end();
+
+ if (m_unrestricted || (sourceTriggerExists && targetTriggerExists))
+ AddFeature(accumulator, sourceTrigger, sourceString,
+ targetTrigger, targetString);
+ }
+ }
+ }
+ // iterate over both source and target
+ else {
+ // iterate backwards over source
+ for(int contextIndex = sourceIndex-1; contextIndex >= 0; contextIndex-- ) {
+ StringPiece sourceTrigger = input.GetWord(contextIndex).GetString(0); // TODO: change for other factors
+ bool sourceTriggerExists = false;
+ if (!m_unrestricted)
+ sourceTriggerExists = FindStringPiece(m_vocabSource, sourceTrigger ) != m_vocabSource.end();
+
+ if (globalTargetIndex == 0) {
+ string targetTrigger = "<s>";
+ bool targetTriggerExists = true;
+
+ if (m_unrestricted || (sourceTriggerExists && targetTriggerExists))
+ AddFeature(accumulator, sourceTrigger, sourceString,
+ targetTrigger, targetString);
+ } else {
+ // iterate backwards over target
+ for(int globalContextIndex = globalTargetIndex-1; globalContextIndex >= 0; globalContextIndex-- ) {
+ StringPiece targetTrigger = cur_hypo.GetWord(globalContextIndex).GetString(0); // TODO: change for other factors
+ bool targetTriggerExists = false;
+ if (!m_unrestricted)
+ targetTriggerExists = FindStringPiece(m_vocabTarget, targetTrigger ) != m_vocabTarget.end();
+
+ if (m_unrestricted || (sourceTriggerExists && targetTriggerExists))
+ AddFeature(accumulator, sourceTrigger, sourceString,
+ targetTrigger, targetString);
+ }
+ }
+ }
+ }
+ } else {
+ stringstream feature;
+ feature << "glm_";
+ feature << targetString;
+ feature << "~";
+ feature << sourceString;
+ accumulator->SparsePlusEquals(feature.str(), 1);
+ alreadyScored.insert(sourceHash);
+
+ }
+ }
+ }
+ }
}
}
void GlobalLexicalModelUnlimited::AddFeature(ScoreComponentCollection* accumulator,
- StringPiece sourceTrigger, StringPiece sourceWord,
- StringPiece targetTrigger, StringPiece targetWord) const {
- stringstream feature;
- feature << "glm_";
- feature << targetTrigger;
- feature << ",";
- feature << targetWord;
- feature << "~";
- feature << sourceTrigger;
- feature << ",";
- feature << sourceWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
+ StringPiece sourceTrigger, StringPiece sourceWord,
+ StringPiece targetTrigger, StringPiece targetWord) const
+{
+ stringstream feature;
+ feature << "glm_";
+ feature << targetTrigger;
+ feature << ",";
+ feature << targetWord;
+ feature << "~";
+ feature << sourceTrigger;
+ feature << ",";
+ feature << sourceWord;
+ accumulator->SparsePlusEquals(feature.str(), 1);
}
diff --git a/moses/FF/GlobalLexicalModelUnlimited.h b/moses/FF/GlobalLexicalModelUnlimited.h
index 42b7abae9..28579f55c 100644
--- a/moses/FF/GlobalLexicalModelUnlimited.h
+++ b/moses/FF/GlobalLexicalModelUnlimited.h
@@ -38,11 +38,10 @@ class InputType;
class GlobalLexicalModelUnlimited : public StatelessFeatureFunction
{
- typedef std::map< char, short > CharHash;
- typedef std::map< std::string, short > StringHash;
+ typedef std::map< char, short > CharHash;
+ typedef std::map< std::string, short > StringHash;
- struct ThreadLocalStorage
- {
+ struct ThreadLocalStorage {
const Sentence *input;
};
@@ -77,23 +76,23 @@ public:
void InitializeForInput( Sentence const& in );
const FFState* EmptyHypothesisState(const InputType &) const {
- return new DummyState();
+ return new DummyState();
}
//TODO: This implements the old interface, but cannot be updated because
//it appears to be stateful
void Evaluate(const Hypothesis& cur_hypo,
- ScoreComponentCollection* accumulator) const;
+ ScoreComponentCollection* accumulator) const;
void EvaluateChart(const ChartHypothesis& /* cur_hypo */,
- int /* featureID */,
- ScoreComponentCollection* ) const {
+ int /* featureID */,
+ ScoreComponentCollection* ) const {
throw std::logic_error("GlobalLexicalModelUnlimited not supported in chart decoder, yet");
}
- void AddFeature(ScoreComponentCollection* accumulator,
- StringPiece sourceTrigger, StringPiece sourceWord, StringPiece targetTrigger,
- StringPiece targetWord) const;
+ void AddFeature(ScoreComponentCollection* accumulator,
+ StringPiece sourceTrigger, StringPiece sourceWord, StringPiece targetTrigger,
+ StringPiece targetWord) const;
};
diff --git a/moses/FF/InputFeature.cpp b/moses/FF/InputFeature.cpp
index 6dc60f94a..1ef394f9f 100644
--- a/moses/FF/InputFeature.cpp
+++ b/moses/FF/InputFeature.cpp
@@ -7,7 +7,7 @@ using namespace std;
namespace Moses
{
InputFeature::InputFeature(const std::string &line)
-:StatelessFeatureFunction("InputFeature", line)
+ :StatelessFeatureFunction("InputFeature", line)
{
}
@@ -17,19 +17,19 @@ const InputFeature &InputFeature::GetInputFeature()
static const InputFeature *staticObj = NULL;
if (staticObj) {
- return *staticObj;
+ return *staticObj;
}
// 1st time looking up the feature
const std::vector<const StatelessFeatureFunction*> &statefulFFs = StatelessFeatureFunction::GetStatelessFeatureFunctions();
for (size_t i = 0; i < statefulFFs.size(); ++i) {
- const StatelessFeatureFunction *ff = statefulFFs[i];
- const InputFeature *lm = dynamic_cast<const InputFeature*>(ff);
+ const StatelessFeatureFunction *ff = statefulFFs[i];
+ const InputFeature *lm = dynamic_cast<const InputFeature*>(ff);
- if (lm) {
- staticObj = lm;
- return *staticObj;
- }
+ if (lm) {
+ staticObj = lm;
+ return *staticObj;
+ }
}
throw std::logic_error("No input feature.");
diff --git a/moses/FF/PhraseBasedFeatureContext.cpp b/moses/FF/PhraseBasedFeatureContext.cpp
index 46e754801..4127a587c 100644
--- a/moses/FF/PhraseBasedFeatureContext.cpp
+++ b/moses/FF/PhraseBasedFeatureContext.cpp
@@ -11,7 +11,7 @@ PhraseBasedFeatureContext::PhraseBasedFeatureContext(const Hypothesis* hypothesi
m_source(m_hypothesis->GetManager().GetSource()) {}
PhraseBasedFeatureContext::PhraseBasedFeatureContext
- (const TranslationOption& translationOption, const InputType& source) :
+(const TranslationOption& translationOption, const InputType& source) :
m_hypothesis(NULL),
m_translationOption(translationOption),
m_source(source)
diff --git a/moses/FF/PhraseBasedFeatureContext.h b/moses/FF/PhraseBasedFeatureContext.h
index b2c7052f6..0c41712ca 100644
--- a/moses/FF/PhraseBasedFeatureContext.h
+++ b/moses/FF/PhraseBasedFeatureContext.h
@@ -17,7 +17,7 @@ class WordsBitmap;
**/
class PhraseBasedFeatureContext
{
- // The context either has a hypothesis (during search), or a TranslationOption and
+ // The context either has a hypothesis (during search), or a TranslationOption and
// source sentence (during pre-calculation).
const Hypothesis* m_hypothesis;
const TranslationOption& m_translationOption;
@@ -28,10 +28,12 @@ public:
PhraseBasedFeatureContext(const TranslationOption& translationOption,
const InputType& source);
- const TranslationOption& GetTranslationOption() const
- { return m_translationOption; }
- const InputType& GetSource() const
- { return m_source; }
+ const TranslationOption& GetTranslationOption() const {
+ return m_translationOption;
+ }
+ const InputType& GetSource() const {
+ return m_source;
+ }
const TargetPhrase& GetTargetPhrase() const; //convenience method
const WordsBitmap& GetWordsBitmap() const;
diff --git a/moses/FF/PhraseBoundaryFeature.cpp b/moses/FF/PhraseBoundaryFeature.cpp
index 671cc903e..ff73c760e 100644
--- a/moses/FF/PhraseBoundaryFeature.cpp
+++ b/moses/FF/PhraseBoundaryFeature.cpp
@@ -4,9 +4,10 @@
using namespace std;
-namespace Moses {
+namespace Moses
+{
-int PhraseBoundaryState::Compare(const FFState& other) const
+int PhraseBoundaryState::Compare(const FFState& other) const
{
const PhraseBoundaryState& rhs = dynamic_cast<const PhraseBoundaryState&>(other);
int tgt = Word::Compare(*m_targetWord,*(rhs.m_targetWord));
@@ -15,7 +16,7 @@ int PhraseBoundaryState::Compare(const FFState& other) const
}
PhraseBoundaryFeature::PhraseBoundaryFeature(const std::string &line)
-: StatefulFeatureFunction("PhraseBoundaryFeature", 0, line)
+ : StatefulFeatureFunction("PhraseBoundaryFeature", 0, line)
{
std::cerr << "Initializing source word deletion feature.." << std::endl;
@@ -24,17 +25,15 @@ PhraseBoundaryFeature::PhraseBoundaryFeature(const std::string &line)
if (args[0] == "source") {
m_sourceFactors = Tokenize<FactorType>(args[1], ",");
- }
- else if (args[0] == "target") {
+ } else if (args[0] == "target") {
m_targetFactors = Tokenize<FactorType>(args[1], ",");
- }
- else {
+ } else {
throw "Unknown argument " + args[0];
}
}
}
-const FFState* PhraseBoundaryFeature::EmptyHypothesisState(const InputType &) const
+const FFState* PhraseBoundaryFeature::EmptyHypothesisState(const InputType &) const
{
return new PhraseBoundaryState(NULL,NULL);
}
@@ -42,31 +41,32 @@ const FFState* PhraseBoundaryFeature::EmptyHypothesisState(const InputType &) co
void PhraseBoundaryFeature::AddFeatures(
const Word* leftWord, const Word* rightWord, const FactorList& factors, const string& side,
- ScoreComponentCollection* scores) const {
- for (size_t i = 0; i < factors.size(); ++i) {
- ostringstream name;
- name << side << ":";
- name << factors[i];
- name << ":";
- if (leftWord) {
- name << leftWord->GetFactor(factors[i])->GetString();
- } else {
- name << BOS_;
- }
- name << ":";
- if (rightWord) {
- name << rightWord->GetFactor(factors[i])->GetString();
- } else {
- name << EOS_;
- }
- scores->PlusEquals(this,name.str(),1);
+ ScoreComponentCollection* scores) const
+{
+ for (size_t i = 0; i < factors.size(); ++i) {
+ ostringstream name;
+ name << side << ":";
+ name << factors[i];
+ name << ":";
+ if (leftWord) {
+ name << leftWord->GetFactor(factors[i])->GetString();
+ } else {
+ name << BOS_;
+ }
+ name << ":";
+ if (rightWord) {
+ name << rightWord->GetFactor(factors[i])->GetString();
+ } else {
+ name << EOS_;
}
+ scores->PlusEquals(this,name.str(),1);
+ }
}
FFState* PhraseBoundaryFeature::Evaluate
- (const Hypothesis& cur_hypo, const FFState* prev_state,
- ScoreComponentCollection* scores) const
+(const Hypothesis& cur_hypo, const FFState* prev_state,
+ ScoreComponentCollection* scores) const
{
const PhraseBoundaryState* pbState = dynamic_cast<const PhraseBoundaryState*>(prev_state);
const Phrase& targetPhrase = cur_hypo.GetCurrTargetPhrase();
diff --git a/moses/FF/PhraseBoundaryFeature.h b/moses/FF/PhraseBoundaryFeature.h
index 34b12abf6..b06e66eea 100644
--- a/moses/FF/PhraseBoundaryFeature.h
+++ b/moses/FF/PhraseBoundaryFeature.h
@@ -12,12 +12,17 @@
namespace Moses
{
-class PhraseBoundaryState : public FFState {
+class PhraseBoundaryState : public FFState
+{
public:
PhraseBoundaryState(const Word* sourceWord, const Word* targetWord) :
- m_sourceWord(sourceWord), m_targetWord(targetWord) {}
- const Word* GetSourceWord() const {return m_sourceWord;}
- const Word* GetTargetWord() const {return m_targetWord;}
+ m_sourceWord(sourceWord), m_targetWord(targetWord) {}
+ const Word* GetSourceWord() const {
+ return m_sourceWord;
+ }
+ const Word* GetTargetWord() const {
+ return m_targetWord;
+ }
virtual int Compare(const FFState& other) const;
@@ -30,7 +35,8 @@ private:
/**
* Concatenations of factors on boundaries of phrases.
**/
-class PhraseBoundaryFeature : public StatefulFeatureFunction {
+class PhraseBoundaryFeature : public StatefulFeatureFunction
+{
public:
PhraseBoundaryFeature(const std::string &line);
@@ -39,7 +45,7 @@ public:
virtual const FFState* EmptyHypothesisState(const InputType &) const;
virtual FFState* Evaluate(const Hypothesis& cur_hypo, const FFState* prev_state,
- ScoreComponentCollection* accumulator) const;
+ ScoreComponentCollection* accumulator) const;
virtual FFState* EvaluateChart( const ChartHypothesis& /* cur_hypo */,
int /* featureID */,
@@ -49,7 +55,7 @@ public:
private:
void AddFeatures(
- const Word* leftWord, const Word* rightWord, const FactorList& factors,
+ const Word* leftWord, const Word* rightWord, const FactorList& factors,
const std::string& side, ScoreComponentCollection* scores) const ;
FactorList m_sourceFactors;
FactorList m_targetFactors;
diff --git a/moses/FF/PhraseLengthFeature.cpp b/moses/FF/PhraseLengthFeature.cpp
index b9e8e9e1d..2efeb07d2 100644
--- a/moses/FF/PhraseLengthFeature.cpp
+++ b/moses/FF/PhraseLengthFeature.cpp
@@ -4,20 +4,21 @@
#include "moses/ScoreComponentCollection.h"
#include "moses/TranslationOption.h"
-namespace Moses {
+namespace Moses
+{
using namespace std;
PhraseLengthFeature::PhraseLengthFeature(const std::string &line)
-:StatelessFeatureFunction("PhraseLengthFeature", 0, line)
+ :StatelessFeatureFunction("PhraseLengthFeature", 0, line)
{
}
void PhraseLengthFeature::Evaluate(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const
{
// get length of source and target phrase
size_t targetLength = targetPhrase.GetSize();
diff --git a/moses/FF/PhraseLengthFeature.h b/moses/FF/PhraseLengthFeature.h
index 327865558..23c168417 100644
--- a/moses/FF/PhraseLengthFeature.h
+++ b/moses/FF/PhraseLengthFeature.h
@@ -15,7 +15,8 @@ namespace Moses
/** Sets the features for length of source phrase, target phrase, both.
*/
-class PhraseLengthFeature : public StatelessFeatureFunction {
+class PhraseLengthFeature : public StatelessFeatureFunction
+{
public:
PhraseLengthFeature(const std::string &line);
@@ -25,9 +26,9 @@ public:
}
virtual void Evaluate(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const;
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const;
};
diff --git a/moses/FF/PhrasePairFeature.cpp b/moses/FF/PhrasePairFeature.cpp
index 58f71271f..9fce7ff4e 100644
--- a/moses/FF/PhrasePairFeature.cpp
+++ b/moses/FF/PhrasePairFeature.cpp
@@ -9,10 +9,11 @@
using namespace std;
-namespace Moses {
+namespace Moses
+{
PhrasePairFeature::PhrasePairFeature(const std::string &line)
-:StatelessFeatureFunction("PhrasePairFeature", 0, line)
+ :StatelessFeatureFunction("PhrasePairFeature", 0, line)
{
std::cerr << "Initializing PhrasePairFeature.." << std::endl;
@@ -44,47 +45,44 @@ PhrasePairFeature::PhrasePairFeature(const std::string &line)
Load(filePathSource);
}
-bool PhrasePairFeature::Load(const std::string &filePathSource/*, const std::string &filePathTarget*/)
+bool PhrasePairFeature::Load(const std::string &filePathSource/*, const std::string &filePathTarget*/)
{
if (m_domainTrigger) {
// domain trigger terms for each input document
ifstream inFileSource(filePathSource.c_str());
- if (!inFileSource)
- {
- cerr << "could not open file " << filePathSource << endl;
- return false;
- }
-
+ if (!inFileSource) {
+ cerr << "could not open file " << filePathSource << endl;
+ return false;
+ }
+
std::string line;
while (getline(inFileSource, line)) {
std::set<std::string> terms;
vector<string> termVector;
boost::split(termVector, line, boost::is_any_of("\t "));
- for (size_t i=0; i < termVector.size(); ++i)
+ for (size_t i=0; i < termVector.size(); ++i)
terms.insert(termVector[i]);
-
+
// add term set for current document
m_vocabDomain.push_back(terms);
}
-
+
inFileSource.close();
- }
- else {
+ } else {
// restricted source word vocabulary
ifstream inFileSource(filePathSource.c_str());
- if (!inFileSource)
- {
- cerr << "could not open file " << filePathSource << endl;
- return false;
- }
-
+ if (!inFileSource) {
+ cerr << "could not open file " << filePathSource << endl;
+ return false;
+ }
+
std::string line;
while (getline(inFileSource, line)) {
m_vocabSource.insert(line);
}
-
+
inFileSource.close();
-
+
/* // restricted target word vocabulary
ifstream inFileTarget(filePathTarget.c_str());
if (!inFileTarget)
@@ -105,11 +103,11 @@ bool PhrasePairFeature::Load(const std::string &filePathSource/*, const std::str
}
void PhrasePairFeature::Evaluate(
- const PhraseBasedFeatureContext& context,
- ScoreComponentCollection* accumulator) const
+ const PhraseBasedFeatureContext& context,
+ ScoreComponentCollection* accumulator) const
{
const TargetPhrase& target = context.GetTargetPhrase();
- const Phrase& source = *(context.GetTranslationOption().GetSourcePhrase());
+ const Phrase& source = *(context.GetTranslationOption().GetSourcePhrase());
if (m_simple) {
ostringstream namestr;
namestr << "pp_";
@@ -126,11 +124,11 @@ void PhrasePairFeature::Evaluate(
namestr << ",";
namestr << targetFactor->GetString();
}
-
+
accumulator->SparsePlusEquals(namestr.str(),1);
}
if (m_domainTrigger) {
- const Sentence& input = static_cast<const Sentence&>(context.GetSource());
+ const Sentence& input = static_cast<const Sentence&>(context.GetSource());
const bool use_topicid = input.GetUseTopicId();
const bool use_topicid_prob = input.GetUseTopicIdAndProb();
@@ -149,95 +147,92 @@ void PhrasePairFeature::Evaluate(
pair << ",";
pair << targetFactor->GetString();
}
-
+
if (use_topicid || use_topicid_prob) {
if(use_topicid) {
- // use topicid as trigger
- const long topicid = input.GetTopicId();
- stringstream feature;
- feature << "pp_";
- if (topicid == -1)
- feature << "unk";
- else
- feature << topicid;
-
- feature << "_";
- feature << pair.str();
- accumulator->SparsePlusEquals(feature.str(), 1);
+ // use topicid as trigger
+ const long topicid = input.GetTopicId();
+ stringstream feature;
+ feature << "pp_";
+ if (topicid == -1)
+ feature << "unk";
+ else
+ feature << topicid;
+
+ feature << "_";
+ feature << pair.str();
+ accumulator->SparsePlusEquals(feature.str(), 1);
+ } else {
+ // use topic probabilities
+ const vector<string> &topicid_prob = *(input.GetTopicIdAndProb());
+ if (atol(topicid_prob[0].c_str()) == -1) {
+ stringstream feature;
+ feature << "pp_unk_";
+ feature << pair.str();
+ accumulator->SparsePlusEquals(feature.str(), 1);
+ } else {
+ for (size_t i=0; i+1 < topicid_prob.size(); i+=2) {
+ stringstream feature;
+ feature << "pp_";
+ feature << topicid_prob[i];
+ feature << "_";
+ feature << pair.str();
+ accumulator->SparsePlusEquals(feature.str(), atof((topicid_prob[i+1]).c_str()));
+ }
+ }
}
- else {
- // use topic probabilities
- const vector<string> &topicid_prob = *(input.GetTopicIdAndProb());
- if (atol(topicid_prob[0].c_str()) == -1) {
- stringstream feature;
- feature << "pp_unk_";
- feature << pair.str();
- accumulator->SparsePlusEquals(feature.str(), 1);
- }
- else {
- for (size_t i=0; i+1 < topicid_prob.size(); i+=2) {
- stringstream feature;
- feature << "pp_";
- feature << topicid_prob[i];
- feature << "_";
- feature << pair.str();
- accumulator->SparsePlusEquals(feature.str(), atof((topicid_prob[i+1]).c_str()));
- }
- }
- }
- }
- else {
+ } else {
// range over domain trigger words
const long docid = input.GetDocumentId();
for (set<string>::const_iterator p = m_vocabDomain[docid].begin(); p != m_vocabDomain[docid].end(); ++p) {
- string sourceTrigger = *p;
- ostringstream namestr;
- namestr << "pp_";
- namestr << sourceTrigger;
- namestr << "_";
- namestr << pair.str();
- accumulator->SparsePlusEquals(namestr.str(),1);
+ string sourceTrigger = *p;
+ ostringstream namestr;
+ namestr << "pp_";
+ namestr << sourceTrigger;
+ namestr << "_";
+ namestr << pair.str();
+ accumulator->SparsePlusEquals(namestr.str(),1);
}
}
}
if (m_sourceContext) {
const Sentence& input = static_cast<const Sentence&>(context.GetSource());
-
+
// range over source words to get context
for(size_t contextIndex = 0; contextIndex < input.GetSize(); contextIndex++ ) {
StringPiece sourceTrigger = input.GetWord(contextIndex).GetFactor(m_sourceFactorId)->GetString();
if (m_ignorePunctuation) {
- // check if trigger is punctuation
- char firstChar = sourceTrigger[0];
- CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
- if(charIterator != m_punctuationHash.end())
- continue;
+ // check if trigger is punctuation
+ char firstChar = sourceTrigger[0];
+ CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
+ if(charIterator != m_punctuationHash.end())
+ continue;
}
-
+
bool sourceTriggerExists = false;
if (!m_unrestricted)
- sourceTriggerExists = FindStringPiece(m_vocabSource, sourceTrigger ) != m_vocabSource.end();
-
+ sourceTriggerExists = FindStringPiece(m_vocabSource, sourceTrigger ) != m_vocabSource.end();
+
if (m_unrestricted || sourceTriggerExists) {
- ostringstream namestr;
- namestr << "pp_";
- namestr << sourceTrigger;
- namestr << "~";
- namestr << source.GetWord(0).GetFactor(m_sourceFactorId)->GetString();
- for (size_t i = 1; i < source.GetSize(); ++i) {
- const Factor* sourceFactor = source.GetWord(i).GetFactor(m_sourceFactorId);
- namestr << ",";
- namestr << sourceFactor->GetString();
- }
- namestr << "~";
- namestr << target.GetWord(0).GetFactor(m_targetFactorId)->GetString();
- for (size_t i = 1; i < target.GetSize(); ++i) {
- const Factor* targetFactor = target.GetWord(i).GetFactor(m_targetFactorId);
- namestr << ",";
- namestr << targetFactor->GetString();
- }
-
- accumulator->SparsePlusEquals(namestr.str(),1);
+ ostringstream namestr;
+ namestr << "pp_";
+ namestr << sourceTrigger;
+ namestr << "~";
+ namestr << source.GetWord(0).GetFactor(m_sourceFactorId)->GetString();
+ for (size_t i = 1; i < source.GetSize(); ++i) {
+ const Factor* sourceFactor = source.GetWord(i).GetFactor(m_sourceFactorId);
+ namestr << ",";
+ namestr << sourceFactor->GetString();
+ }
+ namestr << "~";
+ namestr << target.GetWord(0).GetFactor(m_targetFactorId)->GetString();
+ for (size_t i = 1; i < target.GetSize(); ++i) {
+ const Factor* targetFactor = target.GetWord(i).GetFactor(m_targetFactorId);
+ namestr << ",";
+ namestr << targetFactor->GetString();
+ }
+
+ accumulator->SparsePlusEquals(namestr.str(),1);
}
}
}
diff --git a/moses/FF/PhrasePairFeature.h b/moses/FF/PhrasePairFeature.h
index e895110f8..d7aa80be7 100644
--- a/moses/FF/PhrasePairFeature.h
+++ b/moses/FF/PhrasePairFeature.h
@@ -8,39 +8,41 @@
#include "moses/Factor.h"
#include "moses/Sentence.h"
-namespace Moses {
+namespace Moses
+{
/**
* Phrase pair feature: complete source/target phrase pair
**/
-class PhrasePairFeature: public StatelessFeatureFunction {
-
- typedef std::map< char, short > CharHash;
- typedef std::vector< std::set<std::string> > DocumentVector;
-
- boost::unordered_set<std::string> m_vocabSource;
- DocumentVector m_vocabDomain;
- FactorType m_sourceFactorId;
- FactorType m_targetFactorId;
- bool m_unrestricted;
- bool m_simple;
- bool m_sourceContext;
- bool m_domainTrigger;
- bool m_ignorePunctuation;
- CharHash m_punctuationHash;
-
- public:
- PhrasePairFeature(const std::string &line);
-
- void Evaluate(const PhraseBasedFeatureContext& context,
- ScoreComponentCollection* accumulator) const;
-
- void EvaluateChart(const ChartBasedFeatureContext& context,
- ScoreComponentCollection*) const {
- throw std::logic_error("PhrasePairFeature not valid in chart decoder");
- }
-
- bool Load(const std::string &filePathSource/*, const std::string &filePathTarget*/);
+class PhrasePairFeature: public StatelessFeatureFunction
+{
+
+ typedef std::map< char, short > CharHash;
+ typedef std::vector< std::set<std::string> > DocumentVector;
+
+ boost::unordered_set<std::string> m_vocabSource;
+ DocumentVector m_vocabDomain;
+ FactorType m_sourceFactorId;
+ FactorType m_targetFactorId;
+ bool m_unrestricted;
+ bool m_simple;
+ bool m_sourceContext;
+ bool m_domainTrigger;
+ bool m_ignorePunctuation;
+ CharHash m_punctuationHash;
+
+public:
+ PhrasePairFeature(const std::string &line);
+
+ void Evaluate(const PhraseBasedFeatureContext& context,
+ ScoreComponentCollection* accumulator) const;
+
+ void EvaluateChart(const ChartBasedFeatureContext& context,
+ ScoreComponentCollection*) const {
+ throw std::logic_error("PhrasePairFeature not valid in chart decoder");
+ }
+
+ bool Load(const std::string &filePathSource/*, const std::string &filePathTarget*/);
};
diff --git a/moses/FF/SourceWordDeletionFeature.cpp b/moses/FF/SourceWordDeletionFeature.cpp
index 085dbbeea..693812105 100644
--- a/moses/FF/SourceWordDeletionFeature.cpp
+++ b/moses/FF/SourceWordDeletionFeature.cpp
@@ -11,13 +11,14 @@
#include "util/string_piece_hash.hh"
-namespace Moses {
+namespace Moses
+{
using namespace std;
SourceWordDeletionFeature::SourceWordDeletionFeature(const std::string &line)
-:StatelessFeatureFunction("SourceWordDeletionFeature", 0, line),
-m_unrestricted(true)
+ :StatelessFeatureFunction("SourceWordDeletionFeature", 0, line),
+ m_unrestricted(true)
{
std::cerr << "Initializing source word deletion feature.." << std::endl;
@@ -27,11 +28,9 @@ m_unrestricted(true)
if (args[0] == "factor") {
m_factorType = Scan<FactorType>(args[1]);
- }
- else if (args[0] == "path") {
+ } else if (args[0] == "path") {
filename = args[1];
- }
- else {
+ } else {
throw "Unknown argument " + args[0];
}
}
@@ -40,19 +39,18 @@ m_unrestricted(true)
if (filename != "") {
cerr << "loading source word deletion word list from " << filename << endl;
if (!Load(filename)) {
- UserMessage::Add("Unable to load word list for source word deletion feature from file " + filename);
- //return false;
+ UserMessage::Add("Unable to load word list for source word deletion feature from file " + filename);
+ //return false;
}
}
}
-bool SourceWordDeletionFeature::Load(const std::string &filePath)
+bool SourceWordDeletionFeature::Load(const std::string &filePath)
{
ifstream inFile(filePath.c_str());
- if (!inFile)
- {
- cerr << "could not open file " << filePath << endl;
- return false;
+ if (!inFile) {
+ cerr << "could not open file " << filePath << endl;
+ return false;
}
std::string line;
@@ -67,23 +65,23 @@ bool SourceWordDeletionFeature::Load(const std::string &filePath)
}
void SourceWordDeletionFeature::Evaluate(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const
{
const AlignmentInfo &alignmentInfo = targetPhrase.GetAlignTerm();
ComputeFeatures(source, targetPhrase, &scoreBreakdown, alignmentInfo);
}
void SourceWordDeletionFeature::ComputeFeatures(const Phrase &source,
- const TargetPhrase& targetPhrase,
- ScoreComponentCollection* accumulator,
- const AlignmentInfo &alignmentInfo) const
+ const TargetPhrase& targetPhrase,
+ ScoreComponentCollection* accumulator,
+ const AlignmentInfo &alignmentInfo) const
{
// handle special case: unknown words (they have no word alignment)
- size_t targetLength = targetPhrase.GetSize();
- size_t sourceLength = source.GetSize();
- if (targetLength == 1 && sourceLength == 1 && !alignmentInfo.GetSize()) return;
+ size_t targetLength = targetPhrase.GetSize();
+ size_t sourceLength = source.GetSize();
+ if (targetLength == 1 && sourceLength == 1 && !alignmentInfo.GetSize()) return;
// flag aligned words
bool aligned[16];
@@ -92,22 +90,21 @@ void SourceWordDeletionFeature::ComputeFeatures(const Phrase &source,
aligned[i] = false;
for (AlignmentInfo::const_iterator alignmentPoint = alignmentInfo.begin(); alignmentPoint != alignmentInfo.end(); alignmentPoint++)
aligned[ alignmentPoint->first ] = true;
-
+
// process unaligned source words
for(size_t i=0; i<sourceLength; i++) {
if (!aligned[i]) {
- const Word &w = source.GetWord(i);
- if (!w.IsNonTerminal()) {
- const StringPiece word = w.GetFactor(m_factorType)->GetString();
- if (word != "<s>" && word != "</s>") {
- if (!m_unrestricted && FindStringPiece(m_vocab, word ) == m_vocab.end()) {
- accumulator->PlusEquals(this, StringPiece("OTHER"),1);
- }
- else {
- accumulator->PlusEquals(this,word,1);
- }
- }
- }
+ const Word &w = source.GetWord(i);
+ if (!w.IsNonTerminal()) {
+ const StringPiece word = w.GetFactor(m_factorType)->GetString();
+ if (word != "<s>" && word != "</s>") {
+ if (!m_unrestricted && FindStringPiece(m_vocab, word ) == m_vocab.end()) {
+ accumulator->PlusEquals(this, StringPiece("OTHER"),1);
+ } else {
+ accumulator->PlusEquals(this,word,1);
+ }
+ }
+ }
}
}
}
diff --git a/moses/FF/SourceWordDeletionFeature.h b/moses/FF/SourceWordDeletionFeature.h
index 1bf6323be..7a25ee6e1 100644
--- a/moses/FF/SourceWordDeletionFeature.h
+++ b/moses/FF/SourceWordDeletionFeature.h
@@ -13,7 +13,8 @@ namespace Moses
/** Sets the features for source word deletion
*/
-class SourceWordDeletionFeature : public StatelessFeatureFunction {
+class SourceWordDeletionFeature : public StatelessFeatureFunction
+{
private:
boost::unordered_set<std::string> m_vocab;
FactorType m_factorType;
@@ -21,18 +22,18 @@ private:
public:
SourceWordDeletionFeature(const std::string &line);
-
+
bool Load(const std::string &filePath);
virtual void Evaluate(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const;
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const;
void ComputeFeatures(const Phrase &source,
- const TargetPhrase& targetPhrase,
- ScoreComponentCollection* accumulator,
- const AlignmentInfo &alignmentInfo) const;
+ const TargetPhrase& targetPhrase,
+ ScoreComponentCollection* accumulator,
+ const AlignmentInfo &alignmentInfo) const;
};
}
diff --git a/moses/FF/StatefulFeatureFunction.cpp b/moses/FF/StatefulFeatureFunction.cpp
index a97846311..0aeeed62c 100644
--- a/moses/FF/StatefulFeatureFunction.cpp
+++ b/moses/FF/StatefulFeatureFunction.cpp
@@ -4,13 +4,13 @@ namespace Moses
{
StatefulFeatureFunction::StatefulFeatureFunction(const std::string& description, const std::string &line)
-: FeatureFunction(description, line)
+ : FeatureFunction(description, line)
{
m_statefulFFs.push_back(this);
}
StatefulFeatureFunction::StatefulFeatureFunction(const std::string& description, size_t numScoreComponents, const std::string &line)
-: FeatureFunction(description,numScoreComponents, line)
+ : FeatureFunction(description,numScoreComponents, line)
{
m_statefulFFs.push_back(this);
}
diff --git a/moses/FF/StatefulFeatureFunction.h b/moses/FF/StatefulFeatureFunction.h
index d2721d4ae..fc5cd4faf 100644
--- a/moses/FF/StatefulFeatureFunction.h
+++ b/moses/FF/StatefulFeatureFunction.h
@@ -6,7 +6,7 @@ namespace Moses
{
/** base class for all stateful feature functions.
- * eg. LM, distortion penalty
+ * eg. LM, distortion penalty
*/
class StatefulFeatureFunction: public FeatureFunction
{
@@ -14,7 +14,9 @@ class StatefulFeatureFunction: public FeatureFunction
static std::vector<const StatefulFeatureFunction*> m_statefulFFs;
public:
- static const std::vector<const StatefulFeatureFunction*>& GetStatefulFeatureFunctions() {return m_statefulFFs;}
+ static const std::vector<const StatefulFeatureFunction*>& GetStatefulFeatureFunctions() {
+ return m_statefulFFs;
+ }
StatefulFeatureFunction(const std::string& description, const std::string &line);
StatefulFeatureFunction(const std::string& description, size_t numScoreComponents, const std::string &line);
@@ -39,8 +41,9 @@ public:
//! return the state associated with the empty hypothesis for a given sentence
virtual const FFState* EmptyHypothesisState(const InputType &input) const = 0;
- bool IsStateless() const
- { return false; }
+ bool IsStateless() const {
+ return false;
+ }
};
diff --git a/moses/FF/StatelessFeatureFunction.cpp b/moses/FF/StatelessFeatureFunction.cpp
index 1c5e604de..278a90c54 100644
--- a/moses/FF/StatelessFeatureFunction.cpp
+++ b/moses/FF/StatelessFeatureFunction.cpp
@@ -4,13 +4,13 @@ namespace Moses
{
StatelessFeatureFunction::StatelessFeatureFunction(const std::string& description, const std::string &line)
-:FeatureFunction(description, line)
+ :FeatureFunction(description, line)
{
m_statelessFFs.push_back(this);
}
StatelessFeatureFunction::StatelessFeatureFunction(const std::string& description, size_t numScoreComponents, const std::string &line)
-:FeatureFunction(description, numScoreComponents, line)
+ :FeatureFunction(description, numScoreComponents, line)
{
m_statelessFFs.push_back(this);
}
diff --git a/moses/FF/StatelessFeatureFunction.h b/moses/FF/StatelessFeatureFunction.h
index d8db7f514..3f120a1de 100644
--- a/moses/FF/StatelessFeatureFunction.h
+++ b/moses/FF/StatelessFeatureFunction.h
@@ -14,7 +14,9 @@ class StatelessFeatureFunction: public FeatureFunction
static std::vector<const StatelessFeatureFunction*> m_statelessFFs;
public:
- static const std::vector<const StatelessFeatureFunction*>& GetStatelessFeatureFunctions() {return m_statelessFFs;}
+ static const std::vector<const StatelessFeatureFunction*>& GetStatelessFeatureFunctions() {
+ return m_statelessFFs;
+ }
StatelessFeatureFunction(const std::string& description, const std::string &line);
StatelessFeatureFunction(const std::string& description, size_t numScoreComponents, const std::string &line);
@@ -22,7 +24,7 @@ public:
* This should be implemented for features that apply to phrase-based models.
**/
virtual void Evaluate(const PhraseBasedFeatureContext& context,
- ScoreComponentCollection* accumulator) const
+ ScoreComponentCollection* accumulator) const
{}
/**
@@ -32,8 +34,9 @@ public:
ScoreComponentCollection* accumulator) const
{}
- virtual bool IsStateless() const
- { return true; }
+ virtual bool IsStateless() const {
+ return true;
+ }
};
diff --git a/moses/FF/TargetBigramFeature.cpp b/moses/FF/TargetBigramFeature.cpp
index 441cf9e15..fc30a737f 100644
--- a/moses/FF/TargetBigramFeature.cpp
+++ b/moses/FF/TargetBigramFeature.cpp
@@ -7,15 +7,17 @@
using namespace std;
-namespace Moses {
+namespace Moses
+{
-int TargetBigramState::Compare(const FFState& other) const {
+int TargetBigramState::Compare(const FFState& other) const
+{
const TargetBigramState& rhs = dynamic_cast<const TargetBigramState&>(other);
return Word::Compare(m_word,rhs.m_word);
}
TargetBigramFeature::TargetBigramFeature(const std::string &line)
-:StatefulFeatureFunction("TargetBigramFeature", 0, line)
+ :StatefulFeatureFunction("TargetBigramFeature", 0, line)
{
std::cerr << "Initializing target bigram feature.." << std::endl;
@@ -27,7 +29,7 @@ TargetBigramFeature::TargetBigramFeature(const std::string &line)
FactorCollection& factorCollection = FactorCollection::Instance();
const Factor* bosFactor =
- factorCollection.AddFactor(Output,m_factorType,BOS_);
+ factorCollection.AddFactor(Output,m_factorType,BOS_);
m_bos.SetFactor(m_factorType,bosFactor);
const string &filePath = tokens[2];
@@ -35,13 +37,12 @@ TargetBigramFeature::TargetBigramFeature(const std::string &line)
}
-bool TargetBigramFeature::Load(const std::string &filePath)
+bool TargetBigramFeature::Load(const std::string &filePath)
{
if (filePath == "*") return true; //allow all
ifstream inFile(filePath.c_str());
- if (!inFile)
- {
- return false;
+ if (!inFile) {
+ return false;
}
std::string line;
@@ -87,7 +88,7 @@ FFState* TargetBigramFeature::Evaluate(const Hypothesis& cur_hypo,
const StringPiece w2 = f2->GetString();
// skip bigrams if they don't belong to a given restricted vocabulary
- if (m_vocab.size() &&
+ if (m_vocab.size() &&
(FindStringPiece(m_vocab, w1) == m_vocab.end() || FindStringPiece(m_vocab, w2) == m_vocab.end())) {
continue;
}
diff --git a/moses/FF/TargetBigramFeature.h b/moses/FF/TargetBigramFeature.h
index f514f2405..e29eace14 100644
--- a/moses/FF/TargetBigramFeature.h
+++ b/moses/FF/TargetBigramFeature.h
@@ -13,35 +13,38 @@
namespace Moses
{
-class TargetBigramState : public FFState {
- public:
- TargetBigramState(const Word& word): m_word(word) {}
- const Word& GetWord() const {return m_word;}
- virtual int Compare(const FFState& other) const;
-
- private:
- Word m_word;
+class TargetBigramState : public FFState
+{
+public:
+ TargetBigramState(const Word& word): m_word(word) {}
+ const Word& GetWord() const {
+ return m_word;
+ }
+ virtual int Compare(const FFState& other) const;
+
+private:
+ Word m_word;
};
/** Sets the features of observed bigrams.
*/
-class TargetBigramFeature : public StatefulFeatureFunction {
+class TargetBigramFeature : public StatefulFeatureFunction
+{
public:
- TargetBigramFeature(const std::string &line);
+ TargetBigramFeature(const std::string &line);
- bool Load(const std::string &filePath);
+ bool Load(const std::string &filePath);
- virtual const FFState* EmptyHypothesisState(const InputType &input) const;
+ virtual const FFState* EmptyHypothesisState(const InputType &input) const;
- virtual FFState* Evaluate(const Hypothesis& cur_hypo, const FFState* prev_state,
- ScoreComponentCollection* accumulator) const;
+ virtual FFState* Evaluate(const Hypothesis& cur_hypo, const FFState* prev_state,
+ ScoreComponentCollection* accumulator) const;
virtual FFState* EvaluateChart( const ChartHypothesis& /* cur_hypo */,
int /* featureID */,
- ScoreComponentCollection* ) const
- {
- abort();
- }
+ ScoreComponentCollection* ) const {
+ abort();
+ }
private:
FactorType m_factorType;
diff --git a/moses/FF/TargetNgramFeature.cpp b/moses/FF/TargetNgramFeature.cpp
index 174fcfa1a..3c36aef0e 100644
--- a/moses/FF/TargetNgramFeature.cpp
+++ b/moses/FF/TargetNgramFeature.cpp
@@ -7,38 +7,38 @@
#include "util/string_piece_hash.hh"
-namespace Moses {
+namespace Moses
+{
using namespace std;
-int TargetNgramState::Compare(const FFState& other) const {
+int TargetNgramState::Compare(const FFState& other) const
+{
const TargetNgramState& rhs = dynamic_cast<const TargetNgramState&>(other);
int result;
if (m_words.size() == rhs.m_words.size()) {
- for (size_t i = 0; i < m_words.size(); ++i) {
- result = Word::Compare(m_words[i],rhs.m_words[i]);
- if (result != 0) return result;
- }
+ for (size_t i = 0; i < m_words.size(); ++i) {
+ result = Word::Compare(m_words[i],rhs.m_words[i]);
+ if (result != 0) return result;
+ }
return 0;
- }
- else if (m_words.size() < rhs.m_words.size()) {
- for (size_t i = 0; i < m_words.size(); ++i) {
- result = Word::Compare(m_words[i],rhs.m_words[i]);
- if (result != 0) return result;
- }
- return -1;
- }
- else {
- for (size_t i = 0; i < rhs.m_words.size(); ++i) {
- result = Word::Compare(m_words[i],rhs.m_words[i]);
- if (result != 0) return result;
- }
- return 1;
+ } else if (m_words.size() < rhs.m_words.size()) {
+ for (size_t i = 0; i < m_words.size(); ++i) {
+ result = Word::Compare(m_words[i],rhs.m_words[i]);
+ if (result != 0) return result;
+ }
+ return -1;
+ } else {
+ for (size_t i = 0; i < rhs.m_words.size(); ++i) {
+ result = Word::Compare(m_words[i],rhs.m_words[i]);
+ if (result != 0) return result;
+ }
+ return 1;
}
}
TargetNgramFeature::TargetNgramFeature(const std::string &line)
-:StatefulFeatureFunction("TargetNgramFeature", 0, line)
+ :StatefulFeatureFunction("TargetNgramFeature", 0, line)
{
std::cerr << "Initializing target ngram feature.." << std::endl;
@@ -56,9 +56,8 @@ bool TargetNgramFeature::Load(const std::string &filePath)
{
if (filePath == "*") return true; //allow all
ifstream inFile(filePath.c_str());
- if (!inFile)
- {
- return false;
+ if (!inFile) {
+ return false;
}
std::string line;
@@ -74,13 +73,13 @@ bool TargetNgramFeature::Load(const std::string &filePath)
const FFState* TargetNgramFeature::EmptyHypothesisState(const InputType &/*input*/) const
{
- vector<Word> bos(1,m_bos);
+ vector<Word> bos(1,m_bos);
return new TargetNgramState(bos);
}
FFState* TargetNgramFeature::Evaluate(const Hypothesis& cur_hypo,
- const FFState* prev_state,
- ScoreComponentCollection* accumulator) const
+ const FFState* prev_state,
+ ScoreComponentCollection* accumulator) const
{
const TargetNgramState* tnState = static_cast<const TargetNgramState*>(prev_state);
assert(tnState);
@@ -99,92 +98,92 @@ FFState* TargetNgramFeature::Evaluate(const Hypothesis& cur_hypo,
if (m_lower_ngrams) smallest_n = 1;
for (size_t n = m_n; n >= smallest_n; --n) { // iterate over ngram size
- for (size_t i = 0; i < targetPhrase.GetSize(); ++i) {
+ for (size_t i = 0; i < targetPhrase.GetSize(); ++i) {
// const string& curr_w = targetPhrase.GetWord(i).GetFactor(m_factorType)->GetString();
- const StringPiece& curr_w = targetPhrase.GetWord(i).GetString(m_factorType);
-
- if (m_vocab.size() && (FindStringPiece(m_vocab, curr_w) == m_vocab.end())) continue; // skip ngrams
-
- if (n > 1) {
- // can we build an ngram at this position? ("<s> this" --> cannot build 3gram at this position)
- size_t pos_in_translation = cur_hypo.GetSize() - targetPhrase.GetSize() + i;
- if (pos_in_translation < n - 2) continue; // need at least m_n - 1 words
-
- // how many words needed from previous state?
- int from_prev_state = n - (i+1);
- skip = false;
- if (from_prev_state > 0) {
- if (prev_words.size() < from_prev_state) {
- // context is too short, make new state from previous state and target phrase
- vector<Word> new_prev_words;
- for (size_t i = 0; i < prev_words.size(); ++i)
- new_prev_words.push_back(prev_words[i]);
- for (size_t i = 0; i < targetPhrase.GetSize(); ++i)
- new_prev_words.push_back(targetPhrase.GetWord(i));
- return new TargetNgramState(new_prev_words);
- }
-
- // add words from previous state
- for (size_t j = prev_words.size()-from_prev_state; j < prev_words.size() && !skip; ++j)
- appendNgram(prev_words[j], skip, curr_ngram);
+ const StringPiece& curr_w = targetPhrase.GetWord(i).GetString(m_factorType);
+
+ if (m_vocab.size() && (FindStringPiece(m_vocab, curr_w) == m_vocab.end())) continue; // skip ngrams
+
+ if (n > 1) {
+ // can we build an ngram at this position? ("<s> this" --> cannot build 3gram at this position)
+ size_t pos_in_translation = cur_hypo.GetSize() - targetPhrase.GetSize() + i;
+ if (pos_in_translation < n - 2) continue; // need at least m_n - 1 words
+
+ // how many words needed from previous state?
+ int from_prev_state = n - (i+1);
+ skip = false;
+ if (from_prev_state > 0) {
+ if (prev_words.size() < from_prev_state) {
+ // context is too short, make new state from previous state and target phrase
+ vector<Word> new_prev_words;
+ for (size_t i = 0; i < prev_words.size(); ++i)
+ new_prev_words.push_back(prev_words[i]);
+ for (size_t i = 0; i < targetPhrase.GetSize(); ++i)
+ new_prev_words.push_back(targetPhrase.GetWord(i));
+ return new TargetNgramState(new_prev_words);
+ }
+
+ // add words from previous state
+ for (size_t j = prev_words.size()-from_prev_state; j < prev_words.size() && !skip; ++j)
+ appendNgram(prev_words[j], skip, curr_ngram);
}
- // add words from current target phrase
- int start = i - n + 1; // add m_n-1 previous words
- if (start < 0) start = 0; // or less
- for (size_t j = start; j < i && !skip; ++j)
- appendNgram(targetPhrase.GetWord(j), skip, curr_ngram);
+ // add words from current target phrase
+ int start = i - n + 1; // add m_n-1 previous words
+ if (start < 0) start = 0; // or less
+ for (size_t j = start; j < i && !skip; ++j)
+ appendNgram(targetPhrase.GetWord(j), skip, curr_ngram);
}
- if (!skip) {
- curr_ngram << curr_w;
- accumulator->PlusEquals(this,curr_ngram.str(),1);
+ if (!skip) {
+ curr_ngram << curr_w;
+ accumulator->PlusEquals(this,curr_ngram.str(),1);
}
- curr_ngram.str("");
- }
+ curr_ngram.str("");
+ }
}
if (cur_hypo.GetWordsBitmap().IsComplete()) {
- for (size_t n = m_n; n >= smallest_n; --n) {
- stringstream last_ngram;
- skip = false;
- for (size_t i = cur_hypo.GetSize() - n + 1; i < cur_hypo.GetSize() && !skip; ++i)
- appendNgram(cur_hypo.GetWord(i), skip, last_ngram);
-
- if (n > 1 && !skip) {
- last_ngram << EOS_;
- accumulator->PlusEquals(this, last_ngram.str(), 1);
- }
- }
- return NULL;
+ for (size_t n = m_n; n >= smallest_n; --n) {
+ stringstream last_ngram;
+ skip = false;
+ for (size_t i = cur_hypo.GetSize() - n + 1; i < cur_hypo.GetSize() && !skip; ++i)
+ appendNgram(cur_hypo.GetWord(i), skip, last_ngram);
+
+ if (n > 1 && !skip) {
+ last_ngram << EOS_;
+ accumulator->PlusEquals(this, last_ngram.str(), 1);
+ }
+ }
+ return NULL;
}
// prepare new state
vector<Word> new_prev_words;
if (targetPhrase.GetSize() >= m_n-1) {
- // take subset of target words
- for (size_t i = targetPhrase.GetSize() - m_n + 1; i < targetPhrase.GetSize(); ++i)
- new_prev_words.push_back(targetPhrase.GetWord(i));
- }
- else {
- // take words from previous state and from target phrase
- int from_prev_state = m_n - 1 - targetPhrase.GetSize();
- for (size_t i = prev_words.size()-from_prev_state; i < prev_words.size(); ++i)
- new_prev_words.push_back(prev_words[i]);
- for (size_t i = 0; i < targetPhrase.GetSize(); ++i)
- new_prev_words.push_back(targetPhrase.GetWord(i));
+ // take subset of target words
+ for (size_t i = targetPhrase.GetSize() - m_n + 1; i < targetPhrase.GetSize(); ++i)
+ new_prev_words.push_back(targetPhrase.GetWord(i));
+ } else {
+ // take words from previous state and from target phrase
+ int from_prev_state = m_n - 1 - targetPhrase.GetSize();
+ for (size_t i = prev_words.size()-from_prev_state; i < prev_words.size(); ++i)
+ new_prev_words.push_back(prev_words[i]);
+ for (size_t i = 0; i < targetPhrase.GetSize(); ++i)
+ new_prev_words.push_back(targetPhrase.GetWord(i));
}
return new TargetNgramState(new_prev_words);
}
-void TargetNgramFeature::appendNgram(const Word& word, bool& skip, stringstream &ngram) const {
+void TargetNgramFeature::appendNgram(const Word& word, bool& skip, stringstream &ngram) const
+{
// const string& w = word.GetFactor(m_factorType)->GetString();
- const StringPiece& w = word.GetString(m_factorType);
- if (m_vocab.size() && (FindStringPiece(m_vocab, w) == m_vocab.end())) skip = true;
- else {
- ngram << w;
- ngram << ":";
- }
+ const StringPiece& w = word.GetString(m_factorType);
+ if (m_vocab.size() && (FindStringPiece(m_vocab, w) == m_vocab.end())) skip = true;
+ else {
+ ngram << w;
+ ngram << ":";
+ }
}
FFState* TargetNgramFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int featureId, ScoreComponentCollection* accumulator) const
@@ -205,159 +204,149 @@ FFState* TargetNgramFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int
bool onlyTerminals = true;
bool prev_is_NT = false;
size_t prev_subPhraseLength = 0;
- for (size_t phrasePos = 0; phrasePos < cur_hypo.GetCurrTargetPhrase().GetSize(); phrasePos++)
- {
+ for (size_t phrasePos = 0; phrasePos < cur_hypo.GetCurrTargetPhrase().GetSize(); phrasePos++) {
// consult rule for either word or non-terminal
const Word &word = cur_hypo.GetCurrTargetPhrase().GetWord(phrasePos);
// cerr << "word: " << word << endl;
// regular word
if (!word.IsNonTerminal()) {
- contextFactor.push_back(&word);
- prev_is_NT = false;
+ contextFactor.push_back(&word);
+ prev_is_NT = false;
if (phrasePos==0)
- makePrefix = true;
+ makePrefix = true;
if (phrasePos==cur_hypo.GetCurrTargetPhrase().GetSize()-1 || prev_is_NT)
- makeSuffix = true;
-
+ makeSuffix = true;
+
// beginning/end of sentence symbol <s>,</s>?
StringPiece factorZero = word.GetString(0);
if (factorZero.compare("<s>") == 0)
- prefixTerminals++;
+ prefixTerminals++;
// end of sentence symbol </s>?
else if (factorZero.compare("</s>") == 0)
- suffixTerminals++;
+ suffixTerminals++;
// everything else
else {
- stringstream ngram;
- ngram << m_baseName;
- if (m_factorType == 0)
- ngram << factorZero;
- else
- ngram << word.GetString(m_factorType);
- accumulator->SparsePlusEquals(ngram.str(), 1);
-
- if (collectForPrefix)
- prefixTerminals++;
- else
- suffixTerminals++;
+ stringstream ngram;
+ ngram << m_baseName;
+ if (m_factorType == 0)
+ ngram << factorZero;
+ else
+ ngram << word.GetString(m_factorType);
+ accumulator->SparsePlusEquals(ngram.str(), 1);
+
+ if (collectForPrefix)
+ prefixTerminals++;
+ else
+ suffixTerminals++;
}
}
// non-terminal, add phrase from underlying hypothesis
- else if (m_n > 1)
- {
+ else if (m_n > 1) {
// look up underlying hypothesis
size_t nonTermIndex = nonTermIndexMap[phrasePos];
const ChartHypothesis *prevHypo = cur_hypo.GetPrevHypo(nonTermIndex);
const TargetNgramChartState* prevState =
- static_cast<const TargetNgramChartState*>(prevHypo->GetFFState(featureId));
+ static_cast<const TargetNgramChartState*>(prevHypo->GetFFState(featureId));
size_t subPhraseLength = prevState->GetNumTargetTerminals();
// special case: rule starts with non-terminal
if (phrasePos == 0) {
- if (subPhraseLength == 1) {
- makePrefix = true;
- ++prefixTerminals;
+ if (subPhraseLength == 1) {
+ makePrefix = true;
+ ++prefixTerminals;
- const Word &word = prevState->GetSuffix().GetWord(0);
+ const Word &word = prevState->GetSuffix().GetWord(0);
// cerr << "NT0 --> : " << word << endl;
- contextFactor.push_back(&word);
- }
- else {
- onlyTerminals = false;
- collectForPrefix = false;
- int suffixPos = prevState->GetSuffix().GetSize() - (m_n-1);
- if (suffixPos < 0) suffixPos = 0; // push all words if less than order
- for(;(size_t)suffixPos < prevState->GetSuffix().GetSize(); suffixPos++)
- {
- const Word &word = prevState->GetSuffix().GetWord(suffixPos);
+ contextFactor.push_back(&word);
+ } else {
+ onlyTerminals = false;
+ collectForPrefix = false;
+ int suffixPos = prevState->GetSuffix().GetSize() - (m_n-1);
+ if (suffixPos < 0) suffixPos = 0; // push all words if less than order
+ for(; (size_t)suffixPos < prevState->GetSuffix().GetSize(); suffixPos++) {
+ const Word &word = prevState->GetSuffix().GetWord(suffixPos);
// cerr << "NT0 --> : " << word << endl;
- contextFactor.push_back(&word);
- }
- }
+ contextFactor.push_back(&word);
+ }
+ }
}
// internal non-terminal
- else
- {
- // push its prefix
- for(size_t prefixPos = 0; prefixPos < m_n-1
- && prefixPos < subPhraseLength; prefixPos++)
- {
+ else {
+ // push its prefix
+ for(size_t prefixPos = 0; prefixPos < m_n-1
+ && prefixPos < subPhraseLength; prefixPos++) {
const Word &word = prevState->GetPrefix().GetWord(prefixPos);
// cerr << "NT --> " << word << endl;
contextFactor.push_back(&word);
}
- if (subPhraseLength==1) {
- if (collectForPrefix)
- ++prefixTerminals;
- else
- ++suffixTerminals;
-
- if (phrasePos == cur_hypo.GetCurrTargetPhrase().GetSize()-1)
- makeSuffix = true;
- }
- else {
- onlyTerminals = false;
- collectForPrefix = true;
-
- // check if something follows this NT
- bool wordFollowing = (phrasePos < cur_hypo.GetCurrTargetPhrase().GetSize() - 1)? true : false;
-
- // check if we are dealing with a large sub-phrase
- if (wordFollowing && subPhraseLength > m_n - 1)
- {
- // clear up pending ngrams
- MakePrefixNgrams(contextFactor, accumulator, prefixTerminals);
- contextFactor.clear();
- makePrefix = false;
- makeSuffix = true;
- collectForPrefix = false;
- prefixTerminals = 0;
- suffixTerminals = 0;
-
- // push its suffix
- size_t remainingWords = (remainingWords > m_n-1) ? m_n-1 : subPhraseLength - (m_n-1);
- for(size_t suffixPos = 0; suffixPos < prevState->GetSuffix().GetSize(); suffixPos++) {
- const Word &word = prevState->GetSuffix().GetWord(suffixPos);
+ if (subPhraseLength==1) {
+ if (collectForPrefix)
+ ++prefixTerminals;
+ else
+ ++suffixTerminals;
+
+ if (phrasePos == cur_hypo.GetCurrTargetPhrase().GetSize()-1)
+ makeSuffix = true;
+ } else {
+ onlyTerminals = false;
+ collectForPrefix = true;
+
+ // check if something follows this NT
+ bool wordFollowing = (phrasePos < cur_hypo.GetCurrTargetPhrase().GetSize() - 1)? true : false;
+
+ // check if we are dealing with a large sub-phrase
+ if (wordFollowing && subPhraseLength > m_n - 1) {
+ // clear up pending ngrams
+ MakePrefixNgrams(contextFactor, accumulator, prefixTerminals);
+ contextFactor.clear();
+ makePrefix = false;
+ makeSuffix = true;
+ collectForPrefix = false;
+ prefixTerminals = 0;
+ suffixTerminals = 0;
+
+ // push its suffix
+ size_t remainingWords = (remainingWords > m_n-1) ? m_n-1 : subPhraseLength - (m_n-1);
+ for(size_t suffixPos = 0; suffixPos < prevState->GetSuffix().GetSize(); suffixPos++) {
+ const Word &word = prevState->GetSuffix().GetWord(suffixPos);
// cerr << "NT --> : " << word << endl;
- contextFactor.push_back(&word);
- }
- }
- // subphrase can be used as suffix and as prefix for the next part
- else if (wordFollowing && subPhraseLength == m_n - 1)
- {
- // clear up pending ngrams
- MakePrefixNgrams(contextFactor, accumulator, prefixTerminals);
- makePrefix = false;
- makeSuffix = true;
- collectForPrefix = false;
- prefixTerminals = 0;
- suffixTerminals = 0;
- }
- else if (prev_is_NT && prev_subPhraseLength > 1 && subPhraseLength > 1) {
- // two NTs in a row: make transition
- MakePrefixNgrams(contextFactor, accumulator, 1, m_n-2);
- MakeSuffixNgrams(contextFactor, accumulator, 1, m_n-2);
- makePrefix = false;
- makeSuffix = false;
- collectForPrefix = false;
- prefixTerminals = 0;
- suffixTerminals = 0;
-
- // remove duplicates
- stringstream curr_ngram;
- curr_ngram << m_baseName;
- curr_ngram << (*contextFactor[m_n-2]).GetString(m_factorType);
- curr_ngram << ":";
- curr_ngram << (*contextFactor[m_n-1]).GetString(m_factorType);
- accumulator->SparseMinusEquals(curr_ngram.str(),1);
- }
- }
+ contextFactor.push_back(&word);
+ }
+ }
+ // subphrase can be used as suffix and as prefix for the next part
+ else if (wordFollowing && subPhraseLength == m_n - 1) {
+ // clear up pending ngrams
+ MakePrefixNgrams(contextFactor, accumulator, prefixTerminals);
+ makePrefix = false;
+ makeSuffix = true;
+ collectForPrefix = false;
+ prefixTerminals = 0;
+ suffixTerminals = 0;
+ } else if (prev_is_NT && prev_subPhraseLength > 1 && subPhraseLength > 1) {
+ // two NTs in a row: make transition
+ MakePrefixNgrams(contextFactor, accumulator, 1, m_n-2);
+ MakeSuffixNgrams(contextFactor, accumulator, 1, m_n-2);
+ makePrefix = false;
+ makeSuffix = false;
+ collectForPrefix = false;
+ prefixTerminals = 0;
+ suffixTerminals = 0;
+
+ // remove duplicates
+ stringstream curr_ngram;
+ curr_ngram << m_baseName;
+ curr_ngram << (*contextFactor[m_n-2]).GetString(m_factorType);
+ curr_ngram << ":";
+ curr_ngram << (*contextFactor[m_n-1]).GetString(m_factorType);
+ accumulator->SparseMinusEquals(curr_ngram.str(),1);
+ }
+ }
}
prev_is_NT = true;
prev_subPhraseLength = subPhraseLength;
@@ -366,25 +355,24 @@ FFState* TargetNgramFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int
if (m_n > 1) {
if (onlyTerminals) {
- MakePrefixNgrams(contextFactor, accumulator, prefixTerminals-1);
- }
- else {
+ MakePrefixNgrams(contextFactor, accumulator, prefixTerminals-1);
+ } else {
if (makePrefix)
- MakePrefixNgrams(contextFactor, accumulator, prefixTerminals);
+ MakePrefixNgrams(contextFactor, accumulator, prefixTerminals);
if (makeSuffix)
- MakeSuffixNgrams(contextFactor, accumulator, suffixTerminals);
+ MakeSuffixNgrams(contextFactor, accumulator, suffixTerminals);
// remove duplicates
size_t size = contextFactor.size();
if (makePrefix && makeSuffix && (size <= m_n)) {
- stringstream curr_ngram;
- curr_ngram << m_baseName;
- for (size_t i = 0; i < size; ++i) {
- curr_ngram << (*contextFactor[i]).GetString(m_factorType);
- if (i < size-1)
- curr_ngram << ":";
- }
- accumulator->SparseMinusEquals(curr_ngram.str(), 1);
+ stringstream curr_ngram;
+ curr_ngram << m_baseName;
+ for (size_t i = 0; i < size; ++i) {
+ curr_ngram << (*contextFactor[i]).GetString(m_factorType);
+ if (i < size-1)
+ curr_ngram << ":";
+ }
+ accumulator->SparseMinusEquals(curr_ngram.str(), 1);
}
}
}
@@ -393,22 +381,23 @@ FFState* TargetNgramFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int
return new TargetNgramChartState(cur_hypo, featureId, m_n);
}
-void TargetNgramFeature::MakePrefixNgrams(std::vector<const Word*> &contextFactor, ScoreComponentCollection* accumulator, size_t numberOfStartPos, size_t offset) const {
- stringstream ngram;
- size_t size = contextFactor.size();
+void TargetNgramFeature::MakePrefixNgrams(std::vector<const Word*> &contextFactor, ScoreComponentCollection* accumulator, size_t numberOfStartPos, size_t offset) const
+{
+ stringstream ngram;
+ size_t size = contextFactor.size();
for (size_t k = 0; k < numberOfStartPos; ++k) {
size_t max_end = (size < m_n+k+offset)? size: m_n+k+offset;
for (size_t end_pos = 1+k+offset; end_pos < max_end; ++end_pos) {
ngram << m_baseName;
- for (size_t i=k+offset; i <= end_pos; ++i) {
- if (i > k+offset)
- ngram << ":";
+ for (size_t i=k+offset; i <= end_pos; ++i) {
+ if (i > k+offset)
+ ngram << ":";
StringPiece factorZero = (*contextFactor[i]).GetString(0);
if (m_factorType == 0 || factorZero.compare("<s>") == 0 || factorZero.compare("</s>") == 0)
- ngram << factorZero;
- else
- ngram << (*contextFactor[i]).GetString(m_factorType);
- const Word w = *contextFactor[i];
+ ngram << factorZero;
+ else
+ ngram << (*contextFactor[i]).GetString(m_factorType);
+ const Word w = *contextFactor[i];
}
// cerr << "p-ngram: " << ngram.str() << endl;
accumulator->SparsePlusEquals(ngram.str(), 1);
@@ -417,21 +406,22 @@ void TargetNgramFeature::MakePrefixNgrams(std::vector<const Word*> &contextFacto
}
}
-void TargetNgramFeature::MakeSuffixNgrams(std::vector<const Word*> &contextFactor, ScoreComponentCollection* accumulator, size_t numberOfEndPos, size_t offset) const {
- stringstream ngram;
+void TargetNgramFeature::MakeSuffixNgrams(std::vector<const Word*> &contextFactor, ScoreComponentCollection* accumulator, size_t numberOfEndPos, size_t offset) const
+{
+ stringstream ngram;
for (size_t k = 0; k < numberOfEndPos; ++k) {
size_t end_pos = contextFactor.size()-1-k-offset;
for (int start_pos=end_pos-1; (start_pos >= 0) && (end_pos-start_pos < m_n); --start_pos) {
- ngram << m_baseName;
- for (size_t j=start_pos; j <= end_pos; ++j){
- StringPiece factorZero = (*contextFactor[j]).GetString(0);
- if (m_factorType == 0 || factorZero.compare("<s>") == 0 || factorZero.compare("</s>") == 0)
- ngram << factorZero;
- else
- ngram << (*contextFactor[j]).GetString(m_factorType);
- if (j < end_pos)
- ngram << ":";
- }
+ ngram << m_baseName;
+ for (size_t j=start_pos; j <= end_pos; ++j) {
+ StringPiece factorZero = (*contextFactor[j]).GetString(0);
+ if (m_factorType == 0 || factorZero.compare("<s>") == 0 || factorZero.compare("</s>") == 0)
+ ngram << factorZero;
+ else
+ ngram << (*contextFactor[j]).GetString(m_factorType);
+ if (j < end_pos)
+ ngram << ":";
+ }
// cerr << "s-ngram: " << ngram.str() << endl;
accumulator->SparsePlusEquals(ngram.str(), 1);
ngram.str("");
diff --git a/moses/FF/TargetNgramFeature.h b/moses/FF/TargetNgramFeature.h
index b50391d43..8001f2f87 100644
--- a/moses/FF/TargetNgramFeature.h
+++ b/moses/FF/TargetNgramFeature.h
@@ -16,14 +16,17 @@
namespace Moses
{
-class TargetNgramState : public FFState {
- public:
- TargetNgramState(std::vector<Word> &words): m_words(words) {}
- const std::vector<Word> GetWords() const {return m_words;}
- virtual int Compare(const FFState& other) const;
-
- private:
- std::vector<Word> m_words;
+class TargetNgramState : public FFState
+{
+public:
+ TargetNgramState(std::vector<Word> &words): m_words(words) {}
+ const std::vector<Word> GetWords() const {
+ return m_words;
+ }
+ virtual int Compare(const FFState& other) const;
+
+private:
+ std::vector<Word> m_words;
};
class TargetNgramChartState : public FFState
@@ -39,8 +42,7 @@ private:
* \param ret prefix string
* \param size maximum size (typically max lm context window)
*/
- size_t CalcPrefix(const ChartHypothesis &hypo, const int featureId, Phrase &ret, size_t size) const
- {
+ size_t CalcPrefix(const ChartHypothesis &hypo, const int featureId, Phrase &ret, size_t size) const {
const TargetPhrase &target = hypo.GetCurrTargetPhrase();
const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
target.GetAlignNonTerm().GetNonTermIndexMap();
@@ -76,9 +78,8 @@ private:
* \param ret suffix phrase
* \param size maximum size of suffix
*/
- size_t CalcSuffix(const ChartHypothesis &hypo, int featureId, Phrase &ret, size_t size) const
- {
- size_t prefixSize = m_contextPrefix.GetSize();
+ size_t CalcSuffix(const ChartHypothesis &hypo, int featureId, Phrase &ret, size_t size) const {
+ size_t prefixSize = m_contextPrefix.GetSize();
assert(prefixSize <= m_numTargetTerminals);
// special handling for small hypotheses
@@ -98,9 +99,9 @@ private:
}
// construct suffix analogous to prefix
else {
- const TargetPhrase targetPhrase = hypo.GetCurrTargetPhrase();
+ const TargetPhrase targetPhrase = hypo.GetCurrTargetPhrase();
const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
- targetPhrase.GetAlignTerm().GetNonTermIndexMap();
+ targetPhrase.GetAlignTerm().GetNonTermIndexMap();
for (int pos = (int) targetPhrase.GetSize() - 1; pos >= 0 ; --pos) {
const Word &word = targetPhrase.GetWord(pos);
@@ -108,8 +109,7 @@ private:
size_t nonTermInd = nonTermIndexMap[pos];
const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermInd);
size = static_cast<const TargetNgramChartState*>(prevHypo->GetFFState(featureId))->CalcSuffix(*prevHypo, featureId, ret, size);
- }
- else {
+ } else {
ret.PrependWord(word);
size--;
}
@@ -124,9 +124,8 @@ private:
public:
TargetNgramChartState(const ChartHypothesis &hypo, int featureId, size_t order)
- :m_contextPrefix(order - 1),
- m_contextSuffix(order - 1)
- {
+ :m_contextPrefix(order - 1),
+ m_contextSuffix(order - 1) {
m_numTargetTerminals = hypo.GetCurrTargetPhrase().GetNumTerminals();
const WordsRange range = hypo.GetCurrSourceRange();
m_startPos = range.GetStartPos();
@@ -159,15 +158,13 @@ public:
static_cast<const TargetNgramChartState &>( o );
// prefix
- if (m_startPos > 0) // not for "<s> ..."
- {
+ if (m_startPos > 0) { // not for "<s> ..."
int ret = GetPrefix().Compare(other.GetPrefix());
if (ret != 0)
return ret;
}
- if (m_endPos < m_inputSize - 1)// not for "... </s>"
- {
+ if (m_endPos < m_inputSize - 1) { // not for "... </s>"
int ret = GetSuffix().Compare(other.GetSuffix());
if (ret != 0)
return ret;
@@ -178,34 +175,35 @@ public:
/** Sets the features of observed ngrams.
*/
-class TargetNgramFeature : public StatefulFeatureFunction {
+class TargetNgramFeature : public StatefulFeatureFunction
+{
public:
TargetNgramFeature(const std::string &line);
- bool Load(const std::string &filePath);
+ bool Load(const std::string &filePath);
- virtual const FFState* EmptyHypothesisState(const InputType &input) const;
+ virtual const FFState* EmptyHypothesisState(const InputType &input) const;
- virtual FFState* Evaluate(const Hypothesis& cur_hypo, const FFState* prev_state,
- ScoreComponentCollection* accumulator) const;
+ virtual FFState* Evaluate(const Hypothesis& cur_hypo, const FFState* prev_state,
+ ScoreComponentCollection* accumulator) const;
virtual FFState* EvaluateChart(const ChartHypothesis& cur_hypo, int featureId,
- ScoreComponentCollection* accumulator) const;
+ ScoreComponentCollection* accumulator) const;
private:
FactorType m_factorType;
Word m_bos;
boost::unordered_set<std::string> m_vocab;
- size_t m_n;
- bool m_lower_ngrams;
+ size_t m_n;
+ bool m_lower_ngrams;
- std::string m_baseName;
+ std::string m_baseName;
- void appendNgram(const Word& word, bool& skip, std::stringstream& ngram) const;
- void MakePrefixNgrams(std::vector<const Word*> &contextFactor, ScoreComponentCollection* accumulator,
- size_t numberOfStartPos = 1, size_t offset = 0) const;
- void MakeSuffixNgrams(std::vector<const Word*> &contextFactor, ScoreComponentCollection* accumulator,
- size_t numberOfEndPos = 1, size_t offset = 0) const;
+ void appendNgram(const Word& word, bool& skip, std::stringstream& ngram) const;
+ void MakePrefixNgrams(std::vector<const Word*> &contextFactor, ScoreComponentCollection* accumulator,
+ size_t numberOfStartPos = 1, size_t offset = 0) const;
+ void MakeSuffixNgrams(std::vector<const Word*> &contextFactor, ScoreComponentCollection* accumulator,
+ size_t numberOfEndPos = 1, size_t offset = 0) const;
};
}
diff --git a/moses/FF/TargetWordInsertionFeature.cpp b/moses/FF/TargetWordInsertionFeature.cpp
index 386e943be..f20a652e4 100644
--- a/moses/FF/TargetWordInsertionFeature.cpp
+++ b/moses/FF/TargetWordInsertionFeature.cpp
@@ -9,13 +9,14 @@
#include "moses/UserMessage.h"
#include "util/string_piece_hash.hh"
-namespace Moses {
+namespace Moses
+{
using namespace std;
TargetWordInsertionFeature::TargetWordInsertionFeature(const std::string &line)
-:StatelessFeatureFunction("TargetWordInsertionFeature", 0, line),
-m_unrestricted(true)
+ :StatelessFeatureFunction("TargetWordInsertionFeature", 0, line),
+ m_unrestricted(true)
{
std::cerr << "Initializing target word insertion feature.." << std::endl;
@@ -26,11 +27,9 @@ m_unrestricted(true)
if (args[0] == "factor") {
m_factorType = Scan<FactorType>(args[1]);
- }
- else if (args[0] == "path") {
+ } else if (args[0] == "path") {
filename = args[1];
- }
- else {
+ } else {
throw "Unknown argument " + args[0];
}
}
@@ -46,13 +45,12 @@ m_unrestricted(true)
}
-bool TargetWordInsertionFeature::Load(const std::string &filePath)
+bool TargetWordInsertionFeature::Load(const std::string &filePath)
{
ifstream inFile(filePath.c_str());
- if (!inFile)
- {
- cerr << "could not open file " << filePath << endl;
- return false;
+ if (!inFile) {
+ cerr << "could not open file " << filePath << endl;
+ return false;
}
std::string line;
@@ -67,18 +65,18 @@ bool TargetWordInsertionFeature::Load(const std::string &filePath)
}
void TargetWordInsertionFeature::Evaluate(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const
{
const AlignmentInfo &alignmentInfo = targetPhrase.GetAlignTerm();
ComputeFeatures(source, targetPhrase, &scoreBreakdown, alignmentInfo);
}
void TargetWordInsertionFeature::ComputeFeatures(const Phrase &source,
- const TargetPhrase& targetPhrase,
- ScoreComponentCollection* accumulator,
- const AlignmentInfo &alignmentInfo) const
+ const TargetPhrase& targetPhrase,
+ ScoreComponentCollection* accumulator,
+ const AlignmentInfo &alignmentInfo) const
{
// handle special case: unknown words (they have no word alignment)
size_t targetLength = targetPhrase.GetSize();
@@ -100,15 +98,14 @@ void TargetWordInsertionFeature::ComputeFeatures(const Phrase &source,
if (!aligned[i]) {
Word w = targetPhrase.GetWord(i);
if (!w.IsNonTerminal()) {
- const StringPiece word = w.GetFactor(m_factorType)->GetString();
- if (word != "<s>" && word != "</s>") {
- if (!m_unrestricted && FindStringPiece(m_vocab, word ) == m_vocab.end()) {
- accumulator->PlusEquals(this,StringPiece("OTHER"),1);
- }
- else {
- accumulator->PlusEquals(this,word,1);
- }
- }
+ const StringPiece word = w.GetFactor(m_factorType)->GetString();
+ if (word != "<s>" && word != "</s>") {
+ if (!m_unrestricted && FindStringPiece(m_vocab, word ) == m_vocab.end()) {
+ accumulator->PlusEquals(this,StringPiece("OTHER"),1);
+ } else {
+ accumulator->PlusEquals(this,word,1);
+ }
+ }
}
}
}
diff --git a/moses/FF/TargetWordInsertionFeature.h b/moses/FF/TargetWordInsertionFeature.h
index aabc4cffc..50f7e5f88 100644
--- a/moses/FF/TargetWordInsertionFeature.h
+++ b/moses/FF/TargetWordInsertionFeature.h
@@ -13,7 +13,8 @@ namespace Moses
/** Sets the features for length of source phrase, target phrase, both.
*/
-class TargetWordInsertionFeature : public StatelessFeatureFunction {
+class TargetWordInsertionFeature : public StatelessFeatureFunction
+{
private:
boost::unordered_set<std::string> m_vocab;
FactorType m_factorType;
@@ -21,18 +22,18 @@ private:
public:
TargetWordInsertionFeature(const std::string &line);
-
+
bool Load(const std::string &filePath);
virtual void Evaluate(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const;
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const;
void ComputeFeatures(const Phrase &source,
- const TargetPhrase& targetPhrase,
- ScoreComponentCollection* accumulator,
- const AlignmentInfo &alignmentInfo) const;
+ const TargetPhrase& targetPhrase,
+ ScoreComponentCollection* accumulator,
+ const AlignmentInfo &alignmentInfo) const;
};
diff --git a/moses/FF/UnknownWordPenaltyProducer.h b/moses/FF/UnknownWordPenaltyProducer.h
index b60967746..200033cfc 100644
--- a/moses/FF/UnknownWordPenaltyProducer.h
+++ b/moses/FF/UnknownWordPenaltyProducer.h
@@ -14,10 +14,9 @@ class WordsRange;
class UnknownWordPenaltyProducer : public StatelessFeatureFunction
{
public:
- UnknownWordPenaltyProducer(const std::string &line)
- : StatelessFeatureFunction("UnknownWordPenalty",1, line)
- {
- m_tuneable = false;
+ UnknownWordPenaltyProducer(const std::string &line)
+ : StatelessFeatureFunction("UnknownWordPenalty",1, line) {
+ m_tuneable = false;
}
};
diff --git a/moses/FF/WordPenaltyProducer.cpp b/moses/FF/WordPenaltyProducer.cpp
index ba97852e4..1dc425742 100644
--- a/moses/FF/WordPenaltyProducer.cpp
+++ b/moses/FF/WordPenaltyProducer.cpp
@@ -5,9 +5,9 @@
namespace Moses
{
void WordPenaltyProducer::Evaluate(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const
{
float score = - (float) targetPhrase.GetNumTerminals();
scoreBreakdown.Assign(this, score);
diff --git a/moses/FF/WordPenaltyProducer.h b/moses/FF/WordPenaltyProducer.h
index fc824dd84..1892c459c 100644
--- a/moses/FF/WordPenaltyProducer.h
+++ b/moses/FF/WordPenaltyProducer.h
@@ -14,12 +14,12 @@ class ScoreComponentCollection;
class WordPenaltyProducer : public StatelessFeatureFunction
{
public:
- WordPenaltyProducer(const std::string &line) : StatelessFeatureFunction("WordPenalty",1, line) {}
+ WordPenaltyProducer(const std::string &line) : StatelessFeatureFunction("WordPenalty",1, line) {}
virtual void Evaluate(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const;
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const;
};
diff --git a/moses/FF/WordTranslationFeature.cpp b/moses/FF/WordTranslationFeature.cpp
index 2648ac9f1..3f282609f 100644
--- a/moses/FF/WordTranslationFeature.cpp
+++ b/moses/FF/WordTranslationFeature.cpp
@@ -10,18 +10,19 @@
#include "moses/UserMessage.h"
#include "util/string_piece_hash.hh"
-namespace Moses {
+namespace Moses
+{
using namespace std;
WordTranslationFeature::WordTranslationFeature(const std::string &line)
-:StatelessFeatureFunction("WordTranslationFeature", 0, line)
-,m_unrestricted(true)
-,m_simple(true)
-,m_sourceContext(false)
-,m_targetContext(false)
-,m_ignorePunctuation(false)
-,m_domainTrigger(false)
+ :StatelessFeatureFunction("WordTranslationFeature", 0, line)
+ ,m_unrestricted(true)
+ ,m_simple(true)
+ ,m_sourceContext(false)
+ ,m_targetContext(false)
+ ,m_ignorePunctuation(false)
+ ,m_domainTrigger(false)
{
std::cerr << "Initializing word translation feature.. " << endl;
@@ -34,35 +35,25 @@ WordTranslationFeature::WordTranslationFeature(const std::string &line)
if (args[0] == "input-factor") {
m_factorTypeSource = Scan<FactorType>(args[1]);
- }
- else if (args[0] == "output-factor") {
+ } else if (args[0] == "output-factor") {
m_factorTypeTarget = Scan<FactorType>(args[1]);
- }
- else if (args[0] == "simple") {
+ } else if (args[0] == "simple") {
m_simple = Scan<bool>(args[1]);
- }
- else if (args[0] == "source-context") {
+ } else if (args[0] == "source-context") {
m_sourceContext = Scan<bool>(args[1]);
- }
- else if (args[0] == "target-context") {
+ } else if (args[0] == "target-context") {
m_targetContext = Scan<bool>(args[1]);
- }
- else if (args[0] == "ignore-punctuation") {
+ } else if (args[0] == "ignore-punctuation") {
m_ignorePunctuation = Scan<bool>(args[1]);
- }
- else if (args[0] == "domain-trigger") {
+ } else if (args[0] == "domain-trigger") {
m_domainTrigger = Scan<bool>(args[1]);
- }
- else if (args[0] == "texttype") {
+ } else if (args[0] == "texttype") {
texttype = args[1];
- }
- else if (args[0] == "source-path") {
+ } else if (args[0] == "source-path") {
filenameSource = args[1];
- }
- else if (args[0] == "target-path") {
+ } else if (args[0] == "target-path") {
filenameTarget = args[1];
- }
- else {
+ } else {
throw "Unknown argument " + args[0];
}
}
@@ -108,65 +99,62 @@ WordTranslationFeature::WordTranslationFeature(const std::string &line)
}
-bool WordTranslationFeature::Load(const std::string &filePathSource, const std::string &filePathTarget)
+bool WordTranslationFeature::Load(const std::string &filePathSource, const std::string &filePathTarget)
{
if (m_domainTrigger) {
// domain trigger terms for each input document
ifstream inFileSource(filePathSource.c_str());
- if (!inFileSource){
+ if (!inFileSource) {
cerr << "could not open file " << filePathSource << endl;
return false;
}
-
+
std::string line;
while (getline(inFileSource, line)) {
- m_vocabDomain.resize(m_vocabDomain.size() + 1);
- vector<string> termVector;
- boost::split(termVector, line, boost::is_any_of("\t "));
- for (size_t i=0; i < termVector.size(); ++i)
- m_vocabDomain.back().insert(termVector[i]);
+ m_vocabDomain.resize(m_vocabDomain.size() + 1);
+ vector<string> termVector;
+ boost::split(termVector, line, boost::is_any_of("\t "));
+ for (size_t i=0; i < termVector.size(); ++i)
+ m_vocabDomain.back().insert(termVector[i]);
}
-
+
inFileSource.close();
- }
- else {
+ } else {
// restricted source word vocabulary
ifstream inFileSource(filePathSource.c_str());
- if (!inFileSource)
- {
- cerr << "could not open file " << filePathSource << endl;
- return false;
- }
-
+ if (!inFileSource) {
+ cerr << "could not open file " << filePathSource << endl;
+ return false;
+ }
+
std::string line;
while (getline(inFileSource, line)) {
m_vocabSource.insert(line);
}
-
+
inFileSource.close();
-
+
// restricted target word vocabulary
ifstream inFileTarget(filePathTarget.c_str());
- if (!inFileTarget)
- {
- cerr << "could not open file " << filePathTarget << endl;
- return false;
- }
-
+ if (!inFileTarget) {
+ cerr << "could not open file " << filePathTarget << endl;
+ return false;
+ }
+
while (getline(inFileTarget, line)) {
m_vocabTarget.insert(line);
}
-
+
inFileTarget.close();
-
+
m_unrestricted = false;
}
return true;
}
void WordTranslationFeature::Evaluate
- (const PhraseBasedFeatureContext& context,
- ScoreComponentCollection* accumulator) const
+(const PhraseBasedFeatureContext& context,
+ ScoreComponentCollection* accumulator) const
{
const Sentence& input = static_cast<const Sentence&>(context.GetSource());
const TargetPhrase& targetPhrase = context.GetTargetPhrase();
@@ -188,7 +176,7 @@ void WordTranslationFeature::Evaluate
char firstChar = sourceWord[0];
CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
if(charIterator != m_punctuationHash.end())
- continue;
+ continue;
firstChar = targetWord[0];
charIterator = m_punctuationHash.find( firstChar );
if(charIterator != m_punctuationHash.end())
@@ -197,9 +185,9 @@ void WordTranslationFeature::Evaluate
if (!m_unrestricted) {
if (FindStringPiece(m_vocabSource, sourceWord) == m_vocabSource.end())
- sourceWord = "OTHER";
+ sourceWord = "OTHER";
if (FindStringPiece(m_vocabTarget, targetWord) == m_vocabTarget.end())
- targetWord = "OTHER";
+ targetWord = "OTHER";
}
if (m_simple) {
@@ -215,174 +203,169 @@ void WordTranslationFeature::Evaluate
const bool use_topicid = input.GetUseTopicId();
const bool use_topicid_prob = input.GetUseTopicIdAndProb();
if (use_topicid || use_topicid_prob) {
- if(use_topicid) {
- // use topicid as trigger
- const long topicid = input.GetTopicId();
- stringstream feature;
- feature << m_description << "_";
- if (topicid == -1)
- feature << "unk";
- else
- feature << topicid;
-
- feature << "_";
- feature << sourceWord;
- feature << "~";
- feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
- }
- else {
- // use topic probabilities
- const vector<string> &topicid_prob = *(input.GetTopicIdAndProb());
- if (atol(topicid_prob[0].c_str()) == -1) {
- stringstream feature;
- feature << m_description << "_unk_";
- feature << sourceWord;
- feature << "~";
- feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
- }
- else {
- for (size_t i=0; i+1 < topicid_prob.size(); i+=2) {
- stringstream feature;
- feature << m_description << "_";
- feature << topicid_prob[i];
- feature << "_";
- feature << sourceWord;
- feature << "~";
- feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), atof((topicid_prob[i+1]).c_str()));
- }
- }
- }
- }
- else {
- // range over domain trigger words (keywords)
- const long docid = input.GetDocumentId();
- for (boost::unordered_set<std::string>::const_iterator p = m_vocabDomain[docid].begin(); p != m_vocabDomain[docid].end(); ++p) {
- string sourceTrigger = *p;
- stringstream feature;
- feature << m_description << "_";
- feature << sourceTrigger;
- feature << "_";
- feature << sourceWord;
- feature << "~";
- feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
- }
+ if(use_topicid) {
+ // use topicid as trigger
+ const long topicid = input.GetTopicId();
+ stringstream feature;
+ feature << m_description << "_";
+ if (topicid == -1)
+ feature << "unk";
+ else
+ feature << topicid;
+
+ feature << "_";
+ feature << sourceWord;
+ feature << "~";
+ feature << targetWord;
+ accumulator->SparsePlusEquals(feature.str(), 1);
+ } else {
+ // use topic probabilities
+ const vector<string> &topicid_prob = *(input.GetTopicIdAndProb());
+ if (atol(topicid_prob[0].c_str()) == -1) {
+ stringstream feature;
+ feature << m_description << "_unk_";
+ feature << sourceWord;
+ feature << "~";
+ feature << targetWord;
+ accumulator->SparsePlusEquals(feature.str(), 1);
+ } else {
+ for (size_t i=0; i+1 < topicid_prob.size(); i+=2) {
+ stringstream feature;
+ feature << m_description << "_";
+ feature << topicid_prob[i];
+ feature << "_";
+ feature << sourceWord;
+ feature << "~";
+ feature << targetWord;
+ accumulator->SparsePlusEquals(feature.str(), atof((topicid_prob[i+1]).c_str()));
+ }
+ }
+ }
+ } else {
+ // range over domain trigger words (keywords)
+ const long docid = input.GetDocumentId();
+ for (boost::unordered_set<std::string>::const_iterator p = m_vocabDomain[docid].begin(); p != m_vocabDomain[docid].end(); ++p) {
+ string sourceTrigger = *p;
+ stringstream feature;
+ feature << m_description << "_";
+ feature << sourceTrigger;
+ feature << "_";
+ feature << sourceWord;
+ feature << "~";
+ feature << targetWord;
+ accumulator->SparsePlusEquals(feature.str(), 1);
+ }
}
}
if (m_sourceContext) {
size_t globalSourceIndex = context.GetTranslationOption().GetStartPos() + sourceIndex;
if (!m_domainTrigger && globalSourceIndex == 0) {
- // add <s> trigger feature for source
- stringstream feature;
- feature << m_description << "_";
- feature << "<s>,";
- feature << sourceWord;
- feature << "~";
- feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
+ // add <s> trigger feature for source
+ stringstream feature;
+ feature << m_description << "_";
+ feature << "<s>,";
+ feature << sourceWord;
+ feature << "~";
+ feature << targetWord;
+ accumulator->SparsePlusEquals(feature.str(), 1);
}
// range over source words to get context
for(size_t contextIndex = 0; contextIndex < input.GetSize(); contextIndex++ ) {
- if (contextIndex == globalSourceIndex) continue;
- StringPiece sourceTrigger = input.GetWord(contextIndex).GetFactor(m_factorTypeSource)->GetString();
- if (m_ignorePunctuation) {
- // check if trigger is punctuation
- char firstChar = sourceTrigger[0];
- CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
- if(charIterator != m_punctuationHash.end())
- continue;
- }
-
- const long docid = input.GetDocumentId();
- bool sourceTriggerExists = false;
- if (m_domainTrigger)
- sourceTriggerExists = FindStringPiece(m_vocabDomain[docid], sourceTrigger ) != m_vocabDomain[docid].end();
- else if (!m_unrestricted)
- sourceTriggerExists = FindStringPiece(m_vocabSource, sourceTrigger ) != m_vocabSource.end();
-
- if (m_domainTrigger) {
- if (sourceTriggerExists) {
- stringstream feature;
- feature << m_description << "_";
- feature << sourceTrigger;
- feature << "_";
- feature << sourceWord;
- feature << "~";
- feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
- }
- }
- else if (m_unrestricted || sourceTriggerExists) {
- stringstream feature;
- feature << m_description << "_";
- if (contextIndex < globalSourceIndex) {
- feature << sourceTrigger;
- feature << ",";
- feature << sourceWord;
- }
- else {
- feature << sourceWord;
- feature << ",";
- feature << sourceTrigger;
- }
- feature << "~";
- feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
- }
+ if (contextIndex == globalSourceIndex) continue;
+ StringPiece sourceTrigger = input.GetWord(contextIndex).GetFactor(m_factorTypeSource)->GetString();
+ if (m_ignorePunctuation) {
+ // check if trigger is punctuation
+ char firstChar = sourceTrigger[0];
+ CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
+ if(charIterator != m_punctuationHash.end())
+ continue;
+ }
+
+ const long docid = input.GetDocumentId();
+ bool sourceTriggerExists = false;
+ if (m_domainTrigger)
+ sourceTriggerExists = FindStringPiece(m_vocabDomain[docid], sourceTrigger ) != m_vocabDomain[docid].end();
+ else if (!m_unrestricted)
+ sourceTriggerExists = FindStringPiece(m_vocabSource, sourceTrigger ) != m_vocabSource.end();
+
+ if (m_domainTrigger) {
+ if (sourceTriggerExists) {
+ stringstream feature;
+ feature << m_description << "_";
+ feature << sourceTrigger;
+ feature << "_";
+ feature << sourceWord;
+ feature << "~";
+ feature << targetWord;
+ accumulator->SparsePlusEquals(feature.str(), 1);
+ }
+ } else if (m_unrestricted || sourceTriggerExists) {
+ stringstream feature;
+ feature << m_description << "_";
+ if (contextIndex < globalSourceIndex) {
+ feature << sourceTrigger;
+ feature << ",";
+ feature << sourceWord;
+ } else {
+ feature << sourceWord;
+ feature << ",";
+ feature << sourceTrigger;
+ }
+ feature << "~";
+ feature << targetWord;
+ accumulator->SparsePlusEquals(feature.str(), 1);
+ }
}
}
if (m_targetContext) {
throw runtime_error("Can't use target words outside current translation option in a stateless feature");
/*
- size_t globalTargetIndex = cur_hypo.GetCurrTargetWordsRange().GetStartPos() + targetIndex;
- if (globalTargetIndex == 0) {
- // add <s> trigger feature for source
- stringstream feature;
- feature << "wt_";
- feature << sourceWord;
- feature << "~";
- feature << "<s>,";
- feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
- }
-
- // range over target words (up to current position) to get context
- for(size_t contextIndex = 0; contextIndex < globalTargetIndex; contextIndex++ ) {
- string targetTrigger = cur_hypo.GetWord(contextIndex).GetFactor(m_factorTypeTarget)->GetString();
- if (m_ignorePunctuation) {
- // check if trigger is punctuation
- char firstChar = targetTrigger.at(0);
- CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
- if(charIterator != m_punctuationHash.end())
- continue;
- }
-
- bool targetTriggerExists = false;
- if (!m_unrestricted)
- targetTriggerExists = m_vocabTarget.find( targetTrigger ) != m_vocabTarget.end();
-
- if (m_unrestricted || targetTriggerExists) {
- stringstream feature;
- feature << "wt_";
- feature << sourceWord;
- feature << "~";
- feature << targetTrigger;
- feature << ",";
- feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
- }
- }*/
+ size_t globalTargetIndex = cur_hypo.GetCurrTargetWordsRange().GetStartPos() + targetIndex;
+ if (globalTargetIndex == 0) {
+ // add <s> trigger feature for source
+ stringstream feature;
+ feature << "wt_";
+ feature << sourceWord;
+ feature << "~";
+ feature << "<s>,";
+ feature << targetWord;
+ accumulator->SparsePlusEquals(feature.str(), 1);
+ }
+
+ // range over target words (up to current position) to get context
+ for(size_t contextIndex = 0; contextIndex < globalTargetIndex; contextIndex++ ) {
+ string targetTrigger = cur_hypo.GetWord(contextIndex).GetFactor(m_factorTypeTarget)->GetString();
+ if (m_ignorePunctuation) {
+ // check if trigger is punctuation
+ char firstChar = targetTrigger.at(0);
+ CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
+ if(charIterator != m_punctuationHash.end())
+ continue;
+ }
+
+ bool targetTriggerExists = false;
+ if (!m_unrestricted)
+ targetTriggerExists = m_vocabTarget.find( targetTrigger ) != m_vocabTarget.end();
+
+ if (m_unrestricted || targetTriggerExists) {
+ stringstream feature;
+ feature << "wt_";
+ feature << sourceWord;
+ feature << "~";
+ feature << targetTrigger;
+ feature << ",";
+ feature << targetWord;
+ accumulator->SparsePlusEquals(feature.str(), 1);
+ }
+ }*/
}
}
}
void WordTranslationFeature::EvaluateChart(
- const ChartBasedFeatureContext& context,
- ScoreComponentCollection* accumulator) const
+ const ChartBasedFeatureContext& context,
+ ScoreComponentCollection* accumulator) const
{
const TargetPhrase& targetPhrase = context.GetTargetPhrase();
const AlignmentInfo &alignmentInfo = targetPhrase.GetAlignTerm();
@@ -403,7 +386,7 @@ void WordTranslationFeature::EvaluateChart(
char firstChar = sourceWord[0];
CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
if(charIterator != m_punctuationHash.end())
- continue;
+ continue;
firstChar = targetWord[0];
charIterator = m_punctuationHash.find( firstChar );
if(charIterator != m_punctuationHash.end())
@@ -411,118 +394,118 @@ void WordTranslationFeature::EvaluateChart(
}
if (!m_unrestricted) {
- if (FindStringPiece(m_vocabSource, sourceWord) == m_vocabSource.end())
- sourceWord = "OTHER";
- if (FindStringPiece(m_vocabTarget, targetWord) == m_vocabTarget.end())
- targetWord = "OTHER";
+ if (FindStringPiece(m_vocabSource, sourceWord) == m_vocabSource.end())
+ sourceWord = "OTHER";
+ if (FindStringPiece(m_vocabTarget, targetWord) == m_vocabTarget.end())
+ targetWord = "OTHER";
}
-
+
if (m_simple) {
- // construct feature name
- stringstream featureName;
- featureName << m_description << "_";
- //featureName << ((sourceExists||m_unrestricted) ? sourceWord : "OTHER");
- featureName << sourceWord;
- featureName << "~";
- //featureName << ((targetExists||m_unrestricted) ? targetWord : "OTHER");
- featureName << targetWord;
- accumulator->SparsePlusEquals(featureName.str(), 1);
+ // construct feature name
+ stringstream featureName;
+ featureName << m_description << "_";
+ //featureName << ((sourceExists||m_unrestricted) ? sourceWord : "OTHER");
+ featureName << sourceWord;
+ featureName << "~";
+ //featureName << ((targetExists||m_unrestricted) ? targetWord : "OTHER");
+ featureName << targetWord;
+ accumulator->SparsePlusEquals(featureName.str(), 1);
}
- /* if (m_sourceContext) {
- size_t globalSourceIndex = cur_hypo.GetCurrSourceRange().GetStartPos() + sourceIndex;
- if (globalSourceIndex == 0) {
- // add <s> trigger feature for source
- stringstream feature;
- feature << "wt_";
- feature << "<s>,";
- feature << sourceWord;
- feature << "~";
- feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
- cerr << feature.str() << endl;
- }
-
- // range over source words to get context
- for(size_t contextIndex = 0; contextIndex < input.GetSize(); contextIndex++ ) {
- if (contextIndex == globalSourceIndex) continue;
- string sourceTrigger = input.GetWord(contextIndex).GetFactor(m_factorTypeSource)->GetString();
- if (m_ignorePunctuation) {
- // check if trigger is punctuation
- char firstChar = sourceTrigger.at(0);
- CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
- if(charIterator != m_punctuationHash.end())
- continue;
- }
-
- bool sourceTriggerExists = false;
- if (!m_unrestricted)
- sourceTriggerExists = m_vocabSource.find( sourceTrigger ) != m_vocabSource.end();
-
- if (m_unrestricted || sourceTriggerExists) {
- stringstream feature;
- feature << "wt_";
- if (contextIndex < globalSourceIndex) {
- feature << sourceTrigger;
- feature << ",";
- feature << sourceWord;
- }
- else {
- feature << sourceWord;
- feature << ",";
- feature << sourceTrigger;
- }
- feature << "~";
- feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
- cerr << feature.str() << endl;
- }
- }
- }*/
-/* if (m_targetContext) {
- size_t globalTargetIndex = 0; // TODO
-// size_t globalTargetIndex = cur_hypo.GetCurrTargetWordsRange().GetStartPos() + targetIndex;
- if (globalTargetIndex == 0) {
- // add <s> trigger feature for source
- stringstream feature;
- feature << "wt_";
- feature << sourceWord;
- feature << "~";
- feature << "<s>,";
- feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
- cerr << feature.str() << endl;
- }
-
- // range over target words (up to current position) to get context
- for(size_t contextIndex = 0; contextIndex < globalTargetIndex; contextIndex++ ) {
- Phrase outputPhrase = cur_hypo.GetOutputPhrase();
- string targetTrigger = outputPhrase.GetWord(contextIndex).GetFactor(m_factorTypeTarget)->GetString();
- //string targetTrigger = cur_hypo.GetWord(contextIndex).GetFactor(m_factorTypeTarget)->GetString();
- if (m_ignorePunctuation) {
- // check if trigger is punctuation
- char firstChar = targetTrigger.at(0);
- CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
- if(charIterator != m_punctuationHash.end())
- continue;
- }
-
- bool targetTriggerExists = false;
- if (!m_unrestricted)
- targetTriggerExists = m_vocabTarget.find( targetTrigger ) != m_vocabTarget.end();
-
- if (m_unrestricted || targetTriggerExists) {
- stringstream feature;
- feature << "wt_";
- feature << sourceWord;
- feature << "~";
- feature << targetTrigger;
- feature << ",";
- feature << targetWord;
- accumulator->SparsePlusEquals(feature.str(), 1);
- cerr << feature.str() << endl;
- }
- }
+ /* if (m_sourceContext) {
+ size_t globalSourceIndex = cur_hypo.GetCurrSourceRange().GetStartPos() + sourceIndex;
+ if (globalSourceIndex == 0) {
+ // add <s> trigger feature for source
+ stringstream feature;
+ feature << "wt_";
+ feature << "<s>,";
+ feature << sourceWord;
+ feature << "~";
+ feature << targetWord;
+ accumulator->SparsePlusEquals(feature.str(), 1);
+ cerr << feature.str() << endl;
+ }
+
+ // range over source words to get context
+ for(size_t contextIndex = 0; contextIndex < input.GetSize(); contextIndex++ ) {
+ if (contextIndex == globalSourceIndex) continue;
+ string sourceTrigger = input.GetWord(contextIndex).GetFactor(m_factorTypeSource)->GetString();
+ if (m_ignorePunctuation) {
+ // check if trigger is punctuation
+ char firstChar = sourceTrigger.at(0);
+ CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
+ if(charIterator != m_punctuationHash.end())
+ continue;
+ }
+
+ bool sourceTriggerExists = false;
+ if (!m_unrestricted)
+ sourceTriggerExists = m_vocabSource.find( sourceTrigger ) != m_vocabSource.end();
+
+ if (m_unrestricted || sourceTriggerExists) {
+ stringstream feature;
+ feature << "wt_";
+ if (contextIndex < globalSourceIndex) {
+ feature << sourceTrigger;
+ feature << ",";
+ feature << sourceWord;
+ }
+ else {
+ feature << sourceWord;
+ feature << ",";
+ feature << sourceTrigger;
+ }
+ feature << "~";
+ feature << targetWord;
+ accumulator->SparsePlusEquals(feature.str(), 1);
+ cerr << feature.str() << endl;
+ }
+ }
}*/
+ /* if (m_targetContext) {
+ size_t globalTargetIndex = 0; // TODO
+ // size_t globalTargetIndex = cur_hypo.GetCurrTargetWordsRange().GetStartPos() + targetIndex;
+ if (globalTargetIndex == 0) {
+ // add <s> trigger feature for source
+ stringstream feature;
+ feature << "wt_";
+ feature << sourceWord;
+ feature << "~";
+ feature << "<s>,";
+ feature << targetWord;
+ accumulator->SparsePlusEquals(feature.str(), 1);
+ cerr << feature.str() << endl;
+ }
+
+ // range over target words (up to current position) to get context
+ for(size_t contextIndex = 0; contextIndex < globalTargetIndex; contextIndex++ ) {
+ Phrase outputPhrase = cur_hypo.GetOutputPhrase();
+ string targetTrigger = outputPhrase.GetWord(contextIndex).GetFactor(m_factorTypeTarget)->GetString();
+ //string targetTrigger = cur_hypo.GetWord(contextIndex).GetFactor(m_factorTypeTarget)->GetString();
+ if (m_ignorePunctuation) {
+ // check if trigger is punctuation
+ char firstChar = targetTrigger.at(0);
+ CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
+ if(charIterator != m_punctuationHash.end())
+ continue;
+ }
+
+ bool targetTriggerExists = false;
+ if (!m_unrestricted)
+ targetTriggerExists = m_vocabTarget.find( targetTrigger ) != m_vocabTarget.end();
+
+ if (m_unrestricted || targetTriggerExists) {
+ stringstream feature;
+ feature << "wt_";
+ feature << sourceWord;
+ feature << "~";
+ feature << targetTrigger;
+ feature << ",";
+ feature << targetWord;
+ accumulator->SparsePlusEquals(feature.str(), 1);
+ cerr << feature.str() << endl;
+ }
+ }
+ }*/
}
}
diff --git a/moses/FF/WordTranslationFeature.h b/moses/FF/WordTranslationFeature.h
index 3379e8c84..b3a434325 100644
--- a/moses/FF/WordTranslationFeature.h
+++ b/moses/FF/WordTranslationFeature.h
@@ -14,11 +14,12 @@ namespace Moses
/** Sets the features for word translation
*/
-class WordTranslationFeature : public StatelessFeatureFunction {
+class WordTranslationFeature : public StatelessFeatureFunction
+{
typedef std::map< char, short > CharHash;
typedef std::vector< boost::unordered_set<std::string> > DocumentVector;
-
+
private:
boost::unordered_set<std::string> m_vocabSource;
boost::unordered_set<std::string> m_vocabTarget;
@@ -32,18 +33,18 @@ private:
bool m_domainTrigger;
bool m_ignorePunctuation;
CharHash m_punctuationHash;
-
+
public:
WordTranslationFeature(const std::string &line);
-
+
bool Load(const std::string &filePathSource, const std::string &filePathTarget);
-
+
const FFState* EmptyHypothesisState(const InputType &) const {
return new DummyState();
}
-
- void Evaluate(const PhraseBasedFeatureContext& context,
- ScoreComponentCollection* accumulator) const;
+
+ void Evaluate(const PhraseBasedFeatureContext& context,
+ ScoreComponentCollection* accumulator) const;
void EvaluateChart(const ChartBasedFeatureContext& context,
ScoreComponentCollection* accumulator) const;
diff --git a/moses/Factor.h b/moses/Factor.h
index 87e8f8028..f4bb2074d 100644
--- a/moses/Factor.h
+++ b/moses/Factor.h
@@ -34,8 +34,8 @@ namespace Moses
struct FactorFriend;
class FactorCollection;
-/** Represents a factor (word, POS, etc).
- * A Factor has a contiguous identifier and string value.
+/** Represents a factor (word, POS, etc).
+ * A Factor has a contiguous identifier and string value.
*/
class Factor
{
@@ -53,10 +53,10 @@ class Factor
//! protected constructor. only friend class, FactorCollection, is allowed to create Factor objects
Factor() {}
- // Needed for STL containers. They'll delegate through FactorFriend, which is never exposed publicly.
+ // Needed for STL containers. They'll delegate through FactorFriend, which is never exposed publicly.
Factor(const Factor &factor) : m_string(factor.m_string), m_id(factor.m_id) {}
- // Not implemented. Shouldn't be called.
+ // Not implemented. Shouldn't be called.
Factor &operator=(const Factor &factor);
public:
diff --git a/moses/FactorCollection.cpp b/moses/FactorCollection.cpp
index 969bb39d1..5d6eb1c53 100644
--- a/moses/FactorCollection.cpp
+++ b/moses/FactorCollection.cpp
@@ -38,11 +38,12 @@ FactorCollection FactorCollection::s_instance;
const Factor *FactorCollection::AddFactor(const StringPiece &factorString)
{
FactorFriend to_ins;
- to_ins.in.m_string = factorString;
+ to_ins.in.m_string = factorString;
to_ins.in.m_id = m_factorId;
// If we're threaded, hope a read-only lock is sufficient.
#ifdef WITH_THREADS
- { // read=lock scope
+ {
+ // read=lock scope
boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
Set::const_iterator i = m_set.find(to_ins);
if (i != m_set.end()) return &i->in;
@@ -52,8 +53,8 @@ const Factor *FactorCollection::AddFactor(const StringPiece &factorString)
std::pair<Set::iterator, bool> ret(m_set.insert(to_ins));
if (ret.second) {
ret.first->in.m_string.set(
- memcpy(m_string_backing.Allocate(factorString.size()), factorString.data(), factorString.size()),
- factorString.size());
+ memcpy(m_string_backing.Allocate(factorString.size()), factorString.data(), factorString.size()),
+ factorString.size());
m_factorId++;
}
return &ret.first->in;
diff --git a/moses/FactorCollection.h b/moses/FactorCollection.h
index e7749244f..8c3db5da9 100644
--- a/moses/FactorCollection.h
+++ b/moses/FactorCollection.h
@@ -44,7 +44,7 @@ namespace Moses
* private and friended to FactorFriend. The STL containers can delegate
* copying, so friending the container isn't sufficient. STL containers see
* FactorFriend's public copy constructor and everybody else sees Factor's
- * private copy constructor.
+ * private copy constructor.
*/
struct FactorFriend {
Factor in;
diff --git a/moses/FeatureVector.cpp b/moses/FeatureVector.cpp
index f58bb5cab..96dd9a0ce 100644
--- a/moses/FeatureVector.cpp
+++ b/moses/FeatureVector.cpp
@@ -1,22 +1,22 @@
/*
Moses - factored phrase-based language decoder
Copyright (C) 2010 University of Edinburgh
-
-
+
+
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
-
+
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
-
+
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
+
*/
#include <algorithm>
@@ -31,744 +31,815 @@
using namespace std;
-namespace Moses {
-
- const string FName::SEP = "_";
- FName::Name2Id FName::name2id;
- vector<string> FName::id2name;
- FName::Id2Count FName::id2hopeCount;
- FName::Id2Count FName::id2fearCount;
+namespace Moses
+{
+
+const string FName::SEP = "_";
+FName::Name2Id FName::name2id;
+vector<string> FName::id2name;
+FName::Id2Count FName::id2hopeCount;
+FName::Id2Count FName::id2fearCount;
#ifdef WITH_THREADS
- boost::shared_mutex FName::m_idLock;
+boost::shared_mutex FName::m_idLock;
#endif
-
- void FName::init(const StringPiece &name) {
+
+void FName::init(const StringPiece &name)
+{
#ifdef WITH_THREADS
- //reader lock
- boost::shared_lock<boost::shared_mutex> lock(m_idLock);
+ //reader lock
+ boost::shared_lock<boost::shared_mutex> lock(m_idLock);
#endif
- Name2Id::iterator i = FindStringPiece(name2id, name);
- if (i != name2id.end()) {
- m_id = i->second;
- } else {
+ Name2Id::iterator i = FindStringPiece(name2id, name);
+ if (i != name2id.end()) {
+ m_id = i->second;
+ } else {
#ifdef WITH_THREADS
- //release the reader lock, and upgrade to writer lock
- lock.unlock();
- boost::unique_lock<boost::shared_mutex> write_lock(m_idLock);
+ //release the reader lock, and upgrade to writer lock
+ lock.unlock();
+ boost::unique_lock<boost::shared_mutex> write_lock(m_idLock);
#endif
- std::pair<std::string, size_t> to_ins;
- to_ins.first.assign(name.data(), name.size());
- to_ins.second = name2id.size();
- std::pair<Name2Id::iterator, bool> res(name2id.insert(to_ins));
- if (res.second) {
- // TODO this should be string pointers backed by the hash table.
- id2name.push_back(to_ins.first);
- }
- m_id = res.first->second;
+ std::pair<std::string, size_t> to_ins;
+ to_ins.first.assign(name.data(), name.size());
+ to_ins.second = name2id.size();
+ std::pair<Name2Id::iterator, bool> res(name2id.insert(to_ins));
+ if (res.second) {
+ // TODO this should be string pointers backed by the hash table.
+ id2name.push_back(to_ins.first);
}
+ m_id = res.first->second;
+ }
+}
+
+size_t FName::getId(const string& name)
+{
+ Name2Id::iterator i = name2id.find(name);
+ assert (i != name2id.end());
+ return i->second;
+}
+
+size_t FName::getHopeIdCount(const string& name)
+{
+ Name2Id::iterator i = name2id.find(name);
+ if (i != name2id.end()) {
+ float id = i->second;
+ return id2hopeCount[id];
+ }
+ return 0;
+}
+
+size_t FName::getFearIdCount(const string& name)
+{
+ Name2Id::iterator i = name2id.find(name);
+ if (i != name2id.end()) {
+ float id = i->second;
+ return id2fearCount[id];
}
-
- size_t FName::getId(const string& name) {
- Name2Id::iterator i = name2id.find(name);
- assert (i != name2id.end());
- return i->second;
- }
-
- size_t FName::getHopeIdCount(const string& name) {
- Name2Id::iterator i = name2id.find(name);
- if (i != name2id.end()) {
- float id = i->second;
- return id2hopeCount[id];
- }
- return 0;
- }
-
- size_t FName::getFearIdCount(const string& name) {
- Name2Id::iterator i = name2id.find(name);
- if (i != name2id.end()) {
- float id = i->second;
- return id2fearCount[id];
- }
- return 0;
- }
-
- void FName::incrementHopeId(const string& name) {
- Name2Id::iterator i = name2id.find(name);
- assert(i != name2id.end());
+ return 0;
+}
+
+void FName::incrementHopeId(const string& name)
+{
+ Name2Id::iterator i = name2id.find(name);
+ assert(i != name2id.end());
#ifdef WITH_THREADS
- // get upgradable lock and upgrade to writer lock
- boost::upgrade_lock<boost::shared_mutex> upgradeLock(m_idLock);
- boost::upgrade_to_unique_lock<boost::shared_mutex> uniqueLock(upgradeLock);
+ // get upgradable lock and upgrade to writer lock
+ boost::upgrade_lock<boost::shared_mutex> upgradeLock(m_idLock);
+ boost::upgrade_to_unique_lock<boost::shared_mutex> uniqueLock(upgradeLock);
#endif
- id2hopeCount[i->second] += 1;
- }
+ id2hopeCount[i->second] += 1;
+}
- void FName::incrementFearId(const string& name) {
- Name2Id::iterator i = name2id.find(name);
- assert(i != name2id.end());
+void FName::incrementFearId(const string& name)
+{
+ Name2Id::iterator i = name2id.find(name);
+ assert(i != name2id.end());
#ifdef WITH_THREADS
- // get upgradable lock and upgrade to writer lock
- boost::upgrade_lock<boost::shared_mutex> upgradeLock(m_idLock);
- boost::upgrade_to_unique_lock<boost::shared_mutex> uniqueLock(upgradeLock);
+ // get upgradable lock and upgrade to writer lock
+ boost::upgrade_lock<boost::shared_mutex> upgradeLock(m_idLock);
+ boost::upgrade_to_unique_lock<boost::shared_mutex> uniqueLock(upgradeLock);
#endif
- id2fearCount[i->second] += 1;
- }
-
- void FName::eraseId(size_t id) {
+ id2fearCount[i->second] += 1;
+}
+
+void FName::eraseId(size_t id)
+{
#ifdef WITH_THREADS
- // get upgradable lock and upgrade to writer lock
- boost::upgrade_lock<boost::shared_mutex> upgradeLock(m_idLock);
- boost::upgrade_to_unique_lock<boost::shared_mutex> uniqueLock(upgradeLock);
+ // get upgradable lock and upgrade to writer lock
+ boost::upgrade_lock<boost::shared_mutex> upgradeLock(m_idLock);
+ boost::upgrade_to_unique_lock<boost::shared_mutex> uniqueLock(upgradeLock);
#endif
- id2hopeCount.erase(id);
- id2fearCount.erase(id);
- }
-
- std::ostream& operator<<( std::ostream& out, const FName& name) {
- out << name.name();
- return out;
- }
-
- size_t FName::hash() const {
- return boost::hash_value(m_id);
- }
-
- const std::string& FName::name() const {
- return id2name[m_id];
- }
-
-
- bool FName::operator==(const FName& rhs) const {
- return m_id == rhs.m_id;
- }
-
- bool FName::operator!=(const FName& rhs) const {
- return ! (*this == rhs);
- }
-
- FVector::FVector(size_t coreFeatures) : m_coreFeatures(coreFeatures) {}
-
- void FVector::resize(size_t newsize) {
- valarray<FValue> oldValues(m_coreFeatures);
- m_coreFeatures.resize(newsize);
- for (size_t i = 0; i < min(m_coreFeatures.size(), oldValues.size()); ++i) {
- m_coreFeatures[i] = oldValues[i];
- }
- }
-
- void FVector::clear() {
- m_coreFeatures.resize(0);
- m_features.clear();
- }
-
- bool FVector::load(const std::string& filename) {
- clear();
- ifstream in (filename.c_str());
- if (!in) {
- return false;
- }
- string line;
- while(getline(in,line)) {
- if (line[0] == '#') continue;
- istringstream linestream(line);
- string namestring;
- FValue value;
- linestream >> namestring;
- linestream >> value;
- FName fname(namestring);
- //cerr << "Setting sparse weight " << fname << " to value " << value << "." << endl;
- set(fname,value);
- }
- return true;
- }
+ id2hopeCount.erase(id);
+ id2fearCount.erase(id);
+}
- void FVector::save(const string& filename) const {
- ofstream out(filename.c_str());
- if (!out) {
- ostringstream msg;
- msg << "Unable to open " << filename;
- throw runtime_error(msg.str());
- }
- write(out);
- out.close();
+std::ostream& operator<<( std::ostream& out, const FName& name)
+{
+ out << name.name();
+ return out;
+}
+
+size_t FName::hash() const
+{
+ return boost::hash_value(m_id);
+}
+
+const std::string& FName::name() const
+{
+ return id2name[m_id];
+}
+
+
+bool FName::operator==(const FName& rhs) const
+{
+ return m_id == rhs.m_id;
+}
+
+bool FName::operator!=(const FName& rhs) const
+{
+ return ! (*this == rhs);
+}
+
+FVector::FVector(size_t coreFeatures) : m_coreFeatures(coreFeatures) {}
+
+void FVector::resize(size_t newsize)
+{
+ valarray<FValue> oldValues(m_coreFeatures);
+ m_coreFeatures.resize(newsize);
+ for (size_t i = 0; i < min(m_coreFeatures.size(), oldValues.size()); ++i) {
+ m_coreFeatures[i] = oldValues[i];
}
+}
- void FVector::write(ostream& out) const {
- for (const_iterator i = cbegin(); i != cend(); ++i) {
- out << i->first << " " << i->second << endl;
- }
+void FVector::clear()
+{
+ m_coreFeatures.resize(0);
+ m_features.clear();
+}
+
+bool FVector::load(const std::string& filename)
+{
+ clear();
+ ifstream in (filename.c_str());
+ if (!in) {
+ return false;
+ }
+ string line;
+ while(getline(in,line)) {
+ if (line[0] == '#') continue;
+ istringstream linestream(line);
+ string namestring;
+ FValue value;
+ linestream >> namestring;
+ linestream >> value;
+ FName fname(namestring);
+ //cerr << "Setting sparse weight " << fname << " to value " << value << "." << endl;
+ set(fname,value);
+ }
+ return true;
+}
+
+void FVector::save(const string& filename) const
+{
+ ofstream out(filename.c_str());
+ if (!out) {
+ ostringstream msg;
+ msg << "Unable to open " << filename;
+ throw runtime_error(msg.str());
}
+ write(out);
+ out.close();
+}
- static bool equalsTolerance(FValue lhs, FValue rhs) {
- if (lhs == rhs) return true;
- static const FValue TOLERANCE = 1e-4;
- FValue diff = abs(lhs-rhs);
- FValue mean = (abs(lhs)+abs(rhs))/2;
- //cerr << "ET " << lhs << " " << rhs << " " << diff << " " << mean << " " << endl;
- return diff/mean < TOLERANCE ;
+void FVector::write(ostream& out) const
+{
+ for (const_iterator i = cbegin(); i != cend(); ++i) {
+ out << i->first << " " << i->second << endl;
}
-
- bool FVector::operator== (const FVector& rhs) const {
- if (this == &rhs) {
- return true;
- }
- if (m_coreFeatures.size() != rhs.m_coreFeatures.size()) {
- return false;
- }
- for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
- if (!equalsTolerance(m_coreFeatures[i], rhs.m_coreFeatures[i])) return false;
- }
- for (const_iterator i = cbegin(); i != cend(); ++i) {
- if (!equalsTolerance(i->second,rhs.get(i->first))) return false;
- }
- for (const_iterator i = rhs.cbegin(); i != rhs.cend(); ++i) {
- if (!equalsTolerance(i->second, get(i->first))) return false;
- }
+}
+
+static bool equalsTolerance(FValue lhs, FValue rhs)
+{
+ if (lhs == rhs) return true;
+ static const FValue TOLERANCE = 1e-4;
+ FValue diff = abs(lhs-rhs);
+ FValue mean = (abs(lhs)+abs(rhs))/2;
+ //cerr << "ET " << lhs << " " << rhs << " " << diff << " " << mean << " " << endl;
+ return diff/mean < TOLERANCE ;
+}
+
+bool FVector::operator== (const FVector& rhs) const
+{
+ if (this == &rhs) {
return true;
}
-
- bool FVector::operator!= (const FVector& rhs) const {
- return ! (*this == rhs);
+ if (m_coreFeatures.size() != rhs.m_coreFeatures.size()) {
+ return false;
}
-
- ProxyFVector FVector::operator[](const FName& name) {
- // At this point, we don't know whether operator[] was called, so we return
- // a proxy object and defer the decision until later
- return ProxyFVector(this, name);
+ for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+ if (!equalsTolerance(m_coreFeatures[i], rhs.m_coreFeatures[i])) return false;
}
+ for (const_iterator i = cbegin(); i != cend(); ++i) {
+ if (!equalsTolerance(i->second,rhs.get(i->first))) return false;
+ }
+ for (const_iterator i = rhs.cbegin(); i != rhs.cend(); ++i) {
+ if (!equalsTolerance(i->second, get(i->first))) return false;
+ }
+ return true;
+}
+
+bool FVector::operator!= (const FVector& rhs) const
+{
+ return ! (*this == rhs);
+}
+
+ProxyFVector FVector::operator[](const FName& name)
+{
+ // At this point, we don't know whether operator[] was called, so we return
+ // a proxy object and defer the decision until later
+ return ProxyFVector(this, name);
+}
- /** Equivalent for core features. */
- FValue& FVector::operator[](size_t index) {
- return m_coreFeatures[index];
+/** Equivalent for core features. */
+FValue& FVector::operator[](size_t index)
+{
+ return m_coreFeatures[index];
+}
+
+
+FValue FVector::operator[](const FName& name) const
+{
+ return get(name);
+}
+
+FValue FVector::operator[](size_t index) const
+{
+ return m_coreFeatures[index];
+}
+
+ostream& FVector::print(ostream& out) const
+{
+ out << "core=(";
+ for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+ out << m_coreFeatures[i];
+ if (i + 1 < m_coreFeatures.size()) {
+ out << ",";
+ }
}
+ out << ") ";
+ for (const_iterator i = cbegin(); i != cend(); ++i) {
+ if (i != cbegin())
+ out << " ";
+ out << i->first << "=" << i->second;
+ }
+ return out;
+}
+
+ostream& operator<<(ostream& out, const FVector& fv)
+{
+ return fv.print(out);
+}
-
- FValue FVector::operator[](const FName& name) const {
- return get(name);
+const FValue& FVector::get(const FName& name) const
+{
+ static const FValue DEFAULT = 0;
+ const_iterator fi = m_features.find(name);
+ if (fi == m_features.end()) {
+ return DEFAULT;
+ } else {
+ return fi->second;
}
+}
- FValue FVector::operator[](size_t index) const {
- return m_coreFeatures[index];
+FValue FVector::getBackoff(const FName& name, float backoff) const
+{
+ const_iterator fi = m_features.find(name);
+ if (fi == m_features.end()) {
+ return backoff;
+ } else {
+ return fi->second;
}
+}
- ostream& FVector::print(ostream& out) const {
- out << "core=(";
- for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
- out << m_coreFeatures[i];
- if (i + 1 < m_coreFeatures.size()) {
- out << ",";
- }
- }
- out << ") ";
- for (const_iterator i = cbegin(); i != cend(); ++i) {
- if (i != cbegin())
- out << " ";
- out << i->first << "=" << i->second;
- }
- return out;
- }
-
- ostream& operator<<(ostream& out, const FVector& fv) {
- return fv.print(out);
- }
-
- const FValue& FVector::get(const FName& name) const {
- static const FValue DEFAULT = 0;
- const_iterator fi = m_features.find(name);
- if (fi == m_features.end()) {
- return DEFAULT;
- } else {
- return fi->second;
+void FVector::thresholdScale(FValue maxValue )
+{
+ FValue factor = 1.0;
+ for (const_iterator i = cbegin(); i != cend(); ++i) {
+ FValue value = i->second;
+ if (abs(value)*factor > maxValue) {
+ factor = abs(value) / maxValue;
}
}
+ operator*=(factor);
+}
- FValue FVector::getBackoff(const FName& name, float backoff) const {
- const_iterator fi = m_features.find(name);
- if (fi == m_features.end()) {
- return backoff;
- } else {
- return fi->second;
+void FVector::capMax(FValue maxValue)
+{
+ for (const_iterator i = cbegin(); i != cend(); ++i)
+ if (i->second > maxValue)
+ set(i->first, maxValue);
+}
+
+void FVector::capMin(FValue minValue)
+{
+ for (const_iterator i = cbegin(); i != cend(); ++i)
+ if (i->second < minValue)
+ set(i->first, minValue);
+}
+
+void FVector::set(const FName& name, const FValue& value)
+{
+ m_features[name] = value;
+}
+
+void FVector::printCoreFeatures()
+{
+ cerr << "core=(";
+ for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+ cerr << m_coreFeatures[i];
+ if (i + 1 < m_coreFeatures.size()) {
+ cerr << ",";
}
}
+ cerr << ") ";
+}
- void FVector::thresholdScale(FValue maxValue ) {
- FValue factor = 1.0;
- for (const_iterator i = cbegin(); i != cend(); ++i) {
- FValue value = i->second;
- if (abs(value)*factor > maxValue) {
- factor = abs(value) / maxValue;
- }
+FVector& FVector::operator+= (const FVector& rhs)
+{
+ if (rhs.m_coreFeatures.size() > m_coreFeatures.size())
+ resize(rhs.m_coreFeatures.size());
+ for (const_iterator i = rhs.cbegin(); i != rhs.cend(); ++i)
+ set(i->first, get(i->first) + i->second);
+ for (size_t i = 0; i < rhs.m_coreFeatures.size(); ++i)
+ m_coreFeatures[i] += rhs.m_coreFeatures[i];
+ return *this;
+}
+
+// add only sparse features
+void FVector::sparsePlusEquals(const FVector& rhs)
+{
+ for (const_iterator i = rhs.cbegin(); i != rhs.cend(); ++i)
+ set(i->first, get(i->first) + i->second);
+}
+
+// assign only core features
+void FVector::coreAssign(const FVector& rhs)
+{
+ for (size_t i = 0; i < rhs.m_coreFeatures.size(); ++i)
+ m_coreFeatures[i] = rhs.m_coreFeatures[i];
+}
+
+void FVector::incrementSparseHopeFeatures()
+{
+ for (const_iterator i = cbegin(); i != cend(); ++i)
+ FName::incrementHopeId((i->first).name());
+}
+
+void FVector::incrementSparseFearFeatures()
+{
+ for (const_iterator i = cbegin(); i != cend(); ++i)
+ FName::incrementFearId((i->first).name());
+}
+
+void FVector::printSparseHopeFeatureCounts(std::ofstream& out)
+{
+ for (const_iterator i = cbegin(); i != cend(); ++i)
+ out << (i->first).name() << ": " << FName::getHopeIdCount((i->first).name()) << std::endl;
+}
+
+void FVector::printSparseFearFeatureCounts(std::ofstream& out)
+{
+ for (const_iterator i = cbegin(); i != cend(); ++i)
+ out << (i->first).name() << ": " << FName::getFearIdCount((i->first).name()) << std::endl;
+}
+
+void FVector::printSparseHopeFeatureCounts()
+{
+ for (const_iterator i = cbegin(); i != cend(); ++i)
+ std::cerr << (i->first).name() << ": " << FName::getHopeIdCount((i->first).name()) << std::endl;
+}
+
+void FVector::printSparseFearFeatureCounts()
+{
+ for (const_iterator i = cbegin(); i != cend(); ++i)
+ std::cerr << (i->first).name() << ": " << FName::getFearIdCount((i->first).name()) << std::endl;
+}
+
+size_t FVector::pruneSparseFeatures(size_t threshold)
+{
+ size_t count = 0;
+ vector<FName> toErase;
+ for (const_iterator i = cbegin(); i != cend(); ++i) {
+ const std::string& fname = (i->first).name();
+ if (FName::getHopeIdCount(fname) < threshold && FName::getFearIdCount(fname) < threshold) {
+ toErase.push_back(i->first);
+ std::cerr << "pruning: " << fname << " (" << FName::getHopeIdCount(fname) << ", " << FName::getFearIdCount(fname) << ")" << std::endl;
+ FName::eraseId(FName::getId(fname));
+ ++count;
}
- operator*=(factor);
}
- void FVector::capMax(FValue maxValue) {
- for (const_iterator i = cbegin(); i != cend(); ++i)
- if (i->second > maxValue)
- set(i->first, maxValue);
- }
+ for (size_t i = 0; i < toErase.size(); ++i)
+ m_features.erase(toErase[i]);
- void FVector::capMin(FValue minValue) {
- for (const_iterator i = cbegin(); i != cend(); ++i)
- if (i->second < minValue)
- set(i->first, minValue);
- }
+ return count;
+}
- void FVector::set(const FName& name, const FValue& value) {
- m_features[name] = value;
- }
-
- void FVector::printCoreFeatures() {
- cerr << "core=(";
- for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
- cerr << m_coreFeatures[i];
- if (i + 1 < m_coreFeatures.size()) {
- cerr << ",";
- }
+size_t FVector::pruneZeroWeightFeatures()
+{
+ size_t count = 0;
+ vector<FName> toErase;
+ for (const_iterator i = cbegin(); i != cend(); ++i) {
+ const std::string& fname = (i->first).name();
+ if (i->second == 0) {
+ toErase.push_back(i->first);
+ //std::cerr << "prune: " << fname << std::endl;
+ FName::eraseId(FName::getId(fname));
+ ++count;
}
- cerr << ") ";
}
- FVector& FVector::operator+= (const FVector& rhs) {
- if (rhs.m_coreFeatures.size() > m_coreFeatures.size())
- resize(rhs.m_coreFeatures.size());
- for (const_iterator i = rhs.cbegin(); i != rhs.cend(); ++i)
- set(i->first, get(i->first) + i->second);
- for (size_t i = 0; i < rhs.m_coreFeatures.size(); ++i)
- m_coreFeatures[i] += rhs.m_coreFeatures[i];
- return *this;
- }
-
- // add only sparse features
- void FVector::sparsePlusEquals(const FVector& rhs) {
- for (const_iterator i = rhs.cbegin(); i != rhs.cend(); ++i)
- set(i->first, get(i->first) + i->second);
- }
-
- // assign only core features
- void FVector::coreAssign(const FVector& rhs) {
- for (size_t i = 0; i < rhs.m_coreFeatures.size(); ++i)
- m_coreFeatures[i] = rhs.m_coreFeatures[i];
+ for (size_t i = 0; i < toErase.size(); ++i)
+ m_features.erase(toErase[i]);
+
+ return count;
+}
+
+void FVector::updateConfidenceCounts(const FVector& weightUpdate, bool signedCounts)
+{
+ for (size_t i = 0; i < weightUpdate.m_coreFeatures.size(); ++i) {
+ if (signedCounts) {
+ //int sign = weightUpdate.m_coreFeatures[i] >= 0 ? 1 : -1;
+ //m_coreFeatures[i] += (weightUpdate.m_coreFeatures[i] * weightUpdate.m_coreFeatures[i]) * sign;
+ m_coreFeatures[i] += weightUpdate.m_coreFeatures[i];
+ } else
+ //m_coreFeatures[i] += (weightUpdate.m_coreFeatures[i] * weightUpdate.m_coreFeatures[i]);
+ m_coreFeatures[i] += abs(weightUpdate.m_coreFeatures[i]);
+ }
+
+ for (const_iterator i = weightUpdate.cbegin(); i != weightUpdate.cend(); ++i) {
+ if (weightUpdate[i->first] == 0)
+ continue;
+ float value = get(i->first);
+ if (signedCounts) {
+ //int sign = weightUpdate[i->first] >= 0 ? 1 : -1;
+ //value += (weightUpdate[i->first] * weightUpdate[i->first]) * sign;
+ value += weightUpdate[i->first];
+ } else
+ //value += (weightUpdate[i->first] * weightUpdate[i->first]);
+ value += abs(weightUpdate[i->first]);
+ set(i->first, value);
}
-
- void FVector::incrementSparseHopeFeatures() {
- for (const_iterator i = cbegin(); i != cend(); ++i)
- FName::incrementHopeId((i->first).name());
+}
+
+void FVector::updateLearningRates(float decay_core, float decay_sparse, const FVector &confidenceCounts, float core_r0, float sparse_r0)
+{
+ for (size_t i = 0; i < confidenceCounts.m_coreFeatures.size(); ++i) {
+ m_coreFeatures[i] = 1.0/(1.0/core_r0 + decay_core * abs(confidenceCounts.m_coreFeatures[i]));
}
- void FVector::incrementSparseFearFeatures() {
- for (const_iterator i = cbegin(); i != cend(); ++i)
- FName::incrementFearId((i->first).name());
+ for (const_iterator i = confidenceCounts.cbegin(); i != confidenceCounts.cend(); ++i) {
+ float value = 1.0/(1.0/sparse_r0 + decay_sparse * abs(i->second));
+ set(i->first, value);
}
-
- void FVector::printSparseHopeFeatureCounts(std::ofstream& out) {
- for (const_iterator i = cbegin(); i != cend(); ++i)
- out << (i->first).name() << ": " << FName::getHopeIdCount((i->first).name()) << std::endl;
+}
+
+// count non-zero occurrences for all sparse features
+void FVector::setToBinaryOf(const FVector& rhs)
+{
+ for (const_iterator i = rhs.cbegin(); i != rhs.cend(); ++i)
+ if (rhs.get(i->first) != 0)
+ set(i->first, 1);
+ for (size_t i = 0; i < rhs.m_coreFeatures.size(); ++i)
+ m_coreFeatures[i] = 1;
+}
+
+// divide only core features by scalar
+FVector& FVector::coreDivideEquals(float scalar)
+{
+ for (size_t i = 0; i < m_coreFeatures.size(); ++i)
+ m_coreFeatures[i] /= scalar;
+ return *this;
+}
+
+// lhs vector is a sum of vectors, rhs vector holds number of non-zero summands
+FVector& FVector::divideEquals(const FVector& rhs)
+{
+ assert(m_coreFeatures.size() == rhs.m_coreFeatures.size());
+ for (const_iterator i = rhs.cbegin(); i != rhs.cend(); ++i)
+ set(i->first, get(i->first)/rhs.get(i->first)); // divide by number of summands
+ for (size_t i = 0; i < rhs.m_coreFeatures.size(); ++i)
+ m_coreFeatures[i] /= rhs.m_coreFeatures[i]; // divide by number of summands
+ return *this;
+}
+
+FVector& FVector::operator-= (const FVector& rhs)
+{
+ if (rhs.m_coreFeatures.size() > m_coreFeatures.size())
+ resize(rhs.m_coreFeatures.size());
+ for (const_iterator i = rhs.cbegin(); i != rhs.cend(); ++i)
+ set(i->first, get(i->first) -(i->second));
+ for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+ if (i < rhs.m_coreFeatures.size()) {
+ m_coreFeatures[i] -= rhs.m_coreFeatures[i];
+ }
}
+ return *this;
+}
- void FVector::printSparseFearFeatureCounts(std::ofstream& out) {
- for (const_iterator i = cbegin(); i != cend(); ++i)
- out << (i->first).name() << ": " << FName::getFearIdCount((i->first).name()) << std::endl;
+FVector& FVector::operator*= (const FVector& rhs)
+{
+ if (rhs.m_coreFeatures.size() > m_coreFeatures.size()) {
+ resize(rhs.m_coreFeatures.size());
}
-
- void FVector::printSparseHopeFeatureCounts() {
- for (const_iterator i = cbegin(); i != cend(); ++i)
- std::cerr << (i->first).name() << ": " << FName::getHopeIdCount((i->first).name()) << std::endl;
+ for (iterator i = begin(); i != end(); ++i) {
+ FValue lhsValue = i->second;
+ FValue rhsValue = rhs.get(i->first);
+ set(i->first,lhsValue*rhsValue);
}
-
- void FVector::printSparseFearFeatureCounts() {
- for (const_iterator i = cbegin(); i != cend(); ++i)
- std::cerr << (i->first).name() << ": " << FName::getFearIdCount((i->first).name()) << std::endl;
+ for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+ if (i < rhs.m_coreFeatures.size()) {
+ m_coreFeatures[i] *= rhs.m_coreFeatures[i];
+ } else {
+ m_coreFeatures[i] = 0;
+ }
}
+ return *this;
+}
- size_t FVector::pruneSparseFeatures(size_t threshold) {
- size_t count = 0;
- vector<FName> toErase;
- for (const_iterator i = cbegin(); i != cend(); ++i) {
- const std::string& fname = (i->first).name();
- if (FName::getHopeIdCount(fname) < threshold && FName::getFearIdCount(fname) < threshold) {
- toErase.push_back(i->first);
- std::cerr << "pruning: " << fname << " (" << FName::getHopeIdCount(fname) << ", " << FName::getFearIdCount(fname) << ")" << std::endl;
- FName::eraseId(FName::getId(fname));
- ++count;
- }
- }
-
- for (size_t i = 0; i < toErase.size(); ++i)
- m_features.erase(toErase[i]);
-
- return count;
- }
-
- size_t FVector::pruneZeroWeightFeatures() {
- size_t count = 0;
- vector<FName> toErase;
- for (const_iterator i = cbegin(); i != cend(); ++i) {
- const std::string& fname = (i->first).name();
- if (i->second == 0) {
- toErase.push_back(i->first);
- //std::cerr << "prune: " << fname << std::endl;
- FName::eraseId(FName::getId(fname));
- ++count;
- }
- }
-
- for (size_t i = 0; i < toErase.size(); ++i)
- m_features.erase(toErase[i]);
-
- return count;
- }
-
- void FVector::updateConfidenceCounts(const FVector& weightUpdate, bool signedCounts) {
- for (size_t i = 0; i < weightUpdate.m_coreFeatures.size(); ++i) {
- if (signedCounts) {
- //int sign = weightUpdate.m_coreFeatures[i] >= 0 ? 1 : -1;
- //m_coreFeatures[i] += (weightUpdate.m_coreFeatures[i] * weightUpdate.m_coreFeatures[i]) * sign;
- m_coreFeatures[i] += weightUpdate.m_coreFeatures[i];
- }
- else
- //m_coreFeatures[i] += (weightUpdate.m_coreFeatures[i] * weightUpdate.m_coreFeatures[i]);
- m_coreFeatures[i] += abs(weightUpdate.m_coreFeatures[i]);
- }
-
- for (const_iterator i = weightUpdate.cbegin(); i != weightUpdate.cend(); ++i) {
- if (weightUpdate[i->first] == 0)
- continue;
- float value = get(i->first);
- if (signedCounts) {
- //int sign = weightUpdate[i->first] >= 0 ? 1 : -1;
- //value += (weightUpdate[i->first] * weightUpdate[i->first]) * sign;
- value += weightUpdate[i->first];
+FVector& FVector::operator/= (const FVector& rhs)
+{
+ if (rhs.m_coreFeatures.size() > m_coreFeatures.size()) {
+ resize(rhs.m_coreFeatures.size());
+ }
+ for (iterator i = begin(); i != end(); ++i) {
+ FValue lhsValue = i->second;
+ FValue rhsValue = rhs.get(i->first);
+ set(i->first, lhsValue / rhsValue) ;
+ }
+ for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+ if (i < rhs.m_coreFeatures.size()) {
+ m_coreFeatures[i] /= rhs.m_coreFeatures[i];
+ } else {
+ if (m_coreFeatures[i] < 0) {
+ m_coreFeatures[i] = -numeric_limits<FValue>::infinity();
+ } else if (m_coreFeatures[i] > 0) {
+ m_coreFeatures[i] = numeric_limits<FValue>::infinity();
}
- else
- //value += (weightUpdate[i->first] * weightUpdate[i->first]);
- value += abs(weightUpdate[i->first]);
- set(i->first, value);
}
}
+ return *this;
+}
- void FVector::updateLearningRates(float decay_core, float decay_sparse, const FVector &confidenceCounts, float core_r0, float sparse_r0) {
- for (size_t i = 0; i < confidenceCounts.m_coreFeatures.size(); ++i) {
- m_coreFeatures[i] = 1.0/(1.0/core_r0 + decay_core * abs(confidenceCounts.m_coreFeatures[i]));
- }
-
- for (const_iterator i = confidenceCounts.cbegin(); i != confidenceCounts.cend(); ++i) {
- float value = 1.0/(1.0/sparse_r0 + decay_sparse * abs(i->second));
- set(i->first, value);
- }
+FVector& FVector::operator*= (const FValue& rhs)
+{
+ //NB Could do this with boost::bind ?
+ for (iterator i = begin(); i != end(); ++i) {
+ i->second *= rhs;
}
+ m_coreFeatures *= rhs;
+ return *this;
+}
- // count non-zero occurrences for all sparse features
- void FVector::setToBinaryOf(const FVector& rhs) {
- for (const_iterator i = rhs.cbegin(); i != rhs.cend(); ++i)
- if (rhs.get(i->first) != 0)
- set(i->first, 1);
- for (size_t i = 0; i < rhs.m_coreFeatures.size(); ++i)
- m_coreFeatures[i] = 1;
- }
-
- // divide only core features by scalar
- FVector& FVector::coreDivideEquals(float scalar) {
- for (size_t i = 0; i < m_coreFeatures.size(); ++i)
- m_coreFeatures[i] /= scalar;
- return *this;
- }
-
- // lhs vector is a sum of vectors, rhs vector holds number of non-zero summands
- FVector& FVector::divideEquals(const FVector& rhs) {
- assert(m_coreFeatures.size() == rhs.m_coreFeatures.size());
- for (const_iterator i = rhs.cbegin(); i != rhs.cend(); ++i)
- set(i->first, get(i->first)/rhs.get(i->first)); // divide by number of summands
- for (size_t i = 0; i < rhs.m_coreFeatures.size(); ++i)
- m_coreFeatures[i] /= rhs.m_coreFeatures[i]; // divide by number of summands
- return *this;
- }
-
- FVector& FVector::operator-= (const FVector& rhs) {
- if (rhs.m_coreFeatures.size() > m_coreFeatures.size())
- resize(rhs.m_coreFeatures.size());
- for (const_iterator i = rhs.cbegin(); i != rhs.cend(); ++i)
- set(i->first, get(i->first) -(i->second));
- for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
- if (i < rhs.m_coreFeatures.size()) {
- m_coreFeatures[i] -= rhs.m_coreFeatures[i];
- }
- }
- return *this;
+FVector& FVector::operator/= (const FValue& rhs)
+{
+ for (iterator i = begin(); i != end(); ++i) {
+ i->second /= rhs;
}
-
- FVector& FVector::operator*= (const FVector& rhs) {
- if (rhs.m_coreFeatures.size() > m_coreFeatures.size()) {
- resize(rhs.m_coreFeatures.size());
- }
- for (iterator i = begin(); i != end(); ++i) {
- FValue lhsValue = i->second;
- FValue rhsValue = rhs.get(i->first);
- set(i->first,lhsValue*rhsValue);
- }
- for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
- if (i < rhs.m_coreFeatures.size()) {
- m_coreFeatures[i] *= rhs.m_coreFeatures[i];
- } else {
- m_coreFeatures[i] = 0;
- }
- }
- return *this;
+ m_coreFeatures /= rhs;
+ return *this;
+}
+
+FVector& FVector::multiplyEqualsBackoff(const FVector& rhs, float backoff)
+{
+ if (rhs.m_coreFeatures.size() > m_coreFeatures.size()) {
+ resize(rhs.m_coreFeatures.size());
}
-
- FVector& FVector::operator/= (const FVector& rhs) {
- if (rhs.m_coreFeatures.size() > m_coreFeatures.size()) {
- resize(rhs.m_coreFeatures.size());
- }
- for (iterator i = begin(); i != end(); ++i) {
- FValue lhsValue = i->second;
- FValue rhsValue = rhs.get(i->first);
- set(i->first, lhsValue / rhsValue) ;
- }
- for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
- if (i < rhs.m_coreFeatures.size()) {
- m_coreFeatures[i] /= rhs.m_coreFeatures[i];
- } else {
- if (m_coreFeatures[i] < 0) {
- m_coreFeatures[i] = -numeric_limits<FValue>::infinity();
- } else if (m_coreFeatures[i] > 0) {
- m_coreFeatures[i] = numeric_limits<FValue>::infinity();
- }
- }
- }
- return *this;
+ for (iterator i = begin(); i != end(); ++i) {
+ FValue lhsValue = i->second;
+ FValue rhsValue = rhs.getBackoff(i->first, backoff);
+ set(i->first,lhsValue*rhsValue);
}
-
- FVector& FVector::operator*= (const FValue& rhs) {
- //NB Could do this with boost::bind ?
- for (iterator i = begin(); i != end(); ++i) {
- i->second *= rhs;
+ for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+ if (i < rhs.m_coreFeatures.size()) {
+ m_coreFeatures[i] *= rhs.m_coreFeatures[i];
+ } else {
+ m_coreFeatures[i] = 0;
}
- m_coreFeatures *= rhs;
- return *this;
}
-
- FVector& FVector::operator/= (const FValue& rhs) {
- for (iterator i = begin(); i != end(); ++i) {
- i->second /= rhs;
- }
- m_coreFeatures /= rhs;
- return *this;
+ return *this;
+}
+
+FVector& FVector::multiplyEquals(float core_r0, float sparse_r0)
+{
+ for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+ m_coreFeatures[i] *= core_r0;
}
+ for (iterator i = begin(); i != end(); ++i)
+ set(i->first,(i->second)*sparse_r0);
+ return *this;
+}
- FVector& FVector::multiplyEqualsBackoff(const FVector& rhs, float backoff) {
- if (rhs.m_coreFeatures.size() > m_coreFeatures.size()) {
- resize(rhs.m_coreFeatures.size());
- }
- for (iterator i = begin(); i != end(); ++i) {
- FValue lhsValue = i->second;
- FValue rhsValue = rhs.getBackoff(i->first, backoff);
- set(i->first,lhsValue*rhsValue);
- }
- for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
- if (i < rhs.m_coreFeatures.size()) {
- m_coreFeatures[i] *= rhs.m_coreFeatures[i];
- } else {
- m_coreFeatures[i] = 0;
- }
- }
- return *this;
+FValue FVector::l1norm() const
+{
+ FValue norm = 0;
+ for (const_iterator i = cbegin(); i != cend(); ++i) {
+ norm += abs(i->second);
}
-
- FVector& FVector::multiplyEquals(float core_r0, float sparse_r0) {
- for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
- m_coreFeatures[i] *= core_r0;
- }
- for (iterator i = begin(); i != end(); ++i)
- set(i->first,(i->second)*sparse_r0);
- return *this;
- }
-
- FValue FVector::l1norm() const {
- FValue norm = 0;
- for (const_iterator i = cbegin(); i != cend(); ++i) {
- norm += abs(i->second);
- }
- for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
- norm += abs(m_coreFeatures[i]);
- }
- return norm;
+ for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+ norm += abs(m_coreFeatures[i]);
}
+ return norm;
+}
+
+FValue FVector::l1norm_coreFeatures() const
+{
+ FValue norm = 0;
+ // ignore Bleu score feature (last feature)
+ for (size_t i = 0; i < m_coreFeatures.size()-1; ++i)
+ norm += abs(m_coreFeatures[i]);
+ return norm;
+}
+
+FValue FVector::l2norm() const
+{
+ return sqrt(inner_product(*this));
+}
- FValue FVector::l1norm_coreFeatures() const {
- FValue norm = 0;
- // ignore Bleu score feature (last feature)
- for (size_t i = 0; i < m_coreFeatures.size()-1; ++i)
- norm += abs(m_coreFeatures[i]);
- return norm;
+FValue FVector::linfnorm() const
+{
+ FValue norm = 0;
+ for (const_iterator i = cbegin(); i != cend(); ++i) {
+ float absValue = abs(i->second);
+ if (absValue > norm)
+ norm = absValue;
}
-
- FValue FVector::l2norm() const {
- return sqrt(inner_product(*this));
+ for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+ float absValue = abs(m_coreFeatures[i]);
+ if (absValue > norm)
+ norm = absValue;
}
+ return norm;
+}
- FValue FVector::linfnorm() const {
- FValue norm = 0;
- for (const_iterator i = cbegin(); i != cend(); ++i) {
- float absValue = abs(i->second);
- if (absValue > norm)
- norm = absValue;
- }
- for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
- float absValue = abs(m_coreFeatures[i]);
- if (absValue > norm)
- norm = absValue;
+size_t FVector::l1regularize(float lambda)
+{
+ for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+ float value = m_coreFeatures[i];
+ if (value > 0) {
+ m_coreFeatures[i] = max(0.0f, value - lambda);
+ } else {
+ m_coreFeatures[i] = min(0.0f, value + lambda);
}
- return norm;
}
- size_t FVector::l1regularize(float lambda) {
- for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
- float value = m_coreFeatures[i];
- if (value > 0) {
- m_coreFeatures[i] = max(0.0f, value - lambda);
- }
- else {
- m_coreFeatures[i] = min(0.0f, value + lambda);
- }
- }
+ size_t numberPruned = size();
+ vector<FName> toErase;
+ for (iterator i = begin(); i != end(); ++i) {
+ float value = i->second;
+ if (value != 0.0f) {
+ if (value > 0)
+ value = max(0.0f, value - lambda);
+ else
+ value = min(0.0f, value + lambda);
- size_t numberPruned = size();
- vector<FName> toErase;
- for (iterator i = begin(); i != end(); ++i) {
- float value = i->second;
- if (value != 0.0f) {
- if (value > 0)
- value = max(0.0f, value - lambda);
- else
- value = min(0.0f, value + lambda);
-
- if (value != 0.0f)
- i->second = value;
- else {
- toErase.push_back(i->first);
- const std::string& fname = (i->first).name();
- FName::eraseId(FName::getId(fname));
- }
+ if (value != 0.0f)
+ i->second = value;
+ else {
+ toErase.push_back(i->first);
+ const std::string& fname = (i->first).name();
+ FName::eraseId(FName::getId(fname));
}
}
-
- // erase features that have become zero
- for (size_t i = 0; i < toErase.size(); ++i)
- m_features.erase(toErase[i]);
- numberPruned -= size();
- return numberPruned;
}
- void FVector::l2regularize(float lambda) {
- for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
- m_coreFeatures[i] *= (1 - lambda);
- }
+ // erase features that have become zero
+ for (size_t i = 0; i < toErase.size(); ++i)
+ m_features.erase(toErase[i]);
+ numberPruned -= size();
+ return numberPruned;
+}
- for (iterator i = begin(); i != end(); ++i) {
- i->second *= (1 - lambda);
- }
+void FVector::l2regularize(float lambda)
+{
+ for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+ m_coreFeatures[i] *= (1 - lambda);
}
- size_t FVector::sparseL1regularize(float lambda) {
- /*for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
- float value = m_coreFeatures[i];
- if (value > 0) {
- m_coreFeatures[i] = max(0.0f, value - lambda);
- }
+ for (iterator i = begin(); i != end(); ++i) {
+ i->second *= (1 - lambda);
+ }
+}
+
+size_t FVector::sparseL1regularize(float lambda)
+{
+ /*for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+ float value = m_coreFeatures[i];
+ if (value > 0) {
+ m_coreFeatures[i] = max(0.0f, value - lambda);
+ }
+ else {
+ m_coreFeatures[i] = min(0.0f, value + lambda);
+ }
+ }*/
+
+ size_t numberPruned = size();
+ vector<FName> toErase;
+ for (iterator i = begin(); i != end(); ++i) {
+ float value = i->second;
+ if (value != 0.0f) {
+ if (value > 0)
+ value = max(0.0f, value - lambda);
+ else
+ value = min(0.0f, value + lambda);
+
+ if (value != 0.0f)
+ i->second = value;
else {
- m_coreFeatures[i] = min(0.0f, value + lambda);
- }
- }*/
-
- size_t numberPruned = size();
- vector<FName> toErase;
- for (iterator i = begin(); i != end(); ++i) {
- float value = i->second;
- if (value != 0.0f) {
- if (value > 0)
- value = max(0.0f, value - lambda);
- else
- value = min(0.0f, value + lambda);
-
- if (value != 0.0f)
- i->second = value;
- else {
- toErase.push_back(i->first);
- const std::string& fname = (i->first).name();
- FName::eraseId(FName::getId(fname));
- }
+ toErase.push_back(i->first);
+ const std::string& fname = (i->first).name();
+ FName::eraseId(FName::getId(fname));
}
}
-
- // erase features that have become zero
- for (size_t i = 0; i < toErase.size(); ++i)
- m_features.erase(toErase[i]);
- numberPruned -= size();
- return numberPruned;
}
- void FVector::sparseL2regularize(float lambda) {
- /*for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
- m_coreFeatures[i] *= (1 - lambda);
- }*/
+ // erase features that have become zero
+ for (size_t i = 0; i < toErase.size(); ++i)
+ m_features.erase(toErase[i]);
+ numberPruned -= size();
+ return numberPruned;
+}
- for (iterator i = begin(); i != end(); ++i) {
- i->second *= (1 - lambda);
- }
- }
+void FVector::sparseL2regularize(float lambda)
+{
+ /*for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+ m_coreFeatures[i] *= (1 - lambda);
+ }*/
- FValue FVector::sum() const {
- FValue sum = 0;
- for (const_iterator i = cbegin(); i != cend(); ++i) {
- sum += i->second;
- }
- sum += m_coreFeatures.sum();
- return sum;
- }
-
- FValue FVector::inner_product(const FVector& rhs) const {
- CHECK(m_coreFeatures.size() == rhs.m_coreFeatures.size());
- FValue product = 0.0;
- for (const_iterator i = cbegin(); i != cend(); ++i) {
- product += ((i->second)*(rhs.get(i->first)));
- }
- for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
- product += m_coreFeatures[i]*rhs.m_coreFeatures[i];
- }
- return product;
+ for (iterator i = begin(); i != end(); ++i) {
+ i->second *= (1 - lambda);
}
+}
- const FVector operator+(const FVector& lhs, const FVector& rhs) {
- return FVector(lhs) += rhs;
- }
-
- const FVector operator-(const FVector& lhs, const FVector& rhs) {
- return FVector(lhs) -= rhs;
+FValue FVector::sum() const
+{
+ FValue sum = 0;
+ for (const_iterator i = cbegin(); i != cend(); ++i) {
+ sum += i->second;
}
-
- const FVector operator*(const FVector& lhs, const FVector& rhs) {
- return FVector(lhs) *= rhs;
- }
-
- const FVector operator/(const FVector& lhs, const FVector& rhs) {
- return FVector(lhs) /= rhs;
- }
-
-
- const FVector operator*(const FVector& lhs, const FValue& rhs) {
- return FVector(lhs) *= rhs;
+ sum += m_coreFeatures.sum();
+ return sum;
+}
+
+FValue FVector::inner_product(const FVector& rhs) const
+{
+ CHECK(m_coreFeatures.size() == rhs.m_coreFeatures.size());
+ FValue product = 0.0;
+ for (const_iterator i = cbegin(); i != cend(); ++i) {
+ product += ((i->second)*(rhs.get(i->first)));
}
-
- const FVector operator/(const FVector& lhs, const FValue& rhs) {
- return FVector(lhs) /= rhs;
+ for (size_t i = 0; i < m_coreFeatures.size(); ++i) {
+ product += m_coreFeatures[i]*rhs.m_coreFeatures[i];
}
+ return product;
+}
- FValue inner_product(const FVector& lhs, const FVector& rhs) {
- if (lhs.size() >= rhs.size()) {
- return rhs.inner_product(lhs);
- } else {
- return lhs.inner_product(rhs);
- }
+const FVector operator+(const FVector& lhs, const FVector& rhs)
+{
+ return FVector(lhs) += rhs;
+}
+
+const FVector operator-(const FVector& lhs, const FVector& rhs)
+{
+ return FVector(lhs) -= rhs;
+}
+
+const FVector operator*(const FVector& lhs, const FVector& rhs)
+{
+ return FVector(lhs) *= rhs;
+}
+
+const FVector operator/(const FVector& lhs, const FVector& rhs)
+{
+ return FVector(lhs) /= rhs;
+}
+
+
+const FVector operator*(const FVector& lhs, const FValue& rhs)
+{
+ return FVector(lhs) *= rhs;
+}
+
+const FVector operator/(const FVector& lhs, const FValue& rhs)
+{
+ return FVector(lhs) /= rhs;
+}
+
+FValue inner_product(const FVector& lhs, const FVector& rhs)
+{
+ if (lhs.size() >= rhs.size()) {
+ return rhs.inner_product(lhs);
+ } else {
+ return lhs.inner_product(rhs);
}
}
+}
diff --git a/moses/FeatureVector.h b/moses/FeatureVector.h
index 9c15ba4f7..f4261b520 100644
--- a/moses/FeatureVector.h
+++ b/moses/FeatureVector.h
@@ -1,21 +1,21 @@
/*
Moses - factored phrase-based language decoder
Copyright (C) 2010 University of Edinburgh
-
+
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
-
+
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
-
+
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
+
*/
#pragma once
@@ -47,325 +47,336 @@
#include "util/check.hh"
#include "util/string_piece.hh"
-namespace Moses {
-
- typedef float FValue;
-
- /**
- * Feature name
- **/
- struct FName {
-
- static const std::string SEP;
-
- typedef boost::unordered_map<std::string,size_t> Name2Id;
- typedef boost::unordered_map<size_t,size_t> Id2Count;
- //typedef std::map<std::string, size_t> Name2Id;
- static Name2Id name2id;
- static std::vector<std::string> id2name;
- static Id2Count id2hopeCount;
- static Id2Count id2fearCount;
-
- //A feature name can either be initialised as a pair of strings,
- //which will be concatenated with a SEP between them, or as
- //a single string, which will be used as-is.
- FName(const StringPiece &root, const StringPiece &name) {
- std::string assembled(root.data(), root.size());
- assembled += SEP;
- assembled.append(name.data(), name.size());
- init(assembled);
- }
- explicit FName(const StringPiece &name)
- {init(name);}
-
- const std::string& name() const;
- //const std::string& root() const {return m_root;}
-
- size_t hash() const;
-
- bool operator==(const FName& rhs) const ;
- bool operator!=(const FName& rhs) const ;
-
- static size_t getId(const std::string& name);
- static size_t getHopeIdCount(const std::string& name);
- static size_t getFearIdCount(const std::string& name);
- static void incrementHopeId(const std::string& name);
- static void incrementFearId(const std::string& name);
- static void eraseId(size_t id);
-
- private:
- void init(const StringPiece& name);
- size_t m_id;
+namespace Moses
+{
+
+typedef float FValue;
+
+/**
+ * Feature name
+ **/
+struct FName {
+
+ static const std::string SEP;
+
+ typedef boost::unordered_map<std::string,size_t> Name2Id;
+ typedef boost::unordered_map<size_t,size_t> Id2Count;
+ //typedef std::map<std::string, size_t> Name2Id;
+ static Name2Id name2id;
+ static std::vector<std::string> id2name;
+ static Id2Count id2hopeCount;
+ static Id2Count id2fearCount;
+
+ //A feature name can either be initialised as a pair of strings,
+ //which will be concatenated with a SEP between them, or as
+ //a single string, which will be used as-is.
+ FName(const StringPiece &root, const StringPiece &name) {
+ std::string assembled(root.data(), root.size());
+ assembled += SEP;
+ assembled.append(name.data(), name.size());
+ init(assembled);
+ }
+ explicit FName(const StringPiece &name) {
+ init(name);
+ }
+
+ const std::string& name() const;
+ //const std::string& root() const {return m_root;}
+
+ size_t hash() const;
+
+ bool operator==(const FName& rhs) const ;
+ bool operator!=(const FName& rhs) const ;
+
+ static size_t getId(const std::string& name);
+ static size_t getHopeIdCount(const std::string& name);
+ static size_t getFearIdCount(const std::string& name);
+ static void incrementHopeId(const std::string& name);
+ static void incrementFearId(const std::string& name);
+ static void eraseId(size_t id);
+
+private:
+ void init(const StringPiece& name);
+ size_t m_id;
#ifdef WITH_THREADS
- //reader-writer lock
- static boost::shared_mutex m_idLock;
+ //reader-writer lock
+ static boost::shared_mutex m_idLock;
#endif
- };
-
- std::ostream& operator<<(std::ostream& out,const FName& name);
-
- struct FNameEquals {
- inline bool operator() (const FName& lhs, const FName& rhs) const {
- return (lhs == rhs);
- }
- };
-
- struct FNameHash
- : std::unary_function<FName, std::size_t>
- {
- std::size_t operator()(const FName& x) const
- {
- return x.hash();
- }
- };
-
- class ProxyFVector;
-
- /**
- * A sparse feature (or weight) vector.
- **/
- class FVector
- {
- public:
- /** Empty feature vector */
- FVector(size_t coreFeatures = 0);
-
- FVector& operator=( const FVector& rhs ) {
- m_features = rhs.m_features;
- m_coreFeatures = rhs.m_coreFeatures;
- return *this;
- }
+};
- /*
- * Change the number of core features
- **/
- void resize(size_t newsize);
-
- typedef boost::unordered_map<FName,FValue,FNameHash, FNameEquals> FNVmap;
- /** Iterators */
- typedef FNVmap::iterator iterator;
- typedef FNVmap::const_iterator const_iterator;
- iterator begin() {return m_features.begin();}
- iterator end() {return m_features.end();}
- const_iterator cbegin() const {return m_features.cbegin();}
- const_iterator cend() const {return m_features.cend();}
-
- bool hasNonDefaultValue(FName name) const { return m_features.find(name) != m_features.end();}
- void clear();
-
-
- /** Load from file - each line should be 'root[_name] value' */
- bool load(const std::string& filename);
- void save(const std::string& filename) const;
- void write(std::ostream& out) const ;
-
- /** Element access */
- ProxyFVector operator[](const FName& name);
- FValue& operator[](size_t index);
- FValue operator[](const FName& name) const;
- FValue operator[](size_t index) const;
-
- /** Size */
- size_t size() const {
- return m_features.size() + m_coreFeatures.size();
- }
+std::ostream& operator<<(std::ostream& out,const FName& name);
- size_t coreSize() const {
- return m_coreFeatures.size();
- }
-
- const std::valarray<FValue> &getCoreFeatures() const {
- return m_coreFeatures;
- }
-
- /** Equality */
- bool operator== (const FVector& rhs) const;
- bool operator!= (const FVector& rhs) const;
-
- FValue inner_product(const FVector& rhs) const;
-
- friend class ProxyFVector;
-
- /**arithmetic */
- //Element-wise
- //If one side has fewer core features, take the missing ones to be 0.
- FVector& operator+= (const FVector& rhs);
- FVector& operator-= (const FVector& rhs);
- FVector& operator*= (const FVector& rhs);
- FVector& operator/= (const FVector& rhs);
- //Scalar
- FVector& operator*= (const FValue& rhs);
- FVector& operator/= (const FValue& rhs);
-
- FVector& multiplyEqualsBackoff(const FVector& rhs, float backoff);
- FVector& multiplyEquals(float core_r0, float sparse_r0);
-
- FVector& max_equals(const FVector& rhs);
-
- /** norms and sums */
- FValue l1norm() const;
- FValue l1norm_coreFeatures() const;
- FValue l2norm() const;
- FValue linfnorm() const;
- size_t l1regularize(float lambda);
- void l2regularize(float lambda);
- size_t sparseL1regularize(float lambda);
- void sparseL2regularize(float lambda);
- FValue sum() const;
-
- /** pretty printing */
- std::ostream& print(std::ostream& out) const;
-
- /** additional */
- void printCoreFeatures();
- //scale so that abs. value is less than maxvalue
- void thresholdScale(float maxValue );
-
- void capMax(FValue maxValue);
- void capMin(FValue minValue);
-
- void sparsePlusEquals(const FVector& rhs);
- void coreAssign(const FVector& rhs);
-
- void incrementSparseHopeFeatures();
- void incrementSparseFearFeatures();
- void printSparseHopeFeatureCounts(std::ofstream& out);
- void printSparseFearFeatureCounts(std::ofstream& out);
- void printSparseHopeFeatureCounts();
- void printSparseFearFeatureCounts();
- size_t pruneSparseFeatures(size_t threshold);
- size_t pruneZeroWeightFeatures();
- void updateConfidenceCounts(const FVector& weightUpdate, bool signedCounts);
- void updateLearningRates(float decay_core, float decay_sparse, const FVector& confidence_counts, float core_r0, float sparse_r0);
-
- // vector which, for each element of the original vector, reflects whether an element is zero or non-zero
- void setToBinaryOf(const FVector& rhs);
-
- // divide only core features by scalar
- FVector& coreDivideEquals(float scalar);
-
- // divide each element by the number given in the rhs vector
- FVector& divideEquals(const FVector& rhs);
+struct FNameEquals {
+ inline bool operator() (const FName& lhs, const FName& rhs) const {
+ return (lhs == rhs);
+ }
+};
+
+struct FNameHash
+ : std::unary_function<FName, std::size_t> {
+ std::size_t operator()(const FName& x) const {
+ return x.hash();
+ }
+};
+
+class ProxyFVector;
+
+/**
+ * A sparse feature (or weight) vector.
+ **/
+class FVector
+{
+public:
+ /** Empty feature vector */
+ FVector(size_t coreFeatures = 0);
+
+ FVector& operator=( const FVector& rhs ) {
+ m_features = rhs.m_features;
+ m_coreFeatures = rhs.m_coreFeatures;
+ return *this;
+ }
+
+ /*
+ * Change the number of core features
+ **/
+ void resize(size_t newsize);
+
+ typedef boost::unordered_map<FName,FValue,FNameHash, FNameEquals> FNVmap;
+ /** Iterators */
+ typedef FNVmap::iterator iterator;
+ typedef FNVmap::const_iterator const_iterator;
+ iterator begin() {
+ return m_features.begin();
+ }
+ iterator end() {
+ return m_features.end();
+ }
+ const_iterator cbegin() const {
+ return m_features.cbegin();
+ }
+ const_iterator cend() const {
+ return m_features.cend();
+ }
+
+ bool hasNonDefaultValue(FName name) const {
+ return m_features.find(name) != m_features.end();
+ }
+ void clear();
+
+
+ /** Load from file - each line should be 'root[_name] value' */
+ bool load(const std::string& filename);
+ void save(const std::string& filename) const;
+ void write(std::ostream& out) const ;
+
+ /** Element access */
+ ProxyFVector operator[](const FName& name);
+ FValue& operator[](size_t index);
+ FValue operator[](const FName& name) const;
+ FValue operator[](size_t index) const;
+
+ /** Size */
+ size_t size() const {
+ return m_features.size() + m_coreFeatures.size();
+ }
+
+ size_t coreSize() const {
+ return m_coreFeatures.size();
+ }
+
+ const std::valarray<FValue> &getCoreFeatures() const {
+ return m_coreFeatures;
+ }
+
+ /** Equality */
+ bool operator== (const FVector& rhs) const;
+ bool operator!= (const FVector& rhs) const;
+
+ FValue inner_product(const FVector& rhs) const;
+
+ friend class ProxyFVector;
+
+ /**arithmetic */
+ //Element-wise
+ //If one side has fewer core features, take the missing ones to be 0.
+ FVector& operator+= (const FVector& rhs);
+ FVector& operator-= (const FVector& rhs);
+ FVector& operator*= (const FVector& rhs);
+ FVector& operator/= (const FVector& rhs);
+ //Scalar
+ FVector& operator*= (const FValue& rhs);
+ FVector& operator/= (const FValue& rhs);
+
+ FVector& multiplyEqualsBackoff(const FVector& rhs, float backoff);
+ FVector& multiplyEquals(float core_r0, float sparse_r0);
+
+ FVector& max_equals(const FVector& rhs);
+
+ /** norms and sums */
+ FValue l1norm() const;
+ FValue l1norm_coreFeatures() const;
+ FValue l2norm() const;
+ FValue linfnorm() const;
+ size_t l1regularize(float lambda);
+ void l2regularize(float lambda);
+ size_t sparseL1regularize(float lambda);
+ void sparseL2regularize(float lambda);
+ FValue sum() const;
+
+ /** pretty printing */
+ std::ostream& print(std::ostream& out) const;
+
+ /** additional */
+ void printCoreFeatures();
+ //scale so that abs. value is less than maxvalue
+ void thresholdScale(float maxValue );
+
+ void capMax(FValue maxValue);
+ void capMin(FValue minValue);
+
+ void sparsePlusEquals(const FVector& rhs);
+ void coreAssign(const FVector& rhs);
+
+ void incrementSparseHopeFeatures();
+ void incrementSparseFearFeatures();
+ void printSparseHopeFeatureCounts(std::ofstream& out);
+ void printSparseFearFeatureCounts(std::ofstream& out);
+ void printSparseHopeFeatureCounts();
+ void printSparseFearFeatureCounts();
+ size_t pruneSparseFeatures(size_t threshold);
+ size_t pruneZeroWeightFeatures();
+ void updateConfidenceCounts(const FVector& weightUpdate, bool signedCounts);
+ void updateLearningRates(float decay_core, float decay_sparse, const FVector& confidence_counts, float core_r0, float sparse_r0);
+
+ // vector which, for each element of the original vector, reflects whether an element is zero or non-zero
+ void setToBinaryOf(const FVector& rhs);
+
+ // divide only core features by scalar
+ FVector& coreDivideEquals(float scalar);
+
+ // divide each element by the number given in the rhs vector
+ FVector& divideEquals(const FVector& rhs);
#ifdef MPI_ENABLE
- friend class boost::serialization::access;
-#endif
-
- private:
-
- /** Internal get and set. */
- const FValue& get(const FName& name) const;
- FValue getBackoff(const FName& name, float backoff) const;
- void set(const FName& name, const FValue& value);
-
- FNVmap m_features;
- std::valarray<FValue> m_coreFeatures;
-
+ friend class boost::serialization::access;
+#endif
+
+private:
+
+ /** Internal get and set. */
+ const FValue& get(const FName& name) const;
+ FValue getBackoff(const FName& name, float backoff) const;
+ void set(const FName& name, const FValue& value);
+
+ FNVmap m_features;
+ std::valarray<FValue> m_coreFeatures;
+
#ifdef MPI_ENABLE
- //serialization
- template<class Archive>
- void save(Archive &ar, const unsigned int version) const {
- std::vector<std::string> names;
- std::vector<FValue> values;
- for (const_iterator i = cbegin(); i != cend(); ++i) {
- std::ostringstream ostr;
- ostr << i->first;
- names.push_back(ostr.str());
- values.push_back(i->second);
- }
- ar << names;
- ar << values;
- ar << m_coreFeatures;
+ //serialization
+ template<class Archive>
+ void save(Archive &ar, const unsigned int version) const {
+ std::vector<std::string> names;
+ std::vector<FValue> values;
+ for (const_iterator i = cbegin(); i != cend(); ++i) {
+ std::ostringstream ostr;
+ ostr << i->first;
+ names.push_back(ostr.str());
+ values.push_back(i->second);
}
-
- template<class Archive>
- void load(Archive &ar, const unsigned int version) {
- clear();
- std::vector<std::string> names;
- std::vector<FValue> values;
- ar >> names;
- ar >> values;
- ar >> m_coreFeatures;
- CHECK(names.size() == values.size());
- for (size_t i = 0; i < names.size(); ++i) {
- set(FName(names[i]), values[i]);
- }
+ ar << names;
+ ar << values;
+ ar << m_coreFeatures;
+ }
+
+ template<class Archive>
+ void load(Archive &ar, const unsigned int version) {
+ clear();
+ std::vector<std::string> names;
+ std::vector<FValue> values;
+ ar >> names;
+ ar >> values;
+ ar >> m_coreFeatures;
+ CHECK(names.size() == values.size());
+ for (size_t i = 0; i < names.size(); ++i) {
+ set(FName(names[i]), values[i]);
}
-
- BOOST_SERIALIZATION_SPLIT_MEMBER()
-
+ }
+
+ BOOST_SERIALIZATION_SPLIT_MEMBER()
+
#endif
-
- };
-
- std::ostream& operator<<( std::ostream& out, const FVector& fv);
- //Element-wise operations
- const FVector operator+(const FVector& lhs, const FVector& rhs);
- const FVector operator-(const FVector& lhs, const FVector& rhs);
- const FVector operator*(const FVector& lhs, const FVector& rhs);
- const FVector operator/(const FVector& lhs, const FVector& rhs);
-
- //Scalar operations
- const FVector operator*(const FVector& lhs, const FValue& rhs);
- const FVector operator/(const FVector& lhs, const FValue& rhs);
-
- const FVector fvmax(const FVector& lhs, const FVector& rhs);
-
- FValue inner_product(const FVector& lhs, const FVector& rhs);
-
- struct FVectorPlus {
- FVector operator()(const FVector& lhs, const FVector& rhs) const {
- return lhs + rhs;
- }
- };
-
- /**
- * Used to help with subscript operator overloading.
- * See http://stackoverflow.com/questions/1386075/overloading-operator-for-a-sparse-vector
- **/
- class ProxyFVector {
- public:
- ProxyFVector(FVector *fv, const FName& name ) : m_fv(fv), m_name(name) {}
- ProxyFVector &operator=(const FValue& value) {
- // If we get here, we know that operator[] was called to perform a write access,
- // so we can insert an item in the vector if needed
- //std::cerr << "Inserting " << value << " into " << m_name << std::endl;
- m_fv->set(m_name,value);
- return *this;
-
- }
-
- operator FValue() {
- // If we get here, we know that operator[] was called to perform a read access,
- // so we can simply return the value from the vector
- return m_fv->get(m_name);
- }
-
- /*operator FValue&() {
- return m_fv->m_features[m_name];
- }*/
-
- FValue operator++() {
- return ++m_fv->m_features[m_name];
- }
-
- FValue operator +=(FValue lhs) {
- return (m_fv->m_features[m_name] += lhs);
- }
-
- FValue operator -=(FValue lhs) {
- return (m_fv->m_features[m_name] -= lhs);
- }
- private:
- FValue m_tmp;
-
- private:
- FVector* m_fv;
- const FName& m_name;
-
- };
-
+};
+
+std::ostream& operator<<( std::ostream& out, const FVector& fv);
+//Element-wise operations
+const FVector operator+(const FVector& lhs, const FVector& rhs);
+const FVector operator-(const FVector& lhs, const FVector& rhs);
+const FVector operator*(const FVector& lhs, const FVector& rhs);
+const FVector operator/(const FVector& lhs, const FVector& rhs);
+
+//Scalar operations
+const FVector operator*(const FVector& lhs, const FValue& rhs);
+const FVector operator/(const FVector& lhs, const FValue& rhs);
+
+const FVector fvmax(const FVector& lhs, const FVector& rhs);
+
+FValue inner_product(const FVector& lhs, const FVector& rhs);
+
+struct FVectorPlus {
+ FVector operator()(const FVector& lhs, const FVector& rhs) const {
+ return lhs + rhs;
+ }
+};
+
+/**
+ * Used to help with subscript operator overloading.
+ * See http://stackoverflow.com/questions/1386075/overloading-operator-for-a-sparse-vector
+ **/
+class ProxyFVector
+{
+public:
+ ProxyFVector(FVector *fv, const FName& name ) : m_fv(fv), m_name(name) {}
+ ProxyFVector &operator=(const FValue& value) {
+ // If we get here, we know that operator[] was called to perform a write access,
+ // so we can insert an item in the vector if needed
+ //std::cerr << "Inserting " << value << " into " << m_name << std::endl;
+ m_fv->set(m_name,value);
+ return *this;
+
+ }
+
+ operator FValue() {
+ // If we get here, we know that operator[] was called to perform a read access,
+ // so we can simply return the value from the vector
+ return m_fv->get(m_name);
+ }
+
+ /*operator FValue&() {
+ return m_fv->m_features[m_name];
+ }*/
+
+ FValue operator++() {
+ return ++m_fv->m_features[m_name];
+ }
+
+ FValue operator +=(FValue lhs) {
+ return (m_fv->m_features[m_name] += lhs);
+ }
+
+ FValue operator -=(FValue lhs) {
+ return (m_fv->m_features[m_name] -= lhs);
+ }
+
+private:
+ FValue m_tmp;
+
+private:
+ FVector* m_fv;
+ const FName& m_name;
+
+};
+
}
#endif
diff --git a/moses/FeatureVectorTest.cpp b/moses/FeatureVectorTest.cpp
index af1829e62..2e00b276e 100644
--- a/moses/FeatureVectorTest.cpp
+++ b/moses/FeatureVectorTest.cpp
@@ -28,41 +28,49 @@ static const float TOL = 0.00001;
BOOST_AUTO_TEST_SUITE(fv)
-BOOST_AUTO_TEST_CASE(vector_sum_diff)
+BOOST_AUTO_TEST_CASE(vector_sum_diff)
{
FVector f1,f2,f3;
FName n1("a");
FName n2("b");
FName n3("c");
FName n4("d");
- f1[n1] = 1.2; f1[n2] = 1.4; f1[n3] = -0.1;
- f2[n1] = 0.01; f2[n3] = 5.6; f2[n4] = 0.6;
+ f1[n1] = 1.2;
+ f1[n2] = 1.4;
+ f1[n3] = -0.1;
+ f2[n1] = 0.01;
+ f2[n3] = 5.6;
+ f2[n4] = 0.6;
f3[n1] =1.2;
FVector sum = f1 + f2;
FVector diff = f1 - f2;
- BOOST_CHECK_CLOSE((FValue)sum[n1], 1.21, TOL);
- BOOST_CHECK_CLOSE((FValue)sum[n2], 1.4, TOL);
- BOOST_CHECK_CLOSE((FValue)sum[n3], 5.5, TOL);
- BOOST_CHECK_CLOSE((FValue)sum[n4], 0.6, TOL);
- BOOST_CHECK_CLOSE((FValue)diff[n1], 1.19, TOL);
- BOOST_CHECK_CLOSE((FValue)diff[n2], 1.4, TOL);
- BOOST_CHECK_CLOSE((FValue)diff[n3], -5.7, TOL);
- BOOST_CHECK_CLOSE((FValue)diff[n4], -0.6, TOL);
+ BOOST_CHECK_CLOSE((FValue)sum[n1], 1.21, TOL);
+ BOOST_CHECK_CLOSE((FValue)sum[n2], 1.4, TOL);
+ BOOST_CHECK_CLOSE((FValue)sum[n3], 5.5, TOL);
+ BOOST_CHECK_CLOSE((FValue)sum[n4], 0.6, TOL);
+ BOOST_CHECK_CLOSE((FValue)diff[n1], 1.19, TOL);
+ BOOST_CHECK_CLOSE((FValue)diff[n2], 1.4, TOL);
+ BOOST_CHECK_CLOSE((FValue)diff[n3], -5.7, TOL);
+ BOOST_CHECK_CLOSE((FValue)diff[n4], -0.6, TOL);
f1 -= f3;
cerr << f1 << endl << f3 << endl ;
BOOST_CHECK_CLOSE((FValue)f1[n1],0,TOL);
}
-BOOST_AUTO_TEST_CASE(scalar)
+BOOST_AUTO_TEST_CASE(scalar)
{
FVector f1,f2;
FName n1("a");
FName n2("b");
FName n3("c");
FName n4("d");
- f1[n1] = 0.2; f1[n2] = 9.178; f1[n3] = -0.1;
- f2[n1] = 0.01; f2[n3] = 5.6; f2[n4] = 0.6;
+ f1[n1] = 0.2;
+ f1[n2] = 9.178;
+ f1[n3] = -0.1;
+ f2[n1] = 0.01;
+ f2[n3] = 5.6;
+ f2[n4] = 0.6;
FVector prod1 = f1 * 2;
FVector prod2 = f1 * -0.1;
FVector quot = f2 / 2;
@@ -80,12 +88,13 @@ BOOST_AUTO_TEST_CASE(scalar)
BOOST_CHECK_CLOSE((FValue)quot[n4], 0.3, TOL);
}
-BOOST_AUTO_TEST_CASE(inc)
+BOOST_AUTO_TEST_CASE(inc)
{
FVector f1;
FName n1("a");
FName n2("b");
- f1[n1] = 2.3; f1[n2] = -0.4;
+ f1[n1] = 2.3;
+ f1[n2] = -0.4;
f1[n1]+=2;
BOOST_CHECK_CLOSE((FValue)f1[n1], 4.3, TOL);
BOOST_CHECK_CLOSE((FValue)f1[n2], -0.4, TOL);
@@ -103,8 +112,13 @@ BOOST_AUTO_TEST_CASE(vector_mult)
FName n2("b");
FName n3("c");
FName n4("d");
- f1[n1] = 0.2; f1[n2] = 9.178; f1[n3] = -0.1;
- f2[n1] = 0.01; f2[n2] = 5.6; f2[n3] = 1; f2[n4] = 0.6;
+ f1[n1] = 0.2;
+ f1[n2] = 9.178;
+ f1[n3] = -0.1;
+ f2[n1] = 0.01;
+ f2[n2] = 5.6;
+ f2[n3] = 1;
+ f2[n4] = 0.6;
FVector prod = f1 * f2;
FVector quot = f1/f2;
BOOST_CHECK_CLOSE((FValue)prod[n1], 0.002, TOL);
@@ -118,7 +132,7 @@ BOOST_AUTO_TEST_CASE(vector_mult)
BOOST_CHECK_CLOSE((FValue)quot[n4], 0, TOL);
}
-BOOST_AUTO_TEST_CASE(core)
+BOOST_AUTO_TEST_CASE(core)
{
FVector f1(2);
f1[0] = 1.3;
@@ -127,7 +141,7 @@ BOOST_AUTO_TEST_CASE(core)
BOOST_CHECK_CLOSE(f1[1],-1.9,TOL);
f1[1] = 0.1;
BOOST_CHECK_CLOSE(f1[1],0.1,TOL);
-
+
BOOST_CHECK_EQUAL(f1.size(),2);
f1[FName("a")] = 1.2;
@@ -140,8 +154,13 @@ BOOST_AUTO_TEST_CASE(core_arith)
FVector f2(2);
FName n1("a");
FName n2("b");
- f1[0] = 1.1; f1[1] = 0.25; f1[n1] = 3.6; f1[n2] = -1.5;
- f2[0] = 0.5; f2[1] = -0.1; f2[n1] = 1;
+ f1[0] = 1.1;
+ f1[1] = 0.25;
+ f1[n1] = 3.6;
+ f1[n2] = -1.5;
+ f2[0] = 0.5;
+ f2[1] = -0.1;
+ f2[n1] = 1;
//vector ops
FVector sum = f1+f2;
@@ -172,9 +191,10 @@ BOOST_AUTO_TEST_CASE(core_arith)
//with different length vectors
FVector f3(2);
FVector f4(1);
- f3[0] = 2; f3[1] = -1;
+ f3[0] = 2;
+ f3[1] = -1;
f4[0] = 5;
-
+
FVector sum1 = f3 + f4;
FVector sum2 = f4 + f3;
BOOST_CHECK_EQUAL(sum1,sum2);
@@ -200,14 +220,17 @@ BOOST_AUTO_TEST_CASE(core_arith)
BOOST_CHECK_EQUAL(quot1[1], -numeric_limits<float>::infinity());
BOOST_CHECK_CLOSE(quot2[0], 2.5, TOL);
BOOST_CHECK_CLOSE(quot2[1], 0, TOL);
-
+
}
BOOST_AUTO_TEST_CASE(core_scalar)
{
FVector f1(3);
FName n1("a");
- f1[0] = 1.5; f1[1] = 2.1; f1[2] = 4; f1[n1] = -0.5;
+ f1[0] = 1.5;
+ f1[1] = 2.1;
+ f1[2] = 4;
+ f1[n1] = -0.5;
FVector prod = f1*2;
FVector quot = f1/5;
@@ -224,31 +247,41 @@ BOOST_AUTO_TEST_CASE(core_scalar)
}
-BOOST_AUTO_TEST_CASE(l1norm)
+BOOST_AUTO_TEST_CASE(l1norm)
{
FVector f1(3);
FName n1("a");
- f1[0] = 1.5; f1[1] = 2.1; f1[2] = 4; f1[n1] = -0.5;
+ f1[0] = 1.5;
+ f1[1] = 2.1;
+ f1[2] = 4;
+ f1[n1] = -0.5;
FValue n = f1.l1norm();
BOOST_CHECK_CLOSE((FValue)n, abs(1.5)+abs(2.1)+abs(4)+abs(-0.5), TOL);
}
-BOOST_AUTO_TEST_CASE(sum)
+BOOST_AUTO_TEST_CASE(sum)
{
FVector f1(3);
FName n1("a");
FName n2("b");
- f1[0] = 1.5; f1[1] = 2.1; f1[2] = 4; f1[n1] = -0.5; f1[n2] = 2.7;
+ f1[0] = 1.5;
+ f1[1] = 2.1;
+ f1[2] = 4;
+ f1[n1] = -0.5;
+ f1[n2] = 2.7;
FValue n = f1.sum();
BOOST_CHECK_CLOSE((FValue)n, 1.5+2.1+4-0.5+2.7, TOL);
}
-BOOST_AUTO_TEST_CASE(l2norm)
+BOOST_AUTO_TEST_CASE(l2norm)
{
FVector f1(3);
FName n1("a");
- f1[0] = 1.5; f1[1] = 2.1; f1[2] = 4; f1[n1] = -0.5;
+ f1[0] = 1.5;
+ f1[1] = 2.1;
+ f1[2] = 4;
+ f1[n1] = -0.5;
FValue n = f1.l2norm();
BOOST_CHECK_CLOSE((FValue)n, sqrt((1.5*1.5)+(2.1*2.1)+(4*4)+(-0.5*-0.5)), TOL);
}
@@ -260,8 +293,14 @@ BOOST_AUTO_TEST_CASE(ip)
FName n1("a");
FName n2("b");
FName n3("c");
- f1[0] = 1.1; f1[1] = -0.1; ; f1[n2] = -1.5; f1[n3] = 2.2;
- f2[0] = 0.5; f2[1] = 0.25; f2[n1] = 1; f2[n3] = 2.4;
+ f1[0] = 1.1;
+ f1[1] = -0.1; ;
+ f1[n2] = -1.5;
+ f1[n3] = 2.2;
+ f2[0] = 0.5;
+ f2[1] = 0.25;
+ f2[n1] = 1;
+ f2[n3] = 2.4;
FValue p1 = inner_product(f1,f2);
FValue p2 = inner_product(f2,f1);
BOOST_CHECK_CLOSE(p1,p2,TOL);
diff --git a/moses/GenerationDictionary.cpp b/moses/GenerationDictionary.cpp
index f9f418197..dbc0eedb3 100644
--- a/moses/GenerationDictionary.cpp
+++ b/moses/GenerationDictionary.cpp
@@ -35,7 +35,7 @@ namespace Moses
{
GenerationDictionary::GenerationDictionary(const std::string &line)
-: DecodeFeature("Generation", line)
+ : DecodeFeature("Generation", line)
{
string filePath;
@@ -44,8 +44,7 @@ GenerationDictionary::GenerationDictionary(const std::string &line)
if (args[0] == "path") {
filePath = args[1];
- }
- else {
+ } else {
//UserMessage::Add("Unknown argument " + args[0]);
//abort();
}
diff --git a/moses/GenerationDictionary.h b/moses/GenerationDictionary.h
index 6a1e4de9a..b2aeb0d96 100644
--- a/moses/GenerationDictionary.h
+++ b/moses/GenerationDictionary.h
@@ -53,22 +53,21 @@ protected:
public:
GenerationDictionary(const std::string &line);
- virtual ~GenerationDictionary();
-
- //! load data file
- bool Load(const std::string &filePath, FactorDirection direction);
-
- /** number of unique input entries in the generation table.
- * NOT the number of lines in the generation table
- */
- size_t GetSize() const
- {
- return m_collection.size();
- }
- /** returns a bag of output words, OutputWordCollection, for a particular input word.
- * Or NULL if the input word isn't found. The search function used is the WordComparer functor
- */
- const OutputWordCollection *FindWord(const Word &word) const;
+ virtual ~GenerationDictionary();
+
+ //! load data file
+ bool Load(const std::string &filePath, FactorDirection direction);
+
+ /** number of unique input entries in the generation table.
+ * NOT the number of lines in the generation table
+ */
+ size_t GetSize() const {
+ return m_collection.size();
+ }
+ /** returns a bag of output words, OutputWordCollection, for a particular input word.
+ * Or NULL if the input word isn't found. The search function used is the WordComparer functor
+ */
+ const OutputWordCollection *FindWord(const Word &word) const;
};
diff --git a/moses/Hypothesis.cpp b/moses/Hypothesis.cpp
index 7bc3e6a75..50443904c 100644
--- a/moses/Hypothesis.cpp
+++ b/moses/Hypothesis.cpp
@@ -60,8 +60,8 @@ Hypothesis::Hypothesis(Manager& manager, InputType const& source, const TargetPh
, m_arcList(NULL)
, m_transOpt(NULL)
, m_manager(manager)
-, m_totalScore(0.0f)
-, m_futureScore(0.0f)
+ , m_totalScore(0.0f)
+ , m_futureScore(0.0f)
, m_id(m_manager.GetNextHypoId())
{
@@ -248,20 +248,22 @@ int Hypothesis::RecombineCompare(const Hypothesis &compare) const
}
if (comp != 0) return comp;
}
-
+
return 0;
}
void Hypothesis::EvaluateWith(const StatefulFeatureFunction &sfff,
- int state_idx) {
+ int state_idx)
+{
m_ffStates[state_idx] = sfff.Evaluate(
- *this,
- m_prevHypo ? m_prevHypo->m_ffStates[state_idx] : NULL,
- &m_scoreBreakdown);
-
+ *this,
+ m_prevHypo ? m_prevHypo->m_ffStates[state_idx] : NULL,
+ &m_scoreBreakdown);
+
}
-void Hypothesis::EvaluateWith(const StatelessFeatureFunction& slff) {
+void Hypothesis::EvaluateWith(const StatelessFeatureFunction& slff)
+{
slff.Evaluate(PhraseBasedFeatureContext(this), &m_scoreBreakdown);
}
@@ -280,14 +282,14 @@ void Hypothesis::CalcScore(const SquareMatrix &futureScore)
// compute values of stateless feature functions that were not
// cached in the translation option
const vector<const StatelessFeatureFunction*>& sfs =
- StatelessFeatureFunction::GetStatelessFeatureFunctions();
+ StatelessFeatureFunction::GetStatelessFeatureFunctions();
for (unsigned i = 0; i < sfs.size(); ++i) {
- const StatelessFeatureFunction &ff = *sfs[i];
+ const StatelessFeatureFunction &ff = *sfs[i];
EvaluateWith(ff);
}
const vector<const StatefulFeatureFunction*>& ffs =
- StatefulFeatureFunction::GetStatefulFeatureFunctions();
+ StatefulFeatureFunction::GetStatefulFeatureFunctions();
for (unsigned i = 0; i < ffs.size(); ++i) {
const StatefulFeatureFunction &ff = *ffs[i];
m_ffStates[i] = ff.Evaluate(
diff --git a/moses/HypothesisStack.h b/moses/HypothesisStack.h
index 26e6ed21b..0c3d4198f 100644
--- a/moses/HypothesisStack.h
+++ b/moses/HypothesisStack.h
@@ -11,7 +11,7 @@ namespace Moses
class Manager;
-/** abstract unique set of hypotheses that cover a certain number of words,
+/** abstract unique set of hypotheses that cover a certain number of words,
* ie. a stack in phrase-based decoding
*/
class HypothesisStack
diff --git a/moses/Incremental.cpp b/moses/Incremental.cpp
index 3eb66fb0e..e4159063c 100644
--- a/moses/Incremental.cpp
+++ b/moses/Incremental.cpp
@@ -19,90 +19,98 @@
#include <boost/lexical_cast.hpp>
-namespace Moses {
-namespace Incremental {
-namespace {
+namespace Moses
+{
+namespace Incremental
+{
+namespace
+{
// This is called by EdgeGenerator. Route hypotheses to separate vertices for
-// each left hand side label, populating ChartCellLabelSet out.
-template <class Best> class HypothesisCallback {
- private:
- typedef search::VertexGenerator<Best> Gen;
- public:
- HypothesisCallback(search::ContextBase &context, Best &best, ChartCellLabelSet &out, boost::object_pool<search::Vertex> &vertex_pool)
- : context_(context), best_(best), out_(out), vertex_pool_(vertex_pool) {}
-
- void NewHypothesis(search::PartialEdge partial) {
- // Get the LHS, look it up in the output ChartCellLabel, and upcast it.
- // It's not part of the union because it would have been ugly to expose template types in ChartCellLabel.
- ChartCellLabel::Stack &stack = out_.FindOrInsert(static_cast<const TargetPhrase *>(partial.GetNote().vp)->GetTargetLHS());
- Gen *entry = static_cast<Gen*>(stack.incr_generator);
- if (!entry) {
- entry = generator_pool_.construct(context_, *vertex_pool_.construct(), best_);
- stack.incr_generator = entry;
- }
- entry->NewHypothesis(partial);
+// each left hand side label, populating ChartCellLabelSet out.
+template <class Best> class HypothesisCallback
+{
+private:
+ typedef search::VertexGenerator<Best> Gen;
+public:
+ HypothesisCallback(search::ContextBase &context, Best &best, ChartCellLabelSet &out, boost::object_pool<search::Vertex> &vertex_pool)
+ : context_(context), best_(best), out_(out), vertex_pool_(vertex_pool) {}
+
+ void NewHypothesis(search::PartialEdge partial) {
+ // Get the LHS, look it up in the output ChartCellLabel, and upcast it.
+ // It's not part of the union because it would have been ugly to expose template types in ChartCellLabel.
+ ChartCellLabel::Stack &stack = out_.FindOrInsert(static_cast<const TargetPhrase *>(partial.GetNote().vp)->GetTargetLHS());
+ Gen *entry = static_cast<Gen*>(stack.incr_generator);
+ if (!entry) {
+ entry = generator_pool_.construct(context_, *vertex_pool_.construct(), best_);
+ stack.incr_generator = entry;
}
+ entry->NewHypothesis(partial);
+ }
- void FinishedSearch() {
- for (ChartCellLabelSet::iterator i(out_.mutable_begin()); i != out_.mutable_end(); ++i) {
- ChartCellLabel::Stack &stack = i->second.MutableStack();
- Gen *gen = static_cast<Gen*>(stack.incr_generator);
- gen->FinishedSearch();
- stack.incr = &gen->Generating();
- }
+ void FinishedSearch() {
+ for (ChartCellLabelSet::iterator i(out_.mutable_begin()); i != out_.mutable_end(); ++i) {
+ ChartCellLabel::Stack &stack = i->second.MutableStack();
+ Gen *gen = static_cast<Gen*>(stack.incr_generator);
+ gen->FinishedSearch();
+ stack.incr = &gen->Generating();
}
+ }
- private:
- search::ContextBase &context_;
+private:
+ search::ContextBase &context_;
- Best &best_;
+ Best &best_;
- ChartCellLabelSet &out_;
+ ChartCellLabelSet &out_;
- boost::object_pool<search::Vertex> &vertex_pool_;
- boost::object_pool<Gen> generator_pool_;
+ boost::object_pool<search::Vertex> &vertex_pool_;
+ boost::object_pool<Gen> generator_pool_;
};
// This is called by the moses parser to collect hypotheses. It converts to my
-// edges (search::PartialEdge).
-template <class Model> class Fill : public ChartParserCallback {
- public:
- Fill(search::Context<Model> &context, const std::vector<lm::WordIndex> &vocab_mapping, search::Score oov_weight)
- : context_(context), vocab_mapping_(vocab_mapping), oov_weight_(oov_weight) {}
+// edges (search::PartialEdge).
+template <class Model> class Fill : public ChartParserCallback
+{
+public:
+ Fill(search::Context<Model> &context, const std::vector<lm::WordIndex> &vocab_mapping, search::Score oov_weight)
+ : context_(context), vocab_mapping_(vocab_mapping), oov_weight_(oov_weight) {}
- void Add(const TargetPhraseCollection &targets, const StackVec &nts, const WordsRange &ignored);
+ void Add(const TargetPhraseCollection &targets, const StackVec &nts, const WordsRange &ignored);
- void AddPhraseOOV(TargetPhrase &phrase, std::list<TargetPhraseCollection*> &waste_memory, const WordsRange &range);
+ void AddPhraseOOV(TargetPhrase &phrase, std::list<TargetPhraseCollection*> &waste_memory, const WordsRange &range);
- bool Empty() const { return edges_.Empty(); }
+ bool Empty() const {
+ return edges_.Empty();
+ }
- template <class Best> void Search(Best &best, ChartCellLabelSet &out, boost::object_pool<search::Vertex> &vertex_pool) {
- HypothesisCallback<Best> callback(context_, best, out, vertex_pool);
- edges_.Search(context_, callback);
- }
+ template <class Best> void Search(Best &best, ChartCellLabelSet &out, boost::object_pool<search::Vertex> &vertex_pool) {
+ HypothesisCallback<Best> callback(context_, best, out, vertex_pool);
+ edges_.Search(context_, callback);
+ }
- // Root: everything into one vertex.
- template <class Best> search::History RootSearch(Best &best) {
- search::Vertex vertex;
- search::RootVertexGenerator<Best> gen(vertex, best);
- edges_.Search(context_, gen);
- return vertex.BestChild();
- }
+ // Root: everything into one vertex.
+ template <class Best> search::History RootSearch(Best &best) {
+ search::Vertex vertex;
+ search::RootVertexGenerator<Best> gen(vertex, best);
+ edges_.Search(context_, gen);
+ return vertex.BestChild();
+ }
- private:
- lm::WordIndex Convert(const Word &word) const;
+private:
+ lm::WordIndex Convert(const Word &word) const;
- search::Context<Model> &context_;
+ search::Context<Model> &context_;
- const std::vector<lm::WordIndex> &vocab_mapping_;
+ const std::vector<lm::WordIndex> &vocab_mapping_;
- search::EdgeGenerator edges_;
+ search::EdgeGenerator edges_;
- const search::Score oov_weight_;
+ const search::Score oov_weight_;
};
-template <class Model> void Fill<Model>::Add(const TargetPhraseCollection &targets, const StackVec &nts, const WordsRange &) {
+template <class Model> void Fill<Model>::Add(const TargetPhraseCollection &targets, const StackVec &nts, const WordsRange &)
+{
std::vector<search::PartialVertex> vertices;
vertices.reserve(nts.size());
float below_score = 0.0;
@@ -131,7 +139,7 @@ template <class Model> void Fill<Model>::Add(const TargetPhraseCollection &targe
}
edge.SetScore(phrase.GetFutureScore() + below_score);
- // prob and oov were already accounted for.
+ // prob and oov were already accounted for.
search::ScoreRule(context_.LanguageModel(), words, edge.Between());
search::Note note;
@@ -142,14 +150,15 @@ template <class Model> void Fill<Model>::Add(const TargetPhraseCollection &targe
}
}
-template <class Model> void Fill<Model>::AddPhraseOOV(TargetPhrase &phrase, std::list<TargetPhraseCollection*> &, const WordsRange &) {
+template <class Model> void Fill<Model>::AddPhraseOOV(TargetPhrase &phrase, std::list<TargetPhraseCollection*> &, const WordsRange &)
+{
std::vector<lm::WordIndex> words;
CHECK(phrase.GetSize() <= 1);
if (phrase.GetSize())
words.push_back(Convert(phrase.GetWord(0)));
search::PartialEdge edge(edges_.AllocateEdge(0));
- // Appears to be a bug that FutureScore does not already include language model.
+ // Appears to be a bug that FutureScore does not already include language model.
search::ScoreRuleRet scored(search::ScoreRule(context_.LanguageModel(), words, edge.Between()));
edge.SetScore(phrase.GetFutureScore() + scored.prob * context_.LMWeight() + static_cast<search::Score>(scored.oov) * oov_weight_);
@@ -160,8 +169,9 @@ template <class Model> void Fill<Model>::AddPhraseOOV(TargetPhrase &phrase, std:
edges_.AddEdge(edge);
}
-// TODO: factors (but chart doesn't seem to support factors anyway).
-template <class Model> lm::WordIndex Fill<Model>::Convert(const Word &word) const {
+// TODO: factors (but chart doesn't seem to support factors anyway).
+template <class Model> lm::WordIndex Fill<Model>::Convert(const Word &word) const
+{
std::size_t factor = word.GetFactor(0)->GetId();
return (factor >= vocab_mapping_.size() ? 0 : vocab_mapping_[factor]);
}
@@ -180,10 +190,12 @@ Manager::Manager(const InputType &source) :
parser_(source, cells_),
n_best_(search::NBestConfig(StaticData::Instance().GetNBestSize())) {}
-Manager::~Manager() {
+Manager::~Manager()
+{
}
-template <class Model, class Best> search::History Manager::PopulateBest(const Model &model, const std::vector<lm::WordIndex> &words, Best &out) {
+template <class Model, class Best> search::History Manager::PopulateBest(const Model &model, const std::vector<lm::WordIndex> &words, Best &out)
+{
const LanguageModel &abstract = LanguageModel::GetFirstLM();
const float oov_weight = abstract.OOVFeatureEnabled() ? abstract.GetOOVWeight() : 0.0;
const StaticData &data = StaticData::Instance();
@@ -192,7 +204,7 @@ template <class Model, class Best> search::History Manager::PopulateBest(const M
size_t size = source_.GetSize();
boost::object_pool<search::Vertex> vertex_pool(std::max<size_t>(size * size / 2, 32));
-
+
for (size_t width = 1; width < size; ++width) {
for (size_t startPos = 0; startPos <= size-width; ++startPos) {
WordsRange range(startPos, startPos + width - 1);
@@ -208,7 +220,8 @@ template <class Model, class Best> search::History Manager::PopulateBest(const M
return filler.RootSearch(out);
}
-template <class Model> void Manager::LMCallback(const Model &model, const std::vector<lm::WordIndex> &words) {
+template <class Model> void Manager::LMCallback(const Model &model, const std::vector<lm::WordIndex> &words)
+{
std::size_t nbest = StaticData::Instance().GetNBestSize();
if (nbest <= 1) {
search::History ret = PopulateBest(model, words, single_best_);
@@ -237,12 +250,14 @@ template void Manager::LMCallback<lm::ngram::QuantTrieModel>(const lm::ngram::Qu
template void Manager::LMCallback<lm::ngram::ArrayTrieModel>(const lm::ngram::ArrayTrieModel &model, const std::vector<lm::WordIndex> &words);
template void Manager::LMCallback<lm::ngram::QuantArrayTrieModel>(const lm::ngram::QuantArrayTrieModel &model, const std::vector<lm::WordIndex> &words);
-const std::vector<search::Applied> &Manager::ProcessSentence() {
+const std::vector<search::Applied> &Manager::ProcessSentence()
+{
LanguageModel::GetFirstLM().IncrementalCallback(*this);
return *completed_nbest_;
}
-namespace {
+namespace
+{
struct NoOp {
void operator()(const TargetPhrase &) const {}
@@ -254,7 +269,8 @@ struct AccumScore {
}
ScoreComponentCollection *out_;
};
-template <class Action> void AppendToPhrase(const search::Applied final, Phrase &out, Action action) {
+template <class Action> void AppendToPhrase(const search::Applied final, Phrase &out, Action action)
+{
assert(final.Valid());
const TargetPhrase &phrase = *static_cast<const TargetPhrase*>(final.GetNote().vp);
action(phrase);
@@ -271,23 +287,25 @@ template <class Action> void AppendToPhrase(const search::Applied final, Phrase
} // namespace
-void ToPhrase(const search::Applied final, Phrase &out) {
+void ToPhrase(const search::Applied final, Phrase &out)
+{
out.Clear();
AppendToPhrase(final, out, NoOp());
}
-void PhraseAndFeatures(const search::Applied final, Phrase &phrase, ScoreComponentCollection &features) {
+void PhraseAndFeatures(const search::Applied final, Phrase &phrase, ScoreComponentCollection &features)
+{
phrase.Clear();
features.ZeroAll();
AppendToPhrase(final, phrase, AccumScore(features));
- // If we made it this far, there is only one language model.
+ // If we made it this far, there is only one language model.
float full, ignored_ngram;
std::size_t ignored_oov;
const LanguageModel &model = LanguageModel::GetFirstLM();
model.CalcScore(phrase, full, ignored_ngram, ignored_oov);
- // CalcScore transforms, but EvaluateChart doesn't.
+ // CalcScore transforms, but EvaluateChart doesn't.
features.Assign(&model, full);
}
diff --git a/moses/Incremental.h b/moses/Incremental.h
index 30f7c588c..20040bf45 100644
--- a/moses/Incremental.h
+++ b/moses/Incremental.h
@@ -10,49 +10,52 @@
#include <vector>
#include <string>
-namespace Moses {
+namespace Moses
+{
class ScoreComponentCollection;
class InputType;
class LanguageModel;
-namespace Incremental {
+namespace Incremental
+{
-class Manager {
- public:
- Manager(const InputType &source);
+class Manager
+{
+public:
+ Manager(const InputType &source);
- ~Manager();
+ ~Manager();
- template <class Model> void LMCallback(const Model &model, const std::vector<lm::WordIndex> &words);
-
- const std::vector<search::Applied> &ProcessSentence();
+ template <class Model> void LMCallback(const Model &model, const std::vector<lm::WordIndex> &words);
- // Call to get the same value as ProcessSentence returned.
- const std::vector<search::Applied> &Completed() const {
- return *completed_nbest_;
- }
+ const std::vector<search::Applied> &ProcessSentence();
- private:
- template <class Model, class Best> search::History PopulateBest(const Model &model, const std::vector<lm::WordIndex> &words, Best &out);
+ // Call to get the same value as ProcessSentence returned.
+ const std::vector<search::Applied> &Completed() const {
+ return *completed_nbest_;
+ }
- const InputType &source_;
- ChartCellCollectionBase cells_;
- ChartParser parser_;
+private:
+ template <class Model, class Best> search::History PopulateBest(const Model &model, const std::vector<lm::WordIndex> &words, Best &out);
- // Only one of single_best_ or n_best_ will be used, but it was easier to do this than a template.
- search::SingleBest single_best_;
- // ProcessSentence returns a reference to a vector. ProcessSentence
- // doesn't have one, so this is populated and returned.
- std::vector<search::Applied> backing_for_single_;
+ const InputType &source_;
+ ChartCellCollectionBase cells_;
+ ChartParser parser_;
- search::NBest n_best_;
-
- const std::vector<search::Applied> *completed_nbest_;
+ // Only one of single_best_ or n_best_ will be used, but it was easier to do this than a template.
+ search::SingleBest single_best_;
+ // ProcessSentence returns a reference to a vector. ProcessSentence
+ // doesn't have one, so this is populated and returned.
+ std::vector<search::Applied> backing_for_single_;
+
+ search::NBest n_best_;
+
+ const std::vector<search::Applied> *completed_nbest_;
};
// Just get the phrase.
void ToPhrase(const search::Applied final, Phrase &out);
-// Get the phrase and the features.
+// Get the phrase and the features.
void PhraseAndFeatures(const search::Applied final, Phrase &phrase, ScoreComponentCollection &features);
diff --git a/moses/InputType.cpp b/moses/InputType.cpp
index 64dc9a7fd..1ca3da63b 100644
--- a/moses/InputType.cpp
+++ b/moses/InputType.cpp
@@ -67,7 +67,7 @@ std::vector <ChartTranslationOptions*> InputType::GetXmlChartTranslationOptions(
std::vector <ChartTranslationOptions*> ret;
return ret;
}
-
+
}
diff --git a/moses/InputType.h b/moses/InputType.h
index a065c0bf0..d0106e5ca 100644
--- a/moses/InputType.h
+++ b/moses/InputType.h
@@ -38,9 +38,9 @@ class Factor;
class PhraseDictionary;
class TranslationOptionCollection;
class ChartTranslationOptions;
-
+
/** base class for all types of inputs to the decoder,
- * eg. sentences, confusion networks, lattices and tree
+ * eg. sentences, confusion networks, lattices and tree
*/
class InputType
{
@@ -81,7 +81,7 @@ public:
}
void SetDocumentId(long documentId) {
m_documentId = documentId;
- }
+ }
long GetTopicId() const {
return m_topicId;
}
@@ -111,7 +111,7 @@ public:
}
void SetTextType(std::string type) {
m_textType = type;
- }
+ }
std::string GetPassthroughInformation() const {
return m_passthrough;
}
diff --git a/moses/LM/Backward.cpp b/moses/LM/Backward.cpp
index a9fca1c75..263c90fec 100644
--- a/moses/LM/Backward.cpp
+++ b/moses/LM/Backward.cpp
@@ -35,281 +35,288 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
//#include "moses/StaticData.h"
//#include <iostream>
-namespace Moses {
-
- /** Constructs a new backward language model. */
- template <class Model> BackwardLanguageModel<Model>::BackwardLanguageModel(const std::string &file, FactorType factorType, bool lazy) : LanguageModelKen<Model>(file,factorType,lazy) {
- //
- // This space intentionally left blank
- //
- }
-
- /**
- * Constructs an empty backward language model state.
- *
- * This state will correspond with a translation hypothesis
- * where no source words have been translated.
- *
- * In a forward language model, the language model state of an empty hypothesis
- * would store the beginning of sentence marker <s>.
- *
- * Because this is a backward language model, the language model state returned by this method
- * instead stores the end of sentence marker </s>.
- */
- template <class Model> const FFState *BackwardLanguageModel<Model>::EmptyHypothesisState(const InputType &/*input*/) const {
- BackwardLMState *ret = new BackwardLMState();
- lm::ngram::RuleScore<Model> ruleScore(*m_ngram, ret->state);
- ruleScore.Terminal(m_ngram->GetVocabulary().EndSentence());
- // float score =
- ruleScore.Finish();
- // VERBOSE(1, "BackwardLM EmptyHypothesisState has score " << score);
- return ret;
- }
- /*
- template <class Model> double BackwardLanguageModel<Model>::Score(FFState *ffState) {
- BackwardLMState *lmState = static_cast< BackwardLMState* >(ffState);
- lm::ngram::ChartState &state = lmState->state;
- lm::ngram::RuleScore<Model> ruleScore(*m_ngram, lmState);
- return ruleScore.Finish();
- }
+namespace Moses
+{
+
+/** Constructs a new backward language model. */
+template <class Model> BackwardLanguageModel<Model>::BackwardLanguageModel(const std::string &file, FactorType factorType, bool lazy) : LanguageModelKen<Model>(file,factorType,lazy)
+{
+ //
+ // This space intentionally left blank
+ //
+}
+
+/**
+ * Constructs an empty backward language model state.
+ *
+ * This state will correspond with a translation hypothesis
+ * where no source words have been translated.
+ *
+ * In a forward language model, the language model state of an empty hypothesis
+ * would store the beginning of sentence marker <s>.
+ *
+ * Because this is a backward language model, the language model state returned by this method
+ * instead stores the end of sentence marker </s>.
+ */
+template <class Model> const FFState *BackwardLanguageModel<Model>::EmptyHypothesisState(const InputType &/*input*/) const
+{
+ BackwardLMState *ret = new BackwardLMState();
+ lm::ngram::RuleScore<Model> ruleScore(*m_ngram, ret->state);
+ ruleScore.Terminal(m_ngram->GetVocabulary().EndSentence());
+ // float score =
+ ruleScore.Finish();
+ // VERBOSE(1, "BackwardLM EmptyHypothesisState has score " << score);
+ return ret;
+}
+/*
+template <class Model> double BackwardLanguageModel<Model>::Score(FFState *ffState) {
+ BackwardLMState *lmState = static_cast< BackwardLMState* >(ffState);
+ lm::ngram::ChartState &state = lmState->state;
+ lm::ngram::RuleScore<Model> ruleScore(*m_ngram, lmState);
+ return ruleScore.Finish();
+}
*/
- /**
- * Pre-calculate the n-gram probabilities for the words in the specified phrase.
- *
- * Note that when this method is called, we do not have access to the context
- * in which this phrase will eventually be applied.
- *
- * In other words, we know what words are in this phrase,
- * but we do not know what words will come before or after this phrase.
- *
- * The parameters fullScore, ngramScore, and oovCount are all output parameters.
- *
- * The value stored in oovCount is the number of words in the phrase
- * that are not in the language model's vocabulary.
- *
- * The sum of the ngram scores for all words in this phrase are stored in fullScore.
- *
- * The value stored in ngramScore is similar, but only full-order ngram scores are included.
- *
- * This is best shown by example:
- *
- * Assume a trigram backward language model and a phrase "a b c d e f g"
- *
- * fullScore would represent the sum of the logprob scores for the following values:
- *
- * p(g)
- * p(f | g)
- * p(e | g f)
- * p(d | f e)
- * p(c | e d)
- * p(b | d c)
- * p(a | c b)
- *
- * ngramScore would represent the sum of the logprob scores for the following values:
- *
- * p(g)
- * p(f | g)
- * p(e | g f)
- * p(d | f e)
- * p(c | e d)
- * p(b | d c)
- * p(a | c b)
- */
- template <class Model> void BackwardLanguageModel<Model>::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const {
- fullScore = 0;
- ngramScore = 0;
- oovCount = 0;
-
- if (!phrase.GetSize()) return;
-
- lm::ngram::ChartState discarded_sadly;
- lm::ngram::RuleScore<Model> scorer(*m_ngram, discarded_sadly);
-
+/**
+ * Pre-calculate the n-gram probabilities for the words in the specified phrase.
+ *
+ * Note that when this method is called, we do not have access to the context
+ * in which this phrase will eventually be applied.
+ *
+ * In other words, we know what words are in this phrase,
+ * but we do not know what words will come before or after this phrase.
+ *
+ * The parameters fullScore, ngramScore, and oovCount are all output parameters.
+ *
+ * The value stored in oovCount is the number of words in the phrase
+ * that are not in the language model's vocabulary.
+ *
+ * The sum of the ngram scores for all words in this phrase are stored in fullScore.
+ *
+ * The value stored in ngramScore is similar, but only full-order ngram scores are included.
+ *
+ * This is best shown by example:
+ *
+ * Assume a trigram backward language model and a phrase "a b c d e f g"
+ *
+ * fullScore would represent the sum of the logprob scores for the following values:
+ *
+ * p(g)
+ * p(f | g)
+ * p(e | g f)
+ * p(d | f e)
+ * p(c | e d)
+ * p(b | d c)
+ * p(a | c b)
+ *
+ * ngramScore would represent the sum of the logprob scores for the following values:
+ *
+ * p(g)
+ * p(f | g)
+ * p(e | g f)
+ * p(d | f e)
+ * p(c | e d)
+ * p(b | d c)
+ * p(a | c b)
+ */
+template <class Model> void BackwardLanguageModel<Model>::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const
+{
+ fullScore = 0;
+ ngramScore = 0;
+ oovCount = 0;
+
+ if (!phrase.GetSize()) return;
+
+ lm::ngram::ChartState discarded_sadly;
+ lm::ngram::RuleScore<Model> scorer(*m_ngram, discarded_sadly);
+
+ UTIL_THROW_IF(
+ (m_beginSentenceFactor == phrase.GetWord(0).GetFactor(m_factorType)),
+ util::Exception,
+ "BackwardLanguageModel does not currently support rules that include <s>"
+ );
+
+ float before_boundary = 0.0f;
+
+ int lastWord = phrase.GetSize() - 1;
+ int ngramBoundary = m_ngram->Order() - 1;
+ int boundary = ( lastWord < ngramBoundary ) ? 0 : ngramBoundary;
+
+ int position;
+ for (position = lastWord; position >= 0; position-=1) {
+ const Word &word = phrase.GetWord(position);
UTIL_THROW_IF(
- (m_beginSentenceFactor == phrase.GetWord(0).GetFactor(m_factorType)),
- util::Exception,
- "BackwardLanguageModel does not currently support rules that include <s>"
- );
-
- float before_boundary = 0.0f;
-
- int lastWord = phrase.GetSize() - 1;
- int ngramBoundary = m_ngram->Order() - 1;
- int boundary = ( lastWord < ngramBoundary ) ? 0 : ngramBoundary;
-
- int position;
- for (position = lastWord; position >= 0; position-=1) {
- const Word &word = phrase.GetWord(position);
- UTIL_THROW_IF(
- (word.IsNonTerminal()),
- util::Exception,
- "BackwardLanguageModel does not currently support rules that include non-terminals "
- );
-
- lm::WordIndex index = TranslateID(word);
- scorer.Terminal(index);
- if (!index) ++oovCount;
-
- if (position==boundary) {
- before_boundary = scorer.Finish();
- }
+ (word.IsNonTerminal()),
+ util::Exception,
+ "BackwardLanguageModel does not currently support rules that include non-terminals "
+ );
- }
+ lm::WordIndex index = TranslateID(word);
+ scorer.Terminal(index);
+ if (!index) ++oovCount;
- fullScore = scorer.Finish();
-
- ngramScore = TransformLMScore(fullScore - before_boundary);
- fullScore = TransformLMScore(fullScore);
+ if (position==boundary) {
+ before_boundary = scorer.Finish();
+ }
}
- /**
- * Calculate the ngram probabilities for the words at the beginning
- * (and under some circumstances, also at the end)
- * of the phrase represented by the provided hypothesis.
- *
- * Additionally, calculate a new language model state.
- *
- * This is best shown by example:
- *
- * Assume a trigram language model.
- *
- * Assume the previous phrase was "a b c d e f g",
- * which means the previous language model state is "g f".
- *
- * When the phrase corresponding to "a b c d e f g" was previously processed by CalcScore
- * the following full-order ngrams would have been calculated:
- *
- * p(a | c b)
- * p(b | d c)
- * p(c | e d)
- * p(d | f e)
- * p(e | g f)
- *
- * The following less-than-full-order ngrams would also have been calculated by CalcScore:
- *
- * p(f | g)
- * p(g)
- *
- * In this method, we now have access to additional context which may allow
- * us to compute the full-order ngrams for f and g.
- *
- * Assume the new provided hypothesis contains the new phrase "h i j k"
- *
- * Given these assumptions, this method is responsible
- * for calculating the scores for the following:
- *
- * p(f | h g)
- * p(g | i h)
- *
- * This method must also calculate and return a new language model state.
- *
- * In this example, the returned language model state would be "k j"
- *
- * If the provided hypothesis represents the end of a completed translation
- * (all source words have been translated)
- * then this method is additionally responsible for calculating the following:
- *
- * p(j | <s> k)
- * p(k | <s>)
- *
- */
- template <class Model> FFState *BackwardLanguageModel<Model>::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const {
-
- // If the current hypothesis contains zero target words
- if (!hypo.GetCurrTargetLength()) {
-
- // reuse and return the previous state
- std::auto_ptr<BackwardLMState> ret(new BackwardLMState());
- ret->state = static_cast<const BackwardLMState&>(*ps).state;
- return ret.release();
+ fullScore = scorer.Finish();
+
+ ngramScore = TransformLMScore(fullScore - before_boundary);
+ fullScore = TransformLMScore(fullScore);
+
+}
+
+/**
+ * Calculate the ngram probabilities for the words at the beginning
+ * (and under some circumstances, also at the end)
+ * of the phrase represented by the provided hypothesis.
+ *
+ * Additionally, calculate a new language model state.
+ *
+ * This is best shown by example:
+ *
+ * Assume a trigram language model.
+ *
+ * Assume the previous phrase was "a b c d e f g",
+ * which means the previous language model state is "g f".
+ *
+ * When the phrase corresponding to "a b c d e f g" was previously processed by CalcScore
+ * the following full-order ngrams would have been calculated:
+ *
+ * p(a | c b)
+ * p(b | d c)
+ * p(c | e d)
+ * p(d | f e)
+ * p(e | g f)
+ *
+ * The following less-than-full-order ngrams would also have been calculated by CalcScore:
+ *
+ * p(f | g)
+ * p(g)
+ *
+ * In this method, we now have access to additional context which may allow
+ * us to compute the full-order ngrams for f and g.
+ *
+ * Assume the new provided hypothesis contains the new phrase "h i j k"
+ *
+ * Given these assumptions, this method is responsible
+ * for calculating the scores for the following:
+ *
+ * p(f | h g)
+ * p(g | i h)
+ *
+ * This method must also calculate and return a new language model state.
+ *
+ * In this example, the returned language model state would be "k j"
+ *
+ * If the provided hypothesis represents the end of a completed translation
+ * (all source words have been translated)
+ * then this method is additionally responsible for calculating the following:
+ *
+ * p(j | <s> k)
+ * p(k | <s>)
+ *
+ */
+template <class Model> FFState *BackwardLanguageModel<Model>::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const
+{
+
+ // If the current hypothesis contains zero target words
+ if (!hypo.GetCurrTargetLength()) {
+
+ // reuse and return the previous state
+ std::auto_ptr<BackwardLMState> ret(new BackwardLMState());
+ ret->state = static_cast<const BackwardLMState&>(*ps).state;
+ return ret.release();
- } else {
+ } else {
- float returnedScore;
+ float returnedScore;
- FFState *returnedState = this->Evaluate(hypo.GetCurrTargetPhrase(), ps, returnedScore);
+ FFState *returnedState = this->Evaluate(hypo.GetCurrTargetPhrase(), ps, returnedScore);
- out->PlusEquals(this, returnedScore);
+ out->PlusEquals(this, returnedScore);
- return returnedState;
+ return returnedState;
- }
}
+}
- template <class Model> FFState *BackwardLanguageModel<Model>::Evaluate(const Phrase &phrase, const FFState *ps, float &returnedScore) const {
-
- returnedScore = 0.0f;
+template <class Model> FFState *BackwardLanguageModel<Model>::Evaluate(const Phrase &phrase, const FFState *ps, float &returnedScore) const
+{
- const lm::ngram::ChartState &previous = static_cast<const BackwardLMState&>(*ps).state;
+ returnedScore = 0.0f;
- std::auto_ptr<BackwardLMState> ret(new BackwardLMState());
-
- lm::ngram::RuleScore<Model> scorer(*m_ngram, ret->state);
-
- int ngramBoundary = m_ngram->Order() - 1;
- int lastWord = phrase.GetSize() - 1;
-
- // Get scores for words at the end of the previous phrase
- // that are now adjacent to words at the the beginning of this phrase
- for (int position=std::min( lastWord, ngramBoundary - 1); position >= 0; position-=1) {
- const Word &word = phrase.GetWord(position);
- UTIL_THROW_IF(
- (word.IsNonTerminal()),
- util::Exception,
- "BackwardLanguageModel does not currently support rules that include non-terminals "
- );
-
- lm::WordIndex index = TranslateID(word);
- scorer.Terminal(index);
- }
- scorer.NonTerminal(previous);
- returnedScore = scorer.Finish();
- /*
- out->PlusEquals(this, score);
-
-
- UTIL_THROW_IF(
- (1==1),
- util::Exception,
- "This method (BackwardLanguageModel<Model>::Evaluate) is not yet fully implemented"
- );
- */
- return ret.release();
+ const lm::ngram::ChartState &previous = static_cast<const BackwardLMState&>(*ps).state;
+
+ std::auto_ptr<BackwardLMState> ret(new BackwardLMState());
+
+ lm::ngram::RuleScore<Model> scorer(*m_ngram, ret->state);
-
+ int ngramBoundary = m_ngram->Order() - 1;
+ int lastWord = phrase.GetSize() - 1;
+ // Get scores for words at the end of the previous phrase
+ // that are now adjacent to words at the the beginning of this phrase
+ for (int position=std::min( lastWord, ngramBoundary - 1); position >= 0; position-=1) {
+ const Word &word = phrase.GetWord(position);
+ UTIL_THROW_IF(
+ (word.IsNonTerminal()),
+ util::Exception,
+ "BackwardLanguageModel does not currently support rules that include non-terminals "
+ );
+
+ lm::WordIndex index = TranslateID(word);
+ scorer.Terminal(index);
}
+ scorer.NonTerminal(previous);
+ returnedScore = scorer.Finish();
+ /*
+ out->PlusEquals(this, score);
- LanguageModel *ConstructBackwardLM(const std::string &file, FactorType factorType, bool lazy) {
- try {
- lm::ngram::ModelType model_type;
- if (lm::ngram::RecognizeBinary(file.c_str(), model_type)) {
- switch(model_type) {
- case lm::ngram::PROBING:
- return new BackwardLanguageModel<lm::ngram::ProbingModel>(file, factorType, lazy);
- case lm::ngram::REST_PROBING:
- return new BackwardLanguageModel<lm::ngram::RestProbingModel>(file, factorType, lazy);
- case lm::ngram::TRIE:
- return new BackwardLanguageModel<lm::ngram::TrieModel>(file, factorType, lazy);
- case lm::ngram::QUANT_TRIE:
- return new BackwardLanguageModel<lm::ngram::QuantTrieModel>(file, factorType, lazy);
- case lm::ngram::ARRAY_TRIE:
- return new BackwardLanguageModel<lm::ngram::ArrayTrieModel>(file, factorType, lazy);
- case lm::ngram::QUANT_ARRAY_TRIE:
- return new BackwardLanguageModel<lm::ngram::QuantArrayTrieModel>(file, factorType, lazy);
- default:
- std::cerr << "Unrecognized kenlm model type " << model_type << std::endl;
- abort();
- }
- } else {
- return new BackwardLanguageModel<lm::ngram::ProbingModel>(file, factorType, lazy);
+
+ UTIL_THROW_IF(
+ (1==1),
+ util::Exception,
+ "This method (BackwardLanguageModel<Model>::Evaluate) is not yet fully implemented"
+ );
+ */
+ return ret.release();
+
+
+
+}
+
+LanguageModel *ConstructBackwardLM(const std::string &file, FactorType factorType, bool lazy)
+{
+ try {
+ lm::ngram::ModelType model_type;
+ if (lm::ngram::RecognizeBinary(file.c_str(), model_type)) {
+ switch(model_type) {
+ case lm::ngram::PROBING:
+ return new BackwardLanguageModel<lm::ngram::ProbingModel>(file, factorType, lazy);
+ case lm::ngram::REST_PROBING:
+ return new BackwardLanguageModel<lm::ngram::RestProbingModel>(file, factorType, lazy);
+ case lm::ngram::TRIE:
+ return new BackwardLanguageModel<lm::ngram::TrieModel>(file, factorType, lazy);
+ case lm::ngram::QUANT_TRIE:
+ return new BackwardLanguageModel<lm::ngram::QuantTrieModel>(file, factorType, lazy);
+ case lm::ngram::ARRAY_TRIE:
+ return new BackwardLanguageModel<lm::ngram::ArrayTrieModel>(file, factorType, lazy);
+ case lm::ngram::QUANT_ARRAY_TRIE:
+ return new BackwardLanguageModel<lm::ngram::QuantArrayTrieModel>(file, factorType, lazy);
+ default:
+ std::cerr << "Unrecognized kenlm model type " << model_type << std::endl;
+ abort();
}
- } catch (std::exception &e) {
- std::cerr << e.what() << std::endl;
- abort();
+ } else {
+ return new BackwardLanguageModel<lm::ngram::ProbingModel>(file, factorType, lazy);
}
+ } catch (std::exception &e) {
+ std::cerr << e.what() << std::endl;
+ abort();
}
+}
} // namespace Moses
diff --git a/moses/LM/Backward.h b/moses/LM/Backward.h
index 1bf6b560c..c81c0633d 100644
--- a/moses/LM/Backward.h
+++ b/moses/LM/Backward.h
@@ -29,53 +29,55 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "lm/state.hh"
-namespace Moses {
+namespace Moses
+{
//! This will also load. Returns a templated backward LM.
LanguageModel *ConstructBackwardLM(const std::string &file, FactorType factorType, bool lazy);
- class FFState;
- // template<typename M> class BackwardLanguageModelTest;
- class BackwardLanguageModelTest;
+class FFState;
+// template<typename M> class BackwardLanguageModelTest;
+class BackwardLanguageModelTest;
/*
* An implementation of single factor backward LM using Kenneth's code.
*/
-template <class Model> class BackwardLanguageModel : public LanguageModelKen<Model> {
- public:
- BackwardLanguageModel(const std::string &file, FactorType factorType, bool lazy);
-
- virtual const FFState *EmptyHypothesisState(const InputType &/*input*/) const;
-
- virtual void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const;
-
- virtual FFState *Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const;
-
- FFState *Evaluate(const Phrase &phrase, const FFState *ps, float &returnedScore) const;
-
- private:
-
- // These lines are required to make the parent class's protected members visible to this class
- using LanguageModelKen<Model>::m_ngram;
- using LanguageModelKen<Model>::m_beginSentenceFactor;
- using LanguageModelKen<Model>::m_factorType;
- using LanguageModelKen<Model>::TranslateID;
-
- // friend class Moses::BackwardLanguageModelTest<Model>;
- friend class Moses::BackwardLanguageModelTest;
- /*
- lm::ngram::ChartState* GetState(FFState *ffState) {
- return NULL;
- }
- */
- /*
- double Score(FFState *ffState) {
- BackwardLMState *lmState = static_cast< BackwardLMState* >(ffState);
- lm::ngram::ChartState &state = lmState->state;
- lm::ngram::RuleScore<Model> ruleScore(*m_ngram, lmState);
- return ruleScore.Finish();
+template <class Model> class BackwardLanguageModel : public LanguageModelKen<Model>
+{
+public:
+ BackwardLanguageModel(const std::string &file, FactorType factorType, bool lazy);
+
+ virtual const FFState *EmptyHypothesisState(const InputType &/*input*/) const;
+
+ virtual void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const;
+
+ virtual FFState *Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const;
+
+ FFState *Evaluate(const Phrase &phrase, const FFState *ps, float &returnedScore) const;
+
+private:
+
+ // These lines are required to make the parent class's protected members visible to this class
+ using LanguageModelKen<Model>::m_ngram;
+ using LanguageModelKen<Model>::m_beginSentenceFactor;
+ using LanguageModelKen<Model>::m_factorType;
+ using LanguageModelKen<Model>::TranslateID;
+
+ // friend class Moses::BackwardLanguageModelTest<Model>;
+ friend class Moses::BackwardLanguageModelTest;
+ /*
+ lm::ngram::ChartState* GetState(FFState *ffState) {
+ return NULL;
}
- */
+ */
+ /*
+ double Score(FFState *ffState) {
+ BackwardLMState *lmState = static_cast< BackwardLMState* >(ffState);
+ lm::ngram::ChartState &state = lmState->state;
+ lm::ngram::RuleScore<Model> ruleScore(*m_ngram, lmState);
+ return ruleScore.Finish();
+ }
+ */
};
} // namespace Moses
@@ -83,7 +85,7 @@ template <class Model> class BackwardLanguageModel : public LanguageModelKen<Mod
#endif
// To create a sample backward language model using SRILM:
-//
+//
// (ngram-count and reverse-text are SRILM programs)
//
// head -n 49 ./contrib/synlm/hhmm/LICENSE | tail -n 45 | tr '\n' ' ' | ./scripts/ems/support/split-sentences.perl | ./scripts/tokenizer/lowercase.perl | ./scripts/tokenizer/tokenizer.perl | reverse-text | ngram-count -order 3 -text - -lm - > lm/backward.arpa
diff --git a/moses/LM/BackwardLMState.cpp b/moses/LM/BackwardLMState.cpp
index 37a3ab7da..466c4b655 100644
--- a/moses/LM/BackwardLMState.cpp
+++ b/moses/LM/BackwardLMState.cpp
@@ -22,11 +22,13 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "moses/LM/BackwardLMState.h"
#include "lm/state.hh"
-namespace Moses {
+namespace Moses
+{
- int BackwardLMState::Compare(const FFState &o) const {
- const BackwardLMState &other = static_cast<const BackwardLMState &>(o);
- return state.left.Compare(other.state.left);
- }
+int BackwardLMState::Compare(const FFState &o) const
+{
+ const BackwardLMState &other = static_cast<const BackwardLMState &>(o);
+ return state.left.Compare(other.state.left);
+}
}
diff --git a/moses/LM/BackwardLMState.h b/moses/LM/BackwardLMState.h
index 7c6ebff62..e6d1f325a 100644
--- a/moses/LM/BackwardLMState.h
+++ b/moses/LM/BackwardLMState.h
@@ -36,14 +36,16 @@ namespace lm {
//#include "lm/state.hh"
-namespace Moses {
+namespace Moses
+{
- //template<typename M>
+//template<typename M>
class BackwardLanguageModelTest;
-class BackwardLMState : public FFState {
+class BackwardLMState : public FFState
+{
- public:
+public:
/*
int Compare(const FFState &o) const {
@@ -53,14 +55,14 @@ class BackwardLMState : public FFState {
*/
int Compare(const FFState &o) const;
- // Allow BackwardLanguageModel to access the private members of this class
- template <class Model> friend class BackwardLanguageModel;
+ // Allow BackwardLanguageModel to access the private members of this class
+ template <class Model> friend class BackwardLanguageModel;
// template <class Model> friend class Moses::BackwardLanguageModelTest;
- friend class Moses::BackwardLanguageModelTest;
+ friend class Moses::BackwardLanguageModelTest;
- private:
- lm::ngram::ChartState state;
+private:
+ lm::ngram::ChartState state;
};
diff --git a/moses/LM/BackwardTest.cpp b/moses/LM/BackwardTest.cpp
index 5f58c9f32..dc5de32bd 100644
--- a/moses/LM/BackwardTest.cpp
+++ b/moses/LM/BackwardTest.cpp
@@ -47,7 +47,7 @@ template <class M> void Foo() {
Moses::BackwardLanguageModel<M> *backwardLM;
// = new Moses::BackwardLanguageModel<M>( filename, factorType, lazy );
-
+
}
template <class M> void Everything() {
@@ -55,159 +55,160 @@ template <class M> void Everything() {
}
*/
-namespace Moses {
+namespace Moses
+{
-// Apparently some Boost versions use templates and are pretty strict about types matching.
+// Apparently some Boost versions use templates and are pretty strict about types matching.
#define SLOPPY_CHECK_CLOSE(ref, value, tol) BOOST_CHECK_CLOSE(static_cast<double>(ref), static_cast<double>(value), static_cast<double>(tol));
-class BackwardLanguageModelTest {
-
- public:
- BackwardLanguageModelTest() :
- dummyInput(new Sentence()),
- backwardLM(
- static_cast< BackwardLanguageModel<lm::ngram::ProbingModel> * >(
- ConstructBackwardLM(
- boost::unit_test::framework::master_test_suite().argv[1],
- 0,
- false)
- )
- )
+class BackwardLanguageModelTest
+{
+
+public:
+ BackwardLanguageModelTest() :
+ dummyInput(new Sentence()),
+ backwardLM(
+ static_cast< BackwardLanguageModel<lm::ngram::ProbingModel> * >(
+ ConstructBackwardLM(
+ boost::unit_test::framework::master_test_suite().argv[1],
+ 0,
+ false)
+ )
+ ) {
+ // This space intentionally left blank
+ }
+
+ ~BackwardLanguageModelTest() {
+ delete dummyInput;
+ delete backwardLM;
+ }
+
+ void testEmptyHypothesis() {
+ FFState *ffState = const_cast< FFState * >(backwardLM->EmptyHypothesisState( *dummyInput ));
+
+ BOOST_CHECK( ffState != NULL );
+
+ delete ffState;
+ }
+
+ void testCalcScore() {
+
+ double p_the = -1.383059;
+ double p_licenses = -2.360783;
+ double p_for = -1.661813;
+ double p_most = -2.360783;
+ // double p_software = -1.62042;
+
+ double p_the_licenses = -0.9625873;
+ double p_licenses_for = -1.661557;
+ double p_for_most = -0.4526253;
+ // double p_most_software = -1.70295;
+
+ double p_the_licenses_for = p_the_licenses + p_licenses_for;
+ // double p_licenses_for_most = p_licenses_for + p_for_most;
+
+ // the
{
- // This space intentionally left blank
+ Phrase phrase;
+ BOOST_CHECK( phrase.GetSize() == 0 );
+
+ std::vector<FactorType> outputFactorOrder;
+ outputFactorOrder.push_back(0);
+
+ phrase.CreateFromString(
+ outputFactorOrder,
+ "the",
+ StaticData::Instance().GetFactorDelimiter());
+
+ BOOST_CHECK( phrase.GetSize() == 1 );
+
+ float fullScore;
+ float ngramScore;
+ size_t oovCount;
+ backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount);
+
+ BOOST_CHECK( oovCount == 0 );
+ SLOPPY_CHECK_CLOSE( TransformLMScore(p_the), fullScore, 0.01);
+ SLOPPY_CHECK_CLOSE( TransformLMScore( 0.0 ), ngramScore, 0.01);
}
- ~BackwardLanguageModelTest() {
- delete dummyInput;
- delete backwardLM;
+ // the licenses
+ {
+ Phrase phrase;
+ BOOST_CHECK( phrase.GetSize() == 0 );
+
+ std::vector<FactorType> outputFactorOrder;
+ outputFactorOrder.push_back(0);
+
+ phrase.CreateFromString(
+ outputFactorOrder,
+ "the licenses",
+ StaticData::Instance().GetFactorDelimiter());
+
+ BOOST_CHECK( phrase.GetSize() == 2 );
+
+ float fullScore;
+ float ngramScore;
+ size_t oovCount;
+ backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount);
+
+ BOOST_CHECK( oovCount == 0 );
+ SLOPPY_CHECK_CLOSE( TransformLMScore(p_licenses + p_the_licenses), fullScore, 0.01);
+ SLOPPY_CHECK_CLOSE( TransformLMScore( 0.0 ), ngramScore, 0.01);
}
- void testEmptyHypothesis() {
- FFState *ffState = const_cast< FFState * >(backwardLM->EmptyHypothesisState( *dummyInput ));
+ // the licenses for
+ {
+ Phrase phrase;
+ BOOST_CHECK( phrase.GetSize() == 0 );
+
+ std::vector<FactorType> outputFactorOrder;
+ outputFactorOrder.push_back(0);
+
+ phrase.CreateFromString(
+ outputFactorOrder,
+ "the licenses for",
+ StaticData::Instance().GetFactorDelimiter());
- BOOST_CHECK( ffState != NULL );
+ BOOST_CHECK( phrase.GetSize() == 3 );
- delete ffState;
+ float fullScore;
+ float ngramScore;
+ size_t oovCount;
+ backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount);
+
+ BOOST_CHECK( oovCount == 0 );
+ SLOPPY_CHECK_CLOSE( TransformLMScore( p_the_licenses_for ), ngramScore, 0.01);
+ SLOPPY_CHECK_CLOSE( TransformLMScore(p_for + p_licenses_for + p_the_licenses), fullScore, 0.01);
}
- void testCalcScore() {
-
- double p_the = -1.383059;
- double p_licenses = -2.360783;
- double p_for = -1.661813;
- double p_most = -2.360783;
- // double p_software = -1.62042;
-
- double p_the_licenses = -0.9625873;
- double p_licenses_for = -1.661557;
- double p_for_most = -0.4526253;
- // double p_most_software = -1.70295;
-
- double p_the_licenses_for = p_the_licenses + p_licenses_for;
- // double p_licenses_for_most = p_licenses_for + p_for_most;
-
- // the
- {
- Phrase phrase;
- BOOST_CHECK( phrase.GetSize() == 0 );
-
- std::vector<FactorType> outputFactorOrder;
- outputFactorOrder.push_back(0);
-
- phrase.CreateFromString(
- outputFactorOrder,
- "the",
- StaticData::Instance().GetFactorDelimiter());
-
- BOOST_CHECK( phrase.GetSize() == 1 );
-
- float fullScore;
- float ngramScore;
- size_t oovCount;
- backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount);
-
- BOOST_CHECK( oovCount == 0 );
- SLOPPY_CHECK_CLOSE( TransformLMScore(p_the), fullScore, 0.01);
- SLOPPY_CHECK_CLOSE( TransformLMScore( 0.0 ), ngramScore, 0.01);
- }
-
- // the licenses
- {
- Phrase phrase;
- BOOST_CHECK( phrase.GetSize() == 0 );
-
- std::vector<FactorType> outputFactorOrder;
- outputFactorOrder.push_back(0);
-
- phrase.CreateFromString(
- outputFactorOrder,
- "the licenses",
- StaticData::Instance().GetFactorDelimiter());
-
- BOOST_CHECK( phrase.GetSize() == 2 );
-
- float fullScore;
- float ngramScore;
- size_t oovCount;
- backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount);
-
- BOOST_CHECK( oovCount == 0 );
- SLOPPY_CHECK_CLOSE( TransformLMScore(p_licenses + p_the_licenses), fullScore, 0.01);
- SLOPPY_CHECK_CLOSE( TransformLMScore( 0.0 ), ngramScore, 0.01);
- }
-
- // the licenses for
- {
- Phrase phrase;
- BOOST_CHECK( phrase.GetSize() == 0 );
-
- std::vector<FactorType> outputFactorOrder;
- outputFactorOrder.push_back(0);
-
- phrase.CreateFromString(
- outputFactorOrder,
- "the licenses for",
- StaticData::Instance().GetFactorDelimiter());
-
- BOOST_CHECK( phrase.GetSize() == 3 );
-
- float fullScore;
- float ngramScore;
- size_t oovCount;
- backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount);
-
- BOOST_CHECK( oovCount == 0 );
- SLOPPY_CHECK_CLOSE( TransformLMScore( p_the_licenses_for ), ngramScore, 0.01);
- SLOPPY_CHECK_CLOSE( TransformLMScore(p_for + p_licenses_for + p_the_licenses), fullScore, 0.01);
- }
-
- // the licenses for most
- {
- Phrase phrase;
- BOOST_CHECK( phrase.GetSize() == 0 );
-
- std::vector<FactorType> outputFactorOrder;
- outputFactorOrder.push_back(0);
-
- phrase.CreateFromString(
- outputFactorOrder,
- "the licenses for most",
- StaticData::Instance().GetFactorDelimiter());
-
- BOOST_CHECK( phrase.GetSize() == 4 );
-
- float fullScore;
- float ngramScore;
- size_t oovCount;
- backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount);
-
- BOOST_CHECK( oovCount == 0 );
- SLOPPY_CHECK_CLOSE( TransformLMScore( p_the_licenses + p_licenses_for ), ngramScore, 0.01);
- SLOPPY_CHECK_CLOSE( TransformLMScore(p_most + p_for_most + p_licenses_for + p_the_licenses), fullScore, 0.01);
- }
-
+ // the licenses for most
+ {
+ Phrase phrase;
+ BOOST_CHECK( phrase.GetSize() == 0 );
+
+ std::vector<FactorType> outputFactorOrder;
+ outputFactorOrder.push_back(0);
+
+ phrase.CreateFromString(
+ outputFactorOrder,
+ "the licenses for most",
+ StaticData::Instance().GetFactorDelimiter());
+
+ BOOST_CHECK( phrase.GetSize() == 4 );
+
+ float fullScore;
+ float ngramScore;
+ size_t oovCount;
+ backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount);
+
+ BOOST_CHECK( oovCount == 0 );
+ SLOPPY_CHECK_CLOSE( TransformLMScore( p_the_licenses + p_licenses_for ), ngramScore, 0.01);
+ SLOPPY_CHECK_CLOSE( TransformLMScore(p_most + p_for_most + p_licenses_for + p_the_licenses), fullScore, 0.01);
}
-
+
+ }
+
void testEvaluate() {
FFState *nextState;
@@ -223,132 +224,134 @@ class BackwardLanguageModelTest {
double p_for_licenses = -1.661557;
double p_licenses_the = -0.9625873;
double p_the_eos = -1.940311;
-
-
- // the
- {
- Phrase phrase;
- BOOST_CHECK( phrase.GetSize() == 0 );
-
- std::vector<FactorType> outputFactorOrder;
- outputFactorOrder.push_back(0);
-
- phrase.CreateFromString(
- outputFactorOrder,
- "the",
- StaticData::Instance().GetFactorDelimiter());
-
- BOOST_CHECK( phrase.GetSize() == 1 );
-
- float score;
- nextState = backwardLM->Evaluate(phrase, prevState, score);
-
- // p(the) * p(</s> | the) / p(</s>)
- SLOPPY_CHECK_CLOSE( (p_the + p_the_eos - p_eos), score, 0.01);
-
- delete prevState;
- prevState = nextState;
-
- }
-
- // the licenses
- {
- Phrase phrase;
- BOOST_CHECK( phrase.GetSize() == 0 );
-
- std::vector<FactorType> outputFactorOrder;
- outputFactorOrder.push_back(0);
-
- phrase.CreateFromString(
- outputFactorOrder,
- "licenses",
- StaticData::Instance().GetFactorDelimiter());
-
- BOOST_CHECK( phrase.GetSize() == 1 );
-
- float score;
- nextState = backwardLM->Evaluate(phrase, prevState, score);
-
- // p(licenses) * p(licenses | the) / p(the)
- SLOPPY_CHECK_CLOSE( (p_licenses + p_licenses_the - p_the), score, 0.01);
-
- delete prevState;
- prevState = nextState;
-
- }
-
- // the licenses for
- {
- Phrase phrase;
- BOOST_CHECK( phrase.GetSize() == 0 );
-
- std::vector<FactorType> outputFactorOrder;
- outputFactorOrder.push_back(0);
-
- phrase.CreateFromString(
- outputFactorOrder,
- "for",
- StaticData::Instance().GetFactorDelimiter());
-
- BOOST_CHECK( phrase.GetSize() == 1 );
-
- float score;
- nextState = backwardLM->Evaluate(phrase, prevState, score);
-
- // p(for) * p(for | licenses) / p(licenses)
- SLOPPY_CHECK_CLOSE( (p_for + p_for_licenses - p_licenses), score, 0.01);
-
- delete prevState;
- prevState = nextState;
-
- }
-
- // the licenses for most
- {
- Phrase phrase;
- BOOST_CHECK( phrase.GetSize() == 0 );
-
- std::vector<FactorType> outputFactorOrder;
- outputFactorOrder.push_back(0);
-
- phrase.CreateFromString(
- outputFactorOrder,
- "most",
- StaticData::Instance().GetFactorDelimiter());
-
- BOOST_CHECK( phrase.GetSize() == 1 );
-
- float score;
- nextState = backwardLM->Evaluate(phrase, prevState, score);
-
- // p(most) * p(most | for) / p(for)
- SLOPPY_CHECK_CLOSE( (p_most + p_most_for - p_for), score, 0.01);
-
- delete prevState;
- prevState = nextState;
-
- }
+
+
+ // the
+ {
+ Phrase phrase;
+ BOOST_CHECK( phrase.GetSize() == 0 );
+
+ std::vector<FactorType> outputFactorOrder;
+ outputFactorOrder.push_back(0);
+
+ phrase.CreateFromString(
+ outputFactorOrder,
+ "the",
+ StaticData::Instance().GetFactorDelimiter());
+
+ BOOST_CHECK( phrase.GetSize() == 1 );
+
+ float score;
+ nextState = backwardLM->Evaluate(phrase, prevState, score);
+
+ // p(the) * p(</s> | the) / p(</s>)
+ SLOPPY_CHECK_CLOSE( (p_the + p_the_eos - p_eos), score, 0.01);
+
+ delete prevState;
+ prevState = nextState;
+
+ }
+
+ // the licenses
+ {
+ Phrase phrase;
+ BOOST_CHECK( phrase.GetSize() == 0 );
+
+ std::vector<FactorType> outputFactorOrder;
+ outputFactorOrder.push_back(0);
+
+ phrase.CreateFromString(
+ outputFactorOrder,
+ "licenses",
+ StaticData::Instance().GetFactorDelimiter());
+
+ BOOST_CHECK( phrase.GetSize() == 1 );
+
+ float score;
+ nextState = backwardLM->Evaluate(phrase, prevState, score);
+
+ // p(licenses) * p(licenses | the) / p(the)
+ SLOPPY_CHECK_CLOSE( (p_licenses + p_licenses_the - p_the), score, 0.01);
delete prevState;
+ prevState = nextState;
+
+ }
+
+ // the licenses for
+ {
+ Phrase phrase;
+ BOOST_CHECK( phrase.GetSize() == 0 );
+
+ std::vector<FactorType> outputFactorOrder;
+ outputFactorOrder.push_back(0);
+
+ phrase.CreateFromString(
+ outputFactorOrder,
+ "for",
+ StaticData::Instance().GetFactorDelimiter());
+
+ BOOST_CHECK( phrase.GetSize() == 1 );
+
+ float score;
+ nextState = backwardLM->Evaluate(phrase, prevState, score);
+
+ // p(for) * p(for | licenses) / p(licenses)
+ SLOPPY_CHECK_CLOSE( (p_for + p_for_licenses - p_licenses), score, 0.01);
+
+ delete prevState;
+ prevState = nextState;
+
+ }
+
+ // the licenses for most
+ {
+ Phrase phrase;
+ BOOST_CHECK( phrase.GetSize() == 0 );
+
+ std::vector<FactorType> outputFactorOrder;
+ outputFactorOrder.push_back(0);
+
+ phrase.CreateFromString(
+ outputFactorOrder,
+ "most",
+ StaticData::Instance().GetFactorDelimiter());
+
+ BOOST_CHECK( phrase.GetSize() == 1 );
+
+ float score;
+ nextState = backwardLM->Evaluate(phrase, prevState, score);
+
+ // p(most) * p(most | for) / p(for)
+ SLOPPY_CHECK_CLOSE( (p_most + p_most_for - p_for), score, 0.01);
+
+ delete prevState;
+ prevState = nextState;
+
+ }
+
+ delete prevState;
}
-
- private:
- const Sentence *dummyInput;
- BackwardLanguageModel<lm::ngram::ProbingModel> *backwardLM;
+
+private:
+ const Sentence *dummyInput;
+ BackwardLanguageModel<lm::ngram::ProbingModel> *backwardLM;
};
}
-const char *FileLocation() {
+const char *FileLocation()
+{
if (boost::unit_test::framework::master_test_suite().argc < 2) {
BOOST_FAIL("Jamfile must specify arpa file for this test, but did not");
}
return boost::unit_test::framework::master_test_suite().argv[1];
}
-BOOST_AUTO_TEST_CASE(ProbingAll) {
+BOOST_AUTO_TEST_CASE(ProbingAll)
+{
BackwardLanguageModelTest test;
test.testEmptyHypothesis();
diff --git a/moses/LM/Base.cpp b/moses/LM/Base.cpp
index fe35604b0..37dc704de 100644
--- a/moses/LM/Base.cpp
+++ b/moses/LM/Base.cpp
@@ -31,63 +31,67 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
using namespace std;
-namespace Moses {
+namespace Moses
+{
LanguageModel::LanguageModel(const std::string& description, const std::string &line) :
StatefulFeatureFunction(description, StaticData::Instance().GetLMEnableOOVFeature() ? 2 : 1, line )
{
- m_enableOOVFeature = StaticData::Instance().GetLMEnableOOVFeature();
+ m_enableOOVFeature = StaticData::Instance().GetLMEnableOOVFeature();
}
LanguageModel::~LanguageModel() {}
-float LanguageModel::GetWeight() const {
+float LanguageModel::GetWeight() const
+{
//return StaticData::Instance().GetAllWeights().GetScoresForProducer(this)[0];
return StaticData::Instance().GetWeights(this)[0];
}
-float LanguageModel::GetOOVWeight() const {
+float LanguageModel::GetOOVWeight() const
+{
if (m_enableOOVFeature) {
//return StaticData::Instance().GetAllWeights().GetScoresForProducer(this)[1];
- return StaticData::Instance().GetWeights(this)[1];
+ return StaticData::Instance().GetWeights(this)[1];
} else {
return 0;
}
}
-void LanguageModel::IncrementalCallback(Incremental::Manager &manager) const {
+void LanguageModel::IncrementalCallback(Incremental::Manager &manager) const
+{
UTIL_THROW(util::Exception, "Incremental search is only supported by KenLM.");
}
void LanguageModel::Evaluate(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const
{
- if (Useable(targetPhrase)) {
- // contains factors used by this LM
- float fullScore, nGramScore;
- size_t oovCount;
-
- CalcScore(targetPhrase, fullScore, nGramScore, oovCount);
- float estimateScore = fullScore - nGramScore;
-
- if (StaticData::Instance().GetLMEnableOOVFeature()) {
- vector<float> scores(2), estimateScores(2);
- scores[0] = nGramScore;
- scores[1] = oovCount;
- scoreBreakdown.Assign(this, scores);
-
- estimateScores[0] = estimateScore;
- estimateScores[1] = 0;
- estimatedFutureScore.Assign(this, estimateScores);
- } else {
- scoreBreakdown.Assign(this, nGramScore);
- estimatedFutureScore.Assign(this, estimateScore);
- }
-
- }
+ if (Useable(targetPhrase)) {
+ // contains factors used by this LM
+ float fullScore, nGramScore;
+ size_t oovCount;
+
+ CalcScore(targetPhrase, fullScore, nGramScore, oovCount);
+ float estimateScore = fullScore - nGramScore;
+
+ if (StaticData::Instance().GetLMEnableOOVFeature()) {
+ vector<float> scores(2), estimateScores(2);
+ scores[0] = nGramScore;
+ scores[1] = oovCount;
+ scoreBreakdown.Assign(this, scores);
+
+ estimateScores[0] = estimateScore;
+ estimateScores[1] = 0;
+ estimatedFutureScore.Assign(this, estimateScores);
+ } else {
+ scoreBreakdown.Assign(this, nGramScore);
+ estimatedFutureScore.Assign(this, estimateScore);
+ }
+
+ }
}
const LanguageModel &LanguageModel::GetFirstLM()
diff --git a/moses/LM/Base.h b/moses/LM/Base.h
index 961fead5f..1f976ee53 100644
--- a/moses/LM/Base.h
+++ b/moses/LM/Base.h
@@ -30,21 +30,25 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
namespace Moses
{
-namespace Incremental { class Manager; }
+namespace Incremental
+{
+class Manager;
+}
class FactorCollection;
class Factor;
class Phrase;
//! Abstract base class which represent a language model on a contiguous phrase
-class LanguageModel : public StatefulFeatureFunction {
+class LanguageModel : public StatefulFeatureFunction
+{
protected:
LanguageModel(const std::string& description, const std::string &line);
// This can't be in the constructor for virual function dispatch reasons
bool m_enableOOVFeature;
-
+
public:
static const LanguageModel &GetFirstLM();
@@ -89,9 +93,9 @@ public:
virtual void IncrementalCallback(Incremental::Manager &manager) const;
virtual void Evaluate(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const;
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const;
};
diff --git a/moses/LM/ChartState.h b/moses/LM/ChartState.h
index b6bdd8f7c..186694927 100644
--- a/moses/LM/ChartState.h
+++ b/moses/LM/ChartState.h
@@ -19,16 +19,15 @@ private:
const ChartHypothesis &m_hypo;
- /** Construct the prefix string of up to specified size
+ /** Construct the prefix string of up to specified size
* \param ret prefix string
* \param size maximum size (typically max lm context window)
*/
- size_t CalcPrefix(const ChartHypothesis &hypo, int featureID, Phrase &ret, size_t size) const
- {
+ size_t CalcPrefix(const ChartHypothesis &hypo, int featureID, Phrase &ret, size_t size) const {
const TargetPhrase &target = hypo.GetCurrTargetPhrase();
const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
- target.GetAlignNonTerm().GetNonTermIndexMap();
-
+ target.GetAlignNonTerm().GetNonTermIndexMap();
+
// loop over the rule that is being applied
for (size_t pos = 0; pos < target.GetSize(); ++pos) {
const Word &word = target.GetWord(pos);
@@ -53,13 +52,12 @@ private:
return size;
}
- /** Construct the suffix phrase of up to specified size
+ /** Construct the suffix phrase of up to specified size
* will always be called after the construction of prefix phrase
* \param ret suffix phrase
* \param size maximum size of suffix
*/
- size_t CalcSuffix(const ChartHypothesis &hypo, int featureID, Phrase &ret, size_t size) const
- {
+ size_t CalcSuffix(const ChartHypothesis &hypo, int featureID, Phrase &ret, size_t size) const {
CHECK(m_contextPrefix.GetSize() <= m_numTargetTerminals);
// special handling for small hypotheses
@@ -81,7 +79,7 @@ private:
else {
const TargetPhrase& target = hypo.GetCurrTargetPhrase();
const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
- target.GetAlignNonTerm().GetNonTermIndexMap();
+ target.GetAlignNonTerm().GetNonTermIndexMap();
for (int pos = (int) target.GetSize() - 1; pos >= 0 ; --pos) {
const Word &word = target.GetWord(pos);
@@ -89,8 +87,7 @@ private:
size_t nonTermInd = nonTermIndexMap[pos];
const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermInd);
size = static_cast<const LanguageModelChartState*>(prevHypo->GetFFState(featureID))->CalcSuffix(*prevHypo, featureID, ret, size);
- }
- else {
+ } else {
ret.PrependWord(hypo.GetCurrTargetPhrase().GetWord(pos));
size--;
}
@@ -106,11 +103,10 @@ private:
public:
LanguageModelChartState(const ChartHypothesis &hypo, int featureID, size_t order)
- :m_lmRightContext(NULL)
- ,m_contextPrefix(order - 1)
- ,m_contextSuffix( order - 1)
- ,m_hypo(hypo)
- {
+ :m_lmRightContext(NULL)
+ ,m_contextPrefix(order - 1)
+ ,m_contextSuffix( order - 1)
+ ,m_hypo(hypo) {
m_numTargetTerminals = hypo.GetCurrTargetPhrase().GetNumTerminals();
for (std::vector<const ChartHypothesis*>::const_iterator i = hypo.GetPrevHypos().begin(); i != hypo.GetPrevHypos().end(); ++i) {
@@ -131,8 +127,12 @@ public:
m_lmRightContext = rightState;
}
- float GetPrefixScore() const { return m_prefixScore; }
- FFState* GetRightContext() const { return m_lmRightContext; }
+ float GetPrefixScore() const {
+ return m_prefixScore;
+ }
+ FFState* GetRightContext() const {
+ return m_lmRightContext;
+ }
size_t GetNumTargetTerminals() const {
return m_numTargetTerminals;
@@ -150,8 +150,7 @@ public:
dynamic_cast<const LanguageModelChartState &>( o );
// prefix
- if (m_hypo.GetCurrSourceRange().GetStartPos() > 0) // not for "<s> ..."
- {
+ if (m_hypo.GetCurrSourceRange().GetStartPos() > 0) { // not for "<s> ..."
int ret = GetPrefix().Compare(other.GetPrefix());
if (ret != 0)
return ret;
@@ -159,8 +158,7 @@ public:
// suffix
size_t inputSize = m_hypo.GetManager().GetSource().GetSize();
- if (m_hypo.GetCurrSourceRange().GetEndPos() < inputSize - 1)// not for "... </s>"
- {
+ if (m_hypo.GetCurrSourceRange().GetEndPos() < inputSize - 1) { // not for "... </s>"
int ret = other.GetRightContext()->Compare(*m_lmRightContext);
if (ret != 0)
return ret;
diff --git a/moses/LM/IRST.cpp b/moses/LM/IRST.cpp
index 2d58bd310..ae1bb677d 100644
--- a/moses/LM/IRST.cpp
+++ b/moses/LM/IRST.cpp
@@ -40,25 +40,22 @@ using namespace std;
namespace Moses
{
LanguageModelIRST::LanguageModelIRST(const std::string &line)
-:LanguageModelSingleFactor("IRSTLM", line)
+ :LanguageModelSingleFactor("IRSTLM", line)
{
FactorType factorType;
size_t nGramOrder;
string filePath;
for (size_t i = 0; i < m_args.size(); ++i) {
- const vector<string> &args = m_args[i];
+ const vector<string> &args = m_args[i];
if (args[0] == "factor") {
factorType = Scan<FactorType>(args[1]);
- }
- else if (args[0] == "order") {
+ } else if (args[0] == "order") {
nGramOrder = Scan<size_t>(args[1]);
- }
- else if (args[0] == "path") {
+ } else if (args[0] == "path") {
filePath = args[1];
- }
- else {
+ } else {
throw "Unknown argument " + args[0];
}
}
@@ -86,8 +83,7 @@ bool LanguageModelIRST::Load(const std::string &filePath,
const StaticData &staticData = StaticData::Instance();
int threadCount = staticData.ThreadCount();
- if (threadCount != 1)
- {
+ if (threadCount != 1) {
UserMessage::Add(threadCount + " number of threads specified but IRST LM is not threadsafe.");
return false;
}
@@ -99,7 +95,7 @@ bool LanguageModelIRST::Load(const std::string &filePath,
m_filePath = filePath;
- m_lmtb = m_lmtb->CreateLanguageModel(m_filePath);
+ m_lmtb = m_lmtb->CreateLanguageModel(m_filePath);
m_lmtb->setMaxLoadedLevel(1000);
m_lmtb->load(m_filePath);
d=m_lmtb->getDict();
@@ -170,7 +166,7 @@ int LanguageModelIRST::GetLmID( const std::string &str ) const
}
int LanguageModelIRST::GetLmID( const Factor *factor ) const
-{
+{
size_t factorId = factor->GetId();
if ((factorId >= m_lmIdLookup.size()) || (m_lmIdLookup[factorId] == m_empty)) {
@@ -180,12 +176,12 @@ int LanguageModelIRST::GetLmID( const Factor *factor ) const
//////////
///poiche' non c'e' distinzione tra i factorIDs delle parole sorgenti
- ///e delle parole target in Moses, puo' accadere che una parola target
+ ///e delle parole target in Moses, puo' accadere che una parola target
///di cui non sia stato ancora calcolato il suo codice target abbia
///comunque un factorID noto (e quindi minore di m_lmIdLookup.size())
///E' necessario dunque identificare questi casi di indeterminatezza
///del codice target. Attualamente, questo controllo e' stato implementato
- ///impostando a m_empty tutti i termini che non hanno ancora
+ ///impostando a m_empty tutti i termini che non hanno ancora
//ricevuto un codice target effettivo
///////////
@@ -197,7 +193,7 @@ int LanguageModelIRST::GetLmID( const Factor *factor ) const
/// IN POSIZIONE (factorID-1) invece che in posizione factrID dove dopo andiamo a leggerlo (vedi caso C
/// Cosi' funziona ....
/// ho un dubbio su cosa c'e' nelle prime posizioni di m_lmIdLookup
-/// quindi
+/// quindi
/// e scopro che rimane vuota una entry ogni due
/// perche' factorID cresce di due in due (perche' codifica sia source che target) "vuota" la posizione (factorID-1)
/// non da problemi di correttezza, ma solo di "spreco" di memoria
@@ -207,10 +203,10 @@ int LanguageModelIRST::GetLmID( const Factor *factor ) const
////////////////
- if (factorId >= m_lmIdLookup.size()){
- //resize and fill with m_empty
- //increment the array more than needed to avoid too many resizing operation.
- m_lmIdLookup.resize(factorId+10, m_empty);
+ if (factorId >= m_lmIdLookup.size()) {
+ //resize and fill with m_empty
+ //increment the array more than needed to avoid too many resizing operation.
+ m_lmIdLookup.resize(factorId+10, m_empty);
}
//insert new code
diff --git a/moses/LM/Implementation.cpp b/moses/LM/Implementation.cpp
index 798a12775..e9c651089 100644
--- a/moses/LM/Implementation.cpp
+++ b/moses/LM/Implementation.cpp
@@ -69,8 +69,9 @@ void LanguageModelImplementation::GetState(
GetValueForgotState(contextFactor, state);
}
-// Calculate score of a phrase.
-void LanguageModelImplementation::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const {
+// Calculate score of a phrase.
+void LanguageModelImplementation::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const
+{
fullScore = 0;
ngramScore = 0;
@@ -82,7 +83,7 @@ void LanguageModelImplementation::CalcScore(const Phrase &phrase, float &fullSco
vector<const Word*> contextFactor;
contextFactor.reserve(GetNGramOrder());
std::auto_ptr<FFState> state(NewState((phrase.GetWord(0) == GetSentenceStartWord()) ?
- GetBeginSentenceState() : GetNullContextState()));
+ GetBeginSentenceState() : GetNullContextState()));
size_t currPos = 0;
while (currPos < phraseSize) {
const Word &word = phrase.GetWord(currPos);
@@ -109,7 +110,7 @@ void LanguageModelImplementation::CalcScore(const Phrase &phrase, float &fullSco
fullScore += result.score;
if (contextFactor.size() == GetNGramOrder())
ngramScore += result.score;
- if (result.unknown) ++oovCount;
+ if (result.unknown) ++oovCount;
}
}
@@ -117,7 +118,8 @@ void LanguageModelImplementation::CalcScore(const Phrase &phrase, float &fullSco
}
}
-FFState *LanguageModelImplementation::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const {
+FFState *LanguageModelImplementation::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const
+{
// In this function, we only compute the LM scores of n-grams that overlap a
// phrase boundary. Phrase-internal scores are taken directly from the
// translation option.
@@ -179,9 +181,7 @@ FFState *LanguageModelImplementation::Evaluate(const Hypothesis &hypo, const FFS
contextFactor[i] = &hypo.GetWord((size_t)currPos);
}
lmScore += GetValueForgotState(contextFactor, *res).score;
- }
- else
- {
+ } else {
if (endPos < currEndPos) {
//need to get the LM state (otherwise the last LM state is fine)
for (size_t currPos = endPos+1; currPos <= currEndPos; currPos++) {
@@ -208,7 +208,8 @@ FFState *LanguageModelImplementation::Evaluate(const Hypothesis &hypo, const FFS
return res;
}
-FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection* out) const {
+FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection* out) const
+{
LanguageModelChartState *ret = new LanguageModelChartState(hypo, featureID, GetNGramOrder());
// data structure for factored context phrase (history and predicted word)
vector<const Word*> contextFactor;
@@ -223,38 +224,33 @@ FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo,
// get index map for underlying hypotheses
const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
- hypo.GetCurrTargetPhrase().GetAlignNonTerm().GetNonTermIndexMap();
+ hypo.GetCurrTargetPhrase().GetAlignNonTerm().GetNonTermIndexMap();
// loop over rule
for (size_t phrasePos = 0, wordPos = 0;
phrasePos < hypo.GetCurrTargetPhrase().GetSize();
- phrasePos++)
- {
+ phrasePos++) {
// consult rule for either word or non-terminal
const Word &word = hypo.GetCurrTargetPhrase().GetWord(phrasePos);
// regular word
- if (!word.IsNonTerminal())
- {
+ if (!word.IsNonTerminal()) {
ShiftOrPush(contextFactor, word);
// beginning of sentence symbol <s>? -> just update state
- if (word == GetSentenceStartWord())
- {
+ if (word == GetSentenceStartWord()) {
CHECK(phrasePos == 0);
delete lmState;
lmState = NewState( GetBeginSentenceState() );
}
// score a regular word added by the rule
- else
- {
+ else {
updateChartScore( &prefixScore, &finalizedScore, GetValueGivenState(contextFactor, *lmState).score, ++wordPos );
}
}
// non-terminal, add phrase from underlying hypothesis
- else
- {
+ else {
// look up underlying hypothesis
size_t nonTermIndex = nonTermIndexMap[phrasePos];
const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndex);
@@ -278,8 +274,7 @@ FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo,
// push suffix
int suffixPos = prevState->GetSuffix().GetSize() - (GetNGramOrder()-1);
if (suffixPos < 0) suffixPos = 0; // push all words if less than order
- for(;(size_t)suffixPos < prevState->GetSuffix().GetSize(); suffixPos++)
- {
+ for(; (size_t)suffixPos < prevState->GetSuffix().GetSize(); suffixPos++) {
const Word &word = prevState->GetSuffix().GetWord(suffixPos);
ShiftOrPush(contextFactor, word);
wordPos++;
@@ -287,22 +282,19 @@ FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo,
}
// internal non-terminal
- else
- {
+ else {
// score its prefix
for(size_t prefixPos = 0;
prefixPos < GetNGramOrder()-1 // up to LM order window
- && prefixPos < subPhraseLength; // up to length
- prefixPos++)
- {
+ && prefixPos < subPhraseLength; // up to length
+ prefixPos++) {
const Word &word = prevState->GetPrefix().GetWord(prefixPos);
ShiftOrPush(contextFactor, word);
updateChartScore( &prefixScore, &finalizedScore, GetValueGivenState(contextFactor, *lmState).score, ++wordPos );
}
// check if we are dealing with a large sub-phrase
- if (subPhraseLength > GetNGramOrder() - 1)
- {
+ if (subPhraseLength > GetNGramOrder() - 1) {
// add its finalized language model score
finalizedScore +=
prevHypo->GetScoreBreakdown().GetScoresForProducer(this)[0] // full score
@@ -337,11 +329,11 @@ FFState* LanguageModelImplementation::EvaluateChart(const ChartHypothesis& hypo,
return ret;
}
-void LanguageModelImplementation::updateChartScore(float *prefixScore, float *finalizedScore, float score, size_t wordPos) const {
+void LanguageModelImplementation::updateChartScore(float *prefixScore, float *finalizedScore, float score, size_t wordPos) const
+{
if (wordPos < GetNGramOrder()) {
*prefixScore += score;
- }
- else {
+ } else {
*finalizedScore += score;
}
}
diff --git a/moses/LM/Implementation.h b/moses/LM/Implementation.h
index d3f83dfe1..fa6619208 100644
--- a/moses/LM/Implementation.h
+++ b/moses/LM/Implementation.h
@@ -44,7 +44,7 @@ class Phrase;
struct LMResult {
// log probability
float score;
- // Is the word unknown?
+ // Is the word unknown?
bool unknown;
};
@@ -62,7 +62,7 @@ protected:
//! Usually <s> and </s>
LanguageModelImplementation(const std::string& description, const std::string &line)
- :LanguageModel(description, line)
+ :LanguageModel(description, line)
{}
public:
@@ -108,8 +108,7 @@ public:
return m_sentenceEndWord;
}
- const FFState* EmptyHypothesisState(const InputType &/*input*/) const
- {
+ const FFState* EmptyHypothesisState(const InputType &/*input*/) const {
return NewState(GetBeginSentenceState());
}
diff --git a/moses/LM/Joint.h b/moses/LM/Joint.h
index 5bc52e2da..3a675cbd6 100644
--- a/moses/LM/Joint.h
+++ b/moses/LM/Joint.h
@@ -50,8 +50,7 @@ protected:
size_t m_implFactor;
public:
LanguageModelJoint(const std::string &line, LanguageModelSingleFactor *lmImpl)
- :LanguageModelMultiFactor("JointLM", line)
- {
+ :LanguageModelMultiFactor("JointLM", line) {
m_lmImpl = lmImpl;
}
diff --git a/moses/LM/Ken.cpp b/moses/LM/Ken.cpp
index e251661c3..af24ad858 100644
--- a/moses/LM/Ken.cpp
+++ b/moses/LM/Ken.cpp
@@ -45,8 +45,10 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
using namespace std;
-namespace Moses {
-namespace {
+namespace Moses
+{
+namespace
+{
struct KenLMState : public FFState {
lm::ngram::State state;
@@ -61,63 +63,65 @@ struct KenLMState : public FFState {
/*
* An implementation of single factor LM using Ken's code.
*/
-template <class Model> class LanguageModelKen : public LanguageModel {
- public:
- LanguageModelKen(const std::string &description, const std::string &line, const std::string &file, FactorType factorType, bool lazy);
+template <class Model> class LanguageModelKen : public LanguageModel
+{
+public:
+ LanguageModelKen(const std::string &description, const std::string &line, const std::string &file, FactorType factorType, bool lazy);
- bool Useable(const Phrase &phrase) const {
- return (phrase.GetSize()>0 && phrase.GetFactor(0, m_factorType) != NULL);
- }
+ bool Useable(const Phrase &phrase) const {
+ return (phrase.GetSize()>0 && phrase.GetFactor(0, m_factorType) != NULL);
+ }
- const FFState *EmptyHypothesisState(const InputType &/*input*/) const {
- KenLMState *ret = new KenLMState();
- ret->state = m_ngram->BeginSentenceState();
- return ret;
- }
+ const FFState *EmptyHypothesisState(const InputType &/*input*/) const {
+ KenLMState *ret = new KenLMState();
+ ret->state = m_ngram->BeginSentenceState();
+ return ret;
+ }
- void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const;
+ void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const;
- FFState *Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const;
+ FFState *Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const;
- FFState *EvaluateChart(const ChartHypothesis& cur_hypo, int featureID, ScoreComponentCollection *accumulator) const;
+ FFState *EvaluateChart(const ChartHypothesis& cur_hypo, int featureID, ScoreComponentCollection *accumulator) const;
- void IncrementalCallback(Incremental::Manager &manager) const {
- manager.LMCallback(*m_ngram, m_lmIdLookup);
- }
+ void IncrementalCallback(Incremental::Manager &manager) const {
+ manager.LMCallback(*m_ngram, m_lmIdLookup);
+ }
- private:
- LanguageModelKen(const LanguageModelKen<Model> &copy_from);
+private:
+ LanguageModelKen(const LanguageModelKen<Model> &copy_from);
- lm::WordIndex TranslateID(const Word &word) const {
- std::size_t factor = word.GetFactor(m_factorType)->GetId();
- return (factor >= m_lmIdLookup.size() ? 0 : m_lmIdLookup[factor]);
- }
+ lm::WordIndex TranslateID(const Word &word) const {
+ std::size_t factor = word.GetFactor(m_factorType)->GetId();
+ return (factor >= m_lmIdLookup.size() ? 0 : m_lmIdLookup[factor]);
+ }
- // Convert last words of hypothesis into vocab ids, returning an end pointer.
- lm::WordIndex *LastIDs(const Hypothesis &hypo, lm::WordIndex *indices) const {
- lm::WordIndex *index = indices;
- lm::WordIndex *end = indices + m_ngram->Order() - 1;
- int position = hypo.GetCurrTargetWordsRange().GetEndPos();
- for (; ; ++index, --position) {
- if (index == end) return index;
- if (position == -1) {
- *index = m_ngram->GetVocabulary().BeginSentence();
- return index + 1;
- }
- *index = TranslateID(hypo.GetWord(position));
+ // Convert last words of hypothesis into vocab ids, returning an end pointer.
+ lm::WordIndex *LastIDs(const Hypothesis &hypo, lm::WordIndex *indices) const {
+ lm::WordIndex *index = indices;
+ lm::WordIndex *end = indices + m_ngram->Order() - 1;
+ int position = hypo.GetCurrTargetWordsRange().GetEndPos();
+ for (; ; ++index, --position) {
+ if (index == end) return index;
+ if (position == -1) {
+ *index = m_ngram->GetVocabulary().BeginSentence();
+ return index + 1;
}
+ *index = TranslateID(hypo.GetWord(position));
}
+ }
- boost::shared_ptr<Model> m_ngram;
-
- std::vector<lm::WordIndex> m_lmIdLookup;
+ boost::shared_ptr<Model> m_ngram;
- FactorType m_factorType;
+ std::vector<lm::WordIndex> m_lmIdLookup;
- const Factor *m_beginSentenceFactor;
+ FactorType m_factorType;
+
+ const Factor *m_beginSentenceFactor;
};
-class MappingBuilder : public lm::EnumerateVocab {
+class MappingBuilder : public lm::EnumerateVocab
+{
public:
MappingBuilder(FactorCollection &factorCollection, std::vector<lm::WordIndex> &mapping)
: m_factorCollection(factorCollection), m_mapping(mapping) {}
@@ -137,13 +141,14 @@ private:
};
template <class Model> LanguageModelKen<Model>::LanguageModelKen(const std::string &description, const std::string &line, const std::string &file, FactorType factorType, bool lazy)
-:LanguageModel(description, line)
-,m_factorType(factorType)
+ :LanguageModel(description, line)
+ ,m_factorType(factorType)
{
lm::ngram::Config config;
IFVERBOSE(1) {
config.messages = &std::cerr;
- } else {
+ }
+ else {
config.messages = NULL;
}
FactorCollection &collection = FactorCollection::Instance();
@@ -157,15 +162,17 @@ template <class Model> LanguageModelKen<Model>::LanguageModelKen(const std::stri
}
template <class Model> LanguageModelKen<Model>::LanguageModelKen(const LanguageModelKen<Model> &copy_from)
-:LanguageModel(copy_from.GetScoreProducerDescription(), copy_from.GetArgLine()),
-m_ngram(copy_from.m_ngram),
+ :LanguageModel(copy_from.GetScoreProducerDescription(), copy_from.GetArgLine()),
+ m_ngram(copy_from.m_ngram),
// TODO: don't copy this.
-m_lmIdLookup(copy_from.m_lmIdLookup),
-m_factorType(copy_from.m_factorType),
-m_beginSentenceFactor(copy_from.m_beginSentenceFactor) {
+ m_lmIdLookup(copy_from.m_lmIdLookup),
+ m_factorType(copy_from.m_factorType),
+ m_beginSentenceFactor(copy_from.m_beginSentenceFactor)
+{
}
-template <class Model> void LanguageModelKen<Model>::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const {
+template <class Model> void LanguageModelKen<Model>::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const
+{
fullScore = 0;
ngramScore = 0;
oovCount = 0;
@@ -174,7 +181,7 @@ template <class Model> void LanguageModelKen<Model>::CalcScore(const Phrase &phr
lm::ngram::ChartState discarded_sadly;
lm::ngram::RuleScore<Model> scorer(*m_ngram, discarded_sadly);
-
+
size_t position;
if (m_beginSentenceFactor == phrase.GetWord(0).GetFactor(m_factorType)) {
scorer.BeginSentence();
@@ -182,7 +189,7 @@ template <class Model> void LanguageModelKen<Model>::CalcScore(const Phrase &phr
} else {
position = 0;
}
-
+
size_t ngramBoundary = m_ngram->Order() - 1;
size_t end_loop = std::min(ngramBoundary, phrase.GetSize());
@@ -199,7 +206,7 @@ template <class Model> void LanguageModelKen<Model>::CalcScore(const Phrase &phr
}
float before_boundary = fullScore + scorer.Finish();
for (; position < phrase.GetSize(); ++position) {
- const Word &word = phrase.GetWord(position);
+ const Word &word = phrase.GetWord(position);
if (word.IsNonTerminal()) {
fullScore += scorer.Finish();
scorer.Reset();
@@ -207,7 +214,7 @@ template <class Model> void LanguageModelKen<Model>::CalcScore(const Phrase &phr
lm::WordIndex index = TranslateID(word);
scorer.Terminal(index);
if (!index) ++oovCount;
- }
+ }
}
fullScore += scorer.Finish();
@@ -215,11 +222,12 @@ template <class Model> void LanguageModelKen<Model>::CalcScore(const Phrase &phr
fullScore = TransformLMScore(fullScore);
}
-template <class Model> FFState *LanguageModelKen<Model>::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const {
+template <class Model> FFState *LanguageModelKen<Model>::Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const
+{
const lm::ngram::State &in_state = static_cast<const KenLMState&>(*ps).state;
std::auto_ptr<KenLMState> ret(new KenLMState());
-
+
if (!hypo.GetCurrTargetLength()) {
ret->state = in_state;
return ret.release();
@@ -242,17 +250,17 @@ template <class Model> FFState *LanguageModelKen<Model>::Evaluate(const Hypothes
}
if (hypo.IsSourceCompleted()) {
- // Score end of sentence.
+ // Score end of sentence.
std::vector<lm::WordIndex> indices(m_ngram->Order() - 1);
const lm::WordIndex *last = LastIDs(hypo, &indices.front());
score += m_ngram->FullScoreForgotState(&indices.front(), last, m_ngram->GetVocabulary().EndSentence(), ret->state).prob;
} else if (adjust_end < end) {
- // Get state after adding a long phrase.
+ // Get state after adding a long phrase.
std::vector<lm::WordIndex> indices(m_ngram->Order() - 1);
const lm::WordIndex *last = LastIDs(hypo, &indices.front());
m_ngram->GetState(&indices.front(), last, ret->state);
} else if (state0 != &ret->state) {
- // Short enough phrase that we can just reuse the state.
+ // Short enough phrase that we can just reuse the state.
ret->state = *state0;
}
@@ -270,34 +278,39 @@ template <class Model> FFState *LanguageModelKen<Model>::Evaluate(const Hypothes
return ret.release();
}
-class LanguageModelChartStateKenLM : public FFState {
- public:
- LanguageModelChartStateKenLM() {}
+class LanguageModelChartStateKenLM : public FFState
+{
+public:
+ LanguageModelChartStateKenLM() {}
- const lm::ngram::ChartState &GetChartState() const { return m_state; }
- lm::ngram::ChartState &GetChartState() { return m_state; }
+ const lm::ngram::ChartState &GetChartState() const {
+ return m_state;
+ }
+ lm::ngram::ChartState &GetChartState() {
+ return m_state;
+ }
- int Compare(const FFState& o) const
- {
- const LanguageModelChartStateKenLM &other = static_cast<const LanguageModelChartStateKenLM&>(o);
- int ret = m_state.Compare(other.m_state);
- return ret;
- }
+ int Compare(const FFState& o) const {
+ const LanguageModelChartStateKenLM &other = static_cast<const LanguageModelChartStateKenLM&>(o);
+ int ret = m_state.Compare(other.m_state);
+ return ret;
+ }
- private:
- lm::ngram::ChartState m_state;
+private:
+ lm::ngram::ChartState m_state;
};
-template <class Model> FFState *LanguageModelKen<Model>::EvaluateChart(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection *accumulator) const {
+template <class Model> FFState *LanguageModelKen<Model>::EvaluateChart(const ChartHypothesis& hypo, int featureID, ScoreComponentCollection *accumulator) const
+{
LanguageModelChartStateKenLM *newState = new LanguageModelChartStateKenLM();
lm::ngram::RuleScore<Model> ruleScore(*m_ngram, newState->GetChartState());
const TargetPhrase &target = hypo.GetCurrTargetPhrase();
const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
- target.GetAlignNonTerm().GetNonTermIndexMap();
+ target.GetAlignNonTerm().GetNonTermIndexMap();
const size_t size = hypo.GetCurrTargetPhrase().GetSize();
size_t phrasePos = 0;
- // Special cases for first word.
+ // Special cases for first word.
if (size) {
const Word &word = hypo.GetCurrTargetPhrase().GetWord(0);
if (word.GetFactor(m_factorType) == m_beginSentenceFactor) {
@@ -305,7 +318,7 @@ template <class Model> FFState *LanguageModelKen<Model>::EvaluateChart(const Cha
ruleScore.BeginSentence();
phrasePos++;
} else if (word.IsNonTerminal()) {
- // Non-terminal is first so we can copy instead of rescoring.
+ // Non-terminal is first so we can copy instead of rescoring.
const ChartHypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndexMap[phrasePos]);
const lm::ngram::ChartState &prevState = static_cast<const LanguageModelChartStateKenLM*>(prevHypo->GetFFState(featureID))->GetChartState();
float prob = UntransformLMScore(prevHypo->GetScoreBreakdown().GetScoresForProducer(this)[0]);
@@ -347,20 +360,15 @@ LanguageModel *ConstructKenLM(const std::string &description, const std::string
if (args[0] == "factor") {
factorType = Scan<FactorType>(args[1]);
- }
- else if (args[0] == "order") {
+ } else if (args[0] == "order") {
//nGramOrder = Scan<size_t>(args[1]);
- }
- else if (args[0] == "path") {
+ } else if (args[0] == "path") {
filePath = args[1];
- }
- else if (args[0] == "lazyken") {
+ } else if (args[0] == "lazyken") {
lazy = Scan<bool>(args[1]);
- }
- else if (args[0] == "name") {
+ } else if (args[0] == "name") {
// that's ok. do nothing, passes onto LM constructor
- }
- else {
+ } else {
throw "Unknown argument " + args[0];
}
}
@@ -368,26 +376,27 @@ LanguageModel *ConstructKenLM(const std::string &description, const std::string
return ConstructKenLM(description, line, filePath, factorType, lazy);
}
-LanguageModel *ConstructKenLM(const std::string &description, const std::string &line, const std::string &file, FactorType factorType, bool lazy) {
+LanguageModel *ConstructKenLM(const std::string &description, const std::string &line, const std::string &file, FactorType factorType, bool lazy)
+{
try {
lm::ngram::ModelType model_type;
if (lm::ngram::RecognizeBinary(file.c_str(), model_type)) {
switch(model_type) {
- case lm::ngram::PROBING:
- return new LanguageModelKen<lm::ngram::ProbingModel>(description, line, file, factorType, lazy);
- case lm::ngram::REST_PROBING:
- return new LanguageModelKen<lm::ngram::RestProbingModel>(description, line, file, factorType, lazy);
- case lm::ngram::TRIE:
- return new LanguageModelKen<lm::ngram::TrieModel>(description, line, file, factorType, lazy);
- case lm::ngram::QUANT_TRIE:
- return new LanguageModelKen<lm::ngram::QuantTrieModel>(description, line, file, factorType, lazy);
- case lm::ngram::ARRAY_TRIE:
- return new LanguageModelKen<lm::ngram::ArrayTrieModel>(description, line, file, factorType, lazy);
- case lm::ngram::QUANT_ARRAY_TRIE:
- return new LanguageModelKen<lm::ngram::QuantArrayTrieModel>(description, line, file, factorType, lazy);
- default:
- std::cerr << "Unrecognized kenlm model type " << model_type << std::endl;
- abort();
+ case lm::ngram::PROBING:
+ return new LanguageModelKen<lm::ngram::ProbingModel>(description, line, file, factorType, lazy);
+ case lm::ngram::REST_PROBING:
+ return new LanguageModelKen<lm::ngram::RestProbingModel>(description, line, file, factorType, lazy);
+ case lm::ngram::TRIE:
+ return new LanguageModelKen<lm::ngram::TrieModel>(description, line, file, factorType, lazy);
+ case lm::ngram::QUANT_TRIE:
+ return new LanguageModelKen<lm::ngram::QuantTrieModel>(description, line, file, factorType, lazy);
+ case lm::ngram::ARRAY_TRIE:
+ return new LanguageModelKen<lm::ngram::ArrayTrieModel>(description, line, file, factorType, lazy);
+ case lm::ngram::QUANT_ARRAY_TRIE:
+ return new LanguageModelKen<lm::ngram::QuantArrayTrieModel>(description, line, file, factorType, lazy);
+ default:
+ std::cerr << "Unrecognized kenlm model type " << model_type << std::endl;
+ abort();
}
} else {
return new LanguageModelKen<lm::ngram::ProbingModel>(description, line, file, factorType, lazy);
diff --git a/moses/LM/Ken.h b/moses/LM/Ken.h
index 3c2ceb774..360ac7be8 100644
--- a/moses/LM/Ken.h
+++ b/moses/LM/Ken.h
@@ -26,7 +26,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "moses/TypeDef.h"
-namespace Moses {
+namespace Moses
+{
class LanguageModel;
diff --git a/moses/LM/LDHT.cpp b/moses/LM/LDHT.cpp
index 5f52b2514..1b4e70661 100644
--- a/moses/LM/LDHT.cpp
+++ b/moses/LM/LDHT.cpp
@@ -16,7 +16,8 @@
#include <boost/thread/tss.hpp>
-namespace Moses {
+namespace Moses
+{
struct LDHTLMState : public FFState {
LDHT::NewNgram gram_fingerprints;
@@ -60,338 +61,358 @@ struct LDHTLMState : public FFState {
}
};
-class LanguageModelLDHT : public LanguageModel {
+class LanguageModelLDHT : public LanguageModel
+{
public:
- LanguageModelLDHT();
- LanguageModelLDHT(const std::string& path,
- ScoreIndexManager& manager,
- FactorType factorType);
- LanguageModelLDHT(ScoreIndexManager& manager,
- LanguageModelLDHT& copyFrom);
-
- LDHT::Client* getClientUnsafe() const;
- LDHT::Client* getClientSafe();
- LDHT::Client* initTSSClient();
- virtual ~LanguageModelLDHT();
- virtual void InitializeForInput(InputType const& source);
- virtual void CleanUpAfterSentenceProcessing(const InputType &source);
- virtual const FFState* EmptyHypothesisState(const InputType& input) const;
- virtual bool Useable(const Phrase& phrase) const;
- virtual void CalcScore(const Phrase& phrase,
- float& fullScore,
- float& ngramScore,
- std::size_t& oovCount) const;
- virtual void CalcScoreFromCache(const Phrase& phrase,
- float& fullScore,
- float& ngramScore,
- std::size_t& oovCount) const;
- FFState* Evaluate(const Hypothesis& hypo,
- const FFState* input_state,
- ScoreComponentCollection* score_output) const;
- FFState* EvaluateChart(const ChartHypothesis& hypo,
- int featureID,
- ScoreComponentCollection* accumulator) const;
-
- virtual void IssueRequestsFor(Hypothesis& hypo,
- const FFState* input_state);
- float calcScoreFromState(LDHTLMState* hypo) const;
- void sync();
- void SetFFStateIdx(int state_idx);
+ LanguageModelLDHT();
+ LanguageModelLDHT(const std::string& path,
+ ScoreIndexManager& manager,
+ FactorType factorType);
+ LanguageModelLDHT(ScoreIndexManager& manager,
+ LanguageModelLDHT& copyFrom);
+
+ LDHT::Client* getClientUnsafe() const;
+ LDHT::Client* getClientSafe();
+ LDHT::Client* initTSSClient();
+ virtual ~LanguageModelLDHT();
+ virtual void InitializeForInput(InputType const& source);
+ virtual void CleanUpAfterSentenceProcessing(const InputType &source);
+ virtual const FFState* EmptyHypothesisState(const InputType& input) const;
+ virtual bool Useable(const Phrase& phrase) const;
+ virtual void CalcScore(const Phrase& phrase,
+ float& fullScore,
+ float& ngramScore,
+ std::size_t& oovCount) const;
+ virtual void CalcScoreFromCache(const Phrase& phrase,
+ float& fullScore,
+ float& ngramScore,
+ std::size_t& oovCount) const;
+ FFState* Evaluate(const Hypothesis& hypo,
+ const FFState* input_state,
+ ScoreComponentCollection* score_output) const;
+ FFState* EvaluateChart(const ChartHypothesis& hypo,
+ int featureID,
+ ScoreComponentCollection* accumulator) const;
+
+ virtual void IssueRequestsFor(Hypothesis& hypo,
+ const FFState* input_state);
+ float calcScoreFromState(LDHTLMState* hypo) const;
+ void sync();
+ void SetFFStateIdx(int state_idx);
protected:
- boost::thread_specific_ptr<LDHT::Client> m_client;
- std::string m_configPath;
- FactorType m_factorType;
- int m_state_idx;
- int m_calc_score_count;
- uint64_t m_start_tick;
+ boost::thread_specific_ptr<LDHT::Client> m_client;
+ std::string m_configPath;
+ FactorType m_factorType;
+ int m_state_idx;
+ int m_calc_score_count;
+ uint64_t m_start_tick;
};
LanguageModel* ConstructLDHTLM(const std::string& path,
ScoreIndexManager& manager,
- FactorType factorType) {
- return new LanguageModelLDHT(path, manager, factorType);
+ FactorType factorType)
+{
+ return new LanguageModelLDHT(path, manager, factorType);
}
-LanguageModelLDHT::LanguageModelLDHT() : LanguageModel(), m_client(NULL) {
- m_enableOOVFeature = false;
+LanguageModelLDHT::LanguageModelLDHT() : LanguageModel(), m_client(NULL)
+{
+ m_enableOOVFeature = false;
}
LanguageModelLDHT::LanguageModelLDHT(ScoreIndexManager& manager,
- LanguageModelLDHT& copyFrom) {
- m_calc_score_count = 0;
- //m_client = copyFrom.m_client;
- m_factorType = copyFrom.m_factorType;
- m_configPath = copyFrom.m_configPath;
- Init(manager);
+ LanguageModelLDHT& copyFrom)
+{
+ m_calc_score_count = 0;
+ //m_client = copyFrom.m_client;
+ m_factorType = copyFrom.m_factorType;
+ m_configPath = copyFrom.m_configPath;
+ Init(manager);
}
LanguageModelLDHT::LanguageModelLDHT(const std::string& path,
ScoreIndexManager& manager,
FactorType factorType)
- : m_factorType(factorType) {
- m_configPath = path;
- Init(manager);
+ : m_factorType(factorType)
+{
+ m_configPath = path;
+ Init(manager);
}
-LanguageModelLDHT::~LanguageModelLDHT() {
- // TODO(wilson): should cleanup for each individual thread.
- //delete getClientSafe();
+LanguageModelLDHT::~LanguageModelLDHT()
+{
+ // TODO(wilson): should cleanup for each individual thread.
+ //delete getClientSafe();
}
// Check that there is a TSS Client instance, and instantiate one if
// there isn't.
-LDHT::Client* LanguageModelLDHT::getClientSafe() {
- if (m_client.get() == NULL)
- m_client.reset(initTSSClient());
- return m_client.get();
+LDHT::Client* LanguageModelLDHT::getClientSafe()
+{
+ if (m_client.get() == NULL)
+ m_client.reset(initTSSClient());
+ return m_client.get();
}
// Do not check that there is a TSS Client instance.
-LDHT::Client* LanguageModelLDHT::getClientUnsafe() const {
- return m_client.get();
+LDHT::Client* LanguageModelLDHT::getClientUnsafe() const
+{
+ return m_client.get();
}
-LDHT::Client* LanguageModelLDHT::initTSSClient() {
- std::ifstream config_file(m_configPath.c_str());
- std::string ldht_config_path;
- getline(config_file, ldht_config_path);
- std::string ldhtlm_config_path;
- getline(config_file, ldhtlm_config_path);
-
- LDHT::FactoryCollection* factory_collection =
- LDHT::FactoryCollection::createDefaultFactoryCollection();
-
- LDHT::Client* client;
- //client = new LDHT::ClientLocal();
- client = new LDHT::Client();
- client->fromXmlFiles(*factory_collection,
- ldht_config_path,
- ldhtlm_config_path);
- return client;
+LDHT::Client* LanguageModelLDHT::initTSSClient()
+{
+ std::ifstream config_file(m_configPath.c_str());
+ std::string ldht_config_path;
+ getline(config_file, ldht_config_path);
+ std::string ldhtlm_config_path;
+ getline(config_file, ldhtlm_config_path);
+
+ LDHT::FactoryCollection* factory_collection =
+ LDHT::FactoryCollection::createDefaultFactoryCollection();
+
+ LDHT::Client* client;
+ //client = new LDHT::ClientLocal();
+ client = new LDHT::Client();
+ client->fromXmlFiles(*factory_collection,
+ ldht_config_path,
+ ldhtlm_config_path);
+ return client;
}
-void LanguageModelLDHT::InitializeForInput(InputType const& source) {
- getClientSafe()->clearCache();
- m_start_tick = LDHT::Util::rdtsc();
+void LanguageModelLDHT::InitializeForInput(InputType const& source)
+{
+ getClientSafe()->clearCache();
+ m_start_tick = LDHT::Util::rdtsc();
}
-void LanguageModelLDHT::CleanUpAfterSentenceProcessing(const InputType &source) {
- LDHT::Client* client = getClientSafe();
-
- std::cerr << "LDHT sentence stats:" << std::endl;
- std::cerr << " ngrams submitted: " << client->getNumNgramsSubmitted() << std::endl
- << " ngrams requested: " << client->getNumNgramsRequested() << std::endl
- << " ngrams not found: " << client->getKeyNotFoundCount() << std::endl
- << " cache hits: " << client->getCacheHitCount() << std::endl
- << " inferences: " << client->getInferenceCount() << std::endl
- << " pcnt latency: " << (float)client->getLatencyTicks() / (float)(LDHT::Util::rdtsc() - m_start_tick) * 100.0 << std::endl;
- m_start_tick = 0;
- client->resetLatencyTicks();
- client->resetNumNgramsSubmitted();
- client->resetNumNgramsRequested();
- client->resetInferenceCount();
- client->resetCacheHitCount();
- client->resetKeyNotFoundCount();
+void LanguageModelLDHT::CleanUpAfterSentenceProcessing(const InputType &source)
+{
+ LDHT::Client* client = getClientSafe();
+
+ std::cerr << "LDHT sentence stats:" << std::endl;
+ std::cerr << " ngrams submitted: " << client->getNumNgramsSubmitted() << std::endl
+ << " ngrams requested: " << client->getNumNgramsRequested() << std::endl
+ << " ngrams not found: " << client->getKeyNotFoundCount() << std::endl
+ << " cache hits: " << client->getCacheHitCount() << std::endl
+ << " inferences: " << client->getInferenceCount() << std::endl
+ << " pcnt latency: " << (float)client->getLatencyTicks() / (float)(LDHT::Util::rdtsc() - m_start_tick) * 100.0 << std::endl;
+ m_start_tick = 0;
+ client->resetLatencyTicks();
+ client->resetNumNgramsSubmitted();
+ client->resetNumNgramsRequested();
+ client->resetInferenceCount();
+ client->resetCacheHitCount();
+ client->resetKeyNotFoundCount();
}
const FFState* LanguageModelLDHT::EmptyHypothesisState(
- const InputType& input) const {
- return NULL;
+ const InputType& input) const
+{
+ return NULL;
}
-bool LanguageModelLDHT::Useable(const Phrase& phrase) const {
- return (phrase.GetSize() > 0 && phrase.GetFactor(0, m_factorType) != NULL);
+bool LanguageModelLDHT::Useable(const Phrase& phrase) const
+{
+ return (phrase.GetSize() > 0 && phrase.GetFactor(0, m_factorType) != NULL);
}
void LanguageModelLDHT::CalcScore(const Phrase& phrase,
float& fullScore,
float& ngramScore,
- std::size_t& oovCount) const {
- const_cast<LanguageModelLDHT*>(this)->m_calc_score_count++;
- if (m_calc_score_count > 10000) {
- const_cast<LanguageModelLDHT*>(this)->m_calc_score_count = 0;
- const_cast<LanguageModelLDHT*>(this)->sync();
- }
+ std::size_t& oovCount) const
+{
+ const_cast<LanguageModelLDHT*>(this)->m_calc_score_count++;
+ if (m_calc_score_count > 10000) {
+ const_cast<LanguageModelLDHT*>(this)->m_calc_score_count = 0;
+ const_cast<LanguageModelLDHT*>(this)->sync();
+ }
- // TODO(wilson): handle nonterminal words.
- LDHT::Client* client = getClientUnsafe();
- // Score the first order - 1 words of the phrase.
- int order = LDHT::NewNgram::k_max_order;
- int prefix_start = 0;
- int prefix_end = std::min(phrase.GetSize(), static_cast<size_t>(order - 1));
- LDHT::NewNgram ngram;
- for (int word_idx = prefix_start; word_idx < prefix_end; ++word_idx) {
- ngram.appendGram(phrase.GetWord(word_idx)
- .GetFactor(m_factorType)->GetString().c_str());
- client->requestNgram(ngram);
- }
- // Now score all subsequent ngrams to end of phrase.
- int internal_start = prefix_end;
- int internal_end = phrase.GetSize();
- for (int word_idx = internal_start; word_idx < internal_end; ++word_idx) {
- ngram.appendGram(phrase.GetWord(word_idx)
- .GetFactor(m_factorType)->GetString().c_str());
- client->requestNgram(ngram);
- }
+ // TODO(wilson): handle nonterminal words.
+ LDHT::Client* client = getClientUnsafe();
+ // Score the first order - 1 words of the phrase.
+ int order = LDHT::NewNgram::k_max_order;
+ int prefix_start = 0;
+ int prefix_end = std::min(phrase.GetSize(), static_cast<size_t>(order - 1));
+ LDHT::NewNgram ngram;
+ for (int word_idx = prefix_start; word_idx < prefix_end; ++word_idx) {
+ ngram.appendGram(phrase.GetWord(word_idx)
+ .GetFactor(m_factorType)->GetString().c_str());
+ client->requestNgram(ngram);
+ }
+ // Now score all subsequent ngrams to end of phrase.
+ int internal_start = prefix_end;
+ int internal_end = phrase.GetSize();
+ for (int word_idx = internal_start; word_idx < internal_end; ++word_idx) {
+ ngram.appendGram(phrase.GetWord(word_idx)
+ .GetFactor(m_factorType)->GetString().c_str());
+ client->requestNgram(ngram);
+ }
- fullScore = 0;
- ngramScore = 0;
- oovCount = 0;
+ fullScore = 0;
+ ngramScore = 0;
+ oovCount = 0;
}
void LanguageModelLDHT::CalcScoreFromCache(const Phrase& phrase,
- float& fullScore,
- float& ngramScore,
- std::size_t& oovCount) const {
- // Issue requests for phrase internal ngrams.
- // Sync if necessary. (or autosync).
- const_cast<LanguageModelLDHT*>(this)->sync();
-
- // TODO(wilson): handle nonterminal words.
- LDHT::Client* client = getClientUnsafe();
- // Score the first order - 1 words of the phrase.
- int order = LDHT::NewNgram::k_max_order;
- int prefix_start = 0;
- int prefix_end = std::min(phrase.GetSize(), static_cast<size_t>(order - 1));
- LDHT::NewNgram ngram;
- std::deque<int> full_score_tags;
- for (int word_idx = prefix_start; word_idx < prefix_end; ++word_idx) {
- ngram.appendGram(phrase.GetWord(word_idx)
- .GetFactor(m_factorType)->GetString().c_str());
- full_score_tags.push_back(client->requestNgram(ngram));
- }
- // Now score all subsequent ngrams to end of phrase.
- int internal_start = prefix_end;
- int internal_end = phrase.GetSize();
- std::deque<int> internal_score_tags;
- for (int word_idx = internal_start; word_idx < internal_end; ++word_idx) {
- ngram.appendGram(phrase.GetWord(word_idx)
- .GetFactor(m_factorType)->GetString().c_str());
- internal_score_tags.push_back(client->requestNgram(ngram));
- }
+ float& fullScore,
+ float& ngramScore,
+ std::size_t& oovCount) const
+{
+ // Issue requests for phrase internal ngrams.
+ // Sync if necessary. (or autosync).
+ const_cast<LanguageModelLDHT*>(this)->sync();
+
+ // TODO(wilson): handle nonterminal words.
+ LDHT::Client* client = getClientUnsafe();
+ // Score the first order - 1 words of the phrase.
+ int order = LDHT::NewNgram::k_max_order;
+ int prefix_start = 0;
+ int prefix_end = std::min(phrase.GetSize(), static_cast<size_t>(order - 1));
+ LDHT::NewNgram ngram;
+ std::deque<int> full_score_tags;
+ for (int word_idx = prefix_start; word_idx < prefix_end; ++word_idx) {
+ ngram.appendGram(phrase.GetWord(word_idx)
+ .GetFactor(m_factorType)->GetString().c_str());
+ full_score_tags.push_back(client->requestNgram(ngram));
+ }
+ // Now score all subsequent ngrams to end of phrase.
+ int internal_start = prefix_end;
+ int internal_end = phrase.GetSize();
+ std::deque<int> internal_score_tags;
+ for (int word_idx = internal_start; word_idx < internal_end; ++word_idx) {
+ ngram.appendGram(phrase.GetWord(word_idx)
+ .GetFactor(m_factorType)->GetString().c_str());
+ internal_score_tags.push_back(client->requestNgram(ngram));
+ }
- // Wait for resposes from the servers.
- //client->awaitResponses();
+ // Wait for resposes from the servers.
+ //client->awaitResponses();
- // Calculate the full phrase score, and the internal score.
- fullScore = 0.0;
- while (!full_score_tags.empty()) {
- fullScore += client->getNgramScore(full_score_tags.front());
- full_score_tags.pop_front();
- }
- ngramScore = 0.0;
- while (!internal_score_tags.empty()) {
- float score = client->getNgramScore(internal_score_tags.front());
- internal_score_tags.pop_front();
- fullScore += score;
- ngramScore += score;
- }
- fullScore = TransformLMScore(fullScore);
- ngramScore = TransformLMScore(ngramScore);
- oovCount = 0;
+ // Calculate the full phrase score, and the internal score.
+ fullScore = 0.0;
+ while (!full_score_tags.empty()) {
+ fullScore += client->getNgramScore(full_score_tags.front());
+ full_score_tags.pop_front();
+ }
+ ngramScore = 0.0;
+ while (!internal_score_tags.empty()) {
+ float score = client->getNgramScore(internal_score_tags.front());
+ internal_score_tags.pop_front();
+ fullScore += score;
+ ngramScore += score;
+ }
+ fullScore = TransformLMScore(fullScore);
+ ngramScore = TransformLMScore(ngramScore);
+ oovCount = 0;
}
void LanguageModelLDHT::IssueRequestsFor(Hypothesis& hypo,
- const FFState* input_state) {
- // TODO(wilson): handle nonterminal words.
- LDHT::Client* client = getClientUnsafe();
-
- // Create a new state and copy the contents of the input_state if
- // supplied.
- LDHTLMState* new_state = new LDHTLMState();
- if (input_state == NULL) {
- if (hypo.GetCurrTargetWordsRange().GetStartPos() != 0) {
- V("got a null state but not at start of sentence");
- abort();
- }
- new_state->gram_fingerprints.appendGram(BOS_);
+ const FFState* input_state)
+{
+ // TODO(wilson): handle nonterminal words.
+ LDHT::Client* client = getClientUnsafe();
+
+ // Create a new state and copy the contents of the input_state if
+ // supplied.
+ LDHTLMState* new_state = new LDHTLMState();
+ if (input_state == NULL) {
+ if (hypo.GetCurrTargetWordsRange().GetStartPos() != 0) {
+ V("got a null state but not at start of sentence");
+ abort();
}
- else {
- if (hypo.GetCurrTargetWordsRange().GetStartPos() == 0) {
- V("got a non null state but at start of sentence");
- abort();
- }
- new_state->copyFrom(static_cast<const LDHTLMState&>(*input_state));
+ new_state->gram_fingerprints.appendGram(BOS_);
+ } else {
+ if (hypo.GetCurrTargetWordsRange().GetStartPos() == 0) {
+ V("got a non null state but at start of sentence");
+ abort();
}
+ new_state->copyFrom(static_cast<const LDHTLMState&>(*input_state));
+ }
- // Score ngrams that overlap with the previous phrase.
- int order = LDHT::NewNgram::k_max_order;
- int phrase_start = hypo.GetCurrTargetWordsRange().GetStartPos();
- int phrase_end = hypo.GetCurrTargetWordsRange().GetEndPos() + 1;
- int overlap_start = phrase_start;
- int overlap_end = std::min(phrase_end, phrase_start + order - 1);
- int word_idx = overlap_start;
- LDHT::NewNgram& ngram = new_state->gram_fingerprints;
- for (; word_idx < overlap_end; ++word_idx) {
- ngram.appendGram(
- hypo.GetFactor(word_idx, m_factorType)->GetString().c_str());
- new_state->appendRequestTag(client->requestNgram(ngram));
- }
- // No need to score phrase internal ngrams, but keep track of them
- // in the state (which in this case is the NewNgram containing the
- // hashes of the individual grams).
- for (; word_idx < phrase_end; ++word_idx) {
- ngram.appendGram(
- hypo.GetFactor(word_idx, m_factorType)->GetString().c_str());
- }
- // If this is the last phrase in the sentence, score the last ngram
- // with the end of sentence marker on it.
- if (hypo.IsSourceCompleted()) {
- ngram.appendGram(EOS_);
- //request_tags.push_back(client->requestNgram(ngram));
- new_state->appendRequestTag(client->requestNgram(ngram));
- }
- hypo.SetFFState(m_state_idx, new_state);
+ // Score ngrams that overlap with the previous phrase.
+ int order = LDHT::NewNgram::k_max_order;
+ int phrase_start = hypo.GetCurrTargetWordsRange().GetStartPos();
+ int phrase_end = hypo.GetCurrTargetWordsRange().GetEndPos() + 1;
+ int overlap_start = phrase_start;
+ int overlap_end = std::min(phrase_end, phrase_start + order - 1);
+ int word_idx = overlap_start;
+ LDHT::NewNgram& ngram = new_state->gram_fingerprints;
+ for (; word_idx < overlap_end; ++word_idx) {
+ ngram.appendGram(
+ hypo.GetFactor(word_idx, m_factorType)->GetString().c_str());
+ new_state->appendRequestTag(client->requestNgram(ngram));
+ }
+ // No need to score phrase internal ngrams, but keep track of them
+ // in the state (which in this case is the NewNgram containing the
+ // hashes of the individual grams).
+ for (; word_idx < phrase_end; ++word_idx) {
+ ngram.appendGram(
+ hypo.GetFactor(word_idx, m_factorType)->GetString().c_str());
+ }
+ // If this is the last phrase in the sentence, score the last ngram
+ // with the end of sentence marker on it.
+ if (hypo.IsSourceCompleted()) {
+ ngram.appendGram(EOS_);
+ //request_tags.push_back(client->requestNgram(ngram));
+ new_state->appendRequestTag(client->requestNgram(ngram));
+ }
+ hypo.SetFFState(m_state_idx, new_state);
}
-void LanguageModelLDHT::sync() {
- m_calc_score_count = 0;
- getClientUnsafe()->awaitResponses();
+void LanguageModelLDHT::sync()
+{
+ m_calc_score_count = 0;
+ getClientUnsafe()->awaitResponses();
}
-void LanguageModelLDHT::SetFFStateIdx(int state_idx) {
- m_state_idx = state_idx;
+void LanguageModelLDHT::SetFFStateIdx(int state_idx)
+{
+ m_state_idx = state_idx;
}
FFState* LanguageModelLDHT::Evaluate(
- const Hypothesis& hypo,
- const FFState* input_state_ignored,
- ScoreComponentCollection* score_output) const {
- // Input state is the state from the previous hypothesis, which
- // we are not interested in. The requests for this hypo should
- // already have been issued via IssueRequestsFor() and the LM then
- // synced and all responses processed, and the tags placed in our
- // FFState of hypo.
- LDHTLMState* state = const_cast<LDHTLMState*>(static_cast<const LDHTLMState*>(hypo.GetFFState(m_state_idx)));
-
- float score = calcScoreFromState(state);
- score = FloorScore(TransformLMScore(score));
- score_output->PlusEquals(this, score);
-
- return state;
+ const Hypothesis& hypo,
+ const FFState* input_state_ignored,
+ ScoreComponentCollection* score_output) const
+{
+ // Input state is the state from the previous hypothesis, which
+ // we are not interested in. The requests for this hypo should
+ // already have been issued via IssueRequestsFor() and the LM then
+ // synced and all responses processed, and the tags placed in our
+ // FFState of hypo.
+ LDHTLMState* state = const_cast<LDHTLMState*>(static_cast<const LDHTLMState*>(hypo.GetFFState(m_state_idx)));
+
+ float score = calcScoreFromState(state);
+ score = FloorScore(TransformLMScore(score));
+ score_output->PlusEquals(this, score);
+
+ return state;
}
FFState* LanguageModelLDHT::EvaluateChart(
- const ChartHypothesis& hypo,
- int featureID,
- ScoreComponentCollection* accumulator) const {
- return NULL;
+ const ChartHypothesis& hypo,
+ int featureID,
+ ScoreComponentCollection* accumulator) const
+{
+ return NULL;
}
-float LanguageModelLDHT::calcScoreFromState(LDHTLMState* state) const {
- float score = 0.0;
- std::vector<int>::iterator tag_iter;
- LDHT::Client* client = getClientUnsafe();
- for (tag_iter = state->requestTagsBegin();
- tag_iter != state->requestTagsEnd();
- ++tag_iter) {
- score += client->getNgramScore(*tag_iter);
- }
- state->clearRequestTags();
- state->setFinalised();
- return score;
+float LanguageModelLDHT::calcScoreFromState(LDHTLMState* state) const
+{
+ float score = 0.0;
+ std::vector<int>::iterator tag_iter;
+ LDHT::Client* client = getClientUnsafe();
+ for (tag_iter = state->requestTagsBegin();
+ tag_iter != state->requestTagsEnd();
+ ++tag_iter) {
+ score += client->getNgramScore(*tag_iter);
+ }
+ state->clearRequestTags();
+ state->setFinalised();
+ return score;
}
} // namespace Moses.
diff --git a/moses/LM/LDHT.h b/moses/LM/LDHT.h
index a8489c0e3..8c5c3c36b 100644
--- a/moses/LM/LDHT.h
+++ b/moses/LM/LDHT.h
@@ -7,7 +7,8 @@
#include "moses/TypeDef.h"
-namespace Moses {
+namespace Moses
+{
class ScoreIndexManager;
class LanguageModel;
diff --git a/moses/LM/MultiFactor.h b/moses/LM/MultiFactor.h
index 491da4abe..21a9d493b 100644
--- a/moses/LM/MultiFactor.h
+++ b/moses/LM/MultiFactor.h
@@ -33,7 +33,7 @@ namespace Moses
class Phrase;
-/* Abstract class for for multi factor LM. Only inherited by the JointLM at the moment.
+/* Abstract class for for multi factor LM. Only inherited by the JointLM at the moment.
* Could use this when factored LM are implemented
*/
class LanguageModelMultiFactor : public LanguageModelImplementation
@@ -41,16 +41,16 @@ class LanguageModelMultiFactor : public LanguageModelImplementation
protected:
FactorMask m_factorTypes;
- LanguageModelMultiFactor(const std::string& description, const std::string &line)
- :LanguageModelImplementation(description, line)
+ LanguageModelMultiFactor(const std::string& description, const std::string &line)
+ :LanguageModelImplementation(description, line)
{}
-
+
public:
virtual bool Load(const std::string &filePath
, const std::vector<FactorType> &factorTypes
, size_t nGramOrder) = 0;
- bool Useable(const Phrase &phrase) const;
+ bool Useable(const Phrase &phrase) const;
};
}
diff --git a/moses/LM/ORLM.cpp b/moses/LM/ORLM.cpp
index 226267ee2..44fd64efb 100644
--- a/moses/LM/ORLM.cpp
+++ b/moses/LM/ORLM.cpp
@@ -9,10 +9,11 @@
#include "ORLM.h"
using std::map;
-namespace Moses
+namespace Moses
+{
+bool LanguageModelORLM::Load(const std::string &filePath, FactorType factorType,
+ size_t nGramOrder)
{
-bool LanguageModelORLM::Load(const std::string &filePath, FactorType factorType,
- size_t nGramOrder) {
cerr << "Loading LanguageModelORLM..." << endl;
m_filePath = filePath;
m_factorType = factorType;
@@ -26,13 +27,14 @@ bool LanguageModelORLM::Load(const std::string &filePath, FactorType factorType,
CreateFactors();
return true;
}
-void LanguageModelORLM::CreateFactors() {
+void LanguageModelORLM::CreateFactors()
+{
FactorCollection &factorCollection = FactorCollection::Instance();
size_t maxFactorId = 0; // to create lookup vector later on
std::map<size_t, wordID_t> m_lmids_map; // map from factor id -> word id
for(std::map<Word, wordID_t>::const_iterator vIter = m_lm->vocab_->VocabStart();
- vIter != m_lm->vocab_->VocabEnd(); vIter++){
+ vIter != m_lm->vocab_->VocabEnd(); vIter++) {
// get word from ORLM vocab and associate with (new) factor id
size_t factorId = factorCollection.AddFactor(Output,m_factorType,vIter->first.ToString())->GetId();
m_lmids_map[factorId] = vIter->second;
@@ -50,7 +52,7 @@ void LanguageModelORLM::CreateFactors() {
maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
m_sentenceEndWord[m_factorType] = m_sentenceEnd;
// add to lookup vector in object
- lm_ids_vec_.resize(maxFactorId+1);
+ lm_ids_vec_.resize(maxFactorId+1);
// fill with OOV code
fill(lm_ids_vec_.begin(), lm_ids_vec_.end(), m_oov_id);
@@ -58,15 +60,18 @@ void LanguageModelORLM::CreateFactors() {
iter != m_lmids_map.end() ; ++iter)
lm_ids_vec_[iter->first] = iter->second;
}
-wordID_t LanguageModelORLM::GetLmID(const std::string& str) const {
+wordID_t LanguageModelORLM::GetLmID(const std::string& str) const
+{
return m_lm->vocab_->GetWordID(str);
}
-wordID_t LanguageModelORLM::GetLmID(const Factor* factor) const {
+wordID_t LanguageModelORLM::GetLmID(const Factor* factor) const
+{
size_t factorId = factor->GetId();
return (factorId >= lm_ids_vec_.size()) ? m_oov_id : lm_ids_vec_[factorId];
}
-LMResult LanguageModelORLM::GetValue(const std::vector<const Word*> &contextFactor,
- State* finalState) const {
+LMResult LanguageModelORLM::GetValue(const std::vector<const Word*> &contextFactor,
+ State* finalState) const
+{
FactorType factorType = GetFactorType();
// set up context
//std::vector<long unsigned int> factor(1,0);
@@ -88,13 +93,14 @@ LMResult LanguageModelORLM::GetValue(const std::vector<const Word*> &contextFact
*/
return ret;
}
-bool LanguageModelORLM::UpdateORLM(const std::vector<string>& ngram, const int value) {
+bool LanguageModelORLM::UpdateORLM(const std::vector<string>& ngram, const int value)
+{
/*cerr << "Inserting into ORLM: \"";
iterate(ngram, nit)
cerr << *nit << " ";
cerr << "\"\t" << value << endl; */
m_lm->vocab_->MakeOpen();
- bool res = m_lm->update(ngram, value);
+ bool res = m_lm->update(ngram, value);
m_lm->vocab_->MakeClosed();
return res;
}
diff --git a/moses/LM/ORLM.h b/moses/LM/ORLM.h
index 48909191e..d7a8b5d35 100644
--- a/moses/LM/ORLM.h
+++ b/moses/LM/ORLM.h
@@ -17,7 +17,8 @@ class Phrase;
/** @todo ask ollie
*/
-class LanguageModelORLM : public LanguageModelSingleFactor {
+class LanguageModelORLM : public LanguageModelSingleFactor
+{
public:
typedef count_t T; // type for ORLM filter
LanguageModelORLM(const std::string &line)
@@ -34,10 +35,12 @@ public:
fout.close();
delete m_lm;
}
- void CleanUpAfterSentenceProcessing() {m_lm->clearCache();} // clear caches
+ void CleanUpAfterSentenceProcessing() {
+ m_lm->clearCache(); // clear caches
+ }
bool UpdateORLM(const std::vector<string>& ngram, const int value);
- protected:
+protected:
OnlineRLM<T>* m_lm;
//MultiOnlineRLM<T>* m_lm;
wordID_t m_oov_id;
diff --git a/moses/LM/ParallelBackoff.cpp b/moses/LM/ParallelBackoff.cpp
index cf8c1509b..0b996de2b 100644
--- a/moses/LM/ParallelBackoff.cpp
+++ b/moses/LM/ParallelBackoff.cpp
@@ -70,7 +70,7 @@ private:
public:
LanguageModelParallelBackoff(const std::string &line)
- :LanguageModelMultiFactor("ParallelBackoffLM", line)
+ :LanguageModelMultiFactor("ParallelBackoffLM", line)
{}
~LanguageModelParallelBackoff();
diff --git a/moses/LM/Rand.cpp b/moses/LM/Rand.cpp
index 8e3e37a1f..5e31029d5 100644
--- a/moses/LM/Rand.cpp
+++ b/moses/LM/Rand.cpp
@@ -37,7 +37,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
namespace Moses
{
-namespace
+namespace
{
using namespace std;
@@ -45,8 +45,8 @@ class LanguageModelRandLM : public LanguageModelSingleFactor
{
public:
LanguageModelRandLM(const std::string &line)
- :LanguageModelSingleFactor("RandLM", line)
- , m_lm(0)
+ :LanguageModelSingleFactor("RandLM", line)
+ , m_lm(0)
{}
bool Load(const std::string &filePath, FactorType factorType, size_t nGramOrder);
virtual LMResult GetValue(const std::vector<const Word*> &contextFactor, State* finalState = NULL) const;
@@ -133,7 +133,7 @@ randlm::WordID LanguageModelRandLM::GetLmID( const std::string &str ) const
}
LMResult LanguageModelRandLM::GetValue(const vector<const Word*> &contextFactor,
- State* finalState) const
+ State* finalState) const
{
FactorType factorType = GetFactorType();
// set up context
diff --git a/moses/LM/SRI.cpp b/moses/LM/SRI.cpp
index b6281512c..54e6f93b9 100644
--- a/moses/LM/SRI.cpp
+++ b/moses/LM/SRI.cpp
@@ -39,29 +39,26 @@ using namespace std;
namespace Moses
{
LanguageModelSRI::LanguageModelSRI(const std::string &line)
-:LanguageModelSingleFactor("SRILM", line)
-,m_srilmVocab(0)
-,m_srilmModel(0)
+ :LanguageModelSingleFactor("SRILM", line)
+ ,m_srilmVocab(0)
+ ,m_srilmModel(0)
{
FactorType factorType;
size_t nGramOrder;
string filePath;
for (size_t i = 0; i < m_args.size(); ++i) {
- const vector<string> &args = m_args[i];
-
- if (args[0] == "factor") {
- factorType = Scan<FactorType>(args[1]);
- }
- else if (args[0] == "order") {
- nGramOrder = Scan<size_t>(args[1]);
- }
- else if (args[0] == "path") {
- filePath = args[1];
- }
- else {
- throw "Unknown argument " + args[0];
- }
+ const vector<string> &args = m_args[i];
+
+ if (args[0] == "factor") {
+ factorType = Scan<FactorType>(args[1]);
+ } else if (args[0] == "order") {
+ nGramOrder = Scan<size_t>(args[1]);
+ } else if (args[0] == "path") {
+ filePath = args[1];
+ } else {
+ throw "Unknown argument " + args[0];
+ }
}
Load(filePath, factorType, nGramOrder);
diff --git a/moses/LM/SingleFactor.cpp b/moses/LM/SingleFactor.cpp
index 031fa38ac..abd8aca51 100644
--- a/moses/LM/SingleFactor.cpp
+++ b/moses/LM/SingleFactor.cpp
@@ -38,7 +38,7 @@ namespace Moses
{
LanguageModelSingleFactor::LanguageModelSingleFactor(const std::string& description, const std::string &line)
-:LanguageModelImplementation(description, line)
+ :LanguageModelImplementation(description, line)
{
m_nullContextState = new PointerState(NULL);
m_beginSentenceState = new PointerState(NULL);
diff --git a/moses/LM/SingleFactor.h b/moses/LM/SingleFactor.h
index cb51808ac..9a1f30216 100644
--- a/moses/LM/SingleFactor.h
+++ b/moses/LM/SingleFactor.h
@@ -43,31 +43,27 @@ protected:
FFState *m_nullContextState;
FFState *m_beginSentenceState;
- LanguageModelSingleFactor(const std::string& description, const std::string &line);
+ LanguageModelSingleFactor(const std::string& description, const std::string &line);
public:
- virtual ~LanguageModelSingleFactor();
- virtual bool Load(const std::string &filePath
- , FactorType factorType
- , size_t nGramOrder) = 0;
-
- bool Useable(const Phrase &phrase) const
- {
- return (phrase.GetSize()>0 && phrase.GetFactor(0, m_factorType) != NULL);
- }
-
- const Factor *GetSentenceStart() const
- {
- return m_sentenceStart;
- }
- const Factor *GetSentenceEnd() const
- {
- return m_sentenceEnd;
- }
- FactorType GetFactorType() const
- {
- return m_factorType;
- }
+ virtual ~LanguageModelSingleFactor();
+ virtual bool Load(const std::string &filePath
+ , FactorType factorType
+ , size_t nGramOrder) = 0;
+
+ bool Useable(const Phrase &phrase) const {
+ return (phrase.GetSize()>0 && phrase.GetFactor(0, m_factorType) != NULL);
+ }
+
+ const Factor *GetSentenceStart() const {
+ return m_sentenceStart;
+ }
+ const Factor *GetSentenceEnd() const {
+ return m_sentenceEnd;
+ }
+ FactorType GetFactorType() const {
+ return m_factorType;
+ }
virtual const FFState *GetNullContextState() const;
virtual const FFState *GetBeginSentenceState() const;
diff --git a/moses/LexicalReordering.cpp b/moses/LexicalReordering.cpp
index 71c8fb2b8..98dca7b5f 100644
--- a/moses/LexicalReordering.cpp
+++ b/moses/LexicalReordering.cpp
@@ -10,7 +10,7 @@ using namespace std;
namespace Moses
{
LexicalReordering::LexicalReordering(const std::string &line)
-: StatefulFeatureFunction("LexicalReordering", line)
+ : StatefulFeatureFunction("LexicalReordering", line)
{
std::cerr << "Initializing LexicalReordering.." << std::endl;
@@ -24,41 +24,37 @@ LexicalReordering::LexicalReordering(const std::string &line)
m_configuration = new LexicalReorderingConfiguration(args[1]);
m_configuration->SetScoreProducer(this);
m_modelTypeString = m_configuration->GetModelString();
- }
- else if (args[0] == "input-factor") {
+ } else if (args[0] == "input-factor") {
f_factors =Tokenize<FactorType>(args[1]);
- }
- else if (args[0] == "output-factor") {
+ } else if (args[0] == "output-factor") {
e_factors =Tokenize<FactorType>(args[1]);
- }
- else if (args[0] == "path") {
+ } else if (args[0] == "path") {
filePath = args[1];
- }
- else {
+ } else {
throw "Unknown argument " + args[0];
}
}
switch(m_configuration->GetCondition()) {
- case LexicalReorderingConfiguration::FE:
- case LexicalReorderingConfiguration::E:
- m_factorsE = e_factors;
- if(m_factorsE.empty()) {
- UserMessage::Add("TL factor mask for lexical reordering is unexpectedly empty");
- exit(1);
- }
- if(m_configuration->GetCondition() == LexicalReorderingConfiguration::E)
- break; // else fall through
- case LexicalReorderingConfiguration::F:
- m_factorsF = f_factors;
- if(m_factorsF.empty()) {
- UserMessage::Add("SL factor mask for lexical reordering is unexpectedly empty");
- exit(1);
- }
- break;
- default:
- UserMessage::Add("Unknown conditioning option!");
+ case LexicalReorderingConfiguration::FE:
+ case LexicalReorderingConfiguration::E:
+ m_factorsE = e_factors;
+ if(m_factorsE.empty()) {
+ UserMessage::Add("TL factor mask for lexical reordering is unexpectedly empty");
exit(1);
+ }
+ if(m_configuration->GetCondition() == LexicalReorderingConfiguration::E)
+ break; // else fall through
+ case LexicalReorderingConfiguration::F:
+ m_factorsF = f_factors;
+ if(m_factorsF.empty()) {
+ UserMessage::Add("SL factor mask for lexical reordering is unexpectedly empty");
+ exit(1);
+ }
+ break;
+ default:
+ UserMessage::Add("Unknown conditioning option!");
+ exit(1);
}
m_table = LexicalReorderingTable::LoadAvailable(filePath, m_factorsF, m_factorsE, std::vector<FactorType>());
diff --git a/moses/LexicalReordering.h b/moses/LexicalReordering.h
index 51cf797f0..abaa31c25 100644
--- a/moses/LexicalReordering.h
+++ b/moses/LexicalReordering.h
@@ -24,15 +24,16 @@ class InputType;
/** implementation of lexical reordering (Tilman ...) for phrase-based decoding
*/
-class LexicalReordering : public StatefulFeatureFunction {
-public:
+class LexicalReordering : public StatefulFeatureFunction
+{
+public:
LexicalReordering(const std::string &line);
virtual ~LexicalReordering();
virtual const FFState* EmptyHypothesisState(const InputType &input) const;
- void InitializeForInput(const InputType& i){
- m_table->InitializeForInput(i);
+ void InitializeForInput(const InputType& i) {
+ m_table->InitializeForInput(i);
}
Scores GetProb(const Phrase& f, const Phrase& e) const;
@@ -43,25 +44,25 @@ public:
virtual FFState* EvaluateChart(const ChartHypothesis&,
int /* featureID */,
- ScoreComponentCollection*) const {
- CHECK(0); // not valid for chart decoder
- return NULL;
- }
+ ScoreComponentCollection*) const {
+ CHECK(0); // not valid for chart decoder
+ return NULL;
+ }
private:
- bool DecodeCondition(std::string s);
- bool DecodeDirection(std::string s);
- bool DecodeNumFeatureFunctions(std::string s);
+ bool DecodeCondition(std::string s);
+ bool DecodeDirection(std::string s);
+ bool DecodeNumFeatureFunctions(std::string s);
- LexicalReorderingConfiguration *m_configuration;
- std::string m_modelTypeString;
- std::vector<std::string> m_modelType;
- LexicalReorderingTable* m_table;
- //std::vector<Direction> m_direction;
- std::vector<LexicalReorderingConfiguration::Condition> m_condition;
- //std::vector<size_t> m_scoreOffset;
- //bool m_oneScorePerDirection;
- std::vector<FactorType> m_factorsE, m_factorsF;
+ LexicalReorderingConfiguration *m_configuration;
+ std::string m_modelTypeString;
+ std::vector<std::string> m_modelType;
+ LexicalReorderingTable* m_table;
+ //std::vector<Direction> m_direction;
+ std::vector<LexicalReorderingConfiguration::Condition> m_condition;
+ //std::vector<size_t> m_scoreOffset;
+ //bool m_oneScorePerDirection;
+ std::vector<FactorType> m_factorsE, m_factorsF;
};
}
diff --git a/moses/LexicalReorderingState.cpp b/moses/LexicalReorderingState.cpp
index ddb089055..3165e447f 100644
--- a/moses/LexicalReorderingState.cpp
+++ b/moses/LexicalReorderingState.cpp
@@ -212,7 +212,7 @@ LexicalReorderingState* PhraseBasedReorderingState::Expand(const TranslationOpti
if (m_direction == LexicalReorderingConfiguration::Forward && m_first) {
ClearScores(scores);
} else {
- if (!m_first || m_useFirstBackwardScore){
+ if (!m_first || m_useFirstBackwardScore) {
if (modelType == LexicalReorderingConfiguration::MSD) {
reoType = GetOrientationTypeMSD(currWordsRange);
} else if (modelType == LexicalReorderingConfiguration::MSLR) {
diff --git a/moses/LexicalReorderingTable.cpp b/moses/LexicalReorderingTable.cpp
index c0da31402..65ba66047 100644
--- a/moses/LexicalReorderingTable.cpp
+++ b/moses/LexicalReorderingTable.cpp
@@ -9,7 +9,7 @@
#include "TargetPhraseCollection.h"
#ifndef WIN32
-#include "TranslationModel/CompactPT/LexicalReorderingTableCompact.h"
+#include "TranslationModel/CompactPT/LexicalReorderingTableCompact.h"
#endif
namespace Moses
diff --git a/moses/Manager.cpp b/moses/Manager.cpp
index b8e958d04..76809f224 100644
--- a/moses/Manager.cpp
+++ b/moses/Manager.cpp
@@ -80,7 +80,7 @@ void Manager::ProcessSentence()
{
// reset statistics
ResetSentenceStats(m_source);
-
+
Timer getOptionsTime;
getOptionsTime.start();
m_transOptColl->CreateTranslationOptions();
@@ -262,8 +262,9 @@ struct SGNReverseCompare {
/**
* Implements lattice sampling, as in Chatterjee & Cancedda, emnlp 2010
**/
-void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
-
+void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const
+{
+
vector<SearchGraphNode> searchGraph;
GetSearchGraph(searchGraph);
@@ -277,15 +278,15 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
map<int,const Hypothesis*> idToHyp;
map<int,float> fscores;
- //Iterating through the hypos in reverse order of id gives a reverse
- //topological order. We rely on the fact that hypo ids are given out
+ //Iterating through the hypos in reverse order of id gives a reverse
+ //topological order. We rely on the fact that hypo ids are given out
//sequentially, as the search proceeds.
- //NB: Could just sort by stack.
+ //NB: Could just sort by stack.
sort(searchGraph.begin(), searchGraph.end(), SGNReverseCompare());
//first task is to fill in the outgoing hypos and edge scores.
for (vector<SearchGraphNode>::const_iterator i = searchGraph.begin();
- i != searchGraph.end(); ++i) {
+ i != searchGraph.end(); ++i) {
const Hypothesis* hypo = i->hypo;
idToHyp[hypo->GetId()] = hypo;
fscores[hypo->GetId()] = i->fscore;
@@ -293,7 +294,7 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
//back to current
const Hypothesis* prevHypo = i->hypo->GetPrevHypo();
outgoingHyps[prevHypo].insert(hypo);
- edgeScores[Edge(prevHypo->GetId(),hypo->GetId())] =
+ edgeScores[Edge(prevHypo->GetId(),hypo->GetId())] =
hypo->GetScore() - prevHypo->GetScore();
}
//forward from current
@@ -304,7 +305,7 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
outgoingHyps[hypo].insert(nextHypo);
map<int,float>::const_iterator fscoreIter = fscores.find(nextHypo->GetId());
CHECK(fscoreIter != fscores.end());
- edgeScores[Edge(hypo->GetId(),nextHypo->GetId())] =
+ edgeScores[Edge(hypo->GetId(),nextHypo->GetId())] =
i->fscore - fscoreIter->second;
}
}
@@ -312,26 +313,26 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
//then run through again to calculate sigmas
for (vector<SearchGraphNode>::const_iterator i = searchGraph.begin();
- i != searchGraph.end(); ++i) {
+ i != searchGraph.end(); ++i) {
if (i->forward == -1) {
sigmas[i->hypo] = 0;
} else {
- map<const Hypothesis*, set<const Hypothesis*> >::const_iterator outIter =
+ map<const Hypothesis*, set<const Hypothesis*> >::const_iterator outIter =
outgoingHyps.find(i->hypo);
-
+
CHECK(outIter != outgoingHyps.end());
float sigma = 0;
for (set<const Hypothesis*>::const_iterator j = outIter->second.begin();
- j != outIter->second.end(); ++j) {
+ j != outIter->second.end(); ++j) {
map<const Hypothesis*, float>::const_iterator succIter = sigmas.find(*j);
CHECK(succIter != sigmas.end());
- map<Edge,float>::const_iterator edgeScoreIter =
+ map<Edge,float>::const_iterator edgeScoreIter =
edgeScores.find(Edge(i->hypo->GetId(),(*j)->GetId()));
CHECK(edgeScoreIter != edgeScores.end());
float term = edgeScoreIter->second + succIter->second; // Add sigma(*j)
if (sigma == 0) {
- sigma = term;
+ sigma = term;
} else {
sigma = log_sum(sigma,term);
}
@@ -347,7 +348,7 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
vector<const Hypothesis*> path;
path.push_back(startHypo);
while(1) {
- map<const Hypothesis*, set<const Hypothesis*> >::const_iterator outIter =
+ map<const Hypothesis*, set<const Hypothesis*> >::const_iterator outIter =
outgoingHyps.find(path.back());
if (outIter == outgoingHyps.end() || !outIter->second.size()) {
//end of the path
@@ -358,7 +359,7 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
vector<float> candidateScores;
float scoreTotal = 0;
for (set<const Hypothesis*>::const_iterator j = outIter->second.begin();
- j != outIter->second.end(); ++j) {
+ j != outIter->second.end(); ++j) {
candidates.push_back(*j);
CHECK(sigmas.find(*j) != sigmas.end());
Edge edge(path.back()->GetId(),(*j)->GetId());
@@ -385,18 +386,18 @@ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const {
}
//cerr << "Random: " << random << " Chose " << position-1 << endl;
const Hypothesis* chosen = candidates[position-1];
- path.push_back(chosen);
+ path.push_back(chosen);
}
//cerr << "Path: " << endl;
//for (size_t j = 0; j < path.size(); ++j) {
- // cerr << path[j]->GetId() << " " << path[j]->GetScoreBreakdown() << endl;
+ // cerr << path[j]->GetId() << " " << path[j]->GetScoreBreakdown() << endl;
//}
//cerr << endl;
//Convert the hypos to TrellisPath
ret.Add(new TrellisPath(path));
//cerr << ret.at(ret.GetSize()-1).GetScoreBreakdown() << endl;
- }
+ }
}
@@ -680,7 +681,7 @@ void Manager::OutputFeatureValuesForSLF(const Hypothesis* hypo, bool zeros, std:
// outputSearchGraphStream << endl;
// outputSearchGraphStream << (*hypo) << endl;
- // const ScoreComponentCollection& scoreCollection = hypo->GetScoreBreakdown();
+ // const ScoreComponentCollection& scoreCollection = hypo->GetScoreBreakdown();
// outputSearchGraphStream << scoreCollection << endl;
const StaticData& staticData = StaticData::Instance();
@@ -753,10 +754,10 @@ size_t Manager::OutputFeatureWeightsForSLF(size_t index, const FeatureFunction*
if (numScoreComps != 0) {
vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
for (size_t i = 0; i < numScoreComps; ++i) {
- outputSearchGraphStream << "# " << ff->GetScoreProducerDescription()
- << " " << ff->GetScoreProducerDescription()
- << " " << (i+1) << " of " << numScoreComps << endl
- << "x" << (index+i) << "scale=" << values[i] << endl;
+ outputSearchGraphStream << "# " << ff->GetScoreProducerDescription()
+ << " " << ff->GetScoreProducerDescription()
+ << " " << (i+1) << " of " << numScoreComps << endl
+ << "x" << (index+i) << "scale=" << values[i] << endl;
}
return index+numScoreComps;
} else {
@@ -779,28 +780,28 @@ size_t Manager::OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypoth
// // cout << "m_scores.coreSize()==" << m_scores.coreSize() << endl;
// // cout << "m_scores.cbegin() ?= m_scores.cend()\t" << (m_scores.cbegin() == m_scores.cend()) << endl;
-
+
// // for(FVector::FNVmap::const_iterator i = m_scores.cbegin(); i != m_scores.cend(); i++) {
// // std::cout<<prefix << "\t" << (i->first) << "\t" << (i->second) << std::endl;
// // }
// for(int i=0, n=v.size(); i<n; i+=1) {
// // outputSearchGraphStream << prefix << i << "==" << v[i] << std::endl;
-
+
// }
// }
// FVector featureValues = scoreCollection.GetVectorForProducer(ff);
// outputSearchGraphStream << featureValues << endl;
- const ScoreComponentCollection& scoreCollection = hypo->GetScoreBreakdown();
+ const ScoreComponentCollection& scoreCollection = hypo->GetScoreBreakdown();
vector<float> featureValues = scoreCollection.GetScoresForProducer(ff);
size_t numScoreComps = featureValues.size();//featureValues.coreSize();
// if (numScoreComps != ScoreProducer::unlimited) {
- // vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
+ // vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
for (size_t i = 0; i < numScoreComps; ++i) {
outputSearchGraphStream << "x" << (index+i) << "=" << ((zeros) ? 0.0 : featureValues[i]) << " ";
- }
- return index+numScoreComps;
+ }
+ return index+numScoreComps;
// } else {
// cerr << "Sparse features are not supported when outputting HTK standard lattice format" << endl;
// assert(false);
@@ -810,7 +811,7 @@ size_t Manager::OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypoth
size_t Manager::OutputFeatureValuesForHypergraph(size_t index, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const
{
- ScoreComponentCollection scoreCollection = hypo->GetScoreBreakdown();
+ ScoreComponentCollection scoreCollection = hypo->GetScoreBreakdown();
const Hypothesis *prevHypo = hypo->GetPrevHypo();
if (prevHypo) {
scoreCollection.MinusEquals( prevHypo->GetScoreBreakdown() );
@@ -851,60 +852,60 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou
{
long hypergraphHypothesisID = 0;
for (size_t arcNumber = 0, size=searchGraph.size(); arcNumber < size; ++arcNumber) {
-
+
// Get an id number for the previous hypothesis
const Hypothesis *prevHypo = searchGraph[arcNumber].hypo->GetPrevHypo();
if (prevHypo!=NULL) {
- int mosesPrevHypothesisID = prevHypo->GetId();
- if (mosesIDToHypergraphID.count(mosesPrevHypothesisID) == 0) {
- mosesIDToHypergraphID[mosesPrevHypothesisID] = hypergraphHypothesisID;
- // hypergraphIDToMosesID[hypergraphHypothesisID] = mosesPrevHypothesisID;
- hypergraphHypothesisID += 1;
- }
+ int mosesPrevHypothesisID = prevHypo->GetId();
+ if (mosesIDToHypergraphID.count(mosesPrevHypothesisID) == 0) {
+ mosesIDToHypergraphID[mosesPrevHypothesisID] = hypergraphHypothesisID;
+ // hypergraphIDToMosesID[hypergraphHypothesisID] = mosesPrevHypothesisID;
+ hypergraphHypothesisID += 1;
+ }
}
// Get an id number for this hypothesis
int mosesHypothesisID;
if (searchGraph[arcNumber].recombinationHypo) {
- mosesHypothesisID = searchGraph[arcNumber].recombinationHypo->GetId();
+ mosesHypothesisID = searchGraph[arcNumber].recombinationHypo->GetId();
} else {
- mosesHypothesisID = searchGraph[arcNumber].hypo->GetId();
+ mosesHypothesisID = searchGraph[arcNumber].hypo->GetId();
}
if (mosesIDToHypergraphID.count(mosesHypothesisID) == 0) {
-
- mosesIDToHypergraphID[mosesHypothesisID] = hypergraphHypothesisID;
- // hypergraphIDToMosesID[hypergraphHypothesisID] = mosesHypothesisID;
- bool terminalNode = (searchGraph[arcNumber].forward == -1);
- if (terminalNode) {
- // Final arc to end node, representing the end of the sentence </s>
- terminalNodes.insert(hypergraphHypothesisID);
- }
+ mosesIDToHypergraphID[mosesHypothesisID] = hypergraphHypothesisID;
+ // hypergraphIDToMosesID[hypergraphHypothesisID] = mosesHypothesisID;
- hypergraphHypothesisID += 1;
+ bool terminalNode = (searchGraph[arcNumber].forward == -1);
+ if (terminalNode) {
+ // Final arc to end node, representing the end of the sentence </s>
+ terminalNodes.insert(hypergraphHypothesisID);
+ }
+
+ hypergraphHypothesisID += 1;
}
// Record that this arc ends at this node
hypergraphIDToArcs.insert(pair<int,int>(mosesIDToHypergraphID[mosesHypothesisID],arcNumber));
}
-
+
// Unique end node
endNode = hypergraphHypothesisID;
// mosesIDToHypergraphID[hypergraphHypothesisID] = hypergraphHypothesisID;
numNodes = endNode + 1;
}
-
+
long numArcs = searchGraph.size() + terminalNodes.size();
// Print number of nodes and arcs
outputSearchGraphStream << numNodes << " " << numArcs << endl;
- VERBOSE(2,"Search graph to output as hypergraph for sentence " << translationId
- << " contains " << numArcs << " arcs and " << numNodes << " nodes" << std::endl)
+ VERBOSE(2,"Search graph to output as hypergraph for sentence " << translationId
+ << " contains " << numArcs << " arcs and " << numNodes << " nodes" << std::endl)
VERBOSE(2,"Outputting search graph to output as hypergraph for sentence " << translationId << std::endl)
@@ -920,51 +921,51 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou
outputSearchGraphStream << count << "\n";
pair<multimap<int,int>::iterator, multimap<int,int>::iterator> range =
- hypergraphIDToArcs.equal_range(hypergraphHypothesisID);
+ hypergraphIDToArcs.equal_range(hypergraphHypothesisID);
for (multimap<int,int>::iterator it=range.first; it!=range.second; ++it) {
- int lineNumber = (*it).second;
- const Hypothesis *thisHypo = searchGraph[lineNumber].hypo;
- int mosesHypothesisID;// = thisHypo->GetId();
- if (searchGraph[lineNumber].recombinationHypo) {
- mosesHypothesisID = searchGraph[lineNumber].recombinationHypo->GetId();
- } else {
- mosesHypothesisID = searchGraph[lineNumber].hypo->GetId();
- }
- // int actualHypergraphHypothesisID = mosesIDToHypergraphID[mosesHypothesisID];
- UTIL_THROW_IF(
- (hypergraphHypothesisID != mosesIDToHypergraphID[mosesHypothesisID]),
- util::Exception,
- "Error while writing search lattice as hypergraph for sentence " << translationId << ". " <<
- "Moses node " << mosesHypothesisID << " was expected to have hypergraph id " << hypergraphHypothesisID <<
- ", but actually had hypergraph id " << mosesIDToHypergraphID[mosesHypothesisID] <<
- ". There are " << numNodes << " nodes in the search lattice."
- );
-
- const Hypothesis *prevHypo = thisHypo->GetPrevHypo();
- if (prevHypo==NULL) {
- // VERBOSE(2,"Hypergraph node " << hypergraphHypothesisID << " start of sentence" << std::endl)
- outputSearchGraphStream << "<s> ||| \n";
- } else {
- int startNode = mosesIDToHypergraphID[prevHypo->GetId()];
- // VERBOSE(2,"Hypergraph node " << hypergraphHypothesisID << " has parent node " << startNode << std::endl)
- UTIL_THROW_IF(
- (startNode >= hypergraphHypothesisID),
- util::Exception,
- "Error while writing search lattice as hypergraph for sentence" << translationId << ". " <<
- "The nodes must be output in topological order. The code attempted to violate this restriction."
- );
-
- const TargetPhrase &targetPhrase = thisHypo->GetCurrTargetPhrase();
- int targetWordCount = targetPhrase.GetSize();
-
- outputSearchGraphStream << "[" << startNode << "]";
- for (int targetWordIndex=0; targetWordIndex<targetWordCount; targetWordIndex+=1) {
- outputSearchGraphStream << " " << targetPhrase.GetWord(targetWordIndex);
- }
- outputSearchGraphStream << " ||| ";
- OutputFeatureValuesForHypergraph(thisHypo, outputSearchGraphStream);
- outputSearchGraphStream << "\n";
- }
+ int lineNumber = (*it).second;
+ const Hypothesis *thisHypo = searchGraph[lineNumber].hypo;
+ int mosesHypothesisID;// = thisHypo->GetId();
+ if (searchGraph[lineNumber].recombinationHypo) {
+ mosesHypothesisID = searchGraph[lineNumber].recombinationHypo->GetId();
+ } else {
+ mosesHypothesisID = searchGraph[lineNumber].hypo->GetId();
+ }
+ // int actualHypergraphHypothesisID = mosesIDToHypergraphID[mosesHypothesisID];
+ UTIL_THROW_IF(
+ (hypergraphHypothesisID != mosesIDToHypergraphID[mosesHypothesisID]),
+ util::Exception,
+ "Error while writing search lattice as hypergraph for sentence " << translationId << ". " <<
+ "Moses node " << mosesHypothesisID << " was expected to have hypergraph id " << hypergraphHypothesisID <<
+ ", but actually had hypergraph id " << mosesIDToHypergraphID[mosesHypothesisID] <<
+ ". There are " << numNodes << " nodes in the search lattice."
+ );
+
+ const Hypothesis *prevHypo = thisHypo->GetPrevHypo();
+ if (prevHypo==NULL) {
+ // VERBOSE(2,"Hypergraph node " << hypergraphHypothesisID << " start of sentence" << std::endl)
+ outputSearchGraphStream << "<s> ||| \n";
+ } else {
+ int startNode = mosesIDToHypergraphID[prevHypo->GetId()];
+ // VERBOSE(2,"Hypergraph node " << hypergraphHypothesisID << " has parent node " << startNode << std::endl)
+ UTIL_THROW_IF(
+ (startNode >= hypergraphHypothesisID),
+ util::Exception,
+ "Error while writing search lattice as hypergraph for sentence" << translationId << ". " <<
+ "The nodes must be output in topological order. The code attempted to violate this restriction."
+ );
+
+ const TargetPhrase &targetPhrase = thisHypo->GetCurrTargetPhrase();
+ int targetWordCount = targetPhrase.GetSize();
+
+ outputSearchGraphStream << "[" << startNode << "]";
+ for (int targetWordIndex=0; targetWordIndex<targetWordCount; targetWordIndex+=1) {
+ outputSearchGraphStream << " " << targetPhrase.GetWord(targetWordIndex);
+ }
+ outputSearchGraphStream << " ||| ";
+ OutputFeatureValuesForHypergraph(thisHypo, outputSearchGraphStream);
+ outputSearchGraphStream << "\n";
+ }
}
}
}
@@ -1001,14 +1002,14 @@ void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSea
int hypothesisID = searchGraph[arcNumber].hypo->GetId();
if (nodes.count(hypothesisID) == 0) {
-
+
numNodes += targetWordCount;
nodes[hypothesisID] = numNodes;
//numNodes += 1;
bool terminalNode = (searchGraph[arcNumber].forward == -1);
if (terminalNode) {
- numArcs += 1;
+ numArcs += 1;
}
}
@@ -1038,35 +1039,35 @@ void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSea
int targetWordCount = targetPhrase.GetSize();
for (int targetWordIndex=0; targetWordIndex<targetWordCount; targetWordIndex+=1) {
- int x = (targetWordCount-targetWordIndex);
+ int x = (targetWordCount-targetWordIndex);
- outputSearchGraphStream << "J=" << arcNumber;
+ outputSearchGraphStream << "J=" << arcNumber;
- if (targetWordIndex==0) {
- outputSearchGraphStream << " S=" << startNode;
- } else {
- outputSearchGraphStream << " S=" << endNode - x;
- }
+ if (targetWordIndex==0) {
+ outputSearchGraphStream << " S=" << startNode;
+ } else {
+ outputSearchGraphStream << " S=" << endNode - x;
+ }
- outputSearchGraphStream << " E=" << endNode - (x-1)
- << " W=" << targetPhrase.GetWord(targetWordIndex);
+ outputSearchGraphStream << " E=" << endNode - (x-1)
+ << " W=" << targetPhrase.GetWord(targetWordIndex);
- OutputFeatureValuesForSLF(thisHypo, (targetWordIndex>0), outputSearchGraphStream);
+ OutputFeatureValuesForSLF(thisHypo, (targetWordIndex>0), outputSearchGraphStream);
- outputSearchGraphStream << endl;
+ outputSearchGraphStream << endl;
- arcNumber += 1;
+ arcNumber += 1;
}
if (terminalNode && terminalNodes.count(endNode) == 0) {
- terminalNodes.insert(endNode);
- outputSearchGraphStream << "J=" << arcNumber
- << " S=" << endNode
- << " E=" << numNodes
- << endl;
- arcNumber += 1;
+ terminalNodes.insert(endNode);
+ outputSearchGraphStream << "J=" << arcNumber
+ << " S=" << endNode
+ << " E=" << numNodes
+ << endl;
+ arcNumber += 1;
}
- }
+ }
}
}
@@ -1124,17 +1125,17 @@ void OutputSearchNode(long translationId, std::ostream &outputSearchGraphStream,
outputSearchGraphStream << " recombined=" << searchNode.recombinationHypo->GetId();
outputSearchGraphStream << " forward=" << searchNode.forward << " fscore=" << searchNode.fscore
- << " covered=" << searchNode.hypo->GetCurrSourceWordsRange().GetStartPos()
- << "-" << searchNode.hypo->GetCurrSourceWordsRange().GetEndPos();
+ << " covered=" << searchNode.hypo->GetCurrSourceWordsRange().GetStartPos()
+ << "-" << searchNode.hypo->GetCurrSourceWordsRange().GetEndPos();
// Modified so that -osgx is a superset of -osg (GST Oct 2011)
ScoreComponentCollection scoreBreakdown = searchNode.hypo->GetScoreBreakdown();
scoreBreakdown.MinusEquals( prevHypo->GetScoreBreakdown() );
//outputSearchGraphStream << " scores = [ " << StaticData::Instance().GetAllWeights();
- outputSearchGraphStream << " scores=\"" << scoreBreakdown << "\"";
+ outputSearchGraphStream << " scores=\"" << scoreBreakdown << "\"";
outputSearchGraphStream << " out=\"" << searchNode.hypo->GetSourcePhraseStringRep() << "|" <<
- searchNode.hypo->GetCurrTargetPhrase().GetStringRep(outputFactorOrder) << "\"" << endl;
+ searchNode.hypo->GetCurrTargetPhrase().GetStringRep(outputFactorOrder) << "\"" << endl;
// outputSearchGraphStream << " out=" << searchNode.hypo->GetCurrTargetPhrase().GetStringRep(outputFactorOrder) << endl;
}
diff --git a/moses/Manager.h b/moses/Manager.h
index 687d8dbeb..fd329c309 100644
--- a/moses/Manager.h
+++ b/moses/Manager.h
@@ -56,9 +56,9 @@ struct SearchGraphNode {
hypo(theHypo), recombinationHypo(theRecombinationHypo),
forward(theForward), fscore(theFscore) {}
- bool operator<(const SearchGraphNode& sgn) const {
- return this->hypo->GetId() < sgn.hypo->GetId();
- }
+ bool operator<(const SearchGraphNode& sgn) const {
+ return this->hypo->GetId() < sgn.hypo->GetId();
+ }
};
diff --git a/moses/MockHypothesis.cpp b/moses/MockHypothesis.cpp
index e98794cb7..826104565 100644
--- a/moses/MockHypothesis.cpp
+++ b/moses/MockHypothesis.cpp
@@ -19,7 +19,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-#include "MockHypothesis.h"
+#include "MockHypothesis.h"
#include <boost/test/unit_test.hpp>
@@ -28,19 +28,20 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
using namespace Moses;
using namespace std;
-namespace MosesTest {
+namespace MosesTest
+{
MockHypothesisGuard::MockHypothesisGuard(
- const string& sourceSentence,
- const vector<Alignment>& alignments,
- const vector<string>& targetSegments)
-: m_emptyTarget(),
- m_sentence(),
- m_wp("WordPenalty"),
- m_uwp("UnknownWordPenalty"),
- m_dist("Distortion"),
- m_manager(0,m_sentence,Normal)
+ const string& sourceSentence,
+ const vector<Alignment>& alignments,
+ const vector<string>& targetSegments)
+ : m_emptyTarget(),
+ m_sentence(),
+ m_wp("WordPenalty"),
+ m_uwp("UnknownWordPenalty"),
+ m_dist("Distortion"),
+ m_manager(0,m_sentence,Normal)
{
BOOST_CHECK_EQUAL(alignments.size(), targetSegments.size());
@@ -49,7 +50,7 @@ MockHypothesisGuard::MockHypothesisGuard(
stringstream in(sourceSentence + "\n");
m_sentence.Read(in,factors);
-
+
//Initial empty hypothesis
m_manager.ResetSentenceStats(m_sentence);
@@ -58,21 +59,20 @@ MockHypothesisGuard::MockHypothesisGuard(
//create the chain
vector<Alignment>::const_iterator ai = alignments.begin();
vector<string>::const_iterator ti = targetSegments.begin();
- for (; ti != targetSegments.end() && ai != alignments.end(); ++ti,++ai)
- {
+ for (; ti != targetSegments.end() && ai != alignments.end(); ++ti,++ai) {
Hypothesis* prevHypo = m_hypothesis;
WordsRange wordsRange(ai->first,ai->second);
m_targetPhrases.push_back(TargetPhrase());
m_targetPhrases.back().CreateFromString(Input, factors, *ti, "|", NULL);
m_toptions.push_back(new TranslationOption
- (wordsRange,m_targetPhrases.back()));
- m_hypothesis = Hypothesis::Create(*prevHypo,*m_toptions.back(),NULL);
+ (wordsRange,m_targetPhrases.back()));
+ m_hypothesis = Hypothesis::Create(*prevHypo,*m_toptions.back(),NULL);
}
}
-MockHypothesisGuard::~MockHypothesisGuard()
+MockHypothesisGuard::~MockHypothesisGuard()
{
RemoveAllInColl(m_toptions);
while (m_hypothesis) {
diff --git a/moses/MockHypothesis.h b/moses/MockHypothesis.h
index 2490dd5a6..67182ad56 100644
--- a/moses/MockHypothesis.h
+++ b/moses/MockHypothesis.h
@@ -29,7 +29,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "Hypothesis.h"
#include "Manager.h"
-namespace MosesTest {
+namespace MosesTest
+{
//
// Construct a hypothesis with arbitrary source and target phrase
@@ -38,42 +39,52 @@ namespace MosesTest {
typedef std::pair<size_t,size_t> Alignment; //(first,last) in source
-class MockHypothesisGuard {
- public:
- /** Creates a phrase-based hypothesis.
- */
- MockHypothesisGuard(
- const std::string& sourceSentence,
- const std::vector<Alignment>& alignments,
- const std::vector<std::string>& targetSegments);
- Moses::Hypothesis* operator*() const {return m_hypothesis;}
-
- /** Destroy the hypothesis chain */
- ~MockHypothesisGuard();
-
- private:
- Moses::TargetPhrase m_emptyTarget;
- Moses::Sentence m_sentence;
- Moses::WordPenaltyProducer m_wp;
- Moses::UnknownWordPenaltyProducer m_uwp;
- Moses::DistortionScoreProducer m_dist;
- Moses::Manager m_manager;
- Moses::Hypothesis* m_hypothesis;
- std::vector<Moses::TargetPhrase> m_targetPhrases;
- std::vector<Moses::TranslationOption*> m_toptions;
+class MockHypothesisGuard
+{
+public:
+ /** Creates a phrase-based hypothesis.
+ */
+ MockHypothesisGuard(
+ const std::string& sourceSentence,
+ const std::vector<Alignment>& alignments,
+ const std::vector<std::string>& targetSegments);
+ Moses::Hypothesis* operator*() const {
+ return m_hypothesis;
+ }
+
+ /** Destroy the hypothesis chain */
+ ~MockHypothesisGuard();
+
+private:
+ Moses::TargetPhrase m_emptyTarget;
+ Moses::Sentence m_sentence;
+ Moses::WordPenaltyProducer m_wp;
+ Moses::UnknownWordPenaltyProducer m_uwp;
+ Moses::DistortionScoreProducer m_dist;
+ Moses::Manager m_manager;
+ Moses::Hypothesis* m_hypothesis;
+ std::vector<Moses::TargetPhrase> m_targetPhrases;
+ std::vector<Moses::TranslationOption*> m_toptions;
};
-class HypothesisFixture {
- public:
- HypothesisFixture();
- const Moses::Hypothesis* empty() {return **m_empty;}
- const Moses::Hypothesis* partial() {return **m_partial;}
- const Moses::Hypothesis* full() {return **m_full;}
-
- private:
- std::auto_ptr<MockHypothesisGuard> m_empty;
- std::auto_ptr<MockHypothesisGuard> m_partial;
- std::auto_ptr<MockHypothesisGuard> m_full;
+class HypothesisFixture
+{
+public:
+ HypothesisFixture();
+ const Moses::Hypothesis* empty() {
+ return **m_empty;
+ }
+ const Moses::Hypothesis* partial() {
+ return **m_partial;
+ }
+ const Moses::Hypothesis* full() {
+ return **m_full;
+ }
+
+private:
+ std::auto_ptr<MockHypothesisGuard> m_empty;
+ std::auto_ptr<MockHypothesisGuard> m_partial;
+ std::auto_ptr<MockHypothesisGuard> m_full;
};
diff --git a/moses/OutputCollector.h b/moses/OutputCollector.h
index 96353934e..5f72433d8 100644
--- a/moses/OutputCollector.h
+++ b/moses/OutputCollector.h
@@ -45,27 +45,23 @@ public:
OutputCollector(std::ostream* outStream= &std::cout, std::ostream* debugStream=&std::cerr) :
m_nextOutput(0),m_outStream(outStream),m_debugStream(debugStream),
m_isHoldingOutputStream(false), m_isHoldingDebugStream(false) {}
-
- ~OutputCollector()
- {
+
+ ~OutputCollector() {
if (m_isHoldingOutputStream)
delete m_outStream;
if (m_isHoldingDebugStream)
delete m_debugStream;
}
-
- void HoldOutputStream()
- {
+
+ void HoldOutputStream() {
m_isHoldingOutputStream = true;
}
-
- void HoldDebugStream()
- {
+
+ void HoldDebugStream() {
m_isHoldingDebugStream = true;
}
-
- bool OutputIsCout() const
- {
+
+ bool OutputIsCout() const {
return (m_outStream == std::cout);
}
@@ -87,7 +83,7 @@ public:
*m_outStream << iter->second << std::flush;
++m_nextOutput;
std::map<int,std::string>::iterator debugIter = m_debugs.find(iter->first);
- m_outputs.erase(iter);
+ m_outputs.erase(iter);
if (debugIter != m_debugs.end()) {
*m_debugStream << debugIter->second << std::flush;
m_debugs.erase(debugIter);
diff --git a/moses/PCNTools.h b/moses/PCNTools.h
index 8a31e99ad..ea43df838 100644
--- a/moses/PCNTools.h
+++ b/moses/PCNTools.h
@@ -36,7 +36,7 @@ namespace PCN
typedef std::pair<std::pair<std::string, std::vector<float> >, size_t> CNAlt;
typedef std::vector<CNAlt> CNCol;
typedef std::vector<CNCol> CN;
-
+
/** Given a string ((('foo',0.1,1),('bar',0.9,2)),...) representation of a
* word lattice in PCN format, return a CN object representing the lattice
*/
diff --git a/moses/PDTAimp.h b/moses/PDTAimp.h
index 835cd6895..735179217 100644
--- a/moses/PDTAimp.h
+++ b/moses/PDTAimp.h
@@ -38,7 +38,7 @@ protected:
: m_dict(0),
m_obj(p),useCache(1),m_numInputScores(nis),totalE(0),distinctE(0) {}
- public:
+public:
std::vector<FactorType> m_input,m_output;
PhraseDictionaryTree *m_dict;
typedef std::vector<TargetPhraseCollection const*> vTPC;
@@ -185,7 +185,7 @@ protected:
void Create(const std::vector<FactorType> &input
, const std::vector<FactorType> &output
, const std::string &filePath
- , const std::vector<float> &weight
+ , const std::vector<float> &weight
) {
// set my members
@@ -267,10 +267,10 @@ protected:
StringTgtCand::Tokens const& factorStrings,
Scores const& scoreVector,
const ScoreComponentCollection& sparseFeatures,
- std::vector<float> &weights,
- float weightWP,
+ std::vector<float> &weights,
+ float weightWP,
Phrase const* srcPtr) const {
- FactorCollection &factorCollection = FactorCollection::Instance();
+ FactorCollection &factorCollection = FactorCollection::Instance();
for(size_t k=0; k<factorStrings.size(); ++k) {
util::TokenIter<util::MultiCharacter, false> word(*factorStrings[k], StaticData::Instance().GetFactorDelimiter());
@@ -438,8 +438,8 @@ protected:
//put in phrase table scores, logging as we insert
std::transform(tcands[i].scores.begin(),tcands[i].scores.end(),nscores.begin() + m_numInputScores,TransformScore);
-
- CHECK(nscores.size()==weightT.size());
+
+ CHECK(nscores.size()==weightT.size());
//tally up
float score=std::inner_product(nscores.begin(), nscores.end(), weightT.begin(), 0.0f);
diff --git a/moses/Parameter.cpp b/moses/Parameter.cpp
index 4264010cd..e16b6d08f 100644
--- a/moses/Parameter.cpp
+++ b/moses/Parameter.cpp
@@ -69,30 +69,30 @@ Parameter::Parameter()
AddParam("report-all-factors", "report all factors in output, not just first");
AddParam("report-all-factors-in-n-best", "Report all factors in n-best-lists. Default is false");
#ifdef HAVE_SYNLM
- AddParam("slmodel-file", "location of the syntactic language model file(s)");
- AddParam("slmodel-factor", "factor to use with syntactic language model");
- AddParam("slmodel-beam", "beam width to use with syntactic language model's parser");
+ AddParam("slmodel-file", "location of the syntactic language model file(s)");
+ AddParam("slmodel-factor", "factor to use with syntactic language model");
+ AddParam("slmodel-beam", "beam width to use with syntactic language model's parser");
#endif
AddParam("stack", "s", "maximum stack size for histogram pruning");
AddParam("stack-diversity", "sd", "minimum number of hypothesis of each coverage in stack (default 0)");
AddParam("threads","th", "number of threads to use in decoding (defaults to single-threaded)");
- AddParam("translation-details", "T", "for each best hypothesis, report translation details to the given file");
- AddParam("ttable-file", "location and properties of the translation tables");
- AddParam("ttable-limit", "ttl", "maximum number of translation table entries per input phrase");
- AddParam("translation-option-threshold", "tot", "threshold for translation options relative to best for input phrase");
- AddParam("early-discarding-threshold", "edt", "threshold for constructing hypotheses based on estimate cost");
- AddParam("verbose", "v", "verbosity level of the logging");
+ AddParam("translation-details", "T", "for each best hypothesis, report translation details to the given file");
+ AddParam("ttable-file", "location and properties of the translation tables");
+ AddParam("ttable-limit", "ttl", "maximum number of translation table entries per input phrase");
+ AddParam("translation-option-threshold", "tot", "threshold for translation options relative to best for input phrase");
+ AddParam("early-discarding-threshold", "edt", "threshold for constructing hypotheses based on estimate cost");
+ AddParam("verbose", "v", "verbosity level of the logging");
AddParam("references", "Reference file(s) - used for bleu score feature");
- AddParam("output-factors", "list if factors in the output");
- AddParam("cache-path", "?");
- AddParam("distortion-limit", "dl", "distortion (reordering) limit in maximum number of words (0 = monotone, -1 = unlimited)");
- AddParam("monotone-at-punctuation", "mp", "do not reorder over punctuation");
- AddParam("distortion-file", "source factors (0 if table independent of source), target factors, location of the factorized/lexicalized reordering tables");
- AddParam("distortion", "configurations for each factorized/lexicalized reordering model.");
- AddParam("early-distortion-cost", "edc", "include estimate of distortion cost yet to be incurred in the score [Moore & Quirk 2007]. Default is no");
- AddParam("xml-input", "xi", "allows markup of input with desired translations and probabilities. values can be 'pass-through' (default), 'inclusive', 'exclusive', 'ignore'");
+ AddParam("output-factors", "list if factors in the output");
+ AddParam("cache-path", "?");
+ AddParam("distortion-limit", "dl", "distortion (reordering) limit in maximum number of words (0 = monotone, -1 = unlimited)");
+ AddParam("monotone-at-punctuation", "mp", "do not reorder over punctuation");
+ AddParam("distortion-file", "source factors (0 if table independent of source), target factors, location of the factorized/lexicalized reordering tables");
+ AddParam("distortion", "configurations for each factorized/lexicalized reordering model.");
+ AddParam("early-distortion-cost", "edc", "include estimate of distortion cost yet to be incurred in the score [Moore & Quirk 2007]. Default is no");
+ AddParam("xml-input", "xi", "allows markup of input with desired translations and probabilities. values can be 'pass-through' (default), 'inclusive', 'exclusive', 'ignore'");
AddParam("xml-brackets", "xb", "specify strings to be used as xml tags opening and closing, e.g. \"{{ }}\" (default \"< >\"). Avoid square brackets because of configuration file format. Valid only with text input mode" );
- AddParam("minimum-bayes-risk", "mbr", "use miminum Bayes risk to determine best translation");
+ AddParam("minimum-bayes-risk", "mbr", "use miminum Bayes risk to determine best translation");
AddParam("lminimum-bayes-risk", "lmbr", "use lattice miminum Bayes risk to determine best translation");
AddParam("mira", "do mira training");
AddParam("consensus-decoding", "con", "use consensus decoding (De Nero et. al. 2009)");
@@ -119,17 +119,17 @@ Parameter::Parameter()
#ifdef HAVE_PROTOBUF
AddParam("output-search-graph-pb", "pb", "Write phrase lattice to protocol buffer objects in the specified path.");
#endif
- AddParam("cube-pruning-pop-limit", "cbp", "How many hypotheses should be popped for each stack. (default = 1000)");
- AddParam("cube-pruning-diversity", "cbd", "How many hypotheses should be created for each coverage. (default = 0)");
- AddParam("search-algorithm", "Which search algorithm to use. 0=normal stack, 1=cube pruning, 2=cube growing. (default = 0)");
- AddParam("constraint", "Location of the file with target sentences to produce constraining the search");
- AddParam("description", "Source language, target language, description");
- AddParam("max-chart-span", "maximum num. of source word chart rules can consume (default 10)");
- AddParam("non-terminals", "list of non-term symbols, space separated");
- AddParam("rule-limit", "a little like table limit. But for chart decoding rules. Default is DEFAULT_MAX_TRANS_OPT_SIZE");
- AddParam("source-label-overlap", "What happens if a span already has a label. 0=add more. 1=replace. 2=discard. Default is 0");
- AddParam("output-hypo-score", "Output the hypo score to stdout with the output string. For search error analysis. Default is false");
- AddParam("unknown-lhs", "file containing target lhs of unknown words. 1 per line: LHS prob");
+ AddParam("cube-pruning-pop-limit", "cbp", "How many hypotheses should be popped for each stack. (default = 1000)");
+ AddParam("cube-pruning-diversity", "cbd", "How many hypotheses should be created for each coverage. (default = 0)");
+ AddParam("search-algorithm", "Which search algorithm to use. 0=normal stack, 1=cube pruning, 2=cube growing. (default = 0)");
+ AddParam("constraint", "Location of the file with target sentences to produce constraining the search");
+ AddParam("description", "Source language, target language, description");
+ AddParam("max-chart-span", "maximum num. of source word chart rules can consume (default 10)");
+ AddParam("non-terminals", "list of non-term symbols, space separated");
+ AddParam("rule-limit", "a little like table limit. But for chart decoding rules. Default is DEFAULT_MAX_TRANS_OPT_SIZE");
+ AddParam("source-label-overlap", "What happens if a span already has a label. 0=add more. 1=replace. 2=discard. Default is 0");
+ AddParam("output-hypo-score", "Output the hypo score to stdout with the output string. For search error analysis. Default is false");
+ AddParam("unknown-lhs", "file containing target lhs of unknown words. 1 per line: LHS prob");
AddParam("phrase-pair-feature", "Source and target factors for phrase pair feature");
AddParam("phrase-boundary-source-feature", "Source factors for phrase boundary feature");
AddParam("phrase-boundary-target-feature", "Target factors for phrase boundary feature");
@@ -153,9 +153,9 @@ Parameter::Parameter()
AddParam("show-weights", "print feature weights and exit");
AddParam("start-translation-id", "Id of 1st input. Default = 0");
AddParam("output-unknowns", "Output the unknown (OOV) words to the given file, one line per sentence");
-
- // Compact phrase table and reordering table.
- AddParam("minlexr-memory", "Load lexical reordering table in minlexr format into memory");
+
+ // Compact phrase table and reordering table.
+ AddParam("minlexr-memory", "Load lexical reordering table in minlexr format into memory");
AddParam("minphr-memory", "Load phrase table in minphr format into memory");
AddParam("print-alignment-info", "Output word-to-word alignment to standard out, separated from translation by |||. Word-to-word alignments are takne from the phrase table if any. Default is false");
@@ -214,7 +214,7 @@ void Parameter::AddParam(const string &paramName, const string &abbrevName, cons
m_valid[paramName] = true;
m_valid[abbrevName] = true;
m_abbreviation[paramName] = abbrevName;
- m_fullname[abbrevName] = paramName;
+ m_fullname[abbrevName] = paramName;
m_description[paramName] = description;
}
@@ -263,7 +263,7 @@ bool Parameter::LoadParam(int argc, char* argv[])
PrintCredit();
Explain();
- cerr << endl;
+ cerr << endl;
UserMessage::Add("No configuration file was specified. Use -config or -f");
cerr << endl;
return false;
@@ -381,11 +381,9 @@ void Parameter::ConvertWeightArgsSingleWeight(const string &oldWeightName, const
PARAM_MAP::iterator iterMap;
iterMap = m_setting.find(oldWeightName);
- if (iterMap != m_setting.end())
- {
+ if (iterMap != m_setting.end()) {
const PARAM_VEC &weights = iterMap->second;
- for (size_t i = 0; i < weights.size(); ++i)
- {
+ for (size_t i = 0; i < weights.size(); ++i) {
SetWeight(newWeightName, ind, Scan<float>(weights[i]));
}
@@ -403,8 +401,7 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName)
CHECK(numInputScores.size() == 0);
numInputScores.push_back("1");
numInputScores.push_back("0");
- }
- else if (inputWeights.size() == 2) {
+ } else if (inputWeights.size() == 2) {
CHECK(numInputScores.size() == 0);
numInputScores.push_back("1");
numInputScores.push_back("1");
@@ -463,8 +460,7 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName)
PhraseTableImplementation implementation = (PhraseTableImplementation) Scan<int>(token[0]);
string ptType;
- switch (implementation)
- {
+ switch (implementation) {
case Memory:
ptType = "PhraseDictionaryMemory";
break;
@@ -488,8 +484,7 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName)
if (ptIndices.find(ptType) == ptIndices.end()) {
ptIndices[ptType] = 0;
ptInd = 0;
- }
- else {
+ } else {
ptInd = ++ptIndices[ptType];
}
@@ -516,7 +511,7 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName)
//characteristics of the phrase table
vector<FactorType> input = Tokenize<FactorType>(token[1], ",")
- ,output = Tokenize<FactorType>(token[2], ",");
+ ,output = Tokenize<FactorType>(token[2], ",");
size_t numScoreComponent = Scan<size_t>(token[3]);
string filePath= token[4];
@@ -561,14 +556,13 @@ void Parameter::ConvertWeightArgsDistortion()
// distortion / lex distortion
const PARAM_VEC &oldWeights = GetParam(oldWeightName);
- if (oldWeights.size() > 0)
- {
+ if (oldWeights.size() > 0) {
if (!isParamSpecified("search-algorithm") ||
- (GetParam("search-algorithm").size() > 0
- && (Trim(GetParam("search-algorithm")[0]) == "0"
+ (GetParam("search-algorithm").size() > 0
+ && (Trim(GetParam("search-algorithm")[0]) == "0"
||Trim(GetParam("search-algorithm")[0]) == "1"
- )
- )
+ )
+ )
) {
// phrase-based. Add distance distortion to list of features
AddFeature("Distortion");
@@ -587,8 +581,7 @@ void Parameter::ConvertWeightArgsDistortion()
size_t numFF = Scan<size_t>(toks[2]);
vector<float> weights(numFF);
- for (size_t currFF = 0; currFF < numFF; ++currFF)
- {
+ for (size_t currFF = 0; currFF < numFF; ++currFF) {
CHECK(currOldInd < oldWeights.size());
float weight = Scan<float>(oldWeights[currOldInd]);
weights[currFF] = weight;
@@ -625,12 +618,12 @@ void Parameter::ConvertWeightArgsLM()
bool isChartDecoding = true;
if (!isParamSpecified("search-algorithm") ||
- (GetParam("search-algorithm").size() > 0
- && (Trim(GetParam("search-algorithm")[0]) == "0"
- ||Trim(GetParam("search-algorithm")[0]) == "1"
- )
- )
- ) {
+ (GetParam("search-algorithm").size() > 0
+ && (Trim(GetParam("search-algorithm")[0]) == "0"
+ ||Trim(GetParam("search-algorithm")[0]) == "1"
+ )
+ )
+ ) {
isChartDecoding = false;
}
@@ -643,8 +636,7 @@ void Parameter::ConvertWeightArgsLM()
PARAM_MAP::iterator iterMap;
iterMap = m_setting.find(oldWeightName);
- if (iterMap != m_setting.end())
- {
+ if (iterMap != m_setting.end()) {
size_t currOldInd = 0;
const PARAM_VEC &weights = iterMap->second;
@@ -656,8 +648,7 @@ void Parameter::ConvertWeightArgsLM()
int lmType = Scan<int>(modelToks[0]);
string newFeatureName;
- switch (lmType)
- {
+ switch (lmType) {
case 0:
newFeatureName = "SRILM";
break;
@@ -677,12 +668,11 @@ void Parameter::ConvertWeightArgsLM()
numFF += oovWeights[lmIndex];
vector<float> weightsLM(numFF);
- for (size_t currFF = 0; currFF < numFF; ++currFF)
- {
+ for (size_t currFF = 0; currFF < numFF; ++currFF) {
CHECK(currOldInd < weights.size());
weightsLM[currFF] = Scan<float>(weights[currOldInd]);
if (isChartDecoding) {
- weightsLM[currFF] = UntransformLMScore(weightsLM[currFF]);
+ weightsLM[currFF] = UntransformLMScore(weightsLM[currFF]);
}
++currOldInd;
@@ -691,12 +681,11 @@ void Parameter::ConvertWeightArgsLM()
SetWeight(newFeatureName, ind, weightsLM);
string featureLine = newFeatureName + " "
- + "factor=" + modelToks[1] + " " // factor
- + "order=" + modelToks[2] + " "; // order
+ + "factor=" + modelToks[1] + " " // factor
+ + "order=" + modelToks[2] + " "; // order
if (lmType == 9) {
featureLine += "lazyken=1 ";
- }
- else if (lmType == 8) {
+ } else if (lmType == 8) {
featureLine += "lazyken=0 ";
}
@@ -718,8 +707,7 @@ void Parameter::ConvertWeightArgsGeneration(const std::string &oldWeightName, co
// distortion / lex distortion
PARAM_VEC &oldWeights = m_setting[oldWeightName];
- if (oldWeights.size() > 0)
- {
+ if (oldWeights.size() > 0) {
size_t currOldInd = 0;
PARAM_VEC &models = m_setting[oldFeatureName];
@@ -730,8 +718,7 @@ void Parameter::ConvertWeightArgsGeneration(const std::string &oldWeightName, co
size_t numFF = Scan<size_t>(modelToks[2]);
vector<float> weights(numFF);
- for (size_t currFF = 0; currFF < numFF; ++currFF)
- {
+ for (size_t currFF = 0; currFF < numFF; ++currFF) {
CHECK(currOldInd < oldWeights.size());
float weight = Scan<float>(oldWeights[currOldInd]);
weights[currFF] = weight;
@@ -742,10 +729,10 @@ void Parameter::ConvertWeightArgsGeneration(const std::string &oldWeightName, co
stringstream strme;
strme << "Generation "
- << "input-factor=" << modelToks[0] << " "
- << "output-factor=" << modelToks[1] << " "
- << "num-features=" << modelToks[2] << " "
- << "path=" << modelToks[3];
+ << "input-factor=" << modelToks[0] << " "
+ << "output-factor=" << modelToks[1] << " "
+ << "num-features=" << modelToks[2] << " "
+ << "path=" << modelToks[3];
AddFeature(strme.str());
}
}
@@ -761,23 +748,21 @@ void Parameter::ConvertWeightArgsWordPenalty()
bool isChartDecoding = true;
if (!isParamSpecified("search-algorithm") ||
- (GetParam("search-algorithm").size() > 0
- && (Trim(GetParam("search-algorithm")[0]) == "0"
- ||Trim(GetParam("search-algorithm")[0]) == "1"
- )
- )
- ) {
+ (GetParam("search-algorithm").size() > 0
+ && (Trim(GetParam("search-algorithm")[0]) == "0"
+ ||Trim(GetParam("search-algorithm")[0]) == "1"
+ )
+ )
+ ) {
isChartDecoding = false;
}
PARAM_MAP::iterator iterMap;
iterMap = m_setting.find(oldWeightName);
- if (iterMap != m_setting.end())
- {
+ if (iterMap != m_setting.end()) {
const PARAM_VEC &weights = iterMap->second;
- for (size_t i = 0; i < weights.size(); ++i)
- {
+ for (size_t i = 0; i < weights.size(); ++i) {
float weight = Scan<float>(weights[i]);
if (isChartDecoding) {
weight *= 0.434294482;
@@ -800,8 +785,7 @@ void Parameter::ConvertWeightArgs()
(m_setting.count("weight-i") || m_setting.count("weight-t") || m_setting.count("weight-w") ||
m_setting.count("weight-l") || m_setting.count("weight-u") || m_setting.count("weight-lex") ||
m_setting.count("weight-generation") || m_setting.count("weight-lr") || m_setting.count("weight-d")
- ))
- {
+ )) {
cerr << "Do not mix old and new format for specify weights";
}
@@ -833,8 +817,7 @@ void Parameter::ConvertWeightArgs()
void Parameter::CreateWeightsMap()
{
PARAM_VEC &vec = m_setting["weight"];
- for (size_t i = 0; i < vec.size(); ++i)
- {
+ for (size_t i = 0; i < vec.size(); ++i) {
const string &line = vec[i];
vector<string> toks = Tokenize(line);
CHECK(toks.size() >= 2);
@@ -865,8 +848,7 @@ void Parameter::WeightOverwrite()
string name("");
vector<float> weights;
vector<string> toks = Tokenize(vec[0]);
- for (size_t i = 0; i < toks.size(); ++i)
- {
+ for (size_t i = 0; i < toks.size(); ++i) {
const string &tok = toks[i];
if (tok.substr(tok.size() - 1, 1) == "=") {
@@ -879,8 +861,7 @@ void Parameter::WeightOverwrite()
}
name = tok.substr(0, tok.size() - 1);
- }
- else {
+ } else {
// a weight for curr ff
float weight = Scan<float>(toks[i]);
weights.push_back(weight);
@@ -899,14 +880,13 @@ bool Parameter::Validate()
PARAM_MAP::const_iterator iterParams;
for (iterParams = m_setting.begin(); iterParams != m_setting.end(); ++iterParams) {
const std::string &key = iterParams->first;
-
- if (m_valid.find(key) == m_valid.end())
- {
+
+ if (m_valid.find(key) == m_valid.end()) {
UserMessage::Add("Unknown parameter " + key);
noErrorFlag = false;
}
}
-
+
if (m_setting["lmodel-dub"].size() > 0) {
if (m_setting["lmodel-file"].size() != m_setting["lmodel-dub"].size()) {
stringstream errorMsg("");
@@ -1082,8 +1062,7 @@ bool Parameter::ReadConfigFile(const string &filePath )
if (line.size() == 0) {
// blank line. do nothing.
- }
- else if (line[0]=='[') {
+ } else if (line[0]=='[') {
// new parameter
for (size_t currPos = 0 ; currPos < line.size() ; currPos++) {
if (line[currPos] == ']') {
@@ -1227,23 +1206,23 @@ void Parameter::PrintCredit()
* \param values inew values for paramName */
void Parameter::OverwriteParam(const string &paramName, PARAM_VEC values)
{
- VERBOSE(2,"Overwriting parameter " << paramName);
-
- m_setting[paramName]; // defines the parameter, important for boolean switches
- if (m_setting[paramName].size() > 1){
- VERBOSE(2," (the parameter had " << m_setting[paramName].size() << " previous values)");
- CHECK(m_setting[paramName].size() == values.size());
- }else{
- VERBOSE(2," (the parameter does not have previous values)");
- m_setting[paramName].resize(values.size());
- }
- VERBOSE(2," with the following values:");
- int i=0;
- for (PARAM_VEC::iterator iter = values.begin(); iter != values.end() ; iter++, i++){
- m_setting[paramName][i] = *iter;
- VERBOSE(2, " " << *iter);
- }
- VERBOSE(2, std::endl);
+ VERBOSE(2,"Overwriting parameter " << paramName);
+
+ m_setting[paramName]; // defines the parameter, important for boolean switches
+ if (m_setting[paramName].size() > 1) {
+ VERBOSE(2," (the parameter had " << m_setting[paramName].size() << " previous values)");
+ CHECK(m_setting[paramName].size() == values.size());
+ } else {
+ VERBOSE(2," (the parameter does not have previous values)");
+ m_setting[paramName].resize(values.size());
+ }
+ VERBOSE(2," with the following values:");
+ int i=0;
+ for (PARAM_VEC::iterator iter = values.begin(); iter != values.end() ; iter++, i++) {
+ m_setting[paramName][i] = *iter;
+ VERBOSE(2, " " << *iter);
+ }
+ VERBOSE(2, std::endl);
}
std::set<std::string> Parameter::GetWeightNames() const
@@ -1256,7 +1235,7 @@ std::set<std::string> Parameter::GetWeightNames() const
}
return ret;
}
-
+
}
diff --git a/moses/Parameter.h b/moses/Parameter.h
index a78314692..a0c372a53 100644
--- a/moses/Parameter.h
+++ b/moses/Parameter.h
@@ -38,16 +38,16 @@ typedef std::map<std::string, std::string > PARAM_STRING;
/** Handles parameter values set in config file or on command line.
* Process raw parameter data (names and values as strings) for StaticData
- * to parse; to get useful values, see StaticData.
+ * to parse; to get useful values, see StaticData.
*/
class Parameter
{
protected:
- PARAM_MAP m_setting;
- PARAM_BOOL m_valid;
- PARAM_STRING m_abbreviation;
- PARAM_STRING m_description;
- PARAM_STRING m_fullname;
+ PARAM_MAP m_setting;
+ PARAM_BOOL m_valid;
+ PARAM_STRING m_abbreviation;
+ PARAM_STRING m_description;
+ PARAM_STRING m_fullname;
std::map<std::string, std::vector<float> > m_weights;
@@ -93,32 +93,30 @@ public:
bool isParamSpecified(const std::string &paramName) {
return m_setting.find( paramName ) != m_setting.end();
}
-
- const std::string GetFullName(std::string abbr)
- {
- return m_fullname[abbr];
- }
-
- const std::string GetAbbreviation(std::string full)
- {
- return m_abbreviation[full];
- }
- const PARAM_VEC &GetParamShortName(const std::string &paramName)
- {
- return GetParam(GetFullName(paramName));
- }
-
- void OverwriteParam(const std::string &paramName, PARAM_VEC values);
-
- void OverwriteParamShortName(const std::string &paramShortName, PARAM_VEC values){
- OverwriteParam(GetFullName(paramShortName),values);
- }
-
+
+ const std::string GetFullName(std::string abbr) {
+ return m_fullname[abbr];
+ }
+
+ const std::string GetAbbreviation(std::string full) {
+ return m_abbreviation[full];
+ }
+ const PARAM_VEC &GetParamShortName(const std::string &paramName) {
+ return GetParam(GetFullName(paramName));
+ }
+
+ void OverwriteParam(const std::string &paramName, PARAM_VEC values);
+
+ void OverwriteParamShortName(const std::string &paramShortName, PARAM_VEC values) {
+ OverwriteParam(GetFullName(paramShortName),values);
+ }
+
std::vector<float> &GetWeights(const std::string &name);
std::set<std::string> GetWeightNames() const;
- const PARAM_MAP &GetParams() const
- { return m_setting; }
+ const PARAM_MAP &GetParams() const {
+ return m_setting;
+ }
};
diff --git a/moses/PartialTranslOptColl.h b/moses/PartialTranslOptColl.h
index f4f40d413..5a4e816de 100644
--- a/moses/PartialTranslOptColl.h
+++ b/moses/PartialTranslOptColl.h
@@ -39,7 +39,7 @@ namespace Moses
* The expansion process itself may be still explode, so efficient handling
* of partial translation options during expansion is required.
* This class assists in this tasks by implementing pruning.
- * This implementation is similar to the one in HypothesisStack.
+ * This implementation is similar to the one in HypothesisStack.
*/
class PartialTranslOptColl
{
diff --git a/moses/Phrase.cpp b/moses/Phrase.cpp
index ef5d09b23..3fa607fb4 100644
--- a/moses/Phrase.cpp
+++ b/moses/Phrase.cpp
@@ -103,16 +103,15 @@ Phrase Phrase::GetSubString(const WordsRange &wordsRange) const
Phrase Phrase::GetSubString(const WordsRange &wordsRange, FactorType factorType) const
{
- Phrase retPhrase(wordsRange.GetNumWordsCovered());
+ Phrase retPhrase(wordsRange.GetNumWordsCovered());
- for (size_t currPos = wordsRange.GetStartPos() ; currPos <= wordsRange.GetEndPos() ; currPos++)
- {
- const Factor* f = GetFactor(currPos, factorType);
- Word &word = retPhrase.AddWord();
- word.SetFactor(factorType, f);
- }
+ for (size_t currPos = wordsRange.GetStartPos() ; currPos <= wordsRange.GetEndPos() ; currPos++) {
+ const Factor* f = GetFactor(currPos, factorType);
+ Word &word = retPhrase.AddWord();
+ word.SetFactor(factorType, f);
+ }
- return retPhrase;
+ return retPhrase;
}
std::string Phrase::GetStringRep(const vector<FactorType> factorsToPrint) const
@@ -153,10 +152,10 @@ void Phrase::PrependWord(const Word &newWord)
}
void Phrase::CreateFromString(FactorDirection direction
- ,const std::vector<FactorType> &factorOrder
- ,const StringPiece &phraseString
- ,const StringPiece &factorDelimiter
- ,Word **lhs)
+ ,const std::vector<FactorType> &factorOrder
+ ,const StringPiece &phraseString
+ ,const StringPiece &factorDelimiter
+ ,Word **lhs)
{
// parse
vector<StringPiece> annotatedWordVector;
@@ -165,9 +164,9 @@ void Phrase::CreateFromString(FactorDirection direction
}
if (annotatedWordVector.size() == 0) {
- if (lhs) {
- (*lhs) = NULL;
- }
+ if (lhs) {
+ (*lhs) = NULL;
+ }
return;
}
@@ -188,8 +187,7 @@ void Phrase::CreateFromString(FactorDirection direction
(*lhs) = new Word(true);
(*lhs)->CreateFromString(direction, factorOrder, annotatedWord.substr(1, annotatedWord.size() - 2), true);
assert((*lhs)->IsNonTerminal());
- }
- else {
+ } else {
numWords = annotatedWordVector.size();
//CHECK(lhs == NULL);
if (lhs) {
diff --git a/moses/Phrase.h b/moses/Phrase.h
index 196e403ac..209f92f9e 100644
--- a/moses/Phrase.h
+++ b/moses/Phrase.h
@@ -69,13 +69,13 @@ public:
/** Fills phrase with words from format string, typically from phrase table or sentence input
* \param factorOrder factor types of each element in 2D string vector
* \param phraseString formatted input string to parse
- * \param factorDelimiter delimiter between factors.
+ * \param factorDelimiter delimiter between factors.
*/
void CreateFromString(FactorDirection direction
, const std::vector<FactorType> &factorOrder
- , const StringPiece &phraseString
- , const StringPiece &factorDelimiter
- , Word **lhs);
+ , const StringPiece &phraseString
+ , const StringPiece &factorDelimiter
+ , Word **lhs);
/** copy factors from the other phrase to this phrase.
IsCompatible() must be run beforehand to ensure incompatible factors aren't overwritten
@@ -127,52 +127,49 @@ public:
void AddWord(const Word &newWord) {
AddWord() = newWord;
}
-
- /** appends a phrase at the end of current phrase **/
- void Append(const Phrase &endPhrase);
- void PrependWord(const Word &newWord);
-
- void Clear()
- {
- m_words.clear();
- }
-
- void RemoveWord(size_t pos)
- {
- CHECK(pos < m_words.size());
- m_words.erase(m_words.begin() + pos);
- }
-
- //! create new phrase class that is a substring of this phrase
- Phrase GetSubString(const WordsRange &wordsRange) const;
+
+ /** appends a phrase at the end of current phrase **/
+ void Append(const Phrase &endPhrase);
+ void PrependWord(const Word &newWord);
+
+ void Clear() {
+ m_words.clear();
+ }
+
+ void RemoveWord(size_t pos) {
+ CHECK(pos < m_words.size());
+ m_words.erase(m_words.begin() + pos);
+ }
+
+ //! create new phrase class that is a substring of this phrase
+ Phrase GetSubString(const WordsRange &wordsRange) const;
Phrase GetSubString(const WordsRange &wordsRange, FactorType factorType) const;
-
- //! return a string rep of the phrase. Each factor is separated by the factor delimiter as specified in StaticData class
- std::string GetStringRep(const std::vector<FactorType> factorsToPrint) const;
-
- TO_STRING();
-
-
- int Compare(const Phrase &other) const;
-
- /** transitive comparison between 2 phrases
- * used to insert & find phrase in dictionary
- */
- bool operator< (const Phrase &compare) const
- {
- return Compare(compare) < 0;
- }
-
- bool operator== (const Phrase &compare) const
- {
- return Compare(compare) == 0;
- }
-
- void OnlyTheseFactors(const FactorMask &factors);
+
+ //! return a string rep of the phrase. Each factor is separated by the factor delimiter as specified in StaticData class
+ std::string GetStringRep(const std::vector<FactorType> factorsToPrint) const;
+
+ TO_STRING();
+
+
+ int Compare(const Phrase &other) const;
+
+ /** transitive comparison between 2 phrases
+ * used to insert & find phrase in dictionary
+ */
+ bool operator< (const Phrase &compare) const {
+ return Compare(compare) < 0;
+ }
+
+ bool operator== (const Phrase &compare) const {
+ return Compare(compare) == 0;
+ }
+
+ void OnlyTheseFactors(const FactorMask &factors);
};
-inline size_t hash_value(const Phrase& phrase) {
+inline size_t hash_value(const Phrase& phrase)
+{
size_t seed = 0;
for (size_t i = 0; i < phrase.GetSize(); ++i) {
boost::hash_combine(seed, phrase.GetWord(i));
diff --git a/moses/PrefixTree.h b/moses/PrefixTree.h
index 9cf1360e6..5b81ea175 100644
--- a/moses/PrefixTree.h
+++ b/moses/PrefixTree.h
@@ -63,7 +63,7 @@ public:
keys.insert(i,*b);
data.insert(data.begin()+pos,def);
- Self *self = NULL;
+ Self *self = NULL;
ptr.insert(ptr.begin()+pos, self);
}
if(++b!=e) {
diff --git a/moses/PrefixTreeMap.h b/moses/PrefixTreeMap.h
index fae875bd4..06066878d 100644
--- a/moses/PrefixTreeMap.h
+++ b/moses/PrefixTreeMap.h
@@ -59,7 +59,7 @@ private:
ScoreList m_ScoreList;
};
-
+
/** @todo How is this used in the pb binary phrase table?
*/
struct PPimp {
diff --git a/moses/RuleCube.h b/moses/RuleCube.h
index 05f9f1a24..d0c6ea66a 100644
--- a/moses/RuleCube.h
+++ b/moses/RuleCube.h
@@ -44,7 +44,7 @@ class ChartTranslationOptions;
*/
class RuleCubeItemScoreOrderer
{
- public:
+public:
bool operator()(const RuleCubeItem *p, const RuleCubeItem *q) const {
return p->GetScore() < q->GetScore();
}
@@ -56,7 +56,7 @@ class RuleCubeItemScoreOrderer
*/
class RuleCubeItemPositionOrderer
{
- public:
+public:
bool operator()(const RuleCubeItem *p, const RuleCubeItem *q) const {
return *p < *q;
}
@@ -66,7 +66,7 @@ class RuleCubeItemPositionOrderer
*/
class RuleCubeItemHasher
{
- public:
+public:
size_t operator()(const RuleCubeItem *p) const {
size_t seed = 0;
boost::hash_combine(seed, p->GetHypothesisDimensions());
@@ -79,7 +79,7 @@ class RuleCubeItemHasher
*/
class RuleCubeItemEqualityPred
{
- public:
+public:
bool operator()(const RuleCubeItem *p, const RuleCubeItem *q) const {
return p->GetHypothesisDimensions() == q->GetHypothesisDimensions() &&
p->GetTranslationDimension() == q->GetTranslationDimension();
@@ -90,7 +90,7 @@ class RuleCubeItemEqualityPred
*/
class RuleCube
{
- public:
+public:
RuleCube(const ChartTranslationOptions &, const ChartCellCollection &,
ChartManager &);
@@ -104,26 +104,28 @@ class RuleCube
RuleCubeItem *Pop(ChartManager &);
- bool IsEmpty() const { return m_queue.empty(); }
+ bool IsEmpty() const {
+ return m_queue.empty();
+ }
const ChartTranslationOptions &GetTranslationOption() const {
return m_transOpt;
}
- private:
+private:
#if defined(BOOST_VERSION) && (BOOST_VERSION >= 104200)
typedef boost::unordered_set<RuleCubeItem*,
- RuleCubeItemHasher,
- RuleCubeItemEqualityPred
- > ItemSet;
+ RuleCubeItemHasher,
+ RuleCubeItemEqualityPred
+ > ItemSet;
#else
typedef std::set<RuleCubeItem*, RuleCubeItemPositionOrderer> ItemSet;
#endif
typedef std::priority_queue<RuleCubeItem*,
- std::vector<RuleCubeItem*>,
- RuleCubeItemScoreOrderer
- > Queue;
+ std::vector<RuleCubeItem*>,
+ RuleCubeItemScoreOrderer
+ > Queue;
RuleCube(const RuleCube &); // Not implemented
RuleCube &operator=(const RuleCube &); // Not implemented
diff --git a/moses/RuleCubeItem.h b/moses/RuleCubeItem.h
index 612079172..75669598b 100644
--- a/moses/RuleCubeItem.h
+++ b/moses/RuleCubeItem.h
@@ -39,14 +39,16 @@ typedef std::vector<const ChartHypothesis*> HypoList;
*/
class TranslationDimension
{
- public:
+public:
TranslationDimension(std::size_t pos,
const std::vector<TargetPhrase*> &orderedTargetPhrases)
: m_pos(pos)
, m_orderedTargetPhrases(&orderedTargetPhrases)
{}
- std::size_t IncrementPos() { return m_pos++; }
+ std::size_t IncrementPos() {
+ return m_pos++;
+ }
bool HasMoreTranslations() const {
return m_pos+1 < m_orderedTargetPhrases->size();
@@ -64,7 +66,7 @@ class TranslationDimension
return GetTargetPhrase() == compare.GetTargetPhrase();
}
- private:
+private:
std::size_t m_pos;
const std::vector<TargetPhrase*> *m_orderedTargetPhrases;
};
@@ -81,7 +83,9 @@ public:
, m_orderedHypos(&orderedHypos)
{}
- std::size_t IncrementPos() { return m_pos++; }
+ std::size_t IncrementPos() {
+ return m_pos++;
+ }
bool HasMoreHypo() const {
return m_pos+1 < m_orderedHypos->size();
@@ -109,7 +113,7 @@ std::size_t hash_value(const HypothesisDimension &);
/** @todo How is this used. Split out into separate source file */
class RuleCubeItem
{
- public:
+public:
RuleCubeItem(const ChartTranslationOptions &, const ChartCellCollection &);
RuleCubeItem(const RuleCubeItem &, int);
~RuleCubeItem();
@@ -122,7 +126,9 @@ class RuleCubeItem
return m_hypothesisDimensions;
}
- float GetScore() const { return m_score; }
+ float GetScore() const {
+ return m_score;
+ }
void EstimateScore();
@@ -132,7 +138,7 @@ class RuleCubeItem
bool operator<(const RuleCubeItem &) const;
- private:
+private:
RuleCubeItem(const RuleCubeItem &); // Not implemented
RuleCubeItem &operator=(const RuleCubeItem &); // Not implemented
diff --git a/moses/RuleCubeQueue.h b/moses/RuleCubeQueue.h
index 9763b3877..ae4d20be0 100644
--- a/moses/RuleCubeQueue.h
+++ b/moses/RuleCubeQueue.h
@@ -36,7 +36,7 @@ class ChartManager;
*/
class RuleCubeOrderer
{
- public:
+public:
bool operator()(const RuleCube *p, const RuleCube *q) const {
return p->GetTopScore() < q->GetTopScore();
}
@@ -45,17 +45,19 @@ class RuleCubeOrderer
/** @todo how is this used */
class RuleCubeQueue
{
- public:
+public:
RuleCubeQueue(ChartManager &manager) : m_manager(manager) {}
~RuleCubeQueue();
void Add(RuleCube *);
ChartHypothesis *Pop();
- bool IsEmpty() const { return m_queue.empty(); }
+ bool IsEmpty() const {
+ return m_queue.empty();
+ }
- private:
+private:
typedef std::priority_queue<RuleCube*, std::vector<RuleCube*>,
- RuleCubeOrderer > Queue;
+ RuleCubeOrderer > Queue;
Queue m_queue;
ChartManager &m_manager;
diff --git a/moses/ScoreComponentCollection.cpp b/moses/ScoreComponentCollection.cpp
index c836ea5b3..44f08b316 100644
--- a/moses/ScoreComponentCollection.cpp
+++ b/moses/ScoreComponentCollection.cpp
@@ -17,7 +17,7 @@ ScoreComponentCollection::ScoreComponentCollection() : m_scores(s_denseVectorSiz
void ScoreComponentCollection::RegisterScoreProducer
- (const FeatureFunction* scoreProducer)
+(const FeatureFunction* scoreProducer)
{
size_t start = s_denseVectorSize;
size_t end = start + scoreProducer->GetNumScoreComponents();
@@ -29,56 +29,58 @@ void ScoreComponentCollection::RegisterScoreProducer
float ScoreComponentCollection::GetWeightedScore() const
{
- return m_scores.inner_product(StaticData::Instance().GetAllWeights().m_scores);
+ return m_scores.inner_product(StaticData::Instance().GetAllWeights().m_scores);
}
void ScoreComponentCollection::MultiplyEquals(float scalar)
{
- m_scores *= scalar;
+ m_scores *= scalar;
}
// Multiply all weights of this sparse producer by a given scalar
-void ScoreComponentCollection::MultiplyEquals(const FeatureFunction* sp, float scalar) {
+void ScoreComponentCollection::MultiplyEquals(const FeatureFunction* sp, float scalar)
+{
std::string prefix = sp->GetScoreProducerDescription() + FName::SEP;
for(FVector::FNVmap::const_iterator i = m_scores.cbegin(); i != m_scores.cend(); i++) {
std::stringstream name;
name << i->first;
if (name.str().substr( 0, prefix.length() ).compare( prefix ) == 0)
- m_scores[i->first] = i->second * scalar;
+ m_scores[i->first] = i->second * scalar;
}
}
// Count weights belonging to this sparse producer
-size_t ScoreComponentCollection::GetNumberWeights(const FeatureFunction* sp) {
+size_t ScoreComponentCollection::GetNumberWeights(const FeatureFunction* sp)
+{
std::string prefix = sp->GetScoreProducerDescription() + FName::SEP;
size_t weights = 0;
for(FVector::FNVmap::const_iterator i = m_scores.cbegin(); i != m_scores.cend(); i++) {
std::stringstream name;
name << i->first;
if (name.str().substr( 0, prefix.length() ).compare( prefix ) == 0)
- weights++;
+ weights++;
}
return weights;
}
void ScoreComponentCollection::DivideEquals(float scalar)
{
- m_scores /= scalar;
+ m_scores /= scalar;
}
void ScoreComponentCollection::CoreDivideEquals(float scalar)
{
- m_scores.coreDivideEquals(scalar);
+ m_scores.coreDivideEquals(scalar);
}
void ScoreComponentCollection::DivideEquals(const ScoreComponentCollection& rhs)
{
- m_scores.divideEquals(rhs.m_scores);
+ m_scores.divideEquals(rhs.m_scores);
}
void ScoreComponentCollection::MultiplyEquals(const ScoreComponentCollection& rhs)
{
- m_scores *= rhs.m_scores;
+ m_scores *= rhs.m_scores;
}
void ScoreComponentCollection::MultiplyEqualsBackoff(const ScoreComponentCollection& rhs, float backoff)
@@ -93,42 +95,51 @@ void ScoreComponentCollection::MultiplyEquals(float core_r0, float sparse_r0)
std::ostream& operator<<(std::ostream& os, const ScoreComponentCollection& rhs)
{
- os << rhs.m_scores;
- return os;
+ os << rhs.m_scores;
+ return os;
}
-void ScoreComponentCollection::L1Normalise() {
+void ScoreComponentCollection::L1Normalise()
+{
m_scores /= m_scores.l1norm_coreFeatures();
}
-float ScoreComponentCollection::GetL1Norm() const {
+float ScoreComponentCollection::GetL1Norm() const
+{
return m_scores.l1norm();
}
-float ScoreComponentCollection::GetL2Norm() const {
+float ScoreComponentCollection::GetL2Norm() const
+{
return m_scores.l2norm();
}
-float ScoreComponentCollection::GetLInfNorm() const {
+float ScoreComponentCollection::GetLInfNorm() const
+{
return m_scores.linfnorm();
}
-size_t ScoreComponentCollection::L1Regularize(float lambda) {
+size_t ScoreComponentCollection::L1Regularize(float lambda)
+{
return m_scores.l1regularize(lambda);
}
-void ScoreComponentCollection::L2Regularize(float lambda) {
+void ScoreComponentCollection::L2Regularize(float lambda)
+{
m_scores.l2regularize(lambda);
}
-size_t ScoreComponentCollection::SparseL1Regularize(float lambda) {
+size_t ScoreComponentCollection::SparseL1Regularize(float lambda)
+{
return m_scores.sparseL1regularize(lambda);
}
-void ScoreComponentCollection::SparseL2Regularize(float lambda) {
+void ScoreComponentCollection::SparseL2Regularize(float lambda)
+{
m_scores.sparseL2regularize(lambda);
}
-void ScoreComponentCollection::Save(ostream& out) const {
+void ScoreComponentCollection::Save(ostream& out) const
+{
ScoreIndexMap::const_iterator iter = s_scoreIndexes.begin();
for (; iter != s_scoreIndexes.end(); ++iter ) {
string name = iter->first->GetScoreProducerDescription();
@@ -148,7 +159,8 @@ void ScoreComponentCollection::Save(ostream& out) const {
m_scores.write(out);
}
-void ScoreComponentCollection::Save(const string& filename) const {
+void ScoreComponentCollection::Save(const string& filename) const
+{
ofstream out(filename.c_str());
if (!out) {
ostringstream msg;
@@ -159,7 +171,8 @@ void ScoreComponentCollection::Save(const string& filename) const {
out.close();
}
-void ScoreComponentCollection::Assign(const FeatureFunction* sp, const string line) {
+void ScoreComponentCollection::Assign(const FeatureFunction* sp, const string line)
+{
istringstream istr(line);
while(istr) {
string namestring;
diff --git a/moses/ScoreComponentCollection.h b/moses/ScoreComponentCollection.h
index e76c9d06b..70c2a05f1 100644
--- a/moses/ScoreComponentCollection.h
+++ b/moses/ScoreComponentCollection.h
@@ -64,19 +64,18 @@ class ScoreComponentCollection
{
friend std::ostream& operator<<(std::ostream& os, const ScoreComponentCollection& rhs);
private:
- FVector m_scores;
+ FVector m_scores;
typedef std::pair<size_t,size_t> IndexPair;
typedef std::map<const FeatureFunction*,IndexPair> ScoreIndexMap;
static ScoreIndexMap s_scoreIndexes;
static size_t s_denseVectorSize;
- static IndexPair GetIndexes(const FeatureFunction* sp)
- {
+ static IndexPair GetIndexes(const FeatureFunction* sp) {
ScoreIndexMap::const_iterator indexIter = s_scoreIndexes.find(sp);
if (indexIter == s_scoreIndexes.end()) {
std::cerr << "ERROR: FeatureFunction: " << sp->GetScoreProducerDescription() <<
- " not registered with ScoreIndexMap" << std::endl;
+ " not registered with ScoreIndexMap" << std::endl;
std::cerr << "You must call ScoreComponentCollection.RegisterScoreProducer() " <<
- " for every FeatureFunction" << std::endl;
+ " for every FeatureFunction" << std::endl;
abort();
}
return indexIter->second;
@@ -91,9 +90,9 @@ public:
ScoreComponentCollection();
//! Clone a score collection
- ScoreComponentCollection(const ScoreComponentCollection& rhs)
- : m_scores(rhs.m_scores)
- {}
+ ScoreComponentCollection(const ScoreComponentCollection& rhs)
+ : m_scores(rhs.m_scores)
+ {}
ScoreComponentCollection& operator=( const ScoreComponentCollection& rhs ) {
m_scores = rhs.m_scores;
@@ -101,124 +100,108 @@ public:
}
/**
- * Register a ScoreProducer with a fixed number of scores, so that it can
+ * Register a ScoreProducer with a fixed number of scores, so that it can
* be allocated space in the dense part of the feature vector.
**/
static void RegisterScoreProducer(const FeatureFunction* scoreProducer);
/** Load from file */
- bool Load(const std::string& filename)
- {
- return m_scores.load(filename);
+ bool Load(const std::string& filename) {
+ return m_scores.load(filename);
}
- const FVector& GetScoresVector() const
- {
- return m_scores;
+ const FVector& GetScoresVector() const {
+ return m_scores;
}
const std::valarray<FValue> &getCoreFeatures() const {
return m_scores.getCoreFeatures();
}
- size_t Size() const
- {
- return m_scores.size();
+ size_t Size() const {
+ return m_scores.size();
}
- void Resize()
- {
+ void Resize() {
if (m_scores.coreSize() != s_denseVectorSize) {
m_scores.resize(s_denseVectorSize);
}
}
/** Create and FVector with the right number of core features */
- static FVector CreateFVector()
- {
+ static FVector CreateFVector() {
return FVector(s_denseVectorSize);
}
- void SetToBinaryOf(const ScoreComponentCollection& rhs)
- {
- m_scores.setToBinaryOf(rhs.m_scores);
+ void SetToBinaryOf(const ScoreComponentCollection& rhs) {
+ m_scores.setToBinaryOf(rhs.m_scores);
}
//! Set all values to 0.0
- void ZeroAll()
- {
- m_scores.clear();
- }
-
- void MultiplyEquals(float scalar);
- void DivideEquals(float scalar);
- void CoreDivideEquals(float scalar);
- void DivideEquals(const ScoreComponentCollection& rhs);
- void MultiplyEquals(const ScoreComponentCollection& rhs);
- void MultiplyEqualsBackoff(const ScoreComponentCollection& rhs, float backoff);
- void MultiplyEquals(float core_r0, float sparse_r0);
- void MultiplyEquals(const FeatureFunction* sp, float scalar);
-
- size_t GetNumberWeights(const FeatureFunction* sp);
-
- void CoreAssign(const ScoreComponentCollection& rhs)
- {
- m_scores.coreAssign(rhs.m_scores);
- }
-
- //! add the score in rhs
- void PlusEquals(const ScoreComponentCollection& rhs)
- {
- m_scores += rhs.m_scores;
- }
-
- // add only sparse features
- void SparsePlusEquals(const ScoreComponentCollection& rhs)
- {
- m_scores.sparsePlusEquals(rhs.m_scores);
- }
-
- void PlusEquals(const FVector& scores)
- {
- m_scores += scores;
- }
-
- //! subtract the score in rhs
- void MinusEquals(const ScoreComponentCollection& rhs)
- {
- m_scores -= rhs.m_scores;
- }
+ void ZeroAll() {
+ m_scores.clear();
+ }
+
+ void MultiplyEquals(float scalar);
+ void DivideEquals(float scalar);
+ void CoreDivideEquals(float scalar);
+ void DivideEquals(const ScoreComponentCollection& rhs);
+ void MultiplyEquals(const ScoreComponentCollection& rhs);
+ void MultiplyEqualsBackoff(const ScoreComponentCollection& rhs, float backoff);
+ void MultiplyEquals(float core_r0, float sparse_r0);
+ void MultiplyEquals(const FeatureFunction* sp, float scalar);
+
+ size_t GetNumberWeights(const FeatureFunction* sp);
+
+ void CoreAssign(const ScoreComponentCollection& rhs) {
+ m_scores.coreAssign(rhs.m_scores);
+ }
+
+ //! add the score in rhs
+ void PlusEquals(const ScoreComponentCollection& rhs) {
+ m_scores += rhs.m_scores;
+ }
+
+ // add only sparse features
+ void SparsePlusEquals(const ScoreComponentCollection& rhs) {
+ m_scores.sparsePlusEquals(rhs.m_scores);
+ }
+
+ void PlusEquals(const FVector& scores) {
+ m_scores += scores;
+ }
+
+ //! subtract the score in rhs
+ void MinusEquals(const ScoreComponentCollection& rhs) {
+ m_scores -= rhs.m_scores;
+ }
//For features which have an unbounded number of components
- void MinusEquals(const FeatureFunction*sp, const std::string& name, float score)
- {
+ void MinusEquals(const FeatureFunction*sp, const std::string& name, float score) {
FName fname(sp->GetScoreProducerDescription(),name);
m_scores[fname] -= score;
}
//For features which have an unbounded number of components
- void SparseMinusEquals(const std::string& full_name, float score)
- {
+ void SparseMinusEquals(const std::string& full_name, float score) {
FName fname(full_name);
m_scores[fname] -= score;
}
- //! Add scores from a single ScoreProducer only
- //! The length of scores must be equal to the number of score components
- //! produced by sp
- void PlusEquals(const FeatureFunction* sp, const ScoreComponentCollection& scores)
- {
+ //! Add scores from a single ScoreProducer only
+ //! The length of scores must be equal to the number of score components
+ //! produced by sp
+ void PlusEquals(const FeatureFunction* sp, const ScoreComponentCollection& scores) {
IndexPair indexes = GetIndexes(sp);
for (size_t i = indexes.first; i < indexes.second; ++i) {
m_scores[i] += scores.m_scores[i];
}
- }
+ }
- //! Add scores from a single FeatureFunction only
- //! The length of scores must be equal to the number of score components
- //! produced by sp
- void PlusEquals(const FeatureFunction* sp, const std::vector<float>& scores)
- {
+ //! Add scores from a single FeatureFunction only
+ //! The length of scores must be equal to the number of score components
+ //! produced by sp
+ void PlusEquals(const FeatureFunction* sp, const std::vector<float>& scores) {
IndexPair indexes = GetIndexes(sp);
CHECK(scores.size() == indexes.second - indexes.first);
for (size_t i = 0; i < scores.size(); ++i) {
@@ -226,56 +209,50 @@ public:
}
}
- //! Special version PlusEquals(ScoreProducer, vector<float>)
- //! to add the score from a single ScoreProducer that produces
- //! a single value
- void PlusEquals(const FeatureFunction* sp, float score)
- {
+ //! Special version PlusEquals(ScoreProducer, vector<float>)
+ //! to add the score from a single ScoreProducer that produces
+ //! a single value
+ void PlusEquals(const FeatureFunction* sp, float score) {
IndexPair indexes = GetIndexes(sp);
CHECK(1 == indexes.second - indexes.first);
m_scores[indexes.first] += score;
- }
+ }
//For features which have an unbounded number of components
- void PlusEquals(const FeatureFunction*sp, const StringPiece& name, float score)
- {
+ void PlusEquals(const FeatureFunction*sp, const StringPiece& name, float score) {
FName fname(sp->GetScoreProducerDescription(),name);
m_scores[fname] += score;
}
//For features which have an unbounded number of components
- void SparsePlusEquals(const std::string& full_name, float score)
- {
- FName fname(full_name);
+ void SparsePlusEquals(const std::string& full_name, float score) {
+ FName fname(full_name);
m_scores[fname] += score;
}
- void Assign(const FeatureFunction* sp, const std::vector<float>& scores)
- {
+ void Assign(const FeatureFunction* sp, const std::vector<float>& scores) {
IndexPair indexes = GetIndexes(sp);
CHECK(scores.size() == indexes.second - indexes.first);
for (size_t i = 0; i < scores.size(); ++i) {
m_scores[i + indexes.first] = scores[i];
}
}
-
+
//! Special version Assign(ScoreProducer, vector<float>)
//! to add the score from a single ScoreProducer that produces
//! a single value
- void Assign(const FeatureFunction* sp, float score)
- {
+ void Assign(const FeatureFunction* sp, float score) {
IndexPair indexes = GetIndexes(sp);
CHECK(1 == indexes.second - indexes.first);
m_scores[indexes.first] = score;
}
-
+
// Assign core weight by index
void Assign(size_t index, float score) {
m_scores[index] = score;
}
- void Assign(const FeatureFunction*sp, const StringPiece &name, float score)
- {
+ void Assign(const FeatureFunction*sp, const StringPiece &name, float score) {
FName fname(sp->GetScoreProducerDescription(),name);
m_scores[fname] = score;
}
@@ -285,27 +262,23 @@ public:
void Assign(const FeatureFunction* sp, const std::string line);
// shortcut: setting the value directly using the feature name
- void Assign(const std::string name, float score)
- {
- FName fname(name);
- m_scores[fname] = score;
- }
-
- float InnerProduct(const ScoreComponentCollection& rhs) const
- {
- return m_scores.inner_product(rhs.m_scores);
- }
-
- float PartialInnerProduct(const FeatureFunction* sp, const std::vector<float>& rhs) const
- {
- std::vector<float> lhs = GetScoresForProducer(sp);
- CHECK(lhs.size() == rhs.size());
- return std::inner_product(lhs.begin(), lhs.end(), rhs.begin(), 0.0f);
- }
-
- //! return a vector of all the scores associated with a certain FeatureFunction
- std::vector<float> GetScoresForProducer(const FeatureFunction* sp) const
- {
+ void Assign(const std::string name, float score) {
+ FName fname(name);
+ m_scores[fname] = score;
+ }
+
+ float InnerProduct(const ScoreComponentCollection& rhs) const {
+ return m_scores.inner_product(rhs.m_scores);
+ }
+
+ float PartialInnerProduct(const FeatureFunction* sp, const std::vector<float>& rhs) const {
+ std::vector<float> lhs = GetScoresForProducer(sp);
+ CHECK(lhs.size() == rhs.size());
+ return std::inner_product(lhs.begin(), lhs.end(), rhs.begin(), 0.0f);
+ }
+
+ //! return a vector of all the scores associated with a certain FeatureFunction
+ std::vector<float> GetScoresForProducer(const FeatureFunction* sp) const {
size_t components = sp->GetNumScoreComponents();
std::vector<float> res(components);
@@ -314,58 +287,52 @@ public:
res[i] = m_scores[i + indexes.first];
}
return res;
- }
+ }
//! get subset of scores that belong to a certain sparse ScoreProducer
FVector GetVectorForProducer(const FeatureFunction* sp) const;
- float GetSparseWeight(const FName& featureName) const
- {
+ float GetSparseWeight(const FName& featureName) const {
return m_scores[featureName];
}
-
+
void PrintCoreFeatures() {
m_scores.printCoreFeatures();
}
- void ThresholdScaling(float maxValue)
- {
- // find (smallest) factor for which all weights are <= maxValue
- // 0.1 / 0.14 = 0.714285714
- // 0.1 / 0.17 = 0.588235294
+ void ThresholdScaling(float maxValue) {
+ // find (smallest) factor for which all weights are <= maxValue
+ // 0.1 / 0.14 = 0.714285714
+ // 0.1 / 0.17 = 0.588235294
m_scores.thresholdScale(maxValue);
- }
-
- void CapMax(float maxValue)
- {
- // cap all sparse features to maxValue
- m_scores.capMax(maxValue);
- }
-
- void CapMin(float minValue)
- {
- // cap all sparse features to minValue
- m_scores.capMin(minValue);
- }
-
- //! if a FeatureFunction produces a single score (for example, a language model score)
- //! this will return it. If not, this method will throw
- float GetScoreForProducer(const FeatureFunction* sp) const
- {
+ }
+
+ void CapMax(float maxValue) {
+ // cap all sparse features to maxValue
+ m_scores.capMax(maxValue);
+ }
+
+ void CapMin(float minValue) {
+ // cap all sparse features to minValue
+ m_scores.capMin(minValue);
+ }
+
+ //! if a FeatureFunction produces a single score (for example, a language model score)
+ //! this will return it. If not, this method will throw
+ float GetScoreForProducer(const FeatureFunction* sp) const {
IndexPair indexes = GetIndexes(sp);
CHECK(indexes.second - indexes.first == 1);
return m_scores[indexes.first];
- }
+ }
//For features which have an unbounded number of components
float GetScoreForProducer
- (const FeatureFunction* sp, const std::string& name) const
- {
+ (const FeatureFunction* sp, const std::string& name) const {
FName fname(sp->GetScoreProducerDescription(),name);
return m_scores[fname];
}
- float GetWeightedScore() const;
+ float GetWeightedScore() const;
void ZeroDenseFeatures(const FeatureFunction* sp);
void L1Normalise();
@@ -378,45 +345,65 @@ public:
void SparseL2Regularize(float lambda);
void Save(const std::string& filename) const;
void Save(std::ostream&) const;
-
- void IncrementSparseHopeFeatures() { m_scores.incrementSparseHopeFeatures(); }
- void IncrementSparseFearFeatures() { m_scores.incrementSparseFearFeatures(); }
- void PrintSparseHopeFeatureCounts(std::ofstream& out) { m_scores.printSparseHopeFeatureCounts(out); }
- void PrintSparseFearFeatureCounts(std::ofstream& out) { m_scores.printSparseFearFeatureCounts(out); }
- void PrintSparseHopeFeatureCounts() { m_scores.printSparseHopeFeatureCounts(); }
- void PrintSparseFearFeatureCounts() { m_scores.printSparseFearFeatureCounts(); }
- size_t PruneSparseFeatures(size_t threshold) { return m_scores.pruneSparseFeatures(threshold); }
- size_t PruneZeroWeightFeatures() { return m_scores.pruneZeroWeightFeatures(); }
- void UpdateConfidenceCounts(ScoreComponentCollection &weightUpdate, bool signedCounts) { m_scores.updateConfidenceCounts(weightUpdate.m_scores, signedCounts); }
- void UpdateLearningRates(float decay_core, float decay_sparse, ScoreComponentCollection &confidenceCounts, float core_r0, float sparse_r0) { m_scores.updateLearningRates(decay_core, decay_sparse, confidenceCounts.m_scores, core_r0, sparse_r0); }
+
+ void IncrementSparseHopeFeatures() {
+ m_scores.incrementSparseHopeFeatures();
+ }
+ void IncrementSparseFearFeatures() {
+ m_scores.incrementSparseFearFeatures();
+ }
+ void PrintSparseHopeFeatureCounts(std::ofstream& out) {
+ m_scores.printSparseHopeFeatureCounts(out);
+ }
+ void PrintSparseFearFeatureCounts(std::ofstream& out) {
+ m_scores.printSparseFearFeatureCounts(out);
+ }
+ void PrintSparseHopeFeatureCounts() {
+ m_scores.printSparseHopeFeatureCounts();
+ }
+ void PrintSparseFearFeatureCounts() {
+ m_scores.printSparseFearFeatureCounts();
+ }
+ size_t PruneSparseFeatures(size_t threshold) {
+ return m_scores.pruneSparseFeatures(threshold);
+ }
+ size_t PruneZeroWeightFeatures() {
+ return m_scores.pruneZeroWeightFeatures();
+ }
+ void UpdateConfidenceCounts(ScoreComponentCollection &weightUpdate, bool signedCounts) {
+ m_scores.updateConfidenceCounts(weightUpdate.m_scores, signedCounts);
+ }
+ void UpdateLearningRates(float decay_core, float decay_sparse, ScoreComponentCollection &confidenceCounts, float core_r0, float sparse_r0) {
+ m_scores.updateLearningRates(decay_core, decay_sparse, confidenceCounts.m_scores, core_r0, sparse_r0);
+ }
#ifdef MPI_ENABLE
- public:
- friend class boost::serialization::access;
-
- private:
- //serialization
- template<class Archive>
- void save(Archive &ar, const unsigned int version) const {
- ar << m_scores;
- }
-
- template<class Archive>
- void load(Archive &ar, const unsigned int version) {
- ar >> m_scores;
+public:
+ friend class boost::serialization::access;
- }
+private:
+ //serialization
+ template<class Archive>
+ void save(Archive &ar, const unsigned int version) const {
+ ar << m_scores;
+ }
+
+ template<class Archive>
+ void load(Archive &ar, const unsigned int version) {
+ ar >> m_scores;
+
+ }
+
+ BOOST_SERIALIZATION_SPLIT_MEMBER()
- BOOST_SERIALIZATION_SPLIT_MEMBER()
-
#endif
};
struct SCCPlus {
ScoreComponentCollection operator()
- (const ScoreComponentCollection& lhs,
- const ScoreComponentCollection& rhs) {
+ (const ScoreComponentCollection& lhs,
+ const ScoreComponentCollection& rhs) {
ScoreComponentCollection sum(lhs);
sum.PlusEquals(rhs);
return sum;
diff --git a/moses/ScoreComponentCollectionTest.cpp b/moses/ScoreComponentCollectionTest.cpp
index f0813f4e8..41fa6562f 100644
--- a/moses/ScoreComponentCollectionTest.cpp
+++ b/moses/ScoreComponentCollectionTest.cpp
@@ -29,31 +29,35 @@ using namespace std;
BOOST_AUTO_TEST_SUITE(scc)
-class MockStatelessFeatureFunction : public StatelessFeatureFunction {
- public:
- MockStatelessFeatureFunction(const string& desc, size_t n, const string &line) :
- StatelessFeatureFunction(desc,n, line) {}
- virtual void Evaluate(const PhraseBasedFeatureContext&, ScoreComponentCollection*) const {}
- virtual void EvaluateChart(const ChartBasedFeatureContext&, ScoreComponentCollection*) const {}
- virtual void Evaluate(const TargetPhrase &targetPhrase
+class MockStatelessFeatureFunction : public StatelessFeatureFunction
+{
+public:
+ MockStatelessFeatureFunction(const string& desc, size_t n, const string &line) :
+ StatelessFeatureFunction(desc,n, line) {}
+ virtual void Evaluate(const PhraseBasedFeatureContext&, ScoreComponentCollection*) const {}
+ virtual void EvaluateChart(const ChartBasedFeatureContext&, ScoreComponentCollection*) const {}
+ virtual void Evaluate(const TargetPhrase &targetPhrase
, ScoreComponentCollection &scoreBreakdown
, ScoreComponentCollection &estimatedFutureScore) const
- { }
+ { }
};
-class MockSingleFeature : public MockStatelessFeatureFunction {
- public:
- MockSingleFeature(): MockStatelessFeatureFunction("MockSingle",1, "MockSingle") {}
+class MockSingleFeature : public MockStatelessFeatureFunction
+{
+public:
+ MockSingleFeature(): MockStatelessFeatureFunction("MockSingle",1, "MockSingle") {}
};
-class MockMultiFeature : public MockStatelessFeatureFunction {
- public:
- MockMultiFeature(): MockStatelessFeatureFunction("MockMulti", 5, "MockMulti") {}
+class MockMultiFeature : public MockStatelessFeatureFunction
+{
+public:
+ MockMultiFeature(): MockStatelessFeatureFunction("MockMulti", 5, "MockMulti") {}
};
-class MockSparseFeature : public MockStatelessFeatureFunction {
- public:
- MockSparseFeature(): MockStatelessFeatureFunction("MockSparse", 0, "MockSparse") {}
+class MockSparseFeature : public MockStatelessFeatureFunction
+{
+public:
+ MockSparseFeature(): MockStatelessFeatureFunction("MockSparse", 0, "MockSparse") {}
};
@@ -66,7 +70,7 @@ struct MockProducers {
MockSparseFeature sparse;
};
-BOOST_FIXTURE_TEST_CASE(ctor, MockProducers)
+BOOST_FIXTURE_TEST_CASE(ctor, MockProducers)
{
ScoreComponentCollection scc;
BOOST_CHECK_EQUAL(scc.GetScoreForProducer(&single),0);
@@ -88,11 +92,11 @@ BOOST_FIXTURE_TEST_CASE(plusequals, MockProducers)
scc.PlusEquals(&multi,vec1);
std::vector<float> actual = scc.GetScoresForProducer(&multi);
BOOST_CHECK_EQUAL_COLLECTIONS(vec1.begin(),vec1.end()
- ,actual.begin(), actual.end());
+ ,actual.begin(), actual.end());
scc.PlusEquals(&multi,vec1);
actual = scc.GetScoresForProducer(&multi);
BOOST_CHECK_EQUAL_COLLECTIONS(vec2.begin(),vec2.end(),
- actual.begin(), actual.end());
+ actual.begin(), actual.end());
BOOST_CHECK_EQUAL(scc.GetScoreForProducer(&single), 3.4f);
}
diff --git a/moses/SearchNormalBatch.cpp b/moses/SearchNormalBatch.cpp
index ca72b3973..aa3aeb0a3 100644
--- a/moses/SearchNormalBatch.cpp
+++ b/moses/SearchNormalBatch.cpp
@@ -18,21 +18,21 @@ SearchNormalBatch::SearchNormalBatch(Manager& manager, const InputType &source,
// Split the feature functions into sets of stateless, stateful
// distributed lm, and stateful non-distributed.
const vector<const StatefulFeatureFunction*>& ffs =
- StatefulFeatureFunction::GetStatefulFeatureFunctions();
+ StatefulFeatureFunction::GetStatefulFeatureFunctions();
for (unsigned i = 0; i < ffs.size(); ++i) {
- if (ffs[i]->GetScoreProducerDescription() == "DLM_5gram") { // TODO WFT
- m_dlm_ffs[i] = const_cast<LanguageModel*>(static_cast<const LanguageModel* const>(ffs[i]));
- m_dlm_ffs[i]->SetFFStateIdx(i);
- }
- else {
- m_stateful_ffs[i] = const_cast<StatefulFeatureFunction*>(ffs[i]);
- }
+ if (ffs[i]->GetScoreProducerDescription() == "DLM_5gram") { // TODO WFT
+ m_dlm_ffs[i] = const_cast<LanguageModel*>(static_cast<const LanguageModel* const>(ffs[i]));
+ m_dlm_ffs[i]->SetFFStateIdx(i);
+ } else {
+ m_stateful_ffs[i] = const_cast<StatefulFeatureFunction*>(ffs[i]);
+ }
}
m_stateless_ffs = StatelessFeatureFunction::GetStatelessFeatureFunctions();
-
+
}
-SearchNormalBatch::~SearchNormalBatch() {
+SearchNormalBatch::~SearchNormalBatch()
+{
}
/**
@@ -138,79 +138,79 @@ void SearchNormalBatch::ExpandHypothesis(const Hypothesis &hypothesis, const Tra
for (dlm_iter = m_dlm_ffs.begin();
dlm_iter != m_dlm_ffs.end();
++dlm_iter) {
- const FFState* input_state = newHypo->GetPrevHypo() ? newHypo->GetPrevHypo()->GetFFState((*dlm_iter).first) : NULL;
- (*dlm_iter).second->IssueRequestsFor(*newHypo, input_state);
+ const FFState* input_state = newHypo->GetPrevHypo() ? newHypo->GetPrevHypo()->GetFFState((*dlm_iter).first) : NULL;
+ (*dlm_iter).second->IssueRequestsFor(*newHypo, input_state);
}
m_partial_hypos.push_back(newHypo);
- }
- else {
+ } else {
std::cerr << "can't use early discarding with batch decoding!" << std::endl;
abort();
}
}
-void SearchNormalBatch::EvalAndMergePartialHypos() {
- std::vector<Hypothesis*>::iterator partial_hypo_iter;
- for (partial_hypo_iter = m_partial_hypos.begin();
- partial_hypo_iter != m_partial_hypos.end();
- ++partial_hypo_iter) {
- Hypothesis* hypo = *partial_hypo_iter;
-
- // Evaluate with other ffs.
- std::map<int, StatefulFeatureFunction*>::iterator sfff_iter;
- for (sfff_iter = m_stateful_ffs.begin();
- sfff_iter != m_stateful_ffs.end();
- ++sfff_iter) {
- const StatefulFeatureFunction &ff = *(sfff_iter->second);
- int state_idx = sfff_iter->first;
- hypo->EvaluateWith(ff, state_idx);
- }
- std::vector<const StatelessFeatureFunction*>::iterator slff_iter;
- for (slff_iter = m_stateless_ffs.begin();
- slff_iter != m_stateless_ffs.end();
- ++slff_iter) {
- hypo->EvaluateWith(**slff_iter);
- }
+void SearchNormalBatch::EvalAndMergePartialHypos()
+{
+ std::vector<Hypothesis*>::iterator partial_hypo_iter;
+ for (partial_hypo_iter = m_partial_hypos.begin();
+ partial_hypo_iter != m_partial_hypos.end();
+ ++partial_hypo_iter) {
+ Hypothesis* hypo = *partial_hypo_iter;
+
+ // Evaluate with other ffs.
+ std::map<int, StatefulFeatureFunction*>::iterator sfff_iter;
+ for (sfff_iter = m_stateful_ffs.begin();
+ sfff_iter != m_stateful_ffs.end();
+ ++sfff_iter) {
+ const StatefulFeatureFunction &ff = *(sfff_iter->second);
+ int state_idx = sfff_iter->first;
+ hypo->EvaluateWith(ff, state_idx);
+ }
+ std::vector<const StatelessFeatureFunction*>::iterator slff_iter;
+ for (slff_iter = m_stateless_ffs.begin();
+ slff_iter != m_stateless_ffs.end();
+ ++slff_iter) {
+ hypo->EvaluateWith(**slff_iter);
}
+ }
- // Wait for all requests from the distributed LM to come back.
+ // Wait for all requests from the distributed LM to come back.
+ std::map<int, LanguageModel*>::iterator dlm_iter;
+ for (dlm_iter = m_dlm_ffs.begin();
+ dlm_iter != m_dlm_ffs.end();
+ ++dlm_iter) {
+ (*dlm_iter).second->sync();
+ }
+
+ // Incorporate the DLM scores into all hypotheses and put into their
+ // stacks.
+ for (partial_hypo_iter = m_partial_hypos.begin();
+ partial_hypo_iter != m_partial_hypos.end();
+ ++partial_hypo_iter) {
+ Hypothesis* hypo = *partial_hypo_iter;
+
+ // Calculate DLM scores.
std::map<int, LanguageModel*>::iterator dlm_iter;
for (dlm_iter = m_dlm_ffs.begin();
dlm_iter != m_dlm_ffs.end();
++dlm_iter) {
- (*dlm_iter).second->sync();
+ LanguageModel &lm = *(dlm_iter->second);
+ hypo->EvaluateWith(lm, (*dlm_iter).first);
}
- // Incorporate the DLM scores into all hypotheses and put into their
- // stacks.
- for (partial_hypo_iter = m_partial_hypos.begin();
- partial_hypo_iter != m_partial_hypos.end();
- ++partial_hypo_iter) {
- Hypothesis* hypo = *partial_hypo_iter;
-
- // Calculate DLM scores.
- std::map<int, LanguageModel*>::iterator dlm_iter;
- for (dlm_iter = m_dlm_ffs.begin();
- dlm_iter != m_dlm_ffs.end();
- ++dlm_iter) {
- LanguageModel &lm = *(dlm_iter->second);
- hypo->EvaluateWith(lm, (*dlm_iter).first);
- }
-
- // Put completed hypothesis onto its stack.
- size_t wordsTranslated = hypo->GetWordsBitmap().GetNumWordsCovered();
- m_hypoStackColl[wordsTranslated]->AddPrune(hypo);
- }
- m_partial_hypos.clear();
-
- std::vector < HypothesisStack* >::iterator stack_iter;
- HypothesisStackNormal* stack;
- for (stack_iter = m_hypoStackColl.begin();
- stack_iter != m_hypoStackColl.end();
- ++stack_iter) {
- stack = static_cast<HypothesisStackNormal*>(*stack_iter);
- stack->PruneToSize(m_max_stack_size);
- }
+ // Put completed hypothesis onto its stack.
+ size_t wordsTranslated = hypo->GetWordsBitmap().GetNumWordsCovered();
+ m_hypoStackColl[wordsTranslated]->AddPrune(hypo);
+ }
+ m_partial_hypos.clear();
+
+ std::vector < HypothesisStack* >::iterator stack_iter;
+ HypothesisStackNormal* stack;
+ for (stack_iter = m_hypoStackColl.begin();
+ stack_iter != m_hypoStackColl.end();
+ ++stack_iter) {
+ stack = static_cast<HypothesisStackNormal*>(*stack_iter);
+ stack->PruneToSize(m_max_stack_size);
+ }
}
}
diff --git a/moses/SearchNormalBatch.h b/moses/SearchNormalBatch.h
index fcfda7054..7f6764635 100644
--- a/moses/SearchNormalBatch.h
+++ b/moses/SearchNormalBatch.h
@@ -13,7 +13,7 @@ class TranslationOptionCollection;
/** Implements the phrase-based stack decoding algorithm (no cube pruning) with a twist...
* Language model requests are batched together, duplicate requests are removed, and requests are sent together.
* Useful for distributed LM where network latency is an issue.
- */
+ */
class SearchNormalBatch: public SearchNormal
{
protected:
@@ -21,7 +21,7 @@ protected:
// Added for asynclm decoding.
std::vector<const StatelessFeatureFunction*> m_stateless_ffs;
std::map<int, LanguageModel*> m_dlm_ffs;
- std::map<int, StatefulFeatureFunction*> m_stateful_ffs;
+ std::map<int, StatefulFeatureFunction*> m_stateful_ffs;
std::vector<Hypothesis*> m_partial_hypos;
int m_batch_size;
int m_max_stack_size;
diff --git a/moses/Sentence.cpp b/moses/Sentence.cpp
index b2e5a6633..8e76b0f03 100644
--- a/moses/Sentence.cpp
+++ b/moses/Sentence.cpp
@@ -104,8 +104,7 @@ int Sentence::Read(std::istream& in,const std::vector<FactorType>& factorOrder)
this->SetTopicId(atol(topic_params[0].c_str()));
this->SetUseTopicId(true);
this->SetUseTopicIdAndProb(false);
- }
- else {
+ } else {
this->SetTopicIdAndProb(topic_params);
this->SetUseTopicId(false);
this->SetUseTopicIdAndProb(true);
diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp
index df4c14cde..f822e4e13 100644
--- a/moses/StaticData.cpp
+++ b/moses/StaticData.cpp
@@ -109,15 +109,15 @@ StaticData::~StaticData()
typedef std::map<std::pair<size_t, Phrase>, std::pair<TranslationOptionList*,clock_t> > Coll;
Coll::iterator iter;
for (iter = m_transOptCache.begin(); iter != m_transOptCache.end(); ++iter) {
- std::pair<TranslationOptionList*,clock_t> &valuePair =iter->second;
- TranslationOptionList *transOptList = valuePair.first;
- delete transOptList;
+ std::pair<TranslationOptionList*,clock_t> &valuePair =iter->second;
+ TranslationOptionList *transOptList = valuePair.first;
+ delete transOptList;
}
/*
const std::vector<FeatureFunction*> &producers = FeatureFunction::GetFeatureFunctions();
for(size_t i=0;i<producers.size();++i) {
- FeatureFunction *ff = producers[i];
+ FeatureFunction *ff = producers[i];
delete ff;
}
*/
@@ -126,7 +126,8 @@ StaticData::~StaticData()
Phrase::FinalizeMemPool();
}
-bool StaticData::LoadDataStatic(Parameter *parameter, const std::string &execPath) {
+bool StaticData::LoadDataStatic(Parameter *parameter, const std::string &execPath)
+{
s_instance.SetExecPath(execPath);
return s_instance.LoadData(parameter);
}
@@ -143,7 +144,7 @@ bool StaticData::LoadData(Parameter *parameter)
}
m_parsingAlgorithm = (m_parameter->GetParam("parsing-algorithm").size() > 0) ?
- (ParsingAlgorithm) Scan<size_t>(m_parameter->GetParam("parsing-algorithm")[0]) : ParseCYKPlus;
+ (ParsingAlgorithm) Scan<size_t>(m_parameter->GetParam("parsing-algorithm")[0]) : ParseCYKPlus;
// to cube or not to cube
m_searchAlgorithm = (m_parameter->GetParam("search-algorithm").size() > 0) ?
@@ -217,7 +218,7 @@ bool StaticData::LoadData(Parameter *parameter)
} else {
m_nBestFactor = 20;
}
-
+
//lattice samples
if (m_parameter->GetParam("lattice-samples").size() ==2 ) {
m_latticeSamplesFilePath = m_parameter->GetParam("lattice-samples")[0];
@@ -276,11 +277,11 @@ bool StaticData::LoadData(Parameter *parameter)
#endif
SetBooleanParameter( &m_unprunedSearchGraph, "unpruned-search-graph", false );
SetBooleanParameter( &m_includeLHSInSearchGraph, "include-lhs-in-search-graph", false );
-
+
if (m_parameter->isParamSpecified("output-unknowns")) {
if (m_parameter->GetParam("output-unknowns").size() == 1) {
- m_outputUnknownsFile =Scan<string>(m_parameter->GetParam("output-unknowns")[0]);
+ m_outputUnknownsFile =Scan<string>(m_parameter->GetParam("output-unknowns")[0]);
} else {
UserMessage::Add(string("need to specify exactly one file name for unknowns"));
return false;
@@ -422,7 +423,7 @@ bool StaticData::LoadData(Parameter *parameter)
cerr << "Errror: Cannot use both n-best mbr and lattice mbr together" << endl;
exit(1);
}
-
+
//mira training
SetBooleanParameter( &m_mira, "mira", false );
@@ -446,7 +447,7 @@ bool StaticData::LoadData(Parameter *parameter)
exit(1);
}
if (m_useConsensusDecoding) m_mbr=true;
-
+
// Compact phrase table and reordering model
SetBooleanParameter( &m_minphrMemory, "minphr-memory", false );
SetBooleanParameter( &m_minlexrMemory, "minlexr-memory", false );
@@ -489,7 +490,7 @@ bool StaticData::LoadData(Parameter *parameter)
}
m_startTranslationId = (m_parameter->GetParam("start-translation-id").size() > 0) ?
- Scan<long>(m_parameter->GetParam("start-translation-id")[0]) : 0;
+ Scan<long>(m_parameter->GetParam("start-translation-id")[0]) : 0;
// Read in constraint decoding file, if provided
if(m_parameter->GetParam("constraint").size()) {
@@ -503,7 +504,7 @@ bool StaticData::LoadData(Parameter *parameter)
InputFileStream constraintFile(m_constraintFileName);
std::string line;
-
+
long sentenceID = GetStartTranslationId() - 1;
while (getline(constraintFile, line)) {
vector<string> vecStr = Tokenize(line, "\t");
@@ -546,14 +547,14 @@ bool StaticData::LoadData(Parameter *parameter)
// specify XML tags opening and closing brackets for XML option
if (m_parameter->GetParam("xml-brackets").size() > 0) {
- std::vector<std::string> brackets = Tokenize(m_parameter->GetParam("xml-brackets")[0]);
- if(brackets.size()!=2) {
- cerr << "invalid xml-brackets value, must specify exactly 2 blank-delimited strings for XML tags opening and closing brackets" << endl;
- exit(1);
- }
- m_xmlBrackets.first= brackets[0];
- m_xmlBrackets.second=brackets[1];
- cerr << "XML tags opening and closing brackets for XML input are: " << m_xmlBrackets.first << " and " << m_xmlBrackets.second << endl;
+ std::vector<std::string> brackets = Tokenize(m_parameter->GetParam("xml-brackets")[0]);
+ if(brackets.size()!=2) {
+ cerr << "invalid xml-brackets value, must specify exactly 2 blank-delimited strings for XML tags opening and closing brackets" << endl;
+ exit(1);
+ }
+ m_xmlBrackets.first= brackets[0];
+ m_xmlBrackets.second=brackets[1];
+ cerr << "XML tags opening and closing brackets for XML input are: " << m_xmlBrackets.first << " and " << m_xmlBrackets.second << endl;
}
// all features
@@ -574,58 +575,47 @@ bool StaticData::LoadData(Parameter *parameter)
GlobalLexicalModel *model = new GlobalLexicalModel(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
- }
- else if (feature == "GlobalLexicalModelUnlimited") {
+ } else if (feature == "GlobalLexicalModelUnlimited") {
GlobalLexicalModelUnlimited *model = NULL; //new GlobalLexicalModelUnlimited(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
- }
- else if (feature == "SourceWordDeletionFeature") {
+ } else if (feature == "SourceWordDeletionFeature") {
SourceWordDeletionFeature *model = new SourceWordDeletionFeature(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
//SetWeights(model, weights);
- }
- else if (feature == "TargetWordInsertionFeature") {
+ } else if (feature == "TargetWordInsertionFeature") {
TargetWordInsertionFeature *model = new TargetWordInsertionFeature(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
//SetWeights(model, weights);
- }
- else if (feature == "PhraseBoundaryFeature") {
+ } else if (feature == "PhraseBoundaryFeature") {
PhraseBoundaryFeature *model = new PhraseBoundaryFeature(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
//SetWeights(model, weights);
- }
- else if (feature == "PhraseLengthFeature") {
+ } else if (feature == "PhraseLengthFeature") {
PhraseLengthFeature *model = new PhraseLengthFeature(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
//SetWeights(model, weights);
- }
- else if (feature == "WordTranslationFeature") {
+ } else if (feature == "WordTranslationFeature") {
WordTranslationFeature *model = new WordTranslationFeature(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
//SetWeights(model, weights);
- }
- else if (feature == "TargetBigramFeature") {
+ } else if (feature == "TargetBigramFeature") {
TargetBigramFeature *model = new TargetBigramFeature(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
//SetWeights(model, weights);
- }
- else if (feature == "TargetNgramFeature") {
+ } else if (feature == "TargetNgramFeature") {
TargetNgramFeature *model = new TargetNgramFeature(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
//SetWeights(model, weights);
- }
- else if (feature == "PhrasePairFeature") {
+ } else if (feature == "PhrasePairFeature") {
PhrasePairFeature *model = new PhrasePairFeature(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
//SetWeights(model, weights);
- }
- else if (feature == "LexicalReordering") {
+ } else if (feature == "LexicalReordering") {
LexicalReordering *model = new LexicalReordering(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
- }
- else if (feature == "KENLM") {
+ } else if (feature == "KENLM") {
LanguageModel *model = ConstructKenLM(feature, line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
@@ -648,69 +638,58 @@ bool StaticData::LoadData(Parameter *parameter)
GenerationDictionary *model = new GenerationDictionary(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
- }
- else if (feature == "BleuScoreFeature") {
+ } else if (feature == "BleuScoreFeature") {
BleuScoreFeature *model = new BleuScoreFeature(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
- }
- else if (feature == "Distortion") {
+ } else if (feature == "Distortion") {
DistortionScoreProducer *model = new DistortionScoreProducer(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
- }
- else if (feature == "WordPenalty") {
+ } else if (feature == "WordPenalty") {
WordPenaltyProducer *model = new WordPenaltyProducer(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
m_wpProducer = model;
- }
- else if (feature == "UnknownWordPenalty") {
+ } else if (feature == "UnknownWordPenalty") {
UnknownWordPenaltyProducer *model = new UnknownWordPenaltyProducer(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
if (weights.size() == 0)
weights.push_back(1.0f);
SetWeights(model, weights);
m_unknownWordPenaltyProducer = model;
- }
- else if (feature == "PhraseDictionaryBinary") {
+ } else if (feature == "PhraseDictionaryBinary") {
PhraseDictionaryTreeAdaptor* model = new PhraseDictionaryTreeAdaptor(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
m_phraseDictionary.push_back(model);
- }
- else if (feature == "PhraseDictionaryOnDisk") {
+ } else if (feature == "PhraseDictionaryOnDisk") {
PhraseDictionaryOnDisk* model = new PhraseDictionaryOnDisk(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
m_phraseDictionary.push_back(model);
- }
- else if (feature == "PhraseDictionaryMemory") {
+ } else if (feature == "PhraseDictionaryMemory") {
PhraseDictionaryMemory* model = new PhraseDictionaryMemory(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
m_phraseDictionary.push_back(model);
- }
- else if (feature == "PhraseDictionaryCompact") {
+ } else if (feature == "PhraseDictionaryCompact") {
PhraseDictionaryCompact* model = new PhraseDictionaryCompact(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
m_phraseDictionary.push_back(model);
- }
- else if (feature == "PhraseDictionaryMultiModel") {
+ } else if (feature == "PhraseDictionaryMultiModel") {
PhraseDictionaryMultiModel* model = new PhraseDictionaryMultiModel(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
m_phraseDictionary.push_back(model);
- }
- else if (feature == "PhraseDictionaryMultiModelCounts") {
+ } else if (feature == "PhraseDictionaryMultiModelCounts") {
PhraseDictionaryMultiModelCounts* model = new PhraseDictionaryMultiModelCounts(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
m_phraseDictionary.push_back(model);
- }
- else if (feature == "PhraseDictionaryALSuffixArray") {
- PhraseDictionaryALSuffixArray* model = new PhraseDictionaryALSuffixArray(line);
+ } else if (feature == "PhraseDictionaryALSuffixArray") {
+ PhraseDictionaryALSuffixArray* model = new PhraseDictionaryALSuffixArray(line);
vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
SetWeights(model, weights);
m_phraseDictionary.push_back(model);
@@ -912,7 +891,7 @@ bool StaticData::LoadDecodeGraphs()
DecodeGraph *decodeGraph;
if (IsChart()) {
size_t maxChartSpan = (decodeGraphInd < maxChartSpans.size()) ? maxChartSpans[decodeGraphInd] : DEFAULT_MAX_CHART_SPAN;
- cerr << "max-chart-span: " << maxChartSpans[decodeGraphInd] << endl;
+ cerr << "max-chart-span: " << maxChartSpans[decodeGraphInd] << endl;
decodeGraph = new DecodeGraph(m_decodeGraphs.size(), maxChartSpan);
} else {
decodeGraph = new DecodeGraph(m_decodeGraphs.size());
@@ -947,7 +926,7 @@ const TranslationOptionList* StaticData::FindTransOptListInCache(const DecodeGra
boost::mutex::scoped_lock lock(m_transOptCacheMutex);
#endif
std::map<std::pair<size_t, Phrase>, std::pair<TranslationOptionList*,clock_t> >::iterator iter
- = m_transOptCache.find(key);
+ = m_transOptCache.find(key);
if (iter == m_transOptCache.end())
return NULL;
iter->second.second = clock(); // update last used time
@@ -994,7 +973,8 @@ void StaticData::AddTransOptListToCache(const DecodeGraph &decodeGraph, const Ph
m_transOptCache[key] = make_pair( storedTransOptList, clock() );
ReduceTransOptCache();
}
-void StaticData::ClearTransOptionCache() const {
+void StaticData::ClearTransOptionCache() const
+{
map<std::pair<size_t, Phrase>, std::pair< TranslationOptionList*, clock_t > >::iterator iterCache;
for (iterCache = m_transOptCache.begin() ; iterCache != m_transOptCache.end() ; ++iterCache) {
TranslationOptionList *transOptList = iterCache->second.first;
@@ -1091,20 +1071,19 @@ void StaticData::SetExecPath(const std::string &path)
{
/*
namespace fs = boost::filesystem;
-
+
fs::path full_path( fs::initial_path<fs::path>() );
-
+
full_path = fs::system_complete( fs::path( path ) );
-
+
//Without file name
m_binPath = full_path.parent_path().string();
*/
-
+
// NOT TESTED
size_t pos = path.rfind("/");
- if (pos != string::npos)
- {
- m_binPath = path.substr(0, pos);
+ if (pos != string::npos) {
+ m_binPath = path.substr(0, pos);
}
cerr << m_binPath << endl;
}
@@ -1114,27 +1093,31 @@ const string &StaticData::GetBinDirectory() const
return m_binPath;
}
-float StaticData::GetWeightWordPenalty() const {
+float StaticData::GetWeightWordPenalty() const
+{
float weightWP = GetWeight(m_wpProducer);
//VERBOSE(1, "Read weightWP from translation sytem: " << weightWP << std::endl);
return weightWP;
}
-float StaticData::GetWeightUnknownWordPenalty() const {
+float StaticData::GetWeightUnknownWordPenalty() const
+{
return GetWeight(m_unknownWordPenaltyProducer);
}
-void StaticData::InitializeForInput(const InputType& source) const {
+void StaticData::InitializeForInput(const InputType& source) const
+{
const std::vector<FeatureFunction*> &producers = FeatureFunction::GetFeatureFunctions();
- for(size_t i=0;i<producers.size();++i) {
+ for(size_t i=0; i<producers.size(); ++i) {
FeatureFunction &ff = *producers[i];
ff.InitializeForInput(source);
}
}
-void StaticData::CleanUpAfterSentenceProcessing(const InputType& source) const {
+void StaticData::CleanUpAfterSentenceProcessing(const InputType& source) const
+{
const std::vector<FeatureFunction*> &producers = FeatureFunction::GetFeatureFunctions();
- for(size_t i=0;i<producers.size();++i) {
+ for(size_t i=0; i<producers.size(); ++i) {
FeatureFunction &ff = *producers[i];
ff.CleanUpAfterSentenceProcessing(source);
}
@@ -1172,8 +1155,7 @@ bool StaticData::CheckWeights() const
set<string>::iterator iter = weightNames.find(descr);
if (iter == weightNames.end()) {
cerr << "Can't find weights for feature function " << descr << endl;
- }
- else {
+ } else {
weightNames.erase(iter);
}
}
diff --git a/moses/StaticData.h b/moses/StaticData.h
index 01a0a19df..5a1cec213 100644
--- a/moses/StaticData.h
+++ b/moses/StaticData.h
@@ -87,7 +87,7 @@ protected:
m_translationOptionThreshold,
m_wordDeletionWeight;
-
+
// PhraseTrans, Generation & LanguageModelScore has multiple weights.
int m_maxDistortion;
// do it differently from old pharaoh
@@ -206,7 +206,7 @@ protected:
int m_threadCount;
long m_startTranslationId;
-
+
StaticData();
@@ -223,7 +223,7 @@ protected:
bool m_continuePartialTranslation;
std::string m_binPath;
-
+
public:
bool IsAlwaysCreateDirectTranslationOption() const {
@@ -363,15 +363,15 @@ public:
bool IsLabeledNBestList() const {
return m_labeledNBestList;
}
-
+
bool UseMinphrInMemory() const {
- return m_minphrMemory;
+ return m_minphrMemory;
}
bool UseMinlexrInMemory() const {
- return m_minlexrMemory;
+ return m_minlexrMemory;
}
-
+
size_t GetNumRealWordsInInput() const {
return m_numRealWordsInInput;
}
@@ -421,13 +421,16 @@ public:
bool IsChart() const {
return m_searchAlgorithm == ChartDecoding || m_searchAlgorithm == ChartIncremental;
}
- const WordPenaltyProducer *GetWordPenaltyProducer() const
- { return m_wpProducer; }
- WordPenaltyProducer *GetWordPenaltyProducer() // for mira
- { return m_wpProducer; }
+ const WordPenaltyProducer *GetWordPenaltyProducer() const {
+ return m_wpProducer;
+ }
+ WordPenaltyProducer *GetWordPenaltyProducer() { // for mira
+ return m_wpProducer;
+ }
- const UnknownWordPenaltyProducer *GetUnknownWordPenaltyProducer() const
- { return m_unknownWordPenaltyProducer; }
+ const UnknownWordPenaltyProducer *GetUnknownWordPenaltyProducer() const {
+ return m_unknownWordPenaltyProducer;
+ }
size_t GetNumInputScores() const {
return m_numInputScores;
@@ -458,7 +461,7 @@ public:
float GetSparseWeight(const FName& featureName) const {
return m_allWeights.GetSparseWeight(featureName);
}
-
+
//Weights for feature with fixed number of values
void SetWeights(const FeatureFunction* sp, const std::vector<float>& weights);
@@ -627,15 +630,17 @@ public:
int ThreadCount() const {
return m_threadCount;
}
-
- long GetStartTranslationId() const
- { return m_startTranslationId; }
-
+
+ long GetStartTranslationId() const {
+ return m_startTranslationId;
+ }
+
void SetExecPath(const std::string &path);
const std::string &GetBinDirectory() const;
bool NeedAlignmentInfo() const {
- return m_needAlignmentInfo; }
+ return m_needAlignmentInfo;
+ }
const std::string &GetAlignmentOutputFile() const {
return m_alignmentOutputFile;
}
@@ -656,19 +661,26 @@ public:
float GetWeightWordPenalty() const;
float GetWeightUnknownWordPenalty() const;
- const std::vector<PhraseDictionary*>& GetPhraseDictionaries() const
- { return m_phraseDictionary;}
- const std::vector<const GenerationDictionary*>& GetGenerationDictionaries() const
- { return m_generationDictionary;}
- const PhraseDictionary*GetTranslationScoreProducer(size_t index) const
- { return GetPhraseDictionaries().at(index); }
+ const std::vector<PhraseDictionary*>& GetPhraseDictionaries() const {
+ return m_phraseDictionary;
+ }
+ const std::vector<const GenerationDictionary*>& GetGenerationDictionaries() const {
+ return m_generationDictionary;
+ }
+ const PhraseDictionary*GetTranslationScoreProducer(size_t index) const {
+ return GetPhraseDictionaries().at(index);
+ }
std::vector<float> GetTranslationWeights(size_t index) const {
std::vector<float> weights = GetWeights(GetTranslationScoreProducer(index));
return weights;
}
- const std::vector<DecodeGraph*>& GetDecodeGraphs() const {return m_decodeGraphs;}
- const std::vector<size_t>& GetDecodeGraphBackoff() const {return m_decodeGraphBackoff;}
+ const std::vector<DecodeGraph*>& GetDecodeGraphs() const {
+ return m_decodeGraphs;
+ }
+ const std::vector<size_t>& GetDecodeGraphBackoff() const {
+ return m_decodeGraphBackoff;
+ }
//sentence (and thread) specific initialisationn and cleanup
void InitializeForInput(const InputType& source) const;
@@ -697,8 +709,7 @@ public:
#ifdef WITH_THREADS
if (m_multimodelweights_tmp.find(boost::this_thread::get_id()) != m_multimodelweights_tmp.end()) {
return &m_multimodelweights_tmp.find(boost::this_thread::get_id())->second;
- }
- else {
+ } else {
return NULL;
}
#else
diff --git a/moses/SyntacticLanguageModel.cpp b/moses/SyntacticLanguageModel.cpp
index 4a3b26ff1..cde041fe7 100644
--- a/moses/SyntacticLanguageModel.cpp
+++ b/moses/SyntacticLanguageModel.cpp
@@ -10,154 +10,159 @@
namespace Moses
{
- SyntacticLanguageModel::SyntacticLanguageModel(const std::string &line)
- // Initialize member variables
- /*
- : m_NumScoreComponents(weights.size())
- , m_files(new SyntacticLanguageModelFiles<YModel,XModel>(filePath))
- , m_factorType(factorType)
- , m_beamWidth(beamWidth) {
- */
- {
- /* taken from StaticData::LoadSyntacticLanguageModel()
- cerr << "Loading syntactic language models..." << std::endl;
-
- const vector<float> weights = Scan<float>(m_parameter->GetParam("weight-slm"));
- const vector<string> files = m_parameter->GetParam("slmodel-file");
-
- const FactorType factorType = (m_parameter->GetParam("slmodel-factor").size() > 0) ?
- TransformScore(Scan<int>(m_parameter->GetParam("slmodel-factor")[0]))
- : 0;
-
- const size_t beamWidth = (m_parameter->GetParam("slmodel-beam").size() > 0) ?
- TransformScore(Scan<int>(m_parameter->GetParam("slmodel-beam")[0]))
- : 500;
-
- if (files.size() < 1) {
- cerr << "No syntactic language model files specified!" << std::endl;
- return false;
- }
+SyntacticLanguageModel::SyntacticLanguageModel(const std::string &line)
+// Initialize member variables
+/*
+: m_NumScoreComponents(weights.size())
+, m_files(new SyntacticLanguageModelFiles<YModel,XModel>(filePath))
+, m_factorType(factorType)
+, m_beamWidth(beamWidth) {
+*/
+{
+ /* taken from StaticData::LoadSyntacticLanguageModel()
+ cerr << "Loading syntactic language models..." << std::endl;
- // check if feature is used
- if (weights.size() >= 1) {
+ const vector<float> weights = Scan<float>(m_parameter->GetParam("weight-slm"));
+ const vector<string> files = m_parameter->GetParam("slmodel-file");
- //cout.setf(ios::scientific,ios::floatfield);
- //cerr.setf(ios::scientific,ios::floatfield);
+ const FactorType factorType = (m_parameter->GetParam("slmodel-factor").size() > 0) ?
+ TransformScore(Scan<int>(m_parameter->GetParam("slmodel-factor")[0]))
+ : 0;
- // create the feature
- m_syntacticLanguageModel = new SyntacticLanguageModel(files,weights,factorType,beamWidth);
+ const size_t beamWidth = (m_parameter->GetParam("slmodel-beam").size() > 0) ?
+ TransformScore(Scan<int>(m_parameter->GetParam("slmodel-beam")[0]))
+ : 500;
+ if (files.size() < 1) {
+ cerr << "No syntactic language model files specified!" << std::endl;
+ return false;
+ }
- /////////////////////////////////////////
- // BEGIN LANE's UNSTABLE EXPERIMENT :)
- //
+ // check if feature is used
+ if (weights.size() >= 1) {
- //double ppl = m_syntacticLanguageModel->perplexity();
- //cerr << "Probability is " << ppl << endl;
+ //cout.setf(ios::scientific,ios::floatfield);
+ //cerr.setf(ios::scientific,ios::floatfield);
+ // create the feature
+ m_syntacticLanguageModel = new SyntacticLanguageModel(files,weights,factorType,beamWidth);
- //
- // END LANE's UNSTABLE EXPERIMENT
- /////////////////////////////////////////
+ /////////////////////////////////////////
+ // BEGIN LANE's UNSTABLE EXPERIMENT :)
+ //
+ //double ppl = m_syntacticLanguageModel->perplexity();
+ //cerr << "Probability is " << ppl << endl;
- if (m_syntacticLanguageModel==NULL) {
- return false;
- }
- }
+ //
+ // END LANE's UNSTABLE EXPERIMENT
+ /////////////////////////////////////////
- return true;
- */
- }
- SyntacticLanguageModel::~SyntacticLanguageModel() {
- VERBOSE(3,"Destructing SyntacticLanguageModel" << std::endl);
- delete m_files;
- }
+ if (m_syntacticLanguageModel==NULL) {
+ return false;
+ }
- size_t SyntacticLanguageModel::GetNumScoreComponents() const {
- return m_NumScoreComponents;
}
- std::string SyntacticLanguageModel::GetScoreProducerDescription() const {
- return "SyntacticLM";
- }
+ return true;
- const FFState* SyntacticLanguageModel::EmptyHypothesisState(const InputType &input) const {
+ */
+}
- return new SyntacticLanguageModelState<YModel,XModel,S,R>(m_files,m_beamWidth);
+SyntacticLanguageModel::~SyntacticLanguageModel()
+{
+ VERBOSE(3,"Destructing SyntacticLanguageModel" << std::endl);
+ delete m_files;
+}
- }
+size_t SyntacticLanguageModel::GetNumScoreComponents() const
+{
+ return m_NumScoreComponents;
+}
- /*
- double SyntacticLanguageModel::perplexity() {
+std::string SyntacticLanguageModel::GetScoreProducerDescription() const
+{
+ return "SyntacticLM";
+}
+
+const FFState* SyntacticLanguageModel::EmptyHypothesisState(const InputType &input) const
+{
- SyntacticLanguageModelState<YModel,XModel,S,R> *prev =
- new SyntacticLanguageModelState<YModel,XModel,S,R>(m_files,m_beamWidth);
+ return new SyntacticLanguageModelState<YModel,XModel,S,R>(m_files,m_beamWidth);
- std::cerr << "Initial prob:" << "\t" << prev->getProb() <<std::endl;
+}
+/*
+double SyntacticLanguageModel::perplexity() {
- std::vector<std::string> words(3);
- words[0] = "no";
- words[1] = ",";
- words[2] = "zxvth";
+ SyntacticLanguageModelState<YModel,XModel,S,R> *prev =
+ new SyntacticLanguageModelState<YModel,XModel,S,R>(m_files,m_beamWidth);
+ std::cerr << "Initial prob:" << "\t" << prev->getProb() <<std::endl;
- for (std::vector<std::string>::iterator i=words.begin();
- i != words.end();
- i++) {
- prev = new SyntacticLanguageModelState<YModel,XModel,S,R>(prev, *i);
- std::cerr << *i << "\t" << prev->getProb() <<std::endl;
+ std::vector<std::string> words(3);
+ words[0] = "no";
+ words[1] = ",";
+ words[2] = "zxvth";
- }
- if (true) exit(-1);
+ for (std::vector<std::string>::iterator i=words.begin();
+ i != words.end();
+ i++) {
- return prev->getProb();
+ prev = new SyntacticLanguageModelState<YModel,XModel,S,R>(prev, *i);
+ std::cerr << *i << "\t" << prev->getProb() <<std::endl;
}
- */
- FFState* SyntacticLanguageModel::Evaluate(const Hypothesis& cur_hypo,
- const FFState* prev_state,
- ScoreComponentCollection* accumulator) const {
-
- VERBOSE(3,"Evaluating SyntacticLanguageModel for a hypothesis" << endl);
-
- SyntacticLanguageModelState<YModel,XModel,S,R>* tmpState = NULL;
- SyntacticLanguageModelState<YModel,XModel,S,R>* nextState = NULL;
-
-
- const TargetPhrase& targetPhrase = cur_hypo.GetCurrTargetPhrase();
-
- for (size_t i=0, n=targetPhrase.GetSize(); i<n; i++) {
-
- const Word& word = targetPhrase.GetWord(i);
- const Factor* factor = word.GetFactor(m_factorType);
-
- const std::string& string = factor->GetString();
-
- if (i==0) {
- nextState = new SyntacticLanguageModelState<YModel,XModel,S,R>((const SyntacticLanguageModelState<YModel,XModel,S,R>*)prev_state, string);
- } else {
- tmpState = nextState;
- nextState = new SyntacticLanguageModelState<YModel,XModel,S,R>(tmpState, string);
- delete tmpState;
- }
-
- double score = nextState->getScore();
- VERBOSE(3,"SynLM evaluated a score of " << score << endl);
- accumulator->Assign( this, score );
- }
-
+ if (true) exit(-1);
+
+ return prev->getProb();
+
+}
+*/
+FFState* SyntacticLanguageModel::Evaluate(const Hypothesis& cur_hypo,
+ const FFState* prev_state,
+ ScoreComponentCollection* accumulator) const
+{
+
+ VERBOSE(3,"Evaluating SyntacticLanguageModel for a hypothesis" << endl);
+
+ SyntacticLanguageModelState<YModel,XModel,S,R>* tmpState = NULL;
+ SyntacticLanguageModelState<YModel,XModel,S,R>* nextState = NULL;
+
- return nextState;
+ const TargetPhrase& targetPhrase = cur_hypo.GetCurrTargetPhrase();
+ for (size_t i=0, n=targetPhrase.GetSize(); i<n; i++) {
+
+ const Word& word = targetPhrase.GetWord(i);
+ const Factor* factor = word.GetFactor(m_factorType);
+
+ const std::string& string = factor->GetString();
+
+ if (i==0) {
+ nextState = new SyntacticLanguageModelState<YModel,XModel,S,R>((const SyntacticLanguageModelState<YModel,XModel,S,R>*)prev_state, string);
+ } else {
+ tmpState = nextState;
+ nextState = new SyntacticLanguageModelState<YModel,XModel,S,R>(tmpState, string);
+ delete tmpState;
+ }
+
+ double score = nextState->getScore();
+ VERBOSE(3,"SynLM evaluated a score of " << score << endl);
+ accumulator->Assign( this, score );
}
+
+
+ return nextState;
+
+}
+
}
diff --git a/moses/SyntacticLanguageModel.h b/moses/SyntacticLanguageModel.h
index 3cd4c58e9..6e88d85c1 100644
--- a/moses/SyntacticLanguageModel.h
+++ b/moses/SyntacticLanguageModel.h
@@ -12,40 +12,41 @@ class XModel; // observed model
namespace Moses
{
- template <class MH, class MO> class SyntacticLanguageModelFiles;
-
- class SyntacticLanguageModel : public StatefulFeatureFunction {
+template <class MH, class MO> class SyntacticLanguageModelFiles;
- public:
- SyntacticLanguageModel(const std::string &line);
+class SyntacticLanguageModel : public StatefulFeatureFunction
+{
+
+public:
+ SyntacticLanguageModel(const std::string &line);
- ~SyntacticLanguageModel();
+ ~SyntacticLanguageModel();
- size_t GetNumScoreComponents() const;
+ size_t GetNumScoreComponents() const;
- const FFState* EmptyHypothesisState(const InputType &input) const;
+ const FFState* EmptyHypothesisState(const InputType &input) const;
- FFState* Evaluate(const Hypothesis& cur_hypo,
- const FFState* prev_state,
- ScoreComponentCollection* accumulator) const;
+ FFState* Evaluate(const Hypothesis& cur_hypo,
+ const FFState* prev_state,
+ ScoreComponentCollection* accumulator) const;
- FFState* EvaluateChart(const ChartHypothesis& cur_hypo,
- int featureID,
- ScoreComponentCollection* accumulator) const {
- throw std::runtime_error("Syntactic LM can only be used with phrase-based decoder.");
- }
+ FFState* EvaluateChart(const ChartHypothesis& cur_hypo,
+ int featureID,
+ ScoreComponentCollection* accumulator) const {
+ throw std::runtime_error("Syntactic LM can only be used with phrase-based decoder.");
+ }
- // double perplexity();
+ // double perplexity();
- private:
+private:
- const size_t m_NumScoreComponents;
- SyntacticLanguageModelFiles<YModel,XModel>* m_files;
- const FactorType m_factorType;
- const size_t m_beamWidth;
+ const size_t m_NumScoreComponents;
+ SyntacticLanguageModelFiles<YModel,XModel>* m_files;
+ const FactorType m_factorType;
+ const size_t m_beamWidth;
- };
+};
}
diff --git a/moses/SyntacticLanguageModelFiles.h b/moses/SyntacticLanguageModelFiles.h
index 2e12e88c6..b91c0abfe 100644
--- a/moses/SyntacticLanguageModelFiles.h
+++ b/moses/SyntacticLanguageModelFiles.h
@@ -9,50 +9,55 @@
namespace Moses
{
-template <class MH, class MO>
-class SyntacticLanguageModelFiles {
+template <class MH, class MO>
+class SyntacticLanguageModelFiles
+{
- public:
+public:
SyntacticLanguageModelFiles(const std::vector<std::string>& filePaths);
~SyntacticLanguageModelFiles();
-
+
MH* getHiddenModel();
MO* getObservedModel();
- private:
+private:
MH* hiddenModel;
MO* observedModel;
-
+
};
template <class MH, class MO>
- SyntacticLanguageModelFiles<MH,MO>::SyntacticLanguageModelFiles(const std::vector<std::string>& filePaths) {
+SyntacticLanguageModelFiles<MH,MO>::SyntacticLanguageModelFiles(const std::vector<std::string>& filePaths)
+{
this->hiddenModel = new MH();
this->observedModel = new MO();
-
+
//// I. LOAD MODELS...
std::cerr << "Reading syntactic language model files...\n";
// For each model file...
for ( int a=0, n=filePaths.size(); a<n; a++ ) { // read models
FILE* pf = fopen(filePaths[a].c_str(),"r"); //CHECK(pf); // Read model file
- if(!pf){
+ if(!pf) {
std::cerr << "Error loading model file " << filePaths[a] << std::endl;
return;
}
std::cerr << "Loading model \'" << filePaths[a] << "\'...\n";
- int c=' '; int i=0; int line=1; String sBuff(1000); // Lookahead/ctrs/buffers
+ int c=' ';
+ int i=0;
+ int line=1;
+ String sBuff(1000); // Lookahead/ctrs/buffers
CONSUME_ALL ( pf, c, WHITESPACE(c), line); // Get to first record
while ( c!=-1 && c!='\0' && c!='\5' ) { // For each record
CONSUME_STR ( pf, c, (c!='\n' && c!='\0' && c!='\5'), sBuff, i, line ); // Consume line
StringInput si(sBuff.c_array());
if ( !( sBuff[0]=='#' // Accept comments/fields
- || si>>*(this->hiddenModel)>>"\0"!=NULL
- || si>>*(this->observedModel)>>"\0"!=NULL
- ))
- std::cerr<<"\nERROR: can't parse \'"<<sBuff<<"\' in line "<<line<<"\n\n";
+ || si>>*(this->hiddenModel)>>"\0"!=NULL
+ || si>>*(this->observedModel)>>"\0"!=NULL
+ ))
+ std::cerr<<"\nERROR: can't parse \'"<<sBuff<<"\' in line "<<line<<"\n\n";
CONSUME_ALL ( pf, c, WHITESPACE(c), line); // Consume whitespace
if ( line%100000==0 ) std::cerr<<" "<<line<<" lines read...\n"; // Progress for big models
}
@@ -66,7 +71,8 @@ template <class MH, class MO>
template <class MH, class MO>
- SyntacticLanguageModelFiles<MH,MO>::~SyntacticLanguageModelFiles() {
+SyntacticLanguageModelFiles<MH,MO>::~SyntacticLanguageModelFiles()
+{
VERBOSE(3,"Destructing syntactic language model files" << std::endl);
delete hiddenModel;
@@ -76,15 +82,17 @@ template <class MH, class MO>
template <class MH, class MO>
- MH* SyntacticLanguageModelFiles<MH,MO>::getHiddenModel() {
-
+MH* SyntacticLanguageModelFiles<MH,MO>::getHiddenModel()
+{
+
return this->hiddenModel;
}
template <class MH, class MO>
- MO* SyntacticLanguageModelFiles<MH,MO>::getObservedModel() {
-
+MO* SyntacticLanguageModelFiles<MH,MO>::getObservedModel()
+{
+
return this->observedModel;
}
diff --git a/moses/SyntacticLanguageModelState.h b/moses/SyntacticLanguageModelState.h
index 15828eedc..bf35616d9 100644
--- a/moses/SyntacticLanguageModelState.h
+++ b/moses/SyntacticLanguageModelState.h
@@ -15,8 +15,9 @@ namespace Moses
{
template <class MY, class MX, class YS=typename MY::RandVarType, class B=NullBackDat<typename MY::RandVarType> >
- class SyntacticLanguageModelState : public FFState {
- public:
+class SyntacticLanguageModelState : public FFState
+{
+public:
// Initialize an empty LM state
SyntacticLanguageModelState( SyntacticLanguageModelFiles<MY,MX>* modelData, int beamSize );
@@ -25,52 +26,53 @@ template <class MY, class MX, class YS=typename MY::RandVarType, class B=NullBac
SyntacticLanguageModelState( const SyntacticLanguageModelState* prev, std::string word );
- ~SyntacticLanguageModelState() {
- VERBOSE(3,"Destructing SyntacticLanguageModelState" << std::endl);
- delete randomVariableStore;
- }
+ ~SyntacticLanguageModelState() {
+ VERBOSE(3,"Destructing SyntacticLanguageModelState" << std::endl);
+ delete randomVariableStore;
+ }
- virtual int Compare(const FFState& other) const;
+ virtual int Compare(const FFState& other) const;
// Get the LM score from this LM state
double getScore() const;
- double getProb() const;
+ double getProb() const;
- private:
+private:
- void setScore(double score);
- void printRV();
+ void setScore(double score);
+ void printRV();
- SafeArray1D<Id<int>,pair<YS,LogProb> >* randomVariableStore;
- double prob;
- double score;
- int beamSize;
- SyntacticLanguageModelFiles<MY,MX>* modelData;
- bool sentenceStart;
+ SafeArray1D<Id<int>,pair<YS,LogProb> >* randomVariableStore;
+ double prob;
+ double score;
+ int beamSize;
+ SyntacticLanguageModelFiles<MY,MX>* modelData;
+ bool sentenceStart;
};
////////////////////////////////////////////////////////////////////////////////
-
- template <class MY, class MX, class YS, class B>
- void SyntacticLanguageModelState<MY,MX,YS,B>::printRV() {
- cerr << "*********** BEGIN printRV() ******************" << endl;
- int size=randomVariableStore->getSize();
- cerr << "randomVariableStore->getSize() == " << size << endl;
+template <class MY, class MX, class YS, class B>
+void SyntacticLanguageModelState<MY,MX,YS,B>::printRV()
+{
+
+ cerr << "*********** BEGIN printRV() ******************" << endl;
+ int size=randomVariableStore->getSize();
+ cerr << "randomVariableStore->getSize() == " << size << endl;
+
+ for (int depth=0; depth<size; depth+=1) {
- for (int depth=0; depth<size; depth+=1) {
-
- const pair<YS,LogProb> *data = &(randomVariableStore->get(depth));
- std::cerr << "randomVariableStore[" << depth << "]\t" << data->first << "\tprob = " << data->second.toProb() << "\tlogProb = " << double(data->second.toInt())/100 << std::endl;
+ const pair<YS,LogProb> *data = &(randomVariableStore->get(depth));
+ std::cerr << "randomVariableStore[" << depth << "]\t" << data->first << "\tprob = " << data->second.toProb() << "\tlogProb = " << double(data->second.toInt())/100 << std::endl;
- }
- cerr << "*********** END printRV() ******************" << endl;
+ }
+ cerr << "*********** END printRV() ******************" << endl;
- }
+}
// Initialize an empty LM state from grammar files
//
@@ -78,7 +80,8 @@ template <class MY, class MX, class YS=typename MY::RandVarType, class B=NullBac
// argv is the list of model file names
//
template <class MY, class MX, class YS, class B>
- SyntacticLanguageModelState<MY,MX,YS,B>::SyntacticLanguageModelState( SyntacticLanguageModelFiles<MY,MX>* modelData, int beamSize ) {
+SyntacticLanguageModelState<MY,MX,YS,B>::SyntacticLanguageModelState( SyntacticLanguageModelFiles<MY,MX>* modelData, int beamSize )
+{
this->randomVariableStore = new SafeArray1D<Id<int>,pair<YS,LogProb> >();
this->modelData = modelData;
@@ -89,7 +92,7 @@ template <class MY, class MX, class YS, class B>
StringInput(String(BEG_STATE).c_array())>>xBEG>>"\0";
cerr<<xBEG<<"\n";
- // cout << "Examining RV store just before RV init" << endl;
+ // cout << "Examining RV store just before RV init" << endl;
//printRV();
// Initialize the random variable store
@@ -107,16 +110,17 @@ template <class MY, class MX, class YS, class B>
//score = l.toDouble();
setScore(l.toDouble());
// MY::F_ROOT_OBS = true;
- // this->modelData->getHiddenModel()->setRootObs(true);
-
-
+// this->modelData->getHiddenModel()->setRootObs(true);
+
+
}
template <class MY, class MX, class YS, class B>
- int SyntacticLanguageModelState<MY,MX,YS,B>::Compare(const FFState& other) const {
+int SyntacticLanguageModelState<MY,MX,YS,B>::Compare(const FFState& other) const
+{
/*
- const SyntacticLanguageModelState<MY,MX,YS,B>& o =
+ const SyntacticLanguageModelState<MY,MX,YS,B>& o =
static_cast<const SyntacticLanguageModelState<MY,MX,YS,B>&>(other);
if (o.score > score) return 1;
@@ -124,13 +128,14 @@ template <class MY, class MX, class YS, class B>
else return 0;
*/
return 0;
- }
+}
template <class MY, class MX, class YS, class B>
- SyntacticLanguageModelState<MY,MX,YS,B>::SyntacticLanguageModelState( const SyntacticLanguageModelState* prev, std::string word ) {
+SyntacticLanguageModelState<MY,MX,YS,B>::SyntacticLanguageModelState( const SyntacticLanguageModelState* prev, std::string word )
+{
- // Initialize member variables
+ // Initialize member variables
this->randomVariableStore = new SafeArray1D<Id<int>,pair<YS,LogProb> >();
this->modelData = prev->modelData;
this->beamSize = prev->beamSize;
@@ -143,13 +148,13 @@ template <class MY, class MX, class YS, class B>
// Get HHMM model files
MY& mH = *(modelData->getHiddenModel());
MX& mO = *(modelData->getObservedModel());
-
+
// Initialize HHMM
- HMM<MY,MX,YS,B> hmm(mH,mO);
+ HMM<MY,MX,YS,B> hmm(mH,mO);
int MAX_WORDS = 2;
hmm.init(MAX_WORDS,this->beamSize,prev->randomVariableStore);
- typename MX::RandVarType x(word.c_str());
- // cout << "Examining HHMM just after hmm.init" << endl;
+ typename MX::RandVarType x(word.c_str());
+ // cout << "Examining HHMM just after hmm.init" << endl;
// hmm.debugPrint();
@@ -158,21 +163,21 @@ template <class MY, class MX, class YS, class B>
hmm.writeCurr(cout,1);
cerr << "*********** END writeCurr() ******************" << endl;
*/
-/*
- {
-
- int wnum=1;
- list<TrellNode<YS,B> > lys = hmm.getMLSnodes(ysEND); // get mls list
- for ( typename list<TrellNode<YS,B> >::iterator i=lys.begin(); i!=lys.end(); i++, wnum++ ) { // for each frame
- cout << "HYPOTH " << wnum
- << " " << i->getBackData()
- << " " << x
- << " " << i->getId()
- << " (" << i->getLogProb() << ")"
- << endl; // print RV val
- }
- }
- */
+ /*
+ {
+
+ int wnum=1;
+ list<TrellNode<YS,B> > lys = hmm.getMLSnodes(ysEND); // get mls list
+ for ( typename list<TrellNode<YS,B> >::iterator i=lys.begin(); i!=lys.end(); i++, wnum++ ) { // for each frame
+ cout << "HYPOTH " << wnum
+ << " " << i->getBackData()
+ << " " << x
+ << " " << i->getId()
+ << " (" << i->getLogProb() << ")"
+ << endl; // print RV val
+ }
+ }
+ */
/*
@@ -189,7 +194,7 @@ template <class MY, class MX, class YS, class B>
// typename MX::RandVarType ov;
// ov.set(word.c_str(),mO);
// MY::WORD = ov.getW();
- //bool endOfSentence = prev->sentenceStart;//true;
+ //bool endOfSentence = prev->sentenceStart;//true;
// std::cerr << "About to give HHMM a word of input:\t" << word << std::endl;
@@ -197,27 +202,27 @@ template <class MY, class MX, class YS, class B>
// cout << "Examining HHMM just after hmm.updateRanked(" << x << "," << prev->sentenceStart << ")" << endl;
// hmm.debugPrint();
-/*
- cerr << "*********** BEGIN writeCurr() ******************" << endl;
- hmm.writeCurr(cout,0);
- hmm.writeCurr(cout,1);
- cerr << "*********** END writeCurr() ******************" << endl;
- */
-/*
-{
+ /*
+ cerr << "*********** BEGIN writeCurr() ******************" << endl;
+ hmm.writeCurr(cout,0);
+ hmm.writeCurr(cout,1);
+ cerr << "*********** END writeCurr() ******************" << endl;
+ */
+ /*
+ {
- int wnum=1;
- list<TrellNode<YS,B> > lys = hmm.getMLSnodes(ysEND); // get mls list
- for ( typename list<TrellNode<YS,B> >::iterator i=lys.begin(); i!=lys.end(); i++, wnum++ ) { // for each frame
- cout << "HYPOTH " << wnum
- << " " << i->getBackData()
- << " " << x
- << " " << i->getId()
- << " (" << i->getLogProb() << ")"
- << endl; // print RV val
- }
- }
- */
+ int wnum=1;
+ list<TrellNode<YS,B> > lys = hmm.getMLSnodes(ysEND); // get mls list
+ for ( typename list<TrellNode<YS,B> >::iterator i=lys.begin(); i!=lys.end(); i++, wnum++ ) { // for each frame
+ cout << "HYPOTH " << wnum
+ << " " << i->getBackData()
+ << " " << x
+ << " " << i->getId()
+ << " (" << i->getLogProb() << ")"
+ << endl; // print RV val
+ }
+ }
+ */
// X ov(word.c_str());
//mH.setWord(ov);
// MY::WORD = ov;//ov.getW();
@@ -226,17 +231,17 @@ template <class MY, class MX, class YS, class B>
//hmm.updateRanked(ov);
//mH.setRootObs(true);
//MY::F_ROOT_OBS = false;
-
+
// Get the current score
- double currSum = hmm.getCurrSum();
- //VERBOSE(3,"Setting score using currSum for " << scientific << x << " = " << currSum << endl);
+ double currSum = hmm.getCurrSum();
+ //VERBOSE(3,"Setting score using currSum for " << scientific << x << " = " << currSum << endl);
setScore(currSum);
- // cout << "Examining RV store just before RV init via gatherElementsInBeam" << endl;
+ // cout << "Examining RV store just before RV init via gatherElementsInBeam" << endl;
// printRV();
// Get new hidden random variable store from HHMM
hmm.gatherElementsInBeam(randomVariableStore);
- // cout << "Examining RV store just after RV init via gatherElementsInBeam" << endl;
+ // cout << "Examining RV store just after RV init via gatherElementsInBeam" << endl;
// printRV();
/*
cerr << "Writing hmm.writeCurr..." << endl;
@@ -248,22 +253,25 @@ template <class MY, class MX, class YS, class B>
template <class MY, class MX, class YS, class B>
-double SyntacticLanguageModelState<MY,MX,YS,B>::getProb() const {
-
+double SyntacticLanguageModelState<MY,MX,YS,B>::getProb() const
+{
+
return prob;
}
template <class MY, class MX, class YS, class B>
-double SyntacticLanguageModelState<MY,MX,YS,B>::getScore() const {
-
+double SyntacticLanguageModelState<MY,MX,YS,B>::getScore() const
+{
+
return score;
}
template <class MY, class MX, class YS, class B>
- void SyntacticLanguageModelState<MY,MX,YS,B>::setScore(double score) {
+void SyntacticLanguageModelState<MY,MX,YS,B>::setScore(double score)
+{
+
-
this->prob = score;
diff --git a/moses/TargetPhrase.cpp b/moses/TargetPhrase.cpp
index 81b7adf44..f3cf9d1e1 100644
--- a/moses/TargetPhrase.cpp
+++ b/moses/TargetPhrase.cpp
@@ -38,10 +38,10 @@ using namespace std;
namespace Moses
{
TargetPhrase::TargetPhrase( std::string out_string)
-:Phrase(0), m_fullScore(0.0), m_sourcePhrase(0)
-, m_alignTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo())
-, m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo())
-, m_lhsTarget(NULL)
+ :Phrase(0), m_fullScore(0.0), m_sourcePhrase(0)
+ , m_alignTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo())
+ , m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo())
+ , m_lhsTarget(NULL)
{
//ACAT
@@ -50,37 +50,36 @@ TargetPhrase::TargetPhrase( std::string out_string)
}
TargetPhrase::TargetPhrase()
-:Phrase()
-, m_fullScore(0.0)
-,m_sourcePhrase()
-, m_alignTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo())
-, m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo())
-, m_lhsTarget(NULL)
+ :Phrase()
+ , m_fullScore(0.0)
+ ,m_sourcePhrase()
+ , m_alignTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo())
+ , m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo())
+ , m_lhsTarget(NULL)
{
}
TargetPhrase::TargetPhrase(const Phrase &phrase)
-: Phrase(phrase)
-, m_fullScore(0.0)
-, m_sourcePhrase()
-, m_alignTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo())
-, m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo())
-, m_lhsTarget(NULL)
+ : Phrase(phrase)
+ , m_fullScore(0.0)
+ , m_sourcePhrase()
+ , m_alignTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo())
+ , m_alignNonTerm(&AlignmentInfoCollection::Instance().GetEmptyAlignmentInfo())
+ , m_lhsTarget(NULL)
{
}
TargetPhrase::TargetPhrase(const TargetPhrase &copy)
-: Phrase(copy)
-, m_fullScore(copy.m_fullScore)
-, m_sourcePhrase(copy.m_sourcePhrase)
-, m_alignTerm(copy.m_alignTerm)
-, m_alignNonTerm(copy.m_alignNonTerm)
-, m_scoreBreakdown(copy.m_scoreBreakdown)
+ : Phrase(copy)
+ , m_fullScore(copy.m_fullScore)
+ , m_sourcePhrase(copy.m_sourcePhrase)
+ , m_alignTerm(copy.m_alignTerm)
+ , m_alignNonTerm(copy.m_alignNonTerm)
+ , m_scoreBreakdown(copy.m_scoreBreakdown)
{
if (copy.m_lhsTarget) {
m_lhsTarget = new Word(copy.m_lhsTarget);
- }
- else {
+ } else {
m_lhsTarget = NULL;
}
@@ -125,8 +124,8 @@ void TargetPhrase::Evaluate(const InputType &input)
const std::vector<FeatureFunction*> &ffs = FeatureFunction::GetFeatureFunctions();
for (size_t i = 0; i < ffs.size(); ++i) {
- const FeatureFunction &ff = *ffs[i];
- ff.Evaluate(input, m_scoreBreakdown);
+ const FeatureFunction &ff = *ffs[i];
+ ff.Evaluate(input, m_scoreBreakdown);
}
}
@@ -180,7 +179,7 @@ TargetPhrase *TargetPhrase::MergeNext(const TargetPhrase &inputPhrase) const
void TargetPhrase::SetAlignmentInfo(const StringPiece &alignString)
{
- AlignmentInfo::CollType alignTerm, alignNonTerm;
+ AlignmentInfo::CollType alignTerm, alignNonTerm;
for (util::TokenIter<util::AnyCharacter, true> token(alignString, util::AnyCharacter(" \t")); token; ++token) {
util::TokenIter<util::SingleCharacter, false> dash(*token, util::SingleCharacter('-'));
@@ -194,11 +193,10 @@ void TargetPhrase::SetAlignmentInfo(const StringPiece &alignString)
if (GetWord(targetPos).IsNonTerminal()) {
- alignNonTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
+ alignNonTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
+ } else {
+ alignTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
}
- else {
- alignTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
- }
}
SetAlignTerm(alignTerm);
SetAlignNonTerm(alignNonTerm);
@@ -207,15 +205,15 @@ void TargetPhrase::SetAlignmentInfo(const StringPiece &alignString)
void TargetPhrase::SetAlignTerm(const AlignmentInfo::CollType &coll)
{
- const AlignmentInfo *alignmentInfo = AlignmentInfoCollection::Instance().Add(coll);
- m_alignTerm = alignmentInfo;
+ const AlignmentInfo *alignmentInfo = AlignmentInfoCollection::Instance().Add(coll);
+ m_alignTerm = alignmentInfo;
}
void TargetPhrase::SetAlignNonTerm(const AlignmentInfo::CollType &coll)
{
- const AlignmentInfo *alignmentInfo = AlignmentInfoCollection::Instance().Add(coll);
- m_alignNonTerm = alignmentInfo;
+ const AlignmentInfo *alignmentInfo = AlignmentInfoCollection::Instance().Add(coll);
+ m_alignNonTerm = alignmentInfo;
}
void TargetPhrase::SetSparseScore(const FeatureFunction* translationScoreProducer, const StringPiece &sparseString)
diff --git a/moses/TargetPhrase.h b/moses/TargetPhrase.h
index df876a00a..bb1c7c5a7 100644
--- a/moses/TargetPhrase.h
+++ b/moses/TargetPhrase.h
@@ -48,10 +48,10 @@ protected:
float m_fullScore;
ScoreComponentCollection m_scoreBreakdown;
- // in case of confusion net, ptr to source phrase
- Phrase m_sourcePhrase;
- const AlignmentInfo* m_alignTerm, *m_alignNonTerm;
- const Word *m_lhsTarget;
+ // in case of confusion net, ptr to source phrase
+ Phrase m_sourcePhrase;
+ const AlignmentInfo* m_alignTerm, *m_alignNonTerm;
+ const Word *m_lhsTarget;
public:
TargetPhrase();
@@ -86,26 +86,28 @@ public:
return m_fullScore;
}
- inline const ScoreComponentCollection &GetScoreBreakdown() const
- { return m_scoreBreakdown; }
- inline ScoreComponentCollection &GetScoreBreakdown()
- { return m_scoreBreakdown; }
+ inline const ScoreComponentCollection &GetScoreBreakdown() const {
+ return m_scoreBreakdown;
+ }
+ inline ScoreComponentCollection &GetScoreBreakdown() {
+ return m_scoreBreakdown;
+ }
//TODO: Probably shouldn't copy this, but otherwise ownership is unclear
- void SetSourcePhrase(const Phrase& p)
- {
- m_sourcePhrase=p;
- }
- const Phrase& GetSourcePhrase() const
- {
- return m_sourcePhrase;
- }
-
- void SetTargetLHS(const Word *lhs)
- { m_lhsTarget = lhs; }
- const Word &GetTargetLHS() const
- { return *m_lhsTarget; }
-
+ void SetSourcePhrase(const Phrase& p) {
+ m_sourcePhrase=p;
+ }
+ const Phrase& GetSourcePhrase() const {
+ return m_sourcePhrase;
+ }
+
+ void SetTargetLHS(const Word *lhs) {
+ m_lhsTarget = lhs;
+ }
+ const Word &GetTargetLHS() const {
+ return *m_lhsTarget;
+ }
+
void SetAlignmentInfo(const StringPiece &alignString);
void SetAlignTerm(const AlignmentInfo *alignTerm) {
m_alignTerm = alignTerm;
@@ -117,11 +119,13 @@ public:
void SetAlignTerm(const AlignmentInfo::CollType &coll);
void SetAlignNonTerm(const AlignmentInfo::CollType &coll);
- const AlignmentInfo &GetAlignTerm() const
- { return *m_alignTerm; }
- const AlignmentInfo &GetAlignNonTerm() const
- { return *m_alignNonTerm; }
-
+ const AlignmentInfo &GetAlignTerm() const {
+ return *m_alignTerm;
+ }
+ const AlignmentInfo &GetAlignNonTerm() const {
+ return *m_alignNonTerm;
+ }
+
TO_STRING();
};
@@ -131,10 +135,8 @@ std::ostream& operator<<(std::ostream&, const TargetPhrase&);
/**
* Hasher that looks at source and target phrase.
**/
-struct TargetPhraseHasher
-{
- inline size_t operator()(const TargetPhrase& targetPhrase) const
- {
+struct TargetPhraseHasher {
+ inline size_t operator()(const TargetPhrase& targetPhrase) const {
size_t seed = 0;
boost::hash_combine(seed, targetPhrase);
boost::hash_combine(seed, targetPhrase.GetSourcePhrase());
@@ -145,14 +147,12 @@ struct TargetPhraseHasher
}
};
-struct TargetPhraseComparator
-{
- inline bool operator()(const TargetPhrase& lhs, const TargetPhrase& rhs) const
- {
+struct TargetPhraseComparator {
+ inline bool operator()(const TargetPhrase& lhs, const TargetPhrase& rhs) const {
return lhs.Compare(rhs) == 0 &&
- lhs.GetSourcePhrase().Compare(rhs.GetSourcePhrase()) == 0 &&
- lhs.GetAlignTerm() == rhs.GetAlignTerm() &&
- lhs.GetAlignNonTerm() == rhs.GetAlignNonTerm();
+ lhs.GetSourcePhrase().Compare(rhs.GetSourcePhrase()) == 0 &&
+ lhs.GetAlignTerm() == rhs.GetAlignTerm() &&
+ lhs.GetAlignNonTerm() == rhs.GetAlignNonTerm();
}
};
diff --git a/moses/TargetPhraseCollection.cpp b/moses/TargetPhraseCollection.cpp
index 78b63d852..88ce28eb6 100644
--- a/moses/TargetPhraseCollection.cpp
+++ b/moses/TargetPhraseCollection.cpp
@@ -59,8 +59,8 @@ void TargetPhraseCollection::Sort(bool adhereTableLimit, size_t tableLimit)
{
std::vector<TargetPhrase*>::iterator iterMiddle;
iterMiddle = (tableLimit == 0 || m_collection.size() < tableLimit)
- ? m_collection.end()
- : m_collection.begin()+tableLimit;
+ ? m_collection.end()
+ : m_collection.begin()+tableLimit;
std::partial_sort(m_collection.begin(), iterMiddle, m_collection.end(),
CompareTargetPhrase());
diff --git a/moses/TargetPhraseCollection.h b/moses/TargetPhraseCollection.h
index 4efb911fb..4207bccef 100644
--- a/moses/TargetPhraseCollection.h
+++ b/moses/TargetPhraseCollection.h
@@ -60,7 +60,9 @@ public:
RemoveAllInColl(m_collection);
}
- const std::vector<TargetPhrase*> &GetCollection() const { return m_collection; }
+ const std::vector<TargetPhrase*> &GetCollection() const {
+ return m_collection;
+ }
//! divide collection into 2 buckets using std::nth_element, the top & bottom according to table limit
void NthElement(size_t tableLimit);
diff --git a/moses/Terminal.h b/moses/Terminal.h
index 6247d0b6c..e7d18676e 100644
--- a/moses/Terminal.h
+++ b/moses/Terminal.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -29,7 +29,7 @@ namespace Moses
class TerminalHasher
{
- public:
+public:
// Generate a hash value for a word representing a terminal. It's
// assumed that the same subset of factors will be active for all words
// that are hashed.
@@ -47,7 +47,7 @@ class TerminalHasher
class TerminalEqualityPred
{
- public:
+public:
// Equality predicate for comparing words representing terminals. As
// with the hasher, it's assumed that all words will have the same
// subset of active factors.
diff --git a/moses/ThreadPool.h b/moses/ThreadPool.h
index fad236a98..bf981a2da 100644
--- a/moses/ThreadPool.h
+++ b/moses/ThreadPool.h
@@ -42,7 +42,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
/**
* Classes to implement a ThreadPool.
**/
-namespace Moses {
+namespace Moses
+{
/** A task to be executed by the ThreadPool
*/
@@ -50,7 +51,9 @@ class Task
{
public:
virtual void Run() = 0;
- virtual bool DeleteAfterExecution() { return true; }
+ virtual bool DeleteAfterExecution() {
+ return true;
+ }
virtual ~Task() {}
};
@@ -58,7 +61,7 @@ public:
class ThreadPool
{
- public:
+public:
/**
* Construct a thread pool of a fixed size.
**/
@@ -82,7 +85,9 @@ class ThreadPool
/**
* Set maximum number of queued threads (otherwise Submit blocks)
**/
- void SetQueueLimit( size_t limit ) { m_queueLimit = limit; }
+ void SetQueueLimit( size_t limit ) {
+ m_queueLimit = limit;
+ }
private:
/**
@@ -109,7 +114,7 @@ public:
#ifdef BOOST_HAS_PTHREADS
pthread_t tid = pthread_self();
#else
- typedef void * pthread_t;
+ typedef void * pthread_t;
pthread_t tid = 0;
#endif
std::cerr << "Executing " << m_id << " in thread id " << tid << std::endl;
diff --git a/moses/Timer.h b/moses/Timer.h
index deefa4a71..a6bd0e91a 100644
--- a/moses/Timer.h
+++ b/moses/Timer.h
@@ -33,7 +33,7 @@ public:
* using 'start' or 'restart'
*/
Timer() : running(false) {
- start_time = 0;
+ start_time = 0;
}
void start(const char* msg = 0);
diff --git a/moses/TranslationModel/BilingualDynSuffixArray.cpp b/moses/TranslationModel/BilingualDynSuffixArray.cpp
index 824529b91..a47473de5 100644
--- a/moses/TranslationModel/BilingualDynSuffixArray.cpp
+++ b/moses/TranslationModel/BilingualDynSuffixArray.cpp
@@ -7,495 +7,494 @@
using namespace std;
-namespace Moses {
+namespace Moses
+{
BilingualDynSuffixArray::BilingualDynSuffixArray():
- m_maxPhraseLength(StaticData::Instance().GetMaxPhraseLength()),
- m_maxSampleSize(20)
-{
- m_srcSA = 0;
- m_trgSA = 0;
- m_srcCorpus = new std::vector<wordID_t>();
- m_trgCorpus = new std::vector<wordID_t>();
- m_srcVocab = new Vocab(false);
- m_trgVocab = new Vocab(false);
- m_scoreCmp = 0;
+ m_maxPhraseLength(StaticData::Instance().GetMaxPhraseLength()),
+ m_maxSampleSize(20)
+{
+ m_srcSA = 0;
+ m_trgSA = 0;
+ m_srcCorpus = new std::vector<wordID_t>();
+ m_trgCorpus = new std::vector<wordID_t>();
+ m_srcVocab = new Vocab(false);
+ m_trgVocab = new Vocab(false);
+ m_scoreCmp = 0;
}
-BilingualDynSuffixArray::~BilingualDynSuffixArray()
+BilingualDynSuffixArray::~BilingualDynSuffixArray()
{
- if(m_srcSA) delete m_srcSA;
- if(m_trgSA) delete m_trgSA;
- if(m_srcVocab) delete m_srcVocab;
- if(m_trgVocab) delete m_trgVocab;
- if(m_srcCorpus) delete m_srcCorpus;
- if(m_trgCorpus) delete m_trgCorpus;
- if(m_scoreCmp) delete m_scoreCmp;
+ if(m_srcSA) delete m_srcSA;
+ if(m_trgSA) delete m_trgSA;
+ if(m_srcVocab) delete m_srcVocab;
+ if(m_trgVocab) delete m_trgVocab;
+ if(m_srcCorpus) delete m_srcCorpus;
+ if(m_trgCorpus) delete m_trgCorpus;
+ if(m_scoreCmp) delete m_scoreCmp;
}
bool BilingualDynSuffixArray::Load(
- const std::vector<FactorType>& inputFactors,
- const std::vector<FactorType>& outputFactors,
- std::string source, std::string target, std::string alignments,
- const std::vector<float> &weight)
+ const std::vector<FactorType>& inputFactors,
+ const std::vector<FactorType>& outputFactors,
+ std::string source, std::string target, std::string alignments,
+ const std::vector<float> &weight)
{
m_inputFactors = inputFactors;
m_outputFactors = outputFactors;
- m_scoreCmp = new ScoresComp(weight);
- InputFileStream sourceStrme(source);
- InputFileStream targetStrme(target);
- cerr << "Loading source corpus...\n";
- LoadCorpus(Input, sourceStrme, m_inputFactors, *m_srcCorpus, m_srcSntBreaks, m_srcVocab);
- cerr << "Loading target corpus...\n";
- LoadCorpus(Output, targetStrme, m_outputFactors,*m_trgCorpus, m_trgSntBreaks, m_trgVocab);
- CHECK(m_srcSntBreaks.size() == m_trgSntBreaks.size());
-
- // build suffix arrays and auxilliary arrays
- cerr << "Building Source Suffix Array...\n";
- m_srcSA = new DynSuffixArray(m_srcCorpus);
- if(!m_srcSA) return false;
- cerr << "Building Target Suffix Array...\n";
- //m_trgSA = new DynSuffixArray(m_trgCorpus);
- //if(!m_trgSA) return false;
+ m_scoreCmp = new ScoresComp(weight);
+ InputFileStream sourceStrme(source);
+ InputFileStream targetStrme(target);
+ cerr << "Loading source corpus...\n";
+ LoadCorpus(Input, sourceStrme, m_inputFactors, *m_srcCorpus, m_srcSntBreaks, m_srcVocab);
+ cerr << "Loading target corpus...\n";
+ LoadCorpus(Output, targetStrme, m_outputFactors,*m_trgCorpus, m_trgSntBreaks, m_trgVocab);
+ CHECK(m_srcSntBreaks.size() == m_trgSntBreaks.size());
+
+ // build suffix arrays and auxilliary arrays
+ cerr << "Building Source Suffix Array...\n";
+ m_srcSA = new DynSuffixArray(m_srcCorpus);
+ if(!m_srcSA) return false;
+ cerr << "Building Target Suffix Array...\n";
+ //m_trgSA = new DynSuffixArray(m_trgCorpus);
+ //if(!m_trgSA) return false;
cerr << "\t(Skipped. Not used)\n";
-
- InputFileStream alignStrme(alignments);
- cerr << "Loading Alignment File...\n";
- LoadRawAlignments(alignStrme);
- //LoadAlignments(alignStrme);
+
+ InputFileStream alignStrme(alignments);
+ cerr << "Loading Alignment File...\n";
+ LoadRawAlignments(alignStrme);
+ //LoadAlignments(alignStrme);
cerr << "Building frequent word cache...\n";
CacheFreqWords();
- return true;
+ return true;
}
-
+
bool BilingualDynSuffixArray::LoadTM(
- const std::vector<FactorType>& inputFactors,
- const std::vector<FactorType>& outputFactors,
- std::string source, std::string target, std::string alignments,
- const std::vector<float> &weight)
+ const std::vector<FactorType>& inputFactors,
+ const std::vector<FactorType>& outputFactors,
+ std::string source, std::string target, std::string alignments,
+ const std::vector<float> &weight)
{
m_inputFactors = inputFactors;
m_outputFactors = outputFactors;
-
+
m_scoreCmp = new ScoresComp(weight);
InputFileStream sourceStrme(source);
InputFileStream targetStrme(target);
- cerr << "Loading target corpus...\n";
+ cerr << "Loading target corpus...\n";
LoadCorpus(Output, targetStrme, m_outputFactors,*m_trgCorpus, m_trgSntBreaks, m_trgVocab);
-
- cerr << "Loading source corpus...\n";
+
+ cerr << "Loading source corpus...\n";
LoadCorpus(Input, sourceStrme, m_inputFactors, *m_srcCorpus, m_srcSntBreaks, m_srcVocab);
-
+
CHECK(m_srcSntBreaks.size() == m_trgSntBreaks.size());
-
+
// build suffix arrays and auxilliary arrays
- cerr << "Building Source Suffix Array...\n";
- m_srcSA = new DynSuffixArray(m_srcCorpus);
+ cerr << "Building Source Suffix Array...\n";
+ m_srcSA = new DynSuffixArray(m_srcCorpus);
if(!m_srcSA) return false;
- cerr << "Building Target Suffix Array...\n";
- //m_trgSA = new DynSuffixArray(m_trgCorpus);
+ cerr << "Building Target Suffix Array...\n";
+ //m_trgSA = new DynSuffixArray(m_trgCorpus);
//if(!m_trgSA) return false;
cerr << "\t(Skipped. Not used)\n";
-
+
InputFileStream alignStrme(alignments);
- cerr << "Loading Alignment File...\n";
+ cerr << "Loading Alignment File...\n";
LoadRawAlignments(alignStrme);
//LoadAlignments(alignStrme);
cerr << "Building frequent word cache...\n";
CacheFreqWords();
return true;
-
+
}
-int BilingualDynSuffixArray::LoadRawAlignments(InputFileStream& align)
+int BilingualDynSuffixArray::LoadRawAlignments(InputFileStream& align)
{
- // stores the alignments in the raw file format
- std::string line;
- std::vector<int> vtmp;
+ // stores the alignments in the raw file format
+ std::string line;
+ std::vector<int> vtmp;
int lineNum = 1;
- while(getline(align, line)) {
+ while(getline(align, line)) {
if (lineNum % 10000 == 0)
cerr << lineNum;
- Utils::splitToInt(line, vtmp, "- ");
- CHECK(vtmp.size() % 2 == 0);
- std::vector<short> vAlgn; // store as short ints for memory
- for (std::vector<int>::const_iterator itr = vtmp.begin();
- itr != vtmp.end(); ++itr) {
- vAlgn.push_back(short(*itr));
- }
- m_rawAlignments.push_back(vAlgn);
+ Utils::splitToInt(line, vtmp, "- ");
+ CHECK(vtmp.size() % 2 == 0);
+ std::vector<short> vAlgn; // store as short ints for memory
+ for (std::vector<int>::const_iterator itr = vtmp.begin();
+ itr != vtmp.end(); ++itr) {
+ vAlgn.push_back(short(*itr));
+ }
+ m_rawAlignments.push_back(vAlgn);
++lineNum;
- }
- return m_rawAlignments.size();
+ }
+ return m_rawAlignments.size();
}
-int BilingualDynSuffixArray::LoadRawAlignments(string& align) {
- // stores the alignments in the raw file format
+int BilingualDynSuffixArray::LoadRawAlignments(string& align)
+{
+ // stores the alignments in the raw file format
vector<int> vtmp;
Utils::splitToInt(align, vtmp, "- ");
CHECK(vtmp.size() % 2 == 0);
vector<short> vAlgn; // store as short ints for memory
for (std::vector<int>::const_iterator itr = vtmp.begin();
- itr != vtmp.end(); ++itr) {
- vAlgn.push_back(short(*itr));
+ itr != vtmp.end(); ++itr) {
+ vAlgn.push_back(short(*itr));
}
m_rawAlignments.push_back(vAlgn);
return m_rawAlignments.size();
}
-int BilingualDynSuffixArray::LoadAlignments(InputFileStream& align)
+int BilingualDynSuffixArray::LoadAlignments(InputFileStream& align)
{
- std::string line;
- std::vector<int> vtmp;
- int sntIndex(0);
-
- while(getline(align, line)) {
- Utils::splitToInt(line, vtmp, "- ");
- CHECK(vtmp.size() % 2 == 0);
-
- int sourceSize = GetSourceSentenceSize(sntIndex);
- int targetSize = GetTargetSentenceSize(sntIndex);
-
- SentenceAlignment curSnt(sntIndex, sourceSize, targetSize); // initialize empty sentence
- for(int i=0; i < (int)vtmp.size(); i+=2) {
- int sourcePos = vtmp[i];
- int targetPos = vtmp[i+1];
- CHECK(sourcePos < sourceSize);
- CHECK(targetPos < targetSize);
-
- curSnt.alignedList[sourcePos].push_back(targetPos); // list of target nodes for each source word
- curSnt.numberAligned[targetPos]++; // cnt of how many source words connect to this target word
- }
- curSnt.srcSnt = m_srcCorpus + sntIndex; // point source and target sentence
- curSnt.trgSnt = m_trgCorpus + sntIndex;
- m_alignments.push_back(curSnt);
-
- sntIndex++;
- }
- return m_alignments.size();
+ std::string line;
+ std::vector<int> vtmp;
+ int sntIndex(0);
+
+ while(getline(align, line)) {
+ Utils::splitToInt(line, vtmp, "- ");
+ CHECK(vtmp.size() % 2 == 0);
+
+ int sourceSize = GetSourceSentenceSize(sntIndex);
+ int targetSize = GetTargetSentenceSize(sntIndex);
+
+ SentenceAlignment curSnt(sntIndex, sourceSize, targetSize); // initialize empty sentence
+ for(int i=0; i < (int)vtmp.size(); i+=2) {
+ int sourcePos = vtmp[i];
+ int targetPos = vtmp[i+1];
+ CHECK(sourcePos < sourceSize);
+ CHECK(targetPos < targetSize);
+
+ curSnt.alignedList[sourcePos].push_back(targetPos); // list of target nodes for each source word
+ curSnt.numberAligned[targetPos]++; // cnt of how many source words connect to this target word
+ }
+ curSnt.srcSnt = m_srcCorpus + sntIndex; // point source and target sentence
+ curSnt.trgSnt = m_trgCorpus + sntIndex;
+ m_alignments.push_back(curSnt);
+
+ sntIndex++;
+ }
+ return m_alignments.size();
}
-SentenceAlignment BilingualDynSuffixArray::GetSentenceAlignment(const int sntIndex, bool trg2Src) const
+SentenceAlignment BilingualDynSuffixArray::GetSentenceAlignment(const int sntIndex, bool trg2Src) const
{
- // retrieves the alignments in the format used by SentenceAlignment.Extract()
- int sntGiven = trg2Src ? GetTargetSentenceSize(sntIndex) : GetSourceSentenceSize(sntIndex);
- int sntExtract = trg2Src ? GetSourceSentenceSize(sntIndex) : GetTargetSentenceSize(sntIndex);
- std::vector<short> alignment = m_rawAlignments.at(sntIndex);
- SentenceAlignment curSnt(sntIndex, sntGiven, sntExtract); // initialize empty sentence
- for(size_t i=0; i < alignment.size(); i+=2) {
- int sourcePos = alignment[i];
- int targetPos = alignment[i+1];
- if(trg2Src) {
- curSnt.alignedList[targetPos].push_back(sourcePos); // list of target nodes for each source word
- curSnt.numberAligned[sourcePos]++; // cnt of how many source words connect to this target word
- }
- else {
- curSnt.alignedList[sourcePos].push_back(targetPos); // list of target nodes for each source word
- curSnt.numberAligned[targetPos]++; // cnt of how many source words connect to this target word
- }
- }
- curSnt.srcSnt = m_srcCorpus + sntIndex; // point source and target sentence
- curSnt.trgSnt = m_trgCorpus + sntIndex;
-
- return curSnt;
+ // retrieves the alignments in the format used by SentenceAlignment.Extract()
+ int sntGiven = trg2Src ? GetTargetSentenceSize(sntIndex) : GetSourceSentenceSize(sntIndex);
+ int sntExtract = trg2Src ? GetSourceSentenceSize(sntIndex) : GetTargetSentenceSize(sntIndex);
+ std::vector<short> alignment = m_rawAlignments.at(sntIndex);
+ SentenceAlignment curSnt(sntIndex, sntGiven, sntExtract); // initialize empty sentence
+ for(size_t i=0; i < alignment.size(); i+=2) {
+ int sourcePos = alignment[i];
+ int targetPos = alignment[i+1];
+ if(trg2Src) {
+ curSnt.alignedList[targetPos].push_back(sourcePos); // list of target nodes for each source word
+ curSnt.numberAligned[sourcePos]++; // cnt of how many source words connect to this target word
+ } else {
+ curSnt.alignedList[sourcePos].push_back(targetPos); // list of target nodes for each source word
+ curSnt.numberAligned[targetPos]++; // cnt of how many source words connect to this target word
+ }
+ }
+ curSnt.srcSnt = m_srcCorpus + sntIndex; // point source and target sentence
+ curSnt.trgSnt = m_trgCorpus + sntIndex;
+
+ return curSnt;
}
-bool BilingualDynSuffixArray::ExtractPhrases(const int& sntIndex, const int& wordIndex,
- const int& sourceSize, std::vector<PhrasePair*>& phrasePairs, bool trg2Src) const
+bool BilingualDynSuffixArray::ExtractPhrases(const int& sntIndex, const int& wordIndex,
+ const int& sourceSize, std::vector<PhrasePair*>& phrasePairs, bool trg2Src) const
{
- /* ExtractPhrases() can extract the matching phrases for both directions by using the trg2Src
- * parameter */
- SentenceAlignment curSnt = GetSentenceAlignment(sntIndex, trg2Src);
- // get span of phrase in source sentence
- int beginSentence = m_srcSntBreaks[sntIndex];
- int rightIdx = wordIndex - beginSentence
- ,leftIdx = rightIdx - sourceSize + 1;
- return curSnt.Extract(m_maxPhraseLength, phrasePairs, leftIdx, rightIdx); // extract all phrase Alignments in sentence
+ /* ExtractPhrases() can extract the matching phrases for both directions by using the trg2Src
+ * parameter */
+ SentenceAlignment curSnt = GetSentenceAlignment(sntIndex, trg2Src);
+ // get span of phrase in source sentence
+ int beginSentence = m_srcSntBreaks[sntIndex];
+ int rightIdx = wordIndex - beginSentence
+ ,leftIdx = rightIdx - sourceSize + 1;
+ return curSnt.Extract(m_maxPhraseLength, phrasePairs, leftIdx, rightIdx); // extract all phrase Alignments in sentence
}
int BilingualDynSuffixArray::LoadCorpus(FactorDirection direction, InputFileStream& corpus, const FactorList& factors,
- std::vector<wordID_t>& cArray, std::vector<wordID_t>& sntArray,
- Vocab* vocab)
+ std::vector<wordID_t>& cArray, std::vector<wordID_t>& sntArray,
+ Vocab* vocab)
{
- std::string line, word;
- int sntIdx(0);
+ std::string line, word;
+ int sntIdx(0);
// corpus.seekg(0); Seems needless -> commented out to allow loading of gzipped corpora (gzfilebuf doesn't support seeking).
- const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
- while(getline(corpus, line)) {
- sntArray.push_back(sntIdx);
- Phrase phrase(ARRAY_SIZE_INCR);
- // parse phrase
- phrase.CreateFromString(direction, factors, line, factorDelimiter, NULL);
- // store words in vocabulary and corpus
- for( size_t i = 0; i < phrase.GetSize(); ++i) {
- cArray.push_back( vocab->GetWordID(phrase.GetWord(i)) );
- }
- sntIdx += phrase.GetSize();
- }
- //cArray.push_back(vocab->GetkOOVWordID); // signify end of corpus
+ const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
+ while(getline(corpus, line)) {
+ sntArray.push_back(sntIdx);
+ Phrase phrase(ARRAY_SIZE_INCR);
+ // parse phrase
+ phrase.CreateFromString(direction, factors, line, factorDelimiter, NULL);
+ // store words in vocabulary and corpus
+ for( size_t i = 0; i < phrase.GetSize(); ++i) {
+ cArray.push_back( vocab->GetWordID(phrase.GetWord(i)) );
+ }
+ sntIdx += phrase.GetSize();
+ }
+ //cArray.push_back(vocab->GetkOOVWordID); // signify end of corpus
vocab->MakeClosed(); // avoid adding words
- return cArray.size();
+ return cArray.size();
}
-bool BilingualDynSuffixArray::GetLocalVocabIDs(const Phrase& src, SAPhrase &output) const
+bool BilingualDynSuffixArray::GetLocalVocabIDs(const Phrase& src, SAPhrase &output) const
{
- // looks up the SA vocab ids for the current src phrase
- size_t phraseSize = src.GetSize();
- for (size_t pos = 0; pos < phraseSize; ++pos) {
- const Word &word = src.GetWord(pos);
- wordID_t arrayId = m_srcVocab->GetWordID(word);
- if (arrayId == m_srcVocab->GetkOOVWordID())
- { // oov
- return false;
- }
- else
- {
- output.SetId(pos, arrayId);
- //cerr << arrayId << " ";
- }
- }
- return true;
+ // looks up the SA vocab ids for the current src phrase
+ size_t phraseSize = src.GetSize();
+ for (size_t pos = 0; pos < phraseSize; ++pos) {
+ const Word &word = src.GetWord(pos);
+ wordID_t arrayId = m_srcVocab->GetWordID(word);
+ if (arrayId == m_srcVocab->GetkOOVWordID()) {
+ // oov
+ return false;
+ } else {
+ output.SetId(pos, arrayId);
+ //cerr << arrayId << " ";
+ }
+ }
+ return true;
}
-pair<float, float> BilingualDynSuffixArray::GetLexicalWeight(const PhrasePair& phrasepair) const
+pair<float, float> BilingualDynSuffixArray::GetLexicalWeight(const PhrasePair& phrasepair) const
{
- //return pair<float, float>(1, 1);
- float srcLexWeight(1.0), trgLexWeight(1.0);
- std::map<pair<wordID_t, wordID_t>, float> targetProbs; // collect sum of target probs given source words
- //const SentenceAlignment& alignment = m_alignments[phrasepair.m_sntIndex];
- const SentenceAlignment& alignment = GetSentenceAlignment(phrasepair.m_sntIndex);
- std::map<pair<wordID_t, wordID_t>, pair<float, float> >::const_iterator itrCache;
- // for each source word
- for(int srcIdx = phrasepair.m_startSource; srcIdx <= phrasepair.m_endSource; ++srcIdx) {
- float srcSumPairProbs(0);
- wordID_t srcWord = m_srcCorpus->at(srcIdx + m_srcSntBreaks[phrasepair.m_sntIndex]); // localIDs
- const std::vector<int>& srcWordAlignments = alignment.alignedList.at(srcIdx);
+ //return pair<float, float>(1, 1);
+ float srcLexWeight(1.0), trgLexWeight(1.0);
+ std::map<pair<wordID_t, wordID_t>, float> targetProbs; // collect sum of target probs given source words
+ //const SentenceAlignment& alignment = m_alignments[phrasepair.m_sntIndex];
+ const SentenceAlignment& alignment = GetSentenceAlignment(phrasepair.m_sntIndex);
+ std::map<pair<wordID_t, wordID_t>, pair<float, float> >::const_iterator itrCache;
+ // for each source word
+ for(int srcIdx = phrasepair.m_startSource; srcIdx <= phrasepair.m_endSource; ++srcIdx) {
+ float srcSumPairProbs(0);
+ wordID_t srcWord = m_srcCorpus->at(srcIdx + m_srcSntBreaks[phrasepair.m_sntIndex]); // localIDs
+ const std::vector<int>& srcWordAlignments = alignment.alignedList.at(srcIdx);
// for each target word aligned to this source word in this alignment
- if(srcWordAlignments.size() == 0) { // get p(NULL|src)
- pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, m_srcVocab->GetkOOVWordID());
- itrCache = m_wordPairCache.find(wordpair);
- if(itrCache == m_wordPairCache.end()) { // if not in cache
- CacheWordProbs(srcWord);
- itrCache = m_wordPairCache.find(wordpair); // search cache again
- }
- CHECK(itrCache != m_wordPairCache.end());
- srcSumPairProbs += itrCache->second.first;
- targetProbs[wordpair] = itrCache->second.second;
- }
- else { // extract p(trg|src)
- for(size_t i = 0; i < srcWordAlignments.size(); ++i) { // for each aligned word
- int trgIdx = srcWordAlignments[i];
- wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]);
- // get probability of this source->target word pair
- pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, trgWord);
- itrCache = m_wordPairCache.find(wordpair);
- if(itrCache == m_wordPairCache.end()) { // if not in cache
+ if(srcWordAlignments.size() == 0) { // get p(NULL|src)
+ pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, m_srcVocab->GetkOOVWordID());
+ itrCache = m_wordPairCache.find(wordpair);
+ if(itrCache == m_wordPairCache.end()) { // if not in cache
+ CacheWordProbs(srcWord);
+ itrCache = m_wordPairCache.find(wordpair); // search cache again
+ }
+ CHECK(itrCache != m_wordPairCache.end());
+ srcSumPairProbs += itrCache->second.first;
+ targetProbs[wordpair] = itrCache->second.second;
+ } else { // extract p(trg|src)
+ for(size_t i = 0; i < srcWordAlignments.size(); ++i) { // for each aligned word
+ int trgIdx = srcWordAlignments[i];
+ wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]);
+ // get probability of this source->target word pair
+ pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, trgWord);
+ itrCache = m_wordPairCache.find(wordpair);
+ if(itrCache == m_wordPairCache.end()) { // if not in cache
CacheWordProbs(srcWord);
- itrCache = m_wordPairCache.find(wordpair); // search cache again
- }
- CHECK(itrCache != m_wordPairCache.end());
- srcSumPairProbs += itrCache->second.first;
- targetProbs[wordpair] = itrCache->second.second;
- }
- }
- float srcNormalizer = srcWordAlignments.size() < 2 ? 1.0 : 1.0 / float(srcWordAlignments.size());
- srcLexWeight *= (srcNormalizer * srcSumPairProbs);
- } // end for each source word
- for(int trgIdx = phrasepair.m_startTarget; trgIdx <= phrasepair.m_endTarget; ++trgIdx) {
- float trgSumPairProbs(0);
- wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]);
- for (std::map<pair<wordID_t, wordID_t>, float>::const_iterator trgItr
- = targetProbs.begin(); trgItr != targetProbs.end(); ++trgItr) {
- if(trgItr->first.second == trgWord)
- trgSumPairProbs += trgItr->second;
+ itrCache = m_wordPairCache.find(wordpair); // search cache again
}
- if(trgSumPairProbs == 0) continue; // currently don't store target-side SA
- int noAligned = alignment.numberAligned.at(trgIdx);
- float trgNormalizer = noAligned < 2 ? 1.0 : 1.0 / float(noAligned);
- trgLexWeight *= (trgNormalizer * trgSumPairProbs);
- }
- // TODO::Need to get p(NULL|trg)
- return pair<float, float>(srcLexWeight, trgLexWeight);
+ CHECK(itrCache != m_wordPairCache.end());
+ srcSumPairProbs += itrCache->second.first;
+ targetProbs[wordpair] = itrCache->second.second;
+ }
+ }
+ float srcNormalizer = srcWordAlignments.size() < 2 ? 1.0 : 1.0 / float(srcWordAlignments.size());
+ srcLexWeight *= (srcNormalizer * srcSumPairProbs);
+ } // end for each source word
+ for(int trgIdx = phrasepair.m_startTarget; trgIdx <= phrasepair.m_endTarget; ++trgIdx) {
+ float trgSumPairProbs(0);
+ wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]);
+ for (std::map<pair<wordID_t, wordID_t>, float>::const_iterator trgItr
+ = targetProbs.begin(); trgItr != targetProbs.end(); ++trgItr) {
+ if(trgItr->first.second == trgWord)
+ trgSumPairProbs += trgItr->second;
+ }
+ if(trgSumPairProbs == 0) continue; // currently don't store target-side SA
+ int noAligned = alignment.numberAligned.at(trgIdx);
+ float trgNormalizer = noAligned < 2 ? 1.0 : 1.0 / float(noAligned);
+ trgLexWeight *= (trgNormalizer * trgSumPairProbs);
+ }
+ // TODO::Need to get p(NULL|trg)
+ return pair<float, float>(srcLexWeight, trgLexWeight);
}
-void BilingualDynSuffixArray::CacheFreqWords() const {
+void BilingualDynSuffixArray::CacheFreqWords() const
+{
std::multimap<int, wordID_t> wordCnts;
// for each source word in vocab
- Vocab::Word2Id::const_iterator it;
+ Vocab::Word2Id::const_iterator it;
for(it = m_srcVocab->VocabStart(); it != m_srcVocab->VocabEnd(); ++it) {
// get its frequency
wordID_t srcWord = it->second;
std::vector<wordID_t> sword(1, srcWord), wrdIndices;
m_srcSA->GetCorpusIndex(&sword, &wrdIndices);
- if(wrdIndices.size() >= 1000) { // min count
+ if(wrdIndices.size() >= 1000) { // min count
wordCnts.insert(make_pair(wrdIndices.size(), srcWord));
}
}
int numSoFar(0);
- std::multimap<int, wordID_t>::reverse_iterator ritr;
- for(ritr = wordCnts.rbegin(); ritr != wordCnts.rend(); ++ritr) {
+ std::multimap<int, wordID_t>::reverse_iterator ritr;
+ for(ritr = wordCnts.rbegin(); ritr != wordCnts.rend(); ++ritr) {
m_freqWordsCached.insert(ritr->second);
CacheWordProbs(ritr->second);
if(++numSoFar == 50) break; // get top counts
}
cerr << "\tCached " << m_freqWordsCached.size() << " source words\n";
}
-void BilingualDynSuffixArray::CacheWordProbs(wordID_t srcWord) const
+void BilingualDynSuffixArray::CacheWordProbs(wordID_t srcWord) const
{
- std::map<wordID_t, int> counts;
- std::vector<wordID_t> sword(1, srcWord), wrdIndices;
- bool ret = m_srcSA->GetCorpusIndex(&sword, &wrdIndices);
- CHECK(ret);
- std::vector<int> sntIndexes = GetSntIndexes(wrdIndices, 1, m_srcSntBreaks);
- float denom(0);
- // for each occurrence of this word
- for(size_t snt = 0; snt < sntIndexes.size(); ++snt) {
- int sntIdx = sntIndexes.at(snt); // get corpus index for sentence
- CHECK(sntIdx != -1);
- int srcWrdSntIdx = wrdIndices.at(snt) - m_srcSntBreaks.at(sntIdx); // get word index in sentence
- const std::vector<int> srcAlg = GetSentenceAlignment(sntIdx).alignedList.at(srcWrdSntIdx); // list of target words for this source word
- if(srcAlg.size() == 0) {
- ++counts[m_srcVocab->GetkOOVWordID()]; // if not alligned then align to NULL word
- ++denom;
- }
- else { //get target words aligned to srcword in this sentence
- for(size_t i=0; i < srcAlg.size(); ++i) {
- wordID_t trgWord = m_trgCorpus->at(srcAlg[i] + m_trgSntBreaks[sntIdx]);
- ++counts[trgWord];
- ++denom;
- }
- }
- }
- // now we've gotten counts of all target words aligned to this source word
- // get probs and cache all pairs
- for(std::map<wordID_t, int>::const_iterator itrCnt = counts.begin();
- itrCnt != counts.end(); ++itrCnt) {
- pair<wordID_t, wordID_t> wordPair = make_pair(srcWord, itrCnt->first);
- float srcTrgPrb = float(itrCnt->second) / float(denom); // gives p(src->trg)
- float trgSrcPrb = float(itrCnt->second) / float(counts.size()); // gives p(trg->src)
- m_wordPairCache[wordPair] = pair<float, float>(srcTrgPrb, trgSrcPrb);
- }
+ std::map<wordID_t, int> counts;
+ std::vector<wordID_t> sword(1, srcWord), wrdIndices;
+ bool ret = m_srcSA->GetCorpusIndex(&sword, &wrdIndices);
+ CHECK(ret);
+ std::vector<int> sntIndexes = GetSntIndexes(wrdIndices, 1, m_srcSntBreaks);
+ float denom(0);
+ // for each occurrence of this word
+ for(size_t snt = 0; snt < sntIndexes.size(); ++snt) {
+ int sntIdx = sntIndexes.at(snt); // get corpus index for sentence
+ CHECK(sntIdx != -1);
+ int srcWrdSntIdx = wrdIndices.at(snt) - m_srcSntBreaks.at(sntIdx); // get word index in sentence
+ const std::vector<int> srcAlg = GetSentenceAlignment(sntIdx).alignedList.at(srcWrdSntIdx); // list of target words for this source word
+ if(srcAlg.size() == 0) {
+ ++counts[m_srcVocab->GetkOOVWordID()]; // if not alligned then align to NULL word
+ ++denom;
+ } else { //get target words aligned to srcword in this sentence
+ for(size_t i=0; i < srcAlg.size(); ++i) {
+ wordID_t trgWord = m_trgCorpus->at(srcAlg[i] + m_trgSntBreaks[sntIdx]);
+ ++counts[trgWord];
+ ++denom;
+ }
+ }
+ }
+ // now we've gotten counts of all target words aligned to this source word
+ // get probs and cache all pairs
+ for(std::map<wordID_t, int>::const_iterator itrCnt = counts.begin();
+ itrCnt != counts.end(); ++itrCnt) {
+ pair<wordID_t, wordID_t> wordPair = make_pair(srcWord, itrCnt->first);
+ float srcTrgPrb = float(itrCnt->second) / float(denom); // gives p(src->trg)
+ float trgSrcPrb = float(itrCnt->second) / float(counts.size()); // gives p(trg->src)
+ m_wordPairCache[wordPair] = pair<float, float>(srcTrgPrb, trgSrcPrb);
+ }
}
-SAPhrase BilingualDynSuffixArray::TrgPhraseFromSntIdx(const PhrasePair& phrasepair) const
+SAPhrase BilingualDynSuffixArray::TrgPhraseFromSntIdx(const PhrasePair& phrasepair) const
{
- // takes sentence indexes and looks up vocab IDs
- SAPhrase phraseIds(phrasepair.GetTargetSize());
- int sntIndex = phrasepair.m_sntIndex;
- int id(-1), pos(0);
- for(int i=phrasepair.m_startTarget; i <= phrasepair.m_endTarget; ++i) { // look up trg words
- id = m_trgCorpus->at(m_trgSntBreaks[sntIndex] + i);
- phraseIds.SetId(pos++, id);
- }
- return phraseIds;
+ // takes sentence indexes and looks up vocab IDs
+ SAPhrase phraseIds(phrasepair.GetTargetSize());
+ int sntIndex = phrasepair.m_sntIndex;
+ int id(-1), pos(0);
+ for(int i=phrasepair.m_startTarget; i <= phrasepair.m_endTarget; ++i) { // look up trg words
+ id = m_trgCorpus->at(m_trgSntBreaks[sntIndex] + i);
+ phraseIds.SetId(pos++, id);
+ }
+ return phraseIds;
}
-
+
TargetPhrase* BilingualDynSuffixArray::GetMosesFactorIDs(const SAPhrase& phrase, const Phrase& sourcePhrase) const
{
- TargetPhrase* targetPhrase = new TargetPhrase();
- for(size_t i=0; i < phrase.words.size(); ++i) { // look up trg words
- Word& word = m_trgVocab->GetWord( phrase.words[i]);
- CHECK(word != m_trgVocab->GetkOOVWord());
- targetPhrase->AddWord(word);
- }
- targetPhrase->SetSourcePhrase(sourcePhrase);
- // scoring
- return targetPhrase;
+ TargetPhrase* targetPhrase = new TargetPhrase();
+ for(size_t i=0; i < phrase.words.size(); ++i) { // look up trg words
+ Word& word = m_trgVocab->GetWord( phrase.words[i]);
+ CHECK(word != m_trgVocab->GetkOOVWord());
+ targetPhrase->AddWord(word);
+ }
+ targetPhrase->SetSourcePhrase(sourcePhrase);
+ // scoring
+ return targetPhrase;
}
-void BilingualDynSuffixArray::GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> > & target) const
+void BilingualDynSuffixArray::GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> > & target) const
{
//cerr << "phrase is \"" << src << endl;
- size_t sourceSize = src.GetSize();
- SAPhrase localIDs(sourceSize);
- if(!GetLocalVocabIDs(src, localIDs)) return;
- float totalTrgPhrases(0);
- std::map<SAPhrase, int> phraseCounts;
- //std::map<SAPhrase, PhrasePair> phraseColl; // (one of) the word indexes this phrase was taken from
- std::map<SAPhrase, pair<float, float> > lexicalWeights;
- std::map<SAPhrase, pair<float, float> >::iterator itrLexW;
- std::vector<unsigned> wrdIndices;
- // extract sentence IDs from SA and return rightmost index of phrases
- if(!m_srcSA->GetCorpusIndex(&(localIDs.words), &wrdIndices)) return;
+ size_t sourceSize = src.GetSize();
+ SAPhrase localIDs(sourceSize);
+ if(!GetLocalVocabIDs(src, localIDs)) return;
+ float totalTrgPhrases(0);
+ std::map<SAPhrase, int> phraseCounts;
+ //std::map<SAPhrase, PhrasePair> phraseColl; // (one of) the word indexes this phrase was taken from
+ std::map<SAPhrase, pair<float, float> > lexicalWeights;
+ std::map<SAPhrase, pair<float, float> >::iterator itrLexW;
+ std::vector<unsigned> wrdIndices;
+ // extract sentence IDs from SA and return rightmost index of phrases
+ if(!m_srcSA->GetCorpusIndex(&(localIDs.words), &wrdIndices)) return;
SampleSelection(wrdIndices);
- std::vector<int> sntIndexes = GetSntIndexes(wrdIndices, sourceSize, m_srcSntBreaks);
- // for each sentence with this phrase
- for(size_t snt = 0; snt < sntIndexes.size(); ++snt) {
- std::vector<PhrasePair*> phrasePairs; // to store all phrases possible from current sentence
- int sntIndex = sntIndexes.at(snt); // get corpus index for sentence
- if(sntIndex == -1) continue; // bad flag set by GetSntIndexes()
- ExtractPhrases(sntIndex, wrdIndices[snt], sourceSize, phrasePairs);
- //cerr << "extracted " << phrasePairs.size() << endl;
- totalTrgPhrases += phrasePairs.size(); // keep track of count of each extracted phrase pair
- std::vector<PhrasePair*>::iterator iterPhrasePair;
- for (iterPhrasePair = phrasePairs.begin(); iterPhrasePair != phrasePairs.end(); ++iterPhrasePair) {
- SAPhrase phrase = TrgPhraseFromSntIdx(**iterPhrasePair);
- phraseCounts[phrase]++; // count each unique phrase
- // NOTE::Correct but slow to extract lexical weight here. could do
+ std::vector<int> sntIndexes = GetSntIndexes(wrdIndices, sourceSize, m_srcSntBreaks);
+ // for each sentence with this phrase
+ for(size_t snt = 0; snt < sntIndexes.size(); ++snt) {
+ std::vector<PhrasePair*> phrasePairs; // to store all phrases possible from current sentence
+ int sntIndex = sntIndexes.at(snt); // get corpus index for sentence
+ if(sntIndex == -1) continue; // bad flag set by GetSntIndexes()
+ ExtractPhrases(sntIndex, wrdIndices[snt], sourceSize, phrasePairs);
+ //cerr << "extracted " << phrasePairs.size() << endl;
+ totalTrgPhrases += phrasePairs.size(); // keep track of count of each extracted phrase pair
+ std::vector<PhrasePair*>::iterator iterPhrasePair;
+ for (iterPhrasePair = phrasePairs.begin(); iterPhrasePair != phrasePairs.end(); ++iterPhrasePair) {
+ SAPhrase phrase = TrgPhraseFromSntIdx(**iterPhrasePair);
+ phraseCounts[phrase]++; // count each unique phrase
+ // NOTE::Correct but slow to extract lexical weight here. could do
// it later for only the top phrases chosen by phrase prob p(e|f)
- pair<float, float> lexWeight = GetLexicalWeight(**iterPhrasePair); // get lexical weighting for this phrase pair
- itrLexW = lexicalWeights.find(phrase); // check if phrase already has lexical weight attached
- if((itrLexW != lexicalWeights.end()) && (itrLexW->second.first < lexWeight.first))
- itrLexW->second = lexWeight; // if this lex weight is greater save it
- else lexicalWeights[phrase] = lexWeight; // else save
- }
- // done with sentence. delete SA phrase pairs
- RemoveAllInColl(phrasePairs);
- } // done with all sentences
- // convert to moses phrase pairs
- std::map<SAPhrase, int>::const_iterator iterPhrases;
- std::multimap<Scores, const SAPhrase*, ScoresComp> phraseScores (*m_scoreCmp);
- // get scores of all phrases
- for(iterPhrases = phraseCounts.begin(); iterPhrases != phraseCounts.end(); ++iterPhrases) {
- float trg2SrcMLE = float(iterPhrases->second) / totalTrgPhrases;
- itrLexW = lexicalWeights.find(iterPhrases->first);
- CHECK(itrLexW != lexicalWeights.end());
- Scores scoreVector(3);
- scoreVector[0] = trg2SrcMLE;
- scoreVector[1] = itrLexW->second.first;
- scoreVector[2] = 2.718; // exp(1);
- phraseScores.insert(make_pair(scoreVector, &iterPhrases->first));
- }
- // return top scoring phrases
- std::multimap<Scores, const SAPhrase*, ScoresComp>::reverse_iterator ritr;
- for(ritr = phraseScores.rbegin(); ritr != phraseScores.rend(); ++ritr) {
- Scores scoreVector = ritr->first;
- TargetPhrase *targetPhrase = GetMosesFactorIDs(*ritr->second, src);
- target.push_back(make_pair( scoreVector, targetPhrase));
- if(target.size() == m_maxSampleSize) break;
- }
+ pair<float, float> lexWeight = GetLexicalWeight(**iterPhrasePair); // get lexical weighting for this phrase pair
+ itrLexW = lexicalWeights.find(phrase); // check if phrase already has lexical weight attached
+ if((itrLexW != lexicalWeights.end()) && (itrLexW->second.first < lexWeight.first))
+ itrLexW->second = lexWeight; // if this lex weight is greater save it
+ else lexicalWeights[phrase] = lexWeight; // else save
+ }
+ // done with sentence. delete SA phrase pairs
+ RemoveAllInColl(phrasePairs);
+ } // done with all sentences
+ // convert to moses phrase pairs
+ std::map<SAPhrase, int>::const_iterator iterPhrases;
+ std::multimap<Scores, const SAPhrase*, ScoresComp> phraseScores (*m_scoreCmp);
+ // get scores of all phrases
+ for(iterPhrases = phraseCounts.begin(); iterPhrases != phraseCounts.end(); ++iterPhrases) {
+ float trg2SrcMLE = float(iterPhrases->second) / totalTrgPhrases;
+ itrLexW = lexicalWeights.find(iterPhrases->first);
+ CHECK(itrLexW != lexicalWeights.end());
+ Scores scoreVector(3);
+ scoreVector[0] = trg2SrcMLE;
+ scoreVector[1] = itrLexW->second.first;
+ scoreVector[2] = 2.718; // exp(1);
+ phraseScores.insert(make_pair(scoreVector, &iterPhrases->first));
+ }
+ // return top scoring phrases
+ std::multimap<Scores, const SAPhrase*, ScoresComp>::reverse_iterator ritr;
+ for(ritr = phraseScores.rbegin(); ritr != phraseScores.rend(); ++ritr) {
+ Scores scoreVector = ritr->first;
+ TargetPhrase *targetPhrase = GetMosesFactorIDs(*ritr->second, src);
+ target.push_back(make_pair( scoreVector, targetPhrase));
+ if(target.size() == m_maxSampleSize) break;
+ }
}
-std::vector<int> BilingualDynSuffixArray::GetSntIndexes(std::vector<unsigned>& wrdIndices,
- const int sourceSize, const std::vector<unsigned>& sntBreaks) const
+std::vector<int> BilingualDynSuffixArray::GetSntIndexes(std::vector<unsigned>& wrdIndices,
+ const int sourceSize, const std::vector<unsigned>& sntBreaks) const
{
- std::vector<unsigned>::const_iterator vit;
- std::vector<int> sntIndexes;
- for(size_t i=0; i < wrdIndices.size(); ++i) {
- vit = std::upper_bound(sntBreaks.begin(), sntBreaks.end(), wrdIndices[i]);
- int index = int(vit - sntBreaks.begin()) - 1;
- // check for phrases that cross sentence boundaries
- if(wrdIndices[i] - sourceSize + 1 < sntBreaks.at(index))
- sntIndexes.push_back(-1); // set bad flag
- else
- sntIndexes.push_back(index); // store the index of the sentence in the corpus
- }
- return sntIndexes;
+ std::vector<unsigned>::const_iterator vit;
+ std::vector<int> sntIndexes;
+ for(size_t i=0; i < wrdIndices.size(); ++i) {
+ vit = std::upper_bound(sntBreaks.begin(), sntBreaks.end(), wrdIndices[i]);
+ int index = int(vit - sntBreaks.begin()) - 1;
+ // check for phrases that cross sentence boundaries
+ if(wrdIndices[i] - sourceSize + 1 < sntBreaks.at(index))
+ sntIndexes.push_back(-1); // set bad flag
+ else
+ sntIndexes.push_back(index); // store the index of the sentence in the corpus
+ }
+ return sntIndexes;
}
int BilingualDynSuffixArray::SampleSelection(std::vector<unsigned>& sample,
- int sampleSize) const
+ int sampleSize) const
{
// only use top 'sampleSize' number of samples
- if(sample.size() > (size_t)sampleSize)
- sample.erase(sample.begin()+sampleSize, sample.end());
- return sample.size();
+ if(sample.size() > (size_t)sampleSize)
+ sample.erase(sample.begin()+sampleSize, sample.end());
+ return sample.size();
}
-void BilingualDynSuffixArray::addSntPair(string& source, string& target, string& alignment) {
+void BilingualDynSuffixArray::addSntPair(string& source, string& target, string& alignment)
+{
vuint_t srcFactor, trgFactor;
cerr << "source, target, alignment = " << source << ", " << target << ", " << alignment << endl;
- const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
+ const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
const unsigned oldSrcCrpSize = m_srcCorpus->size(), oldTrgCrpSize = m_trgCorpus->size();
cerr << "old source corpus size = " << oldSrcCrpSize << "\told target size = " << oldTrgCrpSize << endl;
Phrase sphrase(ARRAY_SIZE_INCR);
@@ -511,7 +510,7 @@ void BilingualDynSuffixArray::addSntPair(string& source, string& target, string&
cerr << "srcFactor[" << (srcFactor.size() - 1) << "] = " << srcFactor.back() << endl;
m_srcCorpus->push_back(srcFactor.back()); // add word to corpus
}
- m_srcSntBreaks.push_back(oldSrcCrpSize); // former end of corpus is index of new sentence
+ m_srcSntBreaks.push_back(oldSrcCrpSize); // former end of corpus is index of new sentence
m_srcVocab->MakeClosed();
Phrase tphrase(ARRAY_SIZE_INCR);
tphrase.CreateFromString(Output, m_outputFactors, target, factorDelimiter, NULL);
@@ -534,16 +533,17 @@ void BilingualDynSuffixArray::addSntPair(string& source, string& target, string&
LoadRawAlignments(alignment);
m_trgVocab->MakeClosed();
//for(size_t i=0; i < sphrase.GetSize(); ++i)
- //ClearWordInCache(sIDs[i]);
-
+ //ClearWordInCache(sIDs[i]);
+
}
-void BilingualDynSuffixArray::ClearWordInCache(wordID_t srcWord) {
+void BilingualDynSuffixArray::ClearWordInCache(wordID_t srcWord)
+{
if(m_freqWordsCached.find(srcWord) != m_freqWordsCached.end())
return;
- std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> >::iterator it,
- first, last;
+ std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> >::iterator it,
+ first, last;
for(it = m_wordPairCache.begin(); it != m_wordPairCache.end(); ++it) {
- if(it->first.first == srcWord) { // all source words grouped
+ if(it->first.first == srcWord) { // all source words grouped
first = it; // copy first entry of srcWord
last = it++;
while(it != m_wordPairCache.end() && (it->first.first == srcWord)) {
@@ -553,80 +553,77 @@ void BilingualDynSuffixArray::ClearWordInCache(wordID_t srcWord) {
m_wordPairCache.erase(first, last);
}
}
-SentenceAlignment::SentenceAlignment(int sntIndex, int sourceSize, int targetSize)
- :m_sntIndex(sntIndex)
- ,numberAligned(targetSize, 0)
- ,alignedList(sourceSize)
+SentenceAlignment::SentenceAlignment(int sntIndex, int sourceSize, int targetSize)
+ :m_sntIndex(sntIndex)
+ ,numberAligned(targetSize, 0)
+ ,alignedList(sourceSize)
{
- for(int i=0; i < sourceSize; ++i) {
- std::vector<int> trgWrd;
- alignedList[i] = trgWrd;
- }
+ for(int i=0; i < sourceSize; ++i) {
+ std::vector<int> trgWrd;
+ alignedList[i] = trgWrd;
+ }
}
bool SentenceAlignment::Extract(int maxPhraseLength, std::vector<PhrasePair*> &ret, int startSource, int endSource) const
{
- // foreign = target, F=T
- // english = source, E=S
- int countTarget = numberAligned.size();
-
- int minTarget = 9999;
- int maxTarget = -1;
- std::vector< int > usedTarget = numberAligned;
- for(int sourcePos = startSource; sourcePos <= endSource; sourcePos++)
- {
- for(int ind=0; ind < (int)alignedList[sourcePos].size();ind++)
- {
- int targetPos = alignedList[sourcePos][ind];
- // cout << "point (" << targetPos << ", " << sourcePos << ")\n";
- if (targetPos<minTarget) { minTarget = targetPos; }
- if (targetPos>maxTarget) { maxTarget = targetPos; }
- usedTarget[ targetPos ]--;
- } // for(int ind=0;ind<sentence
- } // for(int sourcePos=startSource
-
- // cout << "f projected ( " << minTarget << "-" << maxTarget << ", " << startSource << "," << endSource << ")\n";
-
- if (maxTarget >= 0 && // aligned to any foreign words at all
- maxTarget-minTarget < maxPhraseLength)
- { // foreign phrase within limits
-
- // check if foreign words are aligned to out of bound english words
- bool out_of_bounds = false;
- for(int targetPos=minTarget; targetPos <= maxTarget && !out_of_bounds; targetPos++)
- {
- if (usedTarget[targetPos]>0)
- {
- // cout << "ouf of bounds: " << targetPos << "\n";
- out_of_bounds = true;
- }
- }
-
- // cout << "doing if for ( " << minTarget << "-" << maxTarget << ", " << startSource << "," << endSource << ")\n";
- if (!out_of_bounds)
- {
- // start point of foreign phrase may retreat over unaligned
- for(int startTarget = minTarget;
- (startTarget >= 0 &&
- startTarget > maxTarget-maxPhraseLength && // within length limit
- (startTarget==minTarget || numberAligned[startTarget]==0)); // unaligned
- startTarget--)
- {
- // end point of foreign phrase may advance over unaligned
- for (int endTarget=maxTarget;
- (endTarget<countTarget &&
- endTarget<startTarget+maxPhraseLength && // within length limit
- (endTarget==maxTarget || numberAligned[endTarget]==0)); // unaligned
- endTarget++)
- {
- PhrasePair *phrasePair = new PhrasePair(startTarget,endTarget,startSource,endSource, m_sntIndex);
- ret.push_back(phrasePair);
- } // for (int endTarget=maxTarget;
- } // for(int startTarget=minTarget;
- } // if (!out_of_bounds)
- } // if (maxTarget >= 0 &&
- return (ret.size() > 0);
-
+ // foreign = target, F=T
+ // english = source, E=S
+ int countTarget = numberAligned.size();
+
+ int minTarget = 9999;
+ int maxTarget = -1;
+ std::vector< int > usedTarget = numberAligned;
+ for(int sourcePos = startSource; sourcePos <= endSource; sourcePos++) {
+ for(int ind=0; ind < (int)alignedList[sourcePos].size(); ind++) {
+ int targetPos = alignedList[sourcePos][ind];
+ // cout << "point (" << targetPos << ", " << sourcePos << ")\n";
+ if (targetPos<minTarget) {
+ minTarget = targetPos;
+ }
+ if (targetPos>maxTarget) {
+ maxTarget = targetPos;
+ }
+ usedTarget[ targetPos ]--;
+ } // for(int ind=0;ind<sentence
+ } // for(int sourcePos=startSource
+
+ // cout << "f projected ( " << minTarget << "-" << maxTarget << ", " << startSource << "," << endSource << ")\n";
+
+ if (maxTarget >= 0 && // aligned to any foreign words at all
+ maxTarget-minTarget < maxPhraseLength) {
+ // foreign phrase within limits
+
+ // check if foreign words are aligned to out of bound english words
+ bool out_of_bounds = false;
+ for(int targetPos=minTarget; targetPos <= maxTarget && !out_of_bounds; targetPos++) {
+ if (usedTarget[targetPos]>0) {
+ // cout << "ouf of bounds: " << targetPos << "\n";
+ out_of_bounds = true;
+ }
+ }
+
+ // cout << "doing if for ( " << minTarget << "-" << maxTarget << ", " << startSource << "," << endSource << ")\n";
+ if (!out_of_bounds) {
+ // start point of foreign phrase may retreat over unaligned
+ for(int startTarget = minTarget;
+ (startTarget >= 0 &&
+ startTarget > maxTarget-maxPhraseLength && // within length limit
+ (startTarget==minTarget || numberAligned[startTarget]==0)); // unaligned
+ startTarget--) {
+ // end point of foreign phrase may advance over unaligned
+ for (int endTarget=maxTarget;
+ (endTarget<countTarget &&
+ endTarget<startTarget+maxPhraseLength && // within length limit
+ (endTarget==maxTarget || numberAligned[endTarget]==0)); // unaligned
+ endTarget++) {
+ PhrasePair *phrasePair = new PhrasePair(startTarget,endTarget,startSource,endSource, m_sntIndex);
+ ret.push_back(phrasePair);
+ } // for (int endTarget=maxTarget;
+ } // for(int startTarget=minTarget;
+ } // if (!out_of_bounds)
+ } // if (maxTarget >= 0 &&
+ return (ret.size() > 0);
+
}
}// end namepsace
diff --git a/moses/TranslationModel/BilingualDynSuffixArray.h b/moses/TranslationModel/BilingualDynSuffixArray.h
index 5dda1e274..08637d095 100644
--- a/moses/TranslationModel/BilingualDynSuffixArray.h
+++ b/moses/TranslationModel/BilingualDynSuffixArray.h
@@ -1,7 +1,7 @@
#ifndef moses_BilingualDynSuffixArray_h
#define moses_BilingualDynSuffixArray_h
-#include "DynSuffixArray.h"
+#include "DynSuffixArray.h"
#include "moses/TranslationModel/DynSAInclude/vocab.h"
#include "moses/TranslationModel/DynSAInclude/types.h"
#include "moses/TranslationModel/DynSAInclude/utils.h"
@@ -9,26 +9,27 @@
#include "moses/FactorTypeSet.h"
#include "moses/TargetPhrase.h"
-namespace Moses {
+namespace Moses
+{
/** @todo ask Abbey Levenberg
*/
class SAPhrase
{
public:
- std::vector<wordID_t> words;
-
- SAPhrase(size_t phraseSize)
- :words(phraseSize)
- {}
-
- void SetId(size_t pos, wordID_t id)
- {
+ std::vector<wordID_t> words;
+
+ SAPhrase(size_t phraseSize)
+ :words(phraseSize)
+ {}
+
+ void SetId(size_t pos, wordID_t id) {
CHECK(pos < words.size());
- words[pos] = id;
- }
- bool operator<(const SAPhrase& phr2) const
- { return words < phr2.words; }
+ words[pos] = id;
+ }
+ bool operator<(const SAPhrase& phr2) const {
+ return words < phr2.words;
+ }
};
/** @todo ask Abbey Levenberg
@@ -36,42 +37,44 @@ public:
class PhrasePair
{
public:
- int m_startTarget, m_endTarget, m_startSource, m_endSource, m_sntIndex;
- PhrasePair(int startTarget, int endTarget, int startSource, int endSource, int sntIndex)
- : m_startTarget(startTarget)
- , m_endTarget(endTarget)
- , m_startSource(startSource)
- , m_endSource(endSource)
- , m_sntIndex(sntIndex)
- {}
-
- size_t GetTargetSize() const
- { return m_endTarget - m_startTarget + 1; }
+ int m_startTarget, m_endTarget, m_startSource, m_endSource, m_sntIndex;
+ PhrasePair(int startTarget, int endTarget, int startSource, int endSource, int sntIndex)
+ : m_startTarget(startTarget)
+ , m_endTarget(endTarget)
+ , m_startSource(startSource)
+ , m_endSource(endSource)
+ , m_sntIndex(sntIndex)
+ {}
+
+ size_t GetTargetSize() const {
+ return m_endTarget - m_startTarget + 1;
+ }
};
-
+
/** @todo ask Abbey Levenberg
*/
-class SentenceAlignment
+class SentenceAlignment
{
public:
- SentenceAlignment(int sntIndex, int sourceSize, int targetSize);
- int m_sntIndex;
- std::vector<wordID_t>* trgSnt;
- std::vector<wordID_t>* srcSnt;
- std::vector<int> numberAligned;
- std::vector< std::vector<int> > alignedList;
- bool Extract(int maxPhraseLength, std::vector<PhrasePair*> &ret, int startSource, int endSource) const;
+ SentenceAlignment(int sntIndex, int sourceSize, int targetSize);
+ int m_sntIndex;
+ std::vector<wordID_t>* trgSnt;
+ std::vector<wordID_t>* srcSnt;
+ std::vector<int> numberAligned;
+ std::vector< std::vector<int> > alignedList;
+ bool Extract(int maxPhraseLength, std::vector<PhrasePair*> &ret, int startSource, int endSource) const;
};
-class ScoresComp {
-public:
+class ScoresComp
+{
+public:
ScoresComp(const std::vector<float>& weights): m_weights(weights) {}
- bool operator()(const Scores& s1, const Scores& s2) const {
+ bool operator()(const Scores& s1, const Scores& s2) const {
return s1[0] < s2[0]; // just p(e|f) as approximation
/*float score1(0), score2(0);
int idx1(0), idx2(0);
- for (Scores::const_iterator itr = s1.begin();
+ for (Scores::const_iterator itr = s1.begin();
itr != s1.end(); ++itr) {
- score1 += log(*itr * m_weights.at(idx1++));
+ score1 += log(*itr * m_weights.at(idx1++));
}
for (Scores::const_iterator itr = s2.begin();
itr != s2.end(); ++itr) {
@@ -79,78 +82,77 @@ public:
}
return score1 < score2;*/
}
-private:
+private:
const std::vector<float>& m_weights;
};
-
+
/** @todo ask Abbey Levenberg
*/
-class BilingualDynSuffixArray {
-public:
- BilingualDynSuffixArray();
- ~BilingualDynSuffixArray();
- bool Load( const std::vector<FactorType>& inputFactors,
- const std::vector<FactorType>& outputTactors,
- std::string source, std::string target, std::string alignments,
- const std::vector<float> &weight);
- bool LoadTM( const std::vector<FactorType>& inputFactors,
- const std::vector<FactorType>& outputTactors,
- std::string source, std::string target, std::string alignments,
- const std::vector<float> &weight);
- void GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> >& target) const;
+class BilingualDynSuffixArray
+{
+public:
+ BilingualDynSuffixArray();
+ ~BilingualDynSuffixArray();
+ bool Load( const std::vector<FactorType>& inputFactors,
+ const std::vector<FactorType>& outputTactors,
+ std::string source, std::string target, std::string alignments,
+ const std::vector<float> &weight);
+ bool LoadTM( const std::vector<FactorType>& inputFactors,
+ const std::vector<FactorType>& outputTactors,
+ std::string source, std::string target, std::string alignments,
+ const std::vector<float> &weight);
+ void GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> >& target) const;
void addSntPair(string& source, string& target, string& alignment);
private:
- DynSuffixArray* m_srcSA;
- DynSuffixArray* m_trgSA;
- std::vector<wordID_t>* m_srcCorpus;
- std::vector<wordID_t>* m_trgCorpus;
+ DynSuffixArray* m_srcSA;
+ DynSuffixArray* m_trgSA;
+ std::vector<wordID_t>* m_srcCorpus;
+ std::vector<wordID_t>* m_trgCorpus;
std::vector<FactorType> m_inputFactors;
std::vector<FactorType> m_outputFactors;
- std::vector<unsigned> m_srcSntBreaks, m_trgSntBreaks;
+ std::vector<unsigned> m_srcSntBreaks, m_trgSntBreaks;
- Vocab* m_srcVocab, *m_trgVocab;
- ScoresComp* m_scoreCmp;
+ Vocab* m_srcVocab, *m_trgVocab;
+ ScoresComp* m_scoreCmp;
- std::vector<SentenceAlignment> m_alignments;
- std::vector<std::vector<short> > m_rawAlignments;
+ std::vector<SentenceAlignment> m_alignments;
+ std::vector<std::vector<short> > m_rawAlignments;
- mutable std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> > m_wordPairCache;
+ mutable std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> > m_wordPairCache;
mutable std::set<wordID_t> m_freqWordsCached;
- const size_t m_maxPhraseLength, m_maxSampleSize;
-
- int LoadCorpus(FactorDirection direction, InputFileStream&, const std::vector<FactorType>& factors,
- std::vector<wordID_t>&, std::vector<wordID_t>&,
- Vocab*);
- int LoadAlignments(InputFileStream& aligs);
- int LoadRawAlignments(InputFileStream& aligs);
- int LoadRawAlignments(string& aligs);
-
- bool ExtractPhrases(const int&, const int&, const int&, std::vector<PhrasePair*>&, bool=false) const;
- SentenceAlignment GetSentenceAlignment(const int, bool=false) const;
- int SampleSelection(std::vector<unsigned>&, int = 300) const;
-
- std::vector<int> GetSntIndexes(std::vector<unsigned>&, int, const std::vector<unsigned>&) const;
- TargetPhrase* GetMosesFactorIDs(const SAPhrase&, const Phrase& sourcePhrase) const;
- SAPhrase TrgPhraseFromSntIdx(const PhrasePair&) const;
- bool GetLocalVocabIDs(const Phrase&, SAPhrase &) const;
- void CacheWordProbs(wordID_t) const;
+ const size_t m_maxPhraseLength, m_maxSampleSize;
+
+ int LoadCorpus(FactorDirection direction, InputFileStream&, const std::vector<FactorType>& factors,
+ std::vector<wordID_t>&, std::vector<wordID_t>&,
+ Vocab*);
+ int LoadAlignments(InputFileStream& aligs);
+ int LoadRawAlignments(InputFileStream& aligs);
+ int LoadRawAlignments(string& aligs);
+
+ bool ExtractPhrases(const int&, const int&, const int&, std::vector<PhrasePair*>&, bool=false) const;
+ SentenceAlignment GetSentenceAlignment(const int, bool=false) const;
+ int SampleSelection(std::vector<unsigned>&, int = 300) const;
+
+ std::vector<int> GetSntIndexes(std::vector<unsigned>&, int, const std::vector<unsigned>&) const;
+ TargetPhrase* GetMosesFactorIDs(const SAPhrase&, const Phrase& sourcePhrase) const;
+ SAPhrase TrgPhraseFromSntIdx(const PhrasePair&) const;
+ bool GetLocalVocabIDs(const Phrase&, SAPhrase &) const;
+ void CacheWordProbs(wordID_t) const;
void CacheFreqWords() const;
void ClearWordInCache(wordID_t);
- std::pair<float, float> GetLexicalWeight(const PhrasePair&) const;
-
- int GetSourceSentenceSize(size_t sentenceId) const
- {
- return (sentenceId==m_srcSntBreaks.size()-1) ?
- m_srcCorpus->size() - m_srcSntBreaks.at(sentenceId) :
- m_srcSntBreaks.at(sentenceId+1) - m_srcSntBreaks.at(sentenceId);
- }
- int GetTargetSentenceSize(size_t sentenceId) const
- {
- return (sentenceId==m_trgSntBreaks.size()-1) ?
- m_trgCorpus->size() - m_trgSntBreaks.at(sentenceId) :
- m_trgSntBreaks.at(sentenceId+1) - m_trgSntBreaks.at(sentenceId);
- }
+ std::pair<float, float> GetLexicalWeight(const PhrasePair&) const;
+
+ int GetSourceSentenceSize(size_t sentenceId) const {
+ return (sentenceId==m_srcSntBreaks.size()-1) ?
+ m_srcCorpus->size() - m_srcSntBreaks.at(sentenceId) :
+ m_srcSntBreaks.at(sentenceId+1) - m_srcSntBreaks.at(sentenceId);
+ }
+ int GetTargetSentenceSize(size_t sentenceId) const {
+ return (sentenceId==m_trgSntBreaks.size()-1) ?
+ m_trgCorpus->size() - m_trgSntBreaks.at(sentenceId) :
+ m_trgSntBreaks.at(sentenceId+1) - m_trgSntBreaks.at(sentenceId);
+ }
};
} // end namespace
#endif
diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.cpp b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.cpp
index 4f0f6c2cd..e8d2f734a 100644
--- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.cpp
+++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.h b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.h
index 61982299f..3c3f468c2 100644
--- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.h
+++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -35,12 +35,12 @@ class WordsRange;
*/
class ChartRuleLookupManagerCYKPlus : public ChartRuleLookupManager
{
- public:
+public:
ChartRuleLookupManagerCYKPlus(const InputType &sentence,
const ChartCellCollectionBase &cellColl)
: ChartRuleLookupManager(sentence, cellColl) {}
- protected:
+protected:
void AddCompletedRule(
const DottedRule &dottedRule,
const TargetPhraseCollection &tpc,
diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.cpp b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.cpp
index ce6a1d30d..c0c1986f4 100644
--- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.cpp
+++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.cpp
@@ -75,19 +75,19 @@ void ChartRuleLookupManagerMemory::GetChartRuleCollection(
// get list of all rules that apply to spans at same starting position
DottedRuleColl &dottedRuleCol = *m_dottedRuleColls[range.GetStartPos()];
const DottedRuleList &expandableDottedRuleList = dottedRuleCol.GetExpandableDottedRuleList();
-
+
const ChartCellLabel &sourceWordLabel = GetSourceAt(absEndPos);
// loop through the rules
- // (note that expandableDottedRuleList can be expanded as the loop runs
+ // (note that expandableDottedRuleList can be expanded as the loop runs
// through calls to ExtendPartialRuleApplication())
for (size_t ind = 0; ind < expandableDottedRuleList.size(); ++ind) {
// rule we are about to extend
const DottedRuleInMemory &prevDottedRule = *expandableDottedRuleList[ind];
// we will now try to extend it, starting after where it ended
size_t startPos = prevDottedRule.IsRoot()
- ? range.GetStartPos()
- : prevDottedRule.GetWordsRange().GetEndPos() + 1;
+ ? range.GetStartPos()
+ : prevDottedRule.GetWordsRange().GetEndPos() + 1;
// search for terminal symbol
// (if only one more word position needs to be covered)
@@ -100,15 +100,15 @@ void ChartRuleLookupManagerMemory::GetChartRuleCollection(
// if we found a new rule -> create it and add it to the list
if (node != NULL) {
- // create the rule
+ // create the rule
#ifdef USE_BOOST_POOL
DottedRuleInMemory *dottedRule = m_dottedRulePool.malloc();
new (dottedRule) DottedRuleInMemory(*node, sourceWordLabel,
prevDottedRule);
#else
DottedRuleInMemory *dottedRule = new DottedRuleInMemory(*node,
- sourceWordLabel,
- prevDottedRule);
+ sourceWordLabel,
+ prevDottedRule);
#endif
dottedRuleCol.Add(relEndPos+1, dottedRule);
}
@@ -134,9 +134,7 @@ void ChartRuleLookupManagerMemory::GetChartRuleCollection(
// word.
endPos = absEndPos - 1;
stackInd = relEndPos;
- }
- else
- {
+ } else {
endPos = absEndPos;
stackInd = relEndPos + 1;
}
@@ -208,7 +206,7 @@ void ChartRuleLookupManagerMemory::ExtendPartialRuleApplication(
// We'll do whichever minimises the number of lookups:
if (numCombinations <= numChildren*2) {
- // loop over possible source non-terminal labels (as found in input tree)
+ // loop over possible source non-terminal labels (as found in input tree)
NonTerminalSet::const_iterator p = sourceNonTerms.begin();
NonTerminalSet::const_iterator sEnd = sourceNonTerms.end();
for (; p != sEnd; ++p) {
@@ -235,14 +233,12 @@ void ChartRuleLookupManagerMemory::ExtendPartialRuleApplication(
new (rule) DottedRuleInMemory(*child, cellLabel, prevDottedRule);
#else
DottedRuleInMemory *rule = new DottedRuleInMemory(*child, cellLabel,
- prevDottedRule);
+ prevDottedRule);
#endif
dottedRuleColl.Add(stackInd, rule);
}
}
- }
- else
- {
+ } else {
// loop over possible expansions of the rule
PhraseDictionaryNodeMemory::NonTerminalMap::const_iterator p;
PhraseDictionaryNodeMemory::NonTerminalMap::const_iterator end =
@@ -267,7 +263,7 @@ void ChartRuleLookupManagerMemory::ExtendPartialRuleApplication(
new (rule) DottedRuleInMemory(child, *cellLabel, prevDottedRule);
#else
DottedRuleInMemory *rule = new DottedRuleInMemory(child, *cellLabel,
- prevDottedRule);
+ prevDottedRule);
#endif
dottedRuleColl.Add(stackInd, rule);
}
diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.h b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.h
index 784e1c70d..74bc7d253 100644
--- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.h
+++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemory.h
@@ -44,7 +44,7 @@ class WordsRange;
//! Implementation of ChartRuleLookupManager for in-memory rule tables.
class ChartRuleLookupManagerMemory : public ChartRuleLookupManagerCYKPlus
{
- public:
+public:
ChartRuleLookupManagerMemory(const InputType &sentence,
const ChartCellCollectionBase &cellColl,
const PhraseDictionaryMemory &ruleTable);
@@ -55,7 +55,7 @@ class ChartRuleLookupManagerMemory : public ChartRuleLookupManagerCYKPlus
const WordsRange &range,
ChartParserCallback &outColl);
- private:
+private:
void ExtendPartialRuleApplication(
const DottedRuleInMemory &prevDottedRule,
size_t startPos,
diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp
index 4ad60eb43..412840782 100644
--- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp
+++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp
@@ -75,17 +75,17 @@ void ChartRuleLookupManagerMemoryPerSentence::GetChartRuleCollection(
// get list of all rules that apply to spans at same starting position
DottedRuleColl &dottedRuleCol = *m_dottedRuleColls[range.GetStartPos()];
const DottedRuleList &expandableDottedRuleList = dottedRuleCol.GetExpandableDottedRuleList();
-
+
// loop through the rules
- // (note that expandableDottedRuleList can be expanded as the loop runs
+ // (note that expandableDottedRuleList can be expanded as the loop runs
// through calls to ExtendPartialRuleApplication())
for (size_t ind = 0; ind < expandableDottedRuleList.size(); ++ind) {
// rule we are about to extend
const DottedRuleInMemory &prevDottedRule = *expandableDottedRuleList[ind];
// we will now try to extend it, starting after where it ended
size_t startPos = prevDottedRule.IsRoot()
- ? range.GetStartPos()
- : prevDottedRule.GetWordsRange().GetEndPos() + 1;
+ ? range.GetStartPos()
+ : prevDottedRule.GetWordsRange().GetEndPos() + 1;
// search for terminal symbol
// (if only one more word position needs to be covered)
@@ -99,15 +99,15 @@ void ChartRuleLookupManagerMemoryPerSentence::GetChartRuleCollection(
// if we found a new rule -> create it and add it to the list
if (node != NULL) {
- // create the rule
+ // create the rule
#ifdef USE_BOOST_POOL
DottedRuleInMemory *dottedRule = m_dottedRulePool.malloc();
new (dottedRule) DottedRuleInMemory(*node, sourceWordLabel,
prevDottedRule);
#else
DottedRuleInMemory *dottedRule = new DottedRuleInMemory(*node,
- sourceWordLabel,
- prevDottedRule);
+ sourceWordLabel,
+ prevDottedRule);
#endif
dottedRuleCol.Add(relEndPos+1, dottedRule);
}
@@ -133,9 +133,7 @@ void ChartRuleLookupManagerMemoryPerSentence::GetChartRuleCollection(
// word.
endPos = absEndPos - 1;
stackInd = relEndPos;
- }
- else
- {
+ } else {
endPos = absEndPos;
stackInd = relEndPos + 1;
}
@@ -207,7 +205,7 @@ void ChartRuleLookupManagerMemoryPerSentence::ExtendPartialRuleApplication(
// We'll do whichever minimises the number of lookups:
if (numCombinations <= numChildren*2) {
- // loop over possible source non-terminal labels (as found in input tree)
+ // loop over possible source non-terminal labels (as found in input tree)
NonTerminalSet::const_iterator p = sourceNonTerms.begin();
NonTerminalSet::const_iterator sEnd = sourceNonTerms.end();
for (; p != sEnd; ++p) {
@@ -234,14 +232,12 @@ void ChartRuleLookupManagerMemoryPerSentence::ExtendPartialRuleApplication(
new (rule) DottedRuleInMemory(*child, cellLabel, prevDottedRule);
#else
DottedRuleInMemory *rule = new DottedRuleInMemory(*child, cellLabel,
- prevDottedRule);
+ prevDottedRule);
#endif
dottedRuleColl.Add(stackInd, rule);
}
}
- }
- else
- {
+ } else {
// loop over possible expansions of the rule
PhraseDictionaryNodeMemory::NonTerminalMap::const_iterator p;
PhraseDictionaryNodeMemory::NonTerminalMap::const_iterator end =
@@ -266,7 +262,7 @@ void ChartRuleLookupManagerMemoryPerSentence::ExtendPartialRuleApplication(
new (rule) DottedRuleInMemory(child, *cellLabel, prevDottedRule);
#else
DottedRuleInMemory *rule = new DottedRuleInMemory(child, *cellLabel,
- prevDottedRule);
+ prevDottedRule);
#endif
dottedRuleColl.Add(stackInd, rule);
}
diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h
index 6f2b209a7..ebb8cdd7c 100644
--- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h
+++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h
@@ -44,10 +44,10 @@ class WordsRange;
//! Implementation of ChartRuleLookupManager for in-memory rule tables.
class ChartRuleLookupManagerMemoryPerSentence : public ChartRuleLookupManagerCYKPlus
{
- public:
+public:
ChartRuleLookupManagerMemoryPerSentence(const InputType &sentence,
- const ChartCellCollectionBase &cellColl,
- const PhraseDictionaryFuzzyMatch &ruleTable);
+ const ChartCellCollectionBase &cellColl,
+ const PhraseDictionaryFuzzyMatch &ruleTable);
~ChartRuleLookupManagerMemoryPerSentence();
@@ -55,7 +55,7 @@ class ChartRuleLookupManagerMemoryPerSentence : public ChartRuleLookupManagerCYK
const WordsRange &range,
ChartParserCallback &outColl);
- private:
+private:
void ExtendPartialRuleApplication(
const DottedRuleInMemory &prevDottedRule,
size_t startPos,
diff --git a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.h b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.h
index 23f83623d..24d06270b 100644
--- a/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.h
+++ b/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.h
@@ -35,7 +35,7 @@ namespace Moses
//! Implementation of ChartRuleLookupManager for on-disk rule tables.
class ChartRuleLookupManagerOnDisk : public ChartRuleLookupManagerCYKPlus
{
- public:
+public:
ChartRuleLookupManagerOnDisk(const InputType &sentence,
const ChartCellCollectionBase &cellColl,
const PhraseDictionaryOnDisk &dictionary,
@@ -49,7 +49,7 @@ class ChartRuleLookupManagerOnDisk : public ChartRuleLookupManagerCYKPlus
virtual void GetChartRuleCollection(const WordsRange &range,
ChartParserCallback &outColl);
- private:
+private:
const PhraseDictionaryOnDisk &m_dictionary;
OnDiskPt::OnDiskWrapper &m_dbWrapper;
const std::vector<FactorType> &m_inputFactorsVec;
diff --git a/moses/TranslationModel/CYKPlusParser/DotChart.h b/moses/TranslationModel/CYKPlusParser/DotChart.h
index 9dd34593f..946f36ff2 100644
--- a/moses/TranslationModel/CYKPlusParser/DotChart.h
+++ b/moses/TranslationModel/CYKPlusParser/DotChart.h
@@ -28,26 +28,38 @@ namespace Moses
*/
class DottedRule
{
- public:
+public:
// used only to init dot stack.
DottedRule()
- : m_cellLabel(NULL)
- , m_prev(NULL) {}
+ : m_cellLabel(NULL)
+ , m_prev(NULL) {}
DottedRule(const ChartCellLabel &ccl, const DottedRule &prev)
- : m_cellLabel(&ccl)
- , m_prev(&prev) {}
+ : m_cellLabel(&ccl)
+ , m_prev(&prev) {}
- const WordsRange &GetWordsRange() const { return m_cellLabel->GetCoverage(); }
- const Word &GetSourceWord() const { return m_cellLabel->GetLabel(); }
- bool IsNonTerminal() const { return m_cellLabel->GetLabel().IsNonTerminal(); }
- const DottedRule *GetPrev() const { return m_prev; }
- bool IsRoot() const { return m_prev == NULL; }
- const ChartCellLabel &GetChartCellLabel() const { return *m_cellLabel; }
+ const WordsRange &GetWordsRange() const {
+ return m_cellLabel->GetCoverage();
+ }
+ const Word &GetSourceWord() const {
+ return m_cellLabel->GetLabel();
+ }
+ bool IsNonTerminal() const {
+ return m_cellLabel->GetLabel().IsNonTerminal();
+ }
+ const DottedRule *GetPrev() const {
+ return m_prev;
+ }
+ bool IsRoot() const {
+ return m_prev == NULL;
+ }
+ const ChartCellLabel &GetChartCellLabel() const {
+ return *m_cellLabel;
+ }
- private:
+private:
const ChartCellLabel *m_cellLabel; // usually contains something, unless
- // it's the init processed rule
+ // it's the init processed rule
const DottedRule *m_prev;
};
diff --git a/moses/TranslationModel/CYKPlusParser/DotChartInMemory.cpp b/moses/TranslationModel/CYKPlusParser/DotChartInMemory.cpp
index a28387027..616a2907c 100644
--- a/moses/TranslationModel/CYKPlusParser/DotChartInMemory.cpp
+++ b/moses/TranslationModel/CYKPlusParser/DotChartInMemory.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
diff --git a/moses/TranslationModel/CYKPlusParser/DotChartInMemory.h b/moses/TranslationModel/CYKPlusParser/DotChartInMemory.h
index f0753a8f1..cfd986d7a 100644
--- a/moses/TranslationModel/CYKPlusParser/DotChartInMemory.h
+++ b/moses/TranslationModel/CYKPlusParser/DotChartInMemory.h
@@ -32,21 +32,23 @@ namespace Moses
*/
class DottedRuleInMemory : public DottedRule
{
- public:
+public:
// used only to init dot stack.
explicit DottedRuleInMemory(const PhraseDictionaryNodeMemory &node)
- : DottedRule()
- , m_node(node) {}
+ : DottedRule()
+ , m_node(node) {}
DottedRuleInMemory(const PhraseDictionaryNodeMemory &node,
const ChartCellLabel &cellLabel,
const DottedRuleInMemory &prev)
- : DottedRule(cellLabel, prev)
- , m_node(node) {}
-
- const PhraseDictionaryNodeMemory &GetLastNode() const { return m_node; }
+ : DottedRule(cellLabel, prev)
+ , m_node(node) {}
+
+ const PhraseDictionaryNodeMemory &GetLastNode() const {
+ return m_node;
+ }
- private:
+private:
const PhraseDictionaryNodeMemory &m_node;
};
diff --git a/moses/TranslationModel/CYKPlusParser/DotChartOnDisk.h b/moses/TranslationModel/CYKPlusParser/DotChartOnDisk.h
index 5b756ba8d..edd9f3a62 100644
--- a/moses/TranslationModel/CYKPlusParser/DotChartOnDisk.h
+++ b/moses/TranslationModel/CYKPlusParser/DotChartOnDisk.h
@@ -36,26 +36,32 @@ namespace Moses
*/
class DottedRuleOnDisk : public DottedRule
{
- public:
+public:
// used only to init dot stack.
explicit DottedRuleOnDisk(const OnDiskPt::PhraseNode &lastNode)
- : DottedRule()
- , m_lastNode(lastNode)
- , m_done(false) {}
+ : DottedRule()
+ , m_lastNode(lastNode)
+ , m_done(false) {}
DottedRuleOnDisk(const OnDiskPt::PhraseNode &lastNode,
const ChartCellLabel &cellLabel,
const DottedRuleOnDisk &prev)
- : DottedRule(cellLabel, prev)
- , m_lastNode(lastNode)
- , m_done(false) {}
+ : DottedRule(cellLabel, prev)
+ , m_lastNode(lastNode)
+ , m_done(false) {}
- const OnDiskPt::PhraseNode &GetLastNode() const { return m_lastNode; }
+ const OnDiskPt::PhraseNode &GetLastNode() const {
+ return m_lastNode;
+ }
- bool Done() const { return m_done; }
- void Done(bool value) const { m_done = value; }
+ bool Done() const {
+ return m_done;
+ }
+ void Done(bool value) const {
+ m_done = value;
+ }
- private:
+private:
const OnDiskPt::PhraseNode &m_lastNode;
mutable bool m_done;
};
diff --git a/moses/TranslationModel/CompactPT/BlockHashIndex.cpp b/moses/TranslationModel/CompactPT/BlockHashIndex.cpp
index 705493ab7..9afe474f7 100644
--- a/moses/TranslationModel/CompactPT/BlockHashIndex.cpp
+++ b/moses/TranslationModel/CompactPT/BlockHashIndex.cpp
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#include "ThrowingFwrite.h"
#include "BlockHashIndex.h"
@@ -32,25 +32,27 @@ namespace Moses
#ifdef WITH_THREADS
BlockHashIndex::BlockHashIndex(size_t orderBits, size_t fingerPrintBits,
size_t threadsNum)
-: m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits),
- m_fileHandle(0), m_fileHandleStart(0), m_size(0),
- m_lastSaved(-1), m_lastDropped(-1), m_numLoadedRanges(0),
- m_threadPool(threadsNum) {
+ : m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits),
+ m_fileHandle(0), m_fileHandleStart(0), m_size(0),
+ m_lastSaved(-1), m_lastDropped(-1), m_numLoadedRanges(0),
+ m_threadPool(threadsNum)
+{
#ifndef HAVE_CMPH
- std::cerr << "minphr: CMPH support not compiled in." << std::endl;
- exit(1);
-#endif
- }
+ std::cerr << "minphr: CMPH support not compiled in." << std::endl;
+ exit(1);
+#endif
+}
#else
BlockHashIndex::BlockHashIndex(size_t orderBits, size_t fingerPrintBits)
-: m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits),
- m_fileHandle(0), m_fileHandleStart(0), m_size(0),
- m_lastSaved(-1), m_lastDropped(-1), m_numLoadedRanges(0) {
+ : m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits),
+ m_fileHandle(0), m_fileHandleStart(0), m_size(0),
+ m_lastSaved(-1), m_lastDropped(-1), m_numLoadedRanges(0)
+{
#ifndef HAVE_CMPH
- std::cerr << "minphr: CMPH support not compiled in." << std::endl;
- exit(1);
-#endif
- }
+ std::cerr << "minphr: CMPH support not compiled in." << std::endl;
+ exit(1);
+#endif
+}
#endif
BlockHashIndex::~BlockHashIndex()
@@ -60,7 +62,7 @@ BlockHashIndex::~BlockHashIndex()
it != m_hashes.end(); it++)
if(*it != 0)
cmph_destroy((cmph_t*)*it);
-
+
for(std::vector<PairedPackedArray<>*>::iterator it = m_arrays.begin();
it != m_arrays.end(); it++)
if(*it != 0)
@@ -72,15 +74,15 @@ size_t BlockHashIndex::GetHash(const char* key)
{
std::string keyStr(key);
size_t i = std::distance(m_landmarks.begin(),
- std::upper_bound(m_landmarks.begin(),
- m_landmarks.end(), keyStr)) - 1;
-
+ std::upper_bound(m_landmarks.begin(),
+ m_landmarks.end(), keyStr)) - 1;
+
if(i == 0ul-1)
return GetSize();
-
+
size_t pos = GetHash(i, key);
if(pos != GetSize())
- return (1ul << m_orderBits) * i + pos;
+ return (1ul << m_orderBits) * i + pos;
else
return GetSize();
}
@@ -100,7 +102,7 @@ size_t BlockHashIndex::GetHash(size_t i, const char* key)
#endif
if(m_hashes[i] == 0)
LoadRange(i);
-#ifdef HAVE_CMPH
+#ifdef HAVE_CMPH
size_t idx = cmph_search((cmph_t*)m_hashes[i], key, (cmph_uint32) strlen(key));
#else
assert(0);
@@ -109,11 +111,11 @@ size_t BlockHashIndex::GetHash(size_t i, const char* key)
std::pair<size_t, size_t> orderPrint = m_arrays[i]->Get(idx, m_orderBits, m_fingerPrintBits);
m_clocks[i] = clock();
-
+
if(GetFprint(key) == orderPrint.second)
- return orderPrint.first;
+ return orderPrint.first;
else
- return GetSize();
+ return GetSize();
}
size_t BlockHashIndex::GetHash(std::string key)
@@ -144,11 +146,11 @@ void BlockHashIndex::BeginSave(std::FILE * mphf)
m_fileHandle = mphf;
ThrowingFwrite(&m_orderBits, sizeof(size_t), 1, m_fileHandle);
ThrowingFwrite(&m_fingerPrintBits, sizeof(size_t), 1, m_fileHandle);
-
+
m_fileHandleStart = std::ftell(m_fileHandle);
-
+
size_t relIndexPos = 0;
- ThrowingFwrite(&relIndexPos, sizeof(size_t), 1, m_fileHandle);
+ ThrowingFwrite(&relIndexPos, sizeof(size_t), 1, m_fileHandle);
}
void BlockHashIndex::SaveRange(size_t i)
@@ -168,25 +170,22 @@ void BlockHashIndex::SaveLastRange()
boost::mutex::scoped_lock lock(m_mutex);
#endif
- while(!m_queue.empty() && m_lastSaved + 1 == -m_queue.top())
- {
+ while(!m_queue.empty() && m_lastSaved + 1 == -m_queue.top()) {
size_t current = -m_queue.top();
m_queue.pop();
SaveRange(current);
m_lastSaved = current;
- }
+ }
}
void BlockHashIndex::DropRange(size_t i)
{
#ifdef HAVE_CMPH
- if(m_hashes[i] != 0)
- {
+ if(m_hashes[i] != 0) {
cmph_destroy((cmph_t*)m_hashes[i]);
m_hashes[i] = 0;
}
- if(m_arrays[i] != 0)
- {
+ if(m_arrays[i] != 0) {
delete m_arrays[i];
m_arrays[i] = 0;
m_clocks[i] = 0;
@@ -201,7 +200,7 @@ void BlockHashIndex::DropLastRange()
boost::mutex::scoped_lock lock(m_mutex);
#endif
- while(m_lastDropped != m_lastSaved)
+ while(m_lastDropped != m_lastSaved)
DropRange(++m_lastDropped);
}
@@ -219,24 +218,24 @@ size_t BlockHashIndex::FinalizeSave()
#endif
SaveLastRange();
-
+
size_t relIndexPos = std::ftell(m_fileHandle) - m_fileHandleStart;
-
+
std::fseek(m_fileHandle, m_fileHandleStart, SEEK_SET);
ThrowingFwrite(&relIndexPos, sizeof(size_t), 1, m_fileHandle);
-
+
std::fseek(m_fileHandle, m_fileHandleStart + relIndexPos, SEEK_SET);
m_landmarks.save(m_fileHandle);
-
+
size_t seekIndexSize = m_seekIndex.size();
ThrowingFwrite(&seekIndexSize, sizeof(size_t), 1, m_fileHandle);
ThrowingFwrite(&m_seekIndex[0], sizeof(size_t), seekIndexSize, m_fileHandle);
-
+
ThrowingFwrite(&m_size, sizeof(size_t), 1, m_fileHandle);
-
+
size_t fileHandleStop = std::ftell(m_fileHandle);
return fileHandleStop - m_fileHandleStart + sizeof(m_orderBits)
- + sizeof(m_fingerPrintBits);
+ + sizeof(m_fingerPrintBits);
}
size_t BlockHashIndex::Save(std::FILE * mphf)
@@ -251,14 +250,14 @@ size_t BlockHashIndex::Save(std::FILE * mphf)
size_t BlockHashIndex::LoadIndex(std::FILE* mphf)
{
m_fileHandle = mphf;
-
+
size_t beginning = std::ftell(mphf);
size_t read = 0;
read += std::fread(&m_orderBits, sizeof(size_t), 1, mphf);
read += std::fread(&m_fingerPrintBits, sizeof(size_t), 1, mphf);
m_fileHandleStart = std::ftell(m_fileHandle);
-
+
size_t relIndexPos;
read += std::fread(&relIndexPos, sizeof(size_t), 1, mphf);
std::fseek(m_fileHandle, m_fileHandleStart + relIndexPos, SEEK_SET);
@@ -272,12 +271,12 @@ size_t BlockHashIndex::LoadIndex(std::FILE* mphf)
m_hashes.resize(seekIndexSize, 0);
m_clocks.resize(seekIndexSize, 0);
m_arrays.resize(seekIndexSize, 0);
-
+
read += std::fread(&m_size, sizeof(size_t), 1, m_fileHandle);
size_t end = std::ftell(mphf);
- return end - beginning;
+ return end - beginning;
}
void BlockHashIndex::LoadRange(size_t i)
@@ -288,10 +287,10 @@ void BlockHashIndex::LoadRange(size_t i)
m_arrays[i] = new PairedPackedArray<>(0, m_orderBits,
m_fingerPrintBits);
m_arrays[i]->Load(m_fileHandle);
-
+
m_hashes[i] = (void*)hash;
m_clocks[i] = clock();
-
+
m_numLoadedRanges++;
#endif
}
@@ -308,9 +307,9 @@ size_t BlockHashIndex::Load(std::FILE * mphf)
{
size_t byteSize = LoadIndex(mphf);
size_t end = std::ftell(mphf);
-
+
for(size_t i = 0; i < m_seekIndex.size(); i++)
- LoadRange(i);
+ LoadRange(i);
std::fseek(m_fileHandle, end, SEEK_SET);
return byteSize;
}
@@ -327,14 +326,13 @@ void BlockHashIndex::KeepNLastRanges(float ratio, float tolerance)
#endif
size_t n = m_hashes.size() * ratio;
size_t max = n * (1 + tolerance);
- if(m_numLoadedRanges > max)
- {
+ if(m_numLoadedRanges > max) {
typedef std::vector<std::pair<clock_t, size_t> > LastLoaded;
LastLoaded lastLoaded;
for(size_t i = 0; i < m_hashes.size(); i++)
if(m_hashes[i] != 0)
lastLoaded.push_back(std::make_pair(m_clocks[i], i));
-
+
std::sort(lastLoaded.begin(), lastLoaded.end());
for(LastLoaded::reverse_iterator it = lastLoaded.rbegin() + size_t(n * (1 - tolerance));
it != lastLoaded.rend(); it++)
@@ -348,24 +346,23 @@ void BlockHashIndex::CalcHash(size_t current, void* source_void)
cmph_io_adapter_t* source = (cmph_io_adapter_t*) source_void;
cmph_config_t *config = cmph_config_new(source);
cmph_config_set_algo(config, CMPH_CHD);
-
+
cmph_t* hash = cmph_new(config);
PairedPackedArray<> *pv =
new PairedPackedArray<>(source->nkeys, m_orderBits, m_fingerPrintBits);
size_t i = 0;
-
+
source->rewind(source->data);
-
+
std::string lastKey = "";
- while(i < source->nkeys)
- {
+ while(i < source->nkeys) {
unsigned keylen;
char* key;
source->read(source->data, &key, &keylen);
std::string temp(key, keylen);
source->dispose(source->data, key, keylen);
-
+
if(lastKey > temp) {
if(source->nkeys != 2 || temp != "###DUMMY_KEY###") {
std::cerr << "ERROR: Input file does not appear to be sorted with LC_ALL=C sort" << std::endl;
@@ -375,41 +372,40 @@ void BlockHashIndex::CalcHash(size_t current, void* source_void)
}
}
lastKey = temp;
-
+
size_t fprint = GetFprint(temp.c_str());
size_t idx = cmph_search(hash, temp.c_str(),
(cmph_uint32) temp.size());
-
+
pv->Set(idx, i, fprint, m_orderBits, m_fingerPrintBits);
i++;
}
-
+
cmph_config_destroy(config);
-
+
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_mutex);
#endif
- if(m_hashes.size() <= current)
- {
- m_hashes.resize(current + 1, 0);
+ if(m_hashes.size() <= current) {
+ m_hashes.resize(current + 1, 0);
m_arrays.resize(current + 1, 0);
m_clocks.resize(current + 1, 0);
}
-
+
m_hashes[current] = (void*)hash;
m_arrays[current] = pv;
m_clocks[current] = clock();
- m_queue.push(-current);
+ m_queue.push(-current);
#endif
}
-#ifdef HAVE_CMPH
+#ifdef HAVE_CMPH
void* BlockHashIndex::vectorAdapter(std::vector<std::string>& v)
{
return (void*)CmphVectorAdapter(v);
}
-
+
void* BlockHashIndex::vectorAdapter(StringVector<unsigned, size_t, std::allocator>& sv)
{
return (void*)CmphStringVectorAdapter(sv);
diff --git a/moses/TranslationModel/CompactPT/BlockHashIndex.h b/moses/TranslationModel/CompactPT/BlockHashIndex.h
index 8541a2a19..c245d2d66 100644
--- a/moses/TranslationModel/CompactPT/BlockHashIndex.h
+++ b/moses/TranslationModel/CompactPT/BlockHashIndex.h
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#ifndef moses_BlockHashIndex_h
#define moses_BlockHashIndex_h
@@ -25,7 +25,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <iostream>
#include <string>
#include <vector>
-#include <queue>
+#include <queue>
#include <cstring>
#include <cstdio>
@@ -42,144 +42,139 @@ namespace Moses
class BlockHashIndex
{
- private:
- std::priority_queue<int> m_queue;
-
- size_t m_orderBits;
- size_t m_fingerPrintBits;
-
- std::FILE* m_fileHandle;
- size_t m_fileHandleStart;
-
- StringVector<unsigned char, unsigned long> m_landmarks;
-
- std::vector<void*> m_hashes;
- std::vector<clock_t> m_clocks;
- std::vector<PairedPackedArray<>*> m_arrays;
-
- std::vector<size_t> m_seekIndex;
-
- size_t m_size;
- int m_lastSaved;
- int m_lastDropped;
- size_t m_numLoadedRanges;
-
+private:
+ std::priority_queue<int> m_queue;
+
+ size_t m_orderBits;
+ size_t m_fingerPrintBits;
+
+ std::FILE* m_fileHandle;
+ size_t m_fileHandleStart;
+
+ StringVector<unsigned char, unsigned long> m_landmarks;
+
+ std::vector<void*> m_hashes;
+ std::vector<clock_t> m_clocks;
+ std::vector<PairedPackedArray<>*> m_arrays;
+
+ std::vector<size_t> m_seekIndex;
+
+ size_t m_size;
+ int m_lastSaved;
+ int m_lastDropped;
+ size_t m_numLoadedRanges;
+
#ifdef WITH_THREADS
- ThreadPool m_threadPool;
- boost::mutex m_mutex;
-
- template <typename Keys>
- class HashTask : public Task
- {
- public:
- HashTask(int id, BlockHashIndex& hash, Keys& keys)
- : m_id(id), m_hash(hash), m_keys(new Keys(keys)) {}
-
- virtual void Run()
- {
- m_hash.CalcHash(m_id, *m_keys);
- }
-
- virtual ~HashTask()
- {
- delete m_keys;
- }
-
- private:
- int m_id;
- BlockHashIndex& m_hash;
- Keys* m_keys;
- };
-#endif
-
- size_t GetFprint(const char* key) const;
- size_t GetHash(size_t i, const char* key);
-
+ ThreadPool m_threadPool;
+ boost::mutex m_mutex;
+
+ template <typename Keys>
+ class HashTask : public Task
+ {
public:
+ HashTask(int id, BlockHashIndex& hash, Keys& keys)
+ : m_id(id), m_hash(hash), m_keys(new Keys(keys)) {}
+
+ virtual void Run() {
+ m_hash.CalcHash(m_id, *m_keys);
+ }
+
+ virtual ~HashTask() {
+ delete m_keys;
+ }
+
+ private:
+ int m_id;
+ BlockHashIndex& m_hash;
+ Keys* m_keys;
+ };
+#endif
+
+ size_t GetFprint(const char* key) const;
+ size_t GetHash(size_t i, const char* key);
+
+public:
#ifdef WITH_THREADS
- BlockHashIndex(size_t orderBits, size_t fingerPrintBits,
- size_t threadsNum = 2);
+ BlockHashIndex(size_t orderBits, size_t fingerPrintBits,
+ size_t threadsNum = 2);
#else
- BlockHashIndex(size_t orderBits, size_t fingerPrintBits);
+ BlockHashIndex(size_t orderBits, size_t fingerPrintBits);
#endif
- ~BlockHashIndex();
-
- size_t GetHash(const char* key);
- size_t GetHash(std::string key);
-
- size_t operator[](std::string key);
- size_t operator[](char* key);
-
- void BeginSave(std::FILE* mphf);
- void SaveRange(size_t i);
- void SaveLastRange();
- size_t FinalizeSave();
+ ~BlockHashIndex();
+
+ size_t GetHash(const char* key);
+ size_t GetHash(std::string key);
+
+ size_t operator[](std::string key);
+ size_t operator[](char* key);
+
+ void BeginSave(std::FILE* mphf);
+ void SaveRange(size_t i);
+ void SaveLastRange();
+ size_t FinalizeSave();
#ifdef WITH_THREADS
- void WaitAll();
+ void WaitAll();
#endif
-
- void DropRange(size_t i);
- void DropLastRange();
-
- size_t LoadIndex(std::FILE* mphf);
- void LoadRange(size_t i);
-
- size_t Save(std::string filename);
- size_t Save(std::FILE * mphf);
-
- size_t Load(std::string filename);
- size_t Load(std::FILE * mphf);
-
- size_t GetSize() const;
-
- void KeepNLastRanges(float ratio = 0.1, float tolerance = 0.1);
-
- template <typename Keys>
- void AddRange(Keys &keys)
- {
- size_t current = m_landmarks.size();
-
- if(m_landmarks.size() && m_landmarks.back().str() >= keys[0])
- {
- std::cerr << "ERROR: Input file does not appear to be sorted with LC_ALL=C sort" << std::endl;
- std::cerr << "1: " << m_landmarks.back().str() << std::endl;
- std::cerr << "2: " << keys[0] << std::endl;
- abort();
- }
-
- m_landmarks.push_back(keys[0]);
- m_size += keys.size();
-
- if(keys.size() == 1) {
- // add dummy key to avoid null hash
- keys.push_back("###DUMMY_KEY###");
- }
-
+
+ void DropRange(size_t i);
+ void DropLastRange();
+
+ size_t LoadIndex(std::FILE* mphf);
+ void LoadRange(size_t i);
+
+ size_t Save(std::string filename);
+ size_t Save(std::FILE * mphf);
+
+ size_t Load(std::string filename);
+ size_t Load(std::FILE * mphf);
+
+ size_t GetSize() const;
+
+ void KeepNLastRanges(float ratio = 0.1, float tolerance = 0.1);
+
+ template <typename Keys>
+ void AddRange(Keys &keys) {
+ size_t current = m_landmarks.size();
+
+ if(m_landmarks.size() && m_landmarks.back().str() >= keys[0]) {
+ std::cerr << "ERROR: Input file does not appear to be sorted with LC_ALL=C sort" << std::endl;
+ std::cerr << "1: " << m_landmarks.back().str() << std::endl;
+ std::cerr << "2: " << keys[0] << std::endl;
+ abort();
+ }
+
+ m_landmarks.push_back(keys[0]);
+ m_size += keys.size();
+
+ if(keys.size() == 1) {
+ // add dummy key to avoid null hash
+ keys.push_back("###DUMMY_KEY###");
+ }
+
#ifdef WITH_THREADS
- HashTask<Keys>* ht = new HashTask<Keys>(current, *this, keys);
- m_threadPool.Submit(ht);
+ HashTask<Keys>* ht = new HashTask<Keys>(current, *this, keys);
+ m_threadPool.Submit(ht);
#else
- CalcHash(current, keys);
+ CalcHash(current, keys);
#endif
- }
-
- template <typename Keys>
- void CalcHash(size_t current, Keys &keys)
- {
-#ifdef HAVE_CMPH
- void* source = vectorAdapter(keys);
- CalcHash(current, source);
+ }
+
+ template <typename Keys>
+ void CalcHash(size_t current, Keys &keys) {
+#ifdef HAVE_CMPH
+ void* source = vectorAdapter(keys);
+ CalcHash(current, source);
#endif
- }
+ }
+
+ void CalcHash(size_t current, void* source);
- void CalcHash(size_t current, void* source);
-
-#ifdef HAVE_CMPH
- void* vectorAdapter(std::vector<std::string>& v);
- void* vectorAdapter(StringVector<unsigned, size_t, std::allocator>& sv);
- void* vectorAdapter(StringVector<unsigned, size_t, MmapAllocator>& sv);
+#ifdef HAVE_CMPH
+ void* vectorAdapter(std::vector<std::string>& v);
+ void* vectorAdapter(StringVector<unsigned, size_t, std::allocator>& sv);
+ void* vectorAdapter(StringVector<unsigned, size_t, MmapAllocator>& sv);
#endif
};
diff --git a/moses/TranslationModel/CompactPT/CanonicalHuffman.h b/moses/TranslationModel/CompactPT/CanonicalHuffman.h
index faf7ce411..8d6e1cbb1 100644
--- a/moses/TranslationModel/CompactPT/CanonicalHuffman.h
+++ b/moses/TranslationModel/CompactPT/CanonicalHuffman.h
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#ifndef moses_CanonicalHuffman_h
#define moses_CanonicalHuffman_h
@@ -29,320 +29,293 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "ThrowingFwrite.h"
-namespace Moses {
+namespace Moses
+{
template <typename Data>
class CanonicalHuffman
{
- private:
- std::vector<Data> m_symbols;
- std::vector<size_t> m_firstCodes;
- std::vector<size_t> m_lengthIndex;
-
- typedef boost::unordered_map<Data, boost::dynamic_bitset<> > EncodeMap;
- EncodeMap m_encodeMap;
-
- struct MinHeapSorter {
- std::vector<size_t>& m_vec;
-
- MinHeapSorter(std::vector<size_t>& vec) : m_vec(vec) { }
-
- bool operator()(size_t a, size_t b)
- {
- return m_vec[a] > m_vec[b];
- }
- };
-
- template <class Iterator>
- void CalcLengths(Iterator begin, Iterator end, std::vector<size_t>& lengths)
- {
- size_t n = std::distance(begin, end);
- std::vector<size_t> A(2 * n, 0);
-
- m_symbols.resize(n);
- size_t i = 0;
- for(Iterator it = begin; it != end; it++)
- {
- m_symbols[i] = it->first;
-
- A[i] = n + i;
- A[n + i] = it->second;
- i++;
- }
-
- if(n == 1)
- {
- lengths.push_back(1);
- return;
- }
-
- MinHeapSorter hs(A);
- std::make_heap(A.begin(), A.begin() + n, hs);
-
- size_t h = n;
- size_t m1, m2;
- while(h > 1)
- {
- m1 = A[0];
- std::pop_heap(A.begin(), A.begin() + h, hs);
-
- h--;
-
- m2 = A[0];
- std::pop_heap(A.begin(), A.begin() + h, hs);
-
- A[h] = A[m1] + A[m2];
- A[h-1] = h;
- A[m1] = A[m2] = h;
-
- std::push_heap(A.begin(), A.begin() + h, hs);
- }
-
- A[1] = 0;
- for(size_t i = 2; i < 2*n; i++)
- A[i] = A[A[i]] + 1;
-
- lengths.resize(n);
- for(size_t i = 0; i < n; i++)
- lengths[i] = A[i + n];
- }
+private:
+ std::vector<Data> m_symbols;
+ std::vector<size_t> m_firstCodes;
+ std::vector<size_t> m_lengthIndex;
- void CalcCodes(std::vector<size_t>& lengths)
- {
- std::vector<size_t> numLength;
- for(std::vector<size_t>::iterator it = lengths.begin();
- it != lengths.end(); it++) {
- size_t length = *it;
- if(numLength.size() <= length)
- numLength.resize(length + 1, 0);
- numLength[length]++;
- }
-
- m_lengthIndex.resize(numLength.size());
- m_lengthIndex[0] = 0;
- for(size_t l = 1; l < numLength.size(); l++)
- m_lengthIndex[l] = m_lengthIndex[l - 1] + numLength[l - 1];
-
- size_t maxLength = numLength.size() - 1;
-
- m_firstCodes.resize(maxLength + 1, 0);
- for(size_t l = maxLength - 1; l > 0; l--)
- m_firstCodes[l] = (m_firstCodes[l + 1] + numLength[l + 1]) / 2;
-
- std::vector<Data> t_symbols;
- t_symbols.resize(lengths.size());
-
- std::vector<size_t> nextCode = m_firstCodes;
- for(size_t i = 0; i < lengths.size(); i++)
- {
- Data data = m_symbols[i];
- size_t length = lengths[i];
-
- size_t pos = m_lengthIndex[length]
- + (nextCode[length] - m_firstCodes[length]);
- t_symbols[pos] = data;
-
- nextCode[length] = nextCode[length] + 1;
- }
-
- m_symbols.swap(t_symbols);
- }
-
- void CreateCodeMap()
- {
- for(size_t l = 1; l < m_lengthIndex.size(); l++)
- {
- size_t intCode = m_firstCodes[l];
- size_t num = ((l+1 < m_lengthIndex.size()) ? m_lengthIndex[l+1]
- : m_symbols.size()) - m_lengthIndex[l];
-
- for(size_t i = 0; i < num; i++)
- {
- Data data = m_symbols[m_lengthIndex[l] + i];
- boost::dynamic_bitset<> bitCode(l, intCode);
- m_encodeMap[data] = bitCode;
- intCode++;
- }
- }
- }
-
- boost::dynamic_bitset<>& Encode(Data data)
- {
- return m_encodeMap[data];
+ typedef boost::unordered_map<Data, boost::dynamic_bitset<> > EncodeMap;
+ EncodeMap m_encodeMap;
+
+ struct MinHeapSorter {
+ std::vector<size_t>& m_vec;
+
+ MinHeapSorter(std::vector<size_t>& vec) : m_vec(vec) { }
+
+ bool operator()(size_t a, size_t b) {
+ return m_vec[a] > m_vec[b];
}
-
- template <class BitWrapper>
- void PutCode(BitWrapper& bitWrapper, boost::dynamic_bitset<>& code)
- {
- for(int j = code.size()-1; j >= 0; j--)
- bitWrapper.Put(code[j]);
+ };
+
+ template <class Iterator>
+ void CalcLengths(Iterator begin, Iterator end, std::vector<size_t>& lengths) {
+ size_t n = std::distance(begin, end);
+ std::vector<size_t> A(2 * n, 0);
+
+ m_symbols.resize(n);
+ size_t i = 0;
+ for(Iterator it = begin; it != end; it++) {
+ m_symbols[i] = it->first;
+
+ A[i] = n + i;
+ A[n + i] = it->second;
+ i++;
}
-
- public:
-
- template <class Iterator>
- CanonicalHuffman(Iterator begin, Iterator end, bool forEncoding = true)
- {
- std::vector<size_t> lengths;
- CalcLengths(begin, end, lengths);
- CalcCodes(lengths);
-
- if(forEncoding)
- CreateCodeMap();
+
+ if(n == 1) {
+ lengths.push_back(1);
+ return;
}
-
- CanonicalHuffman(std::FILE* pFile, bool forEncoding = false)
- {
- Load(pFile);
-
- if(forEncoding)
- CreateCodeMap();
+
+ MinHeapSorter hs(A);
+ std::make_heap(A.begin(), A.begin() + n, hs);
+
+ size_t h = n;
+ size_t m1, m2;
+ while(h > 1) {
+ m1 = A[0];
+ std::pop_heap(A.begin(), A.begin() + h, hs);
+
+ h--;
+
+ m2 = A[0];
+ std::pop_heap(A.begin(), A.begin() + h, hs);
+
+ A[h] = A[m1] + A[m2];
+ A[h-1] = h;
+ A[m1] = A[m2] = h;
+
+ std::push_heap(A.begin(), A.begin() + h, hs);
}
-
- template <class BitWrapper>
- void Put(BitWrapper& bitWrapper, Data data)
- {
- PutCode(bitWrapper, Encode(data));
+
+ A[1] = 0;
+ for(size_t i = 2; i < 2*n; i++)
+ A[i] = A[A[i]] + 1;
+
+ lengths.resize(n);
+ for(size_t i = 0; i < n; i++)
+ lengths[i] = A[i + n];
+ }
+
+ void CalcCodes(std::vector<size_t>& lengths) {
+ std::vector<size_t> numLength;
+ for(std::vector<size_t>::iterator it = lengths.begin();
+ it != lengths.end(); it++) {
+ size_t length = *it;
+ if(numLength.size() <= length)
+ numLength.resize(length + 1, 0);
+ numLength[length]++;
}
-
- template <class BitWrapper>
- Data Read(BitWrapper& bitWrapper)
- {
- if(bitWrapper.TellFromEnd())
- {
- size_t intCode = bitWrapper.Read();
- size_t len = 1;
- while(intCode < m_firstCodes[len]) {
- intCode = 2 * intCode + bitWrapper.Read();
- len++;
- }
- return m_symbols[m_lengthIndex[len] + (intCode - m_firstCodes[len])];
- }
- return Data();
+
+ m_lengthIndex.resize(numLength.size());
+ m_lengthIndex[0] = 0;
+ for(size_t l = 1; l < numLength.size(); l++)
+ m_lengthIndex[l] = m_lengthIndex[l - 1] + numLength[l - 1];
+
+ size_t maxLength = numLength.size() - 1;
+
+ m_firstCodes.resize(maxLength + 1, 0);
+ for(size_t l = maxLength - 1; l > 0; l--)
+ m_firstCodes[l] = (m_firstCodes[l + 1] + numLength[l + 1]) / 2;
+
+ std::vector<Data> t_symbols;
+ t_symbols.resize(lengths.size());
+
+ std::vector<size_t> nextCode = m_firstCodes;
+ for(size_t i = 0; i < lengths.size(); i++) {
+ Data data = m_symbols[i];
+ size_t length = lengths[i];
+
+ size_t pos = m_lengthIndex[length]
+ + (nextCode[length] - m_firstCodes[length]);
+ t_symbols[pos] = data;
+
+ nextCode[length] = nextCode[length] + 1;
}
-
- size_t Load(std::FILE* pFile)
- {
- size_t start = std::ftell(pFile);
- size_t read = 0;
-
- size_t size;
- read += std::fread(&size, sizeof(size_t), 1, pFile);
- m_symbols.resize(size);
- read += std::fread(&m_symbols[0], sizeof(Data), size, pFile);
-
- read += std::fread(&size, sizeof(size_t), 1, pFile);
- m_firstCodes.resize(size);
- read += std::fread(&m_firstCodes[0], sizeof(size_t), size, pFile);
-
- read += std::fread(&size, sizeof(size_t), 1, pFile);
- m_lengthIndex.resize(size);
- read += std::fread(&m_lengthIndex[0], sizeof(size_t), size, pFile);
-
- return std::ftell(pFile) - start;
+
+ m_symbols.swap(t_symbols);
+ }
+
+ void CreateCodeMap() {
+ for(size_t l = 1; l < m_lengthIndex.size(); l++) {
+ size_t intCode = m_firstCodes[l];
+ size_t num = ((l+1 < m_lengthIndex.size()) ? m_lengthIndex[l+1]
+ : m_symbols.size()) - m_lengthIndex[l];
+
+ for(size_t i = 0; i < num; i++) {
+ Data data = m_symbols[m_lengthIndex[l] + i];
+ boost::dynamic_bitset<> bitCode(l, intCode);
+ m_encodeMap[data] = bitCode;
+ intCode++;
+ }
}
-
- size_t Save(std::FILE* pFile)
- {
- size_t start = std::ftell(pFile);
-
- size_t size = m_symbols.size();
- ThrowingFwrite(&size, sizeof(size_t), 1, pFile);
- ThrowingFwrite(&m_symbols[0], sizeof(Data), size, pFile);
-
- size = m_firstCodes.size();
- ThrowingFwrite(&size, sizeof(size_t), 1, pFile);
- ThrowingFwrite(&m_firstCodes[0], sizeof(size_t), size, pFile);
-
- size = m_lengthIndex.size();
- ThrowingFwrite(&size, sizeof(size_t), 1, pFile);
- ThrowingFwrite(&m_lengthIndex[0], sizeof(size_t), size, pFile);
-
- return std::ftell(pFile) - start;
+ }
+
+ boost::dynamic_bitset<>& Encode(Data data) {
+ return m_encodeMap[data];
+ }
+
+ template <class BitWrapper>
+ void PutCode(BitWrapper& bitWrapper, boost::dynamic_bitset<>& code) {
+ for(int j = code.size()-1; j >= 0; j--)
+ bitWrapper.Put(code[j]);
+ }
+
+public:
+
+ template <class Iterator>
+ CanonicalHuffman(Iterator begin, Iterator end, bool forEncoding = true) {
+ std::vector<size_t> lengths;
+ CalcLengths(begin, end, lengths);
+ CalcCodes(lengths);
+
+ if(forEncoding)
+ CreateCodeMap();
+ }
+
+ CanonicalHuffman(std::FILE* pFile, bool forEncoding = false) {
+ Load(pFile);
+
+ if(forEncoding)
+ CreateCodeMap();
+ }
+
+ template <class BitWrapper>
+ void Put(BitWrapper& bitWrapper, Data data) {
+ PutCode(bitWrapper, Encode(data));
+ }
+
+ template <class BitWrapper>
+ Data Read(BitWrapper& bitWrapper) {
+ if(bitWrapper.TellFromEnd()) {
+ size_t intCode = bitWrapper.Read();
+ size_t len = 1;
+ while(intCode < m_firstCodes[len]) {
+ intCode = 2 * intCode + bitWrapper.Read();
+ len++;
+ }
+ return m_symbols[m_lengthIndex[len] + (intCode - m_firstCodes[len])];
}
+ return Data();
+ }
+
+ size_t Load(std::FILE* pFile) {
+ size_t start = std::ftell(pFile);
+ size_t read = 0;
+
+ size_t size;
+ read += std::fread(&size, sizeof(size_t), 1, pFile);
+ m_symbols.resize(size);
+ read += std::fread(&m_symbols[0], sizeof(Data), size, pFile);
+
+ read += std::fread(&size, sizeof(size_t), 1, pFile);
+ m_firstCodes.resize(size);
+ read += std::fread(&m_firstCodes[0], sizeof(size_t), size, pFile);
+
+ read += std::fread(&size, sizeof(size_t), 1, pFile);
+ m_lengthIndex.resize(size);
+ read += std::fread(&m_lengthIndex[0], sizeof(size_t), size, pFile);
+
+ return std::ftell(pFile) - start;
+ }
+
+ size_t Save(std::FILE* pFile) {
+ size_t start = std::ftell(pFile);
+
+ size_t size = m_symbols.size();
+ ThrowingFwrite(&size, sizeof(size_t), 1, pFile);
+ ThrowingFwrite(&m_symbols[0], sizeof(Data), size, pFile);
+
+ size = m_firstCodes.size();
+ ThrowingFwrite(&size, sizeof(size_t), 1, pFile);
+ ThrowingFwrite(&m_firstCodes[0], sizeof(size_t), size, pFile);
+
+ size = m_lengthIndex.size();
+ ThrowingFwrite(&size, sizeof(size_t), 1, pFile);
+ ThrowingFwrite(&m_lengthIndex[0], sizeof(size_t), size, pFile);
+
+ return std::ftell(pFile) - start;
+ }
};
template <class Container = std::string>
class BitWrapper
{
- private:
- Container& m_data;
-
- typename Container::iterator m_iterator;
- typename Container::value_type m_currentValue;
-
- size_t m_valueBits;
- typename Container::value_type m_mask;
- size_t m_bitPos;
-
- public:
-
- BitWrapper(Container &data)
+private:
+ Container& m_data;
+
+ typename Container::iterator m_iterator;
+ typename Container::value_type m_currentValue;
+
+ size_t m_valueBits;
+ typename Container::value_type m_mask;
+ size_t m_bitPos;
+
+public:
+
+ BitWrapper(Container &data)
: m_data(data), m_iterator(m_data.begin()), m_currentValue(0),
m_valueBits(sizeof(typename Container::value_type) * 8),
m_mask(1), m_bitPos(0) { }
-
- bool Read()
- {
- if(m_bitPos % m_valueBits == 0)
- {
- if(m_iterator != m_data.end())
- m_currentValue = *m_iterator++;
- }
- else
- m_currentValue = m_currentValue >> 1;
-
- m_bitPos++;
- return (m_currentValue & m_mask);
- }
-
- void Put(bool bit) {
- if(m_bitPos % m_valueBits == 0)
- m_data.push_back(0);
-
- if(bit)
- m_data[m_data.size()-1] |= m_mask << (m_bitPos % m_valueBits);
-
- m_bitPos++;
- }
-
- size_t Tell()
- {
- return m_bitPos;
- }
-
- size_t TellFromEnd()
- {
- if(m_data.size() * m_valueBits < m_bitPos)
- return 0;
- return m_data.size() * m_valueBits - m_bitPos;
- }
-
- void Seek(size_t bitPos)
- {
- m_bitPos = bitPos;
- m_iterator = m_data.begin() + int((m_bitPos-1)/m_valueBits);
- m_currentValue = (*m_iterator) >> ((m_bitPos-1) % m_valueBits);
- m_iterator++;
- }
-
- void SeekFromEnd(size_t bitPosFromEnd)
- {
- size_t bitPos = m_data.size() * m_valueBits - bitPosFromEnd;
- Seek(bitPos);
- }
-
- void Reset()
- {
- m_iterator = m_data.begin();
- m_currentValue = 0;
- m_bitPos = 0;
- }
-
- Container& GetContainer()
- {
- return m_data;
- }
+
+ bool Read() {
+ if(m_bitPos % m_valueBits == 0) {
+ if(m_iterator != m_data.end())
+ m_currentValue = *m_iterator++;
+ } else
+ m_currentValue = m_currentValue >> 1;
+
+ m_bitPos++;
+ return (m_currentValue & m_mask);
+ }
+
+ void Put(bool bit) {
+ if(m_bitPos % m_valueBits == 0)
+ m_data.push_back(0);
+
+ if(bit)
+ m_data[m_data.size()-1] |= m_mask << (m_bitPos % m_valueBits);
+
+ m_bitPos++;
+ }
+
+ size_t Tell() {
+ return m_bitPos;
+ }
+
+ size_t TellFromEnd() {
+ if(m_data.size() * m_valueBits < m_bitPos)
+ return 0;
+ return m_data.size() * m_valueBits - m_bitPos;
+ }
+
+ void Seek(size_t bitPos) {
+ m_bitPos = bitPos;
+ m_iterator = m_data.begin() + int((m_bitPos-1)/m_valueBits);
+ m_currentValue = (*m_iterator) >> ((m_bitPos-1) % m_valueBits);
+ m_iterator++;
+ }
+
+ void SeekFromEnd(size_t bitPosFromEnd) {
+ size_t bitPos = m_data.size() * m_valueBits - bitPosFromEnd;
+ Seek(bitPos);
+ }
+
+ void Reset() {
+ m_iterator = m_data.begin();
+ m_currentValue = 0;
+ m_bitPos = 0;
+ }
+
+ Container& GetContainer() {
+ return m_data;
+ }
};
}
diff --git a/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.cpp b/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.cpp
index 40fff6690..8e4d1641f 100644
--- a/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.cpp
+++ b/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.cpp
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#ifdef HAVE_CMPH
@@ -25,70 +25,70 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
namespace Moses
{
-
- void CmphStringVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen)
- {
- delete[] key;
- }
-
- void CmphStringVectorAdapterRewind(void *data)
- {
- cmph_vector_t *cmph_vector = (cmph_vector_t *)data;
- cmph_vector->position = 0;
- }
-
- //************************************************************************//
-
- cmph_io_adapter_t *CmphVectorAdapterNew(std::vector<std::string>& v)
- {
- cmph_io_adapter_t * key_source = (cmph_io_adapter_t *)malloc(sizeof(cmph_io_adapter_t));
- cmph_vector_t * cmph_vector = (cmph_vector_t *)malloc(sizeof(cmph_vector_t));
- assert(key_source);
- assert(cmph_vector);
-
- cmph_vector->vector = (void *)&v;
- cmph_vector->position = 0;
- key_source->data = (void *)cmph_vector;
- key_source->nkeys = v.size();
-
- return key_source;
- }
-
- int CmphVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen)
- {
- cmph_vector_t *cmph_vector = (cmph_vector_t *)data;
- std::vector<std::string>* v = (std::vector<std::string>*)cmph_vector->vector;
- size_t size;
- *keylen = (*v)[cmph_vector->position].size();
- size = *keylen;
- *key = new char[size + 1];
- std::string temp = (*v)[cmph_vector->position];
- strcpy(*key, temp.c_str());
- cmph_vector->position = cmph_vector->position + 1;
- return (int)(*keylen);
- }
-
- void CmphVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen)
- {
- delete[] key;
- }
-
- void CmphVectorAdapterRewind(void *data)
- {
- cmph_vector_t *cmph_vector = (cmph_vector_t *)data;
- cmph_vector->position = 0;
- }
-
- cmph_io_adapter_t* CmphVectorAdapter(std::vector<std::string>& v)
- {
- cmph_io_adapter_t * key_source = CmphVectorAdapterNew(v);
-
- key_source->read = CmphVectorAdapterRead;
- key_source->dispose = CmphVectorAdapterDispose;
- key_source->rewind = CmphVectorAdapterRewind;
- return key_source;
- }
-
+
+void CmphStringVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen)
+{
+ delete[] key;
+}
+
+void CmphStringVectorAdapterRewind(void *data)
+{
+ cmph_vector_t *cmph_vector = (cmph_vector_t *)data;
+ cmph_vector->position = 0;
+}
+
+//************************************************************************//
+
+cmph_io_adapter_t *CmphVectorAdapterNew(std::vector<std::string>& v)
+{
+ cmph_io_adapter_t * key_source = (cmph_io_adapter_t *)malloc(sizeof(cmph_io_adapter_t));
+ cmph_vector_t * cmph_vector = (cmph_vector_t *)malloc(sizeof(cmph_vector_t));
+ assert(key_source);
+ assert(cmph_vector);
+
+ cmph_vector->vector = (void *)&v;
+ cmph_vector->position = 0;
+ key_source->data = (void *)cmph_vector;
+ key_source->nkeys = v.size();
+
+ return key_source;
+}
+
+int CmphVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen)
+{
+ cmph_vector_t *cmph_vector = (cmph_vector_t *)data;
+ std::vector<std::string>* v = (std::vector<std::string>*)cmph_vector->vector;
+ size_t size;
+ *keylen = (*v)[cmph_vector->position].size();
+ size = *keylen;
+ *key = new char[size + 1];
+ std::string temp = (*v)[cmph_vector->position];
+ strcpy(*key, temp.c_str());
+ cmph_vector->position = cmph_vector->position + 1;
+ return (int)(*keylen);
+}
+
+void CmphVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen)
+{
+ delete[] key;
+}
+
+void CmphVectorAdapterRewind(void *data)
+{
+ cmph_vector_t *cmph_vector = (cmph_vector_t *)data;
+ cmph_vector->position = 0;
+}
+
+cmph_io_adapter_t* CmphVectorAdapter(std::vector<std::string>& v)
+{
+ cmph_io_adapter_t * key_source = CmphVectorAdapterNew(v);
+
+ key_source->read = CmphVectorAdapterRead;
+ key_source->dispose = CmphVectorAdapterDispose;
+ key_source->rewind = CmphVectorAdapterRewind;
+ return key_source;
+}
+
}
#endif
diff --git a/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.h b/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.h
index 5516d4f4d..4a532c289 100644
--- a/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.h
+++ b/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.h
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#ifndef moses_CmphStringVectorAdapterNew_h
#define moses_CmphStringVectorAdapterNew_h
@@ -33,72 +33,71 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
namespace Moses
{
- typedef struct
- {
- void *vector;
- cmph_uint32 position;
- }
- cmph_vector_t;
-
-
- template <typename ValueT, typename PosT, template <typename> class Allocator>
- cmph_io_adapter_t *CmphStringVectorAdapterNew(StringVector<ValueT, PosT, Allocator>& sv)
- {
- cmph_io_adapter_t * key_source = (cmph_io_adapter_t *)malloc(sizeof(cmph_io_adapter_t));
- cmph_vector_t * cmph_vector = (cmph_vector_t *)malloc(sizeof(cmph_vector_t));
- assert(key_source);
- assert(cmph_vector);
-
- cmph_vector->vector = (void *)&sv;
- cmph_vector->position = 0;
- key_source->data = (void *)cmph_vector;
- key_source->nkeys = sv.size();
-
- return key_source;
- }
-
- template <typename ValueT, typename PosT, template <typename> class Allocator>
- int CmphStringVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen)
- {
- cmph_vector_t *cmph_vector = (cmph_vector_t *)data;
- StringVector<ValueT, PosT, Allocator>* sv = (StringVector<ValueT, PosT, Allocator>*)cmph_vector->vector;
- size_t size;
- *keylen = (*sv)[cmph_vector->position].size();
- size = *keylen;
- *key = new char[size + 1];
- std::string temp = (*sv)[cmph_vector->position];
- std::strcpy(*key, temp.c_str());
- cmph_vector->position = cmph_vector->position + 1;
- return (int)(*keylen);
- }
-
- void CmphStringVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen);
-
- void CmphStringVectorAdapterRewind(void *data);
-
- template <typename ValueT, typename PosT, template <typename> class Allocator>
- cmph_io_adapter_t* CmphStringVectorAdapter(StringVector<ValueT, PosT, Allocator>& sv)
- {
- cmph_io_adapter_t * key_source = CmphStringVectorAdapterNew(sv);
-
- key_source->read = CmphStringVectorAdapterRead<ValueT, PosT, Allocator>;
- key_source->dispose = CmphStringVectorAdapterDispose;
- key_source->rewind = CmphStringVectorAdapterRewind;
- return key_source;
- }
-
- //************************************************************************//
-
- cmph_io_adapter_t *CmphVectorAdapterNew(std::vector<std::string>& v);
-
- int CmphVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen);
-
- void CmphVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen);
-
- void CmphVectorAdapterRewind(void *data);
-
- cmph_io_adapter_t* CmphVectorAdapter(std::vector<std::string>& v);
-
+typedef struct {
+ void *vector;
+ cmph_uint32 position;
+}
+cmph_vector_t;
+
+
+template <typename ValueT, typename PosT, template <typename> class Allocator>
+cmph_io_adapter_t *CmphStringVectorAdapterNew(StringVector<ValueT, PosT, Allocator>& sv)
+{
+ cmph_io_adapter_t * key_source = (cmph_io_adapter_t *)malloc(sizeof(cmph_io_adapter_t));
+ cmph_vector_t * cmph_vector = (cmph_vector_t *)malloc(sizeof(cmph_vector_t));
+ assert(key_source);
+ assert(cmph_vector);
+
+ cmph_vector->vector = (void *)&sv;
+ cmph_vector->position = 0;
+ key_source->data = (void *)cmph_vector;
+ key_source->nkeys = sv.size();
+
+ return key_source;
+}
+
+template <typename ValueT, typename PosT, template <typename> class Allocator>
+int CmphStringVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen)
+{
+ cmph_vector_t *cmph_vector = (cmph_vector_t *)data;
+ StringVector<ValueT, PosT, Allocator>* sv = (StringVector<ValueT, PosT, Allocator>*)cmph_vector->vector;
+ size_t size;
+ *keylen = (*sv)[cmph_vector->position].size();
+ size = *keylen;
+ *key = new char[size + 1];
+ std::string temp = (*sv)[cmph_vector->position];
+ std::strcpy(*key, temp.c_str());
+ cmph_vector->position = cmph_vector->position + 1;
+ return (int)(*keylen);
+}
+
+void CmphStringVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen);
+
+void CmphStringVectorAdapterRewind(void *data);
+
+template <typename ValueT, typename PosT, template <typename> class Allocator>
+cmph_io_adapter_t* CmphStringVectorAdapter(StringVector<ValueT, PosT, Allocator>& sv)
+{
+ cmph_io_adapter_t * key_source = CmphStringVectorAdapterNew(sv);
+
+ key_source->read = CmphStringVectorAdapterRead<ValueT, PosT, Allocator>;
+ key_source->dispose = CmphStringVectorAdapterDispose;
+ key_source->rewind = CmphStringVectorAdapterRewind;
+ return key_source;
+}
+
+//************************************************************************//
+
+cmph_io_adapter_t *CmphVectorAdapterNew(std::vector<std::string>& v);
+
+int CmphVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen);
+
+void CmphVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen);
+
+void CmphVectorAdapterRewind(void *data);
+
+cmph_io_adapter_t* CmphVectorAdapter(std::vector<std::string>& v);
+
}
#endif
diff --git a/moses/TranslationModel/CompactPT/ConsistentPhrases.h b/moses/TranslationModel/CompactPT/ConsistentPhrases.h
index 0ec86e1ac..c7b7c733b 100644
--- a/moses/TranslationModel/CompactPT/ConsistentPhrases.h
+++ b/moses/TranslationModel/CompactPT/ConsistentPhrases.h
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#ifndef moses_ConsistentPhrases_h
#define moses_ConsistentPhrases_h
@@ -29,97 +29,82 @@ namespace Moses
class ConsistentPhrases
{
- public:
- struct Phrase
- {
- int i, j, m, n;
- Phrase(int i_, int m_, int j_, int n_) : i(i_), j(j_), m(m_), n(n_) { }
- };
-
- struct PhraseSorter
- {
- bool operator()(Phrase a, Phrase b)
- {
- if(a.n > b.n)
- return true;
- if(a.n == b.n && a.j < b.j)
- return true;
- if(a.n == b.n && a.j == b.j && a.m > b.m)
- return true;
- if(a.n == b.n && a.j == b.j && a.m == b.m && a.i < b.i)
- return true;
- return false;
- }
- };
-
- private:
- typedef std::set<Phrase, PhraseSorter> PhraseQueue;
- PhraseQueue m_phraseQueue;
-
- typedef std::pair<unsigned char, unsigned char> AlignPoint;
- typedef std::set<AlignPoint> Alignment;
-
- public:
-
- ConsistentPhrases(int mmax, int nmax, Alignment& a)
- {
- for(int i = 0; i < mmax; i++)
- {
- for(int m = 1; m <= mmax-i; m++)
- {
- for(int j = 0; j < nmax; j++)
- {
- for(int n = 1; n <= nmax-j; n++)
- {
- bool consistant = true;
- for(Alignment::iterator it = a.begin(); it != a.end(); it++)
- {
- int ip = it->first;
- int jp = it->second;
- if((i <= ip && ip < i+m) != (j <= jp && jp < j+n))
- {
- consistant = false;
- break;
- }
+public:
+ struct Phrase {
+ int i, j, m, n;
+ Phrase(int i_, int m_, int j_, int n_) : i(i_), j(j_), m(m_), n(n_) { }
+ };
+
+ struct PhraseSorter {
+ bool operator()(Phrase a, Phrase b) {
+ if(a.n > b.n)
+ return true;
+ if(a.n == b.n && a.j < b.j)
+ return true;
+ if(a.n == b.n && a.j == b.j && a.m > b.m)
+ return true;
+ if(a.n == b.n && a.j == b.j && a.m == b.m && a.i < b.i)
+ return true;
+ return false;
+ }
+ };
+
+private:
+ typedef std::set<Phrase, PhraseSorter> PhraseQueue;
+ PhraseQueue m_phraseQueue;
+
+ typedef std::pair<unsigned char, unsigned char> AlignPoint;
+ typedef std::set<AlignPoint> Alignment;
+
+public:
+
+ ConsistentPhrases(int mmax, int nmax, Alignment& a) {
+ for(int i = 0; i < mmax; i++) {
+ for(int m = 1; m <= mmax-i; m++) {
+ for(int j = 0; j < nmax; j++) {
+ for(int n = 1; n <= nmax-j; n++) {
+ bool consistant = true;
+ for(Alignment::iterator it = a.begin(); it != a.end(); it++) {
+ int ip = it->first;
+ int jp = it->second;
+ if((i <= ip && ip < i+m) != (j <= jp && jp < j+n)) {
+ consistant = false;
+ break;
}
- if(consistant)
- m_phraseQueue.insert(Phrase(i, m, j, n));
- }
- }
- }
+ }
+ if(consistant)
+ m_phraseQueue.insert(Phrase(i, m, j, n));
+ }
+ }
}
- m_phraseQueue.erase(Phrase(0, mmax, 0, nmax));
- }
-
- size_t Empty()
- {
- return !m_phraseQueue.size();
}
-
- Phrase Pop()
- {
- if(m_phraseQueue.size())
- {
- Phrase p = *m_phraseQueue.begin();
- m_phraseQueue.erase(m_phraseQueue.begin());
- return p;
- }
- return Phrase(0,0,0,0);
+ m_phraseQueue.erase(Phrase(0, mmax, 0, nmax));
+ }
+
+ size_t Empty() {
+ return !m_phraseQueue.size();
+ }
+
+ Phrase Pop() {
+ if(m_phraseQueue.size()) {
+ Phrase p = *m_phraseQueue.begin();
+ m_phraseQueue.erase(m_phraseQueue.begin());
+ return p;
}
-
- void RemoveOverlap(Phrase p)
- {
- PhraseQueue ok;
- for(PhraseQueue::iterator it = m_phraseQueue.begin(); it != m_phraseQueue.end(); it++)
- {
- Phrase pp = *it;
- if(!((p.i <= pp.i && pp.i < p.i + p.m) || (pp.i <= p.i && p.i < pp.i + pp.m) ||
- (p.j <= pp.j && pp.j < p.j + p.n) || (pp.j <= p.j && p.j < pp.j + pp.n)))
- ok.insert(pp);
- }
- m_phraseQueue = ok;
+ return Phrase(0,0,0,0);
+ }
+
+ void RemoveOverlap(Phrase p) {
+ PhraseQueue ok;
+ for(PhraseQueue::iterator it = m_phraseQueue.begin(); it != m_phraseQueue.end(); it++) {
+ Phrase pp = *it;
+ if(!((p.i <= pp.i && pp.i < p.i + p.m) || (pp.i <= p.i && p.i < pp.i + pp.m) ||
+ (p.j <= pp.j && pp.j < p.j + p.n) || (pp.j <= p.j && p.j < pp.j + pp.n)))
+ ok.insert(pp);
}
-
+ m_phraseQueue = ok;
+ }
+
};
}
diff --git a/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp b/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp
index ff1c663c9..ad7591a7b 100644
--- a/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp
+++ b/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp
@@ -1,27 +1,28 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#include "LexicalReorderingTableCompact.h"
-namespace Moses {
+namespace Moses
+{
LexicalReorderingTableCompact::LexicalReorderingTableCompact(
const std::string& filePath,
@@ -29,9 +30,9 @@ LexicalReorderingTableCompact::LexicalReorderingTableCompact(
const std::vector<FactorType>& e_factors,
const std::vector<FactorType>& c_factors)
: LexicalReorderingTable(f_factors, e_factors, c_factors),
- m_inMemory(StaticData::Instance().UseMinlexrInMemory()),
- m_numScoreComponent(6), m_multipleScoreTrees(true),
- m_hash(10, 16), m_scoreTrees(1)
+ m_inMemory(StaticData::Instance().UseMinlexrInMemory()),
+ m_numScoreComponent(6), m_multipleScoreTrees(true),
+ m_hash(10, 16), m_scoreTrees(1)
{
Load(filePath);
}
@@ -41,12 +42,13 @@ LexicalReorderingTableCompact::LexicalReorderingTableCompact(
const std::vector<FactorType>& e_factors,
const std::vector<FactorType>& c_factors)
: LexicalReorderingTable(f_factors, e_factors, c_factors),
- m_inMemory(StaticData::Instance().UseMinlexrInMemory()),
- m_numScoreComponent(6), m_multipleScoreTrees(true),
- m_hash(10, 16), m_scoreTrees(1)
+ m_inMemory(StaticData::Instance().UseMinlexrInMemory()),
+ m_numScoreComponent(6), m_multipleScoreTrees(true),
+ m_hash(10, 16), m_scoreTrees(1)
{ }
-LexicalReorderingTableCompact::~LexicalReorderingTableCompact() {
+LexicalReorderingTableCompact::~LexicalReorderingTableCompact()
+{
for(size_t i = 0; i < m_scoreTrees.size(); i++)
delete m_scoreTrees[i];
}
@@ -57,25 +59,23 @@ std::vector<float> LexicalReorderingTableCompact::GetScore(const Phrase& f,
{
std::string key;
Scores scores;
-
+
if(0 == c.GetSize())
key = MakeKey(f, e, c);
else
- for(size_t i = 0; i <= c.GetSize(); ++i)
- {
+ for(size_t i = 0; i <= c.GetSize(); ++i) {
Phrase sub_c(c.GetSubString(WordsRange(i,c.GetSize()-1)));
key = MakeKey(f,e,sub_c);
}
-
+
size_t index = m_hash[key];
- if(m_hash.GetSize() != index)
- {
+ if(m_hash.GetSize() != index) {
std::string scoresString;
if(m_inMemory)
scoresString = m_scoresMemory[index];
else
scoresString = m_scoresMapped[index];
-
+
BitWrapper<> bitStream(scoresString);
for(size_t i = 0; i < m_numScoreComponent; i++)
scores.push_back(m_scoreTrees[m_multipleScoreTrees ? i : 0]->Read(bitStream));
@@ -100,22 +100,17 @@ std::string LexicalReorderingTableCompact::MakeKey(const std::string& f,
const std::string& c) const
{
std::string key;
- if(!f.empty())
- {
+ if(!f.empty()) {
key += f;
}
- if(!m_FactorsE.empty())
- {
- if(!key.empty())
- {
+ if(!m_FactorsE.empty()) {
+ if(!key.empty()) {
key += " ||| ";
}
key += e;
}
- if(!m_FactorsC.empty())
- {
- if(!key.empty())
- {
+ if(!m_FactorsC.empty()) {
+ if(!key.empty()) {
key += " ||| ";
}
key += c;
@@ -133,48 +128,43 @@ LexicalReorderingTable* LexicalReorderingTableCompact::CheckAndLoad(
#ifdef HAVE_CMPH
std::string minlexr = ".minlexr";
// file name is specified without suffix
- if(FileExists(filePath + minlexr))
- {
+ if(FileExists(filePath + minlexr)) {
//there exists a compact binary version use that
- VERBOSE(2,"Using compact lexical reordering table" << std::endl);
- return new LexicalReorderingTableCompact(filePath + minlexr, f_factors, e_factors, c_factors);
+ VERBOSE(2,"Using compact lexical reordering table" << std::endl);
+ return new LexicalReorderingTableCompact(filePath + minlexr, f_factors, e_factors, c_factors);
}
// file name is specified with suffix
if(filePath.substr(filePath.length() - minlexr.length(), minlexr.length()) == minlexr
- && FileExists(filePath))
- {
+ && FileExists(filePath)) {
//there exists a compact binary version use that
- VERBOSE(2,"Using compact lexical reordering table" << std::endl);
- return new LexicalReorderingTableCompact(filePath, f_factors, e_factors, c_factors);
+ VERBOSE(2,"Using compact lexical reordering table" << std::endl);
+ return new LexicalReorderingTableCompact(filePath, f_factors, e_factors, c_factors);
}
#endif
return 0;
}
void LexicalReorderingTableCompact::Load(std::string filePath)
-{
+{
std::FILE* pFile = std::fopen(filePath.c_str(), "r");
if(m_inMemory)
m_hash.Load(pFile);
else
m_hash.LoadIndex(pFile);
-
+
size_t read = 0;
read += std::fread(&m_numScoreComponent, sizeof(m_numScoreComponent), 1, pFile);
read += std::fread(&m_multipleScoreTrees, sizeof(m_multipleScoreTrees), 1, pFile);
-
- if(m_multipleScoreTrees)
- {
+
+ if(m_multipleScoreTrees) {
m_scoreTrees.resize(m_numScoreComponent);
for(size_t i = 0; i < m_numScoreComponent; i++)
m_scoreTrees[i] = new CanonicalHuffman<float>(pFile);
- }
- else
- {
+ } else {
m_scoreTrees.resize(1);
m_scoreTrees[0] = new CanonicalHuffman<float>(pFile);
}
-
+
if(m_inMemory)
m_scoresMemory.load(pFile, false);
else
diff --git a/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.h b/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.h
index 849c61c08..46f2228c9 100644
--- a/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.h
+++ b/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.h
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#ifndef moses_LexicalReorderingTableCompact_h
#define moses_LexicalReorderingTableCompact_h
@@ -33,50 +33,51 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "CanonicalHuffman.h"
#include "StringVector.h"
-namespace Moses {
+namespace Moses
+{
class LexicalReorderingTableCompact: public LexicalReorderingTable
{
- private:
- bool m_inMemory;
-
- size_t m_numScoreComponent;
- bool m_multipleScoreTrees;
-
- BlockHashIndex m_hash;
-
- typedef CanonicalHuffman<float> ScoreTree;
- std::vector<ScoreTree*> m_scoreTrees;
-
- StringVector<unsigned char, unsigned long, MmapAllocator> m_scoresMapped;
- StringVector<unsigned char, unsigned long, std::allocator> m_scoresMemory;
-
- std::string MakeKey(const Phrase& f, const Phrase& e, const Phrase& c) const;
- std::string MakeKey(const std::string& f, const std::string& e, const std::string& c) const;
-
- public:
- LexicalReorderingTableCompact(
- const std::string& filePath,
- const std::vector<FactorType>& f_factors,
- const std::vector<FactorType>& e_factors,
- const std::vector<FactorType>& c_factors);
-
- LexicalReorderingTableCompact(
- const std::vector<FactorType>& f_factors,
- const std::vector<FactorType>& e_factors,
- const std::vector<FactorType>& c_factors);
-
- virtual ~LexicalReorderingTableCompact();
-
- virtual std::vector<float> GetScore(const Phrase& f, const Phrase& e, const Phrase& c);
-
- static LexicalReorderingTable* CheckAndLoad(
- const std::string& filePath,
- const std::vector<FactorType>& f_factors,
- const std::vector<FactorType>& e_factors,
- const std::vector<FactorType>& c_factors);
-
- void Load(std::string filePath);
+private:
+ bool m_inMemory;
+
+ size_t m_numScoreComponent;
+ bool m_multipleScoreTrees;
+
+ BlockHashIndex m_hash;
+
+ typedef CanonicalHuffman<float> ScoreTree;
+ std::vector<ScoreTree*> m_scoreTrees;
+
+ StringVector<unsigned char, unsigned long, MmapAllocator> m_scoresMapped;
+ StringVector<unsigned char, unsigned long, std::allocator> m_scoresMemory;
+
+ std::string MakeKey(const Phrase& f, const Phrase& e, const Phrase& c) const;
+ std::string MakeKey(const std::string& f, const std::string& e, const std::string& c) const;
+
+public:
+ LexicalReorderingTableCompact(
+ const std::string& filePath,
+ const std::vector<FactorType>& f_factors,
+ const std::vector<FactorType>& e_factors,
+ const std::vector<FactorType>& c_factors);
+
+ LexicalReorderingTableCompact(
+ const std::vector<FactorType>& f_factors,
+ const std::vector<FactorType>& e_factors,
+ const std::vector<FactorType>& c_factors);
+
+ virtual ~LexicalReorderingTableCompact();
+
+ virtual std::vector<float> GetScore(const Phrase& f, const Phrase& e, const Phrase& c);
+
+ static LexicalReorderingTable* CheckAndLoad(
+ const std::string& filePath,
+ const std::vector<FactorType>& f_factors,
+ const std::vector<FactorType>& e_factors,
+ const std::vector<FactorType>& c_factors);
+
+ void Load(std::string filePath);
};
}
diff --git a/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp b/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp
index a3eee1694..655ed01ca 100644
--- a/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp
+++ b/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#include "LexicalReorderingTableCreator.h"
#include "ThrowingFwrite.h"
@@ -25,7 +25,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "util/file.hh"
-namespace Moses {
+namespace Moses
+{
LexicalReorderingTableCreator::LexicalReorderingTableCreator(
std::string inPath, std::string outPath, std::string tempfilePath,
@@ -34,49 +35,47 @@ LexicalReorderingTableCreator::LexicalReorderingTableCreator(
#ifdef WITH_THREADS
, size_t threads
#endif
- )
+)
: m_inPath(inPath), m_outPath(outPath), m_tempfilePath(tempfilePath),
- m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits),
- m_numScoreComponent(0), m_multipleScoreTrees(multipleScoreTrees),
- m_quantize(quantize), m_separator(" ||| "),
- m_hash(m_orderBits, m_fingerPrintBits), m_lastFlushedLine(-1)
-#ifdef WITH_THREADS
- , m_threads(threads)
+ m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits),
+ m_numScoreComponent(0), m_multipleScoreTrees(multipleScoreTrees),
+ m_quantize(quantize), m_separator(" ||| "),
+ m_hash(m_orderBits, m_fingerPrintBits), m_lastFlushedLine(-1)
+#ifdef WITH_THREADS
+ , m_threads(threads)
#endif
-{
+{
PrintInfo();
-
+
m_outFile = std::fopen(m_outPath.c_str(), "w");
-
+
std::cerr << "Pass 1/2: Creating phrase index + Counting scores" << std::endl;
- m_hash.BeginSave(m_outFile);
+ m_hash.BeginSave(m_outFile);
if(tempfilePath.size()) {
MmapAllocator<unsigned char> allocEncoded(util::FMakeTemp(tempfilePath));
m_encodedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>(allocEncoded);
- }
- else {
+ } else {
m_encodedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>();
}
-
+
EncodeScores();
-
+
std::cerr << "Intermezzo: Calculating Huffman code sets" << std::endl;
CalcHuffmanCodes();
-
+
std::cerr << "Pass 2/2: Compressing scores" << std::endl;
-
-
- if(tempfilePath.size()) {
+
+
+ if(tempfilePath.size()) {
MmapAllocator<unsigned char> allocCompressed(util::FMakeTemp(tempfilePath));
m_compressedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>(allocCompressed);
- }
- else {
+ } else {
m_compressedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>();
}
CompressScores();
-
+
std::cerr << "Saving to " << m_outPath << std::endl;
Save();
std::cerr << "Done" << std::endl;
@@ -84,20 +83,20 @@ LexicalReorderingTableCreator::LexicalReorderingTableCreator(
}
void LexicalReorderingTableCreator::PrintInfo()
-{
+{
std::cerr << "Used options:" << std::endl;
std::cerr << "\tText reordering table will be read from: " << m_inPath << std::endl;
std::cerr << "\tOutput reordering table will be written to: " << m_outPath << std::endl;
std::cerr << "\tStep size for source landmark phrases: 2^" << m_orderBits << "=" << (1ul << m_orderBits) << std::endl;
std::cerr << "\tPhrase fingerprint size: " << m_fingerPrintBits << " bits / P(fp)=" << (float(1)/(1ul << m_fingerPrintBits)) << std::endl;
- std::cerr << "\tSingle Huffman code set for score components: " << (m_multipleScoreTrees ? "no" : "yes") << std::endl;
+ std::cerr << "\tSingle Huffman code set for score components: " << (m_multipleScoreTrees ? "no" : "yes") << std::endl;
std::cerr << "\tUsing score quantization: ";
if(m_quantize)
std::cerr << m_quantize << " best" << std::endl;
else
std::cerr << "no" << std::endl;
-
-#ifdef WITH_THREADS
+
+#ifdef WITH_THREADS
std::cerr << "\tRunning with " << m_threads << " threads" << std::endl;
#endif
std::cerr << std::endl;
@@ -109,7 +108,7 @@ LexicalReorderingTableCreator::~LexicalReorderingTableCreator()
delete m_scoreTrees[i];
delete m_scoreCounters[i];
}
-
+
delete m_encodedScores;
delete m_compressedScores;
}
@@ -121,9 +120,8 @@ void LexicalReorderingTableCreator::EncodeScores()
#ifdef WITH_THREADS
boost::thread_group threads;
- for (size_t i = 0; i < m_threads; ++i)
- {
- EncodingTaskReordering* et = new EncodingTaskReordering(inFile, *this);
+ for (size_t i = 0; i < m_threads; ++i) {
+ EncodingTaskReordering* et = new EncodingTaskReordering(inFile, *this);
threads.create_thread(*et);
}
threads.join_all();
@@ -136,17 +134,16 @@ void LexicalReorderingTableCreator::EncodeScores()
}
void LexicalReorderingTableCreator::CalcHuffmanCodes()
-{
+{
std::vector<ScoreTree*>::iterator treeIt = m_scoreTrees.begin();
for(std::vector<ScoreCounter*>::iterator it = m_scoreCounters.begin();
- it != m_scoreCounters.end(); it++)
- {
+ it != m_scoreCounters.end(); it++) {
if(m_quantize)
- (*it)->Quantize(m_quantize);
-
+ (*it)->Quantize(m_quantize);
+
std::cerr << "\tCreating Huffman codes for " << (*it)->Size()
- << " scores" << std::endl;
-
+ << " scores" << std::endl;
+
*treeIt = new ScoreTree((*it)->Begin(), (*it)->End());
treeIt++;
}
@@ -158,7 +155,7 @@ void LexicalReorderingTableCreator::CompressScores()
#ifdef WITH_THREADS
boost::thread_group threads;
for (size_t i = 0; i < m_threads; ++i) {
- CompressionTaskReordering* ct = new CompressionTaskReordering(*m_encodedScores, *this);
+ CompressionTaskReordering* ct = new CompressionTaskReordering(*m_encodedScores, *this);
threads.create_thread(*ct);
}
threads.join_all();
@@ -171,12 +168,12 @@ void LexicalReorderingTableCreator::CompressScores()
}
void LexicalReorderingTableCreator::Save()
-{
+{
ThrowingFwrite(&m_numScoreComponent, sizeof(m_numScoreComponent), 1, m_outFile);
ThrowingFwrite(&m_multipleScoreTrees, sizeof(m_multipleScoreTrees), 1, m_outFile);
for(size_t i = 0; i < m_scoreTrees.size(); i++)
m_scoreTrees[i]->Save(m_outFile);
-
+
m_compressedScores->save(m_outFile);
}
@@ -192,38 +189,37 @@ std::string LexicalReorderingTableCreator::EncodeLine(std::vector<std::string>&
{
std::string scoresString = tokens.back();
std::stringstream scoresStream;
-
+
std::vector<float> scores;
Tokenize<float>(scores, scoresString);
-
+
if(!m_numScoreComponent) {
m_numScoreComponent = scores.size();
m_scoreCounters.resize(m_multipleScoreTrees ? m_numScoreComponent : 1);
for(std::vector<ScoreCounter*>::iterator it = m_scoreCounters.begin();
it != m_scoreCounters.end(); it++)
- *it = new ScoreCounter();
+ *it = new ScoreCounter();
m_scoreTrees.resize(m_multipleScoreTrees ? m_numScoreComponent : 1);
}
-
+
if(m_numScoreComponent != scores.size()) {
std::cerr << "Error: Wrong number of scores detected ("
- << scores.size() << " != " << m_numScoreComponent << ") :" << std::endl;
+ << scores.size() << " != " << m_numScoreComponent << ") :" << std::endl;
std::cerr << "Line: " << tokens[0] << " ||| ... ||| " << scoresString << std::endl;
- abort();
+ abort();
}
-
+
size_t c = 0;
float score;
- while(c < m_numScoreComponent)
- {
+ while(c < m_numScoreComponent) {
score = scores[c];
score = FloorScore(TransformScore(score));
scoresStream.write((char*)&score, sizeof(score));
-
+
m_scoreCounters[m_multipleScoreTrees ? c : 0]->Increase(score);
c++;
}
-
+
return scoresStream.str();
}
@@ -232,25 +228,23 @@ void LexicalReorderingTableCreator::AddEncodedLine(PackedItem& pi)
m_queue.push(pi);
}
-void LexicalReorderingTableCreator::FlushEncodedQueue(bool force) {
- if(force || m_queue.size() > 10000)
- {
- while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine())
- {
+void LexicalReorderingTableCreator::FlushEncodedQueue(bool force)
+{
+ if(force || m_queue.size() > 10000) {
+ while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) {
PackedItem pi = m_queue.top();
m_queue.pop();
m_lastFlushedLine++;
-
- m_lastRange.push_back(pi.GetSrc());
+
+ m_lastRange.push_back(pi.GetSrc());
m_encodedScores->push_back(pi.GetTrg());
-
+
if((pi.GetLine()+1) % 100000 == 0)
- std::cerr << ".";
+ std::cerr << ".";
if((pi.GetLine()+1) % 5000000 == 0)
- std::cerr << "[" << (pi.GetLine()+1) << "]" << std::endl;
-
- if(m_lastRange.size() == (1ul << m_orderBits))
- {
+ std::cerr << "[" << (pi.GetLine()+1) << "]" << std::endl;
+
+ if(m_lastRange.size() == (1ul << m_orderBits)) {
m_hash.AddRange(m_lastRange);
m_hash.SaveLastRange();
m_hash.DropLastRange();
@@ -258,14 +252,13 @@ void LexicalReorderingTableCreator::FlushEncodedQueue(bool force) {
}
}
}
-
- if(force)
- {
+
+ if(force) {
m_lastFlushedLine = -1;
m_hash.AddRange(m_lastRange);
m_lastRange.clear();
-
+
#ifdef WITH_THREADS
m_hash.WaitAll();
#endif
@@ -278,56 +271,55 @@ void LexicalReorderingTableCreator::FlushEncodedQueue(bool force) {
}
}
-std::string LexicalReorderingTableCreator::CompressEncodedScores(std::string &encodedScores) {
+std::string LexicalReorderingTableCreator::CompressEncodedScores(std::string &encodedScores)
+{
std::stringstream encodedScoresStream(encodedScores);
encodedScoresStream.unsetf(std::ios::skipws);
-
+
std::string compressedScores;
BitWrapper<> compressedScoresStream(compressedScores);
-
+
size_t currScore = 0;
float score;
encodedScoresStream.read((char*) &score, sizeof(score));
-
+
while(encodedScoresStream) {
size_t index = currScore % m_scoreTrees.size();
-
+
if(m_quantize)
score = m_scoreCounters[index]->LowerBound(score);
-
+
m_scoreTrees[index]->Put(compressedScoresStream, score);
encodedScoresStream.read((char*) &score, sizeof(score));
currScore++;
}
-
+
return compressedScores;
}
-void LexicalReorderingTableCreator::AddCompressedScores(PackedItem& pi) {
- m_queue.push(pi);
+void LexicalReorderingTableCreator::AddCompressedScores(PackedItem& pi)
+{
+ m_queue.push(pi);
}
void LexicalReorderingTableCreator::FlushCompressedQueue(bool force)
-{
- if(force || m_queue.size() > 10000)
- {
- while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine())
- {
+{
+ if(force || m_queue.size() > 10000) {
+ while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) {
PackedItem pi = m_queue.top();
m_queue.pop();
m_lastFlushedLine++;
-
+
m_compressedScores->push_back(pi.GetTrg());
-
+
if((pi.GetLine()+1) % 100000 == 0)
- std::cerr << ".";
+ std::cerr << ".";
if((pi.GetLine()+1) % 5000000 == 0)
- std::cerr << "[" << (pi.GetLine()+1) << "]" << std::endl;
+ std::cerr << "[" << (pi.GetLine()+1) << "]" << std::endl;
}
}
-
- if(force)
- {
+
+ if(force) {
m_lastFlushedLine = -1;
std::cerr << std::endl << std::endl;
}
@@ -343,63 +335,61 @@ boost::mutex EncodingTaskReordering::m_fileMutex;
EncodingTaskReordering::EncodingTaskReordering(InputFileStream& inFile, LexicalReorderingTableCreator& creator)
: m_inFile(inFile), m_creator(creator) {}
-
+
void EncodingTaskReordering::operator()()
{
size_t lineNum = 0;
-
+
std::vector<std::string> lines;
size_t max_lines = 1000;
lines.reserve(max_lines);
-
+
{
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_fileMutex);
#endif
std::string line;
while(lines.size() < max_lines && std::getline(m_inFile, line))
- lines.push_back(line);
+ lines.push_back(line);
lineNum = m_lineNum;
m_lineNum += lines.size();
}
-
+
std::vector<PackedItem> result;
result.reserve(max_lines);
-
- while(lines.size())
- {
- for(size_t i = 0; i < lines.size(); i++)
- {
+
+ while(lines.size()) {
+ for(size_t i = 0; i < lines.size(); i++) {
std::vector<std::string> tokens;
Moses::TokenizeMultiCharSeparator(tokens, lines[i], m_creator.m_separator);
-
+
std::string encodedLine = m_creator.EncodeLine(tokens);
-
+
std::string f = tokens[0];
-
+
std::string e;
if(tokens.size() > 2)
e = tokens[1];
-
+
PackedItem packedItem(lineNum + i, m_creator.MakeSourceTargetKey(f, e),
encodedLine, i);
result.push_back(packedItem);
}
lines.clear();
-
+
{
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_mutex);
#endif
- for(size_t i = 0; i < result.size(); i++)
+ for(size_t i = 0; i < result.size(); i++)
m_creator.AddEncodedLine(result[i]);
- m_creator.FlushEncodedQueue();
+ m_creator.FlushEncodedQueue();
}
-
+
result.clear();
lines.reserve(max_lines);
result.reserve(max_lines);
-
+
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_fileMutex);
#endif
@@ -419,11 +409,11 @@ boost::mutex CompressionTaskReordering::m_mutex;
#endif
CompressionTaskReordering::CompressionTaskReordering(StringVector<unsigned char, unsigned long,
- MmapAllocator>& encodedScores,
- LexicalReorderingTableCreator& creator)
+ MmapAllocator>& encodedScores,
+ LexicalReorderingTableCreator& creator)
: m_encodedScores(encodedScores), m_creator(creator)
{ }
-
+
void CompressionTaskReordering::operator()()
{
size_t scoresNum;
@@ -434,12 +424,11 @@ void CompressionTaskReordering::operator()()
scoresNum = m_scoresNum;
m_scoresNum++;
}
-
- while(scoresNum < m_encodedScores.size())
- {
+
+ while(scoresNum < m_encodedScores.size()) {
std::string scores = m_encodedScores[scoresNum];
std::string compressedScores
- = m_creator.CompressEncodedScores(scores);
+ = m_creator.CompressEncodedScores(scores);
std::string dummy;
PackedItem packedItem(scoresNum, dummy, compressedScores, 0);
@@ -449,9 +438,9 @@ void CompressionTaskReordering::operator()()
#endif
m_creator.AddCompressedScores(packedItem);
m_creator.FlushCompressedQueue();
-
- scoresNum = m_scoresNum;
- m_scoresNum++;
+
+ scoresNum = m_scoresNum;
+ m_scoresNum++;
}
}
diff --git a/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h b/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h
index 2e202ce9b..1bf8444fe 100644
--- a/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h
+++ b/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h
@@ -1,139 +1,141 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#ifndef moses_LexicalReorderingTableCreator_h
#define moses_LexicalReorderingTableCreator_h
#include "PhraseTableCreator.h"
-namespace Moses {
-
-class LexicalReorderingTableCreator {
- private:
- std::string m_inPath;
- std::string m_outPath;
- std::string m_tempfilePath;
-
- std::FILE* m_outFile;
-
- size_t m_orderBits;
- size_t m_fingerPrintBits;
-
- size_t m_numScoreComponent;
-
- bool m_multipleScoreTrees;
- bool m_quantize;
-
- std::string m_separator;
-
- BlockHashIndex m_hash;
-
- typedef Counter<float> ScoreCounter;
- typedef CanonicalHuffman<float> ScoreTree;
-
- std::vector<ScoreCounter*> m_scoreCounters;
- std::vector<ScoreTree*> m_scoreTrees;
-
- StringVector<unsigned char, unsigned long, MmapAllocator>* m_encodedScores;
- StringVector<unsigned char, unsigned long, MmapAllocator>* m_compressedScores;
-
- std::priority_queue<PackedItem> m_queue;
- long m_lastFlushedLine;
- long m_lastFlushedSourceNum;
- std::string m_lastFlushedSourcePhrase;
- std::vector<std::string> m_lastRange;
-
-#ifdef WITH_THREADS
- size_t m_threads;
+namespace Moses
+{
+
+class LexicalReorderingTableCreator
+{
+private:
+ std::string m_inPath;
+ std::string m_outPath;
+ std::string m_tempfilePath;
+
+ std::FILE* m_outFile;
+
+ size_t m_orderBits;
+ size_t m_fingerPrintBits;
+
+ size_t m_numScoreComponent;
+
+ bool m_multipleScoreTrees;
+ bool m_quantize;
+
+ std::string m_separator;
+
+ BlockHashIndex m_hash;
+
+ typedef Counter<float> ScoreCounter;
+ typedef CanonicalHuffman<float> ScoreTree;
+
+ std::vector<ScoreCounter*> m_scoreCounters;
+ std::vector<ScoreTree*> m_scoreTrees;
+
+ StringVector<unsigned char, unsigned long, MmapAllocator>* m_encodedScores;
+ StringVector<unsigned char, unsigned long, MmapAllocator>* m_compressedScores;
+
+ std::priority_queue<PackedItem> m_queue;
+ long m_lastFlushedLine;
+ long m_lastFlushedSourceNum;
+ std::string m_lastFlushedSourcePhrase;
+ std::vector<std::string> m_lastRange;
+
+#ifdef WITH_THREADS
+ size_t m_threads;
#endif
-
- void PrintInfo();
-
- void EncodeScores();
- void CalcHuffmanCodes();
- void CompressScores();
- void Save();
-
- std::string MakeSourceTargetKey(std::string&, std::string&);
-
- std::string EncodeLine(std::vector<std::string>& tokens);
- void AddEncodedLine(PackedItem& pi);
- void FlushEncodedQueue(bool force = false);
-
- std::string CompressEncodedScores(std::string &encodedScores);
- void AddCompressedScores(PackedItem& pi);
- void FlushCompressedQueue(bool force = false);
-
- public:
- LexicalReorderingTableCreator(std::string inPath,
- std::string outPath,
- std::string tempfilePath,
- size_t orderBits = 10,
- size_t fingerPrintBits = 16,
- bool multipleScoreTrees = true,
- size_t quantize = 0
+
+ void PrintInfo();
+
+ void EncodeScores();
+ void CalcHuffmanCodes();
+ void CompressScores();
+ void Save();
+
+ std::string MakeSourceTargetKey(std::string&, std::string&);
+
+ std::string EncodeLine(std::vector<std::string>& tokens);
+ void AddEncodedLine(PackedItem& pi);
+ void FlushEncodedQueue(bool force = false);
+
+ std::string CompressEncodedScores(std::string &encodedScores);
+ void AddCompressedScores(PackedItem& pi);
+ void FlushCompressedQueue(bool force = false);
+
+public:
+ LexicalReorderingTableCreator(std::string inPath,
+ std::string outPath,
+ std::string tempfilePath,
+ size_t orderBits = 10,
+ size_t fingerPrintBits = 16,
+ bool multipleScoreTrees = true,
+ size_t quantize = 0
#ifdef WITH_THREADS
- , size_t threads = 2
-#endif
- );
-
- ~LexicalReorderingTableCreator();
-
+ , size_t threads = 2
+#endif
+ );
+
+ ~LexicalReorderingTableCreator();
+
friend class EncodingTaskReordering;
friend class CompressionTaskReordering;
};
class EncodingTaskReordering
{
- private:
+private:
#ifdef WITH_THREADS
- static boost::mutex m_mutex;
- static boost::mutex m_fileMutex;
+ static boost::mutex m_mutex;
+ static boost::mutex m_fileMutex;
#endif
- static size_t m_lineNum;
- static size_t m_sourcePhraseNum;
- static std::string m_lastSourcePhrase;
-
- InputFileStream& m_inFile;
- LexicalReorderingTableCreator& m_creator;
-
- public:
- EncodingTaskReordering(InputFileStream& inFile, LexicalReorderingTableCreator& creator);
- void operator()();
+ static size_t m_lineNum;
+ static size_t m_sourcePhraseNum;
+ static std::string m_lastSourcePhrase;
+
+ InputFileStream& m_inFile;
+ LexicalReorderingTableCreator& m_creator;
+
+public:
+ EncodingTaskReordering(InputFileStream& inFile, LexicalReorderingTableCreator& creator);
+ void operator()();
};
class CompressionTaskReordering
{
- private:
+private:
#ifdef WITH_THREADS
- static boost::mutex m_mutex;
+ static boost::mutex m_mutex;
#endif
- static size_t m_scoresNum;
- StringVector<unsigned char, unsigned long, MmapAllocator> &m_encodedScores;
- LexicalReorderingTableCreator &m_creator;
-
- public:
- CompressionTaskReordering(StringVector<unsigned char, unsigned long, MmapAllocator>&
- m_encodedScores, LexicalReorderingTableCreator& creator);
- void operator()();
+ static size_t m_scoresNum;
+ StringVector<unsigned char, unsigned long, MmapAllocator> &m_encodedScores;
+ LexicalReorderingTableCreator &m_creator;
+
+public:
+ CompressionTaskReordering(StringVector<unsigned char, unsigned long, MmapAllocator>&
+ m_encodedScores, LexicalReorderingTableCreator& creator);
+ void operator()();
};
}
diff --git a/moses/TranslationModel/CompactPT/ListCoders.h b/moses/TranslationModel/CompactPT/ListCoders.h
index 329e1297a..b41e183ce 100644
--- a/moses/TranslationModel/CompactPT/ListCoders.h
+++ b/moses/TranslationModel/CompactPT/ListCoders.h
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#ifndef moses_ListCoders_h
#define moses_ListCoders_h
@@ -31,94 +31,86 @@ namespace Moses
template <typename T = unsigned int>
class VarIntType
{
- private:
- template <typename IntType, typename OutIt>
- static void EncodeSymbol(IntType input, OutIt output)
- {
- if(input == 0)
- {
- *output = 0;
- output++;
- return;
- }
-
- T msb = 1 << (sizeof(T)*8-1);
- IntType mask = ~msb;
- IntType shift = (sizeof(T)*8-1);
-
- while(input)
- {
- T res = input & mask;
- input >>= shift;
- if(input)
- res |= msb;
- *output = res;
- output++;
- }
- };
-
- template <typename InIt, typename IntType>
- static void DecodeSymbol(InIt &it, InIt end, IntType &output)
- {
- T msb = 1 << (sizeof(T)*8-1);
- IntType shift = (sizeof(T)*8-1);
-
- output = 0;
- size_t i = 0;
- while(it != end && *it & msb) {
- IntType temp = *it & ~msb;
- temp <<= shift*i;
- output |= temp;
- it++; i++;
- }
- assert(it != end);
-
- IntType temp = *it;
+private:
+ template <typename IntType, typename OutIt>
+ static void EncodeSymbol(IntType input, OutIt output) {
+ if(input == 0) {
+ *output = 0;
+ output++;
+ return;
+ }
+
+ T msb = 1 << (sizeof(T)*8-1);
+ IntType mask = ~msb;
+ IntType shift = (sizeof(T)*8-1);
+
+ while(input) {
+ T res = input & mask;
+ input >>= shift;
+ if(input)
+ res |= msb;
+ *output = res;
+ output++;
+ }
+ };
+
+ template <typename InIt, typename IntType>
+ static void DecodeSymbol(InIt &it, InIt end, IntType &output) {
+ T msb = 1 << (sizeof(T)*8-1);
+ IntType shift = (sizeof(T)*8-1);
+
+ output = 0;
+ size_t i = 0;
+ while(it != end && *it & msb) {
+ IntType temp = *it & ~msb;
temp <<= shift*i;
output |= temp;
it++;
+ i++;
}
+ assert(it != end);
- public:
-
- template <typename InIt, typename OutIt>
- static void Encode(InIt it, InIt end, OutIt outIt)
- {
- while(it != end)
- {
- EncodeSymbol(*it, outIt);
- it++;
- }
+ IntType temp = *it;
+ temp <<= shift*i;
+ output |= temp;
+ it++;
+ }
+
+public:
+
+ template <typename InIt, typename OutIt>
+ static void Encode(InIt it, InIt end, OutIt outIt) {
+ while(it != end) {
+ EncodeSymbol(*it, outIt);
+ it++;
}
-
- template <typename InIt, typename OutIt>
- static void Decode(InIt &it, InIt end, OutIt outIt)
- {
- while(it != end)
- {
- size_t output;
- DecodeSymbol(it, end, output);
- *outIt = output;
- outIt++;
- }
+ }
+
+ template <typename InIt, typename OutIt>
+ static void Decode(InIt &it, InIt end, OutIt outIt) {
+ while(it != end) {
+ size_t output;
+ DecodeSymbol(it, end, output);
+ *outIt = output;
+ outIt++;
}
-
- template <typename InIt>
- static size_t DecodeAndSum(InIt &it, InIt end, size_t num)
- {
- size_t sum = 0;
- size_t curr = 0;
-
- while(it != end && curr < num)
- {
- size_t output;
- DecodeSymbol(it, end, output);
- sum += output; curr++;
- }
-
- return sum;
+ }
+
+ template <typename InIt>
+ static size_t DecodeAndSum(InIt &it, InIt end, size_t num) {
+ size_t sum = 0;
+ size_t curr = 0;
+
+ while(it != end && curr < num) {
+ size_t output;
+ DecodeSymbol(it, end, output);
+ sum += output;
+ curr++;
}
+ return sum;
+ }
+
};
typedef VarIntType<unsigned char> VarByte;
@@ -129,179 +121,262 @@ typedef VarIntType<unsigned int> VarInt32;
class Simple9
{
- private:
- typedef unsigned int uint;
-
- template <typename InIt>
- inline static void EncodeSymbol(uint &output, InIt it, InIt end)
- {
- uint length = end - it;
-
- uint type = 0;
- uint bitlength = 0;
-
- switch(length)
- {
- case 1: type = 1; bitlength = 28; break;
- case 2: type = 2; bitlength = 14; break;
- case 3: type = 3; bitlength = 9; break;
- case 4: type = 4; bitlength = 7; break;
- case 5: type = 5; bitlength = 5; break;
- case 7: type = 6; bitlength = 4; break;
- case 9: type = 7; bitlength = 3; break;
- case 14: type = 8; bitlength = 2; break;
- case 28: type = 9; bitlength = 1; break;
- }
-
- output = 0;
- output |= (type << 28);
-
- uint i = 0;
- while(it != end)
- {
- uint l = bitlength * (length-i-1);
- output |= *it << l;
- it++;
- i++;
- }
+private:
+ typedef unsigned int uint;
+
+ template <typename InIt>
+ inline static void EncodeSymbol(uint &output, InIt it, InIt end) {
+ uint length = end - it;
+
+ uint type = 0;
+ uint bitlength = 0;
+
+ switch(length) {
+ case 1:
+ type = 1;
+ bitlength = 28;
+ break;
+ case 2:
+ type = 2;
+ bitlength = 14;
+ break;
+ case 3:
+ type = 3;
+ bitlength = 9;
+ break;
+ case 4:
+ type = 4;
+ bitlength = 7;
+ break;
+ case 5:
+ type = 5;
+ bitlength = 5;
+ break;
+ case 7:
+ type = 6;
+ bitlength = 4;
+ break;
+ case 9:
+ type = 7;
+ bitlength = 3;
+ break;
+ case 14:
+ type = 8;
+ bitlength = 2;
+ break;
+ case 28:
+ type = 9;
+ bitlength = 1;
+ break;
}
-
- template <typename OutIt>
- static inline void DecodeSymbol(uint input, OutIt outIt)
- {
- uint type = (input >> 28);
-
- uint bitlen = 0;
- uint shift = 0;
- uint mask = 0;
-
- switch(type)
- {
- case 1: bitlen = 28; shift = 0; mask = 268435455; break;
- case 2: bitlen = 14; shift = 14; mask = 16383; break;
- case 3: bitlen = 9; shift = 18; mask = 511; break;
- case 4: bitlen = 7; shift = 21; mask = 127; break;
- case 5: bitlen = 5; shift = 20; mask = 31; break;
- case 6: bitlen = 4; shift = 24; mask = 15; break;
- case 7: bitlen = 3; shift = 24; mask = 7; break;
- case 8: bitlen = 2; shift = 26; mask = 3; break;
- case 9: bitlen = 1; shift = 27; mask = 1; break;
- }
-
- while(shift > 0)
- {
- *outIt = (input >> shift) & mask;
- shift -= bitlen;
- outIt++;
- }
- *outIt = input & mask;
- outIt++;
+
+ output = 0;
+ output |= (type << 28);
+
+ uint i = 0;
+ while(it != end) {
+ uint l = bitlength * (length-i-1);
+ output |= *it << l;
+ it++;
+ i++;
}
-
- static inline size_t DecodeAndSumSymbol(uint input, size_t num, size_t &curr)
- {
- uint type = (input >> 28);
-
- uint bitlen = 0;
- uint shift = 0;
- uint mask = 0;
-
- switch(type)
- {
- case 1: bitlen = 28; shift = 0; mask = 268435455; break;
- case 2: bitlen = 14; shift = 14; mask = 16383; break;
- case 3: bitlen = 9; shift = 18; mask = 511; break;
- case 4: bitlen = 7; shift = 21; mask = 127; break;
- case 5: bitlen = 5; shift = 20; mask = 31; break;
- case 6: bitlen = 4; shift = 24; mask = 15; break;
- case 7: bitlen = 3; shift = 24; mask = 7; break;
- case 8: bitlen = 2; shift = 26; mask = 3; break;
- case 9: bitlen = 1; shift = 27; mask = 1; break;
- }
+ }
- size_t sum = 0;
- while(shift > 0)
- {
- sum += (input >> shift) & mask;
- shift -= bitlen;
- if(++curr == num)
- return sum;
- }
- sum += input & mask;
- curr++;
- return sum;
+ template <typename OutIt>
+ static inline void DecodeSymbol(uint input, OutIt outIt) {
+ uint type = (input >> 28);
+
+ uint bitlen = 0;
+ uint shift = 0;
+ uint mask = 0;
+
+ switch(type) {
+ case 1:
+ bitlen = 28;
+ shift = 0;
+ mask = 268435455;
+ break;
+ case 2:
+ bitlen = 14;
+ shift = 14;
+ mask = 16383;
+ break;
+ case 3:
+ bitlen = 9;
+ shift = 18;
+ mask = 511;
+ break;
+ case 4:
+ bitlen = 7;
+ shift = 21;
+ mask = 127;
+ break;
+ case 5:
+ bitlen = 5;
+ shift = 20;
+ mask = 31;
+ break;
+ case 6:
+ bitlen = 4;
+ shift = 24;
+ mask = 15;
+ break;
+ case 7:
+ bitlen = 3;
+ shift = 24;
+ mask = 7;
+ break;
+ case 8:
+ bitlen = 2;
+ shift = 26;
+ mask = 3;
+ break;
+ case 9:
+ bitlen = 1;
+ shift = 27;
+ mask = 1;
+ break;
}
-
- public:
- template <typename InIt, typename OutIt>
- static void Encode(InIt it, InIt end, OutIt outIt)
- {
- uint parts[] = { 1, 2, 3, 4, 5, 7, 9, 14, 28 };
-
- uint buffer[28];
- for(InIt i = it; i < end; i++)
- {
- uint lastbit = 1;
- uint lastpos = 0;
- uint lastyes = 0;
- uint j = 0;
-
- double log2 = log(2);
- while(j < 9 && lastpos < 28 && (i+lastpos) < end)
- {
- if(lastpos >= parts[j])
- j++;
-
- buffer[lastpos] = *(i + lastpos);
-
- uint reqbit = ceil(log(buffer[lastpos]+1)/log2);
- assert(reqbit <= 28);
-
- uint bit = 28/floor(28/reqbit);
- if(lastbit < bit)
- lastbit = bit;
-
- if(parts[j] > 28/lastbit)
- break;
- else if(lastpos == parts[j]-1)
- lastyes = lastpos;
-
- lastpos++;
- }
- i += lastyes;
-
- uint length = lastyes + 1;
- uint output;
- EncodeSymbol(output, buffer, buffer + length);
-
- *outIt = output;
- outIt++;
- }
+
+ while(shift > 0) {
+ *outIt = (input >> shift) & mask;
+ shift -= bitlen;
+ outIt++;
}
-
- template <typename InIt, typename OutIt>
- static void Decode(InIt &it, InIt end, OutIt outIt)
- {
- while(it != end)
- {
- DecodeSymbol(*it, outIt);
- it++;
- }
+ *outIt = input & mask;
+ outIt++;
+ }
+
+ static inline size_t DecodeAndSumSymbol(uint input, size_t num, size_t &curr) {
+ uint type = (input >> 28);
+
+ uint bitlen = 0;
+ uint shift = 0;
+ uint mask = 0;
+
+ switch(type) {
+ case 1:
+ bitlen = 28;
+ shift = 0;
+ mask = 268435455;
+ break;
+ case 2:
+ bitlen = 14;
+ shift = 14;
+ mask = 16383;
+ break;
+ case 3:
+ bitlen = 9;
+ shift = 18;
+ mask = 511;
+ break;
+ case 4:
+ bitlen = 7;
+ shift = 21;
+ mask = 127;
+ break;
+ case 5:
+ bitlen = 5;
+ shift = 20;
+ mask = 31;
+ break;
+ case 6:
+ bitlen = 4;
+ shift = 24;
+ mask = 15;
+ break;
+ case 7:
+ bitlen = 3;
+ shift = 24;
+ mask = 7;
+ break;
+ case 8:
+ bitlen = 2;
+ shift = 26;
+ mask = 3;
+ break;
+ case 9:
+ bitlen = 1;
+ shift = 27;
+ mask = 1;
+ break;
+ }
+
+ size_t sum = 0;
+ while(shift > 0) {
+ sum += (input >> shift) & mask;
+ shift -= bitlen;
+ if(++curr == num)
+ return sum;
}
-
- template <typename InIt>
- static size_t DecodeAndSum(InIt &it, InIt end, size_t num)
- {
- size_t sum = 0;
- size_t curr = 0;
- while(it != end && curr < num)
- {
- sum += DecodeAndSumSymbol(*it, num, curr);
- it++;
+ sum += input & mask;
+ curr++;
+ return sum;
+ }
+
+public:
+ template <typename InIt, typename OutIt>
+ static void Encode(InIt it, InIt end, OutIt outIt) {
+ uint parts[] = { 1, 2, 3, 4, 5, 7, 9, 14, 28 };
+
+ uint buffer[28];
+ for(InIt i = it; i < end; i++) {
+ uint lastbit = 1;
+ uint lastpos = 0;
+ uint lastyes = 0;
+ uint j = 0;
+
+ double log2 = log(2);
+ while(j < 9 && lastpos < 28 && (i+lastpos) < end) {
+ if(lastpos >= parts[j])
+ j++;
+
+ buffer[lastpos] = *(i + lastpos);
+
+ uint reqbit = ceil(log(buffer[lastpos]+1)/log2);
+ assert(reqbit <= 28);
+
+ uint bit = 28/floor(28/reqbit);
+ if(lastbit < bit)
+ lastbit = bit;
+
+ if(parts[j] > 28/lastbit)
+ break;
+ else if(lastpos == parts[j]-1)
+ lastyes = lastpos;
+
+ lastpos++;
}
- assert(curr == num);
- return sum;
+ i += lastyes;
+
+ uint length = lastyes + 1;
+ uint output;
+ EncodeSymbol(output, buffer, buffer + length);
+
+ *outIt = output;
+ outIt++;
+ }
+ }
+
+ template <typename InIt, typename OutIt>
+ static void Decode(InIt &it, InIt end, OutIt outIt) {
+ while(it != end) {
+ DecodeSymbol(*it, outIt);
+ it++;
+ }
+ }
+
+ template <typename InIt>
+ static size_t DecodeAndSum(InIt &it, InIt end, size_t num) {
+ size_t sum = 0;
+ size_t curr = 0;
+ while(it != end && curr < num) {
+ sum += DecodeAndSumSymbol(*it, num, curr);
+ it++;
}
+ assert(curr == num);
+ return sum;
+ }
};
}
diff --git a/moses/TranslationModel/CompactPT/MmapAllocator.h b/moses/TranslationModel/CompactPT/MmapAllocator.h
index 049c0149d..7cd6dd49e 100644
--- a/moses/TranslationModel/CompactPT/MmapAllocator.h
+++ b/moses/TranslationModel/CompactPT/MmapAllocator.h
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#ifndef moses_MmapAllocator_h
#define moses_MmapAllocator_h
@@ -30,175 +30,161 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
namespace Moses
{
- template <class T>
- class MmapAllocator
- {
- protected:
- std::FILE* m_file_ptr;
- size_t m_file_desc;
-
- size_t m_page_size;
- size_t m_map_size;
-
- char* m_data_ptr;
- size_t m_data_offset;
- bool m_fixed;
- size_t* m_count;
-
- public:
- typedef T value_type;
- typedef T* pointer;
- typedef const T* const_pointer;
- typedef T& reference;
- typedef const T& const_reference;
- typedef std::size_t size_type;
- typedef std::ptrdiff_t difference_type;
-
- MmapAllocator() throw()
- : m_file_ptr(std::tmpfile()), m_file_desc(fileno(m_file_ptr)),
- m_page_size(sysconf(_SC_PAGE_SIZE)), m_map_size(0), m_data_ptr(0),
- m_data_offset(0), m_fixed(false), m_count(new size_t(0))
- { }
-
- MmapAllocator(std::FILE* f_ptr) throw()
- : m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)),
- m_page_size(sysconf(_SC_PAGE_SIZE)), m_map_size(0), m_data_ptr(0),
- m_data_offset(0), m_fixed(false), m_count(new size_t(0))
- { }
-
- MmapAllocator(std::FILE* f_ptr, size_t data_offset) throw()
- : m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)),
- m_page_size(sysconf(_SC_PAGE_SIZE)), m_map_size(0), m_data_ptr(0),
- m_data_offset(data_offset), m_fixed(true), m_count(new size_t(0))
- { }
-
- MmapAllocator(std::string fileName) throw()
- : m_file_ptr(std::fopen(fileName.c_str(), "wb+")), m_file_desc(fileno(m_file_ptr)),
- m_page_size(sysconf(_SC_PAGE_SIZE)), m_map_size(0), m_data_ptr(0),
- m_data_offset(0), m_fixed(false), m_count(new size_t(0))
- { }
-
- MmapAllocator(const MmapAllocator& c) throw()
- : m_file_ptr(c.m_file_ptr), m_file_desc(c.m_file_desc),
- m_page_size(c.m_page_size), m_map_size(c.m_map_size),
- m_data_ptr(c.m_data_ptr), m_data_offset(c.m_data_offset),
- m_fixed(c.m_fixed), m_count(c.m_count)
- {
- (*m_count)++;
- }
-
- ~MmapAllocator() throw()
- {
- if(m_data_ptr && *m_count == 0)
- {
- munmap(m_data_ptr, m_map_size);
- if(!m_fixed && std::ftell(m_file_ptr) != -1)
- std::fclose(m_file_ptr);
- }
- (*m_count)--;
- }
-
- template <class U>
- struct rebind {
- typedef MmapAllocator<U> other;
- };
-
- pointer address (reference value) const
- {
- return &value;
- }
-
- const_pointer address (const_reference value) const
- {
- return &value;
- }
-
- size_type max_size () const throw()
- {
- return std::numeric_limits<size_t>::max() / sizeof(value_type);
- }
-
- pointer allocate (size_type num, const void* = 0)
- {
- m_map_size = num * sizeof(T);
-
- if(!m_fixed)
- {
- size_t read = 0;
- read += ftruncate(m_file_desc, m_map_size);
- m_data_ptr = (char*)mmap(0, m_map_size, PROT_READ|PROT_WRITE, MAP_SHARED,
- m_file_desc, 0);
- if(m_data_ptr == MAP_FAILED)
- std::cerr << "Error: mmapping" << std::endl;
- return (pointer)m_data_ptr;
- }
- else
- {
- size_t map_offset = (m_data_offset / m_page_size) * m_page_size;
- size_t relative_offset = m_data_offset - map_offset;
-
- size_t map_size = m_map_size + relative_offset;
-
- m_data_ptr = (char*)mmap(0, map_size, PROT_READ, MAP_SHARED,
- m_file_desc, map_offset);
-
- return (pointer)(m_data_ptr + relative_offset);
- }
- }
-
- void deallocate (pointer p, size_type num)
- {
- if(!m_fixed) {
- munmap(p, num * sizeof(T));
- }
- else {
- size_t map_offset = (m_data_offset / m_page_size) * m_page_size;
- size_t relative_offset = m_data_offset - map_offset;
- munmap((pointer)((char*)p - relative_offset), num * sizeof(T));
- }
-
- }
-
- void construct (pointer p, const T& value)
- {
- if(!m_fixed)
- new(p) value_type(value);
- }
- void destroy (pointer p)
- {
- if(!m_fixed)
- p->~T();
- }
-
- template <class T1, class T2>
- friend bool operator== (const MmapAllocator<T1>&, const MmapAllocator<T2>&) throw();
-
- template <class T1, class T2>
- friend bool operator!= (const MmapAllocator<T1>&, const MmapAllocator<T2>&) throw();
- };
-
- template <class T1, class T2>
- bool operator== (const MmapAllocator<T1>& a1,
- const MmapAllocator<T2>& a2) throw()
- {
- bool equal = true;
- equal &= a1.m_file_ptr == a2.m_file_ptr;
- equal &= a1.m_file_desc == a2.m_file_desc;
- equal &= a1.m_page_size == a2.m_page_size;
- equal &= a1.m_map_size == a2.m_map_size;
- equal &= a1.m_data_ptr == a2.m_data_ptr;
- equal &= a1.m_data_offset == a2.m_data_offset;
- equal &= a1.m_fixed == a2.m_fixed;
- return equal;
+template <class T>
+class MmapAllocator
+{
+protected:
+ std::FILE* m_file_ptr;
+ size_t m_file_desc;
+
+ size_t m_page_size;
+ size_t m_map_size;
+
+ char* m_data_ptr;
+ size_t m_data_offset;
+ bool m_fixed;
+ size_t* m_count;
+
+public:
+ typedef T value_type;
+ typedef T* pointer;
+ typedef const T* const_pointer;
+ typedef T& reference;
+ typedef const T& const_reference;
+ typedef std::size_t size_type;
+ typedef std::ptrdiff_t difference_type;
+
+ MmapAllocator() throw()
+ : m_file_ptr(std::tmpfile()), m_file_desc(fileno(m_file_ptr)),
+ m_page_size(sysconf(_SC_PAGE_SIZE)), m_map_size(0), m_data_ptr(0),
+ m_data_offset(0), m_fixed(false), m_count(new size_t(0))
+ { }
+
+ MmapAllocator(std::FILE* f_ptr) throw()
+ : m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)),
+ m_page_size(sysconf(_SC_PAGE_SIZE)), m_map_size(0), m_data_ptr(0),
+ m_data_offset(0), m_fixed(false), m_count(new size_t(0))
+ { }
+
+ MmapAllocator(std::FILE* f_ptr, size_t data_offset) throw()
+ : m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)),
+ m_page_size(sysconf(_SC_PAGE_SIZE)), m_map_size(0), m_data_ptr(0),
+ m_data_offset(data_offset), m_fixed(true), m_count(new size_t(0))
+ { }
+
+ MmapAllocator(std::string fileName) throw()
+ : m_file_ptr(std::fopen(fileName.c_str(), "wb+")), m_file_desc(fileno(m_file_ptr)),
+ m_page_size(sysconf(_SC_PAGE_SIZE)), m_map_size(0), m_data_ptr(0),
+ m_data_offset(0), m_fixed(false), m_count(new size_t(0))
+ { }
+
+ MmapAllocator(const MmapAllocator& c) throw()
+ : m_file_ptr(c.m_file_ptr), m_file_desc(c.m_file_desc),
+ m_page_size(c.m_page_size), m_map_size(c.m_map_size),
+ m_data_ptr(c.m_data_ptr), m_data_offset(c.m_data_offset),
+ m_fixed(c.m_fixed), m_count(c.m_count) {
+ (*m_count)++;
+ }
+
+ ~MmapAllocator() throw() {
+ if(m_data_ptr && *m_count == 0) {
+ munmap(m_data_ptr, m_map_size);
+ if(!m_fixed && std::ftell(m_file_ptr) != -1)
+ std::fclose(m_file_ptr);
+ }
+ (*m_count)--;
+ }
+
+ template <class U>
+ struct rebind {
+ typedef MmapAllocator<U> other;
+ };
+
+ pointer address (reference value) const {
+ return &value;
+ }
+
+ const_pointer address (const_reference value) const {
+ return &value;
+ }
+
+ size_type max_size () const throw() {
+ return std::numeric_limits<size_t>::max() / sizeof(value_type);
+ }
+
+ pointer allocate (size_type num, const void* = 0) {
+ m_map_size = num * sizeof(T);
+
+ if(!m_fixed) {
+ size_t read = 0;
+ read += ftruncate(m_file_desc, m_map_size);
+ m_data_ptr = (char*)mmap(0, m_map_size, PROT_READ|PROT_WRITE, MAP_SHARED,
+ m_file_desc, 0);
+ if(m_data_ptr == MAP_FAILED)
+ std::cerr << "Error: mmapping" << std::endl;
+ return (pointer)m_data_ptr;
+ } else {
+ size_t map_offset = (m_data_offset / m_page_size) * m_page_size;
+ size_t relative_offset = m_data_offset - map_offset;
+
+ size_t map_size = m_map_size + relative_offset;
+
+ m_data_ptr = (char*)mmap(0, map_size, PROT_READ, MAP_SHARED,
+ m_file_desc, map_offset);
+
+ return (pointer)(m_data_ptr + relative_offset);
}
-
- template <class T1, class T2>
- bool operator!=(const MmapAllocator<T1>& a1,
- const MmapAllocator<T2>& a2) throw()
- {
- return !(a1 == a2);
+ }
+
+ void deallocate (pointer p, size_type num) {
+ if(!m_fixed) {
+ munmap(p, num * sizeof(T));
+ } else {
+ size_t map_offset = (m_data_offset / m_page_size) * m_page_size;
+ size_t relative_offset = m_data_offset - map_offset;
+ munmap((pointer)((char*)p - relative_offset), num * sizeof(T));
}
+ }
+
+ void construct (pointer p, const T& value) {
+ if(!m_fixed)
+ new(p) value_type(value);
+ }
+ void destroy (pointer p) {
+ if(!m_fixed)
+ p->~T();
+ }
+
+ template <class T1, class T2>
+ friend bool operator== (const MmapAllocator<T1>&, const MmapAllocator<T2>&) throw();
+
+ template <class T1, class T2>
+ friend bool operator!= (const MmapAllocator<T1>&, const MmapAllocator<T2>&) throw();
+};
+
+template <class T1, class T2>
+bool operator== (const MmapAllocator<T1>& a1,
+ const MmapAllocator<T2>& a2) throw()
+{
+ bool equal = true;
+ equal &= a1.m_file_ptr == a2.m_file_ptr;
+ equal &= a1.m_file_desc == a2.m_file_desc;
+ equal &= a1.m_page_size == a2.m_page_size;
+ equal &= a1.m_map_size == a2.m_map_size;
+ equal &= a1.m_data_ptr == a2.m_data_ptr;
+ equal &= a1.m_data_offset == a2.m_data_offset;
+ equal &= a1.m_fixed == a2.m_fixed;
+ return equal;
+}
+
+template <class T1, class T2>
+bool operator!=(const MmapAllocator<T1>& a1,
+ const MmapAllocator<T2>& a2) throw()
+{
+ return !(a1 == a2);
+}
+
}
#endif
diff --git a/moses/TranslationModel/CompactPT/MonotonicVector.h b/moses/TranslationModel/CompactPT/MonotonicVector.h
index a4423c369..5e965d3e5 100644
--- a/moses/TranslationModel/CompactPT/MonotonicVector.h
+++ b/moses/TranslationModel/CompactPT/MonotonicVector.h
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#ifndef moses_MonotonicVector_h
#define moses_MonotonicVector_h
@@ -43,206 +43,187 @@ namespace Moses
{
template<typename PosT = size_t, typename NumT = size_t, PosT stepSize = 32,
-template <typename> class Allocator = std::allocator>
+ template <typename> class Allocator = std::allocator>
class MonotonicVector
{
- private:
- typedef std::vector<NumT, Allocator<NumT> > Anchors;
- typedef std::vector<unsigned int, Allocator<unsigned int> > Diffs;
-
- Anchors m_anchors;
- Diffs m_diffs;
- std::vector<unsigned int> m_tempDiffs;
-
- size_t m_size;
- PosT m_last;
- bool m_final;
-
- public:
- typedef PosT value_type;
-
- MonotonicVector() : m_size(0), m_last(0), m_final(false) {}
-
- size_t size() const
- {
- return m_size + m_tempDiffs.size();
- }
-
- PosT at(size_t i) const
- {
- PosT s = stepSize;
- PosT j = m_anchors[i / s];
- PosT r = i % s;
-
- typename Diffs::const_iterator it = m_diffs.begin() + j;
-
- PosT k = 0;
- k += VarInt32::DecodeAndSum(it, m_diffs.end(), 1);
- if(i < m_size)
- k += Simple9::DecodeAndSum(it, m_diffs.end(), r);
- else if(i < m_size + m_tempDiffs.size())
- for(size_t l = 0; l < r; l++)
- k += m_tempDiffs[l];
-
- return k;
- }
-
- PosT operator[](PosT i) const
- {
- return at(i);
- }
-
- PosT back() const
- {
- return at(size()-1);
- }
-
- void push_back(PosT i)
- {
- assert(m_final != true);
-
- if(m_anchors.size() == 0 && m_tempDiffs.size() == 0)
- {
- m_anchors.push_back(0);
- VarInt32::Encode(&i, &i+1, std::back_inserter(m_diffs));
- m_last = i;
- m_size++;
-
- return;
- }
-
- if(m_tempDiffs.size() == stepSize-1)
- {
- Simple9::Encode(m_tempDiffs.begin(), m_tempDiffs.end(),
- std::back_inserter(m_diffs));
- m_anchors.push_back(m_diffs.size());
- VarInt32::Encode(&i, &i+1, std::back_inserter(m_diffs));
-
- m_size += m_tempDiffs.size() + 1;
- m_tempDiffs.clear();
- }
- else
- {
- PosT last = m_last;
- PosT diff = i - last;
- m_tempDiffs.push_back(diff);
- }
+private:
+ typedef std::vector<NumT, Allocator<NumT> > Anchors;
+ typedef std::vector<unsigned int, Allocator<unsigned int> > Diffs;
+
+ Anchors m_anchors;
+ Diffs m_diffs;
+ std::vector<unsigned int> m_tempDiffs;
+
+ size_t m_size;
+ PosT m_last;
+ bool m_final;
+
+public:
+ typedef PosT value_type;
+
+ MonotonicVector() : m_size(0), m_last(0), m_final(false) {}
+
+ size_t size() const {
+ return m_size + m_tempDiffs.size();
+ }
+
+ PosT at(size_t i) const {
+ PosT s = stepSize;
+ PosT j = m_anchors[i / s];
+ PosT r = i % s;
+
+ typename Diffs::const_iterator it = m_diffs.begin() + j;
+
+ PosT k = 0;
+ k += VarInt32::DecodeAndSum(it, m_diffs.end(), 1);
+ if(i < m_size)
+ k += Simple9::DecodeAndSum(it, m_diffs.end(), r);
+ else if(i < m_size + m_tempDiffs.size())
+ for(size_t l = 0; l < r; l++)
+ k += m_tempDiffs[l];
+
+ return k;
+ }
+
+ PosT operator[](PosT i) const {
+ return at(i);
+ }
+
+ PosT back() const {
+ return at(size()-1);
+ }
+
+ void push_back(PosT i) {
+ assert(m_final != true);
+
+ if(m_anchors.size() == 0 && m_tempDiffs.size() == 0) {
+ m_anchors.push_back(0);
+ VarInt32::Encode(&i, &i+1, std::back_inserter(m_diffs));
m_last = i;
+ m_size++;
+
+ return;
}
-
- void commit()
- {
- assert(m_final != true);
+
+ if(m_tempDiffs.size() == stepSize-1) {
Simple9::Encode(m_tempDiffs.begin(), m_tempDiffs.end(),
std::back_inserter(m_diffs));
- m_size += m_tempDiffs.size();
+ m_anchors.push_back(m_diffs.size());
+ VarInt32::Encode(&i, &i+1, std::back_inserter(m_diffs));
+
+ m_size += m_tempDiffs.size() + 1;
m_tempDiffs.clear();
- m_final = true;
- }
-
- size_t usage()
- {
- return m_diffs.size() * sizeof(unsigned int)
- + m_anchors.size() * sizeof(NumT);
+ } else {
+ PosT last = m_last;
+ PosT diff = i - last;
+ m_tempDiffs.push_back(diff);
}
-
- size_t load(std::FILE* in, bool map = false)
- {
- size_t byteSize = 0;
-
- byteSize += fread(&m_final, sizeof(bool), 1, in) * sizeof(bool);
- byteSize += fread(&m_size, sizeof(size_t), 1, in) * sizeof(size_t);
- byteSize += fread(&m_last, sizeof(PosT), 1, in) * sizeof(PosT);
-
- byteSize += loadVector(m_diffs, in, map);
- byteSize += loadVector(m_anchors, in, map);
-
- return byteSize;
- }
-
- template <typename ValueT>
- size_t loadVector(std::vector<ValueT, std::allocator<ValueT> >& v,
- std::FILE* in, bool map = false)
- {
- // Can only be read into memory. Mapping not possible with std:allocator.
- assert(map == false);
-
- size_t byteSize = 0;
-
- size_t valSize;
- byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t);
-
+ m_last = i;
+ }
+
+ void commit() {
+ assert(m_final != true);
+ Simple9::Encode(m_tempDiffs.begin(), m_tempDiffs.end(),
+ std::back_inserter(m_diffs));
+ m_size += m_tempDiffs.size();
+ m_tempDiffs.clear();
+ m_final = true;
+ }
+
+ size_t usage() {
+ return m_diffs.size() * sizeof(unsigned int)
+ + m_anchors.size() * sizeof(NumT);
+ }
+
+ size_t load(std::FILE* in, bool map = false) {
+ size_t byteSize = 0;
+
+ byteSize += fread(&m_final, sizeof(bool), 1, in) * sizeof(bool);
+ byteSize += fread(&m_size, sizeof(size_t), 1, in) * sizeof(size_t);
+ byteSize += fread(&m_last, sizeof(PosT), 1, in) * sizeof(PosT);
+
+ byteSize += loadVector(m_diffs, in, map);
+ byteSize += loadVector(m_anchors, in, map);
+
+ return byteSize;
+ }
+
+ template <typename ValueT>
+ size_t loadVector(std::vector<ValueT, std::allocator<ValueT> >& v,
+ std::FILE* in, bool map = false) {
+    // Can only be read into memory. Mapping not possible with std::allocator.
+ assert(map == false);
+
+ size_t byteSize = 0;
+
+ size_t valSize;
+ byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t);
+
+ v.resize(valSize, 0);
+ byteSize += std::fread(&v[0], sizeof(ValueT), valSize, in) * sizeof(ValueT);
+
+ return byteSize;
+ }
+
+ template <typename ValueT>
+ size_t loadVector(std::vector<ValueT, MmapAllocator<ValueT> >& v,
+ std::FILE* in, bool map = false) {
+ size_t byteSize = 0;
+
+ size_t valSize;
+ byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t);
+
+ if(map == false) {
+ // Read data into temporary file (default constructor of MmapAllocator)
+ // and map memory onto temporary file. Can be resized.
+
v.resize(valSize, 0);
byteSize += std::fread(&v[0], sizeof(ValueT), valSize, in) * sizeof(ValueT);
-
- return byteSize;
- }
-
- template <typename ValueT>
- size_t loadVector(std::vector<ValueT, MmapAllocator<ValueT> >& v,
- std::FILE* in, bool map = false)
- {
- size_t byteSize = 0;
-
- size_t valSize;
- byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t);
-
- if(map == false)
- {
- // Read data into temporary file (default constructor of MmapAllocator)
- // and map memory onto temporary file. Can be resized.
-
- v.resize(valSize, 0);
- byteSize += std::fread(&v[0], sizeof(ValueT), valSize, in) * sizeof(ValueT);
- }
- else
- {
- // Map it directly on specified region of file "in" starting at valPos
- // with length valSize * sizeof(ValueT). Mapped region cannot be resized.
-
- size_t valPos = std::ftell(in);
-
- Allocator<ValueT> alloc(in, valPos);
- std::vector<ValueT, Allocator<ValueT> > vTemp(alloc);
- vTemp.resize(valSize);
- v.swap(vTemp);
-
- std::fseek(in, valSize * sizeof(ValueT), SEEK_CUR);
- byteSize += valSize * sizeof(ValueT);
- }
-
- return byteSize;
- }
-
- size_t save(std::FILE* out)
- {
- if(!m_final)
- commit();
-
- bool byteSize = 0;
- byteSize += ThrowingFwrite(&m_final, sizeof(bool), 1, out) * sizeof(bool);
- byteSize += ThrowingFwrite(&m_size, sizeof(size_t), 1, out) * sizeof(size_t);
- byteSize += ThrowingFwrite(&m_last, sizeof(PosT), 1, out) * sizeof(PosT);
-
- size_t size = m_diffs.size();
- byteSize += ThrowingFwrite(&size, sizeof(size_t), 1, out) * sizeof(size_t);
- byteSize += ThrowingFwrite(&m_diffs[0], sizeof(unsigned int), size, out) * sizeof(unsigned int);
-
- size = m_anchors.size();
- byteSize += ThrowingFwrite(&size, sizeof(size_t), 1, out) * sizeof(size_t);
- byteSize += ThrowingFwrite(&m_anchors[0], sizeof(NumT), size, out) * sizeof(NumT);
-
- return byteSize;
- }
-
- void swap(MonotonicVector<PosT, NumT, stepSize, Allocator> &mv)
- {
- if(!m_final)
- commit();
-
- m_diffs.swap(mv.m_diffs);
- m_anchors.swap(mv.m_anchors);
+ } else {
+ // Map it directly on specified region of file "in" starting at valPos
+ // with length valSize * sizeof(ValueT). Mapped region cannot be resized.
+
+ size_t valPos = std::ftell(in);
+
+ Allocator<ValueT> alloc(in, valPos);
+ std::vector<ValueT, Allocator<ValueT> > vTemp(alloc);
+ vTemp.resize(valSize);
+ v.swap(vTemp);
+
+ std::fseek(in, valSize * sizeof(ValueT), SEEK_CUR);
+ byteSize += valSize * sizeof(ValueT);
}
+
+ return byteSize;
+ }
+
+ size_t save(std::FILE* out) {
+ if(!m_final)
+ commit();
+
+    size_t byteSize = 0;
+ byteSize += ThrowingFwrite(&m_final, sizeof(bool), 1, out) * sizeof(bool);
+ byteSize += ThrowingFwrite(&m_size, sizeof(size_t), 1, out) * sizeof(size_t);
+ byteSize += ThrowingFwrite(&m_last, sizeof(PosT), 1, out) * sizeof(PosT);
+
+ size_t size = m_diffs.size();
+ byteSize += ThrowingFwrite(&size, sizeof(size_t), 1, out) * sizeof(size_t);
+ byteSize += ThrowingFwrite(&m_diffs[0], sizeof(unsigned int), size, out) * sizeof(unsigned int);
+
+ size = m_anchors.size();
+ byteSize += ThrowingFwrite(&size, sizeof(size_t), 1, out) * sizeof(size_t);
+ byteSize += ThrowingFwrite(&m_anchors[0], sizeof(NumT), size, out) * sizeof(NumT);
+
+ return byteSize;
+ }
+
+ void swap(MonotonicVector<PosT, NumT, stepSize, Allocator> &mv) {
+ if(!m_final)
+ commit();
+
+ m_diffs.swap(mv.m_diffs);
+ m_anchors.swap(mv.m_anchors);
+ }
};
}
diff --git a/moses/TranslationModel/CompactPT/MurmurHash3.cpp b/moses/TranslationModel/CompactPT/MurmurHash3.cpp
index 0bf738662..d16cd9502 100644
--- a/moses/TranslationModel/CompactPT/MurmurHash3.cpp
+++ b/moses/TranslationModel/CompactPT/MurmurHash3.cpp
@@ -107,16 +107,15 @@ void MurmurHash3_x86_32 ( const void * key, int len,
const uint32_t * blocks = (const uint32_t *)(data + nblocks*4);
- for(int i = -nblocks; i; i++)
- {
+ for(int i = -nblocks; i; i++) {
uint32_t k1 = getblock(blocks,i);
k1 *= c1;
k1 = ROTL32(k1,15);
k1 *= c2;
-
+
h1 ^= k1;
- h1 = ROTL32(h1,13);
+ h1 = ROTL32(h1,13);
h1 = h1*5+0xe6546b64;
}
@@ -127,12 +126,17 @@ void MurmurHash3_x86_32 ( const void * key, int len,
uint32_t k1 = 0;
- switch(len & 3)
- {
- case 3: k1 ^= tail[2] << 16;
- case 2: k1 ^= tail[1] << 8;
- case 1: k1 ^= tail[0];
- k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
+ switch(len & 3) {
+ case 3:
+ k1 ^= tail[2] << 16;
+ case 2:
+ k1 ^= tail[1] << 8;
+ case 1:
+ k1 ^= tail[0];
+ k1 *= c1;
+ k1 = ROTL32(k1,15);
+ k1 *= c2;
+ h1 ^= k1;
};
//----------
@@ -143,7 +147,7 @@ void MurmurHash3_x86_32 ( const void * key, int len,
h1 = fmix(h1);
*(uint32_t*)out = h1;
-}
+}
//-----------------------------------------------------------------------------
@@ -158,9 +162,9 @@ void MurmurHash3_x86_128 ( const void * key, const int len,
uint32_t h3 = seed;
uint32_t h4 = seed;
- uint32_t c1 = 0x239b961b;
+ uint32_t c1 = 0x239b961b;
uint32_t c2 = 0xab0e9789;
- uint32_t c3 = 0x38b34ae5;
+ uint32_t c3 = 0x38b34ae5;
uint32_t c4 = 0xa1e38b93;
//----------
@@ -168,28 +172,47 @@ void MurmurHash3_x86_128 ( const void * key, const int len,
const uint32_t * blocks = (const uint32_t *)(data + nblocks*16);
- for(int i = -nblocks; i; i++)
- {
+ for(int i = -nblocks; i; i++) {
uint32_t k1 = getblock(blocks,i*4+0);
uint32_t k2 = getblock(blocks,i*4+1);
uint32_t k3 = getblock(blocks,i*4+2);
uint32_t k4 = getblock(blocks,i*4+3);
- k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
+ k1 *= c1;
+ k1 = ROTL32(k1,15);
+ k1 *= c2;
+ h1 ^= k1;
- h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b;
+ h1 = ROTL32(h1,19);
+ h1 += h2;
+ h1 = h1*5+0x561ccd1b;
- k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
+ k2 *= c2;
+ k2 = ROTL32(k2,16);
+ k2 *= c3;
+ h2 ^= k2;
- h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747;
+ h2 = ROTL32(h2,17);
+ h2 += h3;
+ h2 = h2*5+0x0bcaa747;
- k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
+ k3 *= c3;
+ k3 = ROTL32(k3,17);
+ k3 *= c4;
+ h3 ^= k3;
- h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35;
+ h3 = ROTL32(h3,15);
+ h3 += h4;
+ h3 = h3*5+0x96cd1c35;
- k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
+ k4 *= c4;
+ k4 = ROTL32(k4,18);
+ k4 *= c1;
+ h4 ^= k4;
- h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17;
+ h4 = ROTL32(h4,13);
+ h4 += h1;
+ h4 = h4*5+0x32ac3b17;
}
//----------
@@ -202,47 +225,84 @@ void MurmurHash3_x86_128 ( const void * key, const int len,
uint32_t k3 = 0;
uint32_t k4 = 0;
- switch(len & 15)
- {
- case 15: k4 ^= tail[14] << 16;
- case 14: k4 ^= tail[13] << 8;
- case 13: k4 ^= tail[12] << 0;
- k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
-
- case 12: k3 ^= tail[11] << 24;
- case 11: k3 ^= tail[10] << 16;
- case 10: k3 ^= tail[ 9] << 8;
- case 9: k3 ^= tail[ 8] << 0;
- k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
-
- case 8: k2 ^= tail[ 7] << 24;
- case 7: k2 ^= tail[ 6] << 16;
- case 6: k2 ^= tail[ 5] << 8;
- case 5: k2 ^= tail[ 4] << 0;
- k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
-
- case 4: k1 ^= tail[ 3] << 24;
- case 3: k1 ^= tail[ 2] << 16;
- case 2: k1 ^= tail[ 1] << 8;
- case 1: k1 ^= tail[ 0] << 0;
- k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
+ switch(len & 15) {
+ case 15:
+ k4 ^= tail[14] << 16;
+ case 14:
+ k4 ^= tail[13] << 8;
+ case 13:
+ k4 ^= tail[12] << 0;
+ k4 *= c4;
+ k4 = ROTL32(k4,18);
+ k4 *= c1;
+ h4 ^= k4;
+
+ case 12:
+ k3 ^= tail[11] << 24;
+ case 11:
+ k3 ^= tail[10] << 16;
+ case 10:
+ k3 ^= tail[ 9] << 8;
+ case 9:
+ k3 ^= tail[ 8] << 0;
+ k3 *= c3;
+ k3 = ROTL32(k3,17);
+ k3 *= c4;
+ h3 ^= k3;
+
+ case 8:
+ k2 ^= tail[ 7] << 24;
+ case 7:
+ k2 ^= tail[ 6] << 16;
+ case 6:
+ k2 ^= tail[ 5] << 8;
+ case 5:
+ k2 ^= tail[ 4] << 0;
+ k2 *= c2;
+ k2 = ROTL32(k2,16);
+ k2 *= c3;
+ h2 ^= k2;
+
+ case 4:
+ k1 ^= tail[ 3] << 24;
+ case 3:
+ k1 ^= tail[ 2] << 16;
+ case 2:
+ k1 ^= tail[ 1] << 8;
+ case 1:
+ k1 ^= tail[ 0] << 0;
+ k1 *= c1;
+ k1 = ROTL32(k1,15);
+ k1 *= c2;
+ h1 ^= k1;
};
//----------
// finalization
- h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len;
+ h1 ^= len;
+ h2 ^= len;
+ h3 ^= len;
+ h4 ^= len;
- h1 += h2; h1 += h3; h1 += h4;
- h2 += h1; h3 += h1; h4 += h1;
+ h1 += h2;
+ h1 += h3;
+ h1 += h4;
+ h2 += h1;
+ h3 += h1;
+ h4 += h1;
h1 = fmix(h1);
h2 = fmix(h2);
h3 = fmix(h3);
h4 = fmix(h4);
- h1 += h2; h1 += h3; h1 += h4;
- h2 += h1; h3 += h1; h4 += h1;
+ h1 += h2;
+ h1 += h3;
+ h1 += h4;
+ h2 += h1;
+ h3 += h1;
+ h4 += h1;
((uint32_t*)out)[0] = h1;
((uint32_t*)out)[1] = h2;
@@ -269,18 +329,27 @@ void MurmurHash3_x64_128 ( const void * key, const int len,
const uint64_t * blocks = (const uint64_t *)(data);
- for(int i = 0; i < nblocks; i++)
- {
+ for(int i = 0; i < nblocks; i++) {
uint64_t k1 = getblock(blocks,i*2+0);
uint64_t k2 = getblock(blocks,i*2+1);
- k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
+ k1 *= c1;
+ k1 = ROTL64(k1,31);
+ k1 *= c2;
+ h1 ^= k1;
- h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729;
+ h1 = ROTL64(h1,27);
+ h1 += h2;
+ h1 = h1*5+0x52dce729;
- k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
+ k2 *= c2;
+ k2 = ROTL64(k2,33);
+ k2 *= c1;
+ h2 ^= k2;
- h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
+ h2 = ROTL64(h2,31);
+ h2 += h1;
+ h2 = h2*5+0x38495ab5;
}
//----------
@@ -291,32 +360,53 @@ void MurmurHash3_x64_128 ( const void * key, const int len,
uint64_t k1 = 0;
uint64_t k2 = 0;
- switch(len & 15)
- {
- case 15: k2 ^= uint64_t(tail[14]) << 48;
- case 14: k2 ^= uint64_t(tail[13]) << 40;
- case 13: k2 ^= uint64_t(tail[12]) << 32;
- case 12: k2 ^= uint64_t(tail[11]) << 24;
- case 11: k2 ^= uint64_t(tail[10]) << 16;
- case 10: k2 ^= uint64_t(tail[ 9]) << 8;
- case 9: k2 ^= uint64_t(tail[ 8]) << 0;
- k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
-
- case 8: k1 ^= uint64_t(tail[ 7]) << 56;
- case 7: k1 ^= uint64_t(tail[ 6]) << 48;
- case 6: k1 ^= uint64_t(tail[ 5]) << 40;
- case 5: k1 ^= uint64_t(tail[ 4]) << 32;
- case 4: k1 ^= uint64_t(tail[ 3]) << 24;
- case 3: k1 ^= uint64_t(tail[ 2]) << 16;
- case 2: k1 ^= uint64_t(tail[ 1]) << 8;
- case 1: k1 ^= uint64_t(tail[ 0]) << 0;
- k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
+ switch(len & 15) {
+ case 15:
+ k2 ^= uint64_t(tail[14]) << 48;
+ case 14:
+ k2 ^= uint64_t(tail[13]) << 40;
+ case 13:
+ k2 ^= uint64_t(tail[12]) << 32;
+ case 12:
+ k2 ^= uint64_t(tail[11]) << 24;
+ case 11:
+ k2 ^= uint64_t(tail[10]) << 16;
+ case 10:
+ k2 ^= uint64_t(tail[ 9]) << 8;
+ case 9:
+ k2 ^= uint64_t(tail[ 8]) << 0;
+ k2 *= c2;
+ k2 = ROTL64(k2,33);
+ k2 *= c1;
+ h2 ^= k2;
+
+ case 8:
+ k1 ^= uint64_t(tail[ 7]) << 56;
+ case 7:
+ k1 ^= uint64_t(tail[ 6]) << 48;
+ case 6:
+ k1 ^= uint64_t(tail[ 5]) << 40;
+ case 5:
+ k1 ^= uint64_t(tail[ 4]) << 32;
+ case 4:
+ k1 ^= uint64_t(tail[ 3]) << 24;
+ case 3:
+ k1 ^= uint64_t(tail[ 2]) << 16;
+ case 2:
+ k1 ^= uint64_t(tail[ 1]) << 8;
+ case 1:
+ k1 ^= uint64_t(tail[ 0]) << 0;
+ k1 *= c1;
+ k1 = ROTL64(k1,31);
+ k1 *= c2;
+ h1 ^= k1;
};
//----------
// finalization
- h1 ^= len; h2 ^= len;
+ h1 ^= len;
+ h2 ^= len;
h1 += h2;
h2 += h1;
diff --git a/moses/TranslationModel/CompactPT/PackedArray.h b/moses/TranslationModel/CompactPT/PackedArray.h
index ad4596546..479c2cc79 100644
--- a/moses/TranslationModel/CompactPT/PackedArray.h
+++ b/moses/TranslationModel/CompactPT/PackedArray.h
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#ifndef moses_PackedArray_h
#define moses_PackedArray_h
@@ -35,128 +35,117 @@ namespace Moses
template <typename T = size_t, typename D = unsigned char>
class PackedArray
{
- protected:
- static size_t m_dataBits;
-
- size_t m_size;
- size_t m_storageSize;
- D* m_storage;
-
- public:
- PackedArray()
- {
- m_size = 0;
- m_storageSize = 0;
- m_storage = new D[0];
- }
-
- PackedArray(size_t size, size_t bits) : m_size(size)
- {
- m_storageSize = ceil(float(bits * size) / float(m_dataBits));
- m_storage = new D[m_storageSize];
- }
-
- PackedArray(const PackedArray<T, D> &c)
- {
- m_size = c.m_size;
-
- m_storageSize = c.m_storageSize;
- m_storage = new D[m_storageSize];
-
- std::memcpy(m_storage, c.m_storage, m_storageSize * sizeof(D));
- }
-
- virtual ~PackedArray()
- {
- delete [] m_storage;
- m_size = 0;
- m_storageSize = 0;
- m_storage = 0;
- }
-
- T Get(size_t i, size_t bits) const
- {
- T out = 0;
-
- size_t bitstart = (i * bits);
- size_t bitpos = bitstart;
-
- size_t zero = ((1ul << (bits)) - 1);
-
- while(bitpos - bitstart < bits) {
- size_t pos = bitpos / m_dataBits;
- size_t off = bitpos % m_dataBits;
-
- out |= (T(m_storage[pos]) << (bitpos - bitstart)) >> off;
-
- bitpos += (m_dataBits - off);
- }
-
- out &= zero;
- return out;
- }
-
- void Set(size_t i, T v, size_t bits)
- {
- size_t bitstart = (i * bits);
- size_t bitpos = bitstart;
-
- while(bitpos - bitstart < bits) {
- size_t pos = bitpos / m_dataBits;
- size_t off = bitpos % m_dataBits;
-
- size_t rest = bits - (bitpos - bitstart);
- D zero = ~((1ul << (rest + off)) - 1) | ((1ul << off) - 1);
-
- m_storage[pos] &= zero;
- m_storage[pos] |= v << off;
- v = v >> (m_dataBits - off);
- bitpos += (m_dataBits - off);
- }
- }
-
- virtual D*& GetStorage()
- {
- return m_storage;
- }
-
- virtual size_t GetStorageSize() const
- {
- return m_storageSize;
- }
-
- virtual size_t Size() const
- {
- return m_size;
- }
-
- virtual size_t Load(std::FILE* in)
- {
- size_t a1 = std::ftell(in);
-
- size_t read = 0;
- read += std::fread(&m_size, sizeof(m_size), 1, in);
- read += std::fread(&m_storageSize, sizeof(m_storageSize), 1, in);
- delete [] m_storage;
- m_storage = new D[m_storageSize];
- read += std::fread(m_storage, sizeof(D), m_storageSize, in);
-
- size_t a2 = std::ftell(in);
- return a2 - a1;
+protected:
+ static size_t m_dataBits;
+
+ size_t m_size;
+ size_t m_storageSize;
+ D* m_storage;
+
+public:
+ PackedArray() {
+ m_size = 0;
+ m_storageSize = 0;
+ m_storage = new D[0];
+ }
+
+ PackedArray(size_t size, size_t bits) : m_size(size) {
+ m_storageSize = ceil(float(bits * size) / float(m_dataBits));
+ m_storage = new D[m_storageSize];
+ }
+
+ PackedArray(const PackedArray<T, D> &c) {
+ m_size = c.m_size;
+
+ m_storageSize = c.m_storageSize;
+ m_storage = new D[m_storageSize];
+
+ std::memcpy(m_storage, c.m_storage, m_storageSize * sizeof(D));
+ }
+
+ virtual ~PackedArray() {
+ delete [] m_storage;
+ m_size = 0;
+ m_storageSize = 0;
+ m_storage = 0;
+ }
+
+ T Get(size_t i, size_t bits) const {
+ T out = 0;
+
+ size_t bitstart = (i * bits);
+ size_t bitpos = bitstart;
+
+ size_t zero = ((1ul << (bits)) - 1);
+
+ while(bitpos - bitstart < bits) {
+ size_t pos = bitpos / m_dataBits;
+ size_t off = bitpos % m_dataBits;
+
+ out |= (T(m_storage[pos]) << (bitpos - bitstart)) >> off;
+
+ bitpos += (m_dataBits - off);
}
-
- virtual size_t Save(std::FILE* out)
- {
- size_t a1 = std::ftell(out);
-
- ThrowingFwrite(&m_size, sizeof(m_size), 1, out);
- ThrowingFwrite(&m_storageSize, sizeof(m_storageSize), 1, out);
- ThrowingFwrite(m_storage, sizeof(D), m_storageSize, out);
-
- size_t a2 = std::ftell(out);
- return a2 - a1;
+
+ out &= zero;
+ return out;
+ }
+
+ void Set(size_t i, T v, size_t bits) {
+ size_t bitstart = (i * bits);
+ size_t bitpos = bitstart;
+
+ while(bitpos - bitstart < bits) {
+ size_t pos = bitpos / m_dataBits;
+ size_t off = bitpos % m_dataBits;
+
+ size_t rest = bits - (bitpos - bitstart);
+ D zero = ~((1ul << (rest + off)) - 1) | ((1ul << off) - 1);
+
+ m_storage[pos] &= zero;
+ m_storage[pos] |= v << off;
+ v = v >> (m_dataBits - off);
+ bitpos += (m_dataBits - off);
}
-
+ }
+
+ virtual D*& GetStorage() {
+ return m_storage;
+ }
+
+ virtual size_t GetStorageSize() const {
+ return m_storageSize;
+ }
+
+ virtual size_t Size() const {
+ return m_size;
+ }
+
+ virtual size_t Load(std::FILE* in) {
+ size_t a1 = std::ftell(in);
+
+ size_t read = 0;
+ read += std::fread(&m_size, sizeof(m_size), 1, in);
+ read += std::fread(&m_storageSize, sizeof(m_storageSize), 1, in);
+ delete [] m_storage;
+ m_storage = new D[m_storageSize];
+ read += std::fread(m_storage, sizeof(D), m_storageSize, in);
+
+ size_t a2 = std::ftell(in);
+ return a2 - a1;
+ }
+
+ virtual size_t Save(std::FILE* out) {
+ size_t a1 = std::ftell(out);
+
+ ThrowingFwrite(&m_size, sizeof(m_size), 1, out);
+ ThrowingFwrite(&m_storageSize, sizeof(m_storageSize), 1, out);
+ ThrowingFwrite(m_storage, sizeof(D), m_storageSize, out);
+
+ size_t a2 = std::ftell(out);
+ return a2 - a1;
+ }
+
};
template <typename T, typename D>
@@ -166,34 +155,31 @@ size_t PackedArray<T, D>::m_dataBits = sizeof(D)*8;
template <typename T = size_t, typename D = unsigned char>
class PairedPackedArray : public PackedArray<T,D>
-{
- public:
- PairedPackedArray() : PackedArray<T,D>() {}
-
- PairedPackedArray(size_t size, size_t bits1, size_t bits2)
+{
+public:
+ PairedPackedArray() : PackedArray<T,D>() {}
+
+ PairedPackedArray(size_t size, size_t bits1, size_t bits2)
: PackedArray<T, D>(size, bits1 + bits2) { }
-
- void Set(size_t i, T a, T b, size_t bits1, size_t bits2)
- {
- T c = 0;
- c = a | (b << bits1);
- PackedArray<T,D>::Set(i, c, bits1 + bits2);
- }
-
- void Set(size_t i, std::pair<T,T> p, size_t bits1, size_t bits2)
- {
- T c = 0;
- c = p.second | (p.first << bits1);
- PackedArray<T, D>::Set(i, c);
- }
-
- std::pair<T, T> Get(size_t i, size_t bits1, size_t bits2)
- {
- T v = PackedArray<T, D>::Get(i, bits1 + bits2);
- T a = v & ((1 << bits1) - 1);
- T b = v >> bits1;
- return std::pair<T, T>(a, b);
- }
+
+ void Set(size_t i, T a, T b, size_t bits1, size_t bits2) {
+ T c = 0;
+ c = a | (b << bits1);
+ PackedArray<T,D>::Set(i, c, bits1 + bits2);
+ }
+
+ void Set(size_t i, std::pair<T,T> p, size_t bits1, size_t bits2) {
+ T c = 0;
+ c = p.second | (p.first << bits1);
+    PackedArray<T, D>::Set(i, c, bits1 + bits2);
+ }
+
+ std::pair<T, T> Get(size_t i, size_t bits1, size_t bits2) {
+ T v = PackedArray<T, D>::Get(i, bits1 + bits2);
+ T a = v & ((1 << bits1) - 1);
+ T b = v >> bits1;
+ return std::pair<T, T>(a, b);
+ }
};
}
diff --git a/moses/TranslationModel/CompactPT/PhraseDecoder.cpp b/moses/TranslationModel/CompactPT/PhraseDecoder.cpp
index 03b3f6825..085a7337c 100644
--- a/moses/TranslationModel/CompactPT/PhraseDecoder.cpp
+++ b/moses/TranslationModel/CompactPT/PhraseDecoder.cpp
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#include <deque>
@@ -37,23 +37,23 @@ PhraseDecoder::PhraseDecoder(
const std::vector<float>* weight
)
: m_coding(None), m_numScoreComponent(numScoreComponent),
- m_containsAlignmentInfo(true), m_maxRank(0),
- m_symbolTree(0), m_multipleScoreTrees(false),
- m_scoreTrees(1), m_alignTree(0),
- m_phraseDictionary(phraseDictionary), m_input(input), m_output(output),
- m_weight(weight),
- m_separator(" ||| ")
+ m_containsAlignmentInfo(true), m_maxRank(0),
+ m_symbolTree(0), m_multipleScoreTrees(false),
+ m_scoreTrees(1), m_alignTree(0),
+ m_phraseDictionary(phraseDictionary), m_input(input), m_output(output),
+ m_weight(weight),
+ m_separator(" ||| ")
{ }
PhraseDecoder::~PhraseDecoder()
{
if(m_symbolTree)
delete m_symbolTree;
-
+
for(size_t i = 0; i < m_scoreTrees.size(); i++)
if(m_scoreTrees[i])
delete m_scoreTrees[i];
-
+
if(m_alignTree)
delete m_alignTree;
}
@@ -61,10 +61,10 @@ PhraseDecoder::~PhraseDecoder()
inline unsigned PhraseDecoder::GetSourceSymbolId(std::string& symbol)
{
boost::unordered_map<std::string, unsigned>::iterator it
- = m_sourceSymbolsMap.find(symbol);
+ = m_sourceSymbolsMap.find(symbol);
if(it != m_sourceSymbolsMap.end())
return it->second;
-
+
size_t idx = m_sourceSymbols.find(symbol);
m_sourceSymbolsMap[symbol] = idx;
return idx;
@@ -144,76 +144,70 @@ size_t PhraseDecoder::Load(std::FILE* in)
{
size_t start = std::ftell(in);
size_t read = 0;
-
+
read += std::fread(&m_coding, sizeof(m_coding), 1, in);
read += std::fread(&m_numScoreComponent, sizeof(m_numScoreComponent), 1, in);
read += std::fread(&m_containsAlignmentInfo, sizeof(m_containsAlignmentInfo), 1, in);
read += std::fread(&m_maxRank, sizeof(m_maxRank), 1, in);
read += std::fread(&m_maxPhraseLength, sizeof(m_maxPhraseLength), 1, in);
-
- if(m_coding == REnc)
- {
+
+ if(m_coding == REnc) {
m_sourceSymbols.load(in);
-
+
size_t size;
read += std::fread(&size, sizeof(size_t), 1, in);
m_lexicalTableIndex.resize(size);
read += std::fread(&m_lexicalTableIndex[0], sizeof(size_t), size, in);
-
+
read += std::fread(&size, sizeof(size_t), 1, in);
m_lexicalTable.resize(size);
read += std::fread(&m_lexicalTable[0], sizeof(SrcTrg), size, in);
}
-
+
m_targetSymbols.load(in);
-
+
m_symbolTree = new CanonicalHuffman<unsigned>(in);
-
+
read += std::fread(&m_multipleScoreTrees, sizeof(m_multipleScoreTrees), 1, in);
- if(m_multipleScoreTrees)
- {
+ if(m_multipleScoreTrees) {
m_scoreTrees.resize(m_numScoreComponent);
for(size_t i = 0; i < m_numScoreComponent; i++)
m_scoreTrees[i] = new CanonicalHuffman<float>(in);
- }
- else
- {
+ } else {
m_scoreTrees.resize(1);
m_scoreTrees[0] = new CanonicalHuffman<float>(in);
}
-
+
if(m_containsAlignmentInfo)
m_alignTree = new CanonicalHuffman<AlignPoint>(in);
-
+
size_t end = std::ftell(in);
return end - start;
}
-
+
std::string PhraseDecoder::MakeSourceKey(std::string &source)
{
- return source + m_separator;
+ return source + m_separator;
}
-
+
TargetPhraseVectorPtr PhraseDecoder::CreateTargetPhraseCollection(const Phrase &sourcePhrase, bool topLevel)
{
-
+
// Not using TargetPhraseCollection avoiding "new" operator
// which can introduce heavy locking with multiple threads
TargetPhraseVectorPtr tpv(new TargetPhraseVector());
size_t bitsLeft = 0;
-
- if(m_coding == PREnc)
- {
+
+ if(m_coding == PREnc) {
std::pair<TargetPhraseVectorPtr, size_t> cachedPhraseColl
- = m_decodingCache.Retrieve(sourcePhrase);
-
+ = m_decodingCache.Retrieve(sourcePhrase);
+
// Has been cached and is complete or does not need to be completed
if(cachedPhraseColl.first != NULL && (!topLevel || cachedPhraseColl.second == 0))
return cachedPhraseColl.first;
-
+
// Has been cached, but is incomplete
- else if(cachedPhraseColl.first != NULL)
- {
+ else if(cachedPhraseColl.first != NULL) {
bitsLeft = cachedPhraseColl.second;
tpv->resize(cachedPhraseColl.first->size());
std::copy(cachedPhraseColl.first->begin(),
@@ -221,220 +215,187 @@ TargetPhraseVectorPtr PhraseDecoder::CreateTargetPhraseCollection(const Phrase &
tpv->begin());
}
}
-
+
// Retrieve source phrase identifier
std::string sourcePhraseString = sourcePhrase.GetStringRep(*m_input);
size_t sourcePhraseId = m_phraseDictionary.m_hash[MakeSourceKey(sourcePhraseString)];
-
- if(sourcePhraseId != m_phraseDictionary.m_hash.GetSize())
- {
- // Retrieve compressed and encoded target phrase collection
+
+ if(sourcePhraseId != m_phraseDictionary.m_hash.GetSize()) {
+ // Retrieve compressed and encoded target phrase collection
std::string encodedPhraseCollection;
if(m_phraseDictionary.m_inMemory)
encodedPhraseCollection = m_phraseDictionary.m_targetPhrasesMemory[sourcePhraseId];
else
encodedPhraseCollection = m_phraseDictionary.m_targetPhrasesMapped[sourcePhraseId];
-
+
BitWrapper<> encodedBitStream(encodedPhraseCollection);
if(m_coding == PREnc && bitsLeft)
encodedBitStream.SeekFromEnd(bitsLeft);
-
+
// Decompress and decode target phrase collection
TargetPhraseVectorPtr decodedPhraseColl =
DecodeCollection(tpv, encodedBitStream, sourcePhrase, topLevel);
-
+
return decodedPhraseColl;
- }
- else
- return TargetPhraseVectorPtr();
+ } else
+ return TargetPhraseVectorPtr();
}
-
+
TargetPhraseVectorPtr PhraseDecoder::DecodeCollection(
TargetPhraseVectorPtr tpv, BitWrapper<> &encodedBitStream,
const Phrase &sourcePhrase, bool topLevel)
{
-
+
bool extending = tpv->size();
size_t bitsLeft = encodedBitStream.TellFromEnd();
-
+
typedef std::pair<size_t, size_t> AlignPointSizeT;
-
+
std::vector<int> sourceWords;
- if(m_coding == REnc)
- {
- for(size_t i = 0; i < sourcePhrase.GetSize(); i++)
- {
+ if(m_coding == REnc) {
+ for(size_t i = 0; i < sourcePhrase.GetSize(); i++) {
std::string sourceWord
- = sourcePhrase.GetWord(i).GetString(*m_input, false);
+ = sourcePhrase.GetWord(i).GetString(*m_input, false);
unsigned idx = GetSourceSymbolId(sourceWord);
sourceWords.push_back(idx);
}
}
-
+
unsigned phraseStopSymbol = 0;
AlignPoint alignStopSymbol(-1, -1);
-
+
std::vector<float> scores;
std::set<AlignPointSizeT> alignment;
-
+
enum DecodeState { New, Symbol, Score, Alignment, Add } state = New;
-
+
size_t srcSize = sourcePhrase.GetSize();
-
+
TargetPhrase* targetPhrase = NULL;
- while(encodedBitStream.TellFromEnd())
- {
-
- if(state == New)
- {
+ while(encodedBitStream.TellFromEnd()) {
+
+ if(state == New) {
// Creating new TargetPhrase on the heap
tpv->push_back(TargetPhrase());
targetPhrase = &tpv->back();
-
+
targetPhrase->SetSourcePhrase(sourcePhrase);
alignment.clear();
scores.clear();
-
+
state = Symbol;
}
-
- if(state == Symbol)
- {
- unsigned symbol = m_symbolTree->Read(encodedBitStream);
- if(symbol == phraseStopSymbol)
- {
+
+ if(state == Symbol) {
+ unsigned symbol = m_symbolTree->Read(encodedBitStream);
+ if(symbol == phraseStopSymbol) {
state = Score;
- }
- else
- {
- if(m_coding == REnc)
- {
+ } else {
+ if(m_coding == REnc) {
std::string wordString;
size_t type = GetREncType(symbol);
-
- if(type == 1)
- {
+
+ if(type == 1) {
unsigned decodedSymbol = DecodeREncSymbol1(symbol);
wordString = GetTargetSymbol(decodedSymbol);
- }
- else if (type == 2)
- {
+ } else if (type == 2) {
size_t rank = DecodeREncSymbol2Rank(symbol);
size_t srcPos = DecodeREncSymbol2Position(symbol);
-
+
if(srcPos >= sourceWords.size())
- return TargetPhraseVectorPtr();
-
+ return TargetPhraseVectorPtr();
+
wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank));
- if(m_phraseDictionary.m_useAlignmentInfo)
- {
+ if(m_phraseDictionary.m_useAlignmentInfo) {
size_t trgPos = targetPhrase->GetSize();
alignment.insert(AlignPoint(srcPos, trgPos));
}
- }
- else if(type == 3)
- {
+ } else if(type == 3) {
size_t rank = DecodeREncSymbol3(symbol);
size_t srcPos = targetPhrase->GetSize();
-
+
if(srcPos >= sourceWords.size())
- return TargetPhraseVectorPtr();
-
- wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank));
- if(m_phraseDictionary.m_useAlignmentInfo)
- {
+ return TargetPhraseVectorPtr();
+
+ wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank));
+ if(m_phraseDictionary.m_useAlignmentInfo) {
size_t trgPos = srcPos;
alignment.insert(AlignPoint(srcPos, trgPos));
}
}
-
+
Word word;
word.CreateFromString(Output, *m_output, wordString, false);
targetPhrase->AddWord(word);
- }
- else if(m_coding == PREnc)
- {
+ } else if(m_coding == PREnc) {
// if the symbol is just a word
- if(GetPREncType(symbol) == 1)
- {
+ if(GetPREncType(symbol) == 1) {
unsigned decodedSymbol = DecodePREncSymbol1(symbol);
-
+
Word word;
word.CreateFromString(Output, *m_output,
GetTargetSymbol(decodedSymbol), false);
targetPhrase->AddWord(word);
}
// if the symbol is a subphrase pointer
- else
- {
+ else {
int left = DecodePREncSymbol2Left(symbol);
int right = DecodePREncSymbol2Right(symbol);
unsigned rank = DecodePREncSymbol2Rank(symbol);
-
+
int srcStart = left + targetPhrase->GetSize();
int srcEnd = srcSize - right - 1;
-
+
// false positive consistency check
if(0 > srcStart || srcStart > srcEnd || unsigned(srcEnd) >= srcSize)
return TargetPhraseVectorPtr();
-
+
// false positive consistency check
if(m_maxRank && rank > m_maxRank)
- return TargetPhraseVectorPtr();
-
+ return TargetPhraseVectorPtr();
+
// set subphrase by default to itself
TargetPhraseVectorPtr subTpv = tpv;
-
+
// if range smaller than source phrase retrieve subphrase
- if(unsigned(srcEnd - srcStart + 1) != srcSize)
- {
+ if(unsigned(srcEnd - srcStart + 1) != srcSize) {
Phrase subPhrase = sourcePhrase.GetSubString(WordsRange(srcStart, srcEnd));
subTpv = CreateTargetPhraseCollection(subPhrase, false);
- }
- else {
+ } else {
// false positive consistency check
if(rank >= tpv->size()-1)
return TargetPhraseVectorPtr();
}
-
+
// false positive consistency check
- if(subTpv != NULL && rank < subTpv->size())
- {
+ if(subTpv != NULL && rank < subTpv->size()) {
// insert the subphrase into the main target phrase
TargetPhrase& subTp = subTpv->at(rank);
- if(m_phraseDictionary.m_useAlignmentInfo)
- {
+ if(m_phraseDictionary.m_useAlignmentInfo) {
// reconstruct the alignment data based on the alignment of the subphrase
for(AlignmentInfo::const_iterator it = subTp.GetAlignTerm().begin();
- it != subTp.GetAlignTerm().end(); it++)
- {
+ it != subTp.GetAlignTerm().end(); it++) {
alignment.insert(AlignPointSizeT(srcStart + it->first,
targetPhrase->GetSize() + it->second));
}
}
targetPhrase->Append(subTp);
- }
- else
+ } else
return TargetPhraseVectorPtr();
}
- }
- else
- {
- Word word;
- word.CreateFromString(Output, *m_output,
- GetTargetSymbol(symbol), false);
- targetPhrase->AddWord(word);
+ } else {
+ Word word;
+ word.CreateFromString(Output, *m_output,
+ GetTargetSymbol(symbol), false);
+ targetPhrase->AddWord(word);
}
}
- }
- else if(state == Score)
- {
+ } else if(state == Score) {
size_t idx = m_multipleScoreTrees ? scores.size() : 0;
float score = m_scoreTrees[idx]->Read(encodedBitStream);
scores.push_back(score);
-
- if(scores.size() == m_numScoreComponent)
- {
+
+ if(scores.size() == m_numScoreComponent) {
targetPhrase->GetScoreBreakdown().Assign(&m_phraseDictionary, scores);
targetPhrase->Evaluate(sourcePhrase);
@@ -443,49 +404,41 @@ TargetPhraseVectorPtr PhraseDecoder::DecodeCollection(
else
state = Add;
}
- }
- else if(state == Alignment)
- {
+ } else if(state == Alignment) {
AlignPoint alignPoint = m_alignTree->Read(encodedBitStream);
- if(alignPoint == alignStopSymbol)
- {
+ if(alignPoint == alignStopSymbol) {
state = Add;
- }
- else
- {
- if(m_phraseDictionary.m_useAlignmentInfo)
+ } else {
+ if(m_phraseDictionary.m_useAlignmentInfo)
alignment.insert(AlignPointSizeT(alignPoint));
}
}
-
- if(state == Add)
- {
+
+ if(state == Add) {
if(m_phraseDictionary.m_useAlignmentInfo) {
targetPhrase->SetAlignTerm(alignment);
}
-
- if(m_coding == PREnc)
- {
+
+ if(m_coding == PREnc) {
if(!m_maxRank || tpv->size() <= m_maxRank)
bitsLeft = encodedBitStream.TellFromEnd();
-
+
if(!topLevel && m_maxRank && tpv->size() >= m_maxRank)
break;
}
-
+
if(encodedBitStream.TellFromEnd() <= 8)
break;
-
+
state = New;
- }
+ }
}
-
- if(m_coding == PREnc && !extending)
- {
+
+ if(m_coding == PREnc && !extending) {
bitsLeft = bitsLeft > 8 ? bitsLeft : 0;
m_decodingCache.Cache(sourcePhrase, tpv, bitsLeft, m_maxRank);
}
-
+
return tpv;
}
diff --git a/moses/TranslationModel/CompactPT/PhraseDecoder.h b/moses/TranslationModel/CompactPT/PhraseDecoder.h
index 13c8af300..85e9334da 100644
--- a/moses/TranslationModel/CompactPT/PhraseDecoder.h
+++ b/moses/TranslationModel/CompactPT/PhraseDecoder.h
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#ifndef moses_PhraseDecoder_h
#define moses_PhraseDecoder_h
@@ -52,93 +52,93 @@ class PhraseDictionaryCompact;
class PhraseDecoder
{
- protected:
-
- friend class PhraseDictionaryCompact;
-
- typedef std::pair<unsigned char, unsigned char> AlignPoint;
- typedef std::pair<unsigned, unsigned> SrcTrg;
-
- enum Coding { None, REnc, PREnc } m_coding;
-
- size_t m_numScoreComponent;
- bool m_containsAlignmentInfo;
- size_t m_maxRank;
- size_t m_maxPhraseLength;
-
- boost::unordered_map<std::string, unsigned> m_sourceSymbolsMap;
- StringVector<unsigned char, unsigned, std::allocator> m_sourceSymbols;
- StringVector<unsigned char, unsigned, std::allocator> m_targetSymbols;
-
- std::vector<size_t> m_lexicalTableIndex;
- std::vector<SrcTrg> m_lexicalTable;
-
- CanonicalHuffman<unsigned>* m_symbolTree;
-
- bool m_multipleScoreTrees;
- std::vector<CanonicalHuffman<float>*> m_scoreTrees;
-
- CanonicalHuffman<AlignPoint>* m_alignTree;
-
- TargetPhraseCollectionCache m_decodingCache;
-
- PhraseDictionaryCompact& m_phraseDictionary;
-
- // ***********************************************
-
- const std::vector<FactorType>* m_input;
- const std::vector<FactorType>* m_output;
- const std::vector<float>* m_weight;
-
- std::string m_separator;
-
- // ***********************************************
-
- unsigned GetSourceSymbolId(std::string& s);
- std::string GetTargetSymbol(unsigned id) const;
-
- size_t GetREncType(unsigned encodedSymbol);
- size_t GetPREncType(unsigned encodedSymbol);
-
- unsigned GetTranslation(unsigned srcIdx, size_t rank);
-
- size_t GetMaxSourcePhraseLength();
-
- unsigned DecodeREncSymbol1(unsigned encodedSymbol);
- unsigned DecodeREncSymbol2Rank(unsigned encodedSymbol);
- unsigned DecodeREncSymbol2Position(unsigned encodedSymbol);
- unsigned DecodeREncSymbol3(unsigned encodedSymbol);
-
- unsigned DecodePREncSymbol1(unsigned encodedSymbol);
- int DecodePREncSymbol2Left(unsigned encodedSymbol);
- int DecodePREncSymbol2Right(unsigned encodedSymbol);
- unsigned DecodePREncSymbol2Rank(unsigned encodedSymbol);
-
- std::string MakeSourceKey(std::string &);
-
- public:
-
- PhraseDecoder(
- PhraseDictionaryCompact &phraseDictionary,
- const std::vector<FactorType>* input,
- const std::vector<FactorType>* output,
- size_t numScoreComponent,
- const std::vector<float>* weight
- );
-
- ~PhraseDecoder();
-
- size_t Load(std::FILE* in);
-
- TargetPhraseVectorPtr CreateTargetPhraseCollection(const Phrase &sourcePhrase,
- bool topLevel = false);
-
- TargetPhraseVectorPtr DecodeCollection(TargetPhraseVectorPtr tpv,
- BitWrapper<> &encodedBitStream,
- const Phrase &sourcePhrase,
- bool topLevel);
-
- void PruneCache();
+protected:
+
+ friend class PhraseDictionaryCompact;
+
+ typedef std::pair<unsigned char, unsigned char> AlignPoint;
+ typedef std::pair<unsigned, unsigned> SrcTrg;
+
+ enum Coding { None, REnc, PREnc } m_coding;
+
+ size_t m_numScoreComponent;
+ bool m_containsAlignmentInfo;
+ size_t m_maxRank;
+ size_t m_maxPhraseLength;
+
+ boost::unordered_map<std::string, unsigned> m_sourceSymbolsMap;
+ StringVector<unsigned char, unsigned, std::allocator> m_sourceSymbols;
+ StringVector<unsigned char, unsigned, std::allocator> m_targetSymbols;
+
+ std::vector<size_t> m_lexicalTableIndex;
+ std::vector<SrcTrg> m_lexicalTable;
+
+ CanonicalHuffman<unsigned>* m_symbolTree;
+
+ bool m_multipleScoreTrees;
+ std::vector<CanonicalHuffman<float>*> m_scoreTrees;
+
+ CanonicalHuffman<AlignPoint>* m_alignTree;
+
+ TargetPhraseCollectionCache m_decodingCache;
+
+ PhraseDictionaryCompact& m_phraseDictionary;
+
+ // ***********************************************
+
+ const std::vector<FactorType>* m_input;
+ const std::vector<FactorType>* m_output;
+ const std::vector<float>* m_weight;
+
+ std::string m_separator;
+
+ // ***********************************************
+
+ unsigned GetSourceSymbolId(std::string& s);
+ std::string GetTargetSymbol(unsigned id) const;
+
+ size_t GetREncType(unsigned encodedSymbol);
+ size_t GetPREncType(unsigned encodedSymbol);
+
+ unsigned GetTranslation(unsigned srcIdx, size_t rank);
+
+ size_t GetMaxSourcePhraseLength();
+
+ unsigned DecodeREncSymbol1(unsigned encodedSymbol);
+ unsigned DecodeREncSymbol2Rank(unsigned encodedSymbol);
+ unsigned DecodeREncSymbol2Position(unsigned encodedSymbol);
+ unsigned DecodeREncSymbol3(unsigned encodedSymbol);
+
+ unsigned DecodePREncSymbol1(unsigned encodedSymbol);
+ int DecodePREncSymbol2Left(unsigned encodedSymbol);
+ int DecodePREncSymbol2Right(unsigned encodedSymbol);
+ unsigned DecodePREncSymbol2Rank(unsigned encodedSymbol);
+
+ std::string MakeSourceKey(std::string &);
+
+public:
+
+ PhraseDecoder(
+ PhraseDictionaryCompact &phraseDictionary,
+ const std::vector<FactorType>* input,
+ const std::vector<FactorType>* output,
+ size_t numScoreComponent,
+ const std::vector<float>* weight
+ );
+
+ ~PhraseDecoder();
+
+ size_t Load(std::FILE* in);
+
+ TargetPhraseVectorPtr CreateTargetPhraseCollection(const Phrase &sourcePhrase,
+ bool topLevel = false);
+
+ TargetPhraseVectorPtr DecodeCollection(TargetPhraseVectorPtr tpv,
+ BitWrapper<> &encodedBitStream,
+ const Phrase &sourcePhrase,
+ bool topLevel);
+
+ void PruneCache();
};
}
diff --git a/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp b/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp
index e863eb812..ff33f10a7 100644
--- a/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp
+++ b/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#include <fstream>
#include <string>
@@ -40,42 +40,35 @@ using namespace std;
namespace Moses
{
-
+
bool PhraseDictionaryCompact::InitDictionary()
{
const StaticData &staticData = StaticData::Instance();
m_weight = staticData.GetWeights(this);
-
+
std::string tFilePath = m_filePath;
-
+
std::string suffix = ".minphr";
- if(tFilePath.substr(tFilePath.length() - suffix.length(), suffix.length()) == suffix)
- {
- if(!FileExists(tFilePath))
- {
+ if(tFilePath.substr(tFilePath.length() - suffix.length(), suffix.length()) == suffix) {
+ if(!FileExists(tFilePath)) {
std::cerr << "Error: File " << tFilePath << " does not exit." << std::endl;
exit(1);
}
- }
- else
- {
- if(FileExists(tFilePath + suffix))
- {
+ } else {
+ if(FileExists(tFilePath + suffix)) {
tFilePath += suffix;
- }
- else
- {
- std::cerr << "Error: File " << tFilePath << ".minphr does not exit." << std::endl;
- exit(1);
+ } else {
+ std::cerr << "Error: File " << tFilePath << ".minphr does not exit." << std::endl;
+ exit(1);
}
}
m_phraseDecoder = new PhraseDecoder(*this, &m_input, &m_output,
- m_numScoreComponents, &m_weight);
+ m_numScoreComponents, &m_weight);
std::FILE* pFile = std::fopen(tFilePath.c_str() , "r");
-
+
size_t indexSize;
if(m_inMemory)
// Load source phrase index into memory
@@ -85,7 +78,7 @@ bool PhraseDictionaryCompact::InitDictionary()
indexSize = m_hash.LoadIndex(pFile);
size_t coderSize = m_phraseDecoder->Load(pFile);
-
+
size_t phraseSize;
if(m_inMemory)
// Load target phrase collections into memory
@@ -93,8 +86,8 @@ bool PhraseDictionaryCompact::InitDictionary()
else
// Keep target phrase collections on disk
phraseSize = m_targetPhrasesMapped.load(pFile, true);
-
- return indexSize && coderSize && phraseSize;
+
+ return indexSize && coderSize && phraseSize;
}
struct CompareTargetPhrase {
@@ -104,21 +97,22 @@ struct CompareTargetPhrase {
};
const TargetPhraseCollection*
-PhraseDictionaryCompact::GetTargetPhraseCollection(const Phrase &sourcePhrase) const {
-
+PhraseDictionaryCompact::GetTargetPhraseCollection(const Phrase &sourcePhrase) const
+{
+
// There is no souch source phrase if source phrase is longer than longest
- // observed source phrase during compilation
+ // observed source phrase during compilation
if(sourcePhrase.GetSize() > m_phraseDecoder->GetMaxSourcePhraseLength())
return NULL;
// Retrieve target phrase collection from phrase table
TargetPhraseVectorPtr decodedPhraseColl
- = m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true);
-
+ = m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true);
+
if(decodedPhraseColl != NULL && decodedPhraseColl->size()) {
TargetPhraseVectorPtr tpv(new TargetPhraseVector(*decodedPhraseColl));
TargetPhraseCollection* phraseColl = new TargetPhraseCollection();
-
+
// Score phrases and if possible apply ttable_limit
TargetPhraseVector::iterator nth =
(m_tableLimit == 0 || tpv->size() < m_tableLimit) ?
@@ -129,21 +123,21 @@ PhraseDictionaryCompact::GetTargetPhraseCollection(const Phrase &sourcePhrase) c
cerr << *tp << endl;
phraseColl->Add(tp);
}
-
+
// Cache phrase pair for for clean-up or retrieval with PREnc
const_cast<PhraseDictionaryCompact*>(this)->CacheForCleanup(phraseColl);
-
+
return phraseColl;
- }
- else
+ } else
return NULL;
}
TargetPhraseVectorPtr
-PhraseDictionaryCompact::GetTargetPhraseCollectionRaw(const Phrase &sourcePhrase) const {
+PhraseDictionaryCompact::GetTargetPhraseCollectionRaw(const Phrase &sourcePhrase) const
+{
// There is no souch source phrase if source phrase is longer than longest
- // observed source phrase during compilation
+ // observed source phrase during compilation
if(sourcePhrase.GetSize() > m_phraseDecoder->GetMaxSourcePhraseLength())
return TargetPhraseVectorPtr();
@@ -151,42 +145,45 @@ PhraseDictionaryCompact::GetTargetPhraseCollectionRaw(const Phrase &sourcePhrase
return m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true);
}
-PhraseDictionaryCompact::~PhraseDictionaryCompact() {
+PhraseDictionaryCompact::~PhraseDictionaryCompact()
+{
if(m_phraseDecoder)
delete m_phraseDecoder;
}
//TO_STRING_BODY(PhraseDictionaryCompact)
-void PhraseDictionaryCompact::CacheForCleanup(TargetPhraseCollection* tpc) {
+void PhraseDictionaryCompact::CacheForCleanup(TargetPhraseCollection* tpc)
+{
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_sentenceMutex);
- PhraseCache &ref = m_sentenceCache[boost::this_thread::get_id()];
+ PhraseCache &ref = m_sentenceCache[boost::this_thread::get_id()];
#else
- PhraseCache &ref = m_sentenceCache;
+ PhraseCache &ref = m_sentenceCache;
#endif
ref.push_back(tpc);
}
void PhraseDictionaryCompact::AddEquivPhrase(const Phrase &source,
- const TargetPhrase &targetPhrase) { }
+ const TargetPhrase &targetPhrase) { }
-void PhraseDictionaryCompact::CleanUpAfterSentenceProcessing(const InputType &source) {
+void PhraseDictionaryCompact::CleanUpAfterSentenceProcessing(const InputType &source)
+{
if(!m_inMemory)
m_hash.KeepNLastRanges(0.01, 0.2);
-
+
m_phraseDecoder->PruneCache();
-
+
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_sentenceMutex);
- PhraseCache &ref = m_sentenceCache[boost::this_thread::get_id()];
+ PhraseCache &ref = m_sentenceCache[boost::this_thread::get_id()];
#else
- PhraseCache &ref = m_sentenceCache;
+ PhraseCache &ref = m_sentenceCache;
#endif
-
- for(PhraseCache::iterator it = ref.begin(); it != ref.end(); it++)
- delete *it;
-
+
+ for(PhraseCache::iterator it = ref.begin(); it != ref.end(); it++)
+ delete *it;
+
PhraseCache temp;
temp.swap(ref);
}
diff --git a/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.h b/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.h
index 1eab58894..60969665a 100644
--- a/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.h
+++ b/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.h
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#ifndef moses_PhraseDictionaryCompact_h
#define moses_PhraseDictionaryCompact_h
@@ -50,7 +50,7 @@ protected:
bool m_inMemory;
bool m_useAlignmentInfo;
-
+
typedef std::vector<TargetPhraseCollection*> PhraseCache;
#ifdef WITH_THREADS
boost::mutex m_sentenceMutex;
@@ -59,23 +59,22 @@ protected:
typedef PhraseCache SentenceCache;
#endif
SentenceCache m_sentenceCache;
-
+
BlockHashIndex m_hash;
PhraseDecoder* m_phraseDecoder;
-
+
StringVector<unsigned char, size_t, MmapAllocator> m_targetPhrasesMapped;
StringVector<unsigned char, size_t, std::allocator> m_targetPhrasesMemory;
std::vector<float> m_weight;
public:
PhraseDictionaryCompact(const std::string &line)
- :PhraseDictionary("PhraseDictionaryCompact", line)
- ,m_inMemory(true)
- ,m_useAlignmentInfo(true)
- ,m_hash(10, 16)
- ,m_phraseDecoder(0)
- ,m_weight(0)
- {
+ :PhraseDictionary("PhraseDictionaryCompact", line)
+ ,m_inMemory(true)
+ ,m_useAlignmentInfo(true)
+ ,m_hash(10, 16)
+ ,m_phraseDecoder(0)
+ ,m_weight(0) {
}
~PhraseDictionaryCompact();
@@ -84,16 +83,15 @@ public:
const TargetPhraseCollection* GetTargetPhraseCollection(const Phrase &source) const;
TargetPhraseVectorPtr GetTargetPhraseCollectionRaw(const Phrase &source) const;
-
+
void AddEquivPhrase(const Phrase &source, const TargetPhrase &targetPhrase);
-
+
void CacheForCleanup(TargetPhraseCollection* tpc);
void CleanUpAfterSentenceProcessing(const InputType &source);
virtual ChartRuleLookupManager *CreateRuleLookupManager(
const InputType &,
- const ChartCellCollectionBase &)
- {
+ const ChartCellCollectionBase &) {
assert(false);
return 0;
}
diff --git a/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp b/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp
index c7bd81019..fc3b056c6 100644
--- a/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp
+++ b/moses/TranslationModel/CompactPT/PhraseTableCreator.cpp
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#include <cstdio>
@@ -29,17 +29,17 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
namespace Moses
{
-
+
bool operator<(const PackedItem &pi1, const PackedItem &pi2)
{
if(pi1.GetLine() < pi2.GetLine())
- return false;
+ return false;
return true;
}
-
+
std::string PhraseTableCreator::m_phraseStopSymbol = "__SPECIAL_STOP_SYMBOL__";
std::string PhraseTableCreator::m_separator = " ||| ";
-
+
PhraseTableCreator::PhraseTableCreator(std::string inPath,
std::string outPath,
std::string tempfilePath,
@@ -56,7 +56,7 @@ PhraseTableCreator::PhraseTableCreator(std::string inPath,
#ifdef WITH_THREADS
, size_t threads
#endif
- )
+ )
: m_inPath(inPath), m_outPath(outPath), m_tempfilePath(tempfilePath),
m_outFile(std::fopen(m_outPath.c_str(), "w")), m_numScoreComponent(numScoreComponent),
m_sortScoreIndex(sortScoreIndex), m_warnMe(warnMe),
@@ -64,81 +64,76 @@ PhraseTableCreator::PhraseTableCreator(std::string inPath,
m_useAlignmentInfo(useAlignmentInfo),
m_multipleScoreTrees(multipleScoreTrees),
m_quantize(quantize), m_maxRank(maxRank),
- #ifdef WITH_THREADS
+#ifdef WITH_THREADS
m_threads(threads),
m_srcHash(m_orderBits, m_fingerPrintBits, 1),
m_rnkHash(10, 24, m_threads),
- #else
+#else
m_srcHash(m_orderBits, m_fingerPrintBits),
m_rnkHash(m_orderBits, m_fingerPrintBits),
- #endif
+#endif
m_maxPhraseLength(0),
m_lastFlushedLine(-1), m_lastFlushedSourceNum(0),
m_lastFlushedSourcePhrase("")
{
PrintInfo();
-
+
AddTargetSymbolId(m_phraseStopSymbol);
-
+
size_t cur_pass = 1;
size_t all_passes = 2;
if(m_coding == PREnc)
all_passes = 3;
-
+
m_scoreCounters.resize(m_multipleScoreTrees ? m_numScoreComponent : 1);
for(std::vector<ScoreCounter*>::iterator it = m_scoreCounters.begin();
- it != m_scoreCounters.end(); it++)
+ it != m_scoreCounters.end(); it++)
*it = new ScoreCounter();
m_scoreTrees.resize(m_multipleScoreTrees ? m_numScoreComponent : 1);
-
+
// 0th pass
- if(m_coding == REnc)
- {
+ if(m_coding == REnc) {
size_t found = inPath.find_last_of("/\\");
std::string path;
- if(found != std::string::npos)
+ if(found != std::string::npos)
path = inPath.substr(0, found);
else
path = ".";
LoadLexicalTable(path + "/lex.f2e");
- }
- else if(m_coding == PREnc)
- {
+ } else if(m_coding == PREnc) {
std::cerr << "Pass " << cur_pass << "/" << all_passes << ": Creating hash function for rank assignment" << std::endl;
cur_pass++;
CreateRankHash();
}
-
+
// 1st pass
std::cerr << "Pass " << cur_pass << "/" << all_passes << ": Creating source phrase index + Encoding target phrases" << std::endl;
m_srcHash.BeginSave(m_outFile);
-
+
if(tempfilePath.size()) {
MmapAllocator<unsigned char> allocEncoded(util::FMakeTemp(tempfilePath));
m_encodedTargetPhrases = new StringVector<unsigned char, unsigned long, MmapAllocator>(allocEncoded);
- }
- else {
- m_encodedTargetPhrases = new StringVector<unsigned char, unsigned long, MmapAllocator>();
+ } else {
+ m_encodedTargetPhrases = new StringVector<unsigned char, unsigned long, MmapAllocator>();
}
EncodeTargetPhrases();
-
+
cur_pass++;
-
+
std::cerr << "Intermezzo: Calculating Huffman code sets" << std::endl;
CalcHuffmanCodes();
-
+
// 2nd pass
std::cerr << "Pass " << cur_pass << "/" << all_passes << ": Compressing target phrases" << std::endl;
-
+
if(tempfilePath.size()) {
MmapAllocator<unsigned char> allocCompressed(util::FMakeTemp(tempfilePath));
m_compressedTargetPhrases = new StringVector<unsigned char, unsigned long, MmapAllocator>(allocCompressed);
- }
- else {
+ } else {
m_compressedTargetPhrases = new StringVector<unsigned char, unsigned long, MmapAllocator>();
}
CompressTargetPhrases();
-
+
std::cerr << "Saving to " << m_outPath << std::endl;
Save();
std::cerr << "Done" << std::endl;
@@ -149,44 +144,43 @@ PhraseTableCreator::~PhraseTableCreator()
{
delete m_symbolTree;
if(m_useAlignmentInfo)
- delete m_alignTree;
+ delete m_alignTree;
for(size_t i = 0; i < m_scoreTrees.size(); i++) {
delete m_scoreTrees[i];
delete m_scoreCounters[i];
}
-
+
delete m_encodedTargetPhrases;
- delete m_compressedTargetPhrases;
+ delete m_compressedTargetPhrases;
}
void PhraseTableCreator::PrintInfo()
{
std::string encodings[3] = {"Huffman", "Huffman + REnc", "Huffman + PREnc"};
-
+
std::cerr << "Used options:" << std::endl;
std::cerr << "\tText phrase table will be read from: " << m_inPath << std::endl;
std::cerr << "\tOutput phrase table will be written to: " << m_outPath << std::endl;
std::cerr << "\tStep size for source landmark phrases: 2^" << m_orderBits << "=" << (1ul << m_orderBits) << std::endl;
std::cerr << "\tSource phrase fingerprint size: " << m_fingerPrintBits << " bits / P(fp)=" << (float(1)/(1ul << m_fingerPrintBits)) << std::endl;
std::cerr << "\tSelected target phrase encoding: " << encodings[m_coding] << std::endl;
- if(m_coding == PREnc)
- {
+ if(m_coding == PREnc) {
std::cerr << "\tMaxiumum allowed rank for PREnc: ";
if(!m_maxRank)
std::cerr << "unlimited" << std::endl;
else
- std::cerr << m_maxRank << std::endl;
+ std::cerr << m_maxRank << std::endl;
}
- std::cerr << "\tNumber of score components in phrase table: " << m_numScoreComponent << std::endl;
- std::cerr << "\tSingle Huffman code set for score components: " << (m_multipleScoreTrees ? "no" : "yes") << std::endl;
+ std::cerr << "\tNumber of score components in phrase table: " << m_numScoreComponent << std::endl;
+ std::cerr << "\tSingle Huffman code set for score components: " << (m_multipleScoreTrees ? "no" : "yes") << std::endl;
std::cerr << "\tUsing score quantization: ";
if(m_quantize)
std::cerr << m_quantize << " best" << std::endl;
else
std::cerr << "no" << std::endl;
- std::cerr << "\tExplicitly included alignment information: " << (m_useAlignmentInfo ? "yes" : "no") << std::endl;
-
-#ifdef WITH_THREADS
+ std::cerr << "\tExplicitly included alignment information: " << (m_useAlignmentInfo ? "yes" : "no") << std::endl;
+
+#ifdef WITH_THREADS
std::cerr << "\tRunning with " << m_threads << " threads" << std::endl;
#endif
std::cerr << std::endl;
@@ -200,22 +194,21 @@ void PhraseTableCreator::Save()
ThrowingFwrite(&m_useAlignmentInfo, sizeof(m_useAlignmentInfo), 1, m_outFile);
ThrowingFwrite(&m_maxRank, sizeof(m_maxRank), 1, m_outFile);
ThrowingFwrite(&m_maxPhraseLength, sizeof(m_maxPhraseLength), 1, m_outFile);
-
- if(m_coding == REnc)
- {
+
+ if(m_coding == REnc) {
// Save source language symbols for REnc
std::vector<std::string> temp1;
temp1.resize(m_sourceSymbolsMap.size());
for(boost::unordered_map<std::string, unsigned>::iterator it
= m_sourceSymbolsMap.begin(); it != m_sourceSymbolsMap.end(); it++)
- temp1[it->second] = it->first;
+ temp1[it->second] = it->first;
std::sort(temp1.begin(), temp1.end());
StringVector<unsigned char, unsigned, std::allocator> sourceSymbols;
for(std::vector<std::string>::iterator it = temp1.begin();
it != temp1.end(); it++)
- sourceSymbols.push_back(*it);
+ sourceSymbols.push_back(*it);
sourceSymbols.save(m_outFile);
-
+
// Save lexical translation table for REnc
size_t size = m_lexicalTableIndex.size();
ThrowingFwrite(&size, sizeof(size_t), 1, m_outFile);
@@ -224,95 +217,92 @@ void PhraseTableCreator::Save()
ThrowingFwrite(&size, sizeof(size_t), 1, m_outFile);
ThrowingFwrite(&m_lexicalTable[0], sizeof(SrcTrg), size, m_outFile);
}
-
+
// Save target language symbols
std::vector<std::string> temp2;
temp2.resize(m_targetSymbolsMap.size());
for(boost::unordered_map<std::string, unsigned>::iterator it
- = m_targetSymbolsMap.begin(); it != m_targetSymbolsMap.end(); it++)
+ = m_targetSymbolsMap.begin(); it != m_targetSymbolsMap.end(); it++)
temp2[it->second] = it->first;
StringVector<unsigned char, unsigned, std::allocator> targetSymbols;
for(std::vector<std::string>::iterator it = temp2.begin();
- it != temp2.end(); it++)
+ it != temp2.end(); it++)
targetSymbols.push_back(*it);
targetSymbols.save(m_outFile);
-
+
// Save Huffman codes for target language symbols
m_symbolTree->Save(m_outFile);
-
+
// Save number of Huffman code sets for scores and
// save Huffman code sets
ThrowingFwrite(&m_multipleScoreTrees, sizeof(m_multipleScoreTrees), 1, m_outFile);
size_t numScoreTrees = m_scoreTrees.size();
for(size_t i = 0; i < numScoreTrees; i++)
m_scoreTrees[i]->Save(m_outFile);
-
+
// Save Huffman codes for alignments
if(m_useAlignmentInfo)
m_alignTree->Save(m_outFile);
-
- // Save compressed target phrase collections
+
+ // Save compressed target phrase collections
m_compressedTargetPhrases->save(m_outFile);
}
-
+
void PhraseTableCreator::LoadLexicalTable(std::string filePath)
{
std::vector<SrcTrgProb> t_lexTable;
-
+
std::cerr << "Reading in lexical table for Rank Encoding" << std::endl;
std::ifstream lexIn(filePath.c_str(), std::ifstream::in);
std::string src, trg;
float prob;
-
+
// Reading in the translation probability lexicon
-
+
std::cerr << "\tLoading from " << filePath << std::endl;
- while(lexIn >> trg >> src >> prob)
- {
+ while(lexIn >> trg >> src >> prob) {
t_lexTable.push_back(SrcTrgProb(SrcTrgString(src, trg), prob));
AddSourceSymbolId(src);
AddTargetSymbolId(trg);
}
-
+
// Sorting lexicon by source words by lexicographical order, corresponding
// target words by decreasing probability.
-
+
std::cerr << "\tSorting according to translation rank" << std::endl;
std::sort(t_lexTable.begin(), t_lexTable.end(), SrcTrgProbSorter());
-
+
// Re-assigning source word ids in lexicographical order
-
+
std::vector<std::string> temp1;
temp1.resize(m_sourceSymbolsMap.size());
for(boost::unordered_map<std::string, unsigned>::iterator it
- = m_sourceSymbolsMap.begin(); it != m_sourceSymbolsMap.end(); it++)
+ = m_sourceSymbolsMap.begin(); it != m_sourceSymbolsMap.end(); it++)
temp1[it->second] = it->first;
-
+
std::sort(temp1.begin(), temp1.end());
-
+
for(size_t i = 0; i < temp1.size(); i++)
m_sourceSymbolsMap[temp1[i]] = i;
-
+
// Building the lexicon based on source and target word ids
-
+
std::string srcWord = "";
size_t srcIdx = 0;
for(std::vector<SrcTrgProb>::iterator it = t_lexTable.begin();
- it != t_lexTable.end(); it++)
- {
+ it != t_lexTable.end(); it++) {
// If we encounter a new source word
- if(it->first.first != srcWord)
- {
+ if(it->first.first != srcWord) {
srcIdx = GetSourceSymbolId(it->first.first);
-
+
// Store position of first translation
if(srcIdx >= m_lexicalTableIndex.size())
m_lexicalTableIndex.resize(srcIdx + 1);
m_lexicalTableIndex[srcIdx] = m_lexicalTable.size();
}
-
+
// Store pair of source word and target word
- size_t trgIdx = GetTargetSymbolId(it->first.second);
+ size_t trgIdx = GetTargetSymbolId(it->first.second);
m_lexicalTable.push_back(SrcTrg(srcIdx, trgIdx));
srcWord = it->first.first;
@@ -322,14 +312,13 @@ void PhraseTableCreator::LoadLexicalTable(std::string filePath)
}
void PhraseTableCreator::CreateRankHash()
-{
+{
InputFileStream inFile(m_inPath);
#ifdef WITH_THREADS
boost::thread_group threads;
- for (size_t i = 0; i < m_threads; ++i)
- {
- RankingTask* rt = new RankingTask(inFile, *this);
+ for (size_t i = 0; i < m_threads; ++i) {
+ RankingTask* rt = new RankingTask(inFile, *this);
threads.create_thread(*rt);
}
threads.join_all();
@@ -343,7 +332,7 @@ void PhraseTableCreator::CreateRankHash()
inline std::string PhraseTableCreator::MakeSourceKey(std::string &source)
{
- return source + m_separator;
+ return source + m_separator;
}
inline std::string PhraseTableCreator::MakeSourceTargetKey(std::string &source, std::string &target)
@@ -357,9 +346,8 @@ void PhraseTableCreator::EncodeTargetPhrases()
#ifdef WITH_THREADS
boost::thread_group threads;
- for (size_t i = 0; i < m_threads; ++i)
- {
- EncodingTask* et = new EncodingTask(inFile, *this);
+ for (size_t i = 0; i < m_threads; ++i) {
+ EncodingTask* et = new EncodingTask(inFile, *this);
threads.create_thread(*et);
}
threads.join_all();
@@ -368,17 +356,17 @@ void PhraseTableCreator::EncodeTargetPhrases()
(*et)();
delete et;
#endif
- FlushEncodedQueue(true);
+ FlushEncodedQueue(true);
}
void PhraseTableCreator::CompressTargetPhrases()
-{
+{
#ifdef WITH_THREADS
boost::thread_group threads;
for (size_t i = 0; i < m_threads; ++i) {
- CompressionTask* ct = new CompressionTask(*m_encodedTargetPhrases, *this);
- threads.create_thread(*ct);
+ CompressionTask* ct = new CompressionTask(*m_encodedTargetPhrases, *this);
+ threads.create_thread(*ct);
}
threads.join_all();
#else
@@ -392,29 +380,27 @@ void PhraseTableCreator::CompressTargetPhrases()
void PhraseTableCreator::CalcHuffmanCodes()
{
std::cerr << "\tCreating Huffman codes for " << m_symbolCounter.Size()
- << " target phrase symbols" << std::endl;
-
+ << " target phrase symbols" << std::endl;
+
m_symbolTree = new SymbolTree(m_symbolCounter.Begin(),
- m_symbolCounter.End());
-
+ m_symbolCounter.End());
+
std::vector<ScoreTree*>::iterator treeIt = m_scoreTrees.begin();
for(std::vector<ScoreCounter*>::iterator it = m_scoreCounters.begin();
- it != m_scoreCounters.end(); it++)
- {
+ it != m_scoreCounters.end(); it++) {
if(m_quantize)
- (*it)->Quantize(m_quantize);
-
+ (*it)->Quantize(m_quantize);
+
std::cerr << "\tCreating Huffman codes for " << (*it)->Size()
- << " scores" << std::endl;
-
+ << " scores" << std::endl;
+
*treeIt = new ScoreTree((*it)->Begin(), (*it)->End());
treeIt++;
}
-
- if(m_useAlignmentInfo)
- {
+
+ if(m_useAlignmentInfo) {
std::cerr << "\tCreating Huffman codes for " << m_alignCounter.Size()
- << " alignment points" << std::endl;
+ << " alignment points" << std::endl;
m_alignTree = new AlignTree(m_alignCounter.Begin(), m_alignCounter.End());
}
std::cerr << std::endl;
@@ -440,9 +426,9 @@ void PhraseTableCreator::AddTargetSymbolId(std::string& symbol)
unsigned PhraseTableCreator::GetSourceSymbolId(std::string& symbol)
{
boost::unordered_map<std::string, unsigned>::iterator it
- = m_sourceSymbolsMap.find(symbol);
-
- if(it != m_sourceSymbolsMap.end())
+ = m_sourceSymbolsMap.find(symbol);
+
+ if(it != m_sourceSymbolsMap.end())
return it->second;
else
return m_sourceSymbolsMap.size();
@@ -451,9 +437,9 @@ unsigned PhraseTableCreator::GetSourceSymbolId(std::string& symbol)
unsigned PhraseTableCreator::GetTargetSymbolId(std::string& symbol)
{
boost::unordered_map<std::string, unsigned>::iterator it
- = m_targetSymbolsMap.find(symbol);
-
- if(it != m_targetSymbolsMap.end())
+ = m_targetSymbolsMap.find(symbol);
+
+ if(it != m_targetSymbolsMap.end())
return it->second;
else
return m_targetSymbolsMap.size();
@@ -465,12 +451,11 @@ unsigned PhraseTableCreator::GetOrAddTargetSymbolId(std::string& symbol)
boost::mutex::scoped_lock lock(m_mutex);
#endif
boost::unordered_map<std::string, unsigned>::iterator it
- = m_targetSymbolsMap.find(symbol);
-
- if(it != m_targetSymbolsMap.end())
+ = m_targetSymbolsMap.find(symbol);
+
+ if(it != m_targetSymbolsMap.end())
return it->second;
- else
- {
+ else {
unsigned value = m_targetSymbolsMap.size();
m_targetSymbolsMap[symbol] = value;
return value;
@@ -481,12 +466,12 @@ unsigned PhraseTableCreator::GetRank(unsigned srcIdx, unsigned trgIdx)
{
size_t srcTrgIdx = m_lexicalTableIndex[srcIdx];
while(srcTrgIdx < m_lexicalTable.size()
- && srcIdx == m_lexicalTable[srcTrgIdx].first
- && m_lexicalTable[srcTrgIdx].second != trgIdx)
+ && srcIdx == m_lexicalTable[srcTrgIdx].first
+ && m_lexicalTable[srcTrgIdx].second != trgIdx)
srcTrgIdx++;
-
+
if(srcTrgIdx < m_lexicalTable.size()
- && m_lexicalTable[srcTrgIdx].second == trgIdx)
+ && m_lexicalTable[srcTrgIdx].second == trgIdx)
return srcTrgIdx - m_lexicalTableIndex[srcIdx];
else
return m_lexicalTable.size();
@@ -522,14 +507,14 @@ unsigned PhraseTableCreator::EncodePREncSymbol1(unsigned trgIdx)
unsigned PhraseTableCreator::EncodePREncSymbol2(int left, int right, unsigned rank)
{
// "left" and "right" must be smaller than 2^5
- // "rank" must be smaller than 2^19
+ // "rank" must be smaller than 2^19
left = left + 32;
right = right + 32;
-
+
assert(64 > left);
assert(64 > right);
assert(524288 > rank);
-
+
unsigned symbol = 0;
symbol |= 1 << 31;
symbol |= left << 25;
@@ -539,151 +524,135 @@ unsigned PhraseTableCreator::EncodePREncSymbol2(int left, int right, unsigned ra
}
void PhraseTableCreator::EncodeTargetPhraseNone(std::vector<std::string>& t,
- std::ostream& os)
+ std::ostream& os)
{
std::stringstream encodedTargetPhrase;
size_t j = 0;
- while(j < t.size())
- {
+ while(j < t.size()) {
unsigned targetSymbolId = GetOrAddTargetSymbolId(t[j]);
-
+
m_symbolCounter.Increase(targetSymbolId);
os.write((char*)&targetSymbolId, sizeof(targetSymbolId));
j++;
}
-
+
unsigned stopSymbolId = GetTargetSymbolId(m_phraseStopSymbol);
os.write((char*)&stopSymbolId, sizeof(stopSymbolId));
m_symbolCounter.Increase(stopSymbolId);
}
void PhraseTableCreator::EncodeTargetPhraseREnc(std::vector<std::string>& s,
- std::vector<std::string>& t,
- std::set<AlignPoint>& a,
- std::ostream& os)
-{
+ std::vector<std::string>& t,
+ std::set<AlignPoint>& a,
+ std::ostream& os)
+{
std::stringstream encodedTargetPhrase;
std::vector<std::vector<size_t> > a2(t.size());
for(std::set<AlignPoint>::iterator it = a.begin(); it != a.end(); it++)
a2[it->second].push_back(it->first);
- for(size_t i = 0; i < t.size(); i++)
- {
+ for(size_t i = 0; i < t.size(); i++) {
unsigned idxTarget = GetOrAddTargetSymbolId(t[i]);
unsigned encodedSymbol = -1;
-
+
unsigned bestSrcPos = s.size();
unsigned bestDiff = s.size();
unsigned bestRank = m_lexicalTable.size();
unsigned badRank = m_lexicalTable.size();
-
- for(std::vector<size_t>::iterator it = a2[i].begin(); it != a2[i].end(); it++)
- {
+
+ for(std::vector<size_t>::iterator it = a2[i].begin(); it != a2[i].end(); it++) {
unsigned idxSource = GetSourceSymbolId(s[*it]);
size_t r = GetRank(idxSource, idxTarget);
- if(r != badRank)
- {
- if(r < bestRank)
- {
+ if(r != badRank) {
+ if(r < bestRank) {
bestRank = r;
bestSrcPos = *it;
bestDiff = abs(*it-i);
- }
- else if(r == bestRank && unsigned(abs(*it-i)) < bestDiff)
- {
+ } else if(r == bestRank && unsigned(abs(*it-i)) < bestDiff) {
bestSrcPos = *it;
bestDiff = abs(*it-i);
}
}
}
-
- if(bestRank != badRank && bestSrcPos < s.size())
- {
+
+ if(bestRank != badRank && bestSrcPos < s.size()) {
if(bestSrcPos == i)
encodedSymbol = EncodeREncSymbol3(bestRank);
else
- encodedSymbol = EncodeREncSymbol2(bestSrcPos, bestRank);
+ encodedSymbol = EncodeREncSymbol2(bestSrcPos, bestRank);
a.erase(AlignPoint(bestSrcPos, i));
- }
- else
- {
+ } else {
encodedSymbol = EncodeREncSymbol1(idxTarget);
}
-
+
os.write((char*)&encodedSymbol, sizeof(encodedSymbol));
m_symbolCounter.Increase(encodedSymbol);
}
-
+
unsigned stopSymbolId = GetTargetSymbolId(m_phraseStopSymbol);
unsigned encodedSymbol = EncodeREncSymbol1(stopSymbolId);
os.write((char*)&encodedSymbol, sizeof(encodedSymbol));
- m_symbolCounter.Increase(encodedSymbol);
+ m_symbolCounter.Increase(encodedSymbol);
}
void PhraseTableCreator::EncodeTargetPhrasePREnc(std::vector<std::string>& s,
- std::vector<std::string>& t,
- std::set<AlignPoint>& a,
- size_t ownRank,
- std::ostream& os)
+ std::vector<std::string>& t,
+ std::set<AlignPoint>& a,
+ size_t ownRank,
+ std::ostream& os)
{
std::vector<unsigned> encodedSymbols(t.size());
std::vector<unsigned> encodedSymbolsLengths(t.size(), 0);
-
+
ConsistentPhrases cp(s.size(), t.size(), a);
while(!cp.Empty()) {
ConsistentPhrases::Phrase p = cp.Pop();
-
+
std::stringstream key1;
key1 << s[p.i];
for(int i = p.i+1; i < p.i+p.m; i++)
key1 << " " << s[i];
-
+
std::stringstream key2;
key2 << t[p.j];
for(int i = p.j+1; i < p.j+p.n; i++)
key2 << " " << t[i];
-
+
int rank = -1;
std::string key1Str = key1.str(), key2Str = key2.str();
size_t idx = m_rnkHash[MakeSourceTargetKey(key1Str, key2Str)];
if(idx != m_rnkHash.GetSize())
- rank = m_ranks[idx];
-
- if(rank >= 0 && (m_maxRank == 0 || unsigned(rank) < m_maxRank))
- {
- if(unsigned(p.m) != s.size() || unsigned(rank) < ownRank)
- {
+ rank = m_ranks[idx];
+
+ if(rank >= 0 && (m_maxRank == 0 || unsigned(rank) < m_maxRank)) {
+ if(unsigned(p.m) != s.size() || unsigned(rank) < ownRank) {
std::stringstream encodedSymbol;
encodedSymbols[p.j] = EncodePREncSymbol2(p.i-p.j, s.size()-(p.i+p.m), rank);
encodedSymbolsLengths[p.j] = p.n;
-
+
std::set<AlignPoint> tAlignment;
for(std::set<AlignPoint>::iterator it = a.begin();
- it != a.end(); it++)
+ it != a.end(); it++)
if(it->first < p.i || it->first >= p.i + p.m
- || it->second < p.j || it->second >= p.j + p.n)
- tAlignment.insert(*it);
+ || it->second < p.j || it->second >= p.j + p.n)
+ tAlignment.insert(*it);
a = tAlignment;
- cp.RemoveOverlap(p);
+ cp.RemoveOverlap(p);
}
}
}
-
+
std::stringstream encodedTargetPhrase;
-
+
size_t j = 0;
- while(j < t.size())
- {
- if(encodedSymbolsLengths[j] > 0)
- {
+ while(j < t.size()) {
+ if(encodedSymbolsLengths[j] > 0) {
unsigned encodedSymbol = encodedSymbols[j];
m_symbolCounter.Increase(encodedSymbol);
os.write((char*)&encodedSymbol, sizeof(encodedSymbol));
j += encodedSymbolsLengths[j];
- }
- else
- {
+ } else {
unsigned targetSymbolId = GetOrAddTargetSymbolId(t[j]);
unsigned encodedSymbol = EncodePREncSymbol1(targetSymbolId);
m_symbolCounter.Increase(encodedSymbol);
@@ -691,7 +660,7 @@ void PhraseTableCreator::EncodeTargetPhrasePREnc(std::vector<std::string>& s,
j++;
}
}
-
+
unsigned stopSymbolId = GetTargetSymbolId(m_phraseStopSymbol);
unsigned encodedSymbol = EncodePREncSymbol1(stopSymbolId);
os.write((char*)&encodedSymbol, sizeof(encodedSymbol));
@@ -702,9 +671,8 @@ void PhraseTableCreator::EncodeScores(std::vector<float>& scores, std::ostream&
{
size_t c = 0;
float score;
-
- while(c < scores.size())
- {
+
+ while(c < scores.size()) {
score = scores[c];
score = FloorScore(TransformScore(score));
os.write((char*)&score, sizeof(score));
@@ -714,11 +682,10 @@ void PhraseTableCreator::EncodeScores(std::vector<float>& scores, std::ostream&
}
void PhraseTableCreator::EncodeAlignment(std::set<AlignPoint>& alignment,
- std::ostream& os)
+ std::ostream& os)
{
for(std::set<AlignPoint>::iterator it = alignment.begin();
- it != alignment.end(); it++)
- {
+ it != alignment.end(); it++) {
os.write((char*)&(*it), sizeof(AlignPoint));
m_alignCounter.Increase(*it);
}
@@ -728,83 +695,77 @@ void PhraseTableCreator::EncodeAlignment(std::set<AlignPoint>& alignment,
}
std::string PhraseTableCreator::EncodeLine(std::vector<std::string>& tokens, size_t ownRank)
-{
+{
std::string sourcePhraseStr = tokens[0];
std::string targetPhraseStr = tokens[1];
std::string scoresStr = tokens[2];
-
+
std::string alignmentStr = "";
if(tokens.size() > 3)
alignmentStr = tokens[3];
-
+
std::vector<std::string> s = Tokenize(sourcePhraseStr);
-
+
size_t phraseLength = s.size();
if(m_maxPhraseLength < phraseLength)
m_maxPhraseLength = phraseLength;
-
+
std::vector<std::string> t = Tokenize(targetPhraseStr);
std::vector<float> scores = Tokenize<float>(scoresStr);
-
+
if(scores.size() != m_numScoreComponent) {
std::cerr << "Error: Wrong number of scores detected ("
- << scores.size() << " != " << m_numScoreComponent << ") :" << std::endl;
+ << scores.size() << " != " << m_numScoreComponent << ") :" << std::endl;
std::cerr << "Line: " << tokens[0] << " ||| " << tokens[1] << " ||| " << tokens[3] << " ..." << std::endl;
- abort();
+ abort();
}
-
+
std::set<AlignPoint> a;
- if(m_coding != None || m_useAlignmentInfo)
- {
+ if(m_coding != None || m_useAlignmentInfo) {
std::vector<size_t> positions = Tokenize<size_t>(alignmentStr, " \t-");
- for(size_t i = 0; i < positions.size(); i += 2)
- {
+ for(size_t i = 0; i < positions.size(); i += 2) {
a.insert(AlignPoint(positions[i], positions[i+1]));
}
}
-
+
std::stringstream encodedTargetPhrase;
-
- if(m_coding == PREnc)
- {
+
+ if(m_coding == PREnc) {
EncodeTargetPhrasePREnc(s, t, a, ownRank, encodedTargetPhrase);
+ } else if(m_coding == REnc) {
+ EncodeTargetPhraseREnc(s, t, a, encodedTargetPhrase);
+ } else {
+ EncodeTargetPhraseNone(t, encodedTargetPhrase);
}
- else if(m_coding == REnc)
- {
- EncodeTargetPhraseREnc(s, t, a, encodedTargetPhrase);
- }
- else
- {
- EncodeTargetPhraseNone(t, encodedTargetPhrase);
- }
-
+
EncodeScores(scores, encodedTargetPhrase);
-
+
if(m_useAlignmentInfo)
EncodeAlignment(a, encodedTargetPhrase);
-
+
return encodedTargetPhrase.str();
}
std::string PhraseTableCreator::CompressEncodedCollection(std::string encodedCollection)
-{
+{
enum EncodeState {
- ReadSymbol, ReadScore, ReadAlignment,
- EncodeSymbol, EncodeScore, EncodeAlignment };
+ ReadSymbol, ReadScore, ReadAlignment,
+ EncodeSymbol, EncodeScore, EncodeAlignment
+ };
EncodeState state = ReadSymbol;
unsigned phraseStopSymbolId;
if(m_coding == REnc)
phraseStopSymbolId = EncodeREncSymbol1(GetTargetSymbolId(m_phraseStopSymbol));
else if(m_coding == PREnc)
- phraseStopSymbolId = EncodePREncSymbol1(GetTargetSymbolId(m_phraseStopSymbol));
+ phraseStopSymbolId = EncodePREncSymbol1(GetTargetSymbolId(m_phraseStopSymbol));
else
phraseStopSymbolId = GetTargetSymbolId(m_phraseStopSymbol);
AlignPoint alignStopSymbol(-1, -1);
std::stringstream encodedStream(encodedCollection);
encodedStream.unsetf(std::ios::skipws);
-
+
std::string compressedEncodedCollection;
BitWrapper<> bitStream(compressedEncodedCollection);
@@ -812,56 +773,50 @@ std::string PhraseTableCreator::CompressEncodedCollection(std::string encodedCol
float score;
size_t currScore = 0;
AlignPoint alignPoint;
-
- while(encodedStream)
- {
- switch(state)
- {
- case ReadSymbol:
- encodedStream.read((char*) &symbol, sizeof(unsigned));
- state = EncodeSymbol;
- break;
- case ReadScore:
- if(currScore == m_numScoreComponent)
- {
- currScore = 0;
- if(m_useAlignmentInfo)
- state = ReadAlignment;
- else
- state = ReadSymbol;
- }
+
+ while(encodedStream) {
+ switch(state) {
+ case ReadSymbol:
+ encodedStream.read((char*) &symbol, sizeof(unsigned));
+ state = EncodeSymbol;
+ break;
+ case ReadScore:
+ if(currScore == m_numScoreComponent) {
+ currScore = 0;
+ if(m_useAlignmentInfo)
+ state = ReadAlignment;
else
- {
- encodedStream.read((char*) &score, sizeof(float));
- currScore++;
- state = EncodeScore;
- }
- break;
- case ReadAlignment:
- encodedStream.read((char*) &alignPoint, sizeof(AlignPoint));
- state = EncodeAlignment;
- break;
-
- case EncodeSymbol:
- state = (symbol == phraseStopSymbolId) ? ReadScore : ReadSymbol;
- m_symbolTree->Put(bitStream, symbol);
- break;
- case EncodeScore:
- {
- state = ReadScore;
- size_t idx = m_multipleScoreTrees ? currScore-1 : 0;
- if(m_quantize)
- score = m_scoreCounters[idx]->LowerBound(score);
- m_scoreTrees[idx]->Put(bitStream, score);
- }
- break;
- case EncodeAlignment:
- state = (alignPoint == alignStopSymbol) ? ReadSymbol : ReadAlignment;
- m_alignTree->Put(bitStream, alignPoint);
- break;
+ state = ReadSymbol;
+ } else {
+ encodedStream.read((char*) &score, sizeof(float));
+ currScore++;
+ state = EncodeScore;
+ }
+ break;
+ case ReadAlignment:
+ encodedStream.read((char*) &alignPoint, sizeof(AlignPoint));
+ state = EncodeAlignment;
+ break;
+
+ case EncodeSymbol:
+ state = (symbol == phraseStopSymbolId) ? ReadScore : ReadSymbol;
+ m_symbolTree->Put(bitStream, symbol);
+ break;
+ case EncodeScore: {
+ state = ReadScore;
+ size_t idx = m_multipleScoreTrees ? currScore-1 : 0;
+ if(m_quantize)
+ score = m_scoreCounters[idx]->LowerBound(score);
+ m_scoreTrees[idx]->Put(bitStream, score);
+ }
+ break;
+ case EncodeAlignment:
+ state = (alignPoint == alignStopSymbol) ? ReadSymbol : ReadAlignment;
+ m_alignTree->Put(bitStream, alignPoint);
+ break;
}
}
-
+
return compressedEncodedCollection;
}
@@ -873,32 +828,28 @@ void PhraseTableCreator::AddRankedLine(PackedItem& pi)
void PhraseTableCreator::FlushRankedQueue(bool force)
{
size_t step = 1ul << 10;
-
- while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine())
- {
+
+ while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) {
m_lastFlushedLine++;
PackedItem pi = m_queue.top();
m_queue.pop();
-
- if(m_lastSourceRange.size() == step)
- {
+
+ if(m_lastSourceRange.size() == step) {
m_rnkHash.AddRange(m_lastSourceRange);
m_lastSourceRange.clear();
}
-
- if(m_lastFlushedSourcePhrase != pi.GetSrc())
- {
- if(m_rankQueue.size()) {
+
+ if(m_lastFlushedSourcePhrase != pi.GetSrc()) {
+ if(m_rankQueue.size()) {
m_lastFlushedSourceNum++;
if(m_lastFlushedSourceNum % 100000 == 0) {
std::cerr << ".";
}
- if(m_lastFlushedSourceNum % 5000000 == 0)
- {
+ if(m_lastFlushedSourceNum % 5000000 == 0) {
std::cerr << "[" << m_lastFlushedSourceNum << "]" << std::endl;
}
-
+
m_ranks.resize(m_lastFlushedLine + 1);
int r = 0;
while(!m_rankQueue.empty()) {
@@ -907,33 +858,31 @@ void PhraseTableCreator::FlushRankedQueue(bool force)
}
}
}
-
+
m_lastSourceRange.push_back(pi.GetTrg());
-
+
m_rankQueue.push(std::make_pair(pi.GetScore(), pi.GetLine()));
m_lastFlushedSourcePhrase = pi.GetSrc();
}
-
- if(force)
- {
+
+ if(force) {
m_rnkHash.AddRange(m_lastSourceRange);
m_lastSourceRange.clear();
#ifdef WITH_THREADS
m_rnkHash.WaitAll();
#endif
-
+
m_ranks.resize(m_lastFlushedLine + 1);
int r = 0;
- while(!m_rankQueue.empty())
- {
+ while(!m_rankQueue.empty()) {
m_ranks[m_rankQueue.top().second] = r++;
m_rankQueue.pop();
}
m_lastFlushedLine = -1;
m_lastFlushedSourceNum = 0;
-
+
std::cerr << std::endl << std::endl;
}
}
@@ -946,74 +895,65 @@ void PhraseTableCreator::AddEncodedLine(PackedItem& pi)
void PhraseTableCreator::FlushEncodedQueue(bool force)
{
- while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine())
- {
+ while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) {
PackedItem pi = m_queue.top();
m_queue.pop();
m_lastFlushedLine++;
-
- if(m_lastFlushedSourcePhrase != pi.GetSrc())
- {
- if(m_lastCollection.size())
- {
+
+ if(m_lastFlushedSourcePhrase != pi.GetSrc()) {
+ if(m_lastCollection.size()) {
std::stringstream targetPhraseCollection;
for(std::vector<std::string>::iterator it =
- m_lastCollection.begin(); it != m_lastCollection.end(); it++)
+ m_lastCollection.begin(); it != m_lastCollection.end(); it++)
targetPhraseCollection << *it;
-
- m_lastSourceRange.push_back(MakeSourceKey(m_lastFlushedSourcePhrase));
+
+ m_lastSourceRange.push_back(MakeSourceKey(m_lastFlushedSourcePhrase));
m_encodedTargetPhrases->push_back(targetPhraseCollection.str());
-
+
m_lastFlushedSourceNum++;
if(m_lastFlushedSourceNum % 100000 == 0)
std::cerr << ".";
if(m_lastFlushedSourceNum % 5000000 == 0)
std::cerr << "[" << m_lastFlushedSourceNum << "]" << std::endl;
-
+
m_lastCollection.clear();
}
}
-
- if(m_lastSourceRange.size() == (1ul << m_orderBits))
- {
+
+ if(m_lastSourceRange.size() == (1ul << m_orderBits)) {
m_srcHash.AddRange(m_lastSourceRange);
m_srcHash.SaveLastRange();
m_srcHash.DropLastRange();
m_lastSourceRange.clear();
}
-
+
m_lastFlushedSourcePhrase = pi.GetSrc();
- if(m_coding == PREnc)
- {
+ if(m_coding == PREnc) {
if(m_lastCollection.size() <= pi.GetRank())
m_lastCollection.resize(pi.GetRank() + 1);
m_lastCollection[pi.GetRank()] = pi.GetTrg();
- }
- else
- {
+ } else {
m_lastCollection.push_back(pi.GetTrg());
}
}
-
- if(force)
- {
+
+ if(force) {
if(!m_lastSourceRange.size() || m_lastSourceRange.back() != m_lastFlushedSourcePhrase)
m_lastSourceRange.push_back(MakeSourceKey(m_lastFlushedSourcePhrase));
-
- if(m_lastCollection.size())
- {
+
+ if(m_lastCollection.size()) {
std::stringstream targetPhraseCollection;
for(std::vector<std::string>::iterator it =
- m_lastCollection.begin(); it != m_lastCollection.end(); it++)
+ m_lastCollection.begin(); it != m_lastCollection.end(); it++)
targetPhraseCollection << *it;
-
+
m_encodedTargetPhrases->push_back(targetPhraseCollection.str());
m_lastCollection.clear();
}
-
+
m_srcHash.AddRange(m_lastSourceRange);
m_lastSourceRange.clear();
-
+
#ifdef WITH_THREADS
m_srcHash.WaitAll();
#endif
@@ -1021,7 +961,7 @@ void PhraseTableCreator::FlushEncodedQueue(bool force)
m_srcHash.SaveLastRange();
m_srcHash.DropLastRange();
m_srcHash.FinalizeSave();
-
+
m_lastFlushedLine = -1;
m_lastFlushedSourceNum = 0;
@@ -1031,30 +971,27 @@ void PhraseTableCreator::FlushEncodedQueue(bool force)
void PhraseTableCreator::AddCompressedCollection(PackedItem& pi)
{
- m_queue.push(pi);
+ m_queue.push(pi);
}
void PhraseTableCreator::FlushCompressedQueue(bool force)
{
- if(force || m_queue.size() > 10000)
- {
- while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine())
- {
+ if(force || m_queue.size() > 10000) {
+ while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) {
PackedItem pi = m_queue.top();
m_queue.pop();
m_lastFlushedLine++;
-
+
m_compressedTargetPhrases->push_back(pi.GetTrg());
-
+
if((pi.GetLine()+1) % 100000 == 0)
std::cerr << ".";
if((pi.GetLine()+1) % 5000000 == 0)
std::cerr << "[" << (pi.GetLine()+1) << "]" << std::endl;
}
}
-
- if(force)
- {
+
+ if(force) {
m_lastFlushedLine = -1;
std::cerr << std::endl << std::endl;
}
@@ -1070,38 +1007,35 @@ boost::mutex RankingTask::m_fileMutex;
RankingTask::RankingTask(InputFileStream& inFile, PhraseTableCreator& creator)
: m_inFile(inFile), m_creator(creator) {}
-
+
void RankingTask::operator()()
{
size_t lineNum = 0;
-
+
std::vector<std::string> lines;
size_t max_lines = 1000;
lines.reserve(max_lines);
-
+
{
#ifdef WITH_THREADS
- boost::mutex::scoped_lock lock(m_fileMutex);
+ boost::mutex::scoped_lock lock(m_fileMutex);
#endif
- std::string line;
- while(lines.size() < max_lines && std::getline(m_inFile, line))
- lines.push_back(line);
- lineNum = m_lineNum;
- m_lineNum += lines.size();
+ std::string line;
+ while(lines.size() < max_lines && std::getline(m_inFile, line))
+ lines.push_back(line);
+ lineNum = m_lineNum;
+ m_lineNum += lines.size();
}
-
+
std::vector<PackedItem> result;
result.reserve(max_lines);
-
- while(lines.size())
- {
- for(size_t i = 0; i < lines.size(); i++)
- {
+
+ while(lines.size()) {
+ for(size_t i = 0; i < lines.size(); i++) {
std::vector<std::string> tokens;
Moses::TokenizeMultiCharSeparator(tokens, lines[i], m_creator.m_separator);
-
- if(tokens.size() < 3)
- {
+
+ if(tokens.size() < 3) {
std::cerr << "Error: It seems the following line has a wrong format:" << std::endl;
std::cerr << "Line " << i << ": " << lines[i] << std::endl;
abort();
@@ -1112,38 +1046,38 @@ void RankingTask::operator()()
std::cerr << "Better use -encoding None or disable this warning with -no-warnings ." << std::endl;
std::cerr << "Line " << i << ": " << lines[i] << std::endl;
}
-
+
std::vector<float> scores = Tokenize<float>(tokens[2]);
if(scores.size() != m_creator.m_numScoreComponent) {
std::cerr << "Error: It seems the following line has a wrong number of scores ("
- << scores.size() << " != " << m_creator.m_numScoreComponent << ") :" << std::endl;
+ << scores.size() << " != " << m_creator.m_numScoreComponent << ") :" << std::endl;
std::cerr << "Line " << i << ": " << lines[i] << std::endl;
- abort();
+ abort();
}
-
+
float sortScore = scores[m_creator.m_sortScoreIndex];
-
+
std::string key1 = m_creator.MakeSourceKey(tokens[0]);
std::string key2 = m_creator.MakeSourceTargetKey(tokens[0], tokens[1]);
-
+
PackedItem packedItem(lineNum + i, key1, key2, 0, sortScore);
result.push_back(packedItem);
}
lines.clear();
-
+
{
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_mutex);
#endif
- for(size_t i = 0; i < result.size(); i++)
+ for(size_t i = 0; i < result.size(); i++)
m_creator.AddRankedLine(result[i]);
- m_creator.FlushRankedQueue();
+ m_creator.FlushRankedQueue();
}
-
+
result.clear();
lines.reserve(max_lines);
result.reserve(max_lines);
-
+
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_fileMutex);
#endif
@@ -1163,15 +1097,15 @@ boost::mutex EncodingTask::m_fileMutex;
EncodingTask::EncodingTask(InputFileStream& inFile, PhraseTableCreator& creator)
: m_inFile(inFile), m_creator(creator) {}
-
+
void EncodingTask::operator()()
{
size_t lineNum = 0;
-
+
std::vector<std::string> lines;
size_t max_lines = 1000;
lines.reserve(max_lines);
-
+
{
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_fileMutex);
@@ -1182,19 +1116,16 @@ void EncodingTask::operator()()
lineNum = m_lineNum;
m_lineNum += lines.size();
}
-
+
std::vector<PackedItem> result;
result.reserve(max_lines);
-
- while(lines.size())
- {
- for(size_t i = 0; i < lines.size(); i++)
- {
+
+ while(lines.size()) {
+ for(size_t i = 0; i < lines.size(); i++) {
std::vector<std::string> tokens;
Moses::TokenizeMultiCharSeparator(tokens, lines[i], m_creator.m_separator);
-
- if(tokens.size() < 3)
- {
+
+ if(tokens.size() < 3) {
std::cerr << "Error: It seems the following line has a wrong format:" << std::endl;
std::cerr << "Line " << i << ": " << lines[i] << std::endl;
abort();
@@ -1207,31 +1138,31 @@ void EncodingTask::operator()()
std::cerr << "Better use -encoding None or disable this warning with -no-warnings." << std::endl;
std::cerr << "Line " << i << ": " << lines[i] << std::endl;
}
-
+
size_t ownRank = 0;
if(m_creator.m_coding == PhraseTableCreator::PREnc)
ownRank = m_creator.m_ranks[lineNum + i];
-
+
std::string encodedLine = m_creator.EncodeLine(tokens, ownRank);
-
+
PackedItem packedItem(lineNum + i, tokens[0], encodedLine, ownRank);
result.push_back(packedItem);
}
lines.clear();
-
+
{
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_mutex);
#endif
- for(size_t i = 0; i < result.size(); i++)
+ for(size_t i = 0; i < result.size(); i++)
m_creator.AddEncodedLine(result[i]);
- m_creator.FlushEncodedQueue();
+ m_creator.FlushEncodedQueue();
}
-
+
result.clear();
lines.reserve(max_lines);
result.reserve(max_lines);
-
+
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_fileMutex);
#endif
@@ -1251,10 +1182,10 @@ boost::mutex CompressionTask::m_mutex;
#endif
CompressionTask::CompressionTask(StringVector<unsigned char, unsigned long,
- MmapAllocator>& encodedCollections,
- PhraseTableCreator& creator)
+ MmapAllocator>& encodedCollections,
+ PhraseTableCreator& creator)
: m_encodedCollections(encodedCollections), m_creator(creator) {}
-
+
void CompressionTask::operator()()
{
size_t collectionNum;
@@ -1265,12 +1196,11 @@ void CompressionTask::operator()()
collectionNum = m_collectionNum;
m_collectionNum++;
}
-
- while(collectionNum < m_encodedCollections.size())
- {
+
+ while(collectionNum < m_encodedCollections.size()) {
std::string collection = m_encodedCollections[collectionNum];
std::string compressedCollection
- = m_creator.CompressEncodedCollection(collection);
+ = m_creator.CompressEncodedCollection(collection);
std::string dummy;
PackedItem packedItem(collectionNum, dummy, compressedCollection, 0);
@@ -1280,29 +1210,44 @@ void CompressionTask::operator()()
#endif
m_creator.AddCompressedCollection(packedItem);
m_creator.FlushCompressedQueue();
-
- collectionNum = m_collectionNum;
- m_collectionNum++;
+
+ collectionNum = m_collectionNum;
+ m_collectionNum++;
}
}
//****************************************************************************//
PackedItem::PackedItem(long line, std::string sourcePhrase,
- std::string packedTargetPhrase, size_t rank,
- float score)
+ std::string packedTargetPhrase, size_t rank,
+ float score)
: m_line(line), m_sourcePhrase(sourcePhrase),
m_packedTargetPhrase(packedTargetPhrase), m_rank(rank),
m_score(score) {}
-long PackedItem::GetLine() const { return m_line; }
+long PackedItem::GetLine() const
+{
+ return m_line;
+}
-const std::string& PackedItem::GetSrc() const { return m_sourcePhrase; }
+const std::string& PackedItem::GetSrc() const
+{
+ return m_sourcePhrase;
+}
-const std::string& PackedItem::GetTrg() const { return m_packedTargetPhrase; }
+const std::string& PackedItem::GetTrg() const
+{
+ return m_packedTargetPhrase;
+}
-size_t PackedItem::GetRank() const { return m_rank; }
+size_t PackedItem::GetRank() const
+{
+ return m_rank;
+}
-float PackedItem::GetScore() const { return m_score; }
+float PackedItem::GetScore() const
+{
+ return m_score;
+}
}
diff --git a/moses/TranslationModel/CompactPT/PhraseTableCreator.h b/moses/TranslationModel/CompactPT/PhraseTableCreator.h
index ded3a84eb..fd5fc1581 100644
--- a/moses/TranslationModel/CompactPT/PhraseTableCreator.h
+++ b/moses/TranslationModel/CompactPT/PhraseTableCreator.h
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#ifndef moses_PhraseTableCreator_h
#define moses_PhraseTableCreator_h
@@ -40,386 +40,371 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
namespace Moses
{
-
+
typedef std::pair<unsigned char, unsigned char> AlignPoint;
-template <typename DataType>
+template <typename DataType>
class Counter
{
- public:
- typedef boost::unordered_map<DataType, size_t> FreqMap;
- typedef typename FreqMap::iterator iterator;
- typedef typename FreqMap::mapped_type mapped_type;
- typedef typename FreqMap::value_type value_type;
-
- private:
-#ifdef WITH_THREADS
- boost::mutex m_mutex;
+public:
+ typedef boost::unordered_map<DataType, size_t> FreqMap;
+ typedef typename FreqMap::iterator iterator;
+ typedef typename FreqMap::mapped_type mapped_type;
+ typedef typename FreqMap::value_type value_type;
+
+private:
+#ifdef WITH_THREADS
+ boost::mutex m_mutex;
#endif
- FreqMap m_freqMap;
- size_t m_maxSize;
- std::vector<DataType> m_bestVec;
-
- struct FreqSorter
- {
- bool operator()(const value_type& a, const value_type& b) const
- {
- if(a.second > b.second)
- return true;
- // Check impact on translation quality!
- if(a.second == b.second && a.first > b.first)
- return true;
- return false;
- }
- };
-
- public:
- Counter() : m_maxSize(0) {}
-
- iterator Begin()
- {
- return m_freqMap.begin();
- }
-
- iterator End()
- {
- return m_freqMap.end();
+ FreqMap m_freqMap;
+ size_t m_maxSize;
+ std::vector<DataType> m_bestVec;
+
+ struct FreqSorter {
+ bool operator()(const value_type& a, const value_type& b) const {
+ if(a.second > b.second)
+ return true;
+ // Check impact on translation quality!
+ if(a.second == b.second && a.first > b.first)
+ return true;
+ return false;
}
-
- void Increase(DataType data)
- {
+ };
+
+public:
+ Counter() : m_maxSize(0) {}
+
+ iterator Begin() {
+ return m_freqMap.begin();
+ }
+
+ iterator End() {
+ return m_freqMap.end();
+ }
+
+ void Increase(DataType data) {
#ifdef WITH_THREADS
- boost::mutex::scoped_lock lock(m_mutex);
+ boost::mutex::scoped_lock lock(m_mutex);
#endif
- m_freqMap[data]++;
- }
-
- void IncreaseBy(DataType data, size_t num)
- {
+ m_freqMap[data]++;
+ }
+
+ void IncreaseBy(DataType data, size_t num) {
#ifdef WITH_THREADS
- boost::mutex::scoped_lock lock(m_mutex);
+ boost::mutex::scoped_lock lock(m_mutex);
#endif
- m_freqMap[data] += num;
- }
-
- mapped_type& operator[](DataType data)
- {
- return m_freqMap[data];
- }
-
- size_t Size()
- {
+ m_freqMap[data] += num;
+ }
+
+ mapped_type& operator[](DataType data) {
+ return m_freqMap[data];
+ }
+
+ size_t Size() {
#ifdef WITH_THREADS
- boost::mutex::scoped_lock lock(m_mutex);
+ boost::mutex::scoped_lock lock(m_mutex);
#endif
- return m_freqMap.size();
- }
-
- void Quantize(size_t maxSize)
- {
+ return m_freqMap.size();
+ }
+
+ void Quantize(size_t maxSize) {
#ifdef WITH_THREADS
- boost::mutex::scoped_lock lock(m_mutex);
+ boost::mutex::scoped_lock lock(m_mutex);
#endif
- m_maxSize = maxSize;
- std::vector<std::pair<DataType, mapped_type> > freqVec;
- freqVec.insert(freqVec.begin(), m_freqMap.begin(), m_freqMap.end());
- std::sort(freqVec.begin(), freqVec.end(), FreqSorter());
-
- for(size_t i = 0; i < freqVec.size() && i < m_maxSize; i++)
- m_bestVec.push_back(freqVec[i].first);
-
- std::sort(m_bestVec.begin(), m_bestVec.end());
-
- FreqMap t_freqMap;
- for(typename std::vector<std::pair<DataType, mapped_type> >::iterator it
- = freqVec.begin(); it != freqVec.end(); it++)
- {
- DataType closest = LowerBound(it->first);
- t_freqMap[closest] += it->second;
- }
-
- m_freqMap.swap(t_freqMap);
+ m_maxSize = maxSize;
+ std::vector<std::pair<DataType, mapped_type> > freqVec;
+ freqVec.insert(freqVec.begin(), m_freqMap.begin(), m_freqMap.end());
+ std::sort(freqVec.begin(), freqVec.end(), FreqSorter());
+
+ for(size_t i = 0; i < freqVec.size() && i < m_maxSize; i++)
+ m_bestVec.push_back(freqVec[i].first);
+
+ std::sort(m_bestVec.begin(), m_bestVec.end());
+
+ FreqMap t_freqMap;
+ for(typename std::vector<std::pair<DataType, mapped_type> >::iterator it
+ = freqVec.begin(); it != freqVec.end(); it++) {
+ DataType closest = LowerBound(it->first);
+ t_freqMap[closest] += it->second;
}
-
- void Clear()
- {
+
+ m_freqMap.swap(t_freqMap);
+ }
+
+ void Clear() {
#ifdef WITH_THREADS
- boost::mutex::scoped_lock lock(m_mutex);
+ boost::mutex::scoped_lock lock(m_mutex);
#endif
- m_freqMap.clear();
- }
-
- DataType LowerBound(DataType data)
- {
- if(m_maxSize == 0 || m_bestVec.size() == 0)
- return data;
+ m_freqMap.clear();
+ }
+
+ DataType LowerBound(DataType data) {
+ if(m_maxSize == 0 || m_bestVec.size() == 0)
+ return data;
+ else {
+ typename std::vector<DataType>::iterator it
+ = std::lower_bound(m_bestVec.begin(), m_bestVec.end(), data);
+ if(it != m_bestVec.end())
+ return *it;
else
- {
- typename std::vector<DataType>::iterator it
- = std::lower_bound(m_bestVec.begin(), m_bestVec.end(), data);
- if(it != m_bestVec.end())
- return *it;
- else
- return m_bestVec.back();
- }
+ return m_bestVec.back();
}
+ }
};
-
+
class PackedItem
{
- private:
- long m_line;
- std::string m_sourcePhrase;
- std::string m_packedTargetPhrase;
- size_t m_rank;
- float m_score;
-
- public:
- PackedItem(long line, std::string sourcePhrase,
- std::string packedTargetPhrase, size_t rank,
- float m_score = 0);
-
- long GetLine() const;
- const std::string& GetSrc() const;
- const std::string& GetTrg() const;
- size_t GetRank() const;
- float GetScore() const;
+private:
+ long m_line;
+ std::string m_sourcePhrase;
+ std::string m_packedTargetPhrase;
+ size_t m_rank;
+ float m_score;
+
+public:
+ PackedItem(long line, std::string sourcePhrase,
+ std::string packedTargetPhrase, size_t rank,
+ float m_score = 0);
+
+ long GetLine() const;
+ const std::string& GetSrc() const;
+ const std::string& GetTrg() const;
+ size_t GetRank() const;
+ float GetScore() const;
};
bool operator<(const PackedItem &pi1, const PackedItem &pi2);
class PhraseTableCreator
{
- public:
- enum Coding { None, REnc, PREnc };
-
- private:
- std::string m_inPath;
- std::string m_outPath;
- std::string m_tempfilePath;
-
- std::FILE* m_outFile;
-
- size_t m_numScoreComponent;
- size_t m_sortScoreIndex;
- size_t m_warnMe;
-
- Coding m_coding;
- size_t m_orderBits;
- size_t m_fingerPrintBits;
- bool m_useAlignmentInfo;
- bool m_multipleScoreTrees;
- size_t m_quantize;
- size_t m_maxRank;
-
- static std::string m_phraseStopSymbol;
- static std::string m_separator;
-
+public:
+ enum Coding { None, REnc, PREnc };
+
+private:
+ std::string m_inPath;
+ std::string m_outPath;
+ std::string m_tempfilePath;
+
+ std::FILE* m_outFile;
+
+ size_t m_numScoreComponent;
+ size_t m_sortScoreIndex;
+ size_t m_warnMe;
+
+ Coding m_coding;
+ size_t m_orderBits;
+ size_t m_fingerPrintBits;
+ bool m_useAlignmentInfo;
+ bool m_multipleScoreTrees;
+ size_t m_quantize;
+ size_t m_maxRank;
+
+ static std::string m_phraseStopSymbol;
+ static std::string m_separator;
+
#ifdef WITH_THREADS
- size_t m_threads;
- boost::mutex m_mutex;
+ size_t m_threads;
+ boost::mutex m_mutex;
#endif
-
- BlockHashIndex m_srcHash;
- BlockHashIndex m_rnkHash;
-
- size_t m_maxPhraseLength;
-
- std::vector<unsigned> m_ranks;
-
- typedef std::pair<unsigned, unsigned> SrcTrg;
- typedef std::pair<std::string, std::string> SrcTrgString;
- typedef std::pair<SrcTrgString, float> SrcTrgProb;
-
- struct SrcTrgProbSorter
- {
- bool operator()(const SrcTrgProb& a, const SrcTrgProb& b) const
- {
- if(a.first.first < b.first.first)
- return true;
-
- if(a.first.first == b.first.first && a.second > b.second)
- return true;
-
- if(a.first.first == b.first.first
- && a.second == b.second
- && a.first.second < b.first.second)
- return true;
-
- return false;
- }
- };
-
- std::vector<size_t> m_lexicalTableIndex;
- std::vector<SrcTrg> m_lexicalTable;
-
- StringVector<unsigned char, unsigned long, MmapAllocator>*
- m_encodedTargetPhrases;
-
- StringVector<unsigned char, unsigned long, MmapAllocator>*
- m_compressedTargetPhrases;
-
- boost::unordered_map<std::string, unsigned> m_targetSymbolsMap;
- boost::unordered_map<std::string, unsigned> m_sourceSymbolsMap;
-
- typedef Counter<unsigned> SymbolCounter;
- typedef Counter<float> ScoreCounter;
- typedef Counter<AlignPoint> AlignCounter;
-
- typedef CanonicalHuffman<unsigned> SymbolTree;
- typedef CanonicalHuffman<float> ScoreTree;
- typedef CanonicalHuffman<AlignPoint> AlignTree;
-
- SymbolCounter m_symbolCounter;
- SymbolTree* m_symbolTree;
-
- AlignCounter m_alignCounter;
- AlignTree* m_alignTree;
-
- std::vector<ScoreCounter*> m_scoreCounters;
- std::vector<ScoreTree*> m_scoreTrees;
-
- std::priority_queue<PackedItem> m_queue;
- long m_lastFlushedLine;
- long m_lastFlushedSourceNum;
- std::string m_lastFlushedSourcePhrase;
- std::vector<std::string> m_lastSourceRange;
- std::priority_queue<std::pair<float, size_t> > m_rankQueue;
- std::vector<std::string> m_lastCollection;
-
- void Save();
- void PrintInfo();
-
- void AddSourceSymbolId(std::string& symbol);
- unsigned GetSourceSymbolId(std::string& symbol);
-
- void AddTargetSymbolId(std::string& symbol);
- unsigned GetTargetSymbolId(std::string& symbol);
- unsigned GetOrAddTargetSymbolId(std::string& symbol);
-
- unsigned GetRank(unsigned srcIdx, unsigned trgIdx);
-
- unsigned EncodeREncSymbol1(unsigned symbol);
- unsigned EncodeREncSymbol2(unsigned position, unsigned rank);
- unsigned EncodeREncSymbol3(unsigned rank);
-
- unsigned EncodePREncSymbol1(unsigned symbol);
- unsigned EncodePREncSymbol2(int lOff, int rOff, unsigned rank);
-
- void EncodeTargetPhraseNone(std::vector<std::string>& t,
- std::ostream& os);
-
- void EncodeTargetPhraseREnc(std::vector<std::string>& s,
- std::vector<std::string>& t,
- std::set<AlignPoint>& a,
- std::ostream& os);
-
- void EncodeTargetPhrasePREnc(std::vector<std::string>& s,
- std::vector<std::string>& t,
- std::set<AlignPoint>& a, size_t ownRank,
- std::ostream& os);
-
- void EncodeScores(std::vector<float>& scores, std::ostream& os);
- void EncodeAlignment(std::set<AlignPoint>& alignment, std::ostream& os);
-
- std::string MakeSourceKey(std::string&);
- std::string MakeSourceTargetKey(std::string&, std::string&);
-
- void LoadLexicalTable(std::string filePath);
-
- void CreateRankHash();
- void EncodeTargetPhrases();
- void CalcHuffmanCodes();
- void CompressTargetPhrases();
-
- void AddRankedLine(PackedItem& pi);
- void FlushRankedQueue(bool force = false);
-
- std::string EncodeLine(std::vector<std::string>& tokens, size_t ownRank);
- void AddEncodedLine(PackedItem& pi);
- void FlushEncodedQueue(bool force = false);
-
- std::string CompressEncodedCollection(std::string encodedCollection);
- void AddCompressedCollection(PackedItem& pi);
- void FlushCompressedQueue(bool force = false);
-
- public:
-
- PhraseTableCreator(std::string inPath,
- std::string outPath,
- std::string tempfilePath,
- size_t numScoreComponent = 5,
- size_t sortScoreIndex = 2,
- Coding coding = PREnc,
- size_t orderBits = 10,
- size_t fingerPrintBits = 16,
- bool useAlignmentInfo = false,
- bool multipleScoreTrees = true,
- size_t quantize = 0,
- size_t maxRank = 100,
- bool warnMe = true
+
+ BlockHashIndex m_srcHash;
+ BlockHashIndex m_rnkHash;
+
+ size_t m_maxPhraseLength;
+
+ std::vector<unsigned> m_ranks;
+
+ typedef std::pair<unsigned, unsigned> SrcTrg;
+ typedef std::pair<std::string, std::string> SrcTrgString;
+ typedef std::pair<SrcTrgString, float> SrcTrgProb;
+
+ struct SrcTrgProbSorter {
+ bool operator()(const SrcTrgProb& a, const SrcTrgProb& b) const {
+ if(a.first.first < b.first.first)
+ return true;
+
+ if(a.first.first == b.first.first && a.second > b.second)
+ return true;
+
+ if(a.first.first == b.first.first
+ && a.second == b.second
+ && a.first.second < b.first.second)
+ return true;
+
+ return false;
+ }
+ };
+
+ std::vector<size_t> m_lexicalTableIndex;
+ std::vector<SrcTrg> m_lexicalTable;
+
+ StringVector<unsigned char, unsigned long, MmapAllocator>*
+ m_encodedTargetPhrases;
+
+ StringVector<unsigned char, unsigned long, MmapAllocator>*
+ m_compressedTargetPhrases;
+
+ boost::unordered_map<std::string, unsigned> m_targetSymbolsMap;
+ boost::unordered_map<std::string, unsigned> m_sourceSymbolsMap;
+
+ typedef Counter<unsigned> SymbolCounter;
+ typedef Counter<float> ScoreCounter;
+ typedef Counter<AlignPoint> AlignCounter;
+
+ typedef CanonicalHuffman<unsigned> SymbolTree;
+ typedef CanonicalHuffman<float> ScoreTree;
+ typedef CanonicalHuffman<AlignPoint> AlignTree;
+
+ SymbolCounter m_symbolCounter;
+ SymbolTree* m_symbolTree;
+
+ AlignCounter m_alignCounter;
+ AlignTree* m_alignTree;
+
+ std::vector<ScoreCounter*> m_scoreCounters;
+ std::vector<ScoreTree*> m_scoreTrees;
+
+ std::priority_queue<PackedItem> m_queue;
+ long m_lastFlushedLine;
+ long m_lastFlushedSourceNum;
+ std::string m_lastFlushedSourcePhrase;
+ std::vector<std::string> m_lastSourceRange;
+ std::priority_queue<std::pair<float, size_t> > m_rankQueue;
+ std::vector<std::string> m_lastCollection;
+
+ void Save();
+ void PrintInfo();
+
+ void AddSourceSymbolId(std::string& symbol);
+ unsigned GetSourceSymbolId(std::string& symbol);
+
+ void AddTargetSymbolId(std::string& symbol);
+ unsigned GetTargetSymbolId(std::string& symbol);
+ unsigned GetOrAddTargetSymbolId(std::string& symbol);
+
+ unsigned GetRank(unsigned srcIdx, unsigned trgIdx);
+
+ unsigned EncodeREncSymbol1(unsigned symbol);
+ unsigned EncodeREncSymbol2(unsigned position, unsigned rank);
+ unsigned EncodeREncSymbol3(unsigned rank);
+
+ unsigned EncodePREncSymbol1(unsigned symbol);
+ unsigned EncodePREncSymbol2(int lOff, int rOff, unsigned rank);
+
+ void EncodeTargetPhraseNone(std::vector<std::string>& t,
+ std::ostream& os);
+
+ void EncodeTargetPhraseREnc(std::vector<std::string>& s,
+ std::vector<std::string>& t,
+ std::set<AlignPoint>& a,
+ std::ostream& os);
+
+ void EncodeTargetPhrasePREnc(std::vector<std::string>& s,
+ std::vector<std::string>& t,
+ std::set<AlignPoint>& a, size_t ownRank,
+ std::ostream& os);
+
+ void EncodeScores(std::vector<float>& scores, std::ostream& os);
+ void EncodeAlignment(std::set<AlignPoint>& alignment, std::ostream& os);
+
+ std::string MakeSourceKey(std::string&);
+ std::string MakeSourceTargetKey(std::string&, std::string&);
+
+ void LoadLexicalTable(std::string filePath);
+
+ void CreateRankHash();
+ void EncodeTargetPhrases();
+ void CalcHuffmanCodes();
+ void CompressTargetPhrases();
+
+ void AddRankedLine(PackedItem& pi);
+ void FlushRankedQueue(bool force = false);
+
+ std::string EncodeLine(std::vector<std::string>& tokens, size_t ownRank);
+ void AddEncodedLine(PackedItem& pi);
+ void FlushEncodedQueue(bool force = false);
+
+ std::string CompressEncodedCollection(std::string encodedCollection);
+ void AddCompressedCollection(PackedItem& pi);
+ void FlushCompressedQueue(bool force = false);
+
+public:
+
+ PhraseTableCreator(std::string inPath,
+ std::string outPath,
+ std::string tempfilePath,
+ size_t numScoreComponent = 5,
+ size_t sortScoreIndex = 2,
+ Coding coding = PREnc,
+ size_t orderBits = 10,
+ size_t fingerPrintBits = 16,
+ bool useAlignmentInfo = false,
+ bool multipleScoreTrees = true,
+ size_t quantize = 0,
+ size_t maxRank = 100,
+ bool warnMe = true
#ifdef WITH_THREADS
- , size_t threads = 2
+ , size_t threads = 2
#endif
- );
-
- ~PhraseTableCreator();
-
- friend class RankingTask;
- friend class EncodingTask;
- friend class CompressionTask;
+ );
+
+ ~PhraseTableCreator();
+
+ friend class RankingTask;
+ friend class EncodingTask;
+ friend class CompressionTask;
};
class RankingTask
{
- private:
+private:
#ifdef WITH_THREADS
- static boost::mutex m_mutex;
- static boost::mutex m_fileMutex;
+ static boost::mutex m_mutex;
+ static boost::mutex m_fileMutex;
#endif
- static size_t m_lineNum;
- InputFileStream& m_inFile;
- PhraseTableCreator& m_creator;
-
- public:
- RankingTask(InputFileStream& inFile, PhraseTableCreator& creator);
- void operator()();
+ static size_t m_lineNum;
+ InputFileStream& m_inFile;
+ PhraseTableCreator& m_creator;
+
+public:
+ RankingTask(InputFileStream& inFile, PhraseTableCreator& creator);
+ void operator()();
};
class EncodingTask
{
- private:
+private:
#ifdef WITH_THREADS
- static boost::mutex m_mutex;
- static boost::mutex m_fileMutex;
+ static boost::mutex m_mutex;
+ static boost::mutex m_fileMutex;
#endif
- static size_t m_lineNum;
- static size_t m_sourcePhraseNum;
- static std::string m_lastSourcePhrase;
-
- InputFileStream& m_inFile;
- PhraseTableCreator& m_creator;
-
- public:
- EncodingTask(InputFileStream& inFile, PhraseTableCreator& creator);
- void operator()();
+ static size_t m_lineNum;
+ static size_t m_sourcePhraseNum;
+ static std::string m_lastSourcePhrase;
+
+ InputFileStream& m_inFile;
+ PhraseTableCreator& m_creator;
+
+public:
+ EncodingTask(InputFileStream& inFile, PhraseTableCreator& creator);
+ void operator()();
};
class CompressionTask
{
- private:
+private:
#ifdef WITH_THREADS
- static boost::mutex m_mutex;
+ static boost::mutex m_mutex;
#endif
- static size_t m_collectionNum;
- StringVector<unsigned char, unsigned long, MmapAllocator>&
- m_encodedCollections;
- PhraseTableCreator& m_creator;
-
- public:
- CompressionTask(StringVector<unsigned char, unsigned long, MmapAllocator>&
- encodedCollections, PhraseTableCreator& creator);
- void operator()();
+ static size_t m_collectionNum;
+ StringVector<unsigned char, unsigned long, MmapAllocator>&
+ m_encodedCollections;
+ PhraseTableCreator& m_creator;
+
+public:
+ CompressionTask(StringVector<unsigned char, unsigned long, MmapAllocator>&
+ encodedCollections, PhraseTableCreator& creator);
+ void operator()();
};
}
diff --git a/moses/TranslationModel/CompactPT/StringVector.h b/moses/TranslationModel/CompactPT/StringVector.h
index fcc545a19..4545d61c6 100644
--- a/moses/TranslationModel/CompactPT/StringVector.h
+++ b/moses/TranslationModel/CompactPT/StringVector.h
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#ifndef moses_StringVector_h
#define moses_StringVector_h
@@ -43,255 +43,241 @@ namespace Moses
template <typename ValueIteratorT>
class ValueIteratorRange
{
- private:
- ValueIteratorT m_begin;
- ValueIteratorT m_end;
-
- public:
- ValueIteratorRange(ValueIteratorT begin, ValueIteratorT end);
-
- const ValueIteratorT& begin() const;
- const ValueIteratorT& end() const;
- const std::string str() const;
- operator const std::string()
- {
- return str();
- }
-
- size_t size()
- {
- return std::distance(m_begin, m_end);
- }
-
- template <typename StringT>
- bool operator==(const StringT& o) const;
- bool operator==(const char* c) const;
-
- template <typename StringT>
- bool operator<(const StringT& o) const;
- bool operator<(const char* c) const;
+private:
+ ValueIteratorT m_begin;
+ ValueIteratorT m_end;
+
+public:
+ ValueIteratorRange(ValueIteratorT begin, ValueIteratorT end);
+
+ const ValueIteratorT& begin() const;
+ const ValueIteratorT& end() const;
+ const std::string str() const;
+ operator const std::string() {
+ return str();
+ }
+
+ size_t size() {
+ return std::distance(m_begin, m_end);
+ }
+
+ template <typename StringT>
+ bool operator==(const StringT& o) const;
+ bool operator==(const char* c) const;
+
+ template <typename StringT>
+ bool operator<(const StringT& o) const;
+ bool operator<(const char* c) const;
};
// ********** StringVector **********
template <typename ValueT = unsigned char, typename PosT = unsigned int,
- template <typename> class Allocator = std::allocator>
+ template <typename> class Allocator = std::allocator>
class StringVector
-{
- protected:
- bool m_sorted;
- bool m_memoryMapped;
-
- std::vector<ValueT, Allocator<ValueT> >* m_charArray;
- MonotonicVector<PosT, unsigned int, 32> m_positions;
-
- virtual const ValueT* value_ptr(PosT i) const;
-
+{
+protected:
+ bool m_sorted;
+ bool m_memoryMapped;
+
+ std::vector<ValueT, Allocator<ValueT> >* m_charArray;
+ MonotonicVector<PosT, unsigned int, 32> m_positions;
+
+ virtual const ValueT* value_ptr(PosT i) const;
+
+public:
+ typedef ValueIteratorRange<typename std::vector<ValueT, Allocator<ValueT> >::const_iterator> range;
+
+ // ********** RangeIterator **********
+
+ class RangeIterator : public boost::iterator_facade<RangeIterator,
+ range, std::random_access_iterator_tag, range, PosT>
+ {
+
+ private:
+ PosT m_index;
+ StringVector<ValueT, PosT, Allocator>* m_container;
+
public:
- typedef ValueIteratorRange<typename std::vector<ValueT, Allocator<ValueT> >::const_iterator> range;
-
- // ********** RangeIterator **********
-
- class RangeIterator : public boost::iterator_facade<RangeIterator,
- range, std::random_access_iterator_tag, range, PosT>
- {
-
- private:
- PosT m_index;
- StringVector<ValueT, PosT, Allocator>* m_container;
-
- public:
- RangeIterator();
- RangeIterator(StringVector<ValueT, PosT, Allocator> &sv, PosT index=0);
-
- PosT get_index();
-
- private:
- friend class boost::iterator_core_access;
-
- range dereference() const;
- bool equal(RangeIterator const& other) const;
- void increment();
- void decrement();
- void advance(PosT n);
-
- PosT distance_to(RangeIterator const& other) const;
- };
-
- // ********** StringIterator **********
-
- class StringIterator : public boost::iterator_facade<StringIterator,
- std::string, std::random_access_iterator_tag, const std::string, PosT>
- {
-
- private:
- PosT m_index;
- StringVector<ValueT, PosT, Allocator>* m_container;
-
- public:
- StringIterator();
- StringIterator(StringVector<ValueT, PosT, Allocator> &sv, PosT index=0);
-
- PosT get_index();
-
- private:
- friend class boost::iterator_core_access;
-
- const std::string dereference() const;
- bool equal(StringIterator const& other) const;
- void increment();
- void decrement();
- void advance(PosT n);
- PosT distance_to(StringIterator const& other) const;
- };
-
- typedef RangeIterator iterator;
- typedef StringIterator string_iterator;
-
- StringVector();
- StringVector(Allocator<ValueT> alloc);
-
- virtual ~StringVector()
- {
- delete m_charArray;
- }
-
- void swap(StringVector<ValueT, PosT, Allocator> &c)
- {
- m_positions.commit();
- m_positions.swap(c.m_positions);
- m_charArray->swap(*c.m_charArray);
-
- bool temp = m_sorted;
- m_sorted = c.m_sorted;
- c.m_sorted = temp;
- }
-
- bool is_sorted() const;
- PosT size() const;
- virtual PosT size2() const;
-
- template<class Iterator> Iterator begin() const;
- template<class Iterator> Iterator end() const;
-
- iterator begin() const;
- iterator end() const;
-
- PosT length(PosT i) const;
- typename std::vector<ValueT, Allocator<ValueT> >::const_iterator begin(PosT i) const;
- typename std::vector<ValueT, Allocator<ValueT> >::const_iterator end(PosT i) const;
-
- void clear()
- {
- m_charArray->clear();
- m_sorted = true;
- m_positions = MonotonicVector<PosT, unsigned int, 32>();
- }
-
- range at(PosT i) const;
- range operator[](PosT i) const;
- range back() const;
-
- template <typename StringT>
- void push_back(StringT s);
- void push_back(const char* c);
-
- template <typename StringT>
- PosT find(StringT &s) const;
- PosT find(const char* c) const;
-
- virtual size_t load(std::FILE* in, bool memoryMapped = false)
- {
- size_t size = 0;
- m_memoryMapped = memoryMapped;
-
- size += std::fread(&m_sorted, sizeof(bool), 1, in) * sizeof(bool);
- size += m_positions.load(in, m_memoryMapped);
-
- size += loadCharArray(*m_charArray, in, m_memoryMapped);
- return size;
- }
-
- size_t loadCharArray(std::vector<ValueT, std::allocator<ValueT> >& c,
- std::FILE* in, bool map = false)
- {
- // Can only be read into memory. Mapping not possible with std:allocator.
- assert(map == false);
-
- size_t byteSize = 0;
-
- size_t valSize;
- byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t);
-
+ RangeIterator();
+ RangeIterator(StringVector<ValueT, PosT, Allocator> &sv, PosT index=0);
+
+ PosT get_index();
+
+ private:
+ friend class boost::iterator_core_access;
+
+ range dereference() const;
+ bool equal(RangeIterator const& other) const;
+ void increment();
+ void decrement();
+ void advance(PosT n);
+
+ PosT distance_to(RangeIterator const& other) const;
+ };
+
+ // ********** StringIterator **********
+
+ class StringIterator : public boost::iterator_facade<StringIterator,
+ std::string, std::random_access_iterator_tag, const std::string, PosT>
+ {
+
+ private:
+ PosT m_index;
+ StringVector<ValueT, PosT, Allocator>* m_container;
+
+ public:
+ StringIterator();
+ StringIterator(StringVector<ValueT, PosT, Allocator> &sv, PosT index=0);
+
+ PosT get_index();
+
+ private:
+ friend class boost::iterator_core_access;
+
+ const std::string dereference() const;
+ bool equal(StringIterator const& other) const;
+ void increment();
+ void decrement();
+ void advance(PosT n);
+ PosT distance_to(StringIterator const& other) const;
+ };
+
+ typedef RangeIterator iterator;
+ typedef StringIterator string_iterator;
+
+ StringVector();
+ StringVector(Allocator<ValueT> alloc);
+
+ virtual ~StringVector() {
+ delete m_charArray;
+ }
+
+ void swap(StringVector<ValueT, PosT, Allocator> &c) {
+ m_positions.commit();
+ m_positions.swap(c.m_positions);
+ m_charArray->swap(*c.m_charArray);
+
+ bool temp = m_sorted;
+ m_sorted = c.m_sorted;
+ c.m_sorted = temp;
+ }
+
+ bool is_sorted() const;
+ PosT size() const;
+ virtual PosT size2() const;
+
+ template<class Iterator> Iterator begin() const;
+ template<class Iterator> Iterator end() const;
+
+ iterator begin() const;
+ iterator end() const;
+
+ PosT length(PosT i) const;
+ typename std::vector<ValueT, Allocator<ValueT> >::const_iterator begin(PosT i) const;
+ typename std::vector<ValueT, Allocator<ValueT> >::const_iterator end(PosT i) const;
+
+ void clear() {
+ m_charArray->clear();
+ m_sorted = true;
+ m_positions = MonotonicVector<PosT, unsigned int, 32>();
+ }
+
+ range at(PosT i) const;
+ range operator[](PosT i) const;
+ range back() const;
+
+ template <typename StringT>
+ void push_back(StringT s);
+ void push_back(const char* c);
+
+ template <typename StringT>
+ PosT find(StringT &s) const;
+ PosT find(const char* c) const;
+
+ virtual size_t load(std::FILE* in, bool memoryMapped = false) {
+ size_t size = 0;
+ m_memoryMapped = memoryMapped;
+
+ size += std::fread(&m_sorted, sizeof(bool), 1, in) * sizeof(bool);
+ size += m_positions.load(in, m_memoryMapped);
+
+ size += loadCharArray(*m_charArray, in, m_memoryMapped);
+ return size;
+ }
+
+ size_t loadCharArray(std::vector<ValueT, std::allocator<ValueT> >& c,
+ std::FILE* in, bool map = false) {
+ // Can only be read into memory. Mapping not possible with std:allocator.
+ assert(map == false);
+
+ size_t byteSize = 0;
+
+ size_t valSize;
+ byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t);
+
+ c.resize(valSize, 0);
+ byteSize += std::fread(&c[0], sizeof(ValueT), valSize, in) * sizeof(ValueT);
+
+ return byteSize;
+ }
+
+ size_t loadCharArray(std::vector<ValueT, MmapAllocator<ValueT> >& c,
+ std::FILE* in, bool map = false) {
+ size_t byteSize = 0;
+
+ size_t valSize;
+ byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t);
+
+ if(map == false) {
+ // Read data into temporary file (default constructor of MmapAllocator)
+ // and map memory onto temporary file. Can be resized.
+
c.resize(valSize, 0);
byteSize += std::fread(&c[0], sizeof(ValueT), valSize, in) * sizeof(ValueT);
-
- return byteSize;
- }
-
- size_t loadCharArray(std::vector<ValueT, MmapAllocator<ValueT> >& c,
- std::FILE* in, bool map = false)
- {
- size_t byteSize = 0;
-
- size_t valSize;
- byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t);
-
- if(map == false)
- {
- // Read data into temporary file (default constructor of MmapAllocator)
- // and map memory onto temporary file. Can be resized.
-
- c.resize(valSize, 0);
- byteSize += std::fread(&c[0], sizeof(ValueT), valSize, in) * sizeof(ValueT);
- }
- else
- {
- // Map it directly on specified region of file "in" starting at valPos
- // with length valSize * sizeof(ValueT). Mapped region cannot be resized.
-
- size_t valPos = std::ftell(in);
- Allocator<ValueT> alloc(in, valPos);
- std::vector<ValueT, Allocator<ValueT> > charArrayTemp(alloc);
- charArrayTemp.resize(valSize);
- c.swap(charArrayTemp);
-
- byteSize += valSize * sizeof(ValueT);
- }
-
- return byteSize;
- }
-
- size_t load(std::string filename, bool memoryMapped = false)
- {
- std::FILE* pFile = fopen(filename.c_str(), "r");
- size_t byteSize = load(pFile, memoryMapped);
- fclose(pFile);
- return byteSize;
- }
+ } else {
+ // Map it directly on specified region of file "in" starting at valPos
+ // with length valSize * sizeof(ValueT). Mapped region cannot be resized.
- size_t save(std::FILE* out)
- {
- size_t byteSize = 0;
- byteSize += ThrowingFwrite(&m_sorted, sizeof(bool), 1, out) * sizeof(bool);
-
- byteSize += m_positions.save(out);
-
- size_t valSize = size2();
- byteSize += ThrowingFwrite(&valSize, sizeof(size_t), 1, out) * sizeof(size_t);
- byteSize += ThrowingFwrite(&(*m_charArray)[0], sizeof(ValueT), valSize, out) * sizeof(ValueT);
-
- return byteSize;
- }
-
- size_t save(std::string filename)
- {
- std::FILE* pFile = fopen(filename.c_str(), "w");
- size_t byteSize = save(pFile);
- fclose(pFile);
- return byteSize;
+ size_t valPos = std::ftell(in);
+ Allocator<ValueT> alloc(in, valPos);
+ std::vector<ValueT, Allocator<ValueT> > charArrayTemp(alloc);
+ charArrayTemp.resize(valSize);
+ c.swap(charArrayTemp);
+
+ byteSize += valSize * sizeof(ValueT);
}
-
+
+ return byteSize;
+ }
+
+ size_t load(std::string filename, bool memoryMapped = false) {
+ std::FILE* pFile = fopen(filename.c_str(), "r");
+ size_t byteSize = load(pFile, memoryMapped);
+ fclose(pFile);
+ return byteSize;
+ }
+
+ size_t save(std::FILE* out) {
+ size_t byteSize = 0;
+ byteSize += ThrowingFwrite(&m_sorted, sizeof(bool), 1, out) * sizeof(bool);
+
+ byteSize += m_positions.save(out);
+
+ size_t valSize = size2();
+ byteSize += ThrowingFwrite(&valSize, sizeof(size_t), 1, out) * sizeof(size_t);
+ byteSize += ThrowingFwrite(&(*m_charArray)[0], sizeof(ValueT), valSize, out) * sizeof(ValueT);
+
+ return byteSize;
+ }
+
+ size_t save(std::string filename) {
+ std::FILE* pFile = fopen(filename.c_str(), "w");
+ size_t byteSize = save(pFile);
+ fclose(pFile);
+ return byteSize;
+ }
+
};
// ********** Implementation **********
@@ -300,214 +286,214 @@ class StringVector
template <typename ValueIteratorT>
ValueIteratorRange<ValueIteratorT>::ValueIteratorRange(ValueIteratorT begin,
- ValueIteratorT end) : m_begin(begin), m_end(end) { }
-
+ ValueIteratorT end) : m_begin(begin), m_end(end) { }
+
template <typename ValueIteratorT>
const ValueIteratorT& ValueIteratorRange<ValueIteratorT>::begin() const
{
- return m_begin;
+ return m_begin;
}
template <typename ValueIteratorT>
const ValueIteratorT& ValueIteratorRange<ValueIteratorT>::end() const
{
- return m_end;
+ return m_end;
}
template <typename ValueIteratorT>
const std::string ValueIteratorRange<ValueIteratorT>::str() const
{
- std::string dummy;
- for(ValueIteratorT it = m_begin; it != m_end; it++)
- dummy.push_back(*it);
- return dummy;
+ std::string dummy;
+ for(ValueIteratorT it = m_begin; it != m_end; it++)
+ dummy.push_back(*it);
+ return dummy;
}
template <typename ValueIteratorT>
template <typename StringT>
bool ValueIteratorRange<ValueIteratorT>::operator==(const StringT& o) const
{
- if(std::distance(m_begin, m_end) == std::distance(o.begin(), o.end()))
- return std::equal(m_begin, m_end, o.begin());
- else
- return false;
+ if(std::distance(m_begin, m_end) == std::distance(o.begin(), o.end()))
+ return std::equal(m_begin, m_end, o.begin());
+ else
+ return false;
}
-
+
template <typename ValueIteratorT>
bool ValueIteratorRange<ValueIteratorT>::operator==(const char* c) const
{
- return *this == std::string(c);
+ return *this == std::string(c);
}
template <typename ValueIteratorT>
template <typename StringT>
bool ValueIteratorRange<ValueIteratorT>::operator<(const StringT &s2) const
{
- return std::lexicographical_compare(m_begin, m_end, s2.begin(), s2.end(),
- std::less<typename std::iterator_traits<ValueIteratorT>::value_type>());
+ return std::lexicographical_compare(m_begin, m_end, s2.begin(), s2.end(),
+ std::less<typename std::iterator_traits<ValueIteratorT>::value_type>());
}
template <typename ValueIteratorT>
bool ValueIteratorRange<ValueIteratorT>::operator<(const char* c) const
{
- return *this < std::string(c);
+ return *this < std::string(c);
}
template <typename StringT, typename ValueIteratorT>
bool operator<(const StringT &s1, const ValueIteratorRange<ValueIteratorT> &s2)
{
- return std::lexicographical_compare(s1.begin(), s1.end(), s2.begin(), s2.end(),
- std::less<typename std::iterator_traits<ValueIteratorT>::value_type>());
+ return std::lexicographical_compare(s1.begin(), s1.end(), s2.begin(), s2.end(),
+ std::less<typename std::iterator_traits<ValueIteratorT>::value_type>());
}
template <typename ValueIteratorT>
bool operator<(const char* c, const ValueIteratorRange<ValueIteratorT> &s2)
{
- size_t len = std::char_traits<char>::length(c);
- return std::lexicographical_compare(c, c + len, s2.begin(), s2.end(),
- std::less<typename std::iterator_traits<ValueIteratorT>::value_type>());
+ size_t len = std::char_traits<char>::length(c);
+ return std::lexicographical_compare(c, c + len, s2.begin(), s2.end(),
+ std::less<typename std::iterator_traits<ValueIteratorT>::value_type>());
}
template <typename OStream, typename ValueIteratorT>
OStream& operator<<(OStream &os, ValueIteratorRange<ValueIteratorT> cr)
{
ValueIteratorT it = cr.begin();
- while(it != cr.end())
- os << *(it++);
- return os;
+ while(it != cr.end())
+ os << *(it++);
+ return os;
}
// StringVector
template<typename ValueT, typename PosT, template <typename> class Allocator>
StringVector<ValueT, PosT, Allocator>::StringVector()
- : m_sorted(true), m_memoryMapped(false), m_charArray(new std::vector<ValueT, Allocator<ValueT> >()) { }
+ : m_sorted(true), m_memoryMapped(false), m_charArray(new std::vector<ValueT, Allocator<ValueT> >()) { }
template<typename ValueT, typename PosT, template <typename> class Allocator>
StringVector<ValueT, PosT, Allocator>::StringVector(Allocator<ValueT> alloc)
- : m_sorted(true), m_memoryMapped(false), m_charArray(new std::vector<ValueT, Allocator<ValueT> >(alloc)) { }
+ : m_sorted(true), m_memoryMapped(false), m_charArray(new std::vector<ValueT, Allocator<ValueT> >(alloc)) { }
-template<typename ValueT, typename PosT, template <typename> class Allocator>
+template<typename ValueT, typename PosT, template <typename> class Allocator>
template <typename StringT>
void StringVector<ValueT, PosT, Allocator>::push_back(StringT s)
{
- if(is_sorted() && size() && !(back() < s))
- m_sorted = false;
+ if(is_sorted() && size() && !(back() < s))
+ m_sorted = false;
- m_positions.push_back(size2());
- std::copy(s.begin(), s.end(), std::back_inserter(*m_charArray));
+ m_positions.push_back(size2());
+ std::copy(s.begin(), s.end(), std::back_inserter(*m_charArray));
}
-template<typename ValueT, typename PosT, template <typename> class Allocator>
+template<typename ValueT, typename PosT, template <typename> class Allocator>
void StringVector<ValueT, PosT, Allocator>::push_back(const char* c)
{
- std::string dummy(c);
- push_back(dummy);
+ std::string dummy(c);
+ push_back(dummy);
}
-template<typename ValueT, typename PosT, template <typename> class Allocator>
+template<typename ValueT, typename PosT, template <typename> class Allocator>
template <typename Iterator>
Iterator StringVector<ValueT, PosT, Allocator>::begin() const
{
- return Iterator(const_cast<StringVector<ValueT, PosT, Allocator>&>(*this), 0);
+ return Iterator(const_cast<StringVector<ValueT, PosT, Allocator>&>(*this), 0);
}
-
-template<typename ValueT, typename PosT, template <typename> class Allocator>
+
+template<typename ValueT, typename PosT, template <typename> class Allocator>
template <typename Iterator>
Iterator StringVector<ValueT, PosT, Allocator>::end() const
{
- return Iterator(const_cast<StringVector<ValueT, PosT, Allocator>&>(*this), size());
+ return Iterator(const_cast<StringVector<ValueT, PosT, Allocator>&>(*this), size());
}
template<typename ValueT, typename PosT, template <typename> class Allocator>
typename StringVector<ValueT, PosT, Allocator>::iterator StringVector<ValueT, PosT, Allocator>::begin() const
{
- return begin<iterator>();
-};
+ return begin<iterator>();
+};
template<typename ValueT, typename PosT, template <typename> class Allocator>
typename StringVector<ValueT, PosT, Allocator>::iterator StringVector<ValueT, PosT, Allocator>::end() const
{
- return end<iterator>();
-};
+ return end<iterator>();
+};
template<typename ValueT, typename PosT, template <typename> class Allocator>
bool StringVector<ValueT, PosT, Allocator>::is_sorted() const
{
- return m_sorted;
+ return m_sorted;
}
template<typename ValueT, typename PosT, template <typename> class Allocator>
PosT StringVector<ValueT, PosT, Allocator>::size() const
{
- return m_positions.size();
+ return m_positions.size();
}
template<typename ValueT, typename PosT, template <typename> class Allocator>
PosT StringVector<ValueT, PosT, Allocator>::size2() const
{
- return m_charArray->size();
+ return m_charArray->size();
}
-
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
typename StringVector<ValueT, PosT, Allocator>::range StringVector<ValueT, PosT, Allocator>::at(PosT i) const
{
- return range(begin(i), end(i));
+ return range(begin(i), end(i));
}
-
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
typename StringVector<ValueT, PosT, Allocator>::range StringVector<ValueT, PosT, Allocator>::operator[](PosT i) const
{
- return at(i);
+ return at(i);
}
template<typename ValueT, typename PosT, template <typename> class Allocator>
typename StringVector<ValueT, PosT, Allocator>::range StringVector<ValueT, PosT, Allocator>::back() const
{
- return at(size()-1);
+ return at(size()-1);
}
template<typename ValueT, typename PosT, template <typename> class Allocator>
PosT StringVector<ValueT, PosT, Allocator>::length(PosT i) const
{
- if(i+1 < size())
- return m_positions[i+1] - m_positions[i];
- else
- return size2() - m_positions[i];
+ if(i+1 < size())
+ return m_positions[i+1] - m_positions[i];
+ else
+ return size2() - m_positions[i];
}
template<typename ValueT, typename PosT, template <typename> class Allocator>
const ValueT* StringVector<ValueT, PosT, Allocator>::value_ptr(PosT i) const
{
- return &(*m_charArray)[m_positions[i]];
+ return &(*m_charArray)[m_positions[i]];
}
template<typename ValueT, typename PosT, template <typename> class Allocator>
typename std::vector<ValueT, Allocator<ValueT> >::const_iterator StringVector<ValueT, PosT, Allocator>::begin(PosT i) const
{
- return typename std::vector<ValueT, Allocator<ValueT> >::const_iterator(value_ptr(i));
-}
+ return typename std::vector<ValueT, Allocator<ValueT> >::const_iterator(value_ptr(i));
+}
template<typename ValueT, typename PosT, template <typename> class Allocator>
typename std::vector<ValueT, Allocator<ValueT> >::const_iterator StringVector<ValueT, PosT, Allocator>::end(PosT i) const
{
- return typename std::vector<ValueT, Allocator<ValueT> >::const_iterator(value_ptr(i) + length(i));
-}
+ return typename std::vector<ValueT, Allocator<ValueT> >::const_iterator(value_ptr(i) + length(i));
+}
template<typename ValueT, typename PosT, template <typename> class Allocator>
template <typename StringT>
PosT StringVector<ValueT, PosT, Allocator>::find(StringT &s) const
{
- if(m_sorted)
- return std::distance(begin(), std::lower_bound(begin(), end(), s));
- return std::distance(begin(), std::find(begin(), end(), s));
+ if(m_sorted)
+ return std::distance(begin(), std::lower_bound(begin(), end(), s));
+ return std::distance(begin(), std::find(begin(), end(), s));
}
template<typename ValueT, typename PosT, template <typename> class Allocator>
PosT StringVector<ValueT, PosT, Allocator>::find(const char* c) const
{
- std::string s(c);
- return find(s);
+ std::string s(c);
+ return find(s);
}
// RangeIterator
@@ -518,21 +504,21 @@ StringVector<ValueT, PosT, Allocator>::RangeIterator::RangeIterator() : m_index(
template<typename ValueT, typename PosT, template <typename> class Allocator>
StringVector<ValueT, PosT, Allocator>::RangeIterator::RangeIterator(StringVector<ValueT, PosT, Allocator> &sv, PosT index)
: m_index(index), m_container(&sv) { }
-
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
PosT StringVector<ValueT, PosT, Allocator>::RangeIterator::get_index()
{
return m_index;
}
-
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
typename StringVector<ValueT, PosT, Allocator>::range
- StringVector<ValueT, PosT, Allocator>::RangeIterator::dereference() const
+StringVector<ValueT, PosT, Allocator>::RangeIterator::dereference() const
{
return typename StringVector<ValueT, PosT, Allocator>::range(
- m_container->begin(m_index),
- m_container->end(m_index)
- );
+ m_container->begin(m_index),
+ m_container->end(m_index)
+ );
}
template<typename ValueT, typename PosT, template <typename> class Allocator>
@@ -577,18 +563,18 @@ template<typename ValueT, typename PosT, template <typename> class Allocator>
StringVector<ValueT, PosT, Allocator>::StringIterator::StringIterator(
StringVector<ValueT, PosT, Allocator> &sv, PosT index) : m_index(index),
m_container(&sv) { }
-
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
PosT StringVector<ValueT, PosT, Allocator>::StringIterator::get_index()
{
return m_index;
}
-
+
template<typename ValueT, typename PosT, template <typename> class Allocator>
const std::string StringVector<ValueT, PosT, Allocator>::StringIterator::dereference() const
{
return StringVector<ValueT, PosT, Allocator>::range(m_container->begin(m_index),
- m_container->end(m_index)).str();
+ m_container->end(m_index)).str();
}
template<typename ValueT, typename PosT, template <typename> class Allocator>
@@ -620,7 +606,7 @@ template<typename ValueT, typename PosT, template <typename> class Allocator>
PosT StringVector<ValueT, PosT, Allocator>::StringIterator::distance_to(
StringVector<ValueT, PosT, Allocator>::StringIterator const& other) const
{
- return other.m_index - m_index;
+ return other.m_index - m_index;
}
// ********** Some typedefs **********
diff --git a/moses/TranslationModel/CompactPT/TargetPhraseCollectionCache.h b/moses/TranslationModel/CompactPT/TargetPhraseCollectionCache.h
index 7687d1498..3eac0226a 100644
--- a/moses/TranslationModel/CompactPT/TargetPhraseCollectionCache.h
+++ b/moses/TranslationModel/CompactPT/TargetPhraseCollectionCache.h
@@ -1,23 +1,23 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#ifndef moses_TargetPhraseCollectionCache_h
#define moses_TargetPhraseCollectionCache_h
@@ -46,135 +46,119 @@ typedef boost::shared_ptr<TargetPhraseVector> TargetPhraseVectorPtr;
class TargetPhraseCollectionCache
{
- private:
- size_t m_max;
- float m_tolerance;
-
- struct LastUsed {
- clock_t m_clock;
- TargetPhraseVectorPtr m_tpv;
- size_t m_bitsLeft;
-
- LastUsed() : m_clock(0), m_bitsLeft(0) {}
-
- LastUsed(clock_t clock, TargetPhraseVectorPtr tpv, size_t bitsLeft = 0)
+private:
+ size_t m_max;
+ float m_tolerance;
+
+ struct LastUsed {
+ clock_t m_clock;
+ TargetPhraseVectorPtr m_tpv;
+ size_t m_bitsLeft;
+
+ LastUsed() : m_clock(0), m_bitsLeft(0) {}
+
+ LastUsed(clock_t clock, TargetPhraseVectorPtr tpv, size_t bitsLeft = 0)
: m_clock(clock), m_tpv(tpv), m_bitsLeft(bitsLeft) {}
- };
-
- typedef std::map<Phrase, LastUsed> CacheMap;
-
- CacheMap m_phraseCache;
-
+ };
+
+ typedef std::map<Phrase, LastUsed> CacheMap;
+
+ CacheMap m_phraseCache;
+
#ifdef WITH_THREADS
- boost::mutex m_mutex;
+ boost::mutex m_mutex;
#endif
- public:
-
- typedef CacheMap::iterator iterator;
- typedef CacheMap::const_iterator const_iterator;
-
- TargetPhraseCollectionCache(size_t max = 5000, float tolerance = 0.2)
+public:
+
+ typedef CacheMap::iterator iterator;
+ typedef CacheMap::const_iterator const_iterator;
+
+ TargetPhraseCollectionCache(size_t max = 5000, float tolerance = 0.2)
: m_max(max), m_tolerance(tolerance)
- {}
-
- iterator Begin()
- {
- return m_phraseCache.begin();
- }
-
- const_iterator Begin() const
- {
- return m_phraseCache.begin();
- }
-
- iterator End()
- {
- return m_phraseCache.end();
- }
-
- const_iterator End() const
- {
- return m_phraseCache.end();
- }
-
- void Cache(const Phrase &sourcePhrase, TargetPhraseVectorPtr tpv,
- size_t bitsLeft = 0, size_t maxRank = 0)
- {
+ {}
+
+ iterator Begin() {
+ return m_phraseCache.begin();
+ }
+
+ const_iterator Begin() const {
+ return m_phraseCache.begin();
+ }
+
+ iterator End() {
+ return m_phraseCache.end();
+ }
+
+ const_iterator End() const {
+ return m_phraseCache.end();
+ }
+
+ void Cache(const Phrase &sourcePhrase, TargetPhraseVectorPtr tpv,
+ size_t bitsLeft = 0, size_t maxRank = 0) {
#ifdef WITH_THREADS
- boost::mutex::scoped_lock lock(m_mutex);
+ boost::mutex::scoped_lock lock(m_mutex);
#endif
- iterator it = m_phraseCache.find(sourcePhrase);
- if(it != m_phraseCache.end())
- it->second.m_clock = clock();
- else
- {
- if(maxRank && tpv->size() > maxRank)
- {
- TargetPhraseVectorPtr tpv_temp(new TargetPhraseVector());
- tpv_temp->resize(maxRank);
- std::copy(tpv->begin(), tpv->begin() + maxRank, tpv_temp->begin());
- m_phraseCache[sourcePhrase] = LastUsed(clock(), tpv_temp, bitsLeft);
- }
- else
- m_phraseCache[sourcePhrase] = LastUsed(clock(), tpv, bitsLeft);
- }
+ iterator it = m_phraseCache.find(sourcePhrase);
+ if(it != m_phraseCache.end())
+ it->second.m_clock = clock();
+ else {
+ if(maxRank && tpv->size() > maxRank) {
+ TargetPhraseVectorPtr tpv_temp(new TargetPhraseVector());
+ tpv_temp->resize(maxRank);
+ std::copy(tpv->begin(), tpv->begin() + maxRank, tpv_temp->begin());
+ m_phraseCache[sourcePhrase] = LastUsed(clock(), tpv_temp, bitsLeft);
+ } else
+ m_phraseCache[sourcePhrase] = LastUsed(clock(), tpv, bitsLeft);
}
+ }
- std::pair<TargetPhraseVectorPtr, size_t> Retrieve(const Phrase &sourcePhrase)
- {
+ std::pair<TargetPhraseVectorPtr, size_t> Retrieve(const Phrase &sourcePhrase) {
#ifdef WITH_THREADS
- boost::mutex::scoped_lock lock(m_mutex);
+ boost::mutex::scoped_lock lock(m_mutex);
#endif
- iterator it = m_phraseCache.find(sourcePhrase);
- if(it != m_phraseCache.end())
- {
- LastUsed &lu = it->second;
- lu.m_clock = clock();
- return std::make_pair(lu.m_tpv, lu.m_bitsLeft);
- }
- else
- return std::make_pair(TargetPhraseVectorPtr(), 0);
- }
+ iterator it = m_phraseCache.find(sourcePhrase);
+ if(it != m_phraseCache.end()) {
+ LastUsed &lu = it->second;
+ lu.m_clock = clock();
+ return std::make_pair(lu.m_tpv, lu.m_bitsLeft);
+ } else
+ return std::make_pair(TargetPhraseVectorPtr(), 0);
+ }
- void Prune()
- {
+ void Prune() {
#ifdef WITH_THREADS
- boost::mutex::scoped_lock lock(m_mutex);
+ boost::mutex::scoped_lock lock(m_mutex);
#endif
- if(m_phraseCache.size() > m_max * (1 + m_tolerance))
- {
- typedef std::set<std::pair<clock_t, Phrase> > Cands;
- Cands cands;
- for(CacheMap::iterator it = m_phraseCache.begin();
- it != m_phraseCache.end(); it++)
- {
- LastUsed &lu = it->second;
- cands.insert(std::make_pair(lu.m_clock, it->first));
- }
-
- for(Cands::iterator it = cands.begin(); it != cands.end(); it++)
- {
- const Phrase& p = it->second;
- m_phraseCache.erase(p);
-
- if(m_phraseCache.size() < (m_max * (1 - m_tolerance)))
- break;
- }
+ if(m_phraseCache.size() > m_max * (1 + m_tolerance)) {
+ typedef std::set<std::pair<clock_t, Phrase> > Cands;
+ Cands cands;
+ for(CacheMap::iterator it = m_phraseCache.begin();
+ it != m_phraseCache.end(); it++) {
+ LastUsed &lu = it->second;
+ cands.insert(std::make_pair(lu.m_clock, it->first));
+ }
+
+ for(Cands::iterator it = cands.begin(); it != cands.end(); it++) {
+ const Phrase& p = it->second;
+ m_phraseCache.erase(p);
+
+ if(m_phraseCache.size() < (m_max * (1 - m_tolerance)))
+ break;
}
}
+ }
- void CleanUp()
- {
+ void CleanUp() {
#ifdef WITH_THREADS
- boost::mutex::scoped_lock lock(m_mutex);
+ boost::mutex::scoped_lock lock(m_mutex);
#endif
- m_phraseCache.clear();
- }
-
+ m_phraseCache.clear();
+ }
+
};
}
diff --git a/moses/TranslationModel/CompactPT/ThrowingFwrite.cpp b/moses/TranslationModel/CompactPT/ThrowingFwrite.cpp
index 35e8e3122..b231836f5 100644
--- a/moses/TranslationModel/CompactPT/ThrowingFwrite.cpp
+++ b/moses/TranslationModel/CompactPT/ThrowingFwrite.cpp
@@ -1,27 +1,28 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
-#include "ThrowingFwrite.h"
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
-size_t ThrowingFwrite(const void *ptr, size_t size, size_t count, FILE* stream) {
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "ThrowingFwrite.h"
+
+size_t ThrowingFwrite(const void *ptr, size_t size, size_t count, FILE* stream)
+{
assert(size);
size_t returnValue = std::fwrite(ptr, size, count, stream);
UTIL_THROW_IF(count != returnValue, util::ErrnoException, "Short fwrite; requested size " << size);
diff --git a/moses/TranslationModel/CompactPT/ThrowingFwrite.h b/moses/TranslationModel/CompactPT/ThrowingFwrite.h
index 4f45ae8f5..466d3973b 100644
--- a/moses/TranslationModel/CompactPT/ThrowingFwrite.h
+++ b/moses/TranslationModel/CompactPT/ThrowingFwrite.h
@@ -1,30 +1,30 @@
-// $Id$
-// vim:tabstop=2
-/***********************************************************************
-Moses - factored phrase-based language decoder
-Copyright (C) 2006 University of Edinburgh
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-***********************************************************************/
+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
#ifndef moses_ThrowingFwrite_h
#define moses_ThrowingFwrite_h
#include <cassert>
#include <cstdio>
-#include "util/exception.hh"
+#include "util/exception.hh"
size_t ThrowingFwrite(const void *ptr, size_t size, size_t count, FILE* stream);
diff --git a/moses/TranslationModel/DynSAInclude/RandLMCache.h b/moses/TranslationModel/DynSAInclude/RandLMCache.h
index b92a2a164..06ce240a1 100644
--- a/moses/TranslationModel/DynSAInclude/RandLMCache.h
+++ b/moses/TranslationModel/DynSAInclude/RandLMCache.h
@@ -22,177 +22,180 @@
#include <ctime>
#include <iostream>
-namespace randlm {
-
- //! @todo ask abby2
- template<typename T>
- class CacheNode {
- public:
- typedef std::map<wordID_t, CacheNode<T>* > childMap;
- // initialise value to 'unknown' (i.e. not yet queried or cached).
- CacheNode(T unknown_value) : value_(unknown_value) {}
- childMap childs_; // child pointers
- T value_; // value stored
- const void* state_; // state pointer
- };
-
- template<typename T>
- class Cache {
- public:
- typedef typename std::map<wordID_t, CacheNode<T>* >::iterator childPtr;
- // unknown_value is used to indicate the ngram was not queried (yet)
- // null_value_ indicates it was queried but not found in model
- // space usage is handled by client.
- Cache(T unknown_value, T null_value) :
- cur_nodes_(0), unknown_value_(unknown_value), null_value_(null_value) {
- root_ = newNode();
+namespace randlm
+{
+
+//! @todo ask abby2
+template<typename T>
+class CacheNode
+{
+public:
+ typedef std::map<wordID_t, CacheNode<T>* > childMap;
+ // initialise value to 'unknown' (i.e. not yet queried or cached).
+ CacheNode(T unknown_value) : value_(unknown_value) {}
+ childMap childs_; // child pointers
+ T value_; // value stored
+ const void* state_; // state pointer
+};
+
+template<typename T>
+class Cache
+{
+public:
+ typedef typename std::map<wordID_t, CacheNode<T>* >::iterator childPtr;
+ // unknown_value is used to indicate the ngram was not queried (yet)
+ // null_value_ indicates it was queried but not found in model
+ // space usage is handled by client.
+ Cache(T unknown_value, T null_value) :
+ cur_nodes_(0), unknown_value_(unknown_value), null_value_(null_value) {
+ root_ = newNode();
+ }
+ ~Cache() {
+ if(clear()) {
+ delete root_;
+ root_ = NULL;
+ } else {
+ std::cerr << "Error freeing cache memory.\n";
}
- ~Cache() {
- if(clear()) {
- delete root_;
- root_ = NULL;
+ }
+ bool setCacheNgram(const wordID_t* ngram, int len, T value, const void* state) {
+ // inserts full ngram into cache
+ CacheNode<T>* node = root_;
+ for (int i = len - 1; i > -1; --i) {
+ childPtr child = node->childs_.find(ngram[i]);
+ if( child != node->childs_.end() ) {
+ // current node is already prefix. Go to child node
+ node = node->childs_[ngram[i]];
} else {
- std::cerr << "Error freeing cache memory.\n";
- }
- }
- bool setCacheNgram(const wordID_t* ngram, int len, T value, const void* state) {
- // inserts full ngram into cache
- CacheNode<T>* node = root_;
- for (int i = len - 1; i > -1; --i) {
- childPtr child = node->childs_.find(ngram[i]);
- if( child != node->childs_.end() ) {
- // current node is already prefix. Go to child node
- node = node->childs_[ngram[i]];
- } else {
- // no child for prefix. set new child link in current node
- CacheNode<T> * newChild = newNode(node);
- node->childs_[ngram[i]] = newChild;
- // go to new node
- node = newChild;
- }
+ // no child for prefix. set new child link in current node
+ CacheNode<T> * newChild = newNode(node);
+ node->childs_[ngram[i]] = newChild;
+ // go to new node
+ node = newChild;
}
- node->value_ = value;
- node->state_ = state;
- return true;
}
- bool checkCacheNgram(const wordID_t* ngram, int len, T* value, const void** state) {
- // finds value for this full ngram only (returns false if full ngram not in cache)
- CacheNode<T> * node = root_;
- for(int i = len - 1; i > -1; --i) {
- // go to deepest level node of ngram in cache
- childPtr child = node->childs_.find(ngram[i]);
- if( child != node->childs_.end() ) {
- // switch to child node
- node = node->childs_[ngram[i]];
- } else {
- // not cached
- return false;
- }
+ node->value_ = value;
+ node->state_ = state;
+ return true;
+ }
+ bool checkCacheNgram(const wordID_t* ngram, int len, T* value, const void** state) {
+ // finds value for this full ngram only (returns false if full ngram not in cache)
+ CacheNode<T> * node = root_;
+ for(int i = len - 1; i > -1; --i) {
+ // go to deepest level node of ngram in cache
+ childPtr child = node->childs_.find(ngram[i]);
+ if( child != node->childs_.end() ) {
+ // switch to child node
+ node = node->childs_[ngram[i]];
+ } else {
+ // not cached
+ return false;
}
- *value = node->value_;
- if(state) *state = node->state_;
- return *value != null_value_ && *value != unknown_value_;
}
- int getCache2(const wordID_t* ngram, int len, T** values, int* found) {
- // set values array to point to cache value nodes
- CacheNode<T> * node = root_;
- *found = 0;
- //values[0] = &node->value_; // pointer to root node's value
- bool all_found = true;
- for(int i = len - 1; i > -1; --i) {
- // go to deepest level node of ngram in cache
- childPtr child = node->childs_.find(ngram[i]);
- if( child != node->childs_.end() ) {
- // switch to child node
- node = node->childs_[ngram[i]];
- // get pointer to value (index by length - 1)
- values[i] = &node->value_;
- // if null_value then assume all extensions impossible
- if (node->value_ == null_value_) {
- return len - 1 - i; // max length posible
- }
- all_found = all_found && (node->value_ != unknown_value_);
- if (all_found)
- ++(*found);
- } else {
- // initialise uncached values
- CacheNode<T> * newChild = newNode(node);
- node->childs_[ngram[i]] = newChild;
- // go to new node
- node = newChild;
- values[i] = &node->value_;
- }
+ *value = node->value_;
+ if(state) *state = node->state_;
+ return *value != null_value_ && *value != unknown_value_;
+ }
+ int getCache2(const wordID_t* ngram, int len, T** values, int* found) {
+ // set values array to point to cache value nodes
+ CacheNode<T> * node = root_;
+ *found = 0;
+ //values[0] = &node->value_; // pointer to root node's value
+ bool all_found = true;
+ for(int i = len - 1; i > -1; --i) {
+ // go to deepest level node of ngram in cache
+ childPtr child = node->childs_.find(ngram[i]);
+ if( child != node->childs_.end() ) {
+ // switch to child node
+ node = node->childs_[ngram[i]];
+ // get pointer to value (index by length - 1)
+ values[i] = &node->value_;
+ // if null_value then assume all extensions impossible
+ if (node->value_ == null_value_) {
+ return len - 1 - i; // max length posible
+ }
+ all_found = all_found && (node->value_ != unknown_value_);
+ if (all_found)
+ ++(*found);
+ } else {
+ // initialise uncached values
+ CacheNode<T> * newChild = newNode(node);
+ node->childs_[ngram[i]] = newChild;
+ // go to new node
+ node = newChild;
+ values[i] = &node->value_;
}
- return len; // all possible
}
- int getCache(const wordID_t* ngram, int len, T** values, int* found) {
- // get pointers to values for ngram and constituents.
- // returns upper bound on longest subngram in model.
- // 'found' stores longest non-null and known value found.
- CacheNode<T> * node = root_;
- *found = 0;
- values[0] = &node->value_; // pointer to root node's value
- bool all_found = true;
- for(int i = len - 1; i > -1; --i) {
- // go to deepest level node of ngram in cache
- childPtr child = node->childs_.find(ngram[i]);
- if( child != node->childs_.end() ) {
- // switch to child node
- node = node->childs_[ngram[i]];
- // get pointer to value (index by length - 1)
- values[len - i] = &node->value_;
- // if null_value then assume all extensions impossible
- if (node->value_ == null_value_)
- return len - 1 - i; // max length posible
- all_found = all_found && (node->value_ != unknown_value_);
- if (all_found)
- ++(*found);
- } else {
- // initialise uncached values
- CacheNode<T> * newChild = newNode(node);
- node->childs_[ngram[i]] = newChild;
- // go to new node
- node = newChild;
- values[len - i] = &node->value_;
- }
+ return len; // all possible
+ }
+ int getCache(const wordID_t* ngram, int len, T** values, int* found) {
+ // get pointers to values for ngram and constituents.
+ // returns upper bound on longest subngram in model.
+ // 'found' stores longest non-null and known value found.
+ CacheNode<T> * node = root_;
+ *found = 0;
+ values[0] = &node->value_; // pointer to root node's value
+ bool all_found = true;
+ for(int i = len - 1; i > -1; --i) {
+ // go to deepest level node of ngram in cache
+ childPtr child = node->childs_.find(ngram[i]);
+ if( child != node->childs_.end() ) {
+ // switch to child node
+ node = node->childs_[ngram[i]];
+ // get pointer to value (index by length - 1)
+ values[len - i] = &node->value_;
+ // if null_value then assume all extensions impossible
+ if (node->value_ == null_value_)
+ return len - 1 - i; // max length posible
+ all_found = all_found && (node->value_ != unknown_value_);
+ if (all_found)
+ ++(*found);
+ } else {
+ // initialise uncached values
+ CacheNode<T> * newChild = newNode(node);
+ node->childs_[ngram[i]] = newChild;
+ // go to new node
+ node = newChild;
+ values[len - i] = &node->value_;
}
- return len; // all possible
- }
- bool clear() {
- std::cerr << "Clearing cache with " << static_cast<float>(cur_nodes_ * nodeSize())
- / static_cast<float>(1ull << 20) << "MB" << std::endl;
- return clearNodes(root_);
- }
- int nodes() {
- // returns number of nodes
- return cur_nodes_;
- }
- int nodeSize() {
- return sizeof(CacheNode<T>) + sizeof(root_->childs_);
- }
- private:
- CacheNode<T> * root_;
- count_t cur_nodes_;
- T unknown_value_; // Used to initialise data at each node
- T null_value_; // Indicates cached something not in model
- CacheNode<T>* newNode(CacheNode<T> * node = 0) {
- ++cur_nodes_;
- return new CacheNode<T>(unknown_value_);
}
- bool clearNodes(CacheNode<T> * node) {
- //delete children from this node
- if(!node->childs_.empty()) {
- iterate(node->childs_, itr) {
- if(!clearNodes(itr->second))
- std::cerr << "Error emptying cache\n";
- delete itr->second;
- --cur_nodes_;
- }
- node->childs_.clear();
+ return len; // all possible
+ }
+ bool clear() {
+ std::cerr << "Clearing cache with " << static_cast<float>(cur_nodes_ * nodeSize())
+ / static_cast<float>(1ull << 20) << "MB" << std::endl;
+ return clearNodes(root_);
+ }
+ int nodes() {
+ // returns number of nodes
+ return cur_nodes_;
+ }
+ int nodeSize() {
+ return sizeof(CacheNode<T>) + sizeof(root_->childs_);
+ }
+private:
+ CacheNode<T> * root_;
+ count_t cur_nodes_;
+ T unknown_value_; // Used to initialise data at each node
+ T null_value_; // Indicates cached something not in model
+ CacheNode<T>* newNode(CacheNode<T> * node = 0) {
+ ++cur_nodes_;
+ return new CacheNode<T>(unknown_value_);
+ }
+ bool clearNodes(CacheNode<T> * node) {
+ //delete children from this node
+ if(!node->childs_.empty()) {
+ iterate(node->childs_, itr) {
+ if(!clearNodes(itr->second))
+ std::cerr << "Error emptying cache\n";
+ delete itr->second;
+ --cur_nodes_;
}
- return true;
+ node->childs_.clear();
}
+ return true;
+ }
- };
+};
} //end namespace
#endif //INC_RANDLM_CACHE_H
diff --git a/moses/TranslationModel/DynSAInclude/RandLMFilter.h b/moses/TranslationModel/DynSAInclude/RandLMFilter.h
index 298464693..0923f52af 100644
--- a/moses/TranslationModel/DynSAInclude/RandLMFilter.h
+++ b/moses/TranslationModel/DynSAInclude/RandLMFilter.h
@@ -24,296 +24,307 @@
#define log2(X) (log((double)X)/log((double)2))
#endif
-namespace randlm {
-
- /* Class Filter wraps a contiguous array of data. Filter and its subclasses
- * implement read/write/increment functionality on arrays with arbitrary sized addresses
- * (i.e. an address may not use a full number of bytes). When converting to byte-based
- * representation we assume "unused" bits are to left.
- * E.g. if the underlying data is stored in units T = uint16 and the 'width' = 11
- * to read 'address' = 3 we extract bits at indices [33,42] (i.e. [11*3, 11*4 - 1])
- * and store in a uint16 in positions 0000 0001 111111 where the first 7 bits have
- * been masked out.
- */
- template<typename T>
- class Filter {
- public:
- Filter(uint64_t addresses, int width) : addresses_(addresses), width_(width), data_(NULL) {
- // number of bits in T
- cell_width_ = sizeof(T) << 3;
- // current implementation has following constraints
- CHECK(cell_width_ > 0 && cell_width_ <= 64 && cell_width_ >= width);
- // used for >> division
- log_cell_width_ = static_cast<int>(floor(log((double)cell_width_)/log((double)2) + 0.000001));
- // size of underlying data in Ts
- cells_ = ((addresses * width) + cell_width_ - 1) >> log_cell_width_;
- // instantiate underlying data
- data_ = new T[cells_];
- CHECK(data_ != NULL);
- CHECK(reset());
- // 'first_bit' marks the first bit used by 'address' (left padded with zeros).
- first_bit_ = (width % cell_width_ == 0) ? 0 : cell_width_ - (width % cell_width_);
- // mask for full cell
- full_mask_ = static_cast<T>(0xffffffffffffffffull);
- // mask for bits that make up the address
- address_mask_ = full_mask_ >> first_bit_;
- }
- Filter(FileHandler* fin, bool loaddata = true) : data_(NULL) {
- CHECK(loadHeader(fin));
- if (loaddata)
- CHECK(loadData(fin));
- }
- virtual ~Filter() {
- delete[] data_;
- }
- bool reset() {
- for (uint64_t i = 0; i < cells_; ++i)
- data_[i] = 0;
+namespace randlm
+{
+
+/* Class Filter wraps a contiguous array of data. Filter and its subclasses
+ * implement read/write/increment functionality on arrays with arbitrary sized addresses
+ * (i.e. an address may not use a full number of bytes). When converting to byte-based
+ * representation we assume "unused" bits are to left.
+ * E.g. if the underlying data is stored in units T = uint16 and the 'width' = 11
+ * to read 'address' = 3 we extract bits at indices [33,42] (i.e. [11*3, 11*4 - 1])
+ * and store in a uint16 in positions 0000 0001 111111 where the first 7 bits have
+ * been masked out.
+ */
+template<typename T>
+class Filter
+{
+public:
+ Filter(uint64_t addresses, int width) : addresses_(addresses), width_(width), data_(NULL) {
+ // number of bits in T
+ cell_width_ = sizeof(T) << 3;
+ // current implementation has following constraints
+ CHECK(cell_width_ > 0 && cell_width_ <= 64 && cell_width_ >= width);
+ // used for >> division
+ log_cell_width_ = static_cast<int>(floor(log((double)cell_width_)/log((double)2) + 0.000001));
+ // size of underlying data in Ts
+ cells_ = ((addresses * width) + cell_width_ - 1) >> log_cell_width_;
+ // instantiate underlying data
+ data_ = new T[cells_];
+ CHECK(data_ != NULL);
+ CHECK(reset());
+ // 'first_bit' marks the first bit used by 'address' (left padded with zeros).
+ first_bit_ = (width % cell_width_ == 0) ? 0 : cell_width_ - (width % cell_width_);
+ // mask for full cell
+ full_mask_ = static_cast<T>(0xffffffffffffffffull);
+ // mask for bits that make up the address
+ address_mask_ = full_mask_ >> first_bit_;
+ }
+ Filter(FileHandler* fin, bool loaddata = true) : data_(NULL) {
+ CHECK(loadHeader(fin));
+ if (loaddata)
+ CHECK(loadData(fin));
+ }
+ virtual ~Filter() {
+ delete[] data_;
+ }
+ bool reset() {
+ for (uint64_t i = 0; i < cells_; ++i)
+ data_[i] = 0;
+ return true;
+ }
+ count_t size() {
+ // return approx size of filter in MBs
+ return cells_ * sizeof(T) >> 20;
+ }
+ // read / write functions
+ inline bool read(uint64_t address, T* value) {
+ CHECK(address <= addresses_);
+ // copy address to 'value'
+ uint64_t data_bit = address * width_;
+ uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
+ // 'offset' shows how address in 'data' and 'value' align
+ int offset = (data_bit % cell_width_) - first_bit_;
+ // they align so just copy across masking unneeded leading bits
+ if (offset == 0) {
+ *value = data_[data_cell] & address_mask_;
return true;
}
- count_t size() {
- // return approx size of filter in MBs
- return cells_ * sizeof(T) >> 20;
- }
- // read / write functions
- inline bool read(uint64_t address, T* value) {
- CHECK(address <= addresses_);
- // copy address to 'value'
- uint64_t data_bit = address * width_;
- uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
- // 'offset' shows how address in 'data' and 'value' align
- int offset = (data_bit % cell_width_) - first_bit_;
- // they align so just copy across masking unneeded leading bits
- if (offset == 0) {
- *value = data_[data_cell] & address_mask_;
- return true;
- }
- // data address starts to left so shift it right
- if (offset < 0) {
- *value = (data_[data_cell] >> -offset) & address_mask_;
- return true;
- }
- // data address is to right so shift it left and look at one more cell to right
- *value = ((data_[data_cell] << offset)
- | (data_[data_cell + 1] >> (cell_width_ - offset))) & address_mask_ ;
+ // data address starts to left so shift it right
+ if (offset < 0) {
+ *value = (data_[data_cell] >> -offset) & address_mask_;
return true;
}
- inline T read(uint64_t address) {
- CHECK(address <= addresses_);
- // return value at address
- T value = 0;
- uint64_t data_bit = address * width_;
- uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
- // 'offset' shows how address in 'data' and 'value' align
- int offset = (data_bit % cell_width_) - first_bit_;
- // they align so just copy across masking unneeded leading bits
- if (offset == 0) {
- value = data_[data_cell] & address_mask_;
- }
- // data address starts to left so shift it right
- else if (offset < 0) {
- value = (data_[data_cell] >> -offset) & address_mask_;
- }
- // data address is to right so shift it left and look at one more cell to right
- else
- value = ((data_[data_cell] << offset)
- | (data_[data_cell + 1] >> (cell_width_ - offset))) & address_mask_ ;
- return value;
- }
- inline bool write(uint64_t address, T value) {
- CHECK(address <= addresses_);
- CHECK(log2(value) <= width_);
- // write 'value' to address
- uint64_t data_bit = address * width_;
- uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
- // 'offset' shows how address in 'data' and 'value' align
- int offset = (data_bit % cell_width_) - first_bit_;
- // they align so just copy across masking unneeded leading zeros of value
- if (offset == 0) {
- data_[data_cell] = value | (data_[data_cell] & ~address_mask_);
- return true;
- }
- // address in data is to left so shift value left by -offset
- if (offset < 0) {
- data_[data_cell] = (value << -offset)
- | (data_[data_cell] & ~(address_mask_ << -offset));
- return true;
- }
- // address in data is to right so shift value right by offset
- data_[data_cell] = (value >> offset) |
- (data_[data_cell] & ~(address_mask_ >> offset));
- data_[data_cell + 1] = (value << (cell_width_ - offset)) |
- (data_[data_cell + 1] & (full_mask_ >> offset));
- return true;
- }
- inline bool readWithFingerprint(uint64_t address, T finger, T* value) {
- // copy 'address' ^ 'finger' to 'value'
- uint64_t data_bit = address * width_;
- uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
- // 'offset' shows how address in 'data' and 'value' align
- int offset = (data_bit % cell_width_) - first_bit_;
- // they align so just copy across masking unneeded leading bits
- if (offset == 0) {
- *value = (finger ^ data_[data_cell]) & address_mask_;
- return true;
- }
- // data address starts to left so shift it right
- if (offset < 0) {
- *value = ((data_[data_cell] >> -offset) ^ finger) & address_mask_;
- return true;
- }
- // data address is to right so shift it left and look at one more cell to right
- *value = (((data_[data_cell] << offset)
- | (data_[data_cell + 1] >> (cell_width_ - offset))) ^ finger)
- & address_mask_ ;
- return true;
+ // data address is to right so shift it left and look at one more cell to right
+ *value = ((data_[data_cell] << offset)
+ | (data_[data_cell + 1] >> (cell_width_ - offset))) & address_mask_ ;
+ return true;
+ }
+ inline T read(uint64_t address) {
+ CHECK(address <= addresses_);
+ // return value at address
+ T value = 0;
+ uint64_t data_bit = address * width_;
+ uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
+ // 'offset' shows how address in 'data' and 'value' align
+ int offset = (data_bit % cell_width_) - first_bit_;
+ // they align so just copy across masking unneeded leading bits
+ if (offset == 0) {
+ value = data_[data_cell] & address_mask_;
}
- inline bool writeWithFingerprint(uint64_t address, T finger, T value) {
- // write 'value' ^ 'finger' to address
- finger &= address_mask_; // make sure fingerprint is correct size
- uint64_t data_bit = address * width_;
- uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
- // 'offset' shows how address in 'data' and 'value' align
- int offset = (data_bit % cell_width_) - first_bit_;
- // they align so just copy across masking unneeded leading zeros of value
- if (offset == 0) {
- data_[data_cell] = (finger ^ value) | (data_[data_cell] & ~address_mask_);
- return true;
- }
- // address in data is to left so shift value left by -offset
- if (offset < 0) {
- data_[data_cell] = ((finger ^ value) << -offset)
- | (data_[data_cell] & ~(address_mask_ << -offset));
- return true;
- }
- // address in data is to right so shift value right by offset
- data_[data_cell] = ((finger ^ value) >> offset) |
- (data_[data_cell] & ~(address_mask_ >> offset));
- data_[data_cell + 1] = ((finger ^ value) << (cell_width_ - offset)) |
- (data_[data_cell + 1] & (full_mask_ >> offset));
- return true;
- }
- // debugging
- void printFilter(const std::string & prefix = "", uint32_t truncate = 64){
- std::cout << prefix;
- for (uint32_t i = 0; i < cells_ && i < truncate; ++i) {
- for (int j = cell_width_ - 1; j >= 0; --j)
- if (data_[i] & (1ull << j))
- std::cout << 1;
- else
- std::cout << 0;
- std::cout << "\n";
- }
- std::cout << std::endl;
+ // data address starts to left so shift it right
+ else if (offset < 0) {
+ value = (data_[data_cell] >> -offset) & address_mask_;
}
- // i/o
- uint64_t getAddresses() { return addresses_; }
- int getWidth() { return width_; }
- int getCellWidth() { return cell_width_; }
- uint32_t getCells() { return cells_; }
- virtual bool save(FileHandler* out) {
- CHECK(out != NULL);
- CHECK(out->write((char*)&cells_, sizeof(cells_)));
- CHECK(out->write((char*)&cell_width_, sizeof(cell_width_)));
- CHECK(out->write((char*)&log_cell_width_, sizeof(log_cell_width_)));
- CHECK(out->write((char*)&addresses_, sizeof(addresses_)));
- CHECK(out->write((char*)&width_, sizeof(width_)));
- CHECK(out->write((char*)&first_bit_, sizeof(first_bit_)));
- CHECK(out->write((char*)&full_mask_, sizeof(full_mask_)));
- CHECK(out->write((char*)&address_mask_, sizeof(address_mask_)));
- //CHECK(out->write((char*)data_, cells_ * sizeof(T)));
- const uint64_t jump = 524288032ul; //(uint64_t)pow(2, 29);
- if((width_ == 1) || cells_ < jump)
- CHECK(out->write((char*)data_, cells_ * sizeof(T)));
- else {
- uint64_t idx(0);
- while(idx + jump < cells_) {
- CHECK(out->write((char*)&data_[idx], jump * sizeof(T)));
- idx += jump;
- }
- CHECK(out->write((char*)&data_[idx], (cells_ - idx) * sizeof(T)));
- }
+ // data address is to right so shift it left and look at one more cell to right
+ else
+ value = ((data_[data_cell] << offset)
+ | (data_[data_cell + 1] >> (cell_width_ - offset))) & address_mask_ ;
+ return value;
+ }
+ inline bool write(uint64_t address, T value) {
+ CHECK(address <= addresses_);
+ CHECK(log2(value) <= width_);
+ // write 'value' to address
+ uint64_t data_bit = address * width_;
+ uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
+ // 'offset' shows how address in 'data' and 'value' align
+ int offset = (data_bit % cell_width_) - first_bit_;
+ // they align so just copy across masking unneeded leading zeros of value
+ if (offset == 0) {
+ data_[data_cell] = value | (data_[data_cell] & ~address_mask_);
return true;
}
- protected:
- bool loadHeader(FileHandler* fin) {
- CHECK(fin != NULL);
- CHECK(fin->read((char*)&cells_, sizeof(cells_)));
- CHECK(fin->read((char*)&cell_width_, sizeof(cell_width_)));
- CHECK(cell_width_ == sizeof(T) << 3); // make sure correct underlying data type
- CHECK(fin->read((char*)&log_cell_width_, sizeof(log_cell_width_)));
- CHECK(fin->read((char*)&addresses_, sizeof(addresses_)));
- CHECK(fin->read((char*)&width_, sizeof(width_)));
- CHECK(fin->read((char*)&first_bit_, sizeof(first_bit_)));
- CHECK(fin->read((char*)&full_mask_, sizeof(full_mask_)));
- CHECK(fin->read((char*)&address_mask_, sizeof(address_mask_)));
+ // address in data is to left so shift value left by -offset
+ if (offset < 0) {
+ data_[data_cell] = (value << -offset)
+ | (data_[data_cell] & ~(address_mask_ << -offset));
return true;
}
- bool loadData(FileHandler* fin) {
- // instantiate underlying array
- data_ = new T[cells_];
- CHECK(data_ != NULL);
- CHECK(fin->read((char*)data_, cells_ * sizeof(T)));
- //CHECK(fin->read((char*)&data_[0], ceil(float(cells_) / 2.0) * sizeof(T)));
- //CHECK(fin->read((char*)&data_[cells_ / 2], (cells_ / 2) * sizeof(T)));
+ // address in data is to right so shift value right by offset
+ data_[data_cell] = (value >> offset) |
+ (data_[data_cell] & ~(address_mask_ >> offset));
+ data_[data_cell + 1] = (value << (cell_width_ - offset)) |
+ (data_[data_cell + 1] & (full_mask_ >> offset));
+ return true;
+ }
+ inline bool readWithFingerprint(uint64_t address, T finger, T* value) {
+ // copy 'address' ^ 'finger' to 'value'
+ uint64_t data_bit = address * width_;
+ uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
+ // 'offset' shows how address in 'data' and 'value' align
+ int offset = (data_bit % cell_width_) - first_bit_;
+ // they align so just copy across masking unneeded leading bits
+ if (offset == 0) {
+ *value = (finger ^ data_[data_cell]) & address_mask_;
return true;
}
- uint64_t cells_; // number T making up 'data_'
- int cell_width_; // bits per cell (i.e. sizeof(T) << 3)
- int log_cell_width_; // log of bits used for >> division
- uint64_t addresses_; // number of addresses in the filter
- int width_; // width in bits of each address
- int first_bit_; // position of first bit in initial byte
- T full_mask_; // all 1s
- T address_mask_; // 1s in those positions that are part of address
- T* data_; // the raw data as bytes
- };
-
- // Extension with bit test/setter methods added
- class BitFilter : public Filter<uint8_t> {
- public:
- BitFilter(uint64_t bits) : Filter<uint8_t>(bits, 1) {}
- BitFilter(FileHandler* fin, bool loaddata = true)
- : Filter<uint8_t>(fin, loaddata) {
- if (loaddata)
- CHECK(load(fin));
- }
- // TODO: overload operator[]
- virtual bool testBit(uint64_t location) {
- // test bit referenced by location
- return data_[(location % addresses_) >> 3] & 1 << ((location % addresses_) % 8);
- }
- virtual bool setBit(uint64_t location) {
- // set bit referenced by location
- data_[(location % addresses_) >> 3] |= 1 << ((location % addresses_) % 8);
+ // data address starts to left so shift it right
+ if (offset < 0) {
+ *value = ((data_[data_cell] >> -offset) ^ finger) & address_mask_;
return true;
}
- virtual bool clearBit(uint64_t location) {
- // set bit referenced by location
- data_[(location % addresses_) >> 3] &= 0 << ((location % addresses_) % 8);
+ // data address is to right so shift it left and look at one more cell to right
+ *value = (((data_[data_cell] << offset)
+ | (data_[data_cell + 1] >> (cell_width_ - offset))) ^ finger)
+ & address_mask_ ;
+ return true;
+ }
+ inline bool writeWithFingerprint(uint64_t address, T finger, T value) {
+ // write 'value' ^ 'finger' to address
+ finger &= address_mask_; // make sure fingerprint is correct size
+ uint64_t data_bit = address * width_;
+ uint32_t data_cell = (data_bit >> log_cell_width_); // % cells_;
+ // 'offset' shows how address in 'data' and 'value' align
+ int offset = (data_bit % cell_width_) - first_bit_;
+ // they align so just copy across masking unneeded leading zeros of value
+ if (offset == 0) {
+ data_[data_cell] = (finger ^ value) | (data_[data_cell] & ~address_mask_);
return true;
}
- bool save(FileHandler* fout) {
- CHECK(Filter<uint8_t>::save(fout));
- std::cerr << "Saved BitFilter. Rho = " << rho() << "." << std::endl;;
+ // address in data is to left so shift value left by -offset
+ if (offset < 0) {
+ data_[data_cell] = ((finger ^ value) << -offset)
+ | (data_[data_cell] & ~(address_mask_ << -offset));
return true;
}
- float rho(uint64_t limit = 0) {
- uint64_t ones = 0;
- uint64_t range = limit > 0 ? std::min(limit,cells_) : cells_;
- for (uint64_t i = 0; i < range; ++i)
- for (int j = 0; j < 8; ++j)
- if (data_[i] & (1 << j))
- ++ones;
- return static_cast<float>((range << 3) - ones)/static_cast<float>(range << 3);
+ // address in data is to right so shift value right by offset
+ data_[data_cell] = ((finger ^ value) >> offset) |
+ (data_[data_cell] & ~(address_mask_ >> offset));
+ data_[data_cell + 1] = ((finger ^ value) << (cell_width_ - offset)) |
+ (data_[data_cell + 1] & (full_mask_ >> offset));
+ return true;
+ }
+ // debugging
+ void printFilter(const std::string & prefix = "", uint32_t truncate = 64) {
+ std::cout << prefix;
+ for (uint32_t i = 0; i < cells_ && i < truncate; ++i) {
+ for (int j = cell_width_ - 1; j >= 0; --j)
+ if (data_[i] & (1ull << j))
+ std::cout << 1;
+ else
+ std::cout << 0;
+ std::cout << "\n";
}
- protected:
- bool load(FileHandler* fin) {
- std::cerr << "Loaded BitFilter. Rho = " << rho() << "." << std::endl;;
- return true;
+ std::cout << std::endl;
+ }
+ // i/o
+ uint64_t getAddresses() {
+ return addresses_;
+ }
+ int getWidth() {
+ return width_;
+ }
+ int getCellWidth() {
+ return cell_width_;
+ }
+ uint32_t getCells() {
+ return cells_;
+ }
+ virtual bool save(FileHandler* out) {
+ CHECK(out != NULL);
+ CHECK(out->write((char*)&cells_, sizeof(cells_)));
+ CHECK(out->write((char*)&cell_width_, sizeof(cell_width_)));
+ CHECK(out->write((char*)&log_cell_width_, sizeof(log_cell_width_)));
+ CHECK(out->write((char*)&addresses_, sizeof(addresses_)));
+ CHECK(out->write((char*)&width_, sizeof(width_)));
+ CHECK(out->write((char*)&first_bit_, sizeof(first_bit_)));
+ CHECK(out->write((char*)&full_mask_, sizeof(full_mask_)));
+ CHECK(out->write((char*)&address_mask_, sizeof(address_mask_)));
+ //CHECK(out->write((char*)data_, cells_ * sizeof(T)));
+ const uint64_t jump = 524288032ul; //(uint64_t)pow(2, 29);
+ if((width_ == 1) || cells_ < jump)
+ CHECK(out->write((char*)data_, cells_ * sizeof(T)));
+ else {
+ uint64_t idx(0);
+ while(idx + jump < cells_) {
+ CHECK(out->write((char*)&data_[idx], jump * sizeof(T)));
+ idx += jump;
+ }
+ CHECK(out->write((char*)&data_[idx], (cells_ - idx) * sizeof(T)));
}
- };
-/*
+ return true;
+ }
+protected:
+ bool loadHeader(FileHandler* fin) {
+ CHECK(fin != NULL);
+ CHECK(fin->read((char*)&cells_, sizeof(cells_)));
+ CHECK(fin->read((char*)&cell_width_, sizeof(cell_width_)));
+ CHECK(cell_width_ == sizeof(T) << 3); // make sure correct underlying data type
+ CHECK(fin->read((char*)&log_cell_width_, sizeof(log_cell_width_)));
+ CHECK(fin->read((char*)&addresses_, sizeof(addresses_)));
+ CHECK(fin->read((char*)&width_, sizeof(width_)));
+ CHECK(fin->read((char*)&first_bit_, sizeof(first_bit_)));
+ CHECK(fin->read((char*)&full_mask_, sizeof(full_mask_)));
+ CHECK(fin->read((char*)&address_mask_, sizeof(address_mask_)));
+ return true;
+ }
+ bool loadData(FileHandler* fin) {
+ // instantiate underlying array
+ data_ = new T[cells_];
+ CHECK(data_ != NULL);
+ CHECK(fin->read((char*)data_, cells_ * sizeof(T)));
+ //CHECK(fin->read((char*)&data_[0], ceil(float(cells_) / 2.0) * sizeof(T)));
+ //CHECK(fin->read((char*)&data_[cells_ / 2], (cells_ / 2) * sizeof(T)));
+ return true;
+ }
+ uint64_t cells_; // number T making up 'data_'
+ int cell_width_; // bits per cell (i.e. sizeof(T) << 3)
+ int log_cell_width_; // log of bits used for >> division
+ uint64_t addresses_; // number of addresses in the filter
+ int width_; // width in bits of each address
+ int first_bit_; // position of first bit in initial byte
+ T full_mask_; // all 1s
+ T address_mask_; // 1s in those positions that are part of address
+ T* data_; // the raw data as bytes
+};
+
+// Extension with bit test/setter methods added
+class BitFilter : public Filter<uint8_t>
+{
+public:
+ BitFilter(uint64_t bits) : Filter<uint8_t>(bits, 1) {}
+ BitFilter(FileHandler* fin, bool loaddata = true)
+ : Filter<uint8_t>(fin, loaddata) {
+ if (loaddata)
+ CHECK(load(fin));
+ }
+ // TODO: overload operator[]
+ virtual bool testBit(uint64_t location) {
+ // test bit referenced by location
+ return data_[(location % addresses_) >> 3] & 1 << ((location % addresses_) % 8);
+ }
+ virtual bool setBit(uint64_t location) {
+ // set bit referenced by location
+ data_[(location % addresses_) >> 3] |= 1 << ((location % addresses_) % 8);
+ return true;
+ }
+ virtual bool clearBit(uint64_t location) {
+ // set bit referenced by location
+ data_[(location % addresses_) >> 3] &= 0 << ((location % addresses_) % 8);
+ return true;
+ }
+ bool save(FileHandler* fout) {
+ CHECK(Filter<uint8_t>::save(fout));
+ std::cerr << "Saved BitFilter. Rho = " << rho() << "." << std::endl;;
+ return true;
+ }
+ float rho(uint64_t limit = 0) {
+ uint64_t ones = 0;
+ uint64_t range = limit > 0 ? std::min(limit,cells_) : cells_;
+ for (uint64_t i = 0; i < range; ++i)
+ for (int j = 0; j < 8; ++j)
+ if (data_[i] & (1 << j))
+ ++ones;
+ return static_cast<float>((range << 3) - ones)/static_cast<float>(range << 3);
+ }
+protected:
+ bool load(FileHandler* fin) {
+ std::cerr << "Loaded BitFilter. Rho = " << rho() << "." << std::endl;;
+ return true;
+ }
+};
+/*
// ResizedBitFilter deals with resizing to save memory
// whereas other filters should expect locations to be within range
// this filter will need to resize (and possibly rehash) locations
@@ -385,9 +396,9 @@ namespace randlm {
carry = incrementSubCell(data_bit, this->width_, &this->data_[data_cell]);
}
// last update must not have carried
- if (!carry)
+ if (!carry)
return true;
- // wrapped round so check whether need to reset to max count
+ // wrapped round so check whether need to reset to max count
if (!wrap_around_)
CHECK(this->write(address, this->address_mask_));
return false; // false to indicate that overflowed
@@ -402,7 +413,7 @@ namespace randlm {
}
inline bool incrementSubCell(int bit, int len, T* cell) {
// increment counter consisting of bits [startbit, startbit + len - 1] rest stays unchanged
- *cell = ((((*cell >> (this->cell_width_ - bit - len)) + 1)
+ *cell = ((((*cell >> (this->cell_width_ - bit - len)) + 1)
& (this->full_mask_ >> (this->cell_width_ - len))) << (this->cell_width_ - bit - len))
| (*cell & ~(((this->full_mask_ >> (this->cell_width_ - len)) << (this->cell_width_ - bit - len))));
// indicate overflow as true
diff --git a/moses/TranslationModel/DynSAInclude/hash.h b/moses/TranslationModel/DynSAInclude/hash.h
index 03669845e..9e6cfe62a 100644
--- a/moses/TranslationModel/DynSAInclude/hash.h
+++ b/moses/TranslationModel/DynSAInclude/hash.h
@@ -11,60 +11,68 @@ typedef uint64_t P; // largest input range is 2^64
//! @todo ask abby2
template <typename T>
-class HashBase {
- protected:
- T m_; // range of hash output
- count_t H_; // number of hash functions to instantiate
- virtual void initSeeds()=0;
- virtual void freeSeeds()=0;
- public:
- HashBase(float m, count_t H=1):m_((T)m), H_(H) {
- //cerr << "range = (0..." << m_ << "]" << endl;
- }
- HashBase(FileHandler* fin) {
- load(fin);
- }
- virtual ~HashBase(){}
- virtual T hash(const char*s, count_t h)=0; // string hashing
- virtual T hash(const wordID_t* id, const int len, count_t h)=0; // vocab mapped hashing
- count_t size() { return H_;}
- virtual void save(FileHandler* fout) {
- CHECK(fout != 0);
- fout->write((char*)&m_, sizeof(m_));
- fout->write((char*)&H_, sizeof(H_));
- }
- virtual void load(FileHandler* fin) {
- CHECK(fin != 0);
- fin->read((char*)&m_, sizeof(m_));
- fin->read((char*)&H_, sizeof(H_));
- }
+class HashBase
+{
+protected:
+ T m_; // range of hash output
+ count_t H_; // number of hash functions to instantiate
+ virtual void initSeeds()=0;
+ virtual void freeSeeds()=0;
+public:
+ HashBase(float m, count_t H=1):m_((T)m), H_(H) {
+ //cerr << "range = (0..." << m_ << "]" << endl;
+ }
+ HashBase(FileHandler* fin) {
+ load(fin);
+ }
+ virtual ~HashBase() {}
+ virtual T hash(const char*s, count_t h)=0; // string hashing
+ virtual T hash(const wordID_t* id, const int len, count_t h)=0; // vocab mapped hashing
+ count_t size() {
+ return H_;
+ }
+ virtual void save(FileHandler* fout) {
+ CHECK(fout != 0);
+ fout->write((char*)&m_, sizeof(m_));
+ fout->write((char*)&H_, sizeof(H_));
+ }
+ virtual void load(FileHandler* fin) {
+ CHECK(fin != 0);
+ fin->read((char*)&m_, sizeof(m_));
+ fin->read((char*)&H_, sizeof(H_));
+ }
};
//! @todo ask abby2
template <typename T>
-class UnivHash_linear: public HashBase<T> {
- public:
- UnivHash_linear(float m, count_t H, P pr):
- HashBase<T>(m, H), pr_(pr) {
- //CHECK(isPrime(pr_));
- initSeeds();
- }
- UnivHash_linear(FileHandler* fin):
- HashBase<T>(fin) {
- load(fin);
- }
- ~UnivHash_linear() {freeSeeds();}
- T hash(const char* s, count_t h){return 0;} //not implemented
- T hash(const wordID_t* id, const int len, count_t h);
- T hash(const wordID_t id, const count_t pos,
- const T prevValue, count_t h);
- void save(FileHandler* fout);
- void load(FileHandler* fin);
- private:
- T** a_, **b_;
- P pr_;
- void initSeeds();
- void freeSeeds();
+class UnivHash_linear: public HashBase<T>
+{
+public:
+ UnivHash_linear(float m, count_t H, P pr):
+ HashBase<T>(m, H), pr_(pr) {
+ //CHECK(isPrime(pr_));
+ initSeeds();
+ }
+ UnivHash_linear(FileHandler* fin):
+ HashBase<T>(fin) {
+ load(fin);
+ }
+ ~UnivHash_linear() {
+ freeSeeds();
+ }
+ T hash(const char* s, count_t h) {
+ return 0; //not implemented
+ }
+ T hash(const wordID_t* id, const int len, count_t h);
+ T hash(const wordID_t id, const count_t pos,
+ const T prevValue, count_t h);
+ void save(FileHandler* fout);
+ void load(FileHandler* fin);
+private:
+ T** a_, **b_;
+ P pr_;
+ void initSeeds();
+ void freeSeeds();
};
/** UnivHash_noPrimes:
@@ -74,76 +82,91 @@ class UnivHash_linear: public HashBase<T> {
* # of hash function = 2^(l-1)
*/
template <typename T>
-class UnivHash_noPrimes: public HashBase<T> {
- public:
- UnivHash_noPrimes(float k, float l):
- HashBase<T>(k, 100), d_(count_t((l-k))) {
- if(((int)l >> 3) == sizeof(P)) p_ = (P) pow(2,l) - 1;
- else p_ = (P) pow(2,l);
- initSeeds();
- }
- UnivHash_noPrimes(FileHandler* fin):
- HashBase<T>(fin) {
- load(fin);
- }
- ~UnivHash_noPrimes() {freeSeeds();}
- T hash(const char* s, count_t h);
- T hash(const wordID_t* id, const int len, count_t h);
- T hash(const P x, count_t h);
- void save(FileHandler* fout);
- void load(FileHandler* fin);
- private:
- count_t d_; // l-k
- P p_, *a_; // real-valued input range, storage
- void initSeeds();
- void freeSeeds() {delete[] a_;}
+class UnivHash_noPrimes: public HashBase<T>
+{
+public:
+ UnivHash_noPrimes(float k, float l):
+ HashBase<T>(k, 100), d_(count_t((l-k))) {
+ if(((int)l >> 3) == sizeof(P)) p_ = (P) pow(2,l) - 1;
+ else p_ = (P) pow(2,l);
+ initSeeds();
+ }
+ UnivHash_noPrimes(FileHandler* fin):
+ HashBase<T>(fin) {
+ load(fin);
+ }
+ ~UnivHash_noPrimes() {
+ freeSeeds();
+ }
+ T hash(const char* s, count_t h);
+ T hash(const wordID_t* id, const int len, count_t h);
+ T hash(const P x, count_t h);
+ void save(FileHandler* fout);
+ void load(FileHandler* fin);
+private:
+ count_t d_; // l-k
+ P p_, *a_; // real-valued input range, storage
+ void initSeeds();
+ void freeSeeds() {
+ delete[] a_;
+ }
};
//! @todo ask abby2
template <typename T>
-class Hash_shiftAddXOR: public HashBase<T> {
- public:
- Hash_shiftAddXOR(float m, count_t H=5): HashBase<T>(m,H),
- l_(5), r_(2) {
- initSeeds();
- }
- ~Hash_shiftAddXOR() {freeSeeds();}
- T hash(const char* s, count_t h);
- T hash(const wordID_t* id, const int len, count_t h) {} // empty
- private:
- T* v_; // random seed storage
- const unsigned short l_, r_; // left-shift bits, right-shift bits
- void initSeeds();
- void freeSeeds() {delete[] v_;}
+class Hash_shiftAddXOR: public HashBase<T>
+{
+public:
+ Hash_shiftAddXOR(float m, count_t H=5): HashBase<T>(m,H),
+ l_(5), r_(2) {
+ initSeeds();
+ }
+ ~Hash_shiftAddXOR() {
+ freeSeeds();
+ }
+ T hash(const char* s, count_t h);
+ T hash(const wordID_t* id, const int len, count_t h) {} // empty
+private:
+ T* v_; // random seed storage
+ const unsigned short l_, r_; // left-shift bits, right-shift bits
+ void initSeeds();
+ void freeSeeds() {
+ delete[] v_;
+ }
};
//! @todo ask abby2
template <typename T>
-class UnivHash_tableXOR: public HashBase<T> {
- public:
- UnivHash_tableXOR(float m, count_t H=5): HashBase<T>(m, H),
- table_(NULL), tblLen_(255*MAX_STR_LEN) {
- initSeeds();
- }
- ~UnivHash_tableXOR() {freeSeeds();}
- T hash(const char* s, count_t h);
- T hash(const wordID_t* id, const int len, count_t h) {}
- private:
- T** table_; // storage for random numbers
- count_t tblLen_; // length of table
- void initSeeds();
- void freeSeeds();
+class UnivHash_tableXOR: public HashBase<T>
+{
+public:
+ UnivHash_tableXOR(float m, count_t H=5): HashBase<T>(m, H),
+ table_(NULL), tblLen_(255*MAX_STR_LEN) {
+ initSeeds();
+ }
+ ~UnivHash_tableXOR() {
+ freeSeeds();
+ }
+ T hash(const char* s, count_t h);
+ T hash(const wordID_t* id, const int len, count_t h) {}
+private:
+ T** table_; // storage for random numbers
+ count_t tblLen_; // length of table
+ void initSeeds();
+ void freeSeeds();
};
// ShiftAddXor
template <typename T>
-void Hash_shiftAddXOR<T>::initSeeds() {
+void Hash_shiftAddXOR<T>::initSeeds()
+{
v_ = new T[this->H_];
for(count_t i=0; i < this->H_; i++)
- v_[i] = Utils::rand<T>() + 1;
+ v_[i] = Utils::rand<T>() + 1;
}
template <typename T>
-T Hash_shiftAddXOR<T>::hash(const char* s, count_t h) {
+T Hash_shiftAddXOR<T>::hash(const char* s, count_t h)
+{
T value = v_[h];
int pos(0);
unsigned char c;
@@ -155,40 +178,44 @@ T Hash_shiftAddXOR<T>::hash(const char* s, count_t h) {
// UnivHash_tableXOR
template <typename T>
-void UnivHash_tableXOR<T>::initSeeds() {
+void UnivHash_tableXOR<T>::initSeeds()
+{
// delete any values in table
- if(table_) freeSeeds();
+ if(table_) freeSeeds();
// instance of new table
table_ = new T* [this->H_];
// fill with random values
for(count_t j=0; j < this->H_; j++) {
table_[j] = new T[tblLen_];
- for(count_t i=0; i < tblLen_; i++) {
- table_[j][i] = Utils::rand<T>(this->m_-1);
+ for(count_t i=0; i < tblLen_; i++) {
+ table_[j][i] = Utils::rand<T>(this->m_-1);
}
}
}
template <typename T>
-void UnivHash_tableXOR<T>::freeSeeds() {
+void UnivHash_tableXOR<T>::freeSeeds()
+{
for(count_t j = 0; j < this->H_; j++)
delete[] table_[j];
delete[] table_;
table_ = NULL;
}
template <typename T>
-T UnivHash_tableXOR<T>::hash(const char* s, count_t h) {
+T UnivHash_tableXOR<T>::hash(const char* s, count_t h)
+{
T value = 0;
count_t pos = 0, idx = 0;
unsigned char c;
while((c = *s++) && (++pos < MAX_STR_LEN))
value ^= table_[h][idx += c];
- CHECK(value < this->m_);
+ CHECK(value < this->m_);
return value;
}
// UnivHash_noPrimes
template <typename T>
-void UnivHash_noPrimes<T>::initSeeds() {
+void UnivHash_noPrimes<T>::initSeeds()
+{
a_ = new P[this->H_];
for(T i=0; i < this->H_; i++) {
a_[i] = Utils::rand<P>();
@@ -196,14 +223,16 @@ void UnivHash_noPrimes<T>::initSeeds() {
}
}
template <typename T>
-T UnivHash_noPrimes<T>::hash(const P x, count_t h) {
+T UnivHash_noPrimes<T>::hash(const P x, count_t h)
+{
// h_a(x) = (ax mod 2^l) div 2^(l-k)
T value = ((a_[h] * x) % p_) >> d_;
return value % this->m_;
}
template <typename T>
-T UnivHash_noPrimes<T>::hash(const wordID_t* id, const int len,
- count_t h) {
+T UnivHash_noPrimes<T>::hash(const wordID_t* id, const int len,
+ count_t h)
+{
T value = 0;
int pos(0);
while(pos < len) {
@@ -213,39 +242,42 @@ T UnivHash_noPrimes<T>::hash(const wordID_t* id, const int len,
return value % this->m_;
}
template <typename T>
-T UnivHash_noPrimes<T>::hash(const char* s, count_t h) {
+T UnivHash_noPrimes<T>::hash(const char* s, count_t h)
+{
T value = 0;
int pos(0);
unsigned char c;
while((c = *s++) && (++pos < MAX_STR_LEN)) {
- value ^= hash((P)c, h);
+ value ^= hash((P)c, h);
}
return value % this->m_;
}
template <typename T>
-void UnivHash_noPrimes<T>::save(FileHandler* fout) {
+void UnivHash_noPrimes<T>::save(FileHandler* fout)
+{
HashBase<T>::save(fout);
fout->write((char*)&p_, sizeof(p_));
fout->write((char*)&d_, sizeof(d_));
- for(T i=0; i < this->H_; i++) {
+ for(T i=0; i < this->H_; i++) {
fout->write((char*)&a_[i], sizeof(a_[i]));
}
}
template <typename T>
-void UnivHash_noPrimes<T>::load(FileHandler* fin) {
+void UnivHash_noPrimes<T>::load(FileHandler* fin)
+{
a_ = new P[this->H_];
// HashBase<T>::load(fin) already done in constructor
fin->read((char*)&p_, sizeof(p_));
fin->read((char*)&d_, sizeof(d_));
- for(T i=0; i < this->H_; i++)
- {
+ for(T i=0; i < this->H_; i++) {
fin->read((char*)&a_[i], sizeof(a_[i]));
}
}
//UnivHash_linear
template <typename T>
-void UnivHash_linear<T>::initSeeds() {
+void UnivHash_linear<T>::initSeeds()
+{
a_ = new T*[this->H_];
b_ = new T*[this->H_];
for(count_t i=0; i < this->H_; i++) {
@@ -258,7 +290,8 @@ void UnivHash_linear<T>::initSeeds() {
}
}
template <typename T>
-void UnivHash_linear<T>::freeSeeds() {
+void UnivHash_linear<T>::freeSeeds()
+{
for(count_t i=0; i < this->H_; i++) {
delete[] a_[i];
delete[] b_[i];
@@ -268,8 +301,9 @@ void UnivHash_linear<T>::freeSeeds() {
a_ = b_ = NULL;
}
template <typename T>
-inline T UnivHash_linear<T>::hash(const wordID_t* id, const int len,
- count_t h) {
+inline T UnivHash_linear<T>::hash(const wordID_t* id, const int len,
+ count_t h)
+{
CHECK(h < this->H_);
T value = 0;
int pos(0);
@@ -281,19 +315,21 @@ inline T UnivHash_linear<T>::hash(const wordID_t* id, const int len,
}
template <typename T>
inline T UnivHash_linear<T>::hash(const wordID_t id, const count_t pos,
- const T prevValue, count_t h) {
+ const T prevValue, count_t h)
+{
CHECK(h < this->H_);
T value = prevValue + ((a_[h][pos] * id) + b_[h][pos]); // % pr_;
return value % this->m_;
}
template <typename T>
-void UnivHash_linear<T>::save(FileHandler* fout) {
+void UnivHash_linear<T>::save(FileHandler* fout)
+{
// int bytes = sizeof(a_[0][0]);
HashBase<T>::save(fout);
fout->write((char*)&pr_, sizeof(pr_));
for(count_t i=0; i < this->H_; i++) {
for(count_t j=0; j < MAX_NGRAM_ORDER; j++) {
- fout->write((char*)&a_[i][j], sizeof(a_[i][j]));
+ fout->write((char*)&a_[i][j], sizeof(a_[i][j]));
fout->write((char*)&b_[i][j], sizeof(b_[i][j]));
//cout << "a[" << i << "][" << j << "]=" << a_[i][j] << endl;
//cout << "b[" << i << "][" << j << "]=" << b_[i][j] << endl;
@@ -301,7 +337,8 @@ void UnivHash_linear<T>::save(FileHandler* fout) {
}
}
template <typename T>
-void UnivHash_linear<T>::load(FileHandler* fin) {
+void UnivHash_linear<T>::load(FileHandler* fin)
+{
// HashBase<T>::load(fin) already done in constructor
fin->read((char*)&pr_, sizeof(pr_));
a_ = new T*[this->H_];
@@ -310,8 +347,8 @@ void UnivHash_linear<T>::load(FileHandler* fin) {
a_[i] = new T[MAX_NGRAM_ORDER];
b_[i] = new T[MAX_NGRAM_ORDER];
for(count_t j=0; j < MAX_NGRAM_ORDER; j++) {
- fin->read((char*)&a_[i][j], sizeof(a_[i][j]));
- fin->read((char*)&b_[i][j], sizeof(b_[i][j]));
+ fin->read((char*)&a_[i][j], sizeof(a_[i][j]));
+ fin->read((char*)&b_[i][j], sizeof(b_[i][j]));
//cout << "a[" << i << "][" << j << "]=" << a_[i][j] << endl;
//cout << "b[" << i << "][" << j << "]=" << b_[i][j] << endl;
}
diff --git a/moses/TranslationModel/DynSAInclude/onlineRLM.h b/moses/TranslationModel/DynSAInclude/onlineRLM.h
index b47cfdd0e..527f1e5d9 100644
--- a/moses/TranslationModel/DynSAInclude/onlineRLM.h
+++ b/moses/TranslationModel/DynSAInclude/onlineRLM.h
@@ -18,27 +18,28 @@ const bool strict_checks_ = false;
//! @todo ask abby2
template<typename T>
-class OnlineRLM: public PerfectHash<T> {
+class OnlineRLM: public PerfectHash<T>
+{
public:
- OnlineRLM(uint16_t MBs, int width, int bucketRange, count_t order,
- Moses::Vocab* v, float qBase = 8): PerfectHash<T>(MBs, width, bucketRange, qBase),
+ OnlineRLM(uint16_t MBs, int width, int bucketRange, count_t order,
+ Moses::Vocab* v, float qBase = 8): PerfectHash<T>(MBs, width, bucketRange, qBase),
vocab_(v), bAdapting_(false), order_(order), corpusSize_(0), alpha_(0) {
CHECK(vocab_ != 0);
//instantiate quantizer class here
cache_ = new Cache<float>(8888.8888, 9999.9999); // unknown_value, null_value
alpha_ = new float[order_ + 1];
- for(count_t i = 0; i <= order_; ++i)
+ for(count_t i = 0; i <= order_; ++i)
alpha_[i] = i * log10(0.4);
cerr << "Initialzing auxillary bit filters...\n";
bPrefix_ = new BitFilter(this->cells_);
bHit_ = new BitFilter(this->cells_);
}
- OnlineRLM(FileHandler* fin, count_t order):
+ OnlineRLM(FileHandler* fin, count_t order):
PerfectHash<T>(fin), bAdapting_(true), order_(order), corpusSize_(0) {
load(fin);
cache_ = new Cache<float>(8888.8888, 9999.9999); // unknown_value, null_value
alpha_ = new float[order_ + 1];
- for(count_t i = 0; i <= order_; ++i)
+ for(count_t i = 0; i <= order_; ++i)
alpha_[i] = i * log10(0.4);
}
~OnlineRLM() {
@@ -54,14 +55,18 @@ public:
bool insert(const std::vector<string>& ngram, const int value);
bool update(const std::vector<string>& ngram, const int value);
int query(const wordID_t* IDs, const int len);
- int sbsqQuery(const std::vector<string>& ngram, int* len,
- bool bStrict = false);
- int sbsqQuery(const wordID_t* IDs, const int len, int* codes,
- bool bStrict = false);
+ int sbsqQuery(const std::vector<string>& ngram, int* len,
+ bool bStrict = false);
+ int sbsqQuery(const wordID_t* IDs, const int len, int* codes,
+ bool bStrict = false);
void remove(const std::vector<string>& ngram);
count_t heurDelete(count_t num2del, count_t order = 5);
- uint64_t corpusSize() {return corpusSize_;}
- void corpusSize(uint64_t c) {corpusSize_ = c;}
+ uint64_t corpusSize() {
+ return corpusSize_;
+ }
+ void corpusSize(uint64_t c) {
+ corpusSize_ = c;
+ }
void clearCache() {
if(cache_) cache_->clear();
}
@@ -79,7 +84,7 @@ protected:
void markQueried(hpdEntry_t& value);
bool markPrefix(const wordID_t* IDs, const int len, bool bSet);
private:
- const void* getContext(const wordID_t* ngram, int len);
+ const void* getContext(const wordID_t* ngram, int len);
const bool bAdapting_; // used to signal adaptation of model
const count_t order_; // LM order
uint64_t corpusSize_; // total training corpus size
@@ -90,48 +95,50 @@ private:
};
template<typename T>
-bool OnlineRLM<T>::insert(const std::vector<string>& ngram, const int value) {
+bool OnlineRLM<T>::insert(const std::vector<string>& ngram, const int value)
+{
int len = ngram.size();
wordID_t wrdIDs[len];
uint64_t index(this->cells_ + 1);
- for(int i = 0; i < len; ++i)
+ for(int i = 0; i < len; ++i)
wrdIDs[i] = vocab_->GetWordID(ngram[i]);
index = PerfectHash<T>::insert(wrdIDs, len, value);
if(value > 1 && len < order_)
markPrefix(wrdIDs, ngram.size(), true); // mark context
// keep track of total items from training data minus "<s>"
- if(ngram.size() == 1 && (!bAdapting_)) // hack to not change corpusSize when adapting
+ if(ngram.size() == 1 && (!bAdapting_)) // hack to not change corpusSize when adapting
corpusSize_ += (wrdIDs[0] != vocab_->GetBOSWordID()) ? value : 0;
- if(bAdapting_ && (index < this->cells_)) // mark to keep while adapting
+ if(bAdapting_ && (index < this->cells_)) // mark to keep while adapting
markQueried(index);
return true;
}
template<typename T>
-bool OnlineRLM<T>::update(const std::vector<string>& ngram, const int value) {
+bool OnlineRLM<T>::update(const std::vector<string>& ngram, const int value)
+{
int len = ngram.size();
std::vector<wordID_t> wrdIDs(len);
uint64_t index(this->cells_ + 1);
hpdEntry_t hpdItr;
vocab_->MakeOpen();
- for(int i = 0; i < len; ++i)
+ for(int i = 0; i < len; ++i)
wrdIDs[i] = vocab_->GetWordID(ngram[i]);
- // if updating, minimize false positives by pre-checking if context already in model
- bool bIncluded(true);
+ // if updating, minimize false positives by pre-checking if context already in model
+ bool bIncluded(true);
if(value > 1 && len < (int)order_)
bIncluded = markPrefix(&wrdIDs[0], ngram.size(), true); // mark context
- if(bIncluded) { // if context found
+ if(bIncluded) { // if context found
bIncluded = PerfectHash<T>::update2(&wrdIDs[0], len, value, hpdItr, index);
if(index < this->cells_) {
markQueried(index);
- }
- else if(hpdItr != this->dict_.end()) markQueried(hpdItr);
+ } else if(hpdItr != this->dict_.end()) markQueried(hpdItr);
}
return bIncluded;
}
template<typename T>
-int OnlineRLM<T>::query(const wordID_t* IDs, int len) {
+int OnlineRLM<T>::query(const wordID_t* IDs, int len)
+{
uint64_t filterIdx = 0;
hpdEntry_t hpdItr;
int value(0);
@@ -140,8 +147,7 @@ int OnlineRLM<T>::query(const wordID_t* IDs, int len) {
if(hpdItr != this->dict_.end()) {
//markQueried(hpdItr); // mark this event as "hit"
value -= ((value & this->hitMask_) != 0) ? this->hitMask_ : 0; // check for previous hit marks
- }
- else {
+ } else {
CHECK(filterIdx < this->cells_);
//markQueried(filterIdx);
}
@@ -150,15 +156,16 @@ int OnlineRLM<T>::query(const wordID_t* IDs, int len) {
}
template<typename T>
-bool OnlineRLM<T>::markPrefix(const wordID_t* IDs, const int len, bool bSet) {
- if(len <= 1) return true; // only do this for for ngrams with context
- static Cache<int> pfCache(-1, -1); // local prefix cache
+bool OnlineRLM<T>::markPrefix(const wordID_t* IDs, const int len, bool bSet)
+{
+ if(len <= 1) return true; // only do this for for ngrams with context
+ static Cache<int> pfCache(-1, -1); // local prefix cache
int code(0);
- if(!pfCache.checkCacheNgram(IDs, len - 1, &code, NULL)) {
- hpdEntry_t hpdItr;
+ if(!pfCache.checkCacheNgram(IDs, len - 1, &code, NULL)) {
+ hpdEntry_t hpdItr;
uint64_t filterIndex(0);
code = PerfectHash<T>::query(IDs, len - 1, hpdItr, filterIndex); // hash IDs[0..len-1]
- if(code == -1) { // encountered false positive in pipeline
+ if(code == -1) { // encountered false positive in pipeline
cerr << "WARNING: markPrefix(). The O-RLM is *not* well-formed.\n";
// add all prefixes or return false;
return false;
@@ -167,10 +174,9 @@ bool OnlineRLM<T>::markPrefix(const wordID_t* IDs, const int len, bool bSet) {
CHECK(hpdItr == this->dict_.end());
if(bSet) bPrefix_->setBit(filterIndex); // mark index
else bPrefix_->clearBit(filterIndex); // unset index
- }
- else {
+ } else {
CHECK(filterIndex == this->cells_ + 1);
- //how to handle hpd prefixes?
+ //how to handle hpd prefixes?
}
if(pfCache.nodes() > 10000) pfCache.clear();
pfCache.setCacheNgram(IDs, len - 1, code, NULL);
@@ -179,39 +185,43 @@ bool OnlineRLM<T>::markPrefix(const wordID_t* IDs, const int len, bool bSet) {
}
template<typename T>
-void OnlineRLM<T>::markQueried(const uint64_t& index) {
+void OnlineRLM<T>::markQueried(const uint64_t& index)
+{
bHit_->setBit(index);
//cerr << "filter[" << index << "] = " << this->filter_->read(index) << endl;
}
template<typename T>
-void OnlineRLM<T>::markQueried(hpdEntry_t& value) {
- // set high bit of counter to indicate "hit" status
+void OnlineRLM<T>::markQueried(hpdEntry_t& value)
+{
+ // set high bit of counter to indicate "hit" status
value->second |= this->hitMask_;
}
template<typename T>
-void OnlineRLM<T>::remove(const std::vector<string>& ngram) {
+void OnlineRLM<T>::remove(const std::vector<string>& ngram)
+{
wordID_t IDs[ngram.size()];
- for(count_t i = 0; i < ngram.size(); ++i)
+ for(count_t i = 0; i < ngram.size(); ++i)
IDs[i] = vocab_->GetWordID(ngram[i]);
PerfectHash<T>::remove(IDs, ngram.size());
}
template<typename T>
-count_t OnlineRLM<T>::heurDelete(count_t num2del, count_t order) {
+count_t OnlineRLM<T>::heurDelete(count_t num2del, count_t order)
+{
count_t deleted = 0;
cout << "Deleting " << num2del << " of order "<< order << endl;
// delete from filter first
- int full = *std::max_element(this->idxTracker_, this->idxTracker_
- + this->totBuckets_);
+ int full = *std::max_element(this->idxTracker_, this->idxTracker_
+ + this->totBuckets_);
for(; full > 0; --full) // delete from fullest buckets first
- for(int bk = 0; bk < this->totBuckets_; ++bk) {
+ for(int bk = 0; bk < this->totBuckets_; ++bk) {
if(deleted >= num2del) break;
if(this->idxTracker_[bk] == full) { // if full
uint64_t first = bk * this->bucketRange_,
- last = first + this->bucketRange_;
- for(uint64_t row = first; row < last; ++row) { // check each row
+ last = first + this->bucketRange_;
+ for(uint64_t row = first; row < last; ++row) { // check each row
if(!(bHit_->testBit(row) || bPrefix_->testBit(row) )) {
if(this->filter_->read(row) != 0) {
PerfectHash<T>::remove(row); // remove from filter
@@ -231,16 +241,18 @@ count_t OnlineRLM<T>::heurDelete(count_t num2del, count_t order) {
template<typename T>
int OnlineRLM<T>::sbsqQuery(const std::vector<string>& ngram, int* codes,
- bool bStrict) {
+ bool bStrict)
+{
wordID_t IDs[ngram.size()];
- for(count_t i = 0; i < ngram.size(); ++i)
+ for(count_t i = 0; i < ngram.size(); ++i)
IDs[i] = vocab_->GetWordID(ngram[i]);
return sbsqQuery(IDs, ngram.size(), codes, bStrict);
}
template<typename T>
-int OnlineRLM<T>::sbsqQuery(const wordID_t* IDs, const int len, int* codes,
- bool bStrict) {
+int OnlineRLM<T>::sbsqQuery(const wordID_t* IDs, const int len, int* codes,
+ bool bStrict)
+{
uint64_t filterIdx = 0;
int val(0), fnd(0);
hpdEntry_t hpdItr;
@@ -252,14 +264,13 @@ int OnlineRLM<T>::sbsqQuery(const wordID_t* IDs, const int len, int* codes,
if(hpdItr != this->dict_.end()) {
val -= ((val & this->hitMask_) != 0) ? this->hitMask_ : 0; // account for previous hit marks
}
- }
- else if(bStrict) {
- break;
+ } else if(bStrict) {
+ break;
}
// add to value array
codes[i] = val > 0 ? val : 0;
}
- while(bStrict && (fnd > 1)) { // do checks the other way
+ while(bStrict && (fnd > 1)) { // do checks the other way
val = PerfectHash<T>::query(&IDs[len - fnd], fnd - 1, hpdItr, filterIdx);
if(val != -1) break; // if anything found
else --fnd; // else decrement found
@@ -269,8 +280,9 @@ int OnlineRLM<T>::sbsqQuery(const wordID_t* IDs, const int len, int* codes,
}
template<typename T>
-float OnlineRLM<T>::getProb(const wordID_t* ngram, int len,
- const void** state) {
+float OnlineRLM<T>::getProb(const wordID_t* ngram, int len,
+ const void** state)
+{
static const float oovprob = log10(1.0 / (static_cast<float>(vocab_->Size()) - 1));
float logprob(0);
const void* context = (state) ? *state : 0;
@@ -278,66 +290,66 @@ float OnlineRLM<T>::getProb(const wordID_t* ngram, int len,
if(!cache_->checkCacheNgram(ngram, len, &logprob, &context)) {
// get full prob and put in cache
int num_fnd(0), den_val(0);
- int *in = new int[len]; // in[] keeps counts of increasing order numerator
+ int *in = new int[len]; // in[] keeps counts of increasing order numerator
for(int i = 0; i < len; ++i) in[i] = 0;
for(int i = len - 1; i >= 0; --i) {
if(ngram[i] == vocab_->GetkOOVWordID()) break; // no need to query if OOV
in[i] = query(&ngram[i], len - i);
if(in[i] > 0) {
num_fnd = len - i;
- }
- else if(strict_checks_) break;
+ } else if(strict_checks_) break;
}
while(num_fnd > 1) { // get lower order count
- //get sub-context of size one less than length found (exluding target)
+ //get sub-context of size one less than length found (exluding target)
if(((den_val = query(&ngram[len - num_fnd], num_fnd - 1)) > 0) &&
(den_val >= in[len - num_fnd]) && (in[len - num_fnd] > 0)) {
break;
- }
- else --num_fnd; // else backoff to lower ngram order
+ } else --num_fnd; // else backoff to lower ngram order
}
- if(num_fnd == 1 && (in[len - 1] < 1)) // sanity check for unigrams
+ if(num_fnd == 1 && (in[len - 1] < 1)) // sanity check for unigrams
num_fnd = 0;
switch(num_fnd) { // find prob (need to refactor into precomputation)
- case 0: // OOV
- logprob = alpha_[len] + oovprob;
- break;
- case 1: // unigram found only
- CHECK(in[len - 1] > 0);
- logprob = alpha_[len - 1] + (corpusSize_ > 0 ?
- log10(static_cast<float>(in[len - 1]) / static_cast<float>(corpusSize_)) : 0);
- //logprob = alpha_[len - 1] +
- //log10(static_cast<float>(in[len - 1]) / static_cast<float>(corpusSize_));
- break;
- default:
- CHECK(den_val > 0);
- //if(subgram == in[len - found]) ++subgram; // avoid returning zero probs????
- logprob = alpha_[len - num_fnd] +
- log10(static_cast<float>(in[len - num_fnd]) / static_cast<float>(den_val));
- break;
+ case 0: // OOV
+ logprob = alpha_[len] + oovprob;
+ break;
+ case 1: // unigram found only
+ CHECK(in[len - 1] > 0);
+ logprob = alpha_[len - 1] + (corpusSize_ > 0 ?
+ log10(static_cast<float>(in[len - 1]) / static_cast<float>(corpusSize_)) : 0);
+ //logprob = alpha_[len - 1] +
+ //log10(static_cast<float>(in[len - 1]) / static_cast<float>(corpusSize_));
+ break;
+ default:
+ CHECK(den_val > 0);
+ //if(subgram == in[len - found]) ++subgram; // avoid returning zero probs????
+ logprob = alpha_[len - num_fnd] +
+ log10(static_cast<float>(in[len - num_fnd]) / static_cast<float>(den_val));
+ break;
}
// need unique context
context = getContext(&ngram[len - num_fnd], num_fnd);
// put whatever was found in cache
cache_->setCacheNgram(ngram, len, logprob, context);
} // end checkCache
- return logprob;
+ return logprob;
}
template<typename T>
-const void* OnlineRLM<T>::getContext(const wordID_t* ngram, int len) {
+const void* OnlineRLM<T>::getContext(const wordID_t* ngram, int len)
+{
int dummy(0);
float* *addresses = new float*[len]; // only interested in addresses of cache
CHECK(cache_->getCache2(ngram, len, &addresses[0], &dummy) == len);
// return address of cache node
-
+
float *addr0 = addresses[0];
free( addresses );
return (const void*)addr0;
}
template<typename T>
-void OnlineRLM<T>::randDelete(int num2del) {
+void OnlineRLM<T>::randDelete(int num2del)
+{
int deleted = 0;
for(uint64_t i = 0; i < this->cells_; i++) {
if(this->filter_->read(i) != 0) {
@@ -349,19 +361,21 @@ void OnlineRLM<T>::randDelete(int num2del) {
}
template<typename T>
-int OnlineRLM<T>::countHits() {
+int OnlineRLM<T>::countHits()
+{
int hit(0);
for(uint64_t i = 0; i < this->cells_; ++i)
if(bHit_->testBit(i)) ++hit;
iterate(this->dict_, itr)
- if((itr->second & this->hitMask_) != 0)
- ++hit;
+ if((itr->second & this->hitMask_) != 0)
+ ++hit;
cerr << "Hit count = " << hit << endl;
return hit;
}
template<typename T>
-int OnlineRLM<T>::countPrefixes() {
+int OnlineRLM<T>::countPrefixes()
+{
int pfx(0);
for(uint64_t i = 0; i < this->cells_; ++i)
if(bPrefix_->testBit(i)) ++pfx;
@@ -371,23 +385,25 @@ int OnlineRLM<T>::countPrefixes() {
}
template<typename T>
-int OnlineRLM<T>::cleanUpHPD() {
+int OnlineRLM<T>::cleanUpHPD()
+{
cerr << "HPD size before = " << this->dict_.size() << endl;
std::vector<string> vDel, vtmp;
iterate(this->dict_, itr) {
if(((itr->second & this->hitMask_) == 0) && // if not hit during testing
- (Utils::splitToStr(itr->first, vtmp, "¬") >= 3)) { // and higher order ngram
+ (Utils::splitToStr(itr->first, vtmp, "¬") >= 3)) { // and higher order ngram
vDel.push_back(itr->first);
}
}
- iterate(vDel, vitr)
- this->dict_.erase(*vitr);
+ iterate(vDel, vitr)
+ this->dict_.erase(*vitr);
cerr << "HPD size after = " << this->dict_.size() << endl;
return vDel.size();
}
template<typename T>
-void OnlineRLM<T>::clearMarkings() {
+void OnlineRLM<T>::clearMarkings()
+{
cerr << "clearing all event hits\n";
bHit_->reset();
count_t* value(0);
@@ -398,7 +414,8 @@ void OnlineRLM<T>::clearMarkings() {
}
template<typename T>
-void OnlineRLM<T>::save(FileHandler* fout) {
+void OnlineRLM<T>::save(FileHandler* fout)
+{
cerr << "Saving ORLM...\n";
// save vocab
vocab_->Save(fout);
@@ -412,7 +429,8 @@ void OnlineRLM<T>::save(FileHandler* fout) {
}
template<typename T>
-void OnlineRLM<T>::load(FileHandler* fin) {
+void OnlineRLM<T>::load(FileHandler* fin)
+{
cerr << "Loading ORLM...\n";
// load vocab first
vocab_ = new Moses::Vocab(fin);
@@ -428,12 +446,13 @@ void OnlineRLM<T>::load(FileHandler* fin) {
}
template<typename T>
-void OnlineRLM<T>::removeNonMarked() {
+void OnlineRLM<T>::removeNonMarked()
+{
cerr << "deleting all unused events\n";
int deleted(0);
for(uint64_t i = 0; i < this->cells_; ++i) {
- if(!(bHit_->testBit(i) || bPrefix_->testBit(i))
- && (this->filter_->read(i) != 0)) {
+ if(!(bHit_->testBit(i) || bPrefix_->testBit(i))
+ && (this->filter_->read(i) != 0)) {
PerfectHash<T>::remove(i);
++deleted;
}
@@ -456,36 +475,36 @@ float OnlineRLM<T>::getProb2(const wordID_t* ngram, int len, const void** state)
// constrain cache queries using model assumptions
int denom_len = cache_->getCache(ngram, len - 1, &denom_codes[0], &denom_found);
cerr << "denom_len = " << denom_len << endl;
- int num_len = cache_->getCache(&ngram[len - denom_len - 1], denom_len + 1,
+ int num_len = cache_->getCache(&ngram[len - denom_len - 1], denom_len + 1,
&num_codes[0], &found);
cerr << "num_len= " << num_len << endl;
// keed reducing ngram size until both denominator and numerator are found
// allowed to leave kUnknownCode in cache because we check for this.
found = num_len; // guaranteed to be <= denom_len + 1
// still check for OOV
- for (int i = len - found; i < len; ++i)
- if (ngram[i] == Vocab::kOOVWordID) {
+ for (int i = len - found; i < len; ++i)
+ if (ngram[i] == Vocab::kOOVWordID) {
found = len - i - 1;
}
// check for relative estimator
while(found > 1) {
- if(*denom_codes[found-1] == cache_unk_ &&
- ((*denom_codes[found-1] = query(&ngram[len-found], found-1)) == 0)) {
+ if(*denom_codes[found-1] == cache_unk_ &&
+ ((*denom_codes[found-1] = query(&ngram[len-found], found-1)) == 0)) {
//!struct_->query(&ngram[len-*found], *found-1, kMainEventIdx, denom_codes[*found-1])) {
*num_codes[found] = cache_unk_;
} else {
if(*num_codes[found] != cache_unk_ ||
((*num_codes[found] = query(&ngram[len-found], found)) <= *denom_codes[found-1]))
- // struct_->query(&ngram[len-*found], *found, kMainEventIdx,
+ // struct_->query(&ngram[len-*found], *found, kMainEventIdx,
// num_codes[*found], *denom_codes[*found-1]))
break;
- }
+ }
--found;
}
- // didn't find bigram numerator or unigram denominator
+ // didn't find bigram numerator or unigram denominator
if (found == 1)
- found = *num_codes[1] != cache_unk_
- || ((*num_codes[1] = query(&ngram[len - 1], 1)) != 0);
+ found = *num_codes[1] != cache_unk_
+ || ((*num_codes[1] = query(&ngram[len - 1], 1)) != 0);
//struct_->query(&ngram[len - 1], 1, kMainEventIdx, num_codes[1]);
// ....
// return estimate applying correct backoff score (precomputed)
@@ -496,20 +515,20 @@ float OnlineRLM<T>::getProb2(const wordID_t* ngram, int len, const void** state)
//log_prob = stupid_backoff_log10_[len] + uniform_log10prob_;
break;
case 1: // unigram over whole corpus
- log_prob = alpha_[len - 1] +
+ log_prob = alpha_[len - 1] +
log10(static_cast<float>(*num_codes[1]) / static_cast<float>(corpusSize_));
- //log_prob = log_quantiser_->getLog10Value(*num_codes[1]) - corpus_size_log10_
+ //log_prob = log_quantiser_->getLog10Value(*num_codes[1]) - corpus_size_log10_
// + stupid_backoff_log10_[len - 1]; // precomputed
break;
default: // otherwise use both statistics and (possibly zero) backoff weight
- log_prob = alpha_[len - found] +
+ log_prob = alpha_[len - found] +
log10(static_cast<float>(*num_codes[found]) / static_cast<float>(*denom_codes[found-1]));
- //log_prob = log_quantiser_->getLog10Value(*num_codes[*found ])
- // - log_quantiser_->getLog10Value(*denom_codes[*found - 1])
+ //log_prob = log_quantiser_->getLog10Value(*num_codes[*found ])
+ // - log_quantiser_->getLog10Value(*denom_codes[*found - 1])
// + stupid_backoff_log10_[len - *found];
}
context_state = (const void*)num_codes[found == len ? found - 1 : found];;
- //probCache_->store(len, log_prob, context_state);
+ //probCache_->store(len, log_prob, context_state);
if (state)
*state = context_state;
return log_prob;
diff --git a/moses/TranslationModel/DynSAInclude/params.cpp b/moses/TranslationModel/DynSAInclude/params.cpp
index 4be3a1676..a4d51d5b2 100644
--- a/moses/TranslationModel/DynSAInclude/params.cpp
+++ b/moses/TranslationModel/DynSAInclude/params.cpp
@@ -1,10 +1,11 @@
#include "params.h"
-namespace Moses {
+namespace Moses
+{
// parameter constants
const std::string Parameters::kNotSetValue = "__NOT_SET__";
-const int Parameters::kBoolValue = 0;
+const int Parameters::kBoolValue = 0;
const int Parameters::kIntValue = 1;
const int Parameters::kFloatValue = 2;
const int Parameters::kStringValue = 3;
@@ -13,26 +14,30 @@ const int Parameters::kUndefinedValue = -1;
const std::string Parameters::kTrueValue = "1";
const std::string Parameters::kFalseValue = "0";
-Parameters::Parameters(const ParamDefs * paramdefs, const count_t paramNum) {
+Parameters::Parameters(const ParamDefs * paramdefs, const count_t paramNum)
+{
initialize(paramdefs, paramNum);
}
-Parameters::Parameters(int argc, char ** argv, const ParamDefs * paramdefs,
- const count_t paramNum) {
+Parameters::Parameters(int argc, char ** argv, const ParamDefs * paramdefs,
+ const count_t paramNum)
+{
initialize(paramdefs, paramNum);
loadParams(argc, argv);
}
-void Parameters::initialize(const ParamDefs * paramdefs, const count_t paramNum) {
+void Parameters::initialize(const ParamDefs * paramdefs, const count_t paramNum)
+{
for( count_t i = 0; i < paramNum; i++ ) {
params_[paramdefs[i].name] = paramdefs[i]; // assign name
}
cerr << "Default parameter values:\n";
- iterate(params_, itr)
- cerr << "\t" << itr->first << " --> " << itr->second.value << endl;
+ iterate(params_, itr)
+ cerr << "\t" << itr->first << " --> " << itr->second.value << endl;
}
-bool Parameters::loadParams(int argc, char ** argv) {
+bool Parameters::loadParams(int argc, char ** argv)
+{
// load params from commandline args
//if( argc < 3 ) {
// fprintf(stderr, "ERROR: No parameters. Use \"-config\" or \"-f\" to specify configuration file.\n");
@@ -66,7 +71,7 @@ bool Parameters::loadParams(int argc, char ** argv) {
std::string val = argv[i+1];
Utils::trim(val);
if( param == "config" )
- load_from_file = true;
+ load_from_file = true;
if(!setParamValue(param, val)) {
std::cerr << "Invalid Param name->value " << param << "->" << val << std::endl;
return false;
@@ -80,35 +85,40 @@ bool Parameters::loadParams(int argc, char ** argv) {
return success;
}
-std::string Parameters::normaliseParamName(const std::string & name) {
+std::string Parameters::normaliseParamName(const std::string & name)
+{
// Map valid abbreviations to long names. Retain other names.
if( params_.find(name) == params_.end() )
- iterate(params_, i)
- if( i->second.abbrev == name )
- return i->first;
+ iterate(params_, i)
+ if( i->second.abbrev == name )
+ return i->first;
return name;
}
-int Parameters::getValueType(const std::string& name) {
+int Parameters::getValueType(const std::string& name)
+{
if(params_.find(name) != params_.end())
return params_[name].type;
return Parameters::kUndefinedValue;
}
-bool Parameters::isValidParamName(const std::string & name) {
- return params_.find(name) != params_.end();
+bool Parameters::isValidParamName(const std::string & name)
+{
+ return params_.find(name) != params_.end();
}
-bool Parameters::setParamValue(const std::string& name, const std::string& val) {
- // TODO: Add basic type checking w verifyValueType()
- bool set = isValidParamName(name);
- if(set) {
- params_[name].value = val;
+bool Parameters::setParamValue(const std::string& name, const std::string& val)
+{
+ // TODO: Add basic type checking w verifyValueType()
+ bool set = isValidParamName(name);
+ if(set) {
+ params_[name].value = val;
std::cerr << "PARAM SET: "<< name << "=" << val << std::endl;
}
return( set );
}
-std::string Parameters::getParamValue(const std::string& name) {
+std::string Parameters::getParamValue(const std::string& name)
+{
std::string value = Parameters::kNotSetValue;
if(isValidParamName(name))
if(params_.find(name) != params_.end())
@@ -117,43 +127,46 @@ std::string Parameters::getParamValue(const std::string& name) {
value = kFalseValue;
return value;
}
-std::string Parameters::getParam(const std::string& name) {
+std::string Parameters::getParam(const std::string& name)
+{
return getParamValue(name);
-/*void* Parameters::getParam(const std::string& name) {
- void* paramVal = 0;
- int type = getValueType(name);
- const char* sval = getParamValue(name).c_str();
- switch(type) {
- case kIntValue: {
- int ival = atoi(sval);
- paramVal = (void*)&ival;
- break;
- }
- case kFloatValue: {
- float fval = atof(sval);
- paramVal = (void*)&fval;
- break;
- }
- case kStringValue: {
- paramVal = (void*)sval;
- break;
- }
- case kBoolValue: {
- bool bval = sval == Parameters::kTrueValue ? true : false;
- paramVal = (void*)&bval;
- break;
+ /*void* Parameters::getParam(const std::string& name) {
+ void* paramVal = 0;
+ int type = getValueType(name);
+ const char* sval = getParamValue(name).c_str();
+ switch(type) {
+ case kIntValue: {
+ int ival = atoi(sval);
+ paramVal = (void*)&ival;
+ break;
+ }
+ case kFloatValue: {
+ float fval = atof(sval);
+ paramVal = (void*)&fval;
+ break;
+ }
+ case kStringValue: {
+ paramVal = (void*)sval;
+ break;
+ }
+ case kBoolValue: {
+ bool bval = sval == Parameters::kTrueValue ? true : false;
+ paramVal = (void*)&bval;
+ break;
+ }
+ default: // --> Parameters::kUndefinedValue
+ paramVal = (void*)sval; // will set to Parameters::kNotSetValue
}
- default: // --> Parameters::kUndefinedValue
- paramVal = (void*)sval; // will set to Parameters::kNotSetValue
- }
- return paramVal;*/
+ return paramVal;*/
}
-bool Parameters::verifyValueType(const std::string& name, const std::string& val) {
+bool Parameters::verifyValueType(const std::string& name, const std::string& val)
+{
// Implement basic type checking
return true;
}
-int Parameters::getParamCount() const {
+int Parameters::getParamCount() const
+{
return params_.size();
}
@@ -161,7 +174,8 @@ int Parameters::getParamCount() const {
* HAVE TO CHANGE loadParams() from file to not overwrite command lines but
* override default if different*/
bool Parameters::loadParams(const std::string & file_path,
- std::set<std::string>& setParams) {
+ std::set<std::string>& setParams)
+{
// parameters loaded from file don't override cmd line paramters
/*std::set<std::string>::iterator end = setParams.end();
FileHandler file(file_path.c_str(), std::ios::in);
diff --git a/moses/TranslationModel/DynSAInclude/params.h b/moses/TranslationModel/DynSAInclude/params.h
index d5af6331d..efc0a6ba3 100644
--- a/moses/TranslationModel/DynSAInclude/params.h
+++ b/moses/TranslationModel/DynSAInclude/params.h
@@ -10,21 +10,23 @@
#include "utils.h"
#include "types.h"
-#define NumOfParams(paramArray) (sizeof(paramArray)/sizeof((paramArray)[0]))
+#define NumOfParams(paramArray) (sizeof(paramArray)/sizeof((paramArray)[0]))
-namespace Moses {
+namespace Moses
+{
typedef struct ParamDefs {
std::string name;
- std::string value;
+ std::string value;
std::string abbrev;
int type;
std::string description;
} ParamDefs;
- //! @todo ask abby2
-class Parameters {
+//! @todo ask abby2
+class Parameters
+{
public:
- static const std::string kNotSetValue;
+ static const std::string kNotSetValue;
static const int kBoolValue;
static const int kIntValue;
static const int kFloatValue;
@@ -32,15 +34,15 @@ public:
static const int kUndefinedValue;
static const std::string kFalseValue;
static const std::string kTrueValue;
-
+
Parameters(const ParamDefs * paramdefs, const count_t paramNum);
Parameters(int argc, char** argv, const ParamDefs * paramdefs, const count_t paramNum);
~Parameters() {}
bool loadParams(int argc, char ** argv);
bool loadParams(const std::string& param_file, std::set<std::string>&);
int getValueType(const std::string & name);
- bool setParamValue(const std::string& name, const std::string& value);
- bool verifyValueType(const std::string& name, const std::string& value);
+ bool setParamValue(const std::string& name, const std::string& value);
+ bool verifyValueType(const std::string& name, const std::string& value);
bool isValidParamName(const std::string & name);
std::string getParamValue(const std::string& name);
//void* getParam(const std::string& name);
diff --git a/moses/TranslationModel/DynSAInclude/perfectHash.h b/moses/TranslationModel/DynSAInclude/perfectHash.h
index f445e063a..8ea20fa06 100644
--- a/moses/TranslationModel/DynSAInclude/perfectHash.h
+++ b/moses/TranslationModel/DynSAInclude/perfectHash.h
@@ -9,18 +9,19 @@
#include "quantizer.h"
/**
- * PerfectHash handles setting up hash functions and storage
- * for LM data.
- */
+ * PerfectHash handles setting up hash functions and storage
+ * for LM data.
+ */
using randlm::Filter;
using randlm::BitFilter;
typedef std::map<string, count_t> hpDict_t;
typedef hpDict_t::iterator hpdEntry_t;
static count_t collisions_ = 0;
-/* Based on Mortenson et. al. 2006 */
+/* Based on Mortenson et. al. 2006 */
template<typename T>
-class PerfectHash {
+class PerfectHash
+{
public:
PerfectHash(uint16_t MBs, int width, int bucketRange, float qBase);
PerfectHash(FileHandler* fin) {
@@ -41,11 +42,11 @@ protected:
uint8_t* idxTracker_;
uint64_t insert(const wordID_t* IDs, const int len, const count_t value);
bool update(const wordID_t* IDs, const int len, const count_t value,
- hpdEntry_t& hpdAddr, uint64_t& filterIdx);
+ hpdEntry_t& hpdAddr, uint64_t& filterIdx);
bool update2(const wordID_t* IDs, const int len, const count_t value,
- hpdEntry_t& hpdAddr, uint64_t& filterIdx);
- int query(const wordID_t* IDs, const int len,
- hpdEntry_t& hpdAddr, uint64_t& filterIdx);
+ hpdEntry_t& hpdAddr, uint64_t& filterIdx);
+ int query(const wordID_t* IDs, const int len,
+ hpdEntry_t& hpdAddr, uint64_t& filterIdx);
virtual void remove(const wordID_t* IDs, const int len);
void remove(uint64_t index);
void save(FileHandler* fout);
@@ -54,33 +55,34 @@ protected:
//pointer to a specific entry in a hpDict_t
virtual void markQueried(hpdEntry_t&)=0;
private:
- T nonZeroSignature(const wordID_t* IDs, const int len, count_t bucket);
+ T nonZeroSignature(const wordID_t* IDs, const int len, count_t bucket);
string hpDictKeyValue(const wordID_t* IDs, const int len);
uint64_t memBound_; // total memory bound in bytes
uint16_t cellWidth_; // in bits
- UnivHash_linear<count_t>* bucketHash_;
+ UnivHash_linear<count_t>* bucketHash_;
UnivHash_linear<T>* fingerHash_;
LogQtizer* qtizer_;
};
template<typename T>
-PerfectHash<T>::PerfectHash(uint16_t MBs, int width, int bucketRange,
- float qBase): hitMask_(1 << 31), memBound_(MBs * (1ULL << 20)),
- cellWidth_(width) {
+PerfectHash<T>::PerfectHash(uint16_t MBs, int width, int bucketRange,
+ float qBase): hitMask_(1 << 31), memBound_(MBs * (1ULL << 20)),
+ cellWidth_(width)
+{
bucketRange_ = static_cast<uint8_t>(bucketRange);
if(bucketRange > 255) {
- cerr << "ERROR: Max bucket range is > 2^8\n";
+ cerr << "ERROR: Max bucket range is > 2^8\n";
exit(1);
}
qtizer_ = new LogQtizer(qBase);
int valBits = (int)ceil(log2((float)qtizer_->maxcode()));
cerr << "BITS FOR VALUES ARRAY = " << valBits << endl;
uint64_t totalBits = memBound_ << 3;
- cells_ = (uint64_t) ceil((float)totalBits / (float)(cellWidth_ + valBits)); // upper bound on cells
+ cells_ = (uint64_t) ceil((float)totalBits / (float)(cellWidth_ + valBits)); // upper bound on cells
cells_ += (cells_ % bucketRange_); // make cells multiple of bucket range
totBuckets_ = (cells_ / bucketRange_) - 1; // minus 1 so totBuckets * bucksize + bucksize = cells
filter_ = new Filter<T>(cells_, cellWidth_);
- values_ = new Filter<T>(cells_, valBits);
+ values_ = new Filter<T>(cells_, valBits);
idxTracker_ = new uint8_t[totBuckets_];
for(int i=0; i < totBuckets_; ++i) idxTracker_[i] = 0;
// initialize ranges for each hash function
@@ -89,7 +91,8 @@ PerfectHash<T>::PerfectHash(uint16_t MBs, int width, int bucketRange,
}
template<typename T>
-PerfectHash<T>::~PerfectHash() {
+PerfectHash<T>::~PerfectHash()
+{
delete[] idxTracker_;
delete filter_;
filter_ = NULL;
@@ -99,22 +102,22 @@ PerfectHash<T>::~PerfectHash() {
delete values_;
}
-template<typename T>
-uint64_t PerfectHash<T>::insert(const wordID_t* IDs, const int len,
- const count_t value) {
+template<typename T>
+uint64_t PerfectHash<T>::insert(const wordID_t* IDs, const int len,
+ const count_t value)
+{
count_t bucket = (bucketHash_->size() > 1 ? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len, 0));
- if(idxTracker_[bucket] < (int)bucketRange_) { // if empty rows
+ if(idxTracker_[bucket] < (int)bucketRange_) { // if empty rows
// restriction on fprint value is non-zero
T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
uint64_t emptyidx = cells_ + 1;
uint64_t index = bucket * bucketRange_, // starting bucket row
- lastrow = index + bucketRange_; // ending row
- while(index < lastrow) { // unique so check each row for "matching" signature
+ lastrow = index + bucketRange_; // ending row
+ while(index < lastrow) { // unique so check each row for "matching" signature
T filterVal = filter_->read(index);
- if((filterVal == 0) && (emptyidx == cells_ + 1)) { // record first empty row
+ if((filterVal == 0) && (emptyidx == cells_ + 1)) { // record first empty row
emptyidx = index;
- }
- else if(filterVal == fp) {
+ } else if(filterVal == fp) {
++collisions_;
dict_[hpDictKeyValue(IDs, len)] = value; // store exact in hpd
return cells_ + 1; // finished
@@ -127,21 +130,21 @@ uint64_t PerfectHash<T>::insert(const wordID_t* IDs, const int len,
values_->write(emptyidx, code);
++idxTracker_[bucket]; // keep track of bucket size
return emptyidx;
- }
- else { // bucket is full
+ } else { // bucket is full
dict_[hpDictKeyValue(IDs, len)] = value; // add to hpd
return cells_ + 1;
}
}
-template<typename T>
-bool PerfectHash<T>::update(const wordID_t* IDs, const int len,
- const count_t value, hpdEntry_t& hpdAddr, uint64_t& filterIdx) {
+template<typename T>
+bool PerfectHash<T>::update(const wordID_t* IDs, const int len,
+ const count_t value, hpdEntry_t& hpdAddr, uint64_t& filterIdx)
+{
// check if key is in high perf. dictionary
filterIdx = cells_ + 1;
string skey = hpDictKeyValue(IDs, len);
if((hpdAddr = dict_.find(skey)) != dict_.end()) {
- hpdAddr->second = value;
+ hpdAddr->second = value;
return true;
}
// else hash ngram
@@ -150,45 +153,45 @@ bool PerfectHash<T>::update(const wordID_t* IDs, const int len,
// restriction on fprint value is non-zero
T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
uint64_t index = bucket * bucketRange_, // starting bucket row
- lastrow = index + bucketRange_;
+ lastrow = index + bucketRange_;
while(index < lastrow) { // must check each row for matching fp event
T filterVal = filter_->read(index);
if(filterVal == fp) { // found event w.h.p.
- values_->write(index, (T)qtizer_->code(value));
+ values_->write(index, (T)qtizer_->code(value));
filterIdx = index;
return true;
}
++index;
}
- // could add if it gets here.
+ // could add if it gets here.
return false;
}
-template<typename T>
-int PerfectHash<T>::query(const wordID_t* IDs, const int len,
- hpdEntry_t& hpdAddr, uint64_t& filterIdx) {
+template<typename T>
+int PerfectHash<T>::query(const wordID_t* IDs, const int len,
+ hpdEntry_t& hpdAddr, uint64_t& filterIdx)
+{
// check if key is in high perf. dictionary
string skey = hpDictKeyValue(IDs, len);
if((hpdAddr = dict_.find(skey)) != dict_.end()) {
filterIdx = cells_ + 1;
return(hpdAddr->second); // returns copy of value
- }
- else { // check if key is in filter
- // get bucket
+ } else { // check if key is in filter
+ // get bucket
//count_t bucket = bucketHash_->hash(IDs, len);
count_t bucket = (bucketHash_->size() > 1 ? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len, 0));
// restriction on fprint value is non-zero
T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
// return value if ngram is in filter
uint64_t index = bucket * bucketRange_,
- lastrow = index + bucketRange_;
+ lastrow = index + bucketRange_;
for(; index < lastrow; ++index) {
if(filter_->read(index) == fp) {
- //cout << "fp = " << fp << "\tbucket = " << bucket << "\tfilter =" <<
- //filter_->read(index) << "\tcode = " << code << endl;
+ //cout << "fp = " << fp << "\tbucket = " << bucket << "\tfilter =" <<
+ //filter_->read(index) << "\tcode = " << code << endl;
filterIdx = index;
hpdAddr = dict_.end();
- return (int)qtizer_->value(values_->read(index));
+ return (int)qtizer_->value(values_->read(index));
}
}
}
@@ -196,22 +199,23 @@ int PerfectHash<T>::query(const wordID_t* IDs, const int len,
}
template<typename T>
-void PerfectHash<T>::remove(const wordID_t* IDs, const int len) {
+void PerfectHash<T>::remove(const wordID_t* IDs, const int len)
+{
// delete key if in high perf. dictionary
string skey = hpDictKeyValue(IDs, len);
if(dict_.find(skey) != dict_.end())
dict_.erase(skey);
else { // check if key is in filter
- // get small representation for ngrams
+ // get small representation for ngrams
//count_t bucket = bucketHash_->hash(IDs, len);
count_t bucket = (bucketHash_->size() > 1? bucketHash_->hash(IDs, len, len) : bucketHash_->hash(IDs, len, 0));
// retrieve non zero fingerprint for ngram
- T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
+ T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
// return value if ngram is in filter
uint64_t index = bucket * bucketRange_,
- lastrow = index + bucketRange_;
+ lastrow = index + bucketRange_;
for(; index < lastrow; ++index) {
- if(filter_->read(index) == fp) {
+ if(filter_->read(index) == fp) {
filter_->write(index, 0);
values_->write(index, 0);
--idxTracker_[bucket]; // track bucket size reduction
@@ -222,7 +226,8 @@ void PerfectHash<T>::remove(const wordID_t* IDs, const int len) {
}
template<typename T> // clear filter index
-void PerfectHash<T>::remove(uint64_t index) {
+void PerfectHash<T>::remove(uint64_t index)
+{
CHECK(index < cells_);
CHECK(filter_->read(index) != 0); // slow
filter_->write(index, 0);
@@ -234,20 +239,22 @@ void PerfectHash<T>::remove(uint64_t index) {
template<typename T>
T PerfectHash<T>::nonZeroSignature(const wordID_t* IDs, const int len,
- count_t bucket) {
+ count_t bucket)
+{
count_t h = bucket;
T fingerprint(0);
do {
fingerprint = fingerHash_->hash(IDs, len, h);
- h += (h < fingerHash_->size() - 1 ? 1 : -h); // wrap around
+ h += (h < fingerHash_->size() - 1 ? 1 : -h); // wrap around
} while((fingerprint == 0) && (h != bucket));
- if(fingerprint == 0)
+ if(fingerprint == 0)
cerr << "WARNING: Unable to find non-zero signature for ngram\n" << endl;
return fingerprint;
}
template<typename T>
-string PerfectHash<T>::hpDictKeyValue(const wordID_t* IDs, const int len) {
+string PerfectHash<T>::hpDictKeyValue(const wordID_t* IDs, const int len)
+{
string skey(" ");
for(int i = 0; i < len; ++i)
skey += Utils::IntToStr(IDs[i]) + "¬";
@@ -256,19 +263,22 @@ string PerfectHash<T>::hpDictKeyValue(const wordID_t* IDs, const int len) {
}
template<typename T>
-count_t PerfectHash<T>::hpDictMemUse() {
+count_t PerfectHash<T>::hpDictMemUse()
+{
// return hpDict memory usage in MBs
return (count_t) sizeof(hpDict_t::value_type)* dict_.size() >> 20;
}
template<typename T>
-count_t PerfectHash<T>::bucketsMemUse() {
+count_t PerfectHash<T>::bucketsMemUse()
+{
// return bucket memory usage in MBs
- return (count_t) (filter_->size() + values_->size());
+ return (count_t) (filter_->size() + values_->size());
}
template<typename T>
-void PerfectHash<T>::save(FileHandler* fout) {
+void PerfectHash<T>::save(FileHandler* fout)
+{
CHECK(fout != 0);
cerr << "\tSaving perfect hash parameters...\n";
fout->write((char*)&hitMask_, sizeof(hitMask_));
@@ -289,12 +299,13 @@ void PerfectHash<T>::save(FileHandler* fout) {
count_t size = dict_.size();
fout->write((char*)&size, sizeof(count_t));
*fout << endl;
- iterate(dict_, t)
- *fout << t->first << "\t" << t->second << "\n";
+ iterate(dict_, t)
+ *fout << t->first << "\t" << t->second << "\n";
}
template<typename T>
-void PerfectHash<T>::load(FileHandler* fin) {
+void PerfectHash<T>::load(FileHandler* fin)
+{
CHECK(fin != 0);
cerr << "\tLoading perfect hash parameters...\n";
fin->read((char*)&hitMask_, sizeof(hitMask_));
@@ -331,12 +342,13 @@ void PerfectHash<T>::load(FileHandler* fin) {
}
template<typename T>
-void PerfectHash<T>::analyze() {
+void PerfectHash<T>::analyze()
+{
cerr << "Analyzing Dynamic Bloomier Filter...\n";
// see how many items in each bucket
uint8_t* bucketCnt = new uint8_t[totBuckets_];
- unsigned largestBucket = 0, totalCellsSet = 0,
- smallestBucket = bucketRange_, totalZeroes = 0;
+ unsigned largestBucket = 0, totalCellsSet = 0,
+ smallestBucket = bucketRange_, totalZeroes = 0;
int curBucket = -1, fullBuckets(0);
for(int i = 0; i < totBuckets_; ++i) bucketCnt[i] = 0;
for(uint64_t i =0; i < cells_; ++i) {
@@ -344,16 +356,14 @@ void PerfectHash<T>::analyze() {
if(filter_->read(i) != 0) {
++bucketCnt[curBucket];
++totalCellsSet;
- }
- else ++totalZeroes;
+ } else ++totalZeroes;
}
count_t bi = 0, si = 0;
for(int i = 0; i < totBuckets_; ++i) {
if(bucketCnt[i] > largestBucket) {
largestBucket = bucketCnt[i];
bi = i;
- }
- else if(bucketCnt[i] < smallestBucket) {
+ } else if(bucketCnt[i] < smallestBucket) {
smallestBucket = bucketCnt[i];
si = i;
}
@@ -366,8 +376,8 @@ void PerfectHash<T>::analyze() {
}
for(int i = 0; i < totBuckets_; ++i) {
if(bucketCnt[i] != idxTracker_[i])
- cerr << "bucketCnt[" << i << "] = " << (int)bucketCnt[i] <<
- "\tidxTracker_[" << i << "] = " << (int)idxTracker_[i] << endl;
+ cerr << "bucketCnt[" << i << "] = " << (int)bucketCnt[i] <<
+ "\tidxTracker_[" << i << "] = " << (int)idxTracker_[i] << endl;
}
cerr << "total cells= " << cells_ << endl;
cerr << "total buckets= " << totBuckets_ << endl;
@@ -380,7 +390,7 @@ void PerfectHash<T>::analyze() {
cerr << "largest bucket (" << bi << ") size= " << largestBucket << endl;
cerr << "smallest bucket (" << si << ") size= " << smallestBucket << endl;
cerr << "last bucket size= " << (int)bucketCnt[totBuckets_ - 1] <<
- " (idxTracker last bucket size = " << (int)idxTracker_[totBuckets_ - 1] << ")" << endl;
+ " (idxTracker last bucket size = " << (int)idxTracker_[totBuckets_ - 1] << ")" << endl;
cerr << "total buckets full = " << fullBuckets << endl;
cerr << "total collision errors= " << collisions_ << endl;
cerr << "high performance dictionary size= " << dict_.size() << endl;
@@ -390,14 +400,15 @@ void PerfectHash<T>::analyze() {
delete[] bucketCnt;
}
-template<typename T>
-bool PerfectHash<T>::update2(const wordID_t* IDs, const int len,
- const count_t value, hpdEntry_t& hpdAddr, uint64_t& filterIdx) {
+template<typename T>
+bool PerfectHash<T>::update2(const wordID_t* IDs, const int len,
+ const count_t value, hpdEntry_t& hpdAddr, uint64_t& filterIdx)
+{
// check if key is in high perf. dictionary
filterIdx = cells_ + 1;
string skey = hpDictKeyValue(IDs, len);
if((hpdAddr = dict_.find(skey)) != dict_.end()) {
- hpdAddr->second += value;
+ hpdAddr->second += value;
return true;
}
// else hash ngram
@@ -406,18 +417,18 @@ bool PerfectHash<T>::update2(const wordID_t* IDs, const int len,
// restriction on fprint value is non-zero
T fp = nonZeroSignature(IDs, len, (bucket % MAX_HASH_FUNCS));
uint64_t index = bucket * bucketRange_, // starting bucket row
- lastrow = index + bucketRange_;
+ lastrow = index + bucketRange_;
while(index < lastrow) { // must check each row for matching fp event
T filterVal = filter_->read(index);
if(filterVal == fp) { // found event w.h.p.
- int oldval = (int)qtizer_->value(values_->read(index));
- values_->write(index, (T)qtizer_->code(oldval + value));
+ int oldval = (int)qtizer_->value(values_->read(index));
+ values_->write(index, (T)qtizer_->code(oldval + value));
filterIdx = index;
return true;
}
++index;
}
- // add if it gets here.
+ // add if it gets here.
insert(IDs, len, value);
return false;
}
diff --git a/moses/TranslationModel/DynSAInclude/quantizer.h b/moses/TranslationModel/DynSAInclude/quantizer.h
index 6c6850fa6..68d6a55a3 100644
--- a/moses/TranslationModel/DynSAInclude/quantizer.h
+++ b/moses/TranslationModel/DynSAInclude/quantizer.h
@@ -14,7 +14,8 @@ static const float kFloatErr = 0.00001f;
#endif
//! @todo ask abby2
-class LogQtizer {
+class LogQtizer
+{
public:
LogQtizer(float i): base_(pow(2, 1 / i)) {
CHECK(base_ > 1);
@@ -22,8 +23,8 @@ public:
float value = 1; // code = 1 -> value = 1 for any base
std::vector<float> code_to_value_vec;
while (log2(value) < 30) { // assume 2^30 is largest count
- code_to_value_vec.push_back(value);
- value = pow(base_, ++max_code_);
+ code_to_value_vec.push_back(value);
+ value = pow(base_, ++max_code_);
}
code_to_value_vec.push_back(value); // store max_code_ so in total [0, max_code_]
// get valid range
@@ -46,22 +47,22 @@ public:
int code(float value) {
// should just be: return log_b(value)
CHECK(!(value < min_value_ || value > max_value_));
- // but binary search removes errors due to floor operator above
- int code = static_cast<int>(std::lower_bound(code_to_value_, code_to_value_+ max_code_,
- value) - code_to_value_);
- // make sure not overestimating
+ // but binary search removes errors due to floor operator above
+ int code = static_cast<int>(std::lower_bound(code_to_value_, code_to_value_+ max_code_,
+ value) - code_to_value_);
+ // make sure not overestimating
code = code_to_value_[code] > value ? code - 1 : code;
return code;
}
inline float value(int code) {
- // table look up for values
+ // table look up for values
return code_to_value_[code];
}
inline int maxcode() {
return max_code_;
}
inline float logValue(int code) {
- // table look up for log of values
+ // table look up for log of values
return code_to_log_value_[code];
}
~LogQtizer() {
@@ -75,15 +76,15 @@ public:
fout->write((char*)&min_value_, sizeof(min_value_));
for (int j = 0; j <= max_code_; ++j)
fout->write((char*)&code_to_value_[j], sizeof(code_to_value_[j]));
- for (int j = 0; j <= max_code_; ++j)
+ for (int j = 0; j <= max_code_; ++j)
fout->write((char*)&code_to_log_value_[j], sizeof(code_to_log_value_[j]));
std::cerr << "Saved log codebook with " << max_code_ + 1 << " codes." <<std::endl;
}
private:
float base_;
- float* code_to_value_;
+ float* code_to_value_;
float* code_to_log_value_;
- int max_code_;
+ int max_code_;
float max_value_;
float min_value_;
void load(FileHandler* fin) {
diff --git a/moses/TranslationModel/DynSAInclude/vocab.cpp b/moses/TranslationModel/DynSAInclude/vocab.cpp
index 27e052260..da1443f66 100644
--- a/moses/TranslationModel/DynSAInclude/vocab.cpp
+++ b/moses/TranslationModel/DynSAInclude/vocab.cpp
@@ -103,10 +103,11 @@ bool Vocab::Load(const std::string & vocab_path, const FactorDirection& directio
std::cerr << "Loading vocab from " << vocab_path << std::endl;
return Load(&vcbin, direction, factors, closed);
}
-bool Vocab::Load(FileHandler* vcbin) {
+bool Vocab::Load(FileHandler* vcbin)
+{
FactorList factors;
factors.push_back(0);
- return Load(vcbin, Input, factors);
+ return Load(vcbin, Input, factors);
}
bool Vocab::Load(FileHandler* vcbin, const FactorDirection& direction,
const FactorList& factors, bool closed)
diff --git a/moses/TranslationModel/DynSAInclude/vocab.h b/moses/TranslationModel/DynSAInclude/vocab.h
index 0c0d50a07..18ff96c8d 100644
--- a/moses/TranslationModel/DynSAInclude/vocab.h
+++ b/moses/TranslationModel/DynSAInclude/vocab.h
@@ -11,7 +11,7 @@
namespace Moses
{
-
+
//! Vocab maps between strings and uint32 ids.
class Vocab
{
diff --git a/moses/TranslationModel/DynSuffixArray.cpp b/moses/TranslationModel/DynSuffixArray.cpp
index 7d5847cc6..c5fddf3f0 100644
--- a/moses/TranslationModel/DynSuffixArray.cpp
+++ b/moses/TranslationModel/DynSuffixArray.cpp
@@ -74,12 +74,12 @@ int DynSuffixArray::F_firstIdx(unsigned word)
// return index of first row where word is found in m_F
/*for(int i=0; i < m_F->size(); ++i) {
if(m_F->at(i) == word) {
- return i;
+ return i;
}
}
return -1;*/
- //NOTE: lower_bound is faster than linear search above but may cause issues
- // if ordering of vocab is not consecutive (ie..after deletions)
+ //NOTE: lower_bound is faster than linear search above but may cause issues
+ // if ordering of vocab is not consecutive (ie..after deletions)
int low = std::lower_bound(m_F->begin(), m_F->end(), word) - m_F->begin();
//cerr << "in F_firstIdx with word = " << word << " and low = " << low << " and F->size() =" << m_F->size() << endl;
if((size_t)low >= m_F->size())
@@ -146,8 +146,8 @@ void DynSuffixArray::Reorder(unsigned j, unsigned jprime)
{
set<pair<unsigned, unsigned> > seen;
while(j != jprime) {
- // this 'seenit' check added for data with many loops. will remove after double
- // checking.
+ // this 'seenit' check added for data with many loops. will remove after double
+ // checking.
bool seenit = seen.insert(std::make_pair(j, jprime)).second;
if(seenit) {
for(size_t i=1; i < m_SA->size(); ++i) {
@@ -163,9 +163,9 @@ void DynSuffixArray::Reorder(unsigned j, unsigned jprime)
int new_j = LastFirstFunc(j);
CHECK(j <= jprime);
// for SA and L, the element at pos j is moved to pos j'
- m_L->insert(m_L->begin() + jprime + 1, m_L->at(j));
+ m_L->insert(m_L->begin() + jprime + 1, m_L->at(j));
m_L->erase(m_L->begin() + j);
- m_SA->insert(m_SA->begin() + jprime + 1, m_SA->at(j));
+ m_SA->insert(m_SA->begin() + jprime + 1, m_SA->at(j));
m_SA->erase(m_SA->begin() + j);
// all ISA values between (j...j'] decremented
for(size_t i = 0; i < m_ISA->size(); ++i) {
diff --git a/moses/TranslationModel/PhraseDictionary.cpp b/moses/TranslationModel/PhraseDictionary.cpp
index 808f7ce81..a0c94ccdc 100644
--- a/moses/TranslationModel/PhraseDictionary.cpp
+++ b/moses/TranslationModel/PhraseDictionary.cpp
@@ -31,7 +31,7 @@ namespace Moses
{
PhraseDictionary::PhraseDictionary(const std::string &description, const std::string &line)
-:DecodeFeature(description, line)
+ :DecodeFeature(description, line)
{
m_tableLimit= 20; // TODO default?
@@ -40,20 +40,15 @@ PhraseDictionary::PhraseDictionary(const std::string &description, const std::st
if (args[0] == "num-input-features") {
m_numInputScores = Scan<unsigned>(args[1]);
- }
- else if (args[0] == "path") {
+ } else if (args[0] == "path") {
m_filePath = args[1];
- }
- else if (args[0] == "table-limit") {
+ } else if (args[0] == "table-limit") {
m_tableLimit = Scan<size_t>(args[1]);
- }
- else if (args[0] == "target-path") {
+ } else if (args[0] == "target-path") {
m_targetFile = args[1];
- }
- else if (args[0] == "alignment-path") {
+ } else if (args[0] == "alignment-path") {
m_alignmentsFile = args[1];
- }
- else {
+ } else {
//throw "Unknown argument " + args[0];
}
} // for (size_t i = 0; i < toks.size(); ++i) {
diff --git a/moses/TranslationModel/PhraseDictionary.h b/moses/TranslationModel/PhraseDictionary.h
index 4c10c2e6b..1b1197eb1 100644
--- a/moses/TranslationModel/PhraseDictionary.h
+++ b/moses/TranslationModel/PhraseDictionary.h
@@ -88,7 +88,9 @@ public:
const PhraseDictionary* GetDictionary() const;
PhraseDictionary* GetDictionary();
- const std::string &GetFilePath() const { return m_filePath; }
+ const std::string &GetFilePath() const {
+ return m_filePath;
+ }
protected:
size_t m_tableLimit;
diff --git a/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp b/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp
index 126dd3365..afa1c4abc 100644
--- a/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp
+++ b/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp
@@ -9,7 +9,7 @@ using namespace std;
namespace Moses
{
PhraseDictionaryDynSuffixArray::PhraseDictionaryDynSuffixArray(const std::string &line)
-:PhraseDictionary("PhraseDictionaryDynSuffixArray", line)
+ :PhraseDictionary("PhraseDictionaryDynSuffixArray", line)
{
m_biSA = new BilingualDynSuffixArray();
}
@@ -63,7 +63,7 @@ const TargetPhraseCollection *PhraseDictionaryDynSuffixArray::GetTargetPhraseCol
void PhraseDictionaryDynSuffixArray::insertSnt(string& source, string& target, string& alignment)
{
m_biSA->addSntPair(source, target, alignment); // insert sentence pair into suffix arrays
- //StaticData::Instance().ClearTransOptionCache(); // clear translation option cache
+ //StaticData::Instance().ClearTransOptionCache(); // clear translation option cache
}
void PhraseDictionaryDynSuffixArray::deleteSnt(unsigned /* idx */, unsigned /* num2Del */)
{
diff --git a/moses/TranslationModel/PhraseDictionaryMemory.cpp b/moses/TranslationModel/PhraseDictionaryMemory.cpp
index 27cac9f5f..c43b919a4 100644
--- a/moses/TranslationModel/PhraseDictionaryMemory.cpp
+++ b/moses/TranslationModel/PhraseDictionaryMemory.cpp
@@ -41,9 +41,9 @@ namespace Moses
{
TargetPhraseCollection &PhraseDictionaryMemory::GetOrCreateTargetPhraseCollection(
- const Phrase &source
- , const TargetPhrase &target
- , const Word *sourceLHS)
+ const Phrase &source
+ , const TargetPhrase &target
+ , const Word *sourceLHS)
{
PhraseDictionaryNodeMemory &currNode = GetOrCreateNode(source, target, sourceLHS);
return currNode.GetOrCreateTargetPhraseCollection();
@@ -73,8 +73,8 @@ const TargetPhraseCollection *PhraseDictionaryMemory::GetTargetPhraseCollection(
}
PhraseDictionaryNodeMemory &PhraseDictionaryMemory::GetOrCreateNode(const Phrase &source
- , const TargetPhrase &target
- , const Word *sourceLHS)
+ , const TargetPhrase &target
+ , const Word *sourceLHS)
{
const size_t size = source.GetSize();
@@ -102,12 +102,12 @@ PhraseDictionaryNodeMemory &PhraseDictionaryMemory::GetOrCreateNode(const Phrase
CHECK(currNode != NULL);
}
-
+
// finally, the source LHS
//currNode = currNode->GetOrCreateChild(sourceLHS);
//CHECK(currNode != NULL);
-
+
return *currNode;
}
@@ -120,8 +120,7 @@ ChartRuleLookupManager *PhraseDictionaryMemory::CreateRuleLookupManager(
void PhraseDictionaryMemory::SortAndPrune()
{
- if (GetTableLimit())
- {
+ if (GetTableLimit()) {
m_collection.Sort(GetTableLimit());
}
}
diff --git a/moses/TranslationModel/PhraseDictionaryMemory.h b/moses/TranslationModel/PhraseDictionaryMemory.h
index dad8b3bbd..d2a8d0ad3 100644
--- a/moses/TranslationModel/PhraseDictionaryMemory.h
+++ b/moses/TranslationModel/PhraseDictionaryMemory.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -38,15 +38,17 @@ class PhraseDictionaryMemory : public RuleTableTrie
protected:
PhraseDictionaryMemory(const std::string &description, const std::string &line)
- : RuleTableTrie(description, line)
+ : RuleTableTrie(description, line)
{}
public:
PhraseDictionaryMemory(const std::string &line)
- : RuleTableTrie("PhraseDictionaryMemory", line)
+ : RuleTableTrie("PhraseDictionaryMemory", line)
{}
- const PhraseDictionaryNodeMemory &GetRootNode() const { return m_collection; }
+ const PhraseDictionaryNodeMemory &GetRootNode() const {
+ return m_collection;
+ }
ChartRuleLookupManager *CreateRuleLookupManager(
const InputType &,
@@ -54,14 +56,14 @@ public:
TO_STRING();
- protected:
+protected:
TargetPhraseCollection &GetOrCreateTargetPhraseCollection(
- const Phrase &source, const TargetPhrase &target, const Word *sourceLHS);
+ const Phrase &source, const TargetPhrase &target, const Word *sourceLHS);
const TargetPhraseCollection *GetTargetPhraseCollection(const Phrase& source) const;
PhraseDictionaryNodeMemory &GetOrCreateNode(const Phrase &source
- , const TargetPhrase &target
- , const Word *sourceLHS);
+ , const TargetPhrase &target
+ , const Word *sourceLHS);
void SortAndPrune();
diff --git a/moses/TranslationModel/PhraseDictionaryMultiModel.cpp b/moses/TranslationModel/PhraseDictionaryMultiModel.cpp
index bf3f01a1e..e395cb5a3 100644
--- a/moses/TranslationModel/PhraseDictionaryMultiModel.cpp
+++ b/moses/TranslationModel/PhraseDictionaryMultiModel.cpp
@@ -26,7 +26,7 @@ namespace Moses
{
PhraseDictionaryMultiModel::PhraseDictionaryMultiModel(const std::string &line)
-:PhraseDictionary("PhraseDictionaryMultiModel", line)
+ :PhraseDictionary("PhraseDictionaryMultiModel", line)
{
for (size_t i = 0; i < m_args.size(); ++i) {
const vector<string> &args = m_args[i];
@@ -37,12 +37,10 @@ PhraseDictionaryMultiModel::PhraseDictionaryMultiModel(const std::string &line)
msg << "combination mode unknown: " << m_mode;
throw runtime_error(msg.str());
}
- }
- else if (args[0] == "components") {
+ } else if (args[0] == "components") {
m_pdStr = Tokenize(args[1], ",");
m_numModels = m_pdStr.size();
- }
- else if (args[0] == "lambda") {
+ } else if (args[0] == "lambda") {
m_multimodelweights = Tokenize<float>(args[1], ",");
}
} // for
@@ -55,15 +53,14 @@ PhraseDictionaryMultiModel::PhraseDictionaryMultiModel(const std::string &line)
}
PhraseDictionaryMultiModel::PhraseDictionaryMultiModel(const std::string &description, const std::string &line)
-:PhraseDictionary(description, line)
+ :PhraseDictionary(description, line)
{
for (size_t i = 0; i < m_args.size(); ++i) {
const vector<string> &args = m_args[i];
if (args[0] == "components") {
m_pdStr = Tokenize(args[1], ",");
m_numModels = m_pdStr.size();
- }
- else if (args[0] == "lambda") {
+ } else if (args[0] == "lambda") {
m_multimodelweights = Tokenize<float>(args[1], ",");
}
} // for
@@ -83,7 +80,7 @@ bool PhraseDictionaryMultiModel::InitDictionary()
// one could choose a higher value than tableLimit (or 0) here for maximal precision, at a cost of speed.
- for(size_t i = 0; i < m_numModels; ++i){
+ for(size_t i = 0; i < m_numModels; ++i) {
const string &ptName = m_pdStr[i];
PhraseDictionary *pt = FindPhraseDictionary(ptName);
@@ -144,7 +141,7 @@ const TargetPhraseCollection *PhraseDictionaryMultiModel::GetTargetPhraseCollect
void PhraseDictionaryMultiModel::CollectSufficientStatistics(const Phrase& src, std::map<std::string,multiModelStatistics*>* allStats) const
{
- for(size_t i = 0; i < m_numModels; ++i){
+ for(size_t i = 0; i < m_numModels; ++i) {
const PhraseDictionary &pd = *m_pd[i];
TargetPhraseCollection *ret_raw = (TargetPhraseCollection*) pd.GetTargetPhraseCollection( src);
@@ -152,10 +149,9 @@ void PhraseDictionaryMultiModel::CollectSufficientStatistics(const Phrase& src,
TargetPhraseCollection::iterator iterTargetPhrase, iterLast;
if (m_tableLimit != 0 && ret_raw->GetSize() > m_tableLimit) {
- iterLast = ret_raw->begin() + m_tableLimit;
- }
- else {
- iterLast = ret_raw->end();
+ iterLast = ret_raw->begin() + m_tableLimit;
+ } else {
+ iterLast = ret_raw->end();
}
for (iterTargetPhrase = ret_raw->begin(); iterTargetPhrase != iterLast; ++iterTargetPhrase) {
@@ -173,9 +169,9 @@ void PhraseDictionaryMultiModel::CollectSufficientStatistics(const Phrase& src,
Scores scoreVector(m_numScoreComponents);
statistics->p.resize(m_numScoreComponents);
- for(size_t j = 0; j < m_numScoreComponents; ++j){
- statistics->p[j].resize(m_numModels);
- scoreVector[j] = -raw_scores[j];
+ for(size_t j = 0; j < m_numScoreComponents; ++j) {
+ statistics->p[j].resize(m_numModels);
+ scoreVector[j] = -raw_scores[j];
}
statistics->targetPhrase->GetScoreBreakdown().Assign(this, scoreVector); // set scores to 0
@@ -186,8 +182,8 @@ void PhraseDictionaryMultiModel::CollectSufficientStatistics(const Phrase& src,
}
multiModelStatistics * statistics = (*allStats)[targetString];
- for(size_t j = 0; j < m_numScoreComponents; ++j){
- statistics->p[j][i] = UntransformScore(raw_scores[j]);
+ for(size_t j = 0; j < m_numScoreComponents; ++j) {
+ statistics->p[j][i] = UntransformScore(raw_scores[j]);
}
(*allStats)[targetString] = statistics;
@@ -199,26 +195,26 @@ void PhraseDictionaryMultiModel::CollectSufficientStatistics(const Phrase& src,
TargetPhraseCollection* PhraseDictionaryMultiModel::CreateTargetPhraseCollectionLinearInterpolation(const Phrase& src, std::map<std::string,multiModelStatistics*>* allStats, std::vector<std::vector<float> > &multimodelweights) const
{
- TargetPhraseCollection *ret = new TargetPhraseCollection();
- for ( std::map< std::string, multiModelStatistics*>::const_iterator iter = allStats->begin(); iter != allStats->end(); ++iter ) {
+ TargetPhraseCollection *ret = new TargetPhraseCollection();
+ for ( std::map< std::string, multiModelStatistics*>::const_iterator iter = allStats->begin(); iter != allStats->end(); ++iter ) {
- multiModelStatistics * statistics = iter->second;
+ multiModelStatistics * statistics = iter->second;
- Scores scoreVector(m_numScoreComponents);
+ Scores scoreVector(m_numScoreComponents);
- for(size_t i = 0; i < m_numScoreComponents-1; ++i){
- scoreVector[i] = TransformScore(std::inner_product(statistics->p[i].begin(), statistics->p[i].end(), multimodelweights[i].begin(), 0.0));
- }
+ for(size_t i = 0; i < m_numScoreComponents-1; ++i) {
+ scoreVector[i] = TransformScore(std::inner_product(statistics->p[i].begin(), statistics->p[i].end(), multimodelweights[i].begin(), 0.0));
+ }
- //assuming that last value is phrase penalty
- scoreVector[m_numScoreComponents-1] = 1.0;
+ //assuming that last value is phrase penalty
+ scoreVector[m_numScoreComponents-1] = 1.0;
- statistics->targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
- statistics->targetPhrase->Evaluate(src);
+ statistics->targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
+ statistics->targetPhrase->Evaluate(src);
- ret->Add(new TargetPhrase(*statistics->targetPhrase));
- }
- return ret;
+ ret->Add(new TargetPhrase(*statistics->targetPhrase));
+ }
+ return ret;
}
@@ -235,8 +231,7 @@ std::vector<std::vector<float> > PhraseDictionaryMultiModel::getWeights(size_t n
//checking weights passed to mosesserver; only valid for this sentence; *don't* raise exception if client weights are malformed
if (weights_ptr == NULL || weights_ptr->size() == 0) {
weights_ptr = &m_multimodelweights; //fall back to weights defined in config
- }
- else if(weights_ptr->size() != m_numModels && weights_ptr->size() != m_numModels * numWeights) {
+ } else if(weights_ptr->size() != m_numModels && weights_ptr->size() != m_numModels * numWeights) {
//TODO: can we pass error message to client if weights are malformed?
std::stringstream strme;
strme << "Must have either one multimodel weight per model (" << m_numModels << "), or one per weighted feature and model (" << numWeights << "*" << m_numModels << "). You have " << weights_ptr->size() << ". Reverting to weights in config";
@@ -246,34 +241,30 @@ std::vector<std::vector<float> > PhraseDictionaryMultiModel::getWeights(size_t n
//checking weights defined in config; only valid for this sentence; raise exception if config weights are malformed
if (weights_ptr == NULL || weights_ptr->size() == 0) {
- for (size_t i=0;i < m_numModels;i++) {
+ for (size_t i=0; i < m_numModels; i++) {
raw_weights.push_back(1.0/m_numModels); //uniform weights created online
}
- }
- else if(weights_ptr->size() != m_numModels && weights_ptr->size() != m_numModels * numWeights) {
+ } else if(weights_ptr->size() != m_numModels && weights_ptr->size() != m_numModels * numWeights) {
std::stringstream strme;
strme << "Must have either one multimodel weight per model (" << m_numModels << "), or one per weighted feature and model (" << numWeights << "*" << m_numModels << "). You have " << weights_ptr->size() << ".";
UTIL_THROW(util::Exception, strme.str());
- }
- else {
- raw_weights = *weights_ptr;
+ } else {
+ raw_weights = *weights_ptr;
}
std::vector<std::vector<float> > multimodelweights (numWeights);
- for (size_t i=0;i < numWeights;i++) {
+ for (size_t i=0; i < numWeights; i++) {
std::vector<float> weights_onefeature (m_numModels);
if(raw_weights.size() == m_numModels) {
- weights_onefeature = raw_weights;
- }
- else {
- copy ( raw_weights.begin()+i*m_numModels, raw_weights.begin()+(i+1)*m_numModels, weights_onefeature.begin() );
+ weights_onefeature = raw_weights;
+ } else {
+ copy ( raw_weights.begin()+i*m_numModels, raw_weights.begin()+(i+1)*m_numModels, weights_onefeature.begin() );
}
if(normalize) {
- multimodelweights[i] = normalizeWeights(weights_onefeature);
- }
- else {
- multimodelweights[i] = weights_onefeature;
+ multimodelweights[i] = normalizeWeights(weights_onefeature);
+ } else {
+ multimodelweights[i] = weights_onefeature;
}
}
@@ -282,12 +273,12 @@ std::vector<std::vector<float> > PhraseDictionaryMultiModel::getWeights(size_t n
std::vector<float> PhraseDictionaryMultiModel::normalizeWeights(std::vector<float> &weights) const
{
- std::vector<float> ret (m_numModels);
- float total = std::accumulate(weights.begin(),weights.end(),0.0);
- for (size_t i=0;i < weights.size();i++) {
- ret[i] = weights[i]/total;
- }
- return ret;
+ std::vector<float> ret (m_numModels);
+ float total = std::accumulate(weights.begin(),weights.end(),0.0);
+ for (size_t i=0; i < weights.size(); i++) {
+ ret[i] = weights[i]/total;
+ }
+ return ret;
}
@@ -298,7 +289,8 @@ ChartRuleLookupManager *PhraseDictionaryMultiModel::CreateRuleLookupManager(cons
//copied from PhraseDictionaryCompact; free memory allocated to TargetPhraseCollection (and each TargetPhrase) at end of sentence
-void PhraseDictionaryMultiModel::CacheForCleanup(TargetPhraseCollection* tpc) {
+void PhraseDictionaryMultiModel::CacheForCleanup(TargetPhraseCollection* tpc)
+{
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_sentenceMutex);
PhraseCache &ref = m_sentenceCache[boost::this_thread::get_id()];
@@ -309,7 +301,8 @@ void PhraseDictionaryMultiModel::CacheForCleanup(TargetPhraseCollection* tpc) {
}
-void PhraseDictionaryMultiModel::CleanUpAfterSentenceProcessing(const InputType &source) {
+void PhraseDictionaryMultiModel::CleanUpAfterSentenceProcessing(const InputType &source)
+{
#ifdef WITH_THREADS
boost::mutex::scoped_lock lock(m_sentenceMutex);
PhraseCache &ref = m_sentenceCache[boost::this_thread::get_id()];
@@ -317,7 +310,7 @@ void PhraseDictionaryMultiModel::CleanUpAfterSentenceProcessing(const InputType
PhraseCache &ref = m_sentenceCache;
#endif
for(PhraseCache::iterator it = ref.begin(); it != ref.end(); it++) {
- delete *it;
+ delete *it;
}
PhraseCache temp;
@@ -331,149 +324,150 @@ void PhraseDictionaryMultiModel::CleanUpAfterSentenceProcessing(const InputType
}
-void PhraseDictionaryMultiModel::CleanUpComponentModels(const InputType &source) {
- for(size_t i = 0; i < m_numModels; ++i){
+void PhraseDictionaryMultiModel::CleanUpComponentModels(const InputType &source)
+{
+ for(size_t i = 0; i < m_numModels; ++i) {
m_pd[i]->CleanUpAfterSentenceProcessing(source);
}
}
#ifdef WITH_DLIB
-vector<float> PhraseDictionaryMultiModel::MinimizePerplexity(vector<pair<string, string> > &phrase_pair_vector) {
+vector<float> PhraseDictionaryMultiModel::MinimizePerplexity(vector<pair<string, string> > &phrase_pair_vector)
+{
- const StaticData &staticData = StaticData::Instance();
- const string& factorDelimiter = staticData.GetFactorDelimiter();
+ const StaticData &staticData = StaticData::Instance();
+ const string& factorDelimiter = staticData.GetFactorDelimiter();
- map<pair<string, string>, size_t> phrase_pair_map;
+ map<pair<string, string>, size_t> phrase_pair_map;
- for ( vector<pair<string, string> >::const_iterator iter = phrase_pair_vector.begin(); iter != phrase_pair_vector.end(); ++iter ) {
- phrase_pair_map[*iter] += 1;
- }
+ for ( vector<pair<string, string> >::const_iterator iter = phrase_pair_vector.begin(); iter != phrase_pair_vector.end(); ++iter ) {
+ phrase_pair_map[*iter] += 1;
+ }
- vector<multiModelStatisticsOptimization*> optimizerStats;
+ vector<multiModelStatisticsOptimization*> optimizerStats;
- for ( map<pair<string, string>, size_t>::iterator iter = phrase_pair_map.begin(); iter != phrase_pair_map.end(); ++iter ) {
+ for ( map<pair<string, string>, size_t>::iterator iter = phrase_pair_map.begin(); iter != phrase_pair_map.end(); ++iter ) {
- pair<string, string> phrase_pair = iter->first;
- string source_string = phrase_pair.first;
- string target_string = phrase_pair.second;
+ pair<string, string> phrase_pair = iter->first;
+ string source_string = phrase_pair.first;
+ string target_string = phrase_pair.second;
- vector<float> fs(m_numModels);
- map<string,multiModelStatistics*>* allStats = new(map<string,multiModelStatistics*>);
+ vector<float> fs(m_numModels);
+ map<string,multiModelStatistics*>* allStats = new(map<string,multiModelStatistics*>);
- Phrase sourcePhrase(0);
- sourcePhrase.CreateFromString(Input, m_input, source_string, factorDelimiter, NULL);
+ Phrase sourcePhrase(0);
+ sourcePhrase.CreateFromString(Input, m_input, source_string, factorDelimiter, NULL);
- CollectSufficientStatistics(sourcePhrase, allStats); //optimization potential: only call this once per source phrase
+ CollectSufficientStatistics(sourcePhrase, allStats); //optimization potential: only call this once per source phrase
- //phrase pair not found; leave cache empty
- if (allStats->find(target_string) == allStats->end()) {
- RemoveAllInMap(*allStats);
- delete allStats;
- continue;
- }
+ //phrase pair not found; leave cache empty
+ if (allStats->find(target_string) == allStats->end()) {
+ RemoveAllInMap(*allStats);
+ delete allStats;
+ continue;
+ }
- multiModelStatisticsOptimization* targetStatistics = new multiModelStatisticsOptimization();
- targetStatistics->targetPhrase = new TargetPhrase(*(*allStats)[target_string]->targetPhrase);
- targetStatistics->p = (*allStats)[target_string]->p;
- targetStatistics->f = iter->second;
- optimizerStats.push_back(targetStatistics);
+ multiModelStatisticsOptimization* targetStatistics = new multiModelStatisticsOptimization();
+ targetStatistics->targetPhrase = new TargetPhrase(*(*allStats)[target_string]->targetPhrase);
+ targetStatistics->p = (*allStats)[target_string]->p;
+ targetStatistics->f = iter->second;
+ optimizerStats.push_back(targetStatistics);
- RemoveAllInMap(*allStats);
- delete allStats;
- }
+ RemoveAllInMap(*allStats);
+ delete allStats;
+ }
- Sentence sentence;
- CleanUpAfterSentenceProcessing(sentence); // free memory used by compact phrase tables
+ Sentence sentence;
+ CleanUpAfterSentenceProcessing(sentence); // free memory used by compact phrase tables
- size_t numWeights = m_numScoreComponents;
- if (m_mode == "interpolate") {
- //interpolation of phrase penalty is skipped, and fixed-value (2.718) is used instead. results will be screwed up if phrase penalty is not last feature
- numWeights = m_numScoreComponents-1;
- }
+ size_t numWeights = m_numScoreComponents;
+ if (m_mode == "interpolate") {
+ //interpolation of phrase penalty is skipped, and fixed-value (2.718) is used instead. results will be screwed up if phrase penalty is not last feature
+ numWeights = m_numScoreComponents-1;
+ }
- vector<float> ret (m_numModels*numWeights);
- for (size_t iFeature=0; iFeature < numWeights; iFeature++) {
+ vector<float> ret (m_numModels*numWeights);
+ for (size_t iFeature=0; iFeature < numWeights; iFeature++) {
- CrossEntropy * ObjectiveFunction = new CrossEntropy(optimizerStats, this, iFeature);
+ CrossEntropy * ObjectiveFunction = new CrossEntropy(optimizerStats, this, iFeature);
- vector<float> weight_vector = Optimize(ObjectiveFunction, m_numModels);
+ vector<float> weight_vector = Optimize(ObjectiveFunction, m_numModels);
- if (m_mode == "interpolate") {
- weight_vector = normalizeWeights(weight_vector);
- }
+ if (m_mode == "interpolate") {
+ weight_vector = normalizeWeights(weight_vector);
+ }
- cerr << "Weight vector for feature " << iFeature << ": ";
- for (size_t i=0; i < m_numModels; i++) {
- ret[(iFeature*m_numModels)+i] = weight_vector[i];
- cerr << weight_vector[i] << " ";
- }
- cerr << endl;
- delete ObjectiveFunction;
+ cerr << "Weight vector for feature " << iFeature << ": ";
+ for (size_t i=0; i < m_numModels; i++) {
+ ret[(iFeature*m_numModels)+i] = weight_vector[i];
+ cerr << weight_vector[i] << " ";
}
+ cerr << endl;
+ delete ObjectiveFunction;
+ }
- RemoveAllInColl(optimizerStats);
- return ret;
+ RemoveAllInColl(optimizerStats);
+ return ret;
}
-vector<float> PhraseDictionaryMultiModel::Optimize(OptimizationObjective *ObjectiveFunction, size_t numModels) {
-
- dlib::matrix<double,0,1> starting_point;
- starting_point.set_size(numModels);
- starting_point = 1.0;
-
- try {
- dlib::find_min_bobyqa(*ObjectiveFunction,
- starting_point,
- 2*numModels+1, // number of interpolation points
- dlib::uniform_matrix<double>(numModels,1, 1e-09), // lower bound constraint
- dlib::uniform_matrix<double>(numModels,1, 1e100), // upper bound constraint
- 1.0, // initial trust region radius
- 1e-5, // stopping trust region radius
- 10000 // max number of objective function evaluations
- );
- }
- catch (dlib::bobyqa_failure& e)
- {
- cerr << e.what() << endl;
- }
+vector<float> PhraseDictionaryMultiModel::Optimize(OptimizationObjective *ObjectiveFunction, size_t numModels)
+{
- vector<float> weight_vector (numModels);
+ dlib::matrix<double,0,1> starting_point;
+ starting_point.set_size(numModels);
+ starting_point = 1.0;
+
+ try {
+ dlib::find_min_bobyqa(*ObjectiveFunction,
+ starting_point,
+ 2*numModels+1, // number of interpolation points
+ dlib::uniform_matrix<double>(numModels,1, 1e-09), // lower bound constraint
+ dlib::uniform_matrix<double>(numModels,1, 1e100), // upper bound constraint
+ 1.0, // initial trust region radius
+ 1e-5, // stopping trust region radius
+ 10000 // max number of objective function evaluations
+ );
+ } catch (dlib::bobyqa_failure& e) {
+ cerr << e.what() << endl;
+ }
- for (int i=0; i < starting_point.nr(); i++) {
- weight_vector[i] = starting_point(i);
- }
+ vector<float> weight_vector (numModels);
+
+ for (int i=0; i < starting_point.nr(); i++) {
+ weight_vector[i] = starting_point(i);
+ }
- cerr << "Cross-entropy: " << (*ObjectiveFunction)(starting_point) << endl;
- return weight_vector;
+ cerr << "Cross-entropy: " << (*ObjectiveFunction)(starting_point) << endl;
+ return weight_vector;
}
double CrossEntropy::operator() ( const dlib::matrix<double,0,1>& arg) const
{
- double total = 0.0;
- double n = 0.0;
- std::vector<float> weight_vector (m_model->m_numModels);
+ double total = 0.0;
+ double n = 0.0;
+ std::vector<float> weight_vector (m_model->m_numModels);
- for (int i=0; i < arg.nr(); i++) {
- weight_vector[i] = arg(i);
- }
- if (m_model->m_mode == "interpolate") {
- weight_vector = m_model->normalizeWeights(weight_vector);
- }
+ for (int i=0; i < arg.nr(); i++) {
+ weight_vector[i] = arg(i);
+ }
+ if (m_model->m_mode == "interpolate") {
+ weight_vector = m_model->normalizeWeights(weight_vector);
+ }
- for ( std::vector<multiModelStatisticsOptimization*>::const_iterator iter = m_optimizerStats.begin(); iter != m_optimizerStats.end(); ++iter ) {
- multiModelStatisticsOptimization* statistics = *iter;
- size_t f = statistics->f;
+ for ( std::vector<multiModelStatisticsOptimization*>::const_iterator iter = m_optimizerStats.begin(); iter != m_optimizerStats.end(); ++iter ) {
+ multiModelStatisticsOptimization* statistics = *iter;
+ size_t f = statistics->f;
- double score;
- score = std::inner_product(statistics->p[m_iFeature].begin(), statistics->p[m_iFeature].end(), weight_vector.begin(), 0.0);
+ double score;
+ score = std::inner_product(statistics->p[m_iFeature].begin(), statistics->p[m_iFeature].end(), weight_vector.begin(), 0.0);
- total -= (FloorScore(TransformScore(score))/TransformScore(2))*f;
- n += f;
- }
- return total/n;
+ total -= (FloorScore(TransformScore(score))/TransformScore(2))*f;
+ n += f;
+ }
+ return total/n;
}
#endif
diff --git a/moses/TranslationModel/PhraseDictionaryMultiModel.h b/moses/TranslationModel/PhraseDictionaryMultiModel.h
index 467333b0a..5feb4f373 100644
--- a/moses/TranslationModel/PhraseDictionaryMultiModel.h
+++ b/moses/TranslationModel/PhraseDictionaryMultiModel.h
@@ -36,15 +36,17 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
namespace Moses
{
- struct multiModelStatistics {
- TargetPhrase *targetPhrase;
- std::vector<std::vector<float> > p;
- ~multiModelStatistics() {delete targetPhrase;};
+struct multiModelStatistics {
+ TargetPhrase *targetPhrase;
+ std::vector<std::vector<float> > p;
+ ~multiModelStatistics() {
+ delete targetPhrase;
};
+};
- struct multiModelStatisticsOptimization: multiModelStatistics {
- size_t f;
- };
+struct multiModelStatisticsOptimization: multiModelStatistics {
+ size_t f;
+};
class OptimizationObjective;
@@ -53,7 +55,7 @@ class OptimizationObjective;
class PhraseDictionaryMultiModel: public PhraseDictionary
{
#ifdef WITH_DLIB
-friend class CrossEntropy;
+ friend class CrossEntropy;
#endif
public:
@@ -100,34 +102,33 @@ protected:
};
#ifdef WITH_DLIB
-class OptimizationObjective
+class OptimizationObjective
{
public:
- virtual double operator() ( const dlib::matrix<double,0,1>& arg) const = 0;
+ virtual double operator() ( const dlib::matrix<double,0,1>& arg) const = 0;
};
class CrossEntropy: public OptimizationObjective
{
public:
- CrossEntropy (
- std::vector<multiModelStatisticsOptimization*> &optimizerStats,
- PhraseDictionaryMultiModel * model,
- size_t iFeature
- )
- {
- m_optimizerStats = optimizerStats;
- m_model = model;
- m_iFeature = iFeature;
- }
+ CrossEntropy (
+ std::vector<multiModelStatisticsOptimization*> &optimizerStats,
+ PhraseDictionaryMultiModel * model,
+ size_t iFeature
+ ) {
+ m_optimizerStats = optimizerStats;
+ m_model = model;
+ m_iFeature = iFeature;
+ }
- double operator() ( const dlib::matrix<double,0,1>& arg) const;
+ double operator() ( const dlib::matrix<double,0,1>& arg) const;
protected:
- std::vector<multiModelStatisticsOptimization*> m_optimizerStats;
- PhraseDictionaryMultiModel * m_model;
- size_t m_iFeature;
+ std::vector<multiModelStatisticsOptimization*> m_optimizerStats;
+ PhraseDictionaryMultiModel * m_model;
+ size_t m_iFeature;
};
#endif
diff --git a/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp b/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp
index 4c61fba91..298e23a9b 100644
--- a/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp
+++ b/moses/TranslationModel/PhraseDictionaryMultiModelCounts.cpp
@@ -61,59 +61,56 @@ namespace Moses
{
PhraseDictionaryMultiModelCounts::PhraseDictionaryMultiModelCounts(const std::string &line)
-:PhraseDictionaryMultiModel("PhraseDictionaryMultiModelCounts", line)
+ :PhraseDictionaryMultiModel("PhraseDictionaryMultiModelCounts", line)
{
- m_mode = "instance_weighting"; //TODO: set this in config; use m_mode to switch between interpolation and instance weighting
- m_combineFunction = InstanceWeighting;
- //m_mode = "interpolate";
- //m_combineFunction = LinearInterpolationFromCounts;
-
- for (size_t i = 0; i < m_args.size(); ++i) {
- const vector<string> &args = m_args[i];
- if (args[0] == "mode") {
- m_mode = args[1];
- if (m_mode == "instance_weighting")
- m_combineFunction = InstanceWeighting;
- else if (m_mode == "interpolate") {
- m_combineFunction = LinearInterpolationFromCounts;
- }
- else {
- ostringstream msg;
- msg << "combination mode unknown: " << m_mode;
- throw runtime_error(msg.str());
- }
-
- }
- else if (args[0] == "lex-e2f") {
- m_lexE2FStr = Tokenize(args[1], ",");
- CHECK(m_lexE2FStr.size() == m_pdStr.size());
- }
- else if (args[0] == "lex-f2e") {
- m_lexF2EStr = Tokenize(args[1], ",");
- CHECK(m_lexF2EStr.size() == m_pdStr.size());
+ m_mode = "instance_weighting"; //TODO: set this in config; use m_mode to switch between interpolation and instance weighting
+ m_combineFunction = InstanceWeighting;
+ //m_mode = "interpolate";
+ //m_combineFunction = LinearInterpolationFromCounts;
+
+ for (size_t i = 0; i < m_args.size(); ++i) {
+ const vector<string> &args = m_args[i];
+ if (args[0] == "mode") {
+ m_mode = args[1];
+ if (m_mode == "instance_weighting")
+ m_combineFunction = InstanceWeighting;
+ else if (m_mode == "interpolate") {
+ m_combineFunction = LinearInterpolationFromCounts;
+ } else {
+ ostringstream msg;
+ msg << "combination mode unknown: " << m_mode;
+ throw runtime_error(msg.str());
}
- else if (args[0] == "target-table") {
- m_targetTable = Tokenize(args[1], ",");
- CHECK(m_targetTable.size() == m_pdStr.size());
- }
+ } else if (args[0] == "lex-e2f") {
+ m_lexE2FStr = Tokenize(args[1], ",");
+ CHECK(m_lexE2FStr.size() == m_pdStr.size());
+ } else if (args[0] == "lex-f2e") {
+ m_lexF2EStr = Tokenize(args[1], ",");
+ CHECK(m_lexF2EStr.size() == m_pdStr.size());
+ }
+ else if (args[0] == "target-table") {
+ m_targetTable = Tokenize(args[1], ",");
+ CHECK(m_targetTable.size() == m_pdStr.size());
+ }
- } // for
+
+ } // for
}
PhraseDictionaryMultiModelCounts::~PhraseDictionaryMultiModelCounts()
{
- RemoveAllInColl(m_lexTable_e2f);
- RemoveAllInColl(m_lexTable_f2e);
+ RemoveAllInColl(m_lexTable_e2f);
+ RemoveAllInColl(m_lexTable_f2e);
}
bool PhraseDictionaryMultiModelCounts::InitDictionary()
{
- for(size_t i = 0; i < m_numModels; ++i){
+ for(size_t i = 0; i < m_numModels; ++i) {
// phrase table
const string &ptName = m_pdStr[i];
@@ -189,8 +186,8 @@ bool PhraseDictionaryMultiModelCounts::InitDictionary()
pdta_inverse->SetNumScoreComponentMultiModel(m_numScoreComponent);
pdta_inverse->Load(input, output, target_table, m_weight, m_componentTableLimit, languageModels, m_weightWP);
m_inverse_pd.push_back(pdta_inverse);
- } else if (implementation == Compact) {
-#ifndef WIN32
+ } else if (implementation == Compact) {
+ #ifndef WIN32
PhraseDictionaryCompact* pdc = new PhraseDictionaryCompact(m_numScoreComponent, implementation, m_feature_load);
pdc->SetNumScoreComponentMultiModel(m_numScoreComponent); //for compact models, we need to pass number of log-linear components to correctly resize the score vector
pdc->Load( input, output, main_table, m_weight, componentTableLimit, languageModels, m_weightWP);
@@ -200,9 +197,9 @@ bool PhraseDictionaryMultiModelCounts::InitDictionary()
pdc_inverse->SetNumScoreComponentMultiModel(m_numScoreComponent);
pdc_inverse->Load( input, output, target_table, m_weight, componentTableLimit, languageModels, m_weightWP);
m_inverse_pd.push_back(pdc_inverse);
-#else
- UTIL_THROW(util::Exception, "Compact phrase table not supported in windows");
-#endif
+ #else
+ UTIL_THROW(util::Exception, "Compact phrase table not supported in windows");
+ #endif
}
else {
UTIL_THROW(util::Exception,"PhraseDictionaryMultiModel does not support phrase table type " << implementation);
@@ -218,7 +215,7 @@ bool PhraseDictionaryMultiModelCounts::InitDictionary()
}
-*/
+ */
return true;
}
@@ -250,7 +247,7 @@ const TargetPhraseCollection *PhraseDictionaryMultiModelCounts::GetTargetPhraseC
void PhraseDictionaryMultiModelCounts::CollectSufficientStatistics(const Phrase& src, vector<float> &fs, map<string,multiModelCountsStatistics*>* allStats) const
//fill fs and allStats with statistics from models
{
- for(size_t i = 0; i < m_numModels; ++i){
+ for(size_t i = 0; i < m_numModels; ++i) {
const PhraseDictionary &pd = *m_pd[i];
TargetPhraseCollection *ret_raw = (TargetPhraseCollection*) pd.GetTargetPhraseCollection( src);
@@ -298,9 +295,9 @@ void PhraseDictionaryMultiModelCounts::CollectSufficientStatistics(const Phrase&
multiModelCountsStatistics * statistics = iter->second;
for (size_t i = 0; i < m_numModels; ++i) {
- if (!statistics->ft[i]) {
- statistics->ft[i] = GetTargetCount(static_cast<const Phrase&>(*statistics->targetPhrase), i);
- }
+ if (!statistics->ft[i]) {
+ statistics->ft[i] = GetTargetCount(static_cast<const Phrase&>(*statistics->targetPhrase), i);
+ }
}
}
}
@@ -313,28 +310,27 @@ TargetPhraseCollection* PhraseDictionaryMultiModelCounts::CreateTargetPhraseColl
multiModelCountsStatistics * statistics = iter->second;
if (statistics->targetPhrase->GetAlignTerm().GetSize() == 0) {
- UTIL_THROW(util::Exception, " alignment information empty\ncount-tables need to include alignment information for computation of lexical weights.\nUse --phrase-word-alignment during training; for on-disk tables, also set -alignment-info when creating on-disk tables.");
+ UTIL_THROW(util::Exception, " alignment information empty\ncount-tables need to include alignment information for computation of lexical weights.\nUse --phrase-word-alignment during training; for on-disk tables, also set -alignment-info when creating on-disk tables.");
}
try {
- pair<vector< set<size_t> >, vector< set<size_t> > > alignment = GetAlignmentsForLexWeights(src, static_cast<const Phrase&>(*statistics->targetPhrase), statistics->targetPhrase->GetAlignTerm());
- vector< set<size_t> > alignedToT = alignment.first;
- vector< set<size_t> > alignedToS = alignment.second;
- double lexst = ComputeWeightedLexicalTranslation(static_cast<const Phrase&>(*statistics->targetPhrase), src, alignedToS, m_lexTable_e2f, multimodelweights[1], m_output, m_input );
- double lexts = ComputeWeightedLexicalTranslation(src, static_cast<const Phrase&>(*statistics->targetPhrase), alignedToT, m_lexTable_f2e, multimodelweights[3], m_input, m_output );
-
- Scores scoreVector(5);
- scoreVector[0] = FloorScore(TransformScore(m_combineFunction(statistics->fst, statistics->ft, multimodelweights[0])));
- scoreVector[1] = FloorScore(TransformScore(lexst));
- scoreVector[2] = FloorScore(TransformScore(m_combineFunction(statistics->fst, fs, multimodelweights[2])));
- scoreVector[3] = FloorScore(TransformScore(lexts));
- scoreVector[4] = FloorScore(TransformScore(2.718));
-
- statistics->targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
- statistics->targetPhrase->Evaluate(src);
- }
- catch (AlignmentException& e) {
- continue;
+ pair<vector< set<size_t> >, vector< set<size_t> > > alignment = GetAlignmentsForLexWeights(src, static_cast<const Phrase&>(*statistics->targetPhrase), statistics->targetPhrase->GetAlignTerm());
+ vector< set<size_t> > alignedToT = alignment.first;
+ vector< set<size_t> > alignedToS = alignment.second;
+ double lexst = ComputeWeightedLexicalTranslation(static_cast<const Phrase&>(*statistics->targetPhrase), src, alignedToS, m_lexTable_e2f, multimodelweights[1], m_output, m_input );
+ double lexts = ComputeWeightedLexicalTranslation(src, static_cast<const Phrase&>(*statistics->targetPhrase), alignedToT, m_lexTable_f2e, multimodelweights[3], m_input, m_output );
+
+ Scores scoreVector(5);
+ scoreVector[0] = FloorScore(TransformScore(m_combineFunction(statistics->fst, statistics->ft, multimodelweights[0])));
+ scoreVector[1] = FloorScore(TransformScore(lexst));
+ scoreVector[2] = FloorScore(TransformScore(m_combineFunction(statistics->fst, fs, multimodelweights[2])));
+ scoreVector[3] = FloorScore(TransformScore(lexts));
+ scoreVector[4] = FloorScore(TransformScore(2.718));
+
+ statistics->targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
+ statistics->targetPhrase->Evaluate(src);
+ } catch (AlignmentException& e) {
+ continue;
}
ret->Add(new TargetPhrase(*statistics->targetPhrase));
@@ -346,47 +342,50 @@ TargetPhraseCollection* PhraseDictionaryMultiModelCounts::CreateTargetPhraseColl
}
-float PhraseDictionaryMultiModelCounts::GetTargetCount(const Phrase &target, size_t modelIndex) const {
+float PhraseDictionaryMultiModelCounts::GetTargetCount(const Phrase &target, size_t modelIndex) const
+{
- const PhraseDictionary &pd = *m_inverse_pd[modelIndex];
- TargetPhraseCollection *ret_raw = (TargetPhraseCollection*) pd.GetTargetPhraseCollection(target);
+ const PhraseDictionary &pd = *m_inverse_pd[modelIndex];
+ TargetPhraseCollection *ret_raw = (TargetPhraseCollection*) pd.GetTargetPhraseCollection(target);
- // in inverse mode, we want the first score of the first phrase pair (note: if we were to work with truly symmetric models, it would be the third score)
- if (ret_raw != NULL) {
- TargetPhrase * targetPhrase = *(ret_raw->begin());
- return UntransformScore(targetPhrase->GetScoreBreakdown().GetScoresForProducer(&pd)[0]);
- }
+ // in inverse mode, we want the first score of the first phrase pair (note: if we were to work with truly symmetric models, it would be the third score)
+ if (ret_raw != NULL) {
+ TargetPhrase * targetPhrase = *(ret_raw->begin());
+ return UntransformScore(targetPhrase->GetScoreBreakdown().GetScoresForProducer(&pd)[0]);
+ }
- // target phrase unknown
- else return 0;
+ // target phrase unknown
+ else return 0;
}
-pair<PhraseDictionaryMultiModelCounts::AlignVector,PhraseDictionaryMultiModelCounts::AlignVector> PhraseDictionaryMultiModelCounts::GetAlignmentsForLexWeights(const Phrase &phraseS, const Phrase &phraseT, const AlignmentInfo &alignment) const {
+pair<PhraseDictionaryMultiModelCounts::AlignVector,PhraseDictionaryMultiModelCounts::AlignVector> PhraseDictionaryMultiModelCounts::GetAlignmentsForLexWeights(const Phrase &phraseS, const Phrase &phraseT, const AlignmentInfo &alignment) const
+{
- size_t tsize = phraseT.GetSize();
- size_t ssize = phraseS.GetSize();
- AlignVector alignedToT (tsize);
- AlignVector alignedToS (ssize);
- AlignmentInfo::const_iterator iter;
+ size_t tsize = phraseT.GetSize();
+ size_t ssize = phraseS.GetSize();
+ AlignVector alignedToT (tsize);
+ AlignVector alignedToS (ssize);
+ AlignmentInfo::const_iterator iter;
- for (iter = alignment.begin(); iter != alignment.end(); ++iter) {
+ for (iter = alignment.begin(); iter != alignment.end(); ++iter) {
const pair<size_t,size_t> &alignPair = *iter;
- size_t s = alignPair.first;
- size_t t = alignPair.second;
- if (s >= ssize || t >= tsize) {
- cerr << "Error: inconsistent alignment for phrase pair: " << phraseS << " - " << phraseT << endl;
- cerr << "phrase pair will be discarded" << endl;
- throw AlignmentException();
- }
- alignedToT[t].insert( s );
- alignedToS[s].insert( t );
+ size_t s = alignPair.first;
+ size_t t = alignPair.second;
+ if (s >= ssize || t >= tsize) {
+ cerr << "Error: inconsistent alignment for phrase pair: " << phraseS << " - " << phraseT << endl;
+ cerr << "phrase pair will be discarded" << endl;
+ throw AlignmentException();
+ }
+ alignedToT[t].insert( s );
+ alignedToS[s].insert( t );
}
return make_pair(alignedToT,alignedToS);
}
-double PhraseDictionaryMultiModelCounts::ComputeWeightedLexicalTranslation( const Phrase &phraseS, const Phrase &phraseT, AlignVector &alignment, const vector<lexicalTable*> &tables, vector<float> &multimodelweights, const vector<FactorType> &input_factors, const vector<FactorType> &output_factors ) const {
+double PhraseDictionaryMultiModelCounts::ComputeWeightedLexicalTranslation( const Phrase &phraseS, const Phrase &phraseT, AlignVector &alignment, const vector<lexicalTable*> &tables, vector<float> &multimodelweights, const vector<FactorType> &input_factors, const vector<FactorType> &output_factors ) const
+{
// lexical translation probability
double lexScore = 1.0;
@@ -414,7 +413,8 @@ double PhraseDictionaryMultiModelCounts::ComputeWeightedLexicalTranslation( cons
}
-lexicalCache PhraseDictionaryMultiModelCounts::CacheLexicalStatistics( const Phrase &phraseS, const Phrase &phraseT, AlignVector &alignment, const vector<lexicalTable*> &tables, const vector<FactorType> &input_factors, const vector<FactorType> &output_factors ) {
+lexicalCache PhraseDictionaryMultiModelCounts::CacheLexicalStatistics( const Phrase &phraseS, const Phrase &phraseT, AlignVector &alignment, const vector<lexicalTable*> &tables, const vector<FactorType> &input_factors, const vector<FactorType> &output_factors )
+{
//do all the necessary lexical table lookups and get counts, but don't apply weights yet
string null = "NULL";
@@ -455,60 +455,65 @@ lexicalCache PhraseDictionaryMultiModelCounts::CacheLexicalStatistics( const Phr
}
-double PhraseDictionaryMultiModelCounts::ComputeWeightedLexicalTranslationFromCache( lexicalCache &cache, vector<float> &weights ) const {
+double PhraseDictionaryMultiModelCounts::ComputeWeightedLexicalTranslationFromCache( lexicalCache &cache, vector<float> &weights ) const
+{
// lexical translation probability
double lexScore = 1.0;
for (lexicalCache::const_iterator iter = cache.begin(); iter != cache.end(); ++iter) {
- vector<lexicalPair> t_vector = *iter;
- double thisWordScore = 0;
- for ( vector<lexicalPair>::const_iterator iter2 = t_vector.begin(); iter2 != t_vector.end(); ++iter2) {
- vector<float> joint_count = iter2->first;
- vector<float> marginal = iter2->second;
- thisWordScore += m_combineFunction(joint_count, marginal, weights);
- }
- lexScore *= thisWordScore / t_vector.size();
+ vector<lexicalPair> t_vector = *iter;
+ double thisWordScore = 0;
+ for ( vector<lexicalPair>::const_iterator iter2 = t_vector.begin(); iter2 != t_vector.end(); ++iter2) {
+ vector<float> joint_count = iter2->first;
+ vector<float> marginal = iter2->second;
+ thisWordScore += m_combineFunction(joint_count, marginal, weights);
+ }
+ lexScore *= thisWordScore / t_vector.size();
}
return lexScore;
}
// get lexical probability for single word alignment pair
-double PhraseDictionaryMultiModelCounts::GetLexicalProbability( string &wordS, string &wordT, const vector<lexicalTable*> &tables, vector<float> &multimodelweights ) const {
- vector<float> joint_count (m_numModels);
- vector<float> marginals (m_numModels);
+double PhraseDictionaryMultiModelCounts::GetLexicalProbability( string &wordS, string &wordT, const vector<lexicalTable*> &tables, vector<float> &multimodelweights ) const
+{
+ vector<float> joint_count (m_numModels);
+ vector<float> marginals (m_numModels);
- FillLexicalCountsJoint(wordS, wordT, joint_count, tables);
- FillLexicalCountsMarginal(wordS, marginals, tables);
+ FillLexicalCountsJoint(wordS, wordT, joint_count, tables);
+ FillLexicalCountsMarginal(wordS, marginals, tables);
- double lexProb = m_combineFunction(joint_count, marginals, multimodelweights);
+ double lexProb = m_combineFunction(joint_count, marginals, multimodelweights);
return lexProb;
}
-void PhraseDictionaryMultiModelCounts::FillLexicalCountsJoint(string &wordS, string &wordT, vector<float> &count, const vector<lexicalTable*> &tables) const {
- for (size_t i=0;i < m_numModels;i++) {
- lexicalMapJoint::iterator joint_s = tables[i]->joint.find( wordS );
- if (joint_s == tables[i]->joint.end()) count[i] = 0.0;
- else {
- lexicalMap::iterator joint_t = joint_s->second.find( wordT );
- if (joint_t == joint_s->second.end()) count[i] = 0.0;
- else count[i] = joint_t->second;
- }
+void PhraseDictionaryMultiModelCounts::FillLexicalCountsJoint(string &wordS, string &wordT, vector<float> &count, const vector<lexicalTable*> &tables) const
+{
+ for (size_t i=0; i < m_numModels; i++) {
+ lexicalMapJoint::iterator joint_s = tables[i]->joint.find( wordS );
+ if (joint_s == tables[i]->joint.end()) count[i] = 0.0;
+ else {
+ lexicalMap::iterator joint_t = joint_s->second.find( wordT );
+ if (joint_t == joint_s->second.end()) count[i] = 0.0;
+ else count[i] = joint_t->second;
}
+ }
}
-void PhraseDictionaryMultiModelCounts::FillLexicalCountsMarginal(string &wordS, vector<float> &count, const vector<lexicalTable*> &tables) const {
- for (size_t i=0;i < m_numModels;i++) {
- lexicalMap::iterator marginal_s = tables[i]->marginal.find( wordS );
- if (marginal_s == tables[i]->marginal.end()) count[i] = 0.0;
- else count[i] = marginal_s->second;
- }
+void PhraseDictionaryMultiModelCounts::FillLexicalCountsMarginal(string &wordS, vector<float> &count, const vector<lexicalTable*> &tables) const
+{
+ for (size_t i=0; i < m_numModels; i++) {
+ lexicalMap::iterator marginal_s = tables[i]->marginal.find( wordS );
+ if (marginal_s == tables[i]->marginal.end()) count[i] = 0.0;
+ else count[i] = marginal_s->second;
+ }
}
-void PhraseDictionaryMultiModelCounts::LoadLexicalTable( string &fileName, lexicalTable* ltable) {
+void PhraseDictionaryMultiModelCounts::LoadLexicalTable( string &fileName, lexicalTable* ltable)
+{
cerr << "Loading lexical translation table from " << fileName;
ifstream inFile;
@@ -549,165 +554,161 @@ void PhraseDictionaryMultiModelCounts::LoadLexicalTable( string &fileName, lexic
#ifdef WITH_DLIB
-vector<float> PhraseDictionaryMultiModelCounts::MinimizePerplexity(vector<pair<string, string> > &phrase_pair_vector) {
+vector<float> PhraseDictionaryMultiModelCounts::MinimizePerplexity(vector<pair<string, string> > &phrase_pair_vector)
+{
- const StaticData &staticData = StaticData::Instance();
- const string& factorDelimiter = staticData.GetFactorDelimiter();
+ const StaticData &staticData = StaticData::Instance();
+ const string& factorDelimiter = staticData.GetFactorDelimiter();
- map<pair<string, string>, size_t> phrase_pair_map;
+ map<pair<string, string>, size_t> phrase_pair_map;
- for ( vector<pair<string, string> >::const_iterator iter = phrase_pair_vector.begin(); iter != phrase_pair_vector.end(); ++iter ) {
- phrase_pair_map[*iter] += 1;
- }
+ for ( vector<pair<string, string> >::const_iterator iter = phrase_pair_vector.begin(); iter != phrase_pair_vector.end(); ++iter ) {
+ phrase_pair_map[*iter] += 1;
+ }
- vector<multiModelCountsStatisticsOptimization*> optimizerStats;
+ vector<multiModelCountsStatisticsOptimization*> optimizerStats;
- for ( map<pair<string, string>, size_t>::iterator iter = phrase_pair_map.begin(); iter != phrase_pair_map.end(); ++iter ) {
+ for ( map<pair<string, string>, size_t>::iterator iter = phrase_pair_map.begin(); iter != phrase_pair_map.end(); ++iter ) {
- pair<string, string> phrase_pair = iter->first;
- string source_string = phrase_pair.first;
- string target_string = phrase_pair.second;
+ pair<string, string> phrase_pair = iter->first;
+ string source_string = phrase_pair.first;
+ string target_string = phrase_pair.second;
- vector<float> fs(m_numModels);
- map<string,multiModelCountsStatistics*>* allStats = new(map<string,multiModelCountsStatistics*>);
+ vector<float> fs(m_numModels);
+ map<string,multiModelCountsStatistics*>* allStats = new(map<string,multiModelCountsStatistics*>);
- Phrase sourcePhrase(0);
- sourcePhrase.CreateFromString(Input, m_input, source_string, factorDelimiter, NULL);
+ Phrase sourcePhrase(0);
+ sourcePhrase.CreateFromString(Input, m_input, source_string, factorDelimiter, NULL);
- CollectSufficientStatistics(sourcePhrase, fs, allStats); //optimization potential: only call this once per source phrase
+ CollectSufficientStatistics(sourcePhrase, fs, allStats); //optimization potential: only call this once per source phrase
- //phrase pair not found; leave cache empty
- if (allStats->find(target_string) == allStats->end()) {
- RemoveAllInMap(*allStats);
- delete allStats;
- continue;
- }
+ //phrase pair not found; leave cache empty
+ if (allStats->find(target_string) == allStats->end()) {
+ RemoveAllInMap(*allStats);
+ delete allStats;
+ continue;
+ }
- multiModelCountsStatisticsOptimization * targetStatistics = new multiModelCountsStatisticsOptimization();
- targetStatistics->targetPhrase = new TargetPhrase(*(*allStats)[target_string]->targetPhrase);
- targetStatistics->fs = fs;
- targetStatistics->fst = (*allStats)[target_string]->fst;
- targetStatistics->ft = (*allStats)[target_string]->ft;
- targetStatistics->f = iter->second;
+ multiModelCountsStatisticsOptimization * targetStatistics = new multiModelCountsStatisticsOptimization();
+ targetStatistics->targetPhrase = new TargetPhrase(*(*allStats)[target_string]->targetPhrase);
+ targetStatistics->fs = fs;
+ targetStatistics->fst = (*allStats)[target_string]->fst;
+ targetStatistics->ft = (*allStats)[target_string]->ft;
+ targetStatistics->f = iter->second;
- try {
- pair<vector< set<size_t> >, vector< set<size_t> > > alignment = GetAlignmentsForLexWeights(sourcePhrase, static_cast<const Phrase&>(*targetStatistics->targetPhrase), targetStatistics->targetPhrase->GetAlignTerm());
- targetStatistics->lexCachee2f = CacheLexicalStatistics(static_cast<const Phrase&>(*targetStatistics->targetPhrase), sourcePhrase, alignment.second, m_lexTable_e2f, m_output, m_input );
- targetStatistics->lexCachef2e = CacheLexicalStatistics(sourcePhrase, static_cast<const Phrase&>(*targetStatistics->targetPhrase), alignment.first, m_lexTable_f2e, m_input, m_output );
+ try {
+ pair<vector< set<size_t> >, vector< set<size_t> > > alignment = GetAlignmentsForLexWeights(sourcePhrase, static_cast<const Phrase&>(*targetStatistics->targetPhrase), targetStatistics->targetPhrase->GetAlignTerm());
+ targetStatistics->lexCachee2f = CacheLexicalStatistics(static_cast<const Phrase&>(*targetStatistics->targetPhrase), sourcePhrase, alignment.second, m_lexTable_e2f, m_output, m_input );
+ targetStatistics->lexCachef2e = CacheLexicalStatistics(sourcePhrase, static_cast<const Phrase&>(*targetStatistics->targetPhrase), alignment.first, m_lexTable_f2e, m_input, m_output );
- optimizerStats.push_back(targetStatistics);
- }
- catch (AlignmentException& e) {}
+ optimizerStats.push_back(targetStatistics);
+ } catch (AlignmentException& e) {}
- RemoveAllInMap(*allStats);
- delete allStats;
- }
+ RemoveAllInMap(*allStats);
+ delete allStats;
+ }
- Sentence sentence;
- CleanUpAfterSentenceProcessing(sentence); // free memory used by compact phrase tables
+ Sentence sentence;
+ CleanUpAfterSentenceProcessing(sentence); // free memory used by compact phrase tables
- vector<float> ret (m_numModels*4);
- for (size_t iFeature=0; iFeature < 4; iFeature++) {
+ vector<float> ret (m_numModels*4);
+ for (size_t iFeature=0; iFeature < 4; iFeature++) {
- CrossEntropyCounts * ObjectiveFunction = new CrossEntropyCounts(optimizerStats, this, iFeature);
+ CrossEntropyCounts * ObjectiveFunction = new CrossEntropyCounts(optimizerStats, this, iFeature);
- vector<float> weight_vector = Optimize(ObjectiveFunction, m_numModels);
+ vector<float> weight_vector = Optimize(ObjectiveFunction, m_numModels);
- if (m_mode == "interpolate") {
- weight_vector = normalizeWeights(weight_vector);
- }
- else if (m_mode == "instance_weighting") {
- float first_value = weight_vector[0];
- for (size_t i=0; i < m_numModels; i++) {
- weight_vector[i] = weight_vector[i]/first_value;
- }
- }
- cerr << "Weight vector for feature " << iFeature << ": ";
- for (size_t i=0; i < m_numModels; i++) {
- ret[(iFeature*m_numModels)+i] = weight_vector[i];
- cerr << weight_vector[i] << " ";
- }
- cerr << endl;
- delete ObjectiveFunction;
+ if (m_mode == "interpolate") {
+ weight_vector = normalizeWeights(weight_vector);
+ } else if (m_mode == "instance_weighting") {
+ float first_value = weight_vector[0];
+ for (size_t i=0; i < m_numModels; i++) {
+ weight_vector[i] = weight_vector[i]/first_value;
+ }
+ }
+ cerr << "Weight vector for feature " << iFeature << ": ";
+ for (size_t i=0; i < m_numModels; i++) {
+ ret[(iFeature*m_numModels)+i] = weight_vector[i];
+ cerr << weight_vector[i] << " ";
}
+ cerr << endl;
+ delete ObjectiveFunction;
+ }
- RemoveAllInColl(optimizerStats);
- return ret;
+ RemoveAllInColl(optimizerStats);
+ return ret;
}
double CrossEntropyCounts::operator() ( const dlib::matrix<double,0,1>& arg) const
{
- double total = 0.0;
- double n = 0.0;
- std::vector<float> weight_vector (m_model->m_numModels);
+ double total = 0.0;
+ double n = 0.0;
+ std::vector<float> weight_vector (m_model->m_numModels);
- for (int i=0; i < arg.nr(); i++) {
- weight_vector[i] = arg(i);
- }
- if (m_model->m_mode == "interpolate") {
- weight_vector = m_model->normalizeWeights(weight_vector);
- }
-
- for ( std::vector<multiModelCountsStatisticsOptimization*>::const_iterator iter = m_optimizerStats.begin(); iter != m_optimizerStats.end(); ++iter ) {
- multiModelCountsStatisticsOptimization* statistics = *iter;
- size_t f = statistics->f;
+ for (int i=0; i < arg.nr(); i++) {
+ weight_vector[i] = arg(i);
+ }
+ if (m_model->m_mode == "interpolate") {
+ weight_vector = m_model->normalizeWeights(weight_vector);
+ }
- double score;
- if (m_iFeature == 0) {
- score = m_model->m_combineFunction(statistics->fst, statistics->ft, weight_vector);
- }
- else if (m_iFeature == 1) {
- score = m_model->ComputeWeightedLexicalTranslationFromCache(statistics->lexCachee2f, weight_vector);
- }
- else if (m_iFeature == 2) {
- score = m_model->m_combineFunction(statistics->fst, statistics->fs, weight_vector);
- }
- else if (m_iFeature == 3) {
- score = m_model->ComputeWeightedLexicalTranslationFromCache(statistics->lexCachef2e, weight_vector);
- }
- else {
- score = 0;
- UTIL_THROW(util::Exception, "Trying to optimize feature that I don't know. Aborting");
- }
- total -= (FloorScore(TransformScore(score))/TransformScore(2))*f;
- n += f;
+ for ( std::vector<multiModelCountsStatisticsOptimization*>::const_iterator iter = m_optimizerStats.begin(); iter != m_optimizerStats.end(); ++iter ) {
+ multiModelCountsStatisticsOptimization* statistics = *iter;
+ size_t f = statistics->f;
+
+ double score;
+ if (m_iFeature == 0) {
+ score = m_model->m_combineFunction(statistics->fst, statistics->ft, weight_vector);
+ } else if (m_iFeature == 1) {
+ score = m_model->ComputeWeightedLexicalTranslationFromCache(statistics->lexCachee2f, weight_vector);
+ } else if (m_iFeature == 2) {
+ score = m_model->m_combineFunction(statistics->fst, statistics->fs, weight_vector);
+ } else if (m_iFeature == 3) {
+ score = m_model->ComputeWeightedLexicalTranslationFromCache(statistics->lexCachef2e, weight_vector);
+ } else {
+ score = 0;
+ UTIL_THROW(util::Exception, "Trying to optimize feature that I don't know. Aborting");
}
- return total/n;
+ total -= (FloorScore(TransformScore(score))/TransformScore(2))*f;
+ n += f;
+ }
+ return total/n;
}
#endif
// calculate weighted probability based on instance weighting of joint counts and marginal counts
-double InstanceWeighting(vector<float> &joint_counts, vector<float> &marginals, vector<float> &multimodelweights) {
+double InstanceWeighting(vector<float> &joint_counts, vector<float> &marginals, vector<float> &multimodelweights)
+{
- double joint_counts_weighted = inner_product(joint_counts.begin(), joint_counts.end(), multimodelweights.begin(), 0.0);
- double marginals_weighted = inner_product(marginals.begin(), marginals.end(), multimodelweights.begin(), 0.0);
+ double joint_counts_weighted = inner_product(joint_counts.begin(), joint_counts.end(), multimodelweights.begin(), 0.0);
+ double marginals_weighted = inner_product(marginals.begin(), marginals.end(), multimodelweights.begin(), 0.0);
- if (marginals_weighted == 0) {
- return 0;
- }
- else {
- return joint_counts_weighted/marginals_weighted;
- }
+ if (marginals_weighted == 0) {
+ return 0;
+ } else {
+ return joint_counts_weighted/marginals_weighted;
+ }
}
// calculate linear interpolation of relative frequency estimates based on joint count and marginal counts
//unused for now; enable in config?
-double LinearInterpolationFromCounts(vector<float> &joint_counts, vector<float> &marginals, vector<float> &multimodelweights) {
+double LinearInterpolationFromCounts(vector<float> &joint_counts, vector<float> &marginals, vector<float> &multimodelweights)
+{
- vector<float> p(marginals.size());
+ vector<float> p(marginals.size());
- for (size_t i=0;i < marginals.size();i++) {
- if (marginals[i] != 0) {
- p[i] = joint_counts[i]/marginals[i];
- }
+ for (size_t i=0; i < marginals.size(); i++) {
+ if (marginals[i] != 0) {
+ p[i] = joint_counts[i]/marginals[i];
}
+ }
- double p_weighted = inner_product(p.begin(), p.end(), multimodelweights.begin(), 0.0);
+ double p_weighted = inner_product(p.begin(), p.end(), multimodelweights.begin(), 0.0);
- return p_weighted;
+ return p_weighted;
}
diff --git a/moses/TranslationModel/PhraseDictionaryMultiModelCounts.h b/moses/TranslationModel/PhraseDictionaryMultiModelCounts.h
index ef89272c3..04be77dd6 100644
--- a/moses/TranslationModel/PhraseDictionaryMultiModelCounts.h
+++ b/moses/TranslationModel/PhraseDictionaryMultiModelCounts.h
@@ -35,32 +35,33 @@ extern std::vector<std::string> tokenize( const char*);
namespace Moses
{
- typedef boost::unordered_map<std::string, double > lexicalMap;
- typedef boost::unordered_map<std::string, lexicalMap > lexicalMapJoint;
- typedef std::pair<std::vector<float>, std::vector<float> > lexicalPair;
- typedef std::vector<std::vector<lexicalPair> > lexicalCache;
+typedef boost::unordered_map<std::string, double > lexicalMap;
+typedef boost::unordered_map<std::string, lexicalMap > lexicalMapJoint;
+typedef std::pair<std::vector<float>, std::vector<float> > lexicalPair;
+typedef std::vector<std::vector<lexicalPair> > lexicalCache;
- struct multiModelCountsStatistics : multiModelStatistics {
- std::vector<float> fst, ft;
- };
+struct multiModelCountsStatistics : multiModelStatistics {
+ std::vector<float> fst, ft;
+};
- struct multiModelCountsStatisticsOptimization: multiModelCountsStatistics {
- std::vector<float> fs;
- lexicalCache lexCachee2f, lexCachef2e;
- size_t f;
- };
+struct multiModelCountsStatisticsOptimization: multiModelCountsStatistics {
+ std::vector<float> fs;
+ lexicalCache lexCachee2f, lexCachef2e;
+ size_t f;
+};
- struct lexicalTable {
- lexicalMapJoint joint;
- lexicalMap marginal;
- };
+struct lexicalTable {
+ lexicalMapJoint joint;
+ lexicalMap marginal;
+};
- double InstanceWeighting(std::vector<float> &joint_counts, std::vector<float> &marginals, std::vector<float> &multimodelweights);
- double LinearInterpolationFromCounts(std::vector<float> &joint_counts, std::vector<float> &marginals, std::vector<float> &multimodelweights);
+double InstanceWeighting(std::vector<float> &joint_counts, std::vector<float> &marginals, std::vector<float> &multimodelweights);
+double LinearInterpolationFromCounts(std::vector<float> &joint_counts, std::vector<float> &marginals, std::vector<float> &multimodelweights);
//thrown if alignment information does not match phrase pair (out-of-bound alignment points)
-class AlignmentException : public std::runtime_error {
+class AlignmentException : public std::runtime_error
+{
public:
AlignmentException() : std::runtime_error("AlignmentException") { }
};
@@ -72,10 +73,10 @@ class PhraseDictionaryMultiModelCounts: public PhraseDictionaryMultiModel
{
#ifdef WITH_DLIB
-friend class CrossEntropyCounts;
+ friend class CrossEntropyCounts;
#endif
-typedef std::vector< std::set<size_t> > AlignVector;
+ typedef std::vector< std::set<size_t> > AlignVector;
public:
@@ -116,23 +117,22 @@ class CrossEntropyCounts: public OptimizationObjective
{
public:
- CrossEntropyCounts (
- std::vector<multiModelCountsStatisticsOptimization*> &optimizerStats,
- PhraseDictionaryMultiModelCounts * model,
- size_t iFeature
- )
- {
- m_optimizerStats = optimizerStats;
- m_model = model;
- m_iFeature = iFeature;
- }
+ CrossEntropyCounts (
+ std::vector<multiModelCountsStatisticsOptimization*> &optimizerStats,
+ PhraseDictionaryMultiModelCounts * model,
+ size_t iFeature
+ ) {
+ m_optimizerStats = optimizerStats;
+ m_model = model;
+ m_iFeature = iFeature;
+ }
- double operator() ( const dlib::matrix<double,0,1>& arg) const;
+ double operator() ( const dlib::matrix<double,0,1>& arg) const;
private:
- std::vector<multiModelCountsStatisticsOptimization*> m_optimizerStats;
- PhraseDictionaryMultiModelCounts * m_model;
- size_t m_iFeature;
+ std::vector<multiModelCountsStatisticsOptimization*> m_optimizerStats;
+ PhraseDictionaryMultiModelCounts * m_model;
+ size_t m_iFeature;
};
#endif
diff --git a/moses/TranslationModel/PhraseDictionaryNodeMemory.cpp b/moses/TranslationModel/PhraseDictionaryNodeMemory.cpp
index 465c076d5..389c74394 100644
--- a/moses/TranslationModel/PhraseDictionaryNodeMemory.cpp
+++ b/moses/TranslationModel/PhraseDictionaryNodeMemory.cpp
@@ -31,8 +31,8 @@ namespace Moses
PhraseDictionaryNodeMemory::~PhraseDictionaryNodeMemory()
{
for (TerminalMap::iterator iter = m_sourceTermMap.begin(); iter != m_sourceTermMap.end(); ++iter) {
- const PhraseDictionaryNodeMemory *node = iter->second;
- delete node;
+ const PhraseDictionaryNodeMemory *node = iter->second;
+ delete node;
}
for (NonTerminalMap::iterator iter = m_nonTermMap.begin(); iter != m_nonTermMap.end(); ++iter) {
const PhraseDictionaryNodeMemory *node = iter->second;
@@ -41,7 +41,8 @@ PhraseDictionaryNodeMemory::~PhraseDictionaryNodeMemory()
delete m_targetPhraseCollection;
}
-TargetPhraseCollection &PhraseDictionaryNodeMemory::GetOrCreateTargetPhraseCollection() {
+TargetPhraseCollection &PhraseDictionaryNodeMemory::GetOrCreateTargetPhraseCollection()
+{
if (m_targetPhraseCollection == NULL)
m_targetPhraseCollection = new TargetPhraseCollection();
return *m_targetPhraseCollection;
@@ -138,9 +139,9 @@ void PhraseDictionaryNodeMemory::Clear()
m_sourceTermMap.clear();
m_nonTermMap.clear();
delete m_targetPhraseCollection;
-
+
}
-
+
std::ostream& operator<<(std::ostream &out, const PhraseDictionaryNodeMemory &node)
{
out << node.GetTargetPhraseCollection();
diff --git a/moses/TranslationModel/PhraseDictionaryNodeMemory.h b/moses/TranslationModel/PhraseDictionaryNodeMemory.h
index 672196ba2..136e10c0a 100644
--- a/moses/TranslationModel/PhraseDictionaryNodeMemory.h
+++ b/moses/TranslationModel/PhraseDictionaryNodeMemory.h
@@ -39,8 +39,8 @@ namespace Moses
class PhraseDictionaryMemory;
class PhraseDictionaryFuzzyMatch;
-
- //! @todo why?
+
+//! @todo why?
class NonTerminalMapKeyHasher
{
public:
@@ -152,7 +152,7 @@ public:
}
void Clear();
-
+
TO_STRING();
};
diff --git a/moses/TranslationModel/PhraseDictionaryTree.cpp b/moses/TranslationModel/PhraseDictionaryTree.cpp
index 321924dfe..c5eefc290 100644
--- a/moses/TranslationModel/PhraseDictionaryTree.cpp
+++ b/moses/TranslationModel/PhraseDictionaryTree.cpp
@@ -157,7 +157,8 @@ PhraseDictionaryTree::PrefixPtr::operator bool() const
typedef LVoc<std::string> WordVoc;
-class PDTimp {
+class PDTimp
+{
public:
typedef PrefixTreeF<LabelId,OFF_T> PTF;
typedef FilePtr<PTF> CPT;
@@ -481,7 +482,7 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out)
std::vector<OFF_T> vo;
size_t lnc=0;
size_t numElement = NOT_FOUND; // 3=old format, 5=async format which include word alignment info
- size_t missingAlignmentCount = 0;
+ size_t missingAlignmentCount = 0;
while(getline(inFile, line)) {
++lnc;
@@ -553,9 +554,9 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out)
if (!sparseFeatureString.empty()) {
std::vector<std::string> sparseTokens = Tokenize(sparseFeatureString);
if (sparseTokens.size() % 2 != 0) {
- TRACE_ERR("ERROR: incorrectly formatted sparse feature string: " <<
- sparseFeatureString << std::endl);
- abort();
+ TRACE_ERR("ERROR: incorrectly formatted sparse feature string: " <<
+ sparseFeatureString << std::endl);
+ abort();
}
for (size_t i = 0; i < sparseTokens.size(); i+=2) {
fnames.push_back(imp->tv.add(sparseTokens[i]));
@@ -624,7 +625,7 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out)
if ( PrintWordAlignment()) {
TRACE_ERR("Count of lines with missing alignments: " <<
- missingAlignmentCount << "/" << lnc << "\n");
+ missingAlignmentCount << "/" << lnc << "\n");
}
fClose(os);
diff --git a/moses/TranslationModel/PhraseDictionaryTree.h b/moses/TranslationModel/PhraseDictionaryTree.h
index 1b88637c3..6214c8194 100644
--- a/moses/TranslationModel/PhraseDictionaryTree.h
+++ b/moses/TranslationModel/PhraseDictionaryTree.h
@@ -31,8 +31,7 @@ class PDTimp;
typedef PrefixTreeF<LabelId,OFF_T> PTF;
//typedef std::pair<std::vector<std::string const*>,Scores > StringTgtCand;
-struct StringTgtCand
-{
+struct StringTgtCand {
typedef std::vector<std::string const*> Tokens;
Tokens tokens;
Scores scores;
@@ -86,7 +85,7 @@ public:
// get the target candidates for a given phrase
void GetTargetCandidates(const std::vector<std::string>& src,
std::vector<StringTgtCand>& rv) const;
-
+
// get the target candidates for a given phrase
void GetTargetCandidates(const std::vector<std::string>& src,
diff --git a/moses/TranslationModel/RuleTable/Loader.h b/moses/TranslationModel/RuleTable/Loader.h
index 4d3e03351..48390e37e 100644
--- a/moses/TranslationModel/RuleTable/Loader.h
+++ b/moses/TranslationModel/RuleTable/Loader.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -32,7 +32,7 @@ namespace Moses
*/
class RuleTableLoader
{
- public:
+public:
virtual ~RuleTableLoader() {}
virtual bool Load(const std::vector<FactorType> &input,
@@ -41,7 +41,7 @@ class RuleTableLoader
size_t tableLimit,
RuleTableTrie &) = 0;
- protected:
+protected:
// Provide access to RuleTableTrie's private SortAndPrune function.
void SortAndPrune(RuleTableTrie &ruleTable) {
ruleTable.SortAndPrune();
@@ -50,10 +50,10 @@ class RuleTableLoader
// Provide access to RuleTableTrie's private
// GetOrCreateTargetPhraseCollection function.
TargetPhraseCollection &GetOrCreateTargetPhraseCollection(
- RuleTableTrie &ruleTable
- , const Phrase &source
- , const TargetPhrase &target
- , const Word *sourceLHS) {
+ RuleTableTrie &ruleTable
+ , const Phrase &source
+ , const TargetPhrase &target
+ , const Word *sourceLHS) {
return ruleTable.GetOrCreateTargetPhraseCollection(source, target, sourceLHS);
}
};
diff --git a/moses/TranslationModel/RuleTable/LoaderCompact.cpp b/moses/TranslationModel/RuleTable/LoaderCompact.cpp
index f235b3e79..2b4a6003a 100644
--- a/moses/TranslationModel/RuleTable/LoaderCompact.cpp
+++ b/moses/TranslationModel/RuleTable/LoaderCompact.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -84,9 +84,9 @@ bool RuleTableLoaderCompact::Load(const std::vector<FactorType> &input,
}
void RuleTableLoaderCompact::LoadVocabularySection(
- LineReader &reader,
- const std::vector<FactorType> &factorTypes,
- std::vector<Word> &vocabulary)
+ LineReader &reader,
+ const std::vector<FactorType> &factorTypes,
+ std::vector<Word> &vocabulary)
{
// Read symbol count.
reader.ReadLine();
@@ -106,10 +106,10 @@ void RuleTableLoaderCompact::LoadVocabularySection(
}
void RuleTableLoaderCompact::LoadPhraseSection(
- LineReader &reader,
- const std::vector<Word> &vocab,
- std::vector<Phrase> &rhsPhrases,
- std::vector<size_t> &lhsIds)
+ LineReader &reader,
+ const std::vector<Word> &vocab,
+ std::vector<Phrase> &rhsPhrases,
+ std::vector<size_t> &lhsIds)
{
// Read phrase count.
reader.ReadLine();
@@ -132,7 +132,7 @@ void RuleTableLoaderCompact::LoadPhraseSection(
}
void RuleTableLoaderCompact::LoadAlignmentSection(
- LineReader &reader, std::vector<const AlignmentInfo *> &alignmentSets, std::vector<Phrase> &sourcePhrases)
+ LineReader &reader, std::vector<const AlignmentInfo *> &alignmentSets, std::vector<Phrase> &sourcePhrases)
{
// Read alignment set count.
reader.ReadLine();
@@ -144,8 +144,8 @@ void RuleTableLoaderCompact::LoadAlignmentSection(
std::vector<size_t> points;
for (size_t i = 0; i < alignmentSetCount; ++i) {
// Read alignment set, lookup in collection, and store pointer.
- alignTerm.clear();
- alignNonTerm.clear();
+ alignTerm.clear();
+ alignNonTerm.clear();
tokens.clear();
reader.ReadLine();
@@ -157,11 +157,10 @@ void RuleTableLoaderCompact::LoadAlignmentSection(
std::pair<size_t, size_t> alignmentPair(points[0], points[1]);
if (sourcePhrases[i].GetWord(alignmentPair.first).IsNonTerminal()) {
- alignNonTerm.insert(alignmentPair);
+ alignNonTerm.insert(alignmentPair);
+ } else {
+ alignTerm.insert(alignmentPair);
}
- else {
- alignTerm.insert(alignmentPair);
- }
}
alignmentSets[i*2] = AlignmentInfoCollection::Instance().Add(alignNonTerm);
@@ -170,13 +169,13 @@ void RuleTableLoaderCompact::LoadAlignmentSection(
}
bool RuleTableLoaderCompact::LoadRuleSection(
- LineReader &reader,
- const std::vector<Word> &vocab,
- const std::vector<Phrase> &sourcePhrases,
- const std::vector<Phrase> &targetPhrases,
- const std::vector<size_t> &targetLhsIds,
- const std::vector<const AlignmentInfo *> &alignmentSets,
- RuleTableTrie &ruleTable)
+ LineReader &reader,
+ const std::vector<Word> &vocab,
+ const std::vector<Phrase> &sourcePhrases,
+ const std::vector<Phrase> &targetPhrases,
+ const std::vector<size_t> &targetLhsIds,
+ const std::vector<const AlignmentInfo *> &alignmentSets,
+ RuleTableTrie &ruleTable)
{
// Read rule count.
reader.ReadLine();
@@ -232,7 +231,7 @@ bool RuleTableLoaderCompact::LoadRuleSection(
// Insert rule into table.
TargetPhraseCollection &coll = GetOrCreateTargetPhraseCollection(
- ruleTable, sourcePhrase, *targetPhrase, &sourceLHS);
+ ruleTable, sourcePhrase, *targetPhrase, &sourceLHS);
coll.Add(targetPhrase);
}
diff --git a/moses/TranslationModel/RuleTable/LoaderCompact.h b/moses/TranslationModel/RuleTable/LoaderCompact.h
index 314cfca57..26e19fce6 100644
--- a/moses/TranslationModel/RuleTable/LoaderCompact.h
+++ b/moses/TranslationModel/RuleTable/LoaderCompact.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -35,14 +35,14 @@ class RuleTableTrie;
//! @todo ask phil williams
class RuleTableLoaderCompact : public RuleTableLoader
{
- public:
+public:
bool Load(const std::vector<FactorType> &input,
const std::vector<FactorType> &output,
const std::string &inFile,
size_t tableLimit,
RuleTableTrie &);
- private:
+private:
struct LineReader {
LineReader(std::istream &input) : m_input(input), m_lineNum(0) {}
void ReadLine() {
@@ -78,8 +78,7 @@ class RuleTableLoaderCompact : public RuleTableLoader
// Like Tokenize() but records starting positions of tokens (instead of
// copying substrings) and assumes delimiter is ASCII space character.
- void FindTokens(std::vector<size_t> &output, const std::string &str) const
- {
+ void FindTokens(std::vector<size_t> &output, const std::string &str) const {
// Skip delimiters at beginning.
size_t lastPos = str.find_first_not_of(' ', 0);
// Find first "non-delimiter".
diff --git a/moses/TranslationModel/RuleTable/LoaderFactory.cpp b/moses/TranslationModel/RuleTable/LoaderFactory.cpp
index b3bd00555..cdbfc965a 100644
--- a/moses/TranslationModel/RuleTable/LoaderFactory.cpp
+++ b/moses/TranslationModel/RuleTable/LoaderFactory.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -37,7 +37,7 @@ namespace Moses
// Determines the rule table type by peeking inside the file then creates
// a suitable RuleTableLoader object.
std::auto_ptr<RuleTableLoader> RuleTableLoaderFactory::Create(
- const std::string &path)
+ const std::string &path)
{
InputFileStream input(path);
std::string line;
@@ -54,17 +54,15 @@ std::auto_ptr<RuleTableLoader> RuleTableLoaderFactory::Create(
msg << "Unsupported compact rule table format: " << tokens[0];
UserMessage::Add(msg.str());
return std::auto_ptr<RuleTableLoader>();
+ } else if (tokens[0] == "[X]" && tokens[1] == "|||") {
+ return std::auto_ptr<RuleTableLoader>(new
+ RuleTableLoaderHiero());
+
}
- else if (tokens[0] == "[X]" && tokens[1] == "|||") {
- return std::auto_ptr<RuleTableLoader>(new
- RuleTableLoaderHiero());
-
- }
-
+
return std::auto_ptr<RuleTableLoader>(new RuleTableLoaderStandard());
- }
- else
- { // empty phrase table
+ } else {
+ // empty phrase table
return std::auto_ptr<RuleTableLoader>(new RuleTableLoaderStandard());
}
}
diff --git a/moses/TranslationModel/RuleTable/LoaderFactory.h b/moses/TranslationModel/RuleTable/LoaderFactory.h
index 01c168680..c695738e4 100644
--- a/moses/TranslationModel/RuleTable/LoaderFactory.h
+++ b/moses/TranslationModel/RuleTable/LoaderFactory.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -30,7 +30,7 @@ class RuleTableLoader;
//! Creates a RuleTableLoader object suitable for loading the specified file.
class RuleTableLoaderFactory
{
- public:
+public:
static std::auto_ptr<RuleTableLoader> Create(const std::string &);
};
diff --git a/moses/TranslationModel/RuleTable/LoaderHiero.cpp b/moses/TranslationModel/RuleTable/LoaderHiero.cpp
index c0526be02..81289d9b2 100644
--- a/moses/TranslationModel/RuleTable/LoaderHiero.cpp
+++ b/moses/TranslationModel/RuleTable/LoaderHiero.cpp
@@ -11,19 +11,20 @@
using namespace std;
-namespace Moses {
-
+namespace Moses
+{
+
bool RuleTableLoaderHiero::Load(const std::vector<FactorType> &input,
- const std::vector<FactorType> &output,
- const std::string &inFile,
- size_t tableLimit,
- RuleTableTrie &ruleTable)
+ const std::vector<FactorType> &output,
+ const std::string &inFile,
+ size_t tableLimit,
+ RuleTableTrie &ruleTable)
{
bool ret = RuleTableLoaderStandard::Load(HieroFormat
- ,input, output
- ,inFile
- ,tableLimit
- ,ruleTable);
+ ,input, output
+ ,inFile
+ ,tableLimit
+ ,ruleTable);
return ret;
}
diff --git a/moses/TranslationModel/RuleTable/LoaderHiero.h b/moses/TranslationModel/RuleTable/LoaderHiero.h
index 1f6b66725..099787281 100644
--- a/moses/TranslationModel/RuleTable/LoaderHiero.h
+++ b/moses/TranslationModel/RuleTable/LoaderHiero.h
@@ -11,7 +11,8 @@
#include "LoaderStandard.h"
-namespace Moses {
+namespace Moses
+{
//! specific implementation of SCFG loader to load rule tables formatted in Hiero-style format
class RuleTableLoaderHiero : public RuleTableLoaderStandard
diff --git a/moses/TranslationModel/RuleTable/LoaderStandard.cpp b/moses/TranslationModel/RuleTable/LoaderStandard.cpp
index 566684775..fb5052c40 100644
--- a/moses/TranslationModel/RuleTable/LoaderStandard.cpp
+++ b/moses/TranslationModel/RuleTable/LoaderStandard.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -59,38 +59,34 @@ bool RuleTableLoaderStandard::Load(const std::vector<FactorType> &input
return ret;
}
-
+
void ReformatHieroRule(int sourceTarget, string &phrase, map<size_t, pair<size_t, size_t> > &ntAlign)
{
vector<string> toks;
Tokenize(toks, phrase, " ");
- for (size_t i = 0; i < toks.size(); ++i)
- {
+ for (size_t i = 0; i < toks.size(); ++i) {
string &tok = toks[i];
size_t tokLen = tok.size();
- if (tok.substr(0, 1) == "[" && tok.substr(tokLen - 1, 1) == "]")
- { // no-term
+ if (tok.substr(0, 1) == "[" && tok.substr(tokLen - 1, 1) == "]") {
+ // no-term
vector<string> split = Tokenize(tok, ",");
CHECK(split.size() == 2);
-
+
tok = "[X]" + split[0] + "]";
size_t coIndex = Scan<size_t>(split[1]);
-
+
pair<size_t, size_t> &alignPoint = ntAlign[coIndex];
- if (sourceTarget == 0)
- {
+ if (sourceTarget == 0) {
alignPoint.first = i;
- }
- else
- {
+ } else {
alignPoint.second = i;
}
}
}
-
+
phrase = Join(" ", toks) + " [X]";
-
+
}
void ReformateHieroScore(string &scoreString)
@@ -98,8 +94,7 @@ void ReformateHieroScore(string &scoreString)
vector<string> toks;
Tokenize(toks, scoreString, " ");
- for (size_t i = 0; i < toks.size(); ++i)
- {
+ for (size_t i = 0; i < toks.size(); ++i) {
string &tok = toks[i];
vector<string> nameValue = Tokenize(tok, "=");
CHECK(nameValue.size() == 2);
@@ -108,49 +103,48 @@ void ReformateHieroScore(string &scoreString)
score = exp(-score);
tok = SPrint(score);
}
-
+
scoreString = Join(" ", toks);
}
-
+
void ReformatHieroRule(const string &lineOrig, string &out)
-{
+{
vector<string> tokens;
vector<float> scoreVector;
-
+
TokenizeMultiCharSeparator(tokens, lineOrig, "|||" );
string &sourcePhraseString = tokens[1]
- , &targetPhraseString = tokens[2]
- , &scoreString = tokens[3];
+ , &targetPhraseString = tokens[2]
+ , &scoreString = tokens[3];
map<size_t, pair<size_t, size_t> > ntAlign;
ReformatHieroRule(0, sourcePhraseString, ntAlign);
ReformatHieroRule(1, targetPhraseString, ntAlign);
ReformateHieroScore(scoreString);
-
+
stringstream align;
map<size_t, pair<size_t, size_t> >::const_iterator iterAlign;
- for (iterAlign = ntAlign.begin(); iterAlign != ntAlign.end(); ++iterAlign)
- {
+ for (iterAlign = ntAlign.begin(); iterAlign != ntAlign.end(); ++iterAlign) {
const pair<size_t, size_t> &alignPoint = iterAlign->second;
align << alignPoint.first << "-" << alignPoint.second << " ";
}
-
+
stringstream ret;
ret << sourcePhraseString << " ||| "
- << targetPhraseString << " ||| "
+ << targetPhraseString << " ||| "
<< scoreString << " ||| "
<< align.str();
-
+
out = ret.str();
}
-
+
bool RuleTableLoaderStandard::Load(FormatType format
- , const std::vector<FactorType> &input
- , const std::vector<FactorType> &output
- , const std::string &inFile
- , size_t /* tableLimit */
- , RuleTableTrie &ruleTable)
+ , const std::vector<FactorType> &input
+ , const std::vector<FactorType> &output
+ , const std::string &inFile
+ , size_t /* tableLimit */
+ , RuleTableTrie &ruleTable)
{
PrintUserTime(string("Start loading text SCFG phrase table. ") + (format==MosesFormat?"Moses ":"Hiero ") + " format");
@@ -174,7 +168,9 @@ bool RuleTableLoaderStandard::Load(FormatType format
while(true) {
try {
line = in.ReadLine();
- } catch (const util::EndOfFileException &e) { break; }
+ } catch (const util::EndOfFileException &e) {
+ break;
+ }
if (format == HieroFormat) { // inefficiently reformat line
hiero_before.assign(line.data(), line.size());
@@ -186,7 +182,7 @@ bool RuleTableLoaderStandard::Load(FormatType format
StringPiece sourcePhraseString(*pipes);
StringPiece targetPhraseString(*++pipes);
StringPiece scoreString(*++pipes);
-
+
StringPiece alignString;
if (++pipes) {
StringPiece temp(*pipes);
@@ -237,9 +233,9 @@ bool RuleTableLoaderStandard::Load(FormatType format
// rest of target phrase
targetPhrase->SetAlignmentInfo(alignString);
targetPhrase->SetTargetLHS(targetLHS);
-
+
//targetPhrase->SetDebugOutput(string("New Format pt ") + line);
-
+
if (++pipes) {
StringPiece sparseString(*pipes);
targetPhrase->SetSparseScore(&ruleTable, sparseString);
diff --git a/moses/TranslationModel/RuleTable/LoaderStandard.h b/moses/TranslationModel/RuleTable/LoaderStandard.h
index 4beefea39..b47f7c00b 100644
--- a/moses/TranslationModel/RuleTable/LoaderStandard.h
+++ b/moses/TranslationModel/RuleTable/LoaderStandard.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -35,7 +35,7 @@ protected:
const std::string &inFile,
size_t tableLimit,
RuleTableTrie &);
- public:
+public:
bool Load(const std::vector<FactorType> &input,
const std::vector<FactorType> &output,
const std::string &inFile,
diff --git a/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.cpp b/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.cpp
index 8f736af60..1f8ebab15 100644
--- a/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.cpp
+++ b/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.cpp
@@ -18,14 +18,14 @@
using namespace std;
-namespace Moses
+namespace Moses
{
PhraseDictionaryALSuffixArray::PhraseDictionaryALSuffixArray(const std::string &line)
-: PhraseDictionaryMemory("PhraseDictionaryALSuffixArray", line)
+ : PhraseDictionaryMemory("PhraseDictionaryALSuffixArray", line)
{
const StaticData &staticData = StaticData::Instance();
if (staticData.ThreadCount() > 1) {
- throw runtime_error("Suffix array implementation is not threadsafe");
+ throw runtime_error("Suffix array implementation is not threadsafe");
}
}
@@ -33,14 +33,14 @@ void PhraseDictionaryALSuffixArray::InitializeForInput(InputType const& source)
{
// populate with rules for this sentence
long translationId = source.GetTranslationId();
-
+
string grammarFile = GetFilePath() + "/grammar." + SPrint(translationId) + ".gz";
-
+
std::auto_ptr<RuleTableLoader> loader =
- RuleTableLoaderFactory::Create(grammarFile);
+ RuleTableLoaderFactory::Create(grammarFile);
bool ret = loader->Load(m_input, m_output, grammarFile, m_tableLimit,
*this);
-
+
CHECK(ret);
}
diff --git a/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.h b/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.h
index 81e1e02cf..aa4c15258 100644
--- a/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.h
+++ b/moses/TranslationModel/RuleTable/PhraseDictionaryALSuffixArray.h
@@ -11,26 +11,28 @@
#include "moses/TranslationModel/PhraseDictionaryMemory.h"
-namespace Moses {
-
+namespace Moses
+{
+
/** Implementation of in-memory phrase table for use with Adam Lopez's suffix array.
* Does 2 things that the normal in-memory pt doesn't do:
* 1. Loads grammar for a sentence to be decoded only when the sentence is being decoded. Unload afterwards
2. Format of the pt file follows Hiero, rather than Moses
- */
+ */
class PhraseDictionaryALSuffixArray : public PhraseDictionaryMemory
{
public:
PhraseDictionaryALSuffixArray(const std::string &line);
- bool InitDictionary()
- { return true; }
+ bool InitDictionary() {
+ return true;
+ }
void InitializeForInput(InputType const& source);
void CleanUpAfterSentenceProcessing(const InputType& source);
protected:
-
+
};
-
+
}
diff --git a/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp b/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp
index be6996399..669e7306b 100644
--- a/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp
+++ b/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp
@@ -3,17 +3,17 @@
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -49,316 +49,312 @@ using namespace std;
namespace Moses
{
- PhraseDictionaryFuzzyMatch::PhraseDictionaryFuzzyMatch(const std::string &line)
+PhraseDictionaryFuzzyMatch::PhraseDictionaryFuzzyMatch(const std::string &line)
: PhraseDictionary("PhraseDictionaryFuzzyMatch", line)
- {}
-
- bool PhraseDictionaryFuzzyMatch::Load(const std::vector<FactorType> &input
- , const std::vector<FactorType> &output
- , const std::string &initStr
- , size_t tableLimit)
- {
- m_tableLimit = tableLimit;
- m_input = &input;
- m_output = &output;
-
-
- cerr << "initStr=" << initStr << endl;
- m_config = Tokenize(initStr, ";");
- assert(m_config.size() == 3);
-
- m_FuzzyMatchWrapper = new tmmt::FuzzyMatchWrapper(m_config[0], m_config[1], m_config[2]);
-
- return true;
+{}
+
+bool PhraseDictionaryFuzzyMatch::Load(const std::vector<FactorType> &input
+ , const std::vector<FactorType> &output
+ , const std::string &initStr
+ , size_t tableLimit)
+{
+ m_tableLimit = tableLimit;
+ m_input = &input;
+ m_output = &output;
+
+
+ cerr << "initStr=" << initStr << endl;
+ m_config = Tokenize(initStr, ";");
+ assert(m_config.size() == 3);
+
+ m_FuzzyMatchWrapper = new tmmt::FuzzyMatchWrapper(m_config[0], m_config[1], m_config[2]);
+
+ return true;
+}
+
+ChartRuleLookupManager *PhraseDictionaryFuzzyMatch::CreateRuleLookupManager(
+ const InputType &sentence,
+ const ChartCellCollectionBase &cellCollection)
+{
+ return new ChartRuleLookupManagerMemoryPerSentence(sentence, cellCollection, *this);
+}
+
+
+int removedirectoryrecursively(const char *dirname)
+{
+ DIR *dir;
+ struct dirent *entry;
+ char path[PATH_MAX];
+
+ if (path == NULL) {
+ fprintf(stderr, "Out of memory error\n");
+ return 0;
}
-
- ChartRuleLookupManager *PhraseDictionaryFuzzyMatch::CreateRuleLookupManager(
- const InputType &sentence,
- const ChartCellCollectionBase &cellCollection)
- {
- return new ChartRuleLookupManagerMemoryPerSentence(sentence, cellCollection, *this);
+ dir = opendir(dirname);
+ if (dir == NULL) {
+ perror("Error opendir()");
+ return 0;
}
-
-
- int removedirectoryrecursively(const char *dirname)
- {
- DIR *dir;
- struct dirent *entry;
- char path[PATH_MAX];
-
- if (path == NULL) {
- fprintf(stderr, "Out of memory error\n");
- return 0;
- }
- dir = opendir(dirname);
- if (dir == NULL) {
- perror("Error opendir()");
- return 0;
- }
-
- while ((entry = readdir(dir)) != NULL) {
- if (strcmp(entry->d_name, ".") && strcmp(entry->d_name, "..")) {
- snprintf(path, (size_t) PATH_MAX, "%s/%s", dirname, entry->d_name);
- if (entry->d_type == DT_DIR) {
- removedirectoryrecursively(path);
- }
-
- remove(path);
- /*
- * Here, the actual deletion must be done. Beacuse this is
- * quite a dangerous thing to do, and this program is not very
- * well tested, we are just printing as if we are deleting.
- */
- //printf("(not really) Deleting: %s\n", path);
- /*
- * When you are finished testing this and feel you are ready to do the real
- * deleting, use this: remove*STUB*(path);
- * (see "man 3 remove")
- * Please note that I DONT TAKE RESPONSIBILITY for data you delete with this!
- */
+
+ while ((entry = readdir(dir)) != NULL) {
+ if (strcmp(entry->d_name, ".") && strcmp(entry->d_name, "..")) {
+ snprintf(path, (size_t) PATH_MAX, "%s/%s", dirname, entry->d_name);
+ if (entry->d_type == DT_DIR) {
+ removedirectoryrecursively(path);
}
-
+
+ remove(path);
+ /*
+ * Here, the actual deletion must be done. Beacuse this is
+ * quite a dangerous thing to do, and this program is not very
+ * well tested, we are just printing as if we are deleting.
+ */
+ //printf("(not really) Deleting: %s\n", path);
+ /*
+ * When you are finished testing this and feel you are ready to do the real
+ * deleting, use this: remove*STUB*(path);
+ * (see "man 3 remove")
+ * Please note that I DONT TAKE RESPONSIBILITY for data you delete with this!
+ */
}
- closedir(dir);
-
- rmdir(dirname);
- /*
- * Now the directory is emtpy, finally delete the directory itself. (Just
- * printing here, see above)
- */
- //printf("(not really) Deleting: %s\n", dirname);
-
- return 1;
+
+ }
+ closedir(dir);
+
+ rmdir(dirname);
+ /*
+ * Now the directory is emtpy, finally delete the directory itself. (Just
+ * printing here, see above)
+ */
+ //printf("(not really) Deleting: %s\n", dirname);
+
+ return 1;
+}
+
+void PhraseDictionaryFuzzyMatch::InitializeForInput(InputType const& inputSentence)
+{
+ char dirName[] = "/tmp/moses.XXXXXX";
+ char *temp = mkdtemp(dirName);
+ CHECK(temp);
+ string dirNameStr(dirName);
+
+ string inFileName(dirNameStr + "/in");
+
+ ofstream inFile(inFileName.c_str());
+
+ for (size_t i = 1; i < inputSentence.GetSize() - 1; ++i) {
+ inFile << inputSentence.GetWord(i);
}
+ inFile << endl;
+ inFile.close();
+
+ long translationId = inputSentence.GetTranslationId();
+ string ptFileName = m_FuzzyMatchWrapper->Extract(translationId, dirNameStr);
+
+ // populate with rules for this sentence
+ PhraseDictionaryNodeMemory &rootNode = m_collection[translationId];
+ FormatType format = MosesFormat;
- void PhraseDictionaryFuzzyMatch::InitializeForInput(InputType const& inputSentence)
- {
- char dirName[] = "/tmp/moses.XXXXXX";
- char *temp = mkdtemp(dirName);
- CHECK(temp);
- string dirNameStr(dirName);
-
- string inFileName(dirNameStr + "/in");
-
- ofstream inFile(inFileName.c_str());
-
- for (size_t i = 1; i < inputSentence.GetSize() - 1; ++i)
- {
- inFile << inputSentence.GetWord(i);
+ // data from file
+ InputFileStream inStream(ptFileName);
+
+ // copied from class LoaderStandard
+ PrintUserTime("Start loading fuzzy-match phrase model");
+
+ const StaticData &staticData = StaticData::Instance();
+ const std::string& factorDelimiter = staticData.GetFactorDelimiter();
+
+
+ string lineOrig;
+ size_t count = 0;
+
+ while(getline(inStream, lineOrig)) {
+ const string *line;
+ if (format == HieroFormat) { // reformat line
+ assert(false);
+ //line = ReformatHieroRule(lineOrig);
+ } else {
+ // do nothing to format of line
+ line = &lineOrig;
}
- inFile << endl;
- inFile.close();
-
- long translationId = inputSentence.GetTranslationId();
- string ptFileName = m_FuzzyMatchWrapper->Extract(translationId, dirNameStr);
-
- // populate with rules for this sentence
- PhraseDictionaryNodeMemory &rootNode = m_collection[translationId];
- FormatType format = MosesFormat;
-
- // data from file
- InputFileStream inStream(ptFileName);
-
- // copied from class LoaderStandard
- PrintUserTime("Start loading fuzzy-match phrase model");
-
- const StaticData &staticData = StaticData::Instance();
- const std::string& factorDelimiter = staticData.GetFactorDelimiter();
-
-
- string lineOrig;
- size_t count = 0;
-
- while(getline(inStream, lineOrig)) {
- const string *line;
- if (format == HieroFormat) { // reformat line
- assert(false);
- //line = ReformatHieroRule(lineOrig);
- }
- else
- { // do nothing to format of line
- line = &lineOrig;
- }
-
- vector<string> tokens;
- vector<float> scoreVector;
-
- TokenizeMultiCharSeparator(tokens, *line , "|||" );
-
- if (tokens.size() != 4 && tokens.size() != 5) {
- stringstream strme;
- strme << "Syntax error at " << ptFileName << ":" << count;
- UserMessage::Add(strme.str());
- abort();
- }
-
- const string &sourcePhraseString = tokens[0]
- , &targetPhraseString = tokens[1]
- , &scoreString = tokens[2]
- , &alignString = tokens[3];
-
- bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
- if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
- TRACE_ERR( ptFileName << ":" << count << ": pt entry contains empty target, skipping\n");
- continue;
- }
-
- Tokenize<float>(scoreVector, scoreString);
- const size_t numScoreComponents = GetNumScoreComponents();
- if (scoreVector.size() != numScoreComponents) {
- stringstream strme;
- strme << "Size of scoreVector != number (" << scoreVector.size() << "!="
- << numScoreComponents << ") of score components on line " << count;
- UserMessage::Add(strme.str());
- abort();
- }
- CHECK(scoreVector.size() == numScoreComponents);
-
- // parse source & find pt node
-
- // constituent labels
- Word *sourceLHS;
- Word *targetLHS;
-
- // source
- Phrase sourcePhrase( 0);
- sourcePhrase.CreateFromString(Input, *m_input, sourcePhraseString, factorDelimiter, &sourceLHS);
-
- // create target phrase obj
- TargetPhrase *targetPhrase = new TargetPhrase();
- targetPhrase->CreateFromString(Output, *m_output, targetPhraseString, factorDelimiter, &targetLHS);
-
- // rest of target phrase
- targetPhrase->SetAlignmentInfo(alignString);
- targetPhrase->SetTargetLHS(targetLHS);
- //targetPhrase->SetDebugOutput(string("New Format pt ") + line);
-
- // component score, for n-best output
- std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore);
- std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);
-
- targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
- targetPhrase->Evaluate(sourcePhrase);
-
- TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(rootNode, sourcePhrase, *targetPhrase, sourceLHS);
- phraseColl.Add(targetPhrase);
-
- count++;
-
- if (format == HieroFormat) { // reformat line
- delete line;
- }
- else
- { // do nothing
- }
-
+
+ vector<string> tokens;
+ vector<float> scoreVector;
+
+ TokenizeMultiCharSeparator(tokens, *line , "|||" );
+
+ if (tokens.size() != 4 && tokens.size() != 5) {
+ stringstream strme;
+ strme << "Syntax error at " << ptFileName << ":" << count;
+ UserMessage::Add(strme.str());
+ abort();
}
-
- // sort and prune each target phrase collection
- SortAndPrune(rootNode);
-
- //removedirectoryrecursively(dirName);
- }
-
- TargetPhraseCollection &PhraseDictionaryFuzzyMatch::GetOrCreateTargetPhraseCollection(PhraseDictionaryNodeMemory &rootNode
- , const Phrase &source
- , const TargetPhrase &target
- , const Word *sourceLHS)
- {
- PhraseDictionaryNodeMemory &currNode = GetOrCreateNode(rootNode, source, target, sourceLHS);
- return currNode.GetOrCreateTargetPhraseCollection();
- }
- PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetOrCreateNode(PhraseDictionaryNodeMemory &rootNode
- , const Phrase &source
- , const TargetPhrase &target
- , const Word *sourceLHS)
- {
- cerr << source << endl << target << endl;
- const size_t size = source.GetSize();
-
- const AlignmentInfo &alignmentInfo = target.GetAlignNonTerm();
- AlignmentInfo::const_iterator iterAlign = alignmentInfo.begin();
-
- PhraseDictionaryNodeMemory *currNode = &rootNode;
- for (size_t pos = 0 ; pos < size ; ++pos) {
- const Word& word = source.GetWord(pos);
-
- if (word.IsNonTerminal()) {
- // indexed by source label 1st
- const Word &sourceNonTerm = word;
-
- CHECK(iterAlign != alignmentInfo.end());
- CHECK(iterAlign->first == pos);
- size_t targetNonTermInd = iterAlign->second;
- ++iterAlign;
- const Word &targetNonTerm = target.GetWord(targetNonTermInd);
-
- currNode = currNode->GetOrCreateChild(sourceNonTerm, targetNonTerm);
- } else {
- currNode = currNode->GetOrCreateChild(word);
- }
-
- CHECK(currNode != NULL);
+ const string &sourcePhraseString = tokens[0]
+ , &targetPhraseString = tokens[1]
+ , &scoreString = tokens[2]
+ , &alignString = tokens[3];
+
+ bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
+ if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
+ TRACE_ERR( ptFileName << ":" << count << ": pt entry contains empty target, skipping\n");
+ continue;
}
-
- // finally, the source LHS
- //currNode = currNode->GetOrCreateChild(sourceLHS);
- //CHECK(currNode != NULL);
-
-
- return *currNode;
- }
- void PhraseDictionaryFuzzyMatch::SortAndPrune(PhraseDictionaryNodeMemory &rootNode)
- {
- if (GetTableLimit())
- {
- rootNode.Sort(GetTableLimit());
+ Tokenize<float>(scoreVector, scoreString);
+ const size_t numScoreComponents = GetNumScoreComponents();
+ if (scoreVector.size() != numScoreComponents) {
+ stringstream strme;
+ strme << "Size of scoreVector != number (" << scoreVector.size() << "!="
+ << numScoreComponents << ") of score components on line " << count;
+ UserMessage::Add(strme.str());
+ abort();
}
+ CHECK(scoreVector.size() == numScoreComponents);
+
+ // parse source & find pt node
+
+ // constituent labels
+ Word *sourceLHS;
+ Word *targetLHS;
+
+ // source
+ Phrase sourcePhrase( 0);
+ sourcePhrase.CreateFromString(Input, *m_input, sourcePhraseString, factorDelimiter, &sourceLHS);
+
+ // create target phrase obj
+ TargetPhrase *targetPhrase = new TargetPhrase();
+ targetPhrase->CreateFromString(Output, *m_output, targetPhraseString, factorDelimiter, &targetLHS);
+
+ // rest of target phrase
+ targetPhrase->SetAlignmentInfo(alignString);
+ targetPhrase->SetTargetLHS(targetLHS);
+ //targetPhrase->SetDebugOutput(string("New Format pt ") + line);
+
+ // component score, for n-best output
+ std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore);
+ std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);
+
+ targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
+ targetPhrase->Evaluate(sourcePhrase);
+
+ TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(rootNode, sourcePhrase, *targetPhrase, sourceLHS);
+ phraseColl.Add(targetPhrase);
+
+ count++;
+
+ if (format == HieroFormat) { // reformat line
+ delete line;
+ } else {
+ // do nothing
+ }
+
}
-
- void PhraseDictionaryFuzzyMatch::CleanUpAfterSentenceProcessing(const InputType &source)
- {
- m_collection.erase(source.GetTranslationId());
+
+ // sort and prune each target phrase collection
+ SortAndPrune(rootNode);
+
+ //removedirectoryrecursively(dirName);
+}
+
+TargetPhraseCollection &PhraseDictionaryFuzzyMatch::GetOrCreateTargetPhraseCollection(PhraseDictionaryNodeMemory &rootNode
+ , const Phrase &source
+ , const TargetPhrase &target
+ , const Word *sourceLHS)
+{
+ PhraseDictionaryNodeMemory &currNode = GetOrCreateNode(rootNode, source, target, sourceLHS);
+ return currNode.GetOrCreateTargetPhraseCollection();
+}
+
+PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetOrCreateNode(PhraseDictionaryNodeMemory &rootNode
+ , const Phrase &source
+ , const TargetPhrase &target
+ , const Word *sourceLHS)
+{
+ cerr << source << endl << target << endl;
+ const size_t size = source.GetSize();
+
+ const AlignmentInfo &alignmentInfo = target.GetAlignNonTerm();
+ AlignmentInfo::const_iterator iterAlign = alignmentInfo.begin();
+
+ PhraseDictionaryNodeMemory *currNode = &rootNode;
+ for (size_t pos = 0 ; pos < size ; ++pos) {
+ const Word& word = source.GetWord(pos);
+
+ if (word.IsNonTerminal()) {
+ // indexed by source label 1st
+ const Word &sourceNonTerm = word;
+
+ CHECK(iterAlign != alignmentInfo.end());
+ CHECK(iterAlign->first == pos);
+ size_t targetNonTermInd = iterAlign->second;
+ ++iterAlign;
+ const Word &targetNonTerm = target.GetWord(targetNonTermInd);
+
+ currNode = currNode->GetOrCreateChild(sourceNonTerm, targetNonTerm);
+ } else {
+ currNode = currNode->GetOrCreateChild(word);
+ }
+
+ CHECK(currNode != NULL);
}
- const PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetRootNode(const InputType &source) const
- {
- long transId = source.GetTranslationId();
- std::map<long, PhraseDictionaryNodeMemory>::const_iterator iter = m_collection.find(transId);
- CHECK(iter != m_collection.end());
- return iter->second;
+ // finally, the source LHS
+ //currNode = currNode->GetOrCreateChild(sourceLHS);
+ //CHECK(currNode != NULL);
+
+
+ return *currNode;
+}
+
+void PhraseDictionaryFuzzyMatch::SortAndPrune(PhraseDictionaryNodeMemory &rootNode)
+{
+ if (GetTableLimit()) {
+ rootNode.Sort(GetTableLimit());
}
- PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetRootNode(const InputType &source)
- {
- long transId = source.GetTranslationId();
- std::map<long, PhraseDictionaryNodeMemory>::iterator iter = m_collection.find(transId);
- CHECK(iter != m_collection.end());
- return iter->second;
+}
+
+void PhraseDictionaryFuzzyMatch::CleanUpAfterSentenceProcessing(const InputType &source)
+{
+ m_collection.erase(source.GetTranslationId());
+}
+
+const PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetRootNode(const InputType &source) const
+{
+ long transId = source.GetTranslationId();
+ std::map<long, PhraseDictionaryNodeMemory>::const_iterator iter = m_collection.find(transId);
+ CHECK(iter != m_collection.end());
+ return iter->second;
+}
+PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetRootNode(const InputType &source)
+{
+ long transId = source.GetTranslationId();
+ std::map<long, PhraseDictionaryNodeMemory>::iterator iter = m_collection.find(transId);
+ CHECK(iter != m_collection.end());
+ return iter->second;
+}
+
+TO_STRING_BODY(PhraseDictionaryFuzzyMatch);
+
+// friend
+ostream& operator<<(ostream& out, const PhraseDictionaryFuzzyMatch& phraseDict)
+{
+ typedef PhraseDictionaryNodeMemory::TerminalMap TermMap;
+ typedef PhraseDictionaryNodeMemory::NonTerminalMap NonTermMap;
+
+ /*
+ const PhraseDictionaryNodeMemory &coll = phraseDict.m_collection;
+ for (NonTermMap::const_iterator p = coll.m_nonTermMap.begin(); p != coll.m_nonTermMap.end(); ++p) {
+ const Word &sourceNonTerm = p->first.first;
+ out << sourceNonTerm;
}
-
- TO_STRING_BODY(PhraseDictionaryFuzzyMatch);
-
- // friend
- ostream& operator<<(ostream& out, const PhraseDictionaryFuzzyMatch& phraseDict)
- {
- typedef PhraseDictionaryNodeMemory::TerminalMap TermMap;
- typedef PhraseDictionaryNodeMemory::NonTerminalMap NonTermMap;
-
- /*
- const PhraseDictionaryNodeMemory &coll = phraseDict.m_collection;
- for (NonTermMap::const_iterator p = coll.m_nonTermMap.begin(); p != coll.m_nonTermMap.end(); ++p) {
- const Word &sourceNonTerm = p->first.first;
- out << sourceNonTerm;
- }
- for (TermMap::const_iterator p = coll.m_sourceTermMap.begin(); p != coll.m_sourceTermMap.end(); ++p) {
- const Word &sourceTerm = p->first;
- out << sourceTerm;
- }
- */
-
- return out;
+ for (TermMap::const_iterator p = coll.m_sourceTermMap.begin(); p != coll.m_sourceTermMap.end(); ++p) {
+ const Word &sourceTerm = p->first;
+ out << sourceTerm;
}
-
+ */
+
+ return out;
+}
+
}
diff --git a/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.h b/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.h
index 8e4d20423..94966b175 100644
--- a/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.h
+++ b/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2011 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -29,61 +29,60 @@
namespace Moses
{
- class PhraseDictionaryNodeMemory;
-
- /** Implementation of a SCFG rule table in a trie. Looking up a rule of
- * length n symbols requires n look-ups to find the TargetPhraseCollection.
- */
- class PhraseDictionaryFuzzyMatch : public PhraseDictionary
- {
- friend std::ostream& operator<<(std::ostream&, const PhraseDictionaryFuzzyMatch&);
- friend class RuleTableLoader;
-
- public:
- PhraseDictionaryFuzzyMatch(const std::string &line);
- bool Load(const std::vector<FactorType> &input
- , const std::vector<FactorType> &output
- , const std::string &initStr
- , size_t tableLimit);
-
- const PhraseDictionaryNodeMemory &GetRootNode(const InputType &source) const;
-
- ChartRuleLookupManager *CreateRuleLookupManager(
- const InputType &,
- const ChartCellCollectionBase &);
- void InitializeForInput(InputType const& inputSentence);
- void CleanUpAfterSentenceProcessing(const InputType& source);
-
- virtual const TargetPhraseCollection *GetTargetPhraseCollection(const Phrase& src) const
- {
- assert(false);
- return NULL;
- }
-
- TO_STRING();
-
- protected:
- TargetPhraseCollection &GetOrCreateTargetPhraseCollection(PhraseDictionaryNodeMemory &rootNode
- , const Phrase &source
- , const TargetPhrase &target
- , const Word *sourceLHS);
-
- PhraseDictionaryNodeMemory &GetOrCreateNode(PhraseDictionaryNodeMemory &rootNode
- , const Phrase &source
- , const TargetPhrase &target
- , const Word *sourceLHS);
-
- void SortAndPrune(PhraseDictionaryNodeMemory &rootNode);
- PhraseDictionaryNodeMemory &GetRootNode(const InputType &source);
-
- std::map<long, PhraseDictionaryNodeMemory> m_collection;
- std::vector<std::string> m_config;
-
- const std::vector<FactorType> *m_input, *m_output;
- const std::vector<float> *m_weight;
-
- tmmt::FuzzyMatchWrapper *m_FuzzyMatchWrapper;
-
- };
-
+class PhraseDictionaryNodeMemory;
+
+/** Implementation of a SCFG rule table in a trie. Looking up a rule of
+ * length n symbols requires n look-ups to find the TargetPhraseCollection.
+ */
+class PhraseDictionaryFuzzyMatch : public PhraseDictionary
+{
+ friend std::ostream& operator<<(std::ostream&, const PhraseDictionaryFuzzyMatch&);
+ friend class RuleTableLoader;
+
+public:
+ PhraseDictionaryFuzzyMatch(const std::string &line);
+ bool Load(const std::vector<FactorType> &input
+ , const std::vector<FactorType> &output
+ , const std::string &initStr
+ , size_t tableLimit);
+
+ const PhraseDictionaryNodeMemory &GetRootNode(const InputType &source) const;
+
+ ChartRuleLookupManager *CreateRuleLookupManager(
+ const InputType &,
+ const ChartCellCollectionBase &);
+ void InitializeForInput(InputType const& inputSentence);
+ void CleanUpAfterSentenceProcessing(const InputType& source);
+
+ virtual const TargetPhraseCollection *GetTargetPhraseCollection(const Phrase& src) const {
+ assert(false);
+ return NULL;
+ }
+
+ TO_STRING();
+
+protected:
+ TargetPhraseCollection &GetOrCreateTargetPhraseCollection(PhraseDictionaryNodeMemory &rootNode
+ , const Phrase &source
+ , const TargetPhrase &target
+ , const Word *sourceLHS);
+
+ PhraseDictionaryNodeMemory &GetOrCreateNode(PhraseDictionaryNodeMemory &rootNode
+ , const Phrase &source
+ , const TargetPhrase &target
+ , const Word *sourceLHS);
+
+ void SortAndPrune(PhraseDictionaryNodeMemory &rootNode);
+ PhraseDictionaryNodeMemory &GetRootNode(const InputType &source);
+
+ std::map<long, PhraseDictionaryNodeMemory> m_collection;
+ std::vector<std::string> m_config;
+
+ const std::vector<FactorType> *m_input, *m_output;
+ const std::vector<float> *m_weight;
+
+ tmmt::FuzzyMatchWrapper *m_FuzzyMatchWrapper;
+
+};
+
} // namespace Moses
diff --git a/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp b/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp
index cd509f544..38cf247af 100644
--- a/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp
+++ b/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp
@@ -1,4 +1,4 @@
- // vim:tabstop=2
+// vim:tabstop=2
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2010 Hieu Hoang
diff --git a/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.h b/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.h
index 9b186def9..874478cdc 100644
--- a/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.h
+++ b/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.h
@@ -35,7 +35,7 @@ class TargetPhraseCollection;
class DottedRuleStackOnDisk;
/** Implementation of on-disk phrase table for hierarchical/syntax model.
- */
+ */
class PhraseDictionaryOnDisk : public PhraseDictionary
{
typedef PhraseDictionary MyBase;
diff --git a/moses/TranslationModel/RuleTable/Trie.cpp b/moses/TranslationModel/RuleTable/Trie.cpp
index c3590074d..950271d29 100644
--- a/moses/TranslationModel/RuleTable/Trie.cpp
+++ b/moses/TranslationModel/RuleTable/Trie.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -38,7 +38,7 @@ bool RuleTableTrie::InitDictionary()
{
std::auto_ptr<Moses::RuleTableLoader> loader =
- Moses::RuleTableLoaderFactory::Create(m_filePath);
+ Moses::RuleTableLoaderFactory::Create(m_filePath);
if (!loader.get()) {
return false;
}
diff --git a/moses/TranslationModel/RuleTable/Trie.h b/moses/TranslationModel/RuleTable/Trie.h
index 822ef8b92..c2f757ab8 100644
--- a/moses/TranslationModel/RuleTable/Trie.h
+++ b/moses/TranslationModel/RuleTable/Trie.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -39,28 +39,27 @@ class Word;
*/
class RuleTableTrie : public PhraseDictionary
{
- public:
+public:
RuleTableTrie(const std::string &description, const std::string &line)
- : PhraseDictionary(description, line)
- {}
+ : PhraseDictionary(description, line)
+ {}
virtual ~RuleTableTrie();
bool InitDictionary();
// Required by PhraseDictionary.
- virtual const TargetPhraseCollection *GetTargetPhraseCollection(const Phrase &) const
- {
+ virtual const TargetPhraseCollection *GetTargetPhraseCollection(const Phrase &) const {
CHECK(false);
return NULL;
}
- private:
+private:
friend class RuleTableLoader;
virtual TargetPhraseCollection &GetOrCreateTargetPhraseCollection(
- const Phrase &source, const TargetPhrase &target,
- const Word *sourceLHS) = 0;
+ const Phrase &source, const TargetPhrase &target,
+ const Word *sourceLHS) = 0;
virtual void SortAndPrune() = 0;
diff --git a/moses/TranslationModel/RuleTable/UTrie.cpp b/moses/TranslationModel/RuleTable/UTrie.cpp
index bcfc0d538..17f457f22 100644
--- a/moses/TranslationModel/RuleTable/UTrie.cpp
+++ b/moses/TranslationModel/RuleTable/UTrie.cpp
@@ -39,15 +39,15 @@ namespace Moses
{
TargetPhraseCollection &RuleTableUTrie::GetOrCreateTargetPhraseCollection(
- const Phrase &source, const TargetPhrase &target, const Word *sourceLHS)
+ const Phrase &source, const TargetPhrase &target, const Word *sourceLHS)
{
UTrieNode &currNode = GetOrCreateNode(source, target, sourceLHS);
return currNode.GetOrCreateTargetPhraseCollection(target);
}
UTrieNode &RuleTableUTrie::GetOrCreateNode(const Phrase &source,
- const TargetPhrase &target,
- const Word */*sourceLHS*/)
+ const TargetPhrase &target,
+ const Word */*sourceLHS*/)
{
const size_t size = source.GetSize();
diff --git a/moses/TranslationModel/RuleTable/UTrie.h b/moses/TranslationModel/RuleTable/UTrie.h
index d31e22cc7..a8f218158 100644
--- a/moses/TranslationModel/RuleTable/UTrie.h
+++ b/moses/TranslationModel/RuleTable/UTrie.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -43,21 +43,23 @@ class Word;
*/
class RuleTableUTrie : public RuleTableTrie
{
- public:
+public:
RuleTableUTrie(const std::string &line)
- : RuleTableTrie("RuleTableUTrie", line)
+ : RuleTableTrie("RuleTableUTrie", line)
{}
- const UTrieNode &GetRootNode() const { return m_root; }
+ const UTrieNode &GetRootNode() const {
+ return m_root;
+ }
ChartRuleLookupManager *CreateRuleLookupManager(const InputType &,
- const ChartCellCollectionBase &);
+ const ChartCellCollectionBase &);
- private:
+private:
const TargetPhraseCollection *GetTargetPhraseCollection(const Phrase &) const;
TargetPhraseCollection &GetOrCreateTargetPhraseCollection(
- const Phrase &source, const TargetPhrase &target, const Word *sourceLHS);
+ const Phrase &source, const TargetPhrase &target, const Word *sourceLHS);
UTrieNode &GetOrCreateNode(const Phrase &source, const TargetPhrase &target,
const Word *sourceLHS);
diff --git a/moses/TranslationModel/RuleTable/UTrieNode.cpp b/moses/TranslationModel/RuleTable/UTrieNode.cpp
index d2275422e..725f02c97 100644
--- a/moses/TranslationModel/RuleTable/UTrieNode.cpp
+++ b/moses/TranslationModel/RuleTable/UTrieNode.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -90,7 +90,7 @@ UTrieNode *UTrieNode::GetOrCreateNonTerminalChild(const Word &targetNonTerm)
}
TargetPhraseCollection &UTrieNode::GetOrCreateTargetPhraseCollection(
- const TargetPhrase &target)
+ const TargetPhrase &target)
{
const AlignmentInfo &alignmentInfo = target.GetAlignNonTerm();
const size_t rank = alignmentInfo.GetSize();
diff --git a/moses/TranslationModel/RuleTable/UTrieNode.h b/moses/TranslationModel/RuleTable/UTrieNode.h
index b3d82cddc..436bcbea1 100644
--- a/moses/TranslationModel/RuleTable/UTrieNode.h
+++ b/moses/TranslationModel/RuleTable/UTrieNode.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -42,49 +42,62 @@ class RuleTableUTrie;
//! @todo ask phil williams - whats the diff between this and phrasedictionaryNode
class UTrieNode
{
- public:
+public:
typedef std::vector<std::vector<Word> > LabelTable;
#if defined(BOOST_VERSION) && (BOOST_VERSION >= 104200)
typedef boost::unordered_map<Word,
- UTrieNode,
- TerminalHasher,
- TerminalEqualityPred> TerminalMap;
+ UTrieNode,
+ TerminalHasher,
+ TerminalEqualityPred> TerminalMap;
typedef boost::unordered_map<std::vector<int>,
- TargetPhraseCollection> LabelMap;
+ TargetPhraseCollection> LabelMap;
#else
typedef std::map<Word, UTrieNode> TerminalMap;
typedef std::map<std::vector<int>, TargetPhraseCollection> LabelMap;
#endif
- ~UTrieNode() { delete m_gapNode; }
+ ~UTrieNode() {
+ delete m_gapNode;
+ }
- const LabelTable &GetLabelTable() const { return m_labelTable; }
- const LabelMap &GetLabelMap() const { return m_labelMap; }
- const TerminalMap &GetTerminalMap() const { return m_terminalMap; }
+ const LabelTable &GetLabelTable() const {
+ return m_labelTable;
+ }
+ const LabelMap &GetLabelMap() const {
+ return m_labelMap;
+ }
+ const TerminalMap &GetTerminalMap() const {
+ return m_terminalMap;
+ }
- const UTrieNode *GetNonTerminalChild() const { return m_gapNode; }
+ const UTrieNode *GetNonTerminalChild() const {
+ return m_gapNode;
+ }
UTrieNode *GetOrCreateTerminalChild(const Word &sourceTerm);
UTrieNode *GetOrCreateNonTerminalChild(const Word &targetNonTerm);
TargetPhraseCollection &GetOrCreateTargetPhraseCollection(
- const TargetPhrase &);
+ const TargetPhrase &);
- bool IsLeaf() const { return m_terminalMap.empty() && m_gapNode == NULL; }
+ bool IsLeaf() const {
+ return m_terminalMap.empty() && m_gapNode == NULL;
+ }
- bool HasRules() const { return !m_labelMap.empty(); }
+ bool HasRules() const {
+ return !m_labelMap.empty();
+ }
void Prune(size_t tableLimit);
void Sort(size_t tableLimit);
- private:
+private:
friend class RuleTableUTrie;
UTrieNode() : m_gapNode(NULL) {}
- int InsertLabel(int i, const Word &w)
- {
+ int InsertLabel(int i, const Word &w) {
std::vector<Word> &inner = m_labelTable[i];
for (size_t j = 0; j < inner.size(); ++j) {
if (inner[j] == w) {
diff --git a/moses/TranslationModel/Scope3Parser/ApplicableRuleTrie.cpp b/moses/TranslationModel/Scope3Parser/ApplicableRuleTrie.cpp
index ec5c1d8f1..b635dc050 100644
--- a/moses/TranslationModel/Scope3Parser/ApplicableRuleTrie.cpp
+++ b/moses/TranslationModel/Scope3Parser/ApplicableRuleTrie.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -39,7 +39,7 @@ void ApplicableRuleTrie::Extend(const UTrieNode &root, int minPos,
size_t index = *r;
if (index == (size_t)minPos || (followsGap && index > (size_t)minPos) || minPos == -1) {
ApplicableRuleTrie *subTrie = new ApplicableRuleTrie(index, index,
- child);
+ child);
subTrie->Extend(child, index+1, sentMap, false);
m_children.push_back(subTrie);
}
diff --git a/moses/TranslationModel/Scope3Parser/ApplicableRuleTrie.h b/moses/TranslationModel/Scope3Parser/ApplicableRuleTrie.h
index 35243adde..9d2f2cda9 100644
--- a/moses/TranslationModel/Scope3Parser/ApplicableRuleTrie.h
+++ b/moses/TranslationModel/Scope3Parser/ApplicableRuleTrie.h
@@ -33,14 +33,13 @@ struct VarSpanNode;
/** @todo what is this?
*/
-struct ApplicableRuleTrie
-{
- public:
+struct ApplicableRuleTrie {
+public:
ApplicableRuleTrie(int start, int end, const UTrieNode &node)
- : m_start(start)
- , m_end(end)
- , m_node(&node)
- , m_vstNode(NULL) {}
+ : m_start(start)
+ , m_end(end)
+ , m_node(&node)
+ , m_vstNode(NULL) {}
~ApplicableRuleTrie() {
RemoveAllInColl(m_children);
diff --git a/moses/TranslationModel/Scope3Parser/IntermediateVarSpanNode.h b/moses/TranslationModel/Scope3Parser/IntermediateVarSpanNode.h
index 353fabf22..499085127 100644
--- a/moses/TranslationModel/Scope3Parser/IntermediateVarSpanNode.h
+++ b/moses/TranslationModel/Scope3Parser/IntermediateVarSpanNode.h
@@ -26,9 +26,8 @@ namespace Moses
/** @todo what is this?
*/
-struct IntermediateVarSpanNode
-{
- public:
+struct IntermediateVarSpanNode {
+public:
typedef std::pair<int, int> Range;
IntermediateVarSpanNode()
@@ -41,8 +40,12 @@ struct IntermediateVarSpanNode
, m_end(end)
, m_numSplitPoints(0) {}
- bool isOpen() { return m_end.second == -1; }
- bool isClosed() { return !isOpen(); }
+ bool isOpen() {
+ return m_end.second == -1;
+ }
+ bool isClosed() {
+ return !isOpen();
+ }
Range m_start;
Range m_end;
diff --git a/moses/TranslationModel/Scope3Parser/Parser.cpp b/moses/TranslationModel/Scope3Parser/Parser.cpp
index bfcacb1ed..81e156b3d 100644
--- a/moses/TranslationModel/Scope3Parser/Parser.cpp
+++ b/moses/TranslationModel/Scope3Parser/Parser.cpp
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -38,8 +38,8 @@ namespace Moses
{
void Scope3Parser::GetChartRuleCollection(
- const WordsRange &range,
- ChartParserCallback &outColl)
+ const WordsRange &range,
+ ChartParserCallback &outColl)
{
const size_t start = range.GetStartPos();
const size_t end = range.GetEndPos();
@@ -122,7 +122,7 @@ void Scope3Parser::InitRuleApplicationVector()
}
void Scope3Parser::FillSentenceMap(
- const Sentence &sent, SentenceMap &sentMap)
+ const Sentence &sent, SentenceMap &sentMap)
{
for (size_t i = 0; i < sent.GetSize(); ++i) {
sentMap[sent.GetWord(i)].push_back(i);
@@ -130,10 +130,10 @@ void Scope3Parser::FillSentenceMap(
}
void Scope3Parser::AddRulesToCells(
- const ApplicableRuleTrie &node,
- std::pair<int, int> start,
- int maxPos,
- int depth)
+ const ApplicableRuleTrie &node,
+ std::pair<int, int> start,
+ int maxPos,
+ int depth)
{
if (depth > 0) {
// Determine the start range for this path if not already known.
@@ -183,7 +183,7 @@ void Scope3Parser::AddRulesToCells(
break;
}
m_ruleApplications[i][span].push_back(std::make_pair(node.m_node,
- node.m_vstNode));
+ node.m_vstNode));
}
}
}
diff --git a/moses/TranslationModel/Scope3Parser/Parser.h b/moses/TranslationModel/Scope3Parser/Parser.h
index 0b5e63d95..2a46de9a8 100644
--- a/moses/TranslationModel/Scope3Parser/Parser.h
+++ b/moses/TranslationModel/Scope3Parser/Parser.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -46,15 +46,14 @@ class WordsRange;
*/
class Scope3Parser : public ChartRuleLookupManager
{
- public:
+public:
Scope3Parser(const InputType &sentence,
const ChartCellCollectionBase &cellColl,
const RuleTableUTrie &ruleTable,
size_t maxChartSpan)
- : ChartRuleLookupManager(sentence, cellColl)
- , m_ruleTable(ruleTable)
- , m_maxChartSpan(maxChartSpan)
- {
+ : ChartRuleLookupManager(sentence, cellColl)
+ , m_ruleTable(ruleTable)
+ , m_maxChartSpan(maxChartSpan) {
Init();
}
@@ -62,23 +61,21 @@ class Scope3Parser : public ChartRuleLookupManager
const WordsRange &range,
ChartParserCallback &outColl);
- private:
+private:
// Define a callback type for use by StackLatticeSearcher.
- struct MatchCallback
- {
- public:
- MatchCallback(const WordsRange &range,
- ChartParserCallback &out)
- : m_range(range)
- , m_out(out)
- , m_tpc(NULL) {}
- void operator()(const StackVec &stackVec)
- {
- m_out.Add(*m_tpc, stackVec, m_range);
- }
- const WordsRange &m_range;
- ChartParserCallback &m_out;
- const TargetPhraseCollection *m_tpc;
+ struct MatchCallback {
+ public:
+ MatchCallback(const WordsRange &range,
+ ChartParserCallback &out)
+ : m_range(range)
+ , m_out(out)
+ , m_tpc(NULL) {}
+ void operator()(const StackVec &stackVec) {
+ m_out.Add(*m_tpc, stackVec, m_range);
+ }
+ const WordsRange &m_range;
+ ChartParserCallback &m_out;
+ const TargetPhraseCollection *m_tpc;
};
void Init();
@@ -89,7 +86,7 @@ class Scope3Parser : public ChartRuleLookupManager
const RuleTableUTrie &m_ruleTable;
std::vector<std::vector<std::vector<
- std::pair<const UTrieNode *, const VarSpanNode *> > > > m_ruleApplications;
+ std::pair<const UTrieNode *, const VarSpanNode *> > > > m_ruleApplications;
std::auto_ptr<VarSpanNode> m_varSpanTrie;
StackVec m_emptyStackVec;
const size_t m_maxChartSpan;
diff --git a/moses/TranslationModel/Scope3Parser/SentenceMap.h b/moses/TranslationModel/Scope3Parser/SentenceMap.h
index 9bc46db93..a7a1fdad9 100644
--- a/moses/TranslationModel/Scope3Parser/SentenceMap.h
+++ b/moses/TranslationModel/Scope3Parser/SentenceMap.h
@@ -29,7 +29,7 @@
namespace Moses
{
typedef boost::unordered_map<Word,
- std::vector<size_t>,
- TerminalHasher,
- TerminalEqualityPred> SentenceMap;
+ std::vector<size_t>,
+ TerminalHasher,
+ TerminalEqualityPred> SentenceMap;
}
diff --git a/moses/TranslationModel/Scope3Parser/StackLatticeBuilder.cpp b/moses/TranslationModel/Scope3Parser/StackLatticeBuilder.cpp
index bb553a116..26e4e6aca 100644
--- a/moses/TranslationModel/Scope3Parser/StackLatticeBuilder.cpp
+++ b/moses/TranslationModel/Scope3Parser/StackLatticeBuilder.cpp
@@ -28,14 +28,14 @@ namespace Moses
{
void StackLatticeBuilder::Build(
- int start,
- int end,
- const UTrieNode &ruleNode,
- const VarSpanNode &varSpanNode,
- const std::vector<VarSpanNode::NonTermRange> &ranges,
- const ChartRuleLookupManager &manager,
- StackLattice &lattice,
- std::vector<std::vector<bool> > &checkTable)
+ int start,
+ int end,
+ const UTrieNode &ruleNode,
+ const VarSpanNode &varSpanNode,
+ const std::vector<VarSpanNode::NonTermRange> &ranges,
+ const ChartRuleLookupManager &manager,
+ StackLattice &lattice,
+ std::vector<std::vector<bool> > &checkTable)
{
// Extend the lattice if necessary. Do not shrink it.
const size_t span = end - start + 1;
diff --git a/moses/TranslationModel/Scope3Parser/StackLatticeBuilder.h b/moses/TranslationModel/Scope3Parser/StackLatticeBuilder.h
index 7091e8f18..551655e30 100644
--- a/moses/TranslationModel/Scope3Parser/StackLatticeBuilder.h
+++ b/moses/TranslationModel/Scope3Parser/StackLatticeBuilder.h
@@ -32,7 +32,7 @@ class ChartCellCollection;
*/
class StackLatticeBuilder
{
- public:
+public:
StackLatticeBuilder() {}
void Build(int, int, const UTrieNode &, const VarSpanNode &,
diff --git a/moses/TranslationModel/Scope3Parser/StackLatticeSearcher.h b/moses/TranslationModel/Scope3Parser/StackLatticeSearcher.h
index 749a3a2c1..4deac31f8 100644
--- a/moses/TranslationModel/Scope3Parser/StackLatticeSearcher.h
+++ b/moses/TranslationModel/Scope3Parser/StackLatticeSearcher.h
@@ -1,17 +1,17 @@
/***********************************************************************
Moses - statistical machine translation system
Copyright (C) 2006-2012 University of Edinburgh
-
+
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
-
+
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
-
+
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@@ -33,22 +33,20 @@ class ChartHypothesisCollection;
template<typename MatchCallBackType>
class StackLatticeSearcher
{
- public:
+public:
StackLatticeSearcher(const StackLattice &lattice,
const std::vector<VarSpanNode::NonTermRange> &ranges)
- : m_lattice(lattice)
- , m_ranges(ranges) {}
+ : m_lattice(lattice)
+ , m_ranges(ranges) {}
- void Search(const std::vector<int> &labels, MatchCallBackType &callback)
- {
+ void Search(const std::vector<int> &labels, MatchCallBackType &callback) {
m_labels = &labels;
m_matchCB = &callback;
SearchInner(0, 0);
}
- private:
- void SearchInner(int start, size_t index)
- {
+private:
+ void SearchInner(int start, size_t index) {
assert(m_stackVec.size() == index);
const VarSpanNode::NonTermRange &range = m_ranges[index];
diff --git a/moses/TranslationModel/Scope3Parser/VarSpanNode.h b/moses/TranslationModel/Scope3Parser/VarSpanNode.h
index 52dc32382..0dda6a787 100644
--- a/moses/TranslationModel/Scope3Parser/VarSpanNode.h
+++ b/moses/TranslationModel/Scope3Parser/VarSpanNode.h
@@ -33,9 +33,8 @@ namespace Moses
/** @todo what is this?
*/
-struct VarSpanNode
-{
- public:
+struct VarSpanNode {
+public:
struct NonTermRange {
size_t s1;
size_t s2;
@@ -48,8 +47,7 @@ struct VarSpanNode
VarSpanNode() : m_parent(0), m_label(0), m_rank(0) {}
- VarSpanNode &Insert(const NodeVec &vec)
- {
+ VarSpanNode &Insert(const NodeVec &vec) {
if (vec.empty()) {
return *this;
}
@@ -59,8 +57,7 @@ struct VarSpanNode
// Given a span, determine the ranges of possible start and end offsets
// for each non-terminal.
void CalculateRanges(int start, int end,
- std::vector<NonTermRange> &ranges) const
- {
+ std::vector<NonTermRange> &ranges) const {
ranges.resize(m_rank);
const VarSpanNode *n = this;
size_t firstIndex = m_rank;
@@ -103,10 +100,9 @@ struct VarSpanNode
size_t m_rank;
MapType m_children;
- private:
+private:
VarSpanNode &Insert(NodeVec::const_iterator first,
- NodeVec::const_iterator last)
- {
+ NodeVec::const_iterator last) {
assert(first != last);
KeyType key;
@@ -117,7 +113,7 @@ struct VarSpanNode
key[4] = first->m_numSplitPoints;
std::pair<MapType::iterator, bool> result = m_children.insert(
- std::make_pair(key, VarSpanNode()));
+ std::make_pair(key, VarSpanNode()));
VarSpanNode &child = result.first->second;
if (result.second) {
child.m_parent = this;
diff --git a/moses/TranslationModel/Scope3Parser/VarSpanTrieBuilder.cpp b/moses/TranslationModel/Scope3Parser/VarSpanTrieBuilder.cpp
index 16b180ea5..35e66978b 100644
--- a/moses/TranslationModel/Scope3Parser/VarSpanTrieBuilder.cpp
+++ b/moses/TranslationModel/Scope3Parser/VarSpanTrieBuilder.cpp
@@ -30,7 +30,7 @@ namespace Moses
{
std::auto_ptr<VarSpanNode> VarSpanTrieBuilder::Build(
- ApplicableRuleTrie &root)
+ ApplicableRuleTrie &root)
{
std::auto_ptr<VarSpanNode> vstRoot(new VarSpanNode());
NodeVec vec;
diff --git a/moses/TranslationModel/Scope3Parser/VarSpanTrieBuilder.h b/moses/TranslationModel/Scope3Parser/VarSpanTrieBuilder.h
index 13c701b4f..2513a2878 100644
--- a/moses/TranslationModel/Scope3Parser/VarSpanTrieBuilder.h
+++ b/moses/TranslationModel/Scope3Parser/VarSpanTrieBuilder.h
@@ -34,13 +34,12 @@ struct VarSpanNode;
*/
class VarSpanTrieBuilder
{
- public:
+public:
std::auto_ptr<VarSpanNode> Build(ApplicableRuleTrie &);
- private:
+private:
typedef std::vector<IntermediateVarSpanNode> NodeVec;
- struct NodeVecState
- {
+ struct NodeVecState {
std::size_t m_size;
IntermediateVarSpanNode m_lastNode;
};
diff --git a/moses/TranslationModel/fuzzy-match/Alignments.cpp b/moses/TranslationModel/fuzzy-match/Alignments.cpp
index f15d82a5e..142aff251 100644
--- a/moses/TranslationModel/fuzzy-match/Alignments.cpp
+++ b/moses/TranslationModel/fuzzy-match/Alignments.cpp
@@ -8,12 +8,11 @@ using namespace std;
using namespace Moses;
Alignments::Alignments(const std::string &str, size_t sourceSize, size_t targetSize)
-:m_alignS2T(sourceSize)
-,m_alignT2S(targetSize)
+ :m_alignS2T(sourceSize)
+ ,m_alignT2S(targetSize)
{
vector<string> toks = Tokenize(str, " ");
- for (size_t i = 0; i < toks.size(); ++i)
- {
+ for (size_t i = 0; i < toks.size(); ++i) {
string &tok = toks[i];
vector<int> point = Tokenize<int>(tok, "-");
@@ -25,20 +24,18 @@ Alignments::Alignments(const std::string &str, size_t sourceSize, size_t targetS
std::map<int, int> &targets = m_alignS2T[ point[0] ];
iter = targets.find(point[1]);
if (iter == targets .end()) {
- targets[ point[1] ] = 0;
- }
- else {
- ++(iter->second);
+ targets[ point[1] ] = 0;
+ } else {
+ ++(iter->second);
}
// m_alignedToS
std::map<int, int> &sources = m_alignT2S[ point[1] ];
iter = sources.find(point[0]);
if (iter == targets .end()) {
- sources[ point[0] ] = 0;
- }
- else {
- ++(iter->second);
+ sources[ point[0] ] = 0;
+ } else {
+ ++(iter->second);
}
}
diff --git a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
index 065368ca7..a4264f6a4 100644
--- a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
+++ b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
@@ -17,10 +17,10 @@
using namespace std;
-namespace tmmt
+namespace tmmt
{
- FuzzyMatchWrapper::FuzzyMatchWrapper(const std::string &sourcePath, const std::string &targetPath, const std::string &alignmentPath)
+FuzzyMatchWrapper::FuzzyMatchWrapper(const std::string &sourcePath, const std::string &targetPath, const std::string &alignmentPath)
:basic_flag(false)
,lsed_flag(true)
,refined_flag(true)
@@ -30,790 +30,735 @@ namespace tmmt
,multiple_flag(true)
,multiple_slack(0)
,multiple_max(100)
- {
- cerr << "creating suffix array" << endl;
- suffixArray = new tmmt::SuffixArray( sourcePath );
+{
+ cerr << "creating suffix array" << endl;
+ suffixArray = new tmmt::SuffixArray( sourcePath );
- //cerr << "loading source data" << endl;
- //load_corpus(sourcePath, source);
+ //cerr << "loading source data" << endl;
+ //load_corpus(sourcePath, source);
- cerr << "loading target data" << endl;
- load_target(targetPath, targetAndAlignment);
+ cerr << "loading target data" << endl;
+ load_target(targetPath, targetAndAlignment);
- cerr << "loading alignment" << endl;
- load_alignment(alignmentPath, targetAndAlignment);
+ cerr << "loading alignment" << endl;
+ load_alignment(alignmentPath, targetAndAlignment);
- // create suffix array
- //load_corpus(m_config[0], input);
-
- cerr << "loading completed" << endl;
- }
+ // create suffix array
+ //load_corpus(m_config[0], input);
+
+ cerr << "loading completed" << endl;
+}
- string FuzzyMatchWrapper::Extract(long translationId, const string &dirNameStr)
- {
- const Moses::StaticData &staticData = Moses::StaticData::Instance();
+string FuzzyMatchWrapper::Extract(long translationId, const string &dirNameStr)
+{
+ const Moses::StaticData &staticData = Moses::StaticData::Instance();
+
+ WordIndex wordIndex;
- WordIndex wordIndex;
+ string fuzzyMatchFile = ExtractTM(wordIndex, translationId, dirNameStr);
- string fuzzyMatchFile = ExtractTM(wordIndex, translationId, dirNameStr);
-
- // create extrac files
- create_xml(fuzzyMatchFile);
+ // create extrac files
+ create_xml(fuzzyMatchFile);
- // create phrase table with usual Moses scoring and consolidate programs
- string cmd;
- cmd = "LC_ALL=C sort " + fuzzyMatchFile + ".extract | gzip -c > "
- + fuzzyMatchFile + ".extract.sorted.gz";
- system(cmd.c_str());
- cmd = "LC_ALL=C sort " + fuzzyMatchFile + ".extract.inv | gzip -c > "
- + fuzzyMatchFile + ".extract.inv.sorted.gz";
- system(cmd.c_str());
+ // create phrase table with usual Moses scoring and consolidate programs
+ string cmd;
+ cmd = "LC_ALL=C sort " + fuzzyMatchFile + ".extract | gzip -c > "
+ + fuzzyMatchFile + ".extract.sorted.gz";
+ system(cmd.c_str());
+ cmd = "LC_ALL=C sort " + fuzzyMatchFile + ".extract.inv | gzip -c > "
+ + fuzzyMatchFile + ".extract.inv.sorted.gz";
+ system(cmd.c_str());
#ifdef IS_XCODE
- cmd = "/Users/hieuhoang/unison/workspace/github/moses-smt/bin";
+ cmd = "/Users/hieuhoang/unison/workspace/github/moses-smt/bin";
#elif IS_ECLIPSE
- cmd = "/home/hieu/workspace/github/moses-smt/bin";
+ cmd = "/home/hieu/workspace/github/moses-smt/bin";
#else
- cmd = staticData.GetBinDirectory();
+ cmd = staticData.GetBinDirectory();
#endif
- cmd += string("/../scripts/training/train-model.perl -dont-zip -first-step 6 -last-step 6 -f en -e fr -hierarchical ")
- + " -extract-file " + fuzzyMatchFile + ".extract -lexical-file - -score-options \"--NoLex\" "
- + " -phrase-translation-table " + fuzzyMatchFile + ".pt";
- system(cmd.c_str());
+ cmd += string("/../scripts/training/train-model.perl -dont-zip -first-step 6 -last-step 6 -f en -e fr -hierarchical ")
+ + " -extract-file " + fuzzyMatchFile + ".extract -lexical-file - -score-options \"--NoLex\" "
+ + " -phrase-translation-table " + fuzzyMatchFile + ".pt";
+ system(cmd.c_str());
+
+
+ return fuzzyMatchFile + ".pt.gz";
+}
+
+string FuzzyMatchWrapper::ExtractTM(WordIndex &wordIndex, long translationId, const string &dirNameStr)
+{
+ const std::vector< std::vector< WORD_ID > > &source = suffixArray->GetCorpus();
+
+ string inputPath = dirNameStr + "/in";
+ string fuzzyMatchFile = dirNameStr + "/fuzzyMatchFile";
+ ofstream fuzzyMatchStream(fuzzyMatchFile.c_str());
+
+ vector< vector< WORD_ID > > input;
+ load_corpus(inputPath, input);
+
+ assert(input.size() == 1);
+ size_t sentenceInd = 0;
+
+ clock_t start_clock = clock();
+ // if (i % 10 == 0) cerr << ".";
+
+ // establish some basic statistics
+ // int input_length = compute_length( input[i] );
+ int input_length = input[sentenceInd].size();
+ int best_cost = input_length * (100-min_match) / 100 + 1;
- return fuzzyMatchFile + ".pt.gz";
+ int match_count = 0; // how many substring matches to be considered
+ //cerr << endl << "sentence " << i << ", length " << input_length << ", best_cost " << best_cost << endl;
+
+ // find match ranges in suffix array
+ vector< vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > > match_range;
+ for(size_t start=0; start<input[sentenceInd].size(); start++) {
+ SuffixArray::INDEX prior_first_match = 0;
+ SuffixArray::INDEX prior_last_match = suffixArray->GetSize()-1;
+ vector< string > substring;
+ bool stillMatched = true;
+ vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > matchedAtThisStart;
+ //cerr << "start: " << start;
+ for(int word=start; stillMatched && word<input[sentenceInd].size(); word++) {
+ substring.push_back( GetVocabulary().GetWord( input[sentenceInd][word] ) );
+
+ // only look up, if needed (i.e. no unnecessary short gram lookups)
+ // if (! word-start+1 <= short_match_max_length( input_length ) )
+ // {
+ SuffixArray::INDEX first_match, last_match;
+ stillMatched = false;
+ if (suffixArray->FindMatches( substring, first_match, last_match, prior_first_match, prior_last_match ) ) {
+ stillMatched = true;
+ matchedAtThisStart.push_back( make_pair( first_match, last_match ) );
+ //cerr << " (" << first_match << "," << last_match << ")";
+ //cerr << " " << ( last_match - first_match + 1 );
+ prior_first_match = first_match;
+ prior_last_match = last_match;
+ }
+ //}
+ }
+ //cerr << endl;
+ match_range.push_back( matchedAtThisStart );
}
-
- string FuzzyMatchWrapper::ExtractTM(WordIndex &wordIndex, long translationId, const string &dirNameStr)
- {
- const std::vector< std::vector< WORD_ID > > &source = suffixArray->GetCorpus();
-
- string inputPath = dirNameStr + "/in";
- string fuzzyMatchFile = dirNameStr + "/fuzzyMatchFile";
- ofstream fuzzyMatchStream(fuzzyMatchFile.c_str());
-
- vector< vector< WORD_ID > > input;
- load_corpus(inputPath, input);
-
- assert(input.size() == 1);
- size_t sentenceInd = 0;
-
- clock_t start_clock = clock();
- // if (i % 10 == 0) cerr << ".";
-
- // establish some basic statistics
-
- // int input_length = compute_length( input[i] );
- int input_length = input[sentenceInd].size();
- int best_cost = input_length * (100-min_match) / 100 + 1;
-
- int match_count = 0; // how many substring matches to be considered
- //cerr << endl << "sentence " << i << ", length " << input_length << ", best_cost " << best_cost << endl;
-
- // find match ranges in suffix array
- vector< vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > > match_range;
- for(size_t start=0;start<input[sentenceInd].size();start++)
- {
- SuffixArray::INDEX prior_first_match = 0;
- SuffixArray::INDEX prior_last_match = suffixArray->GetSize()-1;
- vector< string > substring;
- bool stillMatched = true;
- vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > matchedAtThisStart;
- //cerr << "start: " << start;
- for(int word=start; stillMatched && word<input[sentenceInd].size(); word++)
- {
- substring.push_back( GetVocabulary().GetWord( input[sentenceInd][word] ) );
-
- // only look up, if needed (i.e. no unnecessary short gram lookups)
- // if (! word-start+1 <= short_match_max_length( input_length ) )
- // {
- SuffixArray::INDEX first_match, last_match;
- stillMatched = false;
- if (suffixArray->FindMatches( substring, first_match, last_match, prior_first_match, prior_last_match ) )
- {
- stillMatched = true;
- matchedAtThisStart.push_back( make_pair( first_match, last_match ) );
- //cerr << " (" << first_match << "," << last_match << ")";
- //cerr << " " << ( last_match - first_match + 1 );
- prior_first_match = first_match;
- prior_last_match = last_match;
- }
- //}
- }
- //cerr << endl;
- match_range.push_back( matchedAtThisStart );
- }
-
- clock_t clock_range = clock();
-
- map< int, vector< Match > > sentence_match;
- map< int, int > sentence_match_word_count;
-
- // go through all matches, longest first
- for(int length = input[sentenceInd].size(); length >= 1; length--)
- {
- // do not create matches, if these are handled by the short match function
- if (length <= short_match_max_length( input_length ) )
- {
- continue;
- }
-
- unsigned int count = 0;
- for(int start = 0; start <= input[sentenceInd].size() - length; start++)
- {
- if (match_range[start].size() >= length)
- {
- pair< SuffixArray::INDEX, SuffixArray::INDEX > &range = match_range[start][length-1];
- // cerr << " (" << range.first << "," << range.second << ")";
- count += range.second - range.first + 1;
-
- for(SuffixArray::INDEX i=range.first; i<=range.second; i++)
- {
- int position = suffixArray->GetPosition( i );
-
- // sentence length mismatch
- size_t sentence_id = suffixArray->GetSentence( position );
- int sentence_length = suffixArray->GetSentenceLength( sentence_id );
- int diff = abs( (int)sentence_length - (int)input_length );
- // cerr << endl << i << "\tsentence " << sentence_id << ", length " << sentence_length;
- //if (length <= 2 && input_length>=5 &&
- // sentence_match.find( sentence_id ) == sentence_match.end())
- // continue;
-
- if (diff > best_cost)
- continue;
-
- // compute minimal cost
- int start_pos = suffixArray->GetWordInSentence( position );
- int end_pos = start_pos + length-1;
- // cerr << endl << "\t" << start_pos << "-" << end_pos << " (" << sentence_length << ") vs. "
- // << start << "-" << (start+length-1) << " (" << input_length << ")";
- // different number of prior words -> cost is at least diff
- int min_cost = abs( start - start_pos );
-
- // same number of words, but not sent. start -> cost is at least 1
- if (start == start_pos && start>0)
- min_cost++;
-
- // different number of remaining words -> cost is at least diff
- min_cost += abs( ( sentence_length-1 - end_pos ) -
- ( input_length-1 - (start+length-1) ) );
-
- // same number of words, but not sent. end -> cost is at least 1
- if ( sentence_length-1 - end_pos ==
- input_length-1 - (start+length-1)
- && end_pos != sentence_length-1 )
- min_cost++;
-
- // cerr << " -> min_cost " << min_cost;
- if (min_cost > best_cost)
- continue;
-
- // valid match
- match_count++;
-
- // compute maximal cost
- int max_cost = max( start, start_pos )
- + max( sentence_length-1 - end_pos,
- input_length-1 - (start+length-1) );
- // cerr << ", max_cost " << max_cost;
-
- Match m = Match( start, start+length-1,
- start_pos, start_pos+length-1,
- min_cost, max_cost, 0);
- sentence_match[ sentence_id ].push_back( m );
- sentence_match_word_count[ sentence_id ] += length;
-
- if (max_cost < best_cost)
- {
- best_cost = max_cost;
- if (best_cost == 0) break;
- }
- //if (match_count >= MAX_MATCH_COUNT) break;
- }
- }
- // cerr << endl;
- if (best_cost == 0) break;
- //if (match_count >= MAX_MATCH_COUNT) break;
- }
- // cerr << count << " matches at length " << length << " in " << sentence_match.size() << " tm." << endl;
-
- if (best_cost == 0) break;
- //if (match_count >= MAX_MATCH_COUNT) break;
- }
- cerr << match_count << " matches in " << sentence_match.size() << " sentences." << endl;
-
- clock_t clock_matches = clock();
-
- // consider each sentence for which we have matches
- int old_best_cost = best_cost;
- int tm_count_word_match = 0;
- int tm_count_word_match2 = 0;
- int pruned_match_count = 0;
- if (short_match_max_length( input_length ))
- {
- init_short_matches(wordIndex, translationId, input[sentenceInd] );
- }
- vector< int > best_tm;
- typedef map< int, vector< Match > >::iterator I;
-
- clock_t clock_validation_sum = 0;
-
- for(I tm=sentence_match.begin(); tm!=sentence_match.end(); tm++)
- {
- int tmID = tm->first;
- int tm_length = suffixArray->GetSentenceLength(tmID);
- vector< Match > &match = tm->second;
- add_short_matches(wordIndex, translationId, match, source[tmID], input_length, best_cost );
-
- //cerr << "match in sentence " << tmID << ": " << match.size() << " [" << tm_length << "]" << endl;
-
- // quick look: how many words are matched
- int words_matched = 0;
- for(int m=0;m<match.size();m++) {
-
- if (match[m].min_cost <= best_cost) // makes no difference
- words_matched += match[m].input_end - match[m].input_start + 1;
- }
- if (max(input_length,tm_length) - words_matched > best_cost)
- {
- if (length_filter_flag) continue;
- }
- tm_count_word_match++;
-
- // prune, check again how many words are matched
- vector< Match > pruned = prune_matches( match, best_cost );
- words_matched = 0;
- for(int p=0;p<pruned.size();p++) {
- words_matched += pruned[p].input_end - pruned[p].input_start + 1;
- }
- if (max(input_length,tm_length) - words_matched > best_cost)
- {
- if (length_filter_flag) continue;
- }
- tm_count_word_match2++;
-
- pruned_match_count += pruned.size();
- int prior_best_cost = best_cost;
- int cost;
-
- clock_t clock_validation_start = clock();
- if (! parse_flag ||
- pruned.size()>=10) // to prevent worst cases
- {
- string path;
- cost = sed( input[sentenceInd], source[tmID], path, false );
- if (cost < best_cost)
- {
- best_cost = cost;
- }
- }
-
- else
- {
- cost = parse_matches( pruned, input_length, tm_length, best_cost );
- if (prior_best_cost != best_cost)
- {
- best_tm.clear();
- }
- }
- clock_validation_sum += clock() - clock_validation_start;
- if (cost == best_cost)
- {
- best_tm.push_back( tmID );
- }
- }
- cerr << "reduced best cost from " << old_best_cost << " to " << best_cost << endl;
- cerr << "tm considered: " << sentence_match.size()
- << " word-matched: " << tm_count_word_match
- << " word-matched2: " << tm_count_word_match2
- << " best: " << best_tm.size() << endl;
-
- cerr << "pruned matches: " << ((float)pruned_match_count/(float)tm_count_word_match2) << endl;
-
- // create xml and extract files
- string inputStr, sourceStr;
- for (size_t pos = 0; pos < input_length; ++pos) {
- inputStr += GetVocabulary().GetWord(input[sentenceInd][pos]) + " ";
+
+ clock_t clock_range = clock();
+
+ map< int, vector< Match > > sentence_match;
+ map< int, int > sentence_match_word_count;
+
+ // go through all matches, longest first
+ for(int length = input[sentenceInd].size(); length >= 1; length--) {
+ // do not create matches, if these are handled by the short match function
+ if (length <= short_match_max_length( input_length ) ) {
+ continue;
}
-
- // do not try to find the best ... report multiple matches
- if (multiple_flag) {
- int input_letter_length = compute_length( input[sentenceInd] );
- for(int si=0; si<best_tm.size(); si++) {
- int s = best_tm[si];
- string path;
- unsigned int letter_cost = sed( input[sentenceInd], source[s], path, true );
- // do not report multiple identical sentences, but just their count
- //cout << sentenceInd << " "; // sentence number
- //cout << letter_cost << "/" << input_letter_length << " ";
- //cout << "(" << best_cost <<"/" << input_length <<") ";
- //cout << "||| " << s << " ||| " << path << endl;
-
- const vector<WORD_ID> &sourceSentence = source[s];
- vector<SentenceAlignment> &targets = targetAndAlignment[s];
- create_extract(sentenceInd, best_cost, sourceSentence, targets, inputStr, path, fuzzyMatchStream);
-
- }
- } // if (multiple_flag)
- else {
-
- // find the best matches according to letter sed
- string best_path = "";
- int best_match = -1;
- int best_letter_cost;
- if (lsed_flag) {
- best_letter_cost = compute_length( input[sentenceInd] ) * min_match / 100 + 1;
- for(int si=0; si<best_tm.size(); si++)
- {
- int s = best_tm[si];
- string path;
- unsigned int letter_cost = sed( input[sentenceInd], source[s], path, true );
- if (letter_cost < best_letter_cost)
- {
- best_letter_cost = letter_cost;
- best_path = path;
- best_match = s;
+
+ unsigned int count = 0;
+ for(int start = 0; start <= input[sentenceInd].size() - length; start++) {
+ if (match_range[start].size() >= length) {
+ pair< SuffixArray::INDEX, SuffixArray::INDEX > &range = match_range[start][length-1];
+ // cerr << " (" << range.first << "," << range.second << ")";
+ count += range.second - range.first + 1;
+
+ for(SuffixArray::INDEX i=range.first; i<=range.second; i++) {
+ int position = suffixArray->GetPosition( i );
+
+ // sentence length mismatch
+ size_t sentence_id = suffixArray->GetSentence( position );
+ int sentence_length = suffixArray->GetSentenceLength( sentence_id );
+ int diff = abs( (int)sentence_length - (int)input_length );
+ // cerr << endl << i << "\tsentence " << sentence_id << ", length " << sentence_length;
+ //if (length <= 2 && input_length>=5 &&
+ // sentence_match.find( sentence_id ) == sentence_match.end())
+ // continue;
+
+ if (diff > best_cost)
+ continue;
+
+ // compute minimal cost
+ int start_pos = suffixArray->GetWordInSentence( position );
+ int end_pos = start_pos + length-1;
+ // cerr << endl << "\t" << start_pos << "-" << end_pos << " (" << sentence_length << ") vs. "
+ // << start << "-" << (start+length-1) << " (" << input_length << ")";
+ // different number of prior words -> cost is at least diff
+ int min_cost = abs( start - start_pos );
+
+ // same number of words, but not sent. start -> cost is at least 1
+ if (start == start_pos && start>0)
+ min_cost++;
+
+ // different number of remaining words -> cost is at least diff
+ min_cost += abs( ( sentence_length-1 - end_pos ) -
+ ( input_length-1 - (start+length-1) ) );
+
+ // same number of words, but not sent. end -> cost is at least 1
+ if ( sentence_length-1 - end_pos ==
+ input_length-1 - (start+length-1)
+ && end_pos != sentence_length-1 )
+ min_cost++;
+
+ // cerr << " -> min_cost " << min_cost;
+ if (min_cost > best_cost)
+ continue;
+
+ // valid match
+ match_count++;
+
+ // compute maximal cost
+ int max_cost = max( start, start_pos )
+ + max( sentence_length-1 - end_pos,
+ input_length-1 - (start+length-1) );
+ // cerr << ", max_cost " << max_cost;
+
+ Match m = Match( start, start+length-1,
+ start_pos, start_pos+length-1,
+ min_cost, max_cost, 0);
+ sentence_match[ sentence_id ].push_back( m );
+ sentence_match_word_count[ sentence_id ] += length;
+
+ if (max_cost < best_cost) {
+ best_cost = max_cost;
+ if (best_cost == 0) break;
}
+ //if (match_count >= MAX_MATCH_COUNT) break;
}
}
- // if letter sed turned off, just compute path for first match
- else {
- if (best_tm.size() > 0) {
- string path;
- sed( input[sentenceInd], source[best_tm[0]], path, false );
- best_path = path;
- best_match = best_tm[0];
- }
- }
- cerr << "elapsed: " << (1000 * (clock()-start_clock) / CLOCKS_PER_SEC)
- << " ( range: " << (1000 * (clock_range-start_clock) / CLOCKS_PER_SEC)
- << " match: " << (1000 * (clock_matches-clock_range) / CLOCKS_PER_SEC)
- << " tm: " << (1000 * (clock()-clock_matches) / CLOCKS_PER_SEC)
- << " (validation: " << (1000 * (clock_validation_sum) / CLOCKS_PER_SEC) << ")"
- << " )" << endl;
- if (lsed_flag) {
- //cout << best_letter_cost << "/" << compute_length( input[sentenceInd] ) << " (";
- }
- //cout << best_cost <<"/" << input_length;
- if (lsed_flag) {
- //cout << ")";
- }
- //cout << " ||| " << best_match << " ||| " << best_path << endl;
-
- if (best_match == -1) {
- CHECK(source.size());
- best_match = 0;
- }
-
- // creat xml & extracts
- const vector<WORD_ID> &sourceSentence = source[best_match];
- vector<SentenceAlignment> &targets = targetAndAlignment[best_match];
- create_extract(sentenceInd, best_cost, sourceSentence, targets, inputStr, best_path, fuzzyMatchStream);
-
- } // else if (multiple_flag)
-
- fuzzyMatchStream.close();
-
- return fuzzyMatchFile;
+ // cerr << endl;
+ if (best_cost == 0) break;
+ //if (match_count >= MAX_MATCH_COUNT) break;
+ }
+ // cerr << count << " matches at length " << length << " in " << sentence_match.size() << " tm." << endl;
+
+ if (best_cost == 0) break;
+ //if (match_count >= MAX_MATCH_COUNT) break;
}
+ cerr << match_count << " matches in " << sentence_match.size() << " sentences." << endl;
- void FuzzyMatchWrapper::load_corpus( const std::string &fileName, vector< vector< WORD_ID > > &corpus )
- { // source
- ifstream fileStream;
- fileStream.open(fileName.c_str());
- if (!fileStream) {
- cerr << "file not found: " << fileName << endl;
- exit(1);
+ clock_t clock_matches = clock();
+
+ // consider each sentence for which we have matches
+ int old_best_cost = best_cost;
+ int tm_count_word_match = 0;
+ int tm_count_word_match2 = 0;
+ int pruned_match_count = 0;
+ if (short_match_max_length( input_length )) {
+ init_short_matches(wordIndex, translationId, input[sentenceInd] );
+ }
+ vector< int > best_tm;
+ typedef map< int, vector< Match > >::iterator I;
+
+ clock_t clock_validation_sum = 0;
+
+ for(I tm=sentence_match.begin(); tm!=sentence_match.end(); tm++) {
+ int tmID = tm->first;
+ int tm_length = suffixArray->GetSentenceLength(tmID);
+ vector< Match > &match = tm->second;
+ add_short_matches(wordIndex, translationId, match, source[tmID], input_length, best_cost );
+
+ //cerr << "match in sentence " << tmID << ": " << match.size() << " [" << tm_length << "]" << endl;
+
+ // quick look: how many words are matched
+ int words_matched = 0;
+ for(int m=0; m<match.size(); m++) {
+
+ if (match[m].min_cost <= best_cost) // makes no difference
+ words_matched += match[m].input_end - match[m].input_start + 1;
}
- cerr << "loading " << fileName << endl;
-
- istream *fileStreamP = &fileStream;
-
- char line[LINE_MAX_LENGTH];
- while(true)
- {
- SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n');
- if (fileStreamP->eof()) break;
- corpus.push_back( GetVocabulary().Tokenize( line ) );
+ if (max(input_length,tm_length) - words_matched > best_cost) {
+ if (length_filter_flag) continue;
}
- }
-
- void FuzzyMatchWrapper::load_target(const std::string &fileName, vector< vector< SentenceAlignment > > &corpus)
- {
- ifstream fileStream;
- fileStream.open(fileName.c_str());
- if (!fileStream) {
- cerr << "file not found: " << fileName << endl;
- exit(1);
+ tm_count_word_match++;
+
+ // prune, check again how many words are matched
+ vector< Match > pruned = prune_matches( match, best_cost );
+ words_matched = 0;
+ for(int p=0; p<pruned.size(); p++) {
+ words_matched += pruned[p].input_end - pruned[p].input_start + 1;
}
- cerr << "loading " << fileName << endl;
-
- istream *fileStreamP = &fileStream;
-
- WORD_ID delimiter = GetVocabulary().StoreIfNew("|||");
-
- int lineNum = 0;
- char line[LINE_MAX_LENGTH];
- while(true)
- {
- SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n');
- if (fileStreamP->eof()) break;
-
- vector<WORD_ID> toks = GetVocabulary().Tokenize( line );
-
- corpus.push_back(vector< SentenceAlignment >());
- vector< SentenceAlignment > &vec = corpus.back();
-
- vec.push_back(SentenceAlignment());
- SentenceAlignment *sentence = &vec.back();
-
- const WORD &countStr = GetVocabulary().GetWord(toks[0]);
- sentence->count = atoi(countStr.c_str());
-
- for (size_t i = 1; i < toks.size(); ++i) {
- WORD_ID wordId = toks[i];
-
- if (wordId == delimiter) {
- // target and alignments can have multiple sentences.
- vec.push_back(SentenceAlignment());
- sentence = &vec.back();
-
- // count
- ++i;
-
- const WORD &countStr = GetVocabulary().GetWord(toks[i]);
- sentence->count = atoi(countStr.c_str());
- }
- else {
- // just a normal word, add
- sentence->target.push_back(wordId);
- }
+ if (max(input_length,tm_length) - words_matched > best_cost) {
+ if (length_filter_flag) continue;
+ }
+ tm_count_word_match2++;
+
+ pruned_match_count += pruned.size();
+ int prior_best_cost = best_cost;
+ int cost;
+
+ clock_t clock_validation_start = clock();
+ if (! parse_flag ||
+ pruned.size()>=10) { // to prevent worst cases
+ string path;
+ cost = sed( input[sentenceInd], source[tmID], path, false );
+ if (cost < best_cost) {
+ best_cost = cost;
+ }
+ }
+
+ else {
+ cost = parse_matches( pruned, input_length, tm_length, best_cost );
+ if (prior_best_cost != best_cost) {
+ best_tm.clear();
}
-
- ++lineNum;
-
}
-
+ clock_validation_sum += clock() - clock_validation_start;
+ if (cost == best_cost) {
+ best_tm.push_back( tmID );
+ }
+ }
+ cerr << "reduced best cost from " << old_best_cost << " to " << best_cost << endl;
+ cerr << "tm considered: " << sentence_match.size()
+ << " word-matched: " << tm_count_word_match
+ << " word-matched2: " << tm_count_word_match2
+ << " best: " << best_tm.size() << endl;
+
+ cerr << "pruned matches: " << ((float)pruned_match_count/(float)tm_count_word_match2) << endl;
+
+ // create xml and extract files
+ string inputStr, sourceStr;
+ for (size_t pos = 0; pos < input_length; ++pos) {
+ inputStr += GetVocabulary().GetWord(input[sentenceInd][pos]) + " ";
}
-
-
- void FuzzyMatchWrapper::load_alignment(const std::string &fileName, vector< vector< SentenceAlignment > > &corpus )
- {
- ifstream fileStream;
- fileStream.open(fileName.c_str());
- if (!fileStream) {
- cerr << "file not found: " << fileName << endl;
- exit(1);
+
+ // do not try to find the best ... report multiple matches
+ if (multiple_flag) {
+ int input_letter_length = compute_length( input[sentenceInd] );
+ for(int si=0; si<best_tm.size(); si++) {
+ int s = best_tm[si];
+ string path;
+ unsigned int letter_cost = sed( input[sentenceInd], source[s], path, true );
+ // do not report multiple identical sentences, but just their count
+ //cout << sentenceInd << " "; // sentence number
+ //cout << letter_cost << "/" << input_letter_length << " ";
+ //cout << "(" << best_cost <<"/" << input_length <<") ";
+ //cout << "||| " << s << " ||| " << path << endl;
+
+ const vector<WORD_ID> &sourceSentence = source[s];
+ vector<SentenceAlignment> &targets = targetAndAlignment[s];
+ create_extract(sentenceInd, best_cost, sourceSentence, targets, inputStr, path, fuzzyMatchStream);
+
}
- cerr << "loading " << fileName << endl;
-
- istream *fileStreamP = &fileStream;
-
- string delimiter = "|||";
-
- int lineNum = 0;
- char line[LINE_MAX_LENGTH];
- while(true)
- {
- SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n');
- if (fileStreamP->eof()) break;
-
- vector< SentenceAlignment > &vec = corpus[lineNum];
- size_t targetInd = 0;
- SentenceAlignment *sentence = &vec[targetInd];
-
- vector<string> toks = Moses::Tokenize(line);
-
- for (size_t i = 0; i < toks.size(); ++i) {
- string &tok = toks[i];
-
- if (tok == delimiter) {
- // target and alignments can have multiple sentences.
- ++targetInd;
- sentence = &vec[targetInd];
-
- ++i;
- }
- else {
- // just a normal alignment, add
- vector<int> alignPoint = Moses::Tokenize<int>(tok, "-");
- assert(alignPoint.size() == 2);
- sentence->alignment.push_back(pair<int,int>(alignPoint[0], alignPoint[1]));
+ } // if (multiple_flag)
+ else {
+
+ // find the best matches according to letter sed
+ string best_path = "";
+ int best_match = -1;
+ int best_letter_cost;
+ if (lsed_flag) {
+ best_letter_cost = compute_length( input[sentenceInd] ) * min_match / 100 + 1;
+ for(int si=0; si<best_tm.size(); si++) {
+ int s = best_tm[si];
+ string path;
+ unsigned int letter_cost = sed( input[sentenceInd], source[s], path, true );
+ if (letter_cost < best_letter_cost) {
+ best_letter_cost = letter_cost;
+ best_path = path;
+ best_match = s;
}
}
-
- ++lineNum;
-
}
+ // if letter sed turned off, just compute path for first match
+ else {
+ if (best_tm.size() > 0) {
+ string path;
+ sed( input[sentenceInd], source[best_tm[0]], path, false );
+ best_path = path;
+ best_match = best_tm[0];
+ }
+ }
+ cerr << "elapsed: " << (1000 * (clock()-start_clock) / CLOCKS_PER_SEC)
+ << " ( range: " << (1000 * (clock_range-start_clock) / CLOCKS_PER_SEC)
+ << " match: " << (1000 * (clock_matches-clock_range) / CLOCKS_PER_SEC)
+ << " tm: " << (1000 * (clock()-clock_matches) / CLOCKS_PER_SEC)
+ << " (validation: " << (1000 * (clock_validation_sum) / CLOCKS_PER_SEC) << ")"
+ << " )" << endl;
+ if (lsed_flag) {
+ //cout << best_letter_cost << "/" << compute_length( input[sentenceInd] ) << " (";
+ }
+ //cout << best_cost <<"/" << input_length;
+ if (lsed_flag) {
+ //cout << ")";
+ }
+ //cout << " ||| " << best_match << " ||| " << best_path << endl;
+
+ if (best_match == -1) {
+ CHECK(source.size());
+ best_match = 0;
+ }
+
+ // creat xml & extracts
+ const vector<WORD_ID> &sourceSentence = source[best_match];
+ vector<SentenceAlignment> &targets = targetAndAlignment[best_match];
+ create_extract(sentenceInd, best_cost, sourceSentence, targets, inputStr, best_path, fuzzyMatchStream);
+
+ } // else if (multiple_flag)
+
+ fuzzyMatchStream.close();
+
+ return fuzzyMatchFile;
+}
+
+void FuzzyMatchWrapper::load_corpus( const std::string &fileName, vector< vector< WORD_ID > > &corpus )
+{
+ // source
+ ifstream fileStream;
+ fileStream.open(fileName.c_str());
+ if (!fileStream) {
+ cerr << "file not found: " << fileName << endl;
+ exit(1);
}
-
- bool FuzzyMatchWrapper::GetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, unsigned int &value) const
- {
-#ifdef WITH_THREADS
- boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
-#endif
- map< pair< WORD_ID, WORD_ID >, unsigned int >::const_iterator lookup = m_lsed.find( key );
- if (lookup != m_lsed.end()) {
- value = lookup->second;
- return true;
+ cerr << "loading " << fileName << endl;
+
+ istream *fileStreamP = &fileStream;
+
+ char line[LINE_MAX_LENGTH];
+ while(true) {
+ SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n');
+ if (fileStreamP->eof()) break;
+ corpus.push_back( GetVocabulary().Tokenize( line ) );
+ }
+}
+
+void FuzzyMatchWrapper::load_target(const std::string &fileName, vector< vector< SentenceAlignment > > &corpus)
+{
+ ifstream fileStream;
+ fileStream.open(fileName.c_str());
+ if (!fileStream) {
+ cerr << "file not found: " << fileName << endl;
+ exit(1);
+ }
+ cerr << "loading " << fileName << endl;
+
+ istream *fileStreamP = &fileStream;
+
+ WORD_ID delimiter = GetVocabulary().StoreIfNew("|||");
+
+ int lineNum = 0;
+ char line[LINE_MAX_LENGTH];
+ while(true) {
+ SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n');
+ if (fileStreamP->eof()) break;
+
+ vector<WORD_ID> toks = GetVocabulary().Tokenize( line );
+
+ corpus.push_back(vector< SentenceAlignment >());
+ vector< SentenceAlignment > &vec = corpus.back();
+
+ vec.push_back(SentenceAlignment());
+ SentenceAlignment *sentence = &vec.back();
+
+ const WORD &countStr = GetVocabulary().GetWord(toks[0]);
+ sentence->count = atoi(countStr.c_str());
+
+ for (size_t i = 1; i < toks.size(); ++i) {
+ WORD_ID wordId = toks[i];
+
+ if (wordId == delimiter) {
+ // target and alignments can have multiple sentences.
+ vec.push_back(SentenceAlignment());
+ sentence = &vec.back();
+
+ // count
+ ++i;
+
+ const WORD &countStr = GetVocabulary().GetWord(toks[i]);
+ sentence->count = atoi(countStr.c_str());
+ } else {
+ // just a normal word, add
+ sentence->target.push_back(wordId);
+ }
+ }
+
+ ++lineNum;
+
+ }
+
+}
+
+
+void FuzzyMatchWrapper::load_alignment(const std::string &fileName, vector< vector< SentenceAlignment > > &corpus )
+{
+ ifstream fileStream;
+ fileStream.open(fileName.c_str());
+ if (!fileStream) {
+ cerr << "file not found: " << fileName << endl;
+ exit(1);
+ }
+ cerr << "loading " << fileName << endl;
+
+ istream *fileStreamP = &fileStream;
+
+ string delimiter = "|||";
+
+ int lineNum = 0;
+ char line[LINE_MAX_LENGTH];
+ while(true) {
+ SAFE_GETLINE((*fileStreamP), line, LINE_MAX_LENGTH, '\n');
+ if (fileStreamP->eof()) break;
+
+ vector< SentenceAlignment > &vec = corpus[lineNum];
+ size_t targetInd = 0;
+ SentenceAlignment *sentence = &vec[targetInd];
+
+ vector<string> toks = Moses::Tokenize(line);
+
+ for (size_t i = 0; i < toks.size(); ++i) {
+ string &tok = toks[i];
+
+ if (tok == delimiter) {
+ // target and alignments can have multiple sentences.
+ ++targetInd;
+ sentence = &vec[targetInd];
+
+ ++i;
+ } else {
+ // just a normal alignment, add
+ vector<int> alignPoint = Moses::Tokenize<int>(tok, "-");
+ assert(alignPoint.size() == 2);
+ sentence->alignment.push_back(pair<int,int>(alignPoint[0], alignPoint[1]));
+ }
}
- return false;
+ ++lineNum;
+
}
+}
- void FuzzyMatchWrapper::SetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, const unsigned int &value)
- {
+bool FuzzyMatchWrapper::GetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, unsigned int &value) const
+{
#ifdef WITH_THREADS
- boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
+ boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
#endif
- m_lsed[ key ] = value;
+ map< pair< WORD_ID, WORD_ID >, unsigned int >::const_iterator lookup = m_lsed.find( key );
+ if (lookup != m_lsed.end()) {
+ value = lookup->second;
+ return true;
}
+ return false;
+}
+
+void FuzzyMatchWrapper::SetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, const unsigned int &value)
+{
+#ifdef WITH_THREADS
+ boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
+#endif
+ m_lsed[ key ] = value;
+}
+
/* Letter string edit distance, e.g. sub 'their' to 'there' costs 2 */
unsigned int FuzzyMatchWrapper::letter_sed( WORD_ID aIdx, WORD_ID bIdx )
{
- // check if already computed -> lookup in cache
- pair< WORD_ID, WORD_ID > pIdx = make_pair( aIdx, bIdx );
- unsigned int value;
- bool ret = GetLSEDCache(pIdx, value);
- if (ret) {
- return value;
- }
-
- // get surface strings for word indices
- const string &a = GetVocabulary().GetWord( aIdx );
- const string &b = GetVocabulary().GetWord( bIdx );
-
- // initialize cost matrix
- unsigned int **cost = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 );
- for( unsigned int i=0; i<=a.size(); i++ ) {
- cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 );
- cost[i][0] = i;
- }
- for( unsigned int j=0; j<=b.size(); j++ ) {
- cost[0][j] = j;
- }
-
- // core string edit distance loop
- for( unsigned int i=1; i<=a.size(); i++ ) {
- for( unsigned int j=1; j<=b.size(); j++ ) {
-
- unsigned int ins = cost[i-1][j] + 1;
- unsigned int del = cost[i][j-1] + 1;
- bool match = (a.substr(i-1,1).compare( b.substr(j-1,1) ) == 0);
- unsigned int diag = cost[i-1][j-1] + (match ? 0 : 1);
-
- unsigned int min = (ins < del) ? ins : del;
- min = (diag < min) ? diag : min;
-
- cost[i][j] = min;
- }
- }
-
- // clear out memory
- unsigned int final = cost[a.size()][b.size()];
- for( unsigned int i=0; i<=a.size(); i++ ) {
- free( cost[i] );
- }
- free( cost );
-
- // cache and return result
- SetLSEDCache(pIdx, final);
- return final;
-}
-
- /* string edit distance implementation */
-
- unsigned int FuzzyMatchWrapper::sed( const vector< WORD_ID > &a, const vector< WORD_ID > &b, string &best_path, bool use_letter_sed ) {
-
- // initialize cost and path matrices
- unsigned int **cost = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 );
- char **path = (char**) calloc( sizeof( char* ), a.size()+1 );
-
- for( unsigned int i=0; i<=a.size(); i++ ) {
- cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 );
- path[i] = (char*) calloc( sizeof(char), b.size()+1 );
- if (i>0)
- {
- cost[i][0] = cost[i-1][0];
- if (use_letter_sed)
- {
- cost[i][0] += GetVocabulary().GetWord( a[i-1] ).size();
- }
- else
- {
- cost[i][0]++;
- }
- }
- else
- {
- cost[i][0] = 0;
- }
- path[i][0] = 'I';
- }
+ // check if already computed -> lookup in cache
+ pair< WORD_ID, WORD_ID > pIdx = make_pair( aIdx, bIdx );
+ unsigned int value;
+ bool ret = GetLSEDCache(pIdx, value);
+ if (ret) {
+ return value;
+ }
- for( unsigned int j=0; j<=b.size(); j++ ) {
- if (j>0)
- {
- cost[0][j] = cost[0][j-1];
- if (use_letter_sed)
- {
- cost[0][j] += GetVocabulary().GetWord( b[j-1] ).size();
- }
- else
- {
- cost[0][j]++;
- }
- }
- else
- {
- cost[0][j] = 0;
- }
- path[0][j] = 'D';
+ // get surface strings for word indices
+ const string &a = GetVocabulary().GetWord( aIdx );
+ const string &b = GetVocabulary().GetWord( bIdx );
+
+ // initialize cost matrix
+ unsigned int **cost = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 );
+ for( unsigned int i=0; i<=a.size(); i++ ) {
+ cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 );
+ cost[i][0] = i;
+ }
+ for( unsigned int j=0; j<=b.size(); j++ ) {
+ cost[0][j] = j;
+ }
+
+ // core string edit distance loop
+ for( unsigned int i=1; i<=a.size(); i++ ) {
+ for( unsigned int j=1; j<=b.size(); j++ ) {
+
+ unsigned int ins = cost[i-1][j] + 1;
+ unsigned int del = cost[i][j-1] + 1;
+ bool match = (a.substr(i-1,1).compare( b.substr(j-1,1) ) == 0);
+ unsigned int diag = cost[i-1][j-1] + (match ? 0 : 1);
+
+ unsigned int min = (ins < del) ? ins : del;
+ min = (diag < min) ? diag : min;
+
+ cost[i][j] = min;
}
+ }
- // core string edit distance algorithm
- for( unsigned int i=1; i<=a.size(); i++ ) {
- for( unsigned int j=1; j<=b.size(); j++ ) {
- unsigned int ins = cost[i-1][j];
- unsigned int del = cost[i][j-1];
- unsigned int match;
- if (use_letter_sed)
- {
- ins += GetVocabulary().GetWord( a[i-1] ).size();
- del += GetVocabulary().GetWord( b[j-1] ).size();
- match = letter_sed( a[i-1], b[j-1] );
- }
- else
- {
- ins++;
- del++;
- match = ( a[i-1] == b[j-1] ) ? 0 : 1;
- }
- unsigned int diag = cost[i-1][j-1] + match;
-
- char action = (ins < del) ? 'I' : 'D';
- unsigned int min = (ins < del) ? ins : del;
- if (diag < min)
- {
- action = (match>0) ? 'S' : 'M';
- min = diag;
- }
+ // clear out memory
+ unsigned int final = cost[a.size()][b.size()];
+ for( unsigned int i=0; i<=a.size(); i++ ) {
+ free( cost[i] );
+ }
+ free( cost );
+
+ // cache and return result
+ SetLSEDCache(pIdx, final);
+ return final;
+}
+
+/* string edit distance implementation */
- cost[i][j] = min;
- path[i][j] = action;
+unsigned int FuzzyMatchWrapper::sed( const vector< WORD_ID > &a, const vector< WORD_ID > &b, string &best_path, bool use_letter_sed )
+{
+
+ // initialize cost and path matrices
+ unsigned int **cost = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 );
+ char **path = (char**) calloc( sizeof( char* ), a.size()+1 );
+
+ for( unsigned int i=0; i<=a.size(); i++ ) {
+ cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 );
+ path[i] = (char*) calloc( sizeof(char), b.size()+1 );
+ if (i>0) {
+ cost[i][0] = cost[i-1][0];
+ if (use_letter_sed) {
+ cost[i][0] += GetVocabulary().GetWord( a[i-1] ).size();
+ } else {
+ cost[i][0]++;
}
+ } else {
+ cost[i][0] = 0;
}
+ path[i][0] = 'I';
+ }
- // construct string for best path
- unsigned int i = a.size();
- unsigned int j = b.size();
- best_path = "";
- while( i>0 || j>0 )
- {
- best_path = path[i][j] + best_path;
- if (path[i][j] == 'I')
- {
- i--;
+ for( unsigned int j=0; j<=b.size(); j++ ) {
+ if (j>0) {
+ cost[0][j] = cost[0][j-1];
+ if (use_letter_sed) {
+ cost[0][j] += GetVocabulary().GetWord( b[j-1] ).size();
+ } else {
+ cost[0][j]++;
}
- else if (path[i][j] == 'D')
- {
- j--;
+ } else {
+ cost[0][j] = 0;
+ }
+ path[0][j] = 'D';
+ }
+
+ // core string edit distance algorithm
+ for( unsigned int i=1; i<=a.size(); i++ ) {
+ for( unsigned int j=1; j<=b.size(); j++ ) {
+ unsigned int ins = cost[i-1][j];
+ unsigned int del = cost[i][j-1];
+ unsigned int match;
+ if (use_letter_sed) {
+ ins += GetVocabulary().GetWord( a[i-1] ).size();
+ del += GetVocabulary().GetWord( b[j-1] ).size();
+ match = letter_sed( a[i-1], b[j-1] );
+ } else {
+ ins++;
+ del++;
+ match = ( a[i-1] == b[j-1] ) ? 0 : 1;
}
- else
- {
- i--;
- j--;
+ unsigned int diag = cost[i-1][j-1] + match;
+
+ char action = (ins < del) ? 'I' : 'D';
+ unsigned int min = (ins < del) ? ins : del;
+ if (diag < min) {
+ action = (match>0) ? 'S' : 'M';
+ min = diag;
}
+
+ cost[i][j] = min;
+ path[i][j] = action;
}
+ }
+ // construct string for best path
+ unsigned int i = a.size();
+ unsigned int j = b.size();
+ best_path = "";
+ while( i>0 || j>0 ) {
+ best_path = path[i][j] + best_path;
+ if (path[i][j] == 'I') {
+ i--;
+ } else if (path[i][j] == 'D') {
+ j--;
+ } else {
+ i--;
+ j--;
+ }
+ }
- // clear out memory
- unsigned int final = cost[a.size()][b.size()];
- for( unsigned int i=0; i<=a.size(); i++ ) {
- free( cost[i] );
- free( path[i] );
- }
- free( cost );
- free( path );
+ // clear out memory
+ unsigned int final = cost[a.size()][b.size()];
- // return result
- return final;
+ for( unsigned int i=0; i<=a.size(); i++ ) {
+ free( cost[i] );
+ free( path[i] );
}
+ free( cost );
+ free( path );
+
+ // return result
+ return final;
+}
-/* utlility function: compute length of sentence in characters
+/* utlility function: compute length of sentence in characters
(spaces do not count) */
unsigned int FuzzyMatchWrapper::compute_length( const vector< WORD_ID > &sentence )
{
- unsigned int length = 0; for( unsigned int i=0; i<sentence.size(); i++ )
- {
- length += GetVocabulary().GetWord( sentence[i] ).size();
- }
- return length;
+ unsigned int length = 0;
+ for( unsigned int i=0; i<sentence.size(); i++ ) {
+ length += GetVocabulary().GetWord( sentence[i] ).size();
+ }
+ return length;
}
/* brute force method: compare input to all corpus sentences */
- int FuzzyMatchWrapper::basic_fuzzy_match( vector< vector< WORD_ID > > source,
- vector< vector< WORD_ID > > input )
+int FuzzyMatchWrapper::basic_fuzzy_match( vector< vector< WORD_ID > > source,
+ vector< vector< WORD_ID > > input )
{
- // go through input set...
- for(unsigned int i=0;i<input.size();i++)
- {
- bool use_letter_sed = false;
-
- // compute sentence length and worst allowed cost
- unsigned int input_length;
- if (use_letter_sed)
- {
- input_length = compute_length( input[i] );
- }
- else
- {
- input_length = input[i].size();
- }
- unsigned int best_cost = input_length * (100-min_match) / 100 + 2;
- string best_path = "";
- int best_match = -1;
-
- // go through all corpus sentences
- for(unsigned int s=0;s<source.size();s++)
- {
- int source_length;
- if (use_letter_sed)
- {
- source_length = compute_length( source[s] );
- }
- else
- {
- source_length = source[s].size();
- }
- int diff = abs((int)source_length - (int)input_length);
- if (length_filter_flag && (diff >= best_cost))
- {
- continue;
- }
-
- // compute string edit distance
- string path;
- unsigned int cost = sed( input[i], source[s], path, use_letter_sed );
-
- // update if new best
- if (cost < best_cost)
- {
- best_cost = cost;
- best_path = path;
- best_match = s;
- }
- }
- //cout << best_cost << " ||| " << best_match << " ||| " << best_path << endl;
- }
+ // go through input set...
+ for(unsigned int i=0; i<input.size(); i++) {
+ bool use_letter_sed = false;
+
+ // compute sentence length and worst allowed cost
+ unsigned int input_length;
+ if (use_letter_sed) {
+ input_length = compute_length( input[i] );
+ } else {
+ input_length = input[i].size();
+ }
+ unsigned int best_cost = input_length * (100-min_match) / 100 + 2;
+ string best_path = "";
+ int best_match = -1;
+
+ // go through all corpus sentences
+ for(unsigned int s=0; s<source.size(); s++) {
+ int source_length;
+ if (use_letter_sed) {
+ source_length = compute_length( source[s] );
+ } else {
+ source_length = source[s].size();
+ }
+ int diff = abs((int)source_length - (int)input_length);
+ if (length_filter_flag && (diff >= best_cost)) {
+ continue;
+ }
+
+ // compute string edit distance
+ string path;
+ unsigned int cost = sed( input[i], source[s], path, use_letter_sed );
+
+ // update if new best
+ if (cost < best_cost) {
+ best_cost = cost;
+ best_path = path;
+ best_match = s;
+ }
+ }
+ //cout << best_cost << " ||| " << best_match << " ||| " << best_path << endl;
+ }
}
/* definition of short matches
@@ -823,274 +768,250 @@ unsigned int FuzzyMatchWrapper::compute_length( const vector< WORD_ID > &sentenc
int FuzzyMatchWrapper::short_match_max_length( int input_length )
{
- if ( ! refined_flag )
+ if ( ! refined_flag )
return 0;
if ( input_length >= 5 )
return 1;
- return 0;
+ return 0;
}
/* if we have non-short matches in a sentence, we need to
- take a closer look at it.
+ take a closer look at it.
this function creates a hash map for all input words and their positions
- (to be used by the next function)
+ (to be used by the next function)
(done here, because this has be done only once for an input sentence) */
void FuzzyMatchWrapper::init_short_matches(WordIndex &wordIndex, long translationId, const vector< WORD_ID > &input )
{
- int max_length = short_match_max_length( input.size() );
- if (max_length == 0)
- return;
-
- wordIndex.clear();
-
- // store input words and their positions in hash map
- for(int i=0; i<input.size(); i++)
- {
- if (wordIndex.find( input[i] ) == wordIndex.end())
- {
- vector< int > position_vector;
- wordIndex[ input[i] ] = position_vector;
- }
- wordIndex[ input[i] ].push_back( i );
- }
+ int max_length = short_match_max_length( input.size() );
+ if (max_length == 0)
+ return;
+
+ wordIndex.clear();
+
+ // store input words and their positions in hash map
+ for(int i=0; i<input.size(); i++) {
+ if (wordIndex.find( input[i] ) == wordIndex.end()) {
+ vector< int > position_vector;
+ wordIndex[ input[i] ] = position_vector;
+ }
+ wordIndex[ input[i] ].push_back( i );
+ }
}
/* add all short matches to list of matches for a sentence */
void FuzzyMatchWrapper::add_short_matches(WordIndex &wordIndex, long translationId, vector< Match > &match, const vector< WORD_ID > &tm, int input_length, int best_cost )
-{
- int max_length = short_match_max_length( input_length );
- if (max_length == 0)
- return;
-
- int tm_length = tm.size();
- map< WORD_ID,vector< int > >::iterator input_word_hit;
- for(int t_pos=0; t_pos<tm.size(); t_pos++)
- {
- input_word_hit = wordIndex.find( tm[t_pos] );
- if (input_word_hit != wordIndex.end())
- {
- vector< int > &position_vector = input_word_hit->second;
- for(int j=0; j<position_vector.size(); j++)
- {
- int &i_pos = position_vector[j];
-
- // before match
- int max_cost = max( i_pos , t_pos );
- int min_cost = abs( i_pos - t_pos );
- if ( i_pos>0 && i_pos == t_pos )
- min_cost++;
-
- // after match
- max_cost += max( (input_length-i_pos) , (tm_length-t_pos));
- min_cost += abs( (input_length-i_pos) - (tm_length-t_pos));
- if ( i_pos != input_length-1 && (input_length-i_pos) == (tm_length-t_pos))
- min_cost++;
-
- if (min_cost <= best_cost)
- {
- Match new_match( i_pos,i_pos, t_pos,t_pos, min_cost,max_cost,0 );
- match.push_back( new_match );
- }
- }
- }
- }
+{
+ int max_length = short_match_max_length( input_length );
+ if (max_length == 0)
+ return;
+
+ int tm_length = tm.size();
+ map< WORD_ID,vector< int > >::iterator input_word_hit;
+ for(int t_pos=0; t_pos<tm.size(); t_pos++) {
+ input_word_hit = wordIndex.find( tm[t_pos] );
+ if (input_word_hit != wordIndex.end()) {
+ vector< int > &position_vector = input_word_hit->second;
+ for(int j=0; j<position_vector.size(); j++) {
+ int &i_pos = position_vector[j];
+
+ // before match
+ int max_cost = max( i_pos , t_pos );
+ int min_cost = abs( i_pos - t_pos );
+ if ( i_pos>0 && i_pos == t_pos )
+ min_cost++;
+
+ // after match
+ max_cost += max( (input_length-i_pos) , (tm_length-t_pos));
+ min_cost += abs( (input_length-i_pos) - (tm_length-t_pos));
+ if ( i_pos != input_length-1 && (input_length-i_pos) == (tm_length-t_pos))
+ min_cost++;
+
+ if (min_cost <= best_cost) {
+ Match new_match( i_pos,i_pos, t_pos,t_pos, min_cost,max_cost,0 );
+ match.push_back( new_match );
+ }
+ }
+ }
+ }
}
/* remove matches that are subsumed by a larger match */
vector< Match > FuzzyMatchWrapper::prune_matches( const vector< Match > &match, int best_cost )
{
- //cerr << "\tpruning";
- vector< Match > pruned;
- for(int i=match.size()-1; i>=0; i--)
- {
- //cerr << " (" << match[i].input_start << "," << match[i].input_end
- // << " ; " << match[i].tm_start << "," << match[i].tm_end
- // << " * " << match[i].min_cost << ")";
-
- //if (match[i].min_cost > best_cost)
- // continue;
-
- bool subsumed = false;
- for(int j=match.size()-1; j>=0; j--)
- {
- if (i!=j // do not compare match with itself
- && ( match[i].input_end - match[i].input_start <=
- match[j].input_end - match[j].input_start ) // i shorter than j
- && ((match[i].input_start == match[j].input_start &&
- match[i].tm_start == match[j].tm_start ) ||
- (match[i].input_end == match[j].input_end &&
- match[i].tm_end == match[j].tm_end) ) )
- {
- subsumed = true;
- }
- }
- if (! subsumed && match[i].min_cost <= best_cost)
- {
- //cerr << "*";
- pruned.push_back( match[i] );
- }
- }
- //cerr << endl;
- return pruned;
+ //cerr << "\tpruning";
+ vector< Match > pruned;
+ for(int i=match.size()-1; i>=0; i--) {
+ //cerr << " (" << match[i].input_start << "," << match[i].input_end
+ // << " ; " << match[i].tm_start << "," << match[i].tm_end
+ // << " * " << match[i].min_cost << ")";
+
+ //if (match[i].min_cost > best_cost)
+ // continue;
+
+ bool subsumed = false;
+ for(int j=match.size()-1; j>=0; j--) {
+ if (i!=j // do not compare match with itself
+ && ( match[i].input_end - match[i].input_start <=
+ match[j].input_end - match[j].input_start ) // i shorter than j
+ && ((match[i].input_start == match[j].input_start &&
+ match[i].tm_start == match[j].tm_start ) ||
+ (match[i].input_end == match[j].input_end &&
+ match[i].tm_end == match[j].tm_end) ) ) {
+ subsumed = true;
+ }
+ }
+ if (! subsumed && match[i].min_cost <= best_cost) {
+ //cerr << "*";
+ pruned.push_back( match[i] );
+ }
+ }
+ //cerr << endl;
+ return pruned;
}
/* A* parsing method to compute string edit distance */
int FuzzyMatchWrapper::parse_matches( vector< Match > &match, int input_length, int tm_length, int &best_cost )
-{
- // cerr << "sentence has " << match.size() << " matches, best cost: " << best_cost << ", lengths input: " << input_length << " tm: " << tm_length << endl;
-
- if (match.size() == 1)
- return match[0].max_cost;
- if (match.size() == 0)
- return input_length+tm_length;
-
- int this_best_cost = input_length + tm_length;
- for(int i=0;i<match.size();i++)
- {
- this_best_cost = min( this_best_cost, match[i].max_cost );
- }
- // cerr << "\tthis best cost: " << this_best_cost << endl;
-
- // bottom up combination of spans
- vector< vector< Match > > multi_match;
- multi_match.push_back( match );
-
- int match_level = 1;
- while(multi_match[ match_level-1 ].size()>0)
- {
- // init vector
- vector< Match > empty;
- multi_match.push_back( empty );
-
- for(int first_level = 0; first_level <= (match_level-1)/2; first_level++)
- {
- int second_level = match_level - first_level -1;
- //cerr << "\tcombining level " << first_level << " and " << second_level << endl;
-
- vector< Match > &first_match = multi_match[ first_level ];
- vector< Match > &second_match = multi_match[ second_level ];
-
- for(int i1 = 0; i1 < first_match.size(); i1++) {
- for(int i2 = 0; i2 < second_match.size(); i2++) {
-
- // do not combine the same pair twice
- if (first_level == second_level && i2 <= i1)
- {
- continue;
- }
-
- // get sorted matches (first is before second)
- Match *first, *second;
- if (first_match[i1].input_start < second_match[i2].input_start )
- {
- first = &first_match[i1];
- second = &second_match[i2];
- }
- else
- {
- second = &first_match[i1];
- first = &second_match[i2];
- }
-
- //cerr << "\tcombining "
- // << "(" << first->input_start << "," << first->input_end << "), "
- // << first->tm_start << " [" << first->internal_cost << "]"
- // << " with "
- // << "(" << second->input_start << "," << second->input_end << "), "
- // << second->tm_start<< " [" << second->internal_cost << "]"
- // << endl;
-
- // do not process overlapping matches
- if (first->input_end >= second->input_start)
- {
- continue;
- }
-
- // no overlap / mismatch in tm
- if (first->tm_end >= second->tm_start)
- {
- continue;
- }
-
- // compute cost
- int min_cost = 0;
- int max_cost = 0;
-
- // initial
- min_cost += abs( first->input_start - first->tm_start );
- max_cost += max( first->input_start, first->tm_start );
-
- // same number of words, but not sent. start -> cost is at least 1
- if (first->input_start == first->tm_start && first->input_start > 0)
- {
- min_cost++;
- }
-
- // in-between
- int skipped_words = second->input_start - first->input_end -1;
- int skipped_words_tm = second->tm_start - first->tm_end -1;
- int internal_cost = max( skipped_words, skipped_words_tm );
- internal_cost += first->internal_cost + second->internal_cost;
- min_cost += internal_cost;
- max_cost += internal_cost;
-
- // final
- min_cost += abs( (tm_length-1 - second->tm_end) -
- (input_length-1 - second->input_end) );
- max_cost += max( (tm_length-1 - second->tm_end),
- (input_length-1 - second->input_end) );
-
- // same number of words, but not sent. end -> cost is at least 1
- if ( ( input_length-1 - second->input_end
- == tm_length-1 - second->tm_end )
- && input_length-1 != second->input_end )
- {
- min_cost++;
- }
-
- // cerr << "\tcost: " << min_cost << "-" << max_cost << endl;
-
- // if worst than best cost, forget it
- if (min_cost > best_cost)
- {
- continue;
- }
-
- // add match
- Match new_match( first->input_start,
- second->input_end,
- first->tm_start,
- second->tm_end,
- min_cost,
- max_cost,
- internal_cost);
- multi_match[ match_level ].push_back( new_match );
- // cerr << "\tstored\n";
-
- // possibly updating this_best_cost
- if (max_cost < this_best_cost)
- {
- // cerr << "\tupdating this best cost to " << max_cost << "\n";
- this_best_cost = max_cost;
-
- // possibly updating best_cost
- if (max_cost < best_cost)
- {
- // cerr << "\tupdating best cost to " << max_cost << "\n";
- best_cost = max_cost;
- }
- }
- }
- }
- }
- match_level++;
- }
- return this_best_cost;
+{
+ // cerr << "sentence has " << match.size() << " matches, best cost: " << best_cost << ", lengths input: " << input_length << " tm: " << tm_length << endl;
+
+ if (match.size() == 1)
+ return match[0].max_cost;
+ if (match.size() == 0)
+ return input_length+tm_length;
+
+ int this_best_cost = input_length + tm_length;
+ for(int i=0; i<match.size(); i++) {
+ this_best_cost = min( this_best_cost, match[i].max_cost );
+ }
+ // cerr << "\tthis best cost: " << this_best_cost << endl;
+
+ // bottom up combination of spans
+ vector< vector< Match > > multi_match;
+ multi_match.push_back( match );
+
+ int match_level = 1;
+ while(multi_match[ match_level-1 ].size()>0) {
+ // init vector
+ vector< Match > empty;
+ multi_match.push_back( empty );
+
+ for(int first_level = 0; first_level <= (match_level-1)/2; first_level++) {
+ int second_level = match_level - first_level -1;
+ //cerr << "\tcombining level " << first_level << " and " << second_level << endl;
+
+ vector< Match > &first_match = multi_match[ first_level ];
+ vector< Match > &second_match = multi_match[ second_level ];
+
+ for(int i1 = 0; i1 < first_match.size(); i1++) {
+ for(int i2 = 0; i2 < second_match.size(); i2++) {
+
+ // do not combine the same pair twice
+ if (first_level == second_level && i2 <= i1) {
+ continue;
+ }
+
+ // get sorted matches (first is before second)
+ Match *first, *second;
+ if (first_match[i1].input_start < second_match[i2].input_start ) {
+ first = &first_match[i1];
+ second = &second_match[i2];
+ } else {
+ second = &first_match[i1];
+ first = &second_match[i2];
+ }
+
+ //cerr << "\tcombining "
+ // << "(" << first->input_start << "," << first->input_end << "), "
+ // << first->tm_start << " [" << first->internal_cost << "]"
+ // << " with "
+ // << "(" << second->input_start << "," << second->input_end << "), "
+ // << second->tm_start<< " [" << second->internal_cost << "]"
+ // << endl;
+
+ // do not process overlapping matches
+ if (first->input_end >= second->input_start) {
+ continue;
+ }
+
+ // no overlap / mismatch in tm
+ if (first->tm_end >= second->tm_start) {
+ continue;
+ }
+
+ // compute cost
+ int min_cost = 0;
+ int max_cost = 0;
+
+ // initial
+ min_cost += abs( first->input_start - first->tm_start );
+ max_cost += max( first->input_start, first->tm_start );
+
+ // same number of words, but not sent. start -> cost is at least 1
+ if (first->input_start == first->tm_start && first->input_start > 0) {
+ min_cost++;
+ }
+
+ // in-between
+ int skipped_words = second->input_start - first->input_end -1;
+ int skipped_words_tm = second->tm_start - first->tm_end -1;
+ int internal_cost = max( skipped_words, skipped_words_tm );
+ internal_cost += first->internal_cost + second->internal_cost;
+ min_cost += internal_cost;
+ max_cost += internal_cost;
+
+ // final
+ min_cost += abs( (tm_length-1 - second->tm_end) -
+ (input_length-1 - second->input_end) );
+ max_cost += max( (tm_length-1 - second->tm_end),
+ (input_length-1 - second->input_end) );
+
+ // same number of words, but not sent. end -> cost is at least 1
+ if ( ( input_length-1 - second->input_end
+ == tm_length-1 - second->tm_end )
+ && input_length-1 != second->input_end ) {
+ min_cost++;
+ }
+
+ // cerr << "\tcost: " << min_cost << "-" << max_cost << endl;
+
+ // if worst than best cost, forget it
+ if (min_cost > best_cost) {
+ continue;
+ }
+
+ // add match
+ Match new_match( first->input_start,
+ second->input_end,
+ first->tm_start,
+ second->tm_end,
+ min_cost,
+ max_cost,
+ internal_cost);
+ multi_match[ match_level ].push_back( new_match );
+ // cerr << "\tstored\n";
+
+ // possibly updating this_best_cost
+ if (max_cost < this_best_cost) {
+ // cerr << "\tupdating this best cost to " << max_cost << "\n";
+ this_best_cost = max_cost;
+
+ // possibly updating best_cost
+ if (max_cost < best_cost) {
+ // cerr << "\tupdating best cost to " << max_cost << "\n";
+ best_cost = max_cost;
+ }
+ }
+ }
+ }
+ }
+ match_level++;
+ }
+ return this_best_cost;
}
@@ -1101,22 +1022,22 @@ void FuzzyMatchWrapper::create_extract(int sentenceInd, int cost, const vector<
WORD_ID wordId = sourceSentence[pos];
sourceStr += GetVocabulary().GetWord(wordId) + " ";
}
-
+
for (size_t targetInd = 0; targetInd < targets.size(); ++targetInd) {
- const SentenceAlignment &sentenceAlignment = targets[targetInd];
+ const SentenceAlignment &sentenceAlignment = targets[targetInd];
string targetStr = sentenceAlignment.getTargetString(GetVocabulary());
string alignStr = sentenceAlignment.getAlignmentString();
-
+
outputFile
- << sentenceInd << endl
- << cost << endl
- << sourceStr << endl
- << inputStr << endl
- << targetStr << endl
- << alignStr << endl
- << path << endl
- << sentenceAlignment.count << endl;
-
+ << sentenceInd << endl
+ << cost << endl
+ << sourceStr << endl
+ << inputStr << endl
+ << targetStr << endl
+ << alignStr << endl
+ << path << endl
+ << sentenceAlignment.count << endl;
+
}
}
diff --git a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h
index a6f772fb9..d8813a65c 100644
--- a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h
+++ b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h
@@ -20,18 +20,18 @@
#include "Match.h"
#include "moses/InputType.h"
-namespace tmmt
+namespace tmmt
{
class Match;
class SentenceAlignment;
-
+
class FuzzyMatchWrapper
{
public:
FuzzyMatchWrapper(const std::string &source, const std::string &target, const std::string &alignment);
std::string Extract(long translationId, const std::string &dirNameStr);
-
+
protected:
// tm-mt
std::vector< std::vector< tmmt::SentenceAlignment > > targetAndAlignment;
@@ -58,13 +58,13 @@ protected:
void load_corpus( const std::string &fileName, std::vector< std::vector< tmmt::WORD_ID > > &corpus );
void load_target( const std::string &fileName, std::vector< std::vector< tmmt::SentenceAlignment > > &corpus);
void load_alignment( const std::string &fileName, std::vector< std::vector< tmmt::SentenceAlignment > > &corpus );
-
+
/** brute force method: compare input to all corpus sentences */
- int basic_fuzzy_match( std::vector< std::vector< tmmt::WORD_ID > > source,
- std::vector< std::vector< tmmt::WORD_ID > > input ) ;
-
- /** utlility function: compute length of sentence in characters
- (spaces do not count) */
+ int basic_fuzzy_match( std::vector< std::vector< tmmt::WORD_ID > > source,
+ std::vector< std::vector< tmmt::WORD_ID > > input ) ;
+
+ /** utlility function: compute length of sentence in characters
+ (spaces do not count) */
unsigned int compute_length( const std::vector< tmmt::WORD_ID > &sentence );
unsigned int letter_sed( WORD_ID aIdx, WORD_ID bIdx );
unsigned int sed( const std::vector< WORD_ID > &a, const std::vector< WORD_ID > &b, std::string &best_path, bool use_letter_sed );
@@ -77,8 +77,9 @@ protected:
void create_extract(int sentenceInd, int cost, const std::vector< WORD_ID > &sourceSentence, const std::vector<SentenceAlignment> &targets, const std::string &inputStr, const std::string &path, std::ofstream &outputFile);
std::string ExtractTM(WordIndex &wordIndex, long translationId, const std::string &inputPath);
- Vocabulary &GetVocabulary()
- { return suffixArray->GetVocabulary(); }
+ Vocabulary &GetVocabulary() {
+ return suffixArray->GetVocabulary();
+ }
bool GetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, unsigned int &value) const;
void SetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, const unsigned int &value);
diff --git a/moses/TranslationModel/fuzzy-match/Match.h b/moses/TranslationModel/fuzzy-match/Match.h
index 7feb25769..f2ba2c150 100644
--- a/moses/TranslationModel/fuzzy-match/Match.h
+++ b/moses/TranslationModel/fuzzy-match/Match.h
@@ -14,17 +14,18 @@ namespace tmmt
/* data structure for n-gram match between input and corpus */
-class Match {
+class Match
+{
public:
- int input_start;
- int input_end;
- int tm_start;
- int tm_end;
- int min_cost;
- int max_cost;
- int internal_cost;
- Match( int is, int ie, int ts, int te, int min, int max, int i )
- :input_start(is), input_end(ie), tm_start(ts), tm_end(te), min_cost(min), max_cost(max), internal_cost(i)
+ int input_start;
+ int input_end;
+ int tm_start;
+ int tm_end;
+ int min_cost;
+ int max_cost;
+ int internal_cost;
+ Match( int is, int ie, int ts, int te, int min, int max, int i )
+ :input_start(is), input_end(ie), tm_start(ts), tm_end(te), min_cost(min), max_cost(max), internal_cost(i)
{}
};
diff --git a/moses/TranslationModel/fuzzy-match/SentenceAlignment.h b/moses/TranslationModel/fuzzy-match/SentenceAlignment.h
index 30c887fc1..466baa149 100644
--- a/moses/TranslationModel/fuzzy-match/SentenceAlignment.h
+++ b/moses/TranslationModel/fuzzy-match/SentenceAlignment.h
@@ -15,20 +15,18 @@
namespace tmmt
{
-
-struct SentenceAlignment
-{
+
+struct SentenceAlignment {
int count;
std::vector< WORD_ID > target;
std::vector< std::pair<int,int> > alignment;
-
+
SentenceAlignment()
{}
-
+
std::string getTargetString(const Vocabulary &vocab) const;
-
- std::string getAlignmentString() const
- {
+
+ std::string getAlignmentString() const {
std::stringstream strme;
for (size_t i = 0; i < alignment.size(); ++i) {
const std::pair<int,int> &alignPair = alignment[i];
@@ -36,7 +34,7 @@ struct SentenceAlignment
}
return strme.str();
}
-
+
};
}
diff --git a/moses/TranslationModel/fuzzy-match/SuffixArray.cpp b/moses/TranslationModel/fuzzy-match/SuffixArray.cpp
index 8a67fd954..5f49952ce 100644
--- a/moses/TranslationModel/fuzzy-match/SuffixArray.cpp
+++ b/moses/TranslationModel/fuzzy-match/SuffixArray.cpp
@@ -8,247 +8,235 @@ using namespace std;
namespace tmmt
{
-SuffixArray::SuffixArray( string fileName )
+SuffixArray::SuffixArray( string fileName )
{
- m_vcb.StoreIfNew( "<uNk>" );
- m_endOfSentence = m_vcb.StoreIfNew( "<s>" );
-
- ifstream extractFile;
- char line[LINE_MAX_LENGTH];
-
- // count the number of words first;
- extractFile.open(fileName.c_str());
- istream *fileP = &extractFile;
- m_size = 0;
- size_t sentenceCount = 0;
- while(!fileP->eof()) {
- SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
- if (fileP->eof()) break;
- vector< WORD_ID > words = m_vcb.Tokenize( line );
- m_size += words.size() + 1;
- sentenceCount++;
- }
- extractFile.close();
- cerr << m_size << " words (incl. sentence boundaries)" << endl;
-
- // allocate memory
- m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size );
- m_index = (INDEX*) calloc( sizeof( INDEX ), m_size );
- m_wordInSentence = (char*) calloc( sizeof( char ), m_size );
- m_sentence = (size_t*) calloc( sizeof( size_t ), m_size );
- m_sentenceLength = (char*) calloc( sizeof( char ), sentenceCount );
-
- // fill the array
- int wordIndex = 0;
- int sentenceId = 0;
- extractFile.open(fileName.c_str());
- fileP = &extractFile;
- while(!fileP->eof()) {
- SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
- if (fileP->eof()) break;
- vector< WORD_ID > words = m_vcb.Tokenize( line );
-
- // add to corpus vector
- corpus.push_back(words);
-
- // create SA
-
- vector< WORD_ID >::const_iterator i;
- for( i=words.begin(); i!=words.end(); i++)
- {
- m_index[ wordIndex ] = wordIndex;
- m_sentence[ wordIndex ] = sentenceId;
- m_wordInSentence[ wordIndex ] = i-words.begin();
- m_array[ wordIndex++ ] = *i;
- }
- m_index[ wordIndex ] = wordIndex;
- m_array[ wordIndex++ ] = m_endOfSentence;
- m_sentenceLength[ sentenceId++ ] = words.size();
- }
- extractFile.close();
- cerr << "done reading " << wordIndex << " words, " << sentenceId << " sentences." << endl;
- // List(0,9);
-
- // sort
- m_buffer = (INDEX*) calloc( sizeof( INDEX ), m_size );
- Sort( 0, m_size-1 );
- free( m_buffer );
- cerr << "done sorting" << endl;
+ m_vcb.StoreIfNew( "<uNk>" );
+ m_endOfSentence = m_vcb.StoreIfNew( "<s>" );
+
+ ifstream extractFile;
+ char line[LINE_MAX_LENGTH];
+
+ // count the number of words first;
+ extractFile.open(fileName.c_str());
+ istream *fileP = &extractFile;
+ m_size = 0;
+ size_t sentenceCount = 0;
+ while(!fileP->eof()) {
+ SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
+ if (fileP->eof()) break;
+ vector< WORD_ID > words = m_vcb.Tokenize( line );
+ m_size += words.size() + 1;
+ sentenceCount++;
+ }
+ extractFile.close();
+ cerr << m_size << " words (incl. sentence boundaries)" << endl;
+
+ // allocate memory
+ m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size );
+ m_index = (INDEX*) calloc( sizeof( INDEX ), m_size );
+ m_wordInSentence = (char*) calloc( sizeof( char ), m_size );
+ m_sentence = (size_t*) calloc( sizeof( size_t ), m_size );
+ m_sentenceLength = (char*) calloc( sizeof( char ), sentenceCount );
+
+ // fill the array
+ int wordIndex = 0;
+ int sentenceId = 0;
+ extractFile.open(fileName.c_str());
+ fileP = &extractFile;
+ while(!fileP->eof()) {
+ SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n');
+ if (fileP->eof()) break;
+ vector< WORD_ID > words = m_vcb.Tokenize( line );
+
+ // add to corpus vector
+ corpus.push_back(words);
+
+ // create SA
+
+ vector< WORD_ID >::const_iterator i;
+ for( i=words.begin(); i!=words.end(); i++) {
+ m_index[ wordIndex ] = wordIndex;
+ m_sentence[ wordIndex ] = sentenceId;
+ m_wordInSentence[ wordIndex ] = i-words.begin();
+ m_array[ wordIndex++ ] = *i;
+ }
+ m_index[ wordIndex ] = wordIndex;
+ m_array[ wordIndex++ ] = m_endOfSentence;
+ m_sentenceLength[ sentenceId++ ] = words.size();
+ }
+ extractFile.close();
+ cerr << "done reading " << wordIndex << " words, " << sentenceId << " sentences." << endl;
+ // List(0,9);
+
+ // sort
+ m_buffer = (INDEX*) calloc( sizeof( INDEX ), m_size );
+ Sort( 0, m_size-1 );
+ free( m_buffer );
+ cerr << "done sorting" << endl;
}
// good ol' quick sort
-void SuffixArray::Sort(INDEX start, INDEX end) {
- if (start == end) return;
- INDEX mid = (start+end+1)/2;
- Sort( start, mid-1 );
- Sort( mid, end );
-
- // merge
- int i = start;
- int j = mid;
- int k = 0;
- int length = end-start+1;
- while( k<length )
- {
- if (i == mid )
- {
- m_buffer[ k++ ] = m_index[ j++ ];
- }
- else if (j > end )
- {
- m_buffer[ k++ ] = m_index[ i++ ];
- }
- else {
- if (CompareIndex( m_index[i], m_index[j] ) < 0)
- {
- m_buffer[ k++ ] = m_index[ i++ ];
- }
- else
- {
- m_buffer[ k++ ] = m_index[ j++ ];
- }
- }
- }
-
- memcpy( ((char*)m_index) + sizeof( INDEX ) * start,
- ((char*)m_buffer), sizeof( INDEX ) * (end-start+1) );
+void SuffixArray::Sort(INDEX start, INDEX end)
+{
+ if (start == end) return;
+ INDEX mid = (start+end+1)/2;
+ Sort( start, mid-1 );
+ Sort( mid, end );
+
+ // merge
+ int i = start;
+ int j = mid;
+ int k = 0;
+ int length = end-start+1;
+ while( k<length ) {
+ if (i == mid ) {
+ m_buffer[ k++ ] = m_index[ j++ ];
+ } else if (j > end ) {
+ m_buffer[ k++ ] = m_index[ i++ ];
+ } else {
+ if (CompareIndex( m_index[i], m_index[j] ) < 0) {
+ m_buffer[ k++ ] = m_index[ i++ ];
+ } else {
+ m_buffer[ k++ ] = m_index[ j++ ];
+ }
+ }
+ }
+
+ memcpy( ((char*)m_index) + sizeof( INDEX ) * start,
+ ((char*)m_buffer), sizeof( INDEX ) * (end-start+1) );
}
SuffixArray::~SuffixArray()
-{
- free(m_index);
- free(m_array);
+{
+ free(m_index);
+ free(m_array);
}
int SuffixArray::CompareIndex( INDEX a, INDEX b ) const
{
- // skip over identical words
- INDEX offset = 0;
- while( a+offset < m_size &&
- b+offset < m_size &&
- m_array[ a+offset ] == m_array[ b+offset ] )
- { offset++; }
-
- if( a+offset == m_size ) return -1;
- if( b+offset == m_size ) return 1;
- return CompareWord( m_array[ a+offset ], m_array[ b+offset ] );
+ // skip over identical words
+ INDEX offset = 0;
+ while( a+offset < m_size &&
+ b+offset < m_size &&
+ m_array[ a+offset ] == m_array[ b+offset ] ) {
+ offset++;
+ }
+
+ if( a+offset == m_size ) return -1;
+ if( b+offset == m_size ) return 1;
+ return CompareWord( m_array[ a+offset ], m_array[ b+offset ] );
}
inline int SuffixArray::CompareWord( WORD_ID a, WORD_ID b ) const
{
- // cerr << "c(" << m_vcb.GetWord(a) << ":" << m_vcb.GetWord(b) << ")=" << m_vcb.GetWord(a).compare( m_vcb.GetWord(b) ) << endl;
- return m_vcb.GetWord(a).compare( m_vcb.GetWord(b) );
+ // cerr << "c(" << m_vcb.GetWord(a) << ":" << m_vcb.GetWord(b) << ")=" << m_vcb.GetWord(a).compare( m_vcb.GetWord(b) ) << endl;
+ return m_vcb.GetWord(a).compare( m_vcb.GetWord(b) );
}
int SuffixArray::Count( const vector< WORD > &phrase )
{
- INDEX dummy;
- return LimitedCount( phrase, m_size, dummy, dummy, 0, m_size-1 );
+ INDEX dummy;
+ return LimitedCount( phrase, m_size, dummy, dummy, 0, m_size-1 );
}
bool SuffixArray::MinCount( const vector< WORD > &phrase, INDEX min )
{
- INDEX dummy;
- return LimitedCount( phrase, min, dummy, dummy, 0, m_size-1 ) >= min;
+ INDEX dummy;
+ return LimitedCount( phrase, min, dummy, dummy, 0, m_size-1 ) >= min;
}
bool SuffixArray::Exists( const vector< WORD > &phrase )
{
- INDEX dummy;
- return LimitedCount( phrase, 1, dummy, dummy, 0, m_size-1 ) == 1;
+ INDEX dummy;
+ return LimitedCount( phrase, 1, dummy, dummy, 0, m_size-1 ) == 1;
}
int SuffixArray::FindMatches( const vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start, INDEX search_end )
{
- return LimitedCount( phrase, m_size, firstMatch, lastMatch, search_start, search_end );
+ return LimitedCount( phrase, m_size, firstMatch, lastMatch, search_start, search_end );
}
int SuffixArray::LimitedCount( const vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start, INDEX search_end )
{
- // cerr << "FindFirst\n";
- INDEX start = search_start;
- INDEX end = (search_end == -1) ? (m_size-1) : search_end;
- INDEX mid = FindFirst( phrase, start, end );
- // cerr << "done\n";
- if (mid == m_size) return 0; // no matches
- if (min == 1) return 1; // only existance check
+ // cerr << "FindFirst\n";
+ INDEX start = search_start;
+ INDEX end = (search_end == -1) ? (m_size-1) : search_end;
+ INDEX mid = FindFirst( phrase, start, end );
+ // cerr << "done\n";
+ if (mid == m_size) return 0; // no matches
+ if (min == 1) return 1; // only existance check
- int matchCount = 1;
+ int matchCount = 1;
- //cerr << "before...\n";
- firstMatch = FindLast( phrase, mid, start, -1 );
- matchCount += mid - firstMatch;
+ //cerr << "before...\n";
+ firstMatch = FindLast( phrase, mid, start, -1 );
+ matchCount += mid - firstMatch;
- //cerr << "after...\n";
- lastMatch = FindLast( phrase, mid, end, 1 );
- matchCount += lastMatch - mid;
+ //cerr << "after...\n";
+ lastMatch = FindLast( phrase, mid, end, 1 );
+ matchCount += lastMatch - mid;
- return matchCount;
+ return matchCount;
}
SuffixArray::INDEX SuffixArray::FindLast( const vector< WORD > &phrase, INDEX start, INDEX end, int direction )
{
- end += direction;
- while(true)
- {
- INDEX mid = ( start + end + (direction>0 ? 0 : 1) )/2;
-
- int match = Match( phrase, mid );
- int matchNext = Match( phrase, mid+direction );
- //cerr << "\t" << start << ";" << mid << ";" << end << " -> " << match << "," << matchNext << endl;
-
- if (match == 0 && matchNext != 0) return mid;
-
- if (match == 0) // mid point is a match
- start = mid;
- else
- end = mid;
- }
+ end += direction;
+ while(true) {
+ INDEX mid = ( start + end + (direction>0 ? 0 : 1) )/2;
+
+ int match = Match( phrase, mid );
+ int matchNext = Match( phrase, mid+direction );
+ //cerr << "\t" << start << ";" << mid << ";" << end << " -> " << match << "," << matchNext << endl;
+
+ if (match == 0 && matchNext != 0) return mid;
+
+ if (match == 0) // mid point is a match
+ start = mid;
+ else
+ end = mid;
+ }
}
SuffixArray::INDEX SuffixArray::FindFirst( const vector< WORD > &phrase, INDEX &start, INDEX &end )
{
- while(true)
- {
- INDEX mid = ( start + end + 1 )/2;
- //cerr << "FindFirst(" << start << ";" << mid << ";" << end << ")\n";
- int match = Match( phrase, mid );
-
- if (match == 0) return mid;
- if (start >= end && match != 0 ) return m_size;
-
- if (match > 0)
- start = mid+1;
- else
- end = mid-1;
- }
+ while(true) {
+ INDEX mid = ( start + end + 1 )/2;
+ //cerr << "FindFirst(" << start << ";" << mid << ";" << end << ")\n";
+ int match = Match( phrase, mid );
+
+ if (match == 0) return mid;
+ if (start >= end && match != 0 ) return m_size;
+
+ if (match > 0)
+ start = mid+1;
+ else
+ end = mid-1;
+ }
}
int SuffixArray::Match( const vector< WORD > &phrase, INDEX index )
{
- INDEX pos = m_index[ index ];
- for(INDEX i=0; i<phrase.size() && i+pos<m_size; i++)
- {
- int match = CompareWord( m_vcb.GetWordID( phrase[i] ), m_array[ pos+i ] );
- // cerr << "{" << index << "+" << i << "," << pos+i << ":" << match << "}" << endl;
- if (match != 0)
- return match;
- }
- return 0;
+ INDEX pos = m_index[ index ];
+ for(INDEX i=0; i<phrase.size() && i+pos<m_size; i++) {
+ int match = CompareWord( m_vcb.GetWordID( phrase[i] ), m_array[ pos+i ] );
+ // cerr << "{" << index << "+" << i << "," << pos+i << ":" << match << "}" << endl;
+ if (match != 0)
+ return match;
+ }
+ return 0;
}
void SuffixArray::List(INDEX start, INDEX end)
{
- for(INDEX i=start; i<=end; i++)
- {
- INDEX pos = m_index[ i ];
- // cerr << i << ":" << pos << "\t";
- for(int j=0; j<5 && j+pos<m_size; j++)
- {
- //cout << " " << m_vcb.GetWord( m_array[ pos+j ] );
- }
- // cerr << "\n";
- }
+ for(INDEX i=start; i<=end; i++) {
+ INDEX pos = m_index[ i ];
+ // cerr << i << ":" << pos << "\t";
+ for(int j=0; j<5 && j+pos<m_size; j++) {
+ //cout << " " << m_vcb.GetWord( m_array[ pos+j ] );
+ }
+ // cerr << "\n";
+ }
}
}
diff --git a/moses/TranslationModel/fuzzy-match/SuffixArray.h b/moses/TranslationModel/fuzzy-match/SuffixArray.h
index 5cfb120d6..a2dbf892c 100644
--- a/moses/TranslationModel/fuzzy-match/SuffixArray.h
+++ b/moses/TranslationModel/fuzzy-match/SuffixArray.h
@@ -6,51 +6,63 @@
namespace tmmt
{
-
-class SuffixArray
+
+class SuffixArray
{
public:
- typedef unsigned int INDEX;
+ typedef unsigned int INDEX;
private:
std::vector< std::vector< WORD_ID > > corpus;
WORD_ID *m_array;
- INDEX *m_index;
- INDEX *m_buffer;
- char *m_wordInSentence;
- size_t *m_sentence;
- char *m_sentenceLength;
- WORD_ID m_endOfSentence;
- Vocabulary m_vcb;
- INDEX m_size;
+ INDEX *m_index;
+ INDEX *m_buffer;
+ char *m_wordInSentence;
+ size_t *m_sentence;
+ char *m_sentenceLength;
+ WORD_ID m_endOfSentence;
+ Vocabulary m_vcb;
+ INDEX m_size;
public:
- SuffixArray( std::string fileName );
- ~SuffixArray();
-
- void Sort(INDEX start, INDEX end);
- int CompareIndex( INDEX a, INDEX b ) const;
- inline int CompareWord( WORD_ID a, WORD_ID b ) const;
- int Count( const std::vector< WORD > &phrase );
- bool MinCount( const std::vector< WORD > &phrase, INDEX min );
- bool Exists( const std::vector< WORD > &phrase );
- int FindMatches( const std::vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = 0, INDEX search_end = -1 );
- int LimitedCount( const std::vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = -1, INDEX search_end = 0 );
- INDEX FindFirst( const std::vector< WORD > &phrase, INDEX &start, INDEX &end );
- INDEX FindLast( const std::vector< WORD > &phrase, INDEX start, INDEX end, int direction );
- int Match( const std::vector< WORD > &phrase, INDEX index );
- void List( INDEX start, INDEX end );
- inline INDEX GetPosition( INDEX index ) { return m_index[ index ]; }
- inline size_t GetSentence( INDEX position ) { return m_sentence[position]; }
- inline char GetWordInSentence( INDEX position ) { return m_wordInSentence[position]; }
- inline char GetSentenceLength( size_t sentenceId ) { return m_sentenceLength[sentenceId]; }
- inline INDEX GetSize() { return m_size; }
-
- Vocabulary &GetVocabulary()
- { return m_vcb; }
- const std::vector< std::vector< WORD_ID > > &GetCorpus() const
- { return corpus; }
+ SuffixArray( std::string fileName );
+ ~SuffixArray();
+
+ void Sort(INDEX start, INDEX end);
+ int CompareIndex( INDEX a, INDEX b ) const;
+ inline int CompareWord( WORD_ID a, WORD_ID b ) const;
+ int Count( const std::vector< WORD > &phrase );
+ bool MinCount( const std::vector< WORD > &phrase, INDEX min );
+ bool Exists( const std::vector< WORD > &phrase );
+ int FindMatches( const std::vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = 0, INDEX search_end = -1 );
+ int LimitedCount( const std::vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = -1, INDEX search_end = 0 );
+ INDEX FindFirst( const std::vector< WORD > &phrase, INDEX &start, INDEX &end );
+ INDEX FindLast( const std::vector< WORD > &phrase, INDEX start, INDEX end, int direction );
+ int Match( const std::vector< WORD > &phrase, INDEX index );
+ void List( INDEX start, INDEX end );
+ inline INDEX GetPosition( INDEX index ) {
+ return m_index[ index ];
+ }
+ inline size_t GetSentence( INDEX position ) {
+ return m_sentence[position];
+ }
+ inline char GetWordInSentence( INDEX position ) {
+ return m_wordInSentence[position];
+ }
+ inline char GetSentenceLength( size_t sentenceId ) {
+ return m_sentenceLength[sentenceId];
+ }
+ inline INDEX GetSize() {
+ return m_size;
+ }
+
+ Vocabulary &GetVocabulary() {
+ return m_vcb;
+ }
+ const std::vector< std::vector< WORD_ID > > &GetCorpus() const {
+ return corpus;
+ }
};
}
diff --git a/moses/TranslationModel/fuzzy-match/Vocabulary.cpp b/moses/TranslationModel/fuzzy-match/Vocabulary.cpp
index 0c833ff78..ab1439a29 100644
--- a/moses/TranslationModel/fuzzy-match/Vocabulary.cpp
+++ b/moses/TranslationModel/fuzzy-match/Vocabulary.cpp
@@ -10,7 +10,8 @@ namespace tmmt
{
// as in beamdecoder/tables.cpp
-vector<WORD_ID> Vocabulary::Tokenize( const char input[] ) {
+vector<WORD_ID> Vocabulary::Tokenize( const char input[] )
+{
vector< WORD_ID > token;
bool betweenWords = true;
int start=0;
@@ -21,8 +22,7 @@ vector<WORD_ID> Vocabulary::Tokenize( const char input[] ) {
if (!isSpace && betweenWords) {
start = i;
betweenWords = false;
- }
- else if (isSpace && !betweenWords) {
+ } else if (isSpace && !betweenWords) {
token.push_back( StoreIfNew ( string( input+start, i-start ) ) );
betweenWords = true;
}
@@ -32,9 +32,11 @@ vector<WORD_ID> Vocabulary::Tokenize( const char input[] ) {
return token;
}
-WORD_ID Vocabulary::StoreIfNew( const WORD& word ) {
+WORD_ID Vocabulary::StoreIfNew( const WORD& word )
+{
- { // read=lock scope
+ {
+ // read=lock scope
#ifdef WITH_THREADS
boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
#endif
@@ -43,17 +45,18 @@ WORD_ID Vocabulary::StoreIfNew( const WORD& word ) {
if( i != lookup.end() )
return i->second;
}
-
+
#ifdef WITH_THREADS
boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
#endif
WORD_ID id = vocab.size();
vocab.push_back( word );
lookup[ word ] = id;
- return id;
+ return id;
}
-WORD_ID Vocabulary::GetWordID( const WORD &word ) {
+WORD_ID Vocabulary::GetWordID( const WORD &word )
+{
#ifdef WITH_THREADS
boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
#endif
diff --git a/moses/TranslationModel/fuzzy-match/Vocabulary.h b/moses/TranslationModel/fuzzy-match/Vocabulary.h
index 7be82bcbe..dfa11c1db 100644
--- a/moses/TranslationModel/fuzzy-match/Vocabulary.h
+++ b/moses/TranslationModel/fuzzy-match/Vocabulary.h
@@ -34,16 +34,20 @@ namespace tmmt
typedef std::string WORD;
typedef unsigned int WORD_ID;
-class Vocabulary {
- public:
+class Vocabulary
+{
+public:
std::map<WORD, WORD_ID> lookup;
std::vector< WORD > vocab;
WORD_ID StoreIfNew( const WORD& );
WORD_ID GetWordID( const WORD& );
std::vector<WORD_ID> Tokenize( const char[] );
- inline WORD &GetWord( WORD_ID id ) const { WORD &i = (WORD&) vocab[ id ]; return i; }
+ inline WORD &GetWord( WORD_ID id ) const {
+ WORD &i = (WORD&) vocab[ id ];
+ return i;
+ }
- protected:
+protected:
#ifdef WITH_THREADS
//reader-writer lock
mutable boost::shared_mutex m_accessLock;
diff --git a/moses/TranslationModel/fuzzy-match/create_xml.cpp b/moses/TranslationModel/fuzzy-match/create_xml.cpp
index 783fb93eb..44c1efc9f 100644
--- a/moses/TranslationModel/fuzzy-match/create_xml.cpp
+++ b/moses/TranslationModel/fuzzy-match/create_xml.cpp
@@ -42,12 +42,10 @@ void create_xml(const string &inPath)
string inLine;
int step = 0;
- while (!inStrme.eof())
- {
+ while (!inStrme.eof()) {
getline(inStrme, inLine);
//cout << inLine << endl;
- switch (step)
- {
+ switch (step) {
case 0:
setenceId = Scan<int>(inLine);
++step;
@@ -63,8 +61,7 @@ void create_xml(const string &inPath)
case 3:
if (input == NULL) {
input = new string(inLine);
- }
- else {
+ } else {
assert(inLine == *input);
}
++step;
@@ -87,9 +84,9 @@ void create_xml(const string &inPath)
//print STDOUT $frame."\n";
rule << ret.ruleS << " [X] ||| " << ret.ruleT << " [X] ||| " << ret.ruleAlignment
- << " ||| " << count << endl;
+ << " ||| " << count << endl;
ruleInv << ret.ruleT << " [X] ||| " << ret.ruleS << " [X] ||| " << ret.ruleAlignmentInv
- << " ||| " << count << endl;
+ << " ||| " << count << endl;
//print STDOUT "$sentenceInd ||| $score ||| $count\n";
++ruleCount;
@@ -112,8 +109,8 @@ CreateXMLRetValues createXML(int ruleCount, const string &source, const string &
{
CreateXMLRetValues ret;
vector<string> sourceToks = Tokenize(source, " ")
- ,inputToks = Tokenize(input, " ")
- ,targetsToks = Tokenize(target, " ");
+ ,inputToks = Tokenize(input, " ")
+ ,targetsToks = Tokenize(target, " ");
Alignments alignments(align, sourceToks.size(), targetsToks.size());
map<int, string> frameInput;
map<int, int> alignI2S;
@@ -241,8 +238,7 @@ CreateXMLRetValues createXML(int ruleCount, const string &source, const string &
if (action == "M") {
inputBitmap.push_back(1);
- }
- else if (action == "I" || action == "S") {
+ } else if (action == "I" || action == "S") {
inputBitmap.push_back(0);
}
@@ -358,9 +354,8 @@ CreateXMLRetValues createXML(int ruleCount, const string &source, const string &
}
// end of tm target inclusion (not included word or inserted input)
else if (currently_included
- && ( targetBitmap[t] || frameInput.find(t) != frameInput.end() )
- )
- {
+ && ( targetBitmap[t] || frameInput.find(t) != frameInput.end() )
+ ) {
// add xml (unless change is at the beginning of the sentence
if ( start_t >= 0 ) {
string target = "";
diff --git a/moses/TranslationOption.cpp b/moses/TranslationOption.cpp
index 87721bc52..dfe3312fc 100644
--- a/moses/TranslationOption.cpp
+++ b/moses/TranslationOption.cpp
@@ -42,11 +42,11 @@ TranslationOption::TranslationOption(const WordsRange &wordsRange
}
TranslationOption::TranslationOption(const TranslationOption &copy, const WordsRange &sourceWordsRange)
-: m_targetPhrase(copy.m_targetPhrase)
+ : m_targetPhrase(copy.m_targetPhrase)
//, m_sourcePhrase(new Phrase(*copy.m_sourcePhrase)) // TODO use when confusion network trans opt for confusion net properly implemented
-, m_sourceWordsRange(sourceWordsRange)
-, m_futureScore(copy.m_futureScore)
-, m_lexReorderingScores(copy.m_lexReorderingScores)
+ , m_sourceWordsRange(sourceWordsRange)
+ , m_futureScore(copy.m_futureScore)
+ , m_lexReorderingScores(copy.m_lexReorderingScores)
{}
bool TranslationOption::IsCompatible(const Phrase& phrase, const std::vector<FactorType>& featuresToCheck) const
diff --git a/moses/TranslationOption.h b/moses/TranslationOption.h
index 8e2064f83..b1de31eb1 100644
--- a/moses/TranslationOption.h
+++ b/moses/TranslationOption.h
@@ -146,18 +146,18 @@ public:
void CacheLexReorderingScores(const LexicalReordering &scoreProducer, const Scores &score);
TO_STRING();
-
- bool operator== (const TranslationOption &rhs) const
- {
+
+ bool operator== (const TranslationOption &rhs) const {
return m_sourceWordsRange == rhs.m_sourceWordsRange &&
- m_targetPhrase == rhs.m_targetPhrase;
- }
+ m_targetPhrase == rhs.m_targetPhrase;
+ }
};
//XXX: This doesn't look at the alignment. Is this correct?
-inline size_t hash_value(const TranslationOption& translationOption) {
+inline size_t hash_value(const TranslationOption& translationOption)
+{
size_t seed = 0;
boost::hash_combine(seed, translationOption.GetTargetPhrase());
boost::hash_combine(seed, translationOption.GetStartPos());
diff --git a/moses/TranslationOptionCollection.cpp b/moses/TranslationOptionCollection.cpp
index 16bcce791..2d7024c7a 100644
--- a/moses/TranslationOptionCollection.cpp
+++ b/moses/TranslationOptionCollection.cpp
@@ -48,11 +48,11 @@ bool CompareTranslationOption(const TranslationOption *a, const TranslationOptio
* This fn should be called by inherited classes
*/
TranslationOptionCollection::TranslationOptionCollection(
- InputType const& src, size_t maxNoTransOptPerCoverage, float translationOptionThreshold)
+ InputType const& src, size_t maxNoTransOptPerCoverage, float translationOptionThreshold)
: m_source(src)
- ,m_futureScore(src.GetSize())
- ,m_maxNoTransOptPerCoverage(maxNoTransOptPerCoverage)
- ,m_translationOptionThreshold(translationOptionThreshold)
+ ,m_futureScore(src.GetSize())
+ ,m_maxNoTransOptPerCoverage(maxNoTransOptPerCoverage)
+ ,m_translationOptionThreshold(translationOptionThreshold)
{
// create 2-d vector
size_t size = src.GetSize();
@@ -202,73 +202,68 @@ void TranslationOptionCollection::ProcessOneUnknownWord(const Word &sourceWord,s
const UnknownWordPenaltyProducer *unknownWordPenaltyProducer = staticData.GetUnknownWordPenaltyProducer();
float unknownScore = FloorScore(TransformScore(0));
- // unknown word, add as trans opt
- FactorCollection &factorCollection = FactorCollection::Instance();
-
- size_t isDigit = 0;
-
- const Factor *f = sourceWord[0]; // TODO hack. shouldn't know which factor is surface
- const StringPiece s = f->GetString();
- bool isEpsilon = (s=="" || s==EPSILON);
- if (StaticData::Instance().GetDropUnknown())
- {
-
-
- isDigit = s.find_first_of("0123456789");
- if (isDigit == 1)
- isDigit = 1;
- else
- isDigit = 0;
- // modify the starting bitmap
- }
-
- Phrase* m_unksrc = new Phrase(1);
+ // unknown word, add as trans opt
+ FactorCollection &factorCollection = FactorCollection::Instance();
+
+ size_t isDigit = 0;
+
+ const Factor *f = sourceWord[0]; // TODO hack. shouldn't know which factor is surface
+ const StringPiece s = f->GetString();
+ bool isEpsilon = (s=="" || s==EPSILON);
+ if (StaticData::Instance().GetDropUnknown()) {
+
+
+ isDigit = s.find_first_of("0123456789");
+ if (isDigit == 1)
+ isDigit = 1;
+ else
+ isDigit = 0;
+ // modify the starting bitmap
+ }
+
+ Phrase* m_unksrc = new Phrase(1);
m_unksrc->AddWord() = sourceWord;
- m_unksrcs.push_back(m_unksrc);
-
- TranslationOption *transOpt;
- TargetPhrase targetPhrase;
- targetPhrase.SetSourcePhrase(*m_unksrc);
-
- if (!(staticData.GetDropUnknown() || isEpsilon) || isDigit)
- {
- // add to dictionary
-
- Word &targetWord = targetPhrase.AddWord();
-
- for (unsigned int currFactor = 0 ; currFactor < MAX_NUM_FACTORS ; currFactor++)
- {
- FactorType factorType = static_cast<FactorType>(currFactor);
-
- const Factor *sourceFactor = sourceWord[currFactor];
- if (sourceFactor == NULL)
- targetWord[factorType] = factorCollection.AddFactor(UNKNOWN_FACTOR);
- else
- targetWord[factorType] = factorCollection.AddFactor(sourceFactor->GetString());
- }
- //create a one-to-one alignment between UNKNOWN_FACTOR and its verbatim translation
-
- targetPhrase.SetAlignmentInfo("0-0");
-
- }
- else
- {
- // drop source word. create blank trans opt
-
- //targetPhrase.SetAlignment();
-
- }
+ m_unksrcs.push_back(m_unksrc);
+
+ TranslationOption *transOpt;
+ TargetPhrase targetPhrase;
+ targetPhrase.SetSourcePhrase(*m_unksrc);
+
+ if (!(staticData.GetDropUnknown() || isEpsilon) || isDigit) {
+ // add to dictionary
+
+ Word &targetWord = targetPhrase.AddWord();
+
+ for (unsigned int currFactor = 0 ; currFactor < MAX_NUM_FACTORS ; currFactor++) {
+ FactorType factorType = static_cast<FactorType>(currFactor);
+
+ const Factor *sourceFactor = sourceWord[currFactor];
+ if (sourceFactor == NULL)
+ targetWord[factorType] = factorCollection.AddFactor(UNKNOWN_FACTOR);
+ else
+ targetWord[factorType] = factorCollection.AddFactor(sourceFactor->GetString());
+ }
+ //create a one-to-one alignment between UNKNOWN_FACTOR and its verbatim translation
+
+ targetPhrase.SetAlignmentInfo("0-0");
+
+ } else {
+ // drop source word. create blank trans opt
+
+ //targetPhrase.SetAlignment();
+
+ }
targetPhrase.GetScoreBreakdown().Assign(unknownWordPenaltyProducer, unknownScore);
- if (inputScores != NULL) {
- targetPhrase.SetInputScore(*inputScores);
- }
+ if (inputScores != NULL) {
+ targetPhrase.SetInputScore(*inputScores);
+ }
- targetPhrase.Evaluate(*m_unksrc);
+ targetPhrase.Evaluate(*m_unksrc);
- transOpt = new TranslationOption(WordsRange(sourcePos, sourcePos + length - 1), targetPhrase);
- Add(transOpt);
+ transOpt = new TranslationOption(WordsRange(sourcePos, sourcePos + length - 1), targetPhrase);
+ Add(transOpt);
}
@@ -426,19 +421,19 @@ void TranslationOptionCollection::EvaluateWithSource()
{
const size_t size = m_source.GetSize();
for (size_t startPos = 0 ; startPos < size ; ++startPos) {
- size_t maxSize = m_source.GetSize() - startPos;
- size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();
- maxSize = std::min(maxSize, maxSizePhrase);
-
- for (size_t endPos = startPos ; endPos < startPos + maxSize ; ++endPos) {
- TranslationOptionList &transOptList = GetTranslationOptionList(startPos, endPos);
-
- TranslationOptionList::const_iterator iterTransOpt;
- for(iterTransOpt = transOptList.begin() ; iterTransOpt != transOptList.end() ; ++iterTransOpt) {
- TranslationOption &transOpt = **iterTransOpt;
- transOpt.Evaluate(m_source);
- }
- }
+ size_t maxSize = m_source.GetSize() - startPos;
+ size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();
+ maxSize = std::min(maxSize, maxSizePhrase);
+
+ for (size_t endPos = startPos ; endPos < startPos + maxSize ; ++endPos) {
+ TranslationOptionList &transOptList = GetTranslationOptionList(startPos, endPos);
+
+ TranslationOptionList::const_iterator iterTransOpt;
+ for(iterTransOpt = transOptList.begin() ; iterTransOpt != transOptList.end() ; ++iterTransOpt) {
+ TranslationOption &transOpt = **iterTransOpt;
+ transOpt.Evaluate(m_source);
+ }
+ }
}
}
@@ -514,7 +509,7 @@ void TranslationOptionCollection::CreateTranslationOptionsForRange(
for (++iterStep ; iterStep != decodeGraph.end() ; ++iterStep) {
- const DecodeStep &decodeStep = **iterStep;
+ const DecodeStep &decodeStep = **iterStep;
PartialTranslOptColl* newPtoc = new PartialTranslOptColl;
// go thru each intermediate trans opt just created
@@ -634,7 +629,7 @@ std::ostream& operator<<(std::ostream& out, const TranslationOptionCollection& c
return out;
}
-const std::vector<Phrase*>& TranslationOptionCollection::GetUnknownSources() const
+const std::vector<Phrase*>& TranslationOptionCollection::GetUnknownSources() const
{
return m_unksrcs;
}
diff --git a/moses/TranslationOptionCollection.h b/moses/TranslationOptionCollection.h
index 36164f135..171a082e2 100644
--- a/moses/TranslationOptionCollection.h
+++ b/moses/TranslationOptionCollection.h
@@ -42,7 +42,7 @@ class InputType;
class FactorMask;
class Word;
class DecodeGraph;
-
+
/** Contains all phrase translations applicable to current input type (a sentence or confusion network).
* A key insight into efficient decoding is that various input
* conditions (trelliss, factored input, normal text, xml markup)
diff --git a/moses/TranslationOptionCollectionConfusionNet.cpp b/moses/TranslationOptionCollectionConfusionNet.cpp
index a25e8cffb..93953ba8a 100644
--- a/moses/TranslationOptionCollectionConfusionNet.cpp
+++ b/moses/TranslationOptionCollectionConfusionNet.cpp
@@ -10,8 +10,8 @@ namespace Moses
/** constructor; just initialize the base class */
TranslationOptionCollectionConfusionNet::TranslationOptionCollectionConfusionNet(
- const ConfusionNet &input
- , size_t maxNoTransOptPerCoverage, float translationOptionThreshold)
+ const ConfusionNet &input
+ , size_t maxNoTransOptPerCoverage, float translationOptionThreshold)
: TranslationOptionCollection(input, maxNoTransOptPerCoverage, translationOptionThreshold) {}
/* forcibly create translation option for a particular source word.
diff --git a/moses/TreeInput.cpp b/moses/TreeInput.cpp
index acae0bdb1..166445602 100644
--- a/moses/TreeInput.cpp
+++ b/moses/TreeInput.cpp
@@ -149,7 +149,7 @@ bool TreeInput::ProcessAndStripXMLTags(string &line, std::vector<XMLParseOutput>
return false;
}
- // may be either a input span label ("label"), or a specified output translation "translation"
+ // may be either a input span label ("label"), or a specified output translation "translation"
string label = ParseXmlTagAttribute(tagContent,"label");
string translation = ParseXmlTagAttribute(tagContent,"translation");
@@ -165,18 +165,17 @@ bool TreeInput::ProcessAndStripXMLTags(string &line, std::vector<XMLParseOutput>
vector<string> altTexts = TokenizeMultiCharSeparator(translation, "||");
vector<string> altLabel = TokenizeMultiCharSeparator(label, "||");
vector<string> altProbs = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"prob"), "||");
- //TRACE_ERR("number of translations: " << altTexts.size() << endl);
+ //TRACE_ERR("number of translations: " << altTexts.size() << endl);
for (size_t i=0; i<altTexts.size(); ++i) {
// set target phrase
TargetPhrase targetPhrase;
targetPhrase.CreateFromString(Output, outputFactorOrder,altTexts[i],factorDelimiter, NULL);
// set constituent label
- string targetLHSstr;
+ string targetLHSstr;
if (altLabel.size() > i && altLabel[i].size() > 0) {
targetLHSstr = altLabel[i];
- }
- else {
+ } else {
const UnknownLHSList &lhsList = StaticData::Instance().GetUnknownLHS();
UnknownLHSList::const_iterator iterLHS = lhsList.begin();
targetLHSstr = iterLHS->first;
diff --git a/moses/TrellisPath.cpp b/moses/TrellisPath.cpp
index c73575b2c..fed8f9658 100644
--- a/moses/TrellisPath.cpp
+++ b/moses/TrellisPath.cpp
@@ -41,7 +41,8 @@ TrellisPath::TrellisPath(const Hypothesis *hypo)
}
}
-void TrellisPath::InitScore() {
+void TrellisPath::InitScore()
+{
m_totalScore = m_path[0]->GetWinningHypo()->GetTotalScore();
m_scoreBreakdown= m_path[0]->GetWinningHypo()->GetScoreBreakdown();
@@ -82,8 +83,8 @@ TrellisPath::TrellisPath(const TrellisPath &copy, size_t edgeIndex, const Hypoth
InitScore();
}
-TrellisPath::TrellisPath(const vector<const Hypothesis*> edges)
-:m_prevEdgeChanged(NOT_FOUND)
+TrellisPath::TrellisPath(const vector<const Hypothesis*> edges)
+ :m_prevEdgeChanged(NOT_FOUND)
{
m_path.resize(edges.size());
copy(edges.rbegin(),edges.rend(),m_path.begin());
diff --git a/moses/TrellisPath.h b/moses/TrellisPath.h
index d8005435c..26e722696 100644
--- a/moses/TrellisPath.h
+++ b/moses/TrellisPath.h
@@ -59,36 +59,36 @@ protected:
void InitScore();
public:
- TrellisPath(); // not implemented
-
- //! create path OF pure hypo
- TrellisPath(const Hypothesis *hypo);
-
- /** create path from another path, deviate at edgeIndex by using arc instead,
- * which may change other hypo back from there
- */
- TrellisPath(const TrellisPath &copy, size_t edgeIndex, const Hypothesis *arc);
-
- //! get score for this path throught trellis
- inline float GetTotalScore() const { return m_totalScore; }
-
- /** list of each hypo/arcs in path. For anything other than the best hypo, it is not possible just to follow the
- * m_prevHypo variable in the hypothesis object
- */
- inline const std::vector<const Hypothesis *> &GetEdges() const
- {
- return m_path;
- }
-
- inline size_t GetSize() const
- {
- return m_path.size();
- }
-
- //! create a set of next best paths by wiggling 1 of the node at a time.
- void CreateDeviantPaths(TrellisPathCollection &pathColl) const;
-
- //! create a list of next best paths by wiggling 1 of the node at a time.
+ TrellisPath(); // not implemented
+
+ //! create path OF pure hypo
+ TrellisPath(const Hypothesis *hypo);
+
+ /** create path from another path, deviate at edgeIndex by using arc instead,
+ * which may change other hypo back from there
+ */
+ TrellisPath(const TrellisPath &copy, size_t edgeIndex, const Hypothesis *arc);
+
+ //! get score for this path throught trellis
+ inline float GetTotalScore() const {
+ return m_totalScore;
+ }
+
+ /** list of each hypo/arcs in path. For anything other than the best hypo, it is not possible just to follow the
+ * m_prevHypo variable in the hypothesis object
+ */
+ inline const std::vector<const Hypothesis *> &GetEdges() const {
+ return m_path;
+ }
+
+ inline size_t GetSize() const {
+ return m_path.size();
+ }
+
+ //! create a set of next best paths by wiggling 1 of the node at a time.
+ void CreateDeviantPaths(TrellisPathCollection &pathColl) const;
+
+ //! create a list of next best paths by wiggling 1 of the node at a time.
void CreateDeviantPaths(TrellisPathList &pathColl) const;
inline const ScoreComponentCollection &GetScoreBreakdown() const {
diff --git a/moses/TypeDef.h b/moses/TypeDef.h
index e44946a2f..2b98b5bc3 100644
--- a/moses/TypeDef.h
+++ b/moses/TypeDef.h
@@ -129,7 +129,7 @@ enum InputTypeEnum {
,WordLatticeInput = 2
,TreeInputType = 3
,WordLatticeInput2 = 4
-
+
};
enum XmlInputType {
@@ -169,8 +169,7 @@ enum WordAlignmentSort {
,TargetOrder = 1
};
-enum FormatType
-{
+enum FormatType {
MosesFormat
,HieroFormat
};
diff --git a/moses/Util.cpp b/moses/Util.cpp
index 13cee27f9..f92c32dbb 100644
--- a/moses/Util.cpp
+++ b/moses/Util.cpp
@@ -182,8 +182,7 @@ std::string PassthroughSGML(std::string &line, const std::string tagName, const
size_t close = lline.find(rbrack, open);
//check whether the tag is closed with '/>'; if not return the empty string
- if (close == std::string::npos)
- {
+ if (close == std::string::npos) {
TRACE_ERR("PassthroughSGML error: the <passthrough info/> tag does not end properly\n");
return meta;
}
@@ -198,8 +197,7 @@ std::string PassthroughSGML(std::string &line, const std::string tagName, const
lline = ToLower(line);
open = lline.find(lbrack+tagName);
- if (open != std::string::npos)
- {
+ if (open != std::string::npos) {
TRACE_ERR("PassthroughSGML error: there are two <passthrough> tags\n");
}
return meta;
diff --git a/moses/Util.h b/moses/Util.h
index 9f43d9dc3..e5bdc820a 100644
--- a/moses/Util.h
+++ b/moses/Util.h
@@ -363,27 +363,27 @@ std::string PassthroughSGML(std::string &line, const std::string tagName,const s
*/
inline std::string GetFirstString(const std::string& str, int& first_pos, const std::string& delimiters = " \t")
{
-
- std::string first_str;
- // Skip delimiters at beginning.
- std::string::size_type lastPos = str.find_first_not_of(delimiters, first_pos);
-
- // Find first "non-delimiter".
- std::string::size_type pos = str.find_first_of(delimiters, lastPos);
-
- if (std::string::npos != pos || std::string::npos != lastPos){
-
- first_str = str.substr(lastPos, pos - lastPos);
-
- // Skip delimiters. Note the "not_of"
- lastPos = str.find_first_not_of(delimiters, pos);
-
- }
-
- first_pos = lastPos;
- return first_str;
+
+ std::string first_str;
+ // Skip delimiters at beginning.
+ std::string::size_type lastPos = str.find_first_not_of(delimiters, first_pos);
+
+ // Find first "non-delimiter".
+ std::string::size_type pos = str.find_first_of(delimiters, lastPos);
+
+ if (std::string::npos != pos || std::string::npos != lastPos) {
+
+ first_str = str.substr(lastPos, pos - lastPos);
+
+ // Skip delimiters. Note the "not_of"
+ lastPos = str.find_first_not_of(delimiters, pos);
+
+ }
+
+ first_pos = lastPos;
+ return first_str;
}
-
+
template<class T>
T log_sum (T log_a, T log_b)
{
diff --git a/moses/Word.cpp b/moses/Word.cpp
index 69d382c8a..41e5fae03 100644
--- a/moses/Word.cpp
+++ b/moses/Word.cpp
@@ -87,7 +87,8 @@ std::string Word::GetString(const vector<FactorType> factorType,bool endWithBlan
return strme.str();
}
-StringPiece Word::GetString(FactorType factorType) const {
+StringPiece Word::GetString(FactorType factorType) const
+{
return m_factorArray[factorType]->GetString();
}
diff --git a/moses/Word.h b/moses/Word.h
index d650fb67e..e88b0441b 100644
--- a/moses/Word.h
+++ b/moses/Word.h
@@ -152,8 +152,9 @@ struct WordComparer {
};
-inline size_t hash_value(const Word& word) {
- return word.hash();
+inline size_t hash_value(const Word& word)
+{
+ return word.hash();
}
}
diff --git a/moses/XmlOption.cpp b/moses/XmlOption.cpp
index c8d639e0a..4b703b247 100644
--- a/moses/XmlOption.cpp
+++ b/moses/XmlOption.cpp
@@ -83,8 +83,8 @@ string TrimXml(const string& str, const std::string& lbrackStr, const std::strin
*/
bool isXmlTag(const string& tag, const std::string& lbrackStr, const std::string& rbrackStr)
{
- return (tag.substr(0,lbrackStr.length()) == lbrackStr &&
- (tag[lbrackStr.length()] == '/' ||
+ return (tag.substr(0,lbrackStr.length()) == lbrackStr &&
+ (tag[lbrackStr.length()] == '/' ||
(tag[lbrackStr.length()] >= 'a' && tag[lbrackStr.length()] <= 'z') ||
(tag[lbrackStr.length()] >= 'A' && tag[lbrackStr.length()] <= 'Z')));
}
@@ -111,7 +111,7 @@ vector<string> TokenizeXml(const string& str, const std::string& lbrackStr, cons
// walk thorugh the string (loop vver cpos)
while (cpos != str.size()) {
// find the next opening "<" of an xml tag
- lpos = str.find(lbrack, cpos); // lpos = str.find_first_of(lbrack, cpos);
+ lpos = str.find(lbrack, cpos); // lpos = str.find_first_of(lbrack, cpos);
if (lpos != string::npos) {
// find the end of the xml tag
rpos = str.find(rbrack, lpos+lbrackStr.length()-1); // rpos = str.find_first_of(rbrack, lpos);
@@ -149,8 +149,8 @@ vector<string> TokenizeXml(const string& str, const std::string& lbrackStr, cons
* \param lbrackStr xml tag's left bracket string, typically "<"
* \param rbrackStr xml tag's right bracket string, typically ">"
*/
-bool ProcessAndStripXMLTags(string &line, vector<XmlOption*> &res, ReorderingConstraint &reorderingConstraint, vector< size_t > &walls,
- const std::string& lbrackStr, const std::string& rbrackStr)
+bool ProcessAndStripXMLTags(string &line, vector<XmlOption*> &res, ReorderingConstraint &reorderingConstraint, vector< size_t > &walls,
+ const std::string& lbrackStr, const std::string& rbrackStr)
{
//parse XML markup in translation line
diff --git a/moses/XmlOption.h b/moses/XmlOption.h
index 45989c841..942446b26 100644
--- a/moses/XmlOption.h
+++ b/moses/XmlOption.h
@@ -30,8 +30,8 @@ std::string TrimXml(const std::string& str, const std::string& lbrackStr="<", co
bool isXmlTag(const std::string& tag, const std::string& lbrackStr="<", const std::string& rbrackStr=">");
std::vector<std::string> TokenizeXml(const std::string& str, const std::string& lbrackStr="<", const std::string& rbrackStr=">");
-bool ProcessAndStripXMLTags(std::string &line, std::vector<XmlOption*> &res, ReorderingConstraint &reorderingConstraint, std::vector< size_t > &walls,
- const std::string& lbrackStr="<", const std::string& rbrackStr=">");
+bool ProcessAndStripXMLTags(std::string &line, std::vector<XmlOption*> &res, ReorderingConstraint &reorderingConstraint, std::vector< size_t > &walls,
+ const std::string& lbrackStr="<", const std::string& rbrackStr=">");
}