github.com/moses-smt/mosesdecoder.git
author    akimbal1 <akimball2@bloomberg.net>  2015-02-15 07:56:41 +0300
committer akimbal1 <akimball2@bloomberg.net>  2015-02-15 07:56:41 +0300
commit    f307e56273065e93cb798698dbc7434acf92adbc (patch)
tree      20541a7111d8fe09c8c704dae639f7b0f06d718d
parent    6352dc773cc9494c1b881618bf674cab90c32feb (diff)
parent    17ddbc7d38f9ca9bbfa2cab467402b3e8374afa6 (diff)

merge upstream

Merge remote-tracking branch 'upstream/master'
-rw-r--r--  OnDiskPt/Word.cpp | 4
-rw-r--r--  contrib/other-builds/manual-label/EnOpenNLPChunker.cpp | 4
-rw-r--r--  contrib/other-builds/moses/.project | 265
-rw-r--r--  contrib/server/mosesserver.cpp | 62
-rw-r--r--  mert/FeatureDataIterator.cpp | 3
-rw-r--r--  mert/HopeFearDecoder.cpp | 2
-rw-r--r--  mert/ScoreDataIterator.cpp | 3
-rw-r--r--  moses-cmd/Main.cpp | 190
-rw-r--r--  moses/ConfusionNet.cpp | 2
-rw-r--r--  moses/ExportInterface.cpp | 215
-rw-r--r--  moses/ExportInterface.h (renamed from moses-cmd/Main.h) | 2
-rw-r--r--  moses/FF/BleuScoreFeature.cpp | 2
-rw-r--r--  moses/FF/InputFeature.cpp | 14
-rw-r--r--  moses/FF/LexicalReordering/LexicalReordering.cpp | 201
-rw-r--r--  moses/FF/LexicalReordering/LexicalReorderingState.cpp | 843
-rw-r--r--  moses/FF/LexicalReordering/LexicalReorderingState.h | 3
-rw-r--r--  moses/FF/LexicalReordering/LexicalReorderingTable.cpp | 1215
-rw-r--r--  moses/FF/LexicalReordering/LexicalReorderingTable.h | 291
-rw-r--r--  moses/FF/OSM-Feature/KenOSM.h | 2
-rw-r--r--  moses/FF/OSM-Feature/OpSequenceModel.cpp | 10
-rw-r--r--  moses/FF/StatefulFeatureFunction.h | 4
-rw-r--r--  moses/FF/VW/VW.h | 18
-rw-r--r--  moses/LM/Ken.cpp | 30
-rw-r--r--  moses/Parameter.cpp | 9
-rw-r--r--  moses/ScoreComponentCollection.cpp | 8
-rw-r--r--  moses/SearchCubePruning.cpp | 69
-rw-r--r--  moses/SearchNormal.cpp | 252
-rw-r--r--  moses/Sentence.cpp | 6
-rw-r--r--  moses/StaticData.cpp | 28
-rw-r--r--  moses/StaticData.h | 9
-rw-r--r--  moses/Syntax/F2S/HyperTreeLoader.cpp | 5
-rw-r--r--  moses/Syntax/T2S/RuleTrieLoader.cpp | 5
-rw-r--r--  moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp | 279
-rw-r--r--  moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.h | 90
-rw-r--r--  moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp | 17
-rw-r--r--  moses/TranslationModel/RuleTable/LoaderStandard.cpp | 5
-rw-r--r--  moses/TranslationModel/UG/generic/file_io/ug_stream.cpp | 12
-rw-r--r--  moses/TranslationModel/UG/mm/mtt-build.cc | 4
-rw-r--r--  moses/TranslationModel/UG/spe-check-coverage.cc | 8
-rw-r--r--  moses/TranslationOption.h | 11
-rw-r--r--  moses/TranslationOptionCollection.cpp | 1239
-rw-r--r--  moses/TranslationOptionCollection.h | 38
-rw-r--r--  moses/TranslationOptionCollectionConfusionNet.cpp | 69
-rw-r--r--  moses/TranslationOptionCollectionConfusionNet.h | 14
-rw-r--r--  moses/TranslationOptionCollectionLattice.cpp | 18
-rw-r--r--  moses/TranslationOptionCollectionLattice.h | 9
-rw-r--r--  moses/TranslationOptionCollectionText.cpp | 23
-rw-r--r--  moses/TranslationOptionCollectionText.h | 2
-rw-r--r--  moses/TranslationOptionList.cpp | 95
-rw-r--r--  moses/TranslationOptionList.h | 113
-rw-r--r--  moses/TranslationTask.cpp | 2
-rw-r--r--  moses/TypeDef.h | 5
-rw-r--r--  moses/Util.cpp | 4
-rw-r--r--  moses/Util.h | 14
-rw-r--r--  moses/WordLattice.h | 4
-rw-r--r--  moses/XmlOption.cpp | 6
-rw-r--r--  phrase-extract/OutputFileStream.cpp | 4
-rw-r--r--  phrase-extract/ScoreFeature.cpp | 6
-rw-r--r--  phrase-extract/consolidate-direct-main.cpp | 26
-rw-r--r--  phrase-extract/extract-mixed-syntax/InputFileStream.cpp | 5
-rw-r--r--  phrase-extract/filter-rule-table/CfgFilter.h | 29
-rw-r--r--  phrase-extract/filter-rule-table/FilterRuleTable.cpp | 159
-rw-r--r--  phrase-extract/filter-rule-table/FilterRuleTable.h | 15
-rw-r--r--  phrase-extract/filter-rule-table/Forest.h | 59
-rw-r--r--  phrase-extract/filter-rule-table/ForestTsgFilter.cpp | 196
-rw-r--r--  phrase-extract/filter-rule-table/ForestTsgFilter.h | 70
-rw-r--r--  phrase-extract/filter-rule-table/Options.h | 1
-rw-r--r--  phrase-extract/filter-rule-table/StringBasedFilter.cpp | 27
-rw-r--r--  phrase-extract/filter-rule-table/StringBasedFilter.h | 25
-rw-r--r--  phrase-extract/filter-rule-table/StringCfgFilter.cpp | 323
-rw-r--r--  phrase-extract/filter-rule-table/StringCfgFilter.h | 143
-rw-r--r--  phrase-extract/filter-rule-table/StringForest.h | 24
-rw-r--r--  phrase-extract/filter-rule-table/StringForestParser.cpp | 146
-rw-r--r--  phrase-extract/filter-rule-table/StringForestParser.h | 83
-rw-r--r--  phrase-extract/filter-rule-table/TreeBasedFilter.cpp | 243
-rw-r--r--  phrase-extract/filter-rule-table/TreeBasedFilter.h | 88
-rw-r--r--  phrase-extract/filter-rule-table/TreeTsgFilter.cpp | 120
-rw-r--r--  phrase-extract/filter-rule-table/TreeTsgFilter.h | 53
-rw-r--r--  phrase-extract/filter-rule-table/TsgFilter.cpp | 168
-rw-r--r--  phrase-extract/filter-rule-table/TsgFilter.h | 54
-rwxr-xr-x  phrase-extract/lexical-reordering/InputFileStream.cpp | 5
-rw-r--r--  phrase-extract/score-main.cpp | 6
-rwxr-xr-x  regression-testing/run-test-extract.perl | 7
-rwxr-xr-x  regression-testing/run-test-mert.perl | 6
-rwxr-xr-x  regression-testing/run-test-misc.perl | 7
-rwxr-xr-x  regression-testing/run-test-scorer.perl | 5
-rwxr-xr-x  scripts/Transliteration/in-decoding-transliteration.pl | 2
-rwxr-xr-x  scripts/Transliteration/post-decoding-transliteration.pl | 4
-rwxr-xr-x  scripts/Transliteration/prepare-transliteration-phrase-table.pl | 2
-rwxr-xr-x  scripts/Transliteration/train-transliteration-module.pl | 2
-rwxr-xr-x  scripts/recaser/train-recaser.perl | 34
-rwxr-xr-x  scripts/tokenizer/normalize-punctuation.perl | 1
-rwxr-xr-x  scripts/training/convert-moses-ini-v2-to-v1.perl | 263
-rwxr-xr-x  scripts/training/train-model.perl | 16
-rw-r--r--  vw/ClassifierFactory.cpp | 5
-rw-r--r--  vw/VWTrainer.cpp | 4
96 files changed, 5085 insertions(+), 3208 deletions(-)
diff --git a/OnDiskPt/Word.cpp b/OnDiskPt/Word.cpp
index 9e6fb6502..8932732ea 100644
--- a/OnDiskPt/Word.cpp
+++ b/OnDiskPt/Word.cpp
@@ -18,6 +18,7 @@
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
+#include <boost/algorithm/string/predicate.hpp>
#include "moses/FactorCollection.h"
#include "moses/Util.h"
#include "moses/Word.h"
@@ -27,6 +28,7 @@
#include "util/exception.hh"
using namespace std;
+using namespace boost::algorithm;
namespace OnDiskPt
{
@@ -41,7 +43,7 @@ Word::~Word()
void Word::CreateFromString(const std::string &inString, Vocab &vocab)
{
- if (inString.substr(0, 1) == "[" && inString.substr(inString.size() - 1, 1) == "]") {
+ if (starts_with(inString, "[") && ends_with(inString, "]")) {
// non-term
m_isNonTerminal = true;
string str = inString.substr(1, inString.size() - 2);
diff --git a/contrib/other-builds/manual-label/EnOpenNLPChunker.cpp b/contrib/other-builds/manual-label/EnOpenNLPChunker.cpp
index 67c2e9d84..e2c2935f9 100644
--- a/contrib/other-builds/manual-label/EnOpenNLPChunker.cpp
+++ b/contrib/other-builds/manual-label/EnOpenNLPChunker.cpp
@@ -8,10 +8,12 @@
#include <stdio.h>
#include <algorithm>
#include <fstream>
+#include <boost/algorithm/string/predicate.hpp>
#include "EnOpenNLPChunker.h"
#include "moses/Util.h"
using namespace std;
+using namespace boost::algorithm;
EnOpenNLPChunker::EnOpenNLPChunker(const std::string &openNLPPath)
:m_openNLPPath(openNLPPath)
@@ -85,7 +87,7 @@ void EnOpenNLPChunker::MosesReformat(const string &line, std::ostream &out, cons
inLabel = true;
}
}
- else if (tok.substr(tok.size()-1, 1) == "]") {
+ else if (ends_with(tok, "]")) {
// end of chunk
if (tok.size() > 1) {
if (tok.substr(1,1) == "_") {
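Both diffs above replace manual substr() bounds arithmetic with boost::algorithm's starts_with/ends_with. A minimal standalone sketch of the idiom, not Moses code; the token value is illustrative:

#include <iostream>
#include <string>
#include <boost/algorithm/string/predicate.hpp>

int main()
{
  std::string tok = "[NP]";
  // Unlike tok.substr(tok.size() - 1, 1), which throws std::out_of_range
  // when tok is empty, the predicates are safe for any input string.
  if (boost::algorithm::starts_with(tok, "[") &&
      boost::algorithm::ends_with(tok, "]"))
    std::cout << "non-terminal label: " << tok.substr(1, tok.size() - 2) << "\n";
  return 0;
}
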
diff --git a/contrib/other-builds/moses/.project b/contrib/other-builds/moses/.project
index dcbb6f439..0751ebd8f 100644
--- a/contrib/other-builds/moses/.project
+++ b/contrib/other-builds/moses/.project
@@ -261,6 +261,16 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/DecodeStepTranslation.h</locationURI>
</link>
<link>
+ <name>ExportInterface.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/ExportInterface.cpp</locationURI>
+ </link>
+ <link>
+ <name>ExportInterface.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/ExportInterface.h</locationURI>
+ </link>
+ <link>
<name>FF</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
@@ -336,6 +346,16 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/FloydWarshall.h</locationURI>
</link>
<link>
+ <name>ForestInput.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/ForestInput.cpp</locationURI>
+ </link>
+ <link>
+ <name>ForestInput.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/ForestInput.h</locationURI>
+ </link>
+ <link>
<name>GenerationDictionary.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/GenerationDictionary.cpp</locationURI>
@@ -1486,6 +1506,16 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/TreeStructureFeature.h</locationURI>
</link>
<link>
+ <name>FF/UnalignedWordCountFeature.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/FF/UnalignedWordCountFeature.cpp</locationURI>
+ </link>
+ <link>
+ <name>FF/UnalignedWordCountFeature.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/FF/UnalignedWordCountFeature.h</locationURI>
+ </link>
+ <link>
<name>FF/UnknownWordPenaltyProducer.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/UnknownWordPenaltyProducer.cpp</locationURI>
@@ -1856,6 +1886,11 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/CubeQueue.h</locationURI>
</link>
<link>
+ <name>Syntax/F2S</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
<name>Syntax/KBestExtractor.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/KBestExtractor.cpp</locationURI>
@@ -1961,6 +1996,11 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/SymbolHasher.h</locationURI>
</link>
<link>
+ <name>Syntax/T2S</name>
+ <type>2</type>
+ <locationURI>virtual:/virtual</locationURI>
+ </link>
+ <link>
<name>TranslationModel/BilingualDynSuffixArray.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/BilingualDynSuffixArray.cpp</locationURI>
@@ -2321,6 +2361,141 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/LM/oxlm/SourceOxLM.h</locationURI>
</link>
<link>
+ <name>Syntax/F2S/DerivationWriter.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/DerivationWriter.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/F2S/DerivationWriter.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/DerivationWriter.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/F2S/Forest.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/Forest.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/F2S/Forest.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/Forest.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/F2S/GlueRuleSynthesizer.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/GlueRuleSynthesizer.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/F2S/GlueRuleSynthesizer.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/GlueRuleSynthesizer.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/F2S/HyperPath.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/HyperPath.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/F2S/HyperPath.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/HyperPath.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/F2S/HyperPathLoader.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/HyperPathLoader.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/F2S/HyperPathLoader.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/HyperPathLoader.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/F2S/HyperTree.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/HyperTree.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/F2S/HyperTree.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/HyperTree.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/F2S/HyperTreeCreator.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/HyperTreeCreator.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/F2S/HyperTreeLoader.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/HyperTreeLoader.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/F2S/HyperTreeLoader.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/HyperTreeLoader.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/F2S/Manager-inl.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/Manager-inl.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/F2S/Manager.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/Manager.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/F2S/PHyperedgeToSHyperedgeBundle.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/PHyperedgeToSHyperedgeBundle.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/F2S/PVertexToStackMap.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/PVertexToStackMap.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/F2S/RuleMatcher.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/RuleMatcher.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/F2S/RuleMatcherCallback.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/RuleMatcherCallback.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/F2S/RuleMatcherHyperTree-inl.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/RuleMatcherHyperTree-inl.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/F2S/RuleMatcherHyperTree.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/RuleMatcherHyperTree.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/F2S/TopologicalSorter.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/TopologicalSorter.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/F2S/TopologicalSorter.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/TopologicalSorter.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/F2S/TreeFragmentTokenizer.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/TreeFragmentTokenizer.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/F2S/TreeFragmentTokenizer.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/F2S/TreeFragmentTokenizer.h</locationURI>
+ </link>
+ <link>
<name>Syntax/S2T/DerivationWriter.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/DerivationWriter.cpp</locationURI>
@@ -2426,6 +2601,96 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/S2T/SChart.h</locationURI>
</link>
<link>
+ <name>Syntax/T2S/GlueRuleSynthesizer.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/T2S/GlueRuleSynthesizer.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/T2S/GlueRuleSynthesizer.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/T2S/GlueRuleSynthesizer.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/T2S/HyperTree.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/T2S/HyperTree.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/T2S/InputTree.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/T2S/InputTree.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/T2S/InputTreeBuilder.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/T2S/InputTreeBuilder.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/T2S/InputTreeBuilder.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/T2S/InputTreeBuilder.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/T2S/InputTreeToForest.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/T2S/InputTreeToForest.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/T2S/InputTreeToForest.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/T2S/InputTreeToForest.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/T2S/Manager-inl.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/T2S/Manager-inl.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/T2S/Manager.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/T2S/Manager.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/T2S/RuleMatcher.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/T2S/RuleMatcher.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/T2S/RuleMatcherSCFG-inl.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/T2S/RuleMatcherSCFG-inl.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/T2S/RuleMatcherSCFG.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/T2S/RuleMatcherSCFG.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/T2S/RuleTrie.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/T2S/RuleTrie.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/T2S/RuleTrie.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/T2S/RuleTrie.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/T2S/RuleTrieCreator.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/T2S/RuleTrieCreator.h</locationURI>
+ </link>
+ <link>
+ <name>Syntax/T2S/RuleTrieLoader.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/T2S/RuleTrieLoader.cpp</locationURI>
+ </link>
+ <link>
+ <name>Syntax/T2S/RuleTrieLoader.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/Syntax/T2S/RuleTrieLoader.h</locationURI>
+ </link>
+ <link>
<name>TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.cpp</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerCYKPlus.cpp</locationURI>
diff --git a/contrib/server/mosesserver.cpp b/contrib/server/mosesserver.cpp
index 6819cbeaa..e140dbe7c 100644
--- a/contrib/server/mosesserver.cpp
+++ b/contrib/server/mosesserver.cpp
@@ -19,6 +19,8 @@
#include "moses/LM/ORLM.h"
#include "moses/IOWrapper.h"
+#include <boost/foreach.hpp>
+
#ifdef WITH_THREADS
#include <boost/thread.hpp>
#endif
@@ -276,7 +278,7 @@ public:
stringstream out, graphInfo, transCollOpts;
- if (staticData.IsChart()) {
+ if (staticData.IsSyntax()) {
TreeInput tinput;
const vector<FactorType>&
inputFactorOrder = staticData.GetInputFactorOrder();
@@ -505,39 +507,41 @@ public:
retData.insert(pair<string, xmlrpc_c::value>("nbest", xmlrpc_c::value_array(nBestXml)));
}
- void insertTranslationOptions(Manager& manager, map<string, xmlrpc_c::value>& retData) {
+ void
+ insertTranslationOptions(Manager& manager, map<string, xmlrpc_c::value>& retData)
+ {
const TranslationOptionCollection* toptsColl = manager.getSntTranslationOptions();
vector<xmlrpc_c::value> toptsXml;
- for (size_t startPos = 0 ; startPos < toptsColl->GetSource().GetSize() ; ++startPos) {
- size_t maxSize = toptsColl->GetSource().GetSize() - startPos;
- size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();
- maxSize = std::min(maxSize, maxSizePhrase);
-
- for (size_t endPos = startPos ; endPos < startPos + maxSize ; ++endPos) {
- WordsRange range(startPos,endPos);
- const TranslationOptionList& fullList = toptsColl->GetTranslationOptionList(range);
- for (size_t i = 0; i < fullList.size(); i++) {
- const TranslationOption* topt = fullList.Get(i);
- map<string, xmlrpc_c::value> toptXml;
- toptXml["phrase"] = xmlrpc_c::value_string(topt->GetTargetPhrase().
- GetStringRep(StaticData::Instance().GetOutputFactorOrder()));
- toptXml["fscore"] = xmlrpc_c::value_double(topt->GetFutureScore());
- toptXml["start"] = xmlrpc_c::value_int(startPos);
- toptXml["end"] = xmlrpc_c::value_int(endPos);
- vector<xmlrpc_c::value> scoresXml;
- const std::valarray<FValue> &scores = topt->GetScoreBreakdown().getCoreFeatures();
- for (size_t j = 0; j < scores.size(); ++j) {
- scoresXml.push_back(xmlrpc_c::value_double(scores[j]));
- }
- toptXml["scores"] = xmlrpc_c::value_array(scoresXml);
- toptsXml.push_back(xmlrpc_c::value_struct(toptXml));
- }
- }
+ size_t const stop = toptsColl->GetSource().GetSize();
+ TranslationOptionList const* tol;
+ for (size_t s = 0 ; s < stop ; ++s)
+ {
+ for (size_t e = s; (tol = toptsColl->GetTranslationOptionList(s,e)) != NULL; ++e)
+ {
+ BOOST_FOREACH(TranslationOption const* topt, *tol)
+ {
+ map<string, xmlrpc_c::value> toptXml;
+ TargetPhrase const& tp = topt->GetTargetPhrase();
+ StaticData const& GLOBAL = StaticData::Instance();
+ string tphrase = tp.GetStringRep(GLOBAL.GetOutputFactorOrder());
+ toptXml["phrase"] = xmlrpc_c::value_string(tphrase);
+ toptXml["fscore"] = xmlrpc_c::value_double(topt->GetFutureScore());
+ toptXml["start"] = xmlrpc_c::value_int(s);
+ toptXml["end"] = xmlrpc_c::value_int(e);
+ vector<xmlrpc_c::value> scoresXml;
+ const std::valarray<FValue> &scores
+ = topt->GetScoreBreakdown().getCoreFeatures();
+ for (size_t j = 0; j < scores.size(); ++j)
+ scoresXml.push_back(xmlrpc_c::value_double(scores[j]));
+
+ toptXml["scores"] = xmlrpc_c::value_array(scoresXml);
+ toptsXml.push_back(xmlrpc_c::value_struct(toptXml));
+ }
+ }
}
retData.insert(pair<string, xmlrpc_c::value>("topt", xmlrpc_c::value_array(toptsXml)));
-
}
-
+
private:
xmlrpc_c::paramList const& m_paramList;
map<string, xmlrpc_c::value> m_retData;
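The rewritten insertTranslationOptions above drops the precomputed maxSize bound in favour of a sentinel pattern: GetTranslationOptionList(s,e) appears to return NULL once the span (s,e) leaves the stored table, so the inner loop simply extends e until that happens. A self-contained sketch of the pattern with simplified stand-in types, not the real Moses classes:

#include <cstddef>
#include <iostream>
#include <vector>

typedef std::vector<int> OptionList;   // stand-in for TranslationOptionList

struct OptionCollection {              // stand-in for the option collection
  std::size_t sourceSize, maxSpan;
  OptionList dummy;
  // NULL signals "no list stored for this span", ending the caller's loop.
  OptionList const* Get(std::size_t s, std::size_t e) const {
    if (e >= sourceSize || e - s + 1 > maxSpan) return NULL;
    return &dummy;
  }
};

int main()
{
  OptionCollection coll = { 4, 2, OptionList(3, 0) };
  OptionList const* tol;
  for (std::size_t s = 0; s < coll.sourceSize; ++s)
    for (std::size_t e = s; (tol = coll.Get(s, e)) != NULL; ++e)
      std::cout << "span [" << s << "," << e << "]: "
                << tol->size() << " options\n";
  return 0;
}
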
diff --git a/mert/FeatureDataIterator.cpp b/mert/FeatureDataIterator.cpp
index 9deb0ac50..311a8581f 100644
--- a/mert/FeatureDataIterator.cpp
+++ b/mert/FeatureDataIterator.cpp
@@ -87,7 +87,8 @@ void FeatureDataIterator::readNext()
if (marker != StringPiece(FEATURES_TXT_BEGIN)) {
throw FileFormatException(m_in->FileName(), marker.as_string());
}
- size_t sentenceId = m_in->ReadULong();
+ // size_t sentenceId =
+ m_in->ReadULong();
size_t count = m_in->ReadULong();
size_t length = m_in->ReadULong();
m_in->ReadLine(); //discard rest of line
diff --git a/mert/HopeFearDecoder.cpp b/mert/HopeFearDecoder.cpp
index e907d3ea0..3e62d8171 100644
--- a/mert/HopeFearDecoder.cpp
+++ b/mert/HopeFearDecoder.cpp
@@ -246,7 +246,7 @@ void HypergraphHopeFearDecoder::HopeFear(
wv.ToSparse(&weights);
const Graph& graph = *(graphs_[sentenceId]);
- ValType hope_scale = 1.0;
+ // ValType hope_scale = 1.0;
HgHypothesis hopeHypo, fearHypo, modelHypo;
for(size_t safe_loop=0; safe_loop<2; safe_loop++) {
diff --git a/mert/ScoreDataIterator.cpp b/mert/ScoreDataIterator.cpp
index 71e05ab0b..5a6f6fb69 100644
--- a/mert/ScoreDataIterator.cpp
+++ b/mert/ScoreDataIterator.cpp
@@ -49,7 +49,8 @@ void ScoreDataIterator::readNext()
if (marker != StringPiece(SCORES_TXT_BEGIN)) {
throw FileFormatException(m_in->FileName(), marker.as_string());
}
- size_t sentenceId = m_in->ReadULong();
+ // size_t sentenceId =
+ m_in->ReadULong();
size_t count = m_in->ReadULong();
size_t length = m_in->ReadULong();
m_in->ReadLine(); //ignore rest of line
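Both mert changes above comment out the unused sentenceId variable but deliberately keep the ReadULong() call: each read advances the file cursor, so dropping the call would shift count and length onto the wrong fields. The same constraint, sketched with std::istream standing in for the m_in reader used here:

#include <iostream>
#include <sstream>

int main()
{
  std::istringstream in("7 3 12");  // sentenceId, count, length
  unsigned long sentenceId, count, length;
  in >> sentenceId;                 // value unused, but the read must happen
  in >> count >> length;            // stays aligned: count=3, length=12
  std::cout << "count=" << count << " length=" << length << "\n";
  return 0;
}
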
diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp
index f88f186b5..7de3206fd 100644
--- a/moses-cmd/Main.cpp
+++ b/moses-cmd/Main.cpp
@@ -20,195 +20,13 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
/**
- * Moses main, for single-threaded and multi-threaded.
+ * Main wrapper for the moses executable (single-threaded and multi-threaded); it simply calls decoder_main.
**/
-#include <exception>
-#include <fstream>
-#include <sstream>
-#include <vector>
-
-#include "util/usage.hh"
-
-#ifdef WIN32
-// Include Visual Leak Detector
-//#include <vld.h>
-#endif
-
-#include "moses/IOWrapper.h"
-#include "moses/Hypothesis.h"
-#include "moses/Manager.h"
-#include "moses/StaticData.h"
-#include "moses/TypeDef.h"
-#include "moses/Util.h"
-#include "moses/Timer.h"
-#include "moses/TranslationModel/PhraseDictionary.h"
-#include "moses/FF/StatefulFeatureFunction.h"
-#include "moses/FF/StatelessFeatureFunction.h"
-#include "moses/TranslationTask.h"
-
-#ifdef HAVE_PROTOBUF
-#include "hypergraph.pb.h"
-#endif
-
-#ifdef PT_UG
-#include <boost/foreach.hpp>
-#include "moses/TranslationModel/UG/mmsapt.h"
-#include "moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h"
-#endif
-
-using namespace std;
-using namespace Moses;
-
-namespace Moses
-{
-
-void OutputFeatureWeightsForHypergraph(std::ostream &outputSearchGraphStream)
-{
- outputSearchGraphStream.setf(std::ios::fixed);
- outputSearchGraphStream.precision(6);
- StaticData::Instance().GetAllWeights().Save(outputSearchGraphStream);
-}
-
-
-} //namespace
+#include "moses/ExportInterface.h"
/** main function of the command line version of the decoder **/
int main(int argc, char** argv)
{
- try {
-
-#ifdef HAVE_PROTOBUF
- GOOGLE_PROTOBUF_VERIFY_VERSION;
-#endif
-
- // echo command line, if verbose
- IFVERBOSE(1) {
- TRACE_ERR("command: ");
- for(int i=0; i<argc; ++i) TRACE_ERR(argv[i]<<" ");
- TRACE_ERR(endl);
- }
-
- // set number of significant decimals in output
- FixPrecision(cout);
- FixPrecision(cerr);
-
- // load all the settings into the Parameter class
- // (stores them as strings, or array of strings)
- Parameter params;
- if (!params.LoadParam(argc,argv)) {
- exit(1);
- }
-
-
- // initialize all "global" variables, which are stored in StaticData
- // note: this also loads models such as the language model, etc.
- if (!StaticData::LoadDataStatic(&params, argv[0])) {
- exit(1);
- }
-
- // setting "-show-weights" -> just dump out weights and exit
- if (params.isParamSpecified("show-weights")) {
- ShowWeights();
- exit(0);
- }
-
- // shorthand for accessing information in StaticData
- const StaticData& staticData = StaticData::Instance();
-
-
- //initialise random numbers
- srand(time(NULL));
-
- // set up read/writing class
- IFVERBOSE(1) {
- PrintUserTime("Created input-output object");
- }
-
- IOWrapper* ioWrapper = new IOWrapper();
- if (ioWrapper == NULL) {
- cerr << "Error; Failed to create IO object" << endl;
- exit(1);
- }
-
- // check on weights
- const ScoreComponentCollection& weights = staticData.GetAllWeights();
- IFVERBOSE(2) {
- TRACE_ERR("The global weight vector looks like this: ");
- TRACE_ERR(weights);
- TRACE_ERR("\n");
- }
-
-#ifdef WITH_THREADS
- ThreadPool pool(staticData.ThreadCount());
-#endif
-
- // main loop over set of input sentences
- InputType* source = NULL;
- size_t lineCount = staticData.GetStartTranslationId();
- while(ioWrapper->ReadInput(staticData.GetInputType(),source)) {
- source->SetTranslationId(lineCount);
- IFVERBOSE(1) {
- ResetUserTime();
- }
-
- FeatureFunction::CallChangeSource(source);
-
- // set up task of translating one sentence
- TranslationTask* task = new TranslationTask(source, *ioWrapper);
-
- // execute task
-#ifdef WITH_THREADS
-#ifdef PT_UG
- bool spe = params.isParamSpecified("spe-src");
- if (spe) {
- // simulated post-editing: always run single-threaded!
- task->Run();
- delete task;
- string src,trg,aln;
- UTIL_THROW_IF2(!getline(*ioWrapper->spe_src,src), "[" << HERE << "] "
- << "missing update data for simulated post-editing.");
- UTIL_THROW_IF2(!getline(*ioWrapper->spe_trg,trg), "[" << HERE << "] "
- << "missing update data for simulated post-editing.");
- UTIL_THROW_IF2(!getline(*ioWrapper->spe_aln,aln), "[" << HERE << "] "
- << "missing update data for simulated post-editing.");
- BOOST_FOREACH (PhraseDictionary* pd, PhraseDictionary::GetColl()) {
- Mmsapt* sapt = dynamic_cast<Mmsapt*>(pd);
- if (sapt) sapt->add(src,trg,aln);
- VERBOSE(1,"[" << HERE << " added src] " << src << endl);
- VERBOSE(1,"[" << HERE << " added trg] " << trg << endl);
- VERBOSE(1,"[" << HERE << " added aln] " << aln << endl);
- }
- } else
-#endif
- pool.Submit(task);
-#else
- task->Run();
- delete task;
-#endif
-
- source = NULL; //make sure it doesn't get deleted
- ++lineCount;
- }
-
- // we are done, finishing up
-#ifdef WITH_THREADS
- pool.Stop(true); //flush remaining jobs
-#endif
-
- delete ioWrapper;
- FeatureFunction::Destroy();
-
- } catch (const std::exception &e) {
- std::cerr << "Exception: " << e.what() << std::endl;
- return EXIT_FAILURE;
- }
-
- IFVERBOSE(1) util::PrintUsage(std::cerr);
-
-#ifndef EXIT_RETURN
- //This avoids that destructors are called (it can take a long time)
- exit(EXIT_SUCCESS);
-#else
- return EXIT_SUCCESS;
-#endif
+ return decoder_main(argc, argv);
}
+
diff --git a/moses/ConfusionNet.cpp b/moses/ConfusionNet.cpp
index ce0d5e1c0..f6d7b4168 100644
--- a/moses/ConfusionNet.cpp
+++ b/moses/ConfusionNet.cpp
@@ -67,7 +67,7 @@ ConfusionNet()
stats.createOne();
const StaticData& staticData = StaticData::Instance();
- if (staticData.IsChart()) {
+ if (staticData.IsSyntax()) {
m_defaultLabelSet.insert(StaticData::Instance().GetInputDefaultNonTerminal());
}
UTIL_THROW_IF2(&InputFeature::Instance() == NULL, "Input feature must be specified");
diff --git a/moses/ExportInterface.cpp b/moses/ExportInterface.cpp
new file mode 100644
index 000000000..87affdbed
--- /dev/null
+++ b/moses/ExportInterface.cpp
@@ -0,0 +1,215 @@
+// $Id: ExportInterface.cpp 3045 2010-04-05 13:07:29Z hieuhoang1972 $
+
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2009 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+/**
+ * Moses interface for main function, for single-threaded and multi-threaded.
+ **/
+#include <exception>
+#include <fstream>
+#include <sstream>
+#include <vector>
+
+#include "util/usage.hh"
+
+#ifdef WIN32
+// Include Visual Leak Detector
+//#include <vld.h>
+#endif
+
+#include "IOWrapper.h"
+#include "Hypothesis.h"
+#include "Manager.h"
+#include "StaticData.h"
+#include "TypeDef.h"
+#include "Util.h"
+#include "Timer.h"
+#include "TranslationModel/PhraseDictionary.h"
+#include "FF/StatefulFeatureFunction.h"
+#include "FF/StatelessFeatureFunction.h"
+#include "TranslationTask.h"
+
+#ifdef HAVE_PROTOBUF
+#include "hypergraph.pb.h"
+#endif
+
+#ifdef PT_UG
+#include <boost/foreach.hpp>
+#include "TranslationModel/UG/mmsapt.h"
+#include "TranslationModel/UG/generic/program_options/ug_splice_arglist.h"
+#endif
+
+using namespace std;
+using namespace Moses;
+
+namespace Moses
+{
+
+void OutputFeatureWeightsForHypergraph(std::ostream &outputSearchGraphStream)
+{
+ outputSearchGraphStream.setf(std::ios::fixed);
+ outputSearchGraphStream.precision(6);
+ StaticData::Instance().GetAllWeights().Save(outputSearchGraphStream);
+}
+
+
+} //namespace
+
+/** Called by main function of the command line version of the decoder **/
+int decoder_main(int argc, char** argv)
+{
+ try {
+
+#ifdef HAVE_PROTOBUF
+ GOOGLE_PROTOBUF_VERIFY_VERSION;
+#endif
+
+ // echo command line, if verbose
+ IFVERBOSE(1) {
+ TRACE_ERR("command: ");
+ for(int i=0; i<argc; ++i) TRACE_ERR(argv[i]<<" ");
+ TRACE_ERR(endl);
+ }
+
+ // set number of significant decimals in output
+ FixPrecision(cout);
+ FixPrecision(cerr);
+
+ // load all the settings into the Parameter class
+ // (stores them as strings, or array of strings)
+ Parameter params;
+ if (!params.LoadParam(argc,argv)) {
+ exit(1);
+ }
+
+
+ // initialize all "global" variables, which are stored in StaticData
+ // note: this also loads models such as the language model, etc.
+ if (!StaticData::LoadDataStatic(&params, argv[0])) {
+ exit(1);
+ }
+
+ // setting "-show-weights" -> just dump out weights and exit
+ if (params.isParamSpecified("show-weights")) {
+ ShowWeights();
+ exit(0);
+ }
+
+ // shorthand for accessing information in StaticData
+ const StaticData& staticData = StaticData::Instance();
+
+
+ //initialise random numbers
+ srand(time(NULL));
+
+ // set up read/writing class
+ IFVERBOSE(1) {
+ PrintUserTime("Created input-output object");
+ }
+
+ IOWrapper* ioWrapper = new IOWrapper();
+ if (ioWrapper == NULL) {
+    cerr << "Error: Failed to create IO object" << endl;
+ exit(1);
+ }
+
+ // check on weights
+ const ScoreComponentCollection& weights = staticData.GetAllWeights();
+ IFVERBOSE(2) {
+ TRACE_ERR("The global weight vector looks like this: ");
+ TRACE_ERR(weights);
+ TRACE_ERR("\n");
+ }
+
+#ifdef WITH_THREADS
+ ThreadPool pool(staticData.ThreadCount());
+#endif
+
+ // main loop over set of input sentences
+ InputType* source = NULL;
+ size_t lineCount = staticData.GetStartTranslationId();
+ while(ioWrapper->ReadInput(staticData.GetInputType(),source)) {
+ source->SetTranslationId(lineCount);
+ IFVERBOSE(1) {
+ ResetUserTime();
+ }
+
+ FeatureFunction::CallChangeSource(source);
+
+ // set up task of translating one sentence
+ TranslationTask* task = new TranslationTask(source, *ioWrapper);
+
+ // execute task
+#ifdef WITH_THREADS
+#ifdef PT_UG
+ bool spe = params.isParamSpecified("spe-src");
+ if (spe) {
+ // simulated post-editing: always run single-threaded!
+ task->Run();
+ delete task;
+ string src,trg,aln;
+ UTIL_THROW_IF2(!getline(*ioWrapper->spe_src,src), "[" << HERE << "] "
+ << "missing update data for simulated post-editing.");
+ UTIL_THROW_IF2(!getline(*ioWrapper->spe_trg,trg), "[" << HERE << "] "
+ << "missing update data for simulated post-editing.");
+ UTIL_THROW_IF2(!getline(*ioWrapper->spe_aln,aln), "[" << HERE << "] "
+ << "missing update data for simulated post-editing.");
+ BOOST_FOREACH (PhraseDictionary* pd, PhraseDictionary::GetColl()) {
+ Mmsapt* sapt = dynamic_cast<Mmsapt*>(pd);
+ if (sapt) sapt->add(src,trg,aln);
+ VERBOSE(1,"[" << HERE << " added src] " << src << endl);
+ VERBOSE(1,"[" << HERE << " added trg] " << trg << endl);
+ VERBOSE(1,"[" << HERE << " added aln] " << aln << endl);
+ }
+ } else
+#endif
+ pool.Submit(task);
+#else
+ task->Run();
+ delete task;
+#endif
+
+ source = NULL; //make sure it doesn't get deleted
+ ++lineCount;
+ }
+
+ // we are done, finishing up
+#ifdef WITH_THREADS
+ pool.Stop(true); //flush remaining jobs
+#endif
+
+ delete ioWrapper;
+ FeatureFunction::Destroy();
+
+ } catch (const std::exception &e) {
+ std::cerr << "Exception: " << e.what() << std::endl;
+ return EXIT_FAILURE;
+ }
+
+ IFVERBOSE(1) util::PrintUsage(std::cerr);
+
+#ifndef EXIT_RETURN
+  //This avoids calling destructors (which can take a long time)
+ exit(EXIT_SUCCESS);
+#else
+ return EXIT_SUCCESS;
+#endif
+}
+
diff --git a/moses-cmd/Main.h b/moses/ExportInterface.h
index 49fee0219..8f5b3b7f6 100644
--- a/moses-cmd/Main.h
+++ b/moses/ExportInterface.h
@@ -38,5 +38,5 @@ POSSIBILITY OF SUCH DAMAGE.
class IOWrapper;
-int main(int argc, char* argv[]);
+int decoder_main(int argc, char* argv[]);
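The three diffs above move the whole decoder body out of moses-cmd/Main.cpp into the moses library (ExportInterface.cpp), renaming main to decoder_main and leaving the executable as a one-line forwarder. Any other binary linked against the library can reuse the complete command-line decoder the same way; a hedged sketch (the frontend and any extra flag handling are hypothetical):

#include "moses/ExportInterface.h"

int main(int argc, char** argv)
{
  // ... strip or rewrite frontend-specific arguments here if needed ...
  return decoder_main(argc, argv);  // full decoder loop from the library
}
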
diff --git a/moses/FF/BleuScoreFeature.cpp b/moses/FF/BleuScoreFeature.cpp
index 5be3b0b6b..24887c373 100644
--- a/moses/FF/BleuScoreFeature.cpp
+++ b/moses/FF/BleuScoreFeature.cpp
@@ -27,7 +27,7 @@ int BleuScoreState::Compare(const FFState& o) const
if (&o == this)
return 0;
- if (StaticData::Instance().IsChart())
+ if (StaticData::Instance().IsSyntax())
return 0;
const BleuScoreState& other = dynamic_cast<const BleuScoreState&>(o);
diff --git a/moses/FF/InputFeature.cpp b/moses/FF/InputFeature.cpp
index 39535f58f..10e5347e4 100644
--- a/moses/FF/InputFeature.cpp
+++ b/moses/FF/InputFeature.cpp
@@ -52,15 +52,15 @@ void InputFeature::EvaluateWithSourceContext(const InputType &input
, ScoreComponentCollection *estimatedFutureScore) const
{
if (m_legacy) {
- //binary phrase-table does input feature itself
- return;
+ //binary phrase-table does input feature itself
+ return;
}
- /*
- const ScorePair *scores = inputPath.GetInputScore();
- if (scores) {
- scoreBreakdown.PlusEquals(this, *scores);
+ else if (input.GetType() == WordLatticeInput){
+ const ScorePair *scores = inputPath.GetInputScore();
+ if (scores) {
+ scoreBreakdown.PlusEquals(this, *scores);
+ }
}
- */
}
} // namespace
diff --git a/moses/FF/LexicalReordering/LexicalReordering.cpp b/moses/FF/LexicalReordering/LexicalReordering.cpp
index 7be2f1d9e..32693984e 100644
--- a/moses/FF/LexicalReordering/LexicalReordering.cpp
+++ b/moses/FF/LexicalReordering/LexicalReordering.cpp
@@ -1,4 +1,5 @@
#include <sstream>
+#include <boost/algorithm/string/predicate.hpp>
#include "moses/FF/FFState.h"
#include "LexicalReordering.h"
@@ -6,113 +7,129 @@
#include "moses/StaticData.h"
using namespace std;
+using namespace boost::algorithm;
namespace Moses
{
-LexicalReordering::LexicalReordering(const std::string &line)
- : StatefulFeatureFunction(line)
-{
- std::cerr << "Initializing LexicalReordering.." << std::endl;
-
- map<string,string> sparseArgs;
- m_haveDefaultScores = false;
- for (size_t i = 0; i < m_args.size(); ++i) {
- const vector<string> &args = m_args[i];
-
- if (args[0] == "type") {
- m_configuration.reset(new LexicalReorderingConfiguration(args[1]));
- m_configuration->SetScoreProducer(this);
- m_modelTypeString = m_configuration->GetModelString();
- } else if (args[0] == "input-factor") {
- m_factorsF =Tokenize<FactorType>(args[1]);
- } else if (args[0] == "output-factor") {
- m_factorsE =Tokenize<FactorType>(args[1]);
- } else if (args[0] == "path") {
- m_filePath = args[1];
- } else if (args[0].substr(0,7) == "sparse-") {
- sparseArgs[args[0].substr(7)] = args[1];
- } else if (args[0] == "default-scores") {
- vector<string> tokens = Tokenize(args[1],",");
- for(size_t i=0; i<tokens.size(); i++) {
- m_defaultScores.push_back( TransformScore( Scan<float>(tokens[i]) ) );
+
+ LexicalReordering::
+ LexicalReordering(const std::string &line)
+ : StatefulFeatureFunction(line)
+ {
+ VERBOSE(1, "Initializing LexicalReordering.." << std::endl);
+
+ map<string,string> sparseArgs;
+ m_haveDefaultScores = false;
+ for (size_t i = 0; i < m_args.size(); ++i) {
+ const vector<string> &args = m_args[i];
+
+ if (args[0] == "type") {
+ m_configuration.reset(new LexicalReorderingConfiguration(args[1]));
+ m_configuration->SetScoreProducer(this);
+ m_modelTypeString = m_configuration->GetModelString();
+ } else if (args[0] == "input-factor") {
+ m_factorsF =Tokenize<FactorType>(args[1]);
+ } else if (args[0] == "output-factor") {
+ m_factorsE =Tokenize<FactorType>(args[1]);
+ } else if (args[0] == "path") {
+ m_filePath = args[1];
+ } else if (starts_with(args[0], "sparse-")) {
+ sparseArgs[args[0].substr(7)] = args[1];
+ } else if (args[0] == "default-scores") {
+ vector<string> tokens = Tokenize(args[1],",");
+ for(size_t i=0; i<tokens.size(); i++) {
+ m_defaultScores.push_back( TransformScore( Scan<float>(tokens[i]) ) );
+ }
+ m_haveDefaultScores = true;
+ } else {
+ UTIL_THROW(util::Exception,"Unknown argument " + args[0]);
}
- m_haveDefaultScores = true;
- } else {
- UTIL_THROW(util::Exception,"Unknown argument " + args[0]);
}
- }
- switch(m_configuration->GetCondition()) {
- case LexicalReorderingConfiguration::FE:
- case LexicalReorderingConfiguration::E:
- if(m_factorsE.empty()) {
- UTIL_THROW(util::Exception,"TL factor mask for lexical reordering is unexpectedly empty");
- }
- if(m_configuration->GetCondition() == LexicalReorderingConfiguration::E)
- break; // else fall through
- case LexicalReorderingConfiguration::F:
- if(m_factorsF.empty()) {
- UTIL_THROW(util::Exception,"SL factor mask for lexical reordering is unexpectedly empty");
+ switch(m_configuration->GetCondition()) {
+ case LexicalReorderingConfiguration::FE:
+ case LexicalReorderingConfiguration::E:
+ if(m_factorsE.empty()) {
+ UTIL_THROW(util::Exception,
+ "TL factor mask for lexical reordering is unexpectedly empty");
+ }
+ if(m_configuration->GetCondition() == LexicalReorderingConfiguration::E)
+ break; // else fall through
+ case LexicalReorderingConfiguration::F:
+ if(m_factorsF.empty()) {
+ UTIL_THROW(util::Exception,
+ "SL factor mask for lexical reordering is unexpectedly empty");
+ }
+ break;
+ default:
+ UTIL_THROW(util::Exception,"Unknown conditioning option!");
}
- break;
- default:
- UTIL_THROW(util::Exception,"Unknown conditioning option!");
- }
- // sanity check: number of default scores
- if (m_haveDefaultScores) {
- if(m_defaultScores.size() != m_configuration->GetNumScoreComponents()) {
- UTIL_THROW(util::Exception,"wrong number of default scores (" << m_defaultScores.size() << ") for lexicalized reordering model (expected " << m_configuration->GetNumScoreComponents() << ")");
- }
+ // sanity check: number of default scores
+ if (m_haveDefaultScores)
+ {
+ if(m_defaultScores.size() != m_configuration->GetNumScoreComponents())
+ {
+ UTIL_THROW(util::Exception,"wrong number of default scores ("
+ << m_defaultScores.size()
+ << ") for lexicalized reordering model (expected "
+ << m_configuration->GetNumScoreComponents() << ")");
+ }
+ }
+
+ m_configuration->ConfigureSparse(sparseArgs, this);
}
- m_configuration->ConfigureSparse(sparseArgs, this);
-}
-
-LexicalReordering::~LexicalReordering()
-{
-}
-
-void LexicalReordering::Load()
-{
- m_table.reset(LexicalReorderingTable::LoadAvailable(m_filePath, m_factorsF, m_factorsE, std::vector<FactorType>()));
-}
-
-Scores LexicalReordering::GetProb(const Phrase& f, const Phrase& e) const
-{
- return m_table->GetScore(f, e, Phrase(ARRAY_SIZE_INCR));
-}
-
-FFState* LexicalReordering::EvaluateWhenApplied(const Hypothesis& hypo,
- const FFState* prev_state,
- ScoreComponentCollection* out) const
-{
- VERBOSE(3,"LexicalReordering::Evaluate(const Hypothesis& hypo,...) START" << std::endl);
- Scores score(GetNumScoreComponents(), 0);
- const LexicalReorderingState *prev = dynamic_cast<const LexicalReorderingState *>(prev_state);
- LexicalReorderingState *next_state = prev->Expand(hypo.GetTranslationOption(), hypo.GetInput(), out);
+ LexicalReordering::
+ ~LexicalReordering()
+ {
+ }
+
+ void LexicalReordering::Load()
+ {
+ typedef LexicalReorderingTable LRT;
+ m_table.reset(LRT::LoadAvailable(m_filePath, m_factorsF, m_factorsE,
+ std::vector<FactorType>()));
+ }
- out->PlusEquals(this, score);
- VERBOSE(3,"LexicalReordering::Evaluate(const Hypothesis& hypo,...) END" << std::endl);
+ Scores LexicalReordering::GetProb(const Phrase& f, const Phrase& e) const
+ {
+ return m_table->GetScore(f, e, Phrase(ARRAY_SIZE_INCR));
+ }
- return next_state;
-}
+ FFState*
+ LexicalReordering::
+ EvaluateWhenApplied(const Hypothesis& hypo,
+ const FFState* prev_state,
+ ScoreComponentCollection* out) const
+ {
+ VERBOSE(3,"LexicalReordering::Evaluate(const Hypothesis& hypo,...) START"
+ << std::endl);
+ Scores score(GetNumScoreComponents(), 0);
+ const LexicalReorderingState *prev = dynamic_cast<const LexicalReorderingState *>(prev_state);
+ LexicalReorderingState *next_state = prev->Expand(hypo.GetTranslationOption(), hypo.GetInput(), out);
+
+ out->PlusEquals(this, score);
+ VERBOSE(3,"LexicalReordering::Evaluate(const Hypothesis& hypo,...) END" << std::endl);
+
+ return next_state;
+ }
-const FFState* LexicalReordering::EmptyHypothesisState(const InputType &input) const
-{
- return m_configuration->CreateLexicalReorderingState(input);
-}
+ const FFState* LexicalReordering::EmptyHypothesisState(const InputType &input) const
+ {
+ return m_configuration->CreateLexicalReorderingState(input);
+ }
-bool LexicalReordering::IsUseable(const FactorMask &mask) const
-{
- for (size_t i = 0; i < m_factorsE.size(); ++i) {
- const FactorType &factor = m_factorsE[i];
- if (!mask[factor]) {
- return false;
+ bool LexicalReordering::IsUseable(const FactorMask &mask) const
+ {
+ for (size_t i = 0; i < m_factorsE.size(); ++i) {
+ const FactorType &factor = m_factorsE[i];
+ if (!mask[factor]) {
+ return false;
+ }
}
- }
- return true;
+ return true;
-}
+ }
}
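In the constructor above, any feature argument whose key starts with "sparse-" has that prefix stripped and is routed into a separate map that is later handed to ConfigureSparse. A standalone sketch of that routing; the key/value pairs are hypothetical, not a real moses.ini line:

#include <iostream>
#include <map>
#include <string>
#include <boost/algorithm/string/predicate.hpp>

int main()
{
  const char* keys[] = { "type", "path", "sparse-weight", "sparse-id" };
  const char* vals[] = { "wbe-msd-bidirectional-fe", "reo-table.gz", "0.5", "lr0" };
  std::map<std::string, std::string> sparseArgs;
  for (int i = 0; i < 4; ++i) {
    std::string key = keys[i];
    if (boost::algorithm::starts_with(key, "sparse-"))
      sparseArgs[key.substr(7)] = vals[i];  // drop the 7-char "sparse-" prefix
    // other keys fall through to the main option handling, as in the ctor
  }
  for (std::map<std::string, std::string>::const_iterator it = sparseArgs.begin();
       it != sparseArgs.end(); ++it)
    std::cout << it->first << " -> " << it->second << "\n";
  return 0;
}
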
diff --git a/moses/FF/LexicalReordering/LexicalReorderingState.cpp b/moses/FF/LexicalReordering/LexicalReorderingState.cpp
index 567d1b713..0bb9344bf 100644
--- a/moses/FF/LexicalReordering/LexicalReorderingState.cpp
+++ b/moses/FF/LexicalReordering/LexicalReorderingState.cpp
@@ -1,4 +1,4 @@
-
+// -*- c++ -*-
#include <vector>
#include <string>
@@ -14,506 +14,521 @@
namespace Moses
{
-size_t LexicalReorderingConfiguration::GetNumberOfTypes() const
-{
- switch (m_modelType) {
- case LexicalReorderingConfiguration::MSD:
- return 3;
- break;
- case LexicalReorderingConfiguration::MSLR:
- return 4;
- break;
- default:
- return 2;
- }
-}
+ typedef LexicalReorderingConfiguration LexReoConf;
-size_t LexicalReorderingConfiguration::GetNumScoreComponents() const
-{
- size_t score_per_dir = m_collapseScores ? 1 : GetNumberOfTypes();
- if (m_direction == Bidirectional) {
- return 2 * score_per_dir + m_additionalScoreComponents;
- } else {
- return score_per_dir + m_additionalScoreComponents;
+ bool
+ IsMonotonicStep(WordsRange const& prev, // words range of last source phrase
+ WordsRange const& cur, // words range of current source phrase
+ WordsBitmap const& cov) // coverage bitmap
+ {
+ size_t e = prev.GetEndPos() + 1;
+ size_t s = cur.GetStartPos();
+ return (s == e || (s >= e && !cov.GetValue(e)));
}
-}
-
-void LexicalReorderingConfiguration::ConfigureSparse
-(const std::map<std::string,std::string>& sparseArgs, const LexicalReordering* producer)
-{
- if (sparseArgs.size()) {
- m_sparse.reset(new SparseReordering(sparseArgs, producer));
+
+ bool
+ IsSwap(WordsRange const& prev, WordsRange const& cur, WordsBitmap const& cov)
+ {
+ size_t s = prev.GetStartPos();
+ size_t e = cur.GetEndPos();
+ return (e+1 == s || (e < s && !cov.GetValue(s-1)));
}
-}
-void LexicalReorderingConfiguration::SetAdditionalScoreComponents(size_t number)
-{
- m_additionalScoreComponents = number;
-}
-
-LexicalReorderingConfiguration::LexicalReorderingConfiguration(const std::string &modelType)
- : m_modelString(modelType), m_scoreProducer(NULL), m_modelType(None), m_phraseBased(true), m_collapseScores(false), m_direction(Backward), m_additionalScoreComponents(0)
-{
- std::vector<std::string> config = Tokenize<std::string>(modelType, "-");
-
- for (size_t i=0; i<config.size(); ++i) {
- if (config[i] == "hier") {
- m_phraseBased = false;
- } else if (config[i] == "phrase") {
- m_phraseBased = true;
- } else if (config[i] == "wbe") {
- m_phraseBased = true;
- // no word-based decoding available, fall-back to phrase-based
- // This is the old lexical reordering model combination of moses
- } else if (config[i] == "msd") {
- m_modelType = MSD;
- } else if (config[i] == "mslr") {
- m_modelType = MSLR;
- } else if (config[i] == "monotonicity") {
- m_modelType = Monotonic;
- } else if (config[i] == "leftright") {
- m_modelType = LeftRight;
- } else if (config[i] == "backward" || config[i] == "unidirectional") {
- // note: unidirectional is deprecated, use backward instead
- m_direction = Backward;
- } else if (config[i] == "forward") {
- m_direction = Forward;
- } else if (config[i] == "bidirectional") {
- m_direction = Bidirectional;
- } else if (config[i] == "f") {
- m_condition = F;
- } else if (config[i] == "fe") {
- m_condition = FE;
- } else if (config[i] == "collapseff") {
- m_collapseScores = true;
- } else if (config[i] == "allff") {
- m_collapseScores = false;
- } else {
- std::cerr << "Illegal part in the lexical reordering configuration string: " << config[i] << std::endl;
- exit(1);
- }
+ size_t
+ LexicalReorderingConfiguration::
+ GetNumberOfTypes() const
+ {
+ return ((m_modelType == LexReoConf::MSD) ? 3 :
+ (m_modelType == LexReoConf::MSLR) ? 4 : 2);
}
-
- if (m_modelType == None) {
- std::cerr << "You need to specify the type of the reordering model (msd, monotonicity,...)" << std::endl;
- exit(1);
+
+ size_t
+ LexicalReorderingConfiguration::
+ GetNumScoreComponents() const
+ {
+ size_t score_per_dir = m_collapseScores ? 1 : GetNumberOfTypes();
+ return ((m_direction == Bidirectional)
+ ? 2 * score_per_dir + m_additionalScoreComponents
+ : score_per_dir + m_additionalScoreComponents);
}
-}
-
-LexicalReorderingState *LexicalReorderingConfiguration::CreateLexicalReorderingState(const InputType &input) const
-{
- LexicalReorderingState *bwd = NULL, *fwd = NULL;
- size_t offset = 0;
-
- switch(m_direction) {
- case Backward:
- case Bidirectional:
- if (m_phraseBased) { //Same for forward and backward
- bwd = new PhraseBasedReorderingState(*this, LexicalReorderingConfiguration::Backward, offset);
- } else {
- bwd = new HierarchicalReorderingBackwardState(*this, offset);
- }
- offset += m_collapseScores ? 1 : GetNumberOfTypes();
- if (m_direction == Backward)
- return bwd; // else fall through
- case Forward:
- if (m_phraseBased) { //Same for forward and backward
- fwd = new PhraseBasedReorderingState(*this, LexicalReorderingConfiguration::Forward, offset);
- } else {
- fwd = new HierarchicalReorderingForwardState(*this, input.GetSize(), offset);
- }
- offset += m_collapseScores ? 1 : GetNumberOfTypes();
- if (m_direction == Forward)
- return fwd;
+
+ void
+ LexicalReorderingConfiguration::
+ ConfigureSparse(std::map<std::string,std::string> const& sparseArgs,
+ const LexicalReordering* producer)
+ {
+ if (sparseArgs.size())
+ m_sparse.reset(new SparseReordering(sparseArgs, producer));
}
- return new BidirectionalReorderingState(*this, bwd, fwd, 0);
-}
+ void LexicalReorderingConfiguration::SetAdditionalScoreComponents(size_t number)
+ {
+ m_additionalScoreComponents = number;
+ }
-void LexicalReorderingState::CopyScores(ScoreComponentCollection* accum, const TranslationOption &topt, const InputType& input, ReorderingType reoType) const
-{
- // don't call this on a bidirectional object
- UTIL_THROW_IF2(m_direction != LexicalReorderingConfiguration::Backward && m_direction != LexicalReorderingConfiguration::Forward,
- "Unknown direction: " << m_direction);
- const TranslationOption* relevantOpt = &topt;
- if (m_direction != LexicalReorderingConfiguration::Backward) relevantOpt = m_prevOption;
- const Scores *cachedScores = relevantOpt->GetLexReorderingScores(m_configuration.GetScoreProducer());
-
- // look up applicable score from vectore of scores
- if(cachedScores) {
- Scores scores(m_configuration.GetScoreProducer()->GetNumScoreComponents(),0);
-
- const Scores &scoreSet = *cachedScores;
- if(m_configuration.CollapseScores()) {
- scores[m_offset] = scoreSet[m_offset + reoType];
- } else {
- std::fill(scores.begin() + m_offset, scores.begin() + m_offset + m_configuration.GetNumberOfTypes(), 0);
- scores[m_offset + reoType] = scoreSet[m_offset + reoType];
- }
- accum->PlusEquals(m_configuration.GetScoreProducer(), scores);
+ LexicalReorderingConfiguration::
+ LexicalReorderingConfiguration(const std::string &modelType)
+ : m_modelString(modelType)
+ , m_scoreProducer(NULL)
+ , m_modelType(None)
+ , m_phraseBased(true)
+ , m_collapseScores(false)
+ , m_direction(Backward)
+ , m_additionalScoreComponents(0)
+ {
+ std::vector<std::string> config = Tokenize<std::string>(modelType, "-");
+
+ for (size_t i=0; i<config.size(); ++i)
+ {
+ if (config[i] == "hier") { m_phraseBased = false; }
+ else if (config[i] == "phrase") { m_phraseBased = true; }
+ else if (config[i] == "wbe") { m_phraseBased = true; }
+ // no word-based decoding available, fall-back to phrase-based
+ // This is the old lexical reordering model combination of moses
+
+ else if (config[i] == "msd") { m_modelType = MSD; }
+ else if (config[i] == "mslr") { m_modelType = MSLR; }
+ else if (config[i] == "monotonicity") { m_modelType = Monotonic; }
+ else if (config[i] == "leftright") { m_modelType = LeftRight; }
+
+ else if (config[i] == "backward") { m_direction = Backward; }
+
+ // note: unidirectional is deprecated, use backward instead
+ else if (config[i] == "unidirectional") { m_direction = Backward; }
+ else if (config[i] == "forward") { m_direction = Forward; }
+ else if (config[i] == "bidirectional") { m_direction = Bidirectional; }
+
+ else if (config[i] == "f") { m_condition = F; }
+ else if (config[i] == "fe") { m_condition = FE; }
+
+ else if (config[i] == "collapseff") { m_collapseScores = true; }
+ else if (config[i] == "allff") { m_collapseScores = false; }
+ else
+ {
+ std::cerr
+ << "Illegal part in the lexical reordering configuration string: "
+ << config[i] << std::endl;
+ exit(1);
+ }
+ }
+
+ if (m_modelType == None)
+ {
+ std::cerr
+ << "You need to specify the type of the reordering model "
+ << "(msd, monotonicity,...)" << std::endl;
+ exit(1);
+ }
}
- // else: use default scores (if specified)
- else if (m_configuration.GetScoreProducer()->GetHaveDefaultScores()) {
- Scores scores(m_configuration.GetScoreProducer()->GetNumScoreComponents(),0);
- if(m_configuration.CollapseScores()) {
- scores[m_offset] = m_configuration.GetScoreProducer()->GetDefaultScore(m_offset + reoType);
- } else {
- scores[m_offset + reoType] = m_configuration.GetScoreProducer()->GetDefaultScore(m_offset + reoType);
- }
- accum->PlusEquals(m_configuration.GetScoreProducer(), scores);
+
+ LexicalReorderingState *
+ LexicalReorderingConfiguration::
+ CreateLexicalReorderingState(const InputType &input) const
+ {
+ LexicalReorderingState *bwd = NULL, *fwd = NULL;
+ size_t offset = 0;
+
+ switch(m_direction)
+ {
+ case Backward:
+ case Bidirectional:
+ bwd = (m_phraseBased
+	       ? new PhraseBasedReorderingState(*this, Backward, offset)
+ : new HierarchicalReorderingBackwardState(*this, offset));
+ offset += m_collapseScores ? 1 : GetNumberOfTypes();
+ if (m_direction == Backward) return bwd; // else fall through
+ case Forward:
+ fwd = (m_phraseBased
+ ? new PhraseBasedReorderingState(*this, Forward, offset)
+ : new HierarchicalReorderingForwardState(*this, input.GetSize(),
+ offset));
+ offset += m_collapseScores ? 1 : GetNumberOfTypes();
+ if (m_direction == Forward) return fwd;
+ }
+ return new BidirectionalReorderingState(*this, bwd, fwd, 0);
}
- // note: if no default score, no cost
-
- const SparseReordering* sparse = m_configuration.GetSparseReordering();
- if (sparse) sparse->CopyScores(*relevantOpt, m_prevOption, input, reoType, m_direction, accum);
-
-}
-
-int LexicalReorderingState::ComparePrevScores(const TranslationOption *other) const
-{
- const Scores* myPrevScores = m_prevOption->GetLexReorderingScores(m_configuration.GetScoreProducer());
- const Scores* otherPrevScores = other->GetLexReorderingScores(m_configuration.GetScoreProducer());
+ void
+ LexicalReorderingState::
+ CopyScores(ScoreComponentCollection* accum,
+ const TranslationOption &topt,
+ const InputType& input,
+ ReorderingType reoType) const
+ {
+ // don't call this on a bidirectional object
+ UTIL_THROW_IF2(m_direction != Backward && m_direction != Forward,
+ "Unknown direction: " << m_direction);
+
+ TranslationOption const*
+ relevantOpt = (m_direction == Backward) ? &topt : m_prevOption;
+
+ LexicalReordering* reotable = m_configuration.GetScoreProducer();
+ Scores const* cachedScores = relevantOpt->GetLexReorderingScores(reotable);
+
+ size_t off_remote = m_offset + reoType;
+ size_t off_local = m_configuration.CollapseScores() ? m_offset : off_remote;
+
+      // look up applicable score from vector of scores
+      if(cachedScores)
+	{
+	  Scores scores(reotable->GetNumScoreComponents(),0);
+	  scores[off_local] = (*cachedScores)[off_remote];
+ accum->PlusEquals(reotable, scores);
+ }
+
+ // else: use default scores (if specified)
+ else if (reotable->GetHaveDefaultScores())
+ {
+ Scores scores(reotable->GetNumScoreComponents(),0);
+ scores[off_local] = reotable->GetDefaultScore(off_remote);
+ accum->PlusEquals(m_configuration.GetScoreProducer(), scores);
+ }
+ // note: if no default score, no cost
+
+ const SparseReordering* sparse = m_configuration.GetSparseReordering();
+ if (sparse) sparse->CopyScores(*relevantOpt, m_prevOption, input, reoType,
+ m_direction, accum);
+ }
+
- if(myPrevScores == otherPrevScores)
- return 0;
+ int
+ LexicalReorderingState::
+ ComparePrevScores(const TranslationOption *other) const
+ {
+ LexicalReordering* reotable = m_configuration.GetScoreProducer();
+ const Scores* myPrevScores = m_prevOption->GetLexReorderingScores(reotable);
+ const Scores* otherPrevScores = other->GetLexReorderingScores(reotable);
- // The pointers are NULL if a phrase pair isn't found in the reordering table.
- if(otherPrevScores == NULL)
- return -1;
- if(myPrevScores == NULL)
- return 1;
+ if(myPrevScores == otherPrevScores)
+ return 0;
- for(size_t i = m_offset; i < m_offset + m_configuration.GetNumberOfTypes(); i++)
- if((*myPrevScores)[i] < (*otherPrevScores)[i])
+ // The pointers are NULL if a phrase pair isn't found in the reordering table.
+ if(otherPrevScores == NULL)
return -1;
- else if((*myPrevScores)[i] > (*otherPrevScores)[i])
+ if(myPrevScores == NULL)
return 1;
- return 0;
-}
+ for(size_t i = m_offset; i < m_offset + m_configuration.GetNumberOfTypes(); i++)
+ if((*myPrevScores)[i] < (*otherPrevScores)[i])
+ return -1;
+ else if((*myPrevScores)[i] > (*otherPrevScores)[i])
+ return 1;
-bool PhraseBasedReorderingState::m_useFirstBackwardScore = true;
+ return 0;
+ }
-PhraseBasedReorderingState::PhraseBasedReorderingState(const PhraseBasedReorderingState *prev, const TranslationOption &topt)
- : LexicalReorderingState(prev, topt), m_prevRange(topt.GetSourceWordsRange()), m_first(false) {}
+ bool PhraseBasedReorderingState::m_useFirstBackwardScore = true;
+ PhraseBasedReorderingState::PhraseBasedReorderingState(const PhraseBasedReorderingState *prev, const TranslationOption &topt)
+ : LexicalReorderingState(prev, topt), m_prevRange(topt.GetSourceWordsRange()), m_first(false) {}
-PhraseBasedReorderingState::PhraseBasedReorderingState(const LexicalReorderingConfiguration &config,
- LexicalReorderingConfiguration::Direction dir, size_t offset)
- : LexicalReorderingState(config, dir, offset), m_prevRange(NOT_FOUND,NOT_FOUND), m_first(true) {}
+ PhraseBasedReorderingState::PhraseBasedReorderingState(const LexReoConf &config,
+ LexReoConf::Direction dir, size_t offset)
+ : LexicalReorderingState(config, dir, offset), m_prevRange(NOT_FOUND,NOT_FOUND), m_first(true) {}
-int PhraseBasedReorderingState::Compare(const FFState& o) const
-{
- if (&o == this)
- return 0;
- const PhraseBasedReorderingState* other = static_cast<const PhraseBasedReorderingState*>(&o);
- if (m_prevRange == other->m_prevRange) {
- if (m_direction == LexicalReorderingConfiguration::Forward) {
- return ComparePrevScores(other->m_prevOption);
- } else {
+ int PhraseBasedReorderingState::Compare(const FFState& o) const
+ {
+ if (&o == this)
return 0;
+
+ const PhraseBasedReorderingState* other = static_cast<const PhraseBasedReorderingState*>(&o);
+ if (m_prevRange == other->m_prevRange) {
+ if (m_direction == LexReoConf::Forward) {
+ return ComparePrevScores(other->m_prevOption);
+ } else {
+ return 0;
+ }
+ } else if (m_prevRange < other->m_prevRange) {
+ return -1;
}
- } else if (m_prevRange < other->m_prevRange) {
- return -1;
+ return 1;
}
- return 1;
-}
-LexicalReorderingState* PhraseBasedReorderingState::Expand(const TranslationOption& topt, const InputType& input,ScoreComponentCollection* scores) const
-{
- ReorderingType reoType;
- const WordsRange currWordsRange = topt.GetSourceWordsRange();
- const LexicalReorderingConfiguration::ModelType modelType = m_configuration.GetModelType();
-
- if ((m_direction != LexicalReorderingConfiguration::Forward && m_useFirstBackwardScore) || !m_first) {
- if (modelType == LexicalReorderingConfiguration::MSD) {
- reoType = GetOrientationTypeMSD(currWordsRange);
- } else if (modelType == LexicalReorderingConfiguration::MSLR) {
- reoType = GetOrientationTypeMSLR(currWordsRange);
- } else if (modelType == LexicalReorderingConfiguration::Monotonic) {
- reoType = GetOrientationTypeMonotonic(currWordsRange);
- } else {
- reoType = GetOrientationTypeLeftRight(currWordsRange);
+ LexicalReorderingState* PhraseBasedReorderingState::Expand(const TranslationOption& topt, const InputType& input,ScoreComponentCollection* scores) const
+ {
+ ReorderingType reoType;
+ const WordsRange currWordsRange = topt.GetSourceWordsRange();
+ const LexReoConf::ModelType modelType = m_configuration.GetModelType();
+
+ if ((m_direction != LexReoConf::Forward && m_useFirstBackwardScore) || !m_first) {
+ if (modelType == LexReoConf::MSD) {
+ reoType = GetOrientationTypeMSD(currWordsRange);
+ } else if (modelType == LexReoConf::MSLR) {
+ reoType = GetOrientationTypeMSLR(currWordsRange);
+ } else if (modelType == LexReoConf::Monotonic) {
+ reoType = GetOrientationTypeMonotonic(currWordsRange);
+ } else {
+ reoType = GetOrientationTypeLeftRight(currWordsRange);
+ }
+ CopyScores(scores, topt, input, reoType);
}
- CopyScores(scores, topt, input, reoType);
- }
- return new PhraseBasedReorderingState(this, topt);
-}
+ return new PhraseBasedReorderingState(this, topt);
+ }
-LexicalReorderingState::ReorderingType PhraseBasedReorderingState::GetOrientationTypeMSD(WordsRange currRange) const
-{
- if (m_first) {
- if (currRange.GetStartPos() == 0) {
+ LexicalReorderingState::ReorderingType PhraseBasedReorderingState::GetOrientationTypeMSD(WordsRange currRange) const
+ {
+ if (m_first) {
+ if (currRange.GetStartPos() == 0) {
+ return M;
+ } else {
+ return D;
+ }
+ }
+ if (m_prevRange.GetEndPos() == currRange.GetStartPos()-1) {
return M;
- } else {
- return D;
+ } else if (m_prevRange.GetStartPos() == currRange.GetEndPos()+1) {
+ return S;
}
+ return D;
}
- if (m_prevRange.GetEndPos() == currRange.GetStartPos()-1) {
- return M;
- } else if (m_prevRange.GetStartPos() == currRange.GetEndPos()+1) {
- return S;
- }
- return D;
-}
-LexicalReorderingState::ReorderingType PhraseBasedReorderingState::GetOrientationTypeMSLR(WordsRange currRange) const
-{
- if (m_first) {
- if (currRange.GetStartPos() == 0) {
+ LexicalReorderingState::ReorderingType PhraseBasedReorderingState::GetOrientationTypeMSLR(WordsRange currRange) const
+ {
+ if (m_first) {
+ if (currRange.GetStartPos() == 0) {
+ return M;
+ } else {
+ return DR;
+ }
+ }
+ if (m_prevRange.GetEndPos() == currRange.GetStartPos()-1) {
return M;
- } else {
+ } else if (m_prevRange.GetStartPos() == currRange.GetEndPos()+1) {
+ return S;
+ } else if (m_prevRange.GetEndPos() < currRange.GetStartPos()) {
return DR;
}
+ return DL;
}
- if (m_prevRange.GetEndPos() == currRange.GetStartPos()-1) {
- return M;
- } else if (m_prevRange.GetStartPos() == currRange.GetEndPos()+1) {
- return S;
- } else if (m_prevRange.GetEndPos() < currRange.GetStartPos()) {
- return DR;
- }
- return DL;
-}
-LexicalReorderingState::ReorderingType PhraseBasedReorderingState::GetOrientationTypeMonotonic(WordsRange currRange) const
-{
- if ((m_first && currRange.GetStartPos() == 0) ||
- (m_prevRange.GetEndPos() == currRange.GetStartPos()-1)) {
- return M;
+ LexicalReorderingState::ReorderingType PhraseBasedReorderingState::GetOrientationTypeMonotonic(WordsRange currRange) const
+ {
+ if ((m_first && currRange.GetStartPos() == 0) ||
+ (m_prevRange.GetEndPos() == currRange.GetStartPos()-1)) {
+ return M;
+ }
+ return NM;
}
- return NM;
-}
-LexicalReorderingState::ReorderingType PhraseBasedReorderingState::GetOrientationTypeLeftRight(WordsRange currRange) const
-{
- if (m_first ||
- (m_prevRange.GetEndPos() <= currRange.GetStartPos())) {
- return R;
+ LexicalReorderingState::ReorderingType PhraseBasedReorderingState::GetOrientationTypeLeftRight(WordsRange currRange) const
+ {
+ if (m_first ||
+ (m_prevRange.GetEndPos() <= currRange.GetStartPos())) {
+ return R;
+ }
+ return L;
}
- return L;
-}
-///////////////////////////
-//BidirectionalReorderingState
+ ///////////////////////////
+ //BidirectionalReorderingState
-int BidirectionalReorderingState::Compare(const FFState& o) const
-{
- if (&o == this)
- return 0;
-
- const BidirectionalReorderingState &other = static_cast<const BidirectionalReorderingState &>(o);
- if(m_backward->Compare(*other.m_backward) < 0)
- return -1;
- else if(m_backward->Compare(*other.m_backward) > 0)
- return 1;
- else
- return m_forward->Compare(*other.m_forward);
-}
+ int BidirectionalReorderingState::Compare(const FFState& o) const
+ {
+ if (&o == this)
+ return 0;
-LexicalReorderingState* BidirectionalReorderingState::Expand(const TranslationOption& topt, const InputType& input, ScoreComponentCollection* scores) const
-{
- LexicalReorderingState *newbwd = m_backward->Expand(topt,input, scores);
- LexicalReorderingState *newfwd = m_forward->Expand(topt, input, scores);
- return new BidirectionalReorderingState(m_configuration, newbwd, newfwd, m_offset);
-}
+ const BidirectionalReorderingState &other = static_cast<const BidirectionalReorderingState &>(o);
+ if(m_backward->Compare(*other.m_backward) < 0)
+ return -1;
+ else if(m_backward->Compare(*other.m_backward) > 0)
+ return 1;
+ else
+ return m_forward->Compare(*other.m_forward);
+ }
-///////////////////////////
-//HierarchicalReorderingBackwardState
+ LexicalReorderingState* BidirectionalReorderingState::Expand(const TranslationOption& topt, const InputType& input, ScoreComponentCollection* scores) const
+ {
+ LexicalReorderingState *newbwd = m_backward->Expand(topt,input, scores);
+ LexicalReorderingState *newfwd = m_forward->Expand(topt, input, scores);
+ return new BidirectionalReorderingState(m_configuration, newbwd, newfwd, m_offset);
+ }
-HierarchicalReorderingBackwardState::HierarchicalReorderingBackwardState(const HierarchicalReorderingBackwardState *prev,
- const TranslationOption &topt, ReorderingStack reoStack)
- : LexicalReorderingState(prev, topt), m_reoStack(reoStack) {}
+ ///////////////////////////
+ //HierarchicalReorderingBackwardState
-HierarchicalReorderingBackwardState::HierarchicalReorderingBackwardState(const LexicalReorderingConfiguration &config, size_t offset)
- : LexicalReorderingState(config, LexicalReorderingConfiguration::Backward, offset) {}
+ HierarchicalReorderingBackwardState::HierarchicalReorderingBackwardState(const HierarchicalReorderingBackwardState *prev,
+ const TranslationOption &topt, ReorderingStack reoStack)
+ : LexicalReorderingState(prev, topt), m_reoStack(reoStack) {}
+ HierarchicalReorderingBackwardState::HierarchicalReorderingBackwardState(const LexReoConf &config, size_t offset)
+ : LexicalReorderingState(config, LexReoConf::Backward, offset) {}
-int HierarchicalReorderingBackwardState::Compare(const FFState& o) const
-{
- const HierarchicalReorderingBackwardState& other = static_cast<const HierarchicalReorderingBackwardState&>(o);
- return m_reoStack.Compare(other.m_reoStack);
-}
-LexicalReorderingState* HierarchicalReorderingBackwardState::Expand(const TranslationOption& topt, const InputType& input,ScoreComponentCollection* scores) const
-{
+ int HierarchicalReorderingBackwardState::Compare(const FFState& o) const
+ {
+ const HierarchicalReorderingBackwardState& other = static_cast<const HierarchicalReorderingBackwardState&>(o);
+ return m_reoStack.Compare(other.m_reoStack);
+ }
- HierarchicalReorderingBackwardState* nextState = new HierarchicalReorderingBackwardState(this, topt, m_reoStack);
- ReorderingType reoType;
- const LexicalReorderingConfiguration::ModelType modelType = m_configuration.GetModelType();
+ LexicalReorderingState* HierarchicalReorderingBackwardState::Expand(const TranslationOption& topt, const InputType& input,ScoreComponentCollection* scores) const
+ {
- int reoDistance = nextState->m_reoStack.ShiftReduce(topt.GetSourceWordsRange());
+ HierarchicalReorderingBackwardState* nextState = new HierarchicalReorderingBackwardState(this, topt, m_reoStack);
+ ReorderingType reoType;
+ const LexReoConf::ModelType modelType = m_configuration.GetModelType();
- if (modelType == LexicalReorderingConfiguration::MSD) {
- reoType = GetOrientationTypeMSD(reoDistance);
- } else if (modelType == LexicalReorderingConfiguration::MSLR) {
- reoType = GetOrientationTypeMSLR(reoDistance);
- } else if (modelType == LexicalReorderingConfiguration::LeftRight) {
- reoType = GetOrientationTypeLeftRight(reoDistance);
- } else {
- reoType = GetOrientationTypeMonotonic(reoDistance);
- }
+ int reoDistance = nextState->m_reoStack.ShiftReduce(topt.GetSourceWordsRange());
- CopyScores(scores, topt, input, reoType);
- return nextState;
-}
+ if (modelType == LexReoConf::MSD) {
+ reoType = GetOrientationTypeMSD(reoDistance);
+ } else if (modelType == LexReoConf::MSLR) {
+ reoType = GetOrientationTypeMSLR(reoDistance);
+ } else if (modelType == LexReoConf::LeftRight) {
+ reoType = GetOrientationTypeLeftRight(reoDistance);
+ } else {
+ reoType = GetOrientationTypeMonotonic(reoDistance);
+ }
-LexicalReorderingState::ReorderingType HierarchicalReorderingBackwardState::GetOrientationTypeMSD(int reoDistance) const
-{
- if (reoDistance == 1) {
- return M;
- } else if (reoDistance == -1) {
- return S;
+ CopyScores(scores, topt, input, reoType);
+ return nextState;
}
- return D;
-}
-LexicalReorderingState::ReorderingType HierarchicalReorderingBackwardState::GetOrientationTypeMSLR(int reoDistance) const
-{
- if (reoDistance == 1) {
- return M;
- } else if (reoDistance == -1) {
- return S;
- } else if (reoDistance > 1) {
- return DR;
+ LexicalReorderingState::ReorderingType HierarchicalReorderingBackwardState::GetOrientationTypeMSD(int reoDistance) const
+ {
+ if (reoDistance == 1) {
+ return M;
+ } else if (reoDistance == -1) {
+ return S;
+ }
+ return D;
}
- return DL;
-}
-LexicalReorderingState::ReorderingType HierarchicalReorderingBackwardState::GetOrientationTypeMonotonic(int reoDistance) const
-{
- if (reoDistance == 1) {
- return M;
+ LexicalReorderingState::ReorderingType HierarchicalReorderingBackwardState::GetOrientationTypeMSLR(int reoDistance) const
+ {
+ if (reoDistance == 1) {
+ return M;
+ } else if (reoDistance == -1) {
+ return S;
+ } else if (reoDistance > 1) {
+ return DR;
+ }
+ return DL;
}
- return NM;
-}
-LexicalReorderingState::ReorderingType HierarchicalReorderingBackwardState::GetOrientationTypeLeftRight(int reoDistance) const
-{
- if (reoDistance >= 1) {
- return R;
+ LexicalReorderingState::ReorderingType HierarchicalReorderingBackwardState::GetOrientationTypeMonotonic(int reoDistance) const
+ {
+ if (reoDistance == 1) {
+ return M;
+ }
+ return NM;
}
- return L;
-}
+ LexicalReorderingState::ReorderingType HierarchicalReorderingBackwardState::GetOrientationTypeLeftRight(int reoDistance) const
+ {
+ if (reoDistance >= 1) {
+ return R;
+ }
+ return L;
+ }
-///////////////////////////
-//HierarchicalReorderingForwardState
-HierarchicalReorderingForwardState::HierarchicalReorderingForwardState(const LexicalReorderingConfiguration &config, size_t size, size_t offset)
- : LexicalReorderingState(config, LexicalReorderingConfiguration::Forward, offset), m_first(true), m_prevRange(NOT_FOUND,NOT_FOUND), m_coverage(size) {}
+ ///////////////////////////
+ //HierarchicalReorderingForwardState
-HierarchicalReorderingForwardState::HierarchicalReorderingForwardState(const HierarchicalReorderingForwardState *prev, const TranslationOption &topt)
- : LexicalReorderingState(prev, topt), m_first(false), m_prevRange(topt.GetSourceWordsRange()), m_coverage(prev->m_coverage)
-{
- const WordsRange currWordsRange = topt.GetSourceWordsRange();
- m_coverage.SetValue(currWordsRange.GetStartPos(), currWordsRange.GetEndPos(), true);
-}
+ HierarchicalReorderingForwardState::
+ HierarchicalReorderingForwardState(const LexReoConf &config, size_t size, size_t offset)
+ : LexicalReorderingState(config, LexReoConf::Forward, offset), m_first(true), m_prevRange(NOT_FOUND,NOT_FOUND), m_coverage(size) {}
-int HierarchicalReorderingForwardState::Compare(const FFState& o) const
-{
- if (&o == this)
- return 0;
+ HierarchicalReorderingForwardState::HierarchicalReorderingForwardState(const HierarchicalReorderingForwardState *prev, const TranslationOption &topt)
+ : LexicalReorderingState(prev, topt), m_first(false), m_prevRange(topt.GetSourceWordsRange()), m_coverage(prev->m_coverage)
+ {
+ const WordsRange currWordsRange = topt.GetSourceWordsRange();
+ m_coverage.SetValue(currWordsRange.GetStartPos(), currWordsRange.GetEndPos(), true);
+ }
- const HierarchicalReorderingForwardState* other = static_cast<const HierarchicalReorderingForwardState*>(&o);
+ int HierarchicalReorderingForwardState::Compare(const FFState& o) const
+ {
+ if (&o == this)
+ return 0;
- if (m_prevRange == other->m_prevRange) {
- return ComparePrevScores(other->m_prevOption);
- } else if (m_prevRange < other->m_prevRange) {
- return -1;
- }
- return 1;
-}
+ const HierarchicalReorderingForwardState* other = static_cast<const HierarchicalReorderingForwardState*>(&o);
-// For compatibility with the phrase-based reordering model, scoring is one step delayed.
-// The forward model takes determines orientations heuristically as follows:
-// mono: if the next phrase comes after the conditioning phrase and
-// - there is a gap to the right of the conditioning phrase, or
-// - the next phrase immediately follows it
-// swap: if the next phrase goes before the conditioning phrase and
-// - there is a gap to the left of the conditioning phrase, or
-// - the next phrase immediately precedes it
-// dright: if the next phrase follows the conditioning phrase and other stuff comes in between
-// dleft: if the next phrase precedes the conditioning phrase and other stuff comes in between
-
-LexicalReorderingState* HierarchicalReorderingForwardState::Expand(const TranslationOption& topt, const InputType& input,ScoreComponentCollection* scores) const
-{
- const LexicalReorderingConfiguration::ModelType modelType = m_configuration.GetModelType();
- const WordsRange currWordsRange = topt.GetSourceWordsRange();
- // keep track of the current coverage ourselves so we don't need the hypothesis
- WordsBitmap coverage = m_coverage;
- coverage.SetValue(currWordsRange.GetStartPos(), currWordsRange.GetEndPos(), true);
-
- ReorderingType reoType;
-
- if (m_first) {
-
- } else {
- if (modelType == LexicalReorderingConfiguration::MSD) {
- reoType = GetOrientationTypeMSD(currWordsRange, coverage);
- } else if (modelType == LexicalReorderingConfiguration::MSLR) {
- reoType = GetOrientationTypeMSLR(currWordsRange, coverage);
- } else if (modelType == LexicalReorderingConfiguration::Monotonic) {
- reoType = GetOrientationTypeMonotonic(currWordsRange, coverage);
- } else {
- reoType = GetOrientationTypeLeftRight(currWordsRange, coverage);
+ if (m_prevRange == other->m_prevRange) {
+ return ComparePrevScores(other->m_prevOption);
+ } else if (m_prevRange < other->m_prevRange) {
+ return -1;
}
-
- CopyScores(scores, topt, input, reoType);
+ return 1;
}
- return new HierarchicalReorderingForwardState(this, topt);
-}
+ // For compatibility with the phrase-based reordering model, scoring is one step delayed.
+  // The forward model determines orientations heuristically as follows:
+ // mono: if the next phrase comes after the conditioning phrase and
+ // - there is a gap to the right of the conditioning phrase, or
+ // - the next phrase immediately follows it
+ // swap: if the next phrase goes before the conditioning phrase and
+ // - there is a gap to the left of the conditioning phrase, or
+ // - the next phrase immediately precedes it
+  // dright: if the next phrase follows the conditioning phrase
+  //         and other stuff comes in between
+ // dleft: if the next phrase precedes the conditioning phrase
+ // and other stuff comes in between
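+  //
+  // Worked example (illustrative): suppose the conditioning phrase covers
+  // source positions [2,3]. A next phrase starting at 4 is mono (adjacent);
+  // one starting at 6 with position 4 still uncovered is also mono (gap to
+  // the right); one starting at 6 with position 4 covered is dright; one
+  // ending at 1 is swap (adjacent on the left).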
+
+ LexicalReorderingState*
+ HierarchicalReorderingForwardState::
+ Expand(const TranslationOption& topt, const InputType& input,
+ ScoreComponentCollection* scores) const
+ {
+ LexReoConf::ModelType const modelType = m_configuration.GetModelType();
+ WordsRange const& currRange = topt.GetSourceWordsRange();
+
+    // keep track of the current coverage ourselves, so we don't need the hypothesis
+ WordsBitmap cov = m_coverage;
+ cov.SetValue(currRange.GetStartPos(), currRange.GetEndPos(), true);
+
+    if (!m_first)
+      {
+	ReorderingType reoType
+	  = ((modelType == LexReoConf::MSD)
+	     ? GetOrientationTypeMSD(currRange, cov)
+	     : (modelType == LexReoConf::MSLR)
+	     ? GetOrientationTypeMSLR(currRange, cov)
+	     : (modelType == LexReoConf::Monotonic)
+	     ? GetOrientationTypeMonotonic(currRange, cov)
+	     : GetOrientationTypeLeftRight(currRange, cov));
+ CopyScores(scores, topt, input, reoType);
+ }
+
+ return new HierarchicalReorderingForwardState(this, topt);
+ }
-LexicalReorderingState::ReorderingType HierarchicalReorderingForwardState::GetOrientationTypeMSD(WordsRange currRange, WordsBitmap coverage) const
-{
- if (currRange.GetStartPos() > m_prevRange.GetEndPos() &&
- (!coverage.GetValue(m_prevRange.GetEndPos()+1) || currRange.GetStartPos() == m_prevRange.GetEndPos()+1)) {
- return M;
- } else if (currRange.GetEndPos() < m_prevRange.GetStartPos() &&
- (!coverage.GetValue(m_prevRange.GetStartPos()-1) || currRange.GetEndPos() == m_prevRange.GetStartPos()-1)) {
- return S;
+ LexicalReorderingState::ReorderingType
+ HierarchicalReorderingForwardState::
+ GetOrientationTypeMSD(WordsRange currRange, WordsBitmap coverage) const
+ {
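+    // IsMonotonicStep/IsSwap are assumed to be small helpers introduced
+    // alongside this patch; they bundle the "adjacent, or separated only by
+    // uncovered positions" tests that the deleted branches spelled out.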
+ return (IsMonotonicStep(m_prevRange,currRange,coverage) ? M
+ : IsSwap(m_prevRange, currRange, coverage) ? S : D);
}
- return D;
-}
-LexicalReorderingState::ReorderingType HierarchicalReorderingForwardState::GetOrientationTypeMSLR(WordsRange currRange, WordsBitmap coverage) const
-{
- if (currRange.GetStartPos() > m_prevRange.GetEndPos() &&
- (!coverage.GetValue(m_prevRange.GetEndPos()+1) || currRange.GetStartPos() == m_prevRange.GetEndPos()+1)) {
- return M;
- } else if (currRange.GetEndPos() < m_prevRange.GetStartPos() &&
- (!coverage.GetValue(m_prevRange.GetStartPos()-1) || currRange.GetEndPos() == m_prevRange.GetStartPos()-1)) {
- return S;
- } else if (currRange.GetStartPos() > m_prevRange.GetEndPos()) {
- return DR;
+ LexicalReorderingState::ReorderingType
+ HierarchicalReorderingForwardState::
+ GetOrientationTypeMSLR(WordsRange currRange, WordsBitmap coverage) const
+ {
+ return (IsMonotonicStep(m_prevRange,currRange,coverage) ? M
+ : IsSwap(m_prevRange, currRange, coverage) ? S
+ : (currRange.GetStartPos() > m_prevRange.GetEndPos()) ? DR : DL);
}
- return DL;
-}
-LexicalReorderingState::ReorderingType HierarchicalReorderingForwardState::GetOrientationTypeMonotonic(WordsRange currRange, WordsBitmap coverage) const
-{
- if (currRange.GetStartPos() > m_prevRange.GetEndPos() &&
- (!coverage.GetValue(m_prevRange.GetEndPos()+1) || currRange.GetStartPos() == m_prevRange.GetEndPos()+1)) {
- return M;
+ LexicalReorderingState::ReorderingType
+ HierarchicalReorderingForwardState::
+ GetOrientationTypeMonotonic(WordsRange currRange, WordsBitmap coverage) const
+ {
+ return IsMonotonicStep(m_prevRange, currRange, coverage) ? M : NM;
}
- return NM;
-}
-LexicalReorderingState::ReorderingType HierarchicalReorderingForwardState::GetOrientationTypeLeftRight(WordsRange currRange, WordsBitmap /* coverage */) const
-{
- if (currRange.GetStartPos() > m_prevRange.GetEndPos()) {
- return R;
+ LexicalReorderingState::ReorderingType
+ HierarchicalReorderingForwardState::
+ GetOrientationTypeLeftRight(WordsRange currRange, WordsBitmap coverage) const
+ {
+ return currRange.GetStartPos() > m_prevRange.GetEndPos() ? R : L;
}
- return L;
-}
}
diff --git a/moses/FF/LexicalReordering/LexicalReorderingState.h b/moses/FF/LexicalReordering/LexicalReorderingState.h
index 79537f119..6599dae5b 100644
--- a/moses/FF/LexicalReordering/LexicalReorderingState.h
+++ b/moses/FF/LexicalReordering/LexicalReorderingState.h
@@ -94,6 +94,9 @@ private:
//! Abstract class for lexical reordering model states
class LexicalReorderingState : public FFState
{
+public: // an enumerator cannot be typedef'd; alias the configuration class
+  typedef LexicalReorderingConfiguration LexReoConf;
+  enum { Forward = LexReoConf::Forward, Backward = LexReoConf::Backward };
+
public:
virtual int Compare(const FFState& o) const = 0;
virtual LexicalReorderingState* Expand(const TranslationOption& hypo, const InputType& input, ScoreComponentCollection* scores) const = 0;
diff --git a/moses/FF/LexicalReordering/LexicalReorderingTable.cpp b/moses/FF/LexicalReordering/LexicalReorderingTable.cpp
index 2cb9dfc5d..ec79163a7 100644
--- a/moses/FF/LexicalReordering/LexicalReorderingTable.cpp
+++ b/moses/FF/LexicalReordering/LexicalReorderingTable.cpp
@@ -1,7 +1,7 @@
+// -*- c++ -*-
+
#include "LexicalReorderingTable.h"
#include "moses/InputFileStream.h"
-//#include "LVoc.h" //need IPhrase
-
#include "moses/StaticData.h"
#include "moses/TranslationModel/PhraseDictionary.h"
#include "moses/GenerationDictionary.h"
@@ -14,701 +14,598 @@
namespace Moses
{
-/*
- * local helper functions
- */
-//cleans str of leading and tailing spaces
-std::string auxClearString(const std::string& str)
-{
- int i = 0, j = str.size()-1;
- while(i <= j) {
- if(' ' != str[i]) {
- break;
- } else {
- ++i;
+
+  // cleans str of leading and trailing spaces
+ std::string auxClearString(const std::string& str)
+ {
+ int i = 0, j = str.size()-1;
+ while(i <= j) {
+ if(' ' != str[i]) {
+ break;
+ } else {
+ ++i;
+ }
}
- }
- while(j >= i) {
- if(' ' != str[j]) {
- break;
- } else {
- --j;
+ while(j >= i) {
+ if(' ' != str[j]) {
+ break;
+ } else {
+ --j;
+ }
}
+ return str.substr(i,j-i+1);
}
- return str.substr(i,j-i+1);
-}
-void auxAppend(IPhrase& head, const IPhrase& tail)
-{
- head.reserve(head.size()+tail.size());
- for(size_t i = 0; i < tail.size(); ++i) {
- head.push_back(tail[i]);
+ void auxAppend(IPhrase& head, const IPhrase& tail)
+ {
+ head.reserve(head.size()+tail.size());
+ for(size_t i = 0; i < tail.size(); ++i) {
+ head.push_back(tail[i]);
+ }
}
-}
-/*
- * functions for LexicalReorderingTable
- */
-LexicalReorderingTable* LexicalReorderingTable::LoadAvailable(const std::string& filePath, const FactorList& f_factors, const FactorList& e_factors, const FactorList& c_factors)
-{
- //decide use Compact or Tree or Memory table
- LexicalReorderingTable *compactLexr = NULL;
+ LexicalReorderingTable*
+ LexicalReorderingTable::
+ LoadAvailable(const std::string& filePath,
+ const FactorList& f_factors,
+ const FactorList& e_factors,
+ const FactorList& c_factors)
+ {
+    // decide whether to use the Compact, Tree, or in-memory table
#ifdef HAVE_CMPH
- compactLexr = LexicalReorderingTableCompact::CheckAndLoad(filePath + ".minlexr", f_factors, e_factors, c_factors);
+ LexicalReorderingTable *compactLexr = NULL;
+ compactLexr = LexicalReorderingTableCompact::CheckAndLoad(filePath + ".minlexr", f_factors, e_factors, c_factors);
+ if(compactLexr)
+ return compactLexr;
#endif
- if(compactLexr)
- return compactLexr;
- if(FileExists(filePath+".binlexr.idx")) {
- //there exists a binary version use that
- return new LexicalReorderingTableTree(filePath, f_factors, e_factors, c_factors);
- } else {
- //use plain memory
- return new LexicalReorderingTableMemory(filePath, f_factors, e_factors, c_factors);
- }
-}
-
-/*
- * functions for LexicalReorderingTableMemory
- */
-LexicalReorderingTableMemory::LexicalReorderingTableMemory(
- const std::string& filePath,
- const std::vector<FactorType>& f_factors,
- const std::vector<FactorType>& e_factors,
- const std::vector<FactorType>& c_factors)
- : LexicalReorderingTable(f_factors, e_factors, c_factors)
-{
- LoadFromFile(filePath);
-}
-
-LexicalReorderingTableMemory::~LexicalReorderingTableMemory()
-{
-}
-
-std::vector<float> LexicalReorderingTableMemory::GetScore(const Phrase& f,
- const Phrase& e,
- const Phrase& c)
-{
- //rather complicated because of const can't use []... as [] might enter new things into std::map
- //also can't have to be careful with words range if c is empty can't use c.GetSize()-1 will underflow and be large
- TableType::const_iterator r;
- std::string key;
- if(0 == c.GetSize()) {
- key = MakeKey(f,e,c);
- r = m_Table.find(key);
- if(m_Table.end() != r) {
- return r->second;
- }
- } else {
- //right try from large to smaller context
- for(size_t i = 0; i <= c.GetSize(); ++i) {
- Phrase sub_c(c.GetSubString(WordsRange(i,c.GetSize()-1)));
- key = MakeKey(f,e,sub_c);
+ LexicalReorderingTable* ret;
+ if (FileExists(filePath+".binlexr.idx") )
+ ret = new LexicalReorderingTableTree(filePath, f_factors,
+ e_factors, c_factors);
+ else
+ ret = new LexicalReorderingTableMemory(filePath, f_factors,
+ e_factors, c_factors);
+ return ret;
+ }
+
+ LexicalReorderingTableMemory::
+ LexicalReorderingTableMemory(const std::string& filePath,
+ const std::vector<FactorType>& f_factors,
+ const std::vector<FactorType>& e_factors,
+ const std::vector<FactorType>& c_factors)
+ : LexicalReorderingTable(f_factors, e_factors, c_factors)
+ {
+ LoadFromFile(filePath);
+ }
+
+ LexicalReorderingTableMemory::
+ ~LexicalReorderingTableMemory() { }
+
+ std::vector<float>
+ LexicalReorderingTableMemory::GetScore(const Phrase& f,
+ const Phrase& e,
+ const Phrase& c)
+ {
+    // somewhat fiddly: we cannot use operator[] here, because on a miss it
+    // would insert new entries into the map; also be careful with the words
+    // range: if c is empty, c.GetSize()-1 underflows to a huge value
+ TableType::const_iterator r;
+ std::string key;
+ if(0 == c.GetSize()) {
+ key = MakeKey(f,e,c);
r = m_Table.find(key);
if(m_Table.end() != r) {
- return r->second;
+ return r->second;
+ }
+ } else {
+      // right, try contexts from largest to smallest
+ for(size_t i = 0; i <= c.GetSize(); ++i) {
+ Phrase sub_c(c.GetSubString(WordsRange(i,c.GetSize()-1)));
+ key = MakeKey(f,e,sub_c);
+ r = m_Table.find(key);
+ if(m_Table.end() != r) {
+ return r->second;
+ }
}
}
+ return Scores();
}
- return Scores();
-}
-
-void LexicalReorderingTableMemory::DbgDump(std::ostream* out) const
-{
- TableType::const_iterator i;
- for(i = m_Table.begin(); i != m_Table.end(); ++i) {
- *out << " key: '" << i->first << "' score: ";
- *out << "(num scores: " << (i->second).size() << ")";
- for(size_t j = 0; j < (i->second).size(); ++j) {
- *out << (i->second)[j] << " ";
- }
- *out << "\n";
- }
-};
-
-std::string LexicalReorderingTableMemory::MakeKey(const Phrase& f,
- const Phrase& e,
- const Phrase& c) const
-{
- /*
- std::string key;
- if(!m_FactorsF.empty()){
- key += f.GetStringRep(m_FactorsF);
- }
- if(!m_FactorsE.empty()){
- if(!key.empty()){
- key += " ||| ";
- }
- key += e.GetStringRep(m_FactorsE);
- }
- */
- return MakeKey(auxClearString(f.GetStringRep(m_FactorsF)),
- auxClearString(e.GetStringRep(m_FactorsE)),
- auxClearString(c.GetStringRep(m_FactorsC)));
-}
-
-std::string LexicalReorderingTableMemory::MakeKey(const std::string& f,
- const std::string& e,
- const std::string& c) const
-{
- std::string key;
- if(!f.empty()) {
- key += f;
- }
- if(!m_FactorsE.empty()) {
- if(!key.empty()) {
- key += "|||";
- }
- key += e;
- }
- if(!m_FactorsC.empty()) {
- if(!key.empty()) {
- key += "|||";
- }
- key += c;
- }
- return key;
-}
-void LexicalReorderingTableMemory::LoadFromFile(const std::string& filePath)
-{
- std::string fileName = filePath;
- if(!FileExists(fileName) && FileExists(fileName+".gz")) {
- fileName += ".gz";
- }
- InputFileStream file(fileName);
- std::string line(""), key("");
- int numScores = -1;
- std::cerr << "Loading table into memory...";
- while(!getline(file, line).eof()) {
- std::vector<std::string> tokens = TokenizeMultiCharSeparator(line, "|||");
- int t = 0 ;
- std::string f(""),e(""),c("");
-
- if(!m_FactorsF.empty()) {
- //there should be something for f
- f = auxClearString(tokens.at(t));
- ++t;
- }
- if(!m_FactorsE.empty()) {
- //there should be something for e
- e = auxClearString(tokens.at(t));
- ++t;
- }
- if(!m_FactorsC.empty()) {
- //there should be something for c
- c = auxClearString(tokens.at(t));
- ++t;
- }
- //last token are the probs
- std::vector<float> p = Scan<float>(Tokenize(tokens.at(t)));
- //sanity check: all lines must have equall number of probs
- if(-1 == numScores) {
- numScores = (int)p.size(); //set in first line
- }
- if((int)p.size() != numScores) {
- TRACE_ERR( "found inconsistent number of probabilities... found " << p.size() << " expected " << numScores << std::endl);
- exit(0);
- }
- std::transform(p.begin(),p.end(),p.begin(),TransformScore);
- std::transform(p.begin(),p.end(),p.begin(),FloorScore);
- //save it all into our map
- m_Table[MakeKey(f,e,c)] = p;
+ void
+ LexicalReorderingTableMemory::
+ DbgDump(std::ostream* out) const
+ {
+ TableType::const_iterator i;
+ for(i = m_Table.begin(); i != m_Table.end(); ++i)
+ {
+ *out << " key: '" << i->first << "' score: ";
+ *out << "(num scores: " << (i->second).size() << ")";
+ for(size_t j = 0; j < (i->second).size(); ++j)
+ *out << (i->second)[j] << " ";
+
+ *out << "\n";
+ }
+ };
+
+ std::string
+ LexicalReorderingTableMemory::MakeKey(const Phrase& f,
+ const Phrase& e,
+ const Phrase& c) const
+ {
+ return MakeKey(auxClearString(f.GetStringRep(m_FactorsF)),
+ auxClearString(e.GetStringRep(m_FactorsE)),
+ auxClearString(c.GetStringRep(m_FactorsC)));
+ }
+
+ std::string
+ LexicalReorderingTableMemory::MakeKey(const std::string& f,
+ const std::string& e,
+ const std::string& c) const
+ {
+ std::string key;
+ if(!f.empty()) key += f;
+ if(!m_FactorsE.empty()) { if(!key.empty()) { key += "|||"; } key += e; }
+ if(!m_FactorsC.empty()) { if(!key.empty()) { key += "|||"; } key += c; }
+ return key;
+ }
+
+ void
+ LexicalReorderingTableMemory::
+ LoadFromFile(const std::string& filePath)
+ {
+ std::string fileName = filePath;
+ if(!FileExists(fileName) && FileExists(fileName+".gz"))
+ fileName += ".gz";
+
+ InputFileStream file(fileName);
+ std::string line(""), key("");
+ int numScores = -1;
+ std::cerr << "Loading table into memory...";
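+    // Expected line shape (illustrative, with f and e factors configured):
+    //   f phrase ||| e phrase ||| 0.3 0.2 0.5
+    // i.e. "|||"-separated fields with the probabilities in the last field.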
+ while(!getline(file, line).eof())
+ {
+ std::vector<std::string> tokens = TokenizeMultiCharSeparator(line, "|||");
+	int t = 0;
+ std::string f(""),e(""),c("");
+
+ if(!m_FactorsF.empty()) {
+ //there should be something for f
+ f = auxClearString(tokens.at(t));
+ ++t;
+ }
+ if(!m_FactorsE.empty()) {
+ //there should be something for e
+ e = auxClearString(tokens.at(t));
+ ++t;
+ }
+ if(!m_FactorsC.empty()) {
+ //there should be something for c
+ c = auxClearString(tokens.at(t));
+ ++t;
+ }
+ //last token are the probs
+ std::vector<float> p = Scan<float>(Tokenize(tokens.at(t)));
+	  //sanity check: all lines must have an equal number of probs
+ if(-1 == numScores) {
+ numScores = (int)p.size(); //set in first line
+ }
+ if((int)p.size() != numScores) {
+ TRACE_ERR( "found inconsistent number of probabilities... found "
+ << p.size() << " expected " << numScores << std::endl);
+	    exit(1); // failure status; this is an input error
+ }
+ std::transform(p.begin(),p.end(),p.begin(),TransformScore);
+ std::transform(p.begin(),p.end(),p.begin(),FloorScore);
+ //save it all into our map
+ m_Table[MakeKey(f,e,c)] = p;
+ }
+ std::cerr << "done.\n";
+ }
+
+ LexicalReorderingTableTree::
+ LexicalReorderingTableTree(const std::string& filePath,
+ const std::vector<FactorType>& f_factors,
+ const std::vector<FactorType>& e_factors,
+ const std::vector<FactorType>& c_factors)
+ : LexicalReorderingTable(f_factors, e_factors, c_factors)
+ , m_UseCache(false)
+ , m_FilePath(filePath)
+ {
+ m_Table.reset(new PrefixTreeMap());
+ m_Table->Read(m_FilePath+".binlexr");
}
- std::cerr << "done.\n";
-}
-
-/*
- * functions for LexicalReorderingTableTree
- */
-LexicalReorderingTableTree::LexicalReorderingTableTree(
- const std::string& filePath,
- const std::vector<FactorType>& f_factors,
- const std::vector<FactorType>& e_factors,
- const std::vector<FactorType>& c_factors)
- : LexicalReorderingTable(f_factors, e_factors, c_factors), m_UseCache(false), m_FilePath(filePath)
-{
- m_Table.reset(new PrefixTreeMap());
- m_Table->Read(m_FilePath+".binlexr");
-}
-
-LexicalReorderingTableTree::~LexicalReorderingTableTree()
-{
-}
+
+ LexicalReorderingTableTree::
+ ~LexicalReorderingTableTree()
+ { }
+
+ Scores
+ LexicalReorderingTableTree::
+ GetScore(const Phrase& f, const Phrase& e, const Phrase& c)
+ {
+ if((!m_FactorsF.empty() && 0 == f.GetSize())
+ || (!m_FactorsE.empty() && 0 == e.GetSize()))
+ {
+ //NOTE: no check for c as c might be empty, e.g. start of sentence
+ //not a proper key
+ // phi: commented out, since e may be empty (drop-unknown)
+ //std::cerr << "Not a proper key!\n";
+ return Scores();
+ }
-Scores LexicalReorderingTableTree::GetScore(const Phrase& f, const Phrase& e, const Phrase& c)
-{
- if( (!m_FactorsF.empty() && 0 == f.GetSize())
- || (!m_FactorsE.empty() && 0 == e.GetSize())) {
- //NOTE: no check for c as c might be empty, e.g. start of sentence
- //not a proper key
- // phi: commented out, since e may be empty (drop-unknown)
- //std::cerr << "Not a proper key!\n";
- return Scores();
- }
- CacheType::iterator i;;
- if(m_UseCache) {
- std::pair<CacheType::iterator, bool> r = m_Cache.insert(std::make_pair(MakeCacheKey(f,e),Candidates()));
- if(!r.second) {
- return auxFindScoreForContext((r.first)->second, c);
- }
- i = r.first;
- } else if(!m_Cache.empty()) {
- //although we might not be caching now, cache might be none empty!
- i = m_Cache.find(MakeCacheKey(f,e));
- if(i != m_Cache.end()) {
+ CacheType::iterator i;
+
+ if(m_UseCache)
+ {
+ std::pair<CacheType::iterator, bool> r;
+ r = m_Cache.insert(std::make_pair(MakeCacheKey(f,e),Candidates()));
+ if(!r.second) return auxFindScoreForContext((r.first)->second, c);
+ i = r.first;
+ }
+ else if((i = m_Cache.find(MakeCacheKey(f,e))) != m_Cache.end())
+      // although we might not be caching now, the cache might be non-empty!
return auxFindScoreForContext(i->second, c);
- }
- }
- //not in cache go to file...
- Scores score;
- Candidates cands;
- m_Table->GetCandidates(MakeTableKey(f,e), &cands);
- if(cands.empty()) {
- return Scores();
- }
- if(m_FactorsC.empty()) {
- UTIL_THROW_IF2(1 != cands.size(), "Error");
- return cands[0].GetScore(0);
- } else {
- score = auxFindScoreForContext(cands, c);
- }
- //cache for future use
- if(m_UseCache) {
- i->second = cands;
- }
- return score;
-};
-
-Scores LexicalReorderingTableTree::auxFindScoreForContext(const Candidates& cands, const Phrase& context)
-{
- if(m_FactorsC.empty()) {
- UTIL_THROW_IF2(cands.size() > 1, "Error");
-
- return (1 == cands.size())?(cands[0].GetScore(0)):(Scores());
- } else {
- std::vector<std::string> cvec;
- for(size_t i = 0; i < context.GetSize(); ++i) {
- /* old code
- std::string s = context.GetWord(i).ToString(m_FactorsC);
- cvec.push_back(s.substr(0,s.size()-1));
- */
- cvec.push_back(context.GetWord(i).GetString(m_FactorsC, false));
- }
- IPhrase c = m_Table->ConvertPhrase(cvec,TargetVocId);
- IPhrase sub_c;
- IPhrase::iterator start = c.begin();
- for(size_t j = 0; j <= context.GetSize(); ++j, ++start) {
- sub_c.assign(start, c.end());
- for(size_t cand = 0; cand < cands.size(); ++cand) {
- IPhrase p = cands[cand].GetPhrase(0);
- if(cands[cand].GetPhrase(0) == sub_c) {
- return cands[cand].GetScore(0);
- }
+ // not in cache => go to file...
+ Candidates cands;
+ m_Table->GetCandidates(MakeTableKey(f,e), &cands);
+ if(cands.empty()) return Scores();
+ if(m_UseCache) i->second = cands;
+
+ if(m_FactorsC.empty())
+ {
+ UTIL_THROW_IF2(1 != cands.size(), "Error");
+ return cands[0].GetScore(0);
+ }
+ else return auxFindScoreForContext(cands, c);
+ };
+
+ Scores
+ LexicalReorderingTableTree::
+ auxFindScoreForContext(const Candidates& cands, const Phrase& context)
+ {
+ if(m_FactorsC.empty())
+ {
+ UTIL_THROW_IF2(cands.size() > 1, "Error");
+ return (cands.size() == 1) ? cands[0].GetScore(0) : Scores();
+ }
+ else
+ {
+ std::vector<std::string> cvec;
+ for(size_t i = 0; i < context.GetSize(); ++i)
+ cvec.push_back(context.GetWord(i).GetString(m_FactorsC, false));
+
+ IPhrase c = m_Table->ConvertPhrase(cvec,TargetVocId);
+ IPhrase sub_c;
+ IPhrase::iterator start = c.begin();
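+	// back off from the full context to successively shorter suffixes
+	// (ending with the empty context) until some candidate matches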
+ for(size_t j = 0; j <= context.GetSize(); ++j, ++start)
+ {
+ sub_c.assign(start, c.end());
+ for(size_t cand = 0; cand < cands.size(); ++cand)
+ {
+		    IPhrase const& p = cands[cand].GetPhrase(0);
+		    if(p == sub_c)
+ return cands[cand].GetScore(0);
+ }
+ }
+ return Scores();
}
- }
- return Scores();
- }
-}
-
-void LexicalReorderingTableTree::InitializeForInput(const InputType& input)
-{
- ClearCache();
- if(ConfusionNet const* cn = dynamic_cast<ConfusionNet const*>(&input)) {
- Cache(*cn);
- } else if(Sentence const* s = dynamic_cast<Sentence const*>(&input)) {
- // Cache(*s); ... this just takes up too much memory, we cache elsewhere
- DisableCache();
}
- if (!m_Table.get()) {
- //load thread specific table.
- m_Table.reset(new PrefixTreeMap());
- m_Table->Read(m_FilePath+".binlexr");
- }
-};
-bool LexicalReorderingTableTree::Create(std::istream& inFile,
- const std::string& outFileName)
-{
- std::string line;
- //TRACE_ERR("Entering Create...\n");
- std::string
- ofn(outFileName+".binlexr.srctree"),
+ void
+ LexicalReorderingTableTree::
+ InitializeForInput(const InputType& input)
+ {
+ ClearCache();
+ if(ConfusionNet const* cn = dynamic_cast<ConfusionNet const*>(&input))
+ {
+ Cache(*cn);
+ }
+ else if (dynamic_cast<Sentence const*>(&input))
+ {
+ // Cache(*s); ... this just takes up too much memory, we cache elsewhere
+ DisableCache();
+ }
+ if (!m_Table.get())
+ {
+ //load thread specific table.
+ m_Table.reset(new PrefixTreeMap());
+ m_Table->Read(m_FilePath+".binlexr");
+ }
+ };
+
+ bool
+ LexicalReorderingTableTree::
+ Create(std::istream& inFile, const std::string& outFileName)
+ {
+ typedef PrefixTreeSA<LabelId,OFF_T> PSA;
+
+ std::string
+ line,
+ ofn(outFileName+".binlexr.srctree"),
oft(outFileName+".binlexr.tgtdata"),
ofi(outFileName+".binlexr.idx"),
ofsv(outFileName+".binlexr.voc0"),
oftv(outFileName+".binlexr.voc1");
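+    // five outputs: source-side prefix trees (srctree), candidate payloads
+    // (tgtdata), per-root-word offsets (idx), and the two vocabularies
+    // (voc0 = source, voc1 = target)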
+ FILE *os = fOpen(ofn.c_str(),"wb");
+ FILE *ot = fOpen(oft.c_str(),"wb");
+
+ PSA *psa = new PSA;
+ PSA::setDefault(InvalidOffT);
+      WordVoc* voc[3] = { 0, 0, 0 }; // zero-init; voc[2] may alias voc[1]
+
+ LabelId currFirstWord = InvalidLabelId;
+ IPhrase currKey;
+
+ Candidates cands;
+ std::vector<OFF_T> vo;
+ size_t lnc = 0;
+ size_t numTokens = 0;
+ size_t numKeyTokens = 0;
+ while(getline(inFile, line))
+ {
+ ++lnc;
+ if(0 == lnc % 10000) TRACE_ERR(".");
+ IPhrase key;
+ Scores score;
+
+ std::vector<std::string> tokens = TokenizeMultiCharSeparator(line, "|||");
+ std::string w;
+ if(1 == lnc)
+ {
+ //do some init stuff in the first line
+ numTokens = tokens.size();
+ if(tokens.size() == 2)
+ { // f ||| score
+ numKeyTokens = 1;
+ voc[0] = new WordVoc();
+ voc[1] = 0;
+ }
+ else if(3 == tokens.size() || 4 == tokens.size())
+ { //either f ||| e ||| score or f ||| e ||| c ||| score
+ numKeyTokens = 2;
+ voc[0] = new WordVoc(); //f voc
+ voc[1] = new WordVoc(); //e voc
+ voc[2] = voc[1]; //c & e share voc
+ }
+ }
+ else
+ {
+ //sanity check ALL lines must have same number of tokens
+ UTIL_THROW_IF2(numTokens != tokens.size(),
+ "Lines do not have the same number of tokens");
+ }
+ size_t phrase = 0;
+ for(; phrase < numKeyTokens; ++phrase)
+ {
+ //conditioned on more than just f... need |||
+ if(phrase >=1) key.push_back(PrefixTreeMap::MagicWord);
+ std::istringstream is(tokens[phrase]);
+ while(is >> w) key.push_back(voc[phrase]->add(w));
+ }
- FILE *os = fOpen(ofn.c_str(),"wb");
- FILE *ot = fOpen(oft.c_str(),"wb");
-
- //TRACE_ERR("opend files....\n");
+      // collect all non-key phrases, i.e. c
+ std::vector<IPhrase> tgt_phrases;
+ tgt_phrases.resize(numTokens - numKeyTokens - 1);
+ for(size_t j = 0; j < tgt_phrases.size(); ++j, ++phrase)
+ {
+ std::istringstream is(tokens[numKeyTokens + j]);
+ while(is >> w) tgt_phrases[j].push_back(voc[phrase]->add(w));
+ }
- typedef PrefixTreeSA<LabelId,OFF_T> PSA;
- PSA *psa = new PSA;
- PSA::setDefault(InvalidOffT);
- WordVoc* voc[3];
+ //last token is score
+ std::istringstream is(tokens[numTokens-1]);
+ while(is >> w) score.push_back(atof(w.c_str()));
+
+ //transform score now...
+ std::transform(score.begin(),score.end(),score.begin(),TransformScore);
+ std::transform(score.begin(),score.end(),score.begin(),FloorScore);
+ std::vector<Scores> scores;
+ scores.push_back(score);
+
+ if(key.empty()) {
+ TRACE_ERR("WARNING: empty source phrase in line '"<<line<<"'\n");
+ continue;
+ }
- LabelId currFirstWord = InvalidLabelId;
- IPhrase currKey;
+ //first time inits
+ if(currFirstWord == InvalidLabelId) currFirstWord = key[0];
+ if(currKey.empty())
+ {
+ currKey = key;
+ //insert key into tree
+ UTIL_THROW_IF2(psa == NULL, "Object not yet created");
+ PSA::Data& d = psa->insert(key);
+ if(d == InvalidOffT) d = fTell(ot);
+ else
+ {
+ TRACE_ERR("ERROR: source phrase already inserted (A)!\nline("
+ << lnc << "): '" << line << "\n");
+ return false;
+ }
+ }
+
+ if(currKey != key) {
+ //ok new key
+ currKey = key;
+ //a) write cands for old key
+ cands.writeBin(ot);
+ cands.clear();
+ //b) check if we need to move on to new tree root
+ if(key[0] != currFirstWord) {
+ // write key prefix tree to file and clear
+ PTF pf;
+ if(currFirstWord >= vo.size())
+ vo.resize(currFirstWord+1,InvalidOffT);
+ vo[currFirstWord] = fTell(os);
+ pf.create(*psa, os);
+ delete psa;
+ psa = new PSA;
+ currFirstWord = key[0];
+ }
- Candidates cands;
- std::vector<OFF_T> vo;
- size_t lnc = 0;
- size_t numTokens = 0;
- size_t numKeyTokens = 0;
- while(getline(inFile, line)) {
- ++lnc;
- if(0 == lnc % 10000) {
- TRACE_ERR(".");
- }
- IPhrase key;
- Scores score;
-
- std::vector<std::string> tokens = TokenizeMultiCharSeparator(line, "|||");
- std::string w;
- if(1 == lnc) {
- //do some init stuff in the first line
- numTokens = tokens.size();
- if(tokens.size() == 2) { //f ||| score
- numKeyTokens = 1;
- voc[0] = new WordVoc();
- voc[1] = 0;
- } else if(3 == tokens.size() || 4 == tokens.size()) { //either f ||| e ||| score or f ||| e ||| c ||| score
- numKeyTokens = 2;
- voc[0] = new WordVoc(); //f voc
- voc[1] = new WordVoc(); //e voc
- voc[2] = voc[1]; //c & e share voc
- }
- } else {
- //sanity check ALL lines must have same number of tokens
- UTIL_THROW_IF2(numTokens != tokens.size(),
- "Lines do not have the same number of tokens");
- }
- size_t phrase = 0;
- for(; phrase < numKeyTokens; ++phrase) {
- //conditioned on more than just f... need |||
- if(phrase >=1) {
- key.push_back(PrefixTreeMap::MagicWord);
- }
- std::istringstream is(tokens[phrase]);
- while(is >> w) {
- key.push_back(voc[phrase]->add(w));
- }
- }
- //collect all non key phrases, i.e. c
- std::vector<IPhrase> tgt_phrases;
- tgt_phrases.resize(numTokens - numKeyTokens - 1);
- for(size_t j = 0; j < tgt_phrases.size(); ++j, ++phrase) {
- std::istringstream is(tokens[numKeyTokens + j]);
- while(is >> w) {
- tgt_phrases[j].push_back(voc[phrase]->add(w));
+ // c) insert key into tree
+ UTIL_THROW_IF2(psa == NULL, "Object not yet created");
+ PSA::Data& d = psa->insert(key);
+ if(d == InvalidOffT) d = fTell(ot);
+ else
+ {
+		TRACE_ERR("ERROR: source phrase already inserted (B)!\nline("
+ << lnc << "): '" << line << "\n");
+ return false;
+ }
+ }
+ cands.push_back(GenericCandidate(tgt_phrases, scores));
}
- }
- //last token is score
- std::istringstream is(tokens[numTokens-1]);
- while(is >> w) {
- score.push_back(atof(w.c_str()));
- }
- //transform score now...
- std::transform(score.begin(),score.end(),score.begin(),TransformScore);
- std::transform(score.begin(),score.end(),score.begin(),FloorScore);
- std::vector<Scores> scores;
- scores.push_back(score);
-
- if(key.empty()) {
- TRACE_ERR("WARNING: empty source phrase in line '"<<line<<"'\n");
- continue;
- }
- //first time inits
- if(currFirstWord == InvalidLabelId) {
- currFirstWord = key[0];
- }
- if(currKey.empty()) {
- currKey = key;
- //insert key into tree
- UTIL_THROW_IF2(psa == NULL, "Object not yet created");
- PSA::Data& d = psa->insert(key);
- if(d == InvalidOffT) {
- d = fTell(ot);
- } else {
- TRACE_ERR("ERROR: source phrase already inserted (A)!\nline(" << lnc << "): '" << line << "\n");
- return false;
+ if (lnc == 0)
+ {
+ TRACE_ERR("ERROR: empty lexicalised reordering file\n" << std::endl);
+ return false;
}
+ cands.writeBin(ot);
+ cands.clear();
+
+ PTF pf;
+ if(currFirstWord >= vo.size())
+ vo.resize(currFirstWord+1,InvalidOffT);
+ vo[currFirstWord] = fTell(os);
+ pf.create(*psa,os);
+ delete psa;
+ psa=0;
+
+ fClose(os);
+ fClose(ot);
+ FILE *oi = fOpen(ofi.c_str(),"wb");
+ fWriteVector(oi,vo);
+ fClose(oi);
+
+ if(voc[0]) { voc[0]->Write(ofsv); delete voc[0]; }
+ if(voc[1]) { voc[1]->Write(oftv); delete voc[1]; }
+ return true;
+ }
+
+ std::string
+ LexicalReorderingTableTree::
+ MakeCacheKey(const Phrase& f, const Phrase& e) const
+ {
+ std::string key;
+ if(!m_FactorsF.empty())
+ key += auxClearString(f.GetStringRep(m_FactorsF));
+
+ if(!m_FactorsE.empty()) {
+ if(!key.empty()) { key += "|||"; }
+ key += auxClearString(e.GetStringRep(m_FactorsE));
}
- if(currKey != key) {
- //ok new key
- currKey = key;
- //a) write cands for old key
- cands.writeBin(ot);
- cands.clear();
- //b) check if we need to move on to new tree root
- if(key[0] != currFirstWord) {
- // write key prefix tree to file and clear
- PTF pf;
- if(currFirstWord >= vo.size()) {
- vo.resize(currFirstWord+1,InvalidOffT);
- }
- vo[currFirstWord] = fTell(os);
- pf.create(*psa, os);
- // clear
- delete psa;
- psa = new PSA;
- currFirstWord = key[0];
+ return key;
+ };
+
+ IPhrase
+ LexicalReorderingTableTree::
+ MakeTableKey(const Phrase& f, const Phrase& e) const
+ {
+ IPhrase key;
+ std::vector<std::string> keyPart;
+ if(!m_FactorsF.empty())
+ {
+ for(size_t i = 0; i < f.GetSize(); ++i)
+ keyPart.push_back(f.GetWord(i).GetString(m_FactorsF, false));
+ auxAppend(key, m_Table->ConvertPhrase(keyPart, SourceVocId));
+ keyPart.clear();
}
- //c) insert key into tree
- UTIL_THROW_IF2(psa == NULL, "Object not yet created");
- PSA::Data& d = psa->insert(key);
- if(d == InvalidOffT) {
- d = fTell(ot);
- } else {
- TRACE_ERR("ERROR: source phrase already inserted (A)!\nline(" << lnc << "): '" << line << "\n");
- return false;
+ if(!m_FactorsE.empty())
+ {
+ if(!key.empty()) key.push_back(PrefixTreeMap::MagicWord);
+ for(size_t i = 0; i < e.GetSize(); ++i)
+ keyPart.push_back(e.GetWord(i).GetString(m_FactorsE, false));
+ auxAppend(key, m_Table->ConvertPhrase(keyPart,TargetVocId));
}
- }
- cands.push_back(GenericCandidate(tgt_phrases, scores));
- }
- if (lnc == 0) {
- TRACE_ERR("ERROR: empty lexicalised reordering file\n" << std::endl);
- return false;
- }
- //flush remainders
- cands.writeBin(ot);
- cands.clear();
- //process last currFirstWord
- PTF pf;
- if(currFirstWord >= vo.size()) {
- vo.resize(currFirstWord+1,InvalidOffT);
- }
- vo[currFirstWord] = fTell(os);
- pf.create(*psa,os);
- delete psa;
- psa=0;
-
- fClose(os);
- fClose(ot);
- /*
- std::vector<size_t> inv;
- for(size_t i = 0; i < vo.size(); ++i){
- if(vo[i] == InvalidOffT){
- inv.push_back(i);
- }
- }
- if(inv.size()) {
- TRACE_ERR("WARNING: there are src voc entries with no phrase "
- "translation: count "<<inv.size()<<"\n"
- "There exists phrase translations for "<<vo.size()-inv.size()
- <<" entries\n");
- }
- */
- FILE *oi = fOpen(ofi.c_str(),"wb");
- fWriteVector(oi,vo);
- fClose(oi);
-
- if(voc[0]) {
- voc[0]->Write(ofsv);
- delete voc[0];
- }
- if(voc[1]) {
- voc[1]->Write(oftv);
- delete voc[1];
- }
- return true;
-}
-
-std::string LexicalReorderingTableTree::MakeCacheKey(const Phrase& f,
- const Phrase& e) const
-{
- std::string key;
- if(!m_FactorsF.empty()) {
- key += auxClearString(f.GetStringRep(m_FactorsF));
- }
- if(!m_FactorsE.empty()) {
- if(!key.empty()) {
- key += "|||";
- }
- key += auxClearString(e.GetStringRep(m_FactorsE));
- }
- return key;
-};
-
-IPhrase LexicalReorderingTableTree::MakeTableKey(const Phrase& f,
- const Phrase& e) const
-{
- IPhrase key;
- std::vector<std::string> keyPart;
- if(!m_FactorsF.empty()) {
- for(size_t i = 0; i < f.GetSize(); ++i) {
- /* old code
- std::string s = f.GetWord(i).ToString(m_FactorsF);
- keyPart.push_back(s.substr(0,s.size()-1));
- */
- keyPart.push_back(f.GetWord(i).GetString(m_FactorsF, false));
- }
- auxAppend(key, m_Table->ConvertPhrase(keyPart, SourceVocId));
- keyPart.clear();
- }
- if(!m_FactorsE.empty()) {
- if(!key.empty()) {
- key.push_back(PrefixTreeMap::MagicWord);
- }
- for(size_t i = 0; i < e.GetSize(); ++i) {
- /* old code
- std::string s = e.GetWord(i).ToString(m_FactorsE);
- keyPart.push_back(s.substr(0,s.size()-1));
- */
- keyPart.push_back(e.GetWord(i).GetString(m_FactorsE, false));
- }
- auxAppend(key, m_Table->ConvertPhrase(keyPart,TargetVocId));
- //keyPart.clear();
- }
- return key;
-};
-
-
-struct State {
- State(PPimp* t, const std::string& p) : pos(t), path(p) {
- }
- PPimp* pos;
- std::string path;
-};
-
-void LexicalReorderingTableTree::auxCacheForSrcPhrase(const Phrase& f)
-{
- if(m_FactorsE.empty()) {
- //f is all of key...
- Candidates cands;
- m_Table->GetCandidates(MakeTableKey(f,Phrase(ARRAY_SIZE_INCR)),&cands);
- m_Cache[MakeCacheKey(f,Phrase(ARRAY_SIZE_INCR))] = cands;
- } else {
- ObjectPool<PPimp> pool;
- PPimp* pPos = m_Table->GetRoot();
- //1) goto subtree for f
- for(size_t i = 0; i < f.GetSize() && 0 != pPos && pPos->isValid(); ++i) {
- /* old code
- pPos = m_Table.Extend(pPos, auxClearString(f.GetWord(i).ToString(m_FactorsF)), SourceVocId);
- */
- pPos = m_Table->Extend(pPos, f.GetWord(i).GetString(m_FactorsF, false), SourceVocId);
- }
- if(0 != pPos && pPos->isValid()) {
- pPos = m_Table->Extend(pPos, PrefixTreeMap::MagicWord);
- }
- if(0 == pPos || !pPos->isValid()) {
- return;
- }
- //2) explore whole subtree depth first & cache
- std::string cache_key = auxClearString(f.GetStringRep(m_FactorsF)) + "|||";
-
- std::vector<State> stack;
- stack.push_back(State(pool.get(PPimp(pPos->ptr()->getPtr(pPos->idx),0,0)),""));
- Candidates cands;
- while(!stack.empty()) {
- if(stack.back().pos->isValid()) {
- LabelId w = stack.back().pos->ptr()->getKey(stack.back().pos->idx);
- std::string next_path = stack.back().path + " " + m_Table->ConvertWord(w,TargetVocId);
- //cache this
- m_Table->GetCandidates(*stack.back().pos,&cands);
- if(!cands.empty()) {
- m_Cache[cache_key + auxClearString(next_path)] = cands;
- }
- cands.clear();
- PPimp* next_pos = pool.get(PPimp(stack.back().pos->ptr()->getPtr(stack.back().pos->idx),0,0));
- ++stack.back().pos->idx;
- stack.push_back(State(next_pos,next_path));
- } else {
- stack.pop_back();
+ return key;
+ };
+
+
+ struct State
+ {
+ State(PPimp* t, const std::string& p)
+ : pos(t), path(p) { }
+
+ PPimp* pos;
+ std::string path;
+ };
+
+ void
+ LexicalReorderingTableTree::
+ auxCacheForSrcPhrase(const Phrase& f)
+ {
+ if(m_FactorsE.empty())
+ {
+ //f is all of key...
+ Candidates cands;
+ m_Table->GetCandidates(MakeTableKey(f,Phrase(ARRAY_SIZE_INCR)),&cands);
+ m_Cache[MakeCacheKey(f,Phrase(ARRAY_SIZE_INCR))] = cands;
+ }
+ else
+ {
+ ObjectPool<PPimp> pool;
+ PPimp* pPos = m_Table->GetRoot();
+
+ // 1) goto subtree for f
+ for(size_t i = 0; i < f.GetSize() && 0 != pPos && pPos->isValid(); ++i)
+ pPos = m_Table->Extend(pPos, f.GetWord(i).GetString(m_FactorsF, false), SourceVocId);
+
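+      // MagicWord is the in-key counterpart of the "|||" field separator;
+      // descend past it to reach the e-side subtree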
+ if(pPos && pPos->isValid())
+ pPos = m_Table->Extend(pPos, PrefixTreeMap::MagicWord);
+
+ if(!pPos || !pPos->isValid())
+ return;
+
+ //2) explore whole subtree depth first & cache
+ std::string cache_key = auxClearString(f.GetStringRep(m_FactorsF)) + "|||";
+
+ std::vector<State> stack;
+ stack.push_back(State(pool.get(PPimp(pPos->ptr()->getPtr(pPos->idx),0,0)),""));
+ Candidates cands;
+ while(!stack.empty())
+ {
+ if(stack.back().pos->isValid())
+ {
+ LabelId w = stack.back().pos->ptr()->getKey(stack.back().pos->idx);
+ std::string next_path = stack.back().path + " " + m_Table->ConvertWord(w,TargetVocId);
+ //cache this
+ m_Table->GetCandidates(*stack.back().pos,&cands);
+ if(!cands.empty()) m_Cache[cache_key + auxClearString(next_path)] = cands;
+ cands.clear();
+ PPimp* next_pos = pool.get(PPimp(stack.back().pos->ptr()->getPtr(stack.back().pos->idx),0,0));
+ ++stack.back().pos->idx;
+ stack.push_back(State(next_pos,next_path));
+ }
+ else stack.pop_back();
+ }
}
- }
- }
-}
-
-void LexicalReorderingTableTree::Cache(const ConfusionNet& /*input*/)
-{
- return;
-}
-
-void LexicalReorderingTableTree::Cache(const Sentence& input)
-{
- //only works with sentences...
- size_t prev_cache_size = m_Cache.size();
- size_t max_phrase_length = input.GetSize();
- for(size_t len = 0; len <= max_phrase_length; ++len) {
- for(size_t start = 0; start+len <= input.GetSize(); ++start) {
- Phrase f = input.GetSubString(WordsRange(start, start+len));
- auxCacheForSrcPhrase(f);
- }
- }
- std::cerr << "Cached " << m_Cache.size() - prev_cache_size << " new primary reordering table keys\n";
-}
-/*
-Pre fetching implementation using Phrase and Generation Dictionaries
-*//*
-void LexicalReorderingTableTree::Cache(const ConfusionNet& input){
- typedef TargetPhraseCollection::iterator Iter;
- typedef TargetPhraseCollection::const_iterator ConstIter;
- //not implemented for confusion networks...
- Sentence const* s = dynamic_cast<Sentence const*>(&input);
- if(!s){
- return;
}
- int max_phrase_length = input.GetSize();
-
- std::vector<PhraseDictionaryBase*> PhraseTables = StaticData::Instance()->GetPhraseDictionaries();
- //new code:
- //std::vector<PhraseDictionary*> PhraseTables = StaticData::Instance()->GetPhraseDictionaries();
- std::vector<GenerationDictionary*> GenTables = StaticData::Instance()->GetGenerationDictionaries();
- for(size_t len = 1; len <= max_phrase_length; ++len){
- for(size_t start = 0; start+len <= input.GetSize(); ++start){
- Phrase f = s->GetSubString(WordsRange(start, start+len));
- //find all translations of f
- TargetPhraseCollection list;
-
- for(size_t t = 0; t < PhraseTables.size(); ++t){
- //if(doIntersect(PhraseTables[t]->GetOutputFactorMask(),FactorMask(m_FactorsE))){
- //this table gives us something we need
-
- const TargetPhraseCollection* new_list = PhraseTables[t]->GetTargetPhraseCollection(f);
- TargetPhraseCollection curr_list;
- for(ConstIter i = new_list->begin(); i != new_list->end(); ++i){
- for(Iter j = list.begin(); j != list.end(); ++j){
- curr_list.Add((*j)->MergeNext(*(*i)));
- }
- }
- if(list.IsEmpty()){
- list = *new_list;
- } else {
- list = curr_list;
- }
- //}
+
+ void
+ LexicalReorderingTableTree::
+ Cache(const ConfusionNet& /*input*/)
+ {
+ return;
+ }
+
+ void
+ LexicalReorderingTableTree::
+ Cache(const Sentence& input)
+ {
+ //only works with sentences...
+ size_t prev_cache_size = m_Cache.size();
+ size_t max_phrase_length = input.GetSize();
+ for(size_t len = 0; len <= max_phrase_length; ++len)
+ {
+ for(size_t start = 0; start+len <= input.GetSize(); ++start)
+ {
+ Phrase f = input.GetSubString(WordsRange(start, start+len));
+ auxCacheForSrcPhrase(f);
}
- for(size_t g = 0; g < GenTables.size(); ++g){
- //if(doIntersect(GenTables[g]->GetOutputFactorMask(),FactorMask(m_FactorsE))){
- TargetPhraseCollection curr_list;
- for(Iter j = list.begin(); j != list.end(); ++j){
- for(size_t w = 0; w < (*j)->GetSize(); ++w){
- const OutputWordCollection* words = GenTables[g]->FindWord((*j)->GetWord(w));
- for(OutputWordCollection::const_iterator i = words->begin(); i != words->end(); ++i){
- TargetPhrase* p = new TargetPhrase(*(*j));
- Word& pw = p->GetWord(w);
- pw.Merge(i->first);
- curr_list.Add(p);
- }
- }
- }
- list = curr_list;
- //}
- }
- //cache for each translation
- for(Iter e = list.begin(); e < list.end(); ++e){
- Candidates cands;
- m_Table.GetCandidates(MakeTableKey(f,*(*e)), &cands);
- m_Cache.insert(std::make_pair(MakeCacheKey(f,*(*e)),cands));
- }
- }
+ }
+ std::cerr << "Cached " << m_Cache.size() - prev_cache_size
+ << " new primary reordering table keys\n";
}
-};
-*/
-
}
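
The Cache(const Sentence&) rewrite above pre-fetches reordering scores for every contiguous source span. A minimal, self-contained sketch of that span enumeration, with plain strings standing in for the Moses Phrase/Sentence types:

    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
      std::vector<std::string> input;
      input.push_back("das"); input.push_back("ist");
      input.push_back("ein"); input.push_back("Haus");
      // visit every contiguous span [start, start+len) exactly once
      for (std::size_t len = 1; len <= input.size(); ++len) {
        for (std::size_t start = 0; start + len <= input.size(); ++start) {
          std::string span;
          for (std::size_t i = start; i < start + len; ++i) {
            if (i > start) span += " ";
            span += input[i];
          }
          std::cout << span << "\n";  // Moses calls auxCacheForSrcPhrase here
        }
      }
      return 0;
    }
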
diff --git a/moses/FF/LexicalReordering/LexicalReorderingTable.h b/moses/FF/LexicalReordering/LexicalReorderingTable.h
index fa45e3904..f4eceb72e 100644
--- a/moses/FF/LexicalReordering/LexicalReorderingTable.h
+++ b/moses/FF/LexicalReordering/LexicalReorderingTable.h
@@ -1,7 +1,7 @@
-#ifndef moses_LexicalReorderingTable_h
-#define moses_LexicalReorderingTable_h
+// -*- c++ -*-
+
+#pragma once
-//stdlib dependencies:
#include <vector>
#include <map>
#include <memory>
@@ -12,7 +12,6 @@
#include <boost/thread/tss.hpp>
#endif
-//moses dependencies:
#include "moses/TypeDef.h"
#include "moses/Phrase.h"
#include "moses/InputType.h"
@@ -23,136 +22,174 @@
namespace Moses
{
-class Phrase;
-class InputType;
-class ConfusionNet;
-
-//! additional types
-class LexicalReorderingTable
-{
-public:
- LexicalReorderingTable(const FactorList& f_factors, const FactorList& e_factors, const FactorList& c_factors)
- : m_FactorsF(f_factors), m_FactorsE(e_factors), m_FactorsC(c_factors) {
- }
- virtual ~LexicalReorderingTable() {
- }
-public:
- static LexicalReorderingTable* LoadAvailable(const std::string& filePath, const FactorList& f_factors, const FactorList& e_factors, const FactorList& c_factors);
-public:
- virtual Scores GetScore(const Phrase& f, const Phrase& e, const Phrase& c) = 0;
- virtual void InitializeForInput(const InputType&) {
- /* override for on-demand loading */
- };
- virtual void InitializeForInputPhrase(const Phrase&) {
- };
- /*
- int GetNumScoreComponents() const {
- return m_NumScores;
- }
- */
- const FactorList& GetFFactorMask() const {
- return m_FactorsF;
- }
- const FactorList& GetEFactorMask() const {
- return m_FactorsE;
- }
- const FactorList& GetCFactorMask() const {
- return m_FactorsC;
- }
- virtual void DbgDump(std::ostream* out) const {
- *out << "Overwrite in subclass...\n";
- };
-protected:
- FactorList m_FactorsF;
- FactorList m_FactorsE;
- FactorList m_FactorsC;
-};
-
-//! @todo what is this?
-class LexicalReorderingTableMemory : public LexicalReorderingTable
-{
- //implements LexicalReorderingTable saving all scores in one large std::map<> thingy
- //to be used for non binary tables... uses a LOT of memory
-public:
- LexicalReorderingTableMemory( const std::string& filePath,
- const std::vector<FactorType>& f_factors,
- const std::vector<FactorType>& e_factors,
- const std::vector<FactorType>& c_factors);
- virtual ~LexicalReorderingTableMemory();
-public:
- virtual std::vector<float> GetScore(const Phrase& f, const Phrase& e, const Phrase& c);
- void DbgDump(std::ostream* out) const;
-private:
- std::string MakeKey(const Phrase& f, const Phrase& e, const Phrase& c) const;
- std::string MakeKey(const std::string& f, const std::string& e, const std::string& c) const;
-
- void LoadFromFile(const std::string& filePath);
-private:
- typedef std::map< std::string, std::vector<float> > TableType;
- TableType m_Table;
-};
-
-class LexicalReorderingTableTree : public LexicalReorderingTable
-{
- //implements LexicalReorderingTable using the crafty PDT code...
-public:
- LexicalReorderingTableTree(const std::string& filePath,
- const std::vector<FactorType>& f_factors,
- const std::vector<FactorType>& e_factors,
- const std::vector<FactorType>& c_factors);
- ~LexicalReorderingTableTree();
-public:
- bool IsCacheEnabled() const {
- return m_UseCache;
- };
- void EnableCache() {
- m_UseCache = true;
+ class Phrase;
+ class InputType;
+ class ConfusionNet;
+
+ //! additional types
+ class LexicalReorderingTable
+ {
+ public:
+ LexicalReorderingTable(const FactorList& f_factors,
+ const FactorList& e_factors,
+ const FactorList& c_factors)
+ : m_FactorsF(f_factors)
+ , m_FactorsE(e_factors)
+ , m_FactorsC(c_factors) { }
+
+ virtual
+ ~LexicalReorderingTable() { }
+
+ public:
+ static
+ LexicalReorderingTable*
+ LoadAvailable(const std::string& filePath,
+ const FactorList& f_factors,
+ const FactorList& e_factors,
+ const FactorList& c_factors);
+
+ virtual
+ Scores
+ GetScore(const Phrase& f, const Phrase& e, const Phrase& c) = 0;
+
+ virtual
+ void
+ InitializeForInput(const InputType&) { /* override for on-demand loading */ };
+
+ virtual
+ void
+ InitializeForInputPhrase(const Phrase&) { }
+
+
+ const FactorList& GetFFactorMask() const { return m_FactorsF; }
+ const FactorList& GetEFactorMask() const { return m_FactorsE; }
+ const FactorList& GetCFactorMask() const { return m_FactorsC; }
+
+ virtual
+ void
+ DbgDump(std::ostream* out) const { *out << "Overwrite in subclass...\n"; };
+ // why is this not a pure virtual function? - UG
+
+ protected:
+ FactorList m_FactorsF;
+ FactorList m_FactorsE;
+ FactorList m_FactorsC;
};
- void DisableCache() {
- m_UseCache = false;
- };
- void ClearCache() {
- if (m_UseCache) {
- m_Cache.clear();
- }
+
+ //! @todo what is this?
+ class LexicalReorderingTableMemory
+ : public LexicalReorderingTable
+ {
+ typedef std::map< std::string, std::vector<float> > TableType;
+ TableType m_Table;
+
+ //implements LexicalReorderingTable saving all scores in one large std::map<> thingy
+ //to be used for non binary tables... uses a LOT of memory
+ public:
+ LexicalReorderingTableMemory(const std::string& filePath,
+ const std::vector<FactorType>& f_factors,
+ const std::vector<FactorType>& e_factors,
+ const std::vector<FactorType>& c_factors);
+
+ virtual
+ ~LexicalReorderingTableMemory();
+
+ public:
+ virtual
+ std::vector<float>
+ GetScore(const Phrase& f, const Phrase& e, const Phrase& c);
+
+ void
+ DbgDump(std::ostream* out) const;
+
+ private:
+
+ std::string
+ MakeKey(const Phrase& f, const Phrase& e, const Phrase& c) const;
+
+ std::string
+ MakeKey(const std::string& f, const std::string& e, const std::string& c) const;
+
+ void
+ LoadFromFile(const std::string& filePath);
};
+
+ class LexicalReorderingTableTree
+ : public LexicalReorderingTable
+ {
+ //implements LexicalReorderingTable using the crafty PDT code...
+
+ typedef std::map< std::string, Candidates > CacheType;
- virtual std::vector<float> GetScore(const Phrase& f, const Phrase& e, const Phrase& c);
-
- virtual void InitializeForInput(const InputType& input);
- virtual void InitializeForInputPhrase(const Phrase& f) {
- ClearCache();
- auxCacheForSrcPhrase(f);
- }
-public:
- static bool Create(std::istream& inFile, const std::string& outFileName);
-private:
- std::string MakeCacheKey(const Phrase& f, const Phrase& e) const;
- IPhrase MakeTableKey(const Phrase& f, const Phrase& e) const;
-
- void Cache(const ConfusionNet& input);
- void Cache(const Sentence& input);
-
- void auxCacheForSrcPhrase(const Phrase& f);
- Scores auxFindScoreForContext(const Candidates& cands, const Phrase& contex);
-private:
- //typedef LexicalReorderingCand CandType;
- typedef std::map< std::string, Candidates > CacheType;
#ifdef WITH_THREADS
- typedef boost::thread_specific_ptr<PrefixTreeMap> TableType;
+ typedef boost::thread_specific_ptr<PrefixTreeMap> TableType;
#else
- typedef std::auto_ptr<PrefixTreeMap> TableType;
+ typedef std::auto_ptr<PrefixTreeMap> TableType;
#endif
- static const int SourceVocId = 0;
- static const int TargetVocId = 1;
-
- bool m_UseCache;
- std::string m_FilePath;
- CacheType m_Cache;
- TableType m_Table;
-};
+ static const int SourceVocId = 0;
+ static const int TargetVocId = 1;
+
+ bool m_UseCache;
+ std::string m_FilePath;
+ CacheType m_Cache;
+ TableType m_Table;
+
+ public:
+
+ static
+ bool
+ Create(std::istream& inFile, const std::string& outFileName);
+
+ LexicalReorderingTableTree(const std::string& filePath,
+ const std::vector<FactorType>& f_factors,
+ const std::vector<FactorType>& e_factors,
+ const std::vector<FactorType>& c_factors);
+
+ ~LexicalReorderingTableTree();
+
+ bool IsCacheEnabled() const { return m_UseCache; };
+ void EnableCache() { m_UseCache = true; };
+ void DisableCache() { m_UseCache = false; };
+ void ClearCache() { if (m_UseCache) m_Cache.clear(); };
+
+ virtual
+ std::vector<float>
+ GetScore(const Phrase& f, const Phrase& e, const Phrase& c);
+
+ virtual
+ void
+ InitializeForInput(const InputType& input);
+
+ virtual
+ void
+ InitializeForInputPhrase(const Phrase& f)
+ {
+ ClearCache();
+ auxCacheForSrcPhrase(f);
+ }
+
+
+ private:
+ std::string
+ MakeCacheKey(const Phrase& f, const Phrase& e) const;
+
+ IPhrase
+ MakeTableKey(const Phrase& f, const Phrase& e) const;
+
+ void
+ Cache(const ConfusionNet& input);
+
+ void
+ Cache(const Sentence& input);
+
+ void
+ auxCacheForSrcPhrase(const Phrase& f);
+
+ Scores
+ auxFindScoreForContext(const Candidates& cands, const Phrase& context);
+
+ };
}
-#endif
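
LoadAvailable() in this header is a factory: it probes what exists on disk and returns whichever concrete table can serve it. A hedged sketch of the idiom; the Probe() helper and the ".binlexr.idx" suffix are invented for illustration:

    #include <fstream>
    #include <string>

    struct Table { virtual ~Table() {} };
    struct TreeTable : Table {};    // stand-in for LexicalReorderingTableTree
    struct MemoryTable : Table {};  // stand-in for LexicalReorderingTableMemory

    static bool Probe(const std::string& path) {
      std::ifstream in(path.c_str());
      return in.good();
    }

    Table* LoadAvailable(const std::string& stem) {
      if (Probe(stem + ".binlexr.idx"))  // hypothetical binary-table suffix
        return new TreeTable;
      return new MemoryTable;            // fall back to the in-memory map
    }

    int main() {
      Table* t = LoadAvailable("reordering-table");
      delete t;  // safe because ~Table() is virtual
      return 0;
    }
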
diff --git a/moses/FF/OSM-Feature/KenOSM.h b/moses/FF/OSM-Feature/KenOSM.h
index a50589edc..03deead07 100644
--- a/moses/FF/OSM-Feature/KenOSM.h
+++ b/moses/FF/OSM-Feature/KenOSM.h
@@ -10,6 +10,8 @@ namespace Moses
class KenOSMBase
{
public:
+ virtual ~KenOSMBase() {}
+
virtual float Score(const lm::ngram::State&, const std::string&,
lm::ngram::State&) const = 0;
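
The virtual destructor added to KenOSMBase is what makes it safe to delete the concrete model wrappers through a base-class pointer; without it, such a delete is undefined behaviour. Minimal illustration with stand-in types:

    #include <iostream>

    struct Base {
      virtual ~Base() { std::cout << "~Base\n"; }
    };
    struct Derived : Base {
      ~Derived() { std::cout << "~Derived\n"; }
    };

    int main() {
      Base* b = new Derived;
      delete b;  // prints ~Derived, then ~Base; non-virtual ~Base would be UB
      return 0;
    }
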
diff --git a/moses/FF/OSM-Feature/OpSequenceModel.cpp b/moses/FF/OSM-Feature/OpSequenceModel.cpp
index 43ed5f346..d4e2f8719 100644
--- a/moses/FF/OSM-Feature/OpSequenceModel.cpp
+++ b/moses/FF/OSM-Feature/OpSequenceModel.cpp
@@ -66,14 +66,14 @@ void OpSequenceModel:: EvaluateInIsolation(const Phrase &source
alignments.push_back(iter->second);
}
- for (int i = 0; i < targetPhrase.GetSize(); i++) {
+ for (size_t i = 0; i < targetPhrase.GetSize(); i++) {
if (targetPhrase.GetWord(i).IsOOV() && sFactor == 0 && tFactor == 0)
myTargetPhrase.push_back("_TRANS_SLF_");
else
myTargetPhrase.push_back(targetPhrase.GetWord(i).GetFactor(tFactor)->GetString().as_string());
}
- for (int i = 0; i < source.GetSize(); i++) {
+ for (size_t i = 0; i < source.GetSize(); i++) {
mySourcePhrase.push_back(source.GetWord(i).GetFactor(sFactor)->GetString().as_string());
}
@@ -97,7 +97,7 @@ FFState* OpSequenceModel::EvaluateWhenApplied(
WordsBitmap myBitmap = bitmap;
const Manager &manager = cur_hypo.GetManager();
const InputType &source = manager.GetSource();
- const Sentence &sourceSentence = static_cast<const Sentence&>(source);
+ // const Sentence &sourceSentence = static_cast<const Sentence&>(source);
osmHypothesis obj;
vector <string> mySourcePhrase;
vector <string> myTargetPhrase;
@@ -124,7 +124,7 @@ FFState* OpSequenceModel::EvaluateWhenApplied(
int startIndex = sourceRange.GetStartPos();
int endIndex = sourceRange.GetEndPos();
const AlignmentInfo &align = cur_hypo.GetCurrTargetPhrase().GetAlignTerm();
- osmState * statePtr;
+ // osmState * statePtr;
vector <int> alignments;
@@ -149,7 +149,7 @@ FFState* OpSequenceModel::EvaluateWhenApplied(
// cerr<<mySourcePhrase[i]<<endl;
}
- for (int i = 0; i < target.GetSize(); i++) {
+ for (size_t i = 0; i < target.GetSize(); i++) {
if (target.GetWord(i).IsOOV() && sFactor == 0 && tFactor == 0)
myTargetPhrase.push_back("_TRANS_SLF_");
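
The int to size_t changes in the loops above fix signed/unsigned comparisons: GetSize() returns a size_t, so an int counter forces an implicit conversion and a -Wsign-compare warning. Tiny illustration:

    #include <vector>

    int main() {
      std::vector<int> v(3, 0);
      // for (int i = 0; i < v.size(); i++)      // warns: int vs std::size_t
      for (std::size_t i = 0; i < v.size(); i++)  // operand types agree
        v[i] = static_cast<int>(i);
      return 0;
    }
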
diff --git a/moses/FF/StatefulFeatureFunction.h b/moses/FF/StatefulFeatureFunction.h
index 08b7c607d..f54f3a746 100644
--- a/moses/FF/StatefulFeatureFunction.h
+++ b/moses/FF/StatefulFeatureFunction.h
@@ -17,7 +17,9 @@ class StatefulFeatureFunction: public FeatureFunction
static std::vector<const StatefulFeatureFunction*> m_statefulFFs;
public:
- static const std::vector<const StatefulFeatureFunction*>& GetStatefulFeatureFunctions() {
+ static const std::vector<const StatefulFeatureFunction*>&
+ GetStatefulFeatureFunctions()
+ {
return m_statefulFFs;
}
diff --git a/moses/FF/VW/VW.h b/moses/FF/VW/VW.h
index 8e113c61f..9be44c8b6 100644
--- a/moses/FF/VW/VW.h
+++ b/moses/FF/VW/VW.h
@@ -155,11 +155,6 @@ public:
const WordsRange &sourceRange = translationOptionList.Get(0)->GetSourceWordsRange();
const InputPath &inputPath = translationOptionList.Get(0)->GetInputPath();
- for(size_t i = 0; i < sourceFeatures.size(); ++i)
- (*sourceFeatures[i])(input, inputPath, sourceRange, classifier);
-
- const std::vector<VWFeatureBase*>& targetFeatures = VWFeatureBase::GetTargetFeatures(GetScoreProducerDescription());
-
// optionally update translation options using leave-one-out
std::vector<bool> keep = (m_leaveOneOut.size() > 0)
? LeaveOneOut(translationOptionList)
@@ -168,15 +163,16 @@ public:
std::vector<float> losses(translationOptionList.size());
std::vector<float>::iterator iterLoss;
TranslationOptionList::const_iterator iterTransOpt;
+ std::vector<bool>::const_iterator iterKeep;
if (m_train) {
// check which translation options are correct in advance
bool seenCorrect = false;
- for(iterTransOpt = translationOptionList.begin(), iterLoss = losses.begin() ;
- iterTransOpt != translationOptionList.end() ; ++iterTransOpt, ++iterLoss) {
+ for(iterTransOpt = translationOptionList.begin(), iterLoss = losses.begin(), iterKeep = keep.begin() ;
+ iterTransOpt != translationOptionList.end() ; ++iterTransOpt, ++iterLoss, ++iterKeep) {
bool isCorrect = IsCorrectTranslationOption(**iterTransOpt);
*iterLoss = isCorrect ? 0.0 : 1.0;
- if (isCorrect) seenCorrect = true;
+ if (isCorrect && *iterKeep) seenCorrect = true;
}
// do not train if there are no positive examples
@@ -186,7 +182,11 @@ public:
}
}
- std::vector<bool>::const_iterator iterKeep;
+ for(size_t i = 0; i < sourceFeatures.size(); ++i)
+ (*sourceFeatures[i])(input, inputPath, sourceRange, classifier);
+
+ const std::vector<VWFeatureBase*>& targetFeatures = VWFeatureBase::GetTargetFeatures(GetScoreProducerDescription());
+
for(iterTransOpt = translationOptionList.begin(), iterLoss = losses.begin(), iterKeep = keep.begin() ;
iterTransOpt != translationOptionList.end() ; ++iterTransOpt, ++iterLoss) {
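
The VW.h hunks reorder the training path so that an option only counts as a positive example when leave-one-out actually keeps it, and so that the source/target feature extraction runs only when training will proceed. A rough sketch of that gating with made-up data:

    #include <vector>

    int main() {
      // keep[i]: leave-one-out retained option i; correct[i]: it matches the reference
      std::vector<bool> keep, correct;
      keep.push_back(true);  correct.push_back(false);
      keep.push_back(false); correct.push_back(true);   // correct but dropped
      keep.push_back(true);  correct.push_back(false);

      bool seenCorrect = false;
      for (std::size_t i = 0; i < keep.size(); ++i)
        if (correct[i] && keep[i]) seenCorrect = true;  // gated by keep[]

      if (!seenCorrect) return 0;  // no positive examples: skip extraction
      // ... extract features and train the classifier ...
      return 0;
    }
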
diff --git a/moses/LM/Ken.cpp b/moses/LM/Ken.cpp
index 6343ef4d0..fd37ce45b 100644
--- a/moses/LM/Ken.cpp
+++ b/moses/LM/Ken.cpp
@@ -146,6 +146,8 @@ template <class Model> LanguageModelKen<Model>::LanguageModelKen(const std::stri
:LanguageModel(line)
,m_factorType(factorType)
{
+ ReadParameters();
+
lm::ngram::Config config;
IFVERBOSE(1) {
config.messages = &std::cerr;
@@ -441,15 +443,32 @@ bool LanguageModelKen<Model>::IsUseable(const FactorMask &mask) const
return ret;
}
-LanguageModel *ConstructKenLM(const std::string &line)
+
+/* Instantiate LanguageModelKen here. Tells the compiler to generate code
+ * for the instantiations' non-inline member functions in this file.
+ * Otherwise, depending on the compiler, those functions may not be present
+ * at link time.
+ */
+template class LanguageModelKen<lm::ngram::ProbingModel>;
+template class LanguageModelKen<lm::ngram::RestProbingModel>;
+template class LanguageModelKen<lm::ngram::TrieModel>;
+template class LanguageModelKen<lm::ngram::ArrayTrieModel>;
+template class LanguageModelKen<lm::ngram::QuantTrieModel>;
+template class LanguageModelKen<lm::ngram::QuantArrayTrieModel>;
+
+
+LanguageModel *ConstructKenLM(const std::string &lineOrig)
{
FactorType factorType = 0;
string filePath;
bool lazy = false;
- util::TokenIter<util::SingleCharacter, true> argument(line, ' ');
+ util::TokenIter<util::SingleCharacter, true> argument(lineOrig, ' ');
++argument; // KENLM
+ stringstream line;
+ line << "KENLM";
+
for (; argument; ++argument) {
const char *equals = std::find(argument->data(), argument->data() + argument->size(), '=');
UTIL_THROW_IF2(equals == argument->data() + argument->size(),
@@ -464,14 +483,13 @@ LanguageModel *ConstructKenLM(const std::string &line)
filePath.assign(value.data(), value.size());
} else if (name == "lazyken") {
lazy = boost::lexical_cast<bool>(value);
- } else if (name == "name") {
- // that's ok. do nothing, passes onto LM constructor
} else {
- UTIL_THROW2("Unknown KenLM argument " << name);
+ // pass to base class to interpret
+ line << " " << name << "=" << value;
}
}
- return ConstructKenLM(line, filePath, factorType, lazy);
+ return ConstructKenLM(line.str(), filePath, factorType, lazy);
}
LanguageModel *ConstructKenLM(const std::string &line, const std::string &file, FactorType factorType, bool lazy)
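
The explicit instantiations added above are the standard way to keep a template's member definitions out of the header: each "template class ...;" line forces the compiler to emit that specialization's symbols in this translation unit, so other files can link against them. A compilable single-file sketch with stand-in types:

    // header part (sketch): declaration only
    template <class Model> struct LM {
      float Score(const Model& m) const;
    };

    // .cpp part (sketch): definition plus explicit instantiations
    template <class Model>
    float LM<Model>::Score(const Model&) const { return 0.0f; }

    struct ProbingModel {};
    struct TrieModel {};
    template struct LM<ProbingModel>;  // emit LM<ProbingModel>::Score here
    template struct LM<TrieModel>;     // emit LM<TrieModel>::Score here

    int main() {
      LM<TrieModel> lm;
      return static_cast<int>(lm.Score(TrieModel()));
    }
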
diff --git a/moses/Parameter.cpp b/moses/Parameter.cpp
index cd9d3b289..6052624cc 100644
--- a/moses/Parameter.cpp
+++ b/moses/Parameter.cpp
@@ -25,6 +25,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <fstream>
#include <sstream>
#include <algorithm>
+#include <boost/algorithm/string/predicate.hpp>
#include "Parameter.h"
#include "Util.h"
#include "InputFileStream.h"
@@ -32,6 +33,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "util/exception.hh"
using namespace std;
+using namespace boost::algorithm;
namespace Moses
{
@@ -270,8 +272,9 @@ bool Parameter::isOption(const char* token)
if (! token) return false;
std::string tokenString(token);
size_t length = tokenString.size();
- if (length > 0 && tokenString.substr(0,1) != "-") return false;
- if (length > 1 && tokenString.substr(1,1).find_first_not_of("0123456789") == 0) return true;
+ if (length <= 1) return false;
+ if (!starts_with(tokenString, "-")) return false;
+ if (tokenString.substr(1,1).find_first_not_of("0123456789") == 0) return true;
return false;
}
@@ -975,7 +978,7 @@ void Parameter::WeightOverwrite()
for (size_t i = 0; i < toks.size(); ++i) {
const string &tok = toks[i];
- if (tok.substr(tok.size() - 1, 1) == "=") {
+ if (ends_with(tok, "=")) {
// start of new feature
if (name != "") {
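
starts_with()/ends_with() from Boost.StringAlgo replace the substr() arithmetic here and in several other files in this commit. Besides reading better, they are safe on strings shorter than the probe, whereas tok.substr(tok.size() - 1, 1) throws on an empty string. Minimal usage:

    #include <boost/algorithm/string/predicate.hpp>
    #include <cassert>
    #include <string>

    int main() {
      std::string tok = "LexicalReordering0=";
      assert(boost::algorithm::ends_with(tok, "="));
      assert(boost::algorithm::starts_with(std::string("-threads"), "-"));
      assert(!boost::algorithm::ends_with(std::string(), "="));  // no throw
      return 0;
    }
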
diff --git a/moses/ScoreComponentCollection.cpp b/moses/ScoreComponentCollection.cpp
index b8c93c193..a1c864692 100644
--- a/moses/ScoreComponentCollection.cpp
+++ b/moses/ScoreComponentCollection.cpp
@@ -1,5 +1,6 @@
// $Id$
#include <vector>
+#include <boost/algorithm/string/predicate.hpp>
#include "util/exception.hh"
#include "ScoreComponentCollection.h"
#include "StaticData.h"
@@ -7,6 +8,7 @@
#include "moses/FF/StatefulFeatureFunction.h"
using namespace std;
+using namespace boost::algorithm;
namespace Moses
{
@@ -87,7 +89,7 @@ void ScoreComponentCollection::MultiplyEquals(const FeatureFunction* sp, float s
for(FVector::FNVmap::const_iterator i = m_scores.cbegin(); i != m_scores.cend(); i++) {
std::stringstream name;
name << i->first;
- if (name.str().substr( 0, prefix.length() ).compare( prefix ) == 0)
+ if (starts_with(name.str(), prefix))
m_scores[i->first] = i->second * scalar;
}
}
@@ -100,7 +102,7 @@ size_t ScoreComponentCollection::GetNumberWeights(const FeatureFunction* sp)
for(FVector::FNVmap::const_iterator i = m_scores.cbegin(); i != m_scores.cend(); i++) {
std::stringstream name;
name << i->first;
- if (name.str().substr( 0, prefix.length() ).compare( prefix ) == 0)
+ if (starts_with(name.str(), prefix))
weights++;
}
return weights;
@@ -285,7 +287,7 @@ FVector ScoreComponentCollection::GetVectorForProducer(const FeatureFunction* sp
for(FVector::FNVmap::const_iterator i = m_scores.cbegin(); i != m_scores.cend(); i++) {
std::stringstream name;
name << i->first;
- if (name.str().substr( 0, prefix.length() ).compare( prefix ) == 0)
+ if (starts_with(name.str(), prefix))
fv[i->first] = i->second;
}
return fv;
diff --git a/moses/SearchCubePruning.cpp b/moses/SearchCubePruning.cpp
index 2595e35ab..6c981276e 100644
--- a/moses/SearchCubePruning.cpp
+++ b/moses/SearchCubePruning.cpp
@@ -214,47 +214,49 @@ void SearchCubePruning::CreateForwardTodos(HypothesisStackCubePruning &stack)
}
}
-void SearchCubePruning::CreateForwardTodos(const WordsBitmap &bitmap, const WordsRange &range, BitmapContainer &bitmapContainer)
+void
+SearchCubePruning::
+CreateForwardTodos(WordsBitmap const& bitmap, WordsRange const& range,
+ BitmapContainer& bitmapContainer)
{
WordsBitmap newBitmap = bitmap;
newBitmap.SetValue(range.GetStartPos(), range.GetEndPos(), true);
-
+
size_t numCovered = newBitmap.GetNumWordsCovered();
- const TranslationOptionList &transOptList = m_transOptColl.GetTranslationOptionList(range);
+ const TranslationOptionList* transOptList;
+ transOptList = m_transOptColl.GetTranslationOptionList(range);
const SquareMatrix &futureScore = m_transOptColl.GetFutureScore();
- if (transOptList.size() > 0) {
- HypothesisStackCubePruning &newStack = *static_cast<HypothesisStackCubePruning*>(m_hypoStackColl[numCovered]);
- newStack.SetBitmapAccessor(newBitmap, newStack, range, bitmapContainer, futureScore, transOptList);
+ if (transOptList && transOptList->size() > 0) {
+ HypothesisStackCubePruning& newStack
+ = *static_cast<HypothesisStackCubePruning*>(m_hypoStackColl[numCovered]);
+ newStack.SetBitmapAccessor(newBitmap, newStack, range, bitmapContainer,
+ futureScore, *transOptList);
}
}
-
-bool SearchCubePruning::CheckDistortion(const WordsBitmap &hypoBitmap, const WordsRange &range) const
+
+bool
+SearchCubePruning::
+CheckDistortion(const WordsBitmap &hypoBitmap, const WordsRange &range) const
{
// since we check for reordering limits, it's good to have that limit handy
int maxDistortion = StaticData::Instance().GetMaxDistortion();
-
+ if (maxDistortion < 0) return true;
+
// if there are reordering limits, make sure they are not violated
// the coverage bitmap is handy here (and the position of the first gap)
- const size_t hypoFirstGapPos = hypoBitmap.GetFirstGapPos()
- , startPos = range.GetStartPos()
- , endPos = range.GetEndPos();
+ size_t const startPos = range.GetStartPos();
+ size_t const endPos = range.GetEndPos();
- // if reordering constraints are used (--monotone-at-punctuation or xml), check if passes all
- if (! m_source.GetReorderingConstraint().Check( hypoBitmap, startPos, endPos ) ) {
+ // if reordering constraints are used (--monotone-at-punctuation or xml),
+ // check that the hypothesis passes all of them
+ if (!m_source.GetReorderingConstraint().Check(hypoBitmap, startPos, endPos))
return false;
- }
-
- // no limit of reordering: no problem
- if (maxDistortion < 0) {
- return true;
- }
-
- bool leftMostEdge = (hypoFirstGapPos == startPos);
+
+ size_t const hypoFirstGapPos = hypoBitmap.GetFirstGapPos();
// any length extension is okay if starting at left-most edge
- if (leftMostEdge) {
- return true;
- }
+ if (hypoFirstGapPos == startPos) return true;
+
// starting somewhere other than left-most edge, use caution
// the basic idea is this: we would like to translate a phrase starting
// from a position further right than the left-most open gap. The
@@ -264,20 +266,17 @@ bool SearchCubePruning::CheckDistortion(const WordsBitmap &hypoBitmap, const Wor
// hypothesis starting at the left-most edge). If this value is less than
// the distortion limit, we don't allow this extension to be made.
WordsRange bestNextExtension(hypoFirstGapPos, hypoFirstGapPos);
- int required_distortion =
- m_source.ComputeDistortionDistance(range, bestNextExtension);
-
- if (required_distortion > maxDistortion) {
- return false;
- }
- return true;
+ return (m_source.ComputeDistortionDistance(range, bestNextExtension)
+ <= maxDistortion);
}
/**
* Find best hypothesis on the last stack.
* This is the end point of the best translation, which can be traced back from here
*/
-const Hypothesis *SearchCubePruning::GetBestHypothesis() const
+Hypothesis const*
+SearchCubePruning::
+GetBestHypothesis() const
{
// const HypothesisStackCubePruning &hypoColl = m_hypoStackColl.back();
const HypothesisStack &hypoColl = *m_hypoStackColl.back();
@@ -287,7 +286,9 @@ const Hypothesis *SearchCubePruning::GetBestHypothesis() const
/**
* Logging of hypothesis stack sizes
*/
-void SearchCubePruning::OutputHypoStackSize()
+void
+SearchCubePruning::
+OutputHypoStackSize()
{
std::vector < HypothesisStack* >::const_iterator iterStack = m_hypoStackColl.begin();
TRACE_ERR( "Stack sizes: " << (int)(*iterStack)->size());
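
CheckDistortion() above now hoists the maxDistortion < 0 early-out and folds the final test into one comparison. For plain sentence input, the distance being compared should be |start(next) - end(prev) - 1| (an assumption from reading the call sites, not a quoted Moses formula); a small worked example:

    #include <cstdlib>
    #include <iostream>

    // assumed sentence-input distortion: |start(next) - end(prev) - 1|
    int DistortionDistance(int prevEnd, int nextStart) {
      return std::abs(nextStart - prevEnd - 1);
    }

    int main() {
      std::cout << DistortionDistance(3, 4) << "\n";  // monotone step: 0
      std::cout << DistortionDistance(7, 2) << "\n";  // jump back over a gap: 6
      return 0;
    }
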
diff --git a/moses/SearchNormal.cpp b/moses/SearchNormal.cpp
index b3e647299..786b554c6 100644
--- a/moses/SearchNormal.cpp
+++ b/moses/SearchNormal.cpp
@@ -3,6 +3,8 @@
#include "SearchNormal.h"
#include "SentenceStats.h"
+#include <boost/foreach.hpp>
+
using namespace std;
namespace Moses
@@ -104,136 +106,140 @@ void SearchNormal::Decode()
* violation of reordering limits.
* \param hypothesis hypothesis to be expanded upon
*/
-void SearchNormal::ProcessOneHypothesis(const Hypothesis &hypothesis)
+void
+SearchNormal::
+ProcessOneHypothesis(const Hypothesis &hypothesis)
{
// since we check for reordering limits, it's good to have that limit handy
int maxDistortion = StaticData::Instance().GetMaxDistortion();
bool isWordLattice = StaticData::Instance().GetInputType() == WordLatticeInput;
+ const WordsBitmap hypoBitmap = hypothesis.GetWordsBitmap();
+ const size_t hypoFirstGapPos = hypoBitmap.GetFirstGapPos();
+ size_t const sourceSize = m_source.GetSize();
+
+ ReorderingConstraint const&
+ ReoConstraint = m_source.GetReorderingConstraint();
+
// no limit of reordering: only check for overlap
if (maxDistortion < 0) {
- const WordsBitmap hypoBitmap = hypothesis.GetWordsBitmap();
- const size_t hypoFirstGapPos = hypoBitmap.GetFirstGapPos()
- , sourceSize = m_source.GetSize();
-
- for (size_t startPos = hypoFirstGapPos ; startPos < sourceSize ; ++startPos) {
- size_t maxSize = sourceSize - startPos;
- size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();
- maxSize = (maxSize < maxSizePhrase) ? maxSize : maxSizePhrase;
- for (size_t endPos = startPos ; endPos < startPos + maxSize ; ++endPos) {
- // basic checks
- // there have to be translation options
- if (m_transOptColl.GetTranslationOptionList(WordsRange(startPos, endPos)).size() == 0 ||
- // no overlap with existing words
- hypoBitmap.Overlap(WordsRange(startPos, endPos)) ||
- // specified reordering constraints (set with -monotone-at-punctuation or xml)
- !m_source.GetReorderingConstraint().Check( hypoBitmap, startPos, endPos ) ) {
- continue;
- }
-
- //TODO: does this method include incompatible WordLattice hypotheses?
- ExpandAllHypotheses(hypothesis, startPos, endPos);
+ for (size_t startPos = hypoFirstGapPos ; startPos < sourceSize ; ++startPos)
+ {
+ TranslationOptionList const* tol;
+ size_t endPos = startPos;
+ for (tol = m_transOptColl.GetTranslationOptionList(startPos, endPos);
+ tol && endPos < sourceSize;
+ tol = m_transOptColl.GetTranslationOptionList(startPos, ++endPos))
+ {
+ if (tol->size() == 0
+ || hypoBitmap.Overlap(WordsRange(startPos, endPos))
+ || !ReoConstraint.Check(hypoBitmap, startPos, endPos))
+ { continue; }
+
+ //TODO: does this method include incompatible WordLattice hypotheses?
+ ExpandAllHypotheses(hypothesis, startPos, endPos);
+ }
}
- }
-
return; // done with special case (no reordering limit)
}
- // if there are reordering limits, make sure it is not violated
- // the coverage bitmap is handy here (and the position of the first gap)
- const WordsBitmap hypoBitmap = hypothesis.GetWordsBitmap();
- const size_t hypoFirstGapPos = hypoBitmap.GetFirstGapPos()
- , sourceSize = m_source.GetSize();
-
- // MAIN LOOP. go through each possible range
- for (size_t startPos = hypoFirstGapPos ; startPos < sourceSize ; ++startPos) {
- // don't bother expanding phrases if the first position is already taken
- if(hypoBitmap.GetValue(startPos))
- continue;
-
- WordsRange prevRange = hypothesis.GetCurrSourceWordsRange();
-
- size_t maxSize = sourceSize - startPos;
- size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();
- maxSize = (maxSize < maxSizePhrase) ? maxSize : maxSizePhrase;
- size_t closestLeft = hypoBitmap.GetEdgeToTheLeftOf(startPos);
- if (isWordLattice) {
- // first question: is there a path from the closest translated word to the left
- // of the hypothesized extension to the start of the hypothesized extension?
- // long version: is there anything to our left? is it farther left than where we're starting anyway? can we get to it?
- // closestLeft is exclusive: a value of 3 means 2 is covered, our arc is currently ENDING at 3 and can start at 3 implicitly
- if (closestLeft != 0 && closestLeft != startPos && !m_source.CanIGetFromAToB(closestLeft, startPos)) {
- continue;
- }
- if (prevRange.GetStartPos() != NOT_FOUND &&
- prevRange.GetStartPos() > startPos && !m_source.CanIGetFromAToB(startPos, prevRange.GetStartPos())) {
- continue;
- }
- }
+ // There are reordering limits. Make sure they are not violated.
- WordsRange currentStartRange(startPos, startPos);
- if(m_source.ComputeDistortionDistance(prevRange, currentStartRange) > maxDistortion)
- continue;
-
- for (size_t endPos = startPos ; endPos < startPos + maxSize ; ++endPos) {
- // basic checks
- WordsRange extRange(startPos, endPos);
- // there have to be translation options
- if (m_transOptColl.GetTranslationOptionList(extRange).size() == 0 ||
- // no overlap with existing words
- hypoBitmap.Overlap(extRange) ||
- // specified reordering constraints (set with -monotone-at-punctuation or xml)
- !m_source.GetReorderingConstraint().Check( hypoBitmap, startPos, endPos ) || //
- // connection in input word lattice
- (isWordLattice && !m_source.IsCoveragePossible(extRange))) {
- continue;
- }
-
- // ask second question here:
- // we already know we can get to our starting point from the closest thing to the left. We now ask the follow up:
- // can we get from our end to the closest thing on the right?
- // long version: is anything to our right? is it farther right than our (inclusive) end? can our end reach it?
- bool leftMostEdge = (hypoFirstGapPos == startPos);
-
- // closest right definition:
- size_t closestRight = hypoBitmap.GetEdgeToTheRightOf(endPos);
- if (isWordLattice) {
- //if (!leftMostEdge && closestRight != endPos && closestRight != sourceSize && !m_source.CanIGetFromAToB(endPos, closestRight + 1)) {
- if (closestRight != endPos && ((closestRight + 1) < sourceSize) && !m_source.CanIGetFromAToB(endPos + 1, closestRight + 1)) {
- continue;
- }
- }
+ WordsRange prevRange = hypothesis.GetCurrSourceWordsRange();
+ for (size_t startPos = hypoFirstGapPos ; startPos < sourceSize ; ++startPos)
+ {
+
+ // don't bother expanding phrases if the first position is already taken
+ if(hypoBitmap.GetValue(startPos)) continue;
- // any length extension is okay if starting at left-most edge
- if (leftMostEdge) {
- ExpandAllHypotheses(hypothesis, startPos, endPos);
- }
- // starting somewhere other than left-most edge, use caution
- else {
- // the basic idea is this: we would like to translate a phrase starting
- // from a position further right than the left-most open gap. The
- // distortion penalty for the following phrase will be computed relative
- // to the ending position of the current extension, so we ask now what
- // its maximum value will be (which will always be the value of the
- // hypothesis starting at the left-most edge). If this value is less than
- // the distortion limit, we don't allow this extension to be made.
- WordsRange bestNextExtension(hypoFirstGapPos, hypoFirstGapPos);
- int required_distortion =
- m_source.ComputeDistortionDistance(extRange, bestNextExtension);
-
- if (required_distortion > maxDistortion) {
- continue;
- }
-
- // everything is fine, we're good to go
- ExpandAllHypotheses(hypothesis, startPos, endPos);
-
- }
+ size_t maxSize = sourceSize - startPos;
+ size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();
+ maxSize = (maxSize < maxSizePhrase) ? maxSize : maxSizePhrase;
+ size_t closestLeft = hypoBitmap.GetEdgeToTheLeftOf(startPos);
+
+ if (isWordLattice)
+ {
+ // first question: is there a path from the closest translated word to the left
+ // of the hypothesized extension to the start of the hypothesized extension?
+ // long version:
+ // - is there anything to our left?
+ // - is it farther left than where we're starting anyway?
+ // - can we get to it?
+
+ // closestLeft is exclusive: a value of 3 means 2 is covered, our
+ // arc is currently ENDING at 3 and can start at 3 implicitly
+ if (closestLeft != 0 && closestLeft != startPos
+ && !m_source.CanIGetFromAToB(closestLeft, startPos))
+ continue;
+
+ if (prevRange.GetStartPos() != NOT_FOUND &&
+ prevRange.GetStartPos() > startPos &&
+ !m_source.CanIGetFromAToB(startPos, prevRange.GetStartPos()))
+ continue;
+ }
+
+ WordsRange currentStartRange(startPos, startPos);
+ if(m_source.ComputeDistortionDistance(prevRange, currentStartRange)
+ > maxDistortion)
+ continue;
+
+ TranslationOptionList const* tol;
+ size_t endPos = startPos;
+ for (tol = m_transOptColl.GetTranslationOptionList(startPos, endPos);
+ tol && endPos < sourceSize;
+ tol = m_transOptColl.GetTranslationOptionList(startPos, ++endPos))
+ {
+ WordsRange extRange(startPos, endPos);
+ if (tol->size() == 0
+ || hypoBitmap.Overlap(extRange)
+ || !ReoConstraint.Check(hypoBitmap, startPos, endPos)
+ || (isWordLattice && !m_source.IsCoveragePossible(extRange)))
+ { continue; }
+
+ // ask second question here: we already know we can get to our
+ // starting point from the closest thing to the left. We now ask the
+ // follow up: can we get from our end to the closest thing on the
+ // right?
+ //
+ // long version: is anything to our right? is it farther
+ // right than our (inclusive) end? can our end reach it?
+ bool isLeftMostEdge = (hypoFirstGapPos == startPos);
+
+ size_t closestRight = hypoBitmap.GetEdgeToTheRightOf(endPos);
+ if (isWordLattice) {
+ if (closestRight != endPos
+ && ((closestRight + 1) < sourceSize)
+ && !m_source.CanIGetFromAToB(endPos + 1, closestRight + 1))
+ { continue; }
+ }
+
+ if (isLeftMostEdge)
+ { // any length extension is okay if starting at left-most edge
+ ExpandAllHypotheses(hypothesis, startPos, endPos);
+ }
+ else // starting somewhere other than left-most edge, use caution
+ {
+ // the basic idea is this: we would like to translate a phrase
+ // starting from a position further right than the left-most
+ // open gap. The distortion penalty for the following phrase
+ // will be computed relative to the ending position of the
+ // current extension, so we ask now what its maximum value will
+ // be (which will always be the value of the hypothesis starting
+ // at the left-most edge). If this value is less than the
+ // distortion limit, we don't allow this extension to be made.
+ WordsRange bestNextExtension(hypoFirstGapPos, hypoFirstGapPos);
+
+ if (m_source.ComputeDistortionDistance(extRange, bestNextExtension)
+ > maxDistortion) continue;
+
+ // everything is fine, we're good to go
+ ExpandAllHypotheses(hypothesis, startPos, endPos);
+ }
+ }
}
- }
}
-
+
/**
* Expand a hypothesis given a list of translation options
@@ -242,7 +248,9 @@ void SearchNormal::ProcessOneHypothesis(const Hypothesis &hypothesis)
* \param endPos last word position of span covered
*/
-void SearchNormal::ExpandAllHypotheses(const Hypothesis &hypothesis, size_t startPos, size_t endPos)
+void
+SearchNormal::
+ExpandAllHypotheses(const Hypothesis &hypothesis, size_t startPos, size_t endPos)
{
// early discarding: check if hypothesis is too bad to build
// this idea is explained in (Moore&Quirk, MT Summit 2007)
@@ -250,15 +258,19 @@ void SearchNormal::ExpandAllHypotheses(const Hypothesis &hypothesis, size_t star
if (StaticData::Instance().UseEarlyDiscarding()) {
// expected score is based on score of current hypothesis
expectedScore = hypothesis.GetScore();
-
+
// add new future score estimate
- expectedScore += m_transOptColl.GetFutureScore().CalcFutureScore( hypothesis.GetWordsBitmap(), startPos, endPos );
+ expectedScore +=
+ m_transOptColl.GetFutureScore()
+ .CalcFutureScore(hypothesis.GetWordsBitmap(), startPos, endPos);
}
-
+
// loop through all translation options
- const TranslationOptionList &transOptList = m_transOptColl.GetTranslationOptionList(WordsRange(startPos, endPos));
+ const TranslationOptionList* tol
+ = m_transOptColl.GetTranslationOptionList(startPos, endPos);
+ if (!tol) return;
TranslationOptionList::const_iterator iter;
- for (iter = transOptList.begin() ; iter != transOptList.end() ; ++iter) {
+ for (iter = tol->begin() ; iter != tol->end() ; ++iter) {
ExpandHypothesis(hypothesis, **iter, expectedScore);
}
}
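
The deeper change in this file is the same one as in SearchCubePruning.cpp: GetTranslationOptionList(startPos, endPos) now returns a pointer that is NULL once no list exists for the span, and the rewritten loops use that as their termination test while walking endPos rightward. A self-contained sketch of the idiom; the span cap of 7 is invented:

    #include <cstddef>

    struct TranslationOptionList {
      std::size_t size() const { return 1; }
    };

    const TranslationOptionList* GetTranslationOptionList(std::size_t start,
                                                          std::size_t end) {
      static TranslationOptionList list;
      return (end - start >= 7) ? 0 : &list;  // hypothetical phrase-length cap
    }

    int main() {
      const std::size_t sourceSize = 10;
      for (std::size_t startPos = 0; startPos < sourceSize; ++startPos) {
        const TranslationOptionList* tol;
        std::size_t endPos = startPos;
        for (tol = GetTranslationOptionList(startPos, endPos);
             tol && endPos < sourceSize;
             tol = GetTranslationOptionList(startPos, ++endPos)) {
          if (tol->size() == 0) continue;
          // ... overlap and reordering-constraint checks, then expansion ...
        }
      }
      return 0;
    }
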
diff --git a/moses/Sentence.cpp b/moses/Sentence.cpp
index 58d650aa3..a937f21e3 100644
--- a/moses/Sentence.cpp
+++ b/moses/Sentence.cpp
@@ -43,7 +43,7 @@ Sentence::Sentence()
, InputType()
{
const StaticData& staticData = StaticData::Instance();
- if (staticData.IsChart()) {
+ if (staticData.IsSyntax()) {
m_defaultLabelSet.insert(StaticData::Instance().GetInputDefaultNonTerminal());
}
}
@@ -168,7 +168,7 @@ int Sentence::Read(std::istream& in,const std::vector<FactorType>& factorOrder)
if (staticData.GetXmlInputType() != XmlPassThrough) {
int offset = 0;
- if (staticData.IsChart()) {
+ if (staticData.IsSyntax()) {
offset = 1;
}
@@ -188,7 +188,7 @@ int Sentence::Read(std::istream& in,const std::vector<FactorType>& factorOrder)
// placeholders
ProcessPlaceholders(placeholders);
- if (staticData.IsChart()) {
+ if (staticData.IsSyntax()) {
InitStartEndWord();
}
diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp
index ce6d76502..8709d758f 100644
--- a/moses/StaticData.cpp
+++ b/moses/StaticData.cpp
@@ -21,6 +21,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
#include <string>
+#include <boost/algorithm/string/predicate.hpp>
#include "moses/FF/Factory.h"
#include "TypeDef.h"
@@ -50,6 +51,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#endif
using namespace std;
+using namespace boost::algorithm;
namespace Moses
{
@@ -58,15 +60,15 @@ bool g_mosesDebug = false;
StaticData StaticData::s_instance;
StaticData::StaticData()
- :m_sourceStartPosMattersForRecombination(false)
- ,m_inputType(SentenceInput)
- ,m_onlyDistinctNBest(false)
- ,m_needAlignmentInfo(false)
- ,m_lmEnableOOVFeature(false)
- ,m_isAlwaysCreateDirectTranslationOption(false)
- ,m_currentWeightSetting("default")
- ,m_requireSortingAfterSourceContext(false)
- ,m_treeStructure(NULL)
+ : m_sourceStartPosMattersForRecombination(false)
+ , m_requireSortingAfterSourceContext(false)
+ , m_inputType(SentenceInput)
+ , m_onlyDistinctNBest(false)
+ , m_needAlignmentInfo(false)
+ , m_lmEnableOOVFeature(false)
+ , m_isAlwaysCreateDirectTranslationOption(false)
+ , m_currentWeightSetting("default")
+ , m_treeStructure(NULL)
{
m_xmlBrackets.first="<";
m_xmlBrackets.second=">";
@@ -110,7 +112,7 @@ bool StaticData::LoadData(Parameter *parameter)
// to cube or not to cube
m_parameter->SetParameter(m_searchAlgorithm, "search-algorithm", Normal);
- if (IsChart())
+ if (IsSyntax())
LoadChartDecodingParameters();
// input type has to be specified BEFORE loading the phrase tables!
@@ -696,7 +698,7 @@ void StaticData::LoadDecodeGraphsOld(const vector<string> &mappingVector, const
UTIL_THROW_IF2(decodeStep == NULL, "Null decode step");
if (m_decodeGraphs.size() < decodeGraphInd + 1) {
DecodeGraph *decodeGraph;
- if (IsChart()) {
+ if (IsSyntax()) {
size_t maxChartSpan = (decodeGraphInd < maxChartSpans.size()) ? maxChartSpans[decodeGraphInd] : DEFAULT_MAX_CHART_SPAN;
VERBOSE(1,"max-chart-span: " << maxChartSpans[decodeGraphInd] << endl);
decodeGraph = new DecodeGraph(m_decodeGraphs.size(), maxChartSpan);
@@ -763,7 +765,7 @@ void StaticData::LoadDecodeGraphsNew(const std::vector<std::string> &mappingVect
UTIL_THROW_IF2(decodeStep == NULL, "Null decode step");
if (m_decodeGraphs.size() < decodeGraphInd + 1) {
DecodeGraph *decodeGraph;
- if (IsChart()) {
+ if (IsSyntax()) {
size_t maxChartSpan = (decodeGraphInd < maxChartSpans.size()) ? maxChartSpans[decodeGraphInd] : DEFAULT_MAX_CHART_SPAN;
VERBOSE(1,"max-chart-span: " << maxChartSpans[decodeGraphInd] << endl);
decodeGraph = new DecodeGraph(m_decodeGraphs.size(), maxChartSpan);
@@ -1176,7 +1178,7 @@ void StaticData::ResetWeights(const std::string &denseWeights, const std::string
for (size_t i = 0; i < toks.size(); ++i) {
const string &tok = toks[i];
- if (tok.substr(tok.size() - 1, 1) == "=") {
+ if (ends_with(tok, "=")) {
// start of new feature
if (name != "") {
diff --git a/moses/StaticData.h b/moses/StaticData.h
index 193f79aad..d9a96aaa3 100644
--- a/moses/StaticData.h
+++ b/moses/StaticData.h
@@ -436,8 +436,13 @@ public:
SearchAlgorithm GetSearchAlgorithm() const {
return m_searchAlgorithm;
}
- bool IsChart() const {
- return m_searchAlgorithm == CYKPlus || m_searchAlgorithm == ChartIncremental;
+ bool IsSyntax() const {
+ return m_searchAlgorithm == CYKPlus ||
+ m_searchAlgorithm == ChartIncremental ||
+ m_searchAlgorithm == SyntaxS2T ||
+ m_searchAlgorithm == SyntaxT2S ||
+ m_searchAlgorithm == SyntaxT2S_SCFG ||
+ m_searchAlgorithm == SyntaxF2S;
}
const ScoreComponentCollection& GetAllWeights() const {
diff --git a/moses/Syntax/F2S/HyperTreeLoader.cpp b/moses/Syntax/F2S/HyperTreeLoader.cpp
index 8dcadef55..54e34412a 100644
--- a/moses/Syntax/F2S/HyperTreeLoader.cpp
+++ b/moses/Syntax/F2S/HyperTreeLoader.cpp
@@ -1,8 +1,9 @@
#include "HyperTreeLoader.h"
#include <sys/stat.h>
-#include <stdlib.h>
+#include <cmath>
+#include <stdlib.h>
#include <fstream>
#include <string>
#include <iterator>
@@ -93,7 +94,7 @@ bool HyperTreeLoader::Load(const std::vector<FactorType> &input,
for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
int processed;
float score = converter.StringToFloat(s->data(), s->length(), &processed);
- UTIL_THROW_IF2(isnan(score), "Bad score " << *s << " on line " << count);
+ UTIL_THROW_IF2(std::isnan(score), "Bad score " << *s << " on line " << count);
scoreVector.push_back(FloorScore(TransformScore(score)));
}
const std::size_t numScoreComponents = ff.GetNumScoreComponents();
diff --git a/moses/Syntax/T2S/RuleTrieLoader.cpp b/moses/Syntax/T2S/RuleTrieLoader.cpp
index 9feaefc94..248bf8140 100644
--- a/moses/Syntax/T2S/RuleTrieLoader.cpp
+++ b/moses/Syntax/T2S/RuleTrieLoader.cpp
@@ -1,8 +1,9 @@
#include "RuleTrieLoader.h"
#include <sys/stat.h>
-#include <stdlib.h>
+#include <cmath>
+#include <cstdlib>
#include <fstream>
#include <string>
#include <iterator>
@@ -88,7 +89,7 @@ bool RuleTrieLoader::Load(const std::vector<FactorType> &input,
for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
int processed;
float score = converter.StringToFloat(s->data(), s->length(), &processed);
- UTIL_THROW_IF2(isnan(score), "Bad score " << *s << " on line " << count);
+ UTIL_THROW_IF2(std::isnan(score), "Bad score " << *s << " on line " << count);
scoreVector.push_back(FloorScore(TransformScore(score)));
}
const std::size_t numScoreComponents = ff.GetNumScoreComponents();
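
The isnan -> std::isnan change in both loaders (with <cmath> now included) is the portable spelling: plain isnan may be a C macro or missing from the global namespace depending on the libc, while the C++ overload set lives in std. Minimal check:

    #include <cassert>
    #include <cmath>

    int main() {
      volatile float zero = 0.0f;
      float bad = zero / zero;   // produces a quiet NaN
      assert(std::isnan(bad));
      assert(!std::isnan(1.0f));
      return 0;
    }
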
diff --git a/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp b/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp
index ad7591a7b..5f42bdeef 100644
--- a/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp
+++ b/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp
@@ -1,5 +1,6 @@
-// $Id$
+// -*- c++ -*-
// vim:tabstop=2
+// $Id$
/***********************************************************************
Moses - factored phrase-based language decoder
Copyright (C) 2006 University of Edinburgh
@@ -24,151 +25,155 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
namespace Moses
{
-LexicalReorderingTableCompact::LexicalReorderingTableCompact(
- const std::string& filePath,
- const std::vector<FactorType>& f_factors,
- const std::vector<FactorType>& e_factors,
- const std::vector<FactorType>& c_factors)
- : LexicalReorderingTable(f_factors, e_factors, c_factors),
- m_inMemory(StaticData::Instance().UseMinlexrInMemory()),
- m_numScoreComponent(6), m_multipleScoreTrees(true),
- m_hash(10, 16), m_scoreTrees(1)
-{
- Load(filePath);
-}
-
-LexicalReorderingTableCompact::LexicalReorderingTableCompact(
- const std::vector<FactorType>& f_factors,
- const std::vector<FactorType>& e_factors,
- const std::vector<FactorType>& c_factors)
- : LexicalReorderingTable(f_factors, e_factors, c_factors),
- m_inMemory(StaticData::Instance().UseMinlexrInMemory()),
- m_numScoreComponent(6), m_multipleScoreTrees(true),
- m_hash(10, 16), m_scoreTrees(1)
-{ }
-
-LexicalReorderingTableCompact::~LexicalReorderingTableCompact()
-{
- for(size_t i = 0; i < m_scoreTrees.size(); i++)
- delete m_scoreTrees[i];
-}
-
-std::vector<float> LexicalReorderingTableCompact::GetScore(const Phrase& f,
- const Phrase& e,
- const Phrase& c)
-{
- std::string key;
- Scores scores;
-
- if(0 == c.GetSize())
- key = MakeKey(f, e, c);
- else
- for(size_t i = 0; i <= c.GetSize(); ++i) {
- Phrase sub_c(c.GetSubString(WordsRange(i,c.GetSize()-1)));
- key = MakeKey(f,e,sub_c);
- }
-
- size_t index = m_hash[key];
- if(m_hash.GetSize() != index) {
- std::string scoresString;
- if(m_inMemory)
- scoresString = m_scoresMemory[index];
- else
- scoresString = m_scoresMapped[index];
-
- BitWrapper<> bitStream(scoresString);
- for(size_t i = 0; i < m_numScoreComponent; i++)
- scores.push_back(m_scoreTrees[m_multipleScoreTrees ? i : 0]->Read(bitStream));
-
- return scores;
+ LexicalReorderingTableCompact::
+ LexicalReorderingTableCompact(const std::string& filePath,
+ const std::vector<FactorType>& f_factors,
+ const std::vector<FactorType>& e_factors,
+ const std::vector<FactorType>& c_factors)
+ : LexicalReorderingTable(f_factors, e_factors, c_factors)
+ , m_inMemory(StaticData::Instance().UseMinlexrInMemory())
+ , m_numScoreComponent(6)
+ , m_multipleScoreTrees(true)
+ , m_hash(10, 16)
+ , m_scoreTrees(1)
+ {
+ Load(filePath);
}
- return Scores();
-}
-
-std::string LexicalReorderingTableCompact::MakeKey(const Phrase& f,
- const Phrase& e,
- const Phrase& c) const
-{
- return MakeKey(Trim(f.GetStringRep(m_FactorsF)),
- Trim(e.GetStringRep(m_FactorsE)),
- Trim(c.GetStringRep(m_FactorsC)));
-}
+ LexicalReorderingTableCompact::
+ LexicalReorderingTableCompact(const std::vector<FactorType>& f_factors,
+ const std::vector<FactorType>& e_factors,
+ const std::vector<FactorType>& c_factors)
+ : LexicalReorderingTable(f_factors, e_factors, c_factors)
+ , m_inMemory(StaticData::Instance().UseMinlexrInMemory())
+ , m_numScoreComponent(6)
+ , m_multipleScoreTrees(true)
+ , m_hash(10, 16)
+ , m_scoreTrees(1)
+ { }
+
+ LexicalReorderingTableCompact::
+ ~LexicalReorderingTableCompact()
+ {
+ for(size_t i = 0; i < m_scoreTrees.size(); i++)
+ delete m_scoreTrees[i];
+ }
-std::string LexicalReorderingTableCompact::MakeKey(const std::string& f,
- const std::string& e,
- const std::string& c) const
-{
- std::string key;
- if(!f.empty()) {
- key += f;
+ std::vector<float>
+ LexicalReorderingTableCompact::
+ GetScore(const Phrase& f, const Phrase& e, const Phrase& c)
+ {
+ std::string key;
+ Scores scores;
+
+ if(0 == c.GetSize())
+ key = MakeKey(f, e, c);
+ else
+ for(size_t i = 0; i <= c.GetSize(); ++i)
+ {
+ Phrase sub_c(c.GetSubString(WordsRange(i,c.GetSize()-1)));
+ key = MakeKey(f,e,sub_c);
+ }
+
+ size_t index = m_hash[key];
+ if(m_hash.GetSize() != index)
+ {
+ std::string scoresString;
+ if(m_inMemory)
+ scoresString = m_scoresMemory[index];
+ else
+ scoresString = m_scoresMapped[index];
+
+ BitWrapper<> bitStream(scoresString);
+ for(size_t i = 0; i < m_numScoreComponent; i++)
+ scores.push_back(m_scoreTrees[m_multipleScoreTrees ? i : 0]->Read(bitStream));
+
+ return scores;
+ }
+
+ return Scores();
}
- if(!m_FactorsE.empty()) {
- if(!key.empty()) {
- key += " ||| ";
- }
- key += e;
+
+ std::string
+ LexicalReorderingTableCompact::
+ MakeKey(const Phrase& f,
+ const Phrase& e,
+ const Phrase& c) const
+ {
+ return MakeKey(Trim(f.GetStringRep(m_FactorsF)),
+ Trim(e.GetStringRep(m_FactorsE)),
+ Trim(c.GetStringRep(m_FactorsC)));
}
- if(!m_FactorsC.empty()) {
- if(!key.empty()) {
- key += " ||| ";
- }
- key += c;
+
+ std::string
+ LexicalReorderingTableCompact::
+ MakeKey(const std::string& f,
+ const std::string& e,
+ const std::string& c) const
+ {
+ std::string key;
+ if(!f.empty()) key += f;
+ if(!m_FactorsE.empty()) { if(!key.empty()) key += " ||| "; key += e; }
+ if(!m_FactorsC.empty()) { if(!key.empty()) key += " ||| "; key += c; }
+ key += " ||| ";
+ return key;
}
- key += " ||| ";
- return key;
-}
-LexicalReorderingTable* LexicalReorderingTableCompact::CheckAndLoad(
- const std::string& filePath,
- const std::vector<FactorType>& f_factors,
- const std::vector<FactorType>& e_factors,
- const std::vector<FactorType>& c_factors)
-{
+ LexicalReorderingTable*
+ LexicalReorderingTableCompact::
+ CheckAndLoad
+ (const std::string& filePath,
+ const std::vector<FactorType>& f_factors,
+ const std::vector<FactorType>& e_factors,
+ const std::vector<FactorType>& c_factors)
+ {
#ifdef HAVE_CMPH
- std::string minlexr = ".minlexr";
- // file name is specified without suffix
- if(FileExists(filePath + minlexr)) {
- //there exists a compact binary version use that
- VERBOSE(2,"Using compact lexical reordering table" << std::endl);
- return new LexicalReorderingTableCompact(filePath + minlexr, f_factors, e_factors, c_factors);
- }
- // file name is specified with suffix
- if(filePath.substr(filePath.length() - minlexr.length(), minlexr.length()) == minlexr
- && FileExists(filePath)) {
- //there exists a compact binary version use that
- VERBOSE(2,"Using compact lexical reordering table" << std::endl);
- return new LexicalReorderingTableCompact(filePath, f_factors, e_factors, c_factors);
- }
+ std::string minlexr = ".minlexr";
+ // file name is specified without suffix
+ if(FileExists(filePath + minlexr)) {
+ //there exists a compact binary version use that
+ VERBOSE(2,"Using compact lexical reordering table" << std::endl);
+ return new LexicalReorderingTableCompact(filePath + minlexr, f_factors, e_factors, c_factors);
+ }
+ // file name is specified with suffix
+ if(filePath.substr(filePath.length() - minlexr.length(), minlexr.length()) == minlexr
+ && FileExists(filePath)) {
+ //there exists a compact binary version use that
+ VERBOSE(2,"Using compact lexical reordering table" << std::endl);
+ return new LexicalReorderingTableCompact(filePath, f_factors, e_factors, c_factors);
+ }
#endif
- return 0;
-}
-
-void LexicalReorderingTableCompact::Load(std::string filePath)
-{
- std::FILE* pFile = std::fopen(filePath.c_str(), "r");
- if(m_inMemory)
- m_hash.Load(pFile);
- else
- m_hash.LoadIndex(pFile);
-
- size_t read = 0;
- read += std::fread(&m_numScoreComponent, sizeof(m_numScoreComponent), 1, pFile);
- read += std::fread(&m_multipleScoreTrees, sizeof(m_multipleScoreTrees), 1, pFile);
-
- if(m_multipleScoreTrees) {
- m_scoreTrees.resize(m_numScoreComponent);
- for(size_t i = 0; i < m_numScoreComponent; i++)
- m_scoreTrees[i] = new CanonicalHuffman<float>(pFile);
- } else {
- m_scoreTrees.resize(1);
- m_scoreTrees[0] = new CanonicalHuffman<float>(pFile);
+ return 0;
}
- if(m_inMemory)
- m_scoresMemory.load(pFile, false);
- else
- m_scoresMapped.load(pFile, true);
-}
-
+ void
+ LexicalReorderingTableCompact::
+ Load(std::string filePath)
+ {
+ std::FILE* pFile = std::fopen(filePath.c_str(), "r");
+ if(m_inMemory)
+ m_hash.Load(pFile);
+ else
+ m_hash.LoadIndex(pFile);
+
+ size_t read = 0;
+ read += std::fread(&m_numScoreComponent, sizeof(m_numScoreComponent), 1, pFile);
+ read += std::fread(&m_multipleScoreTrees,
+ sizeof(m_multipleScoreTrees), 1, pFile);
+
+ if(m_multipleScoreTrees) {
+ m_scoreTrees.resize(m_numScoreComponent);
+ for(size_t i = 0; i < m_numScoreComponent; i++)
+ m_scoreTrees[i] = new CanonicalHuffman<float>(pFile);
+ } else {
+ m_scoreTrees.resize(1);
+ m_scoreTrees[0] = new CanonicalHuffman<float>(pFile);
+ }
+
+ if(m_inMemory)
+ m_scoresMemory.load(pFile, false);
+ else
+ m_scoresMapped.load(pFile, true);
+ }
+
}
diff --git a/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.h b/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.h
index 755398788..d54dd5273 100644
--- a/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.h
+++ b/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.h
@@ -36,49 +36,53 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
namespace Moses
{
-class LexicalReorderingTableCompact: public LexicalReorderingTable
-{
-private:
- bool m_inMemory;
-
- size_t m_numScoreComponent;
- bool m_multipleScoreTrees;
-
- BlockHashIndex m_hash;
-
- typedef CanonicalHuffman<float> ScoreTree;
- std::vector<ScoreTree*> m_scoreTrees;
-
- StringVector<unsigned char, unsigned long, MmapAllocator> m_scoresMapped;
- StringVector<unsigned char, unsigned long, std::allocator> m_scoresMemory;
-
- std::string MakeKey(const Phrase& f, const Phrase& e, const Phrase& c) const;
- std::string MakeKey(const std::string& f, const std::string& e, const std::string& c) const;
-
-public:
- LexicalReorderingTableCompact(
- const std::string& filePath,
- const std::vector<FactorType>& f_factors,
- const std::vector<FactorType>& e_factors,
- const std::vector<FactorType>& c_factors);
-
- LexicalReorderingTableCompact(
- const std::vector<FactorType>& f_factors,
- const std::vector<FactorType>& e_factors,
- const std::vector<FactorType>& c_factors);
-
- virtual ~LexicalReorderingTableCompact();
-
- virtual std::vector<float> GetScore(const Phrase& f, const Phrase& e, const Phrase& c);
-
- static LexicalReorderingTable* CheckAndLoad(
- const std::string& filePath,
- const std::vector<FactorType>& f_factors,
- const std::vector<FactorType>& e_factors,
- const std::vector<FactorType>& c_factors);
-
- void Load(std::string filePath);
-};
+ class LexicalReorderingTableCompact:
+ public LexicalReorderingTable
+ {
+ private:
+ bool m_inMemory;
+
+ size_t m_numScoreComponent;
+ bool m_multipleScoreTrees;
+
+ BlockHashIndex m_hash;
+
+ typedef CanonicalHuffman<float> ScoreTree;
+ std::vector<ScoreTree*> m_scoreTrees;
+
+ StringVector<unsigned char, unsigned long, MmapAllocator> m_scoresMapped;
+ StringVector<unsigned char, unsigned long, std::allocator> m_scoresMemory;
+
+ std::string MakeKey(const Phrase& f, const Phrase& e, const Phrase& c) const;
+ std::string MakeKey(const std::string& f, const std::string& e, const std::string& c) const;
+
+ public:
+ LexicalReorderingTableCompact(const std::string& filePath,
+ const std::vector<FactorType>& f_factors,
+ const std::vector<FactorType>& e_factors,
+ const std::vector<FactorType>& c_factors);
+
+ LexicalReorderingTableCompact(const std::vector<FactorType>& f_factors,
+ const std::vector<FactorType>& e_factors,
+ const std::vector<FactorType>& c_factors);
+
+ virtual
+ ~LexicalReorderingTableCompact();
+
+ virtual
+ std::vector<float>
+ GetScore(const Phrase& f, const Phrase& e, const Phrase& c);
+
+ static
+ LexicalReorderingTable*
+ CheckAndLoad(const std::string& filePath,
+ const std::vector<FactorType>& f_factors,
+ const std::vector<FactorType>& e_factors,
+ const std::vector<FactorType>& c_factors);
+
+ void
+ Load(std::string filePath);
+ };
}
diff --git a/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp b/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp
index 90d5575a1..9c3f6b513 100644
--- a/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp
+++ b/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp
@@ -25,6 +25,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <queue>
#include <algorithm>
#include <sys/stat.h>
+#include <boost/algorithm/string/predicate.hpp>
#include "PhraseDictionaryCompact.h"
#include "moses/FactorCollection.h"
@@ -37,6 +38,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "util/exception.hh"
using namespace std;
+using namespace boost::algorithm;
namespace Moses
{
@@ -63,18 +65,9 @@ void PhraseDictionaryCompact::Load()
std::string tFilePath = m_filePath;
std::string suffix = ".minphr";
- if(tFilePath.substr(tFilePath.length() - suffix.length(), suffix.length()) == suffix) {
- if(!FileExists(tFilePath)) {
- throw runtime_error("Error: File " + tFilePath + " does not exit.");
- exit(1);
- }
- } else {
- if(FileExists(tFilePath + suffix)) {
- tFilePath += suffix;
- } else {
- throw runtime_error("Error: File " + tFilePath + ".minphr does not exit.");
- }
- }
+ if (!ends_with(tFilePath, suffix)) tFilePath += suffix;
+ if (!FileExists(tFilePath))
+ throw runtime_error("Error: File " + tFilePath + " does not exist.");
m_phraseDecoder = new PhraseDecoder(*this, &m_input, &m_output,
m_numScoreComponents, &m_weight);
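
The rewritten load path reduces the old nested checks to two steps: append the ".minphr" suffix when it is missing, then verify the file exists once. A minimal standalone sketch of that logic; ResolveMinphrPath is a hypothetical name, and FileExists stands in for the helper Moses already uses above:

    #include <stdexcept>
    #include <string>
    #include <boost/algorithm/string/predicate.hpp>

    bool FileExists(const std::string& path); // provided by moses/Util.h

    // Resolve a compact phrase-table path: append ".minphr" if missing,
    // then fail early if the file is not there.
    std::string ResolveMinphrPath(std::string path) {
      const std::string suffix = ".minphr";
      if (!boost::algorithm::ends_with(path, suffix))
        path += suffix;
      if (!FileExists(path))
        throw std::runtime_error("Error: File " + path + " does not exist.");
      return path;
    }
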
diff --git a/moses/TranslationModel/RuleTable/LoaderStandard.cpp b/moses/TranslationModel/RuleTable/LoaderStandard.cpp
index 410a53e6f..95463feea 100644
--- a/moses/TranslationModel/RuleTable/LoaderStandard.cpp
+++ b/moses/TranslationModel/RuleTable/LoaderStandard.cpp
@@ -26,6 +26,7 @@
#include <iostream>
#include <sys/stat.h>
#include <stdlib.h>
+#include <boost/algorithm/string/predicate.hpp>
#include "Trie.h"
#include "moses/FactorCollection.h"
#include "moses/Word.h"
@@ -42,6 +43,7 @@
#include "util/exception.hh"
using namespace std;
+using namespace boost::algorithm;
namespace Moses
{
@@ -67,8 +69,7 @@ void ReformatHieroRule(int sourceTarget, string &phrase, map<size_t, pair<size_t
for (size_t i = 0; i < toks.size(); ++i) {
string &tok = toks[i];
- size_t tokLen = tok.size();
- if (tok.substr(0, 1) == "[" && tok.substr(tokLen - 1, 1) == "]") {
+ if (starts_with(tok, "[") && ends_with(tok, "]")) {
// no-term
vector<string> split = Tokenize(tok, ",");
UTIL_THROW_IF2(split.size() != 2,
diff --git a/moses/TranslationModel/UG/generic/file_io/ug_stream.cpp b/moses/TranslationModel/UG/generic/file_io/ug_stream.cpp
index 4e0c7cbc1..073b64dfc 100644
--- a/moses/TranslationModel/UG/generic/file_io/ug_stream.cpp
+++ b/moses/TranslationModel/UG/generic/file_io/ug_stream.cpp
@@ -2,11 +2,13 @@
// (c) 2006,2007,2008 Ulrich Germann
// makes opening files a little more convenient
+#include <boost/algorithm/string/predicate.hpp>
#include "ug_stream.h"
namespace ugdiss
{
using namespace std;
+ using namespace boost::algorithm;
using namespace boost::iostreams;
filtering_istream*
@@ -28,11 +30,11 @@ namespace ugdiss
void
open_input_stream(string fname, filtering_istream& in)
{
- if (fname.size()>3 && fname.substr(fname.size()-3,3)==".gz")
+ if (ends_with(fname, ".gz"))
{
in.push(gzip_decompressor());
}
- else if (fname.size() > 4 && fname.substr(fname.size()-4,4)==".bz2")
+ else if (ends_with(fname, ".bz2"))
{
in.push(bzip2_decompressor());
}
@@ -42,13 +44,11 @@ namespace ugdiss
void
open_output_stream(string fname, filtering_ostream& out)
{
- if ((fname.size() > 3 && fname.substr(fname.size()-3,3)==".gz") ||
- (fname.size() > 4 && fname.substr(fname.size()-4,4)==".gz_"))
+ if (ends_with(fname, ".gz") || ends_with(fname, ".gz_"))
{
out.push(gzip_compressor());
}
- else if ((fname.size() > 4 && fname.substr(fname.size()-4,4)==".bz2") ||
- (fname.size() > 5 && fname.substr(fname.size()-5,5)==".bz2_"))
+ else if (ends_with(fname, ".bz2") || ends_with(fname, ".bz2_"))
{
out.push(bzip2_compressor());
}
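
The same ends_with idiom selects the decompressor here; filters are pushed onto the boost::iostreams chain in order, ahead of the file source. A self-contained sketch under those assumptions (open_compressed is a hypothetical name, not part of ug_stream.h):

    #include <string>
    #include <boost/algorithm/string/predicate.hpp>
    #include <boost/iostreams/device/file.hpp>
    #include <boost/iostreams/filter/bzip2.hpp>
    #include <boost/iostreams/filter/gzip.hpp>
    #include <boost/iostreams/filtering_stream.hpp>

    // Push a decompressor matching the file extension, then the file itself;
    // filters run in the order they are pushed.
    void open_compressed(std::string const& fname,
                         boost::iostreams::filtering_istream& in) {
      namespace io = boost::iostreams;
      using boost::algorithm::ends_with;
      if (ends_with(fname, ".gz"))
        in.push(io::gzip_decompressor());
      else if (ends_with(fname, ".bz2"))
        in.push(io::bzip2_decompressor());
      in.push(io::file_source(fname));
    }
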
diff --git a/moses/TranslationModel/UG/mm/mtt-build.cc b/moses/TranslationModel/UG/mm/mtt-build.cc
index 49fd7f6c2..f49895ebf 100644
--- a/moses/TranslationModel/UG/mm/mtt-build.cc
+++ b/moses/TranslationModel/UG/mm/mtt-build.cc
@@ -4,6 +4,7 @@
// recognized based on the number of fields per line) into memory-mapped
// format. (c) 2007-2013 Ulrich Germann
+#include <boost/algorithm/string/predicate.hpp>
#include <boost/program_options.hpp>
#include <boost/program_options/options_description.hpp>
#include <boost/program_options/parsers.hpp>
@@ -32,6 +33,7 @@ using namespace std;
using namespace ugdiss;
using namespace Moses;
using namespace boost;
+using namespace boost::algorithm;
namespace po=boost::program_options;
int with_pfas;
@@ -200,7 +202,7 @@ process_tagged_input(ostream& out,
vector<string> w; string f; istringstream buf(line);
while (buf>>f) w.push_back(f);
- if (w.size() == 0 || (w[0].size() >= 4 && w[0].substr(0,4) == "SID="))
+ if (w.size() == 0 || starts_with(w[0], "SID="))
new_sent = true;
else if (w.size() == 1 && w[0] == "<P>")
diff --git a/moses/TranslationModel/UG/spe-check-coverage.cc b/moses/TranslationModel/UG/spe-check-coverage.cc
index 039b4cd37..4ab2d749a 100644
--- a/moses/TranslationModel/UG/spe-check-coverage.cc
+++ b/moses/TranslationModel/UG/spe-check-coverage.cc
@@ -1,6 +1,7 @@
#include "mmsapt.h"
#include "moses/TranslationModel/PhraseDictionaryTreeAdaptor.h"
#include "moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h"
+#include <boost/algorithm/string/predicate.hpp>
#include <boost/foreach.hpp>
#include <boost/format.hpp>
#include <boost/tokenizer.hpp>
@@ -12,6 +13,7 @@ using namespace Moses;
using namespace bitext;
using namespace std;
using namespace boost;
+using namespace boost::algorithm;
vector<FactorType> fo(1,FactorType(0));
@@ -111,7 +113,7 @@ int main(int argc, char* argv[])
int dynprovidx = -1;
for (size_t i = 0; i < fname.size(); ++i)
{
- if (fname[i].substr(0,7) == "prov-1.")
+ if (starts_with(fname[i], "prov-1."))
dynprovidx = i;
}
cout << endl;
@@ -189,8 +191,8 @@ int main(int argc, char* argv[])
size_t j = x-idx.first;
float f = (mmsapt && mmsapt->isLogVal(j)) ? exp(scores[x]) : scores[x];
string fmt = (mmsapt && mmsapt->isInteger(j)) ? "%10d" : "%10.8f";
- if (fname[j].substr(0,3) == "lex") fmt = "%10.3e";
- if (fname[j].substr(0,7) == "prov-1.")
+ if (starts_with(fname[j], "lex")) fmt = "%10.3e";
+ else if (starts_with(fname[j], "prov-1."))
{
f = round(f/(1-f));
fmt = "%10d";
diff --git a/moses/TranslationOption.h b/moses/TranslationOption.h
index b5a50fc32..a5effef88 100644
--- a/moses/TranslationOption.h
+++ b/moses/TranslationOption.h
@@ -1,3 +1,4 @@
+// -*- c++ -*-
// $Id$
/***********************************************************************
@@ -74,6 +75,16 @@ protected:
_ScoreCacheMap m_lexReorderingScores;
public:
+ struct Better
+ {
+ bool operator()(TranslationOption const& a, TranslationOption const& b) const
+ { return a.GetFutureScore() > b.GetFutureScore(); }
+
+ bool operator()(TranslationOption const* a, TranslationOption const* b) const
+ { return a->GetFutureScore() > b->GetFutureScore(); }
+ };
+
+
explicit TranslationOption(); // For initial hypo that does translate nothing
/** constructor. Used by initial translation step */
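
The new nested Better functor replaces the free function CompareTranslationOption (removed further below) and provides both pointer and reference overloads, ordering by descending future score, so one comparator serves std::sort and std::nth_element alike. A usage sketch; KeepNBest is a hypothetical helper, and real Moses code must delete the options it discards:

    #include <algorithm>
    #include <vector>

    void KeepNBest(std::vector<TranslationOption*>& opts, size_t n) {
      TranslationOption::Better better; // descending future score
      if (opts.size() > n) {
        // move the n best options to the front, then drop the tail
        std::nth_element(opts.begin(), opts.begin() + n, opts.end(), better);
        opts.resize(n); // real code must delete the discarded options first
      }
      std::sort(opts.begin(), opts.end(), better);
    }
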
diff --git a/moses/TranslationOptionCollection.cpp b/moses/TranslationOptionCollection.cpp
index 212b346d0..aa65cb320 100644
--- a/moses/TranslationOptionCollection.cpp
+++ b/moses/TranslationOptionCollection.cpp
@@ -40,723 +40,660 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "moses/FF/InputFeature.h"
#include "util/exception.hh"
+#include <boost/foreach.hpp>
using namespace std;
namespace Moses
{
-/** helper for pruning */
-bool CompareTranslationOption(const TranslationOption *a, const TranslationOption *b)
-{
- return a->GetFutureScore() > b->GetFutureScore();
-}
-
-/** constructor; since translation options are indexed by coverage span, the corresponding data structure is initialized here
- * This fn should be called by inherited classes
-*/
-TranslationOptionCollection::TranslationOptionCollection(
- InputType const& src, size_t maxNoTransOptPerCoverage, float translationOptionThreshold)
- : m_source(src)
- ,m_futureScore(src.GetSize())
- ,m_maxNoTransOptPerCoverage(maxNoTransOptPerCoverage)
- ,m_translationOptionThreshold(translationOptionThreshold)
-{
- // create 2-d vector
- size_t size = src.GetSize();
- for (size_t startPos = 0 ; startPos < size ; ++startPos) {
- m_collection.push_back( vector< TranslationOptionList >() );
-
- size_t maxSize = size - startPos;
- size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();
- maxSize = std::min(maxSize, maxSizePhrase);
+ /** helper for pruning */
+ // bool CompareTranslationOption(const TranslationOption *a, const TranslationOption *b)
+ // {
+ // return a->GetFutureScore() > b->GetFutureScore();
+ // }
+
+ /** constructor; since translation options are indexed by coverage span, the
+ * corresponding data structure is initialized here. This function should
+ * be called by inherited classes. */
+ TranslationOptionCollection::
+ TranslationOptionCollection(InputType const& src,
+ size_t maxNoTransOptPerCoverage,
+ float translationOptionThreshold)
+ : m_source(src)
+ , m_futureScore(src.GetSize())
+ , m_maxNoTransOptPerCoverage(maxNoTransOptPerCoverage)
+ , m_translationOptionThreshold(translationOptionThreshold)
+ {
+ // create 2-d vector
+ size_t size = src.GetSize();
+ for (size_t sPos = 0 ; sPos < size ; ++sPos) {
+ m_collection.push_back( vector< TranslationOptionList >() );
+
+ size_t maxSize = size - sPos;
+ size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();
+ maxSize = std::min(maxSize, maxSizePhrase);
- for (size_t endPos = 0 ; endPos < maxSize ; ++endPos) {
- m_collection[startPos].push_back( TranslationOptionList() );
+ for (size_t ePos = 0 ; ePos < maxSize ; ++ePos) {
+ m_collection[sPos].push_back( TranslationOptionList() );
+ }
}
}
-}
-/** destructor, clears out data structures */
-TranslationOptionCollection::~TranslationOptionCollection()
-{
- RemoveAllInColl(m_inputPathQueue);
-}
+ /** destructor, clears out data structures */
+ TranslationOptionCollection::
+ ~TranslationOptionCollection()
+ {
+ RemoveAllInColl(m_inputPathQueue);
+ }
-void TranslationOptionCollection::Prune()
-{
- // quit, if max size, threshold
- if (m_maxNoTransOptPerCoverage == 0 && m_translationOptionThreshold == -std::numeric_limits<float>::infinity())
- return;
-
- // bookkeeping for how many options used, pruned
- size_t total = 0;
- size_t totalPruned = 0;
-
- // loop through all spans
- size_t size = m_source.GetSize();
- for (size_t startPos = 0 ; startPos < size; ++startPos) {
- size_t maxSize = size - startPos;
- size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();
- maxSize = std::min(maxSize, maxSizePhrase);
-
- for (size_t endPos = startPos ; endPos < startPos + maxSize ; ++endPos) {
- // consider list for a span
- TranslationOptionList &fullList = GetTranslationOptionList(startPos, endPos);
- total += fullList.size();
-
- // size pruning
- if (m_maxNoTransOptPerCoverage > 0 &&
- fullList.size() > m_maxNoTransOptPerCoverage) {
- // sort in vector
- NTH_ELEMENT4(fullList.begin(), fullList.begin() + m_maxNoTransOptPerCoverage, fullList.end(), CompareTranslationOption);
- totalPruned += fullList.size() - m_maxNoTransOptPerCoverage;
-
- // delete the rest
- for (size_t i = m_maxNoTransOptPerCoverage ; i < fullList.size() ; ++i) {
- delete fullList.Get(i);
- }
- fullList.resize(m_maxNoTransOptPerCoverage);
+ void
+ TranslationOptionCollection::
+ Prune()
+ {
+ static float no_th = -std::numeric_limits<float>::infinity();
+
+ if (m_maxNoTransOptPerCoverage == 0 && m_translationOptionThreshold == no_th)
+ return;
+
+ // bookkeeping for how many options used, pruned
+ size_t total = 0;
+ size_t totalPruned = 0;
+
+ // loop through all spans
+ size_t size = m_source.GetSize();
+ for (size_t sPos = 0 ; sPos < size; ++sPos)
+ {
+ BOOST_FOREACH(TranslationOptionList& fullList, m_collection[sPos])
+ {
+ total += fullList.size();
+ totalPruned += fullList.SelectNBest(m_maxNoTransOptPerCoverage);
+ totalPruned += fullList.PruneByThreshold(m_translationOptionThreshold);
+ }
}
+
+ VERBOSE(2," Total translation options: " << total << std::endl
+ << "Total translation options pruned: " << totalPruned << std::endl);
+ }
- // threshold pruning
- if (fullList.size() > 1 && m_translationOptionThreshold != -std::numeric_limits<float>::infinity()) {
- // first, find the best score
- float bestScore = -std::numeric_limits<float>::infinity();
- for (size_t i=0; i < fullList.size() ; ++i) {
- if (fullList.Get(i)->GetFutureScore() > bestScore)
- bestScore = fullList.Get(i)->GetFutureScore();
- }
- //std::cerr << "best score for span " << startPos << "-" << endPos << " is " << bestScore << "\n";
- // then, remove items that are worse than best score + threshold
- for (size_t i=0; i < fullList.size() ; ++i) {
- if (fullList.Get(i)->GetFutureScore() < bestScore + m_translationOptionThreshold) {
- //std::cerr << "\tremoving item " << i << ", score " << fullList.Get(i)->GetFutureScore() << ": " << fullList.Get(i)->GetTargetPhrase() << "\n";
- delete fullList.Get(i);
- fullList.Remove(i);
- total--;
- totalPruned++;
- i--;
- }
- //else
- //{
- // std::cerr << "\tkeeping item " << i << ", score " << fullList.Get(i)->GetFutureScore() << ": " << fullList.Get(i)->GetTargetPhrase() << "\n";
- //}
- }
- } // end of threshold pruning
- }
- } // end of loop through all spans
-
- VERBOSE(2," Total translation options: " << total << std::endl
- << "Total translation options pruned: " << totalPruned << std::endl);
-}
-
-/** Force a creation of a translation option where there are none for a particular source position.
-* ie. where a source word has not been translated, create a translation option by
-* 1. not observing the table limits on phrase/generation tables
-* 2. using the handler ProcessUnknownWord()
-* Call this function once translation option collection has been filled with translation options
-*
-* This function calls for unknown words is complicated by the fact it must handle different input types.
-* The call stack is
-* Base::ProcessUnknownWord()
-* Inherited::ProcessUnknownWord(position)
-* Base::ProcessOneUnknownWord()
-*
-*/
-
-void TranslationOptionCollection::ProcessUnknownWord()
-{
- const vector<DecodeGraph*>& decodeGraphList = StaticData::Instance().GetDecodeGraphs();
- size_t size = m_source.GetSize();
- // try to translation for coverage with no trans by expanding table limit
- for (size_t graphInd = 0 ; graphInd < decodeGraphList.size() ; graphInd++) {
- const DecodeGraph &decodeGraph = *decodeGraphList[graphInd];
- for (size_t pos = 0 ; pos < size ; ++pos) {
- TranslationOptionList &fullList = GetTranslationOptionList(pos, pos);
- size_t numTransOpt = fullList.size();
- if (numTransOpt == 0) {
- CreateTranslationOptionsForRange(decodeGraph, pos, pos, false, graphInd);
+ /** Force the creation of a translation option where there are none for a
+ * particular source position, i.e., where a source word has not been
+ * translated, create a translation option by
+ * 1. not observing the table limits on phrase/generation tables
+ * 2. using the handler ProcessUnknownWord()
+ * Call this function once the translation option collection has been filled
+ * with translation options.
+ *
+ * The calls to this function for unknown words are complicated by the fact
+ * that it must handle different input types. The call stack is
+ * Base::ProcessUnknownWord()
+ * Inherited::ProcessUnknownWord(position)
+ * Base::ProcessOneUnknownWord()
+ *
+ */
+
+ void
+ TranslationOptionCollection::
+ ProcessUnknownWord()
+ {
+ const vector<DecodeGraph*>& decodeGraphList
+ = StaticData::Instance().GetDecodeGraphs();
+ size_t size = m_source.GetSize();
+ // try to find translations for spans with no options by expanding the table limit
+ for (size_t graphInd = 0 ; graphInd < decodeGraphList.size() ; graphInd++) {
+ const DecodeGraph &decodeGraph = *decodeGraphList[graphInd];
+ for (size_t pos = 0 ; pos < size ; ++pos) {
+ TranslationOptionList* fullList = GetTranslationOptionList(pos, pos);
+ // size_t numTransOpt = fullList.size();
+ if (!fullList || fullList->size() == 0) {
+ CreateTranslationOptionsForRange(decodeGraph, pos, pos, false, graphInd);
+ }
}
}
- }
- bool alwaysCreateDirectTranslationOption = StaticData::Instance().IsAlwaysCreateDirectTranslationOption();
- // create unknown words for 1 word coverage where we don't have any trans options
- for (size_t pos = 0 ; pos < size ; ++pos) {
- TranslationOptionList &fullList = GetTranslationOptionList(pos, pos);
- if (fullList.size() == 0 || alwaysCreateDirectTranslationOption)
- ProcessUnknownWord(pos);
- }
-}
-
-/** special handling of ONE unknown words. Either add temporarily add word to translation table,
- * or drop the translation.
- * This function should be called by the ProcessOneUnknownWord() in the inherited class
- * At the moment, this unknown word handler is a bit of a hack, if copies over each factor from source
- * to target word, or uses the 'UNK' factor.
- * Ideally, this function should be in a class which can be expanded upon, for example,
- * to create a morphologically aware handler.
- *
- * \param sourceWord the unknown word
- * \param sourcePos
- * \param length length covered by this word (may be > 1 for lattice input)
- * \param inputScores a set of scores associated with unknown word (input scores from latties/CNs)
- */
-void TranslationOptionCollection::ProcessOneUnknownWord(const InputPath &inputPath,
- size_t sourcePos,
- size_t length,
- const ScorePair *inputScores)
-{
- const StaticData &staticData = StaticData::Instance();
- const UnknownWordPenaltyProducer &unknownWordPenaltyProducer = UnknownWordPenaltyProducer::Instance();
- float unknownScore = FloorScore(TransformScore(0));
- const Word &sourceWord = inputPath.GetPhrase().GetWord(0);
-
- // hack. Once the OOV FF is a phrase table, get rid of this
- PhraseDictionary *firstPt = NULL;
- if (PhraseDictionary::GetColl().size() == 0) {
- firstPt = PhraseDictionary::GetColl()[0];
+ bool alwaysCreateDirectTranslationOption
+ = StaticData::Instance().IsAlwaysCreateDirectTranslationOption();
+ // create unknown-word options for single-word spans that have no translation options
+ for (size_t pos = 0 ; pos < size ; ++pos) {
+ TranslationOptionList* fullList = GetTranslationOptionList(pos, pos);
+ if (!fullList || fullList->size() == 0 || alwaysCreateDirectTranslationOption)
+ ProcessUnknownWord(pos);
+ }
}
- // unknown word, add as trans opt
- FactorCollection &factorCollection = FactorCollection::Instance();
-
- size_t isDigit = 0;
-
- const Factor *f = sourceWord[0]; // TODO hack. shouldn't know which factor is surface
- const StringPiece s = f->GetString();
- bool isEpsilon = (s=="" || s==EPSILON);
- if (StaticData::Instance().GetDropUnknown()) {
-
-
- isDigit = s.find_first_of("0123456789");
- if (isDigit == string::npos)
- isDigit = 0;
- else
- isDigit = 1;
- // modify the starting bitmap
- }
+ /** special handling of a single unknown word: either temporarily add the
+ * word to the translation table, or drop the translation. This function
+ * should be called by ProcessOneUnknownWord() in the inherited class. At
+ * the moment, this unknown word handler is a bit of a hack; it copies
+ * over each factor from source to target word, or uses the 'UNK' factor.
+ * Ideally, this function should be in a class which can be expanded
+ * upon, for example, to create a morphologically aware handler.
+ *
+ * \param sourceWord the unknown word
+ * \param sourcePos
+ * \param length length covered by this word (may be > 1 for lattice input)
+ * \param inputScores a set of scores associated with unknown word (input scores from lattices/CNs)
+ */
+ void
+ TranslationOptionCollection::
+ ProcessOneUnknownWord(const InputPath &inputPath, size_t sourcePos,
+ size_t length, const ScorePair *inputScores)
+ {
+ const StaticData &staticData = StaticData::Instance();
+ const UnknownWordPenaltyProducer&
+ unknownWordPenaltyProducer = UnknownWordPenaltyProducer::Instance();
+ float unknownScore = FloorScore(TransformScore(0));
+ const Word &sourceWord = inputPath.GetPhrase().GetWord(0);
+
+ // hack. Once the OOV FF is a phrase table, get rid of this
+ PhraseDictionary *firstPt = NULL;
+ if (PhraseDictionary::GetColl().size() != 0) {
+ firstPt = PhraseDictionary::GetColl()[0];
+ }
- TargetPhrase targetPhrase(firstPt);
+ // unknown word, add as trans opt
+ FactorCollection &factorCollection = FactorCollection::Instance();
- if (!(staticData.GetDropUnknown() || isEpsilon) || isDigit) {
- // add to dictionary
+ size_t isDigit = 0;
- Word &targetWord = targetPhrase.AddWord();
- targetWord.SetIsOOV(true);
+ const Factor *f = sourceWord[0]; // TODO hack. shouldn't know which factor is surface
+ const StringPiece s = f->GetString();
+ bool isEpsilon = (s=="" || s==EPSILON);
+ if (StaticData::Instance().GetDropUnknown()) {
- for (unsigned int currFactor = 0 ; currFactor < MAX_NUM_FACTORS ; currFactor++) {
- FactorType factorType = static_cast<FactorType>(currFactor);
- const Factor *sourceFactor = sourceWord[currFactor];
- if (sourceFactor == NULL)
- targetWord[factorType] = factorCollection.AddFactor(UNKNOWN_FACTOR);
+ isDigit = s.find_first_of("0123456789");
+ if (isDigit == string::npos)
+ isDigit = 0;
else
- targetWord[factorType] = factorCollection.AddFactor(sourceFactor->GetString());
+ isDigit = 1;
+ // modify the starting bitmap
}
- //create a one-to-one alignment between UNKNOWN_FACTOR and its verbatim translation
- targetPhrase.SetAlignmentInfo("0-0");
+ TargetPhrase targetPhrase(firstPt);
- } else {
- // drop source word. create blank trans opt
+ if (!(staticData.GetDropUnknown() || isEpsilon) || isDigit) {
+ // add to dictionary
- //targetPhrase.SetAlignment();
+ Word &targetWord = targetPhrase.AddWord();
+ targetWord.SetIsOOV(true);
- }
-
- targetPhrase.GetScoreBreakdown().Assign(&unknownWordPenaltyProducer, unknownScore);
+ for (unsigned int currFactor = 0 ; currFactor < MAX_NUM_FACTORS ; currFactor++) {
+ FactorType factorType = static_cast<FactorType>(currFactor);
- // source phrase
- const Phrase &sourcePhrase = inputPath.GetPhrase();
- m_unksrcs.push_back(&sourcePhrase);
- WordsRange range(sourcePos, sourcePos + length - 1);
+ const Factor *sourceFactor = sourceWord[currFactor];
+ if (sourceFactor == NULL)
+ targetWord[factorType] = factorCollection.AddFactor(UNKNOWN_FACTOR);
+ else
+ targetWord[factorType] = factorCollection.AddFactor(sourceFactor->GetString());
+ }
+ //create a one-to-one alignment between UNKNOWN_FACTOR and its verbatim translation
- targetPhrase.EvaluateInIsolation(sourcePhrase);
+ targetPhrase.SetAlignmentInfo("0-0");
+
+ }
+
+ targetPhrase.GetScoreBreakdown().Assign(&unknownWordPenaltyProducer, unknownScore);
- TranslationOption *transOpt = new TranslationOption(range, targetPhrase);
- transOpt->SetInputPath(inputPath);
- Add(transOpt);
+ // source phrase
+ const Phrase &sourcePhrase = inputPath.GetPhrase();
+ m_unksrcs.push_back(&sourcePhrase);
+ WordsRange range(sourcePos, sourcePos + length - 1);
+ targetPhrase.EvaluateInIsolation(sourcePhrase);
-}
+ TranslationOption *transOpt = new TranslationOption(range, targetPhrase);
+ transOpt->SetInputPath(inputPath);
+ Add(transOpt);
-/** compute future score matrix in a dynamic programming fashion.
- * This matrix used in search.
- * Call this function once translation option collection has been filled with translation options
-*/
-void TranslationOptionCollection::CalcFutureScore()
-{
- // setup the matrix (ignore lower triangle, set upper triangle to -inf
- size_t size = m_source.GetSize(); // the width of the matrix
- for(size_t row=0; row<size; row++) {
- for(size_t col=row; col<size; col++) {
- m_futureScore.SetScore(row, col, -numeric_limits<float>::infinity());
- }
}
- // walk all the translation options and record the cheapest option for each span
- for (size_t startPos = 0 ; startPos < size ; ++startPos) {
- size_t maxSize = m_source.GetSize() - startPos;
- size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();
- maxSize = std::min(maxSize, maxSizePhrase);
-
- for (size_t endPos = startPos ; endPos < startPos + maxSize ; ++endPos) {
- TranslationOptionList &transOptList = GetTranslationOptionList(startPos, endPos);
-
- TranslationOptionList::const_iterator iterTransOpt;
- for(iterTransOpt = transOptList.begin() ; iterTransOpt != transOptList.end() ; ++iterTransOpt) {
- const TranslationOption &transOpt = **iterTransOpt;
- float score = transOpt.GetFutureScore();
- if (score > m_futureScore.GetScore(startPos, endPos))
- m_futureScore.SetScore(startPos, endPos, score);
+ /** compute future score matrix in a dynamic programming fashion.
+ * This matrix is used in search.
+ * Call this function once the translation option collection has been
+ * filled with translation options.
+ */
+ void
+ TranslationOptionCollection::
+ CalcFutureScore()
+ {
+ // set up the matrix (ignore lower triangle, set upper triangle to -inf)
+ size_t size = m_source.GetSize(); // the width of the matrix
+
+ for(size_t row=0; row < size; row++) {
+ for(size_t col=row; col<size; col++) {
+ m_futureScore.SetScore(row, col, -numeric_limits<float>::infinity());
}
}
- }
- // now fill all the cells in the strictly upper triangle
- // there is no way to modify the diagonal now, in the case
- // where no translation option covers a single-word span,
- // we leave the +inf in the matrix
- // like in chart parsing we want each cell to contain the highest score
- // of the full-span trOpt or the sum of scores of joining two smaller spans
-
- for(size_t colstart = 1; colstart < size ; colstart++) {
- for(size_t diagshift = 0; diagshift < size-colstart ; diagshift++) {
- size_t startPos = diagshift;
- size_t endPos = colstart+diagshift;
- for(size_t joinAt = startPos; joinAt < endPos ; joinAt++) {
- float joinedScore = m_futureScore.GetScore(startPos, joinAt)
- + m_futureScore.GetScore(joinAt+1, endPos);
- /* // uncomment to see the cell filling scheme
- TRACE_ERR( "[" <<startPos<<","<<endPos<<"] <-? ["<<startPos<<","<<joinAt<<"]+["<<joinAt+1<<","<<endPos
- << "] (colstart: "<<colstart<<", diagshift: "<<diagshift<<")"<<endl);
- */
- if (joinedScore > m_futureScore.GetScore(startPos, endPos))
- m_futureScore.SetScore(startPos, endPos, joinedScore);
+ // walk all the translation options and record the cheapest option for each span
+ for (size_t sPos = 0 ; sPos < size ; ++sPos)
+ {
+ size_t ePos = sPos;
+ BOOST_FOREACH(TranslationOptionList& tol, m_collection[sPos])
+ {
+ TranslationOptionList::const_iterator toi;
+ for(toi = tol.begin() ; toi != tol.end() ; ++toi) {
+ const TranslationOption& to = **toi;
+ float score = to.GetFutureScore();
+ if (score > m_futureScore.GetScore(sPos, ePos))
+ m_futureScore.SetScore(sPos, ePos, score);
+ }
+ ++ePos;
+ }
}
- }
- }
-
- IFVERBOSE(3) {
- int total = 0;
- for(size_t row=0; row<size; row++) {
- size_t maxSize = size - row;
- size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();
- maxSize = std::min(maxSize, maxSizePhrase);
- for(size_t col=row; col<row+maxSize; col++) {
- int count = GetTranslationOptionList(row, col).size();
- TRACE_ERR( "translation options spanning from "
- << row <<" to "<< col <<" is "
- << count <<endl);
- total += count;
+ // now fill all the cells in the strictly upper triangle.
+ // there is no way to modify the diagonal anymore: in the case
+ // where no translation option covers a single-word span,
+ // we leave the -inf in the matrix.
+ // like in chart parsing, we want each cell to contain the highest score
+ // of the full-span trOpt or the sum of scores of joining two smaller spans
+
+ for(size_t colstart = 1; colstart < size ; colstart++) {
+ for(size_t diagshift = 0; diagshift < size-colstart ; diagshift++) {
+ size_t sPos = diagshift;
+ size_t ePos = colstart+diagshift;
+ for(size_t joinAt = sPos; joinAt < ePos ; joinAt++) {
+ float joinedScore = m_futureScore.GetScore(sPos, joinAt)
+ + m_futureScore.GetScore(joinAt+1, ePos);
+ // uncomment to see the cell filling scheme
+ // TRACE_ERR("[" << sPos << "," << ePos << "] <-? ["
+ // << sPos << "," << joinAt << "]+["
+ // << joinAt+1 << "," << ePos << "] (colstart: "
+ // << colstart << ", diagshift: " << diagshift << ")"
+ // << endl);
+
+ if (joinedScore > m_futureScore.GetScore(sPos, ePos))
+ m_futureScore.SetScore(sPos, ePos, joinedScore);
+ }
}
}
- TRACE_ERR( "translation options generated in total: "<< total << endl);
-
- for(size_t row=0; row<size; row++)
- for(size_t col=row; col<size; col++)
- TRACE_ERR( "future cost from "<< row <<" to "<< col <<" is "<< m_futureScore.GetScore(row, col) <<endl);
+
+ IFVERBOSE(3)
+ {
+ int total = 0;
+ for(size_t row = 0; row < size; row++)
+ {
+ size_t col = row;
+ BOOST_FOREACH(TranslationOptionList& tol, m_collection[row])
+ {
+ // size_t maxSize = size - row;
+ // size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();
+ // maxSize = std::min(maxSize, maxSizePhrase);
+
+ // for(size_t col=row; col<row+maxSize; col++) {
+ int count = tol.size();
+ TRACE_ERR( "translation options spanning from "
+ << row <<" to "<< col <<" is "
+ << count <<endl);
+ total += count;
+ ++col;
+ }
+ }
+ TRACE_ERR( "translation options generated in total: "<< total << endl);
+
+ for(size_t row=0; row<size; row++)
+ for(size_t col=row; col<size; col++)
+ TRACE_ERR( "future cost from "<< row <<" to "<< col <<" is "
+ << m_futureScore.GetScore(row, col) <<endl);
+ }
}
-}
-
-
-
-/** Create all possible translations from the phrase tables
- * for a particular input sentence. This implies applying all
- * translation and generation steps. Also computes future cost matrix.
- */
-void TranslationOptionCollection::CreateTranslationOptions()
-{
- // loop over all substrings of the source sentence, look them up
- // in the phraseDictionary (which is the- possibly filtered-- phrase
- // table loaded on initialization), generate TranslationOption objects
- // for all phrases
- // there may be multiple decoding graphs (factorizations of decoding)
- const vector <DecodeGraph*> &decodeGraphList = StaticData::Instance().GetDecodeGraphs();
- // length of the sentence
- const size_t size = m_source.GetSize();
- // loop over all decoding graphs, each generates translation options
- for (size_t graphInd = 0 ; graphInd < decodeGraphList.size() ; graphInd++) {
- if (decodeGraphList.size() > 1) {
- VERBOSE(3,"Creating translation options from decoding graph " << graphInd << endl);
- }
-
- const DecodeGraph &decodeGraph = *decodeGraphList[graphInd];
- size_t backoff = decodeGraph.GetBackoff();
- // generate phrases that start at startPos ...
-// VERBOSE(1,"TranslationOptionCollection::CreateTranslationOptions() graphInd:" << graphInd << endl);
- for (size_t startPos = 0 ; startPos < size; startPos++) {
-// VERBOSE(1,"TranslationOptionCollection::CreateTranslationOptions() startPos:" << startPos << endl);
- size_t maxSize = size - startPos; // don't go over end of sentence
- size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();
- maxSize = std::min(maxSize, maxSizePhrase);
-
- // ... and that end at endPos
- for (size_t endPos = startPos ; endPos < startPos + maxSize ; endPos++) {
-// VERBOSE(1,"TranslationOptionCollection::CreateTranslationOptions() endPos:" << endPos << endl);
- if (graphInd > 0 && // only skip subsequent graphs
- backoff != 0 && // use of backoff specified
- (endPos-startPos+1 > backoff || // size exceeds backoff limit or ...
- m_collection[startPos][endPos-startPos].size() > 0)) { // no phrases found so far
- VERBOSE(3,"No backoff to graph " << graphInd << " for span [" << startPos << ";" << endPos << "]" << endl);
- // do not create more options
-// VERBOSE(1,"TranslationOptionCollection::CreateTranslationOptions() continue:" << endl);
- continue;
- }
-
- // create translation options for that range
-// VERBOSE(1,"TranslationOptionCollection::CreateTranslationOptions() before CreateTranslationOptionsForRange" << endl);
- CreateTranslationOptionsForRange( decodeGraph, startPos, endPos, true, graphInd);
-// VERBOSE(1,"TranslationOptionCollection::CreateTranslationOptions() after CreateTranslationOptionsForRange" << endl);
+ /** Create all possible translations from the phrase tables
+ * for a particular input sentence. This implies applying all
+ * translation and generation steps. Also computes future cost matrix.
+ */
+ void
+ TranslationOptionCollection::
+ CreateTranslationOptions()
+ {
+ // loop over all substrings of the source sentence, look them up
+ // in the phraseDictionary (which is the- possibly filtered-- phrase
+ // table loaded on initialization), generate TranslationOption objects
+ // for all phrases
+
+ // there may be multiple decoding graphs (factorizations of decoding)
+ const vector <DecodeGraph*> &decodeGraphList
+ = StaticData::Instance().GetDecodeGraphs();
+
+ // length of the sentence
+ const size_t size = m_source.GetSize();
+
+ // loop over all decoding graphs, each generates translation options
+ for (size_t gidx = 0 ; gidx < decodeGraphList.size() ; gidx++)
+ {
+ if (decodeGraphList.size() > 1)
+ VERBOSE(3,"Creating translation options from decoding graph " << gidx << endl);
+
+ const DecodeGraph& dg = *decodeGraphList[gidx];
+ size_t backoff = dg.GetBackoff();
+ // iterate over spans
+ for (size_t sPos = 0 ; sPos < size; sPos++)
+ {
+ size_t maxSize = size - sPos; // don't go over end of sentence
+ size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();
+ maxSize = std::min(maxSize, maxSizePhrase);
+
+ for (size_t ePos = sPos ; ePos < sPos + maxSize ; ePos++)
+ {
+ if (gidx && backoff && // only skip subsequent graphs with a backoff limit
+ (ePos-sPos+1 > backoff || // skip if the span exceeds the backoff limit or ...
+ m_collection[sPos][ePos-sPos].size() > 0)) // ... options already exist
+ {
+ VERBOSE(3,"No backoff to graph " << gidx << " for span [" << sPos << ";" << ePos << "]" << endl);
+ continue;
+ }
+ CreateTranslationOptionsForRange(dg, sPos, ePos, true, gidx);
+ }
+ }
}
- }
+ VERBOSE(3,"Translation Option Collection\n " << *this << endl);
+ ProcessUnknownWord();
+ EvaluateWithSourceContext();
+ Prune();
+ Sort();
+ CalcFutureScore(); // future score matrix
+ CacheLexReordering(); // cache lexical reordering costs
}
- VERBOSE(3,"Translation Option Collection\n " << *this << endl);
-
- ProcessUnknownWord();
- EvaluateWithSourceContext();
-
- // Prune
- Prune();
-
- Sort();
-
- // future score matrix
- CalcFutureScore();
-
- // Cached lex reodering costs
- CacheLexReordering();
-}
-
-void TranslationOptionCollection::CreateTranslationOptionsForRange(
- const DecodeGraph &decodeGraph
- , size_t startPos
- , size_t endPos
- , bool adhereTableLimit
- , size_t graphInd
- , InputPath &inputPath)
-{
-//VERBOSE(1,"TranslationOptionCollection::CreateTranslationOptionsForRange() START startPos:" << startPos << " endPos:" << endPos << endl);
- if ((StaticData::Instance().GetXmlInputType() != XmlExclusive) || !HasXmlOptionsOverlappingRange(startPos,endPos)) {
-
- // partial trans opt stored in here
- PartialTranslOptColl* oldPtoc = new PartialTranslOptColl;
- size_t totalEarlyPruned = 0;
-
- // initial translation step
- list <const DecodeStep* >::const_iterator iterStep = decodeGraph.begin();
- const DecodeStep &decodeStep = **iterStep;
-
- const PhraseDictionary &phraseDictionary = *decodeStep.GetPhraseDictionaryFeature();
- const TargetPhraseCollection *targetPhrases = inputPath.GetTargetPhrases(phraseDictionary);
-
-// VERBOSE(1,"TranslationOptionCollection::CreateTranslationOptionsForRange() before ProcessInitialTranslation" << endl);
- static_cast<const DecodeStepTranslation&>(decodeStep).ProcessInitialTranslation
- (m_source, *oldPtoc
- , startPos, endPos, adhereTableLimit
- , inputPath, targetPhrases);
-// VERBOSE(1,"TranslationOptionCollection::CreateTranslationOptionsForRange() after ProcessInitialTranslation" << endl);
-
- SetInputScore(inputPath, *oldPtoc);
-
- // do rest of decode steps
- int indexStep = 0;
-
- for (++iterStep ; iterStep != decodeGraph.end() ; ++iterStep) {
-
- const DecodeStep *decodeStep = *iterStep;
- PartialTranslOptColl* newPtoc = new PartialTranslOptColl;
-
- // go thru each intermediate trans opt just created
- const vector<TranslationOption*>& partTransOptList = oldPtoc->GetList();
- vector<TranslationOption*>::const_iterator iterPartialTranslOpt;
- for (iterPartialTranslOpt = partTransOptList.begin() ; iterPartialTranslOpt != partTransOptList.end() ; ++iterPartialTranslOpt) {
- TranslationOption &inputPartialTranslOpt = **iterPartialTranslOpt;
-
- if (const DecodeStepTranslation *translateStep = dynamic_cast<const DecodeStepTranslation*>(decodeStep) ) {
- const PhraseDictionary &phraseDictionary = *translateStep->GetPhraseDictionaryFeature();
- const TargetPhraseCollection *targetPhrases = inputPath.GetTargetPhrases(phraseDictionary);
- translateStep->Process(inputPartialTranslOpt
- , *decodeStep
- , *newPtoc
- , this
- , adhereTableLimit
- , targetPhrases);
- } else {
- const DecodeStepGeneration *genStep = dynamic_cast<const DecodeStepGeneration*>(decodeStep);
- assert(genStep);
- genStep->Process(inputPartialTranslOpt
- , *decodeStep
- , *newPtoc
- , this
- , adhereTableLimit);
- }
+ bool
+ TranslationOptionCollection::
+ CreateTranslationOptionsForRange
+ (const DecodeGraph& dgraph, size_t sPos, size_t ePos,
+ bool adhereTableLimit, size_t gidx, InputPath &inputPath)
+ {
+ typedef DecodeStepTranslation Tstep;
+ typedef DecodeStepGeneration Gstep;
+ if ((StaticData::Instance().GetXmlInputType() != XmlExclusive)
+ || !HasXmlOptionsOverlappingRange(sPos,ePos))
+ {
+
+ // partial trans opt stored in here
+ PartialTranslOptColl* oldPtoc = new PartialTranslOptColl;
+ size_t totalEarlyPruned = 0;
+
+ // initial translation step
+ list <const DecodeStep* >::const_iterator d = dgraph.begin();
+ const DecodeStep &dstep = **d;
+
+ const PhraseDictionary &pdict = *dstep.GetPhraseDictionaryFeature();
+ const TargetPhraseCollection *targetPhrases = inputPath.GetTargetPhrases(pdict);
+
+ static_cast<const Tstep&>(dstep).ProcessInitialTranslation
+ (m_source, *oldPtoc, sPos, ePos, adhereTableLimit, inputPath, targetPhrases);
+
+ SetInputScore(inputPath, *oldPtoc);
+
+ // do rest of decode steps
+ int indexStep = 0;
+
+ for (++d ; d != dgraph.end() ; ++d)
+ {
+ const DecodeStep *dstep = *d;
+ PartialTranslOptColl* newPtoc = new PartialTranslOptColl;
+
+ // go thru each intermediate trans opt just created
+ const vector<TranslationOption*>& partTransOptList = oldPtoc->GetList();
+ vector<TranslationOption*>::const_iterator pto;
+ for (pto = partTransOptList.begin() ; pto != partTransOptList.end() ; ++pto)
+ {
+ TranslationOption &inputPartialTranslOpt = **pto;
+ if (const Tstep *tstep = dynamic_cast<const Tstep*>(dstep))
+ {
+ const PhraseDictionary &pdict = *tstep->GetPhraseDictionaryFeature();
+ const TargetPhraseCollection *targetPhrases = inputPath.GetTargetPhrases(pdict);
+ tstep->Process(inputPartialTranslOpt, *dstep, *newPtoc,
+ this, adhereTableLimit, targetPhrases);
+ }
+ else
+ {
+ const Gstep *genStep = dynamic_cast<const Gstep*>(dstep);
+ UTIL_THROW_IF2(!genStep, "Decode steps must be either "
+ << "Translation or Generation Steps!");
+ genStep->Process(inputPartialTranslOpt, *dstep, *newPtoc,
+ this, adhereTableLimit);
+ }
+ }
+
+ // the previous set of partial translation options is no longer needed
+ totalEarlyPruned += newPtoc->GetPrunedCount();
+ delete oldPtoc;
+ oldPtoc = newPtoc;
+
+ indexStep++;
+ } // for (++d
+
+ // add to fully formed translation option list
+ PartialTranslOptColl &lastPartialTranslOptColl = *oldPtoc;
+ const vector<TranslationOption*>& partTransOptList = lastPartialTranslOptColl.GetList();
+ vector<TranslationOption*>::const_iterator c;
+ for (c = partTransOptList.begin() ; c != partTransOptList.end() ; ++c)
+ {
+ TranslationOption *transOpt = *c;
+ if (StaticData::Instance().GetXmlInputType() != XmlConstraint
+ || !ViolatesXmlOptionsConstraint(sPos,ePos,transOpt))
+ {
+ Add(transOpt);
+ }
+ }
+ lastPartialTranslOptColl.DetachAll();
+ totalEarlyPruned += oldPtoc->GetPrunedCount();
+ delete oldPtoc;
+ // TRACE_ERR( "Early translation options pruned: " << totalEarlyPruned << endl);
+ } // if ((StaticData::Instance().GetXmlInputType() != XmlExclusive) || !HasXmlOptionsOverlappingRange(sPos,ePos))
+
+ if (gidx == 0 && StaticData::Instance().GetXmlInputType() != XmlPassThrough
+ && HasXmlOptionsOverlappingRange(sPos,ePos))
+ {
+ CreateXmlOptionsForRange(sPos, ePos);
}
- // last but 1 partial trans not required anymore
- totalEarlyPruned += newPtoc->GetPrunedCount();
- delete oldPtoc;
- oldPtoc = newPtoc;
-
- indexStep++;
- } // for (++iterStep
-
- // add to fully formed translation option list
- PartialTranslOptColl &lastPartialTranslOptColl = *oldPtoc;
- const vector<TranslationOption*>& partTransOptList = lastPartialTranslOptColl.GetList();
- vector<TranslationOption*>::const_iterator iterColl;
- for (iterColl = partTransOptList.begin() ; iterColl != partTransOptList.end() ; ++iterColl) {
- TranslationOption *transOpt = *iterColl;
- if (StaticData::Instance().GetXmlInputType() != XmlConstraint || !ViolatesXmlOptionsConstraint(startPos,endPos,transOpt)) {
- Add(transOpt);
- }
- }
-
- lastPartialTranslOptColl.DetachAll();
- totalEarlyPruned += oldPtoc->GetPrunedCount();
- delete oldPtoc;
- // TRACE_ERR( "Early translation options pruned: " << totalEarlyPruned << endl);
- } // if ((StaticData::Instance().GetXmlInputType() != XmlExclusive) || !HasXmlOptionsOverlappingRange(startPos,endPos))
-
-// VERBOSE(1,"TranslationOptionCollection::CreateTranslationOptionsForRange() before CreateXmlOptionsForRange" << endl);
- if (graphInd == 0 && StaticData::Instance().GetXmlInputType() != XmlPassThrough && HasXmlOptionsOverlappingRange(startPos,endPos)) {
- CreateXmlOptionsForRange(startPos, endPos);
+ return true;
}
-// VERBOSE(1,"TranslationOptionCollection::CreateTranslationOptionsForRange() after CreateXmlOptionsForRange" << endl);
-}
-void TranslationOptionCollection::SetInputScore(const InputPath &inputPath, PartialTranslOptColl &oldPtoc)
-{
- const ScorePair *inputScore = inputPath.GetInputScore();
- if (inputScore == NULL) {
- return;
+ void
+ TranslationOptionCollection::
+ SetInputScore(const InputPath &inputPath, PartialTranslOptColl &oldPtoc)
+ {
+ const ScorePair* inputScore = inputPath.GetInputScore();
+ if (inputScore == NULL) return;
+
+ const InputFeature &inputFeature = InputFeature::Instance();
+
+ const std::vector<TranslationOption*> &transOpts = oldPtoc.GetList();
+ for (size_t i = 0; i < transOpts.size(); ++i) {
+ TranslationOption &transOpt = *transOpts[i];
+
+ ScoreComponentCollection &scores = transOpt.GetScoreBreakdown();
+ scores.PlusEquals(&inputFeature, *inputScore);
+
+ }
}
- const InputFeature &inputFeature = InputFeature::Instance();
-
- const std::vector<TranslationOption*> &transOpts = oldPtoc.GetList();
- for (size_t i = 0; i < transOpts.size(); ++i) {
- TranslationOption &transOpt = *transOpts[i];
-
- ScoreComponentCollection &scores = transOpt.GetScoreBreakdown();
- scores.PlusEquals(&inputFeature, *inputScore);
-
+ void
+ TranslationOptionCollection::
+ EvaluateWithSourceContext()
+ {
+ const size_t size = m_source.GetSize();
+ for (size_t sPos = 0 ; sPos < size ; ++sPos)
+ {
+ BOOST_FOREACH(TranslationOptionList& tol, m_collection[sPos])
+ {
+ typedef TranslationOptionList::const_iterator to_iter;
+ for(to_iter i = tol.begin() ; i != tol.end() ; ++i)
+ (*i)->EvaluateWithSourceContext(m_source);
+ }
+ }
}
-}
-
-void TranslationOptionCollection::EvaluateWithSourceContext()
-{
- const size_t size = m_source.GetSize();
-
- for (size_t startPos = 0 ; startPos < size ; ++startPos) {
- size_t maxSize = m_source.GetSize() - startPos;
- size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();
- maxSize = std::min(maxSize, maxSizePhrase);
-
- for (size_t endPos = startPos ; endPos < startPos + maxSize ; ++endPos) {
- TranslationOptionList &transOptList = GetTranslationOptionList(startPos, endPos);
-
- TranslationOptionList::const_iterator iterTransOpt;
- for(iterTransOpt = transOptList.begin() ; iterTransOpt != transOptList.end() ; ++iterTransOpt) {
- TranslationOption &transOpt = **iterTransOpt;
- transOpt.EvaluateWithSourceContext(m_source);
+
+ void
+ TranslationOptionCollection::
+ Sort()
+ {
+ static TranslationOption::Better cmp;
+ size_t size = m_source.GetSize();
+ for (size_t sPos = 0 ; sPos < size; ++sPos)
+ {
+ BOOST_FOREACH(TranslationOptionList& tol, m_collection.at(sPos))
+ {
+ // cerr << sPos << ": " << tol.size() << " "
+ // << __FILE__ << ":" << __LINE__ << endl;
+ // size_t nulls=0;
+ // BOOST_FOREACH(TranslationOption const* t, tol)
+ // if (t == NULL) ++nulls;
+ // cerr << nulls << " null pointers ;"
+ // << __FILE__ << ":" << __LINE__ << endl;
+ std::sort(tol.begin(), tol.end(), cmp);
+ }
}
-
- EvaluateTranslatonOptionListWithSourceContext(transOptList);
- }
}
-}
-
-void TranslationOptionCollection::EvaluateTranslatonOptionListWithSourceContext(
- TranslationOptionList &translationOptionList)
-{
-
- const std::vector<FeatureFunction*> &ffs = FeatureFunction::GetFeatureFunctions();
- const StaticData &staticData = StaticData::Instance();
- for (size_t i = 0; i < ffs.size(); ++i) {
- const FeatureFunction &ff = *ffs[i];
- if (! staticData.IsFeatureFunctionIgnored(ff)) {
- ff.EvaluateTranslationOptionListWithSourceContext(m_source, translationOptionList);
- }
+
+ /** Check if this range overlaps with any XML options. This doesn't need to be an exact match, only an overlap.
+ * By default, we don't support XML options; subclasses need to override this function.
+ * called by CreateTranslationOptionsForRange()
+ * \param sPos first position in input sentence
+ * \param lastPos last position in input sentence
+ */
+ bool
+ TranslationOptionCollection::
+ HasXmlOptionsOverlappingRange(size_t, size_t) const
+ { return false; }
+
+ /** Check if an option conflicts with any constraint XML options. The option
+ * is okay if the XML option is a substring in both source and target.
+ * By default, we don't support XML options; subclasses need to override this function.
+ * called by CreateTranslationOptionsForRange()
+ * \param sPos first position in input sentence
+ * \param lastPos last position in input sentence
+ */
+ bool
+ TranslationOptionCollection::
+ ViolatesXmlOptionsConstraint(size_t, size_t, TranslationOption*) const
+ { return false; }
+
+ /** Populates the current Collection with XML options exactly covering the range specified. Default implementation does nothing.
+ * called by CreateTranslationOptionsForRange()
+ * \param sPos first position in input sentence
+ * \param lastPos last position in input sentence
+ */
+ void
+ TranslationOptionCollection::
+ CreateXmlOptionsForRange(size_t, size_t)
+ { }
+
+
+ /** Add translation option to the list
+ * \param translationOption translation option to be added */
+ void
+ TranslationOptionCollection::
+ Add(TranslationOption *translationOption)
+ {
+ const WordsRange &coverage = translationOption->GetSourceWordsRange();
+ size_t const s = coverage.GetStartPos();
+ size_t const e = coverage.GetEndPos();
+ size_t const i = e - s;
+
+ UTIL_THROW_IF2(e >= m_source.GetSize(),
+ "Coverage exceeds input size:" << coverage << "\n"
+ << "translationOption=" << *translationOption);
+
+ vector<TranslationOptionList>& v = m_collection[s];
+ while (i >= v.size()) v.push_back(TranslationOptionList());
+ v[i].Add(translationOption);
}
-
-}
-
-void TranslationOptionCollection::Sort()
-{
- size_t size = m_source.GetSize();
- for (size_t startPos = 0 ; startPos < size; ++startPos) {
- size_t maxSize = size - startPos;
- size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();
- maxSize = std::min(maxSize, maxSizePhrase);
-
- for (size_t endPos = startPos ; endPos < startPos + maxSize; ++endPos) {
- TranslationOptionList &transOptList = GetTranslationOptionList(startPos, endPos);
- std::sort(transOptList.begin(), transOptList.end(), CompareTranslationOption);
- }
+
+ TO_STRING_BODY(TranslationOptionCollection);
+
+ std::ostream&
+ operator<<(std::ostream& out, const TranslationOptionCollection& coll)
+ {
+ size_t stop = coll.m_source.GetSize();
+ TranslationOptionList const* tol;
+ for (size_t sPos = 0 ; sPos < stop ; ++sPos)
+ {
+ for (size_t ePos = sPos;
+ (tol = coll.GetTranslationOptionList(sPos, ePos)) != NULL;
+ ++ePos)
+ {
+ BOOST_FOREACH(TranslationOption const* to, *tol)
+ out << *to << std::endl;
+ }
+ }
+ return out;
}
-}
-
-/** Check if this range overlaps with any XML options. This doesn't need to be an exact match, only an overlap.
- * by default, we don't support XML options. subclasses need to override this function.
- * called by CreateTranslationOptionsForRange()
- * \param startPos first position in input sentence
- * \param lastPos last position in input sentence
- */
-bool TranslationOptionCollection::HasXmlOptionsOverlappingRange(size_t, size_t) const
-{
- return false;
- //not implemented for base class
-}
-
-/** Check if an option conflicts with any constraint XML options. Okay, if XML option is substring in source and target.
- * by default, we don't support XML options. subclasses need to override this function.
- * called by CreateTranslationOptionsForRange()
- * \param startPos first position in input sentence
- * \param lastPos last position in input sentence
- */
-bool TranslationOptionCollection::ViolatesXmlOptionsConstraint(size_t, size_t, TranslationOption *) const
-{
- return false;
- //not implemented for base class
-}
-
-/** Populates the current Collection with XML options exactly covering the range specified. Default implementation does nothing.
- * called by CreateTranslationOptionsForRange()
- * \param startPos first position in input sentence
- * \param lastPos last position in input sentence
- */
-void TranslationOptionCollection::CreateXmlOptionsForRange(size_t, size_t)
-{
- //not implemented for base class
-};
-
-
-/** Add translation option to the list
- * \param translationOption translation option to be added */
-void TranslationOptionCollection::Add(TranslationOption *translationOption)
-{
- const WordsRange &coverage = translationOption->GetSourceWordsRange();
-
- if (coverage.GetEndPos() - coverage.GetStartPos() >= m_collection[coverage.GetStartPos()].size()) {
- cerr << "translationOption=" << *translationOption << endl;
- cerr << "coverage=" << coverage << endl;
+
+ void
+ TranslationOptionCollection::
+ CacheLexReordering()
+ {
+ typedef StatefulFeatureFunction sfFF;
+ std::vector<const sfFF*> const& all_sfff
+ = sfFF::GetStatefulFeatureFunctions();
+ size_t const stop = m_source.GetSize();
+
+ BOOST_FOREACH(sfFF const* ff, all_sfff)
+ {
+ if (typeid(*ff) != typeid(LexicalReordering)) continue;
+ LexicalReordering const& lr = static_cast<const LexicalReordering&>(*ff);
+ for (size_t s = 0 ; s < stop ; s++)
+ {
+ BOOST_FOREACH(TranslationOptionList const& tol, m_collection[s])
+ {
+ BOOST_FOREACH(TranslationOption* to, tol)
+ {
+ Phrase const& sphrase = to->GetInputPath().GetPhrase();
+ Phrase const& tphrase = to->GetTargetPhrase();
+ Scores score = lr.GetProb(sphrase,tphrase);
+ if (!score.empty()) to->CacheLexReorderingScores(lr, score);
+ }
+ }
+ }
+ }
}
-
- UTIL_THROW_IF2(coverage.GetEndPos() - coverage.GetStartPos() >= m_collection[coverage.GetStartPos()].size(),
- "Out of bound access: " << coverage);
- m_collection[coverage.GetStartPos()][coverage.GetEndPos() - coverage.GetStartPos()].Add(translationOption);
-}
-
-TO_STRING_BODY(TranslationOptionCollection);
-
-std::ostream& operator<<(std::ostream& out, const TranslationOptionCollection& coll)
-{
- size_t size = coll.m_source.GetSize();
- for (size_t startPos = 0 ; startPos < size ; ++startPos) {
- size_t maxSize = size - startPos;
- size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();
- maxSize = std::min(maxSize, maxSizePhrase);
-
- for (size_t endPos = startPos ; endPos < startPos + maxSize ; ++endPos) {
- const TranslationOptionList& fullList = coll.GetTranslationOptionList(startPos, endPos);
- size_t sizeFull = fullList.size();
- for (size_t i = 0; i < sizeFull; i++) {
- out << *fullList.Get(i) << std::endl;
- }
- }
+
+ //! list of trans opt for a particular span
+ TranslationOptionList*
+ TranslationOptionCollection::
+ GetTranslationOptionList(size_t const sPos, size_t const ePos)
+ {
+ UTIL_THROW_IF2(sPos >= m_collection.size(), "Out of bound access.");
+ vector<TranslationOptionList>& tol = m_collection[sPos];
+ size_t idx = ePos - sPos;
+ return idx < tol.size() ? &tol[idx] : NULL;
}
-
- //std::vector< std::vector< TranslationOptionList > >::const_iterator i = coll.m_collection.begin();
- //size_t j = 0;
- //for (; i!=coll.m_collection.end(); ++i) {
- //out << "s[" << j++ << "].size=" << i->size() << std::endl;
- //}
-
- return out;
-}
-
-void TranslationOptionCollection::CacheLexReordering()
-{
- size_t size = m_source.GetSize();
-
- const std::vector<const StatefulFeatureFunction*> &ffs = StatefulFeatureFunction::GetStatefulFeatureFunctions();
- std::vector<const StatefulFeatureFunction*>::const_iterator iter;
- for (iter = ffs.begin(); iter != ffs.end(); ++iter) {
- const StatefulFeatureFunction &ff = **iter;
- if (typeid(ff) == typeid(LexicalReordering)) {
- const LexicalReordering &lexreordering = static_cast<const LexicalReordering&>(ff);
- for (size_t startPos = 0 ; startPos < size ; startPos++) {
- size_t maxSize = size - startPos;
- size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();
- maxSize = std::min(maxSize, maxSizePhrase);
-
- for (size_t endPos = startPos ; endPos < startPos + maxSize; endPos++) {
- TranslationOptionList &transOptList = GetTranslationOptionList( startPos, endPos);
- TranslationOptionList::iterator iterTransOpt;
- for(iterTransOpt = transOptList.begin() ; iterTransOpt != transOptList.end() ; ++iterTransOpt) {
- TranslationOption &transOpt = **iterTransOpt;
- //Phrase sourcePhrase = m_source.GetSubString(WordsRange(startPos,endPos));
- const Phrase &sourcePhrase = transOpt.GetInputPath().GetPhrase();
- Scores score = lexreordering.GetProb(sourcePhrase
- , transOpt.GetTargetPhrase());
- if (!score.empty())
- transOpt.CacheLexReorderingScores(lexreordering, score);
- } // for(iterTransOpt
- } // for (size_t endPos = startPos ; endPos < startPos + maxSize; endPos++) {
- } // for (size_t startPos = 0 ; startPos < size ; startPos++) {
- } // if (typeid(ff) == typeid(LexicalReordering)) {
- } // for (iter = ffs.begin(); iter != ffs.end(); ++iter) {
-}
-
-//! list of trans opt for a particular span
-TranslationOptionList &TranslationOptionCollection::GetTranslationOptionList(size_t startPos, size_t endPos)
-{
- size_t maxSize = endPos - startPos;
- size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();
- maxSize = std::min(maxSize, maxSizePhrase);
-
- UTIL_THROW_IF2(maxSize >= m_collection[startPos].size(),
- "Out of bound access: " << maxSize);
-
- return m_collection[startPos][maxSize];
-}
-const TranslationOptionList &TranslationOptionCollection::GetTranslationOptionList(size_t startPos, size_t endPos) const
-{
- size_t maxSize = endPos - startPos;
- size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();
- maxSize = std::min(maxSize, maxSizePhrase);
-
- UTIL_THROW_IF2(maxSize >= m_collection[startPos].size(),
- "Out of bound access: " << maxSize);
- return m_collection[startPos][maxSize];
-}
-
-void TranslationOptionCollection::GetTargetPhraseCollectionBatch()
-{
- const vector <DecodeGraph*> &decodeGraphList = StaticData::Instance().GetDecodeGraphs();
- for (size_t graphInd = 0 ; graphInd < decodeGraphList.size() ; graphInd++) {
- const DecodeGraph &decodeGraph = *decodeGraphList[graphInd];
-
- list <const DecodeStep* >::const_iterator iterStep;
- for (iterStep = decodeGraph.begin(); iterStep != decodeGraph.end() ; ++iterStep) {
- const DecodeStep &decodeStep = **iterStep;
- const DecodeStepTranslation *transStep = dynamic_cast<const DecodeStepTranslation *>(&decodeStep);
- if (transStep) {
- const PhraseDictionary &phraseDictionary = *transStep->GetPhraseDictionaryFeature();
- phraseDictionary.GetTargetPhraseCollectionBatch(m_inputPathQueue);
+
+ TranslationOptionList const*
+ TranslationOptionCollection::
+ GetTranslationOptionList(size_t sPos, size_t ePos) const
+ {
+ UTIL_THROW_IF2(sPos >= m_collection.size(), "Out of bound access.");
+ vector<TranslationOptionList> const& tol = m_collection[sPos];
+ size_t idx = ePos - sPos;
+ return idx < tol.size() ? &tol[idx] : NULL;
+ }
+
+ void
+ TranslationOptionCollection::
+ GetTargetPhraseCollectionBatch()
+ {
+ typedef DecodeStepTranslation Tstep;
+ const vector <DecodeGraph*> &dgl = StaticData::Instance().GetDecodeGraphs();
+ BOOST_FOREACH(DecodeGraph const* dgraph, dgl)
+ {
+ typedef list <const DecodeStep* >::const_iterator dsiter;
+ for (dsiter i = dgraph->begin(); i != dgraph->end() ; ++i)
+ {
+ const Tstep* tstep = dynamic_cast<const Tstep *>(*i);
+ if (tstep)
+ {
+ const PhraseDictionary &pdict = *tstep->GetPhraseDictionaryFeature();
+ pdict.GetTargetPhraseCollectionBatch(m_inputPathQueue);
+ }
+ }
}
- }
}
-}
-
+
} // namespace
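
The change that ripples through this whole file: GetTranslationOptionList() now returns a pointer that is NULL once ePos runs past the lists stored for sPos, instead of a reference guarded by UTIL_THROW_IF2. The NULL doubles as a loop terminator, exactly as operator<< above uses it. A sketch of the new calling convention, with DumpSpans as a hypothetical name:

    #include <iostream>
    #include <boost/foreach.hpp>

    // Print every option for spans starting at sPos; the NULL return
    // from GetTranslationOptionList() terminates the loop.
    void DumpSpans(Moses::TranslationOptionCollection const& coll, size_t sPos) {
      Moses::TranslationOptionList const* tol;
      for (size_t ePos = sPos;
           (tol = coll.GetTranslationOptionList(sPos, ePos)) != NULL;
           ++ePos) {
        BOOST_FOREACH(Moses::TranslationOption const* opt, *tol)
          std::cout << *opt << std::endl;
      }
    }
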
diff --git a/moses/TranslationOptionCollection.h b/moses/TranslationOptionCollection.h
index 2db0df34a..562912b18 100644
--- a/moses/TranslationOptionCollection.h
+++ b/moses/TranslationOptionCollection.h
@@ -1,3 +1,4 @@
+// -*- c++ -*-
// $Id$
/***********************************************************************
@@ -88,9 +89,17 @@ protected:
//! sort all trans opt in each list for cube pruning */
void Sort();
+public:
+ // is there any good reason not to make these public? UG
+
//! list of trans opt for a particular span
- TranslationOptionList &GetTranslationOptionList(size_t startPos, size_t endPos);
- const TranslationOptionList &GetTranslationOptionList(size_t startPos, size_t endPos) const;
+ TranslationOptionList*
+ GetTranslationOptionList(size_t startPos, size_t endPos);
+
+ TranslationOptionList const*
+ GetTranslationOptionList(size_t startPos, size_t endPos) const;
+
+protected:
void Add(TranslationOption *translationOption);
//! implemented by inherited class, called by this class
@@ -104,7 +113,7 @@ protected:
void GetTargetPhraseCollectionBatch();
- void CreateTranslationOptionsForRange(
+ bool CreateTranslationOptionsForRange(
const DecodeGraph &decodeGraph
, size_t startPos
, size_t endPos
@@ -129,15 +138,20 @@ public:
//! Create all possible translations from the phrase tables
virtual void CreateTranslationOptions();
- //! Create translation options that exactly cover a specific input span.
- virtual void CreateTranslationOptionsForRange(const DecodeGraph &decodeStepList
- , size_t startPosition
- , size_t endPosition
- , bool adhereTableLimit
- , size_t graphInd) = 0;
+ //! Create translation options that exactly cover a specific input span.
+ virtual
+ bool
+ CreateTranslationOptionsForRange
+ (const DecodeGraph &decodeStepList,
+ size_t startPosition, size_t endPosition,
+ bool adhereTableLimit, size_t graphInd) = 0;
+
//!Check if this range has XML options
- virtual bool HasXmlOptionsOverlappingRange(size_t startPosition, size_t endPosition) const;
+ virtual
+ bool
+ HasXmlOptionsOverlappingRange(size_t startPosition,
+ size_t endPosition) const;
//! Check if a subsumed XML option constraint is satisfied
virtual bool ViolatesXmlOptionsConstraint(size_t startPosition, size_t endPosition, TranslationOption *transOpt) const;
@@ -152,7 +166,9 @@ public:
}
//! list of trans opt for a particular span
- const TranslationOptionList &GetTranslationOptionList(const WordsRange &coverage) const {
+ TranslationOptionList const*
+ GetTranslationOptionList(const WordsRange &coverage) const
+ {
return GetTranslationOptionList(coverage.GetStartPos(), coverage.GetEndPos());
}
diff --git a/moses/TranslationOptionCollectionConfusionNet.cpp b/moses/TranslationOptionCollectionConfusionNet.cpp
index e03f074b0..698cf51c2 100644
--- a/moses/TranslationOptionCollectionConfusionNet.cpp
+++ b/moses/TranslationOptionCollectionConfusionNet.cpp
@@ -177,52 +177,51 @@ void TranslationOptionCollectionConfusionNet::CreateTranslationOptions()
* \param startPos first position in input sentence
* \param endPos last position in input sentence
* \param adhereTableLimit whether phrase & generation table limits are adhered to
+ * \return true if at least one path for the range has matches
+ * in the source side of the parallel data, i.e., the phrase prefix exists
+ * (abort condition for trie-based lookup when false)
*/
-void TranslationOptionCollectionConfusionNet::CreateTranslationOptionsForRange(
- const DecodeGraph &decodeGraph
- , size_t startPos
- , size_t endPos
- , bool adhereTableLimit
- , size_t graphInd)
+bool
+TranslationOptionCollectionConfusionNet::
+CreateTranslationOptionsForRange(const DecodeGraph &decodeGraph,
+ size_t startPos, size_t endPos,
+ bool adhereTableLimit, size_t graphInd)
{
if (StaticData::Instance().GetUseLegacyPT()) {
- CreateTranslationOptionsForRangeLEGACY(decodeGraph, startPos, endPos, adhereTableLimit, graphInd);
+ return CreateTranslationOptionsForRangeLEGACY(decodeGraph, startPos, endPos,
+ adhereTableLimit, graphInd);
} else {
- CreateTranslationOptionsForRangeNew(decodeGraph, startPos, endPos, adhereTableLimit, graphInd);
+ return CreateTranslationOptionsForRangeNew(decodeGraph, startPos, endPos,
+ adhereTableLimit, graphInd);
}
}
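+
+// Hypothetical use of the new return value (not part of this commit):
+// a caller growing spans left to right can stop as soon as no path for
+// [startPos, endPos] has a matching phrase prefix, since longer spans
+// share that prefix and cannot match either, e.g.
+//
+//   for (size_t end = start; end < sourceSize; ++end)
+//     if (!CreateTranslationOptionsForRange(dg, start, end, limit, graphInd))
+//       break;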
-void TranslationOptionCollectionConfusionNet::CreateTranslationOptionsForRangeNew(
- const DecodeGraph &decodeGraph
- , size_t startPos
- , size_t endPos
- , bool adhereTableLimit
- , size_t graphInd)
+bool
+TranslationOptionCollectionConfusionNet::
+CreateTranslationOptionsForRangeNew(const DecodeGraph &decodeGraph, size_t startPos,
+ size_t endPos, bool adhereTableLimit, size_t graphInd)
{
InputPathList &inputPathList = GetInputPathList(startPos, endPos);
+ if (inputPathList.empty()) return false; // no input path matches!
InputPathList::iterator iter;
for (iter = inputPathList.begin(); iter != inputPathList.end(); ++iter) {
InputPath &inputPath = **iter;
- TranslationOptionCollection::CreateTranslationOptionsForRange(decodeGraph
- , startPos
- , endPos
- , adhereTableLimit
- , graphInd
- , inputPath);
-
+ TranslationOptionCollection::CreateTranslationOptionsForRange
+ (decodeGraph, startPos, endPos, adhereTableLimit, graphInd, inputPath);
}
+ return true;
}
-void TranslationOptionCollectionConfusionNet::CreateTranslationOptionsForRangeLEGACY(
- const DecodeGraph &decodeGraph
- , size_t startPos
- , size_t endPos
- , bool adhereTableLimit
- , size_t graphInd)
+bool
+TranslationOptionCollectionConfusionNet::
+CreateTranslationOptionsForRangeLEGACY(const DecodeGraph &decodeGraph, size_t startPos,
+ size_t endPos, bool adhereTableLimit, size_t graphInd)
{
- if ((StaticData::Instance().GetXmlInputType() != XmlExclusive) || !HasXmlOptionsOverlappingRange(startPos,endPos)) {
+ bool retval = true;
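+ // NOTE: retval is never set to false below, so the LEGACY path always
+ // reports success; it does not detect dead-end phrase prefixes.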
+ XmlInputType intype = StaticData::Instance().GetXmlInputType();
+ if ((intype != XmlExclusive) || !HasXmlOptionsOverlappingRange(startPos,endPos)) {
InputPathList &inputPathList = GetInputPathList(startPos, endPos);
-
+
// partial trans opt stored in here
PartialTranslOptColl* oldPtoc = new PartialTranslOptColl;
size_t totalEarlyPruned = 0;
@@ -232,8 +231,7 @@ void TranslationOptionCollectionConfusionNet::CreateTranslationOptionsForRangeLE
const DecodeStep &decodeStep = **iterStep;
static_cast<const DecodeStepTranslation&>(decodeStep).ProcessInitialTranslationLEGACY
- (m_source, *oldPtoc
- , startPos, endPos, adhereTableLimit, inputPathList );
+ (m_source, *oldPtoc, startPos, endPos, adhereTableLimit, inputPathList);
// do rest of decode steps
int indexStep = 0;
@@ -292,11 +290,14 @@ void TranslationOptionCollectionConfusionNet::CreateTranslationOptionsForRangeLE
delete oldPtoc;
// TRACE_ERR( "Early translation options pruned: " << totalEarlyPruned << endl);
- } // if ((StaticData::Instance().GetXmlInputType() != XmlExclusive) || !HasXmlOptionsOverlappingRange(startPos,endPos))
-
- if (graphInd == 0 && StaticData::Instance().GetXmlInputType() != XmlPassThrough && HasXmlOptionsOverlappingRange(startPos,endPos)) {
+ } // if ((intype != XmlExclusive) || !HasXmlOptionsOverlappingRange(startPos,endPos))
+
+
+ if (graphInd == 0 && intype != XmlPassThrough &&
+ HasXmlOptionsOverlappingRange(startPos,endPos)) {
CreateXmlOptionsForRange(startPos, endPos);
}
+ return retval;
}
diff --git a/moses/TranslationOptionCollectionConfusionNet.h b/moses/TranslationOptionCollectionConfusionNet.h
index ee53f35eb..cf01ebdb1 100644
--- a/moses/TranslationOptionCollectionConfusionNet.h
+++ b/moses/TranslationOptionCollectionConfusionNet.h
@@ -22,13 +22,13 @@ protected:
InputPathMatrix m_inputPathMatrix; /*< contains translation options */
InputPathList &GetInputPathList(size_t startPos, size_t endPos);
- void CreateTranslationOptionsForRangeNew(const DecodeGraph &decodeStepList
+ bool CreateTranslationOptionsForRangeNew(const DecodeGraph &decodeStepList
, size_t startPosition
, size_t endPosition
, bool adhereTableLimit
, size_t graphInd);
- void CreateTranslationOptionsForRangeLEGACY(const DecodeGraph &decodeStepList
+ bool CreateTranslationOptionsForRangeLEGACY(const DecodeGraph &decodeStepList
, size_t startPosition
, size_t endPosition
, bool adhereTableLimit
@@ -39,12 +39,12 @@ public:
void ProcessUnknownWord(size_t sourcePos);
void CreateTranslationOptions();
- void CreateTranslationOptionsForRange(const DecodeGraph &decodeStepList
- , size_t startPosition
- , size_t endPosition
- , bool adhereTableLimit
- , size_t graphInd);
+ bool
+ CreateTranslationOptionsForRange
+ (const DecodeGraph &decodeStepList, size_t spos, size_t epos,
+ bool adhereTableLimit, size_t graphInd);
+
protected:
};
diff --git a/moses/TranslationOptionCollectionLattice.cpp b/moses/TranslationOptionCollectionLattice.cpp
index d20e07fbf..6f9de7836 100644
--- a/moses/TranslationOptionCollectionLattice.cpp
+++ b/moses/TranslationOptionCollectionLattice.cpp
@@ -170,18 +170,22 @@ void TranslationOptionCollectionLattice::CreateTranslationOptions()
}
-void TranslationOptionCollectionLattice::ProcessUnknownWord(size_t sourcePos)
+void
+TranslationOptionCollectionLattice::
+ProcessUnknownWord(size_t sourcePos)
{
UTIL_THROW(util::Exception, "ProcessUnknownWord() not implemented for lattice");
+ // why??? UG
}
-void TranslationOptionCollectionLattice::CreateTranslationOptionsForRange(const DecodeGraph &decodeStepList
- , size_t startPosition
- , size_t endPosition
- , bool adhereTableLimit
- , size_t graphInd)
+bool
+TranslationOptionCollectionLattice::
+CreateTranslationOptionsForRange
+(const DecodeGraph &decodeStepList, size_t startPosition, size_t endPosition,
+ bool adhereTableLimit, size_t graphInd)
{
- UTIL_THROW(util::Exception, "CreateTranslationOptionsForRange() not implemented for lattice");
+ UTIL_THROW(util::Exception,
+ "CreateTranslationOptionsForRange() not implemented for lattice");
}
} // namespace
diff --git a/moses/TranslationOptionCollectionLattice.h b/moses/TranslationOptionCollectionLattice.h
index 09efb4b3c..cea90f11e 100644
--- a/moses/TranslationOptionCollectionLattice.h
+++ b/moses/TranslationOptionCollectionLattice.h
@@ -26,11 +26,10 @@ public:
void CreateTranslationOptions();
- void CreateTranslationOptionsForRange(const DecodeGraph &decodeStepList
- , size_t startPosition
- , size_t endPosition
- , bool adhereTableLimit
- , size_t graphInd); // do not implement
+ bool
+ CreateTranslationOptionsForRange
+ (const DecodeGraph &decodeStepList, size_t startPosition, size_t endPosition,
+ bool adhereTableLimit, size_t graphInd); // do not implement
protected:
void Extend(const InputPath &prevPath, const WordLattice &input);
diff --git a/moses/TranslationOptionCollectionText.cpp b/moses/TranslationOptionCollectionText.cpp
index 2db62fc8f..0f7671a70 100644
--- a/moses/TranslationOptionCollectionText.cpp
+++ b/moses/TranslationOptionCollectionText.cpp
@@ -171,21 +171,18 @@ void TranslationOptionCollectionText::CreateTranslationOptions()
* \param endPos last position in input sentence
* \param adhereTableLimit whether phrase & generation table limits are adhered to
*/
-void TranslationOptionCollectionText::CreateTranslationOptionsForRange(
- const DecodeGraph &decodeGraph
- , size_t startPos
- , size_t endPos
- , bool adhereTableLimit
- , size_t graphInd)
+bool
+TranslationOptionCollectionText::
+CreateTranslationOptionsForRange
+(const DecodeGraph &decodeGraph, size_t startPos, size_t endPos,
+ bool adhereTableLimit, size_t graphInd)
{
InputPath &inputPath = GetInputPath(startPos, endPos);
-
- TranslationOptionCollection::CreateTranslationOptionsForRange(decodeGraph
- , startPos
- , endPos
- , adhereTableLimit
- , graphInd
- , inputPath);
+
+ return
+ TranslationOptionCollection::
+ CreateTranslationOptionsForRange
+ (decodeGraph, startPos, endPos, adhereTableLimit, graphInd, inputPath);
}
diff --git a/moses/TranslationOptionCollectionText.h b/moses/TranslationOptionCollectionText.h
index 6ba5598ef..cdb35963e 100644
--- a/moses/TranslationOptionCollectionText.h
+++ b/moses/TranslationOptionCollectionText.h
@@ -56,7 +56,7 @@ public:
void CreateTranslationOptions();
- void CreateTranslationOptionsForRange(const DecodeGraph &decodeStepList
+ bool CreateTranslationOptionsForRange(const DecodeGraph &decodeStepList
, size_t startPosition
, size_t endPosition
, bool adhereTableLimit
diff --git a/moses/TranslationOptionList.cpp b/moses/TranslationOptionList.cpp
index 1d99729fe..4e6449b5b 100644
--- a/moses/TranslationOptionList.cpp
+++ b/moses/TranslationOptionList.cpp
@@ -1,39 +1,90 @@
-
#include "TranslationOptionList.h"
#include "Util.h"
#include "TranslationOption.h"
+#include <boost/foreach.hpp>
using namespace std;
namespace Moses
{
-TranslationOptionList::TranslationOptionList(const TranslationOptionList &copy)
-{
- const_iterator iter;
- for (iter = copy.begin(); iter != copy.end(); ++iter) {
- const TranslationOption &origTransOpt = **iter;
- TranslationOption *newTransOpt = new TranslationOption(origTransOpt);
- Add(newTransOpt);
+ TranslationOptionList::
+ TranslationOptionList(const TranslationOptionList &copy)
+ {
+ const_iterator iter;
+ for (iter = copy.begin(); iter != copy.end(); ++iter) {
+ const TranslationOption &origTransOpt = **iter;
+ TranslationOption *newTransOpt = new TranslationOption(origTransOpt);
+ Add(newTransOpt);
+ }
}
-}
-TranslationOptionList::~TranslationOptionList()
-{
- RemoveAllInColl(m_coll);
-}
+ TranslationOptionList::
+ ~TranslationOptionList()
+ {
+ RemoveAllInColl(m_coll);
+ }
-TO_STRING_BODY(TranslationOptionList);
+ TO_STRING_BODY(TranslationOptionList);
-std::ostream& operator<<(std::ostream& out, const TranslationOptionList& coll)
-{
- TranslationOptionList::const_iterator iter;
- for (iter = coll.begin(); iter != coll.end(); ++iter) {
- const TranslationOption &transOpt = **iter;
- out << transOpt << endl;
+ std::ostream& operator<<(std::ostream& out, const TranslationOptionList& coll)
+ {
+ TranslationOptionList::const_iterator iter;
+ for (iter = coll.begin(); iter != coll.end(); ++iter) {
+ const TranslationOption &transOpt = **iter;
+ out << transOpt << endl;
+ }
+
+ return out;
+ }
+
+ size_t
+ TranslationOptionList::
+ SelectNBest(size_t const N)
+ {
+ if (N == 0 || N >= m_coll.size()) return 0;
+ static TranslationOption::Better cmp;
+ NTH_ELEMENT4(m_coll.begin(), m_coll.begin() + N, m_coll.end(), cmp);
+ // delete the rest
+ for (size_t i = N ; i < m_coll.size() ; ++i) delete m_coll[i];
+ size_t ret = m_coll.size() - N;
+ m_coll.resize(N);
+ return ret;
+ }
+
+ size_t
+ TranslationOptionList::
+ PruneByThreshold(float const th)
+ {
+ if (m_coll.size() <= 1) return 0;
+ if (th == -std::numeric_limits<float>::infinity()) return 0;
+
+ // first, find the best score
+ float bestScore = -std::numeric_limits<float>::infinity();
+ BOOST_FOREACH(TranslationOption const* t, m_coll)
+ {
+ if (t->GetFutureScore() > bestScore)
+ bestScore = t->GetFutureScore();
+ }
+
+ size_t old_size = m_coll.size();
+
+ // then, remove options whose future score falls below bestScore + th
+ // (th is expected to be a negative margin on the log scale, so this
+ // keeps only the options within |th| of the best score)
+ for (size_t i = 0; i < m_coll.size(); )
+ {
+ if (m_coll[i]->GetFutureScore() < bestScore + th)
+ {
+ delete m_coll[i];
+ std::swap(m_coll[i], m_coll.back());
+ m_coll.pop_back();
+ // do not advance i: the element swapped in from the back has not
+ // been checked yet
+ }
+ else ++i;
+ }
+
+ return old_size - m_coll.size();
}
- return out;
-}
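+
+ // A minimal usage sketch for the two new pruning helpers (illustrative
+ // only; the limit and threshold below are made-up values, not Moses
+ // defaults):
+ //
+ //   TranslationOptionList tol;
+ //   ...                                 // populate via tol.Add(...)
+ //   size_t cut = tol.SelectNBest(20);   // keep the 20 highest-scoring
+ //   cut += tol.PruneByThreshold(-1.0f); // drop options > 1.0 below best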
} // namespace
diff --git a/moses/TranslationOptionList.h b/moses/TranslationOptionList.h
index 39ab526f9..c12d108af 100644
--- a/moses/TranslationOptionList.h
+++ b/moses/TranslationOptionList.h
@@ -1,5 +1,5 @@
-#ifndef moses_TranslationOptionList_h
-#define moses_TranslationOptionList_h
+// -*- c++ -*-
+#pragma once
#include <vector>
#include "util/exception.hh"
@@ -9,62 +9,63 @@
namespace Moses
{
-class TranslationOption;
+ class TranslationOption;
-/** wrapper around vector of translation options
- */
-class TranslationOptionList
-{
- friend std::ostream& operator<<(std::ostream& out, const TranslationOptionList& coll);
-
-protected:
- typedef std::vector<TranslationOption*> CollType;
- CollType m_coll;
-
-public:
- typedef CollType::iterator iterator;
- typedef CollType::const_iterator const_iterator;
- const_iterator begin() const {
- return m_coll.begin();
- }
- const_iterator end() const {
- return m_coll.end();
- }
- iterator begin() {
- return m_coll.begin();
- }
- iterator end() {
- return m_coll.end();
- }
-
- TranslationOptionList() {
- }
- TranslationOptionList(const TranslationOptionList &copy);
- ~TranslationOptionList();
+ /** wrapper around vector of translation options
+ */
+ class TranslationOptionList
+ {
+ friend std::ostream& operator<<(std::ostream& out, const TranslationOptionList& coll);
- void resize(size_t newSize) {
- m_coll.resize(newSize);
- }
- size_t size() const {
- return m_coll.size();
- }
+ protected:
+ typedef std::vector<TranslationOption*> CollType;
+ CollType m_coll;
+
+ public:
+ typedef CollType::iterator iterator;
+ typedef CollType::const_iterator const_iterator;
+ const_iterator begin() const {
+ return m_coll.begin();
+ }
+ const_iterator end() const {
+ return m_coll.end();
+ }
+ iterator begin() {
+ return m_coll.begin();
+ }
+ iterator end() {
+ return m_coll.end();
+ }
+
+ TranslationOptionList() {
+ }
+ TranslationOptionList(const TranslationOptionList &copy);
+ ~TranslationOptionList();
+
+ void resize(size_t newSize) {
+ m_coll.resize(newSize);
+ }
+ size_t size() const {
+ return m_coll.size();
+ }
+
+ const TranslationOption *Get(size_t ind) const {
+ return m_coll.at(ind);
+ }
+ void Remove( size_t ind ) {
+ UTIL_THROW_IF2(ind >= m_coll.size(),
+ "Out of bound index " << ind);
+ m_coll.erase( m_coll.begin()+ind );
+ }
+ void Add(TranslationOption *transOpt) {
+ UTIL_THROW_IF2(!transOpt, "Not a valid translation option!");
+ m_coll.push_back(transOpt);
+ }
+
+ TO_STRING();
- const TranslationOption *Get(size_t ind) const {
- return m_coll.at(ind);
- }
- void Remove( size_t ind ) {
- UTIL_THROW_IF2(ind >= m_coll.size(),
- "Out of bound index " << ind);
- m_coll.erase( m_coll.begin()+ind );
- }
- void Add(TranslationOption *transOpt) {
- m_coll.push_back(transOpt);
- }
-
- TO_STRING();
-
-};
+ size_t SelectNBest(size_t const N);
+ size_t PruneByThreshold(float const th);
+ };
}
-
-#endif
diff --git a/moses/TranslationTask.cpp b/moses/TranslationTask.cpp
index 7c629db7f..eff0588b6 100644
--- a/moses/TranslationTask.cpp
+++ b/moses/TranslationTask.cpp
@@ -61,7 +61,7 @@ void TranslationTask::Run()
// which manager
BaseManager *manager;
- if (!staticData.IsChart()) {
+ if (!staticData.IsSyntax()) {
// phrase-based
manager = new Manager(*m_source);
} else if (staticData.GetSearchAlgorithm() == SyntaxF2S ||
diff --git a/moses/TypeDef.h b/moses/TypeDef.h
index 0a1e1ad9b..a619639bc 100644
--- a/moses/TypeDef.h
+++ b/moses/TypeDef.h
@@ -122,7 +122,7 @@ enum InputTypeEnum {
,ConfusionNetworkInput = 1
,WordLatticeInput = 2
,TreeInputType = 3
- ,WordLatticeInput2 = 4
+ //,WordLatticeInput2 = 4
, TabbedSentenceInput = 5
,ForestInputType = 6
};
@@ -140,6 +140,9 @@ enum DictionaryFind {
,All = 1
};
+// Note: StaticData uses SearchAlgorithm to determine whether the translation
+// model is phrase-based or syntax-based. If you add a syntax-based search
+// algorithm here then you should also update StaticData::IsSyntax().
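+//
+// A rough sketch of the kind of check meant here (illustrative; the real
+// implementation lives in StaticData and may list different values):
+//
+//   bool StaticData::IsSyntax() const {
+//     return m_searchAlgorithm == CYKPlus ||
+//            m_searchAlgorithm == SyntaxF2S /* ...other syntax algorithms */;
+//   }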
enum SearchAlgorithm {
Normal = 0
,CubePruning = 1
diff --git a/moses/Util.cpp b/moses/Util.cpp
index 24323f61d..5b6f16e2b 100644
--- a/moses/Util.cpp
+++ b/moses/Util.cpp
@@ -32,6 +32,7 @@
#include <stdio.h>
#include <iostream>
#include <iomanip>
+#include <boost/algorithm/string/predicate.hpp>
#include "TypeDef.h"
#include "Util.h"
#include "Timer.h"
@@ -42,6 +43,7 @@
#include "moses/StaticData.h"
using namespace std;
+using namespace boost::algorithm;
namespace Moses
{
@@ -54,7 +56,7 @@ string GetTempFolder()
#ifdef _WIN32
char *tmpPath = getenv("TMP");
string str(tmpPath);
- if (str.substr(str.size() - 1, 1) != "\\")
+ if (!ends_with(str, "\\"))
str += "\\";
return str;
#else
diff --git a/moses/Util.h b/moses/Util.h
index beefa53da..48e6a51ae 100644
--- a/moses/Util.h
+++ b/moses/Util.h
@@ -48,6 +48,11 @@ namespace Moses
* when compiling for a gui front-end so that running gui won't generate
* output on command line
* */
+
+// TRACE_ERR might have been defined by IRSTLM
+#ifdef TRACE_ERR
+#undef TRACE_ERR
+#endif
#ifdef TRACE_ENABLE
#define TRACE_ERR(str) do { std::cerr << str; } while (false)
#else
@@ -57,7 +62,16 @@ namespace Moses
/** verbose macros
* */
+// VERBOSE might have been defined by IRSTLM
+#ifdef VERBOSE
+#undef VERBOSE
+#endif
#define VERBOSE(level,str) { IFVERBOSE(level) { TRACE_ERR(str); } }
+
+// IFVERBOSE might have been defined by IRSTLM
+#ifdef IFVERBOSE
+#undef IFVERBOSE
+#endif
#define IFVERBOSE(level) if (StaticData::Instance().GetVerboseLevel() >= level)
#define XVERBOSE(level,str) VERBOSE(level, "[" << HERE << "] " << str)
#define HERE __FILE__ << ":" << __LINE__
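+
+// Example (illustrative): with the verbosity level set to 2 or higher,
+//   XVERBOSE(2, "loading " << path << "\n");
+// expands to a TRACE_ERR call that writes something like
+// "[moses/Foo.cpp:42] loading ..." to stderr.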
diff --git a/moses/WordLattice.h b/moses/WordLattice.h
index 325271234..dc1582d78 100644
--- a/moses/WordLattice.h
+++ b/moses/WordLattice.h
@@ -21,6 +21,10 @@ private:
public:
WordLattice();
+
+ InputTypeEnum GetType() const
+ { return WordLatticeInput; }
+
size_t GetColumnIncrement(size_t ic, size_t j) const;
void Print(std::ostream&) const;
/** Get shortest path between two nodes
diff --git a/moses/XmlOption.cpp b/moses/XmlOption.cpp
index 3ac4f6cd2..38b767d8a 100644
--- a/moses/XmlOption.cpp
+++ b/moses/XmlOption.cpp
@@ -24,6 +24,7 @@
#include <vector>
#include <string>
#include <iostream>
+#include <boost/algorithm/string/predicate.hpp>
#include <boost/foreach.hpp>
#include <boost/unordered_map.hpp>
#include "Util.h"
@@ -40,6 +41,7 @@
namespace Moses
{
using namespace std;
+using namespace boost::algorithm;
string ParseXmlTagAttribute(const string& tag,const string& attributeName)
{
@@ -73,7 +75,7 @@ string TrimXml(const string& str, const std::string& lbrackStr, const std::strin
if (str.size() < lbrackStr.length()+rbrackStr.length() ) return str;
// strip first and last character
- if (str.substr(0,lbrackStr.length()) == lbrackStr && str.substr(str.size()-rbrackStr.length()) == rbrackStr) {
+ if (starts_with(str, lbrackStr) && ends_with(str, rbrackStr)) {
return str.substr(lbrackStr.length(), str.size()-lbrackStr.length()-rbrackStr.length());
}
// not an xml token -> do nothing
@@ -371,7 +373,7 @@ bool ProcessAndStripXMLTags(string &line, vector<XmlOption*> &res, ReorderingCon
vector<float> ffWeights;
vector<string> toks = Tokenize(ParseXmlTagAttribute(tagContent,"weights"));
BOOST_FOREACH(string const& tok, toks) {
- if (tok.substr(tok.size() - 1, 1) == "=") {
+ if (ends_with(tok, "=")) {
// start new feature
if (ffName != "") {
// set previous feature weights
diff --git a/phrase-extract/OutputFileStream.cpp b/phrase-extract/OutputFileStream.cpp
index a61ce1ab1..15c2bd73e 100644
--- a/phrase-extract/OutputFileStream.cpp
+++ b/phrase-extract/OutputFileStream.cpp
@@ -19,11 +19,13 @@
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
+#include <boost/algorithm/string/predicate.hpp>
#include <boost/iostreams/filter/gzip.hpp>
#include "OutputFileStream.h"
#include "gzfilebuf.h"
using namespace std;
+using namespace boost::algorithm;
namespace Moses
{
@@ -51,7 +53,7 @@ bool OutputFileStream::Open(const std::string &filePath)
return false;
}
- if (filePath.size() > 3 && filePath.substr(filePath.size() - 3, 3) == ".gz") {
+ if (ends_with(filePath, ".gz")) {
this->push(boost::iostreams::gzip_compressor());
}
this->push(*m_outFile);
diff --git a/phrase-extract/ScoreFeature.cpp b/phrase-extract/ScoreFeature.cpp
index c037ab584..0795ed7c9 100644
--- a/phrase-extract/ScoreFeature.cpp
+++ b/phrase-extract/ScoreFeature.cpp
@@ -17,11 +17,13 @@
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
+#include <boost/algorithm/string/predicate.hpp>
#include "ScoreFeature.h"
#include "DomainFeature.h"
#include "InternalStructFeature.h"
using namespace std;
+using namespace boost::algorithm;
namespace MosesTraining
{
@@ -41,7 +43,7 @@ void ScoreFeatureManager::configure(const std::vector<std::string> args)
for (size_t i = 0; i < args.size(); ++i) {
if (args[i] == "--IgnoreSentenceId") {
m_includeSentenceId = true;
- } else if (args[i].substr(0,8) == "--Domain") {
+ } else if (starts_with(args[i], "--Domain")) {
string type = args[i].substr(8);
++i;
UTIL_THROW_IF(i == args.size(), ScoreFeatureArgumentException, "Missing domain file");
@@ -59,7 +61,7 @@ void ScoreFeatureManager::configure(const std::vector<std::string> args)
}
domainAdded = true;
m_includeSentenceId = true;
- } else if (args[i].substr(0,14) == "--SparseDomain") {
+ } else if (starts_with(args[i], "--SparseDomain")) {
string type = args[i].substr(14);
++i;
UTIL_THROW_IF(i == args.size(), ScoreFeatureArgumentException, "Missing domain file");
diff --git a/phrase-extract/consolidate-direct-main.cpp b/phrase-extract/consolidate-direct-main.cpp
index 576cdd568..423a3909b 100644
--- a/phrase-extract/consolidate-direct-main.cpp
+++ b/phrase-extract/consolidate-direct-main.cpp
@@ -28,6 +28,8 @@
using namespace std;
+std::vector<std::string> tokenize( const char [] );
+
vector< string > splitLine(const char *line)
{
vector< string > item;
@@ -58,7 +60,7 @@ bool getLine( istream &fileP, vector< string > &item )
string line;
if (getline(fileP, line)) {
item = splitLine(line.c_str());
- return false;
+ return true;
} else {
return false;
}
@@ -107,17 +109,17 @@ int main(int argc, char* argv[])
if (! getLine(fileDirectP, itemDirect ))
break;
- (*fileConsolidated) << itemDirect[0] << " ||| " << itemDirect[1] << " ||| ";
-
- // output alignment and probabilities
- (*fileConsolidated) << itemDirect[2] // prob direct
- << " 2.718" // phrase count feature
- << " ||| " << itemDirect[3]; // alignment
-
- // counts
- (*fileConsolidated) << "||| 0 " << itemDirect[4]; // indirect
- (*fileConsolidated) << endl;
-
+ vector< string > count = tokenize( itemDirect[4].c_str() );
+ float countEF = atof(count[0].c_str());
+ float countF = atof(count[1].c_str());
+ float prob = countF/countEF;
+
+ (*fileConsolidated) << itemDirect[0] << " ||| " // source
+ << itemDirect[1] << " ||| " // target
+ << prob << " ||| " // prob
+ << itemDirect[3] << " ||| " // alignment
+ << itemDirect[4] << " " << countEF // counts
+ << " ||| " << endl;
}
fileConsolidated->flush();
diff --git a/phrase-extract/extract-mixed-syntax/InputFileStream.cpp b/phrase-extract/extract-mixed-syntax/InputFileStream.cpp
index d111903e6..ef7741476 100644
--- a/phrase-extract/extract-mixed-syntax/InputFileStream.cpp
+++ b/phrase-extract/extract-mixed-syntax/InputFileStream.cpp
@@ -22,8 +22,10 @@
#include "InputFileStream.h"
#include "gzfilebuf.h"
#include <iostream>
+#include <boost/algorithm/string/predicate.hpp>
using namespace std;
+using namespace boost::algorithm;
namespace Moses
{
@@ -31,8 +33,7 @@ InputFileStream::InputFileStream(const std::string &filePath)
: std::istream(NULL)
, m_streambuf(NULL)
{
- if (filePath.size() > 3 &&
- filePath.substr(filePath.size() - 3, 3) == ".gz") {
+ if (ends_with(filePath, ".gz")) {
m_streambuf = new gzfilebuf(filePath.c_str());
} else {
std::filebuf* fb = new std::filebuf();
diff --git a/phrase-extract/filter-rule-table/CfgFilter.h b/phrase-extract/filter-rule-table/CfgFilter.h
new file mode 100644
index 000000000..fde766423
--- /dev/null
+++ b/phrase-extract/filter-rule-table/CfgFilter.h
@@ -0,0 +1,29 @@
+#pragma once
+
+#include <istream>
+#include <ostream>
+#include <string>
+#include <vector>
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace FilterRuleTable
+{
+
+// Base class for StringCfgFilter and TreeCfgFilter, both of which filter rule
+// tables where the source-side is CFG.
+class CfgFilter {
+ public:
+ virtual ~CfgFilter() {}
+
+ // Read a rule table from 'in' and filter it according to the test sentences.
+ virtual void Filter(std::istream &in, std::ostream &out) = 0;
+
+ protected:
+};
+
+} // namespace FilterRuleTable
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/filter-rule-table/FilterRuleTable.cpp b/phrase-extract/filter-rule-table/FilterRuleTable.cpp
index d62d599ec..b7483f0dc 100644
--- a/phrase-extract/filter-rule-table/FilterRuleTable.cpp
+++ b/phrase-extract/filter-rule-table/FilterRuleTable.cpp
@@ -9,20 +9,20 @@
#include <sstream>
#include <vector>
+#include <boost/make_shared.hpp>
#include <boost/program_options.hpp>
-#include "util/string_piece.hh"
-#include "util/string_piece_hash.hh"
-#include "util/tokenize_piece.hh"
-
#include "syntax-common/exception.h"
#include "syntax-common/xml_tree_parser.h"
#include "InputFileStream.h"
+#include "ForestTsgFilter.h"
#include "Options.h"
-#include "StringBasedFilter.h"
-#include "TreeBasedFilter.h"
+#include "StringCfgFilter.h"
+#include "StringForest.h"
+#include "StringForestParser.h"
+#include "TreeTsgFilter.h"
namespace MosesTraining
{
@@ -33,6 +33,19 @@ namespace FilterRuleTable
int FilterRuleTable::Main(int argc, char *argv[])
{
+ enum TestSentenceFormat {
+ kUnknownTestSentenceFormat,
+ kString,
+ kTree,
+ kForest
+ };
+
+ enum SourceSideRuleFormat {
+ kUnknownSourceSideRuleFormat,
+ kCfg,
+ kTsg
+ };
+
// Process command-line options.
Options options;
ProcessOptions(argc, argv, options);
@@ -40,48 +53,105 @@ int FilterRuleTable::Main(int argc, char *argv[])
// Open input file.
Moses::InputFileStream testStream(options.testSetFile);
- // Read the first test sentence and determine if it is a parse tree or a
- // string.
- std::string line;
- if (!std::getline(testStream, line)) {
- // TODO Error?
- return 0;
+ // Determine the expected test sentence format and source-side rule format
+ // based on the argument to the options.model parameter.
+ TestSentenceFormat testSentenceFormat = kUnknownTestSentenceFormat;
+ SourceSideRuleFormat sourceSideRuleFormat = kUnknownSourceSideRuleFormat;
+ if (options.model == "hierarchical" || options.model == "s2t") {
+ testSentenceFormat = kString;
+ sourceSideRuleFormat = kCfg;
+ } else if (options.model == "t2s") {
+ testSentenceFormat = kTree;
+ sourceSideRuleFormat = kTsg;
+ } else if (options.model == "t2s-scfg") {
+ testSentenceFormat = kTree;
+ sourceSideRuleFormat = kCfg;
+ } else if (options.model == "f2s") {
+ testSentenceFormat = kForest;
+ sourceSideRuleFormat = kTsg;
+ } else {
+ Error(std::string("unsupported model type: ") + options.model);
}
- if (line.find_first_of('<') == std::string::npos) {
- // Test sentences are strings.
- std::vector<std::vector<std::string> > sentences;
- do {
- sentences.resize(sentences.size()+1);
- ReadTokens(line, sentences.back());
- } while (std::getline(testStream, line));
- StringBasedFilter filter(sentences);
+
+ // Read the test sentences then set up and run the filter.
+ if (testSentenceFormat == kString) {
+ assert(sourceSideRuleFormat == kCfg);
+ std::vector<boost::shared_ptr<std::string> > testStrings;
+ ReadTestSet(testStream, testStrings);
+ StringCfgFilter filter(testStrings);
filter.Filter(std::cin, std::cout);
- } else {
- // Test sentences are XML parse trees.
- XmlTreeParser parser;
- std::vector<boost::shared_ptr<StringTree> > sentences;
- int lineNum = 1;
- do {
- if (line.size() == 0) {
- std::cerr << "skipping blank test sentence at line " << lineNum
- << std::endl;
- continue;
- }
- sentences.push_back(boost::shared_ptr<StringTree>(parser.Parse(line)));
- ++lineNum;
- } while (std::getline(testStream, line));
- TreeBasedFilter filter(sentences);
+ } else if (testSentenceFormat == kTree) {
+ std::vector<boost::shared_ptr<StringTree> > testTrees;
+ ReadTestSet(testStream, testTrees);
+ if (sourceSideRuleFormat == kCfg) {
+ // TODO Implement TreeCfgFilter
+ Error("tree/cfg filtering algorithm not supported yet");
+ } else if (sourceSideRuleFormat == kTsg) {
+ TreeTsgFilter filter(testTrees);
+ filter.Filter(std::cin, std::cout);
+ } else {
+ assert(false);
+ }
+ } else if (testSentenceFormat == kForest) {
+ std::vector<boost::shared_ptr<StringForest> > testForests;
+ ReadTestSet(testStream, testForests);
+ assert(sourceSideRuleFormat == kTsg);
+ ForestTsgFilter filter(testForests);
filter.Filter(std::cin, std::cout);
}
return 0;
}
-void FilterRuleTable::ReadTokens(const std::string &s,
- std::vector<std::string> &tokens)
+void FilterRuleTable::ReadTestSet(
+ std::istream &input,
+ std::vector<boost::shared_ptr<std::string> > &sentences)
{
- tokens.clear();
-// TODO
+ int lineNum = 0;
+ std::string line;
+ while (std::getline(input, line)) {
+ ++lineNum;
+ if (line.empty()) {
+ std::cerr << "skipping blank test sentence at line " << lineNum
+ << std::endl;
+ continue;
+ }
+ sentences.push_back(boost::make_shared<std::string>(line));
+ }
+}
+
+void FilterRuleTable::ReadTestSet(
+ std::istream &input, std::vector<boost::shared_ptr<StringTree> > &sentences)
+{
+ XmlTreeParser parser;
+ int lineNum = 0;
+ std::string line;
+ while (std::getline(input, line)) {
+ ++lineNum;
+ if (line.empty()) {
+ std::cerr << "skipping blank test sentence at line " << lineNum
+ << std::endl;
+ continue;
+ }
+ sentences.push_back(boost::shared_ptr<StringTree>(parser.Parse(line)));
+ }
+}
+
+void FilterRuleTable::ReadTestSet(
+ std::istream &input,
+ std::vector<boost::shared_ptr<StringForest> > &sentences)
+{
+ StringForestParser end;
+ int sentNum = 0;
+ for (StringForestParser p(input); p != end; ++p) {
+ ++sentNum;
+ if (p->forest->vertices.empty()) {
+ std::cerr << "skipping sentence " << sentNum << ": forest is empty"
+ << std::endl;
+ continue;
+ }
+ sentences.push_back(p->forest);
+ }
}
void FilterRuleTable::ProcessOptions(int argc, char *argv[],
@@ -94,13 +164,14 @@ void FilterRuleTable::ProcessOptions(int argc, char *argv[],
// options list.
std::ostringstream usageTop;
usageTop << "Usage: " << GetName()
- << " [OPTION]... TEST\n\n"
- << "Given a SCFG/STSG rule table (on standard input) and a set of test sentences,\nfilter out the rules that cannot be applied to any of the test sentences and\nwrite the filtered table to standard output.\n\n"
+ << " [OPTION]... MODEL TEST\n\n"
+ << "Filter for SCFG/STSG rule tables.\n\n"
<< "Options";
// Construct the 'bottom' of the usage message.
std::ostringstream usageBottom;
- usageBottom << "TODO";
+ usageBottom << "\nGiven a rule table on standard input and a set of test sentences, filters out\nthe rules that cannot be applied to any of the test sentences and writes the\nfiltered table to standard output. MODEL specifies the type of syntax model.\nThe following values are supported:\n\n"
+ << " hierarchical, s2t, t2s, t2s-scfg, f2s\n";
// Declare the command line options that are visible to the user.
po::options_description visible(usageTop.str());
@@ -109,6 +180,9 @@ void FilterRuleTable::ProcessOptions(int argc, char *argv[],
// (these are used as positional options).
po::options_description hidden("Hidden options");
hidden.add_options()
+ ("Model",
+ po::value(&options.model),
+ "one of: hierarchical, s2t, t2s, t2s-scfg, f2s")
("TestSetFile",
po::value(&options.testSetFile),
"test set file")
@@ -120,6 +194,7 @@ void FilterRuleTable::ProcessOptions(int argc, char *argv[],
// Register the positional options.
po::positional_options_description p;
+ p.add("Model", 1);
p.add("TestSetFile", 1);
// Process the command-line.
diff --git a/phrase-extract/filter-rule-table/FilterRuleTable.h b/phrase-extract/filter-rule-table/FilterRuleTable.h
index 379f286da..0ece0108a 100644
--- a/phrase-extract/filter-rule-table/FilterRuleTable.h
+++ b/phrase-extract/filter-rule-table/FilterRuleTable.h
@@ -7,6 +7,8 @@
#include "syntax-common/string_tree.h"
+#include "StringForest.h"
+
namespace MosesTraining
{
namespace Syntax
@@ -38,7 +40,18 @@ private:
void ProcessOptions(int, char *[], Options &) const;
- void ReadTokens(const std::string &, std::vector<std::string> &);
+ // Read test set (string version)
+ void ReadTestSet(std::istream &,
+ std::vector<boost::shared_ptr<std::string> > &);
+
+ // Read test set (tree version)
+ void ReadTestSet(std::istream &,
+ std::vector<boost::shared_ptr<StringTree> > &);
+
+ // Read test set (forest version)
+ void ReadTestSet(std::istream &,
+ std::vector<boost::shared_ptr<StringForest> > &);
+
std::string m_name;
};
diff --git a/phrase-extract/filter-rule-table/Forest.h b/phrase-extract/filter-rule-table/Forest.h
new file mode 100644
index 000000000..2207314a6
--- /dev/null
+++ b/phrase-extract/filter-rule-table/Forest.h
@@ -0,0 +1,59 @@
+#pragma once
+
+#include <vector>
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace FilterRuleTable
+{
+
+template<typename T>
+struct Forest {
+ struct Vertex;
+
+ struct Hyperedge {
+ Vertex *head;
+ std::vector<Vertex *> tail;
+ };
+
+ struct Vertex {
+ ~Vertex();
+ T value;
+ std::vector<Hyperedge *> incoming;
+ };
+
+ Forest() {}
+
+ ~Forest();
+
+ std::vector<Vertex *> vertices;
+
+ private:
+ // Copying is not allowed.
+ Forest(const Forest &);
+ Forest &operator=(const Forest &);
+};
+
+template<typename T>
+Forest<T>::~Forest()
+{
+ for (typename std::vector<Vertex *>::iterator p = vertices.begin();
+ p != vertices.end(); ++p) {
+ delete *p;
+ }
+}
+
+template<typename T>
+Forest<T>::Vertex::~Vertex()
+{
+ for (typename std::vector<Hyperedge *>::iterator p = incoming.begin();
+ p != incoming.end(); ++p) {
+ delete *p;
+ }
+}
+
+} // namespace FilterRuleTable
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/filter-rule-table/ForestTsgFilter.cpp b/phrase-extract/filter-rule-table/ForestTsgFilter.cpp
new file mode 100644
index 000000000..cc61020c6
--- /dev/null
+++ b/phrase-extract/filter-rule-table/ForestTsgFilter.cpp
@@ -0,0 +1,196 @@
+#include "ForestTsgFilter.h"
+
+#include <boost/make_shared.hpp>
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace FilterRuleTable
+{
+
+// kMatchLimit is used to limit the effort spent trying to match an individual
+// rule. It defines the maximum number of times that MatchFragment() can be
+// called before the search is aborted and the rule is (possibly wrongly)
+// accepted.
+// FIXME Use a better matching algorithm.
+const std::size_t ForestTsgFilter::kMatchLimit = 10000;
+
+ForestTsgFilter::ForestTsgFilter(
+ const std::vector<boost::shared_ptr<StringForest> > &sentences)
+{
+ // Convert each StringForest to an IdForest.
+ m_sentences.reserve(sentences.size());
+ for (std::vector<boost::shared_ptr<StringForest> >::const_iterator p =
+ sentences.begin(); p != sentences.end(); ++p) {
+ m_sentences.push_back(StringForestToIdForest(**p));
+ }
+
+ // Construct a map from vocabulary Ids to IdForest nodes.
+ m_idToSentence.resize(m_testVocab.Size());
+ for (std::size_t i = 0; i < m_sentences.size(); ++i) {
+ const IdForest &forest = *(m_sentences[i]);
+ for (std::vector<IdForest::Vertex *>::const_iterator
+ p = forest.vertices.begin(); p != forest.vertices.end(); ++p) {
+ m_idToSentence[(*p)->value.id][i].push_back(*p);
+ }
+ }
+}
+
+boost::shared_ptr<ForestTsgFilter::IdForest>
+ForestTsgFilter::StringForestToIdForest(const StringForest &f)
+{
+ typedef StringForest::Vertex StringVertex;
+ typedef StringForest::Hyperedge StringHyperedge;
+ typedef IdForest::Vertex IdVertex;
+ typedef IdForest::Hyperedge IdHyperedge;
+
+ boost::shared_ptr<IdForest> g = boost::make_shared<IdForest>();
+
+ // Map from f's vertices to g's vertices.
+ boost::unordered_map<const StringVertex *, const IdVertex *> vertexMap;
+
+ // Create idForest's vertices and populate vertexMap.
+ for (std::vector<StringVertex *>::const_iterator p = f.vertices.begin();
+ p != f.vertices.end(); ++p) {
+ const StringVertex *v = *p;
+ IdVertex *w = new IdVertex();
+ w->value.id = m_testVocab.Insert(v->value.symbol);
+ w->value.start = v->value.start;
+ w->value.end = v->value.end;
+ g->vertices.push_back(w);
+ vertexMap[v] = w;
+ }
+
+ // Create g's hyperedges.
+ for (std::vector<StringVertex *>::const_iterator p = f.vertices.begin();
+ p != f.vertices.end(); ++p) {
+ for (std::vector<StringHyperedge *>::const_iterator
+ q = (*p)->incoming.begin(); q != (*p)->incoming.end(); ++q) {
+ IdHyperedge *e = new IdHyperedge();
+ e->head = const_cast<IdVertex *>(vertexMap[(*q)->head]);
+ e->tail.reserve((*q)->tail.size());
+ for (std::vector<StringVertex*>::const_iterator
+ r = (*q)->tail.begin(); r != (*q)->tail.end(); ++r) {
+ e->tail.push_back(const_cast<IdVertex *>(vertexMap[*r]));
+ }
+ e->head->incoming.push_back(e);
+ }
+ }
+
+ return g;
+}
+
+bool ForestTsgFilter::MatchFragment(const IdTree &fragment,
+ const std::vector<IdTree *> &leaves)
+{
+ typedef std::vector<const IdTree *> TreeVec;
+
+ // Reset the match counter.
+ m_matchCount = 0;
+
+ // Determine which of the fragment's leaves occurs in the smallest number of
+ // sentences in the test set. If the fragment contains a rare word
+ // (which is pretty likely assuming a Zipfian distribution) then we only
+ // have to try matching the fragment against a small number of potential
+ // match sites.
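+ // For example (illustrative): if the fragment's three leaves occur in
+ // 500, 40, and 3 test forests respectively, only the 3 forests that
+ // contain the rarest leaf need to be considered as match sites.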
+ const IdTree *rarestLeaf = leaves[0];
+ std::size_t lowestCount = m_idToSentence[rarestLeaf->value()].size();
+ for (std::size_t i = 1; i < leaves.size(); ++i) {
+ const IdTree *leaf = leaves[i];
+ std::size_t count = m_idToSentence[leaf->value()].size();
+ if (count < lowestCount) {
+ lowestCount = count;
+ rarestLeaf = leaf;
+ }
+ }
+
+ // Try to match the rule fragment against the sentences where the rarest
+ // leaf was found.
+ const InnerMap &leafSentenceMap = m_idToSentence[rarestLeaf->value()];
+ const InnerMap &rootSentenceMap = m_idToSentence[fragment.value()];
+
+ std::vector<std::pair<std::size_t, std::size_t> > spans;
+ // For each forest i that contains the rarest leaf symbol...
+ for (InnerMap::const_iterator p = leafSentenceMap.begin();
+ p != leafSentenceMap.end(); ++p) {
+ std::size_t i = p->first;
+ // Get the set of candidate match sites in forest i (these are vertices
+ // with the same label as the root of the rule fragment).
+ InnerMap::const_iterator q = rootSentenceMap.find(i);
+ if (q == rootSentenceMap.end()) {
+ continue;
+ }
+ const std::vector<const IdForest::Vertex*> &candidates = q->second;
+ // Record the span(s) of the rare leaf symbol in forest i.
+ spans.clear();
+ for (std::vector<const IdForest::Vertex*>::const_iterator
+ r = p->second.begin(); r != p->second.end(); ++r) {
+ spans.push_back(std::make_pair((*r)->value.start, (*r)->value.end));
+ }
+ // For each candidate match site in forest i...
+ for (std::vector<const IdForest::Vertex*>::const_iterator
+ r = candidates.begin(); r != candidates.end(); ++r) {
+ const IdForest::Vertex &v = **r;
+ // Check that the subtrees rooted at v are at least as wide as the
+ // fragment (counting each non-terminal as being one token wide).
+ if (v.value.end - v.value.start + 1 < leaves.size()) {
+ continue;
+ }
+ // Check that the candidate's span covers one of the rare leaf symbols.
+ bool covered = false;
+ for (std::vector<std::pair<std::size_t, std::size_t> >::const_iterator
+ s = spans.begin(); s != spans.end(); ++s) {
+ if (v.value.start <= s->first && v.value.end >= s->second) {
+ covered = true;
+ break;
+ }
+ }
+ if (!covered) {
+ continue;
+ }
+ // Attempt to match the fragment at the candidate site.
+ if (MatchFragment(fragment, v)) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+bool ForestTsgFilter::MatchFragment(const IdTree &fragment,
+ const IdForest::Vertex &v)
+{
+ if (++m_matchCount >= kMatchLimit) {
+ return true;
+ }
+ if (fragment.value() != v.value.id) {
+ return false;
+ }
+ const std::vector<IdTree*> &children = fragment.children();
+ if (children.empty()) {
+ return true;
+ }
+ for (std::vector<IdForest::Hyperedge *>::const_iterator
+ p = v.incoming.begin(); p != v.incoming.end(); ++p) {
+ const std::vector<IdForest::Vertex*> &tail = (*p)->tail;
+ if (children.size() != tail.size()) {
+ continue;
+ }
+ bool match = true;
+ for (std::size_t i = 0; i < children.size(); ++i) {
+ if (!MatchFragment(*children[i], *tail[i])) {
+ match = false;
+ break;
+ }
+ }
+ if (match) {
+ return true;
+ }
+ }
+ return false;
+}
+
+} // namespace FilterRuleTable
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/filter-rule-table/ForestTsgFilter.h b/phrase-extract/filter-rule-table/ForestTsgFilter.h
new file mode 100644
index 000000000..42b872a8d
--- /dev/null
+++ b/phrase-extract/filter-rule-table/ForestTsgFilter.h
@@ -0,0 +1,70 @@
+#pragma once
+
+#include <istream>
+#include <ostream>
+#include <string>
+#include <vector>
+
+#include <boost/shared_ptr.hpp>
+#include <boost/unordered_map.hpp>
+#include <boost/unordered_set.hpp>
+
+#include "syntax-common/numbered_set.h"
+#include "syntax-common/string_tree.h"
+#include "syntax-common/tree.h"
+#include "syntax-common/tree_fragment_tokenizer.h"
+
+#include "Forest.h"
+#include "StringForest.h"
+#include "TsgFilter.h"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace FilterRuleTable
+{
+
+// Filters a rule table, discarding rules that cannot be applied to a given
+// test set. The rule table must have a TSG source-side and the test sentences
+// must be parse forests.
+class ForestTsgFilter : public TsgFilter {
+ public:
+ // Initialize the filter for a given set of test forests.
+ ForestTsgFilter(const std::vector<boost::shared_ptr<StringForest> > &);
+
+ private:
+ struct IdForestValue {
+ Vocabulary::IdType id;
+ std::size_t start;
+ std::size_t end;
+ };
+
+ static const std::size_t kMatchLimit;
+
+ // Represents a forest using integer vocabulary values.
+ typedef Forest<IdForestValue> IdForest;
+
+ typedef boost::unordered_map<std::size_t,
+ std::vector<const IdForest::Vertex*> > InnerMap;
+
+ typedef std::vector<InnerMap> IdToSentenceMap;
+
+ // Forest-specific implementation of virtual function.
+ bool MatchFragment(const IdTree &, const std::vector<IdTree *> &);
+
+ // Try to match a fragment against a specific vertex of a test forest.
+ bool MatchFragment(const IdTree &, const IdForest::Vertex &);
+
+ // Convert a StringForest to an IdForest (wrt m_testVocab). Inserts symbols
+ // into m_testVocab.
+ boost::shared_ptr<IdForest> StringForestToIdForest(const StringForest &);
+
+ std::vector<boost::shared_ptr<IdForest> > m_sentences;
+ IdToSentenceMap m_idToSentence;
+ std::size_t m_matchCount;
+};
+
+} // namespace FilterRuleTable
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/filter-rule-table/Options.h b/phrase-extract/filter-rule-table/Options.h
index 0c86c1411..c3871075a 100644
--- a/phrase-extract/filter-rule-table/Options.h
+++ b/phrase-extract/filter-rule-table/Options.h
@@ -14,6 +14,7 @@ public:
Options() {}
// Positional options
+ std::string model;
std::string testSetFile;
};
diff --git a/phrase-extract/filter-rule-table/StringBasedFilter.cpp b/phrase-extract/filter-rule-table/StringBasedFilter.cpp
deleted file mode 100644
index 6e67cee17..000000000
--- a/phrase-extract/filter-rule-table/StringBasedFilter.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-#include "StringBasedFilter.h"
-
-namespace MosesTraining
-{
-namespace Syntax
-{
-namespace FilterRuleTable
-{
-
-StringBasedFilter::StringBasedFilter(
- const std::vector<std::vector<std::string> > &sentences)
-{
-}
-
-void StringBasedFilter::Filter(std::istream &in, std::ostream &out)
-{
- std::string line;
- int lineNum = 0;
- while (std::getline(in, line)) {
- ++lineNum;
- out << line << std::endl;
- }
-}
-
-} // namespace FilterRuleTable
-} // namespace Syntax
-} // namespace MosesTraining
diff --git a/phrase-extract/filter-rule-table/StringBasedFilter.h b/phrase-extract/filter-rule-table/StringBasedFilter.h
deleted file mode 100644
index 31444e586..000000000
--- a/phrase-extract/filter-rule-table/StringBasedFilter.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#pragma once
-
-#include <istream>
-#include <ostream>
-#include <string>
-#include <vector>
-
-namespace MosesTraining
-{
-namespace Syntax
-{
-namespace FilterRuleTable
-{
-
-class StringBasedFilter
-{
-public:
- StringBasedFilter(const std::vector<std::vector<std::string> > &);
-
- void Filter(std::istream &, std::ostream &);
-};
-
-} // namespace FilterRuleTable
-} // namespace Syntax
-} // namespace MosesTraining
diff --git a/phrase-extract/filter-rule-table/StringCfgFilter.cpp b/phrase-extract/filter-rule-table/StringCfgFilter.cpp
new file mode 100644
index 000000000..b1e9ab4ee
--- /dev/null
+++ b/phrase-extract/filter-rule-table/StringCfgFilter.cpp
@@ -0,0 +1,323 @@
+#include "StringCfgFilter.h"
+
+#include <algorithm>
+
+#include "util/string_piece_hash.hh"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace FilterRuleTable
+{
+
+const std::size_t StringCfgFilter::kMaxNGramLength = 5;
+
+StringCfgFilter::StringCfgFilter(
+ const std::vector<boost::shared_ptr<std::string> > &sentences)
+ : m_maxSentenceLength(-1)
+{
+ // Populate m_ngramCoordinateMap (except for the CoordinateTable's
+ // sentence vectors) and record the sentence lengths.
+ m_sentenceLengths.reserve(sentences.size());
+ const util::AnyCharacter delimiter(" \t");
+ std::vector<Vocabulary::IdType> vocabIds;
+ for (std::size_t i = 0; i < sentences.size(); ++i) {
+ vocabIds.clear();
+ for (util::TokenIter<util::AnyCharacter, true> p(*sentences[i], delimiter);
+ p; ++p) {
+ std::string tmp;
+ p->CopyToString(&tmp);
+ vocabIds.push_back(m_testVocab.Insert(tmp));
+ }
+ AddSentenceNGrams(vocabIds, i);
+ const int sentenceLength = static_cast<int>(vocabIds.size());
+ m_sentenceLengths.push_back(sentenceLength);
+ m_maxSentenceLength = std::max(sentenceLength, m_maxSentenceLength);
+ }
+
+ // Populate the CoordinateTable's sentence vectors.
+ for (NGramCoordinateMap::iterator p = m_ngramCoordinateMap.begin();
+ p != m_ngramCoordinateMap.end(); ++p) {
+ CoordinateTable &ct = p->second;
+ ct.sentences.reserve(ct.intraSentencePositions.size());
+ for (boost::unordered_map<int, PositionSeq>::const_iterator
+ q = ct.intraSentencePositions.begin();
+ q != ct.intraSentencePositions.end(); ++q) {
+ ct.sentences.push_back(q->first);
+ }
+ std::sort(ct.sentences.begin(), ct.sentences.end());
+ }
+}
+
+void StringCfgFilter::Filter(std::istream &in, std::ostream &out)
+{
+ const util::MultiCharacter fieldDelimiter("|||");
+ const util::AnyCharacter symbolDelimiter(" \t");
+
+ std::string line;
+ std::string prevLine;
+ StringPiece source;
+ std::vector<StringPiece> symbols;
+ Pattern pattern;
+ bool keep = true;
+ int lineNum = 0;
+
+ while (std::getline(in, line)) {
+ ++lineNum;
+
+ // Read the source-side of the rule.
+ util::TokenIter<util::MultiCharacter> it(line, fieldDelimiter);
+
+ // Check if this rule has the same source-side as the previous rule. If
+ // it does then we already know whether or not to keep the rule. This
+ // optimisation is based on the assumption that the rule table is sorted
+ // (which is the case in the standard Moses training pipeline).
+ if (*it == source) {
+ if (keep) {
+ out << line << std::endl;
+ }
+ continue;
+ }
+
+ // The source-side is different from the previous rule's.
+ source = *it;
+
+ // Tokenize the source-side.
+ symbols.clear();
+ for (util::TokenIter<util::AnyCharacter, true> p(source, symbolDelimiter);
+ p; ++p) {
+ symbols.push_back(*p);
+ }
+
+ // Generate a pattern (fails if any source-side terminal is not in the
+ // test set vocabulary) and attempt to match it against the test sentences.
+ keep = GeneratePattern(symbols, pattern) && MatchPattern(pattern);
+ if (keep) {
+ out << line << std::endl;
+ }
+
+ // Retain the line for the next iteration so that the source StringPiece
+ // stays valid.
+ prevLine.swap(line);
+ }
+}
+
+void StringCfgFilter::AddSentenceNGrams(
+ const std::vector<Vocabulary::IdType> &s, std::size_t sentNum)
+{
+ const std::size_t len = s.size();
+
+ NGram ngram;
+ // For each starting position in the sentence:
+ for (std::size_t i = 0; i < len; ++i) {
+ // For each n-gram length: 1, 2, 3, ... kMaxNGramLength (or less when
+ // approaching the end of the sentence):
+ for (std::size_t n = 1; n <= std::min(kMaxNGramLength, len-i); ++n) {
+ ngram.clear();
+ for (std::size_t j = 0; j < n; ++j) {
+ ngram.push_back(s[i+j]);
+ }
+ m_ngramCoordinateMap[ngram].intraSentencePositions[sentNum].push_back(i);
+ }
+ }
+}
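+
+// Illustrative example: for the 3-word sentence "a b c" this records the
+// n-grams a, a b, a b c, b, b c, and c, each mapped to its sentence number
+// and start position within the sentence.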
+
+bool StringCfgFilter::GeneratePattern(const std::vector<StringPiece> &symbols,
+ Pattern &pattern) const
+{
+ pattern.subpatterns.clear();
+ pattern.minGapWidths.clear();
+
+ int gapWidth = 0;
+
+ // The first symbol is handled as a special case because there is always a
+ // leading gap / non-gap.
+ if (IsNonTerminal(symbols[0])) {
+ ++gapWidth;
+ } else {
+ pattern.minGapWidths.push_back(0);
+ // Add the symbol to the first n-gram.
+ Vocabulary::IdType vocabId =
+ m_testVocab.Lookup(symbols[0], StringPieceCompatibleHash(),
+ StringPieceCompatibleEquals());
+ if (vocabId == Vocabulary::NullId()) {
+ return false;
+ }
+ pattern.subpatterns.push_back(NGram(1, vocabId));
+ }
+
+ // Process the remaining symbols (except the last which is the RHS).
+ for (std::size_t i = 1; i < symbols.size()-1; ++i) {
+ // Is current symbol a non-terminal?
+ if (IsNonTerminal(symbols[i])) {
+ ++gapWidth;
+ continue;
+ }
+ // Does the current terminal follow a non-terminal?
+ if (gapWidth > 0) {
+ pattern.minGapWidths.push_back(gapWidth);
+ gapWidth = 0;
+ pattern.subpatterns.resize(pattern.subpatterns.size()+1);
+ // Is the current n-gram full?
+ } else if (pattern.subpatterns.back().size() == kMaxNGramLength) {
+ pattern.minGapWidths.push_back(0);
+ pattern.subpatterns.resize(pattern.subpatterns.size()+1);
+ }
+ // Add the symbol to the current n-gram.
+ Vocabulary::IdType vocabId =
+ m_testVocab.Lookup(symbols[i], StringPieceCompatibleHash(),
+ StringPieceCompatibleEquals());
+ if (vocabId == Vocabulary::NullId()) {
+ return false;
+ }
+ pattern.subpatterns.back().push_back(vocabId);
+ }
+
+ // Add the final gap width value (0 if the last symbol was a terminal).
+ pattern.minGapWidths.push_back(gapWidth);
+ return true;
+}
+
+bool StringCfgFilter::IsNonTerminal(const StringPiece &symbol) const
+{
+ return symbol.size() >= 3 && symbol[0] == '[' &&
+ symbol[symbol.size()-1] == ']';
+}
+
+bool StringCfgFilter::MatchPattern(const Pattern &pattern) const
+{
+ // Step 0: If the pattern is just a single gap (i.e. the original rule
+ // was fully non-lexical) then the pattern matches unless the
+ // minimum gap width is wider than any sentence.
+ if (pattern.subpatterns.empty()) {
+ assert(pattern.minGapWidths.size() == 1);
+ return pattern.minGapWidths[0] <= m_maxSentenceLength;
+ }
+
+ // Step 1: Look up all of the subpatterns in m_ngramCoordinateMap and record
+ // pointers to their CoordinateTables.
+ std::vector<const CoordinateTable *> tables;
+ for (std::vector<NGram>::const_iterator p = pattern.subpatterns.begin();
+ p != pattern.subpatterns.end(); ++p) {
+ NGramCoordinateMap::const_iterator q = m_ngramCoordinateMap.find(*p);
+ // If a subpattern doesn't appear in m_ngramCoordinateMap then the match
+ // has already failed.
+ if (q == m_ngramCoordinateMap.end()) {
+ return false;
+ }
+ tables.push_back(&(q->second));
+ }
+
+ // Step 2: Intersect the CoordinateTables' sentence sets to find the set of
+ // test set sentences in which all subpatterns occur.
+ std::vector<int> intersection = tables[0]->sentences;
+ std::vector<int> tmp(intersection.size());
+ for (std::size_t i = 1; i < tables.size(); ++i) {
+ std::vector<int>::iterator p = std::set_intersection(
+ intersection.begin(), intersection.end(), tables[i]->sentences.begin(),
+ tables[i]->sentences.end(), tmp.begin());
+ tmp.resize(p-tmp.begin());
+ if (tmp.empty()) {
+ return false;
+ }
+ intersection.swap(tmp);
+ }
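+
+ // For example (illustrative): if one subpattern occurs in sentences
+ // {1, 4, 7} and another only in {4, 9}, the intersection leaves just
+ // sentence 4 to be checked in step 3.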
+
+ // Step 3: For each sentence in the intersection, try to find a consistent
+ // sequence of intra-sentence positions (one for each subpattern).
+ // 'Consistent' here means that the subpatterns occur in the right
+ // order and are separated by at least the minimum widths required
+ // by the pattern's gaps).
+ for (std::vector<int>::const_iterator p = intersection.begin();
+ p != intersection.end(); ++p) {
+ if (MatchPattern(pattern, tables, *p)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+bool StringCfgFilter::MatchPattern(
+ const Pattern &pattern,
+ std::vector<const CoordinateTable *> &tables,
+ int sentenceId) const
+{
+ const int sentenceLength = m_sentenceLengths[sentenceId];
+
+ // In the for loop below, we need to know the set of start position ranges
+ // where subpattern i is allowed to occur (rangeSet) and we are generating
+ // the ranges for subpattern i+1 (nextRangeSet).
+ // TODO Merge ranges if subpattern i follows a non-zero gap.
+ std::vector<Range> rangeSet;
+ std::vector<Range> nextRangeSet;
+
+ // Calculate the range for the first subpattern.
+ int minStart = pattern.minGapWidths[0];
+ int maxStart = sentenceLength - MinWidth(pattern, 0);
+ rangeSet.push_back(Range(minStart, maxStart));
+
+ // Attempt to match subpatterns.
+ for (int i = 0; i < pattern.subpatterns.size(); ++i) {
+ // Look-up the intra-sentence position sequence.
+ boost::unordered_map<int, PositionSeq>::const_iterator r =
+ tables[i]->intraSentencePositions.find(sentenceId);
+ assert(r != tables[i]->intraSentencePositions.end());
+ const PositionSeq &col = r->second;
+ for (PositionSeq::const_iterator p = col.begin(); p != col.end(); ++p) {
+ bool inRange = false;
+ for (std::vector<Range>::const_iterator q = rangeSet.begin();
+ q != rangeSet.end(); ++q) {
+ // TODO Use the fact that the ranges are ordered to break early.
+ if (*p >= q->first && *p <= q->second) {
+ inRange = true;
+ break;
+ }
+ }
+ if (!inRange) {
+ continue;
+ }
+ // If this is the last subpattern then we're done.
+ if (i+1 == pattern.subpatterns.size()) {
+ return true;
+ }
+ nextRangeSet.push_back(CalcNextRange(pattern, i, *p, sentenceLength));
+ }
+ if (nextRangeSet.empty()) {
+ return false;
+ }
+ rangeSet.swap(nextRangeSet);
+ nextRangeSet.clear();
+ }
+ return true;
+}
+
+StringCfgFilter::Range StringCfgFilter::CalcNextRange(
+ const Pattern &pattern, int i, int x, int sentenceLength) const
+{
+ assert(i+1 < pattern.subpatterns.size());
+ Range range;
+ if (pattern.minGapWidths[i+1] == 0) {
+ // The next subpattern follows this one without a gap.
+ range.first = range.second = x + pattern.subpatterns[i].size();
+ } else {
+ range.first = x + pattern.subpatterns[i].size() + pattern.minGapWidths[i+1];
+ // TODO MinWidth should only be computed once per subpattern.
+ range.second = sentenceLength - MinWidth(pattern, i+1);
+ }
+ return range;
+}
+
+int StringCfgFilter::MinWidth(const Pattern &pattern, int i) const
+{
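+ // The minimum width of the pattern suffix is the total length of the
+ // remaining subpatterns plus the minimum widths of the gaps that follow
+ // each of them: the sum over j >= i of (subpatterns[j].size() +
+ // minGapWidths[j+1]).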
+ int minWidth = 0;
+ for (; i < pattern.subpatterns.size(); ++i) {
+ minWidth += pattern.subpatterns[i].size();
+ minWidth += pattern.minGapWidths[i+1];
+ }
+ return minWidth;
+}
+
+} // namespace FilterRuleTable
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/filter-rule-table/StringCfgFilter.h b/phrase-extract/filter-rule-table/StringCfgFilter.h
new file mode 100644
index 000000000..cadd7127a
--- /dev/null
+++ b/phrase-extract/filter-rule-table/StringCfgFilter.h
@@ -0,0 +1,143 @@
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "syntax-common/numbered_set.h"
+
+#include <boost/shared_ptr.hpp>
+#include <boost/unordered_map.hpp>
+
+#include "util/string_piece.hh"
+#include "util/tokenize_piece.hh"
+
+#include "CfgFilter.h"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace FilterRuleTable
+{
+
+// Filters a rule table, discarding rules that cannot be applied to a given
+// test set. The rule table must have a CFG source-side and the test sentences
+// must be strings.
+class StringCfgFilter : public CfgFilter {
+ public:
+ // Initialize the filter for a given set of test sentences.
+ StringCfgFilter(const std::vector<boost::shared_ptr<std::string> > &);
+
+ void Filter(std::istream &in, std::ostream &out);
+
+ private:
+ // Filtering works by converting the source LHSs of translation rules to
+ // patterns containing variable-length gaps and then pattern matching
+ // against the test set.
+ //
+ // The algorithm is vaguely similar to Algorithm 1 from Rahman et al. (2006),
+ // but with a slightly different definition of a pattern and designed for a
+ // text containing sentence boundaries. Here the text is assumed to be
+ // short (a few thousand sentences) and the number of patterns is assumed to
+ // be large (tens of millions of rules).
+ //
+ // M. Sohel Rahman, Costas S. Iliopoulos, Inbok Lee, Manal Mohamed, and
+ // William F. Smyth
+ // "Finding Patterns with Variable Length Gaps or Don't Cares"
+ // In proceedings of COCOON, 2006
+
+ // Max NGram length.
+ static const std::size_t kMaxNGramLength;
+
+ // Maps words from strings to integers.
+ typedef NumberedSet<std::string, std::size_t> Vocabulary;
+
+ // An NGram is a sequence of words.
+ typedef std::vector<Vocabulary::IdType> NGram;
+
+ // A pattern is an alternating sequence of gaps and NGram subpatterns,
+ // starting and ending with a gap. Every gap has a minimum width, which
+ // can be any integer >= 0 (a gap of width 0 is really a non-gap).
+ //
+ // The source LHSs of translation rules are converted to patterns where each
+ // sequence of m consecutive non-terminals is converted to a gap with minimum
+ // width m. For example, if a rule has the source LHS:
+ //
+ // [NP] and all the king 's men could n't [VB] [NP] together again
+ //
+ // and kMaxNGramLength is set to 5 then the following pattern is used:
+ //
+ // * <and all the king 's> * <men could n't> * <together again> *
+ //
+ // where the gaps have minimum widths of 1, 0, 2, and 0.
+ //
+ struct Pattern
+ {
+ std::vector<NGram> subpatterns;
+ std::vector<int> minGapWidths;
+ };
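+ // (Implied by the definition above: minGapWidths.size() is always
+ // subpatterns.size() + 1, because the sequence starts and ends with a
+ // gap.)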
+
+ // A sorted (ascending) sequence of start positions.
+ typedef std::vector<int> PositionSeq;
+
+ // A range of start positions.
+ typedef std::pair<int, int> Range;
+
+ // A CoordinateTable records the set of sentences in which a single
+ // n-gram occurs and, for each of those sentences, the start positions
+ // at which it occurs.
+ struct CoordinateTable {
+ // Sentence IDs (ascending). This contains the same values as the key set
+ // from intraSentencePositions but sorted into ascending order.
+ std::vector<int> sentences;
+ // Map from sentence ID to set of intra-sentence start positions.
+ boost::unordered_map<int, PositionSeq> intraSentencePositions;
+ };
+
+ // NGramCoordinateMap is the main search structure. It maps an NGram to
+ // a CoordinateTable recording the positions at which the n-gram occurs
+ // in the test set.
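+ //
+ // For illustration (hypothetical data): if the bigram <the cat> occurs in
+ // sentence 3 at start positions 0 and 7, and in sentence 9 at start
+ // position 2, its CoordinateTable holds sentences = {3, 9} and
+ // intraSentencePositions = {3 -> {0, 7}, 9 -> {2}}.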
+ typedef boost::unordered_map<NGram, CoordinateTable> NGramCoordinateMap;
+
+ // Add all n-grams and coordinates for a single sentence s with index i.
+ void AddSentenceNGrams(const std::vector<Vocabulary::IdType> &s,
+ std::size_t i);
+
+ // Calculate the range of possible start positions for subpattern i+1
+ // assuming that subpattern i has position x.
+ Range CalcNextRange(const Pattern &p, int i, int x, int sentenceLength) const;
+
+ // Generate the pattern corresponding to the given source-side of a rule.
+ // This will fail if the rule's source-side contains any terminals that
+ // do not occur in the test sentence vocabulary.
+ bool GeneratePattern(const std::vector<StringPiece> &, Pattern &) const;
+
+ // Calculate the minimum width of the pattern suffix starting
+ // at subpattern i.
+ int MinWidth(const Pattern &p, int i) const;
+
+ bool IsNonTerminal(const StringPiece &symbol) const;
+
+ // Try to match the pattern p against any sentence in the test set.
+ bool MatchPattern(const Pattern &p) const;
+
+ // Try to match the pattern p against the sentence with the given ID.
+ bool MatchPattern(const Pattern &p,
+ std::vector<const CoordinateTable *> &tables,
+ int id) const;
+
+ // The main search structure constructed from the test set sentences.
+ NGramCoordinateMap m_ngramCoordinateMap;
+
+ // The lengths of the test sentences.
+ std::vector<int> m_sentenceLengths;
+
+ // The maximum length of any test sentence.
+ int m_maxSentenceLength;
+
+ // The symbol vocabulary of the test sentences.
+ Vocabulary m_testVocab;
+};
+
+} // namespace FilterRuleTable
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/filter-rule-table/StringForest.h b/phrase-extract/filter-rule-table/StringForest.h
new file mode 100644
index 000000000..74318f191
--- /dev/null
+++ b/phrase-extract/filter-rule-table/StringForest.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#include <string>
+
+#include "Forest.h"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace FilterRuleTable
+{
+
+struct StringForestValue {
+ std::string symbol; // terminal or non-terminal (without square brackets)
+ std::size_t start;
+ std::size_t end;
+};
+
+typedef Forest<StringForestValue> StringForest;
+
+} // namespace FilterRuleTable
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/filter-rule-table/StringForestParser.cpp b/phrase-extract/filter-rule-table/StringForestParser.cpp
new file mode 100644
index 000000000..118b46475
--- /dev/null
+++ b/phrase-extract/filter-rule-table/StringForestParser.cpp
@@ -0,0 +1,146 @@
+#include "StringForestParser.h"
+
+#include <istream>
+#include <string>
+
+#include <boost/make_shared.hpp>
+
+#include "util/tokenize_piece.hh"
+
+#include "syntax-common/exception.h"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace FilterRuleTable
+{
+
+StringForestParser::StringForestParser()
+ : m_input(0) {
+}
+
+StringForestParser::StringForestParser(std::istream &input)
+ : m_input(&input) {
+ ++(*this);
+}
+
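+// Each forest entry is expected to look like the following (layout inferred
+// from the parsing code in this file; the vertex labels, spans, and the
+// weight after "|||" are illustrative):
+//
+//   sentence 1
+//   the dog barks
+//   S[0,2] => NP[0,0] VP[1,2] ||| -0.5
+//
+// Hyperedge lines continue until a terminating blank line; an empty
+// hyperedge section indicates a parse failure.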
+StringForestParser &StringForestParser::operator++() {
+ if (!m_input) {
+ return *this;
+ }
+ m_vertexSet.clear();
+ m_entry.forest.reset(new StringForest());
+ if (!std::getline(*m_input, m_tmpLine)) {
+ m_input = 0;
+ return *this;
+ }
+ // The first line contains the sentence number.
+ ParseSentenceNumLine(m_tmpLine, m_entry.sentNum);
+ // The second line contains the sentence string.
+ std::getline(*m_input, m_entry.sentence);
+ // Subsequent lines contain hyperedges -- or a blank line if there was a
+ // parse failure -- terminated by a blank line.
+ std::getline(*m_input, m_tmpLine);
+ if (m_tmpLine == "") { // Parse failure
+ std::getline(*m_input, m_tmpLine);
+ assert(m_tmpLine == "");
+ return *this;
+ }
+ while (m_tmpLine != "") {
+ ParseHyperedgeLine(m_tmpLine, *m_entry.forest);
+ std::getline(*m_input, m_tmpLine);
+ }
+ return *this;
+}
+
+StringForest::Vertex *StringForestParser::AddOrDeleteVertex(
+ StringForest::Vertex *v)
+{
+ std::pair<VertexSet::iterator, bool> ret = m_vertexSet.insert(v);
+ if (ret.second) {
+ m_entry.forest->vertices.push_back(*ret.first);
+ } else {
+ delete v;
+ }
+ return *ret.first;
+}
+
+void StringForestParser::ParseSentenceNumLine(const std::string &line,
+ std::size_t &sentNum)
+{
+ const util::AnyCharacter delimiter(" \t");
+ util::TokenIter<util::AnyCharacter, true> p(line, delimiter);
+ if (*p != "sentence") {
+ // FIXME
+ throw Exception("");
+ }
+ ++p;
+ std::string tmp;
+ p->CopyToString(&tmp);
+ sentNum = std::atoi(tmp.c_str());
+}
+
+void StringForestParser::ParseHyperedgeLine(const std::string &line,
+ StringForest &forest)
+{
+ const util::AnyCharacter delimiter(" \t");
+ util::TokenIter<util::AnyCharacter, true> p(line, delimiter);
+ StringForest::Vertex *v = AddOrDeleteVertex(ParseVertex(*p));
+ StringForest::Hyperedge *e = new StringForest::Hyperedge();
+ e->head = v;
+ ++p;
+ if (*p != "=>") {
+ // FIXME
+ throw Exception("");
+ }
+ for (++p; *p != "|||"; ++p) {
+ v = ParseVertex(*p);
+ if (v->value.start == -1) {
+ // Egret does not give start/end for terminals.
+ v->value.start = v->value.end = e->head->value.start;
+ }
+ e->tail.push_back(AddOrDeleteVertex(v));
+ }
+ // Weight is ignored
+ e->head->incoming.push_back(e);
+}
+
+StringForest::Vertex *StringForestParser::ParseVertex(const StringPiece &s)
+{
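+ // A vertex token is either a bare terminal (e.g. "dog", which carries
+ // no span) or a label with a span, e.g. "NP[0,3]", optionally carrying
+ // a "^g" suffix on the label ("NP^g[0,3]"). These forms are inferred
+ // from the parsing logic below.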
+ StringForest::Vertex *v = new StringForest::Vertex();
+ std::size_t pos = s.rfind('[');
+ if (pos == std::string::npos) {
+ s.CopyToString(&v->value.symbol);
+ //v.value.symbol.isNonTerminal = false;
+ v->value.start = v->value.end = -1;
+ return v;
+ }
+ if (pos > 2 && s[pos-2] == '^' && s[pos-1] == 'g') {
+ s.substr(0, pos-2).CopyToString(&v->value.symbol);
+ } else {
+ s.substr(0, pos).CopyToString(&v->value.symbol);
+ }
+ //v.symbol.isNonTerminal = true;
+ std::size_t begin = pos + 1;
+ pos = s.find(',', begin+1);
+ std::string tmp;
+ s.substr(begin, pos-begin).CopyToString(&tmp);
+ v->value.start = std::atoi(tmp.c_str());
+ s.substr(pos+1, s.size()-pos-2).CopyToString(&tmp);
+ v->value.end = std::atoi(tmp.c_str());
+ return v;
+}
+
+bool operator==(const StringForestParser &lhs, const StringForestParser &rhs) {
+ // TODO Is this right? Compare values of istreams if non-zero?
+ return lhs.m_input == rhs.m_input;
+}
+
+bool operator!=(const StringForestParser &lhs, const StringForestParser &rhs) {
+ return !(lhs == rhs);
+}
+
+} // namespace FilterRuleTable
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/filter-rule-table/StringForestParser.h b/phrase-extract/filter-rule-table/StringForestParser.h
new file mode 100644
index 000000000..259e7fab9
--- /dev/null
+++ b/phrase-extract/filter-rule-table/StringForestParser.h
@@ -0,0 +1,83 @@
+#pragma once
+
+#include <istream>
+#include <string>
+#include <vector>
+#include <utility>
+
+#include <boost/shared_ptr.hpp>
+#include <boost/unordered_set.hpp>
+
+#include "util/string_piece.hh"
+
+#include "StringForest.h"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace FilterRuleTable
+{
+
+class StringForestParser {
+ public:
+ struct Entry {
+ std::size_t sentNum;
+ std::string sentence;
+ boost::shared_ptr<StringForest> forest;
+ };
+
+ StringForestParser();
+ StringForestParser(std::istream &);
+
+ Entry &operator*() { return m_entry; }
+ Entry *operator->() { return &m_entry; }
+
+ StringForestParser &operator++();
+
+ friend bool operator==(const StringForestParser &,
+ const StringForestParser &);
+ friend bool operator!=(const StringForestParser &,
+ const StringForestParser &);
+
+ private:
+ struct VertexSetHash {
+ std::size_t operator()(const StringForest::Vertex *v) const {
+ std::size_t seed = 0;
+ boost::hash_combine(seed, v->value.symbol);
+ boost::hash_combine(seed, v->value.start);
+ boost::hash_combine(seed, v->value.end);
+ return seed;
+ }
+ };
+
+ struct VertexSetPred {
+ bool operator()(const StringForest::Vertex *v,
+ const StringForest::Vertex *w) const {
+ return v->value.symbol == w->value.symbol &&
+ v->value.start == w->value.start &&
+ v->value.end == w->value.end;
+ }
+ };
+
+ typedef boost::unordered_set<StringForest::Vertex *, VertexSetHash,
+ VertexSetPred> VertexSet;
+
+ // Copying is not allowed
+ StringForestParser(const StringForestParser &);
+ StringForestParser &operator=(const StringForestParser &);
+
+ StringForest::Vertex *AddOrDeleteVertex(StringForest::Vertex *);
+ void ParseHyperedgeLine(const std::string &, StringForest &);
+ void ParseSentenceNumLine(const std::string &, std::size_t &);
+ StringForest::Vertex *ParseVertex(const StringPiece &);
+
+ Entry m_entry;
+ std::istream *m_input;
+ std::string m_tmpLine;
+ VertexSet m_vertexSet;
+};
+
+} // namespace FilterRuleTable
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/filter-rule-table/TreeBasedFilter.cpp b/phrase-extract/filter-rule-table/TreeBasedFilter.cpp
deleted file mode 100644
index fee03641a..000000000
--- a/phrase-extract/filter-rule-table/TreeBasedFilter.cpp
+++ /dev/null
@@ -1,243 +0,0 @@
-#include "TreeBasedFilter.h"
-
-#include "boost/scoped_ptr.hpp"
-
-#include "util/string_piece.hh"
-#include "util/string_piece_hash.hh"
-#include "util/tokenize_piece.hh"
-
-namespace MosesTraining
-{
-namespace Syntax
-{
-namespace FilterRuleTable
-{
-
-TreeBasedFilter::TreeBasedFilter(
- const std::vector<boost::shared_ptr<StringTree> > &sentences)
-{
- // Convert each StringTree to an IdTree.
- m_sentences.reserve(sentences.size());
- for (std::vector<boost::shared_ptr<StringTree> >::const_iterator p =
- sentences.begin(); p != sentences.end(); ++p) {
- m_sentences.push_back(boost::shared_ptr<IdTree>(StringTreeToIdTree(**p)));
- }
-
- m_labelToTree.resize(m_testVocab.Size());
- // Construct a map from root labels to IdTree nodes.
- for (std::vector<boost::shared_ptr<IdTree> >::const_iterator p =
- m_sentences.begin(); p != m_sentences.end(); ++p) {
- AddNodesToMap(**p);
- }
-}
-
-TreeBasedFilter::IdTree *TreeBasedFilter::StringTreeToIdTree(
- const StringTree &s)
-{
- IdTree *t = new IdTree(m_testVocab.Insert(s.value()));
- const std::vector<StringTree*> &sChildren = s.children();
- std::vector<IdTree*> &tChildren = t->children();
- tChildren.reserve(sChildren.size());
- for (std::vector<StringTree*>::const_iterator p = sChildren.begin();
- p != sChildren.end(); ++p) {
- IdTree *child = StringTreeToIdTree(**p);
- child->parent() = t;
- tChildren.push_back(child);
- }
- return t;
-}
-
-void TreeBasedFilter::AddNodesToMap(const IdTree &tree)
-{
- m_labelToTree[tree.value()].push_back(&tree);
- const std::vector<IdTree*> &children = tree.children();
- for (std::vector<IdTree*>::const_iterator p = children.begin();
- p != children.end(); ++p) {
- AddNodesToMap(**p);
- }
-}
-
-void TreeBasedFilter::Filter(std::istream &in, std::ostream &out)
-{
- const util::MultiCharacter delimiter("|||");
-
- std::string line;
- std::string prevLine;
- StringPiece source;
- bool keep;
- int lineNum = 0;
- std::vector<TreeFragmentToken> tokens;
- std::vector<IdTree *> leaves;
-
- while (std::getline(in, line)) {
- ++lineNum;
-
- // Read the source-side of the rule.
- util::TokenIter<util::MultiCharacter> it(line, delimiter);
-
- // Check if this rule has the same source-side as the previous rule. If
- // it does then we already know whether or not to keep the rule. This
- // optimisation is based on the assumption that the rule table is sorted
- // (which is the case in the standard Moses training pipeline).
- if (*it == source) {
- if (keep) {
- out << line << std::endl;
- }
- continue;
- }
-
- // The source-side is different from the previous rule's.
- source = *it;
-
- // Tokenize the source-side tree fragment.
- tokens.clear();
- for (TreeFragmentTokenizer p(source); p != TreeFragmentTokenizer(); ++p) {
- tokens.push_back(*p);
- }
-
- // Construct an IdTree representing the source-side tree fragment. This
- // will fail if the fragment contains any symbols that don't occur in
- // m_testVocab and in that case the rule can be discarded. In practice,
- // this catches a lot of discardable rules (see comment at the top of this
- // function). If the fragment is successfully created then we attempt to
- // match the tree fragment against the test trees. This test is exact, but
- // slow.
- int i = 0;
- leaves.clear();
- boost::scoped_ptr<IdTree> fragment(BuildTree(tokens, i, leaves));
- keep = fragment.get() && MatchFragment(*fragment, leaves);
- if (keep) {
- out << line << std::endl;
- }
-
- // Retain line for the next iteration (in order that the source StringPiece
- // remains valid).
- prevLine.swap(line);
- }
-}
-
-bool TreeBasedFilter::MatchFragment(const IdTree &fragment,
- const std::vector<IdTree *> &leaves)
-{
- typedef std::vector<const IdTree *> TreeVec;
-
- // Determine which of the fragment's leaves has the smallest number of
- // subtree matches in the test set. If the fragment contains a rare word
- // (which is pretty likely assuming a Zipfian distribution) then we only
- // have to try matching the fragment against a small number of potential
- // match sites.
- const IdTree *rarestLeaf = leaves[0];
- std::size_t lowestCount = m_labelToTree[rarestLeaf->value()].size();
- for (std::size_t i = 1; i < leaves.size(); ++i) {
- const IdTree *leaf = leaves[i];
- std::size_t count = m_labelToTree[leaf->value()].size();
- if (count < lowestCount) {
- lowestCount = count;
- rarestLeaf = leaf;
- }
- }
-
- // Determine the depth of the chosen leaf.
- const std::size_t depth = rarestLeaf->Depth();
-
- // Try to match the rule fragment against the test set subtrees where a
- // leaf match was found.
- TreeVec &nodes = m_labelToTree[rarestLeaf->value()];
- for (TreeVec::const_iterator p = nodes.begin(); p != nodes.end(); ++p) {
- // Navigate 'depth' positions up the subtree to find the root of the
- // potential match site.
- const IdTree *t = *p;
- std::size_t d = depth;
- while (d && t->parent()) {
- t = t->parent();
- --d;
- }
- if (d > 0) {
- // The potential match site is not tall enough.
- continue;
- }
- if (MatchFragment(fragment, *t)) {
- return true;
- }
- }
- return false;
-}
-
-TreeBasedFilter::IdTree *TreeBasedFilter::BuildTree(
- const std::vector<TreeFragmentToken> &tokens, int &i,
- std::vector<IdTree *> &leaves)
-{
- // The subtree starting at tokens[i] is either:
- // 1. a single non-variable symbol (like NP or dog), or
- // 2. a variable symbol (like [NP]), or
- // 3. a subtree with children (like [NP [DT] [NN dog]])
-
- // First check for case 1.
- if (tokens[i].type == TreeFragmentToken_WORD) {
- Vocabulary::IdType id = m_testVocab.Lookup(tokens[i++].value,
- StringPieceCompatibleHash(),
- StringPieceCompatibleEquals());
- if (id == Vocabulary::NullId()) {
- return 0;
- }
- leaves.push_back(new IdTree(id));
- return leaves.back();
- }
-
- // We must be dealing with either case 2 or 3. Case 2 looks like case 3 but
- // without the children.
- assert(tokens[i].type == TreeFragmentToken_LSB);
-
- // Skip over the opening [
- ++i;
-
- // Read the root symbol of the subtree.
- Vocabulary::IdType id = m_testVocab.Lookup(tokens[i++].value,
- StringPieceCompatibleHash(),
- StringPieceCompatibleEquals());
- if (id == Vocabulary::NullId()) {
- return 0;
- }
- IdTree *root = new IdTree(id);
-
- // Read the children (in case 2 there won't be any).
- while (tokens[i].type != TreeFragmentToken_RSB) {
- IdTree *child = BuildTree(tokens, i, leaves);
- if (!child) {
- delete root;
- return 0;
- }
- root->children().push_back(child);
- child->parent() = root;
- }
-
- if (root->IsLeaf()) {
- leaves.push_back(root);
- }
-
- // Skip over the closing ] and we're done.
- ++i;
- return root;
-}
-
-bool TreeBasedFilter::MatchFragment(const IdTree &fragment, const IdTree &tree)
-{
- if (fragment.value() != tree.value()) {
- return false;
- }
- const std::vector<IdTree*> &fragChildren = fragment.children();
- const std::vector<IdTree*> &treeChildren = tree.children();
- if (!fragChildren.empty() && fragChildren.size() != treeChildren.size()) {
- return false;
- }
- for (std::size_t i = 0; i < fragChildren.size(); ++i) {
- if (!MatchFragment(*fragChildren[i], *treeChildren[i])) {
- return false;
- }
- }
- return true;
-}
-
-} // namespace FilterRuleTable
-} // namespace Syntax
-} // namespace MosesTraining
diff --git a/phrase-extract/filter-rule-table/TreeBasedFilter.h b/phrase-extract/filter-rule-table/TreeBasedFilter.h
deleted file mode 100644
index f30c9dd97..000000000
--- a/phrase-extract/filter-rule-table/TreeBasedFilter.h
+++ /dev/null
@@ -1,88 +0,0 @@
-#pragma once
-
-#include <istream>
-#include <ostream>
-#include <string>
-#include <vector>
-
-#include <boost/shared_ptr.hpp>
-#include <boost/unordered_map.hpp>
-
-#include "syntax-common/numbered_set.h"
-#include "syntax-common/string_tree.h"
-#include "syntax-common/tree.h"
-#include "syntax-common/tree_fragment_tokenizer.h"
-
-namespace MosesTraining
-{
-namespace Syntax
-{
-namespace FilterRuleTable
-{
-
-// Filters a rule table (currently assumed to be tree-to-string, STSG),
-// discarding rules that cannot be applied to a given set of test sentences.
-class TreeBasedFilter
-{
-public:
- // Initialize the filter for a given set of test sentences.
- TreeBasedFilter(const std::vector<boost::shared_ptr<StringTree> > &);
-
- // Read a rule table from 'in' and filter it according to the test sentences.
- // This is slow because it involves testing every rule (or a significant
- // fraction) at every node of every test sentence parse tree. There are a
- // couple of optimizations that speed things up in practice, but it could
- // still use some work to make it faster.
- //
- // Some statistics from real data (WMT14, English-German):
- //
- // 4.4M Parallel sentences (source-side parsed with Berkeley parser)
- // 2.7K Test sentences (newstest2014)
- //
- // 73.4M Original rule table size (number of distinct, composed GHKM rules)
- // 22.9M Number of rules with same source-side as previous rule
- // 50.5M Number of rules requiring vocabulary matching test
- // 24.1M Number of rules requiring full tree matching test
- // 6.7M Number of rules retained after filtering
- //
- void Filter(std::istream &in, std::ostream &out);
-
-private:
- // Maps source-side symbols (terminals and non-terminals) from strings to
- // integers.
- typedef NumberedSet<std::string, std::size_t> Vocabulary;
-
- // Represents the test trees using their integer vocabulary values for faster
- // matching.
- typedef Tree<Vocabulary::IdType> IdTree;
-
- // Add an entry to m_labelToTree for every subtree of the given tree.
- void AddNodesToMap(const IdTree &);
-
- // Build an IdTree (wrt m_testVocab) for the tree beginning at position i of
- // the token sequence or return 0 if any symbol in the fragment is not in
- // m_testVocab. If successful then on return, i will be set to the position
- // immediately after the last token of the tree and leaves will contain the
- // pointers to the fragment's leaves. If the build fails then i and leaves
- // are undefined.
- IdTree *BuildTree(const std::vector<TreeFragmentToken> &tokens, int &i,
- std::vector<IdTree *> &leaves);
-
- // Try to match a fragment against any test tree.
- bool MatchFragment(const IdTree &, const std::vector<IdTree *> &);
-
- // Try to match a fragment against a specific subtree of a test tree.
- bool MatchFragment(const IdTree &, const IdTree &);
-
- // Convert a StringTree to an IdTree (wrt m_testVocab). Inserts symbols into
- // m_testVocab.
- IdTree *StringTreeToIdTree(const StringTree &);
-
- std::vector<boost::shared_ptr<IdTree> > m_sentences;
- std::vector<std::vector<const IdTree *> > m_labelToTree;
- Vocabulary m_testVocab;
-};
-
-} // namespace FilterRuleTable
-} // namespace Syntax
-} // namespace MosesTraining
diff --git a/phrase-extract/filter-rule-table/TreeTsgFilter.cpp b/phrase-extract/filter-rule-table/TreeTsgFilter.cpp
new file mode 100644
index 000000000..a84cd7b65
--- /dev/null
+++ b/phrase-extract/filter-rule-table/TreeTsgFilter.cpp
@@ -0,0 +1,120 @@
+#include "TreeTsgFilter.h"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace FilterRuleTable
+{
+
+TreeTsgFilter::TreeTsgFilter(
+ const std::vector<boost::shared_ptr<StringTree> > &sentences)
+{
+ // Convert each StringTree to an IdTree.
+ m_sentences.reserve(sentences.size());
+ for (std::vector<boost::shared_ptr<StringTree> >::const_iterator p =
+ sentences.begin(); p != sentences.end(); ++p) {
+ m_sentences.push_back(boost::shared_ptr<IdTree>(StringTreeToIdTree(**p)));
+ }
+
+ m_labelToTree.resize(m_testVocab.Size());
+ // Construct a map from vocabulary Ids to IdTree nodes.
+ for (std::vector<boost::shared_ptr<IdTree> >::const_iterator p =
+ m_sentences.begin(); p != m_sentences.end(); ++p) {
+ AddNodesToMap(**p);
+ }
+}
+
+TreeTsgFilter::IdTree *TreeTsgFilter::StringTreeToIdTree(const StringTree &s)
+{
+ IdTree *t = new IdTree(m_testVocab.Insert(s.value()));
+ const std::vector<StringTree*> &sChildren = s.children();
+ std::vector<IdTree*> &tChildren = t->children();
+ tChildren.reserve(sChildren.size());
+ for (std::vector<StringTree*>::const_iterator p = sChildren.begin();
+ p != sChildren.end(); ++p) {
+ IdTree *child = StringTreeToIdTree(**p);
+ child->parent() = t;
+ tChildren.push_back(child);
+ }
+ return t;
+}
+
+void TreeTsgFilter::AddNodesToMap(const IdTree &tree)
+{
+ m_labelToTree[tree.value()].push_back(&tree);
+ const std::vector<IdTree*> &children = tree.children();
+ for (std::vector<IdTree*>::const_iterator p = children.begin();
+ p != children.end(); ++p) {
+ AddNodesToMap(**p);
+ }
+}
+
+bool TreeTsgFilter::MatchFragment(const IdTree &fragment,
+ const std::vector<IdTree *> &leaves)
+{
+ typedef std::vector<const IdTree *> TreeVec;
+
+ // Determine which of the fragment's leaves has the smallest number of
+ // subtree matches in the test set. If the fragment contains a rare word
+ // (which is pretty likely assuming a Zipfian distribution) then we only
+ // have to try matching the fragment against a small number of potential
+ // match sites.
+ const IdTree *rarestLeaf = leaves[0];
+ std::size_t lowestCount = m_labelToTree[rarestLeaf->value()].size();
+ for (std::size_t i = 1; i < leaves.size(); ++i) {
+ const IdTree *leaf = leaves[i];
+ std::size_t count = m_labelToTree[leaf->value()].size();
+ if (count < lowestCount) {
+ lowestCount = count;
+ rarestLeaf = leaf;
+ }
+ }
+
+ // Determine the depth of the chosen leaf.
+ const std::size_t depth = rarestLeaf->Depth();
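+ // (For example, if the chosen leaf sits at depth 2 inside the fragment,
+ // the candidate match site's root is the grandparent of the matching
+ // test-tree node.)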
+
+ // Try to match the rule fragment against the test set subtrees where a
+ // leaf match was found.
+ TreeVec &nodes = m_labelToTree[rarestLeaf->value()];
+ for (TreeVec::const_iterator p = nodes.begin(); p != nodes.end(); ++p) {
+ // Navigate 'depth' positions up the subtree to find the root of the
+ // potential match site.
+ const IdTree *t = *p;
+ std::size_t d = depth;
+ while (d && t->parent()) {
+ t = t->parent();
+ --d;
+ }
+ if (d > 0) {
+ // The potential match site is not tall enough.
+ continue;
+ }
+ if (MatchFragment(fragment, *t)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+bool TreeTsgFilter::MatchFragment(const IdTree &fragment, const IdTree &tree)
+{
+ if (fragment.value() != tree.value()) {
+ return false;
+ }
+ const std::vector<IdTree*> &fragChildren = fragment.children();
+ const std::vector<IdTree*> &treeChildren = tree.children();
+ if (!fragChildren.empty() && fragChildren.size() != treeChildren.size()) {
+ return false;
+ }
+ for (std::size_t i = 0; i < fragChildren.size(); ++i) {
+ if (!MatchFragment(*fragChildren[i], *treeChildren[i])) {
+ return false;
+ }
+ }
+ return true;
+}
+
+} // namespace FilterRuleTable
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/filter-rule-table/TreeTsgFilter.h b/phrase-extract/filter-rule-table/TreeTsgFilter.h
new file mode 100644
index 000000000..6ae26400a
--- /dev/null
+++ b/phrase-extract/filter-rule-table/TreeTsgFilter.h
@@ -0,0 +1,53 @@
+#pragma once
+
+#include <istream>
+#include <ostream>
+#include <string>
+#include <vector>
+
+#include <boost/shared_ptr.hpp>
+#include <boost/unordered_map.hpp>
+
+#include "syntax-common/numbered_set.h"
+#include "syntax-common/string_tree.h"
+#include "syntax-common/tree.h"
+#include "syntax-common/tree_fragment_tokenizer.h"
+
+#include "TsgFilter.h"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace FilterRuleTable
+{
+
+// Filters a rule table, discarding rules that cannot be applied to a given
+// test set. The rule table must have a TSG source-side and the test sentences
+// must be parse trees.
+class TreeTsgFilter : public TsgFilter {
+ public:
+ // Initialize the filter for a given set of test sentences.
+ TreeTsgFilter(const std::vector<boost::shared_ptr<StringTree> > &);
+
+ private:
+ // Add an entry to m_labelToTree for every subtree of the given tree.
+ void AddNodesToMap(const IdTree &);
+
+ // Tree-specific implementation of virtual function.
+ bool MatchFragment(const IdTree &, const std::vector<IdTree *> &);
+
+ // Try to match a fragment against a specific subtree of a test tree.
+ bool MatchFragment(const IdTree &, const IdTree &);
+
+ // Convert a StringTree to an IdTree (wrt m_testVocab). Inserts symbols into
+ // m_testVocab.
+ IdTree *StringTreeToIdTree(const StringTree &);
+
+ std::vector<boost::shared_ptr<IdTree> > m_sentences;
+ std::vector<std::vector<const IdTree *> > m_labelToTree;
+};
+
+} // namespace FilterRuleTable
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/filter-rule-table/TsgFilter.cpp b/phrase-extract/filter-rule-table/TsgFilter.cpp
new file mode 100644
index 000000000..6322564dd
--- /dev/null
+++ b/phrase-extract/filter-rule-table/TsgFilter.cpp
@@ -0,0 +1,168 @@
+#include "TsgFilter.h"
+
+#include "boost/scoped_ptr.hpp"
+
+#include "util/string_piece.hh"
+#include "util/string_piece_hash.hh"
+#include "util/tokenize_piece.hh"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace FilterRuleTable
+{
+
+// Read a rule table from 'in' and filter it according to the test sentences.
+//
+// This involves testing TSG fragments for matches at potential match
+// sites in the set of test parse trees / forests. There are a few
+// optimizations that make this reasonably fast in practice:
+//
+// Optimization 1
+// If a rule has the same TSG fragment as the previous rule then re-use the
+// result of the previous filtering decision.
+//
+// Optimization 2
+// Test if the TSG fragment contains any symbols that don't occur in the
+// symbol vocabulary of the test set. If it does then the rule can be
+// discarded.
+//
+// Optimization 3
+// Prior to filtering, a map is constructed from each distinct test set tree /
+// forest vertex symbol to the set of vertices having that symbol. During
+// filtering, for each rule's TSG fragment the leaf with the smallest number of
+// corresponding test nodes is determined. Matching is only attempted
+// at those sites (this is done in MatchFragment, which has tree- and
+// forest-specific implementations).
+//
+// Some statistics from real data (WMT14, English-German, tree-version):
+//
+// 4.4M Parallel sentences (source-side parsed with Berkeley parser)
+// 2.7K Test sentences (newstest2014)
+//
+// 73.4M Original rule table size (number of distinct, composed GHKM rules)
+// 22.9M Number of rules with same source-side as previous rule
+// 50.5M Number of rules requiring vocabulary matching test
+// 24.1M Number of rules requiring full tree matching test
+// 6.7M Number of rules retained after filtering
+//
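+// For illustration of Optimization 3 (hypothetical example): a fragment
+// containing a rare terminal such as "photosynthesis" only needs to be
+// matched at the few test-set vertices labelled with that word, not at
+// every vertex of every test tree.
+//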
+void TsgFilter::Filter(std::istream &in, std::ostream &out)
+{
+ const util::MultiCharacter delimiter("|||");
+
+ std::string line;
+ std::string prevLine;
+ StringPiece source;
+ bool keep = true;
+ int lineNum = 0;
+ std::vector<TreeFragmentToken> tokens;
+ std::vector<IdTree *> leaves;
+
+ while (std::getline(in, line)) {
+ ++lineNum;
+
+ // Read the source-side of the rule.
+ util::TokenIter<util::MultiCharacter> it(line, delimiter);
+
+ // Check if this rule has the same source-side as the previous rule. If
+ // it does then we already know whether or not to keep the rule. This
+ // optimisation is based on the assumption that the rule table is sorted
+ // (which is the case in the standard Moses training pipeline).
+ if (*it == source) {
+ if (keep) {
+ out << line << std::endl;
+ }
+ continue;
+ }
+
+ // The source-side is different from the previous rule's.
+ source = *it;
+
+ // Tokenize the source-side tree fragment.
+ tokens.clear();
+ for (TreeFragmentTokenizer p(source); p != TreeFragmentTokenizer(); ++p) {
+ tokens.push_back(*p);
+ }
+
+ // Construct an IdTree representing the source-side tree fragment. This
+ // will fail if the fragment contains any symbols that don't occur in
+ // m_testVocab and in that case the rule can be discarded. In practice,
+ // this catches a lot of discardable rules (see comment at the top of this
+ // function). If the fragment is successfully created then we attempt to
+ // match the tree fragment against the test trees. This test is exact, but
+ // slow.
+ int i = 0;
+ leaves.clear();
+ boost::scoped_ptr<IdTree> fragment(BuildTree(tokens, i, leaves));
+ keep = fragment.get() && MatchFragment(*fragment, leaves);
+ if (keep) {
+ out << line << std::endl;
+ }
+
+ // Retain line for the next iteration (in order that the source StringPiece
+ // remains valid).
+ prevLine.swap(line);
+ }
+}
+
+TsgFilter::IdTree *TsgFilter::BuildTree(
+ const std::vector<TreeFragmentToken> &tokens, int &i,
+ std::vector<IdTree *> &leaves)
+{
+ // The subtree starting at tokens[i] is either:
+ // 1. a single non-variable symbol (like NP or dog), or
+ // 2. a variable symbol (like [NP]), or
+ // 3. a subtree with children (like [NP [DT] [NN dog]])
+
+ // First check for case 1.
+ if (tokens[i].type == TreeFragmentToken_WORD) {
+ Vocabulary::IdType id = m_testVocab.Lookup(tokens[i++].value,
+ StringPieceCompatibleHash(),
+ StringPieceCompatibleEquals());
+ if (id == Vocabulary::NullId()) {
+ return 0;
+ }
+ leaves.push_back(new IdTree(id));
+ return leaves.back();
+ }
+
+ // We must be dealing with either case 2 or 3. Case 2 looks like case 3 but
+ // without the children.
+ assert(tokens[i].type == TreeFragmentToken_LSB);
+
+ // Skip over the opening [
+ ++i;
+
+ // Read the root symbol of the subtree.
+ Vocabulary::IdType id = m_testVocab.Lookup(tokens[i++].value,
+ StringPieceCompatibleHash(),
+ StringPieceCompatibleEquals());
+ if (id == Vocabulary::NullId()) {
+ return 0;
+ }
+ IdTree *root = new IdTree(id);
+
+ // Read the children (in case 2 there won't be any).
+ while (tokens[i].type != TreeFragmentToken_RSB) {
+ IdTree *child = BuildTree(tokens, i, leaves);
+ if (!child) {
+ delete root;
+ return 0;
+ }
+ root->children().push_back(child);
+ child->parent() = root;
+ }
+
+ if (root->IsLeaf()) {
+ leaves.push_back(root);
+ }
+
+ // Skip over the closing ] and we're done.
+ ++i;
+ return root;
+}
+
+} // namespace FilterRuleTable
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/filter-rule-table/TsgFilter.h b/phrase-extract/filter-rule-table/TsgFilter.h
new file mode 100644
index 000000000..d52169a07
--- /dev/null
+++ b/phrase-extract/filter-rule-table/TsgFilter.h
@@ -0,0 +1,54 @@
+#pragma once
+
+#include <istream>
+#include <ostream>
+#include <string>
+#include <vector>
+
+#include "syntax-common/numbered_set.h"
+#include "syntax-common/tree.h"
+#include "syntax-common/tree_fragment_tokenizer.h"
+
+namespace MosesTraining
+{
+namespace Syntax
+{
+namespace FilterRuleTable
+{
+
+// Base class for TreeTsgFilter and ForestTsgFilter, both of which filter rule
+// tables where the source-side is TSG.
+class TsgFilter {
+ public:
+ virtual ~TsgFilter() {}
+
+ // Read a rule table from 'in' and filter it according to the test sentences.
+ void Filter(std::istream &in, std::ostream &out);
+
+ protected:
+ // Maps symbols (terminals and non-terminals) from strings to integers.
+ typedef NumberedSet<std::string, std::size_t> Vocabulary;
+
+ // Represents a tree using integer vocabulary values.
+ typedef Tree<Vocabulary::IdType> IdTree;
+
+ // Build an IdTree (wrt m_testVocab) for the tree beginning at position i of
+ // the token sequence or return 0 if any symbol in the fragment is not in
+ // m_testVocab. If successful then on return, i will be set to the position
+ // immediately after the last token of the tree and leaves will contain the
+ // pointers to the fragment's leaves. If the build fails then i and leaves
+ // are undefined.
+ IdTree *BuildTree(const std::vector<TreeFragmentToken> &tokens, int &i,
+ std::vector<IdTree *> &leaves);
+
+ // Try to match a fragment. The implementation depends on whether the test
+ // sentences are trees or forests.
+ virtual bool MatchFragment(const IdTree &, const std::vector<IdTree *> &) = 0;
+
+ // The symbol vocabulary of the test sentences.
+ Vocabulary m_testVocab;
+};
+
+} // namespace FilterRuleTable
+} // namespace Syntax
+} // namespace MosesTraining
diff --git a/phrase-extract/lexical-reordering/InputFileStream.cpp b/phrase-extract/lexical-reordering/InputFileStream.cpp
index 013781c36..89667a6d1 100755
--- a/phrase-extract/lexical-reordering/InputFileStream.cpp
+++ b/phrase-extract/lexical-reordering/InputFileStream.cpp
@@ -22,8 +22,10 @@
#include "InputFileStream.h"
#include "gzfilebuf.h"
#include <iostream>
+#include <boost/algorithm/string/predicate.hpp>
using namespace std;
+using namespace boost::algorithm;
namespace Moses
{
@@ -41,8 +43,7 @@ InputFileStream::~InputFileStream()
void InputFileStream::Open(const std::string &filePath)
{
- if (filePath.size() > 3 &&
- filePath.substr(filePath.size() - 3, 3) == ".gz") {
+ if (ends_with(filePath, ".gz")) {
m_streambuf = new gzfilebuf(filePath.c_str());
} else {
std::filebuf* fb = new std::filebuf();
diff --git a/phrase-extract/score-main.cpp b/phrase-extract/score-main.cpp
index bd1b54b2c..d652b2b9b 100644
--- a/phrase-extract/score-main.cpp
+++ b/phrase-extract/score-main.cpp
@@ -28,6 +28,7 @@
#include <set>
#include <vector>
#include <algorithm>
+#include <boost/algorithm/string/predicate.hpp>
#include <boost/unordered_map.hpp>
#include "ScoreFeature.h"
@@ -38,6 +39,7 @@
#include "OutputFileStream.h"
using namespace std;
+using namespace boost::algorithm;
using namespace MosesTraining;
namespace MosesTraining
@@ -904,10 +906,10 @@ void loadOrientationPriors(const std::string &fileNamePhraseOrientationPriors,
bool l2rFlag = false;
bool r2lFlag = false;
- if (!key.substr(0,4).compare("L2R_")) {
+ if (starts_with(key, "L2R_")) {
l2rFlag = true;
}
- if (!key.substr(0,4).compare("R2L_")) {
+ if (starts_with(key, "R2L_")) {
r2lFlag = true;
}
if (!l2rFlag && !r2lFlag) {
diff --git a/regression-testing/run-test-extract.perl b/regression-testing/run-test-extract.perl
index d13ef62d6..bc0dc0cf9 100755
--- a/regression-testing/run-test-extract.perl
+++ b/regression-testing/run-test-extract.perl
@@ -46,6 +46,11 @@ s/(\$\w+)/$1/eeg;
$extractorArgs = $_;
my $cmdMain = "$extractorExe $extractorArgs \n";
+
+open CMD, ">$results_dir/cmd_line";
+print CMD "$cmdMain";
+close CMD;
+
`$cmdMain`;
my $truthPath = "$test_dir/$test_name/truth/";
@@ -53,7 +58,7 @@ my $truthPath = "$test_dir/$test_name/truth/";
if (-e $outPath)
{
- my $cmd = "diff --exclude=.DS_Store --exclude=._* $outPath/ $truthPath/ | wc -l";
+ my $cmd = "diff --exclude=.DS_Store --exclude=._* --exclude=cmd_line $outPath/ $truthPath/ | wc -l";
my $numDiff = `$cmd`;
if ($numDiff == 0)
diff --git a/regression-testing/run-test-mert.perl b/regression-testing/run-test-mert.perl
index e22d152df..233e08b44 100755
--- a/regression-testing/run-test-mert.perl
+++ b/regression-testing/run-test-mert.perl
@@ -92,7 +92,11 @@ exit 0;
sub exec_test {
my ($test_dir,$results) = @_;
my $start_time = time;
- my ($o, $ec, $sig) = run_command("sh $test_dir/command $bin_dir $test_dir 1> $results/run.stdout 2> $results/run.stderr");
+ my $cmd = "sh $test_dir/command $bin_dir $test_dir 1> $results/run.stdout 2> $results/run.stderr";
+ open CMD, ">$results/cmd_line";
+ print CMD "$cmd";
+ close CMD;
+ my ($o, $ec, $sig) = run_command($cmd);
my $elapsed = 0;
$elapsed = time - $start_time;
return ($o, $elapsed, $ec, $sig);
diff --git a/regression-testing/run-test-misc.perl b/regression-testing/run-test-misc.perl
index 7a444f8c3..da79c94e8 100755
--- a/regression-testing/run-test-misc.perl
+++ b/regression-testing/run-test-misc.perl
@@ -38,6 +38,11 @@ unless (defined $results_dir)
use File::Basename qw/dirname/;
my $dir = dirname ($0);
my $cmdMain = "perl -I $dir $test_dir/$test_name/run.perl -moses-root $mosesRoot -moses-bin $mosesBin -test $test_name -data-dir $data_dir -test-dir $test_dir -results-dir $results_dir\n";
+
+open CMD, ">$results_dir/cmd_line";
+print CMD $cmdMain;
+close CMD;
+
`$cmdMain`;
my $outPath = "$results_dir/out";
@@ -47,7 +52,7 @@ print STDERR "outPath=$outPath \n truthPath=$truthPath \n";
if (-e $outPath)
{
- my $cmd = "diff $outPath $truthPath | wc -l";
+ my $cmd = "diff --exclude=cmd_line $outPath $truthPath | wc -l";
my $numDiff = `$cmd`;
diff --git a/regression-testing/run-test-scorer.perl b/regression-testing/run-test-scorer.perl
index 22a7b9370..6bd95ad55 100755
--- a/regression-testing/run-test-scorer.perl
+++ b/regression-testing/run-test-scorer.perl
@@ -46,6 +46,11 @@ s/(\$\w+)/$1/eeg;
$scorerArgs = $_;
my $cmdMain = "$scoreExe $scorerArgs \n";
+
+open CMD, ">$results_dir/cmd_line";
+print CMD "$cmdMain";
+close CMD;
+
`$cmdMain`;
my $truthPath = "$test_dir/$test_name/truth/results.txt";
diff --git a/scripts/Transliteration/in-decoding-transliteration.pl b/scripts/Transliteration/in-decoding-transliteration.pl
index 237aec587..ebf1c490b 100755
--- a/scripts/Transliteration/in-decoding-transliteration.pl
+++ b/scripts/Transliteration/in-decoding-transliteration.pl
@@ -120,7 +120,7 @@ sub run_transliteration
`$MOSES_SRC/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -score-options '--KneserNey' -phrase-translation-table $TRANSLIT_MODEL/model/phrase-table -config $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini -lm 0:3:$TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini:8`;
- `$MOSES_SRC/scripts/training/filter-model-given-input.pl $TRANSLIT_MODEL/evaluation/$eval_file.filtered $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini $TRANSLIT_MODEL/evaluation/$eval_file -Binarizer "$MOSES_SRC/bin/processPhraseTable"`;
+ `$MOSES_SRC/scripts/training/filter-model-given-input.pl $TRANSLIT_MODEL/evaluation/$eval_file.filtered $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini $TRANSLIT_MODEL/evaluation/$eval_file -Binarizer "$MOSES_SRC/bin/CreateOnDiskPt 1 1 4 100 2"`;
`rm $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini`;
diff --git a/scripts/Transliteration/post-decoding-transliteration.pl b/scripts/Transliteration/post-decoding-transliteration.pl
index 69fd8bf46..578160ba2 100755
--- a/scripts/Transliteration/post-decoding-transliteration.pl
+++ b/scripts/Transliteration/post-decoding-transliteration.pl
@@ -135,7 +135,7 @@ sub run_transliteration
`$MOSES_SRC/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -score-options '--KneserNey' -phrase-translation-table $TRANSLIT_MODEL/model/phrase-table -config $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini -lm 0:3:$TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini:8`;
- `$MOSES_SRC/scripts/training/filter-model-given-input.pl $TRANSLIT_MODEL/evaluation/$eval_file.filtered $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini $TRANSLIT_MODEL/evaluation/$eval_file -Binarizer "$MOSES_SRC/bin/processPhraseTable"`;
+ `$MOSES_SRC/scripts/training/filter-model-given-input.pl $TRANSLIT_MODEL/evaluation/$eval_file.filtered $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini $TRANSLIT_MODEL/evaluation/$eval_file -Binarizer "$MOSES_SRC/bin/CreateOnDiskPt 1 1 4 100 2"`;
`rm $TRANSLIT_MODEL/evaluation/$eval_file.moses.table.ini`;
@@ -296,7 +296,7 @@ sub run_decoder
`$MOSES_SRC/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -lmodel-oov-feature "yes" -post-decoding-translit "yes" -phrase-translation-table $corpus_dir/model/phrase-table -config $corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini -lm 0:3:$corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini:8`;
- `$MOSES_SRC/scripts/training/filter-model-given-input.pl $corpus_dir/evaluation/filtered $corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini $INPUT_FILE -Binarizer "$MOSES_SRC/bin/processPhraseTable"`;
+ `$MOSES_SRC/scripts/training/filter-model-given-input.pl $corpus_dir/evaluation/filtered $corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini $INPUT_FILE -Binarizer "$MOSES_SRC/bin/CreateOnDiskPt 1 1 4 100 2"`;
`rm $corpus_dir/evaluation/$OUTPUT_FILE_NAME.moses.table.ini`;
diff --git a/scripts/Transliteration/prepare-transliteration-phrase-table.pl b/scripts/Transliteration/prepare-transliteration-phrase-table.pl
index a96964ac9..dfd1ed4de 100755
--- a/scripts/Transliteration/prepare-transliteration-phrase-table.pl
+++ b/scripts/Transliteration/prepare-transliteration-phrase-table.pl
@@ -101,7 +101,7 @@ sub run_transliteration
`$MOSES_SRC/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -reordering msd-bidirectional-fe -score-options '--KneserNey' -phrase-translation-table $TRANSLIT_MODEL/model/phrase-table -reordering-table $TRANSLIT_MODEL/model/reordering-table -config $eval_file.moses.table.ini -lm 0:3:$eval_file.moses.table.ini:8`;
- `$MOSES_SRC/scripts/training/filter-model-given-input.pl $eval_file.filtered $eval_file.moses.table.ini $eval_file -Binarizer "$MOSES_SRC/bin/processPhraseTable"`;
+ `$MOSES_SRC/scripts/training/filter-model-given-input.pl $eval_file.filtered $eval_file.moses.table.ini $eval_file -Binarizer "$MOSES_SRC/bin/CreateOnDiskPt 1 1 4 100 2"`;
`rm $eval_file.moses.table.ini`;
diff --git a/scripts/Transliteration/train-transliteration-module.pl b/scripts/Transliteration/train-transliteration-module.pl
index 7739e2a2b..ed7f32097 100755
--- a/scripts/Transliteration/train-transliteration-module.pl
+++ b/scripts/Transliteration/train-transliteration-module.pl
@@ -180,7 +180,7 @@ sub train_transliteration_module{
`$MOSES_SRC_DIR/scripts/training/train-model.perl -mgiza -mgiza-cpus 10 -dont-zip -first-step 9 -external-bin-dir $EXTERNAL_BIN_DIR -f $INPUT_EXTENSION -e $OUTPUT_EXTENSION -alignment grow-diag-final-and -parts 5 -score-options '--KneserNey' -phrase-translation-table $OUT_DIR/model/phrase-table -config $OUT_DIR/tuning/moses.table.ini -lm 0:3:$OUT_DIR/tuning/moses.table.ini:8`;
- `$MOSES_SRC_DIR/scripts/training/filter-model-given-input.pl $OUT_DIR/tuning/filtered $OUT_DIR/tuning/moses.table.ini $OUT_DIR/tuning/input -Binarizer "$MOSES_SRC_DIR/bin/processPhraseTable"`;
+ `$MOSES_SRC_DIR/scripts/training/filter-model-given-input.pl $OUT_DIR/tuning/filtered $OUT_DIR/tuning/moses.table.ini $OUT_DIR/tuning/input -Binarizer "$MOSES_SRC_DIR/bin/CreateOnDiskPt 1 1 4 100 2"`;
`rm $OUT_DIR/tuning/moses.table.ini`;
diff --git a/scripts/recaser/train-recaser.perl b/scripts/recaser/train-recaser.perl
index b12aa6147..ad75af068 100755
--- a/scripts/recaser/train-recaser.perl
+++ b/scripts/recaser/train-recaser.perl
@@ -13,8 +13,12 @@ my ($DIR,$CORPUS,$SCRIPTS_ROOT_DIR,$CONFIG,$HELP,$ERROR);
my $LM = "KENLM"; # KENLM is default.
my $BUILD_LM = "build-lm.sh";
my $BUILD_KENLM = "$Bin/../../bin/lmplz";
+my $BUILD_BINARY = "$Bin/../../bin/build_binary";
+my $EXTRACT = "$Bin/../../bin/extract";
+my $SCORE = "$Bin/../../bin/score";
+my $CONSOLIDATE_DIRECT = "$Bin/../../bin/consolidate-direct";
my $NGRAM_COUNT = "ngram-count";
-my $TRAIN_SCRIPT = "train-factored-phrase-model.perl";
+my $TRAIN_SCRIPT = "$Bin/../training/train-model.perl";
my $MAX_LEN = 1;
my $FIRST_STEP = 1;
my $LAST_STEP = 11;
@@ -118,11 +122,14 @@ sub train_lm {
}
else {
$LM = "KENLM";
- $cmd = "$BUILD_KENLM --prune 0 0 1 -S 5% -T $DIR/lmtmp --order 3 --text $CORPUS --arpa $DIR/cased.kenlm.gz";
+ $cmd = "$BUILD_KENLM --prune 0 0 1 -S 5% -T $DIR/lmtmp --order 3 --text $CORPUS --arpa $DIR/cased.kenlm.arpa.gz";
}
print STDERR "** Using $LM **" . "\n";
print STDERR $cmd."\n";
system($cmd) == 0 || die("Language model training failed with error " . ($? >> 8) . "\n");
+ if ($LM eq "KENLM") {
+ system("$BUILD_BINARY $DIR/cased.kenlm.arpa.gz $DIR/cased.kenlm ; rm $DIR/cased.kenlm.arpa.gz");
+ }
}
sub prepare_data {
@@ -159,10 +166,29 @@ sub prepare_data {
}
sub train_recase_model {
+ print STDERR "\n(4) Training recasing model @ ".`date`;
my $first = $FIRST_STEP;
$first = 4 if $first < 4;
- print STDERR "\n(4) Training recasing model @ ".`date`;
+ if ($MAX_LEN == 1) {
+ my $cmd = "$EXTRACT $DIR/aligned.cased $DIR/aligned.lowercased $DIR/aligned.a $DIR/extract 1";
+ system($cmd) == 0 || die("ERROR: extract (special case max-len 1) failed: $cmd");
+ $cmd = "sort -S 2G $DIR/extract > $DIR/extract.sorted";
+ system($cmd) == 0 || die("ERROR: sort extract (special case max-len 1) failed: $cmd");
+ $cmd = "$SCORE $DIR/extract.sorted /dev/null $DIR/phrase-table-half --NoLex";
+ system($cmd) == 0 || die("ERROR: score (special case max-len 1) failed: $cmd");
+ $cmd = "$CONSOLIDATE_DIRECT $DIR/phrase-table-half $DIR/phrase-table";
+ system($cmd) == 0 || die("ERROR: consolidate-direct (special case max-len 1) failed: $cmd");
+ system("rm $DIR/phrase-table-half");
+ system("gzip $DIR/phrase-table");
+ $first = 9;
+ }
my $cmd = "$TRAIN_SCRIPT --root-dir $DIR --model-dir $DIR --first-step $first --alignment a --corpus $DIR/aligned --f lowercased --e cased --max-phrase-length $MAX_LEN";
+ if ($MAX_LEN == 1) {
+ $cmd .= " --score-options='--NoLex --OnlyDirect'";
+ }
+ else {
+ $cmd .= " --score-options='--OnlyDirect'";
+ }
if (uc $LM eq "IRSTLM") {
$cmd .= " --lm 0:3:$DIR/cased.irstlm.gz:1";
}
@@ -170,7 +196,7 @@ sub train_recase_model {
$cmd .= " --lm 0:3:$DIR/cased.srilm.gz:8";
}
else {
- $cmd .= " --lm 0:3:$DIR/cased.kenlm.gz:8";
+ $cmd .= " --lm 0:3:$DIR/cased.kenlm:8";
}
$cmd .= " -config $CONFIG" if $CONFIG;
print STDERR $cmd."\n";
diff --git a/scripts/tokenizer/normalize-punctuation.perl b/scripts/tokenizer/normalize-punctuation.perl
index 60d559eb6..c679ab2a7 100755
--- a/scripts/tokenizer/normalize-punctuation.perl
+++ b/scripts/tokenizer/normalize-punctuation.perl
@@ -75,7 +75,6 @@ while(<STDIN>) {
s/(\.+)\"(\s*[^<])/\"$1$2/g; # don't fix period at end of sentence
}
- print STDERR $_ if //;
if ($language eq "de" || $language eq "es" || $language eq "cz" || $language eq "cs" || $language eq "fr") {
s/(\d) (\d)/$1,$2/g;
diff --git a/scripts/training/convert-moses-ini-v2-to-v1.perl b/scripts/training/convert-moses-ini-v2-to-v1.perl
new file mode 100755
index 000000000..aad3ba15e
--- /dev/null
+++ b/scripts/training/convert-moses-ini-v2-to-v1.perl
@@ -0,0 +1,263 @@
+#! /usr/bin/env python
+# -*- coding: utf8 -*-
+
+
+from __future__ import (
+ absolute_import,
+ print_function,
+ unicode_literals,
+ )
+
+__version__ = '1.0'
+__license__ = 'LGPL3'
+__source__ = 'Precision Translation Tools Pte Lte'
+
+import errno
+from sys import stdout
+from copy import deepcopy
+from os.path import (
+ dirname,
+ basename,
+ exists,
+ realpath,
+ )
+from os import (
+ sep,
+ makedirs,
+ )
+
+root_escape = '%(escape-prefix)s'
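+
+# For illustration (format inferred from the parsing logic in parse()): a
+# moses.ini v2 file contains lines such as
+#
+#   [feature]
+#   KENLM name=LM0 factor=0 path=lm.bin order=5
+#   [weight]
+#   LM0= 0.5
+#
+# which parse() regroups into one section per feature function, keyed by the
+# feature's "name" attribute. The feature name, path, and order shown here
+# are hypothetical.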
+
+
+class moses2_to_ini(object):
+
+
+ def __init__(self, inp, out, escape_prefix):
+ self.inp = inp
+ self.out = out
+ self.escape_prefix = escape_prefix
+ self._config = {}
+
+
+ def parse(self):
+
+ content = ''
+ key = ''
+ section = None
+ self._config = {}
+ counter = 0
+
+ with open(self.inp, 'rb') as f:
+ contents = f.read().decode('utf8')
+
+ lines = contents.splitlines()
+
+ # retrieve all values except feature/functions with attributes
+ for i, line in [(i, line.strip()) for i, line in enumerate(lines)
+ if line.strip() and not line.strip().startswith('#')]:
+
+ if line.startswith('[') and line.endswith(']'):
+
+ section = line.strip('] [')
+
+ if section not in self._config.keys() + ['feature', 'weight']:
+ # new section not in config and not a reserved section
+ counter = 0
+ key = section
+ self._config[key] = {}
+
+ elif section == 'feature' and line in ['UnknownWordPenalty',
+ 'WordPenalty', 'PhrasePenalty', 'Distortion']:
+ # known feature/funcions without attributes
+ key = '%s0' % line
+ if key not in self._config:
+ self._config[key] = {}
+ self._config[key]['feature'] = line
+
+ elif section == 'feature':
+ # skip feature/funcions with artuments
+ continue
+
+ elif section == 'weight':
+ # add weight value to feature sections
+ for key, value in [(key.strip(), value.strip())
+ for key, value in [line.split('=', 1)]]:
+ if key not in self._config:
+ self._config[key] = {}
+ self._config[key]['weight'] = value
+
+ else:
+ self._config[key][counter] = line
+ counter += 0
+
+ lines[i] = ''
+
+ # second, match feature/functions attributes to [weight] section values
+ for i, line in [(i, line.strip()) for i, line in enumerate(lines)
+ if line.strip() and not line.strip().startswith('#')]:
+
+ # add "feature" to assist creating tmpdict for feature/functions
+ line = 'feature=%s' % line
+ tmpdict = dict([key.split('=',1) for key in line.split()])
+
+ # feature/functions 'name' attribute must match an entry in [weight]
+ if tmpdict.get('name') not in self._config:
+ raise RuntimeError('malformed moses.ini v2 file')
+
+ for key, value in [(key.strip(), value.strip()) for key, value
+ in tmpdict.items() if key.strip() != 'name']:
+
+ self._config[tmpdict['name']][key] = value
+
+ return deepcopy(self._config)
+
+
+ def render(self, config):
+
+ self._config = deepcopy(config)
+
+ _config = deepcopy(config)
+
+ lines = _tolines(_config, self.escape_prefix)
+
+ if self.out == '-':
+
+ stdout.write('\n'.join(lines))
+
+ else:
+
+ contents = '\r\n'.join(lines)
+
+ makedir(dirname(self.out))
+
+ with open(self.out, 'wb') as f:
+ f.write(contents.encode('utf8'))
+
+
+ def __str__(self):
+ return '\n'.join(_tolines(self._config, self.escape_prefix))
+
+
+ @property
+ def config(self):
+ return deepcopy(self._config)
+
+
+def _tolines(config, escape_prefix):
+
+ lines = []
+
+ # group feature/functions first
+ for sectionname in [sectionname for sectionname in sorted(config)
+ if sectionname[-1] in '0123456789']:
+
+ section = config[sectionname]
+
+ lines.append('[%s]' % sectionname)
+
+ for option, value in section.items():
+
+ if option == 'path' \
+ and escape_prefix is not None \
+ and value.startswith(escape_prefix):
+
+ value = value.replace(escape_prefix, root_escape, 1)
+
+ lines.append('%s=%s' % (option, value))
+
+ lines.append('')
+
+ for sectionname in [sectionname for sectionname in sorted(config)
+ if sectionname[-1] not in '0123456789']:
+
+ section = config[sectionname]
+
+ lines.append('[%s]' % sectionname)
+
+ for option, value in section.items():
+
+ lines.append('%s=%s' % (option, value))
+
+ lines.append('')
+
+ return deepcopy(lines)
+
+
+def makedir(path, mode=0o777):
+ try:
+ makedirs(path, mode)
+ except OSError as e:
+ if e.errno not in [errno.EEXIST,
+ errno.EPERM, errno.EACCES, errno.ENOENT]:
+ raise
+
+
+def get_args():
+ '''Parse command-line arguments
+
+ Uses the API compatibility between the legacy optparse.OptionParser
+ and its replacement argparse.ArgumentParser for functional equivalence
+ and a nearly identical help prompt.
+ '''
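+ # Example invocation (hypothetical paths):
+ #   python convert-moses-ini-v2-to-v1.perl -i model/moses.ini -o out.ini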
+
+ description = 'Convert Moses.ini v2 file to standard INI format'
+ usage = '%s [arguments]' % basename(__file__)
+
+ try:
+ from argparse import ArgumentParser
+ except ImportError:
+ from optparse import OptionParser
+ argparser = False
+ escape_help = ('Optional. Path of SMT model. If provided, '
+ 'escapes \"escape-prefix\" with \"%(escape-prefix)s\"')
+ parser = OptionParser(usage=usage, description=description)
+ add_argument = parser.add_option
+ else:
+ argparser = True
+ escape_help = ('Optional. Path of SMT model. If provided, '
+ 'escape \"escape-prefix\" with \"%%(escape-prefix)s\"')
+ parser = ArgumentParser(usage=usage, description=description)
+ add_argument = parser.add_argument
+
+ add_argument('-i','--inp', action='store',
+ help='moses.ini v2 file to convert (required)')
+
+ add_argument('-o','--out', action='store', default='-',
+ help='standard INI file (default: "-" outputs to stdout)')
+
+ add_argument('-r','--escape-prefix', action='store',
+ help=escape_help)
+
+ if argparser:
+
+ args = vars(parser.parse_args())
+
+ else:
+
+ opts = parser.parse_args()
+ args = vars(opts[0])
+
+ if args['inp'] is None:
+ parser.error('argument -i/--inp required')
+
+ args['inp'] = realpath(args['inp'])
+
+ if not exists(args['inp']):
+ parser.error('argument -i/--inp invalid.\n'
+ 'reference: %s' % args['inp'])
+
+ if args['out'] != '-':
+ args['out'] = realpath(args['out'])
+
+ return args
+
+
+if __name__ == '__main__':
+
+ args = get_args()
+
+ converter = moses2_to_ini(**args)
+
+ config = converter.parse()
+
+ converter.render(config)
diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl
index ec3644560..bc1ed63a8 100755
--- a/scripts/training/train-model.perl
+++ b/scripts/training/train-model.perl
@@ -2134,11 +2134,17 @@ sub create_ini {
if ($type =~ /^\d+$/) {
# backwards compatibility if the type is given not as string but as a number
- $type = "SRILM" if $type == 0;
- $type = "IRSTLM" if $type == 1;
- $type = "KENLM lazyken=0" if $type == 8;
- $type = "KENLM lazyken=1" if $type == 9;
- die "Unknown numeric LM type given: $type" if $type =~ /^\d+$/;
+ if ($type == 0) {
+ $type = "SRILM";
+ } elsif ($type == 1) {
+ $type = "IRSTLM";
+ } elsif ($type == 8) {
+ $type = "KENLM lazyken=0";
+ } elsif ($type == 9) {
+ $type = "KENLM lazyken=1";
+ } else {
+ die "Unknown numeric LM type given: $type";
+ }
}
my $lm_oov_prob = 0.1;
diff --git a/vw/ClassifierFactory.cpp b/vw/ClassifierFactory.cpp
index 286bf84a6..4d8ee4e54 100644
--- a/vw/ClassifierFactory.cpp
+++ b/vw/ClassifierFactory.cpp
@@ -2,6 +2,9 @@
#include "vw.h"
#include "../moses/Util.h"
#include <iostream>
+#include <boost/algorithm/string/predicate.hpp>
+
+using namespace boost::algorithm;
namespace Discriminative
{
@@ -15,7 +18,7 @@ ClassifierFactory::ClassifierFactory(const std::string &modelFile, const std::st
ClassifierFactory::ClassifierFactory(const std::string &modelFilePrefix)
: m_lastId(0), m_train(true)
{
- if (modelFilePrefix.size() > 3 && modelFilePrefix.substr(modelFilePrefix.size() - 3, 3) == ".gz") {
+ if (ends_with(modelFilePrefix, ".gz")) {
m_modelFilePrefix = modelFilePrefix.substr(0, modelFilePrefix.size() - 3);
m_gzip = true;
} else {
diff --git a/vw/VWTrainer.cpp b/vw/VWTrainer.cpp
index dff58a1de..e513de3d2 100644
--- a/vw/VWTrainer.cpp
+++ b/vw/VWTrainer.cpp
@@ -1,8 +1,10 @@
#include "Util.h"
#include "Classifier.h"
+#include <boost/algorithm/string/predicate.hpp>
#include <boost/iostreams/device/file.hpp>
using namespace std;
+using namespace boost::algorithm;
using namespace Moses;
namespace Discriminative
@@ -10,7 +12,7 @@ namespace Discriminative
VWTrainer::VWTrainer(const std::string &outputFile)
{
- if (outputFile.size() > 3 && outputFile.substr(outputFile.size() - 3, 3) == ".gz") {
+ if (ends_with(outputFile, ".gz")) {
m_bfos.push(boost::iostreams::gzip_compressor());
}
m_bfos.push(boost::iostreams::file_sink(outputFile));