Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHieu Hoang <hieu@hoang.co.uk>2013-06-28 17:19:30 +0400
committerHieu Hoang <hieu@hoang.co.uk>2013-06-28 17:19:30 +0400
commitfa4b92fc0af49079827fb6d71eae86fc97332121 (patch)
tree3f14600ca2dc8fc8de14ed7552ca4864d5c7baf8
parent8c19c2ba8ae59e4cdfefaa442e9825ba9b8bf39f (diff)
parentc963338476a40d5b1071c0c253332ed6ebd0b427 (diff)
Merge branch 'master' into nadir_osm
-rw-r--r--contrib/other-builds/moses/.project10
-rw-r--r--moses/FF/DistortionScoreProducer.cpp3
-rw-r--r--moses/FF/FeatureFunction.cpp4
-rw-r--r--moses/FF/PhrasePenalty.cpp9
-rw-r--r--moses/FF/PhrasePenalty.h8
-rw-r--r--moses/FF/UnknownWordPenaltyProducer.cpp3
-rw-r--r--moses/FF/WordPenaltyProducer.cpp3
-rw-r--r--moses/Parameter.cpp38
-rw-r--r--moses/StaticData.cpp5
-rw-r--r--moses/TargetPhraseCollection.cpp10
-rw-r--r--moses/TranslationModel/BilingualDynSuffixArray.cpp794
-rw-r--r--moses/TranslationModel/BilingualDynSuffixArray.h147
-rw-r--r--moses/TranslationModel/DynSuffixArray.cpp40
-rw-r--r--moses/TranslationModel/DynSuffixArray.h24
-rw-r--r--moses/TranslationModel/PhraseDictionaryDynSuffixArray.README4
-rw-r--r--moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp108
-rw-r--r--moses/TranslationModel/PhraseDictionaryDynSuffixArray.h6
-rw-r--r--moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp3
-rw-r--r--moses/TranslationModel/RuleTable/Trie.h6
-rw-r--r--moses/TranslationModel/WordCoocTable.cpp72
-rw-r--r--moses/TranslationModel/WordCoocTable.h72
-rw-r--r--moses/TypeDef.h1
-rw-r--r--moses/generic/sampling/Sampling.h51
-rw-r--r--moses/generic/sorting/NBestList.h85
-rw-r--r--moses/generic/sorting/VectorIndexSorter.h69
25 files changed, 1140 insertions, 435 deletions
diff --git a/contrib/other-builds/moses/.project b/contrib/other-builds/moses/.project
index 086623c0e..593955b4d 100644
--- a/contrib/other-builds/moses/.project
+++ b/contrib/other-builds/moses/.project
@@ -1572,6 +1572,16 @@
<locationURI>virtual:/virtual</locationURI>
</link>
<link>
+ <name>TranslationModel/WordCoocTable.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/WordCoocTable.cpp</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/WordCoocTable.h</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/WordCoocTable.h</locationURI>
+ </link>
+ <link>
<name>TranslationModel/fuzzy-match</name>
<type>2</type>
<locationURI>virtual:/virtual</locationURI>
diff --git a/moses/FF/DistortionScoreProducer.cpp b/moses/FF/DistortionScoreProducer.cpp
index 67a1185cb..36618b579 100644
--- a/moses/FF/DistortionScoreProducer.cpp
+++ b/moses/FF/DistortionScoreProducer.cpp
@@ -22,7 +22,8 @@ struct DistortionState_traditional : public FFState {
};
DistortionScoreProducer::DistortionScoreProducer(const std::string &line)
- : StatefulFeatureFunction("Distortion", 1, line) {
+ : StatefulFeatureFunction("Distortion", 1, line)
+{
ReadParameters();
}
diff --git a/moses/FF/FeatureFunction.cpp b/moses/FF/FeatureFunction.cpp
index b35b98648..7c1c98652 100644
--- a/moses/FF/FeatureFunction.cpp
+++ b/moses/FF/FeatureFunction.cpp
@@ -102,8 +102,8 @@ void FeatureFunction::SetParameter(const std::string& key, const std::string& va
void FeatureFunction::ReadParameters()
{
while (!m_args.empty()) {
- const vector<string> &args = m_args[0];
- SetParameter(args[0], args[1]);
+ const vector<string> &args = m_args[0];
+ SetParameter(args[0], args[1]);
m_args.erase(m_args.begin());
}
diff --git a/moses/FF/PhrasePenalty.cpp b/moses/FF/PhrasePenalty.cpp
index ac1aa47b8..8b54939a7 100644
--- a/moses/FF/PhrasePenalty.cpp
+++ b/moses/FF/PhrasePenalty.cpp
@@ -5,14 +5,15 @@
namespace Moses
{
PhrasePenalty::PhrasePenalty(const std::string &line)
-: StatelessFeatureFunction("PhrasePenalty",1, line) {
+ : StatelessFeatureFunction("PhrasePenalty",1, line)
+{
ReadParameters();
}
void PhrasePenalty::Evaluate(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const
{
scoreBreakdown.Assign(this, 1.0f);
}
diff --git a/moses/FF/PhrasePenalty.h b/moses/FF/PhrasePenalty.h
index 5263f15ef..7ebc20659 100644
--- a/moses/FF/PhrasePenalty.h
+++ b/moses/FF/PhrasePenalty.h
@@ -11,13 +11,13 @@ public:
PhrasePenalty(const std::string &line);
bool IsUseable(const FactorMask &mask) const {
- return true;
+ return true;
}
virtual void Evaluate(const Phrase &source
- , const TargetPhrase &targetPhrase
- , ScoreComponentCollection &scoreBreakdown
- , ScoreComponentCollection &estimatedFutureScore) const;
+ , const TargetPhrase &targetPhrase
+ , ScoreComponentCollection &scoreBreakdown
+ , ScoreComponentCollection &estimatedFutureScore) const;
};
} //namespace
diff --git a/moses/FF/UnknownWordPenaltyProducer.cpp b/moses/FF/UnknownWordPenaltyProducer.cpp
index adc4c4d6b..4ba033e58 100644
--- a/moses/FF/UnknownWordPenaltyProducer.cpp
+++ b/moses/FF/UnknownWordPenaltyProducer.cpp
@@ -7,7 +7,8 @@ using namespace std;
namespace Moses
{
UnknownWordPenaltyProducer::UnknownWordPenaltyProducer(const std::string &line)
- : StatelessFeatureFunction("UnknownWordPenalty",1, line) {
+ : StatelessFeatureFunction("UnknownWordPenalty",1, line)
+{
m_tuneable = false;
ReadParameters();
}
diff --git a/moses/FF/WordPenaltyProducer.cpp b/moses/FF/WordPenaltyProducer.cpp
index 054852349..8d77d68fd 100644
--- a/moses/FF/WordPenaltyProducer.cpp
+++ b/moses/FF/WordPenaltyProducer.cpp
@@ -7,7 +7,8 @@ using namespace std;
namespace Moses
{
WordPenaltyProducer::WordPenaltyProducer(const std::string &line)
- : StatelessFeatureFunction("WordPenalty",1, line) {
+ : StatelessFeatureFunction("WordPenalty",1, line)
+{
ReadParameters();
}
diff --git a/moses/Parameter.cpp b/moses/Parameter.cpp
index e22a89c6d..bcd7c1519 100644
--- a/moses/Parameter.cpp
+++ b/moses/Parameter.cpp
@@ -275,13 +275,15 @@ bool Parameter::LoadParam(int argc, char* argv[])
}
// overwrite parameters with values from switches
- for(PARAM_STRING::const_iterator iterParam = m_description.begin(); iterParam != m_description.end(); iterParam++) {
+ for(PARAM_STRING::const_iterator iterParam = m_description.begin();
+ iterParam != m_description.end(); iterParam++) {
const string paramName = iterParam->first;
OverwriteParam("-" + paramName, paramName, argc, argv);
}
// ... also shortcuts
- for(PARAM_STRING::const_iterator iterParam = m_abbreviation.begin(); iterParam != m_abbreviation.end(); iterParam++) {
+ for(PARAM_STRING::const_iterator iterParam = m_abbreviation.begin();
+ iterParam != m_abbreviation.end(); iterParam++) {
const string paramName = iterParam->first;
const string paramShortName = iterParam->second;
OverwriteParam("-" + paramShortName, paramName, argc, argv);
@@ -294,7 +296,8 @@ bool Parameter::LoadParam(int argc, char* argv[])
verbose = Scan<int>(m_setting["verbose"][0]);
if (verbose >= 1) { // only if verbose
TRACE_ERR( "Defined parameters (per moses.ini or switch):" << endl);
- for(PARAM_MAP::const_iterator iterParam = m_setting.begin() ; iterParam != m_setting.end(); iterParam++) {
+ for(PARAM_MAP::const_iterator iterParam = m_setting.begin() ;
+ iterParam != m_setting.end(); iterParam++) {
TRACE_ERR( "\t" << iterParam->first << ": ");
for ( size_t i = 0; i < iterParam->second.size(); i++ )
TRACE_ERR( iterParam->second[i] << " ");
@@ -303,7 +306,8 @@ bool Parameter::LoadParam(int argc, char* argv[])
}
// convert old weights args to new format
- if (!isParamSpecified("feature"))
+ // WHAT IS GOING ON HERE??? - UG
+ if (!isParamSpecified("feature")) // UG
ConvertWeightArgs();
CreateWeightsMap();
WeightOverwrite();
@@ -331,11 +335,11 @@ std::vector<float> &Parameter::GetWeights(const std::string &name)
{
std::vector<float> &ret = m_weights[name];
- cerr << "WEIGHT " << name << "=";
- for (size_t i = 0; i < ret.size(); ++i) {
- cerr << ret[i] << ",";
- }
- cerr << endl;
+ // cerr << "WEIGHT " << name << "=";
+ // for (size_t i = 0; i < ret.size(); ++i) {
+ // cerr << ret[i] << ",";
+ // }
+ // cerr << endl;
return ret;
}
@@ -357,7 +361,10 @@ void Parameter::SetWeight(const std::string &name, size_t ind, const vector<floa
newWeights.push_back(line);
}
-void Parameter::AddWeight(const std::string &name, size_t ind, const std::vector<float> &weights)
+void
+Parameter::
+AddWeight(const std::string &name, size_t ind,
+ const std::vector<float> &weights)
{
PARAM_VEC &newWeights = m_setting["weight"];
@@ -478,6 +485,12 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName)
case Compact:
ptType = "PhraseDictionaryCompact";
break;
+ case SuffixArray:
+ ptType = "PhraseDictionarySuffixArray";
+ break;
+ case DSuffixArray:
+ ptType = "PhraseDictionaryDynSuffixArray";
+ break;
default:
break;
}
@@ -502,6 +515,9 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName)
++currOldInd;
}
+
+ // cerr << weights.size() << " PHRASE TABLE WEIGHTS "
+ // << __FILE__ << ":" << __LINE__ << endl;
AddWeight(ptType, ptInd, weights);
// actual pt
@@ -527,7 +543,7 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName)
ptLine << "num-features=" << numScoreComponent << " ";
ptLine << "table-limit=" << maxTargetPhrase[currDict] << " ";
- if (implementation == SuffixArray) {
+ if (implementation == SuffixArray || implementation == DSuffixArray) {
ptLine << "target-path=" << token[5] << " ";
ptLine << "alignment-path=" << token[6] << " ";
}
diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp
index bc4231b70..a9eb14ba3 100644
--- a/moses/StaticData.cpp
+++ b/moses/StaticData.cpp
@@ -700,8 +700,8 @@ bool StaticData::LoadData(Parameter *parameter)
SetWeights(model, weights);
} else if (feature == "PhrasePenalty") {
PhrasePenalty* model = new PhrasePenalty(line);
- vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
- SetWeights(model, weights);
+ vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription());
+ SetWeights(model, weights);
}
#ifdef HAVE_SYNLM
@@ -1177,7 +1177,6 @@ void StaticData::LoadFeatureFunctions()
}
}
- // load phrase table
for (size_t i = 0; i < m_phraseDictionary.size(); ++i) {
PhraseDictionary *pt = m_phraseDictionary[i];
pt->Load();
diff --git a/moses/TargetPhraseCollection.cpp b/moses/TargetPhraseCollection.cpp
index 73f6d0543..9a28dbd5d 100644
--- a/moses/TargetPhraseCollection.cpp
+++ b/moses/TargetPhraseCollection.cpp
@@ -35,11 +35,11 @@ struct CompareTargetPhrase {
void TargetPhraseCollection::NthElement(size_t tableLimit)
{
- vector<TargetPhrase*>::iterator
- iterMiddle = (tableLimit == 0 || m_collection.size() < tableLimit) ?m_collection.end() : m_collection.begin() + tableLimit;
-
- //std::sort(m_collection.begin(), m_collection.end(), CompareTargetPhrase());
- std::nth_element(m_collection.begin(), iterMiddle, m_collection.end(), CompareTargetPhrase());
+ vector<TargetPhrase*>::iterator nth;
+ nth = (tableLimit && tableLimit <= m_collection.size()
+ ? m_collection.begin() + tableLimit
+ : m_collection.end());
+ std::nth_element(m_collection.begin(), nth, m_collection.end(), CompareTargetPhrase());
}
void TargetPhraseCollection::Prune(bool adhereTableLimit, size_t tableLimit)
diff --git a/moses/TranslationModel/BilingualDynSuffixArray.cpp b/moses/TranslationModel/BilingualDynSuffixArray.cpp
index 35b5b37e6..aea2491e9 100644
--- a/moses/TranslationModel/BilingualDynSuffixArray.cpp
+++ b/moses/TranslationModel/BilingualDynSuffixArray.cpp
@@ -3,6 +3,11 @@
#include "moses/FactorCollection.h"
#include "moses/StaticData.h"
#include "moses/TargetPhrase.h"
+
+#include "moses/generic/sorting/NBestList.h"
+#include "moses/generic/sampling/Sampling.h"
+
+#include <boost/foreach.hpp>
#include <iomanip>
using namespace std;
@@ -10,43 +15,48 @@ using namespace std;
namespace Moses
{
-BilingualDynSuffixArray::BilingualDynSuffixArray():
+BilingualDynSuffixArray::
+BilingualDynSuffixArray():
m_maxPhraseLength(StaticData::Instance().GetMaxPhraseLength()),
- m_maxSampleSize(20)
+ m_maxSampleSize(20), m_maxPTEntries(20)
{
m_srcSA = 0;
m_trgSA = 0;
- m_srcCorpus = new std::vector<wordID_t>();
- m_trgCorpus = new std::vector<wordID_t>();
- m_srcVocab = new Vocab(false);
- m_trgVocab = new Vocab(false);
+ m_srcCorpus = new vector<wordID_t>();
+ m_trgCorpus = new vector<wordID_t>();
+ m_srcVocab = new Vocab(false);
+ m_trgVocab = new Vocab(false);
m_scoreCmp = 0;
}
-BilingualDynSuffixArray::~BilingualDynSuffixArray()
+BilingualDynSuffixArray::
+~BilingualDynSuffixArray()
{
- if(m_srcSA) delete m_srcSA;
- if(m_trgSA) delete m_trgSA;
- if(m_srcVocab) delete m_srcVocab;
- if(m_trgVocab) delete m_trgVocab;
+ if(m_srcSA) delete m_srcSA;
+ if(m_trgSA) delete m_trgSA;
+ if(m_srcVocab) delete m_srcVocab;
+ if(m_trgVocab) delete m_trgVocab;
if(m_srcCorpus) delete m_srcCorpus;
if(m_trgCorpus) delete m_trgCorpus;
- if(m_scoreCmp) delete m_scoreCmp;
+ if(m_scoreCmp) delete m_scoreCmp;
}
-bool BilingualDynSuffixArray::Load(
- const std::vector<FactorType>& inputFactors,
- const std::vector<FactorType>& outputFactors,
- std::string source, std::string target, std::string alignments,
- const std::vector<float> &weight)
+bool
+BilingualDynSuffixArray::
+Load(
+ const vector<FactorType>& inputFactors,
+ const vector<FactorType>& outputFactors,
+ string source, string target, string alignments,
+ const vector<float> &weight)
{
m_inputFactors = inputFactors;
m_outputFactors = outputFactors;
- m_scoreCmp = new ScoresComp(weight);
+ // m_scoreCmp = new ScoresComp(weight);
InputFileStream sourceStrme(source);
InputFileStream targetStrme(target);
cerr << "Loading source corpus...\n";
+ // Input and Output are 'Factor directions' (whatever that is) defined in Typedef.h
LoadCorpus(Input, sourceStrme, m_inputFactors, *m_srcCorpus, m_srcSntBreaks, m_srcVocab);
cerr << "Loading target corpus...\n";
LoadCorpus(Output, targetStrme, m_outputFactors,*m_trgCorpus, m_trgSntBreaks, m_trgVocab);
@@ -57,88 +67,69 @@ bool BilingualDynSuffixArray::Load(
m_srcSA = new DynSuffixArray(m_srcCorpus);
if(!m_srcSA) return false;
cerr << "Building Target Suffix Array...\n";
- //m_trgSA = new DynSuffixArray(m_trgCorpus);
- //if(!m_trgSA) return false;
- cerr << "\t(Skipped. Not used)\n";
+ m_trgSA = new DynSuffixArray(m_trgCorpus);
+ if(!m_trgSA) return false;
InputFileStream alignStrme(alignments);
cerr << "Loading Alignment File...\n";
LoadRawAlignments(alignStrme);
+ cerr << m_srcSntBreaks.size() << " "
+ << m_trgSntBreaks.size() << " "
+ << m_rawAlignments.size() << endl;
//LoadAlignments(alignStrme);
cerr << "Building frequent word cache...\n";
CacheFreqWords();
- return true;
-}
-
-bool BilingualDynSuffixArray::LoadTM(
- const std::vector<FactorType>& inputFactors,
- const std::vector<FactorType>& outputFactors,
- std::string source, std::string target, std::string alignments,
- const std::vector<float> &weight)
-{
- m_inputFactors = inputFactors;
- m_outputFactors = outputFactors;
-
- m_scoreCmp = new ScoresComp(weight);
- InputFileStream sourceStrme(source);
- InputFileStream targetStrme(target);
-
- cerr << "Loading target corpus...\n";
- LoadCorpus(Output, targetStrme, m_outputFactors,*m_trgCorpus, m_trgSntBreaks, m_trgVocab);
-
- cerr << "Loading source corpus...\n";
- LoadCorpus(Input, sourceStrme, m_inputFactors, *m_srcCorpus, m_srcSntBreaks, m_srcVocab);
- CHECK(m_srcSntBreaks.size() == m_trgSntBreaks.size());
-
- // build suffix arrays and auxilliary arrays
- cerr << "Building Source Suffix Array...\n";
- m_srcSA = new DynSuffixArray(m_srcCorpus);
- if(!m_srcSA) return false;
- cerr << "Building Target Suffix Array...\n";
- //m_trgSA = new DynSuffixArray(m_trgCorpus);
- //if(!m_trgSA) return false;
- cerr << "\t(Skipped. Not used)\n";
-
- InputFileStream alignStrme(alignments);
- cerr << "Loading Alignment File...\n";
- LoadRawAlignments(alignStrme);
- //LoadAlignments(alignStrme);
- cerr << "Building frequent word cache...\n";
- CacheFreqWords();
+ wordID_t const* s = &(*m_srcCorpus)[0];
+ wordID_t const* t = &(*m_trgCorpus)[0];
+ for (size_t sid = 0; sid < m_srcSntBreaks.size(); ++sid) {
+ wordID_t const* se = s + GetSourceSentenceSize(sid);
+ wordID_t const* te = t + GetTargetSentenceSize(sid);
+ vector<short> const& a = m_rawAlignments[sid];
+ m_wrd_cooc.Count(vector<wordID_t>(s,se),
+ vector<wordID_t>(t,te), a,
+ m_srcVocab->GetkOOVWordID(),
+ m_trgVocab->GetkOOVWordID());
+ s = se;
+ t = te;
+ }
+ if (m_srcSntBreaks.size() != m_trgSntBreaks.size() ||
+ m_rawAlignments.size() != m_trgSntBreaks.size()) {
+ cerr << "FATAL ERROR: Line counts don't match!\n"
+ << "Source side text corpus: " << m_srcSntBreaks.size() << "\n"
+ << "Target side text corpus: " << m_trgSntBreaks.size() << "\n"
+ << "Word alignments: " << m_rawAlignments.size() << endl;
+ exit(1);
+ }
return true;
-
}
-int BilingualDynSuffixArray::LoadRawAlignments(InputFileStream& align)
+int
+BilingualDynSuffixArray::
+LoadRawAlignments(InputFileStream& align)
{
// stores the alignments in the raw file format
- std::string line;
- std::vector<int> vtmp;
- int lineNum = 1;
+ string line;
+ // vector<int> vtmp;
+ // int lineNum = 0;
while(getline(align, line)) {
- if (lineNum % 10000 == 0)
- cerr << lineNum;
- Utils::splitToInt(line, vtmp, "- ");
- CHECK(vtmp.size() % 2 == 0);
- std::vector<short> vAlgn; // store as short ints for memory
- for (std::vector<int>::const_iterator itr = vtmp.begin();
- itr != vtmp.end(); ++itr) {
- vAlgn.push_back(short(*itr));
- }
- m_rawAlignments.push_back(vAlgn);
- ++lineNum;
+ // if (++lineNum % 10000 == 0) cerr << lineNum << endl;
+ LoadRawAlignments(line);
}
return m_rawAlignments.size();
}
-int BilingualDynSuffixArray::LoadRawAlignments(string& align)
+
+
+int
+BilingualDynSuffixArray::
+LoadRawAlignments(string& align)
{
// stores the alignments in the raw file format
vector<int> vtmp;
Utils::splitToInt(align, vtmp, "- ");
CHECK(vtmp.size() % 2 == 0);
vector<short> vAlgn; // store as short ints for memory
- for (std::vector<int>::const_iterator itr = vtmp.begin();
+ for (vector<int>::const_iterator itr = vtmp.begin();
itr != vtmp.end(); ++itr) {
vAlgn.push_back(short(*itr));
}
@@ -146,48 +137,56 @@ int BilingualDynSuffixArray::LoadRawAlignments(string& align)
return m_rawAlignments.size();
}
-int BilingualDynSuffixArray::LoadAlignments(InputFileStream& align)
-{
- std::string line;
- std::vector<int> vtmp;
- int sntIndex(0);
-
- while(getline(align, line)) {
- Utils::splitToInt(line, vtmp, "- ");
- CHECK(vtmp.size() % 2 == 0);
-
- int sourceSize = GetSourceSentenceSize(sntIndex);
- int targetSize = GetTargetSentenceSize(sntIndex);
-
- SentenceAlignment curSnt(sntIndex, sourceSize, targetSize); // initialize empty sentence
- for(int i=0; i < (int)vtmp.size(); i+=2) {
- int sourcePos = vtmp[i];
- int targetPos = vtmp[i+1];
- CHECK(sourcePos < sourceSize);
- CHECK(targetPos < targetSize);
-
- curSnt.alignedList[sourcePos].push_back(targetPos); // list of target nodes for each source word
- curSnt.numberAligned[targetPos]++; // cnt of how many source words connect to this target word
- }
- curSnt.srcSnt = m_srcCorpus + sntIndex; // point source and target sentence
- curSnt.trgSnt = m_trgCorpus + sntIndex;
- m_alignments.push_back(curSnt);
-
- sntIndex++;
- }
- return m_alignments.size();
-}
-
-SentenceAlignment BilingualDynSuffixArray::GetSentenceAlignment(const int sntIndex, bool trg2Src) const
+// int
+// BilingualDynSuffixArray::
+// LoadAlignments(InputFileStream& align)
+// {
+// string line;
+// vector<int> vtmp;
+// int sntIndex = 0;
+
+// while(getline(align, line))
+// {
+// Utils::splitToInt(line, vtmp, "- ");
+// CHECK(vtmp.size() % 2 == 0);
+
+// int sourceSize = GetSourceSentenceSize(sntIndex);
+// int targetSize = GetTargetSentenceSize(sntIndex);
+
+// SentenceAlignment curSnt(sntIndex, sourceSize, targetSize); // initialize empty sentence
+// for(int i=0; i < (int)vtmp.size(); i+=2)
+// {
+// int sourcePos = vtmp[i];
+// int targetPos = vtmp[i+1];
+// CHECK(sourcePos < sourceSize);
+// CHECK(targetPos < targetSize);
+
+// curSnt.alignedList[sourcePos].push_back(targetPos); // list of target nodes for each source word
+// curSnt.numberAligned[targetPos]++; // cnt of how many source words connect to this target word
+// }
+// curSnt.srcSnt = m_srcCorpus + sntIndex; // point source and target sentence
+// curSnt.trgSnt = m_trgCorpus + sntIndex;
+// m_alignments.push_back(curSnt);
+
+// sntIndex++;
+// }
+// return m_alignments.size();
+// }
+
+SentenceAlignment
+BilingualDynSuffixArray::
+GetSentenceAlignment(const int sntIndex, bool trg2Src) const
{
// retrieves the alignments in the format used by SentenceAlignment.Extract()
- int sntGiven = trg2Src ? GetTargetSentenceSize(sntIndex) : GetSourceSentenceSize(sntIndex);
- int sntExtract = trg2Src ? GetSourceSentenceSize(sntIndex) : GetTargetSentenceSize(sntIndex);
- std::vector<short> alignment = m_rawAlignments.at(sntIndex);
+ int t = GetTargetSentenceSize(sntIndex);
+ int s = GetSourceSentenceSize(sntIndex);
+ int sntGiven = trg2Src ? t : s;
+ int sntExtract = trg2Src ? s : t;
SentenceAlignment curSnt(sntIndex, sntGiven, sntExtract); // initialize empty sentence
- for(size_t i=0; i < alignment.size(); i+=2) {
- int sourcePos = alignment[i];
- int targetPos = alignment[i+1];
+ vector<short> const& a = m_rawAlignments.at(sntIndex);
+ for(size_t i=0; i < a.size(); i+=2) {
+ int sourcePos = a[i];
+ int targetPos = a[i+1];
if(trg2Src) {
curSnt.alignedList[targetPos].push_back(sourcePos); // list of target nodes for each source word
curSnt.numberAligned[sourcePos]++; // cnt of how many source words connect to this target word
@@ -202,32 +201,50 @@ SentenceAlignment BilingualDynSuffixArray::GetSentenceAlignment(const int sntInd
return curSnt;
}
-bool BilingualDynSuffixArray::ExtractPhrases(const int& sntIndex, const int& wordIndex,
- const int& sourceSize, std::vector<PhrasePair*>& phrasePairs, bool trg2Src) const
+bool
+BilingualDynSuffixArray::
+ExtractPhrases(const int& sntIndex,
+ const int& wordIndex,
+ const int& sourceSize,
+ vector<PhrasePair*>& phrasePairs,
+ bool trg2Src) const
{
/* ExtractPhrases() can extract the matching phrases for both directions by using the trg2Src
* parameter */
SentenceAlignment curSnt = GetSentenceAlignment(sntIndex, trg2Src);
// get span of phrase in source sentence
int beginSentence = m_srcSntBreaks[sntIndex];
- int rightIdx = wordIndex - beginSentence
- ,leftIdx = rightIdx - sourceSize + 1;
+ int rightIdx = wordIndex - beginSentence;
+ int leftIdx = rightIdx - sourceSize + 1;
return curSnt.Extract(m_maxPhraseLength, phrasePairs, leftIdx, rightIdx); // extract all phrase Alignments in sentence
}
-int BilingualDynSuffixArray::LoadCorpus(FactorDirection direction, InputFileStream& corpus, const FactorList& factors,
- std::vector<wordID_t>& cArray, std::vector<wordID_t>& sntArray,
- Vocab* vocab)
+void
+BilingualDynSuffixArray::
+CleanUp(const InputType& source)
{
- std::string line, word;
+ //m_wordPairCache.clear();
+}
+
+int
+BilingualDynSuffixArray::
+LoadCorpus(FactorDirection direction,
+ InputFileStream & corpus,
+ const FactorList & factors,
+ vector<wordID_t> & cArray,
+ vector<wordID_t> & sntArray,
+ Vocab* vocab)
+{
+ string line, word;
int sntIdx(0);
-// corpus.seekg(0); Seems needless -> commented out to allow loading of gzipped corpora (gzfilebuf doesn't support seeking).
- const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
+ // corpus.seekg(0); Seems needless -> commented out to allow
+ // loading of gzipped corpora (gzfilebuf doesn't support seeking).
+ const string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
while(getline(corpus, line)) {
sntArray.push_back(sntIdx);
Phrase phrase(ARRAY_SIZE_INCR);
// parse phrase
- phrase.CreateFromString(direction, factors, line, factorDelimiter, NULL);
+ phrase.CreateFromString( direction, factors, line, factorDelimiter, NULL);
// store words in vocabulary and corpus
for( size_t i = 0; i < phrase.GetSize(); ++i) {
cArray.push_back( vocab->GetWordID(phrase.GetWord(i)) );
@@ -239,7 +256,9 @@ int BilingualDynSuffixArray::LoadCorpus(FactorDirection direction, InputFileStre
return cArray.size();
}
-bool BilingualDynSuffixArray::GetLocalVocabIDs(const Phrase& src, SAPhrase &output) const
+bool
+BilingualDynSuffixArray::
+GetLocalVocabIDs(const Phrase& src, SAPhrase &output) const
{
// looks up the SA vocab ids for the current src phrase
size_t phraseSize = src.GetSize();
@@ -251,87 +270,131 @@ bool BilingualDynSuffixArray::GetLocalVocabIDs(const Phrase& src, SAPhrase &outp
return false;
} else {
output.SetId(pos, arrayId);
- //cerr << arrayId << " ";
}
}
return true;
}
-pair<float, float> BilingualDynSuffixArray::GetLexicalWeight(const PhrasePair& phrasepair) const
+pair<float, float>
+BilingualDynSuffixArray::
+GetLexicalWeight(const PhrasePair& pp) const
{
- //return pair<float, float>(1, 1);
- float srcLexWeight(1.0), trgLexWeight(1.0);
- std::map<pair<wordID_t, wordID_t>, float> targetProbs; // collect sum of target probs given source words
- //const SentenceAlignment& alignment = m_alignments[phrasepair.m_sntIndex];
- const SentenceAlignment& alignment = GetSentenceAlignment(phrasepair.m_sntIndex);
- std::map<pair<wordID_t, wordID_t>, pair<float, float> >::const_iterator itrCache;
- // for each source word
- for(int srcIdx = phrasepair.m_startSource; srcIdx <= phrasepair.m_endSource; ++srcIdx) {
- float srcSumPairProbs(0);
- wordID_t srcWord = m_srcCorpus->at(srcIdx + m_srcSntBreaks[phrasepair.m_sntIndex]); // localIDs
- const std::vector<int>& srcWordAlignments = alignment.alignedList.at(srcIdx);
- // for each target word aligned to this source word in this alignment
- if(srcWordAlignments.size() == 0) { // get p(NULL|src)
- pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, m_srcVocab->GetkOOVWordID());
- itrCache = m_wordPairCache.find(wordpair);
- if(itrCache == m_wordPairCache.end()) { // if not in cache
- CacheWordProbs(srcWord);
- itrCache = m_wordPairCache.find(wordpair); // search cache again
- }
- CHECK(itrCache != m_wordPairCache.end());
- srcSumPairProbs += itrCache->second.first;
- targetProbs[wordpair] = itrCache->second.second;
- } else { // extract p(trg|src)
- for(size_t i = 0; i < srcWordAlignments.size(); ++i) { // for each aligned word
- int trgIdx = srcWordAlignments[i];
- wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]);
- // get probability of this source->target word pair
- pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, trgWord);
- itrCache = m_wordPairCache.find(wordpair);
- if(itrCache == m_wordPairCache.end()) { // if not in cache
- CacheWordProbs(srcWord);
- itrCache = m_wordPairCache.find(wordpair); // search cache again
- }
- CHECK(itrCache != m_wordPairCache.end());
- srcSumPairProbs += itrCache->second.first;
- targetProbs[wordpair] = itrCache->second.second;
- }
- }
- float srcNormalizer = srcWordAlignments.size() < 2 ? 1.0 : 1.0 / float(srcWordAlignments.size());
- srcLexWeight *= (srcNormalizer * srcSumPairProbs);
- } // end for each source word
- for(int trgIdx = phrasepair.m_startTarget; trgIdx <= phrasepair.m_endTarget; ++trgIdx) {
- float trgSumPairProbs(0);
- wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]);
- for (std::map<pair<wordID_t, wordID_t>, float>::const_iterator trgItr
- = targetProbs.begin(); trgItr != targetProbs.end(); ++trgItr) {
- if(trgItr->first.second == trgWord)
- trgSumPairProbs += trgItr->second;
- }
- if(trgSumPairProbs == 0) continue; // currently don't store target-side SA
- int noAligned = alignment.numberAligned.at(trgIdx);
- float trgNormalizer = noAligned < 2 ? 1.0 : 1.0 / float(noAligned);
- trgLexWeight *= (trgNormalizer * trgSumPairProbs);
+ // sp,tp: sum of link probabilities
+ // sc,tc: count of links
+ int src_size = pp.GetSourceSize();
+ int trg_size = pp.GetTargetSize();
+ vector<float> sp(src_size, 0), tp(trg_size, 0);
+ vector<int> sc(src_size,0), tc(trg_size,0);
+ wordID_t const* sw = &(m_srcCorpus->at(m_srcSntBreaks.at(pp.m_sntIndex)));
+ wordID_t const* tw = &(m_trgCorpus->at(m_trgSntBreaks.at(pp.m_sntIndex)));
+ vector<short> const & a = m_rawAlignments.at(pp.m_sntIndex);
+ for (size_t i = 0; i < a.size(); i += 2) {
+ int s = a[i], t = a.at(i+1), sx, tx;
+ // sx, tx: local positions within phrase pair
+
+ if (s < pp.m_startSource || t < pp.m_startTarget) continue;
+ if ((sx = s - pp.m_startSource) >= src_size) continue;
+ if ((tx = t - pp.m_startTarget) >= trg_size) continue;
+ sp[sx] += m_wrd_cooc.pbwd(sw[s],tw[t]);
+ tp[tx] += m_wrd_cooc.pfwd(sw[s],tw[t]);
+ ++sc[sx];
+ ++tc[tx];
+ }
+ pair<float,float> ret(1,1);
+ wordID_t null_trg = m_trgVocab->GetkOOVWordID();
+ wordID_t null_src = m_srcVocab->GetkOOVWordID();
+ for (size_t i = 0, k = pp.m_startSource; i < sp.size(); ++i, ++k) {
+ if (sc[i]) ret.first *= sp[i]/sc[i];
+ else ret.first *= m_wrd_cooc.pbwd(sw[k], null_trg);
+ }
+ for (size_t i = 0, k = pp.m_startTarget; i < tp.size(); ++i, ++k) {
+ if (tc[i]) ret.second *= tp[i]/tc[i];
+ else ret.second *= m_wrd_cooc.pfwd(null_src,tw[k]);
}
- // TODO::Need to get p(NULL|trg)
- return pair<float, float>(srcLexWeight, trgLexWeight);
+ return ret;
+
+ // //return pair<float, float>(1, 1);
+ // float srcLexWeight(1.0), trgLexWeight(1.0);
+ // map<pair<wordID_t, wordID_t>, float> targetProbs;
+ // // collects sum of target probs given source words
+
+ // //const SentenceAlignment& alignment = m_alignments[phrasepair.m_sntIndex];
+ // const SentenceAlignment alignment = GetSentenceAlignment(phrasepair.m_sntIndex);
+ // map<pair<wordID_t, wordID_t>, pair<float, float> >::const_iterator itrCache;
+ // // for each source word
+ // for(int srcIdx = phrasepair.m_startSource; srcIdx <= phrasepair.m_endSource; ++srcIdx) {
+ // float srcSumPairProbs(0);
+ // wordID_t srcWord = m_srcCorpus->at(srcIdx + m_srcSntBreaks[phrasepair.m_sntIndex]); // localIDs
+ // const vector<int>& srcWordAlignments = alignment.alignedList.at(srcIdx);
+ // // for each target word aligned to this source word in this alignment
+ // if(srcWordAlignments.size() == 0) { // get p(NULL|src)
+ // pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, m_srcVocab->GetkOOVWordID());
+ // itrCache = m_wordPairCache.find(wordpair);
+ // if(itrCache == m_wordPairCache.end()) { // if not in cache
+ // CacheWordProbs(srcWord);
+ // itrCache = m_wordPairCache.find(wordpair); // search cache again
+ // }
+ // CHECK(itrCache != m_wordPairCache.end());
+ // srcSumPairProbs += itrCache->second.first;
+ // targetProbs[wordpair] = itrCache->second.second;
+ // }
+ // else { // extract p(trg|src)
+ // for(size_t i = 0; i < srcWordAlignments.size(); ++i) { // for each aligned word
+ // int trgIdx = srcWordAlignments[i];
+ // wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]);
+ // // get probability of this source->target word pair
+ // pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, trgWord);
+ // itrCache = m_wordPairCache.find(wordpair);
+ // if(itrCache == m_wordPairCache.end()) { // if not in cache
+ // CacheWordProbs(srcWord);
+ // itrCache = m_wordPairCache.find(wordpair); // search cache again
+ // }
+ // CHECK(itrCache != m_wordPairCache.end());
+ // srcSumPairProbs += itrCache->second.first;
+ // targetProbs[wordpair] = itrCache->second.second;
+ // }
+ // }
+ // float srcNormalizer = srcWordAlignments.size() < 2 ? 1.0 : 1.0 / float(srcWordAlignments.size());
+ // srcLexWeight *= (srcNormalizer * srcSumPairProbs);
+ // } // end for each source word
+
+ // for(int trgIdx = phrasepair.m_startTarget; trgIdx <= phrasepair.m_endTarget; ++trgIdx) {
+ // float trgSumPairProbs(0);
+ // wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]);
+ // for (map<pair<wordID_t, wordID_t>, float>::const_iterator trgItr
+ // = targetProbs.begin(); trgItr != targetProbs.end(); ++trgItr) {
+ // if(trgItr->first.second == trgWord)
+ // trgSumPairProbs += trgItr->second;
+ // }
+ // if(trgSumPairProbs == 0) continue; // currently don't store target-side SA
+ // int noAligned = alignment.numberAligned.at(trgIdx);
+ // float trgNormalizer = noAligned < 2 ? 1.0 : 1.0 / float(noAligned);
+ // trgLexWeight *= (trgNormalizer * trgSumPairProbs);
+ // }
+
+ // // TODO::Need to get p(NULL|trg)
+
+ // return pair<float, float>(srcLexWeight, trgLexWeight);
}
-void BilingualDynSuffixArray::CacheFreqWords() const
+
+void
+BilingualDynSuffixArray::
+CacheFreqWords() const
{
- std::multimap<int, wordID_t> wordCnts;
+ multimap<int, wordID_t> wordCnts;
// for each source word in vocab
Vocab::Word2Id::const_iterator it;
for(it = m_srcVocab->VocabStart(); it != m_srcVocab->VocabEnd(); ++it) {
// get its frequency
wordID_t srcWord = it->second;
- std::vector<wordID_t> sword(1, srcWord), wrdIndices;
+ vector<wordID_t> sword(1, srcWord), wrdIndices;
m_srcSA->GetCorpusIndex(&sword, &wrdIndices);
if(wrdIndices.size() >= 1000) { // min count
wordCnts.insert(make_pair(wrdIndices.size(), srcWord));
}
}
int numSoFar(0);
- std::multimap<int, wordID_t>::reverse_iterator ritr;
+ multimap<int, wordID_t>::reverse_iterator ritr;
for(ritr = wordCnts.rbegin(); ritr != wordCnts.rend(); ++ritr) {
m_freqWordsCached.insert(ritr->second);
CacheWordProbs(ritr->second);
@@ -339,20 +402,23 @@ void BilingualDynSuffixArray::CacheFreqWords() const
}
cerr << "\tCached " << m_freqWordsCached.size() << " source words\n";
}
-void BilingualDynSuffixArray::CacheWordProbs(wordID_t srcWord) const
+
+void
+BilingualDynSuffixArray::
+CacheWordProbs(wordID_t srcWord) const
{
- std::map<wordID_t, int> counts;
- std::vector<wordID_t> sword(1, srcWord), wrdIndices;
+ map<wordID_t, int> counts;
+ vector<wordID_t> sword(1, srcWord), wrdIndices;
bool ret = m_srcSA->GetCorpusIndex(&sword, &wrdIndices);
CHECK(ret);
- std::vector<int> sntIndexes = GetSntIndexes(wrdIndices, 1, m_srcSntBreaks);
+ vector<int> sntIndexes = GetSntIndexes(wrdIndices, 1, m_srcSntBreaks);
float denom(0);
// for each occurrence of this word
for(size_t snt = 0; snt < sntIndexes.size(); ++snt) {
int sntIdx = sntIndexes.at(snt); // get corpus index for sentence
CHECK(sntIdx != -1);
int srcWrdSntIdx = wrdIndices.at(snt) - m_srcSntBreaks.at(sntIdx); // get word index in sentence
- const std::vector<int> srcAlg = GetSentenceAlignment(sntIdx).alignedList.at(srcWrdSntIdx); // list of target words for this source word
+ const vector<int> srcAlg = GetSentenceAlignment(sntIdx).alignedList.at(srcWrdSntIdx); // list of target words for this source word
if(srcAlg.size() == 0) {
++counts[m_srcVocab->GetkOOVWordID()]; // if not alligned then align to NULL word
++denom;
@@ -366,7 +432,7 @@ void BilingualDynSuffixArray::CacheWordProbs(wordID_t srcWord) const
}
// now we've gotten counts of all target words aligned to this source word
// get probs and cache all pairs
- for(std::map<wordID_t, int>::const_iterator itrCnt = counts.begin();
+ for(map<wordID_t, int>::const_iterator itrCnt = counts.begin();
itrCnt != counts.end(); ++itrCnt) {
pair<wordID_t, wordID_t> wordPair = make_pair(srcWord, itrCnt->first);
float srcTrgPrb = float(itrCnt->second) / float(denom); // gives p(src->trg)
@@ -375,7 +441,9 @@ void BilingualDynSuffixArray::CacheWordProbs(wordID_t srcWord) const
}
}
-SAPhrase BilingualDynSuffixArray::TrgPhraseFromSntIdx(const PhrasePair& phrasepair) const
+SAPhrase
+BilingualDynSuffixArray::
+TrgPhraseFromSntIdx(const PhrasePair& phrasepair) const
{
// takes sentence indexes and looks up vocab IDs
SAPhrase phraseIds(phrasepair.GetTargetSize());
@@ -388,7 +456,9 @@ SAPhrase BilingualDynSuffixArray::TrgPhraseFromSntIdx(const PhrasePair& phrasepa
return phraseIds;
}
-TargetPhrase* BilingualDynSuffixArray::GetMosesFactorIDs(const SAPhrase& phrase, const Phrase& sourcePhrase) const
+TargetPhrase*
+BilingualDynSuffixArray::
+GetMosesFactorIDs(const SAPhrase& phrase, const Phrase& sourcePhrase) const
{
TargetPhrase* targetPhrase = new TargetPhrase();
for(size_t i=0; i < phrase.words.size(); ++i) { // look up trg words
@@ -401,106 +471,211 @@ TargetPhrase* BilingualDynSuffixArray::GetMosesFactorIDs(const SAPhrase& phrase,
return targetPhrase;
}
-void BilingualDynSuffixArray::GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> > & target) const
+/// Gather translation candidates for source phrase /src/ and store raw
+// phrase pair statistics in /pstats/. Return the sample rate
+// (number of samples considered / total number of hits) and total number of
+// phrase pairs
+pair<float,float>
+BilingualDynSuffixArray::
+GatherCands(Phrase const& src, map<SAPhrase, vector<float> >& pstats) const
{
- //cerr << "phrase is \"" << src << endl;
- size_t sourceSize = src.GetSize();
- SAPhrase localIDs(sourceSize);
- if(!GetLocalVocabIDs(src, localIDs)) return;
- float totalTrgPhrases(0);
- std::map<SAPhrase, int> phraseCounts;
- //std::map<SAPhrase, PhrasePair> phraseColl; // (one of) the word indexes this phrase was taken from
- std::map<SAPhrase, pair<float, float> > lexicalWeights;
- std::map<SAPhrase, pair<float, float> >::iterator itrLexW;
- std::vector<unsigned> wrdIndices;
- // extract sentence IDs from SA and return rightmost index of phrases
- if(!m_srcSA->GetCorpusIndex(&(localIDs.words), &wrdIndices)) return;
- SampleSelection(wrdIndices);
- std::vector<int> sntIndexes = GetSntIndexes(wrdIndices, sourceSize, m_srcSntBreaks);
- // for each sentence with this phrase
- for(size_t snt = 0; snt < sntIndexes.size(); ++snt) {
- std::vector<PhrasePair*> phrasePairs; // to store all phrases possible from current sentence
- int sntIndex = sntIndexes.at(snt); // get corpus index for sentence
- if(sntIndex == -1) continue; // bad flag set by GetSntIndexes()
- ExtractPhrases(sntIndex, wrdIndices[snt], sourceSize, phrasePairs);
- //cerr << "extracted " << phrasePairs.size() << endl;
- totalTrgPhrases += phrasePairs.size(); // keep track of count of each extracted phrase pair
- std::vector<PhrasePair*>::iterator iterPhrasePair;
- for (iterPhrasePair = phrasePairs.begin(); iterPhrasePair != phrasePairs.end(); ++iterPhrasePair) {
- SAPhrase phrase = TrgPhraseFromSntIdx(**iterPhrasePair);
- phraseCounts[phrase]++; // count each unique phrase
- // NOTE::Correct but slow to extract lexical weight here. could do
- // it later for only the top phrases chosen by phrase prob p(e|f)
- pair<float, float> lexWeight = GetLexicalWeight(**iterPhrasePair); // get lexical weighting for this phrase pair
- itrLexW = lexicalWeights.find(phrase); // check if phrase already has lexical weight attached
- if((itrLexW != lexicalWeights.end()) && (itrLexW->second.first < lexWeight.first))
- itrLexW->second = lexWeight; // if this lex weight is greater save it
- else lexicalWeights[phrase] = lexWeight; // else save
+ typedef map<SAPhrase, vector<float> >::iterator pstat_iter;
+ typedef map<SAPhrase, vector<float> >::value_type pstat_entry;
+ pair<float,float> ret(0,0);
+ float& sampleRate = ret.first;
+ float& totalPhrases = ret.second;
+ size_t srcSize = src.GetSize();
+ SAPhrase localIDs(srcSize);
+ vector<unsigned> wrdIndices;
+ if(!GetLocalVocabIDs(src, localIDs) ||
+ !m_srcSA->GetCorpusIndex(&(localIDs.words), &wrdIndices))
+ return ret; // source phrase contains OOVs
+
+ // select a sample of the occurrences for phrase extraction
+ size_t m1 = wrdIndices.size();
+ SampleSelection(wrdIndices); // careful! SampleSelection alters wrdIndices!
+ sampleRate = float(wrdIndices.size())/m1;
+
+ // determine the sentences in which these phrases occur
+ vector<int> sntIndices = GetSntIndexes(wrdIndices, srcSize, m_srcSntBreaks);
+ for(size_t s = 0; s < sntIndices.size(); ++s) {
+ int sntStart = sntIndices.at(s);
+ if(sntStart == -1) continue; // marked as bad by GetSntIndexes()
+ vector<PhrasePair*> phrasePairs;
+ ExtractPhrases(sntStart, wrdIndices[s], srcSize, phrasePairs);
+ totalPhrases += phrasePairs.size();
+ vector<PhrasePair*>::iterator p;
+ for (p = phrasePairs.begin(); p != phrasePairs.end(); ++p) {
+ assert(*p);
+ pair<float, float> lex = GetLexicalWeight(**p);
+ pstat_entry entry(TrgPhraseFromSntIdx(**p), Scores(5));
+ pair<pstat_iter, bool> foo = pstats.insert(entry);
+ Scores& feats = foo.first->second;
+ if (foo.second) {
+ feats[0] = 1; // count
+ feats[1] = lex.first;
+ feats[3] = lex.second;
+ } else {
+ feats[0] += 1;
+ feats[1] = max(feats[1],lex.first);
+ feats[3] = max(feats[3],lex.second);
+ }
+ delete *p;
}
- // done with sentence. delete SA phrase pairs
- RemoveAllInColl(phrasePairs);
} // done with all sentences
- // convert to moses phrase pairs
- std::map<SAPhrase, int>::const_iterator iterPhrases;
- std::multimap<Scores, const SAPhrase*, ScoresComp> phraseScores (*m_scoreCmp);
- // get scores of all phrases
- for(iterPhrases = phraseCounts.begin(); iterPhrases != phraseCounts.end(); ++iterPhrases) {
- float trg2SrcMLE = float(iterPhrases->second) / totalTrgPhrases;
- itrLexW = lexicalWeights.find(iterPhrases->first);
- CHECK(itrLexW != lexicalWeights.end());
- Scores scoreVector(3);
- scoreVector[0] = std::log(trg2SrcMLE);
- scoreVector[1] = std::log(itrLexW->second.first);
- scoreVector[2] = 1; // exp(1);
- phraseScores.insert(make_pair(scoreVector, &iterPhrases->first));
- }
- // return top scoring phrases
- std::multimap<Scores, const SAPhrase*, ScoresComp>::reverse_iterator ritr;
- for(ritr = phraseScores.rbegin(); ritr != phraseScores.rend(); ++ritr) {
- Scores scoreVector = ritr->first;
- TargetPhrase *targetPhrase = GetMosesFactorIDs(*ritr->second, src);
- target.push_back(make_pair( scoreVector, targetPhrase));
- if(target.size() == m_maxSampleSize) break;
+ BOOST_FOREACH(pstat_entry & e, pstats) {
+ Scores& feats = e.second;
+ // 0: bwd phrase prob
+ // 1: lex 1
+ // 2: fwd phrase prob
+ // 3: lex 2
+ // 4: phrase penalty
+ float x = m_trgSA->GetCount(e.first.words)-feats[0] * sampleRate;
+ feats[4] = 1;
+ feats[3] = log(feats[3]);
+ feats[2] = log(feats[0]) - log(totalPhrases);
+ feats[1] = log(feats[1]);
+ feats[0] = log(feats[0]) - log(feats[0] + x);
}
+ return ret;
}
-std::vector<int> BilingualDynSuffixArray::GetSntIndexes(std::vector<unsigned>& wrdIndices,
- const int sourceSize, const std::vector<unsigned>& sntBreaks) const
+// void
+// BilingualDynSuffixArray::
+// GetTargetPhrasesByLexicalWeight
+// (const Phrase& src, vector< pair<Scores, TargetPhrase*> > & target)
+// const
+// {
+// size_t sourceSize = src.GetSize();
+// SAPhrase localIDs(sourceSize);
+// if(!GetLocalVocabIDs(src, localIDs))
+// return; // source phrase contains OOVs
+
+// float totalTrgPhrases(0);
+// map<SAPhrase, int> phraseCounts;
+// map<SAPhrase, pair<float, float> > lexicalWeights;
+// map<SAPhrase, pair<float, float> >::iterator itrLexW;
+
+// // find all occurrences of the phrase in the corpus;
+// // wrdIndices stores the rightmost position
+// vector<unsigned> wrdIndices;
+// if(!m_srcSA->GetCorpusIndex(&(localIDs.words), &wrdIndices))
+// return; // none found
+
+// // select a sample of the occurrences for phrase extraction
+// size_t m1 = wrdIndices.size();
+// SampleSelection(wrdIndices);
+// float sampleRate = float(wrdIndices.size())/m1;
+
+// // determine the sentences in which these phrases occur
+// vector<int> sntIndexes = GetSntIndexes(wrdIndices, sourceSize, m_srcSntBreaks);
+
+// // for each sentence with this phrase
+// for(size_t s = 0; s < sntIndexes.size(); ++s)
+// {
+// vector<PhrasePair*> phrasePairs;
+// int sntIndex = sntIndexes.at(s);
+// if(sntIndex == -1) continue; // bad flag set by GetSntIndexes()
+// ExtractPhrases(sntIndex, wrdIndices[s], sourceSize, phrasePairs);
+// totalTrgPhrases += phrasePairs.size();
+// vector<PhrasePair*>::iterator p;
+// for (p = phrasePairs.begin(); p != phrasePairs.end(); ++p)
+// {
+// PhrasePair* px = *p;
+// assert(px);
+// SAPhrase phrase = TrgPhraseFromSntIdx(*px);
+// phraseCounts[phrase]++; // count each unique phrase
+// // NOTE::Correct but slow to extract lexical weight here. could do
+// // it later for only the top phrases chosen by phrase prob p(e|f)
+// pair<float, float> lexWeight = GetLexicalWeight(**p);
+// itrLexW = lexicalWeights.find(phrase);
+// if((itrLexW != lexicalWeights.end()) &&
+// (itrLexW->second.first < lexWeight.first))
+// itrLexW->second = lexWeight; // if this lex weight is greater save it
+// else lexicalWeights[phrase] = lexWeight; // else save
+// }
+// // done with sentence. delete SA phrase pairs
+// BOOST_FOREACH(PhrasePair* p, phrasePairs) delete p;
+// } // done with all sentences
+
+// cerr << "Done extracting ... " << endl;
+
+// // convert to moses phrase pairs
+// map<SAPhrase, int>::iterator pcnt;
+// BetterPhrase better(*m_scoreCmp);
+// NBestList<pair<Scores,SAPhrase const*>,BetterPhrase> nbest(m_maxPTEntries,better);
+// for(pcnt = phraseCounts.begin(); pcnt != phraseCounts.end(); ++pcnt)
+// {
+// float tmarginal = (m_trgSA->GetCount(pcnt->first.words) * sampleRate);
+// float pfwd = pcnt->second / totalTrgPhrases;
+// float pbwd = pcnt->second / tmarginal;
+// pair<float, float> lexWeight = lexicalWeights[pcnt->first];
+// pair<Scores, SAPhrase const*> entry;
+// entry.first.resize(5);
+// entry.first[0] = pbwd;
+// entry.first[1] = itrLexW->second.first;
+// entry.first[2] = pfwd;
+// entry.first[3] = itrLexW->second.second;
+// entry.first[4] = 2.718; // exp(1);
+// entry.second = &pcnt->first;
+// nbest.add(entry);
+// }
+
+// // return top scoring phrases
+// for (size_t n = 0; n < nbest.size(); ++n)
+// {
+// pair<Scores, SAPhrase const*> e = nbest[n];
+// target.push_back(make_pair(e.first,GetMosesFactorIDs(*e.second, src)));
+// }
+// }
+
+vector<int>
+BilingualDynSuffixArray::
+GetSntIndexes(vector<unsigned>& wrdIndices,
+ const int sourceSize,
+ const vector<unsigned>& sntBreaks) const
{
- std::vector<unsigned>::const_iterator vit;
- std::vector<int> sntIndexes;
+ vector<unsigned>::const_iterator vit;
+ vector<int> sntIndices;
for(size_t i=0; i < wrdIndices.size(); ++i) {
- vit = std::upper_bound(sntBreaks.begin(), sntBreaks.end(), wrdIndices[i]);
+ vit = upper_bound(sntBreaks.begin(), sntBreaks.end(), wrdIndices[i]);
int index = int(vit - sntBreaks.begin()) - 1;
// check for phrases that cross sentence boundaries
if(wrdIndices[i] - sourceSize + 1 < sntBreaks.at(index))
- sntIndexes.push_back(-1); // set bad flag
+ sntIndices.push_back(-1); // set bad flag
else
- sntIndexes.push_back(index); // store the index of the sentence in the corpus
+ sntIndices.push_back(index); // store the index of the sentence in the corpus
}
- return sntIndexes;
+ return sntIndices;
}
-int BilingualDynSuffixArray::SampleSelection(std::vector<unsigned>& sample,
- int sampleSize) const
+int
+BilingualDynSuffixArray::
+SampleSelection(vector<unsigned>& sample, int sampleSize) const
{
// only use top 'sampleSize' number of samples
- if(sample.size() > (size_t)sampleSize)
- sample.erase(sample.begin()+sampleSize, sample.end());
+ vector<unsigned> s;
+ randomSample<unsigned>(s,sampleSize,sample.size());
+ for (size_t i = 0; i < s.size(); ++i)
+ s[i] = sample[s[i]];
+ sample.swap(s);
return sample.size();
}
-void BilingualDynSuffixArray::addSntPair(string& source, string& target, string& alignment)
+void
+BilingualDynSuffixArray::
+addSntPair(string& source, string& target, string& alignment)
{
vuint_t srcFactor, trgFactor;
- cerr << "source, target, alignment = " << source << ", " << target << ", " << alignment << endl;
- const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
+ cerr << "source, target, alignment = " << source << ", "
+ << target << ", " << alignment << endl;
+ const string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
const unsigned oldSrcCrpSize = m_srcCorpus->size(), oldTrgCrpSize = m_trgCorpus->size();
cerr << "old source corpus size = " << oldSrcCrpSize << "\told target size = " << oldTrgCrpSize << endl;
Phrase sphrase(ARRAY_SIZE_INCR);
sphrase.CreateFromString(Input, m_inputFactors, source, factorDelimiter, NULL);
m_srcVocab->MakeOpen();
- std::vector<wordID_t> sIDs(sphrase.GetSize());
+ vector<wordID_t> sIDs(sphrase.GetSize());
// store words in vocabulary and corpus
for(int i = sphrase.GetSize()-1; i >= 0; --i) {
sIDs[i] = m_srcVocab->GetWordID(sphrase.GetWord(i)); // get vocab id backwards
@@ -515,7 +690,7 @@ void BilingualDynSuffixArray::addSntPair(string& source, string& target, string&
Phrase tphrase(ARRAY_SIZE_INCR);
tphrase.CreateFromString(Output, m_outputFactors, target, factorDelimiter, NULL);
m_trgVocab->MakeOpen();
- std::vector<wordID_t> tIDs(tphrase.GetSize());
+ vector<wordID_t> tIDs(tphrase.GetSize());
for(int i = tphrase.GetSize()-1; i >= 0; --i) {
tIDs[i] = m_trgVocab->GetWordID(tphrase.GetWord(i)); // get vocab id
}
@@ -529,18 +704,26 @@ void BilingualDynSuffixArray::addSntPair(string& source, string& target, string&
cerr << "gets to 2\n";
m_srcSA->Insert(&srcFactor, oldSrcCrpSize);
cerr << "gets to 3\n";
- //m_trgSA->Insert(&trgFactor, oldTrgCrpSize);
+ m_trgSA->Insert(&trgFactor, oldTrgCrpSize);
LoadRawAlignments(alignment);
m_trgVocab->MakeClosed();
+
+ m_wrd_cooc.Count(sIDs,tIDs, m_rawAlignments.back(),
+ m_srcVocab->GetkOOVWordID(),
+ m_trgVocab->GetkOOVWordID());
+
//for(size_t i=0; i < sphrase.GetSize(); ++i)
//ClearWordInCache(sIDs[i]);
}
-void BilingualDynSuffixArray::ClearWordInCache(wordID_t srcWord)
+
+void
+BilingualDynSuffixArray::
+ClearWordInCache(wordID_t srcWord)
{
if(m_freqWordsCached.find(srcWord) != m_freqWordsCached.end())
return;
- std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> >::iterator it,
+ map<pair<wordID_t, wordID_t>, pair<float, float> >::iterator it,
first, last;
for(it = m_wordPairCache.begin(); it != m_wordPairCache.end(); ++it) {
if(it->first.first == srcWord) { // all source words grouped
@@ -553,18 +736,23 @@ void BilingualDynSuffixArray::ClearWordInCache(wordID_t srcWord)
m_wordPairCache.erase(first, last);
}
}
-SentenceAlignment::SentenceAlignment(int sntIndex, int sourceSize, int targetSize)
- :m_sntIndex(sntIndex)
- ,numberAligned(targetSize, 0)
- ,alignedList(sourceSize)
+
+SentenceAlignment::
+SentenceAlignment(int sntIndex, int sourceSize, int targetSize)
+ : m_sntIndex(sntIndex)
+ , numberAligned(targetSize, 0)
+ , alignedList(sourceSize)
{
- for(int i=0; i < sourceSize; ++i) {
- std::vector<int> trgWrd;
- alignedList[i] = trgWrd;
- }
+ // What is the code below supposed to accomplish??? UG.
+ // for(int i=0; i < sourceSize; ++i) {
+ // vector<int> trgWrd;
+ // alignedList[i] = trgWrd;
+ // }
}
-bool SentenceAlignment::Extract(int maxPhraseLength, std::vector<PhrasePair*> &ret, int startSource, int endSource) const
+bool
+SentenceAlignment::
+Extract(int maxPhraseLength, vector<PhrasePair*> &ret, int startSource, int endSource) const
{
// foreign = target, F=T
// english = source, E=S
@@ -572,7 +760,7 @@ bool SentenceAlignment::Extract(int maxPhraseLength, std::vector<PhrasePair*> &r
int minTarget = 9999;
int maxTarget = -1;
- std::vector< int > usedTarget = numberAligned;
+ vector< int > usedTarget = numberAligned;
for(int sourcePos = startSource; sourcePos <= endSource; sourcePos++) {
for(int ind=0; ind < (int)alignedList[sourcePos].size(); ind++) {
int targetPos = alignedList[sourcePos][ind];
@@ -626,4 +814,42 @@ bool SentenceAlignment::Extract(int maxPhraseLength, std::vector<PhrasePair*> &r
}
+int
+BilingualDynSuffixArray::
+GetSourceSentenceSize(size_t sentenceId) const
+{
+ return (sentenceId==m_srcSntBreaks.size()-1) ?
+ m_srcCorpus->size() - m_srcSntBreaks.at(sentenceId) :
+ m_srcSntBreaks.at(sentenceId+1) - m_srcSntBreaks.at(sentenceId);
+}
+
+int
+BilingualDynSuffixArray::
+GetTargetSentenceSize(size_t sentenceId) const
+{
+ return (sentenceId==m_trgSntBreaks.size()-1) ?
+ m_trgCorpus->size() - m_trgSntBreaks.at(sentenceId) :
+ m_trgSntBreaks.at(sentenceId+1) - m_trgSntBreaks.at(sentenceId);
+}
+
+BetterPhrase::
+BetterPhrase(ScoresComp const& sc)
+ : cmp(sc) {}
+
+// bool
+// BetterPhrase::
+// operator()(pair<Scores, TargetPhrase const*> const& a,
+// pair<Scores, TargetPhrase const*> const& b) const
+// {
+// return cmp(b.first,a.first);
+// }
+
+bool
+BetterPhrase::
+operator()(pair<Scores, SAPhrase const*> const& a,
+ pair<Scores, SAPhrase const*> const& b) const
+{
+ return cmp(b.first,a.first);
+}
+
}// end namepsace
diff --git a/moses/TranslationModel/BilingualDynSuffixArray.h b/moses/TranslationModel/BilingualDynSuffixArray.h
index 7a4d47eea..71f032507 100644
--- a/moses/TranslationModel/BilingualDynSuffixArray.h
+++ b/moses/TranslationModel/BilingualDynSuffixArray.h
@@ -5,23 +5,29 @@
#include "moses/TranslationModel/DynSAInclude/vocab.h"
#include "moses/TranslationModel/DynSAInclude/types.h"
#include "moses/TranslationModel/DynSAInclude/utils.h"
+#include "moses/TranslationModel/WordCoocTable.h"
#include "moses/InputFileStream.h"
#include "moses/FactorTypeSet.h"
#include "moses/TargetPhrase.h"
+#include <boost/dynamic_bitset.hpp>
+#include "moses/TargetPhraseCollection.h"
+#include <map>
+using namespace std;
namespace Moses
{
+class PhraseDictionaryDynSuffixArray;
/** @todo ask Abbey Levenberg
*/
class SAPhrase
{
public:
- std::vector<wordID_t> words;
+ vector<wordID_t> words;
SAPhrase(size_t phraseSize)
- :words(phraseSize) {
- }
+ :words(phraseSize)
+ {}
void SetId(size_t pos, wordID_t id) {
CHECK(pos < words.size());
@@ -43,12 +49,16 @@ public:
, m_endTarget(endTarget)
, m_startSource(startSource)
, m_endSource(endSource)
- , m_sntIndex(sntIndex) {
- }
+ , m_sntIndex(sntIndex)
+ {}
size_t GetTargetSize() const {
return m_endTarget - m_startTarget + 1;
}
+
+ size_t GetSourceSize() const {
+ return m_endSource - m_startSource + 1;
+ }
};
/** @todo ask Abbey Levenberg
@@ -58,32 +68,43 @@ class SentenceAlignment
public:
SentenceAlignment(int sntIndex, int sourceSize, int targetSize);
int m_sntIndex;
- std::vector<wordID_t>* trgSnt;
- std::vector<wordID_t>* srcSnt;
- std::vector<int> numberAligned;
- std::vector< std::vector<int> > alignedList;
- bool Extract(int maxPhraseLength, std::vector<PhrasePair*> &ret, int startSource, int endSource) const;
+ vector<wordID_t>* trgSnt;
+ vector<wordID_t>* srcSnt;
+ vector<int> numberAligned;
+ vector< vector<int> > alignedList;
+ bool Extract(int maxPhraseLength, vector<PhrasePair*> &ret,
+ int startSource, int endSource) const;
};
+
class ScoresComp
{
public:
- ScoresComp(const std::vector<float>& weights): m_weights(weights) {}
+ ScoresComp(const vector<float>& weights): m_weights(weights) {}
bool operator()(const Scores& s1, const Scores& s2) const {
return s1[0] < s2[0]; // just p(e|f) as approximation
- /*float score1(0), score2(0);
- int idx1(0), idx2(0);
- for (Scores::const_iterator itr = s1.begin();
- itr != s1.end(); ++itr) {
- score1 += log(*itr * m_weights.at(idx1++));
- }
- for (Scores::const_iterator itr = s2.begin();
- itr != s2.end(); ++itr) {
- score2 += log(*itr * m_weights.at(idx2++));
- }
- return score1 < score2;*/
+ // float score1(0), score2(0);
+ // int idx1(0), idx2(0);
+ // for (Scores::const_iterator itr = s1.begin();
+ // itr != s1.end(); ++itr) {
+ // score1 += log(*itr * m_weights.at(idx1++));
+ // }
+ // for (Scores::const_iterator itr = s2.begin();
+ // itr != s2.end(); ++itr) {
+ // score2 += log(*itr * m_weights.at(idx2++));
+ // }
+ // return score1 < score2;
}
private:
- const std::vector<float>& m_weights;
+ const vector<float>& m_weights;
+};
+
+struct BetterPhrase {
+ ScoresComp const& cmp;
+ BetterPhrase(ScoresComp const& sc);
+ // bool operator()(pair<Scores, TargetPhrase const*> const& a,
+ // pair<Scores, TargetPhrase const*> const& b) const;
+ bool operator()(pair<Scores, SAPhrase const*> const& a,
+ pair<Scores, SAPhrase const*> const& b) const;
};
/** @todo ask Abbey Levenberg
@@ -93,66 +114,70 @@ class BilingualDynSuffixArray
public:
BilingualDynSuffixArray();
~BilingualDynSuffixArray();
- bool Load( const std::vector<FactorType>& inputFactors,
- const std::vector<FactorType>& outputTactors,
- std::string source, std::string target, std::string alignments,
- const std::vector<float> &weight);
- bool LoadTM( const std::vector<FactorType>& inputFactors,
- const std::vector<FactorType>& outputTactors,
- std::string source, std::string target, std::string alignments,
- const std::vector<float> &weight);
- void GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> >& target) const;
+ bool Load( const vector<FactorType>& inputFactors,
+ const vector<FactorType>& outputTactors,
+ string source, string target, string alignments,
+ const vector<float> &weight);
+ // bool LoadTM( const vector<FactorType>& inputFactors,
+ // const vector<FactorType>& outputTactors,
+ // string source, string target, string alignments,
+ // const vector<float> &weight);
+ void GetTargetPhrasesByLexicalWeight(const Phrase& src, vector< pair<Scores, TargetPhrase*> >& target) const;
+
+ void CleanUp(const InputType& source);
void addSntPair(string& source, string& target, string& alignment);
+ pair<float,float>
+ GatherCands(Phrase const& src, map<SAPhrase, vector<float> >& pstats) const;
+
+ TargetPhrase*
+ GetMosesFactorIDs(const SAPhrase&, const Phrase& sourcePhrase) const;
+
private:
- DynSuffixArray* m_srcSA;
- DynSuffixArray* m_trgSA;
- std::vector<wordID_t>* m_srcCorpus;
- std::vector<wordID_t>* m_trgCorpus;
- std::vector<FactorType> m_inputFactors;
- std::vector<FactorType> m_outputFactors;
- std::vector<unsigned> m_srcSntBreaks, m_trgSntBreaks;
+
+ mutable WordCoocTable m_wrd_cooc;
+ DynSuffixArray * m_srcSA;
+ DynSuffixArray * m_trgSA;
+ vector<wordID_t>* m_srcCorpus;
+ vector<wordID_t>* m_trgCorpus;
+ vector<FactorType> m_inputFactors;
+ vector<FactorType> m_outputFactors;
+
+ vector<unsigned> m_srcSntBreaks, m_trgSntBreaks;
Vocab* m_srcVocab, *m_trgVocab;
ScoresComp* m_scoreCmp;
- std::vector<SentenceAlignment> m_alignments;
- std::vector<std::vector<short> > m_rawAlignments;
+ vector<SentenceAlignment> m_alignments;
+ vector<vector<short> > m_rawAlignments;
- mutable std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> > m_wordPairCache;
- mutable std::set<wordID_t> m_freqWordsCached;
+ mutable map<pair<wordID_t, wordID_t>, pair<float, float> > m_wordPairCache;
+ mutable set<wordID_t> m_freqWordsCached;
const size_t m_maxPhraseLength, m_maxSampleSize;
-
- int LoadCorpus(FactorDirection direction, InputFileStream&, const std::vector<FactorType>& factors,
- std::vector<wordID_t>&, std::vector<wordID_t>&,
+ const size_t m_maxPTEntries;
+ int LoadCorpus(FactorDirection direction,
+ InputFileStream&, const vector<FactorType>& factors,
+ vector<wordID_t>&, vector<wordID_t>&,
Vocab*);
int LoadAlignments(InputFileStream& aligs);
int LoadRawAlignments(InputFileStream& aligs);
int LoadRawAlignments(string& aligs);
- bool ExtractPhrases(const int&, const int&, const int&, std::vector<PhrasePair*>&, bool=false) const;
+ bool ExtractPhrases(const int&, const int&, const int&, vector<PhrasePair*>&, bool=false) const;
SentenceAlignment GetSentenceAlignment(const int, bool=false) const;
- int SampleSelection(std::vector<unsigned>&, int = 300) const;
+ int SampleSelection(vector<unsigned>&, int = 300) const;
- std::vector<int> GetSntIndexes(std::vector<unsigned>&, int, const std::vector<unsigned>&) const;
- TargetPhrase* GetMosesFactorIDs(const SAPhrase&, const Phrase& sourcePhrase) const;
+ vector<int> GetSntIndexes(vector<unsigned>&, int, const vector<unsigned>&) const;
SAPhrase TrgPhraseFromSntIdx(const PhrasePair&) const;
bool GetLocalVocabIDs(const Phrase&, SAPhrase &) const;
void CacheWordProbs(wordID_t) const;
void CacheFreqWords() const;
void ClearWordInCache(wordID_t);
- std::pair<float, float> GetLexicalWeight(const PhrasePair&) const;
+ pair<float, float> GetLexicalWeight(const PhrasePair&) const;
+
+ int GetSourceSentenceSize(size_t sentenceId) const;
+ int GetTargetSentenceSize(size_t sentenceId) const;
- int GetSourceSentenceSize(size_t sentenceId) const {
- return (sentenceId==m_srcSntBreaks.size()-1) ?
- m_srcCorpus->size() - m_srcSntBreaks.at(sentenceId) :
- m_srcSntBreaks.at(sentenceId+1) - m_srcSntBreaks.at(sentenceId);
- }
- int GetTargetSentenceSize(size_t sentenceId) const {
- return (sentenceId==m_trgSntBreaks.size()-1) ?
- m_trgCorpus->size() - m_trgSntBreaks.at(sentenceId) :
- m_trgSntBreaks.at(sentenceId+1) - m_trgSntBreaks.at(sentenceId);
- }
};
} // end namespace
#endif
diff --git a/moses/TranslationModel/DynSuffixArray.cpp b/moses/TranslationModel/DynSuffixArray.cpp
index c5fddf3f0..a5fc32dcc 100644
--- a/moses/TranslationModel/DynSuffixArray.cpp
+++ b/moses/TranslationModel/DynSuffixArray.cpp
@@ -1,5 +1,6 @@
#include "DynSuffixArray.h"
#include <iostream>
+#include <boost/foreach.hpp>
using namespace std;
@@ -215,8 +216,37 @@ void DynSuffixArray::Substitute(vuint_t* /* newSents */, unsigned /* newIndex */
return;
}
+ComparePosition::
+ComparePosition(vuint_t const& crp, vuint_t const& sfa)
+ : m_crp(crp), m_sfa(sfa) { }
+
+bool
+ComparePosition::
+operator()(unsigned const& i, vector<wordID_t> const& phrase) const
+{
+ unsigned const* x = &m_crp.at(i);
+ unsigned const* e = &m_crp.back();
+ size_t k = 0;
+ for (; k < phrase.size() && x < e; ++k, ++x)
+ if (*x != phrase[k]) return *x < phrase[k];
+ return (x == e && k < phrase.size());
+}
+
+bool
+ComparePosition::
+operator()(vector<wordID_t> const& phrase, unsigned const& i) const
+{
+ unsigned const* x = &m_crp.at(i);
+ unsigned const* e = &m_crp.back();
+ size_t k = 0;
+ for (; k < phrase.size() && x < e; ++k, ++x)
+ if (*x != phrase[k]) return phrase[k] < *x;
+ return false; // (k == phrase.size() && x < e);
+}
+
bool DynSuffixArray::GetCorpusIndex(const vuint_t* phrase, vuint_t* indices)
{
+ // DOES THIS EVEN WORK WHEN A DynSuffixArray has been saved and reloaded????
pair<vuint_t::iterator,vuint_t::iterator> bounds;
indices->clear();
size_t phrasesize = phrase->size();
@@ -251,6 +281,16 @@ bool DynSuffixArray::GetCorpusIndex(const vuint_t* phrase, vuint_t* indices)
return (indices->size() > 0);
}
+size_t
+DynSuffixArray::
+GetCount(vuint_t const& phrase) const
+{
+ ComparePosition cmp(*m_corpus, *m_SA);
+ vuint_t::const_iterator lb = lower_bound(m_SA->begin(), m_SA->end(), phrase, cmp);
+ vuint_t::const_iterator ub = upper_bound(m_SA->begin(), m_SA->end(), phrase, cmp);
+ return ub-lb;
+}
+
void DynSuffixArray::Save(FILE* fout)
{
fWriteVector(fout, *m_SA);
diff --git a/moses/TranslationModel/DynSuffixArray.h b/moses/TranslationModel/DynSuffixArray.h
index 024d6daf1..62f719d57 100644
--- a/moses/TranslationModel/DynSuffixArray.h
+++ b/moses/TranslationModel/DynSuffixArray.h
@@ -11,9 +11,25 @@
namespace Moses
{
-
+using namespace std;
typedef std::vector<unsigned> vuint_t;
+
+/// compare position /i/ in the suffix array /m_sfa/ into corpus /m_crp/
+/// against reference phrase /phrase/
+// added by Ulrich Germann
+class ComparePosition
+{
+ vuint_t const& m_crp;
+ vuint_t const& m_sfa;
+
+public:
+ ComparePosition(vuint_t const& crp, vuint_t const& sfa);
+ bool operator()(unsigned const& i, vector<wordID_t> const& phrase) const;
+ bool operator()(vector<wordID_t> const& phrase, unsigned const& i) const;
+};
+
+
/** @todo ask Abbey Levenberg
*/
class DynSuffixArray
@@ -30,6 +46,8 @@ public:
void Delete(unsigned, unsigned);
void Substitute(vuint_t*, unsigned);
+ size_t GetCount(vuint_t const& phrase) const;
+
private:
vuint_t* m_SA;
vuint_t* m_ISA;
@@ -46,10 +64,10 @@ private:
void PrintAuxArrays() {
std::cerr << "SA\tISA\tF\tL\n";
for(size_t i=0; i < m_SA->size(); ++i)
- std::cerr << m_SA->at(i) << "\t" << m_ISA->at(i) << "\t" << m_F->at(i) << "\t" << m_L->at(i) << std::endl;
+ std::cerr << m_SA->at(i) << "\t" << m_ISA->at(i) << "\t"
+ << m_F->at(i) << "\t" << m_L->at(i) << std::endl;
}
};
-
} //end namespace
#endif
diff --git a/moses/TranslationModel/PhraseDictionaryDynSuffixArray.README b/moses/TranslationModel/PhraseDictionaryDynSuffixArray.README
new file mode 100644
index 000000000..19a7f8779
--- /dev/null
+++ b/moses/TranslationModel/PhraseDictionaryDynSuffixArray.README
@@ -0,0 +1,4 @@
+Specifying Dynamic Suffix Array-based Phrase Tables in moses.ini
+
+[ttable-file]
+14 0 0 5 <source language text file> <target language text file> <file with alignment info in symal format>
diff --git a/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp b/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp
index 61305b958..160362f4c 100644
--- a/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp
+++ b/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp
@@ -3,83 +3,103 @@
#include "moses/StaticData.h"
#include "moses/TargetPhrase.h"
#include <iomanip>
-
+#include <boost/foreach.hpp>
using namespace std;
namespace Moses
{
-PhraseDictionaryDynSuffixArray::PhraseDictionaryDynSuffixArray(const std::string &line)
- :PhraseDictionary("PhraseDictionaryDynSuffixArray", line)
+PhraseDictionaryDynSuffixArray::
+PhraseDictionaryDynSuffixArray(const std::string &line)
+ : PhraseDictionary("PhraseDictionaryDynSuffixArray", line)
,m_biSA(new BilingualDynSuffixArray())
{
ReadParameters();
}
-PhraseDictionaryDynSuffixArray::~PhraseDictionaryDynSuffixArray()
-{
- delete m_biSA;
-}
void PhraseDictionaryDynSuffixArray::Load()
{
SetFeaturesToApply();
- const StaticData &staticData = StaticData::Instance();
- vector<float> weight = staticData.GetWeights(this);
-
- m_biSA->Load( m_input, m_output, m_source, m_target, m_alignments, weight);
+ vector<float> weight = StaticData::Instance().GetWeights(this);
+ m_biSA->Load(m_input, m_output, m_source, m_target, m_alignments, weight);
}
-const TargetPhraseCollection *PhraseDictionaryDynSuffixArray::GetTargetPhraseCollection(const Phrase& src) const
+PhraseDictionaryDynSuffixArray::
+~PhraseDictionaryDynSuffixArray()
{
- TargetPhraseCollection *ret = new TargetPhraseCollection();
- std::vector< std::pair< Scores, TargetPhrase*> > trg;
- // extract target phrases and their scores from suffix array
- m_biSA->GetTargetPhrasesByLexicalWeight( src, trg);
+ delete m_biSA;
+}
- std::vector< std::pair< Scores, TargetPhrase*> >::iterator itr;
- for(itr = trg.begin(); itr != trg.end(); ++itr) {
- Scores scoreVector = itr->first;
- TargetPhrase *targetPhrase = itr->second;
- //std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),NegateScore);
- std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);
+void PhraseDictionaryDynSuffixArray::SetParameter(const std::string& key, const std::string& value)
+{
+ if (key == "source") {
+ m_source = value;
+ } else if (key == "target") {
+ m_target = value;
+ } else if (key == "alignment") {
+ m_alignments = value;
+ } else {
+ PhraseDictionary::SetParameter(key, value);
+ }
+}
- targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
- targetPhrase->Evaluate(src);
+const TargetPhraseCollection*
+PhraseDictionaryDynSuffixArray::
+GetTargetPhraseCollection(const Phrase& src) const
+{
+ typedef map<SAPhrase, vector<float> >::value_type pstat_entry;
+ map<SAPhrase, vector<float> > pstats; // phrase (pair) statistics
+ m_biSA->GatherCands(src,pstats);
- //cout << *targetPhrase << "\t" << std::setprecision(8) << scoreVector[2] << endl;
- ret->Add(targetPhrase);
+ TargetPhraseCollection *ret = new TargetPhraseCollection();
+ BOOST_FOREACH(pstat_entry & e, pstats) {
+ TargetPhrase* tp = m_biSA->GetMosesFactorIDs(e.first, src);
+ tp->GetScoreBreakdown().Assign(this,e.second);
+ ret->Add(tp);
}
- ret->NthElement(m_tableLimit); // sort the phrases for the dcoder
+ // return ret;
+ // TargetPhraseCollection *ret = new TargetPhraseCollection();
+ // std::vector< std::pair< Scores, TargetPhrase*> > trg;
+ //
+ // // extract target phrases and their scores from suffix array
+ // m_biSA->GetTargetPhrasesByLexicalWeight(src, trg);
+ //
+ // std::vector< std::pair< Scores, TargetPhrase*> >::iterator itr;
+ // for(itr = trg.begin(); itr != trg.end(); ++itr) {
+ // Scores scoreVector = itr->first;
+ // TargetPhrase *targetPhrase = itr->second;
+ // std::transform(scoreVector.begin(),scoreVector.end(),
+ // scoreVector.begin(),FloorScore);
+ // targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
+ // targetPhrase->Evaluate();
+ // ret->Add(targetPhrase);
+ // }
+ ret->NthElement(m_tableLimit); // sort the phrases for the decoder
return ret;
}
-void PhraseDictionaryDynSuffixArray::insertSnt(string& source, string& target, string& alignment)
+void
+PhraseDictionaryDynSuffixArray::
+insertSnt(string& source, string& target, string& alignment)
{
m_biSA->addSntPair(source, target, alignment); // insert sentence pair into suffix arrays
//StaticData::Instance().ClearTransOptionCache(); // clear translation option cache
}
-void PhraseDictionaryDynSuffixArray::deleteSnt(unsigned /* idx */, unsigned /* num2Del */)
-{
- // need to implement --
-}
-ChartRuleLookupManager *PhraseDictionaryDynSuffixArray::CreateRuleLookupManager(const InputType&, const ChartCellCollectionBase&)
+void
+PhraseDictionaryDynSuffixArray::
+deleteSnt(unsigned /* idx */, unsigned /* num2Del */)
{
- throw "Chart decoding not supported by PhraseDictionaryDynSuffixArray";
+ // need to implement --
}
-void PhraseDictionaryDynSuffixArray::SetParameter(const std::string& key, const std::string& value)
+ChartRuleLookupManager*
+PhraseDictionaryDynSuffixArray::
+CreateRuleLookupManager(const InputType&, const ChartCellCollectionBase&)
{
- if (key == "source") {
- m_source = value;
- } else if (key == "target") {
- m_target = value;
- } else if (key == "alignment") {
- m_alignments = value;
- } else {
- PhraseDictionary::SetParameter(key, value);
- }
+ CHECK(false);
+ return 0;
}
}// end namepsace
diff --git a/moses/TranslationModel/PhraseDictionaryDynSuffixArray.h b/moses/TranslationModel/PhraseDictionaryDynSuffixArray.h
index 2c9a9ceca..8cd761350 100644
--- a/moses/TranslationModel/PhraseDictionaryDynSuffixArray.h
+++ b/moses/TranslationModel/PhraseDictionaryDynSuffixArray.h
@@ -17,21 +17,19 @@ class PhraseDictionaryDynSuffixArray: public PhraseDictionary
public:
PhraseDictionaryDynSuffixArray(const std::string &line);
~PhraseDictionaryDynSuffixArray();
-
+ bool InitDictionary();
void Load();
-
// functions below required by base class
const TargetPhraseCollection* GetTargetPhraseCollection(const Phrase& src) const;
void insertSnt(string&, string&, string&);
void deleteSnt(unsigned, unsigned);
ChartRuleLookupManager *CreateRuleLookupManager(const InputType&, const ChartCellCollectionBase&);
-
void SetParameter(const std::string& key, const std::string& value);
-
private:
BilingualDynSuffixArray *m_biSA;
std::string m_source, m_target, m_alignments;
+ std::vector<float> m_weight;
};
} // end namespace
diff --git a/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp b/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp
index 2f05e92b6..06e931fab 100644
--- a/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp
+++ b/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp
@@ -31,7 +31,8 @@ using namespace std;
namespace Moses
{
PhraseDictionaryOnDisk::PhraseDictionaryOnDisk(const std::string &line)
- : MyBase("PhraseDictionaryOnDisk", line) {
+ : MyBase("PhraseDictionaryOnDisk", line)
+{
ReadParameters();
}
diff --git a/moses/TranslationModel/RuleTable/Trie.h b/moses/TranslationModel/RuleTable/Trie.h
index 265747260..0d363f747 100644
--- a/moses/TranslationModel/RuleTable/Trie.h
+++ b/moses/TranslationModel/RuleTable/Trie.h
@@ -48,12 +48,6 @@ public:
void Load();
- // Required by PhraseDictionary.
- virtual const TargetPhraseCollection *GetTargetPhraseCollection(const Phrase &) const {
- CHECK(false);
- return NULL;
- }
-
private:
friend class RuleTableLoader;
diff --git a/moses/TranslationModel/WordCoocTable.cpp b/moses/TranslationModel/WordCoocTable.cpp
new file mode 100644
index 000000000..c3223942e
--- /dev/null
+++ b/moses/TranslationModel/WordCoocTable.cpp
@@ -0,0 +1,72 @@
+#include "moses/TranslationModel/WordCoocTable.h"
+using namespace std;
+namespace Moses
+{
+
+WordCoocTable::
+WordCoocTable()
+{
+ m_cooc.reserve(1000000);
+ m_marg1.reserve(1000000);
+ m_marg2.reserve(1000000);
+}
+
+WordCoocTable::
+WordCoocTable(wordID_t const VocabSize1, wordID_t const VocabSize2)
+ : m_cooc(VocabSize1), m_marg1(VocabSize1,0), m_marg2(VocabSize2, 0)
+{}
+
+void
+WordCoocTable::
+Count(size_t const a, size_t const b)
+{
+ while (a >= m_marg1.size()) {
+ m_cooc.push_back(my_map_t());
+ m_marg1.push_back(0);
+ }
+ while (b >= m_marg2.size())
+ m_marg2.push_back(0);
+ ++m_marg1[a];
+ ++m_marg2[b];
+ ++m_cooc[a][b];
+}
+
+uint32_t
+WordCoocTable::
+GetJoint(size_t const a, size_t const b) const
+{
+ if (a >= m_marg1.size() || b >= m_marg2.size()) return 0;
+ my_map_t::const_iterator m = m_cooc.at(a).find(b);
+ if (m == m_cooc[a].end()) return 0;
+ return m->second;
+}
+
+uint32_t
+WordCoocTable::
+GetMarg1(size_t const x) const
+{
+ return x >= m_marg1.size() ? 0 : m_marg1[x];
+}
+
+uint32_t
+WordCoocTable::
+GetMarg2(size_t const x) const
+{
+ return x >= m_marg2.size() ? 0 : m_marg2[x];
+}
+
+float
+WordCoocTable::
+pfwd(size_t const a, size_t const b) const
+{
+ return float(GetJoint(a,b))/GetMarg1(a);
+}
+
+float
+WordCoocTable::
+pbwd(size_t const a, size_t const b) const
+{
+ // cerr << "at " << __FILE__ << ":" << __LINE__ << endl;
+ return float(GetJoint(a,b))/GetMarg2(b);
+}
+}
diff --git a/moses/TranslationModel/WordCoocTable.h b/moses/TranslationModel/WordCoocTable.h
new file mode 100644
index 000000000..60d788bd3
--- /dev/null
+++ b/moses/TranslationModel/WordCoocTable.h
@@ -0,0 +1,72 @@
+#ifndef moses_WordCoocTable_h
+#define moses_WordCoocTable_h
+
+#include "moses/TranslationModel/DynSAInclude/vocab.h"
+#include "moses/TranslationModel/DynSAInclude/types.h"
+#include "moses/TranslationModel/DynSAInclude/utils.h"
+#include "moses/InputFileStream.h"
+#include "moses/FactorTypeSet.h"
+#include "moses/TargetPhrase.h"
+#include <boost/dynamic_bitset.hpp>
+#include <map>
+
+namespace Moses
+{
+
+using namespace std;
+
+#ifndef bitvector
+typedef boost::dynamic_bitset<uint64_t> bitvector;
+#endif
+
+
+/**
+ * Stores word cooccurrence counts
+ * @todo ask Uli Germann
+ */
+class WordCoocTable
+{
+ typedef map<wordID_t,uint32_t> my_map_t;
+ vector<my_map_t> m_cooc;
+ vector<uint32_t> m_marg1;
+ vector<uint32_t> m_marg2;
+public:
+ WordCoocTable();
+ WordCoocTable(wordID_t const VocabSize1, wordID_t const VocabSize2);
+ uint32_t GetJoint(size_t const a, size_t const b) const;
+ uint32_t GetMarg1(size_t const x) const;
+ uint32_t GetMarg2(size_t const x) const;
+ float pfwd(size_t const a, size_t const b) const;
+ float pbwd(size_t const a, size_t const b) const;
+ void
+ Count(size_t const a, size_t const b);
+
+ template<typename idvec, typename alnvec>
+ void
+ Count(idvec const& s1, idvec const& s2, alnvec const& aln,
+ wordID_t const NULL1, wordID_t const NULL2);
+
+};
+
+template<typename idvec, typename alnvec>
+void
+WordCoocTable::
+Count(idvec const& s1, idvec const& s2, alnvec const& aln,
+ wordID_t const NULL1, wordID_t const NULL2)
+{
+ boost::dynamic_bitset<uint64_t> check1(s1.size()), check2(s2.size());
+ check1.set();
+ check2.set();
+ for (size_t i = 0; i < aln.size(); i += 2) {
+ Count(s1[aln[i]], s2[aln[i+1]]);
+ check1.reset(aln[i]);
+ check2.reset(aln[i+1]);
+ }
+ for (size_t i = check1.find_first(); i < check1.size(); i = check1.find_next(i))
+ Count(s1[i], NULL2);
+ for (size_t i = check2.find_first(); i < check2.size(); i = check2.find_next(i))
+ Count(NULL1, s2[i]);
+}
+
+}
+#endif
diff --git a/moses/TypeDef.h b/moses/TypeDef.h
index 2b98b5bc3..af3a47b23 100644
--- a/moses/TypeDef.h
+++ b/moses/TypeDef.h
@@ -121,6 +121,7 @@ enum PhraseTableImplementation {
,FuzzyMatch = 11
,Compact = 12
,Interpolated = 13
+ ,DSuffixArray = 14
};
enum InputTypeEnum {
diff --git a/moses/generic/sampling/Sampling.h b/moses/generic/sampling/Sampling.h
new file mode 100644
index 000000000..69796c96e
--- /dev/null
+++ b/moses/generic/sampling/Sampling.h
@@ -0,0 +1,51 @@
+#ifndef __sampling_h
+#define __sampling_h
+
+// Utility functions for proper sub-sampling.
+// (c) 2007-2012 Ulrich Germann
+
+
+namespace Moses
+{
+
+inline
+size_t
+randInt(size_t N)
+{
+ return N*(rand()/(RAND_MAX+1.));
+}
+
+// select a random sample of size /s/ without restitution from the range of
+// integers [0,N);
+template<typename idx_t>
+void
+randomSample(vector<idx_t>& v, size_t s, size_t N)
+{
+ // see also Knuth: Art of Computer Programming Vol. 2, p. 142
+
+ s = min(s,N);
+ v.resize(s);
+
+ // the first option tries to be a bit more efficient than O(N) in picking
+ // the samples. The threshold is an ad-hoc, off-the-cuff guess. I still
+ // need to figure out the optimal break-even point between a linear sweep
+ // and repeatedly picking random numbers with the risk of hitting the same
+ // number many times.
+ if (s*10<N) {
+ boost::dynamic_bitset<uint64_t> check(N,0);
+ for (size_t i = 0; i < v.size(); i++) {
+ size_t x = randInt(N);
+ while (check[x]) x = randInt(N);
+ check[x]=true;
+ v[i] = x;
+ }
+ } else {
+ size_t m=0;
+ for (size_t t = 0; m <= s && t < N; t++)
+ if (s==N || randInt(N-t) < s-m) v[m++] = t;
+ }
+}
+
+};
+
+#endif
diff --git a/moses/generic/sorting/NBestList.h b/moses/generic/sorting/NBestList.h
new file mode 100644
index 000000000..9c58baa88
--- /dev/null
+++ b/moses/generic/sorting/NBestList.h
@@ -0,0 +1,85 @@
+#ifndef __n_best_list_h
+#define __n_best_list_h
+#include <algorithm>
+#include "moses/generic/sorting/VectorIndexSorter.h"
+
+// NBest List; (c) 2007-2012 Ulrich Germann
+//
+// The 'trick' used in this implementation is to maintain a heap of size <= N
+// such that the lowest-scoring item is on top of the heap. For each incoming
+// item we can then determine easily if it is in the top N.
+
+namespace Moses
+{
+using namespace std;
+
+template<typename THINGY, typename CMP>
+class
+ NBestList
+{
+ vector<uint32_t> m_heap;
+ vector<THINGY> m_list;
+ VectorIndexSorter<THINGY, CMP, uint32_t> m_better;
+ mutable vector<uint32_t> m_order;
+ mutable bool m_changed;
+public:
+ NBestList(size_t const max_size, CMP const& cmp);
+ NBestList(size_t const max_size);
+ bool add(THINGY const& item);
+ THINGY const& operator[](int i) const;
+ size_t size() const {
+ return m_heap.size();
+ }
+};
+
+template<typename THINGY, typename CMP>
+NBestList<THINGY,CMP>::
+NBestList(size_t const max_size, CMP const& cmp)
+ : m_better(m_list, cmp), m_changed(false)
+{
+ m_heap.reserve(max_size);
+}
+
+template<typename THINGY, typename CMP>
+NBestList<THINGY,CMP>::
+NBestList(size_t const max_size)
+ : m_better(m_heap), m_changed(false)
+{
+ m_heap.reserve(max_size);
+}
+
+template<typename THINGY, typename CMP>
+bool
+NBestList<THINGY,CMP>::
+add(THINGY const& item)
+{
+ if (m_heap.size() == m_heap.capacity()) {
+ if (m_better.Compare(item, m_list[m_heap.at(0)])) {
+ pop_heap(m_heap.begin(),m_heap.end(),m_better);
+ m_list[m_heap.back()] = item;
+ } else return false;
+ } else {
+ m_list.push_back(item);
+ m_heap.push_back(m_heap.size());
+ }
+ push_heap(m_heap.begin(),m_heap.end(),m_better);
+ return m_changed = true;
+}
+
+template<typename THINGY, typename CMP>
+THINGY const&
+NBestList<THINGY,CMP>::
+operator[](int i) const
+{
+ if (m_changed) {
+ m_order.assign(m_heap.begin(),m_heap.end());
+ for (size_t k = m_heap.size(); k != 0; --k)
+ pop_heap(m_order.begin(), m_order.begin()+k);
+ m_changed = false;
+ }
+ if (i < 0) i += m_order.size();
+ return m_list[m_order.at(i)];
+}
+
+}
+#endif
diff --git a/moses/generic/sorting/VectorIndexSorter.h b/moses/generic/sorting/VectorIndexSorter.h
new file mode 100644
index 000000000..a689c417b
--- /dev/null
+++ b/moses/generic/sorting/VectorIndexSorter.h
@@ -0,0 +1,69 @@
+#ifndef __vector_index_sorter_h
+#define __vector_index_sorter_h
+
+// VectorIndexSorter; (c) 2007-2012 Ulrich Germann
+
+// A VectorIndexSorter is a function object for sorting indices into a vector
+// of objects (instead of sorting the vector itself).
+//
+// typcial use:
+// vector<thingy> my_vector;
+// VectorIndexSorter<thingy,less<thingy>,int> sorter(my_vector);
+// vector<int> order;
+// sorter.get_order(order);
+
+namespace Moses
+{
+template<typename VAL, typename COMP = greater<VAL>, typename IDX_T=size_t>
+class
+ VectorIndexSorter : public binary_function<IDX_T const&, IDX_T const&, bool>
+{
+ vector<VAL> const& m_vecref;
+ boost::shared_ptr<COMP> m_comp;
+public:
+
+ COMP const& Compare;
+ VectorIndexSorter(vector<VAL> const& v, COMP const& comp)
+ : m_vecref(v), Compare(comp)
+ { }
+
+ VectorIndexSorter(vector<VAL> const& v)
+ : m_vecref(v), m_comp(new COMP()), Compare(*m_comp)
+ { }
+
+ bool operator()(IDX_T const & a, IDX_T const & b) const {
+ bool fwd = Compare(m_vecref.at(a) ,m_vecref.at(b));
+ bool bwd = Compare(m_vecref[b], m_vecref[a]);
+ return (fwd == bwd ? a < b : fwd);
+ }
+
+ boost::shared_ptr<vector<IDX_T> >
+ GetOrder() const;
+
+ void
+ GetOrder(vector<IDX_T> & order) const;
+
+};
+
+template<typename VAL, typename COMP, typename IDX_T>
+boost::shared_ptr<vector<IDX_T> >
+VectorIndexSorter<VAL,COMP,IDX_T>::
+GetOrder() const
+{
+ boost::shared_ptr<vector<IDX_T> > ret(new vector<IDX_T>(m_vecref.size()));
+ get_order(*ret);
+ return ret;
+}
+
+template<typename VAL, typename COMP, typename IDX_T>
+void
+VectorIndexSorter<VAL,COMP,IDX_T>::
+GetOrder(vector<IDX_T> & order) const
+{
+ order.resize(m_vecref.size());
+ for (IDX_T i = 0; i < IDX_T(m_vecref.size()); ++i) order[i] = i;
+ sort(order.begin(), order.end(), *this);
+}
+
+}
+#endif