Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Kenneth Heafield <github@kheafield.com> 2013-04-25 22:42:30 +0400
committer Kenneth Heafield <github@kheafield.com> 2013-04-25 22:42:30 +0400
commit f1d366381033c0caae18f8d15305ded38734bdbf (patch)
tree 22b0cbd3acc337a995701629bf9facbe179f5618
parent 8a1e944bb428a0af9f6c82c26e5633361ce4052c (diff)
Back FactorCollection with a memory pool. Less memory for large vocabularies.
-rw-r--r-- OnDiskPt/OnDiskWrapper.cpp | 6
-rw-r--r-- moses/ChartParser.cpp | 3
-rw-r--r-- moses/Factor.h | 8
-rw-r--r-- moses/FactorCollection.cpp | 34
-rw-r--r-- moses/FactorCollection.h | 15
-rw-r--r-- moses/FeatureVector.cpp | 21
-rw-r--r-- moses/FeatureVector.h | 13
-rw-r--r-- moses/GlobalLexicalModelUnlimited.cpp | 77
-rw-r--r-- moses/GlobalLexicalModelUnlimited.h | 17
-rw-r--r-- moses/LM/IRST.cpp | 3
-rw-r--r-- moses/Phrase.cpp | 5
-rw-r--r-- moses/PhrasePairFeature.cpp | 8
-rw-r--r-- moses/PhrasePairFeature.h | 5
-rw-r--r-- moses/ScoreComponentCollection.h | 6
-rw-r--r-- moses/SourceWordDeletionFeature.cpp | 5
-rw-r--r-- moses/SourceWordDeletionFeature.h | 4
-rw-r--r-- moses/TargetBigramFeature.cpp | 21
-rw-r--r-- moses/TargetBigramFeature.h | 4
-rw-r--r-- moses/TargetNgramFeature.cpp | 16
-rw-r--r-- moses/TargetNgramFeature.h | 4
-rw-r--r-- moses/TargetWordInsertionFeature.cpp | 5
-rw-r--r-- moses/TargetWordInsertionFeature.h | 4
-rw-r--r-- moses/TranslationOptionCollection.cpp | 2
-rw-r--r-- moses/Word.cpp | 9
-rw-r--r-- moses/Word.h | 2
-rw-r--r-- moses/WordTranslationFeature.cpp | 43
-rw-r--r-- moses/WordTranslationFeature.h | 8
27 files changed, 169 insertions, 179 deletions
diff --git a/OnDiskPt/OnDiskWrapper.cpp b/OnDiskPt/OnDiskWrapper.cpp
index 743a77db1..3a1773c0a 100644
--- a/OnDiskPt/OnDiskWrapper.cpp
+++ b/OnDiskPt/OnDiskWrapper.cpp
@@ -207,8 +207,7 @@ Word *OnDiskWrapper::ConvertFromMoses(Moses::FactorDirection /* direction */
size_t factorType = factorsVec[0];
const Moses::Factor *factor = origWord.GetFactor(factorType);
CHECK(factor);
- string str = factor->GetString();
- strme << str;
+ strme << factor->GetString();
for (size_t ind = 1 ; ind < factorsVec.size() ; ++ind) {
size_t factorType = factorsVec[ind];
@@ -218,8 +217,7 @@ Word *OnDiskWrapper::ConvertFromMoses(Moses::FactorDirection /* direction */
break;
}
CHECK(factor);
- string str = factor->GetString();
- strme << "|" << str;
+ strme << "|" << factor->GetString();
} // for (size_t factorType
bool found;
diff --git a/moses/ChartParser.cpp b/moses/ChartParser.cpp
index ea55a46a6..5331a5fe4 100644
--- a/moses/ChartParser.cpp
+++ b/moses/ChartParser.cpp
@@ -49,8 +49,7 @@ void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range
size_t isDigit = 0;
if (staticData.GetDropUnknown()) {
const Factor *f = sourceWord[0]; // TODO hack. shouldn't know which factor is surface
- const string &s = f->GetString();
- isDigit = s.find_first_of("0123456789");
+ isDigit = f->GetString().find_first_of("0123456789");
if (isDigit == string::npos)
isDigit = 0;
else
diff --git a/moses/Factor.h b/moses/Factor.h
index ac1b591ed..87e8f8028 100644
--- a/moses/Factor.h
+++ b/moses/Factor.h
@@ -26,6 +26,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <string>
#include "TypeDef.h"
#include "Util.h"
+#include "util/string_piece.hh"
namespace Moses
{
@@ -44,8 +45,9 @@ class Factor
friend class FactorCollection;
friend struct FactorFriend;
- // FactorCollection writes here.
- std::string m_string;
+ // FactorCollection writes here.
+ // This is mutable so the pointer can be changed to pool-backed memory.
+ mutable StringPiece m_string;
size_t m_id;
//! protected constructor. only friend class, FactorCollection, is allowed to create Factor objects
@@ -59,7 +61,7 @@ class Factor
public:
//! original string representation of the factor
- inline const std::string &GetString() const {
+ StringPiece GetString() const {
return m_string;
}
//! contiguous ID
diff --git a/moses/FactorCollection.cpp b/moses/FactorCollection.cpp
index 849830f4d..969bb39d1 100644
--- a/moses/FactorCollection.cpp
+++ b/moses/FactorCollection.cpp
@@ -27,6 +27,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <string>
#include "FactorCollection.h"
#include "Util.h"
+#include "util/pool.hh"
using namespace std;
@@ -36,42 +37,23 @@ FactorCollection FactorCollection::s_instance;
const Factor *FactorCollection::AddFactor(const StringPiece &factorString)
{
-// Sorry this is so complicated. Can't we just require everybody to use Boost >= 1.42? The issue is that I can't check BOOST_VERSION unless we have Boost.
-#ifdef WITH_THREADS
-
-#if BOOST_VERSION < 104200
FactorFriend to_ins;
- to_ins.in.m_string.assign(factorString.data(), factorString.size());
-#endif // BOOST_VERSION
+ to_ins.in.m_string = factorString;
+ to_ins.in.m_id = m_factorId;
+ // If we're threaded, hope a read-only lock is sufficient.
+#ifdef WITH_THREADS
{ // read=lock scope
boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
-#if BOOST_VERSION >= 104200
- // If this line doesn't compile, upgrade your Boost.
- Set::const_iterator i = m_set.find(factorString, HashFactor(), EqualsFactor());
-#else // BOOST_VERSION
Set::const_iterator i = m_set.find(to_ins);
-#endif // BOOST_VERSION
if (i != m_set.end()) return &i->in;
}
boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
-#if BOOST_VERSION >= 104200
- FactorFriend to_ins;
- to_ins.in.m_string.assign(factorString.data(), factorString.size());
-#endif // BOOST_VERSION
-
-#else // WITH_THREADS
-
-#if BOOST_VERSION >= 104200
- Set::const_iterator i = m_set.find(factorString, HashFactor(), EqualsFactor());
- if (i != m_set.end()) return &i->in;
-#endif
- FactorFriend to_ins;
- to_ins.in.m_string.assign(factorString.data(), factorString.size());
-
#endif // WITH_THREADS
- to_ins.in.m_id = m_factorId;
std::pair<Set::iterator, bool> ret(m_set.insert(to_ins));
if (ret.second) {
+ ret.first->in.m_string.set(
+ memcpy(m_string_backing.Allocate(factorString.size()), factorString.data(), factorString.size()),
+ factorString.size());
m_factorId++;
}
return &ret.first->in;
diff --git a/moses/FactorCollection.h b/moses/FactorCollection.h
index 9a01766f4..e7749244f 100644
--- a/moses/FactorCollection.h
+++ b/moses/FactorCollection.h
@@ -33,6 +33,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <string>
#include "util/string_piece.hh"
+#include "util/pool.hh"
#include "Factor.h"
namespace Moses
@@ -62,27 +63,20 @@ class FactorCollection
friend std::ostream& operator<<(std::ostream&, const FactorCollection&);
struct HashFactor : public std::unary_function<const FactorFriend &, std::size_t> {
- std::size_t operator()(const StringPiece &str) const {
- return util::MurmurHashNative(str.data(), str.size());
- }
std::size_t operator()(const FactorFriend &factor) const {
- return (*this)(factor.in.GetString());
+ return util::MurmurHashNative(factor.in.m_string.data(), factor.in.m_string.size());
}
};
struct EqualsFactor : public std::binary_function<const FactorFriend &, const FactorFriend &, bool> {
bool operator()(const FactorFriend &left, const FactorFriend &right) const {
return left.in.GetString() == right.in.GetString();
}
- bool operator()(const FactorFriend &left, const StringPiece &right) const {
- return left.in.GetString() == right;
- }
- bool operator()(const StringPiece &left, const FactorFriend &right) const {
- return left == right.in.GetString();
- }
};
typedef boost::unordered_set<FactorFriend, HashFactor, EqualsFactor> Set;
Set m_set;
+ util::Pool m_string_backing;
+
static FactorCollection s_instance;
#ifdef WITH_THREADS
//reader-writer lock
@@ -117,6 +111,5 @@ public:
};
-
}
#endif
diff --git a/moses/FeatureVector.cpp b/moses/FeatureVector.cpp
index c01775fd1..6cbddf3d7 100644
--- a/moses/FeatureVector.cpp
+++ b/moses/FeatureVector.cpp
@@ -26,6 +26,7 @@
#include <stdexcept>
#include "FeatureVector.h"
+#include "util/string_piece_hash.hh"
using namespace std;
@@ -41,12 +42,12 @@ namespace Moses {
boost::shared_mutex FName::m_idLock;
#endif
- void FName::init(const string& name) {
+ void FName::init(const StringPiece &name) {
#ifdef WITH_THREADS
//reader lock
boost::shared_lock<boost::shared_mutex> lock(m_idLock);
#endif
- Name2Id::iterator i = name2id.find(name);
+ Name2Id::iterator i = FindStringPiece(name2id, name);
if (i != name2id.end()) {
m_id = i->second;
} else {
@@ -55,15 +56,15 @@ namespace Moses {
lock.unlock();
boost::unique_lock<boost::shared_mutex> write_lock(m_idLock);
#endif
- //Need to check again if the id is in the map, as someone may have added
- //it while we were waiting on the writer lock.
- if (i != name2id.end()) {
- m_id = i->second;
- } else {
- m_id = name2id.size();
- name2id[name] = m_id;
- id2name.push_back(name);
+ std::pair<std::string, size_t> to_ins;
+ to_ins.first.assign(name.data(), name.size());
+ to_ins.second = name2id.size();
+ std::pair<Name2Id::iterator, bool> res(name2id.insert(to_ins));
+ if (res.second) {
+ // TODO this should be string pointers backed by the hash table.
+ id2name.push_back(to_ins.first);
}
+ m_id = res.first->second;
}
}
diff --git a/moses/FeatureVector.h b/moses/FeatureVector.h
index 983248076..4401e3c03 100644
--- a/moses/FeatureVector.h
+++ b/moses/FeatureVector.h
@@ -45,6 +45,7 @@
#endif
#include "util/check.hh"
+#include "util/string_piece.hh"
namespace Moses {
@@ -68,9 +69,13 @@ namespace Moses {
//A feature name can either be initialised as a pair of strings,
//which will be concatenated with a SEP between them, or as
//a single string, which will be used as-is.
- explicit FName(const std::string root, const std::string name)
- {init(root + SEP + name);}
- explicit FName(const std::string& name)
+ FName(const StringPiece &root, const StringPiece &name) {
+ std::string assembled(root.data(), root.size());
+ assembled += SEP;
+ assembled.append(name.data(), name.size());
+ init(assembled);
+ }
+ explicit FName(const StringPiece &name)
{init(name);}
const std::string& name() const;
@@ -89,7 +94,7 @@ namespace Moses {
static void eraseId(size_t id);
private:
- void init(const std::string& name);
+ void init(const StringPiece& name);
size_t m_id;
#ifdef WITH_THREADS
//reader-writer lock
diff --git a/moses/GlobalLexicalModelUnlimited.cpp b/moses/GlobalLexicalModelUnlimited.cpp
index f1de65bd0..cd8299e46 100644
--- a/moses/GlobalLexicalModelUnlimited.cpp
+++ b/moses/GlobalLexicalModelUnlimited.cpp
@@ -3,6 +3,8 @@
#include "StaticData.h"
#include "InputFileStream.h"
#include "UserMessage.h"
+#include "util/string_piece_hash.hh"
+#include "util/murmur_hash.hh"
using namespace std;
@@ -57,11 +59,11 @@ void GlobalLexicalModelUnlimited::Evaluate(const Hypothesis& cur_hypo, ScoreComp
const TargetPhrase& targetPhrase = cur_hypo.GetCurrTargetPhrase();
for(int targetIndex = 0; targetIndex < targetPhrase.GetSize(); targetIndex++ ) {
- string targetString = targetPhrase.GetWord(targetIndex).GetString(0); // TODO: change for other factors
+ StringPiece targetString = targetPhrase.GetWord(targetIndex).GetString(0); // TODO: change for other factors
if (m_ignorePunctuation) {
// check if first char is punctuation
- char firstChar = targetString.at(0);
+ char firstChar = targetString[0];
CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
if(charIterator != m_punctuationHash.end())
continue;
@@ -76,23 +78,24 @@ void GlobalLexicalModelUnlimited::Evaluate(const Hypothesis& cur_hypo, ScoreComp
accumulator->SparsePlusEquals(feature.str(), 1);
}
- StringHash alreadyScored;
+ boost::unordered_set<uint64_t> alreadyScored;
for(int sourceIndex = 0; sourceIndex < input.GetSize(); sourceIndex++ ) {
- string sourceString = input.GetWord(sourceIndex).GetString(0); // TODO: change for other factors
+ const StringPiece sourceString = input.GetWord(sourceIndex).GetString(0); // TODO: change for other factors
if (m_ignorePunctuation) {
// check if first char is punctuation
- char firstChar = sourceString.at(0);
+ char firstChar = sourceString[0];
CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
if(charIterator != m_punctuationHash.end())
continue;
}
+ const uint64_t sourceHash = util::MurmurHashNative(sourceString.data(), sourceString.size());
- if ( alreadyScored.find(sourceString) == alreadyScored.end()) {
+ if (alreadyScored.find(sourceHash) == alreadyScored.end()) {
bool sourceExists, targetExists;
if (!m_unrestricted) {
- sourceExists = m_vocabSource.find( sourceString ) != m_vocabSource.end();
- targetExists = m_vocabTarget.find( targetString) != m_vocabTarget.end();
+ sourceExists = FindStringPiece(m_vocabSource, sourceString) != m_vocabSource.end();
+ targetExists = FindStringPiece(m_vocabTarget, targetString) != m_vocabTarget.end();
}
// no feature if vocab is in use and both words are not in restricted vocabularies
@@ -107,15 +110,15 @@ void GlobalLexicalModelUnlimited::Evaluate(const Hypothesis& cur_hypo, ScoreComp
feature << "<s>,";
feature << sourceString;
accumulator->SparsePlusEquals(feature.str(), 1);
- alreadyScored[sourceString] = 1;
+ alreadyScored.insert(sourceHash);
}
// add source words to the right of current source word as context
for(int contextIndex = sourceIndex+1; contextIndex < input.GetSize(); contextIndex++ ) {
- string contextString = input.GetWord(contextIndex).GetString(0); // TODO: change for other factors
+ StringPiece contextString = input.GetWord(contextIndex).GetString(0); // TODO: change for other factors
bool contextExists;
if (!m_unrestricted)
- contextExists = m_vocabSource.find( contextString ) != m_vocabSource.end();
+ contextExists = FindStringPiece(m_vocabSource, contextString ) != m_vocabSource.end();
if (m_unrestricted || contextExists) {
stringstream feature;
@@ -126,7 +129,7 @@ void GlobalLexicalModelUnlimited::Evaluate(const Hypothesis& cur_hypo, ScoreComp
feature << ",";
feature << contextString;
accumulator->SparsePlusEquals(feature.str(), 1);
- alreadyScored[sourceString] = 1;
+ alreadyScored.insert(sourceHash);
}
}
}
@@ -135,7 +138,7 @@ void GlobalLexicalModelUnlimited::Evaluate(const Hypothesis& cur_hypo, ScoreComp
int globalTargetIndex = cur_hypo.GetSize() - targetPhrase.GetSize() + targetIndex;
// 1) source-target pair, trigger source word (can be discont.) and adjacent target word (bigram)
- string targetContext;
+ StringPiece targetContext;
if (globalTargetIndex > 0)
targetContext = cur_hypo.GetWord(globalTargetIndex-1).GetString(0); // TODO: change for other factors
else
@@ -143,23 +146,23 @@ void GlobalLexicalModelUnlimited::Evaluate(const Hypothesis& cur_hypo, ScoreComp
if (sourceIndex == 0) {
string sourceTrigger = "<s>";
- AddFeature(accumulator, alreadyScored, sourceTrigger, sourceString,
+ AddFeature(accumulator, sourceTrigger, sourceString,
targetContext, targetString);
}
else
for(int contextIndex = sourceIndex-1; contextIndex >= 0; contextIndex-- ) {
- string sourceTrigger = input.GetWord(contextIndex).GetString(0); // TODO: change for other factors
+ StringPiece sourceTrigger = input.GetWord(contextIndex).GetString(0); // TODO: change for other factors
bool sourceTriggerExists = false;
if (!m_unrestricted)
- sourceTriggerExists = m_vocabSource.find( sourceTrigger ) != m_vocabSource.end();
+ sourceTriggerExists = FindStringPiece(m_vocabSource, sourceTrigger) != m_vocabSource.end();
if (m_unrestricted || sourceTriggerExists)
- AddFeature(accumulator, alreadyScored, sourceTrigger, sourceString,
+ AddFeature(accumulator, sourceTrigger, sourceString,
targetContext, targetString);
}
// 2) source-target pair, adjacent source word (bigram) and trigger target word (can be discont.)
- string sourceContext;
+ StringPiece sourceContext;
if (sourceIndex-1 >= 0)
sourceContext = input.GetWord(sourceIndex-1).GetString(0); // TODO: change for other factors
else
@@ -167,18 +170,18 @@ void GlobalLexicalModelUnlimited::Evaluate(const Hypothesis& cur_hypo, ScoreComp
if (globalTargetIndex == 0) {
string targetTrigger = "<s>";
- AddFeature(accumulator, alreadyScored, sourceContext, sourceString,
+ AddFeature(accumulator, sourceContext, sourceString,
targetTrigger, targetString);
}
else
for(int globalContextIndex = globalTargetIndex-1; globalContextIndex >= 0; globalContextIndex-- ) {
- string targetTrigger = cur_hypo.GetWord(globalContextIndex).GetString(0); // TODO: change for other factors
+ StringPiece targetTrigger = cur_hypo.GetWord(globalContextIndex).GetString(0); // TODO: change for other factors
bool targetTriggerExists = false;
if (!m_unrestricted)
- targetTriggerExists = m_vocabTarget.find( targetTrigger ) != m_vocabTarget.end();
+ targetTriggerExists = FindStringPiece(m_vocabTarget, targetTrigger) != m_vocabTarget.end();
if (m_unrestricted || targetTriggerExists)
- AddFeature(accumulator, alreadyScored, sourceContext, sourceString,
+ AddFeature(accumulator, sourceContext, sourceString,
targetTrigger, targetString);
}
}
@@ -195,19 +198,19 @@ void GlobalLexicalModelUnlimited::Evaluate(const Hypothesis& cur_hypo, ScoreComp
bool targetTriggerExists = true;
if (m_unrestricted || (sourceTriggerExists && targetTriggerExists))
- AddFeature(accumulator, alreadyScored, sourceTrigger, sourceString,
+ AddFeature(accumulator, sourceTrigger, sourceString,
targetTrigger, targetString);
}
else {
// iterate backwards over target
for(int globalContextIndex = globalTargetIndex-1; globalContextIndex >= 0; globalContextIndex-- ) {
- string targetTrigger = cur_hypo.GetWord(globalContextIndex).GetString(0); // TODO: change for other factors
+ StringPiece targetTrigger = cur_hypo.GetWord(globalContextIndex).GetString(0); // TODO: change for other factors
bool targetTriggerExists = false;
if (!m_unrestricted)
- targetTriggerExists = m_vocabTarget.find( targetTrigger ) != m_vocabTarget.end();
+ targetTriggerExists = FindStringPiece(m_vocabTarget, targetTrigger) != m_vocabTarget.end();
if (m_unrestricted || (sourceTriggerExists && targetTriggerExists))
- AddFeature(accumulator, alreadyScored, sourceTrigger, sourceString,
+ AddFeature(accumulator, sourceTrigger, sourceString,
targetTrigger, targetString);
}
}
@@ -216,29 +219,29 @@ void GlobalLexicalModelUnlimited::Evaluate(const Hypothesis& cur_hypo, ScoreComp
else {
// iterate backwards over source
for(int contextIndex = sourceIndex-1; contextIndex >= 0; contextIndex-- ) {
- string sourceTrigger = input.GetWord(contextIndex).GetString(0); // TODO: change for other factors
+ StringPiece sourceTrigger = input.GetWord(contextIndex).GetString(0); // TODO: change for other factors
bool sourceTriggerExists = false;
if (!m_unrestricted)
- sourceTriggerExists = m_vocabSource.find( sourceTrigger ) != m_vocabSource.end();
+ sourceTriggerExists = FindStringPiece(m_vocabSource, sourceTrigger) != m_vocabSource.end();
if (globalTargetIndex == 0) {
string targetTrigger = "<s>";
bool targetTriggerExists = true;
if (m_unrestricted || (sourceTriggerExists && targetTriggerExists))
- AddFeature(accumulator, alreadyScored, sourceTrigger, sourceString,
+ AddFeature(accumulator, sourceTrigger, sourceString,
targetTrigger, targetString);
}
else {
// iterate backwards over target
for(int globalContextIndex = globalTargetIndex-1; globalContextIndex >= 0; globalContextIndex-- ) {
- string targetTrigger = cur_hypo.GetWord(globalContextIndex).GetString(0); // TODO: change for other factors
+ StringPiece targetTrigger = cur_hypo.GetWord(globalContextIndex).GetString(0); // TODO: change for other factors
bool targetTriggerExists = false;
if (!m_unrestricted)
- targetTriggerExists = m_vocabTarget.find( targetTrigger ) != m_vocabTarget.end();
+ targetTriggerExists = FindStringPiece(m_vocabTarget, targetTrigger) != m_vocabTarget.end();
if (m_unrestricted || (sourceTriggerExists && targetTriggerExists))
- AddFeature(accumulator, alreadyScored, sourceTrigger, sourceString,
+ AddFeature(accumulator, sourceTrigger, sourceString,
targetTrigger, targetString);
}
}
@@ -252,8 +255,7 @@ void GlobalLexicalModelUnlimited::Evaluate(const Hypothesis& cur_hypo, ScoreComp
feature << "~";
feature << sourceString;
accumulator->SparsePlusEquals(feature.str(), 1);
- //alreadyScored.insert( &inputWord );
- alreadyScored[sourceString] = 1;
+ alreadyScored.insert(sourceHash);
}
}
}
@@ -262,8 +264,8 @@ void GlobalLexicalModelUnlimited::Evaluate(const Hypothesis& cur_hypo, ScoreComp
}
void GlobalLexicalModelUnlimited::AddFeature(ScoreComponentCollection* accumulator,
- StringHash alreadyScored, string sourceTrigger, string sourceWord, string targetTrigger,
- string targetWord) const {
+ StringPiece sourceTrigger, StringPiece sourceWord, StringPiece targetTrigger,
+ StringPiece targetWord) const {
stringstream feature;
feature << "glm_";
feature << targetTrigger;
@@ -274,7 +276,8 @@ void GlobalLexicalModelUnlimited::AddFeature(ScoreComponentCollection* accumulat
feature << ",";
feature << sourceWord;
accumulator->SparsePlusEquals(feature.str(), 1);
- alreadyScored[sourceWord] = 1;
+ // BUG(ehasler): this did nothing because alreadyScored was passed by value not reference.
+ //alreadyScored[sourceWord] = 1;
}
}
diff --git a/moses/GlobalLexicalModelUnlimited.h b/moses/GlobalLexicalModelUnlimited.h
index 307461db0..2358e1d19 100644
--- a/moses/GlobalLexicalModelUnlimited.h
+++ b/moses/GlobalLexicalModelUnlimited.h
@@ -12,9 +12,12 @@
#include "FeatureFunction.h"
#include "FactorTypeSet.h"
#include "Sentence.h"
-
#include "FFState.h"
+#include "util/string_piece.hh"
+#include <boost/unordered_set.hpp>
+#include <boost/unordered_map.hpp>
+
#ifdef WITH_THREADS
#include <boost/thread/tss.hpp>
#endif
@@ -35,8 +38,8 @@ class InputType;
class GlobalLexicalModelUnlimited : public StatelessFeatureFunction
{
+ // TODO(ehasler): This should be an array of size 256.
typedef std::map< char, short > CharHash;
- typedef std::map< std::string, short > StringHash;
struct ThreadLocalStorage
{
@@ -64,8 +67,8 @@ private:
float m_sparseProducerWeight;
bool m_ignorePunctuation;
- std::set<std::string> m_vocabSource;
- std::set<std::string> m_vocabTarget;
+ boost::unordered_set<std::string> m_vocabSource;
+ boost::unordered_set<std::string> m_vocabTarget;
public:
GlobalLexicalModelUnlimited(const std::vector< FactorType >& inFactors, const std::vector< FactorType >& outFactors,
@@ -137,9 +140,9 @@ public:
void SetSparseProducerWeight(float weight) { m_sparseProducerWeight = weight; }
float GetSparseProducerWeight() const { return m_sparseProducerWeight; }
- void AddFeature(ScoreComponentCollection* accumulator, StringHash alreadyScored,
- std::string sourceTrigger, std::string sourceWord, std::string targetTrigger,
- std::string targetWord) const;
+ void AddFeature(ScoreComponentCollection* accumulator,
+ StringPiece sourceTrigger, StringPiece sourceWord, StringPiece targetTrigger,
+ StringPiece targetWord) const;
};
}
diff --git a/moses/LM/IRST.cpp b/moses/LM/IRST.cpp
index 2748fa1ba..a477fe20d 100644
--- a/moses/LM/IRST.cpp
+++ b/moses/LM/IRST.cpp
@@ -154,7 +154,8 @@ int LanguageModelIRST::GetLmID( const Factor *factor ) const
if ((factorId >= m_lmIdLookup.size()) || (m_lmIdLookup[factorId] == m_empty)) {
if (d->incflag()==1) {
- std::string s = factor->GetString();
+ const StringPiece &f = factor->GetString();
+ std::string s(f.data(), f.size());
int code = d->encode(s.c_str());
//////////
diff --git a/moses/Phrase.cpp b/moses/Phrase.cpp
index 2e020ef69..7981abdfd 100644
--- a/moses/Phrase.cpp
+++ b/moses/Phrase.cpp
@@ -265,9 +265,8 @@ bool Phrase::Contains(const vector< vector<string> > &subPhraseVector
FactorType factorType = inputFactor[currFactorIndex];
for (size_t currSubPos = 0 ; currSubPos < subSize ; currSubPos++) {
size_t currThisPos = currSubPos + currStartPos;
- const string &subStr = subPhraseVector[currSubPos][currFactorIndex]
- ,&thisStr = GetFactor(currThisPos, factorType)->GetString();
- if (subStr != thisStr) {
+ const string &subStr = subPhraseVector[currSubPos][currFactorIndex];
+ if (subStr != GetFactor(currThisPos, factorType)->GetString()) {
match = false;
break;
}
diff --git a/moses/PhrasePairFeature.cpp b/moses/PhrasePairFeature.cpp
index 020292748..ba0a7343d 100644
--- a/moses/PhrasePairFeature.cpp
+++ b/moses/PhrasePairFeature.cpp
@@ -5,7 +5,7 @@
#include "TargetPhrase.h"
#include "Hypothesis.h"
#include "TranslationOption.h"
-#include <boost/algorithm/string.hpp>
+#include "util/string_piece_hash.hh"
using namespace std;
@@ -182,10 +182,10 @@ void PhrasePairFeature::Evaluate(
// range over source words to get context
for(size_t contextIndex = 0; contextIndex < input.GetSize(); contextIndex++ ) {
- string sourceTrigger = input.GetWord(contextIndex).GetFactor(m_sourceFactorId)->GetString();
+ StringPiece sourceTrigger = input.GetWord(contextIndex).GetFactor(m_sourceFactorId)->GetString();
if (m_ignorePunctuation) {
// check if trigger is punctuation
- char firstChar = sourceTrigger.at(0);
+ char firstChar = sourceTrigger.data()[0];
CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
if(charIterator != m_punctuationHash.end())
continue;
@@ -193,7 +193,7 @@ void PhrasePairFeature::Evaluate(
bool sourceTriggerExists = false;
if (!m_unrestricted)
- sourceTriggerExists = m_vocabSource.find( sourceTrigger ) != m_vocabSource.end();
+ sourceTriggerExists = (FindStringPiece(m_vocabSource, sourceTrigger) != m_vocabSource.end());
if (m_unrestricted || sourceTriggerExists) {
ostringstream namestr;
diff --git a/moses/PhrasePairFeature.h b/moses/PhrasePairFeature.h
index d7cc3ea48..ac51aa61e 100644
--- a/moses/PhrasePairFeature.h
+++ b/moses/PhrasePairFeature.h
@@ -1,6 +1,7 @@
#ifndef moses_PhrasePairFeature_h
#define moses_PhrasePairFeature_h
+#include <boost/unordered_set.hpp>
#include <stdexcept>
#include "Factor.h"
@@ -13,11 +14,11 @@ namespace Moses {
* Phrase pair feature: complete source/target phrase pair
**/
class PhrasePairFeature: public StatelessFeatureFunction {
-
+ // TODO(ehasler): This should be bool ispunct[256];
typedef std::map< char, short > CharHash;
typedef std::vector< std::set<std::string> > DocumentVector;
- std::set<std::string> m_vocabSource;
+ boost::unordered_set<std::string> m_vocabSource;
//std::set<std::string> m_vocabTarget;
DocumentVector m_vocabDomain;
FactorType m_sourceFactorId;
diff --git a/moses/ScoreComponentCollection.h b/moses/ScoreComponentCollection.h
index be23e03fd..32b8cb1c9 100644
--- a/moses/ScoreComponentCollection.h
+++ b/moses/ScoreComponentCollection.h
@@ -200,7 +200,7 @@ public:
}
//For features which have an unbounded number of components
- void SparseMinusEquals(const std::string& full_name, float score)
+ void SparseMinusEquals(const StringPiece &full_name, float score)
{
FName fname(full_name);
m_scores[fname] -= score;
@@ -240,7 +240,7 @@ public:
}
//For features which have an unbounded number of components
- void PlusEquals(const ScoreProducer*sp, const std::string& name, float score)
+ void PlusEquals(const ScoreProducer*sp, const StringPiece &name, float score)
{
CHECK(sp->GetNumScoreComponents() == ScoreProducer::unlimited);
FName fname(sp->GetScoreProducerDescription(),name);
@@ -248,7 +248,7 @@ public:
}
//For features which have an unbounded number of components
- void SparsePlusEquals(const std::string& full_name, float score)
+ void SparsePlusEquals(const StringPiece &full_name, float score)
{
FName fname(full_name);
m_scores[fname] += score;
diff --git a/moses/SourceWordDeletionFeature.cpp b/moses/SourceWordDeletionFeature.cpp
index c312a3b03..082e0900b 100644
--- a/moses/SourceWordDeletionFeature.cpp
+++ b/moses/SourceWordDeletionFeature.cpp
@@ -6,6 +6,7 @@
#include "ChartHypothesis.h"
#include "ScoreComponentCollection.h"
#include "TranslationOption.h"
+#include "util/string_piece_hash.hh"
namespace Moses {
@@ -70,9 +71,9 @@ void SourceWordDeletionFeature::ComputeFeatures(const TargetPhrase& targetPhrase
if (!aligned[i]) {
Word w = targetPhrase.GetSourcePhrase().GetWord(i);
if (!w.IsNonTerminal()) {
- const string &word = w.GetFactor(m_factorType)->GetString();
+ const StringPiece &word = w.GetFactor(m_factorType)->GetString();
if (word != "<s>" && word != "</s>") {
- if (!m_unrestricted && m_vocab.find( word ) == m_vocab.end()) {
+ if (!m_unrestricted && FindStringPiece(m_vocab, word) == m_vocab.end()) {
accumulator->PlusEquals(this,"OTHER",1);
}
else {
diff --git a/moses/SourceWordDeletionFeature.h b/moses/SourceWordDeletionFeature.h
index d34aa92f5..b503b4670 100644
--- a/moses/SourceWordDeletionFeature.h
+++ b/moses/SourceWordDeletionFeature.h
@@ -2,7 +2,7 @@
#define moses_SourceWordDeletionFeature_h
#include <string>
-#include <map>
+#include <boost/unordered_set.hpp>
#include "FeatureFunction.h"
#include "FactorCollection.h"
@@ -15,7 +15,7 @@ namespace Moses
*/
class SourceWordDeletionFeature : public StatelessFeatureFunction {
private:
- std::set<std::string> m_vocab;
+ boost::unordered_set<std::string> m_vocab;
FactorType m_factorType;
bool m_unrestricted;
diff --git a/moses/TargetBigramFeature.cpp b/moses/TargetBigramFeature.cpp
index a9ad2216b..64942e947 100644
--- a/moses/TargetBigramFeature.cpp
+++ b/moses/TargetBigramFeature.cpp
@@ -3,6 +3,7 @@
#include "TargetPhrase.h"
#include "Hypothesis.h"
#include "ScoreComponentCollection.h"
+#include "util/string_piece_hash.hh"
namespace Moses {
@@ -71,24 +72,26 @@ FFState* TargetBigramFeature::Evaluate(const Hypothesis& cur_hypo,
f1 = targetPhrase.GetWord(i-1).GetFactor(m_factorType);
}
const Factor* f2 = targetPhrase.GetWord(i).GetFactor(m_factorType);
- const string& w1 = f1->GetString();
- const string& w2 = f2->GetString();
+ StringPiece w1(f1->GetString()), w2(f2->GetString());
// skip bigrams if they don't belong to a given restricted vocabulary
- if (m_vocab.size() &&
- (m_vocab.find(w1) == m_vocab.end() || m_vocab.find(w2) == m_vocab.end())) {
+ if (m_vocab.size() &&
+ (FindStringPiece(m_vocab, w1) == m_vocab.end() || FindStringPiece(m_vocab, w2) == m_vocab.end())) {
continue;
}
-
- string name(w1 +":"+w2);
+ string name(w1.data(), w1.size());
+ name += ':';
+ name.append(w2.data(), w2.size());
accumulator->PlusEquals(this,name,1);
}
if (cur_hypo.GetWordsBitmap().IsComplete()) {
- const string& w1 = targetPhrase.GetWord(targetPhrase.GetSize()-1).GetFactor(m_factorType)->GetString();
+ StringPiece w1(targetPhrase.GetWord(targetPhrase.GetSize()-1).GetFactor(m_factorType)->GetString());
const string& w2 = EOS_;
- if (m_vocab.empty() || (m_vocab.find(w1) != m_vocab.end())) {
- string name(w1 +":"+w2);
+ if (m_vocab.empty() || (FindStringPiece(m_vocab, w1) != m_vocab.end())) {
+ string name(w1.data(), w1.size());
+ name += ':';
+ name += w2;
accumulator->PlusEquals(this,name,1);
}
return NULL;
diff --git a/moses/TargetBigramFeature.h b/moses/TargetBigramFeature.h
index 76b4f6ef7..50bfc8e2c 100644
--- a/moses/TargetBigramFeature.h
+++ b/moses/TargetBigramFeature.h
@@ -4,6 +4,8 @@
#include <string>
#include <map>
+#include <boost/unordered_set.hpp>
+
#include "FactorCollection.h"
#include "FeatureFunction.h"
#include "FFState.h"
@@ -56,7 +58,7 @@ public:
private:
FactorType m_factorType;
Word m_bos;
- std::set<std::string> m_vocab;
+ boost::unordered_set<std::string> m_vocab;
};
}
diff --git a/moses/TargetNgramFeature.cpp b/moses/TargetNgramFeature.cpp
index 7973cedce..24a484f59 100644
--- a/moses/TargetNgramFeature.cpp
+++ b/moses/TargetNgramFeature.cpp
@@ -5,6 +5,8 @@
#include "ScoreComponentCollection.h"
#include "ChartHypothesis.h"
+#include "util/string_piece_hash.hh"
+
namespace Moses {
using namespace std;
@@ -94,9 +96,9 @@ FFState* TargetNgramFeature::Evaluate(const Hypothesis& cur_hypo,
for (size_t n = m_n; n >= smallest_n; --n) { // iterate over ngram size
for (size_t i = 0; i < targetPhrase.GetSize(); ++i) {
// const string& curr_w = targetPhrase.GetWord(i).GetFactor(m_factorType)->GetString();
- const string& curr_w = targetPhrase.GetWord(i).GetString(m_factorType);
+ const StringPiece& curr_w = targetPhrase.GetWord(i).GetString(m_factorType);
- if (m_vocab.size() && (m_vocab.find(curr_w) == m_vocab.end())) continue; // skip ngrams
+ if (m_vocab.size() && (FindStringPiece(m_vocab, curr_w) == m_vocab.end())) continue; // skip ngrams
if (n > 1) {
// can we build an ngram at this position? ("<s> this" --> cannot build 3gram at this position)
@@ -172,8 +174,8 @@ FFState* TargetNgramFeature::Evaluate(const Hypothesis& cur_hypo,
void TargetNgramFeature::appendNgram(const Word& word, bool& skip, stringstream &ngram) const {
// const string& w = word.GetFactor(m_factorType)->GetString();
- const string& w = word.GetString(m_factorType);
- if (m_vocab.size() && (m_vocab.find(w) == m_vocab.end())) skip = true;
+ const StringPiece& w = word.GetString(m_factorType);
+ if (m_vocab.size() && (FindStringPiece(m_vocab, w) == m_vocab.end())) skip = true;
else {
ngram << w;
ngram << ":";
@@ -215,7 +217,7 @@ FFState* TargetNgramFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int
makeSuffix = true;
// beginning/end of sentence symbol <s>,</s>?
- string factorZero = word.GetString(0);
+ StringPiece factorZero = word.GetString(0);
if (factorZero.compare("<s>") == 0)
prefixTerminals++;
// end of sentence symbol </s>?
@@ -396,7 +398,7 @@ void TargetNgramFeature::MakePrefixNgrams(std::vector<const Word*> &contextFacto
for (size_t i=k+offset; i <= end_pos; ++i) {
if (i > k+offset)
ngram << ":";
- string factorZero = (*contextFactor[i]).GetString(0);
+ StringPiece factorZero = (*contextFactor[i]).GetString(0);
if (m_factorType == 0 || factorZero.compare("<s>") == 0 || factorZero.compare("</s>") == 0)
ngram << factorZero;
else
@@ -417,7 +419,7 @@ void TargetNgramFeature::MakeSuffixNgrams(std::vector<const Word*> &contextFacto
for (int start_pos=end_pos-1; (start_pos >= 0) && (end_pos-start_pos < m_n); --start_pos) {
ngram << m_baseName;
for (size_t j=start_pos; j <= end_pos; ++j){
- string factorZero = (*contextFactor[j]).GetString(0);
+ StringPiece factorZero = (*contextFactor[j]).GetString(0);
if (m_factorType == 0 || factorZero.compare("<s>") == 0 || factorZero.compare("</s>") == 0)
ngram << factorZero;
else
diff --git a/moses/TargetNgramFeature.h b/moses/TargetNgramFeature.h
index 0aa98be7d..c26198b2a 100644
--- a/moses/TargetNgramFeature.h
+++ b/moses/TargetNgramFeature.h
@@ -13,6 +13,8 @@
#include "ChartHypothesis.h"
#include "ChartManager.h"
+#include <boost/unordered_set.hpp>
+
namespace Moses
{
@@ -213,7 +215,7 @@ public:
private:
FactorType m_factorType;
Word m_bos;
- std::set<std::string> m_vocab;
+ boost::unordered_set<std::string> m_vocab;
size_t m_n;
bool m_lower_ngrams;
diff --git a/moses/TargetWordInsertionFeature.cpp b/moses/TargetWordInsertionFeature.cpp
index 3b9bf36ba..4420bd7a5 100644
--- a/moses/TargetWordInsertionFeature.cpp
+++ b/moses/TargetWordInsertionFeature.cpp
@@ -6,6 +6,7 @@
#include "ChartHypothesis.h"
#include "ScoreComponentCollection.h"
#include "TranslationOption.h"
+#include "util/string_piece_hash.hh"
namespace Moses {
@@ -73,9 +74,9 @@ void TargetWordInsertionFeature::ComputeFeatures(const TargetPhrase& targetPhras
if (!aligned[i]) {
Word w = targetPhrase.GetWord(i);
if (!w.IsNonTerminal()) {
- const string &word = w.GetFactor(m_factorType)->GetString();
+ const StringPiece &word = w.GetFactor(m_factorType)->GetString();
if (word != "<s>" && word != "</s>") {
- if (!m_unrestricted && m_vocab.find( word ) == m_vocab.end()) {
+ if (!m_unrestricted && FindStringPiece(m_vocab, word) == m_vocab.end()) {
accumulator->PlusEquals(this,"OTHER",1);
}
else {
diff --git a/moses/TargetWordInsertionFeature.h b/moses/TargetWordInsertionFeature.h
index a7a149db6..7a1e3770a 100644
--- a/moses/TargetWordInsertionFeature.h
+++ b/moses/TargetWordInsertionFeature.h
@@ -2,7 +2,7 @@
#define moses_TargetWordInsertionFeature_h
#include <string>
-#include <map>
+#include <boost/unordered_set.hpp>
#include "FeatureFunction.h"
#include "FactorCollection.h"
@@ -15,7 +15,7 @@ namespace Moses
*/
class TargetWordInsertionFeature : public StatelessFeatureFunction {
private:
- std::set<std::string> m_vocab;
+ boost::unordered_set<std::string> m_vocab;
FactorType m_factorType;
bool m_unrestricted;
diff --git a/moses/TranslationOptionCollection.cpp b/moses/TranslationOptionCollection.cpp
index 3d553a458..553e68aa7 100644
--- a/moses/TranslationOptionCollection.cpp
+++ b/moses/TranslationOptionCollection.cpp
@@ -207,7 +207,7 @@ void TranslationOptionCollection::ProcessOneUnknownWord(const Word &sourceWord,s
size_t isDigit = 0;
const Factor *f = sourceWord[0]; // TODO hack. shouldn't know which factor is surface
- const string &s = f->GetString();
+ const StringPiece &s = f->GetString();
bool isEpsilon = (s=="" || s==EPSILON);
if (StaticData::Instance().GetDropUnknown())
{
diff --git a/moses/Word.cpp b/moses/Word.cpp
index 2c1ac09ea..69d382c8a 100644
--- a/moses/Word.cpp
+++ b/moses/Word.cpp
@@ -87,13 +87,8 @@ std::string Word::GetString(const vector<FactorType> factorType,bool endWithBlan
return strme.str();
}
-std::string Word::GetString(FactorType factorType) const
-{
- const Factor *factor = m_factorArray[factorType];
- if (factor != NULL)
- return factor->GetString();
- else
- return NULL;
+StringPiece Word::GetString(FactorType factorType) const {
+ return m_factorArray[factorType]->GetString();
}
class StrayFactorException : public util::Exception {};
diff --git a/moses/Word.h b/moses/Word.h
index 70875d75c..d650fb67e 100644
--- a/moses/Word.h
+++ b/moses/Word.h
@@ -102,7 +102,7 @@ public:
* these debugging functions.
*/
std::string GetString(const std::vector<FactorType> factorType,bool endWithBlank) const;
- std::string GetString(FactorType factorType) const;
+ StringPiece GetString(FactorType factorType) const;
TO_STRING();
//! transitive comparison of Word objects
diff --git a/moses/WordTranslationFeature.cpp b/moses/WordTranslationFeature.cpp
index 6fd5040d6..908274c2b 100644
--- a/moses/WordTranslationFeature.cpp
+++ b/moses/WordTranslationFeature.cpp
@@ -7,7 +7,7 @@
#include "ChartHypothesis.h"
#include "ScoreComponentCollection.h"
#include "TranslationOption.h"
-#include <boost/algorithm/string.hpp>
+#include "util/string_piece_hash.hh"
namespace Moses {
@@ -25,14 +25,11 @@ bool WordTranslationFeature::Load(const std::string &filePathSource, const std::
std::string line;
while (getline(inFileSource, line)) {
- std::set<std::string> terms;
+ m_vocabDomain.resize(m_vocabDomain.size() + 1);
vector<string> termVector;
boost::split(termVector, line, boost::is_any_of("\t "));
for (size_t i=0; i < termVector.size(); ++i)
- terms.insert(termVector[i]);
-
- // add term set for current document
- m_vocabDomain.push_back(terms);
+ m_vocabDomain.back().insert(termVector[i]);
}
inFileSource.close();
@@ -89,24 +86,24 @@ void WordTranslationFeature::Evaluate
if (m_factorTypeSource == 0 && ws.IsNonTerminal()) continue;
Word wt = targetPhrase.GetWord(targetIndex);
if (m_factorTypeSource == 0 && wt.IsNonTerminal()) continue;
- string sourceWord = ws.GetFactor(m_factorTypeSource)->GetString();
- string targetWord = wt.GetFactor(m_factorTypeTarget)->GetString();
+ StringPiece sourceWord = ws.GetFactor(m_factorTypeSource)->GetString();
+ StringPiece targetWord = wt.GetFactor(m_factorTypeTarget)->GetString();
if (m_ignorePunctuation) {
// check if source or target are punctuation
- char firstChar = sourceWord.at(0);
+ char firstChar = sourceWord.data()[0];
CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
if(charIterator != m_punctuationHash.end())
continue;
- firstChar = targetWord.at(0);
+ firstChar = targetWord.data()[0];
charIterator = m_punctuationHash.find( firstChar );
if(charIterator != m_punctuationHash.end())
continue;
}
if (!m_unrestricted) {
- if (m_vocabSource.find(sourceWord) == m_vocabSource.end())
+ if (FindStringPiece(m_vocabSource, sourceWord) == m_vocabSource.end())
sourceWord = "OTHER";
- if (m_vocabTarget.find(targetWord) == m_vocabTarget.end())
+ if (FindStringPiece(m_vocabTarget, targetWord) == m_vocabTarget.end())
targetWord = "OTHER";
}
@@ -167,7 +164,7 @@ void WordTranslationFeature::Evaluate
else {
// range over domain trigger words (keywords)
const long docid = input.GetDocumentId();
- for (set<string>::const_iterator p = m_vocabDomain[docid].begin(); p != m_vocabDomain[docid].end(); ++p) {
+ for (boost::unordered_set<string>::const_iterator p = m_vocabDomain[docid].begin(); p != m_vocabDomain[docid].end(); ++p) {
string sourceTrigger = *p;
stringstream feature;
feature << "wt_";
@@ -196,10 +193,10 @@ void WordTranslationFeature::Evaluate
// range over source words to get context
for(size_t contextIndex = 0; contextIndex < input.GetSize(); contextIndex++ ) {
if (contextIndex == globalSourceIndex) continue;
- string sourceTrigger = input.GetWord(contextIndex).GetFactor(m_factorTypeSource)->GetString();
+ StringPiece sourceTrigger = input.GetWord(contextIndex).GetFactor(m_factorTypeSource)->GetString();
if (m_ignorePunctuation) {
// check if trigger is punctuation
- char firstChar = sourceTrigger.at(0);
+ char firstChar = sourceTrigger.data()[0];
CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
if(charIterator != m_punctuationHash.end())
continue;
@@ -208,9 +205,9 @@ void WordTranslationFeature::Evaluate
const long docid = input.GetDocumentId();
bool sourceTriggerExists = false;
if (m_domainTrigger)
- sourceTriggerExists = m_vocabDomain[docid].find( sourceTrigger ) != m_vocabDomain[docid].end();
+ sourceTriggerExists = FindStringPiece(m_vocabDomain[docid], sourceTrigger) != m_vocabDomain[docid].end();
else if (!m_unrestricted)
- sourceTriggerExists = m_vocabSource.find( sourceTrigger ) != m_vocabSource.end();
+ sourceTriggerExists = FindStringPiece(m_vocabSource, sourceTrigger) != m_vocabSource.end();
if (m_domainTrigger) {
if (sourceTriggerExists) {
@@ -304,24 +301,24 @@ void WordTranslationFeature::EvaluateChart(
if (m_factorTypeSource == 0 && ws.IsNonTerminal()) continue;
Word wt = targetPhrase.GetWord(targetIndex);
if (m_factorTypeSource == 0 && wt.IsNonTerminal()) continue;
- string sourceWord = ws.GetFactor(m_factorTypeSource)->GetString();
- string targetWord = wt.GetFactor(m_factorTypeTarget)->GetString();
+ StringPiece sourceWord = ws.GetFactor(m_factorTypeSource)->GetString();
+ StringPiece targetWord = wt.GetFactor(m_factorTypeTarget)->GetString();
if (m_ignorePunctuation) {
// check if source or target are punctuation
- char firstChar = sourceWord.at(0);
+ char firstChar = sourceWord[0];
CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar );
if(charIterator != m_punctuationHash.end())
continue;
- firstChar = targetWord.at(0);
+ firstChar = targetWord[0];
charIterator = m_punctuationHash.find( firstChar );
if(charIterator != m_punctuationHash.end())
continue;
}
if (!m_unrestricted) {
- if (m_vocabSource.find(sourceWord) == m_vocabSource.end())
+ if (FindStringPiece(m_vocabSource, sourceWord) == m_vocabSource.end())
sourceWord = "OTHER";
- if (m_vocabTarget.find(targetWord) == m_vocabTarget.end())
+ if (FindStringPiece(m_vocabTarget, targetWord) == m_vocabTarget.end())
targetWord = "OTHER";
}
diff --git a/moses/WordTranslationFeature.h b/moses/WordTranslationFeature.h
index 7f74ae4e3..bac948219 100644
--- a/moses/WordTranslationFeature.h
+++ b/moses/WordTranslationFeature.h
@@ -2,7 +2,7 @@
#define moses_WordTranslationFeature_h
#include <string>
-#include <map>
+#include <boost/unordered_set.hpp>
#include "FeatureFunction.h"
#include "FactorCollection.h"
@@ -18,11 +18,11 @@ namespace Moses
class WordTranslationFeature : public StatelessFeatureFunction {
typedef std::map< char, short > CharHash;
- typedef std::vector< std::set<std::string> > DocumentVector;
+ typedef std::vector< boost::unordered_set<std::string> > DocumentVector;
private:
- std::set<std::string> m_vocabSource;
- std::set<std::string> m_vocabTarget;
+ boost::unordered_set<std::string> m_vocabSource;
+ boost::unordered_set<std::string> m_vocabTarget;
DocumentVector m_vocabDomain;
FactorType m_factorTypeSource;
FactorType m_factorTypeTarget;