diff options
author | Hieu Hoang <hieuhoang@gmail.com> | 2015-10-13 18:24:38 +0300 |
---|---|---|
committer | Hieu Hoang <hieuhoang@gmail.com> | 2015-10-13 18:24:38 +0300 |
commit | d5bb41744905bacffb3f85b479009503768cd481 (patch) | |
tree | 9e2cb7295ad3c23dbedcd5160e5c37e8da34e6c8 /moses/FF | |
parent | 0dd07cda0b79ac9bdc0499e4a908655d58ddab98 (diff) | |
parent | e5c9131333c5a38838f2ff627249735551d3db73 (diff) |
Merge branch 'perf_ff' of github.com:hieuhoang/mosesdecoder into perf_ff
Diffstat (limited to 'moses/FF')
-rw-r--r-- | moses/FF/Factory.cpp | 2 | ||||
-rw-r--r-- | moses/FF/VW/VWFeatureSourceSenseWindow.h | 141 |
2 files changed, 143 insertions, 0 deletions
diff --git a/moses/FF/Factory.cpp b/moses/FF/Factory.cpp index 6fc4e310d..d2c3cf639 100644 --- a/moses/FF/Factory.cpp +++ b/moses/FF/Factory.cpp @@ -73,6 +73,7 @@ #include "moses/FF/VW/VWFeatureSourceBigrams.h" #include "moses/FF/VW/VWFeatureSourceIndicator.h" #include "moses/FF/VW/VWFeatureSourcePhraseInternal.h" +#include "moses/FF/VW/VWFeatureSourceSenseWindow.h" #include "moses/FF/VW/VWFeatureSourceWindow.h" #include "moses/FF/VW/VWFeatureTargetBigrams.h" #include "moses/FF/VW/VWFeatureTargetIndicator.h" @@ -279,6 +280,7 @@ FeatureRegistry::FeatureRegistry() MOSES_FNAME(VWFeatureSourceBigrams); MOSES_FNAME(VWFeatureSourceIndicator); MOSES_FNAME(VWFeatureSourcePhraseInternal); + MOSES_FNAME(VWFeatureSourceSenseWindow); MOSES_FNAME(VWFeatureSourceWindow); MOSES_FNAME(VWFeatureTargetBigrams); MOSES_FNAME(VWFeatureTargetPhraseInternal); diff --git a/moses/FF/VW/VWFeatureSourceSenseWindow.h b/moses/FF/VW/VWFeatureSourceSenseWindow.h new file mode 100644 index 000000000..5add76c09 --- /dev/null +++ b/moses/FF/VW/VWFeatureSourceSenseWindow.h @@ -0,0 +1,141 @@ +#pragma once + +#include <string> +#include <algorithm> +#include <boost/foreach.hpp> +#include "ThreadLocalByFeatureStorage.h" +#include "VWFeatureSource.h" +#include "moses/Util.h" + +/* + * Produces features from factors in the following format: + * wordsense1:0.25^wordsense2:0.7^wordsense3:0.05 + * + * This is useful e.g. for including different possible word senses as features weighted + * by their probability. + * + * By default, features are extracted from a small context window around the current + * phrase and from within the phrase. 
+ */ + +namespace Moses +{ + +class VWFeatureSourceSenseWindow : public VWFeatureSource +{ +public: + VWFeatureSourceSenseWindow(const std::string &line) + : VWFeatureSource(line), m_tlsSenses(this), m_tlsForms(this), m_lexicalized(true), m_size(DEFAULT_WINDOW_SIZE) { + ReadParameters(); + + // Call this last + VWFeatureBase::UpdateRegister(); + } + + // precompute, for every word of the input sentence, its parsed sense list and its word-form prefix (the form followed by "^", or an empty string when not lexicalized); results are kept in m_tlsSenses and m_tlsForms + virtual void InitializeForInput(ttasksptr const& ttask) { + InputType const& input = *(ttask->GetSource().get()); + + std::vector<WordSenses>& senses = *m_tlsSenses.GetStored(); + std::vector<std::string>& forms = *m_tlsForms.GetStored(); + senses.clear(); + forms.clear(); + + senses.resize(input.GetSize()); + forms.resize(input.GetSize()); + + for (size_t i = 0; i < input.GetSize(); i++) { + senses[i] = GetSenses(input, i); + forms[i] = m_lexicalized ? GetWordForm(input, i) + "^" : ""; + } + } + + /* Adds label-independent features to the classifier for the senses of words up to m_size positions before the phrase (prefix "snsb^"), inside the phrase ("snsin^") and up to m_size positions after it ("snsa^"). Every sense yields two features, one whose name includes the offset (i - begin) and one without the offset; sense.m_prob is passed alongside each feature name. Reads the per-sentence data filled in by InitializeForInput. */ + void operator()(const InputType &input + , const InputPath &inputPath + , const WordsRange &sourceRange + , Discriminative::Classifier &classifier) const { + int begin = sourceRange.GetStartPos(); + int end = sourceRange.GetEndPos() + 1; + int inputLen = input.GetSize(); + + const std::vector<WordSenses>& senses = *m_tlsSenses.GetStored(); + const std::vector<std::string>& forms = *m_tlsForms.GetStored(); + + // before current phrase + for (int i = std::max(0, begin - m_size); i < begin; i++) { + BOOST_FOREACH(const Sense &sense, senses[i]) { + classifier.AddLabelIndependentFeature("snsb^" + forms[i] + SPrint(i - begin) + "^" + sense.m_label, sense.m_prob); + classifier.AddLabelIndependentFeature("snsb^" + forms[i] + sense.m_label, sense.m_prob); + } + } + + // within current phrase + for (int i = begin; i < end; i++) { + BOOST_FOREACH(const Sense &sense, senses[i]) { + classifier.AddLabelIndependentFeature("snsin^" + forms[i] + SPrint(i - begin) + "^" + sense.m_label, sense.m_prob); + classifier.AddLabelIndependentFeature("snsin^" + forms[i] + sense.m_label, 
sense.m_prob); + } + } + + // after current phrase; NOTE(review): the offset in the feature name is still measured from begin, not from end, so it grows with the phrase length (confirm this is intended) + for (int i = end; i < std::min(end + m_size, inputLen); i++) { + BOOST_FOREACH(const Sense &sense, senses[i]) { + classifier.AddLabelIndependentFeature("snsa^" + forms[i] + SPrint(i - begin) + "^" + sense.m_label, sense.m_prob); + classifier.AddLabelIndependentFeature("snsa^" + forms[i] + sense.m_label, sense.m_prob); + } + } + } + + /* Handles the "size" key (context window size, stored in m_size) and the "lexicalized" key (stored in m_lexicalized); any other key is forwarded to VWFeatureSource::SetParameter. NOTE(review): "size" is parsed as size_t but stored in the int member m_size, so very large values would not fit (confirm whether a range check is wanted). */ + virtual void SetParameter(const std::string& key, const std::string& value) { + if (key == "size") { + m_size = Scan<size_t>(value); + } else if (key == "lexicalized") { + m_lexicalized = Scan<bool>(value); + } else { + VWFeatureSource::SetParameter(key, value); + } + } + +private: + static const int DEFAULT_WINDOW_SIZE = 3; + + struct Sense { + std::string m_label; + float m_prob; + }; + + typedef std::vector<Sense> WordSenses; + typedef ThreadLocalByFeatureStorage<std::vector<WordSenses> > TLSSenses; + typedef ThreadLocalByFeatureStorage<std::vector<std::string> > TLSWordForms; + + TLSSenses m_tlsSenses; // for each input sentence, contains extracted senses and probs for each word + TLSWordForms m_tlsForms; // word forms for each input sentence + + + /* Parses the word returned by GetWord(input, pos): splits it on "^" into sense tokens, then each token on ":" into a label and a probability. Throws via UTIL_THROW2 when a token does not consist of exactly two ":"-separated columns. */ + std::vector<Sense> GetSenses(const InputType &input, size_t pos) const { + std::string w = GetWord(input, pos); + std::vector<std::string> senseTokens = Tokenize(w, "^"); + + std::vector<Sense> out(senseTokens.size()); + for (size_t i = 0; i < senseTokens.size(); i++) { + std::vector<std::string> senseColumns = Tokenize(senseTokens[i], ":"); + if (senseColumns.size() != 2) { + UTIL_THROW2("VW :: bad format of sense distribution: " << senseTokens[i]); + } + out[i].m_label = senseColumns[0]; + out[i].m_prob = Scan<float>(senseColumns[1]); + } + + return out; + } + + // assuming that word surface form is always factor 0, output the word form + inline std::string GetWordForm(const InputType &input, size_t pos) const { + return input.GetWord(pos).GetString(0).as_string(); + } + + bool m_lexicalized; + int m_size; +}; + +} |