Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/moses/FF
diff options
context:
space:
mode:
author: Hieu Hoang <hieuhoang@gmail.com> 2015-10-13 18:24:38 +0300
committer: Hieu Hoang <hieuhoang@gmail.com> 2015-10-13 18:24:38 +0300
commit: d5bb41744905bacffb3f85b479009503768cd481 (patch)
tree: 9e2cb7295ad3c23dbedcd5160e5c37e8da34e6c8 /moses/FF
parent: 0dd07cda0b79ac9bdc0499e4a908655d58ddab98 (diff)
parent: e5c9131333c5a38838f2ff627249735551d3db73 (diff)
Merge branch 'perf_ff' of github.com:hieuhoang/mosesdecoder into perf_ff
Diffstat (limited to 'moses/FF')
-rw-r--r-- moses/FF/Factory.cpp 2
-rw-r--r-- moses/FF/VW/VWFeatureSourceSenseWindow.h 141
2 files changed, 143 insertions, 0 deletions
diff --git a/moses/FF/Factory.cpp b/moses/FF/Factory.cpp
index 6fc4e310d..d2c3cf639 100644
--- a/moses/FF/Factory.cpp
+++ b/moses/FF/Factory.cpp
@@ -73,6 +73,7 @@
#include "moses/FF/VW/VWFeatureSourceBigrams.h"
#include "moses/FF/VW/VWFeatureSourceIndicator.h"
#include "moses/FF/VW/VWFeatureSourcePhraseInternal.h"
+#include "moses/FF/VW/VWFeatureSourceSenseWindow.h"
#include "moses/FF/VW/VWFeatureSourceWindow.h"
#include "moses/FF/VW/VWFeatureTargetBigrams.h"
#include "moses/FF/VW/VWFeatureTargetIndicator.h"
@@ -279,6 +280,7 @@ FeatureRegistry::FeatureRegistry()
MOSES_FNAME(VWFeatureSourceBigrams);
MOSES_FNAME(VWFeatureSourceIndicator);
MOSES_FNAME(VWFeatureSourcePhraseInternal);
+ MOSES_FNAME(VWFeatureSourceSenseWindow);
MOSES_FNAME(VWFeatureSourceWindow);
MOSES_FNAME(VWFeatureTargetBigrams);
MOSES_FNAME(VWFeatureTargetPhraseInternal);
diff --git a/moses/FF/VW/VWFeatureSourceSenseWindow.h b/moses/FF/VW/VWFeatureSourceSenseWindow.h
new file mode 100644
index 000000000..5add76c09
--- /dev/null
+++ b/moses/FF/VW/VWFeatureSourceSenseWindow.h
@@ -0,0 +1,141 @@
+#pragma once
+
+#include <string>
+#include <algorithm>
+#include <boost/foreach.hpp>
+#include "ThreadLocalByFeatureStorage.h"
+#include "VWFeatureSource.h"
+#include "moses/Util.h"
+
+/*
+ * Produces features from factors in the following format:
+ * wordsense1:0.25^wordsense2:0.7^wordsense3:0.05
+ *
+ * This is useful e.g. for including different possible word senses as features weighted
+ * by their probability.
+ *
+ * By default, features are extracted from a small context window around the current
+ * phrase and from within the phrase.
+ */
+
+namespace Moses
+{
+
+/**
+ * Label-independent VW feature that emits weighted word-sense features for source
+ * words before, inside and after the current source phrase.
+ *
+ * Each source word (as returned by GetWord()) is parsed as a '^'-separated list of
+ * "label:probability" pairs. For every sense of every word in the window two features
+ * are added, both weighted by the sense probability:
+ *   <prefix>^[<form>^]<offset>^<label>   (position-aware)
+ *   <prefix>^[<form>^]<label>            (position-independent)
+ * where <prefix> is "snsb" (before), "snsin" (inside) or "snsa" (after) and <offset>
+ * is the word position relative to the first word of the phrase.
+ *
+ * Parameters handled here (anything else is forwarded to VWFeatureSource):
+ *   size        - number of context words considered on each side of the phrase
+ *                 (default DEFAULT_WINDOW_SIZE)
+ *   lexicalized - if true (default), feature names also contain the word's factor 0
+ */
+class VWFeatureSourceSenseWindow : public VWFeatureSource
+{
+public:
+  VWFeatureSourceSenseWindow(const std::string &line)
+    : VWFeatureSource(line), m_tlsSenses(this), m_tlsForms(this), m_lexicalized(true), m_size(DEFAULT_WINDOW_SIZE) {
+    ReadParameters();
+
+    // Call this last
+    VWFeatureBase::UpdateRegister();
+  }
+
+  // Precompute, for each word of the input sentence, its parsed sense distribution and
+  // the (optional) lexical prefix used in feature names. Results are kept in the
+  // per-feature thread-local storage and read back by operator().
+  // Throws (via GetSenses) if a word does not follow the "label:prob" format.
+  virtual void InitializeForInput(ttasksptr const& ttask) {
+    InputType const& input = *(ttask->GetSource().get());
+
+    std::vector<WordSenses>& senses = *m_tlsSenses.GetStored();
+    std::vector<std::string>& forms = *m_tlsForms.GetStored();
+    senses.clear();
+    forms.clear();
+
+    senses.resize(input.GetSize());
+    forms.resize(input.GetSize());
+
+    for (size_t i = 0; i < input.GetSize(); i++) {
+      senses[i] = GetSenses(input, i);
+      // when not lexicalized the prefix is empty, so feature names carry senses only
+      forms[i] = m_lexicalized ? GetWordForm(input, i) + "^" : "";
+    }
+  }
+
+  // Add sense features for the words before, within and after sourceRange.
+  // Relies on the data cached by InitializeForInput() for the same input sentence.
+  void operator()(const InputType &input
+                  , const InputPath &inputPath
+                  , const WordsRange &sourceRange
+                  , Discriminative::Classifier &classifier) const {
+    const int begin = sourceRange.GetStartPos();
+    const int end = sourceRange.GetEndPos() + 1; // one past the last word of the phrase
+    const int inputLen = input.GetSize();
+
+    const std::vector<WordSenses>& senses = *m_tlsSenses.GetStored();
+    const std::vector<std::string>& forms = *m_tlsForms.GetStored();
+
+    // The loops below index the cached vectors by input position; refuse to read out of
+    // bounds if the cache does not cover this input or the range exceeds it.
+    if (end > inputLen
+        || senses.size() < static_cast<size_t>(inputLen)
+        || forms.size() < static_cast<size_t>(inputLen)) {
+      UTIL_THROW2("VW :: sense window: cached senses do not cover the source range (input length "
+                  << inputLen << ", range end " << end << ", cached words " << senses.size() << ")");
+    }
+
+    // before current phrase
+    AddSenseFeatures("snsb^", std::max(0, begin - m_size), begin, begin, senses, forms, classifier);
+
+    // within current phrase
+    AddSenseFeatures("snsin^", begin, end, begin, senses, forms, classifier);
+
+    // after current phrase
+    // NOTE(review): offsets here are also relative to the phrase start (i - begin), so they
+    // depend on the phrase length. Kept unchanged because altering feature names would
+    // invalidate existing trained models -- confirm this is intended.
+    AddSenseFeatures("snsa^", end, std::min(end + m_size, inputLen), begin, senses, forms, classifier);
+  }
+
+  // Handle "size" and "lexicalized"; delegate every other key to the base class.
+  virtual void SetParameter(const std::string& key, const std::string& value) {
+    if (key == "size") {
+      // m_size is an int, so parse it as one (avoids a round trip through an unsigned type)
+      m_size = Scan<int>(value);
+    } else if (key == "lexicalized") {
+      m_lexicalized = Scan<bool>(value);
+    } else {
+      VWFeatureSource::SetParameter(key, value);
+    }
+  }
+
+private:
+  static const int DEFAULT_WINDOW_SIZE = 3;
+
+  // one candidate sense of a word together with its weight
+  struct Sense {
+    std::string m_label;
+    float m_prob;
+  };
+
+  typedef std::vector<Sense> WordSenses;
+  typedef ThreadLocalByFeatureStorage<std::vector<WordSenses> > TLSSenses;
+  typedef ThreadLocalByFeatureStorage<std::vector<std::string> > TLSWordForms;
+
+  TLSSenses m_tlsSenses; // for each input sentence, contains extracted senses and probs for each word
+  TLSWordForms m_tlsForms; // word forms for each input sentence
+
+  // For every word position in [from, to) and every sense of that word, add the
+  // position-aware and the position-independent feature, weighted by the sense
+  // probability. Offsets in feature names are computed relative to `begin`.
+  void AddSenseFeatures(const std::string &prefix, int from, int to, int begin
+                        , const std::vector<WordSenses> &senses
+                        , const std::vector<std::string> &forms
+                        , Discriminative::Classifier &classifier) const {
+    for (int i = from; i < to; i++) {
+      // parts of the feature name that do not depend on the sense are built once per word
+      const std::string stem = prefix + forms[i];
+      const std::string positioned = stem + SPrint(i - begin) + "^";
+      BOOST_FOREACH(const Sense &sense, senses[i]) {
+        classifier.AddLabelIndependentFeature(positioned + sense.m_label, sense.m_prob);
+        classifier.AddLabelIndependentFeature(stem + sense.m_label, sense.m_prob);
+      }
+    }
+  }
+
+  // Parse the word at `pos` (as returned by GetWord) into its sense distribution.
+  // Expected format: label1:prob1^label2:prob2^...
+  // Throws if any '^'-separated token does not consist of exactly two ':'-separated fields.
+  WordSenses GetSenses(const InputType &input, size_t pos) const {
+    const std::string w = GetWord(input, pos);
+    const std::vector<std::string> senseTokens = Tokenize(w, "^");
+
+    WordSenses out(senseTokens.size());
+    for (size_t i = 0; i < senseTokens.size(); i++) {
+      const std::vector<std::string> senseColumns = Tokenize(senseTokens[i], ":");
+      if (senseColumns.size() != 2) {
+        UTIL_THROW2("VW :: bad format of sense distribution: " << senseTokens[i]);
+      }
+      out[i].m_label = senseColumns[0];
+      out[i].m_prob = Scan<float>(senseColumns[1]);
+    }
+
+    return out;
+  }
+
+  // assuming that word surface form is always factor 0, output the word form
+  inline std::string GetWordForm(const InputType &input, size_t pos) const {
+    return input.GetWord(pos).GetString(0).as_string();
+  }
+
+  bool m_lexicalized; // include the word form in feature names
+  int m_size;         // number of context words taken on each side of the phrase
+};
+
+}