Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/vw
diff options
context:
space:
mode:
author: Ales Tamchyna <tamchyna@ufal.mff.cuni.cz>, 2015-01-06 19:52:19 +0300
committer: Ales Tamchyna <tamchyna@ufal.mff.cuni.cz>, 2015-01-06 19:52:19 +0300
commit: 1970d46706fe80b917e385dfc0b8e7e91b5d62c6 (patch)
tree: b0e8cfbd22d6593c7cca7df1acf0b498f895ce4a /vw
parent: 887392b8c2b74ea3819685e6fdc639e312e13965 (diff)
remove legacy files
Diffstat (limited to 'vw')
-rw-r--r-- vw/ExtractorConfig.cpp 51
-rw-r--r-- vw/ExtractorConfig.h 57
-rw-r--r-- vw/FeatureExtractor.cpp 229
-rw-r--r-- vw/FeatureExtractor.h 88
-rw-r--r-- vw/IniReader.h 61
5 files changed, 0 insertions, 486 deletions
diff --git a/vw/ExtractorConfig.cpp b/vw/ExtractorConfig.cpp
deleted file mode 100644
index 27bc570ba..000000000
--- a/vw/ExtractorConfig.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-#include "ExtractorConfig.h"
-#include "Util.h"
-
-#include <exception>
-#include <stdexcept>
-#include <algorithm>
-#include <set>
-
-using namespace std;
-using namespace boost::bimaps;
-using namespace Moses;
-
-namespace Discriminative
-{
-
-void ExtractorConfig::Load(const string &configFile)
-{
- try {
- IniReader reader(configFile);
- m_sourceInternal = reader.Get<bool>("features.source-internal", false);
- m_sourceExternal = reader.Get<bool>("features.source-external", false);
- m_targetInternal = reader.Get<bool>("features.target-internal", false);
- m_sourceIndicator = reader.Get<bool>("features.source-indicator", false);
- m_targetIndicator = reader.Get<bool>("features.target-indicator", false);
- m_sourceTargetIndicator = reader.Get<bool>("features.source-target-indicator", false);
- m_STSE = reader.Get<bool>("features.source-target-source-external", false);
- m_paired = reader.Get<bool>("features.paired", false);
- m_bagOfWords = reader.Get<bool>("features.bag-of-words", false);
- m_mostFrequent = reader.Get<bool>("features.most-frequent", false);
- m_binnedScores = reader.Get<bool>("features.binned-scores", false);
- m_sourceTopic = reader.Get<bool>("features.source-topic", false);
- m_phraseFactor = reader.Get<bool>("features.phrase-factor", false);
- m_windowSize = reader.Get<size_t>("features.window-size", 0);
-
- m_factors = Scan<size_t>(Tokenize(reader.Get<string>("features.factors", ""), ","));
- m_scoreIndexes = Scan<size_t>(Tokenize(reader.Get<string>("features.scores", ""), ","));
- m_scoreBins = Scan<float>(Tokenize(reader.Get<string>("features.score-bins", ""), ","));
-
- m_vwOptsTrain = reader.Get<string>("vw-options.train", "");
- m_vwOptsPredict = reader.Get<string>("vw-options.predict", "");
-
- m_normalization = reader.Get<string>("decoder.normalization", "");
-
- m_isLoaded = true;
- } catch (const runtime_error &err) {
- cerr << "Error loading file " << configFile << ": " << err.what();
- m_isLoaded = false;
- }
-}
-
-} // namespace Discriminative
diff --git a/vw/ExtractorConfig.h b/vw/ExtractorConfig.h
deleted file mode 100644
index 23e136da7..000000000
--- a/vw/ExtractorConfig.h
+++ /dev/null
@@ -1,57 +0,0 @@
-#ifndef moses_ExtractorConfig_h
-#define moses_ExtractorConfig_h
-
-#include <vector>
-#include <string>
-#include <map>
-#include <boost/bimap/bimap.hpp>
-#include "IniReader.h"
-
-namespace Discriminative
-{
-
-class ExtractorConfig
-{
- public:
- void Load(const std::string &configFile);
- inline bool GetSourceExternal() const { return m_sourceExternal; }
- inline bool GetSourceInternal() const { return m_sourceInternal; }
- inline bool GetTargetInternal() const { return m_targetInternal; }
- inline bool GetSourceIndicator() const { return m_sourceIndicator; }
- inline bool GetTargetIndicator() const { return m_targetIndicator; }
- inline bool GetSourceTargetIndicator() const { return m_sourceTargetIndicator; }
- inline bool GetSTSE() const { return m_STSE; }
- inline bool GetPhraseFactor() const { return m_phraseFactor; }
- inline bool GetPaired() const { return m_paired; }
- inline bool GetBagOfWords() const { return m_bagOfWords; }
- inline bool GetMostFrequent() const { return m_mostFrequent; }
- inline size_t GetWindowSize() const { return m_windowSize; }
- inline bool GetBinnedScores() const { return m_binnedScores; }
- inline bool GetSourceTopic() const { return m_sourceTopic; }
- inline const std::vector<size_t> &GetFactors() const { return m_factors; }
- inline const std::vector<size_t> &GetScoreIndexes() const { return m_scoreIndexes; }
- inline const std::vector<float> &GetScoreBins() const { return m_scoreBins; }
- inline const std::string &GetVWOptionsTrain() const { return m_vwOptsTrain; }
- inline const std::string &GetVWOptionsPredict() const { return m_vwOptsPredict; }
- inline const std::string &GetNormalization() const { return m_normalization; }
-
- inline bool IsLoaded() const { return m_isLoaded; }
-
- private:
- // read from configuration
- bool m_paired, m_bagOfWords, m_sourceExternal,
- m_sourceInternal, m_targetInternal, m_mostFrequent,
- m_binnedScores, m_sourceIndicator, m_targetIndicator,
- m_sourceTargetIndicator, m_STSE, m_sourceTopic, m_phraseFactor;
- std::string m_vwOptsPredict, m_vwOptsTrain, m_normalization;
- size_t m_windowSize;
- std::vector<size_t> m_factors, m_scoreIndexes;
- std::vector<float> m_scoreBins;
-
- // internal variables
- bool m_isLoaded;
-};
-
-} // namespace Discriminative
-
-#endif // moses_ExtractorConfig_h
diff --git a/vw/FeatureExtractor.cpp b/vw/FeatureExtractor.cpp
deleted file mode 100644
index 63a45ccc4..000000000
--- a/vw/FeatureExtractor.cpp
+++ /dev/null
@@ -1,229 +0,0 @@
-#include "FeatureExtractor.h"
-#include "Util.h"
-
-using namespace std;
-using namespace Moses;
-
-namespace Discriminative
-{
-
-FeatureExtractor::FeatureExtractor(const ExtractorConfig &config, bool train)
- : m_config(config), m_train(train)
-{
- if (! m_config.IsLoaded())
- throw logic_error("configuration file not loaded");
-}
-
-void FeatureExtractor::GenerateFeatures(Classifier *fc,
- const ContextType &context,
- size_t spanStart,
- size_t spanEnd,
- const vector<Translation> &translations,
- vector<float> &losses)
-{
- fc->SetNamespace('s', true);
-
- if (m_config.GetSourceExternal()) GenerateContextFeatures(context, spanStart, spanEnd, fc);
-
- // get words (surface forms) in source phrase
- vector<string> sourceForms(spanEnd - spanStart + 1);
- for (size_t i = spanStart; i <= spanEnd; i++)
- sourceForms[i - spanStart] = context[i][FACTOR_FORM];
-
- map<string, float> maxProbs;
- if (m_config.GetMostFrequent()) maxProbs = GetMaxProb(translations);
-
- if (m_config.GetSourceInternal()) GenerateInternalFeatures(sourceForms, fc);
- if (m_config.GetPhraseFactor()) GeneratePhraseFactorFeatures(context, spanStart, spanEnd, fc);
- if (m_config.GetBagOfWords()) GenerateBagOfWordsFeatures(context, spanStart, spanEnd, FACTOR_FORM, fc);
-
- if (m_config.GetSourceIndicator()) GenerateIndicatorFeature(sourceForms, fc);
-
- vector<Translation>::const_iterator transIt = translations.begin();
- vector<float>::iterator lossIt = losses.begin();
- for (; transIt != translations.end(); transIt++, lossIt++) {
- assert(lossIt != losses.end());
- fc->SetNamespace('t', false);
-
- // get words in target phrase
- const vector<string> &targetForms = transIt->translation;
-
- if (m_config.GetTargetInternal()) GenerateInternalFeatures(targetForms, fc);
- if (m_config.GetPaired()) GeneratePairedFeatures(sourceForms, targetForms, transIt->m_alignment, fc);
-
- if (m_config.GetMostFrequent()) GenerateMostFrequentFeature(transIt->m_ttableScores, maxProbs, fc);
-
- if (m_config.GetBinnedScores()) GenerateScoreFeatures(transIt->m_ttableScores, fc);
-
- // "NOT_IN_" features
- if (m_config.GetBinnedScores() || m_config.GetMostFrequent()) GenerateTTableEntryFeatures(transIt->m_ttableScores, fc);
-
- if (m_config.GetTargetIndicator()) GenerateIndicatorFeature(targetForms, fc);
-
- if (m_config.GetSourceTargetIndicator()) GenerateConcatIndicatorFeature(sourceForms, targetForms, fc);
-
- if (m_config.GetSTSE()) GenerateSTSE(sourceForms, targetForms, context, spanStart, spanEnd, fc);
-
- if (m_train) {
- fc->Train(SPrint(DUMMY_IDX), *lossIt);
- } else {
- *lossIt = fc->Predict(SPrint(DUMMY_IDX));
- }
- }
- fc->FinishExample();
-}
-
-//
-// private methods
-//
-
-string FeatureExtractor::BuildContextFeature(size_t factor, int index, const string &value)
-{
- return "c^" + SPrint(factor) + "_" + SPrint(index) + "_" + value;
-}
-
-void FeatureExtractor::GenerateContextFeatures(const ContextType &context,
- size_t spanStart,
- size_t spanEnd,
- Classifier *fc)
-{
- vector<size_t>::const_iterator factIt;
- for (factIt = m_config.GetFactors().begin(); factIt != m_config.GetFactors().end(); factIt++) {
- for (size_t i = 1; i <= m_config.GetWindowSize(); i++) {
- string left = "<s>";
- string right = "</s>";
- if (spanStart >= i)
- left = context[spanStart - i][*factIt];
- fc->AddFeature(BuildContextFeature(*factIt, -i, left));
- if (spanEnd + i < context.size())
- right = context[spanEnd + i][*factIt];
- fc->AddFeature(BuildContextFeature(*factIt, i, right));
- }
- }
-}
-
-void FeatureExtractor::GenerateIndicatorFeature(const vector<string> &span, Classifier *fc)
-{
- fc->AddFeature("p^" + Join("_", span));
-}
-
-void FeatureExtractor::GenerateConcatIndicatorFeature(const vector<string> &span1, const vector<string> &span2, Classifier *fc)
-{
- fc->AddFeature("p^" + Join("_", span1) + "^" + Join("_", span2));
-}
-
-void FeatureExtractor::GenerateSTSE(const vector<string> &span1, const vector<string> &span2,
- const ContextType &context,
- size_t spanStart,
- size_t spanEnd,
- Classifier *fc)
-{
- vector<size_t>::const_iterator factIt;
- for (factIt = m_config.GetFactors().begin(); factIt != m_config.GetFactors().end(); factIt++) {
- for (size_t i = 1; i <= m_config.GetWindowSize(); i++) {
- string left = "<s>";
- string right = "</s>";
- if (spanStart >= i)
- left = context[spanStart - i][*factIt];
- fc->AddFeature("stse^" + Join("_", span1) + "^" + Join("_", span2) + BuildContextFeature(*factIt, -i, left));
- if (spanEnd + i < context.size())
- right = context[spanEnd + i][*factIt];
- fc->AddFeature("stse^" + Join("_", span1) + "^" + Join("_", span2) + BuildContextFeature(*factIt, i, right));
- }
- }
-}
-
-void FeatureExtractor::GenerateInternalFeatures(const vector<string> &span, Classifier *fc)
-{
- vector<string>::const_iterator it;
- for (it = span.begin(); it != span.end(); it++) {
- fc->AddFeature("w^" + *it);
- }
-}
-
-void FeatureExtractor::GenerateBagOfWordsFeatures(const ContextType &context, size_t spanStart, size_t spanEnd, size_t factorID, Classifier *fc)
-{
- for (size_t i = 0; i < spanStart; i++)
- fc->AddFeature("bow^" + context[i][factorID]);
- for (size_t i = spanEnd + 1; i < context.size(); i++)
- fc->AddFeature("bow^" + context[i][factorID]);
-}
-
-void FeatureExtractor::GeneratePhraseFactorFeatures(const ContextType &context, size_t spanStart, size_t spanEnd, Classifier *fc)
-{
- for (size_t i = spanStart; i <= spanEnd; i++) {
- vector<size_t>::const_iterator factIt;
- for (factIt = m_config.GetFactors().begin(); factIt != m_config.GetFactors().end(); factIt++) {
- fc->AddFeature("ibow^" + SPrint(*factIt) + "_" + context[i][*factIt]);
- }
- }
-}
-
-void FeatureExtractor::GeneratePairedFeatures(const vector<string> &srcPhrase, const vector<string> &tgtPhrase,
- const AlignmentType &align, Classifier *fc)
-{
- AlignmentType::const_iterator it;
- set<size_t> srcAligned;
- set<size_t> tgtAligned;
-
- for (it = align.begin(); it != align.end(); it++) {
- fc->AddFeature("pair^" + srcPhrase[it->first] + "^" + tgtPhrase[it->second]);
- srcAligned.insert(it->first);
- tgtAligned.insert(it->second);
- }
-
- for (size_t i = 0; i < srcPhrase.size(); i++) {
- if (srcAligned.count(i) == 0)
- fc->AddFeature("pair^" + srcPhrase[i] + "^NULL");
- }
-
- for (size_t i = 0; i < tgtPhrase.size(); i++) {
- if (tgtAligned.count(i) == 0)
- fc->AddFeature("pair^NULL^" + tgtPhrase[i]);
- }
-}
-
-void FeatureExtractor::GenerateScoreFeatures(const std::vector<TTableEntry> &ttableScores, Classifier *fc)
-{
- vector<size_t>::const_iterator scoreIt;
- vector<float>::const_iterator binIt;
- vector<TTableEntry>::const_iterator tableIt;
- const vector<size_t> &scoreIDs = m_config.GetScoreIndexes();
- const vector<float> &bins = m_config.GetScoreBins();
-
- for (tableIt = ttableScores.begin(); tableIt != ttableScores.end(); tableIt++) {
- if (! tableIt->m_exists)
- continue;
- string prefix = ttableScores.size() == 1 ? "" : tableIt->m_id + "_";
- for (scoreIt = scoreIDs.begin(); scoreIt != scoreIDs.end(); scoreIt++) {
- for (binIt = bins.begin(); binIt != bins.end(); binIt++) {
- float logScore = log(tableIt->m_scores[*scoreIt]);
- if (logScore < *binIt || Equals(logScore, *binIt)) {
- fc->AddFeature(prefix + "sc^" + SPrint<size_t>(*scoreIt) + "_" + SPrint(*binIt));
- }
- }
- }
- }
-}
-
-void FeatureExtractor::GenerateMostFrequentFeature(const std::vector<TTableEntry> &ttableScores, const map<string, float> &maxProbs, Classifier *fc)
-{
- vector<TTableEntry>::const_iterator it;
- for (it = ttableScores.begin(); it != ttableScores.end(); it++) {
- if (it->m_exists && Equals(it->m_scores[P_E_F_INDEX], maxProbs.find(it->m_id)->second)) {
- string prefix = ttableScores.size() == 1 ? "" : it->m_id + "_";
- fc->AddFeature(prefix + "MOST_FREQUENT");
- }
- }
-}
-
-void FeatureExtractor::GenerateTTableEntryFeatures(const std::vector<TTableEntry> &ttableScores, Classifier *fc)
-{
- vector<TTableEntry>::const_iterator it;
- for (it = ttableScores.begin(); it != ttableScores.end(); it++) {
- if (! it->m_exists)
- fc->AddFeature("NOT_IN_" + it->m_id);
- }
-}
-
-} // namespace Discriminative
diff --git a/vw/FeatureExtractor.h b/vw/FeatureExtractor.h
deleted file mode 100644
index 2edbd331e..000000000
--- a/vw/FeatureExtractor.h
+++ /dev/null
@@ -1,88 +0,0 @@
-#ifndef moses_FeatureExtractor_h
-#define moses_FeatureExtractor_h
-
-#include "Classifier.h"
-#include "ExtractorConfig.h"
-
-#include <vector>
-#include <string>
-#include <exception>
-#include <stdexcept>
-#include <map>
-
-namespace Discriminative
-{
-
-// label index passed to the classifier, this value is not used in our setting
-const int DUMMY_IDX = 1111;
-
-// vector of words, each word is a vector of factors
-typedef std::vector<std::vector<std::string> > ContextType;
-
-typedef std::multimap<size_t, size_t> AlignmentType;
-
-// In DA scenario, there are multiple phrase tables. This struct
-// contains scores for a phrase in one phrase-table.
-struct TTableEntry
-{
- std::string m_id; // phrase-table identifier
- bool m_exists; // does translation exist in this table
- std::vector<float> m_scores; // translation scores (empty if m_exists == false)
-};
-
-// One translation (phrase target side).
-struct Translation
-{
- std::vector<std::string> translation; // words (surface forms) of translation
- AlignmentType m_alignment; // phrase-internal word alignment
- std::vector<TTableEntry> m_ttableScores; // phrase scores in each phrase table
-};
-
-// extract features
-class FeatureExtractor
-{
-public:
- FeatureExtractor(const ExtractorConfig &config, bool train);
-
- // Generate features for current source phrase and all its translation options, based on
- // configuration. Calls all auxiliary Generate* methods.
- //
- // In training, reads the &losses parameter and passes them to VW. In prediction, &losses is
- // an output variable where VW scores are written.
- void GenerateFeatures(Classifier *fc,
- const ContextType &context,
- size_t spanStart,
- size_t spanEnd,
- const std::vector<Translation> &translations,
- std::vector<float> &losses);
-
-private:
- const ExtractorConfig &m_config; // Configuration of features.
- bool m_train; // Train or predict.
-
- // Get the highest probability P(e|f) associated with any of the translation options,
- // separately for each phrase table (string keys are phrase-table IDs).
- std::map<std::string, float> GetMaxProb(const std::vector<Translation> &translations);
-
- void GenerateContextFeatures(const ContextType &context, size_t spanStart, size_t spanEnd, Classifier *fc);
- void GeneratePhraseFactorFeatures(const ContextType &context, size_t spanStart, size_t spanEnd, Classifier *fc);
- void GenerateInternalFeatures(const std::vector<std::string> &span, Classifier *fc);
- void GenerateIndicatorFeature(const std::vector<std::string> &span, Classifier *fc);
- void GenerateConcatIndicatorFeature(const std::vector<std::string> &span1, const std::vector<std::string> &span2, Classifier *fc);
- void GenerateSTSE(const std::vector<std::string> &span1, const std::vector<std::string> &span2, const ContextType &context, size_t spanStart, size_t spanEnd, Classifier *fc);
- void GenerateBagOfWordsFeatures(const ContextType &context, size_t spanStart, size_t spanEnd, size_t factorID, Classifier *fc);
- void GeneratePairedFeatures(const std::vector<std::string> &srcPhrase,
- const std::vector<std::string> &tgtPhrase,
- const AlignmentType &align,
- Classifier *fc);
- void GenerateScoreFeatures(const std::vector<TTableEntry> &ttableScores, Classifier *fc);
- void GenerateMostFrequentFeature(const std::vector<TTableEntry> &ttableScores,
- const std::map<std::string, float> &maxProbs,
- Classifier *fc);
- void GenerateTTableEntryFeatures(const std::vector<TTableEntry> &ttableScores, Classifier *fc);
- std::string BuildContextFeature(size_t factor, int index, const std::string &value);
-};
-
-} // namespace Discriminative
-
-#endif // moses_FeatureExtractor_h
diff --git a/vw/IniReader.h b/vw/IniReader.h
deleted file mode 100644
index 528491c8a..000000000
--- a/vw/IniReader.h
+++ /dev/null
@@ -1,61 +0,0 @@
-#ifndef moses_iniReader_h
-#define moses_iniReader_h
-
-#include <vector>
-#include <algorithm>
-#include <string>
-#include <fstream>
-#include <map>
-#include <exception>
-#include <stdexcept>
-
-#include <boost/algorithm/string.hpp>
-#include <boost/bind.hpp>
-#include <boost/algorithm/string/classification.hpp>
-#include <boost/lexical_cast.hpp>
-
-// simple reader of .ini files
-class IniReader {
-public:
- IniReader(const std::string &file)
- {
- std::ifstream inStr(file.c_str());
- if (! inStr.is_open())
- throw std::runtime_error("Failed to open file " + file);
-
- std::string section = "";
- std::string line;
- while (getline(inStr, line)) {
- if (line.empty() || line[0] == ';' || line[0] == '#') {
- // empty line or comment, do nothing
- } else if (line[0] == '[') {
- // new section
- section = line.substr(1, line.size() - 2);
- } else {
- std::vector<std::string> cols;
- boost::split(cols, line, boost::is_any_of("="));
- std::for_each(cols.begin(), cols.end(),
- boost::bind(&boost::trim<std::string>, _1, std::locale()));
- if (section.empty())
- throw std::runtime_error("Missing section");
- if (cols.size() != 2)
- throw std::runtime_error("Failed to parse line: '" + line + "'");
- std::string key = section + "." + cols[0];
- properties[key] = cols[1];
- }
- }
- inStr.close();
- }
-
- template <class T>
- T Get(const std::string &key, T defaultValue)
- {
- std::map<std::string, std::string>::const_iterator it = properties.find(key);
- return (it == properties.end()) ? defaultValue : boost::lexical_cast<T>(it->second);
- }
-
-private:
- std::map<std::string, std::string> properties;
-};
-
-#endif // moses_iniReader_h