diff options
author | Ales Tamchyna <tamchyna@ufal.mff.cuni.cz> | 2015-01-06 19:52:19 +0300 |
---|---|---|
committer | Ales Tamchyna <tamchyna@ufal.mff.cuni.cz> | 2015-01-06 19:52:19 +0300 |
commit | 1970d46706fe80b917e385dfc0b8e7e91b5d62c6 (patch) | |
tree | b0e8cfbd22d6593c7cca7df1acf0b498f895ce4a /vw | |
parent | 887392b8c2b74ea3819685e6fdc639e312e13965 (diff) |
remove legacy files
Diffstat (limited to 'vw')
-rw-r--r-- | vw/ExtractorConfig.cpp | 51 | ||||
-rw-r--r-- | vw/ExtractorConfig.h | 57 | ||||
-rw-r--r-- | vw/FeatureExtractor.cpp | 229 | ||||
-rw-r--r-- | vw/FeatureExtractor.h | 88 | ||||
-rw-r--r-- | vw/IniReader.h | 61 |
5 files changed, 0 insertions, 486 deletions
diff --git a/vw/ExtractorConfig.cpp b/vw/ExtractorConfig.cpp deleted file mode 100644 index 27bc570ba..000000000 --- a/vw/ExtractorConfig.cpp +++ /dev/null @@ -1,51 +0,0 @@ -#include "ExtractorConfig.h" -#include "Util.h" - -#include <exception> -#include <stdexcept> -#include <algorithm> -#include <set> - -using namespace std; -using namespace boost::bimaps; -using namespace Moses; - -namespace Discriminative -{ - -void ExtractorConfig::Load(const string &configFile) -{ - try { - IniReader reader(configFile); - m_sourceInternal = reader.Get<bool>("features.source-internal", false); - m_sourceExternal = reader.Get<bool>("features.source-external", false); - m_targetInternal = reader.Get<bool>("features.target-internal", false); - m_sourceIndicator = reader.Get<bool>("features.source-indicator", false); - m_targetIndicator = reader.Get<bool>("features.target-indicator", false); - m_sourceTargetIndicator = reader.Get<bool>("features.source-target-indicator", false); - m_STSE = reader.Get<bool>("features.source-target-source-external", false); - m_paired = reader.Get<bool>("features.paired", false); - m_bagOfWords = reader.Get<bool>("features.bag-of-words", false); - m_mostFrequent = reader.Get<bool>("features.most-frequent", false); - m_binnedScores = reader.Get<bool>("features.binned-scores", false); - m_sourceTopic = reader.Get<bool>("features.source-topic", false); - m_phraseFactor = reader.Get<bool>("features.phrase-factor", false); - m_windowSize = reader.Get<size_t>("features.window-size", 0); - - m_factors = Scan<size_t>(Tokenize(reader.Get<string>("features.factors", ""), ",")); - m_scoreIndexes = Scan<size_t>(Tokenize(reader.Get<string>("features.scores", ""), ",")); - m_scoreBins = Scan<float>(Tokenize(reader.Get<string>("features.score-bins", ""), ",")); - - m_vwOptsTrain = reader.Get<string>("vw-options.train", ""); - m_vwOptsPredict = reader.Get<string>("vw-options.predict", ""); - - m_normalization = reader.Get<string>("decoder.normalization", ""); - - m_isLoaded = true; - } catch (const runtime_error &err) { - cerr << "Error loading file " << configFile << ": " << err.what(); - m_isLoaded = false; - } -} - -} // namespace Discriminative diff --git a/vw/ExtractorConfig.h b/vw/ExtractorConfig.h deleted file mode 100644 index 23e136da7..000000000 --- a/vw/ExtractorConfig.h +++ /dev/null @@ -1,57 +0,0 @@ -#ifndef moses_ExtractorConfig_h -#define moses_ExtractorConfig_h - -#include <vector> -#include <string> -#include <map> -#include <boost/bimap/bimap.hpp> -#include "IniReader.h" - -namespace Discriminative -{ - -class ExtractorConfig -{ - public: - void Load(const std::string &configFile); - inline bool GetSourceExternal() const { return m_sourceExternal; } - inline bool GetSourceInternal() const { return m_sourceInternal; } - inline bool GetTargetInternal() const { return m_targetInternal; } - inline bool GetSourceIndicator() const { return m_sourceIndicator; } - inline bool GetTargetIndicator() const { return m_targetIndicator; } - inline bool GetSourceTargetIndicator() const { return m_sourceTargetIndicator; } - inline bool GetSTSE() const { return m_STSE; } - inline bool GetPhraseFactor() const { return m_phraseFactor; } - inline bool GetPaired() const { return m_paired; } - inline bool GetBagOfWords() const { return m_bagOfWords; } - inline bool GetMostFrequent() const { return m_mostFrequent; } - inline size_t GetWindowSize() const { return m_windowSize; } - inline bool GetBinnedScores() const { return m_binnedScores; } - inline bool GetSourceTopic() const { return m_sourceTopic; } - inline const std::vector<size_t> &GetFactors() const { return m_factors; } - inline const std::vector<size_t> &GetScoreIndexes() const { return m_scoreIndexes; } - inline const std::vector<float> &GetScoreBins() const { return m_scoreBins; } - inline const std::string &GetVWOptionsTrain() const { return m_vwOptsTrain; } - inline const std::string &GetVWOptionsPredict() const { return m_vwOptsPredict; } - inline const std::string &GetNormalization() const { return m_normalization; } - - inline bool IsLoaded() const { return m_isLoaded; } - - private: - // read from configuration - bool m_paired, m_bagOfWords, m_sourceExternal, - m_sourceInternal, m_targetInternal, m_mostFrequent, - m_binnedScores, m_sourceIndicator, m_targetIndicator, - m_sourceTargetIndicator, m_STSE, m_sourceTopic, m_phraseFactor; - std::string m_vwOptsPredict, m_vwOptsTrain, m_normalization; - size_t m_windowSize; - std::vector<size_t> m_factors, m_scoreIndexes; - std::vector<float> m_scoreBins; - - // internal variables - bool m_isLoaded; -}; - -} // namespace Discriminative - -#endif // moses_ExtractorConfig_h diff --git a/vw/FeatureExtractor.cpp b/vw/FeatureExtractor.cpp deleted file mode 100644 index 63a45ccc4..000000000 --- a/vw/FeatureExtractor.cpp +++ /dev/null @@ -1,229 +0,0 @@ -#include "FeatureExtractor.h" -#include "Util.h" - -using namespace std; -using namespace Moses; - -namespace Discriminative -{ - -FeatureExtractor::FeatureExtractor(const ExtractorConfig &config, bool train) - : m_config(config), m_train(train) -{ - if (! m_config.IsLoaded()) - throw logic_error("configuration file not loaded"); -} - -void FeatureExtractor::GenerateFeatures(Classifier *fc, - const ContextType &context, - size_t spanStart, - size_t spanEnd, - const vector<Translation> &translations, - vector<float> &losses) -{ - fc->SetNamespace('s', true); - - if (m_config.GetSourceExternal()) GenerateContextFeatures(context, spanStart, spanEnd, fc); - - // get words (surface forms) in source phrase - vector<string> sourceForms(spanEnd - spanStart + 1); - for (size_t i = spanStart; i <= spanEnd; i++) - sourceForms[i - spanStart] = context[i][FACTOR_FORM]; - - map<string, float> maxProbs; - if (m_config.GetMostFrequent()) maxProbs = GetMaxProb(translations); - - if (m_config.GetSourceInternal()) GenerateInternalFeatures(sourceForms, fc); - if (m_config.GetPhraseFactor()) GeneratePhraseFactorFeatures(context, spanStart, spanEnd, fc); - if (m_config.GetBagOfWords()) GenerateBagOfWordsFeatures(context, spanStart, spanEnd, FACTOR_FORM, fc); - - if (m_config.GetSourceIndicator()) GenerateIndicatorFeature(sourceForms, fc); - - vector<Translation>::const_iterator transIt = translations.begin(); - vector<float>::iterator lossIt = losses.begin(); - for (; transIt != translations.end(); transIt++, lossIt++) { - assert(lossIt != losses.end()); - fc->SetNamespace('t', false); - - // get words in target phrase - const vector<string> &targetForms = transIt->translation; - - if (m_config.GetTargetInternal()) GenerateInternalFeatures(targetForms, fc); - if (m_config.GetPaired()) GeneratePairedFeatures(sourceForms, targetForms, transIt->m_alignment, fc); - - if (m_config.GetMostFrequent()) GenerateMostFrequentFeature(transIt->m_ttableScores, maxProbs, fc); - - if (m_config.GetBinnedScores()) GenerateScoreFeatures(transIt->m_ttableScores, fc); - - // "NOT_IN_" features - if (m_config.GetBinnedScores() || m_config.GetMostFrequent()) GenerateTTableEntryFeatures(transIt->m_ttableScores, fc); - - if (m_config.GetTargetIndicator()) GenerateIndicatorFeature(targetForms, fc); - - if (m_config.GetSourceTargetIndicator()) GenerateConcatIndicatorFeature(sourceForms, targetForms, fc); - - if (m_config.GetSTSE()) GenerateSTSE(sourceForms, targetForms, context, spanStart, spanEnd, fc); - - if (m_train) { - fc->Train(SPrint(DUMMY_IDX), *lossIt); - } else { - *lossIt = fc->Predict(SPrint(DUMMY_IDX)); - } - } - fc->FinishExample(); -} - -// -// private methods -// - -string FeatureExtractor::BuildContextFeature(size_t factor, int index, const string &value) -{ - return "c^" + SPrint(factor) + "_" + SPrint(index) + "_" + value; -} - -void FeatureExtractor::GenerateContextFeatures(const ContextType &context, - size_t spanStart, - size_t spanEnd, - Classifier *fc) -{ - vector<size_t>::const_iterator factIt; - for (factIt = m_config.GetFactors().begin(); factIt != m_config.GetFactors().end(); factIt++) { - for (size_t i = 1; i <= m_config.GetWindowSize(); i++) { - string left = "<s>"; - string right = "</s>"; - if (spanStart >= i) - left = context[spanStart - i][*factIt]; - fc->AddFeature(BuildContextFeature(*factIt, -i, left)); - if (spanEnd + i < context.size()) - right = context[spanEnd + i][*factIt]; - fc->AddFeature(BuildContextFeature(*factIt, i, right)); - } - } -} - -void FeatureExtractor::GenerateIndicatorFeature(const vector<string> &span, Classifier *fc) -{ - fc->AddFeature("p^" + Join("_", span)); -} - -void FeatureExtractor::GenerateConcatIndicatorFeature(const vector<string> &span1, const vector<string> &span2, Classifier *fc) -{ - fc->AddFeature("p^" + Join("_", span1) + "^" + Join("_", span2)); -} - -void FeatureExtractor::GenerateSTSE(const vector<string> &span1, const vector<string> &span2, - const ContextType &context, - size_t spanStart, - size_t spanEnd, - Classifier *fc) -{ - vector<size_t>::const_iterator factIt; - for (factIt = m_config.GetFactors().begin(); factIt != m_config.GetFactors().end(); factIt++) { - for (size_t i = 1; i <= m_config.GetWindowSize(); i++) { - string left = "<s>"; - string right = "</s>"; - if (spanStart >= i) - left = context[spanStart - i][*factIt]; - fc->AddFeature("stse^" + Join("_", span1) + "^" + Join("_", span2) + BuildContextFeature(*factIt, -i, left)); - if (spanEnd + i < context.size()) - right = context[spanEnd + i][*factIt]; - fc->AddFeature("stse^" + Join("_", span1) + "^" + Join("_", span2) + BuildContextFeature(*factIt, i, right)); - } - } -} - -void FeatureExtractor::GenerateInternalFeatures(const vector<string> &span, Classifier *fc) -{ - vector<string>::const_iterator it; - for (it = span.begin(); it != span.end(); it++) { - fc->AddFeature("w^" + *it); - } -} - -void FeatureExtractor::GenerateBagOfWordsFeatures(const ContextType &context, size_t spanStart, size_t spanEnd, size_t factorID, Classifier *fc) -{ - for (size_t i = 0; i < spanStart; i++) - fc->AddFeature("bow^" + context[i][factorID]); - for (size_t i = spanEnd + 1; i < context.size(); i++) - fc->AddFeature("bow^" + context[i][factorID]); -} - -void FeatureExtractor::GeneratePhraseFactorFeatures(const ContextType &context, size_t spanStart, size_t spanEnd, Classifier *fc) -{ - for (size_t i = spanStart; i <= spanEnd; i++) { - vector<size_t>::const_iterator factIt; - for (factIt = m_config.GetFactors().begin(); factIt != m_config.GetFactors().end(); factIt++) { - fc->AddFeature("ibow^" + SPrint(*factIt) + "_" + context[i][*factIt]); - } - } -} - -void FeatureExtractor::GeneratePairedFeatures(const vector<string> &srcPhrase, const vector<string> &tgtPhrase, - const AlignmentType &align, Classifier *fc) -{ - AlignmentType::const_iterator it; - set<size_t> srcAligned; - set<size_t> tgtAligned; - - for (it = align.begin(); it != align.end(); it++) { - fc->AddFeature("pair^" + srcPhrase[it->first] + "^" + tgtPhrase[it->second]); - srcAligned.insert(it->first); - tgtAligned.insert(it->second); - } - - for (size_t i = 0; i < srcPhrase.size(); i++) { - if (srcAligned.count(i) == 0) - fc->AddFeature("pair^" + srcPhrase[i] + "^NULL"); - } - - for (size_t i = 0; i < tgtPhrase.size(); i++) { - if (tgtAligned.count(i) == 0) - fc->AddFeature("pair^NULL^" + tgtPhrase[i]); - } -} - -void FeatureExtractor::GenerateScoreFeatures(const std::vector<TTableEntry> &ttableScores, Classifier *fc) -{ - vector<size_t>::const_iterator scoreIt; - vector<float>::const_iterator binIt; - vector<TTableEntry>::const_iterator tableIt; - const vector<size_t> &scoreIDs = m_config.GetScoreIndexes(); - const vector<float> &bins = m_config.GetScoreBins(); - - for (tableIt = ttableScores.begin(); tableIt != ttableScores.end(); tableIt++) { - if (! tableIt->m_exists) - continue; - string prefix = ttableScores.size() == 1 ? "" : tableIt->m_id + "_"; - for (scoreIt = scoreIDs.begin(); scoreIt != scoreIDs.end(); scoreIt++) { - for (binIt = bins.begin(); binIt != bins.end(); binIt++) { - float logScore = log(tableIt->m_scores[*scoreIt]); - if (logScore < *binIt || Equals(logScore, *binIt)) { - fc->AddFeature(prefix + "sc^" + SPrint<size_t>(*scoreIt) + "_" + SPrint(*binIt)); - } - } - } - } -} - -void FeatureExtractor::GenerateMostFrequentFeature(const std::vector<TTableEntry> &ttableScores, const map<string, float> &maxProbs, Classifier *fc) -{ - vector<TTableEntry>::const_iterator it; - for (it = ttableScores.begin(); it != ttableScores.end(); it++) { - if (it->m_exists && Equals(it->m_scores[P_E_F_INDEX], maxProbs.find(it->m_id)->second)) { - string prefix = ttableScores.size() == 1 ? "" : it->m_id + "_"; - fc->AddFeature(prefix + "MOST_FREQUENT"); - } - } -} - -void FeatureExtractor::GenerateTTableEntryFeatures(const std::vector<TTableEntry> &ttableScores, Classifier *fc) -{ - vector<TTableEntry>::const_iterator it; - for (it = ttableScores.begin(); it != ttableScores.end(); it++) { - if (! it->m_exists) - fc->AddFeature("NOT_IN_" + it->m_id); - } -} - -} // namespace Discriminative diff --git a/vw/FeatureExtractor.h b/vw/FeatureExtractor.h deleted file mode 100644 index 2edbd331e..000000000 --- a/vw/FeatureExtractor.h +++ /dev/null @@ -1,88 +0,0 @@ -#ifndef moses_FeatureExtractor_h -#define moses_FeatureExtractor_h - -#include "Classifier.h" -#include "ExtractorConfig.h" - -#include <vector> -#include <string> -#include <exception> -#include <stdexcept> -#include <map> - -namespace Discriminative -{ - -// label index passed to the classifier, this value is not used in our setting -const int DUMMY_IDX = 1111; - -// vector of words, each word is a vector of factors -typedef std::vector<std::vector<std::string> > ContextType; - -typedef std::multimap<size_t, size_t> AlignmentType; - -// In DA scenario, there are multiple phrase tables. This struct -// contains scores for a phrase in one phrase-table. -struct TTableEntry -{ - std::string m_id; // phrase-table identifier - bool m_exists; // does translation exist in this table - std::vector<float> m_scores; // translation scores (empty if m_exists == false) -}; - -// One translation (phrase target side). -struct Translation -{ - std::vector<std::string> translation; // words (surface forms) of translation - AlignmentType m_alignment; // phrase-internal word alignment - std::vector<TTableEntry> m_ttableScores; // phrase scores in each phrase table -}; - -// extract features -class FeatureExtractor -{ -public: - FeatureExtractor(const ExtractorConfig &config, bool train); - - // Generate features for current source phrase and all its translation options, based on - // configuration. Calls all auxiliary Generate* methods. - // - // In training, reads the &losses parameter and passes them to VW. In prediction, &losses is - // an output variable where VW scores are written. - void GenerateFeatures(Classifier *fc, - const ContextType &context, - size_t spanStart, - size_t spanEnd, - const std::vector<Translation> &translations, - std::vector<float> &losses); - -private: - const ExtractorConfig &m_config; // Configuration of features. - bool m_train; // Train or predict. - - // Get the highest probability P(e|f) associated with any of the translation options, - // separately for each phrase table (string keys are phrase-table IDs). - std::map<std::string, float> GetMaxProb(const std::vector<Translation> &translations); - - void GenerateContextFeatures(const ContextType &context, size_t spanStart, size_t spanEnd, Classifier *fc); - void GeneratePhraseFactorFeatures(const ContextType &context, size_t spanStart, size_t spanEnd, Classifier *fc); - void GenerateInternalFeatures(const std::vector<std::string> &span, Classifier *fc); - void GenerateIndicatorFeature(const std::vector<std::string> &span, Classifier *fc); - void GenerateConcatIndicatorFeature(const std::vector<std::string> &span1, const std::vector<std::string> &span2, Classifier *fc); - void GenerateSTSE(const std::vector<std::string> &span1, const std::vector<std::string> &span2, const ContextType &context, size_t spanStart, size_t spanEnd, Classifier *fc); - void GenerateBagOfWordsFeatures(const ContextType &context, size_t spanStart, size_t spanEnd, size_t factorID, Classifier *fc); - void GeneratePairedFeatures(const std::vector<std::string> &srcPhrase, - const std::vector<std::string> &tgtPhrase, - const AlignmentType &align, - Classifier *fc); - void GenerateScoreFeatures(const std::vector<TTableEntry> &ttableScores, Classifier *fc); - void GenerateMostFrequentFeature(const std::vector<TTableEntry> &ttableScores, - const std::map<std::string, float> &maxProbs, - Classifier *fc); - void GenerateTTableEntryFeatures(const std::vector<TTableEntry> &ttableScores, Classifier *fc); - std::string BuildContextFeature(size_t factor, int index, const std::string &value); -}; - -} // namespace Discriminative - -#endif // moses_FeatureExtractor_h diff --git a/vw/IniReader.h b/vw/IniReader.h deleted file mode 100644 index 528491c8a..000000000 --- a/vw/IniReader.h +++ /dev/null @@ -1,61 +0,0 @@ -#ifndef moses_iniReader_h -#define moses_iniReader_h - -#include <vector> -#include <algorithm> -#include <string> -#include <fstream> -#include <map> -#include <exception> -#include <stdexcept> - -#include <boost/algorithm/string.hpp> -#include <boost/bind.hpp> -#include <boost/algorithm/string/classification.hpp> -#include <boost/lexical_cast.hpp> - -// simple reader of .ini files -class IniReader { -public: - IniReader(const std::string &file) - { - std::ifstream inStr(file.c_str()); - if (! inStr.is_open()) - throw std::runtime_error("Failed to open file " + file); - - std::string section = ""; - std::string line; - while (getline(inStr, line)) { - if (line.empty() || line[0] == ';' || line[0] == '#') { - // empty line or comment, do nothing - } else if (line[0] == '[') { - // new section - section = line.substr(1, line.size() - 2); - } else { - std::vector<std::string> cols; - boost::split(cols, line, boost::is_any_of("=")); - std::for_each(cols.begin(), cols.end(), - boost::bind(&boost::trim<std::string>, _1, std::locale())); - if (section.empty()) - throw std::runtime_error("Missing section"); - if (cols.size() != 2) - throw std::runtime_error("Failed to parse line: '" + line + "'"); - std::string key = section + "." + cols[0]; - properties[key] = cols[1]; - } - } - inStr.close(); - } - - template <class T> - T Get(const std::string &key, T defaultValue) - { - std::map<std::string, std::string>::const_iterator it = properties.find(key); - return (it == properties.end()) ? defaultValue : boost::lexical_cast<T>(it->second); - } - -private: - std::map<std::string, std::string> properties; -}; - -#endif // moses_iniReader_h |