diff options
author | Hieu Hoang <hieuhoang@gmail.com> | 2016-10-05 18:43:04 +0300 |
---|---|---|
committer | Hieu Hoang <hieuhoang@gmail.com> | 2016-10-05 18:43:04 +0300 |
commit | 7d7ae1b72ca6487cd50dba6d20d0ba4a4b08b782 (patch) | |
tree | 49473051b7181a047920836c6c8abfbf59453956 | |
parent | 0e4e64b26dd3b82a0dfbfe2445f89e1dcbbdf61a (diff) |
add StoreVocab
-rw-r--r-- | moses/TranslationModel/ProbingPT/StoreTarget.cpp | 266 | ||||
-rw-r--r-- | moses/TranslationModel/ProbingPT/StoreTarget.h | 51 | ||||
-rw-r--r-- | moses/TranslationModel/ProbingPT/StoreVocab.cpp | 13 | ||||
-rw-r--r-- | moses/TranslationModel/ProbingPT/StoreVocab.h | 64 |
4 files changed, 394 insertions, 0 deletions
diff --git a/moses/TranslationModel/ProbingPT/StoreTarget.cpp b/moses/TranslationModel/ProbingPT/StoreTarget.cpp new file mode 100644 index 000000000..8072f408b --- /dev/null +++ b/moses/TranslationModel/ProbingPT/StoreTarget.cpp @@ -0,0 +1,266 @@ +/* + * StoreTarget.cpp + * + * Created on: 19 Jan 2016 + * Author: hieu + */ +#include <boost/foreach.hpp> +#include "StoreTarget.h" +#include "line_splitter.hh" +#include "probing_hash_utils.hh" +#include "moses/OutputFileStream.h" +#include "moses/Util.h" + +using namespace std; + +namespace Moses +{ + +StoreTarget::StoreTarget(const std::string &basepath) +:m_basePath(basepath) +,m_vocab(basepath + "/TargetVocab.dat") +{ + std::string path = basepath + "/TargetColl.dat"; + m_fileTargetColl.open(path.c_str(), + std::ios::out | std::ios::binary | std::ios::ate | std::ios::trunc); + if (!m_fileTargetColl.is_open()) { + throw "can't create file "; + } + +} + +StoreTarget::~StoreTarget() +{ + assert(m_coll.empty()); + m_fileTargetColl.close(); + + // vocab + m_vocab.Save(); +} + +uint64_t StoreTarget::Save() +{ + uint64_t ret = m_fileTargetColl.tellp(); + + // save to disk + uint64_t numTP = m_coll.size(); + m_fileTargetColl.write((char*) &numTP, sizeof(uint64_t)); + + for (size_t i = 0; i < m_coll.size(); ++i) { + Save(*m_coll[i]); + } + + // clear coll + RemoveAllInColl(m_coll); + m_coll.clear(); + + // starting position of coll + return ret; +} + +void StoreTarget::Save(const target_text &rule) +{ + // metadata for each tp + TargetPhraseInfo tpInfo; + tpInfo.alignTerm = GetAlignId(rule.word_align_term); + tpInfo.alignNonTerm = GetAlignId(rule.word_align_non_term); + tpInfo.numWords = rule.target_phrase.size(); + tpInfo.propLength = rule.property.size(); + + //cerr << "TPInfo=" << sizeof(TPInfo); + m_fileTargetColl.write((char*) &tpInfo, sizeof(TargetPhraseInfo)); + + // scores + for (size_t i = 0; i < rule.prob.size(); ++i) { + float prob = rule.prob[i]; + m_fileTargetColl.write((char*) &prob, sizeof(prob)); + } + + // tp + for (size_t i = 0; i < rule.target_phrase.size(); ++i) { + uint32_t vocabId = rule.target_phrase[i]; + m_fileTargetColl.write((char*) &vocabId, sizeof(vocabId)); + } + + // prop TODO + +} + +void StoreTarget::SaveAlignment() +{ + std::string path = m_basePath + "/Alignments.dat"; + OutputFileStream file(path); + + BOOST_FOREACH(Alignments::value_type &valPair, m_aligns) { + file << valPair.second << "\t"; + + const std::vector<size_t> &aligns = valPair.first; + BOOST_FOREACH(size_t align, aligns) { + file << align << " "; + } + file << endl; + } + +} + +void StoreTarget::Append(const line_text &line, bool log_prob, bool scfg) +{ + target_text *rule = new target_text; + //cerr << "line.target_phrase=" << line.target_phrase << endl; + + // target_phrase + vector<bool> nonTerms; + util::TokenIter<util::SingleCharacter> it; + it = util::TokenIter<util::SingleCharacter>(line.target_phrase, + util::SingleCharacter(' ')); + while (it) { + StringPiece word = *it; + //cerr << "word=" << word << endl; + + bool nonTerm = false; + if (scfg) { + // not really sure how to handle factored SCFG and NT + if (scfg && word[0] == '[' && word[word.size() - 1] == ']') { + //cerr << "NON-TERM=" << tok << " " << nonTerms.size() << endl; + nonTerm = true; + } + nonTerms.push_back(nonTerm); + } + + util::TokenIter<util::SingleCharacter> itFactor; + itFactor = util::TokenIter<util::SingleCharacter>(word, + util::SingleCharacter('|')); + while (itFactor) { + StringPiece factor = *itFactor; + + string factorStr = factor.as_string(); + uint32_t vocabId = m_vocab.GetVocabId(factorStr); + + rule->target_phrase.push_back(vocabId); + + itFactor++; + } + + it++; + } + + // probs + it = util::TokenIter<util::SingleCharacter>(line.prob, + util::SingleCharacter(' ')); + while (it) { + string tok = it->as_string(); + float prob = Scan<float>(tok); + + if (log_prob) { + prob = FloorScore(log(prob)); + if (prob == 0.0f) prob = 0.0000000001; + } + + rule->prob.push_back(prob); + it++; + } + + /* + cerr << "nonTerms="; + for (size_t i = 0; i < nonTerms.size(); ++i) { + cerr << nonTerms[i] << " "; + } + cerr << endl; + */ + + // alignment + it = util::TokenIter<util::SingleCharacter>(line.word_align, + util::SingleCharacter(' ')); + while (it) { + string tokPair = Trim(it->as_string()); + if (tokPair.empty()) { + break; + } + + vector<size_t> alignPair = Tokenize<size_t>(tokPair, "-"); + assert(alignPair.size() == 2); + + bool nonTerm = false; + size_t sourcePos = alignPair[0]; + size_t targetPos = alignPair[1]; + if (scfg) { + nonTerm = nonTerms[targetPos]; + } + + //cerr << targetPos << "=" << nonTerm << endl; + + if (nonTerm) { + rule->word_align_non_term.push_back(sourcePos); + rule->word_align_non_term.push_back(targetPos); + //cerr << (int) rule->word_all1.back() << " "; + } + else { + rule->word_align_term.push_back(sourcePos); + rule->word_align_term.push_back(targetPos); + } + + it++; + } + + // extra scores + string prop = line.property.as_string(); + AppendLexRO(prop, rule->prob, log_prob); + + //cerr << "line.property=" << line.property << endl; + //cerr << "prop=" << prop << endl; + + // properties + /* + for (size_t i = 0; i < prop.size(); ++i) { + rule->property.push_back(prop[i]); + } + */ + m_coll.push_back(rule); +} + +uint32_t StoreTarget::GetAlignId(const std::vector<size_t> &align) +{ + boost::unordered_map<std::vector<size_t>, uint32_t>::iterator iter = + m_aligns.find(align); + if (iter == m_aligns.end()) { + uint32_t ind = m_aligns.size(); + m_aligns[align] = ind; + return ind; + } + else { + return iter->second; + } +} + +void StoreTarget::AppendLexRO(std::string &prop, std::vector<float> &retvector, + bool log_prob) const +{ + size_t startPos = prop.find("{{LexRO "); + + if (startPos != string::npos) { + size_t endPos = prop.find("}}", startPos + 8); + string lexProb = prop.substr(startPos + 8, endPos - startPos - 8); + //cerr << "lexProb=" << lexProb << endl; + + // append lex probs to pt probs + vector<float> scores = Tokenize<float>(lexProb); + + if (log_prob) { + for (size_t i = 0; i < scores.size(); ++i) { + scores[i] = FloorScore(log(scores[i])); + if (scores[i] == 0.0f) scores[i] = 0.0000000001; + } + } + + for (size_t i = 0; i < scores.size(); ++i) { + retvector.push_back(scores[i]); + } + + // exclude LexRO property from property column + prop = prop.substr(0, startPos) + + prop.substr(endPos + 2, prop.size() - endPos - 2); + //cerr << "line.property_to_be_binarized=" << line.property_to_be_binarized << "AAAA" << endl; + } +} + +} /* namespace Moses2 */ diff --git a/moses/TranslationModel/ProbingPT/StoreTarget.h b/moses/TranslationModel/ProbingPT/StoreTarget.h new file mode 100644 index 000000000..5c7d9e1b7 --- /dev/null +++ b/moses/TranslationModel/ProbingPT/StoreTarget.h @@ -0,0 +1,51 @@ +/* + * StoreTarget.h + * + * Created on: 19 Jan 2016 + * Author: hieu + */ +#pragma once +#include <string> +#include <fstream> +#include <vector> +#include <inttypes.h> +#include <boost/unordered_map.hpp> +#include <boost/unordered_set.hpp> +#include "StoreVocab.h" + +namespace Moses +{ + +class line_text; +class target_text; + +class StoreTarget +{ +public: + StoreTarget(const std::string &basepath); + virtual ~StoreTarget(); + + uint64_t Save(); + void SaveAlignment(); + + void Append(const line_text &line, bool log_prob, bool scfg); +protected: + std::string m_basePath; + std::fstream m_fileTargetColl; + StoreVocab<uint32_t> m_vocab; + + typedef boost::unordered_map<std::vector<size_t>, uint32_t> Alignments; + Alignments m_aligns; + + std::vector<target_text*> m_coll; + + uint32_t GetAlignId(const std::vector<size_t> &align); + void Save(const target_text &rule); + + void AppendLexRO(std::string &prop, std::vector<float> &retvector, + bool log_prob) const; + +}; + +} /* namespace Moses2 */ + diff --git a/moses/TranslationModel/ProbingPT/StoreVocab.cpp b/moses/TranslationModel/ProbingPT/StoreVocab.cpp new file mode 100644 index 000000000..6515bac63 --- /dev/null +++ b/moses/TranslationModel/ProbingPT/StoreVocab.cpp @@ -0,0 +1,13 @@ +/* + * StoreVocab.cpp + * + * Created on: 15 Jun 2016 + * Author: hieu + */ +#include <fstream> +#include "StoreVocab.h" + +namespace Moses +{ + +} /* namespace Moses2 */ diff --git a/moses/TranslationModel/ProbingPT/StoreVocab.h b/moses/TranslationModel/ProbingPT/StoreVocab.h new file mode 100644 index 000000000..05d279f4c --- /dev/null +++ b/moses/TranslationModel/ProbingPT/StoreVocab.h @@ -0,0 +1,64 @@ +/* + * StoreVocab.h + * + * Created on: 15 Jun 2016 + * Author: hieu + */ +#pragma once +#include <string> +#include <boost/unordered_map.hpp> +#include "moses/OutputFileStream.h" +#include "moses/Util.h" + +namespace Moses +{ + +template<typename VOCABID> +class StoreVocab +{ +protected: + std::string m_path; + + typedef boost::unordered_map<std::string, VOCABID> Coll; + Coll m_vocab; + +public: + StoreVocab(const std::string &path) + :m_path(path) + {} + + virtual ~StoreVocab() {} + + VOCABID GetVocabId(const std::string &word) + { + typename Coll::iterator iter = m_vocab.find(word); + if (iter == m_vocab.end()) { + VOCABID ind = m_vocab.size() + 1; + m_vocab[word] = ind; + return ind; + } + else { + return iter->second; + } + } + + void Insert(VOCABID id, const std::string &word) + { + m_vocab[word] = id; + } + + void Save() + { + OutputFileStream strme(m_path); + + typename Coll::const_iterator iter; + for (iter = m_vocab.begin(); iter != m_vocab.end(); ++iter) { + strme << iter->first << "\t" << iter->second << std::endl; + } + + strme.Close(); + } +}; + +} /* namespace Moses2 */ + |