diff options
author | Hieu Hoang <hieuhoang@gmail.com> | 2017-02-16 14:30:39 +0300 |
---|---|---|
committer | Hieu Hoang <hieuhoang@gmail.com> | 2017-02-16 14:30:39 +0300 |
commit | a391b84b4275d90cabd2bf9d1734ac2c10c9e8bd (patch) | |
tree | 1a61ee76d46308fa4150f14d4da54f82cf7361df /moses | |
parent | 07cef43cea1d3b7a542eab817718a93b74c68ebc (diff) |
moses and moses2 both use probingpt lib
Diffstat (limited to 'moses')
20 files changed, 17 insertions, 1413 deletions
diff --git a/moses/Jamfile b/moses/Jamfile index 49aab9025..5200029fb 100644 --- a/moses/Jamfile +++ b/moses/Jamfile @@ -122,10 +122,10 @@ vwfiles synlm mmlib mserver headers FF_Factory.o LM//LM TranslationModel/CompactPT//CompactPT -TranslationModel/ProbingPT//ProbingPT ThreadPool ..//search ../util/double-conversion//double-conversion +../probingpt//probingpt ..//z ../OnDiskPt//OnDiskPt $(TOP)//boost_filesystem @@ -139,5 +139,5 @@ alias headers-to-install : [ glob-tree *.h ] ; import testing ; -unit-test moses_test : [ glob *Test.cpp Mock*.cpp FF/*Test.cpp ] ..//boost_filesystem moses headers ..//z ../OnDiskPt//OnDiskPt ..//boost_unit_test_framework ; +unit-test moses_test : [ glob *Test.cpp Mock*.cpp FF/*Test.cpp ] ..//boost_filesystem moses headers ..//z ../OnDiskPt//OnDiskPt ../probingpt//probingpt ..//boost_unit_test_framework ; diff --git a/moses/TranslationModel/ProbingPT/ProbingPT.cpp b/moses/TranslationModel/ProbingPT.cpp index 1ae0c67c3..2a7369622 100644 --- a/moses/TranslationModel/ProbingPT/ProbingPT.cpp +++ b/moses/TranslationModel/ProbingPT.cpp @@ -5,7 +5,8 @@ #include "moses/TargetPhraseCollection.h" #include "moses/InputFileStream.h" #include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.h" -#include "querying.hh" +#include "probingpt/querying.hh" +#include "probingpt/probing_hash_utils.hh" using namespace std; @@ -14,6 +15,7 @@ namespace Moses ProbingPT::ProbingPT(const std::string &line) : PhraseDictionary(line,true) ,m_engine(NULL) + ,load_method(util::POPULATE_OR_READ) { ReadParameters(); @@ -31,7 +33,7 @@ void ProbingPT::Load(AllOptions::ptr const& opts) m_options = opts; SetFeaturesToApply(); - m_engine = new QueryEngine(m_filePath.c_str()); + m_engine = new probingpt::QueryEngine(m_filePath.c_str(), load_method); m_unkId = 456456546456; @@ -256,12 +258,12 @@ TargetPhraseCollection *ProbingPT::CreateTargetPhrases( TargetPhrase *ProbingPT::CreateTargetPhrase( const char *&offset) const { - TargetPhraseInfo *tpInfo = (TargetPhraseInfo*) offset; + probingpt::TargetPhraseInfo *tpInfo = (probingpt::TargetPhraseInfo*) offset; size_t numRealWords = tpInfo->numWords / m_output.size(); TargetPhrase *tp = new TargetPhrase(this); - offset += sizeof(TargetPhraseInfo); + offset += sizeof(probingpt::TargetPhraseInfo); // scores float *scores = (float*) offset; diff --git a/moses/TranslationModel/ProbingPT/ProbingPT.h b/moses/TranslationModel/ProbingPT.h index 953a2dc2f..bdf5a3bda 100644 --- a/moses/TranslationModel/ProbingPT/ProbingPT.h +++ b/moses/TranslationModel/ProbingPT.h @@ -3,16 +3,20 @@ #include <boost/iostreams/device/mapped_file.hpp> #include <boost/bimap.hpp> #include <boost/unordered_map.hpp> -#include "../PhraseDictionary.h" +#include "PhraseDictionary.h" +#include "util/mmap.hh" +namespace probingpt +{ +class QueryEngine; +class target_text; +} namespace Moses { class ChartParser; class ChartCellCollectionBase; class ChartRuleLookupManager; -class QueryEngine; -class target_text; class ProbingPT : public PhraseDictionary { @@ -39,12 +43,13 @@ public: protected: - QueryEngine *m_engine; + probingpt::QueryEngine *m_engine; uint64_t m_unkId; std::vector<uint64_t> m_sourceVocab; // factor id -> pt id std::vector<const Factor*> m_targetVocab; // pt id -> factor* std::vector<const AlignmentInfo*> m_aligns; + util::LoadMethod load_method; boost::iostreams::mapped_file_source file; const char *data; diff --git a/moses/TranslationModel/ProbingPT/Jamfile b/moses/TranslationModel/ProbingPT/Jamfile deleted file mode 100644 index 29c6ec41d..000000000 --- a/moses/TranslationModel/ProbingPT/Jamfile +++ /dev/null @@ -1,8 +0,0 @@ -local current = "" ; -local includes = ; - -fakelib ProbingPT : [ glob *.cpp ] ../..//headers : $(includes) <dependency>$(PT-LOG) : : $(includes) ; - -path-constant PT-LOG : bin/pt.log ; -update-if-changed $(PT-LOG) $(current) ; - diff --git a/moses/TranslationModel/ProbingPT/StoreTarget.cpp b/moses/TranslationModel/ProbingPT/StoreTarget.cpp deleted file mode 100644 index f586a26b9..000000000 --- a/moses/TranslationModel/ProbingPT/StoreTarget.cpp +++ /dev/null @@ -1,264 +0,0 @@ -/* - * StoreTarget.cpp - * - * Created on: 19 Jan 2016 - * Author: hieu - */ -#include <boost/foreach.hpp> -#include "StoreTarget.h" -#include "line_splitter.hh" -#include "probing_hash_utils.hh" -#include "moses/OutputFileStream.h" -#include "moses/Util.h" - -using namespace std; - -namespace Moses -{ - -StoreTarget::StoreTarget(const std::string &basepath) - :m_basePath(basepath) - ,m_vocab(basepath + "/TargetVocab.dat") -{ - std::string path = basepath + "/TargetColl.dat"; - m_fileTargetColl.open(path.c_str(), - std::ios::out | std::ios::binary | std::ios::ate | std::ios::trunc); - if (!m_fileTargetColl.is_open()) { - throw "can't create file "; - } - -} - -StoreTarget::~StoreTarget() -{ - assert(m_coll.empty()); - m_fileTargetColl.close(); - - // vocab - m_vocab.Save(); -} - -uint64_t StoreTarget::Save() -{ - uint64_t ret = m_fileTargetColl.tellp(); - - // save to disk - uint64_t numTP = m_coll.size(); - m_fileTargetColl.write((char*) &numTP, sizeof(uint64_t)); - - for (size_t i = 0; i < m_coll.size(); ++i) { - Save(*m_coll[i]); - } - - // clear coll - RemoveAllInColl(m_coll); - m_coll.clear(); - - // starting position of coll - return ret; -} - -void StoreTarget::Save(const target_text &rule) -{ - // metadata for each tp - TargetPhraseInfo tpInfo; - tpInfo.alignTerm = GetAlignId(rule.word_align_term); - tpInfo.alignNonTerm = GetAlignId(rule.word_align_non_term); - tpInfo.numWords = rule.target_phrase.size(); - tpInfo.propLength = rule.property.size(); - - //cerr << "TPInfo=" << sizeof(TPInfo); - m_fileTargetColl.write((char*) &tpInfo, sizeof(TargetPhraseInfo)); - - // scores - for (size_t i = 0; i < rule.prob.size(); ++i) { - float prob = rule.prob[i]; - m_fileTargetColl.write((char*) &prob, sizeof(prob)); - } - - // tp - for (size_t i = 0; i < rule.target_phrase.size(); ++i) { - uint32_t vocabId = rule.target_phrase[i]; - m_fileTargetColl.write((char*) &vocabId, sizeof(vocabId)); - } - - // prop TODO - -} - -void StoreTarget::SaveAlignment() -{ - std::string path = m_basePath + "/Alignments.dat"; - OutputFileStream file(path); - - BOOST_FOREACH(Alignments::value_type &valPair, m_aligns) { - file << valPair.second << "\t"; - - const std::vector<size_t> &aligns = valPair.first; - BOOST_FOREACH(size_t align, aligns) { - file << align << " "; - } - file << endl; - } - -} - -void StoreTarget::Append(const line_text &line, bool log_prob, bool scfg) -{ - target_text *rule = new target_text; - //cerr << "line.target_phrase=" << line.target_phrase << endl; - - // target_phrase - vector<bool> nonTerms; - util::TokenIter<util::SingleCharacter> it; - it = util::TokenIter<util::SingleCharacter>(line.target_phrase, - util::SingleCharacter(' ')); - while (it) { - StringPiece word = *it; - //cerr << "word=" << word << endl; - - bool nonTerm = false; - if (scfg) { - // not really sure how to handle factored SCFG and NT - if (scfg && word[0] == '[' && word[word.size() - 1] == ']') { - //cerr << "NON-TERM=" << tok << " " << nonTerms.size() << endl; - nonTerm = true; - } - nonTerms.push_back(nonTerm); - } - - util::TokenIter<util::SingleCharacter> itFactor; - itFactor = util::TokenIter<util::SingleCharacter>(word, - util::SingleCharacter('|')); - while (itFactor) { - StringPiece factor = *itFactor; - - string factorStr = factor.as_string(); - uint32_t vocabId = m_vocab.GetVocabId(factorStr); - - rule->target_phrase.push_back(vocabId); - - itFactor++; - } - - it++; - } - - // probs - it = util::TokenIter<util::SingleCharacter>(line.prob, - util::SingleCharacter(' ')); - while (it) { - string tok = it->as_string(); - float prob = Scan<float>(tok); - - if (log_prob) { - prob = FloorScore(log(prob)); - if (prob == 0.0f) prob = 0.0000000001; - } - - rule->prob.push_back(prob); - it++; - } - - /* - cerr << "nonTerms="; - for (size_t i = 0; i < nonTerms.size(); ++i) { - cerr << nonTerms[i] << " "; - } - cerr << endl; - */ - - // alignment - it = util::TokenIter<util::SingleCharacter>(line.word_align, - util::SingleCharacter(' ')); - while (it) { - string tokPair = Trim(it->as_string()); - if (tokPair.empty()) { - break; - } - - vector<size_t> alignPair = Tokenize<size_t>(tokPair, "-"); - assert(alignPair.size() == 2); - - bool nonTerm = false; - size_t sourcePos = alignPair[0]; - size_t targetPos = alignPair[1]; - if (scfg) { - nonTerm = nonTerms[targetPos]; - } - - //cerr << targetPos << "=" << nonTerm << endl; - - if (nonTerm) { - rule->word_align_non_term.push_back(sourcePos); - rule->word_align_non_term.push_back(targetPos); - //cerr << (int) rule->word_all1.back() << " "; - } else { - rule->word_align_term.push_back(sourcePos); - rule->word_align_term.push_back(targetPos); - } - - it++; - } - - // extra scores - string prop = line.property.as_string(); - AppendLexRO(prop, rule->prob, log_prob); - - //cerr << "line.property=" << line.property << endl; - //cerr << "prop=" << prop << endl; - - // properties - /* - for (size_t i = 0; i < prop.size(); ++i) { - rule->property.push_back(prop[i]); - } - */ - m_coll.push_back(rule); -} - -uint32_t StoreTarget::GetAlignId(const std::vector<size_t> &align) -{ - boost::unordered_map<std::vector<size_t>, uint32_t>::iterator iter = - m_aligns.find(align); - if (iter == m_aligns.end()) { - uint32_t ind = m_aligns.size(); - m_aligns[align] = ind; - return ind; - } else { - return iter->second; - } -} - -void StoreTarget::AppendLexRO(std::string &prop, std::vector<float> &retvector, - bool log_prob) const -{ - size_t startPos = prop.find("{{LexRO "); - - if (startPos != string::npos) { - size_t endPos = prop.find("}}", startPos + 8); - string lexProb = prop.substr(startPos + 8, endPos - startPos - 8); - //cerr << "lexProb=" << lexProb << endl; - - // append lex probs to pt probs - vector<float> scores = Tokenize<float>(lexProb); - - if (log_prob) { - for (size_t i = 0; i < scores.size(); ++i) { - scores[i] = FloorScore(log(scores[i])); - if (scores[i] == 0.0f) scores[i] = 0.0000000001; - } - } - - for (size_t i = 0; i < scores.size(); ++i) { - retvector.push_back(scores[i]); - } - - // exclude LexRO property from property column - prop = prop.substr(0, startPos) - + prop.substr(endPos + 2, prop.size() - endPos - 2); - //cerr << "line.property_to_be_binarized=" << line.property_to_be_binarized << "AAAA" << endl; - } -} - -} /* namespace Moses2 */ diff --git a/moses/TranslationModel/ProbingPT/StoreTarget.h b/moses/TranslationModel/ProbingPT/StoreTarget.h deleted file mode 100644 index 331c197b3..000000000 --- a/moses/TranslationModel/ProbingPT/StoreTarget.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * StoreTarget.h - * - * Created on: 19 Jan 2016 - * Author: hieu - */ -#pragma once -#include <string> -#include <fstream> -#include <vector> -#include <inttypes.h> -#include <boost/unordered_map.hpp> -#include <boost/unordered_set.hpp> -#include "StoreVocab.h" - -namespace Moses -{ - -class line_text; -class target_text; - -class StoreTarget -{ -public: - StoreTarget(const std::string &basepath); - virtual ~StoreTarget(); - - uint64_t Save(); - void SaveAlignment(); - - void Append(const line_text &line, bool log_prob, bool scfg); -protected: - std::string m_basePath; - std::fstream m_fileTargetColl; - StoreVocab<uint32_t> m_vocab; - - typedef boost::unordered_map<std::vector<size_t>, uint32_t> Alignments; - Alignments m_aligns; - - std::vector<target_text*> m_coll; - - uint32_t GetAlignId(const std::vector<size_t> &align); - void Save(const target_text &rule); - - void AppendLexRO(std::string &prop, std::vector<float> &retvector, - bool log_prob) const; - -}; - -} /* namespace Moses2 */ - diff --git a/moses/TranslationModel/ProbingPT/StoreVocab.cpp b/moses/TranslationModel/ProbingPT/StoreVocab.cpp deleted file mode 100644 index 6515bac63..000000000 --- a/moses/TranslationModel/ProbingPT/StoreVocab.cpp +++ /dev/null @@ -1,13 +0,0 @@ -/* - * StoreVocab.cpp - * - * Created on: 15 Jun 2016 - * Author: hieu - */ -#include <fstream> -#include "StoreVocab.h" - -namespace Moses -{ - -} /* namespace Moses2 */ diff --git a/moses/TranslationModel/ProbingPT/StoreVocab.h b/moses/TranslationModel/ProbingPT/StoreVocab.h deleted file mode 100644 index 806dcebf4..000000000 --- a/moses/TranslationModel/ProbingPT/StoreVocab.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * StoreVocab.h - * - * Created on: 15 Jun 2016 - * Author: hieu - */ -#pragma once -#include <string> -#include <boost/unordered_map.hpp> -#include "moses/OutputFileStream.h" -#include "moses/Util.h" - -namespace Moses -{ - -template<typename VOCABID> -class StoreVocab -{ -protected: - std::string m_path; - - typedef boost::unordered_map<std::string, VOCABID> Coll; - Coll m_vocab; - -public: - StoreVocab(const std::string &path) - :m_path(path) - {} - - virtual ~StoreVocab() {} - - VOCABID GetVocabId(const std::string &word) { - typename Coll::iterator iter = m_vocab.find(word); - if (iter == m_vocab.end()) { - VOCABID ind = m_vocab.size() + 1; - m_vocab[word] = ind; - return ind; - } else { - return iter->second; - } - } - - void Insert(VOCABID id, const std::string &word) { - m_vocab[word] = id; - } - - void Save() { - OutputFileStream strme(m_path); - - typename Coll::const_iterator iter; - for (iter = m_vocab.begin(); iter != m_vocab.end(); ++iter) { - strme << iter->first << "\t" << iter->second << std::endl; - } - - strme.Close(); - } -}; - -} /* namespace Moses2 */ - diff --git a/moses/TranslationModel/ProbingPT/hash.cpp b/moses/TranslationModel/ProbingPT/hash.cpp deleted file mode 100644 index 47242e25d..000000000 --- a/moses/TranslationModel/ProbingPT/hash.cpp +++ /dev/null @@ -1,44 +0,0 @@ -#include <iostream> -#include "hash.hh" - -using namespace std; - -namespace Moses -{ - -uint64_t getHash(StringPiece text) -{ - std::size_t len = text.size(); - uint64_t key = util::MurmurHashNative(text.data(), len); - return key; -} - -std::vector<uint64_t> getVocabIDs(const StringPiece &textin) -{ - //Tokenize - std::vector<uint64_t> output; - - util::TokenIter<util::SingleCharacter> itWord(textin, util::SingleCharacter(' ')); - - while (itWord) { - StringPiece word = *itWord; - uint64_t id = 0; - - util::TokenIter<util::SingleCharacter> itFactor(word, util::SingleCharacter('|')); - while (itFactor) { - StringPiece factor = *itFactor; - //cerr << "factor=" << factor << endl; - - id += getHash(factor); - itFactor++; - } - - output.push_back(id); - itWord++; - } - - return output; -} - -} - diff --git a/moses/TranslationModel/ProbingPT/hash.hh b/moses/TranslationModel/ProbingPT/hash.hh deleted file mode 100644 index f218ad9da..000000000 --- a/moses/TranslationModel/ProbingPT/hash.hh +++ /dev/null @@ -1,17 +0,0 @@ -#pragma once - -#include "util/string_piece.hh" -#include "util/murmur_hash.hh" -#include "util/string_piece.hh" //Tokenization and work with StringPiece -#include "util/tokenize_piece.hh" -#include <vector> - -namespace Moses -{ - -//Gets the MurmurmurHash for give string -uint64_t getHash(StringPiece text); - -std::vector<uint64_t> getVocabIDs(const StringPiece &textin); - -} diff --git a/moses/TranslationModel/ProbingPT/line_splitter.cpp b/moses/TranslationModel/ProbingPT/line_splitter.cpp deleted file mode 100644 index cb9e47fec..000000000 --- a/moses/TranslationModel/ProbingPT/line_splitter.cpp +++ /dev/null @@ -1,103 +0,0 @@ -#include "line_splitter.hh" - -namespace Moses -{ - -line_text splitLine(const StringPiece &textin, bool scfg) -{ - const char delim[] = "|||"; - line_text output; - - //Tokenize - util::TokenIter<util::MultiCharacter> it(textin, util::MultiCharacter(delim)); - //Get source phrase - output.source_phrase = Trim(*it); - //std::cerr << "output.source_phrase=" << output.source_phrase << "AAAA" << std::endl; - - //Get target_phrase - it++; - output.target_phrase = Trim(*it); - //std::cerr << "output.target_phrase=" << output.target_phrase << "AAAA" << std::endl; - - if (scfg) { - /* - std::cerr << "output.source_phrase=" << output.source_phrase << std::endl; - std::cerr << "output.target_phrase=" << output.target_phrase << std::endl; - reformatSCFG(output); - std::cerr << "output.source_phrase=" << output.source_phrase << std::endl; - std::cerr << "output.target_phrase=" << output.target_phrase << std::endl; - */ - } - - //Get probabilities - it++; - output.prob = Trim(*it); - //std::cerr << "output.prob=" << output.prob << "AAAA" << std::endl; - - //Get WordAllignment - it++; - if (it == util::TokenIter<util::MultiCharacter>::end()) return output; - output.word_align = Trim(*it); - //std::cerr << "output.word_align=" << output.word_align << "AAAA" << std::endl; - - //Get count - it++; - if (it == util::TokenIter<util::MultiCharacter>::end()) return output; - output.counts = Trim(*it); - //std::cerr << "output.counts=" << output.counts << "AAAA" << std::endl; - - //Get sparse_score - it++; - if (it == util::TokenIter<util::MultiCharacter>::end()) return output; - output.sparse_score = Trim(*it); - //std::cerr << "output.sparse_score=" << output.sparse_score << "AAAA" << std::endl; - - //Get property - it++; - if (it == util::TokenIter<util::MultiCharacter>::end()) return output; - output.property = Trim(*it); - //std::cerr << "output.property=" << output.property << "AAAA" << std::endl; - - return output; -} - -std::vector<unsigned char> splitWordAll1(const StringPiece &textin) -{ - const char delim[] = " "; - const char delim2[] = "-"; - std::vector<unsigned char> output; - - //Case with no word alignments. - if (textin.size() == 0) { - return output; - } - - //Split on space - util::TokenIter<util::MultiCharacter> it(textin, util::MultiCharacter(delim)); - - //For each int - while (it) { - //Split on dash (-) - util::TokenIter<util::MultiCharacter> itInner(*it, - util::MultiCharacter(delim2)); - - //Insert the two entries in the vector. User will read entry 0 and 1 to get the first, - //2 and 3 for second etc. Use unsigned char instead of int to save space, as - //word allignments are all very small numbers that fit in a single byte - output.push_back((unsigned char) (atoi(itInner->data()))); - itInner++; - output.push_back((unsigned char) (atoi(itInner->data()))); - it++; - } - - return output; - -} - -void reformatSCFG(line_text &output) -{ - -} - -} - diff --git a/moses/TranslationModel/ProbingPT/line_splitter.hh b/moses/TranslationModel/ProbingPT/line_splitter.hh deleted file mode 100644 index 01b86fc9b..000000000 --- a/moses/TranslationModel/ProbingPT/line_splitter.hh +++ /dev/null @@ -1,57 +0,0 @@ -#pragma once - -#include "util/string_piece.hh" -#include "util/tokenize_piece.hh" -#include "util/file_piece.hh" -#include <vector> -#include <cstdlib> //atof -#include "util/string_piece.hh" //Tokenization and work with StringPiece -#include "util/tokenize_piece.hh" -#include <vector> - -namespace Moses -{ - -//Struct for holding processed line -struct line_text { - StringPiece source_phrase; - StringPiece target_phrase; - StringPiece prob; - StringPiece word_align; - StringPiece counts; - StringPiece sparse_score; - StringPiece property; - std::string property_to_be_binarized; -}; - -//Struct for holding processed line -struct target_text { - std::vector<unsigned int> target_phrase; - std::vector<float> prob; - std::vector<size_t> word_align_term; - std::vector<size_t> word_align_non_term; - std::vector<char> counts; - std::vector<char> sparse_score; - std::vector<char> property; - - /* - void Reset() - { - target_phrase.clear(); - prob.clear(); - word_all1.clear(); - counts.clear(); - sparse_score.clear(); - property.clear(); - } - */ -}; - -//Ask if it's better to have it receive a pointer to a line_text struct -line_text splitLine(const StringPiece &textin, bool scfg); -void reformatSCFG(line_text &output); - -std::vector<unsigned char> splitWordAll1(const StringPiece &textin); - -} - diff --git a/moses/TranslationModel/ProbingPT/probing_hash_utils.cpp b/moses/TranslationModel/ProbingPT/probing_hash_utils.cpp deleted file mode 100644 index f23f57d66..000000000 --- a/moses/TranslationModel/ProbingPT/probing_hash_utils.cpp +++ /dev/null @@ -1,50 +0,0 @@ -#include "probing_hash_utils.hh" - -namespace Moses -{ - -//Read table from disk, return memory map location -char * readTable(const char * filename, size_t size) -{ - //Initial position of the file is the end of the file, thus we know the size - int fd; - char * map; - - fd = open(filename, O_RDONLY); - if (fd == -1) { - perror("Error opening file for reading"); - exit(EXIT_FAILURE); - } - - map = (char *) mmap(0, size, PROT_READ, MAP_SHARED, fd, 0); - - if (map == MAP_FAILED) { - close(fd); - perror("Error mmapping the file"); - exit(EXIT_FAILURE); - } - - return map; -} - -void serialize_table(char *mem, size_t size, const std::string &filename) -{ - std::ofstream os(filename.c_str(), std::ios::binary); - os.write((const char*) &mem[0], size); - os.close(); - -} - -uint64_t getKey(const uint64_t source_phrase[], size_t size) -{ - //TOO SLOW - //uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size()); - uint64_t key = 0; - for (size_t i = 0; i < size; i++) { - key += (source_phrase[i] << i); - } - return key; -} - -} - diff --git a/moses/TranslationModel/ProbingPT/probing_hash_utils.hh b/moses/TranslationModel/ProbingPT/probing_hash_utils.hh deleted file mode 100644 index 998686b2e..000000000 --- a/moses/TranslationModel/ProbingPT/probing_hash_utils.hh +++ /dev/null @@ -1,51 +0,0 @@ -#pragma once - -#include "util/probing_hash_table.hh" - -#include <sys/mman.h> -#include <boost/functional/hash.hpp> -#include <fcntl.h> -#include <fstream> - -namespace Moses -{ - -#define API_VERSION 15 - -//Hash table entry -struct Entry { - typedef uint64_t Key; - Key key; - - Key GetKey() const { - return key; - } - - void SetKey(Key to) { - key = to; - } - - uint64_t value; -}; - -#define NONE std::numeric_limits<uint64_t>::max() - -//Define table -typedef util::ProbingHashTable<Entry, boost::hash<uint64_t> > Table; - -void serialize_table(char *mem, size_t size, const std::string &filename); - -char * readTable(const char * filename, size_t size); - -uint64_t getKey(const uint64_t source_phrase[], size_t size); - -struct TargetPhraseInfo { - uint32_t alignTerm; - uint32_t alignNonTerm; - uint16_t numWords; - uint16_t propLength; - uint16_t filler; -}; - -} - diff --git a/moses/TranslationModel/ProbingPT/querying.cpp b/moses/TranslationModel/ProbingPT/querying.cpp deleted file mode 100644 index 10c35e361..000000000 --- a/moses/TranslationModel/ProbingPT/querying.cpp +++ /dev/null @@ -1,141 +0,0 @@ -#include "querying.hh" -#include "util/exception.hh" - -using namespace std; - -namespace Moses -{ - -QueryEngine::QueryEngine(const char * filepath) -{ - - //Create filepaths - std::string basepath(filepath); - std::string path_to_config = basepath + "/config"; - std::string path_to_hashtable = basepath + "/probing_hash.dat"; - std::string path_to_source_vocabid = basepath + "/source_vocabids"; - std::string alignPath = basepath + "/Alignments.dat"; - - if (!FileExists(path_to_config)) { - UTIL_THROW2("Binary table doesn't exist is didn't finish binarizing: " << path_to_config); - } - - ///Source phrase vocabids - read_map(source_vocabids, path_to_source_vocabid.c_str()); - - // alignments - read_alignments(alignPath); - - //Read config file - boost::unordered_map<std::string, std::string> keyValue; - - std::ifstream config(path_to_config.c_str()); - std::string line; - while (getline(config, line)) { - std::vector<std::string> toks = Tokenize(line, "\t"); - UTIL_THROW_IF2(toks.size() != 2, "Wrong config format:" << line); - keyValue[ toks[0] ] = toks[1]; - } - - bool found; - //Check API version: - int version; - found = Get(keyValue, "API_VERSION", version); - if (!found) { - std::cerr << "Old or corrupted version of ProbingPT. Please rebinarize your phrase tables." << std::endl; - } else if (version != API_VERSION) { - std::cerr << "The ProbingPT API has changed. " << version << "!=" - << API_VERSION << " Please rebinarize your phrase tables." << std::endl; - exit(EXIT_FAILURE); - } - - //Get tablesize. - int tablesize; - found = Get(keyValue, "uniq_entries", tablesize); - if (!found) { - std::cerr << "uniq_entries not found" << std::endl; - exit(EXIT_FAILURE); - } - - //Number of scores - found = Get(keyValue, "num_scores", num_scores); - if (!found) { - std::cerr << "num_scores not found" << std::endl; - exit(EXIT_FAILURE); - } - - //How may scores from lex reordering models - found = Get(keyValue, "num_lex_scores", num_lex_scores); - if (!found) { - std::cerr << "num_lex_scores not found" << std::endl; - exit(EXIT_FAILURE); - } - - // have the scores been log() and FloorScore()? - found = Get(keyValue, "log_prob", logProb); - if (!found) { - std::cerr << "logProb not found" << std::endl; - exit(EXIT_FAILURE); - } - - config.close(); - - //Read hashtable - table_filesize = Table::Size(tablesize, 1.2); - mem = readTable(path_to_hashtable.c_str(), table_filesize); - Table table_init(mem, table_filesize); - table = table_init; - - std::cerr << "Initialized successfully! " << std::endl; -} - -QueryEngine::~QueryEngine() -{ - //Clear mmap content from memory. - munmap(mem, table_filesize); - -} - -uint64_t QueryEngine::getKey(uint64_t source_phrase[], size_t size) const -{ - //TOO SLOW - //uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size()); - return Moses::getKey(source_phrase, size); -} - -std::pair<bool, uint64_t> QueryEngine::query(uint64_t key) -{ - std::pair<bool, uint64_t> ret; - - const Entry * entry; - ret.first = table.Find(key, entry); - if (ret.first) { - ret.second = entry->value; - } - return ret; -} - -void QueryEngine::read_alignments(const std::string &alignPath) -{ - std::ifstream strm(alignPath.c_str()); - - string line; - while (getline(strm, line)) { - vector<string> toks = Tokenize(line, "\t "); - UTIL_THROW_IF2(toks.size() == 0, "Corrupt alignment file"); - - uint32_t alignInd = Scan<uint32_t>(toks[0]); - if (alignInd >= alignColl.size()) { - alignColl.resize(alignInd + 1); - } - - Alignments &aligns = alignColl[alignInd]; - for (size_t i = 1; i < toks.size(); ++i) { - size_t pos = Scan<size_t>(toks[i]); - aligns.push_back(pos); - } - } -} - -} - diff --git a/moses/TranslationModel/ProbingPT/querying.hh b/moses/TranslationModel/ProbingPT/querying.hh deleted file mode 100644 index 915bc4806..000000000 --- a/moses/TranslationModel/ProbingPT/querying.hh +++ /dev/null @@ -1,66 +0,0 @@ -#pragma once - -#include <boost/unordered_map.hpp> -#include <sys/stat.h> //For finding size of file -#include "vocabid.hh" -#include <algorithm> //toLower -#include <deque> -#include "probing_hash_utils.hh" -#include "hash.hh" //Includes line splitter -#include "line_splitter.hh" -#include "moses//Util.h" - -namespace Moses -{ - -class QueryEngine -{ - std::map<uint64_t, std::string> source_vocabids; - - typedef std::vector<unsigned char> Alignments; - std::vector<Alignments> alignColl; - - Table table; - char *mem; //Memory for the table, necessary so that we can correctly destroy the object - - size_t table_filesize; - bool is_reordering; - - void read_alignments(const std::string &alignPath); - -public: - int num_scores; - int num_lex_scores; - bool logProb; - - QueryEngine(const char *); - ~QueryEngine(); - - std::pair<bool, uint64_t> query(uint64_t key); - - const std::map<uint64_t, std::string> &getSourceVocab() const { - return source_vocabids; - } - - const std::vector<Alignments> &getAlignments() const { - return alignColl; - } - - uint64_t getKey(uint64_t source_phrase[], size_t size) const; - - template<typename T> - inline bool Get(const boost::unordered_map<std::string, std::string> &keyValue, const std::string &sought, T &found) const { - boost::unordered_map<std::string, std::string>::const_iterator iter = keyValue.find(sought); - if (iter == keyValue.end()) { - return false; - } - - const std::string &foundStr = iter->second; - found = Scan<T>(foundStr); - return true; - } - -}; - -} - diff --git a/moses/TranslationModel/ProbingPT/storing.cpp b/moses/TranslationModel/ProbingPT/storing.cpp deleted file mode 100644 index baf6ae91e..000000000 --- a/moses/TranslationModel/ProbingPT/storing.cpp +++ /dev/null @@ -1,298 +0,0 @@ -#include <sys/stat.h> -#include <boost/foreach.hpp> -#include "line_splitter.hh" -#include "storing.hh" -#include "StoreTarget.h" -#include "StoreVocab.h" -#include "moses/Util.h" -#include "moses/InputFileStream.h" - -using namespace std; - -namespace Moses -{ - -/////////////////////////////////////////////////////////////////////// -void Node::Add(Table &table, const SourcePhrase &sourcePhrase, size_t pos) -{ - if (pos < sourcePhrase.size()) { - uint64_t vocabId = sourcePhrase[pos]; - - Node *child; - Children::iterator iter = m_children.find(vocabId); - if (iter == m_children.end()) { - // New node. Write other children then discard them - BOOST_FOREACH(Children::value_type &valPair, m_children) { - Node &otherChild = valPair.second; - otherChild.Write(table); - } - m_children.clear(); - - // create new node - child = &m_children[vocabId]; - assert(!child->done); - child->key = key + (vocabId << pos); - } else { - child = &iter->second; - } - - child->Add(table, sourcePhrase, pos + 1); - } else { - // this node was written previously 'cos it has rules - done = true; - } -} - -void Node::Write(Table &table) -{ - //cerr << "START write " << done << " " << key << endl; - BOOST_FOREACH(Children::value_type &valPair, m_children) { - Node &child = valPair.second; - child.Write(table); - } - - if (!done) { - // save - Entry sourceEntry; - sourceEntry.value = NONE; - sourceEntry.key = key; - - //Put into table - table.Insert(sourceEntry); - } -} - -/////////////////////////////////////////////////////////////////////// -void createProbingPT(const std::string &phrasetable_path, - const std::string &basepath, int num_scores, int num_lex_scores, - bool log_prob, int max_cache_size, bool scfg) -{ - std::cerr << "Starting..." << std::endl; - - //Get basepath and create directory if missing - mkdir(basepath.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); - - StoreTarget storeTarget(basepath); - - //Get uniq lines: - unsigned long uniq_entries = countUniqueSource(phrasetable_path); - - //Source phrase vocabids - StoreVocab<uint64_t> sourceVocab(basepath + "/source_vocabids"); - - //Read the file - util::FilePiece filein(phrasetable_path.c_str()); - - //Init the probing hash table - size_t size = Table::Size(uniq_entries, 1.2); - char * mem = new char[size]; - memset(mem, 0, size); - Table sourceEntries(mem, size); - - std::priority_queue<CacheItem*, std::vector<CacheItem*>, CacheItemOrderer> cache; - float totalSourceCount = 0; - - //Keep track of the size of each group of target phrases - size_t line_num = 0; - - //Read everything and processs - std::string prevSource; - - Node sourcePhrases; - sourcePhrases.done = true; - sourcePhrases.key = 0; - - while (true) { - try { - //Process line read - line_text line; - line = splitLine(filein.ReadLine(), scfg); - //cerr << "line=" << line.source_phrase << endl; - - ++line_num; - if (line_num % 1000000 == 0) { - std::cerr << line_num << " " << std::flush; - } - - //Add source phrases to vocabularyIDs - add_to_map(sourceVocab, line.source_phrase); - - if (prevSource.empty()) { - // 1st line - prevSource = line.source_phrase.as_string(); - storeTarget.Append(line, log_prob, scfg); - } else if (prevSource == line.source_phrase) { - //If we still have the same line, just append to it: - storeTarget.Append(line, log_prob, scfg); - } else { - assert(prevSource != line.source_phrase); - - //Create a new entry even - - // save - uint64_t targetInd = storeTarget.Save(); - - // next line - storeTarget.Append(line, log_prob, scfg); - - //Create an entry for the previous source phrase: - Entry sourceEntry; - sourceEntry.value = targetInd; - //The key is the sum of hashes of individual words bitshifted by their position in the phrase. - //Probably not entirerly correct, but fast and seems to work fine in practise. - std::vector<uint64_t> vocabid_source = getVocabIDs(prevSource); - if (scfg) { - // storing prefixes? - sourcePhrases.Add(sourceEntries, vocabid_source); - } - sourceEntry.key = getKey(vocabid_source); - - /* - cerr << "prevSource=" << prevSource << flush - << " vocabids=" << Debug(vocabid_source) << flush - << " key=" << sourceEntry.key << endl; - */ - //Put into table - sourceEntries.Insert(sourceEntry); - - // update cache - CURRENT source phrase, not prev - if (max_cache_size) { - std::string countStr = line.counts.as_string(); - countStr = Trim(countStr); - if (!countStr.empty()) { - std::vector<float> toks = Tokenize<float>(countStr); - //cerr << "CACHE:" << line.source_phrase << " " << countStr << " " << toks[1] << endl; - - if (toks.size() >= 2) { - totalSourceCount += toks[1]; - - // compute key for CURRENT source - std::vector<uint64_t> currVocabidSource = getVocabIDs(line.source_phrase.as_string()); - uint64_t currKey = getKey(currVocabidSource); - - CacheItem *item = new CacheItem( - Trim(line.source_phrase.as_string()), - currKey, - toks[1]); - cache.push(item); - - if (max_cache_size > 0 && cache.size() > max_cache_size) { - cache.pop(); - } - } - } - } - - //Set prevLine - prevSource = line.source_phrase.as_string(); - } - - } catch (util::EndOfFileException e) { - std::cerr - << "Reading phrase table finished, writing remaining files to disk." - << std::endl; - - //After the final entry is constructed we need to add it to the phrase_table - //Create an entry for the previous source phrase: - uint64_t targetInd = storeTarget.Save(); - - Entry sourceEntry; - sourceEntry.value = targetInd; - - //The key is the sum of hashes of individual words. Probably not entirerly correct, but fast - std::vector<uint64_t> vocabid_source = getVocabIDs(prevSource); - sourceEntry.key = getKey(vocabid_source); - - //Put into table - sourceEntries.Insert(sourceEntry); - - break; - } - } - - sourcePhrases.Write(sourceEntries); - - storeTarget.SaveAlignment(); - - serialize_table(mem, size, (basepath + "/probing_hash.dat")); - - sourceVocab.Save(); - - serialize_cache(cache, (basepath + "/cache"), totalSourceCount); - - delete[] mem; - - //Write configfile - std::ofstream configfile; - configfile.open((basepath + "/config").c_str()); - configfile << "API_VERSION\t" << API_VERSION << '\n'; - configfile << "uniq_entries\t" << uniq_entries << '\n'; - configfile << "num_scores\t" << num_scores << '\n'; - configfile << "num_lex_scores\t" << num_lex_scores << '\n'; - configfile << "log_prob\t" << log_prob << '\n'; - configfile.close(); -} - -size_t countUniqueSource(const std::string &path) -{ - size_t ret = 0; - InputFileStream strme(path); - - std::string line, prevSource; - while (std::getline(strme, line)) { - std::vector<std::string> toks = TokenizeMultiCharSeparator(line, "|||"); - assert(toks.size() != 0); - - if (prevSource != toks[0]) { - prevSource = toks[0]; - ++ret; - } - } - - return ret; -} - -void serialize_cache( - std::priority_queue<CacheItem*, std::vector<CacheItem*>, CacheItemOrderer> &cache, - const std::string &path, float totalSourceCount) -{ - std::vector<const CacheItem*> vec(cache.size()); - - size_t ind = cache.size() - 1; - while (!cache.empty()) { - const CacheItem *item = cache.top(); - vec[ind] = item; - cache.pop(); - --ind; - } - - std::ofstream os(path.c_str()); - - os << totalSourceCount << std::endl; - for (size_t i = 0; i < vec.size(); ++i) { - const CacheItem *item = vec[i]; - os << item->count << "\t" << item->sourceKey << "\t" << item->source << std::endl; - delete item; - } - - os.close(); -} - -uint64_t getKey(const std::vector<uint64_t> &vocabid_source) -{ - return getKey(vocabid_source.data(), vocabid_source.size()); -} - -std::vector<uint64_t> CreatePrefix(const std::vector<uint64_t> &vocabid_source, size_t endPos) -{ - assert(endPos < vocabid_source.size()); - - std::vector<uint64_t> ret(endPos + 1); - for (size_t i = 0; i <= endPos; ++i) { - ret[i] = vocabid_source[i]; - } - return ret; -} - -} - diff --git a/moses/TranslationModel/ProbingPT/storing.hh b/moses/TranslationModel/ProbingPT/storing.hh deleted file mode 100644 index 994067515..000000000 --- a/moses/TranslationModel/ProbingPT/storing.hh +++ /dev/null @@ -1,92 +0,0 @@ -#pragma once - -#include <boost/unordered_set.hpp> -#include <boost/unordered_map.hpp> -#include <cstdio> -#include <sstream> -#include <fstream> -#include <iostream> -#include <string> -#include <queue> -#include <sys/stat.h> //mkdir - -#include "hash.hh" //Includes line_splitter -#include "probing_hash_utils.hh" - -#include "util/file_piece.hh" -#include "util/file.hh" -#include "vocabid.hh" - -namespace Moses -{ -typedef std::vector<uint64_t> SourcePhrase; - - -class Node -{ - typedef boost::unordered_map<uint64_t, Node> Children; - Children m_children; - -public: - uint64_t key; - bool done; - - Node() - :done(false) - {} - - void Add(Table &table, const SourcePhrase &sourcePhrase, size_t pos = 0); - void Write(Table &table); -}; - - -void createProbingPT(const std::string &phrasetable_path, - const std::string &basepath, int num_scores, int num_lex_scores, - bool log_prob, int max_cache_size, bool scfg); -uint64_t getKey(const std::vector<uint64_t> &source_phrase); - -std::vector<uint64_t> CreatePrefix(const std::vector<uint64_t> &vocabid_source, size_t endPos); - -template<typename T> -std::string Debug(const std::vector<T> &vec) -{ - std::stringstream strm; - for (size_t i = 0; i < vec.size(); ++i) { - strm << vec[i] << " "; - } - return strm.str(); -} - -size_t countUniqueSource(const std::string &path); - -class CacheItem -{ -public: - std::string source; - uint64_t sourceKey; - float count; - CacheItem(const std::string &vSource, uint64_t vSourceKey, float vCount) - :source(vSource) - ,sourceKey(vSourceKey) - ,count(vCount) { - } - - bool operator<(const CacheItem &other) const { - return count > other.count; - } -}; - -class CacheItemOrderer -{ -public: - bool operator()(const CacheItem* a, const CacheItem* b) const { - return (*a) < (*b); - } -}; - -void serialize_cache( - std::priority_queue<CacheItem*, std::vector<CacheItem*>, CacheItemOrderer> &cache, - const std::string &path, float totalSourceCount); - -} - diff --git a/moses/TranslationModel/ProbingPT/vocabid.cpp b/moses/TranslationModel/ProbingPT/vocabid.cpp deleted file mode 100644 index d6f442323..000000000 --- a/moses/TranslationModel/ProbingPT/vocabid.cpp +++ /dev/null @@ -1,59 +0,0 @@ -#include <boost/foreach.hpp> -#include "vocabid.hh" -#include "StoreVocab.h" -#include "moses/Util.h" - -namespace Moses -{ - -void add_to_map(StoreVocab<uint64_t> &sourceVocab, - const StringPiece &textin) -{ - //Tokenize - util::TokenIter<util::SingleCharacter> itWord(textin, util::SingleCharacter(' ')); - - while (itWord) { - StringPiece word = *itWord; - - util::TokenIter<util::SingleCharacter> itFactor(word, util::SingleCharacter('|')); - while (itFactor) { - StringPiece factor = *itFactor; - - sourceVocab.Insert(getHash(factor), factor.as_string()); - itFactor++; - } - itWord++; - } -} - -void serialize_map(const std::map<uint64_t, std::string> &karta, - const std::string &filename) -{ - std::ofstream os(filename.c_str()); - - std::map<uint64_t, std::string>::const_iterator iter; - for (iter = karta.begin(); iter != karta.end(); ++iter) { - os << iter->first << '\t' << iter->second << std::endl; - } - - os.close(); -} - -void read_map(std::map<uint64_t, std::string> &karta, const char* filename) -{ - std::ifstream is(filename); - - std::string line; - while (getline(is, line)) { - std::vector<std::string> toks = Tokenize(line, "\t"); - assert(toks.size() == 2); - uint64_t ind = Scan<uint64_t>(toks[1]); - karta[ind] = toks[0]; - } - - //Close the stream after we are done. - is.close(); -} - -} - diff --git a/moses/TranslationModel/ProbingPT/vocabid.hh b/moses/TranslationModel/ProbingPT/vocabid.hh deleted file mode 100644 index 7e1390874..000000000 --- a/moses/TranslationModel/ProbingPT/vocabid.hh +++ /dev/null @@ -1,29 +0,0 @@ -//Serialization -#include <boost/serialization/serialization.hpp> -#include <boost/serialization/map.hpp> -#include <boost/archive/text_iarchive.hpp> -#include <boost/archive/text_oarchive.hpp> -#include <fstream> -#include <iostream> -#include <vector> - -#include <map> //Container -#include "hash.hh" //Hash of elements - -#include "util/string_piece.hh" //Tokenization and work with StringPiece -#include "util/tokenize_piece.hh" - -namespace Moses -{ -template<typename VOCABID> -class StoreVocab; - -void add_to_map(StoreVocab<uint64_t> &sourceVocab, - const StringPiece &textin); - -void serialize_map(const std::map<uint64_t, std::string> &karta, - const std::string &filename); - -void read_map(std::map<uint64_t, std::string> &karta, const char* filename); - -} |