From 3a72b4958a3fc468b6bd6102e67e24007c9b2d9b Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 3 Oct 2016 19:02:06 +0100 Subject: update Moses::ProbingPT with Moses2::ProbingPT. Does not compile --- contrib/other-builds/moses/.project | 32 +- misc/CreateProbingPT.cpp | 108 ++++- misc/Jamfile | 4 +- misc/QueryProbingPT.cpp | 2 +- moses/TranslationModel/ProbingPT/ProbingPT.cpp | 93 +++-- moses/TranslationModel/ProbingPT/ProbingPT.h | 19 +- moses/TranslationModel/ProbingPT/hash.cpp | 36 +- moses/TranslationModel/ProbingPT/hash.hh | 7 +- moses/TranslationModel/ProbingPT/huffmanish.cpp | 451 --------------------- moses/TranslationModel/ProbingPT/huffmanish.hh | 112 ----- moses/TranslationModel/ProbingPT/line_splitter.cpp | 59 ++- moses/TranslationModel/ProbingPT/line_splitter.hh | 36 +- .../ProbingPT/probing_hash_utils.cpp | 28 +- .../ProbingPT/probing_hash_utils.hh | 38 +- moses/TranslationModel/ProbingPT/quering.cpp | 221 ++++------ moses/TranslationModel/ProbingPT/quering.hh | 62 ++- moses/TranslationModel/ProbingPT/storing.cpp | 322 +++++++++++---- moses/TranslationModel/ProbingPT/storing.hh | 91 ++++- moses/TranslationModel/ProbingPT/vocabid.cpp | 53 ++- moses/TranslationModel/ProbingPT/vocabid.hh | 15 +- 20 files changed, 837 insertions(+), 952 deletions(-) delete mode 100644 moses/TranslationModel/ProbingPT/huffmanish.cpp delete mode 100644 moses/TranslationModel/ProbingPT/huffmanish.hh diff --git a/contrib/other-builds/moses/.project b/contrib/other-builds/moses/.project index b59f28e08..c25eb5225 100644 --- a/contrib/other-builds/moses/.project +++ b/contrib/other-builds/moses/.project @@ -1319,7 +1319,7 @@ FF/PhraseBoundaryFeature.h 1 PARENT-3-PROJECT_LOC/moses/FF/PhraseBoundaryFeature.h - + FF/PhraseDistanceFeature.cpp 1 @@ -3341,24 +3341,34 @@ PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/ProbingPT.h - TranslationModel/ProbingPT/hash.cpp + TranslationModel/ProbingPT/StoreTarget.cpp 1 - PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/hash.cpp + PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/StoreTarget.cpp - TranslationModel/ProbingPT/hash.hh + TranslationModel/ProbingPT/StoreTarget.h 1 - PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/hash.hh + PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/StoreTarget.h - TranslationModel/ProbingPT/huffmanish.cpp + TranslationModel/ProbingPT/StoreVocab.cpp 1 - PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/huffmanish.cpp + PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/StoreVocab.cpp - TranslationModel/ProbingPT/huffmanish.hh + TranslationModel/ProbingPT/StoreVocab.h 1 - PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/huffmanish.hh + PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/StoreVocab.h + + + TranslationModel/ProbingPT/hash.cpp + 1 + PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/hash.cpp + + + TranslationModel/ProbingPT/hash.hh + 1 + PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/hash.hh TranslationModel/ProbingPT/line_splitter.cpp @@ -3664,7 +3674,7 @@ TranslationModel/UG/sapt_pscore_coherence.h 1 PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_coherence.h - + TranslationModel/UG/sapt_pscore_lex1.h 1 @@ -3709,7 +3719,7 @@ TranslationModel/UG/sapt_pscore_wordcount.h 1 PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_wordcount.h - + TranslationModel/UG/sim-pe.cc 1 diff --git a/misc/CreateProbingPT.cpp b/misc/CreateProbingPT.cpp index b23427f30..dff916660 100644 --- a/misc/CreateProbingPT.cpp +++ b/misc/CreateProbingPT.cpp @@ -1,29 +1,113 @@ 
+#include <string>
+#include <boost/program_options.hpp>
 #include "util/usage.hh"
 #include "moses/TranslationModel/ProbingPT/storing.hh"
+#include "moses/InputFileStream.h"
+#include "moses/OutputFileStream.h"
+#include "moses/Util.h"
+
+using namespace std;
+
+std::string ReformatSCFGFile(const std::string &path);
 
 int main(int argc, char* argv[])
 {
+  string inPath, outPath;
+  int num_scores = 4;
+  int num_lex_scores = 0;
+  bool log_prob = false;
+  bool scfg = false;
+  int max_cache_size = 50000;
 
-  const char * is_reordering = "false";
+  namespace po = boost::program_options;
+  po::options_description desc("Options");
+  desc.add_options()
+  ("help", "Print help messages")
+  ("input-pt", po::value<string>()->required(), "Text pt")
+  ("output-dir", po::value<string>()->required(), "Directory where binary files will be written")
+  ("num-scores", po::value<int>()->default_value(num_scores), "Number of pt scores")
+  ("num-lex-scores", po::value<int>()->default_value(num_lex_scores), "Number of lexicalized reordering scores")
+  ("log-prob", "log (and floor) probabilities before storing")
+  ("max-cache-size", po::value<int>()->default_value(max_cache_size), "Maximum number of high-count source lines to write to cache file. 0=no cache, negative=no limit")
+  ("scfg", "Rules are SCFG in Moses format (i.e. with non-terms and LHS)")
 
-  if (!(argc == 5 || argc == 4)) {
-    // Tell the user how to run the program
-    std::cerr << "Provided " << argc << " arguments, needed 4 or 5." << std::endl;
-    std::cerr << "Usage: " << argv[0] << " path_to_phrasetable output_dir num_scores is_reordering" << std::endl;
-    std::cerr << "is_reordering should be either true or false, but it is currently a stub feature." << std::endl;
-    //std::cerr << "Usage: " << argv[0] << " path_to_phrasetable number_of_uniq_lines output_bin_file output_hash_table output_vocab_id" << std::endl;
-    return 1;
+  ;
+
+  po::variables_map vm;
+  try {
+    po::store(po::parse_command_line(argc, argv, desc),
+              vm); // can throw
+
+    /** --help option
+     */
+    if ( vm.count("help")) {
+      std::cout << desc << std::endl;
+      return EXIT_SUCCESS;
+    }
+
+    po::notify(vm); // throws on error, so do after help in case
+    // there are any problems
+  } catch(po::error& e) {
+    std::cerr << "ERROR: " << e.what() << std::endl << std::endl;
+    std::cerr << desc << std::endl;
+    return EXIT_FAILURE;
   }
 
-  if (argc == 5) {
-    is_reordering = argv[4];
+  if (vm.count("input-pt")) inPath = vm["input-pt"].as<string>();
+  if (vm.count("output-dir")) outPath = vm["output-dir"].as<string>();
+  if (vm.count("num-scores")) num_scores = vm["num-scores"].as<int>();
+  if (vm.count("num-lex-scores")) num_lex_scores = vm["num-lex-scores"].as<int>();
+  if (vm.count("max-cache-size")) max_cache_size = vm["max-cache-size"].as<int>();
+  if (vm.count("log-prob")) log_prob = true;
+  if (vm.count("scfg")) scfg = true;
+
+
+  if (scfg) {
+    inPath = ReformatSCFGFile(inPath);
   }
 
-  createProbingPT(argv[1], argv[2], argv[3], is_reordering);
+  Moses::createProbingPT(inPath, outPath, num_scores, num_lex_scores, log_prob, max_cache_size, scfg);
 
-  util::PrintUsage(std::cout);
+  //util::PrintUsage(std::cout);
 
   return 0;
 }
 
+std::string ReformatSCFGFile(const std::string &path)
+{
+  Moses::InputFileStream inFile(path);
+  string reformattedPath = path + ".reformat.gz";
+  Moses::OutputFileStream outFile(reformattedPath);
+
+  string line;
+  while (getline(inFile, line)) {
+    vector<string> toks = Moses::TokenizeMultiCharSeparator(line, "|||");
+    assert(toks.size() >= 3);
+
+    // source
+    vector<string> sourceToks = Moses::Tokenize(toks[0], " ");
+    for (size_t i = 0; i < sourceToks.size() - 1; ++i) {
+      outFile << sourceToks[i] << " ";
+    }
+
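// [Editor's note, illustrative; not part of the patch] Assuming the usual Moses
// SCFG rule-table format, where the final source token is the rule's LHS
// non-terminal, the loop above copies every source token except that LHS.
// For example (rule invented):
//   in : der [X][X] haus [X] ||| the [X][X] house [X] ||| 0.6 0.7 ||| 0-0 2-2
//   out: der [X][X] haus ||| the [X][X] house [X] ||| 0.6 0.7 ||| 0-0 2-2
// so the LC_ALL=C sort below groups rules that share a source right-hand side
// into consecutive lines, which createProbingPT then reads sequentially.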
+    // other columns
+    for (size_t i = 1; i < toks.size(); ++i) {
+      outFile << "|||" << toks[i];
+    }
+    outFile << endl;
+  }
+
+  inFile.Close();
+  outFile.Close();
+
+  string sortedPath = path + ".reformat.sorted.gz";
+  string tmpPath = path + ".tmp ";
+  string cmd = "mkdir " + tmpPath
+               + " && gzip -dc " + reformattedPath + " | LC_ALL=C sort -T " + tmpPath + " | gzip -c > " + sortedPath;
+  system(cmd.c_str());
+
+  cmd = "rm -rf " + tmpPath + " " + reformattedPath;
+  system(cmd.c_str());
+
+  return sortedPath;
+}
+
diff --git a/misc/Jamfile b/misc/Jamfile
index f1599aca8..135490a46 100644
--- a/misc/Jamfile
+++ b/misc/Jamfile
@@ -31,9 +31,9 @@ else {
 }
 
 exe CreateProbingPT : CreateProbingPT.cpp ..//boost_filesystem ../moses//moses ;
-exe QueryProbingPT : QueryProbingPT.cpp ..//boost_filesystem ../moses//moses ;
+#exe QueryProbingPT : QueryProbingPT.cpp ..//boost_filesystem ../moses//moses ;
 
-alias programsProbing : CreateProbingPT QueryProbingPT ;
+alias programsProbing : CreateProbingPT ; #QueryProbingPT
 
 exe merge-sorted : merge-sorted.cc
diff --git a/misc/QueryProbingPT.cpp b/misc/QueryProbingPT.cpp
index 72fd0be11..5047d4d47 100644
--- a/misc/QueryProbingPT.cpp
+++ b/misc/QueryProbingPT.cpp
@@ -34,7 +34,7 @@ int main(int argc, char* argv[])
     return 1;
   }
 
-  QueryEngine queries(argv[1]);
+  Moses::QueryEngine queries(argv[1]);
 
   //Interactive search
   std::cout << "Please enter a string to be searched, or exit to exit." << std::endl;
diff --git a/moses/TranslationModel/ProbingPT/ProbingPT.cpp b/moses/TranslationModel/ProbingPT/ProbingPT.cpp
index cbfd2c1a4..bb3f26e22 100644
--- a/moses/TranslationModel/ProbingPT/ProbingPT.cpp
+++ b/moses/TranslationModel/ProbingPT/ProbingPT.cpp
@@ -3,6 +3,7 @@
 #include "moses/StaticData.h"
 #include "moses/FactorCollection.h"
 #include "moses/TargetPhraseCollection.h"
+#include "moses/InputFileStream.h"
 #include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.h"
 #include "quering.hh"
 
@@ -34,44 +35,94 @@ void ProbingPT::Load(AllOptions::ptr const& opts)
 
   m_unkId = 456456546456;
 
+  FactorCollection &vocab = FactorCollection::Instance();
+
   // source vocab
-  const std::map<uint64_t, std::string> &sourceVocab = m_engine->getSourceVocab();
+  const std::map<uint64_t, std::string> &sourceVocab =
+    m_engine->getSourceVocab();
   std::map<uint64_t, std::string>::const_iterator iterSource;
-  for (iterSource = sourceVocab.begin(); iterSource != sourceVocab.end(); ++iterSource) {
-    const string &wordStr = iterSource->second;
-    const Factor *factor = FactorCollection::Instance().AddFactor(wordStr);
+  for (iterSource = sourceVocab.begin(); iterSource != sourceVocab.end();
+       ++iterSource) {
+    string wordStr = iterSource->second;
+    //cerr << "wordStr=" << wordStr << endl;
 
-    uint64_t probingId = iterSource->first;
+    const Factor *factor = vocab.AddFactor(wordStr);
 
-    SourceVocabMap::value_type entry(factor, probingId);
-    m_sourceVocabMap.insert(entry);
+    uint64_t probingId = iterSource->first;
+    size_t factorId = factor->GetId();
+    if (factorId >= m_sourceVocab.size()) {
+      m_sourceVocab.resize(factorId + 1, m_unkId);
+    }
+    m_sourceVocab[factorId] = probingId;
   }
 
   // target vocab
-  const std::map<unsigned int, std::string> &probingVocab = m_engine->getVocab();
-  std::map<unsigned int, std::string>::const_iterator iter;
-  for (iter = probingVocab.begin(); iter != probingVocab.end(); ++iter) {
-    const string &wordStr = iter->second;
-    const Factor *factor = FactorCollection::Instance().AddFactor(wordStr);
+  InputFileStream targetVocabStrme(m_filePath + "/TargetVocab.dat");
+  string line;
+  while (getline(targetVocabStrme, line)) {
+    vector<string> toks = Tokenize(line, "\t");
+    UTIL_THROW_IF2(toks.size() != 2, string("Incorrect format:") + line + "\n");
+
+    //cerr << "wordStr=" << toks[0] << endl;
+
+    const Factor *factor = vocab.AddFactor(toks[0]);
+    uint32_t probingId = Scan<uint32_t>(toks[1]);
+
+    if (probingId >= m_targetVocab.size()) {
+      m_targetVocab.resize(probingId + 1);
+    }
+
+    m_targetVocab[probingId] = factor;
+  }
+
+  // alignments
+  CreateAlignmentMap(m_filePath + "/Alignments.dat");
 
-    unsigned int probingId = iter->first;
+  // memory mapped file to tps
+  string filePath = m_filePath + "/TargetColl.dat";
+  file.open(filePath.c_str());
+  if (!file.is_open()) {
+    throw "Couldn't open file ";
+  }
+
+  data = file.data();
+  //size_t size = file.size();
+
+  // cache
+  //CreateCache(system);
 
-    TargetVocabMap::value_type entry(factor, probingId);
-    m_vocabMap.insert(entry);
+}
+
+void ProbingPT::CreateAlignmentMap(const std::string path)
+{
+  const std::vector< std::vector<unsigned char> > &probingAlignColl = m_engine->getAlignments();
+  m_aligns.resize(probingAlignColl.size(), NULL);
+
+  for (size_t i = 0; i < probingAlignColl.size(); ++i) {
+    AlignmentInfo::CollType aligns;
+
+    const std::vector<unsigned char> &probingAligns = probingAlignColl[i];
+    for (size_t j = 0; j < probingAligns.size(); j += 2) {
+      size_t startPos = probingAligns[j];
+      size_t endPos = probingAligns[j+1];
+      //cerr << "startPos=" << startPos << " " << endPos << endl;
+      aligns.insert(std::pair<size_t, size_t>(startPos, endPos));
+    }
+
+    const AlignmentInfo *align = AlignmentInfoCollection::Instance().Add(aligns);
+    m_aligns[i] = align;
+    //cerr << "align=" << align->Debug(system) << endl;
   }
 }
 
 void ProbingPT::InitializeForInput(ttasksptr const& ttask)
 {
-  ReduceCache();
+
 }
 
 void ProbingPT::GetTargetPhraseCollectionBatch(const InputPathList &inputPathQueue) const
 {
-  CacheColl &cache = GetCache();
-
   InputPathList::const_iterator iter;
   for (iter = inputPathQueue.begin(); iter != inputPathQueue.end(); ++iter) {
     InputPath &inputPath = **iter;
@@ -82,12 +133,6 @@ void ProbingPT::GetTargetPhraseCollectionBatch(const InputPathList &inputPathQue
     }
 
     TargetPhraseCollection::shared_ptr tpColl = CreateTargetPhrase(sourcePhrase);
-
-    // add target phrase to phrase-table cache
-    size_t hash = hash_value(sourcePhrase);
-    std::pair<TargetPhraseCollection::shared_ptr, clock_t> value(tpColl, clock());
-    cache[hash] = value;
-
     inputPath.SetTargetPhrases(*this, tpColl, NULL);
   }
 }
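// [Editor's sketch, illustrative; member names are from the patch, the word and
// the id 42 are invented] Load() above replaces the old bimaps with two plain
// vectors that are indexed directly at query time:
//   const Factor *f = FactorCollection::Instance().AddFactor("hello");
//   uint64_t ptId = m_sourceVocab[f->GetId()];  // Moses factor id -> probing id
//   const Factor *g = m_targetVocab[42];        // probing target id -> factor
// Source factor ids never seen in the phrase table map to m_unkId.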
diff --git a/moses/TranslationModel/ProbingPT/ProbingPT.h b/moses/TranslationModel/ProbingPT/ProbingPT.h
index 4e7ab02c6..3b5dfc895 100644
--- a/moses/TranslationModel/ProbingPT/ProbingPT.h
+++ b/moses/TranslationModel/ProbingPT/ProbingPT.h
@@ -1,17 +1,17 @@
 #pragma once
 
-
+#include <boost/iostreams/device/mapped_file.hpp>
 #include "../PhraseDictionary.h"
 
-class QueryEngine;
-class target_text;
 
 namespace Moses
 {
 
 class ChartParser;
 class ChartCellCollectionBase;
 class ChartRuleLookupManager;
+class QueryEngine;
+class target_text;
 
 class ProbingPT : public PhraseDictionary
 {
@@ -39,12 +39,16 @@ public:
 
 protected:
   QueryEngine *m_engine;
+  uint64_t m_unkId;
 
-  typedef boost::bimap<const Factor *, uint64_t> SourceVocabMap;
-  mutable SourceVocabMap m_sourceVocabMap;
+  std::vector<uint64_t> m_sourceVocab; // factor id -> pt id
+  std::vector<const Factor*> m_targetVocab; // pt id -> factor*
+  std::vector<const AlignmentInfo*> m_aligns;
 
-  typedef boost::bimap<const Factor *, unsigned int> TargetVocabMap;
-  mutable TargetVocabMap m_vocabMap;
+  boost::iostreams::mapped_file_source file;
+  const char *data;
+
+  void CreateAlignmentMap(const std::string path);
 
   TargetPhraseCollection::shared_ptr CreateTargetPhrase(const Phrase &sourcePhrase) const;
   TargetPhrase *CreateTargetPhrase(const Phrase &sourcePhrase, const target_text &probingTargetPhrase) const;
@@ -53,7 +57,6 @@ protected:
   std::vector<uint64_t> ConvertToProbingSourcePhrase(const Phrase &sourcePhrase, bool &ok) const;
 
-  uint64_t m_unkId;
 };
 
 } // namespace Moses
diff --git a/moses/TranslationModel/ProbingPT/hash.cpp b/moses/TranslationModel/ProbingPT/hash.cpp
index 8945649ef..27a64b129 100644
--- a/moses/TranslationModel/ProbingPT/hash.cpp
+++ b/moses/TranslationModel/ProbingPT/hash.cpp
@@ -1,5 +1,11 @@
+#include <iostream>
 #include "hash.hh"
 
+using namespace std;
+
+namespace Moses
+{
+
 uint64_t getHash(StringPiece text)
 {
   std::size_t len = text.size();
@@ -7,24 +13,32 @@ uint64_t getHash(StringPiece text)
   return key;
 }
 
-std::vector<uint64_t> getVocabIDs(StringPiece textin)
+std::vector<uint64_t> getVocabIDs(const StringPiece &textin)
 {
   //Tokenize
   std::vector<uint64_t> output;
 
-  util::TokenIter<util::SingleCharacter> it(textin, util::SingleCharacter(' '));
+  util::TokenIter<util::SingleCharacter> itWord(textin, util::SingleCharacter(' '));
+
+  while (itWord) {
+    StringPiece word = *itWord;
+    uint64_t id = 0;
+
+    util::TokenIter<util::SingleCharacter> itFactor(word, util::SingleCharacter('|'));
+    while (itFactor) {
+      StringPiece factor = *itFactor;
+      //cerr << "factor=" << factor << endl;
 
-  while(it) {
-    output.push_back(getHash(*it));
-    it++;
+      id += getHash(factor);
+      itFactor++;
+    }
+
+    output.push_back(id);
+    itWord++;
   }
 
   return output;
 }
 
-uint64_t getVocabID(std::string candidate)
-{
-  std::size_t len = candidate.length();
-  uint64_t key = util::MurmurHashNative(candidate.c_str(), len);
-  return key;
-}
\ No newline at end of file
+}
+
diff --git a/moses/TranslationModel/ProbingPT/hash.hh b/moses/TranslationModel/ProbingPT/hash.hh
index 607238ae1..f218ad9da 100644
--- a/moses/TranslationModel/ProbingPT/hash.hh
+++ b/moses/TranslationModel/ProbingPT/hash.hh
@@ -6,9 +6,12 @@
 #include "util/tokenize_piece.hh"
 #include <vector>
 
+namespace Moses
+{
+
 //Gets the MurmurHash for a given string
 uint64_t getHash(StringPiece text);
 
-std::vector<uint64_t> getVocabIDs(StringPiece textin);
+std::vector<uint64_t> getVocabIDs(const StringPiece &textin);
 
-uint64_t getVocabID(std::string candidate);
\ No newline at end of file
+}
diff --git a/moses/TranslationModel/ProbingPT/huffmanish.cpp b/moses/TranslationModel/ProbingPT/huffmanish.cpp
deleted file mode 100644
index 534fd04d1..000000000
--- a/moses/TranslationModel/ProbingPT/huffmanish.cpp
+++ /dev/null
@@ -1,451 +0,0 @@
-#include "huffmanish.hh"
-
-Huffman::Huffman (const char * filepath)
-{
-  //Read the file
-  util::FilePiece filein(filepath);
-
-  //Init uniq_lines to zero;
-  uniq_lines = 0;
-
-  line_text prev_line; //Check for unique lines.
-  int num_lines = 0 ;
-
-  while (true) {
-    line_text new_line;
-
-    num_lines++;
-
-    try {
-      //Process line read
-      new_line = splitLine(filein.ReadLine());
-      count_elements(new_line); //Counts the number of elements, adds new and increments counters.
-
-    } catch (util::EndOfFileException e) {
-      std::cerr << "Unique entries counted: ";
-      break;
-    }
-
-    if (new_line.source_phrase == prev_line.source_phrase) {
-      continue;
-    } else {
-      uniq_lines++;
-      prev_line = new_line;
-    }
-  }
-
-  std::cerr << uniq_lines << std::endl;
-}
-
-void Huffman::count_elements(line_text linein)
-{
-  //For target phrase:
-  util::TokenIter<util::SingleCharacter> it(linein.target_phrase, util::SingleCharacter(' '));
-  while (it) {
-    //Check if we have that entry
-    std::map<std::string, unsigned int>::iterator mapiter;
-    mapiter = target_phrase_words.find(it->as_string());
-
-    if (mapiter != target_phrase_words.end()) {
-      //If the element is found, increment the count.
- mapiter->second++; - } else { - //Else create a new entry; - target_phrase_words.insert(std::pair(it->as_string(), 1)); - } - it++; - } - - //For word allignment 1 - std::map, unsigned int>::iterator mapiter3; - std::vector numbers = splitWordAll1(linein.word_align); - mapiter3 = word_all1.find(numbers); - - if (mapiter3 != word_all1.end()) { - //If the element is found, increment the count. - mapiter3->second++; - } else { - //Else create a new entry; - word_all1.insert(std::pair, unsigned int>(numbers, 1)); - } - -} - -//Assigns huffman values for each unique element -void Huffman::assign_values() -{ - //First create vectors for all maps so that we could sort them later. - - //Create a vector for target phrases - for(std::map::iterator it = target_phrase_words.begin(); it != target_phrase_words.end(); it++ ) { - target_phrase_words_counts.push_back(*it); - } - //Sort it - std::sort(target_phrase_words_counts.begin(), target_phrase_words_counts.end(), sort_pair()); - - //Create a vector for word allignments 1 - for(std::map, unsigned int>::iterator it = word_all1.begin(); it != word_all1.end(); it++ ) { - word_all1_counts.push_back(*it); - } - //Sort it - std::sort(word_all1_counts.begin(), word_all1_counts.end(), sort_pair_vec()); - - - //Afterwards we assign a value for each phrase, starting from 1, as zero is reserved for delimiter - unsigned int i = 1; //huffman code - for(std::vector >::iterator it = target_phrase_words_counts.begin(); - it != target_phrase_words_counts.end(); it++) { - target_phrase_huffman.insert(std::pair(it->first, i)); - i++; //Go to the next huffman code - } - - i = 1; //Reset i for the next map - for(std::vector, unsigned int> >::iterator it = word_all1_counts.begin(); - it != word_all1_counts.end(); it++) { - word_all1_huffman.insert(std::pair, unsigned int>(it->first, i)); - i++; //Go to the next huffman code - } - - //After lookups are produced, clear some memory usage of objects not needed anymore. - target_phrase_words.clear(); - word_all1.clear(); - - target_phrase_words_counts.clear(); - word_all1_counts.clear(); - - std::cerr << "Finished generating huffman codes." << std::endl; - -} - -void Huffman::serialize_maps(const char * dirname) -{ - //Note that directory name should exist. - std::string basedir(dirname); - std::string target_phrase_path(basedir + "/target_phrases"); - std::string probabilities_path(basedir + "/probs"); - std::string word_all1_path(basedir + "/Wall1"); - - //Target phrase - std::ofstream os (target_phrase_path.c_str(), std::ios::binary); - boost::archive::text_oarchive oarch(os); - oarch << lookup_target_phrase; - os.close(); - - //Word all1 - std::ofstream os2 (word_all1_path.c_str(), std::ios::binary); - boost::archive::text_oarchive oarch2(os2); - oarch2 << lookup_word_all1; - os2.close(); -} - -std::vector Huffman::full_encode_line(line_text line) -{ - return vbyte_encode_line((encode_line(line))); -} - -std::vector Huffman::encode_line(line_text line) -{ - std::vector retvector; - - //Get target_phrase first. - util::TokenIter it(line.target_phrase, util::SingleCharacter(' ')); - while (it) { - retvector.push_back(target_phrase_huffman.find(it->as_string())->second); - it++; - } - //Add a zero; - retvector.push_back(0); - - //Get probabilities. Reinterpreting the float bytes as unsgined int. 
- util::TokenIter probit(line.prob, util::SingleCharacter(' ')); - while (probit) { - //Sometimes we have too big floats to handle, so first convert to double - double tempnum = atof(probit->data()); - float num = (float)tempnum; - retvector.push_back(reinterpret_float(&num)); - probit++; - } - //Add a zero; - retvector.push_back(0); - - - //Get Word allignments - retvector.push_back(word_all1_huffman.find(splitWordAll1(line.word_align))->second); - retvector.push_back(0); - - return retvector; -} - -void Huffman::produce_lookups() -{ - //basically invert every map that we have - for(std::map::iterator it = target_phrase_huffman.begin(); it != target_phrase_huffman.end(); it++ ) { - lookup_target_phrase.insert(std::pair(it->second, it->first)); - } - - for(std::map, unsigned int>::iterator it = word_all1_huffman.begin(); it != word_all1_huffman.end(); it++ ) { - lookup_word_all1.insert(std::pair >(it->second, it->first)); - } - -} - -HuffmanDecoder::HuffmanDecoder (const char * dirname) -{ - //Read the maps from disk - - //Note that directory name should exist. - std::string basedir(dirname); - std::string target_phrase_path(basedir + "/target_phrases"); - std::string word_all1_path(basedir + "/Wall1"); - - //Target phrases - std::ifstream is (target_phrase_path.c_str(), std::ios::binary); - boost::archive::text_iarchive iarch(is); - iarch >> lookup_target_phrase; - is.close(); - - //Word allignment 1 - std::ifstream is2 (word_all1_path.c_str(), std::ios::binary); - boost::archive::text_iarchive iarch2(is2); - iarch2 >> lookup_word_all1; - is2.close(); - -} - -HuffmanDecoder::HuffmanDecoder (std::map * lookup_target, - std::map > * lookup_word1) -{ - lookup_target_phrase = *lookup_target; - lookup_word_all1 = *lookup_word1; -} - -std::vector HuffmanDecoder::full_decode_line (std::vector lines, int num_scores) -{ - std::vector retvector; //All target phrases - std::vector decoded_lines = vbyte_decode_line(lines); //All decoded lines - std::vector::iterator it = decoded_lines.begin(); //Iterator for them - std::vector current_target_phrase; //Current target phrase decoded - - short zero_count = 0; //Count home many zeroes we have met. so far. Every 3 zeroes mean a new target phrase. - while(it != decoded_lines.end()) { - if (zero_count == 1) { - //We are extracting scores. we know how many scores there are so we can push them - //to the vector. This is done in case any of the scores is 0, because it would mess - //up the state machine. - for (int i = 0; i < num_scores; i++) { - current_target_phrase.push_back(*it); - it++; - } - } - - if (zero_count == 3) { - //We have finished with this entry, decode it, and add it to the retvector. - retvector.push_back(decode_line(current_target_phrase, num_scores)); - current_target_phrase.clear(); //Clear the current target phrase and the zero_count - zero_count = 0; //So that we can reuse them for the next target phrase - } - //Add to the next target_phrase, number by number. - current_target_phrase.push_back(*it); - if (*it == 0) { - zero_count++; - } - it++; //Go to the next word/symbol - } - //Don't forget the last remaining line! - if (zero_count == 3) { - //We have finished with this entry, decode it, and add it to the retvector. 
- retvector.push_back(decode_line(current_target_phrase, num_scores)); - current_target_phrase.clear(); //Clear the current target phrase and the zero_count - zero_count = 0; //So that we can reuse them for the next target phrase - } - - return retvector; - -} - -target_text HuffmanDecoder::decode_line (std::vector input, int num_scores) -{ - //demo decoder - target_text ret; - //Split everything - std::vector target_phrase; - std::vector probs; - unsigned int wAll; - - //Split the line into the proper arrays - short num_zeroes = 0; - int counter = 0; - while (num_zeroes < 3) { - unsigned int num = input[counter]; - if (num == 0) { - num_zeroes++; - } else if (num_zeroes == 0) { - target_phrase.push_back(num); - } else if (num_zeroes == 1) { - //Push exactly num_scores scores - for (int i = 0; i < num_scores; i++) { - probs.push_back(num); - counter++; - num = input[counter]; - } - continue; - } else if (num_zeroes == 2) { - wAll = num; - } - counter++; - } - - ret.target_phrase = target_phrase; - ret.word_all1 = lookup_word_all1.find(wAll)->second; - - //Decode probabilities - for (std::vector::iterator it = probs.begin(); it != probs.end(); it++) { - ret.prob.push_back(reinterpret_uint(&(*it))); - } - - return ret; - -} - -inline std::string HuffmanDecoder::getTargetWordFromID(unsigned int id) -{ - return lookup_target_phrase.find(id)->second; -} - -std::string HuffmanDecoder::getTargetWordsFromIDs(std::vector ids) -{ - std::string returnstring; - for (std::vector::iterator it = ids.begin(); it != ids.end(); it++) { - returnstring.append(getTargetWordFromID(*it) + " "); - } - - return returnstring; -} - -inline std::string getTargetWordFromID(unsigned int id, std::map * lookup_target_phrase) -{ - return lookup_target_phrase->find(id)->second; -} - -std::string getTargetWordsFromIDs(std::vector ids, std::map * lookup_target_phrase) -{ - std::string returnstring; - for (std::vector::iterator it = ids.begin(); it != ids.end(); it++) { - returnstring.append(getTargetWordFromID(*it, lookup_target_phrase) + " "); - } - - return returnstring; -} - -/*Those functions are used to more easily store the floats in the binary phrase table - We convert the float unsinged int so that it is the same as our other values and we can - apply variable byte encoding on top of it.*/ - -inline unsigned int reinterpret_float(float * num) -{ - unsigned int * converted_num; - converted_num = reinterpret_cast(num); - return *converted_num; -} - -inline float reinterpret_uint(unsigned int * num) -{ - float * converted_num; - converted_num = reinterpret_cast(num); - return *converted_num; -} - -/*Mostly taken from stackoverflow, http://stackoverflow.com/questions/5858646/optimizing-variable-length-encoding -and modified in order to return a vector of chars. Implements ULEB128 or variable byte encoding. -This is highly optimized version with unrolled loop */ -inline std::vector vbyte_encode(unsigned int num) -{ - //Determine how many bytes we are going to take. - short size; - std::vector byte_vector; - - if (num < 0x00000080U) { - size = 1; - byte_vector.reserve(size); - goto b1; - } - if (num < 0x00004000U) { - size = 2; - byte_vector.reserve(size); - goto b2; - } - if (num < 0x00200000U) { - size = 3; - byte_vector.reserve(size); - goto b3; - } - if (num < 0x10000000U) { - size = 4; - byte_vector.reserve(size); - goto b4; - } - size = 5; - byte_vector.reserve(size); - - - //Now proceed with the encoding. 
- byte_vector.push_back((num & 0x7f) | 0x80); - num >>= 7; -b4: - byte_vector.push_back((num & 0x7f) | 0x80); - num >>= 7; -b3: - byte_vector.push_back((num & 0x7f) | 0x80); - num >>= 7; -b2: - byte_vector.push_back((num & 0x7f) | 0x80); - num >>= 7; -b1: - byte_vector.push_back(num); - - return byte_vector; -} - -std::vector vbyte_decode_line(std::vector line) -{ - std::vector huffman_line; - std::vector current_num; - - for (std::vector::iterator it = line.begin(); it != line.end(); it++) { - current_num.push_back(*it); - if ((*it >> 7) != 1) { - //We don't have continuation in the next bit - huffman_line.push_back(bytes_to_int(current_num)); - current_num.clear(); - } - } - return huffman_line; -} - -inline unsigned int bytes_to_int(std::vector number) -{ - unsigned int retvalue = 0; - std::vector::iterator it = number.begin(); - unsigned char shift = 0; //By how many bits to shift - - while (it != number.end()) { - retvalue |= (*it & 0x7f) << shift; - shift += 7; - it++; - } - - return retvalue; -} - -std::vector vbyte_encode_line(std::vector line) -{ - std::vector retvec; - - //For each unsigned int in the line, vbyte encode it and add it to a vector of unsigned chars. - for (std::vector::iterator it = line.begin(); it != line.end(); it++) { - std::vector vbyte_encoded = vbyte_encode(*it); - retvec.insert(retvec.end(), vbyte_encoded.begin(), vbyte_encoded.end()); - } - - return retvec; -} diff --git a/moses/TranslationModel/ProbingPT/huffmanish.hh b/moses/TranslationModel/ProbingPT/huffmanish.hh deleted file mode 100644 index 0970a9e68..000000000 --- a/moses/TranslationModel/ProbingPT/huffmanish.hh +++ /dev/null @@ -1,112 +0,0 @@ -#pragma once - -//Huffman encodes a line and also produces the vocabulary ids -#include "hash.hh" -#include "line_splitter.hh" -#include -#include -#include -#include -#include -#include -#include -#include -#include - -//Sorting for the second -struct sort_pair { - bool operator()(const std::pair &left, const std::pair &right) { - return left.second > right.second; //This puts biggest numbers first. - } -}; - -struct sort_pair_vec { - bool operator()(const std::pair, unsigned int> &left, const std::pair, unsigned int> &right) { - return left.second > right.second; //This puts biggest numbers first. - } -}; - -class Huffman -{ - unsigned long uniq_lines; //Unique lines in the file. 
- - //Containers used when counting the occurence of a given phrase - std::map target_phrase_words; - std::map, unsigned int> word_all1; - - //Same containers as vectors, for sorting - std::vector > target_phrase_words_counts; - std::vector, unsigned int> > word_all1_counts; - - //Huffman maps - std::map target_phrase_huffman; - std::map, unsigned int> word_all1_huffman; - - //inverted maps - std::map lookup_target_phrase; - std::map > lookup_word_all1; - -public: - Huffman (const char *); - void count_elements (line_text line); - void assign_values(); - void serialize_maps(const char * dirname); - void produce_lookups(); - - std::vector encode_line(line_text line); - - //encode line + variable byte ontop - std::vector full_encode_line(line_text line); - - //Getters - const std::map get_target_lookup_map() const { - return lookup_target_phrase; - } - const std::map > get_word_all1_lookup_map() const { - return lookup_word_all1; - } - - unsigned long getUniqLines() { - return uniq_lines; - } -}; - -class HuffmanDecoder -{ - std::map lookup_target_phrase; - std::map > lookup_word_all1; - -public: - HuffmanDecoder (const char *); - HuffmanDecoder (std::map *, std::map > *); - - //Getters - const std::map get_target_lookup_map() const { - return lookup_target_phrase; - } - const std::map > get_word_all1_lookup_map() const { - return lookup_word_all1; - } - - inline std::string getTargetWordFromID(unsigned int id); - - std::string getTargetWordsFromIDs(std::vector ids); - - target_text decode_line (std::vector input, int num_scores); - - //Variable byte decodes a all target phrases contained here and then passes them to decode_line - std::vector full_decode_line (std::vector lines, int num_scores); -}; - -std::string getTargetWordsFromIDs(std::vector ids, std::map * lookup_target_phrase); - -inline std::string getTargetWordFromID(unsigned int id, std::map * lookup_target_phrase); - -inline unsigned int reinterpret_float(float * num); - -inline float reinterpret_uint(unsigned int * num); - -std::vector vbyte_encode_line(std::vector line); -inline std::vector vbyte_encode(unsigned int num); -std::vector vbyte_decode_line(std::vector line); -inline unsigned int bytes_to_int(std::vector number); diff --git a/moses/TranslationModel/ProbingPT/line_splitter.cpp b/moses/TranslationModel/ProbingPT/line_splitter.cpp index 1eeeb1899..cb9e47fec 100644 --- a/moses/TranslationModel/ProbingPT/line_splitter.cpp +++ b/moses/TranslationModel/ProbingPT/line_splitter.cpp @@ -1,66 +1,92 @@ #include "line_splitter.hh" -line_text splitLine(StringPiece textin) +namespace Moses { - const char delim[] = " ||| "; + +line_text splitLine(const StringPiece &textin, bool scfg) +{ + const char delim[] = "|||"; line_text output; //Tokenize util::TokenIter it(textin, util::MultiCharacter(delim)); //Get source phrase - output.source_phrase = *it; + output.source_phrase = Trim(*it); + //std::cerr << "output.source_phrase=" << output.source_phrase << "AAAA" << std::endl; //Get target_phrase it++; - output.target_phrase = *it; + output.target_phrase = Trim(*it); + //std::cerr << "output.target_phrase=" << output.target_phrase << "AAAA" << std::endl; + + if (scfg) { + /* + std::cerr << "output.source_phrase=" << output.source_phrase << std::endl; + std::cerr << "output.target_phrase=" << output.target_phrase << std::endl; + reformatSCFG(output); + std::cerr << "output.source_phrase=" << output.source_phrase << std::endl; + std::cerr << "output.target_phrase=" << output.target_phrase << std::endl; + */ + } //Get probabilities it++; - 
output.prob = *it; + output.prob = Trim(*it); + //std::cerr << "output.prob=" << output.prob << "AAAA" << std::endl; //Get WordAllignment it++; if (it == util::TokenIter::end()) return output; - output.word_align = *it; + output.word_align = Trim(*it); + //std::cerr << "output.word_align=" << output.word_align << "AAAA" << std::endl; //Get count it++; if (it == util::TokenIter::end()) return output; - output.counts = *it; + output.counts = Trim(*it); + //std::cerr << "output.counts=" << output.counts << "AAAA" << std::endl; //Get sparse_score it++; if (it == util::TokenIter::end()) return output; - output.sparse_score = *it; + output.sparse_score = Trim(*it); + //std::cerr << "output.sparse_score=" << output.sparse_score << "AAAA" << std::endl; //Get property it++; if (it == util::TokenIter::end()) return output; - output.property = *it; + output.property = Trim(*it); + //std::cerr << "output.property=" << output.property << "AAAA" << std::endl; return output; } -std::vector splitWordAll1(StringPiece textin) +std::vector splitWordAll1(const StringPiece &textin) { const char delim[] = " "; const char delim2[] = "-"; std::vector output; + //Case with no word alignments. + if (textin.size() == 0) { + return output; + } + //Split on space util::TokenIter it(textin, util::MultiCharacter(delim)); //For each int while (it) { //Split on dash (-) - util::TokenIter itInner(*it, util::MultiCharacter(delim2)); + util::TokenIter itInner(*it, + util::MultiCharacter(delim2)); //Insert the two entries in the vector. User will read entry 0 and 1 to get the first, //2 and 3 for second etc. Use unsigned char instead of int to save space, as //word allignments are all very small numbers that fit in a single byte - output.push_back((unsigned char)(atoi(itInner->data()))); + output.push_back((unsigned char) (atoi(itInner->data()))); itInner++; - output.push_back((unsigned char)(atoi(itInner->data()))); + output.push_back((unsigned char) (atoi(itInner->data()))); it++; } @@ -68,3 +94,10 @@ std::vector splitWordAll1(StringPiece textin) } +void reformatSCFG(line_text &output) +{ + +} + +} + diff --git a/moses/TranslationModel/ProbingPT/line_splitter.hh b/moses/TranslationModel/ProbingPT/line_splitter.hh index 2cb9a3c8c..cec0a5f45 100644 --- a/moses/TranslationModel/ProbingPT/line_splitter.hh +++ b/moses/TranslationModel/ProbingPT/line_splitter.hh @@ -9,8 +9,12 @@ #include "util/tokenize_piece.hh" #include +namespace Moses +{ + //Struct for holding processed line -struct line_text { +struct line_text +{ StringPiece source_phrase; StringPiece target_phrase; StringPiece prob; @@ -18,16 +22,38 @@ struct line_text { StringPiece counts; StringPiece sparse_score; StringPiece property; + std::string property_to_be_binarized; }; //Struct for holding processed line -struct target_text { +struct target_text +{ std::vector target_phrase; std::vector prob; - std::vector word_all1; + std::vector word_align_term; + std::vector word_align_non_term; + std::vector counts; + std::vector sparse_score; + std::vector property; + + /* + void Reset() + { + target_phrase.clear(); + prob.clear(); + word_all1.clear(); + counts.clear(); + sparse_score.clear(); + property.clear(); + } + */ }; //Ask if it's better to have it receive a pointer to a line_text struct -line_text splitLine(StringPiece textin); +line_text splitLine(const StringPiece &textin, bool scfg); +void reformatSCFG(line_text &output); + +std::vector splitWordAll1(const StringPiece &textin); + +} -std::vector splitWordAll1(StringPiece textin); diff --git 
a/moses/TranslationModel/ProbingPT/probing_hash_utils.cpp b/moses/TranslationModel/ProbingPT/probing_hash_utils.cpp
index ca3e8f69f..f23f57d66 100644
--- a/moses/TranslationModel/ProbingPT/probing_hash_utils.cpp
+++ b/moses/TranslationModel/ProbingPT/probing_hash_utils.cpp
@@ -1,5 +1,8 @@
 #include "probing_hash_utils.hh"
 
+namespace Moses
+{
+
 //Read table from disk, return memory map location
 char * readTable(const char * filename, size_t size)
 {
@@ -13,7 +16,7 @@ char * readTable(const char * filename, size_t size)
     exit(EXIT_FAILURE);
   }
 
-  map = (char *)mmap(0, size, PROT_READ, MAP_SHARED, fd, 0);
+  map = (char *) mmap(0, size, PROT_READ, MAP_SHARED, fd, 0);
 
   if (map == MAP_FAILED) {
     close(fd);
@@ -24,11 +27,24 @@ char * readTable(const char * filename, size_t size)
   return map;
 }
 
-
-void serialize_table(char *mem, size_t size, const char * filename)
+void serialize_table(char *mem, size_t size, const std::string &filename)
 {
-  std::ofstream os (filename, std::ios::binary);
-  os.write((const char*)&mem[0], size);
+  std::ofstream os(filename.c_str(), std::ios::binary);
+  os.write((const char*) &mem[0], size);
 
   os.close();
-}
\ No newline at end of file
+}
+
+uint64_t getKey(const uint64_t source_phrase[], size_t size)
+{
+  //TOO SLOW
+  //uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size());
+  uint64_t key = 0;
+  for (size_t i = 0; i < size; i++) {
+    key += (source_phrase[i] << i);
+  }
+  return key;
+}
+
+}
+
diff --git a/moses/TranslationModel/ProbingPT/probing_hash_utils.hh b/moses/TranslationModel/ProbingPT/probing_hash_utils.hh
index de96e87a0..dcf0dbe25 100644
--- a/moses/TranslationModel/ProbingPT/probing_hash_utils.hh
+++ b/moses/TranslationModel/ProbingPT/probing_hash_utils.hh
@@ -7,31 +7,49 @@
 #include <sys/mman.h>
 #include <fstream>
 
+namespace Moses
+{
+
+#define API_VERSION 15
 
 //Hash table entry
-struct Entry {
-  uint64_t key;
+struct Entry
+{
   typedef uint64_t Key;
-  unsigned int bytes_toread;
+  Key key;
 
-  uint64_t GetKey() const {
+  Key GetKey() const
+  {
     return key;
   }
 
-  void SetKey(uint64_t to) {
+  void SetKey(Key to)
+  {
     key = to;
   }
 
-  uint64_t GetValue() const {
-    return value;
-  }
-
   uint64_t value;
 };
 
+#define NONE std::numeric_limits<uint64_t>::max()
+
 //Define table
 typedef util::ProbingHashTable<Entry, boost::hash<uint64_t> > Table;
 
-void serialize_table(char *mem, size_t size, const char * filename);
+void serialize_table(char *mem, size_t size, const std::string &filename);
 
 char * readTable(const char * filename, size_t size);
+
+uint64_t getKey(const uint64_t source_phrase[], size_t size);
+
+struct TargetPhraseInfo
+{
+  uint32_t alignTerm;
+  uint32_t alignNonTerm;
+  uint16_t numWords;
+  uint16_t propLength;
+  uint16_t filler;
+};
+
+}
+
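// [Editor's sketch, not part of the patch] How getKey() above combines per-word
// hashes: each word's 64-bit hash is shifted left by its position and summed.
// For a three-word source phrase (words invented):
//   uint64_t ids[3] = { getHash("a"), getHash("b"), getHash("c") };
//   uint64_t key = getKey(ids, 3);  // == (ids[0]<<0) + (ids[1]<<1) + (ids[2]<<2)
// This is cheaper than MurmurHash over the whole phrase (the "TOO SLOW"
// comment), at the cost of occasional key collisions.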
diff --git a/moses/TranslationModel/ProbingPT/quering.cpp b/moses/TranslationModel/ProbingPT/quering.cpp
index bd1d61a1e..ef980ef06 100644
--- a/moses/TranslationModel/ProbingPT/quering.cpp
+++ b/moses/TranslationModel/ProbingPT/quering.cpp
@@ -1,73 +1,80 @@
 #include "quering.hh"
+#include "util/exception.hh"
 
-unsigned char * read_binary_file(const char * filename, size_t filesize)
-{
-  //Get filesize
-  int fd;
-  unsigned char * map;
-
-  fd = open(filename, O_RDONLY);
-
-  if (fd == -1) {
-    perror("Error opening file for reading");
-    exit(EXIT_FAILURE);
-  }
-
-  map = (unsigned char *)mmap(0, filesize, PROT_READ, MAP_SHARED, fd, 0);
-  if (map == MAP_FAILED) {
-    close(fd);
-    perror("Error mmapping the file");
-    exit(EXIT_FAILURE);
-  }
+using namespace std;
 
-  return map;
-}
+namespace Moses
+{
 
-QueryEngine::QueryEngine(const char * filepath) : decoder(filepath)
+QueryEngine::QueryEngine(const char * filepath)
 {
   //Create filepaths
   std::string basepath(filepath);
   std::string path_to_hashtable = basepath + "/probing_hash.dat";
-  std::string path_to_data_bin = basepath + "/binfile.dat";
   std::string path_to_source_vocabid = basepath + "/source_vocabids";
+  std::string alignPath = basepath + "/Alignments.dat";
 
   ///Source phrase vocabids
-  read_map(&source_vocabids, path_to_source_vocabid.c_str());
+  read_map(source_vocabids, path_to_source_vocabid.c_str());
 
-  //Target phrase vocabIDs
-  vocabids = decoder.get_target_lookup_map();
+  // alignments
+  read_alignments(alignPath);
 
   //Read config file
+  boost::unordered_map<std::string, std::string> keyValue;
+
+  std::ifstream config((basepath + "/config").c_str());
   std::string line;
-  std::ifstream config ((basepath + "/config").c_str());
+  while (getline(config, line)) {
+    std::vector<std::string> toks = Tokenize(line, "\t");
+    UTIL_THROW_IF2(toks.size() != 2, "Wrong config format:" << line);
+    keyValue[ toks[0] ] = toks[1];
+  }
+
+  bool found;
 
   //Check API version:
-  getline(config, line);
-  if (atoi(line.c_str()) != API_VERSION) {
-    std::cerr << "The ProbingPT API has changed, please rebinarize your phrase tables." << std::endl;
+  int version;
+  found = Get(keyValue, "API_VERSION", version);
+  if (!found) {
+    std::cerr << "Old or corrupted version of ProbingPT. Please rebinarize your phrase tables." << std::endl;
+  }
+  else if (version != API_VERSION) {
+    std::cerr << "The ProbingPT API has changed. " << version << "!="
+              << API_VERSION << " Please rebinarize your phrase tables." << std::endl;
     exit(EXIT_FAILURE);
   }
+
   //Get tablesize.
-  getline(config, line);
-  int tablesize = atoi(line.c_str());
+  int tablesize;
+  found = Get(keyValue, "uniq_entries", tablesize);
+  if (!found) {
+    std::cerr << "uniq_entries not found" << std::endl;
+    exit(EXIT_FAILURE);
+  }
+
   //Number of scores
-  getline(config, line);
-  num_scores = atoi(line.c_str());
-
-  //do we have a reordering table
-  getline(config, line);
-  std::transform(line.begin(), line.end(), line.begin(), ::tolower); //Get the boolean in lowercase
-  is_reordering = false;
-  if (line == "true") {
-    is_reordering = true;
-    std::cerr << "WARNING. REORDERING TABLES NOT SUPPORTED YET." << std::endl;
+  found = Get(keyValue, "num_scores", num_scores);
+  if (!found) {
+    std::cerr << "num_scores not found" << std::endl;
+    exit(EXIT_FAILURE);
   }
-  config.close();
 
-  //Mmap binary table
-  struct stat filestatus;
-  stat(path_to_data_bin.c_str(), &filestatus);
-  binary_filesize = filestatus.st_size;
-  binary_mmaped = read_binary_file(path_to_data_bin.c_str(), binary_filesize);
+  //How many scores from lex reordering models
+  found = Get(keyValue, "num_lex_scores", num_lex_scores);
+  if (!found) {
+    std::cerr << "num_lex_scores not found" << std::endl;
+    exit(EXIT_FAILURE);
+  }
+
+  // have the scores been log() and FloorScore()?
+  found = Get(keyValue, "log_prob", logProb);
+  if (!found) {
+    std::cerr << "logProb not found" << std::endl;
+    exit(EXIT_FAILURE);
+  }
+
+  config.close();
 
   //Read hashtable
   table_filesize = Table::Size(tablesize, 1.2);
@@ -81,118 +88,50 @@ QueryEngine::QueryEngine(const char * filepath) : decoder(filepath)
 
 QueryEngine::~QueryEngine()
 {
   //Clear mmap content from memory.
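// [Editor's note, assumption] The constructor above expects "config" to be a
// tab-separated key/value file written by createProbingPT, e.g. (values invented):
//   API_VERSION    15
//   uniq_entries   8903154
//   num_scores     4
//   num_lex_scores 0
//   log_prob       1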
- munmap(binary_mmaped, binary_filesize); munmap(mem, table_filesize); } -std::pair > QueryEngine::query(std::vector source_phrase) +uint64_t QueryEngine::getKey(uint64_t source_phrase[], size_t size) const { - bool found; - std::vector translation_entries; - const Entry * entry; //TOO SLOW //uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size()); - uint64_t key = 0; - for (int i = 0; i < source_phrase.size(); i++) { - key += (source_phrase[i] << i); - } - - - found = table.Find(key, entry); - - if (found) { - //The phrase that was searched for was found! We need to get the translation entries. - //We will read the largest entry in bytes and then filter the unnecesarry with functions - //from line_splitter - uint64_t initial_index = entry -> GetValue(); - unsigned int bytes_toread = entry -> bytes_toread; - - //ASK HIEU FOR MORE EFFICIENT WAY TO DO THIS! - std::vector encoded_text; //Assign to the vector the relevant portion of the array. - encoded_text.reserve(bytes_toread); - for (int i = 0; i < bytes_toread; i++) { - encoded_text.push_back(binary_mmaped[i+initial_index]); - } - - //Get only the translation entries necessary - translation_entries = decoder.full_decode_line(encoded_text, num_scores); - - } - - std::pair > output (found, translation_entries); - - return output; - + return getKey(source_phrase, size); } -std::pair > QueryEngine::query(StringPiece source_phrase) +std::pair QueryEngine::query(uint64_t key) { - bool found; - std::vector translation_entries; - const Entry * entry; - //Convert source frase to VID - std::vector source_phrase_vid = getVocabIDs(source_phrase); - //TOO SLOW - //uint64_t key = util::MurmurHashNative(&source_phrase_vid[0], source_phrase_vid.size()); - uint64_t key = 0; - for (int i = 0; i < source_phrase_vid.size(); i++) { - key += (source_phrase_vid[i] << i); - } - - found = table.Find(key, entry); - - - if (found) { - //The phrase that was searched for was found! We need to get the translation entries. - //We will read the largest entry in bytes and then filter the unnecesarry with functions - //from line_splitter - uint64_t initial_index = entry -> GetValue(); - unsigned int bytes_toread = entry -> bytes_toread; - //At the end of the file we can't readd + largest_entry cause we get a segfault. - std::cerr << "Entry size is bytes is: " << bytes_toread << std::endl; - - //ASK HIEU FOR MORE EFFICIENT WAY TO DO THIS! - std::vector encoded_text; //Assign to the vector the relevant portion of the array. 
- encoded_text.reserve(bytes_toread); - for (int i = 0; i < bytes_toread; i++) { - encoded_text.push_back(binary_mmaped[i+initial_index]); - } - - //Get only the translation entries necessary - translation_entries = decoder.full_decode_line(encoded_text, num_scores); + std::pair ret; + const Entry * entry; + ret.first = table.Find(key, entry); + if (ret.first) { + ret.second = entry->value; } - - std::pair > output (found, translation_entries); - - return output; - + return ret; } -void QueryEngine::printTargetInfo(std::vector target_phrases) +void QueryEngine::read_alignments(const std::string &alignPath) { - int entries = target_phrases.size(); + std::ifstream strm(alignPath.c_str()); - for (int i = 0; i toks = Tokenize(line, "\t "); + UTIL_THROW_IF2(toks.size() == 0, "Corrupt alignment file"); - //Print probabilities: - for (int j = 0; j(toks[0]); + if (alignInd >= alignColl.size()) { + alignColl.resize(alignInd + 1); } - std::cout << "\t"; - - //Print word_all1 - for (int j = 0; j(toks[i]); + aligns.push_back(pos); } - std::cout << std::endl; } } + +} + diff --git a/moses/TranslationModel/ProbingPT/quering.hh b/moses/TranslationModel/ProbingPT/quering.hh index e574d1f8f..c43c7f3b9 100644 --- a/moses/TranslationModel/ProbingPT/quering.hh +++ b/moses/TranslationModel/ProbingPT/quering.hh @@ -1,45 +1,65 @@ #pragma once -#include "probing_hash_utils.hh" -#include "huffmanish.hh" -#include "hash.hh" //Includes line splitter +#include #include //For finding size of file #include "vocabid.hh" #include //toLower -#define API_VERSION 3 - +#include +#include "probing_hash_utils.hh" +#include "hash.hh" //Includes line splitter +#include "line_splitter.hh" +#include "moses//Util.h" -char * read_binary_file(char * filename); +namespace Moses +{ class QueryEngine { - unsigned char * binary_mmaped; //The binari phrase table file - std::map vocabids; std::map source_vocabids; + typedef std::vector Alignments; + std::vector alignColl; + Table table; char *mem; //Memory for the table, necessary so that we can correctly destroy the object - HuffmanDecoder decoder; - - size_t binary_filesize; size_t table_filesize; - int num_scores; bool is_reordering; + + void read_alignments(const std::string &alignPath); + public: - QueryEngine (const char *); + int num_scores; + int num_lex_scores; + bool logProb; + + QueryEngine(const char *); ~QueryEngine(); - std::pair > query(StringPiece source_phrase); - std::pair > query(std::vector source_phrase); - void printTargetInfo(std::vector target_phrases); - const std::map getVocab() const { - return decoder.get_target_lookup_map(); - } - const std::map getSourceVocab() const { - return source_vocabids; + std::pair query(uint64_t key); + + const std::map &getSourceVocab() const + { return source_vocabids; } + + const std::vector &getAlignments() const + { return alignColl; } + + uint64_t getKey(uint64_t source_phrase[], size_t size) const; + + template + inline bool Get(const boost::unordered_map &keyValue, const std::string &sought, T &found) const + { + boost::unordered_map::const_iterator iter = keyValue.find(sought); + if (iter == keyValue.end()) { + return false; + } + + const std::string &foundStr = iter->second; + found = Scan(foundStr); + return true; } }; +} diff --git a/moses/TranslationModel/ProbingPT/storing.cpp b/moses/TranslationModel/ProbingPT/storing.cpp index 01128c1e4..98dcfd5dc 100644 --- a/moses/TranslationModel/ProbingPT/storing.cpp +++ b/moses/TranslationModel/ProbingPT/storing.cpp @@ -1,161 +1,303 @@ +#include +#include +#include 
"line_splitter.hh" #include "storing.hh" +#include "StoreTarget.h" +#include "StoreVocab.h" +#include "moses/Util.h" +#include "moses/InputFileStream.h" -BinaryFileWriter::BinaryFileWriter (std::string basepath) : os ((basepath + "/binfile.dat").c_str(), std::ios::binary) +using namespace std; + +namespace Moses { - binfile.reserve(10000); //Reserve part of the vector to avoid realocation - it = binfile.begin(); - dist_from_start = 0; //Initialize variables - extra_counter = 0; -} -void BinaryFileWriter::write (std::vector * bytes) +/////////////////////////////////////////////////////////////////////// +void Node::Add(Table &table, const SourcePhrase &sourcePhrase, size_t pos) { - binfile.insert(it, bytes->begin(), bytes->end()); //Insert the bytes - //Keep track of the offsets - it += bytes->size(); - dist_from_start = distance(binfile.begin(),it); - //Flush the vector to disk every once in a while so that we don't consume too much ram - if (dist_from_start > 9000) { - flush(); + if (pos < sourcePhrase.size()) { + uint64_t vocabId = sourcePhrase[pos]; + + Node *child; + Children::iterator iter = m_children.find(vocabId); + if (iter == m_children.end()) { + // New node. Write other children then discard them + BOOST_FOREACH(Children::value_type &valPair, m_children) { + Node &otherChild = valPair.second; + otherChild.Write(table); + } + m_children.clear(); + + // create new node + child = &m_children[vocabId]; + assert(!child->done); + child->key = key + (vocabId << pos); + } + else { + child = &iter->second; + } + + child->Add(table, sourcePhrase, pos + 1); + } + else { + // this node was written previously 'cos it has rules + done = true; } } -void BinaryFileWriter::flush () +void Node::Write(Table &table) { - //Cast unsigned char to char before writing... - os.write((char *)&binfile[0], dist_from_start); - //Clear the vector: - binfile.clear(); - binfile.reserve(10000); - extra_counter += dist_from_start; //Keep track of the total number of bytes. - it = binfile.begin(); //Reset iterator - dist_from_start = distance(binfile.begin(),it); //Reset dist from start -} + //cerr << "START write " << done << " " << key << endl; + BOOST_FOREACH(Children::value_type &valPair, m_children) { + Node &child = valPair.second; + child.Write(table); + } -BinaryFileWriter::~BinaryFileWriter () -{ - os.close(); - binfile.clear(); + if (!done) { + // save + Entry sourceEntry; + sourceEntry.value = NONE; + sourceEntry.key = key; + + //Put into table + table.Insert(sourceEntry); + } } -void createProbingPT(const char * phrasetable_path, const char * target_path, - const char * num_scores, const char * is_reordering) +/////////////////////////////////////////////////////////////////////// +void createProbingPT(const std::string &phrasetable_path, + const std::string &basepath, int num_scores, int num_lex_scores, + bool log_prob, int max_cache_size, bool scfg) { + std::cerr << "Starting..." << std::endl; + //Get basepath and create directory if missing - std::string basepath(target_path); mkdir(basepath.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); - //Set up huffman and serialize decoder maps. 
- Huffman huffmanEncoder(phrasetable_path); //initialize - huffmanEncoder.assign_values(); - huffmanEncoder.produce_lookups(); - huffmanEncoder.serialize_maps(target_path); + StoreTarget storeTarget(basepath); //Get uniq lines: - unsigned long uniq_entries = huffmanEncoder.getUniqLines(); + unsigned long uniq_entries = countUniqueSource(phrasetable_path); //Source phrase vocabids - std::map source_vocabids; + StoreVocab sourceVocab(basepath + "/source_vocabids"); //Read the file - util::FilePiece filein(phrasetable_path); + util::FilePiece filein(phrasetable_path.c_str()); //Init the probing hash table size_t size = Table::Size(uniq_entries, 1.2); char * mem = new char[size]; memset(mem, 0, size); - Table table(mem, size); + Table sourceEntries(mem, size); - BinaryFileWriter binfile(basepath); //Init the binary file writer. - - line_text prev_line; //Check if the source phrase of the previous line is the same + std::priority_queue, CacheItemOrderer> cache; + float totalSourceCount = 0; //Keep track of the size of each group of target phrases - uint64_t entrystartidx = 0; - //uint64_t line_num = 0; - + size_t line_num = 0; //Read everything and processs - while(true) { + std::string prevSource; + + Node sourcePhrases; + sourcePhrases.done = true; + sourcePhrases.key = 0; + + while (true) { try { //Process line read line_text line; - line = splitLine(filein.ReadLine()); - //Add source phrases to vocabularyIDs - add_to_map(&source_vocabids, line.source_phrase); + line = splitLine(filein.ReadLine(), scfg); + //cerr << "line=" << line.source_phrase << endl; - if ((binfile.dist_from_start + binfile.extra_counter) == 0) { - prev_line = line; //For the first iteration assume the previous line is - } //The same as this one. + ++line_num; + if (line_num % 1000000 == 0) { + std::cerr << line_num << " " << std::flush; + } - if (line.source_phrase != prev_line.source_phrase) { + //Add source phrases to vocabularyIDs + add_to_map(sourceVocab, line.source_phrase); + + if (prevSource.empty()) { + // 1st line + prevSource = line.source_phrase.as_string(); + storeTarget.Append(line, log_prob, scfg); + } + else if (prevSource == line.source_phrase) { + //If we still have the same line, just append to it: + storeTarget.Append(line, log_prob, scfg); + } + else { + assert(prevSource != line.source_phrase); //Create a new entry even + // save + uint64_t targetInd = storeTarget.Save(); + + // next line + storeTarget.Append(line, log_prob, scfg); + //Create an entry for the previous source phrase: - Entry pesho; - pesho.value = entrystartidx; + Entry sourceEntry; + sourceEntry.value = targetInd; //The key is the sum of hashes of individual words bitshifted by their position in the phrase. //Probably not entirerly correct, but fast and seems to work fine in practise. - pesho.key = 0; - std::vector vocabid_source = getVocabIDs(prev_line.source_phrase); - for (int i = 0; i < vocabid_source.size(); i++) { - pesho.key += (vocabid_source[i] << i); + std::vector vocabid_source = getVocabIDs(prevSource); + if (scfg) { + // storing prefixes? 
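// [Editor's note, assumption] For SCFG rules every proper prefix of the source
// phrase must also be findable during chart lookup, so Node (defined above)
// keeps a trie of source vocab ids. Node::Write later inserts one Entry with
// value NONE for each prefix that has no rules of its own, letting a lookup
// distinguish "prefix exists, keep extending" from "no rule starts this way".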
+ sourcePhrases.Add(sourceEntries, vocabid_source); } - pesho.bytes_toread = binfile.dist_from_start + binfile.extra_counter - entrystartidx; + sourceEntry.key = getKey(vocabid_source); + /* + cerr << "prevSource=" << prevSource << flush + << " vocabids=" << Debug(vocabid_source) << flush + << " key=" << sourceEntry.key << endl; + */ //Put into table - table.Insert(pesho); + sourceEntries.Insert(sourceEntry); - entrystartidx = binfile.dist_from_start + binfile.extra_counter; //Designate start idx for new entry + // update cache - CURRENT source phrase, not prev + if (max_cache_size) { + std::string countStr = line.counts.as_string(); + countStr = Trim(countStr); + if (!countStr.empty()) { + std::vector toks = Tokenize(countStr); + //cerr << "CACHE:" << line.source_phrase << " " << countStr << " " << toks[1] << endl; - //Encode a line and write it to disk. - std::vector encoded_line = huffmanEncoder.full_encode_line(line); - binfile.write(&encoded_line); + if (toks.size() >= 2) { + totalSourceCount += toks[1]; - //Set prevLine - prev_line = line; + // compute key for CURRENT source + std::vector currVocabidSource = getVocabIDs(line.source_phrase.as_string()); + uint64_t currKey = getKey(currVocabidSource); - } else { - //If we still have the same line, just append to it: - std::vector encoded_line = huffmanEncoder.full_encode_line(line); - binfile.write(&encoded_line); + CacheItem *item = new CacheItem( + Trim(line.source_phrase.as_string()), + currKey, + toks[1]); + cache.push(item); + + if (max_cache_size > 0 && cache.size() > max_cache_size) { + cache.pop(); + } + } + } + } + + //Set prevLine + prevSource = line.source_phrase.as_string(); } - } catch (util::EndOfFileException e) { - std::cerr << "Reading phrase table finished, writing remaining files to disk." << std::endl; - binfile.flush(); + } + catch (util::EndOfFileException e) { + std::cerr + << "Reading phrase table finished, writing remaining files to disk." + << std::endl; //After the final entry is constructed we need to add it to the phrase_table //Create an entry for the previous source phrase: - Entry pesho; - pesho.value = entrystartidx; + uint64_t targetInd = storeTarget.Save(); + + Entry sourceEntry; + sourceEntry.value = targetInd; + //The key is the sum of hashes of individual words. 
-      pesho.key = 0;
-      std::vector<uint64_t> vocabid_source = getVocabIDs(prev_line.source_phrase);
-      for (int i = 0; i < vocabid_source.size(); i++) {
-        pesho.key += (vocabid_source[i] << i);
-      }
-      pesho.bytes_toread = binfile.dist_from_start + binfile.extra_counter - entrystartidx;
+      std::vector<uint64_t> vocabid_source = getVocabIDs(prevSource);
+      sourceEntry.key = getKey(vocabid_source);
+
       //Put into table
-      table.Insert(pesho);
+      sourceEntries.Insert(sourceEntry);

       break;
     }
   }

-  serialize_table(mem, size, (basepath + "/probing_hash.dat").c_str());
+  sourcePhrases.Write(sourceEntries);
+
+  storeTarget.SaveAlignment();

-  serialize_map(&source_vocabids, (basepath + "/source_vocabids").c_str());
+  serialize_table(mem, size, (basepath + "/probing_hash.dat"));
+
+  sourceVocab.Save();
+
+  serialize_cache(cache, (basepath + "/cache"), totalSourceCount);

   delete[] mem;

   //Write configfile
   std::ofstream configfile;
   configfile.open((basepath + "/config").c_str());
-  configfile << API_VERSION << '\n';
-  configfile << uniq_entries << '\n';
-  configfile << num_scores << '\n';
-  configfile << is_reordering << '\n';
+  configfile << "API_VERSION\t" << API_VERSION << '\n';
+  configfile << "uniq_entries\t" << uniq_entries << '\n';
+  configfile << "num_scores\t" << num_scores << '\n';
+  configfile << "num_lex_scores\t" << num_lex_scores << '\n';
+  configfile << "log_prob\t" << log_prob << '\n';
   configfile.close();
 }
+
+size_t countUniqueSource(const std::string &path)
+{
+  size_t ret = 0;
+  InputFileStream strme(path);
+
+  std::string line, prevSource;
+  while (std::getline(strme, line)) {
+    std::vector<std::string> toks = TokenizeMultiCharSeparator(line, "|||");
+    assert(toks.size() != 0);
+
+    if (prevSource != toks[0]) {
+      prevSource = toks[0];
+      ++ret;
+    }
+  }
+
+  return ret;
+}
+
+void serialize_cache(
+  std::priority_queue<CacheItem*, std::vector<CacheItem*>, CacheItemOrderer> &cache,
+  const std::string &path, float totalSourceCount)
+{
+  std::vector<const CacheItem*> vec(cache.size());
+
+  size_t ind = cache.size() - 1;
+  while (!cache.empty()) {
+    const CacheItem *item = cache.top();
+    vec[ind] = item;
+    cache.pop();
+    --ind;
+  }
+
+  std::ofstream os(path.c_str());
+
+  os << totalSourceCount << std::endl;
+  for (size_t i = 0; i < vec.size(); ++i) {
+    const CacheItem *item = vec[i];
+    os << item->count << "\t" << item->sourceKey << "\t" << item->source << std::endl;
+    delete item;
+  }
+
+  os.close();
+}
+
+uint64_t getKey(const std::vector<uint64_t> &vocabid_source)
+{
+  return getKey(vocabid_source.data(), vocabid_source.size());
+}
+
+std::vector<uint64_t> CreatePrefix(const std::vector<uint64_t> &vocabid_source, size_t endPos)
+{
+  assert(endPos < vocabid_source.size());
+
+  std::vector<uint64_t> ret(endPos + 1);
+  for (size_t i = 0; i <= endPos; ++i) {
+    ret[i] = vocabid_source[i];
+  }
+  return ret;
+}
+
+}
+
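Note on the key computation above: the inline bitshift-and-sum loop removed
from storing.cpp is now hidden behind getKey() (declared in hash.hh). A
minimal standalone sketch of that scheme follows, assuming getKey() keeps
the old loop's behaviour; getKeySketch() and the toy ids are illustrative,
not part of this patch.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Key scheme described in the comments above: sum of the word vocab-ids,
// each left-shifted by its position in the source phrase.
// Assumed to match the getKey() overload in hash.hh.
uint64_t getKeySketch(const std::vector<uint64_t> &vocabids)
{
  uint64_t key = 0;
  for (std::size_t i = 0; i < vocabids.size(); ++i) {
    key += vocabids[i] << i;
  }
  return key;
}

int main()
{
  std::vector<uint64_t> ids; // toy ids standing in for hashed words
  ids.push_back(11);
  ids.push_back(7);
  ids.push_back(3);
  // 11 + (7 << 1) + (3 << 2) = 11 + 14 + 12 = 37
  std::cout << getKeySketch(ids) << std::endl;
  return 0;
}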
diff --git a/moses/TranslationModel/ProbingPT/storing.hh b/moses/TranslationModel/ProbingPT/storing.hh
index 8554d614f..957c73491 100644
--- a/moses/TranslationModel/ProbingPT/storing.hh
+++ b/moses/TranslationModel/ProbingPT/storing.hh
@@ -1,36 +1,95 @@
 #pragma once

+#include <boost/unordered_map.hpp>
+#include <boost/unordered_set.hpp>
 #include <cstdio>
+#include <sstream>
 #include <fstream>
 #include <iostream>
+#include <string>
+#include <queue>
+#include <sys/stat.h> //mkdir

 #include "hash.hh" //Includes line_splitter
 #include "probing_hash_utils.hh"
-#include "huffmanish.hh"
-#include <sys/stat.h> //mkdir

 #include "util/file_piece.hh"
 #include "util/file.hh"

 #include "vocabid.hh"

-#define API_VERSION 3
-
-void createProbingPT(const char * phrasetable_path, const char * target_path,
-                     const char * num_scores, const char * is_reordering);
+namespace Moses
+{
+typedef std::vector<uint64_t> SourcePhrase;
+
+
+class Node
+{
+  typedef boost::unordered_map<uint64_t, Node> Children;
+  Children m_children;
+
+public:
+  uint64_t key;
+  bool done;
+
+  Node()
+    :done(false)
+  {}
+
+  void Add(Table &table, const SourcePhrase &sourcePhrase, size_t pos = 0);
+  void Write(Table &table);
+};
+
+
+void createProbingPT(const std::string &phrasetable_path,
+                     const std::string &basepath, int num_scores, int num_lex_scores,
+                     bool log_prob, int max_cache_size, bool scfg);
+uint64_t getKey(const std::vector<uint64_t> &source_phrase);
+
+std::vector<uint64_t> CreatePrefix(const std::vector<uint64_t> &vocabid_source, size_t endPos);

-class BinaryFileWriter
+template<typename T>
+std::string Debug(const std::vector<T> &vec)
 {
-  std::vector<unsigned char> binfile;
-  std::vector<unsigned char>::iterator it;
-  //Output binary
-  std::ofstream os;
+  std::stringstream strm;
+  for (size_t i = 0; i < vec.size(); ++i) {
+    strm << vec[i] << " ";
+  }
+  return strm.str();
+}

+size_t countUniqueSource(const std::string &path);
+
+class CacheItem
+{
 public:
-  unsigned int dist_from_start; //Distance from the start of the vector.
-  uint64_t extra_counter; //After we reset the counter, we still want to keep track of the correct offset, so
+  std::string source;
+  uint64_t sourceKey;
+  float count;
+  CacheItem(const std::string &vSource, uint64_t vSourceKey, float vCount)
+    :source(vSource)
+    ,sourceKey(vSourceKey)
+    ,count(vCount)
+  {
+  }

-  BinaryFileWriter (std::string);
-  ~BinaryFileWriter ();
-  void write (std::vector<unsigned char> * bytes);
-  void flush (); //Flush to disk
+  bool operator<(const CacheItem &other) const
+  {
+    return count > other.count;
+  }
+};

+class CacheItemOrderer
+{
+public:
+  bool operator()(const CacheItem* a, const CacheItem* b) const
+  {
+    return (*a) < (*b);
+  }
 };
+
+void serialize_cache(
+  std::priority_queue<CacheItem*, std::vector<CacheItem*>, CacheItemOrderer> &cache,
+  const std::string &path, float totalSourceCount);
+
+}
+
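The CacheItem / CacheItemOrderer pair above implements a bounded min-heap
over phrase counts: operator< is deliberately inverted, so the
std::priority_queue keeps the lowest surviving count at top(), and
storing.cpp pops exactly that element whenever the queue grows past
max_cache_size, leaving only the most frequent source phrases in the cache.
A self-contained sketch of that eviction policy, with Item/Orderer and the
toy data as illustrative stand-ins (the real code stores CacheItem
pointers):

#include <cstddef>
#include <iostream>
#include <queue>
#include <string>
#include <vector>

struct Item {            // simplified stand-in for CacheItem
  std::string source;
  float count;
};

struct Orderer {         // same trick as CacheItemOrderer
  // Inverted comparison: top() is the LOWEST count, so pop() evicts
  // the least frequent phrase first.
  bool operator()(const Item &a, const Item &b) const {
    return a.count > b.count;
  }
};

int main()
{
  const std::size_t maxCacheSize = 2; // stands in for --max-cache-size
  std::priority_queue<Item, std::vector<Item>, Orderer> cache;

  const char *sources[] = { "der", "das haus", "haus" };
  const float counts[] = { 100, 3, 40 };

  for (int i = 0; i < 3; ++i) {
    Item item = { sources[i], counts[i] };
    cache.push(item);
    if (cache.size() > maxCacheSize)
      cache.pop(); // evict the current lowest count ("das haus")
  }

  // Prints 40 haus, then 100 der: only the most frequent phrases survive.
  while (!cache.empty()) {
    std::cout << cache.top().count << "\t" << cache.top().source << "\n";
    cache.pop();
  }
  return 0;
}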
diff --git a/moses/TranslationModel/ProbingPT/vocabid.cpp b/moses/TranslationModel/ProbingPT/vocabid.cpp
index 1452f299d..3d6efe841 100644
--- a/moses/TranslationModel/ProbingPT/vocabid.cpp
+++ b/moses/TranslationModel/ProbingPT/vocabid.cpp
@@ -1,32 +1,59 @@
+#include <boost/foreach.hpp>
 #include "vocabid.hh"
+#include "StoreVocab.h"
+#include "moses/Util.h"

-void add_to_map(std::map<uint64_t, std::string> *karta, StringPiece textin)
+namespace Moses
+{
+
+void add_to_map(StoreVocab<uint64_t> &sourceVocab,
+                const StringPiece &textin)
 {
   //Tokenize
-  util::TokenIter<util::SingleCharacter> it(textin, util::SingleCharacter(' '));
+  util::TokenIter<util::SingleCharacter> itWord(textin, util::SingleCharacter(' '));
+
+  while (itWord) {
+    StringPiece word = *itWord;

-  while(it) {
-    karta->insert(std::pair<uint64_t, std::string>(getHash(*it), it->as_string()));
-    it++;
+    util::TokenIter<util::SingleCharacter> itFactor(word, util::SingleCharacter('|'));
+    while (itFactor) {
+      StringPiece factor = *itFactor;
+
+      sourceVocab.Insert(getHash(factor), factor.as_string());
+      itFactor++;
+    }
+    itWord++;
   }
 }

-void serialize_map(std::map<uint64_t, std::string> *karta, const char* filename)
+void serialize_map(const std::map<uint64_t, std::string> &karta,
+                   const std::string &filename)
 {
-  std::ofstream os (filename, std::ios::binary);
-  boost::archive::text_oarchive oarch(os);
+  std::ofstream os(filename.c_str());
+
+  std::map<uint64_t, std::string>::const_iterator iter;
+  for (iter = karta.begin(); iter != karta.end(); ++iter) {
+    os << iter->first << '\t' << iter->second << std::endl;
+  }

-  oarch << *karta; //Serialise map
   os.close();
 }

-void read_map(std::map<uint64_t, std::string> *karta, const char* filename)
+void read_map(std::map<uint64_t, std::string> &karta, const char* filename)
 {
-  std::ifstream is (filename, std::ios::binary);
-  boost::archive::text_iarchive iarch(is);
+  std::ifstream is(filename);

-  iarch >> *karta;
+  std::string line;
+  while (getline(is, line)) {
+    std::vector<std::string> toks = Tokenize(line, "\t");
+    assert(toks.size() == 2);
+    uint64_t ind = Scan<uint64_t>(toks[1]);
+    karta[ind] = toks[0];
+  }

   //Close the stream after we are done.
   is.close();
 }
+
+}
+
diff --git a/moses/TranslationModel/ProbingPT/vocabid.hh b/moses/TranslationModel/ProbingPT/vocabid.hh
index 491c53439..f9c9b2dff 100644
--- a/moses/TranslationModel/ProbingPT/vocabid.hh
+++ b/moses/TranslationModel/ProbingPT/vocabid.hh
@@ -13,8 +13,17 @@
 #include "util/string_piece.hh" //Tokenization and work with StringPiece
 #include "util/tokenize_piece.hh"

-void add_to_map(std::map<uint64_t, std::string> *karta, StringPiece textin);
+namespace Moses
+{
+template<typename VOCABID>
+class StoreVocab;

-void serialize_map(std::map<uint64_t, std::string> *karta, const char* filename);
+void add_to_map(StoreVocab<uint64_t> &sourceVocab,
+                const StringPiece &textin);

-void read_map(std::map<uint64_t, std::string> *karta, const char* filename);
+void serialize_map(const std::map<uint64_t, std::string> &karta,
+                   const std::string &filename);
+
+void read_map(std::map<uint64_t, std::string> &karta, const char* filename);
+
+}
-- 
cgit v1.2.3
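A closing note on the vocabulary files: the boost text archives are gone, and
source_vocabids is now plain text with one tab-separated entry per line. As
written, read_map() takes the id from the second column (word first, id
second), which presumably matches the layout StoreVocab<uint64_t>::Save()
produces, while serialize_map() above still writes the id in the first
column. A round-trip sketch of the layout read_map() expects; the file name
vocab.txt and the toy entries are hypothetical:

#include <cassert>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <string>

int main()
{
  // Write a toy vocab file in the "word<TAB>id" layout read_map() parses.
  {
    std::ofstream os("vocab.txt");
    os << "haus\t" << 1001 << "\n";
    os << "der\t" << 1002 << "\n";
  }

  // Read it back into id -> word, mirroring read_map()'s loop.
  std::map<uint64_t, std::string> karta;
  std::ifstream is("vocab.txt");
  std::string line;
  while (std::getline(is, line)) {
    std::istringstream strm(line);
    std::string word, idStr;
    std::getline(strm, word, '\t'); // toks[0]: the surface word
    std::getline(strm, idStr);      // toks[1]: the vocab id
    uint64_t id = 0;
    std::istringstream idStrm(idStr);
    idStrm >> id;
    karta[id] = word;
  }

  assert(karta[1001] == "haus");
  std::cout << karta[1002] << std::endl; // prints "der"
  return 0;
}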