From 3a72b4958a3fc468b6bd6102e67e24007c9b2d9b Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 3 Oct 2016 19:02:06 +0100 Subject: update Moses::ProbingPT with Moses2::ProbingPT. Does not compile --- contrib/other-builds/moses/.project | 32 +- misc/CreateProbingPT.cpp | 108 ++++- misc/Jamfile | 4 +- misc/QueryProbingPT.cpp | 2 +- moses/TranslationModel/ProbingPT/ProbingPT.cpp | 93 +++-- moses/TranslationModel/ProbingPT/ProbingPT.h | 19 +- moses/TranslationModel/ProbingPT/hash.cpp | 36 +- moses/TranslationModel/ProbingPT/hash.hh | 7 +- moses/TranslationModel/ProbingPT/huffmanish.cpp | 451 --------------------- moses/TranslationModel/ProbingPT/huffmanish.hh | 112 ----- moses/TranslationModel/ProbingPT/line_splitter.cpp | 59 ++- moses/TranslationModel/ProbingPT/line_splitter.hh | 36 +- .../ProbingPT/probing_hash_utils.cpp | 28 +- .../ProbingPT/probing_hash_utils.hh | 38 +- moses/TranslationModel/ProbingPT/quering.cpp | 221 ++++------ moses/TranslationModel/ProbingPT/quering.hh | 62 ++- moses/TranslationModel/ProbingPT/storing.cpp | 322 +++++++++++---- moses/TranslationModel/ProbingPT/storing.hh | 91 ++++- moses/TranslationModel/ProbingPT/vocabid.cpp | 53 ++- moses/TranslationModel/ProbingPT/vocabid.hh | 15 +- 20 files changed, 837 insertions(+), 952 deletions(-) delete mode 100644 moses/TranslationModel/ProbingPT/huffmanish.cpp delete mode 100644 moses/TranslationModel/ProbingPT/huffmanish.hh diff --git a/contrib/other-builds/moses/.project b/contrib/other-builds/moses/.project index b59f28e08..c25eb5225 100644 --- a/contrib/other-builds/moses/.project +++ b/contrib/other-builds/moses/.project @@ -1319,7 +1319,7 @@ FF/PhraseBoundaryFeature.h 1 PARENT-3-PROJECT_LOC/moses/FF/PhraseBoundaryFeature.h - + FF/PhraseDistanceFeature.cpp 1 @@ -3341,24 +3341,34 @@ PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/ProbingPT.h - TranslationModel/ProbingPT/hash.cpp + TranslationModel/ProbingPT/StoreTarget.cpp 1 - 
PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/hash.cpp + PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/StoreTarget.cpp - TranslationModel/ProbingPT/hash.hh + TranslationModel/ProbingPT/StoreTarget.h 1 - PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/hash.hh + PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/StoreTarget.h - TranslationModel/ProbingPT/huffmanish.cpp + TranslationModel/ProbingPT/StoreVocab.cpp 1 - PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/huffmanish.cpp + PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/StoreVocab.cpp - TranslationModel/ProbingPT/huffmanish.hh + TranslationModel/ProbingPT/StoreVocab.h 1 - PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/huffmanish.hh + PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/StoreVocab.h + + + TranslationModel/ProbingPT/hash.cpp + 1 + PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/hash.cpp + + + TranslationModel/ProbingPT/hash.hh + 1 + PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/hash.hh TranslationModel/ProbingPT/line_splitter.cpp @@ -3664,7 +3674,7 @@ TranslationModel/UG/sapt_pscore_coherence.h 1 PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_coherence.h - + TranslationModel/UG/sapt_pscore_lex1.h 1 @@ -3709,7 +3719,7 @@ TranslationModel/UG/sapt_pscore_wordcount.h 1 PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_wordcount.h - + TranslationModel/UG/sim-pe.cc 1 diff --git a/misc/CreateProbingPT.cpp b/misc/CreateProbingPT.cpp index b23427f30..dff916660 100644 --- a/misc/CreateProbingPT.cpp +++ b/misc/CreateProbingPT.cpp @@ -1,29 +1,113 @@ +#include +#include #include "util/usage.hh" #include "moses/TranslationModel/ProbingPT/storing.hh" +#include "moses/InputFileStream.h" +#include "moses/OutputFileStream.h" +#include "moses/Util.h" +using namespace std; +std::string ReformatSCFGFile(const std::string &path); int main(int argc, char* argv[]) { + string inPath, outPath; + int num_scores = 4; + int num_lex_scores = 0; + 
bool log_prob = false; + bool scfg = false; + int max_cache_size = 50000; - const char * is_reordering = "false"; + namespace po = boost::program_options; + po::options_description desc("Options"); + desc.add_options() + ("help", "Print help messages") + ("input-pt", po::value()->required(), "Text pt") + ("output-dir", po::value()->required(), "Directory when binary files will be written") + ("num-scores", po::value()->default_value(num_scores), "Number of pt scores") + ("num-lex-scores", po::value()->default_value(num_lex_scores), "Number of lexicalized reordering scores") + ("log-prob", "log (and floor) probabilities before storing") + ("max-cache-size", po::value()->default_value(max_cache_size), "Maximum number of high-count source lines to write to cache file. 0=no cache, negative=no limit") + ("scfg", "Rules are SCFG in Moses format (ie. with non-terms and LHS") - if (!(argc == 5 || argc == 4)) { - // Tell the user how to run the program - std::cerr << "Provided " << argc << " arguments, needed 4 or 5." << std::endl; - std::cerr << "Usage: " << argv[0] << " path_to_phrasetable output_dir num_scores is_reordering" << std::endl; - std::cerr << "is_reordering should be either true or false, but it is currently a stub feature." 
<< std::endl; - //std::cerr << "Usage: " << argv[0] << " path_to_phrasetable number_of_uniq_lines output_bin_file output_hash_table output_vocab_id" << std::endl; - return 1; + ; + + po::variables_map vm; + try { + po::store(po::parse_command_line(argc, argv, desc), + vm); // can throw + + /** --help option + */ + if ( vm.count("help")) { + std::cout << desc << std::endl; + return EXIT_SUCCESS; + } + + po::notify(vm); // throws on error, so do after help in case + // there are any problems + } catch(po::error& e) { + std::cerr << "ERROR: " << e.what() << std::endl << std::endl; + std::cerr << desc << std::endl; + return EXIT_FAILURE; } - if (argc == 5) { - is_reordering = argv[4]; + if (vm.count("input-pt")) inPath = vm["input-pt"].as(); + if (vm.count("output-dir")) outPath = vm["output-dir"].as(); + if (vm.count("num-scores")) num_scores = vm["num-scores"].as(); + if (vm.count("num-lex-scores")) num_lex_scores = vm["num-lex-scores"].as(); + if (vm.count("max-cache-size")) max_cache_size = vm["max-cache-size"].as(); + if (vm.count("log-prob")) log_prob = true; + if (vm.count("scfg")) scfg = true; + + + if (scfg) { + inPath = ReformatSCFGFile(inPath); } - createProbingPT(argv[1], argv[2], argv[3], is_reordering); + Moses::createProbingPT(inPath, outPath, num_scores, num_lex_scores, log_prob, max_cache_size, scfg); - util::PrintUsage(std::cout); + //util::PrintUsage(std::cout); return 0; } +std::string ReformatSCFGFile(const std::string &path) +{ + Moses::InputFileStream inFile(path); + string reformattedPath = path + ".reformat.gz"; + Moses::OutputFileStream outFile(reformattedPath); + + string line; + while (getline(inFile, line)) { + vector toks = Moses::TokenizeMultiCharSeparator(line, "|||"); + assert(toks.size() >= 3); + + // source + vector sourceToks = Moses::Tokenize(toks[0], " "); + for (size_t i = 0; i < sourceToks.size() - 1; ++i) { + outFile << sourceToks[i] << " "; + } + + // other columns + for (size_t i = 1; i < toks.size(); ++i) { + outFile << "|||" 
<< toks[i]; + } + outFile << endl; + } + + inFile.Close(); + outFile.Close(); + + string sortedPath = path + ".reformat.sorted.gz"; + string tmpPath = path + ".tmp "; + string cmd = "mkdir " + tmpPath + + " && gzip -dc " + reformattedPath + " | LC_ALL=C sort -T " + tmpPath + " | gzip -c > " + sortedPath; + system(cmd.c_str()); + + cmd = "rm -rf " + tmpPath + " " + reformattedPath; + system(cmd.c_str()); + + return sortedPath; +} + diff --git a/misc/Jamfile b/misc/Jamfile index f1599aca8..135490a46 100644 --- a/misc/Jamfile +++ b/misc/Jamfile @@ -31,9 +31,9 @@ else { } exe CreateProbingPT : CreateProbingPT.cpp ..//boost_filesystem ../moses//moses ; -exe QueryProbingPT : QueryProbingPT.cpp ..//boost_filesystem ../moses//moses ; +#exe QueryProbingPT : QueryProbingPT.cpp ..//boost_filesystem ../moses//moses ; -alias programsProbing : CreateProbingPT QueryProbingPT ; +alias programsProbing : CreateProbingPT ; #QueryProbingPT exe merge-sorted : merge-sorted.cc diff --git a/misc/QueryProbingPT.cpp b/misc/QueryProbingPT.cpp index 72fd0be11..5047d4d47 100644 --- a/misc/QueryProbingPT.cpp +++ b/misc/QueryProbingPT.cpp @@ -34,7 +34,7 @@ int main(int argc, char* argv[]) return 1; } - QueryEngine queries(argv[1]); + Moses::QueryEngine queries(argv[1]); //Interactive search std::cout << "Please enter a string to be searched, or exit to exit." 
<< std::endl; diff --git a/moses/TranslationModel/ProbingPT/ProbingPT.cpp b/moses/TranslationModel/ProbingPT/ProbingPT.cpp index cbfd2c1a4..bb3f26e22 100644 --- a/moses/TranslationModel/ProbingPT/ProbingPT.cpp +++ b/moses/TranslationModel/ProbingPT/ProbingPT.cpp @@ -3,6 +3,7 @@ #include "moses/StaticData.h" #include "moses/FactorCollection.h" #include "moses/TargetPhraseCollection.h" +#include "moses/InputFileStream.h" #include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.h" #include "quering.hh" @@ -34,44 +35,94 @@ void ProbingPT::Load(AllOptions::ptr const& opts) m_unkId = 456456546456; + FactorCollection &vocab = FactorCollection::Instance(); + // source vocab - const std::map &sourceVocab = m_engine->getSourceVocab(); + const std::map &sourceVocab = + m_engine->getSourceVocab(); std::map::const_iterator iterSource; - for (iterSource = sourceVocab.begin(); iterSource != sourceVocab.end(); ++iterSource) { - const string &wordStr = iterSource->second; - const Factor *factor = FactorCollection::Instance().AddFactor(wordStr); + for (iterSource = sourceVocab.begin(); iterSource != sourceVocab.end(); + ++iterSource) { + string wordStr = iterSource->second; + //cerr << "wordStr=" << wordStr << endl; - uint64_t probingId = iterSource->first; + const Factor *factor = vocab.AddFactor(wordStr); - SourceVocabMap::value_type entry(factor, probingId); - m_sourceVocabMap.insert(entry); + uint64_t probingId = iterSource->first; + size_t factorId = factor->GetId(); + if (factorId >= m_sourceVocab.size()) { + m_sourceVocab.resize(factorId + 1, m_unkId); + } + m_sourceVocab[factorId] = probingId; } // target vocab - const std::map &probingVocab = m_engine->getVocab(); - std::map::const_iterator iter; - for (iter = probingVocab.begin(); iter != probingVocab.end(); ++iter) { - const string &wordStr = iter->second; - const Factor *factor = FactorCollection::Instance().AddFactor(wordStr); + InputFileStream targetVocabStrme(m_filePath + "/TargetVocab.dat"); + 
string line; + while (getline(targetVocabStrme, line)) { + vector toks = Tokenize(line, "\t"); + UTIL_THROW_IF2(toks.size() != 2, string("Incorrect format:") + line + "\n"); + + //cerr << "wordStr=" << toks[0] << endl; + + const Factor *factor = vocab.AddFactor(toks[0]); + uint32_t probingId = Scan(toks[1]); + + if (probingId >= m_targetVocab.size()) { + m_targetVocab.resize(probingId + 1); + } + + m_targetVocab[probingId] = factor; + } + + // alignments + CreateAlignmentMap(m_filePath + "/Alignments.dat"); - unsigned int probingId = iter->first; + // memory mapped file to tps + string filePath = m_filePath + "/TargetColl.dat"; + file.open(filePath.c_str()); + if (!file.is_open()) { + throw "Couldn't open file "; + } + + data = file.data(); + //size_t size = file.size(); + + // cache + //CreateCache(system); - TargetVocabMap::value_type entry(factor, probingId); - m_vocabMap.insert(entry); +} +void ProbingPT::CreateAlignmentMap(const std::string path) +{ + const std::vector< std::vector > &probingAlignColl = m_engine->getAlignments(); + m_aligns.resize(probingAlignColl.size(), NULL); + + for (size_t i = 0; i < probingAlignColl.size(); ++i) { + AlignmentInfo::CollType aligns; + + const std::vector &probingAligns = probingAlignColl[i]; + for (size_t j = 0; j < probingAligns.size(); j += 2) { + size_t startPos = probingAligns[j]; + size_t endPos = probingAligns[j+1]; + //cerr << "startPos=" << startPos << " " << endPos << endl; + aligns.insert(std::pair(startPos, endPos)); + } + + const AlignmentInfo *align = AlignmentInfoCollection::Instance().Add(aligns); + m_aligns[i] = align; + //cerr << "align=" << align->Debug(system) << endl; } } void ProbingPT::InitializeForInput(ttasksptr const& ttask) { - ReduceCache(); + } void ProbingPT::GetTargetPhraseCollectionBatch(const InputPathList &inputPathQueue) const { - CacheColl &cache = GetCache(); - InputPathList::const_iterator iter; for (iter = inputPathQueue.begin(); iter != inputPathQueue.end(); ++iter) { InputPath 
&inputPath = **iter; @@ -82,12 +133,6 @@ void ProbingPT::GetTargetPhraseCollectionBatch(const InputPathList &inputPathQue } TargetPhraseCollection::shared_ptr tpColl = CreateTargetPhrase(sourcePhrase); - - // add target phrase to phrase-table cache - size_t hash = hash_value(sourcePhrase); - std::pair value(tpColl, clock()); - cache[hash] = value; - inputPath.SetTargetPhrases(*this, tpColl, NULL); } } diff --git a/moses/TranslationModel/ProbingPT/ProbingPT.h b/moses/TranslationModel/ProbingPT/ProbingPT.h index 4e7ab02c6..3b5dfc895 100644 --- a/moses/TranslationModel/ProbingPT/ProbingPT.h +++ b/moses/TranslationModel/ProbingPT/ProbingPT.h @@ -1,17 +1,17 @@ #pragma once - +#include #include #include "../PhraseDictionary.h" -class QueryEngine; -class target_text; namespace Moses { class ChartParser; class ChartCellCollectionBase; class ChartRuleLookupManager; +class QueryEngine; +class target_text; class ProbingPT : public PhraseDictionary { @@ -39,12 +39,16 @@ public: protected: QueryEngine *m_engine; + uint64_t m_unkId; - typedef boost::bimap SourceVocabMap; - mutable SourceVocabMap m_sourceVocabMap; + std::vector m_sourceVocab; // factor id -> pt id + std::vector m_targetVocab; // pt id -> factor* + std::vector m_aligns; - typedef boost::bimap TargetVocabMap; - mutable TargetVocabMap m_vocabMap; + boost::iostreams::mapped_file_source file; + const char *data; + + void CreateAlignmentMap(const std::string path); TargetPhraseCollection::shared_ptr CreateTargetPhrase(const Phrase &sourcePhrase) const; TargetPhrase *CreateTargetPhrase(const Phrase &sourcePhrase, const target_text &probingTargetPhrase) const; @@ -53,7 +57,6 @@ protected: std::vector ConvertToProbingSourcePhrase(const Phrase &sourcePhrase, bool &ok) const; - uint64_t m_unkId; }; } // namespace Moses diff --git a/moses/TranslationModel/ProbingPT/hash.cpp b/moses/TranslationModel/ProbingPT/hash.cpp index 8945649ef..27a64b129 100644 --- a/moses/TranslationModel/ProbingPT/hash.cpp +++ 
b/moses/TranslationModel/ProbingPT/hash.cpp @@ -1,5 +1,11 @@ +#include #include "hash.hh" +using namespace std; + +namespace Moses +{ + uint64_t getHash(StringPiece text) { std::size_t len = text.size(); @@ -7,24 +13,32 @@ uint64_t getHash(StringPiece text) return key; } -std::vector getVocabIDs(StringPiece textin) +std::vector getVocabIDs(const StringPiece &textin) { //Tokenize std::vector output; - util::TokenIter it(textin, util::SingleCharacter(' ')); + util::TokenIter itWord(textin, util::SingleCharacter(' ')); + + while (itWord) { + StringPiece word = *itWord; + uint64_t id = 0; + + util::TokenIter itFactor(word, util::SingleCharacter('|')); + while (itFactor) { + StringPiece factor = *itFactor; + //cerr << "factor=" << factor << endl; - while(it) { - output.push_back(getHash(*it)); - it++; + id += getHash(factor); + itFactor++; + } + + output.push_back(id); + itWord++; } return output; } -uint64_t getVocabID(std::string candidate) -{ - std::size_t len = candidate.length(); - uint64_t key = util::MurmurHashNative(candidate.c_str(), len); - return key; -} \ No newline at end of file +} + diff --git a/moses/TranslationModel/ProbingPT/hash.hh b/moses/TranslationModel/ProbingPT/hash.hh index 607238ae1..f218ad9da 100644 --- a/moses/TranslationModel/ProbingPT/hash.hh +++ b/moses/TranslationModel/ProbingPT/hash.hh @@ -6,9 +6,12 @@ #include "util/tokenize_piece.hh" #include +namespace Moses +{ + //Gets the MurmurmurHash for give string uint64_t getHash(StringPiece text); -std::vector getVocabIDs(StringPiece textin); +std::vector getVocabIDs(const StringPiece &textin); -uint64_t getVocabID(std::string candidate); \ No newline at end of file +} diff --git a/moses/TranslationModel/ProbingPT/huffmanish.cpp b/moses/TranslationModel/ProbingPT/huffmanish.cpp deleted file mode 100644 index 534fd04d1..000000000 --- a/moses/TranslationModel/ProbingPT/huffmanish.cpp +++ /dev/null @@ -1,451 +0,0 @@ -#include "huffmanish.hh" - -Huffman::Huffman (const char * filepath) -{ - //Read 
the file - util::FilePiece filein(filepath); - - //Init uniq_lines to zero; - uniq_lines = 0; - - line_text prev_line; //Check for unique lines. - int num_lines = 0 ; - - while (true) { - line_text new_line; - - num_lines++; - - try { - //Process line read - new_line = splitLine(filein.ReadLine()); - count_elements(new_line); //Counts the number of elements, adds new and increments counters. - - } catch (util::EndOfFileException e) { - std::cerr << "Unique entries counted: "; - break; - } - - if (new_line.source_phrase == prev_line.source_phrase) { - continue; - } else { - uniq_lines++; - prev_line = new_line; - } - } - - std::cerr << uniq_lines << std::endl; -} - -void Huffman::count_elements(line_text linein) -{ - //For target phrase: - util::TokenIter it(linein.target_phrase, util::SingleCharacter(' ')); - while (it) { - //Check if we have that entry - std::map::iterator mapiter; - mapiter = target_phrase_words.find(it->as_string()); - - if (mapiter != target_phrase_words.end()) { - //If the element is found, increment the count. - mapiter->second++; - } else { - //Else create a new entry; - target_phrase_words.insert(std::pair(it->as_string(), 1)); - } - it++; - } - - //For word allignment 1 - std::map, unsigned int>::iterator mapiter3; - std::vector numbers = splitWordAll1(linein.word_align); - mapiter3 = word_all1.find(numbers); - - if (mapiter3 != word_all1.end()) { - //If the element is found, increment the count. - mapiter3->second++; - } else { - //Else create a new entry; - word_all1.insert(std::pair, unsigned int>(numbers, 1)); - } - -} - -//Assigns huffman values for each unique element -void Huffman::assign_values() -{ - //First create vectors for all maps so that we could sort them later. 
- - //Create a vector for target phrases - for(std::map::iterator it = target_phrase_words.begin(); it != target_phrase_words.end(); it++ ) { - target_phrase_words_counts.push_back(*it); - } - //Sort it - std::sort(target_phrase_words_counts.begin(), target_phrase_words_counts.end(), sort_pair()); - - //Create a vector for word allignments 1 - for(std::map, unsigned int>::iterator it = word_all1.begin(); it != word_all1.end(); it++ ) { - word_all1_counts.push_back(*it); - } - //Sort it - std::sort(word_all1_counts.begin(), word_all1_counts.end(), sort_pair_vec()); - - - //Afterwards we assign a value for each phrase, starting from 1, as zero is reserved for delimiter - unsigned int i = 1; //huffman code - for(std::vector >::iterator it = target_phrase_words_counts.begin(); - it != target_phrase_words_counts.end(); it++) { - target_phrase_huffman.insert(std::pair(it->first, i)); - i++; //Go to the next huffman code - } - - i = 1; //Reset i for the next map - for(std::vector, unsigned int> >::iterator it = word_all1_counts.begin(); - it != word_all1_counts.end(); it++) { - word_all1_huffman.insert(std::pair, unsigned int>(it->first, i)); - i++; //Go to the next huffman code - } - - //After lookups are produced, clear some memory usage of objects not needed anymore. - target_phrase_words.clear(); - word_all1.clear(); - - target_phrase_words_counts.clear(); - word_all1_counts.clear(); - - std::cerr << "Finished generating huffman codes." << std::endl; - -} - -void Huffman::serialize_maps(const char * dirname) -{ - //Note that directory name should exist. 
- std::string basedir(dirname); - std::string target_phrase_path(basedir + "/target_phrases"); - std::string probabilities_path(basedir + "/probs"); - std::string word_all1_path(basedir + "/Wall1"); - - //Target phrase - std::ofstream os (target_phrase_path.c_str(), std::ios::binary); - boost::archive::text_oarchive oarch(os); - oarch << lookup_target_phrase; - os.close(); - - //Word all1 - std::ofstream os2 (word_all1_path.c_str(), std::ios::binary); - boost::archive::text_oarchive oarch2(os2); - oarch2 << lookup_word_all1; - os2.close(); -} - -std::vector Huffman::full_encode_line(line_text line) -{ - return vbyte_encode_line((encode_line(line))); -} - -std::vector Huffman::encode_line(line_text line) -{ - std::vector retvector; - - //Get target_phrase first. - util::TokenIter it(line.target_phrase, util::SingleCharacter(' ')); - while (it) { - retvector.push_back(target_phrase_huffman.find(it->as_string())->second); - it++; - } - //Add a zero; - retvector.push_back(0); - - //Get probabilities. Reinterpreting the float bytes as unsgined int. 
- util::TokenIter probit(line.prob, util::SingleCharacter(' ')); - while (probit) { - //Sometimes we have too big floats to handle, so first convert to double - double tempnum = atof(probit->data()); - float num = (float)tempnum; - retvector.push_back(reinterpret_float(&num)); - probit++; - } - //Add a zero; - retvector.push_back(0); - - - //Get Word allignments - retvector.push_back(word_all1_huffman.find(splitWordAll1(line.word_align))->second); - retvector.push_back(0); - - return retvector; -} - -void Huffman::produce_lookups() -{ - //basically invert every map that we have - for(std::map::iterator it = target_phrase_huffman.begin(); it != target_phrase_huffman.end(); it++ ) { - lookup_target_phrase.insert(std::pair(it->second, it->first)); - } - - for(std::map, unsigned int>::iterator it = word_all1_huffman.begin(); it != word_all1_huffman.end(); it++ ) { - lookup_word_all1.insert(std::pair >(it->second, it->first)); - } - -} - -HuffmanDecoder::HuffmanDecoder (const char * dirname) -{ - //Read the maps from disk - - //Note that directory name should exist. 
- std::string basedir(dirname); - std::string target_phrase_path(basedir + "/target_phrases"); - std::string word_all1_path(basedir + "/Wall1"); - - //Target phrases - std::ifstream is (target_phrase_path.c_str(), std::ios::binary); - boost::archive::text_iarchive iarch(is); - iarch >> lookup_target_phrase; - is.close(); - - //Word allignment 1 - std::ifstream is2 (word_all1_path.c_str(), std::ios::binary); - boost::archive::text_iarchive iarch2(is2); - iarch2 >> lookup_word_all1; - is2.close(); - -} - -HuffmanDecoder::HuffmanDecoder (std::map * lookup_target, - std::map > * lookup_word1) -{ - lookup_target_phrase = *lookup_target; - lookup_word_all1 = *lookup_word1; -} - -std::vector HuffmanDecoder::full_decode_line (std::vector lines, int num_scores) -{ - std::vector retvector; //All target phrases - std::vector decoded_lines = vbyte_decode_line(lines); //All decoded lines - std::vector::iterator it = decoded_lines.begin(); //Iterator for them - std::vector current_target_phrase; //Current target phrase decoded - - short zero_count = 0; //Count home many zeroes we have met. so far. Every 3 zeroes mean a new target phrase. - while(it != decoded_lines.end()) { - if (zero_count == 1) { - //We are extracting scores. we know how many scores there are so we can push them - //to the vector. This is done in case any of the scores is 0, because it would mess - //up the state machine. - for (int i = 0; i < num_scores; i++) { - current_target_phrase.push_back(*it); - it++; - } - } - - if (zero_count == 3) { - //We have finished with this entry, decode it, and add it to the retvector. - retvector.push_back(decode_line(current_target_phrase, num_scores)); - current_target_phrase.clear(); //Clear the current target phrase and the zero_count - zero_count = 0; //So that we can reuse them for the next target phrase - } - //Add to the next target_phrase, number by number. 
- current_target_phrase.push_back(*it); - if (*it == 0) { - zero_count++; - } - it++; //Go to the next word/symbol - } - //Don't forget the last remaining line! - if (zero_count == 3) { - //We have finished with this entry, decode it, and add it to the retvector. - retvector.push_back(decode_line(current_target_phrase, num_scores)); - current_target_phrase.clear(); //Clear the current target phrase and the zero_count - zero_count = 0; //So that we can reuse them for the next target phrase - } - - return retvector; - -} - -target_text HuffmanDecoder::decode_line (std::vector input, int num_scores) -{ - //demo decoder - target_text ret; - //Split everything - std::vector target_phrase; - std::vector probs; - unsigned int wAll; - - //Split the line into the proper arrays - short num_zeroes = 0; - int counter = 0; - while (num_zeroes < 3) { - unsigned int num = input[counter]; - if (num == 0) { - num_zeroes++; - } else if (num_zeroes == 0) { - target_phrase.push_back(num); - } else if (num_zeroes == 1) { - //Push exactly num_scores scores - for (int i = 0; i < num_scores; i++) { - probs.push_back(num); - counter++; - num = input[counter]; - } - continue; - } else if (num_zeroes == 2) { - wAll = num; - } - counter++; - } - - ret.target_phrase = target_phrase; - ret.word_all1 = lookup_word_all1.find(wAll)->second; - - //Decode probabilities - for (std::vector::iterator it = probs.begin(); it != probs.end(); it++) { - ret.prob.push_back(reinterpret_uint(&(*it))); - } - - return ret; - -} - -inline std::string HuffmanDecoder::getTargetWordFromID(unsigned int id) -{ - return lookup_target_phrase.find(id)->second; -} - -std::string HuffmanDecoder::getTargetWordsFromIDs(std::vector ids) -{ - std::string returnstring; - for (std::vector::iterator it = ids.begin(); it != ids.end(); it++) { - returnstring.append(getTargetWordFromID(*it) + " "); - } - - return returnstring; -} - -inline std::string getTargetWordFromID(unsigned int id, std::map * lookup_target_phrase) -{ - return 
lookup_target_phrase->find(id)->second; -} - -std::string getTargetWordsFromIDs(std::vector ids, std::map * lookup_target_phrase) -{ - std::string returnstring; - for (std::vector::iterator it = ids.begin(); it != ids.end(); it++) { - returnstring.append(getTargetWordFromID(*it, lookup_target_phrase) + " "); - } - - return returnstring; -} - -/*Those functions are used to more easily store the floats in the binary phrase table - We convert the float unsinged int so that it is the same as our other values and we can - apply variable byte encoding on top of it.*/ - -inline unsigned int reinterpret_float(float * num) -{ - unsigned int * converted_num; - converted_num = reinterpret_cast(num); - return *converted_num; -} - -inline float reinterpret_uint(unsigned int * num) -{ - float * converted_num; - converted_num = reinterpret_cast(num); - return *converted_num; -} - -/*Mostly taken from stackoverflow, http://stackoverflow.com/questions/5858646/optimizing-variable-length-encoding -and modified in order to return a vector of chars. Implements ULEB128 or variable byte encoding. -This is highly optimized version with unrolled loop */ -inline std::vector vbyte_encode(unsigned int num) -{ - //Determine how many bytes we are going to take. - short size; - std::vector byte_vector; - - if (num < 0x00000080U) { - size = 1; - byte_vector.reserve(size); - goto b1; - } - if (num < 0x00004000U) { - size = 2; - byte_vector.reserve(size); - goto b2; - } - if (num < 0x00200000U) { - size = 3; - byte_vector.reserve(size); - goto b3; - } - if (num < 0x10000000U) { - size = 4; - byte_vector.reserve(size); - goto b4; - } - size = 5; - byte_vector.reserve(size); - - - //Now proceed with the encoding. 
- byte_vector.push_back((num & 0x7f) | 0x80); - num >>= 7; -b4: - byte_vector.push_back((num & 0x7f) | 0x80); - num >>= 7; -b3: - byte_vector.push_back((num & 0x7f) | 0x80); - num >>= 7; -b2: - byte_vector.push_back((num & 0x7f) | 0x80); - num >>= 7; -b1: - byte_vector.push_back(num); - - return byte_vector; -} - -std::vector vbyte_decode_line(std::vector line) -{ - std::vector huffman_line; - std::vector current_num; - - for (std::vector::iterator it = line.begin(); it != line.end(); it++) { - current_num.push_back(*it); - if ((*it >> 7) != 1) { - //We don't have continuation in the next bit - huffman_line.push_back(bytes_to_int(current_num)); - current_num.clear(); - } - } - return huffman_line; -} - -inline unsigned int bytes_to_int(std::vector number) -{ - unsigned int retvalue = 0; - std::vector::iterator it = number.begin(); - unsigned char shift = 0; //By how many bits to shift - - while (it != number.end()) { - retvalue |= (*it & 0x7f) << shift; - shift += 7; - it++; - } - - return retvalue; -} - -std::vector vbyte_encode_line(std::vector line) -{ - std::vector retvec; - - //For each unsigned int in the line, vbyte encode it and add it to a vector of unsigned chars. 
- for (std::vector::iterator it = line.begin(); it != line.end(); it++) { - std::vector vbyte_encoded = vbyte_encode(*it); - retvec.insert(retvec.end(), vbyte_encoded.begin(), vbyte_encoded.end()); - } - - return retvec; -} diff --git a/moses/TranslationModel/ProbingPT/huffmanish.hh b/moses/TranslationModel/ProbingPT/huffmanish.hh deleted file mode 100644 index 0970a9e68..000000000 --- a/moses/TranslationModel/ProbingPT/huffmanish.hh +++ /dev/null @@ -1,112 +0,0 @@ -#pragma once - -//Huffman encodes a line and also produces the vocabulary ids -#include "hash.hh" -#include "line_splitter.hh" -#include -#include -#include -#include -#include -#include -#include -#include -#include - -//Sorting for the second -struct sort_pair { - bool operator()(const std::pair &left, const std::pair &right) { - return left.second > right.second; //This puts biggest numbers first. - } -}; - -struct sort_pair_vec { - bool operator()(const std::pair, unsigned int> &left, const std::pair, unsigned int> &right) { - return left.second > right.second; //This puts biggest numbers first. - } -}; - -class Huffman -{ - unsigned long uniq_lines; //Unique lines in the file. 
- - //Containers used when counting the occurence of a given phrase - std::map target_phrase_words; - std::map, unsigned int> word_all1; - - //Same containers as vectors, for sorting - std::vector > target_phrase_words_counts; - std::vector, unsigned int> > word_all1_counts; - - //Huffman maps - std::map target_phrase_huffman; - std::map, unsigned int> word_all1_huffman; - - //inverted maps - std::map lookup_target_phrase; - std::map > lookup_word_all1; - -public: - Huffman (const char *); - void count_elements (line_text line); - void assign_values(); - void serialize_maps(const char * dirname); - void produce_lookups(); - - std::vector encode_line(line_text line); - - //encode line + variable byte ontop - std::vector full_encode_line(line_text line); - - //Getters - const std::map get_target_lookup_map() const { - return lookup_target_phrase; - } - const std::map > get_word_all1_lookup_map() const { - return lookup_word_all1; - } - - unsigned long getUniqLines() { - return uniq_lines; - } -}; - -class HuffmanDecoder -{ - std::map lookup_target_phrase; - std::map > lookup_word_all1; - -public: - HuffmanDecoder (const char *); - HuffmanDecoder (std::map *, std::map > *); - - //Getters - const std::map get_target_lookup_map() const { - return lookup_target_phrase; - } - const std::map > get_word_all1_lookup_map() const { - return lookup_word_all1; - } - - inline std::string getTargetWordFromID(unsigned int id); - - std::string getTargetWordsFromIDs(std::vector ids); - - target_text decode_line (std::vector input, int num_scores); - - //Variable byte decodes a all target phrases contained here and then passes them to decode_line - std::vector full_decode_line (std::vector lines, int num_scores); -}; - -std::string getTargetWordsFromIDs(std::vector ids, std::map * lookup_target_phrase); - -inline std::string getTargetWordFromID(unsigned int id, std::map * lookup_target_phrase); - -inline unsigned int reinterpret_float(float * num); - -inline float 
reinterpret_uint(unsigned int * num); - -std::vector vbyte_encode_line(std::vector line); -inline std::vector vbyte_encode(unsigned int num); -std::vector vbyte_decode_line(std::vector line); -inline unsigned int bytes_to_int(std::vector number); diff --git a/moses/TranslationModel/ProbingPT/line_splitter.cpp b/moses/TranslationModel/ProbingPT/line_splitter.cpp index 1eeeb1899..cb9e47fec 100644 --- a/moses/TranslationModel/ProbingPT/line_splitter.cpp +++ b/moses/TranslationModel/ProbingPT/line_splitter.cpp @@ -1,66 +1,92 @@ #include "line_splitter.hh" -line_text splitLine(StringPiece textin) +namespace Moses { - const char delim[] = " ||| "; + +line_text splitLine(const StringPiece &textin, bool scfg) +{ + const char delim[] = "|||"; line_text output; //Tokenize util::TokenIter it(textin, util::MultiCharacter(delim)); //Get source phrase - output.source_phrase = *it; + output.source_phrase = Trim(*it); + //std::cerr << "output.source_phrase=" << output.source_phrase << "AAAA" << std::endl; //Get target_phrase it++; - output.target_phrase = *it; + output.target_phrase = Trim(*it); + //std::cerr << "output.target_phrase=" << output.target_phrase << "AAAA" << std::endl; + + if (scfg) { + /* + std::cerr << "output.source_phrase=" << output.source_phrase << std::endl; + std::cerr << "output.target_phrase=" << output.target_phrase << std::endl; + reformatSCFG(output); + std::cerr << "output.source_phrase=" << output.source_phrase << std::endl; + std::cerr << "output.target_phrase=" << output.target_phrase << std::endl; + */ + } //Get probabilities it++; - output.prob = *it; + output.prob = Trim(*it); + //std::cerr << "output.prob=" << output.prob << "AAAA" << std::endl; //Get WordAllignment it++; if (it == util::TokenIter::end()) return output; - output.word_align = *it; + output.word_align = Trim(*it); + //std::cerr << "output.word_align=" << output.word_align << "AAAA" << std::endl; //Get count it++; if (it == util::TokenIter::end()) return output; - output.counts = 
*it; + output.counts = Trim(*it); + //std::cerr << "output.counts=" << output.counts << "AAAA" << std::endl; //Get sparse_score it++; if (it == util::TokenIter::end()) return output; - output.sparse_score = *it; + output.sparse_score = Trim(*it); + //std::cerr << "output.sparse_score=" << output.sparse_score << "AAAA" << std::endl; //Get property it++; if (it == util::TokenIter::end()) return output; - output.property = *it; + output.property = Trim(*it); + //std::cerr << "output.property=" << output.property << "AAAA" << std::endl; return output; } -std::vector splitWordAll1(StringPiece textin) +std::vector splitWordAll1(const StringPiece &textin) { const char delim[] = " "; const char delim2[] = "-"; std::vector output; + //Case with no word alignments. + if (textin.size() == 0) { + return output; + } + //Split on space util::TokenIter it(textin, util::MultiCharacter(delim)); //For each int while (it) { //Split on dash (-) - util::TokenIter itInner(*it, util::MultiCharacter(delim2)); + util::TokenIter itInner(*it, + util::MultiCharacter(delim2)); //Insert the two entries in the vector. User will read entry 0 and 1 to get the first, //2 and 3 for second etc. 
Use unsigned char instead of int to save space, as //word allignments are all very small numbers that fit in a single byte - output.push_back((unsigned char)(atoi(itInner->data()))); + output.push_back((unsigned char) (atoi(itInner->data()))); itInner++; - output.push_back((unsigned char)(atoi(itInner->data()))); + output.push_back((unsigned char) (atoi(itInner->data()))); it++; } @@ -68,3 +94,10 @@ std::vector splitWordAll1(StringPiece textin) } +void reformatSCFG(line_text &output) +{ + +} + +} + diff --git a/moses/TranslationModel/ProbingPT/line_splitter.hh b/moses/TranslationModel/ProbingPT/line_splitter.hh index 2cb9a3c8c..cec0a5f45 100644 --- a/moses/TranslationModel/ProbingPT/line_splitter.hh +++ b/moses/TranslationModel/ProbingPT/line_splitter.hh @@ -9,8 +9,12 @@ #include "util/tokenize_piece.hh" #include +namespace Moses +{ + //Struct for holding processed line -struct line_text { +struct line_text +{ StringPiece source_phrase; StringPiece target_phrase; StringPiece prob; @@ -18,16 +22,38 @@ struct line_text { StringPiece counts; StringPiece sparse_score; StringPiece property; + std::string property_to_be_binarized; }; //Struct for holding processed line -struct target_text { +struct target_text +{ std::vector target_phrase; std::vector prob; - std::vector word_all1; + std::vector word_align_term; + std::vector word_align_non_term; + std::vector counts; + std::vector sparse_score; + std::vector property; + + /* + void Reset() + { + target_phrase.clear(); + prob.clear(); + word_all1.clear(); + counts.clear(); + sparse_score.clear(); + property.clear(); + } + */ }; //Ask if it's better to have it receive a pointer to a line_text struct -line_text splitLine(StringPiece textin); +line_text splitLine(const StringPiece &textin, bool scfg); +void reformatSCFG(line_text &output); + +std::vector splitWordAll1(const StringPiece &textin); + +} -std::vector splitWordAll1(StringPiece textin); diff --git a/moses/TranslationModel/ProbingPT/probing_hash_utils.cpp 
b/moses/TranslationModel/ProbingPT/probing_hash_utils.cpp index ca3e8f69f..f23f57d66 100644 --- a/moses/TranslationModel/ProbingPT/probing_hash_utils.cpp +++ b/moses/TranslationModel/ProbingPT/probing_hash_utils.cpp @@ -1,5 +1,8 @@ #include "probing_hash_utils.hh" +namespace Moses +{ + //Read table from disk, return memory map location char * readTable(const char * filename, size_t size) { @@ -13,7 +16,7 @@ char * readTable(const char * filename, size_t size) exit(EXIT_FAILURE); } - map = (char *)mmap(0, size, PROT_READ, MAP_SHARED, fd, 0); + map = (char *) mmap(0, size, PROT_READ, MAP_SHARED, fd, 0); if (map == MAP_FAILED) { close(fd); @@ -24,11 +27,24 @@ char * readTable(const char * filename, size_t size) return map; } - -void serialize_table(char *mem, size_t size, const char * filename) +void serialize_table(char *mem, size_t size, const std::string &filename) { - std::ofstream os (filename, std::ios::binary); - os.write((const char*)&mem[0], size); + std::ofstream os(filename.c_str(), std::ios::binary); + os.write((const char*) &mem[0], size); os.close(); -} \ No newline at end of file +} + +uint64_t getKey(const uint64_t source_phrase[], size_t size) +{ + //TOO SLOW + //uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size()); + uint64_t key = 0; + for (size_t i = 0; i < size; i++) { + key += (source_phrase[i] << i); + } + return key; +} + +} + diff --git a/moses/TranslationModel/ProbingPT/probing_hash_utils.hh b/moses/TranslationModel/ProbingPT/probing_hash_utils.hh index de96e87a0..dcf0dbe25 100644 --- a/moses/TranslationModel/ProbingPT/probing_hash_utils.hh +++ b/moses/TranslationModel/ProbingPT/probing_hash_utils.hh @@ -7,31 +7,49 @@ #include #include +namespace Moses +{ + +#define API_VERSION 15 //Hash table entry -struct Entry { - uint64_t key; +struct Entry +{ typedef uint64_t Key; - unsigned int bytes_toread; + Key key; - uint64_t GetKey() const { + Key GetKey() const + { return key; } - void SetKey(uint64_t to) { + void 
SetKey(Key to) + { key = to; } - uint64_t GetValue() const { - return value; - } - uint64_t value; }; +#define NONE std::numeric_limits::max() + //Define table typedef util::ProbingHashTable > Table; -void serialize_table(char *mem, size_t size, const char * filename); +void serialize_table(char *mem, size_t size, const std::string &filename); char * readTable(const char * filename, size_t size); + +uint64_t getKey(const uint64_t source_phrase[], size_t size); + +struct TargetPhraseInfo +{ + uint32_t alignTerm; + uint32_t alignNonTerm; + uint16_t numWords; + uint16_t propLength; + uint16_t filler; +}; + +} + diff --git a/moses/TranslationModel/ProbingPT/quering.cpp b/moses/TranslationModel/ProbingPT/quering.cpp index bd1d61a1e..ef980ef06 100644 --- a/moses/TranslationModel/ProbingPT/quering.cpp +++ b/moses/TranslationModel/ProbingPT/quering.cpp @@ -1,73 +1,80 @@ #include "quering.hh" +#include "util/exception.hh" -unsigned char * read_binary_file(const char * filename, size_t filesize) -{ - //Get filesize - int fd; - unsigned char * map; - - fd = open(filename, O_RDONLY); - - if (fd == -1) { - perror("Error opening file for reading"); - exit(EXIT_FAILURE); - } - - map = (unsigned char *)mmap(0, filesize, PROT_READ, MAP_SHARED, fd, 0); - if (map == MAP_FAILED) { - close(fd); - perror("Error mmapping the file"); - exit(EXIT_FAILURE); - } +using namespace std; - return map; -} +namespace Moses +{ -QueryEngine::QueryEngine(const char * filepath) : decoder(filepath) +QueryEngine::QueryEngine(const char * filepath) { //Create filepaths std::string basepath(filepath); std::string path_to_hashtable = basepath + "/probing_hash.dat"; - std::string path_to_data_bin = basepath + "/binfile.dat"; std::string path_to_source_vocabid = basepath + "/source_vocabids"; + std::string alignPath = basepath + "/Alignments.dat"; ///Source phrase vocabids - read_map(&source_vocabids, path_to_source_vocabid.c_str()); + read_map(source_vocabids, path_to_source_vocabid.c_str()); - //Target 
phrase vocabIDs - vocabids = decoder.get_target_lookup_map(); + // alignments + read_alignments(alignPath); //Read config file + boost::unordered_map keyValue; + + std::ifstream config((basepath + "/config").c_str()); std::string line; - std::ifstream config ((basepath + "/config").c_str()); + while (getline(config, line)) { + std::vector toks = Tokenize(line, "\t"); + UTIL_THROW_IF2(toks.size() != 2, "Wrong config format:" << line); + keyValue[ toks[0] ] = toks[1]; + } + + bool found; //Check API version: - getline(config, line); - if (atoi(line.c_str()) != API_VERSION) { - std::cerr << "The ProbingPT API has changed, please rebinarize your phrase tables." << std::endl; + int version; + found = Get(keyValue, "API_VERSION", version); + if (!found) { + std::cerr << "Old or corrupted version of ProbingPT. Please rebinarize your phrase tables." << std::endl; + } + else if (version != API_VERSION) { + std::cerr << "The ProbingPT API has changed. " << version << "!=" + << API_VERSION << " Please rebinarize your phrase tables." << std::endl; exit(EXIT_FAILURE); } + //Get tablesize. - getline(config, line); - int tablesize = atoi(line.c_str()); + int tablesize; + found = Get(keyValue, "uniq_entries", tablesize); + if (!found) { + std::cerr << "uniq_entries not found" << std::endl; + exit(EXIT_FAILURE); + } + //Number of scores - getline(config, line); - num_scores = atoi(line.c_str()); - //do we have a reordering table - getline(config, line); - std::transform(line.begin(), line.end(), line.begin(), ::tolower); //Get the boolean in lowercase - is_reordering = false; - if (line == "true") { - is_reordering = true; - std::cerr << "WARNING. REORDERING TABLES NOT SUPPORTED YET." 
<< std::endl; + found = Get(keyValue, "num_scores", num_scores); + if (!found) { + std::cerr << "num_scores not found" << std::endl; + exit(EXIT_FAILURE); } - config.close(); - //Mmap binary table - struct stat filestatus; - stat(path_to_data_bin.c_str(), &filestatus); - binary_filesize = filestatus.st_size; - binary_mmaped = read_binary_file(path_to_data_bin.c_str(), binary_filesize); + //How may scores from lex reordering models + found = Get(keyValue, "num_lex_scores", num_lex_scores); + if (!found) { + std::cerr << "num_lex_scores not found" << std::endl; + exit(EXIT_FAILURE); + } + + // have the scores been log() and FloorScore()? + found = Get(keyValue, "log_prob", logProb); + if (!found) { + std::cerr << "logProb not found" << std::endl; + exit(EXIT_FAILURE); + } + + config.close(); //Read hashtable table_filesize = Table::Size(tablesize, 1.2); @@ -81,118 +88,50 @@ QueryEngine::QueryEngine(const char * filepath) : decoder(filepath) QueryEngine::~QueryEngine() { //Clear mmap content from memory. - munmap(binary_mmaped, binary_filesize); munmap(mem, table_filesize); } -std::pair > QueryEngine::query(std::vector source_phrase) +uint64_t QueryEngine::getKey(uint64_t source_phrase[], size_t size) const { - bool found; - std::vector translation_entries; - const Entry * entry; //TOO SLOW //uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size()); - uint64_t key = 0; - for (int i = 0; i < source_phrase.size(); i++) { - key += (source_phrase[i] << i); - } - - - found = table.Find(key, entry); - - if (found) { - //The phrase that was searched for was found! We need to get the translation entries. - //We will read the largest entry in bytes and then filter the unnecesarry with functions - //from line_splitter - uint64_t initial_index = entry -> GetValue(); - unsigned int bytes_toread = entry -> bytes_toread; - - //ASK HIEU FOR MORE EFFICIENT WAY TO DO THIS! - std::vector encoded_text; //Assign to the vector the relevant portion of the array. 
- encoded_text.reserve(bytes_toread); - for (int i = 0; i < bytes_toread; i++) { - encoded_text.push_back(binary_mmaped[i+initial_index]); - } - - //Get only the translation entries necessary - translation_entries = decoder.full_decode_line(encoded_text, num_scores); - - } - - std::pair > output (found, translation_entries); - - return output; - + return getKey(source_phrase, size); } -std::pair > QueryEngine::query(StringPiece source_phrase) +std::pair QueryEngine::query(uint64_t key) { - bool found; - std::vector translation_entries; - const Entry * entry; - //Convert source frase to VID - std::vector source_phrase_vid = getVocabIDs(source_phrase); - //TOO SLOW - //uint64_t key = util::MurmurHashNative(&source_phrase_vid[0], source_phrase_vid.size()); - uint64_t key = 0; - for (int i = 0; i < source_phrase_vid.size(); i++) { - key += (source_phrase_vid[i] << i); - } - - found = table.Find(key, entry); - - - if (found) { - //The phrase that was searched for was found! We need to get the translation entries. - //We will read the largest entry in bytes and then filter the unnecesarry with functions - //from line_splitter - uint64_t initial_index = entry -> GetValue(); - unsigned int bytes_toread = entry -> bytes_toread; - //At the end of the file we can't readd + largest_entry cause we get a segfault. - std::cerr << "Entry size is bytes is: " << bytes_toread << std::endl; - - //ASK HIEU FOR MORE EFFICIENT WAY TO DO THIS! - std::vector encoded_text; //Assign to the vector the relevant portion of the array. 
- encoded_text.reserve(bytes_toread); - for (int i = 0; i < bytes_toread; i++) { - encoded_text.push_back(binary_mmaped[i+initial_index]); - } - - //Get only the translation entries necessary - translation_entries = decoder.full_decode_line(encoded_text, num_scores); + std::pair ret; + const Entry * entry; + ret.first = table.Find(key, entry); + if (ret.first) { + ret.second = entry->value; } - - std::pair > output (found, translation_entries); - - return output; - + return ret; } -void QueryEngine::printTargetInfo(std::vector target_phrases) +void QueryEngine::read_alignments(const std::string &alignPath) { - int entries = target_phrases.size(); + std::ifstream strm(alignPath.c_str()); - for (int i = 0; i toks = Tokenize(line, "\t "); + UTIL_THROW_IF2(toks.size() == 0, "Corrupt alignment file"); - //Print probabilities: - for (int j = 0; j(toks[0]); + if (alignInd >= alignColl.size()) { + alignColl.resize(alignInd + 1); } - std::cout << "\t"; - - //Print word_all1 - for (int j = 0; j(toks[i]); + aligns.push_back(pos); } - std::cout << std::endl; } } + +} + diff --git a/moses/TranslationModel/ProbingPT/quering.hh b/moses/TranslationModel/ProbingPT/quering.hh index e574d1f8f..c43c7f3b9 100644 --- a/moses/TranslationModel/ProbingPT/quering.hh +++ b/moses/TranslationModel/ProbingPT/quering.hh @@ -1,45 +1,65 @@ #pragma once -#include "probing_hash_utils.hh" -#include "huffmanish.hh" -#include "hash.hh" //Includes line splitter +#include #include //For finding size of file #include "vocabid.hh" #include //toLower -#define API_VERSION 3 - +#include +#include "probing_hash_utils.hh" +#include "hash.hh" //Includes line splitter +#include "line_splitter.hh" +#include "moses//Util.h" -char * read_binary_file(char * filename); +namespace Moses +{ class QueryEngine { - unsigned char * binary_mmaped; //The binari phrase table file - std::map vocabids; std::map source_vocabids; + typedef std::vector Alignments; + std::vector alignColl; + Table table; char *mem; //Memory for the 
table, necessary so that we can correctly destroy the object - HuffmanDecoder decoder; - - size_t binary_filesize; size_t table_filesize; - int num_scores; bool is_reordering; + + void read_alignments(const std::string &alignPath); + public: - QueryEngine (const char *); + int num_scores; + int num_lex_scores; + bool logProb; + + QueryEngine(const char *); ~QueryEngine(); - std::pair > query(StringPiece source_phrase); - std::pair > query(std::vector source_phrase); - void printTargetInfo(std::vector target_phrases); - const std::map getVocab() const { - return decoder.get_target_lookup_map(); - } - const std::map getSourceVocab() const { - return source_vocabids; + std::pair query(uint64_t key); + + const std::map &getSourceVocab() const + { return source_vocabids; } + + const std::vector &getAlignments() const + { return alignColl; } + + uint64_t getKey(uint64_t source_phrase[], size_t size) const; + + template + inline bool Get(const boost::unordered_map &keyValue, const std::string &sought, T &found) const + { + boost::unordered_map::const_iterator iter = keyValue.find(sought); + if (iter == keyValue.end()) { + return false; + } + + const std::string &foundStr = iter->second; + found = Scan(foundStr); + return true; } }; +} diff --git a/moses/TranslationModel/ProbingPT/storing.cpp b/moses/TranslationModel/ProbingPT/storing.cpp index 01128c1e4..98dcfd5dc 100644 --- a/moses/TranslationModel/ProbingPT/storing.cpp +++ b/moses/TranslationModel/ProbingPT/storing.cpp @@ -1,161 +1,303 @@ +#include +#include +#include "line_splitter.hh" #include "storing.hh" +#include "StoreTarget.h" +#include "StoreVocab.h" +#include "moses/Util.h" +#include "moses/InputFileStream.h" -BinaryFileWriter::BinaryFileWriter (std::string basepath) : os ((basepath + "/binfile.dat").c_str(), std::ios::binary) +using namespace std; + +namespace Moses { - binfile.reserve(10000); //Reserve part of the vector to avoid realocation - it = binfile.begin(); - dist_from_start = 0; //Initialize 
variables - extra_counter = 0; -} -void BinaryFileWriter::write (std::vector * bytes) +/////////////////////////////////////////////////////////////////////// +void Node::Add(Table &table, const SourcePhrase &sourcePhrase, size_t pos) { - binfile.insert(it, bytes->begin(), bytes->end()); //Insert the bytes - //Keep track of the offsets - it += bytes->size(); - dist_from_start = distance(binfile.begin(),it); - //Flush the vector to disk every once in a while so that we don't consume too much ram - if (dist_from_start > 9000) { - flush(); + if (pos < sourcePhrase.size()) { + uint64_t vocabId = sourcePhrase[pos]; + + Node *child; + Children::iterator iter = m_children.find(vocabId); + if (iter == m_children.end()) { + // New node. Write other children then discard them + BOOST_FOREACH(Children::value_type &valPair, m_children) { + Node &otherChild = valPair.second; + otherChild.Write(table); + } + m_children.clear(); + + // create new node + child = &m_children[vocabId]; + assert(!child->done); + child->key = key + (vocabId << pos); + } + else { + child = &iter->second; + } + + child->Add(table, sourcePhrase, pos + 1); + } + else { + // this node was written previously 'cos it has rules + done = true; } } -void BinaryFileWriter::flush () +void Node::Write(Table &table) { - //Cast unsigned char to char before writing... - os.write((char *)&binfile[0], dist_from_start); - //Clear the vector: - binfile.clear(); - binfile.reserve(10000); - extra_counter += dist_from_start; //Keep track of the total number of bytes. 
- it = binfile.begin(); //Reset iterator - dist_from_start = distance(binfile.begin(),it); //Reset dist from start -} + //cerr << "START write " << done << " " << key << endl; + BOOST_FOREACH(Children::value_type &valPair, m_children) { + Node &child = valPair.second; + child.Write(table); + } -BinaryFileWriter::~BinaryFileWriter () -{ - os.close(); - binfile.clear(); + if (!done) { + // save + Entry sourceEntry; + sourceEntry.value = NONE; + sourceEntry.key = key; + + //Put into table + table.Insert(sourceEntry); + } } -void createProbingPT(const char * phrasetable_path, const char * target_path, - const char * num_scores, const char * is_reordering) +/////////////////////////////////////////////////////////////////////// +void createProbingPT(const std::string &phrasetable_path, + const std::string &basepath, int num_scores, int num_lex_scores, + bool log_prob, int max_cache_size, bool scfg) { + std::cerr << "Starting..." << std::endl; + //Get basepath and create directory if missing - std::string basepath(target_path); mkdir(basepath.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); - //Set up huffman and serialize decoder maps. - Huffman huffmanEncoder(phrasetable_path); //initialize - huffmanEncoder.assign_values(); - huffmanEncoder.produce_lookups(); - huffmanEncoder.serialize_maps(target_path); + StoreTarget storeTarget(basepath); //Get uniq lines: - unsigned long uniq_entries = huffmanEncoder.getUniqLines(); + unsigned long uniq_entries = countUniqueSource(phrasetable_path); //Source phrase vocabids - std::map source_vocabids; + StoreVocab sourceVocab(basepath + "/source_vocabids"); //Read the file - util::FilePiece filein(phrasetable_path); + util::FilePiece filein(phrasetable_path.c_str()); //Init the probing hash table size_t size = Table::Size(uniq_entries, 1.2); char * mem = new char[size]; memset(mem, 0, size); - Table table(mem, size); + Table sourceEntries(mem, size); - BinaryFileWriter binfile(basepath); //Init the binary file writer. 
- - line_text prev_line; //Check if the source phrase of the previous line is the same + std::priority_queue, CacheItemOrderer> cache; + float totalSourceCount = 0; //Keep track of the size of each group of target phrases - uint64_t entrystartidx = 0; - //uint64_t line_num = 0; - + size_t line_num = 0; //Read everything and processs - while(true) { + std::string prevSource; + + Node sourcePhrases; + sourcePhrases.done = true; + sourcePhrases.key = 0; + + while (true) { try { //Process line read line_text line; - line = splitLine(filein.ReadLine()); - //Add source phrases to vocabularyIDs - add_to_map(&source_vocabids, line.source_phrase); + line = splitLine(filein.ReadLine(), scfg); + //cerr << "line=" << line.source_phrase << endl; - if ((binfile.dist_from_start + binfile.extra_counter) == 0) { - prev_line = line; //For the first iteration assume the previous line is - } //The same as this one. + ++line_num; + if (line_num % 1000000 == 0) { + std::cerr << line_num << " " << std::flush; + } - if (line.source_phrase != prev_line.source_phrase) { + //Add source phrases to vocabularyIDs + add_to_map(sourceVocab, line.source_phrase); + + if (prevSource.empty()) { + // 1st line + prevSource = line.source_phrase.as_string(); + storeTarget.Append(line, log_prob, scfg); + } + else if (prevSource == line.source_phrase) { + //If we still have the same line, just append to it: + storeTarget.Append(line, log_prob, scfg); + } + else { + assert(prevSource != line.source_phrase); //Create a new entry even + // save + uint64_t targetInd = storeTarget.Save(); + + // next line + storeTarget.Append(line, log_prob, scfg); + //Create an entry for the previous source phrase: - Entry pesho; - pesho.value = entrystartidx; + Entry sourceEntry; + sourceEntry.value = targetInd; //The key is the sum of hashes of individual words bitshifted by their position in the phrase. //Probably not entirerly correct, but fast and seems to work fine in practise. 
- pesho.key = 0; - std::vector vocabid_source = getVocabIDs(prev_line.source_phrase); - for (int i = 0; i < vocabid_source.size(); i++) { - pesho.key += (vocabid_source[i] << i); + std::vector vocabid_source = getVocabIDs(prevSource); + if (scfg) { + // storing prefixes? + sourcePhrases.Add(sourceEntries, vocabid_source); } - pesho.bytes_toread = binfile.dist_from_start + binfile.extra_counter - entrystartidx; + sourceEntry.key = getKey(vocabid_source); + /* + cerr << "prevSource=" << prevSource << flush + << " vocabids=" << Debug(vocabid_source) << flush + << " key=" << sourceEntry.key << endl; + */ //Put into table - table.Insert(pesho); + sourceEntries.Insert(sourceEntry); - entrystartidx = binfile.dist_from_start + binfile.extra_counter; //Designate start idx for new entry + // update cache - CURRENT source phrase, not prev + if (max_cache_size) { + std::string countStr = line.counts.as_string(); + countStr = Trim(countStr); + if (!countStr.empty()) { + std::vector toks = Tokenize(countStr); + //cerr << "CACHE:" << line.source_phrase << " " << countStr << " " << toks[1] << endl; - //Encode a line and write it to disk. 
- std::vector encoded_line = huffmanEncoder.full_encode_line(line); - binfile.write(&encoded_line); + if (toks.size() >= 2) { + totalSourceCount += toks[1]; - //Set prevLine - prev_line = line; + // compute key for CURRENT source + std::vector currVocabidSource = getVocabIDs(line.source_phrase.as_string()); + uint64_t currKey = getKey(currVocabidSource); - } else { - //If we still have the same line, just append to it: - std::vector encoded_line = huffmanEncoder.full_encode_line(line); - binfile.write(&encoded_line); + CacheItem *item = new CacheItem( + Trim(line.source_phrase.as_string()), + currKey, + toks[1]); + cache.push(item); + + if (max_cache_size > 0 && cache.size() > max_cache_size) { + cache.pop(); + } + } + } + } + + //Set prevLine + prevSource = line.source_phrase.as_string(); } - } catch (util::EndOfFileException e) { - std::cerr << "Reading phrase table finished, writing remaining files to disk." << std::endl; - binfile.flush(); + } + catch (util::EndOfFileException e) { + std::cerr + << "Reading phrase table finished, writing remaining files to disk." + << std::endl; //After the final entry is constructed we need to add it to the phrase_table //Create an entry for the previous source phrase: - Entry pesho; - pesho.value = entrystartidx; + uint64_t targetInd = storeTarget.Save(); + + Entry sourceEntry; + sourceEntry.value = targetInd; + //The key is the sum of hashes of individual words. 
Probably not entirerly correct, but fast - pesho.key = 0; - std::vector vocabid_source = getVocabIDs(prev_line.source_phrase); - for (int i = 0; i < vocabid_source.size(); i++) { - pesho.key += (vocabid_source[i] << i); - } - pesho.bytes_toread = binfile.dist_from_start + binfile.extra_counter - entrystartidx; + std::vector vocabid_source = getVocabIDs(prevSource); + sourceEntry.key = getKey(vocabid_source); + //Put into table - table.Insert(pesho); + sourceEntries.Insert(sourceEntry); break; } } - serialize_table(mem, size, (basepath + "/probing_hash.dat").c_str()); + sourcePhrases.Write(sourceEntries); + + storeTarget.SaveAlignment(); - serialize_map(&source_vocabids, (basepath + "/source_vocabids").c_str()); + serialize_table(mem, size, (basepath + "/probing_hash.dat")); + + sourceVocab.Save(); + + serialize_cache(cache, (basepath + "/cache"), totalSourceCount); delete[] mem; //Write configfile std::ofstream configfile; configfile.open((basepath + "/config").c_str()); - configfile << API_VERSION << '\n'; - configfile << uniq_entries << '\n'; - configfile << num_scores << '\n'; - configfile << is_reordering << '\n'; + configfile << "API_VERSION\t" << API_VERSION << '\n'; + configfile << "uniq_entries\t" << uniq_entries << '\n'; + configfile << "num_scores\t" << num_scores << '\n'; + configfile << "num_lex_scores\t" << num_lex_scores << '\n'; + configfile << "log_prob\t" << log_prob << '\n'; configfile.close(); } + +size_t countUniqueSource(const std::string &path) +{ + size_t ret = 0; + InputFileStream strme(path); + + std::string line, prevSource; + while (std::getline(strme, line)) { + std::vector toks = TokenizeMultiCharSeparator(line, "|||"); + assert(toks.size() != 0); + + if (prevSource != toks[0]) { + prevSource = toks[0]; + ++ret; + } + } + + return ret; +} + +void serialize_cache( + std::priority_queue, CacheItemOrderer> &cache, + const std::string &path, float totalSourceCount) +{ + std::vector vec(cache.size()); + + size_t ind = cache.size() - 1; + 
while (!cache.empty()) { + const CacheItem *item = cache.top(); + vec[ind] = item; + cache.pop(); + --ind; + } + + std::ofstream os(path.c_str()); + + os << totalSourceCount << std::endl; + for (size_t i = 0; i < vec.size(); ++i) { + const CacheItem *item = vec[i]; + os << item->count << "\t" << item->sourceKey << "\t" << item->source << std::endl; + delete item; + } + + os.close(); +} + +uint64_t getKey(const std::vector &vocabid_source) +{ + return getKey(vocabid_source.data(), vocabid_source.size()); +} + +std::vector CreatePrefix(const std::vector &vocabid_source, size_t endPos) +{ + assert(endPos < vocabid_source.size()); + + std::vector ret(endPos + 1); + for (size_t i = 0; i <= endPos; ++i) { + ret[i] = vocabid_source[i]; + } + return ret; +} + +} + diff --git a/moses/TranslationModel/ProbingPT/storing.hh b/moses/TranslationModel/ProbingPT/storing.hh index 8554d614f..957c73491 100644 --- a/moses/TranslationModel/ProbingPT/storing.hh +++ b/moses/TranslationModel/ProbingPT/storing.hh @@ -1,36 +1,95 @@ #pragma once +#include +#include #include +#include #include #include +#include +#include +#include //mkdir #include "hash.hh" //Includes line_splitter #include "probing_hash_utils.hh" -#include "huffmanish.hh" -#include //mkdir #include "util/file_piece.hh" #include "util/file.hh" #include "vocabid.hh" -#define API_VERSION 3 -void createProbingPT(const char * phrasetable_path, const char * target_path, - const char * num_scores, const char * is_reordering); +namespace Moses +{ +typedef std::vector SourcePhrase; + + +class Node +{ + typedef boost::unordered_map Children; + Children m_children; + +public: + uint64_t key; + bool done; + + Node() + :done(false) + {} + + void Add(Table &table, const SourcePhrase &sourcePhrase, size_t pos = 0); + void Write(Table &table); +}; + + +void createProbingPT(const std::string &phrasetable_path, + const std::string &basepath, int num_scores, int num_lex_scores, + bool log_prob, int max_cache_size, bool scfg); +uint64_t 
getKey(const std::vector &source_phrase); + +std::vector CreatePrefix(const std::vector &vocabid_source, size_t endPos); -class BinaryFileWriter +template +std::string Debug(const std::vector &vec) { - std::vector binfile; - std::vector::iterator it; - //Output binary - std::ofstream os; + std::stringstream strm; + for (size_t i = 0; i < vec.size(); ++i) { + strm << vec[i] << " "; + } + return strm.str(); +} +size_t countUniqueSource(const std::string &path); + +class CacheItem +{ public: - unsigned int dist_from_start; //Distance from the start of the vector. - uint64_t extra_counter; //After we reset the counter, we still want to keep track of the correct offset, so + std::string source; + uint64_t sourceKey; + float count; + CacheItem(const std::string &vSource, uint64_t vSourceKey, float vCount) + :source(vSource) + ,sourceKey(vSourceKey) + ,count(vCount) + { + } - BinaryFileWriter (std::string); - ~BinaryFileWriter (); - void write (std::vector * bytes); - void flush (); //Flush to disk + bool operator<(const CacheItem &other) const + { + return count > other.count; + } +}; +class CacheItemOrderer +{ +public: + bool operator()(const CacheItem* a, const CacheItem* b) const + { + return (*a) < (*b); + } }; + +void serialize_cache( + std::priority_queue, CacheItemOrderer> &cache, + const std::string &path, float totalSourceCount); + +} + diff --git a/moses/TranslationModel/ProbingPT/vocabid.cpp b/moses/TranslationModel/ProbingPT/vocabid.cpp index 1452f299d..3d6efe841 100644 --- a/moses/TranslationModel/ProbingPT/vocabid.cpp +++ b/moses/TranslationModel/ProbingPT/vocabid.cpp @@ -1,32 +1,59 @@ +#include #include "vocabid.hh" +#include "StoreVocab.h" +#include "moses/Util.h" -void add_to_map(std::map *karta, StringPiece textin) +namespace Moses +{ + +void add_to_map(StoreVocab &sourceVocab, + const StringPiece &textin) { //Tokenize - util::TokenIter it(textin, util::SingleCharacter(' ')); + util::TokenIter itWord(textin, util::SingleCharacter(' ')); + + while 
(itWord) { + StringPiece word = *itWord; - while(it) { - karta->insert(std::pair(getHash(*it), it->as_string())); - it++; + util::TokenIter itFactor(word, util::SingleCharacter('|')); + while (itFactor) { + StringPiece factor = *itFactor; + + sourceVocab.Insert(getHash(factor), factor.as_string()); + itFactor++; + } + itWord++; } } -void serialize_map(std::map *karta, const char* filename) +void serialize_map(const std::map &karta, + const std::string &filename) { - std::ofstream os (filename, std::ios::binary); - boost::archive::text_oarchive oarch(os); + std::ofstream os(filename.c_str()); + + std::map::const_iterator iter; + for (iter = karta.begin(); iter != karta.end(); ++iter) { + os << iter->first << '\t' << iter->second << std::endl; + } - oarch << *karta; //Serialise map os.close(); } -void read_map(std::map *karta, const char* filename) +void read_map(std::map &karta, const char* filename) { - std::ifstream is (filename, std::ios::binary); - boost::archive::text_iarchive iarch(is); + std::ifstream is(filename); - iarch >> *karta; + std::string line; + while (getline(is, line)) { + std::vector toks = Tokenize(line, "\t"); + assert(toks.size() == 2); + uint64_t ind = Scan(toks[1]); + karta[ind] = toks[0]; + } //Close the stream after we are done. 
is.close(); } + +} + diff --git a/moses/TranslationModel/ProbingPT/vocabid.hh b/moses/TranslationModel/ProbingPT/vocabid.hh index 491c53439..f9c9b2dff 100644 --- a/moses/TranslationModel/ProbingPT/vocabid.hh +++ b/moses/TranslationModel/ProbingPT/vocabid.hh @@ -13,8 +13,17 @@ #include "util/string_piece.hh" //Tokenization and work with StringPiece #include "util/tokenize_piece.hh" -void add_to_map(std::map *karta, StringPiece textin); +namespace Moses +{ +template +class StoreVocab; -void serialize_map(std::map *karta, const char* filename); +void add_to_map(StoreVocab &sourceVocab, + const StringPiece &textin); -void read_map(std::map *karta, const char* filename); +void serialize_map(const std::map &karta, + const std::string &filename); + +void read_map(std::map &karta, const char* filename); + +} -- cgit v1.2.3 From 2eea4dd5e0e369a43300298190c4b860c17d19ad Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Tue, 4 Oct 2016 16:48:52 +0100 Subject: compiles --- moses/TranslationModel/ProbingPT/ProbingPT.cpp | 131 ++++++++++++------------- moses/TranslationModel/ProbingPT/ProbingPT.h | 15 ++- 2 files changed, 76 insertions(+), 70 deletions(-) diff --git a/moses/TranslationModel/ProbingPT/ProbingPT.cpp b/moses/TranslationModel/ProbingPT/ProbingPT.cpp index bb3f26e22..1298f8149 100644 --- a/moses/TranslationModel/ProbingPT/ProbingPT.cpp +++ b/moses/TranslationModel/ProbingPT/ProbingPT.cpp @@ -161,99 +161,94 @@ TargetPhraseCollection::shared_ptr ProbingPT::CreateTargetPhrase(const Phrase &s // create a target phrase from the 1st word of the source, prefix with 'ProbingPT:' assert(sourcePhrase.GetSize()); - TargetPhraseCollection::shared_ptr tpColl; - bool ok; - vector probingSource = ConvertToProbingSourcePhrase(sourcePhrase, ok); - if (!ok) { - // source phrase contains a word unknown in the pt. 
- // We know immediately there's no translation for it - return tpColl; + std::pair keyStruct = GetKey(sourcePhrase); + if (!keyStruct.first) { + return TargetPhraseCollection::shared_ptr(); } - std::pair > query_result; - - //Actual lookup - query_result = m_engine->query(probingSource); + // check in cache + CachePb::const_iterator iter = m_cachePb.find(keyStruct.second); + if (iter != m_cachePb.end()) { + //cerr << "FOUND IN CACHE " << keyStruct.second << " " << sourcePhrase.Debug(mgr.system) << endl; + TargetPhraseCollection *tps = iter->second; + return TargetPhraseCollection::shared_ptr(tps); + } - if (query_result.first) { - //m_engine->printTargetInfo(query_result.second); - tpColl.reset(new TargetPhraseCollection()); + // query pt + TargetPhraseCollection *tps = CreateTargetPhrases(sourcePhrase, + keyStruct.second); + return TargetPhraseCollection::shared_ptr(tps); +} - const std::vector &probingTargetPhrases = query_result.second; - for (size_t i = 0; i < probingTargetPhrases.size(); ++i) { - const target_text &probingTargetPhrase = probingTargetPhrases[i]; - TargetPhrase *tp = CreateTargetPhrase(sourcePhrase, probingTargetPhrase); +std::pair ProbingPT::GetKey(const Phrase &sourcePhrase) const +{ + std::pair ret; - tpColl->Add(tp); - } + // create a target phrase from the 1st word of the source, prefix with 'ProbingPT:' + size_t sourceSize = sourcePhrase.GetSize(); + assert(sourceSize); - tpColl->Prune(true, m_tableLimit); + uint64_t probingSource[sourceSize]; + GetSourceProbingIds(sourcePhrase, ret.first, probingSource); + if (!ret.first) { + // source phrase contains a word unknown in the pt. 
+ // We know immediately there's no translation for it + } + else { + ret.second = m_engine->getKey(probingSource, sourceSize); } - return tpColl; + return ret; + } -TargetPhrase *ProbingPT::CreateTargetPhrase(const Phrase &sourcePhrase, const target_text &probingTargetPhrase) const +void ProbingPT::GetSourceProbingIds(const Phrase &sourcePhrase, + bool &ok, uint64_t probingSource[]) const { - const std::vector &probingPhrase = probingTargetPhrase.target_phrase; - size_t size = probingPhrase.size(); - - TargetPhrase *tp = new TargetPhrase(this); - // words + size_t size = sourcePhrase.GetSize(); for (size_t i = 0; i < size; ++i) { - uint64_t probingId = probingPhrase[i]; - const Factor *factor = GetTargetFactor(probingId); - assert(factor); - - Word &word = tp->AddWord(); - word.SetFactor(m_output[0], factor); + const Word &word = sourcePhrase.GetWord(i); + uint64_t probingId = GetSourceProbingId(word); + if (probingId == m_unkId) { + ok = false; + return; + } + else { + probingSource[i] = probingId; + } } - // score for this phrase table - vector scores = probingTargetPhrase.prob; - std::transform(scores.begin(), scores.end(), scores.begin(),TransformScore); - tp->GetScoreBreakdown().PlusEquals(this, scores); + ok = true; +} - // alignment - /* - const std::vector &alignments = probingTargetPhrase.word_all1; +uint64_t ProbingPT::GetSourceProbingId(const Word &word) const +{ + uint64_t ret = 0; - AlignmentInfo &aligns = tp->GetAlignTerm(); - for (size_t i = 0; i < alignS.size(); i += 2 ) { - aligns.Add((size_t) alignments[i], (size_t) alignments[i+1]); + for (size_t i = 0; i < m_input.size(); ++i) { + FactorType factorType = m_input[i]; + const Factor *factor = word[factorType]; + + size_t factorId = factor->GetId(); + if (factorId >= m_sourceVocab.size()) { + return m_unkId; + } + ret += m_sourceVocab[factorId]; } - */ - // score of all other ff when this rule is being loaded - tp->EvaluateInIsolation(sourcePhrase, GetFeaturesToApply()); - return tp; + return ret; 
} -const Factor *ProbingPT::GetTargetFactor(uint64_t probingId) const +TargetPhraseCollection *ProbingPT::CreateTargetPhrases( + const Phrase &sourcePhrase, uint64_t key) const { - TargetVocabMap::right_map::const_iterator iter; - iter = m_vocabMap.right.find(probingId); - if (iter != m_vocabMap.right.end()) { - return iter->second; - } else { - // not in mapping. Must be UNK - return NULL; - } -} -uint64_t ProbingPT::GetSourceProbingId(const Factor *factor) const -{ - SourceVocabMap::left_map::const_iterator iter; - iter = m_sourceVocabMap.left.find(factor); - if (iter != m_sourceVocabMap.left.end()) { - return iter->second; - } else { - // not in mapping. Must be UNK - return m_unkId; - } } +////////////////////////////////////////////////////////////////// + + ChartRuleLookupManager *ProbingPT::CreateRuleLookupManager( const ChartParser &, const ChartCellCollectionBase &, diff --git a/moses/TranslationModel/ProbingPT/ProbingPT.h b/moses/TranslationModel/ProbingPT/ProbingPT.h index 3b5dfc895..98d052e07 100644 --- a/moses/TranslationModel/ProbingPT/ProbingPT.h +++ b/moses/TranslationModel/ProbingPT/ProbingPT.h @@ -2,6 +2,7 @@ #pragma once #include #include +#include #include "../PhraseDictionary.h" @@ -48,15 +49,25 @@ protected: boost::iostreams::mapped_file_source file; const char *data; + // caching + typedef boost::unordered_map CachePb; + CachePb m_cachePb; + void CreateAlignmentMap(const std::string path); TargetPhraseCollection::shared_ptr CreateTargetPhrase(const Phrase &sourcePhrase) const; - TargetPhrase *CreateTargetPhrase(const Phrase &sourcePhrase, const target_text &probingTargetPhrase) const; - const Factor *GetTargetFactor(uint64_t probingId) const; uint64_t GetSourceProbingId(const Factor *factor) const; std::vector ConvertToProbingSourcePhrase(const Phrase &sourcePhrase, bool &ok) const; + std::pair GetKey(const Phrase &sourcePhrase) const; + void GetSourceProbingIds(const Phrase &sourcePhrase, bool &ok, + uint64_t probingSource[]) const; + 
uint64_t GetSourceProbingId(const Word &word) const; + + TargetPhraseCollection *CreateTargetPhrases( + const Phrase &sourcePhrase, uint64_t key) const; + }; } // namespace Moses -- cgit v1.2.3 From 041b13eb19f364b79809a7efa08c4552d41d4e75 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 5 Oct 2016 15:15:47 +0100 Subject: compiles but segfault --- moses/ScoreComponentCollection.h | 9 ++ moses/TranslationModel/ProbingPT/ProbingPT.cpp | 122 +++++++++++++++++++++---- moses/TranslationModel/ProbingPT/ProbingPT.h | 14 ++- 3 files changed, 123 insertions(+), 22 deletions(-) diff --git a/moses/ScoreComponentCollection.h b/moses/ScoreComponentCollection.h index 1305e9c16..0ab57a73a 100644 --- a/moses/ScoreComponentCollection.h +++ b/moses/ScoreComponentCollection.h @@ -247,6 +247,15 @@ public: } } + void PlusEquals(const FeatureFunction* sp, float scores[]) + { + size_t numScores = sp->GetNumScoreComponents(); + size_t offset = sp->GetIndex(); + for (size_t i = 0; i < numScores; ++i) { + m_scores[i + offset] += scores[i]; + } + } + //! Special version PlusEquals(ScoreProducer, vector) //! to add the score from a single ScoreProducer that produces //! 
a single value diff --git a/moses/TranslationModel/ProbingPT/ProbingPT.cpp b/moses/TranslationModel/ProbingPT/ProbingPT.cpp index 1298f8149..1fd982f0e 100644 --- a/moses/TranslationModel/ProbingPT/ProbingPT.cpp +++ b/moses/TranslationModel/ProbingPT/ProbingPT.cpp @@ -137,25 +137,6 @@ void ProbingPT::GetTargetPhraseCollectionBatch(const InputPathList &inputPathQue } } -std::vector ProbingPT::ConvertToProbingSourcePhrase(const Phrase &sourcePhrase, bool &ok) const -{ - size_t size = sourcePhrase.GetSize(); - std::vector ret(size); - for (size_t i = 0; i < size; ++i) { - const Factor *factor = sourcePhrase.GetFactor(i, m_input[0]); - uint64_t probingId = GetSourceProbingId(factor); - if (probingId == m_unkId) { - ok = false; - return ret; - } else { - ret[i] = probingId; - } - } - - ok = true; - return ret; -} - TargetPhraseCollection::shared_ptr ProbingPT::CreateTargetPhrase(const Phrase &sourcePhrase) const { // create a target phrase from the 1st word of the source, prefix with 'ProbingPT:' @@ -243,7 +224,110 @@ uint64_t ProbingPT::GetSourceProbingId(const Word &word) const TargetPhraseCollection *ProbingPT::CreateTargetPhrases( const Phrase &sourcePhrase, uint64_t key) const { + TargetPhraseCollection *tps = NULL; + + //Actual lookup + std::pair query_result; // 1st=found, 2nd=target file offset + query_result = m_engine->query(key); + //cerr << "key2=" << query_result.second << endl; + + if (query_result.first) { + const char *offset = data + query_result.second; + uint64_t *numTP = (uint64_t*) offset; + + tps = new TargetPhraseCollection(); + + offset += sizeof(uint64_t); + for (size_t i = 0; i < *numTP; ++i) { + TargetPhrase *tp = CreateTargetPhrase(offset); + assert(tp); + tp->EvaluateInIsolation(sourcePhrase, GetFeaturesToApply()); + + tps->Add(tp); + + } + + tps->Prune(true, m_tableLimit); + //cerr << *tps << endl; + } + + return tps; + +} + +TargetPhrase *ProbingPT::CreateTargetPhrase( + const char *&offset) const +{ + TargetPhraseInfo *tpInfo = 
(TargetPhraseInfo*) offset; + size_t numRealWords = tpInfo->numWords / m_output.size(); + + TargetPhrase *tp = new TargetPhrase(this); + + offset += sizeof(TargetPhraseInfo); + + // scores + float *scores = (float*) offset; + + size_t totalNumScores = m_engine->num_scores + m_engine->num_lex_scores; + + if (m_engine->logProb) { + // set pt score for rule + tp->GetScoreBreakdown().PlusEquals(this, scores); + + // save scores for other FF, eg. lex RO. Just give the offset + /* + if (m_engine->num_lex_scores) { + tp->scoreProperties = scores + m_engine->num_scores; + } + */ + } + else { + // log score 1st + float logScores[totalNumScores]; + for (size_t i = 0; i < totalNumScores; ++i) { + logScores[i] = FloorScore(TransformScore(scores[i])); + } + + // set pt score for rule + tp->GetScoreBreakdown().PlusEquals(this, logScores); + + // save scores for other FF, eg. lex RO. + /* + tp->scoreProperties = pool.Allocate(m_engine->num_lex_scores); + for (size_t i = 0; i < m_engine->num_lex_scores; ++i) { + tp->scoreProperties[i] = logScores[i + m_engine->num_scores]; + } + */ + } + + offset += sizeof(float) * totalNumScores; + + // words + for (size_t targetPos = 0; targetPos < numRealWords; ++targetPos) { + for (size_t i = 0; i < m_output.size(); ++i) { + FactorType factorType = m_output[i]; + + uint32_t *probingId = (uint32_t*) offset; + + const Factor *factor = GetTargetFactor(*probingId); + assert(factor); + + Word &word = tp->GetWord(targetPos); + word[factorType] = factor; + + offset += sizeof(uint32_t); + } + } + + // align + uint32_t alignTerm = tpInfo->alignTerm; + //cerr << "alignTerm=" << alignTerm << endl; + UTIL_THROW_IF2(alignTerm >= m_aligns.size(), "Unknown alignInd"); + tp->SetAlignTerm(m_aligns[alignTerm]); + + // properties TODO + return tp; } ////////////////////////////////////////////////////////////////// diff --git a/moses/TranslationModel/ProbingPT/ProbingPT.h b/moses/TranslationModel/ProbingPT/ProbingPT.h index 98d052e07..21c01df28 100644 --- 
a/moses/TranslationModel/ProbingPT/ProbingPT.h +++ b/moses/TranslationModel/ProbingPT/ProbingPT.h @@ -56,17 +56,25 @@ protected: void CreateAlignmentMap(const std::string path); TargetPhraseCollection::shared_ptr CreateTargetPhrase(const Phrase &sourcePhrase) const; - uint64_t GetSourceProbingId(const Factor *factor) const; - - std::vector ConvertToProbingSourcePhrase(const Phrase &sourcePhrase, bool &ok) const; std::pair GetKey(const Phrase &sourcePhrase) const; void GetSourceProbingIds(const Phrase &sourcePhrase, bool &ok, uint64_t probingSource[]) const; uint64_t GetSourceProbingId(const Word &word) const; + uint64_t GetSourceProbingId(const Factor *factor) const; TargetPhraseCollection *CreateTargetPhrases( const Phrase &sourcePhrase, uint64_t key) const; + TargetPhrase *CreateTargetPhrase( + const char *&offset) const; + + inline const Factor *GetTargetFactor(uint32_t probingId) const + { + if (probingId >= m_targetVocab.size()) { + return NULL; + } + return m_targetVocab[probingId]; + } }; -- cgit v1.2.3 From 0e4e64b26dd3b82a0dfbfe2445f89e1dcbbdf61a Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 5 Oct 2016 16:26:33 +0100 Subject: getKey() -> Moses::getKey() --- moses/TranslationModel/ProbingPT/quering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moses/TranslationModel/ProbingPT/quering.cpp b/moses/TranslationModel/ProbingPT/quering.cpp index ef980ef06..d616e1f25 100644 --- a/moses/TranslationModel/ProbingPT/quering.cpp +++ b/moses/TranslationModel/ProbingPT/quering.cpp @@ -96,7 +96,7 @@ uint64_t QueryEngine::getKey(uint64_t source_phrase[], size_t size) const { //TOO SLOW //uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size()); - return getKey(source_phrase, size); + return Moses::getKey(source_phrase, size); } std::pair QueryEngine::query(uint64_t key) -- cgit v1.2.3 From 7d7ae1b72ca6487cd50dba6d20d0ba4a4b08b782 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 5 Oct 2016 16:43:04 +0100 Subject: add 
StoreVocab --- moses/TranslationModel/ProbingPT/StoreTarget.cpp | 266 +++++++++++++++++++++++ moses/TranslationModel/ProbingPT/StoreTarget.h | 51 +++++ moses/TranslationModel/ProbingPT/StoreVocab.cpp | 13 ++ moses/TranslationModel/ProbingPT/StoreVocab.h | 64 ++++++ 4 files changed, 394 insertions(+) create mode 100644 moses/TranslationModel/ProbingPT/StoreTarget.cpp create mode 100644 moses/TranslationModel/ProbingPT/StoreTarget.h create mode 100644 moses/TranslationModel/ProbingPT/StoreVocab.cpp create mode 100644 moses/TranslationModel/ProbingPT/StoreVocab.h diff --git a/moses/TranslationModel/ProbingPT/StoreTarget.cpp b/moses/TranslationModel/ProbingPT/StoreTarget.cpp new file mode 100644 index 000000000..8072f408b --- /dev/null +++ b/moses/TranslationModel/ProbingPT/StoreTarget.cpp @@ -0,0 +1,266 @@ +/* + * StoreTarget.cpp + * + * Created on: 19 Jan 2016 + * Author: hieu + */ +#include +#include "StoreTarget.h" +#include "line_splitter.hh" +#include "probing_hash_utils.hh" +#include "moses/OutputFileStream.h" +#include "moses/Util.h" + +using namespace std; + +namespace Moses +{ + +StoreTarget::StoreTarget(const std::string &basepath) +:m_basePath(basepath) +,m_vocab(basepath + "/TargetVocab.dat") +{ + std::string path = basepath + "/TargetColl.dat"; + m_fileTargetColl.open(path.c_str(), + std::ios::out | std::ios::binary | std::ios::ate | std::ios::trunc); + if (!m_fileTargetColl.is_open()) { + throw "can't create file "; + } + +} + +StoreTarget::~StoreTarget() +{ + assert(m_coll.empty()); + m_fileTargetColl.close(); + + // vocab + m_vocab.Save(); +} + +uint64_t StoreTarget::Save() +{ + uint64_t ret = m_fileTargetColl.tellp(); + + // save to disk + uint64_t numTP = m_coll.size(); + m_fileTargetColl.write((char*) &numTP, sizeof(uint64_t)); + + for (size_t i = 0; i < m_coll.size(); ++i) { + Save(*m_coll[i]); + } + + // clear coll + RemoveAllInColl(m_coll); + m_coll.clear(); + + // starting position of coll + return ret; +} + +void StoreTarget::Save(const 
target_text &rule) +{ + // metadata for each tp + TargetPhraseInfo tpInfo; + tpInfo.alignTerm = GetAlignId(rule.word_align_term); + tpInfo.alignNonTerm = GetAlignId(rule.word_align_non_term); + tpInfo.numWords = rule.target_phrase.size(); + tpInfo.propLength = rule.property.size(); + + //cerr << "TPInfo=" << sizeof(TPInfo); + m_fileTargetColl.write((char*) &tpInfo, sizeof(TargetPhraseInfo)); + + // scores + for (size_t i = 0; i < rule.prob.size(); ++i) { + float prob = rule.prob[i]; + m_fileTargetColl.write((char*) &prob, sizeof(prob)); + } + + // tp + for (size_t i = 0; i < rule.target_phrase.size(); ++i) { + uint32_t vocabId = rule.target_phrase[i]; + m_fileTargetColl.write((char*) &vocabId, sizeof(vocabId)); + } + + // prop TODO + +} + +void StoreTarget::SaveAlignment() +{ + std::string path = m_basePath + "/Alignments.dat"; + OutputFileStream file(path); + + BOOST_FOREACH(Alignments::value_type &valPair, m_aligns) { + file << valPair.second << "\t"; + + const std::vector &aligns = valPair.first; + BOOST_FOREACH(size_t align, aligns) { + file << align << " "; + } + file << endl; + } + +} + +void StoreTarget::Append(const line_text &line, bool log_prob, bool scfg) +{ + target_text *rule = new target_text; + //cerr << "line.target_phrase=" << line.target_phrase << endl; + + // target_phrase + vector nonTerms; + util::TokenIter it; + it = util::TokenIter(line.target_phrase, + util::SingleCharacter(' ')); + while (it) { + StringPiece word = *it; + //cerr << "word=" << word << endl; + + bool nonTerm = false; + if (scfg) { + // not really sure how to handle factored SCFG and NT + if (scfg && word[0] == '[' && word[word.size() - 1] == ']') { + //cerr << "NON-TERM=" << tok << " " << nonTerms.size() << endl; + nonTerm = true; + } + nonTerms.push_back(nonTerm); + } + + util::TokenIter itFactor; + itFactor = util::TokenIter(word, + util::SingleCharacter('|')); + while (itFactor) { + StringPiece factor = *itFactor; + + string factorStr = factor.as_string(); + uint32_t 
vocabId = m_vocab.GetVocabId(factorStr); + + rule->target_phrase.push_back(vocabId); + + itFactor++; + } + + it++; + } + + // probs + it = util::TokenIter(line.prob, + util::SingleCharacter(' ')); + while (it) { + string tok = it->as_string(); + float prob = Scan(tok); + + if (log_prob) { + prob = FloorScore(log(prob)); + if (prob == 0.0f) prob = 0.0000000001; + } + + rule->prob.push_back(prob); + it++; + } + + /* + cerr << "nonTerms="; + for (size_t i = 0; i < nonTerms.size(); ++i) { + cerr << nonTerms[i] << " "; + } + cerr << endl; + */ + + // alignment + it = util::TokenIter(line.word_align, + util::SingleCharacter(' ')); + while (it) { + string tokPair = Trim(it->as_string()); + if (tokPair.empty()) { + break; + } + + vector alignPair = Tokenize(tokPair, "-"); + assert(alignPair.size() == 2); + + bool nonTerm = false; + size_t sourcePos = alignPair[0]; + size_t targetPos = alignPair[1]; + if (scfg) { + nonTerm = nonTerms[targetPos]; + } + + //cerr << targetPos << "=" << nonTerm << endl; + + if (nonTerm) { + rule->word_align_non_term.push_back(sourcePos); + rule->word_align_non_term.push_back(targetPos); + //cerr << (int) rule->word_all1.back() << " "; + } + else { + rule->word_align_term.push_back(sourcePos); + rule->word_align_term.push_back(targetPos); + } + + it++; + } + + // extra scores + string prop = line.property.as_string(); + AppendLexRO(prop, rule->prob, log_prob); + + //cerr << "line.property=" << line.property << endl; + //cerr << "prop=" << prop << endl; + + // properties + /* + for (size_t i = 0; i < prop.size(); ++i) { + rule->property.push_back(prop[i]); + } + */ + m_coll.push_back(rule); +} + +uint32_t StoreTarget::GetAlignId(const std::vector &align) +{ + boost::unordered_map, uint32_t>::iterator iter = + m_aligns.find(align); + if (iter == m_aligns.end()) { + uint32_t ind = m_aligns.size(); + m_aligns[align] = ind; + return ind; + } + else { + return iter->second; + } +} + +void StoreTarget::AppendLexRO(std::string &prop, std::vector 
&retvector, + bool log_prob) const +{ + size_t startPos = prop.find("{{LexRO "); + + if (startPos != string::npos) { + size_t endPos = prop.find("}}", startPos + 8); + string lexProb = prop.substr(startPos + 8, endPos - startPos - 8); + //cerr << "lexProb=" << lexProb << endl; + + // append lex probs to pt probs + vector scores = Tokenize(lexProb); + + if (log_prob) { + for (size_t i = 0; i < scores.size(); ++i) { + scores[i] = FloorScore(log(scores[i])); + if (scores[i] == 0.0f) scores[i] = 0.0000000001; + } + } + + for (size_t i = 0; i < scores.size(); ++i) { + retvector.push_back(scores[i]); + } + + // exclude LexRO property from property column + prop = prop.substr(0, startPos) + + prop.substr(endPos + 2, prop.size() - endPos - 2); + //cerr << "line.property_to_be_binarized=" << line.property_to_be_binarized << "AAAA" << endl; + } +} + +} /* namespace Moses2 */ diff --git a/moses/TranslationModel/ProbingPT/StoreTarget.h b/moses/TranslationModel/ProbingPT/StoreTarget.h new file mode 100644 index 000000000..5c7d9e1b7 --- /dev/null +++ b/moses/TranslationModel/ProbingPT/StoreTarget.h @@ -0,0 +1,51 @@ +/* + * StoreTarget.h + * + * Created on: 19 Jan 2016 + * Author: hieu + */ +#pragma once +#include +#include +#include +#include +#include +#include +#include "StoreVocab.h" + +namespace Moses +{ + +class line_text; +class target_text; + +class StoreTarget +{ +public: + StoreTarget(const std::string &basepath); + virtual ~StoreTarget(); + + uint64_t Save(); + void SaveAlignment(); + + void Append(const line_text &line, bool log_prob, bool scfg); +protected: + std::string m_basePath; + std::fstream m_fileTargetColl; + StoreVocab m_vocab; + + typedef boost::unordered_map, uint32_t> Alignments; + Alignments m_aligns; + + std::vector m_coll; + + uint32_t GetAlignId(const std::vector &align); + void Save(const target_text &rule); + + void AppendLexRO(std::string &prop, std::vector &retvector, + bool log_prob) const; + +}; + +} /* namespace Moses2 */ + diff --git 
a/moses/TranslationModel/ProbingPT/StoreVocab.cpp b/moses/TranslationModel/ProbingPT/StoreVocab.cpp new file mode 100644 index 000000000..6515bac63 --- /dev/null +++ b/moses/TranslationModel/ProbingPT/StoreVocab.cpp @@ -0,0 +1,13 @@ +/* + * StoreVocab.cpp + * + * Created on: 15 Jun 2016 + * Author: hieu + */ +#include +#include "StoreVocab.h" + +namespace Moses +{ + +} /* namespace Moses2 */ diff --git a/moses/TranslationModel/ProbingPT/StoreVocab.h b/moses/TranslationModel/ProbingPT/StoreVocab.h new file mode 100644 index 000000000..05d279f4c --- /dev/null +++ b/moses/TranslationModel/ProbingPT/StoreVocab.h @@ -0,0 +1,64 @@ +/* + * StoreVocab.h + * + * Created on: 15 Jun 2016 + * Author: hieu + */ +#pragma once +#include +#include +#include "moses/OutputFileStream.h" +#include "moses/Util.h" + +namespace Moses +{ + +template +class StoreVocab +{ +protected: + std::string m_path; + + typedef boost::unordered_map Coll; + Coll m_vocab; + +public: + StoreVocab(const std::string &path) + :m_path(path) + {} + + virtual ~StoreVocab() {} + + VOCABID GetVocabId(const std::string &word) + { + typename Coll::iterator iter = m_vocab.find(word); + if (iter == m_vocab.end()) { + VOCABID ind = m_vocab.size() + 1; + m_vocab[word] = ind; + return ind; + } + else { + return iter->second; + } + } + + void Insert(VOCABID id, const std::string &word) + { + m_vocab[word] = id; + } + + void Save() + { + OutputFileStream strme(m_path); + + typename Coll::const_iterator iter; + for (iter = m_vocab.begin(); iter != m_vocab.end(); ++iter) { + strme << iter->first << "\t" << iter->second << std::endl; + } + + strme.Close(); + } +}; + +} /* namespace Moses2 */ + -- cgit v1.2.3 From 43ece9b1fff4ef94117e946f24269d2dbf17b20e Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 5 Oct 2016 16:53:16 +0100 Subject: compile error with gcc 4.6.3 --- contrib/moses2/MemPool.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/contrib/moses2/MemPool.h b/contrib/moses2/MemPool.h index 
5820ce2be..8160aa5a3 100644 --- a/contrib/moses2/MemPool.h +++ b/contrib/moses2/MemPool.h @@ -230,6 +230,14 @@ public: //std::cerr << "destroy " << p << " " << n << std::endl; } + // return address of values + pointer address (reference value) const { + return &value; + } + const_pointer address (const_reference value) const { + return &value; + } + MemPool &m_pool; protected: }; -- cgit v1.2.3 From cb348f159adae8208ded7042f51c5d9dd739ccb0 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 5 Oct 2016 16:57:12 +0100 Subject: add --scfg arg --- scripts/generic/binarize4moses2.perl | 41 +++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/scripts/generic/binarize4moses2.perl b/scripts/generic/binarize4moses2.perl index 5b9f08e50..5765c3705 100755 --- a/scripts/generic/binarize4moses2.perl +++ b/scripts/generic/binarize4moses2.perl @@ -12,22 +12,23 @@ my $mosesDir = "$RealBin/../.."; my $ptPath; my $lexRoPath; my $outPath; -my $numScores = 4; my $numLexScores; my $pruneNum = 0; +my $scfg = 0; GetOptions("phrase-table=s" => \$ptPath, "lex-ro=s" => \$lexRoPath, "output-dir=s" => \$outPath, - "num-scores=s" => \$numScores, "num-lex-scores=i" => \$numLexScores, - "prune=i" => \$pruneNum + "prune=i" => \$pruneNum, + "scfg" => \$scfg ) or exit 1; +#print STDERR "scfg=$scfg \n"; die("ERROR: please set --phrase-table") unless defined($ptPath); -die("ERROR: please set --lex-ro") unless defined($lexRoPath); +#die("ERROR: please set --lex-ro") unless defined($lexRoPath); die("ERROR: please set --output-dir") unless defined($outPath); -die("ERROR: please set --num-lex-scores") unless defined($numLexScores); +#die("ERROR: please set --num-lex-scores") unless defined($numLexScores); my $cmd; @@ -37,13 +38,33 @@ my $tempPath = dirname($outPath) ."/tmp.$$"; $cmd = "gzip -dc $ptPath | $mosesDir/contrib/sigtest-filter/filter-pt -n $pruneNum | gzip -c > $tempPath/pt.gz"; systemCheck($cmd); -$cmd = "$mosesDir/bin/processLexicalTableMin -in 
$lexRoPath -out $tempPath/lex-ro -T . -threads all"; -systemCheck($cmd); +if (defined($lexRoPath)) { + die("ERROR: please set --num-lex-scores") unless defined($numLexScores); -$cmd = "$mosesDir/bin/addLexROtoPT $tempPath/pt.gz $tempPath/lex-ro.minlexr | gzip -c > $tempPath/pt.withLexRO.gz"; -systemCheck($cmd); + $cmd = "$mosesDir/bin/processLexicalTableMin -in $lexRoPath -out $tempPath/lex-ro -T . -threads all"; + systemCheck($cmd); + + $cmd = "$mosesDir/bin/addLexROtoPT $tempPath/pt.gz $tempPath/lex-ro.minlexr | gzip -c > $tempPath/pt.withLexRO.gz"; + systemCheck($cmd); + + $cmd = "ln -s pt.withLexRO.gz $tempPath/pt.txt.gz"; + systemCheck($cmd); +} +else { + $cmd = "ln -s pt.gz $tempPath/pt.txt.gz"; + systemCheck($cmd); +} + +$cmd = "$mosesDir/bin/CreateProbingPT2 --log-prob --input-pt $tempPath/pt.txt.gz --output-dir $outPath"; + +if (defined($lexRoPath)) { + $cmd .= " --num-lex-scores $numLexScores"; +} + +if ($scfg) { + $cmd .= " --scfg"; +} -$cmd = "$mosesDir/bin/CreateProbingPT2 --num-scores $numScores --num-lex-scores $numLexScores --log-prob --input-pt $tempPath/pt.withLexRO.gz --output-dir $outPath"; systemCheck($cmd); exit(0); -- cgit v1.2.3 From fa888166c00d266c09de6f22d123901aae15d73a Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 6 Oct 2016 13:10:55 +0100 Subject: no segfault. 
yay --- moses/TranslationModel/ProbingPT/ProbingPT.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moses/TranslationModel/ProbingPT/ProbingPT.cpp b/moses/TranslationModel/ProbingPT/ProbingPT.cpp index 1fd982f0e..06b1360cd 100644 --- a/moses/TranslationModel/ProbingPT/ProbingPT.cpp +++ b/moses/TranslationModel/ProbingPT/ProbingPT.cpp @@ -304,6 +304,7 @@ TargetPhrase *ProbingPT::CreateTargetPhrase( // words for (size_t targetPos = 0; targetPos < numRealWords; ++targetPos) { + Word &word = tp->AddWord(); for (size_t i = 0; i < m_output.size(); ++i) { FactorType factorType = m_output[i]; @@ -312,7 +313,6 @@ TargetPhrase *ProbingPT::CreateTargetPhrase( const Factor *factor = GetTargetFactor(*probingId); assert(factor); - Word &word = tp->GetWord(targetPos); word[factorType] = factor; offset += sizeof(uint32_t); -- cgit v1.2.3 From d03991acec06b10bdd6fe213aac64012978ae90b Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 6 Oct 2016 13:31:18 +0100 Subject: delete CreateProbingPT2 --- Jamroot | 2 + contrib/moses2/CreateProbingPT2.cpp | 113 ------------------------------------ contrib/moses2/Jamfile | 3 +- 3 files changed, 3 insertions(+), 115 deletions(-) delete mode 100644 contrib/moses2/CreateProbingPT2.cpp diff --git a/Jamroot b/Jamroot index efafa0122..7a7be5c93 100644 --- a/Jamroot +++ b/Jamroot @@ -341,3 +341,5 @@ if [ path.exists $(TOP)/dist ] && $(prefix) != dist { local temp = [ _shell "mkdir -p $(TOP)/bin" ] ; local temp = [ _shell "rm -f $(TOP)/bin/moses_chart" ] ; local temp = [ _shell "cd $(TOP)/bin && ln -s moses moses_chart" ] ; +local temp = [ _shell "cd $(TOP)/bin && ln -s CreateProbingPT CreateProbingPT2" ] ; + diff --git a/contrib/moses2/CreateProbingPT2.cpp b/contrib/moses2/CreateProbingPT2.cpp deleted file mode 100644 index 24b0e2fd1..000000000 --- a/contrib/moses2/CreateProbingPT2.cpp +++ /dev/null @@ -1,113 +0,0 @@ -#include -#include -#include "util/usage.hh" -#include "TranslationModel/ProbingPT/storing.hh" -#include 
"legacy/InputFileStream.h" -#include "legacy/OutputFileStream.h" -#include "legacy/Util2.h" - -using namespace std; - -std::string ReformatSCFGFile(const std::string &path); - -int main(int argc, char* argv[]) -{ - string inPath, outPath; - int num_scores = 4; - int num_lex_scores = 0; - bool log_prob = false; - bool scfg = false; - int max_cache_size = 50000; - - namespace po = boost::program_options; - po::options_description desc("Options"); - desc.add_options() - ("help", "Print help messages") - ("input-pt", po::value()->required(), "Text pt") - ("output-dir", po::value()->required(), "Directory when binary files will be written") - ("num-scores", po::value()->default_value(num_scores), "Number of pt scores") - ("num-lex-scores", po::value()->default_value(num_lex_scores), "Number of lexicalized reordering scores") - ("log-prob", "log (and floor) probabilities before storing") - ("max-cache-size", po::value()->default_value(max_cache_size), "Maximum number of high-count source lines to write to cache file. 0=no cache, negative=no limit") - ("scfg", "Rules are SCFG in Moses format (ie. 
with non-terms and LHS") - - ; - - po::variables_map vm; - try { - po::store(po::parse_command_line(argc, argv, desc), - vm); // can throw - - /** --help option - */ - if ( vm.count("help")) { - std::cout << desc << std::endl; - return EXIT_SUCCESS; - } - - po::notify(vm); // throws on error, so do after help in case - // there are any problems - } catch(po::error& e) { - std::cerr << "ERROR: " << e.what() << std::endl << std::endl; - std::cerr << desc << std::endl; - return EXIT_FAILURE; - } - - if (vm.count("input-pt")) inPath = vm["input-pt"].as(); - if (vm.count("output-dir")) outPath = vm["output-dir"].as(); - if (vm.count("num-scores")) num_scores = vm["num-scores"].as(); - if (vm.count("num-lex-scores")) num_lex_scores = vm["num-lex-scores"].as(); - if (vm.count("max-cache-size")) max_cache_size = vm["max-cache-size"].as(); - if (vm.count("log-prob")) log_prob = true; - if (vm.count("scfg")) scfg = true; - - - if (scfg) { - inPath = ReformatSCFGFile(inPath); - } - - Moses2::createProbingPT(inPath, outPath, num_scores, num_lex_scores, log_prob, max_cache_size, scfg); - - //util::PrintUsage(std::cout); - return 0; -} - -std::string ReformatSCFGFile(const std::string &path) -{ - Moses2::InputFileStream inFile(path); - string reformattedPath = path + ".reformat.gz"; - Moses2::OutputFileStream outFile(reformattedPath); - - string line; - while (getline(inFile, line)) { - vector toks = Moses2::TokenizeMultiCharSeparator(line, "|||"); - assert(toks.size() >= 3); - - // source - vector sourceToks = Moses2::Tokenize(toks[0], " "); - for (size_t i = 0; i < sourceToks.size() - 1; ++i) { - outFile << sourceToks[i] << " "; - } - - // other columns - for (size_t i = 1; i < toks.size(); ++i) { - outFile << "|||" << toks[i]; - } - outFile << endl; - } - - inFile.Close(); - outFile.Close(); - - string sortedPath = path + ".reformat.sorted.gz"; - string tmpPath = path + ".tmp "; - string cmd = "mkdir " + tmpPath - + " && gzip -dc " + reformattedPath + " | LC_ALL=C sort -T " + 
tmpPath + " | gzip -c > " + sortedPath; - system(cmd.c_str()); - - cmd = "rm -rf " + tmpPath + " " + reformattedPath; - system(cmd.c_str()); - - return sortedPath; -} - diff --git a/contrib/moses2/Jamfile b/contrib/moses2/Jamfile index 193ac8db5..8791e3cf9 100644 --- a/contrib/moses2/Jamfile +++ b/contrib/moses2/Jamfile @@ -173,11 +173,10 @@ alias deps : ../..//z ../..//boost_iostreams ../..//boost_filesystem ../../mose deps ; exe moses2 : Main.cpp moses2_lib ; -exe CreateProbingPT2 : CreateProbingPT2.cpp moses2_lib ; if [ xmlrpc ] { echo "Building Moses2" ; - alias programs : moses2 CreateProbingPT2 ; + alias programs : moses2 ; } else { echo "Not building Moses2" ; -- cgit v1.2.3 From babc5acf70730bd9fd28d0d4deede8fe0fb23c29 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 6 Oct 2016 13:48:06 +0100 Subject: check that pt exists --- contrib/moses2/TranslationModel/ProbingPT/quering.cpp | 7 ++++++- moses/TranslationModel/ProbingPT/quering.cpp | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/contrib/moses2/TranslationModel/ProbingPT/quering.cpp b/contrib/moses2/TranslationModel/ProbingPT/quering.cpp index f26439442..36e384e73 100644 --- a/contrib/moses2/TranslationModel/ProbingPT/quering.cpp +++ b/contrib/moses2/TranslationModel/ProbingPT/quering.cpp @@ -12,10 +12,15 @@ QueryEngine::QueryEngine(const char * filepath) //Create filepaths std::string basepath(filepath); + std::string path_to_config = basepath + "/config"; std::string path_to_hashtable = basepath + "/probing_hash.dat"; std::string path_to_source_vocabid = basepath + "/source_vocabids"; std::string alignPath = basepath + "/Alignments.dat"; + if (!FileExists(path_to_config)) { + UTIL_THROW2("Binary table doesn't exist is didn't finish binarizing: " << path_to_config); + } + ///Source phrase vocabids read_map(source_vocabids, path_to_source_vocabid.c_str()); @@ -25,7 +30,7 @@ QueryEngine::QueryEngine(const char * filepath) //Read config file boost::unordered_map keyValue; - 
std::ifstream config((basepath + "/config").c_str()); + std::ifstream config(path_to_config.c_str()); std::string line; while (getline(config, line)) { std::vector toks = Moses2::Tokenize(line, "\t"); diff --git a/moses/TranslationModel/ProbingPT/quering.cpp b/moses/TranslationModel/ProbingPT/quering.cpp index d616e1f25..52cd7f516 100644 --- a/moses/TranslationModel/ProbingPT/quering.cpp +++ b/moses/TranslationModel/ProbingPT/quering.cpp @@ -11,10 +11,15 @@ QueryEngine::QueryEngine(const char * filepath) //Create filepaths std::string basepath(filepath); + std::string path_to_config = basepath + "/config"; std::string path_to_hashtable = basepath + "/probing_hash.dat"; std::string path_to_source_vocabid = basepath + "/source_vocabids"; std::string alignPath = basepath + "/Alignments.dat"; + if (!FileExists(path_to_config)) { + UTIL_THROW2("Binary table doesn't exist is didn't finish binarizing: " << path_to_config); + } + ///Source phrase vocabids read_map(source_vocabids, path_to_source_vocabid.c_str()); @@ -24,7 +29,7 @@ QueryEngine::QueryEngine(const char * filepath) //Read config file boost::unordered_map keyValue; - std::ifstream config((basepath + "/config").c_str()); + std::ifstream config(path_to_config.c_str()); std::string line; while (getline(config, line)) { std::vector toks = Tokenize(line, "\t"); -- cgit v1.2.3 From a2fd8d5b2c43f0008a050aa850ed387b2289c9c9 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 6 Oct 2016 13:57:33 +0100 Subject: quering -> querying --- contrib/moses2/Jamfile | 2 +- .../TranslationModel/ProbingPT/ProbingPT.cpp | 2 +- .../moses2/TranslationModel/ProbingPT/quering.cpp | 143 --------------------- .../moses2/TranslationModel/ProbingPT/quering.hh | 65 ---------- .../moses2/TranslationModel/ProbingPT/querying.cpp | 143 +++++++++++++++++++++ .../moses2/TranslationModel/ProbingPT/querying.hh | 65 ++++++++++ contrib/other-builds/moses/.project | 8 +- moses/TranslationModel/ProbingPT/ProbingPT.cpp | 2 +- 
moses/TranslationModel/ProbingPT/quering.cpp | 142 -------------------- moses/TranslationModel/ProbingPT/quering.hh | 65 ---------- moses/TranslationModel/ProbingPT/querying.cpp | 142 ++++++++++++++++++++ moses/TranslationModel/ProbingPT/querying.hh | 65 ++++++++++ 12 files changed, 422 insertions(+), 422 deletions(-) delete mode 100644 contrib/moses2/TranslationModel/ProbingPT/quering.cpp delete mode 100644 contrib/moses2/TranslationModel/ProbingPT/quering.hh create mode 100644 contrib/moses2/TranslationModel/ProbingPT/querying.cpp create mode 100644 contrib/moses2/TranslationModel/ProbingPT/querying.hh delete mode 100644 moses/TranslationModel/ProbingPT/quering.cpp delete mode 100644 moses/TranslationModel/ProbingPT/quering.hh create mode 100644 moses/TranslationModel/ProbingPT/querying.cpp create mode 100644 moses/TranslationModel/ProbingPT/querying.hh diff --git a/contrib/moses2/Jamfile b/contrib/moses2/Jamfile index 8791e3cf9..850dbcd1f 100644 --- a/contrib/moses2/Jamfile +++ b/contrib/moses2/Jamfile @@ -72,7 +72,7 @@ alias deps : ../..//z ../..//boost_iostreams ../..//boost_filesystem ../../mose TranslationModel/ProbingPT/hash.cpp TranslationModel/ProbingPT/line_splitter.cpp TranslationModel/ProbingPT/probing_hash_utils.cpp - TranslationModel/ProbingPT/quering.cpp + TranslationModel/ProbingPT/querying.cpp TranslationModel/ProbingPT/storing.cpp TranslationModel/ProbingPT/StoreVocab.cpp TranslationModel/ProbingPT/StoreTarget.cpp diff --git a/contrib/moses2/TranslationModel/ProbingPT/ProbingPT.cpp b/contrib/moses2/TranslationModel/ProbingPT/ProbingPT.cpp index 829906b55..2c9a5f31a 100644 --- a/contrib/moses2/TranslationModel/ProbingPT/ProbingPT.cpp +++ b/contrib/moses2/TranslationModel/ProbingPT/ProbingPT.cpp @@ -6,7 +6,7 @@ */ #include #include "ProbingPT.h" -#include "quering.hh" +#include "querying.hh" #include "probing_hash_utils.hh" #include "util/exception.hh" #include "../../System.h" diff --git a/contrib/moses2/TranslationModel/ProbingPT/quering.cpp 
b/contrib/moses2/TranslationModel/ProbingPT/quering.cpp deleted file mode 100644 index 36e384e73..000000000 --- a/contrib/moses2/TranslationModel/ProbingPT/quering.cpp +++ /dev/null @@ -1,143 +0,0 @@ -#include "quering.hh" -#include "util/exception.hh" -#include "../../legacy/Util2.h" - -using namespace std; - -namespace Moses2 -{ - -QueryEngine::QueryEngine(const char * filepath) -{ - - //Create filepaths - std::string basepath(filepath); - std::string path_to_config = basepath + "/config"; - std::string path_to_hashtable = basepath + "/probing_hash.dat"; - std::string path_to_source_vocabid = basepath + "/source_vocabids"; - std::string alignPath = basepath + "/Alignments.dat"; - - if (!FileExists(path_to_config)) { - UTIL_THROW2("Binary table doesn't exist is didn't finish binarizing: " << path_to_config); - } - - ///Source phrase vocabids - read_map(source_vocabids, path_to_source_vocabid.c_str()); - - // alignments - read_alignments(alignPath); - - //Read config file - boost::unordered_map keyValue; - - std::ifstream config(path_to_config.c_str()); - std::string line; - while (getline(config, line)) { - std::vector toks = Moses2::Tokenize(line, "\t"); - UTIL_THROW_IF2(toks.size() != 2, "Wrong config format:" << line); - keyValue[ toks[0] ] = toks[1]; - } - - bool found; - //Check API version: - int version; - found = Get(keyValue, "API_VERSION", version); - if (!found) { - std::cerr << "Old or corrupted version of ProbingPT. Please rebinarize your phrase tables." << std::endl; - } - else if (version != API_VERSION) { - std::cerr << "The ProbingPT API has changed. " << version << "!=" - << API_VERSION << " Please rebinarize your phrase tables." << std::endl; - exit(EXIT_FAILURE); - } - - //Get tablesize. 
- int tablesize; - found = Get(keyValue, "uniq_entries", tablesize); - if (!found) { - std::cerr << "uniq_entries not found" << std::endl; - exit(EXIT_FAILURE); - } - - //Number of scores - found = Get(keyValue, "num_scores", num_scores); - if (!found) { - std::cerr << "num_scores not found" << std::endl; - exit(EXIT_FAILURE); - } - - //How may scores from lex reordering models - found = Get(keyValue, "num_lex_scores", num_lex_scores); - if (!found) { - std::cerr << "num_lex_scores not found" << std::endl; - exit(EXIT_FAILURE); - } - - // have the scores been log() and FloorScore()? - found = Get(keyValue, "log_prob", logProb); - if (!found) { - std::cerr << "logProb not found" << std::endl; - exit(EXIT_FAILURE); - } - - config.close(); - - //Read hashtable - table_filesize = Table::Size(tablesize, 1.2); - mem = readTable(path_to_hashtable.c_str(), table_filesize); - Table table_init(mem, table_filesize); - table = table_init; - - std::cerr << "Initialized successfully! " << std::endl; -} - -QueryEngine::~QueryEngine() -{ - //Clear mmap content from memory. 
- munmap(mem, table_filesize); - -} - -uint64_t QueryEngine::getKey(uint64_t source_phrase[], size_t size) const -{ - //TOO SLOW - //uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size()); - return Moses2::getKey(source_phrase, size); -} - -std::pair QueryEngine::query(uint64_t key) -{ - std::pair ret; - - const Entry * entry; - ret.first = table.Find(key, entry); - if (ret.first) { - ret.second = entry->value; - } - return ret; -} - -void QueryEngine::read_alignments(const std::string &alignPath) -{ - std::ifstream strm(alignPath.c_str()); - - string line; - while (getline(strm, line)) { - vector toks = Moses2::Tokenize(line, "\t "); - UTIL_THROW_IF2(toks.size() == 0, "Corrupt alignment file"); - - uint32_t alignInd = Scan(toks[0]); - if (alignInd >= alignColl.size()) { - alignColl.resize(alignInd + 1); - } - - Alignments &aligns = alignColl[alignInd]; - for (size_t i = 1; i < toks.size(); ++i) { - size_t pos = Scan(toks[i]); - aligns.push_back(pos); - } - } -} - -} - diff --git a/contrib/moses2/TranslationModel/ProbingPT/quering.hh b/contrib/moses2/TranslationModel/ProbingPT/quering.hh deleted file mode 100644 index aae4b4f09..000000000 --- a/contrib/moses2/TranslationModel/ProbingPT/quering.hh +++ /dev/null @@ -1,65 +0,0 @@ -#pragma once - -#include -#include //For finding size of file -#include "vocabid.hh" -#include //toLower -#include -#include "probing_hash_utils.hh" -#include "hash.hh" //Includes line splitter -#include "line_splitter.hh" -#include "../../legacy/Util2.h" - -namespace Moses2 -{ - -class QueryEngine -{ - std::map source_vocabids; - - typedef std::vector Alignments; - std::vector alignColl; - - Table table; - char *mem; //Memory for the table, necessary so that we can correctly destroy the object - - size_t table_filesize; - bool is_reordering; - - void read_alignments(const std::string &alignPath); - -public: - int num_scores; - int num_lex_scores; - bool logProb; - - QueryEngine(const char *); - ~QueryEngine(); - - 
std::pair query(uint64_t key); - - const std::map &getSourceVocab() const - { return source_vocabids; } - - const std::vector &getAlignments() const - { return alignColl; } - - uint64_t getKey(uint64_t source_phrase[], size_t size) const; - - template - inline bool Get(const boost::unordered_map &keyValue, const std::string &sought, T &found) const - { - boost::unordered_map::const_iterator iter = keyValue.find(sought); - if (iter == keyValue.end()) { - return false; - } - - const std::string &foundStr = iter->second; - found = Scan(foundStr); - return true; - } - -}; - -} - diff --git a/contrib/moses2/TranslationModel/ProbingPT/querying.cpp b/contrib/moses2/TranslationModel/ProbingPT/querying.cpp new file mode 100644 index 000000000..fb8ccef9a --- /dev/null +++ b/contrib/moses2/TranslationModel/ProbingPT/querying.cpp @@ -0,0 +1,143 @@ +#include "querying.hh" +#include "util/exception.hh" +#include "../../legacy/Util2.h" + +using namespace std; + +namespace Moses2 +{ + +QueryEngine::QueryEngine(const char * filepath) +{ + + //Create filepaths + std::string basepath(filepath); + std::string path_to_config = basepath + "/config"; + std::string path_to_hashtable = basepath + "/probing_hash.dat"; + std::string path_to_source_vocabid = basepath + "/source_vocabids"; + std::string alignPath = basepath + "/Alignments.dat"; + + if (!FileExists(path_to_config)) { + UTIL_THROW2("Binary table doesn't exist is didn't finish binarizing: " << path_to_config); + } + + ///Source phrase vocabids + read_map(source_vocabids, path_to_source_vocabid.c_str()); + + // alignments + read_alignments(alignPath); + + //Read config file + boost::unordered_map keyValue; + + std::ifstream config(path_to_config.c_str()); + std::string line; + while (getline(config, line)) { + std::vector toks = Moses2::Tokenize(line, "\t"); + UTIL_THROW_IF2(toks.size() != 2, "Wrong config format:" << line); + keyValue[ toks[0] ] = toks[1]; + } + + bool found; + //Check API version: + int version; + found = 
Get(keyValue, "API_VERSION", version); + if (!found) { + std::cerr << "Old or corrupted version of ProbingPT. Please rebinarize your phrase tables." << std::endl; + } + else if (version != API_VERSION) { + std::cerr << "The ProbingPT API has changed. " << version << "!=" + << API_VERSION << " Please rebinarize your phrase tables." << std::endl; + exit(EXIT_FAILURE); + } + + //Get tablesize. + int tablesize; + found = Get(keyValue, "uniq_entries", tablesize); + if (!found) { + std::cerr << "uniq_entries not found" << std::endl; + exit(EXIT_FAILURE); + } + + //Number of scores + found = Get(keyValue, "num_scores", num_scores); + if (!found) { + std::cerr << "num_scores not found" << std::endl; + exit(EXIT_FAILURE); + } + + //How may scores from lex reordering models + found = Get(keyValue, "num_lex_scores", num_lex_scores); + if (!found) { + std::cerr << "num_lex_scores not found" << std::endl; + exit(EXIT_FAILURE); + } + + // have the scores been log() and FloorScore()? + found = Get(keyValue, "log_prob", logProb); + if (!found) { + std::cerr << "logProb not found" << std::endl; + exit(EXIT_FAILURE); + } + + config.close(); + + //Read hashtable + table_filesize = Table::Size(tablesize, 1.2); + mem = readTable(path_to_hashtable.c_str(), table_filesize); + Table table_init(mem, table_filesize); + table = table_init; + + std::cerr << "Initialized successfully! " << std::endl; +} + +QueryEngine::~QueryEngine() +{ + //Clear mmap content from memory. 
+ munmap(mem, table_filesize); + +} + +uint64_t QueryEngine::getKey(uint64_t source_phrase[], size_t size) const +{ + //TOO SLOW + //uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size()); + return Moses2::getKey(source_phrase, size); +} + +std::pair QueryEngine::query(uint64_t key) +{ + std::pair ret; + + const Entry * entry; + ret.first = table.Find(key, entry); + if (ret.first) { + ret.second = entry->value; + } + return ret; +} + +void QueryEngine::read_alignments(const std::string &alignPath) +{ + std::ifstream strm(alignPath.c_str()); + + string line; + while (getline(strm, line)) { + vector toks = Moses2::Tokenize(line, "\t "); + UTIL_THROW_IF2(toks.size() == 0, "Corrupt alignment file"); + + uint32_t alignInd = Scan(toks[0]); + if (alignInd >= alignColl.size()) { + alignColl.resize(alignInd + 1); + } + + Alignments &aligns = alignColl[alignInd]; + for (size_t i = 1; i < toks.size(); ++i) { + size_t pos = Scan(toks[i]); + aligns.push_back(pos); + } + } +} + +} + diff --git a/contrib/moses2/TranslationModel/ProbingPT/querying.hh b/contrib/moses2/TranslationModel/ProbingPT/querying.hh new file mode 100644 index 000000000..aae4b4f09 --- /dev/null +++ b/contrib/moses2/TranslationModel/ProbingPT/querying.hh @@ -0,0 +1,65 @@ +#pragma once + +#include +#include //For finding size of file +#include "vocabid.hh" +#include //toLower +#include +#include "probing_hash_utils.hh" +#include "hash.hh" //Includes line splitter +#include "line_splitter.hh" +#include "../../legacy/Util2.h" + +namespace Moses2 +{ + +class QueryEngine +{ + std::map source_vocabids; + + typedef std::vector Alignments; + std::vector alignColl; + + Table table; + char *mem; //Memory for the table, necessary so that we can correctly destroy the object + + size_t table_filesize; + bool is_reordering; + + void read_alignments(const std::string &alignPath); + +public: + int num_scores; + int num_lex_scores; + bool logProb; + + QueryEngine(const char *); + ~QueryEngine(); + + 
std::pair query(uint64_t key); + + const std::map &getSourceVocab() const + { return source_vocabids; } + + const std::vector &getAlignments() const + { return alignColl; } + + uint64_t getKey(uint64_t source_phrase[], size_t size) const; + + template + inline bool Get(const boost::unordered_map &keyValue, const std::string &sought, T &found) const + { + boost::unordered_map::const_iterator iter = keyValue.find(sought); + if (iter == keyValue.end()) { + return false; + } + + const std::string &foundStr = iter->second; + found = Scan(foundStr); + return true; + } + +}; + +} + diff --git a/contrib/other-builds/moses/.project b/contrib/other-builds/moses/.project index c25eb5225..c6b7de6f7 100644 --- a/contrib/other-builds/moses/.project +++ b/contrib/other-builds/moses/.project @@ -3391,14 +3391,14 @@ PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/probing_hash_utils.hh - TranslationModel/ProbingPT/quering.cpp + TranslationModel/ProbingPT/querying.cpp 1 - PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/quering.cpp + PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/querying.cpp - TranslationModel/ProbingPT/quering.hh + TranslationModel/ProbingPT/querying.hh 1 - PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/quering.hh + PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/querying.hh TranslationModel/ProbingPT/storing.cpp diff --git a/moses/TranslationModel/ProbingPT/ProbingPT.cpp b/moses/TranslationModel/ProbingPT/ProbingPT.cpp index 06b1360cd..8b4505985 100644 --- a/moses/TranslationModel/ProbingPT/ProbingPT.cpp +++ b/moses/TranslationModel/ProbingPT/ProbingPT.cpp @@ -5,7 +5,7 @@ #include "moses/TargetPhraseCollection.h" #include "moses/InputFileStream.h" #include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.h" -#include "quering.hh" +#include "querying.hh" using namespace std; diff --git a/moses/TranslationModel/ProbingPT/quering.cpp b/moses/TranslationModel/ProbingPT/quering.cpp deleted file mode 100644 index 
52cd7f516..000000000 --- a/moses/TranslationModel/ProbingPT/quering.cpp +++ /dev/null @@ -1,142 +0,0 @@ -#include "quering.hh" -#include "util/exception.hh" - -using namespace std; - -namespace Moses -{ - -QueryEngine::QueryEngine(const char * filepath) -{ - - //Create filepaths - std::string basepath(filepath); - std::string path_to_config = basepath + "/config"; - std::string path_to_hashtable = basepath + "/probing_hash.dat"; - std::string path_to_source_vocabid = basepath + "/source_vocabids"; - std::string alignPath = basepath + "/Alignments.dat"; - - if (!FileExists(path_to_config)) { - UTIL_THROW2("Binary table doesn't exist is didn't finish binarizing: " << path_to_config); - } - - ///Source phrase vocabids - read_map(source_vocabids, path_to_source_vocabid.c_str()); - - // alignments - read_alignments(alignPath); - - //Read config file - boost::unordered_map keyValue; - - std::ifstream config(path_to_config.c_str()); - std::string line; - while (getline(config, line)) { - std::vector toks = Tokenize(line, "\t"); - UTIL_THROW_IF2(toks.size() != 2, "Wrong config format:" << line); - keyValue[ toks[0] ] = toks[1]; - } - - bool found; - //Check API version: - int version; - found = Get(keyValue, "API_VERSION", version); - if (!found) { - std::cerr << "Old or corrupted version of ProbingPT. Please rebinarize your phrase tables." << std::endl; - } - else if (version != API_VERSION) { - std::cerr << "The ProbingPT API has changed. " << version << "!=" - << API_VERSION << " Please rebinarize your phrase tables." << std::endl; - exit(EXIT_FAILURE); - } - - //Get tablesize. 
- int tablesize; - found = Get(keyValue, "uniq_entries", tablesize); - if (!found) { - std::cerr << "uniq_entries not found" << std::endl; - exit(EXIT_FAILURE); - } - - //Number of scores - found = Get(keyValue, "num_scores", num_scores); - if (!found) { - std::cerr << "num_scores not found" << std::endl; - exit(EXIT_FAILURE); - } - - //How may scores from lex reordering models - found = Get(keyValue, "num_lex_scores", num_lex_scores); - if (!found) { - std::cerr << "num_lex_scores not found" << std::endl; - exit(EXIT_FAILURE); - } - - // have the scores been log() and FloorScore()? - found = Get(keyValue, "log_prob", logProb); - if (!found) { - std::cerr << "logProb not found" << std::endl; - exit(EXIT_FAILURE); - } - - config.close(); - - //Read hashtable - table_filesize = Table::Size(tablesize, 1.2); - mem = readTable(path_to_hashtable.c_str(), table_filesize); - Table table_init(mem, table_filesize); - table = table_init; - - std::cerr << "Initialized successfully! " << std::endl; -} - -QueryEngine::~QueryEngine() -{ - //Clear mmap content from memory. 
- munmap(mem, table_filesize); - -} - -uint64_t QueryEngine::getKey(uint64_t source_phrase[], size_t size) const -{ - //TOO SLOW - //uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size()); - return Moses::getKey(source_phrase, size); -} - -std::pair QueryEngine::query(uint64_t key) -{ - std::pair ret; - - const Entry * entry; - ret.first = table.Find(key, entry); - if (ret.first) { - ret.second = entry->value; - } - return ret; -} - -void QueryEngine::read_alignments(const std::string &alignPath) -{ - std::ifstream strm(alignPath.c_str()); - - string line; - while (getline(strm, line)) { - vector toks = Tokenize(line, "\t "); - UTIL_THROW_IF2(toks.size() == 0, "Corrupt alignment file"); - - uint32_t alignInd = Scan(toks[0]); - if (alignInd >= alignColl.size()) { - alignColl.resize(alignInd + 1); - } - - Alignments &aligns = alignColl[alignInd]; - for (size_t i = 1; i < toks.size(); ++i) { - size_t pos = Scan(toks[i]); - aligns.push_back(pos); - } - } -} - -} - diff --git a/moses/TranslationModel/ProbingPT/quering.hh b/moses/TranslationModel/ProbingPT/quering.hh deleted file mode 100644 index c43c7f3b9..000000000 --- a/moses/TranslationModel/ProbingPT/quering.hh +++ /dev/null @@ -1,65 +0,0 @@ -#pragma once - -#include -#include //For finding size of file -#include "vocabid.hh" -#include //toLower -#include -#include "probing_hash_utils.hh" -#include "hash.hh" //Includes line splitter -#include "line_splitter.hh" -#include "moses//Util.h" - -namespace Moses -{ - -class QueryEngine -{ - std::map source_vocabids; - - typedef std::vector Alignments; - std::vector alignColl; - - Table table; - char *mem; //Memory for the table, necessary so that we can correctly destroy the object - - size_t table_filesize; - bool is_reordering; - - void read_alignments(const std::string &alignPath); - -public: - int num_scores; - int num_lex_scores; - bool logProb; - - QueryEngine(const char *); - ~QueryEngine(); - - std::pair query(uint64_t key); - - const 
std::map &getSourceVocab() const - { return source_vocabids; } - - const std::vector &getAlignments() const - { return alignColl; } - - uint64_t getKey(uint64_t source_phrase[], size_t size) const; - - template - inline bool Get(const boost::unordered_map &keyValue, const std::string &sought, T &found) const - { - boost::unordered_map::const_iterator iter = keyValue.find(sought); - if (iter == keyValue.end()) { - return false; - } - - const std::string &foundStr = iter->second; - found = Scan(foundStr); - return true; - } - -}; - -} - diff --git a/moses/TranslationModel/ProbingPT/querying.cpp b/moses/TranslationModel/ProbingPT/querying.cpp new file mode 100644 index 000000000..52cd7f516 --- /dev/null +++ b/moses/TranslationModel/ProbingPT/querying.cpp @@ -0,0 +1,142 @@ +#include "querying.hh" +#include "util/exception.hh" + +using namespace std; + +namespace Moses +{ + +QueryEngine::QueryEngine(const char * filepath) +{ + + //Create filepaths + std::string basepath(filepath); + std::string path_to_config = basepath + "/config"; + std::string path_to_hashtable = basepath + "/probing_hash.dat"; + std::string path_to_source_vocabid = basepath + "/source_vocabids"; + std::string alignPath = basepath + "/Alignments.dat"; + + if (!FileExists(path_to_config)) { + UTIL_THROW2("Binary table doesn't exist is didn't finish binarizing: " << path_to_config); + } + + ///Source phrase vocabids + read_map(source_vocabids, path_to_source_vocabid.c_str()); + + // alignments + read_alignments(alignPath); + + //Read config file + boost::unordered_map keyValue; + + std::ifstream config(path_to_config.c_str()); + std::string line; + while (getline(config, line)) { + std::vector toks = Tokenize(line, "\t"); + UTIL_THROW_IF2(toks.size() != 2, "Wrong config format:" << line); + keyValue[ toks[0] ] = toks[1]; + } + + bool found; + //Check API version: + int version; + found = Get(keyValue, "API_VERSION", version); + if (!found) { + std::cerr << "Old or corrupted version of ProbingPT. 
Please rebinarize your phrase tables." << std::endl; + } + else if (version != API_VERSION) { + std::cerr << "The ProbingPT API has changed. " << version << "!=" + << API_VERSION << " Please rebinarize your phrase tables." << std::endl; + exit(EXIT_FAILURE); + } + + //Get tablesize. + int tablesize; + found = Get(keyValue, "uniq_entries", tablesize); + if (!found) { + std::cerr << "uniq_entries not found" << std::endl; + exit(EXIT_FAILURE); + } + + //Number of scores + found = Get(keyValue, "num_scores", num_scores); + if (!found) { + std::cerr << "num_scores not found" << std::endl; + exit(EXIT_FAILURE); + } + + //How may scores from lex reordering models + found = Get(keyValue, "num_lex_scores", num_lex_scores); + if (!found) { + std::cerr << "num_lex_scores not found" << std::endl; + exit(EXIT_FAILURE); + } + + // have the scores been log() and FloorScore()? + found = Get(keyValue, "log_prob", logProb); + if (!found) { + std::cerr << "logProb not found" << std::endl; + exit(EXIT_FAILURE); + } + + config.close(); + + //Read hashtable + table_filesize = Table::Size(tablesize, 1.2); + mem = readTable(path_to_hashtable.c_str(), table_filesize); + Table table_init(mem, table_filesize); + table = table_init; + + std::cerr << "Initialized successfully! " << std::endl; +} + +QueryEngine::~QueryEngine() +{ + //Clear mmap content from memory. 
+ munmap(mem, table_filesize); + +} + +uint64_t QueryEngine::getKey(uint64_t source_phrase[], size_t size) const +{ + //TOO SLOW + //uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size()); + return Moses::getKey(source_phrase, size); +} + +std::pair QueryEngine::query(uint64_t key) +{ + std::pair ret; + + const Entry * entry; + ret.first = table.Find(key, entry); + if (ret.first) { + ret.second = entry->value; + } + return ret; +} + +void QueryEngine::read_alignments(const std::string &alignPath) +{ + std::ifstream strm(alignPath.c_str()); + + string line; + while (getline(strm, line)) { + vector toks = Tokenize(line, "\t "); + UTIL_THROW_IF2(toks.size() == 0, "Corrupt alignment file"); + + uint32_t alignInd = Scan(toks[0]); + if (alignInd >= alignColl.size()) { + alignColl.resize(alignInd + 1); + } + + Alignments &aligns = alignColl[alignInd]; + for (size_t i = 1; i < toks.size(); ++i) { + size_t pos = Scan(toks[i]); + aligns.push_back(pos); + } + } +} + +} + diff --git a/moses/TranslationModel/ProbingPT/querying.hh b/moses/TranslationModel/ProbingPT/querying.hh new file mode 100644 index 000000000..c43c7f3b9 --- /dev/null +++ b/moses/TranslationModel/ProbingPT/querying.hh @@ -0,0 +1,65 @@ +#pragma once + +#include +#include //For finding size of file +#include "vocabid.hh" +#include //toLower +#include +#include "probing_hash_utils.hh" +#include "hash.hh" //Includes line splitter +#include "line_splitter.hh" +#include "moses//Util.h" + +namespace Moses +{ + +class QueryEngine +{ + std::map source_vocabids; + + typedef std::vector Alignments; + std::vector alignColl; + + Table table; + char *mem; //Memory for the table, necessary so that we can correctly destroy the object + + size_t table_filesize; + bool is_reordering; + + void read_alignments(const std::string &alignPath); + +public: + int num_scores; + int num_lex_scores; + bool logProb; + + QueryEngine(const char *); + ~QueryEngine(); + + std::pair query(uint64_t key); + + const 
std::map &getSourceVocab() const + { return source_vocabids; } + + const std::vector &getAlignments() const + { return alignColl; } + + uint64_t getKey(uint64_t source_phrase[], size_t size) const; + + template + inline bool Get(const boost::unordered_map &keyValue, const std::string &sought, T &found) const + { + boost::unordered_map::const_iterator iter = keyValue.find(sought); + if (iter == keyValue.end()) { + return false; + } + + const std::string &foundStr = iter->second; + found = Scan(foundStr); + return true; + } + +}; + +} + -- cgit v1.2.3