diff options
author | Hieu Hoang <hieuhoang@gmail.com> | 2016-10-06 16:00:32 +0300 |
---|---|---|
committer | Hieu Hoang <hieuhoang@gmail.com> | 2016-10-06 16:00:32 +0300 |
commit | b7f1b360befe170cd9f366ee6300b0731b813e25 (patch) | |
tree | 3c9ed76628d42d7bb8a17051fd321c9f85dc1466 | |
parent | 2679c30c1b534b1d83b0cd57dc32bcf191fc48f3 (diff) | |
parent | a2fd8d5b2c43f0008a050aa850ed387b2289c9c9 (diff) |
merge
34 files changed, 1525 insertions, 1233 deletions
@@ -341,3 +341,5 @@ if [ path.exists $(TOP)/dist ] && $(prefix) != dist { local temp = [ _shell "mkdir -p $(TOP)/bin" ] ; local temp = [ _shell "rm -f $(TOP)/bin/moses_chart" ] ; local temp = [ _shell "cd $(TOP)/bin && ln -s moses moses_chart" ] ; +local temp = [ _shell "cd $(TOP)/bin && ln -s CreateProbingPT CreateProbingPT2" ] ; + diff --git a/contrib/moses2/CreateProbingPT2.cpp b/contrib/moses2/CreateProbingPT2.cpp deleted file mode 100644 index 24b0e2fd1..000000000 --- a/contrib/moses2/CreateProbingPT2.cpp +++ /dev/null @@ -1,113 +0,0 @@ -#include <string> -#include <boost/program_options.hpp> -#include "util/usage.hh" -#include "TranslationModel/ProbingPT/storing.hh" -#include "legacy/InputFileStream.h" -#include "legacy/OutputFileStream.h" -#include "legacy/Util2.h" - -using namespace std; - -std::string ReformatSCFGFile(const std::string &path); - -int main(int argc, char* argv[]) -{ - string inPath, outPath; - int num_scores = 4; - int num_lex_scores = 0; - bool log_prob = false; - bool scfg = false; - int max_cache_size = 50000; - - namespace po = boost::program_options; - po::options_description desc("Options"); - desc.add_options() - ("help", "Print help messages") - ("input-pt", po::value<string>()->required(), "Text pt") - ("output-dir", po::value<string>()->required(), "Directory when binary files will be written") - ("num-scores", po::value<int>()->default_value(num_scores), "Number of pt scores") - ("num-lex-scores", po::value<int>()->default_value(num_lex_scores), "Number of lexicalized reordering scores") - ("log-prob", "log (and floor) probabilities before storing") - ("max-cache-size", po::value<int>()->default_value(max_cache_size), "Maximum number of high-count source lines to write to cache file. 0=no cache, negative=no limit") - ("scfg", "Rules are SCFG in Moses format (ie. with non-terms and LHS") - - ; - - po::variables_map vm; - try { - po::store(po::parse_command_line(argc, argv, desc), - vm); // can throw - - /** --help option - */ - if ( vm.count("help")) { - std::cout << desc << std::endl; - return EXIT_SUCCESS; - } - - po::notify(vm); // throws on error, so do after help in case - // there are any problems - } catch(po::error& e) { - std::cerr << "ERROR: " << e.what() << std::endl << std::endl; - std::cerr << desc << std::endl; - return EXIT_FAILURE; - } - - if (vm.count("input-pt")) inPath = vm["input-pt"].as<string>(); - if (vm.count("output-dir")) outPath = vm["output-dir"].as<string>(); - if (vm.count("num-scores")) num_scores = vm["num-scores"].as<int>(); - if (vm.count("num-lex-scores")) num_lex_scores = vm["num-lex-scores"].as<int>(); - if (vm.count("max-cache-size")) max_cache_size = vm["max-cache-size"].as<int>(); - if (vm.count("log-prob")) log_prob = true; - if (vm.count("scfg")) scfg = true; - - - if (scfg) { - inPath = ReformatSCFGFile(inPath); - } - - Moses2::createProbingPT(inPath, outPath, num_scores, num_lex_scores, log_prob, max_cache_size, scfg); - - //util::PrintUsage(std::cout); - return 0; -} - -std::string ReformatSCFGFile(const std::string &path) -{ - Moses2::InputFileStream inFile(path); - string reformattedPath = path + ".reformat.gz"; - Moses2::OutputFileStream outFile(reformattedPath); - - string line; - while (getline(inFile, line)) { - vector<string> toks = Moses2::TokenizeMultiCharSeparator(line, "|||"); - assert(toks.size() >= 3); - - // source - vector<string> sourceToks = Moses2::Tokenize(toks[0], " "); - for (size_t i = 0; i < sourceToks.size() - 1; ++i) { - outFile << sourceToks[i] << " "; - } - - // other columns - for (size_t i = 1; i < toks.size(); ++i) { - outFile << "|||" << toks[i]; - } - outFile << endl; - } - - inFile.Close(); - outFile.Close(); - - string sortedPath = path + ".reformat.sorted.gz"; - string tmpPath = path + ".tmp "; - string cmd = "mkdir " + tmpPath - + " && gzip -dc " + reformattedPath + " | LC_ALL=C sort -T " + tmpPath + " | gzip -c > " + sortedPath; - system(cmd.c_str()); - - cmd = "rm -rf " + tmpPath + " " + reformattedPath; - system(cmd.c_str()); - - return sortedPath; -} - diff --git a/contrib/moses2/Jamfile b/contrib/moses2/Jamfile index 193ac8db5..850dbcd1f 100644 --- a/contrib/moses2/Jamfile +++ b/contrib/moses2/Jamfile @@ -72,7 +72,7 @@ alias deps : ../..//z ../..//boost_iostreams ../..//boost_filesystem ../../mose TranslationModel/ProbingPT/hash.cpp TranslationModel/ProbingPT/line_splitter.cpp TranslationModel/ProbingPT/probing_hash_utils.cpp - TranslationModel/ProbingPT/quering.cpp + TranslationModel/ProbingPT/querying.cpp TranslationModel/ProbingPT/storing.cpp TranslationModel/ProbingPT/StoreVocab.cpp TranslationModel/ProbingPT/StoreTarget.cpp @@ -173,11 +173,10 @@ alias deps : ../..//z ../..//boost_iostreams ../..//boost_filesystem ../../mose deps ; exe moses2 : Main.cpp moses2_lib ; -exe CreateProbingPT2 : CreateProbingPT2.cpp moses2_lib ; if [ xmlrpc ] { echo "Building Moses2" ; - alias programs : moses2 CreateProbingPT2 ; + alias programs : moses2 ; } else { echo "Not building Moses2" ; diff --git a/contrib/moses2/MemPool.h b/contrib/moses2/MemPool.h index 5820ce2be..8160aa5a3 100644 --- a/contrib/moses2/MemPool.h +++ b/contrib/moses2/MemPool.h @@ -230,6 +230,14 @@ public: //std::cerr << "destroy " << p << " " << n << std::endl; } + // return address of values + pointer address (reference value) const { + return &value; + } + const_pointer address (const_reference value) const { + return &value; + } + MemPool &m_pool; protected: }; diff --git a/contrib/moses2/TranslationModel/ProbingPT/ProbingPT.cpp b/contrib/moses2/TranslationModel/ProbingPT/ProbingPT.cpp index 829906b55..2c9a5f31a 100644 --- a/contrib/moses2/TranslationModel/ProbingPT/ProbingPT.cpp +++ b/contrib/moses2/TranslationModel/ProbingPT/ProbingPT.cpp @@ -6,7 +6,7 @@ */ #include <boost/foreach.hpp> #include "ProbingPT.h" -#include "quering.hh" +#include "querying.hh" #include "probing_hash_utils.hh" #include "util/exception.hh" #include "../../System.h" diff --git a/contrib/moses2/TranslationModel/ProbingPT/quering.cpp b/contrib/moses2/TranslationModel/ProbingPT/querying.cpp index f26439442..fb8ccef9a 100644 --- a/contrib/moses2/TranslationModel/ProbingPT/quering.cpp +++ b/contrib/moses2/TranslationModel/ProbingPT/querying.cpp @@ -1,4 +1,4 @@ -#include "quering.hh" +#include "querying.hh" #include "util/exception.hh" #include "../../legacy/Util2.h" @@ -12,10 +12,15 @@ QueryEngine::QueryEngine(const char * filepath) //Create filepaths std::string basepath(filepath); + std::string path_to_config = basepath + "/config"; std::string path_to_hashtable = basepath + "/probing_hash.dat"; std::string path_to_source_vocabid = basepath + "/source_vocabids"; std::string alignPath = basepath + "/Alignments.dat"; + if (!FileExists(path_to_config)) { + UTIL_THROW2("Binary table doesn't exist is didn't finish binarizing: " << path_to_config); + } + ///Source phrase vocabids read_map(source_vocabids, path_to_source_vocabid.c_str()); @@ -25,7 +30,7 @@ QueryEngine::QueryEngine(const char * filepath) //Read config file boost::unordered_map<std::string, std::string> keyValue; - std::ifstream config((basepath + "/config").c_str()); + std::ifstream config(path_to_config.c_str()); std::string line; while (getline(config, line)) { std::vector<std::string> toks = Moses2::Tokenize(line, "\t"); diff --git a/contrib/moses2/TranslationModel/ProbingPT/quering.hh b/contrib/moses2/TranslationModel/ProbingPT/querying.hh index aae4b4f09..aae4b4f09 100644 --- a/contrib/moses2/TranslationModel/ProbingPT/quering.hh +++ b/contrib/moses2/TranslationModel/ProbingPT/querying.hh diff --git a/contrib/other-builds/moses/.project b/contrib/other-builds/moses/.project index b59f28e08..c6b7de6f7 100644 --- a/contrib/other-builds/moses/.project +++ b/contrib/other-builds/moses/.project @@ -1319,7 +1319,7 @@ <name>FF/PhraseBoundaryFeature.h</name> <type>1</type> <locationURI>PARENT-3-PROJECT_LOC/moses/FF/PhraseBoundaryFeature.h</locationURI> - </link> + </link> <link> <name>FF/PhraseDistanceFeature.cpp</name> <type>1</type> @@ -3341,24 +3341,34 @@ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/ProbingPT.h</locationURI> </link> <link> - <name>TranslationModel/ProbingPT/hash.cpp</name> + <name>TranslationModel/ProbingPT/StoreTarget.cpp</name> <type>1</type> - <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/hash.cpp</locationURI> + <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/StoreTarget.cpp</locationURI> </link> <link> - <name>TranslationModel/ProbingPT/hash.hh</name> + <name>TranslationModel/ProbingPT/StoreTarget.h</name> <type>1</type> - <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/hash.hh</locationURI> + <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/StoreTarget.h</locationURI> </link> <link> - <name>TranslationModel/ProbingPT/huffmanish.cpp</name> + <name>TranslationModel/ProbingPT/StoreVocab.cpp</name> <type>1</type> - <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/huffmanish.cpp</locationURI> + <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/StoreVocab.cpp</locationURI> </link> <link> - <name>TranslationModel/ProbingPT/huffmanish.hh</name> + <name>TranslationModel/ProbingPT/StoreVocab.h</name> <type>1</type> - <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/huffmanish.hh</locationURI> + <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/StoreVocab.h</locationURI> + </link> + <link> + <name>TranslationModel/ProbingPT/hash.cpp</name> + <type>1</type> + <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/hash.cpp</locationURI> + </link> + <link> + <name>TranslationModel/ProbingPT/hash.hh</name> + <type>1</type> + <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/hash.hh</locationURI> </link> <link> <name>TranslationModel/ProbingPT/line_splitter.cpp</name> @@ -3381,14 +3391,14 @@ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/probing_hash_utils.hh</locationURI> </link> <link> - <name>TranslationModel/ProbingPT/quering.cpp</name> + <name>TranslationModel/ProbingPT/querying.cpp</name> <type>1</type> - <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/quering.cpp</locationURI> + <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/querying.cpp</locationURI> </link> <link> - <name>TranslationModel/ProbingPT/quering.hh</name> + <name>TranslationModel/ProbingPT/querying.hh</name> <type>1</type> - <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/quering.hh</locationURI> + <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/querying.hh</locationURI> </link> <link> <name>TranslationModel/ProbingPT/storing.cpp</name> @@ -3664,7 +3674,7 @@ <name>TranslationModel/UG/sapt_pscore_coherence.h</name> <type>1</type> <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_coherence.h</locationURI> - </link> + </link> <link> <name>TranslationModel/UG/sapt_pscore_lex1.h</name> <type>1</type> @@ -3709,7 +3719,7 @@ <name>TranslationModel/UG/sapt_pscore_wordcount.h</name> <type>1</type> <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_wordcount.h</locationURI> - </link> + </link> <link> <name>TranslationModel/UG/sim-pe.cc</name> <type>1</type> diff --git a/misc/CreateProbingPT.cpp b/misc/CreateProbingPT.cpp index b23427f30..dff916660 100644 --- a/misc/CreateProbingPT.cpp +++ b/misc/CreateProbingPT.cpp @@ -1,29 +1,113 @@ +#include <string> +#include <boost/program_options.hpp> #include "util/usage.hh" #include "moses/TranslationModel/ProbingPT/storing.hh" +#include "moses/InputFileStream.h" +#include "moses/OutputFileStream.h" +#include "moses/Util.h" +using namespace std; +std::string ReformatSCFGFile(const std::string &path); int main(int argc, char* argv[]) { + string inPath, outPath; + int num_scores = 4; + int num_lex_scores = 0; + bool log_prob = false; + bool scfg = false; + int max_cache_size = 50000; - const char * is_reordering = "false"; + namespace po = boost::program_options; + po::options_description desc("Options"); + desc.add_options() + ("help", "Print help messages") + ("input-pt", po::value<string>()->required(), "Text pt") + ("output-dir", po::value<string>()->required(), "Directory when binary files will be written") + ("num-scores", po::value<int>()->default_value(num_scores), "Number of pt scores") + ("num-lex-scores", po::value<int>()->default_value(num_lex_scores), "Number of lexicalized reordering scores") + ("log-prob", "log (and floor) probabilities before storing") + ("max-cache-size", po::value<int>()->default_value(max_cache_size), "Maximum number of high-count source lines to write to cache file. 0=no cache, negative=no limit") + ("scfg", "Rules are SCFG in Moses format (ie. with non-terms and LHS") - if (!(argc == 5 || argc == 4)) { - // Tell the user how to run the program - std::cerr << "Provided " << argc << " arguments, needed 4 or 5." << std::endl; - std::cerr << "Usage: " << argv[0] << " path_to_phrasetable output_dir num_scores is_reordering" << std::endl; - std::cerr << "is_reordering should be either true or false, but it is currently a stub feature." << std::endl; - //std::cerr << "Usage: " << argv[0] << " path_to_phrasetable number_of_uniq_lines output_bin_file output_hash_table output_vocab_id" << std::endl; - return 1; + ; + + po::variables_map vm; + try { + po::store(po::parse_command_line(argc, argv, desc), + vm); // can throw + + /** --help option + */ + if ( vm.count("help")) { + std::cout << desc << std::endl; + return EXIT_SUCCESS; + } + + po::notify(vm); // throws on error, so do after help in case + // there are any problems + } catch(po::error& e) { + std::cerr << "ERROR: " << e.what() << std::endl << std::endl; + std::cerr << desc << std::endl; + return EXIT_FAILURE; } - if (argc == 5) { - is_reordering = argv[4]; + if (vm.count("input-pt")) inPath = vm["input-pt"].as<string>(); + if (vm.count("output-dir")) outPath = vm["output-dir"].as<string>(); + if (vm.count("num-scores")) num_scores = vm["num-scores"].as<int>(); + if (vm.count("num-lex-scores")) num_lex_scores = vm["num-lex-scores"].as<int>(); + if (vm.count("max-cache-size")) max_cache_size = vm["max-cache-size"].as<int>(); + if (vm.count("log-prob")) log_prob = true; + if (vm.count("scfg")) scfg = true; + + + if (scfg) { + inPath = ReformatSCFGFile(inPath); } - createProbingPT(argv[1], argv[2], argv[3], is_reordering); + Moses::createProbingPT(inPath, outPath, num_scores, num_lex_scores, log_prob, max_cache_size, scfg); - util::PrintUsage(std::cout); + //util::PrintUsage(std::cout); return 0; } +std::string ReformatSCFGFile(const std::string &path) +{ + Moses::InputFileStream inFile(path); + string reformattedPath = path + ".reformat.gz"; + Moses::OutputFileStream outFile(reformattedPath); + + string line; + while (getline(inFile, line)) { + vector<string> toks = Moses::TokenizeMultiCharSeparator(line, "|||"); + assert(toks.size() >= 3); + + // source + vector<string> sourceToks = Moses::Tokenize(toks[0], " "); + for (size_t i = 0; i < sourceToks.size() - 1; ++i) { + outFile << sourceToks[i] << " "; + } + + // other columns + for (size_t i = 1; i < toks.size(); ++i) { + outFile << "|||" << toks[i]; + } + outFile << endl; + } + + inFile.Close(); + outFile.Close(); + + string sortedPath = path + ".reformat.sorted.gz"; + string tmpPath = path + ".tmp "; + string cmd = "mkdir " + tmpPath + + " && gzip -dc " + reformattedPath + " | LC_ALL=C sort -T " + tmpPath + " | gzip -c > " + sortedPath; + system(cmd.c_str()); + + cmd = "rm -rf " + tmpPath + " " + reformattedPath; + system(cmd.c_str()); + + return sortedPath; +} + diff --git a/misc/Jamfile b/misc/Jamfile index f1599aca8..135490a46 100644 --- a/misc/Jamfile +++ b/misc/Jamfile @@ -31,9 +31,9 @@ else { } exe CreateProbingPT : CreateProbingPT.cpp ..//boost_filesystem ../moses//moses ; -exe QueryProbingPT : QueryProbingPT.cpp ..//boost_filesystem ../moses//moses ; +#exe QueryProbingPT : QueryProbingPT.cpp ..//boost_filesystem ../moses//moses ; -alias programsProbing : CreateProbingPT QueryProbingPT ; +alias programsProbing : CreateProbingPT ; #QueryProbingPT exe merge-sorted : merge-sorted.cc diff --git a/misc/QueryProbingPT.cpp b/misc/QueryProbingPT.cpp index 72fd0be11..5047d4d47 100644 --- a/misc/QueryProbingPT.cpp +++ b/misc/QueryProbingPT.cpp @@ -34,7 +34,7 @@ int main(int argc, char* argv[]) return 1; } - QueryEngine queries(argv[1]); + Moses::QueryEngine queries(argv[1]); //Interactive search std::cout << "Please enter a string to be searched, or exit to exit." << std::endl; diff --git a/moses/ScoreComponentCollection.h b/moses/ScoreComponentCollection.h index 1305e9c16..0ab57a73a 100644 --- a/moses/ScoreComponentCollection.h +++ b/moses/ScoreComponentCollection.h @@ -247,6 +247,15 @@ public: } } + void PlusEquals(const FeatureFunction* sp, float scores[]) + { + size_t numScores = sp->GetNumScoreComponents(); + size_t offset = sp->GetIndex(); + for (size_t i = 0; i < numScores; ++i) { + m_scores[i + offset] += scores[i]; + } + } + //! Special version PlusEquals(ScoreProducer, vector<float>) //! to add the score from a single ScoreProducer that produces //! a single value diff --git a/moses/TranslationModel/ProbingPT/ProbingPT.cpp b/moses/TranslationModel/ProbingPT/ProbingPT.cpp index cbfd2c1a4..8b4505985 100644 --- a/moses/TranslationModel/ProbingPT/ProbingPT.cpp +++ b/moses/TranslationModel/ProbingPT/ProbingPT.cpp @@ -3,8 +3,9 @@ #include "moses/StaticData.h" #include "moses/FactorCollection.h" #include "moses/TargetPhraseCollection.h" +#include "moses/InputFileStream.h" #include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.h" -#include "quering.hh" +#include "querying.hh" using namespace std; @@ -34,44 +35,94 @@ void ProbingPT::Load(AllOptions::ptr const& opts) m_unkId = 456456546456; + FactorCollection &vocab = FactorCollection::Instance(); + // source vocab - const std::map<uint64_t, std::string> &sourceVocab = m_engine->getSourceVocab(); + const std::map<uint64_t, std::string> &sourceVocab = + m_engine->getSourceVocab(); std::map<uint64_t, std::string>::const_iterator iterSource; - for (iterSource = sourceVocab.begin(); iterSource != sourceVocab.end(); ++iterSource) { - const string &wordStr = iterSource->second; - const Factor *factor = FactorCollection::Instance().AddFactor(wordStr); + for (iterSource = sourceVocab.begin(); iterSource != sourceVocab.end(); + ++iterSource) { + string wordStr = iterSource->second; + //cerr << "wordStr=" << wordStr << endl; - uint64_t probingId = iterSource->first; + const Factor *factor = vocab.AddFactor(wordStr); - SourceVocabMap::value_type entry(factor, probingId); - m_sourceVocabMap.insert(entry); + uint64_t probingId = iterSource->first; + size_t factorId = factor->GetId(); + if (factorId >= m_sourceVocab.size()) { + m_sourceVocab.resize(factorId + 1, m_unkId); + } + m_sourceVocab[factorId] = probingId; } // target vocab - const std::map<unsigned int, std::string> &probingVocab = m_engine->getVocab(); - std::map<unsigned int, std::string>::const_iterator iter; - for (iter = probingVocab.begin(); iter != probingVocab.end(); ++iter) { - const string &wordStr = iter->second; - const Factor *factor = FactorCollection::Instance().AddFactor(wordStr); + InputFileStream targetVocabStrme(m_filePath + "/TargetVocab.dat"); + string line; + while (getline(targetVocabStrme, line)) { + vector<string> toks = Tokenize(line, "\t"); + UTIL_THROW_IF2(toks.size() != 2, string("Incorrect format:") + line + "\n"); + + //cerr << "wordStr=" << toks[0] << endl; + + const Factor *factor = vocab.AddFactor(toks[0]); + uint32_t probingId = Scan<uint32_t>(toks[1]); + + if (probingId >= m_targetVocab.size()) { + m_targetVocab.resize(probingId + 1); + } + + m_targetVocab[probingId] = factor; + } + + // alignments + CreateAlignmentMap(m_filePath + "/Alignments.dat"); - unsigned int probingId = iter->first; + // memory mapped file to tps + string filePath = m_filePath + "/TargetColl.dat"; + file.open(filePath.c_str()); + if (!file.is_open()) { + throw "Couldn't open file "; + } + + data = file.data(); + //size_t size = file.size(); + + // cache + //CreateCache(system); - TargetVocabMap::value_type entry(factor, probingId); - m_vocabMap.insert(entry); +} +void ProbingPT::CreateAlignmentMap(const std::string path) +{ + const std::vector< std::vector<unsigned char> > &probingAlignColl = m_engine->getAlignments(); + m_aligns.resize(probingAlignColl.size(), NULL); + + for (size_t i = 0; i < probingAlignColl.size(); ++i) { + AlignmentInfo::CollType aligns; + + const std::vector<unsigned char> &probingAligns = probingAlignColl[i]; + for (size_t j = 0; j < probingAligns.size(); j += 2) { + size_t startPos = probingAligns[j]; + size_t endPos = probingAligns[j+1]; + //cerr << "startPos=" << startPos << " " << endPos << endl; + aligns.insert(std::pair<size_t,size_t>(startPos, endPos)); + } + + const AlignmentInfo *align = AlignmentInfoCollection::Instance().Add(aligns); + m_aligns[i] = align; + //cerr << "align=" << align->Debug(system) << endl; } } void ProbingPT::InitializeForInput(ttasksptr const& ttask) { - ReduceCache(); + } void ProbingPT::GetTargetPhraseCollectionBatch(const InputPathList &inputPathQueue) const { - CacheColl &cache = GetCache(); - InputPathList::const_iterator iter; for (iter = inputPathQueue.begin(); iter != inputPathQueue.end(); ++iter) { InputPath &inputPath = **iter; @@ -82,133 +133,206 @@ void ProbingPT::GetTargetPhraseCollectionBatch(const InputPathList &inputPathQue } TargetPhraseCollection::shared_ptr tpColl = CreateTargetPhrase(sourcePhrase); + inputPath.SetTargetPhrases(*this, tpColl, NULL); + } +} - // add target phrase to phrase-table cache - size_t hash = hash_value(sourcePhrase); - std::pair<TargetPhraseCollection::shared_ptr , clock_t> value(tpColl, clock()); - cache[hash] = value; +TargetPhraseCollection::shared_ptr ProbingPT::CreateTargetPhrase(const Phrase &sourcePhrase) const +{ + // create a target phrase from the 1st word of the source, prefix with 'ProbingPT:' + assert(sourcePhrase.GetSize()); - inputPath.SetTargetPhrases(*this, tpColl, NULL); + std::pair<bool, uint64_t> keyStruct = GetKey(sourcePhrase); + if (!keyStruct.first) { + return TargetPhraseCollection::shared_ptr(); + } + + // check in cache + CachePb::const_iterator iter = m_cachePb.find(keyStruct.second); + if (iter != m_cachePb.end()) { + //cerr << "FOUND IN CACHE " << keyStruct.second << " " << sourcePhrase.Debug(mgr.system) << endl; + TargetPhraseCollection *tps = iter->second; + return TargetPhraseCollection::shared_ptr(tps); + } + + // query pt + TargetPhraseCollection *tps = CreateTargetPhrases(sourcePhrase, + keyStruct.second); + return TargetPhraseCollection::shared_ptr(tps); +} + +std::pair<bool, uint64_t> ProbingPT::GetKey(const Phrase &sourcePhrase) const +{ + std::pair<bool, uint64_t> ret; + + // create a target phrase from the 1st word of the source, prefix with 'ProbingPT:' + size_t sourceSize = sourcePhrase.GetSize(); + assert(sourceSize); + + uint64_t probingSource[sourceSize]; + GetSourceProbingIds(sourcePhrase, ret.first, probingSource); + if (!ret.first) { + // source phrase contains a word unknown in the pt. + // We know immediately there's no translation for it + } + else { + ret.second = m_engine->getKey(probingSource, sourceSize); } + + return ret; + } -std::vector<uint64_t> ProbingPT::ConvertToProbingSourcePhrase(const Phrase &sourcePhrase, bool &ok) const +void ProbingPT::GetSourceProbingIds(const Phrase &sourcePhrase, + bool &ok, uint64_t probingSource[]) const { + size_t size = sourcePhrase.GetSize(); - std::vector<uint64_t> ret(size); for (size_t i = 0; i < size; ++i) { - const Factor *factor = sourcePhrase.GetFactor(i, m_input[0]); - uint64_t probingId = GetSourceProbingId(factor); + const Word &word = sourcePhrase.GetWord(i); + uint64_t probingId = GetSourceProbingId(word); if (probingId == m_unkId) { ok = false; - return ret; - } else { - ret[i] = probingId; + return; + } + else { + probingSource[i] = probingId; } } ok = true; - return ret; } -TargetPhraseCollection::shared_ptr ProbingPT::CreateTargetPhrase(const Phrase &sourcePhrase) const +uint64_t ProbingPT::GetSourceProbingId(const Word &word) const { - // create a target phrase from the 1st word of the source, prefix with 'ProbingPT:' - assert(sourcePhrase.GetSize()); + uint64_t ret = 0; - TargetPhraseCollection::shared_ptr tpColl; - bool ok; - vector<uint64_t> probingSource = ConvertToProbingSourcePhrase(sourcePhrase, ok); - if (!ok) { - // source phrase contains a word unknown in the pt. - // We know immediately there's no translation for it - return tpColl; + for (size_t i = 0; i < m_input.size(); ++i) { + FactorType factorType = m_input[i]; + const Factor *factor = word[factorType]; + + size_t factorId = factor->GetId(); + if (factorId >= m_sourceVocab.size()) { + return m_unkId; + } + ret += m_sourceVocab[factorId]; } - std::pair<bool, std::vector<target_text> > query_result; + return ret; +} + +TargetPhraseCollection *ProbingPT::CreateTargetPhrases( + const Phrase &sourcePhrase, uint64_t key) const +{ + TargetPhraseCollection *tps = NULL; //Actual lookup - query_result = m_engine->query(probingSource); + std::pair<bool, uint64_t> query_result; // 1st=found, 2nd=target file offset + query_result = m_engine->query(key); + //cerr << "key2=" << query_result.second << endl; if (query_result.first) { - //m_engine->printTargetInfo(query_result.second); - tpColl.reset(new TargetPhraseCollection()); + const char *offset = data + query_result.second; + uint64_t *numTP = (uint64_t*) offset; + + tps = new TargetPhraseCollection(); + + offset += sizeof(uint64_t); + for (size_t i = 0; i < *numTP; ++i) { + TargetPhrase *tp = CreateTargetPhrase(offset); + assert(tp); + tp->EvaluateInIsolation(sourcePhrase, GetFeaturesToApply()); - const std::vector<target_text> &probingTargetPhrases = query_result.second; - for (size_t i = 0; i < probingTargetPhrases.size(); ++i) { - const target_text &probingTargetPhrase = probingTargetPhrases[i]; - TargetPhrase *tp = CreateTargetPhrase(sourcePhrase, probingTargetPhrase); + tps->Add(tp); - tpColl->Add(tp); } - tpColl->Prune(true, m_tableLimit); + tps->Prune(true, m_tableLimit); + //cerr << *tps << endl; } - return tpColl; + return tps; + } -TargetPhrase *ProbingPT::CreateTargetPhrase(const Phrase &sourcePhrase, const target_text &probingTargetPhrase) const +TargetPhrase *ProbingPT::CreateTargetPhrase( + const char *&offset) const { - const std::vector<unsigned int> &probingPhrase = probingTargetPhrase.target_phrase; - size_t size = probingPhrase.size(); + TargetPhraseInfo *tpInfo = (TargetPhraseInfo*) offset; + size_t numRealWords = tpInfo->numWords / m_output.size(); TargetPhrase *tp = new TargetPhrase(this); - // words - for (size_t i = 0; i < size; ++i) { - uint64_t probingId = probingPhrase[i]; - const Factor *factor = GetTargetFactor(probingId); - assert(factor); + offset += sizeof(TargetPhraseInfo); - Word &word = tp->AddWord(); - word.SetFactor(m_output[0], factor); - } + // scores + float *scores = (float*) offset; - // score for this phrase table - vector<float> scores = probingTargetPhrase.prob; - std::transform(scores.begin(), scores.end(), scores.begin(),TransformScore); - tp->GetScoreBreakdown().PlusEquals(this, scores); + size_t totalNumScores = m_engine->num_scores + m_engine->num_lex_scores; - // alignment - /* - const std::vector<unsigned char> &alignments = probingTargetPhrase.word_all1; + if (m_engine->logProb) { + // set pt score for rule + tp->GetScoreBreakdown().PlusEquals(this, scores); - AlignmentInfo &aligns = tp->GetAlignTerm(); - for (size_t i = 0; i < alignS.size(); i += 2 ) { - aligns.Add((size_t) alignments[i], (size_t) alignments[i+1]); + // save scores for other FF, eg. lex RO. Just give the offset + /* + if (m_engine->num_lex_scores) { + tp->scoreProperties = scores + m_engine->num_scores; + } + */ } - */ + else { + // log score 1st + float logScores[totalNumScores]; + for (size_t i = 0; i < totalNumScores; ++i) { + logScores[i] = FloorScore(TransformScore(scores[i])); + } - // score of all other ff when this rule is being loaded - tp->EvaluateInIsolation(sourcePhrase, GetFeaturesToApply()); - return tp; -} + // set pt score for rule + tp->GetScoreBreakdown().PlusEquals(this, logScores); -const Factor *ProbingPT::GetTargetFactor(uint64_t probingId) const -{ - TargetVocabMap::right_map::const_iterator iter; - iter = m_vocabMap.right.find(probingId); - if (iter != m_vocabMap.right.end()) { - return iter->second; - } else { - // not in mapping. Must be UNK - return NULL; + // save scores for other FF, eg. lex RO. + /* + tp->scoreProperties = pool.Allocate<SCORE>(m_engine->num_lex_scores); + for (size_t i = 0; i < m_engine->num_lex_scores; ++i) { + tp->scoreProperties[i] = logScores[i + m_engine->num_scores]; + } + */ } -} -uint64_t ProbingPT::GetSourceProbingId(const Factor *factor) const -{ - SourceVocabMap::left_map::const_iterator iter; - iter = m_sourceVocabMap.left.find(factor); - if (iter != m_sourceVocabMap.left.end()) { - return iter->second; - } else { - // not in mapping. Must be UNK - return m_unkId; + offset += sizeof(float) * totalNumScores; + + // words + for (size_t targetPos = 0; targetPos < numRealWords; ++targetPos) { + Word &word = tp->AddWord(); + for (size_t i = 0; i < m_output.size(); ++i) { + FactorType factorType = m_output[i]; + + uint32_t *probingId = (uint32_t*) offset; + + const Factor *factor = GetTargetFactor(*probingId); + assert(factor); + + word[factorType] = factor; + + offset += sizeof(uint32_t); + } } + + // align + uint32_t alignTerm = tpInfo->alignTerm; + //cerr << "alignTerm=" << alignTerm << endl; + UTIL_THROW_IF2(alignTerm >= m_aligns.size(), "Unknown alignInd"); + tp->SetAlignTerm(m_aligns[alignTerm]); + + // properties TODO + + return tp; } +////////////////////////////////////////////////////////////////// + + ChartRuleLookupManager *ProbingPT::CreateRuleLookupManager( const ChartParser &, const ChartCellCollectionBase &, diff --git a/moses/TranslationModel/ProbingPT/ProbingPT.h b/moses/TranslationModel/ProbingPT/ProbingPT.h index 4e7ab02c6..21c01df28 100644 --- a/moses/TranslationModel/ProbingPT/ProbingPT.h +++ b/moses/TranslationModel/ProbingPT/ProbingPT.h @@ -1,17 +1,18 @@ #pragma once - +#include <boost/iostreams/device/mapped_file.hpp> #include <boost/bimap.hpp> +#include <boost/unordered_map.hpp> #include "../PhraseDictionary.h" -class QueryEngine; -class target_text; namespace Moses { class ChartParser; class ChartCellCollectionBase; class ChartRuleLookupManager; +class QueryEngine; +class target_text; class ProbingPT : public PhraseDictionary { @@ -39,21 +40,42 @@ public: protected: QueryEngine *m_engine; + uint64_t m_unkId; - typedef boost::bimap<const Factor *, uint64_t> SourceVocabMap; - mutable SourceVocabMap m_sourceVocabMap; + std::vector<uint64_t> m_sourceVocab; // factor id -> pt id + std::vector<const Factor*> m_targetVocab; // pt id -> factor* + std::vector<const AlignmentInfo*> m_aligns; - typedef boost::bimap<const Factor *, unsigned int> TargetVocabMap; - mutable TargetVocabMap m_vocabMap; + boost::iostreams::mapped_file_source file; + const char *data; + + // caching + typedef boost::unordered_map<uint64_t, TargetPhraseCollection*> CachePb; + CachePb m_cachePb; + + void CreateAlignmentMap(const std::string path); TargetPhraseCollection::shared_ptr CreateTargetPhrase(const Phrase &sourcePhrase) const; - TargetPhrase *CreateTargetPhrase(const Phrase &sourcePhrase, const target_text &probingTargetPhrase) const; - const Factor *GetTargetFactor(uint64_t probingId) const; + + std::pair<bool, uint64_t> GetKey(const Phrase &sourcePhrase) const; + void GetSourceProbingIds(const Phrase &sourcePhrase, bool &ok, + uint64_t probingSource[]) const; + uint64_t GetSourceProbingId(const Word &word) const; uint64_t GetSourceProbingId(const Factor *factor) const; - std::vector<uint64_t> ConvertToProbingSourcePhrase(const Phrase &sourcePhrase, bool &ok) const; + TargetPhraseCollection *CreateTargetPhrases( + const Phrase &sourcePhrase, uint64_t key) const; + TargetPhrase *CreateTargetPhrase( + const char *&offset) const; + + inline const Factor *GetTargetFactor(uint32_t probingId) const + { + if (probingId >= m_targetVocab.size()) { + return NULL; + } + return m_targetVocab[probingId]; + } - uint64_t m_unkId; }; } // namespace Moses diff --git a/moses/TranslationModel/ProbingPT/StoreTarget.cpp b/moses/TranslationModel/ProbingPT/StoreTarget.cpp new file mode 100644 index 000000000..8072f408b --- /dev/null +++ b/moses/TranslationModel/ProbingPT/StoreTarget.cpp @@ -0,0 +1,266 @@ +/* + * StoreTarget.cpp + * + * Created on: 19 Jan 2016 + * Author: hieu + */ +#include <boost/foreach.hpp> +#include "StoreTarget.h" +#include "line_splitter.hh" +#include "probing_hash_utils.hh" +#include "moses/OutputFileStream.h" +#include "moses/Util.h" + +using namespace std; + +namespace Moses +{ + +StoreTarget::StoreTarget(const std::string &basepath) +:m_basePath(basepath) +,m_vocab(basepath + "/TargetVocab.dat") +{ + std::string path = basepath + "/TargetColl.dat"; + m_fileTargetColl.open(path.c_str(), + std::ios::out | std::ios::binary | std::ios::ate | std::ios::trunc); + if (!m_fileTargetColl.is_open()) { + throw "can't create file "; + } + +} + +StoreTarget::~StoreTarget() +{ + assert(m_coll.empty()); + m_fileTargetColl.close(); + + // vocab + m_vocab.Save(); +} + +uint64_t StoreTarget::Save() +{ + uint64_t ret = m_fileTargetColl.tellp(); + + // save to disk + uint64_t numTP = m_coll.size(); + m_fileTargetColl.write((char*) &numTP, sizeof(uint64_t)); + + for (size_t i = 0; i < m_coll.size(); ++i) { + Save(*m_coll[i]); + } + + // clear coll + RemoveAllInColl(m_coll); + m_coll.clear(); + + // starting position of coll + return ret; +} + +void StoreTarget::Save(const target_text &rule) +{ + // metadata for each tp + TargetPhraseInfo tpInfo; + tpInfo.alignTerm = GetAlignId(rule.word_align_term); + tpInfo.alignNonTerm = GetAlignId(rule.word_align_non_term); + tpInfo.numWords = rule.target_phrase.size(); + tpInfo.propLength = rule.property.size(); + + //cerr << "TPInfo=" << sizeof(TPInfo); + m_fileTargetColl.write((char*) &tpInfo, sizeof(TargetPhraseInfo)); + + // scores + for (size_t i = 0; i < rule.prob.size(); ++i) { + float prob = rule.prob[i]; + m_fileTargetColl.write((char*) &prob, sizeof(prob)); + } + + // tp + for (size_t i = 0; i < rule.target_phrase.size(); ++i) { + uint32_t vocabId = rule.target_phrase[i]; + m_fileTargetColl.write((char*) &vocabId, sizeof(vocabId)); + } + + // prop TODO + +} + +void StoreTarget::SaveAlignment() +{ + std::string path = m_basePath + "/Alignments.dat"; + OutputFileStream file(path); + + BOOST_FOREACH(Alignments::value_type &valPair, m_aligns) { + file << valPair.second << "\t"; + + const std::vector<size_t> &aligns = valPair.first; + BOOST_FOREACH(size_t align, aligns) { + file << align << " "; + } + file << endl; + } + +} + +void StoreTarget::Append(const line_text &line, bool log_prob, bool scfg) +{ + target_text *rule = new target_text; + //cerr << "line.target_phrase=" << line.target_phrase << endl; + + // target_phrase + vector<bool> nonTerms; + util::TokenIter<util::SingleCharacter> it; + it = util::TokenIter<util::SingleCharacter>(line.target_phrase, + util::SingleCharacter(' ')); + while (it) { + StringPiece word = *it; + //cerr << "word=" << word << endl; + + bool nonTerm = false; + if (scfg) { + // not really sure how to handle factored SCFG and NT + if (scfg && word[0] == '[' && word[word.size() - 1] == ']') { + //cerr << "NON-TERM=" << tok << " " << nonTerms.size() << endl; + nonTerm = true; + } + nonTerms.push_back(nonTerm); + } + + util::TokenIter<util::SingleCharacter> itFactor; + itFactor = util::TokenIter<util::SingleCharacter>(word, + util::SingleCharacter('|')); + while (itFactor) { + StringPiece factor = *itFactor; + + string factorStr = factor.as_string(); + uint32_t vocabId = m_vocab.GetVocabId(factorStr); + + rule->target_phrase.push_back(vocabId); + + itFactor++; + } + + it++; + } + + // probs + it = util::TokenIter<util::SingleCharacter>(line.prob, + util::SingleCharacter(' ')); + while (it) { + string tok = it->as_string(); + float prob = Scan<float>(tok); + + if (log_prob) { + prob = FloorScore(log(prob)); + if (prob == 0.0f) prob = 0.0000000001; + } + + rule->prob.push_back(prob); + it++; + } + + /* + cerr << "nonTerms="; + for (size_t i = 0; i < nonTerms.size(); ++i) { + cerr << nonTerms[i] << " "; + } + cerr << endl; + */ + + // alignment + it = util::TokenIter<util::SingleCharacter>(line.word_align, + util::SingleCharacter(' ')); + while (it) { + string tokPair = Trim(it->as_string()); + if (tokPair.empty()) { + break; + } + + vector<size_t> alignPair = Tokenize<size_t>(tokPair, "-"); + assert(alignPair.size() == 2); + + bool nonTerm = false; + size_t sourcePos = alignPair[0]; + size_t targetPos = alignPair[1]; + if (scfg) { + nonTerm = nonTerms[targetPos]; + } + + //cerr << targetPos << "=" << nonTerm << endl; + + if (nonTerm) { + rule->word_align_non_term.push_back(sourcePos); + rule->word_align_non_term.push_back(targetPos); + //cerr << (int) rule->word_all1.back() << " "; + } + else { + rule->word_align_term.push_back(sourcePos); + rule->word_align_term.push_back(targetPos); + } + + it++; + } + + // extra scores + string prop = line.property.as_string(); + AppendLexRO(prop, rule->prob, log_prob); + + //cerr << "line.property=" << line.property << endl; + //cerr << "prop=" << prop << endl; + + // properties + /* + for (size_t i = 0; i < prop.size(); ++i) { + rule->property.push_back(prop[i]); + } + */ + m_coll.push_back(rule); +} + +uint32_t StoreTarget::GetAlignId(const std::vector<size_t> &align) +{ + boost::unordered_map<std::vector<size_t>, uint32_t>::iterator iter = + m_aligns.find(align); + if (iter == m_aligns.end()) { + uint32_t ind = m_aligns.size(); + m_aligns[align] = ind; + return ind; + } + else { + return iter->second; + } +} + +void StoreTarget::AppendLexRO(std::string &prop, std::vector<float> &retvector, + bool log_prob) const +{ + size_t startPos = prop.find("{{LexRO "); + + if (startPos != string::npos) { + size_t endPos = prop.find("}}", startPos + 8); + string lexProb = prop.substr(startPos + 8, endPos - startPos - 8); + //cerr << "lexProb=" << lexProb << endl; + + // append lex probs to pt probs + vector<float> scores = Tokenize<float>(lexProb); + + if (log_prob) { + for (size_t i = 0; i < scores.size(); ++i) { + scores[i] = FloorScore(log(scores[i])); + if (scores[i] == 0.0f) scores[i] = 0.0000000001; + } + } + + for (size_t i = 0; i < scores.size(); ++i) { + retvector.push_back(scores[i]); + } + + // exclude LexRO property from property column + prop = prop.substr(0, startPos) + + prop.substr(endPos + 2, prop.size() - endPos - 2); + //cerr << "line.property_to_be_binarized=" << line.property_to_be_binarized << "AAAA" << endl; + } +} + +} /* namespace Moses2 */ diff --git a/moses/TranslationModel/ProbingPT/StoreTarget.h b/moses/TranslationModel/ProbingPT/StoreTarget.h new file mode 100644 index 000000000..5c7d9e1b7 --- /dev/null +++ b/moses/TranslationModel/ProbingPT/StoreTarget.h @@ -0,0 +1,51 @@ +/* + * StoreTarget.h + * + * Created on: 19 Jan 2016 + * Author: hieu + */ +#pragma once +#include <string> +#include <fstream> +#include <vector> +#include <inttypes.h> +#include <boost/unordered_map.hpp> +#include <boost/unordered_set.hpp> +#include "StoreVocab.h" + +namespace Moses +{ + +class line_text; +class target_text; + +class StoreTarget +{ +public: + StoreTarget(const std::string &basepath); + virtual ~StoreTarget(); + + uint64_t Save(); + void SaveAlignment(); + + void Append(const line_text &line, bool log_prob, bool scfg); +protected: + std::string m_basePath; + std::fstream m_fileTargetColl; + StoreVocab<uint32_t> m_vocab; + + typedef boost::unordered_map<std::vector<size_t>, uint32_t> Alignments; + Alignments m_aligns; + + std::vector<target_text*> m_coll; + + uint32_t GetAlignId(const std::vector<size_t> &align); + void Save(const target_text &rule); + + void AppendLexRO(std::string &prop, std::vector<float> &retvector, + bool log_prob) const; + +}; + +} /* namespace Moses2 */ + diff --git a/moses/TranslationModel/ProbingPT/StoreVocab.cpp b/moses/TranslationModel/ProbingPT/StoreVocab.cpp new file mode 100644 index 000000000..6515bac63 --- /dev/null +++ b/moses/TranslationModel/ProbingPT/StoreVocab.cpp @@ -0,0 +1,13 @@ +/* + * StoreVocab.cpp + * + * Created on: 15 Jun 2016 + * Author: hieu + */ +#include <fstream> +#include "StoreVocab.h" + +namespace Moses +{ + +} /* namespace Moses2 */ diff --git a/moses/TranslationModel/ProbingPT/StoreVocab.h b/moses/TranslationModel/ProbingPT/StoreVocab.h new file mode 100644 index 000000000..05d279f4c --- /dev/null +++ b/moses/TranslationModel/ProbingPT/StoreVocab.h @@ -0,0 +1,64 @@ +/* + * StoreVocab.h + * + * Created on: 15 Jun 2016 + * Author: hieu + */ +#pragma once +#include <string> +#include <boost/unordered_map.hpp> +#include "moses/OutputFileStream.h" +#include "moses/Util.h" + +namespace Moses +{ + +template<typename VOCABID> +class StoreVocab +{ +protected: + std::string m_path; + + typedef boost::unordered_map<std::string, VOCABID> Coll; + Coll m_vocab; + +public: + StoreVocab(const std::string &path) + :m_path(path) + {} + + virtual ~StoreVocab() {} + + VOCABID GetVocabId(const std::string &word) + { + typename Coll::iterator iter = m_vocab.find(word); + if (iter == m_vocab.end()) { + VOCABID ind = m_vocab.size() + 1; + m_vocab[word] = ind; + return ind; + } + else { + return iter->second; + } + } + + void Insert(VOCABID id, const std::string &word) + { + m_vocab[word] = id; + } + + void Save() + { + OutputFileStream strme(m_path); + + typename Coll::const_iterator iter; + for (iter = m_vocab.begin(); iter != m_vocab.end(); ++iter) { + strme << iter->first << "\t" << iter->second << std::endl; + } + + strme.Close(); + } +}; + +} /* namespace Moses2 */ + diff --git a/moses/TranslationModel/ProbingPT/hash.cpp b/moses/TranslationModel/ProbingPT/hash.cpp index 8945649ef..27a64b129 100644 --- a/moses/TranslationModel/ProbingPT/hash.cpp +++ b/moses/TranslationModel/ProbingPT/hash.cpp @@ -1,5 +1,11 @@ +#include <iostream> #include "hash.hh" +using namespace std; + +namespace Moses +{ + uint64_t getHash(StringPiece text) { std::size_t len = text.size(); @@ -7,24 +13,32 @@ uint64_t getHash(StringPiece text) return key; } -std::vector<uint64_t> getVocabIDs(StringPiece textin) +std::vector<uint64_t> getVocabIDs(const StringPiece &textin) { //Tokenize std::vector<uint64_t> output; - util::TokenIter<util::SingleCharacter> it(textin, util::SingleCharacter(' ')); + util::TokenIter<util::SingleCharacter> itWord(textin, util::SingleCharacter(' ')); + + while (itWord) { + StringPiece word = *itWord; + uint64_t id = 0; + + util::TokenIter<util::SingleCharacter> itFactor(word, util::SingleCharacter('|')); + while (itFactor) { + StringPiece factor = *itFactor; + //cerr << "factor=" << factor << endl; - while(it) { - output.push_back(getHash(*it)); - it++; + id += getHash(factor); + itFactor++; + } + + output.push_back(id); + itWord++; } return output; } -uint64_t getVocabID(std::string candidate) -{ - std::size_t len = candidate.length(); - uint64_t key = util::MurmurHashNative(candidate.c_str(), len); - return key; -}
\ No newline at end of file +} + diff --git a/moses/TranslationModel/ProbingPT/hash.hh b/moses/TranslationModel/ProbingPT/hash.hh index 607238ae1..f218ad9da 100644 --- a/moses/TranslationModel/ProbingPT/hash.hh +++ b/moses/TranslationModel/ProbingPT/hash.hh @@ -6,9 +6,12 @@ #include "util/tokenize_piece.hh" #include <vector> +namespace Moses +{ + //Gets the MurmurmurHash for give string uint64_t getHash(StringPiece text); -std::vector<uint64_t> getVocabIDs(StringPiece textin); +std::vector<uint64_t> getVocabIDs(const StringPiece &textin); -uint64_t getVocabID(std::string candidate);
\ No newline at end of file +} diff --git a/moses/TranslationModel/ProbingPT/huffmanish.cpp b/moses/TranslationModel/ProbingPT/huffmanish.cpp deleted file mode 100644 index 534fd04d1..000000000 --- a/moses/TranslationModel/ProbingPT/huffmanish.cpp +++ /dev/null @@ -1,451 +0,0 @@ -#include "huffmanish.hh" - -Huffman::Huffman (const char * filepath) -{ - //Read the file - util::FilePiece filein(filepath); - - //Init uniq_lines to zero; - uniq_lines = 0; - - line_text prev_line; //Check for unique lines. - int num_lines = 0 ; - - while (true) { - line_text new_line; - - num_lines++; - - try { - //Process line read - new_line = splitLine(filein.ReadLine()); - count_elements(new_line); //Counts the number of elements, adds new and increments counters. - - } catch (util::EndOfFileException e) { - std::cerr << "Unique entries counted: "; - break; - } - - if (new_line.source_phrase == prev_line.source_phrase) { - continue; - } else { - uniq_lines++; - prev_line = new_line; - } - } - - std::cerr << uniq_lines << std::endl; -} - -void Huffman::count_elements(line_text linein) -{ - //For target phrase: - util::TokenIter<util::SingleCharacter> it(linein.target_phrase, util::SingleCharacter(' ')); - while (it) { - //Check if we have that entry - std::map<std::string, unsigned int>::iterator mapiter; - mapiter = target_phrase_words.find(it->as_string()); - - if (mapiter != target_phrase_words.end()) { - //If the element is found, increment the count. - mapiter->second++; - } else { - //Else create a new entry; - target_phrase_words.insert(std::pair<std::string, unsigned int>(it->as_string(), 1)); - } - it++; - } - - //For word allignment 1 - std::map<std::vector<unsigned char>, unsigned int>::iterator mapiter3; - std::vector<unsigned char> numbers = splitWordAll1(linein.word_align); - mapiter3 = word_all1.find(numbers); - - if (mapiter3 != word_all1.end()) { - //If the element is found, increment the count. - mapiter3->second++; - } else { - //Else create a new entry; - word_all1.insert(std::pair<std::vector<unsigned char>, unsigned int>(numbers, 1)); - } - -} - -//Assigns huffman values for each unique element -void Huffman::assign_values() -{ - //First create vectors for all maps so that we could sort them later. - - //Create a vector for target phrases - for(std::map<std::string, unsigned int>::iterator it = target_phrase_words.begin(); it != target_phrase_words.end(); it++ ) { - target_phrase_words_counts.push_back(*it); - } - //Sort it - std::sort(target_phrase_words_counts.begin(), target_phrase_words_counts.end(), sort_pair()); - - //Create a vector for word allignments 1 - for(std::map<std::vector<unsigned char>, unsigned int>::iterator it = word_all1.begin(); it != word_all1.end(); it++ ) { - word_all1_counts.push_back(*it); - } - //Sort it - std::sort(word_all1_counts.begin(), word_all1_counts.end(), sort_pair_vec()); - - - //Afterwards we assign a value for each phrase, starting from 1, as zero is reserved for delimiter - unsigned int i = 1; //huffman code - for(std::vector<std::pair<std::string, unsigned int> >::iterator it = target_phrase_words_counts.begin(); - it != target_phrase_words_counts.end(); it++) { - target_phrase_huffman.insert(std::pair<std::string, unsigned int>(it->first, i)); - i++; //Go to the next huffman code - } - - i = 1; //Reset i for the next map - for(std::vector<std::pair<std::vector<unsigned char>, unsigned int> >::iterator it = word_all1_counts.begin(); - it != word_all1_counts.end(); it++) { - word_all1_huffman.insert(std::pair<std::vector<unsigned char>, unsigned int>(it->first, i)); - i++; //Go to the next huffman code - } - - //After lookups are produced, clear some memory usage of objects not needed anymore. - target_phrase_words.clear(); - word_all1.clear(); - - target_phrase_words_counts.clear(); - word_all1_counts.clear(); - - std::cerr << "Finished generating huffman codes." << std::endl; - -} - -void Huffman::serialize_maps(const char * dirname) -{ - //Note that directory name should exist. - std::string basedir(dirname); - std::string target_phrase_path(basedir + "/target_phrases"); - std::string probabilities_path(basedir + "/probs"); - std::string word_all1_path(basedir + "/Wall1"); - - //Target phrase - std::ofstream os (target_phrase_path.c_str(), std::ios::binary); - boost::archive::text_oarchive oarch(os); - oarch << lookup_target_phrase; - os.close(); - - //Word all1 - std::ofstream os2 (word_all1_path.c_str(), std::ios::binary); - boost::archive::text_oarchive oarch2(os2); - oarch2 << lookup_word_all1; - os2.close(); -} - -std::vector<unsigned char> Huffman::full_encode_line(line_text line) -{ - return vbyte_encode_line((encode_line(line))); -} - -std::vector<unsigned int> Huffman::encode_line(line_text line) -{ - std::vector<unsigned int> retvector; - - //Get target_phrase first. - util::TokenIter<util::SingleCharacter> it(line.target_phrase, util::SingleCharacter(' ')); - while (it) { - retvector.push_back(target_phrase_huffman.find(it->as_string())->second); - it++; - } - //Add a zero; - retvector.push_back(0); - - //Get probabilities. Reinterpreting the float bytes as unsgined int. - util::TokenIter<util::SingleCharacter> probit(line.prob, util::SingleCharacter(' ')); - while (probit) { - //Sometimes we have too big floats to handle, so first convert to double - double tempnum = atof(probit->data()); - float num = (float)tempnum; - retvector.push_back(reinterpret_float(&num)); - probit++; - } - //Add a zero; - retvector.push_back(0); - - - //Get Word allignments - retvector.push_back(word_all1_huffman.find(splitWordAll1(line.word_align))->second); - retvector.push_back(0); - - return retvector; -} - -void Huffman::produce_lookups() -{ - //basically invert every map that we have - for(std::map<std::string, unsigned int>::iterator it = target_phrase_huffman.begin(); it != target_phrase_huffman.end(); it++ ) { - lookup_target_phrase.insert(std::pair<unsigned int, std::string>(it->second, it->first)); - } - - for(std::map<std::vector<unsigned char>, unsigned int>::iterator it = word_all1_huffman.begin(); it != word_all1_huffman.end(); it++ ) { - lookup_word_all1.insert(std::pair<unsigned int, std::vector<unsigned char> >(it->second, it->first)); - } - -} - -HuffmanDecoder::HuffmanDecoder (const char * dirname) -{ - //Read the maps from disk - - //Note that directory name should exist. - std::string basedir(dirname); - std::string target_phrase_path(basedir + "/target_phrases"); - std::string word_all1_path(basedir + "/Wall1"); - - //Target phrases - std::ifstream is (target_phrase_path.c_str(), std::ios::binary); - boost::archive::text_iarchive iarch(is); - iarch >> lookup_target_phrase; - is.close(); - - //Word allignment 1 - std::ifstream is2 (word_all1_path.c_str(), std::ios::binary); - boost::archive::text_iarchive iarch2(is2); - iarch2 >> lookup_word_all1; - is2.close(); - -} - -HuffmanDecoder::HuffmanDecoder (std::map<unsigned int, std::string> * lookup_target, - std::map<unsigned int, std::vector<unsigned char> > * lookup_word1) -{ - lookup_target_phrase = *lookup_target; - lookup_word_all1 = *lookup_word1; -} - -std::vector<target_text> HuffmanDecoder::full_decode_line (std::vector<unsigned char> lines, int num_scores) -{ - std::vector<target_text> retvector; //All target phrases - std::vector<unsigned int> decoded_lines = vbyte_decode_line(lines); //All decoded lines - std::vector<unsigned int>::iterator it = decoded_lines.begin(); //Iterator for them - std::vector<unsigned int> current_target_phrase; //Current target phrase decoded - - short zero_count = 0; //Count home many zeroes we have met. so far. Every 3 zeroes mean a new target phrase. - while(it != decoded_lines.end()) { - if (zero_count == 1) { - //We are extracting scores. we know how many scores there are so we can push them - //to the vector. This is done in case any of the scores is 0, because it would mess - //up the state machine. - for (int i = 0; i < num_scores; i++) { - current_target_phrase.push_back(*it); - it++; - } - } - - if (zero_count == 3) { - //We have finished with this entry, decode it, and add it to the retvector. - retvector.push_back(decode_line(current_target_phrase, num_scores)); - current_target_phrase.clear(); //Clear the current target phrase and the zero_count - zero_count = 0; //So that we can reuse them for the next target phrase - } - //Add to the next target_phrase, number by number. - current_target_phrase.push_back(*it); - if (*it == 0) { - zero_count++; - } - it++; //Go to the next word/symbol - } - //Don't forget the last remaining line! - if (zero_count == 3) { - //We have finished with this entry, decode it, and add it to the retvector. - retvector.push_back(decode_line(current_target_phrase, num_scores)); - current_target_phrase.clear(); //Clear the current target phrase and the zero_count - zero_count = 0; //So that we can reuse them for the next target phrase - } - - return retvector; - -} - -target_text HuffmanDecoder::decode_line (std::vector<unsigned int> input, int num_scores) -{ - //demo decoder - target_text ret; - //Split everything - std::vector<unsigned int> target_phrase; - std::vector<unsigned int> probs; - unsigned int wAll; - - //Split the line into the proper arrays - short num_zeroes = 0; - int counter = 0; - while (num_zeroes < 3) { - unsigned int num = input[counter]; - if (num == 0) { - num_zeroes++; - } else if (num_zeroes == 0) { - target_phrase.push_back(num); - } else if (num_zeroes == 1) { - //Push exactly num_scores scores - for (int i = 0; i < num_scores; i++) { - probs.push_back(num); - counter++; - num = input[counter]; - } - continue; - } else if (num_zeroes == 2) { - wAll = num; - } - counter++; - } - - ret.target_phrase = target_phrase; - ret.word_all1 = lookup_word_all1.find(wAll)->second; - - //Decode probabilities - for (std::vector<unsigned int>::iterator it = probs.begin(); it != probs.end(); it++) { - ret.prob.push_back(reinterpret_uint(&(*it))); - } - - return ret; - -} - -inline std::string HuffmanDecoder::getTargetWordFromID(unsigned int id) -{ - return lookup_target_phrase.find(id)->second; -} - -std::string HuffmanDecoder::getTargetWordsFromIDs(std::vector<unsigned int> ids) -{ - std::string returnstring; - for (std::vector<unsigned int>::iterator it = ids.begin(); it != ids.end(); it++) { - returnstring.append(getTargetWordFromID(*it) + " "); - } - - return returnstring; -} - -inline std::string getTargetWordFromID(unsigned int id, std::map<unsigned int, std::string> * lookup_target_phrase) -{ - return lookup_target_phrase->find(id)->second; -} - -std::string getTargetWordsFromIDs(std::vector<unsigned int> ids, std::map<unsigned int, std::string> * lookup_target_phrase) -{ - std::string returnstring; - for (std::vector<unsigned int>::iterator it = ids.begin(); it != ids.end(); it++) { - returnstring.append(getTargetWordFromID(*it, lookup_target_phrase) + " "); - } - - return returnstring; -} - -/*Those functions are used to more easily store the floats in the binary phrase table - We convert the float unsinged int so that it is the same as our other values and we can - apply variable byte encoding on top of it.*/ - -inline unsigned int reinterpret_float(float * num) -{ - unsigned int * converted_num; - converted_num = reinterpret_cast<unsigned int *>(num); - return *converted_num; -} - -inline float reinterpret_uint(unsigned int * num) -{ - float * converted_num; - converted_num = reinterpret_cast<float *>(num); - return *converted_num; -} - -/*Mostly taken from stackoverflow, http://stackoverflow.com/questions/5858646/optimizing-variable-length-encoding -and modified in order to return a vector of chars. Implements ULEB128 or variable byte encoding. -This is highly optimized version with unrolled loop */ -inline std::vector<unsigned char> vbyte_encode(unsigned int num) -{ - //Determine how many bytes we are going to take. - short size; - std::vector<unsigned char> byte_vector; - - if (num < 0x00000080U) { - size = 1; - byte_vector.reserve(size); - goto b1; - } - if (num < 0x00004000U) { - size = 2; - byte_vector.reserve(size); - goto b2; - } - if (num < 0x00200000U) { - size = 3; - byte_vector.reserve(size); - goto b3; - } - if (num < 0x10000000U) { - size = 4; - byte_vector.reserve(size); - goto b4; - } - size = 5; - byte_vector.reserve(size); - - - //Now proceed with the encoding. - byte_vector.push_back((num & 0x7f) | 0x80); - num >>= 7; -b4: - byte_vector.push_back((num & 0x7f) | 0x80); - num >>= 7; -b3: - byte_vector.push_back((num & 0x7f) | 0x80); - num >>= 7; -b2: - byte_vector.push_back((num & 0x7f) | 0x80); - num >>= 7; -b1: - byte_vector.push_back(num); - - return byte_vector; -} - -std::vector<unsigned int> vbyte_decode_line(std::vector<unsigned char> line) -{ - std::vector<unsigned int> huffman_line; - std::vector<unsigned char> current_num; - - for (std::vector<unsigned char>::iterator it = line.begin(); it != line.end(); it++) { - current_num.push_back(*it); - if ((*it >> 7) != 1) { - //We don't have continuation in the next bit - huffman_line.push_back(bytes_to_int(current_num)); - current_num.clear(); - } - } - return huffman_line; -} - -inline unsigned int bytes_to_int(std::vector<unsigned char> number) -{ - unsigned int retvalue = 0; - std::vector<unsigned char>::iterator it = number.begin(); - unsigned char shift = 0; //By how many bits to shift - - while (it != number.end()) { - retvalue |= (*it & 0x7f) << shift; - shift += 7; - it++; - } - - return retvalue; -} - -std::vector<unsigned char> vbyte_encode_line(std::vector<unsigned int> line) -{ - std::vector<unsigned char> retvec; - - //For each unsigned int in the line, vbyte encode it and add it to a vector of unsigned chars. - for (std::vector<unsigned int>::iterator it = line.begin(); it != line.end(); it++) { - std::vector<unsigned char> vbyte_encoded = vbyte_encode(*it); - retvec.insert(retvec.end(), vbyte_encoded.begin(), vbyte_encoded.end()); - } - - return retvec; -} diff --git a/moses/TranslationModel/ProbingPT/huffmanish.hh b/moses/TranslationModel/ProbingPT/huffmanish.hh deleted file mode 100644 index 0970a9e68..000000000 --- a/moses/TranslationModel/ProbingPT/huffmanish.hh +++ /dev/null @@ -1,112 +0,0 @@ -#pragma once - -//Huffman encodes a line and also produces the vocabulary ids -#include "hash.hh" -#include "line_splitter.hh" -#include <cstdio> -#include <fstream> -#include <iostream> -#include <sstream> -#include <boost/serialization/serialization.hpp> -#include <boost/serialization/vector.hpp> -#include <boost/serialization/map.hpp> -#include <boost/archive/text_iarchive.hpp> -#include <boost/archive/text_oarchive.hpp> - -//Sorting for the second -struct sort_pair { - bool operator()(const std::pair<std::string, unsigned int> &left, const std::pair<std::string, unsigned int> &right) { - return left.second > right.second; //This puts biggest numbers first. - } -}; - -struct sort_pair_vec { - bool operator()(const std::pair<std::vector<unsigned char>, unsigned int> &left, const std::pair<std::vector<unsigned char>, unsigned int> &right) { - return left.second > right.second; //This puts biggest numbers first. - } -}; - -class Huffman -{ - unsigned long uniq_lines; //Unique lines in the file. - - //Containers used when counting the occurence of a given phrase - std::map<std::string, unsigned int> target_phrase_words; - std::map<std::vector<unsigned char>, unsigned int> word_all1; - - //Same containers as vectors, for sorting - std::vector<std::pair<std::string, unsigned int> > target_phrase_words_counts; - std::vector<std::pair<std::vector<unsigned char>, unsigned int> > word_all1_counts; - - //Huffman maps - std::map<std::string, unsigned int> target_phrase_huffman; - std::map<std::vector<unsigned char>, unsigned int> word_all1_huffman; - - //inverted maps - std::map<unsigned int, std::string> lookup_target_phrase; - std::map<unsigned int, std::vector<unsigned char> > lookup_word_all1; - -public: - Huffman (const char *); - void count_elements (line_text line); - void assign_values(); - void serialize_maps(const char * dirname); - void produce_lookups(); - - std::vector<unsigned int> encode_line(line_text line); - - //encode line + variable byte ontop - std::vector<unsigned char> full_encode_line(line_text line); - - //Getters - const std::map<unsigned int, std::string> get_target_lookup_map() const { - return lookup_target_phrase; - } - const std::map<unsigned int, std::vector<unsigned char> > get_word_all1_lookup_map() const { - return lookup_word_all1; - } - - unsigned long getUniqLines() { - return uniq_lines; - } -}; - -class HuffmanDecoder -{ - std::map<unsigned int, std::string> lookup_target_phrase; - std::map<unsigned int, std::vector<unsigned char> > lookup_word_all1; - -public: - HuffmanDecoder (const char *); - HuffmanDecoder (std::map<unsigned int, std::string> *, std::map<unsigned int, std::vector<unsigned char> > *); - - //Getters - const std::map<unsigned int, std::string> get_target_lookup_map() const { - return lookup_target_phrase; - } - const std::map<unsigned int, std::vector<unsigned char> > get_word_all1_lookup_map() const { - return lookup_word_all1; - } - - inline std::string getTargetWordFromID(unsigned int id); - - std::string getTargetWordsFromIDs(std::vector<unsigned int> ids); - - target_text decode_line (std::vector<unsigned int> input, int num_scores); - - //Variable byte decodes a all target phrases contained here and then passes them to decode_line - std::vector<target_text> full_decode_line (std::vector<unsigned char> lines, int num_scores); -}; - -std::string getTargetWordsFromIDs(std::vector<unsigned int> ids, std::map<unsigned int, std::string> * lookup_target_phrase); - -inline std::string getTargetWordFromID(unsigned int id, std::map<unsigned int, std::string> * lookup_target_phrase); - -inline unsigned int reinterpret_float(float * num); - -inline float reinterpret_uint(unsigned int * num); - -std::vector<unsigned char> vbyte_encode_line(std::vector<unsigned int> line); -inline std::vector<unsigned char> vbyte_encode(unsigned int num); -std::vector<unsigned int> vbyte_decode_line(std::vector<unsigned char> line); -inline unsigned int bytes_to_int(std::vector<unsigned char> number); diff --git a/moses/TranslationModel/ProbingPT/line_splitter.cpp b/moses/TranslationModel/ProbingPT/line_splitter.cpp index 1eeeb1899..cb9e47fec 100644 --- a/moses/TranslationModel/ProbingPT/line_splitter.cpp +++ b/moses/TranslationModel/ProbingPT/line_splitter.cpp @@ -1,66 +1,92 @@ #include "line_splitter.hh" -line_text splitLine(StringPiece textin) +namespace Moses { - const char delim[] = " ||| "; + +line_text splitLine(const StringPiece &textin, bool scfg) +{ + const char delim[] = "|||"; line_text output; //Tokenize util::TokenIter<util::MultiCharacter> it(textin, util::MultiCharacter(delim)); //Get source phrase - output.source_phrase = *it; + output.source_phrase = Trim(*it); + //std::cerr << "output.source_phrase=" << output.source_phrase << "AAAA" << std::endl; //Get target_phrase it++; - output.target_phrase = *it; + output.target_phrase = Trim(*it); + //std::cerr << "output.target_phrase=" << output.target_phrase << "AAAA" << std::endl; + + if (scfg) { + /* + std::cerr << "output.source_phrase=" << output.source_phrase << std::endl; + std::cerr << "output.target_phrase=" << output.target_phrase << std::endl; + reformatSCFG(output); + std::cerr << "output.source_phrase=" << output.source_phrase << std::endl; + std::cerr << "output.target_phrase=" << output.target_phrase << std::endl; + */ + } //Get probabilities it++; - output.prob = *it; + output.prob = Trim(*it); + //std::cerr << "output.prob=" << output.prob << "AAAA" << std::endl; //Get WordAllignment it++; if (it == util::TokenIter<util::MultiCharacter>::end()) return output; - output.word_align = *it; + output.word_align = Trim(*it); + //std::cerr << "output.word_align=" << output.word_align << "AAAA" << std::endl; //Get count it++; if (it == util::TokenIter<util::MultiCharacter>::end()) return output; - output.counts = *it; + output.counts = Trim(*it); + //std::cerr << "output.counts=" << output.counts << "AAAA" << std::endl; //Get sparse_score it++; if (it == util::TokenIter<util::MultiCharacter>::end()) return output; - output.sparse_score = *it; + output.sparse_score = Trim(*it); + //std::cerr << "output.sparse_score=" << output.sparse_score << "AAAA" << std::endl; //Get property it++; if (it == util::TokenIter<util::MultiCharacter>::end()) return output; - output.property = *it; + output.property = Trim(*it); + //std::cerr << "output.property=" << output.property << "AAAA" << std::endl; return output; } -std::vector<unsigned char> splitWordAll1(StringPiece textin) +std::vector<unsigned char> splitWordAll1(const StringPiece &textin) { const char delim[] = " "; const char delim2[] = "-"; std::vector<unsigned char> output; + //Case with no word alignments. + if (textin.size() == 0) { + return output; + } + //Split on space util::TokenIter<util::MultiCharacter> it(textin, util::MultiCharacter(delim)); //For each int while (it) { //Split on dash (-) - util::TokenIter<util::MultiCharacter> itInner(*it, util::MultiCharacter(delim2)); + util::TokenIter<util::MultiCharacter> itInner(*it, + util::MultiCharacter(delim2)); //Insert the two entries in the vector. User will read entry 0 and 1 to get the first, //2 and 3 for second etc. Use unsigned char instead of int to save space, as //word allignments are all very small numbers that fit in a single byte - output.push_back((unsigned char)(atoi(itInner->data()))); + output.push_back((unsigned char) (atoi(itInner->data()))); itInner++; - output.push_back((unsigned char)(atoi(itInner->data()))); + output.push_back((unsigned char) (atoi(itInner->data()))); it++; } @@ -68,3 +94,10 @@ std::vector<unsigned char> splitWordAll1(StringPiece textin) } +void reformatSCFG(line_text &output) +{ + +} + +} + diff --git a/moses/TranslationModel/ProbingPT/line_splitter.hh b/moses/TranslationModel/ProbingPT/line_splitter.hh index 2cb9a3c8c..cec0a5f45 100644 --- a/moses/TranslationModel/ProbingPT/line_splitter.hh +++ b/moses/TranslationModel/ProbingPT/line_splitter.hh @@ -9,8 +9,12 @@ #include "util/tokenize_piece.hh" #include <vector> +namespace Moses +{ + //Struct for holding processed line -struct line_text { +struct line_text +{ StringPiece source_phrase; StringPiece target_phrase; StringPiece prob; @@ -18,16 +22,38 @@ struct line_text { StringPiece counts; StringPiece sparse_score; StringPiece property; + std::string property_to_be_binarized; }; //Struct for holding processed line -struct target_text { +struct target_text +{ std::vector<unsigned int> target_phrase; std::vector<float> prob; - std::vector<unsigned char> word_all1; + std::vector<size_t> word_align_term; + std::vector<size_t> word_align_non_term; + std::vector<char> counts; + std::vector<char> sparse_score; + std::vector<char> property; + + /* + void Reset() + { + target_phrase.clear(); + prob.clear(); + word_all1.clear(); + counts.clear(); + sparse_score.clear(); + property.clear(); + } + */ }; //Ask if it's better to have it receive a pointer to a line_text struct -line_text splitLine(StringPiece textin); +line_text splitLine(const StringPiece &textin, bool scfg); +void reformatSCFG(line_text &output); + +std::vector<unsigned char> splitWordAll1(const StringPiece &textin); + +} -std::vector<unsigned char> splitWordAll1(StringPiece textin); diff --git a/moses/TranslationModel/ProbingPT/probing_hash_utils.cpp b/moses/TranslationModel/ProbingPT/probing_hash_utils.cpp index ca3e8f69f..f23f57d66 100644 --- a/moses/TranslationModel/ProbingPT/probing_hash_utils.cpp +++ b/moses/TranslationModel/ProbingPT/probing_hash_utils.cpp @@ -1,5 +1,8 @@ #include "probing_hash_utils.hh" +namespace Moses +{ + //Read table from disk, return memory map location char * readTable(const char * filename, size_t size) { @@ -13,7 +16,7 @@ char * readTable(const char * filename, size_t size) exit(EXIT_FAILURE); } - map = (char *)mmap(0, size, PROT_READ, MAP_SHARED, fd, 0); + map = (char *) mmap(0, size, PROT_READ, MAP_SHARED, fd, 0); if (map == MAP_FAILED) { close(fd); @@ -24,11 +27,24 @@ char * readTable(const char * filename, size_t size) return map; } - -void serialize_table(char *mem, size_t size, const char * filename) +void serialize_table(char *mem, size_t size, const std::string &filename) { - std::ofstream os (filename, std::ios::binary); - os.write((const char*)&mem[0], size); + std::ofstream os(filename.c_str(), std::ios::binary); + os.write((const char*) &mem[0], size); os.close(); -}
\ No newline at end of file +} + +uint64_t getKey(const uint64_t source_phrase[], size_t size) +{ + //TOO SLOW + //uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size()); + uint64_t key = 0; + for (size_t i = 0; i < size; i++) { + key += (source_phrase[i] << i); + } + return key; +} + +} + diff --git a/moses/TranslationModel/ProbingPT/probing_hash_utils.hh b/moses/TranslationModel/ProbingPT/probing_hash_utils.hh index de96e87a0..dcf0dbe25 100644 --- a/moses/TranslationModel/ProbingPT/probing_hash_utils.hh +++ b/moses/TranslationModel/ProbingPT/probing_hash_utils.hh @@ -7,31 +7,49 @@ #include <fcntl.h> #include <fstream> +namespace Moses +{ + +#define API_VERSION 15 //Hash table entry -struct Entry { - uint64_t key; +struct Entry +{ typedef uint64_t Key; - unsigned int bytes_toread; + Key key; - uint64_t GetKey() const { + Key GetKey() const + { return key; } - void SetKey(uint64_t to) { + void SetKey(Key to) + { key = to; } - uint64_t GetValue() const { - return value; - } - uint64_t value; }; +#define NONE std::numeric_limits<uint64_t>::max() + //Define table typedef util::ProbingHashTable<Entry, boost::hash<uint64_t> > Table; -void serialize_table(char *mem, size_t size, const char * filename); +void serialize_table(char *mem, size_t size, const std::string &filename); char * readTable(const char * filename, size_t size); + +uint64_t getKey(const uint64_t source_phrase[], size_t size); + +struct TargetPhraseInfo +{ + uint32_t alignTerm; + uint32_t alignNonTerm; + uint16_t numWords; + uint16_t propLength; + uint16_t filler; +}; + +} + diff --git a/moses/TranslationModel/ProbingPT/quering.cpp b/moses/TranslationModel/ProbingPT/quering.cpp deleted file mode 100644 index bd1d61a1e..000000000 --- a/moses/TranslationModel/ProbingPT/quering.cpp +++ /dev/null @@ -1,198 +0,0 @@ -#include "quering.hh" - -unsigned char * read_binary_file(const char * filename, size_t filesize) -{ - //Get filesize - int fd; - unsigned char * map; - - fd = open(filename, O_RDONLY); - - if (fd == -1) { - perror("Error opening file for reading"); - exit(EXIT_FAILURE); - } - - map = (unsigned char *)mmap(0, filesize, PROT_READ, MAP_SHARED, fd, 0); - if (map == MAP_FAILED) { - close(fd); - perror("Error mmapping the file"); - exit(EXIT_FAILURE); - } - - return map; -} - -QueryEngine::QueryEngine(const char * filepath) : decoder(filepath) -{ - - //Create filepaths - std::string basepath(filepath); - std::string path_to_hashtable = basepath + "/probing_hash.dat"; - std::string path_to_data_bin = basepath + "/binfile.dat"; - std::string path_to_source_vocabid = basepath + "/source_vocabids"; - - ///Source phrase vocabids - read_map(&source_vocabids, path_to_source_vocabid.c_str()); - - //Target phrase vocabIDs - vocabids = decoder.get_target_lookup_map(); - - //Read config file - std::string line; - std::ifstream config ((basepath + "/config").c_str()); - //Check API version: - getline(config, line); - if (atoi(line.c_str()) != API_VERSION) { - std::cerr << "The ProbingPT API has changed, please rebinarize your phrase tables." << std::endl; - exit(EXIT_FAILURE); - } - //Get tablesize. - getline(config, line); - int tablesize = atoi(line.c_str()); - //Number of scores - getline(config, line); - num_scores = atoi(line.c_str()); - //do we have a reordering table - getline(config, line); - std::transform(line.begin(), line.end(), line.begin(), ::tolower); //Get the boolean in lowercase - is_reordering = false; - if (line == "true") { - is_reordering = true; - std::cerr << "WARNING. REORDERING TABLES NOT SUPPORTED YET." << std::endl; - } - config.close(); - - //Mmap binary table - struct stat filestatus; - stat(path_to_data_bin.c_str(), &filestatus); - binary_filesize = filestatus.st_size; - binary_mmaped = read_binary_file(path_to_data_bin.c_str(), binary_filesize); - - //Read hashtable - table_filesize = Table::Size(tablesize, 1.2); - mem = readTable(path_to_hashtable.c_str(), table_filesize); - Table table_init(mem, table_filesize); - table = table_init; - - std::cerr << "Initialized successfully! " << std::endl; -} - -QueryEngine::~QueryEngine() -{ - //Clear mmap content from memory. - munmap(binary_mmaped, binary_filesize); - munmap(mem, table_filesize); - -} - -std::pair<bool, std::vector<target_text> > QueryEngine::query(std::vector<uint64_t> source_phrase) -{ - bool found; - std::vector<target_text> translation_entries; - const Entry * entry; - //TOO SLOW - //uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size()); - uint64_t key = 0; - for (int i = 0; i < source_phrase.size(); i++) { - key += (source_phrase[i] << i); - } - - - found = table.Find(key, entry); - - if (found) { - //The phrase that was searched for was found! We need to get the translation entries. - //We will read the largest entry in bytes and then filter the unnecesarry with functions - //from line_splitter - uint64_t initial_index = entry -> GetValue(); - unsigned int bytes_toread = entry -> bytes_toread; - - //ASK HIEU FOR MORE EFFICIENT WAY TO DO THIS! - std::vector<unsigned char> encoded_text; //Assign to the vector the relevant portion of the array. - encoded_text.reserve(bytes_toread); - for (int i = 0; i < bytes_toread; i++) { - encoded_text.push_back(binary_mmaped[i+initial_index]); - } - - //Get only the translation entries necessary - translation_entries = decoder.full_decode_line(encoded_text, num_scores); - - } - - std::pair<bool, std::vector<target_text> > output (found, translation_entries); - - return output; - -} - -std::pair<bool, std::vector<target_text> > QueryEngine::query(StringPiece source_phrase) -{ - bool found; - std::vector<target_text> translation_entries; - const Entry * entry; - //Convert source frase to VID - std::vector<uint64_t> source_phrase_vid = getVocabIDs(source_phrase); - //TOO SLOW - //uint64_t key = util::MurmurHashNative(&source_phrase_vid[0], source_phrase_vid.size()); - uint64_t key = 0; - for (int i = 0; i < source_phrase_vid.size(); i++) { - key += (source_phrase_vid[i] << i); - } - - found = table.Find(key, entry); - - - if (found) { - //The phrase that was searched for was found! We need to get the translation entries. - //We will read the largest entry in bytes and then filter the unnecesarry with functions - //from line_splitter - uint64_t initial_index = entry -> GetValue(); - unsigned int bytes_toread = entry -> bytes_toread; - //At the end of the file we can't readd + largest_entry cause we get a segfault. - std::cerr << "Entry size is bytes is: " << bytes_toread << std::endl; - - //ASK HIEU FOR MORE EFFICIENT WAY TO DO THIS! - std::vector<unsigned char> encoded_text; //Assign to the vector the relevant portion of the array. - encoded_text.reserve(bytes_toread); - for (int i = 0; i < bytes_toread; i++) { - encoded_text.push_back(binary_mmaped[i+initial_index]); - } - - //Get only the translation entries necessary - translation_entries = decoder.full_decode_line(encoded_text, num_scores); - - } - - std::pair<bool, std::vector<target_text> > output (found, translation_entries); - - return output; - -} - -void QueryEngine::printTargetInfo(std::vector<target_text> target_phrases) -{ - int entries = target_phrases.size(); - - for (int i = 0; i<entries; i++) { - std::cout << "Entry " << i+1 << " of " << entries << ":" << std::endl; - //Print text - std::cout << getTargetWordsFromIDs(target_phrases[i].target_phrase, &vocabids) << "\t"; - - //Print probabilities: - for (int j = 0; j<target_phrases[i].prob.size(); j++) { - std::cout << target_phrases[i].prob[j] << " "; - } - std::cout << "\t"; - - //Print word_all1 - for (int j = 0; j<target_phrases[i].word_all1.size(); j++) { - if (j%2 == 0) { - std::cout << (short)target_phrases[i].word_all1[j] << "-"; - } else { - std::cout << (short)target_phrases[i].word_all1[j] << " "; - } - } - std::cout << std::endl; - } -} diff --git a/moses/TranslationModel/ProbingPT/quering.hh b/moses/TranslationModel/ProbingPT/quering.hh deleted file mode 100644 index e574d1f8f..000000000 --- a/moses/TranslationModel/ProbingPT/quering.hh +++ /dev/null @@ -1,45 +0,0 @@ -#pragma once - -#include "probing_hash_utils.hh" -#include "huffmanish.hh" -#include "hash.hh" //Includes line splitter -#include <sys/stat.h> //For finding size of file -#include "vocabid.hh" -#include <algorithm> //toLower -#define API_VERSION 3 - - -char * read_binary_file(char * filename); - -class QueryEngine -{ - unsigned char * binary_mmaped; //The binari phrase table file - std::map<unsigned int, std::string> vocabids; - std::map<uint64_t, std::string> source_vocabids; - - Table table; - char *mem; //Memory for the table, necessary so that we can correctly destroy the object - - HuffmanDecoder decoder; - - size_t binary_filesize; - size_t table_filesize; - int num_scores; - bool is_reordering; -public: - QueryEngine (const char *); - ~QueryEngine(); - std::pair<bool, std::vector<target_text> > query(StringPiece source_phrase); - std::pair<bool, std::vector<target_text> > query(std::vector<uint64_t> source_phrase); - void printTargetInfo(std::vector<target_text> target_phrases); - const std::map<unsigned int, std::string> getVocab() const { - return decoder.get_target_lookup_map(); - } - - const std::map<uint64_t, std::string> getSourceVocab() const { - return source_vocabids; - } - -}; - - diff --git a/moses/TranslationModel/ProbingPT/querying.cpp b/moses/TranslationModel/ProbingPT/querying.cpp new file mode 100644 index 000000000..52cd7f516 --- /dev/null +++ b/moses/TranslationModel/ProbingPT/querying.cpp @@ -0,0 +1,142 @@ +#include "quering.hh" +#include "util/exception.hh" + +using namespace std; + +namespace Moses +{ + +QueryEngine::QueryEngine(const char * filepath) +{ + + //Create filepaths + std::string basepath(filepath); + std::string path_to_config = basepath + "/config"; + std::string path_to_hashtable = basepath + "/probing_hash.dat"; + std::string path_to_source_vocabid = basepath + "/source_vocabids"; + std::string alignPath = basepath + "/Alignments.dat"; + + if (!FileExists(path_to_config)) { + UTIL_THROW2("Binary table doesn't exist is didn't finish binarizing: " << path_to_config); + } + + ///Source phrase vocabids + read_map(source_vocabids, path_to_source_vocabid.c_str()); + + // alignments + read_alignments(alignPath); + + //Read config file + boost::unordered_map<std::string, std::string> keyValue; + + std::ifstream config(path_to_config.c_str()); + std::string line; + while (getline(config, line)) { + std::vector<std::string> toks = Tokenize(line, "\t"); + UTIL_THROW_IF2(toks.size() != 2, "Wrong config format:" << line); + keyValue[ toks[0] ] = toks[1]; + } + + bool found; + //Check API version: + int version; + found = Get(keyValue, "API_VERSION", version); + if (!found) { + std::cerr << "Old or corrupted version of ProbingPT. Please rebinarize your phrase tables." << std::endl; + } + else if (version != API_VERSION) { + std::cerr << "The ProbingPT API has changed. " << version << "!=" + << API_VERSION << " Please rebinarize your phrase tables." << std::endl; + exit(EXIT_FAILURE); + } + + //Get tablesize. + int tablesize; + found = Get(keyValue, "uniq_entries", tablesize); + if (!found) { + std::cerr << "uniq_entries not found" << std::endl; + exit(EXIT_FAILURE); + } + + //Number of scores + found = Get(keyValue, "num_scores", num_scores); + if (!found) { + std::cerr << "num_scores not found" << std::endl; + exit(EXIT_FAILURE); + } + + //How may scores from lex reordering models + found = Get(keyValue, "num_lex_scores", num_lex_scores); + if (!found) { + std::cerr << "num_lex_scores not found" << std::endl; + exit(EXIT_FAILURE); + } + + // have the scores been log() and FloorScore()? + found = Get(keyValue, "log_prob", logProb); + if (!found) { + std::cerr << "logProb not found" << std::endl; + exit(EXIT_FAILURE); + } + + config.close(); + + //Read hashtable + table_filesize = Table::Size(tablesize, 1.2); + mem = readTable(path_to_hashtable.c_str(), table_filesize); + Table table_init(mem, table_filesize); + table = table_init; + + std::cerr << "Initialized successfully! " << std::endl; +} + +QueryEngine::~QueryEngine() +{ + //Clear mmap content from memory. + munmap(mem, table_filesize); + +} + +uint64_t QueryEngine::getKey(uint64_t source_phrase[], size_t size) const +{ + //TOO SLOW + //uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size()); + return Moses::getKey(source_phrase, size); +} + +std::pair<bool, uint64_t> QueryEngine::query(uint64_t key) +{ + std::pair<bool, uint64_t> ret; + + const Entry * entry; + ret.first = table.Find(key, entry); + if (ret.first) { + ret.second = entry->value; + } + return ret; +} + +void QueryEngine::read_alignments(const std::string &alignPath) +{ + std::ifstream strm(alignPath.c_str()); + + string line; + while (getline(strm, line)) { + vector<string> toks = Tokenize(line, "\t "); + UTIL_THROW_IF2(toks.size() == 0, "Corrupt alignment file"); + + uint32_t alignInd = Scan<uint32_t>(toks[0]); + if (alignInd >= alignColl.size()) { + alignColl.resize(alignInd + 1); + } + + Alignments &aligns = alignColl[alignInd]; + for (size_t i = 1; i < toks.size(); ++i) { + size_t pos = Scan<size_t>(toks[i]); + aligns.push_back(pos); + } + } +} + +} + diff --git a/moses/TranslationModel/ProbingPT/querying.hh b/moses/TranslationModel/ProbingPT/querying.hh new file mode 100644 index 000000000..c43c7f3b9 --- /dev/null +++ b/moses/TranslationModel/ProbingPT/querying.hh @@ -0,0 +1,65 @@ +#pragma once + +#include <boost/unordered_map.hpp> +#include <sys/stat.h> //For finding size of file +#include "vocabid.hh" +#include <algorithm> //toLower +#include <deque> +#include "probing_hash_utils.hh" +#include "hash.hh" //Includes line splitter +#include "line_splitter.hh" +#include "moses//Util.h" + +namespace Moses +{ + +class QueryEngine +{ + std::map<uint64_t, std::string> source_vocabids; + + typedef std::vector<unsigned char> Alignments; + std::vector<Alignments> alignColl; + + Table table; + char *mem; //Memory for the table, necessary so that we can correctly destroy the object + + size_t table_filesize; + bool is_reordering; + + void read_alignments(const std::string &alignPath); + +public: + int num_scores; + int num_lex_scores; + bool logProb; + + QueryEngine(const char *); + ~QueryEngine(); + + std::pair<bool, uint64_t> query(uint64_t key); + + const std::map<uint64_t, std::string> &getSourceVocab() const + { return source_vocabids; } + + const std::vector<Alignments> &getAlignments() const + { return alignColl; } + + uint64_t getKey(uint64_t source_phrase[], size_t size) const; + + template<typename T> + inline bool Get(const boost::unordered_map<std::string, std::string> &keyValue, const std::string &sought, T &found) const + { + boost::unordered_map<std::string, std::string>::const_iterator iter = keyValue.find(sought); + if (iter == keyValue.end()) { + return false; + } + + const std::string &foundStr = iter->second; + found = Scan<T>(foundStr); + return true; + } + +}; + +} + diff --git a/moses/TranslationModel/ProbingPT/storing.cpp b/moses/TranslationModel/ProbingPT/storing.cpp index 01128c1e4..98dcfd5dc 100644 --- a/moses/TranslationModel/ProbingPT/storing.cpp +++ b/moses/TranslationModel/ProbingPT/storing.cpp @@ -1,161 +1,303 @@ +#include <sys/stat.h> +#include <boost/foreach.hpp> +#include "line_splitter.hh" #include "storing.hh" +#include "StoreTarget.h" +#include "StoreVocab.h" +#include "moses/Util.h" +#include "moses/InputFileStream.h" -BinaryFileWriter::BinaryFileWriter (std::string basepath) : os ((basepath + "/binfile.dat").c_str(), std::ios::binary) +using namespace std; + +namespace Moses { - binfile.reserve(10000); //Reserve part of the vector to avoid realocation - it = binfile.begin(); - dist_from_start = 0; //Initialize variables - extra_counter = 0; -} -void BinaryFileWriter::write (std::vector<unsigned char> * bytes) +/////////////////////////////////////////////////////////////////////// +void Node::Add(Table &table, const SourcePhrase &sourcePhrase, size_t pos) { - binfile.insert(it, bytes->begin(), bytes->end()); //Insert the bytes - //Keep track of the offsets - it += bytes->size(); - dist_from_start = distance(binfile.begin(),it); - //Flush the vector to disk every once in a while so that we don't consume too much ram - if (dist_from_start > 9000) { - flush(); + if (pos < sourcePhrase.size()) { + uint64_t vocabId = sourcePhrase[pos]; + + Node *child; + Children::iterator iter = m_children.find(vocabId); + if (iter == m_children.end()) { + // New node. Write other children then discard them + BOOST_FOREACH(Children::value_type &valPair, m_children) { + Node &otherChild = valPair.second; + otherChild.Write(table); + } + m_children.clear(); + + // create new node + child = &m_children[vocabId]; + assert(!child->done); + child->key = key + (vocabId << pos); + } + else { + child = &iter->second; + } + + child->Add(table, sourcePhrase, pos + 1); + } + else { + // this node was written previously 'cos it has rules + done = true; } } -void BinaryFileWriter::flush () +void Node::Write(Table &table) { - //Cast unsigned char to char before writing... - os.write((char *)&binfile[0], dist_from_start); - //Clear the vector: - binfile.clear(); - binfile.reserve(10000); - extra_counter += dist_from_start; //Keep track of the total number of bytes. - it = binfile.begin(); //Reset iterator - dist_from_start = distance(binfile.begin(),it); //Reset dist from start -} + //cerr << "START write " << done << " " << key << endl; + BOOST_FOREACH(Children::value_type &valPair, m_children) { + Node &child = valPair.second; + child.Write(table); + } -BinaryFileWriter::~BinaryFileWriter () -{ - os.close(); - binfile.clear(); + if (!done) { + // save + Entry sourceEntry; + sourceEntry.value = NONE; + sourceEntry.key = key; + + //Put into table + table.Insert(sourceEntry); + } } -void createProbingPT(const char * phrasetable_path, const char * target_path, - const char * num_scores, const char * is_reordering) +/////////////////////////////////////////////////////////////////////// +void createProbingPT(const std::string &phrasetable_path, + const std::string &basepath, int num_scores, int num_lex_scores, + bool log_prob, int max_cache_size, bool scfg) { + std::cerr << "Starting..." << std::endl; + //Get basepath and create directory if missing - std::string basepath(target_path); mkdir(basepath.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); - //Set up huffman and serialize decoder maps. - Huffman huffmanEncoder(phrasetable_path); //initialize - huffmanEncoder.assign_values(); - huffmanEncoder.produce_lookups(); - huffmanEncoder.serialize_maps(target_path); + StoreTarget storeTarget(basepath); //Get uniq lines: - unsigned long uniq_entries = huffmanEncoder.getUniqLines(); + unsigned long uniq_entries = countUniqueSource(phrasetable_path); //Source phrase vocabids - std::map<uint64_t, std::string> source_vocabids; + StoreVocab<uint64_t> sourceVocab(basepath + "/source_vocabids"); //Read the file - util::FilePiece filein(phrasetable_path); + util::FilePiece filein(phrasetable_path.c_str()); //Init the probing hash table size_t size = Table::Size(uniq_entries, 1.2); char * mem = new char[size]; memset(mem, 0, size); - Table table(mem, size); + Table sourceEntries(mem, size); - BinaryFileWriter binfile(basepath); //Init the binary file writer. - - line_text prev_line; //Check if the source phrase of the previous line is the same + std::priority_queue<CacheItem*, std::vector<CacheItem*>, CacheItemOrderer> cache; + float totalSourceCount = 0; //Keep track of the size of each group of target phrases - uint64_t entrystartidx = 0; - //uint64_t line_num = 0; - + size_t line_num = 0; //Read everything and processs - while(true) { + std::string prevSource; + + Node sourcePhrases; + sourcePhrases.done = true; + sourcePhrases.key = 0; + + while (true) { try { //Process line read line_text line; - line = splitLine(filein.ReadLine()); - //Add source phrases to vocabularyIDs - add_to_map(&source_vocabids, line.source_phrase); + line = splitLine(filein.ReadLine(), scfg); + //cerr << "line=" << line.source_phrase << endl; - if ((binfile.dist_from_start + binfile.extra_counter) == 0) { - prev_line = line; //For the first iteration assume the previous line is - } //The same as this one. + ++line_num; + if (line_num % 1000000 == 0) { + std::cerr << line_num << " " << std::flush; + } - if (line.source_phrase != prev_line.source_phrase) { + //Add source phrases to vocabularyIDs + add_to_map(sourceVocab, line.source_phrase); + + if (prevSource.empty()) { + // 1st line + prevSource = line.source_phrase.as_string(); + storeTarget.Append(line, log_prob, scfg); + } + else if (prevSource == line.source_phrase) { + //If we still have the same line, just append to it: + storeTarget.Append(line, log_prob, scfg); + } + else { + assert(prevSource != line.source_phrase); //Create a new entry even + // save + uint64_t targetInd = storeTarget.Save(); + + // next line + storeTarget.Append(line, log_prob, scfg); + //Create an entry for the previous source phrase: - Entry pesho; - pesho.value = entrystartidx; + Entry sourceEntry; + sourceEntry.value = targetInd; //The key is the sum of hashes of individual words bitshifted by their position in the phrase. //Probably not entirerly correct, but fast and seems to work fine in practise. - pesho.key = 0; - std::vector<uint64_t> vocabid_source = getVocabIDs(prev_line.source_phrase); - for (int i = 0; i < vocabid_source.size(); i++) { - pesho.key += (vocabid_source[i] << i); + std::vector<uint64_t> vocabid_source = getVocabIDs(prevSource); + if (scfg) { + // storing prefixes? + sourcePhrases.Add(sourceEntries, vocabid_source); } - pesho.bytes_toread = binfile.dist_from_start + binfile.extra_counter - entrystartidx; + sourceEntry.key = getKey(vocabid_source); + /* + cerr << "prevSource=" << prevSource << flush + << " vocabids=" << Debug(vocabid_source) << flush + << " key=" << sourceEntry.key << endl; + */ //Put into table - table.Insert(pesho); + sourceEntries.Insert(sourceEntry); - entrystartidx = binfile.dist_from_start + binfile.extra_counter; //Designate start idx for new entry + // update cache - CURRENT source phrase, not prev + if (max_cache_size) { + std::string countStr = line.counts.as_string(); + countStr = Trim(countStr); + if (!countStr.empty()) { + std::vector<float> toks = Tokenize<float>(countStr); + //cerr << "CACHE:" << line.source_phrase << " " << countStr << " " << toks[1] << endl; - //Encode a line and write it to disk. - std::vector<unsigned char> encoded_line = huffmanEncoder.full_encode_line(line); - binfile.write(&encoded_line); + if (toks.size() >= 2) { + totalSourceCount += toks[1]; - //Set prevLine - prev_line = line; + // compute key for CURRENT source + std::vector<uint64_t> currVocabidSource = getVocabIDs(line.source_phrase.as_string()); + uint64_t currKey = getKey(currVocabidSource); - } else { - //If we still have the same line, just append to it: - std::vector<unsigned char> encoded_line = huffmanEncoder.full_encode_line(line); - binfile.write(&encoded_line); + CacheItem *item = new CacheItem( + Trim(line.source_phrase.as_string()), + currKey, + toks[1]); + cache.push(item); + + if (max_cache_size > 0 && cache.size() > max_cache_size) { + cache.pop(); + } + } + } + } + + //Set prevLine + prevSource = line.source_phrase.as_string(); } - } catch (util::EndOfFileException e) { - std::cerr << "Reading phrase table finished, writing remaining files to disk." << std::endl; - binfile.flush(); + } + catch (util::EndOfFileException e) { + std::cerr + << "Reading phrase table finished, writing remaining files to disk." + << std::endl; //After the final entry is constructed we need to add it to the phrase_table //Create an entry for the previous source phrase: - Entry pesho; - pesho.value = entrystartidx; + uint64_t targetInd = storeTarget.Save(); + + Entry sourceEntry; + sourceEntry.value = targetInd; + //The key is the sum of hashes of individual words. Probably not entirerly correct, but fast - pesho.key = 0; - std::vector<uint64_t> vocabid_source = getVocabIDs(prev_line.source_phrase); - for (int i = 0; i < vocabid_source.size(); i++) { - pesho.key += (vocabid_source[i] << i); - } - pesho.bytes_toread = binfile.dist_from_start + binfile.extra_counter - entrystartidx; + std::vector<uint64_t> vocabid_source = getVocabIDs(prevSource); + sourceEntry.key = getKey(vocabid_source); + //Put into table - table.Insert(pesho); + sourceEntries.Insert(sourceEntry); break; } } - serialize_table(mem, size, (basepath + "/probing_hash.dat").c_str()); + sourcePhrases.Write(sourceEntries); + + storeTarget.SaveAlignment(); - serialize_map(&source_vocabids, (basepath + "/source_vocabids").c_str()); + serialize_table(mem, size, (basepath + "/probing_hash.dat")); + + sourceVocab.Save(); + + serialize_cache(cache, (basepath + "/cache"), totalSourceCount); delete[] mem; //Write configfile std::ofstream configfile; configfile.open((basepath + "/config").c_str()); - configfile << API_VERSION << '\n'; - configfile << uniq_entries << '\n'; - configfile << num_scores << '\n'; - configfile << is_reordering << '\n'; + configfile << "API_VERSION\t" << API_VERSION << '\n'; + configfile << "uniq_entries\t" << uniq_entries << '\n'; + configfile << "num_scores\t" << num_scores << '\n'; + configfile << "num_lex_scores\t" << num_lex_scores << '\n'; + configfile << "log_prob\t" << log_prob << '\n'; configfile.close(); } + +size_t countUniqueSource(const std::string &path) +{ + size_t ret = 0; + InputFileStream strme(path); + + std::string line, prevSource; + while (std::getline(strme, line)) { + std::vector<std::string> toks = TokenizeMultiCharSeparator(line, "|||"); + assert(toks.size() != 0); + + if (prevSource != toks[0]) { + prevSource = toks[0]; + ++ret; + } + } + + return ret; +} + +void serialize_cache( + std::priority_queue<CacheItem*, std::vector<CacheItem*>, CacheItemOrderer> &cache, + const std::string &path, float totalSourceCount) +{ + std::vector<const CacheItem*> vec(cache.size()); + + size_t ind = cache.size() - 1; + while (!cache.empty()) { + const CacheItem *item = cache.top(); + vec[ind] = item; + cache.pop(); + --ind; + } + + std::ofstream os(path.c_str()); + + os << totalSourceCount << std::endl; + for (size_t i = 0; i < vec.size(); ++i) { + const CacheItem *item = vec[i]; + os << item->count << "\t" << item->sourceKey << "\t" << item->source << std::endl; + delete item; + } + + os.close(); +} + +uint64_t getKey(const std::vector<uint64_t> &vocabid_source) +{ + return getKey(vocabid_source.data(), vocabid_source.size()); +} + +std::vector<uint64_t> CreatePrefix(const std::vector<uint64_t> &vocabid_source, size_t endPos) +{ + assert(endPos < vocabid_source.size()); + + std::vector<uint64_t> ret(endPos + 1); + for (size_t i = 0; i <= endPos; ++i) { + ret[i] = vocabid_source[i]; + } + return ret; +} + +} + diff --git a/moses/TranslationModel/ProbingPT/storing.hh b/moses/TranslationModel/ProbingPT/storing.hh index 8554d614f..957c73491 100644 --- a/moses/TranslationModel/ProbingPT/storing.hh +++ b/moses/TranslationModel/ProbingPT/storing.hh @@ -1,36 +1,95 @@ #pragma once +#include <boost/unordered_set.hpp> +#include <boost/unordered_map.hpp> #include <cstdio> +#include <sstream> #include <fstream> #include <iostream> +#include <string> +#include <queue> +#include <sys/stat.h> //mkdir #include "hash.hh" //Includes line_splitter #include "probing_hash_utils.hh" -#include "huffmanish.hh" -#include <sys/stat.h> //mkdir #include "util/file_piece.hh" #include "util/file.hh" #include "vocabid.hh" -#define API_VERSION 3 -void createProbingPT(const char * phrasetable_path, const char * target_path, - const char * num_scores, const char * is_reordering); +namespace Moses +{ +typedef std::vector<uint64_t> SourcePhrase; + + +class Node +{ + typedef boost::unordered_map<uint64_t, Node> Children; + Children m_children; + +public: + uint64_t key; + bool done; + + Node() + :done(false) + {} + + void Add(Table &table, const SourcePhrase &sourcePhrase, size_t pos = 0); + void Write(Table &table); +}; + + +void createProbingPT(const std::string &phrasetable_path, + const std::string &basepath, int num_scores, int num_lex_scores, + bool log_prob, int max_cache_size, bool scfg); +uint64_t getKey(const std::vector<uint64_t> &source_phrase); + +std::vector<uint64_t> CreatePrefix(const std::vector<uint64_t> &vocabid_source, size_t endPos); -class BinaryFileWriter +template<typename T> +std::string Debug(const std::vector<T> &vec) { - std::vector<unsigned char> binfile; - std::vector<unsigned char>::iterator it; - //Output binary - std::ofstream os; + std::stringstream strm; + for (size_t i = 0; i < vec.size(); ++i) { + strm << vec[i] << " "; + } + return strm.str(); +} +size_t countUniqueSource(const std::string &path); + +class CacheItem +{ public: - unsigned int dist_from_start; //Distance from the start of the vector. - uint64_t extra_counter; //After we reset the counter, we still want to keep track of the correct offset, so + std::string source; + uint64_t sourceKey; + float count; + CacheItem(const std::string &vSource, uint64_t vSourceKey, float vCount) + :source(vSource) + ,sourceKey(vSourceKey) + ,count(vCount) + { + } - BinaryFileWriter (std::string); - ~BinaryFileWriter (); - void write (std::vector<unsigned char> * bytes); - void flush (); //Flush to disk + bool operator<(const CacheItem &other) const + { + return count > other.count; + } +}; +class CacheItemOrderer +{ +public: + bool operator()(const CacheItem* a, const CacheItem* b) const + { + return (*a) < (*b); + } }; + +void serialize_cache( + std::priority_queue<CacheItem*, std::vector<CacheItem*>, CacheItemOrderer> &cache, + const std::string &path, float totalSourceCount); + +} + diff --git a/moses/TranslationModel/ProbingPT/vocabid.cpp b/moses/TranslationModel/ProbingPT/vocabid.cpp index 1452f299d..3d6efe841 100644 --- a/moses/TranslationModel/ProbingPT/vocabid.cpp +++ b/moses/TranslationModel/ProbingPT/vocabid.cpp @@ -1,32 +1,59 @@ +#include <boost/foreach.hpp> #include "vocabid.hh" +#include "StoreVocab.h" +#include "moses/Util.h" -void add_to_map(std::map<uint64_t, std::string> *karta, StringPiece textin) +namespace Moses +{ + +void add_to_map(StoreVocab<uint64_t> &sourceVocab, + const StringPiece &textin) { //Tokenize - util::TokenIter<util::SingleCharacter> it(textin, util::SingleCharacter(' ')); + util::TokenIter<util::SingleCharacter> itWord(textin, util::SingleCharacter(' ')); + + while (itWord) { + StringPiece word = *itWord; - while(it) { - karta->insert(std::pair<uint64_t, std::string>(getHash(*it), it->as_string())); - it++; + util::TokenIter<util::SingleCharacter> itFactor(word, util::SingleCharacter('|')); + while (itFactor) { + StringPiece factor = *itFactor; + + sourceVocab.Insert(getHash(factor), factor.as_string()); + itFactor++; + } + itWord++; } } -void serialize_map(std::map<uint64_t, std::string> *karta, const char* filename) +void serialize_map(const std::map<uint64_t, std::string> &karta, + const std::string &filename) { - std::ofstream os (filename, std::ios::binary); - boost::archive::text_oarchive oarch(os); + std::ofstream os(filename.c_str()); + + std::map<uint64_t, std::string>::const_iterator iter; + for (iter = karta.begin(); iter != karta.end(); ++iter) { + os << iter->first << '\t' << iter->second << std::endl; + } - oarch << *karta; //Serialise map os.close(); } -void read_map(std::map<uint64_t, std::string> *karta, const char* filename) +void read_map(std::map<uint64_t, std::string> &karta, const char* filename) { - std::ifstream is (filename, std::ios::binary); - boost::archive::text_iarchive iarch(is); + std::ifstream is(filename); - iarch >> *karta; + std::string line; + while (getline(is, line)) { + std::vector<std::string> toks = Tokenize(line, "\t"); + assert(toks.size() == 2); + uint64_t ind = Scan<uint64_t>(toks[1]); + karta[ind] = toks[0]; + } //Close the stream after we are done. is.close(); } + +} + diff --git a/moses/TranslationModel/ProbingPT/vocabid.hh b/moses/TranslationModel/ProbingPT/vocabid.hh index 491c53439..f9c9b2dff 100644 --- a/moses/TranslationModel/ProbingPT/vocabid.hh +++ b/moses/TranslationModel/ProbingPT/vocabid.hh @@ -13,8 +13,17 @@ #include "util/string_piece.hh" //Tokenization and work with StringPiece #include "util/tokenize_piece.hh" -void add_to_map(std::map<uint64_t, std::string> *karta, StringPiece textin); +namespace Moses +{ +template<typename VOCABID> +class StoreVocab; -void serialize_map(std::map<uint64_t, std::string> *karta, const char* filename); +void add_to_map(StoreVocab<uint64_t> &sourceVocab, + const StringPiece &textin); -void read_map(std::map<uint64_t, std::string> *karta, const char* filename); +void serialize_map(const std::map<uint64_t, std::string> &karta, + const std::string &filename); + +void read_map(std::map<uint64_t, std::string> &karta, const char* filename); + +} |