github.com/moses-smt/mosesdecoder.git
-rw-r--r--  contrib/other-builds/moses/.project                      |  32
-rw-r--r--  misc/CreateProbingPT.cpp                                 | 108
-rw-r--r--  misc/Jamfile                                             |   4
-rw-r--r--  misc/QueryProbingPT.cpp                                  |   2
-rw-r--r--  moses/TranslationModel/ProbingPT/ProbingPT.cpp           |  93
-rw-r--r--  moses/TranslationModel/ProbingPT/ProbingPT.h             |  19
-rw-r--r--  moses/TranslationModel/ProbingPT/hash.cpp                |  36
-rw-r--r--  moses/TranslationModel/ProbingPT/hash.hh                 |   7
-rw-r--r--  moses/TranslationModel/ProbingPT/huffmanish.cpp          | 451
-rw-r--r--  moses/TranslationModel/ProbingPT/huffmanish.hh           | 112
-rw-r--r--  moses/TranslationModel/ProbingPT/line_splitter.cpp       |  59
-rw-r--r--  moses/TranslationModel/ProbingPT/line_splitter.hh        |  36
-rw-r--r--  moses/TranslationModel/ProbingPT/probing_hash_utils.cpp  |  28
-rw-r--r--  moses/TranslationModel/ProbingPT/probing_hash_utils.hh   |  38
-rw-r--r--  moses/TranslationModel/ProbingPT/quering.cpp             | 221
-rw-r--r--  moses/TranslationModel/ProbingPT/quering.hh              |  62
-rw-r--r--  moses/TranslationModel/ProbingPT/storing.cpp             | 322
-rw-r--r--  moses/TranslationModel/ProbingPT/storing.hh              |  91
-rw-r--r--  moses/TranslationModel/ProbingPT/vocabid.cpp             |  53
-rw-r--r--  moses/TranslationModel/ProbingPT/vocabid.hh              |  15
20 files changed, 837 insertions(+), 952 deletions(-)
diff --git a/contrib/other-builds/moses/.project b/contrib/other-builds/moses/.project
index b59f28e08..c25eb5225 100644
--- a/contrib/other-builds/moses/.project
+++ b/contrib/other-builds/moses/.project
@@ -1319,7 +1319,7 @@
<name>FF/PhraseBoundaryFeature.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/FF/PhraseBoundaryFeature.h</locationURI>
- </link>
+ </link>
<link>
<name>FF/PhraseDistanceFeature.cpp</name>
<type>1</type>
@@ -3341,24 +3341,34 @@
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/ProbingPT.h</locationURI>
</link>
<link>
- <name>TranslationModel/ProbingPT/hash.cpp</name>
+ <name>TranslationModel/ProbingPT/StoreTarget.cpp</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/hash.cpp</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/StoreTarget.cpp</locationURI>
</link>
<link>
- <name>TranslationModel/ProbingPT/hash.hh</name>
+ <name>TranslationModel/ProbingPT/StoreTarget.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/hash.hh</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/StoreTarget.h</locationURI>
</link>
<link>
- <name>TranslationModel/ProbingPT/huffmanish.cpp</name>
+ <name>TranslationModel/ProbingPT/StoreVocab.cpp</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/huffmanish.cpp</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/StoreVocab.cpp</locationURI>
</link>
<link>
- <name>TranslationModel/ProbingPT/huffmanish.hh</name>
+ <name>TranslationModel/ProbingPT/StoreVocab.h</name>
<type>1</type>
- <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/huffmanish.hh</locationURI>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/StoreVocab.h</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/ProbingPT/hash.cpp</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/hash.cpp</locationURI>
+ </link>
+ <link>
+ <name>TranslationModel/ProbingPT/hash.hh</name>
+ <type>1</type>
+ <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/hash.hh</locationURI>
</link>
<link>
<name>TranslationModel/ProbingPT/line_splitter.cpp</name>
@@ -3664,7 +3674,7 @@
<name>TranslationModel/UG/sapt_pscore_coherence.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_coherence.h</locationURI>
- </link>
+ </link>
<link>
<name>TranslationModel/UG/sapt_pscore_lex1.h</name>
<type>1</type>
@@ -3709,7 +3719,7 @@
<name>TranslationModel/UG/sapt_pscore_wordcount.h</name>
<type>1</type>
<locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_wordcount.h</locationURI>
- </link>
+ </link>
<link>
<name>TranslationModel/UG/sim-pe.cc</name>
<type>1</type>
diff --git a/misc/CreateProbingPT.cpp b/misc/CreateProbingPT.cpp
index b23427f30..dff916660 100644
--- a/misc/CreateProbingPT.cpp
+++ b/misc/CreateProbingPT.cpp
@@ -1,29 +1,113 @@
+#include <string>
+#include <boost/program_options.hpp>
#include "util/usage.hh"
#include "moses/TranslationModel/ProbingPT/storing.hh"
+#include "moses/InputFileStream.h"
+#include "moses/OutputFileStream.h"
+#include "moses/Util.h"
+using namespace std;
+std::string ReformatSCFGFile(const std::string &path);
int main(int argc, char* argv[])
{
+ string inPath, outPath;
+ int num_scores = 4;
+ int num_lex_scores = 0;
+ bool log_prob = false;
+ bool scfg = false;
+ int max_cache_size = 50000;
- const char * is_reordering = "false";
+ namespace po = boost::program_options;
+ po::options_description desc("Options");
+ desc.add_options()
+ ("help", "Print help messages")
+ ("input-pt", po::value<string>()->required(), "Text pt")
+ ("output-dir", po::value<string>()->required(), "Directory when binary files will be written")
+ ("num-scores", po::value<int>()->default_value(num_scores), "Number of pt scores")
+ ("num-lex-scores", po::value<int>()->default_value(num_lex_scores), "Number of lexicalized reordering scores")
+ ("log-prob", "log (and floor) probabilities before storing")
+ ("max-cache-size", po::value<int>()->default_value(max_cache_size), "Maximum number of high-count source lines to write to cache file. 0=no cache, negative=no limit")
+ ("scfg", "Rules are SCFG in Moses format (ie. with non-terms and LHS")
- if (!(argc == 5 || argc == 4)) {
- // Tell the user how to run the program
- std::cerr << "Provided " << argc << " arguments, needed 4 or 5." << std::endl;
- std::cerr << "Usage: " << argv[0] << " path_to_phrasetable output_dir num_scores is_reordering" << std::endl;
- std::cerr << "is_reordering should be either true or false, but it is currently a stub feature." << std::endl;
- //std::cerr << "Usage: " << argv[0] << " path_to_phrasetable number_of_uniq_lines output_bin_file output_hash_table output_vocab_id" << std::endl;
- return 1;
+ ;
+
+ po::variables_map vm;
+ try {
+ po::store(po::parse_command_line(argc, argv, desc),
+ vm); // can throw
+
+ /** --help option
+ */
+ if ( vm.count("help")) {
+ std::cout << desc << std::endl;
+ return EXIT_SUCCESS;
+ }
+
+ po::notify(vm); // throws on error, so do after help in case
+ // there are any problems
+ } catch(po::error& e) {
+ std::cerr << "ERROR: " << e.what() << std::endl << std::endl;
+ std::cerr << desc << std::endl;
+ return EXIT_FAILURE;
}
- if (argc == 5) {
- is_reordering = argv[4];
+ if (vm.count("input-pt")) inPath = vm["input-pt"].as<string>();
+ if (vm.count("output-dir")) outPath = vm["output-dir"].as<string>();
+ if (vm.count("num-scores")) num_scores = vm["num-scores"].as<int>();
+ if (vm.count("num-lex-scores")) num_lex_scores = vm["num-lex-scores"].as<int>();
+ if (vm.count("max-cache-size")) max_cache_size = vm["max-cache-size"].as<int>();
+ if (vm.count("log-prob")) log_prob = true;
+ if (vm.count("scfg")) scfg = true;
+
+
+ if (scfg) {
+ inPath = ReformatSCFGFile(inPath);
}
- createProbingPT(argv[1], argv[2], argv[3], is_reordering);
+ Moses::createProbingPT(inPath, outPath, num_scores, num_lex_scores, log_prob, max_cache_size, scfg);
- util::PrintUsage(std::cout);
+ //util::PrintUsage(std::cout);
return 0;
}
+std::string ReformatSCFGFile(const std::string &path)
+{
+ Moses::InputFileStream inFile(path);
+ string reformattedPath = path + ".reformat.gz";
+ Moses::OutputFileStream outFile(reformattedPath);
+
+ string line;
+ while (getline(inFile, line)) {
+ vector<string> toks = Moses::TokenizeMultiCharSeparator(line, "|||");
+ assert(toks.size() >= 3);
+
+ // source
+ vector<string> sourceToks = Moses::Tokenize(toks[0], " ");
+ for (size_t i = 0; i < sourceToks.size() - 1; ++i) {
+ outFile << sourceToks[i] << " ";
+ }
+
+ // other columns
+ for (size_t i = 1; i < toks.size(); ++i) {
+ outFile << "|||" << toks[i];
+ }
+ outFile << endl;
+ }
+
+ inFile.Close();
+ outFile.Close();
+
+ string sortedPath = path + ".reformat.sorted.gz";
+ string tmpPath = path + ".tmp ";
+ string cmd = "mkdir " + tmpPath
+ + " && gzip -dc " + reformattedPath + " | LC_ALL=C sort -T " + tmpPath + " | gzip -c > " + sortedPath;
+ system(cmd.c_str());
+
+ cmd = "rm -rf " + tmpPath + " " + reformattedPath;
+ system(cmd.c_str());
+
+ return sortedPath;
+}
+
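Note: with this rewrite, CreateProbingPT takes named boost::program_options flags instead of four positional arguments. A typical invocation (file paths are illustrative, not from the commit) would be:

  CreateProbingPT --input-pt phrase-table.gz --output-dir pt-probing \
                  --num-scores 4 --num-lex-scores 0 --log-prob

When --scfg is given, ReformatSCFGFile() first rewrites the table: it drops the final token of the source column (the rule's LHS non-terminal in Moses SCFG format) and pipes the result through LC_ALL=C sort so that identical source sides are adjacent before binarization.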
diff --git a/misc/Jamfile b/misc/Jamfile
index f1599aca8..135490a46 100644
--- a/misc/Jamfile
+++ b/misc/Jamfile
@@ -31,9 +31,9 @@ else {
}
exe CreateProbingPT : CreateProbingPT.cpp ..//boost_filesystem ../moses//moses ;
-exe QueryProbingPT : QueryProbingPT.cpp ..//boost_filesystem ../moses//moses ;
+#exe QueryProbingPT : QueryProbingPT.cpp ..//boost_filesystem ../moses//moses ;
-alias programsProbing : CreateProbingPT QueryProbingPT ;
+alias programsProbing : CreateProbingPT ; #QueryProbingPT
exe merge-sorted :
merge-sorted.cc
diff --git a/misc/QueryProbingPT.cpp b/misc/QueryProbingPT.cpp
index 72fd0be11..5047d4d47 100644
--- a/misc/QueryProbingPT.cpp
+++ b/misc/QueryProbingPT.cpp
@@ -34,7 +34,7 @@ int main(int argc, char* argv[])
return 1;
}
- QueryEngine queries(argv[1]);
+ Moses::QueryEngine queries(argv[1]);
//Interactive search
std::cout << "Please enter a string to be searched, or exit to exit." << std::endl;
diff --git a/moses/TranslationModel/ProbingPT/ProbingPT.cpp b/moses/TranslationModel/ProbingPT/ProbingPT.cpp
index cbfd2c1a4..bb3f26e22 100644
--- a/moses/TranslationModel/ProbingPT/ProbingPT.cpp
+++ b/moses/TranslationModel/ProbingPT/ProbingPT.cpp
@@ -3,6 +3,7 @@
#include "moses/StaticData.h"
#include "moses/FactorCollection.h"
#include "moses/TargetPhraseCollection.h"
+#include "moses/InputFileStream.h"
#include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.h"
#include "quering.hh"
@@ -34,44 +35,94 @@ void ProbingPT::Load(AllOptions::ptr const& opts)
m_unkId = 456456546456;
+ FactorCollection &vocab = FactorCollection::Instance();
+
// source vocab
- const std::map<uint64_t, std::string> &sourceVocab = m_engine->getSourceVocab();
+ const std::map<uint64_t, std::string> &sourceVocab =
+ m_engine->getSourceVocab();
std::map<uint64_t, std::string>::const_iterator iterSource;
- for (iterSource = sourceVocab.begin(); iterSource != sourceVocab.end(); ++iterSource) {
- const string &wordStr = iterSource->second;
- const Factor *factor = FactorCollection::Instance().AddFactor(wordStr);
+ for (iterSource = sourceVocab.begin(); iterSource != sourceVocab.end();
+ ++iterSource) {
+ string wordStr = iterSource->second;
+ //cerr << "wordStr=" << wordStr << endl;
- uint64_t probingId = iterSource->first;
+ const Factor *factor = vocab.AddFactor(wordStr);
- SourceVocabMap::value_type entry(factor, probingId);
- m_sourceVocabMap.insert(entry);
+ uint64_t probingId = iterSource->first;
+ size_t factorId = factor->GetId();
+ if (factorId >= m_sourceVocab.size()) {
+ m_sourceVocab.resize(factorId + 1, m_unkId);
+ }
+ m_sourceVocab[factorId] = probingId;
}
// target vocab
- const std::map<unsigned int, std::string> &probingVocab = m_engine->getVocab();
- std::map<unsigned int, std::string>::const_iterator iter;
- for (iter = probingVocab.begin(); iter != probingVocab.end(); ++iter) {
- const string &wordStr = iter->second;
- const Factor *factor = FactorCollection::Instance().AddFactor(wordStr);
+ InputFileStream targetVocabStrme(m_filePath + "/TargetVocab.dat");
+ string line;
+ while (getline(targetVocabStrme, line)) {
+ vector<string> toks = Tokenize(line, "\t");
+ UTIL_THROW_IF2(toks.size() != 2, string("Incorrect format:") + line + "\n");
+
+ //cerr << "wordStr=" << toks[0] << endl;
+
+ const Factor *factor = vocab.AddFactor(toks[0]);
+ uint32_t probingId = Scan<uint32_t>(toks[1]);
+
+ if (probingId >= m_targetVocab.size()) {
+ m_targetVocab.resize(probingId + 1);
+ }
+
+ m_targetVocab[probingId] = factor;
+ }
+
+ // alignments
+ CreateAlignmentMap(m_filePath + "/Alignments.dat");
- unsigned int probingId = iter->first;
+ // memory mapped file to tps
+ string filePath = m_filePath + "/TargetColl.dat";
+ file.open(filePath.c_str());
+ if (!file.is_open()) {
+ throw "Couldn't open file ";
+ }
+
+ data = file.data();
+ //size_t size = file.size();
+
+ // cache
+ //CreateCache(system);
- TargetVocabMap::value_type entry(factor, probingId);
- m_vocabMap.insert(entry);
+}
+void ProbingPT::CreateAlignmentMap(const std::string path)
+{
+ const std::vector< std::vector<unsigned char> > &probingAlignColl = m_engine->getAlignments();
+ m_aligns.resize(probingAlignColl.size(), NULL);
+
+ for (size_t i = 0; i < probingAlignColl.size(); ++i) {
+ AlignmentInfo::CollType aligns;
+
+ const std::vector<unsigned char> &probingAligns = probingAlignColl[i];
+ for (size_t j = 0; j < probingAligns.size(); j += 2) {
+ size_t startPos = probingAligns[j];
+ size_t endPos = probingAligns[j+1];
+ //cerr << "startPos=" << startPos << " " << endPos << endl;
+ aligns.insert(std::pair<size_t,size_t>(startPos, endPos));
+ }
+
+ const AlignmentInfo *align = AlignmentInfoCollection::Instance().Add(aligns);
+ m_aligns[i] = align;
+ //cerr << "align=" << align->Debug(system) << endl;
}
}
void ProbingPT::InitializeForInput(ttasksptr const& ttask)
{
- ReduceCache();
+
}
void ProbingPT::GetTargetPhraseCollectionBatch(const InputPathList &inputPathQueue) const
{
- CacheColl &cache = GetCache();
-
InputPathList::const_iterator iter;
for (iter = inputPathQueue.begin(); iter != inputPathQueue.end(); ++iter) {
InputPath &inputPath = **iter;
@@ -82,12 +133,6 @@ void ProbingPT::GetTargetPhraseCollectionBatch(const InputPathList &inputPathQue
}
TargetPhraseCollection::shared_ptr tpColl = CreateTargetPhrase(sourcePhrase);
-
- // add target phrase to phrase-table cache
- size_t hash = hash_value(sourcePhrase);
- std::pair<TargetPhraseCollection::shared_ptr , clock_t> value(tpColl, clock());
- cache[hash] = value;
-
inputPath.SetTargetPhrases(*this, tpColl, NULL);
}
}
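Note: Load() replaces the old boost::bimap vocabularies with flat vectors: m_sourceVocab maps a Moses factor id to a probing id (with m_unkId for factors the table has never seen), and m_targetVocab maps a probing id back to a Factor*. A minimal sketch of the source-side lookup this enables (GetSourceProbingId is a hypothetical helper; the committed code does the equivalent inside ConvertToProbingSourcePhrase()):

  // Sketch: O(1) factor-id -> probing-id lookup with unknown-word fallback.
  uint64_t GetSourceProbingId(const Moses::Factor *factor,
                              const std::vector<uint64_t> &sourceVocab,
                              uint64_t unkId)
  {
    size_t factorId = factor->GetId();
    if (factorId >= sourceVocab.size()) return unkId; // factor added after Load()
    return sourceVocab[factorId];                     // may itself be unkId
  }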
diff --git a/moses/TranslationModel/ProbingPT/ProbingPT.h b/moses/TranslationModel/ProbingPT/ProbingPT.h
index 4e7ab02c6..3b5dfc895 100644
--- a/moses/TranslationModel/ProbingPT/ProbingPT.h
+++ b/moses/TranslationModel/ProbingPT/ProbingPT.h
@@ -1,17 +1,17 @@
#pragma once
-
+#include <boost/iostreams/device/mapped_file.hpp>
#include <boost/bimap.hpp>
#include "../PhraseDictionary.h"
-class QueryEngine;
-class target_text;
namespace Moses
{
class ChartParser;
class ChartCellCollectionBase;
class ChartRuleLookupManager;
+class QueryEngine;
+class target_text;
class ProbingPT : public PhraseDictionary
{
@@ -39,12 +39,16 @@ public:
protected:
QueryEngine *m_engine;
+ uint64_t m_unkId;
- typedef boost::bimap<const Factor *, uint64_t> SourceVocabMap;
- mutable SourceVocabMap m_sourceVocabMap;
+ std::vector<uint64_t> m_sourceVocab; // factor id -> pt id
+ std::vector<const Factor*> m_targetVocab; // pt id -> factor*
+ std::vector<const AlignmentInfo*> m_aligns;
- typedef boost::bimap<const Factor *, unsigned int> TargetVocabMap;
- mutable TargetVocabMap m_vocabMap;
+ boost::iostreams::mapped_file_source file;
+ const char *data;
+
+ void CreateAlignmentMap(const std::string path);
TargetPhraseCollection::shared_ptr CreateTargetPhrase(const Phrase &sourcePhrase) const;
TargetPhrase *CreateTargetPhrase(const Phrase &sourcePhrase, const target_text &probingTargetPhrase) const;
@@ -53,7 +57,6 @@ protected:
std::vector<uint64_t> ConvertToProbingSourcePhrase(const Phrase &sourcePhrase, bool &ok) const;
- uint64_t m_unkId;
};
} // namespace Moses
diff --git a/moses/TranslationModel/ProbingPT/hash.cpp b/moses/TranslationModel/ProbingPT/hash.cpp
index 8945649ef..27a64b129 100644
--- a/moses/TranslationModel/ProbingPT/hash.cpp
+++ b/moses/TranslationModel/ProbingPT/hash.cpp
@@ -1,5 +1,11 @@
+#include <iostream>
#include "hash.hh"
+using namespace std;
+
+namespace Moses
+{
+
uint64_t getHash(StringPiece text)
{
std::size_t len = text.size();
@@ -7,24 +13,32 @@ uint64_t getHash(StringPiece text)
return key;
}
-std::vector<uint64_t> getVocabIDs(StringPiece textin)
+std::vector<uint64_t> getVocabIDs(const StringPiece &textin)
{
//Tokenize
std::vector<uint64_t> output;
- util::TokenIter<util::SingleCharacter> it(textin, util::SingleCharacter(' '));
+ util::TokenIter<util::SingleCharacter> itWord(textin, util::SingleCharacter(' '));
+
+ while (itWord) {
+ StringPiece word = *itWord;
+ uint64_t id = 0;
+
+ util::TokenIter<util::SingleCharacter> itFactor(word, util::SingleCharacter('|'));
+ while (itFactor) {
+ StringPiece factor = *itFactor;
+ //cerr << "factor=" << factor << endl;
- while(it) {
- output.push_back(getHash(*it));
- it++;
+ id += getHash(factor);
+ itFactor++;
+ }
+
+ output.push_back(id);
+ itWord++;
}
return output;
}
-uint64_t getVocabID(std::string candidate)
-{
- std::size_t len = candidate.length();
- uint64_t key = util::MurmurHashNative(candidate.c_str(), len);
- return key;
-}
\ No newline at end of file
+}
+
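Note: getVocabIDs() now understands factored words: each token is split on '|' and the MurmurHash of every factor is summed into a single id, so "haus|NN" hashes consistently with its factors. A small illustration using the functions above (the input string is made up):

  // ids[0] == getHash("das")
  // ids[1] == getHash("haus") + getHash("NN")
  std::vector<uint64_t> ids = Moses::getVocabIDs(StringPiece("das haus|NN"));

Since addition commutes, the per-word id is independent of factor order; whether that matters in practice depends on the factor setup.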
diff --git a/moses/TranslationModel/ProbingPT/hash.hh b/moses/TranslationModel/ProbingPT/hash.hh
index 607238ae1..f218ad9da 100644
--- a/moses/TranslationModel/ProbingPT/hash.hh
+++ b/moses/TranslationModel/ProbingPT/hash.hh
@@ -6,9 +6,12 @@
#include "util/tokenize_piece.hh"
#include <vector>
+namespace Moses
+{
+
//Gets the MurmurHash for a given string
uint64_t getHash(StringPiece text);
-std::vector<uint64_t> getVocabIDs(StringPiece textin);
+std::vector<uint64_t> getVocabIDs(const StringPiece &textin);
-uint64_t getVocabID(std::string candidate);
\ No newline at end of file
+}
diff --git a/moses/TranslationModel/ProbingPT/huffmanish.cpp b/moses/TranslationModel/ProbingPT/huffmanish.cpp
deleted file mode 100644
index 534fd04d1..000000000
--- a/moses/TranslationModel/ProbingPT/huffmanish.cpp
+++ /dev/null
@@ -1,451 +0,0 @@
-#include "huffmanish.hh"
-
-Huffman::Huffman (const char * filepath)
-{
- //Read the file
- util::FilePiece filein(filepath);
-
- //Init uniq_lines to zero;
- uniq_lines = 0;
-
- line_text prev_line; //Check for unique lines.
- int num_lines = 0 ;
-
- while (true) {
- line_text new_line;
-
- num_lines++;
-
- try {
- //Process line read
- new_line = splitLine(filein.ReadLine());
- count_elements(new_line); //Counts the number of elements, adds new and increments counters.
-
- } catch (util::EndOfFileException e) {
- std::cerr << "Unique entries counted: ";
- break;
- }
-
- if (new_line.source_phrase == prev_line.source_phrase) {
- continue;
- } else {
- uniq_lines++;
- prev_line = new_line;
- }
- }
-
- std::cerr << uniq_lines << std::endl;
-}
-
-void Huffman::count_elements(line_text linein)
-{
- //For target phrase:
- util::TokenIter<util::SingleCharacter> it(linein.target_phrase, util::SingleCharacter(' '));
- while (it) {
- //Check if we have that entry
- std::map<std::string, unsigned int>::iterator mapiter;
- mapiter = target_phrase_words.find(it->as_string());
-
- if (mapiter != target_phrase_words.end()) {
- //If the element is found, increment the count.
- mapiter->second++;
- } else {
- //Else create a new entry;
- target_phrase_words.insert(std::pair<std::string, unsigned int>(it->as_string(), 1));
- }
- it++;
- }
-
- //For word allignment 1
- std::map<std::vector<unsigned char>, unsigned int>::iterator mapiter3;
- std::vector<unsigned char> numbers = splitWordAll1(linein.word_align);
- mapiter3 = word_all1.find(numbers);
-
- if (mapiter3 != word_all1.end()) {
- //If the element is found, increment the count.
- mapiter3->second++;
- } else {
- //Else create a new entry;
- word_all1.insert(std::pair<std::vector<unsigned char>, unsigned int>(numbers, 1));
- }
-
-}
-
-//Assigns huffman values for each unique element
-void Huffman::assign_values()
-{
- //First create vectors for all maps so that we could sort them later.
-
- //Create a vector for target phrases
- for(std::map<std::string, unsigned int>::iterator it = target_phrase_words.begin(); it != target_phrase_words.end(); it++ ) {
- target_phrase_words_counts.push_back(*it);
- }
- //Sort it
- std::sort(target_phrase_words_counts.begin(), target_phrase_words_counts.end(), sort_pair());
-
- //Create a vector for word allignments 1
- for(std::map<std::vector<unsigned char>, unsigned int>::iterator it = word_all1.begin(); it != word_all1.end(); it++ ) {
- word_all1_counts.push_back(*it);
- }
- //Sort it
- std::sort(word_all1_counts.begin(), word_all1_counts.end(), sort_pair_vec());
-
-
- //Afterwards we assign a value for each phrase, starting from 1, as zero is reserved for delimiter
- unsigned int i = 1; //huffman code
- for(std::vector<std::pair<std::string, unsigned int> >::iterator it = target_phrase_words_counts.begin();
- it != target_phrase_words_counts.end(); it++) {
- target_phrase_huffman.insert(std::pair<std::string, unsigned int>(it->first, i));
- i++; //Go to the next huffman code
- }
-
- i = 1; //Reset i for the next map
- for(std::vector<std::pair<std::vector<unsigned char>, unsigned int> >::iterator it = word_all1_counts.begin();
- it != word_all1_counts.end(); it++) {
- word_all1_huffman.insert(std::pair<std::vector<unsigned char>, unsigned int>(it->first, i));
- i++; //Go to the next huffman code
- }
-
- //After lookups are produced, clear some memory usage of objects not needed anymore.
- target_phrase_words.clear();
- word_all1.clear();
-
- target_phrase_words_counts.clear();
- word_all1_counts.clear();
-
- std::cerr << "Finished generating huffman codes." << std::endl;
-
-}
-
-void Huffman::serialize_maps(const char * dirname)
-{
- //Note that directory name should exist.
- std::string basedir(dirname);
- std::string target_phrase_path(basedir + "/target_phrases");
- std::string probabilities_path(basedir + "/probs");
- std::string word_all1_path(basedir + "/Wall1");
-
- //Target phrase
- std::ofstream os (target_phrase_path.c_str(), std::ios::binary);
- boost::archive::text_oarchive oarch(os);
- oarch << lookup_target_phrase;
- os.close();
-
- //Word all1
- std::ofstream os2 (word_all1_path.c_str(), std::ios::binary);
- boost::archive::text_oarchive oarch2(os2);
- oarch2 << lookup_word_all1;
- os2.close();
-}
-
-std::vector<unsigned char> Huffman::full_encode_line(line_text line)
-{
- return vbyte_encode_line((encode_line(line)));
-}
-
-std::vector<unsigned int> Huffman::encode_line(line_text line)
-{
- std::vector<unsigned int> retvector;
-
- //Get target_phrase first.
- util::TokenIter<util::SingleCharacter> it(line.target_phrase, util::SingleCharacter(' '));
- while (it) {
- retvector.push_back(target_phrase_huffman.find(it->as_string())->second);
- it++;
- }
- //Add a zero;
- retvector.push_back(0);
-
- //Get probabilities. Reinterpreting the float bytes as unsgined int.
- util::TokenIter<util::SingleCharacter> probit(line.prob, util::SingleCharacter(' '));
- while (probit) {
- //Sometimes we have too big floats to handle, so first convert to double
- double tempnum = atof(probit->data());
- float num = (float)tempnum;
- retvector.push_back(reinterpret_float(&num));
- probit++;
- }
- //Add a zero;
- retvector.push_back(0);
-
-
- //Get Word allignments
- retvector.push_back(word_all1_huffman.find(splitWordAll1(line.word_align))->second);
- retvector.push_back(0);
-
- return retvector;
-}
-
-void Huffman::produce_lookups()
-{
- //basically invert every map that we have
- for(std::map<std::string, unsigned int>::iterator it = target_phrase_huffman.begin(); it != target_phrase_huffman.end(); it++ ) {
- lookup_target_phrase.insert(std::pair<unsigned int, std::string>(it->second, it->first));
- }
-
- for(std::map<std::vector<unsigned char>, unsigned int>::iterator it = word_all1_huffman.begin(); it != word_all1_huffman.end(); it++ ) {
- lookup_word_all1.insert(std::pair<unsigned int, std::vector<unsigned char> >(it->second, it->first));
- }
-
-}
-
-HuffmanDecoder::HuffmanDecoder (const char * dirname)
-{
- //Read the maps from disk
-
- //Note that directory name should exist.
- std::string basedir(dirname);
- std::string target_phrase_path(basedir + "/target_phrases");
- std::string word_all1_path(basedir + "/Wall1");
-
- //Target phrases
- std::ifstream is (target_phrase_path.c_str(), std::ios::binary);
- boost::archive::text_iarchive iarch(is);
- iarch >> lookup_target_phrase;
- is.close();
-
- //Word allignment 1
- std::ifstream is2 (word_all1_path.c_str(), std::ios::binary);
- boost::archive::text_iarchive iarch2(is2);
- iarch2 >> lookup_word_all1;
- is2.close();
-
-}
-
-HuffmanDecoder::HuffmanDecoder (std::map<unsigned int, std::string> * lookup_target,
- std::map<unsigned int, std::vector<unsigned char> > * lookup_word1)
-{
- lookup_target_phrase = *lookup_target;
- lookup_word_all1 = *lookup_word1;
-}
-
-std::vector<target_text> HuffmanDecoder::full_decode_line (std::vector<unsigned char> lines, int num_scores)
-{
- std::vector<target_text> retvector; //All target phrases
- std::vector<unsigned int> decoded_lines = vbyte_decode_line(lines); //All decoded lines
- std::vector<unsigned int>::iterator it = decoded_lines.begin(); //Iterator for them
- std::vector<unsigned int> current_target_phrase; //Current target phrase decoded
-
- short zero_count = 0; //Count home many zeroes we have met. so far. Every 3 zeroes mean a new target phrase.
- while(it != decoded_lines.end()) {
- if (zero_count == 1) {
- //We are extracting scores. we know how many scores there are so we can push them
- //to the vector. This is done in case any of the scores is 0, because it would mess
- //up the state machine.
- for (int i = 0; i < num_scores; i++) {
- current_target_phrase.push_back(*it);
- it++;
- }
- }
-
- if (zero_count == 3) {
- //We have finished with this entry, decode it, and add it to the retvector.
- retvector.push_back(decode_line(current_target_phrase, num_scores));
- current_target_phrase.clear(); //Clear the current target phrase and the zero_count
- zero_count = 0; //So that we can reuse them for the next target phrase
- }
- //Add to the next target_phrase, number by number.
- current_target_phrase.push_back(*it);
- if (*it == 0) {
- zero_count++;
- }
- it++; //Go to the next word/symbol
- }
- //Don't forget the last remaining line!
- if (zero_count == 3) {
- //We have finished with this entry, decode it, and add it to the retvector.
- retvector.push_back(decode_line(current_target_phrase, num_scores));
- current_target_phrase.clear(); //Clear the current target phrase and the zero_count
- zero_count = 0; //So that we can reuse them for the next target phrase
- }
-
- return retvector;
-
-}
-
-target_text HuffmanDecoder::decode_line (std::vector<unsigned int> input, int num_scores)
-{
- //demo decoder
- target_text ret;
- //Split everything
- std::vector<unsigned int> target_phrase;
- std::vector<unsigned int> probs;
- unsigned int wAll;
-
- //Split the line into the proper arrays
- short num_zeroes = 0;
- int counter = 0;
- while (num_zeroes < 3) {
- unsigned int num = input[counter];
- if (num == 0) {
- num_zeroes++;
- } else if (num_zeroes == 0) {
- target_phrase.push_back(num);
- } else if (num_zeroes == 1) {
- //Push exactly num_scores scores
- for (int i = 0; i < num_scores; i++) {
- probs.push_back(num);
- counter++;
- num = input[counter];
- }
- continue;
- } else if (num_zeroes == 2) {
- wAll = num;
- }
- counter++;
- }
-
- ret.target_phrase = target_phrase;
- ret.word_all1 = lookup_word_all1.find(wAll)->second;
-
- //Decode probabilities
- for (std::vector<unsigned int>::iterator it = probs.begin(); it != probs.end(); it++) {
- ret.prob.push_back(reinterpret_uint(&(*it)));
- }
-
- return ret;
-
-}
-
-inline std::string HuffmanDecoder::getTargetWordFromID(unsigned int id)
-{
- return lookup_target_phrase.find(id)->second;
-}
-
-std::string HuffmanDecoder::getTargetWordsFromIDs(std::vector<unsigned int> ids)
-{
- std::string returnstring;
- for (std::vector<unsigned int>::iterator it = ids.begin(); it != ids.end(); it++) {
- returnstring.append(getTargetWordFromID(*it) + " ");
- }
-
- return returnstring;
-}
-
-inline std::string getTargetWordFromID(unsigned int id, std::map<unsigned int, std::string> * lookup_target_phrase)
-{
- return lookup_target_phrase->find(id)->second;
-}
-
-std::string getTargetWordsFromIDs(std::vector<unsigned int> ids, std::map<unsigned int, std::string> * lookup_target_phrase)
-{
- std::string returnstring;
- for (std::vector<unsigned int>::iterator it = ids.begin(); it != ids.end(); it++) {
- returnstring.append(getTargetWordFromID(*it, lookup_target_phrase) + " ");
- }
-
- return returnstring;
-}
-
-/*Those functions are used to more easily store the floats in the binary phrase table
- We convert the float unsinged int so that it is the same as our other values and we can
- apply variable byte encoding on top of it.*/
-
-inline unsigned int reinterpret_float(float * num)
-{
- unsigned int * converted_num;
- converted_num = reinterpret_cast<unsigned int *>(num);
- return *converted_num;
-}
-
-inline float reinterpret_uint(unsigned int * num)
-{
- float * converted_num;
- converted_num = reinterpret_cast<float *>(num);
- return *converted_num;
-}
-
-/*Mostly taken from stackoverflow, http://stackoverflow.com/questions/5858646/optimizing-variable-length-encoding
-and modified in order to return a vector of chars. Implements ULEB128 or variable byte encoding.
-This is highly optimized version with unrolled loop */
-inline std::vector<unsigned char> vbyte_encode(unsigned int num)
-{
- //Determine how many bytes we are going to take.
- short size;
- std::vector<unsigned char> byte_vector;
-
- if (num < 0x00000080U) {
- size = 1;
- byte_vector.reserve(size);
- goto b1;
- }
- if (num < 0x00004000U) {
- size = 2;
- byte_vector.reserve(size);
- goto b2;
- }
- if (num < 0x00200000U) {
- size = 3;
- byte_vector.reserve(size);
- goto b3;
- }
- if (num < 0x10000000U) {
- size = 4;
- byte_vector.reserve(size);
- goto b4;
- }
- size = 5;
- byte_vector.reserve(size);
-
-
- //Now proceed with the encoding.
- byte_vector.push_back((num & 0x7f) | 0x80);
- num >>= 7;
-b4:
- byte_vector.push_back((num & 0x7f) | 0x80);
- num >>= 7;
-b3:
- byte_vector.push_back((num & 0x7f) | 0x80);
- num >>= 7;
-b2:
- byte_vector.push_back((num & 0x7f) | 0x80);
- num >>= 7;
-b1:
- byte_vector.push_back(num);
-
- return byte_vector;
-}
-
-std::vector<unsigned int> vbyte_decode_line(std::vector<unsigned char> line)
-{
- std::vector<unsigned int> huffman_line;
- std::vector<unsigned char> current_num;
-
- for (std::vector<unsigned char>::iterator it = line.begin(); it != line.end(); it++) {
- current_num.push_back(*it);
- if ((*it >> 7) != 1) {
- //We don't have continuation in the next bit
- huffman_line.push_back(bytes_to_int(current_num));
- current_num.clear();
- }
- }
- return huffman_line;
-}
-
-inline unsigned int bytes_to_int(std::vector<unsigned char> number)
-{
- unsigned int retvalue = 0;
- std::vector<unsigned char>::iterator it = number.begin();
- unsigned char shift = 0; //By how many bits to shift
-
- while (it != number.end()) {
- retvalue |= (*it & 0x7f) << shift;
- shift += 7;
- it++;
- }
-
- return retvalue;
-}
-
-std::vector<unsigned char> vbyte_encode_line(std::vector<unsigned int> line)
-{
- std::vector<unsigned char> retvec;
-
- //For each unsigned int in the line, vbyte encode it and add it to a vector of unsigned chars.
- for (std::vector<unsigned int>::iterator it = line.begin(); it != line.end(); it++) {
- std::vector<unsigned char> vbyte_encoded = vbyte_encode(*it);
- retvec.insert(retvec.end(), vbyte_encoded.begin(), vbyte_encoded.end());
- }
-
- return retvec;
-}
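Note: for reference, the deleted vbyte_encode()/vbyte_decode_line() pair implemented variable-byte (ULEB128-style) coding, least-significant group first: each byte carries 7 payload bits, and the high bit marks a continuation. Worked example: 300 = 0b100101100; the low 7 bits 0101100 (0x2C) go out first with the continuation bit set, giving 0xAC, followed by 0x02 as the final byte; bytes_to_int() then reassembles 0x2C | (0x02 << 7) = 300.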
diff --git a/moses/TranslationModel/ProbingPT/huffmanish.hh b/moses/TranslationModel/ProbingPT/huffmanish.hh
deleted file mode 100644
index 0970a9e68..000000000
--- a/moses/TranslationModel/ProbingPT/huffmanish.hh
+++ /dev/null
@@ -1,112 +0,0 @@
-#pragma once
-
-//Huffman encodes a line and also produces the vocabulary ids
-#include "hash.hh"
-#include "line_splitter.hh"
-#include <cstdio>
-#include <fstream>
-#include <iostream>
-#include <sstream>
-#include <boost/serialization/serialization.hpp>
-#include <boost/serialization/vector.hpp>
-#include <boost/serialization/map.hpp>
-#include <boost/archive/text_iarchive.hpp>
-#include <boost/archive/text_oarchive.hpp>
-
-//Sorting for the second
-struct sort_pair {
- bool operator()(const std::pair<std::string, unsigned int> &left, const std::pair<std::string, unsigned int> &right) {
- return left.second > right.second; //This puts biggest numbers first.
- }
-};
-
-struct sort_pair_vec {
- bool operator()(const std::pair<std::vector<unsigned char>, unsigned int> &left, const std::pair<std::vector<unsigned char>, unsigned int> &right) {
- return left.second > right.second; //This puts biggest numbers first.
- }
-};
-
-class Huffman
-{
- unsigned long uniq_lines; //Unique lines in the file.
-
- //Containers used when counting the occurence of a given phrase
- std::map<std::string, unsigned int> target_phrase_words;
- std::map<std::vector<unsigned char>, unsigned int> word_all1;
-
- //Same containers as vectors, for sorting
- std::vector<std::pair<std::string, unsigned int> > target_phrase_words_counts;
- std::vector<std::pair<std::vector<unsigned char>, unsigned int> > word_all1_counts;
-
- //Huffman maps
- std::map<std::string, unsigned int> target_phrase_huffman;
- std::map<std::vector<unsigned char>, unsigned int> word_all1_huffman;
-
- //inverted maps
- std::map<unsigned int, std::string> lookup_target_phrase;
- std::map<unsigned int, std::vector<unsigned char> > lookup_word_all1;
-
-public:
- Huffman (const char *);
- void count_elements (line_text line);
- void assign_values();
- void serialize_maps(const char * dirname);
- void produce_lookups();
-
- std::vector<unsigned int> encode_line(line_text line);
-
- //encode line + variable byte ontop
- std::vector<unsigned char> full_encode_line(line_text line);
-
- //Getters
- const std::map<unsigned int, std::string> get_target_lookup_map() const {
- return lookup_target_phrase;
- }
- const std::map<unsigned int, std::vector<unsigned char> > get_word_all1_lookup_map() const {
- return lookup_word_all1;
- }
-
- unsigned long getUniqLines() {
- return uniq_lines;
- }
-};
-
-class HuffmanDecoder
-{
- std::map<unsigned int, std::string> lookup_target_phrase;
- std::map<unsigned int, std::vector<unsigned char> > lookup_word_all1;
-
-public:
- HuffmanDecoder (const char *);
- HuffmanDecoder (std::map<unsigned int, std::string> *, std::map<unsigned int, std::vector<unsigned char> > *);
-
- //Getters
- const std::map<unsigned int, std::string> get_target_lookup_map() const {
- return lookup_target_phrase;
- }
- const std::map<unsigned int, std::vector<unsigned char> > get_word_all1_lookup_map() const {
- return lookup_word_all1;
- }
-
- inline std::string getTargetWordFromID(unsigned int id);
-
- std::string getTargetWordsFromIDs(std::vector<unsigned int> ids);
-
- target_text decode_line (std::vector<unsigned int> input, int num_scores);
-
- //Variable byte decodes a all target phrases contained here and then passes them to decode_line
- std::vector<target_text> full_decode_line (std::vector<unsigned char> lines, int num_scores);
-};
-
-std::string getTargetWordsFromIDs(std::vector<unsigned int> ids, std::map<unsigned int, std::string> * lookup_target_phrase);
-
-inline std::string getTargetWordFromID(unsigned int id, std::map<unsigned int, std::string> * lookup_target_phrase);
-
-inline unsigned int reinterpret_float(float * num);
-
-inline float reinterpret_uint(unsigned int * num);
-
-std::vector<unsigned char> vbyte_encode_line(std::vector<unsigned int> line);
-inline std::vector<unsigned char> vbyte_encode(unsigned int num);
-std::vector<unsigned int> vbyte_decode_line(std::vector<unsigned char> line);
-inline unsigned int bytes_to_int(std::vector<unsigned char> number);
diff --git a/moses/TranslationModel/ProbingPT/line_splitter.cpp b/moses/TranslationModel/ProbingPT/line_splitter.cpp
index 1eeeb1899..cb9e47fec 100644
--- a/moses/TranslationModel/ProbingPT/line_splitter.cpp
+++ b/moses/TranslationModel/ProbingPT/line_splitter.cpp
@@ -1,66 +1,92 @@
#include "line_splitter.hh"
-line_text splitLine(StringPiece textin)
+namespace Moses
{
- const char delim[] = " ||| ";
+
+line_text splitLine(const StringPiece &textin, bool scfg)
+{
+ const char delim[] = "|||";
line_text output;
//Tokenize
util::TokenIter<util::MultiCharacter> it(textin, util::MultiCharacter(delim));
//Get source phrase
- output.source_phrase = *it;
+ output.source_phrase = Trim(*it);
+ //std::cerr << "output.source_phrase=" << output.source_phrase << "AAAA" << std::endl;
//Get target_phrase
it++;
- output.target_phrase = *it;
+ output.target_phrase = Trim(*it);
+ //std::cerr << "output.target_phrase=" << output.target_phrase << "AAAA" << std::endl;
+
+ if (scfg) {
+ /*
+ std::cerr << "output.source_phrase=" << output.source_phrase << std::endl;
+ std::cerr << "output.target_phrase=" << output.target_phrase << std::endl;
+ reformatSCFG(output);
+ std::cerr << "output.source_phrase=" << output.source_phrase << std::endl;
+ std::cerr << "output.target_phrase=" << output.target_phrase << std::endl;
+ */
+ }
//Get probabilities
it++;
- output.prob = *it;
+ output.prob = Trim(*it);
+ //std::cerr << "output.prob=" << output.prob << "AAAA" << std::endl;
//Get WordAllignment
it++;
if (it == util::TokenIter<util::MultiCharacter>::end()) return output;
- output.word_align = *it;
+ output.word_align = Trim(*it);
+ //std::cerr << "output.word_align=" << output.word_align << "AAAA" << std::endl;
//Get count
it++;
if (it == util::TokenIter<util::MultiCharacter>::end()) return output;
- output.counts = *it;
+ output.counts = Trim(*it);
+ //std::cerr << "output.counts=" << output.counts << "AAAA" << std::endl;
//Get sparse_score
it++;
if (it == util::TokenIter<util::MultiCharacter>::end()) return output;
- output.sparse_score = *it;
+ output.sparse_score = Trim(*it);
+ //std::cerr << "output.sparse_score=" << output.sparse_score << "AAAA" << std::endl;
//Get property
it++;
if (it == util::TokenIter<util::MultiCharacter>::end()) return output;
- output.property = *it;
+ output.property = Trim(*it);
+ //std::cerr << "output.property=" << output.property << "AAAA" << std::endl;
return output;
}
-std::vector<unsigned char> splitWordAll1(StringPiece textin)
+std::vector<unsigned char> splitWordAll1(const StringPiece &textin)
{
const char delim[] = " ";
const char delim2[] = "-";
std::vector<unsigned char> output;
+ //Case with no word alignments.
+ if (textin.size() == 0) {
+ return output;
+ }
+
//Split on space
util::TokenIter<util::MultiCharacter> it(textin, util::MultiCharacter(delim));
//For each int
while (it) {
//Split on dash (-)
- util::TokenIter<util::MultiCharacter> itInner(*it, util::MultiCharacter(delim2));
+ util::TokenIter<util::MultiCharacter> itInner(*it,
+ util::MultiCharacter(delim2));
//Insert the two entries in the vector. User will read entry 0 and 1 to get the first,
//2 and 3 for second etc. Use unsigned char instead of int to save space, as
//word alignments are all very small numbers that fit in a single byte
- output.push_back((unsigned char)(atoi(itInner->data())));
+ output.push_back((unsigned char) (atoi(itInner->data())));
itInner++;
- output.push_back((unsigned char)(atoi(itInner->data())));
+ output.push_back((unsigned char) (atoi(itInner->data())));
it++;
}
@@ -68,3 +94,10 @@ std::vector<unsigned char> splitWordAll1(StringPiece textin)
}
+void reformatSCFG(line_text &output)
+{
+
+}
+
+}
+
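Note: splitLine() splits a standard Moses phrase-table line on "|||", now trimming surrounding whitespace rather than splitting on the exact string " ||| ". For an illustrative line:

  das Haus ||| the house ||| 0.8 0.2 0.5 0.4 ||| 0-0 1-1 ||| 10 8 8

the fields come out as source_phrase="das Haus", target_phrase="the house", prob="0.8 0.2 0.5 0.4", word_align="0-0 1-1", counts="10 8 8". splitWordAll1("0-0 1-1") then yields the byte vector {0, 0, 1, 1}, and the new empty-input check returns an empty vector for rules that carry no alignment field.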
diff --git a/moses/TranslationModel/ProbingPT/line_splitter.hh b/moses/TranslationModel/ProbingPT/line_splitter.hh
index 2cb9a3c8c..cec0a5f45 100644
--- a/moses/TranslationModel/ProbingPT/line_splitter.hh
+++ b/moses/TranslationModel/ProbingPT/line_splitter.hh
@@ -9,8 +9,12 @@
#include "util/tokenize_piece.hh"
#include <vector>
+namespace Moses
+{
+
//Struct for holding processed line
-struct line_text {
+struct line_text
+{
StringPiece source_phrase;
StringPiece target_phrase;
StringPiece prob;
@@ -18,16 +22,38 @@ struct line_text {
StringPiece counts;
StringPiece sparse_score;
StringPiece property;
+ std::string property_to_be_binarized;
};
//Struct for holding processed line
-struct target_text {
+struct target_text
+{
std::vector<unsigned int> target_phrase;
std::vector<float> prob;
- std::vector<unsigned char> word_all1;
+ std::vector<size_t> word_align_term;
+ std::vector<size_t> word_align_non_term;
+ std::vector<char> counts;
+ std::vector<char> sparse_score;
+ std::vector<char> property;
+
+ /*
+ void Reset()
+ {
+ target_phrase.clear();
+ prob.clear();
+ word_all1.clear();
+ counts.clear();
+ sparse_score.clear();
+ property.clear();
+ }
+ */
};
//Ask if it's better to have it receive a pointer to a line_text struct
-line_text splitLine(StringPiece textin);
+line_text splitLine(const StringPiece &textin, bool scfg);
+void reformatSCFG(line_text &output);
+
+std::vector<unsigned char> splitWordAll1(const StringPiece &textin);
+
+}
-std::vector<unsigned char> splitWordAll1(StringPiece textin);
diff --git a/moses/TranslationModel/ProbingPT/probing_hash_utils.cpp b/moses/TranslationModel/ProbingPT/probing_hash_utils.cpp
index ca3e8f69f..f23f57d66 100644
--- a/moses/TranslationModel/ProbingPT/probing_hash_utils.cpp
+++ b/moses/TranslationModel/ProbingPT/probing_hash_utils.cpp
@@ -1,5 +1,8 @@
#include "probing_hash_utils.hh"
+namespace Moses
+{
+
//Read table from disk, return memory map location
char * readTable(const char * filename, size_t size)
{
@@ -13,7 +16,7 @@ char * readTable(const char * filename, size_t size)
exit(EXIT_FAILURE);
}
- map = (char *)mmap(0, size, PROT_READ, MAP_SHARED, fd, 0);
+ map = (char *) mmap(0, size, PROT_READ, MAP_SHARED, fd, 0);
if (map == MAP_FAILED) {
close(fd);
@@ -24,11 +27,24 @@ char * readTable(const char * filename, size_t size)
return map;
}
-
-void serialize_table(char *mem, size_t size, const char * filename)
+void serialize_table(char *mem, size_t size, const std::string &filename)
{
- std::ofstream os (filename, std::ios::binary);
- os.write((const char*)&mem[0], size);
+ std::ofstream os(filename.c_str(), std::ios::binary);
+ os.write((const char*) &mem[0], size);
os.close();
-}
\ No newline at end of file
+}
+
+uint64_t getKey(const uint64_t source_phrase[], size_t size)
+{
+ //TOO SLOW
+ //uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size());
+ uint64_t key = 0;
+ for (size_t i = 0; i < size; i++) {
+ key += (source_phrase[i] << i);
+ }
+ return key;
+}
+
+}
+
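Note: getKey() folds a phrase's vocab ids into one probing key with a position-dependent shift instead of the (commented-out, too slow) MurmurHash: key = sum over i of source_phrase[i] << i, which keeps the key order-sensitive where a plain sum would not be. For example:

  uint64_t ids[3] = {7, 9, 2};
  uint64_t key = Moses::getKey(ids, 3);  // 7 + (9 << 1) + (2 << 2) == 33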
diff --git a/moses/TranslationModel/ProbingPT/probing_hash_utils.hh b/moses/TranslationModel/ProbingPT/probing_hash_utils.hh
index de96e87a0..dcf0dbe25 100644
--- a/moses/TranslationModel/ProbingPT/probing_hash_utils.hh
+++ b/moses/TranslationModel/ProbingPT/probing_hash_utils.hh
@@ -7,31 +7,49 @@
#include <fcntl.h>
#include <fstream>
+namespace Moses
+{
+
+#define API_VERSION 15
//Hash table entry
-struct Entry {
- uint64_t key;
+struct Entry
+{
typedef uint64_t Key;
- unsigned int bytes_toread;
+ Key key;
- uint64_t GetKey() const {
+ Key GetKey() const
+ {
return key;
}
- void SetKey(uint64_t to) {
+ void SetKey(Key to)
+ {
key = to;
}
- uint64_t GetValue() const {
- return value;
- }
-
uint64_t value;
};
+#define NONE std::numeric_limits<uint64_t>::max()
+
//Define table
typedef util::ProbingHashTable<Entry, boost::hash<uint64_t> > Table;
-void serialize_table(char *mem, size_t size, const char * filename);
+void serialize_table(char *mem, size_t size, const std::string &filename);
char * readTable(const char * filename, size_t size);
+
+uint64_t getKey(const uint64_t source_phrase[], size_t size);
+
+struct TargetPhraseInfo
+{
+ uint32_t alignTerm;
+ uint32_t alignNonTerm;
+ uint16_t numWords;
+ uint16_t propLength;
+ uint16_t filler;
+};
+
+}
+
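Note: TargetPhraseInfo is the fixed-size header written before each target phrase in TargetColl.dat; the explicit filler member suggests the layout is meant to be exactly 16 bytes (two uint32_t plus three uint16_t is 14 bytes, rounded up to 16 by 4-byte alignment on common platforms). That assumption could be pinned down at compile time with, e.g.:

  #include <boost/static_assert.hpp>
  BOOST_STATIC_ASSERT(sizeof(Moses::TargetPhraseInfo) == 16); // layout assumption, not in the commit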
diff --git a/moses/TranslationModel/ProbingPT/quering.cpp b/moses/TranslationModel/ProbingPT/quering.cpp
index bd1d61a1e..ef980ef06 100644
--- a/moses/TranslationModel/ProbingPT/quering.cpp
+++ b/moses/TranslationModel/ProbingPT/quering.cpp
@@ -1,73 +1,80 @@
#include "quering.hh"
+#include "util/exception.hh"
-unsigned char * read_binary_file(const char * filename, size_t filesize)
-{
- //Get filesize
- int fd;
- unsigned char * map;
-
- fd = open(filename, O_RDONLY);
-
- if (fd == -1) {
- perror("Error opening file for reading");
- exit(EXIT_FAILURE);
- }
-
- map = (unsigned char *)mmap(0, filesize, PROT_READ, MAP_SHARED, fd, 0);
- if (map == MAP_FAILED) {
- close(fd);
- perror("Error mmapping the file");
- exit(EXIT_FAILURE);
- }
+using namespace std;
- return map;
-}
+namespace Moses
+{
-QueryEngine::QueryEngine(const char * filepath) : decoder(filepath)
+QueryEngine::QueryEngine(const char * filepath)
{
//Create filepaths
std::string basepath(filepath);
std::string path_to_hashtable = basepath + "/probing_hash.dat";
- std::string path_to_data_bin = basepath + "/binfile.dat";
std::string path_to_source_vocabid = basepath + "/source_vocabids";
+ std::string alignPath = basepath + "/Alignments.dat";
///Source phrase vocabids
- read_map(&source_vocabids, path_to_source_vocabid.c_str());
+ read_map(source_vocabids, path_to_source_vocabid.c_str());
- //Target phrase vocabIDs
- vocabids = decoder.get_target_lookup_map();
+ // alignments
+ read_alignments(alignPath);
//Read config file
+ boost::unordered_map<std::string, std::string> keyValue;
+
+ std::ifstream config((basepath + "/config").c_str());
std::string line;
- std::ifstream config ((basepath + "/config").c_str());
+ while (getline(config, line)) {
+ std::vector<std::string> toks = Tokenize(line, "\t");
+ UTIL_THROW_IF2(toks.size() != 2, "Wrong config format:" << line);
+ keyValue[ toks[0] ] = toks[1];
+ }
+
+ bool found;
//Check API version:
- getline(config, line);
- if (atoi(line.c_str()) != API_VERSION) {
- std::cerr << "The ProbingPT API has changed, please rebinarize your phrase tables." << std::endl;
+ int version;
+ found = Get(keyValue, "API_VERSION", version);
+ if (!found) {
+ std::cerr << "Old or corrupted version of ProbingPT. Please rebinarize your phrase tables." << std::endl;
+ }
+ else if (version != API_VERSION) {
+ std::cerr << "The ProbingPT API has changed. " << version << "!="
+ << API_VERSION << " Please rebinarize your phrase tables." << std::endl;
exit(EXIT_FAILURE);
}
+
//Get tablesize.
- getline(config, line);
- int tablesize = atoi(line.c_str());
+ int tablesize;
+ found = Get(keyValue, "uniq_entries", tablesize);
+ if (!found) {
+ std::cerr << "uniq_entries not found" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+
//Number of scores
- getline(config, line);
- num_scores = atoi(line.c_str());
- //do we have a reordering table
- getline(config, line);
- std::transform(line.begin(), line.end(), line.begin(), ::tolower); //Get the boolean in lowercase
- is_reordering = false;
- if (line == "true") {
- is_reordering = true;
- std::cerr << "WARNING. REORDERING TABLES NOT SUPPORTED YET." << std::endl;
+ found = Get(keyValue, "num_scores", num_scores);
+ if (!found) {
+ std::cerr << "num_scores not found" << std::endl;
+ exit(EXIT_FAILURE);
}
- config.close();
- //Mmap binary table
- struct stat filestatus;
- stat(path_to_data_bin.c_str(), &filestatus);
- binary_filesize = filestatus.st_size;
- binary_mmaped = read_binary_file(path_to_data_bin.c_str(), binary_filesize);
+  //How many scores from lex reordering models
+ found = Get(keyValue, "num_lex_scores", num_lex_scores);
+ if (!found) {
+ std::cerr << "num_lex_scores not found" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+
+ // have the scores been log() and FloorScore()?
+ found = Get(keyValue, "log_prob", logProb);
+ if (!found) {
+ std::cerr << "logProb not found" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+
+ config.close();
//Read hashtable
table_filesize = Table::Size(tablesize, 1.2);
@@ -81,118 +88,50 @@ QueryEngine::QueryEngine(const char * filepath) : decoder(filepath)
QueryEngine::~QueryEngine()
{
//Clear mmap content from memory.
- munmap(binary_mmaped, binary_filesize);
munmap(mem, table_filesize);
}
-std::pair<bool, std::vector<target_text> > QueryEngine::query(std::vector<uint64_t> source_phrase)
+uint64_t QueryEngine::getKey(uint64_t source_phrase[], size_t size) const
{
- bool found;
- std::vector<target_text> translation_entries;
- const Entry * entry;
//TOO SLOW
//uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size());
- uint64_t key = 0;
- for (int i = 0; i < source_phrase.size(); i++) {
- key += (source_phrase[i] << i);
- }
-
-
- found = table.Find(key, entry);
-
- if (found) {
- //The phrase that was searched for was found! We need to get the translation entries.
- //We will read the largest entry in bytes and then filter the unnecesarry with functions
- //from line_splitter
- uint64_t initial_index = entry -> GetValue();
- unsigned int bytes_toread = entry -> bytes_toread;
-
- //ASK HIEU FOR MORE EFFICIENT WAY TO DO THIS!
- std::vector<unsigned char> encoded_text; //Assign to the vector the relevant portion of the array.
- encoded_text.reserve(bytes_toread);
- for (int i = 0; i < bytes_toread; i++) {
- encoded_text.push_back(binary_mmaped[i+initial_index]);
- }
-
- //Get only the translation entries necessary
- translation_entries = decoder.full_decode_line(encoded_text, num_scores);
-
- }
-
- std::pair<bool, std::vector<target_text> > output (found, translation_entries);
-
- return output;
-
+  return Moses::getKey(source_phrase, size);
}
-std::pair<bool, std::vector<target_text> > QueryEngine::query(StringPiece source_phrase)
+std::pair<bool, uint64_t> QueryEngine::query(uint64_t key)
{
- bool found;
- std::vector<target_text> translation_entries;
- const Entry * entry;
- //Convert source frase to VID
- std::vector<uint64_t> source_phrase_vid = getVocabIDs(source_phrase);
- //TOO SLOW
- //uint64_t key = util::MurmurHashNative(&source_phrase_vid[0], source_phrase_vid.size());
- uint64_t key = 0;
- for (int i = 0; i < source_phrase_vid.size(); i++) {
- key += (source_phrase_vid[i] << i);
- }
-
- found = table.Find(key, entry);
-
-
- if (found) {
- //The phrase that was searched for was found! We need to get the translation entries.
- //We will read the largest entry in bytes and then filter the unnecesarry with functions
- //from line_splitter
- uint64_t initial_index = entry -> GetValue();
- unsigned int bytes_toread = entry -> bytes_toread;
- //At the end of the file we can't readd + largest_entry cause we get a segfault.
- std::cerr << "Entry size is bytes is: " << bytes_toread << std::endl;
-
- //ASK HIEU FOR MORE EFFICIENT WAY TO DO THIS!
- std::vector<unsigned char> encoded_text; //Assign to the vector the relevant portion of the array.
- encoded_text.reserve(bytes_toread);
- for (int i = 0; i < bytes_toread; i++) {
- encoded_text.push_back(binary_mmaped[i+initial_index]);
- }
-
- //Get only the translation entries necessary
- translation_entries = decoder.full_decode_line(encoded_text, num_scores);
+ std::pair<bool, uint64_t> ret;
+ const Entry * entry;
+ ret.first = table.Find(key, entry);
+ if (ret.first) {
+ ret.second = entry->value;
}
-
- std::pair<bool, std::vector<target_text> > output (found, translation_entries);
-
- return output;
-
+ return ret;
}
-void QueryEngine::printTargetInfo(std::vector<target_text> target_phrases)
+void QueryEngine::read_alignments(const std::string &alignPath)
{
- int entries = target_phrases.size();
+ std::ifstream strm(alignPath.c_str());
- for (int i = 0; i<entries; i++) {
- std::cout << "Entry " << i+1 << " of " << entries << ":" << std::endl;
- //Print text
- std::cout << getTargetWordsFromIDs(target_phrases[i].target_phrase, &vocabids) << "\t";
+ string line;
+ while (getline(strm, line)) {
+ vector<string> toks = Tokenize(line, "\t ");
+ UTIL_THROW_IF2(toks.size() == 0, "Corrupt alignment file");
- //Print probabilities:
- for (int j = 0; j<target_phrases[i].prob.size(); j++) {
- std::cout << target_phrases[i].prob[j] << " ";
+ uint32_t alignInd = Scan<uint32_t>(toks[0]);
+ if (alignInd >= alignColl.size()) {
+ alignColl.resize(alignInd + 1);
}
- std::cout << "\t";
-
- //Print word_all1
- for (int j = 0; j<target_phrases[i].word_all1.size(); j++) {
- if (j%2 == 0) {
- std::cout << (short)target_phrases[i].word_all1[j] << "-";
- } else {
- std::cout << (short)target_phrases[i].word_all1[j] << " ";
- }
+
+ Alignments &aligns = alignColl[alignInd];
+ for (size_t i = 1; i < toks.size(); ++i) {
+ size_t pos = Scan<size_t>(toks[i]);
+ aligns.push_back(pos);
}
- std::cout << std::endl;
}
}
+
+}
+
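Note: QueryEngine::query() no longer decodes target phrases; it only reports whether a key is present and the stored value. A minimal sketch of the new lookup flow (path and sentence are made up; the value is an offset that ProbingPT resolves against the memory-mapped TargetColl.dat):

  Moses::QueryEngine engine("phrase-table.probing");       // hypothetical path
  std::vector<uint64_t> ids = Moses::getVocabIDs(StringPiece("das haus"));
  uint64_t key = Moses::getKey(&ids[0], ids.size());
  std::pair<bool, uint64_t> hit = engine.query(key);
  if (hit.first) {
    // hit.second: offset of the target-phrase block in TargetColl.dat
  }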
diff --git a/moses/TranslationModel/ProbingPT/quering.hh b/moses/TranslationModel/ProbingPT/quering.hh
index e574d1f8f..c43c7f3b9 100644
--- a/moses/TranslationModel/ProbingPT/quering.hh
+++ b/moses/TranslationModel/ProbingPT/quering.hh
@@ -1,45 +1,65 @@
#pragma once
-#include "probing_hash_utils.hh"
-#include "huffmanish.hh"
-#include "hash.hh" //Includes line splitter
+#include <boost/unordered_map.hpp>
#include <sys/stat.h> //For finding size of file
#include "vocabid.hh"
#include <algorithm> //toLower
-#define API_VERSION 3
-
+#include <deque>
+#include "probing_hash_utils.hh"
+#include "hash.hh" //Includes line splitter
+#include "line_splitter.hh"
+#include "moses//Util.h"
-char * read_binary_file(char * filename);
+namespace Moses
+{
class QueryEngine
{
- unsigned char * binary_mmaped; //The binari phrase table file
- std::map<unsigned int, std::string> vocabids;
std::map<uint64_t, std::string> source_vocabids;
+ typedef std::vector<unsigned char> Alignments;
+ std::vector<Alignments> alignColl;
+
Table table;
char *mem; //Memory for the table, necessary so that we can correctly destroy the object
- HuffmanDecoder decoder;
-
- size_t binary_filesize;
size_t table_filesize;
- int num_scores;
bool is_reordering;
+
+ void read_alignments(const std::string &alignPath);
+
public:
- QueryEngine (const char *);
+ int num_scores;
+ int num_lex_scores;
+ bool logProb;
+
+ QueryEngine(const char *);
~QueryEngine();
- std::pair<bool, std::vector<target_text> > query(StringPiece source_phrase);
- std::pair<bool, std::vector<target_text> > query(std::vector<uint64_t> source_phrase);
- void printTargetInfo(std::vector<target_text> target_phrases);
- const std::map<unsigned int, std::string> getVocab() const {
- return decoder.get_target_lookup_map();
- }
- const std::map<uint64_t, std::string> getSourceVocab() const {
- return source_vocabids;
+ std::pair<bool, uint64_t> query(uint64_t key);
+
+ const std::map<uint64_t, std::string> &getSourceVocab() const
+ { return source_vocabids; }
+
+ const std::vector<Alignments> &getAlignments() const
+ { return alignColl; }
+
+ uint64_t getKey(uint64_t source_phrase[], size_t size) const;
+
+ template<typename T>
+ inline bool Get(const boost::unordered_map<std::string, std::string> &keyValue, const std::string &sought, T &found) const
+ {
+ boost::unordered_map<std::string, std::string>::const_iterator iter = keyValue.find(sought);
+ if (iter == keyValue.end()) {
+ return false;
+ }
+
+ const std::string &foundStr = iter->second;
+ found = Scan<T>(foundStr);
+ return true;
}
};
+}
diff --git a/moses/TranslationModel/ProbingPT/storing.cpp b/moses/TranslationModel/ProbingPT/storing.cpp
index 01128c1e4..98dcfd5dc 100644
--- a/moses/TranslationModel/ProbingPT/storing.cpp
+++ b/moses/TranslationModel/ProbingPT/storing.cpp
@@ -1,161 +1,303 @@
+#include <sys/stat.h>
+#include <boost/foreach.hpp>
+#include "line_splitter.hh"
#include "storing.hh"
+#include "StoreTarget.h"
+#include "StoreVocab.h"
+#include "moses/Util.h"
+#include "moses/InputFileStream.h"
-BinaryFileWriter::BinaryFileWriter (std::string basepath) : os ((basepath + "/binfile.dat").c_str(), std::ios::binary)
+using namespace std;
+
+namespace Moses
{
- binfile.reserve(10000); //Reserve part of the vector to avoid realocation
- it = binfile.begin();
- dist_from_start = 0; //Initialize variables
- extra_counter = 0;
-}
-void BinaryFileWriter::write (std::vector<unsigned char> * bytes)
+///////////////////////////////////////////////////////////////////////
+void Node::Add(Table &table, const SourcePhrase &sourcePhrase, size_t pos)
{
- binfile.insert(it, bytes->begin(), bytes->end()); //Insert the bytes
- //Keep track of the offsets
- it += bytes->size();
- dist_from_start = distance(binfile.begin(),it);
- //Flush the vector to disk every once in a while so that we don't consume too much ram
- if (dist_from_start > 9000) {
- flush();
+ if (pos < sourcePhrase.size()) {
+ uint64_t vocabId = sourcePhrase[pos];
+
+ Node *child;
+ Children::iterator iter = m_children.find(vocabId);
+ if (iter == m_children.end()) {
+ // New node. Write other children then discard them
+ BOOST_FOREACH(Children::value_type &valPair, m_children) {
+ Node &otherChild = valPair.second;
+ otherChild.Write(table);
+ }
+ m_children.clear();
+
+ // create new node
+ child = &m_children[vocabId];
+ assert(!child->done);
+ child->key = key + (vocabId << pos);
+ }
+ else {
+ child = &iter->second;
+ }
+
+ child->Add(table, sourcePhrase, pos + 1);
+ }
+ else {
+ // this node was written previously because it has rules
+ done = true;
}
}
-void BinaryFileWriter::flush ()
+void Node::Write(Table &table)
{
- //Cast unsigned char to char before writing...
- os.write((char *)&binfile[0], dist_from_start);
- //Clear the vector:
- binfile.clear();
- binfile.reserve(10000);
- extra_counter += dist_from_start; //Keep track of the total number of bytes.
- it = binfile.begin(); //Reset iterator
- dist_from_start = distance(binfile.begin(),it); //Reset dist from start
-}
+ //cerr << "START write " << done << " " << key << endl;
+ BOOST_FOREACH(Children::value_type &valPair, m_children) {
+ Node &child = valPair.second;
+ child.Write(table);
+ }
-BinaryFileWriter::~BinaryFileWriter ()
-{
- os.close();
- binfile.clear();
+ if (!done) {
+ // save
+ Entry sourceEntry;
+ sourceEntry.value = NONE;
+ sourceEntry.key = key;
+
+ //Put into table
+ table.Insert(sourceEntry);
+ }
}
-void createProbingPT(const char * phrasetable_path, const char * target_path,
- const char * num_scores, const char * is_reordering)
+///////////////////////////////////////////////////////////////////////
+void createProbingPT(const std::string &phrasetable_path,
+ const std::string &basepath, int num_scores, int num_lex_scores,
+ bool log_prob, int max_cache_size, bool scfg)
{
+ std::cerr << "Starting..." << std::endl;
+
//Get basepath and create directory if missing
- std::string basepath(target_path);
mkdir(basepath.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
- //Set up huffman and serialize decoder maps.
- Huffman huffmanEncoder(phrasetable_path); //initialize
- huffmanEncoder.assign_values();
- huffmanEncoder.produce_lookups();
- huffmanEncoder.serialize_maps(target_path);
+ StoreTarget storeTarget(basepath);
//Get uniq lines:
- unsigned long uniq_entries = huffmanEncoder.getUniqLines();
+ unsigned long uniq_entries = countUniqueSource(phrasetable_path);
//Source phrase vocabids
- std::map<uint64_t, std::string> source_vocabids;
+ StoreVocab<uint64_t> sourceVocab(basepath + "/source_vocabids");
//Read the file
- util::FilePiece filein(phrasetable_path);
+ util::FilePiece filein(phrasetable_path.c_str());
//Init the probing hash table
size_t size = Table::Size(uniq_entries, 1.2);
char * mem = new char[size];
memset(mem, 0, size);
- Table table(mem, size);
+ Table sourceEntries(mem, size);
- BinaryFileWriter binfile(basepath); //Init the binary file writer.
-
- line_text prev_line; //Check if the source phrase of the previous line is the same
+ std::priority_queue<CacheItem*, std::vector<CacheItem*>, CacheItemOrderer> cache;
+ float totalSourceCount = 0;
//Keep track of how many lines have been processed
- uint64_t entrystartidx = 0;
- //uint64_t line_num = 0;
-
+ size_t line_num = 0;
//Read everything and process
- while(true) {
+ std::string prevSource;
+
+ Node sourcePhrases;
+ sourcePhrases.done = true;
+ sourcePhrases.key = 0;
+
+ while (true) {
try {
//Process line read
line_text line;
- line = splitLine(filein.ReadLine());
- //Add source phrases to vocabularyIDs
- add_to_map(&source_vocabids, line.source_phrase);
+ line = splitLine(filein.ReadLine(), scfg);
+ //cerr << "line=" << line.source_phrase << endl;
- if ((binfile.dist_from_start + binfile.extra_counter) == 0) {
- prev_line = line; //For the first iteration assume the previous line is
- } //The same as this one.
+ ++line_num;
+ if (line_num % 1000000 == 0) {
+ std::cerr << line_num << " " << std::flush;
+ }
- if (line.source_phrase != prev_line.source_phrase) {
+ //Add source phrases to vocabulary IDs
+ add_to_map(sourceVocab, line.source_phrase);
+
+ if (prevSource.empty()) {
+ // 1st line
+ prevSource = line.source_phrase.as_string();
+ storeTarget.Append(line, log_prob, scfg);
+ }
+ else if (prevSource == line.source_phrase) {
+ //If we still have the same line, just append to it:
+ storeTarget.Append(line, log_prob, scfg);
+ }
+ else {
+ assert(prevSource != line.source_phrase);
//The source phrase has changed, so finish the previous entry
+ // save
+ uint64_t targetInd = storeTarget.Save();
+
+ // next line
+ storeTarget.Append(line, log_prob, scfg);
+
//Create an entry for the previous source phrase:
- Entry pesho;
- pesho.value = entrystartidx;
+ Entry sourceEntry;
+ sourceEntry.value = targetInd;
//The key is the sum of hashes of individual words, bit-shifted by their position in the phrase.
//Probably not entirely correct, but fast and seems to work fine in practice.
- pesho.key = 0;
- std::vector<uint64_t> vocabid_source = getVocabIDs(prev_line.source_phrase);
- for (int i = 0; i < vocabid_source.size(); i++) {
- pesho.key += (vocabid_source[i] << i);
+ std::vector<uint64_t> vocabid_source = getVocabIDs(prevSource);
+ if (scfg) {
+ // storing prefixes?
+ sourcePhrases.Add(sourceEntries, vocabid_source);
}
- pesho.bytes_toread = binfile.dist_from_start + binfile.extra_counter - entrystartidx;
+ sourceEntry.key = getKey(vocabid_source);
+ /*
+ cerr << "prevSource=" << prevSource << flush
+ << " vocabids=" << Debug(vocabid_source) << flush
+ << " key=" << sourceEntry.key << endl;
+ */
//Put into table
- table.Insert(pesho);
+ sourceEntries.Insert(sourceEntry);
- entrystartidx = binfile.dist_from_start + binfile.extra_counter; //Designate start idx for new entry
+ // update cache - CURRENT source phrase, not prev
+ if (max_cache_size) {
+ std::string countStr = line.counts.as_string();
+ countStr = Trim(countStr);
+ if (!countStr.empty()) {
+ std::vector<float> toks = Tokenize<float>(countStr);
+ //cerr << "CACHE:" << line.source_phrase << " " << countStr << " " << toks[1] << endl;
- //Encode a line and write it to disk.
- std::vector<unsigned char> encoded_line = huffmanEncoder.full_encode_line(line);
- binfile.write(&encoded_line);
+ if (toks.size() >= 2) {
+ totalSourceCount += toks[1];
- //Set prevLine
- prev_line = line;
+ // compute key for CURRENT source
+ std::vector<uint64_t> currVocabidSource = getVocabIDs(line.source_phrase.as_string());
+ uint64_t currKey = getKey(currVocabidSource);
- } else {
- //If we still have the same line, just append to it:
- std::vector<unsigned char> encoded_line = huffmanEncoder.full_encode_line(line);
- binfile.write(&encoded_line);
+ CacheItem *item = new CacheItem(
+ Trim(line.source_phrase.as_string()),
+ currKey,
+ toks[1]);
+ cache.push(item);
+
+ if (max_cache_size > 0 && cache.size() > (size_t) max_cache_size) {
+ cache.pop();
+ }
+ }
+ }
+ }
+
+ //Set prevLine
+ prevSource = line.source_phrase.as_string();
}
- } catch (util::EndOfFileException e) {
- std::cerr << "Reading phrase table finished, writing remaining files to disk." << std::endl;
- binfile.flush();
+ }
+ catch (const util::EndOfFileException &e) {
+ std::cerr
+ << "Reading phrase table finished, writing remaining files to disk."
+ << std::endl;
//After the final entry is constructed we need to add it to the phrase_table
//Create an entry for the previous source phrase:
- Entry pesho;
- pesho.value = entrystartidx;
+ uint64_t targetInd = storeTarget.Save();
+
+ Entry sourceEntry;
+ sourceEntry.value = targetInd;
+
//The key is the sum of hashes of individual words. Probably not entirely correct, but fast
- pesho.key = 0;
- std::vector<uint64_t> vocabid_source = getVocabIDs(prev_line.source_phrase);
- for (int i = 0; i < vocabid_source.size(); i++) {
- pesho.key += (vocabid_source[i] << i);
- }
- pesho.bytes_toread = binfile.dist_from_start + binfile.extra_counter - entrystartidx;
+ std::vector<uint64_t> vocabid_source = getVocabIDs(prevSource);
+ sourceEntry.key = getKey(vocabid_source);
+
//Put into table
- table.Insert(pesho);
+ sourceEntries.Insert(sourceEntry);
break;
}
}
- serialize_table(mem, size, (basepath + "/probing_hash.dat").c_str());
+ sourcePhrases.Write(sourceEntries);
+
+ storeTarget.SaveAlignment();
- serialize_map(&source_vocabids, (basepath + "/source_vocabids").c_str());
+ serialize_table(mem, size, (basepath + "/probing_hash.dat"));
+
+ sourceVocab.Save();
+
+ serialize_cache(cache, (basepath + "/cache"), totalSourceCount);
delete[] mem;
//Write configfile
std::ofstream configfile;
configfile.open((basepath + "/config").c_str());
- configfile << API_VERSION << '\n';
- configfile << uniq_entries << '\n';
- configfile << num_scores << '\n';
- configfile << is_reordering << '\n';
+ configfile << "API_VERSION\t" << API_VERSION << '\n';
+ configfile << "uniq_entries\t" << uniq_entries << '\n';
+ configfile << "num_scores\t" << num_scores << '\n';
+ configfile << "num_lex_scores\t" << num_lex_scores << '\n';
+ configfile << "log_prob\t" << log_prob << '\n';
configfile.close();
}
+
+size_t countUniqueSource(const std::string &path)
+{
+ size_t ret = 0;
+ InputFileStream strme(path);
+
+ std::string line, prevSource;
+ while (std::getline(strme, line)) {
+ std::vector<std::string> toks = TokenizeMultiCharSeparator(line, "|||");
+ assert(toks.size() != 0);
+
+ if (prevSource != toks[0]) {
+ prevSource = toks[0];
+ ++ret;
+ }
+ }
+
+ return ret;
+}
+
+void serialize_cache(
+ std::priority_queue<CacheItem*, std::vector<CacheItem*>, CacheItemOrderer> &cache,
+ const std::string &path, float totalSourceCount)
+{
+ std::vector<const CacheItem*> vec(cache.size());
+
+ size_t ind = cache.size() - 1;
+ while (!cache.empty()) {
+ const CacheItem *item = cache.top();
+ vec[ind] = item;
+ cache.pop();
+ --ind;
+ }
+
+ std::ofstream os(path.c_str());
+
+ os << totalSourceCount << std::endl;
+ for (size_t i = 0; i < vec.size(); ++i) {
+ const CacheItem *item = vec[i];
+ os << item->count << "\t" << item->sourceKey << "\t" << item->source << std::endl;
+ delete item;
+ }
+
+ os.close();
+}
+
+uint64_t getKey(const std::vector<uint64_t> &vocabid_source)
+{
+ return getKey(vocabid_source.data(), vocabid_source.size());
+}
+
+std::vector<uint64_t> CreatePrefix(const std::vector<uint64_t> &vocabid_source, size_t endPos)
+{
+ assert(endPos < vocabid_source.size());
+
+ std::vector<uint64_t> ret(endPos + 1);
+ for (size_t i = 0; i <= endPos; ++i) {
+ ret[i] = vocabid_source[i];
+ }
+ return ret;
+}
+
+}
+
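The key scheme used throughout storing.cpp above deserves a note: as its comments concede, it is a fast heuristic rather than a perfect hash. A worked sketch of what getKey() computes (illustrative only; the real per-word ids come from getHash()/getVocabIDs() in hash.hh):

#include <cstdint>
#include <vector>

uint64_t exampleKey(const std::vector<uint64_t> &vocabIds)
{
  uint64_t key = 0;
  for (size_t i = 0; i < vocabIds.size(); ++i) {
    key += vocabIds[i] << i; // shift by position, so "a b" != "b a"
  }
  return key;
}

Node::Add() builds the same sum incrementally (key + (vocabId << pos)), which is what lets it register an entry for every prefix of an SCFG source phrase.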
diff --git a/moses/TranslationModel/ProbingPT/storing.hh b/moses/TranslationModel/ProbingPT/storing.hh
index 8554d614f..957c73491 100644
--- a/moses/TranslationModel/ProbingPT/storing.hh
+++ b/moses/TranslationModel/ProbingPT/storing.hh
@@ -1,36 +1,95 @@
#pragma once
+#include <boost/unordered_set.hpp>
+#include <boost/unordered_map.hpp>
#include <cstdio>
+#include <sstream>
#include <fstream>
#include <iostream>
+#include <string>
+#include <queue>
+#include <sys/stat.h> //mkdir
#include "hash.hh" //Includes line_splitter
#include "probing_hash_utils.hh"
-#include "huffmanish.hh"
-#include <sys/stat.h> //mkdir
#include "util/file_piece.hh"
#include "util/file.hh"
#include "vocabid.hh"
-#define API_VERSION 3
-void createProbingPT(const char * phrasetable_path, const char * target_path,
- const char * num_scores, const char * is_reordering);
+namespace Moses
+{
+typedef std::vector<uint64_t> SourcePhrase;
+
+
+class Node
+{
+ typedef boost::unordered_map<uint64_t, Node> Children;
+ Children m_children;
+
+public:
+ uint64_t key;
+ bool done;
+
+ Node()
+ :done(false)
+ {}
+
+ void Add(Table &table, const SourcePhrase &sourcePhrase, size_t pos = 0);
+ void Write(Table &table);
+};
+
+
+void createProbingPT(const std::string &phrasetable_path,
+ const std::string &basepath, int num_scores, int num_lex_scores,
+ bool log_prob, int max_cache_size, bool scfg);
+uint64_t getKey(const std::vector<uint64_t> &source_phrase);
+
+std::vector<uint64_t> CreatePrefix(const std::vector<uint64_t> &vocabid_source, size_t endPos);
-class BinaryFileWriter
+template<typename T>
+std::string Debug(const std::vector<T> &vec)
{
- std::vector<unsigned char> binfile;
- std::vector<unsigned char>::iterator it;
- //Output binary
- std::ofstream os;
+ std::stringstream strm;
+ for (size_t i = 0; i < vec.size(); ++i) {
+ strm << vec[i] << " ";
+ }
+ return strm.str();
+}
+size_t countUniqueSource(const std::string &path);
+
+class CacheItem
+{
public:
- unsigned int dist_from_start; //Distance from the start of the vector.
- uint64_t extra_counter; //After we reset the counter, we still want to keep track of the correct offset, so
+ std::string source;
+ uint64_t sourceKey;
+ float count;
+ CacheItem(const std::string &vSource, uint64_t vSourceKey, float vCount)
+ :source(vSource)
+ ,sourceKey(vSourceKey)
+ ,count(vCount)
+ {
+ }
- BinaryFileWriter (std::string);
- ~BinaryFileWriter ();
- void write (std::vector<unsigned char> * bytes);
- void flush (); //Flush to disk
+ bool operator<(const CacheItem &other) const
+ {
+ return count > other.count;
+ }
+};
+class CacheItemOrderer
+{
+public:
+ bool operator()(const CacheItem* a, const CacheItem* b) const
+ {
+ return (*a) < (*b);
+ }
};
+
+void serialize_cache(
+ std::priority_queue<CacheItem*, std::vector<CacheItem*>, CacheItemOrderer> &cache,
+ const std::string &path, float totalSourceCount);
+
+}
+
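The cache declared above is worth a closer look: CacheItem::operator< deliberately inverts the comparison (count > other.count), which turns std::priority_queue into a min-heap on count, so pop() always evicts the least frequent source phrase once the cache exceeds max_cache_size. A self-contained sketch of that eviction policy (names illustrative):

#include <iostream>
#include <queue>
#include <string>

struct Item {
  std::string source;
  float count;
  bool operator<(const Item &other) const {
    return count > other.count; // inverted: lowest count is "greatest"
  }
};

int main()
{
  const size_t maxCacheSize = 2; // stand-in for max_cache_size
  std::priority_queue<Item> cache;

  const char *sources[] = { "a", "b", "c" };
  const float counts[] = { 5, 1, 3 };
  for (int i = 0; i < 3; ++i) {
    Item item = { sources[i], counts[i] };
    cache.push(item);
    if (cache.size() > maxCacheSize) {
      cache.pop(); // evicts "b" (count 1), the least frequent
    }
  }
  while (!cache.empty()) { // prints "c 3" then "a 5"
    std::cout << cache.top().source << " " << cache.top().count << "\n";
    cache.pop();
  }
  return 0;
}

serialize_cache() then drains the heap back-to-front into a vector, so the cache file lists phrases in descending count order after the totalSourceCount header line.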
diff --git a/moses/TranslationModel/ProbingPT/vocabid.cpp b/moses/TranslationModel/ProbingPT/vocabid.cpp
index 1452f299d..3d6efe841 100644
--- a/moses/TranslationModel/ProbingPT/vocabid.cpp
+++ b/moses/TranslationModel/ProbingPT/vocabid.cpp
@@ -1,32 +1,59 @@
+#include <boost/foreach.hpp>
#include "vocabid.hh"
+#include "StoreVocab.h"
+#include "moses/Util.h"
-void add_to_map(std::map<uint64_t, std::string> *karta, StringPiece textin)
+namespace Moses
+{
+
+void add_to_map(StoreVocab<uint64_t> &sourceVocab,
+ const StringPiece &textin)
{
//Tokenize
- util::TokenIter<util::SingleCharacter> it(textin, util::SingleCharacter(' '));
+ util::TokenIter<util::SingleCharacter> itWord(textin, util::SingleCharacter(' '));
+
+ while (itWord) {
+ StringPiece word = *itWord;
- while(it) {
- karta->insert(std::pair<uint64_t, std::string>(getHash(*it), it->as_string()));
- it++;
+ util::TokenIter<util::SingleCharacter> itFactor(word, util::SingleCharacter('|'));
+ while (itFactor) {
+ StringPiece factor = *itFactor;
+
+ sourceVocab.Insert(getHash(factor), factor.as_string());
+ itFactor++;
+ }
+ itWord++;
}
}
-void serialize_map(std::map<uint64_t, std::string> *karta, const char* filename)
+void serialize_map(const std::map<uint64_t, std::string> &karta,
+ const std::string &filename)
{
- std::ofstream os (filename, std::ios::binary);
- boost::archive::text_oarchive oarch(os);
+ std::ofstream os(filename.c_str());
+
+ std::map<uint64_t, std::string>::const_iterator iter;
+ for (iter = karta.begin(); iter != karta.end(); ++iter) {
+ os << iter->first << '\t' << iter->second << std::endl;
+ }
- oarch << *karta; //Serialise map
os.close();
}
-void read_map(std::map<uint64_t, std::string> *karta, const char* filename)
+void read_map(std::map<uint64_t, std::string> &karta, const char* filename)
{
- std::ifstream is (filename, std::ios::binary);
- boost::archive::text_iarchive iarch(is);
+ std::ifstream is(filename);
- iarch >> *karta;
+ std::string line;
+ while (getline(is, line)) {
+ std::vector<std::string> toks = Tokenize(line, "\t");
+ assert(toks.size() == 2);
+ // each line is "<id>\t<string>", matching what serialize_map writes
+ uint64_t ind = Scan<uint64_t>(toks[0]);
+ karta[ind] = toks[1];
+ }
//Close the stream after we are done.
is.close();
}
+
+}
+
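The change above replaces the boost serialization archives with a plain text format: one "<id>\t<surface>" pair per line, which makes the vocab files inspectable with standard shell tools. A standalone sketch of the same round trip (illustrative; the patch itself routes writing through StoreVocab):

#include <fstream>
#include <map>
#include <sstream>
#include <string>
#include <stdint.h>

void writeVocab(const std::map<uint64_t, std::string> &vocab,
                const std::string &path)
{
  std::ofstream os(path.c_str());
  std::map<uint64_t, std::string>::const_iterator it;
  for (it = vocab.begin(); it != vocab.end(); ++it) {
    os << it->first << '\t' << it->second << '\n';
  }
}

void readVocab(std::map<uint64_t, std::string> &vocab,
               const std::string &path)
{
  std::ifstream is(path.c_str());
  std::string line;
  while (std::getline(is, line)) {
    std::string::size_type tab = line.find('\t');
    if (tab == std::string::npos) continue;
    std::istringstream idStrm(line.substr(0, tab));
    uint64_t id = 0;
    idStrm >> id; // first column holds the 64-bit hash id
    vocab[id] = line.substr(tab + 1);
  }
}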
diff --git a/moses/TranslationModel/ProbingPT/vocabid.hh b/moses/TranslationModel/ProbingPT/vocabid.hh
index 491c53439..f9c9b2dff 100644
--- a/moses/TranslationModel/ProbingPT/vocabid.hh
+++ b/moses/TranslationModel/ProbingPT/vocabid.hh
@@ -13,8 +13,17 @@
#include "util/string_piece.hh" //Tokenization and work with StringPiece
#include "util/tokenize_piece.hh"
-void add_to_map(std::map<uint64_t, std::string> *karta, StringPiece textin);
+namespace Moses
+{
+template<typename VOCABID>
+class StoreVocab;
-void serialize_map(std::map<uint64_t, std::string> *karta, const char* filename);
+void add_to_map(StoreVocab<uint64_t> &sourceVocab,
+ const StringPiece &textin);
-void read_map(std::map<uint64_t, std::string> *karta, const char* filename);
+void serialize_map(const std::map<uint64_t, std::string> &karta,
+ const std::string &filename);
+
+void read_map(std::map<uint64_t, std::string> &karta, const char* filename);
+
+}
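
Finally, the config file written at the end of createProbingPT() is now self-describing ("num_scores\t<n>" rather than bare positional values), and it pairs naturally with QueryEngine::Get<T> in quering.hh once the lines are loaded into a key/value map. A hypothetical loader sketch (loadConfig is not part of this patch):

#include <boost/unordered_map.hpp>
#include <fstream>
#include <string>

boost::unordered_map<std::string, std::string> loadConfig(const std::string &path)
{
  boost::unordered_map<std::string, std::string> keyValue;
  std::ifstream is(path.c_str());
  std::string line;
  while (std::getline(is, line)) {
    // split each "key\tvalue" line on the first tab
    std::string::size_type tab = line.find('\t');
    if (tab != std::string::npos) {
      keyValue[line.substr(0, tab)] = line.substr(tab + 1);
    }
  }
  return keyValue;
}

With that map in hand, a caller could read typed values via, e.g., engine.Get(keyValue, "num_scores", numScores), with Get returning false for keys that are absent.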