Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'moses/TranslationModel/ProbingPT/huffmanish.cpp')
-rw-r--r--moses/TranslationModel/ProbingPT/huffmanish.cpp451
1 files changed, 0 insertions, 451 deletions
diff --git a/moses/TranslationModel/ProbingPT/huffmanish.cpp b/moses/TranslationModel/ProbingPT/huffmanish.cpp
deleted file mode 100644
index 534fd04d1..000000000
--- a/moses/TranslationModel/ProbingPT/huffmanish.cpp
+++ /dev/null
@@ -1,451 +0,0 @@
-#include "huffmanish.hh"
-
-#include <cstring>
-#include <stdexcept>
-
-//Reads the phrase table at filepath line by line and feeds every line to
-//count_elements(). uniq_lines ends up holding the number of times the source
-//phrase changed from one line to the next; the total is printed to stderr.
-Huffman::Huffman (const char * filepath)
-{
-  util::FilePiece filein(filepath);
-
-  uniq_lines = 0;
-
-  //Last line whose source phrase differed from the one before it.
-  //NOTE(review): if line_text only holds views into the FilePiece buffer, this
-  //copy may dangle after the next ReadLine() - confirm against line_text.
-  line_text prev_line;
-
-  while (true) {
-    line_text new_line;
-
-    try {
-      new_line = splitLine(filein.ReadLine());
-      count_elements(new_line); //Counted for every line, repeated source phrases included.
-    } catch (const util::EndOfFileException &) {
-      //End of input. Caught by const reference so the exception object is not copied.
-      std::cerr << "Unique entries counted: ";
-      break;
-    }
-
-    if (!(new_line.source_phrase == prev_line.source_phrase)) {
-      uniq_lines++;
-      prev_line = new_line;
-    }
-  }
-
-  std::cerr << uniq_lines << std::endl;
-}
-
-//Adds one phrase table line to the frequency tables: every space separated
-//token of linein.target_phrase is counted in target_phrase_words and the
-//parsed word alignment is counted in word_all1.
-void Huffman::count_elements(line_text linein)
-{
-  //operator[] value-initialises a missing counter to zero, so one lookup
-  //covers both the "new entry" and the "seen before" case.
-  for (util::TokenIter<util::SingleCharacter> it(linein.target_phrase, util::SingleCharacter(' ')); it; ++it) {
-    ++target_phrase_words[it->as_string()];
-  }
-
-  //For word alignment 1
-  ++word_all1[splitWordAll1(linein.word_align)];
-}
-
-//Assigns a code to every unique element. Each count map is copied into its
-//*_counts vector and sorted, then the entries are numbered in sorted order.
-//The count maps and vectors are emptied once the code maps are filled.
-void Huffman::assign_values()
-{
-  typedef std::vector<std::pair<std::string, unsigned int> > WordCounts;
-  typedef std::vector<std::pair<std::vector<unsigned char>, unsigned int> > AlignCounts;
-
-  //Vectors are needed because the maps themselves cannot be sorted by count.
-  target_phrase_words_counts.insert(target_phrase_words_counts.end(),
-                                    target_phrase_words.begin(), target_phrase_words.end());
-  std::sort(target_phrase_words_counts.begin(), target_phrase_words_counts.end(), sort_pair());
-
-  word_all1_counts.insert(word_all1_counts.end(), word_all1.begin(), word_all1.end());
-  std::sort(word_all1_counts.begin(), word_all1_counts.end(), sort_pair_vec());
-
-  //Codes start at 1 because 0 is reserved for the delimiter.
-  unsigned int word_code = 1;
-  for (WordCounts::const_iterator entry = target_phrase_words_counts.begin();
-       entry != target_phrase_words_counts.end(); ++entry, ++word_code) {
-    target_phrase_huffman.insert(std::make_pair(entry->first, word_code));
-  }
-
-  //The word alignments get their own numbering, again starting at 1.
-  unsigned int align_code = 1;
-  for (AlignCounts::const_iterator entry = word_all1_counts.begin();
-       entry != word_all1_counts.end(); ++entry, ++align_code) {
-    word_all1_huffman.insert(std::make_pair(entry->first, align_code));
-  }
-
-  //The counts are not needed once the codes exist, so release their memory.
-  target_phrase_words.clear();
-  word_all1.clear();
-  target_phrase_words_counts.clear();
-  word_all1_counts.clear();
-
-  std::cerr << "Finished generating huffman codes." << std::endl;
-}
-
-//Writes lookup_target_phrase to <dirname>/target_phrases and lookup_word_all1
-//to <dirname>/Wall1 as Boost text archives.
-//Note that the directory has to exist already; it is not created here.
-void Huffman::serialize_maps(const char * dirname)
-{
-  const std::string basedir(dirname);
-  const std::string target_phrase_path(basedir + "/target_phrases");
-  const std::string word_all1_path(basedir + "/Wall1");
-
-  //Each archive gets its own scope. Locals are destroyed in reverse order, so
-  //the archive is gone before its stream is closed and nothing is written to
-  //a stream that is already closed.
-
-  //Target phrase
-  {
-    std::ofstream os (target_phrase_path.c_str(), std::ios::binary);
-    boost::archive::text_oarchive oarch(os);
-    oarch << lookup_target_phrase;
-  }
-
-  //Word all1
-  {
-    std::ofstream os2 (word_all1_path.c_str(), std::ios::binary);
-    boost::archive::text_oarchive oarch2(os2);
-    oarch2 << lookup_word_all1;
-  }
-}
-
-//Encodes one line with encode_line() and packs the resulting numbers with
-//vbyte_encode_line().
-std::vector<unsigned char> Huffman::full_encode_line(line_text line)
-{
-  std::vector<unsigned int> numbers = encode_line(line);
-  return vbyte_encode_line(numbers);
-}
-
-//Converts one line into the number sequence
-//  <target word codes> 0 <scores as raw float bits> 0 <word alignment code> 0
-//Throws std::runtime_error when a target word or the word alignment has no
-//entry in the code maps.
-std::vector<unsigned int> Huffman::encode_line(line_text line)
-{
-  std::vector<unsigned int> retvector;
-
-  //Get target_phrase first.
-  util::TokenIter<util::SingleCharacter> it(line.target_phrase, util::SingleCharacter(' '));
-  while (it) {
-    std::map<std::string, unsigned int>::const_iterator word_code = target_phrase_huffman.find(it->as_string());
-    if (word_code == target_phrase_huffman.end()) {
-      throw std::runtime_error("Huffman::encode_line: target word has no huffman code");
-    }
-    retvector.push_back(word_code->second);
-    ++it;
-  }
-  //Add a zero;
-  retvector.push_back(0);
-
-  //Get probabilities. Reinterpreting the float bytes as unsigned int.
-  util::TokenIter<util::SingleCharacter> probit(line.prob, util::SingleCharacter(' '));
-  while (probit) {
-    //data() is not guaranteed to be null terminated, so atof() gets a copy of
-    //the token. Parsing as double first copes with values outside float range.
-    const std::string token = probit->as_string();
-    float num = static_cast<float>(atof(token.c_str()));
-    retvector.push_back(reinterpret_float(&num));
-    ++probit;
-  }
-  //Add a zero;
-  retvector.push_back(0);
-
-  //Get word alignments
-  std::map<std::vector<unsigned char>, unsigned int>::const_iterator align_code =
-    word_all1_huffman.find(splitWordAll1(line.word_align));
-  if (align_code == word_all1_huffman.end()) {
-    throw std::runtime_error("Huffman::encode_line: word alignment has no huffman code");
-  }
-  retvector.push_back(align_code->second);
-  retvector.push_back(0);
-
-  return retvector;
-}
-
-//Fills the decoding tables by inverting the code maps: every (element, code)
-//pair is stored as (code, element).
-void Huffman::produce_lookups()
-{
-  typedef std::map<std::string, unsigned int>::const_iterator WordIter;
-  typedef std::map<std::vector<unsigned char>, unsigned int>::const_iterator AlignIter;
-
-  const WordIter words_end = target_phrase_huffman.end();
-  for (WordIter entry = target_phrase_huffman.begin(); entry != words_end; ++entry) {
-    lookup_target_phrase.insert(std::make_pair(entry->second, entry->first));
-  }
-
-  const AlignIter aligns_end = word_all1_huffman.end();
-  for (AlignIter entry = word_all1_huffman.begin(); entry != aligns_end; ++entry) {
-    lookup_word_all1.insert(std::make_pair(entry->second, entry->first));
-  }
-}
-
-//Loads both lookup tables from Boost text archives: lookup_target_phrase from
-//<dirname>/target_phrases and lookup_word_all1 from <dirname>/Wall1.
-HuffmanDecoder::HuffmanDecoder (const char * dirname)
-{
-  const std::string root(dirname);
-  const std::string words_file = root + "/target_phrases";
-  const std::string alignments_file = root + "/Wall1";
-
-  //Target phrases
-  std::ifstream words_in(words_file.c_str(), std::ios::binary);
-  boost::archive::text_iarchive words_archive(words_in);
-  words_archive >> lookup_target_phrase;
-  words_in.close();
-
-  //Word alignment 1
-  std::ifstream alignments_in(alignments_file.c_str(), std::ios::binary);
-  boost::archive::text_iarchive alignments_archive(alignments_in);
-  alignments_archive >> lookup_word_all1;
-  alignments_in.close();
-}
-
-//Builds a decoder from lookup tables that are already in memory. Both tables
-//are copied, and both pointers are dereferenced without a null check.
-HuffmanDecoder::HuffmanDecoder (std::map<unsigned int, std::string> * lookup_target,
-                                std::map<unsigned int, std::vector<unsigned char> > * lookup_word1)
-{
-  const std::map<unsigned int, std::string> &words = *lookup_target;
-  const std::map<unsigned int, std::vector<unsigned char> > &alignments = *lookup_word1;
-
-  lookup_target_phrase = words;
-  lookup_word_all1 = alignments;
-}
-
-//Decodes every target phrase stored in a vbyte encoded line. The bytes are
-//unpacked with vbyte_decode_line() and the numbers are cut into entries of the
-//form
-//  <target word codes> 0 <num_scores score values> 0 <word alignment code> 0
-//Each complete entry is handed to decode_line(). Data after the last complete
-//entry, including an entry that stops inside its scores, is ignored.
-std::vector<target_text> HuffmanDecoder::full_decode_line (std::vector<unsigned char> lines, int num_scores)
-{
-  std::vector<target_text> retvector; //All target phrases
-  std::vector<unsigned int> decoded_lines = vbyte_decode_line(lines); //All decoded numbers
-  std::vector<unsigned int> current_target_phrase; //Numbers of the entry being collected
-
-  short zero_count = 0; //Delimiters seen in the current entry; the third one completes it.
-  std::vector<unsigned int>::iterator it = decoded_lines.begin();
-  while (it != decoded_lines.end()) {
-    if (zero_count == 1) {
-      //We are extracting scores. They are taken by count instead of by looking
-      //for the delimiter, because a score may itself be 0 and would otherwise
-      //mess up the state machine.
-      if (num_scores > 0 && (decoded_lines.end() - it) <= num_scores) {
-        //The scores and the number that must follow them do not fit in what
-        //is left: the entry is truncated, stop without reading past the end.
-        break;
-      }
-      for (int i = 0; i < num_scores; i++) {
-        current_target_phrase.push_back(*it);
-        ++it;
-      }
-    }
-
-    //Add to the current entry, number by number.
-    current_target_phrase.push_back(*it);
-    if (*it == 0) {
-      zero_count++;
-      if (zero_count == 3) {
-        //The entry is complete: decode it and start collecting the next one.
-        retvector.push_back(decode_line(current_target_phrase, num_scores));
-        current_target_phrase.clear();
-        zero_count = 0;
-      }
-    }
-    ++it; //Go to the next word/symbol
-  }
-
-  return retvector;
-}
-
-//Splits one entry of the form
-//  <target word codes> 0 <num_scores score values> 0 <word alignment code> 0
-//into a target_text: the word codes are stored as they are, the scores are
-//turned back into floats with reinterpret_uint() and the alignment code is
-//resolved through lookup_word_all1.
-//Throws std::runtime_error if the entry ends inside the scores or if it has no
-//alignment code that lookup_word_all1 knows.
-target_text HuffmanDecoder::decode_line (std::vector<unsigned int> input, int num_scores)
-{
-  typedef std::vector<unsigned int>::size_type index_t;
-
-  target_text ret;
-  std::vector<unsigned int> target_phrase;
-  std::vector<unsigned int> probs;
-  const index_t len = input.size();
-  index_t pos = 0;
-
-  //Target phrase: everything up to the first delimiter.
-  while (pos < len && input[pos] != 0) {
-    target_phrase.push_back(input[pos]);
-    ++pos;
-  }
-  ++pos; //Step over the delimiter.
-
-  //Scores: exactly num_scores values, taken by count. A score whose bits are
-  //all zero (0.0f) must not be mistaken for the delimiter, the first score
-  //included.
-  for (int i = 0; i < num_scores; ++i) {
-    if (pos >= len) {
-      throw std::runtime_error("HuffmanDecoder::decode_line: entry ends inside the scores");
-    }
-    probs.push_back(input[pos]);
-    ++pos;
-  }
-  ++pos; //Step over the delimiter.
-
-  //Word alignment code: the last number before the next delimiter.
-  bool have_wAll = false;
-  unsigned int wAll = 0;
-  while (pos < len && input[pos] != 0) {
-    wAll = input[pos];
-    have_wAll = true;
-    ++pos;
-  }
-
-  std::map<unsigned int, std::vector<unsigned char> >::const_iterator alignment = lookup_word_all1.find(wAll);
-  if (!have_wAll || alignment == lookup_word_all1.end()) {
-    throw std::runtime_error("HuffmanDecoder::decode_line: missing or unknown word alignment code");
-  }
-
-  ret.target_phrase = target_phrase;
-  ret.word_all1 = alignment->second;
-
-  //Decode probabilities
-  for (std::vector<unsigned int>::iterator it = probs.begin(); it != probs.end(); ++it) {
-    ret.prob.push_back(reinterpret_uint(&(*it)));
-  }
-
-  return ret;
-}
-
-//Returns the target word stored under id in lookup_target_phrase.
-//Throws std::out_of_range if the table has no entry for id.
-inline std::string HuffmanDecoder::getTargetWordFromID(unsigned int id)
-{
-  std::map<unsigned int, std::string>::const_iterator found = lookup_target_phrase.find(id);
-  if (found == lookup_target_phrase.end()) {
-    throw std::out_of_range("HuffmanDecoder::getTargetWordFromID: unknown target word id");
-  }
-  return found->second;
-}
-
-//Looks up every id with getTargetWordFromID() and joins the words into one
-//string. Every word is followed by a single space, the last one included.
-std::string HuffmanDecoder::getTargetWordsFromIDs(std::vector<unsigned int> ids)
-{
-  std::string joined;
-  for (std::vector<unsigned int>::size_type idx = 0; idx < ids.size(); ++idx) {
-    joined += getTargetWordFromID(ids[idx]);
-    joined += " ";
-  }
-
-  return joined;
-}
-
-//Returns the target word stored under id in *lookup_target_phrase.
-//Throws std::out_of_range if the table has no entry for id.
-inline std::string getTargetWordFromID(unsigned int id, std::map<unsigned int, std::string> * lookup_target_phrase)
-{
-  std::map<unsigned int, std::string>::const_iterator found = lookup_target_phrase->find(id);
-  if (found == lookup_target_phrase->end()) {
-    throw std::out_of_range("getTargetWordFromID: unknown target word id");
-  }
-  return found->second;
-}
-
-//Looks up every id in *lookup_target_phrase with getTargetWordFromID() and
-//joins the words into one string. Every word is followed by a single space,
-//the last one included.
-std::string getTargetWordsFromIDs(std::vector<unsigned int> ids, std::map<unsigned int, std::string> * lookup_target_phrase)
-{
-  std::string joined;
-  for (std::vector<unsigned int>::size_type idx = 0; idx < ids.size(); ++idx) {
-    joined += getTargetWordFromID(ids[idx], lookup_target_phrase);
-    joined += " ";
-  }
-
-  return joined;
-}
-
-/*These functions are used to store the floats more easily in the binary phrase table.
-  We reinterpret the bytes of the float as an unsigned int so that it looks the same as
-  our other values and we can apply variable byte encoding on top of it.*/
-
-//Returns the bytes of *num read as an unsigned int. The value is not
-//converted, only its representation is copied.
-inline unsigned int reinterpret_float(float * num)
-{
-  //Compile time check (C++98 style): both types have to be the same size.
-  typedef char float_and_uint_sizes_match[(sizeof(float) == sizeof(unsigned int)) ? 1 : -1];
-  (void) sizeof(float_and_uint_sizes_match);
-
-  //memcpy instead of a pointer cast: reading a float through an unsigned int
-  //pointer breaks the aliasing rules and is undefined behaviour.
-  unsigned int converted_num = 0;
-  std::memcpy(&converted_num, num, sizeof(converted_num));
-  return converted_num;
-}
-
-//Returns the bytes of *num read as a float. The value is not converted, only
-//its representation is copied.
-inline float reinterpret_uint(unsigned int * num)
-{
-  //Compile time check (C++98 style): both types have to be the same size.
-  typedef char float_and_uint_sizes_match[(sizeof(float) == sizeof(unsigned int)) ? 1 : -1];
-  (void) sizeof(float_and_uint_sizes_match);
-
-  //memcpy instead of a pointer cast: reading an unsigned int through a float
-  //pointer breaks the aliasing rules and is undefined behaviour.
-  float converted_num = 0;
-  std::memcpy(&converted_num, num, sizeof(converted_num));
-  return converted_num;
-}
-
-/*Based on http://stackoverflow.com/questions/5858646/optimizing-variable-length-encoding
-and modified in order to return a vector of chars. Implements ULEB128, also known as
-variable byte encoding: the number is written in groups of 7 bits, lowest group first,
-and every byte except the last has its top bit set.*/
-inline std::vector<unsigned char> vbyte_encode(unsigned int num)
-{
-  std::vector<unsigned char> byte_vector;
-  byte_vector.reserve(5); //Five groups of 7 bits are enough for 32 bits.
-
-  //Emit continuation bytes while more than 7 bits are left.
-  while (num >= 0x80U) {
-    byte_vector.push_back(static_cast<unsigned char>((num & 0x7fU) | 0x80U));
-    num >>= 7;
-  }
-  //The last byte carries the remaining bits and no continuation flag.
-  byte_vector.push_back(static_cast<unsigned char>(num));
-
-  return byte_vector;
-}
-
-//Decodes a sequence of variable byte encoded numbers. A byte with the top bit
-//set continues the current number, a byte without it finishes the number.
-//Bytes at the end that never reach a finishing byte are dropped. Bits that do
-//not fit into an unsigned int are discarded.
-std::vector<unsigned int> vbyte_decode_line(std::vector<unsigned char> line)
-{
-  std::vector<unsigned int> huffman_line;
-  const unsigned int value_bits = sizeof(unsigned int) * 8;
-
-  //The number is accumulated in place, so no temporary vector is built and
-  //copied for every number.
-  unsigned int value = 0; //Number decoded so far
-  unsigned int shift = 0; //Bit position of the next 7 bit group
-
-  for (std::vector<unsigned char>::const_iterator it = line.begin(); it != line.end(); ++it) {
-    if (shift < value_bits) {
-      //Shift as unsigned: the fifth group would overflow a signed int.
-      value |= static_cast<unsigned int>(*it & 0x7f) << shift;
-    }
-    shift += 7;
-
-    if ((*it & 0x80) == 0) {
-      //No continuation flag: the number is complete.
-      huffman_line.push_back(value);
-      value = 0;
-      shift = 0;
-    }
-  }
-  return huffman_line;
-}
-
-//Decodes a single variable byte encoded number. The low 7 bits of every byte
-//are one group, lowest group first; the continuation flags are ignored.
-//An empty vector gives 0. Bits that do not fit into an unsigned int are
-//discarded.
-inline unsigned int bytes_to_int(std::vector<unsigned char> number)
-{
-  const unsigned int value_bits = sizeof(unsigned int) * 8;
-  unsigned int retvalue = 0;
-  unsigned int shift = 0; //By how many bits to shift
-
-  for (std::vector<unsigned char>::const_iterator it = number.begin(); it != number.end(); ++it) {
-    if (shift >= value_bits) {
-      break; //Shifting by the full width or more is undefined, and nothing would fit anyway.
-    }
-    //Shift as unsigned: the fifth group would overflow a signed int.
-    retvalue |= static_cast<unsigned int>(*it & 0x7f) << shift;
-    shift += 7;
-  }
-
-  return retvalue;
-}
-
-//Variable byte encodes every number of the line with vbyte_encode() and
-//concatenates the resulting bytes in the order of the input.
-std::vector<unsigned char> vbyte_encode_line(std::vector<unsigned int> line)
-{
-  std::vector<unsigned char> packed;
-
-  for (std::vector<unsigned int>::size_type idx = 0; idx < line.size(); ++idx) {
-    const std::vector<unsigned char> encoded = vbyte_encode(line[idx]);
-    packed.insert(packed.end(), encoded.begin(), encoded.end());
-  }
-
-  return packed;
-}