moses/TranslationModel/ProbingPT/huffmanish.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451

#include "huffmanish.hh"

Huffman::Huffman (const char * filepath)
{
  //Read the file
  util::FilePiece filein(filepath);

  //Init uniq_lines to zero;
  uniq_lines = 0;

  line_text prev_line; //Check for unique lines.
  int num_lines = 0 ;

  while (true) {
    line_text new_line;

    num_lines++;

    try {
      //Process line read
      new_line = splitLine(filein.ReadLine());
      count_elements(new_line); //Counts the number of elements, adds new and increments counters.

    } catch (util::EndOfFileException e) {
      std::cerr << "Unique entries counted: ";
      break;
    }

    if (new_line.source_phrase == prev_line.source_phrase) {
      continue;
    } else {
      uniq_lines++;
      prev_line = new_line;
    }
  }

  std::cerr << uniq_lines << std::endl;
}

void Huffman::count_elements(line_text linein)
{
  //For target phrase:
  util::TokenIter<util::SingleCharacter> it(linein.target_phrase, util::SingleCharacter(' '));
  while (it) {
    //Check if we have that entry
    std::map<std::string, unsigned int>::iterator mapiter;
    mapiter = target_phrase_words.find(it->as_string());

    if (mapiter != target_phrase_words.end()) {
      //If the element is found, increment the count.
      mapiter->second++;
    } else {
      //Else create a new entry;
      target_phrase_words.insert(std::pair<std::string, unsigned int>(it->as_string(), 1));
    }
    it++;
  }

  //For word allignment 1
  std::map<std::vector<unsigned char>, unsigned int>::iterator mapiter3;
  std::vector<unsigned char> numbers = splitWordAll1(linein.word_align);
  mapiter3 = word_all1.find(numbers);

  if (mapiter3 != word_all1.end()) {
    //If the element is found, increment the count.
    mapiter3->second++;
  } else {
    //Else create a new entry;
    word_all1.insert(std::pair<std::vector<unsigned char>, unsigned int>(numbers, 1));
  }

}

//Assigns huffman values for each unique element
void Huffman::assign_values()
{
  //First create vectors for all maps so that we could sort them later.

  //Create a vector for target phrases
  for(std::map<std::string, unsigned int>::iterator it = target_phrase_words.begin(); it != target_phrase_words.end(); it++ ) {
    target_phrase_words_counts.push_back(*it);
  }
  //Sort it
  std::sort(target_phrase_words_counts.begin(), target_phrase_words_counts.end(), sort_pair());

  //Create a vector for word allignments 1
  for(std::map<std::vector<unsigned char>, unsigned int>::iterator it = word_all1.begin(); it != word_all1.end(); it++ ) {
    word_all1_counts.push_back(*it);
  }
  //Sort it
  std::sort(word_all1_counts.begin(), word_all1_counts.end(), sort_pair_vec());


  //Afterwards we assign a value for each phrase, starting from 1, as zero is reserved for delimiter
  unsigned int i = 1; //huffman code
  for(std::vector<std::pair<std::string, unsigned int> >::iterator it = target_phrase_words_counts.begin();
      it != target_phrase_words_counts.end(); it++) {
    target_phrase_huffman.insert(std::pair<std::string, unsigned int>(it->first, i));
    i++; //Go to the next huffman code
  }

  i = 1; //Reset i for the next map
  for(std::vector<std::pair<std::vector<unsigned char>, unsigned int> >::iterator it = word_all1_counts.begin();
      it != word_all1_counts.end(); it++) {
    word_all1_huffman.insert(std::pair<std::vector<unsigned char>, unsigned int>(it->first, i));
    i++; //Go to the next huffman code
  }

  //After lookups are produced, clear some memory usage of objects not needed anymore.
  target_phrase_words.clear();
  word_all1.clear();

  target_phrase_words_counts.clear();
  word_all1_counts.clear();

  std::cerr << "Finished generating huffman codes." << std::endl;

}

void Huffman::serialize_maps(const char * dirname)
{
  //Note that directory name should exist.
  std::string basedir(dirname);
  std::string target_phrase_path(basedir + "/target_phrases");
  std::string probabilities_path(basedir + "/probs");
  std::string word_all1_path(basedir + "/Wall1");

  //Target phrase
  std::ofstream os (target_phrase_path.c_str(), std::ios::binary);
  boost::archive::text_oarchive oarch(os);
  oarch << lookup_target_phrase;
  os.close();

  //Word all1
  std::ofstream os2 (word_all1_path.c_str(), std::ios::binary);
  boost::archive::text_oarchive oarch2(os2);
  oarch2 << lookup_word_all1;
  os2.close();
}

std::vector<unsigned char> Huffman::full_encode_line(line_text line)
{
  return vbyte_encode_line((encode_line(line)));
}

std::vector<unsigned int> Huffman::encode_line(line_text line)
{
  std::vector<unsigned int> retvector;

  //Get target_phrase first.
  util::TokenIter<util::SingleCharacter> it(line.target_phrase, util::SingleCharacter(' '));
  while (it) {
    retvector.push_back(target_phrase_huffman.find(it->as_string())->second);
    it++;
  }
  //Add a zero;
  retvector.push_back(0);

  //Get probabilities. Reinterpreting the float bytes as unsgined int.
  util::TokenIter<util::SingleCharacter> probit(line.prob, util::SingleCharacter(' '));
  while (probit) {
    //Sometimes we have too big floats to handle, so first convert to double
    double tempnum = atof(probit->data());
    float num = (float)tempnum;
    retvector.push_back(reinterpret_float(&num));
    probit++;
  }
  //Add a zero;
  retvector.push_back(0);


  //Get Word allignments
  retvector.push_back(word_all1_huffman.find(splitWordAll1(line.word_align))->second);
  retvector.push_back(0);

  return retvector;
}

void Huffman::produce_lookups()
{
  //basically invert every map that we have
  for(std::map<std::string, unsigned int>::iterator it = target_phrase_huffman.begin(); it != target_phrase_huffman.end(); it++ ) {
    lookup_target_phrase.insert(std::pair<unsigned int, std::string>(it->second, it->first));
  }

  for(std::map<std::vector<unsigned char>, unsigned int>::iterator it = word_all1_huffman.begin(); it != word_all1_huffman.end(); it++ ) {
    lookup_word_all1.insert(std::pair<unsigned int, std::vector<unsigned char> >(it->second, it->first));
  }

}

HuffmanDecoder::HuffmanDecoder (const char * dirname)
{
  //Read the maps from disk

  //Note that directory name should exist.
  std::string basedir(dirname);
  std::string target_phrase_path(basedir + "/target_phrases");
  std::string word_all1_path(basedir + "/Wall1");

  //Target phrases
  std::ifstream is (target_phrase_path.c_str(), std::ios::binary);
  boost::archive::text_iarchive iarch(is);
  iarch >> lookup_target_phrase;
  is.close();

  //Word allignment 1
  std::ifstream is2 (word_all1_path.c_str(), std::ios::binary);
  boost::archive::text_iarchive iarch2(is2);
  iarch2 >> lookup_word_all1;
  is2.close();

}

HuffmanDecoder::HuffmanDecoder (std::map<unsigned int, std::string> * lookup_target,
                                std::map<unsigned int, std::vector<unsigned char> > * lookup_word1)
{
  lookup_target_phrase = *lookup_target;
  lookup_word_all1 = *lookup_word1;
}

std::vector<target_text> HuffmanDecoder::full_decode_line (std::vector<unsigned char> lines, int num_scores)
{
  std::vector<target_text> retvector; //All target phrases
  std::vector<unsigned int> decoded_lines = vbyte_decode_line(lines); //All decoded lines
  std::vector<unsigned int>::iterator it = decoded_lines.begin(); //Iterator for them
  std::vector<unsigned int> current_target_phrase; //Current target phrase decoded

  short zero_count = 0; //Count home many zeroes we have met. so far. Every 3 zeroes mean a new target phrase.
  while(it != decoded_lines.end()) {
    if (zero_count == 1) {
      //We are extracting scores. we know how many scores there are so we can push them
      //to the vector. This is done in case any of the scores is 0, because it would mess
      //up the state machine.
      for (int i = 0; i < num_scores; i++) {
        current_target_phrase.push_back(*it);
        it++;
      }
    }

    if (zero_count == 3) {
      //We have finished with this entry, decode it, and add it to the retvector.
      retvector.push_back(decode_line(current_target_phrase, num_scores));
      current_target_phrase.clear(); //Clear the current target phrase and the zero_count
      zero_count = 0; //So that we can reuse them for the next target phrase
    }
    //Add to the next target_phrase, number by number.
    current_target_phrase.push_back(*it);
    if (*it == 0) {
      zero_count++;
    }
    it++; //Go to the next word/symbol
  }
  //Don't forget the last remaining line!
  if (zero_count == 3) {
    //We have finished with this entry, decode it, and add it to the retvector.
    retvector.push_back(decode_line(current_target_phrase, num_scores));
    current_target_phrase.clear(); //Clear the current target phrase and the zero_count
    zero_count = 0; //So that we can reuse them for the next target phrase
  }

  return retvector;

}

target_text HuffmanDecoder::decode_line (std::vector<unsigned int> input, int num_scores)
{
  //demo decoder
  target_text ret;
  //Split everything
  std::vector<unsigned int> target_phrase;
  std::vector<unsigned int> probs;
  unsigned int wAll;

  //Split the line into the proper arrays
  short num_zeroes = 0;
  int counter = 0;
  while (num_zeroes < 3) {
    unsigned int num = input[counter];
    if (num == 0) {
      num_zeroes++;
    } else if (num_zeroes == 0) {
      target_phrase.push_back(num);
    } else if (num_zeroes == 1) {
      //Push exactly num_scores scores
      for (int i = 0; i < num_scores; i++) {
        probs.push_back(num);
        counter++;
        num = input[counter];
      }
      continue;
    } else if (num_zeroes == 2) {
      wAll = num;
    }
    counter++;
  }

  ret.target_phrase = target_phrase;
  ret.word_all1 = lookup_word_all1.find(wAll)->second;

  //Decode probabilities
  for (std::vector<unsigned int>::iterator it = probs.begin(); it != probs.end(); it++) {
    ret.prob.push_back(reinterpret_uint(&(*it)));
  }

  return ret;

}

inline std::string HuffmanDecoder::getTargetWordFromID(unsigned int id)
{
  return lookup_target_phrase.find(id)->second;
}

std::string HuffmanDecoder::getTargetWordsFromIDs(std::vector<unsigned int> ids)
{
  std::string returnstring;
  for (std::vector<unsigned int>::iterator it = ids.begin(); it != ids.end(); it++) {
    returnstring.append(getTargetWordFromID(*it) + " ");
  }

  return returnstring;
}

inline std::string getTargetWordFromID(unsigned int id, std::map<unsigned int, std::string> * lookup_target_phrase)
{
  return lookup_target_phrase->find(id)->second;
}

std::string getTargetWordsFromIDs(std::vector<unsigned int> ids, std::map<unsigned int, std::string> * lookup_target_phrase)
{
  std::string returnstring;
  for (std::vector<unsigned int>::iterator it = ids.begin(); it != ids.end(); it++) {
    returnstring.append(getTargetWordFromID(*it, lookup_target_phrase) + " ");
  }

  return returnstring;
}

/*Those functions are used to more easily store the floats in the binary phrase table
 We convert the float unsinged int so that it is the same as our other values and we can
 apply variable byte encoding on top of it.*/

inline unsigned int reinterpret_float(float * num)
{
  unsigned int * converted_num;
  converted_num = reinterpret_cast<unsigned int *>(num);
  return *converted_num;
}

inline float reinterpret_uint(unsigned int * num)
{
  float * converted_num;
  converted_num = reinterpret_cast<float *>(num);
  return *converted_num;
}

/*Mostly taken from stackoverflow, http://stackoverflow.com/questions/5858646/optimizing-variable-length-encoding
and modified in order to return a vector of chars. Implements ULEB128 or variable byte encoding.
This is highly optimized version with unrolled loop */
inline std::vector<unsigned char> vbyte_encode(unsigned int num)
{
  //Determine how many bytes we are going to take.
  short size;
  std::vector<unsigned char> byte_vector;

  if (num < 0x00000080U) {
    size = 1;
    byte_vector.reserve(size);
    goto b1;
  }
  if (num < 0x00004000U) {
    size = 2;
    byte_vector.reserve(size);
    goto b2;
  }
  if (num < 0x00200000U) {
    size = 3;
    byte_vector.reserve(size);
    goto b3;
  }
  if (num < 0x10000000U) {
    size = 4;
    byte_vector.reserve(size);
    goto b4;
  }
  size = 5;
  byte_vector.reserve(size);


  //Now proceed with the encoding.
  byte_vector.push_back((num & 0x7f) | 0x80);
  num >>= 7;
b4:
  byte_vector.push_back((num & 0x7f) | 0x80);
  num >>= 7;
b3:
  byte_vector.push_back((num & 0x7f) | 0x80);
  num >>= 7;
b2:
  byte_vector.push_back((num & 0x7f) | 0x80);
  num >>= 7;
b1:
  byte_vector.push_back(num);

  return byte_vector;
}

std::vector<unsigned int> vbyte_decode_line(std::vector<unsigned char> line)
{
  std::vector<unsigned int> huffman_line;
  std::vector<unsigned char> current_num;

  for (std::vector<unsigned char>::iterator it = line.begin(); it != line.end(); it++) {
    current_num.push_back(*it);
    if ((*it >> 7) != 1) {
      //We don't have continuation in the next bit
      huffman_line.push_back(bytes_to_int(current_num));
      current_num.clear();
    }
  }
  return huffman_line;
}

inline unsigned int bytes_to_int(std::vector<unsigned char> number)
{
  unsigned int retvalue = 0;
  std::vector<unsigned char>::iterator it = number.begin();
  unsigned char shift = 0; //By how many bits to shift

  while (it != number.end()) {
    retvalue |= (*it & 0x7f) << shift;
    shift += 7;
    it++;
  }

  return retvalue;
}

std::vector<unsigned char> vbyte_encode_line(std::vector<unsigned int> line)
{
  std::vector<unsigned char> retvec;

  //For each unsigned int in the line, vbyte encode it and add it to a vector of unsigned chars.
  for (std::vector<unsigned int>::iterator it = line.begin(); it != line.end(); it++) {
    std::vector<unsigned char> vbyte_encoded = vbyte_encode(*it);
    retvec.insert(retvec.end(), vbyte_encoded.begin(), vbyte_encoded.end());
  }

  return retvec;
}