1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
|
#include "storing.hh"
BinaryFileWriter::BinaryFileWriter (std::string basepath) : os ((basepath + "/binfile.dat").c_str(), std::ios::binary)
{
binfile.reserve(10000); //Reserve part of the vector to avoid realocation
it = binfile.begin();
dist_from_start = 0; //Initialize variables
extra_counter = 0;
}
void BinaryFileWriter::write (std::vector<unsigned char> * bytes)
{
binfile.insert(it, bytes->begin(), bytes->end()); //Insert the bytes
//Keep track of the offsets
it += bytes->size();
dist_from_start = distance(binfile.begin(),it);
//Flush the vector to disk every once in a while so that we don't consume too much ram
if (dist_from_start > 9000) {
flush();
}
}
void BinaryFileWriter::flush ()
{
//Cast unsigned char to char before writing...
os.write((char *)&binfile[0], dist_from_start);
//Clear the vector:
binfile.clear();
binfile.reserve(10000);
extra_counter += dist_from_start; //Keep track of the total number of bytes.
it = binfile.begin(); //Reset iterator
dist_from_start = distance(binfile.begin(),it); //Reset dist from start
}
BinaryFileWriter::~BinaryFileWriter ()
{
os.close();
binfile.clear();
}
void createProbingPT(const char * phrasetable_path, const char * target_path,
const char * num_scores, const char * is_reordering)
{
//Get basepath and create directory if missing
std::string basepath(target_path);
mkdir(basepath.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
//Set up huffman and serialize decoder maps.
Huffman huffmanEncoder(phrasetable_path); //initialize
huffmanEncoder.assign_values();
huffmanEncoder.produce_lookups();
huffmanEncoder.serialize_maps(target_path);
//Get uniq lines:
unsigned long uniq_entries = huffmanEncoder.getUniqLines();
//Source phrase vocabids
std::map<uint64_t, std::string> source_vocabids;
//Read the file
util::FilePiece filein(phrasetable_path);
//Init the probing hash table
size_t size = Table::Size(uniq_entries, 1.2);
char * mem = new char[size];
memset(mem, 0, size);
Table table(mem, size);
BinaryFileWriter binfile(basepath); //Init the binary file writer.
line_text prev_line; //Check if the source phrase of the previous line is the same
//Keep track of the size of each group of target phrases
uint64_t entrystartidx = 0;
//uint64_t line_num = 0;
//Read everything and processs
while(true) {
try {
//Process line read
line_text line;
line = splitLine(filein.ReadLine());
//Add source phrases to vocabularyIDs
add_to_map(&source_vocabids, line.source_phrase);
if ((binfile.dist_from_start + binfile.extra_counter) == 0) {
prev_line = line; //For the first iteration assume the previous line is
} //The same as this one.
if (line.source_phrase != prev_line.source_phrase) {
//Create a new entry even
//Create an entry for the previous source phrase:
Entry pesho;
pesho.value = entrystartidx;
//The key is the sum of hashes of individual words bitshifted by their position in the phrase.
//Probably not entirerly correct, but fast and seems to work fine in practise.
pesho.key = 0;
std::vector<uint64_t> vocabid_source = getVocabIDs(prev_line.source_phrase);
for (int i = 0; i < vocabid_source.size(); i++) {
pesho.key += (vocabid_source[i] << i);
}
pesho.bytes_toread = binfile.dist_from_start + binfile.extra_counter - entrystartidx;
//Put into table
table.Insert(pesho);
entrystartidx = binfile.dist_from_start + binfile.extra_counter; //Designate start idx for new entry
//Encode a line and write it to disk.
std::vector<unsigned char> encoded_line = huffmanEncoder.full_encode_line(line);
binfile.write(&encoded_line);
//Set prevLine
prev_line = line;
} else {
//If we still have the same line, just append to it:
std::vector<unsigned char> encoded_line = huffmanEncoder.full_encode_line(line);
binfile.write(&encoded_line);
}
} catch (util::EndOfFileException e) {
std::cerr << "Reading phrase table finished, writing remaining files to disk." << std::endl;
binfile.flush();
//After the final entry is constructed we need to add it to the phrase_table
//Create an entry for the previous source phrase:
Entry pesho;
pesho.value = entrystartidx;
//The key is the sum of hashes of individual words. Probably not entirerly correct, but fast
pesho.key = 0;
std::vector<uint64_t> vocabid_source = getVocabIDs(prev_line.source_phrase);
for (int i = 0; i < vocabid_source.size(); i++) {
pesho.key += (vocabid_source[i] << i);
}
pesho.bytes_toread = binfile.dist_from_start + binfile.extra_counter - entrystartidx;
//Put into table
table.Insert(pesho);
break;
}
}
serialize_table(mem, size, (basepath + "/probing_hash.dat").c_str());
serialize_map(&source_vocabids, (basepath + "/source_vocabids").c_str());
delete[] mem;
//Write configfile
std::ofstream configfile;
configfile.open((basepath + "/config").c_str());
configfile << API_VERSION << '\n';
configfile << uniq_entries << '\n';
configfile << num_scores << '\n';
configfile << is_reordering << '\n';
configfile.close();
}
|