github.com/moses-smt/nplm.git
Diffstat (limited to 'src/prepareNeuralLM.cpp')
-rw-r--r--  src/prepareNeuralLM.cpp | 1057
1 file changed, 528 insertions(+), 529 deletions(-)
diff --git a/src/prepareNeuralLM.cpp b/src/prepareNeuralLM.cpp
index adedc72..d5fc16b 100644
--- a/src/prepareNeuralLM.cpp
+++ b/src/prepareNeuralLM.cpp
@@ -2,19 +2,19 @@
#include <vector>
#include <queue>
#include <deque>
-# include <fstream>
-# include <iterator>
-
-# include <boost/unordered_map.hpp>
-# include <boost/algorithm/string/join.hpp>
-# include <boost/interprocess/managed_shared_memory.hpp>
-# include <boost/interprocess/allocators/allocator.hpp>
-# include <boost/interprocess/managed_mapped_file.hpp>
+#include <fstream>
+#include <iterator>
+
+#include <boost/unordered_map.hpp>
+#include <boost/algorithm/string/join.hpp>
+#include <boost/interprocess/managed_shared_memory.hpp>
+#include <boost/interprocess/allocators/allocator.hpp>
+#include <boost/interprocess/managed_mapped_file.hpp>
#include <boost/interprocess/containers/vector.hpp>
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/uniform_int_distribution.hpp>
-# include <tclap/CmdLine.h>
+#include <tclap/CmdLine.h>
#include "neuralLM.h"
#include "util.h"
@@ -36,314 +36,313 @@ typedef std::vector<vec,vecAllocator> vecvec;
typedef long long int data_size_t; // training data can easily exceed 2G instances
template<typename T>
-void writeNgrams(const T &data,
- int ngram_size,
- const vocabulary &vocab,
- bool numberize,
- bool add_start_stop,
- bool ngramize,
- const string &filename)
- {
- ofstream file(filename.c_str());
- if (!file)
+void writeNgrams(const T &data,
+ int ngram_size,
+ const vocabulary &vocab,
+ bool numberize,
+ bool add_start_stop,
+ bool ngramize,
+ const string &filename)
+{
+ ofstream file(filename.c_str());
+ if (!file)
+ {
+ cerr << "error: could not open " << filename << endl;
+ exit(1);
+ }
+
+ vector<vector<int> > ngrams;
+
+ for (int i=0; i<data.size(); i++) {
+ preprocessWords(data[i], ngrams, ngram_size, vocab, numberize, add_start_stop, ngramize);
+ // write out n-grams
+ for (int j=0; j<ngrams.size(); j++)
{
- cerr << "error: could not open " << filename << endl;
- exit(1);
- }
-
- vector<vector<int> > ngrams;
-
- for (int i=0; i<data.size(); i++) {
- preprocessWords(data[i], ngrams, ngram_size, vocab, numberize, add_start_stop, ngramize);
- // write out n-grams
- for (int j=0; j<ngrams.size(); j++)
- {
- for (int k=0; k<ngram_size; k++)
- {
- file << ngrams[j][k] << " ";
- }
- file << endl;
- }
+ for (int k=0; k<ngram_size; k++)
+ {
+ file << ngrams[j][k] << " ";
+ }
+ file << endl;
}
- file.close();
+ }
+ file.close();
}
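
Both writeNgrams overloads delegate the real work to preprocessWords, which comes from neuralLM.h and is not shown in this diff. As a reading aid, here is a minimal, self-contained sketch of what that step plausibly does, given how its output is consumed above: numberize each word through the vocabulary, pad with <s>/</s> when add_start_stop is set, and slide an ngram_size window over the result. The lookup helper and the exact padding (ngram_size-1 start symbols, one stop symbol) are assumptions for illustration, though they are consistent with the accounting later in main, where a line of N tokens contributes N+1 to num_tokens, one per window.

#include <iostream>
#include <map>
#include <string>
#include <vector>

// Hypothetical stand-in for vocabulary::lookup_word (the real class lives in
// neuralLM.h); unknown words map to <unk>.
int lookup(const std::map<std::string,int> &vocab, const std::string &w) {
    std::map<std::string,int>::const_iterator it = vocab.find(w);
    return it != vocab.end() ? it->second : vocab.find("<unk>")->second;
}

// Sketch of the preprocessing step: numberize, pad, and window into n-grams.
void makeNgrams(const std::vector<std::string> &words,
                std::vector<std::vector<int> > &ngrams,
                int ngram_size,
                const std::map<std::string,int> &vocab,
                bool add_start_stop) {
    ngrams.clear();
    std::vector<int> nums;
    if (add_start_stop)  // assumed: ngram_size-1 start symbols, one stop symbol
        nums.assign(ngram_size - 1, lookup(vocab, "<s>"));
    for (std::size_t i = 0; i < words.size(); i++)
        nums.push_back(lookup(vocab, words[i]));
    if (add_start_stop)
        nums.push_back(lookup(vocab, "</s>"));
    for (std::size_t j = 0; j + ngram_size <= nums.size(); j++)  // sliding window
        ngrams.push_back(std::vector<int>(nums.begin() + j,
                                          nums.begin() + j + ngram_size));
}

int main() {
    std::map<std::string,int> vocab;
    vocab["<s>"] = 0; vocab["</s>"] = 1; vocab["<unk>"] = 2;
    vocab["the"] = 3; vocab["cat"] = 4;
    std::vector<std::string> words;
    words.push_back("the"); words.push_back("cat");
    std::vector<std::vector<int> > ngrams;
    makeNgrams(words, ngrams, 3, vocab, true);
    for (std::size_t j = 0; j < ngrams.size(); j++) {  // prints 0 0 3 / 0 3 4 / 3 4 1
        for (int k = 0; k < 3; k++) std::cout << ngrams[j][k] << " ";
        std::cout << std::endl;
    }
    return 0;
}
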
// Space efficient version for writing the n-grams.
// They are not read into memory.
-void writeNgrams(const string &input_filename,
- int ngram_size,
- const vocabulary &vocab,
- bool numberize,
- bool add_start_stop,
- bool ngramize,
- const string &filename,
- int train_data_size,
- vector<float> &sent_weights,
- const string &sent_weights_filename)
+void writeNgrams(const string &input_filename,
+ int ngram_size,
+ const vocabulary &vocab,
+ bool numberize,
+ bool add_start_stop,
+ bool ngramize,
+ const string &filename,
+ int train_data_size,
+ vector<float> &sent_weights,
+ const string &sent_weights_filename)
{
- ofstream file(filename.c_str());
- ofstream output_sent_weights_file(sent_weights_filename.c_str());
- if (!file)
- {
- cerr << "error: could not open " << filename << endl;
- exit(1);
+ ofstream file(filename.c_str());
+ ofstream output_sent_weights_file(sent_weights_filename.c_str());
+ if (!file)
+ {
+ cerr << "error: could not open " << filename << endl;
+ exit(1);
+ }
+
+ ifstream input_file(input_filename.c_str());
+ vector<vector<int> > ngrams;
+ //for (int i=0; i<train_data.size(); i++) {
+ string line;
+ int counter = 0;
+ cerr<<"Processed ... ";
+ while (getline(input_file,line) && train_data_size-- > 0) {
+ counter++;
+ if ((counter % 100000) == 0) {
+ cerr<<counter<<" training lines ... ";
}
-
- ifstream input_file(input_filename.c_str());
- vector<vector<int> > ngrams;
- //for (int i=0; i<train_data.size(); i++) {
- string line;
- int counter = 0;
- cerr<<"Processed ... ";
- while (getline(input_file,line) && train_data_size-- > 0) {
- counter++;
- if ((counter % 100000) == 0) {
- cerr<<counter<<" training lines ... ";
- }
- //stringstream lstr(line);
- vector<string> lstr_items;
- splitBySpace(line,lstr_items);
+ //stringstream lstr(line);
+ vector<string> lstr_items;
+ splitBySpace(line,lstr_items);
//for (int i=0; i<data.size(); i++) {
- preprocessWords(lstr_items,
- ngrams,
- ngram_size,
- vocab,
- numberize,
- add_start_stop,
- ngramize);
-
- // write out n-grams
- for (int j=0; j<ngrams.size(); j++)
- {
- if (sent_weights.size() != 0) {
- output_sent_weights_file <<sent_weights[counter-1]<<endl;
- }
- for (int k=0; k<ngram_size; k++)
- {
- file << ngrams[j][k] << " ";
- }
- file << endl;
- }
+ preprocessWords(lstr_items,
+ ngrams,
+ ngram_size,
+ vocab,
+ numberize,
+ add_start_stop,
+ ngramize);
+
+ // write out n-grams
+ for (int j=0; j<ngrams.size(); j++)
+ {
+ if (sent_weights.size() != 0) {
+ output_sent_weights_file <<sent_weights[counter-1]<<endl;
+ }
+ for (int k=0; k<ngram_size; k++)
+ {
+ file << ngrams[j][k] << " ";
+ }
+ file << endl;
}
- cerr<<endl;
- input_file.close();
- file.close();
- output_sent_weights_file.close();
+ }
+ cerr<<endl;
+ input_file.close();
+ file.close();
+ output_sent_weights_file.close();
}
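
One behavior of this streaming overload worth spelling out: when sent_weights is non-empty, sent_weights[counter-1] is written once per n-gram, so the weights file comes out parallel to the n-grams file rather than to the input sentences. A tiny sketch of that expansion with hypothetical names, assuming the per-sentence n-gram counts are already known:

#include <iostream>
#include <vector>

// Hypothetical helper: replicate each sentence weight once per n-gram so the
// weights file lines up 1:1 with the n-grams file written above.
void expandWeights(const std::vector<float> &sent_weights,
                   const std::vector<int> &ngrams_per_sent,  // n-grams in each sentence
                   std::ostream &out) {
    for (std::size_t i = 0; i < sent_weights.size(); i++)
        for (int j = 0; j < ngrams_per_sent[i]; j++)
            out << sent_weights[i] << std::endl;
}

int main() {
    std::vector<float> w;  w.push_back(0.5f); w.push_back(2.0f);
    std::vector<int> n;    n.push_back(3);    n.push_back(1);
    expandWeights(w, n, std::cout);  // prints 0.5 three times, then 2 once
    return 0;
}
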
// Space efficient version for writing the n-grams.
// They are not read into memory.
-void writeMmapNgrams(const string &input_filename,
- int ngram_size,
- const vocabulary &vocab,
- bool numberize,
- bool add_start_stop,
- bool ngramize,
- const string &filename,
- unsigned long train_data_size,
- data_size_t num_tokens,
- bool randomize)
+void writeMmapNgrams(const string &input_filename,
+ int ngram_size,
+ const vocabulary &vocab,
+ bool numberize,
+ bool add_start_stop,
+ bool ngramize,
+ const string &filename,
+ unsigned long train_data_size,
+ data_size_t num_tokens,
+ bool randomize)
{
- cerr<<"Num tokens is "<<num_tokens<<endl;
- cerr<<"Training data size is "<<train_data_size<<endl;
- // Open the memory mapped file and create the allocators
- ip::managed_mapped_file mfile(ip::create_only,
- filename.c_str(),
- num_tokens*ngram_size*sizeof(int)+1024UL*1024UL);
- intAllocator ialloc(mfile.get_segment_manager());
- vecAllocator valloc (mfile.get_segment_manager());
- //vecvec *mMapVecVec= mfile.construct<vecvec>("data")(num_tokens,vec(ialloc),valloc);
-
- vec *mMapVec= mfile.construct<vec>("vector")(num_tokens*ngram_size,0,ialloc);
-
- cerr<<"The size of mmaped vec is "<<mMapVec->size()<<endl;
- // Going over every line in the input file and
- // printing the memory mapped ngrams into the
- // output file
- ifstream input_file(input_filename.c_str());
- //for (int i=0; i<train_data.size(); i++) {
- string line;
- int counter = 0;
- cerr<<"Processed ... ";
- long int train_ngram_counter = 0;
- vector<vector<int> > ngrams;
- while (getline(input_file,line) && train_data_size-- > 0) {
- counter++;
- if ((counter % 100000) ==0) {
- //cerr<<"counter is "<<counter<<endl;
- cerr<<counter<<" training lines ... ";
- }
- //stringstream lstr(line);
- vector<string> lstr_items;
- splitBySpace(line,lstr_items);
+ cerr<<"Num tokens is "<<num_tokens<<endl;
+ cerr<<"Training data size is "<<train_data_size<<endl;
+ // Open the memory mapped file and create the allocators
+ ip::managed_mapped_file mfile(ip::create_only,
+ filename.c_str(),
+ num_tokens*ngram_size*sizeof(int)+1024UL*1024UL);
+ intAllocator ialloc(mfile.get_segment_manager());
+ vecAllocator valloc (mfile.get_segment_manager());
+ //vecvec *mMapVecVec= mfile.construct<vecvec>("data")(num_tokens,vec(ialloc),valloc);
+
+ vec *mMapVec= mfile.construct<vec>("vector")(num_tokens*ngram_size,0,ialloc);
+
+ cerr<<"The size of mmaped vec is "<<mMapVec->size()<<endl;
+ // Going over every line in the input file and
+ // printing the memory mapped ngrams into the
+ // output file
+ ifstream input_file(input_filename.c_str());
+ //for (int i=0; i<train_data.size(); i++) {
+ string line;
+ int counter = 0;
+ cerr<<"Processed ... ";
+ long int train_ngram_counter = 0;
+ vector<vector<int> > ngrams;
+ while (getline(input_file,line) && train_data_size-- > 0) {
+ counter++;
+ if ((counter % 100000) ==0) {
+ //cerr<<"counter is "<<counter<<endl;
+ cerr<<counter<<" training lines ... ";
+ }
+ //stringstream lstr(line);
+ vector<string> lstr_items;
+ splitBySpace(line,lstr_items);
//for (int i=0; i<data.size(); i++) {
- preprocessWords(lstr_items, ngrams,
- ngram_size,
- vocab,
- numberize,
- add_start_stop,
- ngramize);
- /*
+ preprocessWords(lstr_items, ngrams,
+ ngram_size,
+ vocab,
+ numberize,
+ add_start_stop,
+ ngramize);
+ /*
cerr<<"line is "<<endl;
cerr<<line<<endl;
cerr<<"Number of ngrams is "<<ngrams.size()<<endl;
- if (ngrams.size() ==1 ){
- cerr<<"The line number was "<<counter<<endl;
- cerr<<line<<endl;
+ if (ngrams.size() ==1 ){
+ cerr<<"The line number was "<<counter<<endl;
+ cerr<<line<<endl;
+ }
+ */
+ // write out n-grams in mmapped file
+ for (int j=0; j<ngrams.size(); j++)
+ {
+ /*
+ for (int k=0; k<ngram_size; k++)
+ {
+ cerr << ngrams[j][k] << " ";
}
+ cerr<< endl;
*/
- // write out n-grams in mmapped file
- for (int j=0; j<ngrams.size(); j++)
- {
- /*
- for (int k=0; k<ngram_size; k++)
- {
- cerr << ngrams[j][k] << " ";
- }
- cerr<< endl;
- */
- for (int k=0; k<ngram_size; k++) {
- mMapVec->at(train_ngram_counter*ngram_size+k) = ngrams[j][k];
- }
- train_ngram_counter++;
- //cerr<<"Train ngram counter is "<<train_ngram_counter<<endl;
- }
+ for (int k=0; k<ngram_size; k++) {
+ mMapVec->at(train_ngram_counter*ngram_size+k) = ngrams[j][k];
+ }
+ train_ngram_counter++;
+ //cerr<<"Train ngram counter is "<<train_ngram_counter<<endl;
}
- cerr<<endl;
- input_file.close();
-
- // Shrink the file if it was overused
- ip::managed_mapped_file::shrink_to_fit(filename.c_str());
- //now to randomize the items if the randomize flag was set
- if (randomize == true) {
- unsigned seed = 1234; //for testing only
- mt19937 rng(seed);
- cerr<<"Randomly shuffling data...";
- data_size_t counter =0;
- while (counter < num_tokens) {
- data_size_t upper_limit = counter+5000000;
- long int vector_size = 5000000;
- if (counter + 10000000 >= num_tokens) {
- upper_limit = num_tokens;
- vector_size = num_tokens - counter;
- }
- vector<int> temp(vector_size*ngram_size,0);
- for (int i=0;i<vector_size;i++){
- for (int k=0;k<ngram_size;k++) {
- temp[i*ngram_size+k] = mMapVec->at((i+counter)*ngram_size+k);
- }
- }
- for (data_size_t i=vector_size-1; i>0; i--)
- {
- if (i %500000 == 0) {
- cerr<<"Shuffled "<<num_tokens-1<<" instances...";
- }
- data_size_t j = boost::random::uniform_int_distribution<data_size_t>(0, i-1)(rng);
- for (int k=0;k<ngram_size;k++) {
- int temp_val = temp.at(i*ngram_size+k);
- temp.at(i*ngram_size+k) =
- temp.at(j*ngram_size+k);
- temp.at(j*ngram_size+k) = temp_val;
- }
- }
- //Putting it back
- for (int i=0;i<vector_size;i++){
- for (int k=0;k<ngram_size;k++) {
- mMapVec->at((i+counter)*ngram_size+k) = temp[i*ngram_size+k];
- }
- }
- counter = upper_limit;
+ }
+ cerr<<endl;
+ input_file.close();
+
+ // Shrink the file if it was allocated larger than needed
+ ip::managed_mapped_file::shrink_to_fit(filename.c_str());
+ //now to randomize the items if the randomize flag was set
+ if (randomize == true) {
+ unsigned seed = 1234; //for testing only
+ boost::random::mt19937 rng(seed);
+ cerr<<"Randomly shuffling data...";
+ data_size_t counter =0;
+ while (counter < num_tokens) {
+ data_size_t upper_limit = counter+5000000;
+ long int vector_size = 5000000;
+ if (counter + 10000000 >= num_tokens) {
+ upper_limit = num_tokens;
+ vector_size = num_tokens - counter;
+ }
+ vector<int> temp(vector_size*ngram_size,0);
+ for (int i=0;i<vector_size;i++){
+ for (int k=0;k<ngram_size;k++) {
+ temp[i*ngram_size+k] = mMapVec->at((i+counter)*ngram_size+k);
}
-
- /*
- for (data_size_t i=num_tokens-1; i>0; i--)
+ }
+ for (data_size_t i=vector_size-1; i>0; i--)
{
if (i %500000 == 0) {
cerr<<"Shuffled "<<num_tokens-1<<" instances...";
}
data_size_t j = boost::random::uniform_int_distribution<data_size_t>(0, i-1)(rng);
for (int k=0;k<ngram_size;k++) {
- int temp_val = mMapVec->at(i*ngram_size+k);
- mMapVec->at(i*ngram_size+k) =
- mMapVec->at(j*ngram_size+k);
- mMapVec->at(j*ngram_size+k) = temp_val;
+ int temp_val = temp.at(i*ngram_size+k);
+ temp.at(i*ngram_size+k) =
+ temp.at(j*ngram_size+k);
+ temp.at(j*ngram_size+k) = temp_val;
}
}
- */
- cerr<<endl;
+ //Putting it back
+ for (int i=0;i<vector_size;i++){
+ for (int k=0;k<ngram_size;k++) {
+ mMapVec->at((i+counter)*ngram_size+k) = temp[i*ngram_size+k];
+ }
+ }
+ counter = upper_limit;
}
+
+ /*
+ for (data_size_t i=num_tokens-1; i>0; i--)
+ {
+ if (i %500000 == 0) {
+ cerr<<"Shuffled "<<num_tokens-1<<" instances...";
+ }
+ data_size_t j = boost::random::uniform_int_distribution<data_size_t>(0, i-1)(rng);
+ for (int k=0;k<ngram_size;k++) {
+ int temp_val = mMapVec->at(i*ngram_size+k);
+ mMapVec->at(i*ngram_size+k) =
+ mMapVec->at(j*ngram_size+k);
+ mMapVec->at(j*ngram_size+k) = temp_val;
+ }
+ }
+ */
+ cerr<<endl;
+ }
}
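
Three Boost.Interprocess idioms carry writeMmapNgrams: pre-size the mapped file to num_tokens*ngram_size ints plus roughly 1MB of slack for segment bookkeeping, construct a named vector inside the mapping through a segment allocator, and call shrink_to_fit afterwards to return the unused slack. The sketch below exercises that same lifecycle end to end; the file name and element count are illustrative, not taken from the diff.

#include <cstddef>
#include <iostream>
#include <boost/interprocess/file_mapping.hpp>
#include <boost/interprocess/managed_mapped_file.hpp>
#include <boost/interprocess/allocators/allocator.hpp>
#include <boost/interprocess/containers/vector.hpp>

namespace ip = boost::interprocess;
typedef ip::allocator<int, ip::managed_mapped_file::segment_manager> intAlloc;
typedef ip::vector<int, intAlloc> mmapVec;

int main() {
    const char *path = "ngrams.mmap";   // illustrative name, not from the diff
    const std::size_t n_ints = 1000;    // num_tokens * ngram_size in the real code
    ip::file_mapping::remove(path);     // start from a clean file
    {
        // Create the mapping with ~1MB of slack for segment bookkeeping,
        // then build a named vector inside it via the segment allocator.
        ip::managed_mapped_file mfile(ip::create_only, path,
                                      n_ints * sizeof(int) + 1024UL * 1024UL);
        intAlloc ialloc(mfile.get_segment_manager());
        mmapVec *v = mfile.construct<mmapVec>("vector")(n_ints, 0, ialloc);
        for (std::size_t i = 0; i < n_ints; i++)
            v->at(i) = static_cast<int>(i);   // stands in for the n-gram writes
    }                                         // mapping is flushed on destruction
    ip::managed_mapped_file::shrink_to_fit(path);  // give back the unused slack
    // Reopen and look the vector up by name, as a consumer process would.
    ip::managed_mapped_file mfile(ip::open_only, path);
    mmapVec *v = mfile.find<mmapVec>("vector").first;
    std::cout << "size=" << v->size() << " last=" << v->back() << std::endl;
    return 0;
}
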
int main(int argc, char *argv[])
{
- ios::sync_with_stdio(false);
- int ngram_size, vocab_size, validation_size;
- bool numberize,
- ngramize,
- add_start_stop,
- mmap_file,
- randomize;
-
- string train_text,
- train_file,
- validation_text,
- validation_file,
- words_file,
- write_words_file,
- sent_weights_text,
- output_sent_weights_text;
-
- try
- {
- CmdLine cmd("Prepares training data for training a language model.", ' ', "0.1");
-
- // The options are printed in reverse order
+ ios::sync_with_stdio(false);
+ int ngram_size, vocab_size, validation_size;
+ bool numberize,
+ ngramize,
+ add_start_stop,
+ mmap_file,
+ randomize;
+
+ string train_text,
+ train_file,
+ validation_text,
+ validation_file,
+ words_file,
+ write_words_file,
+ sent_weights_text,
+ output_sent_weights_text;
+
+ try
+ {
+ CmdLine cmd("Prepares training data for training a language model.", ' ', "0.1");
+
+ // The options are printed in reverse order
ValueArg<bool> arg_ngramize("", "ngramize", "If true, convert lines to ngrams. Default: true.", false, true, "bool", cmd);
ValueArg<bool> arg_numberize("", "numberize", "If true, convert words to numbers. Default: true.", false, true, "bool", cmd);
ValueArg<bool> arg_add_start_stop("", "add_start_stop", "If true, prepend <s> and append </s>. Default: true.", false, true, "bool", cmd);
ValueArg<bool> arg_mmap_file("", "mmap_file", "If true, the training file will be a memory mapped file. \n This is "
- "needed if the entire training data cannot fit in memory. Default: false.", false, false, "bool", cmd);
+ "needed if the entire training data cannot fit in memory. Default: false.", false, false, "bool", cmd);
ValueArg<bool> arg_randomize("", "randomize", "If true, randomly shuffle the training ngrams. Can only be used with mmap_file = 1. Default: false.", false, false, "bool", cmd);
ValueArg<int> arg_vocab_size("", "vocab_size", "Vocabulary size.", false, -1, "int", cmd);
ValueArg<string> arg_words_file("", "words_file", "File specifying words that should be included in vocabulary; all other words will be replaced by <unk>.", false, "", "string", cmd);
ValueArg<int> arg_ngram_size("", "ngram_size", "Size of n-grams.", true, -1, "int", cmd);
- ValueArg<string> arg_write_words_file("", "write_words_file", "Output vocabulary.", false, "", "string", cmd);
+ ValueArg<string> arg_write_words_file("", "write_words_file", "Output vocabulary.", false, "", "string", cmd);
ValueArg<int> arg_validation_size("", "validation_size", "How many lines from training data to hold out for validation. Default: 0.", false, 0, "int", cmd);
- ValueArg<string> arg_validation_file("", "validation_file", "Output validation data (numberized n-grams).", false, "", "string", cmd);
- ValueArg<string> arg_validation_text("", "validation_text", "Input validation data (tokenized). Overrides --validation_size. Default: none.", false, "", "string", cmd);
- ValueArg<string> arg_train_file("", "train_file", "Output training data (numberized n-grams).", false, "", "string", cmd);
- ValueArg<string> arg_train_text("", "train_text", "Input training data (tokenized).", true, "", "string", cmd);
- //ValueArg<string> arg_sent_weights_text("", "sent_weights_text", "The sentence weights text", false, "", "string", cmd);
- //ValueArg<string> arg_sent_weights_file("", "sent_weights_file", "The file to write the per ngram weights", false, "", "string", cmd);
-
-
-
- cmd.parse(argc, argv);
-
- train_text = arg_train_text.getValue();
- train_file = arg_train_file.getValue();
- validation_text = arg_validation_text.getValue();
- validation_file = arg_validation_file.getValue();
- validation_size = arg_validation_size.getValue();
- write_words_file = arg_write_words_file.getValue();
- ngram_size = arg_ngram_size.getValue();
- vocab_size = arg_vocab_size.getValue();
- words_file = arg_words_file.getValue();
- numberize = arg_numberize.getValue();
- ngramize = arg_ngramize.getValue();
- add_start_stop = arg_add_start_stop.getValue();
- mmap_file = arg_mmap_file.getValue();
- randomize = arg_randomize.getValue();
- //sent_weights_text = arg_sent_weights_text.getValue();
- //output_sent_weights_text = arg_sent_weights_file.getValue();
- sent_weights_text = "";
- output_sent_weights_text = "";
+ ValueArg<string> arg_validation_file("", "validation_file", "Output validation data (numberized n-grams).", false, "", "string", cmd);
+ ValueArg<string> arg_validation_text("", "validation_text", "Input validation data (tokenized). Overrides --validation_size. Default: none.", false, "", "string", cmd);
+ ValueArg<string> arg_train_file("", "train_file", "Output training data (numberized n-grams).", false, "", "string", cmd);
+ ValueArg<string> arg_train_text("", "train_text", "Input training data (tokenized).", true, "", "string", cmd);
+ //ValueArg<string> arg_sent_weights_text("", "sent_weights_text", "The sentence weights text", false, "", "string", cmd);
+ //ValueArg<string> arg_sent_weights_file("", "sent_weights_file", "The file to write the per ngram weights", false, "", "string", cmd);
+
+
+ cmd.parse(argc, argv);
+
+ train_text = arg_train_text.getValue();
+ train_file = arg_train_file.getValue();
+ validation_text = arg_validation_text.getValue();
+ validation_file = arg_validation_file.getValue();
+ validation_size = arg_validation_size.getValue();
+ write_words_file = arg_write_words_file.getValue();
+ ngram_size = arg_ngram_size.getValue();
+ vocab_size = arg_vocab_size.getValue();
+ words_file = arg_words_file.getValue();
+ numberize = arg_numberize.getValue();
+ ngramize = arg_ngramize.getValue();
+ add_start_stop = arg_add_start_stop.getValue();
+ mmap_file = arg_mmap_file.getValue();
+ randomize = arg_randomize.getValue();
+ //sent_weights_text = arg_sent_weights_text.getValue();
+ //output_sent_weights_text = arg_sent_weights_file.getValue();
+ sent_weights_text = "";
+ output_sent_weights_text = "";
// check command line arguments
@@ -364,292 +363,292 @@ int main(int argc, char *argv[])
cerr << "Command line: " << endl;
cerr << boost::algorithm::join(vector<string>(argv, argv+argc), " ") << endl;
-
- const string sep(" Value: ");
- cerr << arg_train_text.getDescription() << sep << arg_train_text.getValue() << endl;
- cerr << arg_train_file.getDescription() << sep << arg_train_file.getValue() << endl;
- cerr << arg_validation_text.getDescription() << sep << arg_validation_text.getValue() << endl;
- cerr << arg_validation_file.getDescription() << sep << arg_validation_file.getValue() << endl;
- cerr << arg_validation_size.getDescription() << sep << arg_validation_size.getValue() << endl;
- cerr << arg_write_words_file.getDescription() << sep << arg_write_words_file.getValue() << endl;
- cerr << arg_ngram_size.getDescription() << sep << arg_ngram_size.getValue() << endl;
- cerr << arg_vocab_size.getDescription() << sep << arg_vocab_size.getValue() << endl;
- cerr << arg_words_file.getDescription() << sep << arg_words_file.getValue() << endl;
- cerr << arg_numberize.getDescription() << sep << arg_numberize.getValue() << endl;
- cerr << arg_ngramize.getDescription() << sep << arg_ngramize.getValue() << endl;
- cerr << arg_add_start_stop.getDescription() << sep << arg_add_start_stop.getValue() << endl;
- cerr << arg_mmap_file.getDescription() << sep << arg_mmap_file.getValue() << endl;
- //cerr << arg_sent_weights_text.getDescription() << sep << arg_sent_weights_text.getValue() << endl;
- }
- catch (TCLAP::ArgException &e)
- {
- cerr << "error: " << e.error() << " for arg " << e.argId() << endl;
- exit(1);
- }
- // VLF: why is this true?
- // DC: it's because the vocabulary has to be constructed from the training data only.
- // If the vocabulary is preset, we can't create the validation data.
- // - if --numberize 0 is set, then --validation_size cannot be used.
- // if (!numberize && (validation_size > 0)) {
- // cerr << "Warning: without setting --numberize to 1, --validation_size cannot be used." << endl;
- // }
-
- // Read in training data and validation data
- // vector<vector<string> > train_data;
- // readSentFile(train_text, train_data);
- // @vaswani: No more reading the entire training file into memory
- // Reading it per line with file io
-
- //for (int i=0; i<train_data.size(); i++) {
- // Go over every line in the file and
- // 1. if the !ngramize then you should check if
- // we have the correct number of items per line
- // 2. build the vocabulary if the words file has not
- // been specified.
- // Construct vocabulary
- vocabulary vocab;
- int start, stop;
- // Add start stop if the vocabulary has not been supplied
- if (words_file == "") {
- vocab.insert_word("<s>");
- vocab.insert_word("</s>");
- vocab.insert_word("<null>");
- // warn user that if --numberize is not set, there will be no vocabulary!
- if (!numberize) {
- cerr << "Warning: with --numberize 0 and --words_file == "", there will be no vocabulary!" << endl;
- }
- }
- if (mmap_file == false && randomize == true) {
- cerr<<"Randomize option can only be used with mmap_file = 1"<<endl;
- exit(1);
+ const string sep(" Value: ");
+ cerr << arg_train_text.getDescription() << sep << arg_train_text.getValue() << endl;
+ cerr << arg_train_file.getDescription() << sep << arg_train_file.getValue() << endl;
+ cerr << arg_validation_text.getDescription() << sep << arg_validation_text.getValue() << endl;
+ cerr << arg_validation_file.getDescription() << sep << arg_validation_file.getValue() << endl;
+ cerr << arg_validation_size.getDescription() << sep << arg_validation_size.getValue() << endl;
+ cerr << arg_write_words_file.getDescription() << sep << arg_write_words_file.getValue() << endl;
+ cerr << arg_ngram_size.getDescription() << sep << arg_ngram_size.getValue() << endl;
+ cerr << arg_vocab_size.getDescription() << sep << arg_vocab_size.getValue() << endl;
+ cerr << arg_words_file.getDescription() << sep << arg_words_file.getValue() << endl;
+ cerr << arg_numberize.getDescription() << sep << arg_numberize.getValue() << endl;
+ cerr << arg_ngramize.getDescription() << sep << arg_ngramize.getValue() << endl;
+ cerr << arg_add_start_stop.getDescription() << sep << arg_add_start_stop.getValue() << endl;
+ cerr << arg_mmap_file.getDescription() << sep << arg_mmap_file.getValue() << endl;
+ //cerr << arg_sent_weights_text.getDescription() << sep << arg_sent_weights_text.getValue() << endl;
+ }
+ catch (TCLAP::ArgException &e)
+ {
+ cerr << "error: " << e.error() << " for arg " << e.argId() << endl;
+ exit(1);
+ }
+
+ // VLF: why is this true?
+ // DC: it's because the vocabulary has to be constructed from the training data only.
+ // If the vocabulary is preset, we can't create the validation data.
+ // - if --numberize 0 is set, then --validation_size cannot be used.
+ // if (!numberize && (validation_size > 0)) {
+ // cerr << "Warning: without setting --numberize to 1, --validation_size cannot be used." << endl;
+ // }
+
+ // Read in training data and validation data
+ // vector<vector<string> > train_data;
+ // readSentFile(train_text, train_data);
+ // @vaswani: No more reading the entire training file into memory
+ // Reading it per line with file io
+
+ //for (int i=0; i<train_data.size(); i++) {
+ // Go over every line in the file and
+ // 1. if !ngramize, check that each line has
+ // the correct number of items
+ // 2. build the vocabulary if the words file has not
+ // been specified.
+ // Construct vocabulary
+ vocabulary vocab;
+ int start, stop;
+ // Add start stop if the vocabulary has not been supplied
+ if (words_file == "") {
+ vocab.insert_word("<s>");
+ vocab.insert_word("</s>");
+ vocab.insert_word("<null>");
+ // warn user that if --numberize is not set, there will be no vocabulary!
+ if (!numberize) {
+ cerr << "Warning: with --numberize 0 and --words_file == "", there will be no vocabulary!" << endl;
}
- unordered_map<string,int> count; // For keeping word counts if no supplied vocab
-
- deque<vector<string> > validation_data;
- int train_data_size=0;
- cerr<<"Processed ... ";
- data_size_t num_tokens=0;
-
- ifstream training(train_text.c_str());
-
- string line;
- while (getline(training,line)) {
- train_data_size++;
- //stringstream lstr(line);
- vector<string> lstr_items;
- splitBySpace(line,lstr_items);
- // if data is already ngramized, set/check ngram_size
- if (!ngramize) {
- if (ngram_size > 0) {
- if (ngram_size != lstr_items.size()) {
- cerr << "Error: size of training ngrams does not match specified value of --ngram_size!" << endl;
- }
- }
- // else if --ngram_size has not been specified, set it now
- else {
- ngram_size=lstr_items.size();
- }
+ }
+ if (mmap_file == false && randomize == true) {
+ cerr<<"Randomize option can only be used with mmap_file = 1"<<endl;
+ exit(1);
+ }
+ unordered_map<string,int> count; // For keeping word counts if no supplied vocab
+
+ deque<vector<string> > validation_data;
+ int train_data_size=0;
+ cerr<<"Processed ... ";
+ data_size_t num_tokens=0;
+
+ ifstream training(train_text.c_str());
+
+ string line;
+ while (getline(training,line)) {
+ train_data_size++;
+ //stringstream lstr(line);
+ vector<string> lstr_items;
+ splitBySpace(line,lstr_items);
+ // if data is already ngramized, set/check ngram_size
+ if (!ngramize) {
+ if (ngram_size > 0) {
+ if (ngram_size != lstr_items.size()) {
+ cerr << "Error: size of training ngrams does not match specified value of --ngram_size!" << endl;
+ }
}
- if ((train_data_size%100000)==0){
- cerr<<train_data_size<<" lines ... ";
+ // else if --ngram_size has not been specified, set it now
+ else {
+ ngram_size=lstr_items.size();
}
- //break;
- /*
+ }
+ if ((train_data_size%100000)==0){
+ cerr<<train_data_size<<" lines ... ";
+ }
+ //break;
+ /*
if (lstr_items.size() ==1) {
- cerr<<"line :"<<endl;
- cerr<<line<<endl;
- cerr<<"The number of items was 1"<<endl;
- getchar();
- }
- */
- num_tokens += lstr_items.size()+1;
- if (words_file == "") {
- for (int j=0; j<lstr_items.size(); j++) {
- count[lstr_items[j]] += 1;
- }
+ cerr<<"line :"<<endl;
+ cerr<<line<<endl;
+ cerr<<"The number of items was 1"<<endl;
+ getchar();
}
- // Add to validation set if the validation size
- // has not been specified
- if (validation_text == "" && validation_size > 0) {
- //cerr<<"validation size is "<<validation_data.size()<<endl;
- if (validation_data.size() == validation_size) {
- //validation_data.erase(validation_data.begin());
- validation_data.pop_front();
- }
- validation_data.push_back(lstr_items);
+ */
+ num_tokens += lstr_items.size()+1;
+ if (words_file == "") {
+ for (int j=0; j<lstr_items.size(); j++) {
+ count[lstr_items[j]] += 1;
}
}
- cerr<<endl;
- training.close();
- //cerr<<"validation size is "<<validation_data.size()<<endl;
- //getchar();
- if (validation_data.size() < validation_size) {
- cerr<<"validation size is "<<validation_data.size()<<endl;
- cerr << "error: requested validation size is greater than training data size" << endl;
- exit(1);
+ // Add to validation set if the validation size
+ // has not been specified
+ if (validation_text == "" && validation_size > 0) {
+ //cerr<<"validation size is "<<validation_data.size()<<endl;
+ if (validation_data.size() == validation_size) {
+ //validation_data.erase(validation_data.begin());
+ validation_data.pop_front();
+ }
+ validation_data.push_back(lstr_items);
}
-
- train_data_size -= validation_size;
- cerr<<"Training data size is "<<train_data_size<<endl;
-
- // The items in the validation data have already been counted
- // Decrementing the counts of those words before building the vocabulary
- for(int i=0; i<validation_data.size(); i++){
- num_tokens -= (validation_data[i].size() +1);
- for (int j=0; j<validation_data[i].size();j++){
- count[validation_data[i][j]] -= 1;
- if (count[validation_data[i][j]] == 0) {
- count.erase(validation_data[i][j]);
- }
+ }
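
The deque a few lines above implements the validation holdout without a second pass over the corpus: it keeps only the most recent validation_size lines, dropping the oldest once full, so whatever it holds at end of file is exactly the last validation_size sentences. A minimal standalone sketch of that sliding-window idiom, with a string stream standing in for the training file:

#include <deque>
#include <iostream>
#include <sstream>
#include <string>

int main() {
    const std::size_t validation_size = 3;             // hold out the last 3 lines
    std::istringstream corpus("l1\nl2\nl3\nl4\nl5\n");  // stands in for train_text
    std::deque<std::string> tail;
    std::string line;
    while (std::getline(corpus, line)) {
        if (tail.size() == validation_size)
            tail.pop_front();                           // drop the oldest held-out line
        tail.push_back(line);
    }
    for (std::size_t i = 0; i < tail.size(); i++)       // prints l3 l4 l5
        std::cout << tail[i] << std::endl;
    return 0;
}
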
+ cerr<<endl;
+ training.close();
+ //cerr<<"validation size is "<<validation_data.size()<<endl;
+ //getchar();
+ if (validation_data.size() < validation_size) {
+ cerr<<"validation size is "<<validation_data.size()<<endl;
+ cerr << "error: requested validation size is greater than training data size" << endl;
+ exit(1);
+ }
+
+ train_data_size -= validation_size;
+ cerr<<"Training data size is "<<train_data_size<<endl;
+
+ // The items in the validation data have already been counted
+ // Decrementing the counts of those words before building the vocabulary
+ for(int i=0; i<validation_data.size(); i++){
+ num_tokens -= (validation_data[i].size() +1);
+ for (int j=0; j<validation_data[i].size();j++){
+ count[validation_data[i][j]] -= 1;
+ if (count[validation_data[i][j]] == 0) {
+ count.erase(validation_data[i][j]);
}
}
+ }
- // Getting the top n frequent words for the vocabulary
- if (words_file == "") {
- vocab.insert_most_frequent(count, vocab_size);
- if (vocab.size() < vocab_size) {
- cerr << "warning: fewer than " << vocab_size << " types in training data; the unknown word will not be learned" << endl;
- }
+ // Getting the top n frequent words for the vocabulary
+ if (words_file == "") {
+ vocab.insert_most_frequent(count, vocab_size);
+ if (vocab.size() < vocab_size) {
+ cerr << "warning: fewer than " << vocab_size << " types in training data; the unknown word will not be learned" << endl;
}
- //vector<vector<string> > validation_data;
- if (validation_text != "") {
- readSentFile(validation_text, validation_data);
- for (int i=0; i<validation_data.size(); i++) {
- // if data is already ngramized, set/check ngram_size
- if (!ngramize) {
- // if --ngram_size has been specified, check that it does not conflict with --ngram_size
- if (ngram_size > 0) {
- if (ngram_size != validation_data[i].size()) {
- cerr << "Error: size of validation ngrams does not match specified value of --ngram_size!" << endl;
- }
- }
- // else if --ngram_size has not been specified, set it now
- else {
- ngram_size=validation_data[i].size();
- }
- }
+ }
+ //vector<vector<string> > validation_data;
+ if (validation_text != "") {
+ readSentFile(validation_text, validation_data);
+ for (int i=0; i<validation_data.size(); i++) {
+ // if data is already ngramized, set/check ngram_size
+ if (!ngramize) {
+ // if --ngram_size has been specified, check that the validation data matches it
+ if (ngram_size > 0) {
+ if (ngram_size != validation_data[i].size()) {
+ cerr << "Error: size of validation ngrams does not match specified value of --ngram_size!" << endl;
+ }
}
+ // else if --ngram_size has not been specified, set it now
+ else {
+ ngram_size=validation_data[i].size();
+ }
+ }
}
- //READING SENTENCE WEIGHTS IF THERE ARE ANY
- vector<float> sent_weights;
- if (sent_weights_text != "") {
- cerr<<"Reading sentence weights from "<<sent_weights_text<<endl;
- ifstream sent_weights_file(sent_weights_text.c_str());
- string line;
- readWeightsFile(sent_weights_file,sent_weights);
- sent_weights_file.close();
- if (sent_weights_text.size() != train_data_size) {
- cerr<<"The number of sentence weights does not match the number of training sentences"<<endl;
- }
+ }
+ // Read sentence weights, if any
+ vector<float> sent_weights;
+ if (sent_weights_text != "") {
+ cerr<<"Reading sentence weights from "<<sent_weights_text<<endl;
+ ifstream sent_weights_file(sent_weights_text.c_str());
+ string line;
+ readWeightsFile(sent_weights_file,sent_weights);
+ sent_weights_file.close();
+ if (sent_weights.size() != train_data_size) {
+ cerr<<"The number of sentence weights does not match the number of training sentences"<<endl;
}
-
- /*
+ }
+
+ /*
else if (validation_size > 0)
{
- // Create validation data
- if (validation_size > train_data.size())
- {
- cerr << "error: requested validation size is greater than training data size" << endl;
- exit(1);
- }
- validation_data.insert(validation_data.end(), train_data.end()-validation_size, train_data.end());
- train_data.resize(train_data.size() - validation_size);
+ // Create validation data
+ if (validation_size > train_data.size())
+ {
+ cerr << "error: requested validation size is greater than training data size" << endl;
+ exit(1);
}
- */
-
- // Construct vocabulary
- //vocabulary vocab;
- //int start, stop;
-
- // read vocabulary from file
- if (words_file != "") {
- vector<string> words;
- readWordsFile(words_file,words);
- for(vector<string>::iterator it = words.begin(); it != words.end(); ++it) {
- vocab.insert_word(*it);
- }
-
- // was vocab_size set? if so, verify that it does not conflict with size of vocabulary read from file
- if (vocab_size > 0) {
- if (vocab.size() != vocab_size) {
- cerr << "Error: size of vocabulary file " << vocab.size() << " != --vocab_size " << vocab_size << endl;
- }
- }
- // else, set it to the size of vocabulary read from file
- else {
- vocab_size = vocab.size();
- }
-
+ validation_data.insert(validation_data.end(), train_data.end()-validation_size, train_data.end());
+ train_data.resize(train_data.size() - validation_size);
}
- /*
- // construct vocabulary to contain top <vocab_size> most frequent words; all other words replaced by <unk>
- else {
- vocab.insert_word("<s>");
- vocab.insert_word("</s>");
- vocab.insert_word("<null>");
-
- // warn user that if --numberize is not set, there will be no vocabulary!
- if (!numberize) {
- cerr << "Warning: with --numberize 0 and --words_file == "", there will be no vocabulary!" << endl;
- }
- unordered_map<string,int> count;
- for (int i=0; i<train_data.size(); i++) {
- for (int j=0; j<train_data[i].size(); j++) {
- count[train_data[i][j]] += 1;
- }
- }
-
- vocab.insert_most_frequent(count, vocab_size);
- if (vocab.size() < vocab_size) {
- cerr << "warning: fewer than " << vocab_size << " types in training data; the unknown word will not be learned" << endl;
- }
+ */
+
+ // Construct vocabulary
+ //vocabulary vocab;
+ //int start, stop;
+
+ // read vocabulary from file
+ if (words_file != "") {
+ vector<string> words;
+ readWordsFile(words_file,words);
+ for(vector<string>::iterator it = words.begin(); it != words.end(); ++it) {
+ vocab.insert_word(*it);
}
- */
- // write vocabulary to file
- if (write_words_file != "") {
- cerr << "Writing vocabulary to " << write_words_file << endl;
- writeWordsFile(vocab.words(), write_words_file);
+ // was vocab_size set? if so, verify that it does not conflict with size of vocabulary read from file
+ if (vocab_size > 0) {
+ if (vocab.size() != vocab_size) {
+ cerr << "Error: size of vocabulary file " << vocab.size() << " != --vocab_size " << vocab_size << endl;
+ }
}
-
- // Write out numberized n-grams
- if (train_file != "")
- {
- cerr << "Writing training data to " << train_file << endl;
- if (mmap_file == true) {
- writeMmapNgrams(train_text,
- ngram_size,
- vocab,
- numberize,
- add_start_stop,
- ngramize,
- train_file,
- train_data_size,
- num_tokens,
- randomize);
- } else {
- writeNgrams(train_text,
- ngram_size,
- vocab,
- numberize,
- add_start_stop,
- ngramize,
- train_file,
- train_data_size,
- sent_weights,
- output_sent_weights_text);
- }
+ // else, set it to the size of vocabulary read from file
+ else {
+ vocab_size = vocab.size();
}
- if (validation_file != "")
- {
- cerr << "Writing validation data to " << validation_file << endl;
- writeNgrams(validation_data,
- ngram_size,
- vocab,
- numberize,
- add_start_stop,
- ngramize,
- validation_file);
+
+ }
+ /*
+ // construct vocabulary to contain top <vocab_size> most frequent words; all other words replaced by <unk>
+ else {
+ vocab.insert_word("<s>");
+ vocab.insert_word("</s>");
+ vocab.insert_word("<null>");
+
+ // warn user that if --numberize is not set, there will be no vocabulary!
+ if (!numberize) {
+ cerr << "Warning: with --numberize 0 and --words_file == "", there will be no vocabulary!" << endl;
+ }
+ unordered_map<string,int> count;
+ for (int i=0; i<train_data.size(); i++) {
+ for (int j=0; j<train_data[i].size(); j++) {
+ count[train_data[i][j]] += 1;
+ }
+ }
+
+ vocab.insert_most_frequent(count, vocab_size);
+ if (vocab.size() < vocab_size) {
+ cerr << "warning: fewer than " << vocab_size << " types in training data; the unknown word will not be learned" << endl;
+ }
+ }
+ */
+
+ // write vocabulary to file
+ if (write_words_file != "") {
+ cerr << "Writing vocabulary to " << write_words_file << endl;
+ writeWordsFile(vocab.words(), write_words_file);
+ }
+
+ // Write out numberized n-grams
+ if (train_file != "")
+ {
+ cerr << "Writing training data to " << train_file << endl;
+ if (mmap_file == true) {
+ writeMmapNgrams(train_text,
+ ngram_size,
+ vocab,
+ numberize,
+ add_start_stop,
+ ngramize,
+ train_file,
+ train_data_size,
+ num_tokens,
+ randomize);
+ } else {
+ writeNgrams(train_text,
+ ngram_size,
+ vocab,
+ numberize,
+ add_start_stop,
+ ngramize,
+ train_file,
+ train_data_size,
+ sent_weights,
+ output_sent_weights_text);
}
+ }
+ if (validation_file != "")
+ {
+ cerr << "Writing validation data to " << validation_file << endl;
+ writeNgrams(validation_data,
+ ngram_size,
+ vocab,
+ numberize,
+ add_start_stop,
+ ngramize,
+ validation_file);
+ }
}
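
A closing note on the randomize branch of writeMmapNgrams: it is a block-wise, row-wise Fisher-Yates shuffle that copies up to five million n-gram rows from the mapped file into a RAM buffer, shuffles rows within the buffer, and writes the block back, trading a truly global shuffle for mostly sequential I/O on the mapping (the final block absorbs the remainder). The stripped-down sketch below shows the row-wise swap on a flattened rows-by-width array; note that the diff draws the swap partner from [0, i-1], while the textbook unbiased range is [0, i].

#include <algorithm>
#include <iostream>
#include <vector>
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/uniform_int_distribution.hpp>

// Row-wise Fisher-Yates over a flattened (rows x width) array, the operation
// the randomize branch applies to each in-RAM block. Illustration only.
void shuffleRows(std::vector<int> &flat, long rows, int width, unsigned seed) {
    boost::random::mt19937 rng(seed);
    for (long i = rows - 1; i > 0; i--) {
        // The diff draws from [0, i-1]; [0, i] is the textbook unbiased range.
        long j = boost::random::uniform_int_distribution<long>(0, i)(rng);
        for (int k = 0; k < width; k++)
            std::swap(flat[i * width + k], flat[j * width + k]);
    }
}

int main() {
    const int width = 3;             // plays the role of ngram_size
    const long rows = 5;
    std::vector<int> flat;
    for (long r = 0; r < rows; r++)  // five 3-gram "rows": 0 0 0, 1 1 1, ...
        for (int k = 0; k < width; k++)
            flat.push_back(static_cast<int>(r));
    shuffleRows(flat, rows, width, 1234);  // fixed seed, as in the diff
    for (std::size_t i = 0; i < flat.size(); i++)
        std::cout << flat[i] << ((i + 1) % width ? ' ' : '\n');
    return 0;
}
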