Diffstat (limited to 'src/prepareNeuralLM.cpp')
-rw-r--r--  src/prepareNeuralLM.cpp  1057
1 file changed, 528 insertions, 529 deletions
diff --git a/src/prepareNeuralLM.cpp b/src/prepareNeuralLM.cpp
index adedc72..d5fc16b 100644
--- a/src/prepareNeuralLM.cpp
+++ b/src/prepareNeuralLM.cpp
@@ -2,19 +2,19 @@
 #include <vector>
 #include <queue>
 #include <deque>
-# include <fstream>
-# include <iterator>
-
-# include <boost/unordered_map.hpp>
-# include <boost/algorithm/string/join.hpp>
-# include <boost/interprocess/managed_shared_memory.hpp>
-# include <boost/interprocess/allocators/allocator.hpp>
-# include <boost/interprocess/managed_mapped_file.hpp>
+#include <fstream>
+#include <iterator>
+
+#include <boost/unordered_map.hpp>
+#include <boost/algorithm/string/join.hpp>
+#include <boost/interprocess/managed_shared_memory.hpp>
+#include <boost/interprocess/allocators/allocator.hpp>
+#include <boost/interprocess/managed_mapped_file.hpp>
 #include <boost/interprocess/containers/vector.hpp>
 #include <boost/random/mersenne_twister.hpp>
 #include <boost/random/uniform_int_distribution.hpp>
-# include <tclap/CmdLine.h>
+#include <tclap/CmdLine.h>
 
 #include "neuralLM.h"
 #include "util.h"
@@ -36,314 +36,313 @@ typedef std::vector<vec,vecAllocator> vecvec;
 typedef long long int data_size_t; // training data can easily exceed 2G instances
 
 template<typename T>
-void writeNgrams(const T &data,
-                 int ngram_size,
-                 const vocabulary &vocab,
-                 bool numberize,
-                 bool add_start_stop,
-                 bool ngramize,
-                 const string &filename)
-    {
-    ofstream file(filename.c_str());
-    if (!file)
-    {
-        cerr << "error: could not open " << filename << endl;
-        exit(1);
-    }
-
-    vector<vector<int> > ngrams;
-
-    for (int i=0; i<data.size(); i++) {
-        preprocessWords(data[i], ngrams, ngram_size, vocab, numberize, add_start_stop, ngramize);
-        // write out n-grams
-        for (int j=0; j<ngrams.size(); j++)
-        {
-            for (int k=0; k<ngram_size; k++)
-            {
-                file << ngrams[j][k] << " ";
-            }
-            file << endl;
-        }
-    }
-    file.close();
+void writeNgrams(const T &data,
+                 int ngram_size,
+                 const vocabulary &vocab,
+                 bool numberize,
+                 bool add_start_stop,
+                 bool ngramize,
+                 const string &filename)
+{
+  ofstream file(filename.c_str());
+  if (!file)
+  {
+    cerr << "error: could not open " << filename << endl;
+    exit(1);
+  }
+
+  vector<vector<int> > ngrams;
+
+  for (int i=0; i<data.size(); i++) {
+    preprocessWords(data[i], ngrams, ngram_size, vocab, numberize, add_start_stop, ngramize);
+    // write out n-grams
+    for (int j=0; j<ngrams.size(); j++)
+    {
+      for (int k=0; k<ngram_size; k++)
+      {
+        file << ngrams[j][k] << " ";
+      }
+      file << endl;
+    }
+  }
+  file.close();
 }
 
 // Space efficient version for writing the n-grams.
 // They are not read into memory.
-void writeNgrams(const string &input_filename,
-                 int ngram_size,
-                 const vocabulary &vocab,
-                 bool numberize,
-                 bool add_start_stop,
-                 bool ngramize,
-                 const string &filename,
-                 int train_data_size,
-                 vector<float> &sent_weights,
-                 const string &sent_weights_filename)
+void writeNgrams(const string &input_filename,
+                 int ngram_size,
+                 const vocabulary &vocab,
+                 bool numberize,
+                 bool add_start_stop,
+                 bool ngramize,
+                 const string &filename,
+                 int train_data_size,
+                 vector<float> &sent_weights,
+                 const string &sent_weights_filename)
 {
-    ofstream file(filename.c_str());
-    ofstream output_sent_weights_file(sent_weights_filename.c_str());
-    if (!file)
-    {
-        cerr << "error: could not open " << filename << endl;
-        exit(1);
-    }
-
-    ifstream input_file(input_filename.c_str());
-    vector<vector<int> > ngrams;
-    //for (int i=0; i<train_data.size(); i++) {
-    string line;
-    int counter = 0;
-    cerr<<"Processed ... ";
-    while (getline(input_file,line) && train_data_size-- > 0) {
-        counter++;
-        if ((counter % 100000) == 0) {
-            cerr<<counter<<" training lines ... ";
-        }
-        //stringstream lstr(line);
-        vector<string> lstr_items;
-        splitBySpace(line,lstr_items);
-        //for (int i=0; i<data.size(); i++) {
-        preprocessWords(lstr_items,
-                        ngrams,
-                        ngram_size,
-                        vocab,
-                        numberize,
-                        add_start_stop,
-                        ngramize);
-
-        // write out n-grams
-        for (int j=0; j<ngrams.size(); j++)
-        {
-            if (sent_weights.size() != 0) {
-                output_sent_weights_file <<sent_weights[counter-1]<<endl;
-            }
-            for (int k=0; k<ngram_size; k++)
-            {
-                file << ngrams[j][k] << " ";
-            }
-            file << endl;
-        }
-    }
-    cerr<<endl;
-    input_file.close();
-    file.close();
-    output_sent_weights_file.close();
+  ofstream file(filename.c_str());
+  ofstream output_sent_weights_file(sent_weights_filename.c_str());
+  if (!file)
+  {
+    cerr << "error: could not open " << filename << endl;
+    exit(1);
+  }
+
+  ifstream input_file(input_filename.c_str());
+  vector<vector<int> > ngrams;
+  //for (int i=0; i<train_data.size(); i++) {
+  string line;
+  int counter = 0;
+  cerr<<"Processed ... ";
+  while (getline(input_file,line) && train_data_size-- > 0) {
+    counter++;
+    if ((counter % 100000) == 0) {
+      cerr<<counter<<" training lines ... ";
+    }
+    //stringstream lstr(line);
+    vector<string> lstr_items;
+    splitBySpace(line,lstr_items);
+    //for (int i=0; i<data.size(); i++) {
+    preprocessWords(lstr_items,
+                    ngrams,
+                    ngram_size,
+                    vocab,
+                    numberize,
+                    add_start_stop,
+                    ngramize);
+
+    // write out n-grams
+    for (int j=0; j<ngrams.size(); j++)
+    {
+      if (sent_weights.size() != 0) {
+        output_sent_weights_file <<sent_weights[counter-1]<<endl;
+      }
+      for (int k=0; k<ngram_size; k++)
+      {
+        file << ngrams[j][k] << " ";
+      }
+      file << endl;
+    }
+  }
+  cerr<<endl;
+  input_file.close();
+  file.close();
+  output_sent_weights_file.close();
 }
 
 // Space efficient version for writing the n-grams.
 // They are not read into memory.
-void writeMmapNgrams(const string &input_filename,
-                     int ngram_size,
-                     const vocabulary &vocab,
-                     bool numberize,
-                     bool add_start_stop,
-                     bool ngramize,
-                     const string &filename,
-                     unsigned long train_data_size,
-                     data_size_t num_tokens,
-                     bool randomize)
+void writeMmapNgrams(const string &input_filename,
+                     int ngram_size,
+                     const vocabulary &vocab,
+                     bool numberize,
+                     bool add_start_stop,
+                     bool ngramize,
+                     const string &filename,
+                     unsigned long train_data_size,
+                     data_size_t num_tokens,
+                     bool randomize)
 {
-    cerr<<"Num tokens is "<<num_tokens<<endl;
-    cerr<<"Training data size is "<<train_data_size<<endl;
-    // Open the memory mapped file and create the allocators
-    ip::managed_mapped_file mfile(ip::create_only,
-                                  filename.c_str(),
-                                  num_tokens*ngram_size*sizeof(int)+1024UL*1024UL);
-    intAllocator ialloc(mfile.get_segment_manager());
-    vecAllocator valloc (mfile.get_segment_manager());
-    //vecvec *mMapVecVec= mfile.construct<vecvec>("data")(num_tokens,vec(ialloc),valloc);
-
-    vec *mMapVec= mfile.construct<vec>("vector")(num_tokens*ngram_size,0,ialloc);
-
-    cerr<<"The size of mmaped vec is "<<mMapVec->size()<<endl;
-    // Going over every line in the input file and
-    // printing the memory mapped ngrams into the
-    // output file
-    ifstream input_file(input_filename.c_str());
-    //for (int i=0; i<train_data.size(); i++) {
-    string line;
-    int counter = 0;
-    cerr<<"Processed ... ";
-    long int train_ngram_counter = 0;
-    vector<vector<int> > ngrams;
-    while (getline(input_file,line) && train_data_size-- > 0) {
-        counter++;
-        if ((counter % 100000) ==0) {
-            //cerr<<"counter is "<<counter<<endl;
-            cerr<<counter<<" training lines ... ";
-        }
-        //stringstream lstr(line);
-        vector<string> lstr_items;
-        splitBySpace(line,lstr_items);
-        //for (int i=0; i<data.size(); i++) {
-        preprocessWords(lstr_items, ngrams,
-                        ngram_size,
-                        vocab,
-                        numberize,
-                        add_start_stop,
-                        ngramize);
-        /*
-          cerr<<"line is "<<endl;
-          cerr<<line<<endl;
-          cerr<<"Number of ngrams is "<<ngrams.size()<<endl;
-          if (ngrams.size() ==1 ){
-              cerr<<"The line number was "<<counter<<endl;
-              cerr<<line<<endl;
-          }
-        */
-        // write out n-grams in mmapped file
-        for (int j=0; j<ngrams.size(); j++)
-        {
-            /*
-            for (int k=0; k<ngram_size; k++)
-            {
-                cerr << ngrams[j][k] << " ";
-            }
-            cerr<< endl;
-            */
-            for (int k=0; k<ngram_size; k++) {
-                mMapVec->at(train_ngram_counter*ngram_size+k) = ngrams[j][k];
-            }
-            train_ngram_counter++;
-            //cerr<<"Train ngram counter is "<<train_ngram_counter<<endl;
-        }
-    }
-    cerr<<endl;
-    input_file.close();
-
-    // Shrink the file if it was overused
-    ip::managed_mapped_file::shrink_to_fit(filename.c_str());
-    //now to randomize the items if the randomize flag was set
-    if (randomize == true) {
-        unsigned seed = 1234; //for testing only
-        mt19937 rng(seed);
-        cerr<<"Randomly shuffling data...";
-        data_size_t counter =0;
-        while (counter < num_tokens) {
-            data_size_t upper_limit = counter+5000000;
-            long int vector_size = 5000000;
-            if (counter + 10000000 >= num_tokens) {
-                upper_limit = num_tokens;
-                vector_size = num_tokens - counter;
-            }
-            vector<int> temp(vector_size*ngram_size,0);
-            for (int i=0;i<vector_size;i++){
-                for (int k=0;k<ngram_size;k++) {
-                    temp[i*ngram_size+k] = mMapVec->at((i+counter)*ngram_size+k);
-                }
-            }
-            for (data_size_t i=vector_size-1; i>0; i--)
-            {
-                if (i %500000 == 0) {
-                    cerr<<"Shuffled "<<num_tokens-1<<" instances...";
-                }
-                data_size_t j = boost::random::uniform_int_distribution<data_size_t>(0, i-1)(rng);
-                for (int k=0;k<ngram_size;k++) {
-                    int temp_val = mMapVec->at(i*ngram_size+k);
-                    mMapVec->at(i*ngram_size+k) =
-                        mMapVec->at(j*ngram_size+k);
-                    mMapVec->at(j*ngram_size+k) = temp_val;
-                }
-            }
-            //Putting it back
-            for (int i=0;i<vector_size;i++){
-                for (int k=0;k<ngram_size;k++) {
-                    mMapVec->at((i+counter)*ngram_size+k) = temp[i*ngram_size+k];
-                }
-            }
-            counter = upper_limit;
-        }
-
-        /*
-          for (data_size_t i=num_tokens-1; i>0; i--)
-          {
-              if (i %500000 == 0) {
-                  cerr<<"Shuffled "<<num_tokens-1<<" instances...";
-              }
-              data_size_t j = boost::random::uniform_int_distribution<data_size_t>(0, i-1)(rng);
-              for (int k=0;k<ngram_size;k++) {
-                  int temp_val = mMapVec->at(i*ngram_size+k);
-                  mMapVec->at(i*ngram_size+k) =
-                      mMapVec->at(j*ngram_size+k);
-                  mMapVec->at(j*ngram_size+k) = temp_val;
-              }
-          }
-        */
-        cerr<<endl;
-    }
+  cerr<<"Num tokens is "<<num_tokens<<endl;
+  cerr<<"Training data size is "<<train_data_size<<endl;
+  // Open the memory mapped file and create the allocators
+  ip::managed_mapped_file mfile(ip::create_only,
+                                filename.c_str(),
+                                num_tokens*ngram_size*sizeof(int)+1024UL*1024UL);
+  intAllocator ialloc(mfile.get_segment_manager());
+  vecAllocator valloc (mfile.get_segment_manager());
+  //vecvec *mMapVecVec= mfile.construct<vecvec>("data")(num_tokens,vec(ialloc),valloc);
+
+  vec *mMapVec= mfile.construct<vec>("vector")(num_tokens*ngram_size,0,ialloc);
+
+  cerr<<"The size of mmaped vec is "<<mMapVec->size()<<endl;
+  // Going over every line in the input file and
+  // printing the memory mapped ngrams into the
+  // output file
+  ifstream input_file(input_filename.c_str());
+  //for (int i=0; i<train_data.size(); i++) {
+  string line;
+  int counter = 0;
+  cerr<<"Processed ... ";
+  long int train_ngram_counter = 0;
+  vector<vector<int> > ngrams;
+  while (getline(input_file,line) && train_data_size-- > 0) {
+    counter++;
+    if ((counter % 100000) ==0) {
+      //cerr<<"counter is "<<counter<<endl;
+      cerr<<counter<<" training lines ... ";
+    }
+    //stringstream lstr(line);
+    vector<string> lstr_items;
+    splitBySpace(line,lstr_items);
+    //for (int i=0; i<data.size(); i++) {
+    preprocessWords(lstr_items, ngrams,
+                    ngram_size,
+                    vocab,
+                    numberize,
+                    add_start_stop,
+                    ngramize);
+    /*
+      cerr<<"line is "<<endl;
+      cerr<<line<<endl;
+      cerr<<"Number of ngrams is "<<ngrams.size()<<endl;
+      if (ngrams.size() ==1 ){
+        cerr<<"The line number was "<<counter<<endl;
+        cerr<<line<<endl;
+      }
+    */
+    // write out n-grams in mmapped file
+    for (int j=0; j<ngrams.size(); j++)
+    {
+      /*
+      for (int k=0; k<ngram_size; k++)
+      {
+        cerr << ngrams[j][k] << " ";
+      }
+      cerr<< endl;
+      */
+      for (int k=0; k<ngram_size; k++) {
+        mMapVec->at(train_ngram_counter*ngram_size+k) = ngrams[j][k];
+      }
+      train_ngram_counter++;
+      //cerr<<"Train ngram counter is "<<train_ngram_counter<<endl;
+    }
+  }
+  cerr<<endl;
+  input_file.close();
+
+  // Shrink the file if it was overused
+  ip::managed_mapped_file::shrink_to_fit(filename.c_str());
+  //now to randomize the items if the randomize flag was set
+  if (randomize == true) {
+    unsigned seed = 1234; //for testing only
+    boost::random::mt19937 rng(seed);
+    cerr<<"Randomly shuffling data...";
+    data_size_t counter =0;
+    while (counter < num_tokens) {
+      data_size_t upper_limit = counter+5000000;
+      long int vector_size = 5000000;
+      if (counter + 10000000 >= num_tokens) {
+        upper_limit = num_tokens;
+        vector_size = num_tokens - counter;
+      }
+      vector<int> temp(vector_size*ngram_size,0);
+      for (int i=0;i<vector_size;i++){
+        for (int k=0;k<ngram_size;k++) {
+          temp[i*ngram_size+k] = mMapVec->at((i+counter)*ngram_size+k);
+        }
+      }
+      for (data_size_t i=vector_size-1; i>0; i--)
+      {
+        if (i %500000 == 0) {
+          cerr<<"Shuffled "<<num_tokens-1<<" instances...";
+        }
+        data_size_t j = boost::random::uniform_int_distribution<data_size_t>(0, i-1)(rng);
+        for (int k=0;k<ngram_size;k++) {
+          int temp_val = temp.at(i*ngram_size+k);
+          temp.at(i*ngram_size+k) =
+            temp.at(j*ngram_size+k);
+          temp.at(j*ngram_size+k) = temp_val;
+        }
+      }
+      //Putting it back
+      for (int i=0;i<vector_size;i++){
+        for (int k=0;k<ngram_size;k++) {
+          mMapVec->at((i+counter)*ngram_size+k) = temp[i*ngram_size+k];
+        }
+      }
+      counter = upper_limit;
+    }
+
+    /*
+      for (data_size_t i=num_tokens-1; i>0; i--)
+      {
+        if (i %500000 == 0) {
+          cerr<<"Shuffled "<<num_tokens-1<<" instances...";
+        }
+        data_size_t j = boost::random::uniform_int_distribution<data_size_t>(0, i-1)(rng);
+        for (int k=0;k<ngram_size;k++) {
+          int temp_val = mMapVec->at(i*ngram_size+k);
+          mMapVec->at(i*ngram_size+k) =
+            mMapVec->at(j*ngram_size+k);
+          mMapVec->at(j*ngram_size+k) = temp_val;
+        }
+      }
+    */
+    cerr<<endl;
+  }
 }
 
 int main(int argc, char *argv[])
 {
-    ios::sync_with_stdio(false);
-    int ngram_size, vocab_size, validation_size;
-    bool numberize,
-         ngramize,
-         add_start_stop,
-         mmap_file,
-         randomize;
-
-    string train_text,
-           train_file,
-           validation_text,
-           validation_file,
-           words_file,
-           write_words_file,
-           sent_weights_text,
-           output_sent_weights_text;
-
-    try
-    {
-        CmdLine cmd("Prepares training data for training a language model.", ' ', "0.1");
-
-        // The options are printed in reverse order
+  ios::sync_with_stdio(false);
+  int ngram_size, vocab_size, validation_size;
+  bool numberize,
+       ngramize,
+       add_start_stop,
+       mmap_file,
+       randomize;
+
+  string train_text,
+         train_file,
+         validation_text,
+         validation_file,
+         words_file,
+         write_words_file,
+         sent_weights_text,
+         output_sent_weights_text;
+
+  try
+  {
+    CmdLine cmd("Prepares training data for training a language model.", ' ', "0.1");
+
+    // The options are printed in reverse order
 
     ValueArg<bool> arg_ngramize("", "ngramize", "If true, convert lines to ngrams. Default: true.", false, true, "bool", cmd);
     ValueArg<bool> arg_numberize("", "numberize", "If true, convert words to numbers. Default: true.", false, true, "bool", cmd);
     ValueArg<bool> arg_add_start_stop("", "add_start_stop", "If true, prepend <s> and append </s>. Default: true.", false, true, "bool", cmd);
     ValueArg<bool> arg_mmap_file("", "mmap_file", "If true, the training file will be a memory mapped file. \n This is "
-        "needed if the entire training data cannot fit in memory. Default: false.", false, false, "bool", cmd);
+                                 "needed if the entire training data cannot fit in memory. Default: false.", false, false, "bool", cmd);
     ValueArg<bool> arg_randomize("", "randomize", "If true, Randomly shuffle the training ngrams. It can only be used with mmap_file =1 . Default: false.", false, false, "bool", cmd);
     ValueArg<int> arg_vocab_size("", "vocab_size", "Vocabulary size.", false, -1, "int", cmd);
     ValueArg<string> arg_words_file("", "words_file", "File specifying words that should be included in vocabulary; all other words will be replaced by <unk>.", false, "", "string", cmd);
     ValueArg<int> arg_ngram_size("", "ngram_size", "Size of n-grams.", true, -1, "int", cmd);
-    ValueArg<string> arg_write_words_file("", "write_words_file", "Output vocabulary.", false, "", "string", cmd); 
+    ValueArg<string> arg_write_words_file("", "write_words_file", "Output vocabulary.", false, "", "string", cmd);
     ValueArg<int> arg_validation_size("", "validation_size", "How many lines from training data to hold out for validation. Default: 0.", false, 0, "int", cmd);
-    ValueArg<string> arg_validation_file("", "validation_file", "Output validation data (numberized n-grams).", false, "", "string", cmd); 
-    ValueArg<string> arg_validation_text("", "validation_text", "Input validation data (tokenized). Overrides --validation_size. Default: none.", false, "", "string", cmd); 
-    ValueArg<string> arg_train_file("", "train_file", "Output training data (numberized n-grams).", false, "", "string", cmd); 
-    ValueArg<string> arg_train_text("", "train_text", "Input training data (tokenized).", true, "", "string", cmd); 
-    //ValueArg<string> arg_sent_weights_text("", "sent_weights_text", "The sentence weights text", false, "", "string", cmd); 
-    //ValueArg<string> arg_sent_weights_file("", "sent_weights_file", "The file to write the per ngram weights", false, "", "string", cmd); 
-
-
-
-        cmd.parse(argc, argv);
-
-        train_text = arg_train_text.getValue();
-        train_file = arg_train_file.getValue();
-        validation_text = arg_validation_text.getValue();
-        validation_file = arg_validation_file.getValue();
-        validation_size = arg_validation_size.getValue();
-        write_words_file = arg_write_words_file.getValue();
-        ngram_size = arg_ngram_size.getValue();
-        vocab_size = arg_vocab_size.getValue();
-        words_file = arg_words_file.getValue();
-        numberize = arg_numberize.getValue();
-        ngramize = arg_ngramize.getValue();
-        add_start_stop = arg_add_start_stop.getValue();
-        mmap_file = arg_mmap_file.getValue();
-        randomize = arg_randomize.getValue();
-        //sent_weights_text = arg_sent_weights_text.getValue();
-        //output_sent_weights_text = arg_sent_weights_file.getValue();
-        sent_weights_text = "";
-        output_sent_weights_text = "";
+    ValueArg<string> arg_validation_file("", "validation_file", "Output validation data (numberized n-grams).", false, "", "string", cmd);
+    ValueArg<string> arg_validation_text("", "validation_text", "Input validation data (tokenized). Overrides --validation_size. Default: none.", false, "", "string", cmd);
+    ValueArg<string> arg_train_file("", "train_file", "Output training data (numberized n-grams).", false, "", "string", cmd);
+    ValueArg<string> arg_train_text("", "train_text", "Input training data (tokenized).", true, "", "string", cmd);
+    //ValueArg<string> arg_sent_weights_text("", "sent_weights_text", "The sentence weights text", false, "", "string", cmd);
+    //ValueArg<string> arg_sent_weights_file("", "sent_weights_file", "The file to write the per ngram weights", false, "", "string", cmd);
+
+
+    cmd.parse(argc, argv);
+
+    train_text = arg_train_text.getValue();
+    train_file = arg_train_file.getValue();
+    validation_text = arg_validation_text.getValue();
+    validation_file = arg_validation_file.getValue();
+    validation_size = arg_validation_size.getValue();
+    write_words_file = arg_write_words_file.getValue();
+    ngram_size = arg_ngram_size.getValue();
+    vocab_size = arg_vocab_size.getValue();
+    words_file = arg_words_file.getValue();
+    numberize = arg_numberize.getValue();
+    ngramize = arg_ngramize.getValue();
+    add_start_stop = arg_add_start_stop.getValue();
+    mmap_file = arg_mmap_file.getValue();
+    randomize = arg_randomize.getValue();
+    //sent_weights_text = arg_sent_weights_text.getValue();
+    //output_sent_weights_text = arg_sent_weights_file.getValue();
+    sent_weights_text = "";
+    output_sent_weights_text = "";
 
     // check command line arguments
@@ -364,292 +363,292 @@ int main(int argc, char *argv[])
     cerr << "Command line: " << endl;
     cerr << boost::algorithm::join(vector<string>(argv, argv+argc), " ") << endl;
-
-        const string sep(" Value: ");
-        cerr << arg_train_text.getDescription() << sep << arg_train_text.getValue() << endl;
-        cerr << arg_train_file.getDescription() << sep << arg_train_file.getValue() << endl;
-        cerr << arg_validation_text.getDescription() << sep << arg_validation_text.getValue() << endl;
-        cerr << arg_validation_file.getDescription() << sep << arg_validation_file.getValue() << endl;
-        cerr << arg_validation_size.getDescription() << sep << arg_validation_size.getValue() << endl;
-        cerr << arg_write_words_file.getDescription() << sep << arg_write_words_file.getValue() << endl;
-        cerr << arg_ngram_size.getDescription() << sep << arg_ngram_size.getValue() << endl;
-        cerr << arg_vocab_size.getDescription() << sep << arg_vocab_size.getValue() << endl;
-        cerr << arg_words_file.getDescription() << sep << arg_words_file.getValue() << endl;
-        cerr << arg_numberize.getDescription() << sep << arg_numberize.getValue() << endl;
-        cerr << arg_ngramize.getDescription() << sep << arg_ngramize.getValue() << endl;
-        cerr << arg_add_start_stop.getDescription() << sep << arg_add_start_stop.getValue() << endl;
-        cerr << arg_mmap_file.getDescription() << sep << arg_mmap_file.getValue() << endl;
-        //cerr << arg_sent_weights_text.getDescription() << sep << arg_sent_weights_text.getValue() << endl;
-    }
-    catch (TCLAP::ArgException &e)
-    {
-        cerr << "error: " << e.error() << " for arg " << e.argId() << endl;
-        exit(1);
-    }
+    const string sep(" Value: ");
+    cerr << arg_train_text.getDescription() << sep << arg_train_text.getValue() << endl;
+    cerr << arg_train_file.getDescription() << sep << arg_train_file.getValue() << endl;
+    cerr << arg_validation_text.getDescription() << sep << arg_validation_text.getValue() << endl;
+    cerr << arg_validation_file.getDescription() << sep << arg_validation_file.getValue() << endl;
+    cerr << arg_validation_size.getDescription() << sep << arg_validation_size.getValue() << endl;
+    cerr << arg_write_words_file.getDescription() << sep << arg_write_words_file.getValue() << endl;
+    cerr << arg_ngram_size.getDescription() << sep << arg_ngram_size.getValue() << endl;
+    cerr << arg_vocab_size.getDescription() << sep << arg_vocab_size.getValue() << endl;
+    cerr << arg_words_file.getDescription() << sep << arg_words_file.getValue() << endl;
+    cerr << arg_numberize.getDescription() << sep << arg_numberize.getValue() << endl;
+    cerr << arg_ngramize.getDescription() << sep << arg_ngramize.getValue() << endl;
+    cerr << arg_add_start_stop.getDescription() << sep << arg_add_start_stop.getValue() << endl;
+    cerr << arg_mmap_file.getDescription() << sep << arg_mmap_file.getValue() << endl;
+    //cerr << arg_sent_weights_text.getDescription() << sep << arg_sent_weights_text.getValue() << endl;
+  }
+  catch (TCLAP::ArgException &e)
+  {
+    cerr << "error: " << e.error() << " for arg " << e.argId() << endl;
+    exit(1);
+  }
-    // VLF: why is this true?
-    // DC: it's because the vocabulary has to be constructed from the training data only.
-    // If the vocabulary is preset, we can't create the validation data.
-    // - if --numberize 0 is set, then --validation_size cannot be used.
-    // if (!numberize && (validation_size > 0)) {
-    //     cerr << "Warning: without setting --numberize to 1, --validation_size cannot be used." << endl;
-    // }
-
-    // Read in training data and validation data
-    // vector<vector<string> > train_data;
-    // readSentFile(train_text, train_data);
-    // @vaswani: No more reading the entire training file into memory
-    // Reading it per line with file io
-
-    //for (int i=0; i<train_data.size(); i++) {
-    // Go over every line in the file and
-    // 1. if the !ngramize then you should check if
-    //    we have the correct number of items per line
-    // 2. build the vocabulary if the words file has not
-    //    been specified.
-    // Construct vocabulary
-    vocabulary vocab;
-    int start, stop;
-    // Add start stop if the vocabulary has not been supplied
-    if (words_file == "") {
-        vocab.insert_word("<s>");
-        vocab.insert_word("</s>");
-        vocab.insert_word("<null>");
-        // warn user that if --numberize is not set, there will be no vocabulary!
-        if (!numberize) {
-            cerr << "Warning: with --numberize 0 and --words_file == "", there will be no vocabulary!" << endl;
-        }
-    }
-    if (mmap_file == false && randomize == true) {
-        cerr<<"Randomize option can only be used with mmap_file = 1"<<endl;
-        exit(1);
-    }
+
+  // VLF: why is this true?
+  // DC: it's because the vocabulary has to be constructed from the training data only.
+  // If the vocabulary is preset, we can't create the validation data.
+  // - if --numberize 0 is set, then --validation_size cannot be used.
+  // if (!numberize && (validation_size > 0)) {
+  //     cerr << "Warning: without setting --numberize to 1, --validation_size cannot be used." << endl;
+  // }
+
+  // Read in training data and validation data
+  // vector<vector<string> > train_data;
+  // readSentFile(train_text, train_data);
+  // @vaswani: No more reading the entire training file into memory
+  // Reading it per line with file io
+
+  //for (int i=0; i<train_data.size(); i++) {
+  // Go over every line in the file and
+  // 1. if the !ngramize then you should check if
+  //    we have the correct number of items per line
+  // 2. build the vocabulary if the words file has not
+  //    been specified.
+  // Construct vocabulary
+  vocabulary vocab;
+  int start, stop;
+  // Add start stop if the vocabulary has not been supplied
+  if (words_file == "") {
+    vocab.insert_word("<s>");
+    vocab.insert_word("</s>");
+    vocab.insert_word("<null>");
+    // warn user that if --numberize is not set, there will be no vocabulary!
+    if (!numberize) {
+      cerr << "Warning: with --numberize 0 and --words_file == "", there will be no vocabulary!" << endl;
+    }
+  }
+  if (mmap_file == false && randomize == true) {
+    cerr<<"Randomize option can only be used with mmap_file = 1"<<endl;
+    exit(1);
+  }
-    unordered_map<string,int> count; // For keeping word counts if no supplied vocab
-
-    deque<vector<string> > validation_data;
-    int train_data_size=0;
-    cerr<<"Processed ... ";
-    data_size_t num_tokens=0;
-
-    ifstream training(train_text.c_str());
-
-    string line;
-    while (getline(training,line)) {
-        train_data_size++;
-        //stringstream lstr(line);
-        vector<string> lstr_items;
-        splitBySpace(line,lstr_items);
-        // if data is already ngramized, set/check ngram_size
-        if (!ngramize) {
-            if (ngram_size > 0) {
-                if (ngram_size != lstr_items.size()) {
-                    cerr << "Error: size of training ngrams does not match specified value of --ngram_size!" << endl;
-                }
-            }
-            // else if --ngram_size has not been specified, set it now
-            else {
-                ngram_size=lstr_items.size();
-            }
-        }
-        if ((train_data_size%100000)==0){
-            cerr<<train_data_size<<" lines ... ";
-        }
-        //break;
-        /*
-          if (lstr_items.size() ==1) {
-              cerr<<"line :"<<endl;
-              cerr<<line<<endl;
-              cerr<<"The number of items was 1"<<endl;
-              getchar();
-          }
-        */
-        num_tokens += lstr_items.size()+1;
-        if (words_file == "") {
-            for (int j=0; j<lstr_items.size(); j++) {
-                count[lstr_items[j]] += 1;
-            }
-        }
-        // Add to validation set if the validation size
-        // has not been specified
-        if (validation_text == "" && validation_size > 0) {
-            //cerr<<"validation size is "<<validation_data.size()<<endl;
-            if (validation_data.size() == validation_size) {
-                //validation_data.erase(validation_data.begin());
-                validation_data.pop_front();
-            }
-            validation_data.push_back(lstr_items);
-        }
-    }
-    cerr<<endl;
-    training.close();
-    //cerr<<"validation size is "<<validation_data.size()<<endl;
-    //getchar();
-    if (validation_data.size() < validation_size) {
-        cerr<<"validation size is "<<validation_data.size()<<endl;
-        cerr << "error: requested validation size is greater than training data size" << endl;
-        exit(1);
-    }
-
-    train_data_size -= validation_size;
-    cerr<<"Training data size is "<<train_data_size<<endl;
-
-    // The items in the validation data have already been counted
-    // Decrementing the counts of those words before building the vocabulary
-    for(int i=0; i<validation_data.size(); i++){
-        num_tokens -= (validation_data[i].size() +1);
-        for (int j=0; j<validation_data[i].size();j++){
-            count[validation_data[i][j]] -= 1;
-            if (count[validation_data[i][j]] == 0) {
-                count.erase(validation_data[i][j]);
-            }
-        }
-    }
-
-    // Getting the top n frequent words for the vocabulary
-    if (words_file == "") {
-        vocab.insert_most_frequent(count, vocab_size);
-        if (vocab.size() < vocab_size) {
-            cerr << "warning: fewer than " << vocab_size << " types in training data; the unknown word will not be learned" << endl;
-        }
-    }
+  unordered_map<string,int> count; // For keeping word counts if no supplied vocab
+
+  deque<vector<string> > validation_data;
+  int train_data_size=0;
+  cerr<<"Processed ... ";
+  data_size_t num_tokens=0;
+
+  ifstream training(train_text.c_str());
+
+  string line;
+  while (getline(training,line)) {
+    train_data_size++;
+    //stringstream lstr(line);
+    vector<string> lstr_items;
+    splitBySpace(line,lstr_items);
+    // if data is already ngramized, set/check ngram_size
+    if (!ngramize) {
+      if (ngram_size > 0) {
+        if (ngram_size != lstr_items.size()) {
+          cerr << "Error: size of training ngrams does not match specified value of --ngram_size!" << endl;
+        }
+      }
+      // else if --ngram_size has not been specified, set it now
+      else {
+        ngram_size=lstr_items.size();
+      }
+    }
+    if ((train_data_size%100000)==0){
+      cerr<<train_data_size<<" lines ... ";
+    }
+    //break;
+    /*
+      if (lstr_items.size() ==1) {
+        cerr<<"line :"<<endl;
+        cerr<<line<<endl;
+        cerr<<"The number of items was 1"<<endl;
+        getchar();
+      }
+    */
+    num_tokens += lstr_items.size()+1;
+    if (words_file == "") {
+      for (int j=0; j<lstr_items.size(); j++) {
+        count[lstr_items[j]] += 1;
+      }
+    }
+    // Add to validation set if the validation size
+    // has not been specified
+    if (validation_text == "" && validation_size > 0) {
+      //cerr<<"validation size is "<<validation_data.size()<<endl;
+      if (validation_data.size() == validation_size) {
+        //validation_data.erase(validation_data.begin());
+        validation_data.pop_front();
+      }
+      validation_data.push_back(lstr_items);
+    }
+  }
+  cerr<<endl;
+  training.close();
+  //cerr<<"validation size is "<<validation_data.size()<<endl;
+  //getchar();
+  if (validation_data.size() < validation_size) {
+    cerr<<"validation size is "<<validation_data.size()<<endl;
+    cerr << "error: requested validation size is greater than training data size" << endl;
+    exit(1);
+  }
+
+  train_data_size -= validation_size;
+  cerr<<"Training data size is "<<train_data_size<<endl;
+
+  // The items in the validation data have already been counted
+  // Decrementing the counts of those words before building the vocabulary
+  for(int i=0; i<validation_data.size(); i++){
+    num_tokens -= (validation_data[i].size() +1);
+    for (int j=0; j<validation_data[i].size();j++){
+      count[validation_data[i][j]] -= 1;
+      if (count[validation_data[i][j]] == 0) {
+        count.erase(validation_data[i][j]);
+      }
+    }
+  }
+
+  // Getting the top n frequent words for the vocabulary
+  if (words_file == "") {
+    vocab.insert_most_frequent(count, vocab_size);
+    if (vocab.size() < vocab_size) {
+      cerr << "warning: fewer than " << vocab_size << " types in training data; the unknown word will not be learned" << endl;
+    }
+  }
-    //vector<vector<string> > validation_data;
-    if (validation_text != "") {
-        readSentFile(validation_text, validation_data);
-        for (int i=0; i<validation_data.size(); i++) {
-            // if data is already ngramized, set/check ngram_size
-            if (!ngramize) {
-                // if --ngram_size has been specified, check that it does not conflict with --ngram_size
-                if (ngram_size > 0) {
-                    if (ngram_size != validation_data[i].size()) {
-                        cerr << "Error: size of validation ngrams does not match specified value of --ngram_size!" << endl;
-                    }
-                }
-                // else if --ngram_size has not been specified, set it now
-                else {
-                    ngram_size=validation_data[i].size();
-                }
-            }
-        }
-    }
-    //READING SENTENCE WEIGHTS IF THERE ARE ANY
-    vector<float> sent_weights;
-    if (sent_weights_text != "") {
-        cerr<<"Reading sentence weights from "<<sent_weights_text<<endl;
-        ifstream sent_weights_file(sent_weights_text.c_str());
-        string line;
-        readWeightsFile(sent_weights_file,sent_weights);
-        sent_weights_file.close();
-        if (sent_weights_text.size() != train_data_size) {
-            cerr<<"The number of sentence weights does not match the number of training sentences"<<endl;
-        }
-    }
-
-    /*
-      else if (validation_size > 0)
-      {
-          // Create validation data
-          if (validation_size > train_data.size())
-          {
-              cerr << "error: requested validation size is greater than training data size" << endl;
-              exit(1);
-          }
-          validation_data.insert(validation_data.end(), train_data.end()-validation_size, train_data.end());
-          train_data.resize(train_data.size() - validation_size);
-      }
-    */
-
-    // Construct vocabulary
-    //vocabulary vocab;
-    //int start, stop;
-
-    // read vocabulary from file
-    if (words_file != "") {
-        vector<string> words;
-        readWordsFile(words_file,words);
-        for(vector<string>::iterator it = words.begin(); it != words.end(); ++it) {
-            vocab.insert_word(*it);
-        }
-
-        // was vocab_size set? if so, verify that it does not conflict with size of vocabulary read from file
-        if (vocab_size > 0) {
-            if (vocab.size() != vocab_size) {
-                cerr << "Error: size of vocabulary file " << vocab.size() << " != --vocab_size " << vocab_size << endl;
-            }
-        }
-        // else, set it to the size of vocabulary read from file
-        else {
-            vocab_size = vocab.size();
-        }
-
-    }
-    /*
-    // construct vocabulary to contain top <vocab_size> most frequent words; all other words replaced by <unk>
-    else {
-        vocab.insert_word("<s>");
-        vocab.insert_word("</s>");
-        vocab.insert_word("<null>");
-
-        // warn user that if --numberize is not set, there will be no vocabulary!
-        if (!numberize) {
-            cerr << "Warning: with --numberize 0 and --words_file == "", there will be no vocabulary!" << endl;
-        }
-        unordered_map<string,int> count;
-        for (int i=0; i<train_data.size(); i++) {
-            for (int j=0; j<train_data[i].size(); j++) {
-                count[train_data[i][j]] += 1;
-            }
-        }
-
-        vocab.insert_most_frequent(count, vocab_size);
-        if (vocab.size() < vocab_size) {
-            cerr << "warning: fewer than " << vocab_size << " types in training data; the unknown word will not be learned" << endl;
-        }
-    }
-    */
-
-    // write vocabulary to file
-    if (write_words_file != "") {
-        cerr << "Writing vocabulary to " << write_words_file << endl;
-        writeWordsFile(vocab.words(), write_words_file);
-    }
-
-    // Write out numberized n-grams
-    if (train_file != "")
-    {
-        cerr << "Writing training data to " << train_file << endl;
-        if (mmap_file == true) {
-            writeMmapNgrams(train_text,
-                            ngram_size,
-                            vocab,
-                            numberize,
-                            add_start_stop,
-                            ngramize,
-                            train_file,
-                            train_data_size,
-                            num_tokens,
-                            randomize);
-        } else {
-            writeNgrams(train_text,
-                        ngram_size,
-                        vocab,
-                        numberize,
-                        add_start_stop,
-                        ngramize,
-                        train_file,
-                        train_data_size,
-                        sent_weights,
-                        output_sent_weights_text);
-        }
-    }
-    if (validation_file != "")
-    {
-        cerr << "Writing validation data to " << validation_file << endl;
-        writeNgrams(validation_data,
-                    ngram_size,
-                    vocab,
-                    numberize,
-                    add_start_stop,
-                    ngramize,
-                    validation_file);
-    }
+  //vector<vector<string> > validation_data;
+  if (validation_text != "") {
+    readSentFile(validation_text, validation_data);
+    for (int i=0; i<validation_data.size(); i++) {
+      // if data is already ngramized, set/check ngram_size
+      if (!ngramize) {
+        // if --ngram_size has been specified, check that it does not conflict with --ngram_size
+        if (ngram_size > 0) {
+          if (ngram_size != validation_data[i].size()) {
+            cerr << "Error: size of validation ngrams does not match specified value of --ngram_size!" << endl;
+          }
+        }
+        // else if --ngram_size has not been specified, set it now
+        else {
+          ngram_size=validation_data[i].size();
+        }
+      }
+    }
+  }
+  //READING SENTENCE WEIGHTS IF THERE ARE ANY
+  vector<float> sent_weights;
+  if (sent_weights_text != "") {
+    cerr<<"Reading sentence weights from "<<sent_weights_text<<endl;
+    ifstream sent_weights_file(sent_weights_text.c_str());
+    string line;
+    readWeightsFile(sent_weights_file,sent_weights);
+    sent_weights_file.close();
+    if (sent_weights_text.size() != train_data_size) {
+      cerr<<"The number of sentence weights does not match the number of training sentences"<<endl;
+    }
+  }
+
+  /*
+    else if (validation_size > 0)
+    {
+      // Create validation data
+      if (validation_size > train_data.size())
+      {
+        cerr << "error: requested validation size is greater than training data size" << endl;
+        exit(1);
+      }
+      validation_data.insert(validation_data.end(), train_data.end()-validation_size, train_data.end());
+      train_data.resize(train_data.size() - validation_size);
+    }
+  */
+
+  // Construct vocabulary
+  //vocabulary vocab;
+  //int start, stop;
+
+  // read vocabulary from file
+  if (words_file != "") {
+    vector<string> words;
+    readWordsFile(words_file,words);
+    for(vector<string>::iterator it = words.begin(); it != words.end(); ++it) {
+      vocab.insert_word(*it);
+    }
+
+    // was vocab_size set? if so, verify that it does not conflict with size of vocabulary read from file
+    if (vocab_size > 0) {
+      if (vocab.size() != vocab_size) {
+        cerr << "Error: size of vocabulary file " << vocab.size() << " != --vocab_size " << vocab_size << endl;
+      }
+    }
+    // else, set it to the size of vocabulary read from file
+    else {
+      vocab_size = vocab.size();
+    }
+
+  }
+  /*
+  // construct vocabulary to contain top <vocab_size> most frequent words; all other words replaced by <unk>
+  else {
+    vocab.insert_word("<s>");
+    vocab.insert_word("</s>");
+    vocab.insert_word("<null>");
+
+    // warn user that if --numberize is not set, there will be no vocabulary!
+    if (!numberize) {
+      cerr << "Warning: with --numberize 0 and --words_file == "", there will be no vocabulary!" << endl;
+    }
+    unordered_map<string,int> count;
+    for (int i=0; i<train_data.size(); i++) {
+      for (int j=0; j<train_data[i].size(); j++) {
+        count[train_data[i][j]] += 1;
+      }
+    }
+
+    vocab.insert_most_frequent(count, vocab_size);
+    if (vocab.size() < vocab_size) {
+      cerr << "warning: fewer than " << vocab_size << " types in training data; the unknown word will not be learned" << endl;
+    }
+  }
+  */
+
+  // write vocabulary to file
+  if (write_words_file != "") {
+    cerr << "Writing vocabulary to " << write_words_file << endl;
+    writeWordsFile(vocab.words(), write_words_file);
+  }
+
+  // Write out numberized n-grams
+  if (train_file != "")
+  {
+    cerr << "Writing training data to " << train_file << endl;
+    if (mmap_file == true) {
+      writeMmapNgrams(train_text,
+                      ngram_size,
+                      vocab,
+                      numberize,
+                      add_start_stop,
+                      ngramize,
+                      train_file,
+                      train_data_size,
+                      num_tokens,
+                      randomize);
+    } else {
+      writeNgrams(train_text,
+                  ngram_size,
+                  vocab,
+                  numberize,
+                  add_start_stop,
+                  ngramize,
+                  train_file,
+                  train_data_size,
+                  sent_weights,
+                  output_sent_weights_text);
+    }
+  }
+  if (validation_file != "")
+  {
+    cerr << "Writing validation data to " << validation_file << endl;
+    writeNgrams(validation_data,
+                ngram_size,
+                vocab,
+                numberize,
+                add_start_stop,
+                ngramize,
+                validation_file);
+  }
 }
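Notes on the techniques used above follow; all sketches are illustrative, not part of the patch.

writeMmapNgrams keeps the numberized training n-grams out of RAM by storing them in one flat int vector that lives inside a Boost.Interprocess managed_mapped_file, the "vector" segment created above. A minimal standalone sketch of that storage pattern (the file name and sizes here are hypothetical, not from the patch):

#include <iostream>
#include <boost/interprocess/managed_mapped_file.hpp>
#include <boost/interprocess/allocators/allocator.hpp>
#include <boost/interprocess/containers/vector.hpp>

namespace ip = boost::interprocess;
typedef ip::allocator<int, ip::managed_mapped_file::segment_manager> intAllocator;
typedef ip::vector<int, intAllocator> vec;

int main()
{
  const long long num_tokens = 1000;  // hypothetical sizes for this sketch
  const int ngram_size = 5;

  // Create a file-backed segment, padded past the payload size just as the
  // patch pads with an extra 1024UL*1024UL bytes.
  ip::managed_mapped_file mfile(ip::create_only, "ngrams.mmap",
                                num_tokens*ngram_size*sizeof(int) + 1024UL*1024UL);
  intAllocator ialloc(mfile.get_segment_manager());

  // One flat named vector; n-gram j occupies slots [j*ngram_size, (j+1)*ngram_size).
  vec *mMapVec = mfile.construct<vec>("vector")(num_tokens*ngram_size, 0, ialloc);
  mMapVec->at(0) = 42;  // writes go to the mapped file, not the heap

  std::cout << "mapped vector size: " << mMapVec->size() << std::endl;
  mfile.flush();
  return 0;
}

A later process can reopen the segment with ip::managed_mapped_file(ip::open_only, "ngrams.mmap") and recover the data via mfile.find<vec>("vector").first, which is how a trainer can stream the prepared n-grams without re-parsing text.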
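The randomize path shuffles in fixed-size blocks: up to 5,000,000 n-grams are copied from the mapped vector into a heap buffer, Fisher-Yates shuffled there, and written back, since random access through the page cache is far slower than in RAM. Note this randomizes within blocks only; n-grams never cross block boundaries. A sketch of the per-block step on a plain buffer (the function name is mine, not the patch's):

#include <algorithm>
#include <vector>
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/uniform_int_distribution.hpp>

// Fisher-Yates shuffle of the rows of a flat n-gram buffer, mirroring the
// inner loop of writeMmapNgrams; row i is data[i*ngram_size .. (i+1)*ngram_size).
void shuffleRows(std::vector<int> &data, long long num_rows, int ngram_size,
                 boost::random::mt19937 &rng)
{
  for (long long i = num_rows - 1; i > 0; i--) {
    // The patch draws j from [0, i-1]; the textbook unbiased range is [0, i].
    long long j = boost::random::uniform_int_distribution<long long>(0, i)(rng);
    for (int k = 0; k < ngram_size; k++)
      std::swap(data[i*ngram_size + k], data[j*ngram_size + k]);
  }
}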
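In main(), the validation set is carved off without a second pass over the training text: while streaming the file, the most recent validation_size lines are kept in a bounded deque (pop_front evicts, push_back appends), so the file's tail becomes the hold-out set, and its token counts are subtracted before the vocabulary is built. A reduced sketch of that loop (file name and size are hypothetical):

#include <deque>
#include <fstream>
#include <iostream>
#include <string>

int main()
{
  const size_t validation_size = 100;       // hypothetical hold-out size
  std::deque<std::string> validation_data;  // always the most recent lines

  std::ifstream training("train.txt");      // hypothetical input path
  std::string line;
  long train_data_size = 0;
  while (std::getline(training, line)) {
    train_data_size++;
    if (validation_data.size() == validation_size)
      validation_data.pop_front();          // evict the oldest held line
    validation_data.push_back(line);
  }
  train_data_size -= validation_size;       // the tail is now validation data
  std::cerr << "training lines: " << train_data_size << std::endl;
  return 0;
}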
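Command-line handling follows the usual TCLAP pattern seen in the patch: each ValueArg registers itself with the CmdLine in its constructor, cmd.parse() fills the arguments, getValue() reads them back, and ArgException is caught for malformed input. A trimmed sketch with two of the patch's flags:

#include <iostream>
#include <tclap/CmdLine.h>

int main(int argc, char *argv[])
{
  try {
    TCLAP::CmdLine cmd("Prepares training data for training a language model.", ' ', "0.1");
    // ValueArg(flag, name, description, required, default, type hint, parser)
    TCLAP::ValueArg<int> arg_ngram_size("", "ngram_size", "Size of n-grams.", true, -1, "int", cmd);
    TCLAP::ValueArg<bool> arg_mmap_file("", "mmap_file", "Memory-map the training file. Default: false.",
                                        false, false, "bool", cmd);
    cmd.parse(argc, argv);
    std::cerr << "ngram_size: " << arg_ngram_size.getValue()
              << ", mmap_file: " << arg_mmap_file.getValue() << std::endl;
  }
  catch (TCLAP::ArgException &e) {
    std::cerr << "error: " << e.error() << " for arg " << e.argId() << std::endl;
    return 1;
  }
  return 0;
}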