diff options
author | Rico Sennrich <rico.sennrich@gmx.ch> | 2015-08-27 16:12:52 +0300 |
---|---|---|
committer | Rico Sennrich <rico.sennrich@gmx.ch> | 2015-08-27 16:12:52 +0300 |
commit | 55bf63ddd73978b46e19f3d8c5606d5677cf560f (patch) | |
tree | 939ef531239b69f2fbcc2558cd58af607eac6d50 | |
parent | 0612a2f8cc195f0867d5aaa8b8ef05039af3970c (diff) |
speed up createMmap
-rw-r--r-- | src/createMmap.cpp | 20 |
1 file changed, 12 insertions, 8 deletions
```diff
diff --git a/src/createMmap.cpp b/src/createMmap.cpp
index 408d5fe..16a523f 100644
--- a/src/createMmap.cpp
+++ b/src/createMmap.cpp
@@ -70,24 +70,28 @@ void writeMmap(const string &filename_input,
     ifstream training(filename_input.c_str());
     data_size_t i = 0;
     std::string line;
-    std::vector<std::string> ngram;
+    std::string delimiters = " \t";
     while (std::getline(training, line)) {
       if ((i%10000000)==0) {
         std::cerr<<i<<"...";
       }
-      splitBySpace(line, ngram);
-      if (ngram.size() != ngram_size)
+      std::string::size_type startPos = line.find_first_not_of(delimiters, 0);
+      std::string::size_type endPos;
+      size_t j = 0;
+      while (std::string::npos != startPos) {
+        endPos = line.find_first_of(delimiters, startPos);
+        mMapVec->at(i*ngram_size+j) = (int)strtol(line.data() + startPos, NULL, 10);
+        j++;
+        startPos = line.find_first_not_of(delimiters, endPos);
+      }
+      if (j != ngram_size)
       {
-          std::cerr << "Error: expected " << ngram_size << " fields in instance, found " << ngram.size() << std::endl;
+          std::cerr << "Error: expected " << ngram_size << " fields in instance, found " << j << std::endl;
           std::exit(-1);
       }
-      for (int j=0; j<ngram_size; j++) {
-        mMapVec->at(i*ngram_size+j) = boost::lexical_cast<int>(ngram[j]);
-      }
-
       ++i;
     }
```