From 21aaec0105ffc69c48a4c8977b965af3e05c7a04 Mon Sep 17 00:00:00 2001 From: Matthias Huck Date: Fri, 24 Jul 2015 21:01:13 +0100 Subject: Removed some duplicate code. Can we move all or parts of moses/Util to util/, and from the Moses namespace to the util namespace? There's quite some common functionality in it that is not only relevant to the decoder, but also to phrase extraction and possibly other parts of the toolkit. --- phrase-extract/extract-lex-main.cpp | 9 ++++--- phrase-extract/extract-lex.h | 53 ------------------------------------- 2 files changed, 5 insertions(+), 57 deletions(-) (limited to 'phrase-extract') diff --git a/phrase-extract/extract-lex-main.cpp b/phrase-extract/extract-lex-main.cpp index f63015a6a..78182396d 100644 --- a/phrase-extract/extract-lex-main.cpp +++ b/phrase-extract/extract-lex-main.cpp @@ -4,6 +4,7 @@ #include #include "extract-lex.h" #include "InputFileStream.h" +#include "moses/Util.h" using namespace std; using namespace MosesTraining; @@ -53,9 +54,9 @@ int main(int argc, char* argv[]) assert(isAlign); vector toksTarget, toksSource, toksAlign; - Tokenize(toksTarget, lineTarget); - Tokenize(toksSource, lineSource); - Tokenize(toksAlign, lineAlign); + Moses::Tokenize(toksTarget, lineTarget); + Moses::Tokenize(toksSource, lineSource); + Moses::Tokenize(toksAlign, lineAlign); /* cerr << endl @@ -99,7 +100,7 @@ void ExtractLex::Process(vector &toksTarget, vector &toksSource, const string &alignTok = *iterAlign; vector alignPos; - Tokenize(alignPos, alignTok, "-"); + Moses::Tokenize(alignPos, alignTok, "-"); assert(alignPos.size() == 2); if (alignPos[0] >= toksSource.size()) { diff --git a/phrase-extract/extract-lex.h b/phrase-extract/extract-lex.h index 044a32cf8..1d49465c8 100644 --- a/phrase-extract/extract-lex.h +++ b/phrase-extract/extract-lex.h @@ -9,59 +9,6 @@ namespace MosesTraining { - -//! convert string to variable of type T. Used to reading floats, int etc from files -template -inline T Scan(const std::string &input) -{ - std::stringstream stream(input); - T ret; - stream >> ret; - return ret; -} - - -//! speeded up version of above -template -inline void Scan(std::vector &output, const std::vector< std::string > &input) -{ - output.resize(input.size()); - for (size_t i = 0 ; i < input.size() ; i++) { - output[i] = Scan( input[i] ); - } -} - - -inline void Tokenize(std::vector &output - , const std::string& str - , const std::string& delimiters = " \t") -{ - // Skip delimiters at beginning. - std::string::size_type lastPos = str.find_first_not_of(delimiters, 0); - // Find first "non-delimiter". - std::string::size_type pos = str.find_first_of(delimiters, lastPos); - - while (std::string::npos != pos || std::string::npos != lastPos) { - // Found a token, add it to the vector. - output.push_back(str.substr(lastPos, pos - lastPos)); - // Skip delimiters. Note the "not_of" - lastPos = str.find_first_not_of(delimiters, pos); - // Find next "non-delimiter" - pos = str.find_first_of(delimiters, lastPos); - } -} - -// speeded up version of above -template -inline void Tokenize( std::vector &output - , const std::string &input - , const std::string& delimiters = " \t") -{ - std::vector stringVector; - Tokenize(stringVector, input, delimiters); - return Scan(output, stringVector ); -} - class WordCount { friend std::ostream& operator<<(std::ostream&, const WordCount&); -- cgit v1.2.3