diff options
author | Nicola Bertoldi <bertoldi@fbk.eu> | 2014-05-19 17:34:27 +0400 |
---|---|---|
committer | Nicola Bertoldi <bertoldi@fbk.eu> | 2014-05-19 17:34:27 +0400 |
commit | 2f3cd5e2fe54f0352eee4657ea91e0039073a95a (patch) | |
tree | 535b1f445dd9f762997bc0dd0ef7a17f9500e250 /moses/Util.cpp | |
parent | d85bd05e1827ceb917ab3664fee34fd05725eab6 (diff) |
beautify
Diffstat (limited to 'moses/Util.cpp')
-rw-r--r-- | moses/Util.cpp | 590 |
1 files changed, 293 insertions, 297 deletions
diff --git a/moses/Util.cpp b/moses/Util.cpp index 323df7d5a..79690668f 100644 --- a/moses/Util.cpp +++ b/moses/Util.cpp @@ -3,17 +3,17 @@ /*********************************************************************** Moses - factored phrase-based language decoder Copyright (C) 2006 University of Edinburgh - + This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. - + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. - + You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA @@ -42,305 +42,301 @@ using namespace std; namespace Moses { - - //global variable - Timer g_timer; - - string GetTempFolder() - { + +//global variable +Timer g_timer; + +string GetTempFolder() +{ #ifdef _WIN32 - char *tmpPath = getenv("TMP"); - string str(tmpPath); - if (str.substr(str.size() - 1, 1) != "\\") - str += "\\"; - return str; + char *tmpPath = getenv("TMP"); + string str(tmpPath); + if (str.substr(str.size() - 1, 1) != "\\") + str += "\\"; + return str; #else - return "/tmp/"; + return "/tmp/"; #endif - } - - const std::string ToLower(const std::string& str) - { - std::string lc(str); - std::transform(lc.begin(), lc.end(), lc.begin(), (int(*)(int))std::tolower); - return lc; - } - - class BoolValueException : public util::Exception {}; - - template<> - bool Scan<bool>(const std::string &input) - { - std::string lc = ToLower(input); - if (lc == "yes" || lc == "y" || lc == "true" || lc == "1") - return true; - if (lc == "no" || lc == "n" || lc =="false" || lc == "0") - return false; - UTIL_THROW(BoolValueException, "Could not interpret " << input << " as a boolean. After lowercasing, valid values are yes, y, true, 1, no, n, false, and 0."); - } - - bool FileExists(const std::string& filePath) - { - ifstream ifs(filePath.c_str()); - return !ifs.fail(); - } - - const std::string Trim(const std::string& str, const std::string dropChars) - { - std::string res = str; - res.erase(str.find_last_not_of(dropChars)+1); - return res.erase(0, res.find_first_not_of(dropChars)); - } - - void ResetUserTime() - { - g_timer.start(); - }; - - void PrintUserTime(const std::string &message) - { - g_timer.check(message.c_str()); - } - - double GetUserTime() - { - return g_timer.get_elapsed_time(); - } - - std::vector< std::map<std::string, std::string> > ProcessAndStripDLT(std::string &line) - { - std::vector< std::map<std::string, std::string> > meta; - std::string lline = ToLower(line); - bool check_dlt = true; - - //allowed format of dlt tag - //<dlt type="name" id="name" attr="value"/> - //the type attribute is mandatory; the name should not contain any double quotation mark - //the id attribute is optional; if present, the name should not contain any double quotation mark - //only one additional attribute is possible; value can contain double quotation marks - //both name and value must be surrounded by double quotation mark - +} + +const std::string ToLower(const std::string& str) +{ + std::string lc(str); + std::transform(lc.begin(), lc.end(), lc.begin(), (int(*)(int))std::tolower); + return lc; +} + +class BoolValueException : public util::Exception {}; + +template<> +bool Scan<bool>(const std::string &input) +{ + std::string lc = ToLower(input); + if (lc == "yes" || lc == "y" || lc == "true" || lc == "1") + return true; + if (lc == "no" || lc == "n" || lc =="false" || lc == "0") + return false; + UTIL_THROW(BoolValueException, "Could not interpret " << input << " as a boolean. After lowercasing, valid values are yes, y, true, 1, no, n, false, and 0."); +} + +bool FileExists(const std::string& filePath) +{ + ifstream ifs(filePath.c_str()); + return !ifs.fail(); +} + +const std::string Trim(const std::string& str, const std::string dropChars) +{ + std::string res = str; + res.erase(str.find_last_not_of(dropChars)+1); + return res.erase(0, res.find_first_not_of(dropChars)); +} + +void ResetUserTime() +{ + g_timer.start(); +}; + +void PrintUserTime(const std::string &message) +{ + g_timer.check(message.c_str()); +} + +double GetUserTime() +{ + return g_timer.get_elapsed_time(); +} + +std::vector< std::map<std::string, std::string> > ProcessAndStripDLT(std::string &line) +{ + std::vector< std::map<std::string, std::string> > meta; + std::string lline = ToLower(line); + bool check_dlt = true; + + //allowed format of dlt tag + //<dlt type="name" id="name" attr="value"/> + //the type attribute is mandatory; the name should not contain any double quotation mark + //the id attribute is optional; if present, the name should not contain any double quotation mark + //only one additional attribute is possible; value can contain double quotation marks + //both name and value must be surrounded by double quotation mark + // std::cerr << "GLOBAL START" << endl; - while (check_dlt) { - size_t start = lline.find("<dlt"); - if (start == std::string::npos) { - //no more dlt tags - check_dlt = false; - continue; - } - size_t close = lline.find("/>"); - if (close == std::string::npos) { - // error: dlt tag is not ended - check_dlt = false; - continue; - } - //std::string dlt = Trim(lline.substr(start+4, close-start-4)); - std::string dlt = Trim(line.substr(start+4, close-start-4)); - - line.erase(start,close-start+2); - lline.erase(start,close-start+2); - - if (dlt != "") { - std::map<std::string, std::string> tmp_meta; - - //check if type is present and store it - size_t start_type = dlt.find("type="); - size_t len_type=4; - if (start_type != std::string::npos) { - //type is present - //required format type="value" - //double quotation mark is required - - std::string val_type; - std::string label_type = dlt.substr(start_type, len_type); - if (dlt[start_type+len_type+1] == '"'){ - val_type = dlt.substr(start_type+len_type+2); - size_t close_type = val_type.find('"'); - val_type = val_type.substr(0, close_type); - dlt.erase(start_type,start_type+len_type+2+close_type+1); - } - else{ - TRACE_ERR("DLT parse error: missing character \" for type \n"); - } - label_type = Trim(label_type); - dlt = Trim(dlt); - - tmp_meta[label_type] = val_type; - } - else{ - //type is not present - UTIL_THROW(util::Exception, "ProcessAndStripDLT(std::string &line): Attribute type for dlt tag is mandatory."); - } - - //check if id is present and store it - size_t start_id = dlt.find("id="); - size_t len_id=2; - if (start_id != std::string::npos) { - //id is present - //required format id="name" - //double quotation mark is required - - std::string val_id; - std::string label_id = dlt.substr(start_id, len_id); - if (dlt[start_id+len_id+1] == '"'){ - val_id = dlt.substr(start_id+len_id+2); - size_t close_id = val_id.find('"'); - val_id = val_id.substr(0, close_id); - dlt.erase(start_id,start_id+len_id+2+close_id+1); - } - else{ - TRACE_ERR("DLT parse error: missing character \" for id \n"); - } - label_id = Trim(label_id); - dlt = Trim(dlt); - - tmp_meta[label_id] = val_id; - } - else{ - //id is not present - //do nothing - } - - for (size_t i = 1; i < dlt.size(); i++) { - if (dlt[i] == '=') { - std::string label = dlt.substr(0, i); - std::string val = dlt.substr(i+1); - if (val[0] == '"') { - - val = val.substr(1); - // it admits any double quotation mark (but is attribute) in the value of the attribute - // it assumes that just one attribute (besides id attribute) is present in the tag, - // it assumes that the value starts and ends with double quotation mark - size_t close = val.rfind('"'); - if (close == std::string::npos) { - TRACE_ERR("SGML parse error: missing \"\n"); - dlt = ""; - i = 0; - } else { - dlt = val.substr(close+1); - val = val.substr(0, close); - i = 0; - } - } else { - size_t close = val.find(' '); - if (close == std::string::npos) { - dlt = ""; - i = 0; - } else { - dlt = val.substr(close+1); - val = val.substr(0, close); - } - } - label = Trim(label); - dlt = Trim(dlt); - - tmp_meta[label] = val; - } - } - - meta.push_back(tmp_meta); - } - } + while (check_dlt) { + size_t start = lline.find("<dlt"); + if (start == std::string::npos) { + //no more dlt tags + check_dlt = false; + continue; + } + size_t close = lline.find("/>"); + if (close == std::string::npos) { + // error: dlt tag is not ended + check_dlt = false; + continue; + } + //std::string dlt = Trim(lline.substr(start+4, close-start-4)); + std::string dlt = Trim(line.substr(start+4, close-start-4)); + + line.erase(start,close-start+2); + lline.erase(start,close-start+2); + + if (dlt != "") { + std::map<std::string, std::string> tmp_meta; + + //check if type is present and store it + size_t start_type = dlt.find("type="); + size_t len_type=4; + if (start_type != std::string::npos) { + //type is present + //required format type="value" + //double quotation mark is required + + std::string val_type; + std::string label_type = dlt.substr(start_type, len_type); + if (dlt[start_type+len_type+1] == '"') { + val_type = dlt.substr(start_type+len_type+2); + size_t close_type = val_type.find('"'); + val_type = val_type.substr(0, close_type); + dlt.erase(start_type,start_type+len_type+2+close_type+1); + } else { + TRACE_ERR("DLT parse error: missing character \" for type \n"); + } + label_type = Trim(label_type); + dlt = Trim(dlt); + + tmp_meta[label_type] = val_type; + } else { + //type is not present + UTIL_THROW(util::Exception, "ProcessAndStripDLT(std::string &line): Attribute type for dlt tag is mandatory."); + } + + //check if id is present and store it + size_t start_id = dlt.find("id="); + size_t len_id=2; + if (start_id != std::string::npos) { + //id is present + //required format id="name" + //double quotation mark is required + + std::string val_id; + std::string label_id = dlt.substr(start_id, len_id); + if (dlt[start_id+len_id+1] == '"') { + val_id = dlt.substr(start_id+len_id+2); + size_t close_id = val_id.find('"'); + val_id = val_id.substr(0, close_id); + dlt.erase(start_id,start_id+len_id+2+close_id+1); + } else { + TRACE_ERR("DLT parse error: missing character \" for id \n"); + } + label_id = Trim(label_id); + dlt = Trim(dlt); + + tmp_meta[label_id] = val_id; + } else { + //id is not present + //do nothing + } + + for (size_t i = 1; i < dlt.size(); i++) { + if (dlt[i] == '=') { + std::string label = dlt.substr(0, i); + std::string val = dlt.substr(i+1); + if (val[0] == '"') { + + val = val.substr(1); + // it admits any double quotation mark (but is attribute) in the value of the attribute + // it assumes that just one attribute (besides id attribute) is present in the tag, + // it assumes that the value starts and ends with double quotation mark + size_t close = val.rfind('"'); + if (close == std::string::npos) { + TRACE_ERR("SGML parse error: missing \"\n"); + dlt = ""; + i = 0; + } else { + dlt = val.substr(close+1); + val = val.substr(0, close); + i = 0; + } + } else { + size_t close = val.find(' '); + if (close == std::string::npos) { + dlt = ""; + i = 0; + } else { + dlt = val.substr(close+1); + val = val.substr(0, close); + } + } + label = Trim(label); + dlt = Trim(dlt); + + tmp_meta[label] = val; + } + } + + meta.push_back(tmp_meta); + } + } // std::cerr << "GLOBAL END" << endl; - return meta; - } - - std::map<std::string, std::string> ProcessAndStripSGML(std::string &line) - { - std::map<std::string, std::string> meta; - std::string lline = ToLower(line); - if (lline.find("<seg")!=0) return meta; - size_t close = lline.find(">"); - if (close == std::string::npos) return meta; // error - size_t end = lline.find("</seg>"); - std::string seg = Trim(lline.substr(4, close-4)); - std::string text = line.substr(close+1, end - close - 1); - for (size_t i = 1; i < seg.size(); i++) { - if (seg[i] == '=' && seg[i-1] == ' ') { - std::string less = seg.substr(0, i-1) + seg.substr(i); - seg = less; - i = 0; - continue; - } - if (seg[i] == '=' && seg[i+1] == ' ') { - std::string less = seg.substr(0, i+1); - if (i+2 < seg.size()) less += seg.substr(i+2); - seg = less; - i = 0; - continue; - } - } - line = Trim(text); - if (seg == "") return meta; - for (size_t i = 1; i < seg.size(); i++) { - if (seg[i] == '=') { - std::string label = seg.substr(0, i); - std::string val = seg.substr(i+1); - if (val[0] == '"') { - val = val.substr(1); - size_t close = val.find('"'); - if (close == std::string::npos) { - TRACE_ERR("SGML parse error: missing \"\n"); - seg = ""; - i = 0; - } else { - seg = val.substr(close+1); - val = val.substr(0, close); - i = 0; - } - } else { - size_t close = val.find(' '); - if (close == std::string::npos) { - seg = ""; - i = 0; - } else { - seg = val.substr(close+1); - val = val.substr(0, close); - } - } - label = Trim(label); - seg = Trim(seg); - meta[label] = val; - } - } - return meta; - } - - std::string PassthroughSGML(std::string &line, const std::string tagName, const std::string& lbrackStr, const std::string& rbrackStr) - { - string lbrack = lbrackStr; // = "<"; - string rbrack = rbrackStr; // = ">"; - - std::string meta = ""; - - std::string lline = ToLower(line); - size_t open = lline.find(lbrack+tagName); - //check whether the tag exists; if not return the empty string - if (open == std::string::npos) return meta; - - size_t close = lline.find(rbrack, open); - //check whether the tag is closed with '/>'; if not return the empty string - if (close == std::string::npos) { - TRACE_ERR("PassthroughSGML error: the <passthrough info/> tag does not end properly\n"); - return meta; - } - // extract the tag - std::string tmp = line.substr(open, close - open + 1); - meta = line.substr(open, close - open + 1); - - // strip the tag from the line - line = line.substr(0, open) + line.substr(close + 1, std::string::npos); - - TRACE_ERR("The input contains a <passthrough info/> tag:" << meta << std::endl); - - lline = ToLower(line); - open = lline.find(lbrack+tagName); - if (open != std::string::npos) { - TRACE_ERR("PassthroughSGML error: there are two <passthrough> tags\n"); - } - return meta; - } - + return meta; +} + +std::map<std::string, std::string> ProcessAndStripSGML(std::string &line) +{ + std::map<std::string, std::string> meta; + std::string lline = ToLower(line); + if (lline.find("<seg")!=0) return meta; + size_t close = lline.find(">"); + if (close == std::string::npos) return meta; // error + size_t end = lline.find("</seg>"); + std::string seg = Trim(lline.substr(4, close-4)); + std::string text = line.substr(close+1, end - close - 1); + for (size_t i = 1; i < seg.size(); i++) { + if (seg[i] == '=' && seg[i-1] == ' ') { + std::string less = seg.substr(0, i-1) + seg.substr(i); + seg = less; + i = 0; + continue; + } + if (seg[i] == '=' && seg[i+1] == ' ') { + std::string less = seg.substr(0, i+1); + if (i+2 < seg.size()) less += seg.substr(i+2); + seg = less; + i = 0; + continue; + } + } + line = Trim(text); + if (seg == "") return meta; + for (size_t i = 1; i < seg.size(); i++) { + if (seg[i] == '=') { + std::string label = seg.substr(0, i); + std::string val = seg.substr(i+1); + if (val[0] == '"') { + val = val.substr(1); + size_t close = val.find('"'); + if (close == std::string::npos) { + TRACE_ERR("SGML parse error: missing \"\n"); + seg = ""; + i = 0; + } else { + seg = val.substr(close+1); + val = val.substr(0, close); + i = 0; + } + } else { + size_t close = val.find(' '); + if (close == std::string::npos) { + seg = ""; + i = 0; + } else { + seg = val.substr(close+1); + val = val.substr(0, close); + } + } + label = Trim(label); + seg = Trim(seg); + meta[label] = val; + } + } + return meta; +} + +std::string PassthroughSGML(std::string &line, const std::string tagName, const std::string& lbrackStr, const std::string& rbrackStr) +{ + string lbrack = lbrackStr; // = "<"; + string rbrack = rbrackStr; // = ">"; + + std::string meta = ""; + + std::string lline = ToLower(line); + size_t open = lline.find(lbrack+tagName); + //check whether the tag exists; if not return the empty string + if (open == std::string::npos) return meta; + + size_t close = lline.find(rbrack, open); + //check whether the tag is closed with '/>'; if not return the empty string + if (close == std::string::npos) { + TRACE_ERR("PassthroughSGML error: the <passthrough info/> tag does not end properly\n"); + return meta; + } + // extract the tag + std::string tmp = line.substr(open, close - open + 1); + meta = line.substr(open, close - open + 1); + + // strip the tag from the line + line = line.substr(0, open) + line.substr(close + 1, std::string::npos); + + TRACE_ERR("The input contains a <passthrough info/> tag:" << meta << std::endl); + + lline = ToLower(line); + open = lline.find(lbrack+tagName); + if (open != std::string::npos) { + TRACE_ERR("PassthroughSGML error: there are two <passthrough> tags\n"); + } + return meta; +} + } |