#include "PCNTools.h"

#include <cstdlib>
#include <iostream>
#include <map>
#include <string>
#include <vector>

#include "Util.h"
#include "util/exception.hh"

using namespace std;

// Hand-written recursive-descent parser for confusion networks / lattices
// written as nested tuples, e.g.
//   ((('foo', 0.23, 1), ('bar', 0.77, 1)), (('a', 0.3, 1), ('c', 0.7, 1)))
// Every helper takes the input string plus a cursor `c` (by reference) and
// leaves the cursor just past whatever it consumed.
namespace PCN
{

// The two characters with special meaning inside a quoted word.
const std::string chars = "'\\";
const char& quote = chars[0];
const char& slash = chars[1];

// Bounds-checked character access: returns in[c], or 0 when c lies outside
// the string. This is what lets the parsers below run past the end safely.
inline char get(const std::string& in, int c)
{
  if (c < 0 || c >= (int)in.size()) return 0;
  return in[(size_t)c];
}

// Advance the cursor over a run of spaces (only ' ' counts as whitespace).
inline void eatws(const std::string& in, int& c)
{
  while (get(in,c) == ' ') {
    c++;
  }
}

// Read one bare token: skips leading spaces, collects characters up to (not
// including) the next ' ', ')' or ',' or the end of input, then skips
// trailing spaces. Returns the collected text (possibly empty).
std::string getString(const std::string& in, int &c)
{
  std::string ret;
  eatws(in,c);
  while (c < (int)in.size() && get(in,c) != ' ' && get(in,c) != ')' && get(in,c) != ',') {
    ret += get(in,c++);
  }
  eatws(in,c);
  return ret;
}

// Read a single-quoted word: from 'foo' return foo.
//
// A backslash makes the following character literal, so \' and \\ yield a
// quote and a backslash. Leading and trailing spaces around the quoted word
// are skipped. Returns the literal string "ERROR" when the next non-space
// character is not an opening quote (the cursor has still moved one past it).
// An unterminated word ends at the end of input.
std::string getEscapedString(const std::string& in, int &c)
{
  eatws(in,c);
  if (get(in,c++) != quote) return "ERROR";
  std::string res;
  // The closing quote is tested BEFORE each character is consumed. The
  // previous do/while consumed one character first, so for an empty word ''
  // it swallowed the closing quote and kept reading into the fields that
  // follow.
  while (c < (int)in.size() && get(in,c) != quote) {
    const char cur = get(in,c++);
    if (cur == slash) {
      // escaped character: take the next one verbatim (get() yields 0 if the
      // backslash was the last character of the input)
      res += get(in,c++);
    } else {
      res += cur;
    }
  }
  c++;  // step over the closing quote
  eatws(in,c);
  return res;
}

// basically atof: reads one bare token (see getString) and converts it.
float getFloat(const std::string& in, int &c)
{
  const std::string tmp = getString(in,c);
  return static_cast<float>(atof(tmp.c_str()));
}

// basically atoi: reads one bare token (see getString) and converts it.
int getInt(const std::string& in, int &c)
{
  const std::string tmp = getString(in,c);
  return atoi(tmp.c_str());
}

// parse ('foo', 0.23) -- one alternative of a column.
//
// Layout consumed: '(' quoted-word ',' token { ',' token } ')'.
// Of the tokens after the word, the LAST one is the column increment; the
// ones before it are dense scores up to the first token containing '=', and
// key=value sparse features from there on.
// On a structural error a message is written to std::cerr and a
// default-constructed CNAlt is returned; a malformed key=value token raises
// via UTIL_THROW_IF2.
CNAlt getCNAlt(const std::string& in, int &c)
{
  if (get(in,c++) != '(') {
    std::cerr << "PCN/PLF parse error: expected ( at start of cn alt block\n";  // throw "expected (";
    return CNAlt();
  }
  std::string word = getEscapedString(in,c);
  if (get(in,c++) != ',') {
    std::cerr << "PCN/PLF parse error: expected , after string\n";  // throw "expected , after string";
    return CNAlt();
  }
  size_t cnNext = 1;

  // read all tokens after the 1st
  // (toks always holds at least one entry, so toks.size() - 1 below cannot wrap)
  std::vector<std::string> toks;
  toks.push_back(getString(in,c));
  while (get(in,c) == ',') {
    c++;
    string tok = getString(in,c);
    toks.push_back(tok);
  }

  // NOTE(review): element types (float scores, string->float sparse map,
  // size_t increment) are assumed to match the CNAlt constructor declared in
  // PCNTools.h -- confirm against the header.
  std::vector<float> probs;

  // dense scores
  size_t ind;
  for (ind = 0; ind < toks.size() - 1; ++ind) {
    const string &tok = toks[ind];
    if (tok.find('=') == tok.npos) {
      float val = Moses::Scan<float>(tok);
      probs.push_back(val);
    } else {
      // beginning of sparse feature
      break;
    }
  }

  // sparse features
  std::map<string, float> sparseFeatures;
  for (; ind < toks.size() - 1; ++ind) {
    const string &tok = toks[ind];
    vector<string> keyValue = Moses::Tokenize(tok, "=");
    UTIL_THROW_IF2(keyValue.size() != 2, "Format error: " << tok);
    float prob = Moses::Scan<float>(keyValue[1]);
    sparseFeatures[ keyValue[0] ] = prob;
  }

  //last item is column increment
  cnNext = Moses::Scan<size_t>(toks.back());

  if (get(in,c++) != ')') {
    std::cerr << "PCN/PLF parse error: expected ) at end of cn alt block\n";  // throw "expected )";
    return CNAlt();
  }
  eatws(in,c);
  return CNAlt(word, probs, sparseFeatures, cnNext);
}

// parse (('foo', 0.23), ('bar', 0.77)) -- one column, i.e. a list of
// alternatives. A trailing comma before the closing ')' is accepted.
// Returns an empty column if the cursor is not on '('.
CNCol getCNCol(const std::string& in, int &c)
{
  CNCol res;
  if (get(in,c++) != '(') return res;  // error
  eatws(in,c);
  while (1) {
    // Stop only once the cursor is strictly past the end: at c == size get()
    // returns 0, so getCNAlt is still called once and reports the truncation.
    if (c > (int)in.size()) {
      break;
    }
    if (get(in,c) == ')') {
      c++;
      eatws(in,c);
      break;
    }
    if (get(in,c) == ',' && get(in,c+1) == ')') {
      c+=2;
      eatws(in,c);
      break;
    }
    if (get(in,c) == ',') {
      c++;
      eatws(in,c);
    }
    res.push_back(getCNAlt(in, c));
  }
  return res;
}

// parse ((('foo', 0.23), ('bar', 0.77)), (('a', 0.3), ('c', 0.7)))
// -- the whole network, i.e. a list of columns. A trailing comma before the
// closing ')' is accepted. Returns an empty CN if the input does not start
// with '(' (which includes the empty string).
CN parsePCN(const std::string& in)
{
  CN res;
  int c = 0;
  if (get(in,c++) != '(') return res;  // error
  while (1) {
    if (c > (int)in.size()) {
      break;
    }
    if (get(in,c) == ')') {
      c++;
      eatws(in,c);
      break;
    }
    if (get(in,c) == ',' && get(in,c+1) == ')') {
      c+=2;
      eatws(in,c);
      break;
    }
    if (get(in,c) == ',') {
      c++;
      eatws(in,c);
    }
    res.push_back(getCNCol(in, c));
  }
  return res;
}

}