#include "line_splitter.h" namespace probingpt { line_text splitLine(const StringPiece &textin, bool scfg) { const char delim[] = "|||"; line_text output; //Tokenize util::TokenIter it(textin, util::MultiCharacter(delim)); //Get source phrase output.source_phrase = Trim(*it); //std::cerr << "output.source_phrase=" << output.source_phrase << "AAAA" << std::endl; //Get target_phrase it++; output.target_phrase = Trim(*it); //std::cerr << "output.target_phrase=" << output.target_phrase << "AAAA" << std::endl; if (scfg) { /* std::cerr << "output.source_phrase=" << output.source_phrase << std::endl; std::cerr << "output.target_phrase=" << output.target_phrase << std::endl; reformatSCFG(output); std::cerr << "output.source_phrase=" << output.source_phrase << std::endl; std::cerr << "output.target_phrase=" << output.target_phrase << std::endl; */ } //Get probabilities it++; output.prob = Trim(*it); //std::cerr << "output.prob=" << output.prob << "AAAA" << std::endl; //Get WordAllignment it++; if (it == util::TokenIter::end()) return output; output.word_align = Trim(*it); //std::cerr << "output.word_align=" << output.word_align << "AAAA" << std::endl; //Get count it++; if (it == util::TokenIter::end()) return output; output.counts = Trim(*it); //std::cerr << "output.counts=" << output.counts << "AAAA" << std::endl; //Get sparse_score it++; if (it == util::TokenIter::end()) return output; output.sparse_score = Trim(*it); //std::cerr << "output.sparse_score=" << output.sparse_score << "AAAA" << std::endl; //Get property it++; if (it == util::TokenIter::end()) return output; output.property = Trim(*it); //std::cerr << "output.property=" << output.property << "AAAA" << std::endl; return output; } std::vector splitWordAll1(const StringPiece &textin) { const char delim[] = " "; const char delim2[] = "-"; std::vector output; //Case with no word alignments. if (textin.size() == 0) { return output; } //Split on space util::TokenIter it(textin, util::MultiCharacter(delim)); //For each int while (it) { //Split on dash (-) util::TokenIter itInner(*it, util::MultiCharacter(delim2)); //Insert the two entries in the vector. User will read entry 0 and 1 to get the first, //2 and 3 for second etc. Use unsigned char instead of int to save space, as //word allignments are all very small numbers that fit in a single byte output.push_back((unsigned char) (atoi(itInner->data()))); itInner++; output.push_back((unsigned char) (atoi(itInner->data()))); it++; } return output; } void reformatSCFG(line_text &output) { } }