Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
Diffstat (limited to 'moses/TranslationModel/ProbingPT/line_splitter.cpp')
-rw-r--r-- moses/TranslationModel/ProbingPT/line_splitter.cpp | 59
1 files changed, 46 insertions, 13 deletions
diff --git a/moses/TranslationModel/ProbingPT/line_splitter.cpp b/moses/TranslationModel/ProbingPT/line_splitter.cpp
index 1eeeb1899..cb9e47fec 100644
--- a/moses/TranslationModel/ProbingPT/line_splitter.cpp
+++ b/moses/TranslationModel/ProbingPT/line_splitter.cpp
@@ -1,66 +1,92 @@
#include "line_splitter.hh"
-line_text splitLine(StringPiece textin)
+namespace Moses
{
- const char delim[] = " ||| ";
+
+line_text splitLine(const StringPiece &textin, bool scfg)
+{
+ const char delim[] = "|||";
line_text output;
//Tokenize
util::TokenIter<util::MultiCharacter> it(textin, util::MultiCharacter(delim));
//Get source phrase
- output.source_phrase = *it;
+ output.source_phrase = Trim(*it);
+ //std::cerr << "output.source_phrase=" << output.source_phrase << "AAAA" << std::endl;
//Get target_phrase
it++;
- output.target_phrase = *it;
+ output.target_phrase = Trim(*it);
+ //std::cerr << "output.target_phrase=" << output.target_phrase << "AAAA" << std::endl;
+
+ if (scfg) {
+ /*
+ std::cerr << "output.source_phrase=" << output.source_phrase << std::endl;
+ std::cerr << "output.target_phrase=" << output.target_phrase << std::endl;
+ reformatSCFG(output);
+ std::cerr << "output.source_phrase=" << output.source_phrase << std::endl;
+ std::cerr << "output.target_phrase=" << output.target_phrase << std::endl;
+ */
+ }
//Get probabilities
it++;
- output.prob = *it;
+ output.prob = Trim(*it);
+ //std::cerr << "output.prob=" << output.prob << "AAAA" << std::endl;
//Get WordAllignment
it++;
if (it == util::TokenIter<util::MultiCharacter>::end()) return output;
- output.word_align = *it;
+ output.word_align = Trim(*it);
+ //std::cerr << "output.word_align=" << output.word_align << "AAAA" << std::endl;
//Get count
it++;
if (it == util::TokenIter<util::MultiCharacter>::end()) return output;
- output.counts = *it;
+ output.counts = Trim(*it);
+ //std::cerr << "output.counts=" << output.counts << "AAAA" << std::endl;
//Get sparse_score
it++;
if (it == util::TokenIter<util::MultiCharacter>::end()) return output;
- output.sparse_score = *it;
+ output.sparse_score = Trim(*it);
+ //std::cerr << "output.sparse_score=" << output.sparse_score << "AAAA" << std::endl;
//Get property
it++;
if (it == util::TokenIter<util::MultiCharacter>::end()) return output;
- output.property = *it;
+ output.property = Trim(*it);
+ //std::cerr << "output.property=" << output.property << "AAAA" << std::endl;
return output;
}
-std::vector<unsigned char> splitWordAll1(StringPiece textin)
+std::vector<unsigned char> splitWordAll1(const StringPiece &textin)
{
const char delim[] = " ";
const char delim2[] = "-";
std::vector<unsigned char> output;
+ //Case with no word alignments.
+ if (textin.size() == 0) {
+ return output;
+ }
+
//Split on space
util::TokenIter<util::MultiCharacter> it(textin, util::MultiCharacter(delim));
//For each int
while (it) {
//Split on dash (-)
- util::TokenIter<util::MultiCharacter> itInner(*it, util::MultiCharacter(delim2));
+ util::TokenIter<util::MultiCharacter> itInner(*it,
+ util::MultiCharacter(delim2));
//Insert the two entries in the vector. User will read entry 0 and 1 to get the first,
//2 and 3 for second etc. Use unsigned char instead of int to save space, as
//word allignments are all very small numbers that fit in a single byte
- output.push_back((unsigned char)(atoi(itInner->data())));
+ output.push_back((unsigned char) (atoi(itInner->data())));
itInner++;
- output.push_back((unsigned char)(atoi(itInner->data())));
+ output.push_back((unsigned char) (atoi(itInner->data())));
it++;
}
@@ -68,3 +94,10 @@ std::vector<unsigned char> splitWordAll1(StringPiece textin)
}
+void reformatSCFG(line_text &output)
+{
+
+}
+
+}
+