diff options
author | Hieu Hoang <hieu@hoang.co.uk> | 2013-03-09 01:53:04 +0400 |
---|---|---|
committer | Hieu Hoang <hieu@hoang.co.uk> | 2013-03-09 01:53:04 +0400 |
commit | a53392165fcee9f33340a7e1e0997c30a11f7618 (patch) | |
tree | 13f626e2e373f351b10180c95680008be14af67d /moses/Phrase.cpp | |
parent | 9298402dcf75bf08364f877750919312e7e80a29 (diff) |
merge Phrase::CreateFromString() and Phrase::CreateFromStringNewFormat()
Diffstat (limited to 'moses/Phrase.cpp')
-rw-r--r-- | moses/Phrase.cpp | 69 |
1 file changed, 54 insertions, 15 deletions
diff --git a/moses/Phrase.cpp b/moses/Phrase.cpp index 6bb23007b..5ec03723b 100644 --- a/moses/Phrase.cpp +++ b/moses/Phrase.cpp @@ -158,23 +158,62 @@ void Phrase::CreateFromString(FactorDirection direction ,const StringPiece &factorDelimiter ,Word *lhs) { - FactorCollection &factorCollection = FactorCollection::Instance(); + // parse + vector<StringPiece> annotatedWordVector; + for (util::TokenIter<util::AnyCharacter, true> it(phraseString, "\t "); it; ++it) { + annotatedWordVector.push_back(*it); + } - for (util::TokenIter<util::AnyCharacter, true> word_it(phraseString, util::AnyCharacter(" \t")); word_it; ++word_it) { - Word &word = AddWord(); - size_t index = 0; - for (util::TokenIter<util::MultiCharacter, false> factor_it(*word_it, util::MultiCharacter(factorDelimiter)); - factor_it && (index < factorOrder.size()); - ++factor_it, ++index) { - word[factorOrder[index]] = factorCollection.AddFactor(*factor_it); - } - if (index != factorOrder.size()) { - TRACE_ERR( "[ERROR] Malformed input: '" << *word_it << "'" << std::endl - << "In '" << phraseString << "'" << endl - << " Expected input to have words composed of " << factorOrder.size() << " factor(s) (form FAC1|FAC2|...)" << std::endl - << " but instead received input with " << index << " factor(s).\n"); - abort(); + if (annotatedWordVector.size() == 0) + return; + + // KOMMA|none ART|Def.Z NN|Neut.NotGen.Sg VVFIN|none + // to + // "KOMMA|none" "ART|Def.Z" "NN|Neut.NotGen.Sg" "VVFIN|none" + + size_t numWords; + const StringPiece &annotatedWord = annotatedWordVector.back(); + if (annotatedWord.size() >= 2 + && *annotatedWord.data() == '[' + && annotatedWord.data()[annotatedWord.size() - 1] == ']') { + // hiero/syntax rule + numWords = annotatedWordVector.size()-1; + + // lhs + CHECK(lhs); + lhs->CreateFromString(direction, factorOrder, annotatedWord.substr(1, annotatedWord.size() - 2), true); + assert(lhs->IsNonTerminal()); + } + else { + CHECK(lhs == NULL); + + numWords = annotatedWordVector.size(); + } + + // 
parse each word + m_words.reserve(numWords); + + for (size_t phrasePos = 0 ; phrasePos < numWords; phrasePos++) { + StringPiece &annotatedWord = annotatedWordVector[phrasePos]; + bool isNonTerminal; + if (annotatedWord.size() >= 2 && *annotatedWord.data() == '[' && annotatedWord.data()[annotatedWord.size() - 1] == ']') { + // non-term + isNonTerminal = true; + + size_t nextPos = annotatedWord.find('[', 1); + CHECK(nextPos != string::npos); + + if (direction == Input) + annotatedWord = annotatedWord.substr(1, nextPos - 2); + else + annotatedWord = annotatedWord.substr(nextPos + 1, annotatedWord.size() - nextPos - 2); + } else { + isNonTerminal = false; } + + Word &word = AddWord(); + word.CreateFromString(direction, factorOrder, annotatedWord, isNonTerminal); + } } |