Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHieu Hoang <hieu@hoang.co.uk>2013-03-09 01:53:04 +0400
committerHieu Hoang <hieu@hoang.co.uk>2013-03-09 01:53:04 +0400
commita53392165fcee9f33340a7e1e0997c30a11f7618 (patch)
tree13f626e2e373f351b10180c95680008be14af67d /moses/Phrase.cpp
parent9298402dcf75bf08364f877750919312e7e80a29 (diff)
merge Phrase::CreateFromString() and Phrase::CreateFromStringNewFormat()
Diffstat (limited to 'moses/Phrase.cpp')
-rw-r--r--moses/Phrase.cpp69
1 files changed, 54 insertions, 15 deletions
diff --git a/moses/Phrase.cpp b/moses/Phrase.cpp
index 6bb23007b..5ec03723b 100644
--- a/moses/Phrase.cpp
+++ b/moses/Phrase.cpp
@@ -158,23 +158,62 @@ void Phrase::CreateFromString(FactorDirection direction
,const StringPiece &factorDelimiter
,Word *lhs)
{
- FactorCollection &factorCollection = FactorCollection::Instance();
+ // parse
+ vector<StringPiece> annotatedWordVector;
+ for (util::TokenIter<util::AnyCharacter, true> it(phraseString, "\t "); it; ++it) {
+ annotatedWordVector.push_back(*it);
+ }
- for (util::TokenIter<util::AnyCharacter, true> word_it(phraseString, util::AnyCharacter(" \t")); word_it; ++word_it) {
- Word &word = AddWord();
- size_t index = 0;
- for (util::TokenIter<util::MultiCharacter, false> factor_it(*word_it, util::MultiCharacter(factorDelimiter));
- factor_it && (index < factorOrder.size());
- ++factor_it, ++index) {
- word[factorOrder[index]] = factorCollection.AddFactor(*factor_it);
- }
- if (index != factorOrder.size()) {
- TRACE_ERR( "[ERROR] Malformed input: '" << *word_it << "'" << std::endl
- << "In '" << phraseString << "'" << endl
- << " Expected input to have words composed of " << factorOrder.size() << " factor(s) (form FAC1|FAC2|...)" << std::endl
- << " but instead received input with " << index << " factor(s).\n");
- abort();
+ if (annotatedWordVector.size() == 0)
+ return;
+
+ // KOMMA|none ART|Def.Z NN|Neut.NotGen.Sg VVFIN|none
+ // to
+ // "KOMMA|none" "ART|Def.Z" "NN|Neut.NotGen.Sg" "VVFIN|none"
+
+ size_t numWords;
+ const StringPiece &annotatedWord = annotatedWordVector.back();
+ if (annotatedWord.size() >= 2
+ && *annotatedWord.data() == '['
+ && annotatedWord.data()[annotatedWord.size() - 1] == ']') {
+ // hiero/syntax rule
+ numWords = annotatedWordVector.size()-1;
+
+ // lhs
+ CHECK(lhs);
+ lhs->CreateFromString(direction, factorOrder, annotatedWord.substr(1, annotatedWord.size() - 2), true);
+ assert(lhs->IsNonTerminal());
+ }
+ else {
+ CHECK(lhs == NULL);
+
+ numWords = annotatedWordVector.size();
+ }
+
+ // parse each word
+ m_words.reserve(numWords);
+
+ for (size_t phrasePos = 0 ; phrasePos < numWords; phrasePos++) {
+ StringPiece &annotatedWord = annotatedWordVector[phrasePos];
+ bool isNonTerminal;
+ if (annotatedWord.size() >= 2 && *annotatedWord.data() == '[' && annotatedWord.data()[annotatedWord.size() - 1] == ']') {
+ // non-term
+ isNonTerminal = true;
+
+ size_t nextPos = annotatedWord.find('[', 1);
+ CHECK(nextPos != string::npos);
+
+ if (direction == Input)
+ annotatedWord = annotatedWord.substr(1, nextPos - 2);
+ else
+ annotatedWord = annotatedWord.substr(nextPos + 1, annotatedWord.size() - nextPos - 2);
+ } else {
+ isNonTerminal = false;
}
+
+ Word &word = AddWord();
+ word.CreateFromString(direction, factorOrder, annotatedWord, isNonTerminal);
+
}
}