Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/moses
diff options
context:
space:
mode:
author redpony <redpony@1f5c12ca-751b-0410-a591-d2e778427230> 2007-11-10 02:56:57 +0300
committer redpony <redpony@1f5c12ca-751b-0410-a591-d2e778427230> 2007-11-10 02:56:57 +0300
commit 3b008f67a9bc00bb27ce1e9692baa59298a6429e (patch)
tree c649b39c757464c1388b0b8cbd01235adbfaddcc /moses
parent c4b566dcd7d53c37a9cafa7a1cd86dd7062be609 (diff)
add support for multiple xml options for the same span, start refactoring to add support for overlapping xml options.
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1507 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'moses')
-rwxr-xr-x moses/src/Sentence.cpp 38
-rwxr-xr-x moses/src/Sentence.h 16
-rw-r--r-- moses/src/XmlOption.cpp 126
-rw-r--r-- moses/src/XmlOption.h 20
4 files changed, 169 insertions, 31 deletions
diff --git a/moses/src/Sentence.cpp b/moses/src/Sentence.cpp
index 4648ef4a0..49e202278 100755
--- a/moses/src/Sentence.cpp
+++ b/moses/src/Sentence.cpp
@@ -46,8 +46,8 @@ int Sentence::Read(std::istream& in,const std::vector<FactorType>& factorOrder)
std::vector<string> xmlTokens = Tokenize(line,"<>");
std::string tagName = "";
std::string tagContents = "";
- std::string altText = "";
- std::string altProb = "";
+ std::vector<std::string> altTexts;
+ std::vector<std::string> altProbs;
size_t offset=0;
size_t tagStart=0;
if (xmlTokens.size()>1 && line.at(0) == '<') offset=1;
@@ -75,13 +75,17 @@ int Sentence::Read(std::istream& in,const std::vector<FactorType>& factorOrder)
{
//this is an open tag
tagName = nextTagName;
- altText = Sentence::ParseXmlTagAttribute(tagContents,"english");
- altProb = Sentence::ParseXmlTagAttribute(tagContents,"prob");
+ altTexts = TokenizeMultiCharSeparator(Sentence::ParseXmlTagAttribute(tagContents,"english"), "||");
+ altProbs = TokenizeMultiCharSeparator(Sentence::ParseXmlTagAttribute(tagContents,"prob"), "||");
tagStart = Phrase::GetSize();
VERBOSE(3,"XML TAG NAME IS: '" << tagName << "'" << endl);
- VERBOSE(3,"XML TAG ENGLISH IS: '" << altText << "'" << endl);
- VERBOSE(3,"XML TAG PROB IS: '" << altProb << "'" << endl);
+ VERBOSE(3,"XML TAG ENGLISH IS: '" << altTexts[0] << "'" << endl);
+ VERBOSE(3,"XML TAG PROB IS: '" << altProbs[0] << "'" << endl);
VERBOSE(3,"XML TAG STARTS AT WORD: " << Phrase::GetSize() << endl);
+ if (altTexts.size() != altProbs.size()) {
+ TRACE_ERR("ERROR: Unequal number of probabilities and translation alternatives: " << line << endl);
+ return 0;
+ }
}
else if ((nextTagName.size() == 0) || (nextTagName.at(0) != '/') || (nextTagName.substr(1) != tagName))
{
@@ -99,19 +103,21 @@ int Sentence::Read(std::istream& in,const std::vector<FactorType>& factorOrder)
//TODO: deal with multiple XML options here
if (staticData.GetXmlInputType() != XmlIgnore) {
- //only store options if we aren't ignoring them
- //set default probability
- float probValue = 1;
- if (altProb != "") probValue = Scan<float>(altProb);
- //Convert from prob to log-prob
- float scoreValue = FloorScore(TransformScore(probValue));
- XmlOption option(tagStart,tagEnd,altText,scoreValue);
- m_xmlOptionsList.push_back(option);
+ for (size_t i=0; i<altTexts.size(); ++i) {
+ //only store options if we aren't ignoring them
+ //set default probability
+ float probValue = 1;
+ if (altProbs[i] != "") probValue = Scan<float>(altProbs[i]);
+ //Convert from prob to log-prob
+ float scoreValue = FloorScore(TransformScore(probValue));
+ XmlOption option(tagStart,tagEnd,altTexts[i],scoreValue);
+ m_xmlOptionsList.push_back(option);
+ }
}
tagName= "";
tagContents = "";
- altText = "";
- altProb = "";
+ altTexts.clear();
+ altProbs.clear();
}
}
diff --git a/moses/src/Sentence.h b/moses/src/Sentence.h
index 6d32c3304..8a05787ee 100755
--- a/moses/src/Sentence.h
+++ b/moses/src/Sentence.h
@@ -26,27 +26,13 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "Word.h"
#include "Phrase.h"
#include "InputType.h"
+#include "XmlOption.h"
class WordsRange;
class PhraseDictionary;
class TranslationOption;
class TranslationOptionCollection;
-/** This struct is used for storing XML force translation data for a given range in the sentence
- */
-struct XmlOption {
-
- size_t startPos, endPos;
- std::vector<std::string> targetPhrases;
- std::vector<float> targetScores;
-
- XmlOption(int s, int e, std::string targetPhrase, float targetScore): startPos(s), endPos(e) {
- targetPhrases.push_back(targetPhrase);
- targetScores.push_back(targetScore);
- }
-
-};
-
/***
* A Phrase class with an ID. Used specifically as source input so contains functionality to read
diff --git a/moses/src/XmlOption.cpp b/moses/src/XmlOption.cpp
new file mode 100644
index 000000000..e3166d88d
--- /dev/null
+++ b/moses/src/XmlOption.cpp
@@ -0,0 +1,126 @@
+// $Id: Sentence.cpp 1465 2007-09-27 14:16:28Z hieuhoang1972 $
+// vim:tabstop=2
+
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#include "XmlOption.h"
+
+
+std::vector<XmlOption> parseXMLOptions(const std::string& line) {
+ //parse XML markup in translation line: tokens are handled alternately as phrase text and tags; when a closing tag matches the open one, one XmlOption per entry of altTexts is built for the span [tagStart, tagEnd]
+ std::vector<XmlOption> res; // NOTE(review): only returned on the no-markup path below; the parsed options are pushed to m_xmlOptionsList, never to res
+ if (line.find_first_of('<') == std::string::npos) { return res; } // no '<' anywhere: nothing to parse
+ std::vector<string> xmlTokens = Tokenize(line,"<>"); // presumably splits on '<' and '>' so text and tag tokens alternate - confirm against Tokenize
+ std::string tagName = "";
+ std::string tagContents = "";
+ std::vector<std::string> altTexts;
+ std::vector<std::string> altProbs;
+ size_t offset=0; // set to 1 below when the line starts with '<', which shifts the text/tag parity tests
+ size_t tagStart=0;
+ if (xmlTokens.size()>1 && line.at(0) == '<') offset=1;
+ for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++)
+ {
+ if(((xmlTokenPos+offset) % 2) == 0)
+ {
+ //phrase, not tag
+ Phrase::CreateFromString(factorOrder,xmlTokens[xmlTokenPos],factorDelimiter); // NOTE(review): factorOrder and factorDelimiter are not declared in this function
+ }
+ else
+ {
+ //TODO: support UNARY tags
+
+ //tag data
+ std::string tag = Trim(xmlTokens[xmlTokenPos]);
+ VERBOSE(3,"XML TAG IS: " << tag << endl);
+ std::string::size_type endOfName = xmlTokens[xmlTokenPos].find_first_of(' '); // NOTE(review): searches the untrimmed token, unlike 'tag' above
+ std::string nextTagName = tag;
+ if (endOfName != std::string::npos) {
+ nextTagName = xmlTokens[xmlTokenPos].substr(0,endOfName);
+ tagContents = xmlTokens[xmlTokenPos].substr(endOfName+1);
+ }
+ if ((xmlTokenPos-1+offset) % 4 == 0) // tag tokens are taken as open, close, open, ...: the 1st, 3rd, 5th tag token is an opening tag
+ {
+ //this is an open tag
+ tagName = nextTagName;
+ altTexts = Tokenize(ParseXmlTagAttribute(tagContents,"english"), "|"); // NOTE(review): separator here is "|", while Sentence::Read uses TokenizeMultiCharSeparator with "||"
+ altProbs = Tokenize(ParseXmlTagAttribute(tagContents,"prob"), "|");
+ tagStart = Phrase::GetSize();
+ VERBOSE(3,"XML TAG NAME IS: '" << tagName << "'" << endl);
+ VERBOSE(3,"XML TAG ENGLISH IS: '" << altTexts[0] << "'" << endl); // NOTE(review): indexes [0] without checking that altTexts is non-empty
+ VERBOSE(3,"XML TAG PROB IS: '" << altProbs[0] << "'" << endl); // NOTE(review): same for altProbs
+ VERBOSE(3,"XML TAG STARTS AT WORD: " << Phrase::GetSize() << endl);
+ if (altTexts.size() != altProbs.size()) {
+ TRACE_ERR("ERROR: Unequal number of probabilities and translation alternatives: " << line << endl);
+ return 0; // NOTE(review): returns 0 although the declared return type is std::vector<XmlOption>
+ }
+ }
+ else if ((nextTagName.size() == 0) || (nextTagName.at(0) != '/') || (nextTagName.substr(1) != tagName))
+ {
+ //mismatched tag, abort!
+ TRACE_ERR("ERROR: tried to parse malformed XML with xml-input enabled: " << line << endl);
+ return 0; // NOTE(review): same return-type mismatch as above
+ }
+ else
+ {
+ VERBOSE(3,"XML END TAG IS: " << nextTagName.substr(1) << endl);
+ VERBOSE(3,"XML TAG ENDS AT WORD: " << Phrase::GetSize() << endl);
+ //store translation options into members
+ size_t tagEnd = Phrase::GetSize()-1; //size is inclusive
+
+ //TODO: deal with multiple XML options here
+
+ if (staticData.GetXmlInputType() != XmlIgnore) { // NOTE(review): staticData is not declared in this function
+ for (size_t i=0; i<altTexts.size(); ++i) {
+ //only store options if we aren't ignoring them
+ //set default probability
+ float probValue = 1;
+ if (altProbs[i] != "") probValue = Scan<float>(altProbs[i]);
+ //Convert from prob to log-prob
+ float scoreValue = FloorScore(TransformScore(probValue));
+ XmlOption option(tagStart,tagEnd,altTexts[i],scoreValue);
+ m_xmlOptionsList.push_back(option); // NOTE(review): m_xmlOptionsList is not declared in this function
+ }
+ }
+ tagName= ""; // reset the per-tag state before the next open tag
+ tagContents = "";
+ altTexts.clear();
+ altProbs.clear();
+ }
+ }
+ }
+} // NOTE(review): no return statement on the path that reaches the end of the loop
+
+std::string ParseXmlTagAttribute(const std::string& tag,const std::string& attributeName){
+  /* Returns the raw text between the quotes of attributeName="..." found in tag; "" when the attribute is absent or has no closing quote. TODO deal with unescaping \" */
+  const std::string tagOpen = attributeName + "=\"";
+  size_t contentsStart = tag.find(tagOpen);
+  if (contentsStart == std::string::npos) return "";
+  contentsStart += tagOpen.size();
+  size_t contentsEnd = tag.find_first_of('"',contentsStart); // search from contentsStart itself so an empty value (attr="") ends at its own closing quote
+  if (contentsEnd == std::string::npos) {
+    TRACE_ERR("Malformed XML attribute: "<< tag);
+    return "";
+  }
+  size_t possibleEnd; // a quote preceded by a backslash is escaped: move on to the next quote while one exists
+  while (tag.at(contentsEnd-1) == '\\' && (possibleEnd = tag.find_first_of('"',contentsEnd+1)) != std::string::npos) {
+    contentsEnd = possibleEnd;
+  }
+  return tag.substr(contentsStart,contentsEnd-contentsStart); // backslashes are kept in the result (no unescaping yet)
+}
diff --git a/moses/src/XmlOption.h b/moses/src/XmlOption.h
new file mode 100644
index 000000000..07ef06250
--- /dev/null
+++ b/moses/src/XmlOption.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include <vector>
+#include <string>
+
+/** Forced-translation data given through XML markup for one range of the source sentence:
+ *  the span positions plus parallel lists of candidate target phrases and their scores. */
+struct XmlOption {
+
+  size_t startPos, endPos;                 // span boundaries as word positions (the parser passes GetSize()-1 as the end, so presumably inclusive)
+  std::vector<std::string> targetPhrases;  // candidate translations for the span
+  std::vector<float> targetScores;         // targetScores[i] is stored together with targetPhrases[i]
+  // Positions are taken as size_t, matching the members and the size_t values callers pass, so nothing is narrowed through int.
+  XmlOption(size_t s, size_t e, const std::string& targetPhrase, float targetScore): startPos(s), endPos(e) {
+    targetPhrases.push_back(targetPhrase);
+    targetScores.push_back(targetScore);
+  }
+
+};
+