// $Id$ // vim:tabstop=2 /*********************************************************************** Moses - factored phrase-based language decoder Copyright (C) 2006 University of Edinburgh This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ***********************************************************************/ #include #include #include #include "Sentence.h" #include "TranslationOptionCollectionText.h" #include "StaticData.h" #include "moses/FF/DynamicCacheBasedLanguageModel.h" #include "moses/TranslationModel/PhraseDictionaryDynamicCacheBased.h" #include "ChartTranslationOptions.h" #include "Util.h" #include "XmlOption.h" #include "FactorCollection.h" using namespace std; namespace Moses { Sentence:: Sentence() : Phrase(0) , InputType() { const StaticData& SD = StaticData::Instance(); if (SD.IsSyntax()) m_defaultLabelSet.insert(SD.GetInputDefaultNonTerminal()); } Sentence:: ~Sentence() { RemoveAllInColl(m_xmlOptions); } void Sentence:: aux_init_partial_translation(string& line) { string sourceCompletedStr; int loc1 = line.find( "|||", 0 ); int loc2 = line.find( "|||", loc1 + 3 ); if (loc1 > -1 && loc2 > -1) { m_initialTargetPhrase = Trim(line.substr(0, loc1)); string scov = Trim(line.substr(loc1 + 3, loc2 - loc1 - 3)); line = line.substr(loc2 + 3); m_sourceCompleted.resize(scov.size()); int contiguous = 1; for (size_t i = 0; i < scov.size(); ++i) { if (sourceCompletedStr.at(i) == '1') { m_sourceCompleted[i] = true; if (contiguous) m_frontSpanCoveredLength++; } else { m_sourceCompleted[i] = false; contiguous = 0; } } } } void Sentence:: aux_interpret_sgml_markup(string& line) { // if sentences is specified as " ... ", extract id typedef std::map metamap; metamap meta = ProcessAndStripSGML(line); metamap::const_iterator i; if ((i = meta.find("id")) != meta.end()) this->SetTranslationId(atol(i->second.c_str())); if ((i = meta.find("docid")) != meta.end()) { this->SetDocumentId(atol(i->second.c_str())); this->SetUseTopicId(false); this->SetUseTopicIdAndProb(false); } if ((i = meta.find("topic")) != meta.end()) { vector topic_params; boost::split(topic_params, i->second, boost::is_any_of("\t ")); if (topic_params.size() == 1) { this->SetTopicId(atol(topic_params[0].c_str())); this->SetUseTopicId(true); this->SetUseTopicIdAndProb(false); } else { this->SetTopicIdAndProb(topic_params); this->SetUseTopicId(false); this->SetUseTopicIdAndProb(true); } } if ((i = meta.find("weight-setting")) != meta.end()) { this->SetWeightSetting(i->second); this->SetSpecifiesWeightSetting(true); StaticData::Instance().SetWeightSetting(i->second); // oh this is so horrible! Why does this have to be propagated globally? // --- UG } else this->SetSpecifiesWeightSetting(false); } void Sentence:: aux_interpret_dlt(string& line) // whatever DLT means ... --- UG { using namespace std; typedef map str2str_map; vector meta = ProcessAndStripDLT(line); BOOST_FOREACH(str2str_map const& M, meta) { str2str_map::const_iterator i,j; if ((i = M.find("type")) != M.end()) { j = M.find("id"); string id = j == M.end() ? "default" : j->second; if (i->second == "cbtm") { PhraseDictionaryDynamicCacheBased* cbtm; cbtm = PhraseDictionaryDynamicCacheBased::InstanceNonConst(id); if (cbtm) cbtm->ExecuteDlt(M); } if (i->second == "cblm") { DynamicCacheBasedLanguageModel* cblm; cblm = DynamicCacheBasedLanguageModel::InstanceNonConst(id); if (cblm) cblm->ExecuteDlt(M); } } } } void Sentence:: aux_interpret_xml(std::string& line, std::vector & xmlWalls, std::vector >& placeholders) { // parse XML markup in translation line const StaticData &SD = StaticData::Instance(); using namespace std; if (SD.GetXmlInputType() != XmlPassThrough) { int offset = SD.IsSyntax() ? 1 : 0; bool OK = ProcessAndStripXMLTags(line, m_xmlOptions, m_reorderingConstraint, xmlWalls, placeholders, offset, SD.GetXmlBrackets().first, SD.GetXmlBrackets().second); UTIL_THROW_IF2(!OK, "Unable to parse XML in line: " << line); } } void Sentence:: init(string line, std::vector const& factorOrder) { using namespace std; const StaticData &SD = StaticData::Instance(); m_frontSpanCoveredLength = 0; m_sourceCompleted.resize(0); if (SD.ContinuePartialTranslation()) aux_init_partial_translation(line); line = Trim(line); aux_interpret_sgml_markup(line); // for "" if (SD.IsPassthroughEnabled() || SD.IsPassthroughInNBestEnabled()) { string pthru = PassthroughSGML(line,"passthrough"); this->SetPassthroughInformation(pthru); } vector xmlWalls; vector >placeholders; aux_interpret_xml(line, xmlWalls, placeholders); Phrase::CreateFromString(Input, factorOrder, line, NULL); ProcessPlaceholders(placeholders); if (SD.IsSyntax()) InitStartEndWord(); // now that we have final word positions in phrase (from // CreateFromString), we can make input phrase objects to go with // our XmlOptions and create TranslationOptions // only fill the vector if we are parsing XML if (SD.GetXmlInputType() != XmlPassThrough) { m_xmlCoverageMap.assign(GetSize(), false); BOOST_FOREACH(XmlOption* o, m_xmlOptions) { WordsRange const& r = o->range; for(size_t j = r.GetStartPos(); j <= r.GetEndPos(); ++j) m_xmlCoverageMap[j]=true; } } // reordering walls and zones m_reorderingConstraint.InitializeWalls(GetSize()); // set reordering walls, if "-monotone-at-punction" is set if (SD.UseReorderingConstraint() && GetSize()) { WordsRange r(0, GetSize()-1); m_reorderingConstraint.SetMonotoneAtPunctuation(GetSubString(r)); } // set walls obtained from xml for(size_t i=0; i& factorOrder) { std::string line; if (getline(in, line, '\n').eof()) return 0; init(line, factorOrder); return 1; } void Sentence:: ProcessPlaceholders(const std::vector< std::pair > &placeholders) { FactorType placeholderFactor = StaticData::Instance().GetPlaceholderFactor(); if (placeholderFactor == NOT_FOUND) { return; } for (size_t i = 0; i < placeholders.size(); ++i) { size_t pos = placeholders[i].first; const string &str = placeholders[i].second; const Factor *factor = FactorCollection::Instance().AddFactor(str); Word &word = Phrase::GetWord(pos); word[placeholderFactor] = factor; } } TranslationOptionCollection* Sentence:: CreateTranslationOptionCollection() const { size_t maxNoTransOptPerCoverage = StaticData::Instance().GetMaxNoTransOptPerCoverage(); float transOptThreshold = StaticData::Instance().GetTranslationOptionThreshold(); TranslationOptionCollection *rv= new TranslationOptionCollectionText(*this, maxNoTransOptPerCoverage, transOptThreshold); assert(rv); return rv; } void Sentence::Print(std::ostream& out) const { out<<*static_cast(this); } bool Sentence::XmlOverlap(size_t startPos, size_t endPos) const { for (size_t pos = startPos; pos <= endPos ; pos++) { if (pos < m_xmlCoverageMap.size() && m_xmlCoverageMap[pos]) { return true; } } return false; } void Sentence::GetXmlTranslationOptions(std::vector &list) const { for (std::vector::const_iterator iterXMLOpts = m_xmlOptions.begin(); iterXMLOpts != m_xmlOptions.end(); ++iterXMLOpts) { const XmlOption &xmlOption = **iterXMLOpts; const WordsRange &range = xmlOption.range; const TargetPhrase &targetPhrase = xmlOption.targetPhrase; TranslationOption *transOpt = new TranslationOption(range, targetPhrase); list.push_back(transOpt); } } void Sentence::GetXmlTranslationOptions(std::vector &list, size_t startPos, size_t endPos) const { //iterate over XmlOptions list, find exact source/target matches for (std::vector::const_iterator iterXMLOpts = m_xmlOptions.begin(); iterXMLOpts != m_xmlOptions.end(); ++iterXMLOpts) { const XmlOption &xmlOption = **iterXMLOpts; const WordsRange &range = xmlOption.range; if (startPos == range.GetStartPos() && endPos == range.GetEndPos()) { const TargetPhrase &targetPhrase = xmlOption.targetPhrase; TranslationOption *transOpt = new TranslationOption(range, targetPhrase); list.push_back(transOpt); } } } std::vector Sentence::GetXmlChartTranslationOptions() const { const StaticData &staticData = StaticData::Instance(); std::vector ret; // XML Options // this code is a copy of the 1 in Sentence. //only fill the vector if we are parsing XML if (staticData.GetXmlInputType() != XmlPassThrough ) { //TODO: needed to handle exclusive //for (size_t i=0; i::const_iterator iterXmlOpts = m_xmlOptions.begin(); iterXmlOpts != m_xmlOptions.end(); iterXmlOpts++) { const XmlOption &xmlOption = **iterXmlOpts; TargetPhrase *targetPhrase = new TargetPhrase(xmlOption.targetPhrase); WordsRange *range = new WordsRange(xmlOption.range); StackVec emptyStackVec; // hmmm... maybe dangerous, but it is never consulted TargetPhraseCollection *tpc = new TargetPhraseCollection; tpc->Add(targetPhrase); ChartTranslationOptions *transOpt = new ChartTranslationOptions(*tpc, emptyStackVec, *range, 0.0f); ret.push_back(transOpt); //TODO: needed to handle exclusive //for(size_t j=transOpt->GetSourceWordsRange().GetStartPos(); j<=transOpt->GetSourceWordsRange().GetEndPos(); j++) { // m_xmlCoverageMap[j]=true; //} } } return ret; } void Sentence:: CreateFromString(vector const& FOrder, string const& phraseString) { Phrase::CreateFromString(Input, FOrder, phraseString, NULL); } Sentence:: Sentence(size_t const transId, string const& stext) : InputType(transId) { vector const& IFO = StaticData::Instance().GetInputFactorOrder(); init(stext, IFO); } }