diff options
author | Matthias Huck <huck@i6.informatik.rwth-aachen.de> | 2014-08-08 00:02:51 +0400 |
---|---|---|
committer | Matthias Huck <huck@i6.informatik.rwth-aachen.de> | 2014-08-08 00:02:51 +0400 |
commit | c27cbf55eacd4c72685507b9bab624437d9adb4b (patch) | |
tree | d493c7f3607b9fc78d22b8fd04bdb0f016a9fa2a /phrase-extract/PropertiesConsolidator.cpp | |
parent | cda9d1d5aee25b3ba6598742bea44f1da624252b (diff) |
source labels: integration into EMS
Diffstat (limited to 'phrase-extract/PropertiesConsolidator.cpp')
-rw-r--r-- | phrase-extract/PropertiesConsolidator.cpp | 159 |
1 files changed, 159 insertions, 0 deletions
diff --git a/phrase-extract/PropertiesConsolidator.cpp b/phrase-extract/PropertiesConsolidator.cpp new file mode 100644 index 000000000..642c48672 --- /dev/null +++ b/phrase-extract/PropertiesConsolidator.cpp @@ -0,0 +1,159 @@ +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#include "PropertiesConsolidator.h" + +#include <sstream> +#include <limits> +#include <vector> + +#include "moses/Util.h" +#include "phrase-extract/InputFileStream.h" +#include "phrase-extract/OutputFileStream.h" + + +namespace MosesTraining +{ + +void PropertiesConsolidator::ActivateSourceLabelsProcessing(const std::string &sourceLabelSetFile) +{ + Moses::InputFileStream inFile(sourceLabelSetFile); + + // read source label set + m_sourceLabels.clear(); + std::string line; + while (getline(inFile, line)) { + std::istringstream tokenizer(line); + std::string label; + size_t index; + try { + tokenizer >> label >> index; + } catch (const std::exception &e) { + UTIL_THROW2("Error reading source label set file " << sourceLabelSetFile << " ."); + } + std::pair< std::map<std::string,size_t>::iterator, bool > inserted = m_sourceLabels.insert( std::pair<std::string,size_t>(label,index) ); + UTIL_THROW_IF2(!inserted.second,"Source label set file " << sourceLabelSetFile << " should contain each syntactic label only once."); + } + + inFile.Close(); + + m_sourceLabelsFlag = true; +} + + +std::string PropertiesConsolidator::ProcessPropertiesString(const std::string &propertiesString) const +{ + if ( propertiesString.empty() ) { + return propertiesString; + } + + std::ostringstream out; + std::vector<std::string> toks; + Moses::TokenizeMultiCharSeparator(toks, propertiesString, "{{"); + for (size_t i = 1; i < toks.size(); ++i) { + std::string &tok = toks[i]; + if (tok.empty()) { + continue; + } + size_t endPos = tok.rfind("}"); + tok = tok.substr(0, endPos - 1); + std::vector<std::string> keyValue = Moses::TokenizeFirstOnly(tok, " "); + assert(keyValue.size() == 2); + + if ( !keyValue[0].compare("SourceLabels") ) { + + if ( m_sourceLabelsFlag ) { + + // SourceLabels additional property: replace strings with vocabulary indices + out << " {{" << keyValue[0]; + + std::istringstream tokenizer(keyValue[1]); + + size_t nNTs; + double totalCount; + + if (! (tokenizer >> nNTs)) { // first token: number of non-terminals (incl. left-hand side) + UTIL_THROW2("Not able to read number of non-terminals from SourceLabels property. " + << "Flawed SourceLabels property?"); + } + assert( nNTs > 0 ); + out << " " << nNTs; + + if (! (tokenizer >> totalCount)) { // second token: overall rule count + UTIL_THROW2("Not able to read overall rule count from SourceLabels property. " + << "Flawed SourceLabels property?"); + } + assert( totalCount > 0.0 ); + out << " " << totalCount; + + while (tokenizer.peek() != EOF) { + try { + + size_t numberOfLHSsGivenRHS = std::numeric_limits<std::size_t>::max(); + + std::string token; + + if (nNTs > 1) { // rule has right-hand side non-terminals, i.e. it's a hierarchical rule + for (size_t i=0; i<nNTs-1; ++i) { // RHS source non-terminal labels + tokenizer >> token; // RHS source non-terminal label + std::map<std::string,size_t>::const_iterator found = m_sourceLabels.find(token); + UTIL_THROW_IF2(found == m_sourceLabels.end(), "Label \"" << token << "\" from the phrase table not found in given label set."); + out << " " << found->second; + } + + tokenizer >> token; // sourceLabelsRHSCount + out << " " << token; + + tokenizer >> numberOfLHSsGivenRHS; + out << " " << numberOfLHSsGivenRHS; + } + + for (size_t i=0; i<numberOfLHSsGivenRHS && tokenizer.peek()!=EOF; ++i) { // LHS source non-terminal labels seen with this RHS + tokenizer >> token; // LHS source non-terminal label + std::map<std::string,size_t>::const_iterator found = m_sourceLabels.find(token); + UTIL_THROW_IF2(found == m_sourceLabels.end() ,"Label \"" << token << "\" from the phrase table not found in given label set."); + out << " " << found->second; + + tokenizer >> token; // ruleSourceLabelledCount + out << " " << token; + } + + } catch (const std::exception &e) { + UTIL_THROW2("Flawed item in SourceLabels property?"); + } + } + + out << "}}"; + + } else { // don't process source labels additional property + out << " {{" << keyValue[0] << " " << keyValue[1] << "}}"; + } + + } else { + + // output other additional property + out << " {{" << keyValue[0] << " " << keyValue[1] << "}}"; + } + } + + return out.str(); +} + +} // namespace MosesTraining + |