From d0e92da7340ae1c46c4eaa41f52bf5eaaf47961c Mon Sep 17 00:00:00 2001 From: Matthias Huck Date: Wed, 11 Jun 2014 19:27:18 +0100 Subject: GHKM extraction can add a source labels phrase property --- phrase-extract/ExtractionPhrasePair.cpp | 143 ++++++++++++++++++++++++++++++++ 1 file changed, 143 insertions(+) (limited to 'phrase-extract/ExtractionPhrasePair.cpp') diff --git a/phrase-extract/ExtractionPhrasePair.cpp b/phrase-extract/ExtractionPhrasePair.cpp index 2b26c2ad6..9564b1cfe 100644 --- a/phrase-extract/ExtractionPhrasePair.cpp +++ b/phrase-extract/ExtractionPhrasePair.cpp @@ -321,5 +321,148 @@ std::string ExtractionPhrasePair::CollectAllPropertyValues(const std::string &ke } +std::string ExtractionPhrasePair::CollectAllLabelsSeparateLHSAndRHS(const std::string& propertyKey, + std::set& labelSet, + boost::unordered_map& countsLabelsLHS, + boost::unordered_map* >& jointCountsRulesTargetLHSAndLabelsLHS, + Vocabulary &vcbT) const +{ + const PROPERTY_VALUES *allPropertyValues = GetProperty( propertyKey ); + + if ( allPropertyValues == NULL ) { + return ""; + } + + std::string lhs="", rhs="", currentRhs=""; + float currentRhsCount = 0.0; + std::list< std::pair > lhsGivenCurrentRhsCounts; + + std::ostringstream oss; + for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin(); + iter!=allPropertyValues->end(); ++iter) { + + size_t space = (iter->first).find_last_of(' '); + if ( space == string::npos ) { + lhs = iter->first; + rhs.clear(); + } else { + lhs = (iter->first).substr(space+1); + rhs = (iter->first).substr(0,space); + } + + labelSet.insert(lhs); + + if ( rhs.compare(currentRhs) ) { + + if ( iter!=allPropertyValues->begin() ) { + if ( !currentRhs.empty() ) { + istringstream tokenizer(currentRhs); + std::string rhsLabel; + while ( tokenizer.peek() != EOF ) { + tokenizer >> rhsLabel; + labelSet.insert(rhsLabel); + } + oss << " " << currentRhs << " " << currentRhsCount; + } + if ( lhsGivenCurrentRhsCounts.size() > 0 ) { + if ( !currentRhs.empty() ) { + oss << " " << lhsGivenCurrentRhsCounts.size(); + } + for ( std::list< std::pair >::const_iterator iter2=lhsGivenCurrentRhsCounts.begin(); + iter2!=lhsGivenCurrentRhsCounts.end(); ++iter2 ) { + oss << " " << iter2->first << " " << iter2->second; + + // update countsLabelsLHS and jointCountsRulesTargetLHSAndLabelsLHS + std::string ruleTargetLhs = vcbT.getWord(m_phraseTarget->back()); + ruleTargetLhs.erase(ruleTargetLhs.begin()); // strip square brackets + ruleTargetLhs.erase(ruleTargetLhs.size()-1); + + std::pair< boost::unordered_map::iterator, bool > insertedCountsLabelsLHS = + countsLabelsLHS.insert(std::pair(iter2->first,iter2->second)); + if (!insertedCountsLabelsLHS.second) { + (insertedCountsLabelsLHS.first)->second += iter2->second; + } + + boost::unordered_map* >::iterator jointCountsRulesTargetLHSAndLabelsLHSIter = + jointCountsRulesTargetLHSAndLabelsLHS.find(ruleTargetLhs); + if ( jointCountsRulesTargetLHSAndLabelsLHSIter == jointCountsRulesTargetLHSAndLabelsLHS.end() ) { + boost::unordered_map* jointCounts = new boost::unordered_map; + jointCounts->insert(std::pair(iter2->first,iter2->second)); + jointCountsRulesTargetLHSAndLabelsLHS.insert(std::pair* >(ruleTargetLhs,jointCounts)); + } else { + boost::unordered_map* jointCounts = jointCountsRulesTargetLHSAndLabelsLHSIter->second; + std::pair< boost::unordered_map::iterator, bool > insertedJointCounts = + jointCounts->insert(std::pair(iter2->first,iter2->second)); + if (!insertedJointCounts.second) { + (insertedJointCounts.first)->second += iter2->second; + } + } + + } + } + + lhsGivenCurrentRhsCounts.clear(); + } + + currentRhsCount = 0.0; + currentRhs = rhs; + } + + currentRhsCount += iter->second; + lhsGivenCurrentRhsCounts.push_back( std::pair(lhs,iter->second) ); + } + + if ( !currentRhs.empty() ) { + istringstream tokenizer(currentRhs); + std::string rhsLabel; + while ( tokenizer.peek() != EOF ) { + tokenizer >> rhsLabel; + labelSet.insert(rhsLabel); + } + oss << " " << currentRhs << " " << currentRhsCount; + } + if ( lhsGivenCurrentRhsCounts.size() > 0 ) { + if ( !currentRhs.empty() ) { + oss << " " << lhsGivenCurrentRhsCounts.size(); + } + for ( std::list< std::pair >::const_iterator iter2=lhsGivenCurrentRhsCounts.begin(); + iter2!=lhsGivenCurrentRhsCounts.end(); ++iter2 ) { + oss << " " << iter2->first << " " << iter2->second; + + // update countsLabelsLHS and jointCountsRulesTargetLHSAndLabelsLHS + std::string ruleTargetLhs = vcbT.getWord(m_phraseTarget->back()); + ruleTargetLhs.erase(ruleTargetLhs.begin()); // strip square brackets + ruleTargetLhs.erase(ruleTargetLhs.size()-1); + + std::pair< boost::unordered_map::iterator, bool > insertedCountsLabelsLHS = + countsLabelsLHS.insert(std::pair(iter2->first,iter2->second)); + if (!insertedCountsLabelsLHS.second) { + (insertedCountsLabelsLHS.first)->second += iter2->second; + } + + boost::unordered_map* >::iterator jointCountsRulesTargetLHSAndLabelsLHSIter = + jointCountsRulesTargetLHSAndLabelsLHS.find(ruleTargetLhs); + if ( jointCountsRulesTargetLHSAndLabelsLHSIter == jointCountsRulesTargetLHSAndLabelsLHS.end() ) { + boost::unordered_map* jointCounts = new boost::unordered_map; + jointCounts->insert(std::pair(iter2->first,iter2->second)); + jointCountsRulesTargetLHSAndLabelsLHS.insert(std::pair* >(ruleTargetLhs,jointCounts)); + } else { + boost::unordered_map* jointCounts = jointCountsRulesTargetLHSAndLabelsLHSIter->second; + std::pair< boost::unordered_map::iterator, bool > insertedJointCounts = + jointCounts->insert(std::pair(iter2->first,iter2->second)); + if (!insertedJointCounts.second) { + (insertedJointCounts.first)->second += iter2->second; + } + } + + } + } + + std::string allPropertyValuesString(oss.str()); + return allPropertyValuesString; +} + + + } -- cgit v1.2.3