Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatthias Huck <huck@i6.informatik.rwth-aachen.de>2014-06-11 22:27:18 +0400
committerMatthias Huck <huck@i6.informatik.rwth-aachen.de>2014-06-11 22:27:18 +0400
commitd0e92da7340ae1c46c4eaa41f52bf5eaaf47961c (patch)
treeaff4ce24eca81443c7c11181d08f380966355c1e /phrase-extract/ExtractionPhrasePair.cpp
parent02848112d8bd2bc16114ad7b0dff465f083e0d4b (diff)
GHKM extraction can add a source labels phrase property
Diffstat (limited to 'phrase-extract/ExtractionPhrasePair.cpp')
-rw-r--r--phrase-extract/ExtractionPhrasePair.cpp143
1 files changed, 143 insertions, 0 deletions
diff --git a/phrase-extract/ExtractionPhrasePair.cpp b/phrase-extract/ExtractionPhrasePair.cpp
index 2b26c2ad6..9564b1cfe 100644
--- a/phrase-extract/ExtractionPhrasePair.cpp
+++ b/phrase-extract/ExtractionPhrasePair.cpp
@@ -321,5 +321,148 @@ std::string ExtractionPhrasePair::CollectAllPropertyValues(const std::string &ke
}
+std::string ExtractionPhrasePair::CollectAllLabelsSeparateLHSAndRHS(const std::string& propertyKey,
+ std::set<std::string>& labelSet,
+ boost::unordered_map<std::string,float>& countsLabelsLHS,
+ boost::unordered_map<std::string, boost::unordered_map<std::string,float>* >& jointCountsRulesTargetLHSAndLabelsLHS,
+ Vocabulary &vcbT) const
+{
+ const PROPERTY_VALUES *allPropertyValues = GetProperty( propertyKey );
+
+ if ( allPropertyValues == NULL ) {
+ return "";
+ }
+
+ std::string lhs="", rhs="", currentRhs="";
+ float currentRhsCount = 0.0;
+ std::list< std::pair<std::string,float> > lhsGivenCurrentRhsCounts;
+
+ std::ostringstream oss;
+ for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
+ iter!=allPropertyValues->end(); ++iter) {
+
+ size_t space = (iter->first).find_last_of(' ');
+ if ( space == string::npos ) {
+ lhs = iter->first;
+ rhs.clear();
+ } else {
+ lhs = (iter->first).substr(space+1);
+ rhs = (iter->first).substr(0,space);
+ }
+
+ labelSet.insert(lhs);
+
+ if ( rhs.compare(currentRhs) ) {
+
+ if ( iter!=allPropertyValues->begin() ) {
+ if ( !currentRhs.empty() ) {
+ istringstream tokenizer(currentRhs);
+ std::string rhsLabel;
+ while ( tokenizer.peek() != EOF ) {
+ tokenizer >> rhsLabel;
+ labelSet.insert(rhsLabel);
+ }
+ oss << " " << currentRhs << " " << currentRhsCount;
+ }
+ if ( lhsGivenCurrentRhsCounts.size() > 0 ) {
+ if ( !currentRhs.empty() ) {
+ oss << " " << lhsGivenCurrentRhsCounts.size();
+ }
+ for ( std::list< std::pair<std::string,float> >::const_iterator iter2=lhsGivenCurrentRhsCounts.begin();
+ iter2!=lhsGivenCurrentRhsCounts.end(); ++iter2 ) {
+ oss << " " << iter2->first << " " << iter2->second;
+
+ // update countsLabelsLHS and jointCountsRulesTargetLHSAndLabelsLHS
+ std::string ruleTargetLhs = vcbT.getWord(m_phraseTarget->back());
+ ruleTargetLhs.erase(ruleTargetLhs.begin()); // strip square brackets
+ ruleTargetLhs.erase(ruleTargetLhs.size()-1);
+
+ std::pair< boost::unordered_map<std::string,float>::iterator, bool > insertedCountsLabelsLHS =
+ countsLabelsLHS.insert(std::pair<std::string,float>(iter2->first,iter2->second));
+ if (!insertedCountsLabelsLHS.second) {
+ (insertedCountsLabelsLHS.first)->second += iter2->second;
+ }
+
+ boost::unordered_map<std::string, boost::unordered_map<std::string,float>* >::iterator jointCountsRulesTargetLHSAndLabelsLHSIter =
+ jointCountsRulesTargetLHSAndLabelsLHS.find(ruleTargetLhs);
+ if ( jointCountsRulesTargetLHSAndLabelsLHSIter == jointCountsRulesTargetLHSAndLabelsLHS.end() ) {
+ boost::unordered_map<std::string,float>* jointCounts = new boost::unordered_map<std::string,float>;
+ jointCounts->insert(std::pair<std::string,float>(iter2->first,iter2->second));
+ jointCountsRulesTargetLHSAndLabelsLHS.insert(std::pair<std::string,boost::unordered_map<std::string,float>* >(ruleTargetLhs,jointCounts));
+ } else {
+ boost::unordered_map<std::string,float>* jointCounts = jointCountsRulesTargetLHSAndLabelsLHSIter->second;
+ std::pair< boost::unordered_map<std::string,float>::iterator, bool > insertedJointCounts =
+ jointCounts->insert(std::pair<std::string,float>(iter2->first,iter2->second));
+ if (!insertedJointCounts.second) {
+ (insertedJointCounts.first)->second += iter2->second;
+ }
+ }
+
+ }
+ }
+
+ lhsGivenCurrentRhsCounts.clear();
+ }
+
+ currentRhsCount = 0.0;
+ currentRhs = rhs;
+ }
+
+ currentRhsCount += iter->second;
+ lhsGivenCurrentRhsCounts.push_back( std::pair<std::string,float>(lhs,iter->second) );
+ }
+
+ if ( !currentRhs.empty() ) {
+ istringstream tokenizer(currentRhs);
+ std::string rhsLabel;
+ while ( tokenizer.peek() != EOF ) {
+ tokenizer >> rhsLabel;
+ labelSet.insert(rhsLabel);
+ }
+ oss << " " << currentRhs << " " << currentRhsCount;
+ }
+ if ( lhsGivenCurrentRhsCounts.size() > 0 ) {
+ if ( !currentRhs.empty() ) {
+ oss << " " << lhsGivenCurrentRhsCounts.size();
+ }
+ for ( std::list< std::pair<std::string,float> >::const_iterator iter2=lhsGivenCurrentRhsCounts.begin();
+ iter2!=lhsGivenCurrentRhsCounts.end(); ++iter2 ) {
+ oss << " " << iter2->first << " " << iter2->second;
+
+ // update countsLabelsLHS and jointCountsRulesTargetLHSAndLabelsLHS
+ std::string ruleTargetLhs = vcbT.getWord(m_phraseTarget->back());
+ ruleTargetLhs.erase(ruleTargetLhs.begin()); // strip square brackets
+ ruleTargetLhs.erase(ruleTargetLhs.size()-1);
+
+ std::pair< boost::unordered_map<std::string,float>::iterator, bool > insertedCountsLabelsLHS =
+ countsLabelsLHS.insert(std::pair<std::string,float>(iter2->first,iter2->second));
+ if (!insertedCountsLabelsLHS.second) {
+ (insertedCountsLabelsLHS.first)->second += iter2->second;
+ }
+
+ boost::unordered_map<std::string, boost::unordered_map<std::string,float>* >::iterator jointCountsRulesTargetLHSAndLabelsLHSIter =
+ jointCountsRulesTargetLHSAndLabelsLHS.find(ruleTargetLhs);
+ if ( jointCountsRulesTargetLHSAndLabelsLHSIter == jointCountsRulesTargetLHSAndLabelsLHS.end() ) {
+ boost::unordered_map<std::string,float>* jointCounts = new boost::unordered_map<std::string,float>;
+ jointCounts->insert(std::pair<std::string,float>(iter2->first,iter2->second));
+ jointCountsRulesTargetLHSAndLabelsLHS.insert(std::pair<std::string,boost::unordered_map<std::string,float>* >(ruleTargetLhs,jointCounts));
+ } else {
+ boost::unordered_map<std::string,float>* jointCounts = jointCountsRulesTargetLHSAndLabelsLHSIter->second;
+ std::pair< boost::unordered_map<std::string,float>::iterator, bool > insertedJointCounts =
+ jointCounts->insert(std::pair<std::string,float>(iter2->first,iter2->second));
+ if (!insertedJointCounts.second) {
+ (insertedJointCounts.first)->second += iter2->second;
+ }
+ }
+
+ }
+ }
+
+ std::string allPropertyValuesString(oss.str());
+ return allPropertyValuesString;
+}
+
+
+
}