Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRico Sennrich <rico.sennrich@gmx.ch>2012-06-05 12:25:42 +0400
committerRico Sennrich <rico.sennrich@gmx.ch>2012-06-15 12:23:52 +0400
commit9a0426543b8c2b013eaab3ddd69c1b2911b4fac3 (patch)
tree6dc255f699557afea7d923645bf6fcb5ed634a9b /contrib
parenta8a5f896dbdfffddf0468ba7a28a06b29dbba3a7 (diff)
sigtest-filter for hierarchical rule tables
Use -h to enable: rules are split into terminal sequences, each sequence is looked up in the suffix array, and the intersection of the resulting sentence lists is treated as the occurrences of the rule. For a speedup, use unordered_set instead of set for SentIdSet (requires GCC >= 4.3).
Diffstat (limited to 'contrib')
-rw-r--r--contrib/sigtest-filter/filter-pt.cpp154
1 files changed, 132 insertions, 22 deletions
diff --git a/contrib/sigtest-filter/filter-pt.cpp b/contrib/sigtest-filter/filter-pt.cpp
index b0828ae33..9528a0676 100644
--- a/contrib/sigtest-filter/filter-pt.cpp
+++ b/contrib/sigtest-filter/filter-pt.cpp
@@ -37,9 +37,11 @@ bool print_neglog_significance = false; // add -log(p) to phrase table?
double sig_filter_limit = 0; // keep phrase pairs with -log(sig) > sig_filter_limit
// higher = filter-more
bool pef_filter_only = false; // only filter based on pef
+bool hierarchical = false; // set by -h: filter a hierarchical rule table
// globals
PhraseSetMap esets;
+PhraseSetMap fsets; // cache handed to find_occurrences() for f-side phrases
double p_111 = 0.0; // alpha
size_t nremoved_sigfilter = 0;
size_t nremoved_pfefilter = 0;
@@ -59,7 +61,8 @@ void usage()
<< " [-l threshold] >0.0, a+e, or a-e: keep values that have a -log significance > this\n"
<< " [-n num ] 0, 1...: 0=no filtering, >0 sort by P(e|f) and keep the top num elements\n"
<< " [-c ] add the cooccurence counts to the phrase table\n"
- << " [-p ] add -log(significance) to the phrasetable\n\n";
+ << " [-p ] add -log(significance) to the phrasetable\n"
+ << " [-h ] filter hierarchical rule table\n";
exit(1);
}
@@ -190,6 +193,124 @@ double fisher_exact(int cfe, int ce, int cf)
return total_p;
}
+// Returns the elements that are present in both set_1 and set_2.
+// The smaller set is iterated and each element is probed in the larger one.
+// Works for any set-like container providing size(), find(), begin(), end(),
+// insert() and const_iterator; the iterator type is taken from setType instead
+// of being hard-wired to SentIdSet. Neither input is modified.
+template <class setType>
+setType unordered_set_intersect(const setType & set_1, const setType & set_2)
+{
+  const bool first_is_smaller = set_1.size() < set_2.size();
+  const setType & smaller = first_is_smaller ? set_1 : set_2;
+  const setType & larger = first_is_smaller ? set_2 : set_1;
+
+  setType set_out;
+  for (typename setType::const_iterator i = smaller.begin(); i != smaller.end(); ++i) {
+    if (larger.find(*i) != larger.end()) set_out.insert(*i);
+  }
+  return set_out;
+}
+
+
+// Collects the ids of all corpus sentences in which `phrase` occurs.
+// The phrase is located with my_sa.locateExactPhraseInCorpus(); every hit
+// contributes its sentIdInCorpus to the returned set. A warning is written to
+// stderr when the phrase has no hits at all.
+SentIdSet lookup_phrase(const std::string & phrase, C_SuffixArraySearchApplicationBase & my_sa)
+{
+  typedef vector<S_SimplePhraseLocationElement> LocationList;
+
+  LocationList hits = my_sa.locateExactPhraseInCorpus(phrase.c_str());
+  if (hits.empty()) {
+    cerr<<"No occurrences found!!\n";
+  }
+
+  SentIdSet sentence_ids;
+  for (LocationList::iterator hit = hits.begin(); hit != hits.end(); ++hit) {
+    sentence_ids.insert(hit->sentIdInCorpus);
+  }
+  return sentence_ids;
+}
+
+
+// Slight simplification: we consider all sentences in which "a" and "b" occur to be instances of the rule "a [X][X] b".
+//
+// Returns the ids of the sentences that contain every terminal sequence in
+// `phrases`, i.e. the intersection of the per-sequence occurrence sets.
+//   phrases - terminal sequences of one rule
+//   my_sa   - suffix array used for the lookups
+//   rule    - the full rule; not used here, kept so existing calls still compile
+//   cache   - occurrence sets of sequences looked up earlier; an entry with
+//             fewer than MINIMUM_SIZE_TO_KEEP sentences is erased again after
+//             use. A rule with a single sequence bypasses the cache.
+SentIdSet lookup_multiple_phrases(vector<std::string> & phrases, C_SuffixArraySearchApplicationBase & my_sa, const std::string & rule, PhraseSetMap & cache)
+{
+  // nothing to intersect; also protects the front() calls below
+  if (phrases.empty()) {
+    return SentIdSet();
+  }
+
+  if (phrases.size() == 1) {
+    return lookup_phrase(phrases.front(), my_sa);
+  }
+
+  SentIdSet main_set;
+  SentIdSet & first_set = cache[phrases.front()];
+  if (first_set.empty()) {
+    first_set = lookup_phrase(phrases.front(), my_sa);
+  }
+
+  // Decide now whether the first entry stays cached. The same sequence may
+  // appear again later in the rule ("a [X][X] a"); the loop below can then
+  // erase its cache entry, which leaves first_set dangling. It must therefore
+  // not be read after the first iteration.
+  const bool drop_first = first_set.size() < MINIMUM_SIZE_TO_KEEP;
+
+  bool first = true;
+  for (vector<std::string>::iterator phrase=phrases.begin()+1; phrase != phrases.end(); ++phrase) {
+    SentIdSet & temp_set = cache[*phrase];
+    if (temp_set.empty()) {
+      temp_set = lookup_phrase(*phrase, my_sa);
+    }
+    if (first) {
+      // only read of first_set inside the loop; happens before any erase
+      main_set = unordered_set_intersect(first_set,temp_set);
+      first = false;
+    }
+    else {
+      main_set = unordered_set_intersect(main_set,temp_set);
+    }
+    if (temp_set.size() < MINIMUM_SIZE_TO_KEEP) {
+      cache.erase(*phrase);
+    }
+  }
+
+  // erasing by key is harmless if the loop already removed the entry
+  if (drop_first) {
+    cache.erase(phrases.front());
+  }
+
+  return main_set;
+}
+
+
+// Returns the ids of the sentences in which `rule` is taken to occur.
+// Without the hierarchical flag the rule is looked up verbatim. With it, the
+// non-terminals are stripped and the remaining terminal sequences are passed
+// to lookup_multiple_phrases(), which intersects their occurrences and uses
+// `cache` for the sets of individual sequences.
+SentIdSet find_occurrences(const std::string& rule, C_SuffixArraySearchApplicationBase & my_sa, PhraseSetMap & cache)
+{
+  if (!hierarchical) {
+    return lookup_phrase(rule, my_sa);
+  }
+
+  // markers searched for in the rule; sizeof-1 gives their length
+  const char leading_nt[] = "[X][X] ";        // NT at start of rule
+  const char inner_nt[] = " [X][X] ";         // NT between two terminal sequences
+  const char trailing_nt[] = " [X][X] [X]";   // NT at end of rule
+  const size_t lhs_length = 4;                // characters cut when the rule does not end with an NT
+
+  // offset at which a rule-final NT starts; an inner NT must begin before it
+  const size_t trailing_start = rule.size() - (sizeof(trailing_nt) - 1);
+
+  vector<std::string> phrases;
+  size_t pos = (rule.find(leading_nt) == 0) ? sizeof(leading_nt) - 1 : 0;
+
+  // one terminal sequence in front of every inner NT
+  for (size_t nt = rule.find(inner_nt, pos); nt < trailing_start; nt = rule.find(inner_nt, pos)) {
+    phrases.push_back(rule.substr(pos, nt - pos));
+    pos = nt + (sizeof(inner_nt) - 1);
+  }
+
+  // last sequence: stop before a rule-final NT, otherwise cut the LHS of the rule
+  size_t endPos;
+  if (rule.find(trailing_nt, pos) != std::string::npos) {
+    endPos = trailing_start;
+  }
+  else {
+    endPos = rule.size() - lhs_length;
+  }
+  phrases.push_back(rule.substr(pos, endPos - pos));
+
+  return lookup_multiple_phrases(phrases, my_sa, rule, cache);
+}
+
+
// input: unordered list of translation options for a single source phrase
void compute_cooc_stats_and_filter(std::vector<PTEntry*>& options)
{
@@ -201,32 +322,17 @@ void compute_cooc_stats_and_filter(std::vector<PTEntry*>& options)
options.erase(options.begin()+pfe_filter_limit,options.end());
}
if (pef_filter_only) return;
-
+// std::cerr << "f phrase: " << options.front()->f_phrase << "\n";
SentIdSet fset;
- vector<S_SimplePhraseLocationElement> locations;
- //std::cerr << "Looking up f-phrase: " << options.front()->f_phrase << "\n";
-
- locations = f_sa.locateExactPhraseInCorpus(options.front()->f_phrase.c_str());
- if(locations.size()==0) {
- cerr<<"No occurrences found!!\n";
- }
- for (vector<S_SimplePhraseLocationElement>::iterator i=locations.begin();
- i != locations.end();
- ++i) {
- fset.insert(i->sentIdInCorpus);
- }
+ fset = find_occurrences(options.front()->f_phrase, f_sa, fsets);
size_t cf = fset.size();
for (std::vector<PTEntry*>::iterator i=options.begin(); i != options.end(); ++i) {
const std::string& e_phrase = (*i)->e_phrase;
size_t cef=0;
- SentIdSet& eset = esets[(*i)->e_phrase];
+ SentIdSet& eset = esets[e_phrase];
if (eset.empty()) {
- //std::cerr << "Looking up e-phrase: " << e_phrase << "\n";
- vector<S_SimplePhraseLocationElement> locations = e_sa.locateExactPhraseInCorpus(e_phrase.c_str());
- for (vector<S_SimplePhraseLocationElement>::iterator i=locations.begin(); i!= locations.end(); ++i) {
- TextLenType curSentId = i->sentIdInCorpus;
- eset.insert(curSentId);
- }
+ eset = find_occurrences(e_phrase, e_sa, esets);
+ //std::cerr << "Looking up e-phrase: " << e_phrase << "\n";
}
size_t ce=eset.size();
if (ce < cf) {
@@ -243,6 +349,7 @@ void compute_cooc_stats_and_filter(std::vector<PTEntry*>& options)
if (ce < MINIMUM_SIZE_TO_KEEP) {
esets.erase(e_phrase);
}
+
}
std::vector<PTEntry*>::iterator new_end =
std::remove_if(options.begin(), options.end(), NlogSigThresholder(sig_filter_limit));
@@ -256,7 +363,7 @@ int main(int argc, char * argv[])
const char* efile=0;
const char* ffile=0;
int pfe_index = 2;
- while ((c = getopt(argc, argv, "cpf:e:i:n:l:")) != -1) {
+ while ((c = getopt(argc, argv, "cpf:e:i:n:l:h")) != -1) {
switch (c) {
case 'e':
efile = optarg;
@@ -277,6 +384,9 @@ int main(int argc, char * argv[])
case 'p':
print_neglog_significance = true;
break;
+ case 'h':
+ hierarchical = true;
+ break;
case 'l':
std::cerr << "-l = " << optarg << "\n";
if (strcmp(optarg,"a+e") == 0) {