Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorskyload <skyload@1f5c12ca-751b-0410-a591-d2e778427230>2010-04-21 16:07:02 +0400
committerskyload <skyload@1f5c12ca-751b-0410-a591-d2e778427230>2010-04-21 16:07:02 +0400
commit6895b4fc3856950022d7995b0e6f5b0d58eeccfe (patch)
tree6e7aee9b7d6e316f13338e10091611527e465d90
parente1464a1fe076de87ca447c7edbd287275aeeeff9 (diff)
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/branches/DPR_MOSES@3166 1f5c12ca-751b-0410-a591-d2e778427230
-rw-r--r--sigtest-filter/Makefile10
-rw-r--r--sigtest-filter/README.txt42
-rw-r--r--sigtest-filter/WIN32_functions.cpp236
-rw-r--r--sigtest-filter/WIN32_functions.h24
-rw-r--r--sigtest-filter/check-install5
-rw-r--r--sigtest-filter/filter-pt.cpp359
-rw-r--r--sigtest-filter/sigtest-filter.sln20
-rw-r--r--sigtest-filter/sigtest-filter.vcproj237
8 files changed, 933 insertions, 0 deletions
diff --git a/sigtest-filter/Makefile b/sigtest-filter/Makefile
new file mode 100644
index 000000000..7deb7a247
--- /dev/null
+++ b/sigtest-filter/Makefile
@@ -0,0 +1,10 @@
+# Builds filter-pt against an already-compiled SALM suffix-array toolkit.
+# Point SALMDIR at your SALM checkout:  make SALMDIR=/path/to/SALM
+SALMDIR=/chomes/redpony/salm
+FLAVOR=o32
+# Optimisation flags; override with `make CXXFLAGS=...`.
+# (The former hard-coded -O6 is not a distinct GCC level: anything above 3 means -O3.)
+CXXFLAGS ?= -O3
+INC=-I$(SALMDIR)/Src/Shared -I$(SALMDIR)/Src/SuffixArrayApplications -I$(SALMDIR)/Src/SuffixArrayApplications/SuffixArraySearch
+OBJS=$(SALMDIR)/Distribution/Linux/Objs/Search/_SuffixArrayApplicationBase.$(FLAVOR) $(SALMDIR)/Distribution/Linux/Objs/Search/_SuffixArraySearchApplicationBase.$(FLAVOR) $(SALMDIR)/Distribution/Linux/Objs/Shared/_String.$(FLAVOR) $(SALMDIR)/Distribution/Linux/Objs/Shared/_IDVocabulary.$(FLAVOR)
+
+# all and clean are commands, not files
+.PHONY: all clean
+
+all: filter-pt
+
+# check-install aborts with a hint when SALMDIR is not a directory, so the
+# SALM objects are deliberately not listed as prerequisites (make would stop
+# with a less helpful "No rule to make target" message first).
+filter-pt: filter-pt.cpp
+	./check-install $(SALMDIR)
+	$(CXX) $(CXXFLAGS) $(INC) $(OBJS) -o $@ $<
+
+clean:
+	$(RM) filter-pt
diff --git a/sigtest-filter/README.txt b/sigtest-filter/README.txt
new file mode 100644
index 000000000..b21129b89
--- /dev/null
+++ b/sigtest-filter/README.txt
@@ -0,0 +1,42 @@
+Re-implementation of Johnson et al. (2007)'s phrasetable filtering strategy.
+
+This implementation relies on Joy Zhang's SALM Suffix Array toolkit. It is
+available here:
+
+ http://projectile.sv.cmu.edu/research/public/tools/salm/salm.htm
+
+--Chris Dyer <redpony@umd.edu>
+
+BUILD INSTRUCTIONS
+---------------------------------
+
+1. Download and build SALM.
+
+2. make SALMDIR=/path/to/SALM
+
+
+USAGE INSTRUCTIONS
+---------------------------------
+
+1. Using SALM/Bin/Linux/Index/IndexSA.O32, create a suffix array index
+ of the source and target sides of your training bitext.
+
+2. cat phrase-table.txt | ./filter-pt -e TARG.suffix -f SOURCE.suffix \
+ -l <FILTER-VALUE>
+
+ FILTER-VALUE is the -log prob threshold described in Johnson et al.
+ (2007)'s paper. It may be either 'a+e', 'a-e', or a positive real
+ value. 'a+e' is a good setting; it filters out <1,1,1> phrase pairs.
+ I also recommend using -n 30, which filters out all but the top
+ 30 phrase pairs, sorted by P(e|f). This was used in the paper.
+
+3. Run with no options to see more use-cases.
+
+
+REFERENCES
+---------------------------------
+
+H. Johnson, J. Martin, G. Foster and R. Kuhn. (2007) Improving Translation
+ Quality by Discarding Most of the Phrasetable. In Proceedings of the 2007
+ Joint Conference on Empirical Methods in Natural Language Processing and
+ Computational Natural Language Learning (EMNLP-CoNLL), pp. 967-975.
diff --git a/sigtest-filter/WIN32_functions.cpp b/sigtest-filter/WIN32_functions.cpp
new file mode 100644
index 000000000..d03e9bce6
--- /dev/null
+++ b/sigtest-filter/WIN32_functions.cpp
@@ -0,0 +1,236 @@
+// XGetopt.cpp Version 1.2
+//
+// Author: Hans Dietrich
+// hdietrich2@hotmail.com
+//
+// Description:
+// XGetopt.cpp implements getopt(), a function to parse command lines.
+//
+// History
+// Version 1.2 - 2003 May 17
+// - Added Unicode support
+//
+// Version 1.1 - 2002 March 10
+// - Added example to XGetopt.cpp module header
+//
+// This software is released into the public domain.
+// You are free to use it in any way you like.
+//
+// This software is provided "as is" with no expressed
+// or implied warranty. I accept no liability for any
+// damage or loss of business that this software may cause.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+
+///////////////////////////////////////////////////////////////////////////////
+// if you are using precompiled headers then include this line:
+///////////////////////////////////////////////////////////////////////////////
+
+
+///////////////////////////////////////////////////////////////////////////////
+// if you are not using precompiled headers then include these lines:
+//#include <windows.h>
+//#include <stdio.h>
+//#include <tchar.h>
+///////////////////////////////////////////////////////////////////////////////
+
+
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include "WIN32_functions.h"
+
+
+///////////////////////////////////////////////////////////////////////////////
+//
+// X G e t o p t . c p p
+//
+//
+// NAME
+// getopt -- parse command line options
+//
+// SYNOPSIS
+// int getopt(int argc, char *argv[], char *optstring)
+//
+// extern char *optarg;
+// extern int optind;
+//
+// DESCRIPTION
+// The getopt() function parses the command line arguments. Its
+// arguments argc and argv are the argument count and array as
+// passed into the application on program invocation. In the case
+// of Visual C++ programs, argc and argv are available via the
+// variables __argc and __argv (double underscores), respectively.
+// getopt returns the next option letter in argv that matches a
+// letter in optstring. (Note: Unicode programs should use
+// __targv instead of __argv. Also, all character and string
+// literals should be enclosed in ( ) ).
+//
+// optstring is a string of recognized option letters; if a letter
+// is followed by a colon, the option is expected to have an argument
+// that may or may not be separated from it by white space. optarg
+// is set to point to the start of the option argument on return from
+// getopt.
+//
+// Option letters may be combined, e.g., "-ab" is equivalent to
+// "-a -b". Option letters are case sensitive.
+//
+// getopt places in the external variable optind the argv index
+// of the next argument to be processed. optind is initialized
+// to 0 before the first call to getopt.
+//
+// When all options have been processed (i.e., up to the first
+// non-option argument), getopt returns EOF, optarg will point
+// to the argument, and optind will be set to the argv index of
+// the argument. If there are no non-option arguments, optarg
+// will be set to NULL.
+//
+// The special option "--" may be used to delimit the end of the
+// options; EOF will be returned, and "--" (and everything after it)
+// will be skipped.
+//
+// RETURN VALUE
+// For option letters contained in the string optstring, getopt
+// will return the option letter. getopt returns a question mark (?)
+// when it encounters an option letter not included in optstring.
+// EOF is returned when processing is finished.
+//
+// BUGS
+// 1) Long options are not supported.
+// 2) The GNU double-colon extension is not supported.
+// 3) The environment variable POSIXLY_CORRECT is not supported.
+// 4) The + syntax is not supported.
+// 5) The automatic permutation of arguments is not supported.
+// 6) This implementation of getopt() returns EOF if an error is
+// encountered, instead of -1 as the latest standard requires.
+//
+// EXAMPLE
+// BOOL CMyApp::ProcessCommandLine(int argc, char *argv[])
+// {
+// int c;
+//
+// while ((c = getopt(argc, argv, ("aBn:"))) != EOF)
+// {
+// switch (c)
+// {
+// case ('a'):
+// TRACE(("option a\n"));
+// //
+// // set some flag here
+// //
+// break;
+//
+// case ('B'):
+// TRACE( ("option B\n"));
+// //
+// // set some other flag here
+// //
+// break;
+//
+// case ('n'):
+// TRACE(("option n: value=%d\n"), atoi(optarg));
+// //
+// // do something with value here
+// //
+// break;
+//
+// case ('?'):
+// TRACE(("ERROR: illegal option %s\n"), argv[optind-1]);
+// return FALSE;
+// break;
+//
+// default:
+// TRACE(("WARNING: no handler for option %c\n"), c);
+// return FALSE;
+// break;
+// }
+// }
+// //
+// // check for non-option args here
+// //
+// return TRUE;
+// }
+//
+///////////////////////////////////////////////////////////////////////////////
+
+char *optarg; // global argument pointer
+int optind = 0; // global argv index
+
+// Minimal getopt() replacement: returns the next option letter of argv that
+// appears in optstring, '?' for an unknown letter or a missing required
+// argument, and EOF when the first non-option word, a lone "-", "--", or the
+// end of argv is reached.
+// State lives in the globals optarg/optind and in the static `next`, which
+// points at the not-yet-consumed letters of a bundled word such as "-ab".
+int getopt(int argc, char *argv[], char *optstring)
+{
+ static char *next = NULL;
+ // optind == 0 is the "start (over)" state: forget any half-parsed word
+ if (optind == 0)
+ next = NULL;
+
+ optarg = NULL;
+
+ // no pending letters from a previous word: advance to the next argv entry
+ if (next == NULL || *next =='\0')
+ {
+ // skip argv[0] on the very first call
+ if (optind == 0)
+ optind++;
+
+ // end of argv, a word not starting with '-', or a bare "-": stop parsing;
+ // optarg is left pointing at that word (or NULL at the end of argv)
+ if (optind >= argc || argv[optind][0] != ('-') || argv[optind][1] == ('\0'))
+ {
+ optarg = NULL;
+ if (optind < argc)
+ optarg = argv[optind];
+ return EOF;
+ }
+
+ // "--" ends option parsing; it is consumed and optarg points past it
+ if (strcmp(argv[optind], "--") == 0)
+ {
+ optind++;
+ optarg = NULL;
+ if (optind < argc)
+ optarg = argv[optind];
+ return EOF;
+ }
+
+ next = argv[optind];
+ next++; // skip past -
+ optind++;
+ }
+
+ // take one option letter and look it up in optstring
+ char c = *next++;
+ char *cp = strchr(optstring, c);
+
+ // unknown letter, or a literal ':' which is never a valid option
+ if (cp == NULL || c == (':'))
+ return ('?');
+
+ // a ':' right after the letter in optstring means it takes an argument
+ cp++;
+ if (*cp == (':'))
+ {
+ // argument glued to the letter ("-n30"): the rest of the word
+ if (*next != ('\0'))
+ {
+ optarg = next;
+ next = NULL;
+ }
+ // otherwise the argument is the following argv entry ("-n 30")
+ else if (optind < argc)
+ {
+ optarg = argv[optind];
+ optind++;
+ }
+ // required argument is missing
+ else
+ {
+ return ('?');
+ }
+ }
+
+ return c;
+}
+
+ // Natural log of the gamma function for integer arguments, using the
+ // six-coefficient series approximation. For an overview, see
+ // W. Press, S. Teukolsky and W. Vetterling. (1992) Numerical Recipes in C. Chapter 6.1.
+ // Arguments <= 2 yield 0.0 directly.
+ double lgamma(int x)
+ {
+   if (x <= 2)
+   {
+     return 0.0;
+   }
+   static double coefs[6] = {76.18009172947146, -86.50532032941677, 24.01409824083091, -1.231739572450155, 0.1208650973866179e-2, -0.5395239384953e-5};
+   const double xd = (double)x;
+
+   // series term: 1.000000000190015 + sum_k coefs[k] / (x + k + 1)
+   double series = 1.000000000190015;
+   double denom = xd;
+   for (int k = 0; k < 6; ++k)
+   {
+     denom += 1.0;
+     series += coefs[k] / denom;
+   }
+
+   // (x + 5.5) - (x + 0.5) * log(x + 5.5)
+   double shifted = xd + 5.5;
+   shifted -= (xd + 0.5) * log(shifted);
+
+   return -shifted + log(2.5066282746310005 * series / xd);
+ } \ No newline at end of file
diff --git a/sigtest-filter/WIN32_functions.h b/sigtest-filter/WIN32_functions.h
new file mode 100644
index 000000000..6a719392e
--- /dev/null
+++ b/sigtest-filter/WIN32_functions.h
@@ -0,0 +1,24 @@
+// XGetopt.h Version 1.2
+//
+// Author: Hans Dietrich
+// hdietrich2@hotmail.com
+//
+// This software is released into the public domain.
+// You are free to use it in any way you like.
+//
+// This software is provided "as is" with no expressed
+// or implied warranty. I accept no liability for any
+// damage or loss of business that this software may cause.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#ifndef XGETOPT_H
+#define XGETOPT_H
+
+// getopt() state. optind is the argv index of the next word to examine and
+// optarg points at the current option's argument.
+// NOTE(review): opterr is declared here but WIN32_functions.cpp only defines
+// optarg and optind, so referencing opterr would fail to link - confirm.
+extern int optind, opterr;
+extern char *optarg;
+
+// Command-line option parser; see WIN32_functions.cpp for the contract.
+int getopt(int argc, char *argv[], char *optstring);
+// Log-gamma approximation for integer arguments (returns 0.0 for x <= 2).
+double lgamma(int x);
+
+#endif //XGETOPT_H
diff --git a/sigtest-filter/check-install b/sigtest-filter/check-install
new file mode 100644
index 000000000..ba4f431e0
--- /dev/null
+++ b/sigtest-filter/check-install
@@ -0,0 +1,5 @@
+#!/usr/bin/perl -w
+use strict;
+# Pre-build sanity check used by the Makefile: succeeds when the first
+# argument names an existing directory (the SALM installation), otherwise
+# aborts with a hint on how to pass SALMDIR to make.
+my $path = shift @ARGV;
+# A missing argument is treated like a missing directory; without this the
+# -d test and the message below emit "uninitialized value" warnings under -w.
+$path = '' unless defined $path;
+die "Can't find SALM installation path: $path\nPlease use:\n\n make SALMDIR=/path/to/SALM\n\n" unless (-d $path);
+exit 0;
diff --git a/sigtest-filter/filter-pt.cpp b/sigtest-filter/filter-pt.cpp
new file mode 100644
index 000000000..85b4df49a
--- /dev/null
+++ b/sigtest-filter/filter-pt.cpp
@@ -0,0 +1,359 @@
+
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <algorithm>
+
+#include "_SuffixArraySearchApplicationBase.h"
+
+#include <vector>
+#include <iostream>
+#include <set>
+#include <string>
+
+#ifdef WIN32
+#include "WIN32_functions.h"
+#else
+#include <unistd.h>
+#endif
+
+typedef std::set<TextLenType> SentIdSet;
+typedef std::map<std::string, SentIdSet> PhraseSetMap;
+
+#undef min
+
+// constants
+const size_t MINIMUM_SIZE_TO_KEEP = 10000; // reduce this to improve memory usage,
+ // increase for speed
+const std::string SEPARATOR = " ||| ";
+
+const double ALPHA_PLUS_EPS = -1000.0; // dummy value
+const double ALPHA_MINUS_EPS = -2000.0; // dummy value
+
+// configuration params
+int pfe_filter_limit = 0; // 0 = don't filter anything based on P(f|e)
+bool print_cooc_counts = false; // add cooc counts to phrase table?
+bool print_neglog_significance = false; // add -log(p) to phrase table?
+double sig_filter_limit = 0; // keep phrase pairs with -log(sig) > sig_filter_limit
+ // higher = filter-more
+bool pef_filter_only = false; // only filter based on pef
+
+// globals
+PhraseSetMap esets; // cache: e-phrase -> ids of the sentences that contain it
+double p_111 = 0.0; // alpha
+size_t nremoved_sigfilter = 0; // entries dropped by the significance filter
+size_t nremoved_pfefilter = 0; // entries dropped by the top-n (-n) filter
+
+C_SuffixArraySearchApplicationBase e_sa; // suffix array loaded from the -e file
+C_SuffixArraySearchApplicationBase f_sa; // suffix array loaded from the -f file
+int num_lines; // sentence count of the training corpus, set in main()
+
+// Writes the command-line help to stderr and terminates the program with
+// exit status 1. Never returns.
+void usage()
+{
+  static const char* const help_text =
+    "\nFilter phrase table using significance testing as described\n"
+    "in H. Johnson, et al. (2007) Improving Translation Quality\n"
+    "by Discarding Most of the Phrasetable. EMNLP 2007.\n"
+    "\nUsage:\n"
+    "\n filter-pt -e english.suf-arr -f french.suf-arr\n"
+    " [-c] [-p] [-l threshold] [-n num] < PHRASE-TABLE > FILTERED-PHRASE-TABLE\n\n"
+    " [-l threshold] >0.0, a+e, or a-e: keep values that have a -log significance > this\n"
+    " [-n num ] 0, 1...: 0=no filtering, >0 sort by P(e|f) and keep the top num elements\n"
+    " [-c ] add the cooccurence counts to the phrase table\n"
+    " [-p ] add -log(significance) to the phrasetable\n\n";
+  std::cerr << help_text;
+  exit(1);
+}
+
+// One phrase-table line, split on " ||| " into its fields, plus the
+// co-occurrence statistics that are filled in later by set_cooc_stats().
+struct PTEntry {
+ // Parses one phrase-table line; index selects which column of the score
+ // field is stored in pfe.
+ PTEntry(const std::string& str, int index);
+ std::string f_phrase; // first field of the line
+ std::string e_phrase; // second field of the line
+ std::string extra; // anything between the second field and the last separator
+ std::string scores; // text after the last separator
+ float pfe; // the selected score, parsed with atof
+ int cf; // count for the f-phrase (0 until set_cooc_stats is called)
+ int ce; // count for the e-phrase (0 until set_cooc_stats is called)
+ int cfe; // joint count of both phrases (0 until set_cooc_stats is called)
+ float nlog_pte; // negative log significance (0 until set_cooc_stats is called)
+ // Stores the co-occurrence counts and the -log significance on this entry.
+ void set_cooc_stats(int _cef, int _cf, int _ce, float nlp) {
+ cfe = _cef;
+ cf = _cf;
+ ce = _ce;
+ nlog_pte = nlp;
+ }
+
+};
+
+// Parses one phrase-table line of the form
+//   f ||| e [||| extra ...] ||| scores
+// and stores in pfe the score found in column `index` (0-based, columns are
+// separated by single spaces) of the scores field.
+//
+// Fixes over the previous version:
+//  - index 0 used to skip the first character of the first score
+//    (so "1" parsed as 0);
+//  - the score was copied into a fixed 24-byte buffer without a bounds check;
+//  - a missing column advanced an iterator past scores.end();
+//  - a line with fewer than two separators made npos wrap around and could
+//    throw std::out_of_range. Such lines now keep whatever fields were
+//    found and leave the remaining ones empty, with pfe = 0.
+PTEntry::PTEntry(const std::string& str, int index) :
+  pfe(0.0), cf(0), ce(0), cfe(0), nlog_pte(0.0)
+{
+  const std::string::size_type npos = std::string::npos;
+  const std::string::size_type sepLen = SEPARATOR.size();
+
+  // field 1: source phrase
+  std::string::size_type pos = 0;
+  std::string::size_type nextPos = str.find(SEPARATOR, pos);
+  this->f_phrase = str.substr(pos, nextPos);
+  if (nextPos == npos) return; // malformed: no separator at all
+  pos = nextPos + sepLen;
+
+  // field 2: target phrase
+  nextPos = str.find(SEPARATOR, pos);
+  this->e_phrase = str.substr(pos, nextPos - pos);
+  if (nextPos == npos) return; // malformed: no score field
+  pos = nextPos + sepLen;
+
+  // last field: scores; everything in between (if anything) is "extra"
+  nextPos = str.rfind(SEPARATOR);
+  this->extra = str.substr(pos, ((nextPos > pos)?(nextPos-pos):0));
+  this->scores = str.substr(nextPos + sepLen, npos);
+
+  // locate the start of the requested column by skipping `index` spaces
+  std::string::size_type start = 0;
+  for (int k = 0; k < index; ++k) {
+    const std::string::size_type sp = scores.find(' ', start);
+    if (sp == npos) { // fewer columns than requested
+      start = scores.size();
+      break;
+    }
+    start = sp + 1;
+  }
+  // the column runs up to the next space or the end of the field
+  const std::string::size_type stop = scores.find(' ', start);
+  this->pfe = atof(scores.substr(start, stop - start).c_str());
+}
+
+// Ordering predicate on PTEntry pointers: an entry with a larger pfe sorts
+// before one with a smaller pfe.
+struct PfeComparer {
+  bool operator()(const PTEntry* a, const PTEntry* b) const
+  {
+    return b->pfe < a->pfe;
+  }
+};
+
+// Removal predicate: returns true for entries whose nlog_pte is below the
+// threshold and deletes those entries as a side effect, so the caller must
+// not touch the pointer again.
+struct NlogSigThresholder {
+  float t;
+  NlogSigThresholder(float threshold) : t(threshold) {}
+  bool operator()(const PTEntry* a) const
+  {
+    if (!(a->nlog_pte < t))
+      return false;
+    delete a;
+    return true;
+  }
+};
+
+// Writes the entry in phrase-table format. The extra field is printed only
+// when non-empty; the count and significance columns are appended when the
+// corresponding print_* switches are on.
+std::ostream& operator << (std::ostream& os, const PTEntry& pp)
+{
+  const char* const bar = " ||| ";
+  os << pp.f_phrase << bar << pp.e_phrase;
+  if (!pp.extra.empty()) {
+    os << bar << pp.extra;
+  }
+  os << bar << pp.scores;
+  if (print_cooc_counts) {
+    os << bar << pp.cfe << " " << pp.cf << " " << pp.ce;
+  }
+  if (print_neglog_significance) {
+    os << bar << pp.nlog_pte;
+  }
+  return os;
+}
+
+// Debug helper: dumps a 2x2 table (a b / c d) to stderr together with its
+// probability p and the factor b*c/((a+1)*(d+1)).
+void print(int a, int b, int c, int d, float p) {
+  const double xf = (double)(b)*(double)(c)/(double)(a+1)/(double)(d+1);
+  std::cerr << a << "\t" << b << "\t P=" << p << "\n";
+  std::cerr << c << "\t" << d << "\t xf=" << xf << "\n\n";
+}
+
+// 2x2 (one-sided) Fisher's exact test
+// see B. Moore. (2004) On Log Likelihood and the Significance of Rare Events
+//
+// Builds the contingency table from the joint count cfe, the marginal counts
+// ce and cf and the global num_lines, then sums the probability of the
+// observed table and of every table obtained by moving one more observation
+// into the joint cell, until one off-diagonal cell is exhausted.
+double fisher_exact(int cfe, int ce, int cf)
+{
+  assert(cfe <= ce);
+  assert(cfe <= cf);
+
+  // table cells
+  int n11 = cfe;
+  int n12 = (cf - cfe);
+  int n21 = (ce - cfe);
+  int n22 = (num_lines - ce - cf + cfe);
+  const int total = n11 + n12 + n21 + n22;
+
+  // probability of the observed table
+  double term = exp(lgamma(1+n11+n21) + lgamma(1+n12+n22) + lgamma(1+n11+n12) + lgamma(1+n21+n22)
+                    - lgamma(1+total) - lgamma(1+n11) - lgamma(1+n12) - lgamma(1+n21) - lgamma(1+n22));
+
+  double p_value = 0.0;
+  for (int left = std::min(n12, n21); left >= 0; --left) {
+    p_value += term;
+    // ratio between the next table's probability and the current one
+    term *= (double)(n12)*(double)(n21)/(double)(n11+1)/(double)(n22+1);
+    ++n11; --n21; ++n22; --n12;
+  }
+  return p_value;
+}
+
+// input: unordered list of translation options for a single source phrase
+//
+// 1. If a top-n limit is set (pfe_filter_limit > 0), keeps only the n entries
+//    with the largest pfe and deletes the rest.
+// 2. Unless pef_filter_only is set, looks up the sentences containing the
+//    source phrase and each target phrase, computes the co-occurrence counts
+//    and -log(Fisher p-value) for every entry, and deletes the entries whose
+//    value is below sig_filter_limit.
+// Removed entries are deleted and erased from `options`; the nremoved_*
+// counters are updated.
+//
+// An empty list is now a no-op: it used to reach options.front(), which is
+// undefined behaviour on an empty vector.
+void compute_cooc_stats_and_filter(std::vector<PTEntry*>& options)
+{
+  if (options.empty()) return;
+
+  if (pfe_filter_limit>0 && options.size() > (size_t)pfe_filter_limit) {
+    nremoved_pfefilter += (options.size() - pfe_filter_limit);
+    // partition so that the first pfe_filter_limit entries are the best ones
+    std::nth_element(options.begin(), options.begin()+pfe_filter_limit, options.end(), PfeComparer());
+    for (std::vector<PTEntry*>::iterator i=options.begin()+pfe_filter_limit; i != options.end(); ++i)
+      delete *i;
+    options.erase(options.begin()+pfe_filter_limit,options.end());
+  }
+  if (pef_filter_only) return;
+
+  // sentences containing the source phrase (shared by all options)
+  SentIdSet fset;
+  std::vector<S_SimplePhraseLocationElement> f_locations =
+    f_sa.locateExactPhraseInCorpus(options.front()->f_phrase.c_str());
+  if (f_locations.empty()) {
+    std::cerr<<"No occurrences found!!\n";
+  }
+  for (std::vector<S_SimplePhraseLocationElement>::iterator loc=f_locations.begin();
+       loc != f_locations.end();
+       ++loc)
+  {
+    fset.insert(loc->sentIdInCorpus);
+  }
+  const size_t cf = fset.size();
+
+  for (std::vector<PTEntry*>::iterator opt=options.begin(); opt != options.end(); ++opt) {
+    PTEntry* entry = *opt;
+    const std::string& e_phrase = entry->e_phrase;
+
+    // sentences containing the target phrase, looked up unless already cached
+    SentIdSet& eset = esets[e_phrase];
+    if (eset.empty()) {
+      std::vector<S_SimplePhraseLocationElement> e_locations =
+        e_sa.locateExactPhraseInCorpus(e_phrase.c_str());
+      for (std::vector<S_SimplePhraseLocationElement>::iterator loc=e_locations.begin();
+           loc != e_locations.end(); ++loc) {
+        eset.insert(loc->sentIdInCorpus);
+      }
+    }
+    const size_t ce = eset.size();
+
+    // size of the intersection; walk the smaller set, probe the larger one
+    const SentIdSet& smaller = (ce < cf) ? eset : fset;
+    const SentIdSet& larger  = (ce < cf) ? fset : eset;
+    size_t cef = 0;
+    for (SentIdSet::const_iterator s=smaller.begin(); s != smaller.end(); ++s) {
+      if (larger.find(*s) != larger.end()) cef++;
+    }
+
+    double nlp = -log(fisher_exact(cef, cf, ce));
+    entry->set_cooc_stats(cef, cf, ce, nlp);
+
+    // only large sentence sets stay cached (see MINIMUM_SIZE_TO_KEEP);
+    // eset must not be used after this point
+    if (ce < MINIMUM_SIZE_TO_KEEP) {
+      esets.erase(e_phrase);
+    }
+  }
+
+  // the predicate deletes every entry it rejects
+  std::vector<PTEntry*>::iterator new_end =
+    std::remove_if(options.begin(), options.end(), NlogSigThresholder(sig_filter_limit));
+  nremoved_sigfilter += (options.end() - new_end);
+  options.erase(new_end,options.end());
+}
+
+// Prints every entry left in `options` to stdout, frees it and empties the list.
+static void emit_and_free(std::vector<PTEntry*>& options)
+{
+  for (std::vector<PTEntry*>::iterator i=options.begin(); i != options.end(); ++i) {
+    std::cout << **i << std::endl;
+    delete *i;
+  }
+  options.clear();
+}
+
+// Entry point: parses the options, loads the two suffix arrays (unless only
+// the top-n filter is requested), then streams the phrase table from stdin,
+// filtering each run of consecutive lines that share a source phrase and
+// writing the survivors to stdout. A summary is printed to stderr.
+//
+// Fixes over the previous version:
+//  - lines are read with std::getline into a std::string; the former
+//    10000-byte cin.getline buffer set failbit on a longer line and the
+//    `while(!cin.eof())` loop then never terminated;
+//  - the final group is only filtered when it is non-empty (empty input used
+//    to reach options.front() on an empty vector);
+//  - a non-numeric -l value is rejected instead of silently becoming 0,
+//    which disabled the significance filter;
+//  - the line counter no longer includes the final failed read.
+int main(int argc, char * argv[]){
+  int c;
+  const char* efile=0;
+  const char* ffile=0;
+  int pfe_index = 2;
+  while ((c = getopt(argc, argv, "cpf:e:i:n:l:")) != -1) {
+    switch (c) {
+      case 'e':
+        efile = optarg;
+        break;
+      case 'f':
+        ffile = optarg;
+        break;
+      case 'i': // index of pfe in phrase table
+        pfe_index = atoi(optarg);
+        break;
+      case 'n': // keep only the top n entries in phrase table sorted by p(f|e) (0=all)
+        pfe_filter_limit = atoi(optarg);
+        std::cerr << "P(f|e) filter limit: " << pfe_filter_limit << std::endl;
+        break;
+      case 'c':
+        print_cooc_counts = true;
+        break;
+      case 'p':
+        print_neglog_significance = true;
+        break;
+      case 'l': {
+        std::cerr << "-l = " << optarg << "\n";
+        const std::string limit(optarg);
+        if (limit == "a+e") {
+          sig_filter_limit = ALPHA_PLUS_EPS;
+        } else if (limit == "a-e") {
+          sig_filter_limit = ALPHA_MINUS_EPS;
+        } else {
+          char *end = 0;
+          sig_filter_limit = strtod(optarg, &end);
+          // end == optarg: nothing numeric could be parsed
+          if (end == optarg || sig_filter_limit < 0.0) {
+            std::cerr << "Filter limit (-l) must be either 'a+e', 'a-e' or a real number >= 0.0\n";
+            usage();
+          }
+        }
+        break;
+      }
+      default:
+        usage();
+    }
+  }
+  // without a significance threshold only the top-n filter is applied
+  if (sig_filter_limit == 0.0) pef_filter_only = true;
+  //-----------------------------------------------------------------------------
+  if (optind != argc || ((!efile || !ffile) && !pef_filter_only)) {
+    usage();
+  }
+
+  //load the indexed corpus with vocabulary(noVoc=false) and with offset(noOffset=false)
+  if (!pef_filter_only) {
+    e_sa.loadData_forSearch(efile, false, false);
+    f_sa.loadData_forSearch(ffile, false, false);
+    size_t elines = e_sa.returnTotalSentNumber();
+    size_t flines = f_sa.returnTotalSentNumber();
+    if (elines != flines) {
+      std::cerr << "Number of lines in e-corpus != number of lines in f-corpus!\n";
+      usage();
+    } else {
+      std::cerr << "Training corpus: " << elines << " lines\n";
+      num_lines = elines;
+    }
+    // alpha: -log significance of a pair seen once, each side seen once
+    p_111 = -log(fisher_exact(1,1,1));
+    std::cerr << "\\alpha = " << p_111 << "\n";
+    // resolve the symbolic 'a-e' / 'a+e' settings now that alpha is known
+    if (sig_filter_limit == ALPHA_MINUS_EPS) { sig_filter_limit = p_111 - 0.001; }
+    else if (sig_filter_limit == ALPHA_PLUS_EPS) { sig_filter_limit = p_111 + 0.001; }
+    std::cerr << "Sig filter threshold is = " << sig_filter_limit << "\n";
+  } else {
+    std::cerr << "Filtering using P(e|f) only. n=" << pfe_filter_limit << std::endl;
+  }
+
+  std::string line;
+  std::string prev = "";
+  std::vector<PTEntry*> options; // entries of the current source phrase
+  size_t pt_lines = 0;
+  while (std::getline(std::cin, line)) {
+    // progress indicator on stderr
+    if(++pt_lines%10000==0)
+    {
+      std::cerr << ".";
+      if(pt_lines%500000==0) std::cerr << "[n:"<<pt_lines<<"]\n";
+    }
+
+    if (line.empty()) continue;
+
+    PTEntry* pp = new PTEntry(line, pfe_index);
+    if (prev != pp->f_phrase) {
+      // a new source phrase starts: filter and flush the previous group
+      prev = pp->f_phrase;
+      if (!options.empty()) { // always true after first line
+        compute_cooc_stats_and_filter(options);
+      }
+      emit_and_free(options);
+    }
+    options.push_back(pp);
+  }
+  // flush the last group, if there was any input at all
+  if (!options.empty()) {
+    compute_cooc_stats_and_filter(options);
+  }
+  emit_and_free(options);
+
+  // avoid 0/0 in the percentages when nothing was read
+  float pfefper = pt_lines ? (100.0*(float)nremoved_pfefilter)/(float)pt_lines : 0.0f;
+  float sigfper = pt_lines ? (100.0*(float)nremoved_sigfilter)/(float)pt_lines : 0.0f;
+  std::cerr << "\n\n------------------------------------------------------\n"
+    << " unfiltered phrases pairs: " << pt_lines << "\n"
+    << "\n"
+    << " P(f|e) filter [first]: " << nremoved_pfefilter << " (" << pfefper << "%)\n"
+    << " significance filter: " << nremoved_sigfilter << " (" << sigfper << "%)\n"
+    << " TOTAL FILTERED: " << (nremoved_pfefilter + nremoved_sigfilter) << " (" << (sigfper + pfefper) << "%)\n"
+    << "\n"
+    << " FILTERED phrase pairs: " << (pt_lines - nremoved_pfefilter - nremoved_sigfilter) << " (" << (100.0-sigfper - pfefper) << "%)\n"
+    << "------------------------------------------------------\n";
+
+  return 0;
+}
diff --git a/sigtest-filter/sigtest-filter.sln b/sigtest-filter/sigtest-filter.sln
new file mode 100644
index 000000000..517b06238
--- /dev/null
+++ b/sigtest-filter/sigtest-filter.sln
@@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 9.00
+# Visual Studio 2005
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "sigtest-filter", "sigtest-filter.vcproj", "{FA2910DF-FD9D-4E6D-A393-9F9F9E309E78}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|Win32 = Debug|Win32
+ Release|Win32 = Release|Win32
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {FA2910DF-FD9D-4E6D-A393-9F9F9E309E78}.Debug|Win32.ActiveCfg = Debug|Win32
+ {FA2910DF-FD9D-4E6D-A393-9F9F9E309E78}.Debug|Win32.Build.0 = Debug|Win32
+ {FA2910DF-FD9D-4E6D-A393-9F9F9E309E78}.Release|Win32.ActiveCfg = Release|Win32
+ {FA2910DF-FD9D-4E6D-A393-9F9F9E309E78}.Release|Win32.Build.0 = Release|Win32
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+EndGlobal
diff --git a/sigtest-filter/sigtest-filter.vcproj b/sigtest-filter/sigtest-filter.vcproj
new file mode 100644
index 000000000..a961ac61d
--- /dev/null
+++ b/sigtest-filter/sigtest-filter.vcproj
@@ -0,0 +1,237 @@
+<?xml version="1.0" encoding="Windows-1252"?>
+<VisualStudioProject
+ ProjectType="Visual C++"
+ Version="8.00"
+ Name="sigtest-filter"
+ ProjectGUID="{FA2910DF-FD9D-4E6D-A393-9F9F9E309E78}"
+ RootNamespace="sigtestfilter"
+ Keyword="Win32Proj"
+ >
+ <Platforms>
+ <Platform
+ Name="Win32"
+ />
+ </Platforms>
+ <ToolFiles>
+ </ToolFiles>
+ <Configurations>
+ <Configuration
+ Name="Debug|Win32"
+ OutputDirectory="$(SolutionDir)$(ConfigurationName)"
+ IntermediateDirectory="$(ConfigurationName)"
+ ConfigurationType="1"
+ UseOfMFC="2"
+ CharacterSet="1"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="0"
+ AdditionalIncludeDirectories="..\..\..\SALM\Src\SuffixArrayApplications\SuffixArraySearch;..\..\..\SALM\Src\SuffixArrayApplications;..\..\..\SALM\Src\Shared"
+ PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE"
+ MinimalRebuild="true"
+ BasicRuntimeChecks="3"
+ RuntimeLibrary="3"
+ UsePrecompiledHeader="0"
+ WarningLevel="3"
+ Detect64BitPortabilityProblems="true"
+ DebugInformationFormat="4"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLinkerTool"
+ LinkIncremental="2"
+ GenerateDebugInformation="true"
+ SubSystem="1"
+ TargetMachine="1"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCManifestTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCAppVerifierTool"
+ />
+ <Tool
+ Name="VCWebDeploymentTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ />
+ </Configuration>
+ <Configuration
+ Name="Release|Win32"
+ OutputDirectory="$(SolutionDir)$(ConfigurationName)"
+ IntermediateDirectory="$(ConfigurationName)"
+ ConfigurationType="1"
+ UseOfMFC="2"
+ CharacterSet="1"
+ WholeProgramOptimization="1"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ AdditionalIncludeDirectories="..\..\..\SALM\Src\SuffixArrayApplications\SuffixArraySearch;..\..\..\SALM\Src\SuffixArrayApplications;..\..\..\SALM\Src\Shared"
+ PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE"
+ RuntimeLibrary="2"
+ UsePrecompiledHeader="0"
+ WarningLevel="3"
+ Detect64BitPortabilityProblems="true"
+ DebugInformationFormat="3"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLinkerTool"
+ LinkIncremental="1"
+ GenerateDebugInformation="true"
+ SubSystem="1"
+ OptimizeReferences="2"
+ EnableCOMDATFolding="2"
+ TargetMachine="1"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCManifestTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCAppVerifierTool"
+ />
+ <Tool
+ Name="VCWebDeploymentTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ />
+ </Configuration>
+ </Configurations>
+ <References>
+ </References>
+ <Files>
+ <Filter
+ Name="Source Files"
+ Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
+ UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
+ >
+ <File
+ RelativePath="..\..\..\SALM\Src\Shared\_IDVocabulary.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\..\..\SALM\Src\Shared\_String.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\..\..\SALM\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\..\..\SALM\Src\SuffixArrayApplications\SuffixArraySearch\_SuffixArraySearchApplicationBase.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\filter-pt.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\WIN32_functions.cpp"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="Header Files"
+ Filter="h;hpp;hxx;hm;inl;inc;xsd"
+ UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
+ >
+ <File
+ RelativePath="..\..\..\SALM\Src\Shared\_IDVocabulary.h"
+ >
+ </File>
+ <File
+ RelativePath="..\..\..\SALM\Src\SuffixArrayApplications\_SuffixArrayApplicationBase.h"
+ >
+ </File>
+ <File
+ RelativePath="..\..\..\SALM\Src\SuffixArrayApplications\SuffixArraySearch\_SuffixArraySearchApplicationBase.h"
+ >
+ </File>
+ <File
+ RelativePath=".\WIN32_functions.h"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="Resource Files"
+ Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav"
+ UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}"
+ >
+ </Filter>
+ </Files>
+ <Globals>
+ </Globals>
+</VisualStudioProject>