(Almost) maximally memory-efficient reimplementation of the cooccurrence finder. Uses as much as 10x less memory than the standard implementation and is only marginally slower (~1.5x) -- this will, obviously, depend on the corpora. WILL NOT COMPUTE COUNTS. Finds vocabulary cooccurrences only!

author: redpony <redpony@ca354974-eb3a-0410-8f5c-d3948404989b> 2007-12-14 03:14:11 +0300
committer: redpony <redpony@ca354974-eb3a-0410-8f5c-d3948404989b> 2007-12-14 03:14:11 +0300
commit: 637b65f2cea7ea00ce3b653ac519e69ba286fcf9 (patch)
tree: 20032e91d6afb95d807771fa0cbba7eede3a4d65
parent: 2429fa62d9fb266d42d9af8efd4a861627f0858c (diff)
2 files changed, 94 insertions, 0 deletions
diff --git a/GIZA++-v2/Makefile b/GIZA++-v2/Makefile
index b81a02a..1baa134 100644
--- a/GIZA++-v2/Makefile
+++ b/GIZA++-v2/Makefile
@@ -136,3 +136,6 @@ plain2snt.out: plain2snt.cpp
 snt2cooc.out: snt2cooc.cpp
 	$(CXX) -O3 -g -W -Wall -Wno-deprecated snt2cooc.cpp -o snt2cooc.out
 
+small_snt2cooc.out: small_snt2cooc.cpp
+	$(CXX) -O3 -g small_snt2cooc.cpp -o small_snt2cooc.out
+
diff --git a/GIZA++-v2/small_snt2cooc.cpp b/GIZA++-v2/small_snt2cooc.cpp
new file mode 100644
index 0000000..1ce7648
--- /dev/null
+++ b/GIZA++-v2/small_snt2cooc.cpp
@@ -0,0 +1,91 @@
+#include <iostream>
+#include <string>
+#include <sstream>
+#include <fstream>
+#include <vector>
+#include <algorithm>
+
+
+using namespace std;
+
+class myset {  // Minimal set of ints stored in a vector that insert() keeps sorted and duplicate-free.
+  private:
+    vector<int> data;  // elements in ascending order; ordering is maintained only by insert()
+    void check_cap() {  // reserve extra room in small steps once fewer than 3 free slots remain
+      size_t dc = data.capacity();
+      if (dc - data.size() < 3) {
+        if (dc < 4) { dc = 2; }  // very small capacity: restart from 2 so the doubling below gives 4
+        if (dc < 18) { dc*=2; } else { dc+=15; }  // double while small, then grow by a fixed 15 (presumably to keep unused capacity low)
+	data.reserve(dc);  // may reallocate, which invalidates iterators into data
+      }
+    }
+  public:
+    typedef vector<int>::iterator iterator;
+    void insert(int x) {  // add x at its sorted position; does nothing if x is already stored
+      if (data.size() == 0) { data.push_back(x); return; }  // first element needs no search
+      vector<int>::iterator p = lower_bound(data.begin(), data.end(), x);  // first position whose value is not less than x
+      int i = p - data.begin();  // remember the position as an index so it survives a reallocation
+      if (i >= data.size()) { check_cap(); data.push_back(x); return; }  // x is larger than every element: append. NOTE(review): int compared with size_t
+      if (*p == x) return;  // already present, keep the set duplicate-free
+      check_cap();
+      data.insert(data.begin() + i,x);  // uses the index rather than p because check_cap() may have reallocated
+    }
+    iterator begin() { return data.begin(); }  // iteration visits the elements in ascending order
+    iterator end() { return data.end(); }
+};
+
+//#include <set>
+// typedef std::set<int> intset;
+//#include <ext/hash_set>
+// typedef __gnu_cxx::hash_set<int> intset;
+typedef myset intset;
+
+
+int main(int argc,char **argv)  // Reads the file named by argv[1] in groups of three lines and prints every distinct id pair once to stdout.
+{
+  if( argc!=2 )
+    {
+      cerr << "Usage: " << argv[0] << " snt12 \n";
+      cerr << "Converts GIZA++ snt-format into plain text.\n";  // NOTE(review): this description does not match the code below, which prints id pairs
+      exit(1);  // NOTE(review): exit and atoi are declared in <cstdlib>, which this file does not include directly
+    }
+  ifstream t(argv[1]);  // NOTE(review): the stream is never checked; if it cannot be opened the read loop simply does not run
+  string line1,line2,line3;
+  vector<intset> vsi(400000);  // vsi[a] collects the ids from line 3 seen together with id a from line 2; sized up front for ids below 400000
+  int nLine=0;  // number of three-line records processed so far
+  int totalElems=0;  // NOTE(review): never read or updated anywhere in this function
+  while(getline(t,line1)&&getline(t,line2)&&getline(t,line3))  // one record = three lines; stops at end of input or at an incomplete record
+    {
+      istringstream eingabe1(line1.c_str()),eingabe2(line2.c_str()),eingabe3(line3.c_str());
+      double count;
+      string word;
+      eingabe1>>count;  // line 1 is parsed as a number but the value is not used afterwards
+      vector<int>l1,l2;  // ids from line 2 and line 3 of the record
+      while(eingabe2>>word)
+	l1.push_back(atoi(word.c_str()));
+      while(eingabe3>>word)
+	l2.push_back(atoi(word.c_str()));
+      if( ((++nLine)%1000)==0 )
+	cerr << "line " << nLine << '\n'; // progress report on stderr every 1000 records
+      for(unsigned int j=0;j<l2.size();++j)
+	vsi[0].insert(l2[j]);  // every line-3 id is also recorded under index 0 (presumably the empty/NULL word; confirm against GIZA++)
+      for(unsigned int i=0;i<l1.size();++i)
+	{
+	  if( l1[i]>=int(vsi.size()) )  // id beyond the current table: grow it before indexing
+	    {
+	      cerr << "I have to resize: " << l1[i] << endl;
+	      vsi.resize(l1[i]+1000);  // leaves headroom of about 1000 ids so that not every new id triggers a resize
+	    }
+	  intset&theset=vsi[l1[i]];  // NOTE(review): a negative l1[i] is not guarded against and would index out of bounds
+	  for(unsigned int j=0;j<l2.size();++j)
+	    theset.insert(l2[j]);
+	}
+     }
+  int vi = 0;  // index of the set currently being printed
+  for(vector<intset>::iterator i=vsi.begin();i != vsi.end(); ++i) {
+    for(intset::iterator j=i->begin();j!=i->end();++j)
+      cout << vi << " " << *j << endl;  // one pair per line; NOTE(review): endl flushes stdout on every pair
+    ++vi;
+  }
+}
+
author	redpony <redpony@ca354974-eb3a-0410-8f5c-d3948404989b>	2007-12-14 03:14:11 +0300
committer	redpony <redpony@ca354974-eb3a-0410-8f5c-d3948404989b>	2007-12-14 03:14:11 +0300
commit	637b65f2cea7ea00ce3b653ac519e69ba286fcf9 (patch)
tree	20032e91d6afb95d807771fa0cbba7eede3a4d65
parent	2429fa62d9fb266d42d9af8efd4a861627f0858c (diff)