Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/giza-pp.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorredpony <redpony@ca354974-eb3a-0410-8f5c-d3948404989b>2007-12-14 03:14:11 +0300
committerredpony <redpony@ca354974-eb3a-0410-8f5c-d3948404989b>2007-12-14 03:14:11 +0300
commit637b65f2cea7ea00ce3b653ac519e69ba286fcf9 (patch)
tree20032e91d6afb95d807771fa0cbba7eede3a4d65
parent2429fa62d9fb266d42d9af8efd4a861627f0858c (diff)
(almost) maximally memory efficient reimplementation of cooccurrence finder. Uses as much as 10x less memory than the standard implementation and is only marginally slower (~ 1.5x)-- this will, obviously depend on the corpora. WILL NOT COMPUTE COUNTS. Finds vocabulary cooccurrences only!
-rw-r--r--GIZA++-v2/Makefile3
-rw-r--r--GIZA++-v2/small_snt2cooc.cpp91
2 files changed, 94 insertions, 0 deletions
diff --git a/GIZA++-v2/Makefile b/GIZA++-v2/Makefile
index b81a02a..1baa134 100644
--- a/GIZA++-v2/Makefile
+++ b/GIZA++-v2/Makefile
@@ -136,3 +136,6 @@ plain2snt.out: plain2snt.cpp
snt2cooc.out: snt2cooc.cpp
$(CXX) -O3 -g -W -Wall -Wno-deprecated snt2cooc.cpp -o snt2cooc.out
+small_snt2cooc.out: small_snt2cooc.cpp
+ $(CXX) -O3 -g small_snt2cooc.cpp -o small_snt2cooc.out
+
diff --git a/GIZA++-v2/small_snt2cooc.cpp b/GIZA++-v2/small_snt2cooc.cpp
new file mode 100644
index 0000000..1ce7648
--- /dev/null
+++ b/GIZA++-v2/small_snt2cooc.cpp
@@ -0,0 +1,91 @@
+#include <iostream>
+#include <string>
+#include <sstream>
+#include <fstream>
+#include <vector>
+#include <algorithm>
+
+
+using namespace std;
+
+class myset {
+ private:
+ vector<int> data;
+ void check_cap() {
+ size_t dc = data.capacity();
+ if (dc - data.size() < 3) {
+ if (dc < 4) { dc = 2; }
+ if (dc < 18) { dc*=2; } else { dc+=15; }
+ data.reserve(dc);
+ }
+ }
+ public:
+ typedef vector<int>::iterator iterator;
+ void insert(int x) {
+ if (data.size() == 0) { data.push_back(x); return; }
+ vector<int>::iterator p = lower_bound(data.begin(), data.end(), x);
+ int i = p - data.begin();
+ if (i >= data.size()) { check_cap(); data.push_back(x); return; }
+ if (*p == x) return;
+ check_cap();
+ data.insert(data.begin() + i,x);
+ }
+ iterator begin() { return data.begin(); }
+ iterator end() { return data.end(); }
+};
+
+//#include <set>
+// typedef std::set<int> intset;
+//#include <ext/hash_set>
+// typedef __gnu_cxx::hash_set<int> intset;
+typedef myset intset;
+
+
+int main(int argc,char **argv)
+{
+ if( argc!=2 )
+ {
+ cerr << "Usage: " << argv[0] << " snt12 \n";
+ cerr << "Converts GIZA++ snt-format into plain text.\n";
+ exit(1);
+ }
+ ifstream t(argv[1]);
+ string line1,line2,line3;
+ vector<intset> vsi(400000);
+ int nLine=0;
+ int totalElems=0;
+ while(getline(t,line1)&&getline(t,line2)&&getline(t,line3))
+ {
+ istringstream eingabe1(line1.c_str()),eingabe2(line2.c_str()),eingabe3(line3.c_str());
+ double count;
+ string word;
+ eingabe1>>count;
+ vector<int>l1,l2;
+ while(eingabe2>>word)
+ l1.push_back(atoi(word.c_str()));
+ while(eingabe3>>word)
+ l2.push_back(atoi(word.c_str()));
+ if( ((++nLine)%1000)==0 )
+ cerr << "line " << nLine << '\n';
+ for(unsigned int j=0;j<l2.size();++j)
+ vsi[0].insert(l2[j]);
+ for(unsigned int i=0;i<l1.size();++i)
+ {
+ if( l1[i]>=int(vsi.size()) )
+ {
+ cerr << "I have to resize: " << l1[i] << endl;
+ vsi.resize(l1[i]+1000);
+ }
+ intset&theset=vsi[l1[i]];
+ for(unsigned int j=0;j<l2.size();++j)
+ theset.insert(l2[j]);
+ }
+ }
+ int vi = 0;
+ for(vector<intset>::iterator i=vsi.begin();i != vsi.end(); ++i) {
+ for(intset::iterator j=i->begin();j!=i->end();++j)
+ cout << vi << " " << *j << endl;
+ ++vi;
+ }
+}
+