diff options
author | redpony <redpony@ca354974-eb3a-0410-8f5c-d3948404989b> | 2007-12-14 03:14:11 +0300 |
---|---|---|
committer | redpony <redpony@ca354974-eb3a-0410-8f5c-d3948404989b> | 2007-12-14 03:14:11 +0300 |
commit | 637b65f2cea7ea00ce3b653ac519e69ba286fcf9 (patch) | |
tree | 20032e91d6afb95d807771fa0cbba7eede3a4d65 | |
parent | 2429fa62d9fb266d42d9af8efd4a861627f0858c (diff) |
(Almost) maximally memory-efficient reimplementation of the cooccurrence finder. Uses up to 10x less memory than the standard implementation and is only marginally slower (~1.5x) -- this will, obviously, depend on the corpora. WILL NOT COMPUTE COUNTS. Finds vocabulary cooccurrences only!
-rw-r--r-- | GIZA++-v2/Makefile | 3 | ||||
-rw-r--r-- | GIZA++-v2/small_snt2cooc.cpp | 91 |
2 files changed, 94 insertions, 0 deletions
// GIZA++-v2/small_snt2cooc.cpp
//
// Low-memory vocabulary cooccurrence finder.
//
// Reads a file in which every sentence pair occupies three consecutive lines
// (a count line followed by two lines of whitespace-separated integer word
// ids) and prints one "<row> <id>" line for every distinct pair found:
//   * row 0 lists every id seen on a third line;
//   * row w lists every third-line id that shared a sentence pair with
//     second-line id w.
// Only the pairs are printed; no cooccurrence counts are computed.
//
// Build rule added to GIZA++-v2/Makefile together with this file:
//   small_snt2cooc.out: small_snt2cooc.cpp
//       $(CXX) -O3 -g small_snt2cooc.cpp -o small_snt2cooc.out

#include <algorithm>
#include <cstddef>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Sorted, duplicate-free collection of ints kept in a single vector.
// Capacity is requested explicitly in check_cap() instead of being left to
// the vector's own growth policy.
class myset {
 private:
  std::vector<int> data;

  // Requests more capacity when fewer than three unused slots remain.
  // Capacities below 18 are doubled (values below 4 count as 2, giving 4);
  // larger capacities grow by a fixed 15 slots.
  void check_cap() {
    std::size_t dc = data.capacity();
    if (dc - data.size() < 3) {
      if (dc < 4) { dc = 2; }
      if (dc < 18) { dc *= 2; } else { dc += 15; }
      data.reserve(dc);
    }
  }

 public:
  typedef std::vector<int>::iterator iterator;

  // Inserts x at its sorted position; does nothing if x is already stored.
  void insert(int x) {
    const iterator pos = std::lower_bound(data.begin(), data.end(), x);
    if (pos != data.end() && *pos == x) return;
    // check_cap() may reallocate and invalidate pos, so keep the offset.
    const std::vector<int>::difference_type offset = pos - data.begin();
    // The very first element is stored without requesting extra capacity.
    if (!data.empty()) check_cap();
    data.insert(data.begin() + offset, x);
  }

  iterator begin() { return data.begin(); }
  iterator end() { return data.end(); }
};

// Alternative definitions of intset kept from the original source:
//   #include <set>
//   typedef std::set<int> intset;
//   #include <ext/hash_set>
//   typedef __gnu_cxx::hash_set<int> intset;
typedef myset intset;

// Appends every whitespace-separated token of `line`, converted with
// std::atoi, to `ids`.
static void parse_ids(const std::string& line, std::vector<int>& ids) {
  std::istringstream in(line);
  std::string word;
  while (in >> word) ids.push_back(std::atoi(word.c_str()));
}

// Usage: small_snt2cooc <snt-file>
// Returns 0 on success, 1 on a usage error or when the input cannot be
// opened.  Progress and diagnostics go to stderr, the pairs go to stdout.
int main(int argc, char** argv) {
  if (argc != 2) {
    std::cerr << "Usage: " << argv[0] << " snt12 \n";
    std::cerr << "Lists the vocabulary cooccurrences of a GIZA++ snt file "
                 "(no counts).\n";
    return 1;
  }

  std::ifstream t(argv[1]);
  if (!t) {
    // Without this check a missing file silently produced empty output.
    std::cerr << "Cannot open input file: " << argv[1] << '\n';
    return 1;
  }

  std::string line1, line2, line3;
  std::vector<intset> vsi(400000);
  std::vector<int> l1, l2;  // reused for every sentence pair
  int nLine = 0;

  // line1 (the count line) is consumed but its content is not needed.
  while (std::getline(t, line1) && std::getline(t, line2) &&
         std::getline(t, line3)) {
    l1.clear();
    l2.clear();
    parse_ids(line2, l1);
    parse_ids(line3, l2);

    if ((++nLine) % 1000 == 0) std::cerr << "line " << nLine << '\n';

    for (std::size_t j = 0; j < l2.size(); ++j) vsi[0].insert(l2[j]);

    for (std::size_t i = 0; i < l1.size(); ++i) {
      const int id = l1[i];
      if (id < 0) {
        // A negative id cannot be used as a row index.
        std::cerr << "Skipping negative word id: " << id << '\n';
        continue;
      }
      const std::size_t row = static_cast<std::size_t>(id);
      if (row >= vsi.size()) {
        std::cerr << "I have to resize: " << id << std::endl;
        vsi.resize(row + 1000);
      }
      intset& theset = vsi[row];
      for (std::size_t j = 0; j < l2.size(); ++j) theset.insert(l2[j]);
    }
  }

  // '\n' instead of std::endl: same bytes on stdout, but no flush per pair.
  for (std::size_t vi = 0; vi < vsi.size(); ++vi) {
    intset& row = vsi[vi];
    for (intset::iterator j = row.begin(); j != row.end(); ++j)
      std::cout << vi << " " << *j << '\n';
  }
  return 0;
}