1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
|
// count words in a memory-mapped corpus
#include "ug_mm_ttrack.h"
#include "tpt_tokenindex.h"
#include "ug_corpus_token.h"
#include <string>
#include <vector>
#include <cassert>
#include <boost/unordered_map.hpp>
#include <boost/foreach.hpp>
#include <iomanip>
#include "ug_typedefs.h"
#include "tpt_pickler.h"
// #include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
// #include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
// #include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
#include <algorithm>
#include "moses/TranslationModel/UG/generic/program_options/ug_get_options.h"
using namespace std;
using namespace ugdiss;
using namespace Moses;
typedef L2R_Token<SimpleWordId> Token;
// typedef mmTSA<Token>::tree_iterator iter;
typedef boost::unordered_map<pair<size_t,size_t>,size_t> phrase_counter_t;
#define CACHING_THRESHOLD 1000
mmTtrack<Token> T; // token tracks
TokenIndex V; // vocabs
// mmTSA<Token> I; // suffix arrays
void interpret_args(int ac, char* av[]);
string bname;
bool echo;
int main(int argc, char* argv[])
{
interpret_args(argc,argv);
T.open(bname+".mct");
V.open(bname+".tdx");
vector<size_t> cnt(V.ksize(),0);
for (size_t sid = 0; sid < T.size(); ++sid)
{
Token const* stop = T.sntEnd(sid);
for (Token const* t = T.sntStart(sid); t < stop; ++cnt[(t++)->id()]);
}
for (size_t wid = 2; wid < V.ksize(); ++wid)
cout << V[wid] << " " << cnt[wid] << endl;
exit(0);
}
void
interpret_args(int ac, char* av[])
{
namespace po=boost::program_options;
po::variables_map vm;
po::options_description o("Options");
po::options_description h("Hidden Options");
po::positional_options_description a;
o.add_options()
("help,h", "print this message")
;
h.add_options()
("bname", po::value<string>(&bname), "base name")
;
a.add("bname",1);
get_options(ac,av,h.add(o),a,vm);
}
|