Welcome to mirror list, hosted at ThFree Co, Russian Federation.

sizes.cc « lm - github.com/kpu/kenlm.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: f0f514a1066f465854b0b75c095059a0ff7a1684 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#include "sizes.hh"
#include "model.hh"
#include "../util/file_piece.hh"

#include <vector>
#include <iomanip>

namespace lm {
namespace ngram {

void ShowSizes(const std::vector<uint64_t> &counts, const lm::ngram::Config &config) {
  uint64_t sizes[6];
  sizes[0] = ProbingModel::Size(counts, config);
  sizes[1] = RestProbingModel::Size(counts, config);
  sizes[2] = TrieModel::Size(counts, config);
  sizes[3] = QuantTrieModel::Size(counts, config);
  sizes[4] = ArrayTrieModel::Size(counts, config);
  sizes[5] = QuantArrayTrieModel::Size(counts, config);
  uint64_t max_length = *std::max_element(sizes, sizes + sizeof(sizes) / sizeof(uint64_t));
  uint64_t min_length = *std::min_element(sizes, sizes + sizeof(sizes) / sizeof(uint64_t));
  uint64_t divide;
  char prefix;
  if (min_length < (1 << 10) * 10) {
    prefix = ' ';
    divide = 1;
  } else if (min_length < (1 << 20) * 10) {
    prefix = 'k';
    divide = 1 << 10;
  } else if (min_length < (1ULL << 30) * 10) {
    prefix = 'M';
    divide = 1 << 20;
  } else {
    prefix = 'G';
    divide = 1 << 30;
  }
  long int length = std::max<long int>(2, static_cast<long int>(ceil(log10((double) max_length / divide))));
  std::cerr << "Memory estimate for binary LM:\ntype    ";

  // right align bytes.
  for (long int i = 0; i < length - 2; ++i) std::cerr << ' ';

  std::cerr << prefix << "B\n"
    "probing " << std::setw(length) << (sizes[0] / divide) << " assuming -p " << config.probing_multiplier << "\n"
    "probing " << std::setw(length) << (sizes[1] / divide) << " assuming -r models -p " << config.probing_multiplier << "\n"
    "trie    " << std::setw(length) << (sizes[2] / divide) << " without quantization\n"
    "trie    " << std::setw(length) << (sizes[3] / divide) << " assuming -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits << " quantization \n"
    "trie    " << std::setw(length) << (sizes[4] / divide) << " assuming -a " << (unsigned)config.pointer_bhiksha_bits << " array pointer compression\n"
    "trie    " << std::setw(length) << (sizes[5] / divide) << " assuming -a " << (unsigned)config.pointer_bhiksha_bits << " -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits<< " array pointer compression and quantization\n";
}

void ShowSizes(const std::vector<uint64_t> &counts) {
  lm::ngram::Config config;
  ShowSizes(counts, config);
}

void ShowSizes(const char *file, const lm::ngram::Config &config) {
  std::vector<uint64_t> counts;
  util::FilePiece f(file);
  lm::ReadARPACounts(f, counts);
  ShowSizes(counts, config);
}

}} //namespaces