lm/builder/pipeline.hh


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73

#ifndef LM_BUILDER_PIPELINE_H
#define LM_BUILDER_PIPELINE_H

#include "lm/builder/adjust_counts.hh"
#include "lm/builder/initial_probabilities.hh"
#include "lm/builder/header_info.hh"
#include "lm/lm_exception.hh"
#include "lm/word_index.hh"
#include "util/stream/config.hh"
#include "util/file_piece.hh"

#include <string>
#include <cstddef>

namespace lm { namespace builder {

struct PipelineConfig {
  std::size_t order;
  std::string vocab_file;
  util::stream::SortConfig sort;
  InitialProbabilitiesConfig initial_probs;
  util::stream::ChainConfig read_backoffs;

  // Include a header in the ARPA with some statistics?
  bool verbose_header;

  // Estimated vocabulary size.  Used for sizing CorpusCount memory and
  // initial probing hash table sizing, also in CorpusCount.
  lm::WordIndex vocab_estimate;

  // Minimum block size to tolerate.
  std::size_t minimum_block;

  // Number of blocks to use.  This will be overridden to 1 if everything fits.
  std::size_t block_count;

  // n-gram count thresholds for pruning. 0 values means no pruning for
  // corresponding n-gram order
  std::vector<uint64_t> prune_thresholds; //mjd

  // What to do with discount failures.
  DiscountConfig discount;

  // Compute collapsed q values instead of probability and backoff
  bool output_q;
  
  /* Computing the perplexity of LMs with different vocabularies is hard.  For
   * example, the lowest perplexity is attained by a unigram model that
   * predicts p(<unk>) = 1 and has no other vocabulary.  Also, linearly
   * interpolated models will sum to more than 1 because <unk> is duplicated
   * (SRI just pretends p(<unk>) = 0 for these purposes, which makes it sum to
   * 1 but comes with its own problems).  This option will make the vocabulary
   * a particular size by replicating <unk> multiple times for purposes of
   * computing vocabulary size.  It has no effect if the actual vocabulary is
   * larger.  This parameter serves the same purpose as IRSTLM's "dub".
   */
  uint64_t vocab_size_for_unk;

  /* What to do the first time <s>, </s>, or <unk> appears in the input.  If
   * this is anything but THROW_UP, then the symbol will always be treated as
   * whitespace.
   */
  WarningAction disallowed_symbol_action;

  const std::string &TempPrefix() const { return sort.temp_prefix; }
  std::size_t TotalMemory() const { return sort.total_memory; }
};

// Takes ownership of text_file and out_arpa.
void Pipeline(PipelineConfig config, int text_file, int out_arpa);

}} // namespaces
#endif // LM_BUILDER_PIPELINE_H