Welcome to mirror list, hosted at ThFree Co, Russian Federation.

count_io.hh « filter « lm - github.com/kpu/kenlm.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: a350f477feda6f147af500eb4d8fee85ae9b968b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#ifndef LM_FILTER_COUNT_IO_H
#define LM_FILTER_COUNT_IO_H

#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#include "../../util/file_stream.hh"
#include "../../util/file.hh"
#include "../../util/file_piece.hh"

namespace lm {

class CountOutput : boost::noncopyable {
  public:
    explicit CountOutput(const char *name) : file_(util::CreateOrThrow(name)) {}

    void AddNGram(const StringPiece &line) {
      file_ << line << '\n';
    }

    template <class Iterator> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line) {
      AddNGram(line);
    }

    void AddNGram(const StringPiece &ngram, const StringPiece &line) {
      AddNGram(line);
    }

  private:
    util::FileStream file_;
};

/**
 * Reusable buffer holding a batch of n-gram count lines.
 *
 * Read() fills the buffer from a stream, extending the bulk read up to the
 * next newline.  Send() splits the buffered text on '\n' and forwards each
 * line to an output object.
 */
class CountBatch {
  public:
    /**
     * @param initial_read Number of bytes Read() requests in one bulk read
     *   before it continues character by character to the next newline.
     */
    explicit CountBatch(std::streamsize initial_read)
      : initial_read_(initial_read) {
      buffer_.reserve(initial_read);
    }

    /**
     * Replaces the buffer contents with up to initial_read bytes from in,
     * then appends further characters until a '\n' is consumed or reading
     * fails.  The terminating '\n' itself is not stored.
     */
    void Read(std::istream &in) {
      buffer_.resize(initial_read_);
      // Taking the address of the first element of an empty vector is
      // undefined, so only do the bulk read when there is room to read into.
      if (!buffer_.empty()) {
        in.read(&buffer_[0], initial_read_);
        // Keep only the bytes that were actually read.
        buffer_.resize(in.gcount());
      }
      char got;
      while (in.get(got) && got != '\n')
        buffer_.push_back(got);
    }

    /**
     * Splits the buffer on '\n' and, for each line, calls
     * out.AddNGram(words_begin, words_end, line), where the words are the
     * space-separated tokens of the line's first tab-delimited field.
     * Lines that yield no field or no words are reported on stderr and
     * skipped.  An empty buffer sends nothing.
     */
    template <class Output> void Send(Output &out) {
      // Nothing was buffered (e.g. Read hit end of input); forming a pointer
      // to the first element would be undefined.
      if (buffer_.empty()) return;
      for (util::TokenIter<util::SingleCharacter> line(StringPiece(&buffer_[0], buffer_.size()), '\n'); line; ++line) {
        util::TokenIter<util::SingleCharacter> tabber(*line, '\t');
        if (!tabber) {
          std::cerr << "Warning: empty n-gram count line being removed\n";
          continue;
        }
        util::TokenIter<util::SingleCharacter, true> words(*tabber, ' ');
        if (!words) {
          std::cerr << "Line has a tab but no words.\n";
          continue;
        }
        out.AddNGram(words, util::TokenIter<util::SingleCharacter, true>::end(), *line);
      }
    }

  private:
    // Size of the bulk read performed at the start of Read().
    std::streamsize initial_read_;

    // This could have been a std::string but that's less happy with raw writes.
    std::vector<char> buffer_;
};

/**
 * Feeds every line of in_file to out until the input is exhausted.
 *
 * Each line is tokenized on tab; the first tab-delimited field is passed to
 * out.AddNGram together with the complete line.  A line for which the tab
 * tokenizer yields nothing is reported on stderr and skipped.  The loop has
 * no other exit: it ends when util::EndOfFileException propagates out of it,
 * and that exception is caught and discarded here.
 */
template <class Output> void ReadCount(util::FilePiece &in_file, Output &out) {
  try {
    for (;;) {
      StringPiece entry = in_file.ReadLine();
      util::TokenIter<util::SingleCharacter> fields(entry, '\t');
      if (fields) {
        out.AddNGram(*fields, entry);
      } else {
        std::cerr << "Warning: empty n-gram count line being removed\n";
      }
    }
  } catch (const util::EndOfFileException &) {
    // End of input is the normal way out of the loop above.
  }
}

} // namespace lm

#endif // LM_FILTER_COUNT_IO_H