Welcome to mirror list, hosted at ThFree Co, Russian Federation.

count_io.hh « filter « lm - github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 740b8d50ef88f150761ec990f4bc09941ef182a8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#ifndef LM_FILTER_COUNT_IO__
#define LM_FILTER_COUNT_IO__

#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#if !defined __MINGW32__
#include <err.h>
#endif

#include "util/file_piece.hh"

namespace lm {

class CountOutput : boost::noncopyable {
  public:
    explicit CountOutput(const char *name) : file_(name, std::ios::out) {}

    void AddNGram(const StringPiece &line) {
      if (!(file_ << line << '\n')) {
#if defined __MINGW32__
        std::cerr<<"Writing counts file failed"<<std::endl;
        exit(3);
#else
        err(3, "Writing counts file failed");
#endif
      }
    }

    template <class Iterator> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line) {
      AddNGram(line);
    }

    void AddNGram(const StringPiece &ngram, const StringPiece &line) {
      AddNGram(line);
    }

  private:
    std::fstream file_;
};

class CountBatch {
  public:
    explicit CountBatch(std::streamsize initial_read)
      : initial_read_(initial_read) {
      buffer_.reserve(initial_read);
    }

    void Read(std::istream &in) {
      buffer_.resize(initial_read_);
      in.read(&*buffer_.begin(), initial_read_);
      buffer_.resize(in.gcount());
      char got;
      while (in.get(got) && got != '\n')
        buffer_.push_back(got);
    }

    template <class Output> void Send(Output &out) {
      for (util::TokenIter<util::SingleCharacter> line(StringPiece(&*buffer_.begin(), buffer_.size()), '\n'); line; ++line) {
        util::TokenIter<util::SingleCharacter> tabber(*line, '\t');
        if (!tabber) {
          std::cerr << "Warning: empty n-gram count line being removed\n";
          continue;
        }
        util::TokenIter<util::SingleCharacter, true> words(*tabber, ' ');
        if (!words) {
          std::cerr << "Line has a tab but no words.\n";
          continue;
        }
        out.AddNGram(words, util::TokenIter<util::SingleCharacter, true>::end(), *line);
      }
    }

  private:
    std::streamsize initial_read_;

    // This could have been a std::string but that's less happy with raw writes.
    std::vector<char> buffer_;
};

template <class Output> void ReadCount(util::FilePiece &in_file, Output &out) {
  try {
    while (true) {
      StringPiece line = in_file.ReadLine();
      util::TokenIter<util::SingleCharacter> tabber(line, '\t');
      if (!tabber) {
        std::cerr << "Warning: empty n-gram count line being removed\n";
        continue;
      }
      out.AddNGram(*tabber, line);
    }
  } catch (const util::EndOfFileException &e) {}
}

} // namespace lm

#endif // LM_FILTER_COUNT_IO__