Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/kpu/kenlm.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Kenneth Heafield <github@kheafield.com> 2014-01-31 06:16:34 +0400
committer: Kenneth Heafield <github@kheafield.com> 2014-01-31 06:16:34 +0400
commit: d4035115213e3e458766f93c5ab4330ad2e73fa7 (patch)
tree: 9ac10f525400ed4b52ec3e6b9b775fbdb15d0e16
parent: 98780630a7ded5566d55b77d6a8468b07fd3004f (diff)
parent: 3e8f1bcac1227f47485e0c56010f46f8233eed67 (diff)
Merge branch 'master' of github.com:kpu/kenlm
-rw-r--r-- util/cat_compressed_main.cc 17
-rw-r--r-- util/read_compressed.cc 51
2 files changed, 35 insertions, 33 deletions
diff --git a/util/cat_compressed_main.cc b/util/cat_compressed_main.cc
index 0b9e964..2b4d729 100644
--- a/util/cat_compressed_main.cc
+++ b/util/cat_compressed_main.cc
@@ -29,14 +29,19 @@ int main(int argc, char *argv[]) {
}
}
- if (argc == 1) {
- util::ReadCompressed in(0);
- Copy(in, 1);
- } else {
- for (int i = 1; i < argc; ++i) {
- util::ReadCompressed in(util::OpenReadOrThrow(argv[i]));
+ try {
+ if (argc == 1) {
+ util::ReadCompressed in(0);
Copy(in, 1);
+ } else {
+ for (int i = 1; i < argc; ++i) {
+ util::ReadCompressed in(util::OpenReadOrThrow(argv[i]));
+ Copy(in, 1);
+ }
}
+ } catch (const std::exception &e) {
+ std::cerr << e.what() << std::endl;
+ return 2;
}
return 0;
}
diff --git a/util/read_compressed.cc b/util/read_compressed.cc
index 0b75080..71ef0e2 100644
--- a/util/read_compressed.cc
+++ b/util/read_compressed.cc
@@ -58,7 +58,7 @@ class ReadBase {
namespace {
-ReadBase *ReadFactory(int fd, uint64_t &raw_amount, const void *already_data, std::size_t already_size);
+ReadBase *ReadFactory(int fd, uint64_t &raw_amount, const void *already_data, std::size_t already_size, bool require_compressed);
// Completed file that other classes can thunk to.
class Complete : public ReadBase {
@@ -130,7 +130,7 @@ template <class Compression> class StreamCompressed : public ReadBase {
if (!back_.Process()) {
// reached end, at least for the compressed portion.
std::size_t ret = static_cast<const uint8_t *>(static_cast<void*>(back_.Stream().next_out)) - static_cast<const uint8_t*>(to);
- ReplaceThis(ReadFactory(file_.release(), ReadCount(thunk), back_.Stream().next_in, back_.Stream().avail_in), thunk);
+ ReplaceThis(ReadFactory(file_.release(), ReadCount(thunk), back_.Stream().next_in, back_.Stream().avail_in, true), thunk);
if (ret) return ret;
// We did not read anything this round, so clients might think EOF. Transfer responsibility to the next reader.
return Current(thunk)->Read(to, amount, thunk);
@@ -206,8 +206,6 @@ class GZip {
};
#endif // HAVE_ZLIB
-const uint8_t kBZMagic[3] = {'B', 'Z', 'h'};
-
#ifdef HAVE_BZLIB
class BZip {
public:
@@ -346,25 +344,26 @@ class IStreamReader : public ReadBase {
};
enum MagicResult {
- UNKNOWN, GZIP, BZIP, XZIP
+ UTIL_UNKNOWN, UTIL_GZIP, UTIL_BZIP, UTIL_XZIP
};
-MagicResult DetectMagic(const void *from_void) {
+MagicResult DetectMagic(const void *from_void, std::size_t length) {
const uint8_t *header = static_cast<const uint8_t*>(from_void);
- if (header[0] == 0x1f && header[1] == 0x8b) {
- return GZIP;
+ if (length >= 2 && header[0] == 0x1f && header[1] == 0x8b) {
+ return UTIL_GZIP;
}
- if (!memcmp(header, kBZMagic, sizeof(kBZMagic))) {
- return BZIP;
+ const uint8_t kBZMagic[3] = {'B', 'Z', 'h'};
+ if (length >= sizeof(kBZMagic) && !memcmp(header, kBZMagic, sizeof(kBZMagic))) {
+ return UTIL_BZIP;
}
const uint8_t kXZMagic[6] = { 0xFD, '7', 'z', 'X', 'Z', 0x00 };
- if (!memcmp(header, kXZMagic, sizeof(kXZMagic))) {
- return XZIP;
+ if (length >= sizeof(kXZMagic) && !memcmp(header, kXZMagic, sizeof(kXZMagic))) {
+ return UTIL_XZIP;
}
- return UNKNOWN;
+ return UTIL_UNKNOWN;
}
-ReadBase *ReadFactory(int fd, uint64_t &raw_amount, const void *already_data, std::size_t already_size) {
+ReadBase *ReadFactory(int fd, uint64_t &raw_amount, const void *already_data, const std::size_t already_size, bool require_compressed) {
scoped_fd hold(fd);
std::string header(reinterpret_cast<const char*>(already_data), already_size);
if (header.size() < ReadCompressed::kMagicSize) {
@@ -378,38 +377,35 @@ ReadBase *ReadFactory(int fd, uint64_t &raw_amount, const void *already_data, st
hold.release();
return new Complete();
}
- // Assumption: it's impossible to have a gzip/bzip2 file smaller than xz's 6-byte header.
- if (header.size() < ReadCompressed::kMagicSize)
- return new UncompressedWithHeader(hold.release(), header.data(), header.size());
- switch (DetectMagic(&header[0])) {
- case GZIP:
+ switch (DetectMagic(&header[0], header.size())) {
+ case UTIL_GZIP:
#ifdef HAVE_ZLIB
return new StreamCompressed<GZip>(hold.release(), header.data(), header.size());
#else
UTIL_THROW(CompressedException, "This looks like a gzip file but gzip support was not compiled in.");
#endif
- case BZIP:
+ case UTIL_BZIP:
#ifdef HAVE_BZLIB
return new StreamCompressed<BZip>(hold.release(), &header[0], header.size());
#else
- UTIL_THROW(CompressedException, "This looks like a bzip file (it begins with BZ), but bzip support was not compiled in.");
+ UTIL_THROW(CompressedException, "This looks like a bzip file (it begins with BZh), but bzip support was not compiled in.");
#endif
- case XZIP:
+ case UTIL_XZIP:
#ifdef HAVE_XZLIB
return new StreamCompressed<XZip>(hold.release(), header.data(), header.size());
#else
UTIL_THROW(CompressedException, "This looks like an xz file, but xz support was not compiled in.");
#endif
- case UNKNOWN:
- break;
+ default:
+ UTIL_THROW_IF(require_compressed, CompressedException, "Uncompressed data detected after a compresssed file. This could be supported but usually indicates an error.");
+ return new UncompressedWithHeader(hold.release(), header.data(), header.size());
}
- return new UncompressedWithHeader(hold.release(), header.data(), header.size());
}
} // namespace
bool ReadCompressed::DetectCompressedMagic(const void *from_void) {
- return DetectMagic(from_void) != UNKNOWN;
+ return DetectMagic(from_void, kMagicSize) != UTIL_UNKNOWN;
}
ReadCompressed::ReadCompressed(int fd) {
@@ -425,8 +421,9 @@ ReadCompressed::ReadCompressed() {}
ReadCompressed::~ReadCompressed() {}
void ReadCompressed::Reset(int fd) {
+ raw_amount_ = 0;
internal_.reset();
- internal_.reset(ReadFactory(fd, raw_amount_, NULL, 0));
+ internal_.reset(ReadFactory(fd, raw_amount_, NULL, 0, false));
}
void ReadCompressed::Reset(std::istream &in) {