diff options
author | Kenneth Heafield <github@kheafield.com> | 2014-01-31 06:16:34 +0400 |
---|---|---|
committer | Kenneth Heafield <github@kheafield.com> | 2014-01-31 06:16:34 +0400 |
commit | d4035115213e3e458766f93c5ab4330ad2e73fa7 (patch) | |
tree | 9ac10f525400ed4b52ec3e6b9b775fbdb15d0e16 | |
parent | 98780630a7ded5566d55b77d6a8468b07fd3004f (diff) | |
parent | 3e8f1bcac1227f47485e0c56010f46f8233eed67 (diff) |
Merge branch 'master' of github.com:kpu/kenlm
-rw-r--r-- | util/cat_compressed_main.cc | 17 | ||||
-rw-r--r-- | util/read_compressed.cc | 51 |
2 files changed, 35 insertions, 33 deletions
diff --git a/util/cat_compressed_main.cc b/util/cat_compressed_main.cc index 0b9e964..2b4d729 100644 --- a/util/cat_compressed_main.cc +++ b/util/cat_compressed_main.cc @@ -29,14 +29,19 @@ int main(int argc, char *argv[]) { } } - if (argc == 1) { - util::ReadCompressed in(0); - Copy(in, 1); - } else { - for (int i = 1; i < argc; ++i) { - util::ReadCompressed in(util::OpenReadOrThrow(argv[i])); + try { + if (argc == 1) { + util::ReadCompressed in(0); Copy(in, 1); + } else { + for (int i = 1; i < argc; ++i) { + util::ReadCompressed in(util::OpenReadOrThrow(argv[i])); + Copy(in, 1); + } } + } catch (const std::exception &e) { + std::cerr << e.what() << std::endl; + return 2; } return 0; } diff --git a/util/read_compressed.cc b/util/read_compressed.cc index 0b75080..71ef0e2 100644 --- a/util/read_compressed.cc +++ b/util/read_compressed.cc @@ -58,7 +58,7 @@ class ReadBase { namespace { -ReadBase *ReadFactory(int fd, uint64_t &raw_amount, const void *already_data, std::size_t already_size); +ReadBase *ReadFactory(int fd, uint64_t &raw_amount, const void *already_data, std::size_t already_size, bool require_compressed); // Completed file that other classes can thunk to. class Complete : public ReadBase { @@ -130,7 +130,7 @@ template <class Compression> class StreamCompressed : public ReadBase { if (!back_.Process()) { // reached end, at least for the compressed portion. std::size_t ret = static_cast<const uint8_t *>(static_cast<void*>(back_.Stream().next_out)) - static_cast<const uint8_t*>(to); - ReplaceThis(ReadFactory(file_.release(), ReadCount(thunk), back_.Stream().next_in, back_.Stream().avail_in), thunk); + ReplaceThis(ReadFactory(file_.release(), ReadCount(thunk), back_.Stream().next_in, back_.Stream().avail_in, true), thunk); if (ret) return ret; // We did not read anything this round, so clients might think EOF. Transfer responsibility to the next reader. return Current(thunk)->Read(to, amount, thunk); @@ -206,8 +206,6 @@ class GZip { }; #endif // HAVE_ZLIB -const uint8_t kBZMagic[3] = {'B', 'Z', 'h'}; - #ifdef HAVE_BZLIB class BZip { public: @@ -346,25 +344,26 @@ class IStreamReader : public ReadBase { }; enum MagicResult { - UNKNOWN, GZIP, BZIP, XZIP + UTIL_UNKNOWN, UTIL_GZIP, UTIL_BZIP, UTIL_XZIP }; -MagicResult DetectMagic(const void *from_void) { +MagicResult DetectMagic(const void *from_void, std::size_t length) { const uint8_t *header = static_cast<const uint8_t*>(from_void); - if (header[0] == 0x1f && header[1] == 0x8b) { - return GZIP; + if (length >= 2 && header[0] == 0x1f && header[1] == 0x8b) { + return UTIL_GZIP; } - if (!memcmp(header, kBZMagic, sizeof(kBZMagic))) { - return BZIP; + const uint8_t kBZMagic[3] = {'B', 'Z', 'h'}; + if (length >= sizeof(kBZMagic) && !memcmp(header, kBZMagic, sizeof(kBZMagic))) { + return UTIL_BZIP; } const uint8_t kXZMagic[6] = { 0xFD, '7', 'z', 'X', 'Z', 0x00 }; - if (!memcmp(header, kXZMagic, sizeof(kXZMagic))) { - return XZIP; + if (length >= sizeof(kXZMagic) && !memcmp(header, kXZMagic, sizeof(kXZMagic))) { + return UTIL_XZIP; } - return UNKNOWN; + return UTIL_UNKNOWN; } -ReadBase *ReadFactory(int fd, uint64_t &raw_amount, const void *already_data, std::size_t already_size) { +ReadBase *ReadFactory(int fd, uint64_t &raw_amount, const void *already_data, const std::size_t already_size, bool require_compressed) { scoped_fd hold(fd); std::string header(reinterpret_cast<const char*>(already_data), already_size); if (header.size() < ReadCompressed::kMagicSize) { @@ -378,38 +377,35 @@ ReadBase *ReadFactory(int fd, uint64_t &raw_amount, const void *already_data, st hold.release(); return new Complete(); } - // Assumption: it's impossible to have a gzip/bzip2 file smaller than xz's 6-byte header. - if (header.size() < ReadCompressed::kMagicSize) - return new UncompressedWithHeader(hold.release(), header.data(), header.size()); - switch (DetectMagic(&header[0])) { - case GZIP: + switch (DetectMagic(&header[0], header.size())) { + case UTIL_GZIP: #ifdef HAVE_ZLIB return new StreamCompressed<GZip>(hold.release(), header.data(), header.size()); #else UTIL_THROW(CompressedException, "This looks like a gzip file but gzip support was not compiled in."); #endif - case BZIP: + case UTIL_BZIP: #ifdef HAVE_BZLIB return new StreamCompressed<BZip>(hold.release(), &header[0], header.size()); #else - UTIL_THROW(CompressedException, "This looks like a bzip file (it begins with BZ), but bzip support was not compiled in."); + UTIL_THROW(CompressedException, "This looks like a bzip file (it begins with BZh), but bzip support was not compiled in."); #endif - case XZIP: + case UTIL_XZIP: #ifdef HAVE_XZLIB return new StreamCompressed<XZip>(hold.release(), header.data(), header.size()); #else UTIL_THROW(CompressedException, "This looks like an xz file, but xz support was not compiled in."); #endif - case UNKNOWN: - break; + default: + UTIL_THROW_IF(require_compressed, CompressedException, "Uncompressed data detected after a compresssed file. This could be supported but usually indicates an error."); + return new UncompressedWithHeader(hold.release(), header.data(), header.size()); } - return new UncompressedWithHeader(hold.release(), header.data(), header.size()); } } // namespace bool ReadCompressed::DetectCompressedMagic(const void *from_void) { - return DetectMagic(from_void) != UNKNOWN; + return DetectMagic(from_void, kMagicSize) != UTIL_UNKNOWN; } ReadCompressed::ReadCompressed(int fd) { @@ -425,8 +421,9 @@ ReadCompressed::ReadCompressed() {} ReadCompressed::~ReadCompressed() {} void ReadCompressed::Reset(int fd) { + raw_amount_ = 0; internal_.reset(); - internal_.reset(ReadFactory(fd, raw_amount_, NULL, 0)); + internal_.reset(ReadFactory(fd, raw_amount_, NULL, 0, false)); } void ReadCompressed::Reset(std::istream &in) { |