diff options
author | Maxim Pimenov <m@maps.me> | 2020-01-20 18:29:58 +0300 |
---|---|---|
committer | Vladimir Byko-Ianko <bykoianko@gmail.com> | 2020-01-21 16:31:31 +0300 |
commit | 8354f6d6cab07c3bc4be89d7ce3f206b9c408e5a (patch) | |
tree | 37159feba31f60cf39754da5682a0f774711329d /coding | |
parent | ca5bb7ce874bf76f9745321b7f7de9cf44f29242 (diff) |
[base] [coding] Moved BWT and MTF from base/ to coding/.
Diffstat (limited to 'coding')
-rw-r--r-- | coding/CMakeLists.txt | 4 | ||||
-rw-r--r-- | coding/bwt.cpp | 195 | ||||
-rw-r--r-- | coding/bwt.hpp | 60 | ||||
-rw-r--r-- | coding/bwt_coder.hpp | 13 | ||||
-rw-r--r-- | coding/coding_tests/CMakeLists.txt | 2 | ||||
-rw-r--r-- | coding/coding_tests/bwt_tests.cpp | 90 | ||||
-rw-r--r-- | coding/coding_tests/move_to_front_tests.cpp | 32 | ||||
-rw-r--r-- | coding/move_to_front.cpp | 33 | ||||
-rw-r--r-- | coding/move_to_front.hpp | 25 |
9 files changed, 447 insertions, 7 deletions
diff --git a/coding/CMakeLists.txt b/coding/CMakeLists.txt index 792529507a..d6df42b679 100644 --- a/coding/CMakeLists.txt +++ b/coding/CMakeLists.txt @@ -18,6 +18,8 @@ set( buffer_reader.hpp buffered_file_writer.cpp buffered_file_writer.hpp + bwt.cpp + bwt.hpp bwt_coder.hpp byte_stream.hpp compressed_bit_vector.cpp @@ -51,6 +53,8 @@ set( memory_region.hpp mmap_reader.cpp mmap_reader.hpp + move_to_front.cpp + move_to_front.hpp parse_xml.hpp point_coding.cpp point_coding.hpp diff --git a/coding/bwt.cpp b/coding/bwt.cpp new file mode 100644 index 0000000000..4b271a452e --- /dev/null +++ b/coding/bwt.cpp @@ -0,0 +1,195 @@ +#include "coding/bwt.hpp" + +#include "base/assert.hpp" +#include "base/suffix_array.hpp" + +#include <algorithm> +#include <array> +#include <limits> +#include <vector> + +using namespace std; + +namespace +{ +size_t const kNumBytes = 256; + +// Fake trailing '$' for the BWT, used for original string +// reconstruction. +uint32_t const kEOS = 256; + +// FirstColumn represents the first column in the BWT matrix. As +// during reverse BWT we need to reconstruct canonical first column, +// with '$' as the first element, this wrapper is used. Also note that +// other characters in the first column are sorted, so we actually +// don't need to store them explicitly, it's enough to store start +// positions of the corresponding groups of consecutive characters. +class FirstColumn +{ +public: + FirstColumn(size_t n, uint8_t const * s) : m_n(n), m_starts({}) + { + m_starts.fill(0); + for (size_t i = 0; i < n; ++i) + ++m_starts[s[i]]; + + size_t offset = 0; + for (size_t i = 0; i < m_starts.size(); ++i) + { + auto const count = m_starts[i]; + m_starts[i] = offset; + offset += count; + } + } + + size_t Size() const { return m_n + 1; } + + uint32_t operator[](size_t i) const + { + ASSERT_LESS(i, Size(), ()); + if (i == 0) + return kEOS; + + --i; + auto it = upper_bound(m_starts.begin(), m_starts.end(), i); + ASSERT(it != m_starts.begin(), ()); + --it; + return static_cast<uint32_t>(distance(m_starts.begin(), it)); + } + + // Returns the rank of the i-th symbol among symbols with the same + // value. + size_t Rank(size_t i) const + { + ASSERT_LESS(i, Size(), ()); + if (i == 0) + return 0; + + --i; + auto it = upper_bound(m_starts.begin(), m_starts.end(), i); + if (it == m_starts.begin()) + return i; + --it; + return i - *it; + } + +private: + size_t const m_n; + array<size_t, kNumBytes> m_starts; +}; + +// LastColumn represents the last column in the BWT matrix. As during +// reverse BWT we need to reconstruct canonical last column, |s| is +// replaced by s[start] + s[0, start) + '$' + s[start, n). +class LastColumn +{ +public: + LastColumn(size_t n, size_t start, uint8_t const * s) : m_n(n), m_start(start), m_s(s) + { + for (size_t i = 0; i < Size(); ++i) + { + auto const b = (*this)[i]; + if (b == kEOS) + continue; + ASSERT_LESS(b, kNumBytes, ()); + m_table[b].push_back(i); + } + } + + size_t Size() const { return m_n + 1; } + + uint32_t operator[](size_t i) const + { + if (i == 0) + { + ASSERT_LESS(m_start, m_n, ()); + return m_s[m_start]; + } + + if (i == m_start + 1) + return kEOS; + + ASSERT_LESS_OR_EQUAL(i, m_n, ()); + return m_s[i - 1]; + } + + // Returns the index of the |rank|-th |byte| in the canonical BWT + // last column. + size_t Select(uint32_t byte, size_t rank) + { + if (byte == kEOS) + { + ASSERT_EQUAL(rank, 0, ()); + return 0; + } + + ASSERT_LESS(rank, m_table[byte].size(), (byte, rank)); + return m_table[byte][rank]; + } + +private: + size_t const m_n; + size_t const m_start; + uint8_t const * const m_s; + array<vector<size_t>, kNumBytes> m_table; +}; +} // namespace + +namespace coding +{ +size_t BWT(size_t n, uint8_t const * s, uint8_t * r) +{ + vector<size_t> sa(n); + base::Skew(n, s, sa.data()); + + size_t result = 0; + for (size_t i = 0; i < n; ++i) + { + if (sa[i] != 0) + { + r[i] = s[sa[i] - 1]; + } + else + { + result = i; + r[i] = s[n - 1]; + } + } + return result; +} + +size_t BWT(string const & s, string & r) +{ + auto const n = s.size(); + r.assign(n, '\0'); + return BWT(n, reinterpret_cast<uint8_t const *>(s.data()), reinterpret_cast<uint8_t *>(&r[0])); +} + +void RevBWT(size_t n, size_t start, uint8_t const * s, uint8_t * r) +{ + if (n == 0) + return; + + FirstColumn first(n, s); + LastColumn last(n, start, s); + + auto curr = start + 1; + for (size_t i = 0; i < n; ++i) + { + ASSERT_LESS(curr, first.Size(), ()); + ASSERT(first[curr] != kEOS, ()); + + r[i] = first[curr]; + curr = last.Select(r[i], first.Rank(curr)); + } + + ASSERT_EQUAL(first[curr], kEOS, ()); +} + +void RevBWT(size_t start, string const & s, string & r) +{ + auto const n = s.size(); + r.assign(n, '\0'); + RevBWT(n, start, reinterpret_cast<uint8_t const *>(s.data()), reinterpret_cast<uint8_t *>(&r[0])); +} +} // namespace coding diff --git a/coding/bwt.hpp b/coding/bwt.hpp new file mode 100644 index 0000000000..dc0e550dc7 --- /dev/null +++ b/coding/bwt.hpp @@ -0,0 +1,60 @@ +#pragma once + +#include <cstdint> +#include <string> + +namespace coding +{ +// Computes the Burrows-Wheeler transform of the string |s|, stores +// result in the string |r|. Note - the size of |r| must be |n|. +// Returns the index of the original string among the all sorted +// rotations of the |s|. +// +// *NOTE* in contrast to popular explanations of BWT, we do not append +// to |s| trailing '$' that is less than any other character in |s|. +// The reason is that |s| can be an arbitrary byte string, with zero +// bytes inside, so implementation of this trailing '$' is expensive, +// and, actually, not needed. +// +// For example, if |s| is "abaaba", canonical BWT is: +// +// Sorted rotations: canonical BWT: +// $abaaba a +// a$abaab b +// aaba$ab b +// aba$aba a +// * abaaba$ $ +// ba$abaa a +// baaba$a a +// +// where '*' denotes original string. +// +// Our implementation will sort rotations in a way as there is an +// implicit '$' that is less than any other byte in |s|, but does not +// return this '$'. Therefore, the order of rotations will be the same +// as above, without the first '$abaaba': +// +// Sorted rotations: ours BWT: +// aabaab b +// aabaab b +// abaaba a +// * abaaba a +// baabaa a +// baabaa a +// +// where '*' denotes the index of original string. As one can see, +// there are two 'abaaba' strings, but as mentioned, rotations are +// sorted like there is an implicit '$' at the end of the original +// string. It's possible to get from "ours BWT" to the "original BWT", +// see the code for details. +// +// Complexity: O(n) time and O(n) memory. +size_t BWT(size_t n, uint8_t const * s, uint8_t * r); +size_t BWT(std::string const & s, std::string & r); + +// Inverse Burrows-Wheeler transform. +// +// Complexity: O(n) time and O(n) memory. +void RevBWT(size_t n, size_t start, uint8_t const * s, uint8_t * r); +void RevBWT(size_t start, std::string const & s, std::string & r); +} // namespace coding diff --git a/coding/bwt_coder.hpp b/coding/bwt_coder.hpp index 790300308f..5264df4d01 100644 --- a/coding/bwt_coder.hpp +++ b/coding/bwt_coder.hpp @@ -1,11 +1,10 @@ #pragma once +#include "coding/bwt.hpp" #include "coding/huffman.hpp" +#include "coding/move_to_front.hpp" #include "coding/varint.hpp" -#include "base/bwt.hpp" -#include "base/move_to_front.hpp" - #include <algorithm> #include <cstdint> #include <iterator> @@ -27,9 +26,9 @@ public: std::vector<uint8_t> & bwtBuffer) { bwtBuffer.resize(n); - auto const start = base::BWT(n, s, bwtBuffer.data()); + auto const start = BWT(n, s, bwtBuffer.data()); - base::MoveToFront mtf; + MoveToFront mtf; for (auto & b : bwtBuffer) b = mtf.Transform(b); @@ -84,7 +83,7 @@ public: huffman.ReadAndDecode(source, std::back_inserter(bwtBuffer)); size_t const n = bwtBuffer.size(); - base::MoveToFront mtf; + MoveToFront mtf; for (size_t i = 0; i < n; ++i) { auto const b = mtf[bwtBuffer[i]]; @@ -96,7 +95,7 @@ public: CHECK_LESS(start, n, ()); revBuffer.resize(n); - base::RevBWT(n, static_cast<size_t>(start), bwtBuffer.data(), revBuffer.data()); + RevBWT(n, static_cast<size_t>(start), bwtBuffer.data(), revBuffer.data()); return std::copy(revBuffer.begin(), revBuffer.end(), it); } diff --git a/coding/coding_tests/CMakeLists.txt b/coding/coding_tests/CMakeLists.txt index 3210d589ee..0769bdeaf2 100644 --- a/coding/coding_tests/CMakeLists.txt +++ b/coding/coding_tests/CMakeLists.txt @@ -5,6 +5,7 @@ set( base64_test.cpp bit_streams_test.cpp bwt_coder_tests.cpp + bwt_tests.cpp compressed_bit_vector_test.cpp csv_reader_test.cpp dd_vector_test.cpp @@ -21,6 +22,7 @@ set( map_uint32_to_val_tests.cpp mem_file_reader_test.cpp mem_file_writer_test.cpp + move_to_front_tests.cpp png_decoder_test.cpp point_coding_tests.cpp reader_cache_test.cpp diff --git a/coding/coding_tests/bwt_tests.cpp b/coding/coding_tests/bwt_tests.cpp new file mode 100644 index 0000000000..aa0c9947ef --- /dev/null +++ b/coding/coding_tests/bwt_tests.cpp @@ -0,0 +1,90 @@ +#include "testing/testing.hpp" + +#include "coding/bwt.hpp" + +#include <algorithm> +#include <random> +#include <string> + +using namespace coding; +using namespace std; + +namespace +{ +string RevRevBWT(string const & s) +{ + string r; + auto const start = BWT(s, r); + + string rr; + RevBWT(start, r, rr); + return rr; +} + +UNIT_TEST(BWT_Smoke) +{ + { + TEST_EQUAL(BWT(0 /* n */, nullptr /* s */, nullptr /* r */), 0, ()); + } + + { + string r; + TEST_EQUAL(BWT(string() /* s */, r /* r */), 0, ()); + } + + { + string const s = "aaaaaa"; + string r; + TEST_EQUAL(BWT(s, r), 5, ()); + TEST_EQUAL(r, s, ()); + } + + { + string const s = "mississippi"; + string r; + TEST_EQUAL(BWT(s, r), 4, ()); + TEST_EQUAL(r, "pssmipissii", ()); + } +} + +UNIT_TEST(RevBWT_Smoke) +{ + string const strings[] = {"abaaba", "mississippi", "a b b", "Again and again and again"}; + for (auto const & s : strings) + TEST_EQUAL(s, RevRevBWT(s), ()); + + for (size_t i = 0; i < 100; ++i) + { + string const s(i, '\0'); + TEST_EQUAL(s, RevRevBWT(s), ()); + } + + for (size_t i = 0; i < 100; ++i) + { + string const s(i, 'a' + (i % 3)); + TEST_EQUAL(s, RevRevBWT(s), ()); + } +} + +UNIT_TEST(RevBWT_AllBytes) +{ + int kSeed = 42; + int kMin = 1; + int kMax = 10; + + mt19937 engine(kSeed); + uniform_int_distribution<int> uid(kMin, kMax); + + string s; + for (size_t i = 0; i < 256; ++i) + { + auto const count = uid(engine); + ASSERT_GREATER_OR_EQUAL(count, kMin, ()); + ASSERT_LESS_OR_EQUAL(count, kMax, ()); + for (int j = 0; j < count; ++j) + s.push_back(static_cast<uint8_t>(i)); + } + shuffle(s.begin(), s.end(), engine); + TEST_EQUAL(s, RevRevBWT(s), ()); +} +} // namespace diff --git a/coding/coding_tests/move_to_front_tests.cpp b/coding/coding_tests/move_to_front_tests.cpp new file mode 100644 index 0000000000..18e8247339 --- /dev/null +++ b/coding/coding_tests/move_to_front_tests.cpp @@ -0,0 +1,32 @@ +#include "testing/testing.hpp" + +#include "coding/move_to_front.hpp" + +#include <cstdint> + +using namespace coding; + +namespace +{ +UNIT_TEST(MoveToFront_Smoke) +{ + MoveToFront mtf; + for (size_t i = 0; i < 256; ++i) + TEST_EQUAL(mtf[i], i, ()); + + // Initially 3 should be on the 3rd position. + TEST_EQUAL(mtf.Transform(3), 3, ()); + + // After the first transform, 3 should be moved to the 0th position. + TEST_EQUAL(mtf.Transform(3), 0, ()); + TEST_EQUAL(mtf.Transform(3), 0, ()); + TEST_EQUAL(mtf.Transform(3), 0, ()); + + TEST_EQUAL(mtf[0], 3, ()); + TEST_EQUAL(mtf[1], 0, ()); + TEST_EQUAL(mtf[2], 1, ()); + TEST_EQUAL(mtf[3], 2, ()); + for (size_t i = 4; i < 256; ++i) + TEST_EQUAL(mtf[i], i, ()); +} +} // namespace diff --git a/coding/move_to_front.cpp b/coding/move_to_front.cpp new file mode 100644 index 0000000000..ae79c9d9cd --- /dev/null +++ b/coding/move_to_front.cpp @@ -0,0 +1,33 @@ +#include "coding/move_to_front.hpp" + +#include "base/assert.hpp" + +#include <algorithm> +#include <cstring> +#include <numeric> + +using namespace std; + +namespace coding +{ +// static +size_t constexpr MoveToFront::kNumBytes; + +MoveToFront::MoveToFront() +{ + iota(m_order.begin(), m_order.end(), 0); +} + +uint8_t MoveToFront::Transform(uint8_t b) +{ + auto const it = find(m_order.begin(), m_order.end(), b); + ASSERT(it != m_order.end(), ()); + + size_t const result = distance(m_order.begin(), it); + ASSERT_LESS(result, kNumBytes, ()); + + rotate(m_order.begin(), it, it + 1); + ASSERT_EQUAL(m_order[0], b, ()); + return static_cast<uint8_t>(result); +} +} // namespace coding diff --git a/coding/move_to_front.hpp b/coding/move_to_front.hpp new file mode 100644 index 0000000000..dc81cb4a5d --- /dev/null +++ b/coding/move_to_front.hpp @@ -0,0 +1,25 @@ +#pragma once + +#include <array> +#include <cstdint> +#include <limits> + +namespace coding +{ +class MoveToFront +{ +public: + static size_t constexpr kNumBytes = static_cast<size_t>(std::numeric_limits<uint8_t>::max()) + 1; + + MoveToFront(); + + // Returns index of the byte |b| in the current sequence of bytes, + // then moves |b| to the first position. + uint8_t Transform(uint8_t b); + + uint8_t operator[](uint8_t i) const { return m_order[i]; } + +private: + std::array<uint8_t, kNumBytes> m_order; +}; +} // namespace coding |