[base] [coding] Moved BWT and MTF from base/ to coding/.

author: Maxim Pimenov <m@maps.me> 2020-01-20 18:29:58 +0300
committer: Vladimir Byko-Ianko <bykoianko@gmail.com> 2020-01-21 16:31:31 +0300
commit: 8354f6d6cab07c3bc4be89d7ce3f206b9c408e5a (patch)
tree: 37159feba31f60cf39754da5682a0f774711329d /coding
parent: ca5bb7ce874bf76f9745321b7f7de9cf44f29242 (diff)
9 files changed, 447 insertions, 7 deletions
diff --git a/coding/CMakeLists.txt b/coding/CMakeLists.txt
index 792529507a..d6df42b679 100644
--- a/coding/CMakeLists.txt
+++ b/coding/CMakeLists.txt
@@ -18,6 +18,8 @@ set(
   buffer_reader.hpp
   buffered_file_writer.cpp
   buffered_file_writer.hpp
+  bwt.cpp
+  bwt.hpp
   bwt_coder.hpp
   byte_stream.hpp
   compressed_bit_vector.cpp
@@ -51,6 +53,8 @@ set(
   memory_region.hpp
   mmap_reader.cpp
   mmap_reader.hpp
+  move_to_front.cpp
+  move_to_front.hpp
   parse_xml.hpp
   point_coding.cpp
   point_coding.hpp
diff --git a/coding/bwt.cpp b/coding/bwt.cpp
new file mode 100644
index 0000000000..4b271a452e
--- /dev/null
+++ b/coding/bwt.cpp
@@ -0,0 +1,195 @@
+#include "coding/bwt.hpp"
+
+#include "base/assert.hpp"
+#include "base/suffix_array.hpp"
+
+#include <algorithm>
+#include <array>
+#include <limits>
+#include <vector>
+
+using namespace std;
+
+namespace
+{
+size_t const kNumBytes = 256;
+
+// Fake trailing '$' for the BWT, used for original string
+// reconstruction.
+uint32_t const kEOS = 256;
+
+// FirstColumn represents the first column in the BWT matrix. As
+// during reverse BWT we need to reconstruct canonical first column,
+// with '$' as the first element, this wrapper is used. Also note that
+// other characters in the first column are sorted, so we actually
+// don't need to store them explicitly, it's enough to store start
+// positions of the corresponding groups of consecutive characters.
+class FirstColumn
+{
+public:
+  FirstColumn(size_t n, uint8_t const * s) : m_n(n), m_starts({})
+  {
+    m_starts.fill(0);
+    for (size_t i = 0; i < n; ++i)
+      ++m_starts[s[i]];
+
+    size_t offset = 0;
+    for (size_t i = 0; i < m_starts.size(); ++i)
+    {
+      auto const count = m_starts[i];
+      m_starts[i] = offset;
+      offset += count;
+    }
+  }
+
+  size_t Size() const { return m_n + 1; }
+
+  uint32_t operator[](size_t i) const
+  {
+    ASSERT_LESS(i, Size(), ());
+    if (i == 0)
+      return kEOS;
+
+    --i;
+    auto it = upper_bound(m_starts.begin(), m_starts.end(), i);
+    ASSERT(it != m_starts.begin(), ());
+    --it;
+    return static_cast<uint32_t>(distance(m_starts.begin(), it));
+  }
+
+  // Returns the rank of the i-th symbol among symbols with the same
+  // value.
+  size_t Rank(size_t i) const
+  {
+    ASSERT_LESS(i, Size(), ());
+    if (i == 0)
+      return 0;
+
+    --i;
+    auto it = upper_bound(m_starts.begin(), m_starts.end(), i);
+    if (it == m_starts.begin())
+      return i;
+    --it;
+    return i - *it;
+  }
+
+private:
+  size_t const m_n;
+  array<size_t, kNumBytes> m_starts;
+};
+
+// LastColumn represents the last column in the BWT matrix. As during
+// reverse BWT we need to reconstruct canonical last column, |s| is
+// replaced by s[start] + s[0, start) + '$' + s[start, n).
+class LastColumn
+{
+public:
+  LastColumn(size_t n, size_t start, uint8_t const * s) : m_n(n), m_start(start), m_s(s)
+  {
+    for (size_t i = 0; i < Size(); ++i)
+    {
+      auto const b = (*this)[i];
+      if (b == kEOS)
+        continue;
+      ASSERT_LESS(b, kNumBytes, ());
+      m_table[b].push_back(i);
+    }
+  }
+
+  size_t Size() const { return m_n + 1; }
+
+  uint32_t operator[](size_t i) const
+  {
+    if (i == 0)
+    {
+      ASSERT_LESS(m_start, m_n, ());
+      return m_s[m_start];
+    }
+
+    if (i == m_start + 1)
+      return kEOS;
+
+    ASSERT_LESS_OR_EQUAL(i, m_n, ());
+    return m_s[i - 1];
+  }
+
+  // Returns the index of the |rank|-th |byte| in the canonical BWT
+  // last column.
+  size_t Select(uint32_t byte, size_t rank)
+  {
+    if (byte == kEOS)
+    {
+      ASSERT_EQUAL(rank, 0, ());
+      return 0;
+    }
+
+    ASSERT_LESS(rank, m_table[byte].size(), (byte, rank));
+    return m_table[byte][rank];
+  }
+
+private:
+  size_t const m_n;
+  size_t const m_start;
+  uint8_t const * const m_s;
+  array<vector<size_t>, kNumBytes> m_table;
+};
+}  // namespace
+
+namespace coding
+{
+size_t BWT(size_t n, uint8_t const * s, uint8_t * r)
+{
+  vector<size_t> sa(n);
+  base::Skew(n, s, sa.data());
+
+  size_t result = 0;
+  for (size_t i = 0; i < n; ++i)
+  {
+    if (sa[i] != 0)
+    {
+      r[i] = s[sa[i] - 1];
+    }
+    else
+    {
+      result = i;
+      r[i] = s[n - 1];
+    }
+  }
+  return result;
+}
+
+size_t BWT(string const & s, string & r)
+{
+  auto const n = s.size();
+  r.assign(n, '\0');
+  return BWT(n, reinterpret_cast<uint8_t const *>(s.data()), reinterpret_cast<uint8_t *>(&r[0]));
+}
+
+void RevBWT(size_t n, size_t start, uint8_t const * s, uint8_t * r)
+{
+  if (n == 0)
+    return;
+
+  FirstColumn first(n, s);
+  LastColumn last(n, start, s);
+
+  auto curr = start + 1;
+  for (size_t i = 0; i < n; ++i)
+  {
+    ASSERT_LESS(curr, first.Size(), ());
+    ASSERT(first[curr] != kEOS, ());
+
+    r[i] = first[curr];
+    curr = last.Select(r[i], first.Rank(curr));
+  }
+
+  ASSERT_EQUAL(first[curr], kEOS, ());
+}
+
+void RevBWT(size_t start, string const & s, string & r)
+{
+  auto const n = s.size();
+  r.assign(n, '\0');
+  RevBWT(n, start, reinterpret_cast<uint8_t const *>(s.data()), reinterpret_cast<uint8_t *>(&r[0]));
+}
+}  // namespace coding
diff --git a/coding/bwt.hpp b/coding/bwt.hpp
new file mode 100644
index 0000000000..dc0e550dc7
--- /dev/null
+++ b/coding/bwt.hpp
@@ -0,0 +1,60 @@
+#pragma once
+
+#include <cstdint>
+#include <string>
+
+namespace coding
+{
+// Computes the Burrows-Wheeler transform of the string |s|, stores
+// result in the string |r|. Note - the size of |r| must be |n|.
+// Returns the index of the original string among the all sorted
+// rotations of the |s|.
+//
+// *NOTE* in contrast to popular explanations of BWT, we do not append
+// to |s| trailing '$' that is less than any other character in |s|.
+// The reason is that |s| can be an arbitrary byte string, with zero
+// bytes inside, so implementation of this trailing '$' is expensive,
+// and, actually, not needed.
+//
+// For example, if |s| is "abaaba", canonical BWT is:
+//
+//   Sorted rotations:       canonical BWT:
+//   $abaaba                 a
+//   a$abaab                 b
+//   aaba$ab                 b
+//   aba$aba                 a
+// * abaaba$                 $
+//   ba$abaa                 a
+//   baaba$a                 a
+//
+// where '*' denotes original string.
+//
+// Our implementation will sort rotations in a way as there is an
+// implicit '$' that is less than any other byte in |s|, but does not
+// return this '$'. Therefore, the order of rotations will be the same
+// as above, without the first '$abaaba':
+//
+//   Sorted rotations:      ours BWT:
+//   aabaab                 b
+//   aabaab                 b
+//   abaaba                 a
+// * abaaba                 a
+//   baabaa                 a
+//   baabaa                 a
+//
+// where '*' denotes the index of original string. As one can see,
+// there are two 'abaaba' strings, but as mentioned, rotations are
+// sorted like there is an implicit '$' at the end of the original
+// string. It's possible to get from "ours BWT" to the "original BWT",
+// see the code for details.
+//
+// Complexity: O(n) time and O(n) memory.
+size_t BWT(size_t n, uint8_t const * s, uint8_t * r);
+size_t BWT(std::string const & s, std::string & r);
+
+// Inverse Burrows-Wheeler transform.
+//
+// Complexity: O(n) time and O(n) memory.
+void RevBWT(size_t n, size_t start, uint8_t const * s, uint8_t * r);
+void RevBWT(size_t start, std::string const & s, std::string & r);
+}  // namespace coding
diff --git a/coding/bwt_coder.hpp b/coding/bwt_coder.hpp
index 790300308f..5264df4d01 100644
--- a/coding/bwt_coder.hpp
+++ b/coding/bwt_coder.hpp
@@ -1,11 +1,10 @@
 #pragma once
 
+#include "coding/bwt.hpp"
 #include "coding/huffman.hpp"
+#include "coding/move_to_front.hpp"
 #include "coding/varint.hpp"
 
-#include "base/bwt.hpp"
-#include "base/move_to_front.hpp"
-
 #include <algorithm>
 #include <cstdint>
 #include <iterator>
@@ -27,9 +26,9 @@ public:
                                   std::vector<uint8_t> & bwtBuffer)
   {
     bwtBuffer.resize(n);
-    auto const start = base::BWT(n, s, bwtBuffer.data());
+    auto const start = BWT(n, s, bwtBuffer.data());
 
-    base::MoveToFront mtf;
+    MoveToFront mtf;
     for (auto & b : bwtBuffer)
       b = mtf.Transform(b);
 
@@ -84,7 +83,7 @@ public:
     huffman.ReadAndDecode(source, std::back_inserter(bwtBuffer));
 
     size_t const n = bwtBuffer.size();
-    base::MoveToFront mtf;
+    MoveToFront mtf;
     for (size_t i = 0; i < n; ++i)
     {
       auto const b = mtf[bwtBuffer[i]];
@@ -96,7 +95,7 @@ public:
       CHECK_LESS(start, n, ());
 
     revBuffer.resize(n);
-    base::RevBWT(n, static_cast<size_t>(start), bwtBuffer.data(), revBuffer.data());
+    RevBWT(n, static_cast<size_t>(start), bwtBuffer.data(), revBuffer.data());
     return std::copy(revBuffer.begin(), revBuffer.end(), it);
   }
 
diff --git a/coding/coding_tests/CMakeLists.txt b/coding/coding_tests/CMakeLists.txt
index 3210d589ee..0769bdeaf2 100644
--- a/coding/coding_tests/CMakeLists.txt
+++ b/coding/coding_tests/CMakeLists.txt
@@ -5,6 +5,7 @@ set(
   base64_test.cpp
   bit_streams_test.cpp
   bwt_coder_tests.cpp
+  bwt_tests.cpp
   compressed_bit_vector_test.cpp
   csv_reader_test.cpp
   dd_vector_test.cpp
@@ -21,6 +22,7 @@ set(
   map_uint32_to_val_tests.cpp
   mem_file_reader_test.cpp
   mem_file_writer_test.cpp
+  move_to_front_tests.cpp
   png_decoder_test.cpp
   point_coding_tests.cpp
   reader_cache_test.cpp
diff --git a/coding/coding_tests/bwt_tests.cpp b/coding/coding_tests/bwt_tests.cpp
new file mode 100644
index 0000000000..aa0c9947ef
--- /dev/null
+++ b/coding/coding_tests/bwt_tests.cpp
@@ -0,0 +1,90 @@
+#include "testing/testing.hpp"
+
+#include "coding/bwt.hpp"
+
+#include <algorithm>
+#include <random>
+#include <string>
+
+using namespace coding;
+using namespace std;
+
+namespace
+{
+string RevRevBWT(string const & s)
+{
+  string r;
+  auto const start = BWT(s, r);
+
+  string rr;
+  RevBWT(start, r, rr);
+  return rr;
+}
+
+UNIT_TEST(BWT_Smoke)
+{
+  {
+    TEST_EQUAL(BWT(0 /* n */, nullptr /* s */, nullptr /* r */), 0, ());
+  }
+
+  {
+    string r;
+    TEST_EQUAL(BWT(string() /* s */, r /* r */), 0, ());
+  }
+
+  {
+    string const s = "aaaaaa";
+    string r;
+    TEST_EQUAL(BWT(s, r), 5, ());
+    TEST_EQUAL(r, s, ());
+  }
+
+  {
+    string const s = "mississippi";
+    string r;
+    TEST_EQUAL(BWT(s, r), 4, ());
+    TEST_EQUAL(r, "pssmipissii", ());
+  }
+}
+
+UNIT_TEST(RevBWT_Smoke)
+{
+  string const strings[] = {"abaaba", "mississippi", "a b b", "Again and again and again"};
+  for (auto const & s : strings)
+    TEST_EQUAL(s, RevRevBWT(s), ());
+
+  for (size_t i = 0; i < 100; ++i)
+  {
+    string const s(i, '\0');
+    TEST_EQUAL(s, RevRevBWT(s), ());
+  }
+
+  for (size_t i = 0; i < 100; ++i)
+  {
+    string const s(i, 'a' + (i % 3));
+    TEST_EQUAL(s, RevRevBWT(s), ());
+  }
+}
+
+UNIT_TEST(RevBWT_AllBytes)
+{
+  int kSeed = 42;
+  int kMin = 1;
+  int kMax = 10;
+
+  mt19937 engine(kSeed);
+  uniform_int_distribution<int> uid(kMin, kMax);
+
+  string s;
+  for (size_t i = 0; i < 256; ++i)
+  {
+    auto const count = uid(engine);
+    ASSERT_GREATER_OR_EQUAL(count, kMin, ());
+    ASSERT_LESS_OR_EQUAL(count, kMax, ());
+    for (int j = 0; j < count; ++j)
+      s.push_back(static_cast<uint8_t>(i));
+  }
+  shuffle(s.begin(), s.end(), engine);
+  TEST_EQUAL(s, RevRevBWT(s), ());
+}
+}  // namespace
diff --git a/coding/coding_tests/move_to_front_tests.cpp b/coding/coding_tests/move_to_front_tests.cpp
new file mode 100644
index 0000000000..18e8247339
--- /dev/null
+++ b/coding/coding_tests/move_to_front_tests.cpp
@@ -0,0 +1,32 @@
+#include "testing/testing.hpp"
+
+#include "coding/move_to_front.hpp"
+
+#include <cstdint>
+
+using namespace coding;
+
+namespace
+{
+UNIT_TEST(MoveToFront_Smoke)
+{
+  MoveToFront mtf;
+  for (size_t i = 0; i < 256; ++i)
+    TEST_EQUAL(mtf[i], i, ());
+
+  // Initially 3 should be on the 3rd position.
+  TEST_EQUAL(mtf.Transform(3), 3, ());
+
+  // After the first transform, 3 should be moved to the 0th position.
+  TEST_EQUAL(mtf.Transform(3), 0, ());
+  TEST_EQUAL(mtf.Transform(3), 0, ());
+  TEST_EQUAL(mtf.Transform(3), 0, ());
+
+  TEST_EQUAL(mtf[0], 3, ());
+  TEST_EQUAL(mtf[1], 0, ());
+  TEST_EQUAL(mtf[2], 1, ());
+  TEST_EQUAL(mtf[3], 2, ());
+  for (size_t i = 4; i < 256; ++i)
+    TEST_EQUAL(mtf[i], i, ());
+}
+}  // namespace
diff --git a/coding/move_to_front.cpp b/coding/move_to_front.cpp
new file mode 100644
index 0000000000..ae79c9d9cd
--- /dev/null
+++ b/coding/move_to_front.cpp
@@ -0,0 +1,33 @@
+#include "coding/move_to_front.hpp"
+
+#include "base/assert.hpp"
+
+#include <algorithm>
+#include <cstring>
+#include <numeric>
+
+using namespace std;
+
+namespace coding
+{
+// static
+size_t constexpr MoveToFront::kNumBytes;
+
+MoveToFront::MoveToFront()
+{
+  iota(m_order.begin(), m_order.end(), 0);
+}
+
+uint8_t MoveToFront::Transform(uint8_t b)
+{
+  auto const it = find(m_order.begin(), m_order.end(), b);
+  ASSERT(it != m_order.end(), ());
+
+  size_t const result = distance(m_order.begin(), it);
+  ASSERT_LESS(result, kNumBytes, ());
+
+  rotate(m_order.begin(), it, it + 1);
+  ASSERT_EQUAL(m_order[0], b, ());
+  return static_cast<uint8_t>(result);
+}
+}  // namespace coding
diff --git a/coding/move_to_front.hpp b/coding/move_to_front.hpp
new file mode 100644
index 0000000000..dc81cb4a5d
--- /dev/null
+++ b/coding/move_to_front.hpp
@@ -0,0 +1,25 @@
+#pragma once
+
+#include <array>
+#include <cstdint>
+#include <limits>
+
+namespace coding
+{
+class MoveToFront
+{
+public:
+  static size_t constexpr kNumBytes = static_cast<size_t>(std::numeric_limits<uint8_t>::max()) + 1;
+
+  MoveToFront();
+
+  // Returns index of the byte |b| in the current sequence of bytes,
+  // then moves |b| to the first position.
+  uint8_t Transform(uint8_t b);
+
+  uint8_t operator[](uint8_t i) const { return m_order[i]; }
+
+private:
+  std::array<uint8_t, kNumBytes> m_order;
+};
+}  // namespace coding
author	Maxim Pimenov <m@maps.me>	2020-01-20 18:29:58 +0300
committer	Vladimir Byko-Ianko <bykoianko@gmail.com>	2020-01-21 16:31:31 +0300
commit	8354f6d6cab07c3bc4be89d7ce3f206b9c408e5a (patch)
tree	37159feba31f60cf39754da5682a0f774711329d /coding
parent	ca5bb7ce874bf76f9745321b7f7de9cf44f29242 (diff)