[omim] [indexer] Reorder indexing: all buckets for a feature instead of all features for a bucket.

author: Maxim Pimenov <m@maps.me> 2015-06-03 17:35:18 +0300
committer: Alex Zolotarev <alex@maps.me> 2015-09-23 02:52:07 +0300
commit: 58fa09b57ec14a0a474e8fa5d476c2a71aaa5874 (patch)
tree: 38fc26d81418c30ebd8b2f303bda26902e66f590 /indexer/scale_index_builder.hpp
parent: 3f37dc59dcf8277b55ac6da02440185bcfc68c5b (diff)
1 files changed, 144 insertions, 60 deletions
diff --git a/indexer/scale_index_builder.hpp b/indexer/scale_index_builder.hpp
index fe04d8f069..17c37c0734 100644
--- a/indexer/scale_index_builder.hpp
+++ b/indexer/scale_index_builder.hpp
@@ -1,10 +1,10 @@
 #pragma once
-#include "indexer/scale_index.hpp"
+#include "indexer/cell_id.hpp"
+#include "indexer/data_header.hpp"
+#include "indexer/feature.hpp"
 #include "indexer/feature_covering.hpp"
 #include "indexer/feature_visibility.hpp"
-#include "indexer/feature.hpp"
 #include "indexer/interval_index_builder.hpp"
-#include "indexer/cell_id.hpp"
 
 #include "defines.hpp"
 
@@ -16,13 +16,16 @@
 #include "base/base.hpp"
 #include "base/logging.hpp"
 #include "base/macros.hpp"
+#include "base/scope_guard.hpp"
 
 #include "std/string.hpp"
-#include "std/vector.hpp"
 #include "std/utility.hpp"
-#include "std/unordered_set.hpp"
+#include "std/vector.hpp"
 
 
+namespace covering
+{
+
 class CellFeaturePair
 {
 public:
@@ -49,68 +52,131 @@ private:
 };
 STATIC_ASSERT(sizeof(CellFeaturePair) == 12);
 
+class CellFeatureBucketTuple
+{
+public:
+  CellFeatureBucketTuple() {}
+  CellFeatureBucketTuple(CellFeaturePair p, uint32_t bucket) : m_pair(p), m_bucket(bucket) {}
+
+  bool operator<(CellFeatureBucketTuple const & rhs) const
+  {
+    if (m_bucket != rhs.m_bucket)
+      return m_bucket < rhs.m_bucket;
+    return m_pair < rhs.m_pair;
+  }
+
+  CellFeaturePair GetCellFeaturePair() const { return m_pair; }
+  uint32_t GetBucket() const { return m_bucket; }
+
+private:
+  CellFeaturePair m_pair;
+  uint32_t m_bucket;
+};
+STATIC_ASSERT(sizeof(CellFeatureBucketTuple) == 16);
+
 template <class SorterT>
 class FeatureCoverer
 {
-  unordered_set<uint32_t> & m_skipped;
-
 public:
-  FeatureCoverer(unordered_set<uint32_t> & skipped,
-                 uint32_t bucket,
-                 int codingScale,
-                 SorterT & sorter,
-                 uint32_t & numFeatures)
-    : m_skipped(skipped), m_Sorter(sorter),
-      m_codingDepth(covering::GetCodingDepth(codingScale)),
-      m_ScaleRange(ScaleIndexBase::ScaleRangeForBucket(bucket)),
-      m_NumFeatures(numFeatures)
+  FeatureCoverer(feature::DataHeader const & header, SorterT & sorter, vector<uint32_t> & featuresInBucket, vector<uint32_t> & cellsInBucket)
+      : m_header(header),
+        m_bucketsCount(header.GetLastScale() + 1),
+        m_Sorter(sorter),
+        m_codingDepth(covering::GetCodingDepth(header.GetLastScale())),
+        m_featuresInBucket(featuresInBucket),
+        m_cellsInBucket(cellsInBucket)
   {
-    ASSERT_LESS(m_ScaleRange.first, m_ScaleRange.second, ());
+    m_featuresInBucket.resize(m_bucketsCount);
+    m_cellsInBucket.resize(m_bucketsCount);
   }
-
+ 
   template <class TFeature>
   void operator() (TFeature const & f, uint32_t offset) const
   {
-    if (FeatureShouldBeIndexed(f, offset))
+    uint32_t minScale = 0;
+    bool skip = false;
+    m_scalesIdx = 0;
+    for (uint32_t bucket = 0; bucket < m_bucketsCount; ++bucket)
     {
-      vector<int64_t> const cells = covering::CoverFeature(f, m_codingDepth, 250);
-      for (int64_t cell : cells)
-        m_Sorter.Add(CellFeaturePair(cell, offset));
+      // There is a one-to-one correspondence between buckets and scales.
+      // This is not immediately obvious and in fact there was an idea to map
+      // a bucket to a contiguous range of scales.
+      // todo(@pimenov): We probably should remove scale_index.hpp altogether.
+      if (FeatureShouldBeIndexed(f, offset, bucket, skip, minScale))
+      {
+        vector<int64_t> const cells = covering::CoverFeature(f, m_codingDepth, 250);
+        for (int64_t cell : cells)
+          m_Sorter.Add(CellFeatureBucketTuple(CellFeaturePair(cell, offset), bucket));
 
-      ++m_NumFeatures;
+        m_featuresInBucket[bucket] += 1;
+        m_cellsInBucket[bucket] += cells.size();
+      }
     }
   }
 
 private:
   template <class TFeature>
-  bool FeatureShouldBeIndexed(TFeature const & f, uint32_t offset) const
+  bool FeatureShouldBeIndexed(TFeature const & f, uint32_t offset, uint32_t scale, bool & skip,
+                              uint32_t & minScale) const
   {
     // Do index features for the first visible interval only once.
     // If the feature was skipped as empty for the suitable interval,
     // it should be indexed in the next interval where geometry is not empty.
 
+    bool needReset = (scale == 0);
+    while (m_scalesIdx < m_header.GetScalesCount() && m_header.GetScale(m_scalesIdx) < scale)
+    {
+      ++m_scalesIdx;
+      needReset = true;
+    }
+      
+    if (needReset)
+      f.ResetGeometry();
+
     // This function invokes geometry reading for the needed scale.
-    if (f.IsEmptyGeometry(m_ScaleRange.second - 1))
+    if (f.IsEmptyGeometry(scale))
     {
-      m_skipped.insert(offset);
+      skip = true;
       return false;
     }
 
-    // This function assumes that geometry rect for the needed scale is already initialized.
-    uint32_t const minScale = feature::GetMinDrawableScale(f);
-    if (m_ScaleRange.first <= minScale && minScale < m_ScaleRange.second)
+    if (needReset)
     {
-      (void) m_skipped.erase(offset);
+      // This function assumes that geometry rect for the needed scale is already initialized.
+      // Note: it works with FeatureBase so in fact it does not use the information about
+      // the feature's geometry except for the type and the LimitRect.
+      minScale = feature::GetMinDrawableScale(f);
+    }
+    
+    if (minScale == scale)
+    {
+      skip = false;
       return true;
     }
-
-    return (minScale < m_ScaleRange.first && m_skipped.erase(offset) == 1);
+    
+    if (minScale < scale && skip)
+    {
+      skip = false;
+      return true;
+    }
+    return false;
   }
 
+  // We do not need to parse a feature's geometry for every bucket.
+  // The scales at which geometry changes are encoded in the mwm header.
+  // We cannot know them beforehand because they are different for
+  // the common case of a country file and the special case of the world file.
+  // m_scaleIdx is a position in the scales array that should be reset for every feature
+  // and then only move forward. Its purpose is to detect the moments when we
+  // need to reread the feature's geometry.
+  feature::DataHeader const & m_header;
+  mutable size_t m_scalesIdx;
+
+  uint32_t m_bucketsCount;
   SorterT & m_Sorter;
   int m_codingDepth;
-  pair<uint32_t, uint32_t> m_ScaleRange;
-  uint32_t & m_NumFeatures;
+  vector<uint32_t> & m_featuresInBucket;
+  vector<uint32_t> & m_cellsInBucket;
 };
 
 template <class SinkT>
@@ -131,57 +197,75 @@ private:
 };
 
 template <class FeaturesVectorT, class WriterT>
-inline void IndexScales(uint32_t bucketsCount,
-                        int codingScale,
+inline void IndexScales(feature::DataHeader const & header,
                         FeaturesVectorT const & featuresVector,
                         WriterT & writer,
                         string const & tmpFilePrefix)
 {
   // TODO: Make scale bucketing dynamic.
 
-  STATIC_ASSERT(sizeof(CellFeaturePair) == 12);
+  int bucketsCount = header.GetLastScale() + 1;
+
+  string const cells2featureAllBucketsFile =
+      tmpFilePrefix + CELL2FEATURE_SORTED_EXT + ".allbuckets";
+  MY_SCOPE_GUARD(cells2featureAllBucketsFileGuard,
+                 bind(&FileWriter::DeleteFileX, cells2featureAllBucketsFile));
+  {
+    FileWriter cellsToFeaturesAllBucketsWriter(cells2featureAllBucketsFile);
+
+    typedef FileSorter<CellFeatureBucketTuple, WriterFunctor<FileWriter> > SorterType;
+    WriterFunctor<FileWriter> out(cellsToFeaturesAllBucketsWriter);
+    SorterType sorter(1024 * 1024 /* bufferBytes */, tmpFilePrefix + CELL2FEATURE_TMP_EXT, out); 
+    vector<uint32_t> featuresInBucket(bucketsCount);
+    vector<uint32_t> cellsInBucket(bucketsCount);
+    featuresVector.ForEachOffset(FeatureCoverer<SorterType>(header, sorter, featuresInBucket, cellsInBucket));
+    sorter.SortAndFinish();
+    
+    for (uint32_t bucket = 0; bucket < bucketsCount; ++bucket)
+    {
+      uint32_t numCells = cellsInBucket[bucket];
+      uint32_t numFeatures = featuresInBucket[bucket];
+      LOG(LINFO, ("Building scale index for bucket:", bucket));
+      double const cellsPerFeature = numFeatures == 0 ? 0.0 : static_cast<double>(numCells) / static_cast<double>(numFeatures);
+      LOG(LINFO, ("Features:", numFeatures, "cells:", numCells, "cells per feature:", cellsPerFeature));
+    }
+  }
 
-  unordered_set<uint32_t> skipped;
+  FileReader reader(cells2featureAllBucketsFile);
+  DDVector<CellFeatureBucketTuple, FileReader, uint64_t> cellsToFeaturesAllBuckets(reader);
 
   VarSerialVectorWriter<WriterT> recordWriter(writer, bucketsCount);
+  auto it = cellsToFeaturesAllBuckets.begin();
+
   for (uint32_t bucket = 0; bucket < bucketsCount; ++bucket)
   {
-    LOG(LINFO, ("Building scale index for bucket:", bucket));
-
-    uint32_t numFeatures = 0;
     string const cells2featureFile = tmpFilePrefix + CELL2FEATURE_SORTED_EXT;
+    MY_SCOPE_GUARD(cells2featureFileGuard, bind(&FileWriter::DeleteFileX, cells2featureFile));
     {
       FileWriter cellsToFeaturesWriter(cells2featureFile);
-
-      typedef FileSorter<CellFeaturePair, WriterFunctor<FileWriter> > SorterType;
       WriterFunctor<FileWriter> out(cellsToFeaturesWriter);
-      SorterType sorter(1024*1024, tmpFilePrefix + CELL2FEATURE_TMP_EXT, out);
-      featuresVector.ForEachOffset(
-            FeatureCoverer<SorterType>(skipped, bucket, codingScale, sorter, numFeatures));
-      // LOG(LINFO, ("Sorting..."));
-      sorter.SortAndFinish();
+      while (it < cellsToFeaturesAllBuckets.end() && it->GetBucket() == bucket)
+      {
+        out(it->GetCellFeaturePair());
+        ++it;
+      }
     }
 
-    // LOG(LINFO, ("Indexing..."));
     {
       FileReader reader(cells2featureFile);
-      uint64_t const numCells = reader.Size() / sizeof(CellFeaturePair);
       DDVector<CellFeaturePair, FileReader, uint64_t> cellsToFeatures(reader);
-      LOG(LINFO, ("Being indexed", "features:", numFeatures, "cells:", numCells,
-                  "cells per feature:", (numCells + 1.0) / (numFeatures + 1.0)));
       SubWriter<WriterT> subWriter(writer);
-      BuildIntervalIndex(cellsToFeatures.begin(), cellsToFeatures.end(), subWriter,
-                         RectId::DEPTH_LEVELS * 2 + 1);
+      LOG(LINFO, ("Building interval index for bucket:", bucket));
+      BuildIntervalIndex(cellsToFeatures.begin(), cellsToFeatures.end(), subWriter, RectId::DEPTH_LEVELS * 2 + 1);
     }
-
-    FileWriter::DeleteFileX(cells2featureFile);
-    // LOG(LINFO, ("Indexing done."));
     recordWriter.FinishRecord();
   }
 
-  /// @todo Now we can't check this condition here.
-  /// We have stored features that are invisible even on the last zoom for now (coastlines).
-  //CHECK(skipped.empty(), ());
+  // todo(@pimenov). There was an old todo here that said there were
+  // features (coastlines) that have been indexed despite being invisible at the last scale.
+  // This should be impossible but it is better to re-check it.
 
   LOG(LINFO, ("All scale indexes done."));
 }
+
+}  // namespace covering
+\ No newline at end of file
author	Maxim Pimenov <m@maps.me>	2015-06-03 17:35:18 +0300
committer	Alex Zolotarev <alex@maps.me>	2015-09-23 02:52:07 +0300
commit	58fa09b57ec14a0a474e8fa5d476c2a71aaa5874 (patch)
tree	38fc26d81418c30ebd8b2f303bda26902e66f590 /indexer/scale_index_builder.hpp
parent	3f37dc59dcf8277b55ac6da02440185bcfc68c5b (diff)