Mantaflow [Part 1]: Added preprocessed Mantaflow source files

Includes preprocessed Mantaflow source files for both OpenMP and TBB (if OpenMP is not present, TBB files will be used instead). These files come directly from the Mantaflow repository. Future updates to the core fluid solver will take place by updating the files. Reviewed By: sergey, mont29 Maniphest Tasks: T59995 Differential Revision: https://developer.blender.org/D3850
author: Sebastián Barschkis <sebbas@sebbas.org> 2019-12-16 17:40:15 +0300
committer: Sebastián Barschkis <sebbas@sebbas.org> 2019-12-16 18:27:26 +0300
commit: 4ff7c5eed6b546ae42692f3a869a5ccc095a9cb4 (patch)
tree: 03f8233788ae46e66672f711e3cf42d8d00d01cc /extern/mantaflow/preprocessed/multigrid.cpp
parent: 6a3f2b30d206df23120cd212132adea821b6c20e (diff)
1 files changed, 1857 insertions, 0 deletions
diff --git a/extern/mantaflow/preprocessed/multigrid.cpp b/extern/mantaflow/preprocessed/multigrid.cpp
new file mode 100644
index 00000000000..9e35c6f9368
--- /dev/null
+++ b/extern/mantaflow/preprocessed/multigrid.cpp
@@ -0,0 +1,1857 @@
+
+
+// DO NOT EDIT !
+// This file is generated using the MantaFlow preprocessor (prep generate).
+
+/******************************************************************************
+ *
+ * MantaFlow fluid solver framework
+ * Multigrid solver
+ *
+ * This program is free software, distributed under the terms of the
+ * Apache License, Version 2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Copyright 2016, by Florian Ferstl (florian.ferstl.ff@gmail.com)
+ *
+ * This is an implementation of the solver developed by Dick et al. [1]
+ * without topology awareness (= vertex duplication on coarser levels). This
+ * simplification allows us to use regular grids for all levels of the multigrid
+ * hierarchy and works well for moderately complex domains.
+ *
+ * [1] Solving the Fluid Pressure Poisson Equation Using Multigrid-Evaluation
+ *     and Improvements, C. Dick, M. Rogowsky, R. Westermann, IEEE TVCG 2015
+ *
+ ******************************************************************************/
+
+#include "multigrid.h"
+
+#define FOR_LVL(IDX, LVL) for (int IDX = 0; IDX < mb[LVL].size(); IDX++)
+
+#define FOR_VEC_MINMAX(VEC, MIN, MAX) \
+  Vec3i VEC; \
+  const Vec3i VEC##__min = (MIN), VEC##__max = (MAX); \
+  for (VEC.z = VEC##__min.z; VEC.z <= VEC##__max.z; VEC.z++) \
+    for (VEC.y = VEC##__min.y; VEC.y <= VEC##__max.y; VEC.y++) \
+      for (VEC.x = VEC##__min.x; VEC.x <= VEC##__max.x; VEC.x++)
+
+#define FOR_VECLIN_MINMAX(VEC, LIN, MIN, MAX) \
+  Vec3i VEC; \
+  int LIN = 0; \
+  const Vec3i VEC##__min = (MIN), VEC##__max = (MAX); \
+  for (VEC.z = VEC##__min.z; VEC.z <= VEC##__max.z; VEC.z++) \
+    for (VEC.y = VEC##__min.y; VEC.y <= VEC##__max.y; VEC.y++) \
+      for (VEC.x = VEC##__min.x; VEC.x <= VEC##__max.x; VEC.x++, LIN++)
+
+#define MG_TIMINGS(X)
+//#define MG_TIMINGS(X) X
+
+using namespace std;
+namespace Manta {
+
+// Helper class for calling mantaflow kernels with a specific number of threads
+class ThreadSize {
+  IndexInt s;
+
+ public:
+  ThreadSize(IndexInt _s)
+  {
+    s = _s;
+  }
+  IndexInt size()
+  {
+    return s;
+  }
+};
+
+// ----------------------------------------------------------------------------
+// Efficient min heap for <ID, key> pairs with 0<=ID<N and 0<=key<K
+// (elements are stored in K buckets, where each bucket is a doubly linked list).
+// - if K<<N, all ops are O(1) on avg (worst case O(K)).
+// - memory usage O(K+N): (K+N) * 3 * sizeof(int).
+class NKMinHeap {
+ private:
+  struct Entry {
+    int key, prev, next;
+    Entry() : key(-1), prev(-1), next(-1)
+    {
+    }
+  };
+
+  int mN, mK, mSize, mMinKey;
+
+  // Double linked lists of IDs, one for each bucket/key.
+  // The first K entries are the buckets' head pointers,
+  // and the last N entries correspond to the IDs.
+  std::vector<Entry> mEntries;
+
+ public:
+  NKMinHeap(int N, int K) : mN(N), mK(K), mSize(0), mMinKey(-1), mEntries(N + K)
+  {
+  }
+
+  int size()
+  {
+    return mSize;
+  }
+  int getKey(int ID)
+  {
+    return mEntries[mK + ID].key;
+  }
+
+  // Insert, decrease or increase key (or delete by setting key to -1)
+  void setKey(int ID, int key);
+
+  // peek min key (returns ID/key pair)
+  std::pair<int, int> peekMin();
+
+  // pop min key (returns ID/key pair)
+  std::pair<int, int> popMin();
+
+  void print();  // for debugging
+};
+
+void NKMinHeap::setKey(int ID, int key)
+{
+  assertMsg(0 <= ID && ID < mN, "NKMinHeap::setKey: ID out of range");
+  assertMsg(-1 <= key && key < mK, "NKMinHeap::setKey: key out of range");
+
+  const int kid = mK + ID;
+
+  if (mEntries[kid].key == key)
+    return;  // nothing changes
+
+  // remove from old key-list if ID existed previously
+  if (mEntries[kid].key != -1) {
+    int pred = mEntries[kid].prev;
+    int succ = mEntries[kid].next;  // can be -1
+
+    mEntries[pred].next = succ;
+    if (succ != -1)
+      mEntries[succ].prev = pred;
+
+    // if removed key was minimum key, mMinKey may need to be updated
+    int removedKey = mEntries[kid].key;
+    if (removedKey == mMinKey) {
+      if (mSize == 1) {
+        mMinKey = -1;
+      }
+      else {
+        for (; mMinKey < mK; mMinKey++) {
+          if (mEntries[mMinKey].next != -1)
+            break;
+        }
+      }
+    }
+
+    mSize--;
+  }
+
+  // set new key of ID
+  mEntries[kid].key = key;
+
+  if (key == -1) {
+    // finished if key was set to -1
+    mEntries[kid].next = mEntries[kid].prev = -1;
+    return;
+  }
+
+  // add key
+  mSize++;
+  if (mMinKey == -1)
+    mMinKey = key;
+  else
+    mMinKey = std::min(mMinKey, key);
+
+  // insert into new key-list (headed by mEntries[key])
+  int tmp = mEntries[key].next;
+
+  mEntries[key].next = kid;
+  mEntries[kid].prev = key;
+
+  mEntries[kid].next = tmp;
+  if (tmp != -1)
+    mEntries[tmp].prev = kid;
+}
+
+std::pair<int, int> NKMinHeap::peekMin()
+{
+  if (mSize == 0)
+    return std::pair<int, int>(-1, -1);  // error
+
+  const int ID = mEntries[mMinKey].next - mK;
+  return std::pair<int, int>(ID, mMinKey);
+}
+
+std::pair<int, int> NKMinHeap::popMin()
+{
+  if (mSize == 0)
+    return std::pair<int, int>(-1, -1);  // error
+
+  const int kid = mEntries[mMinKey].next;
+  const int ID = kid - mK;
+  const int key = mMinKey;
+
+  // remove from key-list
+  int pred = mEntries[kid].prev;
+  int succ = mEntries[kid].next;  // can be -1
+
+  mEntries[pred].next = succ;
+  if (succ != -1)
+    mEntries[succ].prev = pred;
+
+  // remove entry
+  mEntries[kid] = Entry();
+  mSize--;
+
+  // update mMinKey
+  if (mSize == 0) {
+    mMinKey = -1;
+  }
+  else {
+    for (; mMinKey < mK; mMinKey++) {
+      if (mEntries[mMinKey].next != -1)
+        break;
+    }
+  }
+
+  // return result
+  return std::pair<int, int>(ID, key);
+}
+
+void NKMinHeap::print()
+{
+  std::cout << "Size: " << mSize << ", MinKey: " << mMinKey << std::endl;
+  for (int key = 0; key < mK; key++) {
+    if (mEntries[key].next != -1) {
+      std::cout << "Key " << key << ": ";
+      int kid = mEntries[key].next;
+      while (kid != -1) {
+        std::cout << kid - mK << " ";
+        kid = mEntries[kid].next;
+      }
+      std::cout << std::endl;
+    }
+  }
+  std::cout << std::endl;
+}
+
+// ----------------------------------------------------------------------------
+// GridMg methods
+//
+// Illustration of 27-point stencil indices
+// y     | z = -1    z = 0      z = 1
+// ^     | 6  7  8,  15 16 17,  24 25 26
+// |     | 3  4  5,  12 13 14,  21 22 23
+// o-> x | 0  1  2,   9 10 11,  18 19 20
+//
+// Symmetric storage with only 14 entries per vertex
+// y     | z = -1    z = 0      z = 1
+// ^     | -  -  -,   2  3  4,  11 12 13
+// |     | -  -  -,   -  0  1,   8  9 10
+// o-> x | -  -  -,   -  -  -,   5  6  7
+
+GridMg::GridMg(const Vec3i &gridSize)
+    : mNumPreSmooth(1),
+      mNumPostSmooth(1),
+      mCoarsestLevelAccuracy(Real(1E-8)),
+      mTrivialEquationScale(Real(1E-6)),
+      mIsASet(false),
+      mIsRhsSet(false)
+{
+  MG_TIMINGS(MuTime time;)
+
+  // 2D or 3D mode
+  mIs3D = (gridSize.z > 1);
+  mDim = mIs3D ? 3 : 2;
+  mStencilSize = mIs3D ? 14 : 5;  // A has a full 27-point stencil on levels > 0
+  mStencilSize0 = mIs3D ? 4 : 3;  // A has a 7-point stencil on level 0
+  mStencilMin = Vec3i(-1, -1, mIs3D ? -1 : 0);
+  mStencilMax = Vec3i(1, 1, mIs3D ? 1 : 0);
+
+  // Create level 0 (=original grid)
+  mSize.push_back(gridSize);
+  mPitch.push_back(Vec3i(1, mSize.back().x, mSize.back().x * mSize.back().y));
+  int n = mSize.back().x * mSize.back().y * mSize.back().z;
+
+  mA.push_back(std::vector<Real>(n * mStencilSize0));
+  mx.push_back(std::vector<Real>(n));
+  mb.push_back(std::vector<Real>(n));
+  mr.push_back(std::vector<Real>(n));
+  mType.push_back(std::vector<VertexType>(n));
+  mCGtmp1.push_back(std::vector<double>());
+  mCGtmp2.push_back(std::vector<double>());
+  mCGtmp3.push_back(std::vector<double>());
+  mCGtmp4.push_back(std::vector<double>());
+
+  debMsg("GridMg::GridMg level 0: " << mSize[0].x << " x " << mSize[0].y << " x " << mSize[0].z
+                                    << " x ",
+         2);
+
+  // Create coarse levels >0
+  for (int l = 1; l <= 100; l++) {
+    if (mSize[l - 1].x <= 5 && mSize[l - 1].y <= 5 && mSize[l - 1].z <= 5)
+      break;
+    if (n <= 1000)
+      break;
+
+    mSize.push_back((mSize[l - 1] + 2) / 2);
+    mPitch.push_back(Vec3i(1, mSize.back().x, mSize.back().x * mSize.back().y));
+    n = mSize.back().x * mSize.back().y * mSize.back().z;
+
+    mA.push_back(std::vector<Real>(n * mStencilSize));
+    mx.push_back(std::vector<Real>(n));
+    mb.push_back(std::vector<Real>(n));
+    mr.push_back(std::vector<Real>(n));
+    mType.push_back(std::vector<VertexType>(n));
+    mCGtmp1.push_back(std::vector<double>());
+    mCGtmp2.push_back(std::vector<double>());
+    mCGtmp3.push_back(std::vector<double>());
+    mCGtmp4.push_back(std::vector<double>());
+
+    debMsg("GridMg::GridMg level " << l << ": " << mSize[l].x << " x " << mSize[l].y << " x "
+                                   << mSize[l].z << " x ",
+           2);
+  }
+
+  // Additional memory for CG on coarsest level
+  mCGtmp1.back() = std::vector<double>(n);
+  mCGtmp2.back() = std::vector<double>(n);
+  mCGtmp3.back() = std::vector<double>(n);
+  mCGtmp4.back() = std::vector<double>(n);
+
+  MG_TIMINGS(debMsg("GridMg: Allocation done in " << time.update(), 1);)
+
+  // Precalculate coarsening paths:
+  // (V) <--restriction-- (U) <--A_{l-1}-- (W) <--interpolation-- (N)
+  Vec3i p7stencil[7] = {Vec3i(0, 0, 0),
+                        Vec3i(-1, 0, 0),
+                        Vec3i(1, 0, 0),
+                        Vec3i(0, -1, 0),
+                        Vec3i(0, 1, 0),
+                        Vec3i(0, 0, -1),
+                        Vec3i(0, 0, 1)};
+  Vec3i V(1, 1, 1);  // reference coarse grid vertex at (1,1,1)
+  FOR_VEC_MINMAX(U, V * 2 + mStencilMin, V * 2 + mStencilMax)
+  {
+    for (int i = 0; i < 1 + 2 * mDim; i++) {
+      Vec3i W = U + p7stencil[i];
+      FOR_VEC_MINMAX(N, W / 2, (W + 1) / 2)
+      {
+        int s = dot(N, Vec3i(1, 3, 9));
+
+        if (s >= 13) {
+          CoarseningPath path;
+          path.N = N - 1;                  // offset of N on coarse grid
+          path.U = U - V * 2;              // offset of U on fine grid
+          path.W = W - V * 2;              // offset of W on fine grid
+          path.sc = s - 13;                // stencil index corresponding to V<-N on coarse grid
+          path.sf = (i + 1) / 2;           // stencil index corresponding to U<-W on coarse grid
+          path.inUStencil = (i % 2 == 0);  // fine grid stencil entry stored at U or W?
+          path.rw = Real(1) /
+                    Real(1 << ((U.x % 2) + (U.y % 2) + (U.z % 2)));  // restriction weight V<-U
+          path.iw = Real(1) /
+                    Real(1 << ((W.x % 2) + (W.y % 2) + (W.z % 2)));  // interpolation weight W<-N
+          mCoarseningPaths0.push_back(path);
+        }
+      }
+    }
+  }
+
+  auto pathLess = [](const GridMg::CoarseningPath &p1, const GridMg::CoarseningPath &p2) {
+    if (p1.sc == p2.sc)
+      return dot(p1.U + 1, Vec3i(1, 3, 9)) < dot(p2.U + 1, Vec3i(1, 3, 9));
+    return p1.sc < p2.sc;
+  };
+  std::sort(mCoarseningPaths0.begin(), mCoarseningPaths0.end(), pathLess);
+}
+
+void GridMg::analyzeStencil(int v,
+                            bool is3D,
+                            bool &isStencilSumNonZero,
+                            bool &isEquationTrivial) const
+{
+  Vec3i V = vecIdx(v, 0);
+
+  // collect stencil entries
+  Real A[7];
+  A[0] = mA[0][v * mStencilSize0 + 0];
+  A[1] = mA[0][v * mStencilSize0 + 1];
+  A[2] = mA[0][v * mStencilSize0 + 2];
+  A[3] = is3D ? mA[0][v * mStencilSize0 + 3] : Real(0);
+  A[4] = V.x != 0 ? mA[0][(v - mPitch[0].x) * mStencilSize0 + 1] : Real(0);
+  A[5] = V.y != 0 ? mA[0][(v - mPitch[0].y) * mStencilSize0 + 2] : Real(0);
+  A[6] = V.z != 0 && is3D ? mA[0][(v - mPitch[0].z) * mStencilSize0 + 3] : Real(0);
+
+  // compute sum of stencil entries
+  Real stencilMax = Real(0), stencilSum = Real(0);
+  for (int i = 0; i < 7; i++) {
+    stencilSum += A[i];
+    stencilMax = max(stencilMax, std::abs(A[i]));
+  }
+
+  // check if sum is numerically zero
+  isStencilSumNonZero = std::abs(stencilSum / stencilMax) > Real(1E-6);
+
+  // check for trivial equation (exact comparisons)
+  isEquationTrivial = A[0] == Real(1) && A[1] == Real(0) && A[2] == Real(0) && A[3] == Real(0) &&
+                      A[4] == Real(0) && A[5] == Real(0) && A[6] == Real(0);
+}
+
+struct knCopyA : public KernelBase {
+  knCopyA(std::vector<Real> &sizeRef,
+          std::vector<Real> &A0,
+          int stencilSize0,
+          bool is3D,
+          const Grid<Real> *pA0,
+          const Grid<Real> *pAi,
+          const Grid<Real> *pAj,
+          const Grid<Real> *pAk)
+      : KernelBase(sizeRef.size()),
+        sizeRef(sizeRef),
+        A0(A0),
+        stencilSize0(stencilSize0),
+        is3D(is3D),
+        pA0(pA0),
+        pAi(pAi),
+        pAj(pAj),
+        pAk(pAk)
+  {
+    runMessage();
+    run();
+  }
+  inline void op(IndexInt idx,
+                 std::vector<Real> &sizeRef,
+                 std::vector<Real> &A0,
+                 int stencilSize0,
+                 bool is3D,
+                 const Grid<Real> *pA0,
+                 const Grid<Real> *pAi,
+                 const Grid<Real> *pAj,
+                 const Grid<Real> *pAk) const
+  {
+    A0[idx * stencilSize0 + 0] = (*pA0)[idx];
+    A0[idx * stencilSize0 + 1] = (*pAi)[idx];
+    A0[idx * stencilSize0 + 2] = (*pAj)[idx];
+    if (is3D)
+      A0[idx * stencilSize0 + 3] = (*pAk)[idx];
+  }
+  inline std::vector<Real> &getArg0()
+  {
+    return sizeRef;
+  }
+  typedef std::vector<Real> type0;
+  inline std::vector<Real> &getArg1()
+  {
+    return A0;
+  }
+  typedef std::vector<Real> type1;
+  inline int &getArg2()
+  {
+    return stencilSize0;
+  }
+  typedef int type2;
+  inline bool &getArg3()
+  {
+    return is3D;
+  }
+  typedef bool type3;
+  inline const Grid<Real> *getArg4()
+  {
+    return pA0;
+  }
+  typedef Grid<Real> type4;
+  inline const Grid<Real> *getArg5()
+  {
+    return pAi;
+  }
+  typedef Grid<Real> type5;
+  inline const Grid<Real> *getArg6()
+  {
+    return pAj;
+  }
+  typedef Grid<Real> type6;
+  inline const Grid<Real> *getArg7()
+  {
+    return pAk;
+  }
+  typedef Grid<Real> type7;
+  void runMessage()
+  {
+    debMsg("Executing kernel knCopyA ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
+      op(idx, sizeRef, A0, stencilSize0, is3D, pA0, pAi, pAj, pAk);
+  }
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+  std::vector<Real> &sizeRef;
+  std::vector<Real> &A0;
+  int stencilSize0;
+  bool is3D;
+  const Grid<Real> *pA0;
+  const Grid<Real> *pAi;
+  const Grid<Real> *pAj;
+  const Grid<Real> *pAk;
+};
+
+struct knActivateVertices : public KernelBase {
+  knActivateVertices(std::vector<GridMg::VertexType> &type_0,
+                     std::vector<Real> &A0,
+                     bool &nonZeroStencilSumFound,
+                     bool &trivialEquationsFound,
+                     const GridMg &mg)
+      : KernelBase(type_0.size()),
+        type_0(type_0),
+        A0(A0),
+        nonZeroStencilSumFound(nonZeroStencilSumFound),
+        trivialEquationsFound(trivialEquationsFound),
+        mg(mg)
+  {
+    runMessage();
+    run();
+  }
+  inline void op(IndexInt idx,
+                 std::vector<GridMg::VertexType> &type_0,
+                 std::vector<Real> &A0,
+                 bool &nonZeroStencilSumFound,
+                 bool &trivialEquationsFound,
+                 const GridMg &mg) const
+  {
+    // active vertices on level 0 are vertices with non-zero diagonal entry in A
+    type_0[idx] = GridMg::vtInactive;
+
+    if (mg.mA[0][idx * mg.mStencilSize0 + 0] != Real(0)) {
+      type_0[idx] = GridMg::vtActive;
+
+      bool isStencilSumNonZero = false, isEquationTrivial = false;
+      mg.analyzeStencil(int(idx), mg.mIs3D, isStencilSumNonZero, isEquationTrivial);
+
+      // Note: nonZeroStencilSumFound and trivialEquationsFound are only
+      // changed from false to true, and hence there are no race conditions.
+      if (isStencilSumNonZero)
+        nonZeroStencilSumFound = true;
+
+      // scale down trivial equations
+      if (isEquationTrivial) {
+        type_0[idx] = GridMg::vtActiveTrivial;
+        A0[idx * mg.mStencilSize0 + 0] *= mg.mTrivialEquationScale;
+        trivialEquationsFound = true;
+      };
+    }
+  }
+  inline std::vector<GridMg::VertexType> &getArg0()
+  {
+    return type_0;
+  }
+  typedef std::vector<GridMg::VertexType> type0;
+  inline std::vector<Real> &getArg1()
+  {
+    return A0;
+  }
+  typedef std::vector<Real> type1;
+  inline bool &getArg2()
+  {
+    return nonZeroStencilSumFound;
+  }
+  typedef bool type2;
+  inline bool &getArg3()
+  {
+    return trivialEquationsFound;
+  }
+  typedef bool type3;
+  inline const GridMg &getArg4()
+  {
+    return mg;
+  }
+  typedef GridMg type4;
+  void runMessage()
+  {
+    debMsg("Executing kernel knActivateVertices ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
+      op(idx, type_0, A0, nonZeroStencilSumFound, trivialEquationsFound, mg);
+  }
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+  std::vector<GridMg::VertexType> &type_0;
+  std::vector<Real> &A0;
+  bool &nonZeroStencilSumFound;
+  bool &trivialEquationsFound;
+  const GridMg &mg;
+};
+
+void GridMg::setA(const Grid<Real> *pA0,
+                  const Grid<Real> *pAi,
+                  const Grid<Real> *pAj,
+                  const Grid<Real> *pAk)
+{
+  MG_TIMINGS(MuTime time;)
+
+  // Copy level 0
+  knCopyA(mx[0], mA[0], mStencilSize0, mIs3D, pA0, pAi, pAj, pAk);
+
+  // Determine active vertices and scale trivial equations
+  bool nonZeroStencilSumFound = false;
+  bool trivialEquationsFound = false;
+
+  knActivateVertices(mType[0], mA[0], nonZeroStencilSumFound, trivialEquationsFound, *this);
+
+  if (trivialEquationsFound)
+    debMsg("GridMg::setA: Found at least one trivial equation", 2);
+
+  // Sanity check: if all rows of A sum up to 0 --> A doesn't have full rank (opposite direction
+  // isn't necessarily true)
+  if (!nonZeroStencilSumFound)
+    debMsg(
+        "GridMg::setA: Found constant mode: A*1=0! A does not have full rank and multigrid may "
+        "not converge. (forgot to fix a pressure value?)",
+        1);
+
+  // Create coarse grids and operators on levels >0
+  for (int l = 1; l < mA.size(); l++) {
+    MG_TIMINGS(time.get();)
+    genCoarseGrid(l);
+    MG_TIMINGS(debMsg("GridMg: Generated level " << l << " in " << time.update(), 1);)
+    genCoraseGridOperator(l);
+    MG_TIMINGS(debMsg("GridMg: Generated operator " << l << " in " << time.update(), 1);)
+  }
+
+  mIsASet = true;
+  mIsRhsSet = false;  // invalidate rhs
+}
+
+struct knSetRhs : public KernelBase {
+  knSetRhs(std::vector<Real> &b, const Grid<Real> &rhs, const GridMg &mg)
+      : KernelBase(b.size()), b(b), rhs(rhs), mg(mg)
+  {
+    runMessage();
+    run();
+  }
+  inline void op(IndexInt idx, std::vector<Real> &b, const Grid<Real> &rhs, const GridMg &mg) const
+  {
+    b[idx] = rhs[idx];
+
+    // scale down trivial equations
+    if (mg.mType[0][idx] == GridMg::vtActiveTrivial) {
+      b[idx] *= mg.mTrivialEquationScale;
+    };
+  }
+  inline std::vector<Real> &getArg0()
+  {
+    return b;
+  }
+  typedef std::vector<Real> type0;
+  inline const Grid<Real> &getArg1()
+  {
+    return rhs;
+  }
+  typedef Grid<Real> type1;
+  inline const GridMg &getArg2()
+  {
+    return mg;
+  }
+  typedef GridMg type2;
+  void runMessage()
+  {
+    debMsg("Executing kernel knSetRhs ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
+      op(idx, b, rhs, mg);
+  }
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+  std::vector<Real> &b;
+  const Grid<Real> &rhs;
+  const GridMg &mg;
+};
+
+void GridMg::setRhs(const Grid<Real> &rhs)
+{
+  assertMsg(mIsASet, "GridMg::setRhs Error: A has not been set.");
+
+  knSetRhs(mb[0], rhs, *this);
+
+  mIsRhsSet = true;
+}
+
+template<class T> struct knSet : public KernelBase {
+  knSet(std::vector<T> &data, T value) : KernelBase(data.size()), data(data), value(value)
+  {
+    runMessage();
+    run();
+  }
+  inline void op(IndexInt idx, std::vector<T> &data, T value) const
+  {
+    data[idx] = value;
+  }
+  inline std::vector<T> &getArg0()
+  {
+    return data;
+  }
+  typedef std::vector<T> type0;
+  inline T &getArg1()
+  {
+    return value;
+  }
+  typedef T type1;
+  void runMessage()
+  {
+    debMsg("Executing kernel knSet ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
+      op(idx, data, value);
+  }
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+  std::vector<T> &data;
+  T value;
+};
+
+template<class T> struct knCopyToVector : public KernelBase {
+  knCopyToVector(std::vector<T> &dst, const Grid<T> &src)
+      : KernelBase(dst.size()), dst(dst), src(src)
+  {
+    runMessage();
+    run();
+  }
+  inline void op(IndexInt idx, std::vector<T> &dst, const Grid<T> &src) const
+  {
+    dst[idx] = src[idx];
+  }
+  inline std::vector<T> &getArg0()
+  {
+    return dst;
+  }
+  typedef std::vector<T> type0;
+  inline const Grid<T> &getArg1()
+  {
+    return src;
+  }
+  typedef Grid<T> type1;
+  void runMessage()
+  {
+    debMsg("Executing kernel knCopyToVector ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
+      op(idx, dst, src);
+  }
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+  std::vector<T> &dst;
+  const Grid<T> &src;
+};
+
+template<class T> struct knCopyToGrid : public KernelBase {
+  knCopyToGrid(const std::vector<T> &src, Grid<T> &dst)
+      : KernelBase(src.size()), src(src), dst(dst)
+  {
+    runMessage();
+    run();
+  }
+  inline void op(IndexInt idx, const std::vector<T> &src, Grid<T> &dst) const
+  {
+    dst[idx] = src[idx];
+  }
+  inline const std::vector<T> &getArg0()
+  {
+    return src;
+  }
+  typedef std::vector<T> type0;
+  inline Grid<T> &getArg1()
+  {
+    return dst;
+  }
+  typedef Grid<T> type1;
+  void runMessage()
+  {
+    debMsg("Executing kernel knCopyToGrid ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
+      op(idx, src, dst);
+  }
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+  const std::vector<T> &src;
+  Grid<T> &dst;
+};
+
+template<class T> struct knAddAssign : public KernelBase {
+  knAddAssign(std::vector<T> &dst, const std::vector<T> &src)
+      : KernelBase(dst.size()), dst(dst), src(src)
+  {
+    runMessage();
+    run();
+  }
+  inline void op(IndexInt idx, std::vector<T> &dst, const std::vector<T> &src) const
+  {
+    dst[idx] += src[idx];
+  }
+  inline std::vector<T> &getArg0()
+  {
+    return dst;
+  }
+  typedef std::vector<T> type0;
+  inline const std::vector<T> &getArg1()
+  {
+    return src;
+  }
+  typedef std::vector<T> type1;
+  void runMessage()
+  {
+    debMsg("Executing kernel knAddAssign ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
+      op(idx, dst, src);
+  }
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+  std::vector<T> &dst;
+  const std::vector<T> &src;
+};
+
+Real GridMg::doVCycle(Grid<Real> &dst, const Grid<Real> *src)
+{
+  MG_TIMINGS(MuTime timeSmooth; MuTime timeCG; MuTime timeI; MuTime timeR; MuTime timeTotal;
+             MuTime time;)
+  MG_TIMINGS(timeSmooth.clear(); timeCG.clear(); timeI.clear(); timeR.clear();)
+
+  assertMsg(mIsASet && mIsRhsSet, "GridMg::doVCycle Error: A and/or rhs have not been set.");
+
+  const int maxLevel = int(mA.size()) - 1;
+
+  if (src) {
+    knCopyToVector<Real>(mx[0], *src);
+  }
+  else {
+    knSet<Real>(mx[0], Real(0));
+  }
+
+  for (int l = 0; l < maxLevel; l++) {
+    MG_TIMINGS(time.update();)
+    for (int i = 0; i < mNumPreSmooth; i++) {
+      smoothGS(l, false);
+    }
+
+    MG_TIMINGS(timeSmooth += time.update();)
+
+    calcResidual(l);
+    restrict(l + 1, mr[l], mb[l + 1]);
+
+    knSet<Real>(mx[l + 1], Real(0));
+
+    MG_TIMINGS(timeR += time.update();)
+  }
+
+  MG_TIMINGS(time.update();)
+  solveCG(maxLevel);
+  MG_TIMINGS(timeCG += time.update();)
+
+  for (int l = maxLevel - 1; l >= 0; l--) {
+    MG_TIMINGS(time.update();)
+    interpolate(l, mx[l + 1], mr[l]);
+
+    knAddAssign<Real>(mx[l], mr[l]);
+
+    MG_TIMINGS(timeI += time.update();)
+
+    for (int i = 0; i < mNumPostSmooth; i++) {
+      smoothGS(l, true);
+    }
+    MG_TIMINGS(timeSmooth += time.update();)
+  }
+
+  calcResidual(0);
+  Real res = calcResidualNorm(0);
+
+  knCopyToGrid<Real>(mx[0], dst);
+
+  MG_TIMINGS(debMsg("GridMg: Finished VCycle in "
+                        << timeTotal.update() << " (smoothing: " << timeSmooth
+                        << ", CG: " << timeCG << ", R: " << timeR << ", I: " << timeI << ")",
+                    1);)
+
+  return res;
+}
+
+struct knActivateCoarseVertices : public KernelBase {
+  knActivateCoarseVertices(std::vector<GridMg::VertexType> &type, int unused)
+      : KernelBase(type.size()), type(type), unused(unused)
+  {
+    runMessage();
+    run();
+  }
+  inline void op(IndexInt idx, std::vector<GridMg::VertexType> &type, int unused) const
+  {
+    // set all remaining 'free' vertices to 'removed',
+    if (type[idx] == GridMg::vtFree)
+      type[idx] = GridMg::vtRemoved;
+
+    // then convert 'zero' vertices to 'active' and 'removed' vertices to 'inactive'
+    if (type[idx] == GridMg::vtZero)
+      type[idx] = GridMg::vtActive;
+    if (type[idx] == GridMg::vtRemoved)
+      type[idx] = GridMg::vtInactive;
+  }
+  inline std::vector<GridMg::VertexType> &getArg0()
+  {
+    return type;
+  }
+  typedef std::vector<GridMg::VertexType> type0;
+  inline int &getArg1()
+  {
+    return unused;
+  }
+  typedef int type1;
+  void runMessage()
+  {
+    debMsg("Executing kernel knActivateCoarseVertices ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
+      op(idx, type, unused);
+  }
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+  std::vector<GridMg::VertexType> &type;
+  int unused;
+};
+
+// Determine active cells on coarse level l from active cells on fine level l-1
+// while ensuring a full-rank interpolation operator (see Section 3.3 in [1]).
+void GridMg::genCoarseGrid(int l)
+{
+  // AF_Free: unused/untouched vertices
+  // AF_Zero: vertices selected for coarser level
+  // AF_Removed: vertices removed from coarser level
+  enum activeFlags : char { AF_Removed = 0, AF_Zero = 1, AF_Free = 2 };
+
+  // initialize all coarse vertices with 'free'
+  knSet<VertexType>(mType[l], vtFree);
+
+  // initialize min heap of (ID: fine grid vertex, key: #free interpolation vertices) pairs
+  NKMinHeap heap(int(mb[l - 1].size()),
+                 mIs3D ? 9 : 5);  // max 8 (or 4 in 2D) free interpolation vertices
+
+  FOR_LVL(v, l - 1)
+  {
+    if (mType[l - 1][v] != vtInactive) {
+      Vec3i V = vecIdx(v, l - 1);
+      int fiv = 1 << ((V.x % 2) + (V.y % 2) + (V.z % 2));
+      heap.setKey(v, fiv);
+    }
+  }
+
+  // process fine vertices in heap consecutively, always choosing the vertex with
+  // the currently smallest number of free interpolation vertices
+  while (heap.size() > 0) {
+    int v = heap.popMin().first;
+    Vec3i V = vecIdx(v, l - 1);
+
+    // loop over associated interpolation vertices of V on coarse level l:
+    // the first encountered 'free' vertex is set to 'zero',
+    // all remaining 'free' vertices are set to 'removed'.
+    bool vdone = false;
+
+    FOR_VEC_MINMAX(I, V / 2, (V + 1) / 2)
+    {
+      int i = linIdx(I, l);
+
+      if (mType[l][i] == vtFree) {
+        if (vdone) {
+          mType[l][i] = vtRemoved;
+        }
+        else {
+          mType[l][i] = vtZero;
+          vdone = true;
+        }
+
+        // update #free interpolation vertices in heap:
+        // loop over all associated restriction vertices of I on fine level l-1
+        FOR_VEC_MINMAX(R, vmax(0, I * 2 - 1), vmin(mSize[l - 1] - 1, I * 2 + 1))
+        {
+          int r = linIdx(R, l - 1);
+          int key = heap.getKey(r);
+
+          if (key > 1) {
+            heap.setKey(r, key - 1);
+          }  // decrease key of r
+          else if (key > -1) {
+            heap.setKey(r, -1);
+          }  // removes r from heap
+        }
+      }
+    }
+  }
+
+  knActivateCoarseVertices(mType[l], 0);
+}
+
+struct knGenCoarseGridOperator : public KernelBase {
+  knGenCoarseGridOperator(std::vector<Real> &sizeRef,
+                          std::vector<Real> &A,
+                          int l,
+                          const GridMg &mg)
+      : KernelBase(sizeRef.size()), sizeRef(sizeRef), A(A), l(l), mg(mg)
+  {
+    runMessage();
+    run();
+  }
+  inline void op(IndexInt idx,
+                 std::vector<Real> &sizeRef,
+                 std::vector<Real> &A,
+                 int l,
+                 const GridMg &mg) const
+  {
+    if (mg.mType[l][idx] == GridMg::vtInactive)
+      return;
+
+    for (int i = 0; i < mg.mStencilSize; i++) {
+      A[idx * mg.mStencilSize + i] = Real(0);
+    }  // clear stencil
+
+    Vec3i V = mg.vecIdx(int(idx), l);
+
+    // Calculate the stencil of A_l at V by considering all vertex paths of the form:
+    // (V) <--restriction-- (U) <--A_{l-1}-- (W) <--interpolation-- (N)
+    // V and N are vertices on the coarse grid level l,
+    // U and W are vertices on the fine grid level l-1.
+
+    if (l == 1) {
+      // loop over precomputed paths
+      for (auto it = mg.mCoarseningPaths0.begin(); it != mg.mCoarseningPaths0.end(); it++) {
+        Vec3i N = V + it->N;
+        int n = mg.linIdx(N, l);
+        if (!mg.inGrid(N, l) || mg.mType[l][n] == GridMg::vtInactive)
+          continue;
+
+        Vec3i U = V * 2 + it->U;
+        int u = mg.linIdx(U, l - 1);
+        if (!mg.inGrid(U, l - 1) || mg.mType[l - 1][u] == GridMg::vtInactive)
+          continue;
+
+        Vec3i W = V * 2 + it->W;
+        int w = mg.linIdx(W, l - 1);
+        if (!mg.inGrid(W, l - 1) || mg.mType[l - 1][w] == GridMg::vtInactive)
+          continue;
+
+        if (it->inUStencil) {
+          A[idx * mg.mStencilSize + it->sc] += it->rw *
+                                               mg.mA[l - 1][u * mg.mStencilSize0 + it->sf] *
+                                               it->iw;
+        }
+        else {
+          A[idx * mg.mStencilSize + it->sc] += it->rw *
+                                               mg.mA[l - 1][w * mg.mStencilSize0 + it->sf] *
+                                               it->iw;
+        }
+      }
+    }
+    else {
+      // l > 1:
+      // loop over restriction vertices U on level l-1 associated with V
+      FOR_VEC_MINMAX(U, vmax(0, V * 2 - 1), vmin(mg.mSize[l - 1] - 1, V * 2 + 1))
+      {
+        int u = mg.linIdx(U, l - 1);
+        if (mg.mType[l - 1][u] == GridMg::vtInactive)
+          continue;
+
+        // restriction weight
+        Real rw = Real(1) / Real(1 << ((U.x % 2) + (U.y % 2) + (U.z % 2)));
+
+        // loop over all stencil neighbors N of V on level l that can be reached via restriction to
+        // U
+        FOR_VEC_MINMAX(N, (U - 1) / 2, vmin(mg.mSize[l] - 1, (U + 2) / 2))
+        {
+          int n = mg.linIdx(N, l);
+          if (mg.mType[l][n] == GridMg::vtInactive)
+            continue;
+
+          // stencil entry at V associated to N (coarse grid level l)
+          Vec3i SC = N - V + mg.mStencilMax;
+          int sc = SC.x + 3 * SC.y + 9 * SC.z;
+          if (sc < mg.mStencilSize - 1)
+            continue;
+
+          // loop over all vertices W which are in the stencil of A_{l-1} at U
+          // and which interpolate from N
+          FOR_VEC_MINMAX(W,
+                         vmax(0, vmax(U - 1, N * 2 - 1)),
+                         vmin(mg.mSize[l - 1] - 1, vmin(U + 1, N * 2 + 1)))
+          {
+            int w = mg.linIdx(W, l - 1);
+            if (mg.mType[l - 1][w] == GridMg::vtInactive)
+              continue;
+
+            // stencil entry at U associated to W (fine grid level l-1)
+            Vec3i SF = W - U + mg.mStencilMax;
+            int sf = SF.x + 3 * SF.y + 9 * SF.z;
+
+            Real iw = Real(1) /
+                      Real(1 << ((W.x % 2) + (W.y % 2) + (W.z % 2)));  // interpolation weight
+
+            if (sf < mg.mStencilSize) {
+              A[idx * mg.mStencilSize + sc - mg.mStencilSize + 1] +=
+                  rw * mg.mA[l - 1][w * mg.mStencilSize + mg.mStencilSize - 1 - sf] * iw;
+            }
+            else {
+              A[idx * mg.mStencilSize + sc - mg.mStencilSize + 1] +=
+                  rw * mg.mA[l - 1][u * mg.mStencilSize + sf - mg.mStencilSize + 1] * iw;
+            }
+          }
+        }
+      }
+    }
+  }
+  inline std::vector<Real> &getArg0()
+  {
+    return sizeRef;
+  }
+  typedef std::vector<Real> type0;
+  inline std::vector<Real> &getArg1()
+  {
+    return A;
+  }
+  typedef std::vector<Real> type1;
+  inline int &getArg2()
+  {
+    return l;
+  }
+  typedef int type2;
+  inline const GridMg &getArg3()
+  {
+    return mg;
+  }
+  typedef GridMg type3;
+  void runMessage()
+  {
+    debMsg("Executing kernel knGenCoarseGridOperator ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
+      op(idx, sizeRef, A, l, mg);
+  }
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+  std::vector<Real> &sizeRef;
+  std::vector<Real> &A;
+  int l;
+  const GridMg &mg;
+};
+
+// Calculate A_l on coarse level l from A_{l-1} on fine level l-1 using
+// Galerkin-based coarsening, i.e., compute A_l = R * A_{l-1} * I.
+void GridMg::genCoraseGridOperator(int l)
+{
+  // for each coarse grid vertex V
+  knGenCoarseGridOperator(mx[l], mA[l], l, *this);
+}
+
+struct knSmoothColor : public KernelBase {
+  knSmoothColor(ThreadSize &numBlocks,
+                std::vector<Real> &x,
+                const Vec3i &blockSize,
+                const std::vector<Vec3i> &colorOffs,
+                int l,
+                const GridMg &mg)
+      : KernelBase(numBlocks.size()),
+        numBlocks(numBlocks),
+        x(x),
+        blockSize(blockSize),
+        colorOffs(colorOffs),
+        l(l),
+        mg(mg)
+  {
+    runMessage();
+    run();
+  }
+  inline void op(IndexInt idx,
+                 ThreadSize &numBlocks,
+                 std::vector<Real> &x,
+                 const Vec3i &blockSize,
+                 const std::vector<Vec3i> &colorOffs,
+                 int l,
+                 const GridMg &mg) const
+  {
+    Vec3i blockOff(int(idx) % blockSize.x,
+                   (int(idx) % (blockSize.x * blockSize.y)) / blockSize.x,
+                   int(idx) / (blockSize.x * blockSize.y));
+
+    for (int off = 0; off < colorOffs.size(); off++) {
+
+      Vec3i V = blockOff * 2 + colorOffs[off];
+      if (!mg.inGrid(V, l))
+        continue;
+
+      const int v = mg.linIdx(V, l);
+      if (mg.mType[l][v] == GridMg::vtInactive)
+        continue;
+
+      Real sum = mg.mb[l][v];
+
+      if (l == 0) {
+        int n;
+        for (int d = 0; d < mg.mDim; d++) {
+          if (V[d] > 0) {
+            n = v - mg.mPitch[0][d];
+            sum -= mg.mA[0][n * mg.mStencilSize0 + d + 1] * mg.mx[0][n];
+          }
+          if (V[d] < mg.mSize[0][d] - 1) {
+            n = v + mg.mPitch[0][d];
+            sum -= mg.mA[0][v * mg.mStencilSize0 + d + 1] * mg.mx[0][n];
+          }
+        }
+
+        x[v] = sum / mg.mA[0][v * mg.mStencilSize0 + 0];
+      }
+      else {
+        FOR_VECLIN_MINMAX(S, s, mg.mStencilMin, mg.mStencilMax)
+        {
+          if (s == mg.mStencilSize - 1)
+            continue;
+
+          Vec3i N = V + S;
+          int n = mg.linIdx(N, l);
+
+          if (mg.inGrid(N, l) && mg.mType[l][n] != GridMg::vtInactive) {
+            if (s < mg.mStencilSize) {
+              sum -= mg.mA[l][n * mg.mStencilSize + mg.mStencilSize - 1 - s] * mg.mx[l][n];
+            }
+            else {
+              sum -= mg.mA[l][v * mg.mStencilSize + s - mg.mStencilSize + 1] * mg.mx[l][n];
+            }
+          }
+        }
+
+        x[v] = sum / mg.mA[l][v * mg.mStencilSize + 0];
+      }
+    }
+  }
+  inline ThreadSize &getArg0()
+  {
+    return numBlocks;
+  }
+  typedef ThreadSize type0;
+  inline std::vector<Real> &getArg1()
+  {
+    return x;
+  }
+  typedef std::vector<Real> type1;
+  inline const Vec3i &getArg2()
+  {
+    return blockSize;
+  }
+  typedef Vec3i type2;
+  inline const std::vector<Vec3i> &getArg3()
+  {
+    return colorOffs;
+  }
+  typedef std::vector<Vec3i> type3;
+  inline int &getArg4()
+  {
+    return l;
+  }
+  typedef int type4;
+  inline const GridMg &getArg5()
+  {
+    return mg;
+  }
+  typedef GridMg type5;
+  void runMessage()
+  {
+    debMsg("Executing kernel knSmoothColor ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
+      op(idx, numBlocks, x, blockSize, colorOffs, l, mg);
+  }
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+  ThreadSize &numBlocks;
+  std::vector<Real> &x;
+  const Vec3i &blockSize;
+  const std::vector<Vec3i> &colorOffs;
+  int l;
+  const GridMg &mg;
+};
+
+void GridMg::smoothGS(int l, bool reversedOrder)
+{
+  // Multicolor Gauss-Seidel with two colors for the 5/7-point stencil on level 0
+  // and with four/eight colors for the 9/27-point stencil on levels > 0
+  std::vector<std::vector<Vec3i>> colorOffs;
+  const Vec3i a[8] = {Vec3i(0, 0, 0),
+                      Vec3i(1, 0, 0),
+                      Vec3i(0, 1, 0),
+                      Vec3i(1, 1, 0),
+                      Vec3i(0, 0, 1),
+                      Vec3i(1, 0, 1),
+                      Vec3i(0, 1, 1),
+                      Vec3i(1, 1, 1)};
+  if (mIs3D) {
+    if (l == 0)
+      colorOffs = {{a[0], a[3], a[5], a[6]}, {a[1], a[2], a[4], a[7]}};
+    else
+      colorOffs = {{a[0]}, {a[1]}, {a[2]}, {a[3]}, {a[4]}, {a[5]}, {a[6]}, {a[7]}};
+  }
+  else {
+    if (l == 0)
+      colorOffs = {{a[0], a[3]}, {a[1], a[2]}};
+    else
+      colorOffs = {{a[0]}, {a[1]}, {a[2]}, {a[3]}};
+  }
+
+  // Divide grid into 2x2 blocks for parallelization
+  Vec3i blockSize = (mSize[l] + 1) / 2;
+  ThreadSize numBlocks(blockSize.x * blockSize.y * blockSize.z);
+
+  for (int c = 0; c < colorOffs.size(); c++) {
+    int color = reversedOrder ? int(colorOffs.size()) - 1 - c : c;
+
+    knSmoothColor(numBlocks, mx[l], blockSize, colorOffs[color], l, *this);
+  }
+}
+
+struct knCalcResidual : public KernelBase {
+  knCalcResidual(std::vector<Real> &r, int l, const GridMg &mg)
+      : KernelBase(r.size()), r(r), l(l), mg(mg)
+  {
+    runMessage();
+    run();
+  }
+  inline void op(IndexInt idx, std::vector<Real> &r, int l, const GridMg &mg) const
+  {
+    if (mg.mType[l][idx] == GridMg::vtInactive)
+      return;
+
+    Vec3i V = mg.vecIdx(int(idx), l);
+
+    Real sum = mg.mb[l][idx];
+
+    if (l == 0) {
+      int n;
+      for (int d = 0; d < mg.mDim; d++) {
+        if (V[d] > 0) {
+          n = int(idx) - mg.mPitch[0][d];
+          sum -= mg.mA[0][n * mg.mStencilSize0 + d + 1] * mg.mx[0][n];
+        }
+        if (V[d] < mg.mSize[0][d] - 1) {
+          n = int(idx) + mg.mPitch[0][d];
+          sum -= mg.mA[0][idx * mg.mStencilSize0 + d + 1] * mg.mx[0][n];
+        }
+      }
+      sum -= mg.mA[0][idx * mg.mStencilSize0 + 0] * mg.mx[0][idx];
+    }
+    else {
+      FOR_VECLIN_MINMAX(S, s, mg.mStencilMin, mg.mStencilMax)
+      {
+        Vec3i N = V + S;
+        int n = mg.linIdx(N, l);
+
+        if (mg.inGrid(N, l) && mg.mType[l][n] != GridMg::vtInactive) {
+          if (s < mg.mStencilSize) {
+            sum -= mg.mA[l][n * mg.mStencilSize + mg.mStencilSize - 1 - s] * mg.mx[l][n];
+          }
+          else {
+            sum -= mg.mA[l][idx * mg.mStencilSize + s - mg.mStencilSize + 1] * mg.mx[l][n];
+          }
+        }
+      }
+    }
+
+    r[idx] = sum;
+  }
+  inline std::vector<Real> &getArg0()
+  {
+    return r;
+  }
+  typedef std::vector<Real> type0;
+  inline int &getArg1()
+  {
+    return l;
+  }
+  typedef int type1;
+  inline const GridMg &getArg2()
+  {
+    return mg;
+  }
+  typedef GridMg type2;
+  void runMessage()
+  {
+    debMsg("Executing kernel knCalcResidual ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
+      op(idx, r, l, mg);
+  }
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+  std::vector<Real> &r;
+  int l;
+  const GridMg &mg;
+};
+
+void GridMg::calcResidual(int l)
+{
+  knCalcResidual(mr[l], l, *this);
+}
+
+struct knResidualNormSumSqr : public KernelBase {
+  knResidualNormSumSqr(const vector<Real> &r, int l, const GridMg &mg)
+      : KernelBase(r.size()), r(r), l(l), mg(mg), result(Real(0))
+  {
+    runMessage();
+    run();
+  }
+  inline void op(IndexInt idx, const vector<Real> &r, int l, const GridMg &mg, Real &result)
+  {
+    if (mg.mType[l][idx] == GridMg::vtInactive)
+      return;
+
+    result += r[idx] * r[idx];
+  }
+  inline operator Real()
+  {
+    return result;
+  }
+  inline Real &getRet()
+  {
+    return result;
+  }
+  inline const vector<Real> &getArg0()
+  {
+    return r;
+  }
+  typedef vector<Real> type0;
+  inline int &getArg1()
+  {
+    return l;
+  }
+  typedef int type1;
+  inline const GridMg &getArg2()
+  {
+    return mg;
+  }
+  typedef GridMg type2;
+  void runMessage()
+  {
+    debMsg("Executing kernel knResidualNormSumSqr ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r)
+  {
+    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
+      op(idx, r, l, mg, result);
+  }
+  void run()
+  {
+    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+  knResidualNormSumSqr(knResidualNormSumSqr &o, tbb::split)
+      : KernelBase(o), r(o.r), l(o.l), mg(o.mg), result(Real(0))
+  {
+  }
+  void join(const knResidualNormSumSqr &o)
+  {
+    result += o.result;
+  }
+  const vector<Real> &r;
+  int l;
+  const GridMg &mg;
+  Real result;
+};
+;
+
+Real GridMg::calcResidualNorm(int l)
+{
+  Real res = knResidualNormSumSqr(mr[l], l, *this);
+
+  return std::sqrt(res);
+}
+
+// Standard conjugate gradients with Jacobi preconditioner
+// Notes: Always run at double precision. Not parallelized since
+//        coarsest level is assumed to be small.
+void GridMg::solveCG(int l)
+{
+  auto applyAStencil = [this](int v, int l, const std::vector<double> &vec) -> double {
+    Vec3i V = vecIdx(v, l);
+
+    double sum = 0;
+
+    if (l == 0) {
+      int n;
+      for (int d = 0; d < mDim; d++) {
+        if (V[d] > 0) {
+          n = v - mPitch[0][d];
+          sum += mA[0][n * mStencilSize0 + d + 1] * vec[n];
+        }
+        if (V[d] < mSize[0][d] - 1) {
+          n = v + mPitch[0][d];
+          sum += mA[0][v * mStencilSize0 + d + 1] * vec[n];
+        }
+      }
+      sum += mA[0][v * mStencilSize0 + 0] * vec[v];
+    }
+    else {
+      FOR_VECLIN_MINMAX(S, s, mStencilMin, mStencilMax)
+      {
+        Vec3i N = V + S;
+        int n = linIdx(N, l);
+
+        if (inGrid(N, l) && mType[l][n] != vtInactive) {
+          if (s < mStencilSize) {
+            sum += mA[l][n * mStencilSize + mStencilSize - 1 - s] * vec[n];
+          }
+          else {
+            sum += mA[l][v * mStencilSize + s - mStencilSize + 1] * vec[n];
+          }
+        }
+      }
+    }
+
+    return sum;
+  };
+
+  std::vector<double> &z = mCGtmp1[l];
+  std::vector<double> &p = mCGtmp2[l];
+  std::vector<double> &x = mCGtmp3[l];
+  std::vector<double> &r = mCGtmp4[l];
+
+  // Initialization:
+  double alphaTop = 0;
+  double initialResidual = 0;
+
+  FOR_LVL(v, l)
+  {
+    x[v] = mx[l][v];
+  }
+
+  FOR_LVL(v, l)
+  {
+    if (mType[l][v] == vtInactive)
+      continue;
+
+    r[v] = mb[l][v] - applyAStencil(v, l, x);
+    if (l == 0) {
+      z[v] = r[v] / mA[0][v * mStencilSize0 + 0];
+    }
+    else {
+      z[v] = r[v] / mA[l][v * mStencilSize + 0];
+    }
+
+    initialResidual += r[v] * r[v];
+    p[v] = z[v];
+    alphaTop += r[v] * z[v];
+  }
+
+  initialResidual = std::sqrt(initialResidual);
+
+  int iter = 0;
+  const int maxIter = 10000;
+  double residual = -1;
+
+  // CG iterations
+  for (; iter < maxIter && initialResidual > 1E-12; iter++) {
+    double alphaBot = 0;
+
+    FOR_LVL(v, l)
+    {
+      if (mType[l][v] == vtInactive)
+        continue;
+
+      z[v] = applyAStencil(v, l, p);
+      alphaBot += p[v] * z[v];
+    }
+
+    double alpha = alphaTop / alphaBot;
+
+    double alphaTopNew = 0;
+    residual = 0;
+
+    FOR_LVL(v, l)
+    {
+      if (mType[l][v] == vtInactive)
+        continue;
+
+      x[v] += alpha * p[v];
+      r[v] -= alpha * z[v];
+      residual += r[v] * r[v];
+      if (l == 0)
+        z[v] = r[v] / mA[0][v * mStencilSize0 + 0];
+      else
+        z[v] = r[v] / mA[l][v * mStencilSize + 0];
+      alphaTopNew += r[v] * z[v];
+    }
+
+    residual = std::sqrt(residual);
+
+    if (residual / initialResidual < mCoarsestLevelAccuracy)
+      break;
+
+    double beta = alphaTopNew / alphaTop;
+    alphaTop = alphaTopNew;
+
+    FOR_LVL(v, l)
+    {
+      p[v] = z[v] + beta * p[v];
+    }
+    debMsg("GridMg::solveCG i=" << iter << " rel-residual=" << (residual / initialResidual), 5);
+  }
+
+  FOR_LVL(v, l)
+  {
+    mx[l][v] = Real(x[v]);
+  }
+
+  if (iter == maxIter) {
+    debMsg("GridMg::solveCG Warning: Reached maximum number of CG iterations", 1);
+  }
+  else {
+    debMsg("GridMg::solveCG Info: Reached residual " << residual << " in " << iter
+                                                     << " iterations",
+           2);
+  }
+}
+
+struct knRestrict : public KernelBase {
+  knRestrict(std::vector<Real> &dst, const std::vector<Real> &src, int l_dst, const GridMg &mg)
+      : KernelBase(dst.size()), dst(dst), src(src), l_dst(l_dst), mg(mg)
+  {
+    runMessage();
+    run();
+  }
+  inline void op(IndexInt idx,
+                 std::vector<Real> &dst,
+                 const std::vector<Real> &src,
+                 int l_dst,
+                 const GridMg &mg) const
+  {
+    if (mg.mType[l_dst][idx] == GridMg::vtInactive)
+      return;
+
+    const int l_src = l_dst - 1;
+
+    // Coarse grid vertex
+    Vec3i V = mg.vecIdx(int(idx), l_dst);
+
+    Real sum = Real(0);
+
+    FOR_VEC_MINMAX(R, vmax(0, V * 2 - 1), vmin(mg.mSize[l_src] - 1, V * 2 + 1))
+    {
+      int r = mg.linIdx(R, l_src);
+      if (mg.mType[l_src][r] == GridMg::vtInactive)
+        continue;
+
+      // restriction weight
+      Real rw = Real(1) / Real(1 << ((R.x % 2) + (R.y % 2) + (R.z % 2)));
+
+      sum += rw * src[r];
+    }
+
+    dst[idx] = sum;
+  }
+  inline std::vector<Real> &getArg0()
+  {
+    return dst;
+  }
+  typedef std::vector<Real> type0;
+  inline const std::vector<Real> &getArg1()
+  {
+    return src;
+  }
+  typedef std::vector<Real> type1;
+  inline int &getArg2()
+  {
+    return l_dst;
+  }
+  typedef int type2;
+  inline const GridMg &getArg3()
+  {
+    return mg;
+  }
+  typedef GridMg type3;
+  void runMessage()
+  {
+    debMsg("Executing kernel knRestrict ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
+      op(idx, dst, src, l_dst, mg);
+  }
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+  std::vector<Real> &dst;
+  const std::vector<Real> &src;
+  int l_dst;
+  const GridMg &mg;
+};
+
+void GridMg::restrict(int l_dst, const std::vector<Real> &src, std::vector<Real> &dst) const
+{
+  knRestrict(dst, src, l_dst, *this);
+}
+
+struct knInterpolate : public KernelBase {
+  knInterpolate(std::vector<Real> &dst, const std::vector<Real> &src, int l_dst, const GridMg &mg)
+      : KernelBase(dst.size()), dst(dst), src(src), l_dst(l_dst), mg(mg)
+  {
+    runMessage();
+    run();
+  }
+  inline void op(IndexInt idx,
+                 std::vector<Real> &dst,
+                 const std::vector<Real> &src,
+                 int l_dst,
+                 const GridMg &mg) const
+  {
+    if (mg.mType[l_dst][idx] == GridMg::vtInactive)
+      return;
+
+    const int l_src = l_dst + 1;
+
+    Vec3i V = mg.vecIdx(int(idx), l_dst);
+
+    Real sum = Real(0);
+
+    FOR_VEC_MINMAX(I, V / 2, (V + 1) / 2)
+    {
+      int i = mg.linIdx(I, l_src);
+      if (mg.mType[l_src][i] != GridMg::vtInactive)
+        sum += src[i];
+    }
+
+    // interpolation weight
+    Real iw = Real(1) / Real(1 << ((V.x % 2) + (V.y % 2) + (V.z % 2)));
+
+    dst[idx] = iw * sum;
+  }
+  inline std::vector<Real> &getArg0()
+  {
+    return dst;
+  }
+  typedef std::vector<Real> type0;
+  inline const std::vector<Real> &getArg1()
+  {
+    return src;
+  }
+  typedef std::vector<Real> type1;
+  inline int &getArg2()
+  {
+    return l_dst;
+  }
+  typedef int type2;
+  inline const GridMg &getArg3()
+  {
+    return mg;
+  }
+  typedef GridMg type3;
+  void runMessage()
+  {
+    debMsg("Executing kernel knInterpolate ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
+      op(idx, dst, src, l_dst, mg);
+  }
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+  std::vector<Real> &dst;
+  const std::vector<Real> &src;
+  int l_dst;
+  const GridMg &mg;
+};
+
+void GridMg::interpolate(int l_dst, const std::vector<Real> &src, std::vector<Real> &dst) const
+{
+  knInterpolate(dst, src, l_dst, *this);
+}
+
+};  // namespace Manta
author	Sebastián Barschkis <sebbas@sebbas.org>	2019-12-16 17:40:15 +0300
committer	Sebastián Barschkis <sebbas@sebbas.org>	2019-12-16 18:27:26 +0300
commit	4ff7c5eed6b546ae42692f3a869a5ccc095a9cb4 (patch)
tree	03f8233788ae46e66672f711e3cf42d8d00d01cc /extern/mantaflow/preprocessed/multigrid.cpp
parent	6a3f2b30d206df23120cd212132adea821b6c20e (diff)