16 files changed, 21044 insertions, 0 deletions
diff --git a/extern/mantaflow/preprocessed/plugin/advection.cpp b/extern/mantaflow/preprocessed/plugin/advection.cpp
new file mode 100644
index 00000000000..13f53140348
--- /dev/null
+++ b/extern/mantaflow/preprocessed/plugin/advection.cpp
@@ -0,0 +1,1521 @@
+
+
+// DO NOT EDIT !
+// This file is generated using the MantaFlow preprocessor (prep generate).
+
+/******************************************************************************
+ *
+ * MantaFlow fluid solver framework
+ * Copyright 2011-2015 Tobias Pfaff, Nils Thuerey
+ *
+ * This program is free software, distributed under the terms of the
+ * Apache License, Version 2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Plugins for advection
+ *
+ ******************************************************************************/
+
+#include "vectorbase.h"
+#include "grid.h"
+#include "kernel.h"
+#include <limits>
+
+using namespace std;
+
+namespace Manta {
+
+//! Semi-Lagrange interpolation kernel: for each cell, traces a position backwards through vel
+//! over dt and stores the value of src interpolated at that position in dst.
+template<class T> struct SemiLagrange : public KernelBase {
+  SemiLagrange(const FlagGrid &flags,
+               const MACGrid &vel,
+               Grid<T> &dst,
+               const Grid<T> &src,
+               Real dt,
+               bool isLevelset,
+               int orderSpace,
+               int orderTrace)
+      : KernelBase(&flags, 1),  // NOTE(review): 1 is presumably the boundary width - confirm
+        flags(flags),
+        vel(vel),
+        dst(dst),
+        src(src),
+        dt(dt),
+        isLevelset(isLevelset),
+        orderSpace(orderSpace),
+        orderTrace(orderTrace)
+  {
+    runMessage();
+    run();  // the kernel executes as part of construction
+  }
+  inline void op(int i,
+                 int j,
+                 int k,
+                 const FlagGrid &flags,
+                 const MACGrid &vel,
+                 Grid<T> &dst,
+                 const Grid<T> &src,
+                 Real dt,
+                 bool isLevelset,  // not read by this op
+                 int orderSpace,   // forwarded to getInterpolatedHi
+                 int orderTrace) const  // 1: single step, 2: explicit midpoint
+  {
+    if (orderTrace == 1) {
+      // traceback position
+      Vec3 pos = Vec3(i + 0.5f, j + 0.5f, k + 0.5f) - vel.getCentered(i, j, k) * dt;
+      dst(i, j, k) = src.getInterpolatedHi(pos, orderSpace);
+    }
+    else if (orderTrace == 2) {
+      // backtracing using explicit midpoint
+      Vec3 p0 = Vec3(i + 0.5f, j + 0.5f, k + 0.5f);
+      Vec3 p1 = p0 - vel.getCentered(i, j, k) * dt * 0.5;  // half step to the midpoint
+      Vec3 p2 = p0 - vel.getInterpolated(p1) * dt;  // full step with the midpoint velocity
+      dst(i, j, k) = src.getInterpolatedHi(p2, orderSpace);
+    }
+    else {
+      assertMsg(false, "Unknown backtracing order " << orderTrace);
+    }
+  }
+  inline const FlagGrid &getArg0()  // getArgN / typeN: accessor and type of kernel argument N
+  {
+    return flags;
+  }
+  typedef FlagGrid type0;
+  inline const MACGrid &getArg1()
+  {
+    return vel;
+  }
+  typedef MACGrid type1;
+  inline Grid<T> &getArg2()
+  {
+    return dst;
+  }
+  typedef Grid<T> type2;
+  inline const Grid<T> &getArg3()
+  {
+    return src;
+  }
+  typedef Grid<T> type3;
+  inline Real &getArg4()
+  {
+    return dt;
+  }
+  typedef Real type4;
+  inline bool &getArg5()
+  {
+    return isLevelset;
+  }
+  typedef bool type5;
+  inline int &getArg6()
+  {
+    return orderSpace;
+  }
+  typedef int type6;
+  inline int &getArg7()
+  {
+    return orderTrace;
+  }
+  typedef int type7;
+  void runMessage()  // debug output only
+  {
+    debMsg("Executing kernel SemiLagrange ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const  // body run by parallel_for
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {  // the range indexes k; j and i start at 1
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, flags, vel, dst, src, dt, isLevelset, orderSpace, orderTrace);
+    }
+    else {  // single z layer: the range indexes j and k is fixed to 0
+      const int k = 0;
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = 1; i < _maxX; i++)
+          op(i, j, k, flags, vel, dst, src, dt, isLevelset, orderSpace, orderTrace);
+    }
+  }
+  void run()  // splits the work over k (maxZ > 1) or over j (otherwise)
+  {
+    if (maxZ > 1)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
+  }
+  const FlagGrid &flags;
+  const MACGrid &vel;
+  Grid<T> &dst;
+  const Grid<T> &src;
+  Real dt;
+  bool isLevelset;
+  int orderSpace;
+  int orderTrace;
+};
+
+//! Semi-Lagrange interpolation kernel for MAC grids: every velocity component is traced back
+//! through vel (the advecting field) on its own and then sampled from src (the advected field).
+struct SemiLagrangeMAC : public KernelBase {
+  SemiLagrangeMAC(const FlagGrid &flags,
+                  const MACGrid &vel,
+                  MACGrid &dst,
+                  const MACGrid &src,
+                  Real dt,
+                  int orderSpace,
+                  int orderTrace)
+      : KernelBase(&flags, 1),  // NOTE(review): 1 is presumably the boundary width - confirm
+        flags(flags),
+        vel(vel),
+        dst(dst),
+        src(src),
+        dt(dt),
+        orderSpace(orderSpace),
+        orderTrace(orderTrace)
+  {
+    runMessage();
+    run();  // the kernel executes as part of construction
+  }
+  inline void op(int i,
+                 int j,
+                 int k,
+                 const FlagGrid &flags,
+                 const MACGrid &vel,
+                 MACGrid &dst,
+                 const MACGrid &src,
+                 Real dt,
+                 int orderSpace,  // forwarded to getInterpolatedComponentHi
+                 int orderTrace) const  // 1: single step, 2: explicit midpoint
+  {
+    if (orderTrace == 1) {
+      // get current velocity at MAC position
+      // no need to shift xpos etc. as lookup field is also shifted
+      Vec3 xpos = Vec3(i + 0.5f, j + 0.5f, k + 0.5f) - vel.getAtMACX(i, j, k) * dt;
+      Real vx = src.getInterpolatedComponentHi<0>(xpos, orderSpace);
+      Vec3 ypos = Vec3(i + 0.5f, j + 0.5f, k + 0.5f) - vel.getAtMACY(i, j, k) * dt;
+      Real vy = src.getInterpolatedComponentHi<1>(ypos, orderSpace);
+      Vec3 zpos = Vec3(i + 0.5f, j + 0.5f, k + 0.5f) - vel.getAtMACZ(i, j, k) * dt;
+      Real vz = src.getInterpolatedComponentHi<2>(zpos, orderSpace);
+
+      dst(i, j, k) = Vec3(vx, vy, vz);
+    }
+    else if (orderTrace == 2) {  // explicit midpoint; traced through vel like the order-1 path
+      Vec3 p0 = Vec3(i + 0.5, j + 0.5, k + 0.5);
+      Vec3 xp0 = Vec3(i, j + 0.5f, k + 0.5f);  // x component: no half-cell offset along x
+      Vec3 xp1 = xp0 - vel.getAtMACX(i, j, k) * dt * 0.5;  // half step to the midpoint
+      Vec3 xp2 = p0 - vel.getInterpolated(xp1) * dt;  // full step with the midpoint velocity
+      Real vx = src.getInterpolatedComponentHi<0>(xp2, orderSpace);
+      Vec3 yp0 = Vec3(i + 0.5f, j, k + 0.5f);  // y component: no half-cell offset along y
+      Vec3 yp1 = yp0 - vel.getAtMACY(i, j, k) * dt * 0.5;
+      Vec3 yp2 = p0 - vel.getInterpolated(yp1) * dt;
+      Real vy = src.getInterpolatedComponentHi<1>(yp2, orderSpace);
+      Vec3 zp0 = Vec3(i + 0.5f, j + 0.5f, k);  // z component: no half-cell offset along z
+      Vec3 zp1 = zp0 - vel.getAtMACZ(i, j, k) * dt * 0.5;
+      Vec3 zp2 = p0 - vel.getInterpolated(zp1) * dt;
+      Real vz = src.getInterpolatedComponentHi<2>(zp2, orderSpace);
+
+      dst(i, j, k) = Vec3(vx, vy, vz);
+    }
+    else {
+      assertMsg(false, "Unknown backtracing order " << orderTrace);
+    }
+  }
+  inline const FlagGrid &getArg0()  // getArgN / typeN: accessor and type of kernel argument N
+  {
+    return flags;
+  }
+  typedef FlagGrid type0;
+  inline const MACGrid &getArg1()
+  {
+    return vel;
+  }
+  typedef MACGrid type1;
+  inline MACGrid &getArg2()
+  {
+    return dst;
+  }
+  typedef MACGrid type2;
+  inline const MACGrid &getArg3()
+  {
+    return src;
+  }
+  typedef MACGrid type3;
+  inline Real &getArg4()
+  {
+    return dt;
+  }
+  typedef Real type4;
+  inline int &getArg5()
+  {
+    return orderSpace;
+  }
+  typedef int type5;
+  inline int &getArg6()
+  {
+    return orderTrace;
+  }
+  typedef int type6;
+  void runMessage()  // debug output only
+  {
+    debMsg("Executing kernel SemiLagrangeMAC ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const  // body run by parallel_for
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {  // the range indexes k; j and i start at 1
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, flags, vel, dst, src, dt, orderSpace, orderTrace);
+    }
+    else {  // single z layer: the range indexes j and k is fixed to 0
+      const int k = 0;
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = 1; i < _maxX; i++)
+          op(i, j, k, flags, vel, dst, src, dt, orderSpace, orderTrace);
+    }
+  }
+  void run()  // splits the work over k (maxZ > 1) or over j (otherwise)
+  {
+    if (maxZ > 1)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
+  }
+  const FlagGrid &flags;
+  const MACGrid &vel;
+  MACGrid &dst;
+  const MACGrid &src;
+  Real dt;
+  int orderSpace;
+  int orderTrace;
+};
+
+//! Kernel: MacCormack correction from the forward and backward SL steps, per linear cell index:
+//! dst = fwd + strength * 0.5 * (old - bwd) in fluid cells, dst = fwd in all other cells.
+template<class T> struct MacCormackCorrect : public KernelBase {
+  MacCormackCorrect(const FlagGrid &flags,
+                    Grid<T> &dst,
+                    const Grid<T> &old,
+                    const Grid<T> &fwd,
+                    const Grid<T> &bwd,
+                    Real strength,
+                    bool isLevelSet,
+                    bool isMAC = false)
+      : KernelBase(&flags, 0),
+        flags(flags),
+        dst(dst),
+        old(old),
+        fwd(fwd),
+        bwd(bwd),
+        strength(strength),
+        isLevelSet(isLevelSet),
+        isMAC(isMAC)
+  {
+    runMessage();
+    run();  // the kernel executes as part of construction
+  }
+  inline void op(IndexInt idx,
+                 const FlagGrid &flags,
+                 Grid<T> &dst,
+                 const Grid<T> &old,
+                 const Grid<T> &fwd,
+                 const Grid<T> &bwd,
+                 Real strength,
+                 bool isLevelSet,  // not read by this op
+                 bool isMAC = false) const  // not read by this op
+  {
+    dst[idx] = fwd[idx];
+
+    if (flags.isFluid(idx)) {
+      // only correct inside fluid region; note, strength of correction can be modified here
+      dst[idx] += strength * 0.5 * (old[idx] - bwd[idx]);
+    }
+  }
+  inline const FlagGrid &getArg0()  // getArgN / typeN: accessor and type of kernel argument N
+  {
+    return flags;
+  }
+  typedef FlagGrid type0;
+  inline Grid<T> &getArg1()
+  {
+    return dst;
+  }
+  typedef Grid<T> type1;
+  inline const Grid<T> &getArg2()
+  {
+    return old;
+  }
+  typedef Grid<T> type2;
+  inline const Grid<T> &getArg3()
+  {
+    return fwd;
+  }
+  typedef Grid<T> type3;
+  inline const Grid<T> &getArg4()
+  {
+    return bwd;
+  }
+  typedef Grid<T> type4;
+  inline Real &getArg5()
+  {
+    return strength;
+  }
+  typedef Real type5;
+  inline bool &getArg6()
+  {
+    return isLevelSet;
+  }
+  typedef bool type6;
+  inline bool &getArg7()
+  {
+    return isMAC;
+  }
+  typedef bool type7;
+  void runMessage()  // debug output only
+  {
+    debMsg("Executing kernel MacCormackCorrect ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const  // body run by parallel_for
+  {
+    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
+      op(idx, flags, dst, old, fwd, bwd, strength, isLevelSet, isMAC);
+  }
+  void run()  // one flat index range [0, size)
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+  const FlagGrid &flags;
+  Grid<T> &dst;
+  const Grid<T> &old;
+  const Grid<T> &fwd;
+  const Grid<T> &bwd;
+  Real strength;
+  bool isLevelSet;
+  bool isMAC;
+};
+
+//! Kernel: per-component MacCormack correction. Components of non-fluid cells keep the fwd
+//! value; with isMAC set, so does component c when the lower neighbour along c is not fluid.
+template<class T> struct MacCormackCorrectMAC : public KernelBase {
+  MacCormackCorrectMAC(const FlagGrid &flags,
+                       Grid<T> &dst,
+                       const Grid<T> &old,
+                       const Grid<T> &fwd,
+                       const Grid<T> &bwd,
+                       Real strength,
+                       bool isLevelSet,
+                       bool isMAC = false)
+      : KernelBase(&flags, 0),
+        flags(flags),
+        dst(dst),
+        old(old),
+        fwd(fwd),
+        bwd(bwd),
+        strength(strength),
+        isLevelSet(isLevelSet),
+        isMAC(isMAC)
+  {
+    runMessage();
+    run();  // the kernel executes as part of construction
+  }
+  inline void op(int i,
+                 int j,
+                 int k,
+                 const FlagGrid &flags,
+                 Grid<T> &dst,
+                 const Grid<T> &old,
+                 const Grid<T> &fwd,
+                 const Grid<T> &bwd,
+                 Real strength,
+                 bool isLevelSet,  // not read by this op
+                 bool isMAC = false) const
+  {
+    bool skip[3] = {false, false, false};  // skip[c]: leave component c uncorrected
+
+    if (!flags.isFluid(i, j, k))
+      skip[0] = skip[1] = skip[2] = true;
+    if (isMAC) {
+      if ((i > 0) && (!flags.isFluid(i - 1, j, k)))  // index guard keeps the lookup >= 0
+        skip[0] = true;
+      if ((j > 0) && (!flags.isFluid(i, j - 1, k)))
+        skip[1] = true;
+      if ((k > 0) && (!flags.isFluid(i, j, k - 1)))
+        skip[2] = true;
+    }
+
+    for (int c = 0; c < 3; ++c) {
+      if (skip[c]) {
+        dst(i, j, k)[c] = fwd(i, j, k)[c];
+      }
+      else {
+        // perform actual correction with given strength
+        dst(i, j, k)[c] = fwd(i, j, k)[c] + strength * 0.5 * (old(i, j, k)[c] - bwd(i, j, k)[c]);
+      }
+    }
+  }
+  inline const FlagGrid &getArg0()  // getArgN / typeN: accessor and type of kernel argument N
+  {
+    return flags;
+  }
+  typedef FlagGrid type0;
+  inline Grid<T> &getArg1()
+  {
+    return dst;
+  }
+  typedef Grid<T> type1;
+  inline const Grid<T> &getArg2()
+  {
+    return old;
+  }
+  typedef Grid<T> type2;
+  inline const Grid<T> &getArg3()
+  {
+    return fwd;
+  }
+  typedef Grid<T> type3;
+  inline const Grid<T> &getArg4()
+  {
+    return bwd;
+  }
+  typedef Grid<T> type4;
+  inline Real &getArg5()
+  {
+    return strength;
+  }
+  typedef Real type5;
+  inline bool &getArg6()
+  {
+    return isLevelSet;
+  }
+  typedef bool type6;
+  inline bool &getArg7()
+  {
+    return isMAC;
+  }
+  typedef bool type7;
+  void runMessage()  // debug output only
+  {
+    debMsg("Executing kernel MacCormackCorrectMAC ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const  // body run by parallel_for
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {  // the range indexes k; j and i start at 0
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, flags, dst, old, fwd, bwd, strength, isLevelSet, isMAC);
+    }
+    else {  // single z layer: the range indexes j and k is fixed to 0
+      const int k = 0;
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = 0; i < _maxX; i++)
+          op(i, j, k, flags, dst, old, fwd, bwd, strength, isLevelSet, isMAC);
+    }
+  }
+  void run()  // splits the work over k (maxZ > 1) or over j (otherwise)
+  {
+    if (maxZ > 1)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
+  }
+  const FlagGrid &flags;
+  Grid<T> &dst;
+  const Grid<T> &old;
+  const Grid<T> &fwd;
+  const Grid<T> &bwd;
+  Real strength;
+  bool isLevelSet;
+  bool isMAC;
+};
+
+// Helper to collect min/max in a template
+template<class T> inline void getMinMax(T &minv, T &maxv, const T &val)
+{
+  // both bounds are tested independently: the first sample seen after initialising
+  // minv/maxv to +max/-max has to replace both of them
+  minv = (val < minv) ? val : minv;
+  maxv = (val > maxv) ? val : maxv;
+}
+template<> inline void getMinMax<Vec3>(Vec3 &minv, Vec3 &maxv, const Vec3 &val)
+{
+  getMinMax(minv.z, maxv.z, val.z);  // each axis widens its own bounds via the scalar
+  getMinMax(minv.y, maxv.y, val.y);  // template; the three updates are independent,
+  getMinMax(minv.x, maxv.x, val.x);  // so their order does not matter
+}
+
+//! detect out of bounds value
+template<class T> inline bool cmpMinMax(T &minv, T &maxv, const T &val)
+{
+  // out of range: strictly below minv or strictly above maxv (the bounds count as inside)
+  if (val < minv) {
+    return true;
+  }
+  return (val > maxv) ? true : false;
+}
+template<> inline bool cmpMinMax<Vec3>(Vec3 &minv, Vec3 &maxv, const Vec3 &val)
+{
+  const bool outX = cmpMinMax(minv.x, maxv.x, val.x), outY = cmpMinMax(minv.y, maxv.y, val.y);
+  return outX || outY || cmpMinMax(minv.z, maxv.z, val.z);  // true if any axis is out of range
+}
+// checkFlag: nonzero when the cell's flag has the TypeFluid or TypeEmpty bit; needs `flags` in scope
+#define checkFlag(x, y, z) (flags((x), (y), (z)) & (FlagGrid::TypeFluid | FlagGrid::TypeEmpty))
+
+//! Helper function for clamping non-mac grids (those have specialized per component version below)
+//  Note - 2 clamp modes, a sharper one (default, clampMode 1, also uses backward step),
+//         and a softer version (clampMode 2) that is recommended in Andy's paper
+template<class T>
+inline T doClampComponent(const Vec3i &gridSize,  // grid size with 1 already subtracted
+                          const FlagGrid &flags,  // selects which cells contribute to min/max
+                          T dst,                  // candidate value to clamp
+                          const Grid<T> &orig,    // grid the min/max bounds are gathered from
+                          const T fwd,            // fallback value
+                          const Vec3 &pos,
+                          const Vec3 &vel,  // lookups are taken at pos - vel (and pos + vel)
+                          const int clampMode)
+{
+  T minv(std::numeric_limits<Real>::max()), maxv(-std::numeric_limits<Real>::max());
+  bool haveFl = false;  // set once any fluid/empty cell contributed to minv/maxv
+
+  // forward (and optionally) backward
+  Vec3i positions[2];
+  int numPos = 1;
+  positions[0] = toVec3i(pos - vel);
+  if (clampMode == 1) {
+    numPos = 2;
+    positions[1] = toVec3i(pos + vel);
+  }
+
+  for (int l = 0; l < numPos; ++l) {
+    Vec3i &currPos = positions[l];
+
+    // clamp lookup to grid
+    const int i0 = clamp(currPos.x, 0, gridSize.x - 1);  // note! gridsize already has -1 from call
+    const int j0 = clamp(currPos.y, 0, gridSize.y - 1);
+    // without a third dimension only layer 0 is valid, same bound as in doClampComponentMAC
+    const int k0 = clamp(currPos.z, 0, (orig.is3D() ? (gridSize.z - 1) : 0));
+    const int i1 = i0 + 1, j1 = j0 + 1, k1 = (orig.is3D() ? (k0 + 1) : k0);
+
+    // find min/max around source pos
+    if (checkFlag(i0, j0, k0)) {
+      getMinMax(minv, maxv, orig(i0, j0, k0));
+      haveFl = true;
+    }
+    if (checkFlag(i1, j0, k0)) {
+      getMinMax(minv, maxv, orig(i1, j0, k0));
+      haveFl = true;
+    }
+    if (checkFlag(i0, j1, k0)) {
+      getMinMax(minv, maxv, orig(i0, j1, k0));
+      haveFl = true;
+    }
+    if (checkFlag(i1, j1, k0)) {
+      getMinMax(minv, maxv, orig(i1, j1, k0));
+      haveFl = true;
+    }
+
+    if (orig.is3D()) {  // upper layer of the 2x2x2 neighbourhood
+      if (checkFlag(i0, j0, k1)) {
+        getMinMax(minv, maxv, orig(i0, j0, k1));
+        haveFl = true;
+      }
+      if (checkFlag(i1, j0, k1)) {
+        getMinMax(minv, maxv, orig(i1, j0, k1));
+        haveFl = true;
+      }
+      if (checkFlag(i0, j1, k1)) {
+        getMinMax(minv, maxv, orig(i0, j1, k1));
+        haveFl = true;
+      }
+      if (checkFlag(i1, j1, k1)) {
+        getMinMax(minv, maxv, orig(i1, j1, k1));
+        haveFl = true;
+      }
+    }
+  }
+
+  if (!haveFl)
+    return fwd;  // no usable bounds were collected
+  if (clampMode == 1) {
+    dst = clamp(dst, minv, maxv);  // hard clamp
+  }
+  else {
+    if (cmpMinMax(minv, maxv, dst))
+      dst = fwd;  // recommended in paper, "softer"
+  }
+  return dst;
+}
+
+//! MAC-grid counterpart of the scalar clamp helper, operating on the single component c of
+//  the Vec3 values in orig; unlike the scalar version the min/max lookup ignores cell flags.
+//  For symmetry, clampMode 2 reverts to first order (returns fwd) near boundaries.
+//    flags     - cell flags, read only for the clampMode 2 neighbourhood test
+//    gridSize  - grid size with 1 already subtracted by the caller
+//    dst       - candidate value to clamp
+//    orig      - grid the min/max bounds are gathered from
+//    fwd       - fallback value
+//    pos, vel  - lookups are taken at pos - vel (and pos + vel in clampMode 1)
+//    returns the clamped dst, or fwd where the clamp falls back
+template<int c>
+inline Real doClampComponentMAC(const FlagGrid &flags,
+                                const Vec3i &gridSize,
+                                Real dst,
+                                const MACGrid &orig,
+                                Real fwd,
+                                const Vec3 &pos,
+                                const Vec3 &vel,
+                                const int clampMode)
+{
+  // clampMode 2 only: the cell and its lower neighbour along c must pass checkFlag, else keep fwd
+  const Vec3i cell = toVec3i(pos);
+  Vec3i lowerNb = cell;
+  lowerNb[c] -= 1;
+  if (clampMode == 2) {
+    if (!checkFlag(cell.x, cell.y, cell.z) || !checkFlag(lowerNb.x, lowerNb.y, lowerNb.z))
+      return fwd;
+  }
+
+  // lookup cells: always pos - vel, and additionally pos + vel in clampMode 1
+  const int numLookups = (clampMode == 1) ? 2 : 1;
+  Vec3i lookups[2];
+  lookups[0] = toVec3i(pos - vel);
+  if (numLookups == 2)
+    lookups[1] = toVec3i(pos + vel);
+
+  const bool is3D = orig.is3D();
+  Real lo = std::numeric_limits<Real>::max();
+  Real hi = -std::numeric_limits<Real>::max();
+  for (int n = 0; n < numLookups; ++n) {
+    const Vec3i &base = lookups[n];
+    // gridSize already has -1 from the caller; clamping to gridSize - 1 (i.e. size - 2)
+    // keeps the +1 neighbours inside the grid
+    const int i0 = clamp(base.x, 0, gridSize.x - 1);
+    const int j0 = clamp(base.y, 0, gridSize.y - 1);
+    const int k0 = clamp(base.z, 0, is3D ? (gridSize.z - 1) : 0);
+    const int kLast = is3D ? (k0 + 1) : k0;  // 2D visits the k0 layer only
+
+    // min/max of component c over the 2x2 (2D) or 2x2x2 (3D) cells starting at (i0, j0, k0)
+    for (int kk = k0; kk <= kLast; ++kk) {
+      for (int jj = j0; jj <= j0 + 1; ++jj) {
+        for (int ii = i0; ii <= i0 + 1; ++ii) {
+          getMinMax(lo, hi, orig(ii, jj, kk)[c]);
+        }
+      }
+    }
+  }
+
+  if (clampMode != 1) {
+    // softer variant recommended in the paper: out-of-range values fall back to fwd
+    return cmpMinMax(lo, hi, dst) ? fwd : dst;
+  }
+  return clamp(dst, lo, hi);  // hard clamp
+}
+
+#undef checkFlag
+
+//! Kernel: Clamp obtained value to min/max in source area, and reset values that point out of grid
+//! or into boundaries
+//          (note - MAC grids are handled below)
+//! In clampMode 1, cells whose forward/backward lookup leaves the grid or hits an obstacle use fwd.
+template<class T> struct MacCormackClamp : public KernelBase {
+  MacCormackClamp(const FlagGrid &flags,
+                  const MACGrid &vel,
+                  Grid<T> &dst,
+                  const Grid<T> &orig,
+                  const Grid<T> &fwd,
+                  Real dt,
+                  const int clampMode)
+      : KernelBase(&flags, 1),  // NOTE(review): 1 is presumably the boundary width - confirm
+        flags(flags),
+        vel(vel),
+        dst(dst),
+        orig(orig),
+        fwd(fwd),
+        dt(dt),
+        clampMode(clampMode)
+  {
+    runMessage();
+    run();  // the kernel executes as part of construction
+  }
+  inline void op(int i,
+                 int j,
+                 int k,
+                 const FlagGrid &flags,
+                 const MACGrid &vel,
+                 Grid<T> &dst,
+                 const Grid<T> &orig,
+                 const Grid<T> &fwd,
+                 Real dt,
+                 const int clampMode) const
+  {
+    T dval = dst(i, j, k);
+    Vec3i gridUpper = flags.getSize() - 1;  // largest valid index per axis
+
+    dval = doClampComponent<T>(gridUpper,
+                               flags,
+                               dval,
+                               orig,
+                               fwd(i, j, k),
+                               Vec3(i, j, k),
+                               vel.getCentered(i, j, k) * dt,
+                               clampMode);
+
+    if (1 && clampMode == 1) {  // the leading `1 &&` has no effect on the condition
+      // lookup forward/backward , round to closest NB
+      Vec3i posFwd = toVec3i(Vec3(i, j, k) + Vec3(0.5, 0.5, 0.5) - vel.getCentered(i, j, k) * dt);
+      Vec3i posBwd = toVec3i(Vec3(i, j, k) + Vec3(0.5, 0.5, 0.5) + vel.getCentered(i, j, k) * dt);
+
+      // test if lookups point out of grid or into obstacle (note doClampComponent already checks
+      // sides, below is needed for valid flags access)
+      if (posFwd.x < 0 || posFwd.y < 0 || posFwd.z < 0 || posBwd.x < 0 || posBwd.y < 0 ||
+          posBwd.z < 0 || posFwd.x > gridUpper.x || posFwd.y > gridUpper.y ||
+          ((posFwd.z > gridUpper.z) && flags.is3D()) || posBwd.x > gridUpper.x ||
+          posBwd.y > gridUpper.y || ((posBwd.z > gridUpper.z) && flags.is3D()) ||
+          flags.isObstacle(posFwd) || flags.isObstacle(posBwd)) {
+        dval = fwd(i, j, k);  // discard the clamped value for this cell
+      }
+    }
+    // clampMode 2 handles flags in doClampComponent call
+
+    dst(i, j, k) = dval;
+  }
+  inline const FlagGrid &getArg0()  // getArgN / typeN: accessor and type of kernel argument N
+  {
+    return flags;
+  }
+  typedef FlagGrid type0;
+  inline const MACGrid &getArg1()
+  {
+    return vel;
+  }
+  typedef MACGrid type1;
+  inline Grid<T> &getArg2()
+  {
+    return dst;
+  }
+  typedef Grid<T> type2;
+  inline const Grid<T> &getArg3()
+  {
+    return orig;
+  }
+  typedef Grid<T> type3;
+  inline const Grid<T> &getArg4()
+  {
+    return fwd;
+  }
+  typedef Grid<T> type4;
+  inline Real &getArg5()
+  {
+    return dt;
+  }
+  typedef Real type5;
+  inline const int &getArg6()
+  {
+    return clampMode;
+  }
+  typedef int type6;
+  void runMessage()  // debug output only
+  {
+    debMsg("Executing kernel MacCormackClamp ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const  // body run by parallel_for
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {  // the range indexes k; j and i start at 1
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, flags, vel, dst, orig, fwd, dt, clampMode);
+    }
+    else {  // single z layer: the range indexes j and k is fixed to 0
+      const int k = 0;
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = 1; i < _maxX; i++)
+          op(i, j, k, flags, vel, dst, orig, fwd, dt, clampMode);
+    }
+  }
+  void run()  // splits the work over k (maxZ > 1) or over j (otherwise)
+  {
+    if (maxZ > 1)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
+  }
+  const FlagGrid &flags;
+  const MACGrid &vel;
+  Grid<T> &dst;
+  const Grid<T> &orig;
+  const Grid<T> &fwd;
+  Real dt;
+  const int clampMode;
+};
+
+//! Kernel: same as MacCormackClamp above, but specialized version for MAC grids: every
+//! component is clamped on its own by doClampComponentMAC, using that component's velocity.
+struct MacCormackClampMAC : public KernelBase {
+  MacCormackClampMAC(const FlagGrid &flags,
+                     const MACGrid &vel,
+                     MACGrid &dst,
+                     const MACGrid &orig,
+                     const MACGrid &fwd,
+                     Real dt,
+                     const int clampMode)
+      : KernelBase(&flags, 1),  // NOTE(review): 1 is presumably the boundary width - confirm
+        flags(flags),
+        vel(vel),
+        dst(dst),
+        orig(orig),
+        fwd(fwd),
+        dt(dt),
+        clampMode(clampMode)
+  {
+    runMessage();
+    run();  // the kernel executes as part of construction
+  }
+  inline void op(int i,
+                 int j,
+                 int k,
+                 const FlagGrid &flags,
+                 const MACGrid &vel,
+                 MACGrid &dst,
+                 const MACGrid &orig,
+                 const MACGrid &fwd,
+                 Real dt,
+                 const int clampMode) const
+  {
+    Vec3 pos(i, j, k);
+    Vec3 dval = dst(i, j, k);
+    Vec3 dfwd = fwd(i, j, k);
+    Vec3i gridUpper = flags.getSize() - 1;  // largest valid index per axis
+
+    dval.x = doClampComponentMAC<0>(
+        flags, gridUpper, dval.x, orig, dfwd.x, pos, vel.getAtMACX(i, j, k) * dt, clampMode);
+    dval.y = doClampComponentMAC<1>(
+        flags, gridUpper, dval.y, orig, dfwd.y, pos, vel.getAtMACY(i, j, k) * dt, clampMode);
+    if (flags.is3D())  // the z component is left as read from dst otherwise
+      dval.z = doClampComponentMAC<2>(
+          flags, gridUpper, dval.z, orig, dfwd.z, pos, vel.getAtMACZ(i, j, k) * dt, clampMode);
+
+    // note - the MAC version currently does not check whether source points were inside an
+    // obstacle! (unlike centered version) this would need to be done for each face separately to
+    // stay symmetric...
+
+    dst(i, j, k) = dval;
+  }
+  inline const FlagGrid &getArg0()  // getArgN / typeN: accessor and type of kernel argument N
+  {
+    return flags;
+  }
+  typedef FlagGrid type0;
+  inline const MACGrid &getArg1()
+  {
+    return vel;
+  }
+  typedef MACGrid type1;
+  inline MACGrid &getArg2()
+  {
+    return dst;
+  }
+  typedef MACGrid type2;
+  inline const MACGrid &getArg3()
+  {
+    return orig;
+  }
+  typedef MACGrid type3;
+  inline const MACGrid &getArg4()
+  {
+    return fwd;
+  }
+  typedef MACGrid type4;
+  inline Real &getArg5()
+  {
+    return dt;
+  }
+  typedef Real type5;
+  inline const int &getArg6()
+  {
+    return clampMode;
+  }
+  typedef int type6;
+  void runMessage()  // debug output only
+  {
+    debMsg("Executing kernel MacCormackClampMAC ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const  // body run by parallel_for
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {  // the range indexes k; j and i start at 1
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, flags, vel, dst, orig, fwd, dt, clampMode);
+    }
+    else {  // single z layer: the range indexes j and k is fixed to 0
+      const int k = 0;
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = 1; i < _maxX; i++)
+          op(i, j, k, flags, vel, dst, orig, fwd, dt, clampMode);
+    }
+  }
+  void run()  // splits the work over k (maxZ > 1) or over j (otherwise)
+  {
+    if (maxZ > 1)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
+  }
+  const FlagGrid &flags;
+  const MACGrid &vel;
+  MACGrid &dst;
+  const MACGrid &orig;
+  const MACGrid &fwd;
+  Real dt;
+  const int clampMode;
+};
+
+//! template function for performing SL advection
+//! (Note boundary width only needed for specialization for MAC grids below)
+template<class GridType>
+void fnAdvectSemiLagrange(FluidSolver *parent,
+                          const FlagGrid &flags,
+                          const MACGrid &vel,
+                          GridType &orig,
+                          int order,
+                          Real strength,
+                          int orderSpace,
+                          int clampMode,
+                          int orderTrace)
+{
+  typedef typename GridType::BASETYPE ValueT;
+  const Real step = parent->getDt();
+  const bool isLs = (orig.getType() & GridBase::TypeLevelset) != 0;
+
+  // first-order result: advect orig forward by one step
+  GridType advected(parent);
+  SemiLagrange<ValueT>(flags, vel, advected, orig, step, isLs, orderSpace, orderTrace);
+
+  if (order == 1) {
+    orig.swap(advected);  // plain semi-Lagrange: the forward step is the result
+    return;
+  }
+
+  if (order != 2)
+    return;  // any other order leaves orig untouched
+
+  // MacCormack: advect the forward result back again by negating the time step
+  GridType reversed(parent);
+  GridType corrected(parent);
+  SemiLagrange<ValueT>(flags, vel, reversed, advected, -step, isLs, orderSpace, orderTrace);
+
+  // corrected <- forward result adjusted by the round-trip difference (orig vs. reversed)
+  MacCormackCorrect<ValueT>(flags, corrected, orig, advected, reversed, strength, isLs);
+
+  // limit the corrected values, then publish them
+  MacCormackClamp<ValueT>(flags, vel, corrected, orig, advected, step, clampMode);
+  orig.swap(corrected);
+}
+
+// outflow functions
+
+//! calculate local propagation velocity for cell (i,j,k)
+Vec3 getBulkVel(const FlagGrid &flags, const MACGrid &vel, int i, int j, int k)
+{
+  const int radius = 1;                           // stencil reaches one cell in each direction
+  const int radiusZ = flags.is3D() ? radius : 0;  // non-3D grids stay within the layer of k
+  Vec3 sum = Vec3(0.);
+  int samples = 0;
+  // accumulate the velocity of every in-bounds fluid or outflow cell of the stencil
+  for (int dz = -radiusZ; dz <= radiusZ; dz++)
+    for (int dy = -radius; dy <= radius; dy++)
+      for (int dx = -radius; dx <= radius; dx++) {
+        const int x = i + dx, y = j + dy, z = k + dz;
+        if (!flags.isInBounds(Vec3i(x, y, z)))
+          continue;
+        if (flags.isFluid(x, y, z) || flags.isOutflow(x, y, z)) {
+          sum += vel(x, y, z);
+          samples++;
+        }
+      }
+  return samples > 0 ? sum / samples : sum;  // the untouched zero vector if nothing contributed
+}
+
+//! Kernel: extrapolate normal velocity components into outflow cells; results are written to velDst
+struct extrapolateVelConvectiveBC : public KernelBase {
+  extrapolateVelConvectiveBC(const FlagGrid &flags,
+                             const MACGrid &vel,
+                             MACGrid &velDst,
+                             const MACGrid &velPrev,
+                             Real timeStep)
+      : KernelBase(&flags, 0),
+        flags(flags),
+        vel(vel),
+        velDst(velDst),
+        velPrev(velPrev),
+        timeStep(timeStep)
+  {
+    runMessage();
+    run();  // the kernel is executed as part of construction
+  }
+  inline void op(int i,  // per-cell body; only cells flagged as outflow are modified
+                 int j,
+                 int k,
+                 const FlagGrid &flags,
+                 const MACGrid &vel,
+                 MACGrid &velDst,
+                 const MACGrid &velPrev,
+                 Real timeStep) const
+  {
+    if (flags.isOutflow(i, j, k)) {
+      Vec3 bulkVel = getBulkVel(flags, vel, i, j, k);  // average of vel over neighbouring fluid / outflow cells
+      int dim = flags.is3D() ? 3 : 2;  // number of velocity components to process
+      const Vec3i cur = Vec3i(i, j, k);
+      Vec3i low, up, flLow, flUp;  // low/up: adjacent cells that are read; flLow/flUp: probes of the fluid search
+      int cnt = 0;  // number of contributions added to velDst(i, j, k)
+      // iterate over each velocity component x, y, z
+      for (int c = 0; c < dim; c++) {
+        low = up = flLow = flUp = cur;  // restart from the current cell for every axis
+        Real factor = timeStep *
+                      max((Real)1.0, bulkVel[c]);  // prevent the extrapolated velocity from
+                                                   // exploding when bulk velocity below 1
+        low[c] = flLow[c] = cur[c] - 1;
+        up[c] = flUp[c] = cur[c] + 1;
+        // iterate over bWidth to allow for extrapolation into more distant outflow cells;
+        // hard-coded extrapolation distance of two cells
+        for (int d = 0; d < 2; d++) {
+          bool extrapolateFromLower = flags.isInBounds(flLow) && flags.isFluid(flLow);
+          bool extrapolateFromUpper = flags.isInBounds(flUp) && flags.isFluid(flUp);
+          if (extrapolateFromLower || extrapolateFromUpper) {
+            if (extrapolateFromLower) {  // NOTE(review): reads vel(low), the adjacent cell, even if the fluid cell was found at distance two - confirm intended
+              velDst(i, j, k) += ((vel(i, j, k) - velPrev(i, j, k)) / factor) + vel(low);
+              cnt++;
+            }
+            if (extrapolateFromUpper) {
+              // check for cells equally far away from two fluid cells -> average value between
+              // both sides
+              velDst(i, j, k) += ((vel(i, j, k) - velPrev(i, j, k)) / factor) + vel(up);
+              cnt++;
+            }
+            break;  // stop at the first distance at which a fluid cell was found
+          }
+          flLow[c]--;  // nothing found yet: move both probes one cell further out
+          flUp[c]++;
+        }
+      }
+      if (cnt > 0)  // turn the accumulated sum into an average over all contributions
+        velDst(i, j, k) /= cnt;
+    }
+  }
+  inline const FlagGrid &getArg0()  // getArgN / typeN: positional access to the kernel arguments and their types
+  {
+    return flags;
+  }
+  typedef FlagGrid type0;
+  inline const MACGrid &getArg1()
+  {
+    return vel;
+  }
+  typedef MACGrid type1;
+  inline MACGrid &getArg2()
+  {
+    return velDst;
+  }
+  typedef MACGrid type2;
+  inline const MACGrid &getArg3()
+  {
+    return velPrev;
+  }
+  typedef MACGrid type3;
+  inline Real &getArg4()
+  {
+    return timeStep;
+  }
+  typedef Real type4;
+  void runMessage()  // emits the kernel name (debug level 3) and its index range (debug level 4)
+  {
+    debMsg("Executing kernel extrapolateVelConvectiveBC ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const  // body handed to tbb::parallel_for for one sub-range
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {  // the range enumerates k; all j and i are visited for each k
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, flags, vel, velDst, velPrev, timeStep);
+    }
+    else {  // the range enumerates j; k is fixed to 0
+      const int k = 0;
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = 0; i < _maxX; i++)
+          op(i, j, k, flags, vel, velDst, velPrev, timeStep);
+    }
+  }
+  void run()  // splits the work over [minZ, maxZ) when maxZ > 1, otherwise over [0, maxY)
+  {
+    if (maxZ > 1)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
+  }
+  const FlagGrid &flags;
+  const MACGrid &vel;  // velocities that are read
+  MACGrid &velDst;  // receives the extrapolated values
+  const MACGrid &velPrev;  // subtracted from vel to form the change term
+  Real timeStep;
+};
+
+//! Kernel: write the extrapolated velocities back. For every outflow cell the value held in
+//! velDst replaces the one in vel; all other cells keep their current value.
+struct copyChangedVels : public KernelBase {
+  copyChangedVels(const FlagGrid &flags, const MACGrid &velDst, MACGrid &vel)
+      : KernelBase(&flags, 0), flags(flags), velDst(velDst), vel(vel)
+  {
+    runMessage();
+    run();
+  }
+  //! per-cell body
+  inline void op(
+      int i, int j, int k, const FlagGrid &flags, const MACGrid &velDst, MACGrid &vel) const
+  {
+    if (!flags.isOutflow(i, j, k))
+      return;
+    vel(i, j, k) = velDst(i, j, k);
+  }
+  // positional access to the kernel arguments and their types
+  using type0 = FlagGrid;
+  inline const FlagGrid &getArg0()
+  {
+    return flags;
+  }
+  using type1 = MACGrid;
+  inline const MACGrid &getArg1()
+  {
+    return velDst;
+  }
+  using type2 = MACGrid;
+  inline MACGrid &getArg2()
+  {
+    return vel;
+  }
+  //! debug output: kernel name (level 3) and index range (level 4)
+  void runMessage()
+  {
+    debMsg("Executing kernel copyChangedVels ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  }
+  //! body run by tbb::parallel_for on one sub-range: k indices when maxZ > 1, j indices otherwise
+  void operator()(const tbb::blocked_range<IndexInt> &range) const
+  {
+    const int xEnd = maxX;
+    const int yEnd = maxY;
+    if (maxZ <= 1) {
+      // single slice k = 0, the range enumerates j
+      for (int y = range.begin(); y != (int)range.end(); y++)
+        for (int x = 0; x < xEnd; x++)
+          op(x, y, 0, flags, velDst, vel);
+      return;
+    }
+    // the range enumerates k
+    for (int z = range.begin(); z != (int)range.end(); z++)
+      for (int y = 0; y < yEnd; y++)
+        for (int x = 0; x < xEnd; x++)
+          op(x, y, z, flags, velDst, vel);
+  }
+  //! start the parallel loop over the outermost index
+  void run()
+  {
+    if (maxZ <= 1) {
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
+      return;
+    }
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+  }
+  const FlagGrid &flags;
+  const MACGrid &velDst;
+  MACGrid &vel;
+};
+
+//! Open-boundary handling for cells marked as outflow: extrapolated velocities are computed
+//! into a scratch grid and afterwards copied into vel for the outflow cells.
+void applyOutflowBC(const FlagGrid &flags, MACGrid &vel, const MACGrid &velPrev, double timeStep)
+{
+  // scratch grid on the same parent, so that vel is not overwritten while it is still read
+  MACGrid extrapolated(vel.getParent());
+  // the kernel receives four times the time step, but never less than 1
+  const double kernelStep = max(1.0, timeStep * 4);
+  extrapolateVelConvectiveBC(flags, vel, extrapolated, velPrev, kernelStep);
+  copyChangedVels(flags, extrapolated, vel);
+}
+
+// advection helpers
+
+//! Kernel: keep parts of the surface from getting "stuck" in obstacle regions. Every obstacle
+//! cell whose sdf value is negative is reset to 0.1.
+struct knResetPhiInObs : public KernelBase {
+  knResetPhiInObs(const FlagGrid &flags, Grid<Real> &sdf)
+      : KernelBase(&flags, 0), flags(flags), sdf(sdf)
+  {
+    runMessage();
+    run();
+  }
+  //! per-cell body
+  inline void op(int i, int j, int k, const FlagGrid &flags, Grid<Real> &sdf) const
+  {
+    if (!flags.isObstacle(i, j, k))
+      return;
+    if (sdf(i, j, k) < 0.) {
+      sdf(i, j, k) = 0.1;
+    }
+  }
+  // positional access to the kernel arguments and their types
+  using type0 = FlagGrid;
+  inline const FlagGrid &getArg0()
+  {
+    return flags;
+  }
+  using type1 = Grid<Real>;
+  inline Grid<Real> &getArg1()
+  {
+    return sdf;
+  }
+  //! debug output: kernel name (level 3) and index range (level 4)
+  void runMessage()
+  {
+    debMsg("Executing kernel knResetPhiInObs ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  }
+  //! body run by tbb::parallel_for on one sub-range: k indices when maxZ > 1, j indices otherwise
+  void operator()(const tbb::blocked_range<IndexInt> &range) const
+  {
+    const int xEnd = maxX;
+    const int yEnd = maxY;
+    if (maxZ <= 1) {
+      // single slice k = 0, the range enumerates j
+      for (int y = range.begin(); y != (int)range.end(); y++)
+        for (int x = 0; x < xEnd; x++)
+          op(x, y, 0, flags, sdf);
+      return;
+    }
+    // the range enumerates k
+    for (int z = range.begin(); z != (int)range.end(); z++)
+      for (int y = 0; y < yEnd; y++)
+        for (int x = 0; x < xEnd; x++)
+          op(x, y, z, flags, sdf);
+  }
+  //! start the parallel loop over the outermost index
+  void run()
+  {
+    if (maxZ <= 1) {
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
+      return;
+    }
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+  }
+  const FlagGrid &flags;
+  Grid<Real> &sdf;
+};
+void resetPhiInObs(const FlagGrid &flags, Grid<Real> &sdf)  // plugin entry point: runs knResetPhiInObs on (flags, sdf); sdf is modified in place
+{
+  knResetPhiInObs(flags, sdf);  // constructing the kernel object executes it
+}
+static PyObject *_W_0(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // Python-callable wrapper around resetPhiInObs
+{
+  try {
+    PbArgs _args(_linargs, _kwds);  // wraps the positional (_linargs) and keyword (_kwds) arguments
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional "notiming" argument, default 0
+    pbPreparePlugin(parent, "resetPhiInObs", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // lives only for this block; handed to every argument lookup below
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 0, &_lock);
+      Grid<Real> &sdf = *_args.getPtr<Grid<Real>>("sdf", 1, &_lock);
+      _retval = getPyNone();  // the plugin itself returns nothing
+      resetPhiInObs(flags, sdf);
+      _args.check();  // presumably verifies that every passed argument was consumed - confirm in PbArgs
+    }
+    pbFinalizePlugin(parent, "resetPhiInObs", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {  // C++ exceptions are reported through pbSetError and a null result
+    pbSetError("resetPhiInObs", e.what());
+    return 0;
+  }
+}
+static const Pb::Register _RP_resetPhiInObs("", "resetPhiInObs", _W_0);  // registration object tying the name "resetPhiInObs" to wrapper _W_0
+extern "C" {
+void PbRegister_resetPhiInObs()  // exported with C linkage; only references the registration object
+{
+  KEEP_UNUSED(_RP_resetPhiInObs);  // presumably prevents the object from being discarded as unused - confirm macro definition
+}
+}
+
+// advection main calls
+
+//! Semi-Lagrangian advection, specialization for MAC grids.
+//! order == 1: a single SL step. order == 2: MacCormack, i.e. forward step, backward step,
+//! correction and clamping. In both cases the outflow boundary condition is applied to the
+//! result before it is swapped into orig. For any other order orig is left untouched.
+template<>
+void fnAdvectSemiLagrange<MACGrid>(FluidSolver *parent,
+                                   const FlagGrid &flags,
+                                   const MACGrid &vel,
+                                   MACGrid &orig,
+                                   int order,
+                                   Real strength,
+                                   int orderSpace,
+                                   int clampMode,
+                                   int orderTrace)
+{
+  const Real dt = parent->getDt();
+
+  // advected <- forward step of orig
+  MACGrid advected(parent);
+  SemiLagrangeMAC(flags, vel, advected, orig, dt, orderSpace, orderTrace);
+
+  if (orderSpace != 1) {
+    debMsg("Warning higher order for MAC grids not yet implemented...", 1);
+  }
+
+  if (order == 1) {
+    // regular SL: the forward step is the result
+    applyOutflowBC(flags, advected, orig, dt);
+    orig.swap(advected);
+    return;
+  }
+  if (order != 2)
+    return;
+
+  // MacCormack
+  MACGrid reversed(parent);
+  MACGrid corrected(parent);
+
+  // reversed <- backward step of the forward result
+  SemiLagrangeMAC(flags, vel, reversed, advected, -dt, orderSpace, orderTrace);
+
+  // corrected <- correction computed from orig, advected and reversed
+  MacCormackCorrectMAC<Vec3>(flags, corrected, orig, advected, reversed, strength, false, true);
+
+  // clamp the corrected values
+  MacCormackClampMAC(flags, vel, corrected, orig, advected, dt, clampMode);
+
+  applyOutflowBC(flags, corrected, orig, dt);
+  orig.swap(corrected);
+}
+
+//! Perform semi-Lagrangian advection of the target grid, dispatching on its type
+//! (Real, MAC or Vec3; the MAC test comes before the Vec3 test).
+//! order: 1 regular SL, 2 MacCormack; any other value fails the assertion.
+//! openBounds / boundaryWidth: deprecated, no effect; a warning is printed when they are set.
+//! Clamping modes: 1 regular clamp leading to more overshoot and sharper results, 2 revert to 1st
+//! order slightly smoother less overshoot (enable when 1 gives artifacts)
+
+void advectSemiLagrange(const FlagGrid *flags,
+                        const MACGrid *vel,
+                        GridBase *grid,
+                        int order = 1,
+                        Real strength = 1.0,
+                        int orderSpace = 1,
+                        bool openBounds = false,
+                        int boundaryWidth = -1,
+                        int clampMode = 2,
+                        int orderTrace = 1)
+{
+  assertMsg(order == 1 || order == 2,
+            "AdvectSemiLagrange: Only order 1 (regular SL) and 2 (MacCormack) supported");
+  if (openBounds || (boundaryWidth != -1)) {
+    debMsg(
+        "Warning: boundaryWidth and openBounds parameters in AdvectSemiLagrange plugin are "
+        "deprecated (and have no more effect), please remove.",
+        0);
+  }
+
+  // determine type of grid
+  const auto gridType = grid->getType();
+
+  if (gridType & GridBase::TypeReal) {
+    Grid<Real> &target = *static_cast<Grid<Real> *>(grid);
+    fnAdvectSemiLagrange<Grid<Real>>(
+        flags->getParent(), *flags, *vel, target, order, strength, orderSpace, clampMode, orderTrace);
+    return;
+  }
+  if (gridType & GridBase::TypeMAC) {
+    MACGrid &target = *static_cast<MACGrid *>(grid);
+    fnAdvectSemiLagrange<MACGrid>(
+        flags->getParent(), *flags, *vel, target, order, strength, orderSpace, clampMode, orderTrace);
+    return;
+  }
+  if (gridType & GridBase::TypeVec3) {
+    Grid<Vec3> &target = *static_cast<Grid<Vec3> *>(grid);
+    fnAdvectSemiLagrange<Grid<Vec3>>(
+        flags->getParent(), *flags, *vel, target, order, strength, orderSpace, clampMode, orderTrace);
+    return;
+  }
+  errMsg("AdvectSemiLagrange: Grid Type is not supported (only Real, Vec3, MAC, Levelset)");
+}
+static PyObject *_W_1(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // Python-callable wrapper around advectSemiLagrange
+{
+  try {
+    PbArgs _args(_linargs, _kwds);  // wraps the positional (_linargs) and keyword (_kwds) arguments
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional "notiming" argument, default 0
+    pbPreparePlugin(parent, "advectSemiLagrange", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // lives only for this block; handed to every argument lookup below
+      const FlagGrid *flags = _args.getPtr<FlagGrid>("flags", 0, &_lock);
+      const MACGrid *vel = _args.getPtr<MACGrid>("vel", 1, &_lock);
+      GridBase *grid = _args.getPtr<GridBase>("grid", 2, &_lock);
+      int order = _args.getOpt<int>("order", 3, 1, &_lock);  // optional arguments: the defaults repeat those of advectSemiLagrange
+      Real strength = _args.getOpt<Real>("strength", 4, 1.0, &_lock);
+      int orderSpace = _args.getOpt<int>("orderSpace", 5, 1, &_lock);
+      bool openBounds = _args.getOpt<bool>("openBounds", 6, false, &_lock);
+      int boundaryWidth = _args.getOpt<int>("boundaryWidth", 7, -1, &_lock);
+      int clampMode = _args.getOpt<int>("clampMode", 8, 2, &_lock);
+      int orderTrace = _args.getOpt<int>("orderTrace", 9, 1, &_lock);
+      _retval = getPyNone();  // the plugin itself returns nothing
+      advectSemiLagrange(flags,
+                         vel,
+                         grid,
+                         order,
+                         strength,
+                         orderSpace,
+                         openBounds,
+                         boundaryWidth,
+                         clampMode,
+                         orderTrace);
+      _args.check();  // presumably verifies that every passed argument was consumed - confirm in PbArgs
+    }
+    pbFinalizePlugin(parent, "advectSemiLagrange", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {  // C++ exceptions are reported through pbSetError and a null result
+    pbSetError("advectSemiLagrange", e.what());
+    return 0;
+  }
+}
+static const Pb::Register _RP_advectSemiLagrange("", "advectSemiLagrange", _W_1);  // registration object tying the name "advectSemiLagrange" to wrapper _W_1
+extern "C" {
+void PbRegister_advectSemiLagrange()  // exported with C linkage; only references the registration object
+{
+  KEEP_UNUSED(_RP_advectSemiLagrange);  // presumably prevents the object from being discarded as unused - confirm macro definition
+}
+}
+
+}  // namespace Manta
diff --git a/extern/mantaflow/preprocessed/plugin/apic.cpp b/extern/mantaflow/preprocessed/plugin/apic.cpp
new file mode 100644
index 00000000000..6ff893014c9
--- /dev/null
+++ b/extern/mantaflow/preprocessed/plugin/apic.cpp
@@ -0,0 +1,496 @@
+
+
+// DO NOT EDIT !
+// This file is generated using the MantaFlow preprocessor (prep generate).
+
+// ----------------------------------------------------------------------------
+//
+// MantaFlow fluid solver framework
+// Copyright 2016-2017 Kiwon Um, Nils Thuerey
+//
+// This program is free software, distributed under the terms of the
+// Apache License, Version 2.0
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Affine Particle-In-Cell
+//
+// ----------------------------------------------------------------------------
+
+#include "particle.h"
+#include "grid.h"
+
+namespace Manta {
+
+struct knApicMapLinearVec3ToMACGrid : public KernelBase {  // kernel: scatter particle velocities vp plus affine terms cpx/cpy/cpz onto vg, and the weights onto mg
+  knApicMapLinearVec3ToMACGrid(const BasicParticleSystem &p,
+                               MACGrid &mg,
+                               MACGrid &vg,
+                               const ParticleDataImpl<Vec3> &vp,
+                               const ParticleDataImpl<Vec3> &cpx,
+                               const ParticleDataImpl<Vec3> &cpy,
+                               const ParticleDataImpl<Vec3> &cpz,
+                               const ParticleDataImpl<int> *ptype,
+                               const int exclude)
+      : KernelBase(p.size()),
+        p(p),
+        mg(mg),
+        vg(vg),
+        vp(vp),
+        cpx(cpx),
+        cpy(cpy),
+        cpz(cpz),
+        ptype(ptype),
+        exclude(exclude)
+  {
+    runMessage();
+    run();  // the kernel is executed as part of construction
+  }
+  inline void op(IndexInt idx,  // per-particle body; adds into mg and vg, nothing is reset here
+                 const BasicParticleSystem &p,
+                 MACGrid &mg,
+                 MACGrid &vg,
+                 const ParticleDataImpl<Vec3> &vp,
+                 const ParticleDataImpl<Vec3> &cpx,
+                 const ParticleDataImpl<Vec3> &cpy,
+                 const ParticleDataImpl<Vec3> &cpz,
+                 const ParticleDataImpl<int> *ptype,
+                 const int exclude)
+  {
+    if (!p.isActive(idx) || (ptype && ((*ptype)[idx] & exclude)))  // skip inactive particles and, if ptype is given, those matching the exclude mask
+      return;
+    const IndexInt dX[2] = {0, vg.getStrideX()};  // linear-index offsets of the two corners along x
+    const IndexInt dY[2] = {0, vg.getStrideY()};
+    const IndexInt dZ[2] = {0, vg.getStrideZ()};
+
+    const Vec3 &pos = p[idx].pos, &vel = vp[idx];
+    const IndexInt fi = static_cast<IndexInt>(pos.x), fj = static_cast<IndexInt>(pos.y),
+                   fk = static_cast<IndexInt>(pos.z);  // fi/fj/fk: position truncated to integers
+    const IndexInt ci = static_cast<IndexInt>(pos.x - 0.5),
+                   cj = static_cast<IndexInt>(pos.y - 0.5),
+                   ck = static_cast<IndexInt>(pos.z - 0.5);  // ci/cj/ck: position shifted by -0.5, then truncated
+    const Real wfi = clamp(pos.x - fi, Real(0), Real(1)),
+               wfj = clamp(pos.y - fj, Real(0), Real(1)),
+               wfk = clamp(pos.z - fk, Real(0), Real(1));  // offsets from fi/fj/fk, clamped to [0, 1]
+    const Real wci = clamp(Real(pos.x - ci - 0.5), Real(0), Real(1)),
+               wcj = clamp(Real(pos.y - cj - 0.5), Real(0), Real(1)),
+               wck = clamp(Real(pos.z - ck - 0.5), Real(0), Real(1));  // offsets from ci/cj/ck + 0.5, clamped to [0, 1]
+    // TODO: check index for safety
+    {  // u-face: x components, base corner (fi, cj, ck)
+      const IndexInt gidx = fi * dX[1] + cj * dY[1] + ck * dZ[1];
+      const Vec3 gpos(fi, cj + 0.5, ck + 0.5);  // position of the base corner
+      const Real wi[2] = {Real(1) - wfi, wfi};
+      const Real wj[2] = {Real(1) - wcj, wcj};
+      const Real wk[2] = {Real(1) - wck, wck};
+      for (int i = 0; i < 2; ++i)
+        for (int j = 0; j < 2; ++j)
+          for (int k = 0; k < 2; ++k) {
+            const Real w = wi[i] * wj[j] * wk[k];  // product of the per-axis weights for this corner
+            mg[gidx + dX[i] + dY[j] + dZ[k]].x += w;  // accumulate the weight
+            vg[gidx + dX[i] + dY[j] + dZ[k]].x += w * vel.x;  // weighted particle velocity
+            vg[gidx + dX[i] + dY[j] + dZ[k]].x += w * dot(cpx[idx], gpos + Vec3(i, j, k) - pos);  // weighted affine term: cpx . (corner position - particle position)
+          }
+    }
+    {  // v-face: y components, base corner (ci, fj, ck)
+      const IndexInt gidx = ci * dX[1] + fj * dY[1] + ck * dZ[1];
+      const Vec3 gpos(ci + 0.5, fj, ck + 0.5);
+      const Real wi[2] = {Real(1) - wci, wci};
+      const Real wj[2] = {Real(1) - wfj, wfj};
+      const Real wk[2] = {Real(1) - wck, wck};
+      for (int i = 0; i < 2; ++i)
+        for (int j = 0; j < 2; ++j)
+          for (int k = 0; k < 2; ++k) {
+            const Real w = wi[i] * wj[j] * wk[k];
+            mg[gidx + dX[i] + dY[j] + dZ[k]].y += w;
+            vg[gidx + dX[i] + dY[j] + dZ[k]].y += w * vel.y;
+            vg[gidx + dX[i] + dY[j] + dZ[k]].y += w * dot(cpy[idx], gpos + Vec3(i, j, k) - pos);
+          }
+    }
+    if (!vg.is3D())  // for grids that are not 3D the z components are not touched
+      return;
+    {  // w-face: z components, base corner (ci, cj, fk)
+      const IndexInt gidx = ci * dX[1] + cj * dY[1] + fk * dZ[1];
+      const Vec3 gpos(ci + 0.5, cj + 0.5, fk);
+      const Real wi[2] = {Real(1) - wci, wci};
+      const Real wj[2] = {Real(1) - wcj, wcj};
+      const Real wk[2] = {Real(1) - wfk, wfk};
+      for (int i = 0; i < 2; ++i)
+        for (int j = 0; j < 2; ++j)
+          for (int k = 0; k < 2; ++k) {
+            const Real w = wi[i] * wj[j] * wk[k];
+            mg[gidx + dX[i] + dY[j] + dZ[k]].z += w;
+            vg[gidx + dX[i] + dY[j] + dZ[k]].z += w * vel.z;
+            vg[gidx + dX[i] + dY[j] + dZ[k]].z += w * dot(cpz[idx], gpos + Vec3(i, j, k) - pos);
+          }
+    }
+  }
+  inline const BasicParticleSystem &getArg0()  // getArgN / typeN: positional access to the kernel arguments and their types
+  {
+    return p;
+  }
+  typedef BasicParticleSystem type0;
+  inline MACGrid &getArg1()
+  {
+    return mg;
+  }
+  typedef MACGrid type1;
+  inline MACGrid &getArg2()
+  {
+    return vg;
+  }
+  typedef MACGrid type2;
+  inline const ParticleDataImpl<Vec3> &getArg3()
+  {
+    return vp;
+  }
+  typedef ParticleDataImpl<Vec3> type3;
+  inline const ParticleDataImpl<Vec3> &getArg4()
+  {
+    return cpx;
+  }
+  typedef ParticleDataImpl<Vec3> type4;
+  inline const ParticleDataImpl<Vec3> &getArg5()
+  {
+    return cpy;
+  }
+  typedef ParticleDataImpl<Vec3> type5;
+  inline const ParticleDataImpl<Vec3> &getArg6()
+  {
+    return cpz;
+  }
+  typedef ParticleDataImpl<Vec3> type6;
+  inline const ParticleDataImpl<int> *getArg7()
+  {
+    return ptype;
+  }
+  typedef ParticleDataImpl<int> type7;
+  inline const int &getArg8()
+  {
+    return exclude;
+  }
+  typedef int type8;
+  void runMessage()  // emits the kernel name (debug level 3) and its size (debug level 4)
+  {
+    debMsg("Executing kernel knApicMapLinearVec3ToMACGrid ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  };
+  void run()  // plain sequential loop over all particle indices
+  {
+    const IndexInt _sz = size;
+    for (IndexInt i = 0; i < _sz; i++)
+      op(i, p, mg, vg, vp, cpx, cpy, cpz, ptype, exclude);
+  }
+  const BasicParticleSystem &p;
+  MACGrid &mg;  // receives the summed weights
+  MACGrid &vg;  // receives the weighted velocities
+  const ParticleDataImpl<Vec3> &vp;
+  const ParticleDataImpl<Vec3> &cpx;
+  const ParticleDataImpl<Vec3> &cpy;
+  const ParticleDataImpl<Vec3> &cpz;
+  const ParticleDataImpl<int> *ptype;  // may be null, in which case no particle is excluded by type
+  const int exclude;
+};
+
+//! Map particle velocities onto the MAC grid vel using the affine (APIC) transfer.
+//! vel is cleared, the weighted particle contributions are accumulated by
+//! knApicMapLinearVec3ToMACGrid, and the result is divided by the accumulated weights.
+//! \param flags    only used to obtain the parent for a temporary weight grid
+//! \param vel      output grid, overwritten
+//! \param parts    particle system providing positions and the active state
+//! \param partVel  per-particle velocities
+//! \param cpx,cpy,cpz  per-particle affine terms of the x, y and z velocity component
+//! \param mass     optional grid receiving the accumulated weights; it is cleared first.
+//!                 When NULL, a temporary grid is used and released before returning.
+//! \param ptype    optional per-particle type flags
+//! \param exclude  particles whose type has any of these bits set are skipped
+void apicMapPartsToMAC(const FlagGrid &flags,
+                       MACGrid &vel,
+                       const BasicParticleSystem &parts,
+                       const ParticleDataImpl<Vec3> &partVel,
+                       const ParticleDataImpl<Vec3> &cpx,
+                       const ParticleDataImpl<Vec3> &cpy,
+                       const ParticleDataImpl<Vec3> &cpz,
+                       MACGrid *mass = NULL,
+                       const ParticleDataImpl<int> *ptype = NULL,
+                       const int exclude = 0)
+{
+  // affine map
+  // let's assume that the particle mass is constant, 1.0
+
+  // Owner of the temporary weight grid. Releasing it in the destructor also covers the
+  // case where one of the calls below throws, which a trailing delete would miss.
+  struct ScratchGrid {
+    ScratchGrid() : grid(NULL)
+    {
+    }
+    ~ScratchGrid()
+    {
+      delete grid;
+    }
+    MACGrid *grid;
+
+   private:
+    ScratchGrid(const ScratchGrid &);
+    ScratchGrid &operator=(const ScratchGrid &);
+  } scratch;
+
+  if (!mass)
+    mass = scratch.grid = new MACGrid(flags.getParent());
+  else
+    mass->clear();
+
+  vel.clear();
+  knApicMapLinearVec3ToMACGrid(parts, *mass, vel, partVel, cpx, cpy, cpz, ptype, exclude);
+  // NOTE(review): stomp presumably zeroes weights below VECTOR_EPSILON before the division - confirm
+  mass->stomp(VECTOR_EPSILON);
+  vel.safeDivide(*mass);
+}
+static PyObject *_W_0(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // Python-callable wrapper around apicMapPartsToMAC
+{
+  try {
+    PbArgs _args(_linargs, _kwds);  // wraps the positional (_linargs) and keyword (_kwds) arguments
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional "notiming" argument, default 0
+    pbPreparePlugin(parent, "apicMapPartsToMAC", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // lives only for this block; handed to every argument lookup below
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 0, &_lock);
+      MACGrid &vel = *_args.getPtr<MACGrid>("vel", 1, &_lock);
+      const BasicParticleSystem &parts = *_args.getPtr<BasicParticleSystem>("parts", 2, &_lock);
+      const ParticleDataImpl<Vec3> &partVel = *_args.getPtr<ParticleDataImpl<Vec3>>(
+          "partVel", 3, &_lock);
+      const ParticleDataImpl<Vec3> &cpx = *_args.getPtr<ParticleDataImpl<Vec3>>("cpx", 4, &_lock);
+      const ParticleDataImpl<Vec3> &cpy = *_args.getPtr<ParticleDataImpl<Vec3>>("cpy", 5, &_lock);
+      const ParticleDataImpl<Vec3> &cpz = *_args.getPtr<ParticleDataImpl<Vec3>>("cpz", 6, &_lock);
+      MACGrid *mass = _args.getPtrOpt<MACGrid>("mass", 7, NULL, &_lock);  // optional arguments: the defaults repeat those of apicMapPartsToMAC
+      const ParticleDataImpl<int> *ptype = _args.getPtrOpt<ParticleDataImpl<int>>(
+          "ptype", 8, NULL, &_lock);
+      const int exclude = _args.getOpt<int>("exclude", 9, 0, &_lock);
+      _retval = getPyNone();  // the plugin itself returns nothing
+      apicMapPartsToMAC(flags, vel, parts, partVel, cpx, cpy, cpz, mass, ptype, exclude);
+      _args.check();  // presumably verifies that every passed argument was consumed - confirm in PbArgs
+    }
+    pbFinalizePlugin(parent, "apicMapPartsToMAC", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {  // C++ exceptions are reported through pbSetError and a null result
+    pbSetError("apicMapPartsToMAC", e.what());
+    return 0;
+  }
+}
+static const Pb::Register _RP_apicMapPartsToMAC("", "apicMapPartsToMAC", _W_0);  // registration object tying the name "apicMapPartsToMAC" to wrapper _W_0
+extern "C" {
+void PbRegister_apicMapPartsToMAC()  // exported with C linkage; only references the registration object
+{
+  KEEP_UNUSED(_RP_apicMapPartsToMAC);  // presumably prevents the object from being discarded as unused - confirm macro definition
+}
+}
+
+struct knApicMapLinearMACGridToVec3 : public KernelBase {  // kernel: gather grid velocities vg into particle velocities vp and affine terms cpx/cpy/cpz
+  knApicMapLinearMACGridToVec3(ParticleDataImpl<Vec3> &vp,
+                               ParticleDataImpl<Vec3> &cpx,
+                               ParticleDataImpl<Vec3> &cpy,
+                               ParticleDataImpl<Vec3> &cpz,
+                               const BasicParticleSystem &p,
+                               const MACGrid &vg,
+                               const FlagGrid &flags,
+                               const ParticleDataImpl<int> *ptype,
+                               const int exclude)
+      : KernelBase(vp.size()),
+        vp(vp),
+        cpx(cpx),
+        cpy(cpy),
+        cpz(cpz),
+        p(p),
+        vg(vg),
+        flags(flags),
+        ptype(ptype),
+        exclude(exclude)
+  {
+    runMessage();
+    run();  // the kernel is executed as part of construction
+  }
+  inline void op(IndexInt idx,  // per-particle body; writes only entries idx of vp, cpx, cpy and cpz
+                 ParticleDataImpl<Vec3> &vp,
+                 ParticleDataImpl<Vec3> &cpx,
+                 ParticleDataImpl<Vec3> &cpy,
+                 ParticleDataImpl<Vec3> &cpz,
+                 const BasicParticleSystem &p,
+                 const MACGrid &vg,
+                 const FlagGrid &flags,
+                 const ParticleDataImpl<int> *ptype,
+                 const int exclude) const
+  {
+    if (!p.isActive(idx) || (ptype && ((*ptype)[idx] & exclude)))  // skipped particles keep their previous vp / cp values
+      return;
+
+    vp[idx] = cpx[idx] = cpy[idx] = cpz[idx] = Vec3(Real(0));  // reset all outputs of this particle before accumulating
+    const IndexInt dX[2] = {0, vg.getStrideX()}, dY[2] = {0, vg.getStrideY()},
+                   dZ[2] = {0, vg.getStrideZ()};  // linear-index offsets of the two corners along each axis
+    const Real gw[2] = {-Real(1), Real(1)};  // signed weights used in place of the interpolation weight along the differentiated axis
+
+    const Vec3 &pos = p[idx].pos;
+    const IndexInt fi = static_cast<IndexInt>(pos.x), fj = static_cast<IndexInt>(pos.y),
+                   fk = static_cast<IndexInt>(pos.z);  // fi/fj/fk: position truncated to integers
+    const IndexInt ci = static_cast<IndexInt>(pos.x - 0.5),
+                   cj = static_cast<IndexInt>(pos.y - 0.5),
+                   ck = static_cast<IndexInt>(pos.z - 0.5);  // ci/cj/ck: position shifted by -0.5, then truncated
+    const Real wfi = clamp(pos.x - fi, Real(0), Real(1)),
+               wfj = clamp(pos.y - fj, Real(0), Real(1)),
+               wfk = clamp(pos.z - fk, Real(0), Real(1));  // offsets from fi/fj/fk, clamped to [0, 1]
+    const Real wci = clamp(Real(pos.x - ci - 0.5), Real(0), Real(1)),
+               wcj = clamp(Real(pos.y - cj - 0.5), Real(0), Real(1)),
+               wck = clamp(Real(pos.z - ck - 0.5), Real(0), Real(1));  // offsets from ci/cj/ck + 0.5, clamped to [0, 1]
+    // TODO: check index for safety
+    {  // u: x components, base corner (fi, cj, ck)
+      const IndexInt gidx = fi * dX[1] + cj * dY[1] + ck * dZ[1];
+      const Real wx[2] = {Real(1) - wfi, wfi};
+      const Real wy[2] = {Real(1) - wcj, wcj};
+      const Real wz[2] = {Real(1) - wck, wck};
+      for (int i = 0; i < 2; ++i)
+        for (int j = 0; j < 2; ++j)
+          for (int k = 0; k < 2; ++k) {
+            const IndexInt vidx = gidx + dX[i] + dY[j] + dZ[k];
+            Real vgx = vg[vidx].x;
+            vp[idx].x += wx[i] * wy[j] * wz[k] * vgx;  // interpolated x velocity
+            cpx[idx].x += gw[i] * wy[j] * wz[k] * vgx;  // difference of the x velocity along x
+            cpx[idx].y += wx[i] * gw[j] * wz[k] * vgx;  // ... along y
+            cpx[idx].z += wx[i] * wy[j] * gw[k] * vgx;  // ... along z
+          }
+    }
+    {  // v: y components, base corner (ci, fj, ck)
+      const IndexInt gidx = ci * dX[1] + fj * dY[1] + ck * dZ[1];
+      const Real wx[2] = {Real(1) - wci, wci};
+      const Real wy[2] = {Real(1) - wfj, wfj};
+      const Real wz[2] = {Real(1) - wck, wck};
+      for (int i = 0; i < 2; ++i)
+        for (int j = 0; j < 2; ++j)
+          for (int k = 0; k < 2; ++k) {
+            const IndexInt vidx = gidx + dX[i] + dY[j] + dZ[k];
+            Real vgy = vg[vidx].y;
+            vp[idx].y += wx[i] * wy[j] * wz[k] * vgy;
+            cpy[idx].x += gw[i] * wy[j] * wz[k] * vgy;
+            cpy[idx].y += wx[i] * gw[j] * wz[k] * vgy;
+            cpy[idx].z += wx[i] * wy[j] * gw[k] * vgy;
+          }
+    }
+    if (!vg.is3D())  // for grids that are not 3D, vp.z and cpz stay at the zero set above
+      return;
+    {  // w: z components, base corner (ci, cj, fk)
+      const IndexInt gidx = ci * dX[1] + cj * dY[1] + fk * dZ[1];
+      const Real wx[2] = {Real(1) - wci, wci};
+      const Real wy[2] = {Real(1) - wcj, wcj};
+      const Real wz[2] = {Real(1) - wfk, wfk};
+      for (int i = 0; i < 2; ++i)
+        for (int j = 0; j < 2; ++j)
+          for (int k = 0; k < 2; ++k) {
+            const IndexInt vidx = gidx + dX[i] + dY[j] + dZ[k];
+            Real vgz = vg[vidx].z;
+            vp[idx].z += wx[i] * wy[j] * wz[k] * vgz;
+            cpz[idx].x += gw[i] * wy[j] * wz[k] * vgz;
+            cpz[idx].y += wx[i] * gw[j] * wz[k] * vgz;
+            cpz[idx].z += wx[i] * wy[j] * gw[k] * vgz;
+          }
+    }
+  }
+  inline ParticleDataImpl<Vec3> &getArg0()  // getArgN / typeN: positional access to the kernel arguments and their types
+  {
+    return vp;
+  }
+  typedef ParticleDataImpl<Vec3> type0;
+  inline ParticleDataImpl<Vec3> &getArg1()
+  {
+    return cpx;
+  }
+  typedef ParticleDataImpl<Vec3> type1;
+  inline ParticleDataImpl<Vec3> &getArg2()
+  {
+    return cpy;
+  }
+  typedef ParticleDataImpl<Vec3> type2;
+  inline ParticleDataImpl<Vec3> &getArg3()
+  {
+    return cpz;
+  }
+  typedef ParticleDataImpl<Vec3> type3;
+  inline const BasicParticleSystem &getArg4()
+  {
+    return p;
+  }
+  typedef BasicParticleSystem type4;
+  inline const MACGrid &getArg5()
+  {
+    return vg;
+  }
+  typedef MACGrid type5;
+  inline const FlagGrid &getArg6()
+  {
+    return flags;
+  }
+  typedef FlagGrid type6;
+  inline const ParticleDataImpl<int> *getArg7()
+  {
+    return ptype;
+  }
+  typedef ParticleDataImpl<int> type7;
+  inline const int &getArg8()
+  {
+    return exclude;
+  }
+  typedef int type8;
+  void runMessage()  // emits the kernel name (debug level 3) and its size (debug level 4)
+  {
+    debMsg("Executing kernel knApicMapLinearMACGridToVec3 ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const  // body handed to tbb::parallel_for for one sub-range of particle indices
+  {
+    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
+      op(idx, vp, cpx, cpy, cpz, p, vg, flags, ptype, exclude);
+  }
+  void run()  // splits the index range [0, size) via tbb::parallel_for
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+  ParticleDataImpl<Vec3> &vp;  // output: interpolated velocity per particle
+  ParticleDataImpl<Vec3> &cpx;  // output: affine terms of the x velocity
+  ParticleDataImpl<Vec3> &cpy;  // output: affine terms of the y velocity
+  ParticleDataImpl<Vec3> &cpz;  // output: affine terms of the z velocity
+  const BasicParticleSystem &p;
+  const MACGrid &vg;
+  const FlagGrid &flags;  // passed through to op, which does not read it
+  const ParticleDataImpl<int> *ptype;  // may be null, in which case no particle is excluded by type
+  const int exclude;
+};
+
+void apicMapMACGridToParts(ParticleDataImpl<Vec3> &partVel,  // plugin entry point: fills partVel and cpx/cpy/cpz from the grid velocity vel
+                           ParticleDataImpl<Vec3> &cpx,
+                           ParticleDataImpl<Vec3> &cpy,
+                           ParticleDataImpl<Vec3> &cpz,
+                           const BasicParticleSystem &parts,
+                           const MACGrid &vel,
+                           const FlagGrid &flags,
+                           const ParticleDataImpl<int> *ptype = NULL,  // optional per-particle type flags
+                           const int exclude = 0)  // particles whose type has any of these bits set are skipped
+{
+  knApicMapLinearMACGridToVec3(partVel, cpx, cpy, cpz, parts, vel, flags, ptype, exclude);  // constructing the kernel object executes it
+}
+static PyObject *_W_1(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // Python-callable wrapper around apicMapMACGridToParts
+{
+  try {
+    PbArgs _args(_linargs, _kwds);  // wraps the positional (_linargs) and keyword (_kwds) arguments
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional "notiming" argument, default 0
+    pbPreparePlugin(parent, "apicMapMACGridToParts", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // lives only for this block; handed to every argument lookup below
+      ParticleDataImpl<Vec3> &partVel = *_args.getPtr<ParticleDataImpl<Vec3>>(
+          "partVel", 0, &_lock);
+      ParticleDataImpl<Vec3> &cpx = *_args.getPtr<ParticleDataImpl<Vec3>>("cpx", 1, &_lock);
+      ParticleDataImpl<Vec3> &cpy = *_args.getPtr<ParticleDataImpl<Vec3>>("cpy", 2, &_lock);
+      ParticleDataImpl<Vec3> &cpz = *_args.getPtr<ParticleDataImpl<Vec3>>("cpz", 3, &_lock);
+      const BasicParticleSystem &parts = *_args.getPtr<BasicParticleSystem>("parts", 4, &_lock);
+      const MACGrid &vel = *_args.getPtr<MACGrid>("vel", 5, &_lock);
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 6, &_lock);
+      const ParticleDataImpl<int> *ptype = _args.getPtrOpt<ParticleDataImpl<int>>(
+          "ptype", 7, NULL, &_lock);  // optional arguments: the defaults repeat those of apicMapMACGridToParts
+      const int exclude = _args.getOpt<int>("exclude", 8, 0, &_lock);
+      _retval = getPyNone();  // the plugin itself returns nothing
+      apicMapMACGridToParts(partVel, cpx, cpy, cpz, parts, vel, flags, ptype, exclude);
+      _args.check();  // presumably verifies that every passed argument was consumed - confirm in PbArgs
+    }
+    pbFinalizePlugin(parent, "apicMapMACGridToParts", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {  // C++ exceptions are reported through pbSetError and a null result
+    pbSetError("apicMapMACGridToParts", e.what());
+    return 0;
+  }
+}
+static const Pb::Register _RP_apicMapMACGridToParts("", "apicMapMACGridToParts", _W_1);  // registration object tying the name "apicMapMACGridToParts" to wrapper _W_1
+extern "C" {
+void PbRegister_apicMapMACGridToParts()  // exported with C linkage; only references the registration object
+{
+  KEEP_UNUSED(_RP_apicMapMACGridToParts);  // presumably prevents the object from being discarded as unused - confirm macro definition
+}
+}
+
+}  // namespace Manta
diff --git a/extern/mantaflow/preprocessed/plugin/extforces.cpp b/extern/mantaflow/preprocessed/plugin/extforces.cpp
new file mode 100644
index 00000000000..3e1e5733257
--- /dev/null
+++ b/extern/mantaflow/preprocessed/plugin/extforces.cpp
@@ -0,0 +1,1559 @@
+
+
+// DO NOT EDIT !
+// This file is generated using the MantaFlow preprocessor (prep generate).
+
+/******************************************************************************
+ *
+ * MantaFlow fluid solver framework
+ * Copyright 2011 Tobias Pfaff, Nils Thuerey
+ *
+ * This program is free software, distributed under the terms of the
+ * GNU General Public License (GPL)
+ * http://www.gnu.org/licenses
+ *
+ * Set boundary conditions, gravity
+ *
+ ******************************************************************************/
+
+#include "vectorbase.h"
+#include "grid.h"
+#include "commonkernels.h"
+#include "particle.h"
+
+using namespace std;
+
+namespace Manta {
+
+//! Kernel: apply a per-cell force grid to the MAC velocities of faces lying between
+//! fluid/fluid and fluid/empty cells. The kernel executes from its constructor.
+//!  - include:  optional grid; cells whose include value is > 0 are skipped
+//!  - additive: true adds the force to the velocity, false overwrites the velocity
+//!  - isMAC:    true reads the force component of the cell directly, false averages
+//!              it with the lower neighbor along the component's axis
+struct KnApplyForceField : public KernelBase {
+  KnApplyForceField(const FlagGrid &flags,
+                    MACGrid &vel,
+                    const Grid<Vec3> &force,
+                    const Grid<Real> *include,
+                    bool additive,
+                    bool isMAC)
+      : KernelBase(&flags, 1),
+        flags(flags),
+        vel(vel),
+        force(force),
+        include(include),
+        additive(additive),
+        isMAC(isMAC)
+  {
+    // constructing the object runs the kernel immediately
+    runMessage();
+    run();
+  }
+  //! per-cell operation, called for every (i, j, k) visited by operator()
+  inline void op(int i,
+                 int j,
+                 int k,
+                 const FlagGrid &flags,
+                 MACGrid &vel,
+                 const Grid<Vec3> &force,
+                 const Grid<Real> *include,
+                 bool additive,
+                 bool isMAC) const
+  {
+    bool curFluid = flags.isFluid(i, j, k);
+    bool curEmpty = flags.isEmpty(i, j, k);
+    // only fluid and empty cells are processed
+    if (!curFluid && !curEmpty)
+      return;
+    // note the sign: a positive include value means the cell is skipped
+    if (include && ((*include)(i, j, k) > 0.))
+      return;
+
+    // force per component: taken as-is (isMAC) or averaged with the lower neighbor
+    Real forceX = (isMAC) ? force(i, j, k).x : 0.5 * (force(i - 1, j, k).x + force(i, j, k).x);
+    Real forceY = (isMAC) ? force(i, j, k).y : 0.5 * (force(i, j - 1, k).y + force(i, j, k).y);
+
+    // the z component stays 0 unless the velocity grid is 3D
+    Real forceZ = 0.;
+    if (vel.is3D())
+      forceZ = (isMAC) ? force(i, j, k).z : 0.5 * (force(i, j, k - 1).z + force(i, j, k).z);
+
+    // a component is updated if the lower neighbor is fluid, or if this cell is fluid
+    // and the lower neighbor is empty
+    if (flags.isFluid(i - 1, j, k) || (curFluid && flags.isEmpty(i - 1, j, k)))
+      vel(i, j, k).x = (additive) ? vel(i, j, k).x + forceX : forceX;
+    if (flags.isFluid(i, j - 1, k) || (curFluid && flags.isEmpty(i, j - 1, k)))
+      vel(i, j, k).y = (additive) ? vel(i, j, k).y + forceY : forceY;
+    if (vel.is3D() && (flags.isFluid(i, j, k - 1) || (curFluid && flags.isEmpty(i, j, k - 1))))
+      vel(i, j, k).z = (additive) ? vel(i, j, k).z + forceZ : forceZ;
+  }
+  // accessors and type aliases for the kernel arguments, numbered in constructor order
+  inline const FlagGrid &getArg0()
+  {
+    return flags;
+  }
+  typedef FlagGrid type0;
+  inline MACGrid &getArg1()
+  {
+    return vel;
+  }
+  typedef MACGrid type1;
+  inline const Grid<Vec3> &getArg2()
+  {
+    return force;
+  }
+  typedef Grid<Vec3> type2;
+  inline const Grid<Real> *getArg3()
+  {
+    return include;
+  }
+  typedef Grid<Real> type3;
+  inline bool &getArg4()
+  {
+    return additive;
+  }
+  typedef bool type4;
+  inline bool &getArg5()
+  {
+    return isMAC;
+  }
+  typedef bool type5;
+  //! emit debug messages with the kernel name (level 3) and its index range (level 4)
+  void runMessage()
+  {
+    debMsg("Executing kernel KnApplyForceField ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  //! body executed by tbb::parallel_for for one sub-range
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {
+      // 3D: the range enumerates k; i and j start at 1
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, flags, vel, force, include, additive, isMAC);
+    }
+    else {
+      // 2D: the range enumerates j and k is fixed to 0
+      const int k = 0;
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = 1; i < _maxX; i++)
+          op(i, j, k, flags, vel, force, include, additive, isMAC);
+    }
+  }
+  //! run the kernel in parallel: over [minZ, maxZ) when maxZ > 1, otherwise over [1, maxY)
+  void run()
+  {
+    if (maxZ > 1)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
+  }
+  const FlagGrid &flags;
+  MACGrid &vel;
+  const Grid<Vec3> &force;
+  const Grid<Real> *include;
+  bool additive;
+  bool isMAC;
+};
+
+//! Kernel: apply one constant force vector to the MAC velocities of faces lying
+//! between fluid/fluid and fluid/empty cells. The kernel executes from its constructor.
+struct KnApplyForce : public KernelBase {
+  KnApplyForce(
+      const FlagGrid &flags, MACGrid &vel, Vec3 force, const Grid<Real> *exclude, bool additive)
+      : KernelBase(&flags, 1),
+        flags(flags),
+        vel(vel),
+        force(force),
+        exclude(exclude),
+        additive(additive)
+  {
+    runMessage();
+    run();
+  }
+  //! per-cell operation; exclude may be NULL, cells with a negative exclude value are skipped
+  inline void op(int i,
+                 int j,
+                 int k,
+                 const FlagGrid &flags,
+                 MACGrid &vel,
+                 Vec3 force,
+                 const Grid<Real> *exclude,
+                 bool additive) const
+  {
+    const bool fluidHere = flags.isFluid(i, j, k);
+    // cells that are neither fluid nor empty are left untouched
+    if (!(fluidHere || flags.isEmpty(i, j, k)))
+      return;
+    if (exclude && (*exclude)(i, j, k) < 0.)
+      return;
+
+    // a face is updated if the lower neighbor is fluid, or if this cell is fluid and the
+    // lower neighbor is empty
+    const bool faceX = flags.isFluid(i - 1, j, k) || (fluidHere && flags.isEmpty(i - 1, j, k));
+    const bool faceY = flags.isFluid(i, j - 1, k) || (fluidHere && flags.isEmpty(i, j - 1, k));
+    const bool faceZ = vel.is3D() &&
+                       (flags.isFluid(i, j, k - 1) || (fluidHere && flags.isEmpty(i, j, k - 1)));
+
+    Vec3 &cellVel = vel(i, j, k);
+    if (faceX)
+      cellVel.x = additive ? cellVel.x + force.x : force.x;
+    if (faceY)
+      cellVel.y = additive ? cellVel.y + force.y : force.y;
+    if (faceZ)
+      cellVel.z = additive ? cellVel.z + force.z : force.z;
+  }
+  // accessors and type aliases for the kernel arguments, numbered in constructor order
+  inline const FlagGrid &getArg0()
+  {
+    return flags;
+  }
+  typedef FlagGrid type0;
+  inline MACGrid &getArg1()
+  {
+    return vel;
+  }
+  typedef MACGrid type1;
+  inline Vec3 &getArg2()
+  {
+    return force;
+  }
+  typedef Vec3 type2;
+  inline const Grid<Real> *getArg3()
+  {
+    return exclude;
+  }
+  typedef Grid<Real> type3;
+  inline bool &getArg4()
+  {
+    return additive;
+  }
+  typedef bool type4;
+  //! emit debug messages with the kernel name (level 3) and its index range (level 4)
+  void runMessage()
+  {
+    debMsg("Executing kernel KnApplyForce ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  //! body executed by tbb::parallel_for for one sub-range
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const int endX = maxX;
+    const int endY = maxY;
+    const int first = __r.begin();
+    const int last = (int)__r.end();
+    if (maxZ > 1) {
+      // 3D: the range enumerates k
+      for (int k = first; k != last; k++) {
+        for (int j = 1; j < endY; j++) {
+          for (int i = 1; i < endX; i++) {
+            op(i, j, k, flags, vel, force, exclude, additive);
+          }
+        }
+      }
+      return;
+    }
+    // 2D: the range enumerates j and k is fixed to 0
+    for (int j = first; j != last; j++) {
+      for (int i = 1; i < endX; i++) {
+        op(i, j, 0, flags, vel, force, exclude, additive);
+      }
+    }
+  }
+  //! run the kernel in parallel: over [minZ, maxZ) when maxZ > 1, otherwise over [1, maxY)
+  void run()
+  {
+    if (maxZ > 1) {
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+    }
+    else {
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
+    }
+  }
+  const FlagGrid &flags;
+  MACGrid &vel;
+  Vec3 force;
+  const Grid<Real> *exclude;
+  bool additive;
+};
+
+//! Add gravity to the fluid velocities. The force handed to KnApplyForce is
+//! gravity * dt / dx, so the result adapts to the grid cell size.
+void addGravity(const FlagGrid &flags,
+                MACGrid &vel,
+                Vec3 gravity,
+                const Grid<Real> *exclude = NULL)
+{
+  const auto dt = flags.getParent()->getDt();
+  const auto dx = flags.getDx();
+  const Vec3 scaledForce = gravity * dt / dx;
+  // additive = true: the force is added to the existing velocities
+  KnApplyForce(flags, vel, scaledForce, exclude, true);
+}
+//! Generated Python entry point for the addGravity plugin.
+//! Fetches flags, vel, gravity and the optional exclude grid from the Python call,
+//! runs addGravity and returns getPyNone(). A std::exception is reported through
+//! pbSetError and the wrapper then returns 0.
+static PyObject *_W_0(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    // optional "notiming" argument (default 0); its negation is handed to the
+    // prepare / finalize hooks below
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(parent, "addGravity", !noTiming);
+    PyObject *_retval = 0;
+    {
+      // NOTE(review): _lock is passed to every argument fetch; presumably it keeps the
+      // fetched objects locked until this scope ends -- confirm against ArgLocker
+      ArgLocker _lock;
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 0, &_lock);
+      MACGrid &vel = *_args.getPtr<MACGrid>("vel", 1, &_lock);
+      Vec3 gravity = _args.get<Vec3>("gravity", 2, &_lock);
+      // exclude is optional and defaults to NULL
+      const Grid<Real> *exclude = _args.getPtrOpt<Grid<Real>>("exclude", 3, NULL, &_lock);
+      _retval = getPyNone();
+      addGravity(flags, vel, gravity, exclude);
+      // NOTE(review): presumably validates the passed argument list -- confirm in PbArgs
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "addGravity", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    // report the error text and signal failure with a null return value
+    pbSetError("addGravity", e.what());
+    return 0;
+  }
+}
+// static registration object associating the name "addGravity" with _W_0
+static const Pb::Register _RP_addGravity("", "addGravity", _W_0);
+extern "C" {
+// C-linkage function whose only statement references the registration object
+// (presumably so that the static object is not discarded -- confirm)
+void PbRegister_addGravity()
+{
+  KEEP_UNUSED(_RP_addGravity);
+}
+}
+
+//! Add gravity to the fluid velocities without accounting for the cell size:
+//! the force handed to KnApplyForce is gravity * dt (no division by dx).
+void addGravityNoScale(const FlagGrid &flags,
+                       MACGrid &vel,
+                       const Vec3 &gravity,
+                       const Grid<Real> *exclude = NULL)
+{
+  const auto dt = flags.getParent()->getDt();
+  const Vec3 unscaledForce = gravity * dt;
+  // additive = true: the force is added to the existing velocities
+  KnApplyForce(flags, vel, unscaledForce, exclude, true);
+}
+//! Generated Python entry point for the addGravityNoScale plugin.
+//! Fetches flags, vel, gravity and the optional exclude grid from the Python call,
+//! runs addGravityNoScale and returns getPyNone(). A std::exception is reported
+//! through pbSetError and the wrapper then returns 0.
+static PyObject *_W_1(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    // optional "notiming" argument (default 0); its negation is handed to the
+    // prepare / finalize hooks below
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(parent, "addGravityNoScale", !noTiming);
+    PyObject *_retval = 0;
+    {
+      // NOTE(review): _lock is passed to every argument fetch; presumably it keeps the
+      // fetched objects locked until this scope ends -- confirm against ArgLocker
+      ArgLocker _lock;
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 0, &_lock);
+      MACGrid &vel = *_args.getPtr<MACGrid>("vel", 1, &_lock);
+      // NOTE(review): gravity is a const reference bound to the result of get<Vec3>();
+      // this relies on get returning by value (lifetime extension) -- confirm
+      const Vec3 &gravity = _args.get<Vec3>("gravity", 2, &_lock);
+      // exclude is optional and defaults to NULL
+      const Grid<Real> *exclude = _args.getPtrOpt<Grid<Real>>("exclude", 3, NULL, &_lock);
+      _retval = getPyNone();
+      addGravityNoScale(flags, vel, gravity, exclude);
+      // NOTE(review): presumably validates the passed argument list -- confirm in PbArgs
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "addGravityNoScale", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    // report the error text and signal failure with a null return value
+    pbSetError("addGravityNoScale", e.what());
+    return 0;
+  }
+}
+// static registration object associating the name "addGravityNoScale" with _W_1
+static const Pb::Register _RP_addGravityNoScale("", "addGravityNoScale", _W_1);
+extern "C" {
+// C-linkage function whose only statement references the registration object
+// (presumably so that the static object is not discarded -- confirm)
+void PbRegister_addGravityNoScale()
+{
+  KEEP_UNUSED(_RP_addGravityNoScale);
+}
+}
+
+//! Kernel adding a buoyancy force to the MAC velocities of fluid cells. For every
+//! axis whose lower neighbor is also fluid, the velocity component is increased by
+//! 0.5 * strength * (factor of this cell + factor of the lower neighbor).
+//! The kernel executes from its constructor.
+struct KnAddBuoyancy : public KernelBase {
+  KnAddBuoyancy(const FlagGrid &flags, const Grid<Real> &factor, MACGrid &vel, Vec3 strength)
+      : KernelBase(&flags, 1), flags(flags), factor(factor), vel(vel), strength(strength)
+  {
+    runMessage();
+    run();
+  }
+  //! per-cell operation, called for every (i, j, k) visited by operator()
+  inline void op(int i,
+                 int j,
+                 int k,
+                 const FlagGrid &flags,
+                 const Grid<Real> &factor,
+                 MACGrid &vel,
+                 Vec3 strength) const
+  {
+    if (flags.isFluid(i, j, k)) {
+      const Real here = factor(i, j, k);
+      Vec3 &cellVel = vel(i, j, k);
+      // only faces shared with a fluid lower neighbor receive the force
+      if (flags.isFluid(i - 1, j, k)) {
+        cellVel.x += (0.5 * strength.x) * (here + factor(i - 1, j, k));
+      }
+      if (flags.isFluid(i, j - 1, k)) {
+        cellVel.y += (0.5 * strength.y) * (here + factor(i, j - 1, k));
+      }
+      if (vel.is3D() && flags.isFluid(i, j, k - 1)) {
+        cellVel.z += (0.5 * strength.z) * (here + factor(i, j, k - 1));
+      }
+    }
+  }
+  // accessors and type aliases for the kernel arguments, numbered in constructor order
+  inline const FlagGrid &getArg0()
+  {
+    return flags;
+  }
+  typedef FlagGrid type0;
+  inline const Grid<Real> &getArg1()
+  {
+    return factor;
+  }
+  typedef Grid<Real> type1;
+  inline MACGrid &getArg2()
+  {
+    return vel;
+  }
+  typedef MACGrid type2;
+  inline Vec3 &getArg3()
+  {
+    return strength;
+  }
+  typedef Vec3 type3;
+  //! emit debug messages with the kernel name (level 3) and its index range (level 4)
+  void runMessage()
+  {
+    debMsg("Executing kernel KnAddBuoyancy ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  //! body executed by tbb::parallel_for for one sub-range
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const int endX = maxX;
+    const int endY = maxY;
+    const int first = __r.begin();
+    const int last = (int)__r.end();
+    if (maxZ > 1) {
+      // 3D: the range enumerates k
+      for (int k = first; k != last; k++) {
+        for (int j = 1; j < endY; j++) {
+          for (int i = 1; i < endX; i++) {
+            op(i, j, k, flags, factor, vel, strength);
+          }
+        }
+      }
+      return;
+    }
+    // 2D: the range enumerates j and k is fixed to 0
+    for (int j = first; j != last; j++) {
+      for (int i = 1; i < endX; i++) {
+        op(i, j, 0, flags, factor, vel, strength);
+      }
+    }
+  }
+  //! run the kernel in parallel: over [minZ, maxZ) when maxZ > 1, otherwise over [1, maxY)
+  void run()
+  {
+    if (maxZ > 1) {
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+    }
+    else {
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
+    }
+  }
+  const FlagGrid &flags;
+  const Grid<Real> &factor;
+  MACGrid &vel;
+  Vec3 strength;
+};
+
+//! Add a buoyancy force based on a factor grid (e.g. smoke density). The strength
+//! handed to KnAddBuoyancy is -gravity * dt / dx * coefficient.
+void addBuoyancy(const FlagGrid &flags,
+                 const Grid<Real> &density,
+                 MACGrid &vel,
+                 Vec3 gravity,
+                 Real coefficient = 1.)
+{
+  const auto dt = flags.getParent()->getDt();
+  const auto dx = flags.getParent()->getDx();
+  // the force points against gravity
+  const Vec3 strength = -gravity * dt / dx * coefficient;
+  KnAddBuoyancy(flags, density, vel, strength);
+}
+//! Generated Python entry point for the addBuoyancy plugin.
+//! Fetches flags, density, vel, gravity and the optional coefficient from the Python
+//! call, runs addBuoyancy and returns getPyNone(). A std::exception is reported
+//! through pbSetError and the wrapper then returns 0.
+static PyObject *_W_2(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    // optional "notiming" argument (default 0); its negation is handed to the
+    // prepare / finalize hooks below
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(parent, "addBuoyancy", !noTiming);
+    PyObject *_retval = 0;
+    {
+      // NOTE(review): _lock is passed to every argument fetch; presumably it keeps the
+      // fetched objects locked until this scope ends -- confirm against ArgLocker
+      ArgLocker _lock;
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 0, &_lock);
+      const Grid<Real> &density = *_args.getPtr<Grid<Real>>("density", 1, &_lock);
+      MACGrid &vel = *_args.getPtr<MACGrid>("vel", 2, &_lock);
+      Vec3 gravity = _args.get<Vec3>("gravity", 3, &_lock);
+      // coefficient is optional and defaults to 1
+      Real coefficient = _args.getOpt<Real>("coefficient", 4, 1., &_lock);
+      _retval = getPyNone();
+      addBuoyancy(flags, density, vel, gravity, coefficient);
+      // NOTE(review): presumably validates the passed argument list -- confirm in PbArgs
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "addBuoyancy", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    // report the error text and signal failure with a null return value
+    pbSetError("addBuoyancy", e.what());
+    return 0;
+  }
+}
+// static registration object associating the name "addBuoyancy" with _W_2
+static const Pb::Register _RP_addBuoyancy("", "addBuoyancy", _W_2);
+extern "C" {
+// C-linkage function whose only statement references the registration object
+// (presumably so that the static object is not discarded -- confirm)
+void PbRegister_addBuoyancy()
+{
+  KEEP_UNUSED(_RP_addBuoyancy);
+}
+}
+
+// inflow / outflow boundaries
+
+//! Helper to parse an open-bounds description string made of the characters [xyzXYZ].
+//! A lowercase letter sets the matching component of `lo`, an uppercase letter the
+//! matching component of `up`; components are only ever set to true, never cleared.
+//! Any other character is reported through errMsg.
+inline void convertDescToVec(const string &desc, Vector3D<bool> &lo, Vector3D<bool> &up)
+{
+  for (const char side : desc) {
+    switch (side) {
+      case 'x':
+        lo.x = true;
+        break;
+      case 'y':
+        lo.y = true;
+        break;
+      case 'z':
+        lo.z = true;
+        break;
+      case 'X':
+        up.x = true;
+        break;
+      case 'Y':
+        up.y = true;
+        break;
+      case 'Z':
+        up.z = true;
+        break;
+      default:
+        errMsg("invalid character in boundary description string. Only [xyzXYZ] allowed.");
+        break;
+    }
+  }
+}
+
+//! Add the empty and outflow flag (or the given `type`) to obstacle cells of open
+//! boundaries.
+//!  - bWidth:    width of the boundary band; cells with index <= bWidth or
+//!               >= size - bWidth - 1 along an axis belong to that axis' lower / upper side
+//!  - openBound: description string of open sides, characters [xyzXYZ] (lowercase =
+//!               lower side, uppercase = upper side); an empty string is a no-op
+//!  - type:      flag value written into the affected cells
+//! On 2D grids the z / Z entries of openBound are ignored: there is no z wall, and
+//! treating the single z slice as an open side would turn interior obstacle cells
+//! into open cells.
+void setOpenBound(FlagGrid &flags,
+                  int bWidth,
+                  string openBound = "",
+                  int type = FlagGrid::TypeOutflow | FlagGrid::TypeEmpty)
+{
+  if (openBound == "")
+    return;
+  Vector3D<bool> lo, up;
+  convertDescToVec(openBound, lo, up);
+
+  const bool is3D = flags.is3D();
+
+  FOR_IJK(flags)
+  {
+    // cells which belong to the lower / upper open bound of x and y
+    const bool loX = lo.x && i <= bWidth;
+    const bool loY = lo.y && j <= bWidth;
+    const bool upX = up.x && i >= flags.getSizeX() - bWidth - 1;
+    const bool upY = up.y && j >= flags.getSizeY() - bWidth - 1;
+    // cells which do not belong to the lower or upper bound band of x / y at all
+    const bool innerI = i > bWidth && i < flags.getSizeX() - bWidth - 1;
+    const bool innerJ = j > bWidth && j < flags.getSizeY() - bWidth - 1;
+
+    // z sides are only evaluated for 3D grids; in 2D they impose no restriction
+    bool loZ = false;
+    bool upZ = false;
+    bool zAllowed = true;
+    if (is3D) {
+      loZ = lo.z && k <= bWidth;
+      upZ = up.z && k >= flags.getSizeZ() - bWidth - 1;
+      const bool innerK = k > bWidth && k < flags.getSizeZ() - bWidth - 1;
+      zAllowed = loZ || upZ || innerK;
+    }
+
+    // the cell has to lie in at least one open bound
+    if (loX || upX || loY || upY || loZ || upZ) {
+      // when setting boundaries to open: don't set shared part of wall to empty if
+      // neighboring wall is not open
+      if ((loX || upX || innerI) && (loY || upY || innerJ) && zAllowed &&
+          flags.isObstacle(i, j, k))
+        flags(i, j, k) = type;
+    }
+  }
+}
+//! Generated Python entry point for the setOpenBound plugin.
+//! Fetches flags, bWidth and the optional openBound / type arguments from the Python
+//! call, runs setOpenBound and returns getPyNone(). A std::exception is reported
+//! through pbSetError and the wrapper then returns 0.
+static PyObject *_W_3(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    // optional "notiming" argument (default 0); its negation is handed to the
+    // prepare / finalize hooks below
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(parent, "setOpenBound", !noTiming);
+    PyObject *_retval = 0;
+    {
+      // NOTE(review): _lock is passed to every argument fetch; presumably it keeps the
+      // fetched objects locked until this scope ends -- confirm against ArgLocker
+      ArgLocker _lock;
+      FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 0, &_lock);
+      int bWidth = _args.get<int>("bWidth", 1, &_lock);
+      // optional arguments: openBound defaults to "", type to TypeOutflow | TypeEmpty
+      string openBound = _args.getOpt<string>("openBound", 2, "", &_lock);
+      int type = _args.getOpt<int>("type", 3, FlagGrid::TypeOutflow | FlagGrid::TypeEmpty, &_lock);
+      _retval = getPyNone();
+      setOpenBound(flags, bWidth, openBound, type);
+      // NOTE(review): presumably validates the passed argument list -- confirm in PbArgs
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "setOpenBound", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    // report the error text and signal failure with a null return value
+    pbSetError("setOpenBound", e.what());
+    return 0;
+  }
+}
+// static registration object associating the name "setOpenBound" with _W_3
+static const Pb::Register _RP_setOpenBound("", "setOpenBound", _W_3);
+extern "C" {
+// C-linkage function whose only statement references the registration object
+// (presumably so that the static object is not discarded -- confirm)
+void PbRegister_setOpenBound()
+{
+  KEEP_UNUSED(_RP_setOpenBound);
+}
+}
+
+//! Delete fluid and ensure the empty flag in outflow cells, delete the particles
+//! inside them and reset the optional grids there.
+//!  - phi:             if given, set to 0.5 in every outflow cell
+//!  - parts:           if given, active particles inside outflow cells are killed and the
+//!                     particle system is compressed at the end
+//!  - real:            if given, set to 0 in every outflow cell
+//!  - index, indexSys: if both are given together with parts, particles are looked up
+//!                     through the per-cell index instead of scanning all particles
+void resetOutflow(FlagGrid &flags,
+                  Grid<Real> *phi = 0,
+                  BasicParticleSystem *parts = 0,
+                  Grid<Real> *real = 0,
+                  Grid<int> *index = 0,
+                  ParticleIndexSystem *indexSys = 0)
+{
+  // check if phi and parts -> pindex and gpi already created -> access particles from cell index,
+  // avoid extra looping over particles
+  if (parts && (!index || !indexSys)) {
+    // no complete index available: fall back to one pass over all particles
+    if (phi)
+      debMsg(
+          "resetOutflow for phi and particles, but missing index and indexSys for enhanced "
+          "particle access!",
+          1);
+    for (int idx = 0; idx < (int)parts->size(); idx++)
+      if (parts->isActive(idx) && flags.isInBounds(parts->getPos(idx)) &&
+          flags.isOutflow(parts->getPos(idx)))
+        parts->kill(idx);
+  }
+  FOR_IJK(flags)
+  {
+    if (flags.isOutflow(i, j, k)) {
+      // make sure there is no fluid flag set and reset the empty flag
+      flags(i, j, k) = (flags(i, j, k) | FlagGrid::TypeEmpty) & ~FlagGrid::TypeFluid;
+      // the particles in a cell i,j,k are particles[index(i,j,k)] to particles[index(i+1,j,k)-1]
+      if (parts && index && indexSys) {
+        int isysIdxS = index->index(i, j, k);
+        int pStart = (*index)(isysIdxS), pEnd = 0;
+        // the last cell has no successor entry: its particle range ends at the index size
+        if (flags.isInBounds(isysIdxS + 1))
+          pEnd = (*index)(isysIdxS + 1);
+        else
+          pEnd = indexSys->size();
+        // now loop over particles in cell
+        for (int p = pStart; p < pEnd; ++p) {
+          int psrc = (*indexSys)[p].sourceIndex;
+          if (parts->isActive(psrc) && flags.isInBounds(parts->getPos(psrc)))
+            parts->kill(psrc);
+        }
+      }
+      if (phi)
+        (*phi)(i, j, k) = 0.5;
+      if (real)
+        (*real)(i, j, k) = 0;
+    }
+  }
+  if (parts)
+    parts->doCompress();
+}
+//! Generated Python entry point for the resetOutflow plugin.
+//! Fetches flags and the optional phi, parts, real, index and indexSys arguments
+//! from the Python call, runs resetOutflow and returns getPyNone(). A std::exception
+//! is reported through pbSetError and the wrapper then returns 0.
+static PyObject *_W_4(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    // optional "notiming" argument (default 0); its negation is handed to the
+    // prepare / finalize hooks below
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(parent, "resetOutflow", !noTiming);
+    PyObject *_retval = 0;
+    {
+      // NOTE(review): _lock is passed to every argument fetch; presumably it keeps the
+      // fetched objects locked until this scope ends -- confirm against ArgLocker
+      ArgLocker _lock;
+      FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 0, &_lock);
+      // every argument below is optional and defaults to a null pointer
+      Grid<Real> *phi = _args.getPtrOpt<Grid<Real>>("phi", 1, 0, &_lock);
+      BasicParticleSystem *parts = _args.getPtrOpt<BasicParticleSystem>("parts", 2, 0, &_lock);
+      Grid<Real> *real = _args.getPtrOpt<Grid<Real>>("real", 3, 0, &_lock);
+      Grid<int> *index = _args.getPtrOpt<Grid<int>>("index", 4, 0, &_lock);
+      ParticleIndexSystem *indexSys = _args.getPtrOpt<ParticleIndexSystem>(
+          "indexSys", 5, 0, &_lock);
+      _retval = getPyNone();
+      resetOutflow(flags, phi, parts, real, index, indexSys);
+      // NOTE(review): presumably validates the passed argument list -- confirm in PbArgs
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "resetOutflow", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    // report the error text and signal failure with a null return value
+    pbSetError("resetOutflow", e.what());
+    return 0;
+  }
+}
+// static registration object associating the name "resetOutflow" with _W_4
+static const Pb::Register _RP_resetOutflow("", "resetOutflow", _W_4);
+extern "C" {
+// C-linkage function whose only statement references the registration object
+// (presumably so that the static object is not discarded -- confirm)
+void PbRegister_resetOutflow()
+{
+  KEEP_UNUSED(_RP_resetOutflow);
+}
+}
+
+//! Kernel enforcing a constant inflow/outflow velocity: every cell whose coordinate
+//! along axis `dim` equals p0 or p0 + 1 gets its velocity set to `val`.
+//! The kernel executes from its constructor.
+struct KnSetInflow : public KernelBase {
+  KnSetInflow(MACGrid &vel, int dim, int p0, const Vec3 &val)
+      : KernelBase(&vel, 0), vel(vel), dim(dim), p0(p0), val(val)
+  {
+    runMessage();
+    run();
+  }
+  //! per-cell operation, called for every (i, j, k) visited by operator()
+  inline void op(int i, int j, int k, MACGrid &vel, int dim, int p0, const Vec3 &val) const
+  {
+    Vec3i cell(i, j, k);
+    const int coord = cell[dim];
+    const bool onPlane = (coord == p0) || (coord == p0 + 1);
+    if (onPlane) {
+      vel(i, j, k) = val;
+    }
+  }
+  // accessors and type aliases for the kernel arguments, numbered in constructor order
+  inline MACGrid &getArg0()
+  {
+    return vel;
+  }
+  typedef MACGrid type0;
+  inline int &getArg1()
+  {
+    return dim;
+  }
+  typedef int type1;
+  inline int &getArg2()
+  {
+    return p0;
+  }
+  typedef int type2;
+  inline const Vec3 &getArg3()
+  {
+    return val;
+  }
+  typedef Vec3 type3;
+  //! emit debug messages with the kernel name (level 3) and its index range (level 4)
+  void runMessage()
+  {
+    debMsg("Executing kernel KnSetInflow ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  //! body executed by tbb::parallel_for for one sub-range
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const int endX = maxX;
+    const int endY = maxY;
+    const int first = __r.begin();
+    const int last = (int)__r.end();
+    if (maxZ > 1) {
+      // 3D: the range enumerates k
+      for (int k = first; k != last; k++) {
+        for (int j = 0; j < endY; j++) {
+          for (int i = 0; i < endX; i++) {
+            op(i, j, k, vel, dim, p0, val);
+          }
+        }
+      }
+      return;
+    }
+    // 2D: the range enumerates j and k is fixed to 0
+    for (int j = first; j != last; j++) {
+      for (int i = 0; i < endX; i++) {
+        op(i, j, 0, vel, dim, p0, val);
+      }
+    }
+  }
+  //! run the kernel in parallel: over [minZ, maxZ) when maxZ > 1, otherwise over [0, maxY)
+  void run()
+  {
+    if (maxZ > 1) {
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+    }
+    else {
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
+    }
+  }
+  MACGrid &vel;
+  int dim;
+  int p0;
+  // stored by reference: the referenced vector has to outlive the kernel object
+  const Vec3 &val;
+};
+
+//! Enforce a constant inflow/outflow velocity at the grid boundaries.
+//! Every character of `dir` selects one side: lowercase [xyz] the side starting at
+//! index 0, uppercase [XYZ] the side at index size - 1 of that axis. Any other
+//! character is reported through errMsg.
+void setInflowBcs(MACGrid &vel, string dir, Vec3 value)
+{
+  for (const char side : dir) {
+    const bool lowerSide = (side >= 'x' && side <= 'z');
+    const bool upperSide = (side >= 'X' && side <= 'Z');
+    if (lowerSide) {
+      const int axis = side - 'x';
+      KnSetInflow(vel, axis, 0, value);
+    }
+    else if (upperSide) {
+      const int axis = side - 'X';
+      const int lastCell = vel.getSize()[axis] - 1;
+      KnSetInflow(vel, axis, lastCell, value);
+    }
+    else {
+      errMsg("invalid character in direction string. Only [xyzXYZ] allowed.");
+    }
+  }
+}
+//! Generated Python entry point for the setInflowBcs plugin.
+//! Fetches vel, dir and value from the Python call, runs setInflowBcs and returns
+//! getPyNone(). A std::exception is reported through pbSetError and the wrapper
+//! then returns 0.
+static PyObject *_W_5(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    // optional "notiming" argument (default 0); its negation is handed to the
+    // prepare / finalize hooks below
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(parent, "setInflowBcs", !noTiming);
+    PyObject *_retval = 0;
+    {
+      // NOTE(review): _lock is passed to every argument fetch; presumably it keeps the
+      // fetched objects locked until this scope ends -- confirm against ArgLocker
+      ArgLocker _lock;
+      MACGrid &vel = *_args.getPtr<MACGrid>("vel", 0, &_lock);
+      string dir = _args.get<string>("dir", 1, &_lock);
+      Vec3 value = _args.get<Vec3>("value", 2, &_lock);
+      _retval = getPyNone();
+      setInflowBcs(vel, dir, value);
+      // NOTE(review): presumably validates the passed argument list -- confirm in PbArgs
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "setInflowBcs", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    // report the error text and signal failure with a null return value
+    pbSetError("setInflowBcs", e.what());
+    return 0;
+  }
+}
+// static registration object associating the name "setInflowBcs" with _W_5
+static const Pb::Register _RP_setInflowBcs("", "setInflowBcs", _W_5);
+extern "C" {
+// C-linkage function whose only statement references the registration object
+// (presumably so that the static object is not discarded -- confirm)
+void PbRegister_setInflowBcs()
+{
+  KEEP_UNUSED(_RP_setInflowBcs);
+}
+}
+
+// set obstacle boundary conditions
+
+//! Kernel: set no-stick wall boundary condition between ob/fl and ob/ob cells.
+//! For fluid and obstacle cells, the velocity component of a face is replaced by the
+//! boundary velocity when the face touches an obstacle. The boundary velocity is taken
+//! from obvel if given, otherwise it is zero. Fluid cells next to a "stick" cell get
+//! the components tangential to that neighbor zeroed. The kernel executes from its
+//! constructor.
+struct KnSetWallBcs : public KernelBase {
+  KnSetWallBcs(const FlagGrid &flags, MACGrid &vel, const MACGrid *obvel)
+      : KernelBase(&flags, 0), flags(flags), vel(vel), obvel(obvel)
+  {
+    // constructing the object runs the kernel immediately
+    runMessage();
+    run();
+  }
+  //! per-cell operation, called for every (i, j, k) visited by operator()
+  inline void op(
+      int i, int j, int k, const FlagGrid &flags, MACGrid &vel, const MACGrid *obvel) const
+  {
+
+    bool curFluid = flags.isFluid(i, j, k);
+    bool curObs = flags.isObstacle(i, j, k);
+    Vec3 bcsVel(0., 0., 0.);
+    // cells that are neither fluid nor obstacle are left untouched
+    if (!curFluid && !curObs)
+      return;
+
+    // optional obstacle velocity; the z component is only read for a 3D obvel grid
+    if (obvel) {
+      bcsVel.x = (*obvel)(i, j, k).x;
+      bcsVel.y = (*obvel)(i, j, k).y;
+      if ((*obvel).is3D())
+        bcsVel.z = (*obvel)(i, j, k).z;
+    }
+
+    // we use i>0 instead of bnd=1 to check outer wall
+    // (the index guards keep the lower-neighbor lookups inside the grid)
+    if (i > 0 && flags.isObstacle(i - 1, j, k))
+      vel(i, j, k).x = bcsVel.x;
+    if (i > 0 && curObs && flags.isFluid(i - 1, j, k))
+      vel(i, j, k).x = bcsVel.x;
+    if (j > 0 && flags.isObstacle(i, j - 1, k))
+      vel(i, j, k).y = bcsVel.y;
+    if (j > 0 && curObs && flags.isFluid(i, j - 1, k))
+      vel(i, j, k).y = bcsVel.y;
+
+    // 2D grids: the z component is always cleared; 3D grids: same rule as for x and y
+    if (!vel.is3D()) {
+      vel(i, j, k).z = 0;
+    }
+    else {
+      if (k > 0 && flags.isObstacle(i, j, k - 1))
+        vel(i, j, k).z = bcsVel.z;
+      if (k > 0 && curObs && flags.isFluid(i, j, k - 1))
+        vel(i, j, k).z = bcsVel.z;
+    }
+
+    // stick neighbors: zero the two components that are not along the neighbor's axis;
+    // this runs after the assignments above and can overwrite them
+    if (curFluid) {
+      if ((i > 0 && flags.isStick(i - 1, j, k)) ||
+          (i < flags.getSizeX() - 1 && flags.isStick(i + 1, j, k)))
+        vel(i, j, k).y = vel(i, j, k).z = 0;
+      if ((j > 0 && flags.isStick(i, j - 1, k)) ||
+          (j < flags.getSizeY() - 1 && flags.isStick(i, j + 1, k)))
+        vel(i, j, k).x = vel(i, j, k).z = 0;
+      if (vel.is3D() && ((k > 0 && flags.isStick(i, j, k - 1)) ||
+                         (k < flags.getSizeZ() - 1 && flags.isStick(i, j, k + 1))))
+        vel(i, j, k).x = vel(i, j, k).y = 0;
+    }
+  }
+  // accessors and type aliases for the kernel arguments, numbered in constructor order
+  inline const FlagGrid &getArg0()
+  {
+    return flags;
+  }
+  typedef FlagGrid type0;
+  inline MACGrid &getArg1()
+  {
+    return vel;
+  }
+  typedef MACGrid type1;
+  inline const MACGrid *getArg2()
+  {
+    return obvel;
+  }
+  typedef MACGrid type2;
+  //! emit debug messages with the kernel name (level 3) and its index range (level 4)
+  void runMessage()
+  {
+    debMsg("Executing kernel KnSetWallBcs ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  //! body executed by tbb::parallel_for for one sub-range
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {
+      // 3D: the range enumerates k; i and j start at 0
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, flags, vel, obvel);
+    }
+    else {
+      // 2D: the range enumerates j and k is fixed to 0
+      const int k = 0;
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = 0; i < _maxX; i++)
+          op(i, j, k, flags, vel, obvel);
+    }
+  }
+  //! run the kernel in parallel: over [minZ, maxZ) when maxZ > 1, otherwise over [0, maxY)
+  void run()
+  {
+    if (maxZ > 1)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
+  }
+  const FlagGrid &flags;
+  MACGrid &vel;
+  const MACGrid *obvel;
+};
+
+//! Kernel: set wall BCs for fill fraction mode, note - only needs obstacle SDF.
+//! Copies vel into velTarget for every cell. For fluid / obstacle cells that are at
+//! least one cell away from the grid border, each face touching an obstacle gets the
+//! part of its velocity along the normalized phiObs difference vector removed.
+//! The kernel executes from its constructor.
+//! NOTE(review): op() never reads obvel or boundaryWidth, and it dereferences phiObs
+//! without a null check -- callers have to pass a valid phiObs grid.
+
+struct KnSetWallBcsFrac : public KernelBase {
+  KnSetWallBcsFrac(const FlagGrid &flags,
+                   const MACGrid &vel,
+                   MACGrid &velTarget,
+                   const MACGrid *obvel,
+                   const Grid<Real> *phiObs,
+                   const int &boundaryWidth = 0)
+      : KernelBase(&flags, 0),
+        flags(flags),
+        vel(vel),
+        velTarget(velTarget),
+        obvel(obvel),
+        phiObs(phiObs),
+        boundaryWidth(boundaryWidth)
+  {
+    // constructing the object runs the kernel immediately
+    runMessage();
+    run();
+  }
+  //! per-cell operation, called for every (i, j, k) visited by operator()
+  inline void op(int i,
+                 int j,
+                 int k,
+                 const FlagGrid &flags,
+                 const MACGrid &vel,
+                 MACGrid &velTarget,
+                 const MACGrid *obvel,
+                 const Grid<Real> *phiObs,
+                 const int &boundaryWidth = 0) const
+  {
+    bool curFluid = flags.isFluid(i, j, k);
+    bool curObs = flags.isObstacle(i, j, k);
+    // default: the target keeps the source velocity (done for all cell types)
+    velTarget(i, j, k) = vel(i, j, k);
+    if (!curFluid && !curObs)
+      return;
+
+    // zero normal component in all obstacle regions
+    // (the bounds check with border 1 guards the +-1 neighbor lookups below)
+    if (flags.isInBounds(Vec3i(i, j, k), 1)) {
+
+      // x face: this cell or its lower x neighbor is an obstacle
+      // ('|' on bools: both operands are always evaluated)
+      if (curObs | flags.isObstacle(i - 1, j, k)) {
+        Vec3 dphi(0., 0., 0.);
+        // tmp1: phiObs averaged over the two cells sharing the x face
+        const Real tmp1 = (phiObs->get(i, j, k) + phiObs->get(i - 1, j, k)) * .5;
+        Real tmp2 = (phiObs->get(i, j + 1, k) + phiObs->get(i - 1, j + 1, k)) * .5;
+        Real phi1 = (tmp1 + tmp2) * .5;
+        tmp2 = (phiObs->get(i, j - 1, k) + phiObs->get(i - 1, j - 1, k)) * .5;
+        Real phi2 = (tmp1 + tmp2) * .5;
+
+        // x: direct difference across the face; y (and z): difference of averaged values
+        dphi.x = phiObs->get(i, j, k) - phiObs->get(i - 1, j, k);
+        dphi.y = phi1 - phi2;
+
+        if (phiObs->is3D()) {
+          tmp2 = (phiObs->get(i, j, k + 1) + phiObs->get(i - 1, j, k + 1)) * .5;
+          phi1 = (tmp1 + tmp2) * .5;
+          tmp2 = (phiObs->get(i, j, k - 1) + phiObs->get(i - 1, j, k - 1)) * .5;
+          phi2 = (tmp1 + tmp2) * .5;
+          dphi.z = phi1 - phi2;
+        }
+
+        normalize(dphi);
+        // NOTE(review): getAtMACX presumably returns the full velocity vector sampled at
+        // the x face -- confirm in MACGrid
+        Vec3 velMAC = vel.getAtMACX(i, j, k);
+        // subtract the part of velMAC along dphi; only the x component is stored
+        velTarget(i, j, k).x = velMAC.x - dot(dphi, velMAC) * dphi.x;
+      }
+
+      // y face: same scheme with the roles of x and y exchanged
+      if (curObs | flags.isObstacle(i, j - 1, k)) {
+        Vec3 dphi(0., 0., 0.);
+        const Real tmp1 = (phiObs->get(i, j, k) + phiObs->get(i, j - 1, k)) * .5;
+        Real tmp2 = (phiObs->get(i + 1, j, k) + phiObs->get(i + 1, j - 1, k)) * .5;
+        Real phi1 = (tmp1 + tmp2) * .5;
+        tmp2 = (phiObs->get(i - 1, j, k) + phiObs->get(i - 1, j - 1, k)) * .5;
+        Real phi2 = (tmp1 + tmp2) * .5;
+
+        dphi.x = phi1 - phi2;
+        dphi.y = phiObs->get(i, j, k) - phiObs->get(i, j - 1, k);
+        if (phiObs->is3D()) {
+          tmp2 = (phiObs->get(i, j, k + 1) + phiObs->get(i, j - 1, k + 1)) * .5;
+          phi1 = (tmp1 + tmp2) * .5;
+          tmp2 = (phiObs->get(i, j, k - 1) + phiObs->get(i, j - 1, k - 1)) * .5;
+          phi2 = (tmp1 + tmp2) * .5;
+          dphi.z = phi1 - phi2;
+        }
+
+        normalize(dphi);
+        Vec3 velMAC = vel.getAtMACY(i, j, k);
+        velTarget(i, j, k).y = velMAC.y - dot(dphi, velMAC) * dphi.y;
+      }
+
+      // z face: only handled when phiObs is a 3D grid
+      if (phiObs->is3D() && (curObs | flags.isObstacle(i, j, k - 1))) {
+        Vec3 dphi(0., 0., 0.);
+        const Real tmp1 = (phiObs->get(i, j, k) + phiObs->get(i, j, k - 1)) * .5;
+
+        Real tmp2;
+        tmp2 = (phiObs->get(i + 1, j, k) + phiObs->get(i + 1, j, k - 1)) * .5;
+        Real phi1 = (tmp1 + tmp2) * .5;
+        tmp2 = (phiObs->get(i - 1, j, k) + phiObs->get(i - 1, j, k - 1)) * .5;
+        Real phi2 = (tmp1 + tmp2) * .5;
+        dphi.x = phi1 - phi2;
+
+        tmp2 = (phiObs->get(i, j + 1, k) + phiObs->get(i, j + 1, k - 1)) * .5;
+        phi1 = (tmp1 + tmp2) * .5;
+        tmp2 = (phiObs->get(i, j - 1, k) + phiObs->get(i, j - 1, k - 1)) * .5;
+        phi2 = (tmp1 + tmp2) * .5;
+        dphi.y = phi1 - phi2;
+
+        dphi.z = phiObs->get(i, j, k) - phiObs->get(i, j, k - 1);
+
+        normalize(dphi);
+        Vec3 velMAC = vel.getAtMACZ(i, j, k);
+        velTarget(i, j, k).z = velMAC.z - dot(dphi, velMAC) * dphi.z;
+      }
+    }  // not at boundary
+  }
+  // accessors and type aliases for the kernel arguments, numbered in constructor order
+  inline const FlagGrid &getArg0()
+  {
+    return flags;
+  }
+  typedef FlagGrid type0;
+  inline const MACGrid &getArg1()
+  {
+    return vel;
+  }
+  typedef MACGrid type1;
+  inline MACGrid &getArg2()
+  {
+    return velTarget;
+  }
+  typedef MACGrid type2;
+  inline const MACGrid *getArg3()
+  {
+    return obvel;
+  }
+  typedef MACGrid type3;
+  inline const Grid<Real> *getArg4()
+  {
+    return phiObs;
+  }
+  typedef Grid<Real> type4;
+  inline const int &getArg5()
+  {
+    return boundaryWidth;
+  }
+  typedef int type5;
+  //! emit debug messages with the kernel name (level 3) and its index range (level 4)
+  void runMessage()
+  {
+    debMsg("Executing kernel KnSetWallBcsFrac ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  //! body executed by tbb::parallel_for for one sub-range
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {
+      // 3D: the range enumerates k; i and j start at 0
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, flags, vel, velTarget, obvel, phiObs, boundaryWidth);
+    }
+    else {
+      // 2D: the range enumerates j and k is fixed to 0
+      const int k = 0;
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = 0; i < _maxX; i++)
+          op(i, j, k, flags, vel, velTarget, obvel, phiObs, boundaryWidth);
+    }
+  }
+  //! run the kernel in parallel: over [minZ, maxZ) when maxZ > 1, otherwise over [0, maxY)
+  void run()
+  {
+    if (maxZ > 1)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
+  }
+  const FlagGrid &flags;
+  const MACGrid &vel;
+  MACGrid &velTarget;
+  const MACGrid *obvel;
+  const Grid<Real> *phiObs;
+  // NOTE(review): reference member initialized from the constructor argument (which has
+  // a default value of 0); it is only safe to read while that argument is alive
+  const int &boundaryWidth;
+};
+
+//! Enforce the zero normal velocity boundary condition on walls.
+// If both the obstacle SDF (phiObs) and a fractions grid are supplied, the second order
+// variant KnSetWallBcsFrac is used; it writes into a temporary MAC grid that is swapped into
+// vel afterwards. In every other case the plain KnSetWallBcs kernel runs directly on vel.
+// The fractions grid is only tested for presence, its values are currently not needed.
+void setWallBcs(const FlagGrid &flags,
+                MACGrid &vel,
+                const MACGrid *obvel = 0,
+                const MACGrid *fractions = 0,
+                const Grid<Real> *phiObs = 0,
+                int boundaryWidth = 0)
+{
+  const bool useObstacleSdf = (phiObs && fractions);
+  if (useObstacleSdf) {
+    // The kernel reads vel while writing its target grid, hence the separate output grid.
+    MACGrid tmpvel(vel.getParent());
+    KnSetWallBcsFrac(flags, vel, tmpvel, obvel, phiObs, boundaryWidth);
+    vel.swap(tmpvel);
+    return;
+  }
+  KnSetWallBcs(flags, vel, obvel);
+}
+// Python binding wrapper for setWallBcs(): fetches the arguments from the Python call, runs the
+// plugin and returns the None object. A std::exception thrown on the way is reported through
+// pbSetError() and results in a null return value.
+static PyObject *_W_6(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    // Optional "notiming" flag (default 0); its negation is passed to the prepare/finalize hooks.
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(parent, "setWallBcs", !noTiming);
+    PyObject *_retval = 0;
+    {
+      // NOTE(review): _lock is handed to every getter below; presumably it keeps the argument
+      // objects locked until this scope ends - confirm in ArgLocker.
+      ArgLocker _lock;
+      // flags and vel are mandatory; the remaining arguments are optional and default to 0.
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 0, &_lock);
+      MACGrid &vel = *_args.getPtr<MACGrid>("vel", 1, &_lock);
+      const MACGrid *obvel = _args.getPtrOpt<MACGrid>("obvel", 2, 0, &_lock);
+      const MACGrid *fractions = _args.getPtrOpt<MACGrid>("fractions", 3, 0, &_lock);
+      const Grid<Real> *phiObs = _args.getPtrOpt<Grid<Real>>("phiObs", 4, 0, &_lock);
+      int boundaryWidth = _args.getOpt<int>("boundaryWidth", 5, 0, &_lock);
+      _retval = getPyNone();
+      setWallBcs(flags, vel, obvel, fractions, phiObs, boundaryWidth);
+      // NOTE(review): presumably verifies that every supplied argument was consumed - confirm.
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "setWallBcs", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("setWallBcs", e.what());
+    return 0;
+  }
+}
+// Registration object binding the name "setWallBcs" to the wrapper above.
+static const Pb::Register _RP_setWallBcs("", "setWallBcs", _W_6);
+extern "C" {
+// Only references the registration object; presumably this keeps it from being discarded
+// at link time - confirm against KEEP_UNUSED.
+void PbRegister_setWallBcs()
+{
+  KEEP_UNUSED(_RP_setWallBcs);
+}
+}
+
+//! Kernel: add a cell centered force to the MAC velocity on faces between fluid/fluid and
+//! fluid/empty cells. The force is averaged onto the face and only added as far as the result
+//! stays between the previous velocity component and the force value.
+struct KnAddForceIfLower : public KernelBase {
+  KnAddForceIfLower(const FlagGrid &flags, MACGrid &vel, const Grid<Vec3> &force)
+      : KernelBase(&flags, 1), flags(flags), vel(vel), force(force)
+  {
+    runMessage();
+    run();
+  }
+  inline void op(
+      int i, int j, int k, const FlagGrid &flags, MACGrid &vel, const Grid<Vec3> &force) const
+  {
+    const bool curFluid = flags.isFluid(i, j, k);
+    const bool curEmpty = flags.isEmpty(i, j, k);
+    // Cells that are neither fluid nor empty do not take part.
+    if (!(curFluid || curEmpty))
+      return;
+
+    // Adds faceForce to velComp, limited to the interval spanned by the old velocity component
+    // and the force value.
+    const auto addLimited = [](Real &velComp, const Real faceForce) {
+      const Real lower = std::min(velComp, faceForce);
+      const Real upper = std::max(velComp, faceForce);
+      const Real total = velComp + faceForce;
+      velComp = (faceForce > 0) ? std::min(total, upper) : std::max(total, lower);
+    };
+
+    // A face is handled if the lower neighbor is fluid, or if this cell is fluid and the lower
+    // neighbor is empty.
+    if (flags.isFluid(i - 1, j, k) || (curFluid && flags.isEmpty(i - 1, j, k)))
+      addLimited(vel(i, j, k).x, 0.5 * (force(i - 1, j, k).x + force(i, j, k).x));
+
+    if (flags.isFluid(i, j - 1, k) || (curFluid && flags.isEmpty(i, j - 1, k)))
+      addLimited(vel(i, j, k).y, 0.5 * (force(i, j - 1, k).y + force(i, j, k).y));
+
+    // The z face is only touched for 3D grids.
+    if (vel.is3D() && (flags.isFluid(i, j, k - 1) || (curFluid && flags.isEmpty(i, j, k - 1))))
+      addLimited(vel(i, j, k).z, 0.5 * (force(i, j, k - 1).z + force(i, j, k).z));
+  }
+  inline const FlagGrid &getArg0()
+  {
+    return flags;
+  }
+  typedef FlagGrid type0;
+  inline MACGrid &getArg1()
+  {
+    return vel;
+  }
+  typedef MACGrid type1;
+  inline const Grid<Vec3> &getArg2()
+  {
+    return force;
+  }
+  typedef Grid<Vec3> type2;
+  void runMessage()
+  {
+    debMsg("Executing kernel KnAddForceIfLower ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  //! Process one sub-range: z slices in 3D, rows of the k = 0 slice in 2D.
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    const int first = __r.begin();
+    const int last = (int)__r.end();
+    if (maxZ <= 1) {
+      for (int j = first; j != last; j++)
+        for (int i = 1; i < _maxX; i++)
+          op(i, j, 0, flags, vel, force);
+      return;
+    }
+    for (int k = first; k != last; k++)
+      for (int j = 1; j < _maxY; j++)
+        for (int i = 1; i < _maxX; i++)
+          op(i, j, k, flags, vel, force);
+  }
+  //! Run the kernel in parallel over z (3D) or over y (2D).
+  void run()
+  {
+    const bool is3D = maxZ > 1;
+    tbb::parallel_for(is3D ? tbb::blocked_range<IndexInt>(minZ, maxZ) :
+                             tbb::blocked_range<IndexInt>(1, maxY),
+                      *this);
+  }
+  const FlagGrid &flags;
+  MACGrid &vel;
+  const Grid<Vec3> &force;
+};
+
+// Initial velocity for smoke: runs the KnAddForceIfLower kernel with invel as the force grid,
+// modifying vel in place. The kernel executes from its constructor, so creating the temporary
+// is the whole operation.
+void setInitialVelocity(const FlagGrid &flags, MACGrid &vel, const Grid<Vec3> &invel)
+{
+  KnAddForceIfLower(flags, vel, invel);
+}
+// Python binding wrapper for setInitialVelocity(): fetches the arguments from the Python call,
+// runs the plugin and returns the None object. A std::exception thrown on the way is reported
+// through pbSetError() and results in a null return value.
+static PyObject *_W_7(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    // Optional "notiming" flag (default 0); its negation is passed to the prepare/finalize hooks.
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(parent, "setInitialVelocity", !noTiming);
+    PyObject *_retval = 0;
+    {
+      // NOTE(review): _lock is handed to every getter below; presumably it keeps the argument
+      // objects locked until this scope ends - confirm in ArgLocker.
+      ArgLocker _lock;
+      // All three arguments are mandatory.
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 0, &_lock);
+      MACGrid &vel = *_args.getPtr<MACGrid>("vel", 1, &_lock);
+      const Grid<Vec3> &invel = *_args.getPtr<Grid<Vec3>>("invel", 2, &_lock);
+      _retval = getPyNone();
+      setInitialVelocity(flags, vel, invel);
+      // NOTE(review): presumably verifies that every supplied argument was consumed - confirm.
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "setInitialVelocity", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("setInitialVelocity", e.what());
+    return 0;
+  }
+}
+// Registration object binding the name "setInitialVelocity" to the wrapper above.
+static const Pb::Register _RP_setInitialVelocity("", "setInitialVelocity", _W_7);
+extern "C" {
+// Only references the registration object; presumably this keeps it from being discarded
+// at link time - confirm against KEEP_UNUSED.
+void PbRegister_setInitialVelocity()
+{
+  KEEP_UNUSED(_RP_setInitialVelocity);
+}
+}
+
+//! Kernel: confinement force. For every cell the central difference gradient of the scalar
+//! grid is normalized, crossed with the curl vector and scaled by the strength (the global
+//! value plus, if a strength grid is given, the per-cell value).
+struct KnConfForce : public KernelBase {
+  KnConfForce(Grid<Vec3> &force,
+              const Grid<Real> &grid,
+              const Grid<Vec3> &curl,
+              Real str,
+              const Grid<Real> *strGrid)
+      : KernelBase(&force, 1), force(force), grid(grid), curl(curl), str(str), strGrid(strGrid)
+  {
+    runMessage();
+    run();
+  }
+  inline void op(int i,
+                 int j,
+                 int k,
+                 Grid<Vec3> &force,
+                 const Grid<Real> &grid,
+                 const Grid<Vec3> &curl,
+                 Real str,
+                 const Grid<Real> *strGrid) const
+  {
+    // Central differences in x and y; the z component is only filled in for 3D grids.
+    Vec3 gradient = 0.5 * Vec3(grid(i + 1, j, k) - grid(i - 1, j, k),
+                               grid(i, j + 1, k) - grid(i, j - 1, k),
+                               0.);
+    if (grid.is3D())
+      gradient[2] = 0.5 * (grid(i, j, k + 1) - grid(i, j, k - 1));
+    normalize(gradient);
+
+    // Per-cell strength is added on top of the global one.
+    const Real strength = strGrid ? str + (*strGrid)(i, j, k) : str;
+    force(i, j, k) = strength * cross(gradient, curl(i, j, k));
+  }
+  inline Grid<Vec3> &getArg0()
+  {
+    return force;
+  }
+  typedef Grid<Vec3> type0;
+  inline const Grid<Real> &getArg1()
+  {
+    return grid;
+  }
+  typedef Grid<Real> type1;
+  inline const Grid<Vec3> &getArg2()
+  {
+    return curl;
+  }
+  typedef Grid<Vec3> type2;
+  inline Real &getArg3()
+  {
+    return str;
+  }
+  typedef Real type3;
+  inline const Grid<Real> *getArg4()
+  {
+    return strGrid;
+  }
+  typedef Grid<Real> type4;
+  void runMessage()
+  {
+    debMsg("Executing kernel KnConfForce ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  //! Process one sub-range: z slices in 3D, rows of the k = 0 slice in 2D.
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    const int first = __r.begin();
+    const int last = (int)__r.end();
+    if (maxZ <= 1) {
+      for (int j = first; j != last; j++)
+        for (int i = 1; i < _maxX; i++)
+          op(i, j, 0, force, grid, curl, str, strGrid);
+      return;
+    }
+    for (int k = first; k != last; k++)
+      for (int j = 1; j < _maxY; j++)
+        for (int i = 1; i < _maxX; i++)
+          op(i, j, k, force, grid, curl, str, strGrid);
+  }
+  //! Run the kernel in parallel over z (3D) or over y (2D).
+  void run()
+  {
+    const bool is3D = maxZ > 1;
+    tbb::parallel_for(is3D ? tbb::blocked_range<IndexInt>(minZ, maxZ) :
+                             tbb::blocked_range<IndexInt>(1, maxY),
+                      *this);
+  }
+  Grid<Vec3> &force;
+  const Grid<Real> &grid;
+  const Grid<Vec3> &curl;
+  Real str;
+  const Grid<Real> *strGrid;
+};
+
+//! Vorticity confinement: takes the curl of the cell centered velocity, builds the confinement
+//! force from the curl and its norm grid (KnConfForce) and adds that force to vel.
+//! strengthGlobal applies to all cells, strengthCell (optional) is added per cell.
+void vorticityConfinement(MACGrid &vel,
+                          const FlagGrid &flags,
+                          Real strengthGlobal = 0,
+                          const Grid<Real> *strengthCell = NULL)
+{
+  // All temporaries live on the solver that owns the flag grid.
+  const auto solver = flags.getParent();
+  Grid<Vec3> velCenter(solver);
+  Grid<Vec3> curl(solver);
+  Grid<Vec3> force(solver);
+  Grid<Real> norm(solver);
+
+  GetCentered(velCenter, vel);
+  CurlOp(velCenter, curl);
+  GridNorm(norm, curl);
+  KnConfForce(force, norm, curl, strengthGlobal, strengthCell);
+
+  // No region grid, additive mode, force given at cell centers (not MAC).
+  KnApplyForceField(flags, vel, force, NULL, true, false);
+}
+// Python binding wrapper for vorticityConfinement(): fetches the arguments from the Python
+// call, runs the plugin and returns the None object. A std::exception thrown on the way is
+// reported through pbSetError() and results in a null return value.
+static PyObject *_W_8(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    // Optional "notiming" flag (default 0); its negation is passed to the prepare/finalize hooks.
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(parent, "vorticityConfinement", !noTiming);
+    PyObject *_retval = 0;
+    {
+      // NOTE(review): _lock is handed to every getter below; presumably it keeps the argument
+      // objects locked until this scope ends - confirm in ArgLocker.
+      ArgLocker _lock;
+      // vel and flags are mandatory; strengthGlobal defaults to 0 and strengthCell to NULL.
+      MACGrid &vel = *_args.getPtr<MACGrid>("vel", 0, &_lock);
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 1, &_lock);
+      Real strengthGlobal = _args.getOpt<Real>("strengthGlobal", 2, 0, &_lock);
+      const Grid<Real> *strengthCell = _args.getPtrOpt<Grid<Real>>(
+          "strengthCell", 3, NULL, &_lock);
+      _retval = getPyNone();
+      vorticityConfinement(vel, flags, strengthGlobal, strengthCell);
+      // NOTE(review): presumably verifies that every supplied argument was consumed - confirm.
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "vorticityConfinement", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("vorticityConfinement", e.what());
+    return 0;
+  }
+}
+// Registration object binding the name "vorticityConfinement" to the wrapper above.
+static const Pb::Register _RP_vorticityConfinement("", "vorticityConfinement", _W_8);
+extern "C" {
+// Only references the registration object; presumably this keeps it from being discarded
+// at link time - confirm against KEEP_UNUSED.
+void PbRegister_vorticityConfinement()
+{
+  KEEP_UNUSED(_RP_vorticityConfinement);
+}
+}
+
+// Runs KnApplyForceField with its fifth argument set to true (setForceField passes false
+// there), i.e. the force field is added to vel rather than replacing it.
+// region is optional (NULL by default); isMAC is forwarded unchanged.
+// NOTE(review): isMAC presumably tells whether force is already stored on MAC faces - confirm
+// against KnApplyForceField.
+void addForceField(const FlagGrid &flags,
+                   MACGrid &vel,
+                   const Grid<Vec3> &force,
+                   const Grid<Real> *region = NULL,
+                   bool isMAC = false)
+{
+  KnApplyForceField(flags, vel, force, region, true, isMAC);
+}
+// Python binding wrapper for addForceField(): fetches the arguments from the Python call, runs
+// the plugin and returns the None object. A std::exception thrown on the way is reported
+// through pbSetError() and results in a null return value.
+static PyObject *_W_9(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    // Optional "notiming" flag (default 0); its negation is passed to the prepare/finalize hooks.
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(parent, "addForceField", !noTiming);
+    PyObject *_retval = 0;
+    {
+      // NOTE(review): _lock is handed to every getter below; presumably it keeps the argument
+      // objects locked until this scope ends - confirm in ArgLocker.
+      ArgLocker _lock;
+      // flags, vel and force are mandatory; region defaults to NULL and isMAC to false.
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 0, &_lock);
+      MACGrid &vel = *_args.getPtr<MACGrid>("vel", 1, &_lock);
+      const Grid<Vec3> &force = *_args.getPtr<Grid<Vec3>>("force", 2, &_lock);
+      const Grid<Real> *region = _args.getPtrOpt<Grid<Real>>("region", 3, NULL, &_lock);
+      bool isMAC = _args.getOpt<bool>("isMAC", 4, false, &_lock);
+      _retval = getPyNone();
+      addForceField(flags, vel, force, region, isMAC);
+      // NOTE(review): presumably verifies that every supplied argument was consumed - confirm.
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "addForceField", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("addForceField", e.what());
+    return 0;
+  }
+}
+// Registration object binding the name "addForceField" to the wrapper above.
+static const Pb::Register _RP_addForceField("", "addForceField", _W_9);
+extern "C" {
+// Only references the registration object; presumably this keeps it from being discarded
+// at link time - confirm against KEEP_UNUSED.
+void PbRegister_addForceField()
+{
+  KEEP_UNUSED(_RP_addForceField);
+}
+}
+
+// Runs KnApplyForceField with its fifth argument set to false (addForceField passes true
+// there), i.e. vel is set from the force field instead of having the field added to it.
+// region is optional (NULL by default); isMAC is forwarded unchanged.
+// NOTE(review): isMAC presumably tells whether force is already stored on MAC faces - confirm
+// against KnApplyForceField.
+void setForceField(const FlagGrid &flags,
+                   MACGrid &vel,
+                   const Grid<Vec3> &force,
+                   const Grid<Real> *region = NULL,
+                   bool isMAC = false)
+{
+  KnApplyForceField(flags, vel, force, region, false, isMAC);
+}
+// Python binding wrapper for setForceField(): fetches the arguments from the Python call, runs
+// the plugin and returns the None object. A std::exception thrown on the way is reported
+// through pbSetError() and results in a null return value.
+static PyObject *_W_10(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    // Optional "notiming" flag (default 0); its negation is passed to the prepare/finalize hooks.
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(parent, "setForceField", !noTiming);
+    PyObject *_retval = 0;
+    {
+      // NOTE(review): _lock is handed to every getter below; presumably it keeps the argument
+      // objects locked until this scope ends - confirm in ArgLocker.
+      ArgLocker _lock;
+      // flags, vel and force are mandatory; region defaults to NULL and isMAC to false.
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 0, &_lock);
+      MACGrid &vel = *_args.getPtr<MACGrid>("vel", 1, &_lock);
+      const Grid<Vec3> &force = *_args.getPtr<Grid<Vec3>>("force", 2, &_lock);
+      const Grid<Real> *region = _args.getPtrOpt<Grid<Real>>("region", 3, NULL, &_lock);
+      bool isMAC = _args.getOpt<bool>("isMAC", 4, false, &_lock);
+      _retval = getPyNone();
+      setForceField(flags, vel, force, region, isMAC);
+      // NOTE(review): presumably verifies that every supplied argument was consumed - confirm.
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "setForceField", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("setForceField", e.what());
+    return 0;
+  }
+}
+// Registration object binding the name "setForceField" to the wrapper above.
+static const Pb::Register _RP_setForceField("", "setForceField", _W_10);
+extern "C" {
+// Only references the registration object; presumably this keeps it from being discarded
+// at link time - confirm against KEEP_UNUSED.
+void PbRegister_setForceField()
+{
+  KEEP_UNUSED(_RP_setForceField);
+}
+}
+
+//! Dissolve smoke in all fluid cells, together with the optional heat and color grids.
+//! With dydx = 1 / speed:
+//!  - logFalloff: density, heat and colors are scaled by (1 - dydx);
+//!  - otherwise (linear falloff): density is lowered by dydx and floored at 0, heat is moved
+//!    towards 0 by dydx (set to 0 once |heat| < dydx) and colors are scaled by the ratio of new
+//!    to old density (skipped where the old density is zero).
+//! heat, red, green and blue are optional; each one that is NULL is skipped on its own, so any
+//! subset of the color grids may be passed. speed must not be 0 (it is used as a divisor).
+void dissolveSmoke(const FlagGrid &flags,
+                   Grid<Real> &density,
+                   Grid<Real> *heat = NULL,
+                   Grid<Real> *red = NULL,
+                   Grid<Real> *green = NULL,
+                   Grid<Real> *blue = NULL,
+                   int speed = 5,
+                   bool logFalloff = true)
+{
+  float dydx = 1.0f / (float)speed;  // max density/speed = dydx
+  float fac = 1.0f - dydx;
+
+  FOR_IJK_BND(density, 0)
+  {
+    bool curFluid = flags.isFluid(i, j, k);
+    if (!curFluid)
+      continue;
+
+    if (logFalloff) {
+      density(i, j, k) *= fac;
+      if (heat) {
+        (*heat)(i, j, k) *= fac;
+      }
+      // Every color grid is tested separately: previously green and blue were dereferenced
+      // whenever red was set, which crashed if only some of the color grids were passed.
+      if (red)
+        (*red)(i, j, k) *= fac;
+      if (green)
+        (*green)(i, j, k) *= fac;
+      if (blue)
+        (*blue)(i, j, k) *= fac;
+    }
+    else {  // linear falloff
+      float d = density(i, j, k);
+      density(i, j, k) -= dydx;
+      if (density(i, j, k) < 0.0f)
+        density(i, j, k) = 0.0f;
+      if (heat) {
+        if (fabs((*heat)(i, j, k)) < dydx)
+          (*heat)(i, j, k) = 0.0f;
+        else if ((*heat)(i, j, k) > 0.0f)
+          (*heat)(i, j, k) -= dydx;
+        else if ((*heat)(i, j, k) < 0.0f)
+          (*heat)(i, j, k) += dydx;
+      }
+      // Colors follow the density: scale by new / old density, channel by channel.
+      if ((red || green || blue) && notZero(d)) {
+        const Real ratio = density(i, j, k) / d;
+        if (red)
+          (*red)(i, j, k) *= ratio;
+        if (green)
+          (*green)(i, j, k) *= ratio;
+        if (blue)
+          (*blue)(i, j, k) *= ratio;
+      }
+    }
+  }
+}
+// Python binding wrapper for dissolveSmoke(): fetches the arguments from the Python call, runs
+// the plugin and returns the None object. A std::exception thrown on the way is reported
+// through pbSetError() and results in a null return value.
+static PyObject *_W_11(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    // Optional "notiming" flag (default 0); its negation is passed to the prepare/finalize hooks.
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(parent, "dissolveSmoke", !noTiming);
+    PyObject *_retval = 0;
+    {
+      // NOTE(review): _lock is handed to every getter below; presumably it keeps the argument
+      // objects locked until this scope ends - confirm in ArgLocker.
+      ArgLocker _lock;
+      // flags and density are mandatory; the grids heat/red/green/blue default to NULL,
+      // speed to 5 and logFalloff to true.
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 0, &_lock);
+      Grid<Real> &density = *_args.getPtr<Grid<Real>>("density", 1, &_lock);
+      Grid<Real> *heat = _args.getPtrOpt<Grid<Real>>("heat", 2, NULL, &_lock);
+      Grid<Real> *red = _args.getPtrOpt<Grid<Real>>("red", 3, NULL, &_lock);
+      Grid<Real> *green = _args.getPtrOpt<Grid<Real>>("green", 4, NULL, &_lock);
+      Grid<Real> *blue = _args.getPtrOpt<Grid<Real>>("blue", 5, NULL, &_lock);
+      int speed = _args.getOpt<int>("speed", 6, 5, &_lock);
+      bool logFalloff = _args.getOpt<bool>("logFalloff", 7, true, &_lock);
+      _retval = getPyNone();
+      dissolveSmoke(flags, density, heat, red, green, blue, speed, logFalloff);
+      // NOTE(review): presumably verifies that every supplied argument was consumed - confirm.
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "dissolveSmoke", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("dissolveSmoke", e.what());
+    return 0;
+  }
+}
+// Registration object binding the name "dissolveSmoke" to the wrapper above.
+static const Pb::Register _RP_dissolveSmoke("", "dissolveSmoke", _W_11);
+extern "C" {
+// Only references the registration object; presumably this keeps it from being discarded
+// at link time - confirm against KEEP_UNUSED.
+void PbRegister_dissolveSmoke()
+{
+  KEEP_UNUSED(_RP_dissolveSmoke);
+}
+}
+
+}  // namespace Manta
diff --git a/extern/mantaflow/preprocessed/plugin/fire.cpp b/extern/mantaflow/preprocessed/plugin/fire.cpp
new file mode 100644
index 00000000000..9047d4bf8a1
--- /dev/null
+++ b/extern/mantaflow/preprocessed/plugin/fire.cpp
@@ -0,0 +1,435 @@
+
+
+// DO NOT EDIT !
+// This file is generated using the MantaFlow preprocessor (prep generate).
+
+/******************************************************************************
+ *
+ * MantaFlow fluid solver framework
+ * Copyright 2016 Sebastian Barschkis, Nils Thuerey
+ *
+ * This program is free software, distributed under the terms of the
+ * Apache License, Version 2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Fire modeling plugin
+ *
+ ******************************************************************************/
+
+#include "general.h"
+#include "grid.h"
+#include "vectorbase.h"
+
+using namespace std;
+
+namespace Manta {
+
+//! Kernel: one burn step per cell. Fuel is consumed at burningRate * dt, the reaction
+//! coordinate is scaled with the remaining fuel fraction, smoke is emitted in proportion to the
+//! burnt fuel (density limited to [0, 1]), and the optional heat and color grids are updated.
+//! red, green, blue and heat may each be NULL and are then skipped.
+struct KnProcessBurn : public KernelBase {
+  KnProcessBurn(Grid<Real> &fuel,
+                Grid<Real> &density,
+                Grid<Real> &react,
+                Grid<Real> *red,
+                Grid<Real> *green,
+                Grid<Real> *blue,
+                Grid<Real> *heat,
+                Real burningRate,
+                Real flameSmoke,
+                Real ignitionTemp,
+                Real maxTemp,
+                Real dt,
+                Vec3 flameSmokeColor)
+      : KernelBase(&fuel, 1),
+        fuel(fuel),
+        density(density),
+        react(react),
+        red(red),
+        green(green),
+        blue(blue),
+        heat(heat),
+        burningRate(burningRate),
+        flameSmoke(flameSmoke),
+        ignitionTemp(ignitionTemp),
+        maxTemp(maxTemp),
+        dt(dt),
+        flameSmokeColor(flameSmokeColor)
+  {
+    runMessage();
+    run();
+  }
+  inline void op(int i,
+                 int j,
+                 int k,
+                 Grid<Real> &fuel,
+                 Grid<Real> &density,
+                 Grid<Real> &react,
+                 Grid<Real> *red,
+                 Grid<Real> *green,
+                 Grid<Real> *blue,
+                 Grid<Real> *heat,
+                 Real burningRate,
+                 Real flameSmoke,
+                 Real ignitionTemp,
+                 Real maxTemp,
+                 Real dt,
+                 Vec3 flameSmokeColor) const
+  {
+    // Save initial values
+    Real origFuel = fuel(i, j, k);
+    Real origSmoke = density(i, j, k);
+    Real smokeEmit = 0.0f;
+    Real flame = 0.0f;
+
+    // Process fuel
+    fuel(i, j, k) -= burningRate * dt;
+    if (fuel(i, j, k) < 0.0f)
+      fuel(i, j, k) = 0.0f;
+
+    // Process reaction coordinate: scale it by the fraction of fuel that is left
+    if (origFuel > VECTOR_EPSILON) {
+      react(i, j, k) *= fuel(i, j, k) / origFuel;
+      flame = pow(react(i, j, k), 0.5f);
+    }
+    else {
+      react(i, j, k) = 0.0f;
+    }
+
+    // Emit smoke based on the amount of fuel burnt in this step and the "flameSmoke" factor
+    smokeEmit = (origFuel < 1.0f) ? (1.0 - origFuel) * 0.5f : 0.0f;
+    smokeEmit = (smokeEmit + 0.5f) * (origFuel - fuel(i, j, k)) * 0.1f * flameSmoke;
+    density(i, j, k) += smokeEmit;
+
+    // Limit the density to [0, 1] in place. The previous statement here was
+    // "clamp(density(i, j, k), (Real)0.0f, (Real)1.0f);" with its result discarded.
+    // NOTE(review): assuming clamp() returns the clamped value instead of modifying its
+    // argument (confirm in general.h), that call had no effect, the density was never limited
+    // and smokeFactor below always evaluated to 1.
+    if (density(i, j, k) < 0.0f)
+      density(i, j, k) = 0.0f;
+    else if (density(i, j, k) > 1.0f)
+      density(i, j, k) = 1.0f;
+
+    // Set fluid temperature from the flame temperature profile (blend between ignitionTemp and
+    // maxTemp); cells without flame keep their heat value
+    if (heat && flame)
+      (*heat)(i, j, k) = (1.0f - flame) * ignitionTemp + flame * maxTemp;
+
+    // Mix new color: add the flame smoke color weighted by the emitted smoke, then rescale by
+    // the ratio of the (clamped) density to the unclamped sum
+    if (smokeEmit > VECTOR_EPSILON) {
+      float smokeFactor = density(i, j, k) / (origSmoke + smokeEmit);
+      if (red)
+        (*red)(i, j, k) = ((*red)(i, j, k) + flameSmokeColor.x * smokeEmit) * smokeFactor;
+      if (green)
+        (*green)(i, j, k) = ((*green)(i, j, k) + flameSmokeColor.y * smokeEmit) * smokeFactor;
+      if (blue)
+        (*blue)(i, j, k) = ((*blue)(i, j, k) + flameSmokeColor.z * smokeEmit) * smokeFactor;
+    }
+  }
+  inline Grid<Real> &getArg0()
+  {
+    return fuel;
+  }
+  typedef Grid<Real> type0;
+  inline Grid<Real> &getArg1()
+  {
+    return density;
+  }
+  typedef Grid<Real> type1;
+  inline Grid<Real> &getArg2()
+  {
+    return react;
+  }
+  typedef Grid<Real> type2;
+  inline Grid<Real> *getArg3()
+  {
+    return red;
+  }
+  typedef Grid<Real> type3;
+  inline Grid<Real> *getArg4()
+  {
+    return green;
+  }
+  typedef Grid<Real> type4;
+  inline Grid<Real> *getArg5()
+  {
+    return blue;
+  }
+  typedef Grid<Real> type5;
+  inline Grid<Real> *getArg6()
+  {
+    return heat;
+  }
+  typedef Grid<Real> type6;
+  inline Real &getArg7()
+  {
+    return burningRate;
+  }
+  typedef Real type7;
+  inline Real &getArg8()
+  {
+    return flameSmoke;
+  }
+  typedef Real type8;
+  inline Real &getArg9()
+  {
+    return ignitionTemp;
+  }
+  typedef Real type9;
+  inline Real &getArg10()
+  {
+    return maxTemp;
+  }
+  typedef Real type10;
+  inline Real &getArg11()
+  {
+    return dt;
+  }
+  typedef Real type11;
+  inline Vec3 &getArg12()
+  {
+    return flameSmokeColor;
+  }
+  typedef Vec3 type12;
+  void runMessage()
+  {
+    debMsg("Executing kernel KnProcessBurn ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  //! Process one sub-range: z slices in 3D, rows of the k = 0 slice in 2D.
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i,
+               j,
+               k,
+               fuel,
+               density,
+               react,
+               red,
+               green,
+               blue,
+               heat,
+               burningRate,
+               flameSmoke,
+               ignitionTemp,
+               maxTemp,
+               dt,
+               flameSmokeColor);
+    }
+    else {
+      const int k = 0;
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = 1; i < _maxX; i++)
+          op(i,
+             j,
+             k,
+             fuel,
+             density,
+             react,
+             red,
+             green,
+             blue,
+             heat,
+             burningRate,
+             flameSmoke,
+             ignitionTemp,
+             maxTemp,
+             dt,
+             flameSmokeColor);
+    }
+  }
+  //! Run the kernel in parallel over z (3D) or over y (2D).
+  void run()
+  {
+    if (maxZ > 1)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
+  }
+  Grid<Real> &fuel;
+  Grid<Real> &density;
+  Grid<Real> &react;
+  Grid<Real> *red;
+  Grid<Real> *green;
+  Grid<Real> *blue;
+  Grid<Real> *heat;
+  Real burningRate;
+  Real flameSmoke;
+  Real ignitionTemp;
+  Real maxTemp;
+  Real dt;
+  Vec3 flameSmokeColor;
+};
+
+// Runs one burn step on the given grids via the KnProcessBurn kernel. The time step is read
+// from the solver that owns the fuel grid. The color grids (red, green, blue) and the heat
+// grid are optional and passed through as given.
+void processBurn(Grid<Real> &fuel,
+                 Grid<Real> &density,
+                 Grid<Real> &react,
+                 Grid<Real> *red = NULL,
+                 Grid<Real> *green = NULL,
+                 Grid<Real> *blue = NULL,
+                 Grid<Real> *heat = NULL,
+                 Real burningRate = 0.75f,
+                 Real flameSmoke = 1.0f,
+                 Real ignitionTemp = 1.25f,
+                 Real maxTemp = 1.75f,
+                 Vec3 flameSmokeColor = Vec3(0.7f, 0.7f, 0.7f))
+{
+  Real dt = fuel.getParent()->getDt();
+  // The kernel executes from its constructor.
+  KnProcessBurn(fuel,
+                density,
+                react,
+                red,
+                green,
+                blue,
+                heat,
+                burningRate,
+                flameSmoke,
+                ignitionTemp,
+                maxTemp,
+                dt,
+                flameSmokeColor);
+}
+// Python binding wrapper for processBurn(): fetches the arguments from the Python call, runs
+// the plugin and returns the None object. A std::exception thrown on the way is reported
+// through pbSetError() and results in a null return value.
+static PyObject *_W_0(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    // Optional "notiming" flag (default 0); its negation is passed to the prepare/finalize hooks.
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(parent, "processBurn", !noTiming);
+    PyObject *_retval = 0;
+    {
+      // NOTE(review): _lock is handed to every getter below; presumably it keeps the argument
+      // objects locked until this scope ends - confirm in ArgLocker.
+      ArgLocker _lock;
+      // fuel, density and react are mandatory; the defaults of the optional arguments repeat
+      // those in the processBurn() signature.
+      Grid<Real> &fuel = *_args.getPtr<Grid<Real>>("fuel", 0, &_lock);
+      Grid<Real> &density = *_args.getPtr<Grid<Real>>("density", 1, &_lock);
+      Grid<Real> &react = *_args.getPtr<Grid<Real>>("react", 2, &_lock);
+      Grid<Real> *red = _args.getPtrOpt<Grid<Real>>("red", 3, NULL, &_lock);
+      Grid<Real> *green = _args.getPtrOpt<Grid<Real>>("green", 4, NULL, &_lock);
+      Grid<Real> *blue = _args.getPtrOpt<Grid<Real>>("blue", 5, NULL, &_lock);
+      Grid<Real> *heat = _args.getPtrOpt<Grid<Real>>("heat", 6, NULL, &_lock);
+      Real burningRate = _args.getOpt<Real>("burningRate", 7, 0.75f, &_lock);
+      Real flameSmoke = _args.getOpt<Real>("flameSmoke", 8, 1.0f, &_lock);
+      Real ignitionTemp = _args.getOpt<Real>("ignitionTemp", 9, 1.25f, &_lock);
+      Real maxTemp = _args.getOpt<Real>("maxTemp", 10, 1.75f, &_lock);
+      Vec3 flameSmokeColor = _args.getOpt<Vec3>(
+          "flameSmokeColor", 11, Vec3(0.7f, 0.7f, 0.7f), &_lock);
+      _retval = getPyNone();
+      processBurn(fuel,
+                  density,
+                  react,
+                  red,
+                  green,
+                  blue,
+                  heat,
+                  burningRate,
+                  flameSmoke,
+                  ignitionTemp,
+                  maxTemp,
+                  flameSmokeColor);
+      // NOTE(review): presumably verifies that every supplied argument was consumed - confirm.
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "processBurn", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("processBurn", e.what());
+    return 0;
+  }
+}
+// Registration object binding the name "processBurn" to the wrapper above.
+static const Pb::Register _RP_processBurn("", "processBurn", _W_0);
+extern "C" {
+// Only references the registration object; presumably this keeps it from being discarded
+// at link time - confirm against KEEP_UNUSED.
+void PbRegister_processBurn()
+{
+  KEEP_UNUSED(_RP_processBurn);
+}
+}
+
+//! Kernel: derive the flame grid from the reaction coordinate. Cells with a positive reaction
+//! value get its square root (pow with exponent 0.5), all other cells are set to 0.
+struct KnUpdateFlame : public KernelBase {
+  KnUpdateFlame(const Grid<Real> &react, Grid<Real> &flame)
+      : KernelBase(&react, 1), react(react), flame(flame)
+  {
+    runMessage();
+    run();
+  }
+  inline void op(int i, int j, int k, const Grid<Real> &react, Grid<Real> &flame) const
+  {
+    const Real reaction = react(i, j, k);
+    flame(i, j, k) = (reaction > 0.0f) ? pow(reaction, 0.5f) : 0.0f;
+  }
+  inline const Grid<Real> &getArg0()
+  {
+    return react;
+  }
+  typedef Grid<Real> type0;
+  inline Grid<Real> &getArg1()
+  {
+    return flame;
+  }
+  typedef Grid<Real> type1;
+  void runMessage()
+  {
+    debMsg("Executing kernel KnUpdateFlame ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  //! Process one sub-range: z slices in 3D, rows of the k = 0 slice in 2D.
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    const int first = __r.begin();
+    const int last = (int)__r.end();
+    if (maxZ <= 1) {
+      for (int j = first; j != last; j++)
+        for (int i = 1; i < _maxX; i++)
+          op(i, j, 0, react, flame);
+      return;
+    }
+    for (int k = first; k != last; k++)
+      for (int j = 1; j < _maxY; j++)
+        for (int i = 1; i < _maxX; i++)
+          op(i, j, k, react, flame);
+  }
+  //! Run the kernel in parallel over z (3D) or over y (2D).
+  void run()
+  {
+    const bool is3D = maxZ > 1;
+    tbb::parallel_for(is3D ? tbb::blocked_range<IndexInt>(minZ, maxZ) :
+                             tbb::blocked_range<IndexInt>(1, maxY),
+                      *this);
+  }
+  const Grid<Real> &react;
+  Grid<Real> &flame;
+};
+
+// Fills the flame grid from the reaction coordinate grid by running the KnUpdateFlame kernel
+// (the kernel executes from its constructor).
+void updateFlame(const Grid<Real> &react, Grid<Real> &flame)
+{
+  KnUpdateFlame(react, flame);
+}
+// Python binding wrapper for updateFlame(): fetches the arguments from the Python call, runs
+// the plugin and returns the None object. A std::exception thrown on the way is reported
+// through pbSetError() and results in a null return value.
+static PyObject *_W_1(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    // Optional "notiming" flag (default 0); its negation is passed to the prepare/finalize hooks.
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(parent, "updateFlame", !noTiming);
+    PyObject *_retval = 0;
+    {
+      // NOTE(review): _lock is handed to every getter below; presumably it keeps the argument
+      // objects locked until this scope ends - confirm in ArgLocker.
+      ArgLocker _lock;
+      // Both arguments are mandatory.
+      const Grid<Real> &react = *_args.getPtr<Grid<Real>>("react", 0, &_lock);
+      Grid<Real> &flame = *_args.getPtr<Grid<Real>>("flame", 1, &_lock);
+      _retval = getPyNone();
+      updateFlame(react, flame);
+      // NOTE(review): presumably verifies that every supplied argument was consumed - confirm.
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "updateFlame", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("updateFlame", e.what());
+    return 0;
+  }
+}
+// Registration object binding the name "updateFlame" to the wrapper above.
+static const Pb::Register _RP_updateFlame("", "updateFlame", _W_1);
+extern "C" {
+// Only references the registration object; presumably this keeps it from being discarded
+// at link time - confirm against KEEP_UNUSED.
+void PbRegister_updateFlame()
+{
+  KEEP_UNUSED(_RP_updateFlame);
+}
+}
+
+}  // namespace Manta
diff --git a/extern/mantaflow/preprocessed/plugin/flip.cpp b/extern/mantaflow/preprocessed/plugin/flip.cpp
new file mode 100644
index 00000000000..f6d082900b5
--- /dev/null
+++ b/extern/mantaflow/preprocessed/plugin/flip.cpp
@@ -0,0 +1,2819 @@
+
+
+// DO NOT EDIT !
+// This file is generated using the MantaFlow preprocessor (prep generate).
+
+/******************************************************************************
+ *
+ * MantaFlow fluid solver framework
+ * Copyright 2011 Tobias Pfaff, Nils Thuerey
+ *
+ * This program is free software, distributed under the terms of the
+ * Apache License, Version 2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * FLIP (fluid implicit particles)
+ * for use with particle data fields
+ *
+ ******************************************************************************/
+
+#include "particle.h"
+#include "grid.h"
+#include "commonkernels.h"
+#include "randomstream.h"
+#include "levelset.h"
+#include "shapes.h"
+#include "matrixbase.h"
+
+using namespace std;
+namespace Manta {
+
+// init
+
+//! note - this is a simplified version, sampleLevelsetWithParticles has more functionality
+
+void sampleFlagsWithParticles(const FlagGrid &flags,
+                              BasicParticleSystem &parts,
+                              const int discretization,
+                              const Real randomness)
+{
+  // fills every fluid cell with discretization^2 (2D) / discretization^3 (3D) jittered particles
+  const bool volumetric = flags.is3D();
+  const int depthSamples = volumetric ? discretization : 1;
+  const Real jitter = randomness / discretization;
+  const Vec3 spacing(1.0 / discretization, 1.0 / discretization, 1.0 / discretization);
+  RandomStream rng(9832);
+
+  FOR_IJK_BND(flags, 0)
+  {
+    if (flags.isObstacle(i, j, k) || !flags.isFluid(i, j, k))
+      continue;
+    const Vec3 cellOrigin(i, j, k);
+    for (int dk = 0; dk < depthSamples; dk++)
+      for (int dj = 0; dj < discretization; dj++)
+        for (int di = 0; di < discretization; di++) {
+          Vec3 sample = cellOrigin + spacing * Vec3(0.5 + di, 0.5 + dj, 0.5 + dk);
+          sample += jitter * (Vec3(1, 1, 1) - 2.0 * rng.getVec3());
+          if (!volumetric)
+            sample[2] = 0.5;  // 2D: keep the particle in the middle of the single slice
+          parts.addBuffered(sample);
+        }
+  }
+  parts.insertBufferedParticles();
+}
+static PyObject *_W_0(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // generated Python binding for sampleFlagsWithParticles()
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional "notiming" argument, default 0
+    pbPreparePlugin(parent, "sampleFlagsWithParticles", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // lives until the closing brace of this block
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 0, &_lock);
+      BasicParticleSystem &parts = *_args.getPtr<BasicParticleSystem>("parts", 1, &_lock);
+      const int discretization = _args.get<int>("discretization", 2, &_lock);
+      const Real randomness = _args.get<Real>("randomness", 3, &_lock);
+      _retval = getPyNone();  // the plugin returns void
+      sampleFlagsWithParticles(flags, parts, discretization, randomness);
+      _args.check();  // NOTE(review): presumably verifies that all passed arguments were consumed - confirm
+    }
+    pbFinalizePlugin(parent, "sampleFlagsWithParticles", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("sampleFlagsWithParticles", e.what());
+    return 0;  // null result after the error has been recorded
+  }
+}
+static const Pb::Register _RP_sampleFlagsWithParticles("", "sampleFlagsWithParticles", _W_0);  // ties the plugin name to wrapper _W_0
+extern "C" {
+void PbRegister_sampleFlagsWithParticles()
+{
+  KEEP_UNUSED(_RP_sampleFlagsWithParticles);  // references the registration object; presumably keeps it from being dropped - TODO confirm
+}
+}
+
+//! sample a level set with particles, use reset to clear the particle buffer,
+//! and refillEmpty for a continuous inflow (in the latter case, cells that still
+//! contain fluid are skipped, so only emptied cells are re-filled when calling
+//! sampleLevelsetWithParticles during the main loop).
+
+void sampleLevelsetWithParticles(const LevelsetGrid &phi,
+                                 const FlagGrid &flags,
+                                 BasicParticleSystem &parts,
+                                 const int discretization,
+                                 const Real randomness,
+                                 const bool reset = false,
+                                 const bool refillEmpty = false,
+                                 const int particleFlag = -1)
+{
+  // seeds jittered sub-cell samples and keeps those whose interpolated phi is not positive
+  const bool volumetric = phi.is3D();
+  const int depthSamples = volumetric ? discretization : 1;
+  const Real jitter = randomness / discretization;
+  const Vec3 spacing(1.0 / discretization, 1.0 / discretization, 1.0 / discretization);
+  RandomStream rng(9832);
+
+  if (reset) {
+    parts.clear();
+    parts.doCompress();
+  }
+
+  FOR_IJK_BND(phi, 0)
+  {
+    if (flags.isObstacle(i, j, k))
+      continue;
+    if (refillEmpty && flags.isFluid(i, j, k))  // refill mode: cells still holding fluid stay as is
+      continue;
+    if (!(phi(i, j, k) < 1.733))  // only cells with a phi value below 1.733 are sampled
+      continue;
+    const Vec3 cellOrigin(i, j, k);
+    for (int dk = 0; dk < depthSamples; dk++)
+      for (int dj = 0; dj < discretization; dj++)
+        for (int di = 0; di < discretization; di++) {
+          Vec3 sample = cellOrigin + spacing * Vec3(0.5 + di, 0.5 + dj, 0.5 + dk);
+          sample += jitter * (Vec3(1, 1, 1) - 2.0 * rng.getVec3());
+          if (!volumetric)
+            sample[2] = 0.5;
+          if (phi.getInterpolated(sample) > 0.)
+            continue;  // sample has a positive level set value, no particle here
+          if (particleFlag >= 0)
+            parts.addBuffered(sample, particleFlag);
+          else
+            parts.addBuffered(sample);  // negative particleFlag: add without an explicit flag
+        }
+  }
+
+  parts.insertBufferedParticles();
+}
+static PyObject *_W_1(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // generated Python binding for sampleLevelsetWithParticles()
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional "notiming" argument, default 0
+    pbPreparePlugin(parent, "sampleLevelsetWithParticles", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // lives until the closing brace of this block
+      const LevelsetGrid &phi = *_args.getPtr<LevelsetGrid>("phi", 0, &_lock);
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 1, &_lock);
+      BasicParticleSystem &parts = *_args.getPtr<BasicParticleSystem>("parts", 2, &_lock);
+      const int discretization = _args.get<int>("discretization", 3, &_lock);
+      const Real randomness = _args.get<Real>("randomness", 4, &_lock);
+      const bool reset = _args.getOpt<bool>("reset", 5, false, &_lock);  // the getOpt defaults mirror the C++ default arguments
+      const bool refillEmpty = _args.getOpt<bool>("refillEmpty", 6, false, &_lock);
+      const int particleFlag = _args.getOpt<int>("particleFlag", 7, -1, &_lock);
+      _retval = getPyNone();  // the plugin returns void
+      sampleLevelsetWithParticles(
+          phi, flags, parts, discretization, randomness, reset, refillEmpty, particleFlag);
+      _args.check();  // NOTE(review): presumably verifies that all passed arguments were consumed - confirm
+    }
+    pbFinalizePlugin(parent, "sampleLevelsetWithParticles", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("sampleLevelsetWithParticles", e.what());
+    return 0;  // null result after the error has been recorded
+  }
+}
+static const Pb::Register _RP_sampleLevelsetWithParticles("", "sampleLevelsetWithParticles", _W_1);  // ties the plugin name to wrapper _W_1
+extern "C" {
+void PbRegister_sampleLevelsetWithParticles()
+{
+  KEEP_UNUSED(_RP_sampleLevelsetWithParticles);  // references the registration object; presumably keeps it from being dropped - TODO confirm
+}
+}
+
+//! sample a shape with particles, use reset to clear the particle buffer,
+//! and refillEmpty for a continuous inflow (in the latter case, cells that still
+//! contain fluid are skipped, so only emptied cells are re-filled when calling
+//! sampleShapeWithParticles during the main loop).
+
+void sampleShapeWithParticles(const Shape &shape,
+                              const FlagGrid &flags,
+                              BasicParticleSystem &parts,
+                              const int discretization,
+                              const Real randomness,
+                              const bool reset = false,
+                              const bool refillEmpty = false,
+                              const LevelsetGrid *exclude = NULL)
+{
+  // seeds jittered sub-cell samples in all non-obstacle cells and keeps those inside shape
+  const bool volumetric = flags.is3D();
+  const int depthSamples = volumetric ? discretization : 1;
+  const Real jitter = randomness / discretization;
+  const Vec3 spacing(1.0 / discretization, 1.0 / discretization, 1.0 / discretization);
+  RandomStream rng(9832);
+
+  if (reset) {
+    parts.clear();
+    parts.doCompress();
+  }
+
+  FOR_IJK_BND(flags, 0)
+  {
+    // obstacle cells never get particles; in refill mode fluid cells are skipped as well
+    if (flags.isObstacle(i, j, k) || (refillEmpty && flags.isFluid(i, j, k)))
+      continue;
+    const Vec3 cellOrigin(i, j, k);
+    for (int dk = 0; dk < depthSamples; dk++)
+      for (int dj = 0; dj < discretization; dj++)
+        for (int di = 0; di < discretization; di++) {
+          Vec3 sample = cellOrigin + spacing * Vec3(0.5 + di, 0.5 + dj, 0.5 + dk);
+          sample += jitter * (Vec3(1, 1, 1) - 2.0 * rng.getVec3());
+          if (!volumetric)
+            sample[2] = 0.5;
+          // samples where the optional exclude level set is <= 0 are rejected first
+          const bool excluded = exclude && exclude->getInterpolated(sample) <= 0.;
+          if (!excluded && shape.isInside(sample))
+            parts.addBuffered(sample);
+        }
+  }
+
+  parts.insertBufferedParticles();
+}
+static PyObject *_W_2(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // generated Python binding for sampleShapeWithParticles()
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional "notiming" argument, default 0
+    pbPreparePlugin(parent, "sampleShapeWithParticles", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // lives until the closing brace of this block
+      const Shape &shape = *_args.getPtr<Shape>("shape", 0, &_lock);
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 1, &_lock);
+      BasicParticleSystem &parts = *_args.getPtr<BasicParticleSystem>("parts", 2, &_lock);
+      const int discretization = _args.get<int>("discretization", 3, &_lock);
+      const Real randomness = _args.get<Real>("randomness", 4, &_lock);
+      const bool reset = _args.getOpt<bool>("reset", 5, false, &_lock);  // the getOpt defaults mirror the C++ default arguments
+      const bool refillEmpty = _args.getOpt<bool>("refillEmpty", 6, false, &_lock);
+      const LevelsetGrid *exclude = _args.getPtrOpt<LevelsetGrid>("exclude", 7, NULL, &_lock);  // stays NULL when not passed
+      _retval = getPyNone();  // the plugin returns void
+      sampleShapeWithParticles(
+          shape, flags, parts, discretization, randomness, reset, refillEmpty, exclude);
+      _args.check();  // NOTE(review): presumably verifies that all passed arguments were consumed - confirm
+    }
+    pbFinalizePlugin(parent, "sampleShapeWithParticles", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("sampleShapeWithParticles", e.what());
+    return 0;  // null result after the error has been recorded
+  }
+}
+static const Pb::Register _RP_sampleShapeWithParticles("", "sampleShapeWithParticles", _W_2);  // ties the plugin name to wrapper _W_2
+extern "C" {
+void PbRegister_sampleShapeWithParticles()
+{
+  KEEP_UNUSED(_RP_sampleShapeWithParticles);  // references the registration object; presumably keeps it from being dropped - TODO confirm
+}
+}
+
+//! mark fluid cells and helpers
+struct knClearFluidFlags : public KernelBase {  // generated kernel: turns every fluid cell of a FlagGrid into an empty cell
+  knClearFluidFlags(FlagGrid &flags, int dummy = 0)
+      : KernelBase(&flags, 0), flags(flags), dummy(dummy)
+  {
+    runMessage();
+    run();  // the kernel executes as part of construction
+  }
+  inline void op(int i, int j, int k, FlagGrid &flags, int dummy = 0) const  // per-cell operation; dummy is not used
+  {
+    if (flags.isFluid(i, j, k)) {
+      flags(i, j, k) = (flags(i, j, k) | FlagGrid::TypeEmpty) & ~FlagGrid::TypeFluid;  // set the empty bit, clear the fluid bit
+    }
+  }
+  inline FlagGrid &getArg0()  // accessor for kernel argument 0
+  {
+    return flags;
+  }
+  typedef FlagGrid type0;
+  inline int &getArg1()  // accessor for kernel argument 1
+  {
+    return dummy;
+  }
+  typedef int type1;
+  void runMessage()  // debug output: kernel name (level 3) and iteration range (level 4)
+  {
+    debMsg("Executing kernel knClearFluidFlags ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const  // body invoked by tbb::parallel_for for one sub-range
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {  // here the range indexes k slices
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, flags, dummy);
+    }
+    else {  // single slice k = 0: the range indexes rows j
+      const int k = 0;
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = 0; i < _maxX; i++)
+          op(i, j, k, flags, dummy);
+    }
+  }
+  void run()  // parallel over k when maxZ > 1, otherwise parallel over j
+  {
+    if (maxZ > 1)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
+  }
+  FlagGrid &flags;
+  int dummy;
+};
+
+struct knSetNbObstacle : public KernelBase {  // generated kernel: marks empty cells with phiObs <= 0 as fluid when fluid and obstacle lie on opposite sides
+  knSetNbObstacle(FlagGrid &nflags, const FlagGrid &flags, const Grid<Real> &phiObs)
+      : KernelBase(&nflags, 1), nflags(nflags), flags(flags), phiObs(phiObs)  // second KernelBase argument 1: the loops below start at index 1
+  {
+    runMessage();
+    run();  // the kernel executes as part of construction
+  }
+  inline void op(
+      int i, int j, int k, FlagGrid &nflags, const FlagGrid &flags, const Grid<Real> &phiObs) const
+  {
+    if (phiObs(i, j, k) > 0.)  // only cells with phiObs <= 0 are considered
+      return;
+    if (flags.isEmpty(i, j, k)) {
+      bool set = false;
+      if ((flags.isFluid(i - 1, j, k)) && (phiObs(i + 1, j, k) <= 0.))  // fluid at i-1 and phiObs <= 0 at i+1
+        set = true;
+      if ((flags.isFluid(i + 1, j, k)) && (phiObs(i - 1, j, k) <= 0.))  // mirrored case along x
+        set = true;
+      if ((flags.isFluid(i, j - 1, k)) && (phiObs(i, j + 1, k) <= 0.))  // same test along y
+        set = true;
+      if ((flags.isFluid(i, j + 1, k)) && (phiObs(i, j - 1, k) <= 0.))
+        set = true;
+      if (flags.is3D()) {  // z neighbours are only inspected in 3D
+        if ((flags.isFluid(i, j, k - 1)) && (phiObs(i, j, k + 1) <= 0.))
+          set = true;
+        if ((flags.isFluid(i, j, k + 1)) && (phiObs(i, j, k - 1) <= 0.))
+          set = true;
+      }
+      if (set)
+        nflags(i, j, k) = (flags(i, j, k) | FlagGrid::TypeFluid) & ~FlagGrid::TypeEmpty;  // result goes to nflags, flags is only read
+    }
+  }
+  inline FlagGrid &getArg0()  // accessor for kernel argument 0
+  {
+    return nflags;
+  }
+  typedef FlagGrid type0;
+  inline const FlagGrid &getArg1()  // accessor for kernel argument 1
+  {
+    return flags;
+  }
+  typedef FlagGrid type1;
+  inline const Grid<Real> &getArg2()  // accessor for kernel argument 2
+  {
+    return phiObs;
+  }
+  typedef Grid<Real> type2;
+  void runMessage()  // debug output: kernel name (level 3) and iteration range (level 4)
+  {
+    debMsg("Executing kernel knSetNbObstacle ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const  // body invoked by tbb::parallel_for for one sub-range
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {  // here the range indexes k slices
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, nflags, flags, phiObs);
+    }
+    else {  // single slice k = 0: the range indexes rows j
+      const int k = 0;
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = 1; i < _maxX; i++)
+          op(i, j, k, nflags, flags, phiObs);
+    }
+  }
+  void run()  // parallel over k when maxZ > 1, otherwise parallel over j starting at 1
+  {
+    if (maxZ > 1)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
+  }
+  FlagGrid &nflags;
+  const FlagGrid &flags;
+  const Grid<Real> &phiObs;
+};
+void markFluidCells(const BasicParticleSystem &parts,
+                    FlagGrid &flags,
+                    const Grid<Real> *phiObs = NULL,
+                    const ParticleDataImpl<int> *ptype = NULL,
+                    const int exclude = 0)
+{
+  knClearFluidFlags(flags, 0);  // start clean: every fluid cell becomes an empty cell again
+  // every active, non-excluded particle marks the empty cell that contains it as fluid
+  for (IndexInt idx = 0; idx < parts.size(); idx++) {
+    if (!parts.isActive(idx))
+      continue;
+    if (ptype && ((*ptype)[idx] & exclude))
+      continue;
+    Vec3i cell = toVec3i(parts.getPos(idx));
+    if (flags.isInBounds(cell) && flags.isEmpty(cell))
+      flags(cell) = (flags(cell) | FlagGrid::TypeFluid) & ~FlagGrid::TypeEmpty;
+  }
+
+  // second order obstacle BCs: the kernel reads the current flags and writes into a copy
+  if (phiObs) {
+    FlagGrid updated(flags);
+    knSetNbObstacle(updated, flags, *phiObs);
+    flags.swap(updated);
+  }
+}
+static PyObject *_W_3(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // generated Python binding for markFluidCells()
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional "notiming" argument, default 0
+    pbPreparePlugin(parent, "markFluidCells", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // lives until the closing brace of this block
+      const BasicParticleSystem &parts = *_args.getPtr<BasicParticleSystem>("parts", 0, &_lock);
+      FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 1, &_lock);
+      const Grid<Real> *phiObs = _args.getPtrOpt<Grid<Real>>("phiObs", 2, NULL, &_lock);  // stays NULL when not passed
+      const ParticleDataImpl<int> *ptype = _args.getPtrOpt<ParticleDataImpl<int>>(
+          "ptype", 3, NULL, &_lock);
+      const int exclude = _args.getOpt<int>("exclude", 4, 0, &_lock);
+      _retval = getPyNone();  // the plugin returns void
+      markFluidCells(parts, flags, phiObs, ptype, exclude);
+      _args.check();  // NOTE(review): presumably verifies that all passed arguments were consumed - confirm
+    }
+    pbFinalizePlugin(parent, "markFluidCells", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("markFluidCells", e.what());
+    return 0;  // null result after the error has been recorded
+  }
+}
+static const Pb::Register _RP_markFluidCells("", "markFluidCells", _W_3);  // ties the plugin name to wrapper _W_3
+extern "C" {
+void PbRegister_markFluidCells()
+{
+  KEEP_UNUSED(_RP_markFluidCells);  // references the registration object; presumably keeps it from being dropped - TODO confirm
+}
+}
+
+// for testing purposes only...
+void testInitGridWithPos(Grid<Real> &grid)
+{
+  FOR_IJK(grid) {
+    const Vec3 cellIndex(i, j, k);  // the cell's integer coordinates as a vector
+    grid(i, j, k) = norm(cellIndex);
+  }
+}
+static PyObject *_W_4(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // generated Python binding for testInitGridWithPos()
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional "notiming" argument, default 0
+    pbPreparePlugin(parent, "testInitGridWithPos", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // lives until the closing brace of this block
+      Grid<Real> &grid = *_args.getPtr<Grid<Real>>("grid", 0, &_lock);
+      _retval = getPyNone();  // the plugin returns void
+      testInitGridWithPos(grid);
+      _args.check();  // NOTE(review): presumably verifies that all passed arguments were consumed - confirm
+    }
+    pbFinalizePlugin(parent, "testInitGridWithPos", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("testInitGridWithPos", e.what());
+    return 0;  // null result after the error has been recorded
+  }
+}
+static const Pb::Register _RP_testInitGridWithPos("", "testInitGridWithPos", _W_4);  // ties the plugin name to wrapper _W_4
+extern "C" {
+void PbRegister_testInitGridWithPos()
+{
+  KEEP_UNUSED(_RP_testInitGridWithPos);  // references the registration object; presumably keeps it from being dropped - TODO confirm
+}
+}
+
+//! helper to calculate particle radius factor to cover the diagonal of a cell in 2d/3d
+inline Real calculateRadiusFactor(const Grid<Real> &grid, Real factor)
+{
+  const double diagonal = grid.is3D() ? sqrt(3.) : sqrt(2.);  // unit cell diagonal, 3D vs 2D
+  const double padded = factor + .01;  // note, a small safety margin is added to the factor
+  return diagonal * padded;
+}
+
+//! re-sample particles based on an input levelset
+// optionally skip seeding new particles in "exclude" SDF
+
+void adjustNumber(BasicParticleSystem &parts,
+                  const MACGrid &vel,
+                  const FlagGrid &flags,
+                  int minParticles,
+                  int maxParticles,
+                  const LevelsetGrid &phi,
+                  Real radiusFactor = 1.,
+                  Real narrowBand = -1.,
+                  const Grid<Real> *exclude = NULL)
+{
+  // Thins out and re-seeds particles: kills particles outside the grid, where phi > 0, or
+  // deeper than narrowBand; then tops up fluid cells that hold fewer than minParticles.
+  const Real SURFACE_LS = -1.0 * calculateRadiusFactor(phi, radiusFactor);  // levelset threshold
+  Grid<int> tmp(vel.getParent());  // per-cell count of the particles kept so far
+
+  // count particles in cells, and delete excess particles
+  for (IndexInt idx = 0; idx < (IndexInt)parts.size(); idx++) {
+    if (parts.isActive(idx)) {
+      Vec3i p = toVec3i(parts.getPos(idx));
+      if (!tmp.isInBounds(p)) {
+        parts.kill(idx);  // out of domain, remove
+        continue;
+      }
+
+      Real phiv = phi.getInterpolated(parts.getPos(idx));
+      if (phiv > 0) {
+        parts.kill(idx);  // positive level set value, remove
+        continue;
+      }
+      if (narrowBand > 0. && phiv < -narrowBand) {
+        parts.kill(idx);  // deeper inside than the narrow band, remove
+        continue;
+      }
+
+      // particles with phi above the threshold count as surface particles and are never thinned
+      const bool atSurface = phiv > SURFACE_LS;
+      int num = tmp(p);
+
+      // dont delete particles in non fluid cells here, the particles are "always right"
+      // (the test uses '>', so a cell can retain up to maxParticles + 1 interior particles)
+      if (num > maxParticles && (!atSurface)) {
+        parts.kill(idx);
+      }
+      else {
+        tmp(p) = num + 1;
+      }
+    }
+  }
+
+  // seed new particles
+  RandomStream mRand(9832);
+  FOR_IJK(tmp)
+  {
+    int cnt = tmp(i, j, k);
+
+    // skip cells near surface
+    if (phi(i, j, k) > SURFACE_LS)
+      continue;
+    if (narrowBand > 0. && phi(i, j, k) < -narrowBand) {
+      continue;
+    }
+    if (exclude && ((*exclude)(i, j, k) < 0.)) {
+      continue;
+    }
+
+    if (flags.isFluid(i, j, k) && cnt < minParticles) {
+      for (int m = cnt; m < minParticles; m++) {
+        Vec3 pos = Vec3(i, j, k) + mRand.getVec3();
+        // Vec3 pos (i + 0.5, j + 0.5, k + 0.5); // cell center
+        parts.addBuffered(pos);
+      }
+    }
+  }
+
+  parts.doCompress();
+  parts.insertBufferedParticles();
+}
+static PyObject *_W_5(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // generated Python binding for adjustNumber()
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional "notiming" argument, default 0
+    pbPreparePlugin(parent, "adjustNumber", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // lives until the closing brace of this block
+      BasicParticleSystem &parts = *_args.getPtr<BasicParticleSystem>("parts", 0, &_lock);
+      const MACGrid &vel = *_args.getPtr<MACGrid>("vel", 1, &_lock);
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 2, &_lock);
+      int minParticles = _args.get<int>("minParticles", 3, &_lock);
+      int maxParticles = _args.get<int>("maxParticles", 4, &_lock);
+      const LevelsetGrid &phi = *_args.getPtr<LevelsetGrid>("phi", 5, &_lock);
+      Real radiusFactor = _args.getOpt<Real>("radiusFactor", 6, 1., &_lock);  // the getOpt defaults mirror the C++ default arguments
+      Real narrowBand = _args.getOpt<Real>("narrowBand", 7, -1., &_lock);
+      const Grid<Real> *exclude = _args.getPtrOpt<Grid<Real>>("exclude", 8, NULL, &_lock);  // stays NULL when not passed
+      _retval = getPyNone();  // the plugin returns void
+      adjustNumber(
+          parts, vel, flags, minParticles, maxParticles, phi, radiusFactor, narrowBand, exclude);
+      _args.check();  // NOTE(review): presumably verifies that all passed arguments were consumed - confirm
+    }
+    pbFinalizePlugin(parent, "adjustNumber", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("adjustNumber", e.what());
+    return 0;  // null result after the error has been recorded
+  }
+}
+static const Pb::Register _RP_adjustNumber("", "adjustNumber", _W_5);  // ties the plugin name to wrapper _W_5
+extern "C" {
+void PbRegister_adjustNumber()
+{
+  KEEP_UNUSED(_RP_adjustNumber);  // references the registration object; presumably keeps it from being dropped - TODO confirm
+}
+}
+
+// simple and slow helper conversion to show contents of int grids like a real grid in the ui
+// (use eg to quickly display contents of the particle-index grid)
+
+void debugIntToReal(const Grid<int> &source, Grid<Real> &dest, Real factor = 1.)
+{
+  FOR_IJK(source) {
+    const Real asReal = (Real)source(i, j, k);  // convert the integer cell value first
+    dest(i, j, k) = asReal * factor;
+  }
+}
+static PyObject *_W_6(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // generated Python binding for debugIntToReal()
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional "notiming" argument, default 0
+    pbPreparePlugin(parent, "debugIntToReal", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // lives until the closing brace of this block
+      const Grid<int> &source = *_args.getPtr<Grid<int>>("source", 0, &_lock);
+      Grid<Real> &dest = *_args.getPtr<Grid<Real>>("dest", 1, &_lock);
+      Real factor = _args.getOpt<Real>("factor", 2, 1., &_lock);  // default 1. mirrors the C++ default argument
+      _retval = getPyNone();  // the plugin returns void
+      debugIntToReal(source, dest, factor);
+      _args.check();  // NOTE(review): presumably verifies that all passed arguments were consumed - confirm
+    }
+    pbFinalizePlugin(parent, "debugIntToReal", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("debugIntToReal", e.what());
+    return 0;  // null result after the error has been recorded
+  }
+}
+static const Pb::Register _RP_debugIntToReal("", "debugIntToReal", _W_6);  // ties the plugin name to wrapper _W_6
+extern "C" {
+void PbRegister_debugIntToReal()
+{
+  KEEP_UNUSED(_RP_debugIntToReal);  // references the registration object; presumably keeps it from being dropped - TODO confirm
+}
+}
+
+// build a grid that contains indices for a particle system
+// the particles in a cell i,j,k are particles[index(i,j,k)] to particles[index(i+1,j,k)-1]
+// (ie,  particles[index(i+1,j,k)] already belongs to cell i+1,j,k)
+
+void gridParticleIndex(const BasicParticleSystem &parts,
+                       ParticleIndexSystem &indexSys,
+                       const FlagGrid &flags,
+                       Grid<int> &index,
+                       Grid<int> *counter = NULL)
+{
+  // Builds a cell-sorted particle index:
+  //  - index(i,j,k) receives the position in indexSys of the first particle of that cell
+  //  - indexSys[n].sourceIndex receives the position of the n-th sorted particle in parts
+  // Inactive particles and particles outside the grid are left out of indexSys.
+  // counter is optional scratch storage holding the per-cell fill level during the scatter.
+
+  // Without a caller-provided counter, recurse once with a stack-allocated grid. The grid is
+  // released automatically, also when an exception escapes (a heap allocation paired with a
+  // manual delete at the end of the function would leak in that case).
+  if (!counter) {
+    Grid<int> localCounter(flags.getParent());
+    gridParticleIndex(parts, indexSys, flags, index, &localCounter);
+    return;
+  }
+  counter->clear();
+
+  // pass 1: count the indexable particles per cell (no particle is modified or deleted)
+  index.clear();
+  int inactive = 0;
+  for (IndexInt idx = 0; idx < (IndexInt)parts.size(); idx++) {
+    if (!parts.isActive(idx)) {
+      inactive++;
+      continue;
+    }
+    // check index for validity...
+    Vec3i p = toVec3i(parts.getPos(idx));
+    if (!index.isInBounds(p)) {
+      // particles outside of the grid are skipped like inactive ones
+      inactive++;
+      continue;
+    }
+    index(p)++;
+  }
+
+  // note - this one might be smaller...
+  indexSys.resize(parts.size() - inactive);
+
+  // pass 2: convert per cell number to continuous index (start offset of each cell)
+  IndexInt offset = 0;
+  FOR_IJK(index)
+  {
+    int num = index(i, j, k);
+    index(i, j, k) = offset;
+    offset += num;
+  }
+
+  // pass 3: add particles to indexed array, counter supplies the next free slot of a cell
+  for (IndexInt idx = 0; idx < (IndexInt)parts.size(); idx++) {
+    if (!parts.isActive(idx))
+      continue;
+    Vec3i p = toVec3i(parts.getPos(idx));
+    if (!index.isInBounds(p))
+      continue;
+    // initialize index into original array
+    indexSys[index(p) + (*counter)(p)].sourceIndex = idx;
+    (*counter)(p)++;
+  }
+}
+static PyObject *_W_7(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // generated Python binding for gridParticleIndex()
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional "notiming" argument, default 0
+    pbPreparePlugin(parent, "gridParticleIndex", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // lives until the closing brace of this block
+      const BasicParticleSystem &parts = *_args.getPtr<BasicParticleSystem>("parts", 0, &_lock);
+      ParticleIndexSystem &indexSys = *_args.getPtr<ParticleIndexSystem>("indexSys", 1, &_lock);
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 2, &_lock);
+      Grid<int> &index = *_args.getPtr<Grid<int>>("index", 3, &_lock);
+      Grid<int> *counter = _args.getPtrOpt<Grid<int>>("counter", 4, NULL, &_lock);  // stays NULL when not passed
+      _retval = getPyNone();  // the plugin returns void
+      gridParticleIndex(parts, indexSys, flags, index, counter);
+      _args.check();  // NOTE(review): presumably verifies that all passed arguments were consumed - confirm
+    }
+    pbFinalizePlugin(parent, "gridParticleIndex", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("gridParticleIndex", e.what());
+    return 0;  // null result after the error has been recorded
+  }
+}
+static const Pb::Register _RP_gridParticleIndex("", "gridParticleIndex", _W_7);  // ties the plugin name to wrapper _W_7
+extern "C" {
+void PbRegister_gridParticleIndex()
+{
+  KEEP_UNUSED(_RP_gridParticleIndex);  // references the registration object; presumably keeps it from being dropped - TODO confirm
+}
+}
+
+struct ComputeUnionLevelsetPindex : public KernelBase {  // generated kernel: per cell, minimum over nearby particles of (distance to particle - radius)
+  ComputeUnionLevelsetPindex(const Grid<int> &index,
+                             const BasicParticleSystem &parts,
+                             const ParticleIndexSystem &indexSys,
+                             LevelsetGrid &phi,
+                             const Real radius,
+                             const ParticleDataImpl<int> *ptype,
+                             const int exclude)
+      : KernelBase(&index, 0),
+        index(index),
+        parts(parts),
+        indexSys(indexSys),
+        phi(phi),
+        radius(radius),
+        ptype(ptype),
+        exclude(exclude)
+  {
+    runMessage();
+    run();  // the kernel executes as part of construction
+  }
+  inline void op(int i,
+                 int j,
+                 int k,
+                 const Grid<int> &index,
+                 const BasicParticleSystem &parts,
+                 const ParticleIndexSystem &indexSys,
+                 LevelsetGrid &phi,
+                 const Real radius,
+                 const ParticleDataImpl<int> *ptype,
+                 const int exclude) const
+  {
+    const Vec3 gridPos = Vec3(i, j, k) + Vec3(0.5);  // shifted by half cell
+    Real phiv = radius * 1.0;                        // outside
+
+    int r = int(radius) + 1;  // search range in cells around (i, j, k)
+    int rZ = phi.is3D() ? r : 0;  // no z neighbours in 2D
+    for (int zj = k - rZ; zj <= k + rZ; zj++)
+      for (int yj = j - r; yj <= j + r; yj++)
+        for (int xj = i - r; xj <= i + r; xj++) {
+          if (!phi.isInBounds(Vec3i(xj, yj, zj)))
+            continue;
+
+          // note, for the particle indices in indexSys the access is periodic (ie, dont skip for
+          // eg inBounds(sx,10,10)
+          IndexInt isysIdxS = index.index(xj, yj, zj);  // linear index of the neighbour cell
+          IndexInt pStart = index(isysIdxS), pEnd = 0;  // the cell's particles are indexSys[pStart .. pEnd-1]
+          if (phi.isInBounds(isysIdxS + 1))
+            pEnd = index(isysIdxS + 1);  // range ends where the next cell's range starts
+          else
+            pEnd = indexSys.size();  // no next cell: range ends with the index system
+
+          // now loop over particles in cell
+          for (IndexInt p = pStart; p < pEnd; ++p) {
+            const int psrc = indexSys[p].sourceIndex;  // position of the particle in parts
+            if (ptype && ((*ptype)[psrc] & exclude))  // skip particles whose type has a bit of exclude set
+              continue;
+            const Vec3 pos = parts[psrc].pos;
+            phiv = std::min(phiv, fabs(norm(gridPos - pos)) - radius);  // distance from the cell center to the particle, minus radius
+          }
+        }
+    phi(i, j, k) = phiv;  // written unconditionally; stays at radius when no particle was found
+  }
+  inline const Grid<int> &getArg0()  // accessor for kernel argument 0
+  {
+    return index;
+  }
+  typedef Grid<int> type0;
+  inline const BasicParticleSystem &getArg1()  // accessor for kernel argument 1
+  {
+    return parts;
+  }
+  typedef BasicParticleSystem type1;
+  inline const ParticleIndexSystem &getArg2()  // accessor for kernel argument 2
+  {
+    return indexSys;
+  }
+  typedef ParticleIndexSystem type2;
+  inline LevelsetGrid &getArg3()  // accessor for kernel argument 3
+  {
+    return phi;
+  }
+  typedef LevelsetGrid type3;
+  inline const Real &getArg4()  // accessor for kernel argument 4
+  {
+    return radius;
+  }
+  typedef Real type4;
+  inline const ParticleDataImpl<int> *getArg5()  // accessor for kernel argument 5
+  {
+    return ptype;
+  }
+  typedef ParticleDataImpl<int> type5;
+  inline const int &getArg6()  // accessor for kernel argument 6
+  {
+    return exclude;
+  }
+  typedef int type6;
+  void runMessage()  // debug output: kernel name (level 3) and iteration range (level 4)
+  {
+    debMsg("Executing kernel ComputeUnionLevelsetPindex ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const  // body invoked by tbb::parallel_for for one sub-range
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {  // here the range indexes k slices
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, index, parts, indexSys, phi, radius, ptype, exclude);
+    }
+    else {  // single slice k = 0: the range indexes rows j
+      const int k = 0;
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = 0; i < _maxX; i++)
+          op(i, j, k, index, parts, indexSys, phi, radius, ptype, exclude);
+    }
+  }
+  void run()  // parallel over k when maxZ > 1, otherwise parallel over j
+  {
+    if (maxZ > 1)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
+  }
+  const Grid<int> &index;
+  const BasicParticleSystem &parts;
+  const ParticleIndexSystem &indexSys;
+  LevelsetGrid &phi;
+  const Real radius;
+  const ParticleDataImpl<int> *ptype;
+  const int exclude;
+};
+
+void unionParticleLevelset(const BasicParticleSystem &parts,
+                           const ParticleIndexSystem &indexSys,
+                           const FlagGrid &flags,
+                           const Grid<int> &index,
+                           LevelsetGrid &phi,
+                           const Real radiusFactor = 1.,
+                           const ParticleDataImpl<int> *ptype = NULL,
+                           const int exclude = 0)
+{
+  // particle radius: half of the value calculateRadiusFactor derives from the cell diagonal
+  const Real particleRadius = 0.5 * calculateRadiusFactor(phi, radiusFactor);
+  // the kernel assigns every cell it visits, hence phi is not reset beforehand
+  ComputeUnionLevelsetPindex(index, parts, indexSys, phi, particleRadius, ptype, exclude);
+  // NOTE(review): setBound(0.5, 0) presumably writes 0.5 into the outermost layer - confirm
+  phi.setBound(0.5, 0);
+}
+static PyObject *_W_8(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // generated Python binding for unionParticleLevelset()
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional "notiming" argument, default 0
+    pbPreparePlugin(parent, "unionParticleLevelset", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // lives until the closing brace of this block
+      const BasicParticleSystem &parts = *_args.getPtr<BasicParticleSystem>("parts", 0, &_lock);
+      const ParticleIndexSystem &indexSys = *_args.getPtr<ParticleIndexSystem>(
+          "indexSys", 1, &_lock);
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 2, &_lock);
+      const Grid<int> &index = *_args.getPtr<Grid<int>>("index", 3, &_lock);
+      LevelsetGrid &phi = *_args.getPtr<LevelsetGrid>("phi", 4, &_lock);
+      const Real radiusFactor = _args.getOpt<Real>("radiusFactor", 5, 1., &_lock);  // the getOpt defaults mirror the C++ default arguments
+      const ParticleDataImpl<int> *ptype = _args.getPtrOpt<ParticleDataImpl<int>>(
+          "ptype", 6, NULL, &_lock);
+      const int exclude = _args.getOpt<int>("exclude", 7, 0, &_lock);
+      _retval = getPyNone();  // the plugin returns void
+      unionParticleLevelset(parts, indexSys, flags, index, phi, radiusFactor, ptype, exclude);
+      _args.check();  // NOTE(review): presumably verifies that all passed arguments were consumed - confirm
+    }
+    pbFinalizePlugin(parent, "unionParticleLevelset", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("unionParticleLevelset", e.what());
+    return 0;  // null result after the error has been recorded
+  }
+}
+static const Pb::Register _RP_unionParticleLevelset("", "unionParticleLevelset", _W_8);  // ties the name "unionParticleLevelset" to wrapper _W_8
+extern "C" {
+void PbRegister_unionParticleLevelset()  // C-linkage function whose body only references the registration object
+{
+  KEEP_UNUSED(_RP_unionParticleLevelset);  // NOTE(review): presumably keeps the static object from being dropped - confirm macro
+}
+}
+
+//! kernel for computing averaged particle level set weights
+//! (per cell: weighted average of nearby particle positions/radii, phi = |cell center - avg pos| - avg radius)
+struct ComputeAveragedLevelsetWeight : public KernelBase {
+  ComputeAveragedLevelsetWeight(const BasicParticleSystem &parts,
+                                const Grid<int> &index,
+                                const ParticleIndexSystem &indexSys,
+                                LevelsetGrid &phi,
+                                const Real radius,
+                                const ParticleDataImpl<int> *ptype,
+                                const int exclude,
+                                Grid<Vec3> *save_pAcc = NULL,
+                                Grid<Real> *save_rAcc = NULL)
+      : KernelBase(&index, 0),
+        parts(parts),
+        index(index),
+        indexSys(indexSys),
+        phi(phi),
+        radius(radius),
+        ptype(ptype),
+        exclude(exclude),
+        save_pAcc(save_pAcc),
+        save_rAcc(save_rAcc)
+  {
+    runMessage();
+    run();  // the kernel executes as part of construction
+  }
+  inline void op(int i,
+                 int j,
+                 int k,
+                 const BasicParticleSystem &parts,
+                 const Grid<int> &index,
+                 const ParticleIndexSystem &indexSys,
+                 LevelsetGrid &phi,
+                 const Real radius,
+                 const ParticleDataImpl<int> *ptype,
+                 const int exclude,
+                 Grid<Vec3> *save_pAcc = NULL,
+                 Grid<Real> *save_rAcc = NULL) const
+  {
+    const Vec3 gridPos = Vec3(i, j, k) + Vec3(0.5);  // shifted by half cell
+    Real phiv = radius * 1.0;                        // outside
+
+    // loop over neighborhood, similar to ComputeUnionLevelsetPindex
+    const Real sradiusInv = 1. / (4. * radius * radius);  // 1 / (2*radius)^2: weights vanish beyond distance 2*radius
+    int r = int(1. * radius) + 1;  // number of cells searched in each direction
+    int rZ = phi.is3D() ? r : 0;  // no z neighbours in 2D
+    // accumulators
+    Real wacc = 0.;  // sum of weights
+    Vec3 pacc = Vec3(0.);  // weighted sum of particle positions
+    Real racc = 0.;  // weighted sum of radii
+
+    for (int zj = k - rZ; zj <= k + rZ; zj++)
+      for (int yj = j - r; yj <= j + r; yj++)
+        for (int xj = i - r; xj <= i + r; xj++) {
+          if (!phi.isInBounds(Vec3i(xj, yj, zj)))
+            continue;
+
+          IndexInt isysIdxS = index.index(xj, yj, zj);  // linear index of the neighbour cell
+          IndexInt pStart = index(isysIdxS), pEnd = 0;  // index grid holds the cell's first slot in indexSys
+          if (phi.isInBounds(isysIdxS + 1))
+            pEnd = index(isysIdxS + 1);  // the cell's entries end where the next cell's begin
+          else
+            pEnd = indexSys.size();  // last cell: entries run to the end of indexSys
+          for (IndexInt p = pStart; p < pEnd; ++p) {
+            IndexInt psrc = indexSys[p].sourceIndex;  // index of the particle in parts / ptype
+            if (ptype && ((*ptype)[psrc] & exclude))
+              continue;  // particle type matches the exclude mask
+
+            Vec3 pos = parts[psrc].pos;
+            Real s = normSquare(gridPos - pos) * sradiusInv;  // squared distance, normalized by (2*radius)^2
+            // Real  w = std::max(0., cubed(1.-s) );
+            Real w = std::max(0., (1. - s));  // a bit smoother
+            wacc += w;
+            racc += radius * w;
+            pacc += pos * w;
+          }
+        }
+
+    if (wacc > VECTOR_EPSILON) {  // at least one particle contributed
+      racc /= wacc;
+      pacc /= wacc;
+      phiv = fabs(norm(gridPos - pacc)) - racc;  // distance to averaged position minus averaged radius
+
+      if (save_pAcc)
+        (*save_pAcc)(i, j, k) = pacc;  // optional outputs, written only for cells with contributions
+      if (save_rAcc)
+        (*save_rAcc)(i, j, k) = racc;
+    }
+    phi(i, j, k) = phiv;  // stays at radius when no particle contributed
+  }
+  inline const BasicParticleSystem &getArg0()
+  {
+    return parts;
+  }
+  typedef BasicParticleSystem type0;
+  inline const Grid<int> &getArg1()
+  {
+    return index;
+  }
+  typedef Grid<int> type1;
+  inline const ParticleIndexSystem &getArg2()
+  {
+    return indexSys;
+  }
+  typedef ParticleIndexSystem type2;
+  inline LevelsetGrid &getArg3()
+  {
+    return phi;
+  }
+  typedef LevelsetGrid type3;
+  inline const Real &getArg4()
+  {
+    return radius;
+  }
+  typedef Real type4;
+  inline const ParticleDataImpl<int> *getArg5()
+  {
+    return ptype;
+  }
+  typedef ParticleDataImpl<int> type5;
+  inline const int &getArg6()
+  {
+    return exclude;
+  }
+  typedef int type6;
+  inline Grid<Vec3> *getArg7()
+  {
+    return save_pAcc;
+  }
+  typedef Grid<Vec3> type7;
+  inline Grid<Real> *getArg8()
+  {
+    return save_rAcc;
+  }
+  typedef Grid<Real> type8;
+  void runMessage()
+  {
+    debMsg("Executing kernel ComputeAveragedLevelsetWeight ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const  // body invoked by tbb::parallel_for for one sub-range
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {  // 3D: the range enumerates z slices
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, parts, index, indexSys, phi, radius, ptype, exclude, save_pAcc, save_rAcc);
+    }
+    else {  // 2D: the range enumerates rows of the single slice k = 0
+      const int k = 0;
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = 0; i < _maxX; i++)
+          op(i, j, k, parts, index, indexSys, phi, radius, ptype, exclude, save_pAcc, save_rAcc);
+    }
+  }
+  void run()
+  {
+    if (maxZ > 1)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);  // split over z
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);  // split over y
+  }
+  const BasicParticleSystem &parts;
+  const Grid<int> &index;
+  const ParticleIndexSystem &indexSys;
+  LevelsetGrid &phi;
+  const Real radius;
+  const ParticleDataImpl<int> *ptype;
+  const int exclude;
+  Grid<Vec3> *save_pAcc;  // may be NULL
+  Grid<Real> *save_rAcc;  // may be NULL
+};
+
+//! Returns the value stored in cell (i,j,k) of val.
+//! The grid is taken by const reference so a call does not copy the whole grid.
+//! center is accepted for interface compatibility but is not used.
+template<class T> T smoothingValue(const Grid<T> &val, int i, int j, int k, T center)
+{
+  (void)center;  // intentionally unused
+  return val(i, j, k);
+}
+
+// smoothing kernels (plain smoothing, and a variant that only accepts smaller values)
+
+template<class T> struct knSmoothGrid : public KernelBase {
+  //! Stores the arguments and executes the kernel right away.
+  knSmoothGrid(const Grid<T> &me, Grid<T> &tmp, Real factor)
+      : KernelBase(&me, 1), me(me), tmp(tmp), factor(factor)
+  {
+    runMessage();
+    run();
+  }
+
+  //! Writes factor * (cell value + axis-neighbour values of me) into tmp at (i,j,k).
+  //! The two z neighbours are only added for 3D grids.
+  inline void op(int i, int j, int k, const Grid<T> &me, Grid<T> &tmp, Real factor) const
+  {
+    T sum = me(i, j, k);
+    sum += me(i + 1, j, k);
+    sum += me(i - 1, j, k);
+    sum += me(i, j + 1, k);
+    sum += me(i, j - 1, k);
+    if (me.is3D()) {
+      sum += me(i, j, k + 1) + me(i, j, k - 1);
+    }
+    tmp(i, j, k) = sum * factor;
+  }
+
+  // argument accessors and their types
+  inline const Grid<T> &getArg0()
+  {
+    return me;
+  }
+  using type0 = Grid<T>;
+  inline Grid<T> &getArg1()
+  {
+    return tmp;
+  }
+  using type1 = Grid<T>;
+  inline Real &getArg2()
+  {
+    return factor;
+  }
+  using type2 = Real;
+
+  void runMessage()
+  {
+    debMsg("Executing kernel knSmoothGrid ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  }
+
+  //! Processes one sub-range handed out by tbb::parallel_for.
+  void operator()(const tbb::blocked_range<IndexInt> &range) const
+  {
+    const int xEnd = maxX;
+    const int yEnd = maxY;
+    const int first = range.begin();
+    const int last = (int)range.end();
+
+    if (maxZ > 1) {
+      // 3D: the range enumerates z slices
+      for (int k = first; k != last; ++k)
+        for (int j = 1; j < yEnd; ++j)
+          for (int i = 1; i < xEnd; ++i)
+            op(i, j, k, me, tmp, factor);
+      return;
+    }
+
+    // 2D: the range enumerates rows of the single slice k = 0
+    for (int j = first; j != last; ++j)
+      for (int i = 1; i < xEnd; ++i)
+        op(i, j, 0, me, tmp, factor);
+  }
+
+  //! Splits the work over z (3D) or over y (2D).
+  void run()
+  {
+    if (maxZ > 1) {
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+      return;
+    }
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
+  }
+
+  const Grid<T> &me;
+  Grid<T> &tmp;
+  Real factor;
+};
+
+//! Like knSmoothGrid, but the smoothed value is stored only where it is smaller than
+//! the value already present in tmp; elsewhere tmp receives the unsmoothed value of me.
+template<class T> struct knSmoothGridNeg : public KernelBase {
+  //! Stores the arguments and executes the kernel right away.
+  knSmoothGridNeg(const Grid<T> &me, Grid<T> &tmp, Real factor)
+      : KernelBase(&me, 1), me(me), tmp(tmp), factor(factor)
+  {
+    runMessage();
+    run();
+  }
+
+  inline void op(int i, int j, int k, const Grid<T> &me, Grid<T> &tmp, Real factor) const
+  {
+    // same stencil sum as knSmoothGrid
+    T smoothed = me(i, j, k);
+    smoothed += me(i + 1, j, k);
+    smoothed += me(i - 1, j, k);
+    smoothed += me(i, j + 1, k);
+    smoothed += me(i, j - 1, k);
+    if (me.is3D()) {
+      smoothed += me(i, j, k + 1) + me(i, j, k - 1);
+    }
+    smoothed *= factor;
+
+    // keep the smoothed value only if it is below what tmp currently holds
+    T &target = tmp(i, j, k);
+    target = (smoothed < target) ? smoothed : me(i, j, k);
+  }
+
+  // argument accessors and their types
+  inline const Grid<T> &getArg0()
+  {
+    return me;
+  }
+  using type0 = Grid<T>;
+  inline Grid<T> &getArg1()
+  {
+    return tmp;
+  }
+  using type1 = Grid<T>;
+  inline Real &getArg2()
+  {
+    return factor;
+  }
+  using type2 = Real;
+
+  void runMessage()
+  {
+    debMsg("Executing kernel knSmoothGridNeg ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  }
+
+  //! Processes one sub-range handed out by tbb::parallel_for.
+  void operator()(const tbb::blocked_range<IndexInt> &range) const
+  {
+    const int xEnd = maxX;
+    const int yEnd = maxY;
+    const int first = range.begin();
+    const int last = (int)range.end();
+
+    if (maxZ > 1) {
+      // 3D: the range enumerates z slices
+      for (int k = first; k != last; ++k)
+        for (int j = 1; j < yEnd; ++j)
+          for (int i = 1; i < xEnd; ++i)
+            op(i, j, k, me, tmp, factor);
+      return;
+    }
+
+    // 2D: the range enumerates rows of the single slice k = 0
+    for (int j = first; j != last; ++j)
+      for (int i = 1; i < xEnd; ++i)
+        op(i, j, 0, me, tmp, factor);
+  }
+
+  //! Splits the work over z (3D) or over y (2D).
+  void run()
+  {
+    if (maxZ > 1) {
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+      return;
+    }
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
+  }
+
+  const Grid<T> &me;
+  Grid<T> &tmp;
+  Real factor;
+};
+
+//! Zhu & Bridson particle level set creation.
+//! Fills phi with ComputeAveragedLevelsetWeight, then runs max(smoothen, smoothenNeg)
+//! post-processing passes: pass i applies knSmoothGrid if i < smoothen and
+//! knSmoothGridNeg if i < smoothenNeg. Finally calls phi.setBound(0.5, 0).
+
+void averagedParticleLevelset(const BasicParticleSystem &parts,
+                              const ParticleIndexSystem &indexSys,
+                              const FlagGrid &flags,
+                              const Grid<int> &index,
+                              LevelsetGrid &phi,
+                              const Real radiusFactor = 1.,
+                              const int smoothen = 1,
+                              const int smoothenNeg = 1,
+                              const ParticleDataImpl<int> *ptype = NULL,
+                              const int exclude = 0)
+{
+  // base radius is half a cell diagonal (scaled by radiusFactor)
+  const Real baseRadius = 0.5 * calculateRadiusFactor(phi, radiusFactor);
+  ComputeAveragedLevelsetWeight(parts, index, indexSys, phi, baseRadius, ptype, exclude);
+
+  // post-process level-set
+  const int passCount = std::max(smoothen, smoothenNeg);
+  for (int pass = 0; pass < passCount; ++pass) {
+    // fresh scratch grid per pass; after each kernel it is swapped with phi
+    LevelsetGrid scratch(flags.getParent());
+
+    if (pass < smoothen) {
+      knSmoothGrid<Real>(phi, scratch, 1. / (phi.is3D() ? 7. : 5.));
+      phi.swap(scratch);
+    }
+
+    if (pass < smoothenNeg) {
+      knSmoothGridNeg<Real>(phi, scratch, 1. / (phi.is3D() ? 7. : 5.));
+      phi.swap(scratch);
+    }
+  }
+
+  phi.setBound(0.5, 0);
+}
+static PyObject *_W_9(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // Python-callable wrapper around averagedParticleLevelset()
+{
+  try {
+    PbArgs _args(_linargs, _kwds);  // combines positional and keyword arguments
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional flag, defaults to 0
+    pbPreparePlugin(parent, "averagedParticleLevelset", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // handed to every getter below; destroyed at the closing brace of this scope
+      const BasicParticleSystem &parts = *_args.getPtr<BasicParticleSystem>("parts", 0, &_lock);
+      const ParticleIndexSystem &indexSys = *_args.getPtr<ParticleIndexSystem>(
+          "indexSys", 1, &_lock);
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 2, &_lock);
+      const Grid<int> &index = *_args.getPtr<Grid<int>>("index", 3, &_lock);
+      LevelsetGrid &phi = *_args.getPtr<LevelsetGrid>("phi", 4, &_lock);
+      const Real radiusFactor = _args.getOpt<Real>("radiusFactor", 5, 1., &_lock);  // defaults mirror the C++ signature
+      const int smoothen = _args.getOpt<int>("smoothen", 6, 1, &_lock);
+      const int smoothenNeg = _args.getOpt<int>("smoothenNeg", 7, 1, &_lock);
+      const ParticleDataImpl<int> *ptype = _args.getPtrOpt<ParticleDataImpl<int>>(
+          "ptype", 8, NULL, &_lock);
+      const int exclude = _args.getOpt<int>("exclude", 9, 0, &_lock);
+      _retval = getPyNone();  // the wrapped function returns void
+      averagedParticleLevelset(
+          parts, indexSys, flags, index, phi, radiusFactor, smoothen, smoothenNeg, ptype, exclude);
+      _args.check();  // NOTE(review): presumably reports unused/unknown arguments - confirm in PbArgs
+    }
+    pbFinalizePlugin(parent, "averagedParticleLevelset", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {  // report the failure through pbSetError and return a null result
+    pbSetError("averagedParticleLevelset", e.what());
+    return 0;
+  }
+}
+static const Pb::Register _RP_averagedParticleLevelset("", "averagedParticleLevelset", _W_9);  // ties the name "averagedParticleLevelset" to wrapper _W_9
+extern "C" {
+void PbRegister_averagedParticleLevelset()  // C-linkage function whose body only references the registration object
+{
+  KEEP_UNUSED(_RP_averagedParticleLevelset);  // NOTE(review): presumably keeps the static object from being dropped - confirm macro
+}
+}
+
+//! kernel for improvedParticleLevelset
+//! For every cell with rAcc > VECTOR_EPSILON, recomputes phi as
+//! |cell center - pAcc| - rAcc * correction, capped at radius. The correction factor
+//! is derived from the largest eigenvalue of the Jacobian of pAcc and the thresholds
+//! t_low / t_high. Cells with rAcc <= VECTOR_EPSILON are left untouched.
+
+struct correctLevelset : public KernelBase {
+  //! Stores the arguments and executes the kernel right away.
+  correctLevelset(LevelsetGrid &phi,
+                  const Grid<Vec3> &pAcc,
+                  const Grid<Real> &rAcc,
+                  const Real radius,
+                  const Real t_low,
+                  const Real t_high)
+      : KernelBase(&phi, 1),
+        phi(phi),
+        pAcc(pAcc),
+        rAcc(rAcc),
+        radius(radius),
+        t_low(t_low),
+        t_high(t_high)
+  {
+    runMessage();
+    run();
+  }
+  inline void op(int i,
+                 int j,
+                 int k,
+                 LevelsetGrid &phi,
+                 const Grid<Vec3> &pAcc,
+                 const Grid<Real> &rAcc,
+                 const Real radius,
+                 const Real t_low,
+                 const Real t_high) const
+  {
+    if (rAcc(i, j, k) <= VECTOR_EPSILON)
+      return;  // outside nothing happens
+
+    // In 2D the kernel runs with k = 0 and there are no z neighbours, so the z
+    // derivatives are taken as zero instead of reading cells k - 1 / k + 1
+    // (the smoothing kernels in this file guard their z accesses the same way).
+    const bool is3D = pAcc.is3D();
+
+    // create jacobian of pAcc via central differences
+    Matrix3x3f jacobian = Matrix3x3f(
+        0.5 * (pAcc(i + 1, j, k).x - pAcc(i - 1, j, k).x),
+        0.5 * (pAcc(i, j + 1, k).x - pAcc(i, j - 1, k).x),
+        is3D ? 0.5 * (pAcc(i, j, k + 1).x - pAcc(i, j, k - 1).x) : 0.,
+        0.5 * (pAcc(i + 1, j, k).y - pAcc(i - 1, j, k).y),
+        0.5 * (pAcc(i, j + 1, k).y - pAcc(i, j - 1, k).y),
+        is3D ? 0.5 * (pAcc(i, j, k + 1).y - pAcc(i, j, k - 1).y) : 0.,
+        0.5 * (pAcc(i + 1, j, k).z - pAcc(i - 1, j, k).z),
+        0.5 * (pAcc(i, j + 1, k).z - pAcc(i, j - 1, k).z),
+        is3D ? 0.5 * (pAcc(i, j, k + 1).z - pAcc(i, j, k - 1).z) : 0.);
+
+    // compute largest eigenvalue of jacobian
+    Vec3 EV = jacobian.eigenvalues();
+    Real maxEV = std::max(std::max(EV.x, EV.y), EV.z);
+
+    // calculate correction factor
+    // NOTE(review): t_high == t_low makes the division below degenerate; the
+    // defaults used by improvedParticleLevelset (0.4 / 3.5) are fine.
+    Real correction = 1;
+    if (maxEV >= t_low) {
+      Real t = (t_high - maxEV) / (t_high - t_low);
+      correction = t * t * t - 3 * t * t + 3 * t;
+    }
+    correction = (correction < 0) ?
+                     0 :
+                     correction;  // enforce correction factor to [0,1] (not explicitly in paper)
+
+    const Vec3 gridPos = Vec3(i, j, k) + Vec3(0.5);  // shifted by half cell
+    const Real correctedPhi = fabs(norm(gridPos - pAcc(i, j, k))) - rAcc(i, j, k) * correction;
+    phi(i, j, k) = (correctedPhi > radius) ?
+                       radius :
+                       correctedPhi;  // adjust too high outside values when too few particles are
+                                      // nearby to make smoothing possible (not in paper)
+  }
+  inline LevelsetGrid &getArg0()
+  {
+    return phi;
+  }
+  typedef LevelsetGrid type0;
+  inline const Grid<Vec3> &getArg1()
+  {
+    return pAcc;
+  }
+  typedef Grid<Vec3> type1;
+  inline const Grid<Real> &getArg2()
+  {
+    return rAcc;
+  }
+  typedef Grid<Real> type2;
+  inline const Real &getArg3()
+  {
+    return radius;
+  }
+  typedef Real type3;
+  inline const Real &getArg4()
+  {
+    return t_low;
+  }
+  typedef Real type4;
+  inline const Real &getArg5()
+  {
+    return t_high;
+  }
+  typedef Real type5;
+  void runMessage()
+  {
+    debMsg("Executing kernel correctLevelset ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  //! Processes one sub-range handed out by tbb::parallel_for.
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {
+      // 3D: the range enumerates z slices
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, phi, pAcc, rAcc, radius, t_low, t_high);
+    }
+    else {
+      // 2D: the range enumerates rows of the single slice k = 0
+      const int k = 0;
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = 1; i < _maxX; i++)
+          op(i, j, k, phi, pAcc, rAcc, radius, t_low, t_high);
+    }
+  }
+  //! Splits the work over z (3D) or over y (2D).
+  void run()
+  {
+    if (maxZ > 1)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
+  }
+  LevelsetGrid &phi;
+  const Grid<Vec3> &pAcc;
+  const Grid<Real> &rAcc;
+  const Real radius;
+  const Real t_low;
+  const Real t_high;
+};
+
+//! Approach from "A unified particle model for fluid-solid interactions" by Solenthaler et al. in
+//! 2007.
+//! Runs ComputeAveragedLevelsetWeight (keeping its averaged positions and radii in
+//! temporary grids), applies correctLevelset with t_low / t_high, then performs the
+//! same smoothing passes as averagedParticleLevelset and calls phi.setBound(0.5, 0).
+
+void improvedParticleLevelset(const BasicParticleSystem &parts,
+                              const ParticleIndexSystem &indexSys,
+                              const FlagGrid &flags,
+                              const Grid<int> &index,
+                              LevelsetGrid &phi,
+                              const Real radiusFactor = 1.,
+                              const int smoothen = 1,
+                              const int smoothenNeg = 1,
+                              const Real t_low = 0.4,
+                              const Real t_high = 3.5,
+                              const ParticleDataImpl<int> *ptype = NULL,
+                              const int exclude = 0)
+{
+  // temporary grids receiving the per-cell averages from the weight kernel
+  Grid<Vec3> avgPositions(flags.getParent());
+  Grid<Real> avgRadii(flags.getParent());
+
+  // use half a cell diagonal as base radius
+  const Real baseRadius = 0.5 * calculateRadiusFactor(phi, radiusFactor);
+
+  ComputeAveragedLevelsetWeight(
+      parts, index, indexSys, phi, baseRadius, ptype, exclude, &avgPositions, &avgRadii);
+  correctLevelset(phi, avgPositions, avgRadii, baseRadius, t_low, t_high);
+
+  // post-process level-set
+  const int passCount = std::max(smoothen, smoothenNeg);
+  for (int pass = 0; pass < passCount; ++pass) {
+    // fresh scratch grid per pass; after each kernel it is swapped with phi
+    LevelsetGrid scratch(flags.getParent());
+
+    if (pass < smoothen) {
+      knSmoothGrid<Real>(phi, scratch, 1. / (phi.is3D() ? 7. : 5.));
+      phi.swap(scratch);
+    }
+
+    if (pass < smoothenNeg) {
+      knSmoothGridNeg<Real>(phi, scratch, 1. / (phi.is3D() ? 7. : 5.));
+      phi.swap(scratch);
+    }
+  }
+
+  phi.setBound(0.5, 0);
+}
+static PyObject *_W_10(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // Python-callable wrapper around improvedParticleLevelset()
+{
+  try {
+    PbArgs _args(_linargs, _kwds);  // combines positional and keyword arguments
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional flag, defaults to 0
+    pbPreparePlugin(parent, "improvedParticleLevelset", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // handed to every getter below; destroyed at the closing brace of this scope
+      const BasicParticleSystem &parts = *_args.getPtr<BasicParticleSystem>("parts", 0, &_lock);
+      const ParticleIndexSystem &indexSys = *_args.getPtr<ParticleIndexSystem>(
+          "indexSys", 1, &_lock);
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 2, &_lock);
+      const Grid<int> &index = *_args.getPtr<Grid<int>>("index", 3, &_lock);
+      LevelsetGrid &phi = *_args.getPtr<LevelsetGrid>("phi", 4, &_lock);
+      const Real radiusFactor = _args.getOpt<Real>("radiusFactor", 5, 1., &_lock);  // defaults mirror the C++ signature
+      const int smoothen = _args.getOpt<int>("smoothen", 6, 1, &_lock);
+      const int smoothenNeg = _args.getOpt<int>("smoothenNeg", 7, 1, &_lock);
+      const Real t_low = _args.getOpt<Real>("t_low", 8, 0.4, &_lock);
+      const Real t_high = _args.getOpt<Real>("t_high", 9, 3.5, &_lock);
+      const ParticleDataImpl<int> *ptype = _args.getPtrOpt<ParticleDataImpl<int>>(
+          "ptype", 10, NULL, &_lock);
+      const int exclude = _args.getOpt<int>("exclude", 11, 0, &_lock);
+      _retval = getPyNone();  // the wrapped function returns void
+      improvedParticleLevelset(parts,
+                               indexSys,
+                               flags,
+                               index,
+                               phi,
+                               radiusFactor,
+                               smoothen,
+                               smoothenNeg,
+                               t_low,
+                               t_high,
+                               ptype,
+                               exclude);
+      _args.check();  // NOTE(review): presumably reports unused/unknown arguments - confirm in PbArgs
+    }
+    pbFinalizePlugin(parent, "improvedParticleLevelset", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {  // report the failure through pbSetError and return a null result
+    pbSetError("improvedParticleLevelset", e.what());
+    return 0;
+  }
+}
+static const Pb::Register _RP_improvedParticleLevelset("", "improvedParticleLevelset", _W_10);  // ties the name "improvedParticleLevelset" to wrapper _W_10
+extern "C" {
+void PbRegister_improvedParticleLevelset()  // C-linkage function whose body only references the registration object
+{
+  KEEP_UNUSED(_RP_improvedParticleLevelset);  // NOTE(review): presumably keeps the static object from being dropped - confirm macro
+}
+}
+
+//! Per-particle kernel: particles whose interpolated phiObs value is below thresh are
+//! moved along the normalized phiObs gradient by (thresh - value + shift).
+struct knPushOutofObs : public KernelBase {
+  //! Stores the arguments and executes the kernel right away (one op per particle).
+  knPushOutofObs(BasicParticleSystem &parts,
+                 const FlagGrid &flags,
+                 const Grid<Real> &phiObs,
+                 const Real shift,
+                 const Real thresh,
+                 const ParticleDataImpl<int> *ptype,
+                 const int exclude)
+      : KernelBase(parts.size()),
+        parts(parts),
+        flags(flags),
+        phiObs(phiObs),
+        shift(shift),
+        thresh(thresh),
+        ptype(ptype),
+        exclude(exclude)
+  {
+    runMessage();
+    run();
+  }
+
+  inline void op(IndexInt idx,
+                 BasicParticleSystem &parts,
+                 const FlagGrid &flags,
+                 const Grid<Real> &phiObs,
+                 const Real shift,
+                 const Real thresh,
+                 const ParticleDataImpl<int> *ptype,
+                 const int exclude) const
+  {
+    // skip inactive particles and particles whose type matches the exclude mask
+    if (!parts.isActive(idx))
+      return;
+    if (ptype && ((*ptype)[idx] & exclude))
+      return;
+
+    const Vec3 pos = parts.getPos(idx);
+    const Vec3i cell = toVec3i(pos);
+    if (!flags.isInBounds(cell))
+      return;
+
+    const Real dist = phiObs.getInterpolated(pos);
+    if (!(dist < thresh))
+      return;  // not below the threshold: nothing to do
+
+    // direction: normalized gradient of phiObs at the particle's cell
+    Vec3 dir = getGradient(phiObs, cell.x, cell.y, cell.z);
+    if (normalize(dir) < VECTOR_EPSILON)
+      return;  // no usable direction
+
+    parts.setPos(idx, pos + dir * (thresh - dist + shift));
+  }
+
+  // argument accessors and their types
+  inline BasicParticleSystem &getArg0()
+  {
+    return parts;
+  }
+  using type0 = BasicParticleSystem;
+  inline const FlagGrid &getArg1()
+  {
+    return flags;
+  }
+  using type1 = FlagGrid;
+  inline const Grid<Real> &getArg2()
+  {
+    return phiObs;
+  }
+  using type2 = Grid<Real>;
+  inline const Real &getArg3()
+  {
+    return shift;
+  }
+  using type3 = Real;
+  inline const Real &getArg4()
+  {
+    return thresh;
+  }
+  using type4 = Real;
+  inline const ParticleDataImpl<int> *getArg5()
+  {
+    return ptype;
+  }
+  using type5 = ParticleDataImpl<int>;
+  inline const int &getArg6()
+  {
+    return exclude;
+  }
+  using type6 = int;
+
+  void runMessage()
+  {
+    debMsg("Executing kernel knPushOutofObs ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  }
+
+  //! Processes one sub-range of particle indices handed out by tbb::parallel_for.
+  void operator()(const tbb::blocked_range<IndexInt> &range) const
+  {
+    const IndexInt last = (IndexInt)range.end();
+    for (IndexInt particle = range.begin(); particle != last; ++particle)
+      op(particle, parts, flags, phiObs, shift, thresh, ptype, exclude);
+  }
+
+  //! Distributes all particle indices [0, size) over tbb::parallel_for.
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+
+  BasicParticleSystem &parts;
+  const FlagGrid &flags;
+  const Grid<Real> &phiObs;
+  const Real shift;
+  const Real thresh;
+  const ParticleDataImpl<int> *ptype;
+  const int exclude;
+};
+//! Push particles out of the obstacle level set phiObs.
+//! Thin entry point: all arguments are forwarded to the knPushOutofObs kernel, which
+//! runs on construction.
+
+void pushOutofObs(BasicParticleSystem &parts,
+                  const FlagGrid &flags,
+                  const Grid<Real> &phiObs,
+                  const Real shift = 0,
+                  const Real thresh = 0,
+                  const ParticleDataImpl<int> *ptype = NULL,
+                  const int exclude = 0)
+{
+  // constructing the kernel object executes it
+  knPushOutofObs(parts,
+                 flags,
+                 phiObs,
+                 shift,
+                 thresh,
+                 ptype,
+                 exclude);
+}
+static PyObject *_W_11(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // Python-callable wrapper around pushOutofObs()
+{
+  try {
+    PbArgs _args(_linargs, _kwds);  // combines positional and keyword arguments
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional flag, defaults to 0
+    pbPreparePlugin(parent, "pushOutofObs", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // handed to every getter below; destroyed at the closing brace of this scope
+      BasicParticleSystem &parts = *_args.getPtr<BasicParticleSystem>("parts", 0, &_lock);  // non-const: particle positions are modified
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 1, &_lock);
+      const Grid<Real> &phiObs = *_args.getPtr<Grid<Real>>("phiObs", 2, &_lock);
+      const Real shift = _args.getOpt<Real>("shift", 3, 0, &_lock);  // defaults mirror the C++ signature
+      const Real thresh = _args.getOpt<Real>("thresh", 4, 0, &_lock);
+      const ParticleDataImpl<int> *ptype = _args.getPtrOpt<ParticleDataImpl<int>>(
+          "ptype", 5, NULL, &_lock);
+      const int exclude = _args.getOpt<int>("exclude", 6, 0, &_lock);
+      _retval = getPyNone();  // the wrapped function returns void
+      pushOutofObs(parts, flags, phiObs, shift, thresh, ptype, exclude);
+      _args.check();  // NOTE(review): presumably reports unused/unknown arguments - confirm in PbArgs
+    }
+    pbFinalizePlugin(parent, "pushOutofObs", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {  // report the failure through pbSetError and return a null result
+    pbSetError("pushOutofObs", e.what());
+    return 0;
+  }
+}
+static const Pb::Register _RP_pushOutofObs("", "pushOutofObs", _W_11);  // ties the name "pushOutofObs" to wrapper _W_11
+extern "C" {
+void PbRegister_pushOutofObs()  // C-linkage function whose body only references the registration object
+{
+  KEEP_UNUSED(_RP_pushOutofObs);  // NOTE(review): presumably keeps the static object from being dropped - confirm macro
+}
+}
+
+//******************************************************************************
+// grid interpolation functions
+
+//! Per-cell kernel dividing me by the scalar grid other: cells where other is below
+//! cutoff are set to zero, all others are divided through safeDivide().
+template<class T> struct knSafeDivReal : public KernelBase {
+  //! Stores the arguments and executes the kernel right away.
+  knSafeDivReal(Grid<T> &me, const Grid<Real> &other, Real cutoff = VECTOR_EPSILON)
+      : KernelBase(&me, 0), me(me), other(other), cutoff(cutoff)
+  {
+    runMessage();
+    run();
+  }
+
+  inline void op(IndexInt idx,
+                 Grid<T> &me,
+                 const Grid<Real> &other,
+                 Real cutoff = VECTOR_EPSILON) const
+  {
+    if (other[idx] < cutoff) {
+      // divisor too small: zero the cell instead of dividing
+      me[idx] = 0.;
+      return;
+    }
+    // widen the scalar divisor to T before dividing
+    T divisor(other[idx]);
+    me[idx] = safeDivide(me[idx], divisor);
+  }
+
+  // argument accessors and their types
+  inline Grid<T> &getArg0()
+  {
+    return me;
+  }
+  using type0 = Grid<T>;
+  inline const Grid<Real> &getArg1()
+  {
+    return other;
+  }
+  using type1 = Grid<Real>;
+  inline Real &getArg2()
+  {
+    return cutoff;
+  }
+  using type2 = Real;
+
+  void runMessage()
+  {
+    debMsg("Executing kernel knSafeDivReal ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  }
+
+  //! Processes one sub-range of linear cell indices handed out by tbb::parallel_for.
+  void operator()(const tbb::blocked_range<IndexInt> &range) const
+  {
+    const IndexInt last = (IndexInt)range.end();
+    for (IndexInt cell = range.begin(); cell != last; ++cell)
+      op(cell, me, other, cutoff);
+  }
+
+  //! Distributes all linear indices [0, size) over tbb::parallel_for.
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+
+  Grid<T> &me;
+  const Grid<Real> &other;
+  Real cutoff;
+};
+
+// Set velocities on the grid from the particle system.
+// For every active, non-excluded particle, vel.setInterpolated() is called with the
+// particle position, its velocity from pvel and the weight grid tmp.
+// run() visits the particles in a plain sequential loop (no tbb::parallel_for).
+
+struct knMapLinearVec3ToMACGrid : public KernelBase {
+  //! Stores the arguments and executes the kernel right away (one op per particle).
+  knMapLinearVec3ToMACGrid(const BasicParticleSystem &p,
+                           const FlagGrid &flags,
+                           const MACGrid &vel,
+                           Grid<Vec3> &tmp,
+                           const ParticleDataImpl<Vec3> &pvel,
+                           const ParticleDataImpl<int> *ptype,
+                           const int exclude)
+      : KernelBase(p.size()),
+        p(p),
+        flags(flags),
+        vel(vel),
+        tmp(tmp),
+        pvel(pvel),
+        ptype(ptype),
+        exclude(exclude)
+  {
+    runMessage();
+    run();
+  }
+
+  inline void op(IndexInt idx,
+                 const BasicParticleSystem &p,
+                 const FlagGrid &flags,
+                 const MACGrid &vel,
+                 Grid<Vec3> &tmp,
+                 const ParticleDataImpl<Vec3> &pvel,
+                 const ParticleDataImpl<int> *ptype,
+                 const int exclude)
+  {
+    unusedParameter(flags);
+
+    // skip inactive particles and particles whose type matches the exclude mask
+    if (!p.isActive(idx))
+      return;
+    if (ptype && ((*ptype)[idx] & exclude))
+      return;
+
+    vel.setInterpolated(p[idx].pos, pvel[idx], &tmp[0]);
+  }
+
+  // argument accessors and their types
+  inline const BasicParticleSystem &getArg0()
+  {
+    return p;
+  }
+  using type0 = BasicParticleSystem;
+  inline const FlagGrid &getArg1()
+  {
+    return flags;
+  }
+  using type1 = FlagGrid;
+  inline const MACGrid &getArg2()
+  {
+    return vel;
+  }
+  using type2 = MACGrid;
+  inline Grid<Vec3> &getArg3()
+  {
+    return tmp;
+  }
+  using type3 = Grid<Vec3>;
+  inline const ParticleDataImpl<Vec3> &getArg4()
+  {
+    return pvel;
+  }
+  using type4 = ParticleDataImpl<Vec3>;
+  inline const ParticleDataImpl<int> *getArg5()
+  {
+    return ptype;
+  }
+  using type5 = ParticleDataImpl<int>;
+  inline const int &getArg6()
+  {
+    return exclude;
+  }
+  using type6 = int;
+
+  void runMessage()
+  {
+    debMsg("Executing kernel knMapLinearVec3ToMACGrid ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  }
+
+  //! Sequentially applies op to every particle index in [0, size).
+  void run()
+  {
+    for (IndexInt particle = 0, count = size; particle < count; ++particle)
+      op(particle, p, flags, vel, tmp, pvel, ptype, exclude);
+  }
+
+  const BasicParticleSystem &p;
+  const FlagGrid &flags;
+  const MACGrid &vel;
+  Grid<Vec3> &tmp;
+  const ParticleDataImpl<Vec3> &pvel;
+  const ParticleDataImpl<int> *ptype;
+  const int exclude;
+};
+
+// Maps particle velocities onto the MAC grid vel and stores a copy in velOld.
+// Optionally, this function can use an existing vec3 grid to store the weights;
+// this is useful in combination with the simple extrapolation function.
+// When no weight grid is passed, a temporary one is created and released on every
+// exit path, including when one of the grid operations throws.
+
+void mapPartsToMAC(const FlagGrid &flags,
+                   MACGrid &vel,
+                   MACGrid &velOld,
+                   const BasicParticleSystem &parts,
+                   const ParticleDataImpl<Vec3> &partVel,
+                   Grid<Vec3> *weight = NULL,
+                   const ParticleDataImpl<int> *ptype = NULL,
+                   const int exclude = 0)
+{
+  // Scope guard owning the temporary weight grid (if any); deleting a null pointer
+  // is a no-op, so the guard is harmless when the caller supplied the grid.
+  struct ScopedWeightGrid {
+    explicit ScopedWeightGrid(Grid<Vec3> *g) : grid(g)
+    {
+    }
+    ~ScopedWeightGrid()
+    {
+      delete grid;
+    }
+    ScopedWeightGrid(const ScopedWeightGrid &) = delete;
+    ScopedWeightGrid &operator=(const ScopedWeightGrid &) = delete;
+    Grid<Vec3> *grid;
+  };
+
+  // interpol -> grid. tmpgrid for particle contribution weights
+  ScopedWeightGrid ownedWeight(weight ? nullptr : new Grid<Vec3>(flags.getParent()));
+  if (weight) {
+    weight->clear();  // make sure we start with a zero grid!
+  }
+  else {
+    weight = ownedWeight.grid;
+  }
+
+  vel.clear();
+  knMapLinearVec3ToMACGrid(parts, flags, vel, *weight, partVel, ptype, exclude);
+
+  // stomp small values in weight to zero to prevent roundoff errors
+  weight->stomp(Vec3(VECTOR_EPSILON));
+  vel.safeDivide(*weight);
+
+  // store original state
+  velOld.copyFrom(vel);
+}
+static PyObject *_W_12(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // Python-callable wrapper around mapPartsToMAC()
+{
+  try {
+    PbArgs _args(_linargs, _kwds);  // combines positional and keyword arguments
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional flag, defaults to 0
+    pbPreparePlugin(parent, "mapPartsToMAC", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // handed to every getter below; destroyed at the closing brace of this scope
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 0, &_lock);
+      MACGrid &vel = *_args.getPtr<MACGrid>("vel", 1, &_lock);  // output grid
+      MACGrid &velOld = *_args.getPtr<MACGrid>("velOld", 2, &_lock);  // receives a copy of vel
+      const BasicParticleSystem &parts = *_args.getPtr<BasicParticleSystem>("parts", 3, &_lock);
+      const ParticleDataImpl<Vec3> &partVel = *_args.getPtr<ParticleDataImpl<Vec3>>(
+          "partVel", 4, &_lock);
+      Grid<Vec3> *weight = _args.getPtrOpt<Grid<Vec3>>("weight", 5, NULL, &_lock);  // optional; NULL lets the function use a temporary grid
+      const ParticleDataImpl<int> *ptype = _args.getPtrOpt<ParticleDataImpl<int>>(
+          "ptype", 6, NULL, &_lock);
+      const int exclude = _args.getOpt<int>("exclude", 7, 0, &_lock);
+      _retval = getPyNone();  // the wrapped function returns void
+      mapPartsToMAC(flags, vel, velOld, parts, partVel, weight, ptype, exclude);
+      _args.check();  // NOTE(review): presumably reports unused/unknown arguments - confirm in PbArgs
+    }
+    pbFinalizePlugin(parent, "mapPartsToMAC", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {  // report the failure through pbSetError and return a null result
+    pbSetError("mapPartsToMAC", e.what());
+    return 0;
+  }
+}
+static const Pb::Register _RP_mapPartsToMAC("", "mapPartsToMAC", _W_12);
+extern "C" {
+void PbRegister_mapPartsToMAC()
+{
+  KEEP_UNUSED(_RP_mapPartsToMAC);
+}
+}
+
+//! Particle kernel that writes per-particle data into a grid through
+//! Grid::setInterpolated, handing gtmp along as the accompanying Real buffer.
+//! The kernel runs from its constructor and visits the particles sequentially.
+template<class T> struct knMapLinear : public KernelBase {
+  knMapLinear(const BasicParticleSystem &p,
+              const FlagGrid &flags,
+              const Grid<T> &target,
+              Grid<Real> &gtmp,
+              const ParticleDataImpl<T> &psource)
+      : KernelBase(p.size()), p(p), flags(flags), target(target), gtmp(gtmp), psource(psource)
+  {
+    runMessage();
+    run();
+  }
+
+  //! Process a single particle; inactive particles are left out.
+  inline void op(IndexInt idx,
+                 const BasicParticleSystem &p,
+                 const FlagGrid &flags,
+                 const Grid<T> &target,
+                 Grid<Real> &gtmp,
+                 const ParticleDataImpl<T> &psource)
+  {
+    unusedParameter(flags);
+    if (p.isActive(idx)) {
+      target.setInterpolated(p[idx].pos, psource[idx], gtmp);
+    }
+  }
+
+  // accessors and type aliases for the kernel arguments, in declaration order
+  using type0 = BasicParticleSystem;
+  using type1 = FlagGrid;
+  using type2 = Grid<T>;
+  using type3 = Grid<Real>;
+  using type4 = ParticleDataImpl<T>;
+  inline const BasicParticleSystem &getArg0()
+  {
+    return p;
+  }
+  inline const FlagGrid &getArg1()
+  {
+    return flags;
+  }
+  inline const Grid<T> &getArg2()
+  {
+    return target;
+  }
+  inline Grid<Real> &getArg3()
+  {
+    return gtmp;
+  }
+  inline const ParticleDataImpl<T> &getArg4()
+  {
+    return psource;
+  }
+
+  //! Emit the debug messages announcing the kernel and its range.
+  void runMessage()
+  {
+    debMsg("Executing kernel knMapLinear ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  }
+
+  //! Sequential loop over all particle indices [0, size).
+  void run()
+  {
+    const IndexInt count = size;
+    IndexInt i = 0;
+    while (i < count) {
+      op(i, p, flags, target, gtmp, psource);
+      ++i;
+    }
+  }
+
+  const BasicParticleSystem &p;
+  const FlagGrid &flags;
+  const Grid<T> &target;
+  Grid<Real> &gtmp;
+  const ParticleDataImpl<T> &psource;
+};
+
+//! Shared implementation of the particle -> grid mapping plugins below.
+//! Clears target, runs knMapLinear with a temporary Real grid (created on the parent
+//! of flags) and finally applies knSafeDivReal to target and that temporary grid.
+template<class T>
+void mapLinearRealHelper(const FlagGrid &flags,
+                         Grid<T> &target,
+                         const BasicParticleSystem &parts,
+                         const ParticleDataImpl<T> &source)
+{
+  Grid<Real> tmp(flags.getParent());
+  target.clear();
+  knMapLinear<T>(parts, flags, target, tmp, source);
+  // NOTE(review): presumably normalizes target by the values accumulated in tmp -
+  // confirm against knSafeDivReal
+  knSafeDivReal<T>(target, tmp);
+}
+
+//! Map a Real particle data channel onto a Real grid; forwards to
+//! mapLinearRealHelper<Real>.
+void mapPartsToGrid(const FlagGrid &flags,
+                    Grid<Real> &target,
+                    const BasicParticleSystem &parts,
+                    const ParticleDataImpl<Real> &source)
+{
+  mapLinearRealHelper<Real>(flags, target, parts, source);
+}
+//! Generated Python entry point for the mapPartsToGrid plugin: unpacks the arguments
+//! with PbArgs, calls mapPartsToGrid() and returns the getPyNone() object. A thrown
+//! std::exception is reported through pbSetError and 0 is returned instead.
+static PyObject *_W_13(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    // optional "notiming" switch (default 0); its negation is handed to the
+    // prepare/finalize calls
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(parent, "mapPartsToGrid", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;
+      // all four arguments are required and fetched by name / position 0-3
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 0, &_lock);
+      Grid<Real> &target = *_args.getPtr<Grid<Real>>("target", 1, &_lock);
+      const BasicParticleSystem &parts = *_args.getPtr<BasicParticleSystem>("parts", 2, &_lock);
+      const ParticleDataImpl<Real> &source = *_args.getPtr<ParticleDataImpl<Real>>(
+          "source", 3, &_lock);
+      _retval = getPyNone();
+      mapPartsToGrid(flags, target, parts, source);
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "mapPartsToGrid", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("mapPartsToGrid", e.what());
+    return 0;
+  }
+}
+// static registration object tying the name "mapPartsToGrid" to the wrapper above
+static const Pb::Register _RP_mapPartsToGrid("", "mapPartsToGrid", _W_13);
+extern "C" {
+// C-linkage hook whose only statement references the registration object
+void PbRegister_mapPartsToGrid()
+{
+  KEEP_UNUSED(_RP_mapPartsToGrid);
+}
+}
+
+//! Map a Vec3 particle data channel onto a Vec3 grid; forwards to
+//! mapLinearRealHelper<Vec3>.
+void mapPartsToGridVec3(const FlagGrid &flags,
+                        Grid<Vec3> &target,
+                        const BasicParticleSystem &parts,
+                        const ParticleDataImpl<Vec3> &source)
+{
+  mapLinearRealHelper<Vec3>(flags, target, parts, source);
+}
+//! Generated Python entry point for the mapPartsToGridVec3 plugin: unpacks the
+//! arguments with PbArgs, calls mapPartsToGridVec3() and returns the getPyNone()
+//! object. A thrown std::exception is reported through pbSetError and 0 is returned.
+static PyObject *_W_14(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    // optional "notiming" switch (default 0); its negation is handed to the
+    // prepare/finalize calls
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(parent, "mapPartsToGridVec3", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;
+      // all four arguments are required and fetched by name / position 0-3
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 0, &_lock);
+      Grid<Vec3> &target = *_args.getPtr<Grid<Vec3>>("target", 1, &_lock);
+      const BasicParticleSystem &parts = *_args.getPtr<BasicParticleSystem>("parts", 2, &_lock);
+      const ParticleDataImpl<Vec3> &source = *_args.getPtr<ParticleDataImpl<Vec3>>(
+          "source", 3, &_lock);
+      _retval = getPyNone();
+      mapPartsToGridVec3(flags, target, parts, source);
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "mapPartsToGridVec3", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("mapPartsToGridVec3", e.what());
+    return 0;
+  }
+}
+// static registration object tying the name "mapPartsToGridVec3" to the wrapper above
+static const Pb::Register _RP_mapPartsToGridVec3("", "mapPartsToGridVec3", _W_14);
+extern "C" {
+// C-linkage hook whose only statement references the registration object
+void PbRegister_mapPartsToGridVec3()
+{
+  KEEP_UNUSED(_RP_mapPartsToGridVec3);
+}
+}
+
+// integers need "max" mode, not yet implemented
+// PYTHON() void mapPartsToGridInt ( FlagGrid& flags, Grid<int >& target , BasicParticleSystem&
+// parts , ParticleDataImpl<int >& source ) { 	mapLinearRealHelper<int >(flags,target,parts,source);
+//}
+
+//! Particle kernel that samples a grid at every active particle position
+//! (Grid::getInterpolated) and stores the sample in the particle data channel.
+//! The kernel runs from its constructor, in parallel through tbb::parallel_for.
+template<class T> struct knMapFromGrid : public KernelBase {
+  knMapFromGrid(const BasicParticleSystem &p, const Grid<T> &gsrc, ParticleDataImpl<T> &target)
+      : KernelBase(p.size()), p(p), gsrc(gsrc), target(target)
+  {
+    runMessage();
+    run();
+  }
+
+  //! Process a single particle; entries of inactive particles stay untouched.
+  inline void op(IndexInt idx,
+                 const BasicParticleSystem &p,
+                 const Grid<T> &gsrc,
+                 ParticleDataImpl<T> &target) const
+  {
+    if (p.isActive(idx)) {
+      target[idx] = gsrc.getInterpolated(p[idx].pos);
+    }
+  }
+
+  // accessors and type aliases for the kernel arguments, in declaration order
+  using type0 = BasicParticleSystem;
+  using type1 = Grid<T>;
+  using type2 = ParticleDataImpl<T>;
+  inline const BasicParticleSystem &getArg0()
+  {
+    return p;
+  }
+  inline const Grid<T> &getArg1()
+  {
+    return gsrc;
+  }
+  inline ParticleDataImpl<T> &getArg2()
+  {
+    return target;
+  }
+
+  //! Emit the debug messages announcing the kernel and its range.
+  void runMessage()
+  {
+    debMsg("Executing kernel knMapFromGrid ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  }
+
+  //! TBB body: handles the particle indices of one sub-range.
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const IndexInt last = (IndexInt)__r.end();
+    for (IndexInt idx = __r.begin(); idx != last; ++idx) {
+      op(idx, p, gsrc, target);
+    }
+  }
+
+  //! Split the index range [0, size) across TBB workers.
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+
+  const BasicParticleSystem &p;
+  const Grid<T> &gsrc;
+  ParticleDataImpl<T> &target;
+};
+//! Sample a Real grid at the particle positions and store the values in target;
+//! runs the knMapFromGrid<Real> kernel.
+void mapGridToParts(const Grid<Real> &source,
+                    const BasicParticleSystem &parts,
+                    ParticleDataImpl<Real> &target)
+{
+  knMapFromGrid<Real>(parts, source, target);
+}
+//! Generated Python entry point for the mapGridToParts plugin: unpacks the arguments
+//! with PbArgs, calls mapGridToParts() and returns the getPyNone() object. A thrown
+//! std::exception is reported through pbSetError and 0 is returned instead.
+static PyObject *_W_15(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    // optional "notiming" switch (default 0); its negation is handed to the
+    // prepare/finalize calls
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(parent, "mapGridToParts", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;
+      // all three arguments are required and fetched by name / position 0-2
+      const Grid<Real> &source = *_args.getPtr<Grid<Real>>("source", 0, &_lock);
+      const BasicParticleSystem &parts = *_args.getPtr<BasicParticleSystem>("parts", 1, &_lock);
+      ParticleDataImpl<Real> &target = *_args.getPtr<ParticleDataImpl<Real>>("target", 2, &_lock);
+      _retval = getPyNone();
+      mapGridToParts(source, parts, target);
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "mapGridToParts", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("mapGridToParts", e.what());
+    return 0;
+  }
+}
+// static registration object tying the name "mapGridToParts" to the wrapper above
+static const Pb::Register _RP_mapGridToParts("", "mapGridToParts", _W_15);
+extern "C" {
+// C-linkage hook whose only statement references the registration object
+void PbRegister_mapGridToParts()
+{
+  KEEP_UNUSED(_RP_mapGridToParts);
+}
+}
+
+//! Sample a Vec3 grid at the particle positions and store the values in target;
+//! runs the knMapFromGrid<Vec3> kernel.
+void mapGridToPartsVec3(const Grid<Vec3> &source,
+                        const BasicParticleSystem &parts,
+                        ParticleDataImpl<Vec3> &target)
+{
+  knMapFromGrid<Vec3>(parts, source, target);
+}
+//! Generated Python entry point for the mapGridToPartsVec3 plugin: unpacks the
+//! arguments with PbArgs, calls mapGridToPartsVec3() and returns the getPyNone()
+//! object. A thrown std::exception is reported through pbSetError and 0 is returned.
+static PyObject *_W_16(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    // optional "notiming" switch (default 0); its negation is handed to the
+    // prepare/finalize calls
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(parent, "mapGridToPartsVec3", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;
+      // all three arguments are required and fetched by name / position 0-2
+      const Grid<Vec3> &source = *_args.getPtr<Grid<Vec3>>("source", 0, &_lock);
+      const BasicParticleSystem &parts = *_args.getPtr<BasicParticleSystem>("parts", 1, &_lock);
+      ParticleDataImpl<Vec3> &target = *_args.getPtr<ParticleDataImpl<Vec3>>("target", 2, &_lock);
+      _retval = getPyNone();
+      mapGridToPartsVec3(source, parts, target);
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "mapGridToPartsVec3", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("mapGridToPartsVec3", e.what());
+    return 0;
+  }
+}
+// static registration object tying the name "mapGridToPartsVec3" to the wrapper above
+static const Pb::Register _RP_mapGridToPartsVec3("", "mapGridToPartsVec3", _W_16);
+extern "C" {
+// C-linkage hook whose only statement references the registration object
+void PbRegister_mapGridToPartsVec3()
+{
+  KEEP_UNUSED(_RP_mapGridToPartsVec3);
+}
+}
+
+//! Get velocities from grid (pure PIC): every processed particle receives the MAC
+//! velocity interpolated at its position. Inactive particles are skipped, and so are
+//! particles whose type has a bit in common with exclude when a ptype channel is given.
+//! The kernel runs from its constructor, in parallel through tbb::parallel_for.
+
+struct knMapLinearMACGridToVec3_PIC : public KernelBase {
+  knMapLinearMACGridToVec3_PIC(const BasicParticleSystem &p,
+                               const FlagGrid &flags,
+                               const MACGrid &vel,
+                               ParticleDataImpl<Vec3> &pvel,
+                               const ParticleDataImpl<int> *ptype,
+                               const int exclude)
+      : KernelBase(p.size()),
+        p(p),
+        flags(flags),
+        vel(vel),
+        pvel(pvel),
+        ptype(ptype),
+        exclude(exclude)
+  {
+    runMessage();
+    run();
+  }
+
+  //! Process a single particle.
+  inline void op(IndexInt idx,
+                 const BasicParticleSystem &p,
+                 const FlagGrid &flags,
+                 const MACGrid &vel,
+                 ParticleDataImpl<Vec3> &pvel,
+                 const ParticleDataImpl<int> *ptype,
+                 const int exclude) const
+  {
+    if (!p.isActive(idx)) {
+      return;
+    }
+    // the type channel is optional; only consult it when present
+    if (ptype && ((*ptype)[idx] & exclude)) {
+      return;
+    }
+    // pure PIC
+    pvel[idx] = vel.getInterpolated(p[idx].pos);
+  }
+
+  // accessors and type aliases for the kernel arguments, in declaration order
+  using type0 = BasicParticleSystem;
+  using type1 = FlagGrid;
+  using type2 = MACGrid;
+  using type3 = ParticleDataImpl<Vec3>;
+  using type4 = ParticleDataImpl<int>;
+  using type5 = int;
+  inline const BasicParticleSystem &getArg0()
+  {
+    return p;
+  }
+  inline const FlagGrid &getArg1()
+  {
+    return flags;
+  }
+  inline const MACGrid &getArg2()
+  {
+    return vel;
+  }
+  inline ParticleDataImpl<Vec3> &getArg3()
+  {
+    return pvel;
+  }
+  inline const ParticleDataImpl<int> *getArg4()
+  {
+    return ptype;
+  }
+  inline const int &getArg5()
+  {
+    return exclude;
+  }
+
+  //! Emit the debug messages announcing the kernel and its range.
+  void runMessage()
+  {
+    debMsg("Executing kernel knMapLinearMACGridToVec3_PIC ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  }
+
+  //! TBB body: handles the particle indices of one sub-range.
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const IndexInt last = (IndexInt)__r.end();
+    for (IndexInt idx = __r.begin(); idx != last; ++idx) {
+      op(idx, p, flags, vel, pvel, ptype, exclude);
+    }
+  }
+
+  //! Split the index range [0, size) across TBB workers.
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+
+  const BasicParticleSystem &p;
+  const FlagGrid &flags;
+  const MACGrid &vel;
+  ParticleDataImpl<Vec3> &pvel;
+  const ParticleDataImpl<int> *ptype;
+  const int exclude;
+};
+
+//! Set particle velocities from the MAC grid (pure PIC transfer); runs the
+//! knMapLinearMACGridToVec3_PIC kernel. ptype and exclude are forwarded to the kernel,
+//! which skips particles whose type has a bit in common with exclude.
+void mapMACToParts(const FlagGrid &flags,
+                   const MACGrid &vel,
+                   const BasicParticleSystem &parts,
+                   ParticleDataImpl<Vec3> &partVel,
+                   const ParticleDataImpl<int> *ptype = NULL,
+                   const int exclude = 0)
+{
+  knMapLinearMACGridToVec3_PIC(parts, flags, vel, partVel, ptype, exclude);
+}
+//! Generated Python entry point for the mapMACToParts plugin: unpacks the arguments
+//! with PbArgs, calls mapMACToParts() and returns the getPyNone() object. A thrown
+//! std::exception is reported through pbSetError and 0 is returned instead.
+static PyObject *_W_17(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    // optional "notiming" switch (default 0); its negation is handed to the
+    // prepare/finalize calls
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(parent, "mapMACToParts", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;
+      // positions 0-3 are required; 4-5 are optional and use the same defaults as the
+      // C++ signature (NULL, 0)
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 0, &_lock);
+      const MACGrid &vel = *_args.getPtr<MACGrid>("vel", 1, &_lock);
+      const BasicParticleSystem &parts = *_args.getPtr<BasicParticleSystem>("parts", 2, &_lock);
+      ParticleDataImpl<Vec3> &partVel = *_args.getPtr<ParticleDataImpl<Vec3>>(
+          "partVel", 3, &_lock);
+      const ParticleDataImpl<int> *ptype = _args.getPtrOpt<ParticleDataImpl<int>>(
+          "ptype", 4, NULL, &_lock);
+      const int exclude = _args.getOpt<int>("exclude", 5, 0, &_lock);
+      _retval = getPyNone();
+      mapMACToParts(flags, vel, parts, partVel, ptype, exclude);
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "mapMACToParts", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("mapMACToParts", e.what());
+    return 0;
+  }
+}
+// static registration object tying the name "mapMACToParts" to the wrapper above
+static const Pb::Register _RP_mapMACToParts("", "mapMACToParts", _W_17);
+extern "C" {
+// C-linkage hook whose only statement references the registration object
+void PbRegister_mapMACToParts()
+{
+  KEEP_UNUSED(_RP_mapMACToParts);
+}
+}
+
+//! Grid -> particle velocity update with FLIP delta interpolation.
+//! For every processed particle the new velocity is the blend
+//!   flipRatio * (old particle velocity + (vel - oldVel)) + (1 - flipRatio) * vel,
+//! with both grids sampled at the particle position. Inactive particles are skipped,
+//! and so are particles whose type has a bit in common with exclude when a ptype
+//! channel is given. The kernel runs from its constructor via tbb::parallel_for.
+
+struct knMapLinearMACGridToVec3_FLIP : public KernelBase {
+  knMapLinearMACGridToVec3_FLIP(const BasicParticleSystem &p,
+                                const FlagGrid &flags,
+                                const MACGrid &vel,
+                                const MACGrid &oldVel,
+                                ParticleDataImpl<Vec3> &pvel,
+                                const Real flipRatio,
+                                const ParticleDataImpl<int> *ptype,
+                                const int exclude)
+      : KernelBase(p.size()),
+        p(p),
+        flags(flags),
+        vel(vel),
+        oldVel(oldVel),
+        pvel(pvel),
+        flipRatio(flipRatio),
+        ptype(ptype),
+        exclude(exclude)
+  {
+    runMessage();
+    run();
+  }
+
+  //! Process a single particle.
+  inline void op(IndexInt idx,
+                 const BasicParticleSystem &p,
+                 const FlagGrid &flags,
+                 const MACGrid &vel,
+                 const MACGrid &oldVel,
+                 ParticleDataImpl<Vec3> &pvel,
+                 const Real flipRatio,
+                 const ParticleDataImpl<int> *ptype,
+                 const int exclude) const
+  {
+    if (!p.isActive(idx)) {
+      return;
+    }
+    // the type channel is optional; only consult it when present
+    if (ptype && ((*ptype)[idx] & exclude)) {
+      return;
+    }
+    Vec3 gridVel = vel.getInterpolated(p[idx].pos);
+    // change of the grid velocity at the particle position
+    Vec3 gridDelta = gridVel - oldVel.getInterpolated(p[idx].pos);
+    pvel[idx] = flipRatio * (pvel[idx] + gridDelta) + (1.0 - flipRatio) * gridVel;
+  }
+
+  // accessors and type aliases for the kernel arguments, in declaration order
+  using type0 = BasicParticleSystem;
+  using type1 = FlagGrid;
+  using type2 = MACGrid;
+  using type3 = MACGrid;
+  using type4 = ParticleDataImpl<Vec3>;
+  using type5 = Real;
+  using type6 = ParticleDataImpl<int>;
+  using type7 = int;
+  inline const BasicParticleSystem &getArg0()
+  {
+    return p;
+  }
+  inline const FlagGrid &getArg1()
+  {
+    return flags;
+  }
+  inline const MACGrid &getArg2()
+  {
+    return vel;
+  }
+  inline const MACGrid &getArg3()
+  {
+    return oldVel;
+  }
+  inline ParticleDataImpl<Vec3> &getArg4()
+  {
+    return pvel;
+  }
+  inline const Real &getArg5()
+  {
+    return flipRatio;
+  }
+  inline const ParticleDataImpl<int> *getArg6()
+  {
+    return ptype;
+  }
+  inline const int &getArg7()
+  {
+    return exclude;
+  }
+
+  //! Emit the debug messages announcing the kernel and its range.
+  void runMessage()
+  {
+    debMsg("Executing kernel knMapLinearMACGridToVec3_FLIP ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  }
+
+  //! TBB body: handles the particle indices of one sub-range.
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const IndexInt last = (IndexInt)__r.end();
+    for (IndexInt idx = __r.begin(); idx != last; ++idx) {
+      op(idx, p, flags, vel, oldVel, pvel, flipRatio, ptype, exclude);
+    }
+  }
+
+  //! Split the index range [0, size) across TBB workers.
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+
+  const BasicParticleSystem &p;
+  const FlagGrid &flags;
+  const MACGrid &vel;
+  const MACGrid &oldVel;
+  ParticleDataImpl<Vec3> &pvel;
+  const Real flipRatio;
+  const ParticleDataImpl<int> *ptype;
+  const int exclude;
+};
+
+//! Update particle velocities from the grid with FLIP delta interpolation; runs the
+//! knMapLinearMACGridToVec3_FLIP kernel. flipRatio weights the FLIP part of the blend
+//! against the plain grid velocity; ptype and exclude are forwarded to the kernel,
+//! which skips particles whose type has a bit in common with exclude.
+void flipVelocityUpdate(const FlagGrid &flags,
+                        const MACGrid &vel,
+                        const MACGrid &velOld,
+                        const BasicParticleSystem &parts,
+                        ParticleDataImpl<Vec3> &partVel,
+                        const Real flipRatio,
+                        const ParticleDataImpl<int> *ptype = NULL,
+                        const int exclude = 0)
+{
+  knMapLinearMACGridToVec3_FLIP(parts, flags, vel, velOld, partVel, flipRatio, ptype, exclude);
+}
+//! Generated Python entry point for the flipVelocityUpdate plugin: unpacks the
+//! arguments with PbArgs, calls flipVelocityUpdate() and returns the getPyNone()
+//! object. A thrown std::exception is reported through pbSetError and 0 is returned.
+static PyObject *_W_18(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    // optional "notiming" switch (default 0); its negation is handed to the
+    // prepare/finalize calls
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(parent, "flipVelocityUpdate", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;
+      // positions 0-5 are required (flipRatio has no default); 6-7 are optional and
+      // use the same defaults as the C++ signature (NULL, 0)
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 0, &_lock);
+      const MACGrid &vel = *_args.getPtr<MACGrid>("vel", 1, &_lock);
+      const MACGrid &velOld = *_args.getPtr<MACGrid>("velOld", 2, &_lock);
+      const BasicParticleSystem &parts = *_args.getPtr<BasicParticleSystem>("parts", 3, &_lock);
+      ParticleDataImpl<Vec3> &partVel = *_args.getPtr<ParticleDataImpl<Vec3>>(
+          "partVel", 4, &_lock);
+      const Real flipRatio = _args.get<Real>("flipRatio", 5, &_lock);
+      const ParticleDataImpl<int> *ptype = _args.getPtrOpt<ParticleDataImpl<int>>(
+          "ptype", 6, NULL, &_lock);
+      const int exclude = _args.getOpt<int>("exclude", 7, 0, &_lock);
+      _retval = getPyNone();
+      flipVelocityUpdate(flags, vel, velOld, parts, partVel, flipRatio, ptype, exclude);
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "flipVelocityUpdate", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("flipVelocityUpdate", e.what());
+    return 0;
+  }
+}
+// static registration object tying the name "flipVelocityUpdate" to the wrapper above
+static const Pb::Register _RP_flipVelocityUpdate("", "flipVelocityUpdate", _W_18);
+extern "C" {
+// C-linkage hook whose only statement references the registration object
+void PbRegister_flipVelocityUpdate()
+{
+  KEEP_UNUSED(_RP_flipVelocityUpdate);
+}
+}
+
+//******************************************************************************
+// narrow band
+
+//! Grid kernel for the narrow band velocity combination.
+//!
+//! For every cell and every velocity component c:
+//!  - if phi is given, phi is sampled at the cell position shifted by 0.5 along the two
+//!    other axes; when that value is below -narrowBand the component of vel is set to
+//!    0 and nothing else happens for it.
+//!  - otherwise, if the weight w exceeds thresh, the component of vel is copied into
+//!    combineVel and the component of vel is overwritten with -1;
+//!  - else the component of vel is set to 0.
+//!
+//! The kernel runs from its constructor. It is parallelized with tbb::parallel_for over
+//! z slices when maxZ > 1, and over y rows (with k = 0) otherwise.
+struct knCombineVels : public KernelBase {
+  knCombineVels(MACGrid &vel,
+                const Grid<Vec3> &w,
+                MACGrid &combineVel,
+                const LevelsetGrid *phi,
+                Real narrowBand,
+                Real thresh)
+      : KernelBase(&vel, 0),
+        vel(vel),
+        w(w),
+        combineVel(combineVel),
+        phi(phi),
+        narrowBand(narrowBand),
+        thresh(thresh)
+  {
+    runMessage();
+    run();
+  }
+
+  //! Process the three velocity components of cell (i, j, k).
+  inline void op(int i,
+                 int j,
+                 int k,
+                 MACGrid &vel,
+                 const Grid<Vec3> &w,
+                 MACGrid &combineVel,
+                 const LevelsetGrid *phi,
+                 Real narrowBand,
+                 Real thresh) const
+  {
+    // Keep the linear cell index in IndexInt (as the particle kernels do for their
+    // indices) instead of narrowing it to int, which could truncate on very large grids.
+    const IndexInt idx = vel.index(i, j, k);
+
+    for (int c = 0; c < 3; ++c) {
+      // Correct narrow-band FLIP
+      if (phi) {
+        Vec3 pos(i, j, k);
+        pos[(c + 1) % 3] += Real(0.5);
+        pos[(c + 2) % 3] += Real(0.5);
+        const Real phiVal = phi->getInterpolated(pos);
+        if (phiVal < -narrowBand) {
+          vel[idx][c] = 0;
+          continue;
+        }
+      }
+
+      if (w[idx][c] > thresh) {
+        combineVel[idx][c] = vel[idx][c];
+        vel[idx][c] = -1;
+      }
+      else {
+        vel[idx][c] = 0;
+      }
+    }
+  }
+
+  // accessors and type aliases for the kernel arguments, in declaration order
+  inline MACGrid &getArg0()
+  {
+    return vel;
+  }
+  typedef MACGrid type0;
+  inline const Grid<Vec3> &getArg1()
+  {
+    return w;
+  }
+  typedef Grid<Vec3> type1;
+  inline MACGrid &getArg2()
+  {
+    return combineVel;
+  }
+  typedef MACGrid type2;
+  inline const LevelsetGrid *getArg3()
+  {
+    return phi;
+  }
+  typedef LevelsetGrid type3;
+  inline Real &getArg4()
+  {
+    return narrowBand;
+  }
+  typedef Real type4;
+  inline Real &getArg5()
+  {
+    return thresh;
+  }
+  typedef Real type5;
+
+  //! Emit the debug messages announcing the kernel and its range.
+  void runMessage()
+  {
+    debMsg("Executing kernel knCombineVels ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+
+  //! TBB body: the range holds z slices when maxZ > 1, otherwise y rows of slice k = 0.
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, vel, w, combineVel, phi, narrowBand, thresh);
+    }
+    else {
+      const int k = 0;
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = 0; i < _maxX; i++)
+          op(i, j, k, vel, w, combineVel, phi, narrowBand, thresh);
+    }
+  }
+
+  //! Choose the parallel range matching the branch taken in operator().
+  void run()
+  {
+    if (maxZ > 1)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
+  }
+  MACGrid &vel;
+  const Grid<Vec3> &w;
+  MACGrid &combineVel;
+  const LevelsetGrid *phi;
+  Real narrowBand;
+  Real thresh;
+};
+
+//! narrow band velocity combination
+//! Runs the knCombineVels kernel: components of vel whose weight exceeds thresh are
+//! copied into combineVel. Note that vel itself is modified as well (its components
+//! end up as 0 or -1). phi is optional; if given, components whose interpolated phi
+//! value lies below -narrowBand are zeroed in vel and not copied.
+
+void combineGridVel(MACGrid &vel,
+                    const Grid<Vec3> &weight,
+                    MACGrid &combineVel,
+                    const LevelsetGrid *phi = NULL,
+                    Real narrowBand = 0.0,
+                    Real thresh = 0.0)
+{
+  knCombineVels(vel, weight, combineVel, phi, narrowBand, thresh);
+}
+//! Generated Python entry point for the combineGridVel plugin: unpacks the arguments
+//! with PbArgs, calls combineGridVel() and returns the getPyNone() object. A thrown
+//! std::exception is reported through pbSetError and 0 is returned instead.
+static PyObject *_W_19(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    // optional "notiming" switch (default 0); its negation is handed to the
+    // prepare/finalize calls
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(parent, "combineGridVel", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;
+      // positions 0-2 are required; 3-5 are optional and use the same defaults as the
+      // C++ signature (NULL, 0.0, 0.0)
+      MACGrid &vel = *_args.getPtr<MACGrid>("vel", 0, &_lock);
+      const Grid<Vec3> &weight = *_args.getPtr<Grid<Vec3>>("weight", 1, &_lock);
+      MACGrid &combineVel = *_args.getPtr<MACGrid>("combineVel", 2, &_lock);
+      const LevelsetGrid *phi = _args.getPtrOpt<LevelsetGrid>("phi", 3, NULL, &_lock);
+      Real narrowBand = _args.getOpt<Real>("narrowBand", 4, 0.0, &_lock);
+      Real thresh = _args.getOpt<Real>("thresh", 5, 0.0, &_lock);
+      _retval = getPyNone();
+      combineGridVel(vel, weight, combineVel, phi, narrowBand, thresh);
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "combineGridVel", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("combineGridVel", e.what());
+    return 0;
+  }
+}
+// static registration object tying the name "combineGridVel" to the wrapper above
+static const Pb::Register _RP_combineGridVel("", "combineGridVel", _W_19);
+extern "C" {
+// C-linkage hook whose only statement references the registration object
+void PbRegister_combineGridVel()
+{
+  KEEP_UNUSED(_RP_combineGridVel);
+}
+}
+
+//! surface tension helper
+//! Runs LaplaceOp with laplacian as first (writable) and grid as second (read-only)
+//! argument. NOTE(review): presumably stores the Laplacian of grid in laplacian -
+//! confirm against LaplaceOp.
+void getLaplacian(Grid<Real> &laplacian, const Grid<Real> &grid)
+{
+  LaplaceOp(laplacian, grid);
+}
+//! Generated Python entry point for the getLaplacian plugin: unpacks the arguments
+//! with PbArgs, calls getLaplacian() and returns the getPyNone() object. A thrown
+//! std::exception is reported through pbSetError and 0 is returned instead.
+static PyObject *_W_20(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    // optional "notiming" switch (default 0); its negation is handed to the
+    // prepare/finalize calls
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(parent, "getLaplacian", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;
+      // both arguments are required and fetched by name / position 0-1
+      Grid<Real> &laplacian = *_args.getPtr<Grid<Real>>("laplacian", 0, &_lock);
+      const Grid<Real> &grid = *_args.getPtr<Grid<Real>>("grid", 1, &_lock);
+      _retval = getPyNone();
+      getLaplacian(laplacian, grid);
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "getLaplacian", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("getLaplacian", e.what());
+    return 0;
+  }
+}
+// static registration object tying the name "getLaplacian" to the wrapper above
+static const Pb::Register _RP_getLaplacian("", "getLaplacian", _W_20);
+extern "C" {
+// C-linkage hook whose only statement references the registration object
+void PbRegister_getLaplacian()
+{
+  KEEP_UNUSED(_RP_getLaplacian);
+}
+}
+
+//! Runs CurvatureOp with curv as first (writable) argument, grid as read-only input
+//! and h (default 1.0) as third argument. NOTE(review): presumably stores the
+//! curvature of grid in curv with h as the grid spacing - confirm against CurvatureOp.
+void getCurvature(Grid<Real> &curv, const Grid<Real> &grid, const Real h = 1.0)
+{
+  CurvatureOp(curv, grid, h);
+}
+//! Generated Python entry point for the getCurvature plugin: unpacks the arguments
+//! with PbArgs, calls getCurvature() and returns the getPyNone() object. A thrown
+//! std::exception is reported through pbSetError and 0 is returned instead.
+static PyObject *_W_21(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    // optional "notiming" switch (default 0); its negation is handed to the
+    // prepare/finalize calls
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(parent, "getCurvature", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;
+      // positions 0-1 are required; "h" is optional with the same default as the C++
+      // signature (1.0)
+      Grid<Real> &curv = *_args.getPtr<Grid<Real>>("curv", 0, &_lock);
+      const Grid<Real> &grid = *_args.getPtr<Grid<Real>>("grid", 1, &_lock);
+      const Real h = _args.getOpt<Real>("h", 2, 1.0, &_lock);
+      _retval = getPyNone();
+      getCurvature(curv, grid, h);
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "getCurvature", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("getCurvature", e.what());
+    return 0;
+  }
+}
+// static registration object tying the name "getCurvature" to the wrapper above
+static const Pb::Register _RP_getCurvature("", "getCurvature", _W_21);
+extern "C" {
+// C-linkage hook whose only statement references the registration object
+void PbRegister_getCurvature()
+{
+  KEEP_UNUSED(_RP_getCurvature);
+}
+}
+
+}  // namespace Manta
diff --git a/extern/mantaflow/preprocessed/plugin/fluidguiding.cpp b/extern/mantaflow/preprocessed/plugin/fluidguiding.cpp
new file mode 100644
index 00000000000..13383581123
--- /dev/null
+++ b/extern/mantaflow/preprocessed/plugin/fluidguiding.cpp
@@ -0,0 +1,802 @@
+
+
+// DO NOT EDIT !
+// This file is generated using the MantaFlow preprocessor (prep generate).
+
+/******************************************************************************
+ *
+ * MantaFlow fluid solver framework
+ * Copyright 2011 Tobias Pfaff, Nils Thuerey
+ *
+ * This program is free software, distributed under the terms of the
+ * Apache License, Version 2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Plugins for fluid guiding: separable Gaussian blur helpers and the PD_fluid_guiding solver
+ *
+ ******************************************************************************/
+#include "vectorbase.h"
+#include "grid.h"
+#include "kernel.h"
+#include "conjugategrad.h"
+#include "rcmatrix.h"
+
+using namespace std;
+namespace Manta {
+
+// Cached blur state shared by the guiding plugins; only a single blur size is supported for now.
+bool gBlurPrecomputed = false;  // set by ADMM_precompute_Separable, cleared by releaseBlurPrecomp
+int gBlurKernelRadius = -1;  // radius the cached kernel was built for; -1 while nothing is cached
+Matrix gBlurKernel;  // cached 1D kernel returned by get1DGaussianBlurKernel
+
+// *****************************************************************************
+// Helper functions for fluid guiding
+
+//! Builds a 1D Gaussian blur kernel with n taps and standard deviation sigma; taps sum to one
+Matrix get1DGaussianBlurKernel(const int n, const int sigma)
+{
+  // Every tap shares the same x offset; only the y offset varies with the tap index.
+  const Real offX = -(n - 1) * 0.5;
+  const int twoSigmaSq = 2 * sigma * sigma;
+  const double amplitude = 1 / (2 * M_PI * sigma * sigma);
+
+  Matrix weights(n);
+  Real total = 0;
+  for (int tap = 0; tap < n; tap++) {
+    const Real offY = tap - (n - 1) * 0.5;
+    const Real radiusSq = offX * offX + offY * offY;
+    weights.add_to_element(0, tap, amplitude * exp(-radiusSq / twoSigmaSq));
+    total += weights(0, tap);
+  }
+  // Rescale by the accumulated sum so the kernel is normalised.
+  weights = weights * (1.0 / total);
+  return weights;
+}
+
+//! Convolves in with a 1D kernel (centred at the kernel's midpoint) along the x-direction,
+//! accumulating into out (out must be a grid of zeros)
+struct apply1DKernelDirX : public KernelBase {
+  apply1DKernelDirX(const MACGrid &in, MACGrid &out, const Matrix &kernel)
+      : KernelBase(&in, 0), in(in), out(out), kernel(kernel)
+  {
+    runMessage();
+    run();  // the convolution runs as part of construction
+  }
+  inline void op(int i, int j, int k, const MACGrid &in, MACGrid &out, const Matrix &kernel) const
+  {
+    const int sizeX = in.getSizeX();
+    const int taps = kernel.n;
+    const int first = i - taps / 2;  // lowest x index the kernel window reaches
+    for (int t = 0; t < taps; t++) {  // window runs upwards while the kernel is read backwards
+      const int src = first + t;
+      if (src < 0)
+        continue;  // window still below the grid
+      if (src >= sizeX)
+        break;  // all remaining taps lie beyond the grid
+      out(i, j, k) += in(src, j, k) * kernel(0, taps - 1 - t);
+    }
+  }
+  inline const MACGrid &getArg0()
+  {
+    return in;
+  }
+  typedef MACGrid type0;
+  inline MACGrid &getArg1()
+  {
+    return out;
+  }
+  typedef MACGrid type1;
+  inline const Matrix &getArg2()
+  {
+    return kernel;
+  }
+  typedef Matrix type2;
+  void runMessage()
+  {
+    debMsg("Executing kernel apply1DKernelDirX ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {  // 3D: the TBB range is a span of k slices
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, in, out, kernel);
+    }
+    else {  // 2D: the TBB range is a span of j rows at k = 0
+      const int k = 0;
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = 0; i < _maxX; i++)
+          op(i, j, k, in, out, kernel);
+    }
+  }
+  void run()
+  {
+    if (maxZ > 1)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
+  }
+  const MACGrid &in;
+  MACGrid &out;
+  const Matrix &kernel;
+};
+
+//! Convolves in with a 1D kernel (centred at the kernel's midpoint) along the y-direction,
+//! accumulating into out (out must be a grid of zeros)
+struct apply1DKernelDirY : public KernelBase {
+  apply1DKernelDirY(const MACGrid &in, MACGrid &out, const Matrix &kernel)
+      : KernelBase(&in, 0), in(in), out(out), kernel(kernel)
+  {
+    runMessage();
+    run();  // the convolution runs as part of construction
+  }
+  inline void op(int i, int j, int k, const MACGrid &in, MACGrid &out, const Matrix &kernel) const
+  {
+    const int sizeY = in.getSizeY();
+    const int taps = kernel.n;
+    const int first = j - taps / 2;  // lowest y index the kernel window reaches
+    for (int t = 0; t < taps; t++) {  // window runs upwards while the kernel is read backwards
+      const int src = first + t;
+      if (src < 0)
+        continue;  // window still below the grid
+      if (src >= sizeY)
+        break;  // all remaining taps lie beyond the grid
+      out(i, j, k) += in(i, src, k) * kernel(0, taps - 1 - t);
+    }
+  }
+  inline const MACGrid &getArg0()
+  {
+    return in;
+  }
+  typedef MACGrid type0;
+  inline MACGrid &getArg1()
+  {
+    return out;
+  }
+  typedef MACGrid type1;
+  inline const Matrix &getArg2()
+  {
+    return kernel;
+  }
+  typedef Matrix type2;
+  void runMessage()
+  {
+    debMsg("Executing kernel apply1DKernelDirY ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {  // 3D: the TBB range is a span of k slices
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, in, out, kernel);
+    }
+    else {  // 2D: the TBB range is a span of j rows at k = 0
+      const int k = 0;
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = 0; i < _maxX; i++)
+          op(i, j, k, in, out, kernel);
+    }
+  }
+  void run()
+  {
+    if (maxZ > 1)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
+  }
+  const MACGrid &in;
+  MACGrid &out;
+  const Matrix &kernel;
+};
+
+//! Convolves in with a 1D kernel (centred at the kernel's midpoint) along the z-direction,
+//! accumulating into out (out must be a grid of zeros)
+struct apply1DKernelDirZ : public KernelBase {
+  apply1DKernelDirZ(const MACGrid &in, MACGrid &out, const Matrix &kernel)
+      : KernelBase(&in, 0), in(in), out(out), kernel(kernel)
+  {
+    runMessage();
+    run();  // the convolution runs as part of construction
+  }
+  inline void op(int i, int j, int k, const MACGrid &in, MACGrid &out, const Matrix &kernel) const
+  {
+    const int sizeZ = in.getSizeZ();
+    const int taps = kernel.n;
+    const int first = k - taps / 2;  // lowest z index the kernel window reaches
+    for (int t = 0; t < taps; t++) {  // window runs upwards while the kernel is read backwards
+      const int src = first + t;
+      if (src < 0)
+        continue;  // window still below the grid
+      if (src >= sizeZ)
+        break;  // all remaining taps lie beyond the grid
+      out(i, j, k) += in(i, j, src) * kernel(0, taps - 1 - t);
+    }
+  }
+  inline const MACGrid &getArg0()
+  {
+    return in;
+  }
+  typedef MACGrid type0;
+  inline MACGrid &getArg1()
+  {
+    return out;
+  }
+  typedef MACGrid type1;
+  inline const Matrix &getArg2()
+  {
+    return kernel;
+  }
+  typedef Matrix type2;
+  void runMessage()
+  {
+    debMsg("Executing kernel apply1DKernelDirZ ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {  // 3D: the TBB range is a span of k slices
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, in, out, kernel);
+    }
+    else {  // 2D: the TBB range is a span of j rows at k = 0
+      const int k = 0;
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = 0; i < _maxX; i++)
+          op(i, j, k, in, out, kernel);
+    }
+  }
+  void run()
+  {
+    if (maxZ > 1)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
+  }
+  const MACGrid &in;
+  MACGrid &out;
+  const Matrix &kernel;
+};
+
+//! Blurs grid in place with a separable kernel (x pass, then y pass). Cells that are
+//! obstacles, or whose -x / -y neighbour is an obstacle, keep their original value.
+void applySeparableKernel2D(MACGrid &grid, const FlagGrid &flags, const Matrix &kernel)
+{
+  FluidSolver *solver = grid.getParent();
+  MACGrid backup(solver);  // unblurred copy, used below to restore cells next to obstacles
+  backup.copyFrom(grid);
+  MACGrid passX(solver);
+  apply1DKernelDirX(grid, passX, kernel);
+  MACGrid passXY(solver);
+  apply1DKernelDirY(passX, passXY, kernel);
+  grid.copyFrom(passXY);
+
+  FOR_IJK(grid)
+  {
+    const bool restore = (i > 0 && flags.isObstacle(i - 1, j, k)) ||
+                         (j > 0 && flags.isObstacle(i, j - 1, k)) || flags.isObstacle(i, j, k);
+    if (restore) {
+      grid(i, j, k).x = backup(i, j, k).x;
+      grid(i, j, k).y = backup(i, j, k).y;
+      grid(i, j, k).z = backup(i, j, k).z;
+    }
+  }
+}
+
+//! Blurs grid in place with a separable kernel (x, y, then z pass). Cells that are obstacles,
+//! or whose -x / -y / -z neighbour is an obstacle, keep their original value.
+void applySeparableKernel3D(MACGrid &grid, const FlagGrid &flags, const Matrix &kernel)
+{
+  FluidSolver *solver = grid.getParent();
+  MACGrid backup(solver);  // unblurred copy, used below to restore cells next to obstacles
+  backup.copyFrom(grid);
+  MACGrid passX(solver);
+  apply1DKernelDirX(grid, passX, kernel);
+  MACGrid passXY(solver);
+  apply1DKernelDirY(passX, passXY, kernel);
+  MACGrid passXYZ(solver);
+  apply1DKernelDirZ(passXY, passXYZ, kernel);
+  grid.copyFrom(passXYZ);
+  FOR_IJK(grid)
+  {
+    const bool restore = (i > 0 && flags.isObstacle(i - 1, j, k)) ||
+                         (j > 0 && flags.isObstacle(i, j - 1, k)) ||
+                         (k > 0 && flags.isObstacle(i, j, k - 1)) || flags.isObstacle(i, j, k);
+    if (restore) {
+      grid(i, j, k).x = backup(i, j, k).x;
+      grid(i, j, k).y = backup(i, j, k).y;
+      grid(i, j, k).z = backup(i, j, k).z;
+    }
+  }
+}
+
+//! Dispatches to the 2D or 3D separable blur depending on the dimensionality of grid
+void applySeparableKernel(MACGrid &grid, const FlagGrid &flags, const Matrix &kernel)
+{
+  if (grid.is3D())
+    applySeparableKernel3D(grid, flags, kernel);
+  else
+    applySeparableKernel2D(grid, flags, kernel);
+}
+
+//! r-norm for the stopping criterion: getMaxAbs() of the difference x - z
+Real getRNorm(const MACGrid &x, const MACGrid &z)
+{
+  MACGrid residual(x.getParent());
+  residual.copyFrom(x);
+  residual.sub(z);
+  return residual.getMaxAbs();
+}
+
+//! s-norm for the stopping criterion: getMaxAbs() of (z_prev - z) scaled by rho
+Real getSNorm(const Real rho, const MACGrid &z, const MACGrid &z_prev)
+{
+  MACGrid scaledStep(z_prev.getParent());
+  scaledStep.copyFrom(z_prev);
+  scaledStep.sub(z);
+  scaledStep.multConst(rho);
+  return scaledStep.getMaxAbs();
+}
+
+//! Primal eps: sqrt(dim) * eps_abs + eps_rel * max(x.getMaxAbs(), z.getMaxAbs()), dim = 2 or 3
+Real getEpsPri(const Real eps_abs, const Real eps_rel, const MACGrid &x, const MACGrid &z)
+{
+  const double dimFactor = sqrt(x.is3D() ? 3.0 : 2.0);
+  const Real largest = max(x.getMaxAbs(), z.getMaxAbs());
+  return dimFactor * eps_abs + eps_rel * largest;
+}
+
+//! Dual eps: sqrt(dim) * eps_abs + eps_rel * y.getMaxAbs(), with dim = 2 or 3
+Real getEpsDual(const Real eps_abs, const Real eps_rel, const MACGrid &y)
+{
+  const double dimFactor = y.is3D() ? sqrt(3.0) : sqrt(2.0);
+  return dimFactor * eps_abs + eps_rel * y.getMaxAbs();
+}
+
+//! Create a spiral velocity field in 2D as a test scene (optionally on every z slice).
+//! The whole vel grid is multiplied by strength at the end.
+void getSpiralVelocity(const FlagGrid &flags,
+                       MACGrid &vel,
+                       Real strength = 1.0,
+                       bool with3D = false)
+{
+  const int nx = flags.getSizeX(), ny = flags.getSizeY();
+  const int nz = with3D ? flags.getSizeZ() : 1;  // without with3D only slice k = 0 is written
+  const Real midX = 0.5 * (Real)(nx - 1);
+  const Real midY = 0.5 * (Real)(ny - 1);
+  for (int k = 0; k < nz; k++) {  // same k / j / i nesting as the kernels in this file
+    for (int j = 0; j < ny; j++) {
+      const Real diffY = midY - j;
+      for (int i = 0; i < nx; i++) {
+        const Real diffX = midX - i;
+        const Real hypotenuse = sqrt(diffX * diffX + diffY * diffY);
+        if (hypotenuse > 0) {  // the exact centre has no direction and is left untouched
+          // auto keeps the index in the width returned by index() instead of narrowing to int
+          const auto idx = flags.index(i, j, k);
+          vel[idx].x = diffY / hypotenuse;
+          vel[idx].y = -diffX / hypotenuse;
+        }
+      }
+    }
+  }
+  vel.multConst(strength);
+}
+static PyObject *_W_0(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // generated Python entry point for getSpiralVelocity
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional keyword, default 0
+    pbPreparePlugin(parent, "getSpiralVelocity", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // lives only for this inner block, i.e. it is gone before pbFinalizePlugin
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 0, &_lock);
+      MACGrid &vel = *_args.getPtr<MACGrid>("vel", 1, &_lock);
+      Real strength = _args.getOpt<Real>("strength", 2, 1.0, &_lock);  // optional, default 1.0
+      bool with3D = _args.getOpt<bool>("with3D", 3, false, &_lock);  // optional, default false
+      _retval = getPyNone();  // the plugin has no return value
+      getSpiralVelocity(flags, vel, strength, with3D);
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "getSpiralVelocity", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("getSpiralVelocity", e.what());  // hand the exception text to pbSetError
+    return 0;  // a null PyObject* is returned on failure
+  }
+}
+static const Pb::Register _RP_getSpiralVelocity("", "getSpiralVelocity", _W_0);  // ties the name to wrapper _W_0
+extern "C" {
+void PbRegister_getSpiralVelocity()  // C-linkage hook whose body only references the registration object
+{
+  KEEP_UNUSED(_RP_getSpiralVelocity);
+}
+}
+
+//! Set the guiding weight W as a gradient in the y-direction: rows minY..maxY blend linearly
+//! from valAtMin (at minY) to valAtMax (at maxY); rows outside that range are left untouched.
+void setGradientYWeight(
+    Grid<Real> &W, const int minY, const int maxY, const Real valAtMin, const Real valAtMax)
+{
+  // A one-row range (minY == maxY) would interpolate with 0 / 0 and store NaN, so it is
+  // treated like the constant case and receives valAtMin.
+  const bool interpolate = (valAtMax != valAtMin) && (maxY != minY);
+  FOR_IJK(W)
+  {
+    if (minY <= j && j <= maxY) {
+      const Real ratio = interpolate ? (Real)(j - minY) / (Real)(maxY - minY) : (Real)0;
+      W(i, j, k) = interpolate ? ratio * valAtMax + (1.0 - ratio) * valAtMin : valAtMin;
+    }
+  }
+}
+static PyObject *_W_1(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // generated Python entry point for setGradientYWeight
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional keyword, default 0
+    pbPreparePlugin(parent, "setGradientYWeight", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // lives only for this inner block, i.e. it is gone before pbFinalizePlugin
+      Grid<Real> &W = *_args.getPtr<Grid<Real>>("W", 0, &_lock);
+      const int minY = _args.get<int>("minY", 1, &_lock);  // fetched with get<>, no default supplied
+      const int maxY = _args.get<int>("maxY", 2, &_lock);
+      const Real valAtMin = _args.get<Real>("valAtMin", 3, &_lock);
+      const Real valAtMax = _args.get<Real>("valAtMax", 4, &_lock);
+      _retval = getPyNone();  // the plugin has no return value
+      setGradientYWeight(W, minY, maxY, valAtMin, valAtMax);
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "setGradientYWeight", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("setGradientYWeight", e.what());  // hand the exception text to pbSetError
+    return 0;  // a null PyObject* is returned on failure
+  }
+}
+static const Pb::Register _RP_setGradientYWeight("", "setGradientYWeight", _W_1);  // ties the name to wrapper _W_1
+extern "C" {
+void PbRegister_setGradientYWeight()  // C-linkage hook whose body only references the registration object
+{
+  KEEP_UNUSED(_RP_setGradientYWeight);
+}
+}
+
+// *****************************************************************************
+// More helper functions for fluid guiding
+
+//! Apply Gaussian blur (either 2D or 3D) in a separable way, using the given 1D kernel
+void applySeparableGaussianBlur(MACGrid &grid, const FlagGrid &flags, const Matrix &kernel1D)
+{
+  assertMsg(gBlurPrecomputed, "Error - blur kernel not precomputed");  // flag is set by ADMM_precompute_Separable
+  applySeparableKernel(grid, flags, kernel1D);
+}
+
+//! Builds and caches the 1D blur kernel for blurRadius; later calls must pass the same radius
+void ADMM_precompute_Separable(int blurRadius)
+{
+  if (!gBlurPrecomputed) {
+    const int taps = 2 * blurRadius + 1;  // also passed as the standard deviation
+    gBlurKernel = get1DGaussianBlurKernel(taps, taps);
+    gBlurKernelRadius = blurRadius;
+    gBlurPrecomputed = true;
+    return;
+  }
+  assertMsg(gBlurKernelRadius == blurRadius,
+            "More than a single blur radius not supported at the moment.");
+}
+
+//! Apply approximate multiplication of inverse(M); v is overwritten with the result
+void applyApproxInvM(MACGrid &v, const FlagGrid &flags, const MACGrid &invA)
+{
+  MACGrid correction(v.getParent());
+  correction.copyFrom(v);
+  correction.mult(invA);
+  for (int pass = 0; pass < 2; pass++)  // the cached blur is applied twice
+    applySeparableGaussianBlur(correction, flags, gBlurKernel);
+  correction.multConst(2.0);
+  correction.mult(invA);
+  v.mult(invA);
+  v.sub(correction);
+}
+
+//! Precompute Q, a quantity reused in every PD iteration
+//! Q = 2*G*G*(velT-velC)-sigma*velC, where G is one application of the separable blur
+void precomputeQ(MACGrid &Q,
+                 const FlagGrid &flags,
+                 const MACGrid &velT_region,
+                 const MACGrid &velC,
+                 const Matrix &gBlurKernel,  // note: this parameter shadows the global gBlurKernel
+                 const Real sigma)
+{
+  Q.copyFrom(velT_region);
+  Q.sub(velC);  // Q = velT - velC
+  applySeparableGaussianBlur(Q, flags, gBlurKernel);
+  applySeparableGaussianBlur(Q, flags, gBlurKernel);  // blur applied a second time (the G*G factor)
+  Q.multConst(2.0);
+  Q.addScaled(velC, -sigma);  // final term: - sigma * velC
+}
+
+//! Precompute inverse(A), a reused quantity in the PD iterations.
+//! Per cell: A = 2*weight^2 + sigma; invA stores 1/A in its x, y and z components.
+void precomputeInvA(MACGrid &invA, const Grid<Real> &weight, const Real sigma)
+{
+  FOR_IJK(invA)
+  {
+    const Real w = weight(i, j, k);
+    const Real diag = 2 * w * w + sigma;
+    // A is clamped from below at 0.01 before it is inverted.
+    const Real recip = 1.0 / ((diag < 0.01) ? (Real)0.01 : diag);
+    invA(i, j, k).x = recip;
+    invA(i, j, k).y = recip;
+    invA(i, j, k).z = recip;
+  }
+}
+
+//! Proximal operator of f (guiding); v is overwritten with the result
+void prox_f(MACGrid &v,
+            const FlagGrid &flags,
+            const MACGrid &Q,
+            const MACGrid &velC,
+            const Real sigma,
+            const MACGrid &invA)
+{
+  v.multConst(sigma);
+  v.add(Q);  // v = sigma * v + Q
+  applyApproxInvM(v, flags, invA);  // uses the global gBlurKernel internally
+  v.add(velC);
+}
+
+// *****************************************************************************
+
+// Forward declaration only: re-uses the main pressure solve defined in pressure.cpp
+void solvePressure(MACGrid &vel,
+                   Grid<Real> &pressure,
+                   const FlagGrid &flags,
+                   Real cgAccuracy = 1e-3,
+                   const Grid<Real> *phi = 0,
+                   const Grid<Real> *perCellCorr = 0,
+                   const MACGrid *fractions = 0,
+                   const MACGrid *obvel = 0,
+                   Real gfClamp = 1e-04,
+                   Real cgMaxIterFac = 1.5,
+                   bool precondition = true,
+                   int preconditioner = 1,
+                   bool enforceCompatibility = false,
+                   bool useL2Norm = false,
+                   bool zeroPressureFixing = false,
+                   const Grid<Real> *curv = NULL,
+                   const Real surfTens = 0.0,
+                   Grid<Real> *retRhs = NULL);  // PD_fluid_guiding omits retRhs and so relies on this default
+
+//! Main function for fluid guiding; each iteration includes a "regular" pressure solve
+
+void PD_fluid_guiding(MACGrid &vel,  // in: current velocity, out: overwritten with the guided result
+                      MACGrid &velT,  // passed to precomputeQ as the target-velocity term
+                      Grid<Real> &pressure,
+                      FlagGrid &flags,
+                      Grid<Real> &weight,  // per-cell guiding weight, consumed by precomputeInvA
+                      int blurRadius = 5,
+                      Real theta = 1.0,
+                      Real tau = 1.0,
+                      Real sigma = 1.0,  // the x-update divides by sigma
+                      Real epsRel = 1e-3,
+                      Real epsAbs = 1e-3,
+                      int maxIters = 200,
+                      Grid<Real> *phi = 0,
+                      Grid<Real> *perCellCorr = 0,
+                      MACGrid *fractions = 0,
+                      MACGrid *obvel = 0,
+                      Real gfClamp = 1e-04,
+                      Real cgMaxIterFac = 1.5,
+                      Real cgAccuracy = 1e-3,
+                      int preconditioner = 1,
+                      bool zeroPressureFixing = false,
+                      const Grid<Real> *curv = NULL,
+                      const Real surfTens = 0.)
+{
+  FluidSolver *parent = vel.getParent();
+
+  // initialize dual/slack variables (initial contents are whatever the MACGrid constructor provides)
+  MACGrid velC = MACGrid(parent);
+  velC.copyFrom(vel);  // velC keeps the incoming velocity for the whole solve
+  MACGrid x = MACGrid(parent);
+  MACGrid y = MACGrid(parent);
+  MACGrid z = MACGrid(parent);
+  MACGrid x0 = MACGrid(parent);  // previous-iteration copy of x
+  MACGrid z0 = MACGrid(parent);  // previous-iteration copy of z
+
+  // precomputation: cached blur kernel, Q and invA stay fixed during the iterations
+  ADMM_precompute_Separable(blurRadius);
+  MACGrid Q = MACGrid(parent);
+  precomputeQ(Q, flags, velT, velC, gBlurKernel, sigma);
+  MACGrid invA = MACGrid(parent);
+  precomputeInvA(invA, weight, sigma);
+
+  // loop
+  int iter = 0;
+  for (iter = 0; iter < maxIters; iter++) {
+    // x-update: x = x0 + sigma * (y - prox_f(x0 / sigma + y))
+    x0.copyFrom(x);
+    x.multConst(1.0 / sigma);
+    x.add(y);
+    prox_f(x, flags, Q, velC, sigma, invA);
+    x.multConst(-sigma);
+    x.addScaled(y, sigma);
+    x.add(x0);
+
+    // z-update: z = z0 - tau * x, then handed to the pressure solve
+    z0.copyFrom(z);
+    z.addScaled(x, -tau);
+    Real cgAccuracyAdaptive = cgAccuracy;  // currently just a copy of cgAccuracy
+
+    solvePressure(z,
+                  pressure,
+                  flags,
+                  cgAccuracyAdaptive,
+                  phi,
+                  perCellCorr,
+                  fractions,
+                  obvel,
+                  gfClamp,
+                  cgMaxIterFac,
+                  true,  // precondition
+                  preconditioner,
+                  false,  // enforceCompatibility
+                  false,  // useL2Norm
+                  zeroPressureFixing,
+                  curv,
+                  surfTens);
+
+    // y-update: y = z + theta * (z - z0)
+    y.copyFrom(z);
+    y.sub(z0);
+    y.multConst(theta);
+    y.add(z);
+
+    // stopping criterion: from the second iteration on, stop once max|z - z0| drops below getEpsDual
+    bool stop = (iter > 0 && getRNorm(z, z0) < getEpsDual(epsAbs, epsRel, z));
+
+    if (stop || (iter == maxIters - 1))  // breaking on the last pass keeps iter at maxIters - 1
+      break;
+  }
+
+  // vel_new = z
+  vel.copyFrom(z);
+
+  debMsg("PD_fluid_guiding iterations:" << iter, 1);  // prints the 0-based index of the last iteration run
+}
+static PyObject *_W_2(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // generated Python entry point for PD_fluid_guiding
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional keyword, default 0
+    pbPreparePlugin(parent, "PD_fluid_guiding", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // lives only for this inner block, i.e. it is gone before pbFinalizePlugin
+      MACGrid &vel = *_args.getPtr<MACGrid>("vel", 0, &_lock);  // positions 0-4 are fetched with getPtr, no default
+      MACGrid &velT = *_args.getPtr<MACGrid>("velT", 1, &_lock);
+      Grid<Real> &pressure = *_args.getPtr<Grid<Real>>("pressure", 2, &_lock);
+      FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 3, &_lock);
+      Grid<Real> &weight = *_args.getPtr<Grid<Real>>("weight", 4, &_lock);
+      int blurRadius = _args.getOpt<int>("blurRadius", 5, 5, &_lock);  // from here on: optional, defaults mirror the C++ signature
+      Real theta = _args.getOpt<Real>("theta", 6, 1.0, &_lock);
+      Real tau = _args.getOpt<Real>("tau", 7, 1.0, &_lock);
+      Real sigma = _args.getOpt<Real>("sigma", 8, 1.0, &_lock);
+      Real epsRel = _args.getOpt<Real>("epsRel", 9, 1e-3, &_lock);
+      Real epsAbs = _args.getOpt<Real>("epsAbs", 10, 1e-3, &_lock);
+      int maxIters = _args.getOpt<int>("maxIters", 11, 200, &_lock);
+      Grid<Real> *phi = _args.getPtrOpt<Grid<Real>>("phi", 12, 0, &_lock);  // optional grids default to null
+      Grid<Real> *perCellCorr = _args.getPtrOpt<Grid<Real>>("perCellCorr", 13, 0, &_lock);
+      MACGrid *fractions = _args.getPtrOpt<MACGrid>("fractions", 14, 0, &_lock);
+      MACGrid *obvel = _args.getPtrOpt<MACGrid>("obvel", 15, 0, &_lock);
+      Real gfClamp = _args.getOpt<Real>("gfClamp", 16, 1e-04, &_lock);
+      Real cgMaxIterFac = _args.getOpt<Real>("cgMaxIterFac", 17, 1.5, &_lock);
+      Real cgAccuracy = _args.getOpt<Real>("cgAccuracy", 18, 1e-3, &_lock);
+      int preconditioner = _args.getOpt<int>("preconditioner", 19, 1, &_lock);
+      bool zeroPressureFixing = _args.getOpt<bool>("zeroPressureFixing", 20, false, &_lock);
+      const Grid<Real> *curv = _args.getPtrOpt<Grid<Real>>("curv", 21, NULL, &_lock);
+      const Real surfTens = _args.getOpt<Real>("surfTens", 22, 0., &_lock);
+      _retval = getPyNone();  // the plugin has no return value
+      PD_fluid_guiding(vel,
+                       velT,
+                       pressure,
+                       flags,
+                       weight,
+                       blurRadius,
+                       theta,
+                       tau,
+                       sigma,
+                       epsRel,
+                       epsAbs,
+                       maxIters,
+                       phi,
+                       perCellCorr,
+                       fractions,
+                       obvel,
+                       gfClamp,
+                       cgMaxIterFac,
+                       cgAccuracy,
+                       preconditioner,
+                       zeroPressureFixing,
+                       curv,
+                       surfTens);
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "PD_fluid_guiding", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("PD_fluid_guiding", e.what());  // hand the exception text to pbSetError
+    return 0;  // a null PyObject* is returned on failure
+  }
+}
+static const Pb::Register _RP_PD_fluid_guiding("", "PD_fluid_guiding", _W_2);  // ties the name to wrapper _W_2
+extern "C" {
+void PbRegister_PD_fluid_guiding()  // C-linkage hook whose body only references the registration object
+{
+  KEEP_UNUSED(_RP_PD_fluid_guiding);
+}
+}
+
+//! Reset the cached blur precomputation so the next ADMM_precompute_Separable call rebuilds it
+void releaseBlurPrecomp()
+{
+  gBlurPrecomputed = false;
+  gBlurKernelRadius = -1;  // same sentinel as the initial value of the global
+  gBlurKernel = 0.f;  // NOTE(review): relies on a Matrix conversion/assignment from float - confirm what it leaves behind
+}
+static PyObject *_W_3(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // generated Python entry point for releaseBlurPrecomp
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional keyword, default 0
+    pbPreparePlugin(parent, "releaseBlurPrecomp", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // declared for symmetry with the other wrappers; no argument is fetched here
+      _retval = getPyNone();  // the plugin has no return value
+      releaseBlurPrecomp();
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "releaseBlurPrecomp", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("releaseBlurPrecomp", e.what());  // hand the exception text to pbSetError
+    return 0;  // a null PyObject* is returned on failure
+  }
+}
+static const Pb::Register _RP_releaseBlurPrecomp("", "releaseBlurPrecomp", _W_3);  // ties the name to wrapper _W_3
+extern "C" {
+void PbRegister_releaseBlurPrecomp()  // C-linkage hook whose body only references the registration object
+{
+  KEEP_UNUSED(_RP_releaseBlurPrecomp);
+}
+}
+
+}  // namespace Manta
diff --git a/extern/mantaflow/preprocessed/plugin/initplugins.cpp b/extern/mantaflow/preprocessed/plugin/initplugins.cpp
new file mode 100644
index 00000000000..3e28c947424
--- /dev/null
+++ b/extern/mantaflow/preprocessed/plugin/initplugins.cpp
@@ -0,0 +1,2317 @@
+
+
+// DO NOT EDIT !
+// This file is generated using the MantaFlow preprocessor (prep generate).
+
+/******************************************************************************
+ *
+ * MantaFlow fluid solver framework
+ * Copyright 2011 Tobias Pfaff, Nils Thuerey
+ *
+ * This program is free software, distributed under the terms of the
+ * Apache License, Version 2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Tools to setup fields and inflows
+ *
+ ******************************************************************************/
+
+#include "vectorbase.h"
+#include "shapes.h"
+#include "commonkernels.h"
+#include "particle.h"
+#include "noisefield.h"
+#include "simpleimage.h"
+#include "mesh.h"
+
+using namespace std;
+
+namespace Manta {
+
+//! Raises density to a noise-modulated target value in fluid cells whose sdf value is <= sigma
+
+struct KnApplyNoiseInfl : public KernelBase {
+  KnApplyNoiseInfl(const FlagGrid &flags,
+                   Grid<Real> &density,
+                   const WaveletNoiseField &noise,
+                   const Grid<Real> &sdf,
+                   Real scale,
+                   Real sigma)
+      : KernelBase(&flags, 0),
+        flags(flags),
+        density(density),
+        noise(noise),
+        sdf(sdf),
+        scale(scale),
+        sigma(sigma)
+  {
+    runMessage();
+    run();  // the kernel runs as part of construction
+  }
+  inline void op(int i,
+                 int j,
+                 int k,
+                 const FlagGrid &flags,
+                 Grid<Real> &density,
+                 const WaveletNoiseField &noise,
+                 const Grid<Real> &sdf,
+                 Real scale,
+                 Real sigma) const
+  {
+    if (!flags.isFluid(i, j, k) || sdf(i, j, k) > sigma)
+      return;  // only fluid cells with sdf <= sigma are touched
+    // for sigma > 0 the falloff is 1 at sdf == -sigma, 0.5 at sdf == 0 and 0 at sdf == sigma
+    const Real falloff = clamp(1.0 - 0.5 / sigma * (sdf(i, j, k) + sigma), 0.0, 1.0);
+    const Real target = noise.evaluate(Vec3(i, j, k)) * scale * falloff;
+    if (target > density(i, j, k))
+      density(i, j, k) = target;  // density is only ever raised, never lowered
+  }
+  inline const FlagGrid &getArg0()
+  {
+    return flags;
+  }
+  typedef FlagGrid type0;
+  inline Grid<Real> &getArg1()
+  {
+    return density;
+  }
+  typedef Grid<Real> type1;
+  inline const WaveletNoiseField &getArg2()
+  {
+    return noise;
+  }
+  typedef WaveletNoiseField type2;
+  inline const Grid<Real> &getArg3()
+  {
+    return sdf;
+  }
+  typedef Grid<Real> type3;
+  inline Real &getArg4()
+  {
+    return scale;
+  }
+  typedef Real type4;
+  inline Real &getArg5()
+  {
+    return sigma;
+  }
+  typedef Real type5;
+  void runMessage()
+  {
+    debMsg("Executing kernel KnApplyNoiseInfl ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {  // 3D: the TBB range is a span of k slices
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, flags, density, noise, sdf, scale, sigma);
+    }
+    else {  // 2D: the TBB range is a span of j rows at k = 0
+      const int k = 0;
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = 0; i < _maxX; i++)
+          op(i, j, k, flags, density, noise, sdf, scale, sigma);
+    }
+  }
+  void run()
+  {
+    if (maxZ > 1)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
+  }
+  const FlagGrid &flags;
+  Grid<Real> &density;
+  const WaveletNoiseField &noise;
+  const Grid<Real> &sdf;
+  Real scale;
+  Real sigma;
+};
+
+//! Init noise-modulated density inside shape (level set of shape drives KnApplyNoiseInfl)
+
+void densityInflow(const FlagGrid &flags,
+                   Grid<Real> &density,
+                   const WaveletNoiseField &noise,
+                   Shape *shape,
+                   Real scale = 1.0,
+                   Real sigma = 0)  // NOTE(review): the kernel evaluates 0.5 / sigma - confirm that 0 is an intended default
+{
+  Grid<Real> sdf = shape->computeLevelset();  // shape is dereferenced without a null check
+  KnApplyNoiseInfl(flags, density, noise, sdf, scale, sigma);  // constructing the kernel object runs it
+}
+static PyObject *_W_0(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // generated Python entry point for densityInflow
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional keyword, default 0
+    pbPreparePlugin(parent, "densityInflow", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // lives only for this inner block, i.e. it is gone before pbFinalizePlugin
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 0, &_lock);
+      Grid<Real> &density = *_args.getPtr<Grid<Real>>("density", 1, &_lock);
+      const WaveletNoiseField &noise = *_args.getPtr<WaveletNoiseField>("noise", 2, &_lock);
+      Shape *shape = _args.getPtr<Shape>("shape", 3, &_lock);  // kept as a pointer, matching the C++ signature
+      Real scale = _args.getOpt<Real>("scale", 4, 1.0, &_lock);  // optional, default 1.0
+      Real sigma = _args.getOpt<Real>("sigma", 5, 0, &_lock);  // optional, default 0
+      _retval = getPyNone();  // the plugin has no return value
+      densityInflow(flags, density, noise, shape, scale, sigma);
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "densityInflow", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("densityInflow", e.what());  // hand the exception text to pbSetError
+    return 0;  // a null PyObject* is returned on failure
+  }
+}
+static const Pb::Register _RP_densityInflow("", "densityInflow", _W_0);  // ties the name to wrapper _W_0
+extern "C" {
+void PbRegister_densityInflow()  // C-linkage hook whose body only references the registration object
+{
+  KEEP_UNUSED(_RP_densityInflow);
+}
+}
+
+//! Adds scaled noise to density in fluid cells; an optional SDF excludes cells with sdf > 0
+struct KnAddNoise : public KernelBase {
+  KnAddNoise(const FlagGrid &flags,
+             Grid<Real> &density,
+             const WaveletNoiseField &noise,
+             const Grid<Real> *sdf,
+             Real scale)
+      : KernelBase(&flags, 0), flags(flags), density(density), noise(noise), sdf(sdf), scale(scale)
+  {
+    runMessage();
+    run();  // the kernel runs as part of construction
+  }
+  inline void op(int i,
+                 int j,
+                 int k,
+                 const FlagGrid &flags,
+                 Grid<Real> &density,
+                 const WaveletNoiseField &noise,
+                 const Grid<Real> *sdf,
+                 Real scale) const
+  {
+    const bool skip = !flags.isFluid(i, j, k) || (sdf && (*sdf)(i, j, k) > 0.);
+    if (!skip)  // a null sdf means no masking beyond the fluid test
+      density(i, j, k) += noise.evaluate(Vec3(i, j, k)) * scale;
+  }
+  inline const FlagGrid &getArg0()
+  {
+    return flags;
+  }
+  typedef FlagGrid type0;
+  inline Grid<Real> &getArg1()
+  {
+    return density;
+  }
+  typedef Grid<Real> type1;
+  inline const WaveletNoiseField &getArg2()
+  {
+    return noise;
+  }
+  typedef WaveletNoiseField type2;
+  inline const Grid<Real> *getArg3()
+  {
+    return sdf;
+  }
+  typedef Grid<Real> type3;
+  inline Real &getArg4()
+  {
+    return scale;
+  }
+  typedef Real type4;
+  void runMessage()
+  {
+    debMsg("Executing kernel KnAddNoise ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {  // 3D: the TBB range is a span of k slices
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, flags, density, noise, sdf, scale);
+    }
+    else {  // 2D: the TBB range is a span of j rows at k = 0
+      const int k = 0;
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = 0; i < _maxX; i++)
+          op(i, j, k, flags, density, noise, sdf, scale);
+    }
+  }
+  void run()
+  {
+    if (maxZ > 1)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
+  }
+  const FlagGrid &flags;
+  Grid<Real> &density;
+  const WaveletNoiseField &noise;
+  const Grid<Real> *sdf;
+  Real scale;
+};
+//! Plugin entry point: add scaled noise to `density` by running the KnAddNoise kernel.
+//! `sdf` is optional (NULL = no mask); `scale` multiplies every noise sample.
+void addNoise(const FlagGrid &flags,
+              Grid<Real> &density,
+              const WaveletNoiseField &noise,
+              const Grid<Real> *sdf = NULL,
+              Real scale = 1.0)
+{
+  // constructing the kernel object executes it
+  KnAddNoise kernel(flags, density, noise, sdf, scale);
+}
+static PyObject *_W_1(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // Python-callable wrapper around addNoise()
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional flag, default 0; its negation goes to pbPreparePlugin/pbFinalizePlugin
+    pbPreparePlugin(parent, "addNoise", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // handed to every argument getter in this scope
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 0, &_lock);
+      Grid<Real> &density = *_args.getPtr<Grid<Real>>("density", 1, &_lock);
+      const WaveletNoiseField &noise = *_args.getPtr<WaveletNoiseField>("noise", 2, &_lock);
+      const Grid<Real> *sdf = _args.getPtrOpt<Grid<Real>>("sdf", 3, NULL, &_lock);  // optional, NULL when omitted
+      Real scale = _args.getOpt<Real>("scale", 4, 1.0, &_lock);
+      _retval = getPyNone();  // addNoise() returns void, so the result is None
+      addNoise(flags, density, noise, sdf, scale);
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "addNoise", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("addNoise", e.what());  // forward the exception text under the plugin name
+    return 0;  // null result on failure
+  }
+}
+static const Pb::Register _RP_addNoise("", "addNoise", _W_1);  // static Register object pairing the name "addNoise" with wrapper _W_1
+extern "C" {
+void PbRegister_addNoise()  // C-linkage hook; its only action is to reference the Register object
+{
+  KEEP_UNUSED(_RP_addNoise);  // presumably keeps the static object from being discarded - confirm against the macro
+}
+}
+
+//! Kernel: overwrite each pdata entry with the noise value sampled at the particle
+//! position, multiplied by `scale`. The kernel runs from its constructor.
+
+template<class T> struct knSetPdataNoise : public KernelBase {
+  knSetPdataNoise(const BasicParticleSystem &parts,
+                  ParticleDataImpl<T> &pdata,
+                  const WaveletNoiseField &noise,
+                  Real scale)
+      : KernelBase(parts.size()), parts(parts), pdata(pdata), noise(noise), scale(scale)
+  {
+    runMessage();
+    run();
+  }
+  //! Per-particle work: pdata[idx] = noise(position of particle idx) * scale.
+  inline void op(IndexInt idx,
+                 const BasicParticleSystem &parts,
+                 ParticleDataImpl<T> &pdata,
+                 const WaveletNoiseField &noise,
+                 Real scale) const
+  {
+    // the product is converted to T on assignment
+    pdata[idx] = noise.evaluate(parts.getPos(idx)) * scale;
+  }
+  // argument accessors (one getter + typedef per kernel argument)
+  inline const BasicParticleSystem &getArg0() { return parts; }
+  typedef BasicParticleSystem type0;
+  inline ParticleDataImpl<T> &getArg1() { return pdata; }
+  typedef ParticleDataImpl<T> type1;
+  inline const WaveletNoiseField &getArg2() { return noise; }
+  typedef WaveletNoiseField type2;
+  inline Real &getArg3() { return scale; }
+  typedef Real type3;
+  //! Emit debug messages with the kernel name and its size.
+  void runMessage()
+  {
+    debMsg("Executing kernel knSetPdataNoise ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  };
+  //! TBB body: process the particle indices of one sub-range.
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const IndexInt last = (IndexInt)__r.end();
+    for (IndexInt idx = __r.begin(); idx != last; idx++) {
+      op(idx, parts, pdata, noise, scale);
+    }
+  }
+  //! Launch the parallel loop over [0, size).
+  void run()
+  {
+    const tbb::blocked_range<IndexInt> all(0, size);
+    tbb::parallel_for(all, *this);
+  }
+  const BasicParticleSystem &parts;
+  ParticleDataImpl<T> &pdata;
+  const WaveletNoiseField &noise;
+  Real scale;
+};
+
+//! Kernel: like knSetPdataNoise, but samples the noise with evaluateVec() instead of evaluate().
+template<class T> struct knSetPdataNoiseVec : public KernelBase {
+  knSetPdataNoiseVec(const BasicParticleSystem &parts,
+                     ParticleDataImpl<T> &pdata,
+                     const WaveletNoiseField &noise,
+                     Real scale)
+      : KernelBase(parts.size()), parts(parts), pdata(pdata), noise(noise), scale(scale)
+  {
+    runMessage();
+    run();
+  }
+  //! Per-particle work: pdata[idx] = noise.evaluateVec(position of particle idx) * scale.
+  inline void op(IndexInt idx,
+                 const BasicParticleSystem &parts,
+                 ParticleDataImpl<T> &pdata,
+                 const WaveletNoiseField &noise,
+                 Real scale) const
+  {
+    pdata[idx] = noise.evaluateVec(parts.getPos(idx)) * scale;
+  }
+  // argument accessors (one getter + typedef per kernel argument)
+  inline const BasicParticleSystem &getArg0() { return parts; }
+  typedef BasicParticleSystem type0;
+  inline ParticleDataImpl<T> &getArg1() { return pdata; }
+  typedef ParticleDataImpl<T> type1;
+  inline const WaveletNoiseField &getArg2() { return noise; }
+  typedef WaveletNoiseField type2;
+  inline Real &getArg3() { return scale; }
+  typedef Real type3;
+  //! Emit debug messages with the kernel name and its size.
+  void runMessage()
+  {
+    debMsg("Executing kernel knSetPdataNoiseVec ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  };
+  //! TBB body: process the particle indices of one sub-range.
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const IndexInt last = (IndexInt)__r.end();
+    for (IndexInt idx = __r.begin(); idx != last; idx++) {
+      op(idx, parts, pdata, noise, scale);
+    }
+  }
+  //! Launch the parallel loop over [0, size).
+  void run()
+  {
+    const tbb::blocked_range<IndexInt> all(0, size);
+    tbb::parallel_for(all, *this);
+  }
+  const BasicParticleSystem &parts;
+  ParticleDataImpl<T> &pdata;
+  const WaveletNoiseField &noise;
+  Real scale;
+};
+//! Plugin entry point: fill a Real pdata field with scaled noise sampled at the particle positions.
+void setNoisePdata(const BasicParticleSystem &parts,
+                   ParticleDataImpl<Real> &pd,
+                   const WaveletNoiseField &noise,
+                   Real scale = 1.)
+{
+  // constructing the kernel object executes it
+  knSetPdataNoise<Real> kernel(parts, pd, noise, scale);
+}
+static PyObject *_W_2(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // Python-callable wrapper around setNoisePdata()
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional flag, default 0; its negation goes to pbPreparePlugin/pbFinalizePlugin
+    pbPreparePlugin(parent, "setNoisePdata", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // handed to every argument getter in this scope
+      const BasicParticleSystem &parts = *_args.getPtr<BasicParticleSystem>("parts", 0, &_lock);
+      ParticleDataImpl<Real> &pd = *_args.getPtr<ParticleDataImpl<Real>>("pd", 1, &_lock);
+      const WaveletNoiseField &noise = *_args.getPtr<WaveletNoiseField>("noise", 2, &_lock);
+      Real scale = _args.getOpt<Real>("scale", 3, 1., &_lock);
+      _retval = getPyNone();  // setNoisePdata() returns void, so the result is None
+      setNoisePdata(parts, pd, noise, scale);
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "setNoisePdata", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("setNoisePdata", e.what());  // forward the exception text under the plugin name
+    return 0;  // null result on failure
+  }
+}
+static const Pb::Register _RP_setNoisePdata("", "setNoisePdata", _W_2);  // static Register object pairing the name "setNoisePdata" with wrapper _W_2
+extern "C" {
+void PbRegister_setNoisePdata()  // C-linkage hook; its only action is to reference the Register object
+{
+  KEEP_UNUSED(_RP_setNoisePdata);  // presumably keeps the static object from being discarded - confirm against the macro
+}
+}
+
+//! Plugin entry point: fill a Vec3 pdata field with scaled vector noise sampled at the particle positions.
+void setNoisePdataVec3(const BasicParticleSystem &parts,
+                       ParticleDataImpl<Vec3> &pd,
+                       const WaveletNoiseField &noise,
+                       Real scale = 1.)
+{
+  // constructing the kernel object executes it
+  knSetPdataNoiseVec<Vec3> kernel(parts, pd, noise, scale);
+}
+static PyObject *_W_3(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // Python-callable wrapper around setNoisePdataVec3()
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional flag, default 0; its negation goes to pbPreparePlugin/pbFinalizePlugin
+    pbPreparePlugin(parent, "setNoisePdataVec3", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // handed to every argument getter in this scope
+      const BasicParticleSystem &parts = *_args.getPtr<BasicParticleSystem>("parts", 0, &_lock);
+      ParticleDataImpl<Vec3> &pd = *_args.getPtr<ParticleDataImpl<Vec3>>("pd", 1, &_lock);
+      const WaveletNoiseField &noise = *_args.getPtr<WaveletNoiseField>("noise", 2, &_lock);
+      Real scale = _args.getOpt<Real>("scale", 3, 1., &_lock);
+      _retval = getPyNone();  // setNoisePdataVec3() returns void, so the result is None
+      setNoisePdataVec3(parts, pd, noise, scale);
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "setNoisePdataVec3", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("setNoisePdataVec3", e.what());  // forward the exception text under the plugin name
+    return 0;  // null result on failure
+  }
+}
+static const Pb::Register _RP_setNoisePdataVec3("", "setNoisePdataVec3", _W_3);  // static Register object pairing the name "setNoisePdataVec3" with wrapper _W_3
+extern "C" {
+void PbRegister_setNoisePdataVec3()  // C-linkage hook; its only action is to reference the Register object
+{
+  KEEP_UNUSED(_RP_setNoisePdataVec3);  // presumably keeps the static object from being discarded - confirm against the macro
+}
+}
+
+//! Plugin entry point: fill an int pdata field with scaled noise sampled at the particle positions
+//! (the kernel assigns the scaled noise value to an int entry).
+void setNoisePdataInt(const BasicParticleSystem &parts,
+                      ParticleDataImpl<int> &pd,
+                      const WaveletNoiseField &noise,
+                      Real scale = 1.)
+{
+  // constructing the kernel object executes it
+  knSetPdataNoise<int> kernel(parts, pd, noise, scale);
+}
+static PyObject *_W_4(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // Python-callable wrapper around setNoisePdataInt()
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional flag, default 0; its negation goes to pbPreparePlugin/pbFinalizePlugin
+    pbPreparePlugin(parent, "setNoisePdataInt", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // handed to every argument getter in this scope
+      const BasicParticleSystem &parts = *_args.getPtr<BasicParticleSystem>("parts", 0, &_lock);
+      ParticleDataImpl<int> &pd = *_args.getPtr<ParticleDataImpl<int>>("pd", 1, &_lock);
+      const WaveletNoiseField &noise = *_args.getPtr<WaveletNoiseField>("noise", 2, &_lock);
+      Real scale = _args.getOpt<Real>("scale", 3, 1., &_lock);
+      _retval = getPyNone();  // setNoisePdataInt() returns void, so the result is None
+      setNoisePdataInt(parts, pd, noise, scale);
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "setNoisePdataInt", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("setNoisePdataInt", e.what());  // forward the exception text under the plugin name
+    return 0;  // null result on failure
+  }
+}
+static const Pb::Register _RP_setNoisePdataInt("", "setNoisePdataInt", _W_4);  // static Register object pairing the name "setNoisePdataInt" with wrapper _W_4
+extern "C" {
+void PbRegister_setNoisePdataInt()  // C-linkage hook; its only action is to reference the Register object
+{
+  KEEP_UNUSED(_RP_setNoisePdataInt);  // presumably keeps the static object from being discarded - confirm against the macro
+}
+}
+
+//! Gradient field of an obstacle SDF rebuilt from the flag grid (used by turbulence.py).
+//! The result is non-zero only where the SDF is negative and the gradient length exceeds 0.1;
+//! there it holds the normalized gradient multiplied by the SDF value.
+//  FIXME: slow, runs as plain loops instead of a kernel
+Grid<Vec3> obstacleGradient(const FlagGrid &flags)
+{
+  LevelsetGrid levelset(flags.getParent(), false);
+  Grid<Vec3> gradient(flags.getParent());
+
+  // seed the levelset: -0.5 in obstacle cells, +0.5 everywhere else
+  FOR_IDX(levelset)
+  {
+    if (flags.isObstacle(idx))
+      levelset[idx] = -0.5;
+    else
+      levelset[idx] = 0.5;
+  }
+  // turn the seed values into a distance field
+  levelset.reinitMarching(flags, 6.0, 0, true, false, FlagGrid::TypeReserved);
+
+  // raw gradient of the distance field
+  GradientOp(gradient, levelset);
+
+  // normalize, mask and weight by the SDF value
+  FOR_IDX(levelset)
+  {
+    Vec3 dir = gradient[idx];
+    const Real len = normalize(dir);
+    const Real phi = levelset[idx];
+    const bool discard = (len <= 0.1 || phi >= 0);
+    if (discard)
+      dir = Vec3(0.);
+    gradient[idx] = dir * phi;
+  }
+
+  return gradient;
+}
+static PyObject *_W_5(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // Python-callable wrapper around obstacleGradient()
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional flag, default 0; its negation goes to pbPreparePlugin/pbFinalizePlugin
+    pbPreparePlugin(parent, "obstacleGradient", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // handed to every argument getter in this scope
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 0, &_lock);
+      _retval = toPy(obstacleGradient(flags));  // the returned grid is converted with toPy
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "obstacleGradient", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("obstacleGradient", e.what());  // forward the exception text under the plugin name
+    return 0;  // null result on failure
+  }
+}
+static const Pb::Register _RP_obstacleGradient("", "obstacleGradient", _W_5);  // static Register object pairing the name "obstacleGradient" with wrapper _W_5
+extern "C" {
+void PbRegister_obstacleGradient()  // C-linkage hook; its only action is to reference the Register object
+{
+  KEEP_UNUSED(_RP_obstacleGradient);  // presumably keeps the static object from being discarded - confirm against the macro
+}
+}
+
+//! Obstacle SDF rebuilt from the flag grid (used by turbulence.py).
+//! Cells are seeded with -0.5 (obstacle) or +0.5 (anything else) and then re-initialized by marching.
+LevelsetGrid obstacleLevelset(const FlagGrid &flags)
+{
+  LevelsetGrid levelset(flags.getParent(), false);
+
+  // seed values: negative inside obstacles, positive outside
+  FOR_IDX(levelset)
+  {
+    if (flags.isObstacle(idx))
+      levelset[idx] = -0.5;
+    else
+      levelset[idx] = 0.5;
+  }
+  // turn the seed values into a distance field
+  levelset.reinitMarching(flags, 6.0, 0, true, false, FlagGrid::TypeReserved);
+
+  return levelset;
+}
+static PyObject *_W_6(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // Python-callable wrapper around obstacleLevelset()
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional flag, default 0; its negation goes to pbPreparePlugin/pbFinalizePlugin
+    pbPreparePlugin(parent, "obstacleLevelset", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // handed to every argument getter in this scope
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 0, &_lock);
+      _retval = toPy(obstacleLevelset(flags));  // the returned levelset is converted with toPy
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "obstacleLevelset", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("obstacleLevelset", e.what());  // forward the exception text under the plugin name
+    return 0;  // null result on failure
+  }
+}
+static const Pb::Register _RP_obstacleLevelset("", "obstacleLevelset", _W_6);  // static Register object pairing the name "obstacleLevelset" with wrapper _W_6
+extern "C" {
+void PbRegister_obstacleLevelset()  // C-linkage hook; its only action is to reference the Register object
+{
+  KEEP_UNUSED(_RP_obstacleLevelset);  // presumably keeps the static object from being discarded - confirm against the macro
+}
+}
+
+//*****************************************************************************
+// blender init functions
+
+//! Kernel: write emission values from `source` into `target`, either replacing the target
+//! value (isAbsolute) or adding to it. The kernel runs from its constructor.
+struct KnApplyEmission : public KernelBase {
+  KnApplyEmission(const FlagGrid &flags,
+                  Grid<Real> &target,
+                  const Grid<Real> &source,
+                  const Grid<Real> *emissionTexture,
+                  bool isAbsolute,
+                  int type)
+      : KernelBase(&flags, 0),
+        flags(flags),
+        target(target),
+        source(source),
+        emissionTexture(emissionTexture),
+        isAbsolute(isAbsolute),
+        type(type)
+  {
+    runMessage();
+    run();
+  }
+  //! Per-cell work: apply the source value unless the cell is filtered out.
+  inline void op(int i,
+                 int j,
+                 int k,
+                 const FlagGrid &flags,
+                 Grid<Real> &target,
+                 const Grid<Real> &source,
+                 const Grid<Real> *emissionTexture,
+                 bool isAbsolute,
+                 int type) const
+  {
+    // A cell is skipped only when BOTH filters reject it:
+    //  - a type mask is given and the cell is neither a requested inflow nor a requested outflow cell
+    //  - an emission texture is given and its value at the cell is zero
+    // (the texture filter matters for emission from particles)
+    // NOTE(review): the two filters are joined with a logical AND, so a type mask without a
+    // texture never skips a cell - confirm this is intended.
+    const bool isInflow = (type & FlagGrid::TypeInflow && flags.isInflow(i, j, k));
+    const bool isOutflow = (type & FlagGrid::TypeOutflow && flags.isOutflow(i, j, k));
+    const bool typeRejects = (type && !isInflow && !isOutflow);
+    if (typeRejects && (emissionTexture && !(*emissionTexture)(i, j, k)))
+      return;
+
+    if (!isAbsolute) {
+      target(i, j, k) += source(i, j, k);
+      return;
+    }
+    target(i, j, k) = source(i, j, k);
+  }
+  // argument accessors (one getter + typedef per kernel argument)
+  inline const FlagGrid &getArg0() { return flags; }
+  typedef FlagGrid type0;
+  inline Grid<Real> &getArg1() { return target; }
+  typedef Grid<Real> type1;
+  inline const Grid<Real> &getArg2() { return source; }
+  typedef Grid<Real> type2;
+  inline const Grid<Real> *getArg3() { return emissionTexture; }
+  typedef Grid<Real> type3;
+  inline bool &getArg4() { return isAbsolute; }
+  typedef bool type4;
+  inline int &getArg5() { return type; }
+  typedef int type5;
+  //! Emit debug messages with the kernel name and its index range.
+  void runMessage()
+  {
+    debMsg("Executing kernel KnApplyEmission ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  //! TBB body: the range covers z-slices when maxZ > 1, otherwise rows (y) of the k = 0 slice.
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    const int first = __r.begin();
+    const int last = (int)__r.end();
+    if (maxZ > 1) {
+      for (int k = first; k != last; k++) {
+        for (int j = 0; j < _maxY; j++) {
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, flags, target, source, emissionTexture, isAbsolute, type);
+        }
+      }
+      return;
+    }
+    const int k = 0;
+    for (int j = first; j != last; j++) {
+      for (int i = 0; i < _maxX; i++)
+        op(i, j, k, flags, target, source, emissionTexture, isAbsolute, type);
+    }
+  }
+  //! Launch the parallel loop over [minZ, maxZ) when maxZ > 1, else over [0, maxY).
+  void run()
+  {
+    const bool overSlices = (maxZ > 1);
+    if (overSlices)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
+  }
+  const FlagGrid &flags;
+  Grid<Real> &target;
+  const Grid<Real> &source;
+  const Grid<Real> *emissionTexture;
+  bool isAbsolute;
+  int type;
+};
+
+//! Plugin entry point: apply emission values from `source` to `target` via KnApplyEmission.
+// isAbsolute: true replaces the existing target value, false adds the source value to it
+// emissionTexture / type: optional per-cell filters, see KnApplyEmission::op
+void applyEmission(FlagGrid &flags,
+                   Grid<Real> &target,
+                   Grid<Real> &source,
+                   Grid<Real> *emissionTexture = NULL,
+                   bool isAbsolute = true,
+                   int type = 0)
+{
+  // constructing the kernel object executes it
+  KnApplyEmission kernel(flags, target, source, emissionTexture, isAbsolute, type);
+}
+static PyObject *_W_7(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // Python-callable wrapper around applyEmission()
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional flag, default 0; its negation goes to pbPreparePlugin/pbFinalizePlugin
+    pbPreparePlugin(parent, "applyEmission", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // handed to every argument getter in this scope
+      FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 0, &_lock);
+      Grid<Real> &target = *_args.getPtr<Grid<Real>>("target", 1, &_lock);
+      Grid<Real> &source = *_args.getPtr<Grid<Real>>("source", 2, &_lock);
+      Grid<Real> *emissionTexture = _args.getPtrOpt<Grid<Real>>(  // optional, NULL when omitted
+          "emissionTexture", 3, NULL, &_lock);
+      bool isAbsolute = _args.getOpt<bool>("isAbsolute", 4, true, &_lock);
+      int type = _args.getOpt<int>("type", 5, 0, &_lock);
+      _retval = getPyNone();  // applyEmission() returns void, so the result is None
+      applyEmission(flags, target, source, emissionTexture, isAbsolute, type);
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "applyEmission", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("applyEmission", e.what());  // forward the exception text under the plugin name
+    return 0;  // null result on failure
+  }
+}
+static const Pb::Register _RP_applyEmission("", "applyEmission", _W_7);  // static Register object pairing the name "applyEmission" with wrapper _W_7
+extern "C" {
+void PbRegister_applyEmission()  // C-linkage hook; its only action is to reference the Register object
+{
+  KEEP_UNUSED(_RP_applyEmission);  // presumably keeps the static object from being discarded - confirm against the macro
+}
+}
+
+// blender init functions for meshes
+
+//! Kernel: set `density` to a constant `value` in every fluid cell whose SDF value
+//! does not exceed `sigma`. The kernel runs from its constructor.
+struct KnApplyDensity : public KernelBase {
+  KnApplyDensity(
+      const FlagGrid &flags, Grid<Real> &density, const Grid<Real> &sdf, Real value, Real sigma)
+      : KernelBase(&flags, 0), flags(flags), density(density), sdf(sdf), value(value), sigma(sigma)
+  {
+    runMessage();
+    run();
+  }
+  //! Per-cell work: overwrite density where the cell is fluid and sdf <= sigma.
+  inline void op(int i,
+                 int j,
+                 int k,
+                 const FlagGrid &flags,
+                 Grid<Real> &density,
+                 const Grid<Real> &sdf,
+                 Real value,
+                 Real sigma) const
+  {
+    if (!flags.isFluid(i, j, k))
+      return;
+    // cells further out than sigma are left untouched
+    if (sdf(i, j, k) > sigma)
+      return;
+    density(i, j, k) = value;
+  }
+  // argument accessors (one getter + typedef per kernel argument)
+  inline const FlagGrid &getArg0() { return flags; }
+  typedef FlagGrid type0;
+  inline Grid<Real> &getArg1() { return density; }
+  typedef Grid<Real> type1;
+  inline const Grid<Real> &getArg2() { return sdf; }
+  typedef Grid<Real> type2;
+  inline Real &getArg3() { return value; }
+  typedef Real type3;
+  inline Real &getArg4() { return sigma; }
+  typedef Real type4;
+  //! Emit debug messages with the kernel name and its index range.
+  void runMessage()
+  {
+    debMsg("Executing kernel KnApplyDensity ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  //! TBB body: the range covers z-slices when maxZ > 1, otherwise rows (y) of the k = 0 slice.
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    const int first = __r.begin();
+    const int last = (int)__r.end();
+    if (maxZ > 1) {
+      for (int k = first; k != last; k++) {
+        for (int j = 0; j < _maxY; j++) {
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, flags, density, sdf, value, sigma);
+        }
+      }
+      return;
+    }
+    const int k = 0;
+    for (int j = first; j != last; j++) {
+      for (int i = 0; i < _maxX; i++)
+        op(i, j, k, flags, density, sdf, value, sigma);
+    }
+  }
+  //! Launch the parallel loop over [minZ, maxZ) when maxZ > 1, else over [0, maxY).
+  void run()
+  {
+    const bool overSlices = (maxZ > 1);
+    if (overSlices)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
+  }
+  const FlagGrid &flags;
+  Grid<Real> &density;
+  const Grid<Real> &sdf;
+  Real value;
+  Real sigma;
+};
+//! Initialize noise-modulated density inside a mesh.
+//! A levelset is computed from the mesh and handed to KnApplyNoiseInfl together with
+//! the noise field, `scale` and `sigma`.
+
+void densityInflowMeshNoise(const FlagGrid &flags,
+                            Grid<Real> &density,
+                            const WaveletNoiseField &noise,
+                            Mesh *mesh,
+                            Real scale = 1.0,
+                            Real sigma = 0)
+{
+  // temporary SDF living on the same solver as the density grid
+  LevelsetGrid meshSdf(density.getParent(), false);
+  mesh->computeLevelset(meshSdf, 1.);
+
+  // constructing the kernel object executes it
+  KnApplyNoiseInfl(flags, density, noise, meshSdf, scale, sigma);
+}
+static PyObject *_W_8(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // Python-callable wrapper around densityInflowMeshNoise()
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional flag, default 0; its negation goes to pbPreparePlugin/pbFinalizePlugin
+    pbPreparePlugin(parent, "densityInflowMeshNoise", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // handed to every argument getter in this scope
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 0, &_lock);
+      Grid<Real> &density = *_args.getPtr<Grid<Real>>("density", 1, &_lock);
+      const WaveletNoiseField &noise = *_args.getPtr<WaveletNoiseField>("noise", 2, &_lock);
+      Mesh *mesh = _args.getPtr<Mesh>("mesh", 3, &_lock);
+      Real scale = _args.getOpt<Real>("scale", 4, 1.0, &_lock);
+      Real sigma = _args.getOpt<Real>("sigma", 5, 0, &_lock);
+      _retval = getPyNone();  // densityInflowMeshNoise() returns void, so the result is None
+      densityInflowMeshNoise(flags, density, noise, mesh, scale, sigma);
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "densityInflowMeshNoise", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("densityInflowMeshNoise", e.what());  // forward the exception text under the plugin name
+    return 0;  // null result on failure
+  }
+}
+static const Pb::Register _RP_densityInflowMeshNoise("", "densityInflowMeshNoise", _W_8);  // static Register object pairing the name "densityInflowMeshNoise" with wrapper _W_8
+extern "C" {
+void PbRegister_densityInflowMeshNoise()  // C-linkage hook; its only action is to reference the Register object
+{
+  KEEP_UNUSED(_RP_densityInflowMeshNoise);  // presumably keeps the static object from being discarded - confirm against the macro
+}
+}
+
+//! Initialize constant density inside a mesh.
+//! A levelset is computed from the mesh (with the given `cutoff`); KnApplyDensity then
+//! writes `value` into every fluid cell whose levelset value is at most `sigma`.
+
+void densityInflowMesh(const FlagGrid &flags,
+                       Grid<Real> &density,
+                       Mesh *mesh,
+                       Real value = 1.,
+                       Real cutoff = 7,
+                       Real sigma = 0)
+{
+  // temporary SDF living on the same solver as the density grid
+  LevelsetGrid meshSdf(density.getParent(), false);
+  mesh->computeLevelset(meshSdf, 2., cutoff);
+
+  // constructing the kernel object executes it
+  KnApplyDensity kernel(flags, density, meshSdf, value, sigma);
+}
+static PyObject *_W_9(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // Python-callable wrapper around densityInflowMesh()
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional flag, default 0; its negation goes to pbPreparePlugin/pbFinalizePlugin
+    pbPreparePlugin(parent, "densityInflowMesh", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // handed to every argument getter in this scope
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 0, &_lock);
+      Grid<Real> &density = *_args.getPtr<Grid<Real>>("density", 1, &_lock);
+      Mesh *mesh = _args.getPtr<Mesh>("mesh", 2, &_lock);
+      Real value = _args.getOpt<Real>("value", 3, 1., &_lock);
+      Real cutoff = _args.getOpt<Real>("cutoff", 4, 7, &_lock);
+      Real sigma = _args.getOpt<Real>("sigma", 5, 0, &_lock);
+      _retval = getPyNone();  // densityInflowMesh() returns void, so the result is None
+      densityInflowMesh(flags, density, mesh, value, cutoff, sigma);
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "densityInflowMesh", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("densityInflowMesh", e.what());  // forward the exception text under the plugin name
+    return 0;  // null result on failure
+  }
+}
+static const Pb::Register _RP_densityInflowMesh("", "densityInflowMesh", _W_9);  // static Register object pairing the name "densityInflowMesh" with wrapper _W_9
+extern "C" {
+void PbRegister_densityInflowMesh()  // C-linkage hook; its only action is to reference the Register object
+{
+  KEEP_UNUSED(_RP_densityInflowMesh);  // presumably keeps the static object from being discarded - confirm against the macro
+}
+}
+
+//*****************************************************************************
+
+//! Check a real grid for mirror symmetry along one axis, optionally enforce it by copying.
+//!  a          grid to test (modified only when symmetrize is set)
+//!  err        optional output; receives |a(cell) - a(mirrored cell)| for every visited cell,
+//!             cells skipped because of `bound` are not written
+//!  symmetrize if set, cells in the lower half along `axis` are overwritten with their mirror value
+//!  axis       mirror axis, must be 0, 1 or 2 (an error is raised otherwise)
+//!  bound      if > 0, cells whose own or mirrored index is not in bounds with this margin are skipped
+
+void checkSymmetry(
+    Grid<Real> &a, Grid<Real> *err = NULL, bool symmetrize = false, int axis = 0, int bound = 0)
+{
+  // axis comes straight from the script and indexes 3-component vectors below
+  assertMsg(axis >= 0 && axis < 3, "checkSymmetry problem - axis has to be 0, 1 or 2!");
+
+  const int c = axis;
+  const int s = a.getSize()[c];
+  FOR_IJK(a)
+  {
+    // idx is the current cell, mdx its mirror image along axis c
+    Vec3i idx(i, j, k), mdx(i, j, k);
+    mdx[c] = s - 1 - idx[c];
+    if (bound > 0 && ((!a.isInBounds(idx, bound)) || (!a.isInBounds(mdx, bound))))
+      continue;
+
+    // the error is measured before any copy, so it reflects the incoming data for the lower half
+    if (err)
+      (*err)(idx) = fabs((double)(a(idx) - a(mdx)));
+    if (symmetrize && (idx[c] < s / 2)) {
+      a(idx) = a(mdx);
+    }
+  }
+}
+static PyObject *_W_10(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // Python-callable wrapper around checkSymmetry()
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional flag, default 0; its negation goes to pbPreparePlugin/pbFinalizePlugin
+    pbPreparePlugin(parent, "checkSymmetry", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // handed to every argument getter in this scope
+      Grid<Real> &a = *_args.getPtr<Grid<Real>>("a", 0, &_lock);
+      Grid<Real> *err = _args.getPtrOpt<Grid<Real>>("err", 1, NULL, &_lock);  // optional, NULL when omitted
+      bool symmetrize = _args.getOpt<bool>("symmetrize", 2, false, &_lock);
+      int axis = _args.getOpt<int>("axis", 3, 0, &_lock);
+      int bound = _args.getOpt<int>("bound", 4, 0, &_lock);
+      _retval = getPyNone();  // checkSymmetry() returns void, so the result is None
+      checkSymmetry(a, err, symmetrize, axis, bound);
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "checkSymmetry", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("checkSymmetry", e.what());  // forward the exception text under the plugin name
+    return 0;  // null result on failure
+  }
+}
+static const Pb::Register _RP_checkSymmetry("", "checkSymmetry", _W_10);  // static Register object pairing the name "checkSymmetry" with wrapper _W_10
+extern "C" {
+void PbRegister_checkSymmetry()  // C-linkage hook; its only action is to reference the Register object
+{
+  KEEP_UNUSED(_RP_checkSymmetry);  // presumably keeps the static object from being discarded - confirm against the macro
+}
+}
+
+//! Check a Vec3 (MAC) grid for mirror symmetry along one axis, optionally enforce it by copying.
+//!  a          grid to test (modified only when symmetrize is set)
+//!  err        optional output; reset to zero, then the absolute mismatch of every enabled
+//!             component is accumulated per cell
+//!  symmetrize if set, cells in the lower half along `axis` are overwritten from their mirror cell
+//!  axis       mirror axis, must be 0, 1 or 2 (an error is raised otherwise)
+//!  bound      if > 0, cells whose own or mirrored index is not in bounds with this margin are skipped
+//!  disable    bit mask: 1 skips the component along `axis`, 2 and 4 skip the two other
+//!             components ((axis+1)%3 and (axis+2)%3)
+
+void checkSymmetryVec3(Grid<Vec3> &a,
+                       Grid<Real> *err = NULL,
+                       bool symmetrize = false,
+                       int axis = 0,
+                       int bound = 0,
+                       int disable = 0)
+{
+  // axis comes straight from the script and indexes 3-component vectors below;
+  // a negative value would also make the modulo results negative
+  assertMsg(axis >= 0 && axis < 3, "checkSymmetryVec3 problem - axis has to be 0, 1 or 2!");
+
+  if (err)
+    err->setConst(0.);
+
+  // each component is measured separately for flexibility (could be combined)
+  const int c = axis;
+  const int o1 = (c + 1) % 3;
+  const int o2 = (c + 2) % 3;
+
+  // component along the mirror axis: mirrored values must have opposite sign.
+  // The mirror index uses size + 1, so the cell at index 0 has no partner and is skipped.
+  if (!(disable & 1)) {
+    const int s = a.getSize()[c] + 1;
+    FOR_IJK(a)
+    {
+      Vec3i idx(i, j, k), mdx(i, j, k);
+      mdx[c] = s - 1 - idx[c];
+      if (mdx[c] >= a.getSize()[c])
+        continue;
+      if (bound > 0 && ((!a.isInBounds(idx, bound)) || (!a.isInBounds(mdx, bound))))
+        continue;
+
+      // special case: center "line" of values , should be zero!
+      if (mdx[c] == idx[c]) {
+        if (err)
+          (*err)(idx) += fabs((double)(a(idx)[c]));
+        if (symmetrize)
+          a(idx)[c] = 0.;
+        continue;
+      }
+
+      // note - the a(mdx) component needs to be inverted here!
+      if (err)
+        (*err)(idx) += fabs((double)(a(idx)[c] - (a(mdx)[c] * -1.)));
+      if (symmetrize && (idx[c] < s / 2)) {
+        a(idx)[c] = a(mdx)[c] * -1.;
+      }
+    }
+  }
+
+  // the two remaining components: mirrored values must be equal.
+  // They are processed one after the other (o1 first, then o2), each with its own disable bit.
+  const int otherComp[2] = {o1, o2};
+  const int otherBit[2] = {2, 4};
+  for (int n = 0; n < 2; ++n) {
+    if (disable & otherBit[n])
+      continue;
+
+    const int comp = otherComp[n];
+    const int s = a.getSize()[c];
+    FOR_IJK(a)
+    {
+      Vec3i idx(i, j, k), mdx(i, j, k);
+      mdx[c] = s - 1 - idx[c];
+      if (bound > 0 && ((!a.isInBounds(idx, bound)) || (!a.isInBounds(mdx, bound))))
+        continue;
+
+      if (err)
+        (*err)(idx) += fabs((double)(a(idx)[comp] - a(mdx)[comp]));
+      if (symmetrize && (idx[c] < s / 2)) {
+        a(idx)[comp] = a(mdx)[comp];
+      }
+    }
+  }
+}
+static PyObject *_W_11(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // Python-callable wrapper around checkSymmetryVec3()
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional flag, default 0; its negation goes to pbPreparePlugin/pbFinalizePlugin
+    pbPreparePlugin(parent, "checkSymmetryVec3", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // handed to every argument getter in this scope
+      Grid<Vec3> &a = *_args.getPtr<Grid<Vec3>>("a", 0, &_lock);
+      Grid<Real> *err = _args.getPtrOpt<Grid<Real>>("err", 1, NULL, &_lock);  // optional, NULL when omitted
+      bool symmetrize = _args.getOpt<bool>("symmetrize", 2, false, &_lock);
+      int axis = _args.getOpt<int>("axis", 3, 0, &_lock);
+      int bound = _args.getOpt<int>("bound", 4, 0, &_lock);
+      int disable = _args.getOpt<int>("disable", 5, 0, &_lock);
+      _retval = getPyNone();  // checkSymmetryVec3() returns void, so the result is None
+      checkSymmetryVec3(a, err, symmetrize, axis, bound, disable);
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "checkSymmetryVec3", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("checkSymmetryVec3", e.what());  // forward the exception text under the plugin name
+    return 0;  // null result on failure
+  }
+}
+static const Pb::Register _RP_checkSymmetryVec3("", "checkSymmetryVec3", _W_11);  // static Register object pairing the name "checkSymmetryVec3" with wrapper _W_11
+extern "C" {
+void PbRegister_checkSymmetryVec3()  // C-linkage hook; its only action is to reference the Register object
+{
+  KEEP_UNUSED(_RP_checkSymmetryVec3);  // presumably keeps the static object from being discarded - confirm against the macro
+}
+}
+
+// forward declaration only; the definition comes from simpleimage.cpp
+void projectImg(SimpleImage &img, const Grid<Real> &val, int shadeMode = 0, Real scale = 1.);  // img is passed by non-const reference; projectPpmFull() writes it out afterwards
+
+//! Project a real grid into a shaded image (all 3 axes at once for 3D) and write it as PPM.
+//! shadeMode: 0 smoke, 1 surfaces; shadeMode and scale are forwarded to projectImg() unchanged.
+
+void projectPpmFull(const Grid<Real> &val, string name, int shadeMode = 0, Real scale = 1.)
+{
+  SimpleImage projection;
+  // fill the image from the grid ...
+  projectImg(projection, val, shadeMode, scale);
+  // ... and store it under the given file name
+  projection.writePpm(name);
+}
+static PyObject *_W_12(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // Python-callable wrapper around projectPpmFull()
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional flag, default 0; its negation goes to pbPreparePlugin/pbFinalizePlugin
+    pbPreparePlugin(parent, "projectPpmFull", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // handed to every argument getter in this scope
+      const Grid<Real> &val = *_args.getPtr<Grid<Real>>("val", 0, &_lock);
+      string name = _args.get<string>("name", 1, &_lock);  // fetched with get<>, i.e. no default value is supplied
+      int shadeMode = _args.getOpt<int>("shadeMode", 2, 0, &_lock);
+      Real scale = _args.getOpt<Real>("scale", 3, 1., &_lock);
+      _retval = getPyNone();  // projectPpmFull() returns void, so the result is None
+      projectPpmFull(val, name, shadeMode, scale);
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "projectPpmFull", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("projectPpmFull", e.what());  // forward the exception text under the plugin name
+    return 0;  // null result on failure
+  }
+}
+static const Pb::Register _RP_projectPpmFull("", "projectPpmFull", _W_12);  // static Register object pairing the name "projectPpmFull" with wrapper _W_12
+extern "C" {
+void PbRegister_projectPpmFull()  // C-linkage hook; its only action is to reference the Register object
+{
+  KEEP_UNUSED(_RP_projectPpmFull);  // presumably keeps the static object from being discarded - confirm against the macro
+}
+}
+
+// helper functions for pdata operator tests
+
+//! Add `num` test particles at the origin (helper for pdata operator tests).
+//! Nothing is buffered when num <= 0; compress and buffer insertion still run.
+
+void addTestParts(BasicParticleSystem &parts, int num)
+{
+  const Vec3 origin(0, 0, 0);
+  for (int added = 0; added < num; added++) {
+    parts.addBuffered(origin);
+  }
+
+  // compress first, then move the buffered particles into the system
+  parts.doCompress();
+  parts.insertBufferedParticles();
+}
+static PyObject *_W_13(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // Python-callable wrapper around addTestParts()
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional flag, default 0; its negation goes to pbPreparePlugin/pbFinalizePlugin
+    pbPreparePlugin(parent, "addTestParts", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // handed to every argument getter in this scope
+      BasicParticleSystem &parts = *_args.getPtr<BasicParticleSystem>("parts", 0, &_lock);
+      int num = _args.get<int>("num", 1, &_lock);  // fetched with get<>, i.e. no default value is supplied
+      _retval = getPyNone();  // addTestParts() returns void, so the result is None
+      addTestParts(parts, num);
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "addTestParts", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("addTestParts", e.what());  // forward the exception text under the plugin name
+    return 0;  // null result on failure
+  }
+}
+static const Pb::Register _RP_addTestParts("", "addTestParts", _W_13);  // pairs the plugin name with wrapper _W_13
+extern "C" {  // C linkage: the hook below gets an unmangled symbol name
+void PbRegister_addTestParts()
+{
+  KEEP_UNUSED(_RP_addTestParts);  // only touches the static above; presumably keeps it linked in - confirm macro
+}
+}
+
+//! Largest per-particle difference between two pdata fields of the same type and size.
+//! Real / int: |a - b|; Vec3: sum of the absolute component differences. Serial loop, slow.
+Real pdataMaxDiff(const ParticleDataBase *a, const ParticleDataBase *b)
+{
+  assertMsg(a->getType() == b->getType(), "pdataMaxDiff problem - different pdata types!");
+  assertMsg(a->getSizeSlow() == b->getSizeSlow(), "pdataMaxDiff problem - different pdata sizes!");
+
+  double largest = 0.;
+  if (a->getType() & ParticleDataBase::TypeReal) {
+    const ParticleDataImpl<Real> &lhs = *dynamic_cast<const ParticleDataImpl<Real> *>(a);
+    const ParticleDataImpl<Real> &rhs = *dynamic_cast<const ParticleDataImpl<Real> *>(b);
+    FOR_PARTS(lhs)
+    {
+      const double delta = (double)fabs(lhs[idx] - rhs[idx]);
+      largest = std::max(largest, delta);
+    }
+  }
+  else if (a->getType() & ParticleDataBase::TypeInt) {
+    const ParticleDataImpl<int> &lhs = *dynamic_cast<const ParticleDataImpl<int> *>(a);
+    const ParticleDataImpl<int> &rhs = *dynamic_cast<const ParticleDataImpl<int> *>(b);
+    FOR_PARTS(lhs)
+    {
+      const double delta = (double)fabs((double)lhs[idx] - rhs[idx]);
+      largest = std::max(largest, delta);
+    }
+  }
+  else if (a->getType() & ParticleDataBase::TypeVec3) {
+    const ParticleDataImpl<Vec3> &lhs = *dynamic_cast<const ParticleDataImpl<Vec3> *>(a);
+    const ParticleDataImpl<Vec3> &rhs = *dynamic_cast<const ParticleDataImpl<Vec3> *>(b);
+    FOR_PARTS(lhs)
+    {
+      double componentSum = 0.;  // |dx| + |dy| + |dz| for this particle
+      for (int axis = 0; axis < 3; ++axis)
+        componentSum += fabs((double)lhs[idx][axis] - (double)rhs[idx][axis]);
+      largest = std::max(largest, componentSum);
+    }
+  }
+  else {
+    errMsg("pdataMaxDiff: Grid Type is not supported (only Real, Vec3, int)");
+  }
+
+  return largest;
+}
+static PyObject *_W_14(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{  // Python-facing entry for pdataMaxDiff: unpack (a, b) and convert the result with toPy()
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *solver = _args.obtainParent();
+    const bool timed = !_args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(solver, "pdataMaxDiff", timed);
+    PyObject *result = 0;
+    {
+      ArgLocker _lock;  // lives for the argument fetch and the plugin call only
+      const ParticleDataBase *a = _args.getPtr<ParticleDataBase>("a", 0, &_lock);
+      const ParticleDataBase *b = _args.getPtr<ParticleDataBase>("b", 1, &_lock);
+      result = toPy(pdataMaxDiff(a, b));
+      _args.check();
+    }
+    pbFinalizePlugin(solver, "pdataMaxDiff", timed);
+    return result;
+  }
+  catch (std::exception &err) {
+    pbSetError("pdataMaxDiff", err.what());
+    return 0;  // null result; the message was handed to pbSetError
+  }
+}
+static const Pb::Register _RP_pdataMaxDiff("", "pdataMaxDiff", _W_14);  // pairs the plugin name with wrapper _W_14
+extern "C" {  // C linkage: the hook below gets an unmangled symbol name
+void PbRegister_pdataMaxDiff()
+{
+  KEEP_UNUSED(_RP_pdataMaxDiff);  // only touches the static above; presumably keeps it linked in - confirm macro
+}
+}
+
+//! Density-weighted mean of the cell centres (i + 0.5, j + 0.5, k + 0.5), for re-centering.
+//! The weighted sum is returned undivided when the total density is not above 1e-6.
+Vec3 calcCenterOfMass(const Grid<Real> &density)
+{
+  Real totalMass = 0.0f;
+  Vec3 weightedPos(0.0f);
+  FOR_IJK(density)
+  {
+    weightedPos += density(i, j, k) * Vec3(i + 0.5f, j + 0.5f, k + 0.5f);
+    totalMass += density(i, j, k);
+  }
+  if (totalMass > 1e-6f)
+    weightedPos /= totalMass;
+  return weightedPos;
+}
+static PyObject *_W_15(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{  // Python-facing entry for calcCenterOfMass: unpack (density), convert the result with toPy()
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *solver = _args.obtainParent();
+    const bool timed = !_args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(solver, "calcCenterOfMass", timed);
+    PyObject *result = 0;
+    {
+      ArgLocker _lock;  // lives for the argument fetch and the plugin call only
+      const Grid<Real> &density = *_args.getPtr<Grid<Real>>("density", 0, &_lock);
+      result = toPy(calcCenterOfMass(density));
+      _args.check();
+    }
+    pbFinalizePlugin(solver, "calcCenterOfMass", timed);
+    return result;
+  }
+  catch (std::exception &err) {
+    pbSetError("calcCenterOfMass", err.what());
+    return 0;  // null result; the message was handed to pbSetError
+  }
+}
+static const Pb::Register _RP_calcCenterOfMass("", "calcCenterOfMass", _W_15);  // pairs the plugin name with wrapper _W_15
+extern "C" {  // C linkage: the hook below gets an unmangled symbol name
+void PbRegister_calcCenterOfMass()
+{
+  KEEP_UNUSED(_RP_calcCenterOfMass);  // only touches the static above; presumably keeps it linked in - confirm macro
+}
+}
+
+//*****************************************************************************
+// helper functions for volume fractions (which are needed for second order obstacle boundaries)
+
+inline static Real calcFraction(Real phi1, Real phi2, Real fracThreshold)
+{
+  // Fraction for a pair of level set samples: 1 if both are positive, 0 if both are
+  // negative, otherwise a value derived from where the sign change lies between them.
+  const bool bothPositive = (phi1 > 0.) && (phi2 > 0.);
+  const bool bothNegative = (phi1 < 0.) && (phi2 < 0.);
+  if (bothPositive || bothNegative)
+    return bothPositive ? 1. : 0.;
+
+  // order the two samples so that lo is the smaller one
+  const bool swapped = phi2 < phi1;
+  const Real lo = swapped ? phi2 : phi1;
+  const Real hi = swapped ? phi1 : phi2;
+  const Real span = lo - hi;
+  if (span > -1e-04)
+    return 0.5;  // samples (almost) equal: skip the division below
+
+  const Real raw = 1. - lo / span;
+  if (raw < fracThreshold)
+    return 0.;  // stomp small values, do not mark as fluid
+  return std::min(Real(1), raw);
+}
+
+//! Kernel: computes per-face fill fractions from the obstacle level set phiObs (calcFraction)
+//! and sets them to 1 next to inflow / outflow / open cells close to the domain bounds.
+struct KnUpdateFractions : public KernelBase {
+  KnUpdateFractions(const FlagGrid &flags,
+                    const Grid<Real> &phiObs,
+                    MACGrid &fractions,
+                    const int &boundaryWidth,
+                    const Real fracThreshold)
+      : KernelBase(&flags, 1),
+        flags(flags),
+        phiObs(phiObs),
+        fractions(fractions),
+        boundaryWidth(boundaryWidth),
+        fracThreshold(fracThreshold)
+  {
+    runMessage();
+    run();
+  }
+  inline void op(int i,
+                 int j,
+                 int k,
+                 const FlagGrid &flags,
+                 const Grid<Real> &phiObs,
+                 MACGrid &fractions,
+                 const int &boundaryWidth,
+                 const Real fracThreshold) const
+  {
+    // each component pairs this cell's phiObs with that of its lower neighbour on that axis
+    // walls at domain bounds and inner objects
+    fractions(i, j, k).x = calcFraction(phiObs(i, j, k), phiObs(i - 1, j, k), fracThreshold);
+    fractions(i, j, k).y = calcFraction(phiObs(i, j, k), phiObs(i, j - 1, k), fracThreshold);
+    if (phiObs.is3D()) {
+      fractions(i, j, k).z = calcFraction(phiObs(i, j, k), phiObs(i, j, k - 1), fracThreshold);
+    }
+
+    // remaining BCs at the domain boundaries
+    const int w = boundaryWidth;
+    // only set if not in obstacle
+    if (phiObs(i, j, k) < 0.)
+      return;
+
+    // x-direction boundaries
+    if (i <= w + 1) {  // min x
+      if ((flags.isInflow(i - 1, j, k)) || (flags.isOutflow(i - 1, j, k)) ||
+          (flags.isOpen(i - 1, j, k))) {
+        fractions(i, j, k).x = fractions(i, j, k).y = 1.;
+        if (flags.is3D())
+          fractions(i, j, k).z = 1.;
+      }
+    }
+    if (i >= flags.getSizeX() - w - 2) {  // max x
+      if ((flags.isInflow(i + 1, j, k)) || (flags.isOutflow(i + 1, j, k)) ||
+          (flags.isOpen(i + 1, j, k))) {
+        fractions(i + 1, j, k).x = fractions(i + 1, j, k).y = 1.;
+        if (flags.is3D())
+          fractions(i + 1, j, k).z = 1.;
+      }
+    }
+    // y-direction boundaries
+    if (j <= w + 1) {  // min y
+      if ((flags.isInflow(i, j - 1, k)) || (flags.isOutflow(i, j - 1, k)) ||
+          (flags.isOpen(i, j - 1, k))) {
+        fractions(i, j, k).x = fractions(i, j, k).y = 1.;
+        if (flags.is3D())
+          fractions(i, j, k).z = 1.;
+      }
+    }
+    if (j >= flags.getSizeY() - w - 2) {  // max y
+      if ((flags.isInflow(i, j + 1, k)) || (flags.isOutflow(i, j + 1, k)) ||
+          (flags.isOpen(i, j + 1, k))) {
+        fractions(i, j + 1, k).x = fractions(i, j + 1, k).y = 1.;
+        if (flags.is3D())
+          fractions(i, j + 1, k).z = 1.;
+      }
+    }
+    // z-direction boundaries
+    if (flags.is3D()) {
+      if (k <= w + 1) {  // min z
+        if ((flags.isInflow(i, j, k - 1)) || (flags.isOutflow(i, j, k - 1)) ||
+            (flags.isOpen(i, j, k - 1))) {
+          fractions(i, j, k).x = fractions(i, j, k).y = 1.;
+          fractions(i, j, k).z = 1.;  // 3D is already guaranteed by the enclosing check
+        }
+      }
+      if (k >= flags.getSizeZ() - w - 2) {  // max z: the z extent is tested against k, not j
+        if ((flags.isInflow(i, j, k + 1)) || (flags.isOutflow(i, j, k + 1)) ||
+            (flags.isOpen(i, j, k + 1))) {
+          fractions(i, j, k + 1).x = fractions(i, j, k + 1).y = 1.;
+          fractions(i, j, k + 1).z = 1.;  // 3D is already guaranteed by the enclosing check
+        }
+      }
+    }
+  }
+  inline const FlagGrid &getArg0()
+  {
+    return flags;
+  }
+  typedef FlagGrid type0;
+  inline const Grid<Real> &getArg1()
+  {
+    return phiObs;
+  }
+  typedef Grid<Real> type1;
+  inline MACGrid &getArg2()
+  {
+    return fractions;
+  }
+  typedef MACGrid type2;
+  inline const int &getArg3()
+  {
+    return boundaryWidth;
+  }
+  typedef int type3;
+  inline const Real &getArg4()
+  {
+    return fracThreshold;
+  }
+  typedef Real type4;
+  void runMessage()
+  {
+    debMsg("Executing kernel KnUpdateFractions ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, flags, phiObs, fractions, boundaryWidth, fracThreshold);
+    }
+    else {
+      const int k = 0;
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = 1; i < _maxX; i++)
+          op(i, j, k, flags, phiObs, fractions, boundaryWidth, fracThreshold);
+    }
+  }
+  void run()
+  {
+    if (maxZ > 1)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
+  }
+  const FlagGrid &flags;
+  const Grid<Real> &phiObs;
+  MACGrid &fractions;
+  const int &boundaryWidth;
+  const Real fracThreshold;
+};
+
+//! reset every fraction to zero, then recompute them with the KnUpdateFractions kernel
+void updateFractions(const FlagGrid &flags,
+                     const Grid<Real> &phiObs,
+                     MACGrid &fractions,
+                     const int &boundaryWidth = 0,
+                     const Real fracThreshold = 0.01)
+{
+  fractions.setConst(Vec3(0.));  // clear the previous values first
+  KnUpdateFractions recompute(flags, phiObs, fractions, boundaryWidth, fracThreshold);
+}
+static PyObject *_W_16(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{  // Python-facing entry for updateFractions: unpack the arguments, call it, hand back getPyNone()
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *solver = _args.obtainParent();
+    const bool timed = !_args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(solver, "updateFractions", timed);
+    PyObject *result = 0;
+    {
+      ArgLocker _lock;  // lives for the argument fetch and the plugin call only
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 0, &_lock);
+      const Grid<Real> &phiObs = *_args.getPtr<Grid<Real>>("phiObs", 1, &_lock);
+      MACGrid &fractions = *_args.getPtr<MACGrid>("fractions", 2, &_lock);
+      const int &boundaryWidth = _args.getOpt<int>("boundaryWidth", 3, 0, &_lock);
+      const Real fracThreshold = _args.getOpt<Real>("fracThreshold", 4, 0.01, &_lock);
+      result = getPyNone();
+      updateFractions(flags, phiObs, fractions, boundaryWidth, fracThreshold);
+      _args.check();
+    }
+    pbFinalizePlugin(solver, "updateFractions", timed);
+    return result;
+  }
+  catch (std::exception &err) {
+    pbSetError("updateFractions", err.what());
+    return 0;  // null result; the message was handed to pbSetError
+  }
+}
+static const Pb::Register _RP_updateFractions("", "updateFractions", _W_16);  // pairs the plugin name with wrapper _W_16
+extern "C" {  // C linkage: the hook below gets an unmangled symbol name
+void PbRegister_updateFractions()
+{
+  KEEP_UNUSED(_RP_updateFractions);  // only touches the static above; presumably keeps it linked in - confirm macro
+}
+}
+
+//! Kernel: rewrites each visited cell flag from the level sets. Obstacles come from `fractions`
+//! when given (else from phiObs); phiIn / phiOut optionally mark inflow and outflow cells.
+struct KnUpdateFlagsObs : public KernelBase {
+  KnUpdateFlagsObs(FlagGrid &flags,
+                   const MACGrid *fractions,
+                   const Grid<Real> &phiObs,
+                   const Grid<Real> *phiOut,
+                   const Grid<Real> *phiIn)
+      : KernelBase(&flags, 1),
+        flags(flags),
+        fractions(fractions),
+        phiObs(phiObs),
+        phiOut(phiOut),
+        phiIn(phiIn)
+  {
+    runMessage();
+    run();
+  }
+  inline void op(int i,
+                 int j,
+                 int k,
+                 FlagGrid &flags,
+                 const MACGrid *fractions,
+                 const Grid<Real> &phiObs,
+                 const Grid<Real> *phiOut,
+                 const Grid<Real> *phiIn) const
+  {
+    // Obstacle test: with fractions given, the fractions of all faces of the cell must
+    // sum to exactly zero; without them the sign of phiObs decides.
+    bool obstacle = false;
+    if (!fractions) {
+      obstacle = (phiObs(i, j, k) < 0.);
+    }
+    else {
+      Real faceSum = 0.;
+      faceSum += fractions->get(i, j, k).x;
+      faceSum += fractions->get(i + 1, j, k).x;
+      faceSum += fractions->get(i, j, k).y;
+      faceSum += fractions->get(i, j + 1, k).y;
+      if (flags.is3D()) {
+        faceSum += fractions->get(i, j, k).z;
+        faceSum += fractions->get(i, j, k + 1).z;
+      }
+      obstacle = (faceSum == 0.);
+    }
+
+    // the optional level sets mark their region where they are negative
+    const bool outflow = phiOut && (*phiOut)(i, j, k) < 0.;
+    const bool inflow = phiIn && (*phiIn)(i, j, k) < 0.;
+
+    // precedence: obstacle, then inflow, then outflow, otherwise plain empty;
+    // whatever flag the cell held before is overwritten
+    int cellType = FlagGrid::TypeEmpty;
+    if (obstacle)
+      cellType = FlagGrid::TypeObstacle;
+    else if (inflow)
+      cellType = (FlagGrid::TypeFluid | FlagGrid::TypeInflow);
+    else if (outflow)
+      cellType = (FlagGrid::TypeEmpty | FlagGrid::TypeOutflow);
+    flags(i, j, k) = cellType;
+  }
+  inline FlagGrid &getArg0()
+  {
+    return flags;
+  }
+  typedef FlagGrid type0;
+  inline const MACGrid *getArg1()
+  {
+    return fractions;
+  }
+  typedef MACGrid type1;
+  inline const Grid<Real> &getArg2()
+  {
+    return phiObs;
+  }
+  typedef Grid<Real> type2;
+  inline const Grid<Real> *getArg3()
+  {
+    return phiOut;
+  }
+  typedef Grid<Real> type3;
+  inline const Grid<Real> *getArg4()
+  {
+    return phiIn;
+  }
+  typedef Grid<Real> type4;
+  void runMessage()
+  {
+    debMsg("Executing kernel KnUpdateFlagsObs ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &range) const
+  {
+    const int endX = maxX;
+    const int endY = maxY;
+    if (maxZ > 1) {  // 3D: the range covers z slices
+      for (int k = range.begin(); k != (int)range.end(); k++)
+        for (int j = 1; j < endY; j++)
+          for (int i = 1; i < endX; i++)
+            op(i, j, k, flags, fractions, phiObs, phiOut, phiIn);
+    }
+    else {  // 2D: the range covers rows of the single slice k = 0
+      const int k = 0;
+      for (int j = range.begin(); j != (int)range.end(); j++)
+        for (int i = 1; i < endX; i++)
+          op(i, j, k, flags, fractions, phiObs, phiOut, phiIn);
+    }
+  }
+  void run()
+  {
+    const bool useZ = maxZ > 1;
+    const IndexInt first = useZ ? minZ : 1;
+    const IndexInt last = useZ ? maxZ : maxY;
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(first, last), *this);
+  }
+  FlagGrid &flags;
+  const MACGrid *fractions;
+  const Grid<Real> &phiObs;
+  const Grid<Real> *phiOut;
+  const Grid<Real> *phiIn;
+};
+
+//! Re-derive the flag of every cell the kernel visits (obstacle / inflow / outflow / empty);
+//! obstacle detection uses `fractions` when it is given, else the sign of phiObs.
+void setObstacleFlags(FlagGrid &flags,
+                      const Grid<Real> &phiObs,
+                      const MACGrid *fractions = NULL,
+                      const Grid<Real> *phiOut = NULL,
+                      const Grid<Real> *phiIn = NULL)
+{
+  KnUpdateFlagsObs applyFlags(flags, fractions, phiObs, phiOut, phiIn);  // runs in its constructor
+}
+static PyObject *_W_17(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{  // Python-facing entry for setObstacleFlags: unpack the arguments, call it, hand back getPyNone()
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *solver = _args.obtainParent();
+    const bool timed = !_args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(solver, "setObstacleFlags", timed);
+    PyObject *result = 0;
+    {
+      ArgLocker _lock;  // lives for the argument fetch and the plugin call only
+      FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 0, &_lock);
+      const Grid<Real> &phiObs = *_args.getPtr<Grid<Real>>("phiObs", 1, &_lock);
+      const MACGrid *fractions = _args.getPtrOpt<MACGrid>("fractions", 2, NULL, &_lock);
+      const Grid<Real> *phiOut = _args.getPtrOpt<Grid<Real>>("phiOut", 3, NULL, &_lock);
+      const Grid<Real> *phiIn = _args.getPtrOpt<Grid<Real>>("phiIn", 4, NULL, &_lock);
+      result = getPyNone();
+      setObstacleFlags(flags, phiObs, fractions, phiOut, phiIn);
+      _args.check();
+    }
+    pbFinalizePlugin(solver, "setObstacleFlags", timed);
+    return result;
+  }
+  catch (std::exception &err) {
+    pbSetError("setObstacleFlags", err.what());
+    return 0;  // null result; the message was handed to pbSetError
+  }
+}
+static const Pb::Register _RP_setObstacleFlags("", "setObstacleFlags", _W_17);  // pairs the plugin name with wrapper _W_17
+extern "C" {  // C linkage: the hook below gets an unmangled symbol name
+void PbRegister_setObstacleFlags()
+{
+  KEEP_UNUSED(_RP_setObstacleFlags);  // only touches the static above; presumably keeps it linked in - confirm macro
+}
+}
+
+//! small helper for test case test_1040_secOrderBnd.py
+//! Kernel: writes the x and y velocity of a rotation about `center` in the xy plane; the
+//! speed grows linearly with the distance from the centre (r / radius). z is not written.
+struct kninitVortexVelocity : public KernelBase {
+  kninitVortexVelocity(const Grid<Real> &phiObs,
+                       MACGrid &vel,
+                       const Vec3 &center,
+                       const Real &radius)
+      : KernelBase(&phiObs, 0), phiObs(phiObs), vel(vel), center(center), radius(radius)
+  {
+    runMessage();
+    run();
+  }
+  inline void op(int i,
+                 int j,
+                 int k,
+                 const Grid<Real> &phiObs,
+                 MACGrid &vel,
+                 const Vec3 &center,
+                 const Real &radius) const
+  {
+    if (!(phiObs(i, j, k) >= -1.))
+      return;  // cells with phiObs below -1 keep their velocity
+
+    // offsets of the cell index from the centre
+    const Real offX = i - center.x;
+    const Real offY = j - center.y;
+
+    // x component: the x offset is moved by half a cell against its own sign
+    const Real shiftedX = (offX >= 0) ? offX - .5 : offX + .5;
+    const Real rX = std::sqrt(shiftedX * shiftedX + offY * offY);
+    const Real alphaX = atan2(offY, shiftedX);
+    vel(i, j, k).x = -std::sin(alphaX) * (rX / radius);
+
+    // y component: same construction with the half-cell shift applied to the y offset
+    const Real shiftedY = (offY >= 0) ? offY - .5 : offY + .5;
+    const Real rY = std::sqrt(offX * offX + shiftedY * shiftedY);
+    const Real alphaY = atan2(shiftedY, offX);
+    vel(i, j, k).y = std::cos(alphaY) * (rY / radius);
+  }
+  inline const Grid<Real> &getArg0()
+  {
+    return phiObs;
+  }
+  typedef Grid<Real> type0;
+  inline MACGrid &getArg1()
+  {
+    return vel;
+  }
+  typedef MACGrid type1;
+  inline const Vec3 &getArg2()
+  {
+    return center;
+  }
+  typedef Vec3 type2;
+  inline const Real &getArg3()
+  {
+    return radius;
+  }
+  typedef Real type3;
+  void runMessage()
+  {
+    debMsg("Executing kernel kninitVortexVelocity ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &range) const
+  {
+    const int endX = maxX;
+    const int endY = maxY;
+    if (maxZ > 1) {  // 3D: the range covers z slices
+      for (int k = range.begin(); k != (int)range.end(); k++)
+        for (int j = 0; j < endY; j++)
+          for (int i = 0; i < endX; i++)
+            op(i, j, k, phiObs, vel, center, radius);
+    }
+    else {  // 2D: the range covers rows of the single slice k = 0
+      const int k = 0;
+      for (int j = range.begin(); j != (int)range.end(); j++)
+        for (int i = 0; i < endX; i++)
+          op(i, j, k, phiObs, vel, center, radius);
+    }
+  }
+  void run()
+  {
+    const bool useZ = maxZ > 1;
+    const IndexInt first = useZ ? minZ : 0;
+    const IndexInt last = useZ ? maxZ : maxY;
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(first, last), *this);
+  }
+  const Grid<Real> &phiObs;
+  MACGrid &vel;
+  const Vec3 &center;
+  const Real &radius;
+};
+
+void initVortexVelocity(const Grid<Real> &phiObs,
+                        MACGrid &vel,
+                        const Vec3 &center,
+                        const Real &radius)
+{  // fills vel.x / vel.y through the kninitVortexVelocity kernel
+  kninitVortexVelocity fillVortex(phiObs, vel, center, radius);  // runs in its constructor
+}
+static PyObject *_W_18(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{  // Python-facing entry for initVortexVelocity: unpack the arguments, call it, hand back getPyNone()
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *solver = _args.obtainParent();
+    const bool timed = !_args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(solver, "initVortexVelocity", timed);
+    PyObject *result = 0;
+    {
+      ArgLocker _lock;  // lives for the argument fetch and the plugin call only
+      const Grid<Real> &phiObs = *_args.getPtr<Grid<Real>>("phiObs", 0, &_lock);
+      MACGrid &vel = *_args.getPtr<MACGrid>("vel", 1, &_lock);
+      const Vec3 &center = _args.get<Vec3>("center", 2, &_lock);
+      const Real &radius = _args.get<Real>("radius", 3, &_lock);
+      result = getPyNone();
+      initVortexVelocity(phiObs, vel, center, radius);
+      _args.check();
+    }
+    pbFinalizePlugin(solver, "initVortexVelocity", timed);
+    return result;
+  }
+  catch (std::exception &err) {
+    pbSetError("initVortexVelocity", err.what());
+    return 0;  // null result; the message was handed to pbSetError
+  }
+}
+static const Pb::Register _RP_initVortexVelocity("", "initVortexVelocity", _W_18);  // pairs the plugin name with wrapper _W_18
+extern "C" {  // C linkage: the hook below gets an unmangled symbol name
+void PbRegister_initVortexVelocity()
+{
+  KEEP_UNUSED(_RP_initVortexVelocity);  // only touches the static above; presumably keeps it linked in - confirm macro
+}
+}
+
+//*****************************************************************************
+// helper functions for blurring
+
+//! Builds and owns a 1D Gaussian kernel (mMat1D with mDim entries) used by the blur passes.
+struct GaussianKernelCreator {
+ public:
+  float mSigma;    // sigma of the current kernel
+  int mDim;        // number of kernel entries (odd and >= 3 once a sigma was set)
+  float *mMat1D;   // owned buffer holding mDim weights; NULL until a sigma was set
+  GaussianKernelCreator() : mSigma(0.0f), mDim(0), mMat1D(NULL) {}
+  GaussianKernelCreator(float sigma, int dim = 0) : mSigma(0.0f), mDim(0), mMat1D(NULL)
+  {
+    setGaussianSigma(sigma, dim);
+  }
+  //! deep copies: sharing mMat1D between two instances would free the buffer twice
+  GaussianKernelCreator(const GaussianKernelCreator &o) : mSigma(0.0f), mDim(0), mMat1D(NULL)
+  {
+    *this = o;
+  }
+  GaussianKernelCreator &operator=(const GaussianKernelCreator &o)
+  {
+    float *fresh = o.mMat1D ? new float[o.mDim] : NULL;  // copy first: also safe for self-assign
+    for (int i = 0; fresh && i < o.mDim; i++)
+      fresh[i] = o.mMat1D[i];
+    delete[] mMat1D;
+    mMat1D = fresh;
+    mSigma = o.mSigma;
+    mDim = o.mDim;
+    return *this;
+  }
+  ~GaussianKernelCreator() { delete[] mMat1D; }
+  //! weight at 2D offset (disx, disy): exp(-(x^2 + y^2) / (2 sigma^2)) / (sqrt(2 pi) sigma)
+  Real getWeiAtDis(float disx, float disy) const
+  {
+    float m = 1.0 / (sqrt(2.0 * M_PI) * mSigma);
+    float v = m * exp(-(1.0 * disx * disx + 1.0 * disy * disy) / (2.0 * mSigma * mSigma));
+    return v;
+  }
+  //! same weight for a 3D offset (disx, disy, disz)
+  Real getWeiAtDis(float disx, float disy, float disz) const
+  {
+    float m = 1.0 / (sqrt(2.0 * M_PI) * mSigma);
+    float v = m * exp(-(1.0 * disx * disx + 1.0 * disy * disy + 1.0 * disz * disz) /
+                      (2.0 * mSigma * mSigma));
+    return v;
+  }
+  //! (re)build the kernel; dim < 3 selects a width of about 6 sigma, the result is odd and >= 3
+  void setGaussianSigma(float sigma, int dim = 0)
+  {
+    int newDim = (dim < 3) ? (int)(2.0 * 3.0 * sigma + 1.0f) : dim;
+    if (newDim < 3)
+      newDim = 3;
+    newDim |= 1;  // make dim odd
+    float *fresh = new float[newDim];  // allocate first: a throwing new leaves the object intact
+    delete[] mMat1D;
+    mMat1D = fresh;
+    mSigma = sigma;
+    mDim = newDim;
+    const float s2 = mSigma * mSigma;
+    const int c = mDim / 2;
+    const float m = 1.0 / (sqrt(2.0 * M_PI) * mSigma);
+    for (int i = 0; i < (mDim + 1) / 2; i++) {  // symmetric fill around the centre entry c
+      float v = m * exp(-(1.0 * i * i) / (2.0 * s2));
+      mMat1D[c + i] = v;
+      mMat1D[c - i] = v;
+    }
+  }
+  //! kernel weight at index off, which must lie in [0, mDim)
+  float get1DKernelValue(int off) const
+  {
+    assertMsg(off >= 0 && off < mDim, "off exceeded boundary in Gaussian Kernel 1D!");
+    return mMat1D[off];
+  }
+};
+
+template<class T>
+T convolveGrid(Grid<T> &originGrid, GaussianKernelCreator &gkSigma, Vec3 pos, int cdir)
+{
+  // One 1D Gaussian pass at `pos` along axis cdir (0 = x, 1 = y, 2 = z). Samples that fall
+  // outside the grid are read from the nearest cell inside it.
+  Vec3 axis(1.0, 0.0, 0.0);
+  if (cdir == 1)
+    axis = Vec3(0.0, 1.0, 0.0);
+  else if (cdir == 2)
+    axis = Vec3(0.0, 0.0, 1.0);
+  T sum(0);
+  for (int tap = 0; tap < gkSigma.mDim; ++tap) {
+    Vec3i cell = toVec3i(pos - axis * (tap - gkSigma.mDim / 2));
+    if (!originGrid.isInBounds(cell)) {
+      // clamp every coordinate into [0, size - 1]
+      if (cell.x < 0)
+        cell.x = 0;
+      else if (cell.x >= originGrid.getSizeX())
+        cell.x = originGrid.getSizeX() - 1;
+      if (cell.y < 0)
+        cell.y = 0;
+      else if (cell.y >= originGrid.getSizeY())
+        cell.y = originGrid.getSizeY() - 1;
+      if (cell.z < 0)
+        cell.z = 0;
+      else if (cell.z >= originGrid.getSizeZ())
+        cell.z = originGrid.getSizeZ() - 1;
+    }
+    sum += gkSigma.get1DKernelValue(tap) * originGrid.get(cell);
+  }
+  return sum;
+}
+
+
+//! Kernel: one 1D Gaussian pass (axis cdir) from originGrid into targetGrid via convolveGrid
+template<class T> struct knBlurGrid : public KernelBase {
+  knBlurGrid(Grid<T> &originGrid, Grid<T> &targetGrid, GaussianKernelCreator &gkSigma, int cdir)
+      : KernelBase(&originGrid, 0),
+        originGrid(originGrid),
+        targetGrid(targetGrid),
+        gkSigma(gkSigma), cdir(cdir)
+  {
+    runMessage();
+    run();
+  }
+  inline void op(int i,
+                 int j,
+                 int k,
+                 Grid<T> &originGrid,
+                 Grid<T> &targetGrid,
+                 GaussianKernelCreator &gkSigma,
+                 int cdir) const
+  {  // the integer cell index itself is handed over as the sample position
+    targetGrid(i, j, k) = convolveGrid<T>(originGrid, gkSigma, Vec3(i, j, k), cdir);
+  }
+  inline Grid<T> &getArg0()
+  {
+    return originGrid;
+  }
+  typedef Grid<T> type0;
+  inline Grid<T> &getArg1()
+  {
+    return targetGrid;
+  }
+  typedef Grid<T> type1;
+  inline GaussianKernelCreator &getArg2()
+  {
+    return gkSigma;
+  }
+  typedef GaussianKernelCreator type2;
+  inline int &getArg3()
+  {
+    return cdir;
+  }
+  typedef int type3;
+  void runMessage()
+  {
+    debMsg("Executing kernel knBlurGrid ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &range) const
+  {
+    const int endX = maxX;
+    const int endY = maxY;
+    if (maxZ > 1) {  // 3D: the range covers z slices
+      for (int k = range.begin(); k != (int)range.end(); k++)
+        for (int j = 0; j < endY; j++)
+          for (int i = 0; i < endX; i++)
+            op(i, j, k, originGrid, targetGrid, gkSigma, cdir);
+    }
+    else {  // 2D: the range covers rows of the single slice k = 0
+      const int k = 0;
+      for (int j = range.begin(); j != (int)range.end(); j++)
+        for (int i = 0; i < endX; i++)
+          op(i, j, k, originGrid, targetGrid, gkSigma, cdir);
+    }
+  }
+  void run()
+  {
+    const bool useZ = maxZ > 1;
+    const IndexInt first = useZ ? minZ : 0;
+    const IndexInt last = useZ ? maxZ : maxY;
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(first, last), *this);
+  }
+  Grid<T> &originGrid;
+  Grid<T> &targetGrid;
+  GaussianKernelCreator &gkSigma;
+  int cdir;
+};
+
+template<class T> int blurGrid(Grid<T> &originGrid, Grid<T> &targetGrid, float sigma)
+{  // separable Gaussian blur of originGrid into targetGrid; returns the kernel width mDim
+  GaussianKernelCreator gauss(sigma);
+  Grid<T> scratch(originGrid);
+  knBlurGrid<T> passX(originGrid, scratch, gauss, 0);  // x pass: origin -> scratch
+  knBlurGrid<T> passY(scratch, targetGrid, gauss, 1);  // y pass: scratch -> target
+  if (targetGrid.is3D()) {
+    scratch.copyFrom(targetGrid);  // the z pass reads the x/y-blurred result
+    knBlurGrid<T> passZ(scratch, targetGrid, gauss, 2);
+  }
+  return gauss.mDim;
+}
+
+//! Kernel: one 1D Gaussian pass (axis cdir) over the vectors of a MAC grid, written to
+//! `target`; taps outside the grid read the nearest cell inside it.
+struct KnBlurMACGridGauss : public KernelBase {
+  KnBlurMACGridGauss(MACGrid &originGrid,
+                     MACGrid &target,
+                     GaussianKernelCreator &gkSigma,
+                     int cdir)
+      : KernelBase(&originGrid, 0),
+        originGrid(originGrid),
+        target(target),
+        gkSigma(gkSigma),
+        cdir(cdir)
+  {
+    runMessage();
+    run();
+  }
+  inline void op(int i,
+                 int j,
+                 int k,
+                 MACGrid &originGrid,
+                 MACGrid &target,
+                 GaussianKernelCreator &gkSigma,
+                 int cdir) const
+  {
+    // axis of this pass: 0 = x, 1 = y, 2 = z
+    Vec3 axis(1.0, 0.0, 0.0);
+    if (cdir == 1)
+      axis = Vec3(0.0, 1.0, 0.0);
+    else if (cdir == 2)
+      axis = Vec3(0.0, 0.0, 1.0);
+
+    const Vec3 centre(i, j, k);
+    const int half = gkSigma.mDim / 2;
+
+    Vec3 blurred(0.0f);
+    for (int tap = 0; tap < gkSigma.mDim; ++tap) {
+      Vec3i cell = toVec3i(centre - axis * (tap - half));
+      if (!originGrid.isInBounds(cell)) {
+        // clamp every coordinate into [0, size - 1]
+        const int lastX = originGrid.getSizeX() - 1;
+        const int lastY = originGrid.getSizeY() - 1;
+        const int lastZ = originGrid.getSizeZ() - 1;
+        cell.x = (cell.x < 0) ? 0 : ((cell.x > lastX) ? lastX : cell.x);
+        cell.y = (cell.y < 0) ? 0 : ((cell.y > lastY) ? lastY : cell.y);
+        cell.z = (cell.z < 0) ? 0 : ((cell.z > lastZ) ? lastZ : cell.z);
+      }
+      blurred += gkSigma.get1DKernelValue(tap) * originGrid.get(cell);
+    }
+    target(i, j, k) = blurred;
+  }
+  inline MACGrid &getArg0()
+  {
+    return originGrid;
+  }
+  typedef MACGrid type0;
+  inline MACGrid &getArg1()
+  {
+    return target;
+  }
+  typedef MACGrid type1;
+  inline GaussianKernelCreator &getArg2()
+  {
+    return gkSigma;
+  }
+  typedef GaussianKernelCreator type2;
+  inline int &getArg3()
+  {
+    return cdir;
+  }
+  typedef int type3;
+  void runMessage()
+  {
+    debMsg("Executing kernel KnBlurMACGridGauss ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &range) const
+  {
+    const int endX = maxX;
+    const int endY = maxY;
+    if (maxZ > 1) {  // 3D: the range covers z slices
+      for (int k = range.begin(); k != (int)range.end(); k++)
+        for (int j = 0; j < endY; j++)
+          for (int i = 0; i < endX; i++)
+            op(i, j, k, originGrid, target, gkSigma, cdir);
+    }
+    else {  // 2D: the range covers rows of the single slice k = 0
+      const int k = 0;
+      for (int j = range.begin(); j != (int)range.end(); j++)
+        for (int i = 0; i < endX; i++)
+          op(i, j, k, originGrid, target, gkSigma, cdir);
+    }
+  }
+  void run()
+  {
+    const bool useZ = maxZ > 1;
+    const IndexInt first = useZ ? minZ : 0;
+    const IndexInt last = useZ ? maxZ : maxY;
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(first, last), *this);
+  }
+  MACGrid &originGrid;
+  MACGrid &target;
+  GaussianKernelCreator &gkSigma;
+  int cdir;
+};
+
+int blurMacGrid(MACGrid &oG, MACGrid &tG, float si)
+{  // separable Gaussian blur of oG into tG with sigma si; returns the kernel width mDim
+  GaussianKernelCreator gauss(si);
+  MACGrid scratch(oG);
+  KnBlurMACGridGauss passX(oG, scratch, gauss, 0);  // x pass: oG -> scratch
+  KnBlurMACGridGauss passY(scratch, tG, gauss, 1);  // y pass: scratch -> tG
+  if (tG.is3D()) {
+    scratch.copyFrom(tG);  // the z pass reads the x/y-blurred result
+    KnBlurMACGridGauss passZ(scratch, tG, gauss, 2);
+  }
+  return gauss.mDim;
+}
+static PyObject *_W_19(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{  // Python-facing entry for blurMacGrid: unpack (oG, tG, si), convert the result with toPy()
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *solver = _args.obtainParent();
+    const bool timed = !_args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(solver, "blurMacGrid", timed);
+    PyObject *result = 0;
+    {
+      ArgLocker _lock;  // lives for the argument fetch and the plugin call only
+      MACGrid &oG = *_args.getPtr<MACGrid>("oG", 0, &_lock);
+      MACGrid &tG = *_args.getPtr<MACGrid>("tG", 1, &_lock);
+      float si = _args.get<float>("si", 2, &_lock);
+      result = toPy(blurMacGrid(oG, tG, si));
+      _args.check();
+    }
+    pbFinalizePlugin(solver, "blurMacGrid", timed);
+    return result;
+  }
+  catch (std::exception &err) {
+    pbSetError("blurMacGrid", err.what());
+    return 0;  // null result; the message was handed to pbSetError
+  }
+}
+static const Pb::Register _RP_blurMacGrid("", "blurMacGrid", _W_19);  // pairs the plugin name with wrapper _W_19
+extern "C" {  // C linkage: the hook below gets an unmangled symbol name
+void PbRegister_blurMacGrid()
+{
+  KEEP_UNUSED(_RP_blurMacGrid);  // only touches the static above; presumably keeps it linked in - confirm macro
+}
+}
+
+int blurRealGrid(Grid<Real> &oG, Grid<Real> &tG, float si)
+{  // Real-grid entry point of the blurGrid template; returns its result unchanged
+  return blurGrid(oG, tG, si);  // T is deduced as Real from the grid arguments
+}
+static PyObject *_W_20(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{  // Python-facing entry for blurRealGrid: unpack (oG, tG, si), convert the result with toPy()
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *solver = _args.obtainParent();
+    const bool timed = !_args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(solver, "blurRealGrid", timed);
+    PyObject *result = 0;
+    {
+      ArgLocker _lock;  // lives for the argument fetch and the plugin call only
+      Grid<Real> &oG = *_args.getPtr<Grid<Real>>("oG", 0, &_lock);
+      Grid<Real> &tG = *_args.getPtr<Grid<Real>>("tG", 1, &_lock);
+      float si = _args.get<float>("si", 2, &_lock);
+      result = toPy(blurRealGrid(oG, tG, si));
+      _args.check();
+    }
+    pbFinalizePlugin(solver, "blurRealGrid", timed);
+    return result;
+  }
+  catch (std::exception &err) {
+    pbSetError("blurRealGrid", err.what());
+    return 0;  // null result; the message was handed to pbSetError
+  }
+}
+static const Pb::Register _RP_blurRealGrid("", "blurRealGrid", _W_20);  // pairs the plugin name with wrapper _W_20
+extern "C" {  // C linkage: the hook below gets an unmangled symbol name
+void PbRegister_blurRealGrid()
+{
+  KEEP_UNUSED(_RP_blurRealGrid);  // only touches the static above; presumably keeps it linked in - confirm macro
+}
+}
+
+}  // namespace Manta
diff --git a/extern/mantaflow/preprocessed/plugin/kepsilon.cpp b/extern/mantaflow/preprocessed/plugin/kepsilon.cpp
new file mode 100644
index 00000000000..306db9e20cc
--- /dev/null
+++ b/extern/mantaflow/preprocessed/plugin/kepsilon.cpp
@@ -0,0 +1,578 @@
+
+
+// DO NOT EDIT !
+// This file is generated using the MantaFlow preprocessor (prep generate).
+
+/******************************************************************************
+ *
+ * MantaFlow fluid solver framework
+ * Copyright 2011 Tobias Pfaff, Nils Thuerey
+ *
+ * This program is free software, distributed under the terms of the
+ * Apache License, Version 2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Turbulence modeling plugins
+ *
+ ******************************************************************************/
+
+#include "grid.h"
+#include "commonkernels.h"
+#include "vortexsheet.h"
+#include "conjugategrad.h"
+
+using namespace std;
+
+namespace Manta {
+
+// k-epsilon model constants
+const Real keCmu = 0.09;  // C_mu in nu_T = C_mu * k^2 / eps
+const Real keC1 = 1.44;  // factor of the production term in the epsilon source
+const Real keC2 = 1.92;  // factor of the epsilon term in the epsilon source
+const Real keS1 = 1.0;  // sigma used for the gradient diffusion of k
+const Real keS2 = 1.3;  // sigma used for the gradient diffusion of epsilon
+
+// k-epsilon limiters
+const Real keU0 = 1.0;  // U0 in the k limits 1.5 * U0^2 * I^2
+const Real keImin = 2e-3;  // intensity I that yields the lower k limit
+const Real keImax = 1.0;  // intensity I that yields the upper k limit
+const Real keNuMin = 1e-3;  // lower limit passed to the nu_T clamp
+const Real keNuMax = 5.0;  // upper limit passed to the nu_T clamp
+
+//! Kernel: clamp k to [minK, maxK] and reset epsilon wherever nu_T = C_mu*k^2/eps leaves [minNu, maxNu]
+
+struct KnTurbulenceClamp : public KernelBase {
+  KnTurbulenceClamp(
+      Grid<Real> &kgrid, Grid<Real> &egrid, Real minK, Real maxK, Real minNu, Real maxNu)
+      : KernelBase(&kgrid, 0),
+        kgrid(kgrid),
+        egrid(egrid),
+        minK(minK),
+        maxK(maxK),
+        minNu(minNu),
+        maxNu(maxNu)
+  {
+    runMessage();
+    run();  // the kernel executes as part of construction
+  }
+  inline void op(IndexInt idx,
+                 Grid<Real> &kgrid,
+                 Grid<Real> &egrid,
+                 Real minK,
+                 Real maxK,
+                 Real minNu,
+                 Real maxNu) const
+  {
+    Real eps = egrid[idx];
+    Real ke = clamp(kgrid[idx], minK, maxK);
+    Real nu = keCmu * square(ke) / eps;  // nu_T from the clamped k; NOTE(review): eps == 0 divides by zero -- confirm this is intended
+    if (nu > maxNu)
+      eps = keCmu * square(ke) / maxNu;  // eps for which nu_T equals maxNu
+    if (nu < minNu)
+      eps = keCmu * square(ke) / minNu;  // eps for which nu_T equals minNu
+
+    kgrid[idx] = ke;
+    egrid[idx] = eps;
+  }
+  inline Grid<Real> &getArg0()  // getArgN / typeN: generated accessors for the kernel arguments
+  {
+    return kgrid;
+  }
+  typedef Grid<Real> type0;
+  inline Grid<Real> &getArg1()
+  {
+    return egrid;
+  }
+  typedef Grid<Real> type1;
+  inline Real &getArg2()
+  {
+    return minK;
+  }
+  typedef Real type2;
+  inline Real &getArg3()
+  {
+    return maxK;
+  }
+  typedef Real type3;
+  inline Real &getArg4()
+  {
+    return minNu;
+  }
+  typedef Real type4;
+  inline Real &getArg5()
+  {
+    return maxNu;
+  }
+  typedef Real type5;
+  void runMessage()  // debug output: kernel name and range
+  {
+    debMsg("Executing kernel KnTurbulenceClamp ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const  // body invoked by tbb::parallel_for for one sub-range
+  {
+    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
+      op(idx, kgrid, egrid, minK, maxK, minNu, maxNu);
+  }
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);  // all linear indices in [0, size)
+  }
+  Grid<Real> &kgrid;
+  Grid<Real> &egrid;
+  Real minK;
+  Real maxK;
+  Real minNu;
+  Real maxNu;
+};
+
+//! Compute k-epsilon production term P = 2*nu_T*sum_ij(Sij^2) and the turbulent viscosity
+//! nu_T=C_mu*k^2/eps
+
+struct KnComputeProduction : public KernelBase {
+  KnComputeProduction(const MACGrid &vel,
+                      const Grid<Vec3> &velCenter,
+                      const Grid<Real> &ke,
+                      const Grid<Real> &eps,
+                      Grid<Real> &prod,
+                      Grid<Real> &nuT,
+                      Grid<Real> *strain,
+                      Real pscale = 1.0f)
+      : KernelBase(&vel, 1),
+        vel(vel),
+        velCenter(velCenter),
+        ke(ke),
+        eps(eps),
+        prod(prod),
+        nuT(nuT),
+        strain(strain),
+        pscale(pscale)
+  {
+    runMessage();
+    run();  // the kernel executes as part of construction
+  }
+  inline void op(int i,
+                 int j,
+                 int k,
+                 const MACGrid &vel,
+                 const Grid<Vec3> &velCenter,
+                 const Grid<Real> &ke,
+                 const Grid<Real> &eps,
+                 Grid<Real> &prod,
+                 Grid<Real> &nuT,
+                 Grid<Real> *strain,
+                 Real pscale = 1.0f) const
+  {
+    Real curEps = eps(i, j, k);
+    if (curEps > 0) {  // cells with eps <= 0 get zero outputs in the else branch
+      // turbulent viscosity: nu_T = C_mu * k^2/eps
+      Real curNu = keCmu * square(ke(i, j, k)) / curEps;
+
+      // compute Sij = 1/2 * (dU_i/dx_j + dU_j/dx_i); diagonal from MAC differences, rest from velCenter
+      Vec3 diag = Vec3(vel(i + 1, j, k).x, vel(i, j + 1, k).y, vel(i, j, k + 1).z) - vel(i, j, k);
+      Vec3 ux = 0.5 * (velCenter(i + 1, j, k) - velCenter(i - 1, j, k));  // central difference along x
+      Vec3 uy = 0.5 * (velCenter(i, j + 1, k) - velCenter(i, j - 1, k));  // central difference along y
+      Vec3 uz = 0.5 * (velCenter(i, j, k + 1) - velCenter(i, j, k - 1));  // central difference along z
+      Real S12 = 0.5 * (ux.y + uy.x);
+      Real S13 = 0.5 * (ux.z + uz.x);
+      Real S23 = 0.5 * (uy.z + uz.y);
+      Real S2 = square(diag.x) + square(diag.y) + square(diag.z) + 2.0 * square(S12) +
+                2.0 * square(S13) + 2.0 * square(S23);  // off-diagonal entries occur twice in the symmetric sum
+
+      // P = 2*nu_T*sum_ij(Sij^2)
+      prod(i, j, k) = 2.0 * curNu * S2 * pscale;  // additionally scaled by pscale
+      nuT(i, j, k) = curNu;
+      if (strain)  // optional output grid
+        (*strain)(i, j, k) = sqrt(S2);
+    }
+    else {
+      prod(i, j, k) = 0;
+      nuT(i, j, k) = 0;
+      if (strain)
+        (*strain)(i, j, k) = 0;
+    }
+  }
+  inline const MACGrid &getArg0()  // getArgN / typeN: generated accessors for the kernel arguments
+  {
+    return vel;
+  }
+  typedef MACGrid type0;
+  inline const Grid<Vec3> &getArg1()
+  {
+    return velCenter;
+  }
+  typedef Grid<Vec3> type1;
+  inline const Grid<Real> &getArg2()
+  {
+    return ke;
+  }
+  typedef Grid<Real> type2;
+  inline const Grid<Real> &getArg3()
+  {
+    return eps;
+  }
+  typedef Grid<Real> type3;
+  inline Grid<Real> &getArg4()
+  {
+    return prod;
+  }
+  typedef Grid<Real> type4;
+  inline Grid<Real> &getArg5()
+  {
+    return nuT;
+  }
+  typedef Grid<Real> type5;
+  inline Grid<Real> *getArg6()
+  {
+    return strain;
+  }
+  typedef Grid<Real> type6;
+  inline Real &getArg7()
+  {
+    return pscale;
+  }
+  typedef Real type7;
+  void runMessage()  // debug output: kernel name and range
+  {
+    debMsg("Executing kernel KnComputeProduction ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const  // body invoked by tbb::parallel_for for one sub-range
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {  // the sub-range holds k slices
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, vel, velCenter, ke, eps, prod, nuT, strain, pscale);
+    }
+    else {  // single slice: the sub-range holds j rows
+      const int k = 0;  // NOTE(review): op() still reads the k + 1 and k - 1 neighbours here -- confirm the grid accessors allow that
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = 1; i < _maxX; i++)
+          op(i, j, k, vel, velCenter, ke, eps, prod, nuT, strain, pscale);
+    }
+  }
+  void run()
+  {
+    if (maxZ > 1)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);  // parallel over k
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);  // parallel over j
+  }
+  const MACGrid &vel;
+  const Grid<Vec3> &velCenter;
+  const Grid<Real> &ke;
+  const Grid<Real> &eps;
+  Grid<Real> &prod;
+  Grid<Real> &nuT;
+  Grid<Real> *strain;
+  Real pscale;
+};
+
+//! Compute k-epsilon production term P = 2*nu_T*sum_ij(Sij^2) and the turbulent viscosity
+//! nu_T=C_mu*k^2/eps
+
+void KEpsilonComputeProduction(const MACGrid &vel,
+                               Grid<Real> &k,
+                               Grid<Real> &eps,
+                               Grid<Real> &prod,
+                               Grid<Real> &nuT,
+                               Grid<Real> *strain = 0,
+                               Real pscale = 1.0f)
+{
+  // Cell-centered copy of the MAC velocity, followed by a FillInBoundary pass of width 1.
+  Grid<Vec3> cellVel(k.getParent());
+  GetCentered(cellVel, vel);
+  FillInBoundary(cellVel, 1);
+
+  // Admissible k range from the intensity limits; the clamp also limits eps through nu_T.
+  const Real kFloor = 1.5 * square(keU0) * square(keImin);
+  const Real kCeil = 1.5 * square(keU0) * square(keImax);
+  KnTurbulenceClamp(k, eps, kFloor, kCeil, keNuMin, keNuMax);
+
+  KnComputeProduction(vel, cellVel, k, eps, prod, nuT, strain, pscale);
+}
+static PyObject *_W_0(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // generated Python entry point for KEpsilonComputeProduction
+{
+  try {
+    PbArgs _args(_linargs, _kwds);  // wraps the positional and keyword arguments of the call
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional "notiming" argument, 0 when absent
+    pbPreparePlugin(parent, "KEpsilonComputeProduction", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // destroyed at the end of this block, i.e. before pbFinalizePlugin is called
+      const MACGrid &vel = *_args.getPtr<MACGrid>("vel", 0, &_lock);
+      Grid<Real> &k = *_args.getPtr<Grid<Real>>("k", 1, &_lock);
+      Grid<Real> &eps = *_args.getPtr<Grid<Real>>("eps", 2, &_lock);
+      Grid<Real> &prod = *_args.getPtr<Grid<Real>>("prod", 3, &_lock);
+      Grid<Real> &nuT = *_args.getPtr<Grid<Real>>("nuT", 4, &_lock);
+      Grid<Real> *strain = _args.getPtrOpt<Grid<Real>>("strain", 5, 0, &_lock);  // optional, null when absent
+      Real pscale = _args.getOpt<Real>("pscale", 6, 1.0f, &_lock);  // optional, 1.0f when absent
+      _retval = getPyNone();  // the plugin has no return value
+      KEpsilonComputeProduction(vel, k, eps, prod, nuT, strain, pscale);
+      _args.check();  // NOTE(review): presumably validates the argument list -- confirm in PbArgs
+    }
+    pbFinalizePlugin(parent, "KEpsilonComputeProduction", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {  // exceptions do not escape the wrapper
+    pbSetError("KEpsilonComputeProduction", e.what());  // presumably sets the Python error state -- confirm
+    return 0;  // null result reports the failure to the caller
+  }
+}
+static const Pb::Register _RP_KEpsilonComputeProduction("", "KEpsilonComputeProduction", _W_0);
+extern "C" {
+void PbRegister_KEpsilonComputeProduction()  // C-linkage symbol whose only statement references the registration object
+{
+  KEEP_UNUSED(_RP_KEpsilonComputeProduction);
+}
+}
+
+//! Kernel: integrate the source terms of the k-epsilon equations over one step dt, per cell
+
+struct KnAddTurbulenceSource : public KernelBase {
+  KnAddTurbulenceSource(Grid<Real> &kgrid, Grid<Real> &egrid, const Grid<Real> &pgrid, Real dt)
+      : KernelBase(&kgrid, 0), kgrid(kgrid), egrid(egrid), pgrid(pgrid), dt(dt)
+  {
+    runMessage();
+    run();  // the kernel executes as part of construction
+  }
+  inline void op(
+      IndexInt idx, Grid<Real> &kgrid, Grid<Real> &egrid, const Grid<Real> &pgrid, Real dt) const
+  {
+    Real eps = egrid[idx], prod = pgrid[idx], ke = kgrid[idx];
+    if (ke <= 0)
+      ke = 1e-3;  // pre-clamp to avoid nan (ke is a divisor below)
+
+    Real newK = ke + dt * (prod - eps);  // k gains the production and loses eps
+    Real newEps = eps + dt * (prod * keC1 - eps * keC2) * (eps / ke);
+    if (newEps <= 0)
+      newEps = 1e-4;  // pre-clamp to avoid nan
+
+    kgrid[idx] = newK;  // written back without a lower bound; only newEps is floored here
+    egrid[idx] = newEps;
+  }
+  inline Grid<Real> &getArg0()  // getArgN / typeN: generated accessors for the kernel arguments
+  {
+    return kgrid;
+  }
+  typedef Grid<Real> type0;
+  inline Grid<Real> &getArg1()
+  {
+    return egrid;
+  }
+  typedef Grid<Real> type1;
+  inline const Grid<Real> &getArg2()
+  {
+    return pgrid;
+  }
+  typedef Grid<Real> type2;
+  inline Real &getArg3()
+  {
+    return dt;
+  }
+  typedef Real type3;
+  void runMessage()  // debug output: kernel name and range
+  {
+    debMsg("Executing kernel KnAddTurbulenceSource ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const  // body invoked by tbb::parallel_for for one sub-range
+  {
+    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
+      op(idx, kgrid, egrid, pgrid, dt);
+  }
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);  // all linear indices in [0, size)
+  }
+  Grid<Real> &kgrid;
+  Grid<Real> &egrid;
+  const Grid<Real> &pgrid;
+  Real dt;
+};
+
+//! Integrate source terms of k-epsilon equation
+void KEpsilonSources(Grid<Real> &k, Grid<Real> &eps, Grid<Real> &prod)
+{
+  // Add the source terms over one time step of the solver that owns k.
+  const Real stepSize = k.getParent()->getDt();
+  KnAddTurbulenceSource(k, eps, prod, stepSize);
+
+  // Afterwards bring k and nu_T back into their admissible ranges.
+  const Real kFloor = 1.5 * square(keU0) * square(keImin);
+  const Real kCeil = 1.5 * square(keU0) * square(keImax);
+  KnTurbulenceClamp(k, eps, kFloor, kCeil, keNuMin, keNuMax);
+}
+static PyObject *_W_1(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // generated Python entry point for KEpsilonSources
+{
+  try {
+    PbArgs _args(_linargs, _kwds);  // wraps the positional and keyword arguments of the call
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional "notiming" argument, 0 when absent
+    pbPreparePlugin(parent, "KEpsilonSources", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // destroyed at the end of this block, i.e. before pbFinalizePlugin is called
+      Grid<Real> &k = *_args.getPtr<Grid<Real>>("k", 0, &_lock);
+      Grid<Real> &eps = *_args.getPtr<Grid<Real>>("eps", 1, &_lock);
+      Grid<Real> &prod = *_args.getPtr<Grid<Real>>("prod", 2, &_lock);
+      _retval = getPyNone();  // the plugin has no return value
+      KEpsilonSources(k, eps, prod);
+      _args.check();  // NOTE(review): presumably validates the argument list -- confirm in PbArgs
+    }
+    pbFinalizePlugin(parent, "KEpsilonSources", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {  // exceptions do not escape the wrapper
+    pbSetError("KEpsilonSources", e.what());  // presumably sets the Python error state -- confirm
+    return 0;  // null result reports the failure to the caller
+  }
+}
+static const Pb::Register _RP_KEpsilonSources("", "KEpsilonSources", _W_1);
+extern "C" {
+void PbRegister_KEpsilonSources()  // C-linkage symbol whose only statement references the registration object
+{
+  KEEP_UNUSED(_RP_KEpsilonSources);
+}
+}
+
+//! Initialize the domain or boundary conditions
+void KEpsilonBcs(
+    const FlagGrid &flags, Grid<Real> &k, Grid<Real> &eps, Real intensity, Real nu, bool fillArea)
+{
+  // Values to impose: k from the requested intensity, eps such that C_mu * k^2 / eps equals nu.
+  const Real kValue = 1.5 * square(keU0) * square(intensity);
+  const Real epsValue = keCmu * square(kValue) / nu;
+
+  FOR_IDX(k)
+  {
+    if (!fillArea && !flags.isObstacle(idx))
+      continue;  // neither filling the whole area nor an obstacle cell: leave it untouched
+    k[idx] = kValue;
+    eps[idx] = epsValue;
+  }
+}
+static PyObject *_W_2(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // generated Python entry point for KEpsilonBcs
+{
+  try {
+    PbArgs _args(_linargs, _kwds);  // wraps the positional and keyword arguments of the call
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional "notiming" argument, 0 when absent
+    pbPreparePlugin(parent, "KEpsilonBcs", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // destroyed at the end of this block, i.e. before pbFinalizePlugin is called
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 0, &_lock);
+      Grid<Real> &k = *_args.getPtr<Grid<Real>>("k", 1, &_lock);
+      Grid<Real> &eps = *_args.getPtr<Grid<Real>>("eps", 2, &_lock);
+      Real intensity = _args.get<Real>("intensity", 3, &_lock);
+      Real nu = _args.get<Real>("nu", 4, &_lock);  // used as a divisor by KEpsilonBcs
+      bool fillArea = _args.get<bool>("fillArea", 5, &_lock);
+      _retval = getPyNone();  // the plugin has no return value
+      KEpsilonBcs(flags, k, eps, intensity, nu, fillArea);
+      _args.check();  // NOTE(review): presumably validates the argument list -- confirm in PbArgs
+    }
+    pbFinalizePlugin(parent, "KEpsilonBcs", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {  // exceptions do not escape the wrapper
+    pbSetError("KEpsilonBcs", e.what());  // presumably sets the Python error state -- confirm
+    return 0;  // null result reports the failure to the caller
+  }
+}
+static const Pb::Register _RP_KEpsilonBcs("", "KEpsilonBcs", _W_2);
+extern "C" {
+void PbRegister_KEpsilonBcs()  // C-linkage symbol whose only statement references the registration object
+{
+  KEEP_UNUSED(_RP_KEpsilonBcs);
+}
+}
+
+//! Gradient diffusion smoothing. Not unconditionally stable -- should probably do substepping etc.
+void ApplyGradDiff(
+    const Grid<Real> &grid, Grid<Real> &res, const Grid<Real> &nu, Real dt, Real sigma)
+{
+  // Preferred formulation, left disabled because it requires better boundary handling:
+  /*MACGrid grad(grid.getParent());
+  GradientOpMAC(grad, grid);
+  grad *= nu;
+  DivergenceOpMAC(res, grad);
+  res *= dt/sigma;  */
+  const Real stepScale = dt / sigma;  // final factor, applied after LaplaceOp and the multiplication by nu
+  LaplaceOp(res, grid);
+  res *= nu;
+  res *= stepScale;
+}
+
+//! Apply one gradient-diffusion step to k, epsilon and, if vel is given, to each velocity component
+void KEpsilonGradientDiffusion(
+    Grid<Real> &k, Grid<Real> &eps, Grid<Real> &nuT, Real sigmaU = 4.0, MACGrid *vel = 0)
+{
+  const Real stepSize = k.getParent()->getDt();
+  Grid<Real> delta(k.getParent());  // scratch grid receiving each diffusion increment
+
+  // k: diffused with sigma = keS1
+  ApplyGradDiff(k, delta, nuT, stepSize, keS1);
+  k += delta;
+
+  // epsilon: diffused with sigma = keS2
+  ApplyGradDiff(eps, delta, nuT, stepSize, keS2);
+  eps += delta;
+
+  // velocity: optional, nothing left to do without it
+  if (!vel)
+    return;
+  Grid<Real> component(k.getParent());
+  for (int axis = 0; axis < 3; ++axis) {
+    GetComponent(*vel, component, axis);
+    ApplyGradDiff(component, delta, nuT, stepSize, sigmaU);
+    component += delta;
+    SetComponent(*vel, component, axis);
+  }
+}
+static PyObject *_W_3(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // generated Python entry point for KEpsilonGradientDiffusion
+{
+  try {
+    PbArgs _args(_linargs, _kwds);  // wraps the positional and keyword arguments of the call
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional "notiming" argument, 0 when absent
+    pbPreparePlugin(parent, "KEpsilonGradientDiffusion", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // destroyed at the end of this block, i.e. before pbFinalizePlugin is called
+      Grid<Real> &k = *_args.getPtr<Grid<Real>>("k", 0, &_lock);
+      Grid<Real> &eps = *_args.getPtr<Grid<Real>>("eps", 1, &_lock);
+      Grid<Real> &nuT = *_args.getPtr<Grid<Real>>("nuT", 2, &_lock);
+      Real sigmaU = _args.getOpt<Real>("sigmaU", 3, 4.0, &_lock);  // optional, 4.0 when absent
+      MACGrid *vel = _args.getPtrOpt<MACGrid>("vel", 4, 0, &_lock);  // optional, null when absent
+      _retval = getPyNone();  // the plugin has no return value
+      KEpsilonGradientDiffusion(k, eps, nuT, sigmaU, vel);
+      _args.check();  // NOTE(review): presumably validates the argument list -- confirm in PbArgs
+    }
+    pbFinalizePlugin(parent, "KEpsilonGradientDiffusion", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {  // exceptions do not escape the wrapper
+    pbSetError("KEpsilonGradientDiffusion", e.what());  // presumably sets the Python error state -- confirm
+    return 0;  // null result reports the failure to the caller
+  }
+}
+static const Pb::Register _RP_KEpsilonGradientDiffusion("", "KEpsilonGradientDiffusion", _W_3);
+extern "C" {
+void PbRegister_KEpsilonGradientDiffusion()  // C-linkage symbol whose only statement references the registration object
+{
+  KEEP_UNUSED(_RP_KEpsilonGradientDiffusion);
+}
+}
+
+}  // namespace Manta
diff --git a/extern/mantaflow/preprocessed/plugin/meshplugins.cpp b/extern/mantaflow/preprocessed/plugin/meshplugins.cpp
new file mode 100644
index 00000000000..415bca153d0
--- /dev/null
+++ b/extern/mantaflow/preprocessed/plugin/meshplugins.cpp
@@ -0,0 +1,780 @@
+
+
+// DO NOT EDIT !
+// This file is generated using the MantaFlow preprocessor (prep generate).
+
+/******************************************************************************
+ *
+ * MantaFlow fluid solver framework
+ * Copyright 2011 Tobias Pfaff, Nils Thuerey
+ *
+ * This program is free software, distributed under the terms of the
+ * Apache License, Version 2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Smoothing etc. for meshes
+ *
+ ******************************************************************************/
+
+/******************************************************************************/
+// Copyright note:
+//
+// These functions (C) Chris Wojtan
+// Long-term goal is to unify with his split&merge codebase
+//
+/******************************************************************************/
+
+#include <queue>
+#include <algorithm>
+#include "mesh.h"
+#include "kernel.h"
+#include "edgecollapse.h"
+#include <mesh.h>
+#include <stack>
+
+using namespace std;
+
+namespace Manta {
+
+//! Mesh smoothing: relaxes nodes along their 1-ring edges, then rescales about the center of mass
+/*! see Desbrun 99 "Implicit fairing of irregular meshes using diffusion and curvature flow"*/
+void smoothMesh(Mesh &mesh, Real strength, int steps = 1, Real minLength = 1e-5)
+{
+  const Real dt = mesh.getParent()->getDt();
+  const Real str = min(dt * strength, (Real)1);  // per-pass weight: strength scaled by dt, capped at 1
+  mesh.rebuildQuickCheck();
+
+  // calculate original mesh volume
+  Vec3 origCM;
+  Real origVolume = mesh.computeCenterOfMass(origCM);
+
+  // temp vertices
+  const int numCorners = mesh.numTris() * 3;
+  const int numNodes = mesh.numNodes();
+  vector<Vec3> temp(numNodes);
+  vector<bool> visited(numNodes);
+
+  for (int s = 0; s < steps; s++) {  // 'steps' smoothing passes
+    // reset markers
+    std::fill(visited.begin(), visited.end(), false);
+
+    for (int c = 0; c < numCorners; c++) {
+      const int node = mesh.corners(c).node;
+      if (visited[node])
+        continue;  // several corners share a node; handle each node once per pass
+
+      const Vec3 pos = mesh.nodes(node).pos;
+      Vec3 dx(0.0);
+      Real totalLen = 0;
+
+      // rotate around vertex
+      set<int> &ring = mesh.get1Ring(node).nodes;
+      for (set<int>::iterator it = ring.begin(); it != ring.end(); it++) {
+        Vec3 edge = mesh.nodes(*it).pos - pos;
+        Real len = norm(edge);
+
+        if (len > minLength) {
+          dx += edge * (1.0 / len);  // accumulate unit edge directions
+          totalLen += len;
+        }
+        else {
+          totalLen = 0.0;  // an edge not longer than minLength disables smoothing of this node
+          break;
+        }
+      }
+      visited[node] = true;
+      temp[node] = pos;
+      if (totalLen != 0)
+        temp[node] += dx * (str / totalLen);
+    }
+
+    // copy back; nodes no corner refers to have no entry in temp and must keep their position
+    for (int n = 0; n < numNodes; n++)
+      if (visited[n] && !mesh.isNodeFixed(n))
+        mesh.nodes(n).pos = temp[n];
+  }
+
+  // calculate new mesh volume
+  Vec3 newCM;
+  Real newVolume = mesh.computeCenterOfMass(newCM);
+
+  // preserve volume : scale relative to CM (the ratio is meaningless for a zero volume, so skip)
+  if (origVolume == 0 || newVolume == 0)
+    return;
+  Real beta;
+#if defined(WIN32) || defined(_WIN32)
+  beta = pow((Real)std::abs(origVolume / newVolume), (Real)(1. / 3.));
+#else
+  beta = cbrt(origVolume / newVolume);
+#endif
+  for (int n = 0; n < numNodes; n++)
+    if (!mesh.isNodeFixed(n))
+      mesh.nodes(n).pos = origCM + (mesh.nodes(n).pos - newCM) * beta;
+}
+static PyObject *_W_0(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // generated Python entry point for smoothMesh
+{
+  try {
+    PbArgs _args(_linargs, _kwds);  // wraps the positional and keyword arguments of the call
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional "notiming" argument, 0 when absent
+    pbPreparePlugin(parent, "smoothMesh", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // destroyed at the end of this block, i.e. before pbFinalizePlugin is called
+      Mesh &mesh = *_args.getPtr<Mesh>("mesh", 0, &_lock);
+      Real strength = _args.get<Real>("strength", 1, &_lock);
+      int steps = _args.getOpt<int>("steps", 2, 1, &_lock);  // optional, 1 when absent
+      Real minLength = _args.getOpt<Real>("minLength", 3, 1e-5, &_lock);  // optional, 1e-5 when absent
+      _retval = getPyNone();  // the plugin has no return value
+      smoothMesh(mesh, strength, steps, minLength);
+      _args.check();  // NOTE(review): presumably validates the argument list -- confirm in PbArgs
+    }
+    pbFinalizePlugin(parent, "smoothMesh", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {  // exceptions do not escape the wrapper
+    pbSetError("smoothMesh", e.what());  // presumably sets the Python error state -- confirm
+    return 0;  // null result reports the failure to the caller
+  }
+}
+static const Pb::Register _RP_smoothMesh("", "smoothMesh", _W_0);
+extern "C" {
+void PbRegister_smoothMesh()  // C-linkage symbol whose only statement references the registration object
+{
+  KEEP_UNUSED(_RP_smoothMesh);
+}
+}
+
+//! Subdivide and edgecollapse to guarantee mesh with edgelengths between
+//! min/maxLength and an angle below minAngle
+void subdivideMesh(
+    Mesh &mesh, Real minAngle, Real minLength, Real maxLength, bool cutTubes = false)
+{
+  // gather some statistics
+  int edgeSubdivs = 0, edgeCollsAngle = 0, edgeCollsLen = 0, edgeKill = 0;
+  mesh.rebuildQuickCheck();
+
+  vector<int> deletedNodes;
+  map<int, bool> taintedTris;
+  priority_queue<pair<Real, int>> pq;
+
+  //////////////////////////////////////////
+  // EDGE COLLAPSE                        //
+  //    - particles marked for deletation //
+  //////////////////////////////////////////
+
+  for (int t = 0; t < mesh.numTris(); t++) {
+    if (taintedTris.find(t) != taintedTris.end())
+      continue;
+
+    // check if at least 2 nodes are marked for delete
+    bool k[3];
+    int numKill = 0;
+    for (int i = 0; i < 3; i++) {
+      k[i] = mesh.nodes(mesh.tris(t).c[i]).flags & Mesh::NfKillme;
+      if (k[i])
+        numKill++;
+    }
+    if (numKill < 2)
+      continue;
+
+    if (k[0] && k[1])
+      CollapseEdge(mesh,
+                   t,
+                   2,
+                   mesh.getEdge(t, 0),
+                   mesh.getNode(t, 0),
+                   deletedNodes,
+                   taintedTris,
+                   edgeKill,
+                   cutTubes);
+    else if (k[1] && k[2])
+      CollapseEdge(mesh,
+                   t,
+                   0,
+                   mesh.getEdge(t, 1),
+                   mesh.getNode(t, 1),
+                   deletedNodes,
+                   taintedTris,
+                   edgeKill,
+                   cutTubes);
+    else if (k[2] && k[0])
+      CollapseEdge(mesh,
+                   t,
+                   1,
+                   mesh.getEdge(t, 2),
+                   mesh.getNode(t, 2),
+                   deletedNodes,
+                   taintedTris,
+                   edgeKill,
+                   cutTubes);
+  }
+
+  //////////////////////////////////////////
+  // EDGE COLLAPSING                      //
+  //      - based on small triangle angle //
+  //////////////////////////////////////////
+
+  if (minAngle > 0) {
+    for (int t = 0; t < mesh.numTris(); t++) {
+      // we only want to run through the edge list ONCE.
+      // we achieve this in a method very similar to the above subdivision method.
+
+      // if this triangle has already been deleted, ignore it
+      if (taintedTris.find(t) != taintedTris.end())
+        continue;
+
+      // first we find the angles of this triangle
+      Vec3 e0 = mesh.getEdge(t, 0), e1 = mesh.getEdge(t, 1), e2 = mesh.getEdge(t, 2);
+      Vec3 ne0 = e0;
+      Vec3 ne1 = e1;
+      Vec3 ne2 = e2;
+      normalize(ne0);
+      normalize(ne1);
+      normalize(ne2);
+
+      // Real thisArea = sqrMag(cross(-e2,e0));
+      // small angle approximation says sin(x) = arcsin(x) = x,
+      // arccos(x) = pi/2 - arcsin(x),
+      // cos(x) = dot(A,B),
+      // so angle is approximately 1 - dot(A,B).
+      Real angle[3];
+      angle[0] = 1.0 - dot(ne0, -ne2);
+      angle[1] = 1.0 - dot(ne1, -ne0);
+      angle[2] = 1.0 - dot(ne2, -ne1);
+      Real worstAngle = angle[0];
+      int which = 0;
+      if (angle[1] < worstAngle) {
+        worstAngle = angle[1];
+        which = 1;
+      }
+      if (angle[2] < worstAngle) {
+        worstAngle = angle[2];
+        which = 2;
+      }
+
+      // then we see if the angle is too small
+      if (worstAngle < minAngle) {
+        Vec3 edgevect;
+        Vec3 endpoint;
+        switch (which) {
+          case 0:
+            endpoint = mesh.getNode(t, 1);
+            edgevect = e1;
+            break;
+          case 1:
+            endpoint = mesh.getNode(t, 2);
+            edgevect = e2;
+            break;
+          case 2:
+            endpoint = mesh.getNode(t, 0);
+            edgevect = e0;
+            break;
+          default:
+            break;
+        }
+
+        CollapseEdge(mesh,
+                     t,
+                     which,
+                     edgevect,
+                     endpoint,
+                     deletedNodes,
+                     taintedTris,
+                     edgeCollsAngle,
+                     cutTubes);
+      }
+    }
+  }
+
+  //////////////////////
+  // EDGE SUBDIVISION //
+  //////////////////////
+
+  Real maxLength2 = maxLength * maxLength;
+  for (int t = 0; t < mesh.numTris(); t++) {
+    // first we find the maximum length edge in this triangle
+    Vec3 e0 = mesh.getEdge(t, 0), e1 = mesh.getEdge(t, 1), e2 = mesh.getEdge(t, 2);
+    Real d0 = normSquare(e0);
+    Real d1 = normSquare(e1);
+    Real d2 = normSquare(e2);
+
+    Real longest = max(d0, max(d1, d2));
+    if (longest > maxLength2) {
+      pq.push(pair<Real, int>(longest, t));
+    }
+  }
+  if (maxLength > 0) {
+
+    while (!pq.empty() && pq.top().first > maxLength2) {
+      // we only want to run through the edge list ONCE
+      // and we want to subdivide the original edges before we subdivide any newer, shorter edges,
+      // so whenever we subdivide, we add the 2 new triangles on the end of the SurfaceTri vector
+      // and mark the original subdivided triangles for deletion.
+      //  when we are done subdividing, we delete the obsolete triangles
+
+      int triA = pq.top().second;
+      pq.pop();
+
+      if (taintedTris.find(triA) != taintedTris.end())
+        continue;
+
+      // first we find the maximum length edge in this triangle
+      Vec3 e0 = mesh.getEdge(triA, 0), e1 = mesh.getEdge(triA, 1), e2 = mesh.getEdge(triA, 2);
+      Real d0 = normSquare(e0);
+      Real d1 = normSquare(e1);
+      Real d2 = normSquare(e2);
+
+      Vec3 edgevect;
+      Vec3 endpoint;
+      int which;
+      if (d0 > d1) {
+        if (d0 > d2) {
+          edgevect = e0;
+          endpoint = mesh.getNode(triA, 0);
+          ;
+          which = 2;  // 2 opposite of edge 0-1
+        }
+        else {
+          edgevect = e2;
+          endpoint = mesh.getNode(triA, 2);
+          which = 1;  // 1 opposite of edge 2-0
+        }
+      }
+      else {
+        if (d1 > d2) {
+          edgevect = e1;
+          endpoint = mesh.getNode(triA, 1);
+          which = 0;  // 0 opposite of edge 1-2
+        }
+        else {
+          edgevect = e2;
+          endpoint = mesh.getNode(triA, 2);
+          which = 1;  // 1 opposite of edge 2-0
+        }
+      }
+      // This edge is too long, so we split it in the middle
+
+      //         *
+      //        / \.
+      //       /C0 \.
+      //      /     \.
+      //     /       \.
+      //    /    B    \.
+      //   /           \.
+      //  /C1        C2 \.
+      // *---------------*
+      //  \C2        C1 /
+      //   \           /
+      //    \    A    /
+      //     \       /
+      //      \     /
+      //       \C0 /
+      //        \ /
+      //         *
+      //
+      //      BECOMES
+      //
+      //         *
+      //        /|\.
+      //       / | \.
+      //      /C0|C0\.
+      //     /   |   \.
+      //    / B1 | B2 \.
+      //   /     |     \.
+      //  /C1  C2|C1 C2 \.
+      // *-------*-------*
+      //  \C2  C1|C2  C1/
+      //   \     |     /
+      //    \ A2 | A1 /
+      //     \   |   /
+      //      \C0|C0/
+      //       \ | /
+      //        \|/
+      //         *
+
+      int triB = -1;
+      bool haveB = false;
+      Corner ca_old[3], cb_old[3];
+      ca_old[0] = mesh.corners(triA, which);
+      ca_old[1] = mesh.corners(ca_old[0].next);
+      ca_old[2] = mesh.corners(ca_old[0].prev);
+      if (ca_old[0].opposite >= 0) {
+        cb_old[0] = mesh.corners(ca_old[0].opposite);
+        cb_old[1] = mesh.corners(cb_old[0].next);
+        cb_old[2] = mesh.corners(cb_old[0].prev);
+        triB = cb_old[0].tri;
+        haveB = true;
+      }
+      // else throw Error("nonmanifold");
+
+      // subdivide in the middle of the edge and create new triangles
+      Node newNode;
+      newNode.flags = 0;
+
+      newNode.pos = endpoint + 0.5 * edgevect;  // fallback: linear average
+      // default: use butterfly
+      if (haveB)
+        newNode.pos = ModifiedButterflySubdivision(mesh, ca_old[0], cb_old[0], newNode.pos);
+
+      // find indices of two points of 'which'-edge
+      // merge flags
+      int P0 = ca_old[1].node;
+      int P1 = ca_old[2].node;
+      newNode.flags = mesh.nodes(P0).flags | mesh.nodes(P1).flags;
+
+      Real len0 = norm(mesh.nodes(P0).pos - newNode.pos);
+      Real len1 = norm(mesh.nodes(P1).pos - newNode.pos);
+
+      // remove P0/P1 1-ring connection
+      mesh.get1Ring(P0).nodes.erase(P1);
+      mesh.get1Ring(P1).nodes.erase(P0);
+      mesh.get1Ring(P0).tris.erase(triA);
+      mesh.get1Ring(P1).tris.erase(triA);
+      mesh.get1Ring(ca_old[0].node).tris.erase(triA);
+      if (haveB) {
+        mesh.get1Ring(P0).tris.erase(triB);
+        mesh.get1Ring(P1).tris.erase(triB);
+        mesh.get1Ring(cb_old[0].node).tris.erase(triB);
+      }
+
+      // init channel properties for new node
+      for (int i = 0; i < mesh.numNodeChannels(); i++) {
+        mesh.nodeChannel(i)->addInterpol(P0, P1, len0 / (len0 + len1));
+      }
+
+      // write to array
+      mesh.addTri(Triangle(ca_old[0].node, ca_old[1].node, mesh.numNodes()));
+      mesh.addTri(Triangle(ca_old[0].node, mesh.numNodes(), ca_old[2].node));
+      if (haveB) {
+        mesh.addTri(Triangle(cb_old[0].node, cb_old[1].node, mesh.numNodes()));
+        mesh.addTri(Triangle(cb_old[0].node, mesh.numNodes(), cb_old[2].node));
+      }
+      mesh.addNode(newNode);
+
+      const int nt = haveB ? 4 : 2;
+      int triA1 = mesh.numTris() - nt;
+      int triA2 = mesh.numTris() - nt + 1;
+      int triB1 = 0, triB2 = 0;
+      if (haveB) {
+        triB1 = mesh.numTris() - nt + 2;
+        triB2 = mesh.numTris() - nt + 3;
+      }
+      mesh.tris(triA1).flags = mesh.tris(triA).flags;
+      mesh.tris(triA2).flags = mesh.tris(triA).flags;
+      mesh.tris(triB1).flags = mesh.tris(triB).flags;
+      mesh.tris(triB2).flags = mesh.tris(triB).flags;
+
+      // connect new triangles to outside triangles,
+      // and connect outside triangles to these new ones
+      for (int c = 0; c < 3; c++)
+        mesh.addCorner(Corner(triA1, mesh.tris(triA1).c[c]));
+      for (int c = 0; c < 3; c++)
+        mesh.addCorner(Corner(triA2, mesh.tris(triA2).c[c]));
+      if (haveB) {
+        for (int c = 0; c < 3; c++)
+          mesh.addCorner(Corner(triB1, mesh.tris(triB1).c[c]));
+        for (int c = 0; c < 3; c++)
+          mesh.addCorner(Corner(triB2, mesh.tris(triB2).c[c]));
+      }
+
+      int baseIdx = 3 * (mesh.numTris() - nt);
+      Corner *cBase = &mesh.corners(baseIdx);
+
+      // set next/prev
+      for (int t = 0; t < nt; t++)
+        for (int c = 0; c < 3; c++) {
+          cBase[t * 3 + c].next = baseIdx + t * 3 + ((c + 1) % 3);
+          cBase[t * 3 + c].prev = baseIdx + t * 3 + ((c + 2) % 3);
+        }
+
+      // set opposites
+      // A1
+      cBase[0].opposite = haveB ? (baseIdx + 9) : -1;
+      cBase[1].opposite = baseIdx + 5;
+      cBase[2].opposite = -1;
+      if (ca_old[2].opposite >= 0) {
+        cBase[2].opposite = ca_old[2].opposite;
+        mesh.corners(cBase[2].opposite).opposite = baseIdx + 2;
+      }
+      // A2
+      cBase[3].opposite = haveB ? (baseIdx + 6) : -1;
+      cBase[4].opposite = -1;
+      if (ca_old[1].opposite >= 0) {
+        cBase[4].opposite = ca_old[1].opposite;
+        mesh.corners(cBase[4].opposite).opposite = baseIdx + 4;
+      }
+      cBase[5].opposite = baseIdx + 1;
+      if (haveB) {
+        // B1
+        cBase[6].opposite = baseIdx + 3;
+        cBase[7].opposite = baseIdx + 11;
+        cBase[8].opposite = -1;
+        if (cb_old[2].opposite >= 0) {
+          cBase[8].opposite = cb_old[2].opposite;
+          mesh.corners(cBase[8].opposite).opposite = baseIdx + 8;
+        }
+        // B2
+        cBase[9].opposite = baseIdx + 0;
+        cBase[10].opposite = -1;
+        if (cb_old[1].opposite >= 0) {
+          cBase[10].opposite = cb_old[1].opposite;
+          mesh.corners(cBase[10].opposite).opposite = baseIdx + 10;
+        }
+        cBase[11].opposite = baseIdx + 7;
+      }
+
+      ////////////////////
+      // mark the two original triangles for deletion
+      taintedTris[triA] = true;
+      mesh.removeTriFromLookup(triA);
+      if (haveB) {
+        taintedTris[triB] = true;
+        mesh.removeTriFromLookup(triB);
+      }
+
+      Real areaA1 = mesh.getFaceArea(triA1), areaA2 = mesh.getFaceArea(triA2);
+      Real areaB1 = 0, areaB2 = 0;
+      if (haveB) {
+        areaB1 = mesh.getFaceArea(triB1);
+        areaB2 = mesh.getFaceArea(triB2);
+      }
+
+      // add channel props for new triangles
+      for (int i = 0; i < mesh.numTriChannels(); i++) {
+        mesh.triChannel(i)->addSplit(triA, areaA1 / (areaA1 + areaA2));
+        mesh.triChannel(i)->addSplit(triA, areaA2 / (areaA1 + areaA2));
+        if (haveB) {
+          mesh.triChannel(i)->addSplit(triB, areaB1 / (areaB1 + areaB2));
+          mesh.triChannel(i)->addSplit(triB, areaB2 / (areaB1 + areaB2));
+        }
+      }
+
+      // add the nt new triangles (four, or two when there is no triB) to the priority queue
+      for (int i = mesh.numTris() - nt; i < mesh.numTris(); i++) {
+        // find the maximum length edge in this triangle
+        Vec3 ne0 = mesh.getEdge(i, 0), ne1 = mesh.getEdge(i, 1), ne2 = mesh.getEdge(i, 2);
+        Real nd0 = normSquare(ne0);
+        Real nd1 = normSquare(ne1);
+        Real nd2 = normSquare(ne2);
+        Real longest = max(nd0, max(nd1, nd2));
+        // longest = (int)(longest * 1e2) / 1e2; // HACK: truncate
+        pq.push(pair<Real, int>(longest, i));
+      }
+      edgeSubdivs++;
+    }
+  }
+
+  //////////////////////////////////////////
+  // EDGE COLLAPSING                      //
+  //      - based on short edge length    //
+  //////////////////////////////////////////
+  if (minLength > 0) {
+    const Real minLength2 = minLength * minLength;
+    for (int t = 0; t < mesh.numTris(); t++) {
+      // we only want to run through the edge list ONCE.
+      // we achieve this in a method very similar to the above subdivision method.
+
+      // NOTE:
+      // priority queue does not work so great in the edge collapse case,
+      // because collapsing one triangle affects the edge lengths
+      // of many neighbor triangles,
+      // and we do not update their maximum edge length in the queue.
+
+      // if this triangle has already been deleted, ignore it
+      // if(taintedTris[t])
+      //  continue;
+
+      if (taintedTris.find(t) != taintedTris.end())
+        continue;
+
+      // first we find the minimum length edge in this triangle
+      Vec3 e0 = mesh.getEdge(t, 0), e1 = mesh.getEdge(t, 1), e2 = mesh.getEdge(t, 2);
+      Real d0 = normSquare(e0);
+      Real d1 = normSquare(e1);
+      Real d2 = normSquare(e2);
+
+      Vec3 edgevect;
+      Vec3 endpoint;
+      Real dist2;
+      int which;
+      if (d0 < d1) {
+        if (d0 < d2) {
+          dist2 = d0;
+          edgevect = e0;
+          endpoint = mesh.getNode(t, 0);
+          which = 2;  // 2 opposite of edge 0-1
+        }
+        else {
+          dist2 = d2;
+          edgevect = e2;
+          endpoint = mesh.getNode(t, 2);
+          which = 1;  // 1 opposite of edge 2-0
+        }
+      }
+      else {
+        if (d1 < d2) {
+          dist2 = d1;
+          edgevect = e1;
+          endpoint = mesh.getNode(t, 1);
+          which = 0;  // 0 opposite of edge 1-2
+        }
+        else {
+          dist2 = d2;
+          edgevect = e2;
+          endpoint = mesh.getNode(t, 2);
+          which = 1;  // 1 opposite of edge 2-0
+        }
+      }
+      // then we see if the min length edge is too short
+      if (dist2 < minLength2) {
+        CollapseEdge(
+            mesh, t, which, edgevect, endpoint, deletedNodes, taintedTris, edgeCollsLen, cutTubes);
+      }
+    }
+  }
+  // cleanup nodes and triangles marked for deletion
+
+  //  we run backwards through the deleted array,
+  //  replacing triangles with ones from the back
+  //          (this avoids the potential problem of overwriting a triangle
+  //              with a to-be-deleted triangle)
+  std::map<int, bool>::reverse_iterator tti = taintedTris.rbegin();
+  for (; tti != taintedTris.rend(); tti++)
+    mesh.removeTri(tti->first);
+
+  mesh.removeNodes(deletedNodes);
+  cout << "Surface subdivision finished with " << mesh.numNodes() << " surface nodes and "
+       << mesh.numTris();
+  cout << " surface triangles, edgeSubdivs:" << edgeSubdivs << ", edgeCollapses: " << edgeCollsLen;
+  cout << " + " << edgeCollsAngle << " + " << edgeKill << endl;
+  // mesh.sanityCheck();
+}
+static PyObject *_W_1(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // Python-callable wrapper around subdivideMesh
+{
+  try {
+    PbArgs _args(_linargs, _kwds);  // argument accessor built from the positional and keyword objects
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional "notiming", default 0; -1 presumably means keyword-only — TODO confirm
+    pbPreparePlugin(parent, "subdivideMesh", !noTiming);
+    PyObject *_retval = 0;
+    {  // inner scope: _lock is destroyed before pbFinalizePlugin is called
+      ArgLocker _lock;
+      Mesh &mesh = *_args.getPtr<Mesh>("mesh", 0, &_lock);  // required; the integer is presumably the positional index — TODO confirm
+      Real minAngle = _args.get<Real>("minAngle", 1, &_lock);
+      Real minLength = _args.get<Real>("minLength", 2, &_lock);
+      Real maxLength = _args.get<Real>("maxLength", 3, &_lock);
+      bool cutTubes = _args.getOpt<bool>("cutTubes", 4, false, &_lock);  // optional, defaults to false
+      _retval = getPyNone();  // the wrapped function returns void
+      subdivideMesh(mesh, minAngle, minLength, maxLength, cutTubes);
+      _args.check();  // called only after the plugin has run
+    }
+    pbFinalizePlugin(parent, "subdivideMesh", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("subdivideMesh", e.what());  // report the failure under the plugin name
+    return 0;  // null result on error
+  }
+}
+static const Pb::Register _RP_subdivideMesh("", "subdivideMesh", _W_1);  // file-static registration object naming "subdivideMesh" and its wrapper _W_1
+extern "C" {
+void PbRegister_subdivideMesh()  // C-linkage function whose body only references the registration object
+{
+  KEEP_UNUSED(_RP_subdivideMesh);  // presumably keeps the static object from being discarded — TODO confirm
+}
+}
+
+//! Delete every connected component of the mesh that has fewer than `elements` triangles.
+//! Two triangles belong to the same component when they are linked through the
+//! `opposite` field of one of their corners. All triangles of a small component are
+//! removed, together with every node those triangles reference.
+void killSmallComponents(Mesh &mesh, int elements = 10)
+{
+  const int triCount = mesh.numTris();
+  vector<int> label(triCount);  // component id per triangle, 0 = not visited yet
+  vector<int> compSize;         // compSize[id - 1] = triangle count of component id
+  vector<int> doomedNodes;      // nodes to remove, each listed once
+  vector<bool> nodeQueued(mesh.numNodes());
+  vector<int> doomedTris;  // triangles to remove, in ascending index order
+
+  // pass 1: flood-fill over corner opposites to label the components
+  int lastLabel = 0;
+  for (int seed = 0; seed < triCount; seed++) {
+    if (label[seed] != 0)
+      continue;  // already part of an earlier component
+    label[seed] = ++lastLabel;
+
+    int members = 1;
+    stack<int> pending;
+    pending.push(seed);
+    while (!pending.empty()) {
+      const int tri = pending.top();
+      pending.pop();
+      for (int c = 0; c < 3; c++) {
+        const int opp = mesh.corners(tri, c).opposite;
+        if (opp >= 0) {  // negative means there is no opposite corner
+          const int neighbor = mesh.corners(opp).tri;
+          if (label[neighbor] == 0) {
+            label[neighbor] = lastLabel;
+            pending.push(neighbor);
+            members++;
+          }
+        }
+      }
+    }
+    compSize.push_back(members);
+  }
+
+  // pass 2: collect the triangles of undersized components and the nodes they use
+  for (int t = 0; t < triCount; t++) {
+    if (compSize[label[t] - 1] >= elements)
+      continue;
+    doomedTris.push_back(t);
+    for (int c = 0; c < 3; c++) {
+      const int node = mesh.tris(t).c[c];
+      if (nodeQueued[node])
+        continue;
+      nodeQueued[node] = true;
+      doomedNodes.push_back(node);
+    }
+  }
+
+  // remove triangles in descending index order (same order as the cleanup in subdivideMesh)
+  for (vector<int>::reverse_iterator it = doomedTris.rbegin(); it != doomedTris.rend(); ++it)
+    mesh.removeTri(*it);
+
+  mesh.removeNodes(doomedNodes);
+
+  if (!doomedTris.empty())
+    cout << "Killed small components : " << doomedNodes.size() << " nodes, " << doomedTris.size()
+         << " tris deleted." << endl;
+}
+static PyObject *_W_2(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // Python-callable wrapper around killSmallComponents
+{
+  try {
+    PbArgs _args(_linargs, _kwds);  // argument accessor built from the positional and keyword objects
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional "notiming", default 0; -1 presumably means keyword-only — TODO confirm
+    pbPreparePlugin(parent, "killSmallComponents", !noTiming);
+    PyObject *_retval = 0;
+    {  // inner scope: _lock is destroyed before pbFinalizePlugin is called
+      ArgLocker _lock;
+      Mesh &mesh = *_args.getPtr<Mesh>("mesh", 0, &_lock);  // required; the integer is presumably the positional index — TODO confirm
+      int elements = _args.getOpt<int>("elements", 1, 10, &_lock);  // optional, defaults to 10 like the C++ signature
+      _retval = getPyNone();  // the wrapped function returns void
+      killSmallComponents(mesh, elements);
+      _args.check();  // called only after the plugin has run
+    }
+    pbFinalizePlugin(parent, "killSmallComponents", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("killSmallComponents", e.what());  // report the failure under the plugin name
+    return 0;  // null result on error
+  }
+}
+static const Pb::Register _RP_killSmallComponents("", "killSmallComponents", _W_2);  // file-static registration object naming "killSmallComponents" and its wrapper _W_2
+extern "C" {
+void PbRegister_killSmallComponents()  // C-linkage function whose body only references the registration object
+{
+  KEEP_UNUSED(_RP_killSmallComponents);  // presumably keeps the static object from being discarded — TODO confirm
+}
+}
+
+}  // namespace Manta
diff --git a/extern/mantaflow/preprocessed/plugin/pressure.cpp b/extern/mantaflow/preprocessed/plugin/pressure.cpp
new file mode 100644
index 00000000000..7def2669e36
--- /dev/null
+++ b/extern/mantaflow/preprocessed/plugin/pressure.cpp
@@ -0,0 +1,1511 @@
+
+
+// DO NOT EDIT !
+// This file is generated using the MantaFlow preprocessor (prep generate).
+
+/******************************************************************************
+ *
+ * MantaFlow fluid solver framework
+ * Copyright 2011 Tobias Pfaff, Nils Thuerey
+ *
+ * This program is free software, distributed under the terms of the
+ * Apache License, Version 2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Plugins for pressure correction: solve_pressure, and ghost fluid helpers
+ *
+ ******************************************************************************/
+#include "vectorbase.h"
+#include "kernel.h"
+#include "conjugategrad.h"
+#include "multigrid.h"
+
+using namespace std;
+namespace Manta {
+
+//! Preconditioner choice for the CG solver (each enumerator has an explicit numeric value)
+// - PcNone (0): no preconditioner, use standard CG
+// - PcMIC (1): modified incomplete Cholesky preconditioner
+// - PcMGDynamic (2): multigrid preconditioner, rebuilt for each solve
+// - PcMGStatic (3): multigrid preconditioner, built only once (faster than
+//       PcMGDynamic, but works only if the Poisson equation does not change)
+enum Preconditioner { PcNone = 0, PcMIC = 1, PcMGDynamic = 2, PcMGStatic = 3 };
+
+inline static Real surfTensHelper(const IndexInt idx,  // forward declaration: MakeRhs calls this before the definition
+                                  const int offset,  // linear index offset from idx to the neighbouring cell
+                                  const Grid<Real> &phi,
+                                  const Grid<Real> &curv,
+                                  const Real surfTens,
+                                  const Real gfClamp);  // the definition follows ghostFluidHelper further down in this file
+
+//! Kernel: Construct the right-hand side of the poisson equation
+/*! The kernel executes from its constructor (runMessage() followed by run()).
+ *  Cells that are not fluid get rhs = 0. For a fluid cell the value written is:
+ *   - the negative divergence of vel, with face values weighted by *fractions when given,
+ *   - plus the obstacle velocity part when both fractions and obvel are given,
+ *   - plus surfTensHelper() terms for each empty face neighbour when phi and curv are given,
+ *   - plus perCellCorr(i, j, k) when perCellCorr is given.
+ *  cnt counts the fluid cells handled and sum adds up the values written for them;
+ *  both are combined across TBB workers through the splitting constructor and join().
+ */
+struct MakeRhs : public KernelBase {
+  MakeRhs(const FlagGrid &flags,
+          Grid<Real> &rhs,
+          const MACGrid &vel,
+          const Grid<Real> *perCellCorr,
+          const MACGrid *fractions,
+          const MACGrid *obvel,
+          const Grid<Real> *phi,
+          const Grid<Real> *curv,
+          const Real surfTens,
+          const Real gfClamp)
+      : KernelBase(&flags, 1),  // the 1 is presumably the boundary width left out of the loops — TODO confirm
+        flags(flags),
+        rhs(rhs),
+        vel(vel),
+        perCellCorr(perCellCorr),
+        fractions(fractions),
+        obvel(obvel),
+        phi(phi),
+        curv(curv),
+        surfTens(surfTens),
+        gfClamp(gfClamp),
+        cnt(0),
+        sum(0)
+  {
+    runMessage();
+    run();
+  }
+  inline void op(int i,  // per-cell body; cnt and sum refer to the reduction members of the calling instance
+                 int j,
+                 int k,
+                 const FlagGrid &flags,
+                 Grid<Real> &rhs,
+                 const MACGrid &vel,
+                 const Grid<Real> *perCellCorr,
+                 const MACGrid *fractions,
+                 const MACGrid *obvel,
+                 const Grid<Real> *phi,
+                 const Grid<Real> *curv,
+                 const Real surfTens,
+                 const Real gfClamp,
+                 int &cnt,
+                 double &sum)
+  {
+    if (!flags.isFluid(i, j, k)) {
+      rhs(i, j, k) = 0;
+      return;  // non-fluid cells do not contribute to cnt or sum
+    }
+
+    // compute negative divergence
+    // no flag checks: assumes vel at obstacle interfaces is set to zero
+    Real set(0);
+    if (!fractions) {
+      set = vel(i, j, k).x - vel(i + 1, j, k).x + vel(i, j, k).y - vel(i, j + 1, k).y;
+      if (vel.is3D())
+        set += vel(i, j, k).z - vel(i, j, k + 1).z;
+    }
+    else {
+      set = (*fractions)(i, j, k).x * vel(i, j, k).x -
+            (*fractions)(i + 1, j, k).x * vel(i + 1, j, k).x +
+            (*fractions)(i, j, k).y * vel(i, j, k).y -
+            (*fractions)(i, j + 1, k).y * vel(i, j + 1, k).y;
+      if (vel.is3D())
+        set += (*fractions)(i, j, k).z * vel(i, j, k).z -
+               (*fractions)(i, j, k + 1).z * vel(i, j, k + 1).z;
+
+      // compute divergence from obstacle by using obstacle velocity (optional)
+      if (obvel) {  // only reached when fractions is given; weights are (1 - fraction)
+        set += (1 - (*fractions)(i, j, k).x) * (*obvel)(i, j, k).x -
+               (1 - (*fractions)(i + 1, j, k).x) * (*obvel)(i + 1, j, k).x +
+               (1 - (*fractions)(i, j, k).y) * (*obvel)(i, j, k).y -
+               (1 - (*fractions)(i, j + 1, k).y) * (*obvel)(i, j + 1, k).y;
+        if (obvel->is3D())
+          set += (1 - (*fractions)(i, j, k).z) * (*obvel)(i, j, k).z -
+                 (1 - (*fractions)(i, j, k + 1).z) * (*obvel)(i, j, k + 1).z;
+      }
+    }
+
+    // compute surface tension effect (optional)
+    if (phi && curv) {
+      const IndexInt idx = flags.index(i, j, k);
+      const int X = flags.getStrideX(), Y = flags.getStrideY(), Z = flags.getStrideZ();  // linear index strides per axis
+      if (flags.isEmpty(i - 1, j, k))
+        set += surfTensHelper(idx, -X, *phi, *curv, surfTens, gfClamp);
+      if (flags.isEmpty(i + 1, j, k))
+        set += surfTensHelper(idx, +X, *phi, *curv, surfTens, gfClamp);
+      if (flags.isEmpty(i, j - 1, k))
+        set += surfTensHelper(idx, -Y, *phi, *curv, surfTens, gfClamp);
+      if (flags.isEmpty(i, j + 1, k))
+        set += surfTensHelper(idx, +Y, *phi, *curv, surfTens, gfClamp);
+      if (vel.is3D()) {
+        if (flags.isEmpty(i, j, k - 1))
+          set += surfTensHelper(idx, -Z, *phi, *curv, surfTens, gfClamp);
+        if (flags.isEmpty(i, j, k + 1))
+          set += surfTensHelper(idx, +Z, *phi, *curv, surfTens, gfClamp);
+      }
+    }
+
+    // per cell divergence correction (optional)
+    if (perCellCorr)
+      set += perCellCorr->get(i, j, k);
+
+    // obtain sum, cell count
+    sum += set;
+    cnt++;
+
+    rhs(i, j, k) = set;
+  }
+  inline const FlagGrid &getArg0()  // getArgN / typeN: positional accessors for the kernel arguments
+  {
+    return flags;
+  }
+  typedef FlagGrid type0;
+  inline Grid<Real> &getArg1()
+  {
+    return rhs;
+  }
+  typedef Grid<Real> type1;
+  inline const MACGrid &getArg2()
+  {
+    return vel;
+  }
+  typedef MACGrid type2;
+  inline const Grid<Real> *getArg3()
+  {
+    return perCellCorr;
+  }
+  typedef Grid<Real> type3;
+  inline const MACGrid *getArg4()
+  {
+    return fractions;
+  }
+  typedef MACGrid type4;
+  inline const MACGrid *getArg5()
+  {
+    return obvel;
+  }
+  typedef MACGrid type5;
+  inline const Grid<Real> *getArg6()
+  {
+    return phi;
+  }
+  typedef Grid<Real> type6;
+  inline const Grid<Real> *getArg7()
+  {
+    return curv;
+  }
+  typedef Grid<Real> type7;
+  inline const Real &getArg8()
+  {
+    return surfTens;
+  }
+  typedef Real type8;
+  inline const Real &getArg9()
+  {
+    return gfClamp;
+  }
+  typedef Real type9;
+  void runMessage()  // debug output only
+  {
+    debMsg("Executing kernel MakeRhs ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r)  // TBB body: the range covers k when maxZ > 1, otherwise j
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i,
+               j,
+               k,
+               flags,
+               rhs,
+               vel,
+               perCellCorr,
+               fractions,
+               obvel,
+               phi,
+               curv,
+               surfTens,
+               gfClamp,
+               cnt,
+               sum);
+    }
+    else {
+      const int k = 0;  // single slice
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = 1; i < _maxX; i++)
+          op(i,
+             j,
+             k,
+             flags,
+             rhs,
+             vel,
+             perCellCorr,
+             fractions,
+             obvel,
+             phi,
+             curv,
+             surfTens,
+             gfClamp,
+             cnt,
+             sum);
+    }
+  }
+  void run()  // parallel_reduce over z slices (maxZ > 1) or over rows j in [1, maxY)
+  {
+    if (maxZ > 1)
+      tbb::parallel_reduce(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+    else
+      tbb::parallel_reduce(tbb::blocked_range<IndexInt>(1, maxY), *this);
+  }
+  MakeRhs(MakeRhs &o, tbb::split)  // splitting constructor: shares the arguments, starts cnt and sum at zero
+      : KernelBase(o),
+        flags(o.flags),
+        rhs(o.rhs),
+        vel(o.vel),
+        perCellCorr(o.perCellCorr),
+        fractions(o.fractions),
+        obvel(o.obvel),
+        phi(o.phi),
+        curv(o.curv),
+        surfTens(o.surfTens),
+        gfClamp(o.gfClamp),
+        cnt(0),
+        sum(0)
+  {
+  }
+  void join(const MakeRhs &o)  // folds the partial results of another instance into this one
+  {
+    cnt += o.cnt;
+    sum += o.sum;
+  }
+  const FlagGrid &flags;
+  Grid<Real> &rhs;
+  const MACGrid &vel;
+  const Grid<Real> *perCellCorr;  // optional, may be null
+  const MACGrid *fractions;  // optional, may be null
+  const MACGrid *obvel;  // optional, only read when fractions is non-null
+  const Grid<Real> *phi;  // optional, surface tension needs both phi and curv
+  const Grid<Real> *curv;  // optional, surface tension needs both phi and curv
+  const Real surfTens;
+  const Real gfClamp;
+  int cnt;  // reduction result: number of fluid cells processed
+  double sum;  // reduction result: sum of the rhs values written for fluid cells
+};
+
+//! Kernel: make velocity divergence free by subtracting pressure gradient
+/*! The kernel executes from its constructor (runMessage() followed by run()).
+ *  Only the faces towards the lower neighbours (i-1, j-1, and k-1 in 3D) are updated:
+ *   - fluid cell, fluid lower neighbour: subtract (pressure[idx] - neighbour pressure),
+ *   - fluid cell, empty lower neighbour: subtract pressure[idx],
+ *   - empty non-outflow cell, fluid lower neighbour: add the neighbour pressure,
+ *   - empty non-outflow cell, any other lower neighbour: set the component to zero.
+ */
+struct knCorrectVelocity : public KernelBase {
+  knCorrectVelocity(const FlagGrid &flags, MACGrid &vel, const Grid<Real> &pressure)
+      : KernelBase(&flags, 1), flags(flags), vel(vel), pressure(pressure)  // the 1 is presumably the boundary width left out of the loops — TODO confirm
+  {
+    runMessage();
+    run();
+  }
+  inline void op(
+      int i, int j, int k, const FlagGrid &flags, MACGrid &vel, const Grid<Real> &pressure) const
+  {
+    const IndexInt idx = flags.index(i, j, k);
+    if (flags.isFluid(idx)) {
+      if (flags.isFluid(i - 1, j, k))
+        vel[idx].x -= (pressure[idx] - pressure(i - 1, j, k));
+      if (flags.isFluid(i, j - 1, k))
+        vel[idx].y -= (pressure[idx] - pressure(i, j - 1, k));
+      if (flags.is3D() && flags.isFluid(i, j, k - 1))
+        vel[idx].z -= (pressure[idx] - pressure(i, j, k - 1));
+
+      if (flags.isEmpty(i - 1, j, k))  // empty neighbour: only the own pressure is subtracted
+        vel[idx].x -= pressure[idx];
+      if (flags.isEmpty(i, j - 1, k))
+        vel[idx].y -= pressure[idx];
+      if (flags.is3D() && flags.isEmpty(i, j, k - 1))
+        vel[idx].z -= pressure[idx];
+    }
+    else if (flags.isEmpty(idx) &&
+             !flags.isOutflow(idx)) {  // don't change velocities in outflow cells
+      if (flags.isFluid(i - 1, j, k))
+        vel[idx].x += pressure(i - 1, j, k);
+      else
+        vel[idx].x = 0.f;
+      if (flags.isFluid(i, j - 1, k))
+        vel[idx].y += pressure(i, j - 1, k);
+      else
+        vel[idx].y = 0.f;
+      if (flags.is3D()) {
+        if (flags.isFluid(i, j, k - 1))
+          vel[idx].z += pressure(i, j, k - 1);
+        else
+          vel[idx].z = 0.f;
+      }
+    }
+  }
+  inline const FlagGrid &getArg0()  // getArgN / typeN: positional accessors for the kernel arguments
+  {
+    return flags;
+  }
+  typedef FlagGrid type0;
+  inline MACGrid &getArg1()
+  {
+    return vel;
+  }
+  typedef MACGrid type1;
+  inline const Grid<Real> &getArg2()
+  {
+    return pressure;
+  }
+  typedef Grid<Real> type2;
+  void runMessage()  // debug output only
+  {
+    debMsg("Executing kernel knCorrectVelocity ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const  // TBB body: the range covers k when maxZ > 1, otherwise j
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, flags, vel, pressure);
+    }
+    else {
+      const int k = 0;  // single slice
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = 1; i < _maxX; i++)
+          op(i, j, k, flags, vel, pressure);
+    }
+  }
+  void run()  // parallel_for over z slices (maxZ > 1) or over rows j in [1, maxY)
+  {
+    if (maxZ > 1)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
+  }
+  const FlagGrid &flags;
+  MACGrid &vel;  // modified in place
+  const Grid<Real> &pressure;
+};
+
+// *****************************************************************************
+// Ghost fluid helpers
+
+// Calculate the fraction filled with liquid, limited to the range [0, 1].
+// Assumes the inside value is smaller than the outside value: whenever
+// (inside - outside) is greater than -1e-04 the fixed value 0.5 is returned
+// and no division takes place.
+inline static Real thetaHelper(const Real inside, const Real outside)
+{
+  const Real gap = inside - outside;  // should always be negative, and large enough
+  if (gap > -1e-04) {
+    return 0.5;
+  }
+  const Real ratio = inside / gap;
+  const Real cappedAbove = std::min(Real(1), ratio);
+  return std::max(Real(0), cappedAbove);
+}
+
+// Calculate the ghost fluid factor between the cell at idx (should be a fluid cell)
+// and the cell at idx + offset. With theta = thetaHelper(phi[idx], phi[idx + offset]):
+//   theta >= gfClamp : returns 1 - 1 / theta
+//   theta <  gfClamp : returns gfClamp
+// NOTE(review): the clamped case yields gfClamp itself, not 1 - 1 / gfClamp; this is
+// kept exactly as in the original code — confirm that it is intended.
+inline static Real ghostFluidHelper(const IndexInt idx,
+                                    const int offset,
+                                    const Grid<Real> &phi,
+                                    const Real gfClamp)
+{
+  const Real theta = thetaHelper(phi[idx], phi[idx + offset]);
+  if (theta < gfClamp) {
+    return gfClamp;
+  }
+  return (1. - (1. / theta));
+}
+
+// Surface tension term between the cell at idx and its neighbour at idx + offset:
+//   surfTens * (curv[neighbour] - ghostFluidHelper(idx, offset, phi, gfClamp) * curv[idx])
+inline static Real surfTensHelper(const IndexInt idx,
+                                  const int offset,
+                                  const Grid<Real> &phi,
+                                  const Grid<Real> &curv,
+                                  const Real surfTens,
+                                  const Real gfClamp)
+{
+  const Real gfFactor = ghostFluidHelper(idx, offset, phi, gfClamp);
+  return surfTens * (curv[idx + offset] - gfFactor * curv[idx]);
+}
+
+//! Kernel: Adapt A0 for ghost fluid
+/*! The kernel executes from its constructor (runMessage() followed by run()).
+ *  For every fluid cell, ghostFluidHelper(idx, offset, phi, gfClamp) is subtracted from
+ *  A0[idx] once for each empty face neighbour (-x, +x, -y, +y, and -z, +z in 3D).
+ *  Cells that are not fluid are left untouched.
+ */
+struct ApplyGhostFluidDiagonal : public KernelBase {
+  ApplyGhostFluidDiagonal(Grid<Real> &A0,
+                          const FlagGrid &flags,
+                          const Grid<Real> &phi,
+                          const Real gfClamp)
+      : KernelBase(&A0, 1), A0(A0), flags(flags), phi(phi), gfClamp(gfClamp)  // the 1 is presumably the boundary width left out of the loops — TODO confirm
+  {
+    runMessage();
+    run();
+  }
+  inline void op(int i,
+                 int j,
+                 int k,
+                 Grid<Real> &A0,
+                 const FlagGrid &flags,
+                 const Grid<Real> &phi,
+                 const Real gfClamp) const
+  {
+    const int X = flags.getStrideX(), Y = flags.getStrideY(), Z = flags.getStrideZ();  // linear index strides per axis
+    const IndexInt idx = flags.index(i, j, k);
+    if (!flags.isFluid(idx))
+      return;
+
+    if (flags.isEmpty(i - 1, j, k))
+      A0[idx] -= ghostFluidHelper(idx, -X, phi, gfClamp);
+    if (flags.isEmpty(i + 1, j, k))
+      A0[idx] -= ghostFluidHelper(idx, +X, phi, gfClamp);
+    if (flags.isEmpty(i, j - 1, k))
+      A0[idx] -= ghostFluidHelper(idx, -Y, phi, gfClamp);
+    if (flags.isEmpty(i, j + 1, k))
+      A0[idx] -= ghostFluidHelper(idx, +Y, phi, gfClamp);
+    if (flags.is3D()) {
+      if (flags.isEmpty(i, j, k - 1))
+        A0[idx] -= ghostFluidHelper(idx, -Z, phi, gfClamp);
+      if (flags.isEmpty(i, j, k + 1))
+        A0[idx] -= ghostFluidHelper(idx, +Z, phi, gfClamp);
+    }
+  }
+  inline Grid<Real> &getArg0()  // getArgN / typeN: positional accessors for the kernel arguments
+  {
+    return A0;
+  }
+  typedef Grid<Real> type0;
+  inline const FlagGrid &getArg1()
+  {
+    return flags;
+  }
+  typedef FlagGrid type1;
+  inline const Grid<Real> &getArg2()
+  {
+    return phi;
+  }
+  typedef Grid<Real> type2;
+  inline const Real &getArg3()
+  {
+    return gfClamp;
+  }
+  typedef Real type3;
+  void runMessage()  // debug output only
+  {
+    debMsg("Executing kernel ApplyGhostFluidDiagonal ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const  // TBB body: the range covers k when maxZ > 1, otherwise j
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, A0, flags, phi, gfClamp);
+    }
+    else {
+      const int k = 0;  // single slice
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = 1; i < _maxX; i++)
+          op(i, j, k, A0, flags, phi, gfClamp);
+    }
+  }
+  void run()  // parallel_for over z slices (maxZ > 1) or over rows j in [1, maxY)
+  {
+    if (maxZ > 1)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
+  }
+  Grid<Real> &A0;  // modified in place
+  const FlagGrid &flags;
+  const Grid<Real> &phi;
+  const Real gfClamp;
+};
+
+//! Kernel: Apply velocity update: ghost fluid contribution
+/*! The kernel executes from its constructor (runMessage() followed by run()).
+ *  Only the faces towards the lower neighbours (i-1, j-1, and k-1 in 3D) are updated:
+ *   - fluid cell, empty lower neighbour: add pressure[idx] * ghostFluidHelper(idx, -stride),
+ *   - empty non-outflow cell, fluid lower neighbour: subtract the neighbour pressure times
+ *     ghostFluidHelper(neighbour index, +stride),
+ *   - empty non-outflow cell, any other lower neighbour: set the component to zero.
+ *  When curv is non-null a second pass over the same cases adds (fluid cell) or subtracts
+ *  (empty non-outflow cell) the matching surfTensHelper() value.
+ */
+struct knCorrectVelocityGhostFluid : public KernelBase {
+  knCorrectVelocityGhostFluid(MACGrid &vel,
+                              const FlagGrid &flags,
+                              const Grid<Real> &pressure,
+                              const Grid<Real> &phi,
+                              Real gfClamp,
+                              const Grid<Real> *curv,
+                              const Real surfTens)
+      : KernelBase(&vel, 1),  // the 1 is presumably the boundary width left out of the loops — TODO confirm
+        vel(vel),
+        flags(flags),
+        pressure(pressure),
+        phi(phi),
+        gfClamp(gfClamp),
+        curv(curv),
+        surfTens(surfTens)
+  {
+    runMessage();
+    run();
+  }
+  inline void op(int i,
+                 int j,
+                 int k,
+                 MACGrid &vel,
+                 const FlagGrid &flags,
+                 const Grid<Real> &pressure,
+                 const Grid<Real> &phi,
+                 Real gfClamp,
+                 const Grid<Real> *curv,
+                 const Real surfTens) const
+  {
+    const IndexInt X = flags.getStrideX(), Y = flags.getStrideY(), Z = flags.getStrideZ();  // linear index strides per axis
+    const IndexInt idx = flags.index(i, j, k);
+    if (flags.isFluid(idx)) {
+      if (flags.isEmpty(i - 1, j, k))
+        vel[idx][0] += pressure[idx] * ghostFluidHelper(idx, -X, phi, gfClamp);
+      if (flags.isEmpty(i, j - 1, k))
+        vel[idx][1] += pressure[idx] * ghostFluidHelper(idx, -Y, phi, gfClamp);
+      if (flags.is3D() && flags.isEmpty(i, j, k - 1))
+        vel[idx][2] += pressure[idx] * ghostFluidHelper(idx, -Z, phi, gfClamp);
+    }
+    else if (flags.isEmpty(idx) &&
+             !flags.isOutflow(idx)) {  // do not change velocities in outflow cells
+      if (flags.isFluid(i - 1, j, k))
+        vel[idx][0] -= pressure(i - 1, j, k) * ghostFluidHelper(idx - X, +X, phi, gfClamp);
+      else
+        vel[idx].x = 0.f;
+      if (flags.isFluid(i, j - 1, k))
+        vel[idx][1] -= pressure(i, j - 1, k) * ghostFluidHelper(idx - Y, +Y, phi, gfClamp);
+      else
+        vel[idx].y = 0.f;
+      if (flags.is3D()) {
+        if (flags.isFluid(i, j, k - 1))
+          vel[idx][2] -= pressure(i, j, k - 1) * ghostFluidHelper(idx - Z, +Z, phi, gfClamp);
+        else
+          vel[idx].z = 0.f;
+      }
+    }
+
+    if (curv) {  // surface tension part, skipped when no curvature grid is passed
+      if (flags.isFluid(idx)) {
+        if (flags.isEmpty(i - 1, j, k))
+          vel[idx].x += surfTensHelper(idx, -X, phi, *curv, surfTens, gfClamp);
+        if (flags.isEmpty(i, j - 1, k))
+          vel[idx].y += surfTensHelper(idx, -Y, phi, *curv, surfTens, gfClamp);
+        if (flags.is3D() && flags.isEmpty(i, j, k - 1))
+          vel[idx].z += surfTensHelper(idx, -Z, phi, *curv, surfTens, gfClamp);
+      }
+      else if (flags.isEmpty(idx) &&
+               !flags.isOutflow(idx)) {  // do not change velocities in outflow cells
+        vel[idx].x -= (flags.isFluid(i - 1, j, k)) ?
+                          surfTensHelper(idx - X, +X, phi, *curv, surfTens, gfClamp) :
+                          0.f;
+        vel[idx].y -= (flags.isFluid(i, j - 1, k)) ?
+                          surfTensHelper(idx - Y, +Y, phi, *curv, surfTens, gfClamp) :
+                          0.f;
+        if (flags.is3D())
+          vel[idx].z -= (flags.isFluid(i, j, k - 1)) ?
+                            surfTensHelper(idx - Z, +Z, phi, *curv, surfTens, gfClamp) :
+                            0.f;
+      }
+    }
+  }
+  inline MACGrid &getArg0()  // getArgN / typeN: positional accessors for the kernel arguments
+  {
+    return vel;
+  }
+  typedef MACGrid type0;
+  inline const FlagGrid &getArg1()
+  {
+    return flags;
+  }
+  typedef FlagGrid type1;
+  inline const Grid<Real> &getArg2()
+  {
+    return pressure;
+  }
+  typedef Grid<Real> type2;
+  inline const Grid<Real> &getArg3()
+  {
+    return phi;
+  }
+  typedef Grid<Real> type3;
+  inline Real &getArg4()
+  {
+    return gfClamp;
+  }
+  typedef Real type4;
+  inline const Grid<Real> *getArg5()
+  {
+    return curv;
+  }
+  typedef Grid<Real> type5;
+  inline const Real &getArg6()
+  {
+    return surfTens;
+  }
+  typedef Real type6;
+  void runMessage()  // debug output only
+  {
+    debMsg("Executing kernel knCorrectVelocityGhostFluid ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const  // TBB body: the range covers k when maxZ > 1, otherwise j
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, vel, flags, pressure, phi, gfClamp, curv, surfTens);
+    }
+    else {
+      const int k = 0;  // single slice
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = 1; i < _maxX; i++)
+          op(i, j, k, vel, flags, pressure, phi, gfClamp, curv, surfTens);
+    }
+  }
+  void run()  // parallel_for over z slices (maxZ > 1) or over rows j in [1, maxY)
+  {
+    if (maxZ > 1)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
+  }
+  MACGrid &vel;  // modified in place
+  const FlagGrid &flags;
+  const Grid<Real> &pressure;
+  const Grid<Real> &phi;
+  Real gfClamp;
+  const Grid<Real> *curv;  // optional, null disables the surface tension part
+  const Real surfTens;
+};
+
+// improve behavior of clamping for large time steps:
+// tells whether ghostFluidHelper takes its clamped branch for the pair of cells
+// idx / idx + offset, i.e. whether thetaHelper(phi[idx], phi[idx + offset]) is
+// below gfClamp. The answer comes back as a Real: 1 when clamped, 0 otherwise.
+inline static Real ghostFluidWasClamped(const IndexInt idx,
+                                        const int offset,
+                                        const Grid<Real> &phi,
+                                        const Real gfClamp)
+{
+  const Real theta = thetaHelper(phi[idx], phi[idx + offset]);
+  const bool clamped = theta < gfClamp;
+  return clamped;
+}
+
+struct knReplaceClampedGhostFluidVels : public KernelBase {  // kernel: overwrite velocity components of empty cells next to clamped interfaces
+  knReplaceClampedGhostFluidVels(MACGrid &vel,
+                                 const FlagGrid &flags,
+                                 const Grid<Real> &pressure,
+                                 const Grid<Real> &phi,
+                                 Real gfClamp)
+      : KernelBase(&vel, 1), vel(vel), flags(flags), pressure(pressure), phi(phi), gfClamp(gfClamp)
+  {
+    runMessage();
+    run();  // the kernel is executed right away from the constructor
+  }
+  inline void op(int i,  // per-cell operation, called for every (i, j, k) visited by operator()
+                 int j,
+                 int k,
+                 MACGrid &vel,
+                 const FlagGrid &flags,
+                 const Grid<Real> &pressure,  // not read by this kernel
+                 const Grid<Real> &phi,
+                 Real gfClamp) const
+  {
+    const IndexInt idx = flags.index(i, j, k);
+    const IndexInt X = flags.getStrideX(), Y = flags.getStrideY(), Z = flags.getStrideZ();
+    if (!flags.isEmpty(idx))
+      return;  // only empty cells are modified
+
+    if (flags.isFluid(i - 1, j, k) && ghostFluidWasClamped(idx - X, +X, phi, gfClamp))  // lower neighbors: copy their component
+      vel[idx][0] = vel[idx - X][0];
+    if (flags.isFluid(i, j - 1, k) && ghostFluidWasClamped(idx - Y, +Y, phi, gfClamp))
+      vel[idx][1] = vel[idx - Y][1];
+    if (flags.is3D() && flags.isFluid(i, j, k - 1) &&  // z neighbors are only checked for 3D grids
+        ghostFluidWasClamped(idx - Z, +Z, phi, gfClamp))
+      vel[idx][2] = vel[idx - Z][2];
+
+    if (flags.isFluid(i + 1, j, k) && ghostFluidWasClamped(idx + X, -X, phi, gfClamp))  // upper neighbors: checked last, so they win if both sides match
+      vel[idx][0] = vel[idx + X][0];
+    if (flags.isFluid(i, j + 1, k) && ghostFluidWasClamped(idx + Y, -Y, phi, gfClamp))
+      vel[idx][1] = vel[idx + Y][1];
+    if (flags.is3D() && flags.isFluid(i, j, k + 1) &&
+        ghostFluidWasClamped(idx + Z, -Z, phi, gfClamp))
+      vel[idx][2] = vel[idx + Z][2];
+  }
+  inline MACGrid &getArg0()  // getArgN / typeN: generated accessors for the kernel arguments
+  {
+    return vel;
+  }
+  typedef MACGrid type0;
+  inline const FlagGrid &getArg1()
+  {
+    return flags;
+  }
+  typedef FlagGrid type1;
+  inline const Grid<Real> &getArg2()
+  {
+    return pressure;
+  }
+  typedef Grid<Real> type2;
+  inline const Grid<Real> &getArg3()
+  {
+    return phi;
+  }
+  typedef Grid<Real> type3;
+  inline Real &getArg4()
+  {
+    return gfClamp;
+  }
+  typedef Real type4;
+  void runMessage()  // debug output: kernel name (level 3) and index range (level 4)
+  {
+    debMsg("Executing kernel knReplaceClampedGhostFluidVels ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const  // body invoked by tbb::parallel_for for one sub-range
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {  // 3D: the range holds k indices
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, vel, flags, pressure, phi, gfClamp);
+    }
+    else {  // 2D: the range holds j indices, k is fixed to 0
+      const int k = 0;
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = 1; i < _maxX; i++)
+          op(i, j, k, vel, flags, pressure, phi, gfClamp);
+    }
+  }
+  void run()  // splits over k in [minZ, maxZ) for 3D, otherwise over j in [1, maxY)
+  {
+    if (maxZ > 1)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
+  }
+  MACGrid &vel;  // velocity grid that is modified
+  const FlagGrid &flags;
+  const Grid<Real> &pressure;
+  const Grid<Real> &phi;
+  Real gfClamp;  // threshold handed to ghostFluidWasClamped()
+};
+
+//! Kernel: Count the number of empty cells of a flag grid
+
+struct CountEmptyCells : public KernelBase {  // reduction kernel; converts to int (the count)
+  CountEmptyCells(const FlagGrid &flags) : KernelBase(&flags, 0), flags(flags), numEmpty(0)
+  {
+    runMessage();
+    run();  // the reduction is executed right away from the constructor
+  }
+  inline void op(IndexInt idx, const FlagGrid &flags, int &numEmpty)  // per-cell operation on a linear index
+  {
+    if (flags.isEmpty(idx))
+      numEmpty++;
+  }
+  inline operator int()  // allows 'int n = CountEmptyCells(flags);'
+  {
+    return numEmpty;
+  }
+  inline int &getRet()
+  {
+    return numEmpty;
+  }
+  inline const FlagGrid &getArg0()
+  {
+    return flags;
+  }
+  typedef FlagGrid type0;
+  void runMessage()  // debug output: kernel name (level 3) and index range (level 4)
+  {
+    debMsg("Executing kernel CountEmptyCells ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r)  // accumulates into this instance's numEmpty for one sub-range
+  {
+    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
+      op(idx, flags, numEmpty);
+  }
+  void run()  // reduces over all linear indices [0, size)
+  {
+    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+  CountEmptyCells(CountEmptyCells &o, tbb::split) : KernelBase(o), flags(o.flags), numEmpty(0)  // splitting constructor: starts with a zero count
+  {
+  }
+  void join(const CountEmptyCells &o)  // merges the partial count of another instance
+  {
+    numEmpty += o.numEmpty;
+  }
+  const FlagGrid &flags;
+  int numEmpty;  // result: number of cells for which flags.isEmpty() holds
+};
+
+// *****************************************************************************
+// Misc helpers
+
+//! Change 'A' and 'rhs' such that pressure at 'fixPidx' is fixed to 'value'
+void fixPressure(IndexInt fixPidx,  // linear cell index; IndexInt (was int) so the caller's index is not narrowed
+                 Real value,
+                 Grid<Real> &rhs,
+                 Grid<Real> &A0,  // entry that is set to 1 for the fixed cell
+                 Grid<Real> &Ai,  // Ai[c] is used as the coupling between c and c + strideX
+                 Grid<Real> &Aj,  // Aj[c] is used as the coupling between c and c + strideY
+                 Grid<Real> &Ak)  // Ak[c] is used as the coupling between c and c + strideZ
+{
+  // Bring to rhs at neighbors (neighbor indices are accessed without bounds checks)
+  rhs[fixPidx + Ai.getStrideX()] -= Ai[fixPidx] * value;
+  rhs[fixPidx + Aj.getStrideY()] -= Aj[fixPidx] * value;
+  rhs[fixPidx - Ai.getStrideX()] -= Ai[fixPidx - Ai.getStrideX()] * value;
+  rhs[fixPidx - Aj.getStrideY()] -= Aj[fixPidx - Aj.getStrideY()] * value;
+  if (rhs.is3D()) {
+    rhs[fixPidx + Ak.getStrideZ()] -= Ak[fixPidx] * value;
+    rhs[fixPidx - Ak.getStrideZ()] -= Ak[fixPidx - Ak.getStrideZ()] * value;
+  }
+
+  // Trivialize equation at 'fixPidx' to: pressure[fixPidx] = value
+  rhs[fixPidx] = value;
+  A0[fixPidx] = Real(1);
+  Ai[fixPidx] = Aj[fixPidx] = Ak[fixPidx] = Real(0);  // couplings towards the upper neighbors
+  Ai[fixPidx - Ai.getStrideX()] = Real(0);  // couplings stored at the lower neighbors
+  Aj[fixPidx - Aj.getStrideY()] = Real(0);
+  if (rhs.is3D()) {
+    Ak[fixPidx - Ak.getStrideZ()] = Real(0);
+  }
+}
+
+// For "static" MG mode: keeps one multigrid (GridMg) instance per fluid solver.
+// Instances still present at program termination are left to the OS/user (PcMGStatic mode);
+// alternatively, release them manually in the scene file with releaseMG.
+static std::map<FluidSolver *, GridMg *> gMapMG;
+
+void releaseMG(FluidSolver *solver = nullptr)
+{
+  // no solver given: release the multigrid instances of all solvers stored in gMapMG
+  if (!solver) {
+    for (std::map<FluidSolver *, GridMg *>::iterator it = gMapMG.begin(); it != gMapMG.end();
+         ++it) {
+      if (it->first != nullptr)
+        releaseMG(it->first);
+    }
+    return;
+  }
+  // find() instead of operator[]: looking up an unknown solver must not insert a map entry
+  std::map<FluidSolver *, GridMg *>::iterator entry = gMapMG.find(solver);
+  if (entry != gMapMG.end() && entry->second) {
+    delete entry->second;
+    entry->second = nullptr;  // the key is kept, so the release-all loop above stays valid
+  }
+}
+static PyObject *_W_0(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // generated Python wrapper for releaseMG
+{
+  try {
+    PbArgs _args(_linargs, _kwds);  // wraps the positional and keyword arguments
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional, defaults to 0
+    pbPreparePlugin(parent, "releaseMG", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // lives for the duration of the call; passed to the argument getters
+      FluidSolver *solver = _args.getPtrOpt<FluidSolver>("solver", 0, nullptr, &_lock);
+      _retval = getPyNone();  // releaseMG returns nothing
+      releaseMG(solver);
+      _args.check();  // NOTE(review): presumably validates that all given arguments were consumed - confirm in PbArgs
+    }
+    pbFinalizePlugin(parent, "releaseMG", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("releaseMG", e.what());  // report the exception text under the plugin name
+    return 0;  // null result on failure
+  }
+}
+static const Pb::Register _RP_releaseMG("", "releaseMG", _W_0);  // registers _W_0 under the name "releaseMG"
+extern "C" {
+void PbRegister_releaseMG()
+{
+  KEEP_UNUSED(_RP_releaseMG);  // NOTE(review): presumably keeps the registration object from being dropped - confirm
+}
+}
+
+// *****************************************************************************
+// Main pressure solve
+
+// Note: all three pressure solve helper functions take
+// identical parameters, apart from the RHS grid (and different const values)
+
+//! Compute rhs for pressure solve
+//! (uses flags, rhs, vel, perCellCorr, fractions, obvel, phi, curv, surfTens, gfClamp, enforceCompatibility)
+void computePressureRhs(Grid<Real> &rhs,
+                        const MACGrid &vel,
+                        const Grid<Real> &pressure,
+                        const FlagGrid &flags,
+                        Real cgAccuracy = 1e-3,
+                        const Grid<Real> *phi = 0,
+                        const Grid<Real> *perCellCorr = 0,
+                        const MACGrid *fractions = 0,
+                        const MACGrid *obvel = 0,
+                        Real gfClamp = 1e-04,
+                        Real cgMaxIterFac = 1.5,
+                        bool precondition = true,
+                        int preconditioner = PcMIC,
+                        bool enforceCompatibility = false,
+                        bool useL2Norm = false,
+                        bool zeroPressureFixing = false,
+                        const Grid<Real> *curv = NULL,
+                        const Real surfTens = 0.)
+{
+  // compute divergence and init right hand side
+  MakeRhs kernMakeRhs(
+      flags, rhs, vel, perCellCorr, fractions, obvel, phi, curv, surfTens, gfClamp);
+
+  if (enforceCompatibility && kernMakeRhs.cnt > 0)  // cnt == 0 would divide by zero and fill rhs with NaN
+    rhs += (Real)(-kernMakeRhs.sum / (Real)kernMakeRhs.cnt);  // shift rhs by the negated mean (sum / cnt)
+}
+static PyObject *_W_1(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // generated Python wrapper for computePressureRhs
+{
+  try {
+    PbArgs _args(_linargs, _kwds);  // wraps the positional and keyword arguments
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional, defaults to 0
+    pbPreparePlugin(parent, "computePressureRhs", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // lives for the duration of the call; passed to the argument getters
+      Grid<Real> &rhs = *_args.getPtr<Grid<Real>>("rhs", 0, &_lock);  // getPtr: required arguments (name, position)
+      const MACGrid &vel = *_args.getPtr<MACGrid>("vel", 1, &_lock);
+      const Grid<Real> &pressure = *_args.getPtr<Grid<Real>>("pressure", 2, &_lock);
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 3, &_lock);
+      Real cgAccuracy = _args.getOpt<Real>("cgAccuracy", 4, 1e-3, &_lock);  // getOpt / getPtrOpt: optional (name, position, default)
+      const Grid<Real> *phi = _args.getPtrOpt<Grid<Real>>("phi", 5, 0, &_lock);
+      const Grid<Real> *perCellCorr = _args.getPtrOpt<Grid<Real>>("perCellCorr", 6, 0, &_lock);
+      const MACGrid *fractions = _args.getPtrOpt<MACGrid>("fractions", 7, 0, &_lock);
+      const MACGrid *obvel = _args.getPtrOpt<MACGrid>("obvel", 8, 0, &_lock);
+      Real gfClamp = _args.getOpt<Real>("gfClamp", 9, 1e-04, &_lock);
+      Real cgMaxIterFac = _args.getOpt<Real>("cgMaxIterFac", 10, 1.5, &_lock);
+      bool precondition = _args.getOpt<bool>("precondition", 11, true, &_lock);
+      int preconditioner = _args.getOpt<int>("preconditioner", 12, PcMIC, &_lock);
+      bool enforceCompatibility = _args.getOpt<bool>("enforceCompatibility", 13, false, &_lock);
+      bool useL2Norm = _args.getOpt<bool>("useL2Norm", 14, false, &_lock);
+      bool zeroPressureFixing = _args.getOpt<bool>("zeroPressureFixing", 15, false, &_lock);
+      const Grid<Real> *curv = _args.getPtrOpt<Grid<Real>>("curv", 16, NULL, &_lock);
+      const Real surfTens = _args.getOpt<Real>("surfTens", 17, 0., &_lock);
+      _retval = getPyNone();  // computePressureRhs returns nothing
+      computePressureRhs(rhs,
+                         vel,
+                         pressure,
+                         flags,
+                         cgAccuracy,
+                         phi,
+                         perCellCorr,
+                         fractions,
+                         obvel,
+                         gfClamp,
+                         cgMaxIterFac,
+                         precondition,
+                         preconditioner,
+                         enforceCompatibility,
+                         useL2Norm,
+                         zeroPressureFixing,
+                         curv,
+                         surfTens);
+      _args.check();  // NOTE(review): presumably validates that all given arguments were consumed - confirm in PbArgs
+    }
+    pbFinalizePlugin(parent, "computePressureRhs", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("computePressureRhs", e.what());  // report the exception text under the plugin name
+    return 0;  // null result on failure
+  }
+}
+static const Pb::Register _RP_computePressureRhs("", "computePressureRhs", _W_1);  // registers _W_1 under the plugin name
+extern "C" {
+void PbRegister_computePressureRhs()
+{
+  KEEP_UNUSED(_RP_computePressureRhs);  // NOTE(review): presumably keeps the registration object from being dropped - confirm
+}
+}
+
+//! Build and solve pressure system of equations
+//! perCellCorr: a divergence correction for each cell, optional
+//! fractions: for 2nd order obstacle boundaries, optional
+//! gfClamp: clamping threshold for ghost fluid method
+//! cgMaxIterFac: heuristic for the maximal number of CG iterations, increase for more accuracy
+//! preconditioner: MIC, or MG (see Preconditioner enum); useL2Norm: max norm is used by
+//! default, can be turned to L2 here; zeroPressureFixing: remove null space by fixing a single
+//! pressure value, needed for MG; curv: curvature for surface tension effects; surfTens: surface
+//! tension coefficient; retRhs: return RHS divergence, e.g., for debugging; optional
+
+void solvePressureSystem(Grid<Real> &rhs,
+                         MACGrid &vel,
+                         Grid<Real> &pressure,
+                         const FlagGrid &flags,
+                         Real cgAccuracy = 1e-3,
+                         const Grid<Real> *phi = 0,
+                         const Grid<Real> *perCellCorr = 0,
+                         const MACGrid *fractions = 0,
+                         Real gfClamp = 1e-04,
+                         Real cgMaxIterFac = 1.5,
+                         bool precondition = true,
+                         int preconditioner = PcMIC,
+                         const bool enforceCompatibility = false,
+                         const bool useL2Norm = false,
+                         const bool zeroPressureFixing = false,
+                         const Grid<Real> *curv = NULL,
+                         const Real surfTens = 0.)
+{
+  if (precondition == false)
+    preconditioner = PcNone;  // for backwards compatibility
+
+  // reserve temp grids
+  FluidSolver *parent = flags.getParent();
+  Grid<Real> residual(parent);
+  Grid<Real> search(parent);
+  Grid<Real> A0(parent);
+  Grid<Real> Ai(parent);
+  Grid<Real> Aj(parent);
+  Grid<Real> Ak(parent);
+  Grid<Real> tmp(parent);
+
+  // setup matrix and boundaries
+  MakeLaplaceMatrix(flags, A0, Ai, Aj, Ak, fractions);
+
+  if (phi) {
+    ApplyGhostFluidDiagonal(A0, flags, *phi, gfClamp);  // only A0 is adjusted when a level set is given
+  }
+
+  // check whether we need to fix some pressure value...
+  // (manually enable, or automatically for high accuracy, can cause asymmetries otherwise)
+  if (zeroPressureFixing || cgAccuracy < 1e-07) {
+    if (FLOATINGPOINT_PRECISION == 1)
+      debMsg(
+          "Warning - high CG accuracy with single-precision floating point accuracy might not "
+          "converge...",
+          2);
+
+    int numEmpty = CountEmptyCells(flags);
+    IndexInt fixPidx = -1;  // -1: no cell selected for pinning
+    if (numEmpty == 0) {  // a cell is only pinned when the grid contains no empty cells
+      // Determine appropriate fluid cell for pressure fixing
+      // 1) First check some preferred positions for approx. symmetric zeroPressureFixing
+      Vec3i topCenter(
+          flags.getSizeX() / 2, flags.getSizeY() - 1, flags.is3D() ? flags.getSizeZ() / 2 : 0);
+      Vec3i preferredPos[] = {topCenter, topCenter - Vec3i(0, 1, 0), topCenter - Vec3i(0, 2, 0)};
+
+      for (Vec3i pos : preferredPos) {
+        if (flags.isFluid(pos)) {
+          fixPidx = flags.index(pos);
+          break;
+        }
+      }
+
+      // 2) Then search whole domain
+      if (fixPidx == -1) {
+        FOR_IJK_BND(flags, 1)
+        {
+          if (flags.isFluid(i, j, k)) {
+            fixPidx = flags.index(i, j, k);
+            // break FOR_IJK_BND loop
+            i = flags.getSizeX() - 1;  // pushing the loop variables to their end values terminates the macro's loops
+            j = flags.getSizeY() - 1;
+            k = __kmax;
+          }
+        }
+      }
+      // debMsg("No empty cells! Fixing pressure of cell "<<fixPidx<<" to zero",1);
+    }
+    if (fixPidx >= 0) {
+      fixPressure(fixPidx, Real(0), rhs, A0, Ai, Aj, Ak);
+      static bool msgOnce = false;  // the message below is printed only once per process
+      if (!msgOnce) {
+        debMsg("Pinning pressure of cell " << fixPidx << " to zero", 2);
+        msgOnce = true;
+      }
+    }
+  }
+
+  // CG setup
+  // note: the last factor increases the max iterations for 2d, which right now can't use a
+  // preconditioner
+  GridCgInterface *gcg;  // owning raw pointer, deleted in the cleanup section below
+  if (vel.is3D())
+    gcg = new GridCg<ApplyMatrix>(pressure, rhs, residual, search, flags, tmp, &A0, &Ai, &Aj, &Ak);
+  else
+    gcg = new GridCg<ApplyMatrix2D>(
+        pressure, rhs, residual, search, flags, tmp, &A0, &Ai, &Aj, &Ak);
+
+  gcg->setAccuracy(cgAccuracy);
+  gcg->setUseL2Norm(useL2Norm);
+
+  int maxIter = 0;  // stays 0 (no CG iterations) if 'preconditioner' matches none of the branches below
+
+  Grid<Real> *pca0 = nullptr, *pca1 = nullptr, *pca2 = nullptr, *pca3 = nullptr;  // grids handed to setICPreconditioner
+  GridMg *pmg = nullptr;
+
+  // optional preconditioning
+  if (preconditioner == PcNone || preconditioner == PcMIC) {
+    maxIter = (int)(cgMaxIterFac * flags.getSize().max()) * (flags.is3D() ? 1 : 4);  // factor 4 for 2D grids
+
+    pca0 = new Grid<Real>(parent);
+    pca1 = new Grid<Real>(parent);
+    pca2 = new Grid<Real>(parent);
+    pca3 = new Grid<Real>(parent);
+
+    gcg->setICPreconditioner(preconditioner == PcMIC ? GridCgInterface::PC_mICP :
+                                                       GridCgInterface::PC_None,
+                             pca0,
+                             pca1,
+                             pca2,
+                             pca3);
+  }
+  else if (preconditioner == PcMGDynamic || preconditioner == PcMGStatic) {
+    maxIter = 100;
+
+    pmg = gMapMG[parent];  // reuse the multigrid instance stored for this solver, if any
+    if (!pmg) {
+      pmg = new GridMg(pressure.getSize());
+      gMapMG[parent] = pmg;
+    }
+
+    gcg->setMGPreconditioner(GridCgInterface::PC_MGP, pmg);
+  }
+
+  // CG solve
+  for (int iter = 0; iter < maxIter; iter++) {
+    if (!gcg->iterate())
+      iter = maxIter;  // iterate() returned false: leave the loop
+    if (iter < maxIter)
+      debMsg("FluidSolver::solvePressure iteration " << iter
+                                                     << ", residual: " << gcg->getResNorm(),
+             9);
+  }
+  debMsg("FluidSolver::solvePressure done. Iterations:" << gcg->getIterations()
+                                                        << ", residual:" << gcg->getResNorm(),
+         2);
+
+  // Cleanup
+  if (gcg)
+    delete gcg;
+  if (pca0)
+    delete pca0;
+  if (pca1)
+    delete pca1;
+  if (pca2)
+    delete pca2;
+  if (pca3)
+    delete pca3;
+
+  // PcMGDynamic: always delete multigrid solver after use
+  // PcMGStatic: keep multigrid solver for next solve
+  if (pmg && preconditioner == PcMGDynamic)
+    releaseMG(parent);
+}
+static PyObject *_W_2(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // generated Python wrapper for solvePressureSystem
+{
+  try {
+    PbArgs _args(_linargs, _kwds);  // wraps the positional and keyword arguments
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional, defaults to 0
+    pbPreparePlugin(parent, "solvePressureSystem", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // lives for the duration of the call; passed to the argument getters
+      Grid<Real> &rhs = *_args.getPtr<Grid<Real>>("rhs", 0, &_lock);  // getPtr: required arguments (name, position)
+      MACGrid &vel = *_args.getPtr<MACGrid>("vel", 1, &_lock);
+      Grid<Real> &pressure = *_args.getPtr<Grid<Real>>("pressure", 2, &_lock);
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 3, &_lock);
+      Real cgAccuracy = _args.getOpt<Real>("cgAccuracy", 4, 1e-3, &_lock);  // getOpt / getPtrOpt: optional (name, position, default)
+      const Grid<Real> *phi = _args.getPtrOpt<Grid<Real>>("phi", 5, 0, &_lock);
+      const Grid<Real> *perCellCorr = _args.getPtrOpt<Grid<Real>>("perCellCorr", 6, 0, &_lock);
+      const MACGrid *fractions = _args.getPtrOpt<MACGrid>("fractions", 7, 0, &_lock);
+      Real gfClamp = _args.getOpt<Real>("gfClamp", 8, 1e-04, &_lock);
+      Real cgMaxIterFac = _args.getOpt<Real>("cgMaxIterFac", 9, 1.5, &_lock);
+      bool precondition = _args.getOpt<bool>("precondition", 10, true, &_lock);
+      int preconditioner = _args.getOpt<int>("preconditioner", 11, PcMIC, &_lock);
+      const bool enforceCompatibility = _args.getOpt<bool>(
+          "enforceCompatibility", 12, false, &_lock);
+      const bool useL2Norm = _args.getOpt<bool>("useL2Norm", 13, false, &_lock);
+      const bool zeroPressureFixing = _args.getOpt<bool>("zeroPressureFixing", 14, false, &_lock);
+      const Grid<Real> *curv = _args.getPtrOpt<Grid<Real>>("curv", 15, NULL, &_lock);
+      const Real surfTens = _args.getOpt<Real>("surfTens", 16, 0., &_lock);
+      _retval = getPyNone();  // solvePressureSystem returns nothing
+      solvePressureSystem(rhs,
+                          vel,
+                          pressure,
+                          flags,
+                          cgAccuracy,
+                          phi,
+                          perCellCorr,
+                          fractions,
+                          gfClamp,
+                          cgMaxIterFac,
+                          precondition,
+                          preconditioner,
+                          enforceCompatibility,
+                          useL2Norm,
+                          zeroPressureFixing,
+                          curv,
+                          surfTens);
+      _args.check();  // NOTE(review): presumably validates that all given arguments were consumed - confirm in PbArgs
+    }
+    pbFinalizePlugin(parent, "solvePressureSystem", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("solvePressureSystem", e.what());  // report the exception text under the plugin name
+    return 0;  // null result on failure
+  }
+}
+static const Pb::Register _RP_solvePressureSystem("", "solvePressureSystem", _W_2);  // registers _W_2 under the plugin name
+extern "C" {
+void PbRegister_solvePressureSystem()
+{
+  KEEP_UNUSED(_RP_solvePressureSystem);  // NOTE(review): presumably keeps the registration object from being dropped - confirm
+}
+}
+
+//! Apply pressure gradient to make velocity field divergence free
+//! (reads only vel, pressure, flags, phi, gfClamp, curv and surfTens; the rest is unused here)
+void correctVelocity(MACGrid &vel,
+                     Grid<Real> &pressure,
+                     const FlagGrid &flags,
+                     Real cgAccuracy = 1e-3,
+                     const Grid<Real> *phi = 0,
+                     const Grid<Real> *perCellCorr = 0,
+                     const MACGrid *fractions = 0,
+                     Real gfClamp = 1e-04,
+                     Real cgMaxIterFac = 1.5,
+                     bool precondition = true,
+                     int preconditioner = PcMIC,
+                     bool enforceCompatibility = false,
+                     bool useL2Norm = false,
+                     bool zeroPressureFixing = false,
+                     const Grid<Real> *curv = NULL,
+                     const Real surfTens = 0.)
+{
+  knCorrectVelocity(flags, vel, pressure);
+  if (!phi)
+    return;  // without a level set the ghost fluid kernels are skipped
+  knCorrectVelocityGhostFluid(vel, flags, pressure, *phi, gfClamp, curv, surfTens);
+  // second pass: improves the behavior of clamping for large time steps
+  knReplaceClampedGhostFluidVels(vel, flags, pressure, *phi, gfClamp);
+}
+static PyObject *_W_3(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // generated Python wrapper for correctVelocity
+{
+  try {
+    PbArgs _args(_linargs, _kwds);  // wraps the positional and keyword arguments
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional, defaults to 0
+    pbPreparePlugin(parent, "correctVelocity", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // lives for the duration of the call; passed to the argument getters
+      MACGrid &vel = *_args.getPtr<MACGrid>("vel", 0, &_lock);  // getPtr: required arguments (name, position)
+      Grid<Real> &pressure = *_args.getPtr<Grid<Real>>("pressure", 1, &_lock);
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 2, &_lock);
+      Real cgAccuracy = _args.getOpt<Real>("cgAccuracy", 3, 1e-3, &_lock);  // getOpt / getPtrOpt: optional (name, position, default)
+      const Grid<Real> *phi = _args.getPtrOpt<Grid<Real>>("phi", 4, 0, &_lock);
+      const Grid<Real> *perCellCorr = _args.getPtrOpt<Grid<Real>>("perCellCorr", 5, 0, &_lock);
+      const MACGrid *fractions = _args.getPtrOpt<MACGrid>("fractions", 6, 0, &_lock);
+      Real gfClamp = _args.getOpt<Real>("gfClamp", 7, 1e-04, &_lock);
+      Real cgMaxIterFac = _args.getOpt<Real>("cgMaxIterFac", 8, 1.5, &_lock);
+      bool precondition = _args.getOpt<bool>("precondition", 9, true, &_lock);
+      int preconditioner = _args.getOpt<int>("preconditioner", 10, PcMIC, &_lock);
+      bool enforceCompatibility = _args.getOpt<bool>("enforceCompatibility", 11, false, &_lock);
+      bool useL2Norm = _args.getOpt<bool>("useL2Norm", 12, false, &_lock);
+      bool zeroPressureFixing = _args.getOpt<bool>("zeroPressureFixing", 13, false, &_lock);
+      const Grid<Real> *curv = _args.getPtrOpt<Grid<Real>>("curv", 14, NULL, &_lock);
+      const Real surfTens = _args.getOpt<Real>("surfTens", 15, 0., &_lock);
+      _retval = getPyNone();  // correctVelocity returns nothing
+      correctVelocity(vel,
+                      pressure,
+                      flags,
+                      cgAccuracy,
+                      phi,
+                      perCellCorr,
+                      fractions,
+                      gfClamp,
+                      cgMaxIterFac,
+                      precondition,
+                      preconditioner,
+                      enforceCompatibility,
+                      useL2Norm,
+                      zeroPressureFixing,
+                      curv,
+                      surfTens);
+      _args.check();  // NOTE(review): presumably validates that all given arguments were consumed - confirm in PbArgs
+    }
+    pbFinalizePlugin(parent, "correctVelocity", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("correctVelocity", e.what());  // report the exception text under the plugin name
+    return 0;  // null result on failure
+  }
+}
+static const Pb::Register _RP_correctVelocity("", "correctVelocity", _W_3);  // registers _W_3 under the plugin name
+extern "C" {
+void PbRegister_correctVelocity()
+{
+  KEEP_UNUSED(_RP_correctVelocity);  // NOTE(review): presumably keeps the registration object from being dropped - confirm
+}
+}
+
+//! Perform pressure projection of the velocity grid, calls
+//! all three pressure helper functions in a row.
+//! retRhs: if given, receives a copy of the computed right hand side (optional)
+void solvePressure(MACGrid &vel,
+                   Grid<Real> &pressure,
+                   const FlagGrid &flags,
+                   Real cgAccuracy = 1e-3,
+                   const Grid<Real> *phi = 0,
+                   const Grid<Real> *perCellCorr = 0,
+                   const MACGrid *fractions = 0,
+                   const MACGrid *obvel = 0,
+                   Real gfClamp = 1e-04,
+                   Real cgMaxIterFac = 1.5,
+                   bool precondition = true,
+                   int preconditioner = PcMIC,
+                   bool enforceCompatibility = false,
+                   bool useL2Norm = false,
+                   bool zeroPressureFixing = false,
+                   const Grid<Real> *curv = NULL,
+                   const Real surfTens = 0.,
+                   Grid<Real> *retRhs = NULL)
+{
+  Grid<Real> rhs(vel.getParent());  // temporary right hand side, created on the solver of 'vel'
+
+  computePressureRhs(rhs,  // step 1: fill rhs
+                     vel,
+                     pressure,
+                     flags,
+                     cgAccuracy,
+                     phi,
+                     perCellCorr,
+                     fractions,
+                     obvel,
+                     gfClamp,
+                     cgMaxIterFac,
+                     precondition,
+                     preconditioner,
+                     enforceCompatibility,
+                     useL2Norm,
+                     zeroPressureFixing,
+                     curv,
+                     surfTens);
+
+  solvePressureSystem(rhs,  // step 2: solve for pressure (takes no obvel argument)
+                      vel,
+                      pressure,
+                      flags,
+                      cgAccuracy,
+                      phi,
+                      perCellCorr,
+                      fractions,
+                      gfClamp,
+                      cgMaxIterFac,
+                      precondition,
+                      preconditioner,
+                      enforceCompatibility,
+                      useL2Norm,
+                      zeroPressureFixing,
+                      curv,
+                      surfTens);
+
+  correctVelocity(vel,  // step 3: update vel from the solved pressure
+                  pressure,
+                  flags,
+                  cgAccuracy,
+                  phi,
+                  perCellCorr,
+                  fractions,
+                  gfClamp,
+                  cgMaxIterFac,
+                  precondition,
+                  preconditioner,
+                  enforceCompatibility,
+                  useL2Norm,
+                  zeroPressureFixing,
+                  curv,
+                  surfTens);
+
+  // optionally, return RHS
+  if (retRhs) {
+    retRhs->copyFrom(rhs);
+  }
+}
+static PyObject *_W_4(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // generated Python wrapper for solvePressure
+{
+  try {
+    PbArgs _args(_linargs, _kwds);  // wraps the positional and keyword arguments
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional, defaults to 0
+    pbPreparePlugin(parent, "solvePressure", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // lives for the duration of the call; passed to the argument getters
+      MACGrid &vel = *_args.getPtr<MACGrid>("vel", 0, &_lock);  // getPtr: required arguments (name, position)
+      Grid<Real> &pressure = *_args.getPtr<Grid<Real>>("pressure", 1, &_lock);
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 2, &_lock);
+      Real cgAccuracy = _args.getOpt<Real>("cgAccuracy", 3, 1e-3, &_lock);  // getOpt / getPtrOpt: optional (name, position, default)
+      const Grid<Real> *phi = _args.getPtrOpt<Grid<Real>>("phi", 4, 0, &_lock);
+      const Grid<Real> *perCellCorr = _args.getPtrOpt<Grid<Real>>("perCellCorr", 5, 0, &_lock);
+      const MACGrid *fractions = _args.getPtrOpt<MACGrid>("fractions", 6, 0, &_lock);
+      const MACGrid *obvel = _args.getPtrOpt<MACGrid>("obvel", 7, 0, &_lock);
+      Real gfClamp = _args.getOpt<Real>("gfClamp", 8, 1e-04, &_lock);
+      Real cgMaxIterFac = _args.getOpt<Real>("cgMaxIterFac", 9, 1.5, &_lock);
+      bool precondition = _args.getOpt<bool>("precondition", 10, true, &_lock);
+      int preconditioner = _args.getOpt<int>("preconditioner", 11, PcMIC, &_lock);
+      bool enforceCompatibility = _args.getOpt<bool>("enforceCompatibility", 12, false, &_lock);
+      bool useL2Norm = _args.getOpt<bool>("useL2Norm", 13, false, &_lock);
+      bool zeroPressureFixing = _args.getOpt<bool>("zeroPressureFixing", 14, false, &_lock);
+      const Grid<Real> *curv = _args.getPtrOpt<Grid<Real>>("curv", 15, NULL, &_lock);
+      const Real surfTens = _args.getOpt<Real>("surfTens", 16, 0., &_lock);
+      Grid<Real> *retRhs = _args.getPtrOpt<Grid<Real>>("retRhs", 17, NULL, &_lock);
+      _retval = getPyNone();  // solvePressure returns nothing
+      solvePressure(vel,
+                    pressure,
+                    flags,
+                    cgAccuracy,
+                    phi,
+                    perCellCorr,
+                    fractions,
+                    obvel,
+                    gfClamp,
+                    cgMaxIterFac,
+                    precondition,
+                    preconditioner,
+                    enforceCompatibility,
+                    useL2Norm,
+                    zeroPressureFixing,
+                    curv,
+                    surfTens,
+                    retRhs);
+      _args.check();  // NOTE(review): presumably validates that all given arguments were consumed - confirm in PbArgs
+    }
+    pbFinalizePlugin(parent, "solvePressure", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("solvePressure", e.what());  // report the exception text under the plugin name
+    return 0;  // null result on failure
+  }
+}
+static const Pb::Register _RP_solvePressure("", "solvePressure", _W_4);  // registers _W_4 under the plugin name
+extern "C" {
+void PbRegister_solvePressure()
+{
+  KEEP_UNUSED(_RP_solvePressure);  // NOTE(review): presumably keeps the registration object from being dropped - confirm
+}
+}
+
+}  // namespace Manta
diff --git a/extern/mantaflow/preprocessed/plugin/ptsplugins.cpp b/extern/mantaflow/preprocessed/plugin/ptsplugins.cpp
new file mode 100644
index 00000000000..a6bbccc5966
--- /dev/null
+++ b/extern/mantaflow/preprocessed/plugin/ptsplugins.cpp
@@ -0,0 +1,502 @@
+
+
+// DO NOT EDIT !
+// This file is generated using the MantaFlow preprocessor (prep generate).
+
+// ----------------------------------------------------------------------------
+//
+// MantaFlow fluid solver framework
+// Copyright 2018 Kiwon Um, Nils Thuerey
+//
+// This program is free software, distributed under the terms of the
+// GNU General Public License (GPL)
+// http://www.gnu.org/licenses
+//
+// Particle system helper
+//
+// ----------------------------------------------------------------------------
+
+#include "particle.h"
+
+namespace Manta {
+
+//! Kernel: adds da to every entry of v, except where ptype[idx] shares a bit with exclude
+struct KnAddForcePvel : public KernelBase {
+  KnAddForcePvel(ParticleDataImpl<Vec3> &v,
+                 const Vec3 &da,
+                 const ParticleDataImpl<int> *ptype,
+                 const int exclude)
+      : KernelBase(v.size()), v(v), da(da), ptype(ptype), exclude(exclude)
+  {
+    // constructing the kernel immediately executes it
+    runMessage();
+    run();
+  }
+  //! per-index body; ptype may be null, in which case no entry is skipped
+  inline void op(IndexInt idx,
+                 ParticleDataImpl<Vec3> &v,
+                 const Vec3 &da,
+                 const ParticleDataImpl<int> *ptype,
+                 const int exclude) const
+  {
+    const bool skip = ptype && ((*ptype)[idx] & exclude);
+    if (!skip)
+      v[idx] += da;
+  }
+  // type aliases and accessors for the kernel arguments, numbered in constructor order
+  using type0 = ParticleDataImpl<Vec3>;
+  using type1 = Vec3;
+  using type2 = ParticleDataImpl<int>;
+  using type3 = int;
+  inline ParticleDataImpl<Vec3> &getArg0()
+  {
+    return v;
+  }
+  inline const Vec3 &getArg1()
+  {
+    return da;
+  }
+  inline const ParticleDataImpl<int> *getArg2()
+  {
+    return ptype;
+  }
+  inline const int &getArg3()
+  {
+    return exclude;
+  }
+  //! emit debug messages with the kernel name and its index range
+  void runMessage()
+  {
+    debMsg("Executing kernel KnAddForcePvel ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  }
+  //! process the index chunk handed out by tbb::parallel_for
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const IndexInt last = (IndexInt)__r.end();
+    for (IndexInt idx = __r.begin(); idx != last; ++idx)
+      op(idx, v, da, ptype, exclude);
+  }
+  //! apply op() to all indices in [0, size) via tbb::parallel_for
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+  // da is stored by reference: the referenced Vec3 has to stay alive while the constructor runs
+  ParticleDataImpl<Vec3> &v;
+  const Vec3 &da;
+  const ParticleDataImpl<int> *ptype;
+  const int exclude;
+};
+//! Add a force to Vec3 particle data: every non-excluded entry of vel is increased by a * dt.
+//! a: acceleration, dt: time step, ptype: optional particle type data (may be null),
+//! exclude: bit mask of particle types that are left untouched
+
+void addForcePvel(ParticleDataImpl<Vec3> &vel,
+                  const Vec3 &a,
+                  const Real dt,
+                  const ParticleDataImpl<int> *ptype,
+                  const int exclude)
+{
+  // the kernel keeps a reference to the increment, so give it a named object to refer to
+  const Vec3 deltaVel = a * dt;
+  KnAddForcePvel kernel(vel, deltaVel, ptype, exclude);
+}
+//! Python entry point for addForcePvel: unpacks (vel, a, dt, ptype, exclude) from the call
+//! arguments, runs the plugin and returns the object from getPyNone(); on a std::exception the
+//! error is reported through pbSetError and a null pointer is returned.
+static PyObject *_W_0(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    const bool noTiming = _args.getOpt<bool>("notiming", -1, false);
+    pbPreparePlugin(parent, "addForcePvel", !noTiming);
+    PyObject *_retval = nullptr;
+    {
+      // _lock is scoped to argument parsing plus the plugin call
+      ArgLocker _lock;
+      ParticleDataImpl<Vec3> &vel = *_args.getPtr<ParticleDataImpl<Vec3>>("vel", 0, &_lock);
+      const Vec3 &a = _args.get<Vec3>("a", 1, &_lock);
+      const Real dt = _args.get<Real>("dt", 2, &_lock);
+      const ParticleDataImpl<int> *ptype = _args.getPtr<ParticleDataImpl<int>>("ptype", 3, &_lock);
+      const int exclude = _args.get<int>("exclude", 4, &_lock);
+      _retval = getPyNone();
+      addForcePvel(vel, a, dt, ptype, exclude);
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "addForcePvel", !noTiming);
+    return _retval;
+  }
+  catch (const std::exception &e) {
+    pbSetError("addForcePvel", e.what());
+    return nullptr;
+  }
+}
+// Static registration object tying the name "addForcePvel" to the wrapper _W_0.
+static const Pb::Register _RP_addForcePvel("", "addForcePvel", _W_0);
+// C-linkage hook; its only statement references the registration object through KEEP_UNUSED.
+extern "C" void PbRegister_addForcePvel()
+{
+  KEEP_UNUSED(_RP_addForcePvel);
+}
+
+//! Kernel: sets v[idx] = (p[idx].pos - x_prev[idx]) * over_dt for every particle of p whose
+//! ptype entry does not share a bit with exclude
+struct KnUpdateVelocityFromDeltaPos : public KernelBase {
+  KnUpdateVelocityFromDeltaPos(const BasicParticleSystem &p,
+                               ParticleDataImpl<Vec3> &v,
+                               const ParticleDataImpl<Vec3> &x_prev,
+                               const Real over_dt,
+                               const ParticleDataImpl<int> *ptype,
+                               const int exclude)
+      : KernelBase(p.size()),
+        p(p),
+        v(v),
+        x_prev(x_prev),
+        over_dt(over_dt),
+        ptype(ptype),
+        exclude(exclude)
+  {
+    // constructing the kernel immediately executes it
+    runMessage();
+    run();
+  }
+  //! per-index body; ptype may be null, in which case no particle is skipped
+  inline void op(IndexInt idx,
+                 const BasicParticleSystem &p,
+                 ParticleDataImpl<Vec3> &v,
+                 const ParticleDataImpl<Vec3> &x_prev,
+                 const Real over_dt,
+                 const ParticleDataImpl<int> *ptype,
+                 const int exclude) const
+  {
+    const bool skip = ptype && ((*ptype)[idx] & exclude);
+    if (!skip)
+      v[idx] = (p[idx].pos - x_prev[idx]) * over_dt;
+  }
+  // type aliases and accessors for the kernel arguments, numbered in constructor order
+  using type0 = BasicParticleSystem;
+  using type1 = ParticleDataImpl<Vec3>;
+  using type2 = ParticleDataImpl<Vec3>;
+  using type3 = Real;
+  using type4 = ParticleDataImpl<int>;
+  using type5 = int;
+  inline const BasicParticleSystem &getArg0()
+  {
+    return p;
+  }
+  inline ParticleDataImpl<Vec3> &getArg1()
+  {
+    return v;
+  }
+  inline const ParticleDataImpl<Vec3> &getArg2()
+  {
+    return x_prev;
+  }
+  inline const Real &getArg3()
+  {
+    return over_dt;
+  }
+  inline const ParticleDataImpl<int> *getArg4()
+  {
+    return ptype;
+  }
+  inline const int &getArg5()
+  {
+    return exclude;
+  }
+  //! emit debug messages with the kernel name and its index range
+  void runMessage()
+  {
+    debMsg("Executing kernel KnUpdateVelocityFromDeltaPos ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  }
+  //! process the index chunk handed out by tbb::parallel_for
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const IndexInt last = (IndexInt)__r.end();
+    for (IndexInt idx = __r.begin(); idx != last; ++idx)
+      op(idx, p, v, x_prev, over_dt, ptype, exclude);
+  }
+  //! apply op() to all indices in [0, size) via tbb::parallel_for
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+  const BasicParticleSystem &p;
+  ParticleDataImpl<Vec3> &v;
+  const ParticleDataImpl<Vec3> &x_prev;
+  const Real over_dt;
+  const ParticleDataImpl<int> *ptype;
+  const int exclude;
+};
+//! Retrieve velocity from position change: vel = (current position - x_prev) / dt for every
+//! particle of parts that is not excluded via ptype/exclude.
+//! NOTE(review): dt is used as a divisor without a check -- confirm callers never pass 0.
+
+void updateVelocityFromDeltaPos(const BasicParticleSystem &parts,
+                                ParticleDataImpl<Vec3> &vel,
+                                const ParticleDataImpl<Vec3> &x_prev,
+                                const Real dt,
+                                const ParticleDataImpl<int> *ptype,
+                                const int exclude)
+{
+  // the kernel multiplies by the reciprocal of the time step
+  const Real inverseDt = 1.0 / dt;
+  KnUpdateVelocityFromDeltaPos kernel(parts, vel, x_prev, inverseDt, ptype, exclude);
+}
+//! Python entry point for updateVelocityFromDeltaPos: unpacks (parts, vel, x_prev, dt, ptype,
+//! exclude) from the call arguments, runs the plugin and returns the object from getPyNone();
+//! on a std::exception the error is reported through pbSetError and a null pointer is returned.
+static PyObject *_W_1(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    const bool noTiming = _args.getOpt<bool>("notiming", -1, false);
+    pbPreparePlugin(parent, "updateVelocityFromDeltaPos", !noTiming);
+    PyObject *_retval = nullptr;
+    {
+      // _lock is scoped to argument parsing plus the plugin call
+      ArgLocker _lock;
+      const BasicParticleSystem &parts = *_args.getPtr<BasicParticleSystem>("parts", 0, &_lock);
+      ParticleDataImpl<Vec3> &vel = *_args.getPtr<ParticleDataImpl<Vec3>>("vel", 1, &_lock);
+      const ParticleDataImpl<Vec3> &x_prev =
+          *_args.getPtr<ParticleDataImpl<Vec3>>("x_prev", 2, &_lock);
+      const Real dt = _args.get<Real>("dt", 3, &_lock);
+      const ParticleDataImpl<int> *ptype = _args.getPtr<ParticleDataImpl<int>>("ptype", 4, &_lock);
+      const int exclude = _args.get<int>("exclude", 5, &_lock);
+      _retval = getPyNone();
+      updateVelocityFromDeltaPos(parts, vel, x_prev, dt, ptype, exclude);
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "updateVelocityFromDeltaPos", !noTiming);
+    return _retval;
+  }
+  catch (const std::exception &e) {
+    pbSetError("updateVelocityFromDeltaPos", e.what());
+    return nullptr;
+  }
+}
+// Static registration object tying the name "updateVelocityFromDeltaPos" to the wrapper _W_1.
+static const Pb::Register _RP_updateVelocityFromDeltaPos("", "updateVelocityFromDeltaPos", _W_1);
+// C-linkage hook; its only statement references the registration object through KEEP_UNUSED.
+extern "C" void PbRegister_updateVelocityFromDeltaPos()
+{
+  KEEP_UNUSED(_RP_updateVelocityFromDeltaPos);
+}
+
+//! Kernel: advances p[idx].pos by v[idx] * dt for every particle of p whose ptype entry does not
+//! share a bit with exclude
+struct KnStepEuler : public KernelBase {
+  KnStepEuler(BasicParticleSystem &p,
+              const ParticleDataImpl<Vec3> &v,
+              const Real dt,
+              const ParticleDataImpl<int> *ptype,
+              const int exclude)
+      : KernelBase(p.size()), p(p), v(v), dt(dt), ptype(ptype), exclude(exclude)
+  {
+    // constructing the kernel immediately executes it
+    runMessage();
+    run();
+  }
+  //! per-index body; ptype may be null, in which case no particle is skipped
+  inline void op(IndexInt idx,
+                 BasicParticleSystem &p,
+                 const ParticleDataImpl<Vec3> &v,
+                 const Real dt,
+                 const ParticleDataImpl<int> *ptype,
+                 const int exclude) const
+  {
+    const bool skip = ptype && ((*ptype)[idx] & exclude);
+    if (!skip)
+      p[idx].pos += v[idx] * dt;
+  }
+  // type aliases and accessors for the kernel arguments, numbered in constructor order
+  using type0 = BasicParticleSystem;
+  using type1 = ParticleDataImpl<Vec3>;
+  using type2 = Real;
+  using type3 = ParticleDataImpl<int>;
+  using type4 = int;
+  inline BasicParticleSystem &getArg0()
+  {
+    return p;
+  }
+  inline const ParticleDataImpl<Vec3> &getArg1()
+  {
+    return v;
+  }
+  inline const Real &getArg2()
+  {
+    return dt;
+  }
+  inline const ParticleDataImpl<int> *getArg3()
+  {
+    return ptype;
+  }
+  inline const int &getArg4()
+  {
+    return exclude;
+  }
+  //! emit debug messages with the kernel name and its index range
+  void runMessage()
+  {
+    debMsg("Executing kernel KnStepEuler ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  }
+  //! process the index chunk handed out by tbb::parallel_for
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const IndexInt last = (IndexInt)__r.end();
+    for (IndexInt idx = __r.begin(); idx != last; ++idx)
+      op(idx, p, v, dt, ptype, exclude);
+  }
+  //! apply op() to all indices in [0, size) via tbb::parallel_for
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+  BasicParticleSystem &p;
+  const ParticleDataImpl<Vec3> &v;
+  const Real dt;
+  const ParticleDataImpl<int> *ptype;
+  const int exclude;
+};
+//! Simple forward Euler integration for a particle system: each non-excluded particle position
+//! is advanced by vel * dt, with dt read from the solver that owns parts.
+
+void eulerStep(BasicParticleSystem &parts,
+               const ParticleDataImpl<Vec3> &vel,
+               const ParticleDataImpl<int> *ptype,
+               const int exclude)
+{
+  // time step of the parent solver
+  const Real stepDt = parts.getParent()->getDt();
+  KnStepEuler kernel(parts, vel, stepDt, ptype, exclude);
+}
+//! Python entry point for eulerStep: unpacks (parts, vel, ptype, exclude) from the call
+//! arguments, runs the plugin and returns the object from getPyNone(); on a std::exception the
+//! error is reported through pbSetError and a null pointer is returned.
+static PyObject *_W_2(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    const bool noTiming = _args.getOpt<bool>("notiming", -1, false);
+    pbPreparePlugin(parent, "eulerStep", !noTiming);
+    PyObject *_retval = nullptr;
+    {
+      // _lock is scoped to argument parsing plus the plugin call
+      ArgLocker _lock;
+      BasicParticleSystem &parts = *_args.getPtr<BasicParticleSystem>("parts", 0, &_lock);
+      const ParticleDataImpl<Vec3> &vel = *_args.getPtr<ParticleDataImpl<Vec3>>("vel", 1, &_lock);
+      const ParticleDataImpl<int> *ptype = _args.getPtr<ParticleDataImpl<int>>("ptype", 2, &_lock);
+      const int exclude = _args.get<int>("exclude", 3, &_lock);
+      _retval = getPyNone();
+      eulerStep(parts, vel, ptype, exclude);
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "eulerStep", !noTiming);
+    return _retval;
+  }
+  catch (const std::exception &e) {
+    pbSetError("eulerStep", e.what());
+    return nullptr;
+  }
+}
+// Static registration object tying the name "eulerStep" to the wrapper _W_2.
+static const Pb::Register _RP_eulerStep("", "eulerStep", _W_2);
+// C-linkage hook; its only statement references the registration object through KEEP_UNUSED.
+extern "C" void PbRegister_eulerStep()
+{
+  KEEP_UNUSED(_RP_eulerStep);
+}
+
+//! Kernel: sets ptype[idx] = mark for every particle that lies inside the flag grid, sits in a
+//! cell whose flag shares a bit with cflag, and whose current ptype shares a bit with stype.
+//! The kernel range is ptype.size(); part is indexed with the same idx (presumably both have the
+//! same length -- confirm at the call site).
+struct KnSetPartType : public KernelBase {
+  KnSetPartType(ParticleDataImpl<int> &ptype,
+                const BasicParticleSystem &part,
+                const int mark,
+                const int stype,
+                const FlagGrid &flags,
+                const int cflag)
+      : KernelBase(ptype.size()),
+        ptype(ptype),
+        part(part),
+        mark(mark),
+        stype(stype),
+        flags(flags),
+        cflag(cflag)
+  {
+    // constructing the kernel immediately executes it
+    runMessage();
+    run();
+  }
+  //! per-index body: the three conditions are tested in order, bailing out on the first miss
+  inline void op(IndexInt idx,
+                 ParticleDataImpl<int> &ptype,
+                 const BasicParticleSystem &part,
+                 const int mark,
+                 const int stype,
+                 const FlagGrid &flags,
+                 const int cflag) const
+  {
+    const auto &pos = part.getPos(idx);
+    if (!flags.isInBounds(pos, 0))
+      return;
+    if (!(flags.getAt(pos) & cflag))
+      return;
+    if (!(ptype[idx] & stype))
+      return;
+    ptype[idx] = mark;
+  }
+  // type aliases and accessors for the kernel arguments, numbered in constructor order
+  using type0 = ParticleDataImpl<int>;
+  using type1 = BasicParticleSystem;
+  using type2 = int;
+  using type3 = int;
+  using type4 = FlagGrid;
+  using type5 = int;
+  inline ParticleDataImpl<int> &getArg0()
+  {
+    return ptype;
+  }
+  inline const BasicParticleSystem &getArg1()
+  {
+    return part;
+  }
+  inline const int &getArg2()
+  {
+    return mark;
+  }
+  inline const int &getArg3()
+  {
+    return stype;
+  }
+  inline const FlagGrid &getArg4()
+  {
+    return flags;
+  }
+  inline const int &getArg5()
+  {
+    return cflag;
+  }
+  //! emit debug messages with the kernel name and its index range
+  void runMessage()
+  {
+    debMsg("Executing kernel KnSetPartType ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  }
+  //! process the index chunk handed out by tbb::parallel_for
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const IndexInt last = (IndexInt)__r.end();
+    for (IndexInt idx = __r.begin(); idx != last; ++idx)
+      op(idx, ptype, part, mark, stype, flags, cflag);
+  }
+  //! apply op() to all indices in [0, size) via tbb::parallel_for
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+  ParticleDataImpl<int> &ptype;
+  const BasicParticleSystem &part;
+  const int mark;
+  const int stype;
+  const FlagGrid &flags;
+  const int cflag;
+};
+//! If a particle is of type stype and located in a cell flagged cflag, set its ptype to mark.
+//! Note that the kernel takes ptype before parts, i.e. the first two arguments are swapped
+//! relative to this function's signature.
+
+void setPartType(const BasicParticleSystem &parts,
+                 ParticleDataImpl<int> &ptype,
+                 const int mark,
+                 const int stype,
+                 const FlagGrid &flags,
+                 const int cflag)
+{
+  KnSetPartType kernel(ptype, parts, mark, stype, flags, cflag);
+}
+//! Python entry point for setPartType: unpacks (parts, ptype, mark, stype, flags, cflag) from
+//! the call arguments, runs the plugin and returns the object from getPyNone(); on a
+//! std::exception the error is reported through pbSetError and a null pointer is returned.
+static PyObject *_W_3(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    const bool noTiming = _args.getOpt<bool>("notiming", -1, false);
+    pbPreparePlugin(parent, "setPartType", !noTiming);
+    PyObject *_retval = nullptr;
+    {
+      // _lock is scoped to argument parsing plus the plugin call
+      ArgLocker _lock;
+      const BasicParticleSystem &parts = *_args.getPtr<BasicParticleSystem>("parts", 0, &_lock);
+      ParticleDataImpl<int> &ptype = *_args.getPtr<ParticleDataImpl<int>>("ptype", 1, &_lock);
+      const int mark = _args.get<int>("mark", 2, &_lock);
+      const int stype = _args.get<int>("stype", 3, &_lock);
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 4, &_lock);
+      const int cflag = _args.get<int>("cflag", 5, &_lock);
+      _retval = getPyNone();
+      setPartType(parts, ptype, mark, stype, flags, cflag);
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "setPartType", !noTiming);
+    return _retval;
+  }
+  catch (const std::exception &e) {
+    pbSetError("setPartType", e.what());
+    return nullptr;
+  }
+}
+// Static registration object tying the name "setPartType" to the wrapper _W_3.
+static const Pb::Register _RP_setPartType("", "setPartType", _W_3);
+// C-linkage hook; its only statement references the registration object through KEEP_UNUSED.
+extern "C" void PbRegister_setPartType()
+{
+  KEEP_UNUSED(_RP_setPartType);
+}
+
+}  // namespace Manta
diff --git a/extern/mantaflow/preprocessed/plugin/secondaryparticles.cpp b/extern/mantaflow/preprocessed/plugin/secondaryparticles.cpp
new file mode 100644
index 00000000000..281e12ef04b
--- /dev/null
+++ b/extern/mantaflow/preprocessed/plugin/secondaryparticles.cpp
@@ -0,0 +1,3065 @@
+
+
+// DO NOT EDIT !
+// This file is generated using the MantaFlow preprocessor (prep generate).
+
+/******************************************************************************
+ *
+ * MantaFlow fluid solver framework
+ * Copyright 2017 Georg Kohl, Nils Thuerey
+ *
+ * This program is free software, distributed under the terms of the
+ * GNU General Public License (GPL)
+ * http://www.gnu.org/licenses
+ *
+ * Secondary particle plugin for FLIP simulations
+ *
+ ******************************************************************************/
+
+#include "particle.h"
+#include "commonkernels.h"
+
+namespace Manta {
+
+#pragma region Secondary Particles for FLIP
+//----------------------------------------------------------------------------------------------------------------------------------------------------
+// Secondary Particles for FLIP
+//----------------------------------------------------------------------------------------------------------------------------------------------------
+
+// Helper that clamps potential to the interval [tauMin, tauMax] and rescales the result to
+// [0, 1]. With tauMin < tauMax the difference of the two minima is 0 below tauMin,
+// potential - tauMin inside the interval, and tauMax - tauMin above tauMax.
+// NOTE(review): tauMin == tauMax divides by zero -- confirm callers always pass tauMin < tauMax.
+Real clampPotential(Real potential, Real tauMin, Real tauMax)
+{
+  const Real cappedAtMax = std::min(potential, tauMax);
+  const Real cappedAtMin = std::min(potential, tauMin);
+  return (cappedAtMax - cappedAtMin) / (tauMax - tauMin);
+}
+
+// Computes all three potentials (trapped air, wave crest, kinetic energy) and the neighbor ratio
+// for every cell flagged itype and stores them in the respective grids. Doing everything in one
+// pass is less readable but significantly faster than using separate potential computations.
+//
+// Neighbors within +-radius cells are visited; the cell itself, out-of-bounds cells and cells
+// flagged jtype are ignored. If no neighbor remains, the neighbor ratio is set to 0 instead of
+// evaluating 0/0.
+
+struct knFlipComputeSecondaryParticlePotentials : public KernelBase {
+  knFlipComputeSecondaryParticlePotentials(Grid<Real> &potTA,
+                                           Grid<Real> &potWC,
+                                           Grid<Real> &potKE,
+                                           Grid<Real> &neighborRatio,
+                                           const FlagGrid &flags,
+                                           const MACGrid &v,
+                                           const Grid<Vec3> &normal,
+                                           const int radius,
+                                           const Real tauMinTA,
+                                           const Real tauMaxTA,
+                                           const Real tauMinWC,
+                                           const Real tauMaxWC,
+                                           const Real tauMinKE,
+                                           const Real tauMaxKE,
+                                           const Real scaleFromManta,
+                                           const int itype = FlagGrid::TypeFluid,
+                                           const int jtype = FlagGrid::TypeObstacle |
+                                                             FlagGrid::TypeOutflow |
+                                                             FlagGrid::TypeInflow)
+      : KernelBase(&potTA, radius),
+        potTA(potTA),
+        potWC(potWC),
+        potKE(potKE),
+        neighborRatio(neighborRatio),
+        flags(flags),
+        v(v),
+        normal(normal),
+        radius(radius),
+        tauMinTA(tauMinTA),
+        tauMaxTA(tauMaxTA),
+        tauMinWC(tauMinWC),
+        tauMaxWC(tauMaxWC),
+        tauMinKE(tauMinKE),
+        tauMaxKE(tauMaxKE),
+        scaleFromManta(scaleFromManta),
+        itype(itype),
+        jtype(jtype)
+  {
+    // constructing the kernel immediately executes it
+    runMessage();
+    run();
+  }
+  //! per-cell body: writes potTA, potWC, potKE and neighborRatio at (i, j, k) for itype cells
+  inline void op(int i,
+                 int j,
+                 int k,
+                 Grid<Real> &potTA,
+                 Grid<Real> &potWC,
+                 Grid<Real> &potKE,
+                 Grid<Real> &neighborRatio,
+                 const FlagGrid &flags,
+                 const MACGrid &v,
+                 const Grid<Vec3> &normal,
+                 const int radius,
+                 const Real tauMinTA,
+                 const Real tauMaxTA,
+                 const Real tauMinWC,
+                 const Real tauMaxWC,
+                 const Real tauMinKE,
+                 const Real tauMaxKE,
+                 const Real scaleFromManta,
+                 const int itype = FlagGrid::TypeFluid,
+                 const int jtype = FlagGrid::TypeObstacle | FlagGrid::TypeOutflow |
+                                   FlagGrid::TypeInflow) const
+  {
+
+    if (!(flags(i, j, k) & itype))
+      return;
+
+    // compute trapped air potential + wave crest potential + neighbor ratio at once
+    const Vec3 xi = scaleFromManta * Vec3(i, j, k);  // scale to unit cube
+    const Vec3 vi = scaleFromManta * v.getCentered(i, j, k);
+    const Vec3 ni = getNormalized(normal(i, j, k));
+    Real vdiff = 0;         // for trapped air
+    Real kappa = 0;         // for wave crests
+    int countFluid = 0;     // for neighbor ratio
+    int countMaxFluid = 0;  // for neighbor ratio
+
+    // estimate sqrt(2)*radius resp. sqrt(3)*radius for h, due to squared resp. cubic neighbor
+    // area; the value does not depend on the neighbor, so it is computed once per cell
+    const Real h = !potTA.is3D() ? 1.414 * radius : 1.732 * radius;
+
+    // iterate over neighboring cells within radius
+    for (IndexInt x = i - radius; x <= i + radius; x++) {
+      for (IndexInt y = j - radius; y <= j + radius; y++) {
+        for (IndexInt z = k - radius; z <= k + radius; z++) {
+          if ((x == i && y == j && z == k) || !flags.isInBounds(Vec3i(x, y, z)) ||
+              (flags(x, y, z) & jtype))
+            continue;
+
+          // every remaining neighbor counts towards the maximum, itype ones also as fluid
+          countMaxFluid++;
+          if (flags(x, y, z) & itype)
+            countFluid++;
+
+          const Vec3 xj = scaleFromManta * Vec3(x, y, z);  // scale to unit cube
+          const Vec3 vj = scaleFromManta * v.getCentered(x, y, z);
+          const Vec3 nj = getNormalized(normal(x, y, z));
+          const Vec3 xij = xi - xj;
+          const Vec3 vij = vi - vj;
+          vdiff += norm(vij) * (1 - dot(getNormalized(vij), getNormalized(xij))) *
+                   (1 - norm(xij) / h);
+
+          if (dot(getNormalized(xij), ni) < 0) {  // identifies wave crests
+            kappa += (1 - dot(ni, nj)) * (1 - norm(xij) / h);
+          }
+        }
+      }
+    }
+
+    // countMaxFluid stays 0 when every neighbor was skipped above; avoid 0/0 (NaN) in that case
+    if (countMaxFluid > 0) {
+      neighborRatio(i, j, k) = float(countFluid) / float(countMaxFluid);
+    }
+    else {
+      neighborRatio(i, j, k) = Real(0);
+    }
+
+    potTA(i, j, k) = clampPotential(vdiff, tauMinTA, tauMaxTA);
+    if (dot(getNormalized(vi), ni) >= 0.6) {  // avoid marking borders of the scene as wave crest
+      potWC(i, j, k) = clampPotential(kappa, tauMinWC, tauMaxWC);
+    }
+    else {
+      potWC(i, j, k) = Real(0);
+    }
+
+    // compute kinetic energy potential
+    // use arbitrary constant for mass, potential adjusts with thresholds anyways
+    Real ek = Real(0.5) * 125 * normSquare(vi);
+    potKE(i, j, k) = clampPotential(ek, tauMinKE, tauMaxKE);
+  }
+  // accessors and type aliases for the kernel arguments, numbered in constructor order
+  inline Grid<Real> &getArg0()
+  {
+    return potTA;
+  }
+  typedef Grid<Real> type0;
+  inline Grid<Real> &getArg1()
+  {
+    return potWC;
+  }
+  typedef Grid<Real> type1;
+  inline Grid<Real> &getArg2()
+  {
+    return potKE;
+  }
+  typedef Grid<Real> type2;
+  inline Grid<Real> &getArg3()
+  {
+    return neighborRatio;
+  }
+  typedef Grid<Real> type3;
+  inline const FlagGrid &getArg4()
+  {
+    return flags;
+  }
+  typedef FlagGrid type4;
+  inline const MACGrid &getArg5()
+  {
+    return v;
+  }
+  typedef MACGrid type5;
+  inline const Grid<Vec3> &getArg6()
+  {
+    return normal;
+  }
+  typedef Grid<Vec3> type6;
+  inline const int &getArg7()
+  {
+    return radius;
+  }
+  typedef int type7;
+  inline const Real &getArg8()
+  {
+    return tauMinTA;
+  }
+  typedef Real type8;
+  inline const Real &getArg9()
+  {
+    return tauMaxTA;
+  }
+  typedef Real type9;
+  inline const Real &getArg10()
+  {
+    return tauMinWC;
+  }
+  typedef Real type10;
+  inline const Real &getArg11()
+  {
+    return tauMaxWC;
+  }
+  typedef Real type11;
+  inline const Real &getArg12()
+  {
+    return tauMinKE;
+  }
+  typedef Real type12;
+  inline const Real &getArg13()
+  {
+    return tauMaxKE;
+  }
+  typedef Real type13;
+  inline const Real &getArg14()
+  {
+    return scaleFromManta;
+  }
+  typedef Real type14;
+  inline const int &getArg15()
+  {
+    return itype;
+  }
+  typedef int type15;
+  inline const int &getArg16()
+  {
+    return jtype;
+  }
+  typedef int type16;
+  //! emit debug messages with the kernel name and its grid range
+  void runMessage()
+  {
+    debMsg("Executing kernel knFlipComputeSecondaryParticlePotentials ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  //! process one chunk handed out by tbb::parallel_for: a range of k slices when maxZ > 1,
+  //! otherwise a range of j rows at k = 0; i and j start at radius
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = radius; j < _maxY; j++)
+          for (int i = radius; i < _maxX; i++)
+            op(i,
+               j,
+               k,
+               potTA,
+               potWC,
+               potKE,
+               neighborRatio,
+               flags,
+               v,
+               normal,
+               radius,
+               tauMinTA,
+               tauMaxTA,
+               tauMinWC,
+               tauMaxWC,
+               tauMinKE,
+               tauMaxKE,
+               scaleFromManta,
+               itype,
+               jtype);
+    }
+    else {
+      const int k = 0;
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = radius; i < _maxX; i++)
+          op(i,
+             j,
+             k,
+             potTA,
+             potWC,
+             potKE,
+             neighborRatio,
+             flags,
+             v,
+             normal,
+             radius,
+             tauMinTA,
+             tauMaxTA,
+             tauMinWC,
+             tauMaxWC,
+             tauMinKE,
+             tauMaxKE,
+             scaleFromManta,
+             itype,
+             jtype);
+    }
+  }
+  //! parallelize over [minZ, maxZ) when maxZ > 1, otherwise over [radius, maxY)
+  void run()
+  {
+    if (maxZ > 1)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(radius, maxY), *this);
+  }
+  Grid<Real> &potTA;
+  Grid<Real> &potWC;
+  Grid<Real> &potKE;
+  Grid<Real> &neighborRatio;
+  const FlagGrid &flags;
+  const MACGrid &v;
+  const Grid<Vec3> &normal;
+  const int radius;
+  const Real tauMinTA;
+  const Real tauMaxTA;
+  const Real tauMinWC;
+  const Real tauMaxWC;
+  const Real tauMinKE;
+  const Real tauMaxKE;
+  const Real scaleFromManta;
+  const int itype;
+  const int jtype;
+};
+
+//! Plugin entry: resets the four output grids, fills normal with the result of
+//! GradientOp(normal, phi) and then runs knFlipComputeSecondaryParticlePotentials to write the
+//! trapped air, wave crest and kinetic energy potentials plus the neighbor ratio.
+void flipComputeSecondaryParticlePotentials(Grid<Real> &potTA,
+                                            Grid<Real> &potWC,
+                                            Grid<Real> &potKE,
+                                            Grid<Real> &neighborRatio,
+                                            const FlagGrid &flags,
+                                            const MACGrid &v,
+                                            Grid<Vec3> &normal,
+                                            const Grid<Real> &phi,
+                                            const int radius,
+                                            const Real tauMinTA,
+                                            const Real tauMaxTA,
+                                            const Real tauMinWC,
+                                            const Real tauMaxWC,
+                                            const Real tauMinKE,
+                                            const Real tauMaxKE,
+                                            const Real scaleFromManta,
+                                            const int itype = FlagGrid::TypeFluid,
+                                            const int jtype = FlagGrid::TypeObstacle |
+                                                              FlagGrid::TypeOutflow |
+                                                              FlagGrid::TypeInflow)
+{
+  // start from empty outputs; the kernel only writes cells flagged itype
+  potTA.clear();
+  potWC.clear();
+  potKE.clear();
+  neighborRatio.clear();
+
+  // normal is an output here as well: it is overwritten before the kernel reads it
+  GradientOp(normal, phi);
+
+  knFlipComputeSecondaryParticlePotentials kernel(potTA,
+                                                  potWC,
+                                                  potKE,
+                                                  neighborRatio,
+                                                  flags,
+                                                  v,
+                                                  normal,
+                                                  radius,
+                                                  tauMinTA,
+                                                  tauMaxTA,
+                                                  tauMinWC,
+                                                  tauMaxWC,
+                                                  tauMinKE,
+                                                  tauMaxKE,
+                                                  scaleFromManta,
+                                                  itype,
+                                                  jtype);
+}
+//! Python entry point for flipComputeSecondaryParticlePotentials: unpacks the 16 required and
+//! 2 optional (itype, jtype) arguments, runs the plugin and returns the object from getPyNone();
+//! on a std::exception the error is reported through pbSetError and a null pointer is returned.
+static PyObject *_W_0(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    const bool noTiming = _args.getOpt<bool>("notiming", -1, false);
+    pbPreparePlugin(parent, "flipComputeSecondaryParticlePotentials", !noTiming);
+    PyObject *_retval = nullptr;
+    {
+      // _lock is scoped to argument parsing plus the plugin call
+      ArgLocker _lock;
+      // output grids
+      Grid<Real> &potTA = *_args.getPtr<Grid<Real>>("potTA", 0, &_lock);
+      Grid<Real> &potWC = *_args.getPtr<Grid<Real>>("potWC", 1, &_lock);
+      Grid<Real> &potKE = *_args.getPtr<Grid<Real>>("potKE", 2, &_lock);
+      Grid<Real> &neighborRatio = *_args.getPtr<Grid<Real>>("neighborRatio", 3, &_lock);
+      // simulation state
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 4, &_lock);
+      const MACGrid &v = *_args.getPtr<MACGrid>("v", 5, &_lock);
+      Grid<Vec3> &normal = *_args.getPtr<Grid<Vec3>>("normal", 6, &_lock);
+      const Grid<Real> &phi = *_args.getPtr<Grid<Real>>("phi", 7, &_lock);
+      // neighborhood size and clamping thresholds
+      const int radius = _args.get<int>("radius", 8, &_lock);
+      const Real tauMinTA = _args.get<Real>("tauMinTA", 9, &_lock);
+      const Real tauMaxTA = _args.get<Real>("tauMaxTA", 10, &_lock);
+      const Real tauMinWC = _args.get<Real>("tauMinWC", 11, &_lock);
+      const Real tauMaxWC = _args.get<Real>("tauMaxWC", 12, &_lock);
+      const Real tauMinKE = _args.get<Real>("tauMinKE", 13, &_lock);
+      const Real tauMaxKE = _args.get<Real>("tauMaxKE", 14, &_lock);
+      const Real scaleFromManta = _args.get<Real>("scaleFromManta", 15, &_lock);
+      // optional cell type masks
+      const int itype = _args.getOpt<int>("itype", 16, FlagGrid::TypeFluid, &_lock);
+      const int jtype = _args.getOpt<int>(
+          "jtype",
+          17,
+          FlagGrid::TypeObstacle | FlagGrid::TypeOutflow | FlagGrid::TypeInflow,
+          &_lock);
+      _retval = getPyNone();
+      flipComputeSecondaryParticlePotentials(potTA,
+                                             potWC,
+                                             potKE,
+                                             neighborRatio,
+                                             flags,
+                                             v,
+                                             normal,
+                                             phi,
+                                             radius,
+                                             tauMinTA,
+                                             tauMaxTA,
+                                             tauMinWC,
+                                             tauMaxWC,
+                                             tauMinKE,
+                                             tauMaxKE,
+                                             scaleFromManta,
+                                             itype,
+                                             jtype);
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "flipComputeSecondaryParticlePotentials", !noTiming);
+    return _retval;
+  }
+  catch (const std::exception &e) {
+    pbSetError("flipComputeSecondaryParticlePotentials", e.what());
+    return nullptr;
+  }
+}
+// Static registration object tying the name "flipComputeSecondaryParticlePotentials" to the
+// wrapper _W_0.
+static const Pb::Register _RP_flipComputeSecondaryParticlePotentials(
+    "", "flipComputeSecondaryParticlePotentials", _W_0);
+// C-linkage hook; its only statement references the registration object through KEEP_UNUSED.
+extern "C" void PbRegister_flipComputeSecondaryParticlePotentials()
+{
+  KEEP_UNUSED(_RP_flipComputeSecondaryParticlePotentials);
+}
+
+// Adds secondary particles to &pts_sec for every fluid cell in &flags according to the potential
+// grids &potTA, &potWC and &potKE. Secondary particles are uniformly sampled in every fluid cell
+// in a randomly offset cylinder in fluid movement direction. In contrast to
+// flipSampleSecondaryParticles, this uses more cylinders per cell and interpolates velocity and
+// potentials. To control the number of cylinders in each dimension, adjust radius (0.25 => 2 cyl,
+// 0.1666 => 3 cyl, 0.125 => 4 cyl etc.).
+
+struct knFlipSampleSecondaryParticlesMoreCylinders : public KernelBase {
+  knFlipSampleSecondaryParticlesMoreCylinders(const FlagGrid &flags,
+                                              const MACGrid &v,
+                                              BasicParticleSystem &pts_sec,
+                                              ParticleDataImpl<Vec3> &v_sec,
+                                              ParticleDataImpl<Real> &l_sec,
+                                              const Real lMin,
+                                              const Real lMax,
+                                              const Grid<Real> &potTA,
+                                              const Grid<Real> &potWC,
+                                              const Grid<Real> &potKE,
+                                              const Grid<Real> &neighborRatio,
+                                              const Real c_s,
+                                              const Real c_b,
+                                              const Real k_ta,
+                                              const Real k_wc,
+                                              const Real dt,
+                                              const int itype = FlagGrid::TypeFluid)
+      : KernelBase(&flags, 0),
+        flags(flags),
+        v(v),
+        pts_sec(pts_sec),
+        v_sec(v_sec),
+        l_sec(l_sec),
+        lMin(lMin),
+        lMax(lMax),
+        potTA(potTA),
+        potWC(potWC),
+        potKE(potKE),
+        neighborRatio(neighborRatio),
+        c_s(c_s),
+        c_b(c_b),
+        k_ta(k_ta),
+        k_wc(k_wc),
+        dt(dt),
+        itype(itype)
+  {
+    runMessage();
+    run();
+  }
+  inline void op(int i,
+                 int j,
+                 int k,
+                 const FlagGrid &flags,
+                 const MACGrid &v,
+                 BasicParticleSystem &pts_sec,
+                 ParticleDataImpl<Vec3> &v_sec,
+                 ParticleDataImpl<Real> &l_sec,
+                 const Real lMin,
+                 const Real lMax,
+                 const Grid<Real> &potTA,
+                 const Grid<Real> &potWC,
+                 const Grid<Real> &potKE,
+                 const Grid<Real> &neighborRatio,
+                 const Real c_s,
+                 const Real c_b,
+                 const Real k_ta,
+                 const Real k_wc,
+                 const Real dt,
+                 const int itype = FlagGrid::TypeFluid)
+  {
+
+    if (!(flags(i, j, k) & itype))
+      return;
+
+    RandomStream mRand(9832);
+    Real radius =
+        0.25;  // diameter=0.5 => sampling with two cylinders in each dimension since cell size=1
+    for (Real x = i - radius; x <= i + radius; x += 2 * radius) {
+      for (Real y = j - radius; y <= j + radius; y += 2 * radius) {
+        for (Real z = k - radius; z <= k + radius; z += 2 * radius) {
+
+          Vec3 xi = Vec3(x, y, z);
+          Real KE = potKE.getInterpolated(xi);
+          Real TA = potTA.getInterpolated(xi);
+          Real WC = potWC.getInterpolated(xi);
+
+          const int n = KE * (k_ta * TA + k_wc * WC) * dt;  // number of secondary particles
+          if (n == 0)
+            continue;
+          Vec3 vi = v.getInterpolated(xi);
+          Vec3 dir = dt * vi;  // direction of movement of current particle
+          Vec3 e1 = getNormalized(Vec3(dir.z, 0, -dir.x));  // perpendicular to dir
+          Vec3 e2 = getNormalized(
+              cross(e1, dir));  // perpendicular to dir and e1, so e1 and e1 create reference plane
+
+          for (int di = 0; di < n; di++) {
+            const Real r = radius * sqrt(mRand.getReal());        // distance to cylinder axis
+            const Real theta = mRand.getReal() * Real(2) * M_PI;  // azimuth
+            const Real h = mRand.getReal() * norm(dt * vi);       // distance to reference plane
+            Vec3 xd = xi + r * cos(theta) * e1 + r * sin(theta) * e2 + h * getNormalized(vi);
+            if (!flags.is3D())
+              xd.z = 0;
+            pts_sec.add(xd);
+
+            v_sec[v_sec.size() - 1] = r * cos(theta) * e1 + r * sin(theta) * e2 +
+                                      vi;  // init velocity of new particle
+            Real temp = (KE + TA + WC) / 3;
+            l_sec[l_sec.size() - 1] = ((lMax - lMin) * temp) + lMin +
+                                      mRand.getReal() * 0.1;  // init lifetime of new particle
+
+            // init type of new particle
+            if (neighborRatio(i, j, k) < c_s) {
+              pts_sec[pts_sec.size() - 1].flag = ParticleBase::PSPRAY;
+            }
+            else if (neighborRatio(i, j, k) > c_b) {
+              pts_sec[pts_sec.size() - 1].flag = ParticleBase::PBUBBLE;
+            }
+            else {
+              pts_sec[pts_sec.size() - 1].flag = ParticleBase::PFOAM;
+            }
+          }
+        }
+      }
+    }
+  }
+  inline const FlagGrid &getArg0()
+  {
+    return flags;
+  }
+  typedef FlagGrid type0;
+  inline const MACGrid &getArg1()
+  {
+    return v;
+  }
+  typedef MACGrid type1;
+  inline BasicParticleSystem &getArg2()
+  {
+    return pts_sec;
+  }
+  typedef BasicParticleSystem type2;
+  inline ParticleDataImpl<Vec3> &getArg3()
+  {
+    return v_sec;
+  }
+  typedef ParticleDataImpl<Vec3> type3;
+  inline ParticleDataImpl<Real> &getArg4()
+  {
+    return l_sec;
+  }
+  typedef ParticleDataImpl<Real> type4;
+  inline const Real &getArg5()
+  {
+    return lMin;
+  }
+  typedef Real type5;
+  inline const Real &getArg6()
+  {
+    return lMax;
+  }
+  typedef Real type6;
+  inline const Grid<Real> &getArg7()
+  {
+    return potTA;
+  }
+  typedef Grid<Real> type7;
+  inline const Grid<Real> &getArg8()
+  {
+    return potWC;
+  }
+  typedef Grid<Real> type8;
+  inline const Grid<Real> &getArg9()
+  {
+    return potKE;
+  }
+  typedef Grid<Real> type9;
+  inline const Grid<Real> &getArg10()
+  {
+    return neighborRatio;
+  }
+  typedef Grid<Real> type10;
+  inline const Real &getArg11()
+  {
+    return c_s;
+  }
+  typedef Real type11;
+  inline const Real &getArg12()
+  {
+    return c_b;
+  }
+  typedef Real type12;
+  inline const Real &getArg13()
+  {
+    return k_ta;
+  }
+  typedef Real type13;
+  inline const Real &getArg14()
+  {
+    return k_wc;
+  }
+  typedef Real type14;
+  inline const Real &getArg15()
+  {
+    return dt;
+  }
+  typedef Real type15;
+  inline const int &getArg16()
+  {
+    return itype;
+  }
+  typedef int type16;
+  void runMessage()
+  {
+    debMsg("Executing kernel knFlipSampleSecondaryParticlesMoreCylinders ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void run()
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    for (int k = minZ; k < maxZ; k++)
+      for (int j = 0; j < _maxY; j++)
+        for (int i = 0; i < _maxX; i++)
+          op(i,
+             j,
+             k,
+             flags,
+             v,
+             pts_sec,
+             v_sec,
+             l_sec,
+             lMin,
+             lMax,
+             potTA,
+             potWC,
+             potKE,
+             neighborRatio,
+             c_s,
+             c_b,
+             k_ta,
+             k_wc,
+             dt,
+             itype);
+  }
+  const FlagGrid &flags;
+  const MACGrid &v;
+  BasicParticleSystem &pts_sec;
+  ParticleDataImpl<Vec3> &v_sec;
+  ParticleDataImpl<Real> &l_sec;
+  const Real lMin;
+  const Real lMax;
+  const Grid<Real> &potTA;
+  const Grid<Real> &potWC;
+  const Grid<Real> &potKE;
+  const Grid<Real> &neighborRatio;
+  const Real c_s;
+  const Real c_b;
+  const Real k_ta;
+  const Real k_wc;
+  const Real dt;
+  const int itype;
+};
+
+// Adds secondary particles to &pts_sec for every cell in &flags whose type matches itype,
+// according to the potential grids &potTA, &potWC and &potKE. The particles of a cell are sampled
+// uniformly inside a single cylinder that starts at a random offset within the cell and is
+// aligned with the fluid movement direction.
+
+struct knFlipSampleSecondaryParticles : public KernelBase {
+  // Stores all arguments and immediately executes the kernel over the grid range of &flags.
+  knFlipSampleSecondaryParticles(const FlagGrid &flags,
+                                 const MACGrid &v,
+                                 BasicParticleSystem &pts_sec,
+                                 ParticleDataImpl<Vec3> &v_sec,
+                                 ParticleDataImpl<Real> &l_sec,
+                                 const Real lMin,
+                                 const Real lMax,
+                                 const Grid<Real> &potTA,
+                                 const Grid<Real> &potWC,
+                                 const Grid<Real> &potKE,
+                                 const Grid<Real> &neighborRatio,
+                                 const Real c_s,
+                                 const Real c_b,
+                                 const Real k_ta,
+                                 const Real k_wc,
+                                 const Real dt,
+                                 const int itype = FlagGrid::TypeFluid)
+      : KernelBase(&flags, 0),
+        flags(flags),
+        v(v),
+        pts_sec(pts_sec),
+        v_sec(v_sec),
+        l_sec(l_sec),
+        lMin(lMin),
+        lMax(lMax),
+        potTA(potTA),
+        potWC(potWC),
+        potKE(potKE),
+        neighborRatio(neighborRatio),
+        c_s(c_s),
+        c_b(c_b),
+        k_ta(k_ta),
+        k_wc(k_wc),
+        dt(dt),
+        itype(itype)
+  {
+    runMessage();
+    run();
+  }
+  // Per-cell body of the kernel. The particle count n = KE * (k_ta * TA + k_wc * WC) * dt
+  // (truncated to int) is computed from the cell's potentials, and n particles are appended to
+  // pts_sec. Each new particle gets a velocity in v_sec, a lifetime in l_sec (between lMin and
+  // lMax, scaled by the mean potential, plus a small random term) and a spray/bubble/foam flag
+  // chosen from neighborRatio(i, j, k) against c_s and c_b.
+  inline void op(int i,
+                 int j,
+                 int k,
+                 const FlagGrid &flags,
+                 const MACGrid &v,
+                 BasicParticleSystem &pts_sec,
+                 ParticleDataImpl<Vec3> &v_sec,
+                 ParticleDataImpl<Real> &l_sec,
+                 const Real lMin,
+                 const Real lMax,
+                 const Grid<Real> &potTA,
+                 const Grid<Real> &potWC,
+                 const Grid<Real> &potKE,
+                 const Grid<Real> &neighborRatio,
+                 const Real c_s,
+                 const Real c_b,
+                 const Real k_ta,
+                 const Real k_wc,
+                 const Real dt,
+                 const int itype = FlagGrid::TypeFluid)
+  {
+
+    if (!(flags(i, j, k) & itype))
+      return;
+
+    Real KE = potKE(i, j, k);
+    Real TA = potTA(i, j, k);
+    Real WC = potWC(i, j, k);
+
+    const int n = KE * (k_ta * TA + k_wc * WC) * dt;  // number of secondary particles
+    // A negative count emits nothing either, so skip the setup work for it as well.
+    if (n <= 0)
+      return;
+    // NOTE(review): the stream is re-created with the same constant for every cell; if that
+    // argument is a seed, every cell draws the same random sequence -- confirm this is intended.
+    RandomStream mRand(9832);
+
+    // Draw the three offsets in separate statements: the evaluation order of function (and
+    // constructor) arguments is unspecified, so drawing them inside the Vec3(...) call would let
+    // the compiler decide which random number ends up in which component.
+    const auto offsetX = mRand.getReal();
+    const auto offsetY = mRand.getReal();
+    const auto offsetZ = mRand.getReal();
+    Vec3 xi = Vec3(i + offsetX, j + offsetY, k + offsetZ);  // randomized offset uniform in cell
+    Vec3 vi = v.getInterpolated(xi);
+    Vec3 dir = dt * vi;                               // direction of movement of current particle
+    Vec3 e1 = getNormalized(Vec3(dir.z, 0, -dir.x));  // perpendicular to dir
+    // perpendicular to dir and e1, so e1 and e2 span the reference plane
+    Vec3 e2 = getNormalized(cross(e1, dir));
+
+    // Values that do not change between the particles of this cell are evaluated once here
+    // instead of once (or twice) per emitted particle.
+    const auto axis = getNormalized(vi);            // unit cylinder axis
+    const auto travel = norm(dt * vi);              // cylinder height
+    const Real meanPotential = (KE + TA + WC) / 3;  // drives the particle lifetime
+    const Real ratio = neighborRatio(i, j, k);      // drives the particle type
+
+    for (int di = 0; di < n; di++) {
+      const Real r = Real(0.5) * sqrt(mRand.getReal());     // distance to cylinder axis
+      const Real theta = mRand.getReal() * Real(2) * M_PI;  // azimuth
+      const Real h = mRand.getReal() * travel;              // distance to reference plane
+      // radial offset inside the reference plane, shared by position and velocity
+      const auto radial1 = r * cos(theta) * e1;
+      const auto radial2 = r * sin(theta) * e2;
+      Vec3 xd = xi + radial1 + radial2 + h * axis;
+      if (!flags.is3D())
+        xd.z = 0;
+      pts_sec.add(xd);
+
+      // The particle data entries of the particle just added are the last ones.
+      v_sec[v_sec.size() - 1] = radial1 + radial2 + vi;  // init velocity of new particle
+      l_sec[l_sec.size() - 1] = ((lMax - lMin) * meanPotential) + lMin +
+                                mRand.getReal() * 0.1;  // init lifetime of new particle
+
+      // init type of new particle
+      if (ratio < c_s) {
+        pts_sec[pts_sec.size() - 1].flag = ParticleBase::PSPRAY;
+      }
+      else if (ratio > c_b) {
+        pts_sec[pts_sec.size() - 1].flag = ParticleBase::PBUBBLE;
+      }
+      else {
+        pts_sec[pts_sec.size() - 1].flag = ParticleBase::PFOAM;
+      }
+    }
+  }
+  // Generated accessors and type aliases, one pair per kernel argument.
+  inline const FlagGrid &getArg0()
+  {
+    return flags;
+  }
+  typedef FlagGrid type0;
+  inline const MACGrid &getArg1()
+  {
+    return v;
+  }
+  typedef MACGrid type1;
+  inline BasicParticleSystem &getArg2()
+  {
+    return pts_sec;
+  }
+  typedef BasicParticleSystem type2;
+  inline ParticleDataImpl<Vec3> &getArg3()
+  {
+    return v_sec;
+  }
+  typedef ParticleDataImpl<Vec3> type3;
+  inline ParticleDataImpl<Real> &getArg4()
+  {
+    return l_sec;
+  }
+  typedef ParticleDataImpl<Real> type4;
+  inline const Real &getArg5()
+  {
+    return lMin;
+  }
+  typedef Real type5;
+  inline const Real &getArg6()
+  {
+    return lMax;
+  }
+  typedef Real type6;
+  inline const Grid<Real> &getArg7()
+  {
+    return potTA;
+  }
+  typedef Grid<Real> type7;
+  inline const Grid<Real> &getArg8()
+  {
+    return potWC;
+  }
+  typedef Grid<Real> type8;
+  inline const Grid<Real> &getArg9()
+  {
+    return potKE;
+  }
+  typedef Grid<Real> type9;
+  inline const Grid<Real> &getArg10()
+  {
+    return neighborRatio;
+  }
+  typedef Grid<Real> type10;
+  inline const Real &getArg11()
+  {
+    return c_s;
+  }
+  typedef Real type11;
+  inline const Real &getArg12()
+  {
+    return c_b;
+  }
+  typedef Real type12;
+  inline const Real &getArg13()
+  {
+    return k_ta;
+  }
+  typedef Real type13;
+  inline const Real &getArg14()
+  {
+    return k_wc;
+  }
+  typedef Real type14;
+  inline const Real &getArg15()
+  {
+    return dt;
+  }
+  typedef Real type15;
+  inline const int &getArg16()
+  {
+    return itype;
+  }
+  typedef int type16;
+  // Emits the debug messages announcing the kernel and its range.
+  void runMessage()
+  {
+    debMsg("Executing kernel knFlipSampleSecondaryParticles ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  // Visits all cells with plain nested loops (k outermost, i innermost); op() appends to the
+  // shared particle system in this order.
+  void run()
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    for (int k = minZ; k < maxZ; k++)
+      for (int j = 0; j < _maxY; j++)
+        for (int i = 0; i < _maxX; i++)
+          op(i,
+             j,
+             k,
+             flags,
+             v,
+             pts_sec,
+             v_sec,
+             l_sec,
+             lMin,
+             lMax,
+             potTA,
+             potWC,
+             potKE,
+             neighborRatio,
+             c_s,
+             c_b,
+             k_ta,
+             k_wc,
+             dt,
+             itype);
+  }
+  const FlagGrid &flags;
+  const MACGrid &v;
+  BasicParticleSystem &pts_sec;
+  ParticleDataImpl<Vec3> &v_sec;
+  ParticleDataImpl<Real> &l_sec;
+  const Real lMin;
+  const Real lMax;
+  const Grid<Real> &potTA;
+  const Grid<Real> &potWC;
+  const Grid<Real> &potKE;
+  const Grid<Real> &neighborRatio;
+  const Real c_s;
+  const Real c_b;
+  const Real k_ta;
+  const Real k_wc;
+  const Real dt;
+  const int itype;
+};
+
+// Plugin entry point for sampling secondary particles. Dispatches on `mode`:
+//   "single"   -> knFlipSampleSecondaryParticles (one cylinder per cell)
+//   "multiple" -> knFlipSampleSecondaryParticlesMoreCylinders (several cylinders per cell)
+// All remaining arguments are forwarded unchanged to the selected kernel, which runs as part of
+// its construction. Throws std::invalid_argument for any other mode string.
+void flipSampleSecondaryParticles(const std::string mode,
+                                  const FlagGrid &flags,
+                                  const MACGrid &v,
+                                  BasicParticleSystem &pts_sec,
+                                  ParticleDataImpl<Vec3> &v_sec,
+                                  ParticleDataImpl<Real> &l_sec,
+                                  const Real lMin,
+                                  const Real lMax,
+                                  const Grid<Real> &potTA,
+                                  const Grid<Real> &potWC,
+                                  const Grid<Real> &potKE,
+                                  const Grid<Real> &neighborRatio,
+                                  const Real c_s,
+                                  const Real c_b,
+                                  const Real k_ta,
+                                  const Real k_wc,
+                                  const Real dt,
+                                  const int itype = FlagGrid::TypeFluid)
+{
+  const bool wantSingle = (mode == "single");
+  const bool wantMultiple = !wantSingle && (mode == "multiple");
+
+  // Reject unknown modes before any kernel is started.
+  if (!wantSingle && !wantMultiple) {
+    throw std::invalid_argument("Unknown mode: use \"single\" or \"multiple\" instead!");
+  }
+
+  if (wantSingle) {
+    // Constructing the kernel object executes it.
+    knFlipSampleSecondaryParticles(
+        flags, v, pts_sec, v_sec, l_sec, lMin, lMax, potTA, potWC, potKE, neighborRatio, c_s, c_b,
+        k_ta, k_wc, dt, itype);
+    return;
+  }
+
+  // Only "multiple" is left at this point.
+  knFlipSampleSecondaryParticlesMoreCylinders(
+      flags, v, pts_sec, v_sec, l_sec, lMin, lMax, potTA, potWC, potKE, neighborRatio, c_s, c_b,
+      k_ta, k_wc, dt, itype);
+}
+// Generated Python wrapper for flipSampleSecondaryParticles.
+// Reads the call arguments from _linargs / _kwds (each one by keyword name or by the positional
+// index given next to it), invokes the plugin and returns the object obtained from getPyNone().
+// If a std::exception is thrown anywhere inside, it is reported through pbSetError and 0 is
+// returned instead.
+static PyObject *_W_1(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    // Optional "notiming" argument, defaulting to 0; its negation is handed to the
+    // prepare/finalize hooks below.
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(parent, "flipSampleSecondaryParticles", !noTiming);
+    PyObject *_retval = 0;
+    {
+      // NOTE(review): presumably _lock guards the objects fetched below for the duration of this
+      // scope -- confirm against ArgLocker.
+      ArgLocker _lock;
+      const std::string mode = _args.get<std::string>("mode", 0, &_lock);
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 1, &_lock);
+      const MACGrid &v = *_args.getPtr<MACGrid>("v", 2, &_lock);
+      BasicParticleSystem &pts_sec = *_args.getPtr<BasicParticleSystem>("pts_sec", 3, &_lock);
+      ParticleDataImpl<Vec3> &v_sec = *_args.getPtr<ParticleDataImpl<Vec3>>("v_sec", 4, &_lock);
+      ParticleDataImpl<Real> &l_sec = *_args.getPtr<ParticleDataImpl<Real>>("l_sec", 5, &_lock);
+      const Real lMin = _args.get<Real>("lMin", 6, &_lock);
+      const Real lMax = _args.get<Real>("lMax", 7, &_lock);
+      const Grid<Real> &potTA = *_args.getPtr<Grid<Real>>("potTA", 8, &_lock);
+      const Grid<Real> &potWC = *_args.getPtr<Grid<Real>>("potWC", 9, &_lock);
+      const Grid<Real> &potKE = *_args.getPtr<Grid<Real>>("potKE", 10, &_lock);
+      const Grid<Real> &neighborRatio = *_args.getPtr<Grid<Real>>("neighborRatio", 11, &_lock);
+      const Real c_s = _args.get<Real>("c_s", 12, &_lock);
+      const Real c_b = _args.get<Real>("c_b", 13, &_lock);
+      const Real k_ta = _args.get<Real>("k_ta", 14, &_lock);
+      const Real k_wc = _args.get<Real>("k_wc", 15, &_lock);
+      const Real dt = _args.get<Real>("dt", 16, &_lock);
+      // itype is the only optional plugin argument; it defaults to FlagGrid::TypeFluid.
+      const int itype = _args.getOpt<int>("itype", 17, FlagGrid::TypeFluid, &_lock);
+      // The plugin returns void, so the Python-side result is None.
+      _retval = getPyNone();
+      flipSampleSecondaryParticles(mode,
+                                   flags,
+                                   v,
+                                   pts_sec,
+                                   v_sec,
+                                   l_sec,
+                                   lMin,
+                                   lMax,
+                                   potTA,
+                                   potWC,
+                                   potKE,
+                                   neighborRatio,
+                                   c_s,
+                                   c_b,
+                                   k_ta,
+                                   k_wc,
+                                   dt,
+                                   itype);
+      // NOTE(review): presumably verifies that every supplied argument was consumed -- confirm
+      // against PbArgs::check.
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "flipSampleSecondaryParticles", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    // Report the failure through pbSetError and signal it to the caller with a null result.
+    pbSetError("flipSampleSecondaryParticles", e.what());
+    return 0;
+  }
+}
+// File-local registration object associating the name "flipSampleSecondaryParticles" with the
+// wrapper function _W_1.
+// NOTE(review): presumably the Pb::Register constructor records this entry in the Python plugin
+// table during static initialization -- confirm against the Pb::Register definition.
+static const Pb::Register _RP_flipSampleSecondaryParticles("",
+                                                           "flipSampleSecondaryParticles",
+                                                           _W_1);
+extern "C" {
+// C-linkage hook whose body only references the registration object above.
+// NOTE(review): presumably exists so the object (and this translation unit) is not discarded by
+// the linker -- confirm against the KEEP_UNUSED macro.
+void PbRegister_flipSampleSecondaryParticles()
+{
+  KEEP_UNUSED(_RP_flipSampleSecondaryParticles);
+}
+}
+
+// Evaluates the cubic spline kernel with radius h at distance l in dim dimensions.
+//   h   : kernel radius; the kernel is zero for l / h >= 2. h is used as a divisor and is not
+//         checked here.
+//   l   : distance at which the kernel is evaluated.
+//   dim : number of dimensions, selects the normalization constant; must be 1, 2 or 3.
+// Returns the kernel value, or 0 outside the support.
+// Throws std::invalid_argument if a value inside the support is requested for a dim other than
+// 1, 2 or 3 (previously this read past the end of the constant table).
+Real cubicSpline(const Real h, const Real l, const int dim)
+{
+  const Real q = l / h;
+  // Outside the support nothing depends on dim. Written as a negated comparison so that a NaN q
+  // also yields 0.
+  if (!(q < 2e0))
+    return 0;
+
+  // Normalization constant for the requested dimension.
+  const Real h2 = square(h), h3 = h2 * h;
+  Real sigma;
+  switch (dim) {
+    case 1:
+      sigma = Real(2e0 / (3e0 * h));
+      break;
+    case 2:
+      sigma = Real(10e0 / (7e0 * M_PI * h2));
+      break;
+    case 3:
+      sigma = Real(1e0 / (M_PI * h3));
+      break;
+    default:
+      throw std::invalid_argument("cubicSpline: dim must be 1, 2 or 3");
+  }
+
+  if (q < 1e0)
+    return sigma * (1e0 - 1.5 * square(q) + 0.75 * cubed(q));
+  return sigma * (0.25 * cubed(2e0 - q));
+}
+
+// Updates position &pts_sec.pos, velocity &v_sec and lifetime &l_sec of the secondary particles.
+// Each particle is (re)classified as spray, bubble or foam from the neighbor ratio of the cell it
+// is in and moved according to that type; the grid velocity is sampled with v.getInterpolated().
+
+struct knFlipUpdateSecondaryParticlesLinear : public KernelBase {
+  // Stores all arguments and immediately executes the kernel over all pts_sec.size() particles.
+  knFlipUpdateSecondaryParticlesLinear(BasicParticleSystem &pts_sec,
+                                       ParticleDataImpl<Vec3> &v_sec,
+                                       ParticleDataImpl<Real> &l_sec,
+                                       const ParticleDataImpl<Vec3> &f_sec,
+                                       const FlagGrid &flags,
+                                       const MACGrid &v,
+                                       const Grid<Real> &neighborRatio,
+                                       const Vec3 gravity,
+                                       const Real k_b,
+                                       const Real k_d,
+                                       const Real c_s,
+                                       const Real c_b,
+                                       const Real dt,
+                                       const int exclude,
+                                       const int antitunneling)
+      : KernelBase(pts_sec.size()),
+        pts_sec(pts_sec),
+        v_sec(v_sec),
+        l_sec(l_sec),
+        f_sec(f_sec),
+        flags(flags),
+        v(v),
+        neighborRatio(neighborRatio),
+        gravity(gravity),
+        k_b(k_b),
+        k_d(k_d),
+        c_s(c_s),
+        c_b(c_b),
+        dt(dt),
+        exclude(exclude),
+        antitunneling(antitunneling)
+  {
+    runMessage();
+    run();
+  }
+  // Anti-tunneling test for small obstacles: probes the path start + dt * velocity at the
+  // fractions ct / antitunneling for ct = 1 .. antitunneling - 1. Returns true as soon as a probe
+  // lies outside the grid or in an obstacle cell; with antitunneling <= 1 nothing is probed and
+  // the result is false.
+  inline bool pathBlocked(const Vec3 &start, const Vec3 &velocity) const
+  {
+    for (int ct = 1; ct < antitunneling; ct++) {
+      const Vec3i probe = toVec3i(start + ct * (1 / Real(antitunneling)) * dt * velocity);
+      if (!flags.isInBounds(probe) || flags(probe) & FlagGrid::TypeObstacle)
+        return true;
+    }
+    return false;
+  }
+  // Per-particle body of the kernel. Inactive particles and particles whose flag matches
+  // `exclude` are left untouched; particles outside the grid are killed. Otherwise the particle
+  // is advanced according to its type, killed if its path is blocked (see pathBlocked), and
+  // finally killed once its lifetime has run out.
+  // NOTE(review): kill() is reached from inside tbb::parallel_for (see run()); whether that is
+  // safe depends on BasicParticleSystem::kill -- confirm.
+  inline void op(IndexInt idx,
+                 BasicParticleSystem &pts_sec,
+                 ParticleDataImpl<Vec3> &v_sec,
+                 ParticleDataImpl<Real> &l_sec,
+                 const ParticleDataImpl<Vec3> &f_sec,
+                 const FlagGrid &flags,
+                 const MACGrid &v,
+                 const Grid<Real> &neighborRatio,
+                 const Vec3 gravity,
+                 const Real k_b,
+                 const Real k_d,
+                 const Real c_s,
+                 const Real c_b,
+                 const Real dt,
+                 const int exclude,
+                 const int antitunneling) const
+  {
+
+    if (!pts_sec.isActive(idx) || pts_sec[idx].flag & exclude)
+      return;
+    if (!flags.isInBounds(pts_sec[idx].pos)) {
+      pts_sec.kill(idx);
+      return;
+    }
+
+    // The neighbor ratio of the cell containing the particle decides its type.
+    const Vec3i gridpos = toVec3i(pts_sec[idx].pos);
+    const Real ratio = neighborRatio(gridpos);
+
+    // spray particle
+    if (ratio < c_s) {
+      pts_sec[idx].flag |= ParticleBase::PSPRAY;
+      pts_sec[idx].flag &= ~(ParticleBase::PBUBBLE | ParticleBase::PFOAM);
+      v_sec[idx] += dt *
+                    ((f_sec[idx] / 1) + gravity);  // TODO: if forces are added (e.g. fluid
+                                                   // guiding), add parameter for mass instead of 1
+
+      if (pathBlocked(pts_sec[idx].pos, v_sec[idx])) {
+        pts_sec.kill(idx);
+        return;
+      }
+      pts_sec[idx].pos += dt * v_sec[idx];
+    }
+
+    // air bubble particle
+    else if (ratio > c_b) {
+      pts_sec[idx].flag |= ParticleBase::PBUBBLE;
+      pts_sec[idx].flag &= ~(ParticleBase::PSPRAY | ParticleBase::PFOAM);
+
+      // k_b scales the term opposing gravity, k_d the term pulling the particle velocity towards
+      // the local fluid velocity.
+      const Vec3 vj = (v.getInterpolated(pts_sec[idx].pos) - v_sec[idx]) / dt;
+      v_sec[idx] += dt * (k_b * -gravity + k_d * vj);
+
+      if (pathBlocked(pts_sec[idx].pos, v_sec[idx])) {
+        pts_sec.kill(idx);
+        return;
+      }
+      pts_sec[idx].pos += dt * v_sec[idx];
+    }
+
+    // foam particle
+    else {
+      pts_sec[idx].flag |= ParticleBase::PFOAM;
+      pts_sec[idx].flag &= ~(ParticleBase::PBUBBLE | ParticleBase::PSPRAY);
+
+      // Foam is moved purely with the fluid velocity at its position; v_sec is not touched.
+      const Vec3 vj = v.getInterpolated(pts_sec[idx].pos);
+      if (pathBlocked(pts_sec[idx].pos, vj)) {
+        pts_sec.kill(idx);
+        return;
+      }
+      // The position has not changed since vj was sampled, so it is reused for the step.
+      pts_sec[idx].pos += dt * vj;
+    }
+
+    // lifetime
+    l_sec[idx] -= dt;
+    if (l_sec[idx] <= Real(0)) {
+      pts_sec.kill(idx);
+    }
+  }
+  // Generated accessors and type aliases, one pair per kernel argument.
+  inline BasicParticleSystem &getArg0()
+  {
+    return pts_sec;
+  }
+  typedef BasicParticleSystem type0;
+  inline ParticleDataImpl<Vec3> &getArg1()
+  {
+    return v_sec;
+  }
+  typedef ParticleDataImpl<Vec3> type1;
+  inline ParticleDataImpl<Real> &getArg2()
+  {
+    return l_sec;
+  }
+  typedef ParticleDataImpl<Real> type2;
+  inline const ParticleDataImpl<Vec3> &getArg3()
+  {
+    return f_sec;
+  }
+  typedef ParticleDataImpl<Vec3> type3;
+  inline const FlagGrid &getArg4()
+  {
+    return flags;
+  }
+  typedef FlagGrid type4;
+  inline const MACGrid &getArg5()
+  {
+    return v;
+  }
+  typedef MACGrid type5;
+  inline const Grid<Real> &getArg6()
+  {
+    return neighborRatio;
+  }
+  typedef Grid<Real> type6;
+  inline const Vec3 &getArg7()
+  {
+    return gravity;
+  }
+  typedef Vec3 type7;
+  inline const Real &getArg8()
+  {
+    return k_b;
+  }
+  typedef Real type8;
+  inline const Real &getArg9()
+  {
+    return k_d;
+  }
+  typedef Real type9;
+  inline const Real &getArg10()
+  {
+    return c_s;
+  }
+  typedef Real type10;
+  inline const Real &getArg11()
+  {
+    return c_b;
+  }
+  typedef Real type11;
+  inline const Real &getArg12()
+  {
+    return dt;
+  }
+  typedef Real type12;
+  inline const int &getArg13()
+  {
+    return exclude;
+  }
+  typedef int type13;
+  inline const int &getArg14()
+  {
+    return antitunneling;
+  }
+  typedef int type14;
+  // Emits the debug messages announcing the kernel and its range.
+  void runMessage()
+  {
+    debMsg("Executing kernel knFlipUpdateSecondaryParticlesLinear ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  };
+  // Body invoked by tbb::parallel_for for one sub-range of particle indices.
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
+      op(idx,
+         pts_sec,
+         v_sec,
+         l_sec,
+         f_sec,
+         flags,
+         v,
+         neighborRatio,
+         gravity,
+         k_b,
+         k_d,
+         c_s,
+         c_b,
+         dt,
+         exclude,
+         antitunneling);
+  }
+  // Runs the kernel over the index range [0, size) with tbb::parallel_for.
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+  BasicParticleSystem &pts_sec;
+  ParticleDataImpl<Vec3> &v_sec;
+  ParticleDataImpl<Real> &l_sec;
+  const ParticleDataImpl<Vec3> &f_sec;
+  const FlagGrid &flags;
+  const MACGrid &v;
+  const Grid<Real> &neighborRatio;
+  const Vec3 gravity;
+  const Real k_b;
+  const Real k_d;
+  const Real c_s;
+  const Real c_b;
+  const Real dt;
+  const int exclude;
+  const int antitunneling;
+};
+// updates position &pts_sec.pos and velocity &v_sec of secondary particles according to the
+// particle type determined by the neighbor ratio with cubic spline interpolation
+
+struct knFlipUpdateSecondaryParticlesCubic : public KernelBase {
+  knFlipUpdateSecondaryParticlesCubic(BasicParticleSystem &pts_sec,
+                                      ParticleDataImpl<Vec3> &v_sec,
+                                      ParticleDataImpl<Real> &l_sec,
+                                      const ParticleDataImpl<Vec3> &f_sec,
+                                      const FlagGrid &flags,
+                                      const MACGrid &v,
+                                      const Grid<Real> &neighborRatio,
+                                      const int radius,
+                                      const Vec3 gravity,
+                                      const Real k_b,
+                                      const Real k_d,
+                                      const Real c_s,
+                                      const Real c_b,
+                                      const Real dt,
+                                      const int exclude,
+                                      const int antitunneling,
+                                      const int itype)
+      : KernelBase(pts_sec.size()),
+        pts_sec(pts_sec),
+        v_sec(v_sec),
+        l_sec(l_sec),
+        f_sec(f_sec),
+        flags(flags),
+        v(v),
+        neighborRatio(neighborRatio),
+        radius(radius),
+        gravity(gravity),
+        k_b(k_b),
+        k_d(k_d),
+        c_s(c_s),
+        c_b(c_b),
+        dt(dt),
+        exclude(exclude),
+        antitunneling(antitunneling),
+        itype(itype)
+  {
+    runMessage();
+    run();
+  }
+  //! Cubic-spline weighted average of the cell-centered grid velocity over the cells
+  //! [i-radius, i+radius] x [j-radius, j+radius] x [k-radius, k+radius]. The center cell,
+  //! cells outside the grid and cells whose flag has no bit of itype set are skipped.
+  //! Writes the average to avgVel and returns true; returns false and leaves avgVel
+  //! untouched when the accumulated weight is zero (e.g. no cell contributed).
+  inline bool weightedNeighborVelocity(const Vec3 &xi,
+                                       const int i,
+                                       const int j,
+                                       const int k,
+                                       const FlagGrid &flags,
+                                       const MACGrid &v,
+                                       const int radius,
+                                       const int itype,
+                                       Vec3 &avgVel) const
+  {
+    // dimensionality and support scaling do not depend on the neighbor cell
+    const int dim = flags.is3D() ? 3 : 2;
+    const Real dist = flags.is3D() ? 1.732 : 1.414;
+    Vec3 sumNumerator = Vec3(0, 0, 0);
+    Real sumDenominator = 0;
+    for (IndexInt x = i - radius; x <= i + radius; x++) {
+      for (IndexInt y = j - radius; y <= j + radius; y++) {
+        for (IndexInt z = k - radius; z <= k + radius; z++) {
+          const Vec3i xj = Vec3i(x, y, z);
+          if ((x == i && y == j && z == k) || !flags.isInBounds(xj))
+            continue;
+          if (!(flags(xj) & itype))
+            continue;
+          const Real len_xij = norm(xi - Vec3(x, y, z));
+          const Real weight = cubicSpline(radius * dist, len_xij, dim);
+          sumNumerator += v.getCentered(xj) * weight;
+          sumDenominator += weight;
+        }
+      }
+    }
+    // a zero weight sum would turn the average into NaN/inf
+    if (sumDenominator == Real(0))
+      return false;
+    avgVel = sumNumerator / sumDenominator;
+    return true;
+  }
+  //! Updates one secondary particle: classifies it as spray, bubble or foam from
+  //! neighborRatio (thresholds c_s and c_b), updates its velocity accordingly, moves it by
+  //! dt and decrements its lifetime.
+  //! Particles are killed when they are out of bounds, when one of the antitunneling
+  //! sub-steps lands outside the grid or in an obstacle cell, or when l_sec drops to <= 0.
+  //! Inactive particles and particles whose flag matches exclude are left untouched.
+  //! If no neighboring cell of itype contributes to the velocity average, bubbles get no
+  //! drag term and foam particles are not advected (instead of dividing by zero).
+  inline void op(IndexInt idx,
+                 BasicParticleSystem &pts_sec,
+                 ParticleDataImpl<Vec3> &v_sec,
+                 ParticleDataImpl<Real> &l_sec,
+                 const ParticleDataImpl<Vec3> &f_sec,
+                 const FlagGrid &flags,
+                 const MACGrid &v,
+                 const Grid<Real> &neighborRatio,
+                 const int radius,
+                 const Vec3 gravity,
+                 const Real k_b,
+                 const Real k_d,
+                 const Real c_s,
+                 const Real c_b,
+                 const Real dt,
+                 const int exclude,
+                 const int antitunneling,
+                 const int itype) const
+  {
+
+    if (!pts_sec.isActive(idx) || pts_sec[idx].flag & exclude)
+      return;
+    if (!flags.isInBounds(pts_sec[idx].pos)) {
+      pts_sec.kill(idx);
+      return;
+    }
+
+    const Vec3i gridpos = toVec3i(pts_sec[idx].pos);
+    // velocity the particle is moved with in this step (set by the branch below)
+    Vec3 advectVel = Vec3(0, 0, 0);
+
+    // spray particle
+    if (neighborRatio(gridpos) < c_s) {
+      pts_sec[idx].flag |= ParticleBase::PSPRAY;
+      pts_sec[idx].flag &= ~(ParticleBase::PBUBBLE | ParticleBase::PFOAM);
+      v_sec[idx] += dt *
+                    ((f_sec[idx] / 1) + gravity);  // TODO: if forces are added (e.g. fluid
+                                                   // guiding), add parameter for mass instead of 1
+      advectVel = v_sec[idx];
+    }
+
+    // air bubble particle
+    else if (neighborRatio(gridpos) > c_b) {
+      pts_sec[idx].flag |= ParticleBase::PBUBBLE;
+      pts_sec[idx].flag &= ~(ParticleBase::PSPRAY | ParticleBase::PFOAM);
+      Vec3 fluidVel = Vec3(0, 0, 0);
+      Vec3 drag = Vec3(0, 0, 0);
+      // drag towards the averaged fluid velocity; skipped when there is nothing to average
+      if (weightedNeighborVelocity(pts_sec[idx].pos,
+                                   gridpos.x,
+                                   gridpos.y,
+                                   gridpos.z,
+                                   flags,
+                                   v,
+                                   radius,
+                                   itype,
+                                   fluidVel)) {
+        drag = (fluidVel - v_sec[idx]) / dt;
+      }
+      v_sec[idx] += dt * (k_b * -gravity + k_d * drag);
+      advectVel = v_sec[idx];
+    }
+
+    // foam particle
+    else {
+      pts_sec[idx].flag |= ParticleBase::PFOAM;
+      pts_sec[idx].flag &= ~(ParticleBase::PBUBBLE | ParticleBase::PSPRAY);
+      // foam moves with the averaged fluid velocity; advectVel stays zero without neighbors
+      weightedNeighborVelocity(pts_sec[idx].pos,
+                               gridpos.x,
+                               gridpos.y,
+                               gridpos.z,
+                               flags,
+                               v,
+                               radius,
+                               itype,
+                               advectVel);
+    }
+
+    // anti tunneling for small obstacles: probe intermediate positions along the step
+    for (int ct = 1; ct < antitunneling; ct++) {
+      const Vec3i tempPos = toVec3i(pts_sec[idx].pos +
+                                    ct * (1 / Real(antitunneling)) * dt * advectVel);
+      if (!flags.isInBounds(tempPos) || flags(tempPos) & FlagGrid::TypeObstacle) {
+        pts_sec.kill(idx);
+        return;
+      }
+    }
+    pts_sec[idx].pos += dt * advectVel;
+
+    // lifetime
+    l_sec[idx] -= dt;
+    if (l_sec[idx] <= Real(0)) {
+      pts_sec.kill(idx);
+    }
+  }
+  // Positional accessors for the kernel arguments: getArgN() returns the N-th stored argument
+  // and typeN names its type, in the same order as the constructor parameters.
+  inline BasicParticleSystem &getArg0()
+  {
+    return pts_sec;
+  }
+  typedef BasicParticleSystem type0;
+  inline ParticleDataImpl<Vec3> &getArg1()
+  {
+    return v_sec;
+  }
+  typedef ParticleDataImpl<Vec3> type1;
+  inline ParticleDataImpl<Real> &getArg2()
+  {
+    return l_sec;
+  }
+  typedef ParticleDataImpl<Real> type2;
+  inline const ParticleDataImpl<Vec3> &getArg3()
+  {
+    return f_sec;
+  }
+  typedef ParticleDataImpl<Vec3> type3;
+  inline const FlagGrid &getArg4()
+  {
+    return flags;
+  }
+  typedef FlagGrid type4;
+  inline const MACGrid &getArg5()
+  {
+    return v;
+  }
+  typedef MACGrid type5;
+  inline const Grid<Real> &getArg6()
+  {
+    return neighborRatio;
+  }
+  typedef Grid<Real> type6;
+  inline const int &getArg7()
+  {
+    return radius;
+  }
+  typedef int type7;
+  inline const Vec3 &getArg8()
+  {
+    return gravity;
+  }
+  typedef Vec3 type8;
+  inline const Real &getArg9()
+  {
+    return k_b;
+  }
+  typedef Real type9;
+  inline const Real &getArg10()
+  {
+    return k_d;
+  }
+  typedef Real type10;
+  inline const Real &getArg11()
+  {
+    return c_s;
+  }
+  typedef Real type11;
+  inline const Real &getArg12()
+  {
+    return c_b;
+  }
+  typedef Real type12;
+  inline const Real &getArg13()
+  {
+    return dt;
+  }
+  typedef Real type13;
+  inline const int &getArg14()
+  {
+    return exclude;
+  }
+  typedef int type14;
+  inline const int &getArg15()
+  {
+    return antitunneling;
+  }
+  typedef int type15;
+  inline const int &getArg16()
+  {
+    return itype;
+  }
+  typedef int type16;
+  // Reports the kernel name and the kernel's size through debMsg; called from the
+  // constructor right before run().
+  void runMessage()
+  {
+    debMsg("Executing kernel knFlipUpdateSecondaryParticlesCubic ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  };
+  //! Body invoked by tbb::parallel_for: applies op() to every index of the given sub-range,
+  //! forwarding the stored kernel arguments.
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const IndexInt last = (IndexInt)__r.end();
+    for (IndexInt idx = __r.begin(); idx != last; ++idx) {
+      op(idx,
+         pts_sec,
+         v_sec,
+         l_sec,
+         f_sec,
+         flags,
+         v,
+         neighborRatio,
+         radius,
+         gravity,
+         k_b,
+         k_d,
+         c_s,
+         c_b,
+         dt,
+         exclude,
+         antitunneling,
+         itype);
+    }
+  }
+  //! Executes the kernel over the index range [0, size) with tbb::parallel_for.
+  void run()
+  {
+    const tbb::blocked_range<IndexInt> fullRange(0, size);
+    tbb::parallel_for(fullRange, *this);
+  }
+  BasicParticleSystem &pts_sec;
+  ParticleDataImpl<Vec3> &v_sec;
+  ParticleDataImpl<Real> &l_sec;
+  const ParticleDataImpl<Vec3> &f_sec;
+  const FlagGrid &flags;
+  const MACGrid &v;
+  const Grid<Real> &neighborRatio;
+  const int radius;
+  const Vec3 gravity;
+  const Real k_b;
+  const Real k_d;
+  const Real c_s;
+  const Real c_b;
+  const Real dt;
+  const int exclude;
+  const int antitunneling;
+  const int itype;
+};
+
+//! Updates the secondary particles in pts_sec for one step of length dt and compresses the
+//! particle system afterwards.
+//! mode selects the kernel: "linear" runs knFlipUpdateSecondaryParticlesLinear, "cubic" runs
+//! knFlipUpdateSecondaryParticlesCubic (only the cubic kernel receives radius and itype).
+//! The gravity handed to the kernels is gravity divided by flags.getDx().
+//! Throws std::invalid_argument for any other mode.
+void flipUpdateSecondaryParticles(const std::string mode,
+                                  BasicParticleSystem &pts_sec,
+                                  ParticleDataImpl<Vec3> &v_sec,
+                                  ParticleDataImpl<Real> &l_sec,
+                                  const ParticleDataImpl<Vec3> &f_sec,
+                                  FlagGrid &flags,
+                                  const MACGrid &v,
+                                  const Grid<Real> &neighborRatio,
+                                  const int radius,
+                                  const Vec3 gravity,
+                                  const Real k_b,
+                                  const Real k_d,
+                                  const Real c_s,
+                                  const Real c_b,
+                                  const Real dt,
+                                  const int exclude = ParticleBase::PTRACER,
+                                  const int antitunneling = 0,
+                                  const int itype = FlagGrid::TypeFluid)
+{
+  const bool useLinear = (mode == "linear");
+  const bool useCubic = !useLinear && (mode == "cubic");
+
+  // reject unsupported modes before any kernel runs
+  if (!useLinear && !useCubic) {
+    throw std::invalid_argument("Unknown mode: use \"linear\" or \"cubic\" instead!");
+  }
+
+  Vec3 scaledGravity = gravity / flags.getDx();
+
+  if (useLinear) {
+    knFlipUpdateSecondaryParticlesLinear(pts_sec,
+                                         v_sec,
+                                         l_sec,
+                                         f_sec,
+                                         flags,
+                                         v,
+                                         neighborRatio,
+                                         scaledGravity,
+                                         k_b,
+                                         k_d,
+                                         c_s,
+                                         c_b,
+                                         dt,
+                                         exclude,
+                                         antitunneling);
+  }
+  else {
+    knFlipUpdateSecondaryParticlesCubic(pts_sec,
+                                        v_sec,
+                                        l_sec,
+                                        f_sec,
+                                        flags,
+                                        v,
+                                        neighborRatio,
+                                        radius,
+                                        scaledGravity,
+                                        k_b,
+                                        k_d,
+                                        c_s,
+                                        c_b,
+                                        dt,
+                                        exclude,
+                                        antitunneling,
+                                        itype);
+  }
+
+  // the kernels only mark particles as killed; compress after the update
+  pts_sec.doCompress();
+}
+//! Python wrapper for flipUpdateSecondaryParticles: reads the arguments from
+//! _linargs/_kwds (by name or position), calls the plugin between
+//! pbPreparePlugin/pbFinalizePlugin and returns Python None. A std::exception is reported
+//! through pbSetError and a null pointer is returned.
+static PyObject *_W_2(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs args(_linargs, _kwds);
+    FluidSolver *solver = args.obtainParent();
+    const bool timed = !args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(solver, "flipUpdateSecondaryParticles", timed);
+    PyObject *result = nullptr;
+    {
+      ArgLocker lock;
+      // required arguments, positions 0-14
+      const std::string mode = args.get<std::string>("mode", 0, &lock);
+      BasicParticleSystem &pts_sec = *args.getPtr<BasicParticleSystem>("pts_sec", 1, &lock);
+      ParticleDataImpl<Vec3> &v_sec = *args.getPtr<ParticleDataImpl<Vec3>>("v_sec", 2, &lock);
+      ParticleDataImpl<Real> &l_sec = *args.getPtr<ParticleDataImpl<Real>>("l_sec", 3, &lock);
+      const ParticleDataImpl<Vec3> &f_sec = *args.getPtr<ParticleDataImpl<Vec3>>(
+          "f_sec", 4, &lock);
+      FlagGrid &flags = *args.getPtr<FlagGrid>("flags", 5, &lock);
+      const MACGrid &v = *args.getPtr<MACGrid>("v", 6, &lock);
+      const Grid<Real> &neighborRatio = *args.getPtr<Grid<Real>>("neighborRatio", 7, &lock);
+      const int radius = args.get<int>("radius", 8, &lock);
+      const Vec3 gravity = args.get<Vec3>("gravity", 9, &lock);
+      const Real k_b = args.get<Real>("k_b", 10, &lock);
+      const Real k_d = args.get<Real>("k_d", 11, &lock);
+      const Real c_s = args.get<Real>("c_s", 12, &lock);
+      const Real c_b = args.get<Real>("c_b", 13, &lock);
+      const Real dt = args.get<Real>("dt", 14, &lock);
+      // optional arguments with the same defaults as the C++ signature
+      const int exclude = args.getOpt<int>("exclude", 15, ParticleBase::PTRACER, &lock);
+      const int antitunneling = args.getOpt<int>("antitunneling", 16, 0, &lock);
+      const int itype = args.getOpt<int>("itype", 17, FlagGrid::TypeFluid, &lock);
+      result = getPyNone();
+      flipUpdateSecondaryParticles(mode,
+                                   pts_sec,
+                                   v_sec,
+                                   l_sec,
+                                   f_sec,
+                                   flags,
+                                   v,
+                                   neighborRatio,
+                                   radius,
+                                   gravity,
+                                   k_b,
+                                   k_d,
+                                   c_s,
+                                   c_b,
+                                   dt,
+                                   exclude,
+                                   antitunneling,
+                                   itype);
+      args.check();
+    }
+    pbFinalizePlugin(solver, "flipUpdateSecondaryParticles", timed);
+    return result;
+  }
+  catch (const std::exception &err) {
+    pbSetError("flipUpdateSecondaryParticles", err.what());
+    return nullptr;
+  }
+}
+// Static Pb::Register object associating the name "flipUpdateSecondaryParticles" with the
+// wrapper _W_2.
+static const Pb::Register _RP_flipUpdateSecondaryParticles("",
+                                                           "flipUpdateSecondaryParticles",
+                                                           _W_2);
+extern "C" {
+// C-linkage hook that only references the registration object via KEEP_UNUSED.
+// NOTE(review): presumably this keeps the object from being dropped at link time - confirm.
+void PbRegister_flipUpdateSecondaryParticles()
+{
+  KEEP_UNUSED(_RP_flipUpdateSecondaryParticles);
+}
+}
+
+// removes particles in &pts that have left the grid bounds or that are inside boundary cells
+// (cells that are marked as obstacle/outflow in &flags)
+
+//! Kernel over all particles of pts: kills every active particle whose position lies outside
+//! the grid or inside a cell that has the obstacle or outflow bit set in flags.
+struct knFlipDeleteParticlesInObstacle : public KernelBase {
+  knFlipDeleteParticlesInObstacle(BasicParticleSystem &pts, const FlagGrid &flags)
+      : KernelBase(pts.size()), pts(pts), flags(flags)
+  {
+    runMessage();
+    run();
+  }
+  //! Processes particle idx; inactive particles are skipped.
+  inline void op(IndexInt idx, BasicParticleSystem &pts, const FlagGrid &flags) const
+  {
+
+    if (!pts.isActive(idx))
+      return;
+
+    const Vec3i xidx = toVec3i(pts[idx].pos);
+    // remove particles that completely left the bounds
+    if (!flags.isInBounds(xidx)) {
+      pts.kill(idx);
+      return;
+    }
+    // remove particles that penetrate obstacles or outflow cells; the cell flag is a bit
+    // mask, so test the bits instead of comparing for equality (cells that carry
+    // additional bits next to obstacle/outflow must match as well)
+    if (flags(xidx) & (FlagGrid::TypeObstacle | FlagGrid::TypeOutflow)) {
+      pts.kill(idx);
+    }
+  }
+  inline BasicParticleSystem &getArg0()
+  {
+    return pts;
+  }
+  typedef BasicParticleSystem type0;
+  inline const FlagGrid &getArg1()
+  {
+    return flags;
+  }
+  typedef FlagGrid type1;
+  //! Reports the kernel name and size through debMsg.
+  void runMessage()
+  {
+    debMsg("Executing kernel knFlipDeleteParticlesInObstacle ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  };
+  //! tbb::parallel_for body: applies op() to each index of the sub-range.
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
+      op(idx, pts, flags);
+  }
+  //! Runs the kernel over [0, size) in parallel.
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+  BasicParticleSystem &pts;
+  const FlagGrid &flags;
+};
+
+//! Runs knFlipDeleteParticlesInObstacle on pts and then compresses the particle system.
+void flipDeleteParticlesInObstacle(BasicParticleSystem &pts, const FlagGrid &flags)
+{
+  // constructing the kernel object executes it (its constructor calls run())
+  knFlipDeleteParticlesInObstacle(pts, flags);
+
+  pts.doCompress();
+}
+//! Python wrapper for flipDeleteParticlesInObstacle: reads "pts" and "flags", calls the
+//! plugin between pbPreparePlugin/pbFinalizePlugin and returns Python None. A
+//! std::exception is reported through pbSetError and a null pointer is returned.
+static PyObject *_W_3(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs args(_linargs, _kwds);
+    FluidSolver *solver = args.obtainParent();
+    const bool timed = !args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(solver, "flipDeleteParticlesInObstacle", timed);
+    PyObject *result = nullptr;
+    {
+      ArgLocker lock;
+      BasicParticleSystem &pts = *args.getPtr<BasicParticleSystem>("pts", 0, &lock);
+      const FlagGrid &flags = *args.getPtr<FlagGrid>("flags", 1, &lock);
+      result = getPyNone();
+      flipDeleteParticlesInObstacle(pts, flags);
+      args.check();
+    }
+    pbFinalizePlugin(solver, "flipDeleteParticlesInObstacle", timed);
+    return result;
+  }
+  catch (const std::exception &err) {
+    pbSetError("flipDeleteParticlesInObstacle", err.what());
+    return nullptr;
+  }
+}
+// Static Pb::Register object associating the name "flipDeleteParticlesInObstacle" with the
+// wrapper _W_3.
+static const Pb::Register _RP_flipDeleteParticlesInObstacle("",
+                                                            "flipDeleteParticlesInObstacle",
+                                                            _W_3);
+extern "C" {
+// C-linkage hook that only references the registration object via KEEP_UNUSED.
+// NOTE(review): presumably this keeps the object from being dropped at link time - confirm.
+void PbRegister_flipDeleteParticlesInObstacle()
+{
+  KEEP_UNUSED(_RP_flipDeleteParticlesInObstacle);
+}
+}
+
+// helper method to debug statistical data from grid
+
+//! Prints statistics of grid over all cells (boundary width 1 excluded) whose flag has a bit
+//! of itype set: number of such cells, number of cells with a value > 0, sum, average,
+//! average over the cells with a value > 0, and maximum (starting from 0).
+//! name is only used to label the output. The message is emitted via debMsg with level 1.
+void debugGridInfo(const FlagGrid &flags,
+                   Grid<Real> &grid,
+                   std::string name,
+                   const int itype = FlagGrid::TypeFluid)
+{
+  FluidSolver *s = flags.getParent();
+  int countFluid = 0;
+  int countLargerZero = 0;
+  Real maxValue = 0;
+  Real sum = 0;
+  Real sumLargerZero = 0;
+  FOR_IJK_BND(grid, 1)
+  {
+    if (!(flags(i, j, k) & itype))
+      continue;
+    const Real value = grid(i, j, k);
+    countFluid++;
+    sum += value;
+    if (value > 0) {
+      countLargerZero++;
+      // keep a separate sum so that negative cells do not distort avgLargerZero
+      sumLargerZero += value;
+    }
+    if (value > maxValue)
+      maxValue = value;
+  }
+  // the divisors are clamped to 1 so that an empty selection yields 0 instead of dividing by 0
+  const Real avg = sum / std::max(Real(countFluid), Real(1));
+  const Real avgLargerZero = sumLargerZero / std::max(Real(countLargerZero), Real(1));
+
+  debMsg("Step: " << s->mFrame << " - Grid " << name << "\n\tcountFluid \t\t" << countFluid
+                  << "\n\tcountLargerZero \t" << countLargerZero << "\n\tsum \t\t\t" << sum
+                  << "\n\tavg \t\t\t" << avg << "\n\tavgLargerZero \t\t" << avgLargerZero
+                  << "\n\tmax \t\t\t" << maxValue,
+         1);
+}
+//! Python wrapper for debugGridInfo: reads "flags", "grid", "name" and the optional "itype",
+//! calls the plugin between pbPreparePlugin/pbFinalizePlugin and returns Python None. A
+//! std::exception is reported through pbSetError and a null pointer is returned.
+static PyObject *_W_4(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs args(_linargs, _kwds);
+    FluidSolver *solver = args.obtainParent();
+    const bool timed = !args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(solver, "debugGridInfo", timed);
+    PyObject *result = nullptr;
+    {
+      ArgLocker lock;
+      const FlagGrid &flags = *args.getPtr<FlagGrid>("flags", 0, &lock);
+      Grid<Real> &grid = *args.getPtr<Grid<Real>>("grid", 1, &lock);
+      std::string name = args.get<std::string>("name", 2, &lock);
+      const int itype = args.getOpt<int>("itype", 3, FlagGrid::TypeFluid, &lock);
+      result = getPyNone();
+      debugGridInfo(flags, grid, name, itype);
+      args.check();
+    }
+    pbFinalizePlugin(solver, "debugGridInfo", timed);
+    return result;
+  }
+  catch (const std::exception &err) {
+    pbSetError("debugGridInfo", err.what());
+    return nullptr;
+  }
+}
+// Static Pb::Register object associating the name "debugGridInfo" with the wrapper _W_4.
+static const Pb::Register _RP_debugGridInfo("", "debugGridInfo", _W_4);
+extern "C" {
+// C-linkage hook that only references the registration object via KEEP_UNUSED.
+// NOTE(review): presumably this keeps the object from being dropped at link time - confirm.
+void PbRegister_debugGridInfo()
+{
+  KEEP_UNUSED(_RP_debugGridInfo);
+}
+}
+
+// The following methods are helper functions to recreate the velocity and flag grid from the
+// underlying FLIP simulation. They cannot simply be loaded because of the upres to a higher
+// resolution, instead a levelset is used.
+
+//! Kernel over all cells of flags: a cell whose phi value is negative gets its flag replaced
+//! by itype, unless its current flag shares a bit with exclude.
+struct knSetFlagsFromLevelset : public KernelBase {
+  knSetFlagsFromLevelset(FlagGrid &flags,
+                         const Grid<Real> &phi,
+                         const int exclude = FlagGrid::TypeObstacle,
+                         const int itype = FlagGrid::TypeFluid)
+      : KernelBase(&flags, 0), flags(flags), phi(phi), exclude(exclude), itype(itype)
+  {
+    runMessage();
+    run();
+  }
+  //! Handles the cell with linear index idx.
+  inline void op(IndexInt idx,
+                 FlagGrid &flags,
+                 const Grid<Real> &phi,
+                 const int exclude = FlagGrid::TypeObstacle,
+                 const int itype = FlagGrid::TypeFluid) const
+  {
+    // only cells with a negative phi value are touched
+    if (!(phi(idx) < 0)) {
+      return;
+    }
+    // excluded cell types keep their flag
+    if (flags(idx) & exclude) {
+      return;
+    }
+    flags(idx) = itype;
+  }
+  inline FlagGrid &getArg0()
+  {
+    return flags;
+  }
+  typedef FlagGrid type0;
+  inline const Grid<Real> &getArg1()
+  {
+    return phi;
+  }
+  typedef Grid<Real> type1;
+  inline const int &getArg2()
+  {
+    return exclude;
+  }
+  typedef int type2;
+  inline const int &getArg3()
+  {
+    return itype;
+  }
+  typedef int type3;
+  //! Reports the kernel name and its range through debMsg.
+  void runMessage()
+  {
+    debMsg("Executing kernel knSetFlagsFromLevelset ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  }
+  //! tbb::parallel_for body: applies op() to each linear index of the sub-range.
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const IndexInt last = (IndexInt)__r.end();
+    for (IndexInt idx = __r.begin(); idx != last; ++idx) {
+      op(idx, flags, phi, exclude, itype);
+    }
+  }
+  //! Runs the kernel over [0, size) in parallel.
+  void run()
+  {
+    const tbb::blocked_range<IndexInt> fullRange(0, size);
+    tbb::parallel_for(fullRange, *this);
+  }
+  FlagGrid &flags;
+  const Grid<Real> &phi;
+  const int exclude;
+  const int itype;
+};
+
+//! Sets the flag of every cell with negative phi to itype, skipping cells whose flag matches
+//! exclude; thin wrapper that runs knSetFlagsFromLevelset.
+void setFlagsFromLevelset(FlagGrid &flags,
+                          const Grid<Real> &phi,
+                          const int exclude = FlagGrid::TypeObstacle,
+                          const int itype = FlagGrid::TypeFluid)
+{
+  // constructing the kernel object executes it (its constructor calls run())
+  knSetFlagsFromLevelset(flags, phi, exclude, itype);
+}
+//! Python wrapper for setFlagsFromLevelset: reads "flags", "phi" and the optional "exclude"
+//! and "itype", calls the plugin between pbPreparePlugin/pbFinalizePlugin and returns
+//! Python None. A std::exception is reported through pbSetError and a null pointer is
+//! returned.
+static PyObject *_W_5(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs args(_linargs, _kwds);
+    FluidSolver *solver = args.obtainParent();
+    const bool timed = !args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(solver, "setFlagsFromLevelset", timed);
+    PyObject *result = nullptr;
+    {
+      ArgLocker lock;
+      FlagGrid &flags = *args.getPtr<FlagGrid>("flags", 0, &lock);
+      const Grid<Real> &phi = *args.getPtr<Grid<Real>>("phi", 1, &lock);
+      const int exclude = args.getOpt<int>("exclude", 2, FlagGrid::TypeObstacle, &lock);
+      const int itype = args.getOpt<int>("itype", 3, FlagGrid::TypeFluid, &lock);
+      result = getPyNone();
+      setFlagsFromLevelset(flags, phi, exclude, itype);
+      args.check();
+    }
+    pbFinalizePlugin(solver, "setFlagsFromLevelset", timed);
+    return result;
+  }
+  catch (const std::exception &err) {
+    pbSetError("setFlagsFromLevelset", err.what());
+    return nullptr;
+  }
+}
+// Static Pb::Register object associating the name "setFlagsFromLevelset" with the wrapper
+// _W_5.
+static const Pb::Register _RP_setFlagsFromLevelset("", "setFlagsFromLevelset", _W_5);
+extern "C" {
+// C-linkage hook that only references the registration object via KEEP_UNUSED.
+// NOTE(review): presumably this keeps the object from being dropped at link time - confirm.
+void PbRegister_setFlagsFromLevelset()
+{
+  KEEP_UNUSED(_RP_setFlagsFromLevelset);
+}
+}
+
+//! Kernel over all cells of v: wherever phi, interpolated at Vec3(i, j, k), is positive, the
+//! velocity entry v(i, j, k) is overwritten with the constant c.
+struct knSetMACFromLevelset : public KernelBase {
+  knSetMACFromLevelset(MACGrid &v, const Grid<Real> &phi, const Vec3 c)
+      : KernelBase(&v, 0), v(v), phi(phi), c(c)
+  {
+    runMessage();
+    run();
+  }
+  //! Handles cell (i, j, k).
+  inline void op(int i, int j, int k, MACGrid &v, const Grid<Real> &phi, const Vec3 c) const
+  {
+    // cells with a non-positive interpolated phi keep their velocity
+    if (!(phi.getInterpolated(Vec3(i, j, k)) > 0)) {
+      return;
+    }
+    v(i, j, k) = c;
+  }
+  inline MACGrid &getArg0()
+  {
+    return v;
+  }
+  typedef MACGrid type0;
+  inline const Grid<Real> &getArg1()
+  {
+    return phi;
+  }
+  typedef Grid<Real> type1;
+  inline const Vec3 &getArg2()
+  {
+    return c;
+  }
+  typedef Vec3 type2;
+  //! Reports the kernel name and its range through debMsg.
+  void runMessage()
+  {
+    debMsg("Executing kernel knSetMACFromLevelset ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  }
+  //! tbb::parallel_for body. For maxZ > 1 the sub-range enumerates k slices, otherwise it
+  //! enumerates j rows of the single slice k = 0.
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const int xEnd = maxX;
+    const int yEnd = maxY;
+    const int first = (int)__r.begin();
+    const int last = (int)__r.end();
+    if (maxZ > 1) {
+      for (int k = first; k != last; ++k) {
+        for (int j = 0; j < yEnd; ++j) {
+          for (int i = 0; i < xEnd; ++i) {
+            op(i, j, k, v, phi, c);
+          }
+        }
+      }
+      return;
+    }
+    const int k = 0;
+    for (int j = first; j != last; ++j) {
+      for (int i = 0; i < xEnd; ++i) {
+        op(i, j, k, v, phi, c);
+      }
+    }
+  }
+  //! Runs the kernel in parallel: over [minZ, maxZ) when maxZ > 1, else over [0, maxY).
+  void run()
+  {
+    if (maxZ > 1) {
+      const tbb::blocked_range<IndexInt> slices(minZ, maxZ);
+      tbb::parallel_for(slices, *this);
+    }
+    else {
+      const tbb::blocked_range<IndexInt> rows(0, maxY);
+      tbb::parallel_for(rows, *this);
+    }
+  }
+  MACGrid &v;
+  const Grid<Real> &phi;
+  const Vec3 c;
+};
+
+//! Overwrites v with the constant c wherever the interpolated phi is positive; thin wrapper
+//! that runs knSetMACFromLevelset.
+void setMACFromLevelset(MACGrid &v, const Grid<Real> &phi, const Vec3 c)
+{
+  // constructing the kernel object executes it (its constructor calls run())
+  knSetMACFromLevelset(v, phi, c);
+}
+//! Python wrapper for setMACFromLevelset: reads "v", "phi" and "c", calls the plugin between
+//! pbPreparePlugin/pbFinalizePlugin and returns Python None. A std::exception is reported
+//! through pbSetError and a null pointer is returned.
+static PyObject *_W_6(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs args(_linargs, _kwds);
+    FluidSolver *solver = args.obtainParent();
+    const bool timed = !args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(solver, "setMACFromLevelset", timed);
+    PyObject *result = nullptr;
+    {
+      ArgLocker lock;
+      MACGrid &v = *args.getPtr<MACGrid>("v", 0, &lock);
+      const Grid<Real> &phi = *args.getPtr<Grid<Real>>("phi", 1, &lock);
+      const Vec3 c = args.get<Vec3>("c", 2, &lock);
+      result = getPyNone();
+      setMACFromLevelset(v, phi, c);
+      args.check();
+    }
+    pbFinalizePlugin(solver, "setMACFromLevelset", timed);
+    return result;
+  }
+  catch (const std::exception &err) {
+    pbSetError("setMACFromLevelset", err.what());
+    return nullptr;
+  }
+}
+// Static Pb::Register object associating the name "setMACFromLevelset" with the wrapper _W_6.
+static const Pb::Register _RP_setMACFromLevelset("", "setMACFromLevelset", _W_6);
+extern "C" {
+// C-linkage hook that only references the registration object via KEEP_UNUSED.
+// NOTE(review): presumably this keeps the object from being dropped at link time - confirm.
+void PbRegister_setMACFromLevelset()
+{
+  KEEP_UNUSED(_RP_setMACFromLevelset);
+}
+}
+
+//----------------------------------------------------------------------------------------------------------------------------------------------------
+// END Secondary Particles for FLIP
+//----------------------------------------------------------------------------------------------------------------------------------------------------
+#pragma endregion
+
+#pragma region Legacy Methods(still useful for debugging)
+//-----------------------------------------------------------------------------------------------------------------------------------
+//-----------------
+// Legacy Methods (still useful for debugging)
+//----------------------------------------------------------------------------------------------------------------------------------------------------
+
+// LEGACY METHOD! Use flipComputeSecondaryParticlePotentials instead!
+// computes trapped air potential for all fluid cells in &flags and saves it in &pot
+
+//! Kernel (boundary width 1) that writes the trapped air potential into pot for every cell
+//! whose flag has a bit of itype set.
+//! For such a cell the scaled velocity differences to all neighbor cells of type jtype inside
+//! the cube of half-width radius are accumulated, weighted by direction and distance, and the
+//! sum is clamped into [tauMin, tauMax] and normalized by (tauMax - tauMin).
+//! Positions and velocities are multiplied by scaleFromManta before use.
+//! NOTE: tauMax == tauMin makes the final normalization divide by zero.
+struct knFlipComputePotentialTrappedAir : public KernelBase {
+  knFlipComputePotentialTrappedAir(Grid<Real> &pot,
+                                   const FlagGrid &flags,
+                                   const MACGrid &v,
+                                   const int radius,
+                                   const Real tauMin,
+                                   const Real tauMax,
+                                   const Real scaleFromManta,
+                                   const int itype = FlagGrid::TypeFluid,
+                                   const int jtype = FlagGrid::TypeFluid)
+      : KernelBase(&pot, 1),
+        pot(pot),
+        flags(flags),
+        v(v),
+        radius(radius),
+        tauMin(tauMin),
+        tauMax(tauMax),
+        scaleFromManta(scaleFromManta),
+        itype(itype),
+        jtype(jtype)
+  {
+    runMessage();
+    run();
+  }
+  //! Computes the potential of cell (i, j, k); cells not matching itype are left untouched.
+  inline void op(int i,
+                 int j,
+                 int k,
+                 Grid<Real> &pot,
+                 const FlagGrid &flags,
+                 const MACGrid &v,
+                 const int radius,
+                 const Real tauMin,
+                 const Real tauMax,
+                 const Real scaleFromManta,
+                 const int itype = FlagGrid::TypeFluid,
+                 const int jtype = FlagGrid::TypeFluid) const
+  {
+
+    if (!(flags(i, j, k) & itype))
+      return;
+
+    const Vec3 xi = scaleFromManta * Vec3(i, j, k);  // scale to unit cube
+    const Vec3 vi = scaleFromManta * v.getCentered(i, j, k);
+    // estimate sqrt(2)*radius resp. sqrt(3)*radius for h, due to squared resp. cubic
+    // neighbor area; h is the same for every neighbor, so compute it once
+    const Real h = !pot.is3D() ? 1.414 * radius : 1.732 * radius;
+    Real vdiff = 0;
+    for (IndexInt x = i - radius; x <= i + radius; x++) {
+      for (IndexInt y = j - radius; y <= j + radius; y++) {
+        for (IndexInt z = k - radius; z <= k + radius; z++) {
+          if (x == i && y == j && z == k)
+            continue;
+          // the neighborhood can reach beyond the grid (radius may exceed the kernel
+          // boundary of 1, and in 2D z leaves the single slice): never index those cells
+          if (!flags.isInBounds(Vec3i(x, y, z)))
+            continue;
+          if (!(flags(x, y, z) & jtype))
+            continue;
+
+          const Vec3 xj = scaleFromManta * Vec3(x, y, z);  // scale to unit cube
+          const Vec3 vj = scaleFromManta * v.getCentered(x, y, z);
+          const Vec3 xij = xi - xj;
+          const Vec3 vij = vi - vj;
+          vdiff += norm(vij) * (1 - dot(getNormalized(vij), getNormalized(xij))) *
+                   (1 - norm(xij) / h);
+        }
+      }
+    }
+    // clamp vdiff to [tauMin, tauMax] and map it linearly to [0, 1]
+    pot(i, j, k) = (std::min(vdiff, tauMax) - std::min(vdiff, tauMin)) / (tauMax - tauMin);
+  }
+  inline Grid<Real> &getArg0()
+  {
+    return pot;
+  }
+  typedef Grid<Real> type0;
+  inline const FlagGrid &getArg1()
+  {
+    return flags;
+  }
+  typedef FlagGrid type1;
+  inline const MACGrid &getArg2()
+  {
+    return v;
+  }
+  typedef MACGrid type2;
+  inline const int &getArg3()
+  {
+    return radius;
+  }
+  typedef int type3;
+  inline const Real &getArg4()
+  {
+    return tauMin;
+  }
+  typedef Real type4;
+  inline const Real &getArg5()
+  {
+    return tauMax;
+  }
+  typedef Real type5;
+  inline const Real &getArg6()
+  {
+    return scaleFromManta;
+  }
+  typedef Real type6;
+  inline const int &getArg7()
+  {
+    return itype;
+  }
+  typedef int type7;
+  inline const int &getArg8()
+  {
+    return jtype;
+  }
+  typedef int type8;
+  //! Reports the kernel name and its range through debMsg.
+  void runMessage()
+  {
+    debMsg("Executing kernel knFlipComputePotentialTrappedAir ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  //! tbb::parallel_for body. For maxZ > 1 the sub-range enumerates k slices, otherwise it
+  //! enumerates j rows of the single slice k = 0; i and j start at 1 (kernel boundary).
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, pot, flags, v, radius, tauMin, tauMax, scaleFromManta, itype, jtype);
+    }
+    else {
+      const int k = 0;
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = 1; i < _maxX; i++)
+          op(i, j, k, pot, flags, v, radius, tauMin, tauMax, scaleFromManta, itype, jtype);
+    }
+  }
+  //! Runs the kernel in parallel: over [minZ, maxZ) when maxZ > 1, else over [1, maxY).
+  void run()
+  {
+    if (maxZ > 1)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
+  }
+  Grid<Real> &pot;
+  const FlagGrid &flags;
+  const MACGrid &v;
+  const int radius;
+  const Real tauMin;
+  const Real tauMax;
+  const Real scaleFromManta;
+  const int itype;
+  const int jtype;
+};
+
+//! Legacy entry point: clears pot and then fills it with the trapped air potential by running
+//! knFlipComputePotentialTrappedAir with the given parameters.
+void flipComputePotentialTrappedAir(Grid<Real> &pot,
+                                    const FlagGrid &flags,
+                                    const MACGrid &v,
+                                    const int radius,
+                                    const Real tauMin,
+                                    const Real tauMax,
+                                    const Real scaleFromManta,
+                                    const int itype = FlagGrid::TypeFluid,
+                                    const int jtype = FlagGrid::TypeFluid)
+{
+  // cells the kernel does not write keep the cleared value
+  pot.clear();
+
+  // constructing the kernel object executes it (its constructor calls run())
+  knFlipComputePotentialTrappedAir(
+      pot, flags, v, radius, tauMin, tauMax, scaleFromManta, itype, jtype);
+}
+//! Python wrapper for flipComputePotentialTrappedAir: reads the arguments from
+//! _linargs/_kwds (by name or position), calls the plugin between
+//! pbPreparePlugin/pbFinalizePlugin and returns Python None. A std::exception is reported
+//! through pbSetError and a null pointer is returned.
+static PyObject *_W_7(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs args(_linargs, _kwds);
+    FluidSolver *solver = args.obtainParent();
+    const bool timed = !args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(solver, "flipComputePotentialTrappedAir", timed);
+    PyObject *result = nullptr;
+    {
+      ArgLocker lock;
+      Grid<Real> &pot = *args.getPtr<Grid<Real>>("pot", 0, &lock);
+      const FlagGrid &flags = *args.getPtr<FlagGrid>("flags", 1, &lock);
+      const MACGrid &v = *args.getPtr<MACGrid>("v", 2, &lock);
+      const int radius = args.get<int>("radius", 3, &lock);
+      const Real tauMin = args.get<Real>("tauMin", 4, &lock);
+      const Real tauMax = args.get<Real>("tauMax", 5, &lock);
+      const Real scaleFromManta = args.get<Real>("scaleFromManta", 6, &lock);
+      // optional cell type masks, defaulting to fluid cells
+      const int itype = args.getOpt<int>("itype", 7, FlagGrid::TypeFluid, &lock);
+      const int jtype = args.getOpt<int>("jtype", 8, FlagGrid::TypeFluid, &lock);
+      result = getPyNone();
+      flipComputePotentialTrappedAir(
+          pot, flags, v, radius, tauMin, tauMax, scaleFromManta, itype, jtype);
+      args.check();
+    }
+    pbFinalizePlugin(solver, "flipComputePotentialTrappedAir", timed);
+    return result;
+  }
+  catch (const std::exception &err) {
+    pbSetError("flipComputePotentialTrappedAir", err.what());
+    return nullptr;
+  }
+}
+// File-scope registration object, constructed from an empty first argument, the plugin name
+// string and the wrapper function _W_7.
+// NOTE(review): presumably this is what makes the plugin callable from Python - confirm in
+// the Pb::Register implementation.
+static const Pb::Register _RP_flipComputePotentialTrappedAir("",
+                                                             "flipComputePotentialTrappedAir",
+                                                             _W_7);
+extern "C" {
+// C-linkage hook whose body only passes the registration object to KEEP_UNUSED.
+// NOTE(review): presumably this keeps the object from being discarded at link time - confirm
+// against the KEEP_UNUSED macro definition.
+void PbRegister_flipComputePotentialTrappedAir()
+{
+  KEEP_UNUSED(_RP_flipComputePotentialTrappedAir);
+}
+}
+
+// LEGACY METHOD! Use flipComputeSecondaryParticlePotentials instead!
+// computes kinetic energy potential for all fluid cells in &flags and saves it in &pot
+
+// Kernel writing a normalized kinetic-energy potential into `pot` for every cell whose flag
+// matches `itype`. Constructing the object runs the kernel immediately.
+struct knFlipComputePotentialKineticEnergy : public KernelBase {
+  // Stores the arguments, then prints the debug message and launches the parallel loop.
+  knFlipComputePotentialKineticEnergy(Grid<Real> &pot,
+                                      const FlagGrid &flags,
+                                      const MACGrid &v,
+                                      const Real tauMin,
+                                      const Real tauMax,
+                                      const Real scaleFromManta,
+                                      const int itype = FlagGrid::TypeFluid)
+      : KernelBase(&pot, 0),
+        pot(pot),
+        flags(flags),
+        v(v),
+        tauMin(tauMin),
+        tauMax(tauMax),
+        scaleFromManta(scaleFromManta),
+        itype(itype)
+  {
+    runMessage();
+    run();
+  }
+
+  // Per-cell body: pot(i,j,k) = (min(ek, tauMax) - min(ek, tauMin)) / (tauMax - tauMin),
+  // where ek = 0.5 * 125 * |scaleFromManta * v_centered|^2. Cells not matching itype are
+  // left untouched.
+  inline void op(int i,
+                 int j,
+                 int k,
+                 Grid<Real> &pot,
+                 const FlagGrid &flags,
+                 const MACGrid &v,
+                 const Real tauMin,
+                 const Real tauMax,
+                 const Real scaleFromManta,
+                 const int itype = FlagGrid::TypeFluid) const
+  {
+    if (!(flags(i, j, k) & itype)) {
+      return;
+    }
+
+    // velocity scaled to the unit cube
+    const Vec3 scaledVel = scaleFromManta * v.getCentered(i, j, k);
+    // 125 is an arbitrary constant for the mass; the potential adjusts with the thresholds
+    const Real halfMass = Real(0.5) * 125;
+    const Real kinetic = halfMass * normSquare(scaledVel);
+
+    const Real cappedHigh = std::min(kinetic, tauMax);
+    const Real cappedLow = std::min(kinetic, tauMin);
+    pot(i, j, k) = (cappedHigh - cappedLow) / (tauMax - tauMin);
+  }
+
+  // Positional accessors and argument types, one pair per constructor argument.
+  inline Grid<Real> &getArg0() { return pot; }
+  using type0 = Grid<Real>;
+  inline const FlagGrid &getArg1() { return flags; }
+  using type1 = FlagGrid;
+  inline const MACGrid &getArg2() { return v; }
+  using type2 = MACGrid;
+  inline const Real &getArg3() { return tauMin; }
+  using type3 = Real;
+  inline const Real &getArg4() { return tauMax; }
+  using type4 = Real;
+  inline const Real &getArg5() { return scaleFromManta; }
+  using type5 = Real;
+  inline const int &getArg6() { return itype; }
+  using type6 = int;
+
+  // Emits the kernel name (level 3) and the iteration range (level 4) through debMsg.
+  void runMessage()
+  {
+    debMsg("Executing kernel knFlipComputePotentialKineticEnergy ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  }
+
+  // Worker invoked by tbb::parallel_for. The range holds k slices when maxZ > 1, and j rows
+  // (with k fixed to 0) otherwise.
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const int xEnd = maxX;
+    const int yEnd = maxY;
+    const int first = (int)__r.begin();
+    const int last = (int)__r.end();
+
+    if (maxZ <= 1) {
+      const int slice = 0;
+      for (int row = first; row != last; row++) {
+        for (int col = 0; col < xEnd; col++) {
+          op(col, row, slice, pot, flags, v, tauMin, tauMax, scaleFromManta, itype);
+        }
+      }
+      return;
+    }
+
+    for (int slice = first; slice != last; slice++) {
+      for (int row = 0; row < yEnd; row++) {
+        for (int col = 0; col < xEnd; col++) {
+          op(col, row, slice, pot, flags, v, tauMin, tauMax, scaleFromManta, itype);
+        }
+      }
+    }
+  }
+
+  // Splits the work over z (minZ..maxZ) when maxZ > 1, otherwise over y (0..maxY).
+  void run()
+  {
+    if (maxZ > 1) {
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+      return;
+    }
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
+  }
+
+  Grid<Real> &pot;
+  const FlagGrid &flags;
+  const MACGrid &v;
+  const Real tauMin;
+  const Real tauMax;
+  const Real scaleFromManta;
+  const int itype;
+};
+
+// Plugin entry point for the kinetic-energy potential.
+// Calls pot.clear() and then constructs knFlipComputePotentialKineticEnergy, whose
+// constructor runs the kernel. All arguments are forwarded unchanged; itype defaults to
+// FlagGrid::TypeFluid.
+// NOTE(review): the kernel divides by (tauMax - tauMin); callers presumably pass
+// tauMax != tauMin - confirm.
+void flipComputePotentialKineticEnergy(Grid<Real> &pot,
+                                       const FlagGrid &flags,
+                                       const MACGrid &v,
+                                       const Real tauMin,
+                                       const Real tauMax,
+                                       const Real scaleFromManta,
+                                       const int itype = FlagGrid::TypeFluid)
+{
+  // reset the output grid; the kernel only writes cells whose flag matches itype
+  pot.clear();
+  knFlipComputePotentialKineticEnergy(pot, flags, v, tauMin, tauMax, scaleFromManta, itype);
+}
+// Python binding for flipComputePotentialKineticEnergy: unpacks the call arguments, runs the
+// plugin between pbPreparePlugin() and pbFinalizePlugin(), and reports any std::exception
+// through pbSetError() while returning 0.
+static PyObject *_W_8(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs pyArgs(_linargs, _kwds);
+    FluidSolver *solver = pyArgs.obtainParent();
+    const bool skipTiming = pyArgs.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(solver, "flipComputePotentialKineticEnergy", !skipTiming);
+
+    PyObject *result = 0;
+    {
+      // the locker only lives while the arguments are fetched and the plugin runs
+      ArgLocker argLock;
+      Grid<Real> &potGrid = *pyArgs.getPtr<Grid<Real>>("pot", 0, &argLock);
+      const FlagGrid &flagGrid = *pyArgs.getPtr<FlagGrid>("flags", 1, &argLock);
+      const MACGrid &velGrid = *pyArgs.getPtr<MACGrid>("v", 2, &argLock);
+      const Real lowerTau = pyArgs.get<Real>("tauMin", 3, &argLock);
+      const Real upperTau = pyArgs.get<Real>("tauMax", 4, &argLock);
+      const Real mantaScale = pyArgs.get<Real>("scaleFromManta", 5, &argLock);
+      const int typeI = pyArgs.getOpt<int>("itype", 6, FlagGrid::TypeFluid, &argLock);
+
+      // the return value is fetched before the plugin call, as in the generated original
+      result = getPyNone();
+      flipComputePotentialKineticEnergy(
+          potGrid, flagGrid, velGrid, lowerTau, upperTau, mantaScale, typeI);
+      pyArgs.check();
+    }
+
+    pbFinalizePlugin(solver, "flipComputePotentialKineticEnergy", !skipTiming);
+    return result;
+  }
+  catch (std::exception &err) {
+    pbSetError("flipComputePotentialKineticEnergy", err.what());
+    return 0;
+  }
+}
+// File-scope registration object, constructed from an empty first argument, the plugin name
+// string and the wrapper function _W_8.
+// NOTE(review): presumably this is what makes the plugin callable from Python - confirm in
+// the Pb::Register implementation.
+static const Pb::Register _RP_flipComputePotentialKineticEnergy(
+    "", "flipComputePotentialKineticEnergy", _W_8);
+extern "C" {
+// C-linkage hook whose body only passes the registration object to KEEP_UNUSED.
+// NOTE(review): presumably this keeps the object from being discarded at link time - confirm
+// against the KEEP_UNUSED macro definition.
+void PbRegister_flipComputePotentialKineticEnergy()
+{
+  KEEP_UNUSED(_RP_flipComputePotentialKineticEnergy);
+}
+}
+
+// LEGACY METHOD! Use flipComputeSecondaryParticlePotentials instead!
+// computes wave crest potential for all fluid cells in &flags and saves it in &pot
+
+// Kernel writing a wave-crest potential into `pot` for every cell whose flag matches `itype`.
+// For each such cell a curvature-like value kappa is accumulated over the cube of neighbours
+// within `radius` whose flag matches `jtype`. The result is clamped between tauMin and tauMax
+// and normalized by (tauMax - tauMin). Constructing the object runs the kernel immediately.
+struct knFlipComputePotentialWaveCrest : public KernelBase {
+  // Stores the arguments, then prints the debug message and launches the parallel loop.
+  knFlipComputePotentialWaveCrest(Grid<Real> &pot,
+                                  const FlagGrid &flags,
+                                  const MACGrid &v,
+                                  const int radius,
+                                  Grid<Vec3> &normal,
+                                  const Real tauMin,
+                                  const Real tauMax,
+                                  const Real scaleFromManta,
+                                  const int itype = FlagGrid::TypeFluid,
+                                  const int jtype = FlagGrid::TypeFluid)
+      : KernelBase(&pot, 1),
+        pot(pot),
+        flags(flags),
+        v(v),
+        radius(radius),
+        normal(normal),
+        tauMin(tauMin),
+        tauMax(tauMax),
+        scaleFromManta(scaleFromManta),
+        itype(itype),
+        jtype(jtype)
+  {
+    runMessage();
+    run();
+  }
+
+  // Per-cell body. Cells not matching itype are left untouched. Matching cells always get a
+  // value: the normalized kappa when the velocity direction and the cell normal agree
+  // (dot >= 0.6), otherwise 0.
+  inline void op(int i,
+                 int j,
+                 int k,
+                 Grid<Real> &pot,
+                 const FlagGrid &flags,
+                 const MACGrid &v,
+                 const int radius,
+                 Grid<Vec3> &normal,
+                 const Real tauMin,
+                 const Real tauMax,
+                 const Real scaleFromManta,
+                 const int itype = FlagGrid::TypeFluid,
+                 const int jtype = FlagGrid::TypeFluid) const
+  {
+
+    if (!(flags(i, j, k) & itype))
+      return;
+
+    const Vec3 &xi = scaleFromManta * Vec3(i, j, k);  // scale to unit cube
+    const Vec3 &vi = scaleFromManta * v.getCentered(i, j, k);
+    const Vec3 &ni = normal(i, j, k);
+
+    // Estimate sqrt(2)*radius resp. sqrt(3)*radius for h, due to squared resp. cubic neighbor
+    // area. The value does not depend on the neighbour, so it is computed once per cell.
+    const Real h = !pot.is3D() ? 1.414 * radius : 1.732 * radius;
+
+    Real kappa = 0;
+    for (IndexInt x = i - radius; x <= i + radius; x++) {
+      for (IndexInt y = j - radius; y <= j + radius; y++) {
+        for (IndexInt z = k - radius; z <= k + radius; z++) {
+          // Skip the centre cell, neighbours outside the grid and neighbours of another type.
+          // The bounds test is required: the kernel only skips a 1-cell border, and in the 2D
+          // path k is 0, so k - radius would otherwise index outside the grid.
+          // NOTE(review): assumes `normal` has the same dimensions as `flags` - confirm.
+          if ((x == i && y == j && z == k) || !flags.isInBounds(Vec3i(x, y, z)) ||
+              !(flags(x, y, z) & jtype))
+            continue;
+          const Vec3 &xj = scaleFromManta * Vec3(x, y, z);  // scale to unit cube
+          const Vec3 &nj = normal(x, y, z);
+          const Vec3 xij = xi - xj;
+          if (dot(getNormalized(xij), ni) < 0) {  // identifies wave crests
+            kappa += (1 - dot(ni, nj)) * (1 - norm(xij) / h);
+          }
+        }
+      }
+    }
+
+    if (dot(getNormalized(vi), ni) >= 0.6) {  // avoid marking borders of the scene as wave crest
+      pot(i, j, k) = (std::min(kappa, tauMax) - std::min(kappa, tauMin)) / (tauMax - tauMin);
+    }
+    else {
+      pot(i, j, k) = Real(0);
+    }
+  }
+
+  // Positional accessors and argument types, one pair per constructor argument.
+  inline Grid<Real> &getArg0()
+  {
+    return pot;
+  }
+  typedef Grid<Real> type0;
+  inline const FlagGrid &getArg1()
+  {
+    return flags;
+  }
+  typedef FlagGrid type1;
+  inline const MACGrid &getArg2()
+  {
+    return v;
+  }
+  typedef MACGrid type2;
+  inline const int &getArg3()
+  {
+    return radius;
+  }
+  typedef int type3;
+  inline Grid<Vec3> &getArg4()
+  {
+    return normal;
+  }
+  typedef Grid<Vec3> type4;
+  inline const Real &getArg5()
+  {
+    return tauMin;
+  }
+  typedef Real type5;
+  inline const Real &getArg6()
+  {
+    return tauMax;
+  }
+  typedef Real type6;
+  inline const Real &getArg7()
+  {
+    return scaleFromManta;
+  }
+  typedef Real type7;
+  inline const int &getArg8()
+  {
+    return itype;
+  }
+  typedef int type8;
+  inline const int &getArg9()
+  {
+    return jtype;
+  }
+  typedef int type9;
+
+  // Emits the kernel name (level 3) and the iteration range (level 4) through debMsg.
+  void runMessage()
+  {
+    debMsg("Executing kernel knFlipComputePotentialWaveCrest ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+
+  // Worker invoked by tbb::parallel_for. The range holds k slices when maxZ > 1, and j rows
+  // (with k fixed to 0) otherwise. i and j start at 1.
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i,
+               j,
+               k,
+               pot,
+               flags,
+               v,
+               radius,
+               normal,
+               tauMin,
+               tauMax,
+               scaleFromManta,
+               itype,
+               jtype);
+    }
+    else {
+      const int k = 0;
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = 1; i < _maxX; i++)
+          op(i, j, k, pot, flags, v, radius, normal, tauMin, tauMax, scaleFromManta, itype, jtype);
+    }
+  }
+
+  // Splits the work over z (minZ..maxZ) when maxZ > 1, otherwise over y (1..maxY).
+  void run()
+  {
+    if (maxZ > 1)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
+  }
+  Grid<Real> &pot;
+  const FlagGrid &flags;
+  const MACGrid &v;
+  const int radius;
+  Grid<Vec3> &normal;
+  const Real tauMin;
+  const Real tauMax;
+  const Real scaleFromManta;
+  const int itype;
+  const int jtype;
+};
+
+// Plugin entry point for the wave-crest potential.
+// Calls pot.clear() and then constructs knFlipComputePotentialWaveCrest, whose constructor
+// runs the kernel. All arguments are forwarded unchanged; itype and jtype both default to
+// FlagGrid::TypeFluid.
+// NOTE(review): `normal` is presumably the grid filled by flipComputeSurfaceNormals -
+// confirm with the calling script.
+void flipComputePotentialWaveCrest(Grid<Real> &pot,
+                                   const FlagGrid &flags,
+                                   const MACGrid &v,
+                                   const int radius,
+                                   Grid<Vec3> &normal,
+                                   const Real tauMin,
+                                   const Real tauMax,
+                                   const Real scaleFromManta,
+                                   const int itype = FlagGrid::TypeFluid,
+                                   const int jtype = FlagGrid::TypeFluid)
+{
+
+  // reset the output grid; the kernel only writes cells whose flag matches itype
+  pot.clear();
+  knFlipComputePotentialWaveCrest(
+      pot, flags, v, radius, normal, tauMin, tauMax, scaleFromManta, itype, jtype);
+}
+// Python binding for flipComputePotentialWaveCrest: unpacks the call arguments, runs the
+// plugin between pbPreparePlugin() and pbFinalizePlugin(), and reports any std::exception
+// through pbSetError() while returning 0.
+static PyObject *_W_9(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs pyArgs(_linargs, _kwds);
+    FluidSolver *solver = pyArgs.obtainParent();
+    const bool skipTiming = pyArgs.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(solver, "flipComputePotentialWaveCrest", !skipTiming);
+
+    PyObject *result = 0;
+    {
+      // the locker only lives while the arguments are fetched and the plugin runs
+      ArgLocker argLock;
+      Grid<Real> &potGrid = *pyArgs.getPtr<Grid<Real>>("pot", 0, &argLock);
+      const FlagGrid &flagGrid = *pyArgs.getPtr<FlagGrid>("flags", 1, &argLock);
+      const MACGrid &velGrid = *pyArgs.getPtr<MACGrid>("v", 2, &argLock);
+      const int searchRadius = pyArgs.get<int>("radius", 3, &argLock);
+      Grid<Vec3> &normalGrid = *pyArgs.getPtr<Grid<Vec3>>("normal", 4, &argLock);
+      const Real lowerTau = pyArgs.get<Real>("tauMin", 5, &argLock);
+      const Real upperTau = pyArgs.get<Real>("tauMax", 6, &argLock);
+      const Real mantaScale = pyArgs.get<Real>("scaleFromManta", 7, &argLock);
+      const int typeI = pyArgs.getOpt<int>("itype", 8, FlagGrid::TypeFluid, &argLock);
+      const int typeJ = pyArgs.getOpt<int>("jtype", 9, FlagGrid::TypeFluid, &argLock);
+
+      // the return value is fetched before the plugin call, as in the generated original
+      result = getPyNone();
+      flipComputePotentialWaveCrest(potGrid,
+                                    flagGrid,
+                                    velGrid,
+                                    searchRadius,
+                                    normalGrid,
+                                    lowerTau,
+                                    upperTau,
+                                    mantaScale,
+                                    typeI,
+                                    typeJ);
+      pyArgs.check();
+    }
+
+    pbFinalizePlugin(solver, "flipComputePotentialWaveCrest", !skipTiming);
+    return result;
+  }
+  catch (std::exception &err) {
+    pbSetError("flipComputePotentialWaveCrest", err.what());
+    return 0;
+  }
+}
+// File-scope registration object, constructed from an empty first argument, the plugin name
+// string and the wrapper function _W_9.
+// NOTE(review): presumably this is what makes the plugin callable from Python - confirm in
+// the Pb::Register implementation.
+static const Pb::Register _RP_flipComputePotentialWaveCrest("",
+                                                            "flipComputePotentialWaveCrest",
+                                                            _W_9);
+extern "C" {
+// C-linkage hook whose body only passes the registration object to KEEP_UNUSED.
+// NOTE(review): presumably this keeps the object from being discarded at link time - confirm
+// against the KEEP_UNUSED macro definition.
+void PbRegister_flipComputePotentialWaveCrest()
+{
+  KEEP_UNUSED(_RP_flipComputePotentialWaveCrest);
+}
+}
+
+// LEGACY METHOD! Use flipComputeSecondaryParticlePotentials instead!
+// computes normal grid &normal as gradient of levelset &phi and normalizes it
+
+// Kernel that replaces every entry of `normal` with its normalized value. `phi` is stored
+// and exposed through getArg1() but is not read by op(). Constructing the object runs the
+// kernel immediately.
+struct knFlipComputeSurfaceNormals : public KernelBase {
+  knFlipComputeSurfaceNormals(Grid<Vec3> &normal, const Grid<Real> &phi)
+      : KernelBase(&normal, 0), normal(normal), phi(phi)
+  {
+    runMessage();
+    run();
+  }
+
+  // Per-element body: normalizes the vector stored at linear index idx in place.
+  inline void op(IndexInt idx, Grid<Vec3> &normal, const Grid<Real> &phi) const
+  {
+    Vec3 &entry = normal[idx];
+    entry = getNormalized(entry);
+  }
+
+  // Positional accessors and argument types, one pair per constructor argument.
+  inline Grid<Vec3> &getArg0() { return normal; }
+  using type0 = Grid<Vec3>;
+  inline const Grid<Real> &getArg1() { return phi; }
+  using type1 = Grid<Real>;
+
+  // Emits the kernel name (level 3) and the iteration range (level 4) through debMsg.
+  void runMessage()
+  {
+    debMsg("Executing kernel knFlipComputeSurfaceNormals ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  }
+
+  // Worker invoked by tbb::parallel_for over a sub-range of linear indices.
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const IndexInt stop = (IndexInt)__r.end();
+    for (IndexInt cell = __r.begin(); cell != stop; cell++) {
+      op(cell, normal, phi);
+    }
+  }
+
+  // Processes the linear index range [0, size) in parallel.
+  void run()
+  {
+    const tbb::blocked_range<IndexInt> everything(0, size);
+    tbb::parallel_for(everything, *this);
+  }
+
+  Grid<Vec3> &normal;
+  const Grid<Real> &phi;
+};
+
+// Plugin entry point: calls GradientOp(normal, phi) and then runs
+// knFlipComputeSurfaceNormals, which normalizes every entry of `normal` in place.
+// NOTE(review): GradientOp presumably stores the gradient of phi in `normal` (per the legacy
+// comment above the kernel) - confirm against its definition.
+void flipComputeSurfaceNormals(Grid<Vec3> &normal, const Grid<Real> &phi)
+{
+  GradientOp(normal, phi);
+  // constructing the kernel executes it
+  knFlipComputeSurfaceNormals(normal, phi);
+}
+// Python binding for flipComputeSurfaceNormals: unpacks the call arguments, runs the plugin
+// between pbPreparePlugin() and pbFinalizePlugin(), and reports any std::exception through
+// pbSetError() while returning 0.
+static PyObject *_W_10(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs pyArgs(_linargs, _kwds);
+    FluidSolver *solver = pyArgs.obtainParent();
+    const bool skipTiming = pyArgs.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(solver, "flipComputeSurfaceNormals", !skipTiming);
+
+    PyObject *result = 0;
+    {
+      // the locker only lives while the arguments are fetched and the plugin runs
+      ArgLocker argLock;
+      Grid<Vec3> &normalGrid = *pyArgs.getPtr<Grid<Vec3>>("normal", 0, &argLock);
+      const Grid<Real> &phiGrid = *pyArgs.getPtr<Grid<Real>>("phi", 1, &argLock);
+
+      // the return value is fetched before the plugin call, as in the generated original
+      result = getPyNone();
+      flipComputeSurfaceNormals(normalGrid, phiGrid);
+      pyArgs.check();
+    }
+
+    pbFinalizePlugin(solver, "flipComputeSurfaceNormals", !skipTiming);
+    return result;
+  }
+  catch (std::exception &err) {
+    pbSetError("flipComputeSurfaceNormals", err.what());
+    return 0;
+  }
+}
+// File-scope registration object, constructed from an empty first argument, the plugin name
+// string and the wrapper function _W_10.
+// NOTE(review): presumably this is what makes the plugin callable from Python - confirm in
+// the Pb::Register implementation.
+static const Pb::Register _RP_flipComputeSurfaceNormals("", "flipComputeSurfaceNormals", _W_10);
+extern "C" {
+// C-linkage hook whose body only passes the registration object to KEEP_UNUSED.
+// NOTE(review): presumably this keeps the object from being discarded at link time - confirm
+// against the KEEP_UNUSED macro definition.
+void PbRegister_flipComputeSurfaceNormals()
+{
+  KEEP_UNUSED(_RP_flipComputeSurfaceNormals);
+}
+}
+
+// LEGACY METHOD! Use flipComputeSecondaryParticlePotentials instead!
+// computes the neighbor ratio for every fluid cell in &flags as the number of fluid neighbors over
+// the maximum possible number of fluid neighbors
+
+// Kernel writing, for every cell whose flag matches `itype`, the fraction of surrounding
+// cells (cube of half-width `radius`, centre excluded) that also match `itype`. Neighbours
+// matching `jtype` and neighbours outside the grid are excluded from both the numerator and
+// the denominator. Constructing the object runs the kernel immediately.
+struct knFlipUpdateNeighborRatio : public KernelBase {
+  // Stores the arguments, then prints the debug message and launches the parallel loop.
+  knFlipUpdateNeighborRatio(const FlagGrid &flags,
+                            Grid<Real> &neighborRatio,
+                            const int radius,
+                            const int itype = FlagGrid::TypeFluid,
+                            const int jtype = FlagGrid::TypeObstacle)
+      : KernelBase(&flags, 1),
+        flags(flags),
+        neighborRatio(neighborRatio),
+        radius(radius),
+        itype(itype),
+        jtype(jtype)
+  {
+    runMessage();
+    run();
+  }
+
+  // Per-cell body. Cells not matching itype are left untouched.
+  inline void op(int i,
+                 int j,
+                 int k,
+                 const FlagGrid &flags,
+                 Grid<Real> &neighborRatio,
+                 const int radius,
+                 const int itype = FlagGrid::TypeFluid,
+                 const int jtype = FlagGrid::TypeObstacle) const
+  {
+
+    if (!(flags(i, j, k) & itype))
+      return;
+
+    int countFluid = 0;
+    int countMaxFluid = 0;
+    for (IndexInt x = i - radius; x <= i + radius; x++) {
+      for (IndexInt y = j - radius; y <= j + radius; y++) {
+        for (IndexInt z = k - radius; z <= k + radius; z++) {
+          // Skip the centre cell, neighbours outside the grid and neighbours of type jtype.
+          // The bounds test is required: the kernel only skips a 1-cell border, and in the 2D
+          // path k is 0, so k - radius would otherwise index outside the grid.
+          if ((x == i && y == j && z == k) || !flags.isInBounds(Vec3i(x, y, z)) ||
+              (flags(x, y, z) & jtype))
+            continue;
+          // every remaining neighbour could hold fluid
+          countMaxFluid++;
+          if (flags(x, y, z) & itype)
+            countFluid++;
+        }
+      }
+    }
+
+    // With no eligible neighbour the ratio would be 0/0 (NaN), so store 0 instead.
+    if (countMaxFluid > 0) {
+      neighborRatio(i, j, k) = float(countFluid) / float(countMaxFluid);
+    }
+    else {
+      neighborRatio(i, j, k) = Real(0);
+    }
+  }
+
+  // Positional accessors and argument types, one pair per constructor argument.
+  inline const FlagGrid &getArg0()
+  {
+    return flags;
+  }
+  typedef FlagGrid type0;
+  inline Grid<Real> &getArg1()
+  {
+    return neighborRatio;
+  }
+  typedef Grid<Real> type1;
+  inline const int &getArg2()
+  {
+    return radius;
+  }
+  typedef int type2;
+  inline const int &getArg3()
+  {
+    return itype;
+  }
+  typedef int type3;
+  inline const int &getArg4()
+  {
+    return jtype;
+  }
+  typedef int type4;
+
+  // Emits the kernel name (level 3) and the iteration range (level 4) through debMsg.
+  void runMessage()
+  {
+    debMsg("Executing kernel knFlipUpdateNeighborRatio ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+
+  // Worker invoked by tbb::parallel_for. The range holds k slices when maxZ > 1, and j rows
+  // (with k fixed to 0) otherwise. i and j start at 1.
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, flags, neighborRatio, radius, itype, jtype);
+    }
+    else {
+      const int k = 0;
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = 1; i < _maxX; i++)
+          op(i, j, k, flags, neighborRatio, radius, itype, jtype);
+    }
+  }
+
+  // Splits the work over z (minZ..maxZ) when maxZ > 1, otherwise over y (1..maxY).
+  void run()
+  {
+    if (maxZ > 1)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
+  }
+  const FlagGrid &flags;
+  Grid<Real> &neighborRatio;
+  const int radius;
+  const int itype;
+  const int jtype;
+};
+
+// Plugin entry point for the neighbor ratio.
+// Calls neighborRatio.clear() and then constructs knFlipUpdateNeighborRatio, whose
+// constructor runs the kernel. All arguments are forwarded unchanged; itype defaults to
+// FlagGrid::TypeFluid and jtype to FlagGrid::TypeObstacle.
+void flipUpdateNeighborRatio(const FlagGrid &flags,
+                             Grid<Real> &neighborRatio,
+                             const int radius,
+                             const int itype = FlagGrid::TypeFluid,
+                             const int jtype = FlagGrid::TypeObstacle)
+{
+
+  // reset the output grid; the kernel only writes cells whose flag matches itype
+  neighborRatio.clear();
+  knFlipUpdateNeighborRatio(flags, neighborRatio, radius, itype, jtype);
+}
+// Python binding for flipUpdateNeighborRatio: unpacks the call arguments, runs the plugin
+// between pbPreparePlugin() and pbFinalizePlugin(), and reports any std::exception through
+// pbSetError() while returning 0.
+static PyObject *_W_11(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs pyArgs(_linargs, _kwds);
+    FluidSolver *solver = pyArgs.obtainParent();
+    const bool skipTiming = pyArgs.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(solver, "flipUpdateNeighborRatio", !skipTiming);
+
+    PyObject *result = 0;
+    {
+      // the locker only lives while the arguments are fetched and the plugin runs
+      ArgLocker argLock;
+      const FlagGrid &flagGrid = *pyArgs.getPtr<FlagGrid>("flags", 0, &argLock);
+      Grid<Real> &ratioGrid = *pyArgs.getPtr<Grid<Real>>("neighborRatio", 1, &argLock);
+      const int searchRadius = pyArgs.get<int>("radius", 2, &argLock);
+      const int typeI = pyArgs.getOpt<int>("itype", 3, FlagGrid::TypeFluid, &argLock);
+      const int typeJ = pyArgs.getOpt<int>("jtype", 4, FlagGrid::TypeObstacle, &argLock);
+
+      // the return value is fetched before the plugin call, as in the generated original
+      result = getPyNone();
+      flipUpdateNeighborRatio(flagGrid, ratioGrid, searchRadius, typeI, typeJ);
+      pyArgs.check();
+    }
+
+    pbFinalizePlugin(solver, "flipUpdateNeighborRatio", !skipTiming);
+    return result;
+  }
+  catch (std::exception &err) {
+    pbSetError("flipUpdateNeighborRatio", err.what());
+    return 0;
+  }
+}
+// File-scope registration object, constructed from an empty first argument, the plugin name
+// string and the wrapper function _W_11.
+// NOTE(review): presumably this is what makes the plugin callable from Python - confirm in
+// the Pb::Register implementation.
+static const Pb::Register _RP_flipUpdateNeighborRatio("", "flipUpdateNeighborRatio", _W_11);
+extern "C" {
+// C-linkage hook whose body only passes the registration object to KEEP_UNUSED.
+// NOTE(review): presumably this keeps the object from being discarded at link time - confirm
+// against the KEEP_UNUSED macro definition.
+void PbRegister_flipUpdateNeighborRatio()
+{
+  KEEP_UNUSED(_RP_flipUpdateNeighborRatio);
+}
+}
+
+//----------------------------------------------------------------------------------------------------------------------------------------------------
+// Legacy Methods (still useful for debugging)
+//----------------------------------------------------------------------------------------------------------------------------------------------------
+#pragma endregion
+
+}  // namespace Manta
diff --git a/extern/mantaflow/preprocessed/plugin/surfaceturbulence.cpp b/extern/mantaflow/preprocessed/plugin/surfaceturbulence.cpp
new file mode 100644
index 00000000000..465314f51ed
--- /dev/null
+++ b/extern/mantaflow/preprocessed/plugin/surfaceturbulence.cpp
@@ -0,0 +1,2189 @@
+
+
+// DO NOT EDIT !
+// This file is generated using the MantaFlow preprocessor (prep generate).
+
+/******************************************************************************
+ *
+ * MantaFlow fluid solver framework
+ * Copyright 2016 Olivier Mercier, oli.mercier@gmail.com
+ *
+ * This program is free software, distributed under the terms of the
+ * Apache License, Version 2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Surface Turbulence for Particle-Based Liquid Simulations
+ * Mercier et al., SIGGRAPH Asia 2015
+ *
+ * Possible speedups:
+ * - Only initialize surface points around coarse particles near the surface. Use the flags in
+ *   the fluid grid and only use cells with non-fluid neighbors.
+ *
+ ******************************************************************************/
+
+// use chrono stl for detailed timing only if available
+// GCC releases older than version 5 are pinned to USE_CHRONO 0.
+#ifdef __GNUC__
+#  if __GNUC__ < 5
+#    define USE_CHRONO 0
+#  endif
+#endif
+
+// When MANTA_WITHCPP11 is 1 and USE_CHRONO was not set above, enable it.
+// If neither branch defines USE_CHRONO, later `#if USE_CHRONO == 1` tests treat the
+// undefined macro as 0, so chrono stays disabled.
+#if MANTA_WITHCPP11 == 1
+#  ifndef USE_CHRONO
+#    define USE_CHRONO 1
+#  endif
+#endif
+
+#include <iomanip>
+#if USE_CHRONO == 1
+#  include <chrono>
+#endif
+#include "particle.h"
+
+using namespace std;
+namespace Manta {
+
+// own namespace for globals
+namespace SurfaceTurbulence {
+
+//
+// **** surface turbulence parameters ****
+//
+// Plain bundle of all tunable values of the surface turbulence code, shared through the
+// single namespace-scope instance `params` defined right after the struct.
+// NOTE(review): the meaning and units of most fields are only implied by their names -
+// confirm against the code that fills `params`.
+struct SurfaceTurbulenceParameters {
+  // divisor used when mapping particle positions to acceleration-grid cells
+  // (pos / params.res * accel res)
+  int res;
+  Real outerRadius;
+  int surfaceDensity;
+  int nbSurfaceMaintenanceIterations;
+  Real dt;
+  Real waveSpeed;
+  Real waveDamping;
+  Real waveSeedFrequency;
+  Real waveMaxAmplitude;
+  Real waveMaxFrequency;
+  Real waveMaxSeedingAmplitude;  // as ratio of max amp;
+  Real waveSeedingCurvatureThresholdRegionCenter;
+  Real waveSeedingCurvatureThresholdRegionRadius;
+  Real waveSeedStepSizeRatioOfMax;
+  Real innerRadius;
+  Real meanFineDistance;
+  Real constraintA;
+  Real normalRadius;
+  Real tangentRadius;
+  // lower (m) and upper (p) boundary coordinate per axis; the LOOP_GHOSTS_* macros mirror
+  // positions across these planes
+  Real bndXm, bndXp, bndYm, bndYp, bndZm, bndZp;
+};
+// Shared instance read by the acceleration grid and the neighbor/ghost loop macros.
+SurfaceTurbulenceParameters params;
+
+//
+// **** acceleration grid for particle neighbor queries ****
+//
+// Uniform res x res x res bucket grid for particle neighbor queries. Each cell stores the
+// ids of the particles whose position maps into it; access a cell with indices[i][j][k].
+//
+// Storage is held in nested vectors instead of manually allocated arrays, so the memory is
+// released automatically, calling init() again does not leak, and a default-constructed
+// grid is empty instead of holding uninitialized members.
+struct ParticleAccelGrid {
+  int res = 0;
+  vector<vector<vector<vector<int>>>> indices;
+
+  // (Re)allocates the grid with inRes cells per axis; all cells start empty.
+  // A non-positive inRes yields an empty grid.
+  void init(int inRes)
+  {
+    res = inRes > 0 ? inRes : 0;
+    indices.assign(res, vector<vector<vector<int>>>(res, vector<vector<int>>(res)));
+  }
+
+  // Empties every cell; the grid dimensions are kept.
+  void clearCells()
+  {
+    for (auto &plane : indices) {
+      for (auto &row : plane) {
+        for (auto &cell : row) {
+          cell.clear();
+        }
+      }
+    }
+  }
+
+  // Appends id to the cell containing pos. Each coordinate is mapped with
+  // floor(coord / params.res * res) and clamped to [0, res - 1].
+  void insert(int id, const Vec3 &pos)
+  {
+    int i = clamp<int>(floor(pos.x / params.res * res), 0, res - 1);
+    int j = clamp<int>(floor(pos.y / params.res * res), 0, res - 1);
+    int k = clamp<int>(floor(pos.z / params.res * res), 0, res - 1);
+    indices[i][j][k].push_back(id);
+  }
+
+  // Rebuilds the grid from the positions of a particle system.
+  // Does nothing while the grid has no cells (init() not called yet).
+  void fillWith(const BasicParticleSystem &particles)
+  {
+    if (res <= 0)
+      return;
+    clearCells();
+    for (int id = 0; id < particles.size(); id++) {
+      insert(id, particles.getPos(id));
+    }
+  }
+
+  // Rebuilds the grid from a Vec3 particle data channel holding positions.
+  // Does nothing while the grid has no cells (init() not called yet).
+  void fillWith(const ParticleDataImpl<Vec3> &particles)
+  {
+    if (res <= 0)
+      return;
+    clearCells();
+    for (int id = 0; id < particles.size(); id++) {
+      insert(id, particles[id]);
+    }
+  }
+};
+
+// Opens a loop over all active particles of `points` stored in the acceleration-grid cells
+// overlapped by the axis-aligned box center +/- radius. Inside the loop body `idn` is the id
+// of the current candidate; candidates are NOT filtered by distance. The macro declares
+// minI/maxI/minJ/maxJ/minK/maxK, i, j, k and idLOOPNEIGHBORS in the enclosing scope and must
+// be closed with LOOP_NEIGHBORS_END.
+// All macro parameters are parenthesized so that expression arguments (e.g. `a + b` as
+// radius) expand with the intended precedence.
+#define LOOP_NEIGHBORS_BEGIN(points, center, radius) \
+  int minI = clamp<int>(floor(((center).x - (radius)) / params.res * (points).accel->res), \
+                        0, \
+                        (points).accel->res - 1); \
+  int maxI = clamp<int>(floor(((center).x + (radius)) / params.res * (points).accel->res), \
+                        0, \
+                        (points).accel->res - 1); \
+  int minJ = clamp<int>(floor(((center).y - (radius)) / params.res * (points).accel->res), \
+                        0, \
+                        (points).accel->res - 1); \
+  int maxJ = clamp<int>(floor(((center).y + (radius)) / params.res * (points).accel->res), \
+                        0, \
+                        (points).accel->res - 1); \
+  int minK = clamp<int>(floor(((center).z - (radius)) / params.res * (points).accel->res), \
+                        0, \
+                        (points).accel->res - 1); \
+  int maxK = clamp<int>(floor(((center).z + (radius)) / params.res * (points).accel->res), \
+                        0, \
+                        (points).accel->res - 1); \
+  for (int i = minI; i <= maxI; i++) { \
+    for (int j = minJ; j <= maxJ; j++) { \
+      for (int k = minK; k <= maxK; k++) { \
+        for (int idLOOPNEIGHBORS = 0; \
+             idLOOPNEIGHBORS < (int)(points).accel->indices[i][j][k].size(); \
+             idLOOPNEIGHBORS++) { \
+          int idn = (points).accel->indices[i][j][k][idLOOPNEIGHBORS]; \
+          if ((points).isActive(idn)) {
+// Closes the five scopes (three cell loops, the id loop and the isActive test) opened by
+// LOOP_NEIGHBORS_BEGIN.
+#define LOOP_NEIGHBORS_END \
+  } \
+  } \
+  } \
+  } \
+  }
+
+// Opens a loop that visits `pos` together with its mirror images ("ghosts") across every
+// domain boundary plane (params.bnd*) lying within `radius` of pos. Boundaries are tested in
+// the order X-, X+, Y-, Y+, Z-, Z+; on each pass gPos holds the mirrored position. The last
+// pass (flagLOOPGHOSTS == 6) exposes the unmirrored position itself, after which the while
+// condition ends the loop. The body must be closed with LOOP_GHOSTS_END.
+// Macro parameters are parenthesized, and the upper-z test reads `.z` like every other
+// access in this macro (it previously read `.Z`).
+#define LOOP_GHOSTS_POS_BEGIN(pos, radius) \
+  int flagLOOPGHOSTS = -1; \
+  Vec3 gPos; \
+  while (flagLOOPGHOSTS < 6) { \
+    if (flagLOOPGHOSTS < 0 && (pos).x - params.bndXm <= (radius)) { \
+      flagLOOPGHOSTS = 0; \
+      gPos = Vec3(2.f * params.bndXm - (pos).x, (pos).y, (pos).z); \
+    } \
+    else if (flagLOOPGHOSTS < 1 && params.bndXp - (pos).x <= (radius)) { \
+      flagLOOPGHOSTS = 1; \
+      gPos = Vec3(2.f * params.bndXp - (pos).x, (pos).y, (pos).z); \
+    } \
+    else if (flagLOOPGHOSTS < 2 && (pos).y - params.bndYm <= (radius)) { \
+      flagLOOPGHOSTS = 2; \
+      gPos = Vec3((pos).x, 2.f * params.bndYm - (pos).y, (pos).z); \
+    } \
+    else if (flagLOOPGHOSTS < 3 && params.bndYp - (pos).y <= (radius)) { \
+      flagLOOPGHOSTS = 3; \
+      gPos = Vec3((pos).x, 2.f * params.bndYp - (pos).y, (pos).z); \
+    } \
+    else if (flagLOOPGHOSTS < 4 && (pos).z - params.bndZm <= (radius)) { \
+      flagLOOPGHOSTS = 4; \
+      gPos = Vec3((pos).x, (pos).y, 2.f * params.bndZm - (pos).z); \
+    } \
+    else if (flagLOOPGHOSTS < 5 && params.bndZp - (pos).z <= (radius)) { \
+      flagLOOPGHOSTS = 5; \
+      gPos = Vec3((pos).x, (pos).y, 2.f * params.bndZp - (pos).z); \
+    } \
+    else { \
+      flagLOOPGHOSTS = 6; \
+      gPos = Vec3((pos).x, (pos).y, (pos).z); \
+    }
+// Same traversal as LOOP_GHOSTS_POS_BEGIN, but additionally mirrors a normal: on each pass
+// gPos is the position reflected across the boundary plane and gNormal is `normal` with the
+// component along that plane's axis negated. The last pass (flagLOOPGHOSTS == 6) exposes
+// the original pos and normal. The body must be closed with LOOP_GHOSTS_END.
+// Macro parameters are parenthesized, and the upper-z test reads `.z` like every other
+// access in this macro (it previously read `.Z`).
+#define LOOP_GHOSTS_POS_NORMAL_BEGIN(pos, normal, radius) \
+  int flagLOOPGHOSTS = -1; \
+  Vec3 gPos, gNormal; \
+  while (flagLOOPGHOSTS < 6) { \
+    if (flagLOOPGHOSTS < 0 && (pos).x - params.bndXm <= (radius)) { \
+      flagLOOPGHOSTS = 0; \
+      gPos = Vec3(2.f * params.bndXm - (pos).x, (pos).y, (pos).z); \
+      gNormal = Vec3(-(normal).x, (normal).y, (normal).z); \
+    } \
+    else if (flagLOOPGHOSTS < 1 && params.bndXp - (pos).x <= (radius)) { \
+      flagLOOPGHOSTS = 1; \
+      gPos = Vec3(2.f * params.bndXp - (pos).x, (pos).y, (pos).z); \
+      gNormal = Vec3(-(normal).x, (normal).y, (normal).z); \
+    } \
+    else if (flagLOOPGHOSTS < 2 && (pos).y - params.bndYm <= (radius)) { \
+      flagLOOPGHOSTS = 2; \
+      gPos = Vec3((pos).x, 2.f * params.bndYm - (pos).y, (pos).z); \
+      gNormal = Vec3((normal).x, -(normal).y, (normal).z); \
+    } \
+    else if (flagLOOPGHOSTS < 3 && params.bndYp - (pos).y <= (radius)) { \
+      flagLOOPGHOSTS = 3; \
+      gPos = Vec3((pos).x, 2.f * params.bndYp - (pos).y, (pos).z); \
+      gNormal = Vec3((normal).x, -(normal).y, (normal).z); \
+    } \
+    else if (flagLOOPGHOSTS < 4 && (pos).z - params.bndZm <= (radius)) { \
+      flagLOOPGHOSTS = 4; \
+      gPos = Vec3((pos).x, (pos).y, 2.f * params.bndZm - (pos).z); \
+      gNormal = Vec3((normal).x, (normal).y, -(normal).z); \
+    } \
+    else if (flagLOOPGHOSTS < 5 && params.bndZp - (pos).z <= (radius)) { \
+      flagLOOPGHOSTS = 5; \
+      gPos = Vec3((pos).x, (pos).y, 2.f * params.bndZp - (pos).z); \
+      gNormal = Vec3((normal).x, (normal).y, -(normal).z); \
+    } \
+    else { \
+      flagLOOPGHOSTS = 6; \
+      gPos = (pos); \
+      gNormal = (normal); \
+    }
+// Closes the while loop opened by LOOP_GHOSTS_POS_BEGIN / LOOP_GHOSTS_POS_NORMAL_BEGIN.
+#define LOOP_GHOSTS_END }
+
+//
+// **** Wrappers around point sets to attach it an acceleration grid ****
+//
+// Abstract base that pairs a point set with the acceleration grid used for its neighbor
+// queries. The grid is referenced through a raw pointer that this class never deletes.
+struct PointSetWrapper {
+  ParticleAccelGrid *accel;
+
+  // inAccel: acceleration grid that derived wrappers fill and query.
+  PointSetWrapper(ParticleAccelGrid *inAccel) : accel(inAccel)
+  {
+  }
+  // Virtual destructor so that deleting a derived wrapper through a PointSetWrapper pointer
+  // is well defined.
+  virtual ~PointSetWrapper()
+  {
+  }
+  // Rebuilds `accel` from the wrapped point set; implemented by each concrete wrapper.
+  virtual void updateAccel() = 0;
+};
+
+//! Wrapper that pairs a BasicParticleSystem with an acceleration grid.
+//! Most members forward directly to the wrapped system; the neighbour queries walk the
+//! cells of the acceleration grid. `points` is not set by the constructor and has to be
+//! assigned by the owner before any forwarding member is called.
+struct BasicParticleSystemWrapper : PointSetWrapper {
+  BasicParticleSystem *points;
+
+  // Start from a null particle system rather than an indeterminate pointer.
+  BasicParticleSystemWrapper(ParticleAccelGrid *inAccel) : PointSetWrapper(inAccel), points(0)
+  {
+  }
+
+  Vec3 getPos(int id) const
+  {
+    return points->getPos(id);
+  }
+  void setPos(int id, Vec3 pos)
+  {
+    points->setPos(id, pos);
+  }
+  //! Refill the acceleration grid from the wrapped particle system.
+  void updateAccel()
+  {
+    accel->fillWith(*points);
+  }
+  void clear()
+  {
+    points->clear();
+  }
+  int size() const
+  {
+    return points->size();
+  }
+  bool isActive(int id) const
+  {
+    return points->isActive(id);
+  }
+  void addParticle(Vec3 pos)
+  {
+    points->addParticle(pos);
+  }
+  int getStatus(int id) const
+  {
+    return points->getStatus(id);
+  }
+  void addBuffered(Vec3 pos)
+  {
+    points->addBuffered(pos);
+  }
+  void doCompress()
+  {
+    points->doCompress();
+  }
+  void insertBufferedParticles()
+  {
+    points->insertBufferedParticles();
+  }
+  void kill(int id)
+  {
+    points->kill(id);
+  }
+
+  //! True if an active particle registered in the acceleration grid lies within `radius`
+  //! of `pos`.
+  bool hasNeighbor(Vec3 pos, Real radius) const
+  {
+    return anyActiveWithin(pos, radius, false, 0);
+  }
+
+  //! Same as hasNeighbor() around particle `idx`, but particle `idx` itself does not count.
+  bool hasNeighborOtherThanItself(int idx, Real radius) const
+  {
+    return anyActiveWithin(points->getPos(idx), radius, true, idx);
+  }
+
+  //! Drop every entry of `indices` that refers to an inactive particle; the order of the
+  //! remaining entries is preserved.
+  void removeInvalidIndices(vector<int> &indices)
+  {
+    // Compact in place instead of copying the whole vector first.
+    size_t kept = 0;
+    for (size_t i = 0; i < indices.size(); i++) {
+      if (points->isActive(indices[i])) {
+        indices[kept++] = indices[i];
+      }
+    }
+    indices.resize(kept);
+  }
+
+ private:
+  //! Shared implementation of the neighbour queries: scans the acceleration-grid cells
+  //! overlapped by the query sphere and stops at the first active particle within `radius`
+  //! of `center`. When `skipSelf` is set, index `selfIdx` is ignored.
+  bool anyActiveWithin(Vec3 center, Real radius, bool skipSelf, int selfIdx) const
+  {
+    // Cell range covered by the sphere, clamped to the grid extent.
+    const int last = accel->res - 1;
+    const int minI = clamp<int>(floor((center.x - radius) / params.res * accel->res), 0, last);
+    const int maxI = clamp<int>(floor((center.x + radius) / params.res * accel->res), 0, last);
+    const int minJ = clamp<int>(floor((center.y - radius) / params.res * accel->res), 0, last);
+    const int maxJ = clamp<int>(floor((center.y + radius) / params.res * accel->res), 0, last);
+    const int minK = clamp<int>(floor((center.z - radius) / params.res * accel->res), 0, last);
+    const int maxK = clamp<int>(floor((center.z + radius) / params.res * accel->res), 0, last);
+    for (int i = minI; i <= maxI; i++) {
+      for (int j = minJ; j <= maxJ; j++) {
+        for (int k = minK; k <= maxK; k++) {
+          for (int id = 0; id < (int)accel->indices[i][j][k].size(); id++) {
+            const int candidate = accel->indices[i][j][k][id];
+            if (skipSelf && candidate == selfIdx) {
+              continue;
+            }
+            if (points->isActive(candidate) && norm(points->getPos(candidate) - center) <= radius) {
+              return true;
+            }
+          }
+        }
+      }
+    }
+    return false;
+  }
+};
+
+//! Wrapper that pairs a ParticleDataImpl<Vec3> channel with an acceleration grid, so that
+//! the stored vectors can be used as positions for neighbour queries.
+//! `points` is not set by the constructor and has to be assigned by the owner before use.
+struct ParticleDataImplVec3Wrapper : PointSetWrapper {
+  ParticleDataImpl<Vec3> *points;
+
+  // Start from a null data channel rather than an indeterminate pointer.
+  ParticleDataImplVec3Wrapper(ParticleAccelGrid *inAccel) : PointSetWrapper(inAccel), points(0)
+  {
+  }
+
+  Vec3 getVec3(int id) const
+  {
+    return (*points)[id];
+  }
+  void setVec3(int id, Vec3 vec)
+  {
+    (*points)[id] = vec;
+  }
+  //! Refill the acceleration grid from the wrapped data channel.
+  void updateAccel()
+  {
+    accel->fillWith(*points);
+  }
+  //! Every entry counts as active; the index is ignored.
+  bool isActive(int /*i*/) const
+  {
+    return true;
+  }
+};
+
+//
+// **** globals ****
+//
+ParticleAccelGrid accelCoarse, accelSurface;  // grids handed to the coarse / surface wrappers below
+BasicParticleSystemWrapper coarseParticles(&accelCoarse), surfacePoints(&accelSurface);
+ParticleDataImplVec3Wrapper coarseParticlesPrevPos(
+    &accelCoarse);             // WARNING: shares the coarse accel grid to save space; do not query
+                               // coarseParticlesPrevPos and coarseParticles at the same time.
+vector<Vec3> tempSurfaceVec3;  // per-surface-point Vec3 scratch buffer (resized before each use)
+vector<Real> tempSurfaceFloat;  // per-surface-point scalar scratch buffer (resized before each use)
+int frameCount = 0;
+
+//
+//**** weighting kernels *****
+//
+//! Linear falloff: 1 at distance 0, 0 at distance == radius.
+//! No clamping happens here; distances beyond the radius give negative values.
+Real triangularWeight(Real distance, Real radius)
+{
+  const Real ratio = distance / radius;
+  return 1.0f - ratio;
+}
+//! Gaussian-shaped falloff exp(-falloff * (distance / radius)^2), cut to 0 beyond `radius`.
+Real exponentialWeight(Real distance, Real radius, Real falloff)
+{
+  if (!(distance > radius)) {
+    const Real ratio = distance / radius;
+    return expf(-falloff * ratio * ratio);
+  }
+  return 0;
+}
+
+//! Advection weight: triangular kernel with support 2 * params.outerRadius, 0 outside.
+Real weightKernelAdvection(Real distance)
+{
+  return (distance > 2.f * params.outerRadius) ?
+             Real(0) :
+             triangularWeight(distance, 2.f * params.outerRadius);
+}
+
+//! Coarse-density weight: exponential kernel with support params.outerRadius and falloff 2.
+Real weightKernelCoarseDensity(Real distance)
+{
+  const Real falloff = 2.0f;
+  return exponentialWeight(distance, params.outerRadius, falloff);
+}
+
+//! Weight for normal estimation: triangular kernel with support params.normalRadius,
+//! 0 outside.
+Real weightSurfaceNormal(Real distance)
+{
+  return (distance > params.normalRadius) ? Real(0) :
+                                            triangularWeight(distance, params.normalRadius);
+}
+
+//! Weight for tangential interactions: triangular kernel with support params.tangentRadius,
+//! 0 outside.
+Real weightSurfaceTangent(Real distance)
+{
+  return (distance > params.tangentRadius) ? Real(0) :
+                                             triangularWeight(distance, params.tangentRadius);
+}
+
+//
+// **** utility ****
+//
+
+//! True if `pos` lies inside the axis-aligned box given by params.bnd{X,Y,Z}{m,p}
+//! (bounds inclusive).
+bool isInDomain(Vec3 pos)
+{
+  const bool insideX = params.bndXm <= pos.x && pos.x <= params.bndXp;
+  const bool insideY = params.bndYm <= pos.y && pos.y <= params.bndYp;
+  const bool insideZ = params.bndZm <= pos.z && pos.z <= params.bndZp;
+  return insideX && insideY && insideZ;
+}
+
+//! Cubic Hermite step: maps `val` linearly from [edgeLeft, edgeRight] to [0, 1], clamps,
+//! then applies t^2 * (3 - 2t).
+Real smoothstep(Real edgeLeft, Real edgeRight, Real val)
+{
+  const Real t = clamp((val - edgeLeft) / (edgeRight - edgeLeft), Real(0.), Real(1.));
+  return t * t * (3 - 2 * t);
+}
+
+//
+// **** surface initialization ****
+//
+
+//! True if any cell in the 3x3x3 block around the cell containing `pos` is not flagged
+//! as fluid. Returns as soon as the first such cell is found.
+static bool hasNonFluidCellNearby(const FlagGrid &flags, Vec3 pos)
+{
+  for (int i = -1; i <= 1; i++) {
+    for (int j = -1; j <= 1; j++) {
+      for (int k = -1; k <= 1; k++) {
+        if (!flags.isFluid(((int)pos.x) + i, ((int)pos.y) + j, ((int)pos.z) + k)) {
+          return true;
+        }
+      }
+    }
+  }
+  return false;
+}
+
+//! Create the initial set of surface points.
+//! `surfacePoints` is cleared; then, for every active coarse particle that has a non-fluid
+//! cell nearby, candidate points are sampled on a sphere of radius params.outerRadius
+//! around it. A candidate is kept only if no other coarse particle is closer to it than
+//! params.outerRadius.
+//! \param coarseParticles coarse particles (its acceleration grid is used for the lookups)
+//! \param surfacePoints   receives the generated surface points
+//! \param flags           cell flags used for the near-surface test
+void initFines(const BasicParticleSystemWrapper &coarseParticles,
+               BasicParticleSystemWrapper &surfacePoints,
+               const FlagGrid &flags)
+{
+  // The polar loop below takes discretization / 2 steps of size dtheta, which spans
+  // [0, pi] only if the cast applies to the whole quotient. Casting M_PI alone would
+  // truncate it to 3 and stop the sweep short of theta = pi.
+  unsigned int discretization = (unsigned int)(M_PI * (params.outerRadius + params.innerRadius) /
+                                               params.meanFineDistance);
+  Real dtheta = 2 * params.meanFineDistance / (params.outerRadius + params.innerRadius);
+  Real outerRadius2 = params.outerRadius * params.outerRadius;
+
+  surfacePoints.clear();
+  for (int idx = 0; idx < (int)coarseParticles.size(); idx++) {
+
+    // progress report
+    if (idx % 500 == 0) {
+      cout << "Initializing surface points : " << setprecision(4)
+           << 100.f * idx / coarseParticles.size() << "%" << endl;
+    }
+
+    if (!coarseParticles.isActive(idx)) {
+      continue;
+    }
+
+    // only particles next to a non-fluid cell contribute surface points
+    Vec3 pos = coarseParticles.getPos(idx);
+    if (!hasNonFluidCellNearby(flags, pos)) {
+      continue;
+    }
+
+    for (unsigned int i = 0; i <= discretization / 2; ++i) {
+      Real theta = i * dtheta;
+      // number of azimuthal samples on this latitude ring
+      Real discretization2 = Real(floor(2 * M_PI * sin(theta) / dtheta) + 1);
+      for (Real phi = 0; phi < 2 * M_PI; phi += Real(2 * M_PI / discretization2)) {
+        Vec3 normal(sin(theta) * cos(phi), cos(theta), sin(theta) * sin(phi));
+        Vec3 position = pos + params.outerRadius * normal;
+
+        // reject candidates lying inside the outer sphere of another coarse particle
+        bool valid = true;
+        LOOP_NEIGHBORS_BEGIN(coarseParticles, position, 2.f * params.outerRadius)
+        if (idx != idn && normSquare(position - coarseParticles.getPos(idn)) < outerRadius2) {
+          valid = false;
+          break;
+        }
+        LOOP_NEIGHBORS_END
+        if (valid) {
+          surfacePoints.addParticle(position);
+        }
+      }
+    }
+  }
+}
+
+//
+// **** surface advection ****
+//
+
+//! Kernel: move every active surface point by the weighted mean displacement of the
+//! coarse particles around it. Neighbours are looked up around the previous coarse
+//! positions; coarse particles flagged PNEW or PDELETE are ignored.
+//! Constructing the object runs the kernel over all surface points.
+struct advectSurfacePoints : public KernelBase {
+  advectSurfacePoints(BasicParticleSystemWrapper &surfacePoints,
+                      const BasicParticleSystemWrapper &coarseParticles,
+                      const ParticleDataImplVec3Wrapper &coarseParticlesPrevPos)
+      : KernelBase(surfacePoints.size()),
+        surfacePoints(surfacePoints),
+        coarseParticles(coarseParticles),
+        coarseParticlesPrevPos(coarseParticlesPrevPos)
+  {
+    runMessage();
+    run();
+  }
+  //! Per-point body of the kernel.
+  inline void op(IndexInt idx,
+                 BasicParticleSystemWrapper &surfacePoints,
+                 const BasicParticleSystemWrapper &coarseParticles,
+                 const ParticleDataImplVec3Wrapper &coarseParticlesPrevPos) const
+  {
+    if (!surfacePoints.isActive(idx)) {
+      return;
+    }
+
+    Vec3 startPos = surfacePoints.getPos(idx);
+    Vec3 meanShift(0, 0, 0);
+    Real weightSum = 0;
+    LOOP_NEIGHBORS_BEGIN(coarseParticlesPrevPos, startPos, 2.0f * params.outerRadius)
+    // skip coarse particles that were just created or are about to be removed
+    const int status = coarseParticles.getStatus(idn);
+    if ((status & (ParticleBase::PNEW | ParticleBase::PDELETE)) == 0) {
+      Vec3 prevPos = coarseParticlesPrevPos.getVec3(idn);
+      Real w = weightKernelAdvection(norm(prevPos - startPos));
+      meanShift += w * (coarseParticles.getPos(idn) - prevPos);
+      weightSum += w;
+    }
+    LOOP_NEIGHBORS_END
+    if (weightSum != 0) {
+      meanShift /= weightSum;
+    }
+    surfacePoints.setPos(idx, startPos + meanShift);
+  }
+  inline BasicParticleSystemWrapper &getArg0()
+  {
+    return surfacePoints;
+  }
+  typedef BasicParticleSystemWrapper type0;
+  inline const BasicParticleSystemWrapper &getArg1()
+  {
+    return coarseParticles;
+  }
+  typedef BasicParticleSystemWrapper type1;
+  inline const ParticleDataImplVec3Wrapper &getArg2()
+  {
+    return coarseParticlesPrevPos;
+  }
+  typedef ParticleDataImplVec3Wrapper type2;
+  //! Debug output emitted before the kernel runs.
+  void runMessage()
+  {
+    debMsg("Executing kernel advectSurfacePoints ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  }
+  //! Processes one sub-range handed out by tbb::parallel_for.
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const IndexInt last = (IndexInt)__r.end();
+    for (IndexInt idx = __r.begin(); idx != last; idx++) {
+      op(idx, surfacePoints, coarseParticles, coarseParticlesPrevPos);
+    }
+  }
+  //! Runs the kernel over [0, size) with TBB.
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+  BasicParticleSystemWrapper &surfacePoints;
+  const BasicParticleSystemWrapper &coarseParticles;
+  const ParticleDataImplVec3Wrapper &coarseParticlesPrevPos;
+};
+
+//
+// **** value and gradient of level-set band constraint ****
+//
+//! Level of `pos` inside the constraint band around the coarse particles.
+//! Sums exp(-constraintA * d^2) over the coarse neighbours within 1.5 * outerRadius, caps
+//! the sum at 1, converts it back to a distance and rescales so that innerRadius maps
+//! to 0 and outerRadius maps to 1.
+Real computeConstraintLevel(const BasicParticleSystemWrapper &coarseParticles, Vec3 pos)
+{
+  Real density = 0.0f;
+  LOOP_NEIGHBORS_BEGIN(coarseParticles, pos, 1.5f * params.outerRadius)
+  density += expf(-params.constraintA * normSquare(coarseParticles.getPos(idn) - pos));
+  LOOP_NEIGHBORS_END
+
+  const Real capped = (density > 1.0f) ? Real(1.0f) : density;
+  const float surfaceDist = sqrtf(-logf(capped) / params.constraintA);
+  const Real level = (surfaceDist - params.innerRadius) /
+                     (params.outerRadius - params.innerRadius);
+  return level;
+}
+
+//! Normalized direction of the constraint field at `pos`: the sum over coarse neighbours
+//! within 1.5 * outerRadius of 2 * constraintA * exp(-constraintA * d^2) * (pos - neighbour),
+//! passed through getNormalized().
+Vec3 computeConstraintGradient(const BasicParticleSystemWrapper &coarseParticles, Vec3 pos)
+{
+  Vec3 accum(0, 0, 0);
+  LOOP_NEIGHBORS_BEGIN(coarseParticles, pos, 1.5f * params.outerRadius)
+  Vec3 neighbor = coarseParticles.getPos(idn);
+  Real falloff = (Real)(expf(-params.constraintA * normSquare(neighbor - pos)));
+  accum += 2.f * params.constraintA * falloff * (pos - neighbor);
+  LOOP_NEIGHBORS_END
+  return getNormalized(accum);
+}
+
+//
+// **** compute surface normals ****
+//
+
+//! Kernel: estimate a normal for every surface point.
+//! The constraint gradient gives a first guess and a local tangent frame; a weighted
+//! linear fit through the neighbouring surface points (including ghost copies produced by
+//! the LOOP_GHOSTS macros) in that frame yields the final normal, oriented to agree with
+//! the gradient. A degenerate fit (zero determinant) stores a zero vector.
+//! Constructing the object runs the kernel over all surface points.
+struct computeSurfaceNormals : public KernelBase {
+  computeSurfaceNormals(const BasicParticleSystemWrapper &surfacePoints,
+                        const BasicParticleSystemWrapper &coarseParticles,
+                        ParticleDataImpl<Vec3> &surfaceNormals)
+      : KernelBase(surfacePoints.size()),
+        surfacePoints(surfacePoints),
+        coarseParticles(coarseParticles),
+        surfaceNormals(surfaceNormals)
+  {
+    runMessage();
+    run();
+  }
+  //! Per-point body of the kernel.
+  inline void op(IndexInt idx,
+                 const BasicParticleSystemWrapper &surfacePoints,
+                 const BasicParticleSystemWrapper &coarseParticles,
+                 ParticleDataImpl<Vec3> &surfaceNormals) const
+  {
+    Vec3 pos = surfacePoints.getPos(idx);
+
+    // first guess of the normal: direction of the constraint gradient
+    Vec3 gradient = computeConstraintGradient(coarseParticles, pos);
+    Vec3 n = getNormalized(gradient);
+
+    // initial tangent frame (t1, t2, n): cross n with whichever of the x / y axes has the
+    // smaller absolute dot product with it
+    Vec3 axisX(1, 0, 0);
+    Vec3 axisY(0, 1, 0);
+    Real alignX = dot(n, axisX);
+    Real alignY = dot(n, axisY);
+    Vec3 t1 = getNormalized(fabs(alignX) < fabs(alignY) ? cross(n, axisX) : cross(n, axisY));
+    Vec3 t2 = getNormalized(cross(n, t1));
+
+    // weighted moment sums for the linear fit in the tangent frame
+    Real sw = 0;
+    Real swx = 0;
+    Real swy = 0;
+    Real swxy = 0;
+    Real swx2 = 0;
+    Real swy2 = 0;
+    Real swxz = 0;
+    Real swyz = 0;
+    Real swz = 0;
+    LOOP_NEIGHBORS_BEGIN(surfacePoints, pos, params.normalRadius)
+    LOOP_GHOSTS_POS_BEGIN(surfacePoints.getPos(idn), params.normalRadius)
+    Vec3 rel = gPos - pos;
+    Real x = dot(rel, t1);
+    Real y = dot(rel, t2);
+    Real z = dot(rel, n);
+    Real w = weightSurfaceNormal(norm(pos - gPos));
+    swx2 += w * x * x;
+    swy2 += w * y * y;
+    swxy += w * x * y;
+    swxz += w * x * z;
+    swyz += w * y * z;
+    swx += w * x;
+    swy += w * y;
+    swz += w * z;
+    sw += w;
+    LOOP_GHOSTS_END
+    LOOP_NEIGHBORS_END
+
+    // determinant of the 3x3 system of the fit
+    Real det = -sw * swxy * swxy + 2.f * swx * swxy * swy - swx2 * swy * swy - swx * swx * swy2 +
+               sw * swx2 * swy2;
+    if (det == 0) {
+      // degenerate neighbourhood: no normal available
+      surfaceNormals[idx] = Vec3(0, 0, 0);
+      return;
+    }
+
+    // closed-form solution of the fit
+    Vec3 abc = 1.f / det *
+               Vec3(swxz * (-swy * swy + sw * swy2) + swyz * (-sw * swxy + swx * swy) +
+                        swz * (swxy * swy - swx * swy2),
+                    swxz * (-sw * swxy + swx * swy) + swyz * (-swx * swx + sw * swx2) +
+                        swz * (swx * swxy - swx2 * swy),
+                    swxz * (swxy * swy - swx * swy2) + swyz * (swx * swxy - swx2 * swy) +
+                        swz * (-swxy * swxy + swx2 * swy2));
+    Vec3 normal = -getNormalized(t1 * abc.x + t2 * abc.y - n);
+    // keep the normal on the same side as the gradient
+    if (dot(gradient, normal) < 0) {
+      normal = -normal;
+    }
+    surfaceNormals[idx] = normal;
+  }
+  inline const BasicParticleSystemWrapper &getArg0()
+  {
+    return surfacePoints;
+  }
+  typedef BasicParticleSystemWrapper type0;
+  inline const BasicParticleSystemWrapper &getArg1()
+  {
+    return coarseParticles;
+  }
+  typedef BasicParticleSystemWrapper type1;
+  inline ParticleDataImpl<Vec3> &getArg2()
+  {
+    return surfaceNormals;
+  }
+  typedef ParticleDataImpl<Vec3> type2;
+  //! Debug output emitted before the kernel runs.
+  void runMessage()
+  {
+    debMsg("Executing kernel computeSurfaceNormals ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  }
+  //! Processes one sub-range handed out by tbb::parallel_for.
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const IndexInt last = (IndexInt)__r.end();
+    for (IndexInt idx = __r.begin(); idx != last; idx++) {
+      op(idx, surfacePoints, coarseParticles, surfaceNormals);
+    }
+  }
+  //! Runs the kernel over [0, size) with TBB.
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+  const BasicParticleSystemWrapper &surfacePoints;
+  const BasicParticleSystemWrapper &coarseParticles;
+  ParticleDataImpl<Vec3> &surfaceNormals;
+};
+
+//
+// **** smooth surface normals ****
+//
+
+//! Kernel: for every surface point, blend the normals of the surface points within
+//! params.normalRadius (weighted by weightSurfaceNormal) and store the normalized result
+//! in the global scratch buffer tempSurfaceVec3.
+//! Constructing the object runs the kernel over all surface points.
+struct computeAveragedNormals : public KernelBase {
+  computeAveragedNormals(const BasicParticleSystemWrapper &surfacePoints,
+                         const ParticleDataImpl<Vec3> &surfaceNormals)
+      : KernelBase(surfacePoints.size()),
+        surfacePoints(surfacePoints),
+        surfaceNormals(surfaceNormals)
+  {
+    runMessage();
+    run();
+  }
+  //! Per-point body of the kernel.
+  inline void op(IndexInt idx,
+                 const BasicParticleSystemWrapper &surfacePoints,
+                 const ParticleDataImpl<Vec3> &surfaceNormals) const
+  {
+    Vec3 pos = surfacePoints.getPos(idx);
+    Vec3 blended(0, 0, 0);
+    LOOP_NEIGHBORS_BEGIN(surfacePoints, pos, params.normalRadius)
+    Real w = weightSurfaceNormal(norm(pos - surfacePoints.getPos(idn)));
+    blended += w * surfaceNormals[idn];
+    LOOP_NEIGHBORS_END
+    tempSurfaceVec3[idx] = getNormalized(blended);
+  }
+  inline const BasicParticleSystemWrapper &getArg0()
+  {
+    return surfacePoints;
+  }
+  typedef BasicParticleSystemWrapper type0;
+  inline const ParticleDataImpl<Vec3> &getArg1()
+  {
+    return surfaceNormals;
+  }
+  typedef ParticleDataImpl<Vec3> type1;
+  //! Debug output emitted before the kernel runs.
+  void runMessage()
+  {
+    debMsg("Executing kernel computeAveragedNormals ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  }
+  //! Processes one sub-range handed out by tbb::parallel_for.
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const IndexInt last = (IndexInt)__r.end();
+    for (IndexInt idx = __r.begin(); idx != last; idx++) {
+      op(idx, surfacePoints, surfaceNormals);
+    }
+  }
+  //! Runs the kernel over [0, size) with TBB.
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+  const BasicParticleSystemWrapper &surfacePoints;
+  const ParticleDataImpl<Vec3> &surfaceNormals;
+};
+
+//! Kernel: copy the global scratch buffer tempSurfaceVec3 into `surfaceNormals`, entry by
+//! entry. Constructing the object runs the kernel over all surface points.
+struct assignNormals : public KernelBase {
+  assignNormals(const BasicParticleSystemWrapper &surfacePoints,
+                ParticleDataImpl<Vec3> &surfaceNormals)
+      : KernelBase(surfacePoints.size()),
+        surfacePoints(surfacePoints),
+        surfaceNormals(surfaceNormals)
+  {
+    runMessage();
+    run();
+  }
+  //! Per-point body of the kernel; `surfacePoints` is not read here.
+  inline void op(IndexInt idx,
+                 const BasicParticleSystemWrapper &surfacePoints,
+                 ParticleDataImpl<Vec3> &surfaceNormals) const
+  {
+    surfaceNormals[idx] = tempSurfaceVec3[idx];
+  }
+  inline const BasicParticleSystemWrapper &getArg0()
+  {
+    return surfacePoints;
+  }
+  typedef BasicParticleSystemWrapper type0;
+  inline ParticleDataImpl<Vec3> &getArg1()
+  {
+    return surfaceNormals;
+  }
+  typedef ParticleDataImpl<Vec3> type1;
+  //! Debug output emitted before the kernel runs.
+  void runMessage()
+  {
+    debMsg("Executing kernel assignNormals ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  }
+  //! Processes one sub-range handed out by tbb::parallel_for.
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const IndexInt last = (IndexInt)__r.end();
+    for (IndexInt idx = __r.begin(); idx != last; idx++) {
+      op(idx, surfacePoints, surfaceNormals);
+    }
+  }
+  //! Runs the kernel over [0, size) with TBB.
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+  const BasicParticleSystemWrapper &surfacePoints;
+  ParticleDataImpl<Vec3> &surfaceNormals;
+};
+
+//! One smoothing pass over the surface normals: the averaged normals are first written to
+//! the scratch buffer tempSurfaceVec3, then copied back into `surfaceNormals`, so that the
+//! averaging reads only un-smoothed values.
+void smoothSurfaceNormals(const BasicParticleSystemWrapper &surfacePoints,
+                          ParticleDataImpl<Vec3> &surfaceNormals)
+{
+  const int pointCount = surfacePoints.size();
+  tempSurfaceVec3.resize(pointCount);
+
+  // each kernel runs from its constructor
+  computeAveragedNormals(surfacePoints, surfaceNormals);
+  assignNormals(surfacePoints, surfaceNormals);
+}
+
+//
+// **** addition/deletion of particles. Not parallel to prevent write/delete conflicts ****
+//
+
+//! Insert surface points where the sampling is too sparse and remove points that are too
+//! dense, outside the domain, or too far from the coarse particles. Runs serially.
+//!
+//! Steps:
+//!  1. For each point, compute a unit tangential direction pointing away from its
+//!     neighbours and buffer a new point one meanFineDistance along it, unless that
+//!     position is outside the domain or already has a neighbour.
+//!  2. Compress the particle system and insert the buffered points.
+//!  3. Kill points that are outside the domain or have another point within
+//!     0.67 * meanFineDistance.
+//!  4. Kill points without a coarse particle within 2 * outerRadius.
+//!  5. Kill points whose constraint level is outside [-0.2, 1.2].
+//!  6. Compress and insert buffered points again.
+//!
+//! Uses the global `coarseParticles` wrapper for all coarse-particle queries.
+void addDeleteSurfacePoints(BasicParticleSystemWrapper &surfacePoints)
+{
+  int fixedSize = surfacePoints.size();
+  for (int idx = 0; idx < fixedSize; idx++) {
+    // compute proxy tangent displacement
+    Vec3 pos = surfacePoints.getPos(idx);
+
+    Vec3 gradient = computeConstraintGradient(coarseParticles, pos);
+
+    Vec3 tangentDisplacement(0, 0, 0);
+    LOOP_NEIGHBORS_BEGIN(surfacePoints, pos, params.tangentRadius)
+    if (idn != idx) {
+      Vec3 dir = pos - surfacePoints.getPos(idn);
+      Real length = norm(dir);
+      dir = getNormalized(dir);
+
+      // Decompose direction into normal and tangent directions.
+      Vec3 dn = dot(dir, gradient) * gradient;
+      Vec3 dt = dir - dn;
+
+      // Only the direction of the sum is used below, so no weight total is needed.
+      tangentDisplacement += weightSurfaceTangent(length) * dt;
+    }
+    LOOP_NEIGHBORS_END
+    if (norm(tangentDisplacement) != 0) {
+      tangentDisplacement = getNormalized(tangentDisplacement);
+    }
+
+    // check density criterion, add surface point if necessary
+    Vec3 creationPos = pos + params.meanFineDistance * tangentDisplacement;
+    if (isInDomain(creationPos) &&
+        !surfacePoints.hasNeighbor(creationPos, params.meanFineDistance - (1e-6))) {
+      // create point
+      surfacePoints.addBuffered(creationPos);
+    }
+  }
+
+  surfacePoints.doCompress();
+  surfacePoints.insertBufferedParticles();
+
+  // check density criterion, delete surface points if necessary
+  fixedSize = surfacePoints.size();
+  for (int idx = 0; idx < fixedSize; idx++) {
+    if (!isInDomain(surfacePoints.getPos(idx)) ||
+        surfacePoints.hasNeighborOtherThanItself(idx, 0.67 * params.meanFineDistance)) {
+      surfacePoints.kill(idx);
+    }
+  }
+
+  // delete surface points if no coarse neighbors in advection radius
+  fixedSize = surfacePoints.size();
+  for (int idx = 0; idx < fixedSize; idx++) {
+    Vec3 pos = surfacePoints.getPos(idx);
+    if (!coarseParticles.hasNeighbor(pos, 2.f * params.outerRadius)) {
+      surfacePoints.kill(idx);
+    }
+  }
+
+  // delete surface point if too far from constraint
+  fixedSize = surfacePoints.size();
+  for (int idx = 0; idx < fixedSize; idx++) {
+    Real level = computeConstraintLevel(coarseParticles, surfacePoints.getPos(idx));
+    if (level < -0.2 || level > 1.2) {
+      surfacePoints.kill(idx);
+    }
+  }
+
+  surfacePoints.doCompress();
+  surfacePoints.insertBufferedParticles();
+}
+
+//
+// **** surface maintenance ****
+//
+
+//! Kernel: for every surface point, sum weightSurfaceNormal over the surface points within
+//! params.normalRadius (including ghost copies from the LOOP_GHOSTS macros) and store the
+//! sum in the global scratch buffer tempSurfaceFloat. The `dummy` argument is unused by
+//! op(). Constructing the object runs the kernel over all surface points.
+struct computeSurfaceDensities : public KernelBase {
+  computeSurfaceDensities(const BasicParticleSystemWrapper &surfacePoints, void *dummy)
+      : KernelBase(surfacePoints.size()), surfacePoints(surfacePoints), dummy(dummy)
+  {
+    runMessage();
+    run();
+  }
+  //! Per-point body of the kernel.
+  inline void op(IndexInt idx, const BasicParticleSystemWrapper &surfacePoints, void *dummy) const
+  {
+    Vec3 pos = surfacePoints.getPos(idx);
+    Real weightSum = 0;
+    LOOP_NEIGHBORS_BEGIN(surfacePoints, pos, params.normalRadius)
+    LOOP_GHOSTS_POS_BEGIN(surfacePoints.getPos(idn), params.normalRadius)
+    weightSum += weightSurfaceNormal(norm(pos - gPos));
+    LOOP_GHOSTS_END
+    LOOP_NEIGHBORS_END
+    tempSurfaceFloat[idx] = weightSum;
+  }
+  inline const BasicParticleSystemWrapper &getArg0()
+  {
+    return surfacePoints;
+  }
+  typedef BasicParticleSystemWrapper type0;
+  inline void *getArg1()
+  {
+    return dummy;
+  }
+  typedef void type1;
+  //! Debug output emitted before the kernel runs.
+  void runMessage()
+  {
+    debMsg("Executing kernel computeSurfaceDensities ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  }
+  //! Processes one sub-range handed out by tbb::parallel_for.
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const IndexInt last = (IndexInt)__r.end();
+    for (IndexInt idx = __r.begin(); idx != last; idx++) {
+      op(idx, surfacePoints, dummy);
+    }
+  }
+  //! Runs the kernel over [0, size) with TBB.
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+  const BasicParticleSystemWrapper &surfacePoints;
+  void *dummy;
+};
+
+//! Kernel: compute the regularization displacement of every surface point and store it in
+//! the global scratch buffer tempSurfaceVec3.
+//! For each neighbour (and ghost copy) within params.normalRadius, a normal-direction
+//! correction and a unit tangential direction are accumulated, weighted by
+//! weightSurfaceNormal divided by the neighbour's entry in tempSurfaceFloat. The averaged
+//! normal part is scaled by 0.75, the tangential part by 0.25 * meanFineDistance.
+//! Constructing the object runs the kernel over all surface points.
+struct computeSurfaceDisplacements : public KernelBase {
+  computeSurfaceDisplacements(const BasicParticleSystemWrapper &surfacePoints,
+                              const ParticleDataImpl<Vec3> &surfaceNormals)
+      : KernelBase(surfacePoints.size()),
+        surfacePoints(surfacePoints),
+        surfaceNormals(surfaceNormals)
+  {
+    runMessage();
+    run();
+  }
+  //! Per-point body of the kernel.
+  inline void op(IndexInt idx,
+                 const BasicParticleSystemWrapper &surfacePoints,
+                 const ParticleDataImpl<Vec3> &surfaceNormals) const
+  {
+    Vec3 pos = surfacePoints.getPos(idx);
+    Vec3 normal = surfaceNormals[idx];
+
+    Vec3 sumNormalPart(0, 0, 0);
+    Vec3 sumTangentPart(0, 0, 0);
+    Real wTotal = 0;
+    LOOP_NEIGHBORS_BEGIN(surfacePoints, pos, params.normalRadius)
+
+    LOOP_GHOSTS_POS_NORMAL_BEGIN(
+        surfacePoints.getPos(idn), surfaceNormals[idn], params.normalRadius)
+    // neighbours with a zero density entry cannot be weighted
+    if (tempSurfaceFloat[idn] == 0) {
+      continue;
+    }
+    Vec3 dir = pos - gPos;
+    Real w = weightSurfaceNormal(norm(dir)) / tempSurfaceFloat[idn];
+
+    // neighbour normal with its component along crossVec removed
+    Vec3 crossVec = getNormalized(cross(normal, -dir));
+    Vec3 projectedNormal = getNormalized(gNormal - dot(crossVec, gNormal) * crossVec);
+    Real denom = dot(normal, normal + projectedNormal);
+    // skip opposing normals and near-zero denominators
+    if (dot(projectedNormal, normal) < 0 || abs(denom) < 1e-6) {
+      continue;
+    }
+
+    // tangential part of dir with respect to this point's normal
+    Vec3 dt = dir - dot(dir, normal) * normal;
+    // correction along this point's normal
+    Vec3 dn = -dot(normal + projectedNormal, dir) / denom * normal;
+
+    sumNormalPart += w * dn;
+    sumTangentPart += w * getNormalized(dt);
+    wTotal += w;
+    LOOP_GHOSTS_END
+
+    LOOP_NEIGHBORS_END
+    if (wTotal != 0) {
+      sumNormalPart /= wTotal;
+      sumTangentPart /= wTotal;
+    }
+    sumNormalPart *= .75f;
+    sumTangentPart *= .25f * params.meanFineDistance;
+    tempSurfaceVec3[idx] = sumNormalPart + sumTangentPart;
+  }
+  inline const BasicParticleSystemWrapper &getArg0()
+  {
+    return surfacePoints;
+  }
+  typedef BasicParticleSystemWrapper type0;
+  inline const ParticleDataImpl<Vec3> &getArg1()
+  {
+    return surfaceNormals;
+  }
+  typedef ParticleDataImpl<Vec3> type1;
+  //! Debug output emitted before the kernel runs.
+  void runMessage()
+  {
+    debMsg("Executing kernel computeSurfaceDisplacements ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  }
+  //! Processes one sub-range handed out by tbb::parallel_for.
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const IndexInt last = (IndexInt)__r.end();
+    for (IndexInt idx = __r.begin(); idx != last; idx++) {
+      op(idx, surfacePoints, surfaceNormals);
+    }
+  }
+  //! Runs the kernel over [0, size) with TBB.
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+  const BasicParticleSystemWrapper &surfacePoints;
+  const ParticleDataImpl<Vec3> &surfaceNormals;
+};
+
+//! Kernel: add the displacement stored in the global scratch buffer tempSurfaceVec3 to the
+//! position of every surface point. The `dummy` argument is unused by op().
+//! Constructing the object runs the kernel over all surface points.
+struct applySurfaceDisplacements : public KernelBase {
+  applySurfaceDisplacements(BasicParticleSystemWrapper &surfacePoints, void *dummy)
+      : KernelBase(surfacePoints.size()), surfacePoints(surfacePoints), dummy(dummy)
+  {
+    runMessage();
+    run();
+  }
+  //! Per-point body of the kernel.
+  inline void op(IndexInt idx, BasicParticleSystemWrapper &surfacePoints, void *dummy) const
+  {
+    Vec3 moved = surfacePoints.getPos(idx) + tempSurfaceVec3[idx];
+    surfacePoints.setPos(idx, moved);
+  }
+  inline BasicParticleSystemWrapper &getArg0()
+  {
+    return surfacePoints;
+  }
+  typedef BasicParticleSystemWrapper type0;
+  inline void *getArg1()
+  {
+    return dummy;
+  }
+  typedef void type1;
+  //! Debug output emitted before the kernel runs.
+  void runMessage()
+  {
+    debMsg("Executing kernel applySurfaceDisplacements ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  }
+  //! Processes one sub-range handed out by tbb::parallel_for.
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const IndexInt last = (IndexInt)__r.end();
+    for (IndexInt idx = __r.begin(); idx != last; idx++) {
+      op(idx, surfacePoints, dummy);
+    }
+  }
+  //! Runs the kernel over [0, size) with TBB.
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+  BasicParticleSystemWrapper &surfacePoints;
+  void *dummy;
+};
+
+//! One regularization pass over the surface points: compute per-point densities, derive a
+//! displacement for each point from them, then apply all displacements. Both global
+//! scratch buffers are resized to the current number of surface points first.
+void regularizeSurfacePoints(BasicParticleSystemWrapper &surfacePoints,
+                             const ParticleDataImpl<Vec3> &surfaceNormals)
+{
+  const int pointCount = surfacePoints.size();
+  tempSurfaceVec3.resize(pointCount);
+  tempSurfaceFloat.resize(pointCount);
+
+  // each kernel runs from its constructor; the second argument of the first and last
+  // kernel is an unused placeholder
+  computeSurfaceDensities(surfacePoints, 0);
+  computeSurfaceDisplacements(surfacePoints, surfaceNormals);
+  applySurfaceDisplacements(surfacePoints, 0);
+}
+
+//! Kernel: pull surface points back into the constraint band. A point whose constraint
+//! level lies above 1 or below 0 is moved along the constraint gradient by the amount it
+//! overshoots, scaled by (outerRadius - innerRadius). Points inside the band are untouched.
+//! Constructing the object runs the kernel over all surface points.
+struct constrainSurface : public KernelBase {
+  constrainSurface(BasicParticleSystemWrapper &surfacePoints,
+                   const BasicParticleSystemWrapper &coarseParticles)
+      : KernelBase(surfacePoints.size()),
+        surfacePoints(surfacePoints),
+        coarseParticles(coarseParticles)
+  {
+    runMessage();
+    run();
+  }
+  //! Per-point body of the kernel.
+  inline void op(IndexInt idx,
+                 BasicParticleSystemWrapper &surfacePoints,
+                 const BasicParticleSystemWrapper &coarseParticles) const
+  {
+    Vec3 pos = surfacePoints.getPos(idx);
+    Real level = computeConstraintLevel(coarseParticles, pos);
+
+    // signed amount by which the level leaves the [0, 1] band
+    Real overshoot;
+    if (level > 1) {
+      overshoot = level - 1;
+    }
+    else if (level < 0) {
+      overshoot = level;
+    }
+    else {
+      return;
+    }
+
+    surfacePoints.setPos(idx,
+                         pos - (params.outerRadius - params.innerRadius) * overshoot *
+                                   computeConstraintGradient(coarseParticles, pos));
+  }
+  inline BasicParticleSystemWrapper &getArg0()
+  {
+    return surfacePoints;
+  }
+  typedef BasicParticleSystemWrapper type0;
+  inline const BasicParticleSystemWrapper &getArg1()
+  {
+    return coarseParticles;
+  }
+  typedef BasicParticleSystemWrapper type1;
+  //! Debug output emitted before the kernel runs.
+  void runMessage()
+  {
+    debMsg("Executing kernel constrainSurface ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  }
+  //! Processes one sub-range handed out by tbb::parallel_for.
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    const IndexInt last = (IndexInt)__r.end();
+    for (IndexInt idx = __r.begin(); idx != last; idx++) {
+      op(idx, surfacePoints, coarseParticles);
+    }
+  }
+  //! Runs the kernel over [0, size) with TBB.
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+  BasicParticleSystemWrapper &surfacePoints;
+  const BasicParticleSystemWrapper &coarseParticles;
+};
+
+//! Kernel: initialise the wave channels of surface points flagged PNEW by interpolating
+//! from the surrounding points that are not flagged PNEW (within params.tangentRadius,
+//! weighted by weightSurfaceTangent). Points without such neighbours end up with all four
+//! channels at zero. Points not flagged PNEW are left untouched.
+//! Constructing the object runs the kernel over all surface points.
+struct interpolateNewWaveData : public KernelBase {
+  interpolateNewWaveData(const BasicParticleSystemWrapper &surfacePoints,
+                         ParticleDataImpl<Real> &surfaceWaveH,
+                         ParticleDataImpl<Real> &surfaceWaveDtH,
+                         ParticleDataImpl<Real> &surfaceWaveSeed,
+                         ParticleDataImpl<Real> &surfaceWaveSeedAmplitude)
+      : KernelBase(surfacePoints.size()),
+        surfacePoints(surfacePoints),
+        surfaceWaveH(surfaceWaveH),
+        surfaceWaveDtH(surfaceWaveDtH),
+        surfaceWaveSeed(surfaceWaveSeed),
+        surfaceWaveSeedAmplitude(surfaceWaveSeedAmplitude)
+  {
+    runMessage();
+    run();
+  }
+  //! Per-point body of the kernel.
+  inline void op(IndexInt idx,
+                 const BasicParticleSystemWrapper &surfacePoints,
+                 ParticleDataImpl<Real> &surfaceWaveH,
+                 ParticleDataImpl<Real> &surfaceWaveDtH,
+                 ParticleDataImpl<Real> &surfaceWaveSeed,
+                 ParticleDataImpl<Real> &surfaceWaveSeedAmplitude) const
+  {
+    if (!(surfacePoints.getStatus(idx) & ParticleBase::PNEW)) {
+      return;
+    }
+
+    Vec3 pos = surfacePoints.getPos(idx);
+    // Reset every channel that is accumulated below. Leaving the seed channels at their
+    // current value would mix that value into the weighted average, and a point that keeps
+    // its PNEW flag across calls would compound the error.
+    surfaceWaveH[idx] = 0;
+    surfaceWaveDtH[idx] = 0;
+    surfaceWaveSeed[idx] = 0;
+    surfaceWaveSeedAmplitude[idx] = 0;
+    Real wTotal = 0;
+    LOOP_NEIGHBORS_BEGIN(surfacePoints, pos, params.tangentRadius)
+    // only points that are not new themselves carry meaningful wave data
+    if (!(surfacePoints.getStatus(idn) & ParticleBase::PNEW)) {
+      Real w = weightSurfaceTangent(norm(pos - surfacePoints.getPos(idn)));
+      surfaceWaveH[idx] += w * surfaceWaveH[idn];
+      surfaceWaveDtH[idx] += w * surfaceWaveDtH[idn];
+      surfaceWaveSeed[idx] += w * surfaceWaveSeed[idn];
+      surfaceWaveSeedAmplitude[idx] += w * surfaceWaveSeedAmplitude[idn];
+      wTotal += w;
+    }
+    LOOP_NEIGHBORS_END
+    if (wTotal != 0) {
+      surfaceWaveH[idx] /= wTotal;
+      surfaceWaveDtH[idx] /= wTotal;
+      surfaceWaveSeed[idx] /= wTotal;
+      surfaceWaveSeedAmplitude[idx] /= wTotal;
+    }
+  }
+  inline const BasicParticleSystemWrapper &getArg0()
+  {
+    return surfacePoints;
+  }
+  typedef BasicParticleSystemWrapper type0;
+  inline ParticleDataImpl<Real> &getArg1()
+  {
+    return surfaceWaveH;
+  }
+  typedef ParticleDataImpl<Real> type1;
+  inline ParticleDataImpl<Real> &getArg2()
+  {
+    return surfaceWaveDtH;
+  }
+  typedef ParticleDataImpl<Real> type2;
+  inline ParticleDataImpl<Real> &getArg3()
+  {
+    return surfaceWaveSeed;
+  }
+  typedef ParticleDataImpl<Real> type3;
+  inline ParticleDataImpl<Real> &getArg4()
+  {
+    return surfaceWaveSeedAmplitude;
+  }
+  typedef ParticleDataImpl<Real> type4;
+  //! Debug output emitted before the kernel runs.
+  void runMessage()
+  {
+    debMsg("Executing kernel interpolateNewWaveData ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  };
+  //! Processes one sub-range handed out by tbb::parallel_for.
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
+      op(idx,
+         surfacePoints,
+         surfaceWaveH,
+         surfaceWaveDtH,
+         surfaceWaveSeed,
+         surfaceWaveSeedAmplitude);
+  }
+  //! Runs the kernel over [0, size) with TBB.
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+  const BasicParticleSystemWrapper &surfacePoints;
+  ParticleDataImpl<Real> &surfaceWaveH;
+  ParticleDataImpl<Real> &surfaceWaveDtH;
+  ParticleDataImpl<Real> &surfaceWaveSeed;
+  ParticleDataImpl<Real> &surfaceWaveSeedAmplitude;
+};
+
+// Runs `nbIterations` maintenance passes over the surface points (none when
+// nbIterations <= 0). One pass calls, in this order: addDeleteSurfacePoints,
+// computeSurfaceNormals, smoothSurfaceNormals, regularizeSurfacePoints,
+// constrainSurface and interpolateNewWaveData. surfacePoints.updateAccel() is
+// called after addDeleteSurfacePoints, regularizeSurfacePoints and
+// constrainSurface.
+void surfaceMaintenance(const BasicParticleSystemWrapper &coarseParticles,
+                        BasicParticleSystemWrapper &surfacePoints,
+                        ParticleDataImpl<Vec3> &surfaceNormals,
+                        ParticleDataImpl<Real> &surfaceWaveH,
+                        ParticleDataImpl<Real> &surfaceWaveDtH,
+                        ParticleDataImpl<Real> &surfaceWaveSeed,
+                        ParticleDataImpl<Real> &surfaceWaveSeedAmplitude,
+                        int nbIterations)
+{
+  for (int pass = 0; pass < nbIterations; ++pass) {
+    // resample the point set, then rebuild normals on the refreshed neighborhood grid
+    addDeleteSurfacePoints(surfacePoints);
+    surfacePoints.updateAccel();
+    computeSurfaceNormals(surfacePoints, coarseParticles, surfaceNormals);
+    smoothSurfaceNormals(surfacePoints, surfaceNormals);
+
+    // regularize and constrain, refreshing the acceleration structure after each step
+    regularizeSurfacePoints(surfacePoints, surfaceNormals);
+    surfacePoints.updateAccel();
+    constrainSurface(surfacePoints, coarseParticles);
+    surfacePoints.updateAccel();
+
+    // give points flagged PNEW wave data interpolated from their neighbors
+    interpolateNewWaveData(
+        surfacePoints, surfaceWaveH, surfaceWaveDtH, surfaceWaveSeed, surfaceWaveSeedAmplitude);
+  }
+}
+
+//
+// **** surface wave seeding and evolution ****
+//
+
+// Kernel: adds the per-point seed value onto the wave height, i.e.
+// surfaceWaveH[idx] += surfaceWaveSeed[idx] for every idx in
+// [0, surfacePoints.size()). Constructing the object runs the kernel.
+struct addSeed : public KernelBase {
+  using type0 = BasicParticleSystemWrapper;
+  using type1 = ParticleDataImpl<Real>;
+  using type2 = ParticleDataImpl<Real>;
+
+  addSeed(const BasicParticleSystemWrapper &surfacePoints,
+          ParticleDataImpl<Real> &surfaceWaveH,
+          const ParticleDataImpl<Real> &surfaceWaveSeed)
+      : KernelBase(surfacePoints.size()),
+        surfacePoints(surfacePoints),
+        surfaceWaveH(surfaceWaveH),
+        surfaceWaveSeed(surfaceWaveSeed)
+  {
+    runMessage();
+    run();
+  }
+
+  // Per-point operation; `surfacePoints` belongs to the kernel signature but is not read here.
+  inline void op(IndexInt idx,
+                 const BasicParticleSystemWrapper &surfacePoints,
+                 ParticleDataImpl<Real> &surfaceWaveH,
+                 const ParticleDataImpl<Real> &surfaceWaveSeed) const
+  {
+    const Real seedValue = surfaceWaveSeed[idx];
+    surfaceWaveH[idx] += seedValue;
+  }
+
+  // Accessors for the kernel arguments, by position.
+  inline const BasicParticleSystemWrapper &getArg0()
+  {
+    return surfacePoints;
+  }
+  inline ParticleDataImpl<Real> &getArg1()
+  {
+    return surfaceWaveH;
+  }
+  inline const ParticleDataImpl<Real> &getArg2()
+  {
+    return surfaceWaveSeed;
+  }
+
+  // Emits the debug messages announcing the kernel and its index range.
+  void runMessage()
+  {
+    debMsg("Executing kernel addSeed ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  }
+
+  // TBB body: applies op() to every index of the given sub-range.
+  void operator()(const tbb::blocked_range<IndexInt> &range) const
+  {
+    const IndexInt last = (IndexInt)range.end();
+    for (IndexInt idx = range.begin(); idx != last; ++idx) {
+      op(idx, surfacePoints, surfaceWaveH, surfaceWaveSeed);
+    }
+  }
+
+  // Runs the kernel over [0, size) with tbb::parallel_for.
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+
+  const BasicParticleSystemWrapper &surfacePoints;
+  ParticleDataImpl<Real> &surfaceWaveH;
+  const ParticleDataImpl<Real> &surfaceWaveSeed;
+};
+
+// Kernel: for every surface point, performs a weighted linear (plane) fit of
+// the neighbors' wave heights over local tangent coordinates and stores the
+// resulting wave normal in tempSurfaceVec3[idx]. A zero vector is stored when
+// the fit is singular (det == 0). Constructing the object runs the kernel over
+// [0, surfacePoints.size()).
+// NOTE(review): tempSurfaceVec3, params and the LOOP_* macros are defined
+// outside this block; `idn` and `gPos` are presumably introduced by the macros.
+struct computeSurfaceWaveNormal : public KernelBase {
+  computeSurfaceWaveNormal(const BasicParticleSystemWrapper &surfacePoints,
+                           const ParticleDataImpl<Vec3> &surfaceNormals,
+                           const ParticleDataImpl<Real> &surfaceWaveH)
+      : KernelBase(surfacePoints.size()),
+        surfacePoints(surfacePoints),
+        surfaceNormals(surfaceNormals),
+        surfaceWaveH(surfaceWaveH)
+  {
+    runMessage();
+    run();
+  }
+  // Per-point operation: reads positions, normals and wave heights; writes tempSurfaceVec3[idx].
+  inline void op(IndexInt idx,
+                 const BasicParticleSystemWrapper &surfacePoints,
+                 const ParticleDataImpl<Vec3> &surfaceNormals,
+                 const ParticleDataImpl<Real> &surfaceWaveH) const
+  {
+    Vec3 pos = surfacePoints.getPos(idx);
+
+    // get tangent frame
+    Vec3 n = getNormalized(surfaceNormals[idx]);
+    Vec3 vx(1, 0, 0);
+    Vec3 vy(0, 1, 0);
+    Real dotX = dot(n, vx);
+    Real dotY = dot(n, vy);
+    // build t1 from whichever global axis (x or y) is less aligned with the normal
+    Vec3 t1 = getNormalized(fabs(dotX) < fabs(dotY) ? cross(n, vx) : cross(n, vy));
+    Vec3 t2 = getNormalized(cross(n, t1));
+
+    // linear fit
+    // accumulate weighted moments of the tangent coordinates (x, y) and the wave height z
+    Real sw = 0, swx = 0, swy = 0, swxy = 0, swx2 = 0, swy2 = 0, swxz = 0, swyz = 0, swz = 0;
+    LOOP_NEIGHBORS_BEGIN(surfacePoints, pos, params.tangentRadius)
+    LOOP_GHOSTS_POS_BEGIN(surfacePoints.getPos(idn), params.tangentRadius)
+    Real x = dot(gPos - pos, t1);
+    Real y = dot(gPos - pos, t2);
+    Real z = surfaceWaveH[idn];
+    Real w = weightSurfaceTangent(norm(pos - gPos));
+    swx2 += w * x * x;
+    swy2 += w * y * y;
+    swxy += w * x * y;
+    swxz += w * x * z;
+    swyz += w * y * z;
+    swx += w * x;
+    swy += w * y;
+    swz += w * z;
+    sw += w;
+    LOOP_GHOSTS_END
+    LOOP_NEIGHBORS_END
+    // determinant of the symmetric 3x3 normal-equation matrix
+    // [[swx2, swxy, swx], [swxy, swy2, swy], [swx, swy, sw]]
+    Real det = -sw * swxy * swxy + 2.f * swx * swxy * swy - swx2 * swy * swy - swx * swx * swy2 +
+               sw * swx2 * swy2;
+    if (det == 0) {
+      // singular system: no plane can be fitted, store a zero vector
+      tempSurfaceVec3[idx] = Vec3(0, 0, 0);
+    }
+    else {
+      // closed-form solution (adjugate / det) of the fit z ~= abc.x * x + abc.y * y + abc.z
+      Vec3 abc = 1.f / det *
+                 Vec3(swxz * (-swy * swy + sw * swy2) + swyz * (-sw * swxy + swx * swy) +
+                          swz * (swxy * swy - swx * swy2),
+                      swxz * (-sw * swxy + swx * swy) + swyz * (-swx * swx + sw * swx2) +
+                          swz * (swx * swxy - swx2 * swy),
+                      swxz * (swxy * swy - swx * swy2) + swyz * (swx * swxy - swx2 * swy) +
+                          swz * (-swxy * swxy + swx2 * swy2));
+      // normalized (-abc.x, -abc.y, 1); vx and vy serve as plain unit axes here.
+      // NOTE(review): the components are presumably meant in the local (t1, t2, n) frame - confirm.
+      Vec3 waveNormal = -getNormalized(vx * abc.x + vy * abc.y - Vec3(0, 0, 1));
+      tempSurfaceVec3[idx] = waveNormal;
+    }
+  }
+  // Accessors for the kernel arguments, by position, with matching type aliases.
+  inline const BasicParticleSystemWrapper &getArg0()
+  {
+    return surfacePoints;
+  }
+  typedef BasicParticleSystemWrapper type0;
+  inline const ParticleDataImpl<Vec3> &getArg1()
+  {
+    return surfaceNormals;
+  }
+  typedef ParticleDataImpl<Vec3> type1;
+  inline const ParticleDataImpl<Real> &getArg2()
+  {
+    return surfaceWaveH;
+  }
+  typedef ParticleDataImpl<Real> type2;
+  // Emits the debug messages announcing the kernel and its index range.
+  void runMessage()
+  {
+    debMsg("Executing kernel computeSurfaceWaveNormal ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  };
+  // TBB body: applies op() to every index of the given sub-range.
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
+      op(idx, surfacePoints, surfaceNormals, surfaceWaveH);
+  }
+  // Runs the kernel over [0, size) with tbb::parallel_for.
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+  const BasicParticleSystemWrapper &surfacePoints;
+  const ParticleDataImpl<Vec3> &surfaceNormals;
+  const ParticleDataImpl<Real> &surfaceWaveH;
+};
+
+// Kernel: for every surface point, estimates a Laplacian of the wave height
+// from its neighbors and stores it in tempSurfaceFloat[idx]. The estimate is a
+// weighted average of second-difference terms, measured relative to the plane
+// described by tempSurfaceVec3[idx]. Stores 0 when that vector has a zero z
+// component or when no weight was accumulated.
+// NOTE(review): tempSurfaceVec3, tempSurfaceFloat, params and the LOOP_* macros
+// are defined outside this block; `idn` and `gPos` are presumably introduced
+// by the macros.
+struct computeSurfaceWaveLaplacians : public KernelBase {
+  computeSurfaceWaveLaplacians(const BasicParticleSystemWrapper &surfacePoints,
+                               const ParticleDataImpl<Vec3> &surfaceNormals,
+                               const ParticleDataImpl<Real> &surfaceWaveH)
+      : KernelBase(surfacePoints.size()),
+        surfacePoints(surfacePoints),
+        surfaceNormals(surfaceNormals),
+        surfaceWaveH(surfaceWaveH)
+  {
+    runMessage();
+    run();
+  }
+  // Per-point operation: reads tempSurfaceVec3[idx] and wave heights; writes tempSurfaceFloat[idx].
+  inline void op(IndexInt idx,
+                 const BasicParticleSystemWrapper &surfacePoints,
+                 const ParticleDataImpl<Vec3> &surfaceNormals,
+                 const ParticleDataImpl<Real> &surfaceWaveH) const
+  {
+    Real laplacian = 0;
+    Real wTotal = 0;
+    Vec3 pPos = surfacePoints.getPos(idx);
+    // NOTE(review): the normal is used as stored (not re-normalized here) - confirm it is unit
+    // length.
+    Vec3 pNormal = surfaceNormals[idx];
+
+    // tangent frame: t1 comes from the global axis (x or y) less aligned with the normal
+    Vec3 vx(1, 0, 0);
+    Vec3 vy(0, 1, 0);
+    Real dotX = dot(pNormal, vx);
+    Real dotY = dot(pNormal, vy);
+    Vec3 t1 = getNormalized(fabs(dotX) < fabs(dotY) ? cross(pNormal, vx) : cross(pNormal, vy));
+    Vec3 t2 = getNormalized(cross(pNormal, t1));
+
+    Vec3 pWaveNormal = tempSurfaceVec3[idx];
+    Real ph = surfaceWaveH[idx];
+    if (pWaveNormal.z == 0) {
+      // the slopes below divide by pWaveNormal.z, so a zero z component yields no estimate
+      tempSurfaceFloat[idx] = 0;
+    }
+    else {
+
+      LOOP_NEIGHBORS_BEGIN(surfacePoints, pPos, params.tangentRadius)
+      Real nh = surfaceWaveH[idn];
+      LOOP_GHOSTS_POS_BEGIN(surfacePoints.getPos(idn), params.tangentRadius)
+      Vec3 dir = gPos - pPos;
+      Real lengthDir = norm(dir);
+      // skip (nearly) coincident positions; lengthDir^2 is a divisor below
+      if (lengthDir < 1e-5)
+        continue;
+      // offset projected onto the tangent plane, rescaled to the original distance
+      Vec3 tangentDir = lengthDir * getNormalized(dir - dot(dir, pNormal) * pNormal);
+      Real dirX = dot(tangentDir, t1);
+      Real dirY = dot(tangentDir, t2);
+      // height difference minus the linear prediction given by the wave-normal slopes
+      Real dz = nh - ph - (-pWaveNormal.x / pWaveNormal.z) * dirX -
+                (-pWaveNormal.y / pWaveNormal.z) * dirY;
+      Real w = weightSurfaceTangent(norm(pPos - gPos));
+      wTotal += w;
+      // each contribution is clamped to [-100, 100] before being accumulated
+      laplacian += clamp(w * 4 * dz / (lengthDir * lengthDir), Real(-100.), Real(100.));
+      LOOP_GHOSTS_END
+      LOOP_NEIGHBORS_END
+      if (wTotal != 0) {
+        tempSurfaceFloat[idx] = laplacian / wTotal;
+      }
+      else {
+        tempSurfaceFloat[idx] = 0;
+      }
+    }
+  }
+  // Accessors for the kernel arguments, by position, with matching type aliases.
+  inline const BasicParticleSystemWrapper &getArg0()
+  {
+    return surfacePoints;
+  }
+  typedef BasicParticleSystemWrapper type0;
+  inline const ParticleDataImpl<Vec3> &getArg1()
+  {
+    return surfaceNormals;
+  }
+  typedef ParticleDataImpl<Vec3> type1;
+  inline const ParticleDataImpl<Real> &getArg2()
+  {
+    return surfaceWaveH;
+  }
+  typedef ParticleDataImpl<Real> type2;
+  // Emits the debug messages announcing the kernel and its index range.
+  void runMessage()
+  {
+    debMsg("Executing kernel computeSurfaceWaveLaplacians ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  };
+  // TBB body: applies op() to every index of the given sub-range.
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
+      op(idx, surfacePoints, surfaceNormals, surfaceWaveH);
+  }
+  // Runs the kernel over [0, size) with tbb::parallel_for.
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+  const BasicParticleSystemWrapper &surfacePoints;
+  const ParticleDataImpl<Vec3> &surfaceNormals;
+  const ParticleDataImpl<Real> &surfaceWaveH;
+};
+
+// Kernel: advances the wave state of every surface point by one step of
+// params.dt. The height rate surfaceWaveDtH is driven by tempSurfaceFloat[idx]
+// scaled with params.waveSpeed^2, both fields are damped by
+// (1 + params.dt * params.waveDamping), the seed value is subtracted from the
+// height, and both fields are finally clamped. Constructing the object runs
+// the kernel over [0, surfacePoints.size()).
+struct evolveWave : public KernelBase {
+  using type0 = BasicParticleSystemWrapper;
+  using type1 = ParticleDataImpl<Real>;
+  using type2 = ParticleDataImpl<Real>;
+  using type3 = ParticleDataImpl<Real>;
+
+  evolveWave(const BasicParticleSystemWrapper &surfacePoints,
+             ParticleDataImpl<Real> &surfaceWaveH,
+             ParticleDataImpl<Real> &surfaceWaveDtH,
+             const ParticleDataImpl<Real> &surfaceWaveSeed)
+      : KernelBase(surfacePoints.size()),
+        surfacePoints(surfacePoints),
+        surfaceWaveH(surfaceWaveH),
+        surfaceWaveDtH(surfaceWaveDtH),
+        surfaceWaveSeed(surfaceWaveSeed)
+  {
+    runMessage();
+    run();
+  }
+
+  // Per-point operation; `surfacePoints` belongs to the kernel signature but is not read here.
+  inline void op(IndexInt idx,
+                 const BasicParticleSystemWrapper &surfacePoints,
+                 ParticleDataImpl<Real> &surfaceWaveH,
+                 ParticleDataImpl<Real> &surfaceWaveDtH,
+                 const ParticleDataImpl<Real> &surfaceWaveSeed) const
+  {
+    Real &height = surfaceWaveH[idx];
+    Real &heightRate = surfaceWaveDtH[idx];
+    // `auto` keeps the exact arithmetic type of the params expressions
+    const auto dampingDivisor = 1 + params.dt * params.waveDamping;
+
+    // integrate the rate first, then the height, damping each of them
+    heightRate += params.waveSpeed * params.waveSpeed * params.dt * tempSurfaceFloat[idx];
+    heightRate /= dampingDivisor;
+    height += params.dt * heightRate;
+    height /= dampingDivisor;
+    height -= surfaceWaveSeed[idx];
+
+    // clamp H and DtH (to prevent rare extreme behaviors)
+    const auto rateLimit = params.waveMaxFrequency * params.waveMaxAmplitude;
+    heightRate = clamp(heightRate, -rateLimit, rateLimit);
+    height = clamp(height, -params.waveMaxAmplitude, params.waveMaxAmplitude);
+  }
+
+  // Accessors for the kernel arguments, by position.
+  inline const BasicParticleSystemWrapper &getArg0()
+  {
+    return surfacePoints;
+  }
+  inline ParticleDataImpl<Real> &getArg1()
+  {
+    return surfaceWaveH;
+  }
+  inline ParticleDataImpl<Real> &getArg2()
+  {
+    return surfaceWaveDtH;
+  }
+  inline const ParticleDataImpl<Real> &getArg3()
+  {
+    return surfaceWaveSeed;
+  }
+
+  // Emits the debug messages announcing the kernel and its index range.
+  void runMessage()
+  {
+    debMsg("Executing kernel evolveWave ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  }
+
+  // TBB body: applies op() to every index of the given sub-range.
+  void operator()(const tbb::blocked_range<IndexInt> &range) const
+  {
+    const IndexInt last = (IndexInt)range.end();
+    for (IndexInt idx = range.begin(); idx != last; ++idx) {
+      op(idx, surfacePoints, surfaceWaveH, surfaceWaveDtH, surfaceWaveSeed);
+    }
+  }
+
+  // Runs the kernel over [0, size) with tbb::parallel_for.
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+
+  const BasicParticleSystemWrapper &surfacePoints;
+  ParticleDataImpl<Real> &surfaceWaveH;
+  ParticleDataImpl<Real> &surfaceWaveDtH;
+  const ParticleDataImpl<Real> &surfaceWaveSeed;
+};
+
+// Kernel: for every surface point, computes a curvature measure as the
+// weighted average (weightSurfaceNormal) of the offsets to its neighbors
+// projected onto the point's normal, and stores the absolute value in
+// tempSurfaceFloat[idx]. Constructing the object runs the kernel over
+// [0, surfacePoints.size()).
+// NOTE(review): tempSurfaceFloat, params and the LOOP_* macros are defined
+// outside this block; `idn`, `gPos` and `gNormal` are presumably introduced by
+// the macros.
+struct computeSurfaceCurvature : public KernelBase {
+  computeSurfaceCurvature(const BasicParticleSystemWrapper &surfacePoints,
+                          const ParticleDataImpl<Vec3> &surfaceNormals)
+      : KernelBase(surfacePoints.size()),
+        surfacePoints(surfacePoints),
+        surfaceNormals(surfaceNormals)
+  {
+    runMessage();
+    run();
+  }
+  // Per-point operation: reads positions and normals; writes tempSurfaceFloat[idx].
+  inline void op(IndexInt idx,
+                 const BasicParticleSystemWrapper &surfacePoints,
+                 const ParticleDataImpl<Vec3> &surfaceNormals) const
+  {
+    Vec3 pPos = surfacePoints.getPos(idx);
+    Real wTotal = 0;
+    Real curv = 0;
+    Vec3 pNormal = surfaceNormals[idx];
+
+    LOOP_NEIGHBORS_BEGIN(surfacePoints, pPos, params.normalRadius)
+    LOOP_GHOSTS_POS_NORMAL_BEGIN(
+        surfacePoints.getPos(idn), surfaceNormals[idn], params.normalRadius)
+    Vec3 dir = pPos - gPos;
+    if (dot(pNormal, gNormal) < 0) {
+      continue;
+    }  // backfacing
+    Real dist = norm(dir);
+    // ignore neighbors closer than 1% of the normal radius (this also excludes the point itself)
+    if (dist < params.normalRadius / 100.f) {
+      continue;
+    }
+
+    // signed offset of this point from the neighbor, measured along this point's normal
+    Real distn = dot(dir, pNormal);
+
+    Real w = weightSurfaceNormal(dist);
+    curv += w * distn;
+    wTotal += w;
+    LOOP_GHOSTS_END
+    LOOP_NEIGHBORS_END
+    // normalize by the accumulated weight; curv stays 0 when nothing was accumulated
+    if (wTotal != 0) {
+      curv /= wTotal;
+    }
+    tempSurfaceFloat[idx] = fabs(curv);
+  }
+  // Accessors for the kernel arguments, by position, with matching type aliases.
+  inline const BasicParticleSystemWrapper &getArg0()
+  {
+    return surfacePoints;
+  }
+  typedef BasicParticleSystemWrapper type0;
+  inline const ParticleDataImpl<Vec3> &getArg1()
+  {
+    return surfaceNormals;
+  }
+  typedef ParticleDataImpl<Vec3> type1;
+  // Emits the debug messages announcing the kernel and its index range.
+  void runMessage()
+  {
+    debMsg("Executing kernel computeSurfaceCurvature ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  };
+  // TBB body: applies op() to every index of the given sub-range.
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  {
+    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
+      op(idx, surfacePoints, surfaceNormals);
+  }
+  // Runs the kernel over [0, size) with tbb::parallel_for.
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+  const BasicParticleSystemWrapper &surfacePoints;
+  const ParticleDataImpl<Vec3> &surfaceNormals;
+};
+
+// Kernel: smooths the per-point values held in tempSurfaceFloat by averaging
+// them over each point's neighbors (weights from weightSurfaceNormal) and
+// writes the result to surfaceWaveSource[idx]. Constructing the object runs
+// the kernel over [0, surfacePoints.size()).
+// NOTE(review): `idn` is presumably introduced by LOOP_NEIGHBORS_BEGIN, which
+// is defined outside this block.
+struct smoothCurvature : public KernelBase {
+  using type0 = BasicParticleSystemWrapper;
+  using type1 = ParticleDataImpl<Real>;
+
+  smoothCurvature(const BasicParticleSystemWrapper &surfacePoints,
+                  ParticleDataImpl<Real> &surfaceWaveSource)
+      : KernelBase(surfacePoints.size()),
+        surfacePoints(surfacePoints),
+        surfaceWaveSource(surfaceWaveSource)
+  {
+    runMessage();
+    run();
+  }
+
+  // Per-point operation: reads tempSurfaceFloat of the neighbors; writes surfaceWaveSource[idx].
+  inline void op(IndexInt idx,
+                 const BasicParticleSystemWrapper &surfacePoints,
+                 ParticleDataImpl<Real> &surfaceWaveSource) const
+  {
+    Vec3 pPos = surfacePoints.getPos(idx);
+    Real wTotal = 0;
+    Real curv = 0;
+
+    LOOP_NEIGHBORS_BEGIN(surfacePoints, pPos, params.normalRadius)
+    Real w = weightSurfaceNormal(norm(pPos - surfacePoints.getPos(idn)));
+    wTotal += w;
+    curv += w * tempSurfaceFloat[idn];
+    LOOP_NEIGHBORS_END
+
+    // weighted mean; the raw sum is stored as-is when no weight was accumulated
+    surfaceWaveSource[idx] = (wTotal != 0) ? curv / wTotal : curv;
+  }
+
+  // Accessors for the kernel arguments, by position.
+  inline const BasicParticleSystemWrapper &getArg0()
+  {
+    return surfacePoints;
+  }
+  inline ParticleDataImpl<Real> &getArg1()
+  {
+    return surfaceWaveSource;
+  }
+
+  // Emits the debug messages announcing the kernel and its index range.
+  void runMessage()
+  {
+    debMsg("Executing kernel smoothCurvature ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  }
+
+  // TBB body: applies op() to every index of the given sub-range.
+  void operator()(const tbb::blocked_range<IndexInt> &range) const
+  {
+    const IndexInt last = (IndexInt)range.end();
+    for (IndexInt idx = range.begin(); idx != last; ++idx) {
+      op(idx, surfacePoints, surfaceWaveSource);
+    }
+  }
+
+  // Runs the kernel over [0, size) with tbb::parallel_for.
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+
+  const BasicParticleSystemWrapper &surfacePoints;
+  ParticleDataImpl<Real> &surfaceWaveSource;
+};
+
+// Kernel: updates the wave seeding state of every surface point. The value in
+// surfaceWaveSource[idx] is mapped through smoothstep to a source term in
+// [-1, 1]; this grows or shrinks surfaceWaveSeedAmplitude[idx] (clamped to
+// [0, maxSeedAmplitude]), and surfaceWaveSeed[idx] becomes that amplitude times
+// a cosine oscillation driven by frameCount. surfaceWaveSource[idx] is then
+// overwritten with 1 or 0. Constructing the object runs the kernel over
+// [0, surfacePoints.size()).
+struct seedWaves : public KernelBase {
+  using type0 = BasicParticleSystemWrapper;
+  using type1 = ParticleDataImpl<Real>;
+  using type2 = ParticleDataImpl<Real>;
+  using type3 = ParticleDataImpl<Real>;
+
+  seedWaves(const BasicParticleSystemWrapper &surfacePoints,
+            ParticleDataImpl<Real> &surfaceWaveSeed,
+            ParticleDataImpl<Real> &surfaceWaveSeedAmplitude,
+            ParticleDataImpl<Real> &surfaceWaveSource)
+      : KernelBase(surfacePoints.size()),
+        surfacePoints(surfacePoints),
+        surfaceWaveSeed(surfaceWaveSeed),
+        surfaceWaveSeedAmplitude(surfaceWaveSeedAmplitude),
+        surfaceWaveSource(surfaceWaveSource)
+  {
+    runMessage();
+    run();
+  }
+
+  // Per-point operation; `surfacePoints` belongs to the kernel signature but is not read here.
+  inline void op(IndexInt idx,
+                 const BasicParticleSystemWrapper &surfacePoints,
+                 ParticleDataImpl<Real> &surfaceWaveSeed,
+                 ParticleDataImpl<Real> &surfaceWaveSeedAmplitude,
+                 ParticleDataImpl<Real> &surfaceWaveSource) const
+  {
+    // threshold window [center - radius, center + radius]; `auto` keeps the params' own type
+    const auto thresholdLow = params.waveSeedingCurvatureThresholdRegionCenter -
+                              params.waveSeedingCurvatureThresholdRegionRadius;
+    const auto thresholdHigh = params.waveSeedingCurvatureThresholdRegionCenter +
+                               params.waveSeedingCurvatureThresholdRegionRadius;
+    Real source = smoothstep(thresholdLow, thresholdHigh, (Real)surfaceWaveSource[idx]) * 2.f -
+                  1.f;
+
+    // oscillation factor for the current frame
+    Real seedFrequency = params.waveSeedFrequency;
+    Real phase = params.dt * frameCount * params.waveSpeed * seedFrequency;
+    Real oscillation = cosf(phase);
+
+    // step the seeding amplitude by the source term and keep it within [0, maxSeedAmplitude]
+    Real maxSeedAmplitude = params.waveMaxSeedingAmplitude * params.waveMaxAmplitude;
+    Real &amplitude = surfaceWaveSeedAmplitude[idx];
+    amplitude = clamp<Real>(
+        amplitude + source * params.waveSeedStepSizeRatioOfMax * maxSeedAmplitude,
+        0.f,
+        maxSeedAmplitude);
+    surfaceWaveSeed[idx] = amplitude * oscillation;
+
+    // source values for display (not used after this point anyway)
+    if (source >= 0) {
+      surfaceWaveSource[idx] = 1;
+    }
+    else {
+      surfaceWaveSource[idx] = 0;
+    }
+  }
+
+  // Accessors for the kernel arguments, by position.
+  inline const BasicParticleSystemWrapper &getArg0()
+  {
+    return surfacePoints;
+  }
+  inline ParticleDataImpl<Real> &getArg1()
+  {
+    return surfaceWaveSeed;
+  }
+  inline ParticleDataImpl<Real> &getArg2()
+  {
+    return surfaceWaveSeedAmplitude;
+  }
+  inline ParticleDataImpl<Real> &getArg3()
+  {
+    return surfaceWaveSource;
+  }
+
+  // Emits the debug messages announcing the kernel and its index range.
+  void runMessage()
+  {
+    debMsg("Executing kernel seedWaves ", 3);
+    debMsg("Kernel range"
+               << " size " << size << " ",
+           4);
+  }
+
+  // TBB body: applies op() to every index of the given sub-range.
+  void operator()(const tbb::blocked_range<IndexInt> &range) const
+  {
+    const IndexInt last = (IndexInt)range.end();
+    for (IndexInt idx = range.begin(); idx != last; ++idx) {
+      op(idx, surfacePoints, surfaceWaveSeed, surfaceWaveSeedAmplitude, surfaceWaveSource);
+    }
+  }
+
+  // Runs the kernel over [0, size) with tbb::parallel_for.
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+  }
+
+  const BasicParticleSystemWrapper &surfacePoints;
+  ParticleDataImpl<Real> &surfaceWaveSeed;
+  ParticleDataImpl<Real> &surfaceWaveSeedAmplitude;
+  ParticleDataImpl<Real> &surfaceWaveSource;
+};
+
+// Runs one wave step on the surface points by executing the wave kernels in
+// sequence (each statement constructs a kernel object, which runs it).
+// The order matters because the kernels communicate through the shared
+// buffers tempSurfaceVec3 and tempSurfaceFloat:
+//   computeSurfaceWaveNormal     writes tempSurfaceVec3
+//   computeSurfaceWaveLaplacians reads tempSurfaceVec3, writes tempSurfaceFloat
+//   evolveWave                   reads tempSurfaceFloat
+//   computeSurfaceCurvature      overwrites tempSurfaceFloat
+//   smoothCurvature              reads tempSurfaceFloat, writes surfaceWaveSource
+//   seedWaves                    reads/writes surfaceWaveSource, updates the seed fields
+void surfaceWaves(const BasicParticleSystemWrapper &surfacePoints,
+                  const ParticleDataImpl<Vec3> &surfaceNormals,
+                  ParticleDataImpl<Real> &surfaceWaveH,
+                  ParticleDataImpl<Real> &surfaceWaveDtH,
+                  ParticleDataImpl<Real> &surfaceWaveSource,
+                  ParticleDataImpl<Real> &surfaceWaveSeed,
+                  ParticleDataImpl<Real> &surfaceWaveSeedAmplitude)
+{
+  // the seed is added to the height here and subtracted again inside evolveWave
+  addSeed(surfacePoints, surfaceWaveH, surfaceWaveSeed);
+  computeSurfaceWaveNormal(surfacePoints, surfaceNormals, surfaceWaveH);
+  computeSurfaceWaveLaplacians(surfacePoints, surfaceNormals, surfaceWaveH);
+  evolveWave(surfacePoints, surfaceWaveH, surfaceWaveDtH, surfaceWaveSeed);
+  // curvature-driven seeding for the next step
+  computeSurfaceCurvature(surfacePoints, surfaceNormals);
+  smoothCurvature(surfacePoints, surfaceWaveSource);
+  seedWaves(surfacePoints, surfaceWaveSeed, surfaceWaveSeedAmplitude, surfaceWaveSource);
+}
+
+//
+// **** main function ****
+//
+
+// Main entry point of the surface turbulence plugin; performs one update per call.
+//
+// The arguments are wrapped into / copied to the file-level objects
+// (coarseParticles, coarseParticlesPrevPos, surfacePoints, params) used by the
+// kernels. On the first call (frameCount == 0) the acceleration grids are
+// initialized, the surface points are created with initFines and smoothed, and
+// all wave fields are zeroed. On later calls the surface points are advected,
+// maintained and the surface waves are evolved. Every call then increments
+// frameCount, stores the current coarse positions into coarsePartsPrevPos
+// (skipping particles flagged PNEW or PDELETE) and rebuilds
+// surfacePointsDisplaced as position + normal * wave height for every surface
+// point not flagged PDELETE.
+//
+// `res` is used to size the acceleration grids and to set the upper bounds
+// (res - 2); the remaining scalar parameters are copied verbatim into `params`.
+// NOTE(review): frameCount and params are file-level state defined outside this
+// block, so the function presumably supports only one simulation at a time - confirm.
+// NOTE(review): surfaceDensity == 0 makes meanFineDistance a division by zero - confirm
+// callers never pass it.
+void particleSurfaceTurbulence(const FlagGrid &flags,
+                               BasicParticleSystem &coarseParts,
+                               ParticleDataImpl<Vec3> &coarsePartsPrevPos,
+                               BasicParticleSystem &surfPoints,
+                               ParticleDataImpl<Vec3> &surfaceNormals,
+                               ParticleDataImpl<Real> &surfaceWaveH,
+                               ParticleDataImpl<Real> &surfaceWaveDtH,
+                               BasicParticleSystem &surfacePointsDisplaced,
+                               ParticleDataImpl<Real> &surfaceWaveSource,
+                               ParticleDataImpl<Real> &surfaceWaveSeed,
+                               ParticleDataImpl<Real> &surfaceWaveSeedAmplitude,
+                               int res,
+                               Real outerRadius = 1.0f,
+                               int surfaceDensity = 20,
+                               int nbSurfaceMaintenanceIterations = 4,
+                               Real dt = 0.005f,
+                               Real waveSpeed = 16.0f,
+                               Real waveDamping = 0.0f,
+                               Real waveSeedFrequency = 4,
+                               Real waveMaxAmplitude = 0.25f,
+                               Real waveMaxFrequency = 800,
+                               Real waveMaxSeedingAmplitude = 0.5,
+                               Real waveSeedingCurvatureThresholdRegionCenter = 0.025f,
+                               Real waveSeedingCurvatureThresholdRegionRadius = 0.01f,
+                               Real waveSeedStepSizeRatioOfMax = 0.05f)
+{
+#if USE_CHRONO == 1
+  // optional timing: prints the time elapsed since the end of the previous call
+  static std::chrono::high_resolution_clock::time_point begin, end;
+  end = std::chrono::high_resolution_clock::now();
+  cout << std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin).count() / 1000000000.f
+       << " : time sim" << endl;
+  begin = std::chrono::high_resolution_clock::now();
+#endif
+
+  // wrap data
+  coarseParticles.points = &coarseParts;
+  coarseParticlesPrevPos.points = &coarsePartsPrevPos;
+  surfacePoints.points = &surfPoints;
+
+  // copy parameters
+  params.res = res;
+  params.outerRadius = outerRadius;
+  params.surfaceDensity = surfaceDensity;
+  params.nbSurfaceMaintenanceIterations = nbSurfaceMaintenanceIterations;
+  params.dt = dt;
+  params.waveSpeed = waveSpeed;
+  params.waveDamping = waveDamping;
+  params.waveSeedFrequency = waveSeedFrequency;
+  params.waveMaxAmplitude = waveMaxAmplitude;
+  params.waveMaxFrequency = waveMaxFrequency;
+  params.waveMaxSeedingAmplitude = waveMaxSeedingAmplitude;
+  params.waveSeedingCurvatureThresholdRegionCenter = waveSeedingCurvatureThresholdRegionCenter;
+  params.waveSeedingCurvatureThresholdRegionRadius = waveSeedingCurvatureThresholdRegionRadius;
+  params.waveSeedStepSizeRatioOfMax = waveSeedStepSizeRatioOfMax;
+
+  // compute other parameters
+  params.innerRadius = params.outerRadius / 2.0;
+  params.meanFineDistance = M_PI * (params.outerRadius + params.innerRadius) /
+                            params.surfaceDensity;
+  params.constraintA = logf(2.0f / (1.0f + weightKernelCoarseDensity(params.outerRadius +
+                                                                     params.innerRadius))) /
+                       (powf((params.outerRadius + params.innerRadius) / 2, 2) -
+                        params.innerRadius * params.innerRadius);
+  params.normalRadius = 0.5f * (params.outerRadius + params.innerRadius);
+  params.tangentRadius = 2.1f * params.meanFineDistance;
+  // bounds are set 2 units in from both ends of the [0, res] range on every axis
+  params.bndXm = params.bndYm = params.bndZm = 2;
+  params.bndXp = params.bndYp = params.bndZp = params.res - 2;
+
+  if (frameCount == 0) {
+
+    // initialize accel grids
+    accelCoarse.init(2.f * res / params.outerRadius);
+    accelSurface.init(1.f * res / (2.f * params.meanFineDistance));
+
+    // update coarse accel structure
+    coarseParticles.updateAccel();
+
+    // create surface points
+    initFines(coarseParticles, surfacePoints, flags);
+
+    // smooth surface (six times the per-step number of maintenance iterations)
+    surfaceMaintenance(coarseParticles,
+                       surfacePoints,
+                       surfaceNormals,
+                       surfaceWaveH,
+                       surfaceWaveDtH,
+                       surfaceWaveSeed,
+                       surfaceWaveSeedAmplitude,
+                       6 * params.nbSurfaceMaintenanceIterations);
+
+    // set wave values to zero
+    for (int idx = 0; idx < surfacePoints.size(); idx++) {
+      surfaceWaveH[idx] = 0;
+      surfaceWaveDtH[idx] = 0;
+      surfaceWaveSeed[idx] = 0;
+      surfaceWaveSeedAmplitude[idx] = 0;
+    }
+  }
+  else {
+
+    // update coarse accel structure with previous coarse particles positions
+    coarseParticlesPrevPos.updateAccel();
+
+    // advect surface points following coarse particles,
+    // then update the acceleration structure for the surface points
+    advectSurfacePoints(surfacePoints, coarseParticles, coarseParticlesPrevPos);
+    surfacePoints.updateAccel();
+
+    // update coarse accel structure with the current coarse particles positions
+    coarseParticles.updateAccel();
+
+    // surface maintenance
+    surfaceMaintenance(coarseParticles,
+                       surfacePoints,
+                       surfaceNormals,
+                       surfaceWaveH,
+                       surfaceWaveDtH,
+                       surfaceWaveSeed,
+                       surfaceWaveSeedAmplitude,
+                       params.nbSurfaceMaintenanceIterations);
+
+    // surface waves
+    surfaceWaves(surfacePoints,
+                 surfaceNormals,
+                 surfaceWaveH,
+                 surfaceWaveDtH,
+                 surfaceWaveSource,
+                 surfaceWaveSeed,
+                 surfaceWaveSeedAmplitude);
+  }
+  frameCount++;
+
+  // save positions as previous positions for next step
+  // (particles flagged PNEW or PDELETE are skipped)
+  for (int id = 0; id < coarseParticles.size(); id++) {
+    if ((coarseParticles.getStatus(id) & ParticleBase::PNEW) == 0 &&
+        (coarseParticles.getStatus(id) & ParticleBase::PDELETE) == 0) {
+      coarseParticlesPrevPos.setVec3(id, coarseParticles.getPos(id));
+    }
+  }
+
+  // create displaced points for display
+  // (each kept point is offset along its normal by its wave height)
+  surfacePointsDisplaced.clear();
+  for (int idx = 0; idx < surfacePoints.size(); idx++) {
+    if ((surfacePoints.getStatus(idx) & ParticleBase::PDELETE) == 0) {
+      surfacePointsDisplaced.addParticle(surfacePoints.getPos(idx) +
+                                         surfaceNormals[idx] * surfaceWaveH[idx]);
+    }
+  }
+
+#if USE_CHRONO == 1
+  // optional timing: prints the time spent inside this call
+  end = std::chrono::high_resolution_clock::now();
+  cout << std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin).count() / 1000000000.f
+       << " : time upres" << endl;
+  begin = std::chrono::high_resolution_clock::now();
+#endif
+}
+// Generated Python binding for particleSurfaceTurbulence.
+// Reads the arguments from (_linargs, _kwds) by keyword name or position,
+// calls the plugin and returns the Python None object. The eleven
+// grid/particle arguments and `res` are fetched as required values; the
+// remaining scalars are optional, with the defaults listed below. Any
+// std::exception is reported through pbSetError and 0 is returned.
+static PyObject *_W_0(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    // optional keyword-only "notiming" flag, forwarded (negated) to the plugin hooks
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(parent, "particleSurfaceTurbulence", !noTiming);
+    PyObject *_retval = 0;
+    {
+      // every fetched argument is registered with _lock, which lives until this scope ends
+      ArgLocker _lock;
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 0, &_lock);
+      BasicParticleSystem &coarseParts = *_args.getPtr<BasicParticleSystem>(
+          "coarseParts", 1, &_lock);
+      ParticleDataImpl<Vec3> &coarsePartsPrevPos = *_args.getPtr<ParticleDataImpl<Vec3>>(
+          "coarsePartsPrevPos", 2, &_lock);
+      BasicParticleSystem &surfPoints = *_args.getPtr<BasicParticleSystem>(
+          "surfPoints", 3, &_lock);
+      ParticleDataImpl<Vec3> &surfaceNormals = *_args.getPtr<ParticleDataImpl<Vec3>>(
+          "surfaceNormals", 4, &_lock);
+      ParticleDataImpl<Real> &surfaceWaveH = *_args.getPtr<ParticleDataImpl<Real>>(
+          "surfaceWaveH", 5, &_lock);
+      ParticleDataImpl<Real> &surfaceWaveDtH = *_args.getPtr<ParticleDataImpl<Real>>(
+          "surfaceWaveDtH", 6, &_lock);
+      BasicParticleSystem &surfacePointsDisplaced = *_args.getPtr<BasicParticleSystem>(
+          "surfacePointsDisplaced", 7, &_lock);
+      ParticleDataImpl<Real> &surfaceWaveSource = *_args.getPtr<ParticleDataImpl<Real>>(
+          "surfaceWaveSource", 8, &_lock);
+      ParticleDataImpl<Real> &surfaceWaveSeed = *_args.getPtr<ParticleDataImpl<Real>>(
+          "surfaceWaveSeed", 9, &_lock);
+      ParticleDataImpl<Real> &surfaceWaveSeedAmplitude = *_args.getPtr<ParticleDataImpl<Real>>(
+          "surfaceWaveSeedAmplitude", 10, &_lock);
+      int res = _args.get<int>("res", 11, &_lock);
+      // optional scalars; the defaults repeat those of the plugin signature
+      Real outerRadius = _args.getOpt<Real>("outerRadius", 12, 1.0f, &_lock);
+      int surfaceDensity = _args.getOpt<int>("surfaceDensity", 13, 20, &_lock);
+      int nbSurfaceMaintenanceIterations = _args.getOpt<int>(
+          "nbSurfaceMaintenanceIterations", 14, 4, &_lock);
+      Real dt = _args.getOpt<Real>("dt", 15, 0.005f, &_lock);
+      Real waveSpeed = _args.getOpt<Real>("waveSpeed", 16, 16.0f, &_lock);
+      Real waveDamping = _args.getOpt<Real>("waveDamping", 17, 0.0f, &_lock);
+      Real waveSeedFrequency = _args.getOpt<Real>("waveSeedFrequency", 18, 4, &_lock);
+      Real waveMaxAmplitude = _args.getOpt<Real>("waveMaxAmplitude", 19, 0.25f, &_lock);
+      Real waveMaxFrequency = _args.getOpt<Real>("waveMaxFrequency", 20, 800, &_lock);
+      Real waveMaxSeedingAmplitude = _args.getOpt<Real>(
+          "waveMaxSeedingAmplitude", 21, 0.5, &_lock);
+      Real waveSeedingCurvatureThresholdRegionCenter = _args.getOpt<Real>(
+          "waveSeedingCurvatureThresholdRegionCenter", 22, 0.025f, &_lock);
+      Real waveSeedingCurvatureThresholdRegionRadius = _args.getOpt<Real>(
+          "waveSeedingCurvatureThresholdRegionRadius", 23, 0.01f, &_lock);
+      Real waveSeedStepSizeRatioOfMax = _args.getOpt<Real>(
+          "waveSeedStepSizeRatioOfMax", 24, 0.05f, &_lock);
+      // the plugin returns void, so the Python result is None
+      _retval = getPyNone();
+      particleSurfaceTurbulence(flags,
+                                coarseParts,
+                                coarsePartsPrevPos,
+                                surfPoints,
+                                surfaceNormals,
+                                surfaceWaveH,
+                                surfaceWaveDtH,
+                                surfacePointsDisplaced,
+                                surfaceWaveSource,
+                                surfaceWaveSeed,
+                                surfaceWaveSeedAmplitude,
+                                res,
+                                outerRadius,
+                                surfaceDensity,
+                                nbSurfaceMaintenanceIterations,
+                                dt,
+                                waveSpeed,
+                                waveDamping,
+                                waveSeedFrequency,
+                                waveMaxAmplitude,
+                                waveMaxFrequency,
+                                waveMaxSeedingAmplitude,
+                                waveSeedingCurvatureThresholdRegionCenter,
+                                waveSeedingCurvatureThresholdRegionRadius,
+                                waveSeedStepSizeRatioOfMax);
+      // NOTE(review): presumably verifies that every supplied argument was consumed - confirm
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "particleSurfaceTurbulence", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    // report the failure through pbSetError and signal it with a null result
+    pbSetError("particleSurfaceTurbulence", e.what());
+    return 0;
+  }
+}
+// Static registration object tying the name "particleSurfaceTurbulence" to the
+// wrapper _W_0, plus a C-linkage function that references the object.
+// NOTE(review): KEEP_UNUSED presumably prevents the object from being dropped - confirm.
+static const Pb::Register _RP_particleSurfaceTurbulence("", "particleSurfaceTurbulence", _W_0);
+extern "C" void PbRegister_particleSurfaceTurbulence()
+{
+  KEEP_UNUSED(_RP_particleSurfaceTurbulence);
+}
+
+// Debug helper: checks that every particle position, converted to integer grid
+// coordinates with toVec3i, lies inside the bounds of `flags`.
+//
+// The first out-of-bounds particle raises an error carrying its index and
+// position. An error is raised instead of calling exit(), so an embedding
+// application is not terminated and the binding wrapper can turn the failure
+// into a Python error.
+// NOTE(review): relies on Mantaflow's errMsg macro (declared alongside debMsg)
+// being in scope and throwing a std::exception-derived error - confirm.
+void debugCheckParts(const BasicParticleSystem &parts, const FlagGrid &flags)
+{
+  // IndexInt matches the index type used by the particle kernels in this file
+  for (IndexInt idx = 0; idx < parts.size(); idx++) {
+    const Vec3i p = toVec3i(parts.getPos(idx));
+    if (!flags.isInBounds(p)) {
+      errMsg("bad position??? " << idx << " " << parts.getPos(idx));
+    }
+  }
+}
+// Generated Python binding for debugCheckParts.
+// Reads "parts" (position 0) and "flags" (position 1) from (_linargs, _kwds),
+// calls the plugin and returns the Python None object. Any std::exception is
+// reported through pbSetError and 0 is returned.
+static PyObject *_W_1(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs _args(_linargs, _kwds);
+    FluidSolver *parent = _args.obtainParent();
+    // optional keyword-only "notiming" flag, forwarded (negated) to the plugin hooks
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(parent, "debugCheckParts", !noTiming);
+    PyObject *_retval = 0;
+    {
+      // every fetched argument is registered with _lock, which lives until this scope ends
+      ArgLocker _lock;
+      const BasicParticleSystem &parts = *_args.getPtr<BasicParticleSystem>("parts", 0, &_lock);
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 1, &_lock);
+      // the plugin returns void, so the Python result is None
+      _retval = getPyNone();
+      debugCheckParts(parts, flags);
+      // NOTE(review): presumably verifies that every supplied argument was consumed - confirm
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "debugCheckParts", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    // report the failure through pbSetError and signal it with a null result
+    pbSetError("debugCheckParts", e.what());
+    return 0;
+  }
+}
+// Static registration object tying the name "debugCheckParts" to the wrapper
+// _W_1, plus a C-linkage function that references the object.
+// NOTE(review): KEEP_UNUSED presumably prevents the object from being dropped - confirm.
+static const Pb::Register _RP_debugCheckParts("", "debugCheckParts", _W_1);
+extern "C" void PbRegister_debugCheckParts()
+{
+  KEEP_UNUSED(_RP_debugCheckParts);
+}
+
+}  // namespace SurfaceTurbulence
+
+}  // namespace Manta
diff --git a/extern/mantaflow/preprocessed/plugin/vortexplugins.cpp b/extern/mantaflow/preprocessed/plugin/vortexplugins.cpp
new file mode 100644
index 00000000000..c2a21d82689
--- /dev/null
+++ b/extern/mantaflow/preprocessed/plugin/vortexplugins.cpp
@@ -0,0 +1,695 @@
+
+
+// DO NOT EDIT !
+// This file is generated using the MantaFlow preprocessor (prep generate).
+
+/******************************************************************************
+ *
+ * MantaFlow fluid solver framework
+ * Copyright 2011 Tobias Pfaff, Nils Thuerey
+ *
+ * This program is free software, distributed under the terms of the
+ * Apache License, Version 2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Plugins for using vortex sheet meshes
+ *
+ ******************************************************************************/
+
+#include <iostream>
+#include "vortexsheet.h"
+#include "vortexpart.h"
+#include "shapes.h"
+#include "commonkernels.h"
+#include "conjugategrad.h"
+#include "randomstream.h"
+#include "levelset.h"
+
+using namespace std;
+
+namespace Manta {
+
+//! Flag every mesh node that lies inside 'shape' as fixed (Mesh::NfFixed).
+//! With 'exclusive' set, the fixed flag is cleared on all nodes outside the shape.
+void markAsFixed(Mesh &mesh, const Shape *shape, bool exclusive = true)
+{
+  for (int n = 0; n < mesh.numNodes(); ++n) {
+    const bool inside = shape->isInside(mesh.nodes(n).pos);
+    if (inside)
+      mesh.nodes(n).flags |= Mesh::NfFixed;
+    else if (exclusive)
+      mesh.nodes(n).flags &= ~Mesh::NfFixed;
+  }
+}
+static PyObject *_W_0(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // binding for markAsFixed
+{
+  try {
+    PbArgs _args(_linargs, _kwds);  // built from the positional and keyword argument objects
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional flag, default 0
+    pbPreparePlugin(parent, "markAsFixed", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // lives for this inner scope only; handed to each argument lookup
+      Mesh &mesh = *_args.getPtr<Mesh>("mesh", 0, &_lock);
+      const Shape *shape = _args.getPtr<Shape>("shape", 1, &_lock);
+      bool exclusive = _args.getOpt<bool>("exclusive", 2, true, &_lock);  // same default as the plugin
+      _retval = getPyNone();  // plugin returns void; presumably this is Python's None
+      markAsFixed(mesh, shape, exclusive);
+      _args.check();  // presumably reports arguments that were passed but never fetched -- TODO confirm
+    }
+    pbFinalizePlugin(parent, "markAsFixed", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {  // C++ exceptions are reported via pbSetError; caller gets a null result
+    pbSetError("markAsFixed", e.what());
+    return 0;
+  }
+}
+static const Pb::Register _RP_markAsFixed("", "markAsFixed", _W_0);  // ties the name to _W_0
+extern "C" {
+void PbRegister_markAsFixed()  // only touches the static registration object above
+{
+  KEEP_UNUSED(_RP_markAsFixed);  // presumably keeps the object from being discarded
+}
+}
+
+//! Adapt texture coordinates of mesh inside shape to obtain an effective inflow effect.
+//! The accumulated offset is a function-local static: it persists across calls and is
+//! shared by every mesh passed in. If the shape covers no grid cell it is left unchanged.
+void texcoordInflow(VortexSheetMesh &mesh, const Shape *shape, const MACGrid &vel)
+{
+  static Vec3 t0 = Vec3::Zero;
+
+  // get mean velocity over all grid cells covered by the shape
+  int cnt = 0;
+  Vec3 meanV(0.0);
+  FOR_IJK(vel)
+  {
+    if (shape->isInsideGrid(i, j, k)) {
+      cnt++;
+      meanV += vel.getCentered(i, j, k);
+    }
+  }
+  // guard: dividing by cnt == 0 would turn the persistent offset into NaN for good
+  if (cnt > 0)
+    t0 -= mesh.getParent()->getDt() * (meanV / (Real)cnt);
+  mesh.setReferenceTexOffset(t0);
+  // apply the accumulated offset to the texture coordinates of nodes inside the shape
+  for (int i = 0; i < mesh.numNodes(); i++) {
+    if (shape->isInside(mesh.nodes(i).pos)) {
+      Vec3 tc = mesh.nodes(i).pos + t0;
+      mesh.tex1(i) = tc;
+      mesh.tex2(i) = tc;
+    }
+  }
+}
+static PyObject *_W_1(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // binding for texcoordInflow
+{
+  try {
+    PbArgs _args(_linargs, _kwds);  // built from the positional and keyword argument objects
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional flag, default 0
+    pbPreparePlugin(parent, "texcoordInflow", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // lives for this inner scope only; handed to each argument lookup
+      VortexSheetMesh &mesh = *_args.getPtr<VortexSheetMesh>("mesh", 0, &_lock);
+      const Shape *shape = _args.getPtr<Shape>("shape", 1, &_lock);
+      const MACGrid &vel = *_args.getPtr<MACGrid>("vel", 2, &_lock);
+      _retval = getPyNone();  // plugin returns void; presumably this is Python's None
+      texcoordInflow(mesh, shape, vel);
+      _args.check();  // presumably reports arguments that were passed but never fetched -- TODO confirm
+    }
+    pbFinalizePlugin(parent, "texcoordInflow", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {  // C++ exceptions are reported via pbSetError; caller gets a null result
+    pbSetError("texcoordInflow", e.what());
+    return 0;
+  }
+}
+static const Pb::Register _RP_texcoordInflow("", "texcoordInflow", _W_1);  // ties the name to _W_1
+extern "C" {
+void PbRegister_texcoordInflow()  // only touches the static registration object above
+{
+  KEEP_UNUSED(_RP_texcoordInflow);  // presumably keeps the object from being discarded
+}
+}
+
+;
+
+//! Set the smoke amount of every sheet triangle whose face center lies inside the shape
+void meshSmokeInflow(VortexSheetMesh &mesh, const Shape *shape, Real amount)
+{
+  for (int tri = 0; tri < mesh.numTris(); ++tri) {
+    if (!shape->isInside(mesh.getFaceCenter(tri)))
+      continue;
+    mesh.sheet(tri).smokeAmount = amount;
+  }
+}
+static PyObject *_W_2(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // binding for meshSmokeInflow
+{
+  try {
+    PbArgs _args(_linargs, _kwds);  // built from the positional and keyword argument objects
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional flag, default 0
+    pbPreparePlugin(parent, "meshSmokeInflow", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // lives for this inner scope only; handed to each argument lookup
+      VortexSheetMesh &mesh = *_args.getPtr<VortexSheetMesh>("mesh", 0, &_lock);
+      const Shape *shape = _args.getPtr<Shape>("shape", 1, &_lock);
+      Real amount = _args.get<Real>("amount", 2, &_lock);  // fetched with get (no default given)
+      _retval = getPyNone();  // plugin returns void; presumably this is Python's None
+      meshSmokeInflow(mesh, shape, amount);
+      _args.check();  // presumably reports arguments that were passed but never fetched -- TODO confirm
+    }
+    pbFinalizePlugin(parent, "meshSmokeInflow", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {  // C++ exceptions are reported via pbSetError; caller gets a null result
+    pbSetError("meshSmokeInflow", e.what());
+    return 0;
+  }
+}
+static const Pb::Register _RP_meshSmokeInflow("", "meshSmokeInflow", _W_2);  // ties the name to _W_2
+extern "C" {
+void PbRegister_meshSmokeInflow()  // only touches the static registration object above
+{
+  KEEP_UNUSED(_RP_meshSmokeInflow);  // presumably keeps the object from being discarded
+}
+}
+
+struct KnAcceleration : public KernelBase {  // kernel: a = (v1 - v0) * idt for every linear index
+  KnAcceleration(MACGrid &a, const MACGrid &v1, const MACGrid &v0, const Real idt)
+      : KernelBase(&a, 0), a(a), v1(v1), v0(v0), idt(idt)
+  {
+    runMessage();
+    run();  // the kernel executes inside the constructor; constructing the object is the call
+  }
+  inline void op(
+      IndexInt idx, MACGrid &a, const MACGrid &v1, const MACGrid &v0, const Real idt) const
+  {
+    a[idx] = (v1[idx] - v0[idx]) * idt;  // difference scaled by idt (callers in this file pass 1/dt)
+  }
+  inline MACGrid &getArg0()  // getArgN / typeN: per-argument accessors emitted for each kernel argument
+  {
+    return a;
+  }
+  typedef MACGrid type0;
+  inline const MACGrid &getArg1()
+  {
+    return v1;
+  }
+  typedef MACGrid type1;
+  inline const MACGrid &getArg2()
+  {
+    return v0;
+  }
+  typedef MACGrid type2;
+  inline const Real &getArg3()
+  {
+    return idt;
+  }
+  typedef Real type3;
+  void runMessage()  // debug output only
+  {
+    debMsg("Executing kernel KnAcceleration ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const  // body run by TBB on one sub-range
+  {
+    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
+      op(idx, a, v1, v0, idt);
+  }
+  void run()
+  {
+    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);  // covers indices [0, size)
+  }
+  MACGrid &a;  // output grid, held by reference
+  const MACGrid &v1;
+  const MACGrid &v0;
+  const Real idt;
+};
+
+//! Add vorticity to vortex sheets based on buoyancy. The grid acceleration (vel - velOld) / dt
+//! enters the source term only if both 'vel' and 'velOld' are given; otherwise gravity alone.
+void vorticitySource(VortexSheetMesh &mesh,
+                     Vec3 gravity,
+                     const MACGrid *vel = NULL,
+                     const MACGrid *velOld = NULL,
+                     Real scale = 0.1,
+                     Real maxAmount = 0,
+                     Real mult = 1.0)
+{
+  Real dt = mesh.getParent()->getDt();
+  Real dx = mesh.getParent()->getDx();
+  MACGrid acceleration(mesh.getParent());
+  // velOld defaults to NULL and used to be dereferenced unchecked whenever vel was set
+  const bool useAccel = (vel != NULL) && (velOld != NULL);
+  if (useAccel)
+    KnAcceleration(acceleration, *vel, *velOld, 1.0 / dt);
+  const Real A = -1.0;
+  Real maxV = 0, meanV = 0;
+  for (int t = 0; t < mesh.numTris(); t++) {
+    Vec3 fn = mesh.getFaceNormal(t);
+    Vec3 source;
+    if (useAccel) {
+      Vec3 a = acceleration.getInterpolated(mesh.getFaceCenter(t));
+      source = A * cross(fn, a - gravity) * scale;
+    }
+    else {
+      source = A * cross(fn, -gravity) * scale;
+    }
+    if (mesh.isTriangleFixed(t))
+      source = 0;
+
+    mesh.sheet(t).vorticity *= mult;
+    mesh.sheet(t).vorticity += dt * source / dx;
+    // upper limit
+    Real v = norm(mesh.sheet(t).vorticity);
+    if (maxAmount > 0 && v > maxAmount)
+      mesh.sheet(t).vorticity *= maxAmount / v;
+
+    // stats (v is the magnitude before the clamp above was applied)
+    if (v > maxV)
+      maxV = v;
+    meanV += v;
+  }
+
+  cout << "vorticity: max " << maxV << " / mean " << meanV / mesh.numTris() << endl;
+}
+static PyObject *_W_3(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // binding for vorticitySource
+{
+  try {
+    PbArgs _args(_linargs, _kwds);  // built from the positional and keyword argument objects
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional flag, default 0
+    pbPreparePlugin(parent, "vorticitySource", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // lives for this inner scope only; handed to each argument lookup
+      VortexSheetMesh &mesh = *_args.getPtr<VortexSheetMesh>("mesh", 0, &_lock);
+      Vec3 gravity = _args.get<Vec3>("gravity", 1, &_lock);
+      const MACGrid *vel = _args.getPtrOpt<MACGrid>("vel", 2, NULL, &_lock);  // optional, NULL if absent
+      const MACGrid *velOld = _args.getPtrOpt<MACGrid>("velOld", 3, NULL, &_lock);  // optional
+      Real scale = _args.getOpt<Real>("scale", 4, 0.1, &_lock);
+      Real maxAmount = _args.getOpt<Real>("maxAmount", 5, 0, &_lock);
+      Real mult = _args.getOpt<Real>("mult", 6, 1.0, &_lock);
+      _retval = getPyNone();  // plugin returns void; presumably this is Python's None
+      vorticitySource(mesh, gravity, vel, velOld, scale, maxAmount, mult);
+      _args.check();  // presumably reports arguments that were passed but never fetched -- TODO confirm
+    }
+    pbFinalizePlugin(parent, "vorticitySource", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {  // C++ exceptions are reported via pbSetError; caller gets a null result
+    pbSetError("vorticitySource", e.what());
+    return 0;
+  }
+}
+static const Pb::Register _RP_vorticitySource("", "vorticitySource", _W_3);  // ties the name to _W_3
+extern "C" {
+void PbRegister_vorticitySource()  // only touches the static registration object above
+{
+  KEEP_UNUSED(_RP_vorticitySource);  // presumably keeps the object from being discarded
+}
+}
+
+void smoothVorticity(VortexSheetMesh &mesh, int iter = 1, Real sigma = 0.2, Real alpha = 0.8)
+{
+  // Gaussian-weighted averaging of the sheet vorticity with the triangles reached through each
+  // corner's 'opposite' link, run 'iter' times; result in vorticitySmoothed, scaled by 'alpha'.
+  const Real mult = -0.5 / sigma / sigma;
+  // pre-calculate positions and weights (slot 3 * i + c belongs to corner c of triangle i)
+  vector<Vec3> vort(mesh.numTris()), pos(mesh.numTris());
+  vector<Real> weights(3 * mesh.numTris());
+  vector<int> index(3 * mesh.numTris());
+  for (int i = 0; i < mesh.numTris(); i++) {
+    pos[i] = mesh.getFaceCenter(i);
+    mesh.sheet(i).vorticitySmoothed = mesh.sheet(i).vorticity;
+  }
+  for (int i = 0; i < mesh.numTris(); i++) {
+    for (int c = 0; c < 3; c++) {
+      int oc = mesh.corners(i, c).opposite;
+      if (oc >= 0) {
+        int t = mesh.corners(oc).tri;
+        weights[3 * i + c] = exp(normSquare(pos[t] - pos[i]) * mult);
+        index[3 * i + c] = t;
+      }
+      else {
+        weights[3 * i + c] = 0;
+        index[3 * i + c] = 0;
+      }
+    }
+  }
+  for (int it = 0; it < iter; ++it) {
+    // first, preload
+    for (int i = 0; i < mesh.numTris(); i++)
+      vort[i] = mesh.sheet(i).vorticitySmoothed;
+
+    for (int i = 0, idx = 0; i < mesh.numTris(); i++) {
+      // loop over adjacent tris; idx walks the 3 * i + c slots in fill order
+      Real sum = 1.0f;
+      Vec3 v = vort[i];
+      for (int c = 0; c < 3; c++, idx++) {
+        Real w = weights[idx];  // was weights[index[idx]]: a triangle index used as a slot index
+        v += w * vort[index[idx]];
+        sum += w;
+      }
+      mesh.sheet(i).vorticitySmoothed = v / sum;
+    }
+  }
+  for (int i = 0; i < mesh.numTris(); i++)
+    mesh.sheet(i).vorticitySmoothed *= alpha;
+}
+static PyObject *_W_4(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // binding for smoothVorticity
+{
+  try {
+    PbArgs _args(_linargs, _kwds);  // built from the positional and keyword argument objects
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional flag, default 0
+    pbPreparePlugin(parent, "smoothVorticity", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // lives for this inner scope only; handed to each argument lookup
+      VortexSheetMesh &mesh = *_args.getPtr<VortexSheetMesh>("mesh", 0, &_lock);
+      int iter = _args.getOpt<int>("iter", 1, 1, &_lock);  // defaults mirror the plugin signature
+      Real sigma = _args.getOpt<Real>("sigma", 2, 0.2, &_lock);
+      Real alpha = _args.getOpt<Real>("alpha", 3, 0.8, &_lock);
+      _retval = getPyNone();  // plugin returns void; presumably this is Python's None
+      smoothVorticity(mesh, iter, sigma, alpha);
+      _args.check();  // presumably reports arguments that were passed but never fetched -- TODO confirm
+    }
+    pbFinalizePlugin(parent, "smoothVorticity", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {  // C++ exceptions are reported via pbSetError; caller gets a null result
+    pbSetError("smoothVorticity", e.what());
+    return 0;
+  }
+}
+static const Pb::Register _RP_smoothVorticity("", "smoothVorticity", _W_4);  // ties the name to _W_4
+extern "C" {
+void PbRegister_smoothVorticity()  // only touches the static registration object above
+{
+  KEEP_UNUSED(_RP_smoothVorticity);  // presumably keeps the object from being discarded
+}
+}
+
+//! Seed Vortex Particles inside shape with K41 characteristics
+void VPseedK41(VortexParticleSystem &system,  // randomly adds vortex particles in cells inside 'shape'
+               const Shape *shape,
+               Real strength = 0,  // scales the magnitude of the seeded vorticity
+               Real sigma0 = 0.2,  // sigma0 / sigma1: the two ends between which sigma is sampled
+               Real sigma1 = 1.0,
+               Real probability = 1.0,  // multiplied by dt to form the per-cell seeding threshold
+               Real N = 3.0)  // exponent parameter of the sigma sampling below
+{
+  Grid<Real> temp(system.getParent());  // not read or written; only the iteration domain for FOR_IJK
+  const Real dt = system.getParent()->getDt();
+  static RandomStream rand(3489572);  // fixed seed; static, so the stream continues across calls
+  Real s0 = pow((Real)sigma0, (Real)(-N + 1.0));
+  Real s1 = pow((Real)sigma1, (Real)(-N + 1.0));
+
+  FOR_IJK(temp)
+  {
+    if (shape->isInsideGrid(i, j, k)) {
+      if (rand.getReal() < probability * dt) {
+        Real p = rand.getReal();
+        Real sigma = pow((1.0 - p) * s0 + p * s1, 1. / (-N + 1.0));  // NOTE(review): N == 1 divides by zero
+        Vec3 randDir(rand.getReal(), rand.getReal(), rand.getReal());  // NOTE(review): argument evaluation order is unspecified
+        Vec3 posUpd(i + rand.getReal(), j + rand.getReal(), k + rand.getReal());  // cell index plus random offset
+        normalize(randDir);  // NOTE(review): if getReal() is non-negative, directions stay in one octant -- confirm
+        Vec3 vorticity = randDir * strength * pow((Real)sigma, (Real)(-10. / 6. + N / 2.0));
+        system.add(VortexParticleData(posUpd, vorticity, sigma));
+      }
+    }
+  }
+}
+static PyObject *_W_5(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // binding for VPseedK41
+{
+  try {
+    PbArgs _args(_linargs, _kwds);  // built from the positional and keyword argument objects
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional flag, default 0
+    pbPreparePlugin(parent, "VPseedK41", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // lives for this inner scope only; handed to each argument lookup
+      VortexParticleSystem &system = *_args.getPtr<VortexParticleSystem>("system", 0, &_lock);
+      const Shape *shape = _args.getPtr<Shape>("shape", 1, &_lock);
+      Real strength = _args.getOpt<Real>("strength", 2, 0, &_lock);  // defaults mirror the plugin signature
+      Real sigma0 = _args.getOpt<Real>("sigma0", 3, 0.2, &_lock);
+      Real sigma1 = _args.getOpt<Real>("sigma1", 4, 1.0, &_lock);
+      Real probability = _args.getOpt<Real>("probability", 5, 1.0, &_lock);
+      Real N = _args.getOpt<Real>("N", 6, 3.0, &_lock);
+      _retval = getPyNone();  // plugin returns void; presumably this is Python's None
+      VPseedK41(system, shape, strength, sigma0, sigma1, probability, N);
+      _args.check();  // presumably reports arguments that were passed but never fetched -- TODO confirm
+    }
+    pbFinalizePlugin(parent, "VPseedK41", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {  // C++ exceptions are reported via pbSetError; caller gets a null result
+    pbSetError("VPseedK41", e.what());
+    return 0;
+  }
+}
+static const Pb::Register _RP_VPseedK41("", "VPseedK41", _W_5);  // ties the name to _W_5
+extern "C" {
+void PbRegister_VPseedK41()  // only touches the static registration object above
+{
+  KEEP_UNUSED(_RP_VPseedK41);  // presumably keeps the object from being discarded
+}
+}
+
+//! Vortex-in-cell integration
+
+void VICintegration(VortexSheetMesh &mesh,  // spreads sheet vorticity to a grid, then solves for vel
+                    Real sigma,  // kernel radius: cells farther than sigma from a face center get no weight
+                    Grid<Vec3> &vel,  // output: each component is overwritten with a scaled CG solution
+                    const FlagGrid &flags,
+                    Grid<Vec3> *vorticity = NULL,  // optional output grid for the mapped vorticity
+                    Real cgMaxIterFac = 1.5,  // iteration cap = factor * largest grid dimension
+                    Real cgAccuracy = 1e-3,
+                    Real scale = 0.01,  // multiplier applied to each solution before it is stored
+                    int precondition = 0)  // cast to GridCgInterface::PreconditionType below
+{
+
+  MuTime t0;  // NOTE(review): never read in this function
+  const Real fac = 16.0;  // experimental factor to balance out regularization
+
+  // if no vort grid is given, use a temporary one
+  Grid<Vec3> vortTemp(mesh.getParent());
+  Grid<Vec3> &vort = (vorticity) ? (*vorticity) : (vortTemp);
+  vort.clear();  // a caller-supplied vorticity grid is cleared as well
+
+  // map vorticity to grid using Peskin kernel
+  int sgi = ceil(sigma);  // half-width of the scanned stencil, in cells
+  Real pkfac = M_PI / sigma;  // NOTE(review): sigma == 0 divides by zero
+  const int numTris = mesh.numTris();
+  for (int t = 0; t < numTris; t++) {
+    Vec3 pos = mesh.getFaceCenter(t);
+    Vec3 v = mesh.sheet(t).vorticity * mesh.getFaceArea(t) * fac;  // area-weighted vorticity of the face
+
+    // inner kernel
+    // first, summate
+    Real sum = 0;
+    for (int i = -sgi; i < sgi; i++) {  // offsets -sgi .. sgi - 1
+      if (pos.x + i < 0 || (int)pos.x + i >= vort.getSizeX())
+        continue;
+      for (int j = -sgi; j < sgi; j++) {
+        if (pos.y + j < 0 || (int)pos.y + j >= vort.getSizeY())
+          continue;
+        for (int k = -sgi; k < sgi; k++) {
+          if (pos.z + k < 0 || (int)pos.z + k >= vort.getSizeZ())
+            continue;
+          Vec3i cell(pos.x + i, pos.y + j, pos.z + k);
+          if (!flags.isFluid(cell))  // only fluid cells take part in the normalization
+            continue;
+          Vec3 d = pos -
+                   Vec3(i + 0.5 + floor(pos.x), j + 0.5 + floor(pos.y), k + 0.5 + floor(pos.z));
+          Real dl = norm(d);  // distance from the face center to the cell center
+          if (dl > sigma)
+            continue;
+          // precalc Peskin kernel
+          sum += 1.0 + cos(dl * pkfac);
+        }
+      }
+    }
+    // then, apply normalized kernel
+    Real wnorm = 1.0 / sum;  // sum == 0 gives inf, but then the identical loop below adds nothing
+    for (int i = -sgi; i < sgi; i++) {  // same stencil and same skip conditions as the summation pass
+      if (pos.x + i < 0 || (int)pos.x + i >= vort.getSizeX())
+        continue;
+      for (int j = -sgi; j < sgi; j++) {
+        if (pos.y + j < 0 || (int)pos.y + j >= vort.getSizeY())
+          continue;
+        for (int k = -sgi; k < sgi; k++) {
+          if (pos.z + k < 0 || (int)pos.z + k >= vort.getSizeZ())
+            continue;
+          Vec3i cell(pos.x + i, pos.y + j, pos.z + k);
+          if (!flags.isFluid(cell))
+            continue;
+          Vec3 d = pos -
+                   Vec3(i + 0.5 + floor(pos.x), j + 0.5 + floor(pos.y), k + 0.5 + floor(pos.z));
+          Real dl = norm(d);
+          if (dl > sigma)
+            continue;
+          Real w = (1.0 + cos(dl * pkfac)) * wnorm;  // weights of one triangle add up to 1
+          vort(cell) += v * w;
+        }
+      }
+    }
+  }
+
+  // Prepare grids for poisson solve
+  Grid<Vec3> vortexCurl(mesh.getParent());
+  Grid<Real> rhs(mesh.getParent());
+  Grid<Real> solution(mesh.getParent());
+  Grid<Real> residual(mesh.getParent());
+  Grid<Real> search(mesh.getParent());
+  Grid<Real> temp1(mesh.getParent());
+  Grid<Real> A0(mesh.getParent());
+  Grid<Real> Ai(mesh.getParent());
+  Grid<Real> Aj(mesh.getParent());
+  Grid<Real> Ak(mesh.getParent());
+  Grid<Real> pca0(mesh.getParent());
+  Grid<Real> pca1(mesh.getParent());
+  Grid<Real> pca2(mesh.getParent());
+  Grid<Real> pca3(mesh.getParent());
+
+  MakeLaplaceMatrix(flags, A0, Ai, Aj, Ak);  // matrix grids are built once and reused for all components
+  CurlOp(vort, vortexCurl);
+
+  // Solve vector poisson equation
+  for (int c = 0; c < 3; c++) {  // one scalar solve per vector component
+    // construct rhs
+    if (vel.getType() & GridBase::TypeMAC)
+      GetShiftedComponent(vortexCurl, rhs, c);
+    else
+      GetComponent(vortexCurl, rhs, c);
+
+    // prepare CG solver
+    const int maxIter = (int)(cgMaxIterFac * vel.getSize().max());
+    GridCgInterface *gcg = new GridCg<ApplyMatrix>(
+        solution, rhs, residual, search, flags, temp1, &A0, &Ai, &Aj, &Ak);
+    gcg->setAccuracy(cgAccuracy);
+    gcg->setUseL2Norm(true);
+    gcg->setICPreconditioner(
+        (GridCgInterface::PreconditionType)precondition, &pca0, &pca1, &pca2, &pca3);
+
+    // iterations
+    for (int iter = 0; iter < maxIter; iter++) {
+      if (!gcg->iterate())
+        iter = maxIter;  // forces the loop to end once iterate() returns false
+    }
+    debMsg("VICintegration CG iterations:" << gcg->getIterations() << ", res:" << gcg->getSigma(),
+           1);
+    delete gcg;  // NOTE(review): not reached if anything above throws, so the solver would leak
+
+    // copy back
+    solution *= scale;
+    SetComponent(vel, solution, c);
+  }
+}
+static PyObject *_W_6(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // binding for VICintegration
+{
+  try {
+    PbArgs _args(_linargs, _kwds);  // built from the positional and keyword argument objects
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional flag, default 0
+    pbPreparePlugin(parent, "VICintegration", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // lives for this inner scope only; handed to each argument lookup
+      VortexSheetMesh &mesh = *_args.getPtr<VortexSheetMesh>("mesh", 0, &_lock);
+      Real sigma = _args.get<Real>("sigma", 1, &_lock);
+      Grid<Vec3> &vel = *_args.getPtr<Grid<Vec3>>("vel", 2, &_lock);
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 3, &_lock);
+      Grid<Vec3> *vorticity = _args.getPtrOpt<Grid<Vec3>>("vorticity", 4, NULL, &_lock);  // optional
+      Real cgMaxIterFac = _args.getOpt<Real>("cgMaxIterFac", 5, 1.5, &_lock);
+      Real cgAccuracy = _args.getOpt<Real>("cgAccuracy", 6, 1e-3, &_lock);
+      Real scale = _args.getOpt<Real>("scale", 7, 0.01, &_lock);
+      int precondition = _args.getOpt<int>("precondition", 8, 0, &_lock);
+      _retval = getPyNone();  // plugin returns void; presumably this is Python's None
+      VICintegration(
+          mesh, sigma, vel, flags, vorticity, cgMaxIterFac, cgAccuracy, scale, precondition);
+      _args.check();  // presumably reports arguments that were passed but never fetched -- TODO confirm
+    }
+    pbFinalizePlugin(parent, "VICintegration", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {  // C++ exceptions are reported via pbSetError; caller gets a null result
+    pbSetError("VICintegration", e.what());
+    return 0;
+  }
+}
+static const Pb::Register _RP_VICintegration("", "VICintegration", _W_6);  // ties the name to _W_6
+extern "C" {
+void PbRegister_VICintegration()  // only touches the static registration object above
+{
+  KEEP_UNUSED(_RP_VICintegration);  // presumably keeps the object from being discarded
+}
+}
+
+//! Obtain density field from levelset with linear gradient of size sigma over the interface
+void densityFromLevelset(const LevelsetGrid &phi,
+                         Grid<Real> &density,
+                         Real value = 1.0,
+                         Real sigma = 1.0)
+{
+  FOR_IJK(phi)
+  {
+    // remove boundary: the outermost two cell layers on every side are always empty
+    if (i < 2 || j < 2 || k < 2 || i >= phi.getSizeX() - 2 || j >= phi.getSizeY() - 2 ||
+        k >= phi.getSizeZ() - 2)
+      density(i, j, k) = 0;
+    else if (phi(i, j, k) < -sigma)
+      density(i, j, k) = value;
+    else if (phi(i, j, k) >= sigma)  // >= keeps sigma == 0 out of the division below
+      density(i, j, k) = 0;
+    else  // ramp from 'value' at phi = -sigma down to 0 at phi = +sigma (was 1.0 - phi)
+      density(i, j, k) = clamp(
+          (Real)(0.5 * value / sigma * (sigma - phi(i, j, k))), (Real)0.0, value);
+  }
+}
+static PyObject *_W_7(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // binding for densityFromLevelset
+{
+  try {
+    PbArgs _args(_linargs, _kwds);  // built from the positional and keyword argument objects
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional flag, default 0
+    pbPreparePlugin(parent, "densityFromLevelset", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // lives for this inner scope only; handed to each argument lookup
+      const LevelsetGrid &phi = *_args.getPtr<LevelsetGrid>("phi", 0, &_lock);
+      Grid<Real> &density = *_args.getPtr<Grid<Real>>("density", 1, &_lock);
+      Real value = _args.getOpt<Real>("value", 2, 1.0, &_lock);  // defaults mirror the plugin signature
+      Real sigma = _args.getOpt<Real>("sigma", 3, 1.0, &_lock);
+      _retval = getPyNone();  // plugin returns void; presumably this is Python's None
+      densityFromLevelset(phi, density, value, sigma);
+      _args.check();  // presumably reports arguments that were passed but never fetched -- TODO confirm
+    }
+    pbFinalizePlugin(parent, "densityFromLevelset", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {  // C++ exceptions are reported via pbSetError; caller gets a null result
+    pbSetError("densityFromLevelset", e.what());
+    return 0;
+  }
+}
+static const Pb::Register _RP_densityFromLevelset("", "densityFromLevelset", _W_7);
+extern "C" {
+void PbRegister_densityFromLevelset()  // only touches the static registration object above
+{
+  KEEP_UNUSED(_RP_densityFromLevelset);  // presumably keeps the object from being discarded
+}
+}
+
+}  // namespace Manta
diff --git a/extern/mantaflow/preprocessed/plugin/waveletturbulence.cpp b/extern/mantaflow/preprocessed/plugin/waveletturbulence.cpp
new file mode 100644
index 00000000000..9d3bdaa3f21
--- /dev/null
+++ b/extern/mantaflow/preprocessed/plugin/waveletturbulence.cpp
@@ -0,0 +1,1292 @@
+
+
+// DO NOT EDIT !
+// This file is generated using the MantaFlow preprocessor (prep generate).
+
+/******************************************************************************
+ *
+ * MantaFlow fluid solver framework
+ * Copyright 2011 Tobias Pfaff, Nils Thuerey
+ *
+ * This program is free software, distributed under the terms of the
+ * Apache License, Version 2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Functions for calculating wavelet turbulence,
+ * plus helpers to compute vorticity, and strain rate magnitude
+ *
+ ******************************************************************************/
+
+#include "vectorbase.h"
+#include "shapes.h"
+#include "commonkernels.h"
+#include "noisefield.h"
+
+using namespace std;
+
+namespace Manta {
+
+//*****************************************************************************
+
+// first some fairly generic interpolation functions for grids with multiple sizes
+
+//! Like calcGridSizeFactor, but positive components of optSize replace those of s2.
+//! retOff is in/out: the offset on entry, rescaled and shifted by half a factor on return.
+inline void calcGridSizeFactorMod(
+    Vec3i s1, Vec3i s2, Vec3i optSize, Vec3 scale, Vec3 &sourceFactor, Vec3 &retOff)
+{
+  // substitute every explicitly requested (positive) size component
+  for (int axis = 0; axis < 3; ++axis)
+    s2[axis] = (optSize[axis] > 0) ? optSize[axis] : s2[axis];
+  sourceFactor = calcGridSizeFactor(s1, s2) / scale;
+  const Vec3 halfFactor = sourceFactor * 0.5;
+  retOff = -retOff * sourceFactor + halfFactor;
+}
+
+void interpolateGrid(Grid<Real> &target,
+                     const Grid<Real> &source,
+                     Vec3 scale = Vec3(1.),
+                     Vec3 offset = Vec3(0.),
+                     Vec3i size = Vec3i(-1, -1, -1),
+                     int orderSpace = 1)
+{
+  // Resample 'source' into 'target'; factor and offset come from calcGridSizeFactorMod.
+  Vec3 factor(1.);
+  Vec3 srcOffset(offset);
+  calcGridSizeFactorMod(source.getSize(), target.getSize(), size, scale, factor, srcOffset);
+
+  // Mantaflow specialty: 'target' has to be the first kernel argument. The kernel takes its
+  // parent solver and its loop size from that grid, and since we write into 'target' the loop
+  // must cover exactly its cells (when calling the plugin from python this doesn't matter).
+
+  // the half-factor term inside srcOffset shifts eval points by half a small cell width
+  knInterpolateGridTempl<Real>(target, source, factor, srcOffset, orderSpace);
+}
+static PyObject *_W_0(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // binding for interpolateGrid
+{
+  try {
+    PbArgs _args(_linargs, _kwds);  // built from the positional and keyword argument objects
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional flag, default 0
+    pbPreparePlugin(parent, "interpolateGrid", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // lives for this inner scope only; handed to each argument lookup
+      Grid<Real> &target = *_args.getPtr<Grid<Real>>("target", 0, &_lock);
+      const Grid<Real> &source = *_args.getPtr<Grid<Real>>("source", 1, &_lock);
+      Vec3 scale = _args.getOpt<Vec3>("scale", 2, Vec3(1.), &_lock);  // defaults mirror the plugin signature
+      Vec3 offset = _args.getOpt<Vec3>("offset", 3, Vec3(0.), &_lock);
+      Vec3i size = _args.getOpt<Vec3i>("size", 4, Vec3i(-1, -1, -1), &_lock);
+      int orderSpace = _args.getOpt<int>("orderSpace", 5, 1, &_lock);
+      _retval = getPyNone();  // plugin returns void; presumably this is Python's None
+      interpolateGrid(target, source, scale, offset, size, orderSpace);
+      _args.check();  // presumably reports arguments that were passed but never fetched -- TODO confirm
+    }
+    pbFinalizePlugin(parent, "interpolateGrid", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {  // C++ exceptions are reported via pbSetError; caller gets a null result
+    pbSetError("interpolateGrid", e.what());
+    return 0;
+  }
+}
+static const Pb::Register _RP_interpolateGrid("", "interpolateGrid", _W_0);  // ties the name to _W_0
+extern "C" {
+void PbRegister_interpolateGrid()  // only touches the static registration object above
+{
+  KEEP_UNUSED(_RP_interpolateGrid);  // presumably keeps the object from being discarded
+}
+}
+
+void interpolateGridVec3(Grid<Vec3> &target,
+                         const Grid<Vec3> &source,
+                         Vec3 scale = Vec3(1.),
+                         Vec3 offset = Vec3(0.),
+                         Vec3i size = Vec3i(-1, -1, -1),
+                         int orderSpace = 1)
+{
+  Vec3 factor(1.), srcOffset(offset);  // both are rewritten by calcGridSizeFactorMod below
+  calcGridSizeFactorMod(source.getSize(), target.getSize(), size, scale, factor, srcOffset);
+  knInterpolateGridTempl<Vec3>(target, source, factor, srcOffset, orderSpace);  // target first
+}
+static PyObject *_W_1(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // binding for interpolateGridVec3
+{
+  try {
+    PbArgs _args(_linargs, _kwds);  // built from the positional and keyword argument objects
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional flag, default 0
+    pbPreparePlugin(parent, "interpolateGridVec3", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // lives for this inner scope only; handed to each argument lookup
+      Grid<Vec3> &target = *_args.getPtr<Grid<Vec3>>("target", 0, &_lock);
+      const Grid<Vec3> &source = *_args.getPtr<Grid<Vec3>>("source", 1, &_lock);
+      Vec3 scale = _args.getOpt<Vec3>("scale", 2, Vec3(1.), &_lock);  // defaults mirror the plugin signature
+      Vec3 offset = _args.getOpt<Vec3>("offset", 3, Vec3(0.), &_lock);
+      Vec3i size = _args.getOpt<Vec3i>("size", 4, Vec3i(-1, -1, -1), &_lock);
+      int orderSpace = _args.getOpt<int>("orderSpace", 5, 1, &_lock);
+      _retval = getPyNone();  // plugin returns void; presumably this is Python's None
+      interpolateGridVec3(target, source, scale, offset, size, orderSpace);
+      _args.check();  // presumably reports arguments that were passed but never fetched -- TODO confirm
+    }
+    pbFinalizePlugin(parent, "interpolateGridVec3", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {  // C++ exceptions are reported via pbSetError; caller gets a null result
+    pbSetError("interpolateGridVec3", e.what());
+    return 0;
+  }
+}
+static const Pb::Register _RP_interpolateGridVec3("", "interpolateGridVec3", _W_1);
+extern "C" {
+void PbRegister_interpolateGridVec3()  // only touches the static registration object above
+{
+  KEEP_UNUSED(_RP_interpolateGridVec3);  // presumably keeps the object from being discarded
+}
+}
+
+//! interpolate a mac velocity grid from one size to another size
+
+struct KnInterpolateMACGrid : public KernelBase {  // kernel: fills each target cell by sampling source
+  KnInterpolateMACGrid(MACGrid &target,
+                       const MACGrid &source,
+                       const Vec3 &sourceFactor,
+                       const Vec3 &off,
+                       int orderSpace)
+      : KernelBase(&target, 0),  // base is set up from target, the grid that gets written
+        target(target),
+        source(source),
+        sourceFactor(sourceFactor),
+        off(off),
+        orderSpace(orderSpace)
+  {
+    runMessage();
+    run();  // the kernel executes inside the constructor; constructing the object is the call
+  }
+  inline void op(int i,
+                 int j,
+                 int k,
+                 MACGrid &target,
+                 const MACGrid &source,
+                 const Vec3 &sourceFactor,
+                 const Vec3 &off,
+                 int orderSpace) const
+  {
+    Vec3 pos = Vec3(i, j, k) * sourceFactor + off;  // target cell index mapped to a source position
+
+    Real vx = source.getInterpolatedHi(pos - Vec3(0.5, 0, 0), orderSpace)[0];  // x sampled half a cell back in x
+    Real vy = source.getInterpolatedHi(pos - Vec3(0, 0.5, 0), orderSpace)[1];  // y sampled half a cell back in y
+    Real vz = 0.f;  // stays zero unless the source is 3D
+    if (source.is3D())
+      vz = source.getInterpolatedHi(pos - Vec3(0, 0, 0.5), orderSpace)[2];
+
+    target(i, j, k) = Vec3(vx, vy, vz);
+  }
+  inline MACGrid &getArg0()  // getArgN / typeN: per-argument accessors emitted for each kernel argument
+  {
+    return target;
+  }
+  typedef MACGrid type0;
+  inline const MACGrid &getArg1()
+  {
+    return source;
+  }
+  typedef MACGrid type1;
+  inline const Vec3 &getArg2()
+  {
+    return sourceFactor;
+  }
+  typedef Vec3 type2;
+  inline const Vec3 &getArg3()
+  {
+    return off;
+  }
+  typedef Vec3 type3;
+  inline int &getArg4()
+  {
+    return orderSpace;
+  }
+  typedef int type4;
+  void runMessage()  // debug output only
+  {
+    debMsg("Executing kernel KnInterpolateMACGrid ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const  // body run by TBB on one sub-range
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {  // 3D case: the range holds k, full j/i loops run inside
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, target, source, sourceFactor, off, orderSpace);
+    }
+    else {  // 2D case: the range holds j and k is fixed to 0
+      const int k = 0;
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = 0; i < _maxX; i++)
+          op(i, j, k, target, source, sourceFactor, off, orderSpace);
+    }
+  }
+  void run()
+  {
+    if (maxZ > 1)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);  // split over z slices
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);  // split over rows
+  }
+  MACGrid &target;
+  const MACGrid &source;
+  const Vec3 &sourceFactor;  // reference member: the caller's Vec3 must stay alive while the kernel runs
+  const Vec3 &off;  // reference member, same lifetime requirement
+  int orderSpace;
+};
+
+void interpolateMACGrid(MACGrid &target,
+                        const MACGrid &source,
+                        Vec3 scale = Vec3(1.),
+                        Vec3 offset = Vec3(0.),
+                        Vec3i size = Vec3i(-1, -1, -1),
+                        int orderSpace = 1)
+{
+  Vec3 factor(1.), srcOffset(offset);  // both are rewritten by calcGridSizeFactorMod below
+  calcGridSizeFactorMod(source.getSize(), target.getSize(), size, scale, factor, srcOffset);
+  KnInterpolateMACGrid(target, source, factor, srcOffset, orderSpace);  // runs on construction
+}
+// Python binding of interpolateMACGrid(): unpacks the arguments, runs it, reports errors.
+static PyObject *_W_2(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs pyArgs(_linargs, _kwds);
+    FluidSolver *solver = pyArgs.obtainParent();
+    const bool timed = !pyArgs.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(solver, "interpolateMACGrid", timed);
+    PyObject *result = 0;
+    {
+      ArgLocker guard;
+      MACGrid &dstGrid = *pyArgs.getPtr<MACGrid>("target", 0, &guard);
+      const MACGrid &srcGrid = *pyArgs.getPtr<MACGrid>("source", 1, &guard);
+      Vec3 scaleArg = pyArgs.getOpt<Vec3>("scale", 2, Vec3(1.), &guard);
+      Vec3 offsetArg = pyArgs.getOpt<Vec3>("offset", 3, Vec3(0.), &guard);
+      Vec3i sizeArg = pyArgs.getOpt<Vec3i>("size", 4, Vec3i(-1, -1, -1), &guard);
+      int order = pyArgs.getOpt<int>("orderSpace", 5, 1, &guard);
+      result = getPyNone();
+      interpolateMACGrid(dstGrid, srcGrid, scaleArg, offsetArg, sizeArg, order);
+      pyArgs.check();
+    }
+    pbFinalizePlugin(solver, "interpolateMACGrid", timed);
+    return result;
+  } catch (const std::exception &err) {
+    pbSetError("interpolateMACGrid", err.what());
+    return 0;
+  }
+}
+static const Pb::Register _RP_interpolateMACGrid("", "interpolateMACGrid", _W_2);  // ties the plugin name to _W_2
+extern "C" {
+void PbRegister_interpolateMACGrid()  // C-linkage hook; it only references the object above
+{
+  KEEP_UNUSED(_RP_interpolateMACGrid);  // presumably keeps the registration from being dropped
+}
+}
+
+//*****************************************************************************
+
+//! Kernel that adds curl noise to the fluid cells of a Vec3 grid. Simplified version: the noise
+//! is sampled at the cell center, with no position scaling and no UV lookup.
+struct knApplySimpleNoiseVec3 : public KernelBase {
+  //! Stores the arguments and runs the kernel right away (see run()).
+  //! \param flags   cell flags; only cells reported as fluid are modified
+  //! \param target  grid the noise is added to
+  //! \param noise   noise field that is sampled via evaluateCurl()
+  //! \param scale   multiplier applied to every noise sample
+  //! \param weight  optional per-cell multiplier, may be NULL
+  knApplySimpleNoiseVec3(const FlagGrid &flags,
+                         Grid<Vec3> &target,
+                         const WaveletNoiseField &noise,
+                         Real scale,
+                         const Grid<Real> *weight)
+      : KernelBase(&flags, 0),
+        flags(flags),
+        target(target),
+        noise(noise),
+        scale(scale),
+        weight(weight)
+  {
+    runMessage();
+    run();
+  }
+
+  //! Work done for one cell: target += curl noise at the cell center * scale * weight, where
+  //! the weight is 1 when no weight grid was supplied. Cells that are not fluid are skipped.
+  inline void op(int i,
+                 int j,
+                 int k,
+                 const FlagGrid &flags,
+                 Grid<Vec3> &target,
+                 const WaveletNoiseField &noise,
+                 Real scale,
+                 const Grid<Real> *weight) const
+  {
+    if (!flags.isFluid(i, j, k))
+      return;
+    const Real factor = weight ? (*weight)(i, j, k) : Real(1);
+    target(i, j, k) += noise.evaluateCurl(Vec3(i, j, k) + Vec3(0.5)) * scale * factor;
+  }
+
+  // Positional accessors / type aliases per kernel argument (presumably for framework code).
+  inline const FlagGrid &getArg0() { return flags; }
+  using type0 = FlagGrid;
+  inline Grid<Vec3> &getArg1() { return target; }
+  using type1 = Grid<Vec3>;
+  inline const WaveletNoiseField &getArg2() { return noise; }
+  using type2 = WaveletNoiseField;
+  inline Real &getArg3() { return scale; }
+  using type3 = Real;
+  inline const Grid<Real> *getArg4() { return weight; }
+  using type4 = Grid<Real>;
+
+  //! Emits debug messages with the kernel name and the index range it covers.
+  void runMessage()
+  {
+    debMsg("Executing kernel knApplySimpleNoiseVec3 ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  }
+
+  //! Loop body handed to tbb::parallel_for: applies op() to the cells of one sub-range.
+  void operator()(const tbb::blocked_range<IndexInt> &range) const
+  {
+    const int xEnd = maxX, yEnd = maxY;
+    const int first = range.begin(), last = static_cast<int>(range.end());
+    if (maxZ > 1) {
+      // 3D: the sub-range enumerates z slices, each visited completely.
+      for (int k = first; k != last; k++)
+        for (int j = 0; j < yEnd; j++)
+          for (int i = 0; i < xEnd; i++)
+            op(i, j, k, flags, target, noise, scale, weight);
+      return;
+    }
+    // 2D: the sub-range enumerates rows of the single slice k = 0.
+    for (int j = first; j != last; j++)
+      for (int i = 0; i < xEnd; i++)
+        op(i, j, 0, flags, target, noise, scale, weight);
+  }
+
+  //! Runs the kernel in parallel over z (when maxZ > 1) or over y otherwise.
+  void run()
+  {
+    const bool volumetric = maxZ > 1;
+    tbb::parallel_for(
+        tbb::blocked_range<IndexInt>(volumetric ? minZ : 0, volumetric ? maxZ : maxY), *this);
+  }
+
+  const FlagGrid &flags;
+  Grid<Vec3> &target;
+  const WaveletNoiseField &noise;
+  Real scale;
+  const Grid<Real> *weight;
+};
+
+void applySimpleNoiseVec3(const FlagGrid &flags,  // forwarded to the kernel unchanged
+                          Grid<Vec3> &target,  // grid the kernel adds the noise to
+                          const WaveletNoiseField &noise,
+                          Real scale = 1.0,  // multiplier for every noise sample
+                          const Grid<Real> *weight = NULL)  // optional per-cell multiplier
+{
+  // note - passing a MAC grid here is slightly inaccurate, we should evaluate each component
+  // separately
+  knApplySimpleNoiseVec3(flags, target, noise, scale, weight);  // constructing the kernel runs it
+}
+// Python binding of applySimpleNoiseVec3(): unpacks the arguments, runs it, reports errors.
+static PyObject *_W_3(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs pyArgs(_linargs, _kwds);
+    FluidSolver *solver = pyArgs.obtainParent();
+    const bool timed = !pyArgs.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(solver, "applySimpleNoiseVec3", timed);
+    PyObject *result = 0;
+    {
+      ArgLocker guard;
+      const FlagGrid &flagGrid = *pyArgs.getPtr<FlagGrid>("flags", 0, &guard);
+      Grid<Vec3> &dstGrid = *pyArgs.getPtr<Grid<Vec3>>("target", 1, &guard);
+      const WaveletNoiseField &noiseField = *pyArgs.getPtr<WaveletNoiseField>("noise", 2, &guard);
+      Real strength = pyArgs.getOpt<Real>("scale", 3, 1.0, &guard);
+      const Grid<Real> *weightGrid = pyArgs.getPtrOpt<Grid<Real>>("weight", 4, NULL, &guard);
+      result = getPyNone();
+      applySimpleNoiseVec3(flagGrid, dstGrid, noiseField, strength, weightGrid);
+      pyArgs.check();
+    }
+    pbFinalizePlugin(solver, "applySimpleNoiseVec3", timed);
+    return result;
+  } catch (const std::exception &err) {
+    pbSetError("applySimpleNoiseVec3", err.what());
+    return 0;
+  }
+}
+static const Pb::Register _RP_applySimpleNoiseVec3("", "applySimpleNoiseVec3", _W_3);  // ties the plugin name to _W_3
+extern "C" {
+void PbRegister_applySimpleNoiseVec3()  // C-linkage hook; it only references the object above
+{
+  KEEP_UNUSED(_RP_applySimpleNoiseVec3);  // presumably keeps the registration from being dropped
+}
+}
+
+//! Kernel that adds scalar noise to the fluid cells of a Real grid; it mirrors
+//! knApplySimpleNoiseVec3 but samples the noise with evaluate() instead of evaluateCurl().
+struct knApplySimpleNoiseReal : public KernelBase {
+  //! Stores the arguments and runs the kernel right away (see run()).
+  //! \param flags   cell flags; only cells reported as fluid are modified
+  //! \param target  grid the noise is added to
+  //! \param noise   noise field that is sampled via evaluate()
+  //! \param scale   multiplier applied to every noise sample
+  //! \param weight  optional per-cell multiplier, may be NULL
+  knApplySimpleNoiseReal(const FlagGrid &flags,
+                         Grid<Real> &target,
+                         const WaveletNoiseField &noise,
+                         Real scale,
+                         const Grid<Real> *weight)
+      : KernelBase(&flags, 0),
+        flags(flags),
+        target(target),
+        noise(noise),
+        scale(scale),
+        weight(weight)
+  {
+    runMessage();
+    run();
+  }
+
+  //! Work done for one cell: target += noise at the cell center * scale * weight, where the
+  //! weight is 1 when no weight grid was supplied. Cells that are not fluid are skipped.
+  inline void op(int i,
+                 int j,
+                 int k,
+                 const FlagGrid &flags,
+                 Grid<Real> &target,
+                 const WaveletNoiseField &noise,
+                 Real scale,
+                 const Grid<Real> *weight) const
+  {
+    if (!flags.isFluid(i, j, k))
+      return;
+    const Real factor = weight ? (*weight)(i, j, k) : Real(1);
+    target(i, j, k) += noise.evaluate(Vec3(i, j, k) + Vec3(0.5)) * scale * factor;
+  }
+
+  // Positional accessors / type aliases per kernel argument (presumably for framework code).
+  inline const FlagGrid &getArg0() { return flags; }
+  using type0 = FlagGrid;
+  inline Grid<Real> &getArg1() { return target; }
+  using type1 = Grid<Real>;
+  inline const WaveletNoiseField &getArg2() { return noise; }
+  using type2 = WaveletNoiseField;
+  inline Real &getArg3() { return scale; }
+  using type3 = Real;
+  inline const Grid<Real> *getArg4() { return weight; }
+  using type4 = Grid<Real>;
+
+  //! Emits debug messages with the kernel name and the index range it covers.
+  void runMessage()
+  {
+    debMsg("Executing kernel knApplySimpleNoiseReal ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  }
+
+  //! Loop body handed to tbb::parallel_for: applies op() to the cells of one sub-range.
+  void operator()(const tbb::blocked_range<IndexInt> &range) const
+  {
+    const int xEnd = maxX, yEnd = maxY;
+    const int first = range.begin(), last = static_cast<int>(range.end());
+    if (maxZ > 1) {
+      // 3D: the sub-range enumerates z slices, each visited completely.
+      for (int k = first; k != last; k++)
+        for (int j = 0; j < yEnd; j++)
+          for (int i = 0; i < xEnd; i++)
+            op(i, j, k, flags, target, noise, scale, weight);
+      return;
+    }
+    // 2D: the sub-range enumerates rows of the single slice k = 0.
+    for (int j = first; j != last; j++)
+      for (int i = 0; i < xEnd; i++)
+        op(i, j, 0, flags, target, noise, scale, weight);
+  }
+
+  //! Runs the kernel in parallel over z (when maxZ > 1) or over y otherwise.
+  void run()
+  {
+    const bool volumetric = maxZ > 1;
+    tbb::parallel_for(
+        tbb::blocked_range<IndexInt>(volumetric ? minZ : 0, volumetric ? maxZ : maxY), *this);
+  }
+
+  const FlagGrid &flags;
+  Grid<Real> &target;
+  const WaveletNoiseField &noise;
+  Real scale;
+  const Grid<Real> *weight;
+};
+
+void applySimpleNoiseReal(const FlagGrid &flags,  // forwarded to the kernel unchanged
+                          Grid<Real> &target,  // grid the kernel adds the noise to
+                          const WaveletNoiseField &noise,
+                          Real scale = 1.0,  // multiplier for every noise sample
+                          const Grid<Real> *weight = NULL)  // optional per-cell multiplier
+{
+  knApplySimpleNoiseReal(flags, target, noise, scale, weight);  // constructing the kernel runs it
+}
+// Python binding of applySimpleNoiseReal(): unpacks the arguments, runs it, reports errors.
+static PyObject *_W_4(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs pyArgs(_linargs, _kwds);
+    FluidSolver *solver = pyArgs.obtainParent();
+    const bool timed = !pyArgs.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(solver, "applySimpleNoiseReal", timed);
+    PyObject *result = 0;
+    {
+      ArgLocker guard;
+      const FlagGrid &flagGrid = *pyArgs.getPtr<FlagGrid>("flags", 0, &guard);
+      Grid<Real> &dstGrid = *pyArgs.getPtr<Grid<Real>>("target", 1, &guard);
+      const WaveletNoiseField &noiseField = *pyArgs.getPtr<WaveletNoiseField>("noise", 2, &guard);
+      Real strength = pyArgs.getOpt<Real>("scale", 3, 1.0, &guard);
+      const Grid<Real> *weightGrid = pyArgs.getPtrOpt<Grid<Real>>("weight", 4, NULL, &guard);
+      result = getPyNone();
+      applySimpleNoiseReal(flagGrid, dstGrid, noiseField, strength, weightGrid);
+      pyArgs.check();
+    }
+    pbFinalizePlugin(solver, "applySimpleNoiseReal", timed);
+    return result;
+  } catch (const std::exception &err) {
+    pbSetError("applySimpleNoiseReal", err.what());
+    return 0;
+  }
+}
+static const Pb::Register _RP_applySimpleNoiseReal("", "applySimpleNoiseReal", _W_4);  // ties the plugin name to _W_4
+extern "C" {
+void PbRegister_applySimpleNoiseReal()  // C-linkage hook; it only references the object above
+{
+  KEEP_UNUSED(_RP_applySimpleNoiseReal);  // presumably keeps the registration from being dropped
+}
+}
+
+//! Kernel that adds vector-based (curl) wavelet noise to the fluid cells of the target grid.
+//! This is the version with more functionality - supports uv grids, and on-the-fly interpolation
+//! of input grids (uv and weight are sampled with getInterpolated() when uvInterpol is set).
+//! The constructor stores all arguments and runs the kernel immediately.
+struct knApplyNoiseVec3 : public KernelBase {
+  knApplyNoiseVec3(const FlagGrid &flags,  // only cells reported as fluid are modified
+                   Grid<Vec3> &target,  // grid the noise is added to
+                   const WaveletNoiseField &noise,  // sampled via evaluateCurl()
+                   Real scale,  // multiplier for every noise sample
+                   Real scaleSpatial,  // multiplier for the sampling position
+                   const Grid<Real> *weight,  // optional per-cell multiplier, may be NULL
+                   const Grid<Vec3> *uv,  // optional grid of sampling positions, may be NULL
+                   bool uvInterpol,  // true: read uv / weight via getInterpolated()
+                   const Vec3 &sourceFactor)  // scales cell indices before interpolated lookups
+      : KernelBase(&flags, 0),
+        flags(flags),
+        target(target),
+        noise(noise),
+        scale(scale),
+        scaleSpatial(scaleSpatial),
+        weight(weight),
+        uv(uv),
+        uvInterpol(uvInterpol),
+        sourceFactor(sourceFactor)
+  {
+    runMessage();
+    run();
+  }
+  inline void op(int i,  // per-cell work; (i, j, k) is the cell index in the target grid
+                 int j,
+                 int k,
+                 const FlagGrid &flags,
+                 Grid<Vec3> &target,
+                 const WaveletNoiseField &noise,
+                 Real scale,
+                 Real scaleSpatial,
+                 const Grid<Real> *weight,
+                 const Grid<Vec3> *uv,
+                 bool uvInterpol,
+                 const Vec3 &sourceFactor) const
+  {
+    if (!flags.isFluid(i, j, k))
+      return;
+
+    // get the weighting (1 without a weight grid); interpolate it if uvInterpol is set
+    Real w = 1;
+    if (weight) {
+      if (!uvInterpol) {
+        w = (*weight)(i, j, k);
+      }
+      else {
+        w = weight->getInterpolated(Vec3(i, j, k) * sourceFactor);
+      }
+    }
+
+    // compute position where to evaluate the noise: the cell center unless a uv grid is given
+    Vec3 pos = Vec3(i, j, k) + Vec3(0.5);
+    if (uv) {
+      if (!uvInterpol) {
+        pos = (*uv)(i, j, k);
+      }
+      else {
+        pos = uv->getInterpolated(Vec3(i, j, k) * sourceFactor);
+        // uv coordinates are in local space - so we need to adjust the values of the positions
+        pos /= sourceFactor;
+      }
+    }
+    pos *= scaleSpatial;
+
+    Vec3 noiseVec3 = noise.evaluateCurl(pos) * scale * w;
+    // noiseVec3=pos; // debug, shows the interpolated positions instead of the noise
+    target(i, j, k) += noiseVec3;
+  }
+  inline const FlagGrid &getArg0()  // getArgN / typeN: positional access to the kernel arguments
+  {
+    return flags;
+  }
+  typedef FlagGrid type0;
+  inline Grid<Vec3> &getArg1()
+  {
+    return target;
+  }
+  typedef Grid<Vec3> type1;
+  inline const WaveletNoiseField &getArg2()
+  {
+    return noise;
+  }
+  typedef WaveletNoiseField type2;
+  inline Real &getArg3()
+  {
+    return scale;
+  }
+  typedef Real type3;
+  inline Real &getArg4()
+  {
+    return scaleSpatial;
+  }
+  typedef Real type4;
+  inline const Grid<Real> *getArg5()
+  {
+    return weight;
+  }
+  typedef Grid<Real> type5;
+  inline const Grid<Vec3> *getArg6()
+  {
+    return uv;
+  }
+  typedef Grid<Vec3> type6;
+  inline bool &getArg7()
+  {
+    return uvInterpol;
+  }
+  typedef bool type7;
+  inline const Vec3 &getArg8()
+  {
+    return sourceFactor;
+  }
+  typedef Vec3 type8;
+  void runMessage()  // debug output: kernel name and the index range it covers
+  {
+    debMsg("Executing kernel knApplyNoiseVec3 ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const  // body for tbb::parallel_for
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {  // 3D: the sub-range enumerates z slices
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i,
+               j,
+               k,
+               flags,
+               target,
+               noise,
+               scale,
+               scaleSpatial,
+               weight,
+               uv,
+               uvInterpol,
+               sourceFactor);
+    }
+    else {  // 2D: the sub-range enumerates rows of the single slice k = 0
+      const int k = 0;
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = 0; i < _maxX; i++)
+          op(i,
+             j,
+             k,
+             flags,
+             target,
+             noise,
+             scale,
+             scaleSpatial,
+             weight,
+             uv,
+             uvInterpol,
+             sourceFactor);
+    }
+  }
+  void run()  // parallel over z when maxZ > 1, over y otherwise
+  {
+    if (maxZ > 1)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
+  }
+  const FlagGrid &flags;
+  Grid<Vec3> &target;
+  const WaveletNoiseField &noise;
+  Real scale;
+  Real scaleSpatial;
+  const Grid<Real> *weight;
+  const Grid<Vec3> *uv;
+  bool uvInterpol;
+  const Vec3 &sourceFactor;  // stored as a reference to the caller's object
+};
+
+void applyNoiseVec3(const FlagGrid &flags,
+                    Grid<Vec3> &target,
+                    const WaveletNoiseField &noise,
+                    Real scale = 1.0,
+                    Real scaleSpatial = 1.0,
+                    const Grid<Real> *weight = NULL,
+                    const Grid<Vec3> *uv = NULL)
+{
+  // Adds curl noise to target via knApplyNoiseVec3. The uv grid (or, without one, the weight
+  // grid) may have a different resolution than target; in that case the kernel interpolates.
+
+  bool needsInterpolation = false;
+  // conversion factor handed to the kernel (only used there if interpolation is needed),
+  // applied to both the uv and the weight grid
+  Vec3 gridFactor(1.);
+  if (uv || weight) {
+    // the uv grid takes precedence as size reference
+    const Vec3i referenceSize = uv ? uv->getSize() : weight->getSize();
+    needsInterpolation = (target.getSize() != referenceSize);
+    gridFactor = calcGridSizeFactor(referenceSize, target.getSize());
+  }
+  if (uv && weight)
+    assertMsg(uv->getSize() == weight->getSize(), "UV and weight grid have to match!");
+
+  // note - passing a MAC grid here is slightly inaccurate, we should evaluate each component
+  // separately
+  knApplyNoiseVec3(
+      flags, target, noise, scale, scaleSpatial, weight, uv, needsInterpolation, gridFactor);
+}
+// Python binding of applyNoiseVec3(): unpacks the arguments, runs it, reports errors.
+static PyObject *_W_5(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs pyArgs(_linargs, _kwds);
+    FluidSolver *solver = pyArgs.obtainParent();
+    const bool timed = !pyArgs.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(solver, "applyNoiseVec3", timed);
+    PyObject *result = 0;
+    {
+      ArgLocker guard;
+      const FlagGrid &flagGrid = *pyArgs.getPtr<FlagGrid>("flags", 0, &guard);
+      Grid<Vec3> &dstGrid = *pyArgs.getPtr<Grid<Vec3>>("target", 1, &guard);
+      const WaveletNoiseField &noiseField = *pyArgs.getPtr<WaveletNoiseField>("noise", 2, &guard);
+      Real strength = pyArgs.getOpt<Real>("scale", 3, 1.0, &guard);
+      Real spatialScale = pyArgs.getOpt<Real>("scaleSpatial", 4, 1.0, &guard);
+      const Grid<Real> *weightGrid = pyArgs.getPtrOpt<Grid<Real>>("weight", 5, NULL, &guard);
+      const Grid<Vec3> *uvGrid = pyArgs.getPtrOpt<Grid<Vec3>>("uv", 6, NULL, &guard);
+      result = getPyNone();
+      applyNoiseVec3(flagGrid, dstGrid, noiseField, strength, spatialScale, weightGrid, uvGrid);
+      pyArgs.check();
+    }
+    pbFinalizePlugin(solver, "applyNoiseVec3", timed);
+    return result;
+  } catch (const std::exception &err) {
+    pbSetError("applyNoiseVec3", err.what());
+    return 0;
+  }
+}
+static const Pb::Register _RP_applyNoiseVec3("", "applyNoiseVec3", _W_5);  // ties the plugin name to _W_5
+extern "C" {
+void PbRegister_applyNoiseVec3()  // C-linkage hook; it only references the object above
+{
+  KEEP_UNUSED(_RP_applyNoiseVec3);  // presumably keeps the registration from being dropped
+}
+}
+
+//! Kernel computing the energy 0.5 * |v|^2 of a staggered velocity field per cell, using the
+//! velocity returned by getCentered(). Cells that are not fluid are set to zero.
+struct KnApplyComputeEnergy : public KernelBase {
+  KnApplyComputeEnergy(const FlagGrid &flags, const MACGrid &vel, Grid<Real> &energy)
+      : KernelBase(&flags, 0), flags(flags), vel(vel), energy(energy)
+  {
+    runMessage();
+    run();
+  }
+
+  //! Work done for one cell: writes the energy of cell (i, j, k) into the energy grid.
+  inline void op(
+      int i, int j, int k, const FlagGrid &flags, const MACGrid &vel, Grid<Real> &energy) const
+  {
+    if (!flags.isFluid(i, j, k)) {
+      energy(i, j, k) = 0.f;
+      return;
+    }
+    Vec3 v = vel.getCentered(i, j, k);
+    energy(i, j, k) = 0.5 * (v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
+  }
+
+  // Positional accessors / type aliases per kernel argument (presumably for framework code).
+  inline const FlagGrid &getArg0() { return flags; }
+  using type0 = FlagGrid;
+  inline const MACGrid &getArg1() { return vel; }
+  using type1 = MACGrid;
+  inline Grid<Real> &getArg2() { return energy; }
+  using type2 = Grid<Real>;
+
+  //! Emits debug messages with the kernel name and the index range it covers.
+  void runMessage()
+  {
+    debMsg("Executing kernel KnApplyComputeEnergy ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  }
+
+  //! Loop body handed to tbb::parallel_for: applies op() to the cells of one sub-range.
+  void operator()(const tbb::blocked_range<IndexInt> &range) const
+  {
+    const int xEnd = maxX, yEnd = maxY;
+    const int first = range.begin(), last = static_cast<int>(range.end());
+    if (maxZ > 1) {
+      // 3D: the sub-range enumerates z slices, each visited completely.
+      for (int k = first; k != last; k++)
+        for (int j = 0; j < yEnd; j++)
+          for (int i = 0; i < xEnd; i++)
+            op(i, j, k, flags, vel, energy);
+      return;
+    }
+    // 2D: the sub-range enumerates rows of the single slice k = 0.
+    for (int j = first; j != last; j++)
+      for (int i = 0; i < xEnd; i++)
+        op(i, j, 0, flags, vel, energy);
+  }
+
+  //! Runs the kernel in parallel over z (when maxZ > 1) or over y otherwise.
+  void run()
+  {
+    const bool volumetric = maxZ > 1;
+    tbb::parallel_for(
+        tbb::blocked_range<IndexInt>(volumetric ? minZ : 0, volumetric ? maxZ : maxY), *this);
+  }
+  const FlagGrid &flags;
+  const MACGrid &vel;
+  Grid<Real> &energy;
+};
+
+void computeEnergy(const FlagGrid &flags, const MACGrid &vel, Grid<Real> &energy)  // fills energy from vel
+{
+  KnApplyComputeEnergy(flags, vel, energy);  // constructing the kernel runs it
+}
+// Python binding of computeEnergy(): unpacks the arguments, runs it, reports errors.
+static PyObject *_W_6(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs pyArgs(_linargs, _kwds);
+    FluidSolver *solver = pyArgs.obtainParent();
+    const bool timed = !pyArgs.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(solver, "computeEnergy", timed);
+    PyObject *result = 0;
+    {
+      ArgLocker guard;
+      const FlagGrid &flagGrid = *pyArgs.getPtr<FlagGrid>("flags", 0, &guard);
+      const MACGrid &velGrid = *pyArgs.getPtr<MACGrid>("vel", 1, &guard);
+      Grid<Real> &energyGrid = *pyArgs.getPtr<Grid<Real>>("energy", 2, &guard);
+      result = getPyNone();
+      computeEnergy(flagGrid, velGrid, energyGrid);
+      pyArgs.check();
+    }
+    pbFinalizePlugin(solver, "computeEnergy", timed);
+    return result;
+  } catch (const std::exception &err) {
+    pbSetError("computeEnergy", err.what());
+    return 0;
+  }
+}
+static const Pb::Register _RP_computeEnergy("", "computeEnergy", _W_6);  // ties the plugin name to _W_6
+extern "C" {
+void PbRegister_computeEnergy()  // C-linkage hook; it only references the object above
+{
+  KEEP_UNUSED(_RP_computeEnergy);  // presumably keeps the registration from being dropped
+}
+}
+
+void computeWaveletCoeffs(Grid<Real> &input)  // forwards input to computeCoefficients()
+{
+  Grid<Real> scratchA(input.getParent()), scratchB(input.getParent());  // work grids for the call
+  WaveletNoiseField::computeCoefficients(input, scratchA, scratchB);
+}
+// Python binding of computeWaveletCoeffs(): unpacks the argument, runs it, reports errors.
+static PyObject *_W_7(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs pyArgs(_linargs, _kwds);
+    FluidSolver *solver = pyArgs.obtainParent();
+    const bool timed = !pyArgs.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(solver, "computeWaveletCoeffs", timed);
+    PyObject *result = 0;
+    {
+      ArgLocker guard;
+      Grid<Real> &inputGrid = *pyArgs.getPtr<Grid<Real>>("input", 0, &guard);
+      result = getPyNone();
+      computeWaveletCoeffs(inputGrid);
+      pyArgs.check();
+    }
+    pbFinalizePlugin(solver, "computeWaveletCoeffs", timed);
+    return result;
+  } catch (const std::exception &err) {
+    pbSetError("computeWaveletCoeffs", err.what());
+    return 0;
+  }
+}
+static const Pb::Register _RP_computeWaveletCoeffs("", "computeWaveletCoeffs", _W_7);  // ties the plugin name to _W_7
+extern "C" {
+void PbRegister_computeWaveletCoeffs()  // C-linkage hook; it only references the object above
+{
+  KEEP_UNUSED(_RP_computeWaveletCoeffs);  // presumably keeps the registration from being dropped
+}
+}
+
+// note - almost the same as for vorticity confinement
+void computeVorticity(const MACGrid &vel, Grid<Vec3> &vorticity, Grid<Real> *norm = NULL)
+{
+  Grid<Vec3> centeredVel(vel.getParent());  // scratch grid handed to GetCentered and CurlOp
+  GetCentered(centeredVel, vel);
+  CurlOp(centeredVel, vorticity);
+  if (norm != NULL)  // the norm output is optional
+    GridNorm(*norm, vorticity);
+}
+// Python binding of computeVorticity(): unpacks the arguments, runs it, reports errors.
+static PyObject *_W_8(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs pyArgs(_linargs, _kwds);
+    FluidSolver *solver = pyArgs.obtainParent();
+    const bool timed = !pyArgs.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(solver, "computeVorticity", timed);
+    PyObject *result = 0;
+    {
+      ArgLocker guard;
+      const MACGrid &velGrid = *pyArgs.getPtr<MACGrid>("vel", 0, &guard);
+      Grid<Vec3> &vortGrid = *pyArgs.getPtr<Grid<Vec3>>("vorticity", 1, &guard);
+      Grid<Real> *normGrid = pyArgs.getPtrOpt<Grid<Real>>("norm", 2, NULL, &guard);
+      result = getPyNone();
+      computeVorticity(velGrid, vortGrid, normGrid);
+      pyArgs.check();
+    }
+    pbFinalizePlugin(solver, "computeVorticity", timed);
+    return result;
+  } catch (const std::exception &err) {
+    pbSetError("computeVorticity", err.what());
+    return 0;
+  }
+}
+static const Pb::Register _RP_computeVorticity("", "computeVorticity", _W_8);  // ties the plugin name to _W_8
+extern "C" {
+void PbRegister_computeVorticity()  // C-linkage hook; it only references the object above
+{
+  KEEP_UNUSED(_RP_computeVorticity);  // presumably keeps the registration from being dropped
+}
+}
+
+//! Kernel that writes sum_ij Sij^2 of the strain rate tensor Sij to every cell of prod.
+//! note - very similar to KnComputeProductionStrain, but for use as wavelet turb weighting
+struct KnComputeStrainRateMag : public KernelBase {
+  KnComputeStrainRateMag(const MACGrid &vel, const Grid<Vec3> &velCenter, Grid<Real> &prod)
+      : KernelBase(&vel, 1), vel(vel), velCenter(velCenter), prod(prod)
+  {
+    runMessage();
+    run();
+  }
+
+  //! Work done for one cell, with Sij = 1/2 * (dU_i/dx_j + dU_j/dx_i).
+  inline void op(
+      int i, int j, int k, const MACGrid &vel, const Grid<Vec3> &velCenter, Grid<Real> &prod) const
+  {
+    const bool volumetric = vel.is3D();
+    // diagonal entries: per axis, the next cell's component minus this cell's component
+    Vec3 diag = Vec3(vel(i + 1, j, k).x, vel(i, j + 1, k).y, 0.) - vel(i, j, k);
+    // z terms only exist in 3D; otherwise uz keeps its default-constructed value
+    // (presumably zero - TODO confirm against the Vec3 default constructor)
+    Vec3 uz;
+    if (volumetric) {
+      diag[2] += vel(i, j, k + 1).z;
+      uz = 0.5 * (velCenter(i, j, k + 1) - velCenter(i, j, k - 1));
+    }
+    else {
+      diag[2] = 0.;
+    }
+    Vec3 ux = 0.5 * (velCenter(i + 1, j, k) - velCenter(i - 1, j, k));
+    Vec3 uy = 0.5 * (velCenter(i, j + 1, k) - velCenter(i, j - 1, k));
+    // off-diagonal entries; each occurs twice in the symmetric tensor, hence the factor 2.0
+    Real S12 = 0.5 * (ux.y + uy.x);
+    Real S13 = 0.5 * (ux.z + uz.x);
+    Real S23 = 0.5 * (uy.z + uz.y);
+    prod(i, j, k) = square(diag.x) + square(diag.y) + square(diag.z) + 2.0 * square(S12) +
+                    2.0 * square(S13) + 2.0 * square(S23);
+  }
+
+  // Positional accessors / type aliases per kernel argument (presumably for framework code).
+  inline const MACGrid &getArg0() { return vel; }
+  using type0 = MACGrid;
+  inline const Grid<Vec3> &getArg1() { return velCenter; }
+  using type1 = Grid<Vec3>;
+  inline Grid<Real> &getArg2() { return prod; }
+  using type2 = Grid<Real>;
+
+  //! Emits debug messages with the kernel name and the index range it covers.
+  void runMessage()
+  {
+    debMsg("Executing kernel KnComputeStrainRateMag ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  }
+
+  //! Loop body for tbb::parallel_for; x and y start at 1, so the lowest index is skipped.
+  void operator()(const tbb::blocked_range<IndexInt> &range) const
+  {
+    const int xEnd = maxX, yEnd = maxY;
+    const int first = range.begin(), last = static_cast<int>(range.end());
+    if (maxZ > 1) {
+      for (int k = first; k != last; k++)
+        for (int j = 1; j < yEnd; j++)
+          for (int i = 1; i < xEnd; i++)
+            op(i, j, k, vel, velCenter, prod);
+      return;
+    }
+    for (int j = first; j != last; j++)
+      for (int i = 1; i < xEnd; i++)
+        op(i, j, 0, vel, velCenter, prod);
+  }
+
+  //! Runs the kernel in parallel over z (when maxZ > 1) or over the rows 1 .. maxY otherwise.
+  void run()
+  {
+    const bool volumetric = maxZ > 1;
+    tbb::parallel_for(
+        tbb::blocked_range<IndexInt>(volumetric ? minZ : 1, volumetric ? maxZ : maxY), *this);
+  }
+  const MACGrid &vel;
+  const Grid<Vec3> &velCenter;
+  Grid<Real> &prod;
+};
+void computeStrainRateMag(const MACGrid &vel, Grid<Real> &mag)
+{
+  Grid<Vec3> centeredVel(vel.getParent());  // scratch grid handed to GetCentered
+  GetCentered(centeredVel, vel);
+  KnComputeStrainRateMag(vel, centeredVel, mag);  // constructing the kernel runs it on mag
+}
+// Python binding of computeStrainRateMag(): unpacks the arguments, runs it, reports errors.
+static PyObject *_W_9(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs pyArgs(_linargs, _kwds);
+    FluidSolver *solver = pyArgs.obtainParent();
+    const bool timed = !pyArgs.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(solver, "computeStrainRateMag", timed);
+    PyObject *result = 0;
+    {
+      ArgLocker guard;
+      const MACGrid &velGrid = *pyArgs.getPtr<MACGrid>("vel", 0, &guard);
+      Grid<Real> &magGrid = *pyArgs.getPtr<Grid<Real>>("mag", 1, &guard);
+      result = getPyNone();
+      computeStrainRateMag(velGrid, magGrid);
+      pyArgs.check();
+    }
+    pbFinalizePlugin(solver, "computeStrainRateMag", timed);
+    return result;
+  } catch (const std::exception &err) {
+    pbSetError("computeStrainRateMag", err.what());
+    return 0;
+  }
+}
+static const Pb::Register _RP_computeStrainRateMag("", "computeStrainRateMag", _W_9);  // ties the plugin name to _W_9
+extern "C" {
+void PbRegister_computeStrainRateMag()  // C-linkage hook; it only references the object above
+{
+  KEEP_UNUSED(_RP_computeStrainRateMag);  // presumably keeps the registration from being dropped
+}
+}
+
+// Extrapolate grid values from cells flagged flagFrom into neighboring cells flagged flagTo,
+// advancing one cell layer per pass (by default from fluid into obstacle cells).
+template<class T>
+void extrapolSimpleFlagsHelper(const FlagGrid &flags,
+                               Grid<T> &val,
+                               int distance = 4,
+                               int flagFrom = FlagGrid::TypeFluid,
+                               int flagTo = FlagGrid::TypeObstacle)
+{
+  // Face neighbor offsets; the z pair comes last so that 2D grids use the first four only.
+  const Vec3i offsets[6] = {Vec3i(1, 0, 0),
+                            Vec3i(-1, 0, 0),
+                            Vec3i(0, 1, 0),
+                            Vec3i(0, -1, 0),
+                            Vec3i(0, 0, 1),
+                            Vec3i(0, 0, -1)};
+  const int numOffsets = 2 * (flags.is3D() ? 3 : 2);
+
+  // layer(cell) is 1 for source cells and d + 1 for cells filled in pass d; every other cell
+  // keeps the value left by clear() (presumably 0).
+  Grid<int> layer(flags.getParent());
+  layer.clear();
+  bool foundTarget = false;
+  FOR_IJK_BND(flags, 0)
+  {
+    const int cellFlags = flags(i, j, k);
+    if (cellFlags & flagFrom)
+      layer(Vec3i(i, j, k)) = 1;
+    foundTarget = foundTarget || (cellFlags & flagTo);
+  }
+  // optimization, skip extrapolation if we dont have any cells to extrapolate to
+  if (!foundTarget) {
+    debMsg("No target cells found, skipping extrapolation", 1);
+    return;
+  }
+
+  // One pass per layer: pass d fills untouched target cells next to cells of layer d.
+  for (int d = 1; d < 1 + distance; ++d) {
+    // TODO, parallelize
+    FOR_IJK_BND(flags, 1)
+    {
+      // skip cells that already hold a value and cells that are not extrapolation targets
+      if (layer(i, j, k) != 0 || !(flags(i, j, k) & flagTo))
+        continue;
+
+      // average the values of all neighbors that belong to layer d
+      const Vec3i cell(i, j, k);
+      int count = 0;
+      T sum = 0.;
+      for (int n = 0; n < numOffsets; ++n) {
+        const Vec3i neighbor = cell + offsets[n];
+        if (layer(neighbor) != d)
+          continue;
+        sum += val(neighbor);
+        count++;
+      }
+
+      if (count > 0) {
+        layer(cell) = d + 1;
+        val(cell) = sum / count;
+      }
+    }
+  }
+}
+
+void extrapolateSimpleFlags(const FlagGrid &flags,
+                            GridBase *val,
+                            int distance = 4,
+                            int flagFrom = FlagGrid::TypeFluid,
+                            int flagTo = FlagGrid::TypeObstacle)
+{
+  // Dispatch on the runtime grid type to the matching instantiation of the helper; the first
+  // matching type bit (Real, then int, then Vec3) wins, anything else is reported as an error.
+  const auto gridType = val->getType();
+  if (gridType & GridBase::TypeReal)
+    extrapolSimpleFlagsHelper<Real>(flags, *((Grid<Real> *)val), distance, flagFrom, flagTo);
+  else if (gridType & GridBase::TypeInt)
+    extrapolSimpleFlagsHelper<int>(flags, *((Grid<int> *)val), distance, flagFrom, flagTo);
+  else if (gridType & GridBase::TypeVec3)
+    extrapolSimpleFlagsHelper<Vec3>(flags, *((Grid<Vec3> *)val), distance, flagFrom, flagTo);
+  else
+    errMsg("extrapolateSimpleFlags: Grid Type is not supported (only int, Real, Vec3)");
+}
+// Python binding of extrapolateSimpleFlags(): unpacks the arguments, runs it, reports errors.
+static PyObject *_W_10(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs pyArgs(_linargs, _kwds);
+    FluidSolver *solver = pyArgs.obtainParent();
+    const bool timed = !pyArgs.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(solver, "extrapolateSimpleFlags", timed);
+    PyObject *result = 0;
+    {
+      ArgLocker guard;
+      const FlagGrid &flagGrid = *pyArgs.getPtr<FlagGrid>("flags", 0, &guard);
+      GridBase *valueGrid = pyArgs.getPtr<GridBase>("val", 1, &guard);
+      int maxDistance = pyArgs.getOpt<int>("distance", 2, 4, &guard);
+      int fromMask = pyArgs.getOpt<int>("flagFrom", 3, FlagGrid::TypeFluid, &guard);
+      int toMask = pyArgs.getOpt<int>("flagTo", 4, FlagGrid::TypeObstacle, &guard);
+      result = getPyNone();
+      extrapolateSimpleFlags(flagGrid, valueGrid, maxDistance, fromMask, toMask);
+      pyArgs.check();
+    }
+    pbFinalizePlugin(solver, "extrapolateSimpleFlags", timed);
+    return result;
+  } catch (const std::exception &err) {
+    pbSetError("extrapolateSimpleFlags", err.what());
+    return 0;
+  }
+}
+static const Pb::Register _RP_extrapolateSimpleFlags("", "extrapolateSimpleFlags", _W_10);  // ties the plugin name to _W_10
+extern "C" {
+void PbRegister_extrapolateSimpleFlags()  // C-linkage hook; it only references the object above
+{
+  KEEP_UNUSED(_RP_extrapolateSimpleFlags);  // presumably keeps the registration from being dropped
+}
+}
+
+//! Convert vel to a centered grid, compute its curl and hand component comp (0, 1 or 2) to vort.
+void getCurl(const MACGrid &vel, Grid<Real> &vort, int comp)
+{
+  // comp selects one of the three curl components, so reject any other value up front
+  assertMsg(comp >= 0 && comp <= 2, "getCurl: comp has to be 0, 1 or 2!");
+  Grid<Vec3> velCenter(vel.getParent()), curl(vel.getParent());  // scratch grids
+  GetCentered(velCenter, vel);
+  CurlOp(velCenter, curl);
+  GetComponent(curl, vort, comp);
+}
+// Python binding of getCurl(): unpacks the arguments, runs it, reports errors.
+static PyObject *_W_11(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+{
+  try {
+    PbArgs pyArgs(_linargs, _kwds);
+    FluidSolver *solver = pyArgs.obtainParent();
+    const bool timed = !pyArgs.getOpt<bool>("notiming", -1, 0);
+    pbPreparePlugin(solver, "getCurl", timed);
+    PyObject *result = 0;
+    {
+      ArgLocker guard;
+      const MACGrid &velGrid = *pyArgs.getPtr<MACGrid>("vel", 0, &guard);
+      Grid<Real> &vortGrid = *pyArgs.getPtr<Grid<Real>>("vort", 1, &guard);
+      int component = pyArgs.get<int>("comp", 2, &guard);
+      result = getPyNone();
+      getCurl(velGrid, vortGrid, component);
+      pyArgs.check();
+    }
+    pbFinalizePlugin(solver, "getCurl", timed);
+    return result;
+  } catch (const std::exception &err) {
+    pbSetError("getCurl", err.what());
+    return 0;
+  }
+}
+static const Pb::Register _RP_getCurl("", "getCurl", _W_11);  // ties the plugin name to _W_11
+extern "C" {
+void PbRegister_getCurl()  // C-linkage hook; it only references the object above
+{
+  KEEP_UNUSED(_RP_getCurl);  // presumably keeps the registration from being dropped
+}
+}
+
+}  // namespace Manta
diff --git a/extern/mantaflow/preprocessed/plugin/waves.cpp b/extern/mantaflow/preprocessed/plugin/waves.cpp
new file mode 100644
index 00000000000..7745dce4711
--- /dev/null
+++ b/extern/mantaflow/preprocessed/plugin/waves.cpp
@@ -0,0 +1,483 @@
+
+
+// DO NOT EDIT !
+// This file is generated using the MantaFlow preprocessor (prep generate).
+
+/******************************************************************************
+ *
+ * MantaFlow fluid solver framework
+ * Copyright 2011 Tobias Pfaff, Nils Thuerey
+ *
+ * This program is free software, distributed under the terms of the
+ * Apache License, Version 2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Wave equation
+ *
+ ******************************************************************************/
+
+#include "levelset.h"
+#include "commonkernels.h"
+#include "particle.h"
+#include "conjugategrad.h"
+#include <cmath>
+
+using namespace std;
+
+namespace Manta {
+
+/******************************************************************************
+ *
+ * explicit integration
+ *
+ ******************************************************************************/
+
+struct knCalcSecDeriv2d : public KernelBase {  // kernel: writes a 5-point x/y second-difference stencil of v into ret
+  knCalcSecDeriv2d(const Grid<Real> &v, Grid<Real> &ret) : KernelBase(&v, 1), v(v), ret(ret)
+  {
+    runMessage();
+    run();  // the kernel executes inside the constructor
+  }
+  inline void op(int i, int j, int k, const Grid<Real> &v, Grid<Real> &ret) const  // per cell: sum of the four x/y neighbours minus 4x the centre; no z neighbours, no scaling applied
+  {
+    ret(i, j, k) = (-4. * v(i, j, k) + v(i - 1, j, k) + v(i + 1, j, k) + v(i, j - 1, k) +
+                    v(i, j + 1, k));
+  }
+  inline const Grid<Real> &getArg0()  // accessor for the first kernel argument (v)
+  {
+    return v;
+  }
+  typedef Grid<Real> type0;
+  inline Grid<Real> &getArg1()  // accessor for the second kernel argument (ret)
+  {
+    return ret;
+  }
+  typedef Grid<Real> type1;
+  void runMessage()  // debug output only: kernel name and index range
+  {
+    debMsg("Executing kernel knCalcSecDeriv2d ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const  // body called by tbb::parallel_for for one sub-range
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {  // the range indexes k; j and i start at 1
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, v, ret);
+    }
+    else {  // maxZ <= 1: the range indexes j and k is fixed to 0
+      const int k = 0;
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = 1; i < _maxX; i++)
+          op(i, j, k, v, ret);
+    }
+  }
+  void run()  // picks the parallel dimension to match the two branches of operator()
+  {
+    if (maxZ > 1)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);  // split over k
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);  // split over j
+  }
+  const Grid<Real> &v;  // input grid (read only)
+  Grid<Real> &ret;  // output grid
+};
+;
+
+//! run the knCalcSecDeriv2d kernel: second derivative of v for the wave equation, stored in curv
+void calcSecDeriv2d(const Grid<Real> &v, Grid<Real> &curv)
+{
+  knCalcSecDeriv2d secDerivKernel(v, curv);  // all work happens in the kernel constructor
+}
+static PyObject *_W_0(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // generated Python entry point that forwards to calcSecDeriv2d(v, curv)
+{
+  try {
+    PbArgs _args(_linargs, _kwds);  // wraps the positional and keyword arguments of the call
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional "notiming" argument, default 0
+    pbPreparePlugin(parent, "calcSecDeriv2d", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // lives only inside this block, so it is destroyed before pbFinalizePlugin runs
+      const Grid<Real> &v = *_args.getPtr<Grid<Real>>("v", 0, &_lock);  // each argument is fetched by name and position index
+      Grid<Real> &curv = *_args.getPtr<Grid<Real>>("curv", 1, &_lock);
+      _retval = getPyNone();  // calcSecDeriv2d returns void; the result object is set before the call
+      calcSecDeriv2d(v, curv);
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "calcSecDeriv2d", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("calcSecDeriv2d", e.what());  // hand the message to pbSetError under the plugin name
+    return 0;  // a null PyObject* is returned after the error was recorded
+  }
+}
+static const Pb::Register _RP_calcSecDeriv2d("", "calcSecDeriv2d", _W_0);  // file-static registrar: pairs the name "calcSecDeriv2d" with wrapper _W_0 (first argument is an empty string)
+extern "C" {  // C linkage for the hook below
+void PbRegister_calcSecDeriv2d()
+{
+  KEEP_UNUSED(_RP_calcSecDeriv2d);  // only references the registrar; presumably keeps it from being discarded -- confirm in the KEEP_UNUSED definition
+}
+}
+
+// mass conservation: reduction kernel that adds up the values of a grid into a double
+
+struct knTotalSum : public KernelBase {  // used as the body object of tbb::parallel_reduce
+  knTotalSum(Grid<Real> &h) : KernelBase(&h, 1), h(h), sum(0)
+  {
+    runMessage();
+    run();  // the reduction executes inside the constructor; read the result from sum afterwards
+  }
+  inline void op(int i, int j, int k, Grid<Real> &h, double &sum)  // adds one cell value to the running total
+  {
+    sum += h(i, j, k);
+  }
+  inline operator double()  // implicit conversion to the accumulated sum
+  {
+    return sum;
+  }
+  inline double &getRet()  // reference to the accumulated sum
+  {
+    return sum;
+  }
+  inline Grid<Real> &getArg0()  // accessor for the kernel argument (h)
+  {
+    return h;
+  }
+  typedef Grid<Real> type0;
+  void runMessage()  // debug output only: kernel name and index range
+  {
+    debMsg("Executing kernel knTotalSum ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r)  // non-const: accumulates into this body's own sum
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {  // the range indexes k; j and i start at 1, so cells with index 0 are not summed
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, h, sum);
+    }
+    else {  // maxZ <= 1: the range indexes j and k is fixed to 0
+      const int k = 0;
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = 1; i < _maxX; i++)
+          op(i, j, k, h, sum);
+    }
+  }
+  void run()  // picks the parallel dimension to match the two branches of operator()
+  {
+    if (maxZ > 1)
+      tbb::parallel_reduce(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);  // split over k
+    else
+      tbb::parallel_reduce(tbb::blocked_range<IndexInt>(1, maxY), *this);  // split over j
+  }
+  knTotalSum(knTotalSum &o, tbb::split) : KernelBase(o), h(o.h), sum(0)  // splitting constructor: same grid, partial sum restarts at 0
+  {
+  }
+  void join(const knTotalSum &o)  // folds the partial sum of another body into this one
+  {
+    sum += o.sum;
+  }
+  Grid<Real> &h;  // grid being summed
+  double sum;  // accumulated in double even though the grid stores Real
+};
+
+//! add up the grid values visited by the knTotalSum kernel (for wave equation solves)
+Real totalSum(Grid<Real> &height)
+{
+  const double accumulated = knTotalSum(height).getRet();  // the reduction runs in the constructor
+  return accumulated;
+}
+static PyObject *_W_1(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // generated Python entry point that forwards to totalSum(height)
+{
+  try {
+    PbArgs _args(_linargs, _kwds);  // wraps the positional and keyword arguments of the call
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional "notiming" argument, default 0
+    pbPreparePlugin(parent, "totalSum", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // lives only inside this block, so it is destroyed before pbFinalizePlugin runs
+      Grid<Real> &height = *_args.getPtr<Grid<Real>>("height", 0, &_lock);
+      _retval = toPy(totalSum(height));  // the Real result is converted to a Python object by toPy
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "totalSum", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("totalSum", e.what());  // hand the message to pbSetError under the plugin name
+    return 0;  // a null PyObject* is returned after the error was recorded
+  }
+}
+static const Pb::Register _RP_totalSum("", "totalSum", _W_1);  // file-static registrar: pairs the name "totalSum" with wrapper _W_1 (first argument is an empty string)
+extern "C" {  // C linkage for the hook below
+void PbRegister_totalSum()
+{
+  KEEP_UNUSED(_RP_totalSum);  // only references the registrar; presumably keeps it from being discarded -- confirm in the KEEP_UNUSED definition
+}
+}
+
+//! scale a grid so its knTotalSum equals target (for wave equation solves); grid is left as is if that sum is 0
+void normalizeSumTo(Grid<Real> &height, Real target)
+{
+  knTotalSum ts(height);
+  if (ts.sum != 0.)  // a zero sum has no finite scale factor: dividing would spread inf/NaN over the whole grid
+    height.multConst(static_cast<Real>(target / ts.sum));
+}
+static PyObject *_W_2(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // generated Python entry point that forwards to normalizeSumTo(height, target)
+{
+  try {
+    PbArgs _args(_linargs, _kwds);  // wraps the positional and keyword arguments of the call
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional "notiming" argument, default 0
+    pbPreparePlugin(parent, "normalizeSumTo", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // lives only inside this block, so it is destroyed before pbFinalizePlugin runs
+      Grid<Real> &height = *_args.getPtr<Grid<Real>>("height", 0, &_lock);  // each argument is fetched by name and position index
+      Real target = _args.get<Real>("target", 1, &_lock);
+      _retval = getPyNone();  // normalizeSumTo returns void; the result object is set before the call
+      normalizeSumTo(height, target);
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "normalizeSumTo", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("normalizeSumTo", e.what());  // hand the message to pbSetError under the plugin name
+    return 0;  // a null PyObject* is returned after the error was recorded
+  }
+}
+static const Pb::Register _RP_normalizeSumTo("", "normalizeSumTo", _W_2);  // file-static registrar: pairs the name "normalizeSumTo" with wrapper _W_2 (first argument is an empty string)
+extern "C" {  // C linkage for the hook below
+void PbRegister_normalizeSumTo()
+{
+  KEEP_UNUSED(_RP_normalizeSumTo);  // only references the registrar; presumably keeps it from being discarded -- confirm in the KEEP_UNUSED definition
+}
+}
+
+/******************************************************************************
+ *
+ * implicit time integration
+ *
+ ******************************************************************************/
+
+//! Kernel: Construct the right-hand side of the poisson equation: rhs = 2 ut - utm1, plus s times the x/y stencil of ut when crankNic is set
+
+struct MakeRhsWE : public KernelBase {
+  MakeRhsWE(const FlagGrid &flags,
+            Grid<Real> &rhs,
+            const Grid<Real> &ut,
+            const Grid<Real> &utm1,
+            Real s,
+            bool crankNic = false)
+      : KernelBase(&flags, 1), flags(flags), rhs(rhs), ut(ut), utm1(utm1), s(s), crankNic(crankNic)
+  {
+    runMessage();
+    run();  // the kernel executes inside the constructor
+  }
+  inline void op(int i,
+                 int j,
+                 int k,
+                 const FlagGrid &flags,
+                 Grid<Real> &rhs,
+                 const Grid<Real> &ut,
+                 const Grid<Real> &utm1,
+                 Real s,
+                 bool crankNic = false) const
+  {
+    rhs(i, j, k) = (2. * ut(i, j, k) - utm1(i, j, k));  // flags is passed in but not read by this body
+    if (crankNic) {  // s is only used in this branch; the stencil has x/y neighbours only
+      rhs(i, j, k) += s * (-4. * ut(i, j, k) + 1. * ut(i - 1, j, k) + 1. * ut(i + 1, j, k) +
+                           1. * ut(i, j - 1, k) + 1. * ut(i, j + 1, k));
+    }
+  }
+  inline const FlagGrid &getArg0()  // accessors getArg0..getArg5 return the kernel arguments in constructor order
+  {
+    return flags;
+  }
+  typedef FlagGrid type0;
+  inline Grid<Real> &getArg1()
+  {
+    return rhs;
+  }
+  typedef Grid<Real> type1;
+  inline const Grid<Real> &getArg2()
+  {
+    return ut;
+  }
+  typedef Grid<Real> type2;
+  inline const Grid<Real> &getArg3()
+  {
+    return utm1;
+  }
+  typedef Grid<Real> type3;
+  inline Real &getArg4()
+  {
+    return s;
+  }
+  typedef Real type4;
+  inline bool &getArg5()
+  {
+    return crankNic;
+  }
+  typedef bool type5;
+  void runMessage()  // debug output only: kernel name and index range
+  {
+    debMsg("Executing kernel MakeRhsWE ", 3);
+    debMsg("Kernel range"
+               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
+           4);
+  };
+  void operator()(const tbb::blocked_range<IndexInt> &__r) const  // body called by tbb::parallel_for for one sub-range
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {  // the range indexes k; j and i start at 1
+      for (int k = __r.begin(); k != (int)__r.end(); k++)
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, flags, rhs, ut, utm1, s, crankNic);
+    }
+    else {  // maxZ <= 1: the range indexes j and k is fixed to 0
+      const int k = 0;
+      for (int j = __r.begin(); j != (int)__r.end(); j++)
+        for (int i = 1; i < _maxX; i++)
+          op(i, j, k, flags, rhs, ut, utm1, s, crankNic);
+    }
+  }
+  void run()  // picks the parallel dimension to match the two branches of operator()
+  {
+    if (maxZ > 1)
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);  // split over k
+    else
+      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);  // split over j
+  }
+  const FlagGrid &flags;
+  Grid<Real> &rhs;  // output grid
+  const Grid<Real> &ut;  // read as the current state in op
+  const Grid<Real> &utm1;  // read as the previous state in op
+  Real s;  // weight of the stencil term
+  bool crankNic;  // enables the stencil term
+};
+
+//! do one implicit time step of the wave equation with a CG solve: on return utm1 holds the
+//! previous ut and ut the new solution (note, out grid only there for debugging... could be removed)
+
+void cgSolveWE(const FlagGrid &flags,
+               Grid<Real> &ut,
+               Grid<Real> &utm1,
+               Grid<Real> &out,
+               bool crankNic = false,
+               Real cSqr = 0.25,
+               Real cgMaxIterFac = 1.5,
+               Real cgAccuracy = 1e-5)
+{
+  // reserve temp grids
+  FluidSolver *parent = flags.getParent();
+  Grid<Real> rhs(parent);
+  Grid<Real> residual(parent);
+  Grid<Real> search(parent);
+  Grid<Real> A0(parent);
+  Grid<Real> Ai(parent);
+  Grid<Real> Aj(parent);
+  Grid<Real> Ak(parent);
+  Grid<Real> tmp(parent);
+  // solution...
+  out.clear();
+
+  // setup matrix and boundaries: all entries from MakeLaplaceMatrix are scaled by s, diagonal gets +1
+  MakeLaplaceMatrix(flags, A0, Ai, Aj, Ak);
+  Real dt = parent->getDt();
+  Real s = dt * dt * cSqr * 0.5;  // NOTE(review): the 0.5 is applied for crankNic == false too -- confirm
+  FOR_IJK(flags)
+  {
+    Ai(i, j, k) *= s;
+    Aj(i, j, k) *= s;
+    Ak(i, j, k) *= s;
+    A0(i, j, k) *= s;
+    A0(i, j, k) += 1.;
+  }
+
+  // init right hand side
+  rhs.clear();
+  // h=dt
+  // plain:   rhs = 2 ut - ut-1 ;                A: (h2 c2/ dx)=s   ,  (1+4s)uij + s ui-1j + ...
+  // Cr.Nic.: rhs = 2 ut - ut-1 + h^2c^2/2 b ;   A: (h2 c2/2 dx)=s   ,  (1+4s)uij + s ui-1j + ...
+  MakeRhsWE kernMakeRhs(flags, rhs, ut, utm1, s, crankNic);
+
+  const int maxIter = (int)(cgMaxIterFac * flags.getSize().max()) * (flags.is3D() ? 1 : 4);
+  GridCgInterface *gcg;
+  if (flags.is3D())
+    gcg = new GridCg<ApplyMatrix>(out, rhs, residual, search, flags, tmp, &A0, &Ai, &Aj, &Ak);
+  else
+    gcg = new GridCg<ApplyMatrix2D>(out, rhs, residual, search, flags, tmp, &A0, &Ai, &Aj, &Ak);
+
+  // local owner: deletes the solver on every exit path, including an exception thrown below
+  struct CgOwner {
+    GridCgInterface *ptr;
+    ~CgOwner() { delete ptr; }
+  } cgOwner = {gcg};
+  gcg->setAccuracy(cgAccuracy);
+
+  // no preconditioning for now...
+  for (int iter = 0; iter < maxIter; iter++) {
+    if (!gcg->iterate())
+      break;
+  }
+  debMsg("cgSolveWaveEq iterations:" << gcg->getIterations() << ", res:" << gcg->getSigma(), 1);
+
+  utm1.swap(ut);
+  ut.copyFrom(out);
+}
+static PyObject *_W_3(PyObject *_self, PyObject *_linargs, PyObject *_kwds)  // generated Python entry point that forwards to cgSolveWE
+{
+  try {
+    PbArgs _args(_linargs, _kwds);  // wraps the positional and keyword arguments of the call
+    FluidSolver *parent = _args.obtainParent();
+    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);  // optional "notiming" argument, default 0
+    pbPreparePlugin(parent, "cgSolveWE", !noTiming);
+    PyObject *_retval = 0;
+    {
+      ArgLocker _lock;  // lives only inside this block, so it is destroyed before pbFinalizePlugin runs
+      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 0, &_lock);  // each argument is fetched by name and position index
+      Grid<Real> &ut = *_args.getPtr<Grid<Real>>("ut", 1, &_lock);
+      Grid<Real> &utm1 = *_args.getPtr<Grid<Real>>("utm1", 2, &_lock);
+      Grid<Real> &out = *_args.getPtr<Grid<Real>>("out", 3, &_lock);
+      bool crankNic = _args.getOpt<bool>("crankNic", 4, false, &_lock);  // the getOpt fallbacks repeat the default arguments of cgSolveWE
+      Real cSqr = _args.getOpt<Real>("cSqr", 5, 0.25, &_lock);
+      Real cgMaxIterFac = _args.getOpt<Real>("cgMaxIterFac", 6, 1.5, &_lock);
+      Real cgAccuracy = _args.getOpt<Real>("cgAccuracy", 7, 1e-5, &_lock);
+      _retval = getPyNone();  // cgSolveWE returns void; the result object is set before the call
+      cgSolveWE(flags, ut, utm1, out, crankNic, cSqr, cgMaxIterFac, cgAccuracy);
+      _args.check();
+    }
+    pbFinalizePlugin(parent, "cgSolveWE", !noTiming);
+    return _retval;
+  }
+  catch (std::exception &e) {
+    pbSetError("cgSolveWE", e.what());  // hand the message to pbSetError under the plugin name
+    return 0;  // a null PyObject* is returned after the error was recorded
+  }
+}
+static const Pb::Register _RP_cgSolveWE("", "cgSolveWE", _W_3);  // file-static registrar: pairs the name "cgSolveWE" with wrapper _W_3 (first argument is an empty string)
+extern "C" {  // C linkage for the hook below
+void PbRegister_cgSolveWE()
+{
+  KEEP_UNUSED(_RP_cgSolveWE);  // only references the registrar; presumably keeps it from being discarded -- confirm in the KEEP_UNUSED definition
+}
+}
+
+}  // namespace Manta