Fluid: Initial changes for OpenMP GPU support (branch: fluid-mantaflow-gpu)

Contains basic support for OpenMP GPU offloading, that is, offloading of fluid KERNEL loops to the GPU. This branch offloads pressure and advection calls only - the 2 most expensive operations per step. In theory, though, any function can be offloaded. For now, this branch needs to be built with a compiler that supports Nvidia GPU offloading. Exact GPU models need to be specified via CMake.
author: Sebastián Barschkis <sebbas@sebbas.org> 2021-09-13 16:03:52 +0300
committer: Sebastián Barschkis <sebbas@sebbas.org> 2021-09-13 16:03:52 +0300
commit: 063ce7f550f1612ab0e34c4ecb4b57f8401b84b4 (patch)
tree: 53584b6c514510b0bab33a480b3ec85274b48a6b
parent: 4b06420e65040c642d2b0a7a1c9bf7515d3cec0c (diff)
65 files changed, 6167 insertions, 8354 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 47712f0ac1e..a257068801a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -747,7 +747,7 @@ set_and_warn_dependency(WITH_TBB WITH_CYCLES            OFF)
 set_and_warn_dependency(WITH_TBB WITH_USD               OFF)
 set_and_warn_dependency(WITH_TBB WITH_OPENIMAGEDENOISE  OFF)
 set_and_warn_dependency(WITH_TBB WITH_OPENVDB           OFF)
-set_and_warn_dependency(WITH_TBB WITH_MOD_FLUID         OFF)
+#set_and_warn_dependency(WITH_TBB WITH_MOD_FLUID         OFF)
 
 # NanoVDB requires OpenVDB to convert the data structure
 set_and_warn_dependency(WITH_OPENVDB WITH_NANOVDB       OFF)
@@ -1239,15 +1239,29 @@ endif()
 if(WITH_OPENMP)
   if(NOT OPENMP_CUSTOM)
     find_package(OpenMP)
+
+    list(APPEND CMAKE_MODULE_PATH "${LLVM_LIBPATH}/cmake/openmp")
+    find_package(OpenMPTarget REQUIRED NVPTX)
   endif()
 
   if(OPENMP_FOUND)
     if(NOT WITH_OPENMP_STATIC)
+      message(STATUS "============ No Static OpenMP")
+      # Quoted on purpose: message() is a configure error when called with zero arguments (empty/unset variable).
+      message(STATUS "OpenMPTarget NVPTX flags: ${OpenMPTarget_NVPTX_FLAGS}")
       string(APPEND CMAKE_C_FLAGS " ${OpenMP_C_FLAGS}")
       string(APPEND CMAKE_CXX_FLAGS " ${OpenMP_CXX_FLAGS}")
       string(APPEND CMAKE_EXE_LINKER_FLAGS " ${OpenMP_LINKER_FLAGS}")
       string(APPEND CMAKE_MODULE_LINKER_FLAGS " ${OpenMP_LINKER_FLAGS}")
+
+      string(APPEND CMAKE_C_FLAGS " -gline-tables-only -fopenmp-targets=nvptx64-nvidia-cuda")
+      string(APPEND CMAKE_CXX_FLAGS " -gline-tables-only -fopenmp-targets=nvptx64-nvidia-cuda")
+      string(APPEND CMAKE_EXE_LINKER_FLAGS " -fopenmp-targets=nvptx64-nvidia-cuda")
+      string(APPEND CMAKE_MODULE_LINKER_FLAGS " -fopenmp-targets=nvptx64-nvidia-cuda")
+      # list(APPEND) does not create an empty leading element when PLATFORM_LINKLIBS is still unset.
+      list(APPEND PLATFORM_LINKLIBS -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda)
     else()
+      message(STATUS "============= Static OpenMP")
       # Typically avoid adding flags as defines but we can't
       # pass OpenMP flags to the linker for static builds, meaning
       # we can't add any OpenMP related flags to CFLAGS variables
diff --git a/build_files/build_environment/cmake/llvm.cmake b/build_files/build_environment/cmake/llvm.cmake
index 7a8ce2ddfec..7f54e83c92e 100644
--- a/build_files/build_environment/cmake/llvm.cmake
+++ b/build_files/build_environment/cmake/llvm.cmake
@@ -30,6 +30,10 @@ if(APPLE)
   set(BUILD_CLANG_TOOLS ON)
 endif()
 
+if(UNIX AND NOT APPLE)
+  set(LLVM_BUILD_OPENMP ^^openmp)  # `^^` separator is concatenated directly onto the LLVM_ENABLE_PROJECTS value below.
+  set(LLVM_TARGETS ${LLVM_TARGETS}^^NVPTX)  # No space, so the value stays one element. NOTE(review): assumes LLVM_TARGETS is expanded into a single -D argument - confirm.
+endif()
 
 set(LLVM_EXTRA_ARGS
   -DLLVM_USE_CRT_RELEASE=MD
@@ -40,10 +44,18 @@ set(LLVM_EXTRA_ARGS
   -DLLVM_ENABLE_TERMINFO=OFF
   -DLLVM_BUILD_LLVM_C_DYLIB=OFF
   -DLLVM_ENABLE_UNWIND_TABLES=OFF
-  -DLLVM_ENABLE_PROJECTS=clang${LLVM_BUILD_CLANG_TOOLS_EXTRA}
+  -DLLVM_ENABLE_PROJECTS=clang${LLVM_BUILD_CLANG_TOOLS_EXTRA}${LLVM_BUILD_OPENMP}
   ${LLVM_XML2_ARGS}
 )
 
+if(UNIX AND NOT APPLE)
+  list(APPEND LLVM_EXTRA_ARGS
+    -DCLANG_OPENMP_NVPTX_DEFAULT_ARCH=sm_61
+    -DLIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES=61
+  )
+endif()
+
+
 if(WIN32)
   set(LLVM_GENERATOR "Ninja")
 else()
diff --git a/build_files/cmake/platform/platform_unix.cmake b/build_files/cmake/platform/platform_unix.cmake
index fc0c37e4c8b..065c0b48416 100644
--- a/build_files/cmake/platform/platform_unix.cmake
+++ b/build_files/cmake/platform/platform_unix.cmake
@@ -63,7 +63,8 @@ if(EXISTS ${LIBDIR})
   # OpenMP usually can't be statically linked into shared libraries,
   # due to not being compiled with position independent code.
   if(NOT WITH_PYTHON_MODULE)
-    set(WITH_OPENMP_STATIC ON)
+    #message(STATUS "============= ENABLING Static OpenMP")
+    #set(WITH_OPENMP_STATIC ON)
   endif()
   set(Boost_NO_BOOST_CMAKE ON)
   set(BOOST_ROOT ${LIBDIR}/boost)
diff --git a/extern/curve_fit_nd/intern/curve_fit_corners_detect.c b/extern/curve_fit_nd/intern/curve_fit_corners_detect.c
index 415ef40fdd5..b600a41f9cd 100644
--- a/extern/curve_fit_nd/intern/curve_fit_corners_detect.c
+++ b/extern/curve_fit_nd/intern/curve_fit_corners_detect.c
@@ -81,7 +81,7 @@ static double cos_vnvnvn(
 	normalize_vn_vnvn(dvec1, v1, v2, dims);
 	double d = dot_vnvn(dvec0, dvec1, dims);
 	/* sanity check */
-	d = max(-1.0, min(1.0, d));
+	d = maxV(-1.0, minV(1.0, d));
 	return d;
 }
 
diff --git a/extern/curve_fit_nd/intern/curve_fit_cubic.c b/extern/curve_fit_nd/intern/curve_fit_cubic.c
index 47c5344c821..65042d7dbef 100644
--- a/extern/curve_fit_nd/intern/curve_fit_cubic.c
+++ b/extern/curve_fit_nd/intern/curve_fit_cubic.c
@@ -29,9 +29,9 @@
  *  \ingroup curve_fit
  */
 
-#ifdef _MSC_VER
+//#ifdef _MSC_VER
 #  define _USE_MATH_DEFINES
-#endif
+//#endif
 
 #include <math.h>
 #include <float.h>
@@ -456,7 +456,7 @@ static double points_calc_circumference_factor(
 	const double len_tangent = dot < 0.0 ? len_vnvn(tan_l, tan_r, dims) : len_negated_vnvn(tan_l, tan_r, dims);
 	if (len_tangent > DBL_EPSILON) {
 		/* only clamp to avoid precision error */
-		double angle = acos(max(-fabs(dot), -1.0));
+		double angle = acos(maxV(-fabs(dot), -1.0));
 		/* Angle may be less than the length when the tangents define >180 degrees of the circle,
 		 * (tangents that point away from each other).
 		 * We could try support this but will likely cause extreme >1 scales which could cause other issues. */
@@ -607,7 +607,7 @@ static void cubic_from_points_offset_fallback(
 		for (uint k = 0; k < 2; k++) {
 			sub_vn_vnvn(tmp, p0, pt, dims);
 			project_vn_vnvn_normalized(tmp, tmp, a[k], dims);
-			dists[k] = max(dists[k], dot_vnvn(tmp, a[k], dims));
+			dists[k] = maxV(dists[k], dot_vnvn(tmp, a[k], dims));
 		}
 	}
 
@@ -796,7 +796,7 @@ static void cubic_from_points(
 					dist_sq_test += sq((pt[j] - center[j]) * clamp_scale);
 				}
 #endif
-				dist_sq_max = max(dist_sq_max, dist_sq_test);
+				dist_sq_max = maxV(dist_sq_max, dist_sq_test);
 			}
 		}
 
diff --git a/extern/curve_fit_nd/intern/curve_fit_inline.h b/extern/curve_fit_nd/intern/curve_fit_inline.h
index f9eaa4c647c..6b47d3c12b0 100644
--- a/extern/curve_fit_nd/intern/curve_fit_inline.h
+++ b/extern/curve_fit_nd/intern/curve_fit_inline.h
@@ -45,12 +45,12 @@ MINLINE double sq(const double d)
 }
 
 #ifndef _MSC_VER
-MINLINE double min(const double a, const double b)
+MINLINE double minV(const double a, const double b)
 {
 	return b < a ? b : a;
 }
 
-MINLINE double max(const double a, const double b)
+MINLINE double maxV(const double a, const double b)
 {
 	return a < b ? b : a;
 }
diff --git a/extern/draco/draco/src/draco/animation/keyframe_animation.cc b/extern/draco/draco/src/draco/animation/keyframe_animation.cc
index eaf94a3305d..e602140ea0f 100644
--- a/extern/draco/draco/src/draco/animation/keyframe_animation.cc
+++ b/extern/draco/draco/src/draco/animation/keyframe_animation.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 //
 #include "draco/animation/keyframe_animation.h"
+#include <iostream>
 
 namespace draco {
 
diff --git a/extern/mantaflow/CMakeLists.txt b/extern/mantaflow/CMakeLists.txt
index 9b047eb1a3e..cdb3afcff4a 100644
--- a/extern/mantaflow/CMakeLists.txt
+++ b/extern/mantaflow/CMakeLists.txt
@@ -55,26 +55,30 @@ if(NOT WITH_MANTA_DEPENDENCIES)
   add_definitions(-DNO_CNPY=1)
 endif()
 
-set(MANTA_HLP
-  helper
-)
-set(MANTA_PP
-  preprocessed
-)
 if(WITH_MANTA_DEPENDENCIES)
   set(MANTA_DEP
     dependencies
   )
 endif()
 
-if(WITH_TBB)
-  add_definitions(-DTBB=1)
+if(WITH_OPENMP)
+  add_definitions(-DOPENMP=1)
+
+  if(WITH_OPENMP_STATIC)
+    list(APPEND LIB
+      ${OpenMP_LIBRARIES}
+    )
+  endif()
+endif()
+
+if(WITH_OPENMP)
+  add_definitions(-DOPENMP_OFFLOAD=1)
 endif()
 
 if(WITH_OPENVDB)
   add_definitions(-DOPENVDB=1)
   # OpenVDB headers use deprecated TBB headers, silence warning.
-  add_definitions(-DTBB_SUPPRESS_DEPRECATED_MESSAGES=1)
+  #add_definitions(-DTBB_SUPPRESS_DEPRECATED_MESSAGES=1)
 endif()
 
 if(WITH_OPENVDB_BLOSC)
@@ -90,12 +94,12 @@ if(WITH_MANTA_NUMPY AND WITH_PYTHON_NUMPY)
 endif()
 
 set(INC
-  ${MANTA_PP}
-  ${MANTA_PP}/fileio
-  ${MANTA_PP}/python
-  ${MANTA_PP}/plugin
-  ${MANTA_HLP}/pwrapper
-  ${MANTA_HLP}/util
+  preprocessed
+  preprocessed/fileio
+  preprocessed/python
+  preprocessed/plugin
+  helper/pwrapper
+  helper/util
 )
 
 if(WITH_MANTA_DEPENDENCIES)
@@ -115,15 +119,6 @@ if(WITH_MANTA_NUMPY AND WITH_PYTHON_NUMPY)
   )
 endif()
 
-if(WITH_TBB)
-  list(APPEND INC_SYS
-    ${TBB_INCLUDE_DIRS}
-  )
-  list(APPEND LIB
-    ${TBB_LIBRARIES}
-  )
-endif()
-
 if(WITH_OPENVDB)
   list(APPEND INC_SYS
     ${OPENVDB_INCLUDE_DIRS}
@@ -142,120 +137,120 @@ if(WITH_OPENVDB)
 endif()
 
 set(SRC
-  ${MANTA_PP}/commonkernels.h
-  ${MANTA_PP}/commonkernels.h.reg.cpp
-  ${MANTA_PP}/conjugategrad.cpp
-  ${MANTA_PP}/conjugategrad.h
-  ${MANTA_PP}/conjugategrad.h.reg.cpp
-  ${MANTA_PP}/edgecollapse.cpp
-  ${MANTA_PP}/edgecollapse.h
-  ${MANTA_PP}/edgecollapse.h.reg.cpp
-  ${MANTA_PP}/fastmarch.cpp
-  ${MANTA_PP}/fastmarch.h
-  ${MANTA_PP}/fastmarch.h.reg.cpp
-  ${MANTA_PP}/fileio/iogrids.cpp
-  ${MANTA_PP}/fileio/iomeshes.cpp
-  ${MANTA_PP}/fileio/ioparticles.cpp
-  ${MANTA_PP}/fileio/ioutil.cpp
-  ${MANTA_PP}/fileio/iovdb.cpp
-  ${MANTA_PP}/fileio/mantaio.cpp
-  ${MANTA_PP}/fileio/mantaio.h
-  ${MANTA_PP}/fileio/mantaio.h.reg.cpp
-  ${MANTA_PP}/fluidsolver.cpp
-  ${MANTA_PP}/fluidsolver.h
-  ${MANTA_PP}/fluidsolver.h.reg.cpp
-  ${MANTA_PP}/general.cpp
-  ${MANTA_PP}/general.h
-  ${MANTA_PP}/general.h.reg.cpp
-  ${MANTA_PP}/gitinfo.h
-  ${MANTA_PP}/grid.cpp
-  ${MANTA_PP}/grid.h
-  ${MANTA_PP}/grid.h.reg.cpp
-  ${MANTA_PP}/grid4d.cpp
-  ${MANTA_PP}/grid4d.h
-  ${MANTA_PP}/grid4d.h.reg.cpp
-  ${MANTA_PP}/kernel.cpp
-  ${MANTA_PP}/kernel.h
-  ${MANTA_PP}/kernel.h.reg.cpp
-  ${MANTA_PP}/levelset.cpp
-  ${MANTA_PP}/levelset.h
-  ${MANTA_PP}/levelset.h.reg.cpp
-  ${MANTA_PP}/mesh.cpp
-  ${MANTA_PP}/mesh.h
-  ${MANTA_PP}/mesh.h.reg.cpp
-  ${MANTA_PP}/movingobs.cpp
-  ${MANTA_PP}/movingobs.h
-  ${MANTA_PP}/movingobs.h.reg.cpp
-  ${MANTA_PP}/multigrid.cpp
-  ${MANTA_PP}/multigrid.h
-  ${MANTA_PP}/multigrid.h.reg.cpp
-  ${MANTA_PP}/noisefield.cpp
-  ${MANTA_PP}/noisefield.h
-  ${MANTA_PP}/noisefield.h.reg.cpp
-  ${MANTA_PP}/particle.cpp
-  ${MANTA_PP}/particle.h
-  ${MANTA_PP}/particle.h.reg.cpp
-  ${MANTA_PP}/plugin/advection.cpp
-  ${MANTA_PP}/plugin/apic.cpp
-  ${MANTA_PP}/plugin/extforces.cpp
-  ${MANTA_PP}/plugin/fire.cpp
-  ${MANTA_PP}/plugin/flip.cpp
-  ${MANTA_PP}/plugin/fluidguiding.cpp
-  ${MANTA_PP}/plugin/initplugins.cpp
-  ${MANTA_PP}/plugin/kepsilon.cpp
-  ${MANTA_PP}/plugin/meshplugins.cpp
-  ${MANTA_PP}/plugin/pressure.cpp
-  ${MANTA_PP}/plugin/ptsplugins.cpp
-  ${MANTA_PP}/plugin/secondaryparticles.cpp
-  ${MANTA_PP}/plugin/surfaceturbulence.cpp
-  ${MANTA_PP}/plugin/viscosity.cpp
-  ${MANTA_PP}/plugin/vortexplugins.cpp
-  ${MANTA_PP}/plugin/waveletturbulence.cpp
-  ${MANTA_PP}/plugin/waves.cpp
-  ${MANTA_PP}/python/defines.py
-  ${MANTA_PP}/python/defines.py.reg.cpp
-  ${MANTA_PP}/registration.cpp
-  ${MANTA_PP}/shapes.cpp
-  ${MANTA_PP}/shapes.h
-  ${MANTA_PP}/shapes.h.reg.cpp
-  ${MANTA_PP}/test.cpp
-  ${MANTA_PP}/timing.cpp
-  ${MANTA_PP}/timing.h
-  ${MANTA_PP}/timing.h.reg.cpp
-  ${MANTA_PP}/turbulencepart.cpp
-  ${MANTA_PP}/turbulencepart.h
-  ${MANTA_PP}/turbulencepart.h.reg.cpp
-  ${MANTA_PP}/vortexpart.cpp
-  ${MANTA_PP}/vortexpart.h
-  ${MANTA_PP}/vortexpart.h.reg.cpp
-  ${MANTA_PP}/vortexsheet.cpp
-  ${MANTA_PP}/vortexsheet.h
-  ${MANTA_PP}/vortexsheet.h.reg.cpp
+  preprocessed/commonkernels.h
+  preprocessed/commonkernels.h.reg.cpp
+  preprocessed/conjugategrad.cpp
+  preprocessed/conjugategrad.h
+  preprocessed/conjugategrad.h.reg.cpp
+  preprocessed/edgecollapse.cpp
+  preprocessed/edgecollapse.h
+  preprocessed/edgecollapse.h.reg.cpp
+  preprocessed/fastmarch.cpp
+  preprocessed/fastmarch.h
+  preprocessed/fastmarch.h.reg.cpp
+  preprocessed/fileio/iogrids.cpp
+  preprocessed/fileio/iomeshes.cpp
+  preprocessed/fileio/ioparticles.cpp
+  preprocessed/fileio/ioutil.cpp
+  preprocessed/fileio/iovdb.cpp
+  preprocessed/fileio/mantaio.cpp
+  preprocessed/fileio/mantaio.h
+  preprocessed/fileio/mantaio.h.reg.cpp
+  preprocessed/fluidsolver.cpp
+  preprocessed/fluidsolver.h
+  preprocessed/fluidsolver.h.reg.cpp
+  preprocessed/general.cpp
+  preprocessed/general.h
+  preprocessed/general.h.reg.cpp
+  preprocessed/gitinfo.h
+  preprocessed/grid.cpp
+  preprocessed/grid.h
+  preprocessed/grid.h.reg.cpp
+  preprocessed/grid4d.cpp
+  preprocessed/grid4d.h
+  preprocessed/grid4d.h.reg.cpp
+  preprocessed/kernel.cpp
+  preprocessed/kernel.h
+  preprocessed/kernel.h.reg.cpp
+  preprocessed/levelset.cpp
+  preprocessed/levelset.h
+  preprocessed/levelset.h.reg.cpp
+  preprocessed/mesh.cpp
+  preprocessed/mesh.h
+  preprocessed/mesh.h.reg.cpp
+  preprocessed/movingobs.cpp
+  preprocessed/movingobs.h
+  preprocessed/movingobs.h.reg.cpp
+  preprocessed/multigrid.cpp
+  preprocessed/multigrid.h
+  preprocessed/multigrid.h.reg.cpp
+  preprocessed/noisefield.cpp
+  preprocessed/noisefield.h
+  preprocessed/noisefield.h.reg.cpp
+  preprocessed/particle.cpp
+  preprocessed/particle.h
+  preprocessed/particle.h.reg.cpp
+  preprocessed/plugin/advection.cpp
+  preprocessed/plugin/apic.cpp
+  preprocessed/plugin/extforces.cpp
+  preprocessed/plugin/fire.cpp
+  preprocessed/plugin/flip.cpp
+  preprocessed/plugin/fluidguiding.cpp
+  preprocessed/plugin/initplugins.cpp
+  preprocessed/plugin/kepsilon.cpp
+  preprocessed/plugin/meshplugins.cpp
+  preprocessed/plugin/pressure.cpp
+  preprocessed/plugin/ptsplugins.cpp
+  preprocessed/plugin/secondaryparticles.cpp
+  preprocessed/plugin/surfaceturbulence.cpp
+#  preprocessed/plugin/viscosity.cpp
+  preprocessed/plugin/vortexplugins.cpp
+  preprocessed/plugin/waveletturbulence.cpp
+  preprocessed/plugin/waves.cpp
+  preprocessed/python/defines.py
+  preprocessed/python/defines.py.reg.cpp
+  preprocessed/registration.cpp
+  preprocessed/shapes.cpp
+  preprocessed/shapes.h
+  preprocessed/shapes.h.reg.cpp
+  preprocessed/test.cpp
+  preprocessed/timing.cpp
+  preprocessed/timing.h
+  preprocessed/timing.h.reg.cpp
+  preprocessed/turbulencepart.cpp
+  preprocessed/turbulencepart.h
+  preprocessed/turbulencepart.h.reg.cpp
+  preprocessed/vortexpart.cpp
+  preprocessed/vortexpart.h
+  preprocessed/vortexpart.h.reg.cpp
+  preprocessed/vortexsheet.cpp
+  preprocessed/vortexsheet.h
+  preprocessed/vortexsheet.h.reg.cpp
 
-  ${MANTA_HLP}/pwrapper/manta.h
-  ${MANTA_HLP}/pwrapper/pclass.cpp
-  ${MANTA_HLP}/pwrapper/pclass.h
-  ${MANTA_HLP}/pwrapper/pconvert.cpp
-  ${MANTA_HLP}/pwrapper/pconvert.h
-  ${MANTA_HLP}/pwrapper/pvec3.cpp
-  ${MANTA_HLP}/pwrapper/pythonInclude.h
-  ${MANTA_HLP}/pwrapper/registry.cpp
-  ${MANTA_HLP}/pwrapper/registry.h
-  ${MANTA_HLP}/util/integrator.h
-  ${MANTA_HLP}/util/interpol.h
-  ${MANTA_HLP}/util/interpolHigh.h
-  ${MANTA_HLP}/util/matrixbase.h
-  ${MANTA_HLP}/util/mcubes.h
-  ${MANTA_HLP}/util/quaternion.h
-  ${MANTA_HLP}/util/randomstream.h
-  ${MANTA_HLP}/util/rcmatrix.h
-  ${MANTA_HLP}/util/simpleimage.cpp
-  ${MANTA_HLP}/util/simpleimage.h
-  ${MANTA_HLP}/util/solvana.h
-  ${MANTA_HLP}/util/vector4d.cpp
-  ${MANTA_HLP}/util/vector4d.h
-  ${MANTA_HLP}/util/vectorbase.cpp
-  ${MANTA_HLP}/util/vectorbase.h
+  helper/pwrapper/manta.h
+  helper/pwrapper/pclass.cpp
+  helper/pwrapper/pclass.h
+  helper/pwrapper/pconvert.cpp
+  helper/pwrapper/pconvert.h
+  helper/pwrapper/pvec3.cpp
+  helper/pwrapper/pythonInclude.h
+  helper/pwrapper/registry.cpp
+  helper/pwrapper/registry.h
+  helper/util/integrator.h
+  helper/util/interpol.h
+  helper/util/interpolHigh.h
+  helper/util/matrixbase.h
+  helper/util/mcubes.h
+  helper/util/quaternion.h
+  helper/util/randomstream.h
+  helper/util/rcmatrix.h
+  helper/util/simpleimage.cpp
+  helper/util/simpleimage.h
+  helper/util/solvana.h
+  helper/util/vector4d.cpp
+  helper/util/vector4d.h
+  helper/util/vectorbase.cpp
+  helper/util/vectorbase.h
 )
 
 if(WITH_MANTA_DEPENDENCIES)
@@ -266,16 +261,34 @@ if(WITH_MANTA_DEPENDENCIES)
 endif()
 if(WITH_MANTA_NUMPY AND WITH_PYTHON_NUMPY)
   list(APPEND SRC
-    ${MANTA_PP}/plugin/numpyconvert.cpp
-    ${MANTA_PP}/plugin/tfplugins.cpp
-    ${MANTA_HLP}/pwrapper/numpyWrap.cpp
-    ${MANTA_HLP}/pwrapper/numpyWrap.h
+    preprocessed/plugin/numpyconvert.cpp
+    preprocessed/plugin/tfplugins.cpp
+    helper/pwrapper/numpyWrap.cpp
+    helper/pwrapper/numpyWrap.h
   )
 endif()
 
 set(LIB
   ${PYTHON_LINKFLAGS}
   ${PYTHON_LIBRARIES}
+  ${OPENVDB_LIBRARIES}  
 )
 
-blender_add_lib(extern_mantaflow "${SRC}" "${INC}" "${INC_SYS}" "${LIB}")
+#blender_add_lib(extern_mantaflow "${SRC}" "${INC}" "${INC_SYS}" "${LIB}")
+
+add_library(extern_mantaflow SHARED "${SRC}")
+
+# Attach include paths to the target itself rather than to the whole directory.
+# INC already lists the preprocessed/ and helper/ directories set up above.
+target_include_directories(extern_mantaflow PRIVATE ${INC})
+
+# Third-party headers are added as SYSTEM includes.
+target_include_directories(extern_mantaflow SYSTEM PRIVATE
+  ${PYTHON_INCLUDE_DIRS}
+  ${ZLIB_INCLUDE_DIRS}
+  ${OPENVDB_INCLUDE_DIRS}
+)
+
+target_link_libraries(extern_mantaflow PRIVATE "${LIB}")
+
+blender_source_group(extern_mantaflow "${SRC}")
diff --git a/extern/mantaflow/UPDATE.sh b/extern/mantaflow/UPDATE.sh
index 1158ff13455..83895d8c84d 100644
--- a/extern/mantaflow/UPDATE.sh
+++ b/extern/mantaflow/UPDATE.sh
@@ -7,8 +7,11 @@
 # ====================  1) ENVIRONMENT SETUP =============================================
 
 # YOUR INSTALLATION PATHS GO HERE:
-MANTA_INSTALLATION=/Users/sebbas/Developer/Mantaflow/mantaflowDevelop
-BLENDER_INSTALLATION=/Users/sebbas/Developer/Blender
+MANTA_INSTALLATION=/home/sebbas/Developer/Mantaflow
+BLENDER_INSTALLATION=/home/sebbas/Developer/Blender
+
+CC=/home/sebbas/Developer/LLVM-Project/install/bin/clang
+CXX=/home/sebbas/Developer/LLVM-Project/install/bin/clang++
 
 # Try to check out Mantaflow repository before building?
 CLEAN_REPOSITORY=0
@@ -20,8 +23,13 @@ WITH_DEPENDENCIES=0
 USE_NUMPY=0
 
 # Choose which multithreading platform to use for Mantaflow preprocessing
-USE_OMP=0
-USE_TBB=1
+USE_OMP=1
+USE_TBB=0
+
+# Use OpenMP offloading too? Enabled together with OpenMP; otherwise stays 0 so
+# that -DOPENMP_OFFLOAD below always receives an explicit value.
+USE_OMP_OFFLOAD=0
+if [[ "$USE_OMP" -eq "1" ]]; then USE_OMP_OFFLOAD=1; fi
 
 if [[ "$USE_OMP" -eq "1" && "$USE_TBB" -eq "1" ]]; then
   echo "Cannot build Mantaflow for OpenMP and TBB at the same time"
@@ -56,7 +64,7 @@ fi
 MANTA_BUILD_PATH=$MANTA_INSTALLATION/build_blender/
 mkdir -p $MANTA_BUILD_PATH
 cd $MANTA_BUILD_PATH
-cmake ../mantaflowgit -DGUI=0 -DOPENMP=$USE_OMP -DTBB=$USE_TBB -DBLENDER=1 -DPREPDEBUG=1 -DNUMPY=$USE_NUMPY && make -j8
+cmake ../mantaflowgit -DGUI=0 -DOPENMP=$USE_OMP -DTBB=$USE_TBB -DOPENMP_OFFLOAD=$USE_OMP_OFFLOAD -DBLENDER=1 -DPREPDEBUG=1 -DNUMPY=$USE_NUMPY -DPYTHON_VERSION=3 -DCMAKE_C_COMPILER=$CC -DCMAKE_CXX_COMPILER=$CXX && make -j8
 
 # ==================== 3) COPY MANTAFLOW FILES TO BLENDER ROOT ===========================
 
diff --git a/extern/mantaflow/helper/util/rcmatrix.h b/extern/mantaflow/helper/util/rcmatrix.h
index 330fd1f64f7..f1f0efe6416 100644
--- a/extern/mantaflow/helper/util/rcmatrix.h
+++ b/extern/mantaflow/helper/util/rcmatrix.h
@@ -1035,7 +1035,7 @@ template<class N, class T> struct RCFixedMatrix {
 typedef RCMatrix<int, Real> Matrix;
 typedef RCFixedMatrix<int, Real> FixedMatrix;
 
-}
+}  // namespace Manta
 
 #undef parallel_for
 #undef parallel_end
diff --git a/extern/mantaflow/preprocessed/commonkernels.h b/extern/mantaflow/preprocessed/commonkernels.h
index 7fa6f185146..7ac13fbdfc4 100644
--- a/extern/mantaflow/preprocessed/commonkernels.h
+++ b/extern/mantaflow/preprocessed/commonkernels.h
@@ -34,7 +34,7 @@ struct InvertCheckFluid : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, const FlagGrid &flags, Grid<Real> &grid) const
+  inline void op(IndexInt idx, const FlagGrid &flags, Grid<Real> &grid)
   {
     if (flags.isFluid(idx) && grid[idx] > 0)
       grid[idx] = 1.0 / grid[idx];
@@ -49,21 +49,17 @@ struct InvertCheckFluid : public KernelBase {
     return grid;
   }
   typedef Grid<Real> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel InvertCheckFluid ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, flags, grid);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, flags, grid);
+    }
   }
   const FlagGrid &flags;
   Grid<Real> &grid;
@@ -77,9 +73,9 @@ struct GridSumSqr : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, const Grid<Real> &grid, double &sum)
+  inline void op(int i, int j, int k, const Grid<Real> &grid, double &sum)
   {
-    sum += square((double)grid[idx]);
+    sum += square((double)grid(i, j, k));
   }
   inline operator double()
   {
@@ -94,28 +90,37 @@ struct GridSumSqr : public KernelBase {
     return grid;
   }
   typedef Grid<Real> type0;
-  void runMessage()
-  {
-    debMsg("Executing kernel GridSumSqr ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, grid, sum);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  GridSumSqr(GridSumSqr &o, tbb::split) : KernelBase(o), grid(o.grid), sum(0)
-  {
-  }
-  void join(const GridSumSqr &o)
-  {
-    sum += o.sum;
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {
+      const Grid<Real> &grid = getArg0();
+#pragma omp target teams distribute parallel for reduction(+ : sum) collapse(2) schedule(static, 1)
+      {
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, grid, sum);
+      }
+      {
+        this->sum = sum;
+      }
+    }
+    else {
+      const int k = 0;
+      const Grid<Real> &grid = getArg0();
+#pragma omp target teams distribute parallel for reduction(+ : sum) collapse(1) schedule(static, 1)
+      {
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, grid, sum);
+      }
+      {
+        this->sum = sum;
+      }
+    }
   }
   const Grid<Real> &grid;
   double sum;
@@ -129,7 +134,7 @@ struct CurlOp : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, const Grid<Vec3> &grid, Grid<Vec3> &dst) const
+  inline void op(int i, int j, int k, const Grid<Vec3> &grid, Grid<Vec3> &dst)
   {
     Vec3 v = Vec3(0.,
                   0.,
@@ -153,37 +158,35 @@ struct CurlOp : public KernelBase {
     return dst;
   }
   typedef Grid<Vec3> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel CurlOp ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, grid, dst);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, grid, dst);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, grid, dst);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, grid, dst);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   const Grid<Vec3> &grid;
   Grid<Vec3> &dst;
 };
@@ -197,7 +200,7 @@ struct DivergenceOpMAC : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, Grid<Real> &div, const MACGrid &grid) const
+  inline void op(int i, int j, int k, Grid<Real> &div, const MACGrid &grid)
   {
     Vec3 del = Vec3(grid(i + 1, j, k).x, grid(i, j + 1, k).y, 0.) - grid(i, j, k);
     if (grid.is3D())
@@ -216,37 +219,35 @@ struct DivergenceOpMAC : public KernelBase {
     return grid;
   }
   typedef MACGrid type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel DivergenceOpMAC ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, div, grid);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, div, grid);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, div, grid);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, div, grid);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   Grid<Real> &div;
   const MACGrid &grid;
 };
@@ -259,7 +260,7 @@ struct GradientOpMAC : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, MACGrid &gradient, const Grid<Real> &grid) const
+  inline void op(int i, int j, int k, MACGrid &gradient, const Grid<Real> &grid)
   {
     Vec3 grad = (Vec3(grid(i, j, k)) - Vec3(grid(i - 1, j, k), grid(i, j - 1, k), 0.));
     if (grid.is3D())
@@ -278,37 +279,35 @@ struct GradientOpMAC : public KernelBase {
     return grid;
   }
   typedef Grid<Real> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel GradientOpMAC ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, gradient, grid);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, gradient, grid);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, gradient, grid);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, gradient, grid);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   MACGrid &gradient;
   const Grid<Real> &grid;
 };
@@ -321,7 +320,7 @@ struct GradientOp : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, Grid<Vec3> &gradient, const Grid<Real> &grid) const
+  inline void op(int i, int j, int k, Grid<Vec3> &gradient, const Grid<Real> &grid)
   {
     Vec3 grad = 0.5 * Vec3(grid(i + 1, j, k) - grid(i - 1, j, k),
                            grid(i, j + 1, k) - grid(i, j - 1, k),
@@ -340,37 +339,35 @@ struct GradientOp : public KernelBase {
     return grid;
   }
   typedef Grid<Real> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel GradientOp ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, gradient, grid);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, gradient, grid);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, gradient, grid);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, gradient, grid);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   Grid<Vec3> &gradient;
   const Grid<Real> &grid;
 };
@@ -383,7 +380,7 @@ struct LaplaceOp : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, Grid<Real> &laplace, const Grid<Real> &grid) const
+  inline void op(int i, int j, int k, Grid<Real> &laplace, const Grid<Real> &grid)
   {
     laplace(i, j, k) = grid(i + 1, j, k) - 2.0 * grid(i, j, k) + grid(i - 1, j, k);
     laplace(i, j, k) += grid(i, j + 1, k) - 2.0 * grid(i, j, k) + grid(i, j - 1, k);
@@ -401,37 +398,35 @@ struct LaplaceOp : public KernelBase {
     return grid;
   }
   typedef Grid<Real> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel LaplaceOp ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, laplace, grid);
+      // Multi-slice case (maxZ > 1): slices k in [minZ, maxZ) are divided among the OpenMP threads.
+#pragma omp parallel
+      {
+        // Only the outermost (k) loop is workshared; each thread walks j and i from 1 serially.
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, laplace, grid);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, laplace, grid);
+#pragma omp parallel
+      {
+        // Single-slice case: k is fixed at 0 and the rows j (from 1) are divided among the threads.
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, laplace, grid);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   Grid<Real> &laplace;
   const Grid<Real> &grid;
 };
@@ -444,7 +439,7 @@ struct CurvatureOp : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, Grid<Real> &curv, const Grid<Real> &grid, const Real h) const
+  inline void op(int i, int j, int k, Grid<Real> &curv, const Grid<Real> &grid, const Real h)
   {
     const Real over_h = 1.0 / h;
     const Real x = 0.5 * (grid(i + 1, j, k) - grid(i - 1, j, k)) * over_h;
@@ -492,37 +487,35 @@ struct CurvatureOp : public KernelBase {
     return h;
   }
   typedef Real type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel CurvatureOp ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, curv, grid, h);
+      // Multi-slice case (maxZ > 1): slices k in [minZ, maxZ) are divided among the OpenMP threads.
+#pragma omp parallel
+      {
+        // Only the outermost (k) loop is workshared; each thread walks j and i from 1 serially.
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, curv, grid, h);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, curv, grid, h);
+#pragma omp parallel
+      {
+        // Single-slice case: k is fixed at 0 and the rows j (from 1) are divided among the threads.
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, curv, grid, h);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   Grid<Real> &curv;
   const Grid<Real> &grid;
   const Real h;
@@ -536,7 +529,7 @@ struct GetShiftedComponent : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, const Grid<Vec3> &grid, Grid<Real> &comp, int dim) const
+  inline void op(int i, int j, int k, const Grid<Vec3> &grid, Grid<Real> &comp, int dim)
   {
     Vec3i ishift(i, j, k);
     ishift[dim]--;
@@ -557,37 +550,35 @@ struct GetShiftedComponent : public KernelBase {
     return dim;
   }
   typedef int type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel GetShiftedComponent ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, grid, comp, dim);
+      // Multi-slice case (maxZ > 1): slices k in [minZ, maxZ) are divided among the OpenMP threads.
+#pragma omp parallel
+      {
+        // Only the outermost (k) loop is workshared; each thread walks j and i from 1 serially.
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, grid, comp, dim);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, grid, comp, dim);
+#pragma omp parallel
+      {
+        // Single-slice case: k is fixed at 0 and the rows j (from 1) are divided among the threads.
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, grid, comp, dim);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   const Grid<Vec3> &grid;
   Grid<Real> &comp;
   int dim;
@@ -602,7 +593,7 @@ struct GetComponent : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, const Grid<Vec3> &grid, Grid<Real> &comp, int dim) const
+  inline void op(IndexInt idx, const Grid<Vec3> &grid, Grid<Real> &comp, int dim)
   {
     comp[idx] = grid[idx][dim];
   }
@@ -621,21 +612,17 @@ struct GetComponent : public KernelBase {
     return dim;
   }
   typedef int type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel GetComponent ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, grid, comp, dim);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      // Flat indices 0 .. size-1 are divided among the OpenMP threads; op() handles one index per call.
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, grid, comp, dim);
+    }
   }
   const Grid<Vec3> &grid;
   Grid<Real> &comp;
@@ -650,7 +637,7 @@ struct GridNorm : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid<Real> &n, const Grid<Vec3> &grid) const
+  inline void op(IndexInt idx, Grid<Real> &n, const Grid<Vec3> &grid)
   {
     n[idx] = norm(grid[idx]);
   }
@@ -664,21 +651,17 @@ struct GridNorm : public KernelBase {
     return grid;
   }
   typedef Grid<Vec3> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel GridNorm ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, n, grid);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      // Flat indices 0 .. size-1 are divided among the OpenMP threads; op() handles one index per call.
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, n, grid);
+    }
   }
   Grid<Real> &n;
   const Grid<Vec3> &grid;
@@ -693,7 +676,7 @@ struct SetComponent : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid<Vec3> &grid, const Grid<Real> &comp, int dim) const
+  inline void op(IndexInt idx, Grid<Vec3> &grid, const Grid<Real> &comp, int dim)
   {
     grid[idx][dim] = comp[idx];
   }
@@ -712,21 +695,17 @@ struct SetComponent : public KernelBase {
     return dim;
   }
   typedef int type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel SetComponent ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, grid, comp, dim);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      // Flat indices 0 .. size-1 are divided among the OpenMP threads; op() handles one index per call.
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, grid, comp, dim);
+    }
   }
   Grid<Vec3> &grid;
   const Grid<Real> &comp;
@@ -742,7 +721,7 @@ struct GetCentered : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, Grid<Vec3> &center, const MACGrid &vel) const
+  inline void op(int i, int j, int k, Grid<Vec3> &center, const MACGrid &vel)
   {
     Vec3 v = 0.5 * (vel(i, j, k) + Vec3(vel(i + 1, j, k).x, vel(i, j + 1, k).y, 0.));
     if (vel.is3D())
@@ -761,37 +740,35 @@ struct GetCentered : public KernelBase {
     return vel;
   }
   typedef MACGrid type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel GetCentered ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, center, vel);
+      // Multi-slice case (maxZ > 1): slices k in [minZ, maxZ) are divided among the OpenMP threads.
+#pragma omp parallel
+      {
+        // Only the outermost (k) loop is workshared; each thread walks j and i from 1 serially.
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, center, vel);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, center, vel);
+#pragma omp parallel
+      {
+        // Single-slice case: k is fixed at 0 and the rows j (from 1) are divided among the threads.
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, center, vel);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   Grid<Vec3> &center;
   const MACGrid &vel;
 };
@@ -804,7 +781,7 @@ struct GetMAC : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, MACGrid &vel, const Grid<Vec3> &center) const
+  inline void op(int i, int j, int k, MACGrid &vel, const Grid<Vec3> &center)
   {
     Vec3 v = 0.5 * (center(i, j, k) + Vec3(center(i - 1, j, k).x, center(i, j - 1, k).y, 0.));
     if (vel.is3D())
@@ -823,37 +800,35 @@ struct GetMAC : public KernelBase {
     return center;
   }
   typedef Grid<Vec3> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel GetMAC ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, vel, center);
+      // Multi-slice case (maxZ > 1): slices k in [minZ, maxZ) are divided among the OpenMP threads.
+#pragma omp parallel
+      {
+        // Only the outermost (k) loop is workshared; each thread walks j and i from 1 serially.
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, vel, center);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, vel, center);
+#pragma omp parallel
+      {
+        // Single-slice case: k is fixed at 0 and the rows j (from 1) are divided among the threads.
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, vel, center);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   MACGrid &vel;
   const Grid<Vec3> &center;
 };
@@ -866,7 +841,7 @@ struct FillInBoundary : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, Grid<Vec3> &grid, int g) const
+  inline void op(int i, int j, int k, Grid<Vec3> &grid, int g)
   {
     if (i == 0)
       grid(i, j, k) = grid(i + 1, j, k);
@@ -891,37 +866,35 @@ struct FillInBoundary : public KernelBase {
     return g;
   }
   typedef int type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel FillInBoundary ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, grid, g);
+      // Multi-slice case (maxZ > 1): slices k in [minZ, maxZ) are divided among the OpenMP threads.
+#pragma omp parallel
+      {
+        // Only the outermost (k) loop is workshared; each thread walks j and i from 0 serially.
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, grid, g);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, grid, g);
+#pragma omp parallel
+      {
+        // Single-slice case: k is fixed at 0 and the rows j (from 0) are divided among the threads.
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, grid, g);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   Grid<Vec3> &grid;
   int g;
 };
@@ -939,7 +912,7 @@ struct kn_conv_mex_in_to_MAC : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, const double *p_lin_array, MACGrid *p_result) const
+  inline void op(int i, int j, int k, const double *p_lin_array, MACGrid *p_result)
   {
     int ijk = i + j * p_result->getSizeX() + k * p_result->getSizeX() * p_result->getSizeY();
     const int n = p_result->getSizeX() * p_result->getSizeY() * p_result->getSizeZ();
@@ -958,37 +931,35 @@ struct kn_conv_mex_in_to_MAC : public KernelBase {
     return p_result;
   }
   typedef MACGrid type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel kn_conv_mex_in_to_MAC ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, p_lin_array, p_result);
+      // Multi-slice case (maxZ > 1): slices k in [minZ, maxZ) are divided among the OpenMP threads.
+#pragma omp parallel
+      {
+        // Only the outermost (k) loop is workshared; each thread walks j and i from 0 serially.
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, p_lin_array, p_result);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, p_lin_array, p_result);
+#pragma omp parallel
+      {
+        // Single-slice case: k is fixed at 0 and the rows j (from 0) are divided among the threads.
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, p_lin_array, p_result);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   const double *p_lin_array;
   MACGrid *p_result;
 };
@@ -1000,7 +971,7 @@ struct kn_conv_MAC_to_mex_out : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, const MACGrid *p_mac, double *p_result) const
+  inline void op(int i, int j, int k, const MACGrid *p_mac, double *p_result)
   {
     int ijk = i + j * p_mac->getSizeX() + k * p_mac->getSizeX() * p_mac->getSizeY();
     const int n = p_mac->getSizeX() * p_mac->getSizeY() * p_mac->getSizeZ();
@@ -1019,37 +990,35 @@ struct kn_conv_MAC_to_mex_out : public KernelBase {
     return p_result;
   }
   typedef double type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel kn_conv_MAC_to_mex_out ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, p_mac, p_result);
+      // Multi-slice case (maxZ > 1): slices k in [minZ, maxZ) are divided among the OpenMP threads.
+#pragma omp parallel
+      {
+        // Only the outermost (k) loop is workshared; each thread walks j and i from 0 serially.
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, p_mac, p_result);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, p_mac, p_result);
+#pragma omp parallel
+      {
+        // Single-slice case: k is fixed at 0 and the rows j (from 0) are divided among the threads.
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, p_mac, p_result);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   const MACGrid *p_mac;
   double *p_result;
 };
@@ -1063,7 +1032,7 @@ struct kn_conv_mex_in_to_Vec3 : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, const double *p_lin_array, Grid<Vec3> *p_result) const
+  inline void op(int i, int j, int k, const double *p_lin_array, Grid<Vec3> *p_result)
   {
     int ijk = i + j * p_result->getSizeX() + k * p_result->getSizeX() * p_result->getSizeY();
     const int n = p_result->getSizeX() * p_result->getSizeY() * p_result->getSizeZ();
@@ -1082,37 +1051,35 @@ struct kn_conv_mex_in_to_Vec3 : public KernelBase {
     return p_result;
   }
   typedef Grid<Vec3> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel kn_conv_mex_in_to_Vec3 ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, p_lin_array, p_result);
+      // Multi-slice case (maxZ > 1): slices k in [minZ, maxZ) are divided among the OpenMP threads.
+#pragma omp parallel
+      {
+        // Only the outermost (k) loop is workshared; each thread walks j and i from 0 serially.
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, p_lin_array, p_result);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, p_lin_array, p_result);
+#pragma omp parallel
+      {
+        // Single-slice case: k is fixed at 0 and the rows j (from 0) are divided among the threads.
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, p_lin_array, p_result);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   const double *p_lin_array;
   Grid<Vec3> *p_result;
 };
@@ -1124,7 +1091,7 @@ struct kn_conv_Vec3_to_mex_out : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, const Grid<Vec3> *p_Vec3, double *p_result) const
+  inline void op(int i, int j, int k, const Grid<Vec3> *p_Vec3, double *p_result)
   {
     int ijk = i + j * p_Vec3->getSizeX() + k * p_Vec3->getSizeX() * p_Vec3->getSizeY();
     const int n = p_Vec3->getSizeX() * p_Vec3->getSizeY() * p_Vec3->getSizeZ();
@@ -1143,37 +1110,35 @@ struct kn_conv_Vec3_to_mex_out : public KernelBase {
     return p_result;
   }
   typedef double type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel kn_conv_Vec3_to_mex_out ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, p_Vec3, p_result);
+      // Multi-slice case (maxZ > 1): slices k in [minZ, maxZ) are divided among the OpenMP threads.
+#pragma omp parallel
+      {
+        // Only the outermost (k) loop is workshared; each thread walks j and i from 0 serially.
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, p_Vec3, p_result);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, p_Vec3, p_result);
+#pragma omp parallel
+      {
+        // Single-slice case: k is fixed at 0 and the rows j (from 0) are divided among the threads.
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, p_Vec3, p_result);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   const Grid<Vec3> *p_Vec3;
   double *p_result;
 };
@@ -1187,7 +1152,7 @@ struct kn_conv_mex_in_to_Real : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, const double *p_lin_array, Grid<Real> *p_result) const
+  inline void op(int i, int j, int k, const double *p_lin_array, Grid<Real> *p_result)
   {
     int ijk = i + j * p_result->getSizeX() + k * p_result->getSizeX() * p_result->getSizeY();
 
@@ -1203,37 +1168,35 @@ struct kn_conv_mex_in_to_Real : public KernelBase {
     return p_result;
   }
   typedef Grid<Real> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel kn_conv_mex_in_to_Real ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, p_lin_array, p_result);
+      // Multi-slice case (maxZ > 1): slices k in [minZ, maxZ) are divided among the OpenMP threads.
+#pragma omp parallel
+      {
+        // Only the outermost (k) loop is workshared; each thread walks j and i from 0 serially.
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, p_lin_array, p_result);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, p_lin_array, p_result);
+#pragma omp parallel
+      {
+        // Single-slice case: k is fixed at 0 and the rows j (from 0) are divided among the threads.
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, p_lin_array, p_result);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   const double *p_lin_array;
   Grid<Real> *p_result;
 };
@@ -1245,7 +1208,7 @@ struct kn_conv_Real_to_mex_out : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, const Grid<Real> *p_grid, double *p_result) const
+  inline void op(int i, int j, int k, const Grid<Real> *p_grid, double *p_result)
   {
     int ijk = i + j * p_grid->getSizeX() + k * p_grid->getSizeX() * p_grid->getSizeY();
 
@@ -1261,37 +1224,35 @@ struct kn_conv_Real_to_mex_out : public KernelBase {
     return p_result;
   }
   typedef double type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel kn_conv_Real_to_mex_out ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, p_grid, p_result);
+      // Multi-slice case (maxZ > 1): slices k in [minZ, maxZ) are divided among the OpenMP threads.
+#pragma omp parallel
+      {
+        // Only the outermost (k) loop is workshared; each thread walks j and i from 0 serially.
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, p_grid, p_result);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, p_grid, p_result);
+#pragma omp parallel
+      {
+        // Single-slice case: k is fixed at 0 and the rows j (from 0) are divided among the threads.
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, p_grid, p_result);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   const Grid<Real> *p_grid;
   double *p_result;
 };
diff --git a/extern/mantaflow/preprocessed/conjugategrad.cpp b/extern/mantaflow/preprocessed/conjugategrad.cpp
index bdcceb29520..df184f654b6 100644
--- a/extern/mantaflow/preprocessed/conjugategrad.cpp
+++ b/extern/mantaflow/preprocessed/conjugategrad.cpp
@@ -18,6 +18,8 @@
 
 #include "conjugategrad.h"
 #include "commonkernels.h"
+#include <chrono>
+using namespace std::chrono;
 
 using namespace std;
 namespace Manta {
@@ -213,9 +215,9 @@ struct GridDotProduct : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, const Grid<Real> &a, const Grid<Real> &b, double &result)
+  inline void op(int i, int j, int k, const Grid<Real> &a, const Grid<Real> &b, double &result)
   {
-    result += (a[idx] * b[idx]);
+    result += (a(i, j, k) * b(i, j, k));
   }
   inline operator double()
   {
@@ -235,28 +237,39 @@ struct GridDotProduct : public KernelBase {
     return b;
   }
   typedef Grid<Real> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel GridDotProduct ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, a, b, result);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  GridDotProduct(GridDotProduct &o, tbb::split) : KernelBase(o), a(o.a), b(o.b), result(0.0)
-  {
-  }
-  void join(const GridDotProduct &o)
-  {
-    result += o.result;
+    // Sums a(i,j,k) * b(i,j,k) over the kernel range into the member 'result'
+    // via an OpenMP target reduction. The loop directive is followed directly
+    // by the for-nest: a brace-wrapped loop is not a canonical OpenMP loop and
+    // is not accepted by every compiler.
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {
+      // Multi-slice case: k covers [minZ, maxZ); the k and j loops are collapsed.
+      // The local references below shadow the members and are what op() receives.
+      const Grid<Real> &a = getArg0();
+      const Grid<Real> &b = getArg1();
+#pragma omp target teams distribute parallel for reduction(+:result) collapse(2) schedule(static,1)
+      for (int k = minZ; k < maxZ; k++)
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, a, b, result);
+      // 'result' is the reduction variable itself, so no copy-back is required.
+    }
+    else {
+      // Single-slice case: k is fixed at 0 and only the j loop is distributed.
+      // The local references below shadow the members and are what op() receives.
+      const int k = 0;
+      const Grid<Real> &a = getArg0();
+      const Grid<Real> &b = getArg1();
+#pragma omp target teams distribute parallel for reduction(+:result) collapse(1) schedule(static,1)
+      for (int j = 0; j < _maxY; j++)
+        for (int i = 0; i < _maxX; i++)
+          op(i, j, k, a, b, result);
+      // As above, the reduction already leaves the sum in 'result'.
+    }
   }
   const Grid<Real> &a;
   const Grid<Real> &b;
@@ -315,29 +328,21 @@ struct InitSigma : public KernelBase {
     return temp;
   }
   typedef Grid<Real> type3;
-  void runMessage()
-  {
-    debMsg("Executing kernel InitSigma ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, flags, dst, rhs, temp, sigma);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  InitSigma(InitSigma &o, tbb::split)
-      : KernelBase(o), flags(o.flags), dst(o.dst), rhs(o.rhs), temp(o.temp), sigma(0)
-  {
-  }
-  void join(const InitSigma &o)
-  {
-    sigma += o.sigma;
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      double sigma = 0;
+#pragma omp for nowait
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, flags, dst, rhs, temp, sigma);
+#pragma omp critical
+      {
+        this->sigma += sigma;
+      }
+    }
   }
   const FlagGrid &flags;
   Grid<Real> &dst;
@@ -356,8 +361,9 @@ struct UpdateSearchVec : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid<Real> &dst, Grid<Real> &src, Real factor) const
+  inline void op(int i, int j, int k, Grid<Real> &dst, Grid<Real> &src, Real factor)
   {
+    const IndexInt idx = dst.index(i, j, k);
     dst[idx] = src[idx] + factor * dst[idx];
   }
   inline Grid<Real> &getArg0()
@@ -375,21 +381,35 @@ struct UpdateSearchVec : public KernelBase {
     return factor;
   }
   typedef Real type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel UpdateSearchVec ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, dst, src, factor);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    // Calls op() for every cell of the kernel range inside an OpenMP target region.
+    // The loop directive is followed directly by the for-nest (no enclosing braces).
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {
+      // Multi-slice case: k covers [minZ, maxZ) and all three loops are collapsed.
+      Grid<Real> &dst = getArg0();
+      Grid<Real> &src = getArg1();
+      Real &factor = getArg2();
+#pragma omp target teams distribute parallel for collapse(3) schedule(static, 1)
+      for (int k = minZ; k < maxZ; k++)
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, dst, src, factor);
+    }
+    else {
+      // Single-slice case: k is fixed at 0 and the j and i loops are collapsed.
+      const int k = 0;
+      Grid<Real> &dst = getArg0();
+      Grid<Real> &src = getArg1();
+      Real &factor = getArg2();
+#pragma omp target teams distribute parallel for collapse(2) schedule(static, 1)
+      for (int j = 0; j < _maxY; j++)
+        for (int i = 0; i < _maxX; i++)
+          op(i, j, k, dst, src, factor);
+    }
   }
   Grid<Real> &dst;
   Grid<Real> &src;
@@ -406,8 +426,10 @@ GridCg<APPLYMAT>::GridCg(Grid<Real> &dst,
                          Grid<Real> &search,
                          const FlagGrid &flags,
                          Grid<Real> &tmp,
-                         std::vector<Grid<Real> *> matrixAVec,
-                         std::vector<Grid<Real> *> rhsVec)
+                         Grid<Real> *pA0,
+                         Grid<Real> *pAi,
+                         Grid<Real> *pAj,
+                         Grid<Real> *pAk)
     : GridCgInterface(),
       mInited(false),
       mIterations(0),
@@ -417,8 +439,10 @@ GridCg<APPLYMAT>::GridCg(Grid<Real> &dst,
       mSearch(search),
       mFlags(flags),
       mTmp(tmp),
-      mMatrixA(matrixAVec),
-      mVecRhs(rhsVec),
+      mpA0(pA0),
+      mpAi(pAi),
+      mpAj(pAj),
+      mpAk(pAk),
       mPcMethod(PC_None),
       mpPCA0(nullptr),
       mpPCAi(nullptr),
@@ -436,54 +460,37 @@ template<class APPLYMAT> void GridCg<APPLYMAT>::doInit()
   mInited = true;
   mIterations = 0;
 
-  mDst.clear();
-  mResidual.copyFrom(mRhs);  // p=0, residual = b
+  mDst.clear(1);  // NOTE(review): meaning of the new argument 1 is not visible here - confirm
+  mResidual.copyFrom(mRhs, true, 1);  // p=0, residual = b; NOTE(review): confirm new (true, 1) args
 
   if (mPcMethod == PC_ICP) {
-    assertMsg(mDst.is3D(), "ICP only supports 3D grids so far");
-    InitPreconditionIncompCholesky(mFlags,
-                                   *mpPCA0,
-                                   *mpPCAi,
-                                   *mpPCAj,
-                                   *mpPCAk,
-                                   *mMatrixA[0],
-                                   *mMatrixA[1],
-                                   *mMatrixA[2],
-                                   *mMatrixA[3]);
-    ApplyPreconditionIncompCholesky(mTmp,
-                                    mResidual,
-                                    mFlags,
-                                    *mpPCA0,
-                                    *mpPCAi,
-                                    *mpPCAj,
-                                    *mpPCAk,
-                                    *mMatrixA[0],
-                                    *mMatrixA[1],
-                                    *mMatrixA[2],
-                                    *mMatrixA[3]);
+    // assertMsg(mDst.is3D(), "ICP only supports 3D grids so far");
+    InitPreconditionIncompCholesky(
+        mFlags, *mpPCA0, *mpPCAi, *mpPCAj, *mpPCAk, *mpA0, *mpAi, *mpAj, *mpAk);
+    ApplyPreconditionIncompCholesky(
+        mTmp, mResidual, mFlags, *mpPCA0, *mpPCAi, *mpPCAj, *mpPCAk, *mpA0, *mpAi, *mpAj, *mpAk);
   }
   else if (mPcMethod == PC_mICP) {
-    assertMsg(mDst.is3D(), "mICP only supports 3D grids so far");
-    InitPreconditionModifiedIncompCholesky2(
-        mFlags, *mpPCA0, *mMatrixA[0], *mMatrixA[1], *mMatrixA[2], *mMatrixA[3]);
+    // assertMsg(mDst.is3D(), "mICP only supports 3D grids so far");
+    InitPreconditionModifiedIncompCholesky2(mFlags, *mpPCA0, *mpA0, *mpAi, *mpAj, *mpAk);
     ApplyPreconditionModifiedIncompCholesky2(
-        mTmp, mResidual, mFlags, *mpPCA0, *mMatrixA[0], *mMatrixA[1], *mMatrixA[2], *mMatrixA[3]);
+        mTmp, mResidual, mFlags, *mpPCA0, *mpA0, *mpAi, *mpAj, *mpAk);
   }
   else if (mPcMethod == PC_MGP) {
-    InitPreconditionMultigrid(
-        mMG, *mMatrixA[0], *mMatrixA[1], *mMatrixA[2], *mMatrixA[3], mAccuracy);
+    InitPreconditionMultigrid(mMG, *mpA0, *mpAi, *mpAj, *mpAk, mAccuracy);
     ApplyPreconditionMultigrid(mMG, mTmp, mResidual);
   }
   else {
-    mTmp.copyFrom(mResidual);
+    mTmp.copyFrom(mResidual, true, 1);
   }
 
-  mSearch.copyFrom(mTmp);
+  mSearch.copyFrom(mTmp, true, 1);
   mSigma = GridDotProduct(mTmp, mResidual);
 }
 
-template<class APPLYMAT> bool GridCg<APPLYMAT>::iterate()
+template<class APPLYMAT> bool GridCg<APPLYMAT>::iterate(Real &time)
 {
+  auto start = high_resolution_clock::now();  // first timed span: doInit() (if run) plus APPLYMAT
   if (!mInited)
     doInit();
 
@@ -493,7 +500,14 @@ template<class APPLYMAT> bool GridCg<APPLYMAT>::iterate()
   // this could reinterpret the mpA pointers (not so clean right now)
   // tmp = applyMat(search)
 
-  APPLYMAT(mFlags, mTmp, mSearch, mMatrixA, mVecRhs);
+  APPLYMAT(mFlags, mTmp, mSearch, *mpA0, *mpAi, *mpAj, *mpAk);
+
+  auto stop = high_resolution_clock::now();
+  auto duration = duration_cast<microseconds>(stop - start);
+  time += duration.count();
+  // std::cout << "APPLYMAT Time taken: " << duration.count() << std::endl;
+
+  start = high_resolution_clock::now();
 
   // alpha = sigma/dot(tmp, search)
   Real dp = GridDotProduct(mTmp, mSearch);
@@ -501,35 +515,49 @@ template<class APPLYMAT> bool GridCg<APPLYMAT>::iterate()
   if (fabs(dp) > 0.)
     alpha = mSigma / (Real)dp;
 
+  stop = high_resolution_clock::now();
+  duration = duration_cast<microseconds>(stop - start);
+  time += duration.count();
+  // std::cout << "GridDotProduct Time taken: " << duration.count() << std::endl;
+
+  start = high_resolution_clock::now();
+
   gridScaledAdd<Real, Real>(mDst, mSearch, alpha);     // dst += search * alpha
   gridScaledAdd<Real, Real>(mResidual, mTmp, -alpha);  // residual += tmp * -alpha
 
+  stop = high_resolution_clock::now();
+  duration = duration_cast<microseconds>(stop - start);
+  time += duration.count();
+  // std::cout << "gridScaledAdd Time taken: " << duration.count() << std::endl;
+
+  start = high_resolution_clock::now();
+
   if (mPcMethod == PC_ICP)
-    ApplyPreconditionIncompCholesky(mTmp,
-                                    mResidual,
-                                    mFlags,
-                                    *mpPCA0,
-                                    *mpPCAi,
-                                    *mpPCAj,
-                                    *mpPCAk,
-                                    *mMatrixA[0],
-                                    *mMatrixA[1],
-                                    *mMatrixA[2],
-                                    *mMatrixA[3]);
+    ApplyPreconditionIncompCholesky(
+        mTmp, mResidual, mFlags, *mpPCA0, *mpPCAi, *mpPCAj, *mpPCAk, *mpA0, *mpAi, *mpAj, *mpAk);
   else if (mPcMethod == PC_mICP)
     ApplyPreconditionModifiedIncompCholesky2(
-        mTmp, mResidual, mFlags, *mpPCA0, *mMatrixA[0], *mMatrixA[1], *mMatrixA[2], *mMatrixA[3]);
+        mTmp, mResidual, mFlags, *mpPCA0, *mpA0, *mpAi, *mpAj, *mpAk);
   else if (mPcMethod == PC_MGP)
     ApplyPreconditionMultigrid(mMG, mTmp, mResidual);
   else
-    mTmp.copyFrom(mResidual);
+    mTmp.copyFrom(mResidual, true, 1);
+
+  stop = high_resolution_clock::now();
+  duration = duration_cast<microseconds>(stop - start);
+  time += duration.count();
+  // std::cout << "copyFrom Time taken: " << duration.count() << std::endl;
+
+  start = high_resolution_clock::now();
 
   // use the l2 norm of the residual for convergence check? (usually max norm is recommended
   // instead)
   if (this->mUseL2Norm) {
+    // std::cout << "USING L2" << std::endl;
     mResNorm = GridSumSqr(mResidual).sum;
   }
   else {
+    // std::cout << "NOT USING L2" << std::endl;
     mResNorm = mResidual.getMaxAbs();
   }
 
@@ -539,27 +567,43 @@ template<class APPLYMAT> bool GridCg<APPLYMAT>::iterate()
     return false;
   }
 
+  stop = high_resolution_clock::now();
+  duration = duration_cast<microseconds>(stop - start);
+  time += duration.count();
+  // std::cout << "GridSumSqr Time taken: " << duration.count() << std::endl;
+
+  start = high_resolution_clock::now();
+
   Real sigmaNew = GridDotProduct(mTmp, mResidual);
   Real beta = sigmaNew / mSigma;
 
+  stop = high_resolution_clock::now();
+  duration = duration_cast<microseconds>(stop - start);
+  time += duration.count();
+  // std::cout << "GridDotProduct Time taken: " << duration.count() << std::endl;
+
+  start = high_resolution_clock::now();
+
   // search =  tmp + beta * search
   UpdateSearchVec(mSearch, mTmp, beta);
 
-  debMsg("GridCg::iterate i=" << mIterations << " sigmaNew=" << sigmaNew << " sigmaLast=" << mSigma
-                              << " alpha=" << alpha << " beta=" << beta << " ",
-         CG_DEBUGLEVEL);
+  stop = high_resolution_clock::now();
+  duration = duration_cast<microseconds>(stop - start);
+  time += duration.count();
+  // std::cout << "UpdateSearchVec Time taken: " << duration.count() << std::endl;
+
+  // debMsg("GridCg::iterate i=" << mIterations << " sigmaNew=" << sigmaNew << " sigmaLast="
+  //        << mSigma << " alpha=" << alpha << " beta=" << beta << " ", CG_DEBUGLEVEL);
   mSigma = sigmaNew;
 
   if (!(mResNorm < 1e35)) {
     if (mPcMethod == PC_MGP) {
       // diverging solves can be caused by the static multigrid mode, we cannot detect this here,
       // though only the pressure solve call "knows" whether the MG is static or dynamics...
-      debMsg(
-          "GridCg::iterate: Warning - this diverging solve can be caused by the 'static' mode of "
-          "the MG preconditioner. If the static mode is active, try switching to dynamic.",
-          1);
+      // debMsg("GridCg::iterate: Warning - this diverging solve can be caused by the 'static' mode
+      // of the MG preconditioner. If the static mode is active, try switching to dynamic.", 1);
     }
-    errMsg("GridCg::iterate: The CG solver diverged, residual norm > 1e30, stopping.");
+    return false;  // diverged (norm is NaN or >= 1e35): errMsg() is disabled, so stop the solve here
   }
 
   // debMsg("PB-CG-Norms::p"<<sqrt( GridOpNormNosqrt(mpDst, mpFlags).getValue() ) <<"
@@ -571,8 +615,9 @@ template<class APPLYMAT> bool GridCg<APPLYMAT>::iterate()
 
 template<class APPLYMAT> void GridCg<APPLYMAT>::solve(int maxIter)
 {
+  Real time = 0;  // running total of the microsecond timings added by iterate(); not read below
   for (int iter = 0; iter < maxIter; iter++) {
-    if (!iterate())
+    if (!iterate(time))  // a false result ends the loop by forcing iter to maxIter
       iter = maxIter;
   }
   return;
@@ -583,13 +628,13 @@ template<class APPLYMAT>
 void GridCg<APPLYMAT>::setICPreconditioner(
     PreconditionType method, Grid<Real> *A0, Grid<Real> *Ai, Grid<Real> *Aj, Grid<Real> *Ak)
 {
-  assertMsg(method == PC_ICP || method == PC_mICP,
-            "GridCg<APPLYMAT>::setICPreconditioner: Invalid method specified.");
+  // assertMsg(method == PC_ICP || method == PC_mICP,
+  //           "GridCg<APPLYMAT>::setICPreconditioner: Invalid method specified.");
 
   mPcMethod = method;
   if ((!A0->is3D())) {
     if (gPrint2dWarning) {
-      debMsg("ICP/mICP pre-conditioning only supported in 3D for now, disabling it.", 1);
+      // debMsg("ICP/mICP pre-conditioning only supported in 3D for now, disabling it.", 1);
       gPrint2dWarning = false;
     }
     mPcMethod = PC_None;
@@ -603,7 +648,7 @@ void GridCg<APPLYMAT>::setICPreconditioner(
 template<class APPLYMAT>
 void GridCg<APPLYMAT>::setMGPreconditioner(PreconditionType method, GridMg *MG)
 {
-  assertMsg(method == PC_MGP, "GridCg<APPLYMAT>::setMGPreconditioner: Invalid method specified.");
+  // assertMsg(method==PC_MGP, "GridCg<APPLYMAT>::setMGPreconditioner: Invalid method specified.");
 
   mPcMethod = method;
   mMG = MG;
@@ -612,9 +657,6 @@ void GridCg<APPLYMAT>::setMGPreconditioner(PreconditionType method, GridMg *MG)
 // explicit instantiation
 template class GridCg<ApplyMatrix>;
 template class GridCg<ApplyMatrix2D>;
-template class GridCg<ApplyMatrixViscosityU>;
-template class GridCg<ApplyMatrixViscosityV>;
-template class GridCg<ApplyMatrixViscosityW>;
 
 //*****************************************************************************
 // diffusion for real and vec grids, e.g. for viscosity
@@ -655,44 +697,33 @@ void cgSolveDiffusion(const FlagGrid &flags,
     }
   }
 
-  GridCgInterface *gcg;
+  GridCgInterface *gcg = nullptr;
   // note , no preconditioning for now...
   const int maxIter = (int)(cgMaxIterFac * flags.getSize().max()) * (flags.is3D() ? 1 : 4);
 
   if (grid.getType() & GridBase::TypeReal) {
     Grid<Real> &u = ((Grid<Real> &)grid);
     rhs.copyFrom(u);
-    vector<Grid<Real> *> matA{&A0, &Ai, &Aj};
-
-    if (flags.is3D()) {
-      matA.push_back(&Ak);
-      gcg = new GridCg<ApplyMatrix>(u, rhs, residual, search, flags, tmp, matA);
-    }
-    else {
-      gcg = new GridCg<ApplyMatrix2D>(u, rhs, residual, search, flags, tmp, matA);
-    }
+    if (flags.is3D())
+      gcg = new GridCg<ApplyMatrix>(u, rhs, residual, search, flags, tmp, &A0, &Ai, &Aj, &Ak);
+    else
+      gcg = new GridCg<ApplyMatrix2D>(u, rhs, residual, search, flags, tmp, &A0, &Ai, &Aj, &Ak);
 
     gcg->setAccuracy(cgAccuracy);
     gcg->solve(maxIter);
 
-    debMsg("FluidSolver::solveDiffusion iterations:" << gcg->getIterations()
-                                                     << ", res:" << gcg->getSigma(),
-           CG_DEBUGLEVEL);
+    // debMsg("FluidSolver::solveDiffusion iterations:" << gcg->getIterations()
+    //        << ", res:" << gcg->getSigma(), CG_DEBUGLEVEL);
   }
   else if ((grid.getType() & GridBase::TypeVec3) || (grid.getType() & GridBase::TypeMAC)) {
     Grid<Vec3> &vec = ((Grid<Vec3> &)grid);
     Grid<Real> u(parent);
-    vector<Grid<Real> *> matA{&A0, &Ai, &Aj};
 
     // core solve is same as for a regular real grid
-    if (flags.is3D()) {
-      matA.push_back(&Ak);
-      gcg = new GridCg<ApplyMatrix>(u, rhs, residual, search, flags, tmp, matA);
-    }
-    else {
-      gcg = new GridCg<ApplyMatrix2D>(u, rhs, residual, search, flags, tmp, matA);
-    }
-
+    if (flags.is3D())
+      gcg = new GridCg<ApplyMatrix>(u, rhs, residual, search, flags, tmp, &A0, &Ai, &Aj, &Ak);
+    else
+      gcg = new GridCg<ApplyMatrix2D>(u, rhs, residual, search, flags, tmp, &A0, &Ai, &Aj, &Ak);
     gcg->setAccuracy(cgAccuracy);
 
     // diffuse every component separately
@@ -702,15 +733,14 @@ void cgSolveDiffusion(const FlagGrid &flags,
 
       rhs.copyFrom(u);
       gcg->solve(maxIter);
-      debMsg("FluidSolver::solveDiffusion vec3, iterations:" << gcg->getIterations()
-                                                             << ", res:" << gcg->getSigma(),
-             CG_DEBUGLEVEL);
+      // debMsg("FluidSolver::solveDiffusion vec3, iterations:" << gcg->getIterations()
+      //        << ", res:" << gcg->getSigma(), CG_DEBUGLEVEL);
 
       setComponent(u, vec, component);
     }
   }
   else {
-    errMsg("cgSolveDiffusion: Grid Type is not supported (only Real, Vec3, MAC, or Levelset)");
+    // errMsg("cgSolveDiffusion: Grid Type is not supported (only Real, Vec3, MAC, or Levelset)");
   }
 
   delete gcg;
diff --git a/extern/mantaflow/preprocessed/conjugategrad.h b/extern/mantaflow/preprocessed/conjugategrad.h
index 35cb3960656..189f85de689 100644
--- a/extern/mantaflow/preprocessed/conjugategrad.h
+++ b/extern/mantaflow/preprocessed/conjugategrad.h
@@ -37,7 +37,7 @@ class GridCgInterface {
   virtual ~GridCgInterface(){};
 
   // solving functions
-  virtual bool iterate() = 0;
+  virtual bool iterate(Real &time) = 0;  // time: total that GridCg adds its elapsed microseconds to
   virtual void solve(int maxIter) = 0;
 
   // precond
@@ -78,12 +78,16 @@ template<class APPLYMAT> class GridCg : public GridCgInterface {
          Grid<Real> &search,
          const FlagGrid &flags,
          Grid<Real> &tmp,
-         std::vector<Grid<Real> *> matrixAVec,
-         std::vector<Grid<Real> *> rhsVec = {});
-  ~GridCg(){};
+         Grid<Real> *pA0,  // pA0..pAk: grids of matrix A, stored in mpA0..mpAk by the constructor
+         Grid<Real> *pAi,
+         Grid<Real> *pAj,
+         Grid<Real> *pAk);
+  ~GridCg()
+  {
+  }
 
   void doInit();
-  bool iterate();
+  bool iterate(Real &time);
   void solve(int maxIter);
   //! init pointers, and copy values from "normal" matrix
   void setICPreconditioner(
@@ -129,10 +133,7 @@ template<class APPLYMAT> class GridCg : public GridCgInterface {
   const FlagGrid &mFlags;
   Grid<Real> &mTmp;
 
-  //! shape of A matrix defined here (e.g. diagonal,  positive neighbor cells, etc)
-  std::vector<Grid<Real> *> mMatrixA;
-  //! shape of rhs vector defined here (e.g. 1 rhs for regular fluids solve, 3 rhs for viscosity)
-  std::vector<Grid<Real> *> mVecRhs;
+  Grid<Real> *mpA0, *mpAi, *mpAj, *mpAk;  //!< matrix A grids, passed to APPLYMAT and preconditioners
 
   PreconditionType mPcMethod;
   //! preconditioning grids
@@ -153,33 +154,32 @@ struct ApplyMatrix : public KernelBase {
   ApplyMatrix(const FlagGrid &flags,
               Grid<Real> &dst,
               const Grid<Real> &src,
-              const std::vector<Grid<Real> *> matrixA,
-              const std::vector<Grid<Real> *> vecRhs)
-      : KernelBase(&flags, 0), flags(flags), dst(dst), src(src), matrixA(matrixA), vecRhs(vecRhs)
+              Grid<Real> &A0,
+              Grid<Real> &Ai,
+              Grid<Real> &Aj,
+              Grid<Real> &Ak)
+      : KernelBase(&flags, 0), flags(flags), dst(dst), src(src), A0(A0), Ai(Ai), Aj(Aj), Ak(Ak)
   {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx,
+  inline void op(int i,
+                 int j,
+                 int k,
                  const FlagGrid &flags,
                  Grid<Real> &dst,
                  const Grid<Real> &src,
-                 const std::vector<Grid<Real> *> matrixA,
-                 const std::vector<Grid<Real> *> vecRhs) const
+                 Grid<Real> &A0,
+                 Grid<Real> &Ai,
+                 Grid<Real> &Aj,
+                 Grid<Real> &Ak)
   {
-    unusedParameter(vecRhs);  // Not needed in this matrix application
-
-    if (matrixA.size() != 4)
-      errMsg("ConjugateGrad: Invalid A matrix in apply matrix step");
-    Grid<Real> &A0 = *matrixA[0];
-    Grid<Real> &Ai = *matrixA[1];
-    Grid<Real> &Aj = *matrixA[2];
-    Grid<Real> &Ak = *matrixA[3];
-
+    const IndexInt idx = dst.index(i, j, k);
     if (!flags.isFluid(idx)) {
       dst[idx] = src[idx];
       return;
     }
+    const IndexInt X = flags.getStrideX(), Y = flags.getStrideY(), Z = flags.getStrideZ();
 
     dst[idx] = src[idx] * A0[idx] + src[idx - X] * Ai[idx - X] + src[idx + X] * Ai[idx] +
                src[idx - Y] * Aj[idx - Y] + src[idx + Y] * Aj[idx] + src[idx - Z] * Ak[idx - Z] +
@@ -200,37 +200,71 @@ struct ApplyMatrix : public KernelBase {
     return src;
   }
   typedef Grid<Real> type2;
-  inline const std::vector<Grid<Real> *> &getArg3()
+  inline Grid<Real> &getArg3()
   {
-    return matrixA;
+    return A0;
   }
-  typedef std::vector<Grid<Real> *> type3;
-  inline const std::vector<Grid<Real> *> &getArg4()
+  typedef Grid<Real> type3;
+  inline Grid<Real> &getArg4()
   {
-    return vecRhs;
+    return Ai;
   }
-  typedef std::vector<Grid<Real> *> type4;
-  void runMessage()
+  typedef Grid<Real> type4;
+  inline Grid<Real> &getArg5()
   {
-    debMsg("Executing kernel ApplyMatrix ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+    return Aj;
+  }
+  typedef Grid<Real> type5;
+  inline Grid<Real> &getArg6()
   {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, flags, dst, src, matrixA, vecRhs);
+    return Ak;
   }
+  typedef Grid<Real> type6;
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const int _maxX = maxX;  // local copies of the x/y loop bounds used by the loops below
+    const int _maxY = maxY;
+    if (maxZ > 1) {  // more than one z-layer: loop over k, j and i
+      const FlagGrid &flags = getArg0();  // local aliases of the kernel arguments handed to op()
+      Grid<Real> &dst = getArg1();
+      const Grid<Real> &src = getArg2();
+      Grid<Real> &A0 = getArg3();
+      Grid<Real> &Ai = getArg4();
+      Grid<Real> &Aj = getArg5();
+      Grid<Real> &Ak = getArg6();
+#pragma omp target teams distribute parallel for collapse(3) schedule(static, 1)
+      {  // the three nested loops are the loop nest named by collapse(3) above
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, flags, dst, src, A0, Ai, Aj, Ak);
+      }
+    }
+    else {  // single z-layer: k is fixed to 0 and only j and i are looped
+      const int k = 0;
+      const FlagGrid &flags = getArg0();
+      Grid<Real> &dst = getArg1();
+      const Grid<Real> &src = getArg2();
+      Grid<Real> &A0 = getArg3();
+      Grid<Real> &Ai = getArg4();
+      Grid<Real> &Aj = getArg5();
+      Grid<Real> &Ak = getArg6();
+#pragma omp target teams distribute parallel for collapse(2) schedule(static, 1)
+      {  // the two nested loops are the loop nest named by collapse(2) above
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, flags, dst, src, A0, Ai, Aj, Ak);
+      }
+    }
   }
   const FlagGrid &flags;
   Grid<Real> &dst;
   const Grid<Real> &src;
-  const std::vector<Grid<Real> *> matrixA;
-  const std::vector<Grid<Real> *> vecRhs;
+  Grid<Real> &A0;
+  Grid<Real> &Ai;
+  Grid<Real> &Aj;
+  Grid<Real> &Ak;
 };
 
 //! Kernel: Apply symmetric stored Matrix. 2D version
@@ -239,32 +273,34 @@ struct ApplyMatrix2D : public KernelBase {
   ApplyMatrix2D(const FlagGrid &flags,
                 Grid<Real> &dst,
                 const Grid<Real> &src,
-                const std::vector<Grid<Real> *> matrixA,
-                const std::vector<Grid<Real> *> vecRhs)
-      : KernelBase(&flags, 0), flags(flags), dst(dst), src(src), matrixA(matrixA), vecRhs(vecRhs)
+                Grid<Real> &A0,
+                Grid<Real> &Ai,
+                Grid<Real> &Aj,
+                Grid<Real> &Ak)
+      : KernelBase(&flags, 0), flags(flags), dst(dst), src(src), A0(A0), Ai(Ai), Aj(Aj), Ak(Ak)
   {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx,
+  inline void op(int i,
+                 int j,
+                 int k,
                  const FlagGrid &flags,
                  Grid<Real> &dst,
                  const Grid<Real> &src,
-                 const std::vector<Grid<Real> *> matrixA,
-                 const std::vector<Grid<Real> *> vecRhs) const
+                 Grid<Real> &A0,
+                 Grid<Real> &Ai,
+                 Grid<Real> &Aj,
+                 Grid<Real> &Ak)
   {
-    unusedParameter(vecRhs);  // Not needed in this matrix application
-
-    if (matrixA.size() != 3)
-      errMsg("ConjugateGrad: Invalid A matrix in apply matrix step");
-    Grid<Real> &A0 = *matrixA[0];
-    Grid<Real> &Ai = *matrixA[1];
-    Grid<Real> &Aj = *matrixA[2];
+    unusedParameter(Ak);  // only there for parameter compatibility with ApplyMatrix
 
+    const IndexInt idx = dst.index(i, j, k);  // index of cell (i, j, k); used for every grid below
     if (!flags.isFluid(idx)) {
       dst[idx] = src[idx];
       return;
     }
+    const IndexInt X = flags.getStrideX(), Y = flags.getStrideY();  // 2D stencil: no Z stride used
 
     dst[idx] = src[idx] * A0[idx] + src[idx - X] * Ai[idx - X] + src[idx + X] * Ai[idx] +
                src[idx - Y] * Aj[idx - Y] + src[idx + Y] * Aj[idx];
@@ -284,387 +320,73 @@ struct ApplyMatrix2D : public KernelBase {
     return src;
   }
   typedef Grid<Real> type2;
-  inline const std::vector<Grid<Real> *> &getArg3()
-  {
-    return matrixA;
-  }
-  typedef std::vector<Grid<Real> *> type3;
-  inline const std::vector<Grid<Real> *> &getArg4()
-  {
-    return vecRhs;
-  }
-  typedef std::vector<Grid<Real> *> type4;
-  void runMessage()
-  {
-    debMsg("Executing kernel ApplyMatrix2D ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, flags, dst, src, matrixA, vecRhs);
-  }
-  void run()
-  {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  const FlagGrid &flags;
-  Grid<Real> &dst;
-  const Grid<Real> &src;
-  const std::vector<Grid<Real> *> matrixA;
-  const std::vector<Grid<Real> *> vecRhs;
-};
-
-struct ApplyMatrixViscosityU : public KernelBase {
-  ApplyMatrixViscosityU(const FlagGrid &flags,
-                        Grid<Real> &dst,
-                        const Grid<Real> &src,
-                        const std::vector<Grid<Real> *> matrixA,
-                        const std::vector<Grid<Real> *> vecRhs)
-      : KernelBase(&flags, 1), flags(flags), dst(dst), src(src), matrixA(matrixA), vecRhs(vecRhs)
-  {
-    runMessage();
-    run();
-  }
-  inline void op(int i,
-                 int j,
-                 int k,
-                 const FlagGrid &flags,
-                 Grid<Real> &dst,
-                 const Grid<Real> &src,
-                 const std::vector<Grid<Real> *> matrixA,
-                 const std::vector<Grid<Real> *> vecRhs) const
-  {
-    if (matrixA.size() != 15)
-      errMsg("ConjugateGrad: Invalid A matrix in apply matrix step");
-    Grid<Real> &A0 = *matrixA[0];
-    Grid<Real> &Aplusi = *matrixA[1];
-    Grid<Real> &Aplusj = *matrixA[2];
-    Grid<Real> &Aplusk = *matrixA[3];
-    Grid<Real> &Aminusi = *matrixA[4];
-    Grid<Real> &Aminusj = *matrixA[5];
-    Grid<Real> &Aminusk = *matrixA[6];
-
-    if (vecRhs.size() != 2)
-      errMsg("ConjugateGrad: Invalid rhs vector in apply matrix step");
-    Grid<Real> &srcV = *vecRhs[0];
-    Grid<Real> &srcW = *vecRhs[1];
-
-    dst(i, j, k) = src(i, j, k) * A0(i, j, k) + src(i + 1, j, k) * Aplusi(i, j, k) +
-                   src(i, j + 1, k) * Aplusj(i, j, k) + src(i, j, k + 1) * Aplusk(i, j, k) +
-                   src(i - 1, j, k) * Aminusi(i, j, k) + src(i, j - 1, k) * Aminusj(i, j, k) +
-                   src(i, j, k - 1) * Aminusk(i, j, k);
-
-    dst(i, j, k) += srcV(i, j + 1, k) * (*matrixA[7])(i, j, k) +
-                    srcV(i - 1, j + 1, k) * (*matrixA[8])(i, j, k) +
-                    srcV(i, j, k) * (*matrixA[9])(i, j, k) +
-                    srcV(i - 1, j, k) * (*matrixA[10])(i, j, k) +
-                    srcW(i, j, k + 1) * (*matrixA[11])(i, j, k) +
-                    srcW(i - 1, j, k + 1) * (*matrixA[12])(i, j, k) +
-                    srcW(i, j, k) * (*matrixA[13])(i, j, k) +
-                    srcW(i - 1, j, k) * (*matrixA[14])(i, j, k);
-  }
-  inline const FlagGrid &getArg0()
-  {
-    return flags;
-  }
-  typedef FlagGrid type0;
-  inline Grid<Real> &getArg1()
-  {
-    return dst;
-  }
-  typedef Grid<Real> type1;
-  inline const Grid<Real> &getArg2()
-  {
-    return src;
-  }
-  typedef Grid<Real> type2;
-  inline const std::vector<Grid<Real> *> &getArg3()
-  {
-    return matrixA;
-  }
-  typedef std::vector<Grid<Real> *> type3;
-  inline const std::vector<Grid<Real> *> &getArg4()
-  {
-    return vecRhs;
-  }
-  typedef std::vector<Grid<Real> *> type4;
-  void runMessage()
-  {
-    debMsg("Executing kernel ApplyMatrixViscosityU ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    const int _maxX = maxX;
-    const int _maxY = maxY;
-    if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, flags, dst, src, matrixA, vecRhs);
-    }
-    else {
-      const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, flags, dst, src, matrixA, vecRhs);
-    }
-  }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
-  const FlagGrid &flags;
-  Grid<Real> &dst;
-  const Grid<Real> &src;
-  const std::vector<Grid<Real> *> matrixA;
-  const std::vector<Grid<Real> *> vecRhs;
-};
-
-struct ApplyMatrixViscosityV : public KernelBase {
-  ApplyMatrixViscosityV(const FlagGrid &flags,
-                        Grid<Real> &dst,
-                        const Grid<Real> &src,
-                        const std::vector<Grid<Real> *> matrixA,
-                        const std::vector<Grid<Real> *> vecRhs)
-      : KernelBase(&flags, 1), flags(flags), dst(dst), src(src), matrixA(matrixA), vecRhs(vecRhs)
-  {
-    runMessage();
-    run();
-  }
-  inline void op(int i,
-                 int j,
-                 int k,
-                 const FlagGrid &flags,
-                 Grid<Real> &dst,
-                 const Grid<Real> &src,
-                 const std::vector<Grid<Real> *> matrixA,
-                 const std::vector<Grid<Real> *> vecRhs) const
-  {
-    if (matrixA.size() != 15)
-      errMsg("ConjugateGrad: Invalid A matrix in apply matrix step");
-    Grid<Real> &A0 = *matrixA[0];
-    Grid<Real> &Aplusi = *matrixA[1];
-    Grid<Real> &Aplusj = *matrixA[2];
-    Grid<Real> &Aplusk = *matrixA[3];
-    Grid<Real> &Aminusi = *matrixA[4];
-    Grid<Real> &Aminusj = *matrixA[5];
-    Grid<Real> &Aminusk = *matrixA[6];
-
-    if (vecRhs.size() != 2)
-      errMsg("ConjugateGrad: Invalid rhs vector in apply matrix step");
-    Grid<Real> &srcU = *vecRhs[0];
-    Grid<Real> &srcW = *vecRhs[1];
-
-    dst(i, j, k) = src(i, j, k) * A0(i, j, k) + src(i + 1, j, k) * Aplusi(i, j, k) +
-                   src(i, j + 1, k) * Aplusj(i, j, k) + src(i, j, k + 1) * Aplusk(i, j, k) +
-                   src(i - 1, j, k) * Aminusi(i, j, k) + src(i, j - 1, k) * Aminusj(i, j, k) +
-                   src(i, j, k - 1) * Aminusk(i, j, k);
-
-    dst(i, j, k) += srcU(i + 1, j, k) * (*matrixA[7])(i, j, k) +
-                    srcU(i + 1, j - 1, k) * (*matrixA[8])(i, j, k) +
-                    srcU(i, j, k) * (*matrixA[9])(i, j, k) +
-                    srcU(i, j - 1, k) * (*matrixA[10])(i, j, k) +
-                    srcW(i, j, k + 1) * (*matrixA[11])(i, j, k) +
-                    srcW(i, j - 1, k + 1) * (*matrixA[12])(i, j, k) +
-                    srcW(i, j, k) * (*matrixA[13])(i, j, k) +
-                    srcW(i, j - 1, k) * (*matrixA[14])(i, j, k);
-  }
-  inline const FlagGrid &getArg0()
-  {
-    return flags;
-  }
-  typedef FlagGrid type0;
-  inline Grid<Real> &getArg1()
-  {
-    return dst;
-  }
-  typedef Grid<Real> type1;
-  inline const Grid<Real> &getArg2()
+  inline Grid<Real> &getArg3()
   {
-    return src;
+    return A0;
   }
-  typedef Grid<Real> type2;
-  inline const std::vector<Grid<Real> *> &getArg3()
+  typedef Grid<Real> type3;
+  inline Grid<Real> &getArg4()
   {
-    return matrixA;
+    return Ai;
   }
-  typedef std::vector<Grid<Real> *> type3;
-  inline const std::vector<Grid<Real> *> &getArg4()
+  typedef Grid<Real> type4;
+  inline Grid<Real> &getArg5()
   {
-    return vecRhs;
+    return Aj;
   }
-  typedef std::vector<Grid<Real> *> type4;
-  void runMessage()
+  typedef Grid<Real> type5;
+  inline Grid<Real> &getArg6()
   {
-    debMsg("Executing kernel ApplyMatrixViscosityV ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    const int _maxX = maxX;
-    const int _maxY = maxY;
-    if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, flags, dst, src, matrixA, vecRhs);
-    }
-    else {
-      const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, flags, dst, src, matrixA, vecRhs);
-    }
+    return Ak;
   }
+  typedef Grid<Real> type6;
+  void runMessage(){};
   void run()
   {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
-  const FlagGrid &flags;
-  Grid<Real> &dst;
-  const Grid<Real> &src;
-  const std::vector<Grid<Real> *> matrixA;
-  const std::vector<Grid<Real> *> vecRhs;
-};
-
-struct ApplyMatrixViscosityW : public KernelBase {
-  ApplyMatrixViscosityW(const FlagGrid &flags,
-                        Grid<Real> &dst,
-                        const Grid<Real> &src,
-                        const std::vector<Grid<Real> *> matrixA,
-                        const std::vector<Grid<Real> *> vecRhs)
-      : KernelBase(&flags, 1), flags(flags), dst(dst), src(src), matrixA(matrixA), vecRhs(vecRhs)
-  {
-    runMessage();
-    run();
-  }
-  inline void op(int i,
-                 int j,
-                 int k,
-                 const FlagGrid &flags,
-                 Grid<Real> &dst,
-                 const Grid<Real> &src,
-                 const std::vector<Grid<Real> *> matrixA,
-                 const std::vector<Grid<Real> *> vecRhs) const
-  {
-    if (matrixA.size() != 15)
-      errMsg("ConjugateGrad: Invalid A matrix in apply matrix step");
-    Grid<Real> &A0 = *matrixA[0];
-    Grid<Real> &Aplusi = *matrixA[1];
-    Grid<Real> &Aplusj = *matrixA[2];
-    Grid<Real> &Aplusk = *matrixA[3];
-    Grid<Real> &Aminusi = *matrixA[4];
-    Grid<Real> &Aminusj = *matrixA[5];
-    Grid<Real> &Aminusk = *matrixA[6];
-
-    if (vecRhs.size() != 2)
-      errMsg("ConjugateGrad: Invalid rhs vector in apply matrix step");
-    Grid<Real> &srcU = *vecRhs[0];
-    Grid<Real> &srcV = *vecRhs[1];
-
-    dst(i, j, k) = src(i, j, k) * A0(i, j, k) + src(i + 1, j, k) * Aplusi(i, j, k) +
-                   src(i, j + 1, k) * Aplusj(i, j, k) + src(i, j, k + 1) * Aplusk(i, j, k) +
-                   src(i - 1, j, k) * Aminusi(i, j, k) + src(i, j - 1, k) * Aminusj(i, j, k) +
-                   src(i, j, k - 1) * Aminusk(i, j, k);
-
-    dst(i, j, k) += srcU(i + 1, j, k) * (*matrixA[7])(i, j, k) +
-                    srcU(i + 1, j, k - 1) * (*matrixA[8])(i, j, k) +
-                    srcU(i, j, k) * (*matrixA[9])(i, j, k) +
-                    srcU(i, j, k - 1) * (*matrixA[10])(i, j, k) +
-                    srcV(i, j + 1, k) * (*matrixA[11])(i, j, k) +
-                    srcV(i, j + 1, k - 1) * (*matrixA[12])(i, j, k) +
-                    srcV(i, j, k) * (*matrixA[13])(i, j, k) +
-                    srcV(i, j, k - 1) * (*matrixA[14])(i, j, k);
-  }
-  inline const FlagGrid &getArg0()
-  {
-    return flags;
-  }
-  typedef FlagGrid type0;
-  inline Grid<Real> &getArg1()
-  {
-    return dst;
-  }
-  typedef Grid<Real> type1;
-  inline const Grid<Real> &getArg2()
-  {
-    return src;
-  }
-  typedef Grid<Real> type2;
-  inline const std::vector<Grid<Real> *> &getArg3()
-  {
-    return matrixA;
-  }
-  typedef std::vector<Grid<Real> *> type3;
-  inline const std::vector<Grid<Real> *> &getArg4()
-  {
-    return vecRhs;
-  }
-  typedef std::vector<Grid<Real> *> type4;
-  void runMessage()
-  {
-    debMsg("Executing kernel ApplyMatrixViscosityW ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, flags, dst, src, matrixA, vecRhs);
+      const FlagGrid &flags = getArg0();
+      Grid<Real> &dst = getArg1();
+      const Grid<Real> &src = getArg2();
+      Grid<Real> &A0 = getArg3();
+      Grid<Real> &Ai = getArg4();
+      Grid<Real> &Aj = getArg5();
+      Grid<Real> &Ak = getArg6();
+#pragma omp target teams distribute parallel for collapse(2) schedule(static, 1)
+      {
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, flags, dst, src, A0, Ai, Aj, Ak);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, flags, dst, src, matrixA, vecRhs);
+      const FlagGrid &flags = getArg0();
+      Grid<Real> &dst = getArg1();
+      const Grid<Real> &src = getArg2();
+      Grid<Real> &A0 = getArg3();
+      Grid<Real> &Ai = getArg4();
+      Grid<Real> &Aj = getArg5();
+      Grid<Real> &Ak = getArg6();
+#pragma omp target teams distribute parallel for collapse(1) schedule(static, 1)
+      {
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, flags, dst, src, A0, Ai, Aj, Ak);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   const FlagGrid &flags;
   Grid<Real> &dst;
   const Grid<Real> &src;
-  const std::vector<Grid<Real> *> matrixA;
-  const std::vector<Grid<Real> *> vecRhs;
+  Grid<Real> &A0;
+  Grid<Real> &Ai;
+  Grid<Real> &Aj;
+  Grid<Real> &Ak;
 };
 
-/* NOTE: Use this template for new matrix application kernels
-
-//! Template for matrix application kernels
-KERNEL()
-void ApplyMatrixTemplate (const FlagGrid& flags, Grid<Real>& dst, const Grid<Real>& src,
-  const std::vector<Grid<Real> *> matrixA, const std::vector<Grid<Real> *> vecRhs)
-{
-  // The kernel must define how to use the grids from the matrixA and vecRhs lists
-}
-
-*/
-
 //! Kernel: Construct the matrix for the poisson equation
 
 struct MakeLaplaceMatrix : public KernelBase {
@@ -687,7 +409,7 @@ struct MakeLaplaceMatrix : public KernelBase {
                  Grid<Real> &Ai,
                  Grid<Real> &Aj,
                  Grid<Real> &Ak,
-                 const MACGrid *fractions = 0) const
+                 const MACGrid *fractions = 0)
   {
     if (!flags.isFluid(i, j, k))
       return;
@@ -765,37 +487,42 @@ struct MakeLaplaceMatrix : public KernelBase {
     return fractions;
   }
   typedef MACGrid type5;
-  void runMessage()
-  {
-    debMsg("Executing kernel MakeLaplaceMatrix ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, flags, A0, Ai, Aj, Ak, fractions);
+      const FlagGrid &flags = getArg0();
+      Grid<Real> &A0 = getArg1();
+      Grid<Real> &Ai = getArg2();
+      Grid<Real> &Aj = getArg3();
+      Grid<Real> &Ak = getArg4();
+      const MACGrid *fractions = getArg5();
+#pragma omp target teams distribute parallel for collapse(3) schedule(static, 1)
+      {
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, flags, A0, Ai, Aj, Ak, fractions);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, flags, A0, Ai, Aj, Ak, fractions);
+      const FlagGrid &flags = getArg0();
+      Grid<Real> &A0 = getArg1();
+      Grid<Real> &Ai = getArg2();
+      Grid<Real> &Aj = getArg3();
+      Grid<Real> &Ak = getArg4();
+      const MACGrid *fractions = getArg5();
+#pragma omp target teams distribute parallel for collapse(2) schedule(static, 1)
+      {
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, flags, A0, Ai, Aj, Ak, fractions);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   const FlagGrid &flags;
   Grid<Real> &A0;
   Grid<Real> &Ai;
diff --git a/extern/mantaflow/preprocessed/fastmarch.cpp b/extern/mantaflow/preprocessed/fastmarch.cpp
index 31e43483b49..5499ec9cbdf 100644
--- a/extern/mantaflow/preprocessed/fastmarch.cpp
+++ b/extern/mantaflow/preprocessed/fastmarch.cpp
@@ -236,13 +236,7 @@ struct SetLevelsetBoundaries : public KernelBase {
     return phi;
   }
   typedef Grid<Real> type0;
-  void runMessage()
-  {
-    debMsg("Executing kernel SetLevelsetBoundaries ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
+  void runMessage(){};
   void run()
   {
     const int _maxX = maxX;
@@ -297,14 +291,8 @@ struct knExtrapolateMACSimple : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i,
-                 int j,
-                 int k,
-                 MACGrid &vel,
-                 int distance,
-                 Grid<int> &tmp,
-                 const int d,
-                 const int c) const
+  inline void op(
+      int i, int j, int k, MACGrid &vel, int distance, Grid<int> &tmp, const int d, const int c)
   {
     static const Vec3i nb[6] = {Vec3i(1, 0, 0),
                                 Vec3i(-1, 0, 0),
@@ -359,37 +347,35 @@ struct knExtrapolateMACSimple : public KernelBase {
     return c;
   }
   typedef int type4;
-  void runMessage()
-  {
-    debMsg("Executing kernel knExtrapolateMACSimple ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, vel, distance, tmp, d, c);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, vel, distance, tmp, d, c);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, vel, distance, tmp, d, c);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, vel, distance, tmp, d, c);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   MACGrid &vel;
   int distance;
   Grid<int> &tmp;
@@ -405,7 +391,7 @@ struct knExtrapolateIntoBnd : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, FlagGrid &flags, MACGrid &vel, const MACGrid &velTmp) const
+  inline void op(int i, int j, int k, FlagGrid &flags, MACGrid &vel, const MACGrid &velTmp)
   {
     int c = 0;
     Vec3 v(0, 0, 0);
@@ -467,37 +453,35 @@ struct knExtrapolateIntoBnd : public KernelBase {
     return velTmp;
   }
   typedef MACGrid type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel knExtrapolateIntoBnd ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, flags, vel, velTmp);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, flags, vel, velTmp);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, flags, vel, velTmp);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, flags, vel, velTmp);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   FlagGrid &flags;
   MACGrid &vel;
   const MACGrid &velTmp;
@@ -538,8 +522,7 @@ struct knUnprojectNormalComp : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(
-      int i, int j, int k, FlagGrid &flags, MACGrid &vel, Grid<Real> &phi, Real maxDist) const
+  inline void op(int i, int j, int k, FlagGrid &flags, MACGrid &vel, Grid<Real> &phi, Real maxDist)
   {
     // apply inside, within range near obstacle surface
     if (phi(i, j, k) > 0. || phi(i, j, k) < -maxDist)
@@ -573,37 +556,35 @@ struct knUnprojectNormalComp : public KernelBase {
     return maxDist;
   }
   typedef Real type3;
-  void runMessage()
-  {
-    debMsg("Executing kernel knUnprojectNormalComp ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, flags, vel, phi, maxDist);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, flags, vel, phi, maxDist);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, flags, vel, phi, maxDist);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, flags, vel, phi, maxDist);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   FlagGrid &flags;
   MACGrid &vel;
   Grid<Real> &phi;
@@ -712,7 +693,7 @@ struct knExtrapolateMACFromWeight : public KernelBase {
                  Grid<Vec3> &weight,
                  int distance,
                  const int d,
-                 const int c) const
+                 const int c)
   {
     static const Vec3i nb[6] = {Vec3i(1, 0, 0),
                                 Vec3i(-1, 0, 0),
@@ -766,37 +747,35 @@ struct knExtrapolateMACFromWeight : public KernelBase {
     return c;
   }
   typedef int type4;
-  void runMessage()
-  {
-    debMsg("Executing kernel knExtrapolateMACFromWeight ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, vel, weight, distance, d, c);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, vel, weight, distance, d, c);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, vel, weight, distance, d, c);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, vel, weight, distance, d, c);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   MACGrid &vel;
   Grid<Vec3> &weight;
   int distance;
@@ -1011,14 +990,8 @@ template<class S> struct knExtrapolateLsSimple : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i,
-                 int j,
-                 int k,
-                 Grid<S> &val,
-                 int distance,
-                 Grid<int> &tmp,
-                 const int d,
-                 S direction) const
+  inline void op(
+      int i, int j, int k, Grid<S> &val, int distance, Grid<int> &tmp, const int d, S direction)
   {
     const int dim = (val.is3D() ? 3 : 2);
     if (tmp(i, j, k) != 0)
@@ -1065,37 +1038,35 @@ template<class S> struct knExtrapolateLsSimple : public KernelBase {
     return direction;
   }
   typedef S type4;
-  void runMessage()
-  {
-    debMsg("Executing kernel knExtrapolateLsSimple ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, val, distance, tmp, d, direction);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, val, distance, tmp, d, direction);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, val, distance, tmp, d, direction);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, val, distance, tmp, d, direction);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   Grid<S> &val;
   int distance;
   Grid<int> &tmp;
@@ -1110,7 +1081,7 @@ template<class S> struct knSetRemaining : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, Grid<S> &phi, Grid<int> &tmp, S distance) const
+  inline void op(int i, int j, int k, Grid<S> &phi, Grid<int> &tmp, S distance)
   {
     if (tmp(i, j, k) != 0)
       return;
@@ -1131,37 +1102,35 @@ template<class S> struct knSetRemaining : public KernelBase {
     return distance;
   }
   typedef S type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel knSetRemaining ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, phi, tmp, distance);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, phi, tmp, distance);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, phi, tmp, distance);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, phi, tmp, distance);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   Grid<S> &phi;
   Grid<int> &tmp;
   S distance;
diff --git a/extern/mantaflow/preprocessed/fileio/iogrids.cpp b/extern/mantaflow/preprocessed/fileio/iogrids.cpp
index d138cd2925f..77e3e89ed26 100644
--- a/extern/mantaflow/preprocessed/fileio/iogrids.cpp
+++ b/extern/mantaflow/preprocessed/fileio/iogrids.cpp
@@ -1256,7 +1256,7 @@ struct knQuantize : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid<Real> &grid, Real step) const
+  inline void op(IndexInt idx, Grid<Real> &grid, Real step)
   {
     quantizeReal(grid(idx), step);
   }
@@ -1270,21 +1270,17 @@ struct knQuantize : public KernelBase {
     return step;
   }
   typedef Real type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knQuantize ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, grid, step);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, grid, step);
+    }
   }
   Grid<Real> &grid;
   Real step;
@@ -1331,7 +1327,7 @@ struct knQuantizeVec3 : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid<Vec3> &grid, Real step) const
+  inline void op(IndexInt idx, Grid<Vec3> &grid, Real step)
   {
     for (int c = 0; c < 3; ++c)
       quantizeReal(grid(idx)[c], step);
@@ -1346,21 +1342,17 @@ struct knQuantizeVec3 : public KernelBase {
     return step;
   }
   typedef Real type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knQuantizeVec3 ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, grid, step);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, grid, step);
+    }
   }
   Grid<Vec3> &grid;
   Real step;
diff --git a/extern/mantaflow/preprocessed/fileio/ioutil.cpp b/extern/mantaflow/preprocessed/fileio/ioutil.cpp
index 2bac34c2a65..409760b0a0f 100644
--- a/extern/mantaflow/preprocessed/fileio/ioutil.cpp
+++ b/extern/mantaflow/preprocessed/fileio/ioutil.cpp
@@ -66,7 +66,7 @@ void *safeGzopen(const char *filename, const char *mode)
 #endif  // NO_ZLIB != 1
 }
 
-#if defined(OPENVDB)
+#if OPENVDB == 1
 // Convert from OpenVDB value to Manta value.
 template<class S, class T> void convertFrom(S &in, T *out)
 {
diff --git a/extern/mantaflow/preprocessed/fileio/iovdb.cpp b/extern/mantaflow/preprocessed/fileio/iovdb.cpp
index 1846ef7ecbb..815bfa19aaa 100644
--- a/extern/mantaflow/preprocessed/fileio/iovdb.cpp
+++ b/extern/mantaflow/preprocessed/fileio/iovdb.cpp
@@ -29,10 +29,10 @@
 
 #if OPENVDB == 1
 #  include "openvdb/openvdb.h"
-#  include "openvdb/points/PointConversion.h"
-#  include "openvdb/points/PointCount.h"
-#  include "openvdb/tools/Clip.h"
-#  include "openvdb/tools/Dense.h"
+#  include <openvdb/points/PointConversion.h>
+#  include <openvdb/points/PointCount.h>
+#  include <openvdb/tools/Clip.h>
+#  include <openvdb/tools/Dense.h>
 #endif
 
 #define POSITION_NAME "P"
@@ -433,9 +433,9 @@ int writeObjectsVDB(const string &filename,
         debMsg("Writing int grid '" << mantaGrid->getName() << "' to vdb file " << filename, 1);
         Grid<int> *mantaIntGrid = (Grid<int> *)mantaGrid;
         if (clipGrid && mantaIntGrid->saveSparse()) {
-          assertMsg(clipGrid->getSize() == mantaGrid->getSize(),
-                    "writeObjectsVDB: Clip grid and exported grid must have the same size "
-                        << clipGrid->getSize() << " vs " << mantaGrid->getSize());
+          // assertMsg(clipGrid->getSize() == mantaGrid->getSize(), "writeObjectsVDB: Clip grid and
+          // exported grid must have the same size " << clipGrid->getSize() << " vs " <<
+          // mantaGrid->getSize());
         }
         vdbGrid = exportVDB<int, openvdb::Int32Grid>(mantaIntGrid, clip, vdbClipGrid);
         gridsVDB.push_back(vdbGrid);
@@ -448,9 +448,9 @@ int writeObjectsVDB(const string &filename,
         // Only supply clip grid if real grid is not equal to the clip grid
         openvdb::FloatGrid::Ptr tmpClipGrid = (mantaRealGrid == clipGrid) ? nullptr : vdbClipGrid;
         if (clipGrid && mantaRealGrid->saveSparse()) {
-          assertMsg(clipGrid->getSize() == mantaGrid->getSize(),
-                    "writeObjectsVDB: Clip grid and exported grid must have the same size "
-                        << clipGrid->getSize() << " vs " << mantaGrid->getSize());
+          // assertMsg(clipGrid->getSize() == mantaGrid->getSize(), "writeObjectsVDB: Clip grid and
+          // exported grid must have the same size " << clipGrid->getSize() << " vs " <<
+          // mantaGrid->getSize());
         }
         vdbGrid = exportVDB<Real, openvdb::FloatGrid>(mantaRealGrid, clip, tmpClipGrid);
         gridsVDB.push_back(vdbGrid);
@@ -461,9 +461,9 @@ int writeObjectsVDB(const string &filename,
                                                               openvdb::GRID_UNKNOWN;
         Grid<Vec3> *mantaVec3Grid = (Grid<Vec3> *)mantaGrid;
         if (clipGrid && mantaVec3Grid->saveSparse()) {
-          assertMsg(clipGrid->getSize() == mantaGrid->getSize(),
-                    "writeObjectsVDB: Clip grid and exported grid must have the same size "
-                        << clipGrid->getSize() << " vs " << mantaGrid->getSize());
+          // assertMsg(clipGrid->getSize() == mantaGrid->getSize(), "writeObjectsVDB: Clip grid and
+          // exported grid must have the same size " << clipGrid->getSize() << " vs " <<
+          // mantaGrid->getSize());
         }
         vdbGrid = exportVDB<Vec3, openvdb::Vec3SGrid>(mantaVec3Grid, clip, vdbClipGrid);
         gridsVDB.push_back(vdbGrid);
@@ -519,7 +519,7 @@ int writeObjectsVDB(const string &filename,
     }
   }
 
-  // Write only if there is at least one grid, optionally write with compression.
+  // Write only if there is at least one grid, optionally write with compression.
   if (gridsVDB.size()) {
     int vdb_flags = openvdb::io::COMPRESS_ACTIVE_MASK;
     switch (compression) {
@@ -534,8 +534,7 @@ int writeObjectsVDB(const string &filename,
       }
       case COMPRESSION_BLOSC: {
 #  if OPENVDB_BLOSC == 1
-        // Cannot use |= here, causes segfault with blosc 1.5.0 (== recommended version)
-        vdb_flags = openvdb::io::COMPRESS_BLOSC;
+        vdb_flags = openvdb::io::COMPRESS_BLOSC;  // not '|=': segfaults with blosc 1.5.0
 #  else
         debMsg("OpenVDB was built without Blosc support, using Zip compression instead", 1);
         vdb_flags |= openvdb::io::COMPRESS_ZIP;
@@ -696,36 +695,28 @@ int readObjectsVDB(const string &filename, std::vector<PbClass *> *objects, floa
 
       // Compare metadata with allocated grid setup. This prevents invalid index access.
       if (notZero(metaRes) && metaRes != origRes) {
-        debMsg("readObjectsVDB Warning: Grid '" << vdbGrid->getName()
-                                                << "' has not been read. Meta grid res " << metaRes
-                                                << " vs " << origRes << " current grid size",
-               1);
+        // debMsg("readObjectsVDB Warning: Grid '" << vdbGrid->getName() << "' has not been read.
+        // Meta grid res " << metaRes << " vs " << origRes << " current grid size", 1);
         readFailure++;
         break;
       }
       if (notZero(metaVoxelSize) && metaVoxelSize != voxelSize) {
-        debMsg("readObjectsVDB Warning: Grid '"
-                   << vdbGrid->getName() << "' has not been read. Meta voxel size "
-                   << metaVoxelSize << " vs " << voxelSize << " current voxel size",
-               1);
+        // debMsg("readObjectsVDB Warning: Grid '" << vdbGrid->getName() << "' has not been read.
+        // Meta voxel size " << metaVoxelSize << " vs " << voxelSize << " current voxel size", 1);
         readFailure++;
         break;
       }
       if (metaBBoxMax.x > origRes.x || metaBBoxMax.y > origRes.y || metaBBoxMax.z > origRes.z) {
-        debMsg("readObjectsVDB Warning: Grid '"
-                   << vdbGrid->getName() << "' has not been read. Vdb bbox max " << metaBBoxMax
-                   << " vs " << origRes << " current grid size",
-               1);
+        // debMsg("readObjectsVDB Warning: Grid '" << vdbGrid->getName() << "' has not been read.
+        // Vdb bbox max " << metaBBoxMax << " vs " << origRes << " current grid size", 1);
         readFailure++;
         break;
       }
       const Vec3i origOrigin(0);
       if (metaBBoxMin.x < origOrigin.x || metaBBoxMin.y < origOrigin.y ||
           metaBBoxMin.z < origOrigin.z) {
-        debMsg("readObjectsVDB Warning: Grid '"
-                   << vdbGrid->getName() << "' has not been read. Vdb bbox min " << metaBBoxMin
-                   << " vs " << origOrigin << " current grid origin",
-               1);
+        // debMsg("readObjectsVDB Warning: Grid '" << vdbGrid->getName() << "' has not been read.
+        // Vdb bbox min " << metaBBoxMin << " vs " << origOrigin << " current grid origin", 1);
         readFailure++;
         break;
       }
diff --git a/extern/mantaflow/preprocessed/fluidsolver.h b/extern/mantaflow/preprocessed/fluidsolver.h
index 6770f8b7b05..0c871bca3a1 100644
--- a/extern/mantaflow/preprocessed/fluidsolver.h
+++ b/extern/mantaflow/preprocessed/fluidsolver.h
@@ -384,7 +384,6 @@ class FluidSolver : public PbClass {
   GridStorage<Real> mGrids4dReal;
   GridStorage<Vec3> mGrids4dVec;
   GridStorage<Vec4> mGrids4dVec4;
-
  public:
   PbArgs _args;
 }
diff --git a/extern/mantaflow/preprocessed/general.h b/extern/mantaflow/preprocessed/general.h
index 8bf1c2e25de..7a840517cef 100644
--- a/extern/mantaflow/preprocessed/general.h
+++ b/extern/mantaflow/preprocessed/general.h
@@ -242,39 +242,6 @@ inline bool c_isnan(float c)
   return d != d;
 }
 
-//! Swap so that a<b
-template<class T> inline void sort(T &a, T &b)
-{
-  if (a > b)
-    std::swap(a, b);
-}
-
-//! Swap so that a<b<c
-template<class T> inline void sort(T &a, T &b, T &c)
-{
-  if (a > b)
-    std::swap(a, b);
-  if (a > c)
-    std::swap(a, c);
-  if (b > c)
-    std::swap(b, c);
-}
-
-//! Swap so that a<b<c<d
-template<class T> inline void sort(T &a, T &b, T &c, T &d)
-{
-  if (a > b)
-    std::swap(a, b);
-  if (c > d)
-    std::swap(c, d);
-  if (a > c)
-    std::swap(a, c);
-  if (b > d)
-    std::swap(b, d);
-  if (b > c)
-    std::swap(b, c);
-}
-
 }  // namespace Manta
 
 #endif
diff --git a/extern/mantaflow/preprocessed/gitinfo.h b/extern/mantaflow/preprocessed/gitinfo.h
index 6d367b764af..206d13bdf00 100644
--- a/extern/mantaflow/preprocessed/gitinfo.h
+++ b/extern/mantaflow/preprocessed/gitinfo.h
@@ -1,3 +1,3 @@
 
 
-#define MANTA_GIT_VERSION "commit d5d9a6c28daa8f21426d7a285f48639c0d8fd13f"
+#define MANTA_GIT_VERSION "commit 39b7a415721ecbf6643612a24e8eadd221aeb934"
+\ No newline at end of file
diff --git a/extern/mantaflow/preprocessed/grid.cpp b/extern/mantaflow/preprocessed/grid.cpp
index e0ea3830fae..cda3c5b213d 100644
--- a/extern/mantaflow/preprocessed/grid.cpp
+++ b/extern/mantaflow/preprocessed/grid.cpp
@@ -60,7 +60,8 @@ template<> inline GridBase::GridType typeList<Vec3>()
 }
 
 template<class T>
-Grid<T>::Grid(FluidSolver *parent, bool show, bool sparse) : GridBase(parent), mExternalData(false)
+Grid<T>::Grid(FluidSolver *parent, bool show, bool sparse, bool offload)
+    : GridBase(parent), mExternalData(false)
 {
   mType = typeList<T>();
   mSize = parent->getGridSize();
@@ -78,6 +79,34 @@ Grid<T>::Grid(FluidSolver *parent, bool show, bool sparse) : GridBase(parent), m
     debMsg("Cannot enable sparse save option without OpenVDB", 1);
   mSaveSparse = false;
 #endif
+
+#if OPENMP && OPENMP_OFFLOAD
+  if (offload) {
+    Grid<T> *grid = this;
+    printf("OMP Allocating %s (%d,%d,%d)\n",
+           this->getName().c_str(),
+           this->getSizeX(),
+           this->getSizeY(),
+           this->getSizeZ());
+    int size = this->getSizeX() * this->getSizeY() * this->getSizeZ();
+
+    printf("Size OMP is %d\n", size);
+#  pragma omp target enter data map(to \
+                                    : grid, \
+                                      grid->mSize, \
+                                      grid->mStrideZ, \
+                                      grid->mType, \
+                                      grid->mDx, \
+                                      grid->m3D, \
+                                      grid->mData [0:size])
+    //#pragma omp target enter data map(alloc: grid, grid->mData[0:size])
+  }
+  mOffload = offload;
+#else
+  if (offload)
+    debMsg("Cannot offload grid without OpenMP and OpenMP offloading support", 2);
+  mOffload = false;
+#endif
 }
 
 template<class T>
@@ -99,14 +128,119 @@ template<class T> Grid<T>::Grid(const Grid<T> &a) : GridBase(a.getParent()), mEx
 
 template<class T> Grid<T>::~Grid()
 {
+#if OPENMP && OPENMP_OFFLOAD
+  if (mOffload) {
+    this->mapDeleteOmp();
+  }
+#endif
   if (!mExternalData) {
     mParent->freeGridPointer<T>(mData);
   }
 }
 
-template<class T> void Grid<T>::clear()
+template<class T> struct knGridClear : public KernelBase {
+  knGridClear(Grid<T> &dest) : KernelBase(&dest, 0), dest(dest)
+  {
+    runMessage();
+    run();
+  }
+  inline void op(int i, int j, int k, Grid<T> &dest)
+  {
+    dest(i, j, k) = T(0.);
+  }
+  inline Grid<T> &getArg0()
+  {
+    return dest;
+  }
+  typedef Grid<T> type0;
+  void runMessage(){};
+  void run()
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {
+      Grid<T> &dest = getArg0();
+#pragma omp target teams distribute parallel for collapse(3) schedule(static, 1)
+      {
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, dest);
+      }
+    }
+    else {
+      const int k = 0;
+      Grid<T> &dest = getArg0();
+#pragma omp target teams distribute parallel for collapse(2) schedule(static, 1)
+      {
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, dest);
+      }
+    }
+  }
+  Grid<T> &dest;
+};
+template<class T> struct knGridClear2 : public KernelBase {
+  knGridClear2(Grid<T> &dest) : KernelBase(&dest, 0), dest(dest)
+  {
+    runMessage();
+    run();
+  }
+  inline void op(int i, int j, int k, Grid<T> &dest)
+  {
+    dest(i, j, k) = T(0.);
+  }
+  inline Grid<T> &getArg0()
+  {
+    return dest;
+  }
+  typedef Grid<T> type0;
+  void runMessage(){};
+  void run()
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, dest);
+      }
+    }
+    else {
+      const int k = 0;
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, dest);
+      }
+    }
+  }
+  Grid<T> &dest;
+};
+
+template<class T> void Grid<T>::clear(bool isOmp)
 {
+
+#if OPENMP && OPENMP_OFFLOAD
+  if (isOmp) {
+    knGridClear<T>(*this);
+  }
+  else {
+    knGridClear2<T>(*this);
+  }
+#else
   memset(mData, 0, sizeof(T) * mSize.x * mSize.y * mSize.z);
+#endif
 }
 
 template<class T> void Grid<T>::swap(Grid<T> &other)
@@ -201,29 +335,21 @@ struct CompMinReal : public KernelBase {
     return val;
   }
   typedef Grid<Real> type0;
-  void runMessage()
-  {
-    debMsg("Executing kernel CompMinReal ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, val, minVal);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  CompMinReal(CompMinReal &o, tbb::split)
-      : KernelBase(o), val(o.val), minVal(std::numeric_limits<Real>::max())
-  {
-  }
-  void join(const CompMinReal &o)
-  {
-    minVal = min(minVal, o.minVal);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      Real minVal = std::numeric_limits<Real>::max();
+#pragma omp for nowait
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, val, minVal);
+#pragma omp critical
+      {
+        this->minVal = min(minVal, this->minVal);
+      }
+    }
   }
   const Grid<Real> &val;
   Real minVal;
@@ -256,29 +382,21 @@ struct CompMaxReal : public KernelBase {
     return val;
   }
   typedef Grid<Real> type0;
-  void runMessage()
-  {
-    debMsg("Executing kernel CompMaxReal ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, val, maxVal);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  CompMaxReal(CompMaxReal &o, tbb::split)
-      : KernelBase(o), val(o.val), maxVal(-std::numeric_limits<Real>::max())
-  {
-  }
-  void join(const CompMaxReal &o)
-  {
-    maxVal = max(maxVal, o.maxVal);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      Real maxVal = -std::numeric_limits<Real>::max();
+#pragma omp for nowait
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, val, maxVal);
+#pragma omp critical
+      {
+        this->maxVal = max(maxVal, this->maxVal);
+      }
+    }
   }
   const Grid<Real> &val;
   Real maxVal;
@@ -311,29 +429,21 @@ struct CompMinInt : public KernelBase {
     return val;
   }
   typedef Grid<int> type0;
-  void runMessage()
-  {
-    debMsg("Executing kernel CompMinInt ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, val, minVal);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  CompMinInt(CompMinInt &o, tbb::split)
-      : KernelBase(o), val(o.val), minVal(std::numeric_limits<int>::max())
-  {
-  }
-  void join(const CompMinInt &o)
-  {
-    minVal = min(minVal, o.minVal);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      int minVal = std::numeric_limits<int>::max();
+#pragma omp for nowait
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, val, minVal);
+#pragma omp critical
+      {
+        this->minVal = min(minVal, this->minVal);
+      }
+    }
   }
   const Grid<int> &val;
   int minVal;
@@ -366,29 +476,21 @@ struct CompMaxInt : public KernelBase {
     return val;
   }
   typedef Grid<int> type0;
-  void runMessage()
-  {
-    debMsg("Executing kernel CompMaxInt ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, val, maxVal);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  CompMaxInt(CompMaxInt &o, tbb::split)
-      : KernelBase(o), val(o.val), maxVal(-std::numeric_limits<int>::max())
-  {
-  }
-  void join(const CompMaxInt &o)
-  {
-    maxVal = max(maxVal, o.maxVal);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      int maxVal = -std::numeric_limits<int>::max();
+#pragma omp for nowait
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, val, maxVal);
+#pragma omp critical
+      {
+        this->maxVal = max(maxVal, this->maxVal);
+      }
+    }
   }
   const Grid<int> &val;
   int maxVal;
@@ -422,29 +524,21 @@ struct CompMinVec : public KernelBase {
     return val;
   }
   typedef Grid<Vec3> type0;
-  void runMessage()
-  {
-    debMsg("Executing kernel CompMinVec ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, val, minVal);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  CompMinVec(CompMinVec &o, tbb::split)
-      : KernelBase(o), val(o.val), minVal(std::numeric_limits<Real>::max())
-  {
-  }
-  void join(const CompMinVec &o)
-  {
-    minVal = min(minVal, o.minVal);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      Real minVal = std::numeric_limits<Real>::max();
+#pragma omp for nowait
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, val, minVal);
+#pragma omp critical
+      {
+        this->minVal = min(minVal, this->minVal);
+      }
+    }
   }
   const Grid<Vec3> &val;
   Real minVal;
@@ -478,38 +572,143 @@ struct CompMaxVec : public KernelBase {
     return val;
   }
   typedef Grid<Vec3> type0;
-  void runMessage()
+  void runMessage(){};
+  void run()
+  {
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      Real maxVal = -std::numeric_limits<Real>::max();
+#pragma omp for nowait
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, val, maxVal);
+#pragma omp critical
+      {
+        this->maxVal = max(maxVal, this->maxVal);
+      }
+    }
+  }
+  const Grid<Vec3> &val;
+  Real maxVal;
+};
+
+template<class T> struct knGridCopyGrid : public KernelBase {
+  knGridCopyGrid(Grid<T> &dest, const Grid<T> &source)
+      : KernelBase(&dest, 0), dest(dest), source(source)
+  {
+    runMessage();
+    run();
+  }
+  inline void op(int i, int j, int k, Grid<T> &dest, const Grid<T> &source)
+  {
+    dest(i, j, k) = source(i, j, k);
+  }
+  inline Grid<T> &getArg0()
   {
-    debMsg("Executing kernel CompMaxVec ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
+    return dest;
+  }
+  typedef Grid<T> type0;
+  inline const Grid<T> &getArg1()
   {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, val, maxVal);
+    return source;
   }
+  typedef Grid<T> type1;
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {
+      Grid<T> &dest = getArg0();
+      const Grid<T> &source = getArg1();
+#pragma omp target teams distribute parallel for collapse(3) schedule(static, 1)
+      {
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, dest, source);
+      }
+    }
+    else {
+      const int k = 0;
+      Grid<T> &dest = getArg0();
+      const Grid<T> &source = getArg1();
+#pragma omp target teams distribute parallel for collapse(2) schedule(static, 1)
+      {
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, dest, source);
+      }
+    }
   }
-  CompMaxVec(CompMaxVec &o, tbb::split)
-      : KernelBase(o), val(o.val), maxVal(-std::numeric_limits<Real>::max())
+  Grid<T> &dest;
+  const Grid<T> &source;
+};
+template<class T> struct knGridCopyGrid2 : public KernelBase {
+  knGridCopyGrid2(Grid<T> &dest, const Grid<T> &source)
+      : KernelBase(&dest, 0), dest(dest), source(source)
   {
+    runMessage();
+    run();
   }
-  void join(const CompMaxVec &o)
+  inline void op(int i, int j, int k, Grid<T> &dest, const Grid<T> &source)
   {
-    maxVal = max(maxVal, o.maxVal);
+    dest(i, j, k) = source(i, j, k);
   }
-  const Grid<Vec3> &val;
-  Real maxVal;
+  inline Grid<T> &getArg0()
+  {
+    return dest;
+  }
+  typedef Grid<T> type0;
+  inline const Grid<T> &getArg1()
+  {
+    return source;
+  }
+  typedef Grid<T> type1;
+  void runMessage(){};
+  void run()
+  {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, dest, source);
+      }
+    }
+    else {
+      const int k = 0;
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, dest, source);
+      }
+    }
+  }
+  Grid<T> &dest;
+  const Grid<T> &source;
 };
 
-template<class T> Grid<T> &Grid<T>::copyFrom(const Grid<T> &a, bool copyType)
+template<class T> Grid<T> &Grid<T>::copyFrom(const Grid<T> &a, bool copyType, bool isOmp)
 {
   assertMsg(a.mSize == mSize, "different grid resolutions " << a.mSize << " vs " << this->mSize);
+#if OPENMP && OPENMP_OFFLOAD
+  if (isOmp)
+    knGridCopyGrid<T>(*this, a);
+  else
+    knGridCopyGrid2<T>(*this, a);
+#else
   memcpy(mData, a.mData, sizeof(T) * mSize.x * mSize.y * mSize.z);
+#endif
   if (copyType)
     mType = a.mType;  // copy type marker
   return *this;
@@ -524,7 +723,7 @@ template<class T> struct knGridSetConstReal : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid<T> &me, T val) const
+  inline void op(IndexInt idx, Grid<T> &me, T val)
   {
     me[idx] = val;
   }
@@ -538,21 +737,17 @@ template<class T> struct knGridSetConstReal : public KernelBase {
     return val;
   }
   typedef T type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knGridSetConstReal ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, val);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, val);
+    }
   }
   Grid<T> &me;
   T val;
@@ -563,7 +758,7 @@ template<class T> struct knGridAddConstReal : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid<T> &me, T val) const
+  inline void op(IndexInt idx, Grid<T> &me, T val)
   {
     me[idx] += val;
   }
@@ -577,21 +772,17 @@ template<class T> struct knGridAddConstReal : public KernelBase {
     return val;
   }
   typedef T type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knGridAddConstReal ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, val);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, val);
+    }
   }
   Grid<T> &me;
   T val;
@@ -602,7 +793,7 @@ template<class T> struct knGridMultConst : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid<T> &me, T val) const
+  inline void op(IndexInt idx, Grid<T> &me, T val)
   {
     me[idx] *= val;
   }
@@ -616,21 +807,17 @@ template<class T> struct knGridMultConst : public KernelBase {
     return val;
   }
   typedef T type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knGridMultConst ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, val);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, val);
+    }
   }
   Grid<T> &me;
   T val;
@@ -642,7 +829,7 @@ template<class T> struct knGridSafeDiv : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid<T> &me, const Grid<T> &other) const
+  inline void op(IndexInt idx, Grid<T> &me, const Grid<T> &other)
   {
     me[idx] = safeDivide(me[idx], other[idx]);
   }
@@ -656,21 +843,17 @@ template<class T> struct knGridSafeDiv : public KernelBase {
     return other;
   }
   typedef Grid<T> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knGridSafeDiv ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, other);
+    }
   }
   Grid<T> &me;
   const Grid<T> &other;
@@ -685,7 +868,7 @@ template<class T> struct knGridClamp : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid<T> &me, const T &min, const T &max) const
+  inline void op(IndexInt idx, Grid<T> &me, const T &min, const T &max)
   {
     me[idx] = clamp(me[idx], min, max);
   }
@@ -704,21 +887,17 @@ template<class T> struct knGridClamp : public KernelBase {
     return max;
   }
   typedef T type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel knGridClamp ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, min, max);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, min, max);
+    }
   }
   Grid<T> &me;
   const T &min;
@@ -745,7 +924,7 @@ template<class T> struct knGridStomp : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid<T> &me, const T &threshold) const
+  inline void op(IndexInt idx, Grid<T> &me, const T &threshold)
   {
     stomp(me[idx], threshold);
   }
@@ -759,21 +938,17 @@ template<class T> struct knGridStomp : public KernelBase {
     return threshold;
   }
   typedef T type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knGridStomp ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, threshold);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, threshold);
+    }
   }
   Grid<T> &me;
   const T &threshold;
@@ -787,7 +962,7 @@ template<class T> struct knPermuteAxes : public KernelBase {
     run();
   }
   inline void op(
-      int i, int j, int k, Grid<T> &self, Grid<T> &target, int axis0, int axis1, int axis2) const
+      int i, int j, int k, Grid<T> &self, Grid<T> &target, int axis0, int axis1, int axis2)
   {
     int i0 = axis0 == 0 ? i : (axis0 == 1 ? j : k);
     int i1 = axis1 == 0 ? i : (axis1 == 1 ? j : k);
@@ -819,37 +994,35 @@ template<class T> struct knPermuteAxes : public KernelBase {
     return axis2;
   }
   typedef int type4;
-  void runMessage()
-  {
-    debMsg("Executing kernel knPermuteAxes ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, self, target, axis0, axis1, axis2);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, self, target, axis0, axis1, axis2);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, self, target, axis0, axis1, axis2);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, self, target, axis0, axis1, axis2);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   Grid<T> &self;
   Grid<T> &target;
   int axis0;
@@ -864,7 +1037,7 @@ struct knJoinVec : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid<Vec3> &a, const Grid<Vec3> &b, bool keepMax) const
+  inline void op(IndexInt idx, Grid<Vec3> &a, const Grid<Vec3> &b, bool keepMax)
   {
     Real a1 = normSquare(a[idx]);
     Real b1 = normSquare(b[idx]);
@@ -885,21 +1058,17 @@ struct knJoinVec : public KernelBase {
     return keepMax;
   }
   typedef bool type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel knJoinVec ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, a, b, keepMax);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, a, b, keepMax);
+    }
   }
   Grid<Vec3> &a;
   const Grid<Vec3> &b;
@@ -912,7 +1081,7 @@ struct knJoinInt : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid<int> &a, const Grid<int> &b, bool keepMax) const
+  inline void op(IndexInt idx, Grid<int> &a, const Grid<int> &b, bool keepMax)
   {
     a[idx] = (keepMax) ? max(a[idx], b[idx]) : min(a[idx], b[idx]);
   }
@@ -931,21 +1100,17 @@ struct knJoinInt : public KernelBase {
     return keepMax;
   }
   typedef bool type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel knJoinInt ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, a, b, keepMax);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, a, b, keepMax);
+    }
   }
   Grid<int> &a;
   const Grid<int> &b;
@@ -958,7 +1123,7 @@ struct knJoinReal : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid<Real> &a, const Grid<Real> &b, bool keepMax) const
+  inline void op(IndexInt idx, Grid<Real> &a, const Grid<Real> &b, bool keepMax)
   {
     a[idx] = (keepMax) ? max(a[idx], b[idx]) : min(a[idx], b[idx]);
   }
@@ -977,21 +1142,17 @@ struct knJoinReal : public KernelBase {
     return keepMax;
   }
   typedef bool type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel knJoinReal ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, a, b, keepMax);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, a, b, keepMax);
+    }
   }
   Grid<Real> &a;
   const Grid<Real> &b;
@@ -1218,45 +1379,43 @@ struct knCountCells : public KernelBase {
     return mask;
   }
   typedef Grid<Real> type3;
-  void runMessage()
-  {
-    debMsg("Executing kernel knCountCells ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, flags, flag, bnd, mask, cnt);
+
+#pragma omp parallel
+      {
+        int cnt = 0;
+#pragma omp for nowait
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, flags, flag, bnd, mask, cnt);
+#pragma omp critical
+        {
+          this->cnt += cnt;
+        }
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, flags, flag, bnd, mask, cnt);
+#pragma omp parallel
+      {
+        int cnt = 0;
+#pragma omp for nowait
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, flags, flag, bnd, mask, cnt);
+#pragma omp critical
+        {
+          this->cnt += cnt;
+        }
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_reduce(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
-  knCountCells(knCountCells &o, tbb::split)
-      : KernelBase(o), flags(o.flags), flag(o.flag), bnd(o.bnd), mask(o.mask), cnt(0)
-  {
-  }
-  void join(const knCountCells &o)
-  {
-    cnt += o.cnt;
-  }
   const FlagGrid &flags;
   int flag;
   int bnd;
@@ -1408,7 +1567,7 @@ struct knCopyMacToVec3 : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, MACGrid &source, Grid<Vec3> &target) const
+  inline void op(int i, int j, int k, MACGrid &source, Grid<Vec3> &target)
   {
     target(i, j, k) = source(i, j, k);
   }
@@ -1422,37 +1581,35 @@ struct knCopyMacToVec3 : public KernelBase {
     return target;
   }
   typedef Grid<Vec3> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knCopyMacToVec3 ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, source, target);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, source, target);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, source, target);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, source, target);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   MACGrid &source;
   Grid<Vec3> &target;
 };
@@ -1539,7 +1696,7 @@ struct knResampleVec3ToMac : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, Grid<Vec3> &source, MACGrid &target) const
+  inline void op(int i, int j, int k, Grid<Vec3> &source, MACGrid &target)
   {
     target(i, j, k)[0] = 0.5 * (source(i - 1, j, k)[0] + source(i, j, k))[0];
     target(i, j, k)[1] = 0.5 * (source(i, j - 1, k)[1] + source(i, j, k))[1];
@@ -1557,37 +1714,35 @@ struct knResampleVec3ToMac : public KernelBase {
     return target;
   }
   typedef MACGrid type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knResampleVec3ToMac ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, source, target);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, source, target);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, source, target);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, source, target);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   Grid<Vec3> &source;
   MACGrid &target;
 };
@@ -1636,7 +1791,7 @@ struct knResampleMacToVec3 : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, MACGrid &source, Grid<Vec3> &target) const
+  inline void op(int i, int j, int k, MACGrid &source, Grid<Vec3> &target)
   {
     target(i, j, k) = source.getCentered(i, j, k);
   }
@@ -1650,37 +1805,35 @@ struct knResampleMacToVec3 : public KernelBase {
     return target;
   }
   typedef Grid<Vec3> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knResampleMacToVec3 ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, source, target);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, source, target);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, source, target);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, source, target);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   MACGrid &source;
   Grid<Vec3> &target;
 };
@@ -1729,7 +1882,7 @@ struct knCopyLevelsetToReal : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, LevelsetGrid &source, Grid<Real> &target) const
+  inline void op(int i, int j, int k, LevelsetGrid &source, Grid<Real> &target)
   {
     target(i, j, k) = source(i, j, k);
   }
@@ -1743,37 +1896,35 @@ struct knCopyLevelsetToReal : public KernelBase {
     return target;
   }
   typedef Grid<Real> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knCopyLevelsetToReal ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, source, target);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, source, target);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, source, target);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, source, target);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   LevelsetGrid &source;
   Grid<Real> &target;
 };
@@ -1834,7 +1985,7 @@ struct knCopyVec3ToReal : public KernelBase {
                  Grid<Vec3> &source,
                  Grid<Real> &targetX,
                  Grid<Real> &targetY,
-                 Grid<Real> &targetZ) const
+                 Grid<Real> &targetZ)
   {
     targetX(i, j, k) = source(i, j, k).x;
     targetY(i, j, k) = source(i, j, k).y;
@@ -1860,37 +2011,35 @@ struct knCopyVec3ToReal : public KernelBase {
     return targetZ;
   }
   typedef Grid<Real> type3;
-  void runMessage()
-  {
-    debMsg("Executing kernel knCopyVec3ToReal ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, source, targetX, targetY, targetZ);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, source, targetX, targetY, targetZ);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, source, targetX, targetY, targetZ);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, source, targetX, targetY, targetZ);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   Grid<Vec3> &source;
   Grid<Real> &targetX;
   Grid<Real> &targetY;
@@ -1958,7 +2107,7 @@ struct knCopyRealToVec3 : public KernelBase {
                  Grid<Real> &sourceX,
                  Grid<Real> &sourceY,
                  Grid<Real> &sourceZ,
-                 Grid<Vec3> &target) const
+                 Grid<Vec3> &target)
   {
     target(i, j, k).x = sourceX(i, j, k);
     target(i, j, k).y = sourceY(i, j, k);
@@ -1984,37 +2133,35 @@ struct knCopyRealToVec3 : public KernelBase {
     return target;
   }
   typedef Grid<Vec3> type3;
-  void runMessage()
-  {
-    debMsg("Executing kernel knCopyRealToVec3 ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, sourceX, sourceY, sourceZ, target);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, sourceX, sourceY, sourceZ, target);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, sourceX, sourceY, sourceZ, target);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, sourceX, sourceY, sourceZ, target);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   Grid<Real> &sourceX;
   Grid<Real> &sourceY;
   Grid<Real> &sourceZ;
@@ -2228,7 +2375,7 @@ struct knResetUvGrid : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, Grid<Vec3> &target, const Vec3 *offset) const
+  inline void op(int i, int j, int k, Grid<Vec3> &target, const Vec3 *offset)
   {
     Vec3 coord = Vec3((Real)i, (Real)j, (Real)k);
     if (offset)
@@ -2245,37 +2392,35 @@ struct knResetUvGrid : public KernelBase {
     return offset;
   }
   typedef Vec3 type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knResetUvGrid ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, target, offset);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, target, offset);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, target, offset);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, target, offset);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   Grid<Vec3> &target;
   const Vec3 *offset;
 };
@@ -2392,7 +2537,7 @@ template<class T> struct knSetBoundary : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, Grid<T> &grid, T value, int w) const
+  inline void op(int i, int j, int k, Grid<T> &grid, T value, int w)
   {
     bool bnd = (i <= w || i >= grid.getSizeX() - 1 - w || j <= w || j >= grid.getSizeY() - 1 - w ||
                 (grid.is3D() && (k <= w || k >= grid.getSizeZ() - 1 - w)));
@@ -2414,37 +2559,35 @@ template<class T> struct knSetBoundary : public KernelBase {
     return w;
   }
   typedef int type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel knSetBoundary ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, grid, value, w);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, grid, value, w);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, grid, value, w);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, grid, value, w);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   Grid<T> &grid;
   T value;
   int w;
@@ -2461,7 +2604,7 @@ template<class T> struct knSetBoundaryNeumann : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, Grid<T> &grid, int w) const
+  inline void op(int i, int j, int k, Grid<T> &grid, int w)
   {
     bool set = false;
     int si = i, sj = j, sk = k;
@@ -2504,37 +2647,35 @@ template<class T> struct knSetBoundaryNeumann : public KernelBase {
     return w;
   }
   typedef int type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knSetBoundaryNeumann ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, grid, w);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, grid, w);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, grid, w);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, grid, w);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   Grid<T> &grid;
   int w;
 };
@@ -2552,7 +2693,7 @@ struct knSetBoundaryMAC : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, Grid<Vec3> &grid, Vec3 value, int w) const
+  inline void op(int i, int j, int k, Grid<Vec3> &grid, Vec3 value, int w)
   {
     if (i <= w || i >= grid.getSizeX() - w || j <= w - 1 || j >= grid.getSizeY() - 1 - w ||
         (grid.is3D() && (k <= w - 1 || k >= grid.getSizeZ() - 1 - w)))
@@ -2579,37 +2720,35 @@ struct knSetBoundaryMAC : public KernelBase {
     return w;
   }
   typedef int type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel knSetBoundaryMAC ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, grid, value, w);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, grid, value, w);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, grid, value, w);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, grid, value, w);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   Grid<Vec3> &grid;
   Vec3 value;
   int w;
@@ -2623,7 +2762,7 @@ struct knSetBoundaryMACNorm : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, Grid<Vec3> &grid, Vec3 value, int w) const
+  inline void op(int i, int j, int k, Grid<Vec3> &grid, Vec3 value, int w)
   {
     if (i <= w || i >= grid.getSizeX() - w)
       grid(i, j, k).x = value.x;
@@ -2647,37 +2786,35 @@ struct knSetBoundaryMACNorm : public KernelBase {
     return w;
   }
   typedef int type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel knSetBoundaryMACNorm ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, grid, value, w);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, grid, value, w);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, grid, value, w);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, grid, value, w);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   Grid<Vec3> &grid;
   Vec3 value;
   int w;
@@ -2730,29 +2867,21 @@ struct knGridTotalSum : public KernelBase {
     return flags;
   }
   typedef FlagGrid type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knGridTotalSum ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, a, flags, result);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  knGridTotalSum(knGridTotalSum &o, tbb::split)
-      : KernelBase(o), a(o.a), flags(o.flags), result(0.0)
-  {
-  }
-  void join(const knGridTotalSum &o)
-  {
-    result += o.result;
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      double result = 0.0;
+#pragma omp for nowait
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, a, flags, result);
+#pragma omp critical
+      {
+        this->result += result;
+      }
+    }
   }
   const Grid<Real> &a;
   FlagGrid *flags;
@@ -2783,28 +2912,21 @@ struct knCountFluidCells : public KernelBase {
     return flags;
   }
   typedef FlagGrid type0;
-  void runMessage()
-  {
-    debMsg("Executing kernel knCountFluidCells ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, flags, numEmpty);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  knCountFluidCells(knCountFluidCells &o, tbb::split) : KernelBase(o), flags(o.flags), numEmpty(0)
-  {
-  }
-  void join(const knCountFluidCells &o)
-  {
-    numEmpty += o.numEmpty;
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      int numEmpty = 0;
+#pragma omp for nowait
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, flags, numEmpty);
+#pragma omp critical
+      {
+        this->numEmpty += numEmpty;
+      }
+    }
   }
   FlagGrid &flags;
   int numEmpty;
@@ -2870,7 +2992,7 @@ struct knGetComponent : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, const Grid<Vec3> &source, Grid<Real> &target, int component) const
+  inline void op(IndexInt idx, const Grid<Vec3> &source, Grid<Real> &target, int component)
   {
     target[idx] = source[idx][component];
   }
@@ -2889,21 +3011,17 @@ struct knGetComponent : public KernelBase {
     return component;
   }
   typedef int type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel knGetComponent ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, source, target, component);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, source, target, component);
+    }
   }
   const Grid<Vec3> &source;
   Grid<Real> &target;
@@ -2953,7 +3071,7 @@ struct knSetComponent : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, const Grid<Real> &source, Grid<Vec3> &target, int component) const
+  inline void op(IndexInt idx, const Grid<Real> &source, Grid<Vec3> &target, int component)
   {
     target[idx][component] = source[idx];
   }
@@ -2972,21 +3090,17 @@ struct knSetComponent : public KernelBase {
     return component;
   }
   typedef int type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel knSetComponent ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, source, target, component);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, source, target, component);
+    }
   }
   const Grid<Real> &source;
   Grid<Vec3> &target;
@@ -3327,7 +3441,7 @@ struct knMarkIsolatedFluidCell : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, FlagGrid &flags, const int mark) const
+  inline void op(IndexInt idx, FlagGrid &flags, const int mark)
   {
     if (isIsolatedFluidCell(idx, flags))
       flags[idx] = mark;
@@ -3342,21 +3456,17 @@ struct knMarkIsolatedFluidCell : public KernelBase {
     return mark;
   }
   typedef int type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knMarkIsolatedFluidCell ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, flags, mark);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, flags, mark);
+    }
   }
   FlagGrid &flags;
   const int mark;
@@ -3454,6 +3564,156 @@ void PbRegister_copyMACData()
 }
 }
 
+//! Map this grid to the OpenMP target device: `target enter data map(to: ...)` on data + header.
+template<class T> void Grid<T>::mapToOmp()
+{
+#if OPENMP && OPENMP_OFFLOAD
+  Grid<T> *grid = this;
+  // Count cells in IndexInt: the product of three ints can overflow int on large grids.
+  const IndexInt size = (IndexInt)this->getSizeX() * this->getSizeY() * this->getSizeZ();
+#  pragma omp target enter data map(to \
+                                    : grid, \
+                                      grid->mData [0:size], \
+                                      grid->mSize, \
+                                      grid->mStrideZ, \
+                                      grid->mType, \
+                                      grid->mDx, \
+                                      grid->m3D)
+#else
+  // Built without device offloading (needs both OPENMP and OPENMP_OFFLOAD):
+  // nothing to map, so this call is a no-op.
+#endif
+}
+
+//! Create device mappings for this grid with `target enter data map(alloc: ...)`.
+template<class T> void Grid<T>::mapAllocOmp()
+{
+#if OPENMP && OPENMP_OFFLOAD
+  Grid<T> *grid = this;
+  // Count cells in IndexInt: the product of three ints can overflow int on large grids.
+  const IndexInt size = (IndexInt)this->getSizeX() * this->getSizeY() * this->getSizeZ();
+#  pragma omp target enter data map(alloc \
+                                    : grid, \
+                                      grid->mData [0:size], \
+                                      grid->mSize, \
+                                      grid->mStrideZ, \
+                                      grid->mType, \
+                                      grid->mDx, \
+                                      grid->m3D)
+#else
+  // Built without device offloading (needs both OPENMP and OPENMP_OFFLOAD):
+  // nothing to allocate, so this call is a no-op.
+#endif
+}
+
+//! End the device mapping of this grid with `target exit data map(from: ...)` (device -> host).
+template<class T> void Grid<T>::mapFromOmp()
+{
+#if OPENMP && OPENMP_OFFLOAD
+  Grid<T> *grid = this;
+  // Count cells in IndexInt: the product of three ints can overflow int on large grids.
+  const IndexInt size = (IndexInt)grid->getSizeX() * grid->getSizeY() * grid->getSizeZ();
+#  pragma omp target exit data map(from \
+                                   : grid, \
+                                     grid->mData [0:size], \
+                                     grid->mSize, \
+                                     grid->mStrideZ, \
+                                     grid->mType, \
+                                     grid->mDx, \
+                                     grid->m3D)
+#else
+  // Built without device offloading (needs both OPENMP and OPENMP_OFFLOAD):
+  // nothing was mapped, so this call is a no-op.
+#endif
+}
+
+//! Drop the device mapping of this grid with `target exit data map(delete: ...)`.
+template<class T> void Grid<T>::mapDeleteOmp()
+{
+#if OPENMP && OPENMP_OFFLOAD
+  Grid<T> *grid = this;
+  // Count cells in IndexInt: the product of three ints can overflow int on large grids.
+  const IndexInt size = (IndexInt)grid->getSizeX() * grid->getSizeY() * grid->getSizeZ();
+#  pragma omp target exit data map(delete \
+                                   : grid, \
+                                     grid->mData [0:size], \
+                                     grid->mSize, \
+                                     grid->mStrideZ, \
+                                     grid->mType, \
+                                     grid->mDx, \
+                                     grid->m3D)
+#else
+  // Built without device offloading (needs both OPENMP and OPENMP_OFFLOAD):
+  // nothing was mapped, so this call is a no-op.
+#endif
+}
+
+//! Refresh the host copy of this grid from the device with `target update from(...)`.
+//! Only the cell data and the listed header fields are updated, not the `grid` pointer itself.
+template<class T> void Grid<T>::updateFromOmp()
+{
+#if OPENMP && OPENMP_OFFLOAD
+  Grid<T> *grid = this;
+  // Count cells in IndexInt: the product of three ints can overflow int on large grids.
+  const IndexInt size = (IndexInt)grid->getSizeX() * grid->getSizeY() * grid->getSizeZ();
+
+#  pragma omp target update from( \
+      grid->mData [0:size], grid->mSize, grid->mStrideZ, grid->mType, grid->mDx, grid->m3D)
+#else
+  // Built without device offloading (needs both OPENMP and OPENMP_OFFLOAD):
+  // there is no device copy to read back, so this call is a no-op.
+#endif
+}
+
+//! Refresh the device copy of this grid from the host with `target update to(...)`.
+//! Only the cell data and the listed header fields are updated, not the `grid` pointer itself.
+template<class T> void Grid<T>::updateToOmp()
+{
+#if OPENMP && OPENMP_OFFLOAD
+  Grid<T> *grid = this;
+  // Count cells in IndexInt: the product of three ints can overflow int on large grids.
+  const IndexInt size = (IndexInt)grid->getSizeX() * grid->getSizeY() * grid->getSizeZ();
+
+#  pragma omp target update to( \
+      grid->mData [0:size], grid->mSize, grid->mStrideZ, grid->mType, grid->mDx, grid->m3D)
+#else
+  // Built without device offloading (needs both OPENMP and OPENMP_OFFLOAD):
+  // there is no device copy to refresh, so this call is a no-op.
+#endif
+}
+
+// template<class T> void Grid<T>::mapDeclare() {
+// #if OPENMP && OPENMP_OFFLOAD
+// 	if (this->getType() & GridBase::TypeReal) {
+// 		printf("Map declare for Real\n");
+// 		Grid<Real> *grid = (Grid<Real> *)this;
+// 		#pragma omp declare mapper(Grid<Real> grid) map(to: \
+// 			grid.mData[0:grid.getSizeX()*grid.getSizeY()*grid.getSizeZ()], \
+// 			grid.mSize, grid.mStrideZ, grid.mType, grid.mDx, grid.m3D)
+// 		#pragma omp target enter data map(to: grid)
+// 	}
+// 	else if (this->getType() & GridBase::TypeInt) {
+// 		printf("Map declare for Int\n");
+// 		Grid<int> *grid = (Grid<int> *)this;
+// 		#pragma omp declare mapper(Grid<int> grid) map(to: \
+// 			grid.mData[0:grid.getSizeX()*grid.getSizeY()*grid.getSizeZ()], \
+// 			grid.mSize, grid.mStrideZ, grid.mType, grid.mDx, grid.m3D)
+// 		#pragma omp target enter data map(to: grid)
+// 	}
+// 	else if (this->getType() & GridBase::TypeVec3) {
+// 		printf("Map declare for Vec3\n");
+// 		Grid<Vec3> *grid = (Grid<Vec3> *)this;
+// 		#pragma omp declare mapper(Grid<Vec3> grid) map(to: \
+// 			grid.mData[0:grid.getSizeX()*grid.getSizeY()*grid.getSizeZ()], \
+// 			grid.mSize, grid.mStrideZ, grid.mType, grid.mDx, grid.m3D)
+// 		#pragma omp target enter data map(to: grid)
+// 	}
+
+// #else
+// 	//std::cout << "Error! Device offloading not enabled. Compile with -DOPENMP=ON and
+// -DOPENMP_OFFLOAD=ON" << std::endl; #endif
+// }
+
 // explicit instantiation
 template class Grid<int>;
 template class Grid<Real>;
diff --git a/extern/mantaflow/preprocessed/grid.h b/extern/mantaflow/preprocessed/grid.h
index 2c4296e78dd..3f0bcb9dfae 100644
--- a/extern/mantaflow/preprocessed/grid.h
+++ b/extern/mantaflow/preprocessed/grid.h
@@ -383,13 +383,15 @@ class GridBase : public PbClass {
     }
   }
 
- protected:
+  // TODO (sebbas): Moved attributes to public for now
   GridType mType;
   Vec3i mSize;
   Real mDx;
-  bool m3D;  // precomputed Z shift: to ensure 2D compatibility, always use this instead of sx*sy !
+  bool m3D;
+  // precomputed Z shift: to ensure 2D compatibility, always use this instead of sx*sy !
   IndexInt mStrideZ;
 
+ protected:
  public:
   PbArgs _args;
 }
@@ -401,7 +403,7 @@ class GridBase : public PbClass {
 template<class T> class Grid : public GridBase {
  public:
   //! init new grid, values are set to zero
-  Grid(FluidSolver *parent, bool show = true, bool sparse = false);
+  Grid(FluidSolver *parent, bool show = true, bool sparse = false, bool offload = false);
   static int _W_10(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
   {
     PbClass *obj = Pb::objFromPy(_self);
@@ -416,7 +418,8 @@ template<class T> class Grid : public GridBase {
         FluidSolver *parent = _args.getPtr<FluidSolver>("parent", 0, &_lock);
         bool show = _args.getOpt<bool>("show", 1, true, &_lock);
         bool sparse = _args.getOpt<bool>("sparse", 2, false, &_lock);
-        obj = new Grid(parent, show, sparse);
+        bool offload = _args.getOpt<bool>("offload", 3, false, &_lock);
+        obj = new Grid(parent, show, sparse, offload);
         obj->registerObject(_self, &_args);
         _args.check();
       }
@@ -490,7 +493,7 @@ template<class T> class Grid : public GridBase {
   }
 
   //! set all cells to zero
-  void clear();
+  void clear(bool isOmp = false);
   static PyObject *_W_13(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
   {
     try {
@@ -501,9 +504,10 @@ template<class T> class Grid : public GridBase {
       PyObject *_retval = nullptr;
       {
         ArgLocker _lock;
+        bool isOmp = _args.getOpt<bool>("isOmp", 0, false, &_lock);
         pbo->_args.copy(_args);
         _retval = getPyNone();
-        pbo->clear();
+        pbo->clear(isOmp);
         pbo->_args.check();
       }
       pbFinalizePlugin(pbo->getParent(), "Grid::clear", !noTiming);
@@ -591,6 +595,11 @@ template<class T> class Grid : public GridBase {
   {
     return mSaveSparse;
   }
+  //! query if this grid shall also be allocated on an OpenMP offload device (returns mOffload)
+  inline bool isOffload()
+  {
+    return mOffload;
+  }
 
   //! set data
   inline void set(int i, int j, int k, T &val)
@@ -616,9 +625,8 @@ template<class T> class Grid : public GridBase {
         return interpol<T>(mData, mSize, mStrideZ, pos);
       case 2:
         return interpolCubic<T>(mData, mSize, mStrideZ, pos);
-      default:
-        assertMsg(false, "Unknown interpolation order " << order);
     }
+    // default: assertMsg(false, "Unknown interpolation order "<<order); }
     return T(0.);  // should never be reached, just to prevent compiler warnings
   }
 
@@ -628,7 +636,7 @@ template<class T> class Grid : public GridBase {
   //! content...
   // Grid<T>& operator=(const Grid<T>& a);
   //! copy content from other grid (use this one instead of operator= !)
-  Grid<T> &copyFrom(const Grid<T> &a, bool copyType = true);
+  Grid<T> &copyFrom(const Grid<T> &a, bool copyType = true, bool isOmp = false);
   static PyObject *_W_14(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
   {
     try {
@@ -641,8 +649,9 @@ template<class T> class Grid : public GridBase {
         ArgLocker _lock;
         const Grid<T> &a = *_args.getPtr<Grid<T>>("a", 0, &_lock);
         bool copyType = _args.getOpt<bool>("copyType", 1, true, &_lock);
+        bool isOmp = _args.getOpt<bool>("isOmp", 2, false, &_lock);
         pbo->_args.copy(_args);
-        _retval = toPy(pbo->copyFrom(a, copyType));
+        _retval = toPy(pbo->copyFrom(a, copyType, isOmp));
         pbo->_args.check();
       }
       pbFinalizePlugin(pbo->getParent(), "Grid::copyFrom", !noTiming);
@@ -1276,6 +1285,157 @@ template<class T> class Grid : public GridBase {
     }
   }
 
+  //! OpenMP data mapping from / to target device
+  void mapToOmp();
+  static PyObject *_W_38(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+  {
+    try {
+      PbArgs _args(_linargs, _kwds);
+      Grid *pbo = dynamic_cast<Grid *>(Pb::objFromPy(_self));
+      bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
+      pbPreparePlugin(pbo->getParent(), "Grid::mapToOmp", !noTiming);
+      PyObject *_retval = nullptr;
+      {
+        ArgLocker _lock;
+        pbo->_args.copy(_args);
+        _retval = getPyNone();
+        pbo->mapToOmp();
+        pbo->_args.check();
+      }
+      pbFinalizePlugin(pbo->getParent(), "Grid::mapToOmp", !noTiming);
+      return _retval;
+    }
+    catch (std::exception &e) {
+      pbSetError("Grid::mapToOmp", e.what());
+      return 0;
+    }
+  }
+
+  void mapFromOmp();
+  static PyObject *_W_39(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+  {
+    try {
+      PbArgs _args(_linargs, _kwds);
+      Grid *pbo = dynamic_cast<Grid *>(Pb::objFromPy(_self));
+      bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
+      pbPreparePlugin(pbo->getParent(), "Grid::mapFromOmp", !noTiming);
+      PyObject *_retval = nullptr;
+      {
+        ArgLocker _lock;
+        pbo->_args.copy(_args);
+        _retval = getPyNone();
+        pbo->mapFromOmp();
+        pbo->_args.check();
+      }
+      pbFinalizePlugin(pbo->getParent(), "Grid::mapFromOmp", !noTiming);
+      return _retval;
+    }
+    catch (std::exception &e) {
+      pbSetError("Grid::mapFromOmp", e.what());
+      return 0;
+    }
+  }
+
+  void mapAllocOmp();
+  static PyObject *_W_40(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+  {
+    try {
+      PbArgs _args(_linargs, _kwds);
+      Grid *pbo = dynamic_cast<Grid *>(Pb::objFromPy(_self));
+      bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
+      pbPreparePlugin(pbo->getParent(), "Grid::mapAllocOmp", !noTiming);
+      PyObject *_retval = nullptr;
+      {
+        ArgLocker _lock;
+        pbo->_args.copy(_args);
+        _retval = getPyNone();
+        pbo->mapAllocOmp();
+        pbo->_args.check();
+      }
+      pbFinalizePlugin(pbo->getParent(), "Grid::mapAllocOmp", !noTiming);
+      return _retval;
+    }
+    catch (std::exception &e) {
+      pbSetError("Grid::mapAllocOmp", e.what());
+      return 0;
+    }
+  }
+
+  void mapDeleteOmp();
+  static PyObject *_W_41(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+  {
+    try {
+      PbArgs _args(_linargs, _kwds);
+      Grid *pbo = dynamic_cast<Grid *>(Pb::objFromPy(_self));
+      bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
+      pbPreparePlugin(pbo->getParent(), "Grid::mapDeleteOmp", !noTiming);
+      PyObject *_retval = nullptr;
+      {
+        ArgLocker _lock;
+        pbo->_args.copy(_args);
+        _retval = getPyNone();
+        pbo->mapDeleteOmp();
+        pbo->_args.check();
+      }
+      pbFinalizePlugin(pbo->getParent(), "Grid::mapDeleteOmp", !noTiming);
+      return _retval;
+    }
+    catch (std::exception &e) {
+      pbSetError("Grid::mapDeleteOmp", e.what());
+      return 0;
+    }
+  }
+
+  void updateToOmp();
+  static PyObject *_W_42(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+  {
+    try {
+      PbArgs _args(_linargs, _kwds);
+      Grid *pbo = dynamic_cast<Grid *>(Pb::objFromPy(_self));
+      bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
+      pbPreparePlugin(pbo->getParent(), "Grid::updateToOmp", !noTiming);
+      PyObject *_retval = nullptr;
+      {
+        ArgLocker _lock;
+        pbo->_args.copy(_args);
+        _retval = getPyNone();
+        pbo->updateToOmp();
+        pbo->_args.check();
+      }
+      pbFinalizePlugin(pbo->getParent(), "Grid::updateToOmp", !noTiming);
+      return _retval;
+    }
+    catch (std::exception &e) {
+      pbSetError("Grid::updateToOmp", e.what());
+      return 0;
+    }
+  }
+
+  void updateFromOmp();
+  static PyObject *_W_43(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+  {
+    try {
+      PbArgs _args(_linargs, _kwds);
+      Grid *pbo = dynamic_cast<Grid *>(Pb::objFromPy(_self));
+      bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
+      pbPreparePlugin(pbo->getParent(), "Grid::updateFromOmp", !noTiming);
+      PyObject *_retval = nullptr;
+      {
+        ArgLocker _lock;
+        pbo->_args.copy(_args);
+        _retval = getPyNone();
+        pbo->updateFromOmp();
+        pbo->_args.check();
+      }
+      pbFinalizePlugin(pbo->getParent(), "Grid::updateFromOmp", !noTiming);
+      return _retval;
+    }
+    catch (std::exception &e) {
+      pbSetError("Grid::updateFromOmp", e.what());
+      return 0;
+    }
+  }
+
   // c++ only operators
   template<class S> Grid<T> &operator+=(const Grid<S> &a);
   template<class S> Grid<T> &operator+=(const S &a);
@@ -1299,10 +1459,12 @@ template<class T> class Grid : public GridBase {
     return mData[index(i, j, k)];
   }
 
- protected:
+  // TODO (sebbas): Moved attributes to public for now
   T *mData;
+ protected:
   bool mExternalData;  // True if mData is managed outside of the Fluidsolver
   bool mSaveSparse;    // True if this grid may be cached in a sparse structure
+  bool mOffload;       // True if this grid shall be allocated on an OpenMP offload device too
  public:
   PbArgs _args;
 }
@@ -1314,12 +1476,12 @@ template<class T> class Grid : public GridBase {
 //! Special function for staggered grids
 class MACGrid : public Grid<Vec3> {
  public:
-  MACGrid(FluidSolver *parent, bool show = true, bool sparse = false)
-      : Grid<Vec3>(parent, show, sparse)
+  MACGrid(FluidSolver *parent, bool show = true, bool sparse = false, bool offload = false)
+      : Grid<Vec3>(parent, show, sparse, offload)
   {
     mType = (GridType)(TypeMAC | TypeVec3);
   }
-  static int _W_38(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+  static int _W_44(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
   {
     PbClass *obj = Pb::objFromPy(_self);
     if (obj)
@@ -1333,7 +1495,8 @@ class MACGrid : public Grid<Vec3> {
         FluidSolver *parent = _args.getPtr<FluidSolver>("parent", 0, &_lock);
         bool show = _args.getOpt<bool>("show", 1, true, &_lock);
         bool sparse = _args.getOpt<bool>("sparse", 2, false, &_lock);
-        obj = new MACGrid(parent, show, sparse);
+        bool offload = _args.getOpt<bool>("offload", 3, false, &_lock);
+        obj = new MACGrid(parent, show, sparse, offload);
         obj->registerObject(_self, &_args);
         _args.check();
       }
@@ -1392,17 +1555,16 @@ class MACGrid : public Grid<Vec3> {
       case 1:
         return interpolComponent<comp>(mData, mSize, mStrideZ, pos);
       case 2:
-        return interpolCubicMAC(mData, mSize, mStrideZ, pos)[comp];  // warning - not yet optimized
-      default:
-        assertMsg(false, "Unknown interpolation order " << order);
-    }
+        return interpolCubicMAC(mData, mSize, mStrideZ, pos)[comp];  // warning - not yet optimized
+    }
+    // default: assertMsg(false, "Unknown interpolation order "<<order); }
     return 0.;  // should never be reached, just to prevent compiler warnings
   }
 
   //! set all boundary cells of a MAC grid to certain value (Dirchlet). Respects staggered grid
   //! locations optionally, only set normal components
   void setBoundMAC(Vec3 value, int boundaryWidth, bool normalOnly = false);
-  static PyObject *_W_39(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+  static PyObject *_W_45(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
   {
     try {
       PbArgs _args(_linargs, _kwds);
@@ -1439,12 +1601,16 @@ class MACGrid : public Grid<Vec3> {
 //! Special functions for FlagGrid
 class FlagGrid : public Grid<int> {
  public:
-  FlagGrid(FluidSolver *parent, int dim = 3, bool show = true, bool sparse = false)
-      : Grid<int>(parent, show, sparse)
+  FlagGrid(FluidSolver *parent,
+           int dim = 3,
+           bool show = true,
+           bool sparse = false,
+           bool offload = false)
+      : Grid<int>(parent, show, sparse, offload)
   {
     mType = (GridType)(TypeFlags | TypeInt);
   }
-  static int _W_40(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+  static int _W_46(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
   {
     PbClass *obj = Pb::objFromPy(_self);
     if (obj)
@@ -1459,7 +1625,8 @@ class FlagGrid : public Grid<int> {
         int dim = _args.getOpt<int>("dim", 1, 3, &_lock);
         bool show = _args.getOpt<bool>("show", 2, true, &_lock);
         bool sparse = _args.getOpt<bool>("sparse", 3, false, &_lock);
-        obj = new FlagGrid(parent, dim, show, sparse);
+        bool offload = _args.getOpt<bool>("offload", 4, false, &_lock);
+        obj = new FlagGrid(parent, dim, show, sparse, offload);
         obj->registerObject(_self, &_args);
         _args.check();
       }
@@ -1625,7 +1792,7 @@ class FlagGrid : public Grid<int> {
                   const std::string &inflow = "      ",
                   const std::string &outflow = "      ",
                   Grid<Real> *phiWalls = 0x00);
-  static PyObject *_W_41(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+  static PyObject *_W_47(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
   {
     try {
       PbArgs _args(_linargs, _kwds);
@@ -1659,7 +1826,7 @@ class FlagGrid : public Grid<int> {
 
   //! set fluid flags inside levelset (liquids)
   void updateFromLevelset(LevelsetGrid &levelset);
-  static PyObject *_W_42(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+  static PyObject *_W_48(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
   {
     try {
       PbArgs _args(_linargs, _kwds);
@@ -1686,7 +1853,7 @@ class FlagGrid : public Grid<int> {
 
   //! set all cells (except obs/in/outflow) to type (fluid by default)
   void fillGrid(int type = TypeFluid);
-  static PyObject *_W_43(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+  static PyObject *_W_49(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
   {
     try {
       PbArgs _args(_linargs, _kwds);
@@ -1715,7 +1882,7 @@ class FlagGrid : public Grid<int> {
   //! warning for large grids! only regular int returned (due to python interface)
   //! optionally creates mask in RealGrid (1 where flag matches, 0 otherwise)
   int countCells(int flag, int bnd = 0, Grid<Real> *mask = nullptr);
-  static PyObject *_W_44(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+  static PyObject *_W_50(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
   {
     try {
       PbArgs _args(_linargs, _kwds);
@@ -1868,7 +2035,7 @@ template<class T, class S> struct gridAdd : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid<T> &me, const Grid<S> &other) const
+  inline void op(IndexInt idx, Grid<T> &me, const Grid<S> &other)
   {
     me[idx] += other[idx];
   }
@@ -1882,21 +2049,17 @@ template<class T, class S> struct gridAdd : public KernelBase {
     return other;
   }
   typedef Grid<S> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel gridAdd ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, other);
+    }
   }
   Grid<T> &me;
   const Grid<S> &other;
@@ -1907,7 +2070,7 @@ template<class T, class S> struct gridSub : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid<T> &me, const Grid<S> &other) const
+  inline void op(IndexInt idx, Grid<T> &me, const Grid<S> &other)
   {
     me[idx] -= other[idx];
   }
@@ -1921,21 +2084,17 @@ template<class T, class S> struct gridSub : public KernelBase {
     return other;
   }
   typedef Grid<S> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel gridSub ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, other);
+    }
   }
   Grid<T> &me;
   const Grid<S> &other;
@@ -1946,7 +2105,7 @@ template<class T, class S> struct gridMult : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid<T> &me, const Grid<S> &other) const
+  inline void op(IndexInt idx, Grid<T> &me, const Grid<S> &other)
   {
     me[idx] *= other[idx];
   }
@@ -1960,21 +2119,17 @@ template<class T, class S> struct gridMult : public KernelBase {
     return other;
   }
   typedef Grid<S> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel gridMult ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, other);
+    }
   }
   Grid<T> &me;
   const Grid<S> &other;
@@ -1985,7 +2140,7 @@ template<class T, class S> struct gridDiv : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid<T> &me, const Grid<S> &other) const
+  inline void op(IndexInt idx, Grid<T> &me, const Grid<S> &other)
   {
     me[idx] /= other[idx];
   }
@@ -1999,21 +2154,17 @@ template<class T, class S> struct gridDiv : public KernelBase {
     return other;
   }
   typedef Grid<S> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel gridDiv ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, other);
+    }
   }
   Grid<T> &me;
   const Grid<S> &other;
@@ -2024,7 +2175,7 @@ template<class T, class S> struct gridAddScalar : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid<T> &me, const S &other) const
+  inline void op(IndexInt idx, Grid<T> &me, const S &other)
   {
     me[idx] += other;
   }
@@ -2038,21 +2189,17 @@ template<class T, class S> struct gridAddScalar : public KernelBase {
     return other;
   }
   typedef S type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel gridAddScalar ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, other);
+    }
   }
   Grid<T> &me;
   const S &other;
@@ -2063,7 +2210,7 @@ template<class T, class S> struct gridMultScalar : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid<T> &me, const S &other) const
+  inline void op(IndexInt idx, Grid<T> &me, const S &other)
   {
     me[idx] *= other;
   }
@@ -2077,25 +2224,22 @@ template<class T, class S> struct gridMultScalar : public KernelBase {
     return other;
   }
   typedef S type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel gridMultScalar ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, other);
+    }
   }
   Grid<T> &me;
   const S &other;
 };
+
 template<class T, class S> struct gridScaledAdd : public KernelBase {
   gridScaledAdd(Grid<T> &me, const Grid<T> &other, const S &factor)
       : KernelBase(&me, 0), me(me), other(other), factor(factor)
@@ -2103,8 +2247,9 @@ template<class T, class S> struct gridScaledAdd : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid<T> &me, const Grid<T> &other, const S &factor) const
+  inline void op(int i, int j, int k, Grid<T> &me, const Grid<T> &other, const S &factor)
   {
+    const IndexInt idx = me.index(i, j, k);
     me[idx] += factor * other[idx];
   }
   inline Grid<T> &getArg0()
@@ -2122,21 +2267,35 @@ template<class T, class S> struct gridScaledAdd : public KernelBase {
     return factor;
   }
   typedef S type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel gridScaledAdd ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other, factor);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const int _maxX = maxX;
+    const int _maxY = maxY;
+    if (maxZ > 1) {
+      Grid<T> &me = getArg0();
+      const Grid<T> &other = getArg1();
+      const S &factor = getArg2();
+#pragma omp target teams distribute parallel for collapse(3) schedule(static, 1)
+      {
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, me, other, factor);
+      }
+    }
+    else {
+      const int k = 0;
+      Grid<T> &me = getArg0();
+      const Grid<T> &other = getArg1();
+      const S &factor = getArg2();
+#pragma omp target teams distribute parallel for collapse(2) schedule(static, 1)
+      {
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, me, other, factor);
+      }
+    }
   }
   Grid<T> &me;
   const Grid<T> &other;
@@ -2149,7 +2308,7 @@ template<class T> struct gridSetConst : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid<T> &grid, T value) const
+  inline void op(IndexInt idx, Grid<T> &grid, T value)
   {
     grid[idx] = value;
   }
@@ -2163,21 +2322,17 @@ template<class T> struct gridSetConst : public KernelBase {
     return value;
   }
   typedef T type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel gridSetConst ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, grid, value);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, grid, value);
+    }
   }
   Grid<T> &grid;
   T value;
@@ -2279,7 +2434,7 @@ template<class S> struct knInterpolateGridTempl : public KernelBase {
                  const Grid<S> &source,
                  const Vec3 &sourceFactor,
                  Vec3 offset,
-                 int orderSpace = 1) const
+                 int orderSpace = 1)
   {
     Vec3 pos = Vec3(i, j, k) * sourceFactor + offset;
     if (!source.is3D())
@@ -2311,37 +2466,35 @@ template<class S> struct knInterpolateGridTempl : public KernelBase {
     return orderSpace;
   }
   typedef int type4;
-  void runMessage()
-  {
-    debMsg("Executing kernel knInterpolateGridTempl ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, target, source, sourceFactor, offset, orderSpace);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, target, source, sourceFactor, offset, orderSpace);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, target, source, sourceFactor, offset, orderSpace);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, target, source, sourceFactor, offset, orderSpace);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   Grid<S> &target;
   const Grid<S> &source;
   const Vec3 &sourceFactor;
diff --git a/extern/mantaflow/preprocessed/grid.h.reg.cpp b/extern/mantaflow/preprocessed/grid.h.reg.cpp
index 2c95c002dad..1f3a80903a4 100644
--- a/extern/mantaflow/preprocessed/grid.h.reg.cpp
+++ b/extern/mantaflow/preprocessed/grid.h.reg.cpp
@@ -8,11 +8,11 @@ namespace Manta {
 #ifdef _C_FlagGrid
 static const Pb::Register _R_26("FlagGrid", "FlagGrid", "Grid<int>");
 template<> const char *Namify<FlagGrid>::S = "FlagGrid";
-static const Pb::Register _R_27("FlagGrid", "FlagGrid", FlagGrid::_W_40);
-static const Pb::Register _R_28("FlagGrid", "initDomain", FlagGrid::_W_41);
-static const Pb::Register _R_29("FlagGrid", "updateFromLevelset", FlagGrid::_W_42);
-static const Pb::Register _R_30("FlagGrid", "fillGrid", FlagGrid::_W_43);
-static const Pb::Register _R_31("FlagGrid", "countCells", FlagGrid::_W_44);
+static const Pb::Register _R_27("FlagGrid", "FlagGrid", FlagGrid::_W_46);
+static const Pb::Register _R_28("FlagGrid", "initDomain", FlagGrid::_W_47);
+static const Pb::Register _R_29("FlagGrid", "updateFromLevelset", FlagGrid::_W_48);
+static const Pb::Register _R_30("FlagGrid", "fillGrid", FlagGrid::_W_49);
+static const Pb::Register _R_31("FlagGrid", "countCells", FlagGrid::_W_50);
 #endif
 #ifdef _C_Grid
 static const Pb::Register _R_32("Grid<int>", "Grid<int>", "GridBase");
@@ -45,86 +45,104 @@ static const Pb::Register _R_57("Grid<int>", "setBound", Grid<int>::_W_34);
 static const Pb::Register _R_58("Grid<int>", "setBoundNeumann", Grid<int>::_W_35);
 static const Pb::Register _R_59("Grid<int>", "getDataPointer", Grid<int>::_W_36);
 static const Pb::Register _R_60("Grid<int>", "printGrid", Grid<int>::_W_37);
-static const Pb::Register _R_61("Grid<Real>", "Grid<Real>", "GridBase");
+static const Pb::Register _R_61("Grid<int>", "mapToOmp", Grid<int>::_W_38);
+static const Pb::Register _R_62("Grid<int>", "mapFromOmp", Grid<int>::_W_39);
+static const Pb::Register _R_63("Grid<int>", "mapAllocOmp", Grid<int>::_W_40);
+static const Pb::Register _R_64("Grid<int>", "mapDeleteOmp", Grid<int>::_W_41);
+static const Pb::Register _R_65("Grid<int>", "updateToOmp", Grid<int>::_W_42);
+static const Pb::Register _R_66("Grid<int>", "updateFromOmp", Grid<int>::_W_43);
+static const Pb::Register _R_67("Grid<Real>", "Grid<Real>", "GridBase");
 template<> const char *Namify<Grid<Real>>::S = "Grid<Real>";
-static const Pb::Register _R_62("Grid<Real>", "Grid", Grid<Real>::_W_10);
-static const Pb::Register _R_63("Grid<Real>", "save", Grid<Real>::_W_11);
-static const Pb::Register _R_64("Grid<Real>", "load", Grid<Real>::_W_12);
-static const Pb::Register _R_65("Grid<Real>", "clear", Grid<Real>::_W_13);
-static const Pb::Register _R_66("Grid<Real>", "copyFrom", Grid<Real>::_W_14);
-static const Pb::Register _R_67("Grid<Real>", "getGridType", Grid<Real>::_W_15);
-static const Pb::Register _R_68("Grid<Real>", "add", Grid<Real>::_W_16);
-static const Pb::Register _R_69("Grid<Real>", "sub", Grid<Real>::_W_17);
-static const Pb::Register _R_70("Grid<Real>", "setConst", Grid<Real>::_W_18);
-static const Pb::Register _R_71("Grid<Real>", "addConst", Grid<Real>::_W_19);
-static const Pb::Register _R_72("Grid<Real>", "addScaled", Grid<Real>::_W_20);
-static const Pb::Register _R_73("Grid<Real>", "mult", Grid<Real>::_W_21);
-static const Pb::Register _R_74("Grid<Real>", "multConst", Grid<Real>::_W_22);
-static const Pb::Register _R_75("Grid<Real>", "safeDivide", Grid<Real>::_W_23);
-static const Pb::Register _R_76("Grid<Real>", "clamp", Grid<Real>::_W_24);
-static const Pb::Register _R_77("Grid<Real>", "stomp", Grid<Real>::_W_25);
-static const Pb::Register _R_78("Grid<Real>", "permuteAxes", Grid<Real>::_W_26);
-static const Pb::Register _R_79("Grid<Real>", "permuteAxesCopyToGrid", Grid<Real>::_W_27);
-static const Pb::Register _R_80("Grid<Real>", "join", Grid<Real>::_W_28);
-static const Pb::Register _R_81("Grid<Real>", "getMaxAbs", Grid<Real>::_W_29);
-static const Pb::Register _R_82("Grid<Real>", "getMax", Grid<Real>::_W_30);
-static const Pb::Register _R_83("Grid<Real>", "getMin", Grid<Real>::_W_31);
-static const Pb::Register _R_84("Grid<Real>", "getL1", Grid<Real>::_W_32);
-static const Pb::Register _R_85("Grid<Real>", "getL2", Grid<Real>::_W_33);
-static const Pb::Register _R_86("Grid<Real>", "setBound", Grid<Real>::_W_34);
-static const Pb::Register _R_87("Grid<Real>", "setBoundNeumann", Grid<Real>::_W_35);
-static const Pb::Register _R_88("Grid<Real>", "getDataPointer", Grid<Real>::_W_36);
-static const Pb::Register _R_89("Grid<Real>", "printGrid", Grid<Real>::_W_37);
-static const Pb::Register _R_90("Grid<Vec3>", "Grid<Vec3>", "GridBase");
+static const Pb::Register _R_68("Grid<Real>", "Grid", Grid<Real>::_W_10);
+static const Pb::Register _R_69("Grid<Real>", "save", Grid<Real>::_W_11);
+static const Pb::Register _R_70("Grid<Real>", "load", Grid<Real>::_W_12);
+static const Pb::Register _R_71("Grid<Real>", "clear", Grid<Real>::_W_13);
+static const Pb::Register _R_72("Grid<Real>", "copyFrom", Grid<Real>::_W_14);
+static const Pb::Register _R_73("Grid<Real>", "getGridType", Grid<Real>::_W_15);
+static const Pb::Register _R_74("Grid<Real>", "add", Grid<Real>::_W_16);
+static const Pb::Register _R_75("Grid<Real>", "sub", Grid<Real>::_W_17);
+static const Pb::Register _R_76("Grid<Real>", "setConst", Grid<Real>::_W_18);
+static const Pb::Register _R_77("Grid<Real>", "addConst", Grid<Real>::_W_19);
+static const Pb::Register _R_78("Grid<Real>", "addScaled", Grid<Real>::_W_20);
+static const Pb::Register _R_79("Grid<Real>", "mult", Grid<Real>::_W_21);
+static const Pb::Register _R_80("Grid<Real>", "multConst", Grid<Real>::_W_22);
+static const Pb::Register _R_81("Grid<Real>", "safeDivide", Grid<Real>::_W_23);
+static const Pb::Register _R_82("Grid<Real>", "clamp", Grid<Real>::_W_24);
+static const Pb::Register _R_83("Grid<Real>", "stomp", Grid<Real>::_W_25);
+static const Pb::Register _R_84("Grid<Real>", "permuteAxes", Grid<Real>::_W_26);
+static const Pb::Register _R_85("Grid<Real>", "permuteAxesCopyToGrid", Grid<Real>::_W_27);
+static const Pb::Register _R_86("Grid<Real>", "join", Grid<Real>::_W_28);
+static const Pb::Register _R_87("Grid<Real>", "getMaxAbs", Grid<Real>::_W_29);
+static const Pb::Register _R_88("Grid<Real>", "getMax", Grid<Real>::_W_30);
+static const Pb::Register _R_89("Grid<Real>", "getMin", Grid<Real>::_W_31);
+static const Pb::Register _R_90("Grid<Real>", "getL1", Grid<Real>::_W_32);
+static const Pb::Register _R_91("Grid<Real>", "getL2", Grid<Real>::_W_33);
+static const Pb::Register _R_92("Grid<Real>", "setBound", Grid<Real>::_W_34);
+static const Pb::Register _R_93("Grid<Real>", "setBoundNeumann", Grid<Real>::_W_35);
+static const Pb::Register _R_94("Grid<Real>", "getDataPointer", Grid<Real>::_W_36);
+static const Pb::Register _R_95("Grid<Real>", "printGrid", Grid<Real>::_W_37);
+static const Pb::Register _R_96("Grid<Real>", "mapToOmp", Grid<Real>::_W_38);
+static const Pb::Register _R_97("Grid<Real>", "mapFromOmp", Grid<Real>::_W_39);
+static const Pb::Register _R_98("Grid<Real>", "mapAllocOmp", Grid<Real>::_W_40);
+static const Pb::Register _R_99("Grid<Real>", "mapDeleteOmp", Grid<Real>::_W_41);
+static const Pb::Register _R_100("Grid<Real>", "updateToOmp", Grid<Real>::_W_42);
+static const Pb::Register _R_101("Grid<Real>", "updateFromOmp", Grid<Real>::_W_43);
+static const Pb::Register _R_102("Grid<Vec3>", "Grid<Vec3>", "GridBase");
 template<> const char *Namify<Grid<Vec3>>::S = "Grid<Vec3>";
-static const Pb::Register _R_91("Grid<Vec3>", "Grid", Grid<Vec3>::_W_10);
-static const Pb::Register _R_92("Grid<Vec3>", "save", Grid<Vec3>::_W_11);
-static const Pb::Register _R_93("Grid<Vec3>", "load", Grid<Vec3>::_W_12);
-static const Pb::Register _R_94("Grid<Vec3>", "clear", Grid<Vec3>::_W_13);
-static const Pb::Register _R_95("Grid<Vec3>", "copyFrom", Grid<Vec3>::_W_14);
-static const Pb::Register _R_96("Grid<Vec3>", "getGridType", Grid<Vec3>::_W_15);
-static const Pb::Register _R_97("Grid<Vec3>", "add", Grid<Vec3>::_W_16);
-static const Pb::Register _R_98("Grid<Vec3>", "sub", Grid<Vec3>::_W_17);
-static const Pb::Register _R_99("Grid<Vec3>", "setConst", Grid<Vec3>::_W_18);
-static const Pb::Register _R_100("Grid<Vec3>", "addConst", Grid<Vec3>::_W_19);
-static const Pb::Register _R_101("Grid<Vec3>", "addScaled", Grid<Vec3>::_W_20);
-static const Pb::Register _R_102("Grid<Vec3>", "mult", Grid<Vec3>::_W_21);
-static const Pb::Register _R_103("Grid<Vec3>", "multConst", Grid<Vec3>::_W_22);
-static const Pb::Register _R_104("Grid<Vec3>", "safeDivide", Grid<Vec3>::_W_23);
-static const Pb::Register _R_105("Grid<Vec3>", "clamp", Grid<Vec3>::_W_24);
-static const Pb::Register _R_106("Grid<Vec3>", "stomp", Grid<Vec3>::_W_25);
-static const Pb::Register _R_107("Grid<Vec3>", "permuteAxes", Grid<Vec3>::_W_26);
-static const Pb::Register _R_108("Grid<Vec3>", "permuteAxesCopyToGrid", Grid<Vec3>::_W_27);
-static const Pb::Register _R_109("Grid<Vec3>", "join", Grid<Vec3>::_W_28);
-static const Pb::Register _R_110("Grid<Vec3>", "getMaxAbs", Grid<Vec3>::_W_29);
-static const Pb::Register _R_111("Grid<Vec3>", "getMax", Grid<Vec3>::_W_30);
-static const Pb::Register _R_112("Grid<Vec3>", "getMin", Grid<Vec3>::_W_31);
-static const Pb::Register _R_113("Grid<Vec3>", "getL1", Grid<Vec3>::_W_32);
-static const Pb::Register _R_114("Grid<Vec3>", "getL2", Grid<Vec3>::_W_33);
-static const Pb::Register _R_115("Grid<Vec3>", "setBound", Grid<Vec3>::_W_34);
-static const Pb::Register _R_116("Grid<Vec3>", "setBoundNeumann", Grid<Vec3>::_W_35);
-static const Pb::Register _R_117("Grid<Vec3>", "getDataPointer", Grid<Vec3>::_W_36);
-static const Pb::Register _R_118("Grid<Vec3>", "printGrid", Grid<Vec3>::_W_37);
+static const Pb::Register _R_103("Grid<Vec3>", "Grid", Grid<Vec3>::_W_10);
+static const Pb::Register _R_104("Grid<Vec3>", "save", Grid<Vec3>::_W_11);
+static const Pb::Register _R_105("Grid<Vec3>", "load", Grid<Vec3>::_W_12);
+static const Pb::Register _R_106("Grid<Vec3>", "clear", Grid<Vec3>::_W_13);
+static const Pb::Register _R_107("Grid<Vec3>", "copyFrom", Grid<Vec3>::_W_14);
+static const Pb::Register _R_108("Grid<Vec3>", "getGridType", Grid<Vec3>::_W_15);
+static const Pb::Register _R_109("Grid<Vec3>", "add", Grid<Vec3>::_W_16);
+static const Pb::Register _R_110("Grid<Vec3>", "sub", Grid<Vec3>::_W_17);
+static const Pb::Register _R_111("Grid<Vec3>", "setConst", Grid<Vec3>::_W_18);
+static const Pb::Register _R_112("Grid<Vec3>", "addConst", Grid<Vec3>::_W_19);
+static const Pb::Register _R_113("Grid<Vec3>", "addScaled", Grid<Vec3>::_W_20);
+static const Pb::Register _R_114("Grid<Vec3>", "mult", Grid<Vec3>::_W_21);
+static const Pb::Register _R_115("Grid<Vec3>", "multConst", Grid<Vec3>::_W_22);
+static const Pb::Register _R_116("Grid<Vec3>", "safeDivide", Grid<Vec3>::_W_23);
+static const Pb::Register _R_117("Grid<Vec3>", "clamp", Grid<Vec3>::_W_24);
+static const Pb::Register _R_118("Grid<Vec3>", "stomp", Grid<Vec3>::_W_25);
+static const Pb::Register _R_119("Grid<Vec3>", "permuteAxes", Grid<Vec3>::_W_26);
+static const Pb::Register _R_120("Grid<Vec3>", "permuteAxesCopyToGrid", Grid<Vec3>::_W_27);
+static const Pb::Register _R_121("Grid<Vec3>", "join", Grid<Vec3>::_W_28);
+static const Pb::Register _R_122("Grid<Vec3>", "getMaxAbs", Grid<Vec3>::_W_29);
+static const Pb::Register _R_123("Grid<Vec3>", "getMax", Grid<Vec3>::_W_30);
+static const Pb::Register _R_124("Grid<Vec3>", "getMin", Grid<Vec3>::_W_31);
+static const Pb::Register _R_125("Grid<Vec3>", "getL1", Grid<Vec3>::_W_32);
+static const Pb::Register _R_126("Grid<Vec3>", "getL2", Grid<Vec3>::_W_33);
+static const Pb::Register _R_127("Grid<Vec3>", "setBound", Grid<Vec3>::_W_34);
+static const Pb::Register _R_128("Grid<Vec3>", "setBoundNeumann", Grid<Vec3>::_W_35);
+static const Pb::Register _R_129("Grid<Vec3>", "getDataPointer", Grid<Vec3>::_W_36);
+static const Pb::Register _R_130("Grid<Vec3>", "printGrid", Grid<Vec3>::_W_37);
+static const Pb::Register _R_131("Grid<Vec3>", "mapToOmp", Grid<Vec3>::_W_38);
+static const Pb::Register _R_132("Grid<Vec3>", "mapFromOmp", Grid<Vec3>::_W_39);
+static const Pb::Register _R_133("Grid<Vec3>", "mapAllocOmp", Grid<Vec3>::_W_40);
+static const Pb::Register _R_134("Grid<Vec3>", "mapDeleteOmp", Grid<Vec3>::_W_41);
+static const Pb::Register _R_135("Grid<Vec3>", "updateToOmp", Grid<Vec3>::_W_42);
+static const Pb::Register _R_136("Grid<Vec3>", "updateFromOmp", Grid<Vec3>::_W_43);
 #endif
 #ifdef _C_GridBase
-static const Pb::Register _R_119("GridBase", "GridBase", "PbClass");
+static const Pb::Register _R_137("GridBase", "GridBase", "PbClass");
 template<> const char *Namify<GridBase>::S = "GridBase";
-static const Pb::Register _R_120("GridBase", "GridBase", GridBase::_W_0);
-static const Pb::Register _R_121("GridBase", "getSizeX", GridBase::_W_1);
-static const Pb::Register _R_122("GridBase", "getSizeY", GridBase::_W_2);
-static const Pb::Register _R_123("GridBase", "getSizeZ", GridBase::_W_3);
-static const Pb::Register _R_124("GridBase", "getSize", GridBase::_W_4);
-static const Pb::Register _R_125("GridBase", "is3D", GridBase::_W_5);
-static const Pb::Register _R_126("GridBase", "is4D", GridBase::_W_6);
-static const Pb::Register _R_127("GridBase", "getSizeT", GridBase::_W_7);
-static const Pb::Register _R_128("GridBase", "getStrideT", GridBase::_W_8);
-static const Pb::Register _R_129("GridBase", "setName", GridBase::_W_9);
+static const Pb::Register _R_138("GridBase", "GridBase", GridBase::_W_0);
+static const Pb::Register _R_139("GridBase", "getSizeX", GridBase::_W_1);
+static const Pb::Register _R_140("GridBase", "getSizeY", GridBase::_W_2);
+static const Pb::Register _R_141("GridBase", "getSizeZ", GridBase::_W_3);
+static const Pb::Register _R_142("GridBase", "getSize", GridBase::_W_4);
+static const Pb::Register _R_143("GridBase", "is3D", GridBase::_W_5);
+static const Pb::Register _R_144("GridBase", "is4D", GridBase::_W_6);
+static const Pb::Register _R_145("GridBase", "getSizeT", GridBase::_W_7);
+static const Pb::Register _R_146("GridBase", "getStrideT", GridBase::_W_8);
+static const Pb::Register _R_147("GridBase", "setName", GridBase::_W_9);
 #endif
 #ifdef _C_MACGrid
-static const Pb::Register _R_130("MACGrid", "MACGrid", "Grid<Vec3>");
+static const Pb::Register _R_148("MACGrid", "MACGrid", "Grid<Vec3>");
 template<> const char *Namify<MACGrid>::S = "MACGrid";
-static const Pb::Register _R_131("MACGrid", "MACGrid", MACGrid::_W_38);
-static const Pb::Register _R_132("MACGrid", "setBoundMAC", MACGrid::_W_39);
+static const Pb::Register _R_149("MACGrid", "MACGrid", MACGrid::_W_44);
+static const Pb::Register _R_150("MACGrid", "setBoundMAC", MACGrid::_W_45);
 #endif
 static const Pb::Register _R_7("GridType_TypeNone", 0);
 static const Pb::Register _R_8("GridType_TypeReal", 1);
@@ -255,6 +273,24 @@ void PbRegister_file_7()
   KEEP_UNUSED(_R_130);
   KEEP_UNUSED(_R_131);
   KEEP_UNUSED(_R_132);
+  KEEP_UNUSED(_R_133);
+  KEEP_UNUSED(_R_134);
+  KEEP_UNUSED(_R_135);
+  KEEP_UNUSED(_R_136);
+  KEEP_UNUSED(_R_137);
+  KEEP_UNUSED(_R_138);
+  KEEP_UNUSED(_R_139);
+  KEEP_UNUSED(_R_140);
+  KEEP_UNUSED(_R_141);
+  KEEP_UNUSED(_R_142);
+  KEEP_UNUSED(_R_143);
+  KEEP_UNUSED(_R_144);
+  KEEP_UNUSED(_R_145);
+  KEEP_UNUSED(_R_146);
+  KEEP_UNUSED(_R_147);
+  KEEP_UNUSED(_R_148);
+  KEEP_UNUSED(_R_149);
+  KEEP_UNUSED(_R_150);
 }
 }
 }  // namespace Manta
 \ No newline at end of file
diff --git a/extern/mantaflow/preprocessed/grid4d.cpp b/extern/mantaflow/preprocessed/grid4d.cpp
index 72bd3a6fe50..a896b003736 100644
--- a/extern/mantaflow/preprocessed/grid4d.cpp
+++ b/extern/mantaflow/preprocessed/grid4d.cpp
@@ -179,29 +179,21 @@ struct kn4dMinReal : public KernelBase {
     return val;
   }
   typedef Grid4d<Real> type0;
-  void runMessage()
-  {
-    debMsg("Executing kernel kn4dMinReal ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, val, minVal);
-  }
+  void runMessage(){};  // empty stub: this kernel logs nothing
   void run()
   {
-    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  kn4dMinReal(kn4dMinReal &o, tbb::split)
-      : KernelBase(o), val(o.val), minVal(std::numeric_limits<Real>::max())
-  {
-  }
-  void join(const kn4dMinReal &o)
-  {
-    minVal = min(minVal, o.minVal);
+    const IndexInt _sz = size;  // local copy of `size`, used as the loop bound
+#pragma omp parallel
+    {
+      Real minVal = std::numeric_limits<Real>::max();  // per-thread partial min; shadows the member
+#pragma omp for nowait
+      for (IndexInt i = 0; i < _sz; i++)  // nowait: no barrier after the loop
+        op(i, val, minVal);
+#pragma omp critical
+      {
+        this->minVal = min(minVal, this->minVal);  // merge partial into the member, one thread at a time
+      }
+    }
   }
   Grid4d<Real> &val;
   Real minVal;
@@ -234,29 +226,21 @@ struct kn4dMaxReal : public KernelBase {
     return val;
   }
   typedef Grid4d<Real> type0;
-  void runMessage()
-  {
-    debMsg("Executing kernel kn4dMaxReal ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, val, maxVal);
-  }
+  void runMessage(){};  // empty stub: this kernel logs nothing
   void run()
   {
-    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  kn4dMaxReal(kn4dMaxReal &o, tbb::split)
-      : KernelBase(o), val(o.val), maxVal(-std::numeric_limits<Real>::max())
-  {
-  }
-  void join(const kn4dMaxReal &o)
-  {
-    maxVal = max(maxVal, o.maxVal);
+    const IndexInt _sz = size;  // local copy of `size`, used as the loop bound
+#pragma omp parallel
+    {
+      Real maxVal = -std::numeric_limits<Real>::max();  // per-thread partial max; shadows the member
+#pragma omp for nowait
+      for (IndexInt i = 0; i < _sz; i++)  // nowait: no barrier after the loop
+        op(i, val, maxVal);
+#pragma omp critical
+      {
+        this->maxVal = max(maxVal, this->maxVal);  // merge partial into the member, one thread at a time
+      }
+    }
   }
   Grid4d<Real> &val;
   Real maxVal;
@@ -289,29 +273,21 @@ struct kn4dMinInt : public KernelBase {
     return val;
   }
   typedef Grid4d<int> type0;
-  void runMessage()
-  {
-    debMsg("Executing kernel kn4dMinInt ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, val, minVal);
-  }
+  void runMessage(){};  // empty stub: this kernel logs nothing
   void run()
   {
-    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  kn4dMinInt(kn4dMinInt &o, tbb::split)
-      : KernelBase(o), val(o.val), minVal(std::numeric_limits<int>::max())
-  {
-  }
-  void join(const kn4dMinInt &o)
-  {
-    minVal = min(minVal, o.minVal);
+    const IndexInt _sz = size;  // local copy of `size`, used as the loop bound
+#pragma omp parallel
+    {
+      int minVal = std::numeric_limits<int>::max();  // per-thread partial min; shadows the member
+#pragma omp for nowait
+      for (IndexInt i = 0; i < _sz; i++)  // nowait: no barrier after the loop
+        op(i, val, minVal);
+#pragma omp critical
+      {
+        this->minVal = min(minVal, this->minVal);  // merge partial into the member, one thread at a time
+      }
+    }
   }
   Grid4d<int> &val;
   int minVal;
@@ -344,29 +320,21 @@ struct kn4dMaxInt : public KernelBase {
     return val;
   }
   typedef Grid4d<int> type0;
-  void runMessage()
-  {
-    debMsg("Executing kernel kn4dMaxInt ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, val, maxVal);
-  }
+  void runMessage(){};  // empty stub: this kernel logs nothing
   void run()
   {
-    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  kn4dMaxInt(kn4dMaxInt &o, tbb::split)
-      : KernelBase(o), val(o.val), maxVal(std::numeric_limits<int>::min())
-  {
-  }
-  void join(const kn4dMaxInt &o)
-  {
-    maxVal = max(maxVal, o.maxVal);
+    const IndexInt _sz = size;  // local copy of `size`, used as the loop bound
+#pragma omp parallel
+    {
+      int maxVal = std::numeric_limits<int>::min();  // per-thread partial max; shadows the member
+#pragma omp for nowait
+      for (IndexInt i = 0; i < _sz; i++)  // nowait: no barrier after the loop
+        op(i, val, maxVal);
+#pragma omp critical
+      {
+        this->maxVal = max(maxVal, this->maxVal);  // merge partial into the member, one thread at a time
+      }
+    }
   }
   Grid4d<int> &val;
   int maxVal;
@@ -400,29 +368,21 @@ template<class VEC> struct kn4dMinVec : public KernelBase {
     return val;
   }
   typedef Grid4d<VEC> type0;
-  void runMessage()
-  {
-    debMsg("Executing kernel kn4dMinVec ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, val, minVal);
-  }
+  void runMessage(){};  // empty stub: this kernel logs nothing
   void run()
   {
-    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  kn4dMinVec(kn4dMinVec &o, tbb::split)
-      : KernelBase(o), val(o.val), minVal(std::numeric_limits<Real>::max())
-  {
-  }
-  void join(const kn4dMinVec &o)
-  {
-    minVal = min(minVal, o.minVal);
+    const IndexInt _sz = size;  // local copy of `size`, used as the loop bound
+#pragma omp parallel
+    {
+      Real minVal = std::numeric_limits<Real>::max();  // per-thread partial min; shadows the member
+#pragma omp for nowait
+      for (IndexInt i = 0; i < _sz; i++)  // nowait: no barrier after the loop
+        op(i, val, minVal);
+#pragma omp critical
+      {
+        this->minVal = min(minVal, this->minVal);  // merge partial into the member, one thread at a time
+      }
+    }
   }
   Grid4d<VEC> &val;
   Real minVal;
@@ -456,29 +416,21 @@ template<class VEC> struct kn4dMaxVec : public KernelBase {
     return val;
   }
   typedef Grid4d<VEC> type0;
-  void runMessage()
-  {
-    debMsg("Executing kernel kn4dMaxVec ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, val, maxVal);
-  }
+  void runMessage(){};  // empty stub: this kernel logs nothing
   void run()
   {
-    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  kn4dMaxVec(kn4dMaxVec &o, tbb::split)
-      : KernelBase(o), val(o.val), maxVal(-std::numeric_limits<Real>::max())
-  {
-  }
-  void join(const kn4dMaxVec &o)
-  {
-    maxVal = max(maxVal, o.maxVal);
+    const IndexInt _sz = size;  // local copy of `size`, used as the loop bound
+#pragma omp parallel
+    {
+      Real maxVal = -std::numeric_limits<Real>::max();  // per-thread partial max; shadows the member
+#pragma omp for nowait
+      for (IndexInt i = 0; i < _sz; i++)  // nowait: no barrier after the loop
+        op(i, val, maxVal);
+#pragma omp critical
+      {
+        this->maxVal = max(maxVal, this->maxVal);  // merge partial into the member, one thread at a time
+      }
+    }
   }
   Grid4d<VEC> &val;
   Real maxVal;
@@ -507,7 +459,7 @@ template<class T> struct kn4dSetConstReal : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid4d<T> &me, T val) const
+  inline void op(IndexInt idx, Grid4d<T> &me, T val)
   {
     me[idx] = val;
   }
@@ -521,21 +473,17 @@ template<class T> struct kn4dSetConstReal : public KernelBase {
     return val;
   }
   typedef T type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel kn4dSetConstReal ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, val);
-  }
+  void runMessage(){};  // empty stub: this kernel logs nothing
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;  // local copy of `size`, used as the loop bound
+#pragma omp parallel
+    {
+      // Split the flat index range [0, _sz) across the team; op writes me[i] = val.
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, val);
+    }
   }
   Grid4d<T> &me;
   T val;
@@ -546,7 +494,7 @@ template<class T> struct kn4dAddConstReal : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid4d<T> &me, T val) const
+  inline void op(IndexInt idx, Grid4d<T> &me, T val)
   {
     me[idx] += val;
   }
@@ -560,21 +508,17 @@ template<class T> struct kn4dAddConstReal : public KernelBase {
     return val;
   }
   typedef T type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel kn4dAddConstReal ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, val);
-  }
+  void runMessage(){};  // empty stub: this kernel logs nothing
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;  // local copy of `size`, used as the loop bound
+#pragma omp parallel
+    {
+      // Split the flat index range [0, _sz) across the team; op does me[i] += val.
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, val);
+    }
   }
   Grid4d<T> &me;
   T val;
@@ -585,7 +529,7 @@ template<class T> struct kn4dMultConst : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid4d<T> &me, T val) const
+  inline void op(IndexInt idx, Grid4d<T> &me, T val)
   {
     me[idx] *= val;
   }
@@ -599,21 +543,17 @@ template<class T> struct kn4dMultConst : public KernelBase {
     return val;
   }
   typedef T type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel kn4dMultConst ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, val);
-  }
+  void runMessage(){};  // empty stub: this kernel logs nothing
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;  // local copy of `size`, used as the loop bound
+#pragma omp parallel
+    {
+      // Split the flat index range [0, _sz) across the team; op does me[i] *= val.
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, val);
+    }
   }
   Grid4d<T> &me;
   T val;
@@ -624,7 +564,7 @@ template<class T> struct kn4dClamp : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid4d<T> &me, T min, T max) const
+  inline void op(IndexInt idx, Grid4d<T> &me, T min, T max)
   {
     me[idx] = clamp(me[idx], min, max);
   }
@@ -643,21 +583,17 @@ template<class T> struct kn4dClamp : public KernelBase {
     return max;
   }
   typedef T type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel kn4dClamp ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, min, max);
-  }
+  void runMessage(){};  // empty stub: this kernel logs nothing
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;  // local copy of `size`, used as the loop bound
+#pragma omp parallel
+    {
+      // Split the flat index range [0, _sz) across the team; op clamps me[i] to [min, max].
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, min, max);
+    }
   }
   Grid4d<T> &me;
   T min;
@@ -788,7 +724,7 @@ struct knGetComp4d : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, const Grid4d<Vec4> &src, Grid4d<Real> &dst, int c) const
+  inline void op(IndexInt idx, const Grid4d<Vec4> &src, Grid4d<Real> &dst, int c)
   {
     dst[idx] = src[idx][c];
   }
@@ -807,21 +743,17 @@ struct knGetComp4d : public KernelBase {
     return c;
   }
   typedef int type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel knGetComp4d ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, src, dst, c);
-  }
+  void runMessage(){};  // empty stub: this kernel logs nothing
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;  // local copy of `size`, used as the loop bound
+#pragma omp parallel
+    {
+      // Split the flat index range [0, _sz) across the team; op copies src[i][c] into dst[i].
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, src, dst, c);
+    }
   }
   const Grid4d<Vec4> &src;
   Grid4d<Real> &dst;
@@ -835,7 +767,7 @@ struct knSetComp4d : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, const Grid4d<Real> &src, Grid4d<Vec4> &dst, int c) const
+  inline void op(IndexInt idx, const Grid4d<Real> &src, Grid4d<Vec4> &dst, int c)
   {
     dst[idx][c] = src[idx];
   }
@@ -854,21 +786,17 @@ struct knSetComp4d : public KernelBase {
     return c;
   }
   typedef int type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel knSetComp4d ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, src, dst, c);
-  }
+  void runMessage(){};  // empty stub: this kernel logs nothing
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;  // local copy of `size`, used as the loop bound
+#pragma omp parallel
+    {
+      // Split the flat index range [0, _sz) across the team; op copies src[i] into dst[i][c].
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, src, dst, c);
+    }
   }
   const Grid4d<Real> &src;
   Grid4d<Vec4> &dst;
@@ -959,7 +887,7 @@ template<class T> struct knSetBnd4d : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, int t, Grid4d<T> &grid, T value, int w) const
+  inline void op(int i, int j, int k, int t, Grid4d<T> &grid, T value, int w)
   {
     bool bnd = (i <= w || i >= grid.getSizeX() - 1 - w || j <= w || j >= grid.getSizeY() - 1 - w ||
                 k <= w || k >= grid.getSizeZ() - 1 - w || t <= w || t >= grid.getSizeT() - 1 - w);
@@ -981,50 +909,47 @@ template<class T> struct knSetBnd4d : public KernelBase {
     return w;
   }
   typedef int type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel knSetBnd4d ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ
-               << " "
-                  " t "
-               << minT << " - " << maxT,
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};  // empty stub: this kernel logs nothing
+  void run()  // calls op on every (i, j, k, t) of the kernel range, parallel over the outermost axis
   {
+    const int _maxX = maxX;  // local copies of the member bounds used by the inner loops
+    const int _maxY = maxY;
     if (maxT > 1) {
-      for (int t = __r.begin(); t != (int)__r.end(); t++)
-        for (int k = 0; k < maxZ; k++)
-          for (int j = 0; j < maxY; j++)
-            for (int i = 0; i < maxX; i++)
-              op(i, j, k, t, grid, value, w);
+      const int _maxZ = maxZ;
+#pragma omp parallel
+      {
+        // maxT > 1: parallelise over t in [minT, maxT), mirroring the k = minZ loop below.
+#pragma omp for
+        for (int t = minT; t < maxT; t++)
+          for (int k = 0; k < _maxZ; k++)
+            for (int j = 0; j < _maxY; j++)
+              for (int i = 0; i < _maxX; i++)
+                op(i, j, k, t, grid, value, w);
+      }
     }
     else if (maxZ > 1) {
       const int t = 0;
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < maxY; j++)
-          for (int i = 0; i < maxX; i++)
-            op(i, j, k, t, grid, value, w);
+#pragma omp parallel
+      {
+        // maxT <= 1 but maxZ > 1: t fixed at 0, parallelise over k in [minZ, maxZ).
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, t, grid, value, w);
+      }
     }
     else {
       const int t = 0;
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < maxX; i++)
-          op(i, j, k, t, grid, value, w);
-    }
-  }
-  void run()
-  {
-    if (maxT > 1) {
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minT, maxT), *this);
-    }
-    else if (maxZ > 1) {
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    }
-    else {
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
+#pragma omp parallel
+      {
+        // maxT <= 1 and maxZ <= 1: t and k fixed at 0, parallelise over rows j in [0, _maxY).
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, t, grid, value, w);
+      }
     }
   }
   Grid4d<T> &grid;
@@ -1043,7 +968,7 @@ template<class T> struct knSetBnd4dNeumann : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, int t, Grid4d<T> &grid, int w) const
+  inline void op(int i, int j, int k, int t, Grid4d<T> &grid, int w)
   {
     bool set = false;
     int si = i, sj = j, sk = k, st = t;
@@ -1092,50 +1017,47 @@ template<class T> struct knSetBnd4dNeumann : public KernelBase {
     return w;
   }
   typedef int type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knSetBnd4dNeumann ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ
-               << " "
-                  " t "
-               << minT << " - " << maxT,
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
     if (maxT > 1) {
-      for (int t = __r.begin(); t != (int)__r.end(); t++)
-        for (int k = 0; k < maxZ; k++)
-          for (int j = 0; j < maxY; j++)
-            for (int i = 0; i < maxX; i++)
-              op(i, j, k, t, grid, w);
+      const int _maxZ = maxZ;
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int t = minT; t < maxT; t++)  // begin at minT, like the TBB range this replaces
+          for (int k = 0; k < _maxZ; k++)
+            for (int j = 0; j < _maxY; j++)
+              for (int i = 0; i < _maxX; i++)
+                op(i, j, k, t, grid, w);
+      }
     }
     else if (maxZ > 1) {
       const int t = 0;
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < maxY; j++)
-          for (int i = 0; i < maxX; i++)
-            op(i, j, k, t, grid, w);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, t, grid, w);
+      }
     }
     else {
       const int t = 0;
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < maxX; i++)
-          op(i, j, k, t, grid, w);
-    }
-  }
-  void run()
-  {
-    if (maxT > 1) {
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minT, maxT), *this);
-    }
-    else if (maxZ > 1) {
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    }
-    else {
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, t, grid, w);
+      }
     }
   }
   Grid4d<T> &grid;
@@ -1329,7 +1251,7 @@ template<class S> struct knSetRegion4d : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, int t, Grid4d<S> &dst, Vec4 start, Vec4 end, S value) const
+  inline void op(int i, int j, int k, int t, Grid4d<S> &dst, Vec4 start, Vec4 end, S value)
   {
     Vec4 p(i, j, k, t);
     for (int c = 0; c < 4; ++c)
@@ -1357,50 +1279,47 @@ template<class S> struct knSetRegion4d : public KernelBase {
     return value;
   }
   typedef S type3;
-  void runMessage()
-  {
-    debMsg("Executing kernel knSetRegion4d ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ
-               << " "
-                  " t "
-               << minT << " - " << maxT,
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
     if (maxT > 1) {
-      for (int t = __r.begin(); t != (int)__r.end(); t++)
-        for (int k = 0; k < maxZ; k++)
-          for (int j = 0; j < maxY; j++)
-            for (int i = 0; i < maxX; i++)
-              op(i, j, k, t, dst, start, end, value);
+      const int _maxZ = maxZ;
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int t = minT; t < maxT; t++)  // begin at minT, like the TBB range this replaces
+          for (int k = 0; k < _maxZ; k++)
+            for (int j = 0; j < _maxY; j++)
+              for (int i = 0; i < _maxX; i++)
+                op(i, j, k, t, dst, start, end, value);
+      }
     }
     else if (maxZ > 1) {
       const int t = 0;
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < maxY; j++)
-          for (int i = 0; i < maxX; i++)
-            op(i, j, k, t, dst, start, end, value);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, t, dst, start, end, value);
+      }
     }
     else {
       const int t = 0;
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < maxX; i++)
-          op(i, j, k, t, dst, start, end, value);
-    }
-  }
-  void run()
-  {
-    if (maxT > 1) {
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minT, maxT), *this);
-    }
-    else if (maxZ > 1) {
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    }
-    else {
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, t, dst, start, end, value);
+      }
     }
   }
   Grid4d<S> &dst;
@@ -1619,7 +1538,7 @@ template<class S> struct knInterpol4d : public KernelBase {
                  Grid4d<S> &target,
                  Grid4d<S> &source,
                  const Vec4 &srcFac,
-                 const Vec4 &offset) const
+                 const Vec4 &offset)
   {
     Vec4 pos = Vec4(i, j, k, t) * srcFac + offset;
     target(i, j, k, t) = source.getInterpolated(pos);
@@ -1644,50 +1563,47 @@ template<class S> struct knInterpol4d : public KernelBase {
     return offset;
   }
   typedef Vec4 type3;
-  void runMessage()
-  {
-    debMsg("Executing kernel knInterpol4d ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ
-               << " "
-                  " t "
-               << minT << " - " << maxT,
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
     if (maxT > 1) {
-      for (int t = __r.begin(); t != (int)__r.end(); t++)
-        for (int k = 0; k < maxZ; k++)
-          for (int j = 0; j < maxY; j++)
-            for (int i = 0; i < maxX; i++)
-              op(i, j, k, t, target, source, srcFac, offset);
+      const int _maxZ = maxZ;
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int t = minT; t < maxT; t++)  // begin at minT, like the TBB range this replaces
+          for (int k = 0; k < _maxZ; k++)
+            for (int j = 0; j < _maxY; j++)
+              for (int i = 0; i < _maxX; i++)
+                op(i, j, k, t, target, source, srcFac, offset);
+      }
     }
     else if (maxZ > 1) {
       const int t = 0;
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < maxY; j++)
-          for (int i = 0; i < maxX; i++)
-            op(i, j, k, t, target, source, srcFac, offset);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, t, target, source, srcFac, offset);
+      }
     }
     else {
       const int t = 0;
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < maxX; i++)
-          op(i, j, k, t, target, source, srcFac, offset);
-    }
-  }
-  void run()
-  {
-    if (maxT > 1) {
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minT, maxT), *this);
-    }
-    else if (maxZ > 1) {
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    }
-    else {
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, t, target, source, srcFac, offset);
+      }
     }
   }
   Grid4d<S> &target;
diff --git a/extern/mantaflow/preprocessed/grid4d.h b/extern/mantaflow/preprocessed/grid4d.h
index 1741db590b7..2a2abad1e24 100644
--- a/extern/mantaflow/preprocessed/grid4d.h
+++ b/extern/mantaflow/preprocessed/grid4d.h
@@ -326,7 +326,6 @@ class Grid4dBase : public PbClass {
   // precomputed Z,T shift: to ensure 2D compatibility, always use this instead of sx*sy !
   IndexInt mStrideZ;
   IndexInt mStrideT;
-
  public:
   PbArgs _args;
 }
@@ -951,7 +950,6 @@ template<class T> class Grid4d : public Grid4dBase {
 
  protected:
   T *mData;
-
  public:
   PbArgs _args;
 }
@@ -1027,7 +1025,7 @@ template<class T, class S> struct Grid4dAdd : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid4d<T> &me, const Grid4d<S> &other) const
+  inline void op(IndexInt idx, Grid4d<T> &me, const Grid4d<S> &other)
   {
     me[idx] += other[idx];
   }
@@ -1041,21 +1039,17 @@ template<class T, class S> struct Grid4dAdd : public KernelBase {
     return other;
   }
   typedef Grid4d<S> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel Grid4dAdd ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, other);
+    }
   }
   Grid4d<T> &me;
   const Grid4d<S> &other;
@@ -1066,7 +1060,7 @@ template<class T, class S> struct Grid4dSub : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid4d<T> &me, const Grid4d<S> &other) const
+  inline void op(IndexInt idx, Grid4d<T> &me, const Grid4d<S> &other)
   {
     me[idx] -= other[idx];
   }
@@ -1080,21 +1074,17 @@ template<class T, class S> struct Grid4dSub : public KernelBase {
     return other;
   }
   typedef Grid4d<S> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel Grid4dSub ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, other);
+    }
   }
   Grid4d<T> &me;
   const Grid4d<S> &other;
@@ -1105,7 +1095,7 @@ template<class T, class S> struct Grid4dMult : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid4d<T> &me, const Grid4d<S> &other) const
+  inline void op(IndexInt idx, Grid4d<T> &me, const Grid4d<S> &other)
   {
     me[idx] *= other[idx];
   }
@@ -1119,21 +1109,17 @@ template<class T, class S> struct Grid4dMult : public KernelBase {
     return other;
   }
   typedef Grid4d<S> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel Grid4dMult ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, other);
+    }
   }
   Grid4d<T> &me;
   const Grid4d<S> &other;
@@ -1144,7 +1130,7 @@ template<class T, class S> struct Grid4dDiv : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid4d<T> &me, const Grid4d<S> &other) const
+  inline void op(IndexInt idx, Grid4d<T> &me, const Grid4d<S> &other)
   {
     me[idx] /= other[idx];
   }
@@ -1158,21 +1144,17 @@ template<class T, class S> struct Grid4dDiv : public KernelBase {
     return other;
   }
   typedef Grid4d<S> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel Grid4dDiv ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, other);
+    }
   }
   Grid4d<T> &me;
   const Grid4d<S> &other;
@@ -1183,7 +1165,7 @@ template<class T, class S> struct Grid4dAddScalar : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid4d<T> &me, const S &other) const
+  inline void op(IndexInt idx, Grid4d<T> &me, const S &other)
   {
     me[idx] += other;
   }
@@ -1197,21 +1179,17 @@ template<class T, class S> struct Grid4dAddScalar : public KernelBase {
     return other;
   }
   typedef S type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel Grid4dAddScalar ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, other);
+    }
   }
   Grid4d<T> &me;
   const S &other;
@@ -1222,7 +1200,7 @@ template<class T, class S> struct Grid4dMultScalar : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid4d<T> &me, const S &other) const
+  inline void op(IndexInt idx, Grid4d<T> &me, const S &other)
   {
     me[idx] *= other;
   }
@@ -1236,21 +1214,17 @@ template<class T, class S> struct Grid4dMultScalar : public KernelBase {
     return other;
   }
   typedef S type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel Grid4dMultScalar ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, other);
+    }
   }
   Grid4d<T> &me;
   const S &other;
@@ -1262,7 +1236,7 @@ template<class T, class S> struct Grid4dScaledAdd : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid4d<T> &me, const Grid4d<T> &other, const S &factor) const
+  inline void op(IndexInt idx, Grid4d<T> &me, const Grid4d<T> &other, const S &factor)
   {
     me[idx] += factor * other[idx];
   }
@@ -1281,21 +1255,17 @@ template<class T, class S> struct Grid4dScaledAdd : public KernelBase {
     return factor;
   }
   typedef S type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel Grid4dScaledAdd ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other, factor);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, other, factor);
+    }
   }
   Grid4d<T> &me;
   const Grid4d<T> &other;
@@ -1308,7 +1278,7 @@ template<class T> struct Grid4dSafeDiv : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid4d<T> &me, const Grid4d<T> &other) const
+  inline void op(IndexInt idx, Grid4d<T> &me, const Grid4d<T> &other)
   {
     me[idx] = safeDivide(me[idx], other[idx]);
   }
@@ -1322,21 +1292,17 @@ template<class T> struct Grid4dSafeDiv : public KernelBase {
     return other;
   }
   typedef Grid4d<T> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel Grid4dSafeDiv ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, other);
+    }
   }
   Grid4d<T> &me;
   const Grid4d<T> &other;
@@ -1347,7 +1313,7 @@ template<class T> struct Grid4dSetConst : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid4d<T> &me, T value) const
+  inline void op(IndexInt idx, Grid4d<T> &me, T value)
   {
     me[idx] = value;
   }
@@ -1361,21 +1327,17 @@ template<class T> struct Grid4dSetConst : public KernelBase {
     return value;
   }
   typedef T type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel Grid4dSetConst ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, value);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, value);
+    }
   }
   Grid4d<T> &me;
   T value;
@@ -1473,7 +1435,7 @@ template<class S> struct KnInterpolateGrid4dTempl : public KernelBase {
                  Grid4d<S> &target,
                  Grid4d<S> &source,
                  const Vec4 &sourceFactor,
-                 Vec4 offset) const
+                 Vec4 offset)
   {
     Vec4 pos = Vec4(i, j, k, t) * sourceFactor + offset;
     if (!source.is3D())
@@ -1502,50 +1464,47 @@ template<class S> struct KnInterpolateGrid4dTempl : public KernelBase {
     return offset;
   }
   typedef Vec4 type3;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnInterpolateGrid4dTempl ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ
-               << " "
-                  " t "
-               << minT << " - " << maxT,
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
+    const int _maxX = maxX;
+    const int _maxY = maxY;
     if (maxT > 1) {
-      for (int t = __r.begin(); t != (int)__r.end(); t++)
-        for (int k = 0; k < maxZ; k++)
-          for (int j = 0; j < maxY; j++)
-            for (int i = 0; i < maxX; i++)
-              op(i, j, k, t, target, source, sourceFactor, offset);
+      const int _maxZ = maxZ;
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int t = minT; t < maxT; t++)  // begin at minT, like the TBB range this replaces
+          for (int k = 0; k < _maxZ; k++)
+            for (int j = 0; j < _maxY; j++)
+              for (int i = 0; i < _maxX; i++)
+                op(i, j, k, t, target, source, sourceFactor, offset);
+      }
     }
     else if (maxZ > 1) {
       const int t = 0;
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < maxY; j++)
-          for (int i = 0; i < maxX; i++)
-            op(i, j, k, t, target, source, sourceFactor, offset);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, t, target, source, sourceFactor, offset);
+      }
     }
     else {
       const int t = 0;
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < maxX; i++)
-          op(i, j, k, t, target, source, sourceFactor, offset);
-    }
-  }
-  void run()
-  {
-    if (maxT > 1) {
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minT, maxT), *this);
-    }
-    else if (maxZ > 1) {
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    }
-    else {
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, t, target, source, sourceFactor, offset);
+      }
     }
   }
   Grid4d<S> &target;
diff --git a/extern/mantaflow/preprocessed/kernel.h b/extern/mantaflow/preprocessed/kernel.h
index dbcc2342a11..90e30cd21e1 100644
--- a/extern/mantaflow/preprocessed/kernel.h
+++ b/extern/mantaflow/preprocessed/kernel.h
@@ -71,19 +71,6 @@ class ParticleBase;
       for (int j = bnd; j < (grid).getSizeY() - bnd; ++j) \
         for (int i = bnd; i < (grid).getSizeX() - bnd; ++i)
 
-#define FOR_NEIGHBORS_BND(grid, radius, bnd) \
-  for (int zj = ((grid).is3D() ? std::max(bnd, k - radius) : 0); \
-       zj <= ((grid).is3D() ? std::min(k + radius, (grid).getSizeZ() - 1 - bnd) : 0); \
-       zj++) \
-    for (int yj = std::max(bnd, j - radius); \
-         yj <= std::min(j + radius, (grid).getSizeY() - 1 - bnd); \
-         yj++) \
-      for (int xj = std::max(bnd, i - radius); \
-           xj <= std::min(i + radius, (grid).getSizeX() - 1 - bnd); \
-           xj++)
-
-#define FOR_NEIGHBORS(grid, radius) FOR_NEIGHBORS_BND(grid, radius, 0)
-
 //! Basic data structure for kernel data, initialized based on kernel type (e.g. single, idx, etc).
 struct KernelBase {
   int maxX, maxY, maxZ, minZ, maxT, minT;
diff --git a/extern/mantaflow/preprocessed/levelset.cpp b/extern/mantaflow/preprocessed/levelset.cpp
index c4fecace7ca..1ce18fab3f6 100644
--- a/extern/mantaflow/preprocessed/levelset.cpp
+++ b/extern/mantaflow/preprocessed/levelset.cpp
@@ -62,7 +62,7 @@ struct InitFmIn : public KernelBase {
                  Grid<int> &fmFlags,
                  Grid<Real> &phi,
                  bool ignoreWalls,
-                 int obstacleType) const
+                 int obstacleType)
   {
     const IndexInt idx = flags.index(i, j, k);
     const Real v = phi[idx];
@@ -104,37 +104,35 @@ struct InitFmIn : public KernelBase {
     return obstacleType;
   }
   typedef int type4;
-  void runMessage()
-  {
-    debMsg("Executing kernel InitFmIn ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, flags, fmFlags, phi, ignoreWalls, obstacleType);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, flags, fmFlags, phi, ignoreWalls, obstacleType);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, flags, fmFlags, phi, ignoreWalls, obstacleType);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, flags, fmFlags, phi, ignoreWalls, obstacleType);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   const FlagGrid &flags;
   Grid<int> &fmFlags;
   Grid<Real> &phi;
@@ -165,7 +163,7 @@ struct InitFmOut : public KernelBase {
                  Grid<int> &fmFlags,
                  Grid<Real> &phi,
                  bool ignoreWalls,
-                 int obstacleType) const
+                 int obstacleType)
   {
     const IndexInt idx = flags.index(i, j, k);
     const Real v = phi[idx];
@@ -205,37 +203,35 @@ struct InitFmOut : public KernelBase {
     return obstacleType;
   }
   typedef int type4;
-  void runMessage()
-  {
-    debMsg("Executing kernel InitFmOut ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, flags, fmFlags, phi, ignoreWalls, obstacleType);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, flags, fmFlags, phi, ignoreWalls, obstacleType);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, flags, fmFlags, phi, ignoreWalls, obstacleType);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, flags, fmFlags, phi, ignoreWalls, obstacleType);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   const FlagGrid &flags;
   Grid<int> &fmFlags;
   Grid<Real> &phi;
@@ -269,7 +265,7 @@ struct SetUninitialized : public KernelBase {
                  Grid<Real> &phi,
                  const Real val,
                  int ignoreWalls,
-                 int obstacleType) const
+                 int obstacleType)
   {
     if (ignoreWalls) {
       if ((fmFlags(i, j, k) != FlagInited) && ((flags(i, j, k) & obstacleType) == 0)) {
@@ -311,37 +307,35 @@ struct SetUninitialized : public KernelBase {
     return obstacleType;
   }
   typedef int type5;
-  void runMessage()
-  {
-    debMsg("Executing kernel SetUninitialized ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, flags, fmFlags, phi, val, ignoreWalls, obstacleType);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, flags, fmFlags, phi, val, ignoreWalls, obstacleType);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, flags, fmFlags, phi, val, ignoreWalls, obstacleType);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, flags, fmFlags, phi, val, ignoreWalls, obstacleType);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   const Grid<int> &flags;
   Grid<int> &fmFlags;
   Grid<Real> &phi;
@@ -371,7 +365,8 @@ inline bool isAtInterface(const Grid<int> &fmFlags, Grid<Real> &phi, const Vec3i
 //************************************************************************
 // Levelset class def
 
-LevelsetGrid::LevelsetGrid(FluidSolver *parent, bool show) : Grid<Real>(parent, show)
+LevelsetGrid::LevelsetGrid(FluidSolver *parent, bool show, bool offload)
+    : Grid<Real>(parent, show, false /*sparse*/, offload)
 {
   mType = (GridType)(TypeLevelset | TypeReal);
 }
@@ -394,7 +389,7 @@ struct KnJoin : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid<Real> &a, const Grid<Real> &b) const
+  inline void op(IndexInt idx, Grid<Real> &a, const Grid<Real> &b)
   {
     a[idx] = min(a[idx], b[idx]);
   }
@@ -408,21 +403,17 @@ struct KnJoin : public KernelBase {
     return b;
   }
   typedef Grid<Real> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnJoin ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, a, b);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, a, b);
+    }
   }
   Grid<Real> &a;
   const Grid<Real> &b;
@@ -440,11 +431,8 @@ struct KnSubtract : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx,
-                 Grid<Real> &a,
-                 const Grid<Real> &b,
-                 const FlagGrid *flags,
-                 int subtractType) const
+  inline void op(
+      IndexInt idx, Grid<Real> &a, const Grid<Real> &b, const FlagGrid *flags, int subtractType)
   {
     if (flags && ((*flags)(idx)&subtractType) == 0)
       return;
@@ -471,21 +459,17 @@ struct KnSubtract : public KernelBase {
     return subtractType;
   }
   typedef int type3;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnSubtract ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, a, b, flags, subtractType);
-  }
+  void runMessage(){};  // empty body: no kernel debug message is printed
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      // Divide the indices [0, _sz) among the threads of the surrounding parallel region.
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, a, b, flags, subtractType);
+    }
   }
   Grid<Real> &a;
   const Grid<Real> &b;
@@ -668,7 +652,7 @@ struct KnFillApply : public KernelBase {
                  Grid<int> &visited,
                  const Real value,
                  const int boundaryWidth,
-                 const bool outside) const
+                 const bool outside)
   {
 
     if (visited(i, j, k) == ID_VISITED)
@@ -706,37 +690,35 @@ struct KnFillApply : public KernelBase {
     return outside;
   }
   typedef bool type4;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnFillApply ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};  // empty body: no kernel debug message is printed
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = boundaryWidth; j < _maxY; j++)
-          for (int i = boundaryWidth; i < _maxX; i++)
-            op(i, j, k, target, visited, value, boundaryWidth, outside);
+      // maxZ > 1: slices k in [minZ, maxZ) are divided among threads; j and i start at boundaryWidth.
+#pragma omp parallel
+      {
+        // Only the outer k loop is work-shared; the j and i loops run serially inside each slice.
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = boundaryWidth; j < _maxY; j++)
+            for (int i = boundaryWidth; i < _maxX; i++)
+              op(i, j, k, target, visited, value, boundaryWidth, outside);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = boundaryWidth; i < _maxX; i++)
-          op(i, j, k, target, visited, value, boundaryWidth, outside);
+#pragma omp parallel
+      {
+        // Single slice (k == 0): rows j in [boundaryWidth, _maxY) are divided among threads.
+#pragma omp for
+        for (int j = boundaryWidth; j < _maxY; j++)
+          for (int i = boundaryWidth; i < _maxX; i++)
+            op(i, j, k, target, visited, value, boundaryWidth, outside);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(boundaryWidth, maxY), *this);
-  }
   Grid<Real> &target;
   Grid<int> &visited;
   const Real value;
diff --git a/extern/mantaflow/preprocessed/levelset.h b/extern/mantaflow/preprocessed/levelset.h
index ae162f73c3d..eea604ca2fd 100644
--- a/extern/mantaflow/preprocessed/levelset.h
+++ b/extern/mantaflow/preprocessed/levelset.h
@@ -27,7 +27,7 @@ class Mesh;
 //! Special function for levelsets
 class LevelsetGrid : public Grid<Real> {
  public:
-  LevelsetGrid(FluidSolver *parent, bool show = true);
+  LevelsetGrid(FluidSolver *parent, bool show = true, bool offload = false);
   static int _W_0(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
   {
     PbClass *obj = Pb::objFromPy(_self);
@@ -41,7 +41,8 @@ class LevelsetGrid : public Grid<Real> {
         ArgLocker _lock;
         FluidSolver *parent = _args.getPtr<FluidSolver>("parent", 0, &_lock);
         bool show = _args.getOpt<bool>("show", 1, true, &_lock);
-        obj = new LevelsetGrid(parent, show);
+        bool offload = _args.getOpt<bool>("offload", 2, false, &_lock);
+        obj = new LevelsetGrid(parent, show, offload);
         obj->registerObject(_self, &_args);
         _args.check();
       }
@@ -266,7 +267,6 @@ class LevelsetGrid : public Grid<Real> {
   }
 
   static Real invalidTimeValue();
-
  public:
   PbArgs _args;
 }
diff --git a/extern/mantaflow/preprocessed/mesh.cpp b/extern/mantaflow/preprocessed/mesh.cpp
index 79ddc4bd879..7eb69438e7f 100644
--- a/extern/mantaflow/preprocessed/mesh.cpp
+++ b/extern/mantaflow/preprocessed/mesh.cpp
@@ -344,22 +344,19 @@ void Mesh::rebuildChannels()
     mNodeChannels[i]->resize(mNodes.size());
 }
 
-struct _KnAdvectMeshInGrid : public KernelBase {
-  _KnAdvectMeshInGrid(const KernelBase &base,
-                      vector<Node> &nodes,
-                      const FlagGrid &flags,
-                      const MACGrid &vel,
-                      const Real dt,
-                      vector<Vec3> &u)
-      : KernelBase(base), nodes(nodes), flags(flags), vel(vel), dt(dt), u(u)
+struct KnAdvectMeshInGrid : public KernelBase {
+  KnAdvectMeshInGrid(vector<Node> &nodes, const FlagGrid &flags, const MACGrid &vel, const Real dt)
+      : KernelBase(nodes.size()), nodes(nodes), flags(flags), vel(vel), dt(dt), u((size))
   {
+    runMessage();
+    run();
   }
   inline void op(IndexInt idx,
                  vector<Node> &nodes,
                  const FlagGrid &flags,
                  const MACGrid &vel,
                  const Real dt,
-                 vector<Vec3> &u) const
+                 vector<Vec3> &u)
   {
     if (nodes[idx].flags & Mesh::NfFixed)
       u[idx] = 0.0;
@@ -368,38 +365,6 @@ struct _KnAdvectMeshInGrid : public KernelBase {
     else
       u[idx] = vel.getInterpolated(nodes[idx].pos) * dt;
   }
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, nodes, flags, vel, dt, u);
-  }
-  void run()
-  {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  vector<Node> &nodes;
-  const FlagGrid &flags;
-  const MACGrid &vel;
-  const Real dt;
-  vector<Vec3> &u;
-};
-struct KnAdvectMeshInGrid : public KernelBase {
-  KnAdvectMeshInGrid(vector<Node> &nodes, const FlagGrid &flags, const MACGrid &vel, const Real dt)
-      : KernelBase(nodes.size()),
-        _inner(KernelBase(nodes.size()), nodes, flags, vel, dt, u),
-        nodes(nodes),
-        flags(flags),
-        vel(vel),
-        dt(dt),
-        u((size))
-  {
-    runMessage();
-    run();
-  }
-  void run()
-  {
-    _inner.run();
-  }
   inline operator vector<Vec3>()
   {
     return u;
@@ -428,14 +393,18 @@ struct KnAdvectMeshInGrid : public KernelBase {
     return dt;
   }
   typedef Real type3;
-  void runMessage()
+  void runMessage(){};  // empty body: no kernel debug message is printed
+  void run()
   {
-    debMsg("Executing kernel KnAdvectMeshInGrid ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  _KnAdvectMeshInGrid _inner;
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      // Divide the node indices [0, _sz) among the threads; op() writes one entry of u per index.
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, nodes, flags, vel, dt, u);
+    }
+  }
   vector<Node> &nodes;
   const FlagGrid &flags;
   const MACGrid &vel;
@@ -1012,7 +981,7 @@ template<class T> struct ApplyMeshToGrid : public KernelBase {
     run();
   }
   inline void op(
-      int i, int j, int k, Grid<T> *grid, Grid<Real> &sdf, T value, FlagGrid *respectFlags) const
+      int i, int j, int k, Grid<T> *grid, Grid<Real> &sdf, T value, FlagGrid *respectFlags)
   {
     if (respectFlags && respectFlags->isObstacle(i, j, k))
       return;
@@ -1040,37 +1009,35 @@ template<class T> struct ApplyMeshToGrid : public KernelBase {
     return respectFlags;
   }
   typedef FlagGrid type3;
-  void runMessage()
-  {
-    debMsg("Executing kernel ApplyMeshToGrid ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};  // empty body: no kernel debug message is printed
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, grid, sdf, value, respectFlags);
+      // maxZ > 1: slices k in [minZ, maxZ) are divided among threads; j and i start at 0.
+#pragma omp parallel
+      {
+        // Only the outer k loop is work-shared; the j and i loops run serially inside each slice.
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, grid, sdf, value, respectFlags);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, grid, sdf, value, respectFlags);
+#pragma omp parallel
+      {
+        // Single slice (k == 0): rows j in [0, _maxY) are divided among threads.
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, grid, sdf, value, respectFlags);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   Grid<T> *grid;
   Grid<Real> &sdf;
   T value;
@@ -1431,7 +1398,7 @@ template<class T> struct knSetMdataConst : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, MeshDataImpl<T> &mdata, T value) const
+  inline void op(IndexInt idx, MeshDataImpl<T> &mdata, T value)
   {
     mdata[idx] = value;
   }
@@ -1445,21 +1412,17 @@ template<class T> struct knSetMdataConst : public KernelBase {
     return value;
   }
   typedef T type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knSetMdataConst ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, mdata, value);
-  }
+  void runMessage(){};  // empty body: no kernel debug message is printed
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      // Divide the indices [0, _sz) among the threads of the surrounding parallel region.
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, mdata, value);
+    }
   }
   MeshDataImpl<T> &mdata;
   T value;
@@ -1472,7 +1435,7 @@ template<class T, class S> struct knMdataSet : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, MeshDataImpl<T> &me, const MeshDataImpl<S> &other) const
+  inline void op(IndexInt idx, MeshDataImpl<T> &me, const MeshDataImpl<S> &other)
   {
-    me[idx] += other[idx];
+    me[idx] = other[idx];  // assign: the previous "+=" made this kernel identical to knMdataAdd
   }
@@ -1486,21 +1449,17 @@ template<class T, class S> struct knMdataSet : public KernelBase {
     return other;
   }
   typedef MeshDataImpl<S> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knMdataSet ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other);
-  }
+  void runMessage(){};  // empty body: no kernel debug message is printed
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      // Divide the indices [0, _sz) among the threads of the surrounding parallel region.
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, other);
+    }
   }
   MeshDataImpl<T> &me;
   const MeshDataImpl<S> &other;
@@ -1512,7 +1471,7 @@ template<class T, class S> struct knMdataAdd : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, MeshDataImpl<T> &me, const MeshDataImpl<S> &other) const
+  inline void op(IndexInt idx, MeshDataImpl<T> &me, const MeshDataImpl<S> &other)
   {
     me[idx] += other[idx];
   }
@@ -1526,21 +1485,17 @@ template<class T, class S> struct knMdataAdd : public KernelBase {
     return other;
   }
   typedef MeshDataImpl<S> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knMdataAdd ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other);
-  }
+  void runMessage(){};  // empty body: no kernel debug message is printed
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      // Divide the indices [0, _sz) among the threads of the surrounding parallel region.
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, other);
+    }
   }
   MeshDataImpl<T> &me;
   const MeshDataImpl<S> &other;
@@ -1552,7 +1507,7 @@ template<class T, class S> struct knMdataSub : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, MeshDataImpl<T> &me, const MeshDataImpl<S> &other) const
+  inline void op(IndexInt idx, MeshDataImpl<T> &me, const MeshDataImpl<S> &other)
   {
     me[idx] -= other[idx];
   }
@@ -1566,21 +1521,17 @@ template<class T, class S> struct knMdataSub : public KernelBase {
     return other;
   }
   typedef MeshDataImpl<S> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knMdataSub ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other);
-  }
+  void runMessage(){};  // empty body: no kernel debug message is printed
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      // Divide the indices [0, _sz) among the threads of the surrounding parallel region.
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, other);
+    }
   }
   MeshDataImpl<T> &me;
   const MeshDataImpl<S> &other;
@@ -1592,7 +1543,7 @@ template<class T, class S> struct knMdataMult : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, MeshDataImpl<T> &me, const MeshDataImpl<S> &other) const
+  inline void op(IndexInt idx, MeshDataImpl<T> &me, const MeshDataImpl<S> &other)
   {
     me[idx] *= other[idx];
   }
@@ -1606,21 +1557,17 @@ template<class T, class S> struct knMdataMult : public KernelBase {
     return other;
   }
   typedef MeshDataImpl<S> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knMdataMult ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other);
-  }
+  void runMessage(){};  // empty body: no kernel debug message is printed
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      // Divide the indices [0, _sz) among the threads of the surrounding parallel region.
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, other);
+    }
   }
   MeshDataImpl<T> &me;
   const MeshDataImpl<S> &other;
@@ -1632,7 +1579,7 @@ template<class T, class S> struct knMdataDiv : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, MeshDataImpl<T> &me, const MeshDataImpl<S> &other) const
+  inline void op(IndexInt idx, MeshDataImpl<T> &me, const MeshDataImpl<S> &other)
   {
     me[idx] /= other[idx];
   }
@@ -1646,21 +1593,17 @@ template<class T, class S> struct knMdataDiv : public KernelBase {
     return other;
   }
   typedef MeshDataImpl<S> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knMdataDiv ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other);
-  }
+  void runMessage(){};  // empty body: no kernel debug message is printed
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      // Divide the indices [0, _sz) among the threads of the surrounding parallel region.
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, other);
+    }
   }
   MeshDataImpl<T> &me;
   const MeshDataImpl<S> &other;
@@ -1673,7 +1616,7 @@ template<class T, class S> struct knMdataSetScalar : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, MeshDataImpl<T> &me, const S &other) const
+  inline void op(IndexInt idx, MeshDataImpl<T> &me, const S &other)
   {
     me[idx] = other;
   }
@@ -1687,21 +1630,17 @@ template<class T, class S> struct knMdataSetScalar : public KernelBase {
     return other;
   }
   typedef S type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knMdataSetScalar ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other);
-  }
+  void runMessage(){};  // empty body: no kernel debug message is printed
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      // Divide the indices [0, _sz) among the threads of the surrounding parallel region.
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, other);
+    }
   }
   MeshDataImpl<T> &me;
   const S &other;
@@ -1713,7 +1652,7 @@ template<class T, class S> struct knMdataAddScalar : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, MeshDataImpl<T> &me, const S &other) const
+  inline void op(IndexInt idx, MeshDataImpl<T> &me, const S &other)
   {
     me[idx] += other;
   }
@@ -1727,21 +1666,17 @@ template<class T, class S> struct knMdataAddScalar : public KernelBase {
     return other;
   }
   typedef S type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knMdataAddScalar ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other);
-  }
+  void runMessage(){};  // empty body: no kernel debug message is printed
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      // Divide the indices [0, _sz) among the threads of the surrounding parallel region.
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, other);
+    }
   }
   MeshDataImpl<T> &me;
   const S &other;
@@ -1753,7 +1688,7 @@ template<class T, class S> struct knMdataMultScalar : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, MeshDataImpl<T> &me, const S &other) const
+  inline void op(IndexInt idx, MeshDataImpl<T> &me, const S &other)
   {
     me[idx] *= other;
   }
@@ -1767,21 +1702,17 @@ template<class T, class S> struct knMdataMultScalar : public KernelBase {
     return other;
   }
   typedef S type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knMdataMultScalar ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other);
-  }
+  void runMessage(){};  // empty body: no kernel debug message is printed
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      // Divide the indices [0, _sz) among the threads of the surrounding parallel region.
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, other);
+    }
   }
   MeshDataImpl<T> &me;
   const S &other;
@@ -1793,10 +1724,7 @@ template<class T, class S> struct knMdataScaledAdd : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx,
-                 MeshDataImpl<T> &me,
-                 const MeshDataImpl<T> &other,
-                 const S &factor) const
+  inline void op(IndexInt idx, MeshDataImpl<T> &me, const MeshDataImpl<T> &other, const S &factor)
   {
     me[idx] += factor * other[idx];
   }
@@ -1815,21 +1743,17 @@ template<class T, class S> struct knMdataScaledAdd : public KernelBase {
     return factor;
   }
   typedef S type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel knMdataScaledAdd ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other, factor);
-  }
+  void runMessage(){};  // empty body: no kernel debug message is printed
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      // Divide the indices [0, _sz) among the threads of the surrounding parallel region.
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, other, factor);
+    }
   }
   MeshDataImpl<T> &me;
   const MeshDataImpl<T> &other;
@@ -1843,7 +1767,7 @@ template<class T> struct knMdataSafeDiv : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, MeshDataImpl<T> &me, const MeshDataImpl<T> &other) const
+  inline void op(IndexInt idx, MeshDataImpl<T> &me, const MeshDataImpl<T> &other)
   {
     me[idx] = safeDivide(me[idx], other[idx]);
   }
@@ -1857,21 +1781,17 @@ template<class T> struct knMdataSafeDiv : public KernelBase {
     return other;
   }
   typedef MeshDataImpl<T> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knMdataSafeDiv ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other);
-  }
+  void runMessage(){};  // empty body: no kernel debug message is printed
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      // Divide the indices [0, _sz) among the threads of the surrounding parallel region.
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, other);
+    }
   }
   MeshDataImpl<T> &me;
   const MeshDataImpl<T> &other;
@@ -1883,7 +1803,7 @@ template<class T> struct knMdataSetConst : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, MeshDataImpl<T> &mdata, T value) const
+  inline void op(IndexInt idx, MeshDataImpl<T> &mdata, T value)
   {
     mdata[idx] = value;
   }
@@ -1897,21 +1817,17 @@ template<class T> struct knMdataSetConst : public KernelBase {
     return value;
   }
   typedef T type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knMdataSetConst ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, mdata, value);
-  }
+  void runMessage(){};  // empty body: no kernel debug message is printed
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      // Divide the indices [0, _sz) among the threads of the surrounding parallel region.
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, mdata, value);
+    }
   }
   MeshDataImpl<T> &mdata;
   T value;
@@ -1924,7 +1840,7 @@ template<class T> struct knMdataClamp : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, MeshDataImpl<T> &me, T min, T max) const
+  inline void op(IndexInt idx, MeshDataImpl<T> &me, T min, T max)
   {
     me[idx] = clamp(me[idx], min, max);
   }
@@ -1943,21 +1859,17 @@ template<class T> struct knMdataClamp : public KernelBase {
     return max;
   }
   typedef T type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel knMdataClamp ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, min, max);
-  }
+  void runMessage(){};  // empty body: no kernel debug message is printed
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      // Divide the indices [0, _sz) among the threads of the surrounding parallel region.
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, min, max);
+    }
   }
   MeshDataImpl<T> &me;
   T min;
@@ -1969,7 +1881,7 @@ template<class T> struct knMdataClampMin : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, MeshDataImpl<T> &me, const T vmin) const
+  inline void op(IndexInt idx, MeshDataImpl<T> &me, const T vmin)
   {
     me[idx] = std::max(vmin, me[idx]);
   }
@@ -1983,21 +1895,17 @@ template<class T> struct knMdataClampMin : public KernelBase {
     return vmin;
   }
   typedef T type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knMdataClampMin ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, vmin);
-  }
+  void runMessage(){};  // empty body: no kernel debug message is printed
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      // Divide the indices [0, _sz) among the threads of the surrounding parallel region.
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, vmin);
+    }
   }
   MeshDataImpl<T> &me;
   const T vmin;
@@ -2008,7 +1916,7 @@ template<class T> struct knMdataClampMax : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, MeshDataImpl<T> &me, const T vmax) const
+  inline void op(IndexInt idx, MeshDataImpl<T> &me, const T vmax)
   {
     me[idx] = std::min(vmax, me[idx]);
   }
@@ -2022,21 +1930,17 @@ template<class T> struct knMdataClampMax : public KernelBase {
     return vmax;
   }
   typedef T type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knMdataClampMax ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, vmax);
-  }
+  void runMessage(){};  // empty body: no kernel debug message is printed
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      // Divide the indices [0, _sz) among the threads of the surrounding parallel region.
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, vmax);
+    }
   }
   MeshDataImpl<T> &me;
   const T vmax;
@@ -2048,7 +1952,7 @@ struct knMdataClampMinVec3 : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, MeshDataImpl<Vec3> &me, const Real vmin) const
+  inline void op(IndexInt idx, MeshDataImpl<Vec3> &me, const Real vmin)
   {
     me[idx].x = std::max(vmin, me[idx].x);
     me[idx].y = std::max(vmin, me[idx].y);
@@ -2064,21 +1968,17 @@ struct knMdataClampMinVec3 : public KernelBase {
     return vmin;
   }
   typedef Real type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knMdataClampMinVec3 ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, vmin);
-  }
+  void runMessage(){};  // empty body: no kernel debug message is printed
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      // Divide the indices [0, _sz) among the threads of the surrounding parallel region.
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, vmin);
+    }
   }
   MeshDataImpl<Vec3> &me;
   const Real vmin;
@@ -2090,7 +1990,7 @@ struct knMdataClampMaxVec3 : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, MeshDataImpl<Vec3> &me, const Real vmax) const
+  inline void op(IndexInt idx, MeshDataImpl<Vec3> &me, const Real vmax)
   {
     me[idx].x = std::min(vmax, me[idx].x);
     me[idx].y = std::min(vmax, me[idx].y);
@@ -2106,21 +2006,17 @@ struct knMdataClampMaxVec3 : public KernelBase {
     return vmax;
   }
   typedef Real type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knMdataClampMaxVec3 ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, vmax);
-  }
+  void runMessage(){};  // empty body: no kernel debug message is printed
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      // Divide the indices [0, _sz) among the threads of the surrounding parallel region.
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, vmax);
+    }
   }
   MeshDataImpl<Vec3> &me;
   const Real vmax;
@@ -2162,7 +2058,7 @@ template<class T, class S> struct knMdataSetScalarIntFlag : public KernelBase {
                  MeshDataImpl<T> &me,
                  const S &other,
                  const MeshDataImpl<int> &t,
-                 const int itype) const
+                 const int itype)
   {
     if (t[idx] & itype)
       me[idx] = other;
@@ -2187,21 +2083,17 @@ template<class T, class S> struct knMdataSetScalarIntFlag : public KernelBase {
     return itype;
   }
   typedef int type3;
-  void runMessage()
-  {
-    debMsg("Executing kernel knMdataSetScalarIntFlag ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other, t, itype);
-  }
+  void runMessage(){};  // empty body: no kernel debug message is printed
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      // Divide the indices [0, _sz) among the threads of the surrounding parallel region.
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, other, t, itype);
+    }
   }
   MeshDataImpl<T> &me;
   const S &other;
@@ -2311,29 +2203,21 @@ template<typename T> struct KnPtsSum : public KernelBase {
     return itype;
   }
   typedef int type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnPtsSum ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, val, t, itype, result);
-  }
+  void runMessage(){};  // empty body: no kernel debug message is printed
   void run()
   {
-    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  KnPtsSum(KnPtsSum &o, tbb::split)
-      : KernelBase(o), val(o.val), t(o.t), itype(o.itype), result(T(0.))
-  {
-  }
-  void join(const KnPtsSum &o)
-  {
-    result += o.result;
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      T result = T(0.);  // per-thread partial sum; shadows the member of the same name
+#pragma omp for nowait
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, val, t, itype, result);
+#pragma omp critical
+      {
+        this->result += result;  // add this thread's partial to the member, one thread at a time
+      }
+    }
   }
   const MeshDataImpl<T> &val;
   const MeshDataImpl<int> *t;
@@ -2363,28 +2247,21 @@ template<typename T> struct KnPtsSumSquare : public KernelBase {
     return val;
   }
   typedef MeshDataImpl<T> type0;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnPtsSumSquare ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, val, result);
-  }
+  void runMessage(){};  // empty body: no kernel debug message is printed
   void run()
   {
-    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  KnPtsSumSquare(KnPtsSumSquare &o, tbb::split) : KernelBase(o), val(o.val), result(0.)
-  {
-  }
-  void join(const KnPtsSumSquare &o)
-  {
-    result += o.result;
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      Real result = 0.;  // per-thread partial sum; shadows the member of the same name
+#pragma omp for nowait
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, val, result);
+#pragma omp critical
+      {
+        this->result += result;  // add this thread's partial to the member, one thread at a time
+      }
+    }
   }
   const MeshDataImpl<T> &val;
   Real result;
@@ -2412,28 +2289,21 @@ template<typename T> struct KnPtsSumMagnitude : public KernelBase {
     return val;
   }
   typedef MeshDataImpl<T> type0;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnPtsSumMagnitude ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, val, result);
-  }
+  void runMessage(){};  // no-op stub: the previous debMsg kernel/range logging is removed by this change
   void run()
   {
-    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  KnPtsSumMagnitude(KnPtsSumMagnitude &o, tbb::split) : KernelBase(o), val(o.val), result(0.)
-  {
-  }
-  void join(const KnPtsSumMagnitude &o)
-  {
-    result += o.result;
+    const IndexInt _sz = size;  // loop bound read once, before the parallel region
+#pragma omp parallel
+    {  // OpenMP replacement for the removed tbb::parallel_reduce over [0, size)
+      Real result = 0.;  // thread-private partial sum; shadows the member `result` inside this region
+#pragma omp for nowait
+      for (IndexInt i = 0; i < _sz; i++)  // nowait: no barrier after the loop, threads go straight to the merge
+        op(i, val, result);  // op gets the thread-private partial (the local shadows the member)
+#pragma omp critical
+      {
+        this->result += result;  // fold this thread's partial into the member, one thread at a time
+      }
+    }
   }
   const MeshDataImpl<T> &val;
   Real result;
@@ -2479,29 +2349,21 @@ struct CompMdata_Min : public KernelBase {
     return val;
   }
   typedef MeshDataImpl<T> type0;
-  void runMessage()
-  {
-    debMsg("Executing kernel CompMdata_Min ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, val, minVal);
-  }
+  void runMessage(){};  // no-op stub: the previous debMsg kernel/range logging is removed by this change
   void run()
   {
-    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  CompMdata_Min(CompMdata_Min &o, tbb::split)
-      : KernelBase(o), val(o.val), minVal(std::numeric_limits<Real>::max())
-  {
-  }
-  void join(const CompMdata_Min &o)
-  {
-    minVal = min(minVal, o.minVal);
+    const IndexInt _sz = size;  // loop bound read once, before the parallel region
+#pragma omp parallel
+    {  // OpenMP replacement for the removed tbb::parallel_reduce over [0, size)
+      Real minVal = std::numeric_limits<Real>::max();  // thread-private running value, seeded with the largest finite Real; shadows the member
+#pragma omp for nowait
+      for (IndexInt i = 0; i < _sz; i++)  // nowait: no barrier after the loop, threads go straight to the merge
+        op(i, val, minVal);  // op gets the thread-private value (the local shadows the member)
+#pragma omp critical
+      {
+        this->minVal = min(minVal, this->minVal);  // one thread at a time; the member's prior value also takes part in the min
+      }
+    }
   }
   const MeshDataImpl<T> &val;
   Real minVal;
@@ -2534,29 +2396,21 @@ struct CompMdata_Max : public KernelBase {
     return val;
   }
   typedef MeshDataImpl<T> type0;
-  void runMessage()
-  {
-    debMsg("Executing kernel CompMdata_Max ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, val, maxVal);
-  }
+  void runMessage(){};  // no-op stub: the previous debMsg kernel/range logging is removed by this change
   void run()
   {
-    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  CompMdata_Max(CompMdata_Max &o, tbb::split)
-      : KernelBase(o), val(o.val), maxVal(-std::numeric_limits<Real>::max())
-  {
-  }
-  void join(const CompMdata_Max &o)
-  {
-    maxVal = max(maxVal, o.maxVal);
+    const IndexInt _sz = size;  // loop bound read once, before the parallel region
+#pragma omp parallel
+    {  // OpenMP replacement for the removed tbb::parallel_reduce over [0, size)
+      Real maxVal = -std::numeric_limits<Real>::max();  // thread-private running value, seeded with the negated largest finite Real; shadows the member
+#pragma omp for nowait
+      for (IndexInt i = 0; i < _sz; i++)  // nowait: no barrier after the loop, threads go straight to the merge
+        op(i, val, maxVal);  // op gets the thread-private value (the local shadows the member)
+#pragma omp critical
+      {
+        this->maxVal = max(maxVal, this->maxVal);  // one thread at a time; the member's prior value also takes part in the max
+      }
+    }
   }
   const MeshDataImpl<T> &val;
   Real maxVal;
@@ -2632,29 +2486,21 @@ struct CompMdata_MinVec3 : public KernelBase {
     return val;
   }
   typedef MeshDataImpl<Vec3> type0;
-  void runMessage()
-  {
-    debMsg("Executing kernel CompMdata_MinVec3 ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, val, minVal);
-  }
+  void runMessage(){};  // no-op stub: the previous debMsg kernel/range logging is removed by this change
   void run()
   {
-    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  CompMdata_MinVec3(CompMdata_MinVec3 &o, tbb::split)
-      : KernelBase(o), val(o.val), minVal(-std::numeric_limits<Real>::max())
-  {
-  }
-  void join(const CompMdata_MinVec3 &o)
-  {
-    minVal = min(minVal, o.minVal);
+    const IndexInt _sz = size;  // loop bound read once, before the parallel region
+#pragma omp parallel
+    {  // OpenMP replacement for the removed tbb::parallel_reduce over [0, size)
+      Real minVal = -std::numeric_limits<Real>::max();  // NOTE(review): seed is the negated largest finite Real (same as the removed TBB split ctor); if op only ever lowers it, the min-merge below always yields this seed - confirm +max() was not intended
+#pragma omp for nowait
+      for (IndexInt i = 0; i < _sz; i++)  // nowait: no barrier after the loop, threads go straight to the merge
+        op(i, val, minVal);  // op gets the thread-private value (the local shadows the member)
+#pragma omp critical
+      {
+        this->minVal = min(minVal, this->minVal);  // one thread at a time; the member's prior value also takes part in the min
+      }
+    }
   }
   const MeshDataImpl<Vec3> &val;
   Real minVal;
@@ -2686,29 +2532,21 @@ struct CompMdata_MaxVec3 : public KernelBase {
     return val;
   }
   typedef MeshDataImpl<Vec3> type0;
-  void runMessage()
-  {
-    debMsg("Executing kernel CompMdata_MaxVec3 ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, val, maxVal);
-  }
+  void runMessage(){};  // no-op stub: the previous debMsg kernel/range logging is removed by this change
   void run()
   {
-    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  CompMdata_MaxVec3(CompMdata_MaxVec3 &o, tbb::split)
-      : KernelBase(o), val(o.val), maxVal(-std::numeric_limits<Real>::min())
-  {
-  }
-  void join(const CompMdata_MaxVec3 &o)
-  {
-    maxVal = max(maxVal, o.maxVal);
+    const IndexInt _sz = size;  // loop bound read once, before the parallel region
+#pragma omp parallel
+    {  // OpenMP replacement for the removed tbb::parallel_reduce over [0, size)
+      Real maxVal = -std::numeric_limits<Real>::min();  // NOTE(review): if Real is floating-point, min() is the smallest positive normal value, so this seed sits just below zero rather than at the lowest Real (CompMdata_Max seeds with -max()); carried over from the removed TBB split ctor - confirm
+#pragma omp for nowait
+      for (IndexInt i = 0; i < _sz; i++)  // nowait: no barrier after the loop, threads go straight to the merge
+        op(i, val, maxVal);  // op gets the thread-private value (the local shadows the member)
+#pragma omp critical
+      {
+        this->maxVal = max(maxVal, this->maxVal);  // one thread at a time; the member's prior value also takes part in the max
+      }
+    }
   }
   const MeshDataImpl<Vec3> &val;
   Real maxVal;
diff --git a/extern/mantaflow/preprocessed/mesh.h b/extern/mantaflow/preprocessed/mesh.h
index b5de66ce095..d3a69abc4ea 100644
--- a/extern/mantaflow/preprocessed/mesh.h
+++ b/extern/mantaflow/preprocessed/mesh.h
@@ -796,7 +796,6 @@ class Mesh : public PbClass {
   std::vector<MeshDataImpl<int> *>
       mMdataInt;  //! indicate that mdata of this mesh is copied, and needs to be freed
   bool mFreeMdata;
-
  public:
   PbArgs _args;
 }
@@ -882,7 +881,6 @@ class MeshDataBase : public PbClass {
 
  protected:
   Mesh *mMesh;
-
  public:
   PbArgs _args;
 }
@@ -1647,7 +1645,6 @@ template<class T> class MeshDataImpl : public MeshDataBase {
   //! optionally , we might have an associated grid from which to grab new data
   Grid<T> *mpGridSource;  //! unfortunately , we need to distinguish mac vs regular vec3
   bool mGridSourceMAC;
-
  public:
   PbArgs _args;
 }
diff --git a/extern/mantaflow/preprocessed/movingobs.h b/extern/mantaflow/preprocessed/movingobs.h
index 83ef6ed0c9f..0661ddf5b37 100644
--- a/extern/mantaflow/preprocessed/movingobs.h
+++ b/extern/mantaflow/preprocessed/movingobs.h
@@ -154,7 +154,6 @@ class MovingObstacle : public PbClass {
   int mEmptyType;
   int mID;
   static int sIDcnt;
-
  public:
   PbArgs _args;
 }
diff --git a/extern/mantaflow/preprocessed/multigrid.cpp b/extern/mantaflow/preprocessed/multigrid.cpp
index 9e35c6f9368..763eede540d 100644
--- a/extern/mantaflow/preprocessed/multigrid.cpp
+++ b/extern/mantaflow/preprocessed/multigrid.cpp
@@ -428,7 +428,7 @@ struct knCopyA : public KernelBase {
                  const Grid<Real> *pA0,
                  const Grid<Real> *pAi,
                  const Grid<Real> *pAj,
-                 const Grid<Real> *pAk) const
+                 const Grid<Real> *pAk)
   {
     A0[idx * stencilSize0 + 0] = (*pA0)[idx];
     A0[idx * stencilSize0 + 1] = (*pAi)[idx];
@@ -476,21 +476,17 @@ struct knCopyA : public KernelBase {
     return pAk;
   }
   typedef Grid<Real> type7;
-  void runMessage()
-  {
-    debMsg("Executing kernel knCopyA ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, sizeRef, A0, stencilSize0, is3D, pA0, pAi, pAj, pAk);
-  }
+  void runMessage(){};  // no-op stub: the previous debMsg kernel/range logging is removed by this change
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;  // loop bound read once, before the parallel region
+#pragma omp parallel
+    {  // OpenMP replacement for the removed tbb::parallel_for over [0, size)
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)  // iterations are shared out among the threads of the enclosing region
+        op(i, sizeRef, A0, stencilSize0, is3D, pA0, pAi, pAj, pAk);  // one op call per index
+    }
   }
   std::vector<Real> &sizeRef;
   std::vector<Real> &A0;
@@ -523,7 +519,7 @@ struct knActivateVertices : public KernelBase {
                  std::vector<Real> &A0,
                  bool &nonZeroStencilSumFound,
                  bool &trivialEquationsFound,
-                 const GridMg &mg) const
+                 const GridMg &mg)
   {
     // active vertices on level 0 are vertices with non-zero diagonal entry in A
     type_0[idx] = GridMg::vtInactive;
@@ -572,21 +568,17 @@ struct knActivateVertices : public KernelBase {
     return mg;
   }
   typedef GridMg type4;
-  void runMessage()
-  {
-    debMsg("Executing kernel knActivateVertices ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, type_0, A0, nonZeroStencilSumFound, trivialEquationsFound, mg);
-  }
+  void runMessage(){};  // no-op stub: the previous debMsg kernel/range logging is removed by this change
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;  // loop bound read once, before the parallel region
+#pragma omp parallel
+    {  // OpenMP replacement for the removed tbb::parallel_for over [0, size)
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)  // iterations are shared out among the threads of the enclosing region
+        op(i, type_0, A0, nonZeroStencilSumFound, trivialEquationsFound, mg);  // NOTE(review): op takes the two flags as bool&, so all threads share them with no synchronization in run(); whether op writes them is not visible here - verify
+    }
   }
   std::vector<GridMg::VertexType> &type_0;
   std::vector<Real> &A0;
@@ -642,7 +634,7 @@ struct knSetRhs : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, std::vector<Real> &b, const Grid<Real> &rhs, const GridMg &mg) const
+  inline void op(IndexInt idx, std::vector<Real> &b, const Grid<Real> &rhs, const GridMg &mg)
   {
     b[idx] = rhs[idx];
 
@@ -666,21 +658,17 @@ struct knSetRhs : public KernelBase {
     return mg;
   }
   typedef GridMg type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel knSetRhs ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, b, rhs, mg);
-  }
+  void runMessage(){};  // no-op stub: the previous debMsg kernel/range logging is removed by this change
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;  // loop bound read once, before the parallel region
+#pragma omp parallel
+    {  // OpenMP replacement for the removed tbb::parallel_for over [0, size)
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)  // iterations are shared out among the threads of the enclosing region
+        op(i, b, rhs, mg);  // one op call per index
+    }
   }
   std::vector<Real> &b;
   const Grid<Real> &rhs;
@@ -702,7 +690,7 @@ template<class T> struct knSet : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, std::vector<T> &data, T value) const
+  inline void op(IndexInt idx, std::vector<T> &data, T value)
   {
     data[idx] = value;
   }
@@ -716,21 +704,17 @@ template<class T> struct knSet : public KernelBase {
     return value;
   }
   typedef T type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knSet ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, data, value);
-  }
+  void runMessage(){};  // no-op stub: the previous debMsg kernel/range logging is removed by this change
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;  // loop bound read once, before the parallel region
+#pragma omp parallel
+    {  // OpenMP replacement for the removed tbb::parallel_for over [0, size)
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)  // iterations are shared out among the threads of the enclosing region
+        op(i, data, value);  // one op call per index
+    }
   }
   std::vector<T> &data;
   T value;
@@ -743,7 +727,7 @@ template<class T> struct knCopyToVector : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, std::vector<T> &dst, const Grid<T> &src) const
+  inline void op(IndexInt idx, std::vector<T> &dst, const Grid<T> &src)
   {
     dst[idx] = src[idx];
   }
@@ -757,21 +741,17 @@ template<class T> struct knCopyToVector : public KernelBase {
     return src;
   }
   typedef Grid<T> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knCopyToVector ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, dst, src);
-  }
+  void runMessage(){};  // no-op stub: the previous debMsg kernel/range logging is removed by this change
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;  // loop bound read once, before the parallel region
+#pragma omp parallel
+    {  // OpenMP replacement for the removed tbb::parallel_for over [0, size)
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)  // iterations are shared out among the threads of the enclosing region
+        op(i, dst, src);  // one op call per index
+    }
   }
   std::vector<T> &dst;
   const Grid<T> &src;
@@ -784,7 +764,7 @@ template<class T> struct knCopyToGrid : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, const std::vector<T> &src, Grid<T> &dst) const
+  inline void op(IndexInt idx, const std::vector<T> &src, Grid<T> &dst)
   {
     dst[idx] = src[idx];
   }
@@ -798,21 +778,17 @@ template<class T> struct knCopyToGrid : public KernelBase {
     return dst;
   }
   typedef Grid<T> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knCopyToGrid ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, src, dst);
-  }
+  void runMessage(){};  // no-op stub: the previous debMsg kernel/range logging is removed by this change
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;  // loop bound read once, before the parallel region
+#pragma omp parallel
+    {  // OpenMP replacement for the removed tbb::parallel_for over [0, size)
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)  // iterations are shared out among the threads of the enclosing region
+        op(i, src, dst);  // one op call per index
+    }
   }
   const std::vector<T> &src;
   Grid<T> &dst;
@@ -825,7 +801,7 @@ template<class T> struct knAddAssign : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, std::vector<T> &dst, const std::vector<T> &src) const
+  inline void op(IndexInt idx, std::vector<T> &dst, const std::vector<T> &src)
   {
     dst[idx] += src[idx];
   }
@@ -839,21 +815,17 @@ template<class T> struct knAddAssign : public KernelBase {
     return src;
   }
   typedef std::vector<T> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knAddAssign ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, dst, src);
-  }
+  void runMessage(){};  // no-op stub: the previous debMsg kernel/range logging is removed by this change
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;  // loop bound read once, before the parallel region
+#pragma omp parallel
+    {  // OpenMP replacement for the removed tbb::parallel_for over [0, size)
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)  // iterations are shared out among the threads of the enclosing region
+        op(i, dst, src);  // one op call per index
+    }
   }
   std::vector<T> &dst;
   const std::vector<T> &src;
@@ -930,7 +902,7 @@ struct knActivateCoarseVertices : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, std::vector<GridMg::VertexType> &type, int unused) const
+  inline void op(IndexInt idx, std::vector<GridMg::VertexType> &type, int unused)
   {
     // set all remaining 'free' vertices to 'removed',
     if (type[idx] == GridMg::vtFree)
@@ -952,21 +924,17 @@ struct knActivateCoarseVertices : public KernelBase {
     return unused;
   }
   typedef int type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knActivateCoarseVertices ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, type, unused);
-  }
+  void runMessage(){};  // no-op stub: the previous debMsg kernel/range logging is removed by this change
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;  // loop bound read once, before the parallel region
+#pragma omp parallel
+    {  // OpenMP replacement for the removed tbb::parallel_for over [0, size)
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)  // iterations are shared out among the threads of the enclosing region
+        op(i, type, unused);  // one op call per index
+    }
   }
   std::vector<GridMg::VertexType> &type;
   int unused;
@@ -1052,11 +1020,8 @@ struct knGenCoarseGridOperator : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx,
-                 std::vector<Real> &sizeRef,
-                 std::vector<Real> &A,
-                 int l,
-                 const GridMg &mg) const
+  inline void op(
+      IndexInt idx, std::vector<Real> &sizeRef, std::vector<Real> &A, int l, const GridMg &mg)
   {
     if (mg.mType[l][idx] == GridMg::vtInactive)
       return;
@@ -1178,21 +1143,17 @@ struct knGenCoarseGridOperator : public KernelBase {
     return mg;
   }
   typedef GridMg type3;
-  void runMessage()
-  {
-    debMsg("Executing kernel knGenCoarseGridOperator ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, sizeRef, A, l, mg);
-  }
+  void runMessage(){};  // no-op stub: the previous debMsg kernel/range logging is removed by this change
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;  // loop bound read once, before the parallel region
+#pragma omp parallel
+    {  // OpenMP replacement for the removed tbb::parallel_for over [0, size)
+
+#pragma omp for schedule(static, 1)
+      for (IndexInt i = 0; i < _sz; i++)  // schedule(static, 1): chunks of one iteration, dealt to threads round-robin
+        op(i, sizeRef, A, l, mg);  // one op call per index
+    }
   }
   std::vector<Real> &sizeRef;
   std::vector<Real> &A;
@@ -1232,7 +1193,7 @@ struct knSmoothColor : public KernelBase {
                  const Vec3i &blockSize,
                  const std::vector<Vec3i> &colorOffs,
                  int l,
-                 const GridMg &mg) const
+                 const GridMg &mg)
   {
     Vec3i blockOff(int(idx) % blockSize.x,
                    (int(idx) % (blockSize.x * blockSize.y)) / blockSize.x,
@@ -1318,21 +1279,17 @@ struct knSmoothColor : public KernelBase {
     return mg;
   }
   typedef GridMg type5;
-  void runMessage()
-  {
-    debMsg("Executing kernel knSmoothColor ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, numBlocks, x, blockSize, colorOffs, l, mg);
-  }
+  void runMessage(){};  // no-op stub: the previous debMsg kernel/range logging is removed by this change
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;  // loop bound read once, before the parallel region
+#pragma omp parallel
+    {  // OpenMP replacement for the removed tbb::parallel_for over [0, size)
+
+#pragma omp for schedule(static, 1)
+      for (IndexInt i = 0; i < _sz; i++)  // schedule(static, 1): chunks of one iteration, dealt to threads round-robin
+        op(i, numBlocks, x, blockSize, colorOffs, l, mg);  // one op call per index
+    }
   }
   ThreadSize &numBlocks;
   std::vector<Real> &x;
@@ -1386,7 +1343,7 @@ struct knCalcResidual : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, std::vector<Real> &r, int l, const GridMg &mg) const
+  inline void op(IndexInt idx, std::vector<Real> &r, int l, const GridMg &mg)
   {
     if (mg.mType[l][idx] == GridMg::vtInactive)
       return;
@@ -1443,21 +1400,17 @@ struct knCalcResidual : public KernelBase {
     return mg;
   }
   typedef GridMg type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel knCalcResidual ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, r, l, mg);
-  }
+  void runMessage(){};  // no-op stub: the previous debMsg kernel/range logging is removed by this change
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;  // loop bound read once, before the parallel region
+#pragma omp parallel
+    {  // OpenMP replacement for the removed tbb::parallel_for over [0, size)
+
+#pragma omp for schedule(static, 1)
+      for (IndexInt i = 0; i < _sz; i++)  // schedule(static, 1): chunks of one iteration, dealt to threads round-robin
+        op(i, r, l, mg);  // one op call per index
+    }
   }
   std::vector<Real> &r;
   int l;
@@ -1506,29 +1459,21 @@ struct knResidualNormSumSqr : public KernelBase {
     return mg;
   }
   typedef GridMg type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel knResidualNormSumSqr ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, r, l, mg, result);
-  }
+  void runMessage(){};  // no-op stub: the previous debMsg kernel/range logging is removed by this change
   void run()
   {
-    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  knResidualNormSumSqr(knResidualNormSumSqr &o, tbb::split)
-      : KernelBase(o), r(o.r), l(o.l), mg(o.mg), result(Real(0))
-  {
-  }
-  void join(const knResidualNormSumSqr &o)
-  {
-    result += o.result;
+    const IndexInt _sz = size;  // loop bound read once, before the parallel region
+#pragma omp parallel
+    {  // OpenMP replacement for the removed tbb::parallel_reduce over [0, size)
+      Real result = Real(0);  // thread-private partial sum; shadows the member `result` inside this region
+#pragma omp for nowait
+      for (IndexInt i = 0; i < _sz; i++)  // nowait: no barrier after the loop, threads go straight to the merge
+        op(i, r, l, mg, result);  // op gets the thread-private partial (the local shadows the member)
+#pragma omp critical
+      {
+        this->result += result;  // fold this thread's partial into the member, one thread at a time
+      }
+    }
   }
   const vector<Real> &r;
   int l;
@@ -1700,7 +1645,7 @@ struct knRestrict : public KernelBase {
                  std::vector<Real> &dst,
                  const std::vector<Real> &src,
                  int l_dst,
-                 const GridMg &mg) const
+                 const GridMg &mg)
   {
     if (mg.mType[l_dst][idx] == GridMg::vtInactive)
       return;
@@ -1746,21 +1691,17 @@ struct knRestrict : public KernelBase {
     return mg;
   }
   typedef GridMg type3;
-  void runMessage()
-  {
-    debMsg("Executing kernel knRestrict ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, dst, src, l_dst, mg);
-  }
+  void runMessage(){};  // no-op stub: the previous debMsg kernel/range logging is removed by this change
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;  // loop bound read once, before the parallel region
+#pragma omp parallel
+    {  // OpenMP replacement for the removed tbb::parallel_for over [0, size)
+
+#pragma omp for schedule(static, 1)
+      for (IndexInt i = 0; i < _sz; i++)  // schedule(static, 1): chunks of one iteration, dealt to threads round-robin
+        op(i, dst, src, l_dst, mg);  // one op call per index
+    }
   }
   std::vector<Real> &dst;
   const std::vector<Real> &src;
@@ -1784,7 +1725,7 @@ struct knInterpolate : public KernelBase {
                  std::vector<Real> &dst,
                  const std::vector<Real> &src,
                  int l_dst,
-                 const GridMg &mg) const
+                 const GridMg &mg)
   {
     if (mg.mType[l_dst][idx] == GridMg::vtInactive)
       return;
@@ -1827,21 +1768,17 @@ struct knInterpolate : public KernelBase {
     return mg;
   }
   typedef GridMg type3;
-  void runMessage()
-  {
-    debMsg("Executing kernel knInterpolate ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, dst, src, l_dst, mg);
-  }
+  void runMessage(){};  // no-op stub: the previous debMsg kernel/range logging is removed by this change
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;  // loop bound read once, before the parallel region
+#pragma omp parallel
+    {  // OpenMP replacement for the removed tbb::parallel_for over [0, size)
+
+#pragma omp for schedule(static, 1)
+      for (IndexInt i = 0; i < _sz; i++)  // schedule(static, 1): chunks of one iteration, dealt to threads round-robin
+        op(i, dst, src, l_dst, mg);  // one op call per index
+    }
   }
   std::vector<Real> &dst;
   const std::vector<Real> &src;
diff --git a/extern/mantaflow/preprocessed/noisefield.h b/extern/mantaflow/preprocessed/noisefield.h
index 6ed8ac0012d..73c9de779ef 100644
--- a/extern/mantaflow/preprocessed/noisefield.h
+++ b/extern/mantaflow/preprocessed/noisefield.h
@@ -236,7 +236,6 @@ class WaveletNoiseField : public PbClass {
   static int randomSeed;
   // global reference count for noise tile
   static std::atomic<int> mNoiseReferenceCount;
-
  public:
   PbArgs _args;
 }
diff --git a/extern/mantaflow/preprocessed/particle.cpp b/extern/mantaflow/preprocessed/particle.cpp
index ad1c344d307..45234389cfe 100644
--- a/extern/mantaflow/preprocessed/particle.cpp
+++ b/extern/mantaflow/preprocessed/particle.cpp
@@ -455,7 +455,7 @@ template<class T, class S> struct knPdataAdd : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, ParticleDataImpl<T> &me, const ParticleDataImpl<S> &other) const
+  inline void op(IndexInt idx, ParticleDataImpl<T> &me, const ParticleDataImpl<S> &other)
   {
     me[idx] += other[idx];
   }
@@ -469,21 +469,17 @@ template<class T, class S> struct knPdataAdd : public KernelBase {
     return other;
   }
   typedef ParticleDataImpl<S> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knPdataAdd ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other);
-  }
+  void runMessage(){};  // no-op stub: the previous debMsg kernel/range logging is removed by this change
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;  // loop bound read once, before the parallel region
+#pragma omp parallel
+    {  // OpenMP replacement for the removed tbb::parallel_for over [0, size)
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)  // iterations are shared out among the threads of the enclosing region
+        op(i, me, other);  // one op call per index
+    }
   }
   ParticleDataImpl<T> &me;
   const ParticleDataImpl<S> &other;
@@ -495,7 +491,7 @@ template<class T, class S> struct knPdataSub : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, ParticleDataImpl<T> &me, const ParticleDataImpl<S> &other) const
+  inline void op(IndexInt idx, ParticleDataImpl<T> &me, const ParticleDataImpl<S> &other)
   {
     me[idx] -= other[idx];
   }
@@ -509,21 +505,17 @@ template<class T, class S> struct knPdataSub : public KernelBase {
     return other;
   }
   typedef ParticleDataImpl<S> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knPdataSub ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other);
-  }
+  void runMessage(){};  // no-op stub: the previous debMsg kernel/range logging is removed by this change
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;  // loop bound read once, before the parallel region
+#pragma omp parallel
+    {  // OpenMP replacement for the removed tbb::parallel_for over [0, size)
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)  // iterations are shared out among the threads of the enclosing region
+        op(i, me, other);  // one op call per index
+    }
   }
   ParticleDataImpl<T> &me;
   const ParticleDataImpl<S> &other;
@@ -535,7 +527,7 @@ template<class T, class S> struct knPdataMult : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, ParticleDataImpl<T> &me, const ParticleDataImpl<S> &other) const
+  inline void op(IndexInt idx, ParticleDataImpl<T> &me, const ParticleDataImpl<S> &other)
   {
     me[idx] *= other[idx];
   }
@@ -549,21 +541,17 @@ template<class T, class S> struct knPdataMult : public KernelBase {
     return other;
   }
   typedef ParticleDataImpl<S> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knPdataMult ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other);
-  }
+  void runMessage(){};  // no-op stub: the previous debMsg kernel/range logging is removed by this change
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;  // loop bound read once, before the parallel region
+#pragma omp parallel
+    {  // OpenMP replacement for the removed tbb::parallel_for over [0, size)
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)  // iterations are shared out among the threads of the enclosing region
+        op(i, me, other);  // one op call per index
+    }
   }
   ParticleDataImpl<T> &me;
   const ParticleDataImpl<S> &other;
@@ -575,7 +563,7 @@ template<class T, class S> struct knPdataDiv : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, ParticleDataImpl<T> &me, const ParticleDataImpl<S> &other) const
+  inline void op(IndexInt idx, ParticleDataImpl<T> &me, const ParticleDataImpl<S> &other)
   {
     me[idx] /= other[idx];
   }
@@ -589,21 +577,17 @@ template<class T, class S> struct knPdataDiv : public KernelBase {
     return other;
   }
   typedef ParticleDataImpl<S> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knPdataDiv ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other);
-  }
+  void runMessage(){};  // no-op stub: the previous debMsg kernel/range logging is removed by this change
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;  // loop bound read once, before the parallel region
+#pragma omp parallel
+    {  // OpenMP replacement for the removed tbb::parallel_for over [0, size)
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)  // iterations are shared out among the threads of the enclosing region
+        op(i, me, other);  // one op call per index
+    }
   }
   ParticleDataImpl<T> &me;
   const ParticleDataImpl<S> &other;
@@ -615,7 +599,7 @@ template<class T> struct knPdataSafeDiv : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, ParticleDataImpl<T> &me, const ParticleDataImpl<T> &other) const
+  inline void op(IndexInt idx, ParticleDataImpl<T> &me, const ParticleDataImpl<T> &other)
   {
     me[idx] = safeDivide(me[idx], other[idx]);
   }
@@ -629,21 +613,17 @@ template<class T> struct knPdataSafeDiv : public KernelBase {
     return other;
   }
   typedef ParticleDataImpl<T> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knPdataSafeDiv ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other);
-  }
+  void runMessage(){};  // no-op stub: the previous debMsg kernel/range logging is removed by this change
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;  // loop bound read once, before the parallel region
+#pragma omp parallel
+    {  // OpenMP replacement for the removed tbb::parallel_for over [0, size)
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)  // iterations are shared out among the threads of the enclosing region
+        op(i, me, other);  // one op call per index
+    }
   }
   ParticleDataImpl<T> &me;
   const ParticleDataImpl<T> &other;
@@ -656,7 +636,7 @@ template<class T, class S> struct knPdataSetScalar : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, ParticleDataImpl<T> &me, const S &other) const
+  inline void op(IndexInt idx, ParticleDataImpl<T> &me, const S &other)
   {
     me[idx] = other;
   }
@@ -670,21 +650,17 @@ template<class T, class S> struct knPdataSetScalar : public KernelBase {
     return other;
   }
   typedef S type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knPdataSetScalar ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other);
-  }
+  void runMessage(){};  // no-op stub: the previous debMsg kernel/range logging is removed by this change
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;  // loop bound read once, before the parallel region
+#pragma omp parallel
+    {  // OpenMP replacement for the removed tbb::parallel_for over [0, size)
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)  // iterations are shared out among the threads of the enclosing region
+        op(i, me, other);  // one op call per index
+    }
   }
   ParticleDataImpl<T> &me;
   const S &other;
@@ -696,7 +672,7 @@ template<class T, class S> struct knPdataAddScalar : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, ParticleDataImpl<T> &me, const S &other) const
+  inline void op(IndexInt idx, ParticleDataImpl<T> &me, const S &other)
   {
     me[idx] += other;
   }
@@ -710,21 +686,17 @@ template<class T, class S> struct knPdataAddScalar : public KernelBase {
     return other;
   }
   typedef S type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knPdataAddScalar ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, other);
+    }
   }
   ParticleDataImpl<T> &me;
   const S &other;
@@ -736,7 +708,7 @@ template<class T, class S> struct knPdataMultScalar : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, ParticleDataImpl<T> &me, const S &other) const
+  inline void op(IndexInt idx, ParticleDataImpl<T> &me, const S &other)
   {
     me[idx] *= other;
   }
@@ -750,21 +722,17 @@ template<class T, class S> struct knPdataMultScalar : public KernelBase {
     return other;
   }
   typedef S type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knPdataMultScalar ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, other);
+    }
   }
   ParticleDataImpl<T> &me;
   const S &other;
@@ -779,7 +747,7 @@ template<class T, class S> struct knPdataScaledAdd : public KernelBase {
   inline void op(IndexInt idx,
                  ParticleDataImpl<T> &me,
                  const ParticleDataImpl<T> &other,
-                 const S &factor) const
+                 const S &factor)
   {
     me[idx] += factor * other[idx];
   }
@@ -798,21 +766,17 @@ template<class T, class S> struct knPdataScaledAdd : public KernelBase {
     return factor;
   }
   typedef S type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel knPdataScaledAdd ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other, factor);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, other, factor);
+    }
   }
   ParticleDataImpl<T> &me;
   const ParticleDataImpl<T> &other;
@@ -826,7 +790,7 @@ template<class T> struct knPdataClamp : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, ParticleDataImpl<T> &me, const T vmin, const T vmax) const
+  inline void op(IndexInt idx, ParticleDataImpl<T> &me, const T vmin, const T vmax)
   {
     me[idx] = clamp(me[idx], vmin, vmax);
   }
@@ -845,21 +809,17 @@ template<class T> struct knPdataClamp : public KernelBase {
     return vmax;
   }
   typedef T type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel knPdataClamp ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, vmin, vmax);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, vmin, vmax);
+    }
   }
   ParticleDataImpl<T> &me;
   const T vmin;
@@ -872,7 +832,7 @@ template<class T> struct knPdataClampMin : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, ParticleDataImpl<T> &me, const T vmin) const
+  inline void op(IndexInt idx, ParticleDataImpl<T> &me, const T vmin)
   {
     me[idx] = std::max(vmin, me[idx]);
   }
@@ -886,21 +846,17 @@ template<class T> struct knPdataClampMin : public KernelBase {
     return vmin;
   }
   typedef T type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knPdataClampMin ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, vmin);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, vmin);
+    }
   }
   ParticleDataImpl<T> &me;
   const T vmin;
@@ -912,7 +868,7 @@ template<class T> struct knPdataClampMax : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, ParticleDataImpl<T> &me, const T vmax) const
+  inline void op(IndexInt idx, ParticleDataImpl<T> &me, const T vmax)
   {
     me[idx] = std::min(vmax, me[idx]);
   }
@@ -926,21 +882,17 @@ template<class T> struct knPdataClampMax : public KernelBase {
     return vmax;
   }
   typedef T type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knPdataClampMax ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, vmax);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, vmax);
+    }
   }
   ParticleDataImpl<T> &me;
   const T vmax;
@@ -953,7 +905,7 @@ struct knPdataClampMinVec3 : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, ParticleDataImpl<Vec3> &me, const Real vmin) const
+  inline void op(IndexInt idx, ParticleDataImpl<Vec3> &me, const Real vmin)
   {
     me[idx].x = std::max(vmin, me[idx].x);
     me[idx].y = std::max(vmin, me[idx].y);
@@ -969,21 +921,17 @@ struct knPdataClampMinVec3 : public KernelBase {
     return vmin;
   }
   typedef Real type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knPdataClampMinVec3 ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, vmin);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, vmin);
+    }
   }
   ParticleDataImpl<Vec3> &me;
   const Real vmin;
@@ -996,7 +944,7 @@ struct knPdataClampMaxVec3 : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, ParticleDataImpl<Vec3> &me, const Real vmax) const
+  inline void op(IndexInt idx, ParticleDataImpl<Vec3> &me, const Real vmax)
   {
     me[idx].x = std::min(vmax, me[idx].x);
     me[idx].y = std::min(vmax, me[idx].y);
@@ -1012,21 +960,17 @@ struct knPdataClampMaxVec3 : public KernelBase {
     return vmax;
   }
   typedef Real type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knPdataClampMaxVec3 ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, vmax);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, vmax);
+    }
   }
   ParticleDataImpl<Vec3> &me;
   const Real vmax;
@@ -1071,7 +1015,7 @@ template<class T, class S> struct knPdataSetScalarIntFlag : public KernelBase {
                  ParticleDataImpl<T> &me,
                  const S &other,
                  const ParticleDataImpl<int> &t,
-                 const int itype) const
+                 const int itype)
   {
     if (t[idx] & itype)
       me[idx] = other;
@@ -1096,21 +1040,17 @@ template<class T, class S> struct knPdataSetScalarIntFlag : public KernelBase {
     return itype;
   }
   typedef int type3;
-  void runMessage()
-  {
-    debMsg("Executing kernel knPdataSetScalarIntFlag ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other, t, itype);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, other, t, itype);
+    }
   }
   ParticleDataImpl<T> &me;
   const S &other;
@@ -1223,29 +1163,21 @@ template<typename T> struct KnPtsSum : public KernelBase {
     return itype;
   }
   typedef int type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnPtsSum ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, val, t, itype, result);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  KnPtsSum(KnPtsSum &o, tbb::split)
-      : KernelBase(o), val(o.val), t(o.t), itype(o.itype), result(T(0.))
-  {
-  }
-  void join(const KnPtsSum &o)
-  {
-    result += o.result;
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      T result = T(0.);
+#pragma omp for nowait
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, val, t, itype, result);
+#pragma omp critical
+      {
+        this->result += result;
+      }
+    }
   }
   const ParticleDataImpl<T> &val;
   const ParticleDataImpl<int> *t;
@@ -1275,28 +1207,21 @@ template<typename T> struct KnPtsSumSquare : public KernelBase {
     return val;
   }
   typedef ParticleDataImpl<T> type0;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnPtsSumSquare ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, val, result);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  KnPtsSumSquare(KnPtsSumSquare &o, tbb::split) : KernelBase(o), val(o.val), result(0.)
-  {
-  }
-  void join(const KnPtsSumSquare &o)
-  {
-    result += o.result;
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      Real result = 0.;
+#pragma omp for nowait
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, val, result);
+#pragma omp critical
+      {
+        this->result += result;
+      }
+    }
   }
   const ParticleDataImpl<T> &val;
   Real result;
@@ -1324,28 +1249,21 @@ template<typename T> struct KnPtsSumMagnitude : public KernelBase {
     return val;
   }
   typedef ParticleDataImpl<T> type0;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnPtsSumMagnitude ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, val, result);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  KnPtsSumMagnitude(KnPtsSumMagnitude &o, tbb::split) : KernelBase(o), val(o.val), result(0.)
-  {
-  }
-  void join(const KnPtsSumMagnitude &o)
-  {
-    result += o.result;
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      Real result = 0.;
+#pragma omp for nowait
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, val, result);
+#pragma omp critical
+      {
+        this->result += result;
+      }
+    }
   }
   const ParticleDataImpl<T> &val;
   Real result;
@@ -1392,29 +1310,21 @@ struct CompPdata_Min : public KernelBase {
     return val;
   }
   typedef ParticleDataImpl<T> type0;
-  void runMessage()
-  {
-    debMsg("Executing kernel CompPdata_Min ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, val, minVal);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  CompPdata_Min(CompPdata_Min &o, tbb::split)
-      : KernelBase(o), val(o.val), minVal(std::numeric_limits<Real>::max())
-  {
-  }
-  void join(const CompPdata_Min &o)
-  {
-    minVal = min(minVal, o.minVal);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      Real minVal = std::numeric_limits<Real>::max();
+#pragma omp for nowait
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, val, minVal);
+#pragma omp critical
+      {
+        this->minVal = min(minVal, this->minVal);
+      }
+    }
   }
   const ParticleDataImpl<T> &val;
   Real minVal;
@@ -1447,29 +1357,21 @@ struct CompPdata_Max : public KernelBase {
     return val;
   }
   typedef ParticleDataImpl<T> type0;
-  void runMessage()
-  {
-    debMsg("Executing kernel CompPdata_Max ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, val, maxVal);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  CompPdata_Max(CompPdata_Max &o, tbb::split)
-      : KernelBase(o), val(o.val), maxVal(-std::numeric_limits<Real>::max())
-  {
-  }
-  void join(const CompPdata_Max &o)
-  {
-    maxVal = max(maxVal, o.maxVal);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      Real maxVal = -std::numeric_limits<Real>::max();
+#pragma omp for nowait
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, val, maxVal);
+#pragma omp critical
+      {
+        this->maxVal = max(maxVal, this->maxVal);
+      }
+    }
   }
   const ParticleDataImpl<T> &val;
   Real maxVal;
@@ -1545,29 +1447,21 @@ struct CompPdata_MinVec3 : public KernelBase {
     return val;
   }
   typedef ParticleDataImpl<Vec3> type0;
-  void runMessage()
-  {
-    debMsg("Executing kernel CompPdata_MinVec3 ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, val, minVal);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  CompPdata_MinVec3(CompPdata_MinVec3 &o, tbb::split)
-      : KernelBase(o), val(o.val), minVal(std::numeric_limits<Real>::max())
-  {
-  }
-  void join(const CompPdata_MinVec3 &o)
-  {
-    minVal = min(minVal, o.minVal);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      Real minVal = std::numeric_limits<Real>::max();
+#pragma omp for nowait
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, val, minVal);
+#pragma omp critical
+      {
+        this->minVal = min(minVal, this->minVal);
+      }
+    }
   }
   const ParticleDataImpl<Vec3> &val;
   Real minVal;
@@ -1599,29 +1493,21 @@ struct CompPdata_MaxVec3 : public KernelBase {
     return val;
   }
   typedef ParticleDataImpl<Vec3> type0;
-  void runMessage()
-  {
-    debMsg("Executing kernel CompPdata_MaxVec3 ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, val, maxVal);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  CompPdata_MaxVec3(CompPdata_MaxVec3 &o, tbb::split)
-      : KernelBase(o), val(o.val), maxVal(-std::numeric_limits<Real>::max())
-  {
-  }
-  void join(const CompPdata_MaxVec3 &o)
-  {
-    maxVal = max(maxVal, o.maxVal);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      Real maxVal = -std::numeric_limits<Real>::max();
+#pragma omp for nowait
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, val, maxVal);
+#pragma omp critical
+      {
+        this->maxVal = max(maxVal, this->maxVal);
+      }
+    }
   }
   const ParticleDataImpl<Vec3> &val;
   Real maxVal;
diff --git a/extern/mantaflow/preprocessed/particle.h b/extern/mantaflow/preprocessed/particle.h
index 7e0c64e6d03..1e3abcaca20 100644
--- a/extern/mantaflow/preprocessed/particle.h
+++ b/extern/mantaflow/preprocessed/particle.h
@@ -205,7 +205,6 @@ class ParticleBase : public PbClass {
   //! custom seed for particle systems, used by plugins
   int mSeed;  //! fix global random seed storage, used mainly by functions in this class
   static int globalSeed;
-
  public:
   PbArgs _args;
 }
@@ -629,7 +628,6 @@ template<class S> class ParticleSystem : public ParticleBase {
   std::vector<S> mData;
   //! reduce storage , called by doCompress
   virtual void compress();
-
  public:
   PbArgs _args;
 }
@@ -920,7 +918,6 @@ class ParticleIndexSystem : public ParticleSystem<ParticleIndexData> {
       return -1;
     }
   };
-
  public:
   PbArgs _args;
 }
@@ -985,7 +982,6 @@ template<class DATA, class CON> class ConnectedParticleSystem : public ParticleS
  protected:
   std::vector<CON> mSegments;
   virtual void compress();
-
  public:
   PbArgs _args;
 }
@@ -1075,7 +1071,6 @@ class ParticleDataBase : public PbClass {
 
  protected:
   ParticleBase *mpParticleSys;
-
  public:
   PbArgs _args;
 }
@@ -1848,7 +1843,6 @@ template<class T> class ParticleDataImpl : public ParticleDataBase {
   //! optionally , we might have an associated grid from which to grab new data
   Grid<T> *mpGridSource;  //! unfortunately , we need to distinguish mac vs regular vec3
   bool mGridSourceMAC;
-
  public:
   PbArgs _args;
 }
@@ -1912,19 +1906,17 @@ template<class S> void ParticleSystem<S>::transformPositions(Vec3i dimOld, Vec3i
 
 // check for deletion/invalid position, otherwise return velocity
 
-template<class S> struct _GridAdvectKernel : public KernelBase {
-  _GridAdvectKernel(const KernelBase &base,
-                    std::vector<S> &p,
-                    const MACGrid &vel,
-                    const FlagGrid &flags,
-                    const Real dt,
-                    const bool deleteInObstacle,
-                    const bool stopInObstacle,
-                    const bool skipNew,
-                    const ParticleDataImpl<int> *ptype,
-                    const int exclude,
-                    std::vector<Vec3> &u)
-      : KernelBase(base),
+template<class S> struct GridAdvectKernel : public KernelBase {
+  GridAdvectKernel(std::vector<S> &p,
+                   const MACGrid &vel,
+                   const FlagGrid &flags,
+                   const Real dt,
+                   const bool deleteInObstacle,
+                   const bool stopInObstacle,
+                   const bool skipNew,
+                   const ParticleDataImpl<int> *ptype,
+                   const int exclude)
+      : KernelBase(p.size()),
         p(p),
         vel(vel),
         flags(flags),
@@ -1934,8 +1926,10 @@ template<class S> struct _GridAdvectKernel : public KernelBase {
         skipNew(skipNew),
         ptype(ptype),
         exclude(exclude),
-        u(u)
+        u((size))
   {
+    runMessage();
+    run();
   }
   inline void op(IndexInt idx,
                  std::vector<S> &p,
@@ -1947,7 +1941,7 @@ template<class S> struct _GridAdvectKernel : public KernelBase {
                  const bool skipNew,
                  const ParticleDataImpl<int> *ptype,
                  const int exclude,
-                 std::vector<Vec3> &u) const
+                 std::vector<Vec3> &u)
   {
     if ((p[idx].flag & ParticleBase::PDELETE) || (ptype && ((*ptype)[idx] & exclude)) ||
         (skipNew && (p[idx].flag & ParticleBase::PNEW))) {
@@ -1968,66 +1962,6 @@ template<class S> struct _GridAdvectKernel : public KernelBase {
     }
     u[idx] = vel.getInterpolated(p[idx].pos) * dt;
   }
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, p, vel, flags, dt, deleteInObstacle, stopInObstacle, skipNew, ptype, exclude, u);
-  }
-  void run()
-  {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  std::vector<S> &p;
-  const MACGrid &vel;
-  const FlagGrid &flags;
-  const Real dt;
-  const bool deleteInObstacle;
-  const bool stopInObstacle;
-  const bool skipNew;
-  const ParticleDataImpl<int> *ptype;
-  const int exclude;
-  std::vector<Vec3> &u;
-};
-template<class S> struct GridAdvectKernel : public KernelBase {
-  GridAdvectKernel(std::vector<S> &p,
-                   const MACGrid &vel,
-                   const FlagGrid &flags,
-                   const Real dt,
-                   const bool deleteInObstacle,
-                   const bool stopInObstacle,
-                   const bool skipNew,
-                   const ParticleDataImpl<int> *ptype,
-                   const int exclude)
-      : KernelBase(p.size()),
-        _inner(KernelBase(p.size()),
-               p,
-               vel,
-               flags,
-               dt,
-               deleteInObstacle,
-               stopInObstacle,
-               skipNew,
-               ptype,
-               exclude,
-               u),
-        p(p),
-        vel(vel),
-        flags(flags),
-        dt(dt),
-        deleteInObstacle(deleteInObstacle),
-        stopInObstacle(stopInObstacle),
-        skipNew(skipNew),
-        ptype(ptype),
-        exclude(exclude),
-        u((size))
-  {
-    runMessage();
-    run();
-  }
-  void run()
-  {
-    _inner.run();
-  }
   inline operator std::vector<Vec3>()
   {
     return u;
@@ -2081,14 +2015,18 @@ template<class S> struct GridAdvectKernel : public KernelBase {
     return exclude;
   }
   typedef int type8;
-  void runMessage()
+  void runMessage(){};
+  void run()
   {
-    debMsg("Executing kernel GridAdvectKernel ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  _GridAdvectKernel<S> _inner;
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, p, vel, flags, dt, deleteInObstacle, stopInObstacle, skipNew, ptype, exclude, u);
+    }
+  }
   std::vector<S> &p;
   const MACGrid &vel;
   const FlagGrid &flags;
@@ -2112,7 +2050,7 @@ template<class S> struct KnDeleteInObstacle : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, std::vector<S> &p, const FlagGrid &flags) const
+  inline void op(IndexInt idx, std::vector<S> &p, const FlagGrid &flags)
   {
     if (p[idx].flag & ParticleBase::PDELETE)
       return;
@@ -2130,21 +2068,17 @@ template<class S> struct KnDeleteInObstacle : public KernelBase {
     return flags;
   }
   typedef FlagGrid type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnDeleteInObstacle ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, p, flags);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, p, flags);
+    }
   }
   std::vector<S> &p;
   const FlagGrid &flags;
@@ -2189,7 +2123,7 @@ template<class S> struct KnClampPositions : public KernelBase {
                  ParticleDataImpl<Vec3> *posOld = nullptr,
                  bool stopInObstacle = true,
                  const ParticleDataImpl<int> *ptype = nullptr,
-                 const int exclude = 0) const
+                 const int exclude = 0)
   {
     if (p[idx].flag & ParticleBase::PDELETE)
       return;
@@ -2235,21 +2169,17 @@ template<class S> struct KnClampPositions : public KernelBase {
     return exclude;
   }
   typedef int type5;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnClampPositions ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, p, flags, posOld, stopInObstacle, ptype, exclude);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, p, flags, posOld, stopInObstacle, ptype, exclude);
+    }
   }
   std::vector<S> &p;
   const FlagGrid &flags;
@@ -2341,13 +2271,7 @@ template<class S> struct KnProjectParticles : public KernelBase {
     return rand;
   }
   typedef RandomStream type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnProjectParticles ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
+  void runMessage(){};
   void run()
   {
     const IndexInt _sz = size;
@@ -2389,7 +2313,7 @@ template<class S> struct KnProjectOutOfBnd : public KernelBase {
                  const Real bnd,
                  const bool *axis,
                  const ParticleDataImpl<int> *ptype,
-                 const int exclude) const
+                 const int exclude)
   {
     if (!part.isActive(idx) || (ptype && ((*ptype)[idx] & exclude)))
       return;
@@ -2438,21 +2362,17 @@ template<class S> struct KnProjectOutOfBnd : public KernelBase {
     return exclude;
   }
   typedef int type5;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnProjectOutOfBnd ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, part, flags, bnd, axis, ptype, exclude);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, part, flags, bnd, axis, ptype, exclude);
+    }
   }
   ParticleSystem<S> &part;
   const FlagGrid &flags;
diff --git a/extern/mantaflow/preprocessed/plugin/advection.cpp b/extern/mantaflow/preprocessed/plugin/advection.cpp
index dd891e22088..6a548841bef 100644
--- a/extern/mantaflow/preprocessed/plugin/advection.cpp
+++ b/extern/mantaflow/preprocessed/plugin/advection.cpp
@@ -59,7 +59,7 @@ template<class T> struct SemiLagrange : public KernelBase {
                  Real dt,
                  bool isLevelset,
                  int orderSpace,
-                 int orderTrace) const
+                 int orderTrace)
   {
     if (orderTrace == 1) {
       // traceback position
@@ -117,37 +117,35 @@ template<class T> struct SemiLagrange : public KernelBase {
     return orderTrace;
   }
   typedef int type7;
-  void runMessage()
-  {
-    debMsg("Executing kernel SemiLagrange ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, flags, vel, dst, src, dt, isLevelset, orderSpace, orderTrace);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, flags, vel, dst, src, dt, isLevelset, orderSpace, orderTrace);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, flags, vel, dst, src, dt, isLevelset, orderSpace, orderTrace);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, flags, vel, dst, src, dt, isLevelset, orderSpace, orderTrace);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   const FlagGrid &flags;
   const MACGrid &vel;
   Grid<T> &dst;
@@ -189,7 +187,7 @@ struct SemiLagrangeMAC : public KernelBase {
                  const MACGrid &src,
                  Real dt,
                  int orderSpace,
-                 int orderTrace) const
+                 int orderTrace)
   {
     if (orderTrace == 1) {
       // get currect velocity at MAC position
@@ -259,37 +257,35 @@ struct SemiLagrangeMAC : public KernelBase {
     return orderTrace;
   }
   typedef int type6;
-  void runMessage()
-  {
-    debMsg("Executing kernel SemiLagrangeMAC ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, flags, vel, dst, src, dt, orderSpace, orderTrace);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, flags, vel, dst, src, dt, orderSpace, orderTrace);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, flags, vel, dst, src, dt, orderSpace, orderTrace);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, flags, vel, dst, src, dt, orderSpace, orderTrace);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   const FlagGrid &flags;
   const MACGrid &vel;
   MACGrid &dst;
@@ -331,7 +327,7 @@ template<class T> struct MacCormackCorrect : public KernelBase {
                  const Grid<T> &bwd,
                  Real strength,
                  bool isLevelSet,
-                 bool isMAC = false) const
+                 bool isMAC = false)
   {
     dst[idx] = fwd[idx];
 
@@ -380,21 +376,17 @@ template<class T> struct MacCormackCorrect : public KernelBase {
     return isMAC;
   }
   typedef bool type7;
-  void runMessage()
-  {
-    debMsg("Executing kernel MacCormackCorrect ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, flags, dst, old, fwd, bwd, strength, isLevelSet, isMAC);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, flags, dst, old, fwd, bwd, strength, isLevelSet, isMAC);
+    }
   }
   const FlagGrid &flags;
   Grid<T> &dst;
@@ -440,7 +432,7 @@ template<class T> struct MacCormackCorrectMAC : public KernelBase {
                  const Grid<T> &bwd,
                  Real strength,
                  bool isLevelSet,
-                 bool isMAC = false) const
+                 bool isMAC = false)
   {
     bool skip[3] = {false, false, false};
 
@@ -505,37 +497,35 @@ template<class T> struct MacCormackCorrectMAC : public KernelBase {
     return isMAC;
   }
   typedef bool type7;
-  void runMessage()
-  {
-    debMsg("Executing kernel MacCormackCorrectMAC ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, flags, dst, old, fwd, bwd, strength, isLevelSet, isMAC);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, flags, dst, old, fwd, bwd, strength, isLevelSet, isMAC);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, flags, dst, old, fwd, bwd, strength, isLevelSet, isMAC);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, flags, dst, old, fwd, bwd, strength, isLevelSet, isMAC);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   const FlagGrid &flags;
   Grid<T> &dst;
   const Grid<T> &old;
@@ -762,7 +752,7 @@ template<class T> struct MacCormackClamp : public KernelBase {
                  const Grid<T> &orig,
                  const Grid<T> &fwd,
                  Real dt,
-                 const int clampMode) const
+                 const int clampMode)
   {
     T dval = dst(i, j, k);
     Vec3i gridUpper = flags.getSize() - 1;
@@ -830,37 +820,35 @@ template<class T> struct MacCormackClamp : public KernelBase {
     return clampMode;
   }
   typedef int type6;
-  void runMessage()
-  {
-    debMsg("Executing kernel MacCormackClamp ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, flags, vel, dst, orig, fwd, dt, clampMode);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, flags, vel, dst, orig, fwd, dt, clampMode);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, flags, vel, dst, orig, fwd, dt, clampMode);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, flags, vel, dst, orig, fwd, dt, clampMode);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   const FlagGrid &flags;
   const MACGrid &vel;
   Grid<T> &dst;
@@ -901,7 +889,7 @@ struct MacCormackClampMAC : public KernelBase {
                  const MACGrid &orig,
                  const MACGrid &fwd,
                  Real dt,
-                 const int clampMode) const
+                 const int clampMode)
   {
     Vec3 pos(i, j, k);
     Vec3 dval = dst(i, j, k);
@@ -957,37 +945,35 @@ struct MacCormackClampMAC : public KernelBase {
     return clampMode;
   }
   typedef int type6;
-  void runMessage()
-  {
-    debMsg("Executing kernel MacCormackClampMAC ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, flags, vel, dst, orig, fwd, dt, clampMode);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, flags, vel, dst, orig, fwd, dt, clampMode);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, flags, vel, dst, orig, fwd, dt, clampMode);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, flags, vel, dst, orig, fwd, dt, clampMode);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   const FlagGrid &flags;
   const MACGrid &vel;
   MACGrid &dst;
@@ -1016,27 +1002,39 @@ void fnAdvectSemiLagrange(FluidSolver *parent,
   bool levelset = orig.getType() & GridBase::TypeLevelset;
 
   // forward step
-  GridType fwd(parent);
-  SemiLagrange<T>(flags, vel, fwd, orig, dt, levelset, orderSpace, orderTrace);
+  GridType *fwd = new GridType(parent, true, false, false);
+  SemiLagrange<T>(flags, vel, *fwd, orig, dt, levelset, orderSpace, orderTrace);
 
   if (order == 1) {
-    orig.swap(fwd);
+#if OPENMP && OPENMP_OFFLOAD
+    orig.copyFrom(*fwd, true, false);
+#else
+    orig.swap(*fwd);
+#endif
   }
   else if (order == 2) {  // MacCormack
     GridType bwd(parent);
-    GridType newGrid(parent);
+    GridType *newGrid = new GridType(parent, true, false, false);
 
     // bwd <- backwards step
-    SemiLagrange<T>(flags, vel, bwd, fwd, -dt, levelset, orderSpace, orderTrace);
+    SemiLagrange<T>(flags, vel, bwd, *fwd, -dt, levelset, orderSpace, orderTrace);
 
     // newGrid <- compute correction
-    MacCormackCorrect<T>(flags, newGrid, orig, fwd, bwd, strength, levelset);
+    MacCormackCorrect<T>(flags, *newGrid, orig, *fwd, bwd, strength, levelset);
 
     // clamp values
-    MacCormackClamp<T>(flags, vel, newGrid, orig, fwd, dt, clampMode);
-
-    orig.swap(newGrid);
-  }
+    MacCormackClamp<T>(flags, vel, *newGrid, orig, *fwd, dt, clampMode);
+
+#if OPENMP && OPENMP_OFFLOAD
+    orig.copyFrom(*newGrid, true, false);
+#else
+    orig.swap(*newGrid);
+#endif
+    // No null guard: `delete` on a null pointer is a no-op.
+    delete newGrid;
+  }
+  // Release the forward-step grid on every path (order 1, 2 or other).
+  delete fwd;
 }
 
 // outflow functions
@@ -1087,7 +1085,7 @@ struct extrapolateVelConvectiveBC : public KernelBase {
                  const MACGrid &vel,
                  MACGrid &velDst,
                  const MACGrid &velPrev,
-                 Real timeStep) const
+                 Real timeStep)
   {
     if (flags.isOutflow(i, j, k)) {
       const Vec3 bulkVel = getBulkVel(flags, vel, i, j, k);
@@ -1154,37 +1152,35 @@ struct extrapolateVelConvectiveBC : public KernelBase {
     return timeStep;
   }
   typedef Real type4;
-  void runMessage()
-  {
-    debMsg("Executing kernel extrapolateVelConvectiveBC ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, flags, vel, velDst, velPrev, timeStep);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, flags, vel, velDst, velPrev, timeStep);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, flags, vel, velDst, velPrev, timeStep);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, flags, vel, velDst, velPrev, timeStep);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   const FlagGrid &flags;
   const MACGrid &vel;
   MACGrid &velDst;
@@ -1200,8 +1196,7 @@ struct copyChangedVels : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(
-      int i, int j, int k, const FlagGrid &flags, const MACGrid &velDst, MACGrid &vel) const
+  inline void op(int i, int j, int k, const FlagGrid &flags, const MACGrid &velDst, MACGrid &vel)
   {
     if (flags.isOutflow(i, j, k))
       vel(i, j, k) = velDst(i, j, k);
@@ -1221,37 +1216,35 @@ struct copyChangedVels : public KernelBase {
     return vel;
   }
   typedef MACGrid type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel copyChangedVels ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, flags, velDst, vel);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, flags, velDst, vel);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, flags, velDst, vel);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, flags, velDst, vel);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   const FlagGrid &flags;
   const MACGrid &velDst;
   MACGrid &vel;
@@ -1275,7 +1268,7 @@ struct knResetPhiInObs : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, const FlagGrid &flags, Grid<Real> &sdf) const
+  inline void op(int i, int j, int k, const FlagGrid &flags, Grid<Real> &sdf)
   {
     if (flags.isObstacle(i, j, k) && (sdf(i, j, k) < 0.)) {
       sdf(i, j, k) = 0.1;
@@ -1291,37 +1284,35 @@ struct knResetPhiInObs : public KernelBase {
     return sdf;
   }
   typedef Grid<Real> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knResetPhiInObs ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, flags, sdf);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, flags, sdf);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, flags, sdf);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, flags, sdf);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   const FlagGrid &flags;
   Grid<Real> &sdf;
 };
@@ -1378,33 +1369,45 @@ void fnAdvectSemiLagrange<MACGrid>(FluidSolver *parent,
   Real dt = parent->getDt();
 
   // forward step
-  MACGrid fwd(parent);
-  SemiLagrangeMAC(flags, vel, fwd, orig, dt, orderSpace, orderTrace);
+  MACGrid *fwd = new MACGrid(parent, true, false, false);
+  SemiLagrangeMAC(flags, vel, *fwd, orig, dt, orderSpace, orderTrace);
 
   if (orderSpace != 1) {
     debMsg("Warning higher order for MAC grids not yet implemented...", 1);
   }
 
   if (order == 1) {
-    applyOutflowBC(flags, fwd, orig, dt);
-    orig.swap(fwd);
+    applyOutflowBC(flags, *fwd, orig, dt);
+#if OPENMP && OPENMP_OFFLOAD
+    orig.copyFrom(*fwd, true, false);
+#else
+    orig.swap(*fwd);
+#endif
   }
   else if (order == 2) {  // MacCormack
     MACGrid bwd(parent);
-    MACGrid newGrid(parent);
+    MACGrid *newGrid = new MACGrid(parent, true, false, false);
 
     // bwd <- backwards step
-    SemiLagrangeMAC(flags, vel, bwd, fwd, -dt, orderSpace, orderTrace);
+    SemiLagrangeMAC(flags, vel, bwd, *fwd, -dt, orderSpace, orderTrace);
 
     // newGrid <- compute correction
-    MacCormackCorrectMAC<Vec3>(flags, newGrid, orig, fwd, bwd, strength, false, true);
+    MacCormackCorrectMAC<Vec3>(flags, *newGrid, orig, *fwd, bwd, strength, false, true);
 
     // clamp values
-    MacCormackClampMAC(flags, vel, newGrid, orig, fwd, dt, clampMode);
-
-    applyOutflowBC(flags, newGrid, orig, dt);
-    orig.swap(newGrid);
-  }
+    MacCormackClampMAC(flags, vel, *newGrid, orig, *fwd, dt, clampMode);
+
+    applyOutflowBC(flags, *newGrid, orig, dt);
+#if OPENMP && OPENMP_OFFLOAD
+    orig.copyFrom(*newGrid, true, false);
+#else
+    orig.swap(*newGrid);
+#endif
+    // No null guard: `delete` on a null pointer is a no-op.
+    delete newGrid;
+  }
+  // Release the forward-step grid on every path (order 1, 2 or other).
+  delete fwd;
 }
 
 //! Perform semi-lagrangian advection of target Real- or Vec3 grid
diff --git a/extern/mantaflow/preprocessed/plugin/apic.cpp b/extern/mantaflow/preprocessed/plugin/apic.cpp
index 9df6b597e05..5da507bb44b 100644
--- a/extern/mantaflow/preprocessed/plugin/apic.cpp
+++ b/extern/mantaflow/preprocessed/plugin/apic.cpp
@@ -239,13 +239,7 @@ struct knApicMapLinearVec3ToMACGrid : public KernelBase {
     return boundaryWidth;
   }
   typedef int type9;
-  void runMessage()
-  {
-    debMsg("Executing kernel knApicMapLinearVec3ToMACGrid ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
+  void runMessage(){};
   void run()
   {
     const IndexInt _sz = size;
@@ -370,7 +364,7 @@ struct knApicMapLinearMACGridToVec3 : public KernelBase {
                  const FlagGrid &flags,
                  const ParticleDataImpl<int> *ptype,
                  const int exclude,
-                 const int boundaryWidth) const
+                 const int boundaryWidth)
   {
     if (!p.isActive(idx) || (ptype && ((*ptype)[idx] & exclude)))
       return;
@@ -509,21 +503,17 @@ struct knApicMapLinearMACGridToVec3 : public KernelBase {
     return boundaryWidth;
   }
   typedef int type9;
-  void runMessage()
-  {
-    debMsg("Executing kernel knApicMapLinearMACGridToVec3 ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, vp, cpx, cpy, cpz, p, vg, flags, ptype, exclude, boundaryWidth);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, vp, cpx, cpy, cpz, p, vg, flags, ptype, exclude, boundaryWidth);
+    }
   }
   ParticleDataImpl<Vec3> &vp;
   ParticleDataImpl<Vec3> &cpx;
diff --git a/extern/mantaflow/preprocessed/plugin/extforces.cpp b/extern/mantaflow/preprocessed/plugin/extforces.cpp
index 88935fa7ae9..a0fb4659aa2 100644
--- a/extern/mantaflow/preprocessed/plugin/extforces.cpp
+++ b/extern/mantaflow/preprocessed/plugin/extforces.cpp
@@ -52,7 +52,7 @@ struct KnApplyForceField : public KernelBase {
                  const Grid<Vec3> &force,
                  const Grid<Real> *include,
                  bool additive,
-                 bool isMAC) const
+                 bool isMAC)
   {
     bool curFluid = flags.isFluid(i, j, k);
     bool curEmpty = flags.isEmpty(i, j, k);
@@ -105,37 +105,35 @@ struct KnApplyForceField : public KernelBase {
     return isMAC;
   }
   typedef bool type5;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnApplyForceField ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, flags, vel, force, include, additive, isMAC);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, flags, vel, force, include, additive, isMAC);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, flags, vel, force, include, additive, isMAC);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, flags, vel, force, include, additive, isMAC);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   const FlagGrid &flags;
   MACGrid &vel;
   const Grid<Vec3> &force;
@@ -165,7 +163,7 @@ struct KnApplyForce : public KernelBase {
                  MACGrid &vel,
                  Vec3 force,
                  const Grid<Real> *exclude,
-                 bool additive) const
+                 bool additive)
   {
     bool curFluid = flags.isFluid(i, j, k);
     bool curEmpty = flags.isEmpty(i, j, k);
@@ -206,37 +204,35 @@ struct KnApplyForce : public KernelBase {
     return additive;
   }
   typedef bool type4;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnApplyForce ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, flags, vel, force, exclude, additive);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, flags, vel, force, exclude, additive);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, flags, vel, force, exclude, additive);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, flags, vel, force, exclude, additive);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   const FlagGrid &flags;
   MACGrid &vel;
   Vec3 force;
@@ -346,7 +342,7 @@ struct KnAddBuoyancy : public KernelBase {
                  const FlagGrid &flags,
                  const Grid<Real> &factor,
                  MACGrid &vel,
-                 Vec3 strength) const
+                 Vec3 strength)
   {
     if (!flags.isFluid(i, j, k))
       return;
@@ -377,37 +373,35 @@ struct KnAddBuoyancy : public KernelBase {
     return strength;
   }
   typedef Vec3 type3;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnAddBuoyancy ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, flags, factor, vel, strength);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, flags, factor, vel, strength);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, flags, factor, vel, strength);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, flags, factor, vel, strength);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   const FlagGrid &flags;
   const Grid<Real> &factor;
   MACGrid &vel;
@@ -662,7 +656,7 @@ struct KnSetInflow : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, MACGrid &vel, int dim, int p0, const Vec3 &val) const
+  inline void op(int i, int j, int k, MACGrid &vel, int dim, int p0, const Vec3 &val)
   {
     Vec3i p(i, j, k);
     if (p[dim] == p0 || p[dim] == p0 + 1)
@@ -688,37 +682,35 @@ struct KnSetInflow : public KernelBase {
     return val;
   }
   typedef Vec3 type3;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnSetInflow ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, vel, dim, p0, val);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, vel, dim, p0, val);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, vel, dim, p0, val);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, vel, dim, p0, val);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   MACGrid &vel;
   int dim;
   int p0;
@@ -784,8 +776,7 @@ struct KnSetWallBcs : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(
-      int i, int j, int k, const FlagGrid &flags, MACGrid &vel, const MACGrid *obvel) const
+  inline void op(int i, int j, int k, const FlagGrid &flags, MACGrid &vel, const MACGrid *obvel)
   {
 
     bool curFluid = flags.isFluid(i, j, k);
@@ -848,37 +839,35 @@ struct KnSetWallBcs : public KernelBase {
     return obvel;
   }
   typedef MACGrid type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnSetWallBcs ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, flags, vel, obvel);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, flags, vel, obvel);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, flags, vel, obvel);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, flags, vel, obvel);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   const FlagGrid &flags;
   MACGrid &vel;
   const MACGrid *obvel;
@@ -912,7 +901,7 @@ struct KnSetWallBcsFrac : public KernelBase {
                  MACGrid &velTarget,
                  const MACGrid *obvel,
                  const Grid<Real> *phiObs,
-                 const int &boundaryWidth = 0) const
+                 const int &boundaryWidth = 0)
   {
     bool curFluid = flags.isFluid(i, j, k);
     bool curObs = flags.isObstacle(i, j, k);
@@ -1025,37 +1014,35 @@ struct KnSetWallBcsFrac : public KernelBase {
     return boundaryWidth;
   }
   typedef int type5;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnSetWallBcsFrac ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, flags, vel, velTarget, obvel, phiObs, boundaryWidth);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, flags, vel, velTarget, obvel, phiObs, boundaryWidth);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, flags, vel, velTarget, obvel, phiObs, boundaryWidth);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, flags, vel, velTarget, obvel, phiObs, boundaryWidth);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   const FlagGrid &flags;
   const MACGrid &vel;
   MACGrid &velTarget;
@@ -1127,8 +1114,7 @@ struct KnAddForceIfLower : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(
-      int i, int j, int k, const FlagGrid &flags, MACGrid &vel, const Grid<Vec3> &force) const
+  inline void op(int i, int j, int k, const FlagGrid &flags, MACGrid &vel, const Grid<Vec3> &force)
   {
     bool curFluid = flags.isFluid(i, j, k);
     bool curEmpty = flags.isEmpty(i, j, k);
@@ -1173,37 +1159,35 @@ struct KnAddForceIfLower : public KernelBase {
     return force;
   }
   typedef Grid<Vec3> type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnAddForceIfLower ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, flags, vel, force);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, flags, vel, force);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, flags, vel, force);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, flags, vel, force);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   const FlagGrid &flags;
   MACGrid &vel;
   const Grid<Vec3> &force;
@@ -1266,7 +1250,7 @@ struct KnConfForce : public KernelBase {
                  const Grid<Real> &grid,
                  const Grid<Vec3> &curl,
                  Real str,
-                 const Grid<Real> *strGrid) const
+                 const Grid<Real> *strGrid)
   {
     Vec3 grad = 0.5 * Vec3(grid(i + 1, j, k) - grid(i - 1, j, k),
                            grid(i, j + 1, k) - grid(i, j - 1, k),
@@ -1303,37 +1287,35 @@ struct KnConfForce : public KernelBase {
     return strGrid;
   }
   typedef Grid<Real> type4;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnConfForce ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, force, grid, curl, str, strGrid);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, force, grid, curl, str, strGrid);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, force, grid, curl, str, strGrid);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, force, grid, curl, str, strGrid);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   Grid<Vec3> &force;
   const Grid<Real> &grid;
   const Grid<Vec3> &curl;
@@ -1514,7 +1496,7 @@ struct KnDissolveSmoke : public KernelBase {
                  int speed,
                  bool logFalloff,
                  float dydx,
-                 float fac) const
+                 float fac)
   {
 
     bool curFluid = flags.isFluid(i, j, k);
@@ -1602,37 +1584,35 @@ struct KnDissolveSmoke : public KernelBase {
     return fac;
   }
   typedef float type9;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnDissolveSmoke ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, flags, density, heat, red, green, blue, speed, logFalloff, dydx, fac);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, flags, density, heat, red, green, blue, speed, logFalloff, dydx, fac);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, flags, density, heat, red, green, blue, speed, logFalloff, dydx, fac);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, flags, density, heat, red, green, blue, speed, logFalloff, dydx, fac);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   const FlagGrid &flags;
   Grid<Real> &density;
   Grid<Real> *heat;
diff --git a/extern/mantaflow/preprocessed/plugin/fire.cpp b/extern/mantaflow/preprocessed/plugin/fire.cpp
index f907bdf0504..8f924c57ddc 100644
--- a/extern/mantaflow/preprocessed/plugin/fire.cpp
+++ b/extern/mantaflow/preprocessed/plugin/fire.cpp
@@ -71,7 +71,7 @@ struct KnProcessBurn : public KernelBase {
                  Real ignitionTemp,
                  Real maxTemp,
                  Real dt,
-                 Vec3 flameSmokeColor) const
+                 Vec3 flameSmokeColor)
   {
     // Save initial values
     Real origFuel = fuel(i, j, k);
@@ -179,19 +179,44 @@ struct KnProcessBurn : public KernelBase {
     return flameSmokeColor;
   }
   typedef Vec3 type12;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnProcessBurn ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i,
+                 j,
+                 k,
+                 fuel,
+                 density,
+                 react,
+                 red,
+                 green,
+                 blue,
+                 heat,
+                 burningRate,
+                 flameSmoke,
+                 ignitionTemp,
+                 maxTemp,
+                 dt,
+                 flameSmokeColor);
+      }
+    }
+    else {
+      const int k = 0;
+#pragma omp parallel
+      {
+
+#pragma omp for
         for (int j = 1; j < _maxY; j++)
           for (int i = 1; i < _maxX; i++)
             op(i,
@@ -210,35 +235,8 @@ struct KnProcessBurn : public KernelBase {
                maxTemp,
                dt,
                flameSmokeColor);
+      }
     }
-    else {
-      const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i,
-             j,
-             k,
-             fuel,
-             density,
-             react,
-             red,
-             green,
-             blue,
-             heat,
-             burningRate,
-             flameSmoke,
-             ignitionTemp,
-             maxTemp,
-             dt,
-             flameSmokeColor);
-    }
-  }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
   }
   Grid<Real> &fuel;
   Grid<Real> &density;
@@ -344,7 +342,7 @@ struct KnUpdateFlame : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, const Grid<Real> &react, Grid<Real> &flame) const
+  inline void op(int i, int j, int k, const Grid<Real> &react, Grid<Real> &flame)
   {
     if (react(i, j, k) > 0.0f)
       flame(i, j, k) = pow(react(i, j, k), 0.5f);
@@ -361,37 +359,35 @@ struct KnUpdateFlame : public KernelBase {
     return flame;
   }
   typedef Grid<Real> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnUpdateFlame ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, react, flame);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, react, flame);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, react, flame);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, react, flame);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   const Grid<Real> &react;
   Grid<Real> &flame;
 };
diff --git a/extern/mantaflow/preprocessed/plugin/flip.cpp b/extern/mantaflow/preprocessed/plugin/flip.cpp
index 8757958d4b0..e63d8236f28 100644
--- a/extern/mantaflow/preprocessed/plugin/flip.cpp
+++ b/extern/mantaflow/preprocessed/plugin/flip.cpp
@@ -283,7 +283,7 @@ struct knClearFluidFlags : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, FlagGrid &flags, int dummy = 0) const
+  inline void op(int i, int j, int k, FlagGrid &flags, int dummy = 0)
   {
     if (flags.isFluid(i, j, k)) {
       flags(i, j, k) = (flags(i, j, k) | FlagGrid::TypeEmpty) & ~FlagGrid::TypeFluid;
@@ -299,37 +299,35 @@ struct knClearFluidFlags : public KernelBase {
     return dummy;
   }
   typedef int type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knClearFluidFlags ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, flags, dummy);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, flags, dummy);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, flags, dummy);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, flags, dummy);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   FlagGrid &flags;
   int dummy;
 };
@@ -342,7 +340,7 @@ struct knSetNbObstacle : public KernelBase {
     run();
   }
   inline void op(
-      int i, int j, int k, FlagGrid &nflags, const FlagGrid &flags, const Grid<Real> &phiObs) const
+      int i, int j, int k, FlagGrid &nflags, const FlagGrid &flags, const Grid<Real> &phiObs)
   {
     if (phiObs(i, j, k) > 0.)
       return;
@@ -381,37 +379,35 @@ struct knSetNbObstacle : public KernelBase {
     return phiObs;
   }
   typedef Grid<Real> type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel knSetNbObstacle ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, nflags, flags, phiObs);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, nflags, flags, phiObs);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, nflags, flags, phiObs);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, nflags, flags, phiObs);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   FlagGrid &nflags;
   const FlagGrid &flags;
   const Grid<Real> &phiObs;
@@ -438,7 +434,11 @@ void markFluidCells(const BasicParticleSystem &parts,
   if (phiObs) {
     FlagGrid tmp(flags);
     knSetNbObstacle(tmp, flags, *phiObs);
+#if OPENMP && OPENMP_OFFLOAD
+    flags.copyFrom(tmp, true, false);
+#else
     flags.swap(tmp);
+#endif
   }
 }
 static PyObject *_W_3(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
@@ -818,33 +818,37 @@ struct ComputeUnionLevelsetPindex : public KernelBase {
                  LevelsetGrid &phi,
                  const Real radius,
                  const ParticleDataImpl<int> *ptype,
-                 const int exclude) const
+                 const int exclude)
   {
     const Vec3 gridPos = Vec3(i, j, k) + Vec3(0.5);  // shifted by half cell
     Real phiv = radius * 1.0;                        // outside
-    const int r = int(radius) + 1;
 
-    FOR_NEIGHBORS(phi, r)
-    {
+    int r = int(radius) + 1;
+    int rZ = phi.is3D() ? r : 0;
+    for (int zj = k - rZ; zj <= k + rZ; zj++)
+      for (int yj = j - r; yj <= j + r; yj++)
+        for (int xj = i - r; xj <= i + r; xj++) {
+          if (!phi.isInBounds(Vec3i(xj, yj, zj)))
+            continue;
 
-      // note, for the particle indices in indexSys the access is periodic (ie, dont skip for eg
-      // inBounds(sx,10,10)
-      IndexInt isysIdxS = index.index(xj, yj, zj);
-      IndexInt pStart = index(isysIdxS), pEnd = 0;
-      if (phi.isInBounds(isysIdxS + 1))
-        pEnd = index(isysIdxS + 1);
-      else
-        pEnd = indexSys.size();
-
-      // now loop over particles in cell
-      for (IndexInt p = pStart; p < pEnd; ++p) {
-        const int psrc = indexSys[p].sourceIndex;
-        if (ptype && ((*ptype)[psrc] & exclude))
-          continue;
-        const Vec3 pos = parts[psrc].pos;
-        phiv = std::min(phiv, fabs(norm(gridPos - pos)) - radius);
-      }
-    }
+          // Note: for the particle indices in indexSys the access is periodic (i.e., don't skip
+          // for, e.g., inBounds(sx,10,10)).
+          IndexInt isysIdxS = index.index(xj, yj, zj);
+          IndexInt pStart = index(isysIdxS), pEnd = 0;
+          if (phi.isInBounds(isysIdxS + 1))
+            pEnd = index(isysIdxS + 1);
+          else
+            pEnd = indexSys.size();
+
+          // now loop over particles in cell
+          for (IndexInt p = pStart; p < pEnd; ++p) {
+            const int psrc = indexSys[p].sourceIndex;
+            if (ptype && ((*ptype)[psrc] & exclude))
+              continue;
+            const Vec3 pos = parts[psrc].pos;
+            phiv = std::min(phiv, fabs(norm(gridPos - pos)) - radius);
+          }
+        }
     phi(i, j, k) = phiv;
   }
   inline const Grid<int> &getArg0()
@@ -882,37 +886,35 @@ struct ComputeUnionLevelsetPindex : public KernelBase {
     return exclude;
   }
   typedef int type6;
-  void runMessage()
-  {
-    debMsg("Executing kernel ComputeUnionLevelsetPindex ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, index, parts, indexSys, phi, radius, ptype, exclude);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, index, parts, indexSys, phi, radius, ptype, exclude);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, index, parts, indexSys, phi, radius, ptype, exclude);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, index, parts, indexSys, phi, radius, ptype, exclude);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   const Grid<int> &index;
   const BasicParticleSystem &parts;
   const ParticleIndexSystem &indexSys;
@@ -1015,42 +1017,46 @@ struct ComputeAveragedLevelsetWeight : public KernelBase {
                  const ParticleDataImpl<int> *ptype,
                  const int exclude,
                  Grid<Vec3> *save_pAcc = nullptr,
-                 Grid<Real> *save_rAcc = nullptr) const
+                 Grid<Real> *save_rAcc = nullptr)
   {
     const Vec3 gridPos = Vec3(i, j, k) + Vec3(0.5);  // shifted by half cell
     Real phiv = radius * 1.0;                        // outside
 
     // loop over neighborhood, similar to ComputeUnionLevelsetPindex
     const Real sradiusInv = 1. / (4. * radius * radius);
-    const int r = int(radius) + 1;
+    int r = int(1. * radius) + 1;
+    int rZ = phi.is3D() ? r : 0;
     // accumulators
     Real wacc = 0.;
     Vec3 pacc = Vec3(0.);
     Real racc = 0.;
 
-    FOR_NEIGHBORS(phi, r)
-    {
+    for (int zj = k - rZ; zj <= k + rZ; zj++)
+      for (int yj = j - r; yj <= j + r; yj++)
+        for (int xj = i - r; xj <= i + r; xj++) {
+          if (!phi.isInBounds(Vec3i(xj, yj, zj)))
+            continue;
 
-      IndexInt isysIdxS = index.index(xj, yj, zj);
-      IndexInt pStart = index(isysIdxS), pEnd = 0;
-      if (phi.isInBounds(isysIdxS + 1))
-        pEnd = index(isysIdxS + 1);
-      else
-        pEnd = indexSys.size();
-      for (IndexInt p = pStart; p < pEnd; ++p) {
-        IndexInt psrc = indexSys[p].sourceIndex;
-        if (ptype && ((*ptype)[psrc] & exclude))
-          continue;
+          IndexInt isysIdxS = index.index(xj, yj, zj);
+          IndexInt pStart = index(isysIdxS), pEnd = 0;
+          if (phi.isInBounds(isysIdxS + 1))
+            pEnd = index(isysIdxS + 1);
+          else
+            pEnd = indexSys.size();
+          for (IndexInt p = pStart; p < pEnd; ++p) {
+            IndexInt psrc = indexSys[p].sourceIndex;
+            if (ptype && ((*ptype)[psrc] & exclude))
+              continue;
 
-        Vec3 pos = parts[psrc].pos;
-        Real s = normSquare(gridPos - pos) * sradiusInv;
-        // Real  w = std::max(0., cubed(1.-s) );
-        Real w = std::max(0., (1. - s));  // a bit smoother
-        wacc += w;
-        racc += radius * w;
-        pacc += pos * w;
-      }
-    }
+            Vec3 pos = parts[psrc].pos;
+            Real s = normSquare(gridPos - pos) * sradiusInv;
+            // Real  w = std::max(0., cubed(1.-s) );
+            Real w = std::max(0., (1. - s));  // a bit smoother
+            wacc += w;
+            racc += radius * w;
+            pacc += pos * w;
+          }
+        }
 
     if (wacc > VECTOR_EPSILON) {
       racc /= wacc;
@@ -1109,37 +1115,46 @@ struct ComputeAveragedLevelsetWeight : public KernelBase {
     return save_rAcc;
   }
   typedef Grid<Real> type8;
-  void runMessage()
-  {
-    debMsg("Executing kernel ComputeAveragedLevelsetWeight ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, parts, index, indexSys, phi, radius, ptype, exclude, save_pAcc, save_rAcc);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i,
+                 j,
+                 k,
+                 parts,
+                 index,
+                 indexSys,
+                 phi,
+                 radius,
+                 ptype,
+                 exclude,
+                 save_pAcc,
+                 save_rAcc);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, parts, index, indexSys, phi, radius, ptype, exclude, save_pAcc, save_rAcc);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, parts, index, indexSys, phi, radius, ptype, exclude, save_pAcc, save_rAcc);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   const BasicParticleSystem &parts;
   const Grid<int> &index;
   const ParticleIndexSystem &indexSys;
@@ -1165,7 +1180,7 @@ template<class T> struct knSmoothGrid : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, const Grid<T> &me, Grid<T> &tmp, Real factor) const
+  inline void op(int i, int j, int k, const Grid<T> &me, Grid<T> &tmp, Real factor)
   {
     T val = me(i, j, k) + me(i + 1, j, k) + me(i - 1, j, k) + me(i, j + 1, k) + me(i, j - 1, k);
     if (me.is3D()) {
@@ -1188,37 +1203,35 @@ template<class T> struct knSmoothGrid : public KernelBase {
     return factor;
   }
   typedef Real type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel knSmoothGrid ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, me, tmp, factor);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, me, tmp, factor);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, me, tmp, factor);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, me, tmp, factor);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   const Grid<T> &me;
   Grid<T> &tmp;
   Real factor;
@@ -1231,7 +1244,7 @@ template<class T> struct knSmoothGridNeg : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, const Grid<T> &me, Grid<T> &tmp, Real factor) const
+  inline void op(int i, int j, int k, const Grid<T> &me, Grid<T> &tmp, Real factor)
   {
     T val = me(i, j, k) + me(i + 1, j, k) + me(i - 1, j, k) + me(i, j + 1, k) + me(i, j - 1, k);
     if (me.is3D()) {
@@ -1258,37 +1271,35 @@ template<class T> struct knSmoothGridNeg : public KernelBase {
     return factor;
   }
   typedef Real type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel knSmoothGridNeg ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, me, tmp, factor);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, me, tmp, factor);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, me, tmp, factor);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, me, tmp, factor);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   const Grid<T> &me;
   Grid<T> &tmp;
   Real factor;
@@ -1396,7 +1407,7 @@ struct correctLevelset : public KernelBase {
                  const Grid<Real> &rAcc,
                  const Real radius,
                  const Real t_low,
-                 const Real t_high) const
+                 const Real t_high)
   {
     if (rAcc(i, j, k) <= VECTOR_EPSILON)
       return;  // outside nothing happens
@@ -1463,37 +1474,35 @@ struct correctLevelset : public KernelBase {
     return t_high;
   }
   typedef Real type5;
-  void runMessage()
-  {
-    debMsg("Executing kernel correctLevelset ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, phi, pAcc, rAcc, radius, t_low, t_high);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, phi, pAcc, rAcc, radius, t_low, t_high);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, phi, pAcc, rAcc, radius, t_low, t_high);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, phi, pAcc, rAcc, radius, t_low, t_high);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   LevelsetGrid &phi;
   const Grid<Vec3> &pAcc;
   const Grid<Real> &rAcc;
@@ -1624,7 +1633,7 @@ struct knPushOutofObs : public KernelBase {
                  const Real shift,
                  const Real thresh,
                  const ParticleDataImpl<int> *ptype,
-                 const int exclude) const
+                 const int exclude)
   {
     if (!parts.isActive(idx) || (ptype && ((*ptype)[idx] & exclude)))
       return;
@@ -1675,21 +1684,17 @@ struct knPushOutofObs : public KernelBase {
     return exclude;
   }
   typedef int type6;
-  void runMessage()
-  {
-    debMsg("Executing kernel knPushOutofObs ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, parts, flags, phiObs, shift, thresh, ptype, exclude);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, parts, flags, phiObs, shift, thresh, ptype, exclude);
+    }
   }
   BasicParticleSystem &parts;
   const FlagGrid &flags;
@@ -1759,10 +1764,7 @@ template<class T> struct knSafeDivReal : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx,
-                 Grid<T> &me,
-                 const Grid<Real> &other,
-                 Real cutoff = VECTOR_EPSILON) const
+  inline void op(IndexInt idx, Grid<T> &me, const Grid<Real> &other, Real cutoff = VECTOR_EPSILON)
   {
     if (other[idx] < cutoff) {
       me[idx] = 0.;
@@ -1787,21 +1789,17 @@ template<class T> struct knSafeDivReal : public KernelBase {
     return cutoff;
   }
   typedef Real type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel knSafeDivReal ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, me, other, cutoff);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, me, other, cutoff);
+    }
   }
   Grid<T> &me;
   const Grid<Real> &other;
@@ -1879,13 +1877,7 @@ struct knMapLinearVec3ToMACGrid : public KernelBase {
     return exclude;
   }
   typedef int type6;
-  void runMessage()
-  {
-    debMsg("Executing kernel knMapLinearVec3ToMACGrid ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
+  void runMessage(){};
   void run()
   {
     const IndexInt _sz = size;
@@ -2022,13 +2014,7 @@ template<class T> struct knMapLinear : public KernelBase {
     return psource;
   }
   typedef ParticleDataImpl<T> type4;
-  void runMessage()
-  {
-    debMsg("Executing kernel knMapLinear ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
+  void runMessage(){};
   void run()
   {
     const IndexInt _sz = size;
@@ -2153,7 +2139,7 @@ template<class T> struct knMapFromGrid : public KernelBase {
   inline void op(IndexInt idx,
                  const BasicParticleSystem &p,
                  const Grid<T> &gsrc,
-                 ParticleDataImpl<T> &target) const
+                 ParticleDataImpl<T> &target)
   {
     if (!p.isActive(idx))
       return;
@@ -2174,21 +2160,17 @@ template<class T> struct knMapFromGrid : public KernelBase {
     return target;
   }
   typedef ParticleDataImpl<T> type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel knMapFromGrid ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, p, gsrc, target);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, p, gsrc, target);
+    }
   }
   const BasicParticleSystem &p;
   const Grid<T> &gsrc;
@@ -2298,7 +2280,7 @@ struct knMapLinearMACGridToVec3_PIC : public KernelBase {
                  const MACGrid &vel,
                  ParticleDataImpl<Vec3> &pvel,
                  const ParticleDataImpl<int> *ptype,
-                 const int exclude) const
+                 const int exclude)
   {
     if (!p.isActive(idx) || (ptype && ((*ptype)[idx] & exclude)))
       return;
@@ -2335,21 +2317,17 @@ struct knMapLinearMACGridToVec3_PIC : public KernelBase {
     return exclude;
   }
   typedef int type5;
-  void runMessage()
-  {
-    debMsg("Executing kernel knMapLinearMACGridToVec3_PIC ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, p, flags, vel, pvel, ptype, exclude);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, p, flags, vel, pvel, ptype, exclude);
+    }
   }
   const BasicParticleSystem &p;
   const FlagGrid &flags;
@@ -2438,7 +2416,7 @@ struct knMapLinearMACGridToVec3_FLIP : public KernelBase {
                  ParticleDataImpl<Vec3> &pvel,
                  const Real flipRatio,
                  const ParticleDataImpl<int> *ptype,
-                 const int exclude) const
+                 const int exclude)
   {
     if (!p.isActive(idx) || (ptype && ((*ptype)[idx] & exclude)))
       return;
@@ -2486,21 +2464,17 @@ struct knMapLinearMACGridToVec3_FLIP : public KernelBase {
     return exclude;
   }
   typedef int type7;
-  void runMessage()
-  {
-    debMsg("Executing kernel knMapLinearMACGridToVec3_FLIP ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, p, flags, vel, oldVel, pvel, flipRatio, ptype, exclude);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, p, flags, vel, oldVel, pvel, flipRatio, ptype, exclude);
+    }
   }
   const BasicParticleSystem &p;
   const FlagGrid &flags;
@@ -2592,7 +2566,7 @@ struct knCombineVels : public KernelBase {
                  MACGrid &combineVel,
                  const LevelsetGrid *phi,
                  Real narrowBand,
-                 Real thresh) const
+                 Real thresh)
   {
     int idx = vel.index(i, j, k);
 
@@ -2648,37 +2622,35 @@ struct knCombineVels : public KernelBase {
     return thresh;
   }
   typedef Real type5;
-  void runMessage()
-  {
-    debMsg("Executing kernel knCombineVels ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, vel, w, combineVel, phi, narrowBand, thresh);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, vel, w, combineVel, phi, narrowBand, thresh);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, vel, w, combineVel, phi, narrowBand, thresh);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, vel, w, combineVel, phi, narrowBand, thresh);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   MACGrid &vel;
   const Grid<Vec3> &w;
   MACGrid &combineVel;
diff --git a/extern/mantaflow/preprocessed/plugin/fluidguiding.cpp b/extern/mantaflow/preprocessed/plugin/fluidguiding.cpp
index de881840a2e..2b11f3a2557 100644
--- a/extern/mantaflow/preprocessed/plugin/fluidguiding.cpp
+++ b/extern/mantaflow/preprocessed/plugin/fluidguiding.cpp
@@ -62,7 +62,7 @@ struct apply1DKernelDirX : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, const MACGrid &in, MACGrid &out, const Matrix &kernel) const
+  inline void op(int i, int j, int k, const MACGrid &in, MACGrid &out, const Matrix &kernel)
   {
     int nx = in.getSizeX();
     int kn = kernel.n;
@@ -91,37 +91,35 @@ struct apply1DKernelDirX : public KernelBase {
     return kernel;
   }
   typedef Matrix type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel apply1DKernelDirX ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, in, out, kernel);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, in, out, kernel);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, in, out, kernel);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, in, out, kernel);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   const MACGrid &in;
   MACGrid &out;
   const Matrix &kernel;
@@ -136,7 +134,7 @@ struct apply1DKernelDirY : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, const MACGrid &in, MACGrid &out, const Matrix &kernel) const
+  inline void op(int i, int j, int k, const MACGrid &in, MACGrid &out, const Matrix &kernel)
   {
     int ny = in.getSizeY();
     int kn = kernel.n;
@@ -165,37 +163,35 @@ struct apply1DKernelDirY : public KernelBase {
     return kernel;
   }
   typedef Matrix type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel apply1DKernelDirY ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, in, out, kernel);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, in, out, kernel);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, in, out, kernel);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, in, out, kernel);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   const MACGrid &in;
   MACGrid &out;
   const Matrix &kernel;
@@ -210,7 +206,7 @@ struct apply1DKernelDirZ : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, const MACGrid &in, MACGrid &out, const Matrix &kernel) const
+  inline void op(int i, int j, int k, const MACGrid &in, MACGrid &out, const Matrix &kernel)
   {
     int nz = in.getSizeZ();
     int kn = kernel.n;
@@ -239,37 +235,35 @@ struct apply1DKernelDirZ : public KernelBase {
     return kernel;
   }
   typedef Matrix type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel apply1DKernelDirZ ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, in, out, kernel);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, in, out, kernel);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, in, out, kernel);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, in, out, kernel);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   const MACGrid &in;
   MACGrid &out;
   const Matrix &kernel;
@@ -569,197 +563,88 @@ void prox_f(MACGrid &v,
 
 // *****************************************************************************
 
-// re-uses main pressure solve from pressure.cpp
-void solvePressure(MACGrid &vel,
-                   Grid<Real> &pressure,
-                   const FlagGrid &flags,
-                   Real cgAccuracy = 1e-3,
-                   const Grid<Real> *phi = nullptr,
-                   const Grid<Real> *perCellCorr = nullptr,
-                   const MACGrid *fractions = nullptr,
-                   const MACGrid *obvel = nullptr,
-                   Real gfClamp = 1e-04,
-                   Real cgMaxIterFac = 1.5,
-                   bool precondition = true,
-                   int preconditioner = 1,
-                   bool enforceCompatibility = false,
-                   bool useL2Norm = false,
-                   bool zeroPressureFixing = false,
-                   const Grid<Real> *curv = nullptr,
-                   const Real surfTens = 0.0,
-                   Grid<Real> *retRhs = nullptr);
-
-//! Main function for fluid guiding , includes "regular" pressure solve
-
-void PD_fluid_guiding(MACGrid &vel,
-                      MACGrid &velT,
-                      Grid<Real> &pressure,
-                      FlagGrid &flags,
-                      Grid<Real> &weight,
-                      int blurRadius = 5,
-                      Real theta = 1.0,
-                      Real tau = 1.0,
-                      Real sigma = 1.0,
-                      Real epsRel = 1e-3,
-                      Real epsAbs = 1e-3,
-                      int maxIters = 200,
-                      Grid<Real> *phi = nullptr,
-                      Grid<Real> *perCellCorr = nullptr,
-                      MACGrid *fractions = nullptr,
-                      MACGrid *obvel = nullptr,
-                      Real gfClamp = 1e-04,
-                      Real cgMaxIterFac = 1.5,
-                      Real cgAccuracy = 1e-3,
-                      int preconditioner = 1,
-                      bool zeroPressureFixing = false,
-                      const Grid<Real> *curv = nullptr,
-                      const Real surfTens = 0.)
-{
-  FluidSolver *parent = vel.getParent();
-
-  // initialize dual/slack variables
-  MACGrid velC = MACGrid(parent);
-  velC.copyFrom(vel);
-  MACGrid x = MACGrid(parent);
-  MACGrid y = MACGrid(parent);
-  MACGrid z = MACGrid(parent);
-  MACGrid x0 = MACGrid(parent);
-  MACGrid z0 = MACGrid(parent);
-
-  // precomputation
-  ADMM_precompute_Separable(blurRadius);
-  MACGrid Q = MACGrid(parent);
-  precomputeQ(Q, flags, velT, velC, gBlurKernel, sigma);
-  MACGrid invA = MACGrid(parent);
-  precomputeInvA(invA, weight, sigma);
-
-  // loop
-  int iter = 0;
-  for (iter = 0; iter < maxIters; iter++) {
-    // x-update
-    x0.copyFrom(x);
-    x.multConst(1.0 / sigma);
-    x.add(y);
-    prox_f(x, flags, Q, velC, sigma, invA);
-    x.multConst(-sigma);
-    x.addScaled(y, sigma);
-    x.add(x0);
-
-    // z-update
-    z0.copyFrom(z);
-    z.addScaled(x, -tau);
-    Real cgAccuracyAdaptive = cgAccuracy;
-
-    solvePressure(z,
-                  pressure,
-                  flags,
-                  cgAccuracyAdaptive,
-                  phi,
-                  perCellCorr,
-                  fractions,
-                  obvel,
-                  gfClamp,
-                  cgMaxIterFac,
-                  true,
-                  preconditioner,
-                  false,
-                  false,
-                  zeroPressureFixing,
-                  curv,
-                  surfTens);
-
-    // y-update
-    y.copyFrom(z);
-    y.sub(z0);
-    y.multConst(theta);
-    y.add(z);
-
-    // stopping criterion
-    bool stop = (iter > 0 && getRNorm(z, z0) < getEpsDual(epsAbs, epsRel, z));
-
-    if (stop || (iter == maxIters - 1))
-      break;
-  }
-
-  // vel_new = z
-  vel.copyFrom(z);
-
-  debMsg("PD_fluid_guiding iterations:" << iter, 1);
-}
-static PyObject *_W_2(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
-{
-  try {
-    PbArgs _args(_linargs, _kwds);
-    FluidSolver *parent = _args.obtainParent();
-    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
-    pbPreparePlugin(parent, "PD_fluid_guiding", !noTiming);
-    PyObject *_retval = nullptr;
-    {
-      ArgLocker _lock;
-      MACGrid &vel = *_args.getPtr<MACGrid>("vel", 0, &_lock);
-      MACGrid &velT = *_args.getPtr<MACGrid>("velT", 1, &_lock);
-      Grid<Real> &pressure = *_args.getPtr<Grid<Real>>("pressure", 2, &_lock);
-      FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 3, &_lock);
-      Grid<Real> &weight = *_args.getPtr<Grid<Real>>("weight", 4, &_lock);
-      int blurRadius = _args.getOpt<int>("blurRadius", 5, 5, &_lock);
-      Real theta = _args.getOpt<Real>("theta", 6, 1.0, &_lock);
-      Real tau = _args.getOpt<Real>("tau", 7, 1.0, &_lock);
-      Real sigma = _args.getOpt<Real>("sigma", 8, 1.0, &_lock);
-      Real epsRel = _args.getOpt<Real>("epsRel", 9, 1e-3, &_lock);
-      Real epsAbs = _args.getOpt<Real>("epsAbs", 10, 1e-3, &_lock);
-      int maxIters = _args.getOpt<int>("maxIters", 11, 200, &_lock);
-      Grid<Real> *phi = _args.getPtrOpt<Grid<Real>>("phi", 12, nullptr, &_lock);
-      Grid<Real> *perCellCorr = _args.getPtrOpt<Grid<Real>>("perCellCorr", 13, nullptr, &_lock);
-      MACGrid *fractions = _args.getPtrOpt<MACGrid>("fractions", 14, nullptr, &_lock);
-      MACGrid *obvel = _args.getPtrOpt<MACGrid>("obvel", 15, nullptr, &_lock);
-      Real gfClamp = _args.getOpt<Real>("gfClamp", 16, 1e-04, &_lock);
-      Real cgMaxIterFac = _args.getOpt<Real>("cgMaxIterFac", 17, 1.5, &_lock);
-      Real cgAccuracy = _args.getOpt<Real>("cgAccuracy", 18, 1e-3, &_lock);
-      int preconditioner = _args.getOpt<int>("preconditioner", 19, 1, &_lock);
-      bool zeroPressureFixing = _args.getOpt<bool>("zeroPressureFixing", 20, false, &_lock);
-      const Grid<Real> *curv = _args.getPtrOpt<Grid<Real>>("curv", 21, nullptr, &_lock);
-      const Real surfTens = _args.getOpt<Real>("surfTens", 22, 0., &_lock);
-      _retval = getPyNone();
-      PD_fluid_guiding(vel,
-                       velT,
-                       pressure,
-                       flags,
-                       weight,
-                       blurRadius,
-                       theta,
-                       tau,
-                       sigma,
-                       epsRel,
-                       epsAbs,
-                       maxIters,
-                       phi,
-                       perCellCorr,
-                       fractions,
-                       obvel,
-                       gfClamp,
-                       cgMaxIterFac,
-                       cgAccuracy,
-                       preconditioner,
-                       zeroPressureFixing,
-                       curv,
-                       surfTens);
-      _args.check();
-    }
-    pbFinalizePlugin(parent, "PD_fluid_guiding", !noTiming);
-    return _retval;
-  }
-  catch (std::exception &e) {
-    pbSetError("PD_fluid_guiding", e.what());
-    return 0;
-  }
-}
-static const Pb::Register _RP_PD_fluid_guiding("", "PD_fluid_guiding", _W_2);
-extern "C" {
-void PbRegister_PD_fluid_guiding()
-{
-  KEEP_UNUSED(_RP_PD_fluid_guiding);
-}
-}
+// TODO (sebbas): PD_fluid_guiding() and its solvePressure() forward declaration are disabled for now.
+// // re-uses main pressure solve from pressure.cpp
+// void solvePressure(
+//         MACGrid& vel, Grid<Real>& pressure, const FlagGrid& flags, Real cgAccuracy = 1e-3,
+// 	const Grid<Real>* phi = nullptr,
+// 	const Grid<Real>* perCellCorr = nullptr,
+// 	const MACGrid* fractions = nullptr,
+// 	const MACGrid* obvel = nullptr,
+// 	Real gfClamp = 1e-04,
+// 	Real cgMaxIterFac = 1.5,
+// 	bool precondition = true,
+// 	int preconditioner = 1,
+// 	bool enforceCompatibility = false,
+// 	bool useL2Norm = false,
+// 	bool zeroPressureFixing = false,
+// 	const Grid<Real> *curv = nullptr,
+// 	const Real surfTens = 0.0,
+// 	Grid<Real>* retRhs = nullptr );
+
+// //! Main function for fluid guiding , includes "regular" pressure solve
+// PYTHON() void PD_fluid_guiding(MACGrid& vel, MACGrid& velT,
+// 	Grid<Real>& pressure, FlagGrid& flags, Grid<Real>& weight, int blurRadius = 5,
+// 	Real theta = 1.0, Real tau = 1.0, Real sigma = 1.0,
+// 	Real epsRel = 1e-3, Real epsAbs = 1e-3, int maxIters = 200,
+// 	// duplicated for pressure solve
+// 	Grid<Real>* phi = nullptr, Grid<Real>* perCellCorr = nullptr, MACGrid* fractions = nullptr,
+// 	MACGrid* obvel = nullptr, Real gfClamp = 1e-04, Real cgMaxIterFac = 1.5, Real cgAccuracy = 1e-3,
+// 	int preconditioner = 1, bool zeroPressureFixing = false, const Grid<Real> *curv = nullptr,
+// 	const Real surfTens = 0.)
+// {
+// 	FluidSolver* parent = vel.getParent();
+
+// 	// initialize dual/slack variables
+// 	MACGrid velC = MACGrid(parent); velC.copyFrom(vel);
+// 	MACGrid x = MACGrid(parent);
+// 	MACGrid y = MACGrid(parent);
+// 	MACGrid z = MACGrid(parent);
+// 	MACGrid x0 = MACGrid(parent);
+// 	MACGrid z0 = MACGrid(parent);
+
+// 	// precomputation
+// 	ADMM_precompute_Separable(blurRadius);
+// 	MACGrid Q = MACGrid(parent);
+// 	precomputeQ(Q, flags, velT, velC, gBlurKernel, sigma);
+// 	MACGrid invA = MACGrid(parent);
+// 	precomputeInvA(invA, weight, sigma);
+
+// 	// loop
+// 	int iter = 0;
+// 	for (iter = 0; iter < maxIters; iter++) {
+// 		// x-update
+// 		x0.copyFrom(x);
+// 		x.multConst(1.0 / sigma);
+// 		x.add(y);
+// 		prox_f(x, flags, Q, velC, sigma, invA);
+// 		x.multConst(-sigma); x.addScaled(y, sigma); x.add(x0);
+
+// 		// z-update
+// 		z0.copyFrom(z);
+// 		z.addScaled(x, -tau);
+// 		Real cgAccuracyAdaptive = cgAccuracy;
+
+// 		solvePressure(z, pressure, flags, cgAccuracyAdaptive, phi, perCellCorr, fractions, obvel,
+// 		              gfClamp, cgMaxIterFac, true, preconditioner, false, false, zeroPressureFixing, curv, surfTens);
+
+// 		// y-update
+// 		y.copyFrom(z);
+// 		y.sub(z0);
+// 		y.multConst(theta);
+// 		y.add(z);
+
+// 		// stopping criterion
+// 		bool stop = (iter > 0 && getRNorm(z, z0) < getEpsDual(epsAbs, epsRel, z));
+
+// 		if (stop || (iter == maxIters - 1)) break;
+// 	}
+
+// 	// vel_new = z
+// 	vel.copyFrom(z);
+
+// 	debMsg("PD_fluid_guiding iterations:" << iter, 1);
+// }
 
 //! reset precomputation
 void releaseBlurPrecomp()
@@ -768,7 +653,7 @@ void releaseBlurPrecomp()
   gBlurKernelRadius = -1;
   gBlurKernel = 0.f;
 }
-static PyObject *_W_3(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
+static PyObject *_W_2(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
 {
   try {
     PbArgs _args(_linargs, _kwds);
@@ -790,7 +675,7 @@ static PyObject *_W_3(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
     return 0;
   }
 }
-static const Pb::Register _RP_releaseBlurPrecomp("", "releaseBlurPrecomp", _W_3);
+static const Pb::Register _RP_releaseBlurPrecomp("", "releaseBlurPrecomp", _W_2);
 extern "C" {
 void PbRegister_releaseBlurPrecomp()
 {
diff --git a/extern/mantaflow/preprocessed/plugin/initplugins.cpp b/extern/mantaflow/preprocessed/plugin/initplugins.cpp
index a0cc2761dab..db507642f68 100644
--- a/extern/mantaflow/preprocessed/plugin/initplugins.cpp
+++ b/extern/mantaflow/preprocessed/plugin/initplugins.cpp
@@ -56,7 +56,7 @@ struct KnApplyNoiseInfl : public KernelBase {
                  const WaveletNoiseField &noise,
                  const Grid<Real> &sdf,
                  Real scale,
-                 Real sigma) const
+                 Real sigma)
   {
     if (!flags.isFluid(i, j, k) || sdf(i, j, k) > sigma)
       return;
@@ -96,37 +96,35 @@ struct KnApplyNoiseInfl : public KernelBase {
     return sigma;
   }
   typedef Real type5;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnApplyNoiseInfl ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, flags, density, noise, sdf, scale, sigma);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, flags, density, noise, sdf, scale, sigma);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, flags, density, noise, sdf, scale, sigma);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, flags, density, noise, sdf, scale, sigma);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   const FlagGrid &flags;
   Grid<Real> &density;
   const WaveletNoiseField &noise;
@@ -202,7 +200,7 @@ struct KnAddNoise : public KernelBase {
                  Grid<Real> &density,
                  const WaveletNoiseField &noise,
                  const Grid<Real> *sdf,
-                 Real scale) const
+                 Real scale)
   {
     if (!flags.isFluid(i, j, k) || (sdf && (*sdf)(i, j, k) > 0.))
       return;
@@ -233,37 +231,35 @@ struct KnAddNoise : public KernelBase {
     return scale;
   }
   typedef Real type4;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnAddNoise ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, flags, density, noise, sdf, scale);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, flags, density, noise, sdf, scale);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, flags, density, noise, sdf, scale);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, flags, density, noise, sdf, scale);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   const FlagGrid &flags;
   Grid<Real> &density;
   const WaveletNoiseField &noise;
@@ -329,7 +325,7 @@ template<class T> struct knSetPdataNoise : public KernelBase {
                  const BasicParticleSystem &parts,
                  ParticleDataImpl<T> &pdata,
                  const WaveletNoiseField &noise,
-                 Real scale) const
+                 Real scale)
   {
     pdata[idx] = noise.evaluate(parts.getPos(idx)) * scale;
   }
@@ -353,21 +349,17 @@ template<class T> struct knSetPdataNoise : public KernelBase {
     return scale;
   }
   typedef Real type3;
-  void runMessage()
-  {
-    debMsg("Executing kernel knSetPdataNoise ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, parts, pdata, noise, scale);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, parts, pdata, noise, scale);
+    }
   }
   const BasicParticleSystem &parts;
   ParticleDataImpl<T> &pdata;
@@ -389,7 +381,7 @@ template<class T> struct knSetPdataNoiseVec : public KernelBase {
                  const BasicParticleSystem &parts,
                  ParticleDataImpl<T> &pdata,
                  const WaveletNoiseField &noise,
-                 Real scale) const
+                 Real scale)
   {
     pdata[idx] = noise.evaluateVec(parts.getPos(idx)) * scale;
   }
@@ -413,21 +405,17 @@ template<class T> struct knSetPdataNoiseVec : public KernelBase {
     return scale;
   }
   typedef Real type3;
-  void runMessage()
-  {
-    debMsg("Executing kernel knSetPdataNoiseVec ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, parts, pdata, noise, scale);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, parts, pdata, noise, scale);
+    }
   }
   const BasicParticleSystem &parts;
   ParticleDataImpl<T> &pdata;
@@ -688,7 +676,7 @@ struct KnApplyEmission : public KernelBase {
                  const Grid<Real> &source,
                  const Grid<Real> *emissionTexture,
                  bool isAbsolute,
-                 int type) const
+                 int type)
   {
     // if type is given, only apply emission when celltype matches type from flaggrid
     // and if emission texture is given, only apply emission when some emission is present at cell
@@ -733,37 +721,35 @@ struct KnApplyEmission : public KernelBase {
     return type;
   }
   typedef int type5;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnApplyEmission ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, flags, target, source, emissionTexture, isAbsolute, type);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, flags, target, source, emissionTexture, isAbsolute, type);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, flags, target, source, emissionTexture, isAbsolute, type);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, flags, target, source, emissionTexture, isAbsolute, type);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   const FlagGrid &flags;
   Grid<Real> &target;
   const Grid<Real> &source;
@@ -837,7 +823,7 @@ struct KnApplyDensity : public KernelBase {
                  Grid<Real> &density,
                  const Grid<Real> &sdf,
                  Real value,
-                 Real sigma) const
+                 Real sigma)
   {
     if (!flags.isFluid(i, j, k) || sdf(i, j, k) > sigma)
       return;
@@ -868,37 +854,35 @@ struct KnApplyDensity : public KernelBase {
     return sigma;
   }
   typedef Real type4;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnApplyDensity ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, flags, density, sdf, value, sigma);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, flags, density, sdf, value, sigma);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, flags, density, sdf, value, sigma);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, flags, density, sdf, value, sigma);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   const FlagGrid &flags;
   Grid<Real> &density;
   const Grid<Real> &sdf;
@@ -1041,7 +1025,7 @@ struct KnResetInObstacle : public KernelBase {
                  Grid<Real> *red,
                  Grid<Real> *green,
                  Grid<Real> *blue,
-                 Real resetValue) const
+                 Real resetValue)
   {
     if (!flags.isObstacle(i, j, k))
       return;
@@ -1115,37 +1099,35 @@ struct KnResetInObstacle : public KernelBase {
     return resetValue;
   }
   typedef Real type9;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnResetInObstacle ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, flags, vel, density, heat, fuel, flame, red, green, blue, resetValue);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, flags, vel, density, heat, fuel, flame, red, green, blue, resetValue);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, flags, vel, density, heat, fuel, flame, red, green, blue, resetValue);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, flags, vel, density, heat, fuel, flame, red, green, blue, resetValue);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   FlagGrid &flags;
   MACGrid &vel;
   Grid<Real> *density;
@@ -1692,7 +1674,7 @@ struct KnUpdateFractions : public KernelBase {
                  const Grid<Real> &phiObs,
                  MACGrid &fractions,
                  const int &boundaryWidth,
-                 const Real fracThreshold) const
+                 const Real fracThreshold)
   {
 
     // walls at domain bounds and inner objects
@@ -1787,37 +1769,35 @@ struct KnUpdateFractions : public KernelBase {
     return fracThreshold;
   }
   typedef Real type4;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnUpdateFractions ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, flags, phiObs, fractions, boundaryWidth, fracThreshold);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, flags, phiObs, fractions, boundaryWidth, fracThreshold);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, flags, phiObs, fractions, boundaryWidth, fracThreshold);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, flags, phiObs, fractions, boundaryWidth, fracThreshold);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   const FlagGrid &flags;
   const Grid<Real> &phiObs;
   MACGrid &fractions;
@@ -1896,7 +1876,7 @@ struct KnUpdateFlagsObs : public KernelBase {
                  const Grid<Real> &phiObs,
                  const Grid<Real> *phiOut,
                  const Grid<Real> *phiIn,
-                 int boundaryWidth) const
+                 int boundaryWidth)
   {
 
     bool isObs = false;
@@ -1964,37 +1944,35 @@ struct KnUpdateFlagsObs : public KernelBase {
     return boundaryWidth;
   }
   typedef int type5;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnUpdateFlagsObs ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = boundaryWidth; j < _maxY; j++)
-          for (int i = boundaryWidth; i < _maxX; i++)
-            op(i, j, k, flags, fractions, phiObs, phiOut, phiIn, boundaryWidth);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = boundaryWidth; j < _maxY; j++)
+            for (int i = boundaryWidth; i < _maxX; i++)
+              op(i, j, k, flags, fractions, phiObs, phiOut, phiIn, boundaryWidth);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = boundaryWidth; i < _maxX; i++)
-          op(i, j, k, flags, fractions, phiObs, phiOut, phiIn, boundaryWidth);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = boundaryWidth; j < _maxY; j++)
+          for (int i = boundaryWidth; i < _maxX; i++)
+            op(i, j, k, flags, fractions, phiObs, phiOut, phiIn, boundaryWidth);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(boundaryWidth, maxY), *this);
-  }
   FlagGrid &flags;
   const MACGrid *fractions;
   const Grid<Real> &phiObs;
@@ -2067,7 +2045,7 @@ struct kninitVortexVelocity : public KernelBase {
                  const Grid<Real> &phiObs,
                  MACGrid &vel,
                  const Vec3 &center,
-                 const Real &radius) const
+                 const Real &radius)
   {
 
     if (phiObs(i, j, k) >= -1.) {
@@ -2115,37 +2093,35 @@ struct kninitVortexVelocity : public KernelBase {
     return radius;
   }
   typedef Real type3;
-  void runMessage()
-  {
-    debMsg("Executing kernel kninitVortexVelocity ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, phiObs, vel, center, radius);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, phiObs, vel, center, radius);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, phiObs, vel, center, radius);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, phiObs, vel, center, radius);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   const Grid<Real> &phiObs;
   MACGrid &vel;
   const Vec3 &center;
@@ -2318,7 +2294,7 @@ template<class T> struct knBlurGrid : public KernelBase {
                  Grid<T> &originGrid,
                  Grid<T> &targetGrid,
                  GaussianKernelCreator &gkSigma,
-                 int cdir) const
+                 int cdir)
   {
     targetGrid(i, j, k) = convolveGrid<T>(originGrid, gkSigma, Vec3(i, j, k), cdir);
   }
@@ -2342,37 +2318,35 @@ template<class T> struct knBlurGrid : public KernelBase {
     return cdir;
   }
   typedef int type3;
-  void runMessage()
-  {
-    debMsg("Executing kernel knBlurGrid ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, originGrid, targetGrid, gkSigma, cdir);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, originGrid, targetGrid, gkSigma, cdir);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, originGrid, targetGrid, gkSigma, cdir);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, originGrid, targetGrid, gkSigma, cdir);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   Grid<T> &originGrid;
   Grid<T> &targetGrid;
   GaussianKernelCreator &gkSigma;
@@ -2412,7 +2386,7 @@ struct KnBlurMACGridGauss : public KernelBase {
                  MACGrid &originGrid,
                  MACGrid &target,
                  GaussianKernelCreator &gkSigma,
-                 int cdir) const
+                 int cdir)
   {
     Vec3 pos(i, j, k);
     Vec3 step(1.0, 0.0, 0.0);
@@ -2462,37 +2436,35 @@ struct KnBlurMACGridGauss : public KernelBase {
     return cdir;
   }
   typedef int type3;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnBlurMACGridGauss ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, originGrid, target, gkSigma, cdir);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, originGrid, target, gkSigma, cdir);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, originGrid, target, gkSigma, cdir);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, originGrid, target, gkSigma, cdir);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   MACGrid &originGrid;
   MACGrid &target;
   GaussianKernelCreator &gkSigma;
diff --git a/extern/mantaflow/preprocessed/plugin/kepsilon.cpp b/extern/mantaflow/preprocessed/plugin/kepsilon.cpp
index 32425a5756e..c5711b95242 100644
--- a/extern/mantaflow/preprocessed/plugin/kepsilon.cpp
+++ b/extern/mantaflow/preprocessed/plugin/kepsilon.cpp
@@ -61,7 +61,7 @@ struct KnTurbulenceClamp : public KernelBase {
                  Real minK,
                  Real maxK,
                  Real minNu,
-                 Real maxNu) const
+                 Real maxNu)
   {
     Real eps = egrid[idx];
     Real ke = clamp(kgrid[idx], minK, maxK);
@@ -104,21 +104,17 @@ struct KnTurbulenceClamp : public KernelBase {
     return maxNu;
   }
   typedef Real type5;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnTurbulenceClamp ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, kgrid, egrid, minK, maxK, minNu, maxNu);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, kgrid, egrid, minK, maxK, minNu, maxNu);
+    }
   }
   Grid<Real> &kgrid;
   Grid<Real> &egrid;
@@ -163,7 +159,7 @@ struct KnComputeProduction : public KernelBase {
                  Grid<Real> &prod,
                  Grid<Real> &nuT,
                  Grid<Real> *strain,
-                 Real pscale = 1.0f) const
+                 Real pscale = 1.0f)
   {
     Real curEps = eps(i, j, k);
     if (curEps > 0) {
@@ -234,37 +230,35 @@ struct KnComputeProduction : public KernelBase {
     return pscale;
   }
   typedef Real type7;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnComputeProduction ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, vel, velCenter, ke, eps, prod, nuT, strain, pscale);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, vel, velCenter, ke, eps, prod, nuT, strain, pscale);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, vel, velCenter, ke, eps, prod, nuT, strain, pscale);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, vel, velCenter, ke, eps, prod, nuT, strain, pscale);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   const MACGrid &vel;
   const Grid<Vec3> &velCenter;
   const Grid<Real> &ke;
@@ -345,7 +339,7 @@ struct KnAddTurbulenceSource : public KernelBase {
     run();
   }
   inline void op(
-      IndexInt idx, Grid<Real> &kgrid, Grid<Real> &egrid, const Grid<Real> &pgrid, Real dt) const
+      IndexInt idx, Grid<Real> &kgrid, Grid<Real> &egrid, const Grid<Real> &pgrid, Real dt)
   {
     Real eps = egrid[idx], prod = pgrid[idx], ke = kgrid[idx];
     if (ke <= 0)
@@ -379,21 +373,17 @@ struct KnAddTurbulenceSource : public KernelBase {
     return dt;
   }
   typedef Real type3;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnAddTurbulenceSource ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, kgrid, egrid, pgrid, dt);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, kgrid, egrid, pgrid, dt);
+    }
   }
   Grid<Real> &kgrid;
   Grid<Real> &egrid;
diff --git a/extern/mantaflow/preprocessed/plugin/pressure.cpp b/extern/mantaflow/preprocessed/plugin/pressure.cpp
index 593aeb16859..4674bf4a7bf 100644
--- a/extern/mantaflow/preprocessed/plugin/pressure.cpp
+++ b/extern/mantaflow/preprocessed/plugin/pressure.cpp
@@ -138,9 +138,10 @@ struct MakeRhs : public KernelBase {
       }
     }
 
+    // TODO (sebbas): The correction below is disabled for now, so perCellCorr is not added to set
     // per cell divergence correction (optional)
-    if (perCellCorr)
-      set += perCellCorr->get(i, j, k);
+    // if(perCellCorr)
+    // 	set += perCellCorr->get(i,j,k);
 
     // obtain sum, cell count
     sum += set;
@@ -198,19 +199,61 @@ struct MakeRhs : public KernelBase {
     return gfClamp;
   }
   typedef Real type9;
-  void runMessage()
-  {
-    debMsg("Executing kernel MakeRhs ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
+  void runMessage(){};
+  // Runs op() for every cell of the kernel range inside an OpenMP target region.
+  // The kernel arguments are re-bound to locals via getArg0()..getArg9() before
+  // each loop nest; cnt and sum are accumulated through the reduction clause.
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
+      const FlagGrid &flags = getArg0();
+      Grid<Real> &rhs = getArg1();
+      const MACGrid &vel = getArg2();
+      const Grid<Real> *perCellCorr = getArg3();
+      const MACGrid *fractions = getArg4();
+      const MACGrid *obvel = getArg5();
+      const Grid<Real> *phi = getArg6();
+      const Grid<Real> *curv = getArg7();
+      const Real &surfTens = getArg8();
+      const Real &gfClamp = getArg9();
+      // The loop nest follows the directive directly: an OpenMP loop directive has
+      // to be associated with for loops, not with a brace-enclosed block.
+#pragma omp target teams distribute parallel for reduction(+:cnt, sum) collapse(3) schedule(static,1)
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i,
+                 j,
+                 k,
+                 flags,
+                 rhs,
+                 vel,
+                 perCellCorr,
+                 fractions,
+                 obvel,
+                 phi,
+                 curv,
+                 surfTens,
+                 gfClamp,
+                 cnt,
+                 sum);
+    }
+    else {
+      const int k = 0;
+      const FlagGrid &flags = getArg0();
+      Grid<Real> &rhs = getArg1();
+      const MACGrid &vel = getArg2();
+      const Grid<Real> *perCellCorr = getArg3();
+      const MACGrid *fractions = getArg4();
+      const MACGrid *obvel = getArg5();
+      const Grid<Real> *phi = getArg6();
+      const Grid<Real> *curv = getArg7();
+      const Real &surfTens = getArg8();
+      const Real &gfClamp = getArg9();
+      // 2D case: k stays 0 and only the j/i loop nest is collapsed and offloaded.
+#pragma omp target teams distribute parallel for reduction(+:cnt, sum) collapse(2) schedule(static,1)
         for (int j = 1; j < _maxY; j++)
           for (int i = 1; i < _maxX; i++)
             op(i,
@@ -228,55 +271,11 @@ struct MakeRhs : public KernelBase {
                gfClamp,
                cnt,
                sum);
+      // No copy-back follows the loop: the cnt and sum named in the reduction
+      // clause are this kernel's own members (run() declares no locals with
+      // those names), so the reduction clause leaves the combined totals in
+      // them when the target region ends.
     }
-    else {
-      const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i,
-             j,
-             k,
-             flags,
-             rhs,
-             vel,
-             perCellCorr,
-             fractions,
-             obvel,
-             phi,
-             curv,
-             surfTens,
-             gfClamp,
-             cnt,
-             sum);
-    }
-  }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_reduce(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_reduce(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
-  MakeRhs(MakeRhs &o, tbb::split)
-      : KernelBase(o),
-        flags(o.flags),
-        rhs(o.rhs),
-        vel(o.vel),
-        perCellCorr(o.perCellCorr),
-        fractions(o.fractions),
-        obvel(o.obvel),
-        phi(o.phi),
-        curv(o.curv),
-        surfTens(o.surfTens),
-        gfClamp(o.gfClamp),
-        cnt(0),
-        sum(0)
-  {
-  }
-  void join(const MakeRhs &o)
-  {
-    cnt += o.cnt;
-    sum += o.sum;
   }
   const FlagGrid &flags;
   Grid<Real> &rhs;
@@ -302,7 +301,7 @@ struct knCorrectVelocity : public KernelBase {
     run();
   }
   inline void op(
-      int i, int j, int k, const FlagGrid &flags, MACGrid &vel, const Grid<Real> &pressure) const
+      int i, int j, int k, const FlagGrid &flags, MACGrid &vel, const Grid<Real> &pressure)
   {
     const IndexInt idx = flags.index(i, j, k);
     if (flags.isFluid(idx)) {
@@ -353,37 +352,36 @@ struct knCorrectVelocity : public KernelBase {
     return pressure;
   }
   typedef Grid<Real> type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel knCorrectVelocity ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  // Runs op() for every cell of the kernel range inside an OpenMP target region,
+  // with flags, vel and pressure re-bound to local references via getArg0..2().
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, flags, vel, pressure);
+      const FlagGrid &flags = getArg0();
+      MACGrid &vel = getArg1();
+      const Grid<Real> &pressure = getArg2();
+      // The loop nest follows the directive directly: an OpenMP loop directive has
+      // to be associated with for loops, not with a brace-enclosed block.
+#pragma omp target teams distribute parallel for collapse(3) schedule(static, 1)
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, flags, vel, pressure);
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, flags, vel, pressure);
+      const FlagGrid &flags = getArg0();
+      MACGrid &vel = getArg1();
+      const Grid<Real> &pressure = getArg2();
+#pragma omp target teams distribute parallel for collapse(2) schedule(static, 1)
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, flags, vel, pressure);
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   const FlagGrid &flags;
   MACGrid &vel;
   const Grid<Real> &pressure;
@@ -441,7 +439,7 @@ struct ApplyGhostFluidDiagonal : public KernelBase {
                  Grid<Real> &A0,
                  const FlagGrid &flags,
                  const Grid<Real> &phi,
-                 const Real gfClamp) const
+                 const Real gfClamp)
   {
     const int X = flags.getStrideX(), Y = flags.getStrideY(), Z = flags.getStrideZ();
     const IndexInt idx = flags.index(i, j, k);
@@ -483,37 +481,35 @@ struct ApplyGhostFluidDiagonal : public KernelBase {
     return gfClamp;
   }
   typedef Real type3;
-  void runMessage()
-  {
-    debMsg("Executing kernel ApplyGhostFluidDiagonal ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, A0, flags, phi, gfClamp);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, A0, flags, phi, gfClamp);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, A0, flags, phi, gfClamp);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, A0, flags, phi, gfClamp);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   Grid<Real> &A0;
   const FlagGrid &flags;
   const Grid<Real> &phi;
@@ -551,7 +547,7 @@ struct knCorrectVelocityGhostFluid : public KernelBase {
                  const Grid<Real> &phi,
                  Real gfClamp,
                  const Grid<Real> *curv,
-                 const Real surfTens) const
+                 const Real surfTens)
   {
     const IndexInt X = flags.getStrideX(), Y = flags.getStrideY(), Z = flags.getStrideZ();
     const IndexInt idx = flags.index(i, j, k);
@@ -640,37 +636,35 @@ struct knCorrectVelocityGhostFluid : public KernelBase {
     return surfTens;
   }
   typedef Real type6;
-  void runMessage()
-  {
-    debMsg("Executing kernel knCorrectVelocityGhostFluid ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, vel, flags, pressure, phi, gfClamp, curv, surfTens);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, vel, flags, pressure, phi, gfClamp, curv, surfTens);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, vel, flags, pressure, phi, gfClamp, curv, surfTens);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, vel, flags, pressure, phi, gfClamp, curv, surfTens);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   MACGrid &vel;
   const FlagGrid &flags;
   const Grid<Real> &pressure;
@@ -710,7 +704,7 @@ struct knReplaceClampedGhostFluidVels : public KernelBase {
                  const FlagGrid &flags,
                  const Grid<Real> &pressure,
                  const Grid<Real> &phi,
-                 Real gfClamp) const
+                 Real gfClamp)
   {
     const IndexInt idx = flags.index(i, j, k);
     const IndexInt X = flags.getStrideX(), Y = flags.getStrideY(), Z = flags.getStrideZ();
@@ -758,37 +752,35 @@ struct knReplaceClampedGhostFluidVels : public KernelBase {
     return gfClamp;
   }
   typedef Real type4;
-  void runMessage()
-  {
-    debMsg("Executing kernel knReplaceClampedGhostFluidVels ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, vel, flags, pressure, phi, gfClamp);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, vel, flags, pressure, phi, gfClamp);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, vel, flags, pressure, phi, gfClamp);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, vel, flags, pressure, phi, gfClamp);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   MACGrid &vel;
   const FlagGrid &flags;
   const Grid<Real> &pressure;
@@ -822,28 +814,21 @@ struct CountEmptyCells : public KernelBase {
     return flags;
   }
   typedef FlagGrid type0;
-  void runMessage()
-  {
-    debMsg("Executing kernel CountEmptyCells ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, flags, numEmpty);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  CountEmptyCells(CountEmptyCells &o, tbb::split) : KernelBase(o), flags(o.flags), numEmpty(0)
-  {
-  }
-  void join(const CountEmptyCells &o)
-  {
-    numEmpty += o.numEmpty;
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      int numEmpty = 0;
+#pragma omp for nowait
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, flags, numEmpty);
+#pragma omp critical
+      {
+        this->numEmpty += numEmpty;
+      }
+    }
   }
   const FlagGrid &flags;
   int numEmpty;
@@ -964,11 +949,19 @@ void computePressureRhs(Grid<Real> &rhs,
                         const Real surfTens = 0.)
 {
   // compute divergence and init right hand side
-  MakeRhs kernMakeRhs(
-      flags, rhs, vel, perCellCorr, fractions, obvel, phi, curv, surfTens, gfClamp);
-
-  if (enforceCompatibility)
-    rhs += (Real)(-kernMakeRhs.sum / (Real)kernMakeRhs.cnt);
+  // auto kernMakeRhs = new MakeRhs(flags, rhs, vel, perCellCorr, fractions, obvel, phi, curv,
+  // surfTens, gfClamp );
+  debMsg("computePressureRhs: pressure = " << (const void *)pressure.mData
+                                           << ", flags = " << (const void *)flags.mData
+                                           << ", rhs = " << (const void *)rhs.mData
+                                           << ", vel = " << (const void *)vel.mData,
+         4);
+  MakeRhs(flags, rhs, vel, perCellCorr, fractions, obvel, phi, curv, surfTens, gfClamp);
+
+  // TODO (sebbas): Disabled for now
+  // if(enforceCompatibility)
+  // 	rhs += (Real)(-kernMakeRhs->sum / (Real)kernMakeRhs->cnt);
+  // delete kernMakeRhs;
 }
 static PyObject *_W_1(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
 {
@@ -1050,6 +1043,13 @@ void solvePressureSystem(Grid<Real> &rhs,
                          MACGrid &vel,
                          Grid<Real> &pressure,
                          const FlagGrid &flags,
+                         Grid<Real> *residual = nullptr,
+                         Grid<Real> *search = nullptr,
+                         Grid<Real> *A0 = nullptr,
+                         Grid<Real> *Ai = nullptr,
+                         Grid<Real> *Aj = nullptr,
+                         Grid<Real> *Ak = nullptr,
+                         Grid<Real> *tmp = nullptr,
                          Real cgAccuracy = 1e-3,
                          const Grid<Real> *phi = nullptr,
                          const Grid<Real> *perCellCorr = nullptr,
@@ -1069,19 +1069,37 @@ void solvePressureSystem(Grid<Real> &rhs,
 
   // reserve temp grids
   FluidSolver *parent = flags.getParent();
-  Grid<Real> residual(parent);
-  Grid<Real> search(parent);
-  Grid<Real> A0(parent);
-  Grid<Real> Ai(parent);
-  Grid<Real> Aj(parent);
-  Grid<Real> Ak(parent);
-  Grid<Real> tmp(parent);
+
+  bool cleanUp = false;
+  if (!residual || !search || !A0 || !Ai || !Aj || !Ak || !tmp) {
+    residual = new Grid<Real>(parent, true, false, true);
+    search = new Grid<Real>(parent, true, false, true);
+    A0 = new Grid<Real>(parent, true, false, true);
+    Ai = new Grid<Real>(parent, true, false, true);
+    Aj = new Grid<Real>(parent, true, false, true);
+    Ak = new Grid<Real>(parent, true, false, true);
+    tmp = new Grid<Real>(parent, true, false, true);
+    cleanUp = true;
+  }
+  else {
+    residual->clear(true);
+    search->clear(true);
+    A0->clear(true);
+    Ai->clear(true);
+    Aj->clear(true);
+    Ak->clear(true);
+    tmp->clear(true);
+  }
+
+  debMsg("solvePressureSystem: temporary grids ready", 4);
 
   // setup matrix and boundaries
-  MakeLaplaceMatrix(flags, A0, Ai, Aj, Ak, fractions);
+  MakeLaplaceMatrix(flags, *A0, *Ai, *Aj, *Ak, fractions);
+  // MakeLaplaceMatrix(flags, A0, Ai, Aj, Ak, fractions);
 
+  // ghost fluid: adjust the matrix diagonal when a phi grid is provided
   if (phi) {
-    ApplyGhostFluidDiagonal(A0, flags, *phi, gfClamp);
+    ApplyGhostFluidDiagonal(*A0, flags, *phi, gfClamp);
   }
 
   // check whether we need to fix some pressure value...
@@ -1125,7 +1143,8 @@ void solvePressureSystem(Grid<Real> &rhs,
       // debMsg("No empty cells! Fixing pressure of cell "<<fixPidx<<" to zero",1);
     }
     if (fixPidx >= 0) {
-      fixPressure(fixPidx, Real(0), rhs, A0, Ai, Aj, Ak);
+      fixPressure(fixPidx, Real(0), rhs, *A0, *Ai, *Aj, *Ak);
+      // fixPressure(fixPidx, Real(0), rhs, A0, Ai, Aj, Ak);
       static bool msgOnce = false;
       if (!msgOnce) {
         debMsg("Pinning pressure of cell " << fixPidx << " to zero", 2);
@@ -1133,20 +1152,21 @@ void solvePressureSystem(Grid<Real> &rhs,
       }
     }
   }
+  debMsg("solvePressureSystem: matrix setup done", 4);
 
   // CG setup
   // note: the last factor increases the max iterations for 2d, which right now can't use a
   // preconditioner
   GridCgInterface *gcg;
-  vector<Grid<Real> *> matA{&A0, &Ai, &Aj};
-
-  if (vel.is3D()) {
-    matA.push_back(&Ak);
-    gcg = new GridCg<ApplyMatrix>(pressure, rhs, residual, search, flags, tmp, matA);
-  }
-  else {
-    gcg = new GridCg<ApplyMatrix2D>(pressure, rhs, residual, search, flags, tmp, matA);
-  }
+  if (vel.is3D())
+    gcg = new GridCg<ApplyMatrix>(pressure, rhs, *residual, *search, flags, *tmp, A0, Ai, Aj, Ak);
+  // gcg = new GridCg<ApplyMatrix>  (pressure, rhs, residual, search, flags, tmp, &A0, &Ai, &Aj,
+  // &Ak);
+  else
+    gcg = new GridCg<ApplyMatrix2D>(
+        pressure, rhs, *residual, *search, flags, *tmp, A0, Ai, Aj, Ak);
+  // gcg = new GridCg<ApplyMatrix2D>(pressure, rhs, residual, search, flags, tmp, &A0, &Ai, &Aj,
+  // &Ak);
 
   gcg->setAccuracy(cgAccuracy);
   gcg->setUseL2Norm(useL2Norm);
@@ -1155,6 +1175,7 @@ void solvePressureSystem(Grid<Real> &rhs,
 
   Grid<Real> *pca0 = nullptr, *pca1 = nullptr, *pca2 = nullptr, *pca3 = nullptr;
   GridMg *pmg = nullptr;
+  debMsg("solvePressureSystem: CG solver created", 4);
 
   // optional preconditioning
   if (preconditioner == PcMIC) {
@@ -1180,10 +1201,12 @@ void solvePressureSystem(Grid<Real> &rhs,
 
     gcg->setMGPreconditioner(GridCgInterface::PC_MGP, pmg);
   }
+  debMsg("solvePressureSystem: preconditioner setup done", 4);
 
   // CG solve
+  Real time = 0;
   for (int iter = 0; iter < maxIter; iter++) {
-    if (!gcg->iterate())
+    if (!gcg->iterate(time))
       iter = maxIter;
     if (iter < maxIter)
       debMsg("FluidSolver::solvePressure iteration " << iter
@@ -1193,8 +1216,26 @@ void solvePressureSystem(Grid<Real> &rhs,
   debMsg("FluidSolver::solvePressure done. Iterations:" << gcg->getIterations()
                                                         << ", residual:" << gcg->getResNorm(),
          2);
+  // std::cout << "TIME: " << time << std::endl;
 
   // Cleanup
+  if (cleanUp) {
+    if (residual)
+      delete residual;
+    if (search)
+      delete search;
+    if (A0)
+      delete A0;
+    if (Ai)
+      delete Ai;
+    if (Aj)
+      delete Aj;
+    if (Ak)
+      delete Ak;
+    if (tmp)
+      delete tmp;
+  }
+
   if (gcg)
     delete gcg;
   if (pca0)
@@ -1225,26 +1266,40 @@ static PyObject *_W_2(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
       MACGrid &vel = *_args.getPtr<MACGrid>("vel", 1, &_lock);
       Grid<Real> &pressure = *_args.getPtr<Grid<Real>>("pressure", 2, &_lock);
       const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 3, &_lock);
-      Real cgAccuracy = _args.getOpt<Real>("cgAccuracy", 4, 1e-3, &_lock);
-      const Grid<Real> *phi = _args.getPtrOpt<Grid<Real>>("phi", 5, nullptr, &_lock);
+      Grid<Real> *residual = _args.getPtrOpt<Grid<Real>>("residual", 4, nullptr, &_lock);
+      Grid<Real> *search = _args.getPtrOpt<Grid<Real>>("search", 5, nullptr, &_lock);
+      Grid<Real> *A0 = _args.getPtrOpt<Grid<Real>>("A0", 6, nullptr, &_lock);
+      Grid<Real> *Ai = _args.getPtrOpt<Grid<Real>>("Ai", 7, nullptr, &_lock);
+      Grid<Real> *Aj = _args.getPtrOpt<Grid<Real>>("Aj", 8, nullptr, &_lock);
+      Grid<Real> *Ak = _args.getPtrOpt<Grid<Real>>("Ak", 9, nullptr, &_lock);
+      Grid<Real> *tmp = _args.getPtrOpt<Grid<Real>>("tmp", 10, nullptr, &_lock);
+      Real cgAccuracy = _args.getOpt<Real>("cgAccuracy", 11, 1e-3, &_lock);
+      const Grid<Real> *phi = _args.getPtrOpt<Grid<Real>>("phi", 12, nullptr, &_lock);
       const Grid<Real> *perCellCorr = _args.getPtrOpt<Grid<Real>>(
-          "perCellCorr", 6, nullptr, &_lock);
-      const MACGrid *fractions = _args.getPtrOpt<MACGrid>("fractions", 7, nullptr, &_lock);
-      Real gfClamp = _args.getOpt<Real>("gfClamp", 8, 1e-04, &_lock);
-      Real cgMaxIterFac = _args.getOpt<Real>("cgMaxIterFac", 9, 1.5, &_lock);
-      bool precondition = _args.getOpt<bool>("precondition", 10, true, &_lock);
-      int preconditioner = _args.getOpt<int>("preconditioner", 11, PcMIC, &_lock);
+          "perCellCorr", 13, nullptr, &_lock);
+      const MACGrid *fractions = _args.getPtrOpt<MACGrid>("fractions", 14, nullptr, &_lock);
+      Real gfClamp = _args.getOpt<Real>("gfClamp", 15, 1e-04, &_lock);
+      Real cgMaxIterFac = _args.getOpt<Real>("cgMaxIterFac", 16, 1.5, &_lock);
+      bool precondition = _args.getOpt<bool>("precondition", 17, true, &_lock);
+      int preconditioner = _args.getOpt<int>("preconditioner", 18, PcMIC, &_lock);
       const bool enforceCompatibility = _args.getOpt<bool>(
-          "enforceCompatibility", 12, false, &_lock);
-      const bool useL2Norm = _args.getOpt<bool>("useL2Norm", 13, false, &_lock);
-      const bool zeroPressureFixing = _args.getOpt<bool>("zeroPressureFixing", 14, false, &_lock);
-      const Grid<Real> *curv = _args.getPtrOpt<Grid<Real>>("curv", 15, nullptr, &_lock);
-      const Real surfTens = _args.getOpt<Real>("surfTens", 16, 0., &_lock);
+          "enforceCompatibility", 19, false, &_lock);
+      const bool useL2Norm = _args.getOpt<bool>("useL2Norm", 20, false, &_lock);
+      const bool zeroPressureFixing = _args.getOpt<bool>("zeroPressureFixing", 21, false, &_lock);
+      const Grid<Real> *curv = _args.getPtrOpt<Grid<Real>>("curv", 22, nullptr, &_lock);
+      const Real surfTens = _args.getOpt<Real>("surfTens", 23, 0., &_lock);
       _retval = getPyNone();
       solvePressureSystem(rhs,
                           vel,
                           pressure,
                           flags,
+                          residual,
+                          search,
+                          A0,
+                          Ai,
+                          Aj,
+                          Ak,
+                          tmp,
                           cgAccuracy,
                           phi,
                           perCellCorr,
@@ -1370,6 +1425,13 @@ void PbRegister_correctVelocity()
 void solvePressure(MACGrid &vel,
                    Grid<Real> &pressure,
                    const FlagGrid &flags,
+                   Grid<Real> *residual = nullptr,
+                   Grid<Real> *search = nullptr,
+                   Grid<Real> *A0 = nullptr,
+                   Grid<Real> *Ai = nullptr,
+                   Grid<Real> *Aj = nullptr,
+                   Grid<Real> *Ak = nullptr,
+                   Grid<Real> *tmp = nullptr,
                    Real cgAccuracy = 1e-3,
                    const Grid<Real> *phi = nullptr,
                    const Grid<Real> *perCellCorr = nullptr,
@@ -1411,6 +1473,13 @@ void solvePressure(MACGrid &vel,
                       vel,
                       pressure,
                       flags,
+                      residual,
+                      search,
+                      A0,
+                      Ai,
+                      Aj,
+                      Ak,
+                      tmp,
                       cgAccuracy,
                       phi,
                       perCellCorr,
@@ -1442,10 +1511,11 @@ void solvePressure(MACGrid &vel,
                   curv,
                   surfTens);
 
+  // TODO (sebbas): Disabled for now
   // optionally , return RHS
-  if (retRhs) {
-    retRhs->copyFrom(rhs);
-  }
+  // if(retRhs) {
+  // 	retRhs->copyFrom(rhs);
+  // }
 }
 static PyObject *_W_4(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
 {
@@ -1460,26 +1530,40 @@ static PyObject *_W_4(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
       MACGrid &vel = *_args.getPtr<MACGrid>("vel", 0, &_lock);
       Grid<Real> &pressure = *_args.getPtr<Grid<Real>>("pressure", 1, &_lock);
       const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 2, &_lock);
-      Real cgAccuracy = _args.getOpt<Real>("cgAccuracy", 3, 1e-3, &_lock);
-      const Grid<Real> *phi = _args.getPtrOpt<Grid<Real>>("phi", 4, nullptr, &_lock);
+      Grid<Real> *residual = _args.getPtrOpt<Grid<Real>>("residual", 3, nullptr, &_lock);
+      Grid<Real> *search = _args.getPtrOpt<Grid<Real>>("search", 4, nullptr, &_lock);
+      Grid<Real> *A0 = _args.getPtrOpt<Grid<Real>>("A0", 5, nullptr, &_lock);
+      Grid<Real> *Ai = _args.getPtrOpt<Grid<Real>>("Ai", 6, nullptr, &_lock);
+      Grid<Real> *Aj = _args.getPtrOpt<Grid<Real>>("Aj", 7, nullptr, &_lock);
+      Grid<Real> *Ak = _args.getPtrOpt<Grid<Real>>("Ak", 8, nullptr, &_lock);
+      Grid<Real> *tmp = _args.getPtrOpt<Grid<Real>>("tmp", 9, nullptr, &_lock);
+      Real cgAccuracy = _args.getOpt<Real>("cgAccuracy", 10, 1e-3, &_lock);
+      const Grid<Real> *phi = _args.getPtrOpt<Grid<Real>>("phi", 11, nullptr, &_lock);
       const Grid<Real> *perCellCorr = _args.getPtrOpt<Grid<Real>>(
-          "perCellCorr", 5, nullptr, &_lock);
-      const MACGrid *fractions = _args.getPtrOpt<MACGrid>("fractions", 6, nullptr, &_lock);
-      const MACGrid *obvel = _args.getPtrOpt<MACGrid>("obvel", 7, nullptr, &_lock);
-      Real gfClamp = _args.getOpt<Real>("gfClamp", 8, 1e-04, &_lock);
-      Real cgMaxIterFac = _args.getOpt<Real>("cgMaxIterFac", 9, 1.5, &_lock);
-      bool precondition = _args.getOpt<bool>("precondition", 10, true, &_lock);
-      int preconditioner = _args.getOpt<int>("preconditioner", 11, PcMIC, &_lock);
-      bool enforceCompatibility = _args.getOpt<bool>("enforceCompatibility", 12, false, &_lock);
-      bool useL2Norm = _args.getOpt<bool>("useL2Norm", 13, false, &_lock);
-      bool zeroPressureFixing = _args.getOpt<bool>("zeroPressureFixing", 14, false, &_lock);
-      const Grid<Real> *curv = _args.getPtrOpt<Grid<Real>>("curv", 15, nullptr, &_lock);
-      const Real surfTens = _args.getOpt<Real>("surfTens", 16, 0., &_lock);
-      Grid<Real> *retRhs = _args.getPtrOpt<Grid<Real>>("retRhs", 17, nullptr, &_lock);
+          "perCellCorr", 12, nullptr, &_lock);
+      const MACGrid *fractions = _args.getPtrOpt<MACGrid>("fractions", 13, nullptr, &_lock);
+      const MACGrid *obvel = _args.getPtrOpt<MACGrid>("obvel", 14, nullptr, &_lock);
+      Real gfClamp = _args.getOpt<Real>("gfClamp", 15, 1e-04, &_lock);
+      Real cgMaxIterFac = _args.getOpt<Real>("cgMaxIterFac", 16, 1.5, &_lock);
+      bool precondition = _args.getOpt<bool>("precondition", 17, true, &_lock);
+      int preconditioner = _args.getOpt<int>("preconditioner", 18, PcMIC, &_lock);
+      bool enforceCompatibility = _args.getOpt<bool>("enforceCompatibility", 19, false, &_lock);
+      bool useL2Norm = _args.getOpt<bool>("useL2Norm", 20, false, &_lock);
+      bool zeroPressureFixing = _args.getOpt<bool>("zeroPressureFixing", 21, false, &_lock);
+      const Grid<Real> *curv = _args.getPtrOpt<Grid<Real>>("curv", 22, nullptr, &_lock);
+      const Real surfTens = _args.getOpt<Real>("surfTens", 23, 0., &_lock);
+      Grid<Real> *retRhs = _args.getPtrOpt<Grid<Real>>("retRhs", 24, nullptr, &_lock);
       _retval = getPyNone();
       solvePressure(vel,
                     pressure,
                     flags,
+                    residual,
+                    search,
+                    A0,
+                    Ai,
+                    Aj,
+                    Ak,
+                    tmp,
                     cgAccuracy,
                     phi,
                     perCellCorr,
diff --git a/extern/mantaflow/preprocessed/plugin/ptsplugins.cpp b/extern/mantaflow/preprocessed/plugin/ptsplugins.cpp
index 7b2aedb694e..6d75e220d7a 100644
--- a/extern/mantaflow/preprocessed/plugin/ptsplugins.cpp
+++ b/extern/mantaflow/preprocessed/plugin/ptsplugins.cpp
@@ -34,7 +34,7 @@ struct KnAddForcePvel : public KernelBase {
                  ParticleDataImpl<Vec3> &v,
                  const Vec3 &da,
                  const ParticleDataImpl<int> *ptype,
-                 const int exclude) const
+                 const int exclude)
   {
     if (ptype && ((*ptype)[idx] & exclude))
       return;
@@ -60,21 +60,17 @@ struct KnAddForcePvel : public KernelBase {
     return exclude;
   }
   typedef int type3;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnAddForcePvel ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, v, da, ptype, exclude);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, v, da, ptype, exclude);
+    }
   }
   ParticleDataImpl<Vec3> &v;
   const Vec3 &da;
@@ -150,7 +146,7 @@ struct KnUpdateVelocityFromDeltaPos : public KernelBase {
                  const ParticleDataImpl<Vec3> &x_prev,
                  const Real over_dt,
                  const ParticleDataImpl<int> *ptype,
-                 const int exclude) const
+                 const int exclude)
   {
     if (ptype && ((*ptype)[idx] & exclude))
       return;
@@ -186,21 +182,17 @@ struct KnUpdateVelocityFromDeltaPos : public KernelBase {
     return exclude;
   }
   typedef int type5;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnUpdateVelocityFromDeltaPos ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, p, v, x_prev, over_dt, ptype, exclude);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, p, v, x_prev, over_dt, ptype, exclude);
+    }
   }
   const BasicParticleSystem &p;
   ParticleDataImpl<Vec3> &v;
@@ -273,7 +265,7 @@ struct KnStepEuler : public KernelBase {
                  const ParticleDataImpl<Vec3> &v,
                  const Real dt,
                  const ParticleDataImpl<int> *ptype,
-                 const int exclude) const
+                 const int exclude)
   {
     if (ptype && ((*ptype)[idx] & exclude))
       return;
@@ -304,21 +296,17 @@ struct KnStepEuler : public KernelBase {
     return exclude;
   }
   typedef int type4;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnStepEuler ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, p, v, dt, ptype, exclude);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, p, v, dt, ptype, exclude);
+    }
   }
   BasicParticleSystem &p;
   const ParticleDataImpl<Vec3> &v;
@@ -393,7 +381,7 @@ struct KnSetPartType : public KernelBase {
                  const int mark,
                  const int stype,
                  const FlagGrid &flags,
-                 const int cflag) const
+                 const int cflag)
   {
     if (flags.isInBounds(part.getPos(idx), 0) && (flags.getAt(part.getPos(idx)) & cflag) &&
         (ptype[idx] & stype))
@@ -429,21 +417,17 @@ struct KnSetPartType : public KernelBase {
     return cflag;
   }
   typedef int type5;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnSetPartType ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, ptype, part, mark, stype, flags, cflag);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, ptype, part, mark, stype, flags, cflag);
+    }
   }
   ParticleDataImpl<int> &ptype;
   const BasicParticleSystem &part;
diff --git a/extern/mantaflow/preprocessed/plugin/secondaryparticles.cpp b/extern/mantaflow/preprocessed/plugin/secondaryparticles.cpp
index 7a1d8224d94..5d519710296 100644
--- a/extern/mantaflow/preprocessed/plugin/secondaryparticles.cpp
+++ b/extern/mantaflow/preprocessed/plugin/secondaryparticles.cpp
@@ -99,7 +99,7 @@ struct knFlipComputeSecondaryParticlePotentials : public KernelBase {
                  const Real scaleFromManta,
                  const int itype = FlagGrid::TypeFluid,
                  const int jtype = FlagGrid::TypeObstacle | FlagGrid::TypeOutflow |
-                                   FlagGrid::TypeInflow) const
+                                   FlagGrid::TypeInflow)
   {
 
     if (!(flags(i, j, k) & itype))
@@ -253,19 +253,48 @@ struct knFlipComputeSecondaryParticlePotentials : public KernelBase {
     return jtype;
   }
   typedef int type16;
-  void runMessage()
-  {
-    debMsg("Executing kernel knFlipComputeSecondaryParticlePotentials ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = radius; j < _maxY; j++)
+            for (int i = radius; i < _maxX; i++)
+              op(i,
+                 j,
+                 k,
+                 potTA,
+                 potWC,
+                 potKE,
+                 neighborRatio,
+                 flags,
+                 v,
+                 normal,
+                 radius,
+                 tauMinTA,
+                 tauMaxTA,
+                 tauMinWC,
+                 tauMaxWC,
+                 tauMinKE,
+                 tauMaxKE,
+                 scaleFromManta,
+                 itype,
+                 jtype);
+      }
+    }
+    else {
+      const int k = 0;
+#pragma omp parallel
+      {
+
+#pragma omp for
         for (int j = radius; j < _maxY; j++)
           for (int i = radius; i < _maxX; i++)
             op(i,
@@ -288,40 +317,9 @@ struct knFlipComputeSecondaryParticlePotentials : public KernelBase {
                scaleFromManta,
                itype,
                jtype);
-    }
-    else {
-      const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = radius; i < _maxX; i++)
-          op(i,
-             j,
-             k,
-             potTA,
-             potWC,
-             potKE,
-             neighborRatio,
-             flags,
-             v,
-             normal,
-             radius,
-             tauMinTA,
-             tauMaxTA,
-             tauMinWC,
-             tauMaxWC,
-             tauMinKE,
-             tauMaxKE,
-             scaleFromManta,
-             itype,
-             jtype);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(radius, maxY), *this);
-  }
   Grid<Real> &potTA;
   Grid<Real> &potWC;
   Grid<Real> &potKE;
@@ -670,13 +668,7 @@ struct knFlipSampleSecondaryParticlesMoreCylinders : public KernelBase {
     return rand;
   }
   typedef RandomStream type17;
-  void runMessage()
-  {
-    debMsg("Executing kernel knFlipSampleSecondaryParticlesMoreCylinders ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
+  void runMessage(){};
   void run()
   {
     const int _maxX = maxX;
@@ -930,13 +922,7 @@ struct knFlipSampleSecondaryParticles : public KernelBase {
     return rand;
   }
   typedef RandomStream type17;
-  void runMessage()
-  {
-    debMsg("Executing kernel knFlipSampleSecondaryParticles ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
+  void runMessage(){};
   void run()
   {
     const int _maxX = maxX;
@@ -1192,7 +1178,7 @@ struct knFlipUpdateSecondaryParticlesLinear : public KernelBase {
                  const Real c_b,
                  const Real dt,
                  const int exclude,
-                 const int antitunneling) const
+                 const int antitunneling)
   {
 
     if (!pts_sec.isActive(idx) || pts_sec[idx].flag & exclude)
@@ -1342,36 +1328,32 @@ struct knFlipUpdateSecondaryParticlesLinear : public KernelBase {
     return antitunneling;
   }
   typedef int type14;
-  void runMessage()
-  {
-    debMsg("Executing kernel knFlipUpdateSecondaryParticlesLinear ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx,
-         pts_sec,
-         v_sec,
-         l_sec,
-         f_sec,
-         flags,
-         v,
-         neighborRatio,
-         gravity,
-         k_b,
-         k_d,
-         c_s,
-         c_b,
-         dt,
-         exclude,
-         antitunneling);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i,
+           pts_sec,
+           v_sec,
+           l_sec,
+           f_sec,
+           flags,
+           v,
+           neighborRatio,
+           gravity,
+           k_b,
+           k_d,
+           c_s,
+           c_b,
+           dt,
+           exclude,
+           antitunneling);
+    }
   }
   BasicParticleSystem &pts_sec;
   ParticleDataImpl<Vec3> &v_sec;
@@ -1449,7 +1431,7 @@ struct knFlipUpdateSecondaryParticlesCubic : public KernelBase {
                  const Real dt,
                  const int exclude,
                  const int antitunneling,
-                 const int itype) const
+                 const int itype)
   {
 
     if (!pts_sec.isActive(idx) || pts_sec[idx].flag & exclude)
@@ -1655,38 +1637,34 @@ struct knFlipUpdateSecondaryParticlesCubic : public KernelBase {
     return itype;
   }
   typedef int type16;
-  void runMessage()
-  {
-    debMsg("Executing kernel knFlipUpdateSecondaryParticlesCubic ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx,
-         pts_sec,
-         v_sec,
-         l_sec,
-         f_sec,
-         flags,
-         v,
-         neighborRatio,
-         radius,
-         gravity,
-         k_b,
-         k_d,
-         c_s,
-         c_b,
-         dt,
-         exclude,
-         antitunneling,
-         itype);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i,
+           pts_sec,
+           v_sec,
+           l_sec,
+           f_sec,
+           flags,
+           v,
+           neighborRatio,
+           radius,
+           gravity,
+           k_b,
+           k_d,
+           c_s,
+           c_b,
+           dt,
+           exclude,
+           antitunneling,
+           itype);
+    }
   }
   BasicParticleSystem &pts_sec;
   ParticleDataImpl<Vec3> &v_sec;
@@ -1856,7 +1834,7 @@ struct knFlipDeleteParticlesInObstacle : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, BasicParticleSystem &pts, const FlagGrid &flags) const
+  inline void op(IndexInt idx, BasicParticleSystem &pts, const FlagGrid &flags)
   {
 
     if (!pts.isActive(idx))
@@ -1885,21 +1863,17 @@ struct knFlipDeleteParticlesInObstacle : public KernelBase {
     return flags;
   }
   typedef FlagGrid type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knFlipDeleteParticlesInObstacle ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, pts, flags);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, pts, flags);
+    }
   }
   BasicParticleSystem &pts;
   const FlagGrid &flags;
@@ -2031,7 +2005,7 @@ struct knSetFlagsFromLevelset : public KernelBase {
                  FlagGrid &flags,
                  const Grid<Real> &phi,
                  const int exclude = FlagGrid::TypeObstacle,
-                 const int itype = FlagGrid::TypeFluid) const
+                 const int itype = FlagGrid::TypeFluid)
   {
     if (phi(idx) < 0 && !(flags(idx) & exclude))
       flags(idx) = itype;
@@ -2056,21 +2030,17 @@ struct knSetFlagsFromLevelset : public KernelBase {
     return itype;
   }
   typedef int type3;
-  void runMessage()
-  {
-    debMsg("Executing kernel knSetFlagsFromLevelset ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, flags, phi, exclude, itype);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, flags, phi, exclude, itype);
+    }
   }
   FlagGrid &flags;
   const Grid<Real> &phi;
@@ -2126,7 +2096,7 @@ struct knSetMACFromLevelset : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, MACGrid &v, const Grid<Real> &phi, const Vec3 c) const
+  inline void op(int i, int j, int k, MACGrid &v, const Grid<Real> &phi, const Vec3 c)
   {
     if (phi.getInterpolated(Vec3(i, j, k)) > 0)
       v(i, j, k) = c;
@@ -2146,37 +2116,35 @@ struct knSetMACFromLevelset : public KernelBase {
     return c;
   }
   typedef Vec3 type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel knSetMACFromLevelset ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, v, phi, c);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, v, phi, c);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, v, phi, c);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, v, phi, c);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   MACGrid &v;
   const Grid<Real> &phi;
   const Vec3 c;
@@ -2268,7 +2236,7 @@ struct knFlipComputePotentialTrappedAir : public KernelBase {
                  const Real tauMax,
                  const Real scaleFromManta,
                  const int itype = FlagGrid::TypeFluid,
-                 const int jtype = FlagGrid::TypeFluid) const
+                 const int jtype = FlagGrid::TypeFluid)
   {
 
     if (!(flags(i, j, k) & itype))
@@ -2342,37 +2310,35 @@ struct knFlipComputePotentialTrappedAir : public KernelBase {
     return jtype;
   }
   typedef int type8;
-  void runMessage()
-  {
-    debMsg("Executing kernel knFlipComputePotentialTrappedAir ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, pot, flags, v, radius, tauMin, tauMax, scaleFromManta, itype, jtype);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, pot, flags, v, radius, tauMin, tauMax, scaleFromManta, itype, jtype);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, pot, flags, v, radius, tauMin, tauMax, scaleFromManta, itype, jtype);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, pot, flags, v, radius, tauMin, tauMax, scaleFromManta, itype, jtype);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   Grid<Real> &pot;
   const FlagGrid &flags;
   const MACGrid &v;
@@ -2472,7 +2438,7 @@ struct knFlipComputePotentialKineticEnergy : public KernelBase {
                  const Real tauMin,
                  const Real tauMax,
                  const Real scaleFromManta,
-                 const int itype = FlagGrid::TypeFluid) const
+                 const int itype = FlagGrid::TypeFluid)
   {
 
     if (!(flags(i, j, k) & itype))
@@ -2520,37 +2486,35 @@ struct knFlipComputePotentialKineticEnergy : public KernelBase {
     return itype;
   }
   typedef int type6;
-  void runMessage()
-  {
-    debMsg("Executing kernel knFlipComputePotentialKineticEnergy ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, pot, flags, v, tauMin, tauMax, scaleFromManta, itype);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, pot, flags, v, tauMin, tauMax, scaleFromManta, itype);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, pot, flags, v, tauMin, tauMax, scaleFromManta, itype);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, pot, flags, v, tauMin, tauMax, scaleFromManta, itype);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   Grid<Real> &pot;
   const FlagGrid &flags;
   const MACGrid &v;
@@ -2650,7 +2614,7 @@ struct knFlipComputePotentialWaveCrest : public KernelBase {
                  const Real tauMax,
                  const Real scaleFromManta,
                  const int itype = FlagGrid::TypeFluid,
-                 const int jtype = FlagGrid::TypeFluid) const
+                 const int jtype = FlagGrid::TypeFluid)
   {
 
     if (!(flags(i, j, k) & itype))
@@ -2736,19 +2700,41 @@ struct knFlipComputePotentialWaveCrest : public KernelBase {
     return jtype;
   }
   typedef int type9;
-  void runMessage()
-  {
-    debMsg("Executing kernel knFlipComputePotentialWaveCrest ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i,
+                 j,
+                 k,
+                 pot,
+                 flags,
+                 v,
+                 radius,
+                 normal,
+                 tauMin,
+                 tauMax,
+                 scaleFromManta,
+                 itype,
+                 jtype);
+      }
+    }
+    else {
+      const int k = 0;
+#pragma omp parallel
+      {
+
+#pragma omp for
         for (int j = 1; j < _maxY; j++)
           for (int i = 1; i < _maxX; i++)
             op(i,
@@ -2764,21 +2750,9 @@ struct knFlipComputePotentialWaveCrest : public KernelBase {
                scaleFromManta,
                itype,
                jtype);
-    }
-    else {
-      const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, pot, flags, v, radius, normal, tauMin, tauMax, scaleFromManta, itype, jtype);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   Grid<Real> &pot;
   const FlagGrid &flags;
   const MACGrid &v;
@@ -2860,7 +2834,7 @@ struct knFlipComputeSurfaceNormals : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, Grid<Vec3> &normal, const Grid<Real> &phi) const
+  inline void op(IndexInt idx, Grid<Vec3> &normal, const Grid<Real> &phi)
   {
     normal[idx] = getNormalized(normal[idx]);
   }
@@ -2874,21 +2848,17 @@ struct knFlipComputeSurfaceNormals : public KernelBase {
     return phi;
   }
   typedef Grid<Real> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knFlipComputeSurfaceNormals ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, normal, phi);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, normal, phi);
+    }
   }
   Grid<Vec3> &normal;
   const Grid<Real> &phi;
@@ -2958,7 +2928,7 @@ struct knFlipUpdateNeighborRatio : public KernelBase {
                  Grid<Real> &neighborRatio,
                  const int radius,
                  const int itype = FlagGrid::TypeFluid,
-                 const int jtype = FlagGrid::TypeObstacle) const
+                 const int jtype = FlagGrid::TypeObstacle)
   {
 
     if (!(flags(i, j, k) & itype))
@@ -3008,37 +2978,35 @@ struct knFlipUpdateNeighborRatio : public KernelBase {
     return jtype;
   }
   typedef int type4;
-  void runMessage()
-  {
-    debMsg("Executing kernel knFlipUpdateNeighborRatio ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, flags, neighborRatio, radius, itype, jtype);
+
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, flags, neighborRatio, radius, itype, jtype);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, flags, neighborRatio, radius, itype, jtype);
+#pragma omp parallel
+      {
+
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, flags, neighborRatio, radius, itype, jtype);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   const FlagGrid &flags;
   Grid<Real> &neighborRatio;
   const int radius;
diff --git a/extern/mantaflow/preprocessed/plugin/surfaceturbulence.cpp b/extern/mantaflow/preprocessed/plugin/surfaceturbulence.cpp
index e5aa09117ea..c4be7ab3ea5 100644
--- a/extern/mantaflow/preprocessed/plugin/surfaceturbulence.cpp
+++ b/extern/mantaflow/preprocessed/plugin/surfaceturbulence.cpp
@@ -569,7 +569,7 @@ struct advectSurfacePoints : public KernelBase {
   inline void op(IndexInt idx,
                  BasicParticleSystemWrapper &surfacePoints,
                  const BasicParticleSystemWrapper &coarseParticles,
-                 const ParticleDataImplVec3Wrapper &coarseParticlesPrevPos) const
+                 const ParticleDataImplVec3Wrapper &coarseParticlesPrevPos)
   {
     if (surfacePoints.isActive(idx)) {
       Vec3 avgDisplacement(0, 0, 0);
@@ -606,21 +606,17 @@ struct advectSurfacePoints : public KernelBase {
     return coarseParticlesPrevPos;
   }
   typedef ParticleDataImplVec3Wrapper type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel advectSurfacePoints ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, surfacePoints, coarseParticles, coarseParticlesPrevPos);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, surfacePoints, coarseParticles, coarseParticlesPrevPos);
+    }
   }
   BasicParticleSystemWrapper &surfacePoints;
   const BasicParticleSystemWrapper &coarseParticles;
@@ -673,7 +669,7 @@ struct computeSurfaceNormals : public KernelBase {
   inline void op(IndexInt idx,
                  const BasicParticleSystemWrapper &surfacePoints,
                  const BasicParticleSystemWrapper &coarseParticles,
-                 ParticleDataImpl<Vec3> &surfaceNormals) const
+                 ParticleDataImpl<Vec3> &surfaceNormals)
   {
     Vec3 pos = surfacePoints.getPos(idx);
 
@@ -743,21 +739,17 @@ struct computeSurfaceNormals : public KernelBase {
     return surfaceNormals;
   }
   typedef ParticleDataImpl<Vec3> type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel computeSurfaceNormals ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, surfacePoints, coarseParticles, surfaceNormals);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, surfacePoints, coarseParticles, surfaceNormals);
+    }
   }
   const BasicParticleSystemWrapper &surfacePoints;
   const BasicParticleSystemWrapper &coarseParticles;
@@ -780,7 +772,7 @@ struct computeAveragedNormals : public KernelBase {
   }
   inline void op(IndexInt idx,
                  const BasicParticleSystemWrapper &surfacePoints,
-                 const ParticleDataImpl<Vec3> &surfaceNormals) const
+                 const ParticleDataImpl<Vec3> &surfaceNormals)
   {
     Vec3 pos = surfacePoints.getPos(idx);
     Vec3 newNormal = Vec3(0, 0, 0);
@@ -800,21 +792,17 @@ struct computeAveragedNormals : public KernelBase {
     return surfaceNormals;
   }
   typedef ParticleDataImpl<Vec3> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel computeAveragedNormals ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, surfacePoints, surfaceNormals);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, surfacePoints, surfaceNormals);
+    }
   }
   const BasicParticleSystemWrapper &surfacePoints;
   const ParticleDataImpl<Vec3> &surfaceNormals;
@@ -832,7 +820,7 @@ struct assignNormals : public KernelBase {
   }
   inline void op(IndexInt idx,
                  const BasicParticleSystemWrapper &surfacePoints,
-                 ParticleDataImpl<Vec3> &surfaceNormals) const
+                 ParticleDataImpl<Vec3> &surfaceNormals)
   {
     surfaceNormals[idx] = tempSurfaceVec3[idx];
   }
@@ -846,21 +834,17 @@ struct assignNormals : public KernelBase {
     return surfaceNormals;
   }
   typedef ParticleDataImpl<Vec3> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel assignNormals ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, surfacePoints, surfaceNormals);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, surfacePoints, surfaceNormals);
+    }
   }
   const BasicParticleSystemWrapper &surfacePoints;
   ParticleDataImpl<Vec3> &surfaceNormals;
@@ -963,7 +947,7 @@ struct computeSurfaceDensities : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, const BasicParticleSystemWrapper &surfacePoints, void *dummy) const
+  inline void op(IndexInt idx, const BasicParticleSystemWrapper &surfacePoints, void *dummy)
   {
     Vec3 pos = surfacePoints.getPos(idx);
     Real density = 0;
@@ -984,21 +968,17 @@ struct computeSurfaceDensities : public KernelBase {
     return dummy;
   }
   typedef void type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel computeSurfaceDensities ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, surfacePoints, dummy);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, surfacePoints, dummy);
+    }
   }
   const BasicParticleSystemWrapper &surfacePoints;
   void *dummy;
@@ -1016,7 +996,7 @@ struct computeSurfaceDisplacements : public KernelBase {
   }
   inline void op(IndexInt idx,
                  const BasicParticleSystemWrapper &surfacePoints,
-                 const ParticleDataImpl<Vec3> &surfaceNormals) const
+                 const ParticleDataImpl<Vec3> &surfaceNormals)
   {
     Vec3 pos = surfacePoints.getPos(idx);
     Vec3 normal = surfaceNormals[idx];
@@ -1068,21 +1048,17 @@ struct computeSurfaceDisplacements : public KernelBase {
     return surfaceNormals;
   }
   typedef ParticleDataImpl<Vec3> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel computeSurfaceDisplacements ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, surfacePoints, surfaceNormals);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, surfacePoints, surfaceNormals);
+    }
   }
   const BasicParticleSystemWrapper &surfacePoints;
   const ParticleDataImpl<Vec3> &surfaceNormals;
@@ -1095,7 +1071,7 @@ struct applySurfaceDisplacements : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(IndexInt idx, BasicParticleSystemWrapper &surfacePoints, void *dummy) const
+  inline void op(IndexInt idx, BasicParticleSystemWrapper &surfacePoints, void *dummy)
   {
     surfacePoints.setPos(idx, surfacePoints.getPos(idx) + tempSurfaceVec3[idx]);
   }
@@ -1109,21 +1085,17 @@ struct applySurfaceDisplacements : public KernelBase {
     return dummy;
   }
   typedef void type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel applySurfaceDisplacements ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, surfacePoints, dummy);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, surfacePoints, dummy);
+    }
   }
   BasicParticleSystemWrapper &surfacePoints;
   void *dummy;
@@ -1152,7 +1124,7 @@ struct constrainSurface : public KernelBase {
   }
   inline void op(IndexInt idx,
                  BasicParticleSystemWrapper &surfacePoints,
-                 const BasicParticleSystemWrapper &coarseParticles) const
+                 const BasicParticleSystemWrapper &coarseParticles)
   {
     Vec3 pos = surfacePoints.getPos(idx);
     Real level = computeConstraintLevel(coarseParticles, surfacePoints.getPos(idx));
@@ -1179,21 +1151,17 @@ struct constrainSurface : public KernelBase {
     return coarseParticles;
   }
   typedef BasicParticleSystemWrapper type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel constrainSurface ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, surfacePoints, coarseParticles);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, surfacePoints, coarseParticles);
+    }
   }
   BasicParticleSystemWrapper &surfacePoints;
   const BasicParticleSystemWrapper &coarseParticles;
@@ -1220,7 +1188,7 @@ struct interpolateNewWaveData : public KernelBase {
                  ParticleDataImpl<Real> &surfaceWaveH,
                  ParticleDataImpl<Real> &surfaceWaveDtH,
                  ParticleDataImpl<Real> &surfaceWaveSeed,
-                 ParticleDataImpl<Real> &surfaceWaveSeedAmplitude) const
+                 ParticleDataImpl<Real> &surfaceWaveSeedAmplitude)
   {
     if (surfacePoints.getStatus(idx) & ParticleBase::PNEW) {
       Vec3 pos = surfacePoints.getPos(idx);
@@ -1270,26 +1238,22 @@ struct interpolateNewWaveData : public KernelBase {
     return surfaceWaveSeedAmplitude;
   }
   typedef ParticleDataImpl<Real> type4;
-  void runMessage()
-  {
-    debMsg("Executing kernel interpolateNewWaveData ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx,
-         surfacePoints,
-         surfaceWaveH,
-         surfaceWaveDtH,
-         surfaceWaveSeed,
-         surfaceWaveSeedAmplitude);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i,
+           surfacePoints,
+           surfaceWaveH,
+           surfaceWaveDtH,
+           surfaceWaveSeed,
+           surfaceWaveSeedAmplitude);
+    }
   }
   const BasicParticleSystemWrapper &surfacePoints;
   ParticleDataImpl<Real> &surfaceWaveH;
@@ -1345,7 +1309,7 @@ struct addSeed : public KernelBase {
   inline void op(IndexInt idx,
                  const BasicParticleSystemWrapper &surfacePoints,
                  ParticleDataImpl<Real> &surfaceWaveH,
-                 const ParticleDataImpl<Real> &surfaceWaveSeed) const
+                 const ParticleDataImpl<Real> &surfaceWaveSeed)
   {
     surfaceWaveH[idx] += surfaceWaveSeed[idx];
   }
@@ -1364,21 +1328,17 @@ struct addSeed : public KernelBase {
     return surfaceWaveSeed;
   }
   typedef ParticleDataImpl<Real> type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel addSeed ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, surfacePoints, surfaceWaveH, surfaceWaveSeed);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, surfacePoints, surfaceWaveH, surfaceWaveSeed);
+    }
   }
   const BasicParticleSystemWrapper &surfacePoints;
   ParticleDataImpl<Real> &surfaceWaveH;
@@ -1400,7 +1360,7 @@ struct computeSurfaceWaveNormal : public KernelBase {
   inline void op(IndexInt idx,
                  const BasicParticleSystemWrapper &surfacePoints,
                  const ParticleDataImpl<Vec3> &surfaceNormals,
-                 const ParticleDataImpl<Real> &surfaceWaveH) const
+                 const ParticleDataImpl<Real> &surfaceWaveH)
   {
     Vec3 pos = surfacePoints.getPos(idx);
 
@@ -1464,21 +1424,17 @@ struct computeSurfaceWaveNormal : public KernelBase {
     return surfaceWaveH;
   }
   typedef ParticleDataImpl<Real> type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel computeSurfaceWaveNormal ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, surfacePoints, surfaceNormals, surfaceWaveH);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, surfacePoints, surfaceNormals, surfaceWaveH);
+    }
   }
   const BasicParticleSystemWrapper &surfacePoints;
   const ParticleDataImpl<Vec3> &surfaceNormals;
@@ -1500,7 +1456,7 @@ struct computeSurfaceWaveLaplacians : public KernelBase {
   inline void op(IndexInt idx,
                  const BasicParticleSystemWrapper &surfacePoints,
                  const ParticleDataImpl<Vec3> &surfaceNormals,
-                 const ParticleDataImpl<Real> &surfaceWaveH) const
+                 const ParticleDataImpl<Real> &surfaceWaveH)
   {
     Real laplacian = 0;
     Real wTotal = 0;
@@ -1561,21 +1517,17 @@ struct computeSurfaceWaveLaplacians : public KernelBase {
     return surfaceWaveH;
   }
   typedef ParticleDataImpl<Real> type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel computeSurfaceWaveLaplacians ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, surfacePoints, surfaceNormals, surfaceWaveH);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, surfacePoints, surfaceNormals, surfaceWaveH);
+    }
   }
   const BasicParticleSystemWrapper &surfacePoints;
   const ParticleDataImpl<Vec3> &surfaceNormals;
@@ -1600,7 +1552,7 @@ struct evolveWave : public KernelBase {
                  const BasicParticleSystemWrapper &surfacePoints,
                  ParticleDataImpl<Real> &surfaceWaveH,
                  ParticleDataImpl<Real> &surfaceWaveDtH,
-                 const ParticleDataImpl<Real> &surfaceWaveSeed) const
+                 const ParticleDataImpl<Real> &surfaceWaveSeed)
   {
     surfaceWaveDtH[idx] += params.waveSpeed * params.waveSpeed * params.dt * tempSurfaceFloat[idx];
     surfaceWaveDtH[idx] /= (1 + params.dt * params.waveDamping);
@@ -1635,21 +1587,17 @@ struct evolveWave : public KernelBase {
     return surfaceWaveSeed;
   }
   typedef ParticleDataImpl<Real> type3;
-  void runMessage()
-  {
-    debMsg("Executing kernel evolveWave ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, surfacePoints, surfaceWaveH, surfaceWaveDtH, surfaceWaveSeed);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, surfacePoints, surfaceWaveH, surfaceWaveDtH, surfaceWaveSeed);
+    }
   }
   const BasicParticleSystemWrapper &surfacePoints;
   ParticleDataImpl<Real> &surfaceWaveH;
@@ -1669,7 +1617,7 @@ struct computeSurfaceCurvature : public KernelBase {
   }
   inline void op(IndexInt idx,
                  const BasicParticleSystemWrapper &surfacePoints,
-                 const ParticleDataImpl<Vec3> &surfaceNormals) const
+                 const ParticleDataImpl<Vec3> &surfaceNormals)
   {
     Vec3 pPos = surfacePoints.getPos(idx);
     Real wTotal = 0;
@@ -1710,21 +1658,17 @@ struct computeSurfaceCurvature : public KernelBase {
     return surfaceNormals;
   }
   typedef ParticleDataImpl<Vec3> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel computeSurfaceCurvature ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, surfacePoints, surfaceNormals);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      // Worksharing loop: each index in [0, _sz) is handled once by some thread of the region.
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, surfacePoints, surfaceNormals);
+    }
   }
   const BasicParticleSystemWrapper &surfacePoints;
   const ParticleDataImpl<Vec3> &surfaceNormals;
@@ -1742,7 +1686,7 @@ struct smoothCurvature : public KernelBase {
   }
   inline void op(IndexInt idx,
                  const BasicParticleSystemWrapper &surfacePoints,
-                 ParticleDataImpl<Real> &surfaceWaveSource) const
+                 ParticleDataImpl<Real> &surfaceWaveSource)
   {
     Vec3 pPos = surfacePoints.getPos(idx);
     Real curv = 0;
@@ -1768,21 +1712,17 @@ struct smoothCurvature : public KernelBase {
     return surfaceWaveSource;
   }
   typedef ParticleDataImpl<Real> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel smoothCurvature ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, surfacePoints, surfaceWaveSource);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      // Indices [0, _sz) are divided among the threads opened by `omp parallel` above.
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, surfacePoints, surfaceWaveSource);
+    }
   }
   const BasicParticleSystemWrapper &surfacePoints;
   ParticleDataImpl<Real> &surfaceWaveSource;
@@ -1806,7 +1746,7 @@ struct seedWaves : public KernelBase {
                  const BasicParticleSystemWrapper &surfacePoints,
                  ParticleDataImpl<Real> &surfaceWaveSeed,
                  ParticleDataImpl<Real> &surfaceWaveSeedAmplitude,
-                 ParticleDataImpl<Real> &surfaceWaveSource) const
+                 ParticleDataImpl<Real> &surfaceWaveSource)
   {
     Real source = smoothstep(params.waveSeedingCurvatureThresholdRegionCenter -
                                  params.waveSeedingCurvatureThresholdRegionRadius,
@@ -1850,21 +1790,17 @@ struct seedWaves : public KernelBase {
     return surfaceWaveSource;
   }
   typedef ParticleDataImpl<Real> type3;
-  void runMessage()
-  {
-    debMsg("Executing kernel seedWaves ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, surfacePoints, surfaceWaveSeed, surfaceWaveSeedAmplitude, surfaceWaveSource);
-  }
+  void runMessage(){};
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;
+#pragma omp parallel
+    {
+      // `omp for` shares [0, _sz) among the region's threads; each index gets one op() call.
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, surfacePoints, surfaceWaveSeed, surfaceWaveSeedAmplitude, surfaceWaveSource);
+    }
   }
   const BasicParticleSystemWrapper &surfacePoints;
   ParticleDataImpl<Real> &surfaceWaveSeed;
diff --git a/extern/mantaflow/preprocessed/plugin/viscosity.cpp b/extern/mantaflow/preprocessed/plugin/viscosity.cpp
deleted file mode 100644
index a9e1985336e..00000000000
--- a/extern/mantaflow/preprocessed/plugin/viscosity.cpp
+++ /dev/null
@@ -1,1428 +0,0 @@
-
-
-// DO NOT EDIT !
-// This file is generated using the MantaFlow preprocessor (prep generate).
-
-/******************************************************************************
- *
- * MantaFlow fluid solver framework
- * Copyright 2020 Sebastian Barschkis, Nils Thuerey
- *
- * This program is free software, distributed under the terms of the
- * Apache License, Version 2.0
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Accurate Viscous Free Surfaces for Buckling, Coiling, and Rotating Liquids
- * Batty et al., SCA 2008
- *
- ******************************************************************************/
-
-#include "conjugategrad.h"
-#include "general.h"
-#include "grid.h"
-#include "vectorbase.h"
-
-#include <chrono>
-
-#if OPENMP == 1 || TBB == 1
-#  define ENABLE_PARALLEL 0
-#endif
-
-#if ENABLE_PARALLEL == 1
-#  include <thread>
-#  include <algorithm>
-
-static const int manta_num_threads = std::thread::hardware_concurrency();
-
-#  define parallel_block \
-    { \
-      std::vector<std::thread> threads; \
-      {
-
-#  define do_parallel threads.push_back( std::thread([&]() {
-#  define do_end \
-    } ) );
-
-#  define block_end \
-    } \
-    for (auto &thread : threads) { \
-      thread.join(); \
-    } \
-    }
-
-#endif
-
-#define FOR_INT_IJK(num) \
-  for (int k_off = 0; k_off < num; ++k_off) \
-    for (int j_off = 0; j_off < num; ++j_off) \
-      for (int i_off = 0; i_off < num; ++i_off)
-
-using namespace std;
-
-namespace Manta {
-
-//! Assumes phi0<0 and phi1>=0, phi2>=0, and phi3>=0 or vice versa.
-//! In particular, phi0 must not equal any of phi1, phi2 or phi3.
-static Real sortedTetFraction(Real phi0, Real phi1, Real phi2, Real phi3)
-{
-  return phi0 * phi0 * phi0 / ((phi0 - phi1) * (phi0 - phi2) * (phi0 - phi3));
-}
-
-//! Assumes phi0<0, phi1<0, and phi2>=0, and phi3>=0 or vice versa.
-//! In particular, phi0 and phi1 must not equal any of phi2 and phi3.
-static Real sortedPrismFraction(Real phi0, Real phi1, Real phi2, Real phi3)
-{
-  Real a = phi0 / (phi0 - phi2);
-  Real b = phi0 / (phi0 - phi3);
-  Real c = phi1 / (phi1 - phi3);
-  Real d = phi1 / (phi1 - phi2);
-  return a * b * (1 - d) + b * (1 - c) * d + c * d;
-}
-
-Real volumeFraction(Real phi0, Real phi1, Real phi2, Real phi3)
-{
-  sort(phi0, phi1, phi2, phi3);
-  if (phi3 <= 0)
-    return 1;
-  else if (phi2 <= 0)
-    return 1 - sortedTetFraction(phi3, phi2, phi1, phi0);
-  else if (phi1 <= 0)
-    return sortedPrismFraction(phi0, phi1, phi2, phi3);
-  else if (phi0 <= 0)
-    return sortedTetFraction(phi0, phi1, phi2, phi3);
-  else
-    return 0;
-}
-
-//! The average of the two possible decompositions of the cube into five tetrahedra.
-Real volumeFraction(Real phi000,
-                    Real phi100,
-                    Real phi010,
-                    Real phi110,
-                    Real phi001,
-                    Real phi101,
-                    Real phi011,
-                    Real phi111)
-{
-  return (volumeFraction(phi000, phi001, phi101, phi011) +
-          volumeFraction(phi000, phi101, phi100, phi110) +
-          volumeFraction(phi000, phi010, phi011, phi110) +
-          volumeFraction(phi101, phi011, phi111, phi110) +
-          2 * volumeFraction(phi000, phi011, phi101, phi110) +
-          volumeFraction(phi100, phi101, phi001, phi111) +
-          volumeFraction(phi100, phi001, phi000, phi010) +
-          volumeFraction(phi100, phi110, phi111, phi010) +
-          volumeFraction(phi001, phi111, phi011, phi010) +
-          2 * volumeFraction(phi100, phi111, phi001, phi010)) /
-         12;
-}
-
-//! Kernel loop over grid with 2x base resolution!
-
-struct KnEstimateVolumeFraction : public KernelBase {
-  KnEstimateVolumeFraction(Grid<Real> &volumes,
-                           const Grid<Real> &phi,
-                           const Vec3 &startCentre,
-                           const Real dx)
-      : KernelBase(&volumes, 0), volumes(volumes), phi(phi), startCentre(startCentre), dx(dx)
-  {
-    runMessage();
-    run();
-  }
-  inline void op(int i,
-                 int j,
-                 int k,
-                 Grid<Real> &volumes,
-                 const Grid<Real> &phi,
-                 const Vec3 &startCentre,
-                 const Real dx) const
-  {
-    const Vec3 centre = startCentre + Vec3(i, j, k) * 0.5;
-    const Real offset = 0.5 * dx;
-    const int order = 1;  // is sufficient
-
-    Real phi000 = phi.getInterpolatedHi(centre + Vec3(-offset, -offset, -offset), order);
-    Real phi001 = phi.getInterpolatedHi(centre + Vec3(-offset, -offset, +offset), order);
-    Real phi010 = phi.getInterpolatedHi(centre + Vec3(-offset, +offset, -offset), order);
-    Real phi011 = phi.getInterpolatedHi(centre + Vec3(-offset, +offset, +offset), order);
-    Real phi100 = phi.getInterpolatedHi(centre + Vec3(+offset, -offset, -offset), order);
-    Real phi101 = phi.getInterpolatedHi(centre + Vec3(+offset, -offset, +offset), order);
-    Real phi110 = phi.getInterpolatedHi(centre + Vec3(+offset, +offset, -offset), order);
-    Real phi111 = phi.getInterpolatedHi(centre + Vec3(+offset, +offset, +offset), order);
-
-    volumes(i, j, k) = volumeFraction(
-        phi000, phi100, phi010, phi110, phi001, phi101, phi011, phi111);
-  }
-  inline Grid<Real> &getArg0()
-  {
-    return volumes;
-  }
-  typedef Grid<Real> type0;
-  inline const Grid<Real> &getArg1()
-  {
-    return phi;
-  }
-  typedef Grid<Real> type1;
-  inline const Vec3 &getArg2()
-  {
-    return startCentre;
-  }
-  typedef Vec3 type2;
-  inline const Real &getArg3()
-  {
-    return dx;
-  }
-  typedef Real type3;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnEstimateVolumeFraction ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    const int _maxX = maxX;
-    const int _maxY = maxY;
-    if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, volumes, phi, startCentre, dx);
-    }
-    else {
-      const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, volumes, phi, startCentre, dx);
-    }
-  }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
-  Grid<Real> &volumes;
-  const Grid<Real> &phi;
-  const Vec3 &startCentre;
-  const Real dx;
-};
-
-struct KnUpdateVolumeGrid : public KernelBase {
-  KnUpdateVolumeGrid(Grid<Real> &cVolLiquid,
-                     Grid<Real> &uVolLiquid,
-                     Grid<Real> &vVolLiquid,
-                     Grid<Real> &wVolLiquid,
-                     Grid<Real> &exVolLiquid,
-                     Grid<Real> &eyVolLiquid,
-                     Grid<Real> &ezVolLiquid,
-                     const Grid<Real> &src)
-      : KernelBase(&cVolLiquid, 0),
-        cVolLiquid(cVolLiquid),
-        uVolLiquid(uVolLiquid),
-        vVolLiquid(vVolLiquid),
-        wVolLiquid(wVolLiquid),
-        exVolLiquid(exVolLiquid),
-        eyVolLiquid(eyVolLiquid),
-        ezVolLiquid(ezVolLiquid),
-        src(src)
-  {
-    runMessage();
-    run();
-  }
-  inline void op(int i,
-                 int j,
-                 int k,
-                 Grid<Real> &cVolLiquid,
-                 Grid<Real> &uVolLiquid,
-                 Grid<Real> &vVolLiquid,
-                 Grid<Real> &wVolLiquid,
-                 Grid<Real> &exVolLiquid,
-                 Grid<Real> &eyVolLiquid,
-                 Grid<Real> &ezVolLiquid,
-                 const Grid<Real> &src) const
-  {
-    // Work out c
-    cVolLiquid(i, j, k) = 0;
-    FOR_INT_IJK(2)
-    {
-      cVolLiquid(i, j, k) += src(2 * i + i_off, 2 * j + j_off, 2 * k + k_off);
-    }
-    cVolLiquid(i, j, k) /= 8;
-
-    // Work out u
-    if (i >= 1) {
-      uVolLiquid(i, j, k) = 0;
-      int base_i = 2 * i - 1;
-      int base_j = 2 * j;
-      int base_k = 2 * k;
-      FOR_INT_IJK(2)
-      {
-        uVolLiquid(i, j, k) += src(base_i + i_off, base_j + j_off, base_k + k_off);
-      }
-      uVolLiquid(i, j, k) /= 8;
-    }
-
-    // v
-    if (j >= 1) {
-      vVolLiquid(i, j, k) = 0;
-      int base_i = 2 * i;
-      int base_j = 2 * j - 1;
-      int base_k = 2 * k;
-      FOR_INT_IJK(2)
-      {
-        vVolLiquid(i, j, k) += src(base_i + i_off, base_j + j_off, base_k + k_off);
-      }
-      vVolLiquid(i, j, k) /= 8;
-    }
-
-    // w
-    if (k >= 1) {
-      wVolLiquid(i, j, k) = 0;
-      int base_i = 2 * i;
-      int base_j = 2 * j;
-      int base_k = 2 * k - 1;
-      FOR_INT_IJK(2)
-      {
-        wVolLiquid(i, j, k) += src(base_i + i_off, base_j + j_off, base_k + k_off);
-      }
-      wVolLiquid(i, j, k) /= 8;
-    }
-
-    // e-x
-    if (j >= 1 && k >= 1) {
-      exVolLiquid(i, j, k) = 0;
-      int base_i = 2 * i;
-      int base_j = 2 * j - 1;
-      int base_k = 2 * k - 1;
-      FOR_INT_IJK(2)
-      {
-        exVolLiquid(i, j, k) += src(base_i + i_off, base_j + j_off, base_k + k_off);
-      }
-      exVolLiquid(i, j, k) /= 8;
-    }
-
-    // e-y
-    if (i >= 1 && k >= 1) {
-      eyVolLiquid(i, j, k) = 0;
-      int base_i = 2 * i - 1;
-      int base_j = 2 * j;
-      int base_k = 2 * k - 1;
-      FOR_INT_IJK(2)
-      {
-        eyVolLiquid(i, j, k) += src(base_i + i_off, base_j + j_off, base_k + k_off);
-      }
-      eyVolLiquid(i, j, k) /= 8;
-    }
-
-    // e-z
-    if (i >= 1 && j >= 1) {
-      ezVolLiquid(i, j, k) = 0;
-      int base_i = 2 * i - 1;
-      int base_j = 2 * j - 1;
-      int base_k = 2 * k;
-      FOR_INT_IJK(2)
-      {
-        ezVolLiquid(i, j, k) += src(base_i + i_off, base_j + j_off, base_k + k_off);
-      }
-      ezVolLiquid(i, j, k) /= 8;
-    }
-  }
-  inline Grid<Real> &getArg0()
-  {
-    return cVolLiquid;
-  }
-  typedef Grid<Real> type0;
-  inline Grid<Real> &getArg1()
-  {
-    return uVolLiquid;
-  }
-  typedef Grid<Real> type1;
-  inline Grid<Real> &getArg2()
-  {
-    return vVolLiquid;
-  }
-  typedef Grid<Real> type2;
-  inline Grid<Real> &getArg3()
-  {
-    return wVolLiquid;
-  }
-  typedef Grid<Real> type3;
-  inline Grid<Real> &getArg4()
-  {
-    return exVolLiquid;
-  }
-  typedef Grid<Real> type4;
-  inline Grid<Real> &getArg5()
-  {
-    return eyVolLiquid;
-  }
-  typedef Grid<Real> type5;
-  inline Grid<Real> &getArg6()
-  {
-    return ezVolLiquid;
-  }
-  typedef Grid<Real> type6;
-  inline const Grid<Real> &getArg7()
-  {
-    return src;
-  }
-  typedef Grid<Real> type7;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnUpdateVolumeGrid ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    const int _maxX = maxX;
-    const int _maxY = maxY;
-    if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i,
-               j,
-               k,
-               cVolLiquid,
-               uVolLiquid,
-               vVolLiquid,
-               wVolLiquid,
-               exVolLiquid,
-               eyVolLiquid,
-               ezVolLiquid,
-               src);
-    }
-    else {
-      const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i,
-             j,
-             k,
-             cVolLiquid,
-             uVolLiquid,
-             vVolLiquid,
-             wVolLiquid,
-             exVolLiquid,
-             eyVolLiquid,
-             ezVolLiquid,
-             src);
-    }
-  }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
-  Grid<Real> &cVolLiquid;
-  Grid<Real> &uVolLiquid;
-  Grid<Real> &vVolLiquid;
-  Grid<Real> &wVolLiquid;
-  Grid<Real> &exVolLiquid;
-  Grid<Real> &eyVolLiquid;
-  Grid<Real> &ezVolLiquid;
-  const Grid<Real> &src;
-};
-
-void computeWeights(const Grid<Real> &phi,
-                    Grid<Real> &doubleSized,
-                    Grid<Real> &cVolLiquid,
-                    Grid<Real> &uVolLiquid,
-                    Grid<Real> &vVolLiquid,
-                    Grid<Real> &wVolLiquid,
-                    Grid<Real> &exVolLiquid,
-                    Grid<Real> &eyVolLiquid,
-                    Grid<Real> &ezVolLiquid,
-                    Real dx)
-{
-  KnEstimateVolumeFraction(doubleSized, phi, Vec3(0.25 * dx, 0.25 * dx, 0.25 * dx), 0.5 * dx);
-  KnUpdateVolumeGrid(cVolLiquid,
-                     uVolLiquid,
-                     vVolLiquid,
-                     wVolLiquid,
-                     exVolLiquid,
-                     eyVolLiquid,
-                     ezVolLiquid,
-                     doubleSized);
-}
-
-struct KnUpdateFaceStates : public KernelBase {
-  KnUpdateFaceStates(const FlagGrid &flags,
-                     Grid<int> &uState,
-                     Grid<int> &vState,
-                     Grid<int> &wState)
-      : KernelBase(&flags, 0), flags(flags), uState(uState), vState(vState), wState(wState)
-  {
-    runMessage();
-    run();
-  }
-  inline void op(int i,
-                 int j,
-                 int k,
-                 const FlagGrid &flags,
-                 Grid<int> &uState,
-                 Grid<int> &vState,
-                 Grid<int> &wState) const
-  {
-    bool curObs = flags.isObstacle(i, j, k);
-    uState(i, j, k) = (i > 0 && !flags.isObstacle(i - 1, j, k) && !curObs) ?
-                          FlagGrid::TypeFluid :
-                          FlagGrid::TypeObstacle;
-    vState(i, j, k) = (j > 0 && !flags.isObstacle(i, j - 1, k) && !curObs) ?
-                          FlagGrid::TypeFluid :
-                          FlagGrid::TypeObstacle;
-    wState(i, j, k) = (k > 0 && !flags.isObstacle(i, j, k - 1) && !curObs) ?
-                          FlagGrid::TypeFluid :
-                          FlagGrid::TypeObstacle;
-  }
-  inline const FlagGrid &getArg0()
-  {
-    return flags;
-  }
-  typedef FlagGrid type0;
-  inline Grid<int> &getArg1()
-  {
-    return uState;
-  }
-  typedef Grid<int> type1;
-  inline Grid<int> &getArg2()
-  {
-    return vState;
-  }
-  typedef Grid<int> type2;
-  inline Grid<int> &getArg3()
-  {
-    return wState;
-  }
-  typedef Grid<int> type3;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnUpdateFaceStates ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    const int _maxX = maxX;
-    const int _maxY = maxY;
-    if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, flags, uState, vState, wState);
-    }
-    else {
-      const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, flags, uState, vState, wState);
-    }
-  }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
-  const FlagGrid &flags;
-  Grid<int> &uState;
-  Grid<int> &vState;
-  Grid<int> &wState;
-};
-
-struct KnApplyVelocities : public KernelBase {
-  KnApplyVelocities(MACGrid &dst,
-                    const Grid<int> &uState,
-                    const Grid<int> &vState,
-                    const Grid<int> &wState,
-                    Grid<Real> &srcU,
-                    Grid<Real> &srcV,
-                    Grid<Real> &srcW)
-      : KernelBase(&dst, 0),
-        dst(dst),
-        uState(uState),
-        vState(vState),
-        wState(wState),
-        srcU(srcU),
-        srcV(srcV),
-        srcW(srcW)
-  {
-    runMessage();
-    run();
-  }
-  inline void op(int i,
-                 int j,
-                 int k,
-                 MACGrid &dst,
-                 const Grid<int> &uState,
-                 const Grid<int> &vState,
-                 const Grid<int> &wState,
-                 Grid<Real> &srcU,
-                 Grid<Real> &srcV,
-                 Grid<Real> &srcW) const
-  {
-    dst(i, j, k).x = (uState(i, j, k) == FlagGrid::TypeFluid) ? srcU(i, j, k) : 0;
-    dst(i, j, k).y = (vState(i, j, k) == FlagGrid::TypeFluid) ? srcV(i, j, k) : 0;
-    if (dst.is3D())
-      dst(i, j, k).z = (wState(i, j, k) == FlagGrid::TypeFluid) ? srcW(i, j, k) : 0;
-  }
-  inline MACGrid &getArg0()
-  {
-    return dst;
-  }
-  typedef MACGrid type0;
-  inline const Grid<int> &getArg1()
-  {
-    return uState;
-  }
-  typedef Grid<int> type1;
-  inline const Grid<int> &getArg2()
-  {
-    return vState;
-  }
-  typedef Grid<int> type2;
-  inline const Grid<int> &getArg3()
-  {
-    return wState;
-  }
-  typedef Grid<int> type3;
-  inline Grid<Real> &getArg4()
-  {
-    return srcU;
-  }
-  typedef Grid<Real> type4;
-  inline Grid<Real> &getArg5()
-  {
-    return srcV;
-  }
-  typedef Grid<Real> type5;
-  inline Grid<Real> &getArg6()
-  {
-    return srcW;
-  }
-  typedef Grid<Real> type6;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnApplyVelocities ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    const int _maxX = maxX;
-    const int _maxY = maxY;
-    if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, dst, uState, vState, wState, srcU, srcV, srcW);
-    }
-    else {
-      const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, dst, uState, vState, wState, srcU, srcV, srcW);
-    }
-  }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
-  MACGrid &dst;
-  const Grid<int> &uState;
-  const Grid<int> &vState;
-  const Grid<int> &wState;
-  Grid<Real> &srcU;
-  Grid<Real> &srcV;
-  Grid<Real> &srcW;
-};
-
-void solveViscosity(const FlagGrid &flags,
-                    MACGrid &vel,
-                    Grid<Real> &cVolLiquid,
-                    Grid<Real> &uVolLiquid,
-                    Grid<Real> &vVolLiquid,
-                    Grid<Real> &wVolLiquid,
-                    Grid<Real> &exVolLiquid,
-                    Grid<Real> &eyVolLiquid,
-                    Grid<Real> &ezVolLiquid,
-                    Grid<Real> &viscosity,
-                    const Real dt,
-                    const Real dx,
-                    const Real cgAccuracy,
-                    const Real cgMaxIterFac)
-{
-  const Real factor = dt * square(1.0 / dx);
-  const int maxIter = (int)(cgMaxIterFac * flags.getSize().max()) * (flags.is3D() ? 1 : 4);
-  GridCg<ApplyMatrixViscosityU> *uGcg;
-  GridCg<ApplyMatrixViscosityV> *vGcg;
-  GridCg<ApplyMatrixViscosityW> *wGcg;
-
-  // Tmp grids for CG solve in U, V, W dimensions
-  FluidSolver *parent = flags.getParent();
-  Grid<Real> uResidual(parent);
-  Grid<Real> vResidual(parent);
-  Grid<Real> wResidual(parent);
-  Grid<Real> uSearch(parent);
-  Grid<Real> vSearch(parent);
-  Grid<Real> wSearch(parent);
-  Grid<Real> uTmp(parent);
-  Grid<Real> vTmp(parent);
-  Grid<Real> wTmp(parent);
-  Grid<Real> uRhs(parent);
-  Grid<Real> vRhs(parent);
-  Grid<Real> wRhs(parent);
-
-  // A matrix U grids
-  Grid<Real> uA0(parent);        // diagonal elements in A matrix
-  Grid<Real> uAplusi(parent);    // neighbor at i+1
-  Grid<Real> uAplusj(parent);    // neighbor at j+1
-  Grid<Real> uAplusk(parent);    // neighbor at k+1
-  Grid<Real> uAminusi(parent);   // neighbor at i-1
-  Grid<Real> uAminusj(parent);   // neighbor at j-1
-  Grid<Real> uAminusk(parent);   // neighbor at k-1
-  Grid<Real> uAhelper1(parent);  // additional helper grids for off diagonal elements
-  Grid<Real> uAhelper2(parent);
-  Grid<Real> uAhelper3(parent);
-  Grid<Real> uAhelper4(parent);
-  Grid<Real> uAhelper5(parent);
-  Grid<Real> uAhelper6(parent);
-  Grid<Real> uAhelper7(parent);
-  Grid<Real> uAhelper8(parent);
-
-  // A matrix V grids
-  Grid<Real> vA0(parent);
-  Grid<Real> vAplusi(parent);
-  Grid<Real> vAplusj(parent);
-  Grid<Real> vAplusk(parent);
-  Grid<Real> vAminusi(parent);
-  Grid<Real> vAminusj(parent);
-  Grid<Real> vAminusk(parent);
-  Grid<Real> vAhelper1(parent);
-  Grid<Real> vAhelper2(parent);
-  Grid<Real> vAhelper3(parent);
-  Grid<Real> vAhelper4(parent);
-  Grid<Real> vAhelper5(parent);
-  Grid<Real> vAhelper6(parent);
-  Grid<Real> vAhelper7(parent);
-  Grid<Real> vAhelper8(parent);
-
-  // A matrix W grids
-  Grid<Real> wA0(parent);
-  Grid<Real> wAplusi(parent);
-  Grid<Real> wAplusj(parent);
-  Grid<Real> wAplusk(parent);
-  Grid<Real> wAminusi(parent);
-  Grid<Real> wAminusj(parent);
-  Grid<Real> wAminusk(parent);
-  Grid<Real> wAhelper1(parent);
-  Grid<Real> wAhelper2(parent);
-  Grid<Real> wAhelper3(parent);
-  Grid<Real> wAhelper4(parent);
-  Grid<Real> wAhelper5(parent);
-  Grid<Real> wAhelper6(parent);
-  Grid<Real> wAhelper7(parent);
-  Grid<Real> wAhelper8(parent);
-
-  // Solution grids for CG solvers
-  Grid<Real> uSolution(parent);
-  Grid<Real> vSolution(parent);
-  Grid<Real> wSolution(parent);
-
-  // Save state of voxel face (fluid or obstacle)
-  Grid<int> uState(parent);
-  Grid<int> vState(parent);
-  Grid<int> wState(parent);
-
-  // Save state of voxel face (fluid or obstacle)
-  KnUpdateFaceStates(flags, uState, vState, wState);
-
-  // Shorter names for flags, we will use them often
-  int isFluid = FlagGrid::TypeFluid;
-  int isObstacle = FlagGrid::TypeObstacle;
-
-  // Main viscosity loop: construct A matrices and rhs's in all dimensions
-  FOR_IJK_BND(flags, 1)
-  {
-
-    // U-terms: 2u_xx+ v_xy +uyy + u_zz + w_xz
-    if (uState(i, j, k) == isFluid) {
-
-      uRhs(i, j, k) = uVolLiquid(i, j, k) * vel(i, j, k).x;
-      uA0(i, j, k) = uVolLiquid(i, j, k);
-
-      Real viscRight = viscosity(i, j, k);
-      Real viscLeft = viscosity(i - 1, j, k);
-      Real volRight = cVolLiquid(i, j, k);
-      Real volLeft = cVolLiquid(i - 1, j, k);
-
-      Real viscTop = 0.25 * (viscosity(i - 1, j + 1, k) + viscosity(i - 1, j, k) +
-                             viscosity(i, j + 1, k) + viscosity(i, j, k));
-      Real viscBottom = 0.25 * (viscosity(i - 1, j, k) + viscosity(i - 1, j - 1, k) +
-                                viscosity(i, j, k) + viscosity(i, j - 1, k));
-      Real volTop = ezVolLiquid(i, j + 1, k);
-      Real volBottom = ezVolLiquid(i, j, k);
-
-      Real viscFront = 0.25 * (viscosity(i - 1, j, k + 1) + viscosity(i - 1, j, k) +
-                               viscosity(i, j, k + 1) + viscosity(i, j, k));
-      Real viscBack = 0.25 * (viscosity(i - 1, j, k) + viscosity(i - 1, j, k - 1) +
-                              viscosity(i, j, k) + viscosity(i, j, k - 1));
-      Real volFront = eyVolLiquid(i, j, k + 1);
-      Real volBack = eyVolLiquid(i, j, k);
-
-      Real factorRight = 2 * factor * viscRight * volRight;
-      Real factorLeft = 2 * factor * viscLeft * volLeft;
-      Real factorTop = factor * viscTop * volTop;
-      Real factorBottom = factor * viscBottom * volBottom;
-      Real factorFront = factor * viscFront * volFront;
-      Real factorBack = factor * viscBack * volBack;
-
-      // u_x_right
-      uA0(i, j, k) += factorRight;
-      if (uState(i + 1, j, k) == isFluid) {
-        uAplusi(i, j, k) += -factorRight;
-      }
-      else if (uState(i + 1, j, k) == isObstacle) {
-        uRhs(i, j, k) -= -vel(i + 1, j, k).x * factorRight;
-      }
-
-      // u_x_left
-      uA0(i, j, k) += factorLeft;
-      if (uState(i - 1, j, k) == isFluid) {
-        uAminusi(i, j, k) += -factorLeft;
-      }
-      else if (uState(i - 1, j, k) == isObstacle) {
-        uRhs(i, j, k) -= -vel(i - 1, j, k).x * factorLeft;
-      }
-
-      // u_y_top
-      uA0(i, j, k) += factorTop;
-      if (uState(i, j + 1, k) == isFluid) {
-        uAplusj(i, j, k) += -factorTop;
-      }
-      else if (uState(i, j + 1, k) == isObstacle) {
-        uRhs(i, j, k) -= -vel(i, j + 1, k).x * factorTop;
-      }
-
-      // u_y_bottom
-      uA0(i, j, k) += factorBottom;
-      if (uState(i, j - 1, k) == isFluid) {
-        uAminusj(i, j, k) += -factorBottom;
-      }
-      else if (uState(i, j - 1, k) == isObstacle) {
-        uRhs(i, j, k) -= -vel(i, j - 1, k).x * factorBottom;
-      }
-
-      // u_z_front
-      uA0(i, j, k) += factorFront;
-      if (uState(i, j, k + 1) == isFluid) {
-        uAplusk(i, j, k) += -factorFront;
-      }
-      else if (uState(i, j, k + 1) == isObstacle) {
-        uRhs(i, j, k) -= -vel(i, j, k + 1).x * factorFront;
-      }
-
-      // u_z_back
-      uA0(i, j, k) += factorBack;
-      if (uState(i, j, k - 1) == isFluid) {
-        uAminusk(i, j, k) += -factorBack;
-      }
-      else if (uState(i, j, k - 1) == isObstacle) {
-        uRhs(i, j, k) -= -vel(i, j, k - 1).x * factorBack;
-      }
-
-      // v_x_top
-      if (vState(i, j + 1, k) == isFluid) {
-        uAhelper1(i, j, k) += -factorTop;
-      }
-      else if (vState(i, j + 1, k) == isObstacle) {
-        uRhs(i, j, k) -= -vel(i, j + 1, k).y * factorTop;
-      }
-
-      if (vState(i - 1, j + 1, k) == isFluid) {
-        uAhelper2(i, j, k) += factorTop;
-      }
-      else if (vState(i - 1, j + 1, k) == isObstacle) {
-        uRhs(i, j, k) -= vel(i - 1, j + 1, k).y * factorTop;
-      }
-
-      // v_x_bottom
-      if (vState(i, j, k) == isFluid) {
-        uAhelper3(i, j, k) += factorBottom;
-      }
-      else if (vState(i, j, k) == isObstacle) {
-        uRhs(i, j, k) -= vel(i, j, k).y * factorBottom;
-      }
-
-      if (vState(i - 1, j, k) == isFluid) {
-        uAhelper4(i, j, k) += -factorBottom;
-      }
-      else if (vState(i - 1, j, k) == isObstacle) {
-        uRhs(i, j, k) -= -vel(i - 1, j, k).y * factorBottom;
-      }
-
-      // w_x_front
-      if (wState(i, j, k + 1) == isFluid) {
-        uAhelper5(i, j, k) += -factorFront;
-      }
-      else if (wState(i, j, k + 1) == isObstacle) {
-        uRhs(i, j, k) -= -vel(i, j, k + 1).z * factorFront;
-      }
-
-      if (wState(i - 1, j, k + 1) == isFluid) {
-        uAhelper6(i, j, k) += factorFront;
-      }
-      else if (wState(i - 1, j, k + 1) == isObstacle) {
-        uRhs(i, j, k) -= vel(i - 1, j, k + 1).z * factorFront;
-      }
-
-      // w_x_back
-      if (wState(i, j, k) == isFluid) {
-        uAhelper7(i, j, k) += factorBack;
-      }
-      else if (wState(i, j, k) == isObstacle) {
-        uRhs(i, j, k) -= vel(i, j, k).z * factorBack;
-      }
-
-      if (wState(i - 1, j, k) == isFluid) {
-        uAhelper8(i, j, k) += -factorBack;
-      }
-      else if (wState(i - 1, j, k) == isObstacle) {
-        uRhs(i, j, k) -= -vel(i - 1, j, k).z * factorBack;
-      }
-    }
-
-    // V-terms: vxx + 2vyy + vzz + u_yx + w_yz
-    if (vState(i, j, k) == isFluid) {
-
-      vRhs(i, j, k) = vVolLiquid(i, j, k) * vel(i, j, k).y;
-      vA0(i, j, k) = vVolLiquid(i, j, k);
-
-      Real viscRight = 0.25 * (viscosity(i, j - 1, k) + viscosity(i + 1, j - 1, k) +
-                               viscosity(i, j, k) + viscosity(i + 1, j, k));
-      Real viscLeft = 0.25 * (viscosity(i, j - 1, k) + viscosity(i - 1, j - 1, k) +
-                              viscosity(i, j, k) + viscosity(i - 1, j, k));
-      Real volRight = ezVolLiquid(i + 1, j, k);
-      Real volLeft = ezVolLiquid(i, j, k);
-
-      Real viscTop = viscosity(i, j, k);
-      Real viscBottom = viscosity(i, j - 1, k);
-      Real volTop = cVolLiquid(i, j, k);
-      Real volBottom = cVolLiquid(i, j - 1, k);
-
-      Real viscFront = 0.25 * (viscosity(i, j - 1, k) + viscosity(i, j - 1, k + 1) +
-                               viscosity(i, j, k) + viscosity(i, j, k + 1));
-      Real viscBack = 0.25 * (viscosity(i, j - 1, k) + viscosity(i, j - 1, k - 1) +
-                              viscosity(i, j, k) + viscosity(i, j, k - 1));
-      Real volFront = exVolLiquid(i, j, k + 1);
-      Real volBack = exVolLiquid(i, j, k);
-
-      Real factorRight = factor * viscRight * volRight;
-      Real factorLeft = factor * viscLeft * volLeft;
-      Real factorTop = 2 * factor * viscTop * volTop;
-      Real factorBottom = 2 * factor * viscBottom * volBottom;
-      Real factorFront = factor * viscFront * volFront;
-      Real factorBack = factor * viscBack * volBack;
-
-      // v_x_right
-      vA0(i, j, k) += factorRight;
-      if (vState(i + 1, j, k) == isFluid) {
-        vAplusi(i, j, k) += -factorRight;
-      }
-      else if (vState(i + 1, j, k) == isObstacle) {
-        vRhs(i, j, k) -= -vel(i + 1, j, k).y * factorRight;
-      }
-
-      // v_x_left
-      vA0(i, j, k) += factorLeft;
-      if (vState(i - 1, j, k) == isFluid) {
-        vAminusi(i, j, k) += -factorLeft;
-      }
-      else if (vState(i - 1, j, k) == isObstacle) {
-        vRhs(i, j, k) -= -vel(i - 1, j, k).y * factorLeft;
-      }
-
-      // vy_top
-      vA0(i, j, k) += factorTop;
-      if (vState(i, j + 1, k) == isFluid) {
-        vAplusj(i, j, k) += -factorTop;
-      }
-      else if (vState(i, j + 1, k) == isObstacle) {
-        vRhs(i, j, k) -= -vel(i, j + 1, k).y * factorTop;
-      }
-
-      // vy_bottom
-      vA0(i, j, k) += factorBottom;
-      if (vState(i, j - 1, k) == isFluid) {
-        vAminusj(i, j, k) += -factorBottom;
-      }
-      else if (vState(i, j - 1, k) == isObstacle) {
-        vRhs(i, j, k) -= -vel(i, j - 1, k).y * factorBottom;
-      }
-
-      // v_z_front
-      vA0(i, j, k) += factorFront;
-      if (vState(i, j, k + 1) == isFluid) {
-        vAplusk(i, j, k) += -factorFront;
-      }
-      else if (vState(i, j, k + 1) == isObstacle) {
-        vRhs(i, j, k) -= -vel(i, j, k + 1).y * factorFront;
-      }
-
-      // v_z_back
-      vA0(i, j, k) += factorBack;
-      if (vState(i, j, k - 1) == isFluid) {
-        vAminusk(i, j, k) += -factorBack;
-      }
-      else if (vState(i, j, k - 1) == isObstacle) {
-        vRhs(i, j, k) -= -vel(i, j, k - 1).y * factorBack;
-      }
-
-      // u_y_right
-      if (uState(i + 1, j, k) == isFluid) {
-        vAhelper1(i, j, k) += -factorRight;
-      }
-      else if (uState(i + 1, j, k) == isObstacle) {
-        vRhs(i, j, k) -= -vel(i + 1, j, k).x * factorRight;
-      }
-
-      if (uState(i + 1, j - 1, k) == isFluid) {
-        vAhelper2(i, j, k) += factorRight;
-      }
-      else if (uState(i + 1, j - 1, k) == isObstacle) {
-        vRhs(i, j, k) -= vel(i + 1, j - 1, k).x * factorRight;
-      }
-
-      // u_y_left
-      if (uState(i, j, k) == isFluid) {
-        vAhelper3(i, j, k) += factorLeft;
-      }
-      else if (uState(i, j, k) == isObstacle) {
-        vRhs(i, j, k) -= vel(i, j, k).x * factorLeft;
-      }
-
-      if (uState(i, j - 1, k) == isFluid) {
-        vAhelper4(i, j, k) += -factorLeft;
-      }
-      else if (uState(i, j - 1, k) == isObstacle) {
-        vRhs(i, j, k) -= -vel(i, j - 1, k).x * factorLeft;
-      }
-
-      // w_y_front
-      if (wState(i, j, k + 1) == isFluid) {
-        vAhelper5(i, j, k) += -factorFront;
-      }
-      else if (wState(i, j, k + 1) == isObstacle) {
-        vRhs(i, j, k) -= -vel(i, j, k + 1).z * factorFront;
-      }
-
-      if (wState(i, j - 1, k + 1) == isFluid) {
-        vAhelper6(i, j, k) += factorFront;
-      }
-      else if (wState(i, j - 1, k + 1) == isObstacle) {
-        vRhs(i, j, k) -= vel(i, j - 1, k + 1).z * factorFront;
-      }
-
-      // w_y_back
-      if (wState(i, j, k) == isFluid) {
-        vAhelper7(i, j, k) += factorBack;
-      }
-      else if (wState(i, j, k) == isObstacle) {
-        vRhs(i, j, k) -= vel(i, j, k).z * factorBack;
-      }
-
-      if (wState(i, j - 1, k) == isFluid) {
-        vAhelper8(i, j, k) += -factorBack;
-      }
-      else if (wState(i, j - 1, k) == isObstacle) {
-        vRhs(i, j, k) -= -vel(i, j - 1, k).z * factorBack;
-      }
-    }
-
-    // W-terms: wxx+ wyy+ 2wzz + u_zx + v_zy
-    if (wState(i, j, k) == isFluid) {
-
-      wRhs(i, j, k) = wVolLiquid(i, j, k) * vel(i, j, k).z;
-      wA0(i, j, k) = wVolLiquid(i, j, k);
-
-      Real viscRight = 0.25 * (viscosity(i, j, k) + viscosity(i, j, k - 1) +
-                               viscosity(i + 1, j, k) + viscosity(i + 1, j, k - 1));
-      Real viscLeft = 0.25 * (viscosity(i, j, k) + viscosity(i, j, k - 1) +
-                              viscosity(i - 1, j, k) + viscosity(i - 1, j, k - 1));
-      Real volRight = eyVolLiquid(i + 1, j, k);
-      Real volLeft = eyVolLiquid(i, j, k);
-
-      Real viscTop = 0.25 * (viscosity(i, j, k) + viscosity(i, j, k - 1) + viscosity(i, j + 1, k) +
-                             viscosity(i, j + 1, k - 1));
-      Real viscBottom = 0.25 * (viscosity(i, j, k) + viscosity(i, j, k - 1) +
-                                viscosity(i, j - 1, k) + viscosity(i, j - 1, k - 1));
-      Real volTop = exVolLiquid(i, j + 1, k);
-      Real volBottom = exVolLiquid(i, j, k);
-
-      Real viscFront = viscosity(i, j, k);
-      Real viscBack = viscosity(i, j, k - 1);
-      Real volFront = cVolLiquid(i, j, k);
-      Real volBack = cVolLiquid(i, j, k - 1);
-
-      Real factorRight = factor * viscRight * volRight;
-      Real factorLeft = factor * viscLeft * volLeft;
-      Real factorTop = factor * viscTop * volTop;
-      Real factorBottom = factor * viscBottom * volBottom;
-      Real factorFront = 2 * factor * viscFront * volFront;
-      Real factorBack = 2 * factor * viscBack * volBack;
-
-      // w_x_right
-      wA0(i, j, k) += factorRight;
-      if (wState(i + 1, j, k) == isFluid) {
-        wAplusi(i, j, k) += -factorRight;
-      }
-      else if (wState(i + 1, j, k) == isObstacle) {
-        wRhs(i, j, k) -= -vel(i + 1, j, k).z * factorRight;
-      }
-
-      // w_x_left
-      wA0(i, j, k) += factorLeft;
-      if (wState(i - 1, j, k) == isFluid) {
-        wAminusi(i, j, k) += -factorLeft;
-      }
-      else if (wState(i - 1, j, k) == isObstacle) {
-        wRhs(i, j, k) -= -vel(i - 1, j, k).z * factorLeft;
-      }
-
-      // w_y_top
-      wA0(i, j, k) += factorTop;
-      if (wState(i, j + 1, k) == isFluid) {
-        wAplusj(i, j, k) += -factorTop;
-      }
-      else if (wState(i, j + 1, k) == isObstacle) {
-        wRhs(i, j, k) -= -vel(i, j + 1, k).z * factorTop;
-      }
-
-      // w_y_bottom
-      wA0(i, j, k) += factorBottom;
-      if (wState(i, j - 1, k) == isFluid) {
-        wAminusj(i, j, k) += -factorBottom;
-      }
-      else if (wState(i, j - 1, k) == isObstacle) {
-        wRhs(i, j, k) -= -vel(i, j - 1, k).z * factorBottom;
-      }
-
-      // w_z_front
-      wA0(i, j, k) += factorFront;
-      if (wState(i, j, k + 1) == isFluid) {
-        wAplusk(i, j, k) += -factorFront;
-      }
-      else if (wState(i, j, k + 1) == isObstacle) {
-        wRhs(i, j, k) -= -vel(i, j, k + 1).z * factorFront;
-      }
-
-      // w_z_back
-      wA0(i, j, k) += factorBack;
-      if (wState(i, j, k - 1) == isFluid) {
-        wAminusk(i, j, k) += -factorBack;
-      }
-      else if (wState(i, j, k - 1) == isObstacle) {
-        wRhs(i, j, k) -= -vel(i, j, k - 1).z * factorBack;
-      }
-
-      // u_z_right
-      if (uState(i + 1, j, k) == isFluid) {
-        wAhelper1(i, j, k) += -factorRight;
-      }
-      else if (uState(i + 1, j, k) == isObstacle) {
-        wRhs(i, j, k) -= -vel(i + 1, j, k).x * factorRight;
-      }
-
-      if (uState(i + 1, j, k - 1) == isFluid) {
-        wAhelper2(i, j, k) += factorRight;
-      }
-      else if (uState(i + 1, j, k - 1) == isObstacle) {
-        wRhs(i, j, k) -= vel(i + 1, j, k - 1).x * factorRight;
-      }
-
-      // u_z_left
-      if (uState(i, j, k) == isFluid) {
-        wAhelper3(i, j, k) += factorLeft;
-      }
-      else if (uState(i, j, k) == isObstacle) {
-        wRhs(i, j, k) -= vel(i, j, k).x * factorLeft;
-      }
-
-      if (uState(i, j, k - 1) == isFluid) {
-        wAhelper4(i, j, k) += -factorLeft;
-      }
-      else if (uState(i, j, k - 1) == isObstacle) {
-        wRhs(i, j, k) -= -vel(i, j, k - 1).x * factorLeft;
-      }
-
-      // v_z_top
-      if (vState(i, j + 1, k) == isFluid) {
-        wAhelper5(i, j, k) += -factorTop;
-      }
-      else if (vState(i, j + 1, k) == isObstacle) {
-        wRhs(i, j, k) -= -vel(i, j + 1, k).y * factorTop;
-      }
-
-      if (vState(i, j + 1, k - 1) == isFluid) {
-        wAhelper6(i, j, k) += factorTop;
-      }
-      else if (vState(i, j + 1, k - 1) == isObstacle) {
-        wRhs(i, j, k) -= vel(i, j + 1, k - 1).y * factorTop;
-      }
-
-      // v_z_bottom
-      if (vState(i, j, k) == isFluid) {
-        wAhelper7(i, j, k) += factorBottom;
-      }
-      else if (vState(i, j, k) == isObstacle) {
-        wRhs(i, j, k) -= vel(i, j, k).y * factorBottom;
-      }
-
-      if (vState(i, j, k - 1) == isFluid) {
-        wAhelper8(i, j, k) += -factorBottom;
-      }
-      else if (vState(i, j, k - 1) == isObstacle) {
-        wRhs(i, j, k) -= -vel(i, j, k - 1).y * factorBottom;
-      }
-    }
-  }
-
-  // CG solver for U
-  if (flags.is3D()) {
-    vector<Grid<Real> *> uMatA{&uA0,
-                               &uAplusi,
-                               &uAplusj,
-                               &uAplusk,
-                               &uAminusi,
-                               &uAminusj,
-                               &uAminusk,
-                               &uAhelper1,
-                               &uAhelper2,
-                               &uAhelper3,
-                               &uAhelper4,
-                               &uAhelper5,
-                               &uAhelper6,
-                               &uAhelper7,
-                               &uAhelper8};
-    vector<Grid<Real> *> uVecRhs{&vRhs, &wRhs};
-    uGcg = new GridCg<ApplyMatrixViscosityU>(
-        uSolution, uRhs, uResidual, uSearch, flags, uTmp, uMatA, uVecRhs);
-  }
-  else {
-    errMsg("Viscosity: 2D Matrix application not yet supported in viscosity solver");
-  }
-
-  // CG solver for V
-  if (flags.is3D()) {
-    vector<Grid<Real> *> vMatA{&vA0,
-                               &vAplusi,
-                               &vAplusj,
-                               &vAplusk,
-                               &vAminusi,
-                               &vAminusj,
-                               &vAminusk,
-                               &vAhelper1,
-                               &vAhelper2,
-                               &vAhelper3,
-                               &vAhelper4,
-                               &vAhelper5,
-                               &vAhelper6,
-                               &vAhelper7,
-                               &vAhelper8};
-    vector<Grid<Real> *> vVecRhs{&uRhs, &wRhs};
-    vGcg = new GridCg<ApplyMatrixViscosityV>(
-        vSolution, vRhs, vResidual, vSearch, flags, vTmp, vMatA, vVecRhs);
-  }
-  else {
-    errMsg("Viscosity: 2D Matrix application not yet supported in viscosity solver");
-  }
-
-  // CG solver for W
-  if (flags.is3D()) {
-    vector<Grid<Real> *> wMatA{&wA0,
-                               &wAplusi,
-                               &wAplusj,
-                               &wAplusk,
-                               &wAminusi,
-                               &wAminusj,
-                               &wAminusk,
-                               &wAhelper1,
-                               &wAhelper2,
-                               &wAhelper3,
-                               &wAhelper4,
-                               &wAhelper5,
-                               &wAhelper6,
-                               &wAhelper7,
-                               &wAhelper8};
-    vector<Grid<Real> *> wVecRhs{&uRhs, &vRhs};
-    wGcg = new GridCg<ApplyMatrixViscosityW>(
-        wSolution, wRhs, wResidual, wSearch, flags, wTmp, wMatA, wVecRhs);
-  }
-  else {
-    errMsg("Viscosity: 2D Matrix application not yet supported in viscosity solver");
-  }
-
-  // Same accuracy for all dimensions
-  uGcg->setAccuracy(cgAccuracy);
-  vGcg->setAccuracy(cgAccuracy);
-  wGcg->setAccuracy(cgAccuracy);
-
-  // CG solve. Preconditioning not supported yet. Instead, U, V, W  can optionally be solved in
-  // parallel.
-  for (int uIter = 0, vIter = 0, wIter = 0; uIter < maxIter || vIter < maxIter || wIter < maxIter;
-       uIter++, vIter++, wIter++) {
-#if ENABLE_PARALLEL == 1
-    parallel_block do_parallel
-#endif
-        if (uIter < maxIter && !uGcg->iterate()) uIter = maxIter;
-#if ENABLE_PARALLEL == 1
-    do_end do_parallel
-#endif
-        if (vIter < maxIter && !vGcg->iterate()) vIter = maxIter;
-#if ENABLE_PARALLEL == 1
-    do_end do_parallel
-#endif
-        if (wIter < maxIter && !wGcg->iterate()) wIter = maxIter;
-#if ENABLE_PARALLEL == 1
-    do_end block_end
-#endif
-
-        // Make sure that next CG iteration has updated rhs grids
-        uRhs.copyFrom(uSearch);
-    vRhs.copyFrom(vSearch);
-    wRhs.copyFrom(wSearch);
-  }
-  debMsg(
-      "Viscosity: solveViscosity() done. "
-      "Iterations (u,v,w): ("
-          << uGcg->getIterations() << "," << vGcg->getIterations() << "," << wGcg->getIterations()
-          << "), "
-             "Residual (u,v,w): ("
-          << uGcg->getResNorm() << "," << vGcg->getResNorm() << "," << wGcg->getResNorm() << ")",
-      2);
-
-  delete uGcg;
-  delete vGcg;
-  delete wGcg;
-
-  // Apply solutions to global velocity grid
-  KnApplyVelocities(vel, uState, vState, wState, uSolution, vSolution, wSolution);
-}
-
-//! To use the viscosity plugin, scenes must call this function before solving pressure.
-//! Note that the 'volumes' grid uses 2x the base resolution
-
-void applyViscosity(const FlagGrid &flags,
-                    const Grid<Real> &phi,
-                    MACGrid &vel,
-                    Grid<Real> &volumes,
-                    Grid<Real> &viscosity,
-                    const Real cgAccuracy = 1e-9,
-                    const Real cgMaxIterFac = 1.5)
-{
-  const Real dx = flags.getParent()->getDx();
-  const Real dt = flags.getParent()->getDt();
-
-  // Reserve temp grids for volume weight calculation
-  FluidSolver *parent = flags.getParent();
-  Grid<Real> cVolLiquid(parent);
-  Grid<Real> uVolLiquid(parent);
-  Grid<Real> vVolLiquid(parent);
-  Grid<Real> wVolLiquid(parent);
-  Grid<Real> exVolLiquid(parent);
-  Grid<Real> eyVolLiquid(parent);
-  Grid<Real> ezVolLiquid(parent);
-
-  // Ensure final weight grid gets cleared at every step
-  volumes.clear();
-
-  // Save viscous fluid volume in double-sized volumes grid
-  computeWeights(phi,
-                 volumes,
-                 cVolLiquid,
-                 uVolLiquid,
-                 vVolLiquid,
-                 wVolLiquid,
-                 exVolLiquid,
-                 eyVolLiquid,
-                 ezVolLiquid,
-                 dx);
-
-  // Set up A matrix and rhs. Solve with CG. Update velocity grid.
-  solveViscosity(flags,
-                 vel,
-                 cVolLiquid,
-                 uVolLiquid,
-                 vVolLiquid,
-                 wVolLiquid,
-                 exVolLiquid,
-                 eyVolLiquid,
-                 ezVolLiquid,
-                 viscosity,
-                 dt,
-                 dx,
-                 cgAccuracy,
-                 cgMaxIterFac);
-}
-static PyObject *_W_0(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
-{
-  try {
-    PbArgs _args(_linargs, _kwds);
-    FluidSolver *parent = _args.obtainParent();
-    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
-    pbPreparePlugin(parent, "applyViscosity", !noTiming);
-    PyObject *_retval = nullptr;
-    {
-      ArgLocker _lock;
-      const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 0, &_lock);
-      const Grid<Real> &phi = *_args.getPtr<Grid<Real>>("phi", 1, &_lock);
-      MACGrid &vel = *_args.getPtr<MACGrid>("vel", 2, &_lock);
-      Grid<Real> &volumes = *_args.getPtr<Grid<Real>>("volumes", 3, &_lock);
-      Grid<Real> &viscosity = *_args.getPtr<Grid<Real>>("viscosity", 4, &_lock);
-      const Real cgAccuracy = _args.getOpt<Real>("cgAccuracy", 5, 1e-9, &_lock);
-      const Real cgMaxIterFac = _args.getOpt<Real>("cgMaxIterFac", 6, 1.5, &_lock);
-      _retval = getPyNone();
-      applyViscosity(flags, phi, vel, volumes, viscosity, cgAccuracy, cgMaxIterFac);
-      _args.check();
-    }
-    pbFinalizePlugin(parent, "applyViscosity", !noTiming);
-    return _retval;
-  }
-  catch (std::exception &e) {
-    pbSetError("applyViscosity", e.what());
-    return 0;
-  }
-}
-static const Pb::Register _RP_applyViscosity("", "applyViscosity", _W_0);
-extern "C" {
-void PbRegister_applyViscosity()
-{
-  KEEP_UNUSED(_RP_applyViscosity);
-}
-}
-
-}  // namespace Manta
-
-#if ENABLE_PARALLEL == 1
-
-#  undef parallel_block
-#  undef do_parallel
-#  undef do_end
-#  undef block_end
-#  undef parallel_for
-#  undef parallel_end
-
-#endif
diff --git a/extern/mantaflow/preprocessed/plugin/vortexplugins.cpp b/extern/mantaflow/preprocessed/plugin/vortexplugins.cpp
index 18222c4ccda..6386e835447 100644
--- a/extern/mantaflow/preprocessed/plugin/vortexplugins.cpp
+++ b/extern/mantaflow/preprocessed/plugin/vortexplugins.cpp
@@ -188,8 +188,7 @@ struct KnAcceleration : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(
-      IndexInt idx, MACGrid &a, const MACGrid &v1, const MACGrid &v0, const Real idt) const
+  inline void op(IndexInt idx, MACGrid &a, const MACGrid &v1, const MACGrid &v0, const Real idt)  // kernel body for one linear index: a = (v1 - v0) * idt
   {
     a[idx] = (v1[idx] - v0[idx]) * idt;
   }
@@ -213,21 +212,17 @@ struct KnAcceleration : public KernelBase {
     return idt;
   }
   typedef Real type3;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnAcceleration ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, a, v1, v0, idt);
-  }
+  void runMessage(){};  // empty body: prints nothing
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;  // loop bound: copy of `size` taken before the parallel region
+#pragma omp parallel
+    {
+      // The omp for below divides the index range [0, _sz) among the threads of this region.
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, a, v1, v0, idt);
+    }
   }
   MACGrid &a;
   const MACGrid &v1;
@@ -576,18 +571,17 @@ void VICintegration(VortexSheetMesh &mesh,
 
     // prepare CG solver
     const int maxIter = (int)(cgMaxIterFac * vel.getSize().max());
-    vector<Grid<Real> *> matA{&A0, &Ai, &Aj, &Ak};
-
     GridCgInterface *gcg = new GridCg<ApplyMatrix>(
-        solution, rhs, residual, search, flags, temp1, matA);
+        solution, rhs, residual, search, flags, temp1, &A0, &Ai, &Aj, &Ak);
     gcg->setAccuracy(cgAccuracy);
     gcg->setUseL2Norm(true);
     gcg->setICPreconditioner(
         (GridCgInterface::PreconditionType)precondition, &pca0, &pca1, &pca2, &pca3);
 
     // iterations
+    Real time = 0;
     for (int iter = 0; iter < maxIter; iter++) {
-      if (!gcg->iterate())
+      if (!gcg->iterate(time))
         iter = maxIter;
     }
     debMsg("VICintegration CG iterations:" << gcg->getIterations() << ", res:" << gcg->getSigma(),
diff --git a/extern/mantaflow/preprocessed/plugin/waveletturbulence.cpp b/extern/mantaflow/preprocessed/plugin/waveletturbulence.cpp
index 7d867542132..20e6a098d72 100644
--- a/extern/mantaflow/preprocessed/plugin/waveletturbulence.cpp
+++ b/extern/mantaflow/preprocessed/plugin/waveletturbulence.cpp
@@ -170,7 +170,7 @@ struct KnInterpolateMACGrid : public KernelBase {
                  const MACGrid &source,
                  const Vec3 &sourceFactor,
                  const Vec3 &off,
-                 int orderSpace) const
+                 int orderSpace)
   {
     Vec3 pos = Vec3(i, j, k) * sourceFactor + off;
 
@@ -207,37 +207,35 @@ struct KnInterpolateMACGrid : public KernelBase {
     return orderSpace;
   }
   typedef int type4;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnInterpolateMACGrid ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};  // empty body: prints nothing
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, target, source, sourceFactor, off, orderSpace);
+      // maxZ > 1: call op() for k in [minZ, maxZ), j in [0, _maxY), i in [0, _maxX).
+#pragma omp parallel
+      {
+        // The omp for applies to the outer k loop only; each k runs its full j/i sweep on one thread.
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, target, source, sourceFactor, off, orderSpace);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, target, source, sourceFactor, off, orderSpace);
+#pragma omp parallel
+      {
+        // Otherwise k stays 0 and the omp for divides the j loop over [0, _maxY) among threads.
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, target, source, sourceFactor, off, orderSpace);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   MACGrid &target;
   const MACGrid &source;
   const Vec3 &sourceFactor;
@@ -319,7 +317,7 @@ struct knApplySimpleNoiseVec3 : public KernelBase {
                  Grid<Vec3> &target,
                  const WaveletNoiseField &noise,
                  Real scale,
-                 const Grid<Real> *weight) const
+                 const Grid<Real> *weight)
   {
     if (!flags.isFluid(i, j, k))
       return;
@@ -353,37 +351,35 @@ struct knApplySimpleNoiseVec3 : public KernelBase {
     return weight;
   }
   typedef Grid<Real> type4;
-  void runMessage()
-  {
-    debMsg("Executing kernel knApplySimpleNoiseVec3 ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};  // empty body: prints nothing
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, flags, target, noise, scale, weight);
+      // maxZ > 1: call op() for k in [minZ, maxZ), j in [0, _maxY), i in [0, _maxX).
+#pragma omp parallel
+      {
+        // The omp for applies to the outer k loop only; each k runs its full j/i sweep on one thread.
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, flags, target, noise, scale, weight);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, flags, target, noise, scale, weight);
+#pragma omp parallel
+      {
+        // Otherwise k stays 0 and the omp for divides the j loop over [0, _maxY) among threads.
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, flags, target, noise, scale, weight);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   const FlagGrid &flags;
   Grid<Vec3> &target;
   const WaveletNoiseField &noise;
@@ -461,7 +457,7 @@ struct knApplySimpleNoiseReal : public KernelBase {
                  Grid<Real> &target,
                  const WaveletNoiseField &noise,
                  Real scale,
-                 const Grid<Real> *weight) const
+                 const Grid<Real> *weight)
   {
     if (!flags.isFluid(i, j, k))
       return;
@@ -495,37 +491,35 @@ struct knApplySimpleNoiseReal : public KernelBase {
     return weight;
   }
   typedef Grid<Real> type4;
-  void runMessage()
-  {
-    debMsg("Executing kernel knApplySimpleNoiseReal ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};  // empty body: prints nothing
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, flags, target, noise, scale, weight);
+      // maxZ > 1: call op() for k in [minZ, maxZ), j in [0, _maxY), i in [0, _maxX).
+#pragma omp parallel
+      {
+        // The omp for applies to the outer k loop only; each k runs its full j/i sweep on one thread.
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, flags, target, noise, scale, weight);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, flags, target, noise, scale, weight);
+#pragma omp parallel
+      {
+        // Otherwise k stays 0 and the omp for divides the j loop over [0, _maxY) among threads.
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, flags, target, noise, scale, weight);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   const FlagGrid &flags;
   Grid<Real> &target;
   const WaveletNoiseField &noise;
@@ -615,7 +609,7 @@ struct knApplyNoiseVec3 : public KernelBase {
                  const Grid<Real> *weight,
                  const Grid<Vec3> *uv,
                  bool uvInterpol,
-                 const Vec3 &sourceFactor) const
+                 const Vec3 &sourceFactor)
   {
     if (!flags.isFluid(i, j, k))
       return;
@@ -694,19 +688,40 @@ struct knApplyNoiseVec3 : public KernelBase {
     return sourceFactor;
   }
   typedef Vec3 type8;
-  void runMessage()
-  {
-    debMsg("Executing kernel knApplyNoiseVec3 ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};  // empty body: prints nothing
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
+      // maxZ > 1: call op() for k in [minZ, maxZ), j in [0, _maxY), i in [0, _maxX).
+#pragma omp parallel
+      {
+        // The omp for applies to the outer k loop only; each k runs its full j/i sweep on one thread.
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i,
+                 j,
+                 k,
+                 flags,
+                 target,
+                 noise,
+                 scale,
+                 scaleSpatial,
+                 weight,
+                 uv,
+                 uvInterpol,
+                 sourceFactor);
+      }
+    }
+    else {
+      const int k = 0;
+#pragma omp parallel
+      {
+        // Otherwise k stays 0 and the omp for divides the j loop over [0, _maxY) among threads.
+#pragma omp for
         for (int j = 0; j < _maxY; j++)
           for (int i = 0; i < _maxX; i++)
             op(i,
@@ -721,32 +736,9 @@ struct knApplyNoiseVec3 : public KernelBase {
                uv,
                uvInterpol,
                sourceFactor);
-    }
-    else {
-      const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i,
-             j,
-             k,
-             flags,
-             target,
-             noise,
-             scale,
-             scaleSpatial,
-             weight,
-             uv,
-             uvInterpol,
-             sourceFactor);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   const FlagGrid &flags;
   Grid<Vec3> &target;
   const WaveletNoiseField &noise;
@@ -834,7 +826,7 @@ struct KnApplyComputeEnergy : public KernelBase {
     run();
   }
   inline void op(
-      int i, int j, int k, const FlagGrid &flags, const MACGrid &vel, Grid<Real> &energy) const
+      int i, int j, int k, const FlagGrid &flags, const MACGrid &vel, Grid<Real> &energy)
   {
     Real e = 0.f;
     if (flags.isFluid(i, j, k)) {
@@ -858,37 +850,35 @@ struct KnApplyComputeEnergy : public KernelBase {
     return energy;
   }
   typedef Grid<Real> type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnApplyComputeEnergy ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};  // empty body: prints nothing
+  void run()
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, flags, vel, energy);
+      // maxZ > 1: call op() for k in [minZ, maxZ), j in [0, _maxY), i in [0, _maxX).
+#pragma omp parallel
+      {
+        // The omp for applies to the outer k loop only; each k runs its full j/i sweep on one thread.
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, flags, vel, energy);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, flags, vel, energy);
+#pragma omp parallel
+      {
+        // Otherwise k stays 0 and the omp for divides the j loop over [0, _maxY) among threads.
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, flags, vel, energy);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   const FlagGrid &flags;
   const MACGrid &vel;
   Grid<Real> &energy;
@@ -1019,7 +1009,7 @@ struct KnComputeStrainRateMag : public KernelBase {
     run();
   }
   inline void op(
-      int i, int j, int k, const MACGrid &vel, const Grid<Vec3> &velCenter, Grid<Real> &prod) const
+      int i, int j, int k, const MACGrid &vel, const Grid<Vec3> &velCenter, Grid<Real> &prod)
   {
     // compute Sij = 1/2 * (dU_i/dx_j + dU_j/dx_i)
     Vec3 diag = Vec3(vel(i + 1, j, k).x, vel(i, j + 1, k).y, 0.) - vel(i, j, k);
@@ -1056,37 +1046,35 @@ struct KnComputeStrainRateMag : public KernelBase {
     return prod;
   }
   typedef Grid<Real> type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnComputeStrainRateMag ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};  // no-op: emits no message
+  void run()  // calls op() for each (i, j, k) of the loop ranges below, workshared with OpenMP
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, vel, velCenter, prod);
+      // maxZ > 1: iterations k in [minZ, maxZ) are divided among the threads of the team.
+#pragma omp parallel
+      {
+        // Only the outer k loop is workshared; j and i start at 1, so index 0 is never visited.
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, vel, velCenter, prod);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, vel, velCenter, prod);
+#pragma omp parallel
+      {
+        // k is fixed at 0 here, so the rows j in [1, _maxY) are divided among the threads.
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, vel, velCenter, prod);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   const MACGrid &vel;
   const Grid<Vec3> &velCenter;
   Grid<Real> &prod;
diff --git a/extern/mantaflow/preprocessed/plugin/waves.cpp b/extern/mantaflow/preprocessed/plugin/waves.cpp
index 53c56b8c506..8becb5f6341 100644
--- a/extern/mantaflow/preprocessed/plugin/waves.cpp
+++ b/extern/mantaflow/preprocessed/plugin/waves.cpp
@@ -38,7 +38,7 @@ struct knCalcSecDeriv2d : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, const Grid<Real> &v, Grid<Real> &ret) const
+  inline void op(int i, int j, int k, const Grid<Real> &v, Grid<Real> &ret)
   {
     ret(i, j, k) = (-4. * v(i, j, k) + v(i - 1, j, k) + v(i + 1, j, k) + v(i, j - 1, k) +
                     v(i, j + 1, k));
@@ -53,37 +53,35 @@ struct knCalcSecDeriv2d : public KernelBase {
     return ret;
   }
   typedef Grid<Real> type1;
-  void runMessage()
-  {
-    debMsg("Executing kernel knCalcSecDeriv2d ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};  // no-op: emits no message
+  void run()  // calls op() for each (i, j, k) of the loop ranges below, workshared with OpenMP
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, v, ret);
+      // maxZ > 1: iterations k in [minZ, maxZ) are divided among the threads of the team.
+#pragma omp parallel
+      {
+        // Only the outer k loop is workshared; j and i start at 1, so index 0 is never visited.
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, v, ret);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, v, ret);
+#pragma omp parallel
+      {
+        // k is fixed at 0 here, so the rows j in [1, _maxY) are divided among the threads.
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, v, ret);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   const Grid<Real> &v;
   Grid<Real> &ret;
 };
@@ -151,44 +149,43 @@ struct knTotalSum : public KernelBase {
     return h;
   }
   typedef Grid<Real> type0;
-  void runMessage()
-  {
-    debMsg("Executing kernel knTotalSum ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
+  void runMessage(){};  // no-op: emits no message
+  void run()  // sum reduction: per-thread partials from op() are added into the member sum
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, h, sum);
+      // maxZ > 1: iterations k in [minZ, maxZ) are divided among the threads of the team.
+#pragma omp parallel
+      {
+        double sum = 0;  // thread-private partial; shadows the member this->sum
+#pragma omp for nowait
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, h, sum);  // op receives the private partial, not the member
+#pragma omp critical
+        {
+          this->sum += sum;  // merge into the member, one thread at a time
+        }
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, h, sum);
+#pragma omp parallel
+      {
+        double sum = 0;  // thread-private partial; shadows the member this->sum
+#pragma omp for nowait
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, h, sum);  // k is fixed at 0 in this branch
+#pragma omp critical
+        {
+          this->sum += sum;  // merge into the member, one thread at a time
+        }
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_reduce(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_reduce(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
-  knTotalSum(knTotalSum &o, tbb::split) : KernelBase(o), h(o.h), sum(0)
-  {
-  }
-  void join(const knTotalSum &o)
-  {
-    sum += o.sum;
-  }
   Grid<Real> &h;
   double sum;
 };
@@ -296,7 +293,7 @@ struct MakeRhsWE : public KernelBase {
                  const Grid<Real> &ut,
                  const Grid<Real> &utm1,
                  Real s,
-                 bool crankNic = false) const
+                 bool crankNic = false)
   {
     rhs(i, j, k) = (2. * ut(i, j, k) - utm1(i, j, k));
     if (crankNic) {
@@ -334,37 +331,35 @@ struct MakeRhsWE : public KernelBase {
     return crankNic;
   }
   typedef bool type5;
-  void runMessage()
-  {
-    debMsg("Executing kernel MakeRhsWE ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};  // no-op: emits no message
+  void run()  // calls op() for each (i, j, k) of the loop ranges below, workshared with OpenMP
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 1; j < _maxY; j++)
-          for (int i = 1; i < _maxX; i++)
-            op(i, j, k, flags, rhs, ut, utm1, s, crankNic);
+      // maxZ > 1: iterations k in [minZ, maxZ) are divided among the threads of the team.
+#pragma omp parallel
+      {
+        // Only the outer k loop is workshared; j and i start at 1, so index 0 is never visited.
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 1; j < _maxY; j++)
+            for (int i = 1; i < _maxX; i++)
+              op(i, j, k, flags, rhs, ut, utm1, s, crankNic);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 1; i < _maxX; i++)
-          op(i, j, k, flags, rhs, ut, utm1, s, crankNic);
+#pragma omp parallel
+      {
+        // k is fixed at 0 here, so the rows j in [1, _maxY) are divided among the threads.
+#pragma omp for
+        for (int j = 1; j < _maxY; j++)
+          for (int i = 1; i < _maxX; i++)
+            op(i, j, k, flags, rhs, ut, utm1, s, crankNic);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
-  }
   const FlagGrid &flags;
   Grid<Real> &rhs;
   const Grid<Real> &ut;
@@ -423,21 +418,17 @@ void cgSolveWE(const FlagGrid &flags,
 
   const int maxIter = (int)(cgMaxIterFac * flags.getSize().max()) * (flags.is3D() ? 1 : 4);
   GridCgInterface *gcg;
-  vector<Grid<Real> *> matA{&A0, &Ai, &Aj};
-
-  if (flags.is3D()) {
-    matA.push_back(&Ak);
-    gcg = new GridCg<ApplyMatrix>(out, rhs, residual, search, flags, tmp, matA);
-  }
-  else {
-    gcg = new GridCg<ApplyMatrix2D>(out, rhs, residual, search, flags, tmp, matA);
-  }
+  if (flags.is3D())
+    gcg = new GridCg<ApplyMatrix>(out, rhs, residual, search, flags, tmp, &A0, &Ai, &Aj, &Ak);
+  else
+    gcg = new GridCg<ApplyMatrix2D>(out, rhs, residual, search, flags, tmp, &A0, &Ai, &Aj, &Ak);
 
   gcg->setAccuracy(cgAccuracy);
 
   // no preconditioning for now...
+  Real time = 0;
   for (int iter = 0; iter < maxIter; iter++) {
-    if (!gcg->iterate())
+    if (!gcg->iterate(time))
       iter = maxIter;
   }
   debMsg("cgSolveWaveEq iterations:" << gcg->getIterations() << ", res:" << gcg->getSigma(), 1);
diff --git a/extern/mantaflow/preprocessed/registration.cpp b/extern/mantaflow/preprocessed/registration.cpp
index fd32463b95f..dfbd4074d23 100644
--- a/extern/mantaflow/preprocessed/registration.cpp
+++ b/extern/mantaflow/preprocessed/registration.cpp
@@ -89,7 +89,6 @@ extern void PbRegister_processBurn();
 extern void PbRegister_updateFlame();
 extern void PbRegister_getSpiralVelocity();
 extern void PbRegister_setGradientYWeight();
-extern void PbRegister_PD_fluid_guiding();
 extern void PbRegister_releaseBlurPrecomp();
 extern void PbRegister_KEpsilonComputeProduction();
 extern void PbRegister_KEpsilonSources();
@@ -145,7 +144,6 @@ extern void PbRegister_flipComputeSurfaceNormals();
 extern void PbRegister_flipUpdateNeighborRatio();
 extern void PbRegister_particleSurfaceTurbulence();
 extern void PbRegister_debugCheckParts();
-extern void PbRegister_applyViscosity();
 extern void PbRegister_markAsFixed();
 extern void PbRegister_texcoordInflow();
 extern void PbRegister_meshSmokeInflow();
@@ -287,7 +285,6 @@ void MantaEnsureRegistration()
   PbRegister_updateFlame();
   PbRegister_getSpiralVelocity();
   PbRegister_setGradientYWeight();
-  PbRegister_PD_fluid_guiding();
   PbRegister_releaseBlurPrecomp();
   PbRegister_KEpsilonComputeProduction();
   PbRegister_KEpsilonSources();
@@ -343,7 +340,6 @@ void MantaEnsureRegistration()
   PbRegister_flipUpdateNeighborRatio();
   PbRegister_particleSurfaceTurbulence();
   PbRegister_debugCheckParts();
-  PbRegister_applyViscosity();
   PbRegister_markAsFixed();
   PbRegister_texcoordInflow();
   PbRegister_meshSmokeInflow();
diff --git a/extern/mantaflow/preprocessed/shapes.cpp b/extern/mantaflow/preprocessed/shapes.cpp
index 4095758cbc0..546d39a7ed9 100644
--- a/extern/mantaflow/preprocessed/shapes.cpp
+++ b/extern/mantaflow/preprocessed/shapes.cpp
@@ -52,8 +52,7 @@ template<class T> struct ApplyShapeToGrid : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(
-      int i, int j, int k, Grid<T> *grid, Shape *shape, T value, FlagGrid *respectFlags) const
+  inline void op(int i, int j, int k, Grid<T> *grid, Shape *shape, T value, FlagGrid *respectFlags)
   {
     if (respectFlags && respectFlags->isObstacle(i, j, k))
       return;
@@ -80,37 +79,35 @@ template<class T> struct ApplyShapeToGrid : public KernelBase {
     return respectFlags;
   }
   typedef FlagGrid type3;
-  void runMessage()
-  {
-    debMsg("Executing kernel ApplyShapeToGrid ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};  // no-op: emits no message
+  void run()  // calls op() for each (i, j, k) of the loop ranges below, workshared with OpenMP
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, grid, shape, value, respectFlags);
+      // maxZ > 1: iterations k in [minZ, maxZ) are divided among the threads of the team.
+#pragma omp parallel
+      {
+        // Only the outer k loop is workshared; j and i (both from 0) run serially per k.
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, grid, shape, value, respectFlags);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, grid, shape, value, respectFlags);
+#pragma omp parallel
+      {
+        // k is fixed at 0 here, so the rows j in [0, _maxY) are divided among the threads.
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, grid, shape, value, respectFlags);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   Grid<T> *grid;
   Shape *shape;
   T value;
@@ -141,7 +138,7 @@ template<class T> struct ApplyShapeToGridSmooth : public KernelBase {
                  Real sigma,
                  Real shift,
                  T value,
-                 FlagGrid *respectFlags) const
+                 FlagGrid *respectFlags)
   {
     if (respectFlags && respectFlags->isObstacle(i, j, k))
       return;
@@ -181,37 +178,35 @@ template<class T> struct ApplyShapeToGridSmooth : public KernelBase {
     return respectFlags;
   }
   typedef FlagGrid type5;
-  void runMessage()
-  {
-    debMsg("Executing kernel ApplyShapeToGridSmooth ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};  // no-op: emits no message
+  void run()  // calls op() for each (i, j, k) of the loop ranges below, workshared with OpenMP
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, grid, phi, sigma, shift, value, respectFlags);
+      // maxZ > 1: iterations k in [minZ, maxZ) are divided among the threads of the team.
+#pragma omp parallel
+      {
+        // Only the outer k loop is workshared; j and i (both from 0) run serially per k.
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, grid, phi, sigma, shift, value, respectFlags);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, grid, phi, sigma, shift, value, respectFlags);
+#pragma omp parallel
+      {
+        // k is fixed at 0 here, so the rows j in [0, _maxY) are divided among the threads.
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, grid, phi, sigma, shift, value, respectFlags);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   Grid<T> *grid;
   Grid<Real> &phi;
   Real sigma;
@@ -230,7 +225,7 @@ struct ApplyShapeToMACGrid : public KernelBase {
     run();
   }
   inline void op(
-      int i, int j, int k, MACGrid *grid, Shape *shape, Vec3 value, FlagGrid *respectFlags) const
+      int i, int j, int k, MACGrid *grid, Shape *shape, Vec3 value, FlagGrid *respectFlags)
   {
     if (respectFlags && respectFlags->isObstacle(i, j, k))
       return;
@@ -261,37 +256,35 @@ struct ApplyShapeToMACGrid : public KernelBase {
     return respectFlags;
   }
   typedef FlagGrid type3;
-  void runMessage()
-  {
-    debMsg("Executing kernel ApplyShapeToMACGrid ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};  // no-op: emits no message
+  void run()  // calls op() for each (i, j, k) of the loop ranges below, workshared with OpenMP
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, grid, shape, value, respectFlags);
+      // maxZ > 1: iterations k in [minZ, maxZ) are divided among the threads of the team.
+#pragma omp parallel
+      {
+        // Only the outer k loop is workshared; j and i (both from 0) run serially per k.
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, grid, shape, value, respectFlags);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, grid, shape, value, respectFlags);
+#pragma omp parallel
+      {
+        // k is fixed at 0 here, so the rows j in [0, _maxY) are divided among the threads.
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, grid, shape, value, respectFlags);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   MACGrid *grid;
   Shape *shape;
   Vec3 value;
@@ -429,7 +422,7 @@ struct BoxSDF : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, Grid<Real> &phi, const Vec3 &p1, const Vec3 &p2) const
+  inline void op(int i, int j, int k, Grid<Real> &phi, const Vec3 &p1, const Vec3 &p2)
   {
     const Vec3 p(i + 0.5, j + 0.5, k + 0.5);
     if (p.x <= p2.x && p.x >= p1.x && p.y <= p2.y && p.y >= p1.y && p.z <= p2.z && p.z >= p1.z) {
@@ -505,37 +498,35 @@ struct BoxSDF : public KernelBase {
     return p2;
   }
   typedef Vec3 type2;
-  void runMessage()
-  {
-    debMsg("Executing kernel BoxSDF ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};  // no-op: emits no message
+  void run()  // calls op() for each (i, j, k) of the loop ranges below, workshared with OpenMP
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, phi, p1, p2);
+      // maxZ > 1: iterations k in [minZ, maxZ) are divided among the threads of the team.
+#pragma omp parallel
+      {
+        // Only the outer k loop is workshared; j and i (both from 0) run serially per k.
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, phi, p1, p2);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, phi, p1, p2);
+#pragma omp parallel
+      {
+        // k is fixed at 0 here, so the rows j in [0, _maxY) are divided among the threads.
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, phi, p1, p2);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   Grid<Real> &phi;
   const Vec3 &p1;
   const Vec3 &p2;
@@ -647,7 +638,7 @@ struct SphereSDF : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i, int j, int k, Grid<Real> &phi, Vec3 center, Real radius, Vec3 scale) const
+  inline void op(int i, int j, int k, Grid<Real> &phi, Vec3 center, Real radius, Vec3 scale)
   {
     phi(i, j, k) = norm((Vec3(i + 0.5, j + 0.5, k + 0.5) - center) / scale) - radius;
   }
@@ -671,37 +662,35 @@ struct SphereSDF : public KernelBase {
     return scale;
   }
   typedef Vec3 type3;
-  void runMessage()
-  {
-    debMsg("Executing kernel SphereSDF ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};  // no-op: emits no message
+  void run()  // calls op() for each (i, j, k) of the loop ranges below, workshared with OpenMP
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, phi, center, radius, scale);
+      // maxZ > 1: iterations k in [minZ, maxZ) are divided among the threads of the team.
+#pragma omp parallel
+      {
+        // Only the outer k loop is workshared; j and i (both from 0) run serially per k.
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, phi, center, radius, scale);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, phi, center, radius, scale);
+#pragma omp parallel
+      {
+        // k is fixed at 0 here, so the rows j in [0, _maxY) are divided among the threads.
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, phi, center, radius, scale);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   Grid<Real> &phi;
   Vec3 center;
   Real radius;
@@ -774,7 +763,7 @@ struct CylinderSDF : public KernelBase {
     run();
   }
   inline void op(
-      int i, int j, int k, Grid<Real> &phi, Vec3 center, Real radius, Vec3 zaxis, Real maxz) const
+      int i, int j, int k, Grid<Real> &phi, Vec3 center, Real radius, Vec3 zaxis, Real maxz)
   {
     Vec3 p = Vec3(i + 0.5, j + 0.5, k + 0.5) - center;
     Real z = fabs(dot(p, zaxis));
@@ -820,37 +809,35 @@ struct CylinderSDF : public KernelBase {
     return maxz;
   }
   typedef Real type4;
-  void runMessage()
-  {
-    debMsg("Executing kernel CylinderSDF ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};  // no-op: emits no message
+  void run()  // calls op() for each (i, j, k) of the loop ranges below, workshared with OpenMP
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, phi, center, radius, zaxis, maxz);
+      // maxZ > 1: iterations k in [minZ, maxZ) are divided among the threads of the team.
+#pragma omp parallel
+      {
+        // Only the outer k loop is workshared; j and i (both from 0) run serially per k.
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, phi, center, radius, zaxis, maxz);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, phi, center, radius, zaxis, maxz);
+#pragma omp parallel
+      {
+        // k is fixed at 0 here, so the rows j in [0, _maxY) are divided among the threads.
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, phi, center, radius, zaxis, maxz);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   Grid<Real> &phi;
   Vec3 center;
   Real radius;
@@ -920,13 +907,8 @@ struct SlopeSDF : public KernelBase {
     runMessage();
     run();
   }
-  inline void op(int i,
-                 int j,
-                 int k,
-                 const Vec3 &n,
-                 Grid<Real> &phiObs,
-                 const Real &fac,
-                 const Real &origin) const
+  inline void op(
+      int i, int j, int k, const Vec3 &n, Grid<Real> &phiObs, const Real &fac, const Real &origin)
   {
 
     phiObs(i, j, k) = (n.x * (double)i + n.y * (double)j + n.z * (double)k - origin) * fac;
@@ -951,37 +933,35 @@ struct SlopeSDF : public KernelBase {
     return origin;
   }
   typedef Real type3;
-  void runMessage()
-  {
-    debMsg("Executing kernel SlopeSDF ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
+  void runMessage(){};  // no-op: emits no message
+  void run()  // calls op() for each (i, j, k) of the loop ranges below, workshared with OpenMP
   {
     const int _maxX = maxX;
     const int _maxY = maxY;
     if (maxZ > 1) {
-      for (int k = __r.begin(); k != (int)__r.end(); k++)
-        for (int j = 0; j < _maxY; j++)
-          for (int i = 0; i < _maxX; i++)
-            op(i, j, k, n, phiObs, fac, origin);
+      // maxZ > 1: iterations k in [minZ, maxZ) are divided among the threads of the team.
+#pragma omp parallel
+      {
+        // Only the outer k loop is workshared; j and i (both from 0) run serially per k.
+#pragma omp for
+        for (int k = minZ; k < maxZ; k++)
+          for (int j = 0; j < _maxY; j++)
+            for (int i = 0; i < _maxX; i++)
+              op(i, j, k, n, phiObs, fac, origin);
+      }
     }
     else {
       const int k = 0;
-      for (int j = __r.begin(); j != (int)__r.end(); j++)
-        for (int i = 0; i < _maxX; i++)
-          op(i, j, k, n, phiObs, fac, origin);
+#pragma omp parallel
+      {
+        // k is fixed at 0 here, so the rows j in [0, _maxY) are divided among the threads.
+#pragma omp for
+        for (int j = 0; j < _maxY; j++)
+          for (int i = 0; i < _maxX; i++)
+            op(i, j, k, n, phiObs, fac, origin);
+      }
     }
   }
-  void run()
-  {
-    if (maxZ > 1)
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
-    else
-      tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
-  }
   const Vec3 &n;
   Grid<Real> &phiObs;
   const Real &fac;
diff --git a/extern/mantaflow/preprocessed/shapes.h b/extern/mantaflow/preprocessed/shapes.h
index 5a400eaed09..fa645389bfe 100644
--- a/extern/mantaflow/preprocessed/shapes.h
+++ b/extern/mantaflow/preprocessed/shapes.h
@@ -269,7 +269,6 @@ class Shape : public PbClass {
 
  protected:
   GridType mType;
-
  public:
   PbArgs _args;
 }
@@ -320,7 +319,6 @@ class NullShape : public Shape {
   {
     gridSetConst<Real>(phi, 1000.0f);
   }
-
  public:
   PbArgs _args;
 }
@@ -396,7 +394,6 @@ class Box : public Shape {
 
  protected:
   Vec3 mP0, mP1;
-
  public:
   PbArgs _args;
 }
@@ -458,7 +455,6 @@ class Sphere : public Shape {
  protected:
   Vec3 mCenter, mScale;
   Real mRadius;
-
  public:
   PbArgs _args;
 }
@@ -583,7 +579,6 @@ class Cylinder : public Shape {
  protected:
   Vec3 mCenter, mZDir;
   Real mRadius, mZ;
-
  public:
   PbArgs _args;
 }
@@ -660,7 +655,6 @@ class Slope : public Shape {
   Real mAnglexy, mAngleyz;
   Real mOrigin;
   Vec3 mGs;
-
  public:
   PbArgs _args;
 }
diff --git a/extern/mantaflow/preprocessed/test.cpp b/extern/mantaflow/preprocessed/test.cpp
index b90c886efe7..ae9b533a7ba 100644
--- a/extern/mantaflow/preprocessed/test.cpp
+++ b/extern/mantaflow/preprocessed/test.cpp
@@ -50,28 +50,21 @@ struct reductionTest : public KernelBase {
     return v;
   }
   typedef Grid<Real> type0;
-  void runMessage()
-  {
-    debMsg("Executing kernel reductionTest ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, v, sum);
-  }
+  void runMessage(){};  // no-op: emits no message
   void run()
   {
-    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  reductionTest(reductionTest &o, tbb::split) : KernelBase(o), v(o.v), sum(0)
-  {
-  }
-  void join(const reductionTest &o)
-  {
-    sum += o.sum;
+    const IndexInt _sz = size;  // loop bound for the flat index range [0, size)
+#pragma omp parallel
+    {
+      double sum = 0;  // thread-private partial; shadows the member this->sum
+#pragma omp for nowait
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, v, sum);  // op receives the private partial, not the member
+#pragma omp critical
+      {
+        this->sum += sum;  // merge into the member, one thread at a time
+      }
+    }
   }
   const Grid<Real> &v;
   double sum;
@@ -101,28 +94,21 @@ struct minReduction : public KernelBase {
     return v;
   }
   typedef Grid<Real> type0;
-  void runMessage()
-  {
-    debMsg("Executing kernel minReduction ", 3);
-    debMsg("Kernel range"
-               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r)
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, v, sum);
-  }
+  void runMessage(){};  // no-op: emits no message
   void run()
   {
-    tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  minReduction(minReduction &o, tbb::split) : KernelBase(o), v(o.v), sum(0)
-  {
-  }
-  void join(const minReduction &o)
-  {
-    sum = min(sum, o.sum);
+    const IndexInt _sz = size;  // loop bound for the flat index range [0, size)
+#pragma omp parallel
+    {
+      double sum = 0;  // thread-private partial; NOTE(review): 0 is not neutral for min() - confirm
+#pragma omp for nowait
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, v, sum);  // op receives the private partial, not the member
+#pragma omp critical
+      {
+        this->sum = min(sum, this->sum);  // keep the smaller of member and partial, serialized
+      }
+    }
   }
   const Grid<Real> &v;
   double sum;
diff --git a/extern/mantaflow/preprocessed/turbulencepart.cpp b/extern/mantaflow/preprocessed/turbulencepart.cpp
index 168ae9cc2f2..0dddf8f9cc6 100644
--- a/extern/mantaflow/preprocessed/turbulencepart.cpp
+++ b/extern/mantaflow/preprocessed/turbulencepart.cpp
@@ -136,7 +136,7 @@ struct KnSynthesizeTurbulence : public KernelBase {
                  int octaves,
                  Real scale,
                  Real invL0,
-                 Real kmin) const
+                 Real kmin)
   {
     const Real PERSISTENCE = 0.56123f;
 
@@ -217,21 +217,17 @@ struct KnSynthesizeTurbulence : public KernelBase {
     return kmin;
   }
   typedef Real type9;
-  void runMessage()
-  {
-    debMsg("Executing kernel KnSynthesizeTurbulence ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, p, flags, noise, kGrid, alpha, dt, octaves, scale, invL0, kmin);
-  }
+  void runMessage(){};  // no-op: emits no message
   void run()
   {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
+    const IndexInt _sz = size;  // loop bound for the flat index range [0, size)
+#pragma omp parallel
+    {
+      // Indices i in [0, _sz) are divided among the threads; op() is called once per index.
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, p, flags, noise, kGrid, alpha, dt, octaves, scale, invL0, kmin);
+    }
   }
   TurbulenceParticleSystem &p;
   FlagGrid &flags;
diff --git a/extern/mantaflow/preprocessed/turbulencepart.h b/extern/mantaflow/preprocessed/turbulencepart.h
index 5177aeb2d96..81c94d77722 100644
--- a/extern/mantaflow/preprocessed/turbulencepart.h
+++ b/extern/mantaflow/preprocessed/turbulencepart.h
@@ -199,7 +199,6 @@ class TurbulenceParticleSystem : public ParticleSystem<TurbulenceParticleData> {
 
  private:
   WaveletNoiseField &noise;
-
  public:
   PbArgs _args;
 }
diff --git a/extern/mantaflow/preprocessed/vortexpart.cpp b/extern/mantaflow/preprocessed/vortexpart.cpp
index 0eba2743ee8..db22ff85208 100644
--- a/extern/mantaflow/preprocessed/vortexpart.cpp
+++ b/extern/mantaflow/preprocessed/vortexpart.cpp
@@ -60,56 +60,24 @@ inline Vec3 VortexKernel(const Vec3 &p, const vector<VortexParticleData> &vp, Re
   return u;
 }
 
-struct _KnVpAdvectMesh : public KernelBase {
-  _KnVpAdvectMesh(const KernelBase &base,
-                  vector<Node> &nodes,
-                  const vector<VortexParticleData> &vp,
-                  Real scale,
-                  vector<Vec3> &u)
-      : KernelBase(base), nodes(nodes), vp(vp), scale(scale), u(u)
+struct KnVpAdvectMesh : public KernelBase {
+  KnVpAdvectMesh(vector<Node> &nodes, const vector<VortexParticleData> &vp, Real scale)
+      : KernelBase(nodes.size()), nodes(nodes), vp(vp), scale(scale), u((size))
   {
+    runMessage();
+    run();
   }
   inline void op(IndexInt idx,
                  vector<Node> &nodes,
                  const vector<VortexParticleData> &vp,
                  Real scale,
-                 vector<Vec3> &u) const
+                 vector<Vec3> &u)
   {
     if (nodes[idx].flags & Mesh::NfFixed)
       u[idx] = 0.0;
     else
       u[idx] = VortexKernel(nodes[idx].pos, vp, scale);
   }
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, nodes, vp, scale, u);
-  }
-  void run()
-  {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  vector<Node> &nodes;
-  const vector<VortexParticleData> &vp;
-  Real scale;
-  vector<Vec3> &u;
-};
-struct KnVpAdvectMesh : public KernelBase {
-  KnVpAdvectMesh(vector<Node> &nodes, const vector<VortexParticleData> &vp, Real scale)
-      : KernelBase(nodes.size()),
-        _inner(KernelBase(nodes.size()), nodes, vp, scale, u),
-        nodes(nodes),
-        vp(vp),
-        scale(scale),
-        u((size))
-  {
-    runMessage();
-    run();
-  }
-  void run()
-  {
-    _inner.run();
-  }
   inline operator vector<Vec3>()
   {
     return u;
@@ -133,62 +101,37 @@ struct KnVpAdvectMesh : public KernelBase {
     return scale;
   }
   typedef Real type2;
-  void runMessage()
+  void runMessage(){};
+  void run()  // Calls op() once per index in [0, size); size was set via KernelBase(nodes.size()).
   {
-    debMsg("Executing kernel KnVpAdvectMesh ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  _KnVpAdvectMesh _inner;
+    const IndexInt _sz = size;  // local copy of the kernel range, used as the loop bound
+#pragma omp parallel
+    {
+      // Work-shared loop inside the parallel region; no `omp target` construct is used here.
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, nodes, vp, scale, u);  // op() assigns only u[i] for the index it is given
+    }
+  }
   vector<Node> &nodes;
   const vector<VortexParticleData> &vp;
   Real scale;
   vector<Vec3> u;
 };
 
-struct _KnVpAdvectSelf : public KernelBase {
-  _KnVpAdvectSelf(const KernelBase &base,
-                  vector<VortexParticleData> &vp,
-                  Real scale,
-                  vector<Vec3> &u)
-      : KernelBase(base), vp(vp), scale(scale), u(u)
-  {
-  }
-  inline void op(IndexInt idx, vector<VortexParticleData> &vp, Real scale, vector<Vec3> &u) const
-  {
-    if (vp[idx].flag & ParticleBase::PDELETE)
-      u[idx] = 0.0;
-    else
-      u[idx] = VortexKernel(vp[idx].pos, vp, scale);
-  }
-  void operator()(const tbb::blocked_range<IndexInt> &__r) const
-  {
-    for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
-      op(idx, vp, scale, u);
-  }
-  void run()
-  {
-    tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
-  }
-  vector<VortexParticleData> &vp;
-  Real scale;
-  vector<Vec3> &u;
-};
 struct KnVpAdvectSelf : public KernelBase {
   KnVpAdvectSelf(vector<VortexParticleData> &vp, Real scale)
-      : KernelBase(vp.size()),
-        _inner(KernelBase(vp.size()), vp, scale, u),
-        vp(vp),
-        scale(scale),
-        u((size))
+      : KernelBase(vp.size()), vp(vp), scale(scale), u((size))
   {
     runMessage();
     run();
   }
-  void run()
+  inline void op(IndexInt idx, vector<VortexParticleData> &vp, Real scale, vector<Vec3> &u)  // per-index kernel body: fills u[idx]
   {
-    _inner.run();
+    if (vp[idx].flag & ParticleBase::PDELETE)  // particle has the PDELETE bit set
+      u[idx] = 0.0;  // flagged particles get a zero entry
+    else
+      u[idx] = VortexKernel(vp[idx].pos, vp, scale);  // evaluate VortexKernel at this particle's own position
   }
   inline operator vector<Vec3>()
   {
@@ -208,14 +151,18 @@ struct KnVpAdvectSelf : public KernelBase {
     return scale;
   }
   typedef Real type1;
-  void runMessage()
+  void runMessage(){};
+  void run()  // Calls op() once per index in [0, size); size was set via KernelBase(vp.size()).
   {
-    debMsg("Executing kernel KnVpAdvectSelf ", 3);
-    debMsg("Kernel range"
-               << " size " << size << " ",
-           4);
-  };
-  _KnVpAdvectSelf _inner;
+    const IndexInt _sz = size;  // local copy of the kernel range, used as the loop bound
+#pragma omp parallel
+    {
+      // Work-shared loop inside the parallel region; no `omp target` construct is used here.
+#pragma omp for
+      for (IndexInt i = 0; i < _sz; i++)
+        op(i, vp, scale, u);  // op() reads vp and assigns only u[i]
+    }
+  }
   vector<VortexParticleData> &vp;
   Real scale;
   vector<Vec3> u;
diff --git a/extern/mantaflow/preprocessed/vortexpart.h b/extern/mantaflow/preprocessed/vortexpart.h
index 8f80cf910eb..e48fbc7f507 100644
--- a/extern/mantaflow/preprocessed/vortexpart.h
+++ b/extern/mantaflow/preprocessed/vortexpart.h
@@ -127,7 +127,6 @@ class VortexParticleSystem : public ParticleSystem<VortexParticleData> {
   }
 
   virtual ParticleBase *clone();
-
  public:
   PbArgs _args;
 }
diff --git a/extern/mantaflow/preprocessed/vortexsheet.h b/extern/mantaflow/preprocessed/vortexsheet.h
index 0fc0f3a1258..01c32e4e806 100644
--- a/extern/mantaflow/preprocessed/vortexsheet.h
+++ b/extern/mantaflow/preprocessed/vortexsheet.h
@@ -240,7 +240,6 @@ class VortexSheetMesh : public Mesh {
   VorticityChannel mVorticity;
   TexCoord3Channel mTex1, mTex2;
   TurbulenceChannel mTurb;
-
  public:
   PbArgs _args;
 }
diff --git a/intern/mantaflow/intern/MANTA_main.cpp b/intern/mantaflow/intern/MANTA_main.cpp
index 6c8e45ceeb4..0959e017352 100644
--- a/intern/mantaflow/intern/MANTA_main.cpp
+++ b/intern/mantaflow/intern/MANTA_main.cpp
@@ -59,8 +59,8 @@ int MANTA::with_debug(0);
 MANTA::MANTA(int *res, FluidModifierData *fmd)
     : mCurrentID(++solverID), mMaxRes(fmd->domain->maxres)
 {
-  if (with_debug)
-    cout << "FLUID: " << mCurrentID << " with res(" << res[0] << ", " << res[1] << ", " << res[2]
+  //if (with_debug)
+    cout << "============= FLUID: " << mCurrentID << " with res(" << res[0] << ", " << res[1] << ", " << res[2]
          << ")" << endl;
 
   FluidDomainSettings *fds = fmd->domain;
@@ -279,6 +279,7 @@ MANTA::MANTA(int *res, FluidModifierData *fmd)
   }
   /* All requested initializations must not fail in constructor. */
   BLI_assert(initSuccess);
+  UNUSED_VARS(initSuccess);
   updatePointers(fmd);
 }
 
diff --git a/intern/mantaflow/intern/strings/fluid_script.h b/intern/mantaflow/intern/strings/fluid_script.h
index 3bf8e66c110..eb29b95affb 100644
--- a/intern/mantaflow/intern/strings/fluid_script.h
+++ b/intern/mantaflow/intern/strings/fluid_script.h
@@ -273,13 +273,32 @@ def fluid_adapt_time_step_$ID$():\n\
 const std::string fluid_alloc =
     "\n\
 mantaMsg('Fluid alloc data')\n\
-flags_s$ID$       = s$ID$.create(FlagGrid, name='$NAME_FLAGS$')\n\
-vel_s$ID$         = s$ID$.create(MACGrid, name='$NAME_VELOCITY$', sparse=True)\n\
+flags_s$ID$       = s$ID$.create(FlagGrid, name='$NAME_FLAGS$', offload=True)\n\
+mantaMsg('Fluid alloc data vel')\n\
+vel_s$ID$         = s$ID$.create(MACGrid, name='$NAME_VELOCITY$', sparse=True, offload=True)\n\
 velTmp_s$ID$      = s$ID$.create(MACGrid, name='$NAME_VELOCITYTMP$', sparse=True)\n\
 x_vel_s$ID$       = s$ID$.create(RealGrid, name='$NAME_VELOCITY_X$')\n\
 y_vel_s$ID$       = s$ID$.create(RealGrid, name='$NAME_VELOCITY_Y$')\n\
 z_vel_s$ID$       = s$ID$.create(RealGrid, name='$NAME_VELOCITY_Z$')\n\
-pressure_s$ID$    = s$ID$.create(RealGrid, name='$NAME_PRESSURE$')\n\
+mantaMsg('Fluid alloc data pressure')\n\
+pressure_s$ID$    = s$ID$.create(RealGrid, name='$NAME_PRESSURE$', offload=True)\n\
+mantaMsg('Fluid alloc data rhs')\n\
+rhs_s$ID$         = s$ID$.create(RealGrid, offload=True)\n\
+mantaMsg('Fluid alloc data A0')\n\
+A0_s$ID$          = s$ID$.create(RealGrid, offload=True)\n\
+mantaMsg('Fluid alloc data Ai')\n\
+Ai_s$ID$          = s$ID$.create(RealGrid, offload=True)\n\
+mantaMsg('Fluid alloc data Aj')\n\
+Aj_s$ID$          = s$ID$.create(RealGrid, offload=True)\n\
+mantaMsg('Fluid alloc data Ak')\n\
+Ak_s$ID$          = s$ID$.create(RealGrid, offload=True)\n\
+mantaMsg('Fluid alloc data search')\n\
+search_s$ID$      = s$ID$.create(RealGrid, offload=True)\n\
+mantaMsg('Fluid alloc data residual')\n\
+residual_s$ID$    = s$ID$.create(RealGrid, offload=True)\n\
+mantaMsg('Fluid alloc data tmp')\n\
+tmp_s$ID$         = s$ID$.create(RealGrid, offload=True)\n\
+mantaMsg('Fluid alloc data 6')\n\
 phiObs_s$ID$      = s$ID$.create(LevelsetGrid, name='$NAME_PHIOBS$')\n\
 phiSIn_s$ID$      = s$ID$.create(LevelsetGrid, name='$NAME_PHISIN$') # helper for static flow objects\n\
 phiIn_s$ID$       = s$ID$.create(LevelsetGrid, name='$NAME_PHIIN$')\n\
@@ -298,7 +317,8 @@ phiOut_s$ID$.setConst(9999)\n\
 \n\
 # Keep track of important objects in dict to load them later on\n\
 fluid_data_dict_final_s$ID$  = { 'vel' : vel_s$ID$ }\n\
-fluid_data_dict_resume_s$ID$ = { 'phiObs' : phiObs_s$ID$, 'phiIn' : phiIn_s$ID$, 'phiOut' : phiOut_s$ID$, 'flags' : flags_s$ID$, 'velTmp' : velTmp_s$ID$ }\n";
+fluid_data_dict_resume_s$ID$ = { 'phiObs' : phiObs_s$ID$, 'phiIn' : phiIn_s$ID$, 'phiOut' : phiOut_s$ID$, 'flags' : flags_s$ID$, 'velTmp' : velTmp_s$ID$ }\n\
+mantaMsg('Fluid alloc DONE')\n";
 
 const std::string fluid_alloc_obstacle =
     "\n\
@@ -477,7 +497,8 @@ mantaMsg('Delete guiding solver')\n\
 if 'sg$ID$' in globals(): del sg$ID$\n\
 \n\
 # Release unreferenced memory (if there is some left)\n\
-gc.collect()\n";
+gc.collect()\n\
+mantaMsg('Done deleting')\n";
 
 //////////////////////////////////////////////////////////////////////
 // BAKE
diff --git a/intern/mantaflow/intern/strings/liquid_script.h b/intern/mantaflow/intern/strings/liquid_script.h
index c44727bd47e..1773d4869aa 100644
--- a/intern/mantaflow/intern/strings/liquid_script.h
+++ b/intern/mantaflow/intern/strings/liquid_script.h
@@ -274,9 +274,14 @@ def liquid_step_$ID$():\n\
         velTmp_s$ID$.copyFrom(vel_s$ID$)\n\
     \n\
     mantaMsg('Advecting phi')\n\
+    #phi_s$ID$.updateToOmp()\n\
+    #vel_s$ID$.updateToOmp()\n\
     advectSemiLagrange(flags=flags_s$ID$, vel=vel_s$ID$, grid=phi_s$ID$, order=1) # first order is usually enough\n\
+    \n\
     mantaMsg('Advecting velocity')\n\
     advectSemiLagrange(flags=flags_s$ID$, vel=vel_s$ID$, grid=vel_s$ID$, order=2)\n\
+    #phi_s$ID$.updateFromOmp()\n\
+    #vel_s$ID$.updateFromOmp()\n\
     \n\
     # create level set of particles\n\
     gridParticleIndex(parts=pp_s$ID$, flags=flags_s$ID$, indexSys=pindex_s$ID$, index=gpi_s$ID$)\n\
@@ -323,21 +328,45 @@ def liquid_step_$ID$():\n\
         getLaplacian(laplacian=curvature_s$ID$, grid=phi_s$ID$)\n\
         curvature_s$ID$.clamp(-1.0, 1.0)\n\
     \n\
+    #vel_s$ID$.updateToOmp()\n\
     setWallBcs(flags=flags_s$ID$, vel=vel_s$ID$, obvel=None if using_fractions_s$ID$ else obvel_s$ID$, phiObs=phiObs_s$ID$, fractions=fractions_s$ID$)\n\
+    #vel_s$ID$.updateFromOmp()\n\
+    \n\
     if using_viscosity_s$ID$:\n\
         viscosity_s$ID$.setConst(viscosityValue_s$ID$)\n\
         applyViscosity(flags=flags_s$ID$, phi=phi_s$ID$, vel=vel_s$ID$, volumes=volumes_s$ID$, viscosity=viscosity_s$ID$)\n\
     \n\
+    #vel_s$ID$.updateToOmp()\n\
     setWallBcs(flags=flags_s$ID$, vel=vel_s$ID$, obvel=None if using_fractions_s$ID$ else obvel_s$ID$, phiObs=phiObs_s$ID$, fractions=fractions_s$ID$)\n\
+    #vel_s$ID$.updateFromOmp()\n\
+    \n\
     if using_guiding_s$ID$:\n\
         mantaMsg('Guiding and pressure')\n\
         PD_fluid_guiding(vel=vel_s$ID$, velT=velT_s$ID$, flags=flags_s$ID$, phi=phi_s$ID$, curv=curvature_s$ID$, surfTens=surfaceTension_s$ID$, fractions=fractions_s$ID$, weight=weightGuide_s$ID$, blurRadius=beta_sg$ID$, pressure=pressure_s$ID$, tau=tau_sg$ID$, sigma=sigma_sg$ID$, theta=theta_sg$ID$, zeroPressureFixing=domainClosed_s$ID$)\n\
     else:\n\
-        mantaMsg('Pressure')\n\
-        solvePressure(flags=flags_s$ID$, vel=vel_s$ID$, pressure=pressure_s$ID$, curv=curvature_s$ID$, surfTens=surfaceTension_s$ID$, fractions=fractions_s$ID$, obvel=obvel_s$ID$ if using_fractions_s$ID$ else None, zeroPressureFixing=domainClosed_s$ID$)\n\
+        print('Pressure')\n\
+        # openmp sync to device\n\
+        flags_s$ID$.updateToOmp()\n\
+        vel_s$ID$.updateToOmp()\n\
+        print('Pressure 2')\n\
+        \n\
+        #solvePressure(flags=flags_s$ID$, vel=vel_s$ID$, pressure=pressure_s$ID$, curv=curvature_s$ID$, surfTens=surfaceTension_s$ID$, fractions=fractions_s$ID$, obvel=obvel_s$ID$ if using_fractions_s$ID$ else None, zeroPressureFixing=domainClosed_s$ID$)\n\
+        computePressureRhs(rhs=rhs_s$ID$, vel=vel_s$ID$, pressure=pressure_s$ID$, flags=flags_s$ID$, preconditioner=PcNone)\n\
+        print('Pressure 21')\n\
+        solvePressureSystem(rhs=rhs_s$ID$, vel=vel_s$ID$, pressure=pressure_s$ID$, flags=flags_s$ID$, useL2Norm=True, preconditioner=PcNone, residual=residual_s$ID$, search=search_s$ID$, A0=A0_s$ID$, Ai=Ai_s$ID$, Aj=Aj_s$ID$, Ak=Ak_s$ID$, tmp=tmp_s$ID$)\n\
+        print('Pressure 22')\n\
+        correctVelocity(vel=vel_s$ID$, pressure=pressure_s$ID$, flags=flags_s$ID$, preconditioner=PcNone)\n\
+        \n\
+        print('Pressure 3')\n\
+        # openmp sync from device\n\
+        pressure_s$ID$.updateFromOmp()\n\
+        vel_s$ID$.updateFromOmp()\n\
     \n\
     extrapolateMACSimple(flags=flags_s$ID$, vel=vel_s$ID$, distance=4, intoObs=True if using_fractions_s$ID$ else False)\n\
+    \n\
+    #vel_s$ID$.updateToOmp()\n\
     setWallBcs(flags=flags_s$ID$, vel=vel_s$ID$, obvel=None if using_fractions_s$ID$ else obvel_s$ID$, phiObs=phiObs_s$ID$, fractions=fractions_s$ID$)\n\
+    #vel_s$ID$.updateFromOmp()\n\
     \n\
     if not using_fractions_s$ID$:\n\
         extrapolateMACSimple(flags=flags_s$ID$, vel=vel_s$ID$)\n\
diff --git a/source/blender/makesdna/DNA_fluid_defaults.h b/source/blender/makesdna/DNA_fluid_defaults.h
index 4135c4d40a8..ddb1248741e 100644
--- a/source/blender/makesdna/DNA_fluid_defaults.h
+++ b/source/blender/makesdna/DNA_fluid_defaults.h
@@ -76,7 +76,7 @@
     .adapt_margin = 4, \
     .adapt_res = 0, \
     .adapt_threshold = 0.02f, \
-    .maxres = 32, \
+    .maxres = 64, \
     .solver_res = 3, \
     .border_collisions = 0, \
     .flags = FLUID_DOMAIN_USE_DISSOLVE_LOG | FLUID_DOMAIN_USE_ADAPTIVE_TIME, \
diff --git a/source/creator/CMakeLists.txt b/source/creator/CMakeLists.txt
index 47fb2642da1..cf05bdca72f 100644
--- a/source/creator/CMakeLists.txt
+++ b/source/creator/CMakeLists.txt
@@ -688,6 +688,13 @@ if(UNIX AND NOT APPLE)
       DESTINATION ${TARGETDIR_VER}/python/lib/python${PYTHON_VERSION}/site-packages
     )
   endif()
+  if(WITH_MOD_FLUID)
+    # Install the built extern_mantaflow target file into the bundled Python site-packages directory.
+    install(
+      PROGRAMS $<TARGET_FILE:extern_mantaflow>
+      DESTINATION ${TARGETDIR_VER}/python/lib/python${PYTHON_VERSION}/site-packages
+    )
+  endif()
 elseif(WIN32)
 
   set(BLENDER_TEXT_FILES_DESTINATION ".")
author	Sebastián Barschkis <sebbas@sebbas.org>	2021-09-13 16:03:52 +0300
committer	Sebastián Barschkis <sebbas@sebbas.org>	2021-09-13 16:03:52 +0300
commit	063ce7f550f1612ab0e34c4ecb4b57f8401b84b4 (patch)
tree	53584b6c514510b0bab33a480b3ec85274b48a6b
parent	4b06420e65040c642d2b0a7a1c9bf7515d3cec0c (diff)