27 files changed, 1254 insertions, 637 deletions
diff --git a/source/blender/blenlib/intern/BLI_color.cc b/source/blender/blenlib/intern/BLI_color.cc
new file mode 100644
index 00000000000..6dcef4f4688
--- /dev/null
+++ b/source/blender/blenlib/intern/BLI_color.cc
@@ -0,0 +1,55 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include "BLI_color.hh"
+
+namespace blender {
+
+std::ostream &operator<<(std::ostream &stream, const eAlpha &space)
+{
+  switch (space) {
+    case eAlpha::Straight: {
+      stream << "Straight";
+      break;
+    }
+    case eAlpha::Premultiplied: {
+      stream << "Premultiplied";
+      break;
+    }
+  }
+  return stream;
+}
+
+std::ostream &operator<<(std::ostream &stream, const eSpace &space)
+{
+  switch (space) {
+    case eSpace::Theme: {
+      stream << "Theme";
+      break;
+    }
+    case eSpace::SceneLinear: {
+      stream << "SceneLinear";
+      break;
+    }
+    case eSpace::SceneLinearByteEncoded: {
+      stream << "SceneLinearByteEncoded";
+      break;
+    }
+  }
+  return stream;
+}
+
+}  // namespace blender
diff --git a/source/blender/blenlib/intern/BLI_dial_2d.c b/source/blender/blenlib/intern/BLI_dial_2d.c
index 7363233d573..786fdd219a3 100644
--- a/source/blender/blenlib/intern/BLI_dial_2d.c
+++ b/source/blender/blenlib/intern/BLI_dial_2d.c
@@ -78,7 +78,7 @@ float BLI_dial_angle(Dial *dial, const float current_position[2])
     cosval = dot_v2v2(current_direction, dial->initial_direction);
     sinval = cross_v2v2(current_direction, dial->initial_direction);
 
-    /* clamp to avoid nans in acos */
+    /* Clamp to avoid NAN's in #acos */
     angle = atan2f(sinval, cosval);
 
     /* change of sign, we passed the 180 degree threshold. This means we need to add a turn.
diff --git a/source/blender/blenlib/intern/BLI_mempool.c b/source/blender/blenlib/intern/BLI_mempool.c
index 75f934c1fb8..d3000ef38e8 100644
--- a/source/blender/blenlib/intern/BLI_mempool.c
+++ b/source/blender/blenlib/intern/BLI_mempool.c
@@ -36,7 +36,8 @@
 
 #include "BLI_utildefines.h"
 
-#include "BLI_mempool.h" /* own include */
+#include "BLI_mempool.h"         /* own include */
+#include "BLI_mempool_private.h" /* own include */
 
 #include "MEM_guardedalloc.h"
 
@@ -541,8 +542,12 @@ void BLI_mempool_iternew(BLI_mempool *pool, BLI_mempool_iter *iter)
   iter->pool = pool;
   iter->curchunk = pool->chunks;
   iter->curindex = 0;
+}
 
-  iter->curchunk_threaded_shared = NULL;
+static void mempool_threadsafe_iternew(BLI_mempool *pool, BLI_mempool_threadsafe_iter *ts_iter)
+{
+  BLI_mempool_iternew(pool, &ts_iter->iter);
+  ts_iter->curchunk_threaded_shared = NULL;
 }
 
 /**
@@ -558,33 +563,31 @@ void BLI_mempool_iternew(BLI_mempool *pool, BLI_mempool_iter *iter)
  *
  * See BLI_task_parallel_mempool implementation for detailed usage example.
  */
-BLI_mempool_iter *BLI_mempool_iter_threadsafe_create(BLI_mempool *pool, const size_t num_iter)
+ParallelMempoolTaskData *mempool_iter_threadsafe_create(BLI_mempool *pool, const size_t num_iter)
 {
   BLI_assert(pool->flag & BLI_MEMPOOL_ALLOW_ITER);
 
-  BLI_mempool_iter *iter_arr = MEM_mallocN(sizeof(*iter_arr) * num_iter, __func__);
+  ParallelMempoolTaskData *iter_arr = MEM_mallocN(sizeof(*iter_arr) * num_iter, __func__);
   BLI_mempool_chunk **curchunk_threaded_shared = MEM_mallocN(sizeof(void *), __func__);
 
-  BLI_mempool_iternew(pool, iter_arr);
-
-  *curchunk_threaded_shared = iter_arr->curchunk;
-  iter_arr->curchunk_threaded_shared = curchunk_threaded_shared;
+  mempool_threadsafe_iternew(pool, &iter_arr->ts_iter);
 
+  *curchunk_threaded_shared = iter_arr->ts_iter.iter.curchunk;
+  iter_arr->ts_iter.curchunk_threaded_shared = curchunk_threaded_shared;
   for (size_t i = 1; i < num_iter; i++) {
-    iter_arr[i] = iter_arr[0];
-    *curchunk_threaded_shared = iter_arr[i].curchunk = ((*curchunk_threaded_shared) ?
-                                                            (*curchunk_threaded_shared)->next :
-                                                            NULL);
+    iter_arr[i].ts_iter = iter_arr[0].ts_iter;
+    *curchunk_threaded_shared = iter_arr[i].ts_iter.iter.curchunk =
+        ((*curchunk_threaded_shared) ? (*curchunk_threaded_shared)->next : NULL);
   }
 
   return iter_arr;
 }
 
-void BLI_mempool_iter_threadsafe_free(BLI_mempool_iter *iter_arr)
+void mempool_iter_threadsafe_destroy(ParallelMempoolTaskData *iter_arr)
 {
-  BLI_assert(iter_arr->curchunk_threaded_shared != NULL);
+  BLI_assert(iter_arr->ts_iter.curchunk_threaded_shared != NULL);
 
-  MEM_freeN(iter_arr->curchunk_threaded_shared);
+  MEM_freeN(iter_arr->ts_iter.curchunk_threaded_shared);
   MEM_freeN(iter_arr);
 }
 
@@ -605,19 +608,6 @@ static void *bli_mempool_iternext(BLI_mempool_iter *iter)
 
   if (iter->curindex == iter->pool->pchunk) {
     iter->curindex = 0;
-    if (iter->curchunk_threaded_shared) {
-      while (1) {
-        iter->curchunk = *iter->curchunk_threaded_shared;
-        if (iter->curchunk == NULL) {
-          return ret;
-        }
-        if (atomic_cas_ptr((void **)iter->curchunk_threaded_shared,
-                           iter->curchunk,
-                           iter->curchunk->next) == iter->curchunk) {
-          break;
-        }
-      }
-    }
     iter->curchunk = iter->curchunk->next;
   }
 
@@ -659,19 +649,54 @@ void *BLI_mempool_iterstep(BLI_mempool_iter *iter)
     }
     else {
       iter->curindex = 0;
-      if (iter->curchunk_threaded_shared) {
-        for (iter->curchunk = *iter->curchunk_threaded_shared;
-             (iter->curchunk != NULL) && (atomic_cas_ptr((void **)iter->curchunk_threaded_shared,
-                                                         iter->curchunk,
-                                                         iter->curchunk->next) != iter->curchunk);
-             iter->curchunk = *iter->curchunk_threaded_shared) {
-          /* pass. */
-        }
-
-        if (UNLIKELY(iter->curchunk == NULL)) {
-          return (ret->freeword == FREEWORD) ? NULL : ret;
-        }
+      iter->curchunk = iter->curchunk->next;
+      if (UNLIKELY(iter->curchunk == NULL)) {
+        return (ret->freeword == FREEWORD) ? NULL : ret;
       }
+      curnode = CHUNK_DATA(iter->curchunk);
+    }
+  } while (ret->freeword == FREEWORD);
+
+  return ret;
+}
+
+/**
+ * A version of #BLI_mempool_iterstep that uses
+ * #BLI_mempool_threadsafe_iter.curchunk_threaded_shared for threaded iteration support.
+ * (threaded section noted in comments).
+ */
+void *mempool_iter_threadsafe_step(BLI_mempool_threadsafe_iter *ts_iter)
+{
+  BLI_mempool_iter *iter = &ts_iter->iter;
+  if (UNLIKELY(iter->curchunk == NULL)) {
+    return NULL;
+  }
+
+  const uint esize = iter->pool->esize;
+  BLI_freenode *curnode = POINTER_OFFSET(CHUNK_DATA(iter->curchunk), (esize * iter->curindex));
+  BLI_freenode *ret;
+  do {
+    ret = curnode;
+
+    if (++iter->curindex != iter->pool->pchunk) {
+      curnode = POINTER_OFFSET(curnode, esize);
+    }
+    else {
+      iter->curindex = 0;
+
+      /* Begin unique to the `threadsafe` version of this function. */
+      for (iter->curchunk = *ts_iter->curchunk_threaded_shared;
+           (iter->curchunk != NULL) && (atomic_cas_ptr((void **)ts_iter->curchunk_threaded_shared,
+                                                       iter->curchunk,
+                                                       iter->curchunk->next) != iter->curchunk);
+           iter->curchunk = *ts_iter->curchunk_threaded_shared) {
+        /* pass. */
+      }
+      if (UNLIKELY(iter->curchunk == NULL)) {
+        return (ret->freeword == FREEWORD) ? NULL : ret;
+      }
+      /* End `threadsafe` exception. */
+
       iter->curchunk = iter->curchunk->next;
       if (UNLIKELY(iter->curchunk == NULL)) {
         return (ret->freeword == FREEWORD) ? NULL : ret;
diff --git a/source/blender/blenlib/intern/BLI_mempool_private.h b/source/blender/blenlib/intern/BLI_mempool_private.h
new file mode 100644
index 00000000000..e1c8205c80c
--- /dev/null
+++ b/source/blender/blenlib/intern/BLI_mempool_private.h
@@ -0,0 +1,52 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * The Original Code is Copyright (C) 2008 Blender Foundation.
+ * All rights reserved.
+ */
+
+#pragma once
+
+/** \file
+ * \ingroup bli
+ *
+ * Shared logic for #BLI_task_parallel_mempool to create a threaded iterator,
+ * without exposing the these functions publicly.
+ */
+
+#include "BLI_compiler_attrs.h"
+
+#include "BLI_mempool.h"
+#include "BLI_task.h"
+
+typedef struct BLI_mempool_threadsafe_iter {
+  BLI_mempool_iter iter;
+  struct BLI_mempool_chunk **curchunk_threaded_shared;
+} BLI_mempool_threadsafe_iter;
+
+typedef struct ParallelMempoolTaskData {
+  BLI_mempool_threadsafe_iter ts_iter;
+  TaskParallelTLS tls;
+} ParallelMempoolTaskData;
+
+ParallelMempoolTaskData *mempool_iter_threadsafe_create(BLI_mempool *pool, const size_t num_iter)
+    ATTR_WARN_UNUSED_RESULT ATTR_NONNULL();
+void mempool_iter_threadsafe_destroy(ParallelMempoolTaskData *iter_arr) ATTR_NONNULL();
+
+void *mempool_iter_threadsafe_step(BLI_mempool_threadsafe_iter *iter);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/source/blender/blenlib/intern/array_utils.c b/source/blender/blenlib/intern/array_utils.c
index 57302f1ff81..25261e82cc9 100644
--- a/source/blender/blenlib/intern/array_utils.c
+++ b/source/blender/blenlib/intern/array_utils.c
@@ -337,10 +337,10 @@ bool _bli_array_iter_spiral_square(const void *arr_v,
              center[1] < arr_shape[1]);
 
   const char *arr = arr_v;
-  const int stride[2] = {arr_shape[1] * (int)elem_size, (int)elem_size};
+  const int stride[2] = {arr_shape[0] * (int)elem_size, (int)elem_size};
 
   /* Test center first. */
-  int ofs[2] = {center[0] * stride[0], center[1] * stride[1]};
+  int ofs[2] = {center[0] * stride[1], center[1] * stride[0]};
   if (test_fn(arr + ofs[0] + ofs[1], user_data)) {
     return true;
   }
diff --git a/source/blender/blenlib/intern/delaunay_2d.cc b/source/blender/blenlib/intern/delaunay_2d.cc
index 06a749ab921..9444d1a29cb 100644
--- a/source/blender/blenlib/intern/delaunay_2d.cc
+++ b/source/blender/blenlib/intern/delaunay_2d.cc
@@ -334,9 +334,6 @@ template<typename T> class CDT_state {
   T epsilon;
 
   explicit CDT_state(int num_input_verts, int num_input_edges, int num_input_faces, T epsilon);
-  ~CDT_state()
-  {
-  }
 };
 
 template<typename T> CDTArrangement<T>::~CDTArrangement()
diff --git a/source/blender/blenlib/intern/fileops.c b/source/blender/blenlib/intern/fileops.c
index 106bd5bc793..107c27da6a2 100644
--- a/source/blender/blenlib/intern/fileops.c
+++ b/source/blender/blenlib/intern/fileops.c
@@ -171,7 +171,7 @@ size_t BLI_gzip_mem_to_file_at_pos(
   z_stream strm;
   unsigned char out[CHUNK];
 
-  fseek(file, gz_stream_offset, 0);
+  BLI_fseek(file, gz_stream_offset, 0);
 
   strm.zalloc = Z_NULL;
   strm.zfree = Z_NULL;
@@ -217,7 +217,7 @@ size_t BLI_ungzip_file_to_mem_at_pos(void *buf, size_t len, FILE *file, size_t g
   size_t chunk = 256 * 1024;
   unsigned char in[CHUNK];
 
-  fseek(file, gz_stream_offset, 0);
+  BLI_fseek(file, gz_stream_offset, 0);
 
   strm.zalloc = Z_NULL;
   strm.zfree = Z_NULL;
diff --git a/source/blender/blenlib/intern/freetypefont.c b/source/blender/blenlib/intern/freetypefont.c
index e755a8e75b5..98da85e3a8c 100644
--- a/source/blender/blenlib/intern/freetypefont.c
+++ b/source/blender/blenlib/intern/freetypefont.c
@@ -135,7 +135,6 @@ static VChar *freetypechar_to_vchar(FT_Face face, FT_ULong charcode, VFontData *
       nu->type = CU_BEZIER;
       nu->pntsu = onpoints[j];
       nu->resolu = 8;
-      nu->flag = CU_2D;
       nu->flagu = CU_NURB_CYCLIC;
       nu->bezt = bezt;
 
diff --git a/source/blender/blenlib/intern/list_sort_impl.h b/source/blender/blenlib/intern/list_sort_impl.h
index 8f979ba5b0b..680044f9ccb 100644
--- a/source/blender/blenlib/intern/list_sort_impl.h
+++ b/source/blender/blenlib/intern/list_sort_impl.h
@@ -17,13 +17,13 @@
 /** \file
  * \ingroup bli
  *
- * Common implementation of linked-list a non-recursive mergesort.
+ * Common implementation of linked-list a non-recursive merge-sort.
  *
- * Originally from Mono's eglib, adapted for portable inclusion.
+ * Originally from Mono's `eglib`, adapted for portable inclusion.
  * This file is to be directly included in C-source,
  * with defines to control its use.
  *
- * This code requires a typedef named `SORT_IMPL_LINKTYPE` for the list node.
+ * This code requires a `typedef` named `SORT_IMPL_LINKTYPE` for the list node.
  * It is assumed that the list type is the type of a pointer to a list
  * node, and that the node has a field named 'next' that implements to
  * the linked list.  No additional invariant is maintained
diff --git a/source/blender/blenlib/intern/math_base_inline.c b/source/blender/blenlib/intern/math_base_inline.c
index 6481fac5a14..c1db9ec1a69 100644
--- a/source/blender/blenlib/intern/math_base_inline.c
+++ b/source/blender/blenlib/intern/math_base_inline.c
@@ -192,6 +192,13 @@ MINLINE double ratiod(double min, double max, double pos)
   return range == 0 ? 0 : ((pos - min) / range);
 }
 
+/* Map a normalized value, i.e. from interval [0, 1] to interval [a, b]  */
+MINLINE float scalenorm(float a, float b, float x)
+{
+  BLI_assert(x <= 1 && x >= 0);
+  return (x * (b - a)) + a;
+}
+
 /* used for zoom values*/
 MINLINE float power_of_2(float val)
 {
@@ -507,6 +514,15 @@ MINLINE int max_ii(int a, int b)
   return (b < a) ? a : b;
 }
 
+MINLINE uint min_uu(uint a, uint b)
+{
+  return (a < b) ? a : b;
+}
+MINLINE uint max_uu(uint a, uint b)
+{
+  return (b < a) ? a : b;
+}
+
 MINLINE float min_fff(float a, float b, float c)
 {
   return min_ff(min_ff(a, b), c);
diff --git a/source/blender/blenlib/intern/math_color.c b/source/blender/blenlib/intern/math_color.c
index 8fd2802a547..abcb3139dc7 100644
--- a/source/blender/blenlib/intern/math_color.c
+++ b/source/blender/blenlib/intern/math_color.c
@@ -713,3 +713,75 @@ void blackbody_temperature_to_rgb_table(float *r_table, int width, float min, fl
     r_table[i * 4 + 3] = 0.0f;
   }
 }
+
+/* ****************************** wavelength ******************************** */
+/* Wavelength to RGB. */
+
+/* CIE colour matching functions xBar, yBar, and zBar for
+ *   wavelengths from 380 through 780 nanometers, every 5
+ *   nanometers.
+ * For a wavelength lambda in this range:
+ *   cie_colour_match[(lambda - 380) / 5][0] = xBar
+ *   cie_colour_match[(lambda - 380) / 5][1] = yBar
+ *   cie_colour_match[(lambda - 380) / 5][2] = zBar */
+
+static float cie_colour_match[81][3] = {
+    {0.0014f, 0.0000f, 0.0065f}, {0.0022f, 0.0001f, 0.0105f}, {0.0042f, 0.0001f, 0.0201f},
+    {0.0076f, 0.0002f, 0.0362f}, {0.0143f, 0.0004f, 0.0679f}, {0.0232f, 0.0006f, 0.1102f},
+    {0.0435f, 0.0012f, 0.2074f}, {0.0776f, 0.0022f, 0.3713f}, {0.1344f, 0.0040f, 0.6456f},
+    {0.2148f, 0.0073f, 1.0391f}, {0.2839f, 0.0116f, 1.3856f}, {0.3285f, 0.0168f, 1.6230f},
+    {0.3483f, 0.0230f, 1.7471f}, {0.3481f, 0.0298f, 1.7826f}, {0.3362f, 0.0380f, 1.7721f},
+    {0.3187f, 0.0480f, 1.7441f}, {0.2908f, 0.0600f, 1.6692f}, {0.2511f, 0.0739f, 1.5281f},
+    {0.1954f, 0.0910f, 1.2876f}, {0.1421f, 0.1126f, 1.0419f}, {0.0956f, 0.1390f, 0.8130f},
+    {0.0580f, 0.1693f, 0.6162f}, {0.0320f, 0.2080f, 0.4652f}, {0.0147f, 0.2586f, 0.3533f},
+    {0.0049f, 0.3230f, 0.2720f}, {0.0024f, 0.4073f, 0.2123f}, {0.0093f, 0.5030f, 0.1582f},
+    {0.0291f, 0.6082f, 0.1117f}, {0.0633f, 0.7100f, 0.0782f}, {0.1096f, 0.7932f, 0.0573f},
+    {0.1655f, 0.8620f, 0.0422f}, {0.2257f, 0.9149f, 0.0298f}, {0.2904f, 0.9540f, 0.0203f},
+    {0.3597f, 0.9803f, 0.0134f}, {0.4334f, 0.9950f, 0.0087f}, {0.5121f, 1.0000f, 0.0057f},
+    {0.5945f, 0.9950f, 0.0039f}, {0.6784f, 0.9786f, 0.0027f}, {0.7621f, 0.9520f, 0.0021f},
+    {0.8425f, 0.9154f, 0.0018f}, {0.9163f, 0.8700f, 0.0017f}, {0.9786f, 0.8163f, 0.0014f},
+    {1.0263f, 0.7570f, 0.0011f}, {1.0567f, 0.6949f, 0.0010f}, {1.0622f, 0.6310f, 0.0008f},
+    {1.0456f, 0.5668f, 0.0006f}, {1.0026f, 0.5030f, 0.0003f}, {0.9384f, 0.4412f, 0.0002f},
+    {0.8544f, 0.3810f, 0.0002f}, {0.7514f, 0.3210f, 0.0001f}, {0.6424f, 0.2650f, 0.0000f},
+    {0.5419f, 0.2170f, 0.0000f}, {0.4479f, 0.1750f, 0.0000f}, {0.3608f, 0.1382f, 0.0000f},
+    {0.2835f, 0.1070f, 0.0000f}, {0.2187f, 0.0816f, 0.0000f}, {0.1649f, 0.0610f, 0.0000f},
+    {0.1212f, 0.0446f, 0.0000f}, {0.0874f, 0.0320f, 0.0000f}, {0.0636f, 0.0232f, 0.0000f},
+    {0.0468f, 0.0170f, 0.0000f}, {0.0329f, 0.0119f, 0.0000f}, {0.0227f, 0.0082f, 0.0000f},
+    {0.0158f, 0.0057f, 0.0000f}, {0.0114f, 0.0041f, 0.0000f}, {0.0081f, 0.0029f, 0.0000f},
+    {0.0058f, 0.0021f, 0.0000f}, {0.0041f, 0.0015f, 0.0000f}, {0.0029f, 0.0010f, 0.0000f},
+    {0.0020f, 0.0007f, 0.0000f}, {0.0014f, 0.0005f, 0.0000f}, {0.0010f, 0.0004f, 0.0000f},
+    {0.0007f, 0.0002f, 0.0000f}, {0.0005f, 0.0002f, 0.0000f}, {0.0003f, 0.0001f, 0.0000f},
+    {0.0002f, 0.0001f, 0.0000f}, {0.0002f, 0.0001f, 0.0000f}, {0.0001f, 0.0000f, 0.0000f},
+    {0.0001f, 0.0000f, 0.0000f}, {0.0001f, 0.0000f, 0.0000f}, {0.0000f, 0.0000f, 0.0000f}};
+
+static void wavelength_to_xyz(float xyz[3], float lambda_nm)
+{
+  float ii = (lambda_nm - 380.0f) * (1.0f / 5.0f); /* Scaled 0..80. */
+  int i = (int)ii;
+
+  if (i < 0 || i >= 80) {
+    xyz[0] = 0.0f;
+    xyz[1] = 0.0f;
+    xyz[2] = 0.0f;
+  }
+  else {
+    ii -= (float)i;
+    const float *c = cie_colour_match[i];
+    xyz[0] = c[0] + ii * (c[3] - c[0]);
+    xyz[1] = c[1] + ii * (c[4] - c[1]);
+    xyz[2] = c[2] + ii * (c[5] - c[2]);
+  }
+}
+
+void wavelength_to_xyz_table(float *r_table, int width)
+{
+  for (int i = 0; i < width; i++) {
+    float temperature = 380 + 400 / (float)width * (float)i;
+
+    float rgb[3];
+    wavelength_to_xyz(rgb, temperature);
+
+    copy_v3_v3(&r_table[i * 4], rgb);
+    r_table[i * 4 + 3] = 0.0f;
+  }
+}
diff --git a/source/blender/blenlib/intern/math_geom.c b/source/blender/blenlib/intern/math_geom.c
index 01cda6c9e4a..508de506ae8 100644
--- a/source/blender/blenlib/intern/math_geom.c
+++ b/source/blender/blenlib/intern/math_geom.c
@@ -2353,7 +2353,7 @@ bool isect_planes_v3_fn(
         for (i_test = 0; i_test < planes_len; i_test++) {
           const float *np_test = planes[i_test];
           if (((dot_v3v3(np_test, co_test) + np_test[3]) > eps_isect)) {
-            /* For low epsilon values the point could intersect it's own plane. */
+            /* For low epsilon values the point could intersect its own plane. */
             if (!ELEM(i_test, i, j, k)) {
               break;
             }
@@ -5393,7 +5393,7 @@ void accumulate_vertex_normals_poly_v3(float **vertnos,
 
 void tangent_from_uv_v3(const float uv1[2],
                         const float uv2[2],
-                        const float uv3[3],
+                        const float uv3[2],
                         const float co1[3],
                         const float co2[3],
                         const float co3[3],
diff --git a/source/blender/blenlib/intern/math_matrix.c b/source/blender/blenlib/intern/math_matrix.c
index 2ada05d2965..d447da4de64 100644
--- a/source/blender/blenlib/intern/math_matrix.c
+++ b/source/blender/blenlib/intern/math_matrix.c
@@ -1700,6 +1700,89 @@ void orthogonalize_m4_stable(float R[4][4], int axis, bool normalize)
   }
 }
 
+/* -------------------------------------------------------------------- */
+/** \name Orthogonalize Matrix Zeroed Axes
+ *
+ * Set any zeroed axes to an orthogonal vector in relation to the other axes.
+ *
+ * Typically used so matrix inversion can be performed.
+ *
+ * \note If an object has a zero scaled axis, this function can be used to "clean" the matrix
+ * to behave as if the scale on that axis was `unit_length`. So it can be inverted
+ * or used in matrix multiply without creating degenerate matrices, see: T50103
+ * \{ */
+
+/**
+ * \return true if any axis needed to be modified.
+ */
+static bool orthogonalize_m3_zero_axes_impl(float *mat[3], const float unit_length)
+{
+  enum { X = 1 << 0, Y = 1 << 1, Z = 1 << 2 };
+  int flag = 0;
+  for (int i = 0; i < 3; i++) {
+    flag |= (len_squared_v3(mat[i]) == 0.0f) ? (1 << i) : 0;
+  }
+
+  /* Either all or none are zero, either way we can't properly resolve this
+   * since we need to fill invalid axes from valid ones. */
+  if (ELEM(flag, 0, X | Y | Z)) {
+    return false;
+  }
+
+  switch (flag) {
+    case X | Y: {
+      ortho_v3_v3(mat[1], mat[2]);
+      ATTR_FALLTHROUGH;
+    }
+    case X: {
+      cross_v3_v3v3(mat[0], mat[1], mat[2]);
+      break;
+    }
+
+    case Y | Z: {
+      ortho_v3_v3(mat[2], mat[0]);
+      ATTR_FALLTHROUGH;
+    }
+    case Y: {
+      cross_v3_v3v3(mat[1], mat[0], mat[2]);
+      break;
+    }
+
+    case Z | X: {
+      ortho_v3_v3(mat[0], mat[1]);
+      ATTR_FALLTHROUGH;
+    }
+    case Z: {
+      cross_v3_v3v3(mat[2], mat[0], mat[1]);
+      break;
+    }
+    default: {
+      BLI_assert(0); /* Unreachable! */
+    }
+  }
+
+  for (int i = 0; i < 3; i++) {
+    if (flag & (1 << i)) {
+      if (UNLIKELY(normalize_v3_length(mat[i], unit_length) == 0.0f)) {
+        mat[i][i] = unit_length;
+      }
+    }
+  }
+
+  return true;
+}
+
+bool orthogonalize_m3_zero_axes(float m[3][3], const float unit_length)
+{
+  return orthogonalize_m3_zero_axes_impl((float *[3]){UNPACK3(m)}, unit_length);
+}
+bool orthogonalize_m4_zero_axes(float m[4][4], const float unit_length)
+{
+  return orthogonalize_m3_zero_axes_impl((float *[3]){UNPACK3(m)}, unit_length);
+}
+
+/** \} */
+
 bool is_orthogonal_m3(const float m[3][3])
 {
   int i, j;
@@ -3196,68 +3279,6 @@ void invert_m4_m4_safe(float Ainv[4][4], const float A[4][4])
  * \{ */
 
 /**
- * Return true if invert should be attempted again.
- *
- * \note Takes an array of points to be usable from 3x3 and 4x4 matrices.
- */
-static bool invert_m3_m3_safe_ortho_prepare(float *mat[3])
-{
-  enum { X = 1 << 0, Y = 1 << 1, Z = 1 << 2 };
-  int flag = 0;
-  for (int i = 0; i < 3; i++) {
-    flag |= (len_squared_v3(mat[i]) == 0.0f) ? (1 << i) : 0;
-  }
-
-  /* Either all or none are zero, either way we can't properly resolve this
-   * since we need to fill invalid axes from valid ones. */
-  if (ELEM(flag, 0, X | Y | Z)) {
-    return false;
-  }
-
-  switch (flag) {
-    case X | Y: {
-      ortho_v3_v3(mat[1], mat[2]);
-      ATTR_FALLTHROUGH;
-    }
-    case X: {
-      cross_v3_v3v3(mat[0], mat[1], mat[2]);
-      break;
-    }
-
-    case Y | Z: {
-      ortho_v3_v3(mat[2], mat[0]);
-      ATTR_FALLTHROUGH;
-    }
-    case Y: {
-      cross_v3_v3v3(mat[1], mat[0], mat[2]);
-      break;
-    }
-
-    case Z | X: {
-      ortho_v3_v3(mat[0], mat[1]);
-      ATTR_FALLTHROUGH;
-    }
-    case Z: {
-      cross_v3_v3v3(mat[2], mat[0], mat[1]);
-      break;
-    }
-    default: {
-      BLI_assert(0); /* Unreachable! */
-    }
-  }
-
-  for (int i = 0; i < 3; i++) {
-    if (flag & (1 << i)) {
-      if (UNLIKELY(normalize_v3(mat[i]) == 0.0f)) {
-        mat[i][i] = 1.0f;
-      }
-    }
-  }
-
-  return true;
-}
-
-/**
  * A safe version of invert that uses valid axes, calculating the zero'd axis
  * based on the non-zero ones.
  *
@@ -3268,8 +3289,7 @@ void invert_m4_m4_safe_ortho(float Ainv[4][4], const float A[4][4])
   if (UNLIKELY(!invert_m4_m4(Ainv, A))) {
     float Atemp[4][4];
     copy_m4_m4(Atemp, A);
-    if (UNLIKELY(!(invert_m3_m3_safe_ortho_prepare((float *[3]){UNPACK3(Atemp)}) &&
-                   invert_m4_m4(Ainv, Atemp)))) {
+    if (UNLIKELY(!(orthogonalize_m4_zero_axes(Atemp, 1.0f) && invert_m4_m4(Ainv, Atemp)))) {
       unit_m4(Ainv);
     }
   }
@@ -3280,8 +3300,7 @@ void invert_m3_m3_safe_ortho(float Ainv[3][3], const float A[3][3])
   if (UNLIKELY(!invert_m3_m3(Ainv, A))) {
     float Atemp[3][3];
     copy_m3_m3(Atemp, A);
-    if (UNLIKELY(!(invert_m3_m3_safe_ortho_prepare((float *[3]){UNPACK3(Atemp)}) &&
-                   invert_m3_m3(Ainv, Atemp)))) {
+    if (UNLIKELY(!(orthogonalize_m3_zero_axes(Atemp, 1.0f) && invert_m3_m3(Ainv, Atemp)))) {
       unit_m3(Ainv);
     }
   }
diff --git a/source/blender/blenlib/intern/math_vector.c b/source/blender/blenlib/intern/math_vector.c
index a21e0c8f092..fb7b96fde78 100644
--- a/source/blender/blenlib/intern/math_vector.c
+++ b/source/blender/blenlib/intern/math_vector.c
@@ -412,7 +412,7 @@ bool is_finite_v4(const float v[4])
  * this would return the angle at the elbow.
  *
  * note that when v1/v2/v3 represent 3 points along a straight line
- * that the angle returned will be pi (180deg), rather then 0.0
+ * that the angle returned will be pi (180deg), rather than 0.0
  */
 float angle_v3v3v3(const float a[3], const float b[3], const float c[3])
 {
diff --git a/source/blender/blenlib/intern/memory_utils.c b/source/blender/blenlib/intern/memory_utils.c
index d477c20a242..5ca7b96c136 100644
--- a/source/blender/blenlib/intern/memory_utils.c
+++ b/source/blender/blenlib/intern/memory_utils.c
@@ -31,7 +31,7 @@
 #include "BLI_strict_flags.h"
 
 /**
- * Check if memory is zero'd, as with memset(arr, 0, arr_size)
+ * Check if memory is zeroed, as with `memset(arr, 0, arr_size)`.
  */
 bool BLI_memory_is_zero(const void *arr, const size_t arr_size)
 {
diff --git a/source/blender/blenlib/intern/mesh_boolean.cc b/source/blender/blenlib/intern/mesh_boolean.cc
index bf70b044d0d..c5c830bb4dd 100644
--- a/source/blender/blenlib/intern/mesh_boolean.cc
+++ b/source/blender/blenlib/intern/mesh_boolean.cc
@@ -38,10 +38,10 @@
 #  include "BLI_math_mpq.hh"
 #  include "BLI_mesh_intersect.hh"
 #  include "BLI_mpq3.hh"
-#  include "BLI_polyfill_2d.h"
 #  include "BLI_set.hh"
 #  include "BLI_span.hh"
 #  include "BLI_stack.hh"
+#  include "BLI_task.hh"
 #  include "BLI_vector.hh"
 #  include "BLI_vector_set.hh"
 
@@ -49,6 +49,10 @@
 
 #  include "BLI_mesh_boolean.hh"
 
+#  ifdef WITH_TBB
+#    include "tbb/spin_mutex.h"
+#  endif
+
 // #  define PERFDEBUG
 
 namespace blender::meshintersect {
@@ -145,11 +149,9 @@ class TriMeshTopology : NonCopyable {
    * Else return NO_INDEX. */
   int other_tri_if_manifold(Edge e, int t) const
   {
-    if (edge_tri_.contains(e)) {
-      auto *p = edge_tri_.lookup(e);
-      if (p->size() == 2) {
-        return ((*p)[0] == t) ? (*p)[1] : (*p)[0];
-      }
+    auto p = edge_tri_.lookup_ptr(e);
+    if (p != nullptr && (*p)->size() == 2) {
+      return ((**p)[0] == t) ? (**p)[1] : (**p)[0];
     }
     return NO_INDEX;
   }
@@ -1404,9 +1406,9 @@ static int find_cell_for_point_near_edge(mpq3 p,
   int dummy_index = p_sorted_dummy - sorted_tris.begin();
   int prev_tri = (dummy_index == 0) ? sorted_tris[sorted_tris.size() - 1] :
                                       sorted_tris[dummy_index - 1];
-  int next_tri = (dummy_index == sorted_tris.size() - 1) ? sorted_tris[0] :
-                                                           sorted_tris[dummy_index + 1];
   if (dbg_level > 0) {
+    int next_tri = (dummy_index == sorted_tris.size() - 1) ? sorted_tris[0] :
+                                                             sorted_tris[dummy_index + 1];
     std::cout << "prev tri to dummy = " << prev_tri << ";  next tri to dummy = " << next_tri
               << "\n";
   }
@@ -1691,9 +1693,24 @@ static int find_containing_cell(const Vert *v,
  * If the closest point is on an edge, return 0, 1, or 2
  * for edges ab, bc, or ca in *r_edge; else -1.
  * (Adapted from #closest_on_tri_to_point_v3()).
+ * The arguments ab, ac, ..., r are used as temporaries
+ * in this routine. Passing them in from the caller can
+ * avoid many allocs and frees of temporary mpq3 values
+ * and the mpq_class values within them.
  */
-static mpq_class closest_on_tri_to_point(
-    const mpq3 &p, const mpq3 &a, const mpq3 &b, const mpq3 &c, int *r_edge, int *r_vert)
+static mpq_class closest_on_tri_to_point(const mpq3 &p,
+                                         const mpq3 &a,
+                                         const mpq3 &b,
+                                         const mpq3 &c,
+                                         mpq3 &ab,
+                                         mpq3 &ac,
+                                         mpq3 &ap,
+                                         mpq3 &bp,
+                                         mpq3 &cp,
+                                         mpq3 &m,
+                                         mpq3 &r,
+                                         int *r_edge,
+                                         int *r_vert)
 {
   constexpr int dbg_level = 0;
   if (dbg_level > 0) {
@@ -1701,11 +1718,15 @@ static mpq_class closest_on_tri_to_point(
     std::cout << " a = " << a << ", b = " << b << ", c = " << c << "\n";
   }
   /* Check if p in vertex region outside a. */
-  mpq3 ab = b - a;
-  mpq3 ac = c - a;
-  mpq3 ap = p - a;
-  mpq_class d1 = mpq3::dot(ab, ap);
-  mpq_class d2 = mpq3::dot(ac, ap);
+  ab = b;
+  ab -= a;
+  ac = c;
+  ac -= a;
+  ap = p;
+  ap -= a;
+
+  mpq_class d1 = mpq3::dot_with_buffer(ab, ap, m);
+  mpq_class d2 = mpq3::dot_with_buffer(ac, ap, m);
   if (d1 <= 0 && d2 <= 0) {
     /* Barycentric coordinates (1,0,0). */
     *r_edge = -1;
@@ -1713,12 +1734,13 @@ static mpq_class closest_on_tri_to_point(
     if (dbg_level > 0) {
       std::cout << "  answer = a\n";
     }
-    return mpq3::distance_squared(p, a);
+    return mpq3::distance_squared_with_buffer(p, a, m);
   }
   /* Check if p in vertex region outside b. */
-  mpq3 bp = p - b;
-  mpq_class d3 = mpq3::dot(ab, bp);
-  mpq_class d4 = mpq3::dot(ac, bp);
+  bp = p;
+  bp -= b;
+  mpq_class d3 = mpq3::dot_with_buffer(ab, bp, m);
+  mpq_class d4 = mpq3::dot_with_buffer(ac, bp, m);
   if (d3 >= 0 && d4 <= d3) {
     /* Barycentric coordinates (0,1,0). */
     *r_edge = -1;
@@ -1726,25 +1748,28 @@ static mpq_class closest_on_tri_to_point(
     if (dbg_level > 0) {
       std::cout << "  answer = b\n";
     }
-    return mpq3::distance_squared(p, b);
+    return mpq3::distance_squared_with_buffer(p, b, m);
   }
   /* Check if p in region of ab. */
   mpq_class vc = d1 * d4 - d3 * d2;
   if (vc <= 0 && d1 >= 0 && d3 <= 0) {
     mpq_class v = d1 / (d1 - d3);
     /* Barycentric coordinates (1-v,v,0). */
-    mpq3 r = a + v * ab;
+    r = ab;
+    r *= v;
+    r += a;
     *r_vert = -1;
     *r_edge = 0;
     if (dbg_level > 0) {
       std::cout << "  answer = on ab at " << r << "\n";
     }
-    return mpq3::distance_squared(p, r);
+    return mpq3::distance_squared_with_buffer(p, r, m);
   }
   /* Check if p in vertex region outside c. */
-  mpq3 cp = p - c;
-  mpq_class d5 = mpq3::dot(ab, cp);
-  mpq_class d6 = mpq3::dot(ac, cp);
+  cp = p;
+  cp -= c;
+  mpq_class d5 = mpq3::dot_with_buffer(ab, cp, m);
+  mpq_class d6 = mpq3::dot_with_buffer(ac, cp, m);
   if (d6 >= 0 && d5 <= d6) {
     /* Barycentric coordinates (0,0,1). */
     *r_edge = -1;
@@ -1752,49 +1777,67 @@ static mpq_class closest_on_tri_to_point(
     if (dbg_level > 0) {
       std::cout << "  answer = c\n";
     }
-    return mpq3::distance_squared(p, c);
+    return mpq3::distance_squared_with_buffer(p, c, m);
   }
   /* Check if p in edge region of ac. */
   mpq_class vb = d5 * d2 - d1 * d6;
   if (vb <= 0 && d2 >= 0 && d6 <= 0) {
     mpq_class w = d2 / (d2 - d6);
     /* Barycentric coordinates (1-w,0,w). */
-    mpq3 r = a + w * ac;
+    r = ac;
+    r *= w;
+    r += a;
     *r_vert = -1;
     *r_edge = 2;
     if (dbg_level > 0) {
       std::cout << "  answer = on ac at " << r << "\n";
     }
-    return mpq3::distance_squared(p, r);
+    return mpq3::distance_squared_with_buffer(p, r, m);
   }
   /* Check if p in edge region of bc. */
   mpq_class va = d3 * d6 - d5 * d4;
   if (va <= 0 && (d4 - d3) >= 0 && (d5 - d6) >= 0) {
     mpq_class w = (d4 - d3) / ((d4 - d3) + (d5 - d6));
     /* Barycentric coordinates (0,1-w,w). */
-    mpq3 r = c - b;
-    r = w * r;
-    r = r + b;
+    r = c;
+    r -= b;
+    r *= w;
+    r += b;
     *r_vert = -1;
     *r_edge = 1;
     if (dbg_level > 0) {
       std::cout << "  answer = on bc at " << r << "\n";
     }
-    return mpq3::distance_squared(p, r);
+    return mpq3::distance_squared_with_buffer(p, r, m);
   }
   /* p inside face region. Compute barycentric coordinates (u,v,w). */
   mpq_class denom = 1 / (va + vb + vc);
   mpq_class v = vb * denom;
   mpq_class w = vc * denom;
-  ac = w * ac;
-  mpq3 r = a + v * ab;
-  r = r + ac;
+  ac *= w;
+  r = ab;
+  r *= v;
+  r += a;
+  r += ac;
   *r_vert = -1;
   *r_edge = -1;
   if (dbg_level > 0) {
     std::cout << "  answer = inside at " << r << "\n";
   }
-  return mpq3::distance_squared(p, r);
+  return mpq3::distance_squared_with_buffer(p, r, m);
+}
+
+static float closest_on_tri_to_point_float_dist_squared(const float3 &p,
+                                                        const double3 &a,
+                                                        const double3 &b,
+                                                        const double3 &c)
+{
+  float3 fa, fb, fc, closest;
+  copy_v3fl_v3db(fa, a);
+  copy_v3fl_v3db(fb, b);
+  copy_v3fl_v3db(fc, c);
+  closest_on_tri_to_point_v3(closest, p, fa, fb, fc);
+  return len_squared_v3v3(p, closest);
 }
 
 struct ComponentContainer {
@@ -1820,6 +1863,7 @@ static Vector<ComponentContainer> find_component_containers(int comp,
                                                             const IMesh &tm,
                                                             const PatchesInfo &pinfo,
                                                             const TriMeshTopology &tmtopo,
+                                                            Array<BoundingBox> &comp_bb,
                                                             IMeshArena *arena)
 {
   constexpr int dbg_level = 0;
@@ -1833,6 +1877,11 @@ static Vector<ComponentContainer> find_component_containers(int comp,
   if (dbg_level > 0) {
     std::cout << "test vertex in comp: " << test_v << "\n";
   }
+  const double3 &test_v_d = test_v->co;
+  float3 test_v_f(test_v_d[0], test_v_d[1], test_v_d[2]);
+
+  mpq3 buf[7];
+
   for (int comp_other : components.index_range()) {
     if (comp == comp_other) {
       continue;
@@ -1840,10 +1889,17 @@ static Vector<ComponentContainer> find_component_containers(int comp,
     if (dbg_level > 0) {
       std::cout << "comp_other = " << comp_other << "\n";
     }
+    if (!bbs_might_intersect(comp_bb[comp], comp_bb[comp_other])) {
+      if (dbg_level > 0) {
+        std::cout << "bounding boxes don't overlap\n";
+      }
+      continue;
+    }
     int nearest_tri = NO_INDEX;
     int nearest_tri_close_vert = -1;
     int nearest_tri_close_edge = -1;
     mpq_class nearest_tri_dist_squared;
+    float nearest_tri_dist_squared_float = FLT_MAX;
     for (int p : components[comp_other]) {
       const Patch &patch = pinfo.patch(p);
       for (int t : patch.tris()) {
@@ -1853,10 +1909,23 @@ static Vector<ComponentContainer> find_component_containers(int comp,
         }
         int close_vert;
         int close_edge;
+        /* Try a cheap float test first. */
+        float d2_f = closest_on_tri_to_point_float_dist_squared(
+            test_v_f, tri[0]->co, tri[1]->co, tri[2]->co);
+        if (d2_f - FLT_EPSILON > nearest_tri_dist_squared_float) {
+          continue;
+        }
         mpq_class d2 = closest_on_tri_to_point(test_v->co_exact,
                                                tri[0]->co_exact,
                                                tri[1]->co_exact,
                                                tri[2]->co_exact,
+                                               buf[0],
+                                               buf[1],
+                                               buf[2],
+                                               buf[3],
+                                               buf[4],
+                                               buf[5],
+                                               buf[6],
                                                &close_edge,
                                                &close_vert);
         if (dbg_level > 1) {
@@ -1868,6 +1937,7 @@ static Vector<ComponentContainer> find_component_containers(int comp,
           nearest_tri_close_edge = close_edge;
           nearest_tri_close_vert = close_vert;
           nearest_tri_dist_squared = d2;
+          nearest_tri_dist_squared_float = d2_f;
         }
       }
     }
@@ -1899,6 +1969,51 @@ static Vector<ComponentContainer> find_component_containers(int comp,
 }
 
 /**
+ * Populate the per-component bounding boxes, expanding them
+ * by an appropriate epsilon so that we conservatively will say
+ * that components could intersect if the BBs overlap.
+ */
+static void populate_comp_bbs(const Vector<Vector<int>> &components,
+                              const PatchesInfo &pinfo,
+                              const IMesh &im,
+                              Array<BoundingBox> &comp_bb)
+{
+  const int comp_grainsize = 16;
+  /* To get a good expansion epsilon, we need to find the maximum
+   * absolute value of any coordinate. Do it first per component,
+   * then get the overall max. */
+  Array<double> max_abs(components.size(), 0.0);
+  parallel_for(components.index_range(), comp_grainsize, [&](IndexRange comp_range) {
+    for (int c : comp_range) {
+      BoundingBox &bb = comp_bb[c];
+      double &maxa = max_abs[c];
+      for (int p : components[c]) {
+        const Patch &patch = pinfo.patch(p);
+        for (int t : patch.tris()) {
+          const Face &tri = *im.face(t);
+          for (const Vert *v : tri) {
+            bb.combine(v->co);
+            for (int i = 0; i < 3; ++i) {
+              maxa = max_dd(maxa, fabs(v->co[i]));
+            }
+          }
+        }
+      }
+    }
+  });
+  double all_max_abs = 0.0;
+  for (int c : components.index_range()) {
+    all_max_abs = max_dd(all_max_abs, max_abs[c]);
+  }
+  constexpr float pad_factor = 10.0f;
+  float pad = all_max_abs == 0.0 ? FLT_EPSILON : 2 * FLT_EPSILON * all_max_abs;
+  pad *= pad_factor;
+  for (int c : components.index_range()) {
+    comp_bb[c].expand(pad);
+  }
+}
+
+/**
  * The cells and patches are supposed to form a bipartite graph.
  * The graph may be disconnected (if parts of meshes are nested or side-by-side
  * without intersection with other each other).
@@ -1940,19 +2055,23 @@ static void finish_patch_cell_graph(const IMesh &tm,
   }
   int tot_components = components.size();
   Array<Vector<ComponentContainer>> comp_cont(tot_components);
-  for (int comp : components.index_range()) {
-    comp_cont[comp] = find_component_containers(
-        comp, components, ambient_cell, tm, pinfo, tmtopo, arena);
-  }
-  if (dbg_level > 0) {
-    std::cout << "component containers:\n";
-    for (int comp : comp_cont.index_range()) {
-      std::cout << comp << ": ";
-      for (const ComponentContainer &cc : comp_cont[comp]) {
-        std::cout << "[containing_comp=" << cc.containing_component
-                  << ", nearest_cell=" << cc.nearest_cell << ", d2=" << cc.dist_to_cell << "] ";
+  if (tot_components > 1) {
+    Array<BoundingBox> comp_bb(tot_components);
+    populate_comp_bbs(components, pinfo, tm, comp_bb);
+    for (int comp : components.index_range()) {
+      comp_cont[comp] = find_component_containers(
+          comp, components, ambient_cell, tm, pinfo, tmtopo, comp_bb, arena);
+    }
+    if (dbg_level > 0) {
+      std::cout << "component containers:\n";
+      for (int comp : comp_cont.index_range()) {
+        std::cout << comp << ": ";
+        for (const ComponentContainer &cc : comp_cont[comp]) {
+          std::cout << "[containing_comp=" << cc.containing_component
+                    << ", nearest_cell=" << cc.nearest_cell << ", d2=" << cc.dist_to_cell << "] ";
+        }
+        std::cout << "\n";
       }
-      std::cout << "\n";
     }
   }
   if (dbg_level > 1) {
@@ -2568,47 +2687,58 @@ static IMesh raycast_tris_boolean(const IMesh &tm,
   BVHTree *tree = raycast_tree(tm);
   Vector<Face *> out_faces;
   out_faces.reserve(tm.face_size());
-  Array<float> in_shape(nshapes, 0);
-  Array<int> winding(nshapes, 0);
-  for (int t : tm.face_index_range()) {
-    Face &tri = *tm.face(t);
-    int shape = shape_fn(tri.orig);
-    if (dbg_level > 0) {
-      std::cout << "process triangle " << t << " = " << &tri << "\n";
-      std::cout << "shape = " << shape << "\n";
-    }
-    test_tri_inside_shapes(tm, shape_fn, nshapes, t, tree, in_shape);
-    for (int other_shape = 0; other_shape < nshapes; ++other_shape) {
-      if (other_shape == shape) {
-        continue;
-      }
-      /* The in_shape array has a confidence value for "insideness".
-       * For most operations, even a hint of being inside
-       * gives good results, but when shape is a cutter in a Difference
-       * operation, we want to be pretty sure that the point is inside other_shape.
-       * E.g., T75827.
-       * Also, when the operation is intersection, we also want high confidence.
-       */
-      bool need_high_confidence = (op == BoolOpType::Difference && shape != 0) ||
-                                  op == BoolOpType::Intersect;
-      bool inside = in_shape[other_shape] >= (need_high_confidence ? 0.5f : 0.1f);
+#  ifdef WITH_TBB
+  tbb::spin_mutex mtx;
+#  endif
+  const int grainsize = 256;
+  parallel_for(IndexRange(tm.face_size()), grainsize, [&](IndexRange range) {
+    Array<float> in_shape(nshapes, 0);
+    Array<int> winding(nshapes, 0);
+    for (int t : range) {
+      Face &tri = *tm.face(t);
+      int shape = shape_fn(tri.orig);
       if (dbg_level > 0) {
-        std::cout << "test point is " << (inside ? "inside" : "outside") << " other_shape "
-                  << other_shape << " val = " << in_shape[other_shape] << "\n";
+        std::cout << "process triangle " << t << " = " << &tri << "\n";
+        std::cout << "shape = " << shape << "\n";
       }
-      winding[other_shape] = inside;
-    }
-    bool do_flip;
-    bool do_remove = raycast_test_remove(op, winding, shape, &do_flip);
-    if (!do_remove) {
-      if (!do_flip) {
-        out_faces.append(&tri);
+      test_tri_inside_shapes(tm, shape_fn, nshapes, t, tree, in_shape);
+      for (int other_shape = 0; other_shape < nshapes; ++other_shape) {
+        if (other_shape == shape) {
+          continue;
+        }
+        /* The in_shape array has a confidence value for "insideness".
+         * For most operations, even a hint of being inside
+         * gives good results, but when shape is a cutter in a Difference
+         * operation, we want to be pretty sure that the point is inside other_shape.
+         * E.g., T75827.
+         * Also, when the operation is intersection, we also want high confidence.
+         */
+        bool need_high_confidence = (op == BoolOpType::Difference && shape != 0) ||
+                                    op == BoolOpType::Intersect;
+        bool inside = in_shape[other_shape] >= (need_high_confidence ? 0.5f : 0.1f);
+        if (dbg_level > 0) {
+          std::cout << "test point is " << (inside ? "inside" : "outside") << " other_shape "
+                    << other_shape << " val = " << in_shape[other_shape] << "\n";
+        }
+        winding[other_shape] = inside;
       }
-      else {
-        raycast_add_flipped(out_faces, tri, arena);
+      bool do_flip;
+      bool do_remove = raycast_test_remove(op, winding, shape, &do_flip);
+      {
+#  ifdef WITH_TBB
+        tbb::spin_mutex::scoped_lock lock(mtx);
+#  endif
+        if (!do_remove) {
+          if (!do_flip) {
+            out_faces.append(&tri);
+          }
+          else {
+            raycast_add_flipped(out_faces, tri, arena);
+          }
+        }
       }
     }
-  }
+  });
   BLI_bvhtree_free(tree);
   ans.set_faces(out_faces);
   return ans;
@@ -2680,113 +2810,10 @@ static IMesh raycast_patches_boolean(const IMesh &tm,
     }
   }
   BLI_bvhtree_free(tree);
-  ans.set_faces(out_faces);
-  return ans;
-}
-
-/**
- * Tessellate face f into triangles and return an array of `const Face *`
- * giving that triangulation. Intended to be used when f has > 4 vertices.
- * Care is taken so that the original edge index associated with
- * each edge in the output triangles either matches the original edge
- * for the (identical) edge of f, or else is -1. So diagonals added
- * for triangulation can later be identified by having #NO_INDEX for original.
- */
-static Array<Face *> triangulate_poly(Face *f, IMeshArena *arena)
-{
-  /* Similar to loop body in BM_mesh_calc_tesselation. */
-  int flen = f->size();
-  BLI_assert(flen > 4);
-  if (!f->plane_populated()) {
-    f->populate_plane(false);
-  }
-  /* Project along negative face normal so (x,y) can be used in 2d. */
-  const double3 &poly_normal = f->plane->norm;
-  float no[3] = {float(poly_normal[0]), float(poly_normal[1]), float(poly_normal[2])};
-  normalize_v3(no);
-  float axis_mat[3][3];
-  float(*projverts)[2];
-  unsigned int(*tris)[3];
-  const int totfilltri = flen - 2;
-  /* Prepare projected vertices and array to receive triangles in tesselation. */
-  tris = static_cast<unsigned int(*)[3]>(MEM_malloc_arrayN(totfilltri, sizeof(*tris), __func__));
-  projverts = static_cast<float(*)[2]>(MEM_malloc_arrayN(flen, sizeof(*projverts), __func__));
-  axis_dominant_v3_to_m3_negate(axis_mat, no);
-  for (int j = 0; j < flen; ++j) {
-    const double3 &dco = (*f)[j]->co;
-    float co[3] = {float(dco[0]), float(dco[1]), float(dco[2])};
-    mul_v2_m3v3(projverts[j], axis_mat, co);
-  }
-  BLI_polyfill_calc(projverts, flen, 1, tris);
-  /* Put tesselation triangles into Face form. Record original edges where they exist. */
-  Array<Face *> ans(totfilltri);
-  for (int t = 0; t < totfilltri; ++t) {
-    unsigned int *tri = tris[t];
-    int eo[3];
-    const Vert *v[3];
-    for (int k = 0; k < 3; k++) {
-      BLI_assert(tri[k] < flen);
-      v[k] = (*f)[tri[k]];
-      /* If tri edge goes between two successive indices in
-       * the original face, then it is an original edge. */
-      if ((tri[k] + 1) % flen == tri[(k + 1) % 3]) {
-        eo[k] = f->edge_orig[tri[k]];
-      }
-      else {
-        eo[k] = NO_INDEX;
-      }
-      ans[t] = arena->add_face(
-          {v[0], v[1], v[2]}, f->orig, {eo[0], eo[1], eo[2]}, {false, false, false});
-    }
-  }
-
-  MEM_freeN(tris);
-  MEM_freeN(projverts);
 
+  ans.set_faces(out_faces);
   return ans;
 }
-
-/**
- * Return an #IMesh that is a triangulation of a mesh with general
- * polygonal faces, #IMesh.
- * Added diagonals will be distinguishable by having edge original
- * indices of #NO_INDEX.
- */
-static IMesh triangulate_polymesh(IMesh &imesh, IMeshArena *arena)
-{
-  Vector<Face *> face_tris;
-  constexpr int estimated_tris_per_face = 3;
-  face_tris.reserve(estimated_tris_per_face * imesh.face_size());
-  for (Face *f : imesh.faces()) {
-    /* Tessellate face f, following plan similar to #BM_face_calc_tesselation. */
-    int flen = f->size();
-    if (flen == 3) {
-      face_tris.append(f);
-    }
-    else if (flen == 4) {
-      const Vert *v0 = (*f)[0];
-      const Vert *v1 = (*f)[1];
-      const Vert *v2 = (*f)[2];
-      const Vert *v3 = (*f)[3];
-      int eo_01 = f->edge_orig[0];
-      int eo_12 = f->edge_orig[1];
-      int eo_23 = f->edge_orig[2];
-      int eo_30 = f->edge_orig[3];
-      Face *f0 = arena->add_face({v0, v1, v2}, f->orig, {eo_01, eo_12, -1}, {false, false, false});
-      Face *f1 = arena->add_face({v0, v2, v3}, f->orig, {-1, eo_23, eo_30}, {false, false, false});
-      face_tris.append(f0);
-      face_tris.append(f1);
-    }
-    else {
-      Array<Face *> tris = triangulate_poly(f, arena);
-      for (Face *tri : tris) {
-        face_tris.append(tri);
-      }
-    }
-  }
-  return IMesh(face_tris);
-}
-
 /**
  * If \a tri1 and \a tri2 have a common edge (in opposite orientation),
  * return the indices into \a tri1 and \a tri2 where that common edge starts. Else return (-1,-1).
@@ -3363,9 +3390,13 @@ static IMesh polymesh_from_trimesh_with_dissolve(const IMesh &tm_out,
     std::cout << "\nPOLYMESH_FROM_TRIMESH_WITH_DISSOLVE\n";
   }
   /* For now: need plane normals for all triangles. */
-  for (Face *tri : tm_out.faces()) {
-    tri->populate_plane(false);
-  }
+  const int grainsize = 1024;
+  parallel_for(tm_out.face_index_range(), grainsize, [&](IndexRange range) {
+    for (int i : range) {
+      Face *tri = tm_out.face(i);
+      tri->populate_plane(false);
+    }
+  });
   /* Gather all output triangles that are part of each input face.
    * face_output_tris[f] will be indices of triangles in tm_out
    * that have f as their original face. */
diff --git a/source/blender/blenlib/intern/mesh_intersect.cc b/source/blender/blenlib/intern/mesh_intersect.cc
index b2b8dd4e900..5b7b6d9a929 100644
--- a/source/blender/blenlib/intern/mesh_intersect.cc
+++ b/source/blender/blenlib/intern/mesh_intersect.cc
@@ -39,6 +39,7 @@
 #  include "BLI_math_mpq.hh"
 #  include "BLI_mpq2.hh"
 #  include "BLI_mpq3.hh"
+#  include "BLI_polyfill_2d.h"
 #  include "BLI_set.hh"
 #  include "BLI_span.hh"
 #  include "BLI_task.h"
@@ -527,9 +528,7 @@ IMeshArena::IMeshArena()
   pimpl_ = std::make_unique<IMeshArenaImpl>();
 }
 
-IMeshArena::~IMeshArena()
-{
-}
+IMeshArena::~IMeshArena() = default;
 
 void IMeshArena::reserve(int vert_num_hint, int face_num_hint)
 {
@@ -745,83 +744,12 @@ std::ostream &operator<<(std::ostream &os, const IMesh &mesh)
   return os;
 }
 
-struct BoundingBox {
-  float3 min{FLT_MAX, FLT_MAX, FLT_MAX};
-  float3 max{-FLT_MAX, -FLT_MAX, -FLT_MAX};
-
-  BoundingBox() = default;
-  BoundingBox(const float3 &min, const float3 &max) : min(min), max(max)
-  {
-  }
-  BoundingBox(const BoundingBox &other) : min(other.min), max(other.max)
-  {
-  }
-  BoundingBox(BoundingBox &&other) noexcept : min(std::move(other.min)), max(std::move(other.max))
-  {
-  }
-  ~BoundingBox() = default;
-  BoundingBox operator=(const BoundingBox &other)
-  {
-    if (this != &other) {
-      min = other.min;
-      max = other.max;
-    }
-    return *this;
-  }
-  BoundingBox operator=(BoundingBox &&other) noexcept
-  {
-    min = std::move(other.min);
-    max = std::move(other.max);
-    return *this;
-  }
-
-  void combine(const float3 &p)
-  {
-    min.x = min_ff(min.x, p.x);
-    min.y = min_ff(min.y, p.y);
-    min.z = min_ff(min.z, p.z);
-    max.x = max_ff(max.x, p.x);
-    max.y = max_ff(max.y, p.y);
-    max.z = max_ff(max.z, p.z);
-  }
-
-  void combine(const double3 &p)
-  {
-    min.x = min_ff(min.x, static_cast<float>(p.x));
-    min.y = min_ff(min.y, static_cast<float>(p.y));
-    min.z = min_ff(min.z, static_cast<float>(p.z));
-    max.x = max_ff(max.x, static_cast<float>(p.x));
-    max.y = max_ff(max.y, static_cast<float>(p.y));
-    max.z = max_ff(max.z, static_cast<float>(p.z));
-  }
-
-  void combine(const BoundingBox &bb)
-  {
-    min.x = min_ff(min.x, bb.min.x);
-    min.y = min_ff(min.y, bb.min.y);
-    min.z = min_ff(min.z, bb.min.z);
-    max.x = max_ff(max.x, bb.max.x);
-    max.y = max_ff(max.y, bb.max.y);
-    max.z = max_ff(max.z, bb.max.z);
-  }
-
-  void expand(float pad)
-  {
-    min.x -= pad;
-    min.y -= pad;
-    min.z -= pad;
-    max.x += pad;
-    max.y += pad;
-    max.z += pad;
-  }
-};
-
 /**
  * Assume bounding boxes have been expanded by a sufficient epsilon on all sides
  * so that the comparisons against the bb bounds are sufficient to guarantee that
  * if an overlap or even touching could happen, this will return true.
  */
-static bool bbs_might_intersect(const BoundingBox &bb_a, const BoundingBox &bb_b)
+bool bbs_might_intersect(const BoundingBox &bb_a, const BoundingBox &bb_b)
 {
   return isect_aabb_aabb_v3(bb_a.min, bb_a.max, bb_b.min, bb_b.max);
 }
@@ -936,28 +864,6 @@ class CoplanarCluster {
   {
     this->add_tri(t, bb);
   }
-  CoplanarCluster(const CoplanarCluster &other) : tris_(other.tris_), bb_(other.bb_)
-  {
-  }
-  CoplanarCluster(CoplanarCluster &&other) noexcept
-      : tris_(std::move(other.tris_)), bb_(std::move(other.bb_))
-  {
-  }
-  ~CoplanarCluster() = default;
-  CoplanarCluster &operator=(const CoplanarCluster &other)
-  {
-    if (this != &other) {
-      tris_ = other.tris_;
-      bb_ = other.bb_;
-    }
-    return *this;
-  }
-  CoplanarCluster &operator=(CoplanarCluster &&other) noexcept
-  {
-    tris_ = std::move(other.tris_);
-    bb_ = std::move(other.bb_);
-    return *this;
-  }
 
   /* Assume that caller knows this will not be a duplicate. */
   void add_tri(int t, const BoundingBox &bb)
@@ -1073,38 +979,6 @@ struct ITT_value {
   ITT_value(ITT_value_kind k, const mpq3 &p1, const mpq3 &p2) : p1(p1), p2(p2), kind(k)
   {
   }
-  ITT_value(const ITT_value &other)
-      : p1(other.p1), p2(other.p2), t_source(other.t_source), kind(other.kind)
-  {
-  }
-  ITT_value(ITT_value &&other) noexcept
-      : p1(std::move(other.p1)),
-        p2(std::move(other.p2)),
-        t_source(other.t_source),
-        kind(other.kind)
-  {
-  }
-  ~ITT_value()
-  {
-  }
-  ITT_value &operator=(const ITT_value &other)
-  {
-    if (this != &other) {
-      kind = other.kind;
-      p1 = other.p1;
-      p2 = other.p2;
-      t_source = other.t_source;
-    }
-    return *this;
-  }
-  ITT_value &operator=(ITT_value &&other) noexcept
-  {
-    kind = other.kind;
-    p1 = std::move(other.p1);
-    p2 = std::move(other.p2);
-    t_source = other.t_source;
-    return *this;
-  }
 };
 
 static std::ostream &operator<<(std::ostream &os, const ITT_value &itt);
@@ -1241,13 +1115,19 @@ static int filter_plane_side(const double3 &p,
  * Assumes ab is not perpendicular to n.
  * This works because the ratio of the projections of ab and ac onto n is the same as
  * the ratio along the line ab of the intersection point to the whole of ab.
+ * The ab, ac, and dotbuf arguments are used as a temporaries; declaring them
+ * in the caller can avoid many allocs and frees of mpq3 and mpq_class structures.
  */
-static inline mpq3 tti_interp(const mpq3 &a, const mpq3 &b, const mpq3 &c, const mpq3 &n)
-{
-  mpq3 ab = a - b;
-  mpq_class den = mpq3::dot(ab, n);
+static inline mpq3 tti_interp(
+    const mpq3 &a, const mpq3 &b, const mpq3 &c, const mpq3 &n, mpq3 &ab, mpq3 &ac, mpq3 &dotbuf)
+{
+  ab = a;
+  ab -= b;
+  ac = a;
+  ac -= c;
+  mpq_class den = mpq3::dot_with_buffer(ab, n, dotbuf);
   BLI_assert(den != 0);
-  mpq_class alpha = mpq3::dot(a - c, n) / den;
+  mpq_class alpha = mpq3::dot_with_buffer(ac, n, dotbuf) / den;
   return a - alpha * ab;
 }
 
@@ -1255,11 +1135,28 @@ static inline mpq3 tti_interp(const mpq3 &a, const mpq3 &b, const mpq3 &c, const
  * Return +1, 0, -1 as a + ad is above, on, or below the oriented plane containing a, b, c in CCW
  * order. This is the same as -oriented(a, b, c, a + ad), but uses fewer arithmetic operations.
  * TODO: change arguments to `const Vert *` and use floating filters.
+ * The ba, ca, n, and dotbuf arguments are used as temporaries; declaring them
+ * in the caller can avoid many allocs and frees of mpq3 and mpq_class structures.
  */
-static inline int tti_above(const mpq3 &a, const mpq3 &b, const mpq3 &c, const mpq3 &ad)
+static inline int tti_above(const mpq3 &a,
+                            const mpq3 &b,
+                            const mpq3 &c,
+                            const mpq3 &ad,
+                            mpq3 &ba,
+                            mpq3 &ca,
+                            mpq3 &n,
+                            mpq3 &dotbuf)
 {
-  mpq3 n = mpq3::cross(b - a, c - a);
-  return sgn(mpq3::dot(ad, n));
+  ba = b;
+  ba -= a;
+  ca = c;
+  ca -= a;
+
+  n.x = ba.y * ca.z - ba.z * ca.y;
+  n.y = ba.z * ca.x - ba.x * ca.z;
+  n.z = ba.x * ca.y - ba.y * ca.x;
+
+  return sgn(mpq3::dot_with_buffer(ad, n, dotbuf));
 }
 
 /**
@@ -1303,20 +1200,21 @@ static ITT_value itt_canon2(const mpq3 &p1,
   mpq3 p1p2 = p2 - p1;
   mpq3 intersect_1;
   mpq3 intersect_2;
+  mpq3 buf[4];
   bool no_overlap = false;
   /* Top test in classification tree. */
-  if (tti_above(p1, q1, r2, p1p2) > 0) {
+  if (tti_above(p1, q1, r2, p1p2, buf[0], buf[1], buf[2], buf[3]) > 0) {
     /* Middle right test in classification tree. */
-    if (tti_above(p1, r1, r2, p1p2) <= 0) {
+    if (tti_above(p1, r1, r2, p1p2, buf[0], buf[1], buf[2], buf[3]) <= 0) {
       /* Bottom right test in classification tree. */
-      if (tti_above(p1, r1, q2, p1p2) > 0) {
+      if (tti_above(p1, r1, q2, p1p2, buf[0], buf[1], buf[2], buf[3]) > 0) {
         /* Overlap is [k [i l] j]. */
         if (dbg_level > 0) {
           std::cout << "overlap [k [i l] j]\n";
         }
         /* i is intersect with p1r1. l is intersect with p2r2. */
-        intersect_1 = tti_interp(p1, r1, p2, n2);
-        intersect_2 = tti_interp(p2, r2, p1, n1);
+        intersect_1 = tti_interp(p1, r1, p2, n2, buf[0], buf[1], buf[2]);
+        intersect_2 = tti_interp(p2, r2, p1, n1, buf[0], buf[1], buf[2]);
       }
       else {
         /* Overlap is [i [k l] j]. */
@@ -1324,8 +1222,8 @@ static ITT_value itt_canon2(const mpq3 &p1,
           std::cout << "overlap [i [k l] j]\n";
         }
         /* k is intersect with p2q2. l is intersect is p2r2. */
-        intersect_1 = tti_interp(p2, q2, p1, n1);
-        intersect_2 = tti_interp(p2, r2, p1, n1);
+        intersect_1 = tti_interp(p2, q2, p1, n1, buf[0], buf[1], buf[2]);
+        intersect_2 = tti_interp(p2, r2, p1, n1, buf[0], buf[1], buf[2]);
       }
     }
     else {
@@ -1338,7 +1236,7 @@ static ITT_value itt_canon2(const mpq3 &p1,
   }
   else {
     /* Middle left test in classification tree. */
-    if (tti_above(p1, q1, q2, p1p2) < 0) {
+    if (tti_above(p1, q1, q2, p1p2, buf[0], buf[1], buf[2], buf[3]) < 0) {
       /* No overlap: [i j] [k l]. */
       if (dbg_level > 0) {
         std::cout << "no overlap: [i j] [k l]\n";
@@ -1347,14 +1245,14 @@ static ITT_value itt_canon2(const mpq3 &p1,
     }
     else {
       /* Bottom left test in classification tree. */
-      if (tti_above(p1, r1, q2, p1p2) >= 0) {
+      if (tti_above(p1, r1, q2, p1p2, buf[0], buf[1], buf[2], buf[3]) >= 0) {
         /* Overlap is [k [i j] l]. */
         if (dbg_level > 0) {
           std::cout << "overlap [k [i j] l]\n";
         }
         /* i is intersect with p1r1. j is intersect with p1q1. */
-        intersect_1 = tti_interp(p1, r1, p2, n2);
-        intersect_2 = tti_interp(p1, q1, p2, n2);
+        intersect_1 = tti_interp(p1, r1, p2, n2, buf[0], buf[1], buf[2]);
+        intersect_2 = tti_interp(p1, q1, p2, n2, buf[0], buf[1], buf[2]);
       }
       else {
         /* Overlap is [i [k j] l]. */
@@ -1362,8 +1260,8 @@ static ITT_value itt_canon2(const mpq3 &p1,
           std::cout << "overlap [i [k j] l]\n";
         }
         /* k is intersect with p2q2. j is intersect with p1q1. */
-        intersect_1 = tti_interp(p2, q2, p1, n1);
-        intersect_2 = tti_interp(p1, q1, p2, n2);
+        intersect_1 = tti_interp(p2, q2, p1, n1, buf[0], buf[1], buf[2]);
+        intersect_2 = tti_interp(p1, q1, p2, n2, buf[0], buf[1], buf[2]);
       }
     }
   }
@@ -1514,6 +1412,7 @@ static ITT_value intersect_tri_tri(const IMesh &tm, int t1, int t2)
     return ITT_value(INONE);
   }
 
+  mpq3 buf[2];
   const mpq3 &p1 = vp1->co_exact;
   const mpq3 &q1 = vq1->co_exact;
   const mpq3 &r1 = vr1->co_exact;
@@ -1523,13 +1422,19 @@ static ITT_value intersect_tri_tri(const IMesh &tm, int t1, int t2)
 
   const mpq3 &n2 = tri2.plane->norm_exact;
   if (sp1 == 0) {
-    sp1 = sgn(mpq3::dot(p1 - r2, n2));
+    buf[0] = p1;
+    buf[0] -= r2;
+    sp1 = sgn(mpq3::dot_with_buffer(buf[0], n2, buf[1]));
   }
   if (sq1 == 0) {
-    sq1 = sgn(mpq3::dot(q1 - r2, n2));
+    buf[0] = q1;
+    buf[0] -= r2;
+    sq1 = sgn(mpq3::dot_with_buffer(buf[0], n2, buf[1]));
   }
   if (sr1 == 0) {
-    sr1 = sgn(mpq3::dot(r1 - r2, n2));
+    buf[0] = r1;
+    buf[0] -= r2;
+    sr1 = sgn(mpq3::dot_with_buffer(buf[0], n2, buf[1]));
   }
 
   if (dbg_level > 1) {
@@ -1549,13 +1454,19 @@ static ITT_value intersect_tri_tri(const IMesh &tm, int t1, int t2)
   /* Repeat for signs of t2's vertices with respect to plane of t1. */
   const mpq3 &n1 = tri1.plane->norm_exact;
   if (sp2 == 0) {
-    sp2 = sgn(mpq3::dot(p2 - r1, n1));
+    buf[0] = p2;
+    buf[0] -= r1;
+    sp2 = sgn(mpq3::dot_with_buffer(buf[0], n1, buf[1]));
   }
   if (sq2 == 0) {
-    sq2 = sgn(mpq3::dot(q2 - r1, n1));
+    buf[0] = q2;
+    buf[0] -= r1;
+    sq2 = sgn(mpq3::dot_with_buffer(buf[0], n1, buf[1]));
   }
   if (sr2 == 0) {
-    sr2 = sgn(mpq3::dot(r2 - r1, n1));
+    buf[0] = r2;
+    buf[0] -= r1;
+    sr2 = sgn(mpq3::dot_with_buffer(buf[0], n1, buf[1]));
   }
 
   if (dbg_level > 1) {
@@ -1653,6 +1564,9 @@ struct CDT_data {
   Vector<bool> is_reversed;
   /** Result of running CDT on input with (vert, edge, face). */
   CDT_result<mpq_class> cdt_out;
+  /** To speed up get_cdt_edge_orig, sometimes populate this map from vertex pair to output edge.
+   */
+  Map<std::pair<int, int>, int> verts_to_edge;
   int proj_axis;
 };
 
@@ -1811,6 +1725,30 @@ static CDT_data prepare_cdt_input_for_cluster(const IMesh &tm,
   return ans;
 }
 
+/* Return a copy of the argument with the integers ordered in ascending order. */
+static inline std::pair<int, int> sorted_int_pair(std::pair<int, int> pair)
+{
+  if (pair.first <= pair.second) {
+    return pair;
+  }
+  return std::pair<int, int>(pair.second, pair.first);
+}
+
+/**
+ * Build cd.verts_to_edge to map from a pair of cdt output indices to an index in cd.cdt_out.edge.
+ * Order the vertex indices so that the smaller one is first in the pair.
+ */
+static void populate_cdt_edge_map(Map<std::pair<int, int>, int> &verts_to_edge,
+                                  const CDT_result<mpq_class> &cdt_out)
+{
+  verts_to_edge.reserve(cdt_out.edge.size());
+  for (int e : cdt_out.edge.index_range()) {
+    std::pair<int, int> vpair = sorted_int_pair(cdt_out.edge[e]);
+    /* There should be only one edge for each vertex pair. */
+    verts_to_edge.add(vpair, e);
+  }
+}
+
 /**
  * Fills in cd.cdt_out with result of doing the cdt calculation on (vert, edge, face).
  */
@@ -1843,6 +1781,10 @@ static void do_cdt(CDT_data &cd)
   }
   cdt_in.epsilon = 0; /* TODO: needs attention for non-exact T. */
   cd.cdt_out = blender::meshintersect::delaunay_2d_calc(cdt_in, CDT_INSIDE);
+  constexpr int make_edge_map_threshold = 15;
+  if (cd.cdt_out.edge.size() >= make_edge_map_threshold) {
+    populate_cdt_edge_map(cd.verts_to_edge, cd.cdt_out);
+  }
   if (dbg_level > 0) {
     std::cout << "\nCDT result\nVerts:\n";
     for (int i : cd.cdt_out.vert.index_range()) {
@@ -1879,39 +1821,52 @@ static int get_cdt_edge_orig(
 {
   int foff = cd.cdt_out.face_edge_offset;
   *r_is_intersect = false;
-  for (int e : cd.cdt_out.edge.index_range()) {
-    std::pair<int, int> edge = cd.cdt_out.edge[e];
-    if ((edge.first == i0 && edge.second == i1) || (edge.first == i1 && edge.second == i0)) {
-      /* Pick an arbitrary orig, but not one equal to NO_INDEX, if we can help it. */
-      /* TODO: if edge has origs from more than on part of the nary input,
-       * then want to set *r_is_intersect to true. */
-      for (int orig_index : cd.cdt_out.edge_orig[e]) {
-        /* orig_index encodes the triangle and pos within the triangle of the input edge. */
-        if (orig_index >= foff) {
-          int in_face_index = (orig_index / foff) - 1;
-          int pos = orig_index % foff;
-          /* We need to retrieve the edge orig field from the Face used to populate the
-           * in_face_index'th face of the CDT, at the pos'th position of the face. */
-          int in_tm_face_index = cd.input_face[in_face_index];
-          BLI_assert(in_tm_face_index < in_tm.face_size());
-          const Face *facep = in_tm.face(in_tm_face_index);
-          BLI_assert(pos < facep->size());
-          bool is_rev = cd.is_reversed[in_face_index];
-          int eorig = is_rev ? facep->edge_orig[2 - pos] : facep->edge_orig[pos];
-          if (eorig != NO_INDEX) {
-            return eorig;
-          }
-        }
-        else {
-          /* This edge came from an edge input to the CDT problem,
-           * so it is an intersect edge. */
-          *r_is_intersect = true;
-          /* TODO: maybe there is an orig index:
-           * This happens if an input edge was formed by an input face having
-           * an edge that is co-planar with the cluster, while the face as a whole is not. */
-          return NO_INDEX;
-        }
+  int e = NO_INDEX;
+  if (cd.verts_to_edge.size() > 0) {
+    /* Use the populated map to find the edge, if any, between vertices i0 and i1. */
+    std::pair<int, int> vpair = sorted_int_pair(std::pair<int, int>(i0, i1));
+    e = cd.verts_to_edge.lookup_default(vpair, NO_INDEX);
+  }
+  else {
+    for (int ee : cd.cdt_out.edge.index_range()) {
+      std::pair<int, int> edge = cd.cdt_out.edge[ee];
+      if ((edge.first == i0 && edge.second == i1) || (edge.first == i1 && edge.second == i0)) {
+        e = ee;
+        break;
       }
+    }
+  }
+  if (e == NO_INDEX) {
+    return NO_INDEX;
+  }
+
+  /* Pick an arbitrary orig, but not one equal to NO_INDEX, if we can help it. */
+  /* TODO: if edge has origs from more than on part of the nary input,
+   * then want to set *r_is_intersect to true. */
+  for (int orig_index : cd.cdt_out.edge_orig[e]) {
+    /* orig_index encodes the triangle and pos within the triangle of the input edge. */
+    if (orig_index >= foff) {
+      int in_face_index = (orig_index / foff) - 1;
+      int pos = orig_index % foff;
+      /* We need to retrieve the edge orig field from the Face used to populate the
+       * in_face_index'th face of the CDT, at the pos'th position of the face. */
+      int in_tm_face_index = cd.input_face[in_face_index];
+      BLI_assert(in_tm_face_index < in_tm.face_size());
+      const Face *facep = in_tm.face(in_tm_face_index);
+      BLI_assert(pos < facep->size());
+      bool is_rev = cd.is_reversed[in_face_index];
+      int eorig = is_rev ? facep->edge_orig[2 - pos] : facep->edge_orig[pos];
+      if (eorig != NO_INDEX) {
+        return eorig;
+      }
+    }
+    else {
+      /* This edge came from an edge input to the CDT problem,
+       * so it is an intersect edge. */
+      *r_is_intersect = true;
+      /* TODO: maybe there is an orig index:
+       * This happens if an input edge was formed by an input face having
+       * an edge that is co-planar with the cluster, while the face as a whole is not. */
       return NO_INDEX;
     }
   }
@@ -1919,6 +1874,256 @@ static int get_cdt_edge_orig(
 }
 
 /**
+ * Make a Face from the CDT output triangle cdt_out_t, which has corresponding input triangle
+ * cdt_in_t. The triangle indices that are in cd.input_face are from IMesh tm.
+ * Return a pointer to the newly constructed Face.
+ */
+static Face *cdt_tri_as_imesh_face(
+    int cdt_out_t, int cdt_in_t, const CDT_data &cd, const IMesh &tm, IMeshArena *arena)
+{
+  const CDT_result<mpq_class> &cdt_out = cd.cdt_out;
+  int t_orig = tm.face(cd.input_face[cdt_in_t])->orig;
+  BLI_assert(cdt_out.face[cdt_out_t].size() == 3);
+  int i0 = cdt_out.face[cdt_out_t][0];
+  int i1 = cdt_out.face[cdt_out_t][1];
+  int i2 = cdt_out.face[cdt_out_t][2];
+  mpq3 v0co = unproject_cdt_vert(cd, cdt_out.vert[i0]);
+  mpq3 v1co = unproject_cdt_vert(cd, cdt_out.vert[i1]);
+  mpq3 v2co = unproject_cdt_vert(cd, cdt_out.vert[i2]);
+  /* No need to provide an original index: if coord matches
+   * an original one, then it will already be in the arena
+   * with the correct orig field. */
+  const Vert *v0 = arena->add_or_find_vert(v0co, NO_INDEX);
+  const Vert *v1 = arena->add_or_find_vert(v1co, NO_INDEX);
+  const Vert *v2 = arena->add_or_find_vert(v2co, NO_INDEX);
+  Face *facep;
+  bool is_isect0;
+  bool is_isect1;
+  bool is_isect2;
+  if (cd.is_reversed[cdt_in_t]) {
+    int oe0 = get_cdt_edge_orig(i0, i2, cd, tm, &is_isect0);
+    int oe1 = get_cdt_edge_orig(i2, i1, cd, tm, &is_isect1);
+    int oe2 = get_cdt_edge_orig(i1, i0, cd, tm, &is_isect2);
+    facep = arena->add_face(
+        {v0, v2, v1}, t_orig, {oe0, oe1, oe2}, {is_isect0, is_isect1, is_isect2});
+  }
+  else {
+    int oe0 = get_cdt_edge_orig(i0, i1, cd, tm, &is_isect0);
+    int oe1 = get_cdt_edge_orig(i1, i2, cd, tm, &is_isect1);
+    int oe2 = get_cdt_edge_orig(i2, i0, cd, tm, &is_isect2);
+    facep = arena->add_face(
+        {v0, v1, v2}, t_orig, {oe0, oe1, oe2}, {is_isect0, is_isect1, is_isect2});
+  }
+  facep->populate_plane(false);
+  return facep;
+}
+
+/**
+ * Tessellate face f into triangles and return an array of `const Face *`
+ * giving that triangulation. Intended to be used when f has > 4 vertices.
+ * Care is taken so that the original edge index associated with
+ * each edge in the output triangles either matches the original edge
+ * for the (identical) edge of f, or else is -1. So diagonals added
+ * for triangulation can later be identified by having #NO_INDEX for original.
+ *
+ * This code uses Blenlib's BLI_polyfill_calc to do triangulation, and is therefore quite fast.
+ * Unfortunately, it can product degenerate triangles that mesh_intersect will remove, leaving
+ * the mesh non-PWN.
+ */
+static Array<Face *> polyfill_triangulate_poly(Face *f, IMeshArena *arena)
+{
+  /* Similar to loop body in BM_mesh_calc_tesselation. */
+  int flen = f->size();
+  BLI_assert(flen > 4);
+  if (!f->plane_populated()) {
+    f->populate_plane(false);
+  }
+  /* Project along negative face normal so (x,y) can be used in 2d. */
+  const double3 &poly_normal = f->plane->norm;
+  float no[3] = {float(poly_normal[0]), float(poly_normal[1]), float(poly_normal[2])};
+  normalize_v3(no);
+  float axis_mat[3][3];
+  float(*projverts)[2];
+  unsigned int(*tris)[3];
+  const int totfilltri = flen - 2;
+  /* Prepare projected vertices and array to receive triangles in tesselation. */
+  tris = static_cast<unsigned int(*)[3]>(MEM_malloc_arrayN(totfilltri, sizeof(*tris), __func__));
+  projverts = static_cast<float(*)[2]>(MEM_malloc_arrayN(flen, sizeof(*projverts), __func__));
+  axis_dominant_v3_to_m3_negate(axis_mat, no);
+  for (int j = 0; j < flen; ++j) {
+    const double3 &dco = (*f)[j]->co;
+    float co[3] = {float(dco[0]), float(dco[1]), float(dco[2])};
+    mul_v2_m3v3(projverts[j], axis_mat, co);
+  }
+  BLI_polyfill_calc(projverts, flen, 1, tris);
+  /* Put tesselation triangles into Face form. Record original edges where they exist. */
+  Array<Face *> ans(totfilltri);
+  for (int t = 0; t < totfilltri; ++t) {
+    unsigned int *tri = tris[t];
+    int eo[3];
+    const Vert *v[3];
+    for (int k = 0; k < 3; k++) {
+      BLI_assert(tri[k] < flen);
+      v[k] = (*f)[tri[k]];
+      /* If tri edge goes between two successive indices in
+       * the original face, then it is an original edge. */
+      if ((tri[k] + 1) % flen == tri[(k + 1) % 3]) {
+        eo[k] = f->edge_orig[tri[k]];
+      }
+      else {
+        eo[k] = NO_INDEX;
+      }
+      ans[t] = arena->add_face(
+          {v[0], v[1], v[2]}, f->orig, {eo[0], eo[1], eo[2]}, {false, false, false});
+    }
+  }
+
+  MEM_freeN(tris);
+  MEM_freeN(projverts);
+
+  return ans;
+}
+
+/**
+ * Tessellate face f into triangles and return an array of `const Face *`
+ * giving that triangulation.
+ * Care is taken so that the original edge index associated with
+ * each edge in the output triangles either matches the original edge
+ * for the (identical) edge of f, or else is -1. So diagonals added
+ * for triangulation can later be identified by having #NO_INDEX for original.
+ *
+ * The method used is to use the CDT triangulation. Usually that triangulation
+ * will only use the existing vertices. However, if the face self-intersects
+ * then the CDT triangulation will include the intersection points.
+ * If this happens, we use the polyfill triangulator instead. We don't
+ * use the polyfill triangulator by default because it can create degenerate
+ * triangles (which we can handle but they'll create non-manifold meshes).
+ */
+static Array<Face *> triangulate_poly(Face *f, IMeshArena *arena)
+{
+  int flen = f->size();
+  CDT_input<mpq_class> cdt_in;
+  cdt_in.vert = Array<mpq2>(flen);
+  cdt_in.face = Array<Vector<int>>(1);
+  cdt_in.face[0].reserve(flen);
+  for (int i : f->index_range()) {
+    cdt_in.face[0].append(i);
+  }
+  /* Project poly along dominant axis of normal to get 2d coords. */
+  if (!f->plane_populated()) {
+    f->populate_plane(false);
+  }
+  const double3 &poly_normal = f->plane->norm;
+  int axis = double3::dominant_axis(poly_normal);
+  /* If project down y axis as opposed to x or z, the orientation
+   * of the polygon will be reversed.
+   * Yet another reversal happens if the poly normal in the dominant
+   * direction is opposite that of the positive dominant axis. */
+  bool rev1 = (axis == 1);
+  bool rev2 = poly_normal[axis] < 0;
+  bool rev = rev1 ^ rev2;
+  for (int i = 0; i < flen; ++i) {
+    int ii = rev ? flen - i - 1 : i;
+    mpq2 &p2d = cdt_in.vert[ii];
+    int k = 0;
+    for (int j = 0; j < 3; ++j) {
+      if (j != axis) {
+        p2d[k++] = (*f)[ii]->co_exact[j];
+      }
+    }
+  }
+  CDT_result<mpq_class> cdt_out = delaunay_2d_calc(cdt_in, CDT_INSIDE);
+  int n_tris = cdt_out.face.size();
+  Array<Face *> ans(n_tris);
+  for (int t = 0; t < n_tris; ++t) {
+    int i_v_out[3];
+    const Vert *v[3];
+    int eo[3];
+    bool needs_steiner = false;
+    for (int i = 0; i < 3; ++i) {
+      i_v_out[i] = cdt_out.face[t][i];
+      if (cdt_out.vert_orig[i_v_out[i]].size() == 0) {
+        needs_steiner = true;
+        break;
+      }
+      v[i] = (*f)[cdt_out.vert_orig[i_v_out[i]][0]];
+    }
+    if (needs_steiner) {
+      /* Fall back on the polyfill triangulator. */
+      return polyfill_triangulate_poly(f, arena);
+    }
+    Map<std::pair<int, int>, int> verts_to_edge;
+    populate_cdt_edge_map(verts_to_edge, cdt_out);
+    int foff = cdt_out.face_edge_offset;
+    for (int i = 0; i < 3; ++i) {
+      std::pair<int, int> vpair(i_v_out[i], i_v_out[(i + 1) % 3]);
+      std::pair<int, int> vpair_canon = sorted_int_pair(vpair);
+      int e_out = verts_to_edge.lookup_default(vpair_canon, NO_INDEX);
+      BLI_assert(e_out != NO_INDEX);
+      eo[i] = NO_INDEX;
+      for (int orig : cdt_out.edge_orig[e_out]) {
+        if (orig >= foff) {
+          int pos = orig % foff;
+          BLI_assert(pos < f->size());
+          eo[i] = f->edge_orig[pos];
+          break;
+        }
+      }
+    }
+    if (rev) {
+      ans[t] = arena->add_face(
+          {v[0], v[2], v[1]}, f->orig, {eo[2], eo[1], eo[0]}, {false, false, false});
+    }
+    else {
+      ans[t] = arena->add_face(
+          {v[0], v[1], v[2]}, f->orig, {eo[0], eo[1], eo[2]}, {false, false, false});
+    }
+  }
+  return ans;
+}
+
+/**
+ * Return an #IMesh that is a triangulation of a mesh with general
+ * polygonal faces, #IMesh.
+ * Added diagonals will be distinguishable by having edge original
+ * indices of #NO_INDEX.
+ */
+IMesh triangulate_polymesh(IMesh &imesh, IMeshArena *arena)
+{
+  Vector<Face *> face_tris;
+  constexpr int estimated_tris_per_face = 3;
+  face_tris.reserve(estimated_tris_per_face * imesh.face_size());
+  for (Face *f : imesh.faces()) {
+    /* Tessellate face f, following plan similar to #BM_face_calc_tesselation. */
+    int flen = f->size();
+    if (flen == 3) {
+      face_tris.append(f);
+    }
+    else if (flen == 4) {
+      const Vert *v0 = (*f)[0];
+      const Vert *v1 = (*f)[1];
+      const Vert *v2 = (*f)[2];
+      const Vert *v3 = (*f)[3];
+      int eo_01 = f->edge_orig[0];
+      int eo_12 = f->edge_orig[1];
+      int eo_23 = f->edge_orig[2];
+      int eo_30 = f->edge_orig[3];
+      Face *f0 = arena->add_face({v0, v1, v2}, f->orig, {eo_01, eo_12, -1}, {false, false, false});
+      Face *f1 = arena->add_face({v0, v2, v3}, f->orig, {-1, eo_23, eo_30}, {false, false, false});
+      face_tris.append(f0);
+      face_tris.append(f1);
+    }
+    else {
+      Array<Face *> tris = triangulate_poly(f, arena);
+      for (Face *tri : tris) {
+        face_tris.append(tri);
+      }
+    }
+  }
+  return IMesh(face_tris);
+}
+
+/**
  * Using the result of CDT in cd.cdt_out, extract an #IMesh representing the subdivision
  * of input triangle t, which should be an element of cd.input_face.
  */
@@ -1939,55 +2144,17 @@ static IMesh extract_subdivided_tri(const CDT_data &cd,
     BLI_assert(false);
     return IMesh();
   }
-  int t_orig = in_tm.face(t)->orig;
   constexpr int inline_buf_size = 20;
   Vector<Face *, inline_buf_size> faces;
   for (int f : cdt_out.face.index_range()) {
     if (cdt_out.face_orig[f].contains(t_in_cdt)) {
-      BLI_assert(cdt_out.face[f].size() == 3);
-      int i0 = cdt_out.face[f][0];
-      int i1 = cdt_out.face[f][1];
-      int i2 = cdt_out.face[f][2];
-      mpq3 v0co = unproject_cdt_vert(cd, cdt_out.vert[i0]);
-      mpq3 v1co = unproject_cdt_vert(cd, cdt_out.vert[i1]);
-      mpq3 v2co = unproject_cdt_vert(cd, cdt_out.vert[i2]);
-      /* No need to provide an original index: if coord matches
-       * an original one, then it will already be in the arena
-       * with the correct orig field. */
-      const Vert *v0 = arena->add_or_find_vert(v0co, NO_INDEX);
-      const Vert *v1 = arena->add_or_find_vert(v1co, NO_INDEX);
-      const Vert *v2 = arena->add_or_find_vert(v2co, NO_INDEX);
-      Face *facep;
-      bool is_isect0;
-      bool is_isect1;
-      bool is_isect2;
-      if (cd.is_reversed[t_in_cdt]) {
-        int oe0 = get_cdt_edge_orig(i0, i2, cd, in_tm, &is_isect0);
-        int oe1 = get_cdt_edge_orig(i2, i1, cd, in_tm, &is_isect1);
-        int oe2 = get_cdt_edge_orig(i1, i0, cd, in_tm, &is_isect2);
-        facep = arena->add_face(
-            {v0, v2, v1}, t_orig, {oe0, oe1, oe2}, {is_isect0, is_isect1, is_isect2});
-      }
-      else {
-        int oe0 = get_cdt_edge_orig(i0, i1, cd, in_tm, &is_isect0);
-        int oe1 = get_cdt_edge_orig(i1, i2, cd, in_tm, &is_isect1);
-        int oe2 = get_cdt_edge_orig(i2, i0, cd, in_tm, &is_isect2);
-        facep = arena->add_face(
-            {v0, v1, v2}, t_orig, {oe0, oe1, oe2}, {is_isect0, is_isect1, is_isect2});
-      }
-      facep->populate_plane(false);
+      Face *facep = cdt_tri_as_imesh_face(f, t_in_cdt, cd, in_tm, arena);
       faces.append(facep);
     }
   }
   return IMesh(faces);
 }
 
-static IMesh extract_single_tri(const IMesh &tm, int t)
-{
-  Face *f = tm.face(t);
-  return IMesh({f});
-}
-
 static bool bvhtreeverlap_cmp(const BVHTreeOverlap &a, const BVHTreeOverlap &b)
 {
   if (a.indexA < b.indexA) {
@@ -2203,7 +2370,7 @@ static void calc_overlap_itts(Map<std::pair<int, int>, ITT_value> &itt_map,
 }
 
 /**
- * Data needed for parallelization of calc_subdivided_tris.
+ * Data needed for parallelization of calc_subdivided_non_cluster_tris.
  */
 struct OverlapTriRange {
   int tri_index;
@@ -2280,14 +2447,13 @@ static void calc_subdivided_tri_range_func(void *__restrict userdata,
  * r_tri_subdivided with the result of intersecting it with
  * all the other triangles in the mesh, if it intersects any others.
  * But don't do this for triangles that are part of a cluster.
- * Also, do nothing here if the answer is just the triangle itself.
  */
-static void calc_subdivided_tris(Array<IMesh> &r_tri_subdivided,
-                                 const IMesh &tm,
-                                 const Map<std::pair<int, int>, ITT_value> &itt_map,
-                                 const CoplanarClusterInfo &clinfo,
-                                 const TriOverlaps &ov,
-                                 IMeshArena *arena)
+static void calc_subdivided_non_cluster_tris(Array<IMesh> &r_tri_subdivided,
+                                             const IMesh &tm,
+                                             const Map<std::pair<int, int>, ITT_value> &itt_map,
+                                             const CoplanarClusterInfo &clinfo,
+                                             const TriOverlaps &ov,
+                                             IMeshArena *arena)
 {
   const int dbg_level = 0;
   if (dbg_level > 0) {
@@ -2327,6 +2493,54 @@ static void calc_subdivided_tris(Array<IMesh> &r_tri_subdivided,
   settings.use_threading = intersect_use_threading;
   BLI_task_parallel_range(
       0, overlap_tri_range_tot, &data, calc_subdivided_tri_range_func, &settings);
+  /* Now have to put in the triangles that are the same as the input ones, and not in clusters.
+   */
+  for (int t : tm.face_index_range()) {
+    if (r_tri_subdivided[t].face_size() == 0 && clinfo.tri_cluster(t) == NO_INDEX) {
+      r_tri_subdivided[t] = IMesh({tm.face(t)});
+    }
+  }
+}
+
+/**
+ * For each cluster in clinfo, extract the triangles from the cluster
+ * that correspond to each original triangle t that is part of the cluster,
+ * and put the resulting triangles into an IMesh in tri_subdivided[t].
+ * We have already done the CDT for the triangles in the cluster, whose
+ * result is in cluster_subdivided[c] for each cluster c.
+ */
+static void calc_cluster_tris(Array<IMesh> &tri_subdivided,
+                              const IMesh &tm,
+                              const CoplanarClusterInfo &clinfo,
+                              const Array<CDT_data> &cluster_subdivided,
+                              IMeshArena *arena)
+{
+  for (int c : clinfo.index_range()) {
+    const CoplanarCluster &cl = clinfo.cluster(c);
+    const CDT_data &cd = cluster_subdivided[c];
+    /* Each triangle in cluster c should be an input triangle in cd.input_faces.
+     * (See prepare_cdt_input_for_cluster.)
+     * So accumulate a Vector of Face* for each input face by going through the
+     * output faces and making a Face for each input face that it is part of.
+     * (The Boolean algorithm wants duplicates if a given output triangle is part
+     * of more than one input triangle.)
+     */
+    int n_cluster_tris = cl.tot_tri();
+    const CDT_result<mpq_class> &cdt_out = cd.cdt_out;
+    BLI_assert(cd.input_face.size() == n_cluster_tris);
+    Array<Vector<Face *>> face_vec(n_cluster_tris);
+    for (int cdt_out_t : cdt_out.face.index_range()) {
+      for (int cdt_in_t : cdt_out.face_orig[cdt_out_t]) {
+        Face *f = cdt_tri_as_imesh_face(cdt_out_t, cdt_in_t, cd, tm, arena);
+        face_vec[cdt_in_t].append(f);
+      }
+    }
+    for (int cdt_in_t : cd.input_face.index_range()) {
+      int tm_t = cd.input_face[cdt_in_t];
+      BLI_assert(tri_subdivided[tm_t].face_size() == 0);
+      tri_subdivided[tm_t] = IMesh(face_vec[cdt_in_t]);
+    }
+  }
 }
 
 static CDT_data calc_cluster_subdivided(const CoplanarClusterInfo &clinfo,
@@ -2699,10 +2913,11 @@ IMesh trimesh_nary_intersect(const IMesh &tm_in,
   doperfmax(2, tri_ov.overlap().size());
 #  endif
   Array<IMesh> tri_subdivided(tm_clean->face_size());
-  calc_subdivided_tris(tri_subdivided, *tm_clean, itt_map, clinfo, tri_ov, arena);
+  calc_subdivided_non_cluster_tris(tri_subdivided, *tm_clean, itt_map, clinfo, tri_ov, arena);
 #  ifdef PERFDEBUG
   double subdivided_tris_time = PIL_check_seconds_timer();
-  std::cout << "subdivided tris found, time = " << subdivided_tris_time - itt_time << "\n";
+  std::cout << "subdivided non-cluster tris found, time = " << subdivided_tris_time - itt_time
+            << "\n";
 #  endif
   Array<CDT_data> cluster_subdivided(clinfo.tot_cluster());
   for (int c : clinfo.index_range()) {
@@ -2713,19 +2928,11 @@ IMesh trimesh_nary_intersect(const IMesh &tm_in,
   std::cout << "subdivided clusters found, time = "
             << cluster_subdivide_time - subdivided_tris_time << "\n";
 #  endif
-  for (int t : tm_clean->face_index_range()) {
-    int c = clinfo.tri_cluster(t);
-    if (c != NO_INDEX) {
-      BLI_assert(tri_subdivided[t].face_size() == 0);
-      tri_subdivided[t] = extract_subdivided_tri(cluster_subdivided[c], *tm_clean, t, arena);
-    }
-    else if (tri_subdivided[t].face_size() == 0) {
-      tri_subdivided[t] = extract_single_tri(*tm_clean, t);
-    }
-  }
+  calc_cluster_tris(tri_subdivided, *tm_clean, clinfo, cluster_subdivided, arena);
 #  ifdef PERFDEBUG
   double extract_time = PIL_check_seconds_timer();
-  std::cout << "triangles extracted, time = " << extract_time - cluster_subdivide_time << "\n";
+  std::cout << "subdivided cluster tris found, time = " << extract_time - cluster_subdivide_time
+            << "\n";
 #  endif
   IMesh combined = union_tri_subdivides(tri_subdivided);
   if (dbg_level > 1) {
diff --git a/source/blender/blenlib/intern/noise.c b/source/blender/blenlib/intern/noise.c
index 996a1622239..8e28088c9fa 100644
--- a/source/blender/blenlib/intern/noise.c
+++ b/source/blender/blenlib/intern/noise.c
@@ -1121,7 +1121,7 @@ static float voronoi_CrS(float x, float y, float z)
 /** \name Cell-Noise Implementation
  * \{ */
 
-/* returns unsigned cellnoise */
+/** Returns unsigned cell-noise. */
 static float BLI_cellNoiseU(float x, float y, float z)
 {
   /* avoid precision issues on unit coordinates */
@@ -1166,7 +1166,9 @@ void BLI_noise_cell_v3(float x, float y, float z, float ca[3])
 /** \name Public API's
  * \{ */
 
-/* newnoise: generic noise function for use with different noisebases */
+/**
+ * newnoise: generic noise function for use with different `noisebasis`.
+ */
 float BLI_noise_generic_noise(
     float noisesize, float x, float y, float z, bool hard, int noisebasis)
 {
diff --git a/source/blender/blenlib/intern/path_util.c b/source/blender/blenlib/intern/path_util.c
index b076d0f09c1..e8027836ed6 100644
--- a/source/blender/blenlib/intern/path_util.c
+++ b/source/blender/blenlib/intern/path_util.c
@@ -1044,17 +1044,17 @@ bool BLI_path_abs(char *path, const char *basepath)
 #else
   BLI_strncpy(tmp, path, sizeof(tmp));
 
-  /* Check for loading a windows path on a posix system
-   * in this case, there is no use in trying C:/ since it
-   * will never exist on a unix os.
+  /* Check for loading a MS-Windows path on a POSIX system
+   * in this case, there is no use in trying `C:/` since it
+   * will never exist on a Unix system.
    *
-   * Add a '/' prefix and lowercase the drive-letter, remove the ':'.
-   * C:\foo.JPG -> /c/foo.JPG */
+   * Add a `/` prefix and lowercase the drive-letter, remove the `:`.
+   * `C:\foo.JPG` -> `/c/foo.JPG` */
 
   if (isalpha(tmp[0]) && (tmp[1] == ':') && ELEM(tmp[2], '\\', '/')) {
     tmp[1] = tolower(tmp[0]); /* Replace ':' with drive-letter. */
     tmp[0] = '/';
-    /* '\' the slash will be converted later */
+    /* `\` the slash will be converted later. */
   }
 
 #endif
@@ -2013,9 +2013,9 @@ void BLI_path_slash_native(char *path)
 {
 #ifdef WIN32
   if (path && BLI_strnlen(path, 3) > 2) {
-    BLI_str_replace_char(path + 2, '/', '\\');
+    BLI_str_replace_char(path + 2, ALTSEP, SEP);
   }
 #else
-  BLI_str_replace_char(path + BLI_path_unc_prefix_len(path), '\\', '/');
+  BLI_str_replace_char(path + BLI_path_unc_prefix_len(path), ALTSEP, SEP);
 #endif
 }
diff --git a/source/blender/blenlib/intern/polyfill_2d_beautify.c b/source/blender/blenlib/intern/polyfill_2d_beautify.c
index 7bfca149ffb..98fa5c872b0 100644
--- a/source/blender/blenlib/intern/polyfill_2d_beautify.c
+++ b/source/blender/blenlib/intern/polyfill_2d_beautify.c
@@ -375,7 +375,7 @@ void BLI_polyfill_beautify(const float (*coords)[2],
   for (uint i = 0, base_index = 0; i < order_edges_len; base_index++) {
     const struct OrderEdge *oe_a = &order_edges[i++];
     const struct OrderEdge *oe_b = &order_edges[i++];
-    BLI_assert(oe_a->verts[0] == oe_a->verts[0] && oe_a->verts[1] == oe_a->verts[1]);
+    BLI_assert(oe_a->verts[0] == oe_b->verts[0] && oe_a->verts[1] == oe_b->verts[1]);
     half_edges[oe_a->e_half].e_radial = oe_b->e_half;
     half_edges[oe_b->e_half].e_radial = oe_a->e_half;
     half_edges[oe_a->e_half].base_index = base_index;
diff --git a/source/blender/blenlib/intern/storage.c b/source/blender/blenlib/intern/storage.c
index 287334a34ee..8964dac31a9 100644
--- a/source/blender/blenlib/intern/storage.c
+++ b/source/blender/blenlib/intern/storage.c
@@ -266,7 +266,8 @@ eFileAttributes BLI_file_attributes(const char *path)
   if (attr & FILE_ATTRIBUTE_SPARSE_FILE) {
     ret |= FILE_ATTR_SPARSE_FILE;
   }
-  if (attr & FILE_ATTRIBUTE_OFFLINE) {
+  if (attr & FILE_ATTRIBUTE_OFFLINE || attr & FILE_ATTRIBUTE_RECALL_ON_OPEN ||
+      attr & FILE_ATTRIBUTE_RECALL_ON_DATA_ACCESS) {
     ret |= FILE_ATTR_OFFLINE;
   }
   if (attr & FILE_ATTRIBUTE_REPARSE_POINT) {
@@ -292,19 +293,23 @@ bool BLI_file_alias_target(const char *filepath,
                            /* This parameter can only be `const` on Linux since
                             * redirections are not supported there.
                             * NOLINTNEXTLINE: readability-non-const-parameter. */
-                           char r_targetpath[FILE_MAXDIR])
+                           char r_targetpath[/*FILE_MAXDIR*/])
 {
 #  ifdef WIN32
   if (!BLI_path_extension_check(filepath, ".lnk")) {
     return false;
   }
 
-  IShellLinkW *Shortcut = NULL;
-  bool success = false;
-  CoInitializeEx(NULL, COINIT_MULTITHREADED);
+  HRESULT hr = CoInitializeEx(NULL, COINIT_MULTITHREADED);
+  if (FAILED(hr)) {
+    return false;
+  }
 
-  HRESULT hr = CoCreateInstance(
+  IShellLinkW *Shortcut = NULL;
+  hr = CoCreateInstance(
       &CLSID_ShellLink, NULL, CLSCTX_INPROC_SERVER, &IID_IShellLinkW, (LPVOID *)&Shortcut);
+
+  bool success = false;
   if (SUCCEEDED(hr)) {
     IPersistFile *PersistFile;
     hr = Shortcut->lpVtbl->QueryInterface(Shortcut, &IID_IPersistFile, (LPVOID *)&PersistFile);
@@ -328,6 +333,7 @@ bool BLI_file_alias_target(const char *filepath,
     Shortcut->lpVtbl->Release(Shortcut);
   }
 
+  CoUninitialize();
   return (success && r_targetpath[0]);
 #  else
   UNUSED_VARS(r_targetpath, filepath);
diff --git a/source/blender/blenlib/intern/string.c b/source/blender/blenlib/intern/string.c
index ccc11af9f2b..3d40c6ef146 100644
--- a/source/blender/blenlib/intern/string.c
+++ b/source/blender/blenlib/intern/string.c
@@ -540,6 +540,29 @@ void BLI_str_replace_char(char *str, char src, char dst)
 }
 
 /**
+ * Simple exact-match string replacement.
+ *
+ * \param replace_table: Array of source, destination pairs.
+ *
+ * \note Larger tables should use a hash table.
+ */
+bool BLI_str_replace_table_exact(char *string,
+                                 const size_t string_len,
+                                 const char *replace_table[][2],
+                                 int replace_table_len)
+{
+  for (int i = 0; i < replace_table_len; i++) {
+    if (STREQ(string, replace_table[i][0])) {
+      BLI_strncpy(string, replace_table[i][1], string_len);
+      return true;
+    }
+  }
+  return false;
+}
+
+/** \} */
+
+/**
  * Compare two strings without regard to case.
  *
  * \retval True if the strings are equal, false otherwise.
diff --git a/source/blender/blenlib/intern/task_iterator.c b/source/blender/blenlib/intern/task_iterator.c
index 38271e5823f..ee41c277b34 100644
--- a/source/blender/blenlib/intern/task_iterator.c
+++ b/source/blender/blenlib/intern/task_iterator.c
@@ -29,11 +29,16 @@
 #include "BLI_listbase.h"
 #include "BLI_math.h"
 #include "BLI_mempool.h"
+#include "BLI_mempool_private.h"
 #include "BLI_task.h"
 #include "BLI_threads.h"
 
 #include "atomic_ops.h"
 
+/* -------------------------------------------------------------------- */
+/** \name Macros
+ * \{ */
+
 /* Allows to avoid using malloc for userdata_chunk in tasks, when small enough. */
 #define MALLOCA(_size) ((_size) <= 8192) ? alloca((_size)) : MEM_mallocN((_size), __func__)
 #define MALLOCA_FREE(_mem, _size) \
@@ -42,6 +47,12 @@
   } \
   ((void)0)
 
+/** \} */
+
+/* -------------------------------------------------------------------- */
+/** \name Generic Iteration
+ * \{ */
+
 BLI_INLINE void task_parallel_calc_chunk_size(const TaskParallelSettings *settings,
                                               const int tot_items,
                                               int num_tasks,
@@ -182,9 +193,12 @@ static void task_parallel_iterator_no_threads(const TaskParallelSettings *settin
 
   parallel_iterator_func_do(state, userdata_chunk);
 
-  if (use_userdata_chunk && settings->func_free != NULL) {
-    /* `func_free` should only free data that was created during execution of `func`. */
-    settings->func_free(state->userdata, userdata_chunk_local);
+  if (use_userdata_chunk) {
+    if (settings->func_free != NULL) {
+      /* `func_free` should only free data that was created during execution of `func`. */
+      settings->func_free(state->userdata, userdata_chunk_local);
+    }
+    MALLOCA_FREE(userdata_chunk_local, userdata_chunk_size);
   }
 }
 
@@ -223,7 +237,7 @@ static void task_parallel_iterator_do(const TaskParallelSettings *settings,
   void *userdata_chunk_array = NULL;
   const bool use_userdata_chunk = (userdata_chunk_size != 0) && (userdata_chunk != NULL);
 
-  TaskPool *task_pool = BLI_task_pool_create(state, TASK_PRIORITY_HIGH);
+  TaskPool *task_pool = BLI_task_pool_create(state, TASK_PRIORITY_HIGH, TASK_ISOLATION_ON);
 
   if (use_userdata_chunk) {
     userdata_chunk_array = MALLOCA(userdata_chunk_size * num_tasks);
@@ -241,14 +255,16 @@ static void task_parallel_iterator_do(const TaskParallelSettings *settings,
   BLI_task_pool_work_and_wait(task_pool);
   BLI_task_pool_free(task_pool);
 
-  if (use_userdata_chunk && (settings->func_reduce != NULL || settings->func_free != NULL)) {
-    for (size_t i = 0; i < num_tasks; i++) {
-      userdata_chunk_local = (char *)userdata_chunk_array + (userdata_chunk_size * i);
-      if (settings->func_reduce != NULL) {
-        settings->func_reduce(state->userdata, userdata_chunk, userdata_chunk_local);
-      }
-      if (settings->func_free != NULL) {
-        settings->func_free(state->userdata, userdata_chunk_local);
+  if (use_userdata_chunk) {
+    if (settings->func_reduce != NULL || settings->func_free != NULL) {
+      for (size_t i = 0; i < num_tasks; i++) {
+        userdata_chunk_local = (char *)userdata_chunk_array + (userdata_chunk_size * i);
+        if (settings->func_reduce != NULL) {
+          settings->func_reduce(state->userdata, userdata_chunk, userdata_chunk_local);
+        }
+        if (settings->func_free != NULL) {
+          settings->func_free(state->userdata, userdata_chunk_local);
+        }
       }
     }
     MALLOCA_FREE(userdata_chunk_array, userdata_chunk_size * num_tasks);
@@ -294,6 +310,12 @@ void BLI_task_parallel_iterator(void *userdata,
   task_parallel_iterator_do(settings, &state);
 }
 
+/** \} */
+
+/* -------------------------------------------------------------------- */
+/** \name ListBase Iteration
+ * \{ */
+
 static void task_parallel_listbase_get(void *__restrict UNUSED(userdata),
                                        const TaskParallelTLS *__restrict UNUSED(tls),
                                        void **r_next_item,
@@ -343,8 +365,11 @@ void BLI_task_parallel_listbase(ListBase *listbase,
   task_parallel_iterator_do(settings, &state);
 }
 
-#undef MALLOCA
-#undef MALLOCA_FREE
+/** \} */
+
+/* -------------------------------------------------------------------- */
+/** \name MemPool Iteration
+ * \{ */
 
 typedef struct ParallelMempoolState {
   void *userdata;
@@ -354,11 +379,12 @@ typedef struct ParallelMempoolState {
 static void parallel_mempool_func(TaskPool *__restrict pool, void *taskdata)
 {
   ParallelMempoolState *__restrict state = BLI_task_pool_user_data(pool);
-  BLI_mempool_iter *iter = taskdata;
-  MempoolIterData *item;
+  BLI_mempool_threadsafe_iter *iter = &((ParallelMempoolTaskData *)taskdata)->ts_iter;
+  TaskParallelTLS *tls = &((ParallelMempoolTaskData *)taskdata)->tls;
 
-  while ((item = BLI_mempool_iterstep(iter)) != NULL) {
-    state->func(state->userdata, item);
+  MempoolIterData *item;
+  while ((item = mempool_iter_threadsafe_step(iter)) != NULL) {
+    state->func(state->userdata, item, tls);
   }
 }
 
@@ -368,16 +394,14 @@ static void parallel_mempool_func(TaskPool *__restrict pool, void *taskdata)
  * \param mempool: The iterable BLI_mempool to loop over.
  * \param userdata: Common userdata passed to all instances of \a func.
  * \param func: Callback function.
- * \param use_threading: If \a true, actually split-execute loop in threads,
- * else just do a sequential for loop
- * (allows caller to use any kind of test to switch on parallelization or not).
+ * \param settings: See public API doc of TaskParallelSettings for description of all settings.
  *
  * \note There is no static scheduling here.
  */
 void BLI_task_parallel_mempool(BLI_mempool *mempool,
                                void *userdata,
                                TaskParallelMempoolFunc func,
-                               const bool use_threading)
+                               const TaskParallelSettings *settings)
 {
   TaskPool *task_pool;
   ParallelMempoolState state;
@@ -387,18 +411,38 @@ void BLI_task_parallel_mempool(BLI_mempool *mempool,
     return;
   }
 
-  if (!use_threading) {
+  void *userdata_chunk = settings->userdata_chunk;
+  const size_t userdata_chunk_size = settings->userdata_chunk_size;
+  void *userdata_chunk_local = NULL;
+  void *userdata_chunk_array = NULL;
+  const bool use_userdata_chunk = (userdata_chunk_size != 0) && (userdata_chunk != NULL);
+
+  if (!settings->use_threading) {
+    TaskParallelTLS tls = {NULL};
+    if (use_userdata_chunk) {
+      userdata_chunk_local = MALLOCA(userdata_chunk_size);
+      memcpy(userdata_chunk_local, userdata_chunk, userdata_chunk_size);
+      tls.userdata_chunk = userdata_chunk_local;
+    }
+
     BLI_mempool_iter iter;
     BLI_mempool_iternew(mempool, &iter);
 
-    for (void *item = BLI_mempool_iterstep(&iter); item != NULL;
-         item = BLI_mempool_iterstep(&iter)) {
-      func(userdata, item);
+    void *item;
+    while ((item = BLI_mempool_iterstep(&iter))) {
+      func(userdata, item, &tls);
+    }
+
+    if (settings->func_free != NULL) {
+      /* `func_free` should only free data that was created during execution of `func`. */
+      settings->func_free(userdata, userdata_chunk_local);
     }
+
+    MALLOCA_FREE(userdata_chunk_local, userdata_chunk_size);
     return;
   }
 
-  task_pool = BLI_task_pool_create(&state, TASK_PRIORITY_HIGH);
+  task_pool = BLI_task_pool_create(&state, TASK_PRIORITY_HIGH, TASK_ISOLATION_ON);
   num_threads = BLI_task_scheduler_num_threads();
 
   /* The idea here is to prevent creating task for each of the loop iterations
@@ -410,16 +454,46 @@ void BLI_task_parallel_mempool(BLI_mempool *mempool,
   state.userdata = userdata;
   state.func = func;
 
-  BLI_mempool_iter *mempool_iterators = BLI_mempool_iter_threadsafe_create(mempool,
-                                                                           (size_t)num_tasks);
+  if (use_userdata_chunk) {
+    userdata_chunk_array = MALLOCA(userdata_chunk_size * num_tasks);
+  }
+
+  ParallelMempoolTaskData *mempool_iterator_data = mempool_iter_threadsafe_create(
+      mempool, (size_t)num_tasks);
 
   for (i = 0; i < num_tasks; i++) {
+    if (use_userdata_chunk) {
+      userdata_chunk_local = (char *)userdata_chunk_array + (userdata_chunk_size * i);
+      memcpy(userdata_chunk_local, userdata_chunk, userdata_chunk_size);
+    }
+    mempool_iterator_data[i].tls.userdata_chunk = userdata_chunk_local;
+
     /* Use this pool's pre-allocated tasks. */
-    BLI_task_pool_push(task_pool, parallel_mempool_func, &mempool_iterators[i], false, NULL);
+    BLI_task_pool_push(task_pool, parallel_mempool_func, &mempool_iterator_data[i], false, NULL);
   }
 
   BLI_task_pool_work_and_wait(task_pool);
   BLI_task_pool_free(task_pool);
 
-  BLI_mempool_iter_threadsafe_free(mempool_iterators);
+  if (use_userdata_chunk) {
+    if ((settings->func_free != NULL) || (settings->func_reduce != NULL)) {
+      for (i = 0; i < num_tasks; i++) {
+        if (settings->func_reduce) {
+          settings->func_reduce(
+              userdata, userdata_chunk, mempool_iterator_data[i].tls.userdata_chunk);
+        }
+        if (settings->func_free) {
+          settings->func_free(userdata, mempool_iterator_data[i].tls.userdata_chunk);
+        }
+      }
+    }
+    MALLOCA_FREE(userdata_chunk_array, userdata_chunk_size * num_tasks);
+  }
+
+  mempool_iter_threadsafe_destroy(mempool_iterator_data);
 }
+
+#undef MALLOCA
+#undef MALLOCA_FREE
+
+/** \} */
diff --git a/source/blender/blenlib/intern/task_pool.cc b/source/blender/blenlib/intern/task_pool.cc
index 00ba659a9c8..d72674c1c00 100644
--- a/source/blender/blenlib/intern/task_pool.cc
+++ b/source/blender/blenlib/intern/task_pool.cc
@@ -22,6 +22,7 @@
 
 #include <cstdlib>
 #include <memory>
+#include <thread>
 #include <utility>
 
 #include "MEM_guardedalloc.h"
@@ -111,15 +112,7 @@ class Task {
   Task &operator=(const Task &other) = delete;
   Task &operator=(Task &&other) = delete;
 
-  /* Execute task. */
-  void operator()() const
-  {
-#ifdef WITH_TBB
-    tbb::this_task_arena::isolate([this] { run(pool, taskdata); });
-#else
-    run(pool, taskdata);
-#endif
-  }
+  void operator()() const;
 };
 
 /* TBB Task Group.
@@ -147,10 +140,6 @@ class TBBTaskGroup : public tbb::task_group {
     }
 #  endif
   }
-
-  ~TBBTaskGroup()
-  {
-  }
 };
 #endif
 
@@ -167,13 +156,16 @@ enum TaskPoolType {
 struct TaskPool {
   TaskPoolType type;
   bool use_threads;
+  TaskIsolation task_isolation;
 
   ThreadMutex user_mutex;
   void *userdata;
 
-  /* TBB task pool. */
 #ifdef WITH_TBB
+  /* TBB task pool. */
   TBBTaskGroup tbb_group;
+  /* This is used to detect a common way to accidentally create a deadlock with task isolation. */
+  std::thread::id task_pool_create_thread_id;
 #endif
   volatile bool is_suspended;
   BLI_mempool *suspended_mempool;
@@ -184,6 +176,36 @@ struct TaskPool {
   volatile bool background_is_canceling;
 };
 
+/* Execute task. */
+void Task::operator()() const
+{
+#ifdef WITH_TBB
+  if (pool->task_isolation == TASK_ISOLATION_ON) {
+    tbb::this_task_arena::isolate([this] { run(pool, taskdata); });
+    return;
+  }
+#endif
+  run(pool, taskdata);
+}
+
+static void assert_on_valid_thread(TaskPool *pool)
+{
+  /* TODO: Remove this `return` to enable the check. */
+  return;
+#ifdef DEBUG
+#  ifdef WITH_TBB
+  if (pool->task_isolation == TASK_ISOLATION_ON) {
+    const std::thread::id current_id = std::this_thread::get_id();
+    /* This task pool is modified from different threads. To avoid deadlocks, `TASK_ISOLATION_OFF`
+     * has to be used. Task isolation can still be used in a more fine-grained way within the
+     * tasks, but should not be enabled for the entire task pool. */
+    BLI_assert(pool->task_pool_create_thread_id == current_id);
+  }
+#  endif
+#endif
+  UNUSED_VARS_NDEBUG(pool);
+}
+
 /* TBB Task Pool.
  *
  * Task pool using the TBB scheduler for tasks. When building without TBB
@@ -369,7 +391,10 @@ static void background_task_pool_free(TaskPool *pool)
 
 /* Task Pool */
 
-static TaskPool *task_pool_create_ex(void *userdata, TaskPoolType type, TaskPriority priority)
+static TaskPool *task_pool_create_ex(void *userdata,
+                                     TaskPoolType type,
+                                     TaskPriority priority,
+                                     TaskIsolation task_isolation)
 {
   const bool use_threads = BLI_task_scheduler_num_threads() > 1 && type != TASK_POOL_NO_THREADS;
 
@@ -385,6 +410,11 @@ static TaskPool *task_pool_create_ex(void *userdata, TaskPoolType type, TaskPrio
 
   pool->type = type;
   pool->use_threads = use_threads;
+  pool->task_isolation = task_isolation;
+
+#ifdef WITH_TBB
+  pool->task_pool_create_thread_id = std::this_thread::get_id();
+#endif
 
   pool->userdata = userdata;
   BLI_mutex_init(&pool->user_mutex);
@@ -407,9 +437,9 @@ static TaskPool *task_pool_create_ex(void *userdata, TaskPoolType type, TaskPrio
 /**
  * Create a normal task pool. Tasks will be executed as soon as they are added.
  */
-TaskPool *BLI_task_pool_create(void *userdata, TaskPriority priority)
+TaskPool *BLI_task_pool_create(void *userdata, TaskPriority priority, TaskIsolation task_isolation)
 {
-  return task_pool_create_ex(userdata, TASK_POOL_TBB, priority);
+  return task_pool_create_ex(userdata, TASK_POOL_TBB, priority, task_isolation);
 }
 
 /**
@@ -424,9 +454,11 @@ TaskPool *BLI_task_pool_create(void *userdata, TaskPriority priority)
  * they could end never being executed, since the 'fallback' background thread is already
  * busy with parent task in single-threaded context).
  */
-TaskPool *BLI_task_pool_create_background(void *userdata, TaskPriority priority)
+TaskPool *BLI_task_pool_create_background(void *userdata,
+                                          TaskPriority priority,
+                                          TaskIsolation task_isolation)
 {
-  return task_pool_create_ex(userdata, TASK_POOL_BACKGROUND, priority);
+  return task_pool_create_ex(userdata, TASK_POOL_BACKGROUND, priority, task_isolation);
 }
 
 /**
@@ -434,9 +466,11 @@ TaskPool *BLI_task_pool_create_background(void *userdata, TaskPriority priority)
  * for until BLI_task_pool_work_and_wait() is called. This helps reducing threading
  * overhead when pushing huge amount of small initial tasks from the main thread.
  */
-TaskPool *BLI_task_pool_create_suspended(void *userdata, TaskPriority priority)
+TaskPool *BLI_task_pool_create_suspended(void *userdata,
+                                         TaskPriority priority,
+                                         TaskIsolation task_isolation)
 {
-  return task_pool_create_ex(userdata, TASK_POOL_TBB_SUSPENDED, priority);
+  return task_pool_create_ex(userdata, TASK_POOL_TBB_SUSPENDED, priority, task_isolation);
 }
 
 /**
@@ -445,7 +479,8 @@ TaskPool *BLI_task_pool_create_suspended(void *userdata, TaskPriority priority)
  */
 TaskPool *BLI_task_pool_create_no_threads(void *userdata)
 {
-  return task_pool_create_ex(userdata, TASK_POOL_NO_THREADS, TASK_PRIORITY_HIGH);
+  return task_pool_create_ex(
+      userdata, TASK_POOL_NO_THREADS, TASK_PRIORITY_HIGH, TASK_ISOLATION_ON);
 }
 
 /**
@@ -454,7 +489,7 @@ TaskPool *BLI_task_pool_create_no_threads(void *userdata)
  */
 TaskPool *BLI_task_pool_create_background_serial(void *userdata, TaskPriority priority)
 {
-  return task_pool_create_ex(userdata, TASK_POOL_BACKGROUND_SERIAL, priority);
+  return task_pool_create_ex(userdata, TASK_POOL_BACKGROUND_SERIAL, priority, TASK_ISOLATION_ON);
 }
 
 void BLI_task_pool_free(TaskPool *pool)
@@ -482,6 +517,8 @@ void BLI_task_pool_push(TaskPool *pool,
                         bool free_taskdata,
                         TaskFreeFunction freedata)
 {
+  assert_on_valid_thread(pool);
+
   Task task(pool, run, taskdata, free_taskdata, freedata);
 
   switch (pool->type) {
@@ -499,6 +536,8 @@ void BLI_task_pool_push(TaskPool *pool,
 
 void BLI_task_pool_work_and_wait(TaskPool *pool)
 {
+  assert_on_valid_thread(pool);
+
   switch (pool->type) {
     case TASK_POOL_TBB:
     case TASK_POOL_TBB_SUSPENDED:
diff --git a/source/blender/blenlib/intern/timecode.c b/source/blender/blenlib/intern/timecode.c
index 9586da941a4..7d7436411ac 100644
--- a/source/blender/blenlib/intern/timecode.c
+++ b/source/blender/blenlib/intern/timecode.c
@@ -216,10 +216,10 @@ size_t BLI_timecode_string_from_time_simple(char *str,
   const int hun = ((int)(fmod(time_seconds, 1.0) * 100));
 
   if (hr) {
-    rlen = BLI_snprintf(str, maxncpy, "%.2d:%.2d:%.2d.%.2d", hr, min, sec, hun);
+    rlen = BLI_snprintf_rlen(str, maxncpy, "%.2d:%.2d:%.2d.%.2d", hr, min, sec, hun);
   }
   else {
-    rlen = BLI_snprintf(str, maxncpy, "%.2d:%.2d.%.2d", min, sec, hun);
+    rlen = BLI_snprintf_rlen(str, maxncpy, "%.2d:%.2d.%.2d", min, sec, hun);
   }
 
   return rlen;
diff --git a/source/blender/blenlib/intern/uvproject.c b/source/blender/blenlib/intern/uvproject.c
index 329c4d48fe8..093d08e643d 100644
--- a/source/blender/blenlib/intern/uvproject.c
+++ b/source/blender/blenlib/intern/uvproject.c
@@ -134,7 +134,7 @@ void BLI_uvproject_from_view(float target[2],
 
 /* 'rotmat' can be `obedit->obmat` when uv project is used.
  * 'winx' and 'winy' can be from `scene->r.xsch/ysch` */
-ProjCameraInfo *BLI_uvproject_camera_info(Object *ob, float (*rotmat)[4], float winx, float winy)
+ProjCameraInfo *BLI_uvproject_camera_info(Object *ob, float rotmat[4][4], float winx, float winy)
 {
   ProjCameraInfo uci;
   Camera *camera = ob->data;
diff --git a/source/blender/blenlib/intern/winstuff.c b/source/blender/blenlib/intern/winstuff.c
index 333b6783087..3aa61d1fec5 100644
--- a/source/blender/blenlib/intern/winstuff.c
+++ b/source/blender/blenlib/intern/winstuff.c
@@ -94,9 +94,9 @@ void BLI_windows_register_blend_extension(const bool background)
   GetModuleFileName(0, BlPath, MAX_PATH);
 
   /* Replace the actual app name with the wrapper. */
-  blender_app = strstr(BlPath, "blender-app.exe");
+  blender_app = strstr(BlPath, "blender.exe");
   if (blender_app != NULL) {
-    strcpy(blender_app, "blender.exe");
+    strcpy(blender_app, "blender-launcher.exe");
   }
 
   /* root is HKLM by default */