58 files changed, 759 insertions, 279 deletions
diff --git a/source/blender/blenlib/BLI_array_utils.h b/source/blender/blenlib/BLI_array_utils.h
index 2847bc960ad..52d41173a0e 100644
--- a/source/blender/blenlib/BLI_array_utils.h
+++ b/source/blender/blenlib/BLI_array_utils.h
@@ -28,52 +28,53 @@
 extern "C" {
 #endif
 
-void _bli_array_reverse(void *arr, unsigned int arr_len, size_t arr_stride);
+void _bli_array_reverse(void *arr, uint arr_len, size_t arr_stride);
 #define BLI_array_reverse(arr, arr_len) _bli_array_reverse(arr, arr_len, sizeof(*(arr)))
 
-void _bli_array_wrap(void *arr, unsigned int arr_len, size_t arr_stride, int dir);
+void _bli_array_wrap(void *arr, uint arr_len, size_t arr_stride, int dir);
 #define BLI_array_wrap(arr, arr_len, dir) _bli_array_wrap(arr, arr_len, sizeof(*(arr)), dir)
 
-void _bli_array_permute(void *arr,
-                        const unsigned int arr_len,
-                        const size_t arr_stride,
-                        const unsigned int *order,
-                        void *arr_temp);
+void _bli_array_permute(
+    void *arr, const uint arr_len, const size_t arr_stride, const uint *order, void *arr_temp);
 #define BLI_array_permute(arr, arr_len, order) \
   _bli_array_permute(arr, arr_len, sizeof(*(arr)), order, NULL)
 #define BLI_array_permute_ex(arr, arr_len, order, arr_temp) \
   _bli_array_permute(arr, arr_len, sizeof(*(arr)), order, arr_temp)
 
-int _bli_array_findindex(const void *arr, unsigned int arr_len, size_t arr_stride, const void *p);
+uint _bli_array_deduplicate_ordered(void *arr, uint arr_len, size_t arr_stride);
+#define BLI_array_deduplicate_ordered(arr, arr_len) \
+  _bli_array_deduplicate_ordered(arr, arr_len, sizeof(*(arr)))
+
+int _bli_array_findindex(const void *arr, uint arr_len, size_t arr_stride, const void *p);
 #define BLI_array_findindex(arr, arr_len, p) _bli_array_findindex(arr, arr_len, sizeof(*(arr)), p)
 
-int _bli_array_rfindindex(const void *arr, unsigned int arr_len, size_t arr_stride, const void *p);
+int _bli_array_rfindindex(const void *arr, uint arr_len, size_t arr_stride, const void *p);
 #define BLI_array_rfindindex(arr, arr_len, p) \
   _bli_array_rfindindex(arr, arr_len, sizeof(*(arr)), p)
 
 void _bli_array_binary_and(
-    void *arr, const void *arr_a, const void *arr_b, unsigned int arr_len, size_t arr_stride);
+    void *arr, const void *arr_a, const void *arr_b, uint arr_len, size_t arr_stride);
 #define BLI_array_binary_and(arr, arr_a, arr_b, arr_len) \
   (CHECK_TYPE_PAIR_INLINE(*(arr), *(arr_a)), \
    CHECK_TYPE_PAIR_INLINE(*(arr), *(arr_b)), \
    _bli_array_binary_and(arr, arr_a, arr_b, arr_len, sizeof(*(arr))))
 
 void _bli_array_binary_or(
-    void *arr, const void *arr_a, const void *arr_b, unsigned int arr_len, size_t arr_stride);
+    void *arr, const void *arr_a, const void *arr_b, uint arr_len, size_t arr_stride);
 #define BLI_array_binary_or(arr, arr_a, arr_b, arr_len) \
   (CHECK_TYPE_PAIR_INLINE(*(arr), *(arr_a)), \
    CHECK_TYPE_PAIR_INLINE(*(arr), *(arr_b)), \
    _bli_array_binary_or(arr, arr_a, arr_b, arr_len, sizeof(*(arr))))
 
 bool _bli_array_iter_span(const void *arr,
-                          unsigned int arr_len,
+                          uint arr_len,
                           size_t arr_stride,
                           bool use_wrap,
                           bool use_delimit_bounds,
                           bool (*test_fn)(const void *arr_item, void *user_data),
                           void *user_data,
-                          unsigned int span_step[2],
-                          unsigned int *r_span_len);
+                          uint span_step[2],
+                          uint *r_span_len);
 #define BLI_array_iter_span( \
     arr, arr_len, use_wrap, use_delimit_bounds, test_fn, user_data, span_step, r_span_len) \
   _bli_array_iter_span(arr, \
@@ -86,7 +87,7 @@ bool _bli_array_iter_span(const void *arr,
                        span_step, \
                        r_span_len)
 
-bool _bli_array_is_zeroed(const void *arr, unsigned int arr_len, size_t arr_stride);
+bool _bli_array_is_zeroed(const void *arr, uint arr_len, size_t arr_stride);
 #define BLI_array_is_zeroed(arr, arr_len) _bli_array_is_zeroed(arr, arr_len, sizeof(*(arr)))
 
 bool _bli_array_iter_spiral_square(const void *arr_v,
diff --git a/source/blender/blenlib/BLI_assert.h b/source/blender/blenlib/BLI_assert.h
index 685f526b4ad..6019f0f3566 100644
--- a/source/blender/blenlib/BLI_assert.h
+++ b/source/blender/blenlib/BLI_assert.h
@@ -31,6 +31,7 @@ extern "C" {
 
 /* Utility functions. */
 void _BLI_assert_print_pos(const char *file, const int line, const char *function, const char *id);
+void _BLI_assert_print_extra(const char *str);
 void _BLI_assert_print_backtrace(void);
 void _BLI_assert_abort(void);
 void _BLI_assert_unreachable_print(const char *file, const int line, const char *function);
@@ -61,8 +62,17 @@ void _BLI_assert_unreachable_print(const char *file, const int line, const char
                       _BLI_ASSERT_ABORT(), \
                       NULL)) : \
                     NULL)
+/** A version of #BLI_assert() to pass an additional message to be printed on failure. */
+#  define BLI_assert_msg(a, msg) \
+    (void)((!(a)) ? ((_BLI_assert_print_backtrace(), \
+                      _BLI_ASSERT_PRINT_POS(a), \
+                      _BLI_assert_print_extra(msg), \
+                      _BLI_ASSERT_ABORT(), \
+                      NULL)) : \
+                    NULL)
 #else
 #  define BLI_assert(a) ((void)0)
+#  define BLI_assert_msg(a, msg) ((void)0)
 #endif
 
 #if defined(__cplusplus)
@@ -96,7 +106,7 @@ void _BLI_assert_unreachable_print(const char *file, const int line, const char
 #define BLI_assert_unreachable() \
   { \
     _BLI_assert_unreachable_print(__FILE__, __LINE__, __func__); \
-    BLI_assert(!"This line of code is marked to be unreachable."); \
+    BLI_assert_msg(0, "This line of code is marked to be unreachable."); \
   } \
   ((void)0)
 
diff --git a/source/blender/blenlib/BLI_endian_switch_inline.h b/source/blender/blenlib/BLI_endian_switch_inline.h
index d42126fbe61..ec4cfe4801a 100644
--- a/source/blender/blenlib/BLI_endian_switch_inline.h
+++ b/source/blender/blenlib/BLI_endian_switch_inline.h
@@ -29,7 +29,7 @@ extern "C" {
  * \ingroup bli
  */
 
-/* note: using a temp char to switch endian is a lot slower,
+/* NOTE: using a temp char to switch endian is a lot slower,
  * use bit shifting instead. */
 
 /* *** 16 *** */
diff --git a/source/blender/blenlib/BLI_enumerable_thread_specific.hh b/source/blender/blenlib/BLI_enumerable_thread_specific.hh
index a05f7724dd2..3051d980d45 100644
--- a/source/blender/blenlib/BLI_enumerable_thread_specific.hh
+++ b/source/blender/blenlib/BLI_enumerable_thread_specific.hh
@@ -46,25 +46,72 @@ template<typename T> class EnumerableThreadSpecific : NonCopyable, NonMovable {
   tbb::enumerable_thread_specific<T> values_;
 
  public:
+  using iterator = typename tbb::enumerable_thread_specific<T>::iterator;
+
+  EnumerableThreadSpecific() = default;
+
+  template<typename F> EnumerableThreadSpecific(F initializer) : values_(std::move(initializer))
+  {
+  }
+
   T &local()
   {
     return values_.local();
   }
 
+  iterator begin()
+  {
+    return values_.begin();
+  }
+
+  iterator end()
+  {
+    return values_.end();
+  }
+
 #else /* WITH_TBB */
 
  private:
   std::mutex mutex_;
   /* Maps thread ids to their corresponding values. The values are not embedded in the map, so that
    * their addresses do not change when the map grows. */
-  Map<int, std::unique_ptr<T>> values_;
+  Map<int, std::reference_wrapper<T>> values_;
+  Vector<std::unique_ptr<T>> owned_values_;
+  std::function<void(void *)> initializer_;
 
  public:
+  using iterator = typename Map<int, std::reference_wrapper<T>>::MutableValueIterator;
+
+  EnumerableThreadSpecific() : initializer_([](void *buffer) { new (buffer) T(); })
+  {
+  }
+
+  template<typename F>
+  EnumerableThreadSpecific(F initializer)
+      : initializer_([=](void *buffer) { new (buffer) T(initializer()); })
+  {
+  }
+
   T &local()
   {
     const int thread_id = enumerable_thread_specific_utils::thread_id;
     std::lock_guard lock{mutex_};
-    return *values_.lookup_or_add_cb(thread_id, []() { return std::make_unique<T>(); });
+    return values_.lookup_or_add_cb(thread_id, [&]() {
+      T *value = (T *)::operator new(sizeof(T));
+      initializer_(value);
+      owned_values_.append(std::unique_ptr<T>{value});
+      return std::reference_wrapper<T>{*value};
+    });
+  }
+
+  iterator begin()
+  {
+    return values_.values().begin();
+  }
+
+  iterator end()
+  {
+    return values_.values().end();
   }
 
 #endif /* WITH_TBB */
diff --git a/source/blender/blenlib/BLI_function_ref.hh b/source/blender/blenlib/BLI_function_ref.hh
index 38e1ba593c5..70a064adc5d 100644
--- a/source/blender/blenlib/BLI_function_ref.hh
+++ b/source/blender/blenlib/BLI_function_ref.hh
@@ -95,7 +95,7 @@ template<typename Ret, typename... Params> class FunctionRef<Ret(Params...)> {
    * A pointer to the referenced callable object. This can be a C function, a lambda object or any
    * other callable.
    *
-   * The value does not need to be initialized because it is not used unless callback_ is set as
+   * The value does not need to be initialized because it is not used unless `callback_` is set as
    * well, in which case it will be initialized as well.
    *
    * Use `intptr_t` to avoid warnings when casting to function pointers.
diff --git a/source/blender/blenlib/BLI_math_geom.h b/source/blender/blenlib/BLI_math_geom.h
index 43b31d76bb0..9ac14a6edfe 100644
--- a/source/blender/blenlib/BLI_math_geom.h
+++ b/source/blender/blenlib/BLI_math_geom.h
@@ -255,7 +255,7 @@ void limit_dist_v3(float v1[3], float v2[3], const float dist);
 
 /******************************* Intersection ********************************/
 
-/* TODO int return value consistency */
+/* TODO: int return value consistency. */
 
 /* line-line */
 #define ISECT_LINE_LINE_COLINEAR -1
diff --git a/source/blender/blenlib/BLI_math_matrix.h b/source/blender/blenlib/BLI_math_matrix.h
index 54df88ca541..e38df58c1ca 100644
--- a/source/blender/blenlib/BLI_math_matrix.h
+++ b/source/blender/blenlib/BLI_math_matrix.h
@@ -211,6 +211,7 @@ void mul_transposed_mat3_m4_v3(const float M[4][4], float r[3]);
 void mul_m3_v3_double(const float M[3][3], double r[3]);
 
 void mul_m4_m4m4_aligned_scale(float R[4][4], const float A[4][4], const float B[4][4]);
+void mul_m4_m4m4_split_channels(float R[4][4], const float A[4][4], const float B[4][4]);
 
 void mul_m3_fl(float R[3][3], float f);
 void mul_m4_fl(float R[4][4], float f);
@@ -277,7 +278,7 @@ bool is_orthonormal_m4(const float mat[4][4]);
 bool is_uniform_scaled_m3(const float mat[3][3]);
 bool is_uniform_scaled_m4(const float m[4][4]);
 
-/* Note: 'adjoint' here means the adjugate (adjunct, "classical adjoint") matrix!
+/* NOTE: 'adjoint' here means the adjugate (adjunct, "classical adjoint") matrix!
  * Nowadays 'adjoint' usually refers to the conjugate transpose,
  * which for real-valued matrices is simply the transpose.
  */
diff --git a/source/blender/blenlib/BLI_math_rotation.h b/source/blender/blenlib/BLI_math_rotation.h
index ef10d02f10f..461b5a60c9d 100644
--- a/source/blender/blenlib/BLI_math_rotation.h
+++ b/source/blender/blenlib/BLI_math_rotation.h
@@ -90,7 +90,7 @@ void tri_to_quat_ex(float quat[4],
                     const float no_orig[3]);
 float tri_to_quat(float q[4], const float a[3], const float b[3], const float c[3]);
 void vec_to_quat(float q[4], const float vec[3], short axis, const short upflag);
-/* note: v1 and v2 must be normalized */
+/* NOTE: v1 and v2 must be normalized. */
 void rotation_between_vecs_to_mat3(float m[3][3], const float v1[3], const float v2[3]);
 void rotation_between_vecs_to_quat(float q[4], const float v1[3], const float v2[3]);
 void rotation_between_quats_to_quat(float q[4], const float q1[4], const float q2[4]);
diff --git a/source/blender/blenlib/BLI_memarena.h b/source/blender/blenlib/BLI_memarena.h
index d7798f12fcc..b2e05b00735 100644
--- a/source/blender/blenlib/BLI_memarena.h
+++ b/source/blender/blenlib/BLI_memarena.h
@@ -50,6 +50,8 @@ void *BLI_memarena_alloc(struct MemArena *ma, size_t size) ATTR_WARN_UNUSED_RESU
 void *BLI_memarena_calloc(struct MemArena *ma, size_t size) ATTR_WARN_UNUSED_RESULT
     ATTR_NONNULL(1) ATTR_MALLOC ATTR_ALLOC_SIZE(2);
 
+void BLI_memarena_merge(MemArena *ma_dst, MemArena *ma_src) ATTR_NONNULL(1, 2);
+
 void BLI_memarena_clear(MemArena *ma) ATTR_NONNULL(1);
 
 #ifdef __cplusplus
diff --git a/source/blender/blenlib/BLI_memory_utils.hh b/source/blender/blenlib/BLI_memory_utils.hh
index bdbbda9f0c7..14eca49d126 100644
--- a/source/blender/blenlib/BLI_memory_utils.hh
+++ b/source/blender/blenlib/BLI_memory_utils.hh
@@ -309,6 +309,12 @@ template<typename T> void uninitialized_fill_n(T *dst, int64_t n, const T &value
 }
 
 template<typename T> struct DestructValueAtAddress {
+  DestructValueAtAddress() = default;
+
+  template<typename U> DestructValueAtAddress(const U &)
+  {
+  }
+
   void operator()(T *ptr)
   {
     ptr->~T();
diff --git a/source/blender/blenlib/BLI_mempool.h b/source/blender/blenlib/BLI_mempool.h
index e5e0df02033..61b572a4943 100644
--- a/source/blender/blenlib/BLI_mempool.h
+++ b/source/blender/blenlib/BLI_mempool.h
@@ -65,7 +65,10 @@ void *BLI_mempool_as_arrayN(BLI_mempool *pool,
 void BLI_mempool_set_memory_debug(void);
 #endif
 
-/** iteration stuff.  note: this may easy to produce bugs with */
+/**
+ * Iteration stuff.
+ * NOTE: it is easy to introduce bugs when using this.
+ */
 /* private structure */
 typedef struct BLI_mempool_iter {
   BLI_mempool *pool;
diff --git a/source/blender/blenlib/BLI_mesh_intersect.hh b/source/blender/blenlib/BLI_mesh_intersect.hh
index 6b8e79f376c..f28be9bf59b 100644
--- a/source/blender/blenlib/BLI_mesh_intersect.hh
+++ b/source/blender/blenlib/BLI_mesh_intersect.hh
@@ -225,6 +225,7 @@ class IMeshArena : NonCopyable, NonMovable {
    */
   const Vert *add_or_find_vert(const mpq3 &co, int orig);
   const Vert *add_or_find_vert(const double3 &co, int orig);
+  const Vert *add_or_find_vert(Vert *vert);
 
   Face *add_face(Span<const Vert *> verts,
                  int orig,
@@ -405,7 +406,7 @@ bool bbs_might_intersect(const BoundingBox &bb_a, const BoundingBox &bb_b);
  * that the output triangle was a part of (input can have -1 for that field and then
  * the index in `tri[]` will be used as the original index).
  * The orig structure of the output #IMesh gives the originals for vertices and edges.
- * Note: if the input tm_in has a non-empty orig structure, then it is ignored.
+ * NOTE: if the input tm_in has a non-empty orig structure, then it is ignored.
  */
 IMesh trimesh_self_intersect(const IMesh &tm_in, IMeshArena *arena);
 
diff --git a/source/blender/blenlib/BLI_multi_value_map.hh b/source/blender/blenlib/BLI_multi_value_map.hh
index fb52ac78243..d3073c98417 100644
--- a/source/blender/blenlib/BLI_multi_value_map.hh
+++ b/source/blender/blenlib/BLI_multi_value_map.hh
@@ -129,7 +129,7 @@ template<typename Key, typename Value> class MultiValueMap {
   }
 
   /**
-   * Note: This signature will change when the implementation changes.
+   * NOTE: This signature will change when the implementation changes.
    */
   typename MapType::ItemIterator items() const
   {
@@ -137,7 +137,7 @@ template<typename Key, typename Value> class MultiValueMap {
   }
 
   /**
-   * Note: This signature will change when the implementation changes.
+   * NOTE: This signature will change when the implementation changes.
    */
   typename MapType::KeyIterator keys() const
   {
@@ -145,7 +145,7 @@ template<typename Key, typename Value> class MultiValueMap {
   }
 
   /**
-   * Note: This signature will change when the implementation changes.
+   * NOTE: This signature will change when the implementation changes.
    */
   typename MapType::ValueIterator values() const
   {
diff --git a/source/blender/blenlib/BLI_scanfill.h b/source/blender/blenlib/BLI_scanfill.h
index fa57f0486e5..8f281023177 100644
--- a/source/blender/blenlib/BLI_scanfill.h
+++ b/source/blender/blenlib/BLI_scanfill.h
@@ -97,15 +97,15 @@ struct ScanFillEdge *BLI_scanfill_edge_add(ScanFillContext *sf_ctx,
                                            struct ScanFillVert *v2);
 
 enum {
-  /* note: using BLI_SCANFILL_CALC_REMOVE_DOUBLES
+  /* NOTE(campbell): using BLI_SCANFILL_CALC_REMOVE_DOUBLES
    * Assumes ordered edges, otherwise we risk an eternal loop
-   * removing double verts. - campbell */
+   * removing double verts. */
   BLI_SCANFILL_CALC_REMOVE_DOUBLES = (1 << 1),
 
   /* calculate isolated polygons */
   BLI_SCANFILL_CALC_POLYS = (1 << 2),
 
-  /* note: This flag removes checks for overlapping polygons.
+  /* NOTE: This flag removes checks for overlapping polygons.
    * when this flag is set, we'll never get back more faces than (totvert - 2) */
   BLI_SCANFILL_CALC_HOLES = (1 << 3),
 
diff --git a/source/blender/blenlib/BLI_set_slots.hh b/source/blender/blenlib/BLI_set_slots.hh
index a4d01dfdb68..d50ef95f11e 100644
--- a/source/blender/blenlib/BLI_set_slots.hh
+++ b/source/blender/blenlib/BLI_set_slots.hh
@@ -249,7 +249,7 @@ template<typename Key> class HashedSetSlot {
   template<typename ForwardKey, typename IsEqual>
   bool contains(const ForwardKey &key, const IsEqual &is_equal, const uint64_t hash) const
   {
-    /* hash_ might be uninitialized here, but that is ok. */
+    /* `hash_` might be uninitialized here, but that is ok. */
     if (hash_ == hash) {
       if (state_ == Occupied) {
         return is_equal(key, *key_buffer_);
diff --git a/source/blender/blenlib/BLI_span.hh b/source/blender/blenlib/BLI_span.hh
index c3876d4eaf8..e04295b0e51 100644
--- a/source/blender/blenlib/BLI_span.hh
+++ b/source/blender/blenlib/BLI_span.hh
@@ -58,7 +58,7 @@
  * its task, without having to worry about memory allocation. Alternatively, a function could
  * return an Array or Vector.
  *
- * Note: When a function has a MutableSpan<T> output parameter and T is not a trivial type,
+ * NOTE: When a function has a MutableSpan<T> output parameter and T is not a trivial type,
  * then the function has to specify whether the referenced array is expected to be initialized or
  * not.
  *
diff --git a/source/blender/blenlib/BLI_task.h b/source/blender/blenlib/BLI_task.h
index dbe8ec3dcc0..418db14e2f3 100644
--- a/source/blender/blenlib/BLI_task.h
+++ b/source/blender/blenlib/BLI_task.h
@@ -129,6 +129,9 @@ typedef struct TaskParallelTLS {
 typedef void (*TaskParallelRangeFunc)(void *__restrict userdata,
                                       const int iter,
                                       const TaskParallelTLS *__restrict tls);
+
+typedef void (*TaskParallelInitFunc)(const void *__restrict userdata, void *__restrict chunk);
+
 typedef void (*TaskParallelReduceFunc)(const void *__restrict userdata,
                                        void *__restrict chunk_join,
                                        void *__restrict chunk);
@@ -151,6 +154,10 @@ typedef struct TaskParallelSettings {
   /* Function called from calling thread once whole range have been
    * processed.
    */
+  /* Function called to initialize user data chunk,
+   * typically to allocate data, freed by `func_free`.
+   */
+  TaskParallelInitFunc func_init;
   /* Function called to join user data chunk into another, to reduce
    * the result to the original userdata_chunk memory.
    * The reduce functions should have no side effects, so that they
diff --git a/source/blender/blenlib/BLI_task.hh b/source/blender/blenlib/BLI_task.hh
index 5f5a17f6b58..e2446ad143e 100644
--- a/source/blender/blenlib/BLI_task.hh
+++ b/source/blender/blenlib/BLI_task.hh
@@ -28,6 +28,7 @@
 #    define NOMINMAX
 #    define TBB_MIN_MAX_CLEANUP
 #  endif
+#  include "tbb/parallel_reduce.h"
 #  include <tbb/blocked_range.h>
 #  include <tbb/parallel_for.h>
 #  include <tbb/parallel_for_each.h>
@@ -76,6 +77,27 @@ void parallel_for(IndexRange range, int64_t grain_size, const Function &function
 #endif
 }
 
+template<typename Value, typename Function, typename Reduction>
+Value parallel_reduce(IndexRange range,
+                      int64_t grain_size,
+                      const Value &identity,
+                      const Function &function,
+                      const Reduction &reduction)
+{
+#ifdef WITH_TBB
+  return tbb::parallel_reduce(
+      tbb::blocked_range<int64_t>(range.first(), range.one_after_last(), grain_size),
+      identity,
+      [&](const tbb::blocked_range<int64_t> &subrange, const Value &ident) {
+        return function(IndexRange(subrange.begin(), subrange.size()), ident);
+      },
+      reduction);
+#else
+  UNUSED_VARS(grain_size, reduction);
+  return function(range, identity);
+#endif
+}
+
 /** See #BLI_task_isolate for a description of what isolating a task means. */
 template<typename Function> void isolate_task(const Function &function)
 {
diff --git a/source/blender/blenlib/CMakeLists.txt b/source/blender/blenlib/CMakeLists.txt
index 677df9db026..ea5572f1c8a 100644
--- a/source/blender/blenlib/CMakeLists.txt
+++ b/source/blender/blenlib/CMakeLists.txt
@@ -188,6 +188,7 @@ set(SRC
   BLI_dynstr.h
   BLI_easing.h
   BLI_edgehash.h
+  BLI_endian_defines.h
   BLI_endian_switch.h
   BLI_endian_switch_inline.h
   BLI_enumerable_thread_specific.hh
diff --git a/source/blender/blenlib/intern/BLI_assert.c b/source/blender/blenlib/intern/BLI_assert.c
index 887f583242f..cebc6f8957f 100644
--- a/source/blender/blenlib/intern/BLI_assert.c
+++ b/source/blender/blenlib/intern/BLI_assert.c
@@ -31,6 +31,11 @@ void _BLI_assert_print_pos(const char *file, const int line, const char *functio
   fprintf(stderr, "BLI_assert failed: %s:%d, %s(), at \'%s\'\n", file, line, function, id);
 }
 
+void _BLI_assert_print_extra(const char *str)
+{
+  fprintf(stderr, "  %s\n", str);
+}
+
 void _BLI_assert_unreachable_print(const char *file, const int line, const char *function)
 {
   fprintf(stderr, "Code marked as unreachable has been executed. Please report this as a bug.\n");
diff --git a/source/blender/blenlib/intern/BLI_filelist.c b/source/blender/blenlib/intern/BLI_filelist.c
index 55fd28667fc..f05dea46dc8 100644
--- a/source/blender/blenlib/intern/BLI_filelist.c
+++ b/source/blender/blenlib/intern/BLI_filelist.c
@@ -432,7 +432,7 @@ void BLI_filelist_entry_duplicate(struct direntry *dst, const struct direntry *s
 }
 
 /**
- * Deep-duplicate of an array of direntries, including the array itself.
+ * Deep-duplicate of a #direntry array including the array itself.
  */
 void BLI_filelist_duplicate(struct direntry **dest_filelist,
                             struct direntry *const src_filelist,
@@ -462,7 +462,7 @@ void BLI_filelist_entry_free(struct direntry *entry)
 }
 
 /**
- * frees storage for an array of direntries, including the array itself.
+ * Frees storage for an array of #direntry, including the array itself.
  */
 void BLI_filelist_free(struct direntry *filelist, const unsigned int nrentries)
 {
diff --git a/source/blender/blenlib/intern/BLI_ghash.c b/source/blender/blenlib/intern/BLI_ghash.c
index 8463c0ec511..46e599b7cf3 100644
--- a/source/blender/blenlib/intern/BLI_ghash.c
+++ b/source/blender/blenlib/intern/BLI_ghash.c
@@ -611,7 +611,7 @@ static Entry *ghash_pop(GHash *gh, GHashIterState *state)
     return NULL;
   }
 
-  /* Note: using first_bucket_index here allows us to avoid potential
+  /* NOTE: using first_bucket_index here allows us to avoid potential
    * huge number of loops over buckets,
    * in case we are popping from a large ghash with few items in it... */
   curr_bucket = ghash_find_next_bucket_index(gh, curr_bucket);
@@ -677,7 +677,7 @@ static GHash *ghash_copy(const GHash *gh, GHashKeyCopyFP keycopyfp, GHashValCopy
        * This means entries in buckets in new copy will be in reversed order!
        * This shall not be an issue though, since order should never be assumed in ghash. */
 
-      /* Note: We can use 'i' here, since we are sure that
+      /* NOTE: We can use 'i' here, since we are sure that
        * 'gh' and 'gh_new' have the same number of buckets! */
       e_new->next = gh_new->buckets[i];
       gh_new->buckets[i] = e_new;
diff --git a/source/blender/blenlib/intern/BLI_ghash_utils.c b/source/blender/blenlib/intern/BLI_ghash_utils.c
index 83f64043cd0..182c27aed6d 100644
--- a/source/blender/blenlib/intern/BLI_ghash_utils.c
+++ b/source/blender/blenlib/intern/BLI_ghash_utils.c
@@ -53,7 +53,7 @@ uint BLI_ghashutil_ptrhash(const void *key)
   /* bottom 3 or 4 bits are likely to be 0; rotate y by 4 to avoid
    * excessive hash collisions for dicts and sets */
 
-  /* Note: Unlike Python 'sizeof(uint)' is used instead of 'sizeof(void *)',
+  /* NOTE: Unlike Python 'sizeof(uint)' is used instead of 'sizeof(void *)',
    * Otherwise casting to 'uint' ignores the upper bits on 64bit platforms. */
   return (uint)(y >> 4) | ((uint)y << (sizeof(uint[8]) - 4));
 }
@@ -141,7 +141,7 @@ size_t BLI_ghashutil_combine_hash(size_t hash_a, size_t hash_b)
  * string, is updated: ``hash = hash * 33 + c``.  This
  * function uses the signed value of each byte.
  *
- * note: this is the same hash method that glib 2.34.0 uses.
+ * NOTE: this is the same hash method that glib 2.34.0 uses.
  */
 uint BLI_ghashutil_strhash_n(const char *key, size_t n)
 {
diff --git a/source/blender/blenlib/intern/BLI_kdopbvh.c b/source/blender/blenlib/intern/BLI_kdopbvh.c
index 8f556e0ddb6..25939323b73 100644
--- a/source/blender/blenlib/intern/BLI_kdopbvh.c
+++ b/source/blender/blenlib/intern/BLI_kdopbvh.c
@@ -26,7 +26,7 @@
  *
  * See: http://www.gris.uni-tuebingen.de/people/staff/jmezger/papers/bvh.pdf
  *
- * implements a bvh-tree structure with support for:
+ * implements a BVH-tree structure with support for:
  *
  * - Ray-cast:
  *   #BLI_bvhtree_ray_cast, #BVHRayCastData
@@ -98,8 +98,8 @@ struct BVHTree {
   int totleaf;         /* leafs */
   int totbranch;
   axis_t start_axis, stop_axis; /* bvhtree_kdop_axes array indices according to axis */
-  axis_t axis;                  /* kdop type (6 => OBB, 7 => AABB, ...) */
-  char tree_type;               /* type of tree (4 => quadtree) */
+  axis_t axis;                  /* KDOP type (6 => OBB, 7 => AABB, ...) */
+  char tree_type;               /* type of tree (4 => quad-tree). */
 };
 
 /* optimization, ensure we stay small */
@@ -726,7 +726,7 @@ static void non_recursive_bvh_div_nodes_task_cb(void *__restrict userdata,
   /* Save split axis (this can be used on ray-tracing to speedup the query time) */
   parent->main_axis = split_axis / 2;
 
-  /* Split the children along the split_axis, note: its not needed to sort the whole leafs array
+  /* Split the children along the split_axis, NOTE: it's not needed to sort the whole leafs array
    * Only to assure that the elements are partitioned on a way that each child takes the elements
    * it would take in case the whole array was sorted.
    * Split_leafs takes care of that "sort" problem. */
@@ -881,7 +881,7 @@ BVHTree *BLI_bvhtree_new(int maxsize, float epsilon, char tree_type, char axis)
 
   /* tree epsilon must be >= FLT_EPSILON
    * so that tangent rays can still hit a bounding volume..
-   * this bug would show up when casting a ray aligned with a kdop-axis
+   * this bug would show up when casting a ray aligned with a KDOP-axis
    * and with an edge of 2 faces */
   epsilon = max_ff(FLT_EPSILON, epsilon);
 
@@ -1423,7 +1423,7 @@ BVHTreeOverlap *BLI_bvhtree_overlap(
 
 static bool tree_intersect_plane_test(const float *bv, const float plane[4])
 {
-  /* TODO(germano): Support other kdop geometries. */
+  /* TODO(germano): Support other KDOP geometries. */
   const float bb_min[3] = {bv[0], bv[2], bv[4]};
   const float bb_max[3] = {bv[1], bv[3], bv[5]};
   float bb_near[3], bb_far[3];
@@ -1805,7 +1805,7 @@ static float ray_nearest_hit(const BVHRayCastData *data, const float bv[6])
  * Based on Tactical Optimization of Ray/Box Intersection, by Graham Fyffe
  * [http://tog.acm.org/resources/RTNews/html/rtnv21n1.html#art9]
  *
- * TODO this doesn't take data->ray.radius into consideration */
+ * TODO: this doesn't take data->ray.radius into consideration. */
 static float fast_ray_nearest_hit(const BVHRayCastData *data, const BVHNode *node)
 {
   const float *bv = node->bv;
diff --git a/source/blender/blenlib/intern/BLI_memarena.c b/source/blender/blenlib/intern/BLI_memarena.c
index fc381c22315..0ab27a5adad 100644
--- a/source/blender/blenlib/intern/BLI_memarena.c
+++ b/source/blender/blenlib/intern/BLI_memarena.c
@@ -45,6 +45,7 @@
 #  define VALGRIND_CREATE_MEMPOOL(pool, rzB, is_zeroed) UNUSED_VARS(pool, rzB, is_zeroed)
 #  define VALGRIND_DESTROY_MEMPOOL(pool) UNUSED_VARS(pool)
 #  define VALGRIND_MEMPOOL_ALLOC(pool, addr, size) UNUSED_VARS(pool, addr, size)
+#  define VALGRIND_MOVE_MEMPOOL(pool_a, pool_b) UNUSED_VARS(pool_a, pool_b)
 #endif
 
 struct MemBuf {
@@ -179,6 +180,58 @@ void *BLI_memarena_calloc(MemArena *ma, size_t size)
 }
 
 /**
+ * Transfer ownership of allocated blocks from `ma_src` into `ma_dst`,
+ * cleaning the contents of `ma_src`.
+ *
+ * \note Useful for multi-threaded tasks that need a thread-local #MemArena
+ * that is kept after the multi-threaded operation is completed.
+ *
+ * \note Avoid accumulating memory pools where possible
+ * as any unused memory in `ma_src` is wasted every merge.
+ */
+void BLI_memarena_merge(MemArena *ma_dst, MemArena *ma_src)
+{
+  /* Memory arenas must be compatible. */
+  BLI_assert(ma_dst != ma_src);
+  BLI_assert(ma_dst->align == ma_src->align);
+  BLI_assert(ma_dst->use_calloc == ma_src->use_calloc);
+  BLI_assert(ma_dst->bufsize == ma_src->bufsize);
+
+  if (ma_src->bufs == NULL) {
+    return;
+  }
+
+  if (UNLIKELY(ma_dst->bufs == NULL)) {
+    BLI_assert(ma_dst->curbuf == NULL);
+    ma_dst->bufs = ma_src->bufs;
+    ma_dst->curbuf = ma_src->curbuf;
+    ma_dst->cursize = ma_src->cursize;
+  }
+  else {
+    /* Keep the 'ma_dst->curbuf' for simplicity.
+     * Insert buffers after the first. */
+    if (ma_dst->bufs->next != NULL) {
+      /* Loop over `ma_src` instead of `ma_dst` since it's likely the destination is larger
+       * when used for accumulating from multiple sources. */
+      struct MemBuf *mb_src = ma_src->bufs;
+      /* `ma_src->bufs` is known to be non-NULL here (early return above): find its tail. */
+      while (mb_src->next != NULL) {
+        mb_src = mb_src->next;
+      }
+      mb_src->next = ma_dst->bufs->next;
+    }
+    ma_dst->bufs->next = ma_src->bufs;
+  }
+
+  ma_src->bufs = NULL;
+  ma_src->curbuf = NULL;
+  ma_src->cursize = 0;
+
+  VALGRIND_MOVE_MEMPOOL(ma_src, ma_dst);
+  VALGRIND_CREATE_MEMPOOL(ma_src, 0, false);
+}
+
+/**
  * Clear for reuse, avoids re-allocation when an arena may
  * otherwise be free'd and recreated.
  */
diff --git a/source/blender/blenlib/intern/BLI_mempool.c b/source/blender/blenlib/intern/BLI_mempool.c
index 8196438eb25..5263af2ae56 100644
--- a/source/blender/blenlib/intern/BLI_mempool.c
+++ b/source/blender/blenlib/intern/BLI_mempool.c
@@ -47,7 +47,7 @@
 #  include "valgrind/memcheck.h"
 #endif
 
-/* note: copied from BLO_blend_defs.h, don't use here because we're in BLI */
+/* NOTE: copied from BLO_blend_defs.h, don't use here because we're in BLI. */
 #ifdef __BIG_ENDIAN__
 /* Big Endian */
 #  define MAKE_ID(a, b, c, d) ((int)(a) << 24 | (int)(b) << 16 | (c) << 8 | (d))
@@ -387,7 +387,7 @@ void BLI_mempool_free(BLI_mempool *pool, void *addr)
       }
     }
     if (!found) {
-      BLI_assert(!"Attempt to free data which is not in pool.\n");
+      BLI_assert_msg(0, "Attempt to free data which is not in pool.\n");
     }
   }
 
diff --git a/source/blender/blenlib/intern/array_store.c b/source/blender/blenlib/intern/array_store.c
index 250915383cf..e1a7ee98ce5 100644
--- a/source/blender/blenlib/intern/array_store.c
+++ b/source/blender/blenlib/intern/array_store.c
@@ -191,7 +191,7 @@
 #  define BCHUNK_SIZE_MIN_DIV 8
 
 /* Disallow chunks bigger than the regular chunk size scaled by this value
- * note: must be at least 2!
+ * NOTE: must be at least 2!
  * however, this code runs won't run in tests unless it's ~1.1 ugh.
  * so lower only to check splitting works.
  */
@@ -980,7 +980,7 @@ static const BChunkRef *table_lookup(const BArrayInfo *info,
                                      const size_t offset,
                                      const hash_key *UNUSED(table_hash_array))
 {
-  const size_t data_hash_len = BCHUNK_HASH_LEN * info->chunk_stride; /* TODO, cache */
+  const size_t data_hash_len = BCHUNK_HASH_LEN * info->chunk_stride; /* TODO: cache. */
 
   size_t size_left = data_len - offset;
   hash_key key = hash_data(&data[offset], MIN2(data_hash_len, size_left));
@@ -1155,7 +1155,7 @@ static BChunkList *bchunk_list_from_data_merge(const BArrayInfo *info,
       use_aligned = true;
     }
     else {
-      /* TODO, walk over chunks and check if some arbitrary amount align */
+      /* TODO: walk over chunks and check if some arbitrary amount align. */
     }
   }
 #endif /* USE_ALIGN_CHUNKS_TEST */
@@ -1787,7 +1787,7 @@ bool BLI_array_store_is_valid(BArrayStore *bs)
   }
 
   return ok;
-  /* TODO, dangling pointer checks */
+  /* TODO: dangling pointer checks. */
 }
 
 /** \} */
diff --git a/source/blender/blenlib/intern/array_utils.c b/source/blender/blenlib/intern/array_utils.c
index 25261e82cc9..9a12a7442b7 100644
--- a/source/blender/blenlib/intern/array_utils.c
+++ b/source/blender/blenlib/intern/array_utils.c
@@ -40,11 +40,11 @@
  *
  * Access via #BLI_array_reverse
  */
-void _bli_array_reverse(void *arr_v, unsigned int arr_len, size_t arr_stride)
+void _bli_array_reverse(void *arr_v, uint arr_len, size_t arr_stride)
 {
-  const unsigned int arr_stride_uint = (unsigned int)arr_stride;
-  const unsigned int arr_half_stride = (arr_len / 2) * arr_stride_uint;
-  unsigned int i, i_end;
+  const uint arr_stride_uint = (uint)arr_stride;
+  const uint arr_half_stride = (arr_len / 2) * arr_stride_uint;
+  uint i, i_end;
   char *arr = arr_v;
   char *buf = BLI_array_alloca(buf, arr_stride);
 
@@ -62,7 +62,7 @@ void _bli_array_reverse(void *arr_v, unsigned int arr_len, size_t arr_stride)
  *
  * Access via #BLI_array_wrap
  */
-void _bli_array_wrap(void *arr_v, unsigned int arr_len, size_t arr_stride, int dir)
+void _bli_array_wrap(void *arr_v, uint arr_len, size_t arr_stride, int dir)
 {
   char *arr = arr_v;
   char *buf = BLI_array_alloca(buf, arr_stride);
@@ -88,16 +88,13 @@ void _bli_array_wrap(void *arr_v, unsigned int arr_len, size_t arr_stride, int d
  *
  * Access via #BLI_array_wrap
  */
-void _bli_array_permute(void *arr,
-                        const unsigned int arr_len,
-                        const size_t arr_stride,
-                        const unsigned int *order,
-                        void *arr_temp)
+void _bli_array_permute(
+    void *arr, const uint arr_len, const size_t arr_stride, const uint *order, void *arr_temp)
 {
   const size_t len = arr_len * arr_stride;
-  const unsigned int arr_stride_uint = (unsigned int)arr_stride;
+  const uint arr_stride_uint = (uint)arr_stride;
   void *arr_orig;
-  unsigned int i;
+  uint i;
 
   if (arr_temp == NULL) {
     arr_orig = MEM_mallocN(len, __func__);
@@ -121,16 +118,45 @@ void _bli_array_permute(void *arr,
 }
 
 /**
+ * In-place array de-duplication of an ordered array.
+ *
+ * \return The new length of the array.
+ *
+ * Access via #BLI_array_deduplicate_ordered
+ */
+uint _bli_array_deduplicate_ordered(void *arr, uint arr_len, size_t arr_stride)
+{
+  if (UNLIKELY(arr_len <= 1)) {
+    return arr_len;
+  }
+
+  /* `j` is the last unique item kept, offsets use `size_t` math to avoid `uint` overflow. */
+  uint j = 0;
+  for (uint i = 1; i < arr_len; i++) {
+    const void *item_curr = POINTER_OFFSET(arr, arr_stride * i);
+    const void *item_last = POINTER_OFFSET(arr, arr_stride * j);
+    if (memcmp(item_curr, item_last, arr_stride) == 0) {
+      continue; /* Same as the last kept item, drop it. */
+    }
+    j += 1;
+    if (i != j) { /* `memcpy` must not be given overlapping (here: identical) ranges. */
+      memcpy(POINTER_OFFSET(arr, arr_stride * j), item_curr, arr_stride);
+    }
+  }
+  return j + 1;
+}
+
+/**
  * Find the first index of an item in an array.
  *
  * Access via #BLI_array_findindex
  *
  * \note Not efficient, use for error checks/asserts.
  */
-int _bli_array_findindex(const void *arr, unsigned int arr_len, size_t arr_stride, const void *p)
+int _bli_array_findindex(const void *arr, uint arr_len, size_t arr_stride, const void *p)
 {
   const char *arr_step = (const char *)arr;
-  for (unsigned int i = 0; i < arr_len; i++, arr_step += arr_stride) {
+  for (uint i = 0; i < arr_len; i++, arr_step += arr_stride) {
     if (memcmp(arr_step, p, arr_stride) == 0) {
       return (int)i;
     }
@@ -141,10 +167,10 @@ int _bli_array_findindex(const void *arr, unsigned int arr_len, size_t arr_strid
 /**
  * A version of #BLI_array_findindex that searches from the end of the list.
  */
-int _bli_array_rfindindex(const void *arr, unsigned int arr_len, size_t arr_stride, const void *p)
+int _bli_array_rfindindex(const void *arr, uint arr_len, size_t arr_stride, const void *p)
 {
   const char *arr_step = (const char *)arr + (arr_stride * arr_len);
-  for (unsigned int i = arr_len; i-- != 0;) {
+  for (uint i = arr_len; i-- != 0;) {
     arr_step -= arr_stride;
     if (memcmp(arr_step, p, arr_stride) == 0) {
       return (int)i;
@@ -154,7 +180,7 @@ int _bli_array_rfindindex(const void *arr, unsigned int arr_len, size_t arr_stri
 }
 
 void _bli_array_binary_and(
-    void *arr, const void *arr_a, const void *arr_b, unsigned int arr_len, size_t arr_stride)
+    void *arr, const void *arr_a, const void *arr_b, uint arr_len, size_t arr_stride)
 {
   char *dst = arr;
   const char *src_a = arr_a;
@@ -167,7 +193,7 @@ void _bli_array_binary_and(
 }
 
 void _bli_array_binary_or(
-    void *arr, const void *arr_a, const void *arr_b, unsigned int arr_len, size_t arr_stride)
+    void *arr, const void *arr_a, const void *arr_b, uint arr_len, size_t arr_stride)
 {
   char *dst = arr;
   const char *src_a = arr_a;
@@ -196,14 +222,14 @@ void _bli_array_binary_or(
  * where calculating the length isn't a simple subtraction.
  */
 bool _bli_array_iter_span(const void *arr,
-                          unsigned int arr_len,
+                          uint arr_len,
                           size_t arr_stride,
                           bool use_wrap,
                           bool use_delimit_bounds,
                           bool (*test_fn)(const void *arr_item, void *user_data),
                           void *user_data,
-                          unsigned int span_step[2],
-                          unsigned int *r_span_len)
+                          uint span_step[2],
+                          uint *r_span_len)
 {
   if (arr_len == 0) {
     return false;
@@ -212,11 +238,11 @@ bool _bli_array_iter_span(const void *arr,
     return false;
   }
 
-  const unsigned int arr_stride_uint = (unsigned int)arr_stride;
+  const uint arr_stride_uint = (uint)arr_stride;
   const void *item_prev;
   bool test_prev;
 
-  unsigned int i_curr;
+  uint i_curr;
 
   if ((span_step[0] == arr_len) && (span_step[1] == arr_len)) {
     if (use_wrap) {
@@ -249,11 +275,11 @@ bool _bli_array_iter_span(const void *arr,
   while (i_curr < arr_len) {
     bool test_curr = test_fn(item_curr, user_data);
     if ((test_prev == false) && (test_curr == true)) {
-      unsigned int span_len;
-      unsigned int i_step_prev = i_curr;
+      uint span_len;
+      uint i_step_prev = i_curr;
 
       if (use_wrap) {
-        unsigned int i_step = i_curr + 1;
+        uint i_step = i_curr + 1;
         if (UNLIKELY(i_step == arr_len)) {
           i_step = 0;
         }
@@ -273,7 +299,7 @@ bool _bli_array_iter_span(const void *arr,
         }
       }
       else {
-        unsigned int i_step = i_curr + 1;
+        uint i_step = i_curr + 1;
         while ((i_step != arr_len) &&
                test_fn(POINTER_OFFSET(arr, i_step * arr_stride_uint), user_data)) {
           i_step_prev = i_step;
@@ -307,7 +333,7 @@ bool _bli_array_iter_span(const void *arr,
 /**
  * Simple utility to check memory is zeroed.
  */
-bool _bli_array_is_zeroed(const void *arr_v, unsigned int arr_len, size_t arr_stride)
+bool _bli_array_is_zeroed(const void *arr_v, uint arr_len, size_t arr_stride)
 {
   const char *arr_step = (const char *)arr_v;
   size_t i = arr_stride * arr_len;
diff --git a/source/blender/blenlib/intern/bitmap_draw_2d.c b/source/blender/blenlib/intern/bitmap_draw_2d.c
index 9d3b66d72d7..b0afe1349ad 100644
--- a/source/blender/blenlib/intern/bitmap_draw_2d.c
+++ b/source/blender/blenlib/intern/bitmap_draw_2d.c
@@ -496,7 +496,7 @@ void BLI_bitmap_draw_2d_poly_v2i_n(const int xmin,
 
     /* Scan for new x-nodes */
     while ((span_y_index < span_y_len) && (verts[span_y[span_y_index][0]][1] == pixel_y)) {
-      /* note, node_x these are just added at the end,
+      /* NOTE: node_x these are just added at the end,
        * not ideal but sorting once will resolve. */
 
       /* x is initialized for the next pixel_y */
diff --git a/source/blender/blenlib/intern/delaunay_2d.cc b/source/blender/blenlib/intern/delaunay_2d.cc
index eb3e64c49e6..24a58103a10 100644
--- a/source/blender/blenlib/intern/delaunay_2d.cc
+++ b/source/blender/blenlib/intern/delaunay_2d.cc
@@ -896,7 +896,9 @@ template<typename T> inline bool is_original_vert(const CDTVert<T> *v, CDT_state
   return (v->index < cdt->input_vert_tot);
 }
 
-/* Return the Symedge that goes from v1 to v2, if it exists, else return nullptr. */
+/**
+ * Return the #SymEdge that goes from v1 to v2, if it exists, else return nullptr.
+ */
 template<typename T>
 SymEdge<T> *find_symedge_between_verts(const CDTVert<T> *v1, const CDTVert<T> *v2)
 {
@@ -2106,7 +2108,7 @@ template<typename T> void add_edge_constraints(CDT_state<T> *cdt_state, const CD
  * for the boundary of the input face.
  * fedge_start..fedge_end is the inclusive range of edge input ids that are for the given face.
  *
- * Note: if the input face is not CCW oriented, we'll be labeling the outside, not the inside.
+ * NOTE: if the input face is not CCW oriented, we'll be labeling the outside, not the inside.
  * Note 2: if the boundary has self-crossings, this method will arbitrarily pick one of the
  * contiguous set of faces enclosed by parts of the boundary, leaving the other such un-tagged.
  * This may be a feature instead of a bug if the first contiguous section is most of the face and
diff --git a/source/blender/blenlib/intern/expr_pylike_eval.c b/source/blender/blenlib/intern/expr_pylike_eval.c
index a5d4130cb20..4d1ba190c14 100644
--- a/source/blender/blenlib/intern/expr_pylike_eval.c
+++ b/source/blender/blenlib/intern/expr_pylike_eval.c
@@ -569,7 +569,7 @@ static int opcode_arg_count(eOpCode code)
     case OPCODE_FUNC3:
       return 3;
     default:
-      BLI_assert(!"unexpected opcode");
+      BLI_assert_msg(0, "unexpected opcode");
       return -1;
   }
 }
diff --git a/source/blender/blenlib/intern/fileops.c b/source/blender/blenlib/intern/fileops.c
index 107c27da6a2..1a00142ddb1 100644
--- a/source/blender/blenlib/intern/fileops.c
+++ b/source/blender/blenlib/intern/fileops.c
@@ -64,7 +64,7 @@
 #if 0 /* UNUSED */
 /* gzip the file in from and write it to "to".
  * return -1 if zlib fails, -2 if the originating file does not exist
- * note: will remove the "from" file
+ * NOTE: will remove the "from" file
  */
 int BLI_file_gzip(const char *from, const char *to)
 {
@@ -355,7 +355,7 @@ void *BLI_gzopen(const char *filename, const char *mode)
 
   BLI_assert(!BLI_path_is_rel(filename));
 
-  /* xxx Creates file before transcribing the path */
+  /* XXX: Creates file before transcribing the path. */
   if (mode[0] == 'w') {
     FILE *file = ufopen(filename, "a");
     if (file == NULL) {
diff --git a/source/blender/blenlib/intern/kdtree_impl.h b/source/blender/blenlib/intern/kdtree_impl.h
index 2aec3ce082a..0c9de0aa128 100644
--- a/source/blender/blenlib/intern/kdtree_impl.h
+++ b/source/blender/blenlib/intern/kdtree_impl.h
@@ -132,7 +132,7 @@ void BLI_kdtree_nd_(insert)(KDTree *tree, int index, const float co[KD_DIMS])
   BLI_assert(tree->nodes_len <= tree->nodes_len_capacity);
 #endif
 
-  /* note, array isn't calloc'd,
+  /* NOTE: array isn't calloc'd,
    * need to initialize all struct members */
 
   node->left = node->right = KD_NODE_UNSET;
diff --git a/source/blender/blenlib/intern/math_color.c b/source/blender/blenlib/intern/math_color.c
index 263c508c07c..da97e697f2f 100644
--- a/source/blender/blenlib/intern/math_color.c
+++ b/source/blender/blenlib/intern/math_color.c
@@ -153,7 +153,7 @@ void rgb_to_ycc(float r, float g, float b, float *r_y, float *r_cb, float *r_cr,
       cr = (0.5f * sr) - (0.41869f * sg) - (0.08131f * sb) + 128.0f;
       break;
     default:
-      BLI_assert(!"invalid colorspace");
+      BLI_assert_msg(0, "invalid colorspace");
       break;
   }
 
diff --git a/source/blender/blenlib/intern/math_geom.c b/source/blender/blenlib/intern/math_geom.c
index 3175bf116a0..80f0008c7eb 100644
--- a/source/blender/blenlib/intern/math_geom.c
+++ b/source/blender/blenlib/intern/math_geom.c
@@ -1756,8 +1756,8 @@ bool isect_ray_tri_v3(const float ray_origin[3],
                       float *r_lambda,
                       float r_uv[2])
 {
-  /* note: these values were 0.000001 in 2.4x but for projection snapping on
-   * a human head (1BU == 1m), subsurf level 2, this gave many errors - campbell */
+  /* NOTE(campbell): these values were 0.000001 in 2.4x but for projection snapping on
+   * a human head (1BU == 1m), subsurf level 2, this gave many errors. */
   const float epsilon = 0.00000001f;
   float p[3], s[3], e1[3], e2[3], q[3];
   float a, f, u, v;
@@ -3287,8 +3287,8 @@ bool isect_ray_aabb_v3(const struct IsectRayAABB_Precalc *data,
     tmin = tzmin;
   }
 
-  /* Note: tmax does not need to be updated since we don't use it
-   * keeping this here for future reference - jwilkins */
+  /* NOTE(jwilkins): tmax does not need to be updated since we don't use it
+   * keeping this here for future reference. */
   // if (tzmax < tmax) tmax = tzmax;
 
   if (tmin_out) {
@@ -3559,7 +3559,7 @@ static bool point_in_slice(const float p[3],
 
   sub_v3_v3v3(rp, p, v1);
   h = dot_v3v3(q, rp) / dot_v3v3(q, q);
-  /* note: when 'h' is nan/-nan, this check returns false
+  /* NOTE: when 'h' is nan/-nan, this check returns false
    * without explicit check - covering the degenerate case */
   return (h >= 0.0f && h <= 1.0f);
 }
@@ -4020,7 +4020,7 @@ void barycentric_weights_v2_persp(
 
 /**
  * same as #barycentric_weights_v2 but works with a quad,
- * note: untested for values outside the quad's bounds
+ * NOTE: untested for values outside the quad's bounds
  * this is #interp_weights_poly_v2 expanded for quads only
  */
 void barycentric_weights_v2_quad(const float v1[2],
@@ -4030,10 +4030,11 @@ void barycentric_weights_v2_quad(const float v1[2],
                                  const float co[2],
                                  float w[4])
 {
-  /* note: fabsf() here is not needed for convex quads (and not used in interp_weights_poly_v2).
-   * but in the case of concave/bow-tie quads for the mask rasterizer it gives unreliable results
-   * without adding absf(). If this becomes an issue for more general usage we could have
-   * this optional or use a different function - Campbell */
+  /* NOTE(campbell): fabsf() here is not needed for convex quads
+   * (and not used in #interp_weights_poly_v2).
+   * But in the case of concave/bow-tie quads for the mask rasterizer it
+   * gives unreliable results without adding absf(). If this becomes an issue for more general
+   * usage we could have this optional or use a different function. */
 #define MEAN_VALUE_HALF_TAN_V2(_area, i1, i2) \
   ((_area = cross_v2v2(dirs[i1], dirs[i2])) != 0.0f ? \
        fabsf(((lens[i1] * lens[i2]) - dot_v2v2(dirs[i1], dirs[i2])) / _area) : \
@@ -4820,7 +4821,7 @@ void orthographic_m4(float matrix[4][4],
   matrix[3][0] = -(right + left) / Xdelta;
   matrix[1][1] = 2.0f / Ydelta;
   matrix[3][1] = -(top + bottom) / Ydelta;
-  matrix[2][2] = -2.0f / Zdelta; /* note: negate Z */
+  matrix[2][2] = -2.0f / Zdelta; /* NOTE: negate Z. */
   matrix[3][2] = -(farClip + nearClip) / Zdelta;
 }
 
@@ -4844,7 +4845,7 @@ void perspective_m4(float mat[4][4],
   }
   mat[0][0] = nearClip * 2.0f / Xdelta;
   mat[1][1] = nearClip * 2.0f / Ydelta;
-  mat[2][0] = (right + left) / Xdelta; /* note: negate Z */
+  mat[2][0] = (right + left) / Xdelta; /* NOTE: negate Z. */
   mat[2][1] = (top + bottom) / Ydelta;
   mat[2][2] = -(farClip + nearClip) / Zdelta;
   mat[2][3] = -1.0f;
diff --git a/source/blender/blenlib/intern/math_interp.c b/source/blender/blenlib/intern/math_interp.c
index 163a3ab5fe3..04fae6a0e68 100644
--- a/source/blender/blenlib/intern/math_interp.c
+++ b/source/blender/blenlib/intern/math_interp.c
@@ -655,7 +655,7 @@ void BLI_ewa_filter(const int width,
   v2 = (int)(ceilf(V0 + ve));
 
   /* sane clamping to avoid unnecessarily huge loops */
-  /* note: if eccentricity gets clamped (see above),
+  /* NOTE: if eccentricity gets clamped (see above),
    * the ue/ve limits can also be lowered accordingly
    */
   if (U0 - (float)u1 > EWA_MAXIDX) {
diff --git a/source/blender/blenlib/intern/math_matrix.c b/source/blender/blenlib/intern/math_matrix.c
index 5eb0125062d..5920788821c 100644
--- a/source/blender/blenlib/intern/math_matrix.c
+++ b/source/blender/blenlib/intern/math_matrix.c
@@ -1290,6 +1290,9 @@ bool invert_m4_m4(float inverse[4][4], const float mat[4][4])
  * Combines transformations, handling scale separately in a manner equivalent
  * to the Aligned Inherit Scale mode, in order to avoid creating shear.
  * If A scale is uniform, the result is equivalent to ordinary multiplication.
+ *
+ * NOTE: this effectively takes output location from simple multiplication,
+ *       and uses mul_m4_m4m4_split_channels for rotation and scale.
  */
 void mul_m4_m4m4_aligned_scale(float R[4][4], const float A[4][4], const float B[4][4])
 {
@@ -1307,6 +1310,25 @@ void mul_m4_m4m4_aligned_scale(float R[4][4], const float A[4][4], const float B
   loc_rot_size_to_mat4(R, loc_r, rot_r, size_r);
 }
 
+/**
+ * Decompose \a A and \a B into location, rotation and scale,
+ * combine each channel on its own and rebuild \a R from the results.
+ */
+void mul_m4_m4m4_split_channels(float R[4][4], const float A[4][4], const float B[4][4])
+{
+  float a_loc[3], a_rot[3][3], a_size[3];
+  float b_loc[3], b_rot[3][3], b_size[3];
+  mat4_to_loc_rot_size(a_loc, a_rot, a_size, A);
+  mat4_to_loc_rot_size(b_loc, b_rot, b_size, B);
+
+  /* Every output channel only reads the matching channel of both inputs. */
+  float out_loc[3], out_rot[3][3], out_size[3];
+  mul_v3_v3v3(out_size, a_size, b_size);
+  mul_m3_m3m3_uniq(out_rot, a_rot, b_rot);
+  add_v3_v3v3(out_loc, a_loc, b_loc);
+  loc_rot_size_to_mat4(R, out_loc, out_rot, out_size);
+}
+
 /****************************** Linear Algebra *******************************/
 
 void transpose_m3(float R[3][3])
@@ -2252,8 +2274,8 @@ void mat4_to_loc_quat(float loc[3], float quat[4], const float wmat[4][4])
   copy_m3_m4(mat3, wmat);
   normalize_m3_m3(mat3_n, mat3);
 
-  /* so scale doesn't interfere with rotation T24291. */
-  /* note: this is a workaround for negative matrix not working for rotation conversion, FIXME */
+  /* So scale doesn't interfere with rotation T24291. */
+  /* FIXME: this is a workaround for negative matrix not working for rotation conversion. */
   if (is_negative_m3(mat3)) {
     negate_m3(mat3_n);
   }
diff --git a/source/blender/blenlib/intern/math_rotation.c b/source/blender/blenlib/intern/math_rotation.c
index 52737de227b..34baac6f2a4 100644
--- a/source/blender/blenlib/intern/math_rotation.c
+++ b/source/blender/blenlib/intern/math_rotation.c
@@ -167,7 +167,7 @@ void invert_qt_qt_normalized(float q1[4], const float q2[4])
   invert_qt_normalized(q1);
 }
 
-/* simple mult */
+/* Simple multiply. */
 void mul_qt_fl(float q[4], const float f)
 {
   q[0] *= f;
@@ -373,7 +373,7 @@ void mat3_normalized_to_quat(float q[4], const float mat[3][3])
       q[2] = (mat[2][1] + mat[1][2]) * s;
     }
 
-    /* Make sure w is nonnegative for a canonical result. */
+    /* Make sure W is non-negative for a canonical result. */
     if (q[0] < 0) {
       negate_v4(q);
     }
@@ -511,7 +511,7 @@ void rotation_between_vecs_to_mat3(float m[3][3], const float v1[3], const float
   }
 }
 
-/* note: expects vectors to be normalized */
+/* NOTE: expects vectors to be normalized. */
 void rotation_between_vecs_to_quat(float q[4], const float v1[3], const float v2[3])
 {
   float axis[3];
diff --git a/source/blender/blenlib/intern/mesh_boolean.cc b/source/blender/blenlib/intern/mesh_boolean.cc
index 9f7824a0029..8b8850c7cdb 100644
--- a/source/blender/blenlib/intern/mesh_boolean.cc
+++ b/source/blender/blenlib/intern/mesh_boolean.cc
@@ -21,6 +21,7 @@
 #ifdef WITH_GMP
 
 #  include <algorithm>
+#  include <atomic>
 #  include <fstream>
 #  include <iostream>
 
@@ -50,6 +51,7 @@
 #  include "BLI_mesh_boolean.hh"
 
 #  ifdef WITH_TBB
+#    include "tbb/parallel_reduce.h"
 #    include "tbb/spin_mutex.h"
 #  endif
 
@@ -201,9 +203,14 @@ TriMeshTopology::TriMeshTopology(const IMesh &tm)
         BLI_assert(edges != nullptr);
       }
       edges->append_non_duplicates(e);
-      auto createf = [t](Vector<int> **pvec) { *pvec = new Vector<int>{t}; };
-      auto modifyf = [t](Vector<int> **pvec) { (*pvec)->append_non_duplicates(t); };
-      this->edge_tri_.add_or_modify(Edge(v, vnext), createf, modifyf);
+
+      auto p = edge_tri_.lookup_ptr(Edge(v, vnext));
+      if (p == nullptr) {
+        edge_tri_.add_new(e, new Vector<int>{t});
+      }
+      else {
+        (*p)->append_non_duplicates(t);
+      }
     }
   }
   /* Debugging. */
@@ -228,9 +235,18 @@ TriMeshTopology::TriMeshTopology(const IMesh &tm)
 
 TriMeshTopology::~TriMeshTopology()
 {
-  for (const Vector<int> *vec : edge_tri_.values()) {
-    delete vec;
+  Vector<Vector<int> *> values;
+
+  /* Deleting is faster in parallel, so it is worth gathering the pointers into an array first. */
+  for (auto item : edge_tri_.values()) {
+    values.append(item);
   }
+
+  threading::parallel_for(values.index_range(), 256, [&](IndexRange range) {
+    for (int i : range) {
+      delete values[i];
+    }
+  });
 }
 
 /** A Patch is a maximal set of triangles that share manifold edges only. */
@@ -719,6 +735,18 @@ static PatchesInfo find_patches(const IMesh &tm, const TriMeshTopology &tmtopo)
   PatchesInfo pinfo(ntri);
   /* Algorithm: Grow patches across manifold edges as long as there are unassigned triangles. */
   Stack<int> cur_patch_grow;
+
+  /* Create an Array containing indices of adjacent faces. */
+  Array<std::array<int, 3>> t_others(tm.face_size());
+  threading::parallel_for(tm.face_index_range(), 2048, [&](IndexRange range) {
+    for (int t : range) {
+      const Face &tri = *tm.face(t);
+      for (int i = 0; i < 3; ++i) {
+        Edge e(tri[i], tri[(i + 1) % 3]);
+        t_others[t][i] = tmtopo.other_tri_if_manifold(e, t);
+      }
+    }
+  });
   for (int t : tm.face_index_range()) {
     if (pinfo.tri_patch(t) == -1) {
       cur_patch_grow.push(t);
@@ -739,7 +767,7 @@ static PatchesInfo find_patches(const IMesh &tm, const TriMeshTopology &tmtopo)
         const Face &tri = *tm.face(tcand);
         for (int i = 0; i < 3; ++i) {
           Edge e(tri[i], tri[(i + 1) % 3]);
-          int t_other = tmtopo.other_tri_if_manifold(e, tcand);
+          int t_other = t_others[tcand][i];
           if (dbg_level > 1) {
             std::cout << "  edge " << e << " generates t_other=" << t_other << "\n";
           }
@@ -953,12 +981,8 @@ static void sort_by_signed_triangle_index(Vector<int> &g,
  * To accommodate this:
  * If extra_tri is non-null, then an index of EXTRA_TRI_INDEX should use it for the triangle.
  */
-static Array<int> sort_tris_around_edge(const IMesh &tm,
-                                        const TriMeshTopology &tmtopo,
-                                        const Edge e,
-                                        const Span<int> tris,
-                                        const int t0,
-                                        const Face *extra_tri)
+static Array<int> sort_tris_around_edge(
+    const IMesh &tm, const Edge e, const Span<int> tris, const int t0, const Face *extra_tri)
 {
   /* Divide and conquer, quick-sort-like sort.
    * Pick a triangle t0, then partition into groups:
@@ -1023,14 +1047,14 @@ static Array<int> sort_tris_around_edge(const IMesh &tm,
     }
   }
   if (g3.size() > 1) {
-    Array<int> g3sorted = sort_tris_around_edge(tm, tmtopo, e, g3, t0, extra_tri);
+    Array<int> g3sorted = sort_tris_around_edge(tm, e, g3, t0, extra_tri);
     std::copy(g3sorted.begin(), g3sorted.end(), g3.begin());
     if (dbg_level > 1) {
       std::cout << "g3 sorted: " << g3 << "\n";
     }
   }
   if (g4.size() > 1) {
-    Array<int> g4sorted = sort_tris_around_edge(tm, tmtopo, e, g4, t0, extra_tri);
+    Array<int> g4sorted = sort_tris_around_edge(tm, e, g4, t0, extra_tri);
     std::copy(g4sorted.begin(), g4sorted.end(), g4.begin());
     if (dbg_level > 1) {
       std::cout << "g4 sorted: " << g4 << "\n";
@@ -1076,7 +1100,7 @@ static void find_cells_from_edge(const IMesh &tm,
   const Vector<int> *edge_tris = tmtopo.edge_tris(e);
   BLI_assert(edge_tris != nullptr);
   Array<int> sorted_tris = sort_tris_around_edge(
-      tm, tmtopo, e, Span<int>(*edge_tris), (*edge_tris)[0], nullptr);
+      tm, e, Span<int>(*edge_tris), (*edge_tris)[0], nullptr);
 
   int n_edge_tris = edge_tris->size();
   Array<int> edge_patches(n_edge_tris);
@@ -1338,34 +1362,46 @@ static bool patch_cell_graph_ok(const CellsInfo &cinfo, const PatchesInfo &pinfo
 static bool is_pwn(const IMesh &tm, const TriMeshTopology &tmtopo)
 {
   constexpr int dbg_level = 0;
+  std::atomic<bool> is_pwn = true; /* Cleared from inside the `parallel_for` callback. */
+  Vector<std::pair<Edge, Vector<int> *>> tris; /* Indexable copy of the edge -> tris items. */
+
   for (auto item : tmtopo.edge_tri_map_items()) {
-    const Edge &edge = item.key;
-    int tot_orient = 0;
-    /* For each face t attached to edge, add +1 if the edge
-     * is positively in t, and -1 if negatively in t. */
-    for (int t : *item.value) {
-      const Face &face = *tm.face(t);
-      BLI_assert(face.size() == 3);
-      for (int i : face.index_range()) {
-        if (face[i] == edge.v0()) {
-          if (face[(i + 1) % 3] == edge.v1()) {
-            ++tot_orient;
-          }
-          else {
-            BLI_assert(face[(i + 3 - 1) % 3] == edge.v1());
-            --tot_orient;
+    tris.append(std::pair<Edge, Vector<int> *>(item.key, item.value));
+  }
+
+  threading::parallel_for(tris.index_range(), 2048, [&](IndexRange range) {
+    for (int j : range) {
+      const Edge &edge = tris[j].first;
+      int tot_orient = 0;
+      /* For each face t attached to edge, add +1 if the edge
+       * is positively in t, and -1 if negatively in t. */
+      for (int t : *tris[j].second) {
+        const Face &face = *tm.face(t);
+        BLI_assert(face.size() == 3);
+        for (int i : face.index_range()) {
+          if (face[i] == edge.v0()) {
+            if (face[(i + 1) % 3] == edge.v1()) {
+              ++tot_orient;
+            }
+            else {
+              BLI_assert(face[(i + 3 - 1) % 3] == edge.v1());
+              --tot_orient;
+            }
           }
         }
       }
-    }
-    if (tot_orient != 0) {
-      if (dbg_level > 0) {
-        std::cout << "edge causing non-pwn: " << edge << "\n";
+      if (tot_orient != 0) {
+        if (dbg_level > 0) {
+          std::cout << "edge causing non-pwn: " << edge << "\n";
+        }
+        is_pwn = false;
+#  ifdef WITH_TBB
+        tbb::task::self().cancel_group_execution(); /* No need to test remaining edges. */
+#  endif
       }
-      return false;
     }
-  }
-  return true;
+  });
+  return is_pwn.load();
 }
 
 /**
@@ -1396,8 +1432,7 @@ static int find_cell_for_point_near_edge(mpq3 p,
   Array<int> edge_tris(etris->size() + 1);
   std::copy(etris->begin(), etris->end(), edge_tris.begin());
   edge_tris[edge_tris.size() - 1] = EXTRA_TRI_INDEX;
-  Array<int> sorted_tris = sort_tris_around_edge(
-      tm, tmtopo, e, edge_tris, edge_tris[0], dummy_tri);
+  Array<int> sorted_tris = sort_tris_around_edge(tm, e, edge_tris, edge_tris[0], dummy_tri);
   if (dbg_level > 0) {
     std::cout << "sorted tris = " << sorted_tris << "\n";
   }
@@ -1452,39 +1487,66 @@ static int find_ambient_cell(const IMesh &tm,
   /* First find a vertex with the maximum x value. */
   /* Prefer not to populate the verts in the #IMesh just for this. */
   const Vert *v_extreme;
-  mpq_class extreme_x;
+  auto max_x_vert = [](const Vert *a, const Vert *b) {
+    return (a->co_exact.x > b->co_exact.x) ? a : b;
+  };
   if (component_patches == nullptr) {
-    v_extreme = (*tm.face(0))[0];
-    extreme_x = v_extreme->co_exact.x;
-    for (const Face *f : tm.faces()) {
-      for (const Vert *v : *f) {
-        const mpq_class &x = v->co_exact.x;
-        if (x > extreme_x) {
-          v_extreme = v;
-          extreme_x = x;
-        }
-      }
-    }
+    v_extreme = threading::parallel_reduce(
+        tm.face_index_range(),
+        2048,
+        (*tm.face(0))[0],
+        [&](IndexRange range, const Vert *init) {
+          const Vert *ans = init;
+          for (int i : range) {
+            const Face *f = tm.face(i);
+            for (const Vert *v : *f) {
+              if (v->co_exact.x > ans->co_exact.x) {
+                ans = v;
+              }
+            }
+          }
+          return ans;
+        },
+        max_x_vert);
   }
   else {
     if (dbg_level > 0) {
       std::cout << "restrict to patches " << *component_patches << "\n";
     }
     int p0 = (*component_patches)[0];
-    v_extreme = (*tm.face(pinfo.patch(p0).tri(0)))[0];
-    extreme_x = v_extreme->co_exact.x;
-    for (int p : *component_patches) {
-      for (int t : pinfo.patch(p).tris()) {
-        const Face *f = tm.face(t);
-        for (const Vert *v : *f) {
-          const mpq_class &x = v->co_exact.x;
-          if (x > extreme_x) {
-            v_extreme = v;
-            extreme_x = x;
+    v_extreme = threading::parallel_reduce(
+        component_patches->index_range(),
+        2048,
+        (*tm.face(pinfo.patch(p0).tri(0)))[0],
+        [&](IndexRange range, const Vert *init) {
+          const Vert *ans = init;
+          for (int pi : range) {
+            int p = (*component_patches)[pi];
+            const Vert *tris_ans = threading::parallel_reduce(
+                IndexRange(pinfo.patch(p).tot_tri()),
+                2048,
+                init,
+                [&](IndexRange tris_range, const Vert *t_init) {
+                  const Vert *v_ans = t_init;
+                  for (int i : tris_range) {
+                    int t = pinfo.patch(p).tri(i);
+                    const Face *f = tm.face(t);
+                    for (const Vert *v : *f) {
+                      if (v->co_exact.x > v_ans->co_exact.x) {
+                        v_ans = v;
+                      }
+                    }
+                  }
+                  return v_ans;
+                },
+                max_x_vert);
+            if (tris_ans->co_exact.x > ans->co_exact.x) {
+              ans = tris_ans;
+            }
           }
-        }
-      }
-    }
+          return ans;
+        },
+        max_x_vert);
   }
   if (dbg_level > 0) {
     std::cout << "v_extreme = " << v_extreme << "\n";
@@ -1493,7 +1555,8 @@ static int find_ambient_cell(const IMesh &tm,
    * when projected onto the XY plane. That edge is guaranteed to
    * be on the convex hull of the mesh. */
   const Vector<Edge> &edges = tmtopo.vert_edges(v_extreme);
-  const mpq_class extreme_y = v_extreme->co_exact.y;
+  const mpq_class &extreme_x = v_extreme->co_exact.x;
+  const mpq_class &extreme_y = v_extreme->co_exact.y;
   Edge ehull;
   mpq_class max_abs_slope = -1;
   for (Edge e : edges) {
@@ -1514,8 +1577,8 @@ static int find_ambient_cell(const IMesh &tm,
   if (dbg_level > 0) {
     std::cout << "ehull = " << ehull << " slope = " << max_abs_slope << "\n";
   }
-  /* Sort triangles around ehull, including a dummy triangle that include a known point in ambient
-   * cell. */
+  /* Sort triangles around ehull, including a dummy triangle that includes a known point in
+   * the ambient cell. */
   mpq3 p_in_ambient = v_extreme->co_exact;
   p_in_ambient.x += 1;
   int c_ambient = find_cell_for_point_near_edge(p_in_ambient, ehull, tm, tmtopo, pinfo, arena);
@@ -2816,7 +2879,8 @@ static IMesh raycast_patches_boolean(const IMesh &tm,
 }
 /**
  * If \a tri1 and \a tri2 have a common edge (in opposite orientation),
- * return the indices into \a tri1 and \a tri2 where that common edge starts. Else return (-1,-1).
+ * return the indices into \a tri1 and \a tri2 where that common edge starts. Else return
+ * (-1,-1).
  */
 static std::pair<int, int> find_tris_common_edge(const Face &tri1, const Face &tri2)
 {
@@ -3378,8 +3442,8 @@ static void dissolve_verts(IMesh *imesh, const Array<bool> dissolve, IMeshArena
  * will have an original edge that is NO_INDEX.
  * Not all triangulation edges can be removed: if they ended up non-trivially overlapping a real
  * input edge, then we need to keep it. Also, some are necessary to make the output satisfy
- * the "valid #BMesh" property: we can't produce output faces that have repeated vertices in them,
- * or have several disconnected boundaries (e.g., faces with holes).
+ * the "valid #BMesh" property: we can't produce output faces that have repeated vertices in
+ * them, or have several disconnected boundaries (e.g., faces with holes).
  */
 static IMesh polymesh_from_trimesh_with_dissolve(const IMesh &tm_out,
                                                  const IMesh &imesh_in,
diff --git a/source/blender/blenlib/intern/mesh_intersect.cc b/source/blender/blenlib/intern/mesh_intersect.cc
index 988988179fd..f91dd762e70 100644
--- a/source/blender/blenlib/intern/mesh_intersect.cc
+++ b/source/blender/blenlib/intern/mesh_intersect.cc
@@ -43,6 +43,7 @@
 #  include "BLI_set.hh"
 #  include "BLI_span.hh"
 #  include "BLI_task.h"
+#  include "BLI_task.hh"
 #  include "BLI_threads.h"
 #  include "BLI_vector.hh"
 #  include "BLI_vector_set.hh"
@@ -51,6 +52,10 @@
 
 #  include "BLI_mesh_intersect.hh"
 
+#  ifdef WITH_TBB
+#    include "tbb/parallel_sort.h"
+#  endif
+
 // #  define PERFDEBUG
 
 namespace blender::meshintersect {
@@ -406,6 +411,11 @@ class IMeshArena::IMeshArenaImpl : NonCopyable, NonMovable {
     return add_or_find_vert(mco, co, orig);
   }
 
+  const Vert *add_or_find_vert(Vert *vert)
+  {
+    return add_or_find_vert_(vert);
+  }
+
   Face *add_face(Span<const Vert *> verts, int orig, Span<int> edge_origs, Span<bool> is_intersect)
   {
     Face *f = new Face(verts, next_face_id_++, orig, edge_origs, is_intersect);
@@ -486,10 +496,9 @@ class IMeshArena::IMeshArenaImpl : NonCopyable, NonMovable {
  private:
   const Vert *add_or_find_vert(const mpq3 &mco, const double3 &dco, int orig)
   {
-    /* Don't allocate Vert yet, in case it is already there. */
-    Vert vtry(mco, dco, NO_INDEX, NO_INDEX);
+    Vert *vtry = new Vert(mco, dco, NO_INDEX, NO_INDEX);
     const Vert *ans;
-    VSetKey vskey(&vtry);
+    VSetKey vskey(vtry);
     if (intersect_use_threading) {
 #  ifdef USE_SPINLOCK
       BLI_spin_lock(&lock_);
@@ -499,7 +508,9 @@ class IMeshArena::IMeshArenaImpl : NonCopyable, NonMovable {
     }
     const VSetKey *lookup = vset_.lookup_key_ptr(vskey);
     if (!lookup) {
-      vskey.vert = new Vert(mco, dco, next_vert_id_++, orig);
+      vtry->id = next_vert_id_++;
+      vtry->orig = orig;
+      vskey.vert = vtry;  // new Vert(mco, dco, next_vert_id_++, orig);
       vset_.add_new(vskey);
       allocated_verts_.append(std::unique_ptr<Vert>(vskey.vert));
       ans = vskey.vert;
@@ -510,6 +521,45 @@ class IMeshArena::IMeshArenaImpl : NonCopyable, NonMovable {
        * This is the intended semantics: if the Vert already
        * exists then we are merging verts and using the first-seen
        * one as the canonical one. */
+      delete vtry;
+      ans = lookup->vert;
+    }
+    if (intersect_use_threading) {
+#  ifdef USE_SPINLOCK
+      BLI_spin_unlock(&lock_);
+#  else
+      BLI_mutex_unlock(mutex_);
+#  endif
+    }
+    return ans;
+  };
+
+  const Vert *add_or_find_vert_(Vert *vtry)
+  {
+    const Vert *ans;
+    VSetKey vskey(vtry);
+    if (intersect_use_threading) {
+#  ifdef USE_SPINLOCK
+      BLI_spin_lock(&lock_);
+#  else
+      BLI_mutex_lock(mutex_);
+#  endif
+    }
+    const VSetKey *lookup = vset_.lookup_key_ptr(vskey);
+    if (!lookup) {
+      vtry->id = next_vert_id_++;
+      vskey.vert = vtry;  // new Vert(mco, dco, next_vert_id_++, orig);
+      vset_.add_new(vskey);
+      allocated_verts_.append(std::unique_ptr<Vert>(vskey.vert));
+      ans = vskey.vert;
+    }
+    else {
+      /* It was a duplicate, so return the existing one.
+       * Note that the returned Vert may have a different orig.
+       * This is the intended semantics: if the Vert already
+       * exists then we are merging verts and using the first-seen
+       * one as the canonical one. */
+      delete vtry;
       ans = lookup->vert;
     }
     if (intersect_use_threading) {
@@ -550,6 +600,11 @@ const Vert *IMeshArena::add_or_find_vert(const mpq3 &co, int orig)
   return pimpl_->add_or_find_vert(co, orig);
 }
 
+const Vert *IMeshArena::add_or_find_vert(Vert *vert)
+{
+  return pimpl_->add_or_find_vert(vert);
+}
+
 Face *IMeshArena::add_face(Span<const Vert *> verts,
                            int orig,
                            Span<int> edge_origs,
@@ -633,7 +688,11 @@ void IMesh::populate_vert(int max_verts)
    * TODO: when all debugged, set fix_order = false. */
   const bool fix_order = true;
   if (fix_order) {
+#  ifdef WITH_TBB
+    tbb::parallel_sort(vert_.begin(), vert_.end(), [](const Vert *a, const Vert *b) {
+#  else
     std::sort(vert_.begin(), vert_.end(), [](const Vert *a, const Vert *b) {
+#  endif
       if (a->orig != NO_INDEX && b->orig != NO_INDEX) {
         return a->orig < b->orig;
       }
@@ -1037,7 +1096,7 @@ static mpq2 project_3d_to_2d(const mpq3 &p3d, int proj_axis)
  * So the sign of E is the same as the sign of E_exact if
  *    |E| > supremum(E) * index(E) * DBL_EPSILON
  *
- * Note: a possible speedup would be to have a simple function
+ * NOTE: a possible speedup would be to have a simple function
  * that calculates the error bound if one knows that all values
  * are less than some global maximum - most of the function would
  * be calculated ahead of time. The global max could be passed
@@ -1918,9 +1977,22 @@ static Face *cdt_tri_as_imesh_face(
   return facep;
 }
 
+/* Like BLI_math's is_quad_flip_v3_first_third_fast_with_normal, with const double3's. */
+static bool is_quad_flip_first_third(const double3 &v1,
+                                     const double3 &v2,
+                                     const double3 &v3,
+                                     const double3 &v4,
+                                     const double3 &normal)
+{
+  double3 dir_v3v1 = v3 - v1;
+  double3 tangent = double3::cross_high_precision(dir_v3v1, normal);
+  double dot = double3::dot(v1, tangent);
+  return (double3::dot(v4, tangent) >= dot) || (double3::dot(v2, tangent) <= dot);
+}
+
 /**
  * Tessellate face f into triangles and return an array of `const Face *`
- * giving that triangulation. Intended to be used when f has > 4 vertices.
+ * giving that triangulation. Intended to be used when f has >= 4 vertices.
  * Care is taken so that the original edge index associated with
  * each edge in the output triangles either matches the original edge
  * for the (identical) edge of f, or else is -1. So diagonals added
@@ -1932,21 +2004,40 @@ static Face *cdt_tri_as_imesh_face(
  */
 static Array<Face *> polyfill_triangulate_poly(Face *f, IMeshArena *arena)
 {
-  /* Similar to loop body in BM_mesh_calc_tesselation. */
+  /* Similar to loop body in #BM_mesh_calc_tessellation. */
   int flen = f->size();
-  BLI_assert(flen > 4);
+  BLI_assert(flen >= 4);
   if (!f->plane_populated()) {
     f->populate_plane(false);
   }
-  /* Project along negative face normal so (x,y) can be used in 2d. */
   const double3 &poly_normal = f->plane->norm;
   float no[3] = {float(poly_normal[0]), float(poly_normal[1]), float(poly_normal[2])};
   normalize_v3(no);
-  float axis_mat[3][3];
+  if (flen == 4) {
+    const Vert *v0 = (*f)[0];
+    const Vert *v1 = (*f)[1];
+    const Vert *v2 = (*f)[2];
+    const Vert *v3 = (*f)[3];
+    int eo_01 = f->edge_orig[0];
+    int eo_12 = f->edge_orig[1];
+    int eo_23 = f->edge_orig[2];
+    int eo_30 = f->edge_orig[3];
+    Face *f0, *f1;
+    if (UNLIKELY(is_quad_flip_first_third(v0->co, v1->co, v2->co, v3->co, f->plane->norm))) {
+      f0 = arena->add_face({v0, v1, v3}, f->orig, {eo_01, -1, eo_30}, {false, false, false});
+      f1 = arena->add_face({v1, v2, v3}, f->orig, {eo_12, eo_23, -1}, {false, false, false});
+    }
+    else {
+      f0 = arena->add_face({v0, v1, v2}, f->orig, {eo_01, eo_12, -1}, {false, false, false});
+      f1 = arena->add_face({v0, v2, v3}, f->orig, {-1, eo_23, eo_30}, {false, false, false});
+    }
+    return Array<Face *>{f0, f1};
+  }
+  /* Project along negative face normal so (x,y) can be used in 2d. */ float axis_mat[3][3];
   float(*projverts)[2];
   unsigned int(*tris)[3];
   const int totfilltri = flen - 2;
-  /* Prepare projected vertices and array to receive triangles in tesselation. */
+  /* Prepare projected vertices and array to receive triangles in tessellation. */
   tris = static_cast<unsigned int(*)[3]>(MEM_malloc_arrayN(totfilltri, sizeof(*tris), __func__));
   projverts = static_cast<float(*)[2]>(MEM_malloc_arrayN(flen, sizeof(*projverts), __func__));
   axis_dominant_v3_to_m3_negate(axis_mat, no);
@@ -1956,7 +2047,7 @@ static Array<Face *> polyfill_triangulate_poly(Face *f, IMeshArena *arena)
     mul_v2_m3v3(projverts[j], axis_mat, co);
   }
   BLI_polyfill_calc(projverts, flen, 1, tris);
-  /* Put tesselation triangles into Face form. Record original edges where they exist. */
+  /* Put tessellation triangles into Face form. Record original edges where they exist. */
   Array<Face *> ans(totfilltri);
   for (int t = 0; t < totfilltri; ++t) {
     unsigned int *tri = tris[t];
@@ -1986,11 +2077,7 @@ static Array<Face *> polyfill_triangulate_poly(Face *f, IMeshArena *arena)
 
 /**
  * Tessellate face f into triangles and return an array of `const Face *`
- * giving that triangulation.
- * Care is taken so that the original edge index associated with
- * each edge in the output triangles either matches the original edge
- * for the (identical) edge of f, or else is -1. So diagonals added
- * for triangulation can later be identified by having #NO_INDEX for original.
+ * giving that triangulation, using an exact triangulation method.
  *
  * The method used is to use the CDT triangulation. Usually that triangulation
  * will only use the existing vertices. However, if the face self-intersects
@@ -2003,7 +2090,7 @@ static Array<Face *> polyfill_triangulate_poly(Face *f, IMeshArena *arena)
  * is by far the usual case, we need to know if the quad is convex when
  * projected before doing so, and that takes a fair amount of computation by itself.
  */
-static Array<Face *> triangulate_poly(Face *f, IMeshArena *arena)
+static Array<Face *> exact_triangulate_poly(Face *f, IMeshArena *arena)
 {
   int flen = f->size();
   CDT_input<mpq_class> cdt_in;
@@ -2086,6 +2173,68 @@ static Array<Face *> triangulate_poly(Face *f, IMeshArena *arena)
   return ans;
 }
 
+static bool face_is_degenerate(const Face *f)
+{
+  const Face &face = *f;
+  const Vert *v0 = face[0];
+  const Vert *v1 = face[1];
+  const Vert *v2 = face[2];
+  if (v0 == v1 || v0 == v2 || v1 == v2) {
+    return true;
+  }
+  double3 da = v2->co - v0->co;
+  double3 db = v2->co - v1->co;
+  double3 dab = double3::cross_high_precision(da, db);
+  double dab_length_squared = dab.length_squared();
+  double err_bound = supremum_dot_cross(dab, dab) * index_dot_cross * DBL_EPSILON;
+  if (dab_length_squared > err_bound) {
+    return false;
+  }
+  mpq3 a = v2->co_exact - v0->co_exact;
+  mpq3 b = v2->co_exact - v1->co_exact;
+  mpq3 ab = mpq3::cross(a, b);
+  if (ab.x == 0 && ab.y == 0 && ab.z == 0) {
+    return true;
+  }
+
+  return false;
+}
+
+/** Fast check for degenerate tris. Only tests for when verts are identical,
+ * not cases where there are zero-length edges. */
+static bool any_degenerate_tris_fast(const Array<Face *> triangulation)
+{
+  for (const Face *f : triangulation) {
+    const Vert *v0 = (*f)[0];
+    const Vert *v1 = (*f)[1];
+    const Vert *v2 = (*f)[2];
+    if (v0 == v1 || v0 == v2 || v1 == v2) {
+      return true;
+    }
+  }
+  return false;
+}
+
+/**
+ * Tessellate face f into triangles and return an array of `const Face *`
+ * giving that triangulation.
+ * Care is taken so that the original edge index associated with
+ * each edge in the output triangles either matches the original edge
+ * for the (identical) edge of f, or else is -1. So diagonals added
+ * for triangulation can later be identified by having #NO_INDEX for original.
+ */
+static Array<Face *> triangulate_poly(Face *f, IMeshArena *arena)
+{
+  /* Try the much faster method using Blender's BLI_polyfill_calc. */
+  Array<Face *> ans = polyfill_triangulate_poly(f, arena);
+
+  /* This may create degenerate triangles. If so, try the exact CDT-based triangulator. */
+  if (any_degenerate_tris_fast(ans)) {
+    return exact_triangulate_poly(f, arena);
+  }
+  return ans;
+}
+
 /**
  * Return an #IMesh that is a triangulation of a mesh with general
  * polygonal faces, #IMesh.
@@ -2097,8 +2246,16 @@ IMesh triangulate_polymesh(IMesh &imesh, IMeshArena *arena)
   Vector<Face *> face_tris;
   constexpr int estimated_tris_per_face = 3;
   face_tris.reserve(estimated_tris_per_face * imesh.face_size());
+  threading::parallel_for(imesh.face_index_range(), 2048, [&](IndexRange range) {
+    for (int i : range) {
+      Face *f = imesh.face(i);
+      if (!f->plane_populated() && f->size() >= 4) {
+        f->populate_plane(false);
+      }
+    }
+  });
   for (Face *f : imesh.faces()) {
-    /* Tessellate face f, following plan similar to #BM_face_calc_tesselation. */
+    /* Tessellate face f, following plan similar to #BM_face_calc_tessellation. */
     int flen = f->size();
     if (flen == 3) {
       face_tris.append(f);
@@ -2188,12 +2345,22 @@ class TriOverlaps {
     if (two_trees_no_self) {
       tree_b_ = BLI_bvhtree_new(tm.face_size(), FLT_EPSILON, 8, 6);
     }
+
+    /* Create a Vector containing the shape of each face. */
+    Vector<int> shapes;
+    shapes.resize(tm.face_size());
+    threading::parallel_for(tm.face_index_range(), 2048, [&](IndexRange range) {
+      for (int t : range) {
+        shapes[t] = shape_fn(tm.face(t)->orig);
+      }
+    });
+
     float bbpts[6];
     for (int t : tm.face_index_range()) {
       const BoundingBox &bb = tri_bb[t];
       copy_v3_v3(bbpts, bb.min);
       copy_v3_v3(bbpts + 3, bb.max);
-      int shape = shape_fn(tm.face(t)->orig);
+      int shape = shapes[t];
       if (two_trees_no_self) {
         if (shape == 0) {
           BLI_bvhtree_insert(tree_, t, bbpts, 2);
@@ -2485,11 +2652,13 @@ static void calc_subdivided_non_cluster_tris(Array<IMesh> &r_tri_subdivided,
       0, overlap_tri_range_tot, &data, calc_subdivided_tri_range_func, &settings);
   /* Now have to put in the triangles that are the same as the input ones, and not in clusters.
    */
-  for (int t : tm.face_index_range()) {
-    if (r_tri_subdivided[t].face_size() == 0 && clinfo.tri_cluster(t) == NO_INDEX) {
-      r_tri_subdivided[t] = IMesh({tm.face(t)});
+  threading::parallel_for(tm.face_index_range(), 2048, [&](IndexRange range) {
+    for (int t : range) {
+      if (r_tri_subdivided[t].face_size() == 0 && clinfo.tri_cluster(t) == NO_INDEX) {
+        r_tri_subdivided[t] = IMesh({tm.face(t)});
+      }
     }
-  }
+  });
 }
 
 /**
@@ -2725,33 +2894,6 @@ static CoplanarClusterInfo find_clusters(const IMesh &tm,
   return ans;
 }
 
-static bool face_is_degenerate(const Face *f)
-{
-  const Face &face = *f;
-  const Vert *v0 = face[0];
-  const Vert *v1 = face[1];
-  const Vert *v2 = face[2];
-  if (v0 == v1 || v0 == v2 || v1 == v2) {
-    return true;
-  }
-  double3 da = v2->co - v0->co;
-  double3 db = v2->co - v1->co;
-  double3 dab = double3::cross_high_precision(da, db);
-  double dab_length_squared = dab.length_squared();
-  double err_bound = supremum_dot_cross(dab, dab) * index_dot_cross * DBL_EPSILON;
-  if (dab_length_squared > err_bound) {
-    return false;
-  }
-  mpq3 a = v2->co_exact - v0->co_exact;
-  mpq3 b = v2->co_exact - v1->co_exact;
-  mpq3 ab = mpq3::cross(a, b);
-  if (ab.x == 0 && ab.y == 0 && ab.z == 0) {
-    return true;
-  }
-
-  return false;
-}
-
 /* Data and functions to test triangle degeneracy in parallel. */
 struct DegenData {
   const IMesh &tm;
@@ -2873,11 +3015,15 @@ IMesh trimesh_nary_intersect(const IMesh &tm_in,
   double overlap_time = PIL_check_seconds_timer();
   std::cout << "intersect overlaps calculated, time = " << overlap_time - bb_calc_time << "\n";
 #  endif
-  for (int t : tm_clean->face_index_range()) {
-    if (tri_ov.first_overlap_index(t) != -1) {
-      tm_clean->face(t)->populate_plane(true);
+  Array<IMesh> tri_subdivided(tm_clean->face_size(), NoInitialization());
+  threading::parallel_for(tm_clean->face_index_range(), 1024, [&](IndexRange range) {
+    for (int t : range) {
+      if (tri_ov.first_overlap_index(t) != -1) {
+        tm_clean->face(t)->populate_plane(true);
+      }
+      new (static_cast<void *>(&tri_subdivided[t])) IMesh;
     }
-  }
+  });
 #  ifdef PERFDEBUG
   double plane_populate = PIL_check_seconds_timer();
   std::cout << "planes populated, time = " << plane_populate - overlap_time << "\n";
@@ -2902,7 +3048,6 @@ IMesh trimesh_nary_intersect(const IMesh &tm_in,
   doperfmax(1, clinfo.tot_cluster());
   doperfmax(2, tri_ov.overlap().size());
 #  endif
-  Array<IMesh> tri_subdivided(tm_clean->face_size());
   calc_subdivided_non_cluster_tris(tri_subdivided, *tm_clean, itt_map, clinfo, tri_ov, arena);
 #  ifdef PERFDEBUG
   double subdivided_tris_time = PIL_check_seconds_timer();
diff --git a/source/blender/blenlib/intern/noise.c b/source/blender/blenlib/intern/noise.c
index 8e28088c9fa..01aad5b078f 100644
--- a/source/blender/blenlib/intern/noise.c
+++ b/source/blender/blenlib/intern/noise.c
@@ -884,7 +884,7 @@ static float dist_Real(float x, float y, float z, float e)
   (void)e;
   return sqrtf(x * x + y * y + z * z);
 }
-/* manhattan/taxicab/cityblock distance */
+/* Manhattan/Taxi-Cab/City-Block distance. */
 static float dist_Manhattan(float x, float y, float z, float e)
 {
   (void)e;
diff --git a/source/blender/blenlib/intern/path_util.c b/source/blender/blenlib/intern/path_util.c
index f3c348b2b44..4d0dc43ed1e 100644
--- a/source/blender/blenlib/intern/path_util.c
+++ b/source/blender/blenlib/intern/path_util.c
@@ -235,13 +235,13 @@ void BLI_path_normalize(const char *relabase, char *path)
       memmove(path + a, eind, strlen(eind) + 1);
     }
     else {
-      /* support for odd paths: eg /../home/me --> /home/me
+      /* Support for odd paths: e.g. `/../home/me` --> `/home/me`
        * this is a valid path in blender but we can't handle this the usual way below
        * simply strip this prefix then evaluate the path as usual.
-       * pythons os.path.normpath() does this */
+       * Python's `os.path.normpath()` does this. */
 
-      /* Note: previous version of following call used an offset of 3 instead of 4,
-       * which meant that the "/../home/me" example actually became "home/me".
+      /* NOTE: previous version of following call used an offset of 4 instead of 3,
+       * which meant that the `/../home/me` example actually became `home/me`.
        * Using offset of 3 gives behavior consistent with the aforementioned
        * Python routine. */
       memmove(path, path + 3, strlen(path + 3) + 1);
@@ -1070,8 +1070,8 @@ bool BLI_path_abs(char *path, const char *basepath)
    * paths relative to the .blend file -elubie */
   BLI_str_replace_char(tmp + BLI_path_unc_prefix_len(tmp), '\\', '/');
 
-  /* Paths starting with // will get the blend file as their base,
-   * this isn't standard in any os but is used in blender all over the place */
+  /* Paths starting with `//` will get the blend file as their base,
+   * this isn't standard in any OS but is used in blender all over the place. */
   if (wasrelative) {
     const char *lslash;
     BLI_strncpy(base, basepath, sizeof(base));
@@ -1275,7 +1275,7 @@ void BLI_setenv(const char *env, const char *val)
 {
   /* free windows */
 
-#if (defined(WIN32) || defined(WIN64))
+#if (defined(_WIN32) || defined(_WIN64))
   uputenv(env, val);
 
 #else
diff --git a/source/blender/blenlib/intern/polyfill_2d.c b/source/blender/blenlib/intern/polyfill_2d.c
index dadef979b09..817572ba85c 100644
--- a/source/blender/blenlib/intern/polyfill_2d.c
+++ b/source/blender/blenlib/intern/polyfill_2d.c
@@ -719,7 +719,7 @@ static bool pf_ear_tip_check(PolyFill *pf, PolyIndex *pi_ear_tip)
        * the area sign will be positive if the point is strictly inside.
        * It will be 0 on the edge, which we want to include as well. */
 
-      /* note: check (v3, v1) first since it fails _far_ more often than the other 2 checks
+      /* NOTE: check (v3, v1) first since it fails _far_ more often than the other 2 checks
        * (those fail equally).
        * It's logical - the chance is low that points exist on the
        * same side as the ear we're clipping off. */
diff --git a/source/blender/blenlib/intern/polyfill_2d_beautify.c b/source/blender/blenlib/intern/polyfill_2d_beautify.c
index 7425bab885c..7781e3a0f6f 100644
--- a/source/blender/blenlib/intern/polyfill_2d_beautify.c
+++ b/source/blender/blenlib/intern/polyfill_2d_beautify.c
@@ -175,7 +175,7 @@ float BLI_polyfill_beautify_quad_rotate_calc_ex(const float v1[2],
       len_13 = len_v2v2(v1, v3);
       len_24 = len_v2v2(v2, v4);
 
-      /* note, area is in fact (area * 2),
+      /* NOTE: area is in fact (area * 2),
        * but in this case its OK, since we're comparing ratios */
 
       /* edge (2-4), current state */
diff --git a/source/blender/blenlib/intern/scanfill.c b/source/blender/blenlib/intern/scanfill.c
index 8c9a229860e..b0d00007580 100644
--- a/source/blender/blenlib/intern/scanfill.c
+++ b/source/blender/blenlib/intern/scanfill.c
@@ -397,7 +397,7 @@ static void testvertexnearedge(ScanFillContext *sf_ctx)
   for (eve = sf_ctx->fillvertbase.first; eve; eve = eve->next) {
     if (eve->edge_tot == 1) {
       /* find the edge which has vertex eve,
-       * note: we _know_ this will crash if 'ed1' becomes NULL
+       * NOTE: we _know_ this will crash if 'ed1' becomes NULL
        * but this will never happen. */
       for (ed1 = sf_ctx->filledgebase.first; !(ed1->v1 == eve || ed1->v2 == eve);
            ed1 = ed1->next) {
@@ -529,7 +529,7 @@ static unsigned int scanfill(ScanFillContext *sf_ctx, PolyFill *pf, const int fl
         eve->f = SF_VERT_NEW; /* Flag for connect edges later on. */
         sc->vert = eve;
         sc->edge_first = sc->edge_last = NULL;
-        /* Note, debug print only will work for curve poly-fill, union is in use for mesh. */
+        /* NOTE: debug print only will work for curve poly-fill, union is in use for mesh. */
         /* if (even->tmp.v == NULL) eve->tmp.u = verts; */
         sc++;
       }
diff --git a/source/blender/blenlib/intern/smallhash.c b/source/blender/blenlib/intern/smallhash.c
index 5961893cae3..6e5a3e961a5 100644
--- a/source/blender/blenlib/intern/smallhash.c
+++ b/source/blender/blenlib/intern/smallhash.c
@@ -124,7 +124,7 @@ BLI_INLINE SmallHashEntry *smallhash_lookup(const SmallHash *sh, const uintptr_t
 
   BLI_assert(key != SMHASH_KEY_UNUSED);
 
-  /* note: there are always more buckets than entries,
+  /* NOTE: there are always more buckets than entries,
    * so we know there will always be a free bucket if the key isn't found. */
   for (e = &sh->buckets[h % sh->nbuckets]; e->val != SMHASH_CELL_FREE;
        h = SMHASH_NEXT(h, hoff), e = &sh->buckets[h % sh->nbuckets]) {
@@ -353,8 +353,8 @@ void **BLI_smallhash_iternew_p(const SmallHash *sh, SmallHashIter *iter, uintptr
 /** \name Debugging & Introspection
  * \{ */
 
-/* note, this was called _print_smhash in knifetool.c
- * it may not be intended for general use - campbell */
+/* NOTE(campbell): this was called _print_smhash in knifetool.c,
+ * it may not be intended for general use. */
 #if 0
 void BLI_smallhash_print(SmallHash *sh)
 {
diff --git a/source/blender/blenlib/intern/sort.c b/source/blender/blenlib/intern/sort.c
index 6a13c0aa6f0..0d52faaa8c6 100644
--- a/source/blender/blenlib/intern/sort.c
+++ b/source/blender/blenlib/intern/sort.c
@@ -31,7 +31,7 @@
 
 #  include "BLI_sort.h"
 
-#  ifdef min /* for msvc */
+#  ifdef min /* For MSVC. */
 #    undef min
 #  endif
 
diff --git a/source/blender/blenlib/intern/string_utf8.c b/source/blender/blenlib/intern/string_utf8.c
index 3a02ddaa349..19ff8764259 100644
--- a/source/blender/blenlib/intern/string_utf8.c
+++ b/source/blender/blenlib/intern/string_utf8.c
@@ -43,7 +43,7 @@
 // #define DEBUG_STRSIZE
 
 /* array copied from glib's gutf8.c, */
-/* Note: last two values (0xfe and 0xff) are forbidden in utf-8,
+/* NOTE: last two values (0xfe and 0xff) are forbidden in utf-8,
  * so they are considered 1 byte length too. */
 static const size_t utf8_skip_data[256] = {
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -265,7 +265,7 @@ char *BLI_strncpy_utf8(char *__restrict dst, const char *__restrict src, size_t
   memset(dst, 0xff, sizeof(*dst) * maxncpy);
 #endif
 
-  /* note: currently we don't attempt to deal with invalid utf8 chars */
+  /* NOTE: currently we don't attempt to deal with invalid utf8 chars. */
   BLI_STR_UTF8_CPY(dst, src, maxncpy);
 
   return r_dst;
@@ -281,7 +281,7 @@ size_t BLI_strncpy_utf8_rlen(char *__restrict dst, const char *__restrict src, s
   memset(dst, 0xff, sizeof(*dst) * maxncpy);
 #endif
 
-  /* note: currently we don't attempt to deal with invalid utf8 chars */
+  /* NOTE: currently we don't attempt to deal with invalid utf8 chars. */
   BLI_STR_UTF8_CPY(dst, src, maxncpy);
 
   return (size_t)(dst - r_dst);
@@ -444,8 +444,8 @@ int BLI_str_utf8_char_width_safe(const char *p)
 
 /* copied from glib's gutf8.c, added 'Err' arg */
 
-/* note, glib uses uint for unicode, best we do the same,
- * though we don't typedef it - campbell */
+/* NOTE(campbell): glib uses uint for unicode, best we do the same,
+ * though we don't typedef it. */
 
 #define UTF8_COMPUTE(Char, Mask, Len, Err) \
   if (Char < 128) { \
@@ -580,8 +580,10 @@ uint BLI_str_utf8_as_unicode_and_size_safe(const char *__restrict p, size_t *__r
   return result;
 }
 
-/* another variant that steps over the index,
- * note, currently this also falls back to latin1 for text drawing. */
+/**
+ * Another variant that steps over the index.
+ * \note currently this also falls back to latin1 for text drawing.
+ */
 uint BLI_str_utf8_as_unicode_step(const char *__restrict p, size_t *__restrict index)
 {
   int i, len;
diff --git a/source/blender/blenlib/intern/system.c b/source/blender/blenlib/intern/system.c
index f4110c65a6d..66d0b44cfb3 100644
--- a/source/blender/blenlib/intern/system.c
+++ b/source/blender/blenlib/intern/system.c
@@ -100,8 +100,8 @@ void BLI_system_backtrace(FILE *fp)
 #    undef SIZE
 
 #  else
-  /* ------------------ */
-  /* non msvc/osx/linux */
+  /* --------------------- */
+  /* Non MSVC/Apple/Linux. */
   (void)fp;
 #  endif
 }
@@ -184,7 +184,7 @@ size_t BLI_system_memory_max_in_megabytes(void)
   /* Maximum addressable bytes on this platform.
    *
    * NOTE: Due to the shift arithmetic this is a half of the memory. */
-  const size_t limit_bytes_half = (((size_t)1) << ((sizeof(size_t[8])) - 1));
+  const size_t limit_bytes_half = (((size_t)1) << (sizeof(size_t[8]) - 1));
   /* Convert it to megabytes and return. */
   return (limit_bytes_half >> 20) * 2;
 }
diff --git a/source/blender/blenlib/intern/task_iterator.c b/source/blender/blenlib/intern/task_iterator.c
index f67671c65e0..06087869685 100644
--- a/source/blender/blenlib/intern/task_iterator.c
+++ b/source/blender/blenlib/intern/task_iterator.c
@@ -80,7 +80,7 @@ BLI_INLINE void task_parallel_calc_chunk_size(const TaskParallelSettings *settin
      *              else 3 if num_tasks < 48;
      *              else 4 if num_tasks < 64;
      *                   etc.
-     * Note: If we wanted to keep the 'power of two' multiplier, we'd need something like:
+     * NOTE: If we wanted to keep the 'power of two' multiplier, we'd need something like:
      *     1 << max_ii(0, (int)(sizeof(int) * 8) - 1 - bitscan_reverse_i(num_tasks) - 3)
      */
     const int num_tasks_factor = max_ii(1, num_tasks >> 3);
@@ -186,6 +186,9 @@ static void task_parallel_iterator_no_threads(const TaskParallelSettings *settin
   if (use_userdata_chunk) {
     userdata_chunk_local = MALLOCA(userdata_chunk_size);
     memcpy(userdata_chunk_local, userdata_chunk, userdata_chunk_size);
+    if (settings->func_init != NULL) {
+      settings->func_init(state->userdata, userdata_chunk_local);
+    }
   }
 
   /* Also marking it as non-threaded for the iterator callback. */
@@ -247,6 +250,9 @@ static void task_parallel_iterator_do(const TaskParallelSettings *settings,
     if (use_userdata_chunk) {
       userdata_chunk_local = (char *)userdata_chunk_array + (userdata_chunk_size * i);
       memcpy(userdata_chunk_local, userdata_chunk, userdata_chunk_size);
+      if (settings->func_init != NULL) {
+        settings->func_init(state->userdata, userdata_chunk_local);
+      }
     }
     /* Use this pool's pre-allocated tasks. */
     BLI_task_pool_push(task_pool, parallel_iterator_func, userdata_chunk_local, false, NULL);
@@ -403,11 +409,7 @@ void BLI_task_parallel_mempool(BLI_mempool *mempool,
                                TaskParallelMempoolFunc func,
                                const TaskParallelSettings *settings)
 {
-  TaskPool *task_pool;
-  ParallelMempoolState state;
-  int i, num_threads, num_tasks;
-
-  if (BLI_mempool_len(mempool) == 0) {
+  if (UNLIKELY(BLI_mempool_len(mempool) == 0)) {
     return;
   }
 
@@ -422,6 +424,9 @@ void BLI_task_parallel_mempool(BLI_mempool *mempool,
     if (use_userdata_chunk) {
       userdata_chunk_local = MALLOCA(userdata_chunk_size);
       memcpy(userdata_chunk_local, userdata_chunk, userdata_chunk_size);
+      if (settings->func_init != NULL) {
+        settings->func_init(userdata, userdata_chunk_local);
+      }
       tls.userdata_chunk = userdata_chunk_local;
     }
 
@@ -442,14 +447,15 @@ void BLI_task_parallel_mempool(BLI_mempool *mempool,
     return;
   }
 
-  task_pool = BLI_task_pool_create(&state, TASK_PRIORITY_HIGH);
-  num_threads = BLI_task_scheduler_num_threads();
+  ParallelMempoolState state;
+  TaskPool *task_pool = BLI_task_pool_create(&state, TASK_PRIORITY_HIGH);
+  const int num_threads = BLI_task_scheduler_num_threads();
 
   /* The idea here is to prevent creating task for each of the loop iterations
    * and instead have tasks which are evenly distributed across CPU cores and
    * pull next item to be crunched using the threaded-aware BLI_mempool_iter.
    */
-  num_tasks = num_threads + 2;
+  const int num_tasks = num_threads + 2;
 
   state.userdata = userdata;
   state.func = func;
@@ -461,10 +467,13 @@ void BLI_task_parallel_mempool(BLI_mempool *mempool,
   ParallelMempoolTaskData *mempool_iterator_data = mempool_iter_threadsafe_create(
       mempool, (size_t)num_tasks);
 
-  for (i = 0; i < num_tasks; i++) {
+  for (int i = 0; i < num_tasks; i++) {
     if (use_userdata_chunk) {
       userdata_chunk_local = (char *)userdata_chunk_array + (userdata_chunk_size * i);
       memcpy(userdata_chunk_local, userdata_chunk, userdata_chunk_size);
+      if (settings->func_init != NULL) {
+        settings->func_init(userdata, userdata_chunk_local);
+      }
     }
     mempool_iterator_data[i].tls.userdata_chunk = userdata_chunk_local;
 
@@ -477,7 +486,7 @@ void BLI_task_parallel_mempool(BLI_mempool *mempool,
 
   if (use_userdata_chunk) {
     if ((settings->func_free != NULL) || (settings->func_reduce != NULL)) {
-      for (i = 0; i < num_tasks; i++) {
+      for (int i = 0; i < num_tasks; i++) {
         if (settings->func_reduce) {
           settings->func_reduce(
               userdata, userdata_chunk, mempool_iterator_data[i].tls.userdata_chunk);
diff --git a/source/blender/blenlib/intern/task_range.cc b/source/blender/blenlib/intern/task_range.cc
index 871d04c1f35..8407be2cb2b 100644
--- a/source/blender/blenlib/intern/task_range.cc
+++ b/source/blender/blenlib/intern/task_range.cc
@@ -156,7 +156,7 @@ int BLI_task_parallel_thread_id(const TaskParallelTLS *UNUSED(tls))
   if (thread_id == -1) {
     thread_id = atomic_fetch_and_add_int32(&tbb_thread_id_counter, 1);
     if (thread_id >= BLENDER_MAX_THREADS) {
-      BLI_assert(!"Maximum number of threads exceeded for sculpting");
+      BLI_assert_msg(0, "Maximum number of threads exceeded for sculpting");
       thread_id = thread_id % BLENDER_MAX_THREADS;
     }
   }
diff --git a/source/blender/blenlib/tests/BLI_array_utils_test.cc b/source/blender/blenlib/tests/BLI_array_utils_test.cc
index 5d12b8fbd4d..1bf221c5335 100644
--- a/source/blender/blenlib/tests/BLI_array_utils_test.cc
+++ b/source/blender/blenlib/tests/BLI_array_utils_test.cc
@@ -189,3 +189,53 @@ TEST(array_utils, BinaryOrInt4Mix)
   BINARY_OR_TEST(data_cmp, data_a, data_b, data_combine, ARRAY_SIZE(data_cmp));
 }
 #undef BINARY_OR_TEST
+
+/* BLI_array_deduplicate_ordered: check `data` deduplicates to `data_cmp` (length & contents). */
+#define DEDUPLICATE_ORDERED_TEST(data, data_cmp) \
+  { \
+    const uint data_len_new = BLI_array_deduplicate_ordered(data, ARRAY_SIZE(data)); \
+    EXPECT_EQ(data_len_new, ARRAY_SIZE(data_cmp)); \
+    EXPECT_EQ_ARRAY(data, data_cmp, data_len_new); \
+    /* Ensure running a second time (on the already reduced length) does nothing. */ \
+    const uint data_len_test = BLI_array_deduplicate_ordered(data, data_len_new); \
+    EXPECT_EQ(data_len_test, ARRAY_SIZE(data_cmp)); \
+    EXPECT_EQ_ARRAY(data, data_cmp, data_len_new); \
+  } \
+  ((void)0) /* Lets call sites terminate the macro with `;`. */
+
+TEST(array_utils, DeduplicateOrdered1)
+{
+  int data[] = {0}; /* Single element: expected result is identical. */
+  const int data_cmp[] = {0};
+  DEDUPLICATE_ORDERED_TEST(data, data_cmp);
+}
+
+TEST(array_utils, DeduplicateOrdered2)
+{
+  int data[] = {1, 2}; /* Two distinct values: expected result is identical. */
+  const int data_cmp[] = {1, 2};
+  DEDUPLICATE_ORDERED_TEST(data, data_cmp);
+}
+
+TEST(array_utils, DeduplicateOrdered2Same)
+{
+  int data[] = {1, 1}; /* Two equal values: expected to collapse to one. */
+  const int data_cmp[] = {1};
+  DEDUPLICATE_ORDERED_TEST(data, data_cmp);
+}
+
+TEST(array_utils, DeduplicateOrdered3Same)
+{
+  int data[] = {1, 1, 1}; /* A run of three equal values: expected to collapse to one. */
+  const int data_cmp[] = {1};
+  DEDUPLICATE_ORDERED_TEST(data, data_cmp);
+}
+
+TEST(array_utils, DeduplicateOrdered3)
+{
+  int data[] = {3, 3, 2, 2, 1, 1}; /* Three runs of duplicates; expected result keeps order. */
+  const int data_cmp[] = {3, 2, 1};
+  DEDUPLICATE_ORDERED_TEST(data, data_cmp);
+}
+
+#undef DEDUPLICATE_ORDERED_TEST
diff --git a/source/blender/blenlib/tests/BLI_delaunay_2d_test.cc b/source/blender/blenlib/tests/BLI_delaunay_2d_test.cc
index 59c4be6d952..08a3818e18f 100644
--- a/source/blender/blenlib/tests/BLI_delaunay_2d_test.cc
+++ b/source/blender/blenlib/tests/BLI_delaunay_2d_test.cc
@@ -353,27 +353,27 @@ void graph_draw(const std::string &label,
     const vec2<T> &uco = verts[e.first];
     const vec2<T> &vco = verts[e.second];
     int strokew = thin_line;
-    f << "<line fill=\"none\" stroke=\"black\" stroke-width=\"" << strokew << "\" x1=\""
+    f << R"(<line fill="none" stroke="black" stroke-width=")" << strokew << "\" x1=\""
       << SX(uco[0]) << "\" y1=\"" << SY(uco[1]) << "\" x2=\"" << SX(vco[0]) << "\" y2=\""
       << SY(vco[1]) << "\">\n";
     f << "  <title>[" << e.first << "][" << e.second << "]</title>\n";
     f << "</line>\n";
     if (draw_edge_labels) {
       f << "<text x=\"" << SX(0.5 * (uco[0] + vco[0])) << "\" y=\"" << SY(0.5 * (uco[1] + vco[1]))
-        << "\" font-size=\"small\">";
+        << R"(" font-size="small">)";
       f << "[" << e.first << "][" << e.second << "]</text>\n";
     }
   }
 
   int i = 0;
   for (const vec2<T> &vco : verts) {
-    f << "<circle fill=\"black\" cx=\"" << SX(vco[0]) << "\" cy=\"" << SY(vco[1]) << "\" r=\""
+    f << R"(<circle fill="black" cx=")" << SX(vco[0]) << "\" cy=\"" << SY(vco[1]) << "\" r=\""
       << vert_radius << "\">\n";
     f << "  <title>[" << i << "]" << vco << "</title>\n";
     f << "</circle>\n";
     if (draw_vert_labels) {
       f << "<text x=\"" << SX(vco[0]) + vert_radius << "\" y=\"" << SY(vco[1]) - vert_radius
-        << "\" font-size=\"small\">[" << i << "]</text>\n";
+        << R"(" font-size="small">[)" << i << "]</text>\n";
     }
     ++i;
   }
diff --git a/source/blender/blenlib/tests/BLI_ghash_test.cc b/source/blender/blenlib/tests/BLI_ghash_test.cc
index a0b24e96fcc..1eb29a006db 100644
--- a/source/blender/blenlib/tests/BLI_ghash_test.cc
+++ b/source/blender/blenlib/tests/BLI_ghash_test.cc
@@ -31,7 +31,7 @@
   } \
   void(0)
 
-/* Note: for pure-ghash testing, nature of the keys and data have absolutely no importance! So here
+/* NOTE: for pure-ghash testing, the nature of the keys and data has no importance at all! So here
  * we just use mere random integers stored in pointers. */
 
 static void init_keys(unsigned int keys[TESTCASE_SIZE], const int seed)
diff --git a/source/blender/blenlib/tests/BLI_hash_mm2a_test.cc b/source/blender/blenlib/tests/BLI_hash_mm2a_test.cc
index c7bea8e15de..c6d3265881d 100644
--- a/source/blender/blenlib/tests/BLI_hash_mm2a_test.cc
+++ b/source/blender/blenlib/tests/BLI_hash_mm2a_test.cc
@@ -4,7 +4,7 @@
 
 #include "BLI_hash_mm2a.h"
 
-/* Note: Reference results are taken from reference implementation
+/* NOTE: Reference results are taken from reference implementation
  * (cpp code, CMurmurHash2A variant):
  * https://smhasher.googlecode.com/svn-history/r130/trunk/MurmurHash2.cpp
  */
diff --git a/source/blender/blenlib/tests/BLI_math_base_test.cc b/source/blender/blenlib/tests/BLI_math_base_test.cc
index d006a2eb59a..f354dd4ce23 100644
--- a/source/blender/blenlib/tests/BLI_math_base_test.cc
+++ b/source/blender/blenlib/tests/BLI_math_base_test.cc
@@ -72,7 +72,7 @@ TEST(math_base, CompareFFRelativeZero)
   EXPECT_TRUE(compare_ff_relative(f0, fn1, max_diff, 1));
   EXPECT_TRUE(compare_ff_relative(fn1, f0, max_diff, 1));
 
-  /* Note: in theory, this should return false, since 0.0f  and -0.0f have 0x80000000 diff,
+  /* NOTE: in theory, this should return false, since 0.0f and -0.0f have 0x80000000 diff,
    *       but overflow in subtraction seems to break something here
    *       (abs(*(int *)&fn0 - *(int *)&f0) == 0x80000000 == fn0), probably because int32 cannot
    * hold this abs value. this is yet another illustration of why one shall never use (near-)zero
diff --git a/source/blender/blenlib/tests/BLI_mesh_intersect_test.cc b/source/blender/blenlib/tests/BLI_mesh_intersect_test.cc
index 1a9ffbd3403..24fa7f1a476 100644
--- a/source/blender/blenlib/tests/BLI_mesh_intersect_test.cc
+++ b/source/blender/blenlib/tests/BLI_mesh_intersect_test.cc
@@ -459,7 +459,7 @@ TEST(mesh_intersect, TwoTris)
       {4, 13, 6, 2}, /* 11: non-parallel planes, not intersecting, all one side. */
       {0, 14, 6, 2}, /* 12: non-paralel planes, not intersecting, alternate sides. */
       /* Following are all coplanar cases. */
-      {15, 16, 6, 8},   /* 13: T16 inside T15. Note: dup'd tri is expected. */
+      {15, 16, 6, 8},   /* 13: T16 inside T15. NOTE: dup'd tri is expected. */
       {15, 17, 8, 8},   /* 14: T17 intersects one edge of T15 at (1,1,0)(3,3,0). */
       {15, 18, 10, 12}, /* 15: T18 intersects T15 at (1,1,0)(3,3,0)(3,15/4,1/2)(0,3,2). */
       {15, 19, 8, 10},  /* 16: T19 intersects T15 at (3,3,0)(0,3,2). */
diff --git a/source/blender/blenlib/tests/performance/BLI_task_performance_test.cc b/source/blender/blenlib/tests/performance/BLI_task_performance_test.cc
index c5b0f86e384..dd1a084037b 100644
--- a/source/blender/blenlib/tests/performance/BLI_task_performance_test.cc
+++ b/source/blender/blenlib/tests/performance/BLI_task_performance_test.cc
@@ -21,7 +21,7 @@
 
 static uint gen_pseudo_random_number(uint num)
 {
-  /* Note: this is taken from BLI_ghashutil_uinthash(), don't want to depend on external code that
+  /* NOTE: this is taken from BLI_ghashutil_uinthash(), don't want to depend on external code that
    * might change here... */
   num += ~(num << 16);
   num ^= (num >> 5);