From d8b8b4d7e297b5dceddeba3a60e71e13372484da Mon Sep 17 00:00:00 2001
From: Campbell Barton <ideasman42@gmail.com>
Date: Thu, 10 Jun 2021 01:03:22 +1000
Subject: BMesh: multi-thread face tessellation

Use BM_iter_parallel for face tessellation. This gives around a
6.5x speedup for BM_mesh_calc_tessellation on high-poly meshes in my
tests, although the exact speedup depends on the number of available cores.
---
 .../blender/bmesh/intern/bmesh_mesh_tessellate.c   | 66 +++++++++++++++++++---
 1 file changed, 59 insertions(+), 7 deletions(-)

(limited to 'source')

diff --git a/source/blender/bmesh/intern/bmesh_mesh_tessellate.c b/source/blender/bmesh/intern/bmesh_mesh_tessellate.c
index f2a5fbe3bde..7a95e52ce25 100644
--- a/source/blender/bmesh/intern/bmesh_mesh_tessellate.c
+++ b/source/blender/bmesh/intern/bmesh_mesh_tessellate.c
@@ -37,6 +37,15 @@
 #include "bmesh.h"
 #include "bmesh_tools.h"
 
+/**
+ * Face count below which tessellation runs single-threaded.
+ * On systems with 32+ cores, single-threading only has an advantage
+ * for a very small number of faces (in the 100's).
+ * Between 500-2000 quads the difference is small (tessellation isn't a bottleneck
+ * in this case anyway), so avoid the slight overhead of using threads.
+ */
+#define BM_FACE_TESSELLATE_THREADED_LIMIT 1024
+
 /* -------------------------------------------------------------------- */
 /** \name Default Mesh Tessellation
  * \{ */
@@ -125,7 +134,7 @@ static int mesh_calc_tessellation_for_face(BMLoop *(*looptris)[3],
  *
  * \note \a looptris Must be pre-allocated to at least the size of given by: poly_to_tri_count
  */
-void BM_mesh_calc_tessellation(BMesh *bm, BMLoop *(*looptris)[3])
+static void bm_mesh_calc_tessellation__single_threaded(BMesh *bm, BMLoop *(*looptris)[3])
 {
 #ifndef NDEBUG
   const int looptris_tot = poly_to_tri_count(bm->totface, bm->totloop);
@@ -150,6 +159,54 @@ void BM_mesh_calc_tessellation(BMesh *bm, BMLoop *(*looptris)[3])
   BLI_assert(i <= looptris_tot);
 }
 
+struct TessellationUserTLS {
+  MemArena *pf_arena;
+};
+
+static void mesh_calc_tessellation_for_face_fn(void *__restrict userdata,
+                                               MempoolIterData *mp_f,
+                                               const TaskParallelTLS *__restrict tls)
+{
+  struct TessellationUserTLS *tls_data = tls->userdata_chunk;
+  BMLoop *(*looptris)[3] = userdata;
+  BMFace *f = (BMFace *)mp_f;
+  BMLoop *l = BM_FACE_FIRST_LOOP(f);
+  const int offset = BM_elem_index_get(l) - (BM_elem_index_get(f) * 2);
+  mesh_calc_tessellation_for_face(looptris + offset, f, &tls_data->pf_arena);
+}
+
+static void mesh_calc_tessellation_for_face_free_fn(const void *__restrict UNUSED(userdata),
+                                                    void *__restrict tls_v)
+{
+  struct TessellationUserTLS *tls_data = tls_v;
+  if (tls_data->pf_arena) {
+    BLI_memarena_free(tls_data->pf_arena);
+  }
+}
+
+static void bm_mesh_calc_tessellation__multi_threaded(BMesh *bm, BMLoop *(*looptris)[3])
+{
+  BM_mesh_elem_index_ensure(bm, BM_LOOP | BM_FACE);
+
+  TaskParallelSettings settings;
+  struct TessellationUserTLS tls_dummy = {NULL};
+  BLI_parallel_mempool_settings_defaults(&settings);
+  settings.userdata_chunk = &tls_dummy;
+  settings.userdata_chunk_size = sizeof(tls_dummy);
+  settings.func_free = mesh_calc_tessellation_for_face_free_fn;
+  BM_iter_parallel(bm, BM_FACES_OF_MESH, mesh_calc_tessellation_for_face_fn, looptris, &settings);
+}
+
+void BM_mesh_calc_tessellation(BMesh *bm, BMLoop *(*looptris)[3])
+{
+  if (bm->totface < BM_FACE_TESSELLATE_THREADED_LIMIT) {
+    bm_mesh_calc_tessellation__single_threaded(bm, looptris);
+  }
+  else {
+    bm_mesh_calc_tessellation__multi_threaded(bm, looptris);
+  }
+}
+
 /** \} */
 
 /* -------------------------------------------------------------------- */
@@ -236,12 +293,7 @@ void BM_mesh_calc_tessellation_with_partial(BMesh *bm,
 
   BM_mesh_elem_index_ensure(bm, BM_LOOP | BM_FACE);
 
-  /* On systems with 32+ cores,
-   * only a very small number of faces has any advantage single threading (in the 100's).
-   * Note that between 500-2000 quads, the difference isn't so much
-   * (tessellation isn't a bottleneck in this case anyway).
-   * Avoid the slight overhead of using threads in this case. */
-  if (bmpinfo->faces_len < 1024) {
+  if (bmpinfo->faces_len < BM_FACE_TESSELLATE_THREADED_LIMIT) {
     bm_mesh_calc_tessellation_with_partial__single_threaded(looptris, bmpinfo);
   }
   else {
-- 
cgit v1.2.3