Fix scalability issue in threaded code of Mesh normals computation.

We tried to do as much as possible in a single threaded callback, which led to using some nasty tricks, like fake atomic-based spinlocks, to perform some operations (like float addition, which has no atomic intrinsics). While this was OK with a 'standard' low number of worker threads (8-16), because collisions were rather rare and the implied memory barriers did not add *that* much overhead, it performed poorly on more powerful systems reaching a hundred threads and beyond (like workstations or render farm hardware). There, both the memory barrier overhead and the more frequent collisions would have a significant impact on performance. This was addressed by splitting the process further: we now have three loops (one over polys, one over loops and one over vertices), and we added intermediate storage for weighted loop normals. This allows us to completely avoid any atomic operation in the body of the threaded loops, which should fix the scalability issues. This costs us slightly higher temporary memory usage (something like 50MB per million polygons on average), but that looks like an acceptable tradeoff. Furthermore, tests showed that we could gain an additional ~7% of speed when computing normals of heavy meshes by also parallelizing the last two loops (this might be 1 or 2% of the overall mesh update at best...). Note that further tweaking of this code should be possible once Sergey adds the 'minimum batch size' option to the threaded foreach API, since very light loops like the one over loops (a mere v3 addition) require much bigger batches than heavier code (like the one over polys) to keep optimal performance.
author: Bastien Montagne <montagne29@wanadoo.fr> 2017-12-23 00:12:23 +0300
committer: Bastien Montagne <montagne29@wanadoo.fr> 2018-01-09 16:14:59 +0300
commit: 71e0894e0ddf7d32894d9e0ea93e11a4810ae4d6 (patch)
tree: 0860dd990d646c7bb4751467c73f0e76e635d1d4
parent: 72151f3e36b2e20418734c7e61aa09e0112cbaac (diff)
1 files changed, 48 insertions, 26 deletions
diff --git a/source/blender/blenkernel/intern/mesh_evaluate.c b/source/blender/blenkernel/intern/mesh_evaluate.c
index a8deaf67802..81ebb517eda 100644
--- a/source/blender/blenkernel/intern/mesh_evaluate.c
+++ b/source/blender/blenkernel/intern/mesh_evaluate.c
@@ -173,10 +173,11 @@ typedef struct MeshCalcNormalsData {
 	const MLoop *mloop;
 	MVert *mverts;
 	float (*pnors)[3];
+	float (*lnors_weighted)[3];
 	float (*vnors)[3];
 } MeshCalcNormalsData;
 
-static void mesh_calc_normals_poly_task_cb(void *userdata, const int pidx)
+static void mesh_calc_normals_poly_cb(void *userdata, const int pidx)
 {
 	MeshCalcNormalsData *data = userdata;
 	const MPoly *mp = &data->mpolys[pidx];
@@ -184,7 +185,7 @@ static void mesh_calc_normals_poly_task_cb(void *userdata, const int pidx)
 	BKE_mesh_calc_poly_normal(mp, data->mloop + mp->loopstart, data->mverts, data->pnors[pidx]);
 }
 
-static void mesh_calc_normals_poly_accum_task_cb(void *userdata, const int pidx)
+static void mesh_calc_normals_poly_prepare_cb(void *userdata, const int pidx)
 {
 	MeshCalcNormalsData *data = userdata;
 	const MPoly *mp = &data->mpolys[pidx];
@@ -193,7 +194,7 @@ static void mesh_calc_normals_poly_accum_task_cb(void *userdata, const int pidx)
 
 	float pnor_temp[3];
 	float *pnor = data->pnors ? data->pnors[pidx] : pnor_temp;
-	float (*vnors)[3] = data->vnors;
+	float (*lnors_weighted)[3] = data->lnors_weighted;
 
 	const int nverts = mp->totloop;
 	float (*edgevecbuf)[3] = BLI_array_alloca(edgevecbuf, (size_t)nverts);
@@ -220,42 +221,62 @@ static void mesh_calc_normals_poly_accum_task_cb(void *userdata, const int pidx)
 			v_prev = v_curr;
 		}
 		if (UNLIKELY(normalize_v3(pnor) == 0.0f)) {
-			pnor[2] = 1.0f; /* other axis set to 0.0 */
+			pnor[2] = 1.0f; /* other axes set to 0.0 */
 		}
 	}
 
 	/* accumulate angle weighted face normal */
-	/* inline version of #accumulate_vertex_normals_poly */
+	/* inline version of #accumulate_vertex_normals_poly_v3,
+	 * split between this threaded callback and #mesh_calc_normals_poly_accum_cb. */
 	{
 		const float *prev_edge = edgevecbuf[nverts - 1];
 
 		for (i = 0; i < nverts; i++) {
+			const int lidx = mp->loopstart + i;
 			const float *cur_edge = edgevecbuf[i];
 
 			/* calculate angle between the two poly edges incident on
 			 * this vertex */
 			const float fac = saacos(-dot_v3v3(cur_edge, prev_edge));
 
-			/* accumulate */
-			for (int k = 3; k--; ) {
-				atomic_add_and_fetch_fl(&vnors[ml[i].v][k], pnor[k] * fac);
-			}
+			/* Store for later accumulation */
+			mul_v3_v3fl(lnors_weighted[lidx], pnor, fac);
+
 			prev_edge = cur_edge;
 		}
 	}
+}
+
+static void mesh_calc_normals_poly_accum_cb(void *userdata, const int lidx)
+{
+	MeshCalcNormalsData *data = userdata;
+
+	add_v3_v3(data->vnors[data->mloop[lidx].v], data->lnors_weighted[lidx]);
+}
+
+static void mesh_calc_normals_poly_finalize_cb(void *userdata, const int vidx)
+{
+	MeshCalcNormalsData *data = userdata;
+
+	MVert *mv = &data->mverts[vidx];
+	float *no = data->vnors[vidx];
+
+	if (UNLIKELY(normalize_v3(no) == 0.0f)) {
+		/* following Mesh convention; we use vertex coordinate itself for normal in this case */
+		normalize_v3_v3(no, mv->co);
+	}
 
+	normal_float_to_short_v3(mv->no, no);
 }
 
 void BKE_mesh_calc_normals_poly(
         MVert *mverts, float (*r_vertnors)[3], int numVerts,
         const MLoop *mloop, const MPoly *mpolys,
-        int UNUSED(numLoops), int numPolys, float (*r_polynors)[3],
+        int numLoops, int numPolys, float (*r_polynors)[3],
         const bool only_face_normals)
 {
+	const bool do_threaded = (numPolys > BKE_MESH_OMP_LIMIT);
 	float (*pnors)[3] = r_polynors;
-	float (*vnors)[3] = r_vertnors;
-	bool free_vnors = false;
-	int i;
 
 	if (only_face_normals) {
 		BLI_assert((pnors != NULL) || (numPolys == 0));
@@ -265,10 +286,14 @@ void BKE_mesh_calc_normals_poly(
 		    .mpolys = mpolys, .mloop = mloop, .mverts = mverts, .pnors = pnors,
 		};
 
-		BLI_task_parallel_range(0, numPolys, &data, mesh_calc_normals_poly_task_cb, (numPolys > BKE_MESH_OMP_LIMIT));
+		BLI_task_parallel_range(0, numPolys, &data, mesh_calc_normals_poly_cb, do_threaded);
 		return;
 	}
 
+	float (*vnors)[3] = r_vertnors;
+	float (*lnors_weighted)[3] = MEM_mallocN(sizeof(*lnors_weighted) * (size_t)numLoops, __func__);
+	bool free_vnors = false;
+
 	/* first go through and calculate normals for all the polys */
 	if (vnors == NULL) {
 		vnors = MEM_callocN(sizeof(*vnors) * (size_t)numVerts, __func__);
@@ -279,26 +304,23 @@ void BKE_mesh_calc_normals_poly(
 	}
 
 	MeshCalcNormalsData data = {
-	    .mpolys = mpolys, .mloop = mloop, .mverts = mverts, .pnors = pnors, .vnors = vnors,
+	    .mpolys = mpolys, .mloop = mloop, .mverts = mverts,
+	    .pnors = pnors, .lnors_weighted = lnors_weighted, .vnors = vnors
 	};
 
-	BLI_task_parallel_range(0, numPolys, &data, mesh_calc_normals_poly_accum_task_cb, (numPolys > BKE_MESH_OMP_LIMIT));
-
-	for (i = 0; i < numVerts; i++) {
-		MVert *mv = &mverts[i];
-		float *no = vnors[i];
+	/* Compute poly normals, and prepare weighted loop normals. */
+	BLI_task_parallel_range(0, numPolys, &data, mesh_calc_normals_poly_prepare_cb, do_threaded);
 
-		if (UNLIKELY(normalize_v3(no) == 0.0f)) {
-			/* following Mesh convention; we use vertex coordinate itself for normal in this case */
-			normalize_v3_v3(no, mv->co);
-		}
+	/* Actually accumulate weighted loop normals into vertex ones. */
+	BLI_task_parallel_range(0, numLoops, &data, mesh_calc_normals_poly_accum_cb, do_threaded);
 
-		normal_float_to_short_v3(mv->no, no);
-	}
+	/* Normalize and validate computed vertex normals. */
+	BLI_task_parallel_range(0, numVerts, &data, mesh_calc_normals_poly_finalize_cb, do_threaded);
 
 	if (free_vnors) {
 		MEM_freeN(vnors);
 	}
+	MEM_freeN(lnors_weighted);
 }
 
 void BKE_mesh_calc_normals(Mesh *mesh)
author	Bastien Montagne <montagne29@wanadoo.fr>	2017-12-23 00:12:23 +0300
committer	Bastien Montagne <montagne29@wanadoo.fr>	2018-01-09 16:14:59 +0300
commit	71e0894e0ddf7d32894d9e0ea93e11a4810ae4d6 (patch)
tree	0860dd990d646c7bb4751467c73f0e76e635d1d4
parent	72151f3e36b2e20418734c7e61aa09e0112cbaac (diff)