git.blender.org/blender.git
 intern/atomic/atomic_ops.h                           |   3
 intern/atomic/intern/atomic_ops_ext.h                |   6
 source/blender/blenlib/BLI_mempool.h                 |   5
 source/blender/blenlib/BLI_task.h                    |  11
 source/blender/blenlib/intern/BLI_mempool.c          |  74
 source/blender/blenlib/intern/task.c                 |  87
 source/blender/bmesh/CMakeLists.txt                  |   1
 source/blender/bmesh/intern/bmesh_iterators_inline.h |  36
 source/blender/bmesh/intern/bmesh_mesh.c             | 265
 tests/gtests/blenlib/BLI_task_test.cc                |  76
 tests/gtests/blenlib/CMakeLists.txt                  |   2
 11 files changed, 459 insertions(+), 107 deletions(-)
diff --git a/intern/atomic/atomic_ops.h b/intern/atomic/atomic_ops.h
index 578cfb76eb6..e849bcf6cef 100644
--- a/intern/atomic/atomic_ops.h
+++ b/intern/atomic/atomic_ops.h
@@ -130,6 +130,9 @@ ATOMIC_INLINE unsigned int atomic_cas_u(unsigned int *v, unsigned int old, unsig
ATOMIC_INLINE void *atomic_cas_ptr(void **v, void *old, void *_new);
+
+ATOMIC_INLINE float atomic_cas_float(float *v, float old, float _new);
+
/* WARNING! Float 'atomics' are really faked ones, those are actually closer to some kind of spinlock-sync'ed operation,
* which means they are only efficient if collisions are highly unlikely (i.e. if probability of two threads
* working on the same pointer at the same time is very low). */
diff --git a/intern/atomic/intern/atomic_ops_ext.h b/intern/atomic/intern/atomic_ops_ext.h
index 7eef20f46d3..1b1fea9642d 100644
--- a/intern/atomic/intern/atomic_ops_ext.h
+++ b/intern/atomic/intern/atomic_ops_ext.h
@@ -191,6 +191,12 @@ ATOMIC_INLINE void *atomic_cas_ptr(void **v, void *old, void *_new)
/* float operations. */
ATOMIC_STATIC_ASSERT(sizeof(float) == sizeof(uint32_t), "sizeof(float) != sizeof(uint32_t)");
+ATOMIC_INLINE float atomic_cas_float(float *v, float old, float _new)
+{
+ uint32_t ret = atomic_cas_uint32((uint32_t *)v, *(uint32_t *)&old, *(uint32_t *)&_new);
+ return *(float *)&ret;
+}
+
ATOMIC_INLINE float atomic_add_and_fetch_fl(float *p, const float x)
{
float oldval, newval;
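
For context, the float 'atomics' warned about above are all built on the same compare-and-swap retry loop that atomic_cas_float now exposes directly (atomic_add_and_fetch_fl, truncated by the diff context here, uses it too). A minimal sketch of the pattern, assuming only the declarations in this patch; atomic_mul_and_fetch_fl is a hypothetical example name, not part of the patch:

ATOMIC_INLINE float atomic_mul_and_fetch_fl(float *p, const float x)
{
	float oldval, newval;
	uint32_t prevval;

	do {  /* Collisions are expected to be rare, so this usually runs once. */
		oldval = *p;
		newval = oldval * x;
		/* Compare bit patterns, not float values, so NaN cannot cause an endless loop. */
		prevval = atomic_cas_uint32((uint32_t *)p, *(uint32_t *)&oldval, *(uint32_t *)&newval);
	} while (prevval != *(uint32_t *)&oldval);

	return newval;
}
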
diff --git a/source/blender/blenlib/BLI_mempool.h b/source/blender/blenlib/BLI_mempool.h
index 0c754f551e0..b68ca6b1f2b 100644
--- a/source/blender/blenlib/BLI_mempool.h
+++ b/source/blender/blenlib/BLI_mempool.h
@@ -71,6 +71,8 @@ typedef struct BLI_mempool_iter {
BLI_mempool *pool;
struct BLI_mempool_chunk *curchunk;
unsigned int curindex;
+
+ struct BLI_mempool_chunk **curchunk_threaded_shared;
} BLI_mempool_iter;
/* flag */
@@ -87,6 +89,9 @@ enum {
void BLI_mempool_iternew(BLI_mempool *pool, BLI_mempool_iter *iter) ATTR_NONNULL();
void *BLI_mempool_iterstep(BLI_mempool_iter *iter) ATTR_WARN_UNUSED_RESULT ATTR_NONNULL();
+BLI_mempool_iter *BLI_mempool_iter_threadsafe_create(BLI_mempool *pool, const size_t num_iter) ATTR_WARN_UNUSED_RESULT ATTR_NONNULL();
+void BLI_mempool_iter_threadsafe_free(BLI_mempool_iter *iter_arr) ATTR_NONNULL();
+
#ifdef __cplusplus
}
#endif
diff --git a/source/blender/blenlib/BLI_task.h b/source/blender/blenlib/BLI_task.h
index 721327d26a8..ccfa2b6e2e7 100644
--- a/source/blender/blenlib/BLI_task.h
+++ b/source/blender/blenlib/BLI_task.h
@@ -35,6 +35,8 @@ extern "C" {
#include "BLI_threads.h"
#include "BLI_utildefines.h"
+struct BLI_mempool;
+
/* Task Scheduler
*
* Central scheduler that holds running threads ready to execute tasks. A single
@@ -150,6 +152,15 @@ void BLI_task_parallel_listbase(
TaskParallelListbaseFunc func,
const bool use_threading);
+typedef struct MempoolIterData MempoolIterData;
+typedef void (*TaskParallelMempoolFunc)(void *userdata,
+ MempoolIterData *iter);
+void BLI_task_parallel_mempool(
+ struct BLI_mempool *mempool,
+ void *userdata,
+ TaskParallelMempoolFunc func,
+ const bool use_threading);
+
#ifdef __cplusplus
}
#endif
diff --git a/source/blender/blenlib/intern/BLI_mempool.c b/source/blender/blenlib/intern/BLI_mempool.c
index b02811616dd..c90f9e300b7 100644
--- a/source/blender/blenlib/intern/BLI_mempool.c
+++ b/source/blender/blenlib/intern/BLI_mempool.c
@@ -41,6 +41,8 @@
#include <string.h>
#include <stdlib.h>
+#include "atomic_ops.h"
+
#include "BLI_utildefines.h"
#include "BLI_mempool.h" /* own include */
@@ -553,7 +555,7 @@ void *BLI_mempool_as_arrayN(BLI_mempool *pool, const char *allocstr)
}
/**
- * Create a new mempool iterator, \a BLI_MEMPOOL_ALLOW_ITER flag must be set.
+ * Initialize a new mempool iterator, \a BLI_MEMPOOL_ALLOW_ITER flag must be set.
*/
void BLI_mempool_iternew(BLI_mempool *pool, BLI_mempool_iter *iter)
{
@@ -562,6 +564,47 @@ void BLI_mempool_iternew(BLI_mempool *pool, BLI_mempool_iter *iter)
iter->pool = pool;
iter->curchunk = pool->chunks;
iter->curindex = 0;
+
+ iter->curchunk_threaded_shared = NULL;
+}
+
+/**
+ * Initialize an array of mempool iterators, \a BLI_MEMPOOL_ALLOW_ITER flag must be set.
+ *
+ * This is used in threaded code, to generate as many iterators as needed (each task should have its own),
+ * such that each iterator goes over its own single chunk, and only getting the next chunk to iterate over has to be
+ * protected against concurrency (which can be done in a lockless way).
+ *
+ * To be used when creating a task for each single item in the pool is totally overkill.
+ *
+ * See BLI_task_parallel_mempool implementation for detailed usage example.
+ */
+BLI_mempool_iter *BLI_mempool_iter_threadsafe_create(BLI_mempool *pool, const size_t num_iter)
+{
+ BLI_assert(pool->flag & BLI_MEMPOOL_ALLOW_ITER);
+
+ BLI_mempool_iter *iter_arr = MEM_mallocN(sizeof(*iter_arr) * num_iter, __func__);
+ BLI_mempool_chunk **curchunk_threaded_shared = MEM_mallocN(sizeof(void *), __func__);
+
+ BLI_mempool_iternew(pool, iter_arr);
+
+ *curchunk_threaded_shared = iter_arr->curchunk;
+ iter_arr->curchunk_threaded_shared = curchunk_threaded_shared;
+
+ for (size_t i = 1; i < num_iter; i++) {
+ iter_arr[i] = iter_arr[0];
+ *curchunk_threaded_shared = iter_arr[i].curchunk = (*curchunk_threaded_shared) ? (*curchunk_threaded_shared)->next : NULL;
+ }
+
+ return iter_arr;
+}
+
+void BLI_mempool_iter_threadsafe_free(BLI_mempool_iter *iter_arr)
+{
+ BLI_assert(iter_arr->curchunk_threaded_shared != NULL);
+
+ MEM_freeN(iter_arr->curchunk_threaded_shared);
+ MEM_freeN(iter_arr);
}
#if 0
@@ -571,15 +614,28 @@ static void *bli_mempool_iternext(BLI_mempool_iter *iter)
{
void *ret = NULL;
- if (!iter->curchunk || !iter->pool->totused) return NULL;
+ if (iter->curchunk == NULL || !iter->pool->totused) {
+ return ret;
+ }
ret = ((char *)CHUNK_DATA(iter->curchunk)) + (iter->pool->esize * iter->curindex);
iter->curindex++;
if (iter->curindex == iter->pool->pchunk) {
- iter->curchunk = iter->curchunk->next;
iter->curindex = 0;
+ if (iter->curchunk_threaded_shared) {
+ while (1) {
+ iter->curchunk = *iter->curchunk_threaded_shared;
+ if (iter->curchunk == NULL) {
+ /* Pool exhausted; iter->curchunk is already NULL, so the next call bails out early. */
+ return ret;
+ }
+ if (atomic_cas_ptr((void **)iter->curchunk_threaded_shared, iter->curchunk, iter->curchunk->next) == iter->curchunk) {
+ break;
+ }
+ }
+ }
+ iter->curchunk = iter->curchunk->next;
}
return ret;
@@ -620,8 +676,18 @@ void *BLI_mempool_iterstep(BLI_mempool_iter *iter)
}
else {
iter->curindex = 0;
+ if (iter->curchunk_threaded_shared) {
+ for (iter->curchunk = *iter->curchunk_threaded_shared;
+ (iter->curchunk != NULL) &&
+ (atomic_cas_ptr((void **)iter->curchunk_threaded_shared, iter->curchunk, iter->curchunk->next) != iter->curchunk);
+ iter->curchunk = *iter->curchunk_threaded_shared);
+
+ if (UNLIKELY(iter->curchunk == NULL)) {
+ return (ret->freeword == FREEWORD) ? NULL : ret;
+ }
+ }
iter->curchunk = iter->curchunk->next;
- if (iter->curchunk == NULL) {
+ if (UNLIKELY(iter->curchunk == NULL)) {
return (ret->freeword == FREEWORD) ? NULL : ret;
}
curnode = CHUNK_DATA(iter->curchunk);
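
The lockless hand-off both iterator functions rely on can be read in isolation: *curchunk_threaded_shared always points at the last chunk handed out to any iterator, and a thread claims a fresh chunk by CAS-advancing that shared pointer by one link. A sketch of the claim step under those assumptions (claim_next_chunk is a hypothetical helper, not in the patch):

static BLI_mempool_chunk *claim_next_chunk(BLI_mempool_chunk **curchunk_threaded_shared)
{
	BLI_mempool_chunk *last_handed_out;
	do {
		last_handed_out = *curchunk_threaded_shared;
		if (last_handed_out == NULL) {
			return NULL;  /* All chunks have been handed out already. */
		}
		/* Try to advance the shared pointer from last_handed_out to its successor;
		 * if another thread won the race, re-read and retry. */
	} while (atomic_cas_ptr((void **)curchunk_threaded_shared,
	                        last_handed_out, last_handed_out->next) != last_handed_out);
	/* The successor is now the last handed-out chunk, and it belongs to this thread. */
	return last_handed_out->next;
}
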
diff --git a/source/blender/blenlib/intern/task.c b/source/blender/blenlib/intern/task.c
index d69241c3737..eb7f186702b 100644
--- a/source/blender/blenlib/intern/task.c
+++ b/source/blender/blenlib/intern/task.c
@@ -32,6 +32,7 @@
#include "BLI_listbase.h"
#include "BLI_math.h"
+#include "BLI_mempool.h"
#include "BLI_task.h"
#include "BLI_threads.h"
@@ -1354,3 +1355,89 @@ void BLI_task_parallel_listbase(
BLI_spin_end(&state.lock);
}
+
+
+typedef struct ParallelMempoolState {
+ void *userdata;
+ TaskParallelMempoolFunc func;
+} ParallelMempoolState;
+
+static void parallel_mempool_func(
+ TaskPool * __restrict pool,
+ void *taskdata,
+ int UNUSED(threadid))
+{
+ ParallelMempoolState * __restrict state = BLI_task_pool_userdata(pool);
+ BLI_mempool_iter *iter = taskdata;
+ MempoolIterData *item;
+
+ while ((item = BLI_mempool_iterstep(iter)) != NULL) {
+ state->func(state->userdata, item);
+ }
+}
+
+/**
+ * This function allows parallelizing for-loops over Mempool items.
+ *
+ * \param pool The iterable BLI_mempool to loop over.
+ * \param userdata Common userdata passed to all instances of \a func.
+ * \param func Callback function.
+ * \param use_threading If \a true, actually split-execute loop in threads, else just do a sequential for-loop
+ * (allows caller to use any kind of test to switch on parallelization or not).
+ *
+ * \note There is no static scheduling here.
+ */
+void BLI_task_parallel_mempool(
+ BLI_mempool *mempool,
+ void *userdata,
+ TaskParallelMempoolFunc func,
+ const bool use_threading)
+{
+ TaskScheduler *task_scheduler;
+ TaskPool *task_pool;
+ ParallelMempoolState state;
+ int i, num_threads, num_tasks;
+
+ if (BLI_mempool_count(mempool) == 0) {
+ return;
+ }
+
+ if (!use_threading) {
+ BLI_mempool_iter iter;
+ BLI_mempool_iternew(mempool, &iter);
+
+ for (void *item = BLI_mempool_iterstep(&iter); item != NULL; item = BLI_mempool_iterstep(&iter)) {
+ func(userdata, item);
+ }
+ return;
+ }
+
+ task_scheduler = BLI_task_scheduler_get();
+ task_pool = BLI_task_pool_create(task_scheduler, &state);
+ num_threads = BLI_task_scheduler_num_threads(task_scheduler);
+
+ /* The idea here is to prevent creating a task for each loop iteration,
+ * and instead have tasks which are evenly distributed across CPU cores and
+ * pull the next item to be crunched using the threaded-aware BLI_mempool_iter.
+ */
+ num_tasks = num_threads * 2;
+
+ state.userdata = userdata;
+ state.func = func;
+
+ BLI_mempool_iter *mempool_iterators = BLI_mempool_iter_threadsafe_create(mempool, (size_t)num_tasks);
+
+ for (i = 0; i < num_tasks; i++) {
+ /* Use this pool's pre-allocated tasks. */
+ BLI_task_pool_push_from_thread(task_pool,
+ parallel_mempool_func,
+ &mempool_iterators[i], false,
+ TASK_PRIORITY_HIGH,
+ task_pool->thread_id);
+ }
+
+ BLI_task_pool_work_and_wait(task_pool);
+ BLI_task_pool_free(task_pool);
+
+ BLI_mempool_iter_threadsafe_free(mempool_iterators);
+}
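
A minimal usage sketch of the new API (the callback and counter here are illustrative only; BLI_task_test.cc below exercises the same pattern for real):

/* Illustrative only: count the items of a mempool in parallel. */
static void count_items_cb(void *userdata, MempoolIterData *UNUSED(item))
{
	/* Tasks run concurrently, so the shared counter must be updated atomically. */
	atomic_add_and_fetch_uint32((uint32_t *)userdata, 1);
}

static uint32_t count_items(BLI_mempool *mempool)
{
	uint32_t count = 0;
	BLI_task_parallel_mempool(mempool, &count, count_items_cb, true);
	return count;
}
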
diff --git a/source/blender/bmesh/CMakeLists.txt b/source/blender/bmesh/CMakeLists.txt
index ea24da86626..43e45eab98f 100644
--- a/source/blender/bmesh/CMakeLists.txt
+++ b/source/blender/bmesh/CMakeLists.txt
@@ -30,6 +30,7 @@ set(INC
../blentranslation
../makesdna
../../../intern/guardedalloc
+ ../../../intern/atomic
../../../intern/eigen
../../../extern/rangetree
)
diff --git a/source/blender/bmesh/intern/bmesh_iterators_inline.h b/source/blender/bmesh/intern/bmesh_iterators_inline.h
index e68440021e6..32f0f4b67c4 100644
--- a/source/blender/bmesh/intern/bmesh_iterators_inline.h
+++ b/source/blender/bmesh/intern/bmesh_iterators_inline.h
@@ -182,4 +182,40 @@ BLI_INLINE void *BM_iter_new(BMIter *iter, BMesh *bm, const char itype, void *da
}
}
+/**
+ * \brief Parallel (threaded) iterator, only available for the most basic itertypes (verts/edges/faces of mesh).
+ *
+ * Uses BLI_task_parallel_mempool to iterate over all items of the underlying matching mempool.
+ *
+ * \note You have to include BLI_task.h before BMesh includes to be able to use this function!
+ */
+
+#ifdef __BLI_TASK_H__
+
+ATTR_NONNULL(1)
+BLI_INLINE void BM_iter_parallel(
+ BMesh *bm, const char itype, TaskParallelMempoolFunc func, void *userdata, const bool use_threading)
+{
+ BLI_assert(bm != NULL);
+
+ /* inlining optimizes out this switch when called with the defined type */
+ switch ((BMIterType)itype) {
+ case BM_VERTS_OF_MESH:
+ BLI_task_parallel_mempool(bm->vpool, userdata, func, use_threading);
+ break;
+ case BM_EDGES_OF_MESH:
+ BLI_task_parallel_mempool(bm->epool, userdata, func, use_threading);
+ break;
+ case BM_FACES_OF_MESH:
+ BLI_task_parallel_mempool(bm->fpool, userdata, func, use_threading);
+ break;
+ default:
+ /* should never happen */
+ BLI_assert(0);
+ break;
+ }
+}
+
+#endif /* __BLI_TASK_H__ */
+
#endif /* __BMESH_ITERATORS_INLINE_H__ */
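
A usage sketch for BM_iter_parallel, mirroring what bmesh_mesh.c does below (vert_zero_no_cb and verts_zero_normals are illustrative names, not part of the patch):

/* Illustrative only: zero all vertex normals of a BMesh in parallel. */
static void vert_zero_no_cb(void *UNUSED(userdata), MempoolIterData *mp_v)
{
	/* Each vertex is handed to exactly one task, so no locking is needed here. */
	zero_v3(((BMVert *)mp_v)->no);
}

static void verts_zero_normals(BMesh *bm)
{
	BM_iter_parallel(bm, BM_VERTS_OF_MESH, vert_zero_no_cb, NULL, bm->totvert >= BM_OMP_LIMIT);
}
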
diff --git a/source/blender/bmesh/intern/bmesh_mesh.c b/source/blender/bmesh/intern/bmesh_mesh.c
index 2ff670c770e..8407dc36040 100644
--- a/source/blender/bmesh/intern/bmesh_mesh.c
+++ b/source/blender/bmesh/intern/bmesh_mesh.c
@@ -35,6 +35,7 @@
#include "BLI_listbase.h"
#include "BLI_math.h"
#include "BLI_stack.h"
+#include "BLI_task.h"
#include "BLI_utildefines.h"
#include "BKE_cdderivedmesh.h"
@@ -42,6 +43,8 @@
#include "BKE_mesh.h"
#include "BKE_multires.h"
+#include "atomic_ops.h"
+
#include "intern/bmesh_private.h"
/* used as an extern, defined in bmesh.h */
@@ -318,146 +321,202 @@ void BM_mesh_free(BMesh *bm)
MEM_freeN(bm);
}
+
/**
* Helpers for #BM_mesh_normals_update and #BM_verts_calc_normal_vcos
*/
-static void bm_mesh_edges_calc_vectors(BMesh *bm, float (*edgevec)[3], const float (*vcos)[3])
+
+typedef struct BMEdgesCalcVectorsData {
+ /* Read-only data. */
+ const float (*vcos)[3];
+
+ /* Read-write data, but no need to protect it, no concurrency to fear here. */
+ float (*edgevec)[3];
+} BMEdgesCalcVectorsData;
+
+
+static void mesh_edges_calc_vectors_cb(void *userdata, MempoolIterData *mp_e)
{
- BMIter eiter;
- BMEdge *e;
- int index;
+ BMEdgesCalcVectorsData *data = userdata;
+ BMEdge *e = (BMEdge *)mp_e;
- if (vcos) {
- BM_mesh_elem_index_ensure(bm, BM_VERT);
+ if (e->l) {
+ const float *v1_co = data->vcos ? data->vcos[BM_elem_index_get(e->v1)] : e->v1->co;
+ const float *v2_co = data->vcos ? data->vcos[BM_elem_index_get(e->v2)] : e->v2->co;
+ sub_v3_v3v3(data->edgevec[BM_elem_index_get(e)], v2_co, v1_co);
+ normalize_v3(data->edgevec[BM_elem_index_get(e)]);
}
+ else {
+ /* the edge vector will not be needed when the edge has no radial */
+ }
+}
- BM_ITER_MESH_INDEX (e, &eiter, bm, BM_EDGES_OF_MESH, index) {
- BM_elem_index_set(e, index); /* set_inline */
+static void bm_mesh_edges_calc_vectors(BMesh *bm, float (*edgevec)[3], const float (*vcos)[3])
+{
+ BM_mesh_elem_index_ensure(bm, BM_EDGE | (vcos ? BM_VERT : 0));
- if (e->l) {
- const float *v1_co = vcos ? vcos[BM_elem_index_get(e->v1)] : e->v1->co;
- const float *v2_co = vcos ? vcos[BM_elem_index_get(e->v2)] : e->v2->co;
- sub_v3_v3v3(edgevec[index], v2_co, v1_co);
- normalize_v3(edgevec[index]);
- }
- else {
- /* the edge vector will not be needed when the edge has no radial */
+ BMEdgesCalcVectorsData data = {
+ .vcos = vcos,
+ .edgevec = edgevec
+ };
+
+ BM_iter_parallel(bm, BM_EDGES_OF_MESH, mesh_edges_calc_vectors_cb, &data, bm->totedge >= BM_OMP_LIMIT);
+}
+
+
+typedef struct BMVertsCalcNormalsData {
+ /* Read-only data. */
+ const float (*fnos)[3];
+ const float (*edgevec)[3];
+ const float (*vcos)[3];
+
+ /* Read-write data, protected by an atomic-based fake spinlock-like system... */
+ float (*vnos)[3];
+} BMVertsCalcNormalsData;
+
+static void mesh_verts_calc_normals_accum_cb(void *userdata, MempoolIterData *mp_f)
+{
+ BMVertsCalcNormalsData *data = userdata;
+ BMFace *f = (BMFace *)mp_f;
+
+ const float *f_no = data->fnos ? data->fnos[BM_elem_index_get(f)] : f->no;
+
+ BMLoop *l_first, *l_iter;
+ l_iter = l_first = BM_FACE_FIRST_LOOP(f);
+ do {
+ const float *e1diff, *e2diff;
+ float dotprod;
+ float fac;
+
+ /* calculate the dot product of the two edges that
+ * meet at the loop's vertex */
+ e1diff = data->edgevec[BM_elem_index_get(l_iter->prev->e)];
+ e2diff = data->edgevec[BM_elem_index_get(l_iter->e)];
+ dotprod = dot_v3v3(e1diff, e2diff);
+
+ /* edge vectors are calculated from e->v1 to e->v2, so
+ * adjust the dot product if one but not both loops
+ * actually runs from e->v2 to e->v1 */
+ if ((l_iter->prev->e->v1 == l_iter->prev->v) ^ (l_iter->e->v1 == l_iter->v)) {
+ dotprod = -dotprod;
+ }
+
+ fac = saacos(-dotprod);
+
+ /* accumulate weighted face normal into the vertex's normal */
+ float *v_no = data->vnos ? data->vnos[BM_elem_index_get(l_iter->v)] : l_iter->v->no;
+
+ /* This block is a lockless threadsafe madd_v3_v3fl.
+ * It uses the first float of the vector as a sort of cheap spinlock,
+ * assuming FLT_MAX is a safe 'illegal' value that cannot be set here otherwise.
+ * It also assumes that collisions between threads are highly unlikely,
+ * else performance would be quite bad here. */
+ float virtual_lock = v_no[0];
+ while (1) {
+ /* This loops until following conditions are met:
+ * - v_no[0] still had the same value as virtual_lock (i.e. it did not change since the last read).
+ * - v_no[0] was not FLT_MAX, i.e. it was not locked by another thread.
+ */
+ const float vl = atomic_cas_float(&v_no[0], virtual_lock, FLT_MAX);
+ if (vl == virtual_lock && vl != FLT_MAX) {
+ break;
+ }
+ virtual_lock = vl;
+ }
+ /* Now we own that normal value, and can change it.
+ * But the first scalar of the vector must not be changed yet, it's our lock! */
+ virtual_lock += f_no[0] * fac;
+ v_no[1] += f_no[1] * fac;
+ v_no[2] += f_no[2] * fac;
+ /* Second atomic operation to 'release' our lock on that vector and set its first scalar value. */
+ virtual_lock = atomic_cas_float(&v_no[0], FLT_MAX, virtual_lock);
+ BLI_assert(virtual_lock == FLT_MAX);
+
+ } while ((l_iter = l_iter->next) != l_first);
+}
+
+static void mesh_verts_calc_normals_normalize_cb(void *userdata, MempoolIterData *mp_v)
+{
+ BMVertsCalcNormalsData *data = userdata;
+ BMVert *v = (BMVert *)mp_v;
+
+ float *v_no = data->vnos ? data->vnos[BM_elem_index_get(v)] : v->no;
+ if (UNLIKELY(normalize_v3(v_no) == 0.0f)) {
+ const float *v_co = data->vcos ? data->vcos[BM_elem_index_get(v)] : v->co;
+ normalize_v3_v3(v_no, v_co);
}
- bm->elem_index_dirty &= ~BM_EDGE;
}
static void bm_mesh_verts_calc_normals(
BMesh *bm, const float (*edgevec)[3], const float (*fnos)[3],
const float (*vcos)[3], float (*vnos)[3])
{
- BM_mesh_elem_index_ensure(bm, (vnos) ? (BM_EDGE | BM_VERT) : BM_EDGE);
+ BM_mesh_elem_index_ensure(bm, (BM_EDGE | BM_FACE) | ((vnos || vcos) ? BM_VERT : 0));
- /* add weighted face normals to vertices */
- {
- BMIter fiter;
- BMFace *f;
- int i;
-
- BM_ITER_MESH_INDEX (f, &fiter, bm, BM_FACES_OF_MESH, i) {
- BMLoop *l_first, *l_iter;
- const float *f_no = fnos ? fnos[i] : f->no;
-
- l_iter = l_first = BM_FACE_FIRST_LOOP(f);
- do {
- const float *e1diff, *e2diff;
- float dotprod;
- float fac;
- float *v_no = vnos ? vnos[BM_elem_index_get(l_iter->v)] : l_iter->v->no;
-
- /* calculate the dot product of the two edges that
- * meet at the loop's vertex */
- e1diff = edgevec[BM_elem_index_get(l_iter->prev->e)];
- e2diff = edgevec[BM_elem_index_get(l_iter->e)];
- dotprod = dot_v3v3(e1diff, e2diff);
-
- /* edge vectors are calculated from e->v1 to e->v2, so
- * adjust the dot product if one but not both loops
- * actually runs from from e->v2 to e->v1 */
- if ((l_iter->prev->e->v1 == l_iter->prev->v) ^ (l_iter->e->v1 == l_iter->v)) {
- dotprod = -dotprod;
- }
+ BMVertsCalcNormalsData data = {
+ .fnos = fnos,
+ .edgevec = edgevec,
+ .vcos = vcos,
+ .vnos = vnos
+ };
- fac = saacos(-dotprod);
+ BM_iter_parallel(bm, BM_FACES_OF_MESH, mesh_verts_calc_normals_accum_cb, &data, bm->totface >= BM_OMP_LIMIT);
- /* accumulate weighted face normal into the vertex's normal */
- madd_v3_v3fl(v_no, f_no, fac);
- } while ((l_iter = l_iter->next) != l_first);
- }
- }
+ /* normalize the accumulated vertex normals */
+ BM_iter_parallel(bm, BM_VERTS_OF_MESH, mesh_verts_calc_normals_normalize_cb, &data, bm->totvert >= BM_OMP_LIMIT);
+}
- /* normalize the accumulated vertex normals */
- {
- BMIter viter;
- BMVert *v;
- int i;
-
- BM_ITER_MESH_INDEX (v, &viter, bm, BM_VERTS_OF_MESH, i) {
- float *v_no = vnos ? vnos[i] : v->no;
- if (UNLIKELY(normalize_v3(v_no) == 0.0f)) {
- const float *v_co = vcos ? vcos[i] : v->co;
- normalize_v3_v3(v_no, v_co);
- }
- }
- }
+static void mesh_faces_calc_normals_cb(void *UNUSED(userdata), MempoolIterData *mp_f)
+{
+ BMFace *f = (BMFace *)mp_f;
+
+ BM_face_normal_update(f);
}
+
/**
* \brief BMesh Compute Normals
*
* Updates the normals of a mesh.
*/
+#include "PIL_time_utildefines.h"
void BM_mesh_normals_update(BMesh *bm)
{
float (*edgevec)[3] = MEM_mallocN(sizeof(*edgevec) * bm->totedge, __func__);
-#pragma omp parallel sections if (bm->totvert + bm->totedge + bm->totface >= BM_OMP_LIMIT)
- {
-#pragma omp section
- {
- /* calculate all face normals */
- BMIter fiter;
- BMFace *f;
- int i;
-
- BM_ITER_MESH_INDEX (f, &fiter, bm, BM_FACES_OF_MESH, i) {
- BM_elem_index_set(f, i); /* set_inline */
- BM_face_normal_update(f);
- }
- bm->elem_index_dirty &= ~BM_FACE;
- }
-#pragma omp section
- {
- /* Zero out vertex normals */
- BMIter viter;
- BMVert *v;
- int i;
-
- BM_ITER_MESH_INDEX (v, &viter, bm, BM_VERTS_OF_MESH, i) {
- BM_elem_index_set(v, i); /* set_inline */
- zero_v3(v->no);
- }
- bm->elem_index_dirty &= ~BM_VERT;
- }
-#pragma omp section
- {
- /* Compute normalized direction vectors for each edge.
- * Directions will be used for calculating the weights of the face normals on the vertex normals.
- */
- bm_mesh_edges_calc_vectors(bm, edgevec, NULL);
- }
+
+ /* Parallel mempool iteration does not allow generating indices inline anymore... */
+ BM_mesh_elem_index_ensure(bm, (BM_EDGE | BM_FACE));
+
+ /* calculate all face normals */
+ BM_iter_parallel(bm, BM_FACES_OF_MESH, mesh_faces_calc_normals_cb, NULL, bm->totface >= BM_OMP_LIMIT);
+
+ /* Zero out vertex normals */
+ BMIter viter;
+ BMVert *v;
+ int i;
+
+ BM_ITER_MESH_INDEX (v, &viter, bm, BM_VERTS_OF_MESH, i) {
+ BM_elem_index_set(v, i); /* set_inline */
+ zero_v3(v->no);
}
- /* end omp */
+ bm->elem_index_dirty &= ~BM_VERT;
+
+ /* Compute normalized direction vectors for each edge.
+ * Directions will be used for calculating the weights of the face normals on the vertex normals.
+ */
+ bm_mesh_edges_calc_vectors(bm, edgevec, NULL);
/* Add weighted face normals to vertices, and normalize vert normals. */
+ bm_mesh_verts_calc_normals(bm, (const float(*)[3])edgevec, NULL, NULL, NULL);
MEM_freeN(edgevec);
}
/**
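
The 'virtual lock' used in mesh_verts_calc_normals_accum_cb is the central trick of this patch: the first component of the vertex normal doubles as a spinlock, with FLT_MAX as the 'locked' sentinel. Factored out as a standalone helper it reads as follows (atomic_madd_v3_v3fl is a hypothetical name, a sketch and not part of the patch):

/* Illustrative only: a lockless, threadsafe madd_v3_v3fl built on atomic_cas_float. */
static void atomic_madd_v3_v3fl(float v[3], const float a[3], const float fac)
{
	/* Acquire: swap v[0] to FLT_MAX, but only if it still holds the value we last read
	 * and that value is not FLT_MAX (i.e. the vector is not locked by another thread). */
	float virtual_lock = v[0];
	while (1) {
		const float vl = atomic_cas_float(&v[0], virtual_lock, FLT_MAX);
		if (vl == virtual_lock && vl != FLT_MAX) {
			break;
		}
		virtual_lock = vl;
	}
	/* Other threads trying to lock v now read FLT_MAX in v[0] and spin,
	 * so y and z can be updated safely. */
	virtual_lock += a[0] * fac;
	v[1] += a[1] * fac;
	v[2] += a[2] * fac;
	/* Release: publishing the real x component also unlocks the vector. */
	virtual_lock = atomic_cas_float(&v[0], FLT_MAX, virtual_lock);
	BLI_assert(virtual_lock == FLT_MAX);
	(void)virtual_lock;
}
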
diff --git a/tests/gtests/blenlib/BLI_task_test.cc b/tests/gtests/blenlib/BLI_task_test.cc
new file mode 100644
index 00000000000..e6464164ecb
--- /dev/null
+++ b/tests/gtests/blenlib/BLI_task_test.cc
@@ -0,0 +1,76 @@
+/* Apache License, Version 2.0 */
+
+#include "testing/testing.h"
+#include <string.h>
+
+#include "atomic_ops.h"
+
+extern "C" {
+#include "BLI_mempool.h"
+#include "BLI_task.h"
+#include "BLI_utildefines.h"
+};
+
+#define NUM_ITEMS 10000
+
+static void task_mempool_iter_func(void *userdata, MempoolIterData *item)
+{
+ int *data = (int *)item;
+ int *count = (int *)userdata;
+
+ EXPECT_TRUE(data != NULL);
+
+ *data += 1;
+ atomic_sub_and_fetch_uint32((uint32_t *)count, 1);
+}
+
+TEST(task, MempoolIter)
+{
+ int *data[NUM_ITEMS];
+ BLI_mempool *mempool = BLI_mempool_create(sizeof(*data[0]), NUM_ITEMS, 32, BLI_MEMPOOL_ALLOW_ITER);
+
+ int i;
+
+ /* 'Randomly' add and remove some items from mempool, to create a non-homogeneous one. */
+ int num_items = 0;
+ for (i = 0; i < NUM_ITEMS; i++) {
+ data[i] = (int *)BLI_mempool_alloc(mempool);
+ *data[i] = i - 1;
+ num_items++;
+ }
+
+ for (i = 0; i < NUM_ITEMS; i += 3) {
+ BLI_mempool_free(mempool, data[i]);
+ data[i] = NULL;
+ num_items--;
+ }
+
+ for (i = 0; i < NUM_ITEMS; i += 7) {
+ if (data[i] == NULL) {
+ data[i] = (int *)BLI_mempool_alloc(mempool);
+ *data[i] = i - 1;
+ num_items++;
+ }
+ }
+
+ for (i = 0; i < NUM_ITEMS - 5; i += 23) {
+ for (int j = 0; j < 5; j++) {
+ if (data[i + j] != NULL) {
+ BLI_mempool_free(mempool, data[i + j]);
+ data[i + j] = NULL;
+ num_items--;
+ }
+ }
+ }
+
+ BLI_task_parallel_mempool(mempool, &num_items, task_mempool_iter_func, true);
+
+ /* These checks ensure that all items of the mempool were processed once, and only once, as expected. */
+ EXPECT_EQ(num_items, 0);
+ for (i = 0; i < NUM_ITEMS; i++) {
+ if (data[i] != NULL) {
+ EXPECT_EQ(*data[i], i);
+ }
+ }
+
+ BLI_mempool_destroy(mempool);
+}
diff --git a/tests/gtests/blenlib/CMakeLists.txt b/tests/gtests/blenlib/CMakeLists.txt
index f3b2e81c61a..001f1d5f7b3 100644
--- a/tests/gtests/blenlib/CMakeLists.txt
+++ b/tests/gtests/blenlib/CMakeLists.txt
@@ -27,6 +27,7 @@ set(INC
../../../source/blender/blenlib
../../../source/blender/makesdna
../../../intern/guardedalloc
+ ../../../intern/atomic
)
include_directories(${INC})
@@ -56,6 +57,7 @@ BLENDER_TEST(BLI_polyfill2d "bf_blenlib")
BLENDER_TEST(BLI_stack "bf_blenlib")
BLENDER_TEST(BLI_string "bf_blenlib")
BLENDER_TEST(BLI_string_utf8 "bf_blenlib")
+BLENDER_TEST(BLI_task "bf_blenlib")
BLENDER_TEST_PERFORMANCE(BLI_ghash_performance "bf_blenlib")