From 498ffef7b054c08a1d05271aeb4fde5eeeebf450 Mon Sep 17 00:00:00 2001 From: Sergey Sharybin Date: Thu, 11 Jan 2018 14:32:56 +0100 Subject: Subsurf: Use regular mutex instead of RW one Mutex is now local to particular CCGDM, and guarding edge hash which is only used by a single function only. There is no need to acquire read lock after edge hash was created. --- source/blender/blenkernel/BKE_subsurf.h | 2 +- source/blender/blenkernel/intern/subsurf_ccg.c | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'source/blender') diff --git a/source/blender/blenkernel/BKE_subsurf.h b/source/blender/blenkernel/BKE_subsurf.h index d7b9d20d7b0..96320415b16 100644 --- a/source/blender/blenkernel/BKE_subsurf.h +++ b/source/blender/blenkernel/BKE_subsurf.h @@ -144,7 +144,7 @@ typedef struct CCGDerivedMesh { struct EdgeHash *ehash; - ThreadRWMutex loops_cache_rwlock; + ThreadMutex loops_cache_lock; ThreadRWMutex origindex_cache_rwlock; } CCGDerivedMesh; diff --git a/source/blender/blenkernel/intern/subsurf_ccg.c b/source/blender/blenkernel/intern/subsurf_ccg.c index d2f325fb3c4..c6b701d2d28 100644 --- a/source/blender/blenkernel/intern/subsurf_ccg.c +++ b/source/blender/blenkernel/intern/subsurf_ccg.c @@ -1488,7 +1488,7 @@ static void ccgDM_copyFinalLoopArray(DerivedMesh *dm, MLoop *mloop) /* DMFlagMat *faceFlags = ccgdm->faceFlags; */ /* UNUSED */ if (!ccgdm->ehash) { - BLI_rw_mutex_lock(&ccgdm->loops_cache_rwlock, THREAD_LOCK_WRITE); + BLI_mutex_lock(&ccgdm->loops_cache_lock); if (!ccgdm->ehash) { MEdge *medge; EdgeHash *ehash; @@ -1502,10 +1502,9 @@ static void ccgDM_copyFinalLoopArray(DerivedMesh *dm, MLoop *mloop) atomic_cas_ptr((void**)&ccgdm->ehash, ccgdm->ehash, ehash); } - BLI_rw_mutex_unlock(&ccgdm->loops_cache_rwlock); + BLI_mutex_unlock(&ccgdm->loops_cache_lock); } - BLI_rw_mutex_lock(&ccgdm->loops_cache_rwlock, THREAD_LOCK_READ); totface = ccgSubSurf_getNumFaces(ss); ml = mloop; for (index = 0; index < totface; index++) { @@ -1548,7 +1547,6 
@@ static void ccgDM_copyFinalLoopArray(DerivedMesh *dm, MLoop *mloop) } } } - BLI_rw_mutex_unlock(&ccgdm->loops_cache_rwlock); } static void ccgDM_copyFinalPolyArray(DerivedMesh *dm, MPoly *mpoly) @@ -4050,7 +4048,7 @@ static void ccgDM_release(DerivedMesh *dm) MEM_freeN(ccgdm->faceMap); } - BLI_rw_mutex_end(&ccgdm->loops_cache_rwlock); + BLI_mutex_end(&ccgdm->loops_cache_lock); BLI_rw_mutex_end(&ccgdm->origindex_cache_rwlock); MEM_freeN(ccgdm); @@ -5044,7 +5042,7 @@ static CCGDerivedMesh *getCCGDerivedMesh(CCGSubSurf *ss, ccgdm->dm.numLoopData = ccgdm->dm.numPolyData * 4; ccgdm->dm.numTessFaceData = 0; - BLI_rw_mutex_init(&ccgdm->loops_cache_rwlock); + BLI_mutex_init(&ccgdm->loops_cache_lock); BLI_rw_mutex_init(&ccgdm->origindex_cache_rwlock); return ccgdm; -- cgit v1.2.3 From 1255f572c76d989e9acf86d1d6089303406bc72f Mon Sep 17 00:00:00 2001 From: Sergey Sharybin Date: Thu, 11 Jan 2018 14:47:12 +0100 Subject: Depsgraph: Make eval initialization more friendly for threading Helps in cases of not very complex scenes and lots of system threads available. It is a bit hard to measure the change on its own; it works best with the upcoming changes and gives measurable improvements. 
--- source/blender/depsgraph/intern/eval/deg_eval.cc | 3 +-- source/blender/depsgraph/intern/eval/deg_eval_flush.cc | 7 +++---- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'source/blender') diff --git a/source/blender/depsgraph/intern/eval/deg_eval.cc b/source/blender/depsgraph/intern/eval/deg_eval.cc index 76e76b5eb7b..c29a0708cef 100644 --- a/source/blender/depsgraph/intern/eval/deg_eval.cc +++ b/source/blender/depsgraph/intern/eval/deg_eval.cc @@ -144,13 +144,12 @@ static void calculate_pending_func( static void calculate_pending_parents(Depsgraph *graph, unsigned int layers) { const int num_operations = graph->operations.size(); - const bool do_threads = (num_operations > 256); CalculatePengindData data; data.graph = graph; data.layers = layers; ParallelRangeSettings settings; BLI_parallel_range_settings_defaults(&settings); - settings.use_threading = do_threads; + settings.min_iter_per_thread = 1024; BLI_task_parallel_range(0, num_operations, &data, diff --git a/source/blender/depsgraph/intern/eval/deg_eval_flush.cc b/source/blender/depsgraph/intern/eval/deg_eval_flush.cc index 9e910afea07..daf008ddb7d 100644 --- a/source/blender/depsgraph/intern/eval/deg_eval_flush.cc +++ b/source/blender/depsgraph/intern/eval/deg_eval_flush.cc @@ -112,7 +112,7 @@ BLI_INLINE void flush_prepare(Depsgraph *graph) const int num_operations = graph->operations.size(); ParallelRangeSettings settings; BLI_parallel_range_settings_defaults(&settings); - settings.use_threading = (num_operations > 256); + settings.min_iter_per_thread = 1024; BLI_task_parallel_range(0, num_operations, graph, flush_init_operation_node_func, @@ -122,7 +122,7 @@ BLI_INLINE void flush_prepare(Depsgraph *graph) const int num_id_nodes = graph->id_nodes.size(); ParallelRangeSettings settings; BLI_parallel_range_settings_defaults(&settings); - settings.use_threading = (num_id_nodes > 256); + settings.min_iter_per_thread = 1024; BLI_task_parallel_range(0, num_id_nodes, graph, 
flush_init_id_node_func, @@ -311,10 +311,9 @@ void deg_graph_clear_tags(Depsgraph *graph) { /* Go over all operation nodes, clearing tags. */ const int num_operations = graph->operations.size(); - const bool do_threads = num_operations > 256; ParallelRangeSettings settings; BLI_parallel_range_settings_defaults(&settings); - settings.use_threading = do_threads; + settings.min_iter_per_thread = 1024; BLI_task_parallel_range(0, num_operations, graph, graph_clear_func, -- cgit v1.2.3 From c276fef4132557118a3bcba38220e02bf4f9f1b4 Mon Sep 17 00:00:00 2001 From: Sergey Sharybin Date: Thu, 11 Jan 2018 14:49:32 +0100 Subject: Subsurf: Make copyFinalLoopArray() threaded Gives about 40% speedup of object which has simple-ish deformation applied on top of subdivided mesh. This might easily happen with single character animation. --- source/blender/blenkernel/intern/subsurf_ccg.c | 121 ++++++++++++++++--------- 1 file changed, 76 insertions(+), 45 deletions(-) (limited to 'source/blender') diff --git a/source/blender/blenkernel/intern/subsurf_ccg.c b/source/blender/blenkernel/intern/subsurf_ccg.c index c6b701d2d28..1b174cf4654 100644 --- a/source/blender/blenkernel/intern/subsurf_ccg.c +++ b/source/blender/blenkernel/intern/subsurf_ccg.c @@ -58,6 +58,7 @@ #include "BLI_edgehash.h" #include "BLI_math.h" #include "BLI_memarena.h" +#include "BLI_task.h" #include "BLI_threads.h" #include "BKE_pbvh.h" @@ -1476,16 +1477,67 @@ static void ccgDM_copyFinalFaceArray(DerivedMesh *dm, MFace *mface) } } +typedef struct CopyFinalLoopArrayData { + CCGDerivedMesh *ccgdm; + MLoop *mloop; + int grid_size; + int *grid_offset; + int edge_size; + size_t mloop_index; +} CopyFinalLoopArrayData; + +static void copyFinalLoopArray_task_cb( + void *__restrict userdata, + const int iter, + const ParallelRangeTLS *__restrict UNUSED(tls)) +{ + CopyFinalLoopArrayData *data = userdata; + CCGDerivedMesh *ccgdm = data->ccgdm; + CCGSubSurf *ss = ccgdm->ss; + const int grid_size = data->grid_size; + const int 
edge_size = data->edge_size; + CCGFace *f = ccgdm->faceMap[iter].face; + const int num_verts = ccgSubSurf_getFaceNumVerts(f); + const int grid_index = data->grid_offset[iter]; + const size_t loop_index = 4 * (size_t)grid_index * (grid_size - 1) * (grid_size - 1); + MLoop *ml = &data->mloop[loop_index]; + for (int S = 0; S < num_verts; S++) { + for (int y = 0; y < grid_size - 1; y++) { + for (int x = 0; x < grid_size - 1; x++) { + + uint v1 = getFaceIndex(ss, f, S, x + 0, y + 0, + edge_size, grid_size); + uint v2 = getFaceIndex(ss, f, S, x + 0, y + 1, + edge_size, grid_size); + uint v3 = getFaceIndex(ss, f, S, x + 1, y + 1, + edge_size, grid_size); + uint v4 = getFaceIndex(ss, f, S, x + 1, y + 0, + edge_size, grid_size); + + ml->v = v1; + ml->e = GET_UINT_FROM_POINTER(BLI_edgehash_lookup(ccgdm->ehash, v1, v2)); + ml++; + + ml->v = v2; + ml->e = GET_UINT_FROM_POINTER(BLI_edgehash_lookup(ccgdm->ehash, v2, v3)); + ml++; + + ml->v = v3; + ml->e = GET_UINT_FROM_POINTER(BLI_edgehash_lookup(ccgdm->ehash, v3, v4)); + ml++; + + ml->v = v4; + ml->e = GET_UINT_FROM_POINTER(BLI_edgehash_lookup(ccgdm->ehash, v4, v1)); + ml++; + } + } + } +} + static void ccgDM_copyFinalLoopArray(DerivedMesh *dm, MLoop *mloop) { CCGDerivedMesh *ccgdm = (CCGDerivedMesh *) dm; CCGSubSurf *ss = ccgdm->ss; - int index; - int totface; - int gridSize = ccgSubSurf_getGridSize(ss); - int edgeSize = ccgSubSurf_getEdgeSize(ss); - MLoop *ml; - /* DMFlagMat *faceFlags = ccgdm->faceFlags; */ /* UNUSED */ if (!ccgdm->ehash) { BLI_mutex_lock(&ccgdm->loops_cache_lock); @@ -1505,48 +1557,27 @@ static void ccgDM_copyFinalLoopArray(DerivedMesh *dm, MLoop *mloop) BLI_mutex_unlock(&ccgdm->loops_cache_lock); } - totface = ccgSubSurf_getNumFaces(ss); - ml = mloop; - for (index = 0; index < totface; index++) { - CCGFace *f = ccgdm->faceMap[index].face; - int x, y, S, numVerts = ccgSubSurf_getFaceNumVerts(f); - /* int flag = (faceFlags) ? faceFlags[index * 2]: ME_SMOOTH; */ /* UNUSED */ - /* int mat_nr = (faceFlags) ? 
faceFlags[index * 2 + 1]: 0; */ /* UNUSED */ - - for (S = 0; S < numVerts; S++) { - for (y = 0; y < gridSize - 1; y++) { - for (x = 0; x < gridSize - 1; x++) { - unsigned int v1, v2, v3, v4; - - v1 = getFaceIndex(ss, f, S, x + 0, y + 0, - edgeSize, gridSize); - - v2 = getFaceIndex(ss, f, S, x + 0, y + 1, - edgeSize, gridSize); - v3 = getFaceIndex(ss, f, S, x + 1, y + 1, - edgeSize, gridSize); - v4 = getFaceIndex(ss, f, S, x + 1, y + 0, - edgeSize, gridSize); + CopyFinalLoopArrayData data; + data.ccgdm = ccgdm; + data.mloop = mloop; + data.grid_size = ccgSubSurf_getGridSize(ss); + data.grid_offset = dm->getGridOffset(dm); + data.edge_size = ccgSubSurf_getEdgeSize(ss); - ml->v = v1; - ml->e = GET_UINT_FROM_POINTER(BLI_edgehash_lookup(ccgdm->ehash, v1, v2)); - ml++; - - ml->v = v2; - ml->e = GET_UINT_FROM_POINTER(BLI_edgehash_lookup(ccgdm->ehash, v2, v3)); - ml++; + /* NOTE: For a dense subdivision we've got enough work for each face and + * hence can dedicate whole thread to single face. For less dense + * subdivision we handle multiple faces per thread. + */ + data.mloop_index = data.grid_size >= 5 ? 1 : 8; - ml->v = v3; - ml->e = GET_UINT_FROM_POINTER(BLI_edgehash_lookup(ccgdm->ehash, v3, v4)); - ml++; + ParallelRangeSettings settings; + BLI_parallel_range_settings_defaults(&settings); + settings.min_iter_per_thread = 1; - ml->v = v4; - ml->e = GET_UINT_FROM_POINTER(BLI_edgehash_lookup(ccgdm->ehash, v4, v1)); - ml++; - } - } - } - } + BLI_task_parallel_range(0, ccgSubSurf_getNumFaces(ss), + &data, + copyFinalLoopArray_task_cb, + &settings); } static void ccgDM_copyFinalPolyArray(DerivedMesh *dm, MPoly *mpoly) -- cgit v1.2.3 From cdcdae663ac36118fa23abb86c6f85be2f543c56 Mon Sep 17 00:00:00 2001 From: Sergey Sharybin Date: Thu, 11 Jan 2018 14:51:30 +0100 Subject: Mesh deform: Tweak threadability criteria Allow threading using subset of all available threads. Makes it faster to evaluate lower resolution mesh but with complex deform groups and such. 
--- source/blender/modifiers/intern/MOD_meshdeform.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'source/blender') diff --git a/source/blender/modifiers/intern/MOD_meshdeform.c b/source/blender/modifiers/intern/MOD_meshdeform.c index 5e878f28e25..8f197ce3b99 100644 --- a/source/blender/modifiers/intern/MOD_meshdeform.c +++ b/source/blender/modifiers/intern/MOD_meshdeform.c @@ -418,7 +418,7 @@ static void meshdeformModifier_do( /* Do deformation. */ ParallelRangeSettings settings; BLI_parallel_range_settings_defaults(&settings); - settings.use_threading = (totvert > 1000); + settings.min_iter_per_thread = 16; BLI_task_parallel_range(0, totvert, &data, meshdeform_vert_task, -- cgit v1.2.3 From 374c4f168d6278c100cb88d192d1eec6e243247b Mon Sep 17 00:00:00 2001 From: Sergey Sharybin Date: Thu, 11 Jan 2018 15:04:13 +0100 Subject: Mesh evaluate: Tweak threadability criteria Gives measurable speedup on layout scenes from Spring. Actual value for chunk size is a subject for more scientific research. 
--- source/blender/blenkernel/intern/mesh_evaluate.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'source/blender') diff --git a/source/blender/blenkernel/intern/mesh_evaluate.c b/source/blender/blenkernel/intern/mesh_evaluate.c index b321065d84c..76c629912ac 100644 --- a/source/blender/blenkernel/intern/mesh_evaluate.c +++ b/source/blender/blenkernel/intern/mesh_evaluate.c @@ -287,12 +287,11 @@ void BKE_mesh_calc_normals_poly( int numLoops, int numPolys, float (*r_polynors)[3], const bool only_face_normals) { - const bool do_threaded = (numPolys > BKE_MESH_OMP_LIMIT); float (*pnors)[3] = r_polynors; ParallelRangeSettings settings; BLI_parallel_range_settings_defaults(&settings); - settings.use_threading = do_threaded; + settings.min_iter_per_thread = 1024; if (only_face_normals) { BLI_assert((pnors != NULL) || (numPolys == 0)); -- cgit v1.2.3 From 38d480fb54dbac5bb87b002e4ee22c1d3df89f90 Mon Sep 17 00:00:00 2001 From: Sergey Sharybin Date: Thu, 11 Jan 2018 15:06:56 +0100 Subject: Subsurf: Allow partial threading over geometry arrays This helps avoid threading overhead when there are lots of system threads. --- source/blender/blenkernel/intern/CCGSubSurf_legacy.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'source/blender') diff --git a/source/blender/blenkernel/intern/CCGSubSurf_legacy.c b/source/blender/blenkernel/intern/CCGSubSurf_legacy.c index d567b50af56..2b331eae950 100644 --- a/source/blender/blenkernel/intern/CCGSubSurf_legacy.c +++ b/source/blender/blenkernel/intern/CCGSubSurf_legacy.c @@ -34,6 +34,9 @@ #define FACE_calcIFNo(f, lvl, S, x, y, no) _face_calcIFNo(f, lvl, S, x, y, no, subdivLevels, vertDataSize) +/* TODO(sergey): This actually depends on subsurf level as well. 
*/ +#define CCG_TASK_LIMIT 16 + /* TODO(sergey): Deduplicate the following functions/ */ static void *_edge_getCoVert(CCGEdge *e, CCGVert *v, int lvl, int x, int dataSize) { @@ -340,7 +343,7 @@ static void ccgSubSurf__calcVertNormals(CCGSubSurf *ss, { ParallelRangeSettings settings; BLI_parallel_range_settings_defaults(&settings); - settings.use_threading = (numEffectedF * edgeSize * edgeSize * 4 >= CCG_OMP_LIMIT); + settings.min_iter_per_thread = CCG_TASK_LIMIT; BLI_task_parallel_range(0, numEffectedF, &data, ccgSubSurf__calcVertNormals_faces_accumulate_cb, @@ -374,7 +377,7 @@ static void ccgSubSurf__calcVertNormals(CCGSubSurf *ss, { ParallelRangeSettings settings; BLI_parallel_range_settings_defaults(&settings); - settings.use_threading = (numEffectedE * edgeSize * 4 >= CCG_OMP_LIMIT); + settings.min_iter_per_thread = CCG_TASK_LIMIT; BLI_task_parallel_range(0, numEffectedE, &data, ccgSubSurf__calcVertNormals_edges_accumulate_cb, @@ -384,7 +387,7 @@ static void ccgSubSurf__calcVertNormals(CCGSubSurf *ss, { ParallelRangeSettings settings; BLI_parallel_range_settings_defaults(&settings); - settings.use_threading = (numEffectedF * edgeSize * edgeSize * 4 >= CCG_OMP_LIMIT); + settings.min_iter_per_thread = CCG_TASK_LIMIT; BLI_task_parallel_range(0, numEffectedF, &data, ccgSubSurf__calcVertNormals_faces_finalize_cb, @@ -683,7 +686,7 @@ static void ccgSubSurf__calcSubdivLevel( { ParallelRangeSettings settings; BLI_parallel_range_settings_defaults(&settings); - settings.use_threading = (numEffectedF * edgeSize * edgeSize * 4 >= CCG_OMP_LIMIT); + settings.min_iter_per_thread = CCG_TASK_LIMIT; BLI_task_parallel_range(0, numEffectedF, &data, ccgSubSurf__calcSubdivLevel_interior_faces_edges_midpoints_cb, @@ -966,7 +969,7 @@ static void ccgSubSurf__calcSubdivLevel( { ParallelRangeSettings settings; BLI_parallel_range_settings_defaults(&settings); - settings.use_threading = (numEffectedF * edgeSize * edgeSize * 4 >= CCG_OMP_LIMIT); + settings.min_iter_per_thread = 
CCG_TASK_LIMIT; BLI_task_parallel_range(0, numEffectedF, &data, ccgSubSurf__calcSubdivLevel_interior_faces_edges_centerpoints_shift_cb, @@ -986,7 +989,7 @@ static void ccgSubSurf__calcSubdivLevel( { ParallelRangeSettings settings; BLI_parallel_range_settings_defaults(&settings); - settings.use_threading = (numEffectedF * edgeSize * edgeSize * 4 >= CCG_OMP_LIMIT); + settings.min_iter_per_thread = CCG_TASK_LIMIT; BLI_task_parallel_range(0, numEffectedF, &data, ccgSubSurf__calcSubdivLevel_verts_copydata_cb, -- cgit v1.2.3