diff options
author | Clément Foucault <foucault.clem@gmail.com> | 2019-06-18 23:02:52 +0300 |
---|---|---|
committer | Clément Foucault <foucault.clem@gmail.com> | 2019-08-17 15:48:48 +0300 |
commit | 15a4171e2dd75059222637ccba44db59733c735d (patch) | |
tree | 6da76dc2ff856f1a03eb155896d9b7dbab2687c1 | |
parent | 0e6d17edfeb9ede447b5bbe9e84fbb4c48175ac9 (diff) |
DRW: Add draw call sorting
This makes rendering lots of similar objects much faster (with lower CPU
overhead).
29 fps -> 38 fps
34 ms -> 26 ms
(measured in my test case with 30K instances of 4 different meshes)
-rw-r--r-- | source/blender/draw/intern/draw_manager.h | 7 | ||||
-rw-r--r-- | source/blender/draw/intern/draw_manager_data.c | 55 |
2 files changed, 60 insertions, 2 deletions
diff --git a/source/blender/draw/intern/draw_manager.h b/source/blender/draw/intern/draw_manager.h
index 9ccc6832127..54e35c7e5c4 100644
--- a/source/blender/draw/intern/draw_manager.h
+++ b/source/blender/draw/intern/draw_manager.h
@@ -313,7 +313,7 @@ typedef struct DRWCallChunk {
   struct DRWCallChunk *next; /* single-linked list */
   uchar chunk_len;
   uchar call_used;
-  DRWCall calls[63];
+  DRWCall calls[126];
 } DRWCallChunk;
 
 typedef struct DRWCallSmallChunk {
@@ -322,9 +322,12 @@ typedef struct DRWCallSmallChunk {
   uchar call_used;
   /* Small chunk to avoid wasting too much memory
    * on small shading groups. */
-  DRWCall calls[5];
+  DRWCall calls[4];
 } DRWCallSmallChunk;
 
+BLI_STATIC_ASSERT_ALIGN(DRWCallChunk, 16);
+BLI_STATIC_ASSERT_ALIGN(DRWCallSmallChunk, 16);
+
 /* ------------- DRAW DEBUG ------------ */
 
 typedef struct DRWDebugLine {
diff --git a/source/blender/draw/intern/draw_manager_data.c b/source/blender/draw/intern/draw_manager_data.c
index 34c67f5c727..35d33917593 100644
--- a/source/blender/draw/intern/draw_manager_data.c
+++ b/source/blender/draw/intern/draw_manager_data.c
@@ -34,6 +34,7 @@
 #include "DNA_mesh_types.h"
 #include "DNA_meta_types.h"
 
+#include "BLI_alloca.h"
 #include "BLI_hash.h"
 #include "BLI_link_utils.h"
 #include "BLI_mempool.h"
@@ -51,6 +52,50 @@
 /** \name Uniform Buffer Object (DRW_uniformbuffer)
  * \{ */
 
+/**
+ * Reorder the draw calls of a chunk with a stable counting sort, so that calls
+ * whose batch hashes to the same bucket become contiguous.
+ *
+ * Calls are bucketed on a hash of their `batch` pointer. Two different batches may
+ * land in the same bucket; this only makes the grouping less effective.
+ *
+ * \param array: Calls to sort in place.
+ * \param array_tmp: Scratch buffer with room for at least \a array_len calls.
+ * \param array_len: Number of calls in \a array, at most 128 (asserted).
+ */
+static void draw_call_sort(DRWCall *array, DRWCall *array_tmp, int array_len)
+{
+  /* Count calls per bucket. It's not really important if
+   * there are collisions. If there are a lot of different batches,
+   * the sorting benefit will be negligible. So at least
+   * sort fast! */
+  uchar idx[128] = {0};
+  /* Shift by 7 positions knowing each GPUBatch is > 64 bytes.
+   * NOTE(review): 64 bytes is 1 << 6, so this comment and the shift disagree - confirm. */
+#define KEY(a) ((((size_t)((a).batch)) >> 7) % ARRAY_SIZE(idx))
+  BLI_assert(array_len <= ARRAY_SIZE(idx));
+
+  for (int i = 0; i < array_len; i++) {
+    /* Early out if nothing to sort: every call fell into the same bucket. */
+    if (++idx[KEY(array[i])] == array_len) {
+      return;
+    }
+  }
+  /* Cumulate the counts: idx[k] becomes the end (exclusive) of bucket k in the output. */
+  for (int i = 1; i < ARRAY_SIZE(idx); i++) {
+    idx[i] += idx[i - 1];
+  }
+  /* Traverse in reverse to not change the order of the resource ids. */
+  for (int src = array_len - 1; src >= 0; src--) {
+    array_tmp[--idx[KEY(array[src])]] = array[src];
+  }
+#undef KEY
+
+  /* Copy back only the sorted calls. Nothing past `array_len` was written to
+   * `array_tmp`, and `array` is not guaranteed to be a full DRWCallChunk. */
+  memcpy(array, array_tmp, sizeof(*array) * array_len);
+}
+
 GPUUniformBuffer *DRW_uniformbuffer_create(int size, const void *data)
 {
   return GPU_uniformbuffer_create(size, data, NULL);
@@ -107,6 +152,16 @@ void drw_resource_buffer_finish(ViewportMemoryPool *vmempool)
       GPU_uniformbuffer_update(vmempool->obinfos_ubo[i], data_infos);
     }
   }
+
+  /* Aligned alloc to avoid unaligned memcpy. */
+  DRWCallChunk *chunk_tmp = MEM_mallocN_aligned(sizeof(DRWCallChunk), 16, "tmp call chunk");
+  DRWCallChunk *chunk;
+  BLI_memblock_iter iter;
+  BLI_memblock_iternew(vmempool->calls, &iter);
+  while ((chunk = BLI_memblock_iterstep(&iter))) {
+    draw_call_sort(chunk->calls, chunk_tmp->calls, chunk->call_used);
+  }
+  MEM_freeN(chunk_tmp);
 }
 
 /** \} */