DRW: Add draw call sorting

This makes rendering lots of similar objects much faster (with lower CPU overhead): 29 fps -> 38 fps (34 ms -> 26 ms) in my test case with 30K instances of 4 different meshes.
author: Clément Foucault <foucault.clem@gmail.com> 2019-06-18 23:02:52 +0300
committer: Clément Foucault <foucault.clem@gmail.com> 2019-08-17 15:48:48 +0300
commit: 15a4171e2dd75059222637ccba44db59733c735d (patch)
tree: 6da76dc2ff856f1a03eb155896d9b7dbab2687c1
parent: 0e6d17edfeb9ede447b5bbe9e84fbb4c48175ac9 (diff)
2 files changed, 45 insertions, 2 deletions
diff --git a/source/blender/draw/intern/draw_manager.h b/source/blender/draw/intern/draw_manager.h
index 9ccc6832127..54e35c7e5c4 100644
--- a/source/blender/draw/intern/draw_manager.h
+++ b/source/blender/draw/intern/draw_manager.h
@@ -313,7 +313,7 @@ typedef struct DRWCallChunk {
   struct DRWCallChunk *next; /* single-linked list */
   uchar chunk_len;
   uchar call_used;
-  DRWCall calls[63];
+  DRWCall calls[126];
 } DRWCallChunk;
 
 typedef struct DRWCallSmallChunk {
@@ -322,9 +322,12 @@ typedef struct DRWCallSmallChunk {
   uchar call_used;
   /* Small chunk to avoid wasting too much memory
    * on small shading groups. */
-  DRWCall calls[5];
+  DRWCall calls[4];
 } DRWCallSmallChunk;
 
+BLI_STATIC_ASSERT_ALIGN(DRWCallChunk, 16);
+BLI_STATIC_ASSERT_ALIGN(DRWCallSmallChunk, 16);
+
 /* ------------- DRAW DEBUG ------------ */
 
 typedef struct DRWDebugLine {
diff --git a/source/blender/draw/intern/draw_manager_data.c b/source/blender/draw/intern/draw_manager_data.c
index 34c67f5c727..35d33917593 100644
--- a/source/blender/draw/intern/draw_manager_data.c
+++ b/source/blender/draw/intern/draw_manager_data.c
@@ -34,6 +34,7 @@
 #include "DNA_mesh_types.h"
 #include "DNA_meta_types.h"
 
+#include "BLI_alloca.h"
 #include "BLI_hash.h"
 #include "BLI_link_utils.h"
 #include "BLI_mempool.h"
@@ -51,6 +52,35 @@
 /** \name Uniform Buffer Object (DRW_uniformbuffer)
  * \{ */
 
+/* Stable counting sort of the first `array_len` calls of `array`, keyed on a hash of each
+ * call's batch pointer, so calls sharing a batch become adjacent. `array_tmp` is scratch. */
+static void draw_call_sort(DRWCall *array, DRWCall *array_tmp, int array_len)
+{
+  /* Count calls per bucket. It's not really important if there are collisions. If there are
+   * a lot of different batches, the sorting benefit will be negligible. So at least sort fast! */
+  uchar idx[128] = {0};
+  /* Shift by 7 positions knowing each GPUBatch is > 64 bytes */
+#define KEY(a) ((((size_t)((a).batch)) >> 7) % ARRAY_SIZE(idx))
+  BLI_assert(array_len <= ARRAY_SIZE(idx));
+  for (int i = 0; i < array_len; i++) {
+    /* Early out if every call lands in the same bucket: nothing to sort. */
+    if (++idx[KEY(array[i])] == array_len) {
+      return;
+    }
+  }
+  /* Accumulate the counts so that idx[k] is one past the last output slot of bucket k. */
+  for (int i = 1; i < ARRAY_SIZE(idx); i++) {
+    idx[i] += idx[i - 1];
+  }
+  /* Traverse in reverse to not change the order of the resource ids. */
+  for (int src = array_len - 1; src >= 0; src--) {
+    array_tmp[--idx[KEY(array[src])]] = array[src];
+  }
+#undef KEY
+  /* Copy back only the sorted calls: slots past `array_len` were not written by this function. */
+  memcpy(array, array_tmp, sizeof(*array) * array_len);
+}
+
 GPUUniformBuffer *DRW_uniformbuffer_create(int size, const void *data)
 {
   return GPU_uniformbuffer_create(size, data, NULL);
@@ -107,6 +137,16 @@ void drw_resource_buffer_finish(ViewportMemoryPool *vmempool)
       GPU_uniformbuffer_update(vmempool->obinfos_ubo[i], data_infos);
     }
   }
+
+  /* Aligned alloc to avoid unaligned memcpy. */
+  DRWCallChunk *chunk_tmp = MEM_mallocN_aligned(sizeof(DRWCallChunk), 16, "tmp call chunk");
+  DRWCallChunk *chunk;
+  BLI_memblock_iter iter;
+  BLI_memblock_iternew(vmempool->calls, &iter);
+  while ((chunk = BLI_memblock_iterstep(&iter))) {
+    draw_call_sort(chunk->calls, chunk_tmp->calls, chunk->call_used);
+  }
+  MEM_freeN(chunk_tmp);
 }
 
 /** \} */
author	Clément Foucault <foucault.clem@gmail.com>	2019-06-18 23:02:52 +0300
committer	Clément Foucault <foucault.clem@gmail.com>	2019-08-17 15:48:48 +0300
commit	15a4171e2dd75059222637ccba44db59733c735d (patch)
tree	6da76dc2ff856f1a03eb155896d9b7dbab2687c1
parent	0e6d17edfeb9ede447b5bbe9e84fbb4c48175ac9 (diff)