git.blender.org/blender.git
author     Clément Foucault <foucault.clem@gmail.com>  2019-05-31 02:45:41 +0300
committer  Clément Foucault <foucault.clem@gmail.com>  2019-09-13 18:32:18 +0300
commit     ce34a6b0d727bbde6ae373afa8ec6c42bc8980ce
tree       f8cc84f7e2038f2a81ac0141d79205f1df649e4e /source/blender/gpu/intern
parent     f7e8b580989ec70d1cf8f15a11d4f09e6b36f407
DRW: Refactor to support draw call batching
Reviewers: brecht

Differential Revision: D4997
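For context, a minimal usage sketch of the draw-list API this patch introduces (see gpu_batch.c below). The batch, the per-draw ranges, and the capacity of 64 are placeholder assumptions; submission also happens automatically inside GPU_draw_list_command_add() when the command buffer fills up:

/* Hedged usage sketch, not part of the patch. Assumes `batch` is a built
 * GPUBatch whose program is already bound (GPU_batch_program_use_begin). */
GPUDrawList *list = GPU_draw_list_create(64); /* Capacity: 64 commands per flush. */

GPU_draw_list_init(list, batch);
for (int i = 0; i < draw_count; i++) { /* draw_count and the ranges are placeholders. */
  /* v_first/v_count select the vertex range, i_first/i_count the instances. */
  GPU_draw_list_command_add(list, v_first[i], v_count[i], i_first[i], i_count[i]);
}
GPU_draw_list_submit(list); /* One glMultiDraw*Indirect call if supported, else a loop. */

GPU_draw_list_discard(list);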
Diffstat (limited to 'source/blender/gpu/intern')
-rw-r--r--  source/blender/gpu/intern/gpu_batch.c             | 237
-rw-r--r--  source/blender/gpu/intern/gpu_codegen.c           |  35
-rw-r--r--  source/blender/gpu/intern/gpu_shader.c            |   6
-rw-r--r--  source/blender/gpu/intern/gpu_shader_interface.c  |   5
-rw-r--r--  source/blender/gpu/intern/gpu_viewport.c          |  25
5 files changed, 285 insertions(+), 23 deletions(-)
diff --git a/source/blender/gpu/intern/gpu_batch.c b/source/blender/gpu/intern/gpu_batch.c
index e0c0aea576c..697bf37f34d 100644
--- a/source/blender/gpu/intern/gpu_batch.c
+++ b/source/blender/gpu/intern/gpu_batch.c
@@ -39,8 +39,9 @@
#include <stdlib.h>
#include <string.h>
+#include <limits.h>
-static void batch_update_program_bindings(GPUBatch *batch, uint v_first);
+static void batch_update_program_bindings(GPUBatch *batch, uint i_first);
void GPU_batch_vao_cache_clear(GPUBatch *batch)
{
@@ -446,20 +447,51 @@ static void create_bindings(GPUVertBuf *verts,
}
}
-static void batch_update_program_bindings(GPUBatch *batch, uint v_first)
+static void instance_id_workaround(GPUBatch *batch)
+{
+ /**
+ * A driver bug makes it so that, when using an attribute with GL_INT_2_10_10_10_REV as
+ * format, gl_InstanceID is incremented by the 2-bit component of the attribute. To work
+ * around this, we create a new vertex attribute containing the expected value of
+ * gl_InstanceID.
+ **/
+ const GPUShaderInput *input = GPU_shaderinterface_attr(batch->interface, "_instanceId");
+ if (input) {
+#define DRW_RESOURCE_CHUNK_LEN 512 /* Keep in sync with the draw manager. */
+ static GLuint vbo_id = 0;
+ if (vbo_id == 0) {
+ short data[DRW_RESOURCE_CHUNK_LEN];
+ for (int i = 0; i < DRW_RESOURCE_CHUNK_LEN; i++) {
+ data[i] = i;
+ }
+ /* GPU_context takes care of deleting `vbo_id` at the end. */
+ vbo_id = GPU_buf_alloc();
+ glBindBuffer(GL_ARRAY_BUFFER, vbo_id);
+ glBufferData(GL_ARRAY_BUFFER, sizeof(data), data, GL_STATIC_DRAW);
+ }
+ glBindBuffer(GL_ARRAY_BUFFER, vbo_id);
+ glEnableVertexAttribArray(input->location);
+ glVertexAttribIPointer(input->location, 1, GL_SHORT, 0, NULL);
+ glVertexAttribDivisor(input->location, 1);
+ }
+}
+
+static void batch_update_program_bindings(GPUBatch *batch, uint i_first)
{
/* Reverse order so the first VBOs take precedence (in terms of attribute override). */
for (int v = GPU_BATCH_VBO_MAX_LEN - 1; v > -1; v--) {
if (batch->verts[v] != NULL) {
- create_bindings(batch->verts[v], batch->interface, (batch->inst) ? 0 : v_first, false);
+ create_bindings(batch->verts[v], batch->interface, 0, false);
}
}
if (batch->inst) {
- create_bindings(batch->inst, batch->interface, v_first, true);
+ create_bindings(batch->inst, batch->interface, i_first, true);
}
if (batch->elem) {
GPU_indexbuf_use(batch->elem);
}
+ if (GPU_crappy_amd_driver()) {
+ instance_id_workaround(batch);
+ }
}
void GPU_batch_program_use_begin(GPUBatch *batch)
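The GLSL side of this workaround is not part of the diff; below is a hedged sketch of how the GPU_CRAPPY_AMD_DRIVER define (added in gpu_shader.c further down) could remap gl_InstanceID onto the injected attribute. The actual snippet lives in Blender's shader libraries and may differ:

/* Hypothetical shader prologue, written as the C string a codegen step might
 * inject. `_instanceId` matches the attribute bound by instance_id_workaround(). */
static const char *instance_id_remap_glsl =
    "#ifdef GPU_CRAPPY_AMD_DRIVER\n"
    "in int _instanceId;\n" /* Fed by the GL_SHORT, divisor-1 attribute above. */
    "#  define gl_InstanceID _instanceId\n"
    "#endif\n";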
@@ -618,6 +650,14 @@ void GPU_batch_draw(GPUBatch *batch)
GPU_batch_program_use_end(batch);
}
+#if GPU_TRACK_INDEX_RANGE
+# define BASE_INDEX(el) ((el)->base_index)
+# define INDEX_TYPE(el) ((el)->gl_index_type)
+#else
+# define BASE_INDEX(el) 0
+# define INDEX_TYPE(el) GL_UNSIGNED_INT
+#endif
+
void GPU_batch_draw_advanced(GPUBatch *batch, int v_first, int v_count, int i_first, int i_count)
{
#if TRUST_NO_ONE
@@ -632,8 +672,13 @@ void GPU_batch_draw_advanced(GPUBatch *batch, int v_first, int v_count, int i_fi
i_count = (batch->inst) ? batch->inst->vertex_len : 1;
}
+ if (v_count == 0 || i_count == 0) {
+ /* Nothing to draw. */
+ return;
+ }
+
if (!GPU_arb_base_instance_is_supported()) {
- if (i_first > 0 && i_count > 0) {
+ if (i_first > 0) {
/* If using offset drawing with instancing, we must
* use the default VAO and redo bindings. */
glBindVertexArray(GPU_vao_default());
@@ -648,13 +693,8 @@ void GPU_batch_draw_advanced(GPUBatch *batch, int v_first, int v_count, int i_fi
if (batch->elem) {
const GPUIndexBuf *el = batch->elem;
-#if GPU_TRACK_INDEX_RANGE
- GLenum index_type = el->gl_index_type;
- GLint base_index = el->base_index;
-#else
- GLenum index_type = GL_UNSIGNED_INT;
- GLint base_index = 0;
-#endif
+ GLenum index_type = INDEX_TYPE(el);
+ GLint base_index = BASE_INDEX(el);
void *v_first_ofs = elem_offset(el, v_first);
if (GPU_arb_base_instance_is_supported()) {
@@ -697,6 +737,179 @@ void GPU_draw_primitive(GPUPrimType prim_type, int v_count)
}
/* -------------------------------------------------------------------- */
+/** \name Indirect Draw Calls
+ * \{ */
+
+#if 0
+# define USE_MULTI_DRAW_INDIRECT 0
+#else
+# define USE_MULTI_DRAW_INDIRECT \
+ (GL_ARB_multi_draw_indirect && GPU_arb_base_instance_is_supported())
+#endif
+
+typedef struct GPUDrawCommand {
+ uint v_count;
+ uint i_count;
+ uint v_first;
+ uint i_first;
+} GPUDrawCommand;
+
+typedef struct GPUDrawCommandIndexed {
+ uint v_count;
+ uint i_count;
+ uint v_first;
+ uint base_index;
+ uint i_first;
+} GPUDrawCommandIndexed;
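These two structs deliberately match the command layouts consumed by glDrawArraysIndirect and glDrawElementsIndirect, which is what lets GPU_draw_list_submit() below hand the command array straight to glMultiDraw*Indirect. For comparison, the layouts as given in the GL spec:

/* OpenGL's indirect draw command layouts, for comparison with the
 * GPUDrawCommand(Indexed) structs above (same field order, different names). */
typedef struct {
  GLuint count;         /* == v_count */
  GLuint instanceCount; /* == i_count */
  GLuint first;         /* == v_first */
  GLuint baseInstance;  /* == i_first */
} DrawArraysIndirectCommand;

typedef struct {
  GLuint count;         /* == v_count */
  GLuint instanceCount; /* == i_count */
  GLuint firstIndex;    /* == v_first */
  GLuint baseVertex;    /* == base_index */
  GLuint baseInstance;  /* == i_first */
} DrawElementsIndirectCommand;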
+
+struct GPUDrawList {
+ GPUBatch *batch;
+ uint base_index; /* Avoid dereferencing batch. */
+ uint cmd_offset; /* in bytes, offset inside indirect command buffer. */
+ uint cmd_len; /* Number of commands used for the next call. */
+ uint buffer_size; /* in bytes, size of indirect command buffer. */
+ GLuint buffer_id; /* Draw Indirect Buffer id */
+ union {
+ GPUDrawCommand *commands;
+ GPUDrawCommandIndexed *commands_indexed;
+ };
+};
+
+GPUDrawList *GPU_draw_list_create(int length)
+{
+ GPUDrawList *list = MEM_callocN(sizeof(GPUDrawList), "GPUDrawList");
+ /* Alloc the biggest possible command list which is indexed. */
+ list->buffer_size = sizeof(GPUDrawCommandIndexed) * length;
+ if (USE_MULTI_DRAW_INDIRECT) {
+ list->buffer_id = GPU_buf_alloc();
+ glBindBuffer(GL_DRAW_INDIRECT_BUFFER, list->buffer_id);
+ glBufferData(GL_DRAW_INDIRECT_BUFFER, list->buffer_size, NULL, GL_DYNAMIC_DRAW);
+ }
+ else {
+ list->commands = MEM_mallocN(list->buffer_size, "GPUDrawList data");
+ }
+ return list;
+}
+
+void GPU_draw_list_discard(GPUDrawList *list)
+{
+ if (list->buffer_id) {
+ GPU_buf_free(list->buffer_id);
+ }
+ else {
+ MEM_SAFE_FREE(list->commands);
+ }
+ MEM_freeN(list);
+}
+
+void GPU_draw_list_init(GPUDrawList *list, GPUBatch *batch)
+{
+ BLI_assert(batch->phase == GPU_BATCH_READY_TO_DRAW);
+ list->batch = batch;
+ list->base_index = batch->elem ? BASE_INDEX(batch->elem) : UINT_MAX;
+ list->cmd_len = 0;
+
+ if (USE_MULTI_DRAW_INDIRECT) {
+ if (list->commands == NULL) {
+ glBindBuffer(GL_DRAW_INDIRECT_BUFFER, list->buffer_id);
+ if (list->cmd_offset >= list->buffer_size) {
+ /* Orphan buffer data and start fresh. */
+ glBufferData(GL_DRAW_INDIRECT_BUFFER, list->buffer_size, NULL, GL_DYNAMIC_DRAW);
+ list->cmd_offset = 0;
+ }
+ GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT | GL_MAP_FLUSH_EXPLICIT_BIT;
+ list->commands = glMapBufferRange(
+ GL_DRAW_INDIRECT_BUFFER, list->cmd_offset, list->buffer_size - list->cmd_offset, flags);
+ }
+ }
+ else {
+ list->cmd_offset = 0;
+ }
+}
+
+void GPU_draw_list_command_add(
+ GPUDrawList *list, int v_first, int v_count, int i_first, int i_count)
+{
+ BLI_assert(list->commands);
+
+ if (list->base_index != UINT_MAX) {
+ GPUDrawCommandIndexed *cmd = list->commands_indexed + list->cmd_len;
+ cmd->v_first = v_first;
+ cmd->v_count = v_count;
+ cmd->i_count = i_count;
+ cmd->base_index = list->base_index;
+ cmd->i_first = i_first;
+ }
+ else {
+ GPUDrawCommand *cmd = list->commands + list->cmd_len;
+ cmd->v_first = v_first;
+ cmd->v_count = v_count;
+ cmd->i_count = i_count;
+ cmd->i_first = i_first;
+ }
+
+ list->cmd_len++;
+ uint offset = list->cmd_offset + list->cmd_len * sizeof(GPUDrawCommandIndexed);
+
+ if (offset == list->buffer_size) {
+ GPU_draw_list_submit(list);
+ GPU_draw_list_init(list, list->batch);
+ }
+}
+
+void GPU_draw_list_submit(GPUDrawList *list)
+{
+ GPUBatch *batch = list->batch;
+
+ if (list->cmd_len == 0) {
+ return;
+ }
+
+ BLI_assert(list->commands);
+ BLI_assert(batch->program_in_use);
+ /* TODO could assert that VAO is bound. */
+
+ /* TODO We lose a bit of memory here if we only draw arrays. Fix that. */
+ uintptr_t offset = list->cmd_offset;
+ uint cmd_len = list->cmd_len;
+ size_t bytes_used = cmd_len * sizeof(GPUDrawCommandIndexed);
+ list->cmd_offset += bytes_used;
+ list->cmd_len = 0; /* Avoid reuse. */
+
+ if (USE_MULTI_DRAW_INDIRECT) {
+ GLenum prim = batch->gl_prim_type;
+
+ glBindBuffer(GL_DRAW_INDIRECT_BUFFER, list->buffer_id);
+ glFlushMappedBufferRange(GL_DRAW_INDIRECT_BUFFER, 0, bytes_used);
+ glUnmapBuffer(GL_DRAW_INDIRECT_BUFFER);
+ list->commands = NULL; /* Unmapped */
+
+ if (batch->elem) {
+ glMultiDrawElementsIndirect(prim, INDEX_TYPE(batch->elem), (void *)offset, cmd_len, 0);
+ }
+ else {
+ glMultiDrawArraysIndirect(prim, (void *)offset, cmd_len, 0);
+ }
+ }
+ else {
+ /* Fallback */
+ if (batch->elem) {
+ GPUDrawCommandIndexed *cmd = list->commands_indexed;
+ for (int i = 0; i < cmd_len; i++, cmd++) {
+ GPU_batch_draw_advanced(batch, cmd->v_first, cmd->v_count, cmd->i_first, cmd->i_count);
+ }
+ }
+ else {
+ GPUDrawCommand *cmd = list->commands;
+ for (int i = 0; i < cmd_len; i++, cmd++) {
+ GPU_batch_draw_advanced(batch, cmd->v_first, cmd->v_count, cmd->i_first, cmd->i_count);
+ }
+ }
+ }
+}
+
+/** \} */
+
+/* -------------------------------------------------------------------- */
/** \name Utilities
* \{ */
diff --git a/source/blender/gpu/intern/gpu_codegen.c b/source/blender/gpu/intern/gpu_codegen.c
index 7483be74e01..410e23c9576 100644
--- a/source/blender/gpu/intern/gpu_codegen.c
+++ b/source/blender/gpu/intern/gpu_codegen.c
@@ -55,6 +55,12 @@
#include <string.h>
#include <stdarg.h>
+extern char datatoc_gpu_shader_material_glsl[];
+extern char datatoc_gpu_shader_geometry_glsl[];
+
+extern char datatoc_gpu_shader_common_obinfos_lib_glsl[];
+extern char datatoc_common_view_lib_glsl[];
+
/* -------------------- GPUPass Cache ------------------ */
/**
* Internal shader cache: This prevents the shader recompilation / stall when
@@ -778,6 +784,12 @@ static void codegen_call_functions(DynStr *ds, ListBase *nodes, GPUOutput *final
else if (input->builtin == GPU_OBJECT_MATRIX) {
BLI_dynstr_append(ds, "objmat");
}
+ else if (input->builtin == GPU_OBJECT_INFO) {
+ BLI_dynstr_append(ds, "ObjectInfo");
+ }
+ else if (input->builtin == GPU_OBJECT_COLOR) {
+ BLI_dynstr_append(ds, "ObjectColor");
+ }
else if (input->builtin == GPU_INVERSE_OBJECT_MATRIX) {
BLI_dynstr_append(ds, "objinv");
}
@@ -840,6 +852,10 @@ static char *code_generate_fragment(GPUMaterial *material,
codegen_set_unique_ids(nodes);
*rbuiltins = builtins = codegen_process_uniforms_functions(material, ds, nodes);
+ if (builtins & (GPU_OBJECT_INFO | GPU_OBJECT_COLOR)) {
+ BLI_dynstr_append(ds, datatoc_gpu_shader_common_obinfos_lib_glsl);
+ }
+
if (builtins & GPU_BARYCENTRIC_TEXCO) {
BLI_dynstr_append(ds, "in vec2 barycentricTexCo;\n");
}
@@ -988,7 +1004,7 @@ static char *code_generate_vertex(ListBase *nodes, const char *vert_code, bool u
/* NOTE : Replicate changes to mesh_render_data_create() in draw_cache_impl_mesh.c */
if (input->attr_type == CD_ORCO) {
/* OPTI : orco is computed from local positions, but only if no modifier is present. */
- BLI_dynstr_append(ds, "uniform vec3 OrcoTexCoFactors[2];\n");
+ BLI_dynstr_append(ds, datatoc_gpu_shader_common_obinfos_lib_glsl);
BLI_dynstr_append(ds, "DEFINE_ATTR(vec4, orco);\n");
}
else if (input->attr_name[0] == '\0') {
@@ -1070,6 +1086,8 @@ static char *code_generate_vertex(ListBase *nodes, const char *vert_code, bool u
BLI_dynstr_append(ds, "\n");
+ BLI_dynstr_append(ds, use_geom ? "RESOURCE_ID_VARYING_GEOM\n" : "RESOURCE_ID_VARYING\n");
+
BLI_dynstr_append(ds,
"#define USE_ATTR\n"
"vec3 srgb_to_linear_attr(vec3 c) {\n"
@@ -1099,6 +1117,8 @@ static char *code_generate_vertex(ListBase *nodes, const char *vert_code, bool u
BLI_dynstr_append(ds, "void pass_attr(in vec3 position) {\n");
+ BLI_dynstr_append(ds, use_geom ? "\tPASS_RESOURCE_ID_GEOM\n" : "\tPASS_RESOURCE_ID\n");
+
BLI_dynstr_append(ds, "#ifdef HAIR_SHADER\n");
if (builtins & GPU_BARYCENTRIC_TEXCO) {
@@ -1125,8 +1145,8 @@ static char *code_generate_vertex(ListBase *nodes, const char *vert_code, bool u
}
else if (input->attr_type == CD_ORCO) {
BLI_dynstr_appendf(ds,
- "\tvar%d%s = OrcoTexCoFactors[0] + (ModelMatrixInverse * "
- "vec4(hair_get_strand_pos(), 1.0)).xyz * OrcoTexCoFactors[1];\n",
+ "\tvar%d%s = OrcoTexCoFactors[0].xyz + (ModelMatrixInverse * "
+ "vec4(hair_get_strand_pos(), 1.0)).xyz * OrcoTexCoFactors[1].xyz;\n",
input->attr_id,
use_geom ? "g" : "");
/* TODO: fix ORCO with modifiers. */
@@ -1181,7 +1201,8 @@ static char *code_generate_vertex(ListBase *nodes, const char *vert_code, bool u
}
else if (input->attr_type == CD_ORCO) {
BLI_dynstr_appendf(ds,
- "\tvar%d%s = OrcoTexCoFactors[0] + position * OrcoTexCoFactors[1];\n",
+ "\tvar%d%s = OrcoTexCoFactors[0].xyz + position *"
+ " OrcoTexCoFactors[1].xyz;\n",
input->attr_id,
use_geom ? "g" : "");
/* See mesh_create_loop_orco() for explanation. */
@@ -1296,6 +1317,8 @@ static char *code_generate_geometry(ListBase *nodes, const char *geom_code, cons
BLI_dynstr_append(ds, "out vec3 worldNormal;\n");
BLI_dynstr_append(ds, "out vec3 viewNormal;\n");
+ BLI_dynstr_append(ds, datatoc_common_view_lib_glsl);
+
BLI_dynstr_append(ds, "void main(){\n");
if (builtins & GPU_BARYCENTRIC_DIST) {
@@ -1340,9 +1363,13 @@ static char *code_generate_geometry(ListBase *nodes, const char *geom_code, cons
BLI_dynstr_append(ds, "}\n");
}
+ BLI_dynstr_append(ds, "RESOURCE_ID_VARYING\n");
+
/* Generate varying assignments. */
BLI_dynstr_append(ds, "void pass_attr(in int vert) {\n");
+ BLI_dynstr_append(ds, "\tPASS_RESOURCE_ID(vert)\n");
+
/* XXX HACK: Eevee specific. */
if (geom_code == NULL) {
BLI_dynstr_append(ds, "\tworldPosition = worldPositiong[vert];\n");
diff --git a/source/blender/gpu/intern/gpu_shader.c b/source/blender/gpu/intern/gpu_shader.c
index 42c21626c05..7e8cb8a4fa0 100644
--- a/source/blender/gpu/intern/gpu_shader.c
+++ b/source/blender/gpu/intern/gpu_shader.c
@@ -248,6 +248,9 @@ static void gpu_shader_standard_extensions(char defines[MAX_EXT_DEFINE_LENGTH])
/* a #version 400 feature, but we use #version 330 maximum so use extension */
strcat(defines, "#extension GL_ARB_texture_query_lod: enable\n");
}
+ if (GLEW_ARB_shader_draw_parameters) {
+ strcat(defines, "#extension GL_ARB_shader_draw_parameters : enable\n");
+ }
}
static void gpu_shader_standard_defines(char defines[MAX_DEFINE_LENGTH])
@@ -255,6 +258,9 @@ static void gpu_shader_standard_defines(char defines[MAX_DEFINE_LENGTH])
/* some useful defines to detect GPU type */
if (GPU_type_matches(GPU_DEVICE_ATI, GPU_OS_ANY, GPU_DRIVER_ANY)) {
strcat(defines, "#define GPU_ATI\n");
+ if (GPU_crappy_amd_driver()) {
+ strcat(defines, "#define GPU_CRAPPY_AMD_DRIVER\n");
+ }
}
else if (GPU_type_matches(GPU_DEVICE_NVIDIA, GPU_OS_ANY, GPU_DRIVER_ANY)) {
strcat(defines, "#define GPU_NVIDIA\n");
diff --git a/source/blender/gpu/intern/gpu_shader_interface.c b/source/blender/gpu/intern/gpu_shader_interface.c
index 083c5bf2b60..983c5dfc27a 100644
--- a/source/blender/gpu/intern/gpu_shader_interface.c
+++ b/source/blender/gpu/intern/gpu_shader_interface.c
@@ -65,9 +65,8 @@ static const char *BuiltinUniform_name(GPUUniformBuiltin u)
[GPU_UNIFORM_CLIPPLANES] = "WorldClipPlanes",
[GPU_UNIFORM_COLOR] = "color",
- [GPU_UNIFORM_CALLID] = "callId",
- [GPU_UNIFORM_OBJECT_INFO] = "unfobjectinfo",
- [GPU_UNIFORM_OBJECT_COLOR] = "unfobjectcolor",
+ [GPU_UNIFORM_BASE_INSTANCE] = "baseInstance",
+ [GPU_UNIFORM_RESOURCE_CHUNK] = "resourceChunk",
[GPU_UNIFORM_CUSTOM] = NULL,
[GPU_NUM_UNIFORMS] = NULL,
diff --git a/source/blender/gpu/intern/gpu_viewport.c b/source/blender/gpu/intern/gpu_viewport.c
index fcb1a008226..615af57c1bd 100644
--- a/source/blender/gpu/intern/gpu_viewport.c
+++ b/source/blender/gpu/intern/gpu_viewport.c
@@ -39,6 +39,7 @@
#include "GPU_immediate.h"
#include "GPU_texture.h"
#include "GPU_viewport.h"
+#include "GPU_uniformbuffer.h"
#include "DRW_engine.h"
@@ -619,11 +620,20 @@ void GPU_viewport_free(GPUViewport *viewport)
MEM_freeN(viewport->fbl);
MEM_freeN(viewport->txl);
- if (viewport->vmempool.calls != NULL) {
- BLI_memblock_destroy(viewport->vmempool.calls, NULL);
+ if (viewport->vmempool.commands != NULL) {
+ BLI_memblock_destroy(viewport->vmempool.commands, NULL);
}
- if (viewport->vmempool.states != NULL) {
- BLI_memblock_destroy(viewport->vmempool.states, NULL);
+ if (viewport->vmempool.commands_small != NULL) {
+ BLI_memblock_destroy(viewport->vmempool.commands_small, NULL);
+ }
+ if (viewport->vmempool.callbuffers != NULL) {
+ BLI_memblock_destroy(viewport->vmempool.callbuffers, NULL);
+ }
+ if (viewport->vmempool.obmats != NULL) {
+ BLI_memblock_destroy(viewport->vmempool.obmats, NULL);
+ }
+ if (viewport->vmempool.obinfos != NULL) {
+ BLI_memblock_destroy(viewport->vmempool.obinfos, NULL);
}
if (viewport->vmempool.cullstates != NULL) {
BLI_memblock_destroy(viewport->vmempool.cullstates, NULL);
@@ -650,6 +660,13 @@ void GPU_viewport_free(GPUViewport *viewport)
BLI_memblock_destroy(viewport->vmempool.images, NULL);
}
+ for (int i = 0; i < viewport->vmempool.ubo_len; i++) {
+ GPU_uniformbuffer_free(viewport->vmempool.matrices_ubo[i]);
+ GPU_uniformbuffer_free(viewport->vmempool.obinfos_ubo[i]);
+ }
+ MEM_SAFE_FREE(viewport->vmempool.matrices_ubo);
+ MEM_SAFE_FREE(viewport->vmempool.obinfos_ubo);
+
DRW_instance_data_list_free(viewport->idatalist);
MEM_freeN(viewport->idatalist);