git.blender.org/blender.git
author     Clément Foucault <foucault.clem@gmail.com>  2019-07-14 17:49:44 +0300
committer  Clément Foucault <foucault.clem@gmail.com>  2019-08-14 20:05:26 +0300
commit     9c010c44f4201ab114b3facc69d0343525a1779f (patch)
tree       744271becd24cead6c900e4f023d8c0bfa6138b6 /source/blender/gpu
parent     45a45f7d66211e82a3a3288782ad9523e8fdc516 (diff)
Mesh Batch Cache: Refactor + Multithread
For clarity's sake, the batch cache now uses per-loop attributes exclusively. While this wastes a bit of VRAM (for the few cases where per-vertex attributes would be enough), it reduces the complexity and the overall number of VBOs to update in the general case.

This patch also makes the vertex-buffer filling multithreaded, which makes updating dense meshes a bit faster. The main bottleneck is the index-buffer update, which cannot be multithreaded efficiently (it has to increment a counter and/or do a final sorting pass).

We introduce the concept of "extract" functions/steps. Each extract function is executed in its own thread and, where possible, uses multiple threads to loop over all elements.

Reviewed By: brecht

Differential Revision: http://developer.blender.org/D5424
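The "extract" pattern described above boils down to one worker per output buffer, each owning its buffer exclusively so no locking is needed while filling. A minimal sketch of that dispatch (hypothetical names and plain pthreads, not the actual draw-cache code):

    /* Each extract step owns one output buffer and runs in its own
     * thread; the mesh is shared read-only, so no locking is needed. */
    #include <pthread.h>
    #include <stdlib.h>

    typedef struct ExtractStep {
      void (*extract)(const void *mesh, void *buffer); /* fills one VBO/IBO */
      const void *mesh; /* shared input, read-only */
      void *buffer;     /* exclusive output */
    } ExtractStep;

    static void *extract_thread_fn(void *arg)
    {
      ExtractStep *step = arg;
      step->extract(step->mesh, step->buffer);
      return NULL;
    }

    /* Launch one thread per extract step, then wait for all of them. */
    static void extract_run_all(ExtractStep *steps, int count)
    {
      pthread_t *threads = malloc(sizeof(pthread_t) * (size_t)count);
      for (int i = 0; i < count; i++) {
        pthread_create(&threads[i], NULL, extract_thread_fn, &steps[i]);
      }
      for (int i = 0; i < count; i++) {
        pthread_join(threads[i], NULL);
      }
      free(threads);
    }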
Diffstat (limited to 'source/blender/gpu')
-rw-r--r--  source/blender/gpu/GPU_batch.h                                        3
-rw-r--r--  source/blender/gpu/GPU_element.h                                     19
-rw-r--r--  source/blender/gpu/GPU_vertex_format.h                               62
-rw-r--r--  source/blender/gpu/intern/gpu_batch.c                                44
-rw-r--r--  source/blender/gpu/intern/gpu_element.c                             100
-rw-r--r--  source/blender/gpu/intern/gpu_vertex_format.c                        78
-rw-r--r--  source/blender/gpu/intern/gpu_vertex_format_private.h                 1
-rw-r--r--  source/blender/gpu/shaders/gpu_shader_2D_edituvs_stretch_vert.glsl  11
8 files changed, 249 insertions, 69 deletions
diff --git a/source/blender/gpu/GPU_batch.h b/source/blender/gpu/GPU_batch.h
index 365dd89a006..c65ca5d905e 100644
--- a/source/blender/gpu/GPU_batch.h
+++ b/source/blender/gpu/GPU_batch.h
@@ -40,7 +40,7 @@ typedef enum {
GPU_BATCH_READY_TO_DRAW,
} GPUBatchPhase;
-#define GPU_BATCH_VBO_MAX_LEN 4
+#define GPU_BATCH_VBO_MAX_LEN 5
#define GPU_BATCH_VAO_STATIC_LEN 3
#define GPU_BATCH_VAO_DYN_ALLOC_COUNT 16
@@ -115,6 +115,7 @@ void GPU_batch_vao_cache_clear(GPUBatch *);
void GPU_batch_callback_free_set(GPUBatch *, void (*callback)(GPUBatch *, void *), void *);
void GPU_batch_instbuf_set(GPUBatch *, GPUVertBuf *, bool own_vbo); /* Instancing */
+void GPU_batch_elembuf_set(GPUBatch *batch, GPUIndexBuf *elem, bool own_ibo);
int GPU_batch_vertbuf_add_ex(GPUBatch *, GPUVertBuf *, bool own_vbo);
diff --git a/source/blender/gpu/GPU_element.h b/source/blender/gpu/GPU_element.h
index 4ac89d2658b..75caf4cbd6a 100644
--- a/source/blender/gpu/GPU_element.h
+++ b/source/blender/gpu/GPU_element.h
@@ -36,14 +36,19 @@ typedef enum {
} GPUIndexBufType;
typedef struct GPUIndexBuf {
+ uint index_start;
uint index_len;
+ bool is_subrange;
#if GPU_TRACK_INDEX_RANGE
GPUIndexBufType index_type;
uint32_t gl_index_type;
uint base_index;
#endif
uint32_t ibo_id; /* 0 indicates not yet sent to VRAM */
- void *data; /* non-NULL indicates not yet sent to VRAM */
+ union {
+ void *data; /* non-NULL indicates not yet sent to VRAM */
+ struct GPUIndexBuf *src; /* if is_subrange is true, this is the source buffer. */
+ };
} GPUIndexBuf;
void GPU_indexbuf_use(GPUIndexBuf *);
@@ -71,9 +76,21 @@ void GPU_indexbuf_add_line_verts(GPUIndexBufBuilder *, uint v1, uint v2);
void GPU_indexbuf_add_tri_verts(GPUIndexBufBuilder *, uint v1, uint v2, uint v3);
void GPU_indexbuf_add_line_adj_verts(GPUIndexBufBuilder *, uint v1, uint v2, uint v3, uint v4);
+void GPU_indexbuf_set_point_vert(GPUIndexBufBuilder *builder, uint elem, uint v1);
+void GPU_indexbuf_set_line_verts(GPUIndexBufBuilder *builder, uint elem, uint v1, uint v2);
+void GPU_indexbuf_set_tri_verts(GPUIndexBufBuilder *builder, uint elem, uint v1, uint v2, uint v3);
+
+/* Skip primitive rendering at the given index. */
+void GPU_indexbuf_set_point_restart(GPUIndexBufBuilder *builder, uint elem);
+void GPU_indexbuf_set_line_restart(GPUIndexBufBuilder *builder, uint elem);
+void GPU_indexbuf_set_tri_restart(GPUIndexBufBuilder *builder, uint elem);
+
GPUIndexBuf *GPU_indexbuf_build(GPUIndexBufBuilder *);
void GPU_indexbuf_build_in_place(GPUIndexBufBuilder *, GPUIndexBuf *);
+/* Create a subrange of an existing indexbuffer. */
+GPUIndexBuf *GPU_indexbuf_create_subrange(GPUIndexBuf *ibo, uint start, uint length);
+
void GPU_indexbuf_discard(GPUIndexBuf *);
int GPU_indexbuf_primitive_len(GPUPrimType prim_type);
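The new set/restart entry points write by element index instead of appending, which is what makes multithreaded filling possible: each element touches a disjoint index range. A usage sketch, assuming the existing GPU_indexbuf_init(&builder, prim, prim_len, vert_len) initializer; tri_len, vert_len, tris, and tri_is_hidden are hypothetical mesh data:

    /* Fill a triangle IBO by element index, hide some triangles with
     * restart entries, then expose a slice of it without copying. */
    GPUIndexBufBuilder builder;
    GPU_indexbuf_init(&builder, GPU_PRIM_TRIS, tri_len, vert_len);

    for (uint t = 0; t < tri_len; t++) {
      if (tri_is_hidden[t]) {
        GPU_indexbuf_set_tri_restart(&builder, t);
      }
      else {
        GPU_indexbuf_set_tri_verts(&builder, t, tris[t][0], tris[t][1], tris[t][2]);
      }
    }

    GPUIndexBuf *ibo = GPU_indexbuf_build(&builder);
    /* Draw only the first half of the triangles, sharing the VRAM data. */
    GPUIndexBuf *ibo_half = GPU_indexbuf_create_subrange(ibo, 0, (tri_len / 2) * 3);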
diff --git a/source/blender/gpu/GPU_vertex_format.h b/source/blender/gpu/GPU_vertex_format.h
index 68608a98a79..dc60c52122c 100644
--- a/source/blender/gpu/GPU_vertex_format.h
+++ b/source/blender/gpu/GPU_vertex_format.h
@@ -31,7 +31,7 @@
#include "BLI_assert.h"
#define GPU_VERT_ATTR_MAX_LEN 16
-#define GPU_VERT_ATTR_MAX_NAMES 5
+#define GPU_VERT_ATTR_MAX_NAMES 6
#define GPU_VERT_ATTR_NAME_AVERAGE_LEN 11
#define GPU_VERT_ATTR_NAMES_BUF_LEN ((GPU_VERT_ATTR_NAME_AVERAGE_LEN + 1) * GPU_VERT_ATTR_MAX_LEN)
@@ -88,6 +88,8 @@ typedef struct GPUVertFormat {
uint packed : 1;
/** Current offset in names[]. */
uint name_offset : 8;
+ /** Store each attrib in one contiguous buffer region. */
+ uint deinterleaved : 1;
GPUVertAttr attrs[GPU_VERT_ATTR_MAX_LEN];
char names[GPU_VERT_ATTR_NAMES_BUF_LEN];
@@ -104,6 +106,8 @@ uint GPU_vertformat_attr_add(
GPUVertFormat *, const char *name, GPUVertCompType, uint comp_len, GPUVertFetchMode);
void GPU_vertformat_alias_add(GPUVertFormat *, const char *alias);
+void GPU_vertformat_deinterleave(GPUVertFormat *format);
+
int GPU_vertformat_attr_id_get(const GPUVertFormat *, const char *name);
BLI_INLINE const char *GPU_vertformat_attr_name_get(const GPUVertFormat *format,
@@ -122,7 +126,59 @@ typedef struct GPUPackedNormal {
int w : 2; /* 0 by default, can manually set to { -2, -1, 0, 1 } */
} GPUPackedNormal;
-GPUPackedNormal GPU_normal_convert_i10_v3(const float data[3]);
-GPUPackedNormal GPU_normal_convert_i10_s3(const short data[3]);
+/* OpenGL ES packs in a different order as desktop GL but component conversion is the same.
+ * Of the code here, only struct GPUPackedNormal needs to change. */
+
+#define SIGNED_INT_10_MAX 511
+#define SIGNED_INT_10_MIN -512
+
+BLI_INLINE int clampi(int x, int min_allowed, int max_allowed)
+{
+#if TRUST_NO_ONE
+ assert(min_allowed <= max_allowed);
+#endif
+ if (x < min_allowed) {
+ return min_allowed;
+ }
+ else if (x > max_allowed) {
+ return max_allowed;
+ }
+ else {
+ return x;
+ }
+}
+
+BLI_INLINE int gpu_convert_normalized_f32_to_i10(float x)
+{
+ int qx = x * 511.0f;
+ return clampi(qx, SIGNED_INT_10_MIN, SIGNED_INT_10_MAX);
+}
+
+BLI_INLINE int gpu_convert_i16_to_i10(short x)
+{
+ /* 16-bit signed --> 10-bit signed */
+ /* TODO: round? */
+ return x >> 6;
+}
+
+BLI_INLINE GPUPackedNormal GPU_normal_convert_i10_v3(const float data[3])
+{
+ GPUPackedNormal n = {
+ .x = gpu_convert_normalized_f32_to_i10(data[0]),
+ .y = gpu_convert_normalized_f32_to_i10(data[1]),
+ .z = gpu_convert_normalized_f32_to_i10(data[2]),
+ };
+ return n;
+}
+
+BLI_INLINE GPUPackedNormal GPU_normal_convert_i10_s3(const short data[3])
+{
+ GPUPackedNormal n = {
+ .x = gpu_convert_i16_to_i10(data[0]),
+ .y = gpu_convert_i16_to_i10(data[1]),
+ .z = gpu_convert_i16_to_i10(data[2]),
+ };
+ return n;
+}
#endif /* __GPU_VERTEX_FORMAT_H__ */
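With the conversion helpers now BLI_INLINE in the header, packing a normal is a direct call at any call site. A short sketch using the API exactly as declared above:

    /* Pack a unit normal into 10-bit signed components for a
     * GPU_COMP_I10 attribute; inputs are expected in [-1.0, 1.0]. */
    const float normal[3] = {0.0f, 0.0f, 1.0f};
    GPUPackedNormal packed = GPU_normal_convert_i10_v3(normal);
    /* packed.x/y/z now hold values in [-512, 511]; z here is 511. */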
diff --git a/source/blender/gpu/intern/gpu_batch.c b/source/blender/gpu/intern/gpu_batch.c
index 11b487f7be4..583551e3e58 100644
--- a/source/blender/gpu/intern/gpu_batch.c
+++ b/source/blender/gpu/intern/gpu_batch.c
@@ -182,6 +182,25 @@ void GPU_batch_instbuf_set(GPUBatch *batch, GPUVertBuf *inst, bool own_vbo)
}
}
+void GPU_batch_elembuf_set(GPUBatch *batch, GPUIndexBuf *elem, bool own_ibo)
+{
+ BLI_assert(elem != NULL);
+ /* redo the bindings */
+ GPU_batch_vao_cache_clear(batch);
+
+ if (batch->elem != NULL && (batch->owns_flag & GPU_BATCH_OWNS_INDEX)) {
+ GPU_indexbuf_discard(batch->elem);
+ }
+ batch->elem = elem;
+
+ if (own_ibo) {
+ batch->owns_flag |= GPU_BATCH_OWNS_INDEX;
+ }
+ else {
+ batch->owns_flag &= ~GPU_BATCH_OWNS_INDEX;
+ }
+}
+
/* Returns the index of verts in the batch. */
int GPU_batch_vertbuf_add_ex(GPUBatch *batch, GPUVertBuf *verts, bool own_vbo)
{
@@ -362,13 +381,23 @@ static void create_bindings(GPUVertBuf *verts,
const GPUVertFormat *format = &verts->format;
const uint attr_len = format->attr_len;
- const uint stride = format->stride;
+ uint stride = format->stride;
+ uint offset = 0;
GPU_vertbuf_use(verts);
for (uint a_idx = 0; a_idx < attr_len; ++a_idx) {
const GPUVertAttr *a = &format->attrs[a_idx];
- const GLvoid *pointer = (const GLubyte *)0 + a->offset + v_first * stride;
+
+ if (format->deinterleaved) {
+ offset += ((a_idx == 0) ? 0 : format->attrs[a_idx - 1].sz) * verts->vertex_len;
+ stride = a->sz;
+ }
+ else {
+ offset = a->offset;
+ }
+
+ const GLvoid *pointer = (const GLubyte *)0 + offset + v_first * stride;
for (uint n_idx = 0; n_idx < a->name_len; ++n_idx) {
const char *name = GPU_vertformat_attr_name_get(format, a, n_idx);
@@ -419,8 +448,11 @@ static void create_bindings(GPUVertBuf *verts,
static void batch_update_program_bindings(GPUBatch *batch, uint v_first)
{
- for (int v = 0; v < GPU_BATCH_VBO_MAX_LEN && batch->verts[v] != NULL; ++v) {
- create_bindings(batch->verts[v], batch->interface, (batch->inst) ? 0 : v_first, false);
+ /* Reverse order so first vbos have more prevalence (in term of attrib override). */
+ for (int v = GPU_BATCH_VBO_MAX_LEN - 1; v > -1; --v) {
+ if (batch->verts[v] != NULL) {
+ create_bindings(batch->verts[v], batch->interface, (batch->inst) ? 0 : v_first, false);
+ }
}
if (batch->inst) {
create_bindings(batch->inst, batch->interface, v_first, true);
@@ -550,10 +582,10 @@ static void *elem_offset(const GPUIndexBuf *el, int v_first)
{
#if GPU_TRACK_INDEX_RANGE
if (el->index_type == GPU_INDEX_U16) {
- return (GLushort *)0 + v_first;
+ return (GLushort *)0 + v_first + el->index_start;
}
#endif
- return (GLuint *)0 + v_first;
+ return (GLuint *)0 + v_first + el->index_start;
}
/* Use when drawing with GPU_batch_draw_advanced */
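GPU_batch_elembuf_set() pairs naturally with subranges: a batch can be retargeted to a slice of a shared index buffer without touching its vertex buffers. A sketch (creation of batch and ibo_full elided; start and len are hypothetical):

    /* Point an existing batch at a subrange of a shared IBO. With
     * own_ibo = true the subrange struct is freed with the batch;
     * the source IBO's VRAM data stays shared. */
    GPUIndexBuf *ibo_range = GPU_indexbuf_create_subrange(ibo_full, start, len);
    GPU_batch_elembuf_set(batch, ibo_range, true);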
diff --git a/source/blender/gpu/intern/gpu_element.c b/source/blender/gpu/intern/gpu_element.c
index 50e7df96503..6c9331b4903 100644
--- a/source/blender/gpu/intern/gpu_element.c
+++ b/source/blender/gpu/intern/gpu_element.c
@@ -162,6 +162,100 @@ void GPU_indexbuf_add_line_adj_verts(
GPU_indexbuf_add_generic_vert(builder, v4);
}
+void GPU_indexbuf_set_point_vert(GPUIndexBufBuilder *builder, uint elem, uint v1)
+{
+ BLI_assert(builder->prim_type == GPU_PRIM_POINTS);
+ BLI_assert(elem < builder->max_index_len);
+ builder->data[elem++] = v1;
+ if (builder->index_len < elem) {
+ builder->index_len = elem;
+ }
+}
+
+void GPU_indexbuf_set_line_verts(GPUIndexBufBuilder *builder, uint elem, uint v1, uint v2)
+{
+ BLI_assert(builder->prim_type == GPU_PRIM_LINES);
+ BLI_assert(v1 != v2);
+ BLI_assert(v1 <= builder->max_allowed_index);
+ BLI_assert(v2 <= builder->max_allowed_index);
+ BLI_assert((elem + 1) * 2 <= builder->max_index_len);
+ uint idx = elem * 2;
+ builder->data[idx++] = v1;
+ builder->data[idx++] = v2;
+ if (builder->index_len < idx) {
+ builder->index_len = idx;
+ }
+}
+
+void GPU_indexbuf_set_tri_verts(GPUIndexBufBuilder *builder, uint elem, uint v1, uint v2, uint v3)
+{
+ BLI_assert(builder->prim_type == GPU_PRIM_TRIS);
+ BLI_assert(v1 != v2 && v2 != v3 && v3 != v1);
+ BLI_assert(v1 <= builder->max_allowed_index);
+ BLI_assert(v2 <= builder->max_allowed_index);
+ BLI_assert(v3 <= builder->max_allowed_index);
+ BLI_assert((elem + 1) * 3 <= builder->max_index_len);
+ uint idx = elem * 3;
+ builder->data[idx++] = v1;
+ builder->data[idx++] = v2;
+ builder->data[idx++] = v3;
+ if (builder->index_len < idx) {
+ builder->index_len = idx;
+ }
+}
+
+void GPU_indexbuf_set_point_restart(GPUIndexBufBuilder *builder, uint elem)
+{
+ BLI_assert(builder->prim_type == GPU_PRIM_POINTS);
+ BLI_assert(elem < builder->max_index_len);
+ builder->data[elem++] = RESTART_INDEX;
+ if (builder->index_len < elem) {
+ builder->index_len = elem;
+ }
+}
+
+void GPU_indexbuf_set_line_restart(GPUIndexBufBuilder *builder, uint elem)
+{
+ BLI_assert(builder->prim_type == GPU_PRIM_LINES);
+ BLI_assert((elem + 1) * 2 <= builder->max_index_len);
+ uint idx = elem * 2;
+ builder->data[idx++] = RESTART_INDEX;
+ builder->data[idx++] = RESTART_INDEX;
+ if (builder->index_len < idx) {
+ builder->index_len = idx;
+ }
+}
+
+void GPU_indexbuf_set_tri_restart(GPUIndexBufBuilder *builder, uint elem)
+{
+ BLI_assert(builder->prim_type == GPU_PRIM_TRIS);
+ BLI_assert((elem + 1) * 3 <= builder->max_index_len);
+ uint idx = elem * 3;
+ builder->data[idx++] = RESTART_INDEX;
+ builder->data[idx++] = RESTART_INDEX;
+ builder->data[idx++] = RESTART_INDEX;
+ if (builder->index_len < idx) {
+ builder->index_len = idx;
+ }
+}
+
+GPUIndexBuf *GPU_indexbuf_create_subrange(GPUIndexBuf *elem_src, uint start, uint length)
+{
+ GPUIndexBuf *elem = MEM_callocN(sizeof(GPUIndexBuf), "GPUIndexBuf");
+ BLI_assert(elem_src && !elem_src->is_subrange);
+ BLI_assert(start + length <= elem_src->index_len);
+#if GPU_TRACK_INDEX_RANGE
+ elem->index_type = elem_src->index_type;
+ elem->gl_index_type = elem_src->gl_index_type;
+ elem->base_index = elem_src->base_index;
+#endif
+ elem->is_subrange = true;
+ elem->src = elem_src;
+ elem->index_start = start;
+ elem->index_len = length;
+ return elem;
+}
+
#if GPU_TRACK_INDEX_RANGE
/* Everything remains 32 bit while building to keep things simple.
* Find min/max after, then convert to smallest index type possible. */
@@ -271,6 +365,10 @@ static void indexbuf_upload_data(GPUIndexBuf *elem)
void GPU_indexbuf_use(GPUIndexBuf *elem)
{
+ if (elem->is_subrange) {
+ GPU_indexbuf_use(elem->src);
+ return;
+ }
if (elem->ibo_id == 0) {
elem->ibo_id = GPU_buf_alloc();
}
@@ -285,7 +383,7 @@ void GPU_indexbuf_discard(GPUIndexBuf *elem)
if (elem->ibo_id) {
GPU_buf_free(elem->ibo_id);
}
- if (elem->data) {
+ if (!elem->is_subrange && elem->data) {
MEM_freeN(elem->data);
}
MEM_freeN(elem);
diff --git a/source/blender/gpu/intern/gpu_vertex_format.c b/source/blender/gpu/intern/gpu_vertex_format.c
index 493c6d3ec59..f672d350afa 100644
--- a/source/blender/gpu/intern/gpu_vertex_format.c
+++ b/source/blender/gpu/intern/gpu_vertex_format.c
@@ -218,6 +218,29 @@ int GPU_vertformat_attr_id_get(const GPUVertFormat *format, const char *name)
return -1;
}
+/* Make attribute layout non-interleaved.
+ * Warning! This does not change data layout!
+ * Use direct buffer access to fill the data.
+ * This is for advanced usage.
+ *
+ * Deinterleaved data means all attrib data for each attrib
+ * is stored continuously like this :
+ * 000011112222
+ * instead of :
+ * 012012012012
+ *
+ * Note this is per attrib deinterleaving, NOT per component.
+ * */
+void GPU_vertformat_deinterleave(GPUVertFormat *format)
+{
+ /* Ideally we should change the stride and offset here. This would allow
+ * us to use GPU_vertbuf_attr_set / GPU_vertbuf_attr_fill. But since
+ * we use only 11 bits for attr->offset this limits the size of the
+ * buffer considerably. So instead we do the conversion when creating
+ * bindings in create_bindings(). */
+ format->deinterleaved = true;
+}
+
uint padding(uint offset, uint alignment)
{
const uint mod = offset % alignment;
@@ -391,58 +414,3 @@ void GPU_vertformat_from_interface(GPUVertFormat *format, const GPUShaderInterfa
}
}
}
-
-/* OpenGL ES packs in a different order as desktop GL but component conversion is the same.
- * Of the code here, only struct GPUPackedNormal needs to change. */
-
-#define SIGNED_INT_10_MAX 511
-#define SIGNED_INT_10_MIN -512
-
-static int clampi(int x, int min_allowed, int max_allowed)
-{
-#if TRUST_NO_ONE
- assert(min_allowed <= max_allowed);
-#endif
- if (x < min_allowed) {
- return min_allowed;
- }
- else if (x > max_allowed) {
- return max_allowed;
- }
- else {
- return x;
- }
-}
-
-static int quantize(float x)
-{
- int qx = x * 511.0f;
- return clampi(qx, SIGNED_INT_10_MIN, SIGNED_INT_10_MAX);
-}
-
-static int convert_i16(short x)
-{
- /* 16-bit signed --> 10-bit signed */
- /* TODO: round? */
- return x >> 6;
-}
-
-GPUPackedNormal GPU_normal_convert_i10_v3(const float data[3])
-{
- GPUPackedNormal n = {
- .x = quantize(data[0]),
- .y = quantize(data[1]),
- .z = quantize(data[2]),
- };
- return n;
-}
-
-GPUPackedNormal GPU_normal_convert_i10_s3(const short data[3])
-{
- GPUPackedNormal n = {
- .x = convert_i16(data[0]),
- .y = convert_i16(data[1]),
- .z = convert_i16(data[2]),
- };
- return n;
-}
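As the comment in GPU_vertformat_deinterleave() warns, the flag only changes how create_bindings() computes offsets and strides; the caller must still write the data in per-attribute blocks. A sketch of declaring such a format (the direct buffer filling itself is left out):

    /* Two-attribute deinterleaved format: all "pos" data is stored
     * first, then all "nor" data (000011112222, not 012012012012). */
    GPUVertFormat format = {0};
    GPU_vertformat_attr_add(&format, "pos", GPU_COMP_F32, 3, GPU_FETCH_FLOAT);
    GPU_vertformat_attr_add(&format, "nor", GPU_COMP_I10, 4, GPU_FETCH_INT_TO_FLOAT_UNIT);
    GPU_vertformat_deinterleave(&format);
    /* GPU_vertbuf_attr_set() offsets would be wrong here; fill the
     * vertex buffer's memory directly instead. */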
diff --git a/source/blender/gpu/intern/gpu_vertex_format_private.h b/source/blender/gpu/intern/gpu_vertex_format_private.h
index a850d17a1dd..13459101669 100644
--- a/source/blender/gpu/intern/gpu_vertex_format_private.h
+++ b/source/blender/gpu/intern/gpu_vertex_format_private.h
@@ -27,6 +27,7 @@
#define __GPU_VERTEX_FORMAT_PRIVATE_H__
void VertexFormat_pack(GPUVertFormat *format);
+void VertexFormat_deinterleave(GPUVertFormat *format, uint vertex_len);
uint padding(uint offset, uint alignment);
uint vertex_buffer_size(const GPUVertFormat *format, uint vertex_len);
diff --git a/source/blender/gpu/shaders/gpu_shader_2D_edituvs_stretch_vert.glsl b/source/blender/gpu/shaders/gpu_shader_2D_edituvs_stretch_vert.glsl
index 810784e2fbc..0ce5504dfa8 100644
--- a/source/blender/gpu/shaders/gpu_shader_2D_edituvs_stretch_vert.glsl
+++ b/source/blender/gpu/shaders/gpu_shader_2D_edituvs_stretch_vert.glsl
@@ -8,7 +8,7 @@ in vec2 pos;
in float stretch;
#else
-in vec4 uv_adj;
+in vec2 uv_angles;
in float angle;
#endif
@@ -52,6 +52,11 @@ vec3 weight_to_rgb(float weight)
#define M_PI 3.1415926535897932
+vec2 angle_to_v2(float angle)
+{
+ return vec2(cos(angle), sin(angle));
+}
+
/* Adapted from BLI_math_vector.h */
float angle_normalized_v2v2(vec2 v1, vec2 v2)
{
@@ -69,7 +74,9 @@ void main()
gl_Position = ModelViewProjectionMatrix * vec4(pos, 0.0, 1.0);
#ifdef STRETCH_ANGLE
- float uv_angle = angle_normalized_v2v2(uv_adj.xy, uv_adj.zw) / M_PI;
+ vec2 v1 = angle_to_v2(uv_angles.x * M_PI);
+ vec2 v2 = angle_to_v2(uv_angles.y * M_PI);
+ float uv_angle = angle_normalized_v2v2(v1, v2) / M_PI;
float stretch = 1.0 - abs(uv_angle - angle);
stretch = stretch;
stretch = 1.0 - stretch * stretch;
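The shader change implies the CPU side now stores two edge angles normalized by M_PI rather than two pre-built 2D vectors. A hypothetical encoding sketch matching the angle_to_v2(uv_angles.x * M_PI) decode above:

    /* Encode two edge angles (in radians) into the "uv_angles"
     * attribute as M_PI-normalized floats; names are illustrative. */
    float uv_angles[2] = {angle_a / (float)M_PI, angle_b / (float)M_PI};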