git.blender.org/blender.git
author    Clément Foucault <foucault.clem@gmail.com>    2018-02-20 03:55:19 +0300
committer Clément Foucault <foucault.clem@gmail.com>    2018-02-21 17:28:26 +0300
commit    c5eba46d7f4ddfcdf372a3f4968e4d170ee0a002 (patch)
tree      f9a170c28cf42f9948f83003b8c8b693475db567 /source
parent    1b3f9ecd0d0bdf20de24f72d73517cc97d925a15 (diff)
Gawain: Refactor: VAOs caching AND use new VAOs manager.
A major bottleneck of the current implementation is the call to create_bindings() for basically every draw call. This is due to the VAO being tagged dirty when assigning a new shader to the Batch, defeating the purpose of the Batch (reuse it for drawing). Since managing hundreds of batches in the DrawManager and DrawCache did not seem fun enough to me, I preferred rewriting the batches themselves.

--- Batch changes ---

For this to happen I needed to make instancing part of the Batch rather than another batch supplied at draw time. The Gwn_VertBuffers are copied from the batch to be instanced, and a new Gwn_VertBuf is supplied for the instancing attribs. This means a VAO can be generated and cached for this instancing case.

A Batch can be rendered with instancing, without instancing attribs and without the need for a new VAO, by using GWN_batch_draw_range_ex with the force_instance parameter set to true.

--- Draw manager changes ---

The downside of this approach is that we must track the validity of the instanced batch (the original one). For this the only way (I could think of) is to set a callback for when the batch is getting freed. This means a bit of refactoring in the DrawManager, with the separation of batching and instancing Batches.

--- VAO cache ---

Each VAO is generated for a given ShaderInterface. This means we can keep it alive as long as the shader interface lives. If a ShaderInterface is discarded, it needs to destroy every VAO associated with it. Otherwise, a new ShaderInterface with the same address could be generated and reuse the same VAO with incorrect bindings.

The VAO cache itself uses a mix of a static array of VAOs and a dynamic array, used when there is not enough space in the static one. This hybrid approach is a bit more performant than the dynamic array alone. The array will not resize down, but empty entries will be filled up again. It's unlikely we get a buffer overflow from this; resizing could be done on the next allocation if needed.

--- Results ---

Using cached VAOs means we no longer query each vertex attrib, for each VBO, for each draw call, every redraw! In a CPU-limited test scene (10000 cubes in the Clay engine), CPU drawing time drops from ~20ms to ~13ms.

The only area that is not caching VAOs is the instancing from particles (see comment in DRW_shgroup_instance_batch).
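To make the VAO-cache idea concrete, here is a minimal sketch of a per-batch cache keyed by shader interface, with a small static part plus a dynamic overflow as described above. VaoCache, VAO_STATIC_LEN, the key/handle fields and vao_cache_lookup are illustrative assumptions, not the actual Gawain structures.

/* Minimal sketch of the hybrid VAO cache; illustrative only. */
#define VAO_STATIC_LEN 3

typedef struct VaoCache {
	/* Static part: covers the common case without any allocation. */
	const void  *static_keys[VAO_STATIC_LEN];   /* Gwn_ShaderInterface pointers used as keys. */
	unsigned int static_vaos[VAO_STATIC_LEN];
	/* Dynamic overflow, grown only when the static part is full. Never resized down. */
	unsigned int dyn_len;
	const void **dyn_keys;
	unsigned int *dyn_vaos;
} VaoCache;

/* Return the cached VAO for this interface, or 0 if the caller must create
 * (and then insert) a new one. Keys are raw interface addresses, so a freed
 * interface must clear its entries before the address can be reused. */
static unsigned int vao_cache_lookup(const VaoCache *cache, const void *interface)
{
	for (int i = 0; i < VAO_STATIC_LEN; i++) {
		if (cache->static_keys[i] == interface) {
			return cache->static_vaos[i];
		}
	}
	for (unsigned int i = 0; i < cache->dyn_len; i++) {
		if (cache->dyn_keys[i] == interface) {
			return cache->dyn_vaos[i];
		}
	}
	return 0;
}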
Diffstat (limited to 'source')
-rw-r--r--  source/blender/draw/intern/DRW_render.h          |   2
-rw-r--r--  source/blender/draw/intern/draw_instance_data.c  | 238
-rw-r--r--  source/blender/draw/intern/draw_instance_data.h  |   9
-rw-r--r--  source/blender/draw/intern/draw_manager.c        |  98
-rw-r--r--  source/blender/draw/modes/object_mode.c          |  14
5 files changed, 247 insertions(+), 114 deletions(-)
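Before the diff itself, a hedged usage sketch of the two request functions this patch introduces. The function names and signatures are taken from draw_instance_data.h below; everything else (example_setup and the local variables) is a hypothetical stand-in, not code from the patch.

/* Illustrative only: how a shading group would acquire its buffers with the new API. */
static void example_setup(DRWInstanceDataList *idatalist, DRWShadingGroup *shgroup,
                          Gwn_VertFormat *format, Gwn_Batch *geom)
{
	Gwn_Batch *inst_batch, *point_batch;
	Gwn_VertBuf *inst_vbo, *point_vbo;

	/* Instancing: duplicate 'geom', attach a per-instance attrib VBO, and get a
	 * batch whose VAOs can be cached (invalidated via the batch free callback). */
	DRW_instancing_buffer_request(idatalist, format, geom, shgroup,
	                              &inst_batch, &inst_vbo);

	/* Batching: aggregate dynamic point geometry into a single reusable batch. */
	DRW_batching_buffer_request(idatalist, format, GWN_PRIM_POINTS, shgroup,
	                            &point_batch, &point_vbo);

	/* At the end of the frame, DRW_instance_buffer_finish() uploads the vertex
	 * data and reclaims unused buffers. */
}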
diff --git a/source/blender/draw/intern/DRW_render.h b/source/blender/draw/intern/DRW_render.h
index f62b224b094..82ba2922dd0 100644
--- a/source/blender/draw/intern/DRW_render.h
+++ b/source/blender/draw/intern/DRW_render.h
@@ -341,7 +341,7 @@ typedef void (DRWCallGenerateFn)(
void (*draw_fn)(DRWShadingGroup *shgroup, struct Gwn_Batch *geom),
void *user_data);
-void DRW_shgroup_instance_batch(DRWShadingGroup *shgroup, struct Gwn_Batch *instances);
+void DRW_shgroup_instance_batch(DRWShadingGroup *shgroup, struct Gwn_Batch *batch);
void DRW_shgroup_free(struct DRWShadingGroup *shgroup);
void DRW_shgroup_call_add(DRWShadingGroup *shgroup, struct Gwn_Batch *geom, float (*obmat)[4]);
diff --git a/source/blender/draw/intern/draw_instance_data.c b/source/blender/draw/intern/draw_instance_data.c
index c2aae8e33ae..bfff1a2f546 100644
--- a/source/blender/draw/intern/draw_instance_data.c
+++ b/source/blender/draw/intern/draw_instance_data.c
@@ -42,12 +42,29 @@
#define BUFFER_CHUNK_SIZE 32
#define BUFFER_VERTS_CHUNK 32
-typedef struct DRWInstanceBuffer {
+typedef struct DRWBatchingBuffer {
struct DRWShadingGroup *shgroup; /* Link back to the owning shGroup. Also tells if it's used */
Gwn_VertFormat *format; /* Identifier. */
Gwn_VertBuf *vert; /* Gwn_VertBuf contained in the Gwn_Batch. */
Gwn_Batch *batch; /* Gwn_Batch containing the Gwn_VertBuf. */
-} DRWInstanceBuffer;
+} DRWBatchingBuffer;
+
+typedef struct DRWInstancingBuffer {
+ struct DRWShadingGroup *shgroup; /* Link back to the owning shGroup. Also tells if it's used */
+ Gwn_VertFormat *format; /* Identifier. */
+ Gwn_Batch *instance; /* Identifier. */
+ Gwn_VertBuf *vert; /* Gwn_VertBuf contained in the Gwn_Batch. */
+ Gwn_Batch *batch; /* Gwn_Batch containing the Gwn_VertBuf. */
+} DRWInstancingBuffer;
+
+typedef struct DRWInstanceChunk {
+ size_t cursor; /* Offset to the next instance data. */
+ size_t alloc_size; /* Number of DRWBatchingBuffer/Batches alloc'd in ibufs/btchs. */
+ union {
+ DRWBatchingBuffer *bbufs;
+ DRWInstancingBuffer *ibufs;
+ };
+} DRWInstanceChunk;
struct DRWInstanceData {
struct DRWInstanceData *next;
@@ -60,19 +77,19 @@ struct DRWInstanceData {
};
struct DRWInstanceDataList {
+ struct DRWInstanceDataList *next, *prev;
/* Linked lists for all possible data pool size */
/* Not entirely sure if we should separate them in the first place.
* This is done to minimize the reattribution misses. */
DRWInstanceData *idata_head[MAX_INSTANCE_DATA_SIZE];
DRWInstanceData *idata_tail[MAX_INSTANCE_DATA_SIZE];
- struct {
- size_t cursor; /* Offset to the next instance data. */
- size_t alloc_size; /* Number of DRWInstanceBuffer alloc'd in ibufs. */
- DRWInstanceBuffer *ibufs;
- } ibuffers;
+ DRWInstanceChunk instancing;
+ DRWInstanceChunk batching;
};
+static ListBase g_idatalists = {NULL, NULL};
+
/* -------------------------------------------------------------------- */
/** \name Instance Buffer Management
@@ -87,89 +104,174 @@ struct DRWInstanceDataList {
* that would be too slow]).
**/
-void DRW_instance_buffer_request(
- DRWInstanceDataList *idatalist, Gwn_VertFormat *format, struct DRWShadingGroup *shgroup,
- Gwn_Batch **r_batch, Gwn_VertBuf **r_vert, Gwn_PrimType type)
+static void instance_batch_free(Gwn_Batch *batch, void *UNUSED(user_data))
{
- BLI_assert(format);
+ /* Free all batches that have the same key before they are reused. */
+ /* TODO: Make it thread safe! Batch freeing can happen from another thread. */
+ /* XXX we need to iterate over all idatalists unless we make some smart
+ * data structure to store the locations to update. */
+ for (DRWInstanceDataList *idatalist = g_idatalists.first; idatalist; idatalist = idatalist->next) {
+ DRWInstancingBuffer *ibuf = idatalist->instancing.ibufs;
+ for (int i = 0; i < idatalist->instancing.alloc_size; i++, ibuf++) {
+ if (ibuf->instance == batch) {
+ BLI_assert(ibuf->shgroup == NULL); /* Make sure it has no other users. */
+ GWN_VERTBUF_DISCARD_SAFE(ibuf->vert);
+ GWN_BATCH_DISCARD_SAFE(ibuf->batch);
+ /* Tag as non alloced. */
+ ibuf->format = NULL;
+ }
+ }
+ }
+}
- DRWInstanceBuffer *ibuf = idatalist->ibuffers.ibufs;
- int first_non_alloced = -1;
+void DRW_batching_buffer_request(
+ DRWInstanceDataList *idatalist, Gwn_VertFormat *format, Gwn_PrimType type, struct DRWShadingGroup *shgroup,
+ Gwn_Batch **r_batch, Gwn_VertBuf **r_vert)
+{
+ DRWInstanceChunk *chunk = &idatalist->batching;
+ DRWBatchingBuffer *bbuf = idatalist->batching.bbufs;
+ BLI_assert(format);
+ /* Search for an unused batch. */
+ for (int i = 0; i < idatalist->batching.alloc_size; i++, bbuf++) {
+ if (bbuf->shgroup == NULL) {
+ if (bbuf->format == format) {
+ bbuf->shgroup = shgroup;
+ *r_batch = bbuf->batch;
+ *r_vert = bbuf->vert;
+ return;
+ }
+ }
+ }
+ int new_id = 0; /* Find insertion point. */
+ for (; new_id < chunk->alloc_size; ++new_id) {
+ if (chunk->bbufs[new_id].format == NULL)
+ break;
+ }
+ /* If there is no batch left. Allocate more. */
+ if (new_id == chunk->alloc_size) {
+ new_id = chunk->alloc_size;
+ chunk->alloc_size += BUFFER_CHUNK_SIZE;
+ chunk->bbufs = MEM_reallocN(chunk->bbufs, chunk->alloc_size * sizeof(DRWBatchingBuffer));
+ memset(chunk->bbufs + new_id, 0, sizeof(DRWBatchingBuffer) * BUFFER_CHUNK_SIZE);
+ }
+ /* Create the batch. */
+ bbuf = chunk->bbufs + new_id;
+ bbuf->vert = *r_vert = GWN_vertbuf_create_dynamic_with_format(format);
+ bbuf->batch = *r_batch = GWN_batch_create_ex(type, bbuf->vert, NULL, 0);
+ bbuf->format = format;
+ bbuf->shgroup = shgroup;
+ GWN_vertbuf_data_alloc(*r_vert, BUFFER_VERTS_CHUNK);
+}
+void DRW_instancing_buffer_request(
+ DRWInstanceDataList *idatalist, Gwn_VertFormat *format, Gwn_Batch *instance, struct DRWShadingGroup *shgroup,
+ Gwn_Batch **r_batch, Gwn_VertBuf **r_vert)
+{
+ DRWInstanceChunk *chunk = &idatalist->instancing;
+ DRWInstancingBuffer *ibuf = idatalist->instancing.ibufs;
+ BLI_assert(format);
/* Search for an unused batch. */
- for (int i = 0; i < idatalist->ibuffers.alloc_size; i++, ibuf++) {
+ for (int i = 0; i < idatalist->instancing.alloc_size; i++, ibuf++) {
if (ibuf->shgroup == NULL) {
if (ibuf->format == format) {
- ibuf->shgroup = shgroup;
- *r_batch = ibuf->batch;
- *r_vert = ibuf->vert;
- return;
- }
- else if (ibuf->format == NULL && first_non_alloced == -1) {
- first_non_alloced = i;
+ if (ibuf->instance == instance) {
+ ibuf->shgroup = shgroup;
+ *r_batch = ibuf->batch;
+ *r_vert = ibuf->vert;
+ return;
+ }
}
}
}
-
- if (first_non_alloced == -1) {
- /* There is no batch left. Allocate more. */
- first_non_alloced = idatalist->ibuffers.alloc_size;
- idatalist->ibuffers.alloc_size += BUFFER_CHUNK_SIZE;
- idatalist->ibuffers.ibufs = MEM_reallocN(idatalist->ibuffers.ibufs,
- idatalist->ibuffers.alloc_size * sizeof(DRWInstanceBuffer));
- /* Clear new part of the memory. */
- memset(idatalist->ibuffers.ibufs + first_non_alloced, 0, sizeof(DRWInstanceBuffer) * BUFFER_CHUNK_SIZE);
+ int new_id = 0; /* Find insertion point. */
+ for (; new_id < chunk->alloc_size; ++new_id) {
+ if (chunk->ibufs[new_id].format == NULL)
+ break;
+ }
+ /* If there is no batch left. Allocate more. */
+ if (new_id == chunk->alloc_size) {
+ new_id = chunk->alloc_size;
+ chunk->alloc_size += BUFFER_CHUNK_SIZE;
+ chunk->ibufs = MEM_reallocN(chunk->ibufs, chunk->alloc_size * sizeof(DRWInstancingBuffer));
+ memset(chunk->ibufs + new_id, 0, sizeof(DRWInstancingBuffer) * BUFFER_CHUNK_SIZE);
}
-
/* Create the batch. */
- ibuf = idatalist->ibuffers.ibufs + first_non_alloced;
+ ibuf = chunk->ibufs + new_id;
ibuf->vert = *r_vert = GWN_vertbuf_create_dynamic_with_format(format);
- ibuf->batch = *r_batch = GWN_batch_create_ex(type, ibuf->vert, NULL, GWN_BATCH_OWNS_VBO);
+ ibuf->batch = *r_batch = GWN_batch_duplicate(instance);
ibuf->format = format;
ibuf->shgroup = shgroup;
-
+ ibuf->instance = instance;
GWN_vertbuf_data_alloc(*r_vert, BUFFER_VERTS_CHUNK);
+ GWN_batch_instbuf_set(ibuf->batch, ibuf->vert, false);
+ /* Make sure to free this ibuf if the instance batch gets free. */
+ GWN_batch_callback_free_set(instance, &instance_batch_free, NULL);
}
void DRW_instance_buffer_finish(DRWInstanceDataList *idatalist)
{
- DRWInstanceBuffer *ibuf = idatalist->ibuffers.ibufs;
- size_t minimum_alloc_size = 1; /* Avoid 0 size realloc. */
+ size_t realloc_size = 1; /* Avoid 0 size realloc. */
+ /* Resize down buffers in use and send data to GPU & free unused buffers. */
+ DRWInstanceChunk *batching = &idatalist->batching;
+ DRWBatchingBuffer *bbuf = batching->bbufs;
+ for (int i = 0; i < batching->alloc_size; i++, bbuf++) {
+ if (bbuf->shgroup != NULL) {
+ realloc_size = i + 1;
+ unsigned int vert_ct = DRW_shgroup_get_instance_count(bbuf->shgroup);
+ vert_ct += (vert_ct == 0) ? 1 : 0; /* Do not realloc to 0 size buffer */
+ if (vert_ct + BUFFER_VERTS_CHUNK <= bbuf->vert->vertex_ct) {
+ unsigned int size = vert_ct + BUFFER_VERTS_CHUNK - 1;
+ size = size - size % BUFFER_VERTS_CHUNK;
+ GWN_vertbuf_data_resize(bbuf->vert, size);
+ }
+ GWN_vertbuf_use(bbuf->vert); /* Send data. */
+ bbuf->shgroup = NULL; /* Set as non used for the next round. */
+ }
+ else {
+ GWN_VERTBUF_DISCARD_SAFE(bbuf->vert);
+ GWN_BATCH_DISCARD_SAFE(bbuf->batch);
+ bbuf->format = NULL; /* Tag as non alloced. */
+ }
+ }
+ /* Rounding up to nearest chunk size. */
+ realloc_size += BUFFER_CHUNK_SIZE - 1;
+ realloc_size -= realloc_size % BUFFER_CHUNK_SIZE;
+ /* Resize down if necessary. */
+ if (realloc_size < batching->alloc_size) {
+ batching->alloc_size = realloc_size;
+ batching->ibufs = MEM_reallocN(batching->ibufs, realloc_size * sizeof(DRWBatchingBuffer));
+ }
+ realloc_size = 1;
/* Resize down buffers in use and send data to GPU & free unused buffers. */
- for (int i = 0; i < idatalist->ibuffers.alloc_size; i++, ibuf++) {
+ DRWInstanceChunk *instancing = &idatalist->instancing;
+ DRWInstancingBuffer *ibuf = instancing->ibufs;
+ for (int i = 0; i < instancing->alloc_size; i++, ibuf++) {
if (ibuf->shgroup != NULL) {
- minimum_alloc_size = i + 1;
+ realloc_size = i + 1;
unsigned int vert_ct = DRW_shgroup_get_instance_count(ibuf->shgroup);
- /* Do not realloc to 0 size buffer */
- vert_ct += (vert_ct == 0) ? 1 : 0;
- /* Resize buffer to reclame space. */
+ vert_ct += (vert_ct == 0) ? 1 : 0; /* Do not realloc to 0 size buffer */
if (vert_ct + BUFFER_VERTS_CHUNK <= ibuf->vert->vertex_ct) {
unsigned int size = vert_ct + BUFFER_VERTS_CHUNK - 1;
size = size - size % BUFFER_VERTS_CHUNK;
GWN_vertbuf_data_resize(ibuf->vert, size);
}
- /* Send data. */
- GWN_vertbuf_use(ibuf->vert);
- /* Set as non used for the next round. */
- ibuf->shgroup = NULL;
+ GWN_vertbuf_use(ibuf->vert); /* Send data. */
+ ibuf->shgroup = NULL; /* Set as non used for the next round. */
}
else {
+ GWN_VERTBUF_DISCARD_SAFE(ibuf->vert);
GWN_BATCH_DISCARD_SAFE(ibuf->batch);
- /* Tag as non alloced. */
- ibuf->format = NULL;
+ ibuf->format = NULL; /* Tag as non alloced. */
}
}
-
- /* Resize down the handle buffer (ibuffers). */
/* Rounding up to nearest chunk size. */
- minimum_alloc_size += BUFFER_CHUNK_SIZE - 1;
- minimum_alloc_size -= minimum_alloc_size % BUFFER_CHUNK_SIZE;
+ realloc_size += BUFFER_CHUNK_SIZE - 1;
+ realloc_size -= realloc_size % BUFFER_CHUNK_SIZE;
/* Resize down if necessary. */
- if (minimum_alloc_size < idatalist->ibuffers.alloc_size) {
- idatalist->ibuffers.alloc_size = minimum_alloc_size;
- idatalist->ibuffers.ibufs = MEM_reallocN(idatalist->ibuffers.ibufs,
- minimum_alloc_size * sizeof(DRWInstanceBuffer));
+ if (realloc_size < instancing->alloc_size) {
+ instancing->alloc_size = realloc_size;
+ instancing->ibufs = MEM_reallocN(instancing->ibufs, realloc_size * sizeof(DRWInstancingBuffer));
}
}
@@ -183,7 +285,7 @@ void DRW_instance_buffer_finish(DRWInstanceDataList *idatalist)
static DRWInstanceData *drw_instance_data_create(
DRWInstanceDataList *idatalist, unsigned int attrib_size, unsigned int instance_group)
{
- DRWInstanceData *idata = MEM_mallocN(sizeof(DRWInstanceData), "DRWInstanceData");
+ DRWInstanceData *idata = MEM_callocN(sizeof(DRWInstanceData), "DRWInstanceData");
idata->next = NULL;
idata->used = true;
idata->data_size = attrib_size;
@@ -263,15 +365,18 @@ DRWInstanceData *DRW_instance_data_request(
DRWInstanceDataList *DRW_instance_data_list_create(void)
{
DRWInstanceDataList *idatalist = MEM_callocN(sizeof(DRWInstanceDataList), "DRWInstanceDataList");
- idatalist->ibuffers.ibufs = MEM_callocN(sizeof(DRWInstanceBuffer) * BUFFER_CHUNK_SIZE, "DRWInstanceBuffers");
- idatalist->ibuffers.alloc_size = BUFFER_CHUNK_SIZE;
+ idatalist->batching.bbufs = MEM_callocN(sizeof(DRWBatchingBuffer) * BUFFER_CHUNK_SIZE, "DRWBatchingBuffers");
+ idatalist->batching.alloc_size = BUFFER_CHUNK_SIZE;
+ idatalist->instancing.ibufs = MEM_callocN(sizeof(DRWInstancingBuffer) * BUFFER_CHUNK_SIZE, "DRWInstancingBuffers");
+ idatalist->instancing.alloc_size = BUFFER_CHUNK_SIZE;
+
+ BLI_addtail(&g_idatalists, idatalist);
return idatalist;
}
void DRW_instance_data_list_free(DRWInstanceDataList *idatalist)
{
- DRWInstanceBuffer *ibuf = idatalist->ibuffers.ibufs;
DRWInstanceData *idata, *next_idata;
for (int i = 0; i < MAX_INSTANCE_DATA_SIZE; ++i) {
@@ -284,10 +389,21 @@ void DRW_instance_data_list_free(DRWInstanceDataList *idatalist)
idatalist->idata_tail[i] = NULL;
}
- for (int i = 0; i < idatalist->ibuffers.alloc_size; i++, ibuf++) {
+ DRWBatchingBuffer *bbuf = idatalist->batching.bbufs;
+ for (int i = 0; i < idatalist->batching.alloc_size; i++, bbuf++) {
+ GWN_VERTBUF_DISCARD_SAFE(bbuf->vert);
+ GWN_BATCH_DISCARD_SAFE(bbuf->batch);
+ }
+ MEM_freeN(idatalist->batching.bbufs);
+
+ DRWInstancingBuffer *ibuf = idatalist->instancing.ibufs;
+ for (int i = 0; i < idatalist->instancing.alloc_size; i++, ibuf++) {
+ GWN_VERTBUF_DISCARD_SAFE(ibuf->vert);
GWN_BATCH_DISCARD_SAFE(ibuf->batch);
}
- MEM_freeN(idatalist->ibuffers.ibufs);
+ MEM_freeN(idatalist->instancing.ibufs);
+
+ BLI_remlink(&g_idatalists, idatalist);
}
void DRW_instance_data_list_reset(DRWInstanceDataList *idatalist)
diff --git a/source/blender/draw/intern/draw_instance_data.h b/source/blender/draw/intern/draw_instance_data.h
index a7a66c9baff..3b0f7839277 100644
--- a/source/blender/draw/intern/draw_instance_data.h
+++ b/source/blender/draw/intern/draw_instance_data.h
@@ -43,9 +43,12 @@ void *DRW_instance_data_get(DRWInstanceData *idata);
DRWInstanceData *DRW_instance_data_request(
DRWInstanceDataList *idatalist, unsigned int attrib_size, unsigned int instance_group);
-void DRW_instance_buffer_request(
- DRWInstanceDataList *idatalist, Gwn_VertFormat *format, struct DRWShadingGroup *shgroup,
- Gwn_Batch **r_batch, Gwn_VertBuf **r_vert, Gwn_PrimType type);
+void DRW_batching_buffer_request(
+ DRWInstanceDataList *idatalist, Gwn_VertFormat *format, Gwn_PrimType type, struct DRWShadingGroup *shgroup,
+ Gwn_Batch **r_batch, Gwn_VertBuf **r_vert);
+void DRW_instancing_buffer_request(
+ DRWInstanceDataList *idatalist, Gwn_VertFormat *format, Gwn_Batch *instance, struct DRWShadingGroup *shgroup,
+ Gwn_Batch **r_batch, Gwn_VertBuf **r_vert);
/* Upload all instance data to the GPU as soon as possible. */
void DRW_instance_buffer_finish(DRWInstanceDataList *idatalist);
diff --git a/source/blender/draw/intern/draw_manager.c b/source/blender/draw/intern/draw_manager.c
index a3a59efc799..5299fa04e4e 100644
--- a/source/blender/draw/intern/draw_manager.c
+++ b/source/blender/draw/intern/draw_manager.c
@@ -665,6 +665,24 @@ static void drw_interface_init(DRWInterface *interface, GPUShader *shader)
}
static void drw_interface_instance_init(
+ DRWShadingGroup *shgroup, GPUShader *shader, Gwn_Batch *batch, Gwn_VertFormat *format)
+{
+ DRWInterface *interface = &shgroup->interface;
+ drw_interface_init(interface, shader);
+
+#ifndef NDEBUG
+ interface->attribs_count = (format != NULL) ? format->attrib_ct : 0;
+#endif
+ BLI_assert(shgroup->type == DRW_SHG_INSTANCE);
+ BLI_assert(shgroup->instance_geom != NULL);
+
+ if (format != NULL) {
+ DRW_instancing_buffer_request(DST.idatalist, format, batch, shgroup,
+ &shgroup->instancing_geom, &interface->instance_vbo);
+ }
+}
+
+static void drw_interface_batching_init(
DRWShadingGroup *shgroup, GPUShader *shader, Gwn_VertFormat *format)
{
DRWInterface *interface = &shgroup->interface;
@@ -673,36 +691,19 @@ static void drw_interface_instance_init(
#ifndef NDEBUG
interface->attribs_count = (format != NULL) ? format->attrib_ct : 0;
#endif
+ BLI_assert(format != NULL);
Gwn_PrimType type;
- Gwn_Batch **r_batch = NULL;
switch (shgroup->type) {
- case DRW_SHG_INSTANCE:
- r_batch = &shgroup->instancing_geom;
- type = GWN_PRIM_POINTS;
- break;
- case DRW_SHG_POINT_BATCH:
- r_batch = &shgroup->batch_geom;
- type = GWN_PRIM_POINTS;
- break;
- case DRW_SHG_LINE_BATCH:
- r_batch = &shgroup->batch_geom;
- type = GWN_PRIM_LINES;
- break;
- case DRW_SHG_TRIANGLE_BATCH:
- r_batch = &shgroup->batch_geom;
- type = GWN_PRIM_TRIS;
- break;
+ case DRW_SHG_POINT_BATCH: type = GWN_PRIM_POINTS; break;
+ case DRW_SHG_LINE_BATCH: type = GWN_PRIM_LINES; break;
+ case DRW_SHG_TRIANGLE_BATCH: type = GWN_PRIM_TRIS; break;
default:
BLI_assert(0);
}
- if (format != NULL) {
- DRW_instance_buffer_request(DST.idatalist, format, shgroup, r_batch, &interface->instance_vbo, type);
- }
- else {
- *r_batch = NULL;
- }
+ DRW_batching_buffer_request(DST.idatalist, format, type, shgroup,
+ &shgroup->batch_geom, &interface->instance_vbo);
}
static void drw_interface_uniform(DRWShadingGroup *shgroup, const char *name,
@@ -882,7 +883,7 @@ DRWShadingGroup *DRW_shgroup_material_instance_create(
shgroup->type = DRW_SHG_INSTANCE;
shgroup->instance_geom = geom;
shgroup->instance_data = ob->data;
- drw_interface_instance_init(shgroup, GPU_pass_shader(gpupass), format);
+ drw_interface_instance_init(shgroup, GPU_pass_shader(gpupass), geom, format);
drw_shgroup_material_inputs(shgroup, material, gpupass);
}
@@ -890,7 +891,7 @@ DRWShadingGroup *DRW_shgroup_material_instance_create(
}
DRWShadingGroup *DRW_shgroup_material_empty_tri_batch_create(
- struct GPUMaterial *material, DRWPass *pass, int size)
+ struct GPUMaterial *material, DRWPass *pass, int tri_count)
{
#ifdef USE_GPU_SELECT
BLI_assert((G.f & G_PICKSEL) == 0);
@@ -899,10 +900,10 @@ DRWShadingGroup *DRW_shgroup_material_empty_tri_batch_create(
DRWShadingGroup *shgroup = drw_shgroup_material_create_ex(gpupass, pass);
if (shgroup) {
- shgroup->type = DRW_SHG_TRIANGLE_BATCH;
- shgroup->interface.instance_count = size * 3;
- /* Calling drw_interface_init will cause it to GWN_batch_draw_procedural. */
+ /* Calling drw_interface_init will cause it to call GWN_draw_primitive(). */
drw_interface_init(&shgroup->interface, GPU_pass_shader(gpupass));
+ shgroup->type = DRW_SHG_TRIANGLE_BATCH;
+ shgroup->interface.instance_count = tri_count * 3;
drw_shgroup_material_inputs(shgroup, material, gpupass);
}
@@ -923,7 +924,7 @@ DRWShadingGroup *DRW_shgroup_instance_create(
shgroup->type = DRW_SHG_INSTANCE;
shgroup->instance_geom = geom;
- drw_interface_instance_init(shgroup, shader, format);
+ drw_interface_instance_init(shgroup, shader, geom, format);
return shgroup;
}
@@ -937,7 +938,7 @@ DRWShadingGroup *DRW_shgroup_point_batch_create(struct GPUShader *shader, DRWPas
DRWShadingGroup *shgroup = drw_shgroup_create_ex(shader, pass);
shgroup->type = DRW_SHG_POINT_BATCH;
- drw_interface_instance_init(shgroup, shader, g_pos_format);
+ drw_interface_batching_init(shgroup, shader, g_pos_format);
return shgroup;
}
@@ -949,7 +950,7 @@ DRWShadingGroup *DRW_shgroup_line_batch_create(struct GPUShader *shader, DRWPass
DRWShadingGroup *shgroup = drw_shgroup_create_ex(shader, pass);
shgroup->type = DRW_SHG_LINE_BATCH;
- drw_interface_instance_init(shgroup, shader, g_pos_format);
+ drw_interface_batching_init(shgroup, shader, g_pos_format);
return shgroup;
}
@@ -957,18 +958,18 @@ DRWShadingGroup *DRW_shgroup_line_batch_create(struct GPUShader *shader, DRWPass
/* Very special batch. Use this if you position
* your vertices with the vertex shader
* and dont need any VBO attrib */
-DRWShadingGroup *DRW_shgroup_empty_tri_batch_create(struct GPUShader *shader, DRWPass *pass, int size)
+DRWShadingGroup *DRW_shgroup_empty_tri_batch_create(struct GPUShader *shader, DRWPass *pass, int tri_count)
{
#ifdef USE_GPU_SELECT
BLI_assert((G.f & G_PICKSEL) == 0);
#endif
DRWShadingGroup *shgroup = drw_shgroup_create_ex(shader, pass);
- /* Calling drw_interface_init will cause it to GWN_batch_draw_procedural. */
+ /* Calling drw_interface_init will cause it to call GWN_draw_primitive(). */
drw_interface_init(&shgroup->interface, shader);
shgroup->type = DRW_SHG_TRIANGLE_BATCH;
- shgroup->interface.instance_count = size * 3;
+ shgroup->interface.instance_count = tri_count * 3;
return shgroup;
}
@@ -991,13 +992,19 @@ void DRW_shgroup_free(struct DRWShadingGroup *UNUSED(shgroup))
} ((void)0)
/* Specify an external batch instead of adding each attrib one by one. */
-void DRW_shgroup_instance_batch(DRWShadingGroup *shgroup, struct Gwn_Batch *instances)
+void DRW_shgroup_instance_batch(DRWShadingGroup *shgroup, struct Gwn_Batch *batch)
{
BLI_assert(shgroup->type == DRW_SHG_INSTANCE);
- BLI_assert(shgroup->instancing_geom == NULL);
+ BLI_assert(shgroup->interface.instance_count == 0);
+ /* You cannot use external instancing batch without a dummy format. */
+ BLI_assert(shgroup->instancing_geom != NULL);
shgroup->type = DRW_SHG_INSTANCE_EXTERNAL;
- shgroup->instancing_geom = instances;
+ /* PERF : This destroys the vaos cache so better check if it's necessary. */
+ /* Note: This WILL break if batch->verts[0] is destroyed and reallocated
+ * at the same address. Bindings/VAOs would remain obsolete. */
+ //if (shgroup->instancing_geom->inst != batch->verts[0])
+ GWN_batch_instbuf_set(shgroup->instancing_geom, batch->verts[0], false);
#ifdef USE_GPU_SELECT
DRWCall *call = BLI_mempool_alloc(DST.vmempool->calls);
@@ -1140,8 +1147,6 @@ void DRW_shgroup_set_instance_count(DRWShadingGroup *shgroup, unsigned int count
unsigned int DRW_shgroup_get_instance_count(const DRWShadingGroup *shgroup)
{
- BLI_assert(shgroup->type != DRW_SHG_NORMAL && shgroup->type != DRW_SHG_INSTANCE_EXTERNAL);
-
return shgroup->interface.instance_count;
}
@@ -1765,18 +1770,17 @@ static void draw_geometry_execute_ex(
if (geom == NULL) {
BLI_assert(shgroup->type == DRW_SHG_TRIANGLE_BATCH); /* Add other type if needed. */
/* Shader is already bound. */
- Gwn_Batch *batch = DRW_cache_fullscreen_quad_get();
- GWN_batch_draw_procedural(batch, GWN_PRIM_TRIS, count);
+ GWN_draw_primitive(GWN_PRIM_TRIS, count);
return;
}
/* step 2 : bind vertex array & draw */
GWN_batch_program_set(geom, GPU_shader_get_program(shgroup->shader), GPU_shader_get_interface(shgroup->shader));
if (ELEM(shgroup->type, DRW_SHG_INSTANCE, DRW_SHG_INSTANCE_EXTERNAL)) {
- GWN_batch_draw_stupid_instanced(geom, shgroup->instancing_geom, start, count);
+ GWN_batch_draw_range_ex(geom, start, count, true);
}
else {
- GWN_batch_draw_stupid(geom, start, count);
+ GWN_batch_draw_range(geom, start, count);
}
/* XXX this just tells gawain we are done with the shader.
* This does not unbind the shader. */
@@ -1998,7 +2002,7 @@ static void draw_shgroup(DRWShadingGroup *shgroup, DRWState pass_state)
if (shgroup->type == DRW_SHG_INSTANCE_EXTERNAL) {
if (shgroup->instancing_geom != NULL) {
GPU_SELECT_LOAD_IF_PICKSEL((DRWCall *)shgroup->calls_first);
- draw_geometry(shgroup, shgroup->instance_geom, obmat, shgroup->instance_data, 0, 0);
+ draw_geometry(shgroup, shgroup->instancing_geom, obmat, shgroup->instance_data, 0, 0);
}
}
else {
@@ -2006,13 +2010,15 @@ static void draw_shgroup(DRWShadingGroup *shgroup, DRWState pass_state)
unsigned int count, start;
GPU_SELECT_LOAD_IF_PICKSEL_LIST(shgroup, start, count)
{
- draw_geometry(shgroup, shgroup->instance_geom, obmat, shgroup->instance_data, start, count);
+ draw_geometry(shgroup,
+ (shgroup->instancing_geom) ? shgroup->instancing_geom : shgroup->instance_geom,
+ obmat, shgroup->instance_data, start, count);
}
GPU_SELECT_LOAD_IF_PICKSEL_LIST_END(start, count)
}
}
}
- else {
+ else { /* DRW_SHG_***_BATCH */
/* Some dynamic batch can have no geom (no call to aggregate) */
if (shgroup->interface.instance_count > 0) {
unsigned int count, start;
diff --git a/source/blender/draw/modes/object_mode.c b/source/blender/draw/modes/object_mode.c
index 4a7a5d25b11..d6c0369b0a5 100644
--- a/source/blender/draw/modes/object_mode.c
+++ b/source/blender/draw/modes/object_mode.c
@@ -218,6 +218,7 @@ typedef struct OBJECT_PrivateData {
static struct {
/* Instance Data format */
+ struct Gwn_VertFormat *particle_format;
struct Gwn_VertFormat *empty_image_format;
struct Gwn_VertFormat *empty_image_wire_format;
@@ -537,6 +538,7 @@ static void OBJECT_engine_init(void *vedata)
static void OBJECT_engine_free(void)
{
+ MEM_SAFE_FREE(e_data.particle_format);
MEM_SAFE_FREE(e_data.empty_image_format);
MEM_SAFE_FREE(e_data.empty_image_wire_format);
DRW_SHADER_FREE_SAFE(e_data.outline_resolve_sh);
@@ -1752,6 +1754,9 @@ static void OBJECT_cache_populate_particles(Object *ob,
static float def_prim_col[3] = {0.5f, 0.5f, 0.5f};
static float def_sec_col[3] = {1.0f, 1.0f, 1.0f};
+ /* Dummy particle format for instancing to work. */
+ DRW_shgroup_instance_format(e_data.particle_format, {{"dummy", DRW_ATTRIB_FLOAT, 1}});
+
Material *ma = give_current_material(ob, part->omat);
switch (draw_as) {
@@ -1766,21 +1771,24 @@ static void OBJECT_cache_populate_particles(Object *ob,
break;
case PART_DRAW_CROSS:
shgrp = DRW_shgroup_instance_create(
- e_data.part_prim_sh, psl->particle, DRW_cache_particles_get_prim(PART_DRAW_CROSS), NULL);
+ e_data.part_prim_sh, psl->particle, DRW_cache_particles_get_prim(PART_DRAW_CROSS),
+ e_data.particle_format);
DRW_shgroup_uniform_texture(shgrp, "ramp", globals_ramp);
DRW_shgroup_uniform_vec3(shgrp, "color", ma ? &ma->r : def_prim_col, 1);
DRW_shgroup_uniform_int(shgrp, "screen_space", &screen_space[0], 1);
break;
case PART_DRAW_CIRC:
shgrp = DRW_shgroup_instance_create(
- e_data.part_prim_sh, psl->particle, DRW_cache_particles_get_prim(PART_DRAW_CIRC), NULL);
+ e_data.part_prim_sh, psl->particle, DRW_cache_particles_get_prim(PART_DRAW_CIRC),
+ e_data.particle_format);
DRW_shgroup_uniform_texture(shgrp, "ramp", globals_ramp);
DRW_shgroup_uniform_vec3(shgrp, "color", ma ? &ma->r : def_prim_col, 1);
DRW_shgroup_uniform_int(shgrp, "screen_space", &screen_space[1], 1);
break;
case PART_DRAW_AXIS:
shgrp = DRW_shgroup_instance_create(
- e_data.part_axis_sh, psl->particle, DRW_cache_particles_get_prim(PART_DRAW_AXIS), NULL);
+ e_data.part_axis_sh, psl->particle, DRW_cache_particles_get_prim(PART_DRAW_AXIS),
+ e_data.particle_format);
DRW_shgroup_uniform_int(shgrp, "screen_space", &screen_space[0], 1);
break;
default: