git.blender.org/blender.git
author    Clément Foucault <foucault.clem@gmail.com>  2018-02-20 03:55:19 +0300
committer Clément Foucault <foucault.clem@gmail.com>  2018-02-21 17:28:26 +0300
commit    c5eba46d7f4ddfcdf372a3f4968e4d170ee0a002 (patch)
tree      f9a170c28cf42f9948f83003b8c8b693475db567
parent    1b3f9ecd0d0bdf20de24f72d73517cc97d925a15 (diff)
Gawain: Refactor: VAOs caching AND use new VAOs manager.
A major bottleneck of the current implementation is the call to create_bindings() for basically every drawcall. This is due to the VAO being tagged dirty when assigning a new shader to the Batch, defeating the purpose of the Batch (reuse it for drawing). Since managing hundreds of batches in DrawManager and DrawCache seems not fun enough to me, I preferred rewriting the batches themselves.

--- Batch changes ---

For this to happen I needed to make instancing part of the Batch rather than another batch supplied at draw time. The Gwn_VertBufs are copied from the batch to be instanced, and a new Gwn_VertBuf is supplied for the instancing attribs. This means a VAO can be generated and cached for this instancing case. A Batch can be rendered with instancing, without instancing attribs, and without the need for a new VAO, using GWN_batch_draw_range_ex with the force_instance parameter set to true.

--- Draw manager changes ---

The downside of this approach is that we must track the validity of the instanced batch (the original one). The only way I could think of is to set a callback for when the batch is getting freed. This means a bit of refactoring in the DrawManager, separating batching and instancing Batches.

--- VAO cache ---

Each VAO is generated for a given ShaderInterface, so we can keep it alive as long as the shader interface lives. If a ShaderInterface is discarded, it needs to destroy every VAO associated with it; otherwise a new ShaderInterface allocated at the same address could reuse a stale VAO with incorrect bindings. The VAO cache itself is a mix of a static array of VAOs and a dynamic array used when there is not enough space in the static one. This hybrid approach is a bit more performant than the dynamic array alone. The array does not resize down, but empty entries are filled again, so a buffer overflow from this is unlikely; resizing could be done on the next allocation if needed.

--- Results ---

Using cached VAOs means we are no longer querying every vertex attrib of every VBO for every drawcall, on every redraw! In a CPU-limited test scene (10000 cubes in the Clay engine) CPU drawing time drops from ~20ms to ~13ms. The only area that still does not cache VAOs is instancing from particles (see the comment in DRW_shgroup_instance_batch).
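To make the Batch-side change concrete, here is a minimal, hypothetical caller sketch (the buffer, program and interface names are placeholders and this code is not part of the patch): instance attribs now live on the Batch itself, so the VAO built for a given shader interface can be cached and reused across redraws.

    /* Hypothetical caller code, illustration only. */
    Gwn_Batch* batch = GWN_batch_create(GWN_PRIM_TRIS, geom_vbo, geom_ibo);
    GWN_batch_instbuf_set(batch, instance_vbo, false);       /* per-instance attribs, not owned */
    GWN_batch_program_set(batch, program, shader_interface);  /* VAO cache lookup/creation happens here */
    GWN_batch_draw(batch);                                     /* instanced draw; count inferred from instance_vbo */
    GWN_batch_program_unset(batch);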
-rw-r--r--  intern/gawain/CMakeLists.txt                          2
-rw-r--r--  intern/gawain/gawain/gwn_batch.h                     57
-rw-r--r--  intern/gawain/gawain/gwn_buffer_id.h                  4
-rw-r--r--  intern/gawain/gawain/gwn_shader_interface.h           7
-rw-r--r--  intern/gawain/gawain/gwn_vertex_array_id.h            4
-rw-r--r--  intern/gawain/src/gwn_batch.c                       464
-rw-r--r--  intern/gawain/src/gwn_buffer_id.cpp                  45
-rw-r--r--  intern/gawain/src/gwn_immediate.c                     7
-rw-r--r--  intern/gawain/src/gwn_shader_interface.c             42
-rw-r--r--  intern/gawain/src/gwn_vertex_array_id.cpp             4
-rw-r--r--  source/blender/draw/intern/DRW_render.h               2
-rw-r--r--  source/blender/draw/intern/draw_instance_data.c     238
-rw-r--r--  source/blender/draw/intern/draw_instance_data.h       9
-rw-r--r--  source/blender/draw/intern/draw_manager.c            98
-rw-r--r--  source/blender/draw/modes/object_mode.c              14
15 files changed, 644 insertions(+), 353 deletions(-)
diff --git a/intern/gawain/CMakeLists.txt b/intern/gawain/CMakeLists.txt
index 9924daa8cd1..424b364ae8e 100644
--- a/intern/gawain/CMakeLists.txt
+++ b/intern/gawain/CMakeLists.txt
@@ -16,6 +16,7 @@ set(SRC
src/gwn_imm_util.c
src/gwn_primitive.c
src/gwn_shader_interface.c
+ src/gwn_vertex_array_id.cpp
src/gwn_vertex_buffer.c
src/gwn_vertex_format.c
@@ -30,6 +31,7 @@ set(SRC
gawain/gwn_primitive.h
gawain/gwn_primitive_private.h
gawain/gwn_shader_interface.h
+ gawain/gwn_vertex_array_id.h
gawain/gwn_vertex_buffer.h
gawain/gwn_vertex_format.h
gawain/gwn_vertex_format_private.h
diff --git a/intern/gawain/gawain/gwn_batch.h b/intern/gawain/gawain/gwn_batch.h
index 94cd893f09e..c676cfef119 100644
--- a/intern/gawain/gawain/gwn_batch.h
+++ b/intern/gawain/gawain/gwn_batch.h
@@ -23,34 +23,61 @@ typedef enum {
} Gwn_BatchPhase;
#define GWN_BATCH_VBO_MAX_LEN 3
+#define GWN_BATCH_VAO_STATIC_LEN 3
+#define GWN_BATCH_VAO_DYN_ALLOC_COUNT 16
typedef struct Gwn_Batch {
// geometry
Gwn_VertBuf* verts[GWN_BATCH_VBO_MAX_LEN]; // verts[0] is required, others can be NULL
+ Gwn_VertBuf* inst; // instance attribs
Gwn_IndexBuf* elem; // NULL if element list not needed
- Gwn_PrimType prim_type;
GLenum gl_prim_type;
+ // cached values (avoid dereferencing later)
+ GLuint vao_id;
+ GLuint program;
+ const struct Gwn_ShaderInterface* interface;
+
// book-keeping
- GLuint vao_id; // remembers all geometry state (vertex attrib bindings & element buffer)
+ unsigned owns_flag;
+ struct Gwn_Context *context; // used to free all vaos. this implies all vaos were created under the same context.
Gwn_BatchPhase phase;
- bool program_dirty;
bool program_in_use;
- unsigned owns_flag;
- // state
- GLuint program;
- const Gwn_ShaderInterface* interface;
+ // Vao management: remembers all geometry state (vertex attrib bindings & element buffer)
+ // for each shader interface. Start with a static number of vaos and fallback to dynamic count
+ // if necessary. Once a batch goes dynamic it does not go back.
+ bool is_dynamic_vao_count;
+ union {
+ // Static handle count
+ struct {
+ const struct Gwn_ShaderInterface* interfaces[GWN_BATCH_VAO_STATIC_LEN];
+ GLuint vao_ids[GWN_BATCH_VAO_STATIC_LEN];
+ } static_vaos;
+ // Dynamic handle count
+ struct {
+ unsigned count;
+ const struct Gwn_ShaderInterface** interfaces;
+ GLuint* vao_ids;
+ } dynamic_vaos;
+ };
+
+ // XXX This is the only solution if we want to have some data structure using
+ // batches as key to identify nodes. We must destroy these nodes with this callback.
+ void (*free_callback)(struct Gwn_Batch*, void*);
+ void* callback_data;
} Gwn_Batch;
enum {
GWN_BATCH_OWNS_VBO = (1 << 0),
/* each vbo index gets bit-shifted */
+ GWN_BATCH_OWNS_INSTANCES = (1 << 30),
GWN_BATCH_OWNS_INDEX = (1 << 31),
};
Gwn_Batch* GWN_batch_create_ex(Gwn_PrimType, Gwn_VertBuf*, Gwn_IndexBuf*, unsigned owns_flag);
void GWN_batch_init_ex(Gwn_Batch*, Gwn_PrimType, Gwn_VertBuf*, Gwn_IndexBuf*, unsigned owns_flag);
+Gwn_Batch* GWN_batch_duplicate(Gwn_Batch* batch_src);
#define GWN_batch_create(prim, verts, elem) \
GWN_batch_create_ex(prim, verts, elem, 0)
@@ -59,11 +86,18 @@ void GWN_batch_init_ex(Gwn_Batch*, Gwn_PrimType, Gwn_VertBuf*, Gwn_IndexBuf*, un
void GWN_batch_discard(Gwn_Batch*); // verts & elem are not discarded
+void GWN_batch_callback_free_set(Gwn_Batch*, void (*callback)(Gwn_Batch*, void*), void*);
+
+void GWN_batch_instbuf_set(Gwn_Batch*, Gwn_VertBuf*, bool own_vbo); // Instancing
+
int GWN_batch_vertbuf_add_ex(Gwn_Batch*, Gwn_VertBuf*, bool own_vbo);
#define GWN_batch_vertbuf_add(batch, verts) \
GWN_batch_vertbuf_add_ex(batch, verts, false)
+// This is a private function
+void GWN_batch_remove_interface_ref(Gwn_Batch*, const Gwn_ShaderInterface*);
+
void GWN_batch_program_set(Gwn_Batch*, GLuint program, const Gwn_ShaderInterface*);
void GWN_batch_program_unset(Gwn_Batch*);
// Entire batch draws with one shader program, but can be redrawn later with another program.
@@ -84,11 +118,14 @@ void GWN_batch_uniform_4fv(Gwn_Batch*, const char* name, const float data[4]);
void GWN_batch_draw(Gwn_Batch*);
+// This does not bind/unbind shader and does not call gpuBindMatrices()
+void GWN_batch_draw_range_ex(Gwn_Batch*, int v_first, int v_count, bool force_instance);
-void GWN_batch_draw_stupid(Gwn_Batch*, int v_first, int v_count);
-void GWN_batch_draw_stupid_instanced(Gwn_Batch*, Gwn_Batch*, int instance_first, int instance_count);
-void GWN_batch_draw_procedural(Gwn_Batch*, Gwn_PrimType, int v_count);
+#define GWN_batch_draw_range(batch, first, count) \
+ GWN_batch_draw_range_ex(batch, first, count, false)
+// Does not even need batch
+void GWN_draw_primitive(Gwn_PrimType, int v_count);
#if 0 // future plans
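Editorial aside (simplified sketch, not part of the patch): the static/dynamic union above is queried roughly like this when a program is assigned to the batch; the real logic, including slot allocation and the fallback to the dynamic array, is in GWN_batch_program_set in gwn_batch.c below.

    /* Simplified lookup over the hybrid VAO cache, illustration only. */
    static GLuint batch_vao_lookup(const Gwn_Batch* batch, const Gwn_ShaderInterface* shaderface)
        {
        if (batch->is_dynamic_vao_count)
            {
            for (unsigned i = 0; i < batch->dynamic_vaos.count; ++i)
                if (batch->dynamic_vaos.interfaces[i] == shaderface)
                    return batch->dynamic_vaos.vao_ids[i]; /* hit: reuse existing bindings */
            }
        else
            {
            for (int i = 0; i < GWN_BATCH_VAO_STATIC_LEN; ++i)
                if (batch->static_vaos.interfaces[i] == shaderface)
                    return batch->static_vaos.vao_ids[i]; /* hit */
            }
        return 0; /* miss: allocate a VAO, store (interface, id), rebuild attrib bindings */
        }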
diff --git a/intern/gawain/gawain/gwn_buffer_id.h b/intern/gawain/gawain/gwn_buffer_id.h
index db5df99f526..6f51ca6905d 100644
--- a/intern/gawain/gawain/gwn_buffer_id.h
+++ b/intern/gawain/gawain/gwn_buffer_id.h
@@ -25,10 +25,6 @@ extern "C" {
GLuint GWN_buf_id_alloc(void);
void GWN_buf_id_free(GLuint buffer_id);
-GLuint GWN_vao_alloc(void);
-void GWN_vao_free(GLuint vao_id);
-
-
#ifdef __cplusplus
}
#endif
diff --git a/intern/gawain/gawain/gwn_shader_interface.h b/intern/gawain/gawain/gwn_shader_interface.h
index 345ad8d389b..3bca541d6e8 100644
--- a/intern/gawain/gawain/gwn_shader_interface.h
+++ b/intern/gawain/gawain/gwn_shader_interface.h
@@ -54,6 +54,7 @@ typedef struct Gwn_ShaderInput {
} Gwn_ShaderInput;
#define GWN_NUM_SHADERINTERFACE_BUCKETS 257
+#define GWN_SHADERINTERFACE_REF_ALLOC_COUNT 16
typedef struct Gwn_ShaderInterface {
GLint program;
@@ -63,6 +64,8 @@ typedef struct Gwn_ShaderInterface {
Gwn_ShaderInput* ubo_buckets[GWN_NUM_SHADERINTERFACE_BUCKETS];
Gwn_ShaderInput* builtin_uniforms[GWN_NUM_UNIFORMS];
char* name_buffer;
+ struct Gwn_Batch** batches; // references to batches using this interface
+ unsigned batches_ct;
} Gwn_ShaderInterface;
Gwn_ShaderInterface* GWN_shaderinterface_create(GLint program_id);
@@ -72,3 +75,7 @@ const Gwn_ShaderInput* GWN_shaderinterface_uniform(const Gwn_ShaderInterface*, c
const Gwn_ShaderInput* GWN_shaderinterface_uniform_builtin(const Gwn_ShaderInterface*, Gwn_UniformBuiltin);
const Gwn_ShaderInput* GWN_shaderinterface_ubo(const Gwn_ShaderInterface*, const char* name);
const Gwn_ShaderInput* GWN_shaderinterface_attr(const Gwn_ShaderInterface*, const char* name);
+
+// keep track of batches using this interface
+void GWN_shaderinterface_add_batch_ref(Gwn_ShaderInterface*, struct Gwn_Batch*);
+void GWN_shaderinterface_remove_batch_ref(Gwn_ShaderInterface*, struct Gwn_Batch*);
diff --git a/intern/gawain/gawain/gwn_vertex_array_id.h b/intern/gawain/gawain/gwn_vertex_array_id.h
index 6d2a059b9bd..1c093d428ce 100644
--- a/intern/gawain/gawain/gwn_vertex_array_id.h
+++ b/intern/gawain/gawain/gwn_vertex_array_id.h
@@ -26,8 +26,8 @@ extern "C" {
#include "gwn_context.h"
GLuint GWN_vao_default(void);
-GLuint GWN_vao_alloc_new(void);
-void GWN_vao_free_new(GLuint vao_id, Gwn_Context*);
+GLuint GWN_vao_alloc(void);
+void GWN_vao_free(GLuint vao_id, Gwn_Context*);
#ifdef __cplusplus
}
diff --git a/intern/gawain/src/gwn_batch.c b/intern/gawain/src/gwn_batch.c
index ec3f98e348c..098c547c662 100644
--- a/intern/gawain/src/gwn_batch.c
+++ b/intern/gawain/src/gwn_batch.c
@@ -11,12 +11,48 @@
#include "gwn_batch.h"
#include "gwn_buffer_id.h"
+#include "gwn_vertex_array_id.h"
#include "gwn_primitive_private.h"
#include <stdlib.h>
+#include <string.h>
// necessary functions from matrix API
extern void gpuBindMatrices(const Gwn_ShaderInterface* shaderface);
-extern bool gpuMatricesDirty(void); // how best to use this here?
+
+static void batch_update_program_bindings(Gwn_Batch* batch, unsigned int v_first);
+
+static void Batch_vao_cache_clear(Gwn_Batch* batch)
+ {
+ if (batch->is_dynamic_vao_count)
+ {
+ for (int i = 0; i < batch->dynamic_vaos.count; ++i)
+ {
+ if (batch->dynamic_vaos.vao_ids[i])
+ GWN_vao_free(batch->dynamic_vaos.vao_ids[i], batch->context);
+ if (batch->dynamic_vaos.interfaces[i])
+ GWN_shaderinterface_remove_batch_ref((Gwn_ShaderInterface *)batch->dynamic_vaos.interfaces[i], batch);
+ }
+ free(batch->dynamic_vaos.interfaces);
+ free(batch->dynamic_vaos.vao_ids);
+ }
+ else
+ {
+ for (int i = 0; i < GWN_BATCH_VAO_STATIC_LEN; ++i)
+ {
+ if (batch->static_vaos.vao_ids[i])
+ GWN_vao_free(batch->static_vaos.vao_ids[i], batch->context);
+ if (batch->static_vaos.interfaces[i])
+ GWN_shaderinterface_remove_batch_ref((Gwn_ShaderInterface *)batch->static_vaos.interfaces[i], batch);
+ }
+ }
+
+ batch->is_dynamic_vao_count = false;
+ for (int i = 0; i < GWN_BATCH_VAO_STATIC_LEN; ++i)
+ {
+ batch->static_vaos.vao_ids[i] = 0;
+ batch->static_vaos.interfaces[i] = NULL;
+ }
+ }
Gwn_Batch* GWN_batch_create_ex(
Gwn_PrimType prim_type, Gwn_VertBuf* verts, Gwn_IndexBuf* elem,
@@ -40,11 +76,25 @@ void GWN_batch_init_ex(
batch->verts[0] = verts;
for (int v = 1; v < GWN_BATCH_VBO_MAX_LEN; ++v)
batch->verts[v] = NULL;
+ batch->inst = NULL;
batch->elem = elem;
- batch->prim_type = prim_type;
batch->gl_prim_type = convert_prim_type_to_gl(prim_type);
batch->phase = GWN_BATCH_READY_TO_DRAW;
+ batch->is_dynamic_vao_count = false;
batch->owns_flag = owns_flag;
+ batch->free_callback = NULL;
+ }
+
+// This will share the VBOs with the new batch
+Gwn_Batch* GWN_batch_duplicate(Gwn_Batch* batch_src)
+ {
+ Gwn_Batch* batch = GWN_batch_create_ex(GWN_PRIM_POINTS, batch_src->verts[0], batch_src->elem, 0);
+
+ batch->gl_prim_type = batch_src->gl_prim_type;
+ for (int v = 1; v < GWN_BATCH_VBO_MAX_LEN; ++v)
+ batch->verts[v] = batch_src->verts[v];
+
+ return batch;
}
void GWN_batch_discard(Gwn_Batch* batch)
@@ -52,6 +102,9 @@ void GWN_batch_discard(Gwn_Batch* batch)
if (batch->owns_flag & GWN_BATCH_OWNS_INDEX)
GWN_indexbuf_discard(batch->elem);
+ if (batch->owns_flag & GWN_BATCH_OWNS_INSTANCES)
+ GWN_vertbuf_discard(batch->inst);
+
if ((batch->owns_flag & ~GWN_BATCH_OWNS_INDEX) != 0)
{
for (int v = 0; v < GWN_BATCH_VBO_MAX_LEN; ++v)
@@ -63,12 +116,39 @@ void GWN_batch_discard(Gwn_Batch* batch)
}
}
- if (batch->vao_id)
- GWN_vao_free(batch->vao_id);
+ Batch_vao_cache_clear(batch);
+
+ if (batch->free_callback)
+ batch->free_callback(batch, batch->callback_data);
free(batch);
}
+void GWN_batch_callback_free_set(Gwn_Batch* batch, void (*callback)(Gwn_Batch*, void*), void* user_data)
+ {
+ batch->free_callback = callback;
+ batch->callback_data = user_data;
+ }
+
+void GWN_batch_instbuf_set(Gwn_Batch* batch, Gwn_VertBuf* inst, bool own_vbo)
+ {
+#if TRUST_NO_ONE
+ assert(inst != NULL);
+#endif
+ // redo the bindings
+ Batch_vao_cache_clear(batch);
+
+ if (batch->inst != NULL && (batch->owns_flag & GWN_BATCH_OWNS_INSTANCES))
+ GWN_vertbuf_discard(batch->inst);
+
+ batch->inst = inst;
+
+ if (own_vbo)
+ batch->owns_flag |= GWN_BATCH_OWNS_INSTANCES;
+ else
+ batch->owns_flag &= ~GWN_BATCH_OWNS_INSTANCES;
+ }
+
int GWN_batch_vertbuf_add_ex(
Gwn_Batch* batch, Gwn_VertBuf* verts,
bool own_vbo)
@@ -100,12 +180,96 @@ int GWN_batch_vertbuf_add_ex(
void GWN_batch_program_set(Gwn_Batch* batch, GLuint program, const Gwn_ShaderInterface* shaderface)
{
#if TRUST_NO_ONE
- assert(glIsProgram(program));
+ assert(glIsProgram(shaderface->program));
+ assert(batch->program_in_use == 0);
#endif
+ batch->vao_id = 0;
batch->program = program;
batch->interface = shaderface;
- batch->program_dirty = true;
+
+
+ // Search through cache
+ if (batch->is_dynamic_vao_count)
+ {
+ for (int i = 0; i < batch->dynamic_vaos.count && batch->vao_id == 0; ++i)
+ if (batch->dynamic_vaos.interfaces[i] == shaderface)
+ batch->vao_id = batch->dynamic_vaos.vao_ids[i];
+ }
+ else
+ {
+ for (int i = 0; i < GWN_BATCH_VAO_STATIC_LEN && batch->vao_id == 0; ++i)
+ if (batch->static_vaos.interfaces[i] == shaderface)
+ batch->vao_id = batch->static_vaos.vao_ids[i];
+ }
+
+ if (batch->vao_id == 0)
+ {
+ if (batch->context == NULL)
+ batch->context = GWN_context_active_get();
+#if TRUST_NO_ONE && 0 // disabled until we use a separate single context for UI.
+ else // Make sure you are not trying to draw this batch in another context.
+ assert(batch->context == GWN_context_active_get());
+#endif
+ // Cache miss, time to add a new entry!
+ if (!batch->is_dynamic_vao_count)
+ {
+ int i; // find first unused slot
+ for (i = 0; i < GWN_BATCH_VAO_STATIC_LEN; ++i)
+ if (batch->static_vaos.vao_ids[i] == 0)
+ break;
+
+ if (i < GWN_BATCH_VAO_STATIC_LEN)
+ {
+ batch->static_vaos.interfaces[i] = shaderface;
+ batch->static_vaos.vao_ids[i] = batch->vao_id = GWN_vao_alloc();
+ }
+ else
+ {
+ // Not enough space, switch to dynamic.
+ batch->is_dynamic_vao_count = true;
+ // Erase previous entries, they will be added back if drawn again.
+ for (int j = 0; j < GWN_BATCH_VAO_STATIC_LEN; ++j)
+ {
+ GWN_shaderinterface_remove_batch_ref((Gwn_ShaderInterface*)batch->static_vaos.interfaces[j], batch);
+ GWN_vao_free(batch->static_vaos.vao_ids[j], batch->context);
+ }
+ // Init dynamic arrays and let the branch below set the values.
+ batch->dynamic_vaos.count = GWN_BATCH_VAO_DYN_ALLOC_COUNT;
+ batch->dynamic_vaos.interfaces = calloc(batch->dynamic_vaos.count, sizeof(Gwn_ShaderInterface*));
+ batch->dynamic_vaos.vao_ids = calloc(batch->dynamic_vaos.count, sizeof(GLuint));
+ }
+ }
+
+ if (batch->is_dynamic_vao_count)
+ {
+ int i; // find first unused slot
+ for (i = 0; i < batch->dynamic_vaos.count; ++i)
+ if (batch->dynamic_vaos.vao_ids[i] == 0)
+ break;
+
+ if (i == batch->dynamic_vaos.count)
+ {
+ // Not enough space, realloc the array.
+ i = batch->dynamic_vaos.count;
+ batch->dynamic_vaos.count += GWN_BATCH_VAO_DYN_ALLOC_COUNT;
+ batch->dynamic_vaos.interfaces = realloc(batch->dynamic_vaos.interfaces, sizeof(Gwn_ShaderInterface*) * batch->dynamic_vaos.count);
+ batch->dynamic_vaos.vao_ids = realloc(batch->dynamic_vaos.vao_ids, sizeof(GLuint) * batch->dynamic_vaos.count);
+ memset(batch->dynamic_vaos.interfaces + i, 0, sizeof(Gwn_ShaderInterface*) * GWN_BATCH_VAO_DYN_ALLOC_COUNT);
+ memset(batch->dynamic_vaos.vao_ids + i, 0, sizeof(GLuint) * GWN_BATCH_VAO_DYN_ALLOC_COUNT);
+ }
+
+ batch->dynamic_vaos.interfaces[i] = shaderface;
+ batch->dynamic_vaos.vao_ids[i] = batch->vao_id = GWN_vao_alloc();
+ }
+
+ GWN_shaderinterface_add_batch_ref((Gwn_ShaderInterface*)shaderface, batch);
+
+ // We just got a fresh VAO we need to initialize it.
+ glBindVertexArray(batch->vao_id);
+ batch_update_program_bindings(batch, 0);
+ glBindVertexArray(0);
+ }
GWN_batch_program_use_begin(batch); // hack! to make Batch_Uniform* simpler
}
@@ -118,94 +282,104 @@ void GWN_batch_program_unset(Gwn_Batch* batch)
batch->program_in_use = false;
}
-static void create_bindings(Gwn_Batch* batch, const Gwn_ShaderInterface* interface, unsigned int v_first, const bool use_instancing)
+void GWN_batch_remove_interface_ref(Gwn_Batch* batch, const Gwn_ShaderInterface* interface)
{
- for (int v = 0; v < GWN_BATCH_VBO_MAX_LEN; ++v)
+ if (batch->is_dynamic_vao_count)
+ {
+ for (int i = 0; i < batch->dynamic_vaos.count; ++i)
+ {
+ if (batch->dynamic_vaos.interfaces[i] == interface)
+ {
+ GWN_vao_free(batch->dynamic_vaos.vao_ids[i], batch->context);
+ batch->dynamic_vaos.vao_ids[i] = 0;
+ batch->dynamic_vaos.interfaces[i] = NULL;
+ break; // cannot have duplicates
+ }
+ }
+ }
+ else
{
- Gwn_VertBuf* verts = batch->verts[v];
- if (verts == NULL)
- break;
+ int i;
+ for (i = 0; i < GWN_BATCH_VAO_STATIC_LEN; ++i)
+ {
+ if (batch->static_vaos.interfaces[i] == interface)
+ {
+ GWN_vao_free(batch->static_vaos.vao_ids[i], batch->context);
+ batch->static_vaos.vao_ids[i] = 0;
+ batch->static_vaos.interfaces[i] = NULL;
+ break; // cannot have duplicates
+ }
+ }
+ }
+ }
- const Gwn_VertFormat* format = &verts->format;
+static void create_bindings(Gwn_VertBuf* verts, const Gwn_ShaderInterface* interface, unsigned int v_first, const bool use_instancing)
+ {
+ const Gwn_VertFormat* format = &verts->format;
- const unsigned attrib_ct = format->attrib_ct;
- const unsigned stride = format->stride;
+ const unsigned attrib_ct = format->attrib_ct;
+ const unsigned stride = format->stride;
- GWN_vertbuf_use(verts);
+ GWN_vertbuf_use(verts);
- for (unsigned a_idx = 0; a_idx < attrib_ct; ++a_idx)
- {
- const Gwn_VertAttr* a = format->attribs + a_idx;
+ for (unsigned a_idx = 0; a_idx < attrib_ct; ++a_idx)
+ {
+ const Gwn_VertAttr* a = format->attribs + a_idx;
- const GLvoid* pointer = (const GLubyte*)0 + a->offset + v_first * stride;
+ const GLvoid* pointer = (const GLubyte*)0 + a->offset + v_first * stride;
- for (unsigned n_idx = 0; n_idx < a->name_ct; ++n_idx)
- {
- const Gwn_ShaderInput* input = GWN_shaderinterface_attr(interface, a->name[n_idx]);
+ for (unsigned n_idx = 0; n_idx < a->name_ct; ++n_idx)
+ {
+ const Gwn_ShaderInput* input = GWN_shaderinterface_attr(interface, a->name[n_idx]);
- if (input == NULL) continue;
+ if (input == NULL) continue;
- if (a->comp_ct == 16 || a->comp_ct == 12 || a->comp_ct == 8)
- {
+ if (a->comp_ct == 16 || a->comp_ct == 12 || a->comp_ct == 8)
+ {
#if TRUST_NO_ONE
- assert(a->fetch_mode == GWN_FETCH_FLOAT);
- assert(a->gl_comp_type == GL_FLOAT);
+ assert(a->fetch_mode == GWN_FETCH_FLOAT);
+ assert(a->gl_comp_type == GL_FLOAT);
#endif
- for (int i = 0; i < a->comp_ct / 4; ++i)
- {
- glEnableVertexAttribArray(input->location + i);
- glVertexAttribDivisor(input->location + i, (use_instancing) ? 1 : 0);
- glVertexAttribPointer(input->location + i, 4, a->gl_comp_type, GL_FALSE, stride,
- (const GLubyte*)pointer + i * 16);
- }
+ for (int i = 0; i < a->comp_ct / 4; ++i)
+ {
+ glEnableVertexAttribArray(input->location + i);
+ glVertexAttribDivisor(input->location + i, (use_instancing) ? 1 : 0);
+ glVertexAttribPointer(input->location + i, 4, a->gl_comp_type, GL_FALSE, stride,
+ (const GLubyte*)pointer + i * 16);
}
- else
+ }
+ else
+ {
+ glEnableVertexAttribArray(input->location);
+ glVertexAttribDivisor(input->location, (use_instancing) ? 1 : 0);
+
+ switch (a->fetch_mode)
{
- glEnableVertexAttribArray(input->location);
- glVertexAttribDivisor(input->location, (use_instancing) ? 1 : 0);
-
- switch (a->fetch_mode)
- {
- case GWN_FETCH_FLOAT:
- case GWN_FETCH_INT_TO_FLOAT:
- glVertexAttribPointer(input->location, a->comp_ct, a->gl_comp_type, GL_FALSE, stride, pointer);
- break;
- case GWN_FETCH_INT_TO_FLOAT_UNIT:
- glVertexAttribPointer(input->location, a->comp_ct, a->gl_comp_type, GL_TRUE, stride, pointer);
- break;
- case GWN_FETCH_INT:
- glVertexAttribIPointer(input->location, a->comp_ct, a->gl_comp_type, stride, pointer);
- }
+ case GWN_FETCH_FLOAT:
+ case GWN_FETCH_INT_TO_FLOAT:
+ glVertexAttribPointer(input->location, a->comp_ct, a->gl_comp_type, GL_FALSE, stride, pointer);
+ break;
+ case GWN_FETCH_INT_TO_FLOAT_UNIT:
+ glVertexAttribPointer(input->location, a->comp_ct, a->gl_comp_type, GL_TRUE, stride, pointer);
+ break;
+ case GWN_FETCH_INT:
+ glVertexAttribIPointer(input->location, a->comp_ct, a->gl_comp_type, stride, pointer);
}
}
}
}
}
-static void Batch_update_program_bindings(Gwn_Batch* batch, unsigned int v_first)
+static void batch_update_program_bindings(Gwn_Batch* batch, unsigned int v_first)
{
- // disable all as a precaution
- // why are we not using prev_attrib_enabled_bits?? see immediate.c
- for (unsigned a_idx = 0; a_idx < GWN_VERT_ATTR_MAX_LEN; ++a_idx)
- glDisableVertexAttribArray(a_idx);
+ for (int v = 0; v < GWN_BATCH_VBO_MAX_LEN && batch->verts[v] != NULL; ++v)
+ create_bindings(batch->verts[v], batch->interface, (batch->inst) ? 0 : v_first, false);
- create_bindings(batch, batch->interface, v_first, false);
+ if (batch->inst)
+ create_bindings(batch->inst, batch->interface, v_first, true);
- batch->program_dirty = false;
- }
-
-static void Batch_update_program_bindings_instancing(Gwn_Batch* batch, Gwn_Batch* batch_instancing, unsigned int instance_first)
- {
- // disable all as a precaution
- // why are we not using prev_attrib_enabled_bits?? see immediate.c
- for (unsigned a_idx = 0; a_idx < GWN_VERT_ATTR_MAX_LEN; ++a_idx)
- glDisableVertexAttribArray(a_idx);
-
- create_bindings(batch, batch->interface, 0, false);
- if (batch_instancing)
- create_bindings(batch_instancing, batch->interface, instance_first, true);
-
- batch->program_dirty = false;
+ if (batch->elem)
+ GWN_indexbuf_use(batch->elem);
}
void GWN_batch_program_use_begin(Gwn_Batch* batch)
@@ -290,142 +464,86 @@ void GWN_batch_uniform_4fv(Gwn_Batch* batch, const char* name, const float data[
glUniform4fv(uniform->location, 1, data);
}
-static void Batch_prime(Gwn_Batch* batch)
- {
- batch->vao_id = GWN_vao_alloc();
- glBindVertexArray(batch->vao_id);
-
- for (int v = 0; v < GWN_BATCH_VBO_MAX_LEN; ++v)
- {
- if (batch->verts[v] == NULL)
- break;
- GWN_vertbuf_use(batch->verts[v]);
- }
-
- if (batch->elem)
- GWN_indexbuf_use(batch->elem);
-
- // vertex attribs and element list remain bound to this VAO
- }
-
void GWN_batch_draw(Gwn_Batch* batch)
{
#if TRUST_NO_ONE
assert(batch->phase == GWN_BATCH_READY_TO_DRAW);
- assert(glIsProgram(batch->program));
+ assert(batch->verts[0]->vbo_id != 0);
#endif
-
- if (batch->vao_id)
- glBindVertexArray(batch->vao_id);
- else
- Batch_prime(batch);
-
- if (batch->program_dirty)
- Batch_update_program_bindings(batch, 0);
-
GWN_batch_program_use_begin(batch);
+ gpuBindMatrices(batch->interface); // external call.
- gpuBindMatrices(batch->interface);
-
- if (batch->elem)
- {
- const Gwn_IndexBuf* el = batch->elem;
-
-#if GWN_TRACK_INDEX_RANGE
- if (el->base_index)
- glDrawRangeElementsBaseVertex(batch->gl_prim_type, el->min_index, el->max_index, el->index_ct, el->gl_index_type, 0, el->base_index);
- else
- glDrawRangeElements(batch->gl_prim_type, el->min_index, el->max_index, el->index_ct, el->gl_index_type, 0);
-#else
- glDrawElements(batch->gl_prim_type, el->index_ct, GL_UNSIGNED_INT, 0);
-#endif
- }
- else
- glDrawArrays(batch->gl_prim_type, 0, batch->verts[0]->vertex_ct);
+ GWN_batch_draw_range_ex(batch, 0, 0, false);
GWN_batch_program_use_end(batch);
- glBindVertexArray(0);
}
-void GWN_batch_draw_stupid(Gwn_Batch* batch, int v_first, int v_count)
+void GWN_batch_draw_range_ex(Gwn_Batch* batch, int v_first, int v_count, bool force_instance)
{
- if (batch->vao_id)
- glBindVertexArray(batch->vao_id);
- else
- Batch_prime(batch);
-
- if (batch->program_dirty)
- Batch_update_program_bindings(batch, v_first);
-
- // GWN_batch_program_use_begin(batch);
-
- //gpuBindMatrices(batch->program);
+#if TRUST_NO_ONE
+ assert(!(force_instance && (batch->inst == NULL)) || v_count > 0); // we cannot infer length if force_instance
+#endif
- // Infer lenght if vertex count is not given
- if (v_count == 0)
- v_count = (batch->elem) ? batch->elem->index_ct : batch->verts[0]->vertex_ct;
+ // If using offset drawing, use the default VAO and redo bindings.
+ if (v_first != 0)
+ {
+ glBindVertexArray(GWN_vao_default());
+ batch_update_program_bindings(batch, v_first);
+ }
+ else
+ glBindVertexArray(batch->vao_id);
- if (batch->elem)
+ if (force_instance || batch->inst)
{
- const Gwn_IndexBuf* el = batch->elem;
+ // Infer length if vertex count is not given
+ if (v_count == 0)
+ v_count = batch->inst->vertex_ct;
+
+ if (batch->elem)
+ {
+ const Gwn_IndexBuf* el = batch->elem;
#if GWN_TRACK_INDEX_RANGE
- if (el->base_index)
- glDrawRangeElementsBaseVertex(batch->gl_prim_type, el->min_index, el->max_index, v_count, el->gl_index_type, 0, el->base_index);
- else
- glDrawRangeElements(batch->gl_prim_type, el->min_index, el->max_index, v_count, el->gl_index_type, 0);
+ glDrawElementsInstancedBaseVertex(batch->gl_prim_type, el->index_ct, el->gl_index_type, 0, v_count, el->base_index);
#else
- glDrawElements(batch->gl_prim_type, v_count, GL_UNSIGNED_INT, 0);
+ glDrawElementsInstanced(batch->gl_prim_type, el->index_ct, GL_UNSIGNED_INT, 0, v_count);
#endif
+ }
+ else
+ glDrawArraysInstanced(batch->gl_prim_type, 0, batch->verts[0]->vertex_ct, v_count);
}
else
- glDrawArrays(batch->gl_prim_type, 0, v_count);
-
- // GWN_batch_program_use_end(batch);
- glBindVertexArray(0);
- }
-
-void GWN_batch_draw_stupid_instanced(Gwn_Batch* batch_instanced, Gwn_Batch* batch_instancing, int instance_first, int instance_count)
- {
-#if TRUST_NO_ONE
- // batch_instancing can be null if the number of instances is specified.
- assert(batch_instancing != NULL || instance_count != 0);
-#endif
- if (batch_instanced->vao_id)
- glBindVertexArray(batch_instanced->vao_id);
- else
- Batch_prime(batch_instanced);
-
- if (batch_instanced->program_dirty)
- Batch_update_program_bindings_instancing(batch_instanced, batch_instancing, instance_first);
-
- if (instance_count == 0)
- instance_count = batch_instancing->verts[0]->vertex_ct;
-
- if (batch_instanced->elem)
{
- const Gwn_IndexBuf* el = batch_instanced->elem;
+ // Infer length if vertex count is not given
+ if (v_count == 0)
+ v_count = (batch->elem) ? batch->elem->index_ct : batch->verts[0]->vertex_ct;
+
+ if (batch->elem)
+ {
+ const Gwn_IndexBuf* el = batch->elem;
#if GWN_TRACK_INDEX_RANGE
- glDrawElementsInstancedBaseVertex(batch_instanced->gl_prim_type, el->index_ct, el->gl_index_type, 0, instance_count, el->base_index);
+ if (el->base_index)
+ glDrawRangeElementsBaseVertex(batch->gl_prim_type, el->min_index, el->max_index, v_count, el->gl_index_type, 0, el->base_index);
+ else
+ glDrawRangeElements(batch->gl_prim_type, el->min_index, el->max_index, v_count, el->gl_index_type, 0);
#else
- glDrawElementsInstanced(batch_instanced->gl_prim_type, el->index_ct, GL_UNSIGNED_INT, 0, instance_count);
+ glDrawElements(batch->gl_prim_type, v_count, GL_UNSIGNED_INT, 0);
#endif
+ }
+ else
+ glDrawArrays(batch->gl_prim_type, 0, v_count);
}
- else
- glDrawArraysInstanced(batch_instanced->gl_prim_type, 0, batch_instanced->verts[0]->vertex_ct, instance_count);
+
glBindVertexArray(0);
}
// just draw some vertices and let shader place them where we want.
-void GWN_batch_draw_procedural(Gwn_Batch* batch, Gwn_PrimType prim_type, int v_count)
+void GWN_draw_primitive(Gwn_PrimType prim_type, int v_count)
{
// we cannot draw without vao ... annoying ...
- if (batch->vao_id)
- glBindVertexArray(batch->vao_id);
- else
- Batch_prime(batch);
+ glBindVertexArray(GWN_vao_default());
GLenum type = convert_prim_type_to_gl(prim_type);
glDrawArrays(type, 0, v_count);
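Usage note (editorial, hypothetical caller code): GWN_draw_primitive() replaces GWN_batch_draw_procedural() for the batch-less case; the caller binds a program first and the vertex shader generates the positions itself, e.g. for a fullscreen-triangle pass. GPU_shader_bind/GPU_shader_unbind are assumed here from Blender's GPU module.

    /* Illustration only. */
    GPU_shader_bind(fullscreen_shader);
    GWN_draw_primitive(GWN_PRIM_TRIS, 3); /* 3 vertices, no VBO attribs, uses the default VAO */
    GPU_shader_unbind();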
diff --git a/intern/gawain/src/gwn_buffer_id.cpp b/intern/gawain/src/gwn_buffer_id.cpp
index a93c3950d29..64bad855ca7 100644
--- a/intern/gawain/src/gwn_buffer_id.cpp
+++ b/intern/gawain/src/gwn_buffer_id.cpp
@@ -20,7 +20,6 @@
#endif
static std::vector<GLuint> orphaned_buffer_ids;
-static std::vector<GLuint> orphaned_vao_ids;
static std::mutex orphan_mutex;
@@ -36,10 +35,6 @@ static bool thread_is_main()
GLuint GWN_buf_id_alloc()
{
-#if TRUST_NO_ONE
- assert(thread_is_main());
-#endif
-
// delete orphaned IDs
orphan_mutex.lock();
if (!orphaned_buffer_ids.empty())
@@ -73,43 +68,3 @@ void GWN_buf_id_free(GLuint buffer_id)
orphan_mutex.unlock();
}
}
-
-GLuint GWN_vao_alloc()
- {
-#if TRUST_NO_ONE
- assert(thread_is_main());
-#endif
-
- // delete orphaned IDs
- orphan_mutex.lock();
- if (!orphaned_vao_ids.empty())
- {
- const auto orphaned_vao_ct = (unsigned)orphaned_vao_ids.size();
-#if ORPHAN_DEBUG
- printf("deleting %u orphaned VAO%s\n", orphaned_vao_ct, orphaned_vao_ct == 1 ? "" : "s");
-#endif
- glDeleteVertexArrays(orphaned_vao_ct, orphaned_vao_ids.data());
- orphaned_vao_ids.clear();
- }
- orphan_mutex.unlock();
-
- GLuint new_vao_id = 0;
- glGenVertexArrays(1, &new_vao_id);
- return new_vao_id;
- }
-
-void GWN_vao_free(GLuint vao_id)
- {
- if (thread_is_main())
- glDeleteVertexArrays(1, &vao_id);
- else
- {
- // add this ID to the orphaned list
- orphan_mutex.lock();
-#if ORPHAN_DEBUG
- printf("orphaning VAO %u\n", vao_id);
-#endif
- orphaned_vao_ids.emplace_back(vao_id);
- orphan_mutex.unlock();
- }
- }
diff --git a/intern/gawain/src/gwn_immediate.c b/intern/gawain/src/gwn_immediate.c
index 1c0776d1bbf..f063665b423 100644
--- a/intern/gawain/src/gwn_immediate.c
+++ b/intern/gawain/src/gwn_immediate.c
@@ -14,6 +14,7 @@
#include "gwn_attr_binding.h"
#include "gwn_attr_binding_private.h"
#include "gwn_vertex_format_private.h"
+#include "gwn_vertex_array_id.h"
#include "gwn_primitive_private.h"
#include <string.h>
@@ -27,6 +28,7 @@ typedef struct {
#if IMM_BATCH_COMBO
Gwn_Batch* batch;
#endif
+ Gwn_Context* context;
// current draw call
GLubyte* buffer_data;
@@ -86,8 +88,8 @@ void immActivate(void)
assert(imm.prim_type == GWN_PRIM_NONE); // make sure we're not between a Begin/End pair
assert(imm.vao_id == 0);
#endif
-
imm.vao_id = GWN_vao_alloc();
+ imm.context = GWN_context_active_get();
}
void immDeactivate(void)
@@ -97,8 +99,7 @@ void immDeactivate(void)
assert(imm.prim_type == GWN_PRIM_NONE); // make sure we're not between a Begin/End pair
assert(imm.vao_id != 0);
#endif
-
- GWN_vao_free(imm.vao_id);
+ GWN_vao_free(imm.vao_id, imm.context);
imm.vao_id = 0;
imm.prev_enabled_attrib_bits = 0;
}
diff --git a/intern/gawain/src/gwn_shader_interface.c b/intern/gawain/src/gwn_shader_interface.c
index 33821ae36e2..ef3e8f0f3fa 100644
--- a/intern/gawain/src/gwn_shader_interface.c
+++ b/intern/gawain/src/gwn_shader_interface.c
@@ -10,6 +10,7 @@
// the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/.
#include "gwn_shader_interface.h"
+#include "gwn_vertex_array_id.h"
#include <stdlib.h>
#include <stddef.h>
#include <string.h>
@@ -263,6 +264,10 @@ Gwn_ShaderInterface* GWN_shaderinterface_create(GLint program)
#endif
}
+ // Batches ref buffer
+ shaderface->batches_ct = GWN_SHADERINTERFACE_REF_ALLOC_COUNT;
+ shaderface->batches = calloc(shaderface->batches_ct, sizeof(Gwn_Batch*));
+
return shaderface;
}
@@ -274,6 +279,12 @@ void GWN_shaderinterface_discard(Gwn_ShaderInterface* shaderface)
buckets_free(shaderface->ubo_buckets);
// Free memory used by name_buffer.
free(shaderface->name_buffer);
+ // Remove this interface from all linked Batches vao cache.
+ for (int i = 0; i < shaderface->batches_ct; ++i)
+ if (shaderface->batches[i] != NULL)
+ GWN_batch_remove_interface_ref(shaderface->batches[i], shaderface);
+
+ free(shaderface->batches);
// Free memory used by shader interface by its self.
free(shaderface);
}
@@ -316,3 +327,34 @@ const Gwn_ShaderInput* GWN_shaderinterface_attr(const Gwn_ShaderInterface* shade
{
return buckets_lookup(shaderface->attrib_buckets, shaderface->name_buffer, name);
}
+
+void GWN_shaderinterface_add_batch_ref(Gwn_ShaderInterface* shaderface, Gwn_Batch* batch)
+ {
+ int i; // find first unused slot
+ for (i = 0; i < shaderface->batches_ct; ++i)
+ if (shaderface->batches[i] == NULL)
+ break;
+
+ if (i == shaderface->batches_ct)
+ {
+ // Not enough space, realloc the array.
+ i = shaderface->batches_ct;
+ shaderface->batches_ct += GWN_SHADERINTERFACE_REF_ALLOC_COUNT;
+ shaderface->batches = realloc(shaderface->batches, sizeof(Gwn_Batch*) * shaderface->batches_ct);
+ memset(shaderface->batches + i, 0, sizeof(Gwn_Batch*) * GWN_SHADERINTERFACE_REF_ALLOC_COUNT);
+ }
+
+ shaderface->batches[i] = batch;
+ }
+
+void GWN_shaderinterface_remove_batch_ref(Gwn_ShaderInterface* shaderface, Gwn_Batch* batch)
+ {
+ for (int i = 0; i < shaderface->batches_ct; ++i)
+ {
+ if (shaderface->batches[i] == batch)
+ {
+ shaderface->batches[i] = NULL;
+ break; // cannot have duplicates
+ }
+ }
+ }
diff --git a/intern/gawain/src/gwn_vertex_array_id.cpp b/intern/gawain/src/gwn_vertex_array_id.cpp
index 602c1c4919c..27010f03bc0 100644
--- a/intern/gawain/src/gwn_vertex_array_id.cpp
+++ b/intern/gawain/src/gwn_vertex_array_id.cpp
@@ -109,7 +109,7 @@ GLuint GWN_vao_default(void)
return active_ctx->default_vao;
}
-GLuint GWN_vao_alloc_new(void)
+GLuint GWN_vao_alloc(void)
{
#if TRUST_NO_ONE
assert(active_ctx); // need at least an active context
@@ -123,7 +123,7 @@ GLuint GWN_vao_alloc_new(void)
}
// this can be called from multiple thread
-void GWN_vao_free_new(GLuint vao_id, Gwn_Context* ctx)
+void GWN_vao_free(GLuint vao_id, Gwn_Context* ctx)
{
if (ctx == active_ctx)
glDeleteVertexArrays(1, &vao_id);
diff --git a/source/blender/draw/intern/DRW_render.h b/source/blender/draw/intern/DRW_render.h
index f62b224b094..82ba2922dd0 100644
--- a/source/blender/draw/intern/DRW_render.h
+++ b/source/blender/draw/intern/DRW_render.h
@@ -341,7 +341,7 @@ typedef void (DRWCallGenerateFn)(
void (*draw_fn)(DRWShadingGroup *shgroup, struct Gwn_Batch *geom),
void *user_data);
-void DRW_shgroup_instance_batch(DRWShadingGroup *shgroup, struct Gwn_Batch *instances);
+void DRW_shgroup_instance_batch(DRWShadingGroup *shgroup, struct Gwn_Batch *batch);
void DRW_shgroup_free(struct DRWShadingGroup *shgroup);
void DRW_shgroup_call_add(DRWShadingGroup *shgroup, struct Gwn_Batch *geom, float (*obmat)[4]);
diff --git a/source/blender/draw/intern/draw_instance_data.c b/source/blender/draw/intern/draw_instance_data.c
index c2aae8e33ae..bfff1a2f546 100644
--- a/source/blender/draw/intern/draw_instance_data.c
+++ b/source/blender/draw/intern/draw_instance_data.c
@@ -42,12 +42,29 @@
#define BUFFER_CHUNK_SIZE 32
#define BUFFER_VERTS_CHUNK 32
-typedef struct DRWInstanceBuffer {
+typedef struct DRWBatchingBuffer {
struct DRWShadingGroup *shgroup; /* Link back to the owning shGroup. Also tells if it's used */
Gwn_VertFormat *format; /* Identifier. */
Gwn_VertBuf *vert; /* Gwn_VertBuf contained in the Gwn_Batch. */
Gwn_Batch *batch; /* Gwn_Batch containing the Gwn_VertBuf. */
-} DRWInstanceBuffer;
+} DRWBatchingBuffer;
+
+typedef struct DRWInstancingBuffer {
+ struct DRWShadingGroup *shgroup; /* Link back to the owning shGroup. Also tells if it's used */
+ Gwn_VertFormat *format; /* Identifier. */
+ Gwn_Batch *instance; /* Identifier. */
+ Gwn_VertBuf *vert; /* Gwn_VertBuf contained in the Gwn_Batch. */
+ Gwn_Batch *batch; /* Gwn_Batch containing the Gwn_VertBuf. */
+} DRWInstancingBuffer;
+
+typedef struct DRWInstanceChunk {
+ size_t cursor; /* Offset to the next instance data. */
+ size_t alloc_size; /* Number of DRWBatchingBuffer/Batches alloc'd in ibufs/btchs. */
+ union {
+ DRWBatchingBuffer *bbufs;
+ DRWInstancingBuffer *ibufs;
+ };
+} DRWInstanceChunk;
struct DRWInstanceData {
struct DRWInstanceData *next;
@@ -60,19 +77,19 @@ struct DRWInstanceData {
};
struct DRWInstanceDataList {
+ struct DRWInstanceDataList *next, *prev;
/* Linked lists for all possible data pool size */
/* Not entirely sure if we should separate them in the first place.
* This is done to minimize the reattribution misses. */
DRWInstanceData *idata_head[MAX_INSTANCE_DATA_SIZE];
DRWInstanceData *idata_tail[MAX_INSTANCE_DATA_SIZE];
- struct {
- size_t cursor; /* Offset to the next instance data. */
- size_t alloc_size; /* Number of DRWInstanceBuffer alloc'd in ibufs. */
- DRWInstanceBuffer *ibufs;
- } ibuffers;
+ DRWInstanceChunk instancing;
+ DRWInstanceChunk batching;
};
+static ListBase g_idatalists = {NULL, NULL};
+
/* -------------------------------------------------------------------- */
/** \name Instance Buffer Management
@@ -87,89 +104,174 @@ struct DRWInstanceDataList {
* that would be too slow]).
**/
-void DRW_instance_buffer_request(
- DRWInstanceDataList *idatalist, Gwn_VertFormat *format, struct DRWShadingGroup *shgroup,
- Gwn_Batch **r_batch, Gwn_VertBuf **r_vert, Gwn_PrimType type)
+static void instance_batch_free(Gwn_Batch *batch, void *UNUSED(user_data))
{
- BLI_assert(format);
+ /* Free all batches that have the same key before they are reused. */
+ /* TODO: Make it thread safe! Batch freeing can happen from another thread. */
+ /* XXX we need to iterate over all idatalists unless we make some smart
+ * data structure to store the locations to update. */
+ for (DRWInstanceDataList *idatalist = g_idatalists.first; idatalist; idatalist = idatalist->next) {
+ DRWInstancingBuffer *ibuf = idatalist->instancing.ibufs;
+ for (int i = 0; i < idatalist->instancing.alloc_size; i++, ibuf++) {
+ if (ibuf->instance == batch) {
+ BLI_assert(ibuf->shgroup == NULL); /* Make sure it has no other users. */
+ GWN_VERTBUF_DISCARD_SAFE(ibuf->vert);
+ GWN_BATCH_DISCARD_SAFE(ibuf->batch);
+ /* Tag as non alloced. */
+ ibuf->format = NULL;
+ }
+ }
+ }
+}
- DRWInstanceBuffer *ibuf = idatalist->ibuffers.ibufs;
- int first_non_alloced = -1;
+void DRW_batching_buffer_request(
+ DRWInstanceDataList *idatalist, Gwn_VertFormat *format, Gwn_PrimType type, struct DRWShadingGroup *shgroup,
+ Gwn_Batch **r_batch, Gwn_VertBuf **r_vert)
+{
+ DRWInstanceChunk *chunk = &idatalist->batching;
+ DRWBatchingBuffer *bbuf = idatalist->batching.bbufs;
+ BLI_assert(format);
+ /* Search for an unused batch. */
+ for (int i = 0; i < idatalist->batching.alloc_size; i++, bbuf++) {
+ if (bbuf->shgroup == NULL) {
+ if (bbuf->format == format) {
+ bbuf->shgroup = shgroup;
+ *r_batch = bbuf->batch;
+ *r_vert = bbuf->vert;
+ return;
+ }
+ }
+ }
+ int new_id = 0; /* Find insertion point. */
+ for (; new_id < chunk->alloc_size; ++new_id) {
+ if (chunk->bbufs[new_id].format == NULL)
+ break;
+ }
+ /* If there is no batch left. Allocate more. */
+ if (new_id == chunk->alloc_size) {
+ new_id = chunk->alloc_size;
+ chunk->alloc_size += BUFFER_CHUNK_SIZE;
+ chunk->bbufs = MEM_reallocN(chunk->bbufs, chunk->alloc_size * sizeof(DRWBatchingBuffer));
+ memset(chunk->bbufs + new_id, 0, sizeof(DRWBatchingBuffer) * BUFFER_CHUNK_SIZE);
+ }
+ /* Create the batch. */
+ bbuf = chunk->bbufs + new_id;
+ bbuf->vert = *r_vert = GWN_vertbuf_create_dynamic_with_format(format);
+ bbuf->batch = *r_batch = GWN_batch_create_ex(type, bbuf->vert, NULL, 0);
+ bbuf->format = format;
+ bbuf->shgroup = shgroup;
+ GWN_vertbuf_data_alloc(*r_vert, BUFFER_VERTS_CHUNK);
+}
+void DRW_instancing_buffer_request(
+ DRWInstanceDataList *idatalist, Gwn_VertFormat *format, Gwn_Batch *instance, struct DRWShadingGroup *shgroup,
+ Gwn_Batch **r_batch, Gwn_VertBuf **r_vert)
+{
+ DRWInstanceChunk *chunk = &idatalist->instancing;
+ DRWInstancingBuffer *ibuf = idatalist->instancing.ibufs;
+ BLI_assert(format);
/* Search for an unused batch. */
- for (int i = 0; i < idatalist->ibuffers.alloc_size; i++, ibuf++) {
+ for (int i = 0; i < idatalist->instancing.alloc_size; i++, ibuf++) {
if (ibuf->shgroup == NULL) {
if (ibuf->format == format) {
- ibuf->shgroup = shgroup;
- *r_batch = ibuf->batch;
- *r_vert = ibuf->vert;
- return;
- }
- else if (ibuf->format == NULL && first_non_alloced == -1) {
- first_non_alloced = i;
+ if (ibuf->instance == instance) {
+ ibuf->shgroup = shgroup;
+ *r_batch = ibuf->batch;
+ *r_vert = ibuf->vert;
+ return;
+ }
}
}
}
-
- if (first_non_alloced == -1) {
- /* There is no batch left. Allocate more. */
- first_non_alloced = idatalist->ibuffers.alloc_size;
- idatalist->ibuffers.alloc_size += BUFFER_CHUNK_SIZE;
- idatalist->ibuffers.ibufs = MEM_reallocN(idatalist->ibuffers.ibufs,
- idatalist->ibuffers.alloc_size * sizeof(DRWInstanceBuffer));
- /* Clear new part of the memory. */
- memset(idatalist->ibuffers.ibufs + first_non_alloced, 0, sizeof(DRWInstanceBuffer) * BUFFER_CHUNK_SIZE);
+ int new_id = 0; /* Find insertion point. */
+ for (; new_id < chunk->alloc_size; ++new_id) {
+ if (chunk->ibufs[new_id].format == NULL)
+ break;
+ }
+ /* If there is no batch left. Allocate more. */
+ if (new_id == chunk->alloc_size) {
+ new_id = chunk->alloc_size;
+ chunk->alloc_size += BUFFER_CHUNK_SIZE;
+ chunk->ibufs = MEM_reallocN(chunk->ibufs, chunk->alloc_size * sizeof(DRWInstancingBuffer));
+ memset(chunk->ibufs + new_id, 0, sizeof(DRWInstancingBuffer) * BUFFER_CHUNK_SIZE);
}
-
/* Create the batch. */
- ibuf = idatalist->ibuffers.ibufs + first_non_alloced;
+ ibuf = chunk->ibufs + new_id;
ibuf->vert = *r_vert = GWN_vertbuf_create_dynamic_with_format(format);
- ibuf->batch = *r_batch = GWN_batch_create_ex(type, ibuf->vert, NULL, GWN_BATCH_OWNS_VBO);
+ ibuf->batch = *r_batch = GWN_batch_duplicate(instance);
ibuf->format = format;
ibuf->shgroup = shgroup;
-
+ ibuf->instance = instance;
GWN_vertbuf_data_alloc(*r_vert, BUFFER_VERTS_CHUNK);
+ GWN_batch_instbuf_set(ibuf->batch, ibuf->vert, false);
+ /* Make sure to free this ibuf if the instance batch gets free. */
+ GWN_batch_callback_free_set(instance, &instance_batch_free, NULL);
}
void DRW_instance_buffer_finish(DRWInstanceDataList *idatalist)
{
- DRWInstanceBuffer *ibuf = idatalist->ibuffers.ibufs;
- size_t minimum_alloc_size = 1; /* Avoid 0 size realloc. */
+ size_t realloc_size = 1; /* Avoid 0 size realloc. */
+ /* Resize down buffers in use and send data to GPU & free unused buffers. */
+ DRWInstanceChunk *batching = &idatalist->batching;
+ DRWBatchingBuffer *bbuf = batching->bbufs;
+ for (int i = 0; i < batching->alloc_size; i++, bbuf++) {
+ if (bbuf->shgroup != NULL) {
+ realloc_size = i + 1;
+ unsigned int vert_ct = DRW_shgroup_get_instance_count(bbuf->shgroup);
+ vert_ct += (vert_ct == 0) ? 1 : 0; /* Do not realloc to 0 size buffer */
+ if (vert_ct + BUFFER_VERTS_CHUNK <= bbuf->vert->vertex_ct) {
+ unsigned int size = vert_ct + BUFFER_VERTS_CHUNK - 1;
+ size = size - size % BUFFER_VERTS_CHUNK;
+ GWN_vertbuf_data_resize(bbuf->vert, size);
+ }
+ GWN_vertbuf_use(bbuf->vert); /* Send data. */
+ bbuf->shgroup = NULL; /* Set as non used for the next round. */
+ }
+ else {
+ GWN_VERTBUF_DISCARD_SAFE(bbuf->vert);
+ GWN_BATCH_DISCARD_SAFE(bbuf->batch);
+ bbuf->format = NULL; /* Tag as non alloced. */
+ }
+ }
+ /* Rounding up to nearest chunk size. */
+ realloc_size += BUFFER_CHUNK_SIZE - 1;
+ realloc_size -= realloc_size % BUFFER_CHUNK_SIZE;
+ /* Resize down if necessary. */
+ if (realloc_size < batching->alloc_size) {
+ batching->alloc_size = realloc_size;
+ batching->ibufs = MEM_reallocN(batching->ibufs, realloc_size * sizeof(DRWBatchingBuffer));
+ }
+ realloc_size = 1;
/* Resize down buffers in use and send data to GPU & free unused buffers. */
- for (int i = 0; i < idatalist->ibuffers.alloc_size; i++, ibuf++) {
+ DRWInstanceChunk *instancing = &idatalist->instancing;
+ DRWInstancingBuffer *ibuf = instancing->ibufs;
+ for (int i = 0; i < instancing->alloc_size; i++, ibuf++) {
if (ibuf->shgroup != NULL) {
- minimum_alloc_size = i + 1;
+ realloc_size = i + 1;
unsigned int vert_ct = DRW_shgroup_get_instance_count(ibuf->shgroup);
- /* Do not realloc to 0 size buffer */
- vert_ct += (vert_ct == 0) ? 1 : 0;
- /* Resize buffer to reclame space. */
+ vert_ct += (vert_ct == 0) ? 1 : 0; /* Do not realloc to 0 size buffer */
if (vert_ct + BUFFER_VERTS_CHUNK <= ibuf->vert->vertex_ct) {
unsigned int size = vert_ct + BUFFER_VERTS_CHUNK - 1;
size = size - size % BUFFER_VERTS_CHUNK;
GWN_vertbuf_data_resize(ibuf->vert, size);
}
- /* Send data. */
- GWN_vertbuf_use(ibuf->vert);
- /* Set as non used for the next round. */
- ibuf->shgroup = NULL;
+ GWN_vertbuf_use(ibuf->vert); /* Send data. */
+ ibuf->shgroup = NULL; /* Set as non used for the next round. */
}
else {
+ GWN_VERTBUF_DISCARD_SAFE(ibuf->vert);
GWN_BATCH_DISCARD_SAFE(ibuf->batch);
- /* Tag as non alloced. */
- ibuf->format = NULL;
+ ibuf->format = NULL; /* Tag as non alloced. */
}
}
-
- /* Resize down the handle buffer (ibuffers). */
/* Rounding up to nearest chunk size. */
- minimum_alloc_size += BUFFER_CHUNK_SIZE - 1;
- minimum_alloc_size -= minimum_alloc_size % BUFFER_CHUNK_SIZE;
+ realloc_size += BUFFER_CHUNK_SIZE - 1;
+ realloc_size -= realloc_size % BUFFER_CHUNK_SIZE;
/* Resize down if necessary. */
- if (minimum_alloc_size < idatalist->ibuffers.alloc_size) {
- idatalist->ibuffers.alloc_size = minimum_alloc_size;
- idatalist->ibuffers.ibufs = MEM_reallocN(idatalist->ibuffers.ibufs,
- minimum_alloc_size * sizeof(DRWInstanceBuffer));
+ if (realloc_size < instancing->alloc_size) {
+ instancing->alloc_size = realloc_size;
+ instancing->ibufs = MEM_reallocN(instancing->ibufs, realloc_size * sizeof(DRWInstancingBuffer));
}
}
@@ -183,7 +285,7 @@ void DRW_instance_buffer_finish(DRWInstanceDataList *idatalist)
static DRWInstanceData *drw_instance_data_create(
DRWInstanceDataList *idatalist, unsigned int attrib_size, unsigned int instance_group)
{
- DRWInstanceData *idata = MEM_mallocN(sizeof(DRWInstanceData), "DRWInstanceData");
+ DRWInstanceData *idata = MEM_callocN(sizeof(DRWInstanceData), "DRWInstanceData");
idata->next = NULL;
idata->used = true;
idata->data_size = attrib_size;
@@ -263,15 +365,18 @@ DRWInstanceData *DRW_instance_data_request(
DRWInstanceDataList *DRW_instance_data_list_create(void)
{
DRWInstanceDataList *idatalist = MEM_callocN(sizeof(DRWInstanceDataList), "DRWInstanceDataList");
- idatalist->ibuffers.ibufs = MEM_callocN(sizeof(DRWInstanceBuffer) * BUFFER_CHUNK_SIZE, "DRWInstanceBuffers");
- idatalist->ibuffers.alloc_size = BUFFER_CHUNK_SIZE;
+ idatalist->batching.bbufs = MEM_callocN(sizeof(DRWBatchingBuffer) * BUFFER_CHUNK_SIZE, "DRWBatchingBuffers");
+ idatalist->batching.alloc_size = BUFFER_CHUNK_SIZE;
+ idatalist->instancing.ibufs = MEM_callocN(sizeof(DRWInstancingBuffer) * BUFFER_CHUNK_SIZE, "DRWInstancingBuffers");
+ idatalist->instancing.alloc_size = BUFFER_CHUNK_SIZE;
+
+ BLI_addtail(&g_idatalists, idatalist);
return idatalist;
}
void DRW_instance_data_list_free(DRWInstanceDataList *idatalist)
{
- DRWInstanceBuffer *ibuf = idatalist->ibuffers.ibufs;
DRWInstanceData *idata, *next_idata;
for (int i = 0; i < MAX_INSTANCE_DATA_SIZE; ++i) {
@@ -284,10 +389,21 @@ void DRW_instance_data_list_free(DRWInstanceDataList *idatalist)
idatalist->idata_tail[i] = NULL;
}
- for (int i = 0; i < idatalist->ibuffers.alloc_size; i++, ibuf++) {
+ DRWBatchingBuffer *bbuf = idatalist->batching.bbufs;
+ for (int i = 0; i < idatalist->batching.alloc_size; i++, bbuf++) {
+ GWN_VERTBUF_DISCARD_SAFE(bbuf->vert);
+ GWN_BATCH_DISCARD_SAFE(bbuf->batch);
+ }
+ MEM_freeN(idatalist->batching.bbufs);
+
+ DRWInstancingBuffer *ibuf = idatalist->instancing.ibufs;
+ for (int i = 0; i < idatalist->instancing.alloc_size; i++, ibuf++) {
+ GWN_VERTBUF_DISCARD_SAFE(ibuf->vert);
GWN_BATCH_DISCARD_SAFE(ibuf->batch);
}
- MEM_freeN(idatalist->ibuffers.ibufs);
+ MEM_freeN(idatalist->instancing.ibufs);
+
+ BLI_remlink(&g_idatalists, idatalist);
}
void DRW_instance_data_list_reset(DRWInstanceDataList *idatalist)
diff --git a/source/blender/draw/intern/draw_instance_data.h b/source/blender/draw/intern/draw_instance_data.h
index a7a66c9baff..3b0f7839277 100644
--- a/source/blender/draw/intern/draw_instance_data.h
+++ b/source/blender/draw/intern/draw_instance_data.h
@@ -43,9 +43,12 @@ void *DRW_instance_data_get(DRWInstanceData *idata);
DRWInstanceData *DRW_instance_data_request(
DRWInstanceDataList *idatalist, unsigned int attrib_size, unsigned int instance_group);
-void DRW_instance_buffer_request(
- DRWInstanceDataList *idatalist, Gwn_VertFormat *format, struct DRWShadingGroup *shgroup,
- Gwn_Batch **r_batch, Gwn_VertBuf **r_vert, Gwn_PrimType type);
+void DRW_batching_buffer_request(
+ DRWInstanceDataList *idatalist, Gwn_VertFormat *format, Gwn_PrimType type, struct DRWShadingGroup *shgroup,
+ Gwn_Batch **r_batch, Gwn_VertBuf **r_vert);
+void DRW_instancing_buffer_request(
+ DRWInstanceDataList *idatalist, Gwn_VertFormat *format, Gwn_Batch *instance, struct DRWShadingGroup *shgroup,
+ Gwn_Batch **r_batch, Gwn_VertBuf **r_vert);
/* Upload all instance data to the GPU as soon as possible. */
void DRW_instance_buffer_finish(DRWInstanceDataList *idatalist);
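A hedged sketch of how an engine is expected to call the new request functions (identifiers such as format, instance_geom and shgroup are placeholders; the real call sites are in draw_manager.c below):

    /* Illustration only. */
    Gwn_Batch* inst_batch;
    Gwn_VertBuf* inst_data;
    DRW_instancing_buffer_request(DST.idatalist, format, instance_geom, shgroup,
                                  &inst_batch, &inst_data);
    /* Fill inst_data with per-instance attribs each frame;
     * DRW_instance_buffer_finish() then uploads the data and trims unused buffers. */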
diff --git a/source/blender/draw/intern/draw_manager.c b/source/blender/draw/intern/draw_manager.c
index a3a59efc799..5299fa04e4e 100644
--- a/source/blender/draw/intern/draw_manager.c
+++ b/source/blender/draw/intern/draw_manager.c
@@ -665,6 +665,24 @@ static void drw_interface_init(DRWInterface *interface, GPUShader *shader)
}
static void drw_interface_instance_init(
+ DRWShadingGroup *shgroup, GPUShader *shader, Gwn_Batch *batch, Gwn_VertFormat *format)
+{
+ DRWInterface *interface = &shgroup->interface;
+ drw_interface_init(interface, shader);
+
+#ifndef NDEBUG
+ interface->attribs_count = (format != NULL) ? format->attrib_ct : 0;
+#endif
+ BLI_assert(shgroup->type == DRW_SHG_INSTANCE);
+ BLI_assert(shgroup->instance_geom != NULL);
+
+ if (format != NULL) {
+ DRW_instancing_buffer_request(DST.idatalist, format, batch, shgroup,
+ &shgroup->instancing_geom, &interface->instance_vbo);
+ }
+}
+
+static void drw_interface_batching_init(
DRWShadingGroup *shgroup, GPUShader *shader, Gwn_VertFormat *format)
{
DRWInterface *interface = &shgroup->interface;
@@ -673,36 +691,19 @@ static void drw_interface_instance_init(
#ifndef NDEBUG
interface->attribs_count = (format != NULL) ? format->attrib_ct : 0;
#endif
+ BLI_assert(format != NULL);
Gwn_PrimType type;
- Gwn_Batch **r_batch = NULL;
switch (shgroup->type) {
- case DRW_SHG_INSTANCE:
- r_batch = &shgroup->instancing_geom;
- type = GWN_PRIM_POINTS;
- break;
- case DRW_SHG_POINT_BATCH:
- r_batch = &shgroup->batch_geom;
- type = GWN_PRIM_POINTS;
- break;
- case DRW_SHG_LINE_BATCH:
- r_batch = &shgroup->batch_geom;
- type = GWN_PRIM_LINES;
- break;
- case DRW_SHG_TRIANGLE_BATCH:
- r_batch = &shgroup->batch_geom;
- type = GWN_PRIM_TRIS;
- break;
+ case DRW_SHG_POINT_BATCH: type = GWN_PRIM_POINTS; break;
+ case DRW_SHG_LINE_BATCH: type = GWN_PRIM_LINES; break;
+ case DRW_SHG_TRIANGLE_BATCH: type = GWN_PRIM_TRIS; break;
default:
BLI_assert(0);
}
- if (format != NULL) {
- DRW_instance_buffer_request(DST.idatalist, format, shgroup, r_batch, &interface->instance_vbo, type);
- }
- else {
- *r_batch = NULL;
- }
+ DRW_batching_buffer_request(DST.idatalist, format, type, shgroup,
+ &shgroup->batch_geom, &interface->instance_vbo);
}
static void drw_interface_uniform(DRWShadingGroup *shgroup, const char *name,
@@ -882,7 +883,7 @@ DRWShadingGroup *DRW_shgroup_material_instance_create(
shgroup->type = DRW_SHG_INSTANCE;
shgroup->instance_geom = geom;
shgroup->instance_data = ob->data;
- drw_interface_instance_init(shgroup, GPU_pass_shader(gpupass), format);
+ drw_interface_instance_init(shgroup, GPU_pass_shader(gpupass), geom, format);
drw_shgroup_material_inputs(shgroup, material, gpupass);
}
@@ -890,7 +891,7 @@ DRWShadingGroup *DRW_shgroup_material_instance_create(
}
DRWShadingGroup *DRW_shgroup_material_empty_tri_batch_create(
- struct GPUMaterial *material, DRWPass *pass, int size)
+ struct GPUMaterial *material, DRWPass *pass, int tri_count)
{
#ifdef USE_GPU_SELECT
BLI_assert((G.f & G_PICKSEL) == 0);
@@ -899,10 +900,10 @@ DRWShadingGroup *DRW_shgroup_material_empty_tri_batch_create(
DRWShadingGroup *shgroup = drw_shgroup_material_create_ex(gpupass, pass);
if (shgroup) {
- shgroup->type = DRW_SHG_TRIANGLE_BATCH;
- shgroup->interface.instance_count = size * 3;
- /* Calling drw_interface_init will cause it to GWN_batch_draw_procedural. */
+ /* Calling drw_interface_init will cause it to call GWN_draw_primitive(). */
drw_interface_init(&shgroup->interface, GPU_pass_shader(gpupass));
+ shgroup->type = DRW_SHG_TRIANGLE_BATCH;
+ shgroup->interface.instance_count = tri_count * 3;
drw_shgroup_material_inputs(shgroup, material, gpupass);
}
@@ -923,7 +924,7 @@ DRWShadingGroup *DRW_shgroup_instance_create(
shgroup->type = DRW_SHG_INSTANCE;
shgroup->instance_geom = geom;
- drw_interface_instance_init(shgroup, shader, format);
+ drw_interface_instance_init(shgroup, shader, geom, format);
return shgroup;
}
@@ -937,7 +938,7 @@ DRWShadingGroup *DRW_shgroup_point_batch_create(struct GPUShader *shader, DRWPas
DRWShadingGroup *shgroup = drw_shgroup_create_ex(shader, pass);
shgroup->type = DRW_SHG_POINT_BATCH;
- drw_interface_instance_init(shgroup, shader, g_pos_format);
+ drw_interface_batching_init(shgroup, shader, g_pos_format);
return shgroup;
}
@@ -949,7 +950,7 @@ DRWShadingGroup *DRW_shgroup_line_batch_create(struct GPUShader *shader, DRWPass
DRWShadingGroup *shgroup = drw_shgroup_create_ex(shader, pass);
shgroup->type = DRW_SHG_LINE_BATCH;
- drw_interface_instance_init(shgroup, shader, g_pos_format);
+ drw_interface_batching_init(shgroup, shader, g_pos_format);
return shgroup;
}
@@ -957,18 +958,18 @@ DRWShadingGroup *DRW_shgroup_line_batch_create(struct GPUShader *shader, DRWPass
/* Very special batch. Use this if you position
* your vertices with the vertex shader
* and dont need any VBO attrib */
-DRWShadingGroup *DRW_shgroup_empty_tri_batch_create(struct GPUShader *shader, DRWPass *pass, int size)
+DRWShadingGroup *DRW_shgroup_empty_tri_batch_create(struct GPUShader *shader, DRWPass *pass, int tri_count)
{
#ifdef USE_GPU_SELECT
BLI_assert((G.f & G_PICKSEL) == 0);
#endif
DRWShadingGroup *shgroup = drw_shgroup_create_ex(shader, pass);
- /* Calling drw_interface_init will cause it to GWN_batch_draw_procedural. */
+ /* Calling drw_interface_init will cause it to call GWN_draw_primitive(). */
drw_interface_init(&shgroup->interface, shader);
shgroup->type = DRW_SHG_TRIANGLE_BATCH;
- shgroup->interface.instance_count = size * 3;
+ shgroup->interface.instance_count = tri_count * 3;
return shgroup;
}
@@ -991,13 +992,19 @@ void DRW_shgroup_free(struct DRWShadingGroup *UNUSED(shgroup))
} ((void)0)
/* Specify an external batch instead of adding each attrib one by one. */
-void DRW_shgroup_instance_batch(DRWShadingGroup *shgroup, struct Gwn_Batch *instances)
+void DRW_shgroup_instance_batch(DRWShadingGroup *shgroup, struct Gwn_Batch *batch)
{
BLI_assert(shgroup->type == DRW_SHG_INSTANCE);
- BLI_assert(shgroup->instancing_geom == NULL);
+ BLI_assert(shgroup->interface.instance_count == 0);
+ /* You cannot use external instancing batch without a dummy format. */
+ BLI_assert(shgroup->instancing_geom != NULL);
shgroup->type = DRW_SHG_INSTANCE_EXTERNAL;
- shgroup->instancing_geom = instances;
+ /* PERF : This destroys the vaos cache so better check if it's necessary. */
+ /* Note: This WILL break if batch->verts[0] is destroyed and reallocated
+ * at the same address. Bindings/VAOs would remain obsolete. */
+ //if (shgroup->instancing_geom->inst != batch->verts[0])
+ GWN_batch_instbuf_set(shgroup->instancing_geom, batch->verts[0], false);
#ifdef USE_GPU_SELECT
DRWCall *call = BLI_mempool_alloc(DST.vmempool->calls);
@@ -1140,8 +1147,6 @@ void DRW_shgroup_set_instance_count(DRWShadingGroup *shgroup, unsigned int count
unsigned int DRW_shgroup_get_instance_count(const DRWShadingGroup *shgroup)
{
- BLI_assert(shgroup->type != DRW_SHG_NORMAL && shgroup->type != DRW_SHG_INSTANCE_EXTERNAL);
-
return shgroup->interface.instance_count;
}
@@ -1765,18 +1770,17 @@ static void draw_geometry_execute_ex(
if (geom == NULL) {
BLI_assert(shgroup->type == DRW_SHG_TRIANGLE_BATCH); /* Add other type if needed. */
/* Shader is already bound. */
- Gwn_Batch *batch = DRW_cache_fullscreen_quad_get();
- GWN_batch_draw_procedural(batch, GWN_PRIM_TRIS, count);
+ GWN_draw_primitive(GWN_PRIM_TRIS, count);
return;
}
/* step 2 : bind vertex array & draw */
GWN_batch_program_set(geom, GPU_shader_get_program(shgroup->shader), GPU_shader_get_interface(shgroup->shader));
if (ELEM(shgroup->type, DRW_SHG_INSTANCE, DRW_SHG_INSTANCE_EXTERNAL)) {
- GWN_batch_draw_stupid_instanced(geom, shgroup->instancing_geom, start, count);
+ GWN_batch_draw_range_ex(geom, start, count, true);
}
else {
- GWN_batch_draw_stupid(geom, start, count);
+ GWN_batch_draw_range(geom, start, count);
}
/* XXX this just tells gawain we are done with the shader.
* This does not unbind the shader. */
@@ -1998,7 +2002,7 @@ static void draw_shgroup(DRWShadingGroup *shgroup, DRWState pass_state)
if (shgroup->type == DRW_SHG_INSTANCE_EXTERNAL) {
if (shgroup->instancing_geom != NULL) {
GPU_SELECT_LOAD_IF_PICKSEL((DRWCall *)shgroup->calls_first);
- draw_geometry(shgroup, shgroup->instance_geom, obmat, shgroup->instance_data, 0, 0);
+ draw_geometry(shgroup, shgroup->instancing_geom, obmat, shgroup->instance_data, 0, 0);
}
}
else {
@@ -2006,13 +2010,15 @@ static void draw_shgroup(DRWShadingGroup *shgroup, DRWState pass_state)
unsigned int count, start;
GPU_SELECT_LOAD_IF_PICKSEL_LIST(shgroup, start, count)
{
- draw_geometry(shgroup, shgroup->instance_geom, obmat, shgroup->instance_data, start, count);
+ draw_geometry(shgroup,
+ (shgroup->instancing_geom) ? shgroup->instancing_geom : shgroup->instance_geom,
+ obmat, shgroup->instance_data, start, count);
}
GPU_SELECT_LOAD_IF_PICKSEL_LIST_END(start, count)
}
}
}
- else {
+ else { /* DRW_SHG_***_BATCH */
/* Some dynamic batch can have no geom (no call to aggregate) */
if (shgroup->interface.instance_count > 0) {
unsigned int count, start;
diff --git a/source/blender/draw/modes/object_mode.c b/source/blender/draw/modes/object_mode.c
index 4a7a5d25b11..d6c0369b0a5 100644
--- a/source/blender/draw/modes/object_mode.c
+++ b/source/blender/draw/modes/object_mode.c
@@ -218,6 +218,7 @@ typedef struct OBJECT_PrivateData {
static struct {
/* Instance Data format */
+ struct Gwn_VertFormat *particle_format;
struct Gwn_VertFormat *empty_image_format;
struct Gwn_VertFormat *empty_image_wire_format;
@@ -537,6 +538,7 @@ static void OBJECT_engine_init(void *vedata)
static void OBJECT_engine_free(void)
{
+ MEM_SAFE_FREE(e_data.particle_format);
MEM_SAFE_FREE(e_data.empty_image_format);
MEM_SAFE_FREE(e_data.empty_image_wire_format);
DRW_SHADER_FREE_SAFE(e_data.outline_resolve_sh);
@@ -1752,6 +1754,9 @@ static void OBJECT_cache_populate_particles(Object *ob,
static float def_prim_col[3] = {0.5f, 0.5f, 0.5f};
static float def_sec_col[3] = {1.0f, 1.0f, 1.0f};
+ /* Dummy particle format for instancing to work. */
+ DRW_shgroup_instance_format(e_data.particle_format, {{"dummy", DRW_ATTRIB_FLOAT, 1}});
+
Material *ma = give_current_material(ob, part->omat);
switch (draw_as) {
@@ -1766,21 +1771,24 @@ static void OBJECT_cache_populate_particles(Object *ob,
break;
case PART_DRAW_CROSS:
shgrp = DRW_shgroup_instance_create(
- e_data.part_prim_sh, psl->particle, DRW_cache_particles_get_prim(PART_DRAW_CROSS), NULL);
+ e_data.part_prim_sh, psl->particle, DRW_cache_particles_get_prim(PART_DRAW_CROSS),
+ e_data.particle_format);
DRW_shgroup_uniform_texture(shgrp, "ramp", globals_ramp);
DRW_shgroup_uniform_vec3(shgrp, "color", ma ? &ma->r : def_prim_col, 1);
DRW_shgroup_uniform_int(shgrp, "screen_space", &screen_space[0], 1);
break;
case PART_DRAW_CIRC:
shgrp = DRW_shgroup_instance_create(
- e_data.part_prim_sh, psl->particle, DRW_cache_particles_get_prim(PART_DRAW_CIRC), NULL);
+ e_data.part_prim_sh, psl->particle, DRW_cache_particles_get_prim(PART_DRAW_CIRC),
+ e_data.particle_format);
DRW_shgroup_uniform_texture(shgrp, "ramp", globals_ramp);
DRW_shgroup_uniform_vec3(shgrp, "color", ma ? &ma->r : def_prim_col, 1);
DRW_shgroup_uniform_int(shgrp, "screen_space", &screen_space[1], 1);
break;
case PART_DRAW_AXIS:
shgrp = DRW_shgroup_instance_create(
- e_data.part_axis_sh, psl->particle, DRW_cache_particles_get_prim(PART_DRAW_AXIS), NULL);
+ e_data.part_axis_sh, psl->particle, DRW_cache_particles_get_prim(PART_DRAW_AXIS),
+ e_data.particle_format);
DRW_shgroup_uniform_int(shgrp, "screen_space", &screen_space[0], 1);
break;
default: