From 8f9599d17e80254928d2d72081a4c7e0dee64038 Mon Sep 17 00:00:00 2001 From: Jeroen Bakker Date: Wed, 26 May 2021 17:02:32 +0200 Subject: DrawManager: Use Compute Shader to Update Hair. This patch will use compute shaders to create the VBO for hair. The previous implementation uses transform feedback. Timings master (transform feedback with GPU_USAGE_STATIC) between 0.000069s and 0.000362s Timings transform feedback with GPU_USAGE_DEVICE_ONLY between 0.000057s and 0.000122s Timings compute shader between 0.000032s and 0.000092s Future improvements: * Generate hair Index buffer using compute shaders: currently done single threaded on CPU, easy to add as compute shader. Reviewed By: fclem Differential Revision: https://developer.blender.org/D11057 --- source/blender/draw/CMakeLists.txt | 1 + source/blender/draw/intern/DRW_render.h | 7 ++ source/blender/draw/intern/draw_cache_impl_hair.c | 3 +- source/blender/draw/intern/draw_hair.c | 135 +++++++++++++++------ source/blender/draw/intern/draw_manager.h | 13 ++ source/blender/draw/intern/draw_manager_data.c | 36 ++++++ source/blender/draw/intern/draw_manager_exec.c | 10 ++ .../draw/intern/shaders/common_hair_lib.glsl | 78 +++++++++++- .../intern/shaders/common_hair_refine_comp.glsl | 24 ++++ .../intern/shaders/common_hair_refine_vert.glsl | 45 +------ source/blender/gpu/GPU_capabilities.h | 2 + source/blender/gpu/intern/gpu_capabilities.cc | 10 ++ .../blender/gpu/intern/gpu_capabilities_private.hh | 2 + source/blender/gpu/opengl/gl_backend.cc | 8 ++ 14 files changed, 292 insertions(+), 82 deletions(-) create mode 100644 source/blender/draw/intern/shaders/common_hair_refine_comp.glsl (limited to 'source/blender') diff --git a/source/blender/draw/CMakeLists.txt b/source/blender/draw/CMakeLists.txt index 045adf4b380..95c0f5d300c 100644 --- a/source/blender/draw/CMakeLists.txt +++ b/source/blender/draw/CMakeLists.txt @@ -321,6 +321,7 @@ data_to_c_simple(intern/shaders/common_globals_lib.glsl SRC) 
data_to_c_simple(intern/shaders/common_pointcloud_lib.glsl SRC) data_to_c_simple(intern/shaders/common_hair_lib.glsl SRC) data_to_c_simple(intern/shaders/common_hair_refine_vert.glsl SRC) +data_to_c_simple(intern/shaders/common_hair_refine_comp.glsl SRC) data_to_c_simple(intern/shaders/common_math_lib.glsl SRC) data_to_c_simple(intern/shaders/common_math_geom_lib.glsl SRC) data_to_c_simple(intern/shaders/common_view_lib.glsl SRC) diff --git a/source/blender/draw/intern/DRW_render.h b/source/blender/draw/intern/DRW_render.h index 2545cfa65dc..5071658fd82 100644 --- a/source/blender/draw/intern/DRW_render.h +++ b/source/blender/draw/intern/DRW_render.h @@ -438,6 +438,10 @@ void DRW_shgroup_call_range( void DRW_shgroup_call_instance_range( DRWShadingGroup *shgroup, Object *ob, struct GPUBatch *geom, uint i_sta, uint i_ct); +void DRW_shgroup_call_compute(DRWShadingGroup *shgroup, + int groups_x_len, + int groups_y_len, + int groups_z_len); void DRW_shgroup_call_procedural_points(DRWShadingGroup *sh, Object *ob, uint point_count); void DRW_shgroup_call_procedural_lines(DRWShadingGroup *sh, Object *ob, uint line_count); void DRW_shgroup_call_procedural_triangles(DRWShadingGroup *sh, Object *ob, uint tri_count); @@ -575,6 +579,9 @@ void DRW_shgroup_uniform_vec4_array_copy(DRWShadingGroup *shgroup, const char *name, const float (*value)[4], int arraysize); +void DRW_shgroup_vertex_buffer(DRWShadingGroup *shgroup, + const char *name, + struct GPUVertBuf *vertex_buffer); bool DRW_shgroup_is_empty(DRWShadingGroup *shgroup); diff --git a/source/blender/draw/intern/draw_cache_impl_hair.c b/source/blender/draw/intern/draw_cache_impl_hair.c index fd28ac00186..6424b21666d 100644 --- a/source/blender/draw/intern/draw_cache_impl_hair.c +++ b/source/blender/draw/intern/draw_cache_impl_hair.c @@ -243,7 +243,8 @@ static void hair_batch_cache_ensure_procedural_final_points(ParticleHairCache *c GPUVertFormat format = {0}; GPU_vertformat_attr_add(&format, "pos", GPU_COMP_F32, 4, 
GPU_FETCH_FLOAT); - cache->final[subdiv].proc_buf = GPU_vertbuf_create_with_format(&format); + cache->final[subdiv].proc_buf = GPU_vertbuf_create_with_format_ex(&format, + GPU_USAGE_DEVICE_ONLY); /* Create a destination buffer for the transform feedback. Sized appropriately */ /* Those are points! not line segments. */ diff --git a/source/blender/draw/intern/draw_hair.c b/source/blender/draw/intern/draw_hair.c index bca227a24e2..258777b34fb 100644 --- a/source/blender/draw/intern/draw_hair.c +++ b/source/blender/draw/intern/draw_hair.c @@ -36,15 +36,28 @@ #include "BKE_duplilist.h" #include "GPU_batch.h" +#include "GPU_capabilities.h" +#include "GPU_compute.h" #include "GPU_shader.h" +#include "GPU_texture.h" #include "GPU_vertex_buffer.h" #include "draw_hair_private.h" #ifndef __APPLE__ # define USE_TRANSFORM_FEEDBACK +# define USE_COMPUTE_SHADERS #endif +BLI_INLINE bool drw_hair_use_compute_shaders(void) +{ +#ifdef USE_COMPUTE_SHADERS + return GPU_compute_shader_support(); +#else + return false; +#endif +} + typedef enum ParticleRefineShader { PART_REFINE_CATMULL_ROM = 0, PART_REFINE_MAX_SHADER, @@ -71,6 +84,7 @@ static DRWPass *g_tf_pass; /* XXX can be a problem with multiple DRWManager in t extern char datatoc_common_hair_lib_glsl[]; extern char datatoc_common_hair_refine_vert_glsl[]; +extern char datatoc_common_hair_refine_comp_glsl[]; extern char datatoc_gpu_shader_3D_smooth_color_frag_glsl[]; static GPUShader *hair_refine_shader_get(ParticleRefineShader sh) @@ -79,15 +93,26 @@ static GPUShader *hair_refine_shader_get(ParticleRefineShader sh) return g_refine_shaders[sh]; } - char *vert_with_lib = BLI_string_joinN(datatoc_common_hair_lib_glsl, - datatoc_common_hair_refine_vert_glsl); +#ifdef USE_COMPUTE_SHADERS + const bool do_compute = drw_hair_use_compute_shaders(); + if (do_compute) { + g_refine_shaders[sh] = GPU_shader_create_compute(datatoc_common_hair_refine_comp_glsl, + datatoc_common_hair_lib_glsl, + "#define HAIR_PHASE_SUBDIV\n", + __func__); + return 
g_refine_shaders[sh]; + } +#endif #ifdef USE_TRANSFORM_FEEDBACK + char *shader_src = BLI_string_joinN(datatoc_common_hair_lib_glsl, + datatoc_common_hair_refine_vert_glsl); const char *var_names[1] = {"finalColor"}; g_refine_shaders[sh] = DRW_shader_create_with_transform_feedback( - vert_with_lib, NULL, "#define HAIR_PHASE_SUBDIV\n", GPU_SHADER_TFB_POINTS, var_names, 1); + shader_src, NULL, "#define HAIR_PHASE_SUBDIV\n", GPU_SHADER_TFB_POINTS, var_names, 1); + #else - g_refine_shaders[sh] = DRW_shader_create(vert_with_lib, + g_refine_shaders[sh] = DRW_shader_create(shader_src, NULL, datatoc_gpu_shader_3D_smooth_color_frag_glsl, "#define blender_srgb_to_framebuffer_space(a) a\n" @@ -95,14 +120,14 @@ static GPUShader *hair_refine_shader_get(ParticleRefineShader sh) "#define TF_WORKAROUND\n"); #endif - MEM_freeN(vert_with_lib); + MEM_freeN(shader_src); return g_refine_shaders[sh]; } void DRW_hair_init(void) { -#ifdef USE_TRANSFORM_FEEDBACK +#if defined(USE_TRANSFORM_FEEDBACK) || defined(USE_COMPUTE_SHADERS) g_tf_pass = DRW_pass_create("Update Hair Pass", 0); #else g_tf_pass = DRW_pass_create("Update Hair Pass", DRW_STATE_WRITE_COLOR); @@ -125,6 +150,67 @@ void DRW_hair_init(void) } } +static void drw_hair_particle_cache_shgrp_attach_resources(DRWShadingGroup *shgrp, + ParticleHairCache *cache, + const int subdiv) +{ + DRW_shgroup_uniform_texture(shgrp, "hairPointBuffer", cache->point_tex); + DRW_shgroup_uniform_texture(shgrp, "hairStrandBuffer", cache->strand_tex); + DRW_shgroup_uniform_texture(shgrp, "hairStrandSegBuffer", cache->strand_seg_tex); + DRW_shgroup_uniform_int(shgrp, "hairStrandsRes", &cache->final[subdiv].strands_res, 1); +} + +static void drw_hair_particle_cache_update_compute(ParticleHairCache *cache, const int subdiv) +{ + const int strands_len = cache->strands_len; + const int final_points_len = cache->final[subdiv].strands_res * strands_len; + if (final_points_len > 0) { + GPUShader *shader = hair_refine_shader_get(PART_REFINE_CATMULL_ROM); + 
DRWShadingGroup *shgrp = DRW_shgroup_create(shader, g_tf_pass); + drw_hair_particle_cache_shgrp_attach_resources(shgrp, cache, subdiv); + DRW_shgroup_vertex_buffer(shgrp, "hairPointOutputBuffer", cache->final[subdiv].proc_buf); + + const int max_strands_per_call = GPU_max_work_group_count(0); + int strands_start = 0; + while (strands_start < strands_len) { + int batch_strands_len = MIN2(strands_len - strands_start, max_strands_per_call); + DRWShadingGroup *subgroup = DRW_shgroup_create_sub(shgrp); + DRW_shgroup_uniform_int_copy(subgroup, "hairStrandOffset", strands_start); + DRW_shgroup_call_compute(subgroup, batch_strands_len, cache->final[subdiv].strands_res, 1); + strands_start += batch_strands_len; + } + } +} + +static void drw_hair_particle_cache_update_transform_feedback(ParticleHairCache *cache, + const int subdiv) +{ + const int final_points_len = cache->final[subdiv].strands_res * cache->strands_len; + if (final_points_len > 0) { + GPUShader *tf_shader = hair_refine_shader_get(PART_REFINE_CATMULL_ROM); + +#ifdef USE_TRANSFORM_FEEDBACK + DRWShadingGroup *tf_shgrp = DRW_shgroup_transform_feedback_create( + tf_shader, g_tf_pass, cache->final[subdiv].proc_buf); +#else + DRWShadingGroup *tf_shgrp = DRW_shgroup_create(tf_shader, g_tf_pass); + + ParticleRefineCall *pr_call = MEM_mallocN(sizeof(*pr_call), __func__); + pr_call->next = g_tf_calls; + pr_call->vbo = cache->final[subdiv].proc_buf; + pr_call->shgrp = tf_shgrp; + pr_call->vert_len = final_points_len; + g_tf_calls = pr_call; + DRW_shgroup_uniform_int(tf_shgrp, "targetHeight", &g_tf_target_height, 1); + DRW_shgroup_uniform_int(tf_shgrp, "targetWidth", &g_tf_target_width, 1); + DRW_shgroup_uniform_int(tf_shgrp, "idOffset", &g_tf_id_offset, 1); +#endif + + drw_hair_particle_cache_shgrp_attach_resources(tf_shgrp, cache, subdiv); + DRW_shgroup_call_procedural_points(tf_shgrp, NULL, final_points_len); + } +} + static ParticleHairCache *drw_hair_particle_cache_get( Object *object, ParticleSystem *psys, 
ModifierData *md, int subdiv, int thickness_res) { @@ -140,32 +226,11 @@ static ParticleHairCache *drw_hair_particle_cache_get( } if (update) { - int final_points_len = cache->final[subdiv].strands_res * cache->strands_len; - if (final_points_len > 0) { - GPUShader *tf_shader = hair_refine_shader_get(PART_REFINE_CATMULL_ROM); - -#ifdef USE_TRANSFORM_FEEDBACK - DRWShadingGroup *tf_shgrp = DRW_shgroup_transform_feedback_create( - tf_shader, g_tf_pass, cache->final[subdiv].proc_buf); -#else - DRWShadingGroup *tf_shgrp = DRW_shgroup_create(tf_shader, g_tf_pass); - - ParticleRefineCall *pr_call = MEM_mallocN(sizeof(*pr_call), __func__); - pr_call->next = g_tf_calls; - pr_call->vbo = cache->final[subdiv].proc_buf; - pr_call->shgrp = tf_shgrp; - pr_call->vert_len = final_points_len; - g_tf_calls = pr_call; - DRW_shgroup_uniform_int(tf_shgrp, "targetHeight", &g_tf_target_height, 1); - DRW_shgroup_uniform_int(tf_shgrp, "targetWidth", &g_tf_target_width, 1); - DRW_shgroup_uniform_int(tf_shgrp, "idOffset", &g_tf_id_offset, 1); -#endif - - DRW_shgroup_uniform_texture(tf_shgrp, "hairPointBuffer", cache->point_tex); - DRW_shgroup_uniform_texture(tf_shgrp, "hairStrandBuffer", cache->strand_tex); - DRW_shgroup_uniform_texture(tf_shgrp, "hairStrandSegBuffer", cache->strand_seg_tex); - DRW_shgroup_uniform_int(tf_shgrp, "hairStrandsRes", &cache->final[subdiv].strands_res, 1); - DRW_shgroup_call_procedural_points(tf_shgrp, NULL, final_points_len); + if (drw_hair_use_compute_shaders()) { + drw_hair_particle_cache_update_compute(cache, subdiv); + } + else { + drw_hair_particle_cache_update_transform_feedback(cache, subdiv); } } return cache; @@ -367,9 +432,11 @@ void DRW_hair_update(void) MEM_freeN(data); GPU_framebuffer_free(fb); #else - /* TODO(fclem): replace by compute shader. */ - /* Just render using transform feedback. */ + /* Just render the pass when using compute shaders or transform feedback. 
*/ DRW_draw_pass(g_tf_pass); + if (drw_hair_use_compute_shaders()) { + GPU_memory_barrier(GPU_BARRIER_SHADER_STORAGE); + } #endif } diff --git a/source/blender/draw/intern/draw_manager.h b/source/blender/draw/intern/draw_manager.h index 84bc0327aa2..d4e22c83798 100644 --- a/source/blender/draw/intern/draw_manager.h +++ b/source/blender/draw/intern/draw_manager.h @@ -187,6 +187,10 @@ typedef enum { DRW_CMD_DRAW_INSTANCE = 2, DRW_CMD_DRAW_INSTANCE_RANGE = 3, DRW_CMD_DRAW_PROCEDURAL = 4, + + /* Compute Commands. */ + DRW_CMD_COMPUTE = 8, + /* Other Commands */ DRW_CMD_CLEAR = 12, DRW_CMD_DRWSTATE = 13, @@ -224,6 +228,12 @@ typedef struct DRWCommandDrawInstanceRange { uint inst_count; } DRWCommandDrawInstanceRange; +typedef struct DRWCommandCompute { + int groups_x_len; + int groups_y_len; + int groups_z_len; +} DRWCommandCompute; + typedef struct DRWCommandDrawProcedural { GPUBatch *batch; DRWResourceHandle handle; @@ -260,6 +270,7 @@ typedef union DRWCommand { DRWCommandDrawInstance instance; DRWCommandDrawInstanceRange instance_range; DRWCommandDrawProcedural procedural; + DRWCommandCompute compute; DRWCommandSetMutableState state; DRWCommandSetStencil stencil; DRWCommandSetSelectID select_id; @@ -274,6 +285,7 @@ struct DRWCallBuffer { }; /** Used by #DRWUniform.type */ +/* TODO(jbakker): rename to DRW_RESOURCE/DRWResourceType. 
*/ typedef enum { DRW_UNIFORM_INT = 0, DRW_UNIFORM_INT_COPY, @@ -286,6 +298,7 @@ typedef enum { DRW_UNIFORM_BLOCK, DRW_UNIFORM_BLOCK_REF, DRW_UNIFORM_TFEEDBACK_TARGET, + DRW_UNIFORM_VERTEX_BUFFER_AS_STORAGE, /** Per drawcall uniforms/UBO */ DRW_UNIFORM_BLOCK_OBMATS, DRW_UNIFORM_BLOCK_OBINFOS, diff --git a/source/blender/draw/intern/draw_manager_data.c b/source/blender/draw/intern/draw_manager_data.c index 6bdc5305fed..3b852e7f8c8 100644 --- a/source/blender/draw/intern/draw_manager_data.c +++ b/source/blender/draw/intern/draw_manager_data.c @@ -47,6 +47,7 @@ #endif #include "GPU_buffers.h" +#include "GPU_capabilities.h" #include "GPU_material.h" #include "GPU_uniform_buffer.h" @@ -446,6 +447,19 @@ void DRW_shgroup_uniform_vec4_array_copy(DRWShadingGroup *shgroup, } } +void DRW_shgroup_vertex_buffer(DRWShadingGroup *shgroup, + const char *name, + GPUVertBuf *vertex_buffer) +{ + int location = GPU_shader_get_ssbo(shgroup->shader, name); + if (location == -1) { + BLI_assert(false && "Unable to locate binding of shader storage buffer objects."); + return; + } + drw_shgroup_uniform_create_ex( + shgroup, location, DRW_UNIFORM_VERTEX_BUFFER_AS_STORAGE, vertex_buffer, 0, 0, 1); +} + /** \} */ /* -------------------------------------------------------------------- */ @@ -700,6 +714,17 @@ static void drw_command_draw_intance_range( cmd->inst_count = count; } +static void drw_command_compute(DRWShadingGroup *shgroup, + int groups_x_len, + int groups_y_len, + int groups_z_len) +{ + DRWCommandCompute *cmd = drw_command_create(shgroup, DRW_CMD_COMPUTE); + cmd->groups_x_len = groups_x_len; + cmd->groups_y_len = groups_y_len; + cmd->groups_z_len = groups_z_len; +} + static void drw_command_draw_procedural(DRWShadingGroup *shgroup, GPUBatch *batch, DRWResourceHandle handle, @@ -815,6 +840,17 @@ void DRW_shgroup_call_instance_range( drw_command_draw_intance_range(shgroup, geom, handle, i_sta, i_ct); } +void DRW_shgroup_call_compute(DRWShadingGroup *shgroup, + int groups_x_len, + int 
groups_y_len, + int groups_z_len) +{ + BLI_assert(groups_x_len > 0 && groups_y_len > 0 && groups_z_len > 0); + BLI_assert(GPU_compute_shader_support()); + + drw_command_compute(shgroup, groups_x_len, groups_y_len, groups_z_len); +} + static void drw_shgroup_call_procedural_add_ex(DRWShadingGroup *shgroup, GPUBatch *geom, Object *ob, diff --git a/source/blender/draw/intern/draw_manager_exec.c b/source/blender/draw/intern/draw_manager_exec.c index 4c8fcb0e016..f29caebeb84 100644 --- a/source/blender/draw/intern/draw_manager_exec.c +++ b/source/blender/draw/intern/draw_manager_exec.c @@ -29,6 +29,7 @@ #include "BKE_global.h" +#include "GPU_compute.h" #include "GPU_platform.h" #include "GPU_shader.h" #include "GPU_state.h" @@ -672,6 +673,9 @@ static void draw_update_uniforms(DRWShadingGroup *shgroup, *use_tfeedback = GPU_shader_transform_feedback_enable(shgroup->shader, ((GPUVertBuf *)uni->pvalue)); break; + case DRW_UNIFORM_VERTEX_BUFFER_AS_STORAGE: + GPU_vertbuf_bind_as_ssbo((GPUVertBuf *)uni->pvalue, uni->location); + break; /* Legacy/Fallback support. */ case DRW_UNIFORM_BASE_INSTANCE: state->baseinst_loc = uni->location; @@ -1050,6 +1054,12 @@ static void draw_shgroup(DRWShadingGroup *shgroup, DRWState pass_state) cmd->instance_range.inst_count, false); break; + case DRW_CMD_COMPUTE: + GPU_compute_dispatch(shgroup->shader, + cmd->compute.groups_x_len, + cmd->compute.groups_y_len, + cmd->compute.groups_z_len); + break; } } diff --git a/source/blender/draw/intern/shaders/common_hair_lib.glsl b/source/blender/draw/intern/shaders/common_hair_lib.glsl index 8684d82f228..02c335ddae2 100644 --- a/source/blender/draw/intern/shaders/common_hair_lib.glsl +++ b/source/blender/draw/intern/shaders/common_hair_lib.glsl @@ -28,6 +28,9 @@ uniform bool hairCloseTip = true; uniform vec4 hairDupliMatrix[4]; +/* Strand batch offset when used in compute shaders. 
*/ +uniform int hairStrandOffset = 0; + /* -- Per control points -- */ uniform samplerBuffer hairPointBuffer; /* RGBA32F */ #define point_position xyz @@ -43,13 +46,37 @@ uniform usamplerBuffer hairStrandSegBuffer; /* R16UI */ /* -- Subdivision stage -- */ /** - * We use a transform feedback to preprocess the strands and add more subdivision to it. - * For the moment these are simple smooth interpolation but one could hope to see the full + * We use a transform feedback or compute shader to preprocess the strands and add more subdivision + * to it. For the moment these are simple smooth interpolation but one could hope to see the full * children particle modifiers being evaluated at this stage. * * If no more subdivision is needed, we can skip this step. */ +#ifdef GPU_VERTEX_SHADER +float hair_get_local_time() +{ + return float(gl_VertexID % hairStrandsRes) / float(hairStrandsRes - 1); +} + +int hair_get_id() +{ + return gl_VertexID / hairStrandsRes; +} +#endif + +#ifdef GPU_COMPUTE_SHADER +float hair_get_local_time() +{ + return float(gl_GlobalInvocationID.y) / float(hairStrandsRes - 1); +} + +int hair_get_id() +{ + return int(gl_GlobalInvocationID.x) + hairStrandOffset; +} +#endif + #ifdef HAIR_PHASE_SUBDIV int hair_get_base_id(float local_time, int strand_segments, out float interp_time) { @@ -64,9 +91,9 @@ int hair_get_base_id(float local_time, int strand_segments, out float interp_tim void hair_get_interp_attrs( out vec4 data0, out vec4 data1, out vec4 data2, out vec4 data3, out float interp_time) { - float local_time = float(gl_VertexID % hairStrandsRes) / float(hairStrandsRes - 1); + float local_time = hair_get_local_time(); - int hair_id = gl_VertexID / hairStrandsRes; + int hair_id = hair_get_id(); int strand_offset = int(texelFetch(hairStrandBuffer, hair_id).x); int strand_segments = int(texelFetch(hairStrandSegBuffer, hair_id).x); @@ -96,6 +123,7 @@ void hair_get_interp_attrs( */ #if !defined(HAIR_PHASE_SUBDIV) && defined(GPU_VERTEX_SHADER) + int 
hair_get_strand_id(void) { return gl_VertexID / (hairStrandsRes * hairThicknessRes); @@ -227,3 +255,45 @@ vec2 hair_resolve_barycentric(vec2 vert_barycentric) return vec2(1.0 - vert_barycentric.x, 0.0); } } + +/* Hair interpolation functions. */ +vec4 hair_get_weights_cardinal(float t) +{ + float t2 = t * t; + float t3 = t2 * t; +#if defined(CARDINAL) + float fc = 0.71; +#else /* defined(CATMULL_ROM) */ + float fc = 0.5; +#endif + + vec4 weights; + /* GLSL Optimized version of key_curve_position_weights() */ + float fct = t * fc; + float fct2 = t2 * fc; + float fct3 = t3 * fc; + weights.x = (fct2 * 2.0 - fct3) - fct; + weights.y = (t3 * 2.0 - fct3) + (-t2 * 3.0 + fct2) + 1.0; + weights.z = (-t3 * 2.0 + fct3) + (t2 * 3.0 - (2.0 * fct2)) + fct; + weights.w = fct3 - fct2; + return weights; +} + +/* TODO(fclem): This one is buggy, find why. (it's not the optimization!!) */ +vec4 hair_get_weights_bspline(float t) +{ + float t2 = t * t; + float t3 = t2 * t; + + vec4 weights; + /* GLSL Optimized version of key_curve_position_weights() */ + weights.xz = vec2(-0.16666666, -0.5) * t3 + (0.5 * t2 + 0.5 * vec2(-t, t) + 0.16666666); + weights.y = (0.5 * t3 - t2 + 0.66666666); + weights.w = (0.16666666 * t3); + return weights; +} + +vec4 hair_interp_data(vec4 v0, vec4 v1, vec4 v2, vec4 v3, vec4 w) +{ + return v0 * w.x + v1 * w.y + v2 * w.z + v3 * w.w; +} diff --git a/source/blender/draw/intern/shaders/common_hair_refine_comp.glsl b/source/blender/draw/intern/shaders/common_hair_refine_comp.glsl new file mode 100644 index 00000000000..4dcde4b0245 --- /dev/null +++ b/source/blender/draw/intern/shaders/common_hair_refine_comp.glsl @@ -0,0 +1,24 @@ + +/* + * To be compiled with common_hair_lib.glsl. 
+ */ + +layout(local_size_x = 1, local_size_y = 1) in; +layout(std430, binding = 0) writeonly buffer hairPointOutputBuffer +{ + vec4 posTime[]; +} +out_vertbuf; + +void main(void) +{ + float interp_time; + vec4 data0, data1, data2, data3; + hair_get_interp_attrs(data0, data1, data2, data3, interp_time); + + vec4 weights = hair_get_weights_cardinal(interp_time); + vec4 result = hair_interp_data(data0, data1, data2, data3, weights); + + uint index = uint(hair_get_id() * hairStrandsRes) + gl_GlobalInvocationID.y; + out_vertbuf.posTime[index] = result; +} diff --git a/source/blender/draw/intern/shaders/common_hair_refine_vert.glsl b/source/blender/draw/intern/shaders/common_hair_refine_vert.glsl index 3f5e3f8226f..371d43827b9 100644 --- a/source/blender/draw/intern/shaders/common_hair_refine_vert.glsl +++ b/source/blender/draw/intern/shaders/common_hair_refine_vert.glsl @@ -3,47 +3,6 @@ out vec4 finalColor; -vec4 get_weights_cardinal(float t) -{ - float t2 = t * t; - float t3 = t2 * t; -#if defined(CARDINAL) - float fc = 0.71; -#else /* defined(CATMULL_ROM) */ - float fc = 0.5; -#endif - - vec4 weights; - /* GLSL Optimized version of key_curve_position_weights() */ - float fct = t * fc; - float fct2 = t2 * fc; - float fct3 = t3 * fc; - weights.x = (fct2 * 2.0 - fct3) - fct; - weights.y = (t3 * 2.0 - fct3) + (-t2 * 3.0 + fct2) + 1.0; - weights.z = (-t3 * 2.0 + fct3) + (t2 * 3.0 - (2.0 * fct2)) + fct; - weights.w = fct3 - fct2; - return weights; -} - -/* TODO(fclem): This one is buggy, find why. (it's not the optimization!!) 
*/ -vec4 get_weights_bspline(float t) -{ - float t2 = t * t; - float t3 = t2 * t; - - vec4 weights; - /* GLSL Optimized version of key_curve_position_weights() */ - weights.xz = vec2(-0.16666666, -0.5) * t3 + (0.5 * t2 + 0.5 * vec2(-t, t) + 0.16666666); - weights.y = (0.5 * t3 - t2 + 0.66666666); - weights.w = (0.16666666 * t3); - return weights; -} - -vec4 interp_data(vec4 v0, vec4 v1, vec4 v2, vec4 v3, vec4 w) -{ - return v0 * w.x + v1 * w.y + v2 * w.z + v3 * w.w; -} - #ifdef TF_WORKAROUND uniform int targetWidth; uniform int targetHeight; @@ -56,8 +15,8 @@ void main(void) vec4 data0, data1, data2, data3; hair_get_interp_attrs(data0, data1, data2, data3, interp_time); - vec4 weights = get_weights_cardinal(interp_time); - finalColor = interp_data(data0, data1, data2, data3, weights); + vec4 weights = hair_get_weights_cardinal(interp_time); + finalColor = hair_interp_data(data0, data1, data2, data3, weights); #ifdef TF_WORKAROUND int id = gl_VertexID - idOffset; diff --git a/source/blender/gpu/GPU_capabilities.h b/source/blender/gpu/GPU_capabilities.h index 45c656b49be..0c054d4f264 100644 --- a/source/blender/gpu/GPU_capabilities.h +++ b/source/blender/gpu/GPU_capabilities.h @@ -37,6 +37,8 @@ int GPU_max_textures(void); int GPU_max_textures_vert(void); int GPU_max_textures_geom(void); int GPU_max_textures_frag(void); +int GPU_max_work_group_count(int index); +int GPU_max_work_group_size(int index); int GPU_max_uniforms_vert(void); int GPU_max_uniforms_frag(void); int GPU_max_batch_indices(void); diff --git a/source/blender/gpu/intern/gpu_capabilities.cc b/source/blender/gpu/intern/gpu_capabilities.cc index bedc9ad3092..c6e9dc210cb 100644 --- a/source/blender/gpu/intern/gpu_capabilities.cc +++ b/source/blender/gpu/intern/gpu_capabilities.cc @@ -82,6 +82,16 @@ int GPU_max_textures(void) return GCaps.max_textures; } +int GPU_max_work_group_count(int index) +{ + return GCaps.max_work_group_count[index]; +} + +int GPU_max_work_group_size(int index) +{ + return 
GCaps.max_work_group_size[index]; +} + int GPU_max_uniforms_vert(void) { return GCaps.max_uniforms_vert; diff --git a/source/blender/gpu/intern/gpu_capabilities_private.hh b/source/blender/gpu/intern/gpu_capabilities_private.hh index ee7ef1e69e6..95cf7fd335d 100644 --- a/source/blender/gpu/intern/gpu_capabilities_private.hh +++ b/source/blender/gpu/intern/gpu_capabilities_private.hh @@ -41,6 +41,8 @@ struct GPUCapabilities { int max_textures_vert = 0; int max_textures_geom = 0; int max_textures_frag = 0; + int max_work_group_count[3] = {0, 0, 0}; + int max_work_group_size[3] = {0, 0, 0}; int max_uniforms_vert = 0; int max_uniforms_frag = 0; int max_batch_indices = 0; diff --git a/source/blender/gpu/opengl/gl_backend.cc b/source/blender/gpu/opengl/gl_backend.cc index fb03a2c2d2a..d85f9f7684d 100644 --- a/source/blender/gpu/opengl/gl_backend.cc +++ b/source/blender/gpu/opengl/gl_backend.cc @@ -438,6 +438,14 @@ void GLBackend::capabilities_init() GCaps.mem_stats_support = GLEW_NVX_gpu_memory_info || GLEW_ATI_meminfo; GCaps.shader_image_load_store_support = GLEW_ARB_shader_image_load_store; GCaps.compute_shader_support = GLEW_ARB_compute_shader; + if (GCaps.compute_shader_support) { + glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_COUNT, 0, &GCaps.max_work_group_count[0]); + glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_COUNT, 1, &GCaps.max_work_group_count[1]); + glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_COUNT, 2, &GCaps.max_work_group_count[2]); + glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_SIZE, 0, &GCaps.max_work_group_size[0]); + glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_SIZE, 1, &GCaps.max_work_group_size[1]); + glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_SIZE, 2, &GCaps.max_work_group_size[2]); + } GCaps.shader_storage_buffer_objects_support = GLEW_ARB_shader_storage_buffer_object; /* GL specific capabilities. */ glGetIntegerv(GL_MAX_3D_TEXTURE_SIZE, &GLContext::max_texture_3d_size); -- cgit v1.2.3