Diffstat (limited to 'source/blender/gpu/intern/gpu_codegen.cc')
-rw-r--r--   source/blender/gpu/intern/gpu_codegen.cc   177
1 file changed, 147 insertions, 30 deletions
diff --git a/source/blender/gpu/intern/gpu_codegen.cc b/source/blender/gpu/intern/gpu_codegen.cc
index 4a45a3e63ed..fdd0fe97a02 100644
--- a/source/blender/gpu/intern/gpu_codegen.cc
+++ b/source/blender/gpu/intern/gpu_codegen.cc
@@ -11,6 +11,7 @@
 
 #include "DNA_customdata_types.h"
 #include "DNA_image_types.h"
+#include "DNA_material_types.h"
 
 #include "BLI_ghash.h"
 #include "BLI_hash_mm2a.h"
@@ -20,6 +21,7 @@
 
 #include "PIL_time.h"
 
+#include "BKE_cryptomatte.hh"
 #include "BKE_material.h"
 
 #include "GPU_capabilities.h"
@@ -93,6 +95,9 @@ struct GPUPass {
   uint32_t hash;
   /** Did we already tried to compile the attached GPUShader. */
   bool compiled;
+  /** Hint that an optimized variant of this pass should be created based on a complexity
+   * heuristic during pass code generation. */
+  bool should_optimize;
 };
 
 /* -------------------------------------------------------------------- */
@@ -208,9 +213,10 @@ static std::ostream &operator<<(std::ostream &stream, const GPUConstant *input)
   stream << input->type << "(";
   for (int i = 0; i < input->type; i++) {
     char formated_float[32];
-    /* Print with the maximum precision for single precision float using scientific notation.
-     * See https://stackoverflow.com/questions/16839658/#answer-21162120 */
-    SNPRINTF(formated_float, "%.9g", input->vec[i]);
+    /* Use uint representation to allow exact same bit pattern even if NaN. This is because we
+     * can pass UINTs as floats for constants. */
+    const uint32_t *uint_vec = reinterpret_cast<const uint32_t *>(input->vec);
+    SNPRINTF(formated_float, "uintBitsToFloat(%uu)", uint_vec[i]);
     stream << formated_float;
     if (i < input->type - 1) {
       stream << ", ";
@@ -237,6 +243,12 @@ class GPUCodegen {
   uint32_t hash_ = 0;
   BLI_HashMurmur2A hm2a_;
   ListBase ubo_inputs_ = {nullptr, nullptr};
+  GPUInput *cryptomatte_input_ = nullptr;
+
+  /** Cache parameters for complexity heuristic. */
+  uint nodes_total_ = 0;
+  uint textures_total_ = 0;
+  uint uniforms_total_ = 0;
 
  public:
   GPUCodegen(GPUMaterial *mat_, GPUNodeGraph *graph_) : mat(*mat_), graph(*graph_)
@@ -259,12 +271,15 @@
     MEM_SAFE_FREE(output.volume);
     MEM_SAFE_FREE(output.thickness);
     MEM_SAFE_FREE(output.displacement);
+    MEM_SAFE_FREE(output.composite);
     MEM_SAFE_FREE(output.material_functions);
+    MEM_SAFE_FREE(cryptomatte_input_);
     delete create_info;
     BLI_freelistN(&ubo_inputs_);
   };
 
   void generate_graphs();
+  void generate_cryptomatte();
   void generate_uniform_buffer();
   void generate_attribs();
   void generate_resources();
@@ -275,6 +290,14 @@
     return hash_;
   }
 
+  /* Heuristic determined during pass codegen for whether a
+   * more optimal variant of this material should be compiled. */
+  bool should_optimize_heuristic() const
+  {
+    bool do_optimize = (nodes_total_ >= 100 || textures_total_ >= 4 || uniforms_total_ >= 64);
+    return do_optimize;
+  }
+
  private:
   void set_unique_ids();
@@ -352,37 +375,68 @@ void GPUCodegen::generate_resources()
 {
   GPUCodegenCreateInfo &info = *create_info;
 
+  /* Ref. T98190: Defines are optimizations for old compilers.
+   * Might become unnecessary with EEVEE-Next. */
+  if (GPU_material_flag_get(&mat, GPU_MATFLAG_PRINCIPLED_CLEARCOAT)) {
+    info.define("PRINCIPLED_CLEARCOAT");
+  }
+  if (GPU_material_flag_get(&mat, GPU_MATFLAG_PRINCIPLED_METALLIC)) {
+    info.define("PRINCIPLED_METALLIC");
+  }
+  if (GPU_material_flag_get(&mat, GPU_MATFLAG_PRINCIPLED_DIELECTRIC)) {
+    info.define("PRINCIPLED_DIELECTRIC");
+  }
+  if (GPU_material_flag_get(&mat, GPU_MATFLAG_PRINCIPLED_GLASS)) {
+    info.define("PRINCIPLED_GLASS");
+  }
+  if (GPU_material_flag_get(&mat, GPU_MATFLAG_PRINCIPLED_ANY)) {
+    info.define("PRINCIPLED_ANY");
+  }
+
   std::stringstream ss;
 
   /* Textures. */
+  int slot = 0;
   LISTBASE_FOREACH (GPUMaterialTexture *, tex, &graph.textures) {
     if (tex->colorband) {
       const char *name = info.name_buffer.append_sampler_name(tex->sampler_name);
-      info.sampler(0, ImageType::FLOAT_1D_ARRAY, name, Frequency::BATCH);
+      info.sampler(slot++, ImageType::FLOAT_1D_ARRAY, name, Frequency::BATCH);
     }
-    else if (tex->tiled_mapping_name[0] != '\0') {
+    else if (tex->sky) {
       const char *name = info.name_buffer.append_sampler_name(tex->sampler_name);
       info.sampler(0, ImageType::FLOAT_2D_ARRAY, name, Frequency::BATCH);
+    }
+    else if (tex->tiled_mapping_name[0] != '\0') {
+      const char *name = info.name_buffer.append_sampler_name(tex->sampler_name);
+      info.sampler(slot++, ImageType::FLOAT_2D_ARRAY, name, Frequency::BATCH);
       const char *name_mapping = info.name_buffer.append_sampler_name(tex->tiled_mapping_name);
-      info.sampler(0, ImageType::FLOAT_1D_ARRAY, name_mapping, Frequency::BATCH);
+      info.sampler(slot++, ImageType::FLOAT_1D_ARRAY, name_mapping, Frequency::BATCH);
     }
     else {
      const char *name = info.name_buffer.append_sampler_name(tex->sampler_name);
-      info.sampler(0, ImageType::FLOAT_2D, name, Frequency::BATCH);
+      info.sampler(slot++, ImageType::FLOAT_2D, name, Frequency::BATCH);
    }
  }
 
+  /* Increment heuristic. */
+  textures_total_ = slot;
+
   if (!BLI_listbase_is_empty(&ubo_inputs_)) {
     /* NOTE: generate_uniform_buffer() should have sorted the inputs before this. */
     ss << "struct NodeTree {\n";
     LISTBASE_FOREACH (LinkData *, link, &ubo_inputs_) {
       GPUInput *input = (GPUInput *)(link->data);
-      ss << input->type << " u" << input->id << ";\n";
+      if (input->source == GPU_SOURCE_CRYPTOMATTE) {
+        ss << input->type << " crypto_hash;\n";
+      }
+      else {
+        ss << input->type << " u" << input->id << ";\n";
+      }
     }
     ss << "};\n\n";
 
-    info.uniform_buf(0, "NodeTree", GPU_UBO_BLOCK_NAME, Frequency::BATCH);
+    info.uniform_buf(1, "NodeTree", GPU_UBO_BLOCK_NAME, Frequency::BATCH);
   }
 
   if (!BLI_listbase_is_empty(&graph.uniform_attrs.list)) {
@@ -394,7 +448,7 @@ void GPUCodegen::generate_resources()
 
     /* TODO(fclem): Use the macro for length. Currently not working for EEVEE. */
     /* DRW_RESOURCE_CHUNK_LEN = 512 */
-    info.uniform_buf(0, "UniformAttrs", GPU_ATTRIBUTE_UBO_BLOCK_NAME "[512]", Frequency::BATCH);
+    info.uniform_buf(2, "UniformAttrs", GPU_ATTRIBUTE_UBO_BLOCK_NAME "[512]", Frequency::BATCH);
   }
 
   info.typedef_source_generated = ss.str();
@@ -405,11 +459,16 @@ void GPUCodegen::generate_library()
   GPUCodegenCreateInfo &info = *create_info;
 
   void *value;
-  GSetIterState pop_state = {};
-  while (BLI_gset_pop(graph.used_libraries, &pop_state, &value)) {
+  /* Iterate over libraries. We need to keep this struct intact in case
+   * it is required for the optimization pass. */
+  GHashIterator *ihash = BLI_ghashIterator_new((GHash *)graph.used_libraries);
+  while (!BLI_ghashIterator_done(ihash)) {
+    value = BLI_ghashIterator_getKey(ihash);
     auto deps = gpu_shader_dependency_get_resolved_source((const char *)value);
     info.dependencies_generated.extend_non_duplicates(deps);
+    BLI_ghashIterator_step(ihash);
   }
+  BLI_ghashIterator_free(ihash);
 }
 
 void GPUCodegen::node_serialize(std::stringstream &eval_ss, const GPUNode *node)
@@ -477,6 +536,9 @@ void GPUCodegen::node_serialize(std::stringstream &eval_ss, const GPUNode *node)
     }
   }
   eval_ss << ");\n\n";
+
+  /* Increment heuristic. */
+  nodes_total_++;
 }
 
 char *GPUCodegen::graph_serialize(eGPUNodeTag tree_tag, GPUNodeLink *output_link)
@@ -514,6 +576,24 @@ char *GPUCodegen::graph_serialize(eGPUNodeTag tree_tag)
   return eval_c_str;
 }
 
+void GPUCodegen::generate_cryptomatte()
+{
+  cryptomatte_input_ = static_cast<GPUInput *>(MEM_callocN(sizeof(GPUInput), __func__));
+  cryptomatte_input_->type = GPU_FLOAT;
+  cryptomatte_input_->source = GPU_SOURCE_CRYPTOMATTE;
+
+  float material_hash = 0.0f;
+  Material *material = GPU_material_get_material(&mat);
+  if (material) {
+    blender::bke::cryptomatte::CryptomatteHash hash(material->id.name,
+                                                    BLI_strnlen(material->id.name, MAX_NAME - 2));
+    material_hash = hash.float_encoded();
+  }
+  cryptomatte_input_->vec[0] = material_hash;
+
+  BLI_addtail(&ubo_inputs_, BLI_genericNodeN(cryptomatte_input_));
+}
+
 void GPUCodegen::generate_uniform_buffer()
 {
   /* Extract uniform inputs. */
@@ -522,6 +602,7 @@
     if (input->source == GPU_SOURCE_UNIFORM && !input->link) {
       /* We handle the UBO uniforms separately. */
       BLI_addtail(&ubo_inputs_, BLI_genericNodeN(input));
+      uniforms_total_++;
     }
   }
 }
@@ -549,6 +630,7 @@ void GPUCodegen::generate_graphs()
 {
   set_unique_ids();
 
+  /* Serialize graph. */
   output.surface = graph_serialize(GPU_NODE_TAG_SURFACE | GPU_NODE_TAG_AOV, graph.outlink_surface);
   output.volume = graph_serialize(GPU_NODE_TAG_VOLUME, graph.outlink_volume);
   output.displacement = graph_serialize(GPU_NODE_TAG_DISPLACEMENT, graph.outlink_displacement);
@@ -584,33 +666,51 @@
 GPUPass *GPU_generate_pass(GPUMaterial *material,
                            GPUNodeGraph *graph,
                            GPUCodegenCallbackFn finalize_source_cb,
-                           void *thunk)
+                           void *thunk,
+                           bool optimize_graph)
 {
   gpu_node_graph_prune_unused(graph);
 
+  /* If the optimize flag is passed in, we are generating an optimized
+   * variant of the GPUMaterial's GPUPass. */
+  if (optimize_graph) {
+    gpu_node_graph_optimize(graph);
+  }
+
   /* Extract attributes before compiling so the generated VBOs are ready to accept the future
    * shader. */
   gpu_node_graph_finalize_uniform_attrs(graph);
 
   GPUCodegen codegen(material, graph);
   codegen.generate_graphs();
-  codegen.generate_uniform_buffer();
-
-  /* Cache lookup: Reuse shaders already compiled. */
-  GPUPass *pass_hash = gpu_pass_cache_lookup(codegen.hash_get());
-
-  /* FIXME(fclem): This is broken. Since we only check for the hash and not the full source
-   * there is no way to have a collision currently. Some advocated to only use a bigger hash. */
-  if (pass_hash && (pass_hash->next == nullptr || pass_hash->next->hash != codegen.hash_get())) {
-    if (!gpu_pass_is_valid(pass_hash)) {
-      /* Shader has already been created but failed to compile. */
-      return nullptr;
+  codegen.generate_cryptomatte();
+
+  GPUPass *pass_hash = nullptr;
+
+  if (!optimize_graph) {
+    /* The optimized version of the shader should not re-generate a UBO.
+     * The UBO will not be used for this variant. */
+    codegen.generate_uniform_buffer();
+
+    /** Cache lookup: Reuse shaders already compiled.
+     * NOTE: We only perform cache look-up for non-optimized shader
+     * graphs, as baked constant data among other optimizations will generate too many
+     * shader source permutations, with minimal re-usability. */
+    pass_hash = gpu_pass_cache_lookup(codegen.hash_get());
+
+    /* FIXME(fclem): This is broken. Since we only check for the hash and not the full source
+     * there is no way to have a collision currently. Some advocated to only use a bigger hash. */
+    if (pass_hash && (pass_hash->next == nullptr || pass_hash->next->hash != codegen.hash_get())) {
+      if (!gpu_pass_is_valid(pass_hash)) {
+        /* Shader has already been created but failed to compile. */
+        return nullptr;
+      }
+      /* No collision, just return the pass. */
+      BLI_spin_lock(&pass_cache_spin);
+      pass_hash->refcount += 1;
+      BLI_spin_unlock(&pass_cache_spin);
+      return pass_hash;
     }
-    /* No collision, just return the pass. */
-    BLI_spin_lock(&pass_cache_spin);
-    pass_hash->refcount += 1;
-    BLI_spin_unlock(&pass_cache_spin);
-    return pass_hash;
   }
 
   /* Either the shader is not compiled or there is a hash collision...
@@ -648,14 +748,31 @@ GPUPass *GPU_generate_pass(GPUMaterial *material,
     pass->create_info = codegen.create_info;
     pass->hash = codegen.hash_get();
     pass->compiled = false;
+    /* Only flag pass optimization hint if this is the first generated pass for a material.
+     * Optimized passes cannot be optimized further, even if the heuristic is still not
+     * favorable. */
+    pass->should_optimize = (!optimize_graph) && codegen.should_optimize_heuristic();
 
     codegen.create_info = nullptr;
 
-    gpu_pass_cache_insert_after(pass_hash, pass);
+    /* Only insert non-optimized graphs into cache.
+     * Optimized graphs will continuously be recompiled with new unique source during material
+     * editing, thus causing the cache to fill up quickly with materials offering minimal
+     * re-use. */
+    if (!optimize_graph) {
+      gpu_pass_cache_insert_after(pass_hash, pass);
+    }
   }
 
   return pass;
 }
 
+bool GPU_pass_should_optimize(GPUPass *pass)
+{
+  /* Returns optimization heuristic prepared during
+   * initial codegen. */
+  return pass->should_optimize;
+}
+
 /** \} */
 
 /* -------------------------------------------------------------------- */
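
Note on the constant-printing change: "%.9g" round-trips finite floats, but it cannot preserve NaN payloads or guarantee bit-exact reconstruction by every GLSL compiler, while emitting the raw bits through uintBitsToFloat() can. A minimal standalone sketch of the same idea, using plain snprintf/memcpy instead of Blender's SNPRINTF and GPUConstant types:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    /* Emit a GLSL expression that reconstructs `f` with the exact same bit
     * pattern, including NaN payloads. memcpy is the strictly portable way to
     * read the bits; the patch above uses reinterpret_cast on the whole
     * constant vector instead. */
    static void emit_float_constant(float f, char *buf, size_t buf_len)
    {
      uint32_t bits;
      std::memcpy(&bits, &f, sizeof(bits));
      std::snprintf(buf, buf_len, "uintBitsToFloat(%uu)", bits);
    }

For example, 1.0f (bit pattern 0x3F800000) is emitted as uintBitsToFloat(1065353216u).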
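Note on generate_cryptomatte(): the material name hash is stored "float encoded" so it can travel through the NodeTree UBO as an ordinary float. The published Cryptomatte specification defines this encoding as reinterpreting the 32-bit hash as an IEEE-754 float after nudging the exponent away from the all-zero and all-one patterns, so the result is never zero, denormal, infinite, or NaN. A sketch based on that specification (the actual implementation used here is CryptomatteHash::float_encoded() from BKE_cryptomatte.hh):

    #include <cstdint>
    #include <cstring>

    static float cryptomatte_hash_to_float(uint32_t hash)
    {
      const uint32_t exponent = (hash >> 23) & 0xFFu;
      if (exponent == 0u || exponent == 0xFFu) {
        /* Toggle the lowest exponent bit: 0 -> 1 and 255 -> 254, avoiding
         * zero/denormal and inf/NaN encodings respectively. */
        hash ^= 1u << 23;
      }
      float f;
      std::memcpy(&f, &hash, sizeof(f));
      return f;
    }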
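Note on the new optimize_graph parameter and GPU_pass_should_optimize(): together they enable a two-stage compile, where a quickly compiled generic pass ships first and an optimized variant (with gpu_node_graph_optimize() applied and no cache entry) is generated later if the heuristic says it is worth the cost. A hypothetical caller-side sketch; only GPU_generate_pass() and GPU_pass_should_optimize() come from this patch, the wrapper and swap logic are illustrative:

    /* Hypothetical consumer of the optimization hint. */
    static void compile_material_pass(GPUMaterial *material,
                                      GPUNodeGraph *graph,
                                      GPUCodegenCallbackFn cb,
                                      void *thunk)
    {
      GPUPass *pass = GPU_generate_pass(material, graph, cb, thunk, /*optimize_graph=*/false);
      if (pass != nullptr && GPU_pass_should_optimize(pass)) {
        /* The un-optimized pass can render immediately; the optimized variant
         * compiles in the background and would be swapped in once ready. */
        GPUPass *optimized = GPU_generate_pass(material, graph, cb, thunk, /*optimize_graph=*/true);
        (void)optimized; /* Swap-in logic omitted. */
      }
    }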