Diffstat (limited to 'source/blender/gpu/intern/gpu_codegen.cc')
-rw-r--r--   source/blender/gpu/intern/gpu_codegen.cc   177
1 file changed, 147 insertions, 30 deletions
diff --git a/source/blender/gpu/intern/gpu_codegen.cc b/source/blender/gpu/intern/gpu_codegen.cc
index 4a45a3e63ed..fdd0fe97a02 100644
--- a/source/blender/gpu/intern/gpu_codegen.cc
+++ b/source/blender/gpu/intern/gpu_codegen.cc
@@ -11,6 +11,7 @@
 
 #include "DNA_customdata_types.h"
 #include "DNA_image_types.h"
+#include "DNA_material_types.h"
 
 #include "BLI_ghash.h"
 #include "BLI_hash_mm2a.h"
@@ -20,6 +21,7 @@
 
 #include "PIL_time.h"
 
+#include "BKE_cryptomatte.hh"
 #include "BKE_material.h"
 
 #include "GPU_capabilities.h"
@@ -93,6 +95,9 @@ struct GPUPass {
   uint32_t hash;
   /** Did we already tried to compile the attached GPUShader. */
   bool compiled;
+  /** Hint that an optimized variant of this pass should be created based on a complexity
+   * heuristic during pass code generation. */
+  bool should_optimize;
 };
 
 /* -------------------------------------------------------------------- */
@@ -208,9 +213,10 @@ static std::ostream &operator<<(std::ostream &stream, const GPUConstant *input)
   stream << input->type << "(";
   for (int i = 0; i < input->type; i++) {
     char formated_float[32];
-    /* Print with the maximum precision for single precision float using scientific notation.
-     * See https://stackoverflow.com/questions/16839658/#answer-21162120 */
-    SNPRINTF(formated_float, "%.9g", input->vec[i]);
+    /* Use uint representation to allow exact same bit pattern even if NaN. This is because we
+     * can pass UINTs as floats for constants. */
+    const uint32_t *uint_vec = reinterpret_cast<const uint32_t *>(input->vec);
+    SNPRINTF(formated_float, "uintBitsToFloat(%uu)", uint_vec[i]);
     stream << formated_float;
     if (i < input->type - 1) {
       stream << ", ";
@@ -237,6 +243,12 @@ class GPUCodegen {
   uint32_t hash_ = 0;
   BLI_HashMurmur2A hm2a_;
   ListBase ubo_inputs_ = {nullptr, nullptr};
+  GPUInput *cryptomatte_input_ = nullptr;
+
+  /** Cache parameters for complexity heuristic. */
+  uint nodes_total_ = 0;
+  uint textures_total_ = 0;
+  uint uniforms_total_ = 0;
 
  public:
   GPUCodegen(GPUMaterial *mat_, GPUNodeGraph *graph_) : mat(*mat_), graph(*graph_)
@@ -259,12 +271,15 @@
     MEM_SAFE_FREE(output.volume);
     MEM_SAFE_FREE(output.thickness);
     MEM_SAFE_FREE(output.displacement);
+    MEM_SAFE_FREE(output.composite);
     MEM_SAFE_FREE(output.material_functions);
+    MEM_SAFE_FREE(cryptomatte_input_);
     delete create_info;
     BLI_freelistN(&ubo_inputs_);
   };
 
   void generate_graphs();
+  void generate_cryptomatte();
   void generate_uniform_buffer();
   void generate_attribs();
   void generate_resources();
@@ -275,6 +290,14 @@
     return hash_;
   }
 
+  /* Heuristic determined during pass codegen for whether a
+   * more optimal variant of this material should be compiled. */
+  bool should_optimize_heuristic() const
+  {
+    bool do_optimize = (nodes_total_ >= 100 || textures_total_ >= 4 || uniforms_total_ >= 64);
+    return do_optimize;
+  }
+
  private:
   void set_unique_ids();
@@ -352,37 +375,68 @@ void GPUCodegen::generate_resources()
 {
   GPUCodegenCreateInfo &info = *create_info;
 
+  /* Ref. T98190: Defines are optimizations for old compilers.
+   * Might become unnecessary with EEVEE-Next. */
+  if (GPU_material_flag_get(&mat, GPU_MATFLAG_PRINCIPLED_CLEARCOAT)) {
+    info.define("PRINCIPLED_CLEARCOAT");
+  }
+  if (GPU_material_flag_get(&mat, GPU_MATFLAG_PRINCIPLED_METALLIC)) {
+    info.define("PRINCIPLED_METALLIC");
+  }
+  if (GPU_material_flag_get(&mat, GPU_MATFLAG_PRINCIPLED_DIELECTRIC)) {
+    info.define("PRINCIPLED_DIELECTRIC");
+  }
+  if (GPU_material_flag_get(&mat, GPU_MATFLAG_PRINCIPLED_GLASS)) {
+    info.define("PRINCIPLED_GLASS");
+  }
+  if (GPU_material_flag_get(&mat, GPU_MATFLAG_PRINCIPLED_ANY)) {
+    info.define("PRINCIPLED_ANY");
+  }
+
   std::stringstream ss;
 
   /* Textures. */
+  int slot = 0;
   LISTBASE_FOREACH (GPUMaterialTexture *, tex, &graph.textures) {
     if (tex->colorband) {
       const char *name = info.name_buffer.append_sampler_name(tex->sampler_name);
-      info.sampler(0, ImageType::FLOAT_1D_ARRAY, name, Frequency::BATCH);
+      info.sampler(slot++, ImageType::FLOAT_1D_ARRAY, name, Frequency::BATCH);
     }
-    else if (tex->tiled_mapping_name[0] != '\0') {
+    else if (tex->sky) {
       const char *name = info.name_buffer.append_sampler_name(tex->sampler_name);
       info.sampler(0, ImageType::FLOAT_2D_ARRAY, name, Frequency::BATCH);
+    }
+    else if (tex->tiled_mapping_name[0] != '\0') {
+      const char *name = info.name_buffer.append_sampler_name(tex->sampler_name);
+      info.sampler(slot++, ImageType::FLOAT_2D_ARRAY, name, Frequency::BATCH);
       const char *name_mapping = info.name_buffer.append_sampler_name(tex->tiled_mapping_name);
-      info.sampler(0, ImageType::FLOAT_1D_ARRAY, name_mapping, Frequency::BATCH);
+      info.sampler(slot++, ImageType::FLOAT_1D_ARRAY, name_mapping, Frequency::BATCH);
     }
     else {
      const char *name = info.name_buffer.append_sampler_name(tex->sampler_name);
-      info.sampler(0, ImageType::FLOAT_2D, name, Frequency::BATCH);
+      info.sampler(slot++, ImageType::FLOAT_2D, name, Frequency::BATCH);
    }
  }
 
+  /* Increment heuristic. */
+  textures_total_ = slot;
+
   if (!BLI_listbase_is_empty(&ubo_inputs_)) {
     /* NOTE: generate_uniform_buffer() should have sorted the inputs before this. */
     ss << "struct NodeTree {\n";
     LISTBASE_FOREACH (LinkData *, link, &ubo_inputs_) {
       GPUInput *input = (GPUInput *)(link->data);
-      ss << input->type << " u" << input->id << ";\n";
+      if (input->source == GPU_SOURCE_CRYPTOMATTE) {
+        ss << input->type << " crypto_hash;\n";
+      }
+      else {
+        ss << input->type << " u" << input->id << ";\n";
+      }
     }
     ss << "};\n\n";
 
-    info.uniform_buf(0, "NodeTree", GPU_UBO_BLOCK_NAME, Frequency::BATCH);
+    info.uniform_buf(1, "NodeTree", GPU_UBO_BLOCK_NAME, Frequency::BATCH);
   }
 
   if (!BLI_listbase_is_empty(&graph.uniform_attrs.list)) {
@@ -394,7 +448,7 @@ void GPUCodegen::generate_resources()
 
     /* TODO(fclem): Use the macro for length. Currently not working for EEVEE. */
     /* DRW_RESOURCE_CHUNK_LEN = 512 */
-    info.uniform_buf(0, "UniformAttrs", GPU_ATTRIBUTE_UBO_BLOCK_NAME "[512]", Frequency::BATCH);
+    info.uniform_buf(2, "UniformAttrs", GPU_ATTRIBUTE_UBO_BLOCK_NAME "[512]", Frequency::BATCH);
   }
 
   info.typedef_source_generated = ss.str();
@@ -405,11 +459,16 @@ void GPUCodegen::generate_library()
   GPUCodegenCreateInfo &info = *create_info;
 
   void *value;
-  GSetIterState pop_state = {};
-  while (BLI_gset_pop(graph.used_libraries, &pop_state, &value)) {
+  /* Iterate over libraries. We need to keep this struct intact in case
+   * it is required for the optimization pass. */
+  GHashIterator *ihash = BLI_ghashIterator_new((GHash *)graph.used_libraries);
+  while (!BLI_ghashIterator_done(ihash)) {
+    value = BLI_ghashIterator_getKey(ihash);
     auto deps = gpu_shader_dependency_get_resolved_source((const char *)value);
     info.dependencies_generated.extend_non_duplicates(deps);
+    BLI_ghashIterator_step(ihash);
   }
+  BLI_ghashIterator_free(ihash);
 }
 
 void GPUCodegen::node_serialize(std::stringstream &eval_ss, const GPUNode *node)
@@ -477,6 +536,9 @@ void GPUCodegen::node_serialize(std::stringstream &eval_ss, const GPUNode *node)
     }
   }
   eval_ss << ");\n\n";
+
+  /* Increment heuristic. */
+  nodes_total_++;
 }
 
 char *GPUCodegen::graph_serialize(eGPUNodeTag tree_tag, GPUNodeLink *output_link)
@@ -514,6 +576,24 @@ char *GPUCodegen::graph_serialize(eGPUNodeTag tree_tag)
   return eval_c_str;
 }
 
+void GPUCodegen::generate_cryptomatte()
+{
+  cryptomatte_input_ = static_cast<GPUInput *>(MEM_callocN(sizeof(GPUInput), __func__));
+  cryptomatte_input_->type = GPU_FLOAT;
+  cryptomatte_input_->source = GPU_SOURCE_CRYPTOMATTE;
+
+  float material_hash = 0.0f;
+  Material *material = GPU_material_get_material(&mat);
+  if (material) {
+    blender::bke::cryptomatte::CryptomatteHash hash(material->id.name,
+                                                    BLI_strnlen(material->id.name, MAX_NAME - 2));
+    material_hash = hash.float_encoded();
+  }
+  cryptomatte_input_->vec[0] = material_hash;
+
+  BLI_addtail(&ubo_inputs_, BLI_genericNodeN(cryptomatte_input_));
+}
+
 void GPUCodegen::generate_uniform_buffer()
 {
   /* Extract uniform inputs. */
@@ -522,6 +602,7 @@
     if (input->source == GPU_SOURCE_UNIFORM && !input->link) {
       /* We handle the UBO uniforms separately. */
       BLI_addtail(&ubo_inputs_, BLI_genericNodeN(input));
+      uniforms_total_++;
     }
   }
 }
@@ -549,6 +630,7 @@ void GPUCodegen::generate_graphs()
 {
   set_unique_ids();
 
+  /* Serialize graph. */
   output.surface = graph_serialize(GPU_NODE_TAG_SURFACE | GPU_NODE_TAG_AOV, graph.outlink_surface);
   output.volume = graph_serialize(GPU_NODE_TAG_VOLUME, graph.outlink_volume);
   output.displacement = graph_serialize(GPU_NODE_TAG_DISPLACEMENT, graph.outlink_displacement);
@@ -584,33 +666,51 @@
 GPUPass *GPU_generate_pass(GPUMaterial *material,
                            GPUNodeGraph *graph,
                            GPUCodegenCallbackFn finalize_source_cb,
-                           void *thunk)
+                           void *thunk,
+                           bool optimize_graph)
 {
   gpu_node_graph_prune_unused(graph);
 
+  /* If the optimize flag is passed in, we are generating an optimized
+   * variant of the GPUMaterial's GPUPass. */
+  if (optimize_graph) {
+    gpu_node_graph_optimize(graph);
+  }
+
   /* Extract attributes before compiling so the generated VBOs are ready to accept the future
    * shader. */
   gpu_node_graph_finalize_uniform_attrs(graph);
 
   GPUCodegen codegen(material, graph);
   codegen.generate_graphs();
-  codegen.generate_uniform_buffer();
-
-  /* Cache lookup: Reuse shaders already compiled. */
-  GPUPass *pass_hash = gpu_pass_cache_lookup(codegen.hash_get());
-
-  /* FIXME(fclem): This is broken. Since we only check for the hash and not the full source
-   * there is no way to have a collision currently. Some advocated to only use a bigger hash. */
-  if (pass_hash && (pass_hash->next == nullptr || pass_hash->next->hash != codegen.hash_get())) {
-    if (!gpu_pass_is_valid(pass_hash)) {
-      /* Shader has already been created but failed to compile. */
-      return nullptr;
+  codegen.generate_cryptomatte();
+
+  GPUPass *pass_hash = nullptr;
+
+  if (!optimize_graph) {
+    /* The optimized version of the shader should not re-generate a UBO.
+     * The UBO will not be used for this variant. */
+    codegen.generate_uniform_buffer();
+
+    /** Cache lookup: Reuse shaders already compiled.
+     * NOTE: We only perform cache look-up for non-optimized shader
+     * graphs, as baked constant data among other optimizations will generate too many
+     * shader source permutations, with minimal re-usability. */
+    pass_hash = gpu_pass_cache_lookup(codegen.hash_get());
+
+    /* FIXME(fclem): This is broken. Since we only check for the hash and not the full source
+     * there is no way to have a collision currently. Some advocated to only use a bigger hash. */
+    if (pass_hash && (pass_hash->next == nullptr || pass_hash->next->hash != codegen.hash_get())) {
+      if (!gpu_pass_is_valid(pass_hash)) {
+        /* Shader has already been created but failed to compile. */
+        return nullptr;
+      }
+      /* No collision, just return the pass. */
+      BLI_spin_lock(&pass_cache_spin);
+      pass_hash->refcount += 1;
+      BLI_spin_unlock(&pass_cache_spin);
+      return pass_hash;
     }
-    /* No collision, just return the pass. */
-    BLI_spin_lock(&pass_cache_spin);
-    pass_hash->refcount += 1;
-    BLI_spin_unlock(&pass_cache_spin);
-    return pass_hash;
   }
 
   /* Either the shader is not compiled or there is a hash collision...
@@ -648,14 +748,31 @@ GPUPass *GPU_generate_pass(GPUMaterial *material,
     pass->create_info = codegen.create_info;
     pass->hash = codegen.hash_get();
     pass->compiled = false;
+    /* Only flag pass optimization hint if this is the first generated pass for a material.
+     * Optimized passes cannot be optimized further, even if the heuristic is still not
+     * favorable. */
+    pass->should_optimize = (!optimize_graph) && codegen.should_optimize_heuristic();
 
     codegen.create_info = nullptr;
 
-    gpu_pass_cache_insert_after(pass_hash, pass);
+    /* Only insert non-optimized graphs into cache.
+     * Optimized graphs will continuously be recompiled with new unique source during material
+     * editing, thus causing the cache to fill up quickly with materials offering minimal
+     * re-use. */
+    if (!optimize_graph) {
+      gpu_pass_cache_insert_after(pass_hash, pass);
+    }
   }
 
   return pass;
 }
 
+bool GPU_pass_should_optimize(GPUPass *pass)
+{
+  /* Returns optimization heuristic prepared during
+   * initial codegen. */
+  return pass->should_optimize;
+}
+
 /** \} */
 
 /* -------------------------------------------------------------------- */
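
Note on the constant-printing change: "%.9g" round-trips finite floats, but it cannot preserve NaN payloads or guarantee bit-exact reconstruction by every GLSL compiler, while emitting the raw bits through uintBitsToFloat() can. A minimal standalone sketch of the same idea, using plain snprintf/memcpy instead of Blender's SNPRINTF and GPUConstant types:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    /* Emit a GLSL expression that reconstructs `f` with the exact same bit
     * pattern, including NaN payloads. memcpy is the strictly portable way to
     * read the bits; the patch above uses reinterpret_cast on the whole
     * constant vector instead. */
    static void emit_float_constant(float f, char *buf, size_t buf_len)
    {
      uint32_t bits;
      std::memcpy(&bits, &f, sizeof(bits));
      std::snprintf(buf, buf_len, "uintBitsToFloat(%uu)", bits);
    }

For example, 1.0f (bit pattern 0x3F800000) is emitted as uintBitsToFloat(1065353216u).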
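Note on generate_cryptomatte(): the material name hash is stored "float encoded" so it can travel through the NodeTree UBO as an ordinary float. The published Cryptomatte specification defines this encoding as reinterpreting the 32-bit hash as an IEEE-754 float after nudging the exponent away from the all-zero and all-one patterns, so the result is never zero, denormal, infinite, or NaN. A sketch based on that specification (the actual implementation used here is CryptomatteHash::float_encoded() from BKE_cryptomatte.hh):

    #include <cstdint>
    #include <cstring>

    static float cryptomatte_hash_to_float(uint32_t hash)
    {
      const uint32_t exponent = (hash >> 23) & 0xFFu;
      if (exponent == 0u || exponent == 0xFFu) {
        /* Toggle the lowest exponent bit: 0 -> 1 and 255 -> 254, avoiding
         * zero/denormal and inf/NaN encodings respectively. */
        hash ^= 1u << 23;
      }
      float f;
      std::memcpy(&f, &hash, sizeof(f));
      return f;
    }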
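Note on the new optimize_graph parameter and GPU_pass_should_optimize(): together they enable a two-stage compile, where a quickly compiled generic pass ships first and an optimized variant (with gpu_node_graph_optimize() applied and no cache entry) is generated later if the heuristic says it is worth the cost. A hypothetical caller-side sketch; only GPU_generate_pass() and GPU_pass_should_optimize() come from this patch, the wrapper and swap logic are illustrative:

    /* Hypothetical consumer of the optimization hint. */
    static void compile_material_pass(GPUMaterial *material,
                                      GPUNodeGraph *graph,
                                      GPUCodegenCallbackFn cb,
                                      void *thunk)
    {
      GPUPass *pass = GPU_generate_pass(material, graph, cb, thunk, /*optimize_graph=*/false);
      if (pass != nullptr && GPU_pass_should_optimize(pass)) {
        /* The un-optimized pass can render immediately; the optimized variant
         * compiles in the background and would be swapped in once ready. */
        GPUPass *optimized = GPU_generate_pass(material, graph, cb, thunk, /*optimize_graph=*/true);
        (void)optimized; /* Swap-in logic omitted. */
      }
    }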