Cycles: code refactor to bake using regular render session and tiles

There should be no user visible change from this, except that tile size now affects performance. The goal here is to simplify bake denoising in D3099, letting it reuse more denoising tiles and pass code. A lot of code is now shared with regular rendering, with the two main differences being that we read some render result passes from the bake API when starting to render a tile, and call the bake kernel instead of the path trace kernel. With this kind of design where Cycles asks for tiles from the bake API, it should eventually be easier to reduce memory usage, show tiles as they are baked, or bake multiple passes at once, though there's still quite some work needed for that. Reviewers: #cycles Subscribers: monio, wmatyjewicz, lukasstockner97, michaelknubben Differential Revision: https://developer.blender.org/D3108
author: Brecht Van Lommel <brecht@blender.org> 2019-05-10 22:39:58 +0300
committer: Lukas Stockner <lukas.stockner@freenet.de> 2020-05-15 21:25:24 +0300
commit: d9773edaa394f61393f9c8b80275e62f74306097 (patch)
tree: 232b771b341e98a5403af16791bdcca133cb1edd /intern/cycles/kernel
parent: 3ff8ca60e94db2584ca76e323a54c738e677d5f8 (diff)
5 files changed, 113 insertions, 89 deletions
diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h
index f1fc697553a..2709a9da734 100644
--- a/intern/cycles/kernel/kernel_bake.h
+++ b/intern/cycles/kernel/kernel_bake.h
@@ -18,38 +18,40 @@ CCL_NAMESPACE_BEGIN
 
 #ifdef __BAKING__
 
-ccl_device_inline void compute_light_pass(
+ccl_device_noinline void compute_light_pass(
     KernelGlobals *kg, ShaderData *sd, PathRadiance *L, uint rng_hash, int pass_filter, int sample)
 {
   kernel_assert(kernel_data.film.use_light_pass);
 
-  PathRadiance L_sample;
-  PathState state;
-  Ray ray;
   float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
 
-  /* emission and indirect shader data memory used by various functions */
-  ShaderData emission_sd, indirect_sd;
-
-  ray.P = sd->P + sd->Ng;
-  ray.D = -sd->Ng;
-  ray.t = FLT_MAX;
-#  ifdef __CAMERA_MOTION__
-  ray.time = 0.5f;
-#  endif
+  /* Emission and indirect shader data memory used by various functions. */
+  ShaderDataTinyStorage emission_sd_storage;
+  ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
+  ShaderData indirect_sd;
 
-  /* init radiance */
-  path_radiance_init(kg, &L_sample);
+  /* Init radiance. */
+  path_radiance_init(kg, L);
 
-  /* init path state */
-  path_state_init(kg, &emission_sd, &state, rng_hash, sample, NULL);
+  /* Init path state. */
+  PathState state;
+  path_state_init(kg, emission_sd, &state, rng_hash, sample, NULL);
 
-  /* evaluate surface shader */
+  /* Evaluate surface shader. */
   shader_eval_surface(kg, sd, &state, NULL, state.flag);
 
   /* TODO, disable more closures we don't need besides transparent */
   shader_bsdf_disable_transparency(kg, sd);
 
+  /* Init ray. */
+  Ray ray;
+  ray.P = sd->P + sd->Ng;
+  ray.D = -sd->Ng;
+  ray.t = FLT_MAX;
+#  ifdef __CAMERA_MOTION__
+  ray.time = 0.5f;
+#  endif
+
 #  ifdef __BRANCHED_PATH__
   if (!kernel_data.integrator.branched) {
     /* regular path tracer */
@@ -57,14 +59,13 @@ ccl_device_inline void compute_light_pass(
 
     /* sample ambient occlusion */
     if (pass_filter & BAKE_FILTER_AO) {
-      kernel_path_ao(
-          kg, sd, &emission_sd, &L_sample, &state, throughput, shader_bsdf_alpha(kg, sd));
+      kernel_path_ao(kg, sd, emission_sd, L, &state, throughput, shader_bsdf_alpha(kg, sd));
     }
 
     /* sample emission */
     if ((pass_filter & BAKE_FILTER_EMISSION) && (sd->flag & SD_EMISSION)) {
       float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf);
-      path_radiance_accum_emission(kg, &L_sample, &state, throughput, emission);
+      path_radiance_accum_emission(kg, L, &state, throughput, emission);
     }
 
     bool is_sss_sample = false;
@@ -77,12 +78,10 @@ ccl_device_inline void compute_light_pass(
       SubsurfaceIndirectRays ss_indirect;
       kernel_path_subsurface_init_indirect(&ss_indirect);
       if (kernel_path_subsurface_scatter(
-              kg, sd, &emission_sd, &L_sample, &state, &ray, &throughput, &ss_indirect)) {
+              kg, sd, emission_sd, L, &state, &ray, &throughput, &ss_indirect)) {
         while (ss_indirect.num_rays) {
-          kernel_path_subsurface_setup_indirect(
-              kg, &ss_indirect, &state, &ray, &L_sample, &throughput);
-          kernel_path_indirect(
-              kg, &indirect_sd, &emission_sd, &ray, throughput, &state, &L_sample);
+          kernel_path_subsurface_setup_indirect(kg, &ss_indirect, &state, &ray, L, &throughput);
+          kernel_path_indirect(kg, &indirect_sd, emission_sd, &ray, throughput, &state, L);
         }
         is_sss_sample = true;
       }
@@ -91,18 +90,18 @@ ccl_device_inline void compute_light_pass(
 
     /* sample light and BSDF */
     if (!is_sss_sample && (pass_filter & (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT))) {
-      kernel_path_surface_connect_light(kg, sd, &emission_sd, throughput, &state, &L_sample);
+      kernel_path_surface_connect_light(kg, sd, emission_sd, throughput, &state, L);
 
-      if (kernel_path_surface_bounce(kg, sd, &throughput, &state, &L_sample.state, &ray)) {
+      if (kernel_path_surface_bounce(kg, sd, &throughput, &state, &L->state, &ray)) {
 #  ifdef __LAMP_MIS__
         state.ray_t = 0.0f;
 #  endif
         /* compute indirect light */
-        kernel_path_indirect(kg, &indirect_sd, &emission_sd, &ray, throughput, &state, &L_sample);
+        kernel_path_indirect(kg, &indirect_sd, emission_sd, &ray, throughput, &state, L);
 
         /* sum and reset indirect light pass variables for the next samples */
-        path_radiance_sum_indirect(&L_sample);
-        path_radiance_reset_indirect(&L_sample);
+        path_radiance_sum_indirect(L);
+        path_radiance_reset_indirect(L);
       }
     }
 #  ifdef __BRANCHED_PATH__
@@ -112,13 +111,13 @@ ccl_device_inline void compute_light_pass(
 
     /* sample ambient occlusion */
     if (pass_filter & BAKE_FILTER_AO) {
-      kernel_branched_path_ao(kg, sd, &emission_sd, &L_sample, &state, throughput);
+      kernel_branched_path_ao(kg, sd, emission_sd, L, &state, throughput);
     }
 
     /* sample emission */
     if ((pass_filter & BAKE_FILTER_EMISSION) && (sd->flag & SD_EMISSION)) {
       float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf);
-      path_radiance_accum_emission(kg, &L_sample, &state, throughput, emission);
+      path_radiance_accum_emission(kg, L, &state, throughput, emission);
     }
 
 #    ifdef __SUBSURFACE__
@@ -127,7 +126,7 @@ ccl_device_inline void compute_light_pass(
       /* When mixing BSSRDF and BSDF closures we should skip BSDF lighting
        * if scattering was successful. */
       kernel_branched_path_subsurface_scatter(
-          kg, sd, &indirect_sd, &emission_sd, &L_sample, &state, &ray, throughput);
+          kg, sd, &indirect_sd, emission_sd, L, &state, &ray, throughput);
     }
 #    endif
 
@@ -138,19 +137,16 @@ ccl_device_inline void compute_light_pass(
       if (kernel_data.integrator.use_direct_light) {
         int all = kernel_data.integrator.sample_all_lights_direct;
         kernel_branched_path_surface_connect_light(
-            kg, sd, &emission_sd, &state, throughput, 1.0f, &L_sample, all);
+            kg, sd, emission_sd, &state, throughput, 1.0f, L, all);
       }
 #    endif
 
       /* indirect light */
       kernel_branched_path_surface_indirect_light(
-          kg, sd, &indirect_sd, &emission_sd, throughput, 1.0f, &state, &L_sample);
+          kg, sd, &indirect_sd, emission_sd, throughput, 1.0f, &state, L);
     }
   }
 #  endif
-
-  /* accumulate into master L */
-  path_radiance_accum_sample(L, &L_sample);
 }
 
 /* this helps with AA but it's not the real solution as it does not AA the geometry
@@ -225,41 +221,28 @@ ccl_device float3 kernel_bake_evaluate_direct_indirect(KernelGlobals *kg,
   return out;
 }
 
-ccl_device void kernel_bake_evaluate(KernelGlobals *kg,
-                                     ccl_global uint4 *input,
-                                     ccl_global float4 *output,
-                                     ShaderEvalType type,
-                                     int pass_filter,
-                                     int i,
-                                     int offset,
-                                     int sample)
+ccl_device void kernel_bake_evaluate(
+    KernelGlobals *kg, ccl_global float *buffer, int sample, int x, int y, int offset, int stride)
 {
-  ShaderData sd;
-  PathState state = {0};
-  uint4 in = input[i * 2];
-  uint4 diff = input[i * 2 + 1];
-
-  float3 out = make_float3(0.0f, 0.0f, 0.0f);
+  /* Setup render buffers. */
+  const int index = offset + x + y * stride;
+  const int pass_stride = kernel_data.film.pass_stride;
+  buffer += index * pass_stride;
 
-  int object = in.x;
-  int prim = in.y;
+  ccl_global float *primitive = buffer + kernel_data.film.pass_bake_primitive;
+  ccl_global float *differential = buffer + kernel_data.film.pass_bake_differential;
+  ccl_global float *output = buffer + kernel_data.film.pass_combined;
 
+  int prim = __float_as_uint(primitive[1]);
   if (prim == -1)
     return;
 
-  float u = __uint_as_float(in.z);
-  float v = __uint_as_float(in.w);
-
-  float dudx = __uint_as_float(diff.x);
-  float dudy = __uint_as_float(diff.y);
-  float dvdx = __uint_as_float(diff.z);
-  float dvdy = __uint_as_float(diff.w);
+  prim += kernel_data.bake.tri_offset;
 
+  /* Random number generator. */
+  uint rng_hash = hash_uint2(x, y) ^ kernel_data.integrator.seed;
   int num_samples = kernel_data.integrator.aa_samples;
 
-  /* random number generator */
-  uint rng_hash = cmj_hash(offset + i, kernel_data.integrator.seed);
-
   float filter_x, filter_y;
   if (sample == 0) {
     filter_x = filter_y = 0.5f;
@@ -268,23 +251,29 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg,
     path_rng_2D(kg, rng_hash, sample, num_samples, PRNG_FILTER_U, &filter_x, &filter_y);
   }
 
-  /* subpixel u/v offset */
+  /* Barycentric UV with subpixel offset. */
+  float u = primitive[2];
+  float v = primitive[3];
+
+  float dudx = differential[0];
+  float dudy = differential[1];
+  float dvdx = differential[2];
+  float dvdy = differential[3];
+
   if (sample > 0) {
     u = bake_clamp_mirror_repeat(u + dudx * (filter_x - 0.5f) + dudy * (filter_y - 0.5f), 1.0f);
     v = bake_clamp_mirror_repeat(v + dvdx * (filter_x - 0.5f) + dvdy * (filter_y - 0.5f),
                                  1.0f - u);
   }
 
-  /* triangle */
+  /* Shader data setup. */
+  int object = kernel_data.bake.object_index;
   int shader;
   float3 P, Ng;
 
   triangle_point_normal(kg, object, prim, u, v, &P, &Ng, &shader);
 
-  /* light passes */
-  PathRadiance L;
-  path_radiance_init(kg, &L);
-
+  ShaderData sd;
   shader_setup_from_sample(
       kg,
       &sd,
@@ -302,7 +291,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg,
       LAMP_NONE);
   sd.I = sd.N;
 
-  /* update differentials */
+  /* Setup differentials. */
   sd.dP.dx = sd.dPdu * dudx + sd.dPdv * dvdx;
   sd.dP.dy = sd.dPdu * dudy + sd.dPdv * dvdy;
   sd.du.dx = dudx;
@@ -310,17 +299,24 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg,
   sd.dv.dx = dvdx;
   sd.dv.dy = dvdy;
 
-  /* set RNG state for shaders that use sampling */
+  /* Set RNG state for shaders that use sampling. */
+  PathState state = {0};
   state.rng_hash = rng_hash;
   state.rng_offset = 0;
   state.sample = sample;
   state.num_samples = num_samples;
   state.min_ray_pdf = FLT_MAX;
 
-  /* light passes if we need more than color */
-  if (pass_filter & ~BAKE_FILTER_COLOR)
+  /* Light passes if we need more than color. */
+  PathRadiance L;
+  int pass_filter = kernel_data.bake.pass_filter;
+
+  if (kernel_data.bake.pass_filter & ~BAKE_FILTER_COLOR)
     compute_light_pass(kg, &sd, &L, rng_hash, pass_filter, sample);
 
+  float3 out = make_float3(0.0f, 0.0f, 0.0f);
+
+  ShaderEvalType type = (ShaderEvalType)kernel_data.bake.type;
   switch (type) {
     /* data passes */
     case SHADER_EVAL_NORMAL:
@@ -441,10 +437,8 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg,
   }
 
   /* write output */
-  const float output_fac = 1.0f / num_samples;
-  const float4 scaled_result = make_float4(out.x, out.y, out.z, 1.0f) * output_fac;
-
-  output[i] = (sample == 0) ? scaled_result : output[i] + scaled_result;
+  const float4 result = make_float4(out.x, out.y, out.z, 1.0f);
+  kernel_write_pass_float4(output, result);
 }
 
 #endif /* __BAKING__ */
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index a1f8c35348d..304835a1685 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -395,6 +395,10 @@ typedef enum PassType {
   PASS_VOLUME_INDIRECT,
   /* No Scatter color since it's tricky to define what it would even mean. */
   PASS_CATEGORY_LIGHT_END = 63,
+
+  PASS_BAKE_PRIMITIVE,
+  PASS_BAKE_DIFFERENTIAL,
+  PASS_CATEGORY_BAKE_END = 95
 } PassType;
 
 #define PASS_ANY (~0)
@@ -1248,6 +1252,10 @@ typedef struct KernelFilm {
   float4 xyz_to_b;
   float4 rgb_to_y;
 
+  int pass_bake_primitive;
+  int pass_bake_differential;
+  int pad;
+
 #ifdef __KERNEL_DEBUG__
   int pass_bvh_traversed_nodes;
   int pass_bvh_traversed_instances;
@@ -1427,6 +1435,14 @@ typedef struct KernelTables {
 } KernelTables;
 static_assert_align(KernelTables, 16);
 
+typedef struct KernelBake {
+  int object_index;
+  int tri_offset;
+  int type;
+  int pass_filter;
+} KernelBake;
+static_assert_align(KernelBake, 16);
+
 typedef struct KernelData {
   KernelCamera cam;
   KernelFilm film;
@@ -1435,6 +1451,7 @@ typedef struct KernelData {
   KernelBVH bvh;
   KernelCurves curve;
   KernelTables tables;
+  KernelBake bake;
 } KernelData;
 static_assert_align(KernelData, 16);
 
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
index 683f4b88d79..ea3103f12c3 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
@@ -46,6 +46,9 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
                                        int offset,
                                        int sample);
 
+void KERNEL_FUNCTION_FULL_NAME(bake)(
+    KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride);
+
 /* Split kernels */
 
 void KERNEL_FUNCTION_FULL_NAME(data_init)(KernelGlobals *kg,
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
index 091e53cfd83..5aa3fb14318 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
@@ -132,6 +132,18 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
 #  endif /* KERNEL_STUB */
 }
 
+/* Bake */
+
+void KERNEL_FUNCTION_FULL_NAME(bake)(
+    KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride)
+{
+#  ifdef KERNEL_STUB
+  STUB_ASSERT(KERNEL_ARCH, bake);
+#  else
+  kernel_bake_evaluate(kg, buffer, sample, x, y, offset, stride);
+#  endif /* KERNEL_STUB */
+}
+
 /* Shader Evaluate */
 
 void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
@@ -146,12 +158,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
 #  ifdef KERNEL_STUB
   STUB_ASSERT(KERNEL_ARCH, shader);
 #  else
-  if (type >= SHADER_EVAL_BAKE) {
-#    ifdef __BAKING__
-    kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, filter, i, offset, sample);
-#    endif
-  }
-  else if (type == SHADER_EVAL_DISPLACE) {
+  if (type == SHADER_EVAL_DISPLACE) {
     kernel_displace_evaluate(kg, input, output, i);
   }
   else {
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
index c4c810c6a82..d4f41132a11 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -214,13 +214,16 @@ kernel_cuda_background(uint4 *input,
 #ifdef __BAKING__
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_bake(uint4 *input, float4 *output, int type, int filter, int sx, int sw, int offset, int sample)
+kernel_cuda_bake(WorkTile *tile, uint total_work_size)
 {
-	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
+	int work_index = ccl_global_id(0);
+
+	if(work_index < total_work_size) {
+		uint x, y, sample;
+		get_work_pixel(tile, work_index, &x, &y, &sample);
 
-	if(x < sx + sw) {
 		KernelGlobals kg;
-		kernel_bake_evaluate(&kg, input, output, (ShaderEvalType)type, filter, x, offset, sample);
+		kernel_bake_evaluate(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
 	}
 }
 #endif
author	Brecht Van Lommel <brecht@blender.org>	2019-05-10 22:39:58 +0300
committer	Lukas Stockner <lukas.stockner@freenet.de>	2020-05-15 21:25:24 +0300
commit	d9773edaa394f61393f9c8b80275e62f74306097 (patch)
tree	232b771b341e98a5403af16791bdcca133cb1edd /intern/cycles/kernel
parent	3ff8ca60e94db2584ca76e323a54c738e677d5f8 (diff)