author     Brecht Van Lommel <brecht@blender.org>      2019-05-10 22:39:58 +0300
committer  Lukas Stockner <lukas.stockner@freenet.de>  2020-05-15 21:25:24 +0300
commit     d9773edaa394f61393f9c8b80275e62f74306097 (patch)
tree       232b771b341e98a5403af16791bdcca133cb1edd /intern/cycles/device/cuda
parent     3ff8ca60e94db2584ca76e323a54c738e677d5f8 (diff)
Cycles: code refactor to bake using regular render session and tiles
There should be no user-visible change from this, except that tile size
now affects bake performance. The goal here is to simplify bake denoising
in D3099, letting it reuse more of the denoising tile and pass code.
A lot of code is now shared with regular rendering; the two main
differences are that we read some render result passes from the bake API
when starting to render a tile, and that we call the bake kernel instead
of the path trace kernel.
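The kernel selection half of that is condensed below from the
device_cuda_impl.cpp diff further down; error handling and the
surrounding launch code are omitted:

  /* Per-tile kernel choice in CUDADevice::render(): baking is now just
   * another tile type handled by the regular render entry point. */
  CUfunction cuRender;
  if (rtile.task == RenderTile::BAKE) {
    cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
  }
  else if (task.integrator_branched) {
    cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace"));
  }
  else {
    cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));
  }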
With this kind of design, where Cycles asks the bake API for tiles, it
should eventually become easier to reduce memory usage, show tiles as
they are baked, or bake multiple passes at once, though quite a bit of
work is still needed for that.
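For illustration only, a sketch of that pull-based flow as it ends up on
the CUDA device; the acquire/release helpers exist on DeviceTask, but
their exact signatures here are simplified, not authoritative:

  /* Simplified from CUDADevice::thread_run(): the device pulls tiles and
   * dispatches on the tile type; BAKE tiles go through the same render()
   * path as PATH_TRACE tiles after this change. */
  RenderTile tile;
  while (task->acquire_tile(this, tile, task->tile_types)) {
    if (tile.task == RenderTile::PATH_TRACE || tile.task == RenderTile::BAKE) {
      render(*task, tile, work_tiles);
    }
    else if (tile.task == RenderTile::DENOISE) {
      /* Denoising tiles are handled by a separate code path. */
    }
    task->release_tile(tile);
  }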
Reviewers: #cycles
Subscribers: monio, wmatyjewicz, lukasstockner97, michaelknubben
Differential Revision: https://developer.blender.org/D3108
Diffstat (limited to 'intern/cycles/device/cuda')
-rw-r--r--  intern/cycles/device/cuda/device_cuda.h        |  2
-rw-r--r--  intern/cycles/device/cuda/device_cuda_impl.cpp | 52
2 files changed, 29 insertions, 25 deletions
diff --git a/intern/cycles/device/cuda/device_cuda.h b/intern/cycles/device/cuda/device_cuda.h
index 3e397da895b..3f23f0fe4c5 100644
--- a/intern/cycles/device/cuda/device_cuda.h
+++ b/intern/cycles/device/cuda/device_cuda.h
@@ -223,7 +223,7 @@ class CUDADevice : public Device {
                                 CUdeviceptr d_wtile,
                                 CUstream stream = 0);
 
-  void path_trace(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles);
+  void render(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles);
 
   void film_convert(DeviceTask &task,
                     device_ptr buffer,
diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp
index ba5d479e0e7..acf53c3eb1b 100644
--- a/intern/cycles/device/cuda/device_cuda_impl.cpp
+++ b/intern/cycles/device/cuda/device_cuda_impl.cpp
@@ -586,20 +586,23 @@ void CUDADevice::reserve_local_memory(const DeviceRequestedFeatures &requested_f
   cuMemGetInfo(&free_before, &total);
 
   /* Get kernel function. */
-  CUfunction cuPathTrace;
+  CUfunction cuRender;
 
-  if (requested_features.use_integrator_branched) {
-    cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
+  if (requested_features.use_baking) {
+    cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
+  }
+  else if (requested_features.use_integrator_branched) {
+    cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace"));
   }
   else {
-    cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
+    cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));
   }
 
-  cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
+  cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1));
 
   int min_blocks, num_threads_per_block;
-  cuda_assert(cuOccupancyMaxPotentialBlockSize(
-      &min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0));
+  cuda_assert(
+      cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0));
 
   /* Launch kernel, using just 1 block appears sufficient to reserve
    * memory for all multiprocessors. It would be good to do this in
@@ -609,7 +612,7 @@ void CUDADevice::reserve_local_memory(const DeviceRequestedFeatures &requested_f
 
   void *args[] = {&d_work_tiles, &total_work_size};
 
-  cuda_assert(cuLaunchKernel(cuPathTrace, 1, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
+  cuda_assert(cuLaunchKernel(cuRender, 1, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
 
   cuda_assert(cuCtxSynchronize());
 
@@ -1780,9 +1783,7 @@ void CUDADevice::adaptive_sampling_post(RenderTile &rtile,
                              0));
 }
 
-void CUDADevice::path_trace(DeviceTask &task,
-                            RenderTile &rtile,
-                            device_vector<WorkTile> &work_tiles)
+void CUDADevice::render(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles)
 {
   scoped_timer timer(&rtile.buffers->render_time);
 
@@ -1790,21 +1791,24 @@ void CUDADevice::path_trace(DeviceTask &task,
     return;
 
   CUDAContextScope scope(this);
-  CUfunction cuPathTrace;
+  CUfunction cuRender;
 
   /* Get kernel function. */
-  if (task.integrator_branched) {
-    cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
+  if (rtile.task == RenderTile::BAKE) {
+    cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
+  }
+  else if (task.integrator_branched) {
+    cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace"));
   }
   else {
-    cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
+    cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));
   }
 
   if (have_error()) {
     return;
   }
 
-  cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
+  cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1));
 
   /* Allocate work tile. */
   work_tiles.alloc(1);
@@ -1822,8 +1826,8 @@ void CUDADevice::path_trace(DeviceTask &task,
    * remain conservative for GPUs connected to a display to avoid driver
    * timeouts and display freezing. */
   int min_blocks, num_threads_per_block;
-  cuda_assert(cuOccupancyMaxPotentialBlockSize(
-      &min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0));
+  cuda_assert(
+      cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0));
   if (!info.display_device) {
     min_blocks *= 8;
   }
@@ -1851,7 +1855,7 @@ void CUDADevice::path_trace(DeviceTask &task,
     void *args[] = {&d_work_tiles, &total_work_size};
 
     cuda_assert(
-        cuLaunchKernel(cuPathTrace, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
+        cuLaunchKernel(cuRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
 
     /* Run the adaptive sampling kernels at selected samples aligned to step samples. */
     uint filter_sample = sample + wtile->num_samples - 1;
@@ -1957,10 +1961,7 @@ void CUDADevice::shader(DeviceTask &task)
   CUdeviceptr d_output = (CUdeviceptr)task.shader_output;
 
   /* get kernel function */
-  if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
-    cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_bake"));
-  }
-  else if (task.shader_eval_type == SHADER_EVAL_DISPLACE) {
+  if (task.shader_eval_type == SHADER_EVAL_DISPLACE) {
     cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace"));
   }
   else {
@@ -2297,9 +2298,12 @@ void CUDADevice::thread_run(DeviceTask *task)
         split_kernel->path_trace(task, tile, void_buffer, void_buffer);
       }
       else {
-        path_trace(*task, tile, work_tiles);
+        render(*task, tile, work_tiles);
       }
     }
+    else if (tile.task == RenderTile::BAKE) {
+      render(*task, tile, work_tiles);
+    }
     else if (tile.task == RenderTile::DENOISE) {
       tile.sample = tile.start_sample + tile.num_samples;