author     Brecht Van Lommel <brecht@blender.org>      2019-05-10 22:39:58 +0300
committer  Lukas Stockner <lukas.stockner@freenet.de>  2020-05-15 21:25:24 +0300
commit     d9773edaa394f61393f9c8b80275e62f74306097 (patch)
tree       232b771b341e98a5403af16791bdcca133cb1edd /intern/cycles/device/cuda
parent     3ff8ca60e94db2584ca76e323a54c738e677d5f8 (diff)
Cycles: code refactor to bake using regular render session and tiles
There should be no user-visible change from this, except that tile size
now affects bake performance. The goal here is to simplify bake denoising
in D3099, letting it reuse more of the denoising tile and pass code.
A lot of code is now shared with regular rendering; the two main
differences are that we read some render result passes from the bake API
when starting to render a tile, and that we call the bake kernel instead
of the path trace kernel.
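The kernel selection half of that is condensed below from the
device_cuda_impl.cpp diff further down; error handling and the
surrounding launch code are omitted:

  /* Per-tile kernel choice in CUDADevice::render(): baking is now just
   * another tile type handled by the regular render entry point. */
  CUfunction cuRender;
  if (rtile.task == RenderTile::BAKE) {
    cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
  }
  else if (task.integrator_branched) {
    cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace"));
  }
  else {
    cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));
  }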
With this kind of design, where Cycles asks the bake API for tiles, it
should eventually become easier to reduce memory usage, show tiles as
they are baked, or bake multiple passes at once, though quite a bit of
work is still needed for that.
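For illustration only, a sketch of that pull-based flow as it ends up on
the CUDA device; the acquire/release helpers exist on DeviceTask, but
their exact signatures here are simplified, not authoritative:

  /* Simplified from CUDADevice::thread_run(): the device pulls tiles and
   * dispatches on the tile type; BAKE tiles go through the same render()
   * path as PATH_TRACE tiles after this change. */
  RenderTile tile;
  while (task->acquire_tile(this, tile, task->tile_types)) {
    if (tile.task == RenderTile::PATH_TRACE || tile.task == RenderTile::BAKE) {
      render(*task, tile, work_tiles);
    }
    else if (tile.task == RenderTile::DENOISE) {
      /* Denoising tiles are handled by a separate code path. */
    }
    task->release_tile(tile);
  }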
Reviewers: #cycles
Subscribers: monio, wmatyjewicz, lukasstockner97, michaelknubben
Differential Revision: https://developer.blender.org/D3108
Diffstat (limited to 'intern/cycles/device/cuda')
-rw-r--r--  intern/cycles/device/cuda/device_cuda.h        |  2
-rw-r--r--  intern/cycles/device/cuda/device_cuda_impl.cpp | 52
2 files changed, 29 insertions, 25 deletions
diff --git a/intern/cycles/device/cuda/device_cuda.h b/intern/cycles/device/cuda/device_cuda.h
index 3e397da895b..3f23f0fe4c5 100644
--- a/intern/cycles/device/cuda/device_cuda.h
+++ b/intern/cycles/device/cuda/device_cuda.h
@@ -223,7 +223,7 @@ class CUDADevice : public Device {
                                 CUdeviceptr d_wtile,
                                 CUstream stream = 0);
 
-  void path_trace(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles);
+  void render(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles);
 
   void film_convert(DeviceTask &task,
                     device_ptr buffer,
diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp
index ba5d479e0e7..acf53c3eb1b 100644
--- a/intern/cycles/device/cuda/device_cuda_impl.cpp
+++ b/intern/cycles/device/cuda/device_cuda_impl.cpp
@@ -586,20 +586,23 @@ void CUDADevice::reserve_local_memory(const DeviceRequestedFeatures &requested_f
   cuMemGetInfo(&free_before, &total);
 
   /* Get kernel function. */
-  CUfunction cuPathTrace;
+  CUfunction cuRender;
 
-  if (requested_features.use_integrator_branched) {
-    cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
+  if (requested_features.use_baking) {
+    cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
+  }
+  else if (requested_features.use_integrator_branched) {
+    cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace"));
   }
   else {
-    cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
+    cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));
   }
 
-  cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
+  cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1));
 
   int min_blocks, num_threads_per_block;
-  cuda_assert(cuOccupancyMaxPotentialBlockSize(
-      &min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0));
+  cuda_assert(
+      cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0));
 
   /* Launch kernel, using just 1 block appears sufficient to reserve
    * memory for all multiprocessors. It would be good to do this in
@@ -609,7 +612,7 @@ void CUDADevice::reserve_local_memory(const DeviceRequestedFeatures &requested_f
 
   void *args[] = {&d_work_tiles, &total_work_size};
 
-  cuda_assert(cuLaunchKernel(cuPathTrace, 1, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
+  cuda_assert(cuLaunchKernel(cuRender, 1, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
 
   cuda_assert(cuCtxSynchronize());
 
@@ -1780,9 +1783,7 @@ void CUDADevice::adaptive_sampling_post(RenderTile &rtile,
                              0));
 }
 
-void CUDADevice::path_trace(DeviceTask &task,
-                            RenderTile &rtile,
-                            device_vector<WorkTile> &work_tiles)
+void CUDADevice::render(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles)
 {
   scoped_timer timer(&rtile.buffers->render_time);
 
@@ -1790,21 +1791,24 @@ void CUDADevice::path_trace(DeviceTask &task,
     return;
 
   CUDAContextScope scope(this);
-  CUfunction cuPathTrace;
+  CUfunction cuRender;
 
   /* Get kernel function. */
-  if (task.integrator_branched) {
-    cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
+  if (rtile.task == RenderTile::BAKE) {
+    cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
+  }
+  else if (task.integrator_branched) {
+    cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace"));
   }
   else {
-    cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
+    cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));
   }
 
   if (have_error()) {
     return;
   }
 
-  cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
+  cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1));
 
   /* Allocate work tile. */
   work_tiles.alloc(1);
@@ -1822,8 +1826,8 @@ void CUDADevice::path_trace(DeviceTask &task,
    * remain conservative for GPUs connected to a display to avoid driver
    * timeouts and display freezing. */
   int min_blocks, num_threads_per_block;
-  cuda_assert(cuOccupancyMaxPotentialBlockSize(
-      &min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0));
+  cuda_assert(
+      cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0));
   if (!info.display_device) {
     min_blocks *= 8;
   }
@@ -1851,7 +1855,7 @@ void CUDADevice::path_trace(DeviceTask &task,
     void *args[] = {&d_work_tiles, &total_work_size};
 
     cuda_assert(
-        cuLaunchKernel(cuPathTrace, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
+        cuLaunchKernel(cuRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
 
     /* Run the adaptive sampling kernels at selected samples aligned to step samples. */
     uint filter_sample = sample + wtile->num_samples - 1;
@@ -1957,10 +1961,7 @@ void CUDADevice::shader(DeviceTask &task)
   CUdeviceptr d_output = (CUdeviceptr)task.shader_output;
 
   /* get kernel function */
-  if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
-    cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_bake"));
-  }
-  else if (task.shader_eval_type == SHADER_EVAL_DISPLACE) {
+  if (task.shader_eval_type == SHADER_EVAL_DISPLACE) {
     cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace"));
   }
   else {
@@ -2297,9 +2298,12 @@ void CUDADevice::thread_run(DeviceTask *task)
         split_kernel->path_trace(task, tile, void_buffer, void_buffer);
       }
       else {
-        path_trace(*task, tile, work_tiles);
+        render(*task, tile, work_tiles);
       }
     }
+    else if (tile.task == RenderTile::BAKE) {
+      render(*task, tile, work_tiles);
+    }
     else if (tile.task == RenderTile::DENOISE) {
       tile.sample = tile.start_sample + tile.num_samples;