diff options
-rw-r--r-- | intern/cycles/device/device_cuda.cpp | 79 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_passes.h | 18 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/cuda/kernel.cu | 2 |
3 files changed, 53 insertions, 46 deletions
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 54e012191ae..d84771aefda 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -1281,17 +1281,16 @@ public:
 		task.unmap_neighbor_tiles(rtiles, this);
 	}
 
-	void path_trace(RenderTile& rtile, int sample, bool branched)
+	void path_trace(DeviceTask& task, RenderTile& rtile)
 	{
 		if(have_error())
 			return;
 
 		CUDAContextScope scope(this);
-
 		CUfunction cuPathTrace;
 
-		/* get kernel function */
-		if(branched) {
+		/* Get kernel function. */
+		if(task.integrator_branched) {
 			cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
 		}
 		else {
@@ -1304,7 +1303,7 @@ public:
 
 		cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
 
-		/* allocate work tile */
+		/* Allocate work tile. */
 		device_vector<WorkTile> work_tiles;
 		work_tiles.resize(1);
 
@@ -1315,32 +1314,50 @@ public:
 		wtile->h = rtile.h;
 		wtile->offset = rtile.offset;
 		wtile->stride = rtile.stride;
-		wtile->start_sample = sample;
-		wtile->num_samples = 1;
 		wtile->buffer = (float*)cuda_device_ptr(rtile.buffer);
-
 		mem_alloc("work_tiles", work_tiles, MEM_READ_ONLY);
-		mem_copy_to(work_tiles);
 
 		CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer);
 
-		uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
+		/* Prepare work size. More step samples render faster, but for now we
+		 * remain conservative to avoid driver timeouts. */
+		int min_blocks, num_threads_per_block;
+		cuda_assert(cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0));
+		uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);;
 
-		/* pass in parameters */
-		void *args[] = {&d_work_tiles,
-		                &total_work_size};
+		/* Render all samples. */
+		int start_sample = rtile.start_sample;
+		int end_sample = rtile.start_sample + rtile.num_samples;
 
-		/* launch kernel */
-		int num_threads_per_block;
-		cuda_assert(cuFuncGetAttribute(&num_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuPathTrace));
-		int num_blocks = divide_up(total_work_size, num_threads_per_block);
+		for(int sample = start_sample; sample < end_sample; sample += step_samples) {
+			/* Setup and copy work tile to device. */
+			wtile->start_sample = sample;
+			wtile->num_samples = min(step_samples, end_sample - sample);;
+			mem_copy_to(work_tiles);
 
-		cuda_assert(cuLaunchKernel(cuPathTrace,
-		                           num_blocks, 1, 1,
-		                           num_threads_per_block, 1, 1,
-		                           0, 0, args, 0));
+			uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
+			uint num_blocks = divide_up(total_work_size, num_threads_per_block);
 
-		cuda_assert(cuCtxSynchronize());
+			/* Launch kernel. */
+			void *args[] = {&d_work_tiles,
+			                &total_work_size};
+
+			cuda_assert(cuLaunchKernel(cuPathTrace,
+			                           num_blocks, 1, 1,
+			                           num_threads_per_block, 1, 1,
+			                           0, 0, args, 0));
+
+			cuda_assert(cuCtxSynchronize());
+
+			/* Update progress. */
+			rtile.sample = sample + wtile->num_samples;
+			task.update_progress(&rtile, rtile.w*rtile.h);
+
+			if(task.get_cancel()) {
+				if(task.need_finish_queue == false)
+					break;
+			}
+		}
 
 		mem_free(work_tiles);
 	}
@@ -1700,8 +1717,6 @@ public:
 		if(task->type == DeviceTask::RENDER) {
 			RenderTile tile;
 
-			bool branched = task->integrator_branched;
-
 			/* Upload Bindless Mapping */
 			load_bindless_mapping();
 
@@ -1725,21 +1740,7 @@ public:
 						split_kernel->path_trace(task, tile, void_buffer, void_buffer);
 					}
 					else {
-						int start_sample = tile.start_sample;
-						int end_sample = tile.start_sample + tile.num_samples;
-
-						for(int sample = start_sample; sample < end_sample; sample++) {
-							if(task->get_cancel()) {
-								if(task->need_finish_queue == false)
-									break;
-							}
-
-							path_trace(tile, sample, branched);
-
-							tile.sample = sample + 1;
-
-							task->update_progress(&tile, tile.w*tile.h);
-						}
+						path_trace(*task, tile);
 					}
 				}
 				else if(tile.task == RenderTile::DENOISE) {
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index 239598f7dab..644cc173571 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -16,19 +16,23 @@
 
 CCL_NAMESPACE_BEGIN
 
+#if defined(__SPLIT_KERNEL__) || defined(__KERNEL_CUDA__)
+#define __ATOMIC_PASS_WRITE__
+#endif
+
 ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, float value)
 {
 	ccl_global float *buf = buffer;
-#if defined(__SPLIT_KERNEL__)
+#ifdef __ATOMIC_PASS_WRITE__
 	atomic_add_and_fetch_float(buf, value);
 #else
 	*buf += value;
-#endif /* __SPLIT_KERNEL__ */
+#endif
 }
 
 ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, float3 value)
 {
-#if defined(__SPLIT_KERNEL__)
+#ifdef __ATOMIC_PASS_WRITE__
 	ccl_global float *buf_x = buffer + 0;
 	ccl_global float *buf_y = buffer + 1;
 	ccl_global float *buf_z = buffer + 2;
@@ -39,12 +43,12 @@ ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, float3
 #else
 	ccl_global float3 *buf = (ccl_global float3*)buffer;
 	*buf += value;
-#endif /* __SPLIT_KERNEL__ */
+#endif
 }
 
 ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, float4 value)
 {
-#if defined(__SPLIT_KERNEL__)
+#ifdef __ATOMIC_PASS_WRITE__
 	ccl_global float *buf_x = buffer + 0;
 	ccl_global float *buf_y = buffer + 1;
 	ccl_global float *buf_z = buffer + 2;
@@ -57,7 +61,7 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, float4
 #else
 	ccl_global float4 *buf = (ccl_global float4*)buffer;
 	*buf += value;
-#endif /* __SPLIT_KERNEL__ */
+#endif
 }
 
 #ifdef __DENOISING_FEATURES__
@@ -70,7 +74,7 @@ ccl_device_inline void kernel_write_pass_float_variance(ccl_global float *buffer
 	kernel_write_pass_float(buffer+1, value*value);
 }
 
-# if defined(__SPLIT_KERNEL__)
+# ifdef __ATOMIC_PASS_WRITE__
 # define kernel_write_pass_float3_unaligned kernel_write_pass_float3
 # else
 ccl_device_inline void kernel_write_pass_float3_unaligned(ccl_global float *buffer, float3 value)
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
index 799cd587fcf..e72edfa7bdf 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -21,6 +21,8 @@
 #include "kernel/kernel_compat_cuda.h"
 #include "kernel_config.h"
 
+#include "util/util_atomic.h"
+
 #include "kernel/kernel_math.h"
 #include "kernel/kernel_types.h"
 #include "kernel/kernel_globals.h"