Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- intern/cycles/device/device_cuda.cpp 79
-rw-r--r-- intern/cycles/kernel/kernel_passes.h 18
-rw-r--r-- intern/cycles/kernel/kernels/cuda/kernel.cu 2
3 files changed, 53 insertions, 46 deletions
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 54e012191ae..d84771aefda 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -1281,17 +1281,16 @@ public:
task.unmap_neighbor_tiles(rtiles, this);
}
- void path_trace(RenderTile& rtile, int sample, bool branched)
+ void path_trace(DeviceTask& task, RenderTile& rtile)
{
if(have_error())
return;
CUDAContextScope scope(this);
-
CUfunction cuPathTrace;
- /* get kernel function */
- if(branched) {
+ /* Get kernel function. */
+ if(task.integrator_branched) {
cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
}
else {
@@ -1304,7 +1303,7 @@ public:
cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
- /* allocate work tile */
+ /* Allocate work tile. */
device_vector<WorkTile> work_tiles;
work_tiles.resize(1);
@@ -1315,32 +1314,50 @@ public:
wtile->h = rtile.h;
wtile->offset = rtile.offset;
wtile->stride = rtile.stride;
- wtile->start_sample = sample;
- wtile->num_samples = 1;
wtile->buffer = (float*)cuda_device_ptr(rtile.buffer);
-
mem_alloc("work_tiles", work_tiles, MEM_READ_ONLY);
- mem_copy_to(work_tiles);
CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer);
- uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
+ /* Prepare work size. More step samples render faster, but for now we
+ * remain conservative to avoid driver timeouts. */
+ int min_blocks, num_threads_per_block;
+ cuda_assert(cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0));
+ uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);;
- /* pass in parameters */
- void *args[] = {&d_work_tiles,
- &total_work_size};
+ /* Render all samples. */
+ int start_sample = rtile.start_sample;
+ int end_sample = rtile.start_sample + rtile.num_samples;
- /* launch kernel */
- int num_threads_per_block;
- cuda_assert(cuFuncGetAttribute(&num_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuPathTrace));
- int num_blocks = divide_up(total_work_size, num_threads_per_block);
+ for(int sample = start_sample; sample < end_sample; sample += step_samples) {
+ /* Setup and copy work tile to device. */
+ wtile->start_sample = sample;
+ wtile->num_samples = min(step_samples, end_sample - sample);;
+ mem_copy_to(work_tiles);
- cuda_assert(cuLaunchKernel(cuPathTrace,
- num_blocks, 1, 1,
- num_threads_per_block, 1, 1,
- 0, 0, args, 0));
+ uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
+ uint num_blocks = divide_up(total_work_size, num_threads_per_block);
- cuda_assert(cuCtxSynchronize());
+ /* Launch kernel. */
+ void *args[] = {&d_work_tiles,
+ &total_work_size};
+
+ cuda_assert(cuLaunchKernel(cuPathTrace,
+ num_blocks, 1, 1,
+ num_threads_per_block, 1, 1,
+ 0, 0, args, 0));
+
+ cuda_assert(cuCtxSynchronize());
+
+ /* Update progress. */
+ rtile.sample = sample + wtile->num_samples;
+ task.update_progress(&rtile, rtile.w*rtile.h);
+
+ if(task.get_cancel()) {
+ if(task.need_finish_queue == false)
+ break;
+ }
+ }
mem_free(work_tiles);
}
@@ -1700,8 +1717,6 @@ public:
if(task->type == DeviceTask::RENDER) {
RenderTile tile;
- bool branched = task->integrator_branched;
-
/* Upload Bindless Mapping */
load_bindless_mapping();
@@ -1725,21 +1740,7 @@ public:
split_kernel->path_trace(task, tile, void_buffer, void_buffer);
}
else {
- int start_sample = tile.start_sample;
- int end_sample = tile.start_sample + tile.num_samples;
-
- for(int sample = start_sample; sample < end_sample; sample++) {
- if(task->get_cancel()) {
- if(task->need_finish_queue == false)
- break;
- }
-
- path_trace(tile, sample, branched);
-
- tile.sample = sample + 1;
-
- task->update_progress(&tile, tile.w*tile.h);
- }
+ path_trace(*task, tile);
}
}
else if(tile.task == RenderTile::DENOISE) {
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index 239598f7dab..644cc173571 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -16,19 +16,23 @@
CCL_NAMESPACE_BEGIN
+#if defined(__SPLIT_KERNEL__) || defined(__KERNEL_CUDA__)
+#define __ATOMIC_PASS_WRITE__
+#endif
+
ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, float value)
{
ccl_global float *buf = buffer;
-#if defined(__SPLIT_KERNEL__)
+#ifdef __ATOMIC_PASS_WRITE__
atomic_add_and_fetch_float(buf, value);
#else
*buf += value;
-#endif /* __SPLIT_KERNEL__ */
+#endif
}
ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, float3 value)
{
-#if defined(__SPLIT_KERNEL__)
+#ifdef __ATOMIC_PASS_WRITE__
ccl_global float *buf_x = buffer + 0;
ccl_global float *buf_y = buffer + 1;
ccl_global float *buf_z = buffer + 2;
@@ -39,12 +43,12 @@ ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, float3
#else
ccl_global float3 *buf = (ccl_global float3*)buffer;
*buf += value;
-#endif /* __SPLIT_KERNEL__ */
+#endif
}
ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, float4 value)
{
-#if defined(__SPLIT_KERNEL__)
+#ifdef __ATOMIC_PASS_WRITE__
ccl_global float *buf_x = buffer + 0;
ccl_global float *buf_y = buffer + 1;
ccl_global float *buf_z = buffer + 2;
@@ -57,7 +61,7 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, float4
#else
ccl_global float4 *buf = (ccl_global float4*)buffer;
*buf += value;
-#endif /* __SPLIT_KERNEL__ */
+#endif
}
#ifdef __DENOISING_FEATURES__
@@ -70,7 +74,7 @@ ccl_device_inline void kernel_write_pass_float_variance(ccl_global float *buffer
kernel_write_pass_float(buffer+1, value*value);
}
-# if defined(__SPLIT_KERNEL__)
+# ifdef __ATOMIC_PASS_WRITE__
# define kernel_write_pass_float3_unaligned kernel_write_pass_float3
# else
ccl_device_inline void kernel_write_pass_float3_unaligned(ccl_global float *buffer, float3 value)
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
index 799cd587fcf..e72edfa7bdf 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -21,6 +21,8 @@
#include "kernel/kernel_compat_cuda.h"
#include "kernel_config.h"
+#include "util/util_atomic.h"
+
#include "kernel/kernel_math.h"
#include "kernel/kernel_types.h"
#include "kernel/kernel_globals.h"