Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/intern
diff options
context:
space:
mode:
authorSergey Sharybin <sergey.vfx@gmail.com>2017-06-12 16:09:33 +0300
committerSergey Sharybin <sergey.vfx@gmail.com>2017-06-12 16:12:36 +0300
commit0f4f4d87542b96942234e10421823dcc6766331c (patch)
treefd545e6a4ed9ad0652e7e7e18fb76a9e1324e748 /intern
parentf52dc2f371923c22a974df7105245f7e0b8148ee (diff)
parentd8957e4ccec4cda1cd72e94045efba6ab2ceb6fd (diff)
Merge branch 'master' into blender2.8
Diffstat (limited to 'intern')
-rw-r--r--intern/cycles/blender/addon/engine.py3
-rw-r--r--intern/cycles/blender/addon/properties.py7
-rw-r--r--intern/cycles/blender/addon/ui.py20
-rw-r--r--intern/cycles/blender/blender_session.cpp9
-rw-r--r--intern/cycles/blender/blender_sync.cpp11
-rw-r--r--intern/cycles/blender/blender_sync.h3
-rw-r--r--intern/cycles/device/device.cpp2
-rw-r--r--intern/cycles/device/device.h10
-rw-r--r--intern/cycles/device/device_split_kernel.cpp4
-rw-r--r--intern/cycles/device/device_split_kernel.h1
-rw-r--r--intern/cycles/device/opencl/opencl.h5
-rw-r--r--intern/cycles/device/opencl/opencl_split.cpp63
-rw-r--r--intern/cycles/device/opencl/opencl_util.cpp51
-rw-r--r--intern/cycles/kernel/CMakeLists.txt9
-rw-r--r--intern/cycles/kernel/filter/filter_nlm_cpu.h2
-rw-r--r--intern/cycles/kernel/filter/filter_nlm_gpu.h2
-rw-r--r--intern/cycles/kernel/filter/filter_reconstruction.h37
-rw-r--r--intern/cycles/kernel/kernel_accumulate.h52
-rw-r--r--intern/cycles/kernel/kernel_path_state.h2
-rw-r--r--intern/cycles/kernel/kernel_queues.h15
-rw-r--r--intern/cycles/kernel/kernel_types.h8
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_cpu.h1
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h2
-rw-r--r--intern/cycles/kernel/kernels/cuda/kernel_split.cu2
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl13
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl13
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl10
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl26
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl15
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl10
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl10
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl10
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl13
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_path_init.cl10
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl13
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl10
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl10
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl13
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl13
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl10
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl10
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_split.cl1
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_split_function.h72
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl10
-rw-r--r--intern/cycles/kernel/split/kernel_branched.h52
-rw-r--r--intern/cycles/kernel/split/kernel_do_volume.h19
-rw-r--r--intern/cycles/kernel/split/kernel_enqueue_inactive.h46
-rw-r--r--intern/cycles/kernel/split/kernel_next_iteration_setup.h2
-rw-r--r--intern/cycles/kernel/split/kernel_scene_intersect.h16
-rw-r--r--intern/cycles/kernel/split/kernel_shadow_blocked_dl.h8
-rw-r--r--intern/cycles/kernel/split/kernel_split_common.h15
-rw-r--r--intern/cycles/kernel/split/kernel_split_data_types.h23
-rw-r--r--intern/cycles/kernel/split/kernel_subsurface_scatter.h8
-rw-r--r--intern/cycles/render/session.cpp1
-rw-r--r--intern/cycles/util/util_atomic.h4
55 files changed, 631 insertions, 176 deletions
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index 9e22ce331d0..5b2cb9fe39b 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -242,7 +242,8 @@ def register_passes(engine, scene, srl):
if crl.pass_debug_bvh_intersections: engine.register_pass(scene, srl, "Debug BVH Intersections", 1, "X", 'VALUE')
if crl.pass_debug_ray_bounces: engine.register_pass(scene, srl, "Debug Ray Bounces", 1, "X", 'VALUE')
- if crl.use_denoising and crl.denoising_store_passes:
+ cscene = scene.cycles
+ if crl.use_denoising and crl.denoising_store_passes and not cscene.use_progressive_refine:
engine.register_pass(scene, srl, "Denoising Normal", 3, "XYZ", 'VECTOR')
engine.register_pass(scene, srl, "Denoising Normal Variance", 3, "XYZ", 'VECTOR')
engine.register_pass(scene, srl, "Denoising Albedo", 3, "RGB", 'COLOR')
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 28d60671bd8..309e44ccbb8 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -689,7 +689,11 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
update=devices_update_callback
)
- cls.debug_opencl_kernel_single_program = BoolProperty(name="Single Program", default=True, update=devices_update_callback);
+ cls.debug_opencl_kernel_single_program = BoolProperty(
+ name="Single Program",
+ default=True,
+ update=devices_update_callback,
+ )
cls.debug_use_opencl_debug = BoolProperty(name="Debug OpenCL", default=False)
@@ -1203,6 +1207,7 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
name="Use Denoising",
description="Denoise the rendered image",
default=False,
+ update=update_render_passes,
)
cls.denoising_diffuse_direct = BoolProperty(
name="Diffuse Direct",
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index a8018e3824d..756d3b15a89 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -532,17 +532,17 @@ class CyclesRender_PT_layer_passes(CyclesButtonsPanel, Panel):
col.prop(rl, "use_pass_environment")
if context.scene.cycles.feature_set == 'EXPERIMENTAL':
- col.separator()
- sub = col.column()
- sub.active = crl.use_denoising
- sub.prop(crl, "denoising_store_passes", text="Denoising")
+ col.separator()
+ sub = col.column()
+ sub.active = crl.use_denoising
+ sub.prop(crl, "denoising_store_passes", text="Denoising")
if _cycles.with_cycles_debug:
- col = layout.column()
- col.prop(crl, "pass_debug_bvh_traversed_nodes")
- col.prop(crl, "pass_debug_bvh_traversed_instances")
- col.prop(crl, "pass_debug_bvh_intersections")
- col.prop(crl, "pass_debug_ray_bounces")
+ col = layout.column()
+ col.prop(crl, "pass_debug_bvh_traversed_nodes")
+ col.prop(crl, "pass_debug_bvh_traversed_instances")
+ col.prop(crl, "pass_debug_bvh_intersections")
+ col.prop(crl, "pass_debug_ray_bounces")
class CyclesRender_PT_views(CyclesButtonsPanel, Panel):
@@ -1688,7 +1688,7 @@ def draw_device(self, context):
layout.prop(cscene, "feature_set")
- split = layout.split(percentage=1/3)
+ split = layout.split(percentage=1 / 3)
split.label("Device:")
row = split.row()
row.active = show_device_active(context)
diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp
index 46f32fe816d..d8c3750cf20 100644
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -403,14 +403,7 @@ void BlenderSession::render()
BL::RenderLayer b_rlay = *b_single_rlay;
/* add passes */
- array<Pass> passes;
- if(session_params.device.advanced_shading) {
- passes = sync->sync_render_passes(b_rlay, *b_layer_iter);
- }
- else {
- Pass::add(PASS_COMBINED, passes);
- }
-
+ array<Pass> passes = sync->sync_render_passes(b_rlay, *b_layer_iter, session_params);
buffer_params.passes = passes;
PointerRNA crl = RNA_pointer_get(&b_layer_iter->ptr, "cycles");
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index 1604b0039eb..ab986766211 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -537,11 +537,16 @@ int BlenderSync::get_denoising_pass(BL::RenderPass& b_pass)
}
array<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay,
- BL::SceneRenderLayer& b_srlay)
+ BL::SceneRenderLayer& b_srlay,
+ const SessionParams &session_params)
{
array<Pass> passes;
Pass::add(PASS_COMBINED, passes);
+ if(!session_params.device.advanced_shading) {
+ return passes;
+ }
+
/* loop over passes */
BL::RenderLayer::passes_iterator b_pass_iter;
@@ -556,7 +561,9 @@ array<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay,
}
PointerRNA crp = RNA_pointer_get(&b_srlay.ptr, "cycles");
- if(get_boolean(crp, "denoising_store_passes")) {
+ if(get_boolean(crp, "denoising_store_passes") &&
+ get_boolean(crp, "use_denoising") &&
+ !session_params.progressive_refine) {
b_engine.add_pass("Denoising Normal", 3, "XYZ", b_srlay.name().c_str());
b_engine.add_pass("Denoising Normal Variance", 3, "XYZ", b_srlay.name().c_str());
b_engine.add_pass("Denoising Albedo", 3, "RGB", b_srlay.name().c_str());
diff --git a/intern/cycles/blender/blender_sync.h b/intern/cycles/blender/blender_sync.h
index a9f63346e00..fda8cb390c2 100644
--- a/intern/cycles/blender/blender_sync.h
+++ b/intern/cycles/blender/blender_sync.h
@@ -69,7 +69,8 @@ public:
const char *layer = 0);
void sync_render_layers(BL::SpaceView3D& b_v3d, const char *layer);
array<Pass> sync_render_passes(BL::RenderLayer& b_rlay,
- BL::SceneRenderLayer& b_srlay);
+ BL::SceneRenderLayer& b_srlay,
+ const SessionParams &session_params);
void sync_integrator();
void sync_camera(BL::RenderSettings& b_render,
BL::Object& b_override,
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 949c5f932a4..31671e76ec3 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -69,6 +69,8 @@ std::ostream& operator <<(std::ostream &os,
<< string_from_bool(requested_features.use_transparent) << std::endl;
os << "Use Principled BSDF: "
<< string_from_bool(requested_features.use_principled) << std::endl;
+ os << "Use Denoising: "
+ << string_from_bool(requested_features.use_denoising) << std::endl;
return os;
}
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index c22969d7dc6..68a555c1a93 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -127,6 +127,9 @@ public:
/* Per-uber shader usage flags. */
bool use_principled;
+ /* Denoising features. */
+ bool use_denoising;
+
DeviceRequestedFeatures()
{
/* TODO(sergey): Find more meaningful defaults. */
@@ -145,6 +148,7 @@ public:
use_transparent = false;
use_shadow_tricks = false;
use_principled = false;
+ use_denoising = false;
}
bool modified(const DeviceRequestedFeatures& requested_features)
@@ -163,7 +167,8 @@ public:
use_patch_evaluation == requested_features.use_patch_evaluation &&
use_transparent == requested_features.use_transparent &&
use_shadow_tricks == requested_features.use_shadow_tricks &&
- use_principled == requested_features.use_principled);
+ use_principled == requested_features.use_principled &&
+ use_denoising == requested_features.use_denoising);
}
/* Convert the requested features structure to a build options,
@@ -213,6 +218,9 @@ public:
if(!use_principled) {
build_options += " -D__NO_PRINCIPLED__";
}
+ if(!use_denoising) {
+ build_options += " -D__NO_DENOISING__";
+ }
return build_options;
}
};
diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp
index dddd19f179f..d2b3a89fa98 100644
--- a/intern/cycles/device/device_split_kernel.cpp
+++ b/intern/cycles/device/device_split_kernel.cpp
@@ -47,6 +47,7 @@ DeviceSplitKernel::DeviceSplitKernel(Device *device) : device(device)
kernel_direct_lighting = NULL;
kernel_shadow_blocked_ao = NULL;
kernel_shadow_blocked_dl = NULL;
+ kernel_enqueue_inactive = NULL;
kernel_next_iteration_setup = NULL;
kernel_indirect_subsurface = NULL;
kernel_buffer_update = NULL;
@@ -74,6 +75,7 @@ DeviceSplitKernel::~DeviceSplitKernel()
delete kernel_direct_lighting;
delete kernel_shadow_blocked_ao;
delete kernel_shadow_blocked_dl;
+ delete kernel_enqueue_inactive;
delete kernel_next_iteration_setup;
delete kernel_indirect_subsurface;
delete kernel_buffer_update;
@@ -101,6 +103,7 @@ bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_fe
LOAD_KERNEL(direct_lighting);
LOAD_KERNEL(shadow_blocked_ao);
LOAD_KERNEL(shadow_blocked_dl);
+ LOAD_KERNEL(enqueue_inactive);
LOAD_KERNEL(next_iteration_setup);
LOAD_KERNEL(indirect_subsurface);
LOAD_KERNEL(buffer_update);
@@ -256,6 +259,7 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(enqueue_inactive, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(indirect_subsurface, global_size, local_size);
ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h
index 68c2ba974a5..2bac1998cb7 100644
--- a/intern/cycles/device/device_split_kernel.h
+++ b/intern/cycles/device/device_split_kernel.h
@@ -69,6 +69,7 @@ private:
SplitKernelFunction *kernel_direct_lighting;
SplitKernelFunction *kernel_shadow_blocked_ao;
SplitKernelFunction *kernel_shadow_blocked_dl;
+ SplitKernelFunction *kernel_enqueue_inactive;
SplitKernelFunction *kernel_next_iteration_setup;
SplitKernelFunction *kernel_indirect_subsurface;
SplitKernelFunction *kernel_buffer_update;
diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h
index 52851061d7b..399fae9b42e 100644
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/opencl.h
@@ -130,6 +130,11 @@ public:
cl_int* error = NULL);
static cl_device_type get_device_type(cl_device_id device_id);
+ static bool get_driver_version(cl_device_id device_id,
+ int *major,
+ int *minor,
+ cl_int* error = NULL);
+
static int mem_address_alignment(cl_device_id device_id);
/* Get somewhat more readable device name.
diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp
index 76dcbd6fc9a..08b632ee9d3 100644
--- a/intern/cycles/device/opencl/opencl_split.cpp
+++ b/intern/cycles/device/opencl/opencl_split.cpp
@@ -176,17 +176,62 @@ protected:
friend class OpenCLSplitKernelFunction;
};
+struct CachedSplitMemory {
+ int id;
+ device_memory *split_data;
+ device_memory *ray_state;
+ device_ptr *rng_state;
+ device_memory *queue_index;
+ device_memory *use_queues_flag;
+ device_memory *work_pools;
+ device_ptr *buffer;
+};
+
class OpenCLSplitKernelFunction : public SplitKernelFunction {
public:
OpenCLDeviceSplitKernel* device;
OpenCLDeviceBase::OpenCLProgram program;
+ CachedSplitMemory& cached_memory;
+ int cached_id;
+
+ OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device, CachedSplitMemory& cached_memory) :
+ device(device), cached_memory(cached_memory), cached_id(cached_memory.id-1)
+ {
+ }
- OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device) : device(device) {}
- ~OpenCLSplitKernelFunction() { program.release(); }
+ ~OpenCLSplitKernelFunction()
+ {
+ program.release();
+ }
virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data)
{
- device->kernel_set_args(program(), 0, kg, data);
+ if(cached_id != cached_memory.id) {
+ cl_uint start_arg_index =
+ device->kernel_set_args(program(),
+ 0,
+ kg,
+ data,
+ *cached_memory.split_data,
+ *cached_memory.ray_state,
+ *cached_memory.rng_state);
+
+/* TODO(sergey): Avoid map lookup here. */
+#define KERNEL_TEX(type, ttype, name) \
+ device->set_kernel_arg_mem(program(), &start_arg_index, #name);
+#include "kernel/kernel_textures.h"
+#undef KERNEL_TEX
+
+ start_arg_index +=
+ device->kernel_set_args(program(),
+ start_arg_index,
+ *cached_memory.queue_index,
+ *cached_memory.use_queues_flag,
+ *cached_memory.work_pools,
+ *cached_memory.buffer);
+
+ cached_id = cached_memory.id;
+ }
device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
program(),
@@ -213,6 +258,7 @@ public:
class OpenCLSplitKernel : public DeviceSplitKernel {
OpenCLDeviceSplitKernel *device;
+ CachedSplitMemory cached_memory;
public:
explicit OpenCLSplitKernel(OpenCLDeviceSplitKernel *device) : DeviceSplitKernel(device), device(device) {
}
@@ -220,7 +266,7 @@ public:
virtual SplitKernelFunction* get_split_kernel_function(string kernel_name,
const DeviceRequestedFeatures& requested_features)
{
- OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device);
+ OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device, cached_memory);
bool single_program = OpenCLInfo::use_single_program();
kernel->program =
@@ -349,6 +395,15 @@ public:
return false;
}
+ cached_memory.split_data = &split_data;
+ cached_memory.ray_state = &ray_state;
+ cached_memory.rng_state = &rtile.rng_state;
+ cached_memory.queue_index = &queue_index;
+ cached_memory.use_queues_flag = &use_queues_flag;
+ cached_memory.work_pools = &work_pool_wgs;
+ cached_memory.buffer = &rtile.buffer;
+ cached_memory.id++;
+
return true;
}
diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp
index d27aa05c312..8ba2a8e26da 100644
--- a/intern/cycles/device/opencl/opencl_util.cpp
+++ b/intern/cycles/device/opencl/opencl_util.cpp
@@ -608,6 +608,14 @@ bool OpenCLInfo::device_supported(const string& platform_name,
if(!get_device_name(device_id, &device_name)) {
return false;
}
+
+ int driver_major = 0;
+ int driver_minor = 0;
+ if(!get_driver_version(device_id, &driver_major, &driver_minor)) {
+ return false;
+ }
+ VLOG(3) << "OpenCL driver version " << driver_major << "." << driver_minor;
+
/* It is possible tyo have Iris GPU on AMD/Apple OpenCL framework
* (aka, it will not be on Intel framework). This isn't supported
* and needs an explicit blacklist.
@@ -618,6 +626,21 @@ bool OpenCLInfo::device_supported(const string& platform_name,
if(platform_name == "AMD Accelerated Parallel Processing" &&
device_type == CL_DEVICE_TYPE_GPU)
{
+ if(driver_major < 2236) {
+ VLOG(1) << "AMD driver version " << driver_major << "." << driver_minor << " not supported.";
+ return false;
+ }
+ const char *blacklist[] = {
+ /* GCN 1 */
+ "Tahiti", "Pitcairn", "Capeverde", "Oland",
+ NULL
+ };
+ for (int i = 0; blacklist[i] != NULL; i++) {
+ if(device_name == blacklist[i]) {
+ VLOG(1) << "AMD device " << device_name << " not supported";
+ return false;
+ }
+ }
return true;
}
if(platform_name == "Apple" && device_type == CL_DEVICE_TYPE_GPU) {
@@ -1073,6 +1096,34 @@ string OpenCLInfo::get_readable_device_name(cl_device_id device_id)
return get_device_name(device_id);
}
+bool OpenCLInfo::get_driver_version(cl_device_id device_id,
+ int *major,
+ int *minor,
+ cl_int* error)
+{
+ char buffer[1024];
+ cl_int err;
+ if((err = clGetDeviceInfo(device_id,
+ CL_DRIVER_VERSION,
+ sizeof(buffer),
+ &buffer,
+ NULL)) != CL_SUCCESS)
+ {
+ if(error != NULL) {
+ *error = err;
+ }
+ return false;
+ }
+ if(error != NULL) {
+ *error = CL_SUCCESS;
+ }
+ if(sscanf(buffer, "%d.%d", major, minor) < 2) {
+ VLOG(1) << string_printf("OpenCL: failed to parse driver version string (%s).", buffer);
+ return false;
+ }
+ return true;
+}
+
int OpenCLInfo::mem_address_alignment(cl_device_id device_id)
{
int base_align_bits;
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index bef869f34b4..23e9bd311c4 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -45,6 +45,7 @@ set(SRC
kernels/opencl/kernel_direct_lighting.cl
kernels/opencl/kernel_shadow_blocked_ao.cl
kernels/opencl/kernel_shadow_blocked_dl.cl
+ kernels/opencl/kernel_enqueue_inactive.cl
kernels/opencl/kernel_next_iteration_setup.cl
kernels/opencl/kernel_indirect_subsurface.cl
kernels/opencl/kernel_buffer_update.cl
@@ -121,6 +122,10 @@ set(SRC_KERNELS_CUDA_HEADERS
kernels/cuda/kernel_config.h
)
+set(SRC_KERNELS_OPENCL_HEADERS
+ kernels/opencl/kernel_split_function.h
+)
+
set(SRC_CLOSURE_HEADERS
closure/alloc.h
closure/bsdf.h
@@ -278,6 +283,7 @@ set(SRC_SPLIT_HEADERS
split/kernel_data_init.h
split/kernel_direct_lighting.h
split/kernel_do_volume.h
+ split/kernel_enqueue_inactive.h
split/kernel_holdout_emission_blurring_pathtermination_ao.h
split/kernel_indirect_background.h
split/kernel_indirect_subsurface.h
@@ -450,6 +456,7 @@ add_library(cycles_kernel
${SRC_HEADERS}
${SRC_KERNELS_CPU_HEADERS}
${SRC_KERNELS_CUDA_HEADERS}
+ ${SRC_KERNELS_OPENCL_HEADERS}
${SRC_BVH_HEADERS}
${SRC_CLOSURE_HEADERS}
${SRC_FILTER_HEADERS}
@@ -490,9 +497,11 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_subsurface_sc
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked_ao.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked_dl.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_enqueue_inactive.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_subsurface.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_buffer_update.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_split_function.h" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/filter.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h
index 88afc00ccb3..3e752bce68f 100644
--- a/intern/cycles/kernel/filter/filter_nlm_cpu.h
+++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h
@@ -101,7 +101,7 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict d
for(int x = rect.x; x < rect.z; x++) {
const int low = max(rect.x, x-f);
const int high = min(rect.z, x+f+1);
- out_image[y*w+x] = expf(-max(out_image[y*w+x] * (1.0f/(high - low)), 0.0f));
+ out_image[y*w+x] = fast_expf(-max(out_image[y*w+x] * (1.0f/(high - low)), 0.0f));
}
}
}
diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h
index 62bd5be1de5..2c5ac807051 100644
--- a/intern/cycles/kernel/filter/filter_nlm_gpu.h
+++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h
@@ -66,7 +66,7 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y,
sum += difference_image[y*w+x1];
}
sum *= 1.0f/(high-low);
- out_image[y*w+x] = expf(-max(sum, 0.0f));
+ out_image[y*w+x] = fast_expf(-max(sum, 0.0f));
}
ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h
index 90a2816ddf7..25a3025056c 100644
--- a/intern/cycles/kernel/filter/filter_reconstruction.h
+++ b/intern/cycles/kernel/filter/filter_reconstruction.h
@@ -29,20 +29,24 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
ccl_global float3 *XtWY,
int localIdx)
{
+ if(weight < 1e-3f) {
+ return;
+ }
+
int p_offset = y *w + x;
int q_offset = (y+dy)*w + (x+dx);
-#ifdef __KERNEL_CPU__
- const int stride = 1;
- (void)storage_stride;
- (void)localIdx;
- float design_row[DENOISE_FEATURES+1];
-#elif defined(__KERNEL_CUDA__)
+#ifdef __KERNEL_GPU__
const int stride = storage_stride;
+#else
+ const int stride = 1;
+ (void) storage_stride;
+#endif
+
+#ifdef __KERNEL_CUDA__
ccl_local float shared_design_row[(DENOISE_FEATURES+1)*CCL_MAX_LOCAL_SIZE];
ccl_local_param float *design_row = shared_design_row + localIdx*(DENOISE_FEATURES+1);
#else
- const int stride = storage_stride;
float design_row[DENOISE_FEATURES+1];
#endif
@@ -70,13 +74,19 @@ ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h,
int4 buffer_params,
int sample)
{
-#ifdef __KERNEL_CPU__
- const int stride = 1;
- (void)storage_stride;
-#else
+#ifdef __KERNEL_GPU__
const int stride = storage_stride;
+#else
+ const int stride = 1;
+ (void) storage_stride;
#endif
+ if(XtWX[0] < 1e-3f) {
+ /* There is not enough information to determine a denoised result.
+ * As a fallback, keep the original value of the pixel. */
+ return;
+ }
+
/* The weighted average of pixel colors (essentially, the NLM-filtered image).
* In case the solution of the linear model fails due to numerical issues,
* fall back to this value. */
@@ -89,6 +99,9 @@ ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h,
final_color = mean_color;
}
+ /* Clamp pixel value to positive values. */
+ final_color = max(final_color, make_float3(0.0f, 0.0f, 0.0f));
+
ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z;
final_color *= sample;
if(buffer_params.w) {
@@ -101,6 +114,4 @@ ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h,
combined_buffer[2] = final_color.z;
}
-#undef STORAGE_TYPE
-
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index 06728415c15..175bd6b9737 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -621,25 +621,43 @@ ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance
{
float fac = 1.0f/num_samples;
+#ifdef __SPLIT_KERNEL__
+# define safe_float3_add(f, v) \
+ do { \
+ ccl_global float *p = (ccl_global float*)(&(f)); \
+ atomic_add_and_fetch_float(p+0, (v).x); \
+ atomic_add_and_fetch_float(p+1, (v).y); \
+ atomic_add_and_fetch_float(p+2, (v).z); \
+ } while(0)
+#else
+# define safe_float3_add(f, v) (f) += (v)
+#endif /* __SPLIT_KERNEL__ */
+
#ifdef __PASSES__
- L->direct_diffuse += L_sample->direct_diffuse*fac;
- L->direct_glossy += L_sample->direct_glossy*fac;
- L->direct_transmission += L_sample->direct_transmission*fac;
- L->direct_subsurface += L_sample->direct_subsurface*fac;
- L->direct_scatter += L_sample->direct_scatter*fac;
-
- L->indirect_diffuse += L_sample->indirect_diffuse*fac;
- L->indirect_glossy += L_sample->indirect_glossy*fac;
- L->indirect_transmission += L_sample->indirect_transmission*fac;
- L->indirect_subsurface += L_sample->indirect_subsurface*fac;
- L->indirect_scatter += L_sample->indirect_scatter*fac;
-
- L->background += L_sample->background*fac;
- L->ao += L_sample->ao*fac;
- L->shadow += L_sample->shadow*fac;
+ safe_float3_add(L->direct_diffuse, L_sample->direct_diffuse*fac);
+ safe_float3_add(L->direct_glossy, L_sample->direct_glossy*fac);
+ safe_float3_add(L->direct_transmission, L_sample->direct_transmission*fac);
+ safe_float3_add(L->direct_subsurface, L_sample->direct_subsurface*fac);
+ safe_float3_add(L->direct_scatter, L_sample->direct_scatter*fac);
+
+ safe_float3_add(L->indirect_diffuse, L_sample->indirect_diffuse*fac);
+ safe_float3_add(L->indirect_glossy, L_sample->indirect_glossy*fac);
+ safe_float3_add(L->indirect_transmission, L_sample->indirect_transmission*fac);
+ safe_float3_add(L->indirect_subsurface, L_sample->indirect_subsurface*fac);
+ safe_float3_add(L->indirect_scatter, L_sample->indirect_scatter*fac);
+
+ safe_float3_add(L->background, L_sample->background*fac);
+ safe_float3_add(L->ao, L_sample->ao*fac);
+ safe_float3_add(L->shadow, L_sample->shadow*fac);
+# ifdef __SPLIT_KERNEL__
+ atomic_add_and_fetch_float(&L->mist, L_sample->mist*fac);
+# else
L->mist += L_sample->mist*fac;
-#endif
- L->emission += L_sample->emission * fac;
+# endif /* __SPLIT_KERNEL__ */
+#endif /* __PASSES__ */
+ safe_float3_add(L->emission, L_sample->emission*fac);
+
+#undef safe_float3_add
}
#ifdef __SHADOW_TRICKS__
diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h
index 0fa77d9e8bd..5d92fd12201 100644
--- a/intern/cycles/kernel/kernel_path_state.h
+++ b/intern/cycles/kernel/kernel_path_state.h
@@ -139,9 +139,11 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathSta
/* random number generator next bounce */
state->rng_offset += PRNG_BOUNCE_NUM;
+#ifdef __DENOISING_FEATURES__
if((state->denoising_feature_weight == 0.0f) && !(state->flag & PATH_RAY_SHADOW_CATCHER)) {
state->flag &= ~PATH_RAY_STORE_SHADOW_INFO;
}
+#endif
}
ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *state)
diff --git a/intern/cycles/kernel/kernel_queues.h b/intern/cycles/kernel/kernel_queues.h
index 96bc636d5ac..e32d4bbbc1b 100644
--- a/intern/cycles/kernel/kernel_queues.h
+++ b/intern/cycles/kernel/kernel_queues.h
@@ -128,6 +128,21 @@ ccl_device unsigned int get_global_queue_index(
return my_gqidx;
}
+ccl_device int dequeue_ray_index(
+ int queue_number,
+ ccl_global int *queues,
+ int queue_size,
+ ccl_global int *queue_index)
+{
+ int index = atomic_fetch_and_dec_uint32((ccl_global uint*)&queue_index[queue_number])-1;
+
+ if(index < 0) {
+ return QUEUE_EMPTY_SLOT;
+ }
+
+ return queues[index + queue_number * queue_size];
+}
+
CCL_NAMESPACE_END
#endif // __KERNEL_QUEUE_H__
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index dbeaffdfb24..31e47e837fd 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -236,6 +236,9 @@ CCL_NAMESPACE_BEGIN
#ifdef __NO_PRINCIPLED__
# undef __PRINCIPLED__
#endif
+#ifdef __NO_DENOISING__
+# undef __DENOISING_FEATURES__
+#endif
/* Random Numbers */
@@ -1387,6 +1390,8 @@ enum QueueNumber {
#ifdef __BRANCHED_PATH__
/* All rays moving to next iteration of the indirect loop for light */
QUEUE_LIGHT_INDIRECT_ITER,
+ /* Queue of all inactive rays. These are candidates for sharing work of indirect loops */
+ QUEUE_INACTIVE_RAYS,
# ifdef __VOLUME__
/* All rays moving to next iteration of the indirect loop for volumes */
QUEUE_VOLUME_INDIRECT_ITER,
@@ -1429,6 +1434,9 @@ enum RayState {
RAY_BRANCHED_VOLUME_INDIRECT = (1 << 5),
RAY_BRANCHED_SUBSURFACE_INDIRECT = (1 << 6),
RAY_BRANCHED_INDIRECT = (RAY_BRANCHED_LIGHT_INDIRECT | RAY_BRANCHED_VOLUME_INDIRECT | RAY_BRANCHED_SUBSURFACE_INDIRECT),
+
+ /* Ray is evaluating an iteration of an indirect loop for another thread */
+ RAY_BRANCHED_INDIRECT_SHARED = (1 << 7),
};
#define ASSIGN_RAY_STATE(ray_state, ray_index, state) (ray_state[ray_index] = ((ray_state[ray_index] & RAY_FLAG_MASK) | state))
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
index 9895080d328..c8938534fe8 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
@@ -85,6 +85,7 @@ DECLARE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
DECLARE_SPLIT_KERNEL_FUNCTION(direct_lighting)
DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
+DECLARE_SPLIT_KERNEL_FUNCTION(enqueue_inactive)
DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update)
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
index 9b85a864153..d4315ee5ec4 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
@@ -53,6 +53,7 @@
# include "kernel/split/kernel_direct_lighting.h"
# include "kernel/split/kernel_shadow_blocked_ao.h"
# include "kernel/split/kernel_shadow_blocked_dl.h"
+# include "kernel/split/kernel_enqueue_inactive.h"
# include "kernel/split/kernel_next_iteration_setup.h"
# include "kernel/split/kernel_indirect_subsurface.h"
# include "kernel/split/kernel_buffer_update.h"
@@ -230,6 +231,7 @@ DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
index 8b7f1a8d405..628891b1458 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
@@ -39,6 +39,7 @@
#include "kernel/split/kernel_direct_lighting.h"
#include "kernel/split/kernel_shadow_blocked_ao.h"
#include "kernel/split/kernel_shadow_blocked_dl.h"
+#include "kernel/split/kernel_enqueue_inactive.h"
#include "kernel/split/kernel_next_iteration_setup.h"
#include "kernel/split/kernel_indirect_subsurface.h"
#include "kernel/split/kernel_buffer_update.h"
@@ -118,6 +119,7 @@ DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl b/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl
index db65c91baf7..dcea2630aef 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl
@@ -18,10 +18,9 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_buffer_update.h"
-__kernel void kernel_ocl_path_trace_buffer_update(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- ccl_local unsigned int local_queue_atomics;
- kernel_buffer_update((KernelGlobals*)kg, &local_queue_atomics);
-}
+#define KERNEL_NAME buffer_update
+#define LOCALS_TYPE unsigned int
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
index eb34f750881..ed64ae01aae 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
@@ -18,10 +18,9 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_direct_lighting.h"
-__kernel void kernel_ocl_path_trace_direct_lighting(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- ccl_local unsigned int local_queue_atomics;
- kernel_direct_lighting((KernelGlobals*)kg, &local_queue_atomics);
-}
+#define KERNEL_NAME direct_lighting
+#define LOCALS_TYPE unsigned int
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl b/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl
index 83ef5f5f3f2..8afaa686e28 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl
@@ -18,9 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_do_volume.h"
-__kernel void kernel_ocl_path_trace_do_volume(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- kernel_do_volume((KernelGlobals*)kg);
-}
+#define KERNEL_NAME do_volume
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl b/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl
new file mode 100644
index 00000000000..e68d4104a91
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_enqueue_inactive.h"
+
+#define KERNEL_NAME enqueue_inactive
+#define LOCALS_TYPE unsigned int
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
index d071b39aa6f..9e1e57beba6 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
@@ -18,12 +18,9 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
-__kernel void kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- ccl_local BackgroundAOLocals locals;
- kernel_holdout_emission_blurring_pathtermination_ao(
- (KernelGlobals*)kg,
- &locals);
-}
+#define KERNEL_NAME holdout_emission_blurring_pathtermination_ao
+#define LOCALS_TYPE BackgroundAOLocals
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl b/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl
index 8c213ff5cb2..192d01444ba 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl
@@ -18,9 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_indirect_background.h"
-__kernel void kernel_ocl_path_trace_indirect_background(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- kernel_indirect_background((KernelGlobals*)kg);
-}
+#define KERNEL_NAME indirect_background
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl b/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl
index 998ebc4c0c3..84938b889e5 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl
@@ -18,9 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_indirect_subsurface.h"
-__kernel void kernel_ocl_path_trace_indirect_subsurface(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- kernel_indirect_subsurface((KernelGlobals*)kg);
-}
+#define KERNEL_NAME indirect_subsurface
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
index 822d2287715..c314dc96c33 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
@@ -18,9 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_lamp_emission.h"
-__kernel void kernel_ocl_path_trace_lamp_emission(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- kernel_lamp_emission((KernelGlobals*)kg);
-}
+#define KERNEL_NAME lamp_emission
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
index 6d207253a40..8b1332bf013 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
@@ -18,10 +18,9 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_next_iteration_setup.h"
-__kernel void kernel_ocl_path_trace_next_iteration_setup(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- ccl_local unsigned int local_queue_atomics;
- kernel_next_iteration_setup((KernelGlobals*)kg, &local_queue_atomics);
-}
+#define KERNEL_NAME next_iteration_setup
+#define LOCALS_TYPE unsigned int
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl
index bd9aa9538c8..fa210e747c0 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl
@@ -18,9 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_path_init.h"
-__kernel void kernel_ocl_path_trace_path_init(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- kernel_path_init((KernelGlobals*)kg);
-}
+#define KERNEL_NAME path_init
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
index 9be154e3d75..68ee6f1d536 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
@@ -18,10 +18,9 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_queue_enqueue.h"
-__kernel void kernel_ocl_path_trace_queue_enqueue(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- ccl_local QueueEnqueueLocals locals;
- kernel_queue_enqueue((KernelGlobals*)kg, &locals);
-}
+#define KERNEL_NAME queue_enqueue
+#define LOCALS_TYPE QueueEnqueueLocals
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
index eb4fb4d153a..10d09377ba9 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
@@ -18,9 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_scene_intersect.h"
-__kernel void kernel_ocl_path_trace_scene_intersect(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- kernel_scene_intersect((KernelGlobals*)kg);
-}
+#define KERNEL_NAME scene_intersect
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
index 5bfb31b193a..40eaa561863 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
@@ -18,9 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_shader_eval.h"
-__kernel void kernel_ocl_path_trace_shader_eval(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- kernel_shader_eval((KernelGlobals*)kg);
-}
+#define KERNEL_NAME shader_eval
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl
index 38bfd04ad4c..8c36100f762 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl
@@ -18,10 +18,9 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_shader_setup.h"
-__kernel void kernel_ocl_path_trace_shader_setup(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- ccl_local unsigned int local_queue_atomics;
- kernel_shader_setup((KernelGlobals*)kg, &local_queue_atomics);
-}
+#define KERNEL_NAME shader_setup
+#define LOCALS_TYPE unsigned int
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl
index 6f722915d45..bcacaa4a054 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl
@@ -19,10 +19,9 @@
#include "kernel/split/kernel_shader_sort.h"
__attribute__((reqd_work_group_size(64, 1, 1)))
-__kernel void kernel_ocl_path_trace_shader_sort(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- ccl_local ShaderSortLocals locals;
- kernel_shader_sort((KernelGlobals*)kg, &locals);
-}
+#define KERNEL_NAME shader_sort
+#define LOCALS_TYPE ShaderSortLocals
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+#undef LOCALS_TYPE
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl
index 6a8ef81b32a..8de250a375c 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl
@@ -18,9 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_shadow_blocked_ao.h"
-__kernel void kernel_ocl_path_trace_shadow_blocked_ao(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- kernel_shadow_blocked_ao((KernelGlobals*)kg);
-}
+#define KERNEL_NAME shadow_blocked_ao
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl
index b255cc5ef8b..29da77022ed 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl
@@ -18,9 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_shadow_blocked_dl.h"
-__kernel void kernel_ocl_path_trace_shadow_blocked_dl(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- kernel_shadow_blocked_dl((KernelGlobals*)kg);
-}
+#define KERNEL_NAME shadow_blocked_dl
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split.cl b/intern/cycles/kernel/kernels/opencl/kernel_split.cl
index 8de82db7afe..651addb02f4 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_split.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_split.cl
@@ -31,6 +31,7 @@
#include "kernel/kernels/opencl/kernel_direct_lighting.cl"
#include "kernel/kernels/opencl/kernel_shadow_blocked_ao.cl"
#include "kernel/kernels/opencl/kernel_shadow_blocked_dl.cl"
+#include "kernel/kernels/opencl/kernel_enqueue_inactive.cl"
#include "kernel/kernels/opencl/kernel_next_iteration_setup.cl"
#include "kernel/kernels/opencl/kernel_indirect_subsurface.cl"
#include "kernel/kernels/opencl/kernel_buffer_update.cl"
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h
new file mode 100644
index 00000000000..f1e914a70d4
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define KERNEL_NAME_JOIN(a, b) a ## _ ## b
+#define KERNEL_NAME_EVAL(a, b) KERNEL_NAME_JOIN(a, b)
+
+__kernel void KERNEL_NAME_EVAL(kernel_ocl_path_trace, KERNEL_NAME)(
+ ccl_global char *kg_global,
+ ccl_constant KernelData *data,
+
+ ccl_global void *split_data_buffer,
+ ccl_global char *ray_state,
+ ccl_global uint *rng_state,
+
+#define KERNEL_TEX(type, ttype, name) \
+ ccl_global type *name,
+#include "kernel/kernel_textures.h"
+
+ ccl_global int *queue_index,
+ ccl_global char *use_queues_flag,
+ ccl_global unsigned int *work_pools,
+ ccl_global float *buffer
+ )
+{
+#ifdef LOCALS_TYPE
+ ccl_local LOCALS_TYPE locals;
+#endif
+
+ KernelGlobals *kg = (KernelGlobals*)kg_global;
+
+ if(ccl_local_id(0) + ccl_local_id(1) == 0) {
+ kg->data = data;
+
+ kernel_split_params.rng_state = rng_state;
+ kernel_split_params.queue_index = queue_index;
+ kernel_split_params.use_queues_flag = use_queues_flag;
+ kernel_split_params.work_pools = work_pools;
+ kernel_split_params.buffer = buffer;
+
+ split_data_init(kg, &kernel_split_state, ccl_global_size(0)*ccl_global_size(1), split_data_buffer, ray_state);
+
+#define KERNEL_TEX(type, ttype, name) \
+ kg->name = name;
+#include "kernel/kernel_textures.h"
+ }
+
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ KERNEL_NAME_EVAL(kernel, KERNEL_NAME)(
+ kg
+#ifdef LOCALS_TYPE
+ , &locals
+#endif
+ );
+}
+
+#undef KERNEL_NAME_JOIN
+#undef KERNEL_NAME_EVAL
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl
index 99b74a1802b..2b3be38df84 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl
@@ -18,9 +18,7 @@
#include "kernel/split/kernel_split_common.h"
#include "kernel/split/kernel_subsurface_scatter.h"
-__kernel void kernel_ocl_path_trace_subsurface_scatter(
- ccl_global char *kg,
- ccl_constant KernelData *data)
-{
- kernel_subsurface_scatter((KernelGlobals*)kg);
-}
+#define KERNEL_NAME subsurface_scatter
+#include "kernel/kernels/opencl/kernel_split_function.h"
+#undef KERNEL_NAME
+
diff --git a/intern/cycles/kernel/split/kernel_branched.h b/intern/cycles/kernel/split/kernel_branched.h
index dc74a2ada53..e2762a85fc8 100644
--- a/intern/cycles/kernel/split/kernel_branched.h
+++ b/intern/cycles/kernel/split/kernel_branched.h
@@ -63,12 +63,49 @@ ccl_device_inline void kernel_split_branched_path_indirect_loop_end(KernelGlobal
REMOVE_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT);
}
+ccl_device_inline bool kernel_split_branched_indirect_start_shared(KernelGlobals *kg, int ray_index)
+{
+ ccl_global char *ray_state = kernel_split_state.ray_state;
+
+ int inactive_ray = dequeue_ray_index(QUEUE_INACTIVE_RAYS,
+ kernel_split_state.queue_data, kernel_split_params.queue_size, kernel_split_params.queue_index);
+
+ if(!IS_STATE(ray_state, inactive_ray, RAY_INACTIVE)) {
+ return false;
+ }
+
+#define SPLIT_DATA_ENTRY(type, name, num) \
+ kernel_split_state.name[inactive_ray] = kernel_split_state.name[ray_index];
+ SPLIT_DATA_ENTRIES_BRANCHED_SHARED
+#undef SPLIT_DATA_ENTRY
+
+ kernel_split_state.branched_state[inactive_ray].shared_sample_count = 0;
+ kernel_split_state.branched_state[inactive_ray].original_ray = ray_index;
+ kernel_split_state.branched_state[inactive_ray].waiting_on_shared_samples = false;
+
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ PathRadiance *inactive_L = &kernel_split_state.path_radiance[inactive_ray];
+
+ path_radiance_init(inactive_L, kernel_data.film.use_light_pass);
+ inactive_L->direct_throughput = L->direct_throughput;
+ path_radiance_copy_indirect(inactive_L, L);
+
+ ray_state[inactive_ray] = RAY_REGENERATED;
+ ADD_RAY_FLAG(ray_state, inactive_ray, RAY_BRANCHED_INDIRECT_SHARED);
+ ADD_RAY_FLAG(ray_state, inactive_ray, IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT));
+
+ atomic_fetch_and_inc_uint32((ccl_global uint*)&kernel_split_state.branched_state[ray_index].shared_sample_count);
+
+ return true;
+}
+
/* bounce off surface and integrate indirect light */
ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(KernelGlobals *kg,
int ray_index,
float num_samples_adjust,
ShaderData *saved_sd,
- bool reset_path_state)
+ bool reset_path_state,
+ bool wait_for_shared)
{
SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
@@ -155,12 +192,25 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(
/* start the indirect path */
*tp *= num_samples_inv;
+ if(kernel_split_branched_indirect_start_shared(kg, ray_index)) {
+ continue;
+ }
+
return true;
}
branched_state->next_sample = 0;
}
+ branched_state->next_closure = sd->num_closure;
+
+ if(wait_for_shared) {
+ branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0);
+ if(branched_state->waiting_on_shared_samples) {
+ return true;
+ }
+ }
+
return false;
}
diff --git a/intern/cycles/kernel/split/kernel_do_volume.h b/intern/cycles/kernel/split/kernel_do_volume.h
index 694b777f429..9f8dd2392d9 100644
--- a/intern/cycles/kernel/split/kernel_do_volume.h
+++ b/intern/cycles/kernel/split/kernel_do_volume.h
@@ -75,11 +75,30 @@ ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(K
branched_state->next_sample = j+1;
branched_state->num_samples = num_samples;
+ /* Attempting to share too many samples is slow for volumes as it causes us to
+ * loop here more and have many calls to kernel_volume_integrate which evaluates
+ * shaders. The many expensive shader evaluations cause the work load to become
+ * unbalanced and many threads to become idle in this kernel. Limiting the
+ * number of shared samples here helps quite a lot.
+ */
+ if(branched_state->shared_sample_count < 2) {
+ if(kernel_split_branched_indirect_start_shared(kg, ray_index)) {
+ continue;
+ }
+ }
+
return true;
}
# endif
}
+ branched_state->next_sample = num_samples;
+
+ branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0);
+ if(branched_state->waiting_on_shared_samples) {
+ return true;
+ }
+
kernel_split_branched_path_indirect_loop_end(kg, ray_index);
/* todo: avoid this calculation using decoupled ray marching */
diff --git a/intern/cycles/kernel/split/kernel_enqueue_inactive.h b/intern/cycles/kernel/split/kernel_enqueue_inactive.h
new file mode 100644
index 00000000000..496355bbc3a
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_enqueue_inactive.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_enqueue_inactive(KernelGlobals *kg,
+ ccl_local_param unsigned int *local_queue_atomics)
+{
+#ifdef __BRANCHED_PATH__
+ /* Enqeueue RAY_INACTIVE rays into QUEUE_INACTIVE_RAYS queue. */
+ if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+ *local_queue_atomics = 0;
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+
+ char enqueue_flag = 0;
+ if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_INACTIVE)) {
+ enqueue_flag = 1;
+ }
+
+ enqueue_ray_index_local(ray_index,
+ QUEUE_INACTIVE_RAYS,
+ enqueue_flag,
+ kernel_split_params.queue_size,
+ local_queue_atomics,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_index);
+#endif /* __BRANCHED_PATH__ */
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
index 71017fed19e..7758e35fd32 100644
--- a/intern/cycles/kernel/split/kernel_next_iteration_setup.h
+++ b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
@@ -147,6 +147,7 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
ray_index,
1.0f,
&kernel_split_state.branched_state[ray_index].sd,
+ true,
true))
{
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
@@ -193,6 +194,7 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
ray_index,
1.0f,
&kernel_split_state.branched_state[ray_index].sd,
+ true,
true))
{
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h b/intern/cycles/kernel/split/kernel_scene_intersect.h
index 5dc94caec85..45984ca509b 100644
--- a/intern/cycles/kernel/split/kernel_scene_intersect.h
+++ b/intern/cycles/kernel/split/kernel_scene_intersect.h
@@ -43,11 +43,21 @@ ccl_device void kernel_scene_intersect(KernelGlobals *kg)
}
/* All regenerated rays become active here */
- if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED))
- ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE);
+ if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) {
+#ifdef __BRANCHED_PATH__
+ if(kernel_split_state.branched_state[ray_index].waiting_on_shared_samples) {
+ kernel_split_path_end(kg, ray_index);
+ }
+ else
+#endif /* __BRANCHED_PATH__ */
+ {
+ ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE);
+ }
+ }
- if(!IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE))
+ if(!IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
return;
+ }
#ifdef __KERNEL_DEBUG__
DebugData *debug_data = &kernel_split_state.debug_data[ray_index];
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
index 386fbbc4d09..78e61709b01 100644
--- a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
+++ b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
@@ -29,6 +29,14 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
kernel_split_state.queue_data, kernel_split_params.queue_size, 1);
}
+#ifdef __BRANCHED_PATH__
+ /* TODO(mai): move this somewhere else? */
+ if(thread_index == 0) {
+ /* Clear QUEUE_INACTIVE_RAYS before next kernel. */
+ kernel_split_params.queue_index[QUEUE_INACTIVE_RAYS] = 0;
+ }
+#endif /* __BRANCHED_PATH__ */
+
if(ray_index == QUEUE_EMPTY_SLOT)
return;
diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h
index 57f070d51e0..08f0124b529 100644
--- a/intern/cycles/kernel/split/kernel_split_common.h
+++ b/intern/cycles/kernel/split/kernel_split_common.h
@@ -56,7 +56,20 @@ ccl_device_inline void kernel_split_path_end(KernelGlobals *kg, int ray_index)
ccl_global char *ray_state = kernel_split_state.ray_state;
#ifdef __BRANCHED_PATH__
- if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT)) {
+ if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT_SHARED)) {
+ int orig_ray = kernel_split_state.branched_state[ray_index].original_ray;
+
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ PathRadiance *orig_ray_L = &kernel_split_state.path_radiance[orig_ray];
+
+ path_radiance_sum_indirect(L);
+ path_radiance_accum_sample(orig_ray_L, L, 1);
+
+ atomic_fetch_and_dec_uint32((ccl_global uint*)&kernel_split_state.branched_state[orig_ray].shared_sample_count);
+
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
+ }
+ else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT)) {
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER);
}
else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT)) {
diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h
index bb1aca2acbf..4bb2f0d3d80 100644
--- a/intern/cycles/kernel/split/kernel_split_data_types.h
+++ b/intern/cycles/kernel/split/kernel_split_data_types.h
@@ -95,6 +95,10 @@ typedef ccl_global struct SplitBranchedState {
VolumeStack volume_stack[VOLUME_STACK_SIZE];
# endif /* __VOLUME__ */
#endif /*__SUBSURFACE__ */
+
+ int shared_sample_count; /* number of branched samples shared with other threads */
+ int original_ray; /* index of original ray when sharing branched samples */
+ bool waiting_on_shared_samples;
} SplitBranchedState;
#define SPLIT_DATA_BRANCHED_ENTRIES \
@@ -137,6 +141,25 @@ typedef ccl_global struct SplitBranchedState {
SPLIT_DATA_BRANCHED_ENTRIES \
SPLIT_DATA_DEBUG_ENTRIES \
+/* entries to be copied to inactive rays when sharing branched samples (TODO: which are actually needed?) */
+#define SPLIT_DATA_ENTRIES_BRANCHED_SHARED \
+ SPLIT_DATA_ENTRY(ccl_global RNG, rng, 1) \
+ SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \
+ SPLIT_DATA_ENTRY(ccl_global float, L_transparent, 1) \
+ SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \
+ SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \
+ SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \
+ SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \
+ SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \
+ SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \
+ SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \
+ SPLIT_DATA_ENTRY(ShaderData, sd, 1) \
+ SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \
+ SPLIT_DATA_SUBSURFACE_ENTRIES \
+ SPLIT_DATA_VOLUME_ENTRIES \
+ SPLIT_DATA_BRANCHED_ENTRIES \
+ SPLIT_DATA_DEBUG_ENTRIES \
+
/* struct that holds pointers to data in the shared state buffer */
typedef struct SplitData {
#define SPLIT_DATA_ENTRY(type, name, num) type *name;
diff --git a/intern/cycles/kernel/split/kernel_subsurface_scatter.h b/intern/cycles/kernel/split/kernel_subsurface_scatter.h
index 1dffe1b179e..4998714f28c 100644
--- a/intern/cycles/kernel/split/kernel_subsurface_scatter.h
+++ b/intern/cycles/kernel/split/kernel_subsurface_scatter.h
@@ -169,6 +169,7 @@ ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_it
ray_index,
num_samples_inv,
bssrdf_sd,
+ false,
false))
{
branched_state->ss_next_closure = i;
@@ -187,6 +188,13 @@ ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_it
branched_state->ss_next_sample = 0;
}
+ branched_state->ss_next_closure = sd->num_closure;
+
+ branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0);
+ if(branched_state->waiting_on_shared_samples) {
+ return true;
+ }
+
kernel_split_branched_path_indirect_loop_end(kg, ray_index);
return false;
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index 08909943c49..ae462a1084a 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -722,6 +722,7 @@ DeviceRequestedFeatures Session::get_requested_device_features()
requested_features.use_baking = bake_manager->get_baking();
requested_features.use_integrator_branched = (scene->integrator->method == Integrator::BRANCHED_PATH);
requested_features.use_transparent &= scene->integrator->transparent_shadows;
+ requested_features.use_denoising = params.use_denoising;
return requested_features;
}
diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h
index 6c52117ef9a..643af87a65f 100644
--- a/intern/cycles/util/util_atomic.h
+++ b/intern/cycles/util/util_atomic.h
@@ -35,6 +35,7 @@ ATOMIC_INLINE void atomic_update_max_z(size_t *maximum_value, size_t value)
#define atomic_add_and_fetch_float(p, x) atomic_add_and_fetch_fl((p), (x))
#define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1)
+#define atomic_fetch_and_dec_uint32(p) atomic_fetch_and_add_uint32((p), -1)
#define CCL_LOCAL_MEM_FENCE 0
#define ccl_barrier(flags) (void)0
@@ -68,6 +69,7 @@ ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *so
#define atomic_fetch_and_add_uint32(p, x) atomic_add((p), (x))
#define atomic_fetch_and_inc_uint32(p) atomic_inc((p))
+#define atomic_fetch_and_dec_uint32(p) atomic_dec((p))
#define CCL_LOCAL_MEM_FENCE CLK_LOCAL_MEM_FENCE
#define ccl_barrier(flags) barrier(flags)
@@ -79,7 +81,9 @@ ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *so
#define atomic_add_and_fetch_float(p, x) (atomicAdd((float*)(p), (float)(x)) + (float)(x))
#define atomic_fetch_and_add_uint32(p, x) atomicAdd((unsigned int*)(p), (unsigned int)(x))
+#define atomic_fetch_and_sub_uint32(p, x) atomicSub((unsigned int*)(p), (unsigned int)(x))
#define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1)
+#define atomic_fetch_and_dec_uint32(p) atomic_fetch_and_sub_uint32((p), 1)
#define CCL_LOCAL_MEM_FENCE
#define ccl_barrier(flags) __syncthreads()