git.blender.org/blender.git

author    Campbell Barton <ideasman42@gmail.com>  2017-03-09 08:33:20 +0300
committer Campbell Barton <ideasman42@gmail.com>  2017-03-09 08:41:33 +0300
commit    a68c631cf86ebc878ac5b96ef5fb69db15be9161 (patch)
tree      80508a8291502a07ad999a9f385bdc854be0f22d
parent    8d98362710798fe9e40e6a24c32306e4d4edfef0 (diff)
parent    817e975dee27640947bf7d083db39d1d70120385 (diff)
Merge branch 'master' into blender2.8
-rw-r--r-- extern/cuew/include/cuew.h | 2
-rw-r--r-- intern/cycles/blender/addon/properties.py | 2
-rw-r--r-- intern/cycles/blender/addon/ui.py | 2
-rw-r--r-- intern/cycles/blender/blender_python.cpp | 2
-rw-r--r-- intern/cycles/device/CMakeLists.txt | 3
-rw-r--r-- intern/cycles/device/device.cpp | 2
-rw-r--r-- intern/cycles/device/device.h | 2
-rw-r--r-- intern/cycles/device/device_cpu.cpp | 359
-rw-r--r-- intern/cycles/device/device_cuda.cpp | 334
-rw-r--r-- intern/cycles/device/device_memory.h | 30
-rw-r--r-- intern/cycles/device/device_multi.cpp | 4
-rw-r--r-- intern/cycles/device/device_network.cpp | 10
-rw-r--r-- intern/cycles/device/device_split_kernel.cpp | 281
-rw-r--r-- intern/cycles/device/device_split_kernel.h | 127
-rw-r--r-- intern/cycles/device/device_task.h | 2
-rw-r--r-- intern/cycles/device/opencl/opencl.h | 82
-rw-r--r-- intern/cycles/device/opencl/opencl_base.cpp | 87
-rw-r--r-- intern/cycles/device/opencl/opencl_mega.cpp | 3
-rw-r--r-- intern/cycles/device/opencl/opencl_split.cpp | 1465
-rw-r--r-- intern/cycles/device/opencl/opencl_util.cpp | 10
-rw-r--r-- intern/cycles/kernel/CMakeLists.txt | 65
-rw-r--r-- intern/cycles/kernel/closure/alloc.h | 20
-rw-r--r-- intern/cycles/kernel/closure/bsdf.h | 116
-rw-r--r-- intern/cycles/kernel/geom/geom_attribute.h | 8
-rw-r--r-- intern/cycles/kernel/geom/geom_curve.h | 92
-rw-r--r-- intern/cycles/kernel/geom/geom_motion_triangle_intersect.h | 8
-rw-r--r-- intern/cycles/kernel/geom/geom_motion_triangle_shader.h | 36
-rw-r--r-- intern/cycles/kernel/geom/geom_object.h | 36
-rw-r--r-- intern/cycles/kernel/geom/geom_patch.h | 6
-rw-r--r-- intern/cycles/kernel/geom/geom_primitive.h | 36
-rw-r--r-- intern/cycles/kernel/geom/geom_subd_triangle.h | 48
-rw-r--r-- intern/cycles/kernel/geom/geom_triangle.h | 40
-rw-r--r-- intern/cycles/kernel/geom/geom_triangle_intersect.h | 8
-rw-r--r-- intern/cycles/kernel/geom/geom_volume.h | 4
-rw-r--r-- intern/cycles/kernel/kernel.h | 2
-rw-r--r-- intern/cycles/kernel/kernel_camera.h | 4
-rw-r--r-- intern/cycles/kernel/kernel_compat_cpu.h | 9
-rw-r--r-- intern/cycles/kernel/kernel_compat_cuda.h | 47
-rw-r--r-- intern/cycles/kernel/kernel_compat_opencl.h | 10
-rw-r--r-- intern/cycles/kernel/kernel_emission.h | 20
-rw-r--r-- intern/cycles/kernel/kernel_globals.h | 15
-rw-r--r-- intern/cycles/kernel/kernel_passes.h | 22
-rw-r--r-- intern/cycles/kernel/kernel_path.h | 10
-rw-r--r-- intern/cycles/kernel/kernel_path_branched.h | 18
-rw-r--r-- intern/cycles/kernel/kernel_path_surface.h | 38
-rw-r--r-- intern/cycles/kernel/kernel_queues.h | 33
-rw-r--r-- intern/cycles/kernel/kernel_shader.h | 326
-rw-r--r-- intern/cycles/kernel/kernel_shadow.h | 12
-rw-r--r-- intern/cycles/kernel/kernel_subsurface.h | 8
-rw-r--r-- intern/cycles/kernel/kernel_types.h | 123
-rw-r--r-- intern/cycles/kernel/kernel_work_stealing.h | 211
-rw-r--r-- intern/cycles/kernel/kernels/cpu/kernel_cpu.h | 35
-rw-r--r-- intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h | 89
-rw-r--r-- intern/cycles/kernel/kernels/cpu/kernel_split.cpp | 63
-rw-r--r-- intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp | 38
-rw-r--r-- intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp | 40
-rw-r--r-- intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp | 34
-rw-r--r-- intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp | 36
-rw-r--r-- intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp | 37
-rw-r--r-- intern/cycles/kernel/kernels/cuda/kernel.cu | 100
-rw-r--r-- intern/cycles/kernel/kernels/cuda/kernel_config.h | 110
-rw-r--r-- intern/cycles/kernel/kernels/cuda/kernel_split.cu | 125
-rw-r--r-- intern/cycles/kernel/kernels/opencl/kernel.cl | 32
-rw-r--r-- intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl | 109
-rw-r--r-- intern/cycles/kernel/kernels/opencl/kernel_data_init.cl | 64
-rw-r--r-- intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl | 71
-rw-r--r-- intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl | 107
-rw-r--r-- intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl | 64
-rw-r--r-- intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl | 98
-rw-r--r-- intern/cycles/kernel/kernels/opencl/kernel_path_init.cl | 26
-rw-r--r-- intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl | 92
-rw-r--r-- intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl | 64
-rw-r--r-- intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl | 52
-rw-r--r-- intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl | 49
-rw-r--r-- intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl | 29
-rw-r--r-- intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl | 38
-rw-r--r-- intern/cycles/kernel/osl/osl_bssrdf.cpp | 6
-rw-r--r-- intern/cycles/kernel/osl/osl_closures.cpp | 1
-rw-r--r-- intern/cycles/kernel/osl/osl_services.cpp | 1
-rw-r--r-- intern/cycles/kernel/osl/osl_shader.cpp | 1
-rw-r--r-- intern/cycles/kernel/split/kernel_background_buffer_update.h | 181
-rw-r--r-- intern/cycles/kernel/split/kernel_data_init.h | 252
-rw-r--r-- intern/cycles/kernel/split/kernel_direct_lighting.h | 86
-rw-r--r-- intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h | 175
-rw-r--r-- intern/cycles/kernel/split/kernel_lamp_emission.h | 58
-rw-r--r-- intern/cycles/kernel/split/kernel_next_iteration_setup.h | 118
-rw-r--r-- intern/cycles/kernel/split/kernel_path_init.h | 100
-rw-r--r-- intern/cycles/kernel/split/kernel_queue_enqueue.h | 102
-rw-r--r-- intern/cycles/kernel/split/kernel_scene_intersect.h | 58
-rw-r--r-- intern/cycles/kernel/split/kernel_shader_eval.h | 58
-rw-r--r-- intern/cycles/kernel/split/kernel_shadow_blocked.h | 56
-rw-r--r-- intern/cycles/kernel/split/kernel_split_common.h | 16
-rw-r--r-- intern/cycles/kernel/split/kernel_split_data.h | 57
-rw-r--r-- intern/cycles/kernel/split/kernel_split_data_types.h | 109
-rw-r--r-- intern/cycles/kernel/split/kernel_sum_all_radiance.h | 59
-rw-r--r-- intern/cycles/kernel/svm/svm.h | 2
-rw-r--r-- intern/cycles/kernel/svm/svm_attribute.h | 2
-rw-r--r-- intern/cycles/kernel/svm/svm_bump.h | 18
-rw-r--r-- intern/cycles/kernel/svm/svm_camera.h | 2
-rw-r--r-- intern/cycles/kernel/svm/svm_closure.h | 142
-rw-r--r-- intern/cycles/kernel/svm/svm_displace.h | 10
-rw-r--r-- intern/cycles/kernel/svm/svm_fresnel.h | 14
-rw-r--r-- intern/cycles/kernel/svm/svm_geometry.h | 42
-rw-r--r-- intern/cycles/kernel/svm/svm_image.h | 4
-rw-r--r-- intern/cycles/kernel/svm/svm_light_path.h | 10
-rw-r--r-- intern/cycles/kernel/svm/svm_tex_coord.h | 112
-rw-r--r-- intern/cycles/kernel/svm/svm_vector_transform.h | 2
-rw-r--r-- intern/cycles/kernel/svm/svm_wireframe.h | 28
-rw-r--r-- intern/cycles/render/background.h | 2
-rw-r--r-- intern/cycles/render/bake.cpp | 4
-rw-r--r-- intern/cycles/render/buffers.cpp | 4
-rw-r--r-- intern/cycles/render/camera.h | 2
-rw-r--r-- intern/cycles/render/film.h | 2
-rw-r--r-- intern/cycles/render/graph.h | 4
-rw-r--r-- intern/cycles/render/integrator.h | 2
-rw-r--r-- intern/cycles/render/light.cpp | 4
-rw-r--r-- intern/cycles/render/mesh.h | 2
-rw-r--r-- intern/cycles/render/mesh_displace.cpp | 4
-rw-r--r-- intern/cycles/render/object.h | 2
-rw-r--r-- intern/cycles/render/session.cpp | 6
-rw-r--r-- intern/cycles/render/shader.h | 2
-rw-r--r-- intern/cycles/util/util_atomic.h | 28
-rw-r--r-- intern/cycles/util/util_debug.cpp | 13
-rw-r--r-- intern/cycles/util/util_debug.h | 6
-rw-r--r-- intern/cycles/util/util_logging.cpp | 9
-rw-r--r-- intern/cycles/util/util_logging.h | 3
-rw-r--r-- intern/cycles/util/util_path.cpp | 2
-rw-r--r-- intern/cycles/util/util_ssef.h | 8
-rw-r--r-- intern/cycles/util/util_types.h | 23
-rw-r--r-- release/scripts/presets/interface_theme/back_to_black.xml | 119
-rw-r--r-- release/scripts/presets/keyconfig/3dsmax.py | 12
-rw-r--r-- release/scripts/presets/keyconfig/maya.py | 3
-rw-r--r-- source/blender/blenlib/BLI_rect.h | 4
-rw-r--r-- source/blender/blenlib/BLI_task.h | 4
-rw-r--r-- source/blender/blenlib/intern/rct.c | 4
-rw-r--r-- source/blender/blenlib/intern/task.c | 293
-rw-r--r-- source/blender/compositor/operations/COM_ConvolutionEdgeFilterOperation.cpp | 6
-rw-r--r-- source/blender/compositor/operations/COM_ConvolutionFilterOperation.cpp | 6
-rw-r--r-- source/blender/depsgraph/intern/eval/deg_eval.cc | 130
-rw-r--r-- source/blender/depsgraph/intern/eval/deg_eval_flush.cc | 5
-rw-r--r-- source/blender/editors/armature/armature_select.c | 4
-rw-r--r-- source/blender/editors/armature/editarmature_sketch.c | 2
-rw-r--r-- source/blender/editors/interface/interface_layout.c | 7
-rw-r--r-- source/blender/editors/metaball/mball_edit.c | 2
-rw-r--r-- source/blender/editors/render/render_opengl.c | 18
-rw-r--r-- source/blender/editors/space_node/node_edit.c | 2
-rw-r--r-- source/blender/editors/space_view3d/space_view3d.c | 1
-rw-r--r-- source/blender/editors/space_view3d/view3d_edit.c | 2
-rw-r--r-- source/blender/editors/space_view3d/view3d_intern.h | 2
-rw-r--r-- source/blender/editors/space_view3d/view3d_select.c | 34
-rw-r--r-- source/blender/editors/space_view3d/view3d_view.c | 8
-rw-r--r-- source/blender/editors/transform/transform_manipulator.c | 18
-rw-r--r-- source/blender/gpu/GPU_select.h | 4
-rw-r--r-- source/blender/gpu/intern/gpu_select.c | 6
-rw-r--r-- source/blender/windowmanager/manipulators/intern/wm_manipulatormap.c | 13
-rw-r--r-- source/creator/creator_args.c | 2
156 files changed, 4671 insertions(+), 3896 deletions(-)
diff --git a/extern/cuew/include/cuew.h b/extern/cuew/include/cuew.h
index 19087117667..4cce29d38ab 100644
--- a/extern/cuew/include/cuew.h
+++ b/extern/cuew/include/cuew.h
@@ -114,7 +114,7 @@ extern "C" {
#define cuGLGetDevices cuGLGetDevices_v2
/* Types. */
-#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
+#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64) || defined(__aarch64__)
typedef unsigned long long CUdeviceptr;
#else
typedef unsigned int CUdeviceptr;
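
Note: this hunk widens CUdeviceptr to 64 bits on AArch64; without it, 64-bit ARM builds would truncate device addresses to unsigned int. A minimal compile-time guard one could add to catch such mismatches (a sketch, not part of this patch; it assumes host and device pointer widths match, as cuew does):

	#include <stdint.h>

	/* Hypothetical guard: fail the build if CUdeviceptr cannot hold a
	 * host-pointer-sized address on this platform. */
	static_assert(sizeof(CUdeviceptr) >= sizeof(uintptr_t),
	              "CUdeviceptr is too narrow for this platform");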
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 5c51f9afc28..ca109734314 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -665,8 +665,10 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
cls.debug_use_cpu_sse3 = BoolProperty(name="SSE3", default=True)
cls.debug_use_cpu_sse2 = BoolProperty(name="SSE2", default=True)
cls.debug_use_qbvh = BoolProperty(name="QBVH", default=True)
+ cls.debug_use_cpu_split_kernel = BoolProperty(name="Split Kernel", default=False)
cls.debug_use_cuda_adaptive_compile = BoolProperty(name="Adaptive Compile", default=False)
+ cls.debug_use_cuda_split_kernel = BoolProperty(name="Split Kernel", default=False)
cls.debug_opencl_kernel_type = EnumProperty(
name="OpenCL Kernel Type",
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index eb89f0b1efa..688d025e7d7 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -1518,10 +1518,12 @@ class CyclesRender_PT_debug(CyclesButtonsPanel, Panel):
row.prop(cscene, "debug_use_cpu_avx", toggle=True)
row.prop(cscene, "debug_use_cpu_avx2", toggle=True)
col.prop(cscene, "debug_use_qbvh")
+ col.prop(cscene, "debug_use_cpu_split_kernel")
col = layout.column()
col.label('CUDA Flags:')
col.prop(cscene, "debug_use_cuda_adaptive_compile")
+ col.prop(cscene, "debug_use_cuda_split_kernel")
col = layout.column()
col.label('OpenCL Flags:')
diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp
index 438abc49f88..75118c43747 100644
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -67,8 +67,10 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene)
flags.cpu.sse3 = get_boolean(cscene, "debug_use_cpu_sse3");
flags.cpu.sse2 = get_boolean(cscene, "debug_use_cpu_sse2");
flags.cpu.qbvh = get_boolean(cscene, "debug_use_qbvh");
+ flags.cpu.split_kernel = get_boolean(cscene, "debug_use_cpu_split_kernel");
/* Synchronize CUDA flags. */
flags.cuda.adaptive_compile = get_boolean(cscene, "debug_use_cuda_adaptive_compile");
+ flags.cuda.split_kernel = get_boolean(cscene, "debug_use_cuda_split_kernel");
/* Synchronize OpenCL kernel type. */
switch(get_enum(cscene, "debug_opencl_kernel_type")) {
case 0:
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index 966ff5e52ba..a2373451696 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -3,6 +3,7 @@ set(INC
.
../graph
../kernel
+ ../kernel/split
../kernel/svm
../kernel/osl
../util
@@ -33,6 +34,7 @@ set(SRC
device_cuda.cpp
device_multi.cpp
device_opencl.cpp
+ device_split_kernel.cpp
device_task.cpp
)
@@ -56,6 +58,7 @@ set(SRC_HEADERS
device_memory.h
device_intern.h
device_network.h
+ device_split_kernel.h
device_task.h
)
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 31c99f49d6d..6b07b9d04bd 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -80,7 +80,7 @@ Device::~Device()
void Device::pixels_alloc(device_memory& mem)
{
- mem_alloc(mem, MEM_READ_WRITE);
+ mem_alloc("pixels", mem, MEM_READ_WRITE);
}
void Device::pixels_copy_from(device_memory& mem, int y, int w, int h)
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index ccee25ae34e..c740cada98b 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -234,7 +234,7 @@ public:
Stats &stats;
/* regular memory */
- virtual void mem_alloc(device_memory& mem, MemoryType type) = 0;
+ virtual void mem_alloc(const char *name, device_memory& mem, MemoryType type) = 0;
virtual void mem_copy_to(device_memory& mem) = 0;
virtual void mem_copy_from(device_memory& mem,
int y, int w, int h, int elem) = 0;
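
Note: mem_alloc() now takes a name used purely for logging; passing NULL suppresses the message, which the texture paths below rely on. A sketch of what a conforming backend override looks like (hypothetical 'MyDevice'; the CPU implementation later in this diff is the reference):

	class MyDevice : public Device {
	public:
		void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/)
		{
			if(name) {
				/* The name only drives the log message. */
				VLOG(1) << "Buffer allocate: " << name << ", "
				        << string_human_readable_number(mem.memory_size()) << " bytes.";
			}
			mem.device_pointer = (device_ptr)malloc(mem.memory_size());
			mem.device_size = mem.memory_size();
			stats.mem_alloc(mem.device_size);
		}
	};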
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index c8e001ec2fd..06a1568b4d6 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -26,10 +26,12 @@
#include "device.h"
#include "device_intern.h"
+#include "device_split_kernel.h"
#include "kernel.h"
#include "kernel_compat_cpu.h"
#include "kernel_types.h"
+#include "split/kernel_split_data.h"
#include "kernel_globals.h"
#include "osl_shader.h"
@@ -41,6 +43,7 @@
#include "util_foreach.h"
#include "util_function.h"
#include "util_logging.h"
+#include "util_map.h"
#include "util_opengl.h"
#include "util_progress.h"
#include "util_system.h"
@@ -48,8 +51,93 @@
CCL_NAMESPACE_BEGIN
+class CPUDevice;
+
+class CPUSplitKernel : public DeviceSplitKernel {
+ CPUDevice *device;
+public:
+ explicit CPUSplitKernel(CPUDevice *device);
+
+ virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
+ RenderTile& rtile,
+ int num_global_elements,
+ device_memory& kernel_globals,
+ device_memory& kernel_data_,
+ device_memory& split_data,
+ device_memory& ray_state,
+ device_memory& queue_index,
+ device_memory& use_queues_flag,
+ device_memory& work_pool_wgs);
+
+ virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
+ virtual int2 split_kernel_local_size();
+ virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
+ virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
+};
+
class CPUDevice : public Device
{
+ static unordered_map<string, void*> kernel_functions;
+
+ static void register_kernel_function(const char* name, void* func)
+ {
+ kernel_functions[name] = func;
+ }
+
+ static const char* get_arch_name()
+ {
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+ if(system_cpu_support_avx2()) {
+ return "cpu_avx2";
+ }
+ else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+ if(system_cpu_support_avx()) {
+ return "cpu_avx";
+ }
+ else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+ if(system_cpu_support_sse41()) {
+ return "cpu_sse41";
+ }
+ else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+ if(system_cpu_support_sse3()) {
+ return "cpu_sse3";
+ }
+ else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+ if(system_cpu_support_sse2()) {
+ return "cpu_sse2";
+ }
+ else
+#endif
+ {
+ return "cpu";
+ }
+ }
+
+ template<typename F>
+ static F get_kernel_function(string name)
+ {
+ name = string("kernel_") + get_arch_name() + "_" + name;
+
+ unordered_map<string, void*>::iterator it = kernel_functions.find(name);
+
+ if(it == kernel_functions.end()) {
+ assert(!"kernel function not found");
+ return NULL;
+ }
+
+ return (F)it->second;
+ }
+
+ friend class CPUSplitKernel;
+
public:
TaskPool task_pool;
KernelGlobals kernel_globals;
@@ -57,10 +145,15 @@ public:
#ifdef WITH_OSL
OSLGlobals osl_globals;
#endif
+
+ bool use_split_kernel;
+
+ DeviceRequestedFeatures requested_features;
CPUDevice(DeviceInfo& info, Stats &stats, bool background)
: Device(info, stats, background)
{
+
#ifdef WITH_OSL
kernel_globals.osl = &osl_globals;
#endif
@@ -105,6 +198,28 @@ public:
{
VLOG(1) << "Will be using regular kernels.";
}
+
+ use_split_kernel = DebugFlags().cpu.split_kernel;
+ if(use_split_kernel) {
+ VLOG(1) << "Will be using split kernel.";
+ }
+
+ kernel_cpu_register_functions(register_kernel_function);
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+ kernel_cpu_sse2_register_functions(register_kernel_function);
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+ kernel_cpu_sse3_register_functions(register_kernel_function);
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+ kernel_cpu_sse41_register_functions(register_kernel_function);
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+ kernel_cpu_avx_register_functions(register_kernel_function);
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+ kernel_cpu_avx2_register_functions(register_kernel_function);
+#endif
}
~CPUDevice()
@@ -117,9 +232,20 @@ public:
return (TaskScheduler::num_threads() == 1);
}
- void mem_alloc(device_memory& mem, MemoryType /*type*/)
+ void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/)
{
+ if(name) {
+ VLOG(1) << "Buffer allocate: " << name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+ }
+
mem.device_pointer = mem.data_pointer;
+
+ if(!mem.device_pointer) {
+ mem.device_pointer = (device_ptr)malloc(mem.memory_size());
+ }
+
mem.device_size = mem.memory_size();
stats.mem_alloc(mem.device_size);
}
@@ -144,6 +270,10 @@ public:
void mem_free(device_memory& mem)
{
if(mem.device_pointer) {
+ if(!mem.data_pointer) {
+ free((void*)mem.device_pointer);
+ }
+
mem.device_pointer = 0;
stats.mem_free(mem.device_size);
mem.device_size = 0;
@@ -196,8 +326,14 @@ public:
void thread_run(DeviceTask *task)
{
- if(task->type == DeviceTask::PATH_TRACE)
- thread_path_trace(*task);
+ if(task->type == DeviceTask::PATH_TRACE) {
+ if(!use_split_kernel) {
+ thread_path_trace(*task);
+ }
+ else {
+ thread_path_trace_split(*task);
+ }
+ }
else if(task->type == DeviceTask::FILM_CONVERT)
thread_film_convert(*task);
else if(task->type == DeviceTask::SHADER)
@@ -258,7 +394,7 @@ public:
{
path_trace_kernel = kernel_cpu_path_trace;
}
-
+
while(task.acquire_tile(this, tile)) {
float *render_buffer = (float*)tile.buffer;
uint *rng_state = (uint*)tile.rng_state;
@@ -294,6 +430,49 @@ public:
thread_kernel_globals_free(&kg);
}
+ void thread_path_trace_split(DeviceTask& task)
+ {
+ if(task_pool.canceled()) {
+ if(task.need_finish_queue == false)
+ return;
+ }
+
+ RenderTile tile;
+
+ CPUSplitKernel split_kernel(this);
+
+ /* allocate buffer for kernel globals */
+ device_memory kgbuffer;
+ kgbuffer.resize(sizeof(KernelGlobals));
+ mem_alloc("kernel_globals", kgbuffer, MEM_READ_WRITE);
+
+ KernelGlobals *kg = (KernelGlobals*)kgbuffer.device_pointer;
+ *kg = thread_kernel_globals_init();
+
+ requested_features.max_closure = MAX_CLOSURE;
+ if(!split_kernel.load_kernels(requested_features)) {
+ thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
+ mem_free(kgbuffer);
+
+ return;
+ }
+
+ while(task.acquire_tile(this, tile)) {
+ device_memory data;
+ split_kernel.path_trace(&task, tile, kgbuffer, data);
+
+ task.release_tile(tile);
+
+ if(task_pool.canceled()) {
+ if(task.need_finish_queue == false)
+ break;
+ }
+ }
+
+ thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
+ mem_free(kgbuffer);
+ }
+
void thread_film_convert(DeviceTask& task)
{
float sample_scale = 1.0f/(task.sample + 1);
@@ -501,6 +680,10 @@ protected:
inline void thread_kernel_globals_free(KernelGlobals *kg)
{
+ if(kg == NULL) {
+ return;
+ }
+
if(kg->transparent_shadow_intersections != NULL) {
free(kg->transparent_shadow_intersections);
}
@@ -515,8 +698,176 @@ protected:
OSLShader::thread_free(kg);
#endif
}
+
+ virtual bool load_kernels(DeviceRequestedFeatures& requested_features_) {
+ requested_features = requested_features_;
+
+ return true;
+ }
+};
+
+/* split kernel */
+
+class CPUSplitKernelFunction : public SplitKernelFunction {
+public:
+ CPUDevice* device;
+ void (*func)(KernelGlobals *kg, KernelData *data);
+
+ CPUSplitKernelFunction(CPUDevice* device) : device(device), func(NULL) {}
+ ~CPUSplitKernelFunction() {}
+
+ virtual bool enqueue(const KernelDimensions& dim, device_memory& kernel_globals, device_memory& data)
+ {
+ if(!func) {
+ return false;
+ }
+
+ KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
+ kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
+
+ for(int y = 0; y < dim.global_size[1]; y++) {
+ for(int x = 0; x < dim.global_size[0]; x++) {
+ kg->global_id = make_int2(x, y);
+
+ func(kg, (KernelData*)data.device_pointer);
+ }
+ }
+
+ return true;
+ }
};
+CPUSplitKernel::CPUSplitKernel(CPUDevice *device) : DeviceSplitKernel(device), device(device)
+{
+}
+
+bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
+ RenderTile& rtile,
+ int num_global_elements,
+ device_memory& kernel_globals,
+ device_memory& data,
+ device_memory& split_data,
+ device_memory& ray_state,
+ device_memory& queue_index,
+ device_memory& use_queues_flags,
+ device_memory& work_pool_wgs)
+{
+ typedef void(*data_init_t)(KernelGlobals *kg,
+ ccl_constant KernelData *data,
+ ccl_global void *split_data_buffer,
+ int num_elements,
+ ccl_global char *ray_state,
+ ccl_global uint *rng_state,
+ int start_sample,
+ int end_sample,
+ int sx, int sy, int sw, int sh, int offset, int stride,
+ ccl_global int *Queue_index,
+ int queuesize,
+ ccl_global char *use_queues_flag,
+ ccl_global unsigned int *work_pool_wgs,
+ unsigned int num_samples,
+ ccl_global float *buffer);
+
+ data_init_t data_init;
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+ if(system_cpu_support_avx2()) {
+ data_init = kernel_cpu_avx2_data_init;
+ }
+ else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+ if(system_cpu_support_avx()) {
+ data_init = kernel_cpu_avx_data_init;
+ }
+ else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+ if(system_cpu_support_sse41()) {
+ data_init = kernel_cpu_sse41_data_init;
+ }
+ else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+ if(system_cpu_support_sse3()) {
+ data_init = kernel_cpu_sse3_data_init;
+ }
+ else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+ if(system_cpu_support_sse2()) {
+ data_init = kernel_cpu_sse2_data_init;
+ }
+ else
+#endif
+ {
+ data_init = kernel_cpu_data_init;
+ }
+
+ KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
+ kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
+
+ for(int y = 0; y < dim.global_size[1]; y++) {
+ for(int x = 0; x < dim.global_size[0]; x++) {
+ kg->global_id = make_int2(x, y);
+
+ data_init((KernelGlobals*)kernel_globals.device_pointer,
+ (KernelData*)data.device_pointer,
+ (void*)split_data.device_pointer,
+ num_global_elements,
+ (char*)ray_state.device_pointer,
+ (uint*)rtile.rng_state,
+ rtile.start_sample,
+ rtile.start_sample + rtile.num_samples,
+ rtile.x,
+ rtile.y,
+ rtile.w,
+ rtile.h,
+ rtile.offset,
+ rtile.stride,
+ (int*)queue_index.device_pointer,
+ dim.global_size[0] * dim.global_size[1],
+ (char*)use_queues_flags.device_pointer,
+ (uint*)work_pool_wgs.device_pointer,
+ rtile.num_samples,
+ (float*)rtile.buffer);
+ }
+ }
+
+ return true;
+}
+
+SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&)
+{
+ CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device);
+
+ kernel->func = device->get_kernel_function<void(*)(KernelGlobals*, KernelData*)>(kernel_name);
+ if(!kernel->func) {
+ delete kernel;
+ return NULL;
+ }
+
+ return kernel;
+}
+
+int2 CPUSplitKernel::split_kernel_local_size()
+{
+ return make_int2(1, 1);
+}
+
+int2 CPUSplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask *task) {
+	/* TODO(mai): this needs investigation, but the CPU gives an incorrect render if the global size doesn't match the tile size */
+ return task->requested_tile_size;
+}
+
+size_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device_memory& /*data*/, size_t num_threads) {
+ KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
+
+ return split_data_buffer_size(kg, num_threads);
+}
+
+unordered_map<string, void*> CPUDevice::kernel_functions;
+
Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
{
return new CPUDevice(info, stats, background);
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 5e87f9ec895..a630a3d1183 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -22,6 +22,7 @@
#include "device.h"
#include "device_intern.h"
+#include "device_split_kernel.h"
#include "buffers.h"
@@ -43,6 +44,8 @@
#include "util_types.h"
#include "util_time.h"
+#include "split/kernel_split_data_types.h"
+
CCL_NAMESPACE_BEGIN
#ifndef WITH_CUDA_DYNLOAD
@@ -79,6 +82,31 @@ int cuewCompilerVersion(void)
} /* namespace */
#endif /* WITH_CUDA_DYNLOAD */
+class CUDADevice;
+
+class CUDASplitKernel : public DeviceSplitKernel {
+ CUDADevice *device;
+public:
+ explicit CUDASplitKernel(CUDADevice *device);
+
+ virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
+
+ virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
+ RenderTile& rtile,
+ int num_global_elements,
+ device_memory& kernel_globals,
+ device_memory& kernel_data_,
+ device_memory& split_data,
+ device_memory& ray_state,
+ device_memory& queue_index,
+ device_memory& use_queues_flag,
+ device_memory& work_pool_wgs);
+
+ virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
+ virtual int2 split_kernel_local_size();
+ virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
+};
+
class CUDADevice : public Device
{
public:
@@ -259,11 +287,16 @@ public:
return DebugFlags().cuda.adaptive_compile;
}
+ bool use_split_kernel()
+ {
+ return DebugFlags().cuda.split_kernel;
+ }
+
/* Common NVCC flags which stays the same regardless of shading model,
* kernel sources md5 and only depends on compiler or compilation settings.
*/
string compile_kernel_get_common_cflags(
- const DeviceRequestedFeatures& requested_features)
+ const DeviceRequestedFeatures& requested_features, bool split=false)
{
const int cuda_version = cuewCompilerVersion();
const int machine = system_cpu_bits();
@@ -288,6 +321,11 @@ public:
#ifdef WITH_CYCLES_DEBUG
cflags += " -D__KERNEL_DEBUG__";
#endif
+
+ if(split) {
+ cflags += " -D__SPLIT__";
+ }
+
return cflags;
}
@@ -321,7 +359,7 @@ public:
return true;
}
- string compile_kernel(const DeviceRequestedFeatures& requested_features)
+ string compile_kernel(const DeviceRequestedFeatures& requested_features, bool split=false)
{
/* Compute cubin name. */
int major, minor;
@@ -330,7 +368,8 @@ public:
/* Attempt to use kernel provided with Blender. */
if(!use_adaptive_compilation()) {
- const string cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin",
+ const string cubin = path_get(string_printf(split ? "lib/kernel_split_sm_%d%d.cubin"
+ : "lib/kernel_sm_%d%d.cubin",
major, minor));
VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
if(path_exists(cubin)) {
@@ -340,7 +379,7 @@ public:
}
const string common_cflags =
- compile_kernel_get_common_cflags(requested_features);
+ compile_kernel_get_common_cflags(requested_features, split);
/* Try to use locally compiled kernel. */
const string kernel_path = path_get("kernel");
@@ -351,7 +390,8 @@ public:
*/
const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags);
- const string cubin_file = string_printf("cycles_kernel_sm%d%d_%s.cubin",
+ const string cubin_file = string_printf(split ? "cycles_kernel_split_sm%d%d_%s.cubin"
+ : "cycles_kernel_sm%d%d_%s.cubin",
major, minor,
cubin_md5.c_str());
const string cubin = path_cache_get(path_join("kernels", cubin_file));
@@ -386,7 +426,7 @@ public:
const char *nvcc = cuewCompilerPath();
const string kernel = path_join(kernel_path,
path_join("kernels",
- path_join("cuda", "kernel.cu")));
+ path_join("cuda", split ? "kernel_split.cu" : "kernel.cu")));
double starttime = time_dt();
printf("Compiling CUDA kernel ...\n");
@@ -434,7 +474,7 @@ public:
return false;
/* get kernel */
- string cubin = compile_kernel(requested_features);
+ string cubin = compile_kernel(requested_features, use_split_kernel());
if(cubin == "")
return false;
@@ -467,8 +507,14 @@ public:
}
}
- void mem_alloc(device_memory& mem, MemoryType /*type*/)
+ void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/)
{
+ if(name) {
+ VLOG(1) << "Buffer allocate: " << name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+ }
+
cuda_push_context();
CUdeviceptr device_pointer;
size_t size = mem.memory_size();
@@ -505,7 +551,9 @@ public:
void mem_zero(device_memory& mem)
{
- memset((void*)mem.data_pointer, 0, mem.memory_size());
+ if(mem.data_pointer) {
+ memset((void*)mem.data_pointer, 0, mem.memory_size());
+ }
cuda_push_context();
if(mem.device_pointer)
@@ -618,7 +666,7 @@ public:
/* Data Storage */
if(interpolation == INTERPOLATION_NONE) {
if(has_bindless_textures) {
- mem_alloc(mem, MEM_READ_ONLY);
+ mem_alloc(NULL, mem, MEM_READ_ONLY);
mem_copy_to(mem);
cuda_push_context();
@@ -642,7 +690,7 @@ public:
cuda_pop_context();
}
else {
- mem_alloc(mem, MEM_READ_ONLY);
+ mem_alloc(NULL, mem, MEM_READ_ONLY);
mem_copy_to(mem);
cuda_push_context();
@@ -1259,25 +1307,48 @@ public:
/* Upload Bindless Mapping */
load_bindless_mapping();
- /* keep rendering tiles until done */
- while(task->acquire_tile(this, tile)) {
- int start_sample = tile.start_sample;
- int end_sample = tile.start_sample + tile.num_samples;
+ if(!use_split_kernel()) {
+ /* keep rendering tiles until done */
+ while(task->acquire_tile(this, tile)) {
+ int start_sample = tile.start_sample;
+ int end_sample = tile.start_sample + tile.num_samples;
- for(int sample = start_sample; sample < end_sample; sample++) {
- if(task->get_cancel()) {
- if(task->need_finish_queue == false)
- break;
- }
+ for(int sample = start_sample; sample < end_sample; sample++) {
+ if(task->get_cancel()) {
+ if(task->need_finish_queue == false)
+ break;
+ }
- path_trace(tile, sample, branched);
+ path_trace(tile, sample, branched);
- tile.sample = sample + 1;
+ tile.sample = sample + 1;
- task->update_progress(&tile, tile.w*tile.h);
+ task->update_progress(&tile, tile.w*tile.h);
+ }
+
+ task->release_tile(tile);
+ }
+ }
+ else {
+ DeviceRequestedFeatures requested_features;
+ if(!use_adaptive_compilation()) {
+ requested_features.max_closure = 64;
}
- task->release_tile(tile);
+ CUDASplitKernel split_kernel(this);
+ split_kernel.load_kernels(requested_features);
+
+ while(task->acquire_tile(this, tile)) {
+ device_memory void_buffer;
+ split_kernel.path_trace(task, tile, void_buffer, void_buffer);
+
+ task->release_tile(tile);
+
+ if(task->get_cancel()) {
+ if(task->need_finish_queue == false)
+ break;
+ }
+ }
}
}
else if(task->type == DeviceTask::SHADER) {
@@ -1330,8 +1401,223 @@ public:
{
task_pool.cancel();
}
+
+ friend class CUDASplitKernelFunction;
+ friend class CUDASplitKernel;
};
+/* redefine the cuda_assert macro so it can be used outside of the CUDADevice class
+ * now that the definition of that class is complete
+ */
+#undef cuda_assert
+#define cuda_assert(stmt) \
+ { \
+ CUresult result = stmt; \
+ \
+ if(result != CUDA_SUCCESS) { \
+ string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
+ if(device->error_msg == "") \
+ device->error_msg = message; \
+ fprintf(stderr, "%s\n", message.c_str()); \
+ /*cuda_abort();*/ \
+ device->cuda_error_documentation(); \
+ } \
+ } (void)0
+
+/* split kernel */
+
+class CUDASplitKernelFunction : public SplitKernelFunction{
+ CUDADevice* device;
+ CUfunction func;
+public:
+ CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func) {}
+
+ /* enqueue the kernel, returns false if there is an error */
+ bool enqueue(const KernelDimensions &dim, device_memory &/*kg*/, device_memory &/*data*/)
+ {
+ return enqueue(dim, NULL);
+ }
+
+ /* enqueue the kernel, returns false if there is an error */
+ bool enqueue(const KernelDimensions &dim, void *args[])
+ {
+ device->cuda_push_context();
+
+ if(device->have_error())
+ return false;
+
+ /* we ignore dim.local_size for now, as this is faster */
+ int threads_per_block;
+ cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
+
+ int xthreads = (int)sqrt(threads_per_block);
+ int ythreads = (int)sqrt(threads_per_block);
+
+ int xblocks = (dim.global_size[0] + xthreads - 1)/xthreads;
+ int yblocks = (dim.global_size[1] + ythreads - 1)/ythreads;
+
+ cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
+
+ cuda_assert(cuLaunchKernel(func,
+ xblocks , yblocks, 1, /* blocks */
+ xthreads, ythreads, 1, /* threads */
+ 0, 0, args, 0));
+
+ device->cuda_pop_context();
+
+ return !device->have_error();
+ }
+};
+
+CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device)
+{
+}
+
+size_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory& /*data*/, size_t num_threads)
+{
+ device_vector<uint> size_buffer;
+ size_buffer.resize(1);
+ device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE);
+
+ device->cuda_push_context();
+
+ uint threads = num_threads;
+ CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer);
+
+ struct args_t {
+ uint* num_threads;
+ CUdeviceptr* size;
+ };
+
+ args_t args = {
+ &threads,
+ &d_size
+ };
+
+ CUfunction state_buffer_size;
+ cuda_assert(cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size"));
+
+ cuda_assert(cuLaunchKernel(state_buffer_size,
+ 1, 1, 1,
+ 1, 1, 1,
+ 0, 0, &args, 0));
+
+ device->cuda_pop_context();
+
+ device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint));
+ device->mem_free(size_buffer);
+
+ return *size_buffer.get_data();
+}
+
+bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
+ RenderTile& rtile,
+ int num_global_elements,
+ device_memory& /*kernel_globals*/,
+ device_memory& /*kernel_data*/,
+ device_memory& split_data,
+ device_memory& ray_state,
+ device_memory& queue_index,
+ device_memory& use_queues_flag,
+ device_memory& work_pool_wgs)
+{
+ device->cuda_push_context();
+
+ CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer);
+ CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer);
+ CUdeviceptr d_queue_index = device->cuda_device_ptr(queue_index.device_pointer);
+ CUdeviceptr d_use_queues_flag = device->cuda_device_ptr(use_queues_flag.device_pointer);
+ CUdeviceptr d_work_pool_wgs = device->cuda_device_ptr(work_pool_wgs.device_pointer);
+
+ CUdeviceptr d_rng_state = device->cuda_device_ptr(rtile.rng_state);
+ CUdeviceptr d_buffer = device->cuda_device_ptr(rtile.buffer);
+
+ int end_sample = rtile.start_sample + rtile.num_samples;
+ int queue_size = dim.global_size[0] * dim.global_size[1];
+
+ struct args_t {
+ CUdeviceptr* split_data_buffer;
+ int* num_elements;
+ CUdeviceptr* ray_state;
+ CUdeviceptr* rng_state;
+ int* start_sample;
+ int* end_sample;
+ int* sx;
+ int* sy;
+ int* sw;
+ int* sh;
+ int* offset;
+ int* stride;
+ CUdeviceptr* queue_index;
+ int* queuesize;
+ CUdeviceptr* use_queues_flag;
+ CUdeviceptr* work_pool_wgs;
+ int* num_samples;
+ CUdeviceptr* buffer;
+ };
+
+ args_t args = {
+ &d_split_data,
+ &num_global_elements,
+ &d_ray_state,
+ &d_rng_state,
+ &rtile.start_sample,
+ &end_sample,
+ &rtile.x,
+ &rtile.y,
+ &rtile.w,
+ &rtile.h,
+ &rtile.offset,
+ &rtile.stride,
+ &d_queue_index,
+ &queue_size,
+ &d_use_queues_flag,
+ &d_work_pool_wgs,
+ &rtile.num_samples,
+ &d_buffer
+ };
+
+ CUfunction data_init;
+ cuda_assert(cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init"));
+ if(device->have_error()) {
+ return false;
+ }
+
+ CUDASplitKernelFunction(device, data_init).enqueue(dim, (void**)&args);
+
+ device->cuda_pop_context();
+
+ return !device->have_error();
+}
+
+SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&)
+{
+ CUfunction func;
+
+ device->cuda_push_context();
+
+ cuda_assert(cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data()));
+ if(device->have_error()) {
+ device->cuda_error_message(string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data()));
+ return NULL;
+ }
+
+ device->cuda_pop_context();
+
+ return new CUDASplitKernelFunction(device, func);
+}
+
+int2 CUDASplitKernel::split_kernel_local_size()
+{
+ return make_int2(32, 1);
+}
+
+int2 CUDASplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask */*task*/)
+{
+ /* TODO(mai): implement something here to detect ideal work size */
+ return make_int2(256, 256);
+}
+
bool device_cuda_init(void)
{
#ifdef WITH_CUDA_DYNLOAD
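
Note: CUDASplitKernelFunction::enqueue above ignores the requested local size and instead derives a square thread block from the kernel's CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, then rounds the grid up to cover the global size. The arithmetic in isolation (a sketch with representative numbers):

	#include <cmath>
	#include <cstdio>

	int main()
	{
		int max_threads = 1024;             /* what cuFuncGetAttribute might report */
		size_t global_size[2] = {256, 256}; /* from split_kernel_global_size() */

		/* Square block, then ceil-divide so the grid covers every element. */
		int xthreads = (int)sqrt((double)max_threads); /* 32 */
		int ythreads = (int)sqrt((double)max_threads); /* 32 */
		int xblocks = ((int)global_size[0] + xthreads - 1)/xthreads; /* 8 */
		int yblocks = ((int)global_size[1] + ythreads - 1)/ythreads; /* 8 */

		printf("grid %dx%d of %dx%d-thread blocks\n",
		       xblocks, yblocks, xthreads, ythreads);
		return 0;
	}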
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index 5b5b4dc6802..b69c3dad604 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -180,10 +180,27 @@ public:
/* device pointer */
device_ptr device_pointer;
-protected:
- device_memory() {}
+ device_memory()
+ {
+ data_type = device_type_traits<uchar>::data_type;
+ data_elements = device_type_traits<uchar>::num_elements;
+ data_pointer = 0;
+ data_size = 0;
+ device_size = 0;
+ data_width = 0;
+ data_height = 0;
+ data_depth = 0;
+ device_pointer = 0;
+ }
virtual ~device_memory() { assert(!device_pointer); }
+ void resize(size_t size)
+ {
+ data_size = size;
+ data_width = size;
+ }
+
+protected:
/* no copying */
device_memory(const device_memory&);
device_memory& operator = (const device_memory&);
@@ -198,16 +215,8 @@ public:
{
data_type = device_type_traits<T>::data_type;
data_elements = device_type_traits<T>::num_elements;
- data_pointer = 0;
- data_size = 0;
- device_size = 0;
- data_width = 0;
- data_height = 0;
- data_depth = 0;
assert(data_elements > 0);
-
- device_pointer = 0;
}
virtual ~device_vector() {}
@@ -266,6 +275,7 @@ public:
data_height = 0;
data_depth = 0;
data_size = 0;
+ device_pointer = 0;
}
size_t size()
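
Note: making the device_memory constructor public and adding resize() allows raw, untyped byte buffers without going through device_vector<T> (data_type defaults to uchar, so data_size counts bytes). This is exactly how device_cpu.cpp sizes its KernelGlobals buffer; a sketch of the usage:

	/* Sketch, mirroring thread_path_trace_split() in device_cpu.cpp above. */
	device_memory kgbuffer;
	kgbuffer.resize(sizeof(KernelGlobals)); /* size in bytes */
	device->mem_alloc("kernel_globals", kgbuffer, MEM_READ_WRITE);

	/* ... work with kgbuffer.device_pointer ... */

	device->mem_free(kgbuffer);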
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index 61d78ee65de..3368fd3d756 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -106,11 +106,11 @@ public:
return true;
}
- void mem_alloc(device_memory& mem, MemoryType type)
+ void mem_alloc(const char *name, device_memory& mem, MemoryType type)
{
foreach(SubDevice& sub, devices) {
mem.device_pointer = 0;
- sub.device->mem_alloc(mem, type);
+ sub.device->mem_alloc(name, mem, type);
sub.ptr_map[unique_ptr] = mem.device_pointer;
}
diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp
index 53eef6cf199..6dc4aecbc50 100644
--- a/intern/cycles/device/device_network.cpp
+++ b/intern/cycles/device/device_network.cpp
@@ -87,8 +87,14 @@ public:
snd.write();
}
- void mem_alloc(device_memory& mem, MemoryType type)
+ void mem_alloc(const char *name, device_memory& mem, MemoryType type)
{
+ if(name) {
+ VLOG(1) << "Buffer allocate: " << name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+ }
+
thread_scoped_lock lock(rpc_lock);
mem.device_pointer = ++mem_counter;
@@ -481,7 +487,7 @@ protected:
mem.data_pointer = 0;
/* perform the allocation on the actual device */
- device->mem_alloc(mem, type);
+ device->mem_alloc(NULL, mem, type);
/* store a mapping to/from client_pointer and real device pointer */
pointer_mapping_insert(client_pointer, mem.device_pointer);
diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp
new file mode 100644
index 00000000000..b9705077fbf
--- /dev/null
+++ b/intern/cycles/device/device_split_kernel.cpp
@@ -0,0 +1,281 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device_split_kernel.h"
+
+#include "kernel_types.h"
+#include "kernel_split_data_types.h"
+
+#include "util_time.h"
+
+CCL_NAMESPACE_BEGIN
+
+static const double alpha = 0.1; /* alpha for rolling average */
+
+DeviceSplitKernel::DeviceSplitKernel(Device *device) : device(device)
+{
+ current_max_closure = -1;
+ first_tile = true;
+
+ avg_time_per_sample = 0.0;
+
+ kernel_path_init = NULL;
+ kernel_scene_intersect = NULL;
+ kernel_lamp_emission = NULL;
+ kernel_queue_enqueue = NULL;
+ kernel_background_buffer_update = NULL;
+ kernel_shader_eval = NULL;
+ kernel_holdout_emission_blurring_pathtermination_ao = NULL;
+ kernel_direct_lighting = NULL;
+ kernel_shadow_blocked = NULL;
+ kernel_next_iteration_setup = NULL;
+}
+
+DeviceSplitKernel::~DeviceSplitKernel()
+{
+ device->mem_free(split_data);
+ device->mem_free(ray_state);
+ device->mem_free(use_queues_flag);
+ device->mem_free(queue_index);
+ device->mem_free(work_pool_wgs);
+
+ delete kernel_path_init;
+ delete kernel_scene_intersect;
+ delete kernel_lamp_emission;
+ delete kernel_queue_enqueue;
+ delete kernel_background_buffer_update;
+ delete kernel_shader_eval;
+ delete kernel_holdout_emission_blurring_pathtermination_ao;
+ delete kernel_direct_lighting;
+ delete kernel_shadow_blocked;
+ delete kernel_next_iteration_setup;
+}
+
+bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_features)
+{
+#define LOAD_KERNEL(name) \
+ kernel_##name = get_split_kernel_function(#name, requested_features); \
+ if(!kernel_##name) { \
+ return false; \
+ }
+
+ LOAD_KERNEL(path_init);
+ LOAD_KERNEL(scene_intersect);
+ LOAD_KERNEL(lamp_emission);
+ LOAD_KERNEL(queue_enqueue);
+ LOAD_KERNEL(background_buffer_update);
+ LOAD_KERNEL(shader_eval);
+ LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao);
+ LOAD_KERNEL(direct_lighting);
+ LOAD_KERNEL(shadow_blocked);
+ LOAD_KERNEL(next_iteration_setup);
+
+#undef LOAD_KERNEL
+
+ current_max_closure = requested_features.max_closure;
+
+ return true;
+}
+
+size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, size_t max_buffer_size)
+{
+ size_t size_per_element = state_buffer_size(kg, data, 1024) / 1024;
+ return max_buffer_size / size_per_element;
+}
+
+bool DeviceSplitKernel::path_trace(DeviceTask *task,
+ RenderTile& tile,
+ device_memory& kgbuffer,
+ device_memory& kernel_data)
+{
+ if(device->have_error()) {
+ return false;
+ }
+
+ /* Get local size */
+ size_t local_size[2];
+ {
+ int2 lsize = split_kernel_local_size();
+ local_size[0] = lsize[0];
+ local_size[1] = lsize[1];
+ }
+
+	/* Set global size */
+ size_t global_size[2];
+ {
+ int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task);
+
+ /* Make sure that set work size is a multiple of local
+ * work size dimensions.
+ */
+ global_size[0] = round_up(gsize[0], local_size[0]);
+ global_size[1] = round_up(gsize[1], local_size[1]);
+ }
+
+ /* Number of elements in the global state buffer */
+ int num_global_elements = global_size[0] * global_size[1];
+
+ /* Allocate all required global memory once. */
+ if(first_tile) {
+ first_tile = false;
+
+ /* Calculate max groups */
+
+ /* Denotes the maximum work groups possible w.r.t. current requested tile size. */
+ unsigned int max_work_groups = num_global_elements / WORK_POOL_SIZE + 1;
+
+ /* Allocate work_pool_wgs memory. */
+ work_pool_wgs.resize(max_work_groups * sizeof(unsigned int));
+ device->mem_alloc("work_pool_wgs", work_pool_wgs, MEM_READ_WRITE);
+
+ queue_index.resize(NUM_QUEUES * sizeof(int));
+ device->mem_alloc("queue_index", queue_index, MEM_READ_WRITE);
+
+ use_queues_flag.resize(sizeof(char));
+ device->mem_alloc("use_queues_flag", use_queues_flag, MEM_READ_WRITE);
+
+ ray_state.resize(num_global_elements);
+ device->mem_alloc("ray_state", ray_state, MEM_READ_WRITE);
+
+ split_data.resize(state_buffer_size(kgbuffer, kernel_data, num_global_elements));
+ device->mem_alloc("split_data", split_data, MEM_READ_WRITE);
+ }
+
+#define ENQUEUE_SPLIT_KERNEL(name, global_size, local_size) \
+ if(device->have_error()) { \
+ return false; \
+ } \
+ if(!kernel_##name->enqueue(KernelDimensions(global_size, local_size), kgbuffer, kernel_data)) { \
+ return false; \
+ }
+
+ tile.sample = tile.start_sample;
+
+ /* for exponential increase between tile updates */
+ int time_multiplier = 1;
+
+ while(tile.sample < tile.start_sample + tile.num_samples) {
+ /* to keep track of how long it takes to run a number of samples */
+ double start_time = time_dt();
+
+ /* initial guess to start rolling average */
+ const int initial_num_samples = 1;
+ /* approx number of samples per second */
+ int samples_per_second = (avg_time_per_sample > 0.0) ?
+ int(double(time_multiplier) / avg_time_per_sample) + 1 : initial_num_samples;
+
+ RenderTile subtile = tile;
+ subtile.start_sample = tile.sample;
+ subtile.num_samples = min(samples_per_second, tile.start_sample + tile.num_samples - tile.sample);
+
+ if(device->have_error()) {
+ return false;
+ }
+
+		/* Reset state memory here, as the global size for the data_init
+		 * kernel might not be large enough to do it in the kernel.
+		 */
+ device->mem_zero(work_pool_wgs);
+ device->mem_zero(split_data);
+
+ if(!enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size),
+ subtile,
+ num_global_elements,
+ kgbuffer,
+ kernel_data,
+ split_data,
+ ray_state,
+ queue_index,
+ use_queues_flag,
+ work_pool_wgs))
+ {
+ return false;
+ }
+
+ ENQUEUE_SPLIT_KERNEL(path_init, global_size, local_size);
+
+ bool activeRaysAvailable = true;
+
+ while(activeRaysAvailable) {
+ /* Twice the global work size of other kernels for
+ * ckPathTraceKernel_shadow_blocked_direct_lighting. */
+ size_t global_size_shadow_blocked[2];
+ global_size_shadow_blocked[0] = global_size[0] * 2;
+ global_size_shadow_blocked[1] = global_size[1];
+
+			/* Do path iteration on the host [enqueue path-iteration kernels]. */
+ for(int PathIter = 0; PathIter < 16; PathIter++) {
+ ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(background_buffer_update, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(shadow_blocked, global_size_shadow_blocked, local_size);
+ ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size);
+
+ if(task->get_cancel()) {
+ return true;
+ }
+ }
+
+ /* Decide if we should exit path-iteration in host. */
+ device->mem_copy_from(ray_state, 0, global_size[0] * global_size[1] * sizeof(char), 1, 1);
+
+ activeRaysAvailable = false;
+
+ for(int rayStateIter = 0; rayStateIter < global_size[0] * global_size[1]; ++rayStateIter) {
+ if(int8_t(ray_state.get_data()[rayStateIter]) != RAY_INACTIVE) {
+ /* Not all rays are RAY_INACTIVE. */
+ activeRaysAvailable = true;
+ break;
+ }
+ }
+
+ if(task->get_cancel()) {
+ return true;
+ }
+ }
+
+ double time_per_sample = ((time_dt()-start_time) / subtile.num_samples);
+
+ if(avg_time_per_sample == 0.0) {
+ /* start rolling average */
+ avg_time_per_sample = time_per_sample;
+ }
+ else {
+ avg_time_per_sample = alpha*time_per_sample + (1.0-alpha)*avg_time_per_sample;
+ }
+
+#undef ENQUEUE_SPLIT_KERNEL
+
+ tile.sample += subtile.num_samples;
+ task->update_progress(&tile, tile.w*tile.h*subtile.num_samples);
+
+ time_multiplier = min(time_multiplier << 1, 10);
+
+ if(task->get_cancel()) {
+ return true;
+ }
+ }
+
+ return true;
+}
+
+CCL_NAMESPACE_END
+
+
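
Note: path_trace() sizes each batch from a rolling average, avg = alpha*t + (1-alpha)*avg with alpha = 0.1, and doubles time_multiplier (capped at 10) so progress updates grow sparser as the timing stabilizes. A standalone model of the pacing logic (a fixed 0.05s per sample stands in for the time_dt() measurement):

	#include <algorithm>
	#include <cstdio>

	int main()
	{
		const double alpha = 0.1; /* rolling-average weight, as above */
		double avg_time_per_sample = 0.0;
		int time_multiplier = 1;

		for(int round = 0; round < 5; round++) {
			/* Aim for roughly time_multiplier seconds of work per batch. */
			int num_samples = (avg_time_per_sample > 0.0)
			        ? int(double(time_multiplier)/avg_time_per_sample) + 1
			        : 1; /* initial guess to start the rolling average */

			double time_per_sample = 0.05; /* measured: (time_dt()-start)/num_samples */
			avg_time_per_sample = (avg_time_per_sample == 0.0)
			        ? time_per_sample
			        : alpha*time_per_sample + (1.0 - alpha)*avg_time_per_sample;

			time_multiplier = std::min(time_multiplier << 1, 10);
			printf("round %d: %d samples before the next tile update\n",
			       round, num_samples);
		}
		return 0;
	}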
diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h
new file mode 100644
index 00000000000..cc3e1aa26ae
--- /dev/null
+++ b/intern/cycles/device/device_split_kernel.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __DEVICE_SPLIT_KERNEL_H__
+#define __DEVICE_SPLIT_KERNEL_H__
+
+#include "device.h"
+#include "buffers.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Global memory is allocated in chunks, and we may not be able to
+ * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes per chunk,
+ * since some bytes may be needed for aligning the chunks. This is
+ * the amount of memory that we dedicate for that purpose.
+ */
+#define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB
+
+/* Types used for split kernel */
+
+class KernelDimensions {
+public:
+ size_t global_size[2];
+ size_t local_size[2];
+
+ KernelDimensions(size_t global_size_[2], size_t local_size_[2])
+ {
+ memcpy(global_size, global_size_, sizeof(global_size));
+ memcpy(local_size, local_size_, sizeof(local_size));
+ }
+};
+
+class SplitKernelFunction {
+public:
+ virtual ~SplitKernelFunction() {}
+
+ /* enqueue the kernel, returns false if there is an error */
+ virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data) = 0;
+};
+
+class DeviceSplitKernel {
+private:
+ Device *device;
+
+ SplitKernelFunction *kernel_path_init;
+ SplitKernelFunction *kernel_scene_intersect;
+ SplitKernelFunction *kernel_lamp_emission;
+ SplitKernelFunction *kernel_queue_enqueue;
+ SplitKernelFunction *kernel_background_buffer_update;
+ SplitKernelFunction *kernel_shader_eval;
+ SplitKernelFunction *kernel_holdout_emission_blurring_pathtermination_ao;
+ SplitKernelFunction *kernel_direct_lighting;
+ SplitKernelFunction *kernel_shadow_blocked;
+ SplitKernelFunction *kernel_next_iteration_setup;
+
+	/* Global memory variables [porting]; this memory is used for
+	 * cooperation between different kernels: data written by one
+	 * kernel is available to another kernel via this global
+	 * memory.
+	 */
+ device_memory split_data;
+ device_vector<uchar> ray_state;
+ device_memory queue_index; /* Array of size num_queues * sizeof(int) that tracks the size of each queue. */
+
+	/* Flag to make the scene_intersect and lamp_emission kernels use queues. */
+ device_memory use_queues_flag;
+
+ /* Approximate time it takes to complete one sample */
+ double avg_time_per_sample;
+
+ /* Work pool with respect to each work group. */
+ device_memory work_pool_wgs;
+
+ /* clos_max value for which the kernels have been loaded currently. */
+ int current_max_closure;
+
+ /* Marked True in constructor and marked false at the end of path_trace(). */
+ bool first_tile;
+
+public:
+ explicit DeviceSplitKernel(Device* device);
+ virtual ~DeviceSplitKernel();
+
+ bool load_kernels(const DeviceRequestedFeatures& requested_features);
+ bool path_trace(DeviceTask *task,
+ RenderTile& rtile,
+ device_memory& kgbuffer,
+ device_memory& kernel_data);
+
+ virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads) = 0;
+ size_t max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, size_t max_buffer_size);
+
+ virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
+ RenderTile& rtile,
+ int num_global_elements,
+ device_memory& kernel_globals,
+ device_memory& kernel_data_,
+ device_memory& split_data,
+ device_memory& ray_state,
+ device_memory& queue_index,
+ device_memory& use_queues_flag,
+ device_memory& work_pool_wgs) = 0;
+
+ virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&) = 0;
+ virtual int2 split_kernel_local_size() = 0;
+ virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task) = 0;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __DEVICE_SPLIT_KERNEL_H__ */
+
+
+
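
Note: a backend only implements the five pure virtuals above; DeviceSplitKernel then owns buffer allocation and the kernel loop. The expected call sequence, as a sketch (MySplitKernel and the surrounding variables are hypothetical; compare thread_path_trace_split() in device_cpu.cpp):

	/* Sketch of the lifecycle the base class expects. */
	MySplitKernel split_kernel(device);

	DeviceRequestedFeatures requested_features;
	requested_features.max_closure = MAX_CLOSURE;
	if(!split_kernel.load_kernels(requested_features)) {
		return; /* one of the ten kernel functions failed to load */
	}

	while(task.acquire_tile(device, tile)) {
		/* The first call allocates split_data/ray_state/queues, then runs
		 * path_init -> ... -> next_iteration_setup until every ray is
		 * RAY_INACTIVE. */
		split_kernel.path_trace(&task, tile, kgbuffer, kernel_data);
		task.release_tile(tile);
	}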
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
index 8bd54c3d2b0..f31092fd9d2 100644
--- a/intern/cycles/device/device_task.h
+++ b/intern/cycles/device/device_task.h
@@ -51,6 +51,8 @@ public:
int shader_filter;
int shader_x, shader_w;
+ int passes_size;
+
explicit DeviceTask(Type type = PATH_TRACE);
int get_subtask_count(int num, int max_size = 0);
diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h
index 4023ba89a10..6470cb8ff7e 100644
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/opencl.h
@@ -26,29 +26,29 @@
CCL_NAMESPACE_BEGIN
-#define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))
-
-/* Macro declarations used with split kernel */
-
-/* Macro to enable/disable work-stealing */
-#define __WORK_STEALING__
-
-#define SPLIT_KERNEL_LOCAL_SIZE_X 64
-#define SPLIT_KERNEL_LOCAL_SIZE_Y 1
-
-/* This value may be tuned according to the scene we are rendering.
- *
- * Modifying PATH_ITER_INC_FACTOR value proportional to number of expected
- * ray-bounces will improve performance.
- */
-#define PATH_ITER_INC_FACTOR 8
+/* Define CYCLES_DISABLE_DRIVER_WORKAROUNDS to disable workarounds for testing. */
+#ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS
+/* Work around AMD driver hangs by ensuring each command is finished before doing anything else. */
+# undef clEnqueueNDRangeKernel
+# define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \
+ clFinish(a); \
+ CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i); \
+ clFinish(a);
+
+# undef clEnqueueWriteBuffer
+# define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \
+ clFinish(a); \
+ CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i); \
+ clFinish(a);
+
+# undef clEnqueueReadBuffer
+# define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \
+ clFinish(a); \
+ CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i); \
+ clFinish(a);
+#endif /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */
-/* When allocate global memory in chunks. We may not be able to
- * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes in chunks;
- * Since some bytes may be needed for aligning chunks of memory;
- * This is the amount of memory that we dedicate for that purpose.
- */
-#define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB
+#define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))
struct OpenCLPlatformDevice {
OpenCLPlatformDevice(cl_platform_id platform_id,
@@ -248,6 +248,7 @@ public:
bool device_initialized;
string platform_name;
+ string device_name;
bool opencl_error(cl_int err);
void opencl_error(const string& message);
@@ -266,10 +267,10 @@ public:
/* Has to be implemented by the real device classes.
* The base device will then load all these programs. */
- virtual void load_kernels(const DeviceRequestedFeatures& requested_features,
+ virtual bool load_kernels(const DeviceRequestedFeatures& requested_features,
vector<OpenCLProgram*> &programs) = 0;
- void mem_alloc(device_memory& mem, MemoryType type);
+ void mem_alloc(const char *name, device_memory& mem, MemoryType type);
void mem_copy_to(device_memory& mem);
void mem_copy_from(device_memory& mem, int y, int w, int h, int elem);
void mem_zero(device_memory& mem);
@@ -326,16 +327,39 @@ protected:
class ArgumentWrapper {
public:
- ArgumentWrapper() : size(0), pointer(NULL) {}
- template <typename T>
+ ArgumentWrapper() : size(0), pointer(NULL)
+ {
+ }
+
+ ArgumentWrapper(device_memory& argument) : size(sizeof(void*)),
+ pointer((void*)(&argument.device_pointer))
+ {
+ }
+
+ template<typename T>
+ ArgumentWrapper(device_vector<T>& argument) : size(sizeof(void*)),
+ pointer((void*)(&argument.device_pointer))
+ {
+ }
+
+ template<typename T>
ArgumentWrapper(T& argument) : size(sizeof(argument)),
- pointer(&argument) { }
+ pointer(&argument)
+ {
+ }
+
ArgumentWrapper(int argument) : size(sizeof(int)),
int_value(argument),
- pointer(&int_value) { }
+ pointer(&int_value)
+ {
+ }
+
ArgumentWrapper(float argument) : size(sizeof(float)),
float_value(argument),
- pointer(&float_value) { }
+ pointer(&float_value)
+ {
+ }
+
size_t size;
int int_value;
float float_value;
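
With these overloads in place, a single kernel_set_args() call (the variadic helper used throughout this patch) can mix plain values and device buffers; each argument is implicitly converted to an ArgumentWrapper and, in effect, forwarded to clSetKernelArg(kernel, index, wrapper.size, wrapper.pointer). A rough usage sketch, with invented variable names:

device_memory state;       /* bound via the new device_memory overload */
int num_elements = 65536;  /* bound via the int overload */
float threshold = 0.5f;    /* bound via the float overload */
kernel_set_args(some_kernel, 0, state, num_elements, threshold);
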
diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp
index a2b900312e7..c5f44f84e8c 100644
--- a/intern/cycles/device/opencl/opencl_base.cpp
+++ b/intern/cycles/device/opencl/opencl_base.cpp
@@ -82,9 +82,10 @@ OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool backgrou
cpPlatform = platform_device.platform_id;
cdDevice = platform_device.device_id;
platform_name = platform_device.platform_name;
+ device_name = platform_device.device_name;
VLOG(2) << "Creating new Cycles device for OpenCL platform "
<< platform_name << ", device "
- << platform_device.device_name << ".";
+ << device_name << ".";
{
/* try to use cached context */
@@ -113,12 +114,16 @@ OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool backgrou
}
cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
- if(opencl_error(ciErr))
+ if(opencl_error(ciErr)) {
+ opencl_error("OpenCL: Error creating command queue");
return;
+ }
null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr);
- if(opencl_error(ciErr))
+ if(opencl_error(ciErr)) {
+ opencl_error("OpenCL: Error creating memory buffer for NULL");
return;
+ }
fprintf(stderr, "Device init success\n");
device_initialized = true;
@@ -191,6 +196,8 @@ string OpenCLDeviceBase::device_md5_hash(string kernel_custom_build_options)
bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_features)
{
+ VLOG(2) << "Loading kernels for platform " << platform_name
+ << ", device " << device_name << ".";
/* Verify if device was initialized. */
if(!device_initialized) {
fprintf(stderr, "OpenCL: failed to initialize device.\n");
@@ -206,11 +213,14 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea
base_program.add_kernel(ustring("convert_to_half_float"));
base_program.add_kernel(ustring("shader"));
base_program.add_kernel(ustring("bake"));
+ base_program.add_kernel(ustring("zero_buffer"));
vector<OpenCLProgram*> programs;
programs.push_back(&base_program);
/* Call actual class to fill the vector with its programs. */
- load_kernels(requested_features, programs);
+ if(!load_kernels(requested_features, programs)) {
+ return false;
+ }
/* Parallel compilation is supported by Cycles, but currently all OpenCL frameworks
* serialize the calls internally, so it's not much use right now.
@@ -242,8 +252,14 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea
return true;
}
-void OpenCLDeviceBase::mem_alloc(device_memory& mem, MemoryType type)
+void OpenCLDeviceBase::mem_alloc(const char *name, device_memory& mem, MemoryType type)
{
+ if(name) {
+ VLOG(1) << "Buffer allocate: " << name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+ }
+
size_t size = mem.memory_size();
cl_mem_flags mem_flag;
@@ -311,8 +327,61 @@ void OpenCLDeviceBase::mem_copy_from(device_memory& mem, int y, int w, int h, in
void OpenCLDeviceBase::mem_zero(device_memory& mem)
{
if(mem.device_pointer) {
- memset((void*)mem.data_pointer, 0, mem.memory_size());
- mem_copy_to(mem);
+ if(base_program.is_loaded()) {
+ cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
+
+ size_t global_size[] = {1024, 1024};
+ size_t num_threads = global_size[0] * global_size[1];
+
+ cl_mem d_buffer = CL_MEM_PTR(mem.device_pointer);
+ unsigned long long d_offset = 0;
+ unsigned long long d_size = 0;
+
+ while(d_offset < mem.memory_size()) {
+ d_size = std::min<unsigned long long>(num_threads*sizeof(float4), mem.memory_size() - d_offset);
+
+ kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
+
+ ciErr = clEnqueueNDRangeKernel(cqCommandQueue,
+ ckZeroBuffer,
+ 2,
+ NULL,
+ global_size,
+ NULL,
+ 0,
+ NULL,
+ NULL);
+ opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
+
+ d_offset += d_size;
+ }
+ }
+
+ if(mem.data_pointer) {
+ memset((void*)mem.data_pointer, 0, mem.memory_size());
+ }
+
+ if(!base_program.is_loaded()) {
+ void* zero = (void*)mem.data_pointer;
+
+ if(!mem.data_pointer) {
+ zero = util_aligned_malloc(mem.memory_size(), 16);
+ memset(zero, 0, mem.memory_size());
+ }
+
+ opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
+ CL_MEM_PTR(mem.device_pointer),
+ CL_TRUE,
+ 0,
+ mem.memory_size(),
+ zero,
+ 0,
+ NULL, NULL));
+
+ if(!mem.data_pointer) {
+ util_aligned_free(zero);
+ }
+ }
}
}
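
The host loop above clears device memory in chunks of at most num_threads * sizeof(float4) bytes, handing each chunk's byte size and offset to a 2D grid of 1024x1024 threads. A sketch of what the matching device-side zero_buffer kernel could look like (an assumption for illustration; the actual kernel ships elsewhere in the patch and its exact name and signature may differ):

__kernel void zero_buffer(__global float4 *buffer, ulong size, ulong offset)
{
	/* Flatten the 2D launch into one thread index. */
	size_t i = get_global_id(1) * get_global_size(0) + get_global_id(0);
	size_t byte = offset + i * sizeof(float4);

	if(byte + sizeof(float4) <= offset + size) {
		/* Common case: each thread clears one 16-byte float4. */
		buffer[byte / sizeof(float4)] = (float4)(0.0f);
	}
	else if(byte < offset + size) {
		/* Trailing bytes when size is not a multiple of sizeof(float4). */
		__global uchar *b = (__global uchar*)buffer;
		for(size_t j = byte; j < offset + size; j++)
			b[j] = 0;
	}
}
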
@@ -337,7 +406,7 @@ void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size)
device_vector<uchar> *data = new device_vector<uchar>();
data->copy((uchar*)host, size);
- mem_alloc(*data, MEM_READ_ONLY);
+ mem_alloc(name, *data, MEM_READ_ONLY);
i = const_mem_map.insert(ConstMemMap::value_type(name, data)).first;
}
else {
@@ -356,7 +425,7 @@ void OpenCLDeviceBase::tex_alloc(const char *name,
VLOG(1) << "Texture allocate: " << name << ", "
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ")";
- mem_alloc(mem, MEM_READ_ONLY);
+ mem_alloc(NULL, mem, MEM_READ_ONLY);
mem_copy_to(mem);
assert(mem_map.find(name) == mem_map.end());
mem_map.insert(MemMap::value_type(name, mem.device_pointer));
diff --git a/intern/cycles/device/opencl/opencl_mega.cpp b/intern/cycles/device/opencl/opencl_mega.cpp
index 6ea7619e022..049e332272b 100644
--- a/intern/cycles/device/opencl/opencl_mega.cpp
+++ b/intern/cycles/device/opencl/opencl_mega.cpp
@@ -43,11 +43,12 @@ public:
return true;
}
- virtual void load_kernels(const DeviceRequestedFeatures& /*requested_features*/,
+ virtual bool load_kernels(const DeviceRequestedFeatures& /*requested_features*/,
vector<OpenCLProgram*> &programs)
{
path_trace_program.add_kernel(ustring("path_trace"));
programs.push_back(&path_trace_program);
+ return true;
}
~OpenCLDeviceMegaKernel()
diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp
index 3c3c2150128..b651b4a848e 100644
--- a/intern/cycles/device/opencl/opencl_split.cpp
+++ b/intern/cycles/device/opencl/opencl_split.cpp
@@ -21,325 +21,48 @@
#include "buffers.h"
#include "kernel_types.h"
+#include "kernel_split_data_types.h"
+#include "device_split_kernel.h"
+
+#include "util_logging.h"
#include "util_md5.h"
#include "util_path.h"
#include "util_time.h"
CCL_NAMESPACE_BEGIN
-/* TODO(sergey): This is to keep tile split on OpenCL level working
- * for now, since without this view-port render does not work as it
- * should.
- *
- * Ideally it'll be done on the higher level, but we need to get ready
- * for merge rather soon, so let's keep split logic private here in
- * the file.
- */
-class SplitRenderTile : public RenderTile {
-public:
- SplitRenderTile()
- : RenderTile(),
- buffer_offset_x(0),
- buffer_offset_y(0),
- rng_state_offset_x(0),
- rng_state_offset_y(0),
- buffer_rng_state_stride(0) {}
-
- explicit SplitRenderTile(RenderTile& tile)
- : RenderTile(),
- buffer_offset_x(0),
- buffer_offset_y(0),
- rng_state_offset_x(0),
- rng_state_offset_y(0),
- buffer_rng_state_stride(0)
- {
- x = tile.x;
- y = tile.y;
- w = tile.w;
- h = tile.h;
- start_sample = tile.start_sample;
- num_samples = tile.num_samples;
- sample = tile.sample;
- resolution = tile.resolution;
- offset = tile.offset;
- stride = tile.stride;
- buffer = tile.buffer;
- rng_state = tile.rng_state;
- buffers = tile.buffers;
+class OpenCLSplitKernel;
+
+static string get_build_options(OpenCLDeviceBase *device, const DeviceRequestedFeatures& requested_features)
+{
+ string build_options = "-D__SPLIT_KERNEL__ ";
+ build_options += requested_features.get_build_options();
+
+ /* Set compute device build option. */
+ cl_device_type device_type;
+ device->ciErr = clGetDeviceInfo(device->cdDevice,
+ CL_DEVICE_TYPE,
+ sizeof(cl_device_type),
+ &device_type,
+ NULL);
+ assert(device->ciErr == CL_SUCCESS);
+ if(device_type == CL_DEVICE_TYPE_GPU) {
+ build_options += " -D__COMPUTE_DEVICE_GPU__";
}
- /* Split kernel is device global memory constrained;
- * hence split kernel cant render big tile size's in
- * one go. If the user sets a big tile size (big tile size
- * is a term relative to the available device global memory),
- * we split the tile further and then call path_trace on
- * each of those split tiles. The following variables declared,
- * assist in achieving that purpose
- */
- int buffer_offset_x;
- int buffer_offset_y;
- int rng_state_offset_x;
- int rng_state_offset_y;
- int buffer_rng_state_stride;
-};
+ return build_options;
+}
/* OpenCLDeviceSplitKernel's declaration/definition. */
class OpenCLDeviceSplitKernel : public OpenCLDeviceBase
{
public:
- /* Kernel declaration. */
+ DeviceSplitKernel *split_kernel;
OpenCLProgram program_data_init;
- OpenCLProgram program_scene_intersect;
- OpenCLProgram program_lamp_emission;
- OpenCLProgram program_queue_enqueue;
- OpenCLProgram program_background_buffer_update;
- OpenCLProgram program_shader_eval;
- OpenCLProgram program_holdout_emission_blurring_pathtermination_ao;
- OpenCLProgram program_direct_lighting;
- OpenCLProgram program_shadow_blocked;
- OpenCLProgram program_next_iteration_setup;
- OpenCLProgram program_sum_all_radiance;
-
- /* Global memory variables [porting]; These memory is used for
- * co-operation between different kernels; Data written by one
- * kernel will be available to another kernel via this global
- * memory.
- */
- cl_mem rng_coop;
- cl_mem throughput_coop;
- cl_mem L_transparent_coop;
- cl_mem PathRadiance_coop;
- cl_mem Ray_coop;
- cl_mem PathState_coop;
- cl_mem Intersection_coop;
- cl_mem kgbuffer; /* KernelGlobals buffer. */
-
- /* Global buffers for ShaderData. */
- cl_mem sd; /* ShaderData used in the main path-iteration loop. */
- cl_mem sd_DL_shadow; /* ShaderData used in Direct Lighting and
- * shadow_blocked kernel.
- */
-
- /* Global memory required for shadow blocked and accum_radiance. */
- cl_mem BSDFEval_coop;
- cl_mem ISLamp_coop;
- cl_mem LightRay_coop;
- cl_mem AOAlpha_coop;
- cl_mem AOBSDF_coop;
- cl_mem AOLightRay_coop;
- cl_mem Intersection_coop_shadow;
-
-#ifdef WITH_CYCLES_DEBUG
- /* DebugData memory */
- cl_mem debugdata_coop;
-#endif
-
- /* Global state array that tracks ray state. */
- cl_mem ray_state;
-
- /* Per sample buffers. */
- cl_mem per_sample_output_buffers;
-
- /* Denotes which sample each ray is being processed for. */
- cl_mem work_array;
-
- /* Queue */
- cl_mem Queue_data; /* Array of size queuesize * num_queues * sizeof(int). */
- cl_mem Queue_index; /* Array of size num_queues * sizeof(int);
- * Tracks the size of each queue.
- */
-
- /* Flag to make sceneintersect and lampemission kernel use queues. */
- cl_mem use_queues_flag;
-
- /* Amount of memory in output buffer associated with one pixel/thread. */
- size_t per_thread_output_buffer_size;
-
- /* Total allocatable available device memory. */
- size_t total_allocatable_memory;
-
- /* host version of ray_state; Used in checking host path-iteration
- * termination.
- */
- char *hostRayStateArray;
-
- /* Number of path-iterations to be done in one shot. */
- unsigned int PathIteration_times;
-
-#ifdef __WORK_STEALING__
- /* Work pool with respect to each work group. */
- cl_mem work_pool_wgs;
-
- /* Denotes the maximum work groups possible w.r.t. current tile size. */
- unsigned int max_work_groups;
-#endif
-
- /* clos_max value for which the kernels have been loaded currently. */
- int current_max_closure;
-
- /* Marked True in constructor and marked false at the end of path_trace(). */
- bool first_tile;
-
- OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_)
- : OpenCLDeviceBase(info, stats, background_)
- {
- background = background_;
-
- /* Initialize cl_mem variables. */
- kgbuffer = NULL;
- sd = NULL;
- sd_DL_shadow = NULL;
-
- rng_coop = NULL;
- throughput_coop = NULL;
- L_transparent_coop = NULL;
- PathRadiance_coop = NULL;
- Ray_coop = NULL;
- PathState_coop = NULL;
- Intersection_coop = NULL;
- ray_state = NULL;
-
- AOAlpha_coop = NULL;
- AOBSDF_coop = NULL;
- AOLightRay_coop = NULL;
- BSDFEval_coop = NULL;
- ISLamp_coop = NULL;
- LightRay_coop = NULL;
- Intersection_coop_shadow = NULL;
-
-#ifdef WITH_CYCLES_DEBUG
- debugdata_coop = NULL;
-#endif
-
- work_array = NULL;
-
- /* Queue. */
- Queue_data = NULL;
- Queue_index = NULL;
- use_queues_flag = NULL;
-
- per_sample_output_buffers = NULL;
-
- per_thread_output_buffer_size = 0;
- hostRayStateArray = NULL;
- PathIteration_times = PATH_ITER_INC_FACTOR;
-#ifdef __WORK_STEALING__
- work_pool_wgs = NULL;
- max_work_groups = 0;
-#endif
- current_max_closure = -1;
- first_tile = true;
-
- /* Get device's maximum memory that can be allocated. */
- ciErr = clGetDeviceInfo(cdDevice,
- CL_DEVICE_MAX_MEM_ALLOC_SIZE,
- sizeof(size_t),
- &total_allocatable_memory,
- NULL);
- assert(ciErr == CL_SUCCESS);
- if(platform_name == "AMD Accelerated Parallel Processing") {
- /* This value is tweak-able; AMD platform does not seem to
- * give maximum performance when all of CL_DEVICE_MAX_MEM_ALLOC_SIZE
- * is considered for further computation.
- */
- total_allocatable_memory /= 2;
- }
- }
-
- virtual bool show_samples() const {
- return false;
- }
-
- /* Split kernel utility functions. */
- size_t get_tex_size(const char *tex_name)
- {
- cl_mem ptr;
- size_t ret_size = 0;
- MemMap::iterator i = mem_map.find(tex_name);
- if(i != mem_map.end()) {
- ptr = CL_MEM_PTR(i->second);
- ciErr = clGetMemObjectInfo(ptr,
- CL_MEM_SIZE,
- sizeof(ret_size),
- &ret_size,
- NULL);
- assert(ciErr == CL_SUCCESS);
- }
- return ret_size;
- }
-
- size_t get_shader_data_size(size_t max_closure)
- {
- /* ShaderData size with variable size ShaderClosure array */
- return sizeof(ShaderData) - (sizeof(ShaderClosure) * (MAX_CLOSURE - max_closure));
- }
-
- /* Returns size of KernelGlobals structure associated with OpenCL. */
- size_t get_KernelGlobals_size()
- {
- /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
- * fetch its size.
- */
- typedef struct KernelGlobals {
- ccl_constant KernelData *data;
-#define KERNEL_TEX(type, ttype, name) \
- ccl_global type *name;
-#include "kernel_textures.h"
-#undef KERNEL_TEX
- void *sd_input;
- void *isect_shadow;
- } KernelGlobals;
-
- return sizeof(KernelGlobals);
- }
+ OpenCLProgram program_state_buffer_size;
- virtual void load_kernels(const DeviceRequestedFeatures& requested_features,
- vector<OpenCLProgram*> &programs)
- {
- string build_options = "-D__SPLIT_KERNEL__ ";
-#ifdef __WORK_STEALING__
- build_options += "-D__WORK_STEALING__ ";
-#endif
- build_options += requested_features.get_build_options();
-
- /* Set compute device build option. */
- cl_device_type device_type;
- ciErr = clGetDeviceInfo(cdDevice,
- CL_DEVICE_TYPE,
- sizeof(cl_device_type),
- &device_type,
- NULL);
- assert(ciErr == CL_SUCCESS);
- if(device_type == CL_DEVICE_TYPE_GPU) {
- build_options += " -D__COMPUTE_DEVICE_GPU__";
- }
-
-#define GLUE(a, b) a ## b
-#define LOAD_KERNEL(name) \
- do { \
- GLUE(program_, name) = OpenCLProgram(this, "split_" #name, "kernel_" #name ".cl", build_options); \
- GLUE(program_, name).add_kernel(ustring("path_trace_" #name)); \
- programs.push_back(&GLUE(program_, name)); \
- } while(false)
-
- LOAD_KERNEL(data_init);
- LOAD_KERNEL(scene_intersect);
- LOAD_KERNEL(lamp_emission);
- LOAD_KERNEL(queue_enqueue);
- LOAD_KERNEL(background_buffer_update);
- LOAD_KERNEL(shader_eval);
- LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao);
- LOAD_KERNEL(direct_lighting);
- LOAD_KERNEL(shadow_blocked);
- LOAD_KERNEL(next_iteration_setup);
- LOAD_KERNEL(sum_all_radiance);
-
-#undef FIND_KERNEL
-#undef GLUE
-
- current_max_closure = requested_features.max_closure;
- }
+ OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_);
~OpenCLDeviceSplitKernel()
{
@@ -347,960 +70,298 @@ public:
/* Release kernels */
program_data_init.release();
- program_scene_intersect.release();
- program_lamp_emission.release();
- program_queue_enqueue.release();
- program_background_buffer_update.release();
- program_shader_eval.release();
- program_holdout_emission_blurring_pathtermination_ao.release();
- program_direct_lighting.release();
- program_shadow_blocked.release();
- program_next_iteration_setup.release();
- program_sum_all_radiance.release();
-
- /* Release global memory */
- release_mem_object_safe(rng_coop);
- release_mem_object_safe(throughput_coop);
- release_mem_object_safe(L_transparent_coop);
- release_mem_object_safe(PathRadiance_coop);
- release_mem_object_safe(Ray_coop);
- release_mem_object_safe(PathState_coop);
- release_mem_object_safe(Intersection_coop);
- release_mem_object_safe(kgbuffer);
- release_mem_object_safe(sd);
- release_mem_object_safe(sd_DL_shadow);
- release_mem_object_safe(ray_state);
- release_mem_object_safe(AOAlpha_coop);
- release_mem_object_safe(AOBSDF_coop);
- release_mem_object_safe(AOLightRay_coop);
- release_mem_object_safe(BSDFEval_coop);
- release_mem_object_safe(ISLamp_coop);
- release_mem_object_safe(LightRay_coop);
- release_mem_object_safe(Intersection_coop_shadow);
-#ifdef WITH_CYCLES_DEBUG
- release_mem_object_safe(debugdata_coop);
-#endif
- release_mem_object_safe(use_queues_flag);
- release_mem_object_safe(Queue_data);
- release_mem_object_safe(Queue_index);
- release_mem_object_safe(work_array);
-#ifdef __WORK_STEALING__
- release_mem_object_safe(work_pool_wgs);
-#endif
- release_mem_object_safe(per_sample_output_buffers);
-
- if(hostRayStateArray != NULL) {
- free(hostRayStateArray);
- }
+
+ delete split_kernel;
}
- void path_trace(DeviceTask *task,
- SplitRenderTile& rtile,
- int2 max_render_feasible_tile_size)
+ virtual bool load_kernels(const DeviceRequestedFeatures& requested_features,
+ vector<OpenCLDeviceBase::OpenCLProgram*> &programs)
{
- /* cast arguments to cl types */
- cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
- cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
- cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
- cl_int d_x = rtile.x;
- cl_int d_y = rtile.y;
- cl_int d_w = rtile.w;
- cl_int d_h = rtile.h;
- cl_int d_offset = rtile.offset;
- cl_int d_stride = rtile.stride;
-
- /* Make sure that set render feasible tile size is a multiple of local
- * work size dimensions.
- */
- assert(max_render_feasible_tile_size.x % SPLIT_KERNEL_LOCAL_SIZE_X == 0);
- assert(max_render_feasible_tile_size.y % SPLIT_KERNEL_LOCAL_SIZE_Y == 0);
-
- size_t global_size[2];
- size_t local_size[2] = {SPLIT_KERNEL_LOCAL_SIZE_X,
- SPLIT_KERNEL_LOCAL_SIZE_Y};
+ program_data_init = OpenCLDeviceBase::OpenCLProgram(this,
+ "split_data_init",
+ "kernel_data_init.cl",
+ get_build_options(this, requested_features));
+ program_data_init.add_kernel(ustring("path_trace_data_init"));
+ programs.push_back(&program_data_init);
+
+ program_state_buffer_size = OpenCLDeviceBase::OpenCLProgram(this,
+ "split_state_buffer_size",
+ "kernel_state_buffer_size.cl",
+ get_build_options(this, requested_features));
+ program_state_buffer_size.add_kernel(ustring("path_trace_state_buffer_size"));
+ programs.push_back(&program_state_buffer_size);
+
+ return split_kernel->load_kernels(requested_features);
+ }
- /* Set the range of samples to be processed for every ray in
- * path-regeneration logic.
- */
- cl_int start_sample = rtile.start_sample;
- cl_int end_sample = rtile.start_sample + rtile.num_samples;
- cl_int num_samples = rtile.num_samples;
-
-#ifdef __WORK_STEALING__
- global_size[0] = (((d_w - 1) / local_size[0]) + 1) * local_size[0];
- global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
- unsigned int num_parallel_samples = 1;
-#else
- global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
- unsigned int num_threads = max_render_feasible_tile_size.x *
- max_render_feasible_tile_size.y;
- unsigned int num_tile_columns_possible = num_threads / global_size[1];
- /* Estimate number of parallel samples that can be
- * processed in parallel.
- */
- unsigned int num_parallel_samples = min(num_tile_columns_possible / d_w,
- rtile.num_samples);
- /* Wavefront size in AMD is 64.
- * TODO(sergey): What about other platforms?
- */
- if(num_parallel_samples >= 64) {
- /* TODO(sergey): Could use generic round-up here. */
- num_parallel_samples = (num_parallel_samples / 64) * 64;
+ void thread_run(DeviceTask *task)
+ {
+ if(task->type == DeviceTask::FILM_CONVERT) {
+ film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
}
- assert(num_parallel_samples != 0);
-
- global_size[0] = d_w * num_parallel_samples;
-#endif /* __WORK_STEALING__ */
-
- assert(global_size[0] * global_size[1] <=
- max_render_feasible_tile_size.x * max_render_feasible_tile_size.y);
-
- /* Allocate all required global memory once. */
- if(first_tile) {
- size_t num_global_elements = max_render_feasible_tile_size.x *
- max_render_feasible_tile_size.y;
- /* TODO(sergey): This will actually over-allocate if
- * particular kernel does not support multiclosure.
- */
- size_t shaderdata_size = get_shader_data_size(current_max_closure);
-
-#ifdef __WORK_STEALING__
- /* Calculate max groups */
- size_t max_global_size[2];
- size_t tile_x = max_render_feasible_tile_size.x;
- size_t tile_y = max_render_feasible_tile_size.y;
- max_global_size[0] = (((tile_x - 1) / local_size[0]) + 1) * local_size[0];
- max_global_size[1] = (((tile_y - 1) / local_size[1]) + 1) * local_size[1];
- max_work_groups = (max_global_size[0] * max_global_size[1]) /
- (local_size[0] * local_size[1]);
- /* Allocate work_pool_wgs memory. */
- work_pool_wgs = mem_alloc(max_work_groups * sizeof(unsigned int));
-#endif /* __WORK_STEALING__ */
-
- /* Allocate queue_index memory only once. */
- Queue_index = mem_alloc(NUM_QUEUES * sizeof(int));
- use_queues_flag = mem_alloc(sizeof(char));
- kgbuffer = mem_alloc(get_KernelGlobals_size());
-
- /* Create global buffers for ShaderData. */
- sd = mem_alloc(num_global_elements * shaderdata_size);
- sd_DL_shadow = mem_alloc(num_global_elements * 2 * shaderdata_size);
-
- /* Creation of global memory buffers which are shared among
- * the kernels.
- */
- rng_coop = mem_alloc(num_global_elements * sizeof(RNG));
- throughput_coop = mem_alloc(num_global_elements * sizeof(float3));
- L_transparent_coop = mem_alloc(num_global_elements * sizeof(float));
- PathRadiance_coop = mem_alloc(num_global_elements * sizeof(PathRadiance));
- Ray_coop = mem_alloc(num_global_elements * sizeof(Ray));
- PathState_coop = mem_alloc(num_global_elements * sizeof(PathState));
- Intersection_coop = mem_alloc(num_global_elements * sizeof(Intersection));
- AOAlpha_coop = mem_alloc(num_global_elements * sizeof(float3));
- AOBSDF_coop = mem_alloc(num_global_elements * sizeof(float3));
- AOLightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
- BSDFEval_coop = mem_alloc(num_global_elements * sizeof(BsdfEval));
- ISLamp_coop = mem_alloc(num_global_elements * sizeof(int));
- LightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
- Intersection_coop_shadow = mem_alloc(2 * num_global_elements * sizeof(Intersection));
-
-#ifdef WITH_CYCLES_DEBUG
- debugdata_coop = mem_alloc(num_global_elements * sizeof(DebugData));
-#endif
-
- ray_state = mem_alloc(num_global_elements * sizeof(char));
-
- hostRayStateArray = (char *)calloc(num_global_elements, sizeof(char));
- assert(hostRayStateArray != NULL && "Can't create hostRayStateArray memory");
-
- Queue_data = mem_alloc(num_global_elements * (NUM_QUEUES * sizeof(int)+sizeof(int)));
- work_array = mem_alloc(num_global_elements * sizeof(unsigned int));
- per_sample_output_buffers = mem_alloc(num_global_elements *
- per_thread_output_buffer_size);
+ else if(task->type == DeviceTask::SHADER) {
+ shader(*task);
}
+ else if(task->type == DeviceTask::PATH_TRACE) {
+ RenderTile tile;
- cl_int dQueue_size = global_size[0] * global_size[1];
-
- cl_uint start_arg_index =
- kernel_set_args(program_data_init(),
- 0,
- kgbuffer,
- sd_DL_shadow,
- d_data,
- per_sample_output_buffers,
- d_rng_state,
- rng_coop,
- throughput_coop,
- L_transparent_coop,
- PathRadiance_coop,
- Ray_coop,
- PathState_coop,
- Intersection_coop_shadow,
- ray_state);
-
-/* TODO(sergey): Avoid map lookup here. */
+ /* Mirror the OpenCL KernelGlobals struct from kernel_globals.h here,
+ * only to compute its size.
+ */
+ typedef struct KernelGlobals {
+ ccl_constant KernelData *data;
#define KERNEL_TEX(type, ttype, name) \
- set_kernel_arg_mem(program_data_init(), &start_arg_index, #name);
+ ccl_global type *name;
#include "kernel_textures.h"
#undef KERNEL_TEX
+ SplitData split_data;
+ SplitParams split_param_data;
+ } KernelGlobals;
- start_arg_index +=
- kernel_set_args(program_data_init(),
- start_arg_index,
- start_sample,
- d_x,
- d_y,
- d_w,
- d_h,
- d_offset,
- d_stride,
- rtile.rng_state_offset_x,
- rtile.rng_state_offset_y,
- rtile.buffer_rng_state_stride,
- Queue_data,
- Queue_index,
- dQueue_size,
- use_queues_flag,
- work_array,
-#ifdef __WORK_STEALING__
- work_pool_wgs,
- num_samples,
-#endif
-#ifdef WITH_CYCLES_DEBUG
- debugdata_coop,
-#endif
- num_parallel_samples);
-
- kernel_set_args(program_scene_intersect(),
- 0,
- kgbuffer,
- d_data,
- rng_coop,
- Ray_coop,
- PathState_coop,
- Intersection_coop,
- ray_state,
- d_w,
- d_h,
- Queue_data,
- Queue_index,
- dQueue_size,
- use_queues_flag,
-#ifdef WITH_CYCLES_DEBUG
- debugdata_coop,
-#endif
- num_parallel_samples);
-
- kernel_set_args(program_lamp_emission(),
- 0,
- kgbuffer,
- d_data,
- throughput_coop,
- PathRadiance_coop,
- Ray_coop,
- PathState_coop,
- Intersection_coop,
- ray_state,
- d_w,
- d_h,
- Queue_data,
- Queue_index,
- dQueue_size,
- use_queues_flag,
- num_parallel_samples);
-
- kernel_set_args(program_queue_enqueue(),
- 0,
- Queue_data,
- Queue_index,
- ray_state,
- dQueue_size);
-
- kernel_set_args(program_background_buffer_update(),
- 0,
- kgbuffer,
- d_data,
- per_sample_output_buffers,
- d_rng_state,
- rng_coop,
- throughput_coop,
- PathRadiance_coop,
- Ray_coop,
- PathState_coop,
- L_transparent_coop,
- ray_state,
- d_w,
- d_h,
- d_x,
- d_y,
- d_stride,
- rtile.rng_state_offset_x,
- rtile.rng_state_offset_y,
- rtile.buffer_rng_state_stride,
- work_array,
- Queue_data,
- Queue_index,
- dQueue_size,
- end_sample,
- start_sample,
-#ifdef __WORK_STEALING__
- work_pool_wgs,
- num_samples,
-#endif
-#ifdef WITH_CYCLES_DEBUG
- debugdata_coop,
-#endif
- num_parallel_samples);
-
- kernel_set_args(program_shader_eval(),
- 0,
- kgbuffer,
- d_data,
- sd,
- rng_coop,
- Ray_coop,
- PathState_coop,
- Intersection_coop,
- ray_state,
- Queue_data,
- Queue_index,
- dQueue_size);
-
- kernel_set_args(program_holdout_emission_blurring_pathtermination_ao(),
- 0,
- kgbuffer,
- d_data,
- sd,
- per_sample_output_buffers,
- rng_coop,
- throughput_coop,
- L_transparent_coop,
- PathRadiance_coop,
- PathState_coop,
- Intersection_coop,
- AOAlpha_coop,
- AOBSDF_coop,
- AOLightRay_coop,
- d_w,
- d_h,
- d_x,
- d_y,
- d_stride,
- ray_state,
- work_array,
- Queue_data,
- Queue_index,
- dQueue_size,
-#ifdef __WORK_STEALING__
- start_sample,
-#endif
- num_parallel_samples);
-
- kernel_set_args(program_direct_lighting(),
- 0,
- kgbuffer,
- d_data,
- sd,
- rng_coop,
- PathState_coop,
- ISLamp_coop,
- LightRay_coop,
- BSDFEval_coop,
- ray_state,
- Queue_data,
- Queue_index,
- dQueue_size);
-
- kernel_set_args(program_shadow_blocked(),
- 0,
- kgbuffer,
- d_data,
- PathState_coop,
- LightRay_coop,
- AOLightRay_coop,
- ray_state,
- Queue_data,
- Queue_index,
- dQueue_size);
-
- kernel_set_args(program_next_iteration_setup(),
- 0,
- kgbuffer,
- d_data,
- sd,
- rng_coop,
- throughput_coop,
- PathRadiance_coop,
- Ray_coop,
- PathState_coop,
- LightRay_coop,
- ISLamp_coop,
- BSDFEval_coop,
- AOLightRay_coop,
- AOBSDF_coop,
- AOAlpha_coop,
- ray_state,
- Queue_data,
- Queue_index,
- dQueue_size,
- use_queues_flag);
-
- kernel_set_args(program_sum_all_radiance(),
- 0,
- d_data,
- d_buffer,
- per_sample_output_buffers,
- num_parallel_samples,
- d_w,
- d_h,
- d_stride,
- rtile.buffer_offset_x,
- rtile.buffer_offset_y,
- rtile.buffer_rng_state_stride,
- start_sample);
-
- /* Macro for Enqueuing split kernels. */
-#define GLUE(a, b) a ## b
-#define ENQUEUE_SPLIT_KERNEL(kernelName, globalSize, localSize) \
- { \
- ciErr = clEnqueueNDRangeKernel(cqCommandQueue, \
- GLUE(program_, \
- kernelName)(), \
- 2, \
- NULL, \
- globalSize, \
- localSize, \
- 0, \
- NULL, \
- NULL); \
- opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); \
- if(ciErr != CL_SUCCESS) { \
- string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", \
- clewErrorString(ciErr)); \
- opencl_error(message); \
- return; \
- } \
- } (void) 0
-
- /* Enqueue ckPathTraceKernel_data_init kernel. */
- ENQUEUE_SPLIT_KERNEL(data_init, global_size, local_size);
- bool activeRaysAvailable = true;
-
- /* Record number of time host intervention has been made */
- unsigned int numHostIntervention = 0;
- unsigned int numNextPathIterTimes = PathIteration_times;
- bool canceled = false;
- while(activeRaysAvailable) {
- /* Twice the global work size of other kernels for
- * ckPathTraceKernel_shadow_blocked_direct_lighting. */
- size_t global_size_shadow_blocked[2];
- global_size_shadow_blocked[0] = global_size[0] * 2;
- global_size_shadow_blocked[1] = global_size[1];
-
- /* Do path-iteration in host [Enqueue Path-iteration kernels. */
- for(int PathIter = 0; PathIter < PathIteration_times; PathIter++) {
- ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(background_buffer_update, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(shadow_blocked, global_size_shadow_blocked, local_size);
- ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size);
-
- if(task->get_cancel()) {
- canceled = true;
- break;
- }
- }
+ /* Allocate buffer for kernel globals */
+ device_memory kgbuffer;
+ kgbuffer.resize(sizeof(KernelGlobals));
+ mem_alloc("kernel_globals", kgbuffer, MEM_READ_WRITE);
- /* Read ray-state into Host memory to decide if we should exit
- * path-iteration in host.
- */
- ciErr = clEnqueueReadBuffer(cqCommandQueue,
- ray_state,
- CL_TRUE,
- 0,
- global_size[0] * global_size[1] * sizeof(char),
- hostRayStateArray,
- 0,
- NULL,
- NULL);
- assert(ciErr == CL_SUCCESS);
-
- activeRaysAvailable = false;
-
- for(int rayStateIter = 0;
- rayStateIter < global_size[0] * global_size[1];
- ++rayStateIter)
- {
- if(int8_t(hostRayStateArray[rayStateIter]) != RAY_INACTIVE) {
- /* Not all rays are RAY_INACTIVE. */
- activeRaysAvailable = true;
- break;
- }
- }
+ /* Keep rendering tiles until done. */
+ while(task->acquire_tile(this, tile)) {
+ split_kernel->path_trace(task,
+ tile,
+ kgbuffer,
+ *const_mem_map["__data"]);
- if(activeRaysAvailable) {
- numHostIntervention++;
- PathIteration_times = PATH_ITER_INC_FACTOR;
- /* Host intervention done before all rays become RAY_INACTIVE;
- * Set do more initial iterations for the next tile.
+ /* Complete kernel execution before releasing the tile.
+ *
+ * This helps in multi-device render: the device that reaches the
+ * critical-section function release_tile waits (stalling other
+ * devices from entering release_tile) for all kernels to complete.
+ * If device1 (a slow-render device) reached release_tile first, it
+ * would stall device2 (a fast-render device) from proceeding to
+ * render the next tile.
*/
- numNextPathIterTimes += PATH_ITER_INC_FACTOR;
- }
+ clFinish(cqCommandQueue);
- if(task->get_cancel()) {
- canceled = true;
- break;
+ task->release_tile(tile);
}
- }
- /* Execute SumALLRadiance kernel to accumulate radiance calculated in
- * per_sample_output_buffers into RenderTile's output buffer.
- */
- if(!canceled) {
- size_t sum_all_radiance_local_size[2] = {16, 16};
- size_t sum_all_radiance_global_size[2];
- sum_all_radiance_global_size[0] =
- (((d_w - 1) / sum_all_radiance_local_size[0]) + 1) *
- sum_all_radiance_local_size[0];
- sum_all_radiance_global_size[1] =
- (((d_h - 1) / sum_all_radiance_local_size[1]) + 1) *
- sum_all_radiance_local_size[1];
- ENQUEUE_SPLIT_KERNEL(sum_all_radiance,
- sum_all_radiance_global_size,
- sum_all_radiance_local_size);
- }
-
-#undef ENQUEUE_SPLIT_KERNEL
-#undef GLUE
-
- if(numHostIntervention == 0) {
- /* This means that we are executing kernel more than required
- * Must avoid this for the next sample/tile.
- */
- PathIteration_times = ((numNextPathIterTimes - PATH_ITER_INC_FACTOR) <= 0) ?
- PATH_ITER_INC_FACTOR : numNextPathIterTimes - PATH_ITER_INC_FACTOR;
+ mem_free(kgbuffer);
}
- else {
- /* Number of path-iterations done for this tile is set as
- * Initial path-iteration times for the next tile
- */
- PathIteration_times = numNextPathIterTimes;
- }
-
- first_tile = false;
}
- /* Calculates the amount of memory that has to be always
- * allocated in order for the split kernel to function.
- * This memory is tile/scene-property invariant (meaning,
- * the value returned by this function does not depend
- * on the user set tile size or scene properties.
- */
- size_t get_invariable_mem_allocated()
- {
- size_t total_invariable_mem_allocated = 0;
- size_t KernelGlobals_size = 0;
-
- KernelGlobals_size = get_KernelGlobals_size();
-
- total_invariable_mem_allocated += KernelGlobals_size; /* KernelGlobals size */
- total_invariable_mem_allocated += NUM_QUEUES * sizeof(unsigned int); /* Queue index size */
- total_invariable_mem_allocated += sizeof(char); /* use_queues_flag size */
-
- return total_invariable_mem_allocated;
- }
+protected:
+ /* ** These are for working around some compiler-specific bugs ** */
- /* Calculate the memory that has-to-be/has-been allocated for
- * the split kernel to function.
- */
- size_t get_tile_specific_mem_allocated(const int2 tile_size)
+ string build_options_for_base_program(
+ const DeviceRequestedFeatures& requested_features)
{
- size_t tile_specific_mem_allocated = 0;
-
- /* Get required tile info */
- unsigned int user_set_tile_w = tile_size.x;
- unsigned int user_set_tile_h = tile_size.y;
-
-#ifdef __WORK_STEALING__
- /* Calculate memory to be allocated for work_pools in
- * case of work_stealing.
- */
- size_t max_global_size[2];
- size_t max_num_work_pools = 0;
- max_global_size[0] =
- (((user_set_tile_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
- SPLIT_KERNEL_LOCAL_SIZE_X;
- max_global_size[1] =
- (((user_set_tile_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
- SPLIT_KERNEL_LOCAL_SIZE_Y;
- max_num_work_pools =
- (max_global_size[0] * max_global_size[1]) /
- (SPLIT_KERNEL_LOCAL_SIZE_X * SPLIT_KERNEL_LOCAL_SIZE_Y);
- tile_specific_mem_allocated += max_num_work_pools * sizeof(unsigned int);
-#endif
-
- tile_specific_mem_allocated +=
- user_set_tile_w * user_set_tile_h * per_thread_output_buffer_size;
- tile_specific_mem_allocated +=
- user_set_tile_w * user_set_tile_h * sizeof(RNG);
-
- return tile_specific_mem_allocated;
+ return requested_features.get_build_options();
}
- /* Calculates the texture memories and KernelData (d_data) memory
- * that has been allocated.
- */
- size_t get_scene_specific_mem_allocated(cl_mem d_data)
- {
- size_t scene_specific_mem_allocated = 0;
- /* Calculate texture memories. */
-#define KERNEL_TEX(type, ttype, name) \
- scene_specific_mem_allocated += get_tex_size(#name);
-#include "kernel_textures.h"
-#undef KERNEL_TEX
- size_t d_data_size;
- ciErr = clGetMemObjectInfo(d_data,
- CL_MEM_SIZE,
- sizeof(d_data_size),
- &d_data_size,
- NULL);
- assert(ciErr == CL_SUCCESS && "Can't get d_data mem object info");
- scene_specific_mem_allocated += d_data_size;
- return scene_specific_mem_allocated;
- }
+ friend class OpenCLSplitKernel;
+ friend class OpenCLSplitKernelFunction;
+};
- /* Calculate the memory required for one thread in split kernel. */
- size_t get_per_thread_memory()
- {
- size_t shaderdata_size = 0;
- /* TODO(sergey): This will actually over-allocate if
- * particular kernel does not support multiclosure.
- */
- shaderdata_size = get_shader_data_size(current_max_closure);
- size_t retval = sizeof(RNG)
- + sizeof(float3) /* Throughput size */
- + sizeof(float) /* L transparent size */
- + sizeof(char) /* Ray state size */
- + sizeof(unsigned int) /* Work element size */
- + sizeof(int) /* ISLamp_size */
- + sizeof(PathRadiance) + sizeof(Ray) + sizeof(PathState)
- + sizeof(Intersection) /* Overall isect */
- + sizeof(Intersection) /* Instersection_coop_AO */
- + sizeof(Intersection) /* Intersection coop DL */
- + shaderdata_size /* Overall ShaderData */
- + (shaderdata_size * 2) /* ShaderData : DL and shadow */
- + sizeof(Ray) + sizeof(BsdfEval)
- + sizeof(float3) /* AOAlpha size */
- + sizeof(float3) /* AOBSDF size */
- + sizeof(Ray)
- + (sizeof(int) * NUM_QUEUES)
- + per_thread_output_buffer_size;
- return retval;
- }
+class OpenCLSplitKernelFunction : public SplitKernelFunction {
+public:
+ OpenCLDeviceSplitKernel* device;
+ OpenCLDeviceBase::OpenCLProgram program;
- /* Considers the total memory available in the device and
- * and returns the maximum global work size possible.
- */
- size_t get_feasible_global_work_size(int2 tile_size, cl_mem d_data)
- {
- /* Calculate invariably allocated memory. */
- size_t invariable_mem_allocated = get_invariable_mem_allocated();
- /* Calculate tile specific allocated memory. */
- size_t tile_specific_mem_allocated =
- get_tile_specific_mem_allocated(tile_size);
- /* Calculate scene specific allocated memory. */
- size_t scene_specific_mem_allocated =
- get_scene_specific_mem_allocated(d_data);
- /* Calculate total memory available for the threads in global work size. */
- size_t available_memory = total_allocatable_memory
- - invariable_mem_allocated
- - tile_specific_mem_allocated
- - scene_specific_mem_allocated
- - DATA_ALLOCATION_MEM_FACTOR;
- size_t per_thread_memory_required = get_per_thread_memory();
- return (available_memory / per_thread_memory_required);
- }
+ OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device) : device(device) {}
+ ~OpenCLSplitKernelFunction() { program.release(); }
- /* Checks if the device has enough memory to render the whole tile;
- * If not, we should split single tile into multiple tiles of small size
- * and process them all.
- */
- bool need_to_split_tile(unsigned int d_w,
- unsigned int d_h,
- int2 max_render_feasible_tile_size)
+ virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data)
{
- size_t global_size_estimate[2];
- /* TODO(sergey): Such round-ups are in quite few places, need to replace
- * them with an utility macro.
- */
- global_size_estimate[0] =
- (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
- SPLIT_KERNEL_LOCAL_SIZE_X;
- global_size_estimate[1] =
- (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
- SPLIT_KERNEL_LOCAL_SIZE_Y;
- if((global_size_estimate[0] * global_size_estimate[1]) >
- (max_render_feasible_tile_size.x * max_render_feasible_tile_size.y))
- {
- return true;
- }
- else {
+ device->kernel_set_args(program(), 0, kg, data);
+
+ device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
+ program(),
+ 2,
+ NULL,
+ dim.global_size,
+ dim.local_size,
+ 0,
+ NULL,
+ NULL);
+
+ device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
+
+ if(device->ciErr != CL_SUCCESS) {
+ string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
+ clewErrorString(device->ciErr));
+ device->opencl_error(message);
return false;
}
+
+ return true;
}
+};
- /* Considers the scene properties, global memory available in the device
- * and returns a rectanglular tile dimension (approx the maximum)
- * that should render on split kernel.
- */
- int2 get_max_render_feasible_tile_size(size_t feasible_global_work_size)
- {
- int2 max_render_feasible_tile_size;
- int square_root_val = (int)sqrt(feasible_global_work_size);
- max_render_feasible_tile_size.x = square_root_val;
- max_render_feasible_tile_size.y = square_root_val;
- /* Ciel round-off max_render_feasible_tile_size. */
- int2 ceil_render_feasible_tile_size;
- ceil_render_feasible_tile_size.x =
- (((max_render_feasible_tile_size.x - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
- SPLIT_KERNEL_LOCAL_SIZE_X;
- ceil_render_feasible_tile_size.y =
- (((max_render_feasible_tile_size.y - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
- SPLIT_KERNEL_LOCAL_SIZE_Y;
- if(ceil_render_feasible_tile_size.x * ceil_render_feasible_tile_size.y <=
- feasible_global_work_size)
- {
- return ceil_render_feasible_tile_size;
- }
- /* Floor round-off max_render_feasible_tile_size. */
- int2 floor_render_feasible_tile_size;
- floor_render_feasible_tile_size.x =
- (max_render_feasible_tile_size.x / SPLIT_KERNEL_LOCAL_SIZE_X) *
- SPLIT_KERNEL_LOCAL_SIZE_X;
- floor_render_feasible_tile_size.y =
- (max_render_feasible_tile_size.y / SPLIT_KERNEL_LOCAL_SIZE_Y) *
- SPLIT_KERNEL_LOCAL_SIZE_Y;
- return floor_render_feasible_tile_size;
+class OpenCLSplitKernel : public DeviceSplitKernel {
+ OpenCLDeviceSplitKernel *device;
+public:
+ explicit OpenCLSplitKernel(OpenCLDeviceSplitKernel *device) : DeviceSplitKernel(device), device(device) {
}
- /* Try splitting the current tile into multiple smaller
- * almost-square-tiles.
- */
- int2 get_split_tile_size(RenderTile rtile,
- int2 max_render_feasible_tile_size)
+ virtual SplitKernelFunction* get_split_kernel_function(string kernel_name,
+ const DeviceRequestedFeatures& requested_features)
{
- int2 split_tile_size;
- int num_global_threads = max_render_feasible_tile_size.x *
- max_render_feasible_tile_size.y;
- int d_w = rtile.w;
- int d_h = rtile.h;
- /* Ceil round off d_w and d_h */
- d_w = (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
- SPLIT_KERNEL_LOCAL_SIZE_X;
- d_h = (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
- SPLIT_KERNEL_LOCAL_SIZE_Y;
- while(d_w * d_h > num_global_threads) {
- /* Halve the longer dimension. */
- if(d_w >= d_h) {
- d_w = d_w / 2;
- d_w = (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
- SPLIT_KERNEL_LOCAL_SIZE_X;
- }
- else {
- d_h = d_h / 2;
- d_h = (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
- SPLIT_KERNEL_LOCAL_SIZE_Y;
- }
+ OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device);
+
+ kernel->program = OpenCLDeviceBase::OpenCLProgram(device,
+ "split_" + kernel_name,
+ "kernel_" + kernel_name + ".cl",
+ get_build_options(device, requested_features));
+ kernel->program.add_kernel(ustring("path_trace_" + kernel_name));
+ kernel->program.load();
+
+ if(!kernel->program.is_loaded()) {
+ delete kernel;
+ return NULL;
}
- split_tile_size.x = d_w;
- split_tile_size.y = d_h;
- return split_tile_size;
+
+ return kernel;
}
- /* Splits existing tile into multiple tiles of tile size split_tile_size. */
- vector<SplitRenderTile> split_tiles(RenderTile rtile, int2 split_tile_size)
+ virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads)
{
- vector<SplitRenderTile> to_path_trace_rtile;
- int d_w = rtile.w;
- int d_h = rtile.h;
- int num_tiles_x = (((d_w - 1) / split_tile_size.x) + 1);
- int num_tiles_y = (((d_h - 1) / split_tile_size.y) + 1);
- /* Buffer and rng_state offset calc. */
- size_t offset_index = rtile.offset + (rtile.x + rtile.y * rtile.stride);
- size_t offset_x = offset_index % rtile.stride;
- size_t offset_y = offset_index / rtile.stride;
- /* Resize to_path_trace_rtile. */
- to_path_trace_rtile.resize(num_tiles_x * num_tiles_y);
- for(int tile_iter_y = 0; tile_iter_y < num_tiles_y; tile_iter_y++) {
- for(int tile_iter_x = 0; tile_iter_x < num_tiles_x; tile_iter_x++) {
- int rtile_index = tile_iter_y * num_tiles_x + tile_iter_x;
- to_path_trace_rtile[rtile_index].rng_state_offset_x = offset_x + tile_iter_x * split_tile_size.x;
- to_path_trace_rtile[rtile_index].rng_state_offset_y = offset_y + tile_iter_y * split_tile_size.y;
- to_path_trace_rtile[rtile_index].buffer_offset_x = offset_x + tile_iter_x * split_tile_size.x;
- to_path_trace_rtile[rtile_index].buffer_offset_y = offset_y + tile_iter_y * split_tile_size.y;
- to_path_trace_rtile[rtile_index].start_sample = rtile.start_sample;
- to_path_trace_rtile[rtile_index].num_samples = rtile.num_samples;
- to_path_trace_rtile[rtile_index].sample = rtile.sample;
- to_path_trace_rtile[rtile_index].resolution = rtile.resolution;
- to_path_trace_rtile[rtile_index].offset = rtile.offset;
- to_path_trace_rtile[rtile_index].buffers = rtile.buffers;
- to_path_trace_rtile[rtile_index].buffer = rtile.buffer;
- to_path_trace_rtile[rtile_index].rng_state = rtile.rng_state;
- to_path_trace_rtile[rtile_index].x = rtile.x + (tile_iter_x * split_tile_size.x);
- to_path_trace_rtile[rtile_index].y = rtile.y + (tile_iter_y * split_tile_size.y);
- to_path_trace_rtile[rtile_index].buffer_rng_state_stride = rtile.stride;
- /* Fill width and height of the new render tile. */
- to_path_trace_rtile[rtile_index].w = (tile_iter_x == (num_tiles_x - 1)) ?
- (d_w - (tile_iter_x * split_tile_size.x)) /* Border tile */
- : split_tile_size.x;
- to_path_trace_rtile[rtile_index].h = (tile_iter_y == (num_tiles_y - 1)) ?
- (d_h - (tile_iter_y * split_tile_size.y)) /* Border tile */
- : split_tile_size.y;
- to_path_trace_rtile[rtile_index].stride = to_path_trace_rtile[rtile_index].w;
- }
+ device_vector<uint> size_buffer;
+ size_buffer.resize(1);
+ device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE);
+
+ uint threads = num_threads;
+ device->kernel_set_args(device->program_state_buffer_size(), 0, kg, data, threads, size_buffer);
+
+ size_t global_size = 64;
+ device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
+ device->program_state_buffer_size(),
+ 1,
+ NULL,
+ &global_size,
+ NULL,
+ 0,
+ NULL,
+ NULL);
+
+ device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
+
+ device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint));
+ device->mem_free(size_buffer);
+
+ if(device->ciErr != CL_SUCCESS) {
+ string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
+ clewErrorString(device->ciErr));
+ device->opencl_error(message);
+ return 0;
}
- return to_path_trace_rtile;
+
+ return *size_buffer.get_data();
}
- void thread_run(DeviceTask *task)
+ virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
+ RenderTile& rtile,
+ int num_global_elements,
+ device_memory& kernel_globals,
+ device_memory& kernel_data,
+ device_memory& split_data,
+ device_memory& ray_state,
+ device_memory& queue_index,
+ device_memory& use_queues_flag,
+ device_memory& work_pool_wgs)
{
- if(task->type == DeviceTask::FILM_CONVERT) {
- film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
- }
- else if(task->type == DeviceTask::SHADER) {
- shader(*task);
- }
- else if(task->type == DeviceTask::PATH_TRACE) {
- RenderTile tile;
- bool initialize_data_and_check_render_feasibility = false;
- bool need_to_split_tiles_further = false;
- int2 max_render_feasible_tile_size;
- size_t feasible_global_work_size;
- const int2 tile_size = task->requested_tile_size;
- /* Keep rendering tiles until done. */
- while(task->acquire_tile(this, tile)) {
- if(!initialize_data_and_check_render_feasibility) {
- /* Initialize data. */
- /* Calculate per_thread_output_buffer_size. */
- size_t output_buffer_size = 0;
- ciErr = clGetMemObjectInfo((cl_mem)tile.buffer,
- CL_MEM_SIZE,
- sizeof(output_buffer_size),
- &output_buffer_size,
- NULL);
- assert(ciErr == CL_SUCCESS && "Can't get tile.buffer mem object info");
- /* This value is different when running on AMD and NV. */
- if(background) {
- /* In offline render the number of buffer elements
- * associated with tile.buffer is the current tile size.
- */
- per_thread_output_buffer_size =
- output_buffer_size / (tile.w * tile.h);
- }
- else {
- /* interactive rendering, unlike offline render, the number of buffer elements
- * associated with tile.buffer is the entire viewport size.
- */
- per_thread_output_buffer_size =
- output_buffer_size / (tile.buffers->params.width *
- tile.buffers->params.height);
- }
- /* Check render feasibility. */
- feasible_global_work_size = get_feasible_global_work_size(
- tile_size,
- CL_MEM_PTR(const_mem_map["__data"]->device_pointer));
- max_render_feasible_tile_size =
- get_max_render_feasible_tile_size(
- feasible_global_work_size);
- need_to_split_tiles_further =
- need_to_split_tile(tile_size.x,
- tile_size.y,
- max_render_feasible_tile_size);
- initialize_data_and_check_render_feasibility = true;
- }
- if(need_to_split_tiles_further) {
- int2 split_tile_size =
- get_split_tile_size(tile,
- max_render_feasible_tile_size);
- vector<SplitRenderTile> to_path_trace_render_tiles =
- split_tiles(tile, split_tile_size);
- /* Print message to console */
- if(background && (to_path_trace_render_tiles.size() > 1)) {
- fprintf(stderr, "Message : Tiles need to be split "
- "further inside path trace (due to insufficient "
- "device-global-memory for split kernel to "
- "function) \n"
- "The current tile of dimensions %dx%d is split "
- "into tiles of dimension %dx%d for render \n",
- tile.w, tile.h,
- split_tile_size.x,
- split_tile_size.y);
- }
- /* Process all split tiles. */
- for(int tile_iter = 0;
- tile_iter < to_path_trace_render_tiles.size();
- ++tile_iter)
- {
- path_trace(task,
- to_path_trace_render_tiles[tile_iter],
- max_render_feasible_tile_size);
- }
- }
- else {
- /* No splitting required; process the entire tile at once. */
- /* Render feasible tile size is user-set-tile-size itself. */
- max_render_feasible_tile_size.x =
- (((tile_size.x - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
- SPLIT_KERNEL_LOCAL_SIZE_X;
- max_render_feasible_tile_size.y =
- (((tile_size.y - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
- SPLIT_KERNEL_LOCAL_SIZE_Y;
- /* buffer_rng_state_stride is stride itself. */
- SplitRenderTile split_tile(tile);
- split_tile.buffer_rng_state_stride = tile.stride;
- path_trace(task, split_tile, max_render_feasible_tile_size);
- }
- tile.sample = tile.start_sample + tile.num_samples;
+ cl_int dQueue_size = dim.global_size[0] * dim.global_size[1];
- /* Complete kernel execution before release tile. */
- /* This helps in multi-device render;
- * The device that reaches the critical-section function
- * release_tile waits (stalling other devices from entering
- * release_tile) for all kernels to complete. If device1 (a
- * slow-render device) reaches release_tile first then it would
- * stall device2 (a fast-render device) from proceeding to render
- * next tile.
- */
- clFinish(cqCommandQueue);
+ /* Set the range of samples to be processed for every ray in
+ * path-regeneration logic.
+ */
+ cl_int start_sample = rtile.start_sample;
+ cl_int end_sample = rtile.start_sample + rtile.num_samples;
- task->release_tile(tile);
- }
+ cl_uint start_arg_index =
+ device->kernel_set_args(device->program_data_init(),
+ 0,
+ kernel_globals,
+ kernel_data,
+ split_data,
+ num_global_elements,
+ ray_state,
+ rtile.rng_state);
+
+/* TODO(sergey): Avoid map lookup here. */
+#define KERNEL_TEX(type, ttype, name) \
+ device->set_kernel_arg_mem(device->program_data_init(), &start_arg_index, #name);
+#include "kernel_textures.h"
+#undef KERNEL_TEX
+
+ start_arg_index +=
+ device->kernel_set_args(device->program_data_init(),
+ start_arg_index,
+ start_sample,
+ end_sample,
+ rtile.x,
+ rtile.y,
+ rtile.w,
+ rtile.h,
+ rtile.offset,
+ rtile.stride,
+ queue_index,
+ dQueue_size,
+ use_queues_flag,
+ work_pool_wgs,
+ rtile.num_samples,
+ rtile.buffer);
+
+ /* Enqueue the data_init kernel. */
+ device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
+ device->program_data_init(),
+ 2,
+ NULL,
+ dim.global_size,
+ dim.local_size,
+ 0,
+ NULL,
+ NULL);
+
+ device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
+
+ if(device->ciErr != CL_SUCCESS) {
+ string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
+ clewErrorString(device->ciErr));
+ device->opencl_error(message);
+ return false;
}
+
+ return true;
}
-protected:
- cl_mem mem_alloc(size_t bufsize, cl_mem_flags mem_flag = CL_MEM_READ_WRITE)
+ virtual int2 split_kernel_local_size()
{
- cl_mem ptr;
- assert(bufsize != 0);
- ptr = clCreateBuffer(cxContext, mem_flag, bufsize, NULL, &ciErr);
- opencl_assert_err(ciErr, "clCreateBuffer");
- return ptr;
+ return make_int2(64, 1);
}
- /* ** Those guys are for workign around some compiler-specific bugs ** */
-
- string build_options_for_base_program(
- const DeviceRequestedFeatures& requested_features)
+ virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask */*task*/)
{
- return requested_features.get_build_options();
+ size_t max_buffer_size;
+ clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &max_buffer_size, NULL);
+ VLOG(1) << "Maximum device allocation side: "
+ << string_human_readable_number(max_buffer_size) << " bytes. ("
+ << string_human_readable_size(max_buffer_size) << ").";
+
+ size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size / 2);
+ int2 global_size = make_int2(round_down((int)sqrt(num_elements), 64), (int)sqrt(num_elements));
+ VLOG(1) << "Global size: " << global_size << ".";
+ return global_size;
}
};
+OpenCLDeviceSplitKernel::OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_)
+: OpenCLDeviceBase(info, stats, background_)
+{
+ split_kernel = new OpenCLSplitKernel(this);
+
+ background = background_;
+}
+
Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, bool background)
{
return new OpenCLDeviceSplitKernel(info, stats, background);
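
To make the sizing in split_kernel_global_size() concrete, here is a worked example with assumed numbers (a real device will differ): suppose CL_DEVICE_MAX_MEM_ALLOC_SIZE reports 512 MiB and the split state costs 1 KiB per thread. Half the allocation limit is budgeted for state, so:

size_t max_buffer_size = 512ull * 1024 * 1024;  /* from clGetDeviceInfo */
size_t state_per_thread = 1024;                 /* assumed; queried at runtime */
size_t num_elements = (max_buffer_size / 2) / state_per_thread;  /* 262144 */
int side = (int)sqrt((double)num_elements);                      /* 512 */
int2 global_size = make_int2(round_down(side, 64), side);        /* 512 x 512 */

Roughly 262k rays stay in flight, with the x dimension rounded down to a multiple of the 64-wide work-group from split_kernel_local_size(). (The real max_elements_for_max_buffer_size() derives the per-thread figure from the state_buffer_size kernel rather than a constant, which this sketch glosses over.)
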
diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp
index c7760e075cb..d5c19bf5386 100644
--- a/intern/cycles/device/opencl/opencl_util.cpp
+++ b/intern/cycles/device/opencl/opencl_util.cpp
@@ -19,6 +19,7 @@
#include "opencl.h"
#include "util_logging.h"
+#include "util_md5.h"
#include "util_path.h"
#include "util_time.h"
@@ -338,12 +339,13 @@ bool OpenCLDeviceBase::OpenCLProgram::build_kernel(const string *debug_src)
bool OpenCLDeviceBase::OpenCLProgram::compile_kernel(const string *debug_src)
{
- string source = "#include \"kernels/opencl/" + kernel_file + "\" // " + OpenCLCache::get_kernel_md5() + "\n";
+ string source = "#include \"kernels/opencl/" + kernel_file + "\"\n";
 /* We compile kernels consisting of many files. Unfortunately, OpenCL
 * kernel caches do not seem to recognize changes in included files,
 * so we force a recompile on changes by adding the MD5 hash of all files.
 */
source = path_source_replace_includes(source, path_get("kernel"));
+ source += "\n// " + util_md5_string(source) + "\n";
if(debug_src) {
path_write_text(*debug_src, source);
@@ -440,7 +442,11 @@ void OpenCLDeviceBase::OpenCLProgram::load()
if(!program) {
add_log(string("OpenCL program ") + program_name + " not found in cache.", true);
- string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + OpenCLCache::get_kernel_md5();
+ /* Need to create the source to compute its MD5 hash. */
+ string source = "#include \"kernels/opencl/" + kernel_file + "\"\n";
+ source = path_source_replace_includes(source, path_get("kernel"));
+
+ string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + util_md5_string(source);
basename = path_cache_get(path_join("kernels", basename));
string clbin = basename + ".clbin";
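
The effect of hashing the fully-expanded source is that the cache key tracks every included file. Schematically (names as in the code above):

string source = "#include \"kernels/opencl/" + kernel_file + "\"\n";
source = path_source_replace_includes(source, path_get("kernel"));  /* inlines all includes */
string hash = util_md5_string(source);  /* changes whenever any included header changes */
/* The hash feeds both the trailing source comment and the .clbin basename,
 * so editing any kernel header forces a recompile instead of loading a
 * stale cached binary. */
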
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 29e0f44841e..1c740b5c6eb 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -13,8 +13,11 @@ set(INC_SYS
set(SRC
kernels/cpu/kernel.cpp
+ kernels/cpu/kernel_split.cpp
kernels/opencl/kernel.cl
+ kernels/opencl/kernel_state_buffer_size.cl
kernels/opencl/kernel_data_init.cl
+ kernels/opencl/kernel_path_init.cl
kernels/opencl/kernel_queue_enqueue.cl
kernels/opencl/kernel_scene_intersect.cl
kernels/opencl/kernel_lamp_emission.cl
@@ -24,8 +27,8 @@ set(SRC
kernels/opencl/kernel_direct_lighting.cl
kernels/opencl/kernel_shadow_blocked.cl
kernels/opencl/kernel_next_iteration_setup.cl
- kernels/opencl/kernel_sum_all_radiance.cl
kernels/cuda/kernel.cu
+ kernels/cuda/kernel_split.cu
)
set(SRC_BVH_HEADERS
@@ -88,6 +91,10 @@ set(SRC_KERNELS_CPU_HEADERS
kernels/cpu/kernel_cpu_image.h
)
+set(SRC_KERNELS_CUDA_HEADERS
+ kernels/cuda/kernel_config.h
+)
+
set(SRC_CLOSURE_HEADERS
closure/alloc.h
closure/bsdf.h
@@ -195,11 +202,14 @@ set(SRC_SPLIT_HEADERS
split/kernel_holdout_emission_blurring_pathtermination_ao.h
split/kernel_lamp_emission.h
split/kernel_next_iteration_setup.h
+ split/kernel_path_init.h
+ split/kernel_queue_enqueue.h
split/kernel_scene_intersect.h
split/kernel_shader_eval.h
split/kernel_shadow_blocked.h
split/kernel_split_common.h
- split/kernel_sum_all_radiance.h
+ split/kernel_split_data.h
+ split/kernel_split_data_types.h
)
# CUDA module
@@ -227,8 +237,9 @@ if(WITH_CYCLES_CUDA_BINARIES)
endif()
# build for each arch
- set(cuda_sources kernels/cuda/kernel.cu
+ set(cuda_sources kernels/cuda/kernel.cu kernels/cuda/kernel_split.cu
${SRC_HEADERS}
+ ${SRC_KERNELS_CUDA_HEADERS}
${SRC_BVH_HEADERS}
${SRC_SVM_HEADERS}
${SRC_GEOM_HEADERS}
@@ -237,15 +248,22 @@ if(WITH_CYCLES_CUDA_BINARIES)
)
set(cuda_cubins)
- macro(CYCLES_CUDA_KERNEL_ADD arch experimental)
- if(${experimental})
- set(cuda_extra_flags "-D__KERNEL_EXPERIMENTAL__")
- set(cuda_cubin kernel_experimental_${arch}.cubin)
+ macro(CYCLES_CUDA_KERNEL_ADD arch split experimental)
+ if(${split})
+ set(cuda_extra_flags "-D__SPLIT__")
+ set(cuda_cubin kernel_split)
else()
set(cuda_extra_flags "")
- set(cuda_cubin kernel_${arch}.cubin)
+ set(cuda_cubin kernel)
+ endif()
+
+ if(${experimental})
+ set(cuda_extra_flags ${cuda_extra_flags} -D__KERNEL_EXPERIMENTAL__)
+ set(cuda_cubin ${cuda_cubin}_experimental)
endif()
+ set(cuda_cubin ${cuda_cubin}_${arch}.cubin)
+
if(WITH_CYCLES_DEBUG)
set(cuda_debug_flags "-D__KERNEL_DEBUG__")
else()
@@ -258,13 +276,19 @@ if(WITH_CYCLES_CUDA_BINARIES)
set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}")
set(cuda_math_flags "--use_fast_math")
+ if(split)
+ set(cuda_kernel_src "/kernels/cuda/kernel_split.cu")
+ else()
+ set(cuda_kernel_src "/kernels/cuda/kernel.cu")
+ endif()
+
add_custom_command(
OUTPUT ${cuda_cubin}
COMMAND ${cuda_nvcc_command}
-arch=${arch}
${CUDA_NVCC_FLAGS}
-m${CUDA_BITS}
- --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda/kernel.cu
+ --cubin ${CMAKE_CURRENT_SOURCE_DIR}${cuda_kernel_src}
-o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin}
--ptxas-options="-v"
${cuda_arch_flags}
@@ -291,7 +315,12 @@ if(WITH_CYCLES_CUDA_BINARIES)
foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
# Compile regular kernel
- CYCLES_CUDA_KERNEL_ADD(${arch} FALSE)
+ CYCLES_CUDA_KERNEL_ADD(${arch} FALSE FALSE)
+
+ if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES)
+ # Compile split kernel
+ CYCLES_CUDA_KERNEL_ADD(${arch} TRUE FALSE)
+ endif()
endforeach()
add_custom_target(cycles_kernel_cuda ALL DEPENDS ${cuda_cubins})
@@ -314,31 +343,42 @@ if(CXX_HAS_SSE)
kernels/cpu/kernel_sse2.cpp
kernels/cpu/kernel_sse3.cpp
kernels/cpu/kernel_sse41.cpp
+ kernels/cpu/kernel_split_sse2.cpp
+ kernels/cpu/kernel_split_sse3.cpp
+ kernels/cpu/kernel_split_sse41.cpp
)
set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
+ set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
+ set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
+ set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX)
list(APPEND SRC
kernels/cpu/kernel_avx.cpp
+ kernels/cpu/kernel_split_avx.cpp
)
set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
+ set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX2)
list(APPEND SRC
kernels/cpu/kernel_avx2.cpp
+ kernels/cpu/kernel_split_avx2.cpp
)
set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+ set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
endif()
add_library(cycles_kernel
${SRC}
${SRC_HEADERS}
${SRC_KERNELS_CPU_HEADERS}
+ ${SRC_KERNELS_CUDA_HEADERS}
${SRC_BVH_HEADERS}
${SRC_CLOSURE_HEADERS}
${SRC_SVM_HEADERS}
@@ -361,7 +401,9 @@ endif()
#delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${KERNEL_PREPROCESSED}" ${CYCLES_INSTALL_PATH}/kernel)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_state_buffer_size.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_data_init.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_path_init.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_queue_enqueue.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_scene_intersect.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_lamp_emission.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
@@ -371,9 +413,10 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_holdout_emiss
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_sum_all_radiance.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/bvh)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/closure)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/svm)
diff --git a/intern/cycles/kernel/closure/alloc.h b/intern/cycles/kernel/closure/alloc.h
index b7abc1ec507..4894ea58dba 100644
--- a/intern/cycles/kernel/closure/alloc.h
+++ b/intern/cycles/kernel/closure/alloc.h
@@ -20,17 +20,17 @@ ccl_device ShaderClosure *closure_alloc(ShaderData *sd, int size, ClosureType ty
{
kernel_assert(size <= sizeof(ShaderClosure));
- int num_closure = ccl_fetch(sd, num_closure);
- int num_closure_extra = ccl_fetch(sd, num_closure_extra);
+ int num_closure = sd->num_closure;
+ int num_closure_extra = sd->num_closure_extra;
if(num_closure + num_closure_extra >= MAX_CLOSURE)
return NULL;
- ShaderClosure *sc = &ccl_fetch(sd, closure)[num_closure];
+ ShaderClosure *sc = &sd->closure[num_closure];
sc->type = type;
sc->weight = weight;
- ccl_fetch(sd, num_closure)++;
+ sd->num_closure++;
return sc;
}
@@ -44,18 +44,18 @@ ccl_device ccl_addr_space void *closure_alloc_extra(ShaderData *sd, int size)
* This lets us keep the same fast array iteration over closures, as we
* found linked list iteration and iteration with skipping to be slower. */
int num_extra = ((size + sizeof(ShaderClosure) - 1) / sizeof(ShaderClosure));
- int num_closure = ccl_fetch(sd, num_closure);
- int num_closure_extra = ccl_fetch(sd, num_closure_extra) + num_extra;
+ int num_closure = sd->num_closure;
+ int num_closure_extra = sd->num_closure_extra + num_extra;
if(num_closure + num_closure_extra > MAX_CLOSURE) {
/* Remove previous closure. */
- ccl_fetch(sd, num_closure)--;
- ccl_fetch(sd, num_closure_extra)++;
+ sd->num_closure--;
+ sd->num_closure_extra++;
return NULL;
}
- ccl_fetch(sd, num_closure_extra) = num_closure_extra;
- return (ccl_addr_space void*)(ccl_fetch(sd, closure) + MAX_CLOSURE - num_closure_extra);
+ sd->num_closure_extra = num_closure_extra;
+ return (ccl_addr_space void*)(sd->closure + MAX_CLOSURE - num_closure_extra);
}
ccl_device_inline ShaderClosure *bsdf_alloc(ShaderData *sd, int size, float3 weight)
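
For the arithmetic in closure_alloc_extra above: extra closure data is carved out of the same fixed ShaderClosure array, growing from the back in whole-slot units via ceiling division. A toy example, with a hypothetical 80-byte slot size:

    #include <cstdio>

    #define MAX_CLOSURE 64

    struct ShaderClosure { char pad[80]; };  /* hypothetical slot size */

    int main()
    {
        int size = 200;  /* bytes of extra data requested */
        int num_extra = (size + (int)sizeof(ShaderClosure) - 1) / (int)sizeof(ShaderClosure);
        /* 200 bytes need 3 slots of 80; they are taken from the array's tail,
         * so closures grow from the front and extras from the back. */
        printf("slots: %d, first extra index: %d\n", num_extra, MAX_CLOSURE - num_extra);
        return 0;
    }
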
diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h
index 7e4d5fe2e37..a44b9e2d9b9 100644
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -51,89 +51,89 @@ ccl_device_forceinline int bsdf_sample(KernelGlobals *kg,
switch(sc->type) {
case CLOSURE_BSDF_DIFFUSE_ID:
case CLOSURE_BSDF_BSSRDF_ID:
- label = bsdf_diffuse_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+ label = bsdf_diffuse_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
#ifdef __SVM__
case CLOSURE_BSDF_OREN_NAYAR_ID:
- label = bsdf_oren_nayar_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+ label = bsdf_oren_nayar_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
#ifdef __OSL__
case CLOSURE_BSDF_PHONG_RAMP_ID:
- label = bsdf_phong_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+ label = bsdf_phong_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
case CLOSURE_BSDF_DIFFUSE_RAMP_ID:
- label = bsdf_diffuse_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+ label = bsdf_diffuse_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
#endif
case CLOSURE_BSDF_TRANSLUCENT_ID:
- label = bsdf_translucent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+ label = bsdf_translucent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
case CLOSURE_BSDF_REFLECTION_ID:
- label = bsdf_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+ label = bsdf_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
case CLOSURE_BSDF_REFRACTION_ID:
- label = bsdf_refraction_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+ label = bsdf_refraction_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
case CLOSURE_BSDF_TRANSPARENT_ID:
- label = bsdf_transparent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+ label = bsdf_transparent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
case CLOSURE_BSDF_MICROFACET_GGX_ID:
case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
- label = bsdf_microfacet_ggx_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+ label = bsdf_microfacet_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
- label = bsdf_microfacet_multi_ggx_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
- eval, omega_in, &domega_in->dx, &domega_in->dy, pdf, &ccl_fetch(sd, lcg_state));
+ label = bsdf_microfacet_multi_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+ eval, omega_in, &domega_in->dx, &domega_in->dy, pdf, &sd->lcg_state);
break;
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
- label = bsdf_microfacet_multi_ggx_glass_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
- eval, omega_in, &domega_in->dx, &domega_in->dy, pdf, &ccl_fetch(sd, lcg_state));
+ label = bsdf_microfacet_multi_ggx_glass_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+ eval, omega_in, &domega_in->dx, &domega_in->dy, pdf, &sd->lcg_state);
break;
case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
- label = bsdf_microfacet_beckmann_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+ label = bsdf_microfacet_beckmann_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
- label = bsdf_ashikhmin_shirley_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+ label = bsdf_ashikhmin_shirley_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
- label = bsdf_ashikhmin_velvet_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+ label = bsdf_ashikhmin_velvet_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
case CLOSURE_BSDF_DIFFUSE_TOON_ID:
- label = bsdf_diffuse_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+ label = bsdf_diffuse_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
case CLOSURE_BSDF_GLOSSY_TOON_ID:
- label = bsdf_glossy_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+ label = bsdf_glossy_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
case CLOSURE_BSDF_HAIR_REFLECTION_ID:
- label = bsdf_hair_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+ label = bsdf_hair_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
- label = bsdf_hair_transmission_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+ label = bsdf_hair_transmission_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
#endif
#ifdef __VOLUME__
case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
- label = volume_henyey_greenstein_sample(sc, ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
+ label = volume_henyey_greenstein_sample(sc, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
#endif
default:
@@ -157,75 +157,75 @@ float3 bsdf_eval(KernelGlobals *kg,
{
float3 eval;
- if(dot(ccl_fetch(sd, Ng), omega_in) >= 0.0f) {
+ if(dot(sd->Ng, omega_in) >= 0.0f) {
switch(sc->type) {
case CLOSURE_BSDF_DIFFUSE_ID:
case CLOSURE_BSDF_BSSRDF_ID:
- eval = bsdf_diffuse_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+ eval = bsdf_diffuse_eval_reflect(sc, sd->I, omega_in, pdf);
break;
#ifdef __SVM__
case CLOSURE_BSDF_OREN_NAYAR_ID:
- eval = bsdf_oren_nayar_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+ eval = bsdf_oren_nayar_eval_reflect(sc, sd->I, omega_in, pdf);
break;
#ifdef __OSL__
case CLOSURE_BSDF_PHONG_RAMP_ID:
- eval = bsdf_phong_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+ eval = bsdf_phong_ramp_eval_reflect(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_DIFFUSE_RAMP_ID:
- eval = bsdf_diffuse_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+ eval = bsdf_diffuse_ramp_eval_reflect(sc, sd->I, omega_in, pdf);
break;
#endif
case CLOSURE_BSDF_TRANSLUCENT_ID:
- eval = bsdf_translucent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+ eval = bsdf_translucent_eval_reflect(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_REFLECTION_ID:
- eval = bsdf_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+ eval = bsdf_reflection_eval_reflect(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_REFRACTION_ID:
- eval = bsdf_refraction_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+ eval = bsdf_refraction_eval_reflect(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_TRANSPARENT_ID:
- eval = bsdf_transparent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+ eval = bsdf_transparent_eval_reflect(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_MICROFACET_GGX_ID:
case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
- eval = bsdf_microfacet_ggx_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+ eval = bsdf_microfacet_ggx_eval_reflect(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
- eval = bsdf_microfacet_multi_ggx_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state));
+ eval = bsdf_microfacet_multi_ggx_eval_reflect(sc, sd->I, omega_in, pdf, &sd->lcg_state);
break;
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
- eval = bsdf_microfacet_multi_ggx_glass_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state));
+ eval = bsdf_microfacet_multi_ggx_glass_eval_reflect(sc, sd->I, omega_in, pdf, &sd->lcg_state);
break;
case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
- eval = bsdf_microfacet_beckmann_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+ eval = bsdf_microfacet_beckmann_eval_reflect(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
- eval = bsdf_ashikhmin_shirley_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+ eval = bsdf_ashikhmin_shirley_eval_reflect(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
- eval = bsdf_ashikhmin_velvet_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+ eval = bsdf_ashikhmin_velvet_eval_reflect(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_DIFFUSE_TOON_ID:
- eval = bsdf_diffuse_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+ eval = bsdf_diffuse_toon_eval_reflect(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_GLOSSY_TOON_ID:
- eval = bsdf_glossy_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+ eval = bsdf_glossy_toon_eval_reflect(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_HAIR_REFLECTION_ID:
- eval = bsdf_hair_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+ eval = bsdf_hair_reflection_eval_reflect(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
- eval = bsdf_hair_transmission_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+ eval = bsdf_hair_transmission_eval_reflect(sc, sd->I, omega_in, pdf);
break;
#endif
#ifdef __VOLUME__
case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
- eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf);
+ eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
break;
#endif
default:
@@ -237,63 +237,63 @@ float3 bsdf_eval(KernelGlobals *kg,
switch(sc->type) {
case CLOSURE_BSDF_DIFFUSE_ID:
case CLOSURE_BSDF_BSSRDF_ID:
- eval = bsdf_diffuse_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+ eval = bsdf_diffuse_eval_transmit(sc, sd->I, omega_in, pdf);
break;
#ifdef __SVM__
case CLOSURE_BSDF_OREN_NAYAR_ID:
- eval = bsdf_oren_nayar_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+ eval = bsdf_oren_nayar_eval_transmit(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_TRANSLUCENT_ID:
- eval = bsdf_translucent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+ eval = bsdf_translucent_eval_transmit(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_REFLECTION_ID:
- eval = bsdf_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+ eval = bsdf_reflection_eval_transmit(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_REFRACTION_ID:
- eval = bsdf_refraction_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+ eval = bsdf_refraction_eval_transmit(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_TRANSPARENT_ID:
- eval = bsdf_transparent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+ eval = bsdf_transparent_eval_transmit(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_MICROFACET_GGX_ID:
case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
- eval = bsdf_microfacet_ggx_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+ eval = bsdf_microfacet_ggx_eval_transmit(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
- eval = bsdf_microfacet_multi_ggx_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state));
+ eval = bsdf_microfacet_multi_ggx_eval_transmit(sc, sd->I, omega_in, pdf, &sd->lcg_state);
break;
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
- eval = bsdf_microfacet_multi_ggx_glass_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state));
+ eval = bsdf_microfacet_multi_ggx_glass_eval_transmit(sc, sd->I, omega_in, pdf, &sd->lcg_state);
break;
case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
- eval = bsdf_microfacet_beckmann_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+ eval = bsdf_microfacet_beckmann_eval_transmit(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
- eval = bsdf_ashikhmin_shirley_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+ eval = bsdf_ashikhmin_shirley_eval_transmit(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
- eval = bsdf_ashikhmin_velvet_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+ eval = bsdf_ashikhmin_velvet_eval_transmit(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_DIFFUSE_TOON_ID:
- eval = bsdf_diffuse_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+ eval = bsdf_diffuse_toon_eval_transmit(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_GLOSSY_TOON_ID:
- eval = bsdf_glossy_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+ eval = bsdf_glossy_toon_eval_transmit(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_HAIR_REFLECTION_ID:
- eval = bsdf_hair_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+ eval = bsdf_hair_reflection_eval_transmit(sc, sd->I, omega_in, pdf);
break;
case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
- eval = bsdf_hair_transmission_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+ eval = bsdf_hair_transmission_eval_transmit(sc, sd->I, omega_in, pdf);
break;
#endif
#ifdef __VOLUME__
case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
- eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf);
+ eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
break;
#endif
default:
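
The bulk of this file is a mechanical rewrite of ccl_fetch(sd, x) to sd->x. A sketch of why the macro existed (the definitions below are illustrative, not the historical ones verbatim): it abstracted ShaderData member access so a split-kernel build could substitute a different storage layout, an indirection the new split_data scheme no longer needs.

    #include <cstdio>

    struct ShaderData { float u; };

    /* On the CPU/mega-kernel path the macro reduced to plain member access;
     * split-kernel builds could redefine it to index per-thread storage. */
    #define ccl_fetch(sd, field) ((sd)->field)

    int main()
    {
        ShaderData sd = {0.25f};
        printf("%g == %g\n", ccl_fetch((&sd), u), sd.u);
        return 0;
    }
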
diff --git a/intern/cycles/kernel/geom/geom_attribute.h b/intern/cycles/kernel/geom/geom_attribute.h
index 08ccee56335..cc62192ef21 100644
--- a/intern/cycles/kernel/geom/geom_attribute.h
+++ b/intern/cycles/kernel/geom/geom_attribute.h
@@ -30,7 +30,7 @@ ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const ShaderData *
ccl_device_inline uint attribute_primitive_type(KernelGlobals *kg, const ShaderData *sd)
{
#ifdef __HAIR__
- if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
+ if(sd->type & PRIMITIVE_ALL_CURVE) {
return ATTR_PRIM_CURVE;
}
else
@@ -53,12 +53,12 @@ ccl_device_inline AttributeDescriptor attribute_not_found()
ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id)
{
- if(ccl_fetch(sd, object) == PRIM_NONE) {
+ if(sd->object == PRIM_NONE) {
return attribute_not_found();
}
/* for SVM, find attribute by unique id */
- uint attr_offset = ccl_fetch(sd, object)*kernel_data.bvh.attributes_map_stride;
+ uint attr_offset = sd->object*kernel_data.bvh.attributes_map_stride;
attr_offset += attribute_primitive_type(kg, sd);
uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
@@ -73,7 +73,7 @@ ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals *kg, const Sh
AttributeDescriptor desc;
desc.element = (AttributeElement)attr_map.y;
- if(ccl_fetch(sd, prim) == PRIM_NONE &&
+ if(sd->prim == PRIM_NONE &&
desc.element != ATTR_ELEMENT_MESH &&
desc.element != ATTR_ELEMENT_VOXEL &&
desc.element != ATTR_ELEMENT_OBJECT)
diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h
index 712b67a1b55..7cc840ce78d 100644
--- a/intern/cycles/kernel/geom/geom_curve.h
+++ b/intern/cycles/kernel/geom/geom_curve.h
@@ -32,22 +32,22 @@ ccl_device float curve_attribute_float(KernelGlobals *kg, const ShaderData *sd,
if(dy) *dy = 0.0f;
#endif
- return kernel_tex_fetch(__attributes_float, desc.offset + ccl_fetch(sd, prim));
+ return kernel_tex_fetch(__attributes_float, desc.offset + sd->prim);
}
else if(desc.element == ATTR_ELEMENT_CURVE_KEY || desc.element == ATTR_ELEMENT_CURVE_KEY_MOTION) {
- float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
- int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
+ float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
+ int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
int k1 = k0 + 1;
float f0 = kernel_tex_fetch(__attributes_float, desc.offset + k0);
float f1 = kernel_tex_fetch(__attributes_float, desc.offset + k1);
#ifdef __RAY_DIFFERENTIALS__
- if(dx) *dx = ccl_fetch(sd, du).dx*(f1 - f0);
+ if(dx) *dx = sd->du.dx*(f1 - f0);
if(dy) *dy = 0.0f;
#endif
- return (1.0f - ccl_fetch(sd, u))*f0 + ccl_fetch(sd, u)*f1;
+ return (1.0f - sd->u)*f0 + sd->u*f1;
}
else {
#ifdef __RAY_DIFFERENTIALS__
@@ -71,22 +71,22 @@ ccl_device float3 curve_attribute_float3(KernelGlobals *kg, const ShaderData *sd
if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
#endif
- return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + ccl_fetch(sd, prim)));
+ return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + sd->prim));
}
else if(desc.element == ATTR_ELEMENT_CURVE_KEY || desc.element == ATTR_ELEMENT_CURVE_KEY_MOTION) {
- float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
- int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
+ float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
+ int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
int k1 = k0 + 1;
float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + k0));
float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + k1));
#ifdef __RAY_DIFFERENTIALS__
- if(dx) *dx = ccl_fetch(sd, du).dx*(f1 - f0);
+ if(dx) *dx = sd->du.dx*(f1 - f0);
if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
#endif
- return (1.0f - ccl_fetch(sd, u))*f0 + ccl_fetch(sd, u)*f1;
+ return (1.0f - sd->u)*f0 + sd->u*f1;
}
else {
#ifdef __RAY_DIFFERENTIALS__
@@ -104,22 +104,22 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
{
float r = 0.0f;
- if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
- float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
- int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
+ if(sd->type & PRIMITIVE_ALL_CURVE) {
+ float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
+ int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
int k1 = k0 + 1;
float4 P_curve[2];
- if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) {
+ if(sd->type & PRIMITIVE_CURVE) {
P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
}
else {
- motion_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), k0, k1, P_curve);
+ motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve);
}
- r = (P_curve[1].w - P_curve[0].w) * ccl_fetch(sd, u) + P_curve[0].w;
+ r = (P_curve[1].w - P_curve[0].w) * sd->u + P_curve[0].w;
}
return r*2.0f;
@@ -130,8 +130,8 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd)
{
- float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
- int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
+ float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
+ int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
int k1 = k0 + 1;
float4 P_curve[2];
@@ -139,7 +139,7 @@ ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd
P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
- return float4_to_float3(P_curve[1]) * ccl_fetch(sd, u) + float4_to_float3(P_curve[0]) * (1.0f - ccl_fetch(sd, u));
+ return float4_to_float3(P_curve[1]) * sd->u + float4_to_float3(P_curve[0]) * (1.0f - sd->u);
}
/* Curve tangent normal */
@@ -148,14 +148,14 @@ ccl_device float3 curve_tangent_normal(KernelGlobals *kg, ShaderData *sd)
{
float3 tgN = make_float3(0.0f,0.0f,0.0f);
- if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
+ if(sd->type & PRIMITIVE_ALL_CURVE) {
- tgN = -(-ccl_fetch(sd, I) - ccl_fetch(sd, dPdu) * (dot(ccl_fetch(sd, dPdu),-ccl_fetch(sd, I)) / len_squared(ccl_fetch(sd, dPdu))));
+ tgN = -(-sd->I - sd->dPdu * (dot(sd->dPdu,-sd->I) / len_squared(sd->dPdu)));
tgN = normalize(tgN);
 		/* Need to find a suitably scaled gd for the corrected normal. */
#if 0
- tgN = normalize(tgN - gd * ccl_fetch(sd, dPdu));
+ tgN = normalize(tgN - gd * sd->dPdu);
#endif
}
@@ -966,7 +966,7 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
if(isect->object != OBJECT_NONE) {
#ifdef __OBJECT_MOTION__
- Transform tfm = ccl_fetch(sd, ob_itfm);
+ Transform tfm = sd->ob_itfm;
#else
Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
#endif
@@ -979,7 +979,7 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
int prim = kernel_tex_fetch(__prim_index, isect->prim);
float4 v00 = kernel_tex_fetch(__curves, prim);
- int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
+ int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
int k1 = k0 + 1;
float3 tg;
@@ -990,14 +990,14 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
float4 P_curve[4];
- if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) {
+ if(sd->type & PRIMITIVE_CURVE) {
P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
}
else {
- motion_cardinal_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), ka, k0, k1, kb, P_curve);
+ motion_cardinal_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve);
}
float3 p[4];
@@ -1009,43 +1009,43 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
P = P + D*t;
#ifdef __UV__
- ccl_fetch(sd, u) = isect->u;
- ccl_fetch(sd, v) = 0.0f;
+ sd->u = isect->u;
+ sd->v = 0.0f;
#endif
tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3]));
if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) {
- ccl_fetch(sd, Ng) = normalize(-(D - tg * (dot(tg, D))));
+ sd->Ng = normalize(-(D - tg * (dot(tg, D))));
}
else {
/* direction from inside to surface of curve */
float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]);
- ccl_fetch(sd, Ng) = normalize(P - p_curr);
+ sd->Ng = normalize(P - p_curr);
/* adjustment for changing radius */
float gd = isect->v;
if(gd != 0.0f) {
- ccl_fetch(sd, Ng) = ccl_fetch(sd, Ng) - gd * tg;
- ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng));
+ sd->Ng = sd->Ng - gd * tg;
+ sd->Ng = normalize(sd->Ng);
}
}
 		/* TODO: Sometimes the normal is oriented such that the hit is detected
 		 * as backfacing even when backface culling is enabled. */
- ccl_fetch(sd, N) = ccl_fetch(sd, Ng);
+ sd->N = sd->Ng;
}
else {
float4 P_curve[2];
- if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) {
+ if(sd->type & PRIMITIVE_CURVE) {
P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
}
else {
- motion_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), k0, k1, P_curve);
+ motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve);
}
float l = 1.0f;
@@ -1056,39 +1056,39 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
float3 dif = P - float4_to_float3(P_curve[0]);
#ifdef __UV__
- ccl_fetch(sd, u) = dot(dif,tg)/l;
- ccl_fetch(sd, v) = 0.0f;
+ sd->u = dot(dif,tg)/l;
+ sd->v = 0.0f;
#endif
if(flag & CURVE_KN_TRUETANGENTGNORMAL) {
- ccl_fetch(sd, Ng) = -(D - tg * dot(tg, D));
- ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng));
+ sd->Ng = -(D - tg * dot(tg, D));
+ sd->Ng = normalize(sd->Ng);
}
else {
float gd = isect->v;
/* direction from inside to surface of curve */
- ccl_fetch(sd, Ng) = (dif - tg * ccl_fetch(sd, u) * l) / (P_curve[0].w + ccl_fetch(sd, u) * l * gd);
+ sd->Ng = (dif - tg * sd->u * l) / (P_curve[0].w + sd->u * l * gd);
/* adjustment for changing radius */
if(gd != 0.0f) {
- ccl_fetch(sd, Ng) = ccl_fetch(sd, Ng) - gd * tg;
- ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng));
+ sd->Ng = sd->Ng - gd * tg;
+ sd->Ng = normalize(sd->Ng);
}
}
- ccl_fetch(sd, N) = ccl_fetch(sd, Ng);
+ sd->N = sd->Ng;
}
#ifdef __DPDU__
/* dPdu/dPdv */
- ccl_fetch(sd, dPdu) = tg;
- ccl_fetch(sd, dPdv) = cross(tg, ccl_fetch(sd, Ng));
+ sd->dPdu = tg;
+ sd->dPdv = cross(tg, sd->Ng);
#endif
if(isect->object != OBJECT_NONE) {
#ifdef __OBJECT_MOTION__
- Transform tfm = ccl_fetch(sd, ob_tfm);
+ Transform tfm = sd->ob_tfm;
#else
Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
#endif
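
A worked instance of the curve_thickness interpolation above, with hypothetical key radii: the key's w component stores a radius, lerped along the segment parameter u and doubled into a diameter.

    #include <cstdio>

    int main()
    {
        float r0 = 0.10f, r1 = 0.02f;  /* hypothetical radii in P_curve[0].w / P_curve[1].w */
        float u = 0.5f;                /* intersection parameter along the segment */
        float r = (r1 - r0) * u + r0;
        printf("thickness = %g\n", r * 2.0f);  /* 0.12 */
        return 0;
    }
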
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h
index d57d74ea882..2500228281e 100644
--- a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h
@@ -48,7 +48,7 @@ ccl_device_inline float3 motion_triangle_refine(KernelGlobals *kg,
return P;
}
# ifdef __OBJECT_MOTION__
- Transform tfm = ccl_fetch(sd, ob_itfm);
+ Transform tfm = sd->ob_itfm;
# else
Transform tfm = object_fetch_transform(kg,
isect->object,
@@ -77,7 +77,7 @@ ccl_device_inline float3 motion_triangle_refine(KernelGlobals *kg,
if(isect->object != OBJECT_NONE) {
# ifdef __OBJECT_MOTION__
- Transform tfm = ccl_fetch(sd, ob_tfm);
+ Transform tfm = sd->ob_tfm;
# else
Transform tfm = object_fetch_transform(kg,
isect->object,
@@ -116,7 +116,7 @@ float3 motion_triangle_refine_subsurface(KernelGlobals *kg,
# ifdef __INTERSECTION_REFINE__
if(isect->object != OBJECT_NONE) {
# ifdef __OBJECT_MOTION__
- Transform tfm = ccl_fetch(sd, ob_itfm);
+ Transform tfm = sd->ob_itfm;
# else
Transform tfm = object_fetch_transform(kg,
isect->object,
@@ -144,7 +144,7 @@ float3 motion_triangle_refine_subsurface(KernelGlobals *kg,
if(isect->object != OBJECT_NONE) {
# ifdef __OBJECT_MOTION__
- Transform tfm = ccl_fetch(sd, ob_tfm);
+ Transform tfm = sd->ob_tfm;
# else
Transform tfm = object_fetch_transform(kg,
isect->object,
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_shader.h b/intern/cycles/kernel/geom/geom_motion_triangle_shader.h
index 0e024a05db6..cb456056e20 100644
--- a/intern/cycles/kernel/geom/geom_motion_triangle_shader.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle_shader.h
@@ -39,26 +39,26 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg,
bool subsurface)
{
/* Get shader. */
- ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim));
+ sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
/* Get motion info. */
/* TODO(sergey): This logic is really similar to motion_triangle_vertices(),
* can we de-duplicate something here?
*/
int numsteps, numverts;
- object_motion_info(kg, ccl_fetch(sd, object), &numsteps, &numverts, NULL);
+ object_motion_info(kg, sd->object, &numsteps, &numverts, NULL);
/* Figure out which steps we need to fetch and their interpolation factor. */
int maxstep = numsteps*2;
- int step = min((int)(ccl_fetch(sd, time)*maxstep), maxstep-1);
- float t = ccl_fetch(sd, time)*maxstep - step;
+ int step = min((int)(sd->time*maxstep), maxstep-1);
+ float t = sd->time*maxstep - step;
/* Find attribute. */
AttributeElement elem;
- int offset = find_attribute_motion(kg, ccl_fetch(sd, object),
+ int offset = find_attribute_motion(kg, sd->object,
ATTR_STD_MOTION_VERTEX_POSITION,
&elem);
kernel_assert(offset != ATTR_STD_NOT_FOUND);
/* Fetch vertex coordinates. */
float3 verts[3], next_verts[3];
- uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
+ uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts);
motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_verts);
/* Interpolate between steps. */
@@ -68,7 +68,7 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg,
/* Compute refined position. */
#ifdef __SUBSURFACE__
if(subsurface) {
- ccl_fetch(sd, P) = motion_triangle_refine_subsurface(kg,
+ sd->P = motion_triangle_refine_subsurface(kg,
sd,
isect,
ray,
@@ -77,29 +77,29 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg,
else
#endif /* __SUBSURFACE__*/
{
- ccl_fetch(sd, P) = motion_triangle_refine(kg, sd, isect, ray, verts);
+ sd->P = motion_triangle_refine(kg, sd, isect, ray, verts);
}
/* Compute face normal. */
float3 Ng;
- if(ccl_fetch(sd, object_flag) & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
+ if(sd->object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
Ng = normalize(cross(verts[2] - verts[0], verts[1] - verts[0]));
}
else {
Ng = normalize(cross(verts[1] - verts[0], verts[2] - verts[0]));
}
- ccl_fetch(sd, Ng) = Ng;
- ccl_fetch(sd, N) = Ng;
+ sd->Ng = Ng;
+ sd->N = Ng;
/* Compute derivatives of P w.r.t. uv. */
#ifdef __DPDU__
- ccl_fetch(sd, dPdu) = (verts[0] - verts[2]);
- ccl_fetch(sd, dPdv) = (verts[1] - verts[2]);
+ sd->dPdu = (verts[0] - verts[2]);
+ sd->dPdv = (verts[1] - verts[2]);
#endif
/* Compute smooth normal. */
- if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) {
+ if(sd->shader & SHADER_SMOOTH_NORMAL) {
/* Find attribute. */
AttributeElement elem;
int offset = find_attribute_motion(kg,
- ccl_fetch(sd, object),
+ sd->object,
ATTR_STD_MOTION_VERTEX_NORMAL,
&elem);
kernel_assert(offset != ATTR_STD_NOT_FOUND);
@@ -112,10 +112,10 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg,
normals[1] = (1.0f - t)*normals[1] + t*next_normals[1];
normals[2] = (1.0f - t)*normals[2] + t*next_normals[2];
/* Interpolate between vertices. */
- float u = ccl_fetch(sd, u);
- float v = ccl_fetch(sd, v);
+ float u = sd->u;
+ float v = sd->v;
float w = 1.0f - u - v;
- ccl_fetch(sd, N) = (u*normals[0] + v*normals[1] + w*normals[2]);
+ sd->N = (u*normals[0] + v*normals[1] + w*normals[2]);
}
}
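
A worked instance of the step/factor computation in motion_triangle_shader_setup above, with hypothetical numbers: numsteps = 2 yields four sub-intervals, so time 0.7 falls in interval 2 with blend factor 0.8 toward the next sample.

    #include <algorithm>
    #include <cstdio>

    int main()
    {
        int numsteps = 2;            /* hypothetical motion step count */
        float time = 0.7f;           /* sd->time */
        int maxstep = numsteps * 2;
        int step = std::min((int)(time * maxstep), maxstep - 1);
        float t = time * maxstep - step;
        printf("step = %d, t = %g\n", step, t);  /* step = 2, t = 0.8 */
        return 0;
    }
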
diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h
index f51b2d18657..5a04be8b0bf 100644
--- a/intern/cycles/kernel/geom/geom_object.h
+++ b/intern/cycles/kernel/geom/geom_object.h
@@ -137,9 +137,9 @@ ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg
ccl_device_inline void object_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P)
{
#ifdef __OBJECT_MOTION__
- *P = transform_point_auto(&ccl_fetch(sd, ob_tfm), *P);
+ *P = transform_point_auto(&sd->ob_tfm, *P);
#else
- Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
+ Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
*P = transform_point(&tfm, *P);
#endif
}
@@ -149,9 +149,9 @@ ccl_device_inline void object_position_transform(KernelGlobals *kg, const Shader
ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P)
{
#ifdef __OBJECT_MOTION__
- *P = transform_point_auto(&ccl_fetch(sd, ob_itfm), *P);
+ *P = transform_point_auto(&sd->ob_itfm, *P);
#else
- Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
+ Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
*P = transform_point(&tfm, *P);
#endif
}
@@ -161,12 +161,12 @@ ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, cons
ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N)
{
#ifdef __OBJECT_MOTION__
- if((ccl_fetch(sd, object) != OBJECT_NONE) || (ccl_fetch(sd, type) == PRIMITIVE_LAMP)) {
- *N = normalize(transform_direction_transposed_auto(&ccl_fetch(sd, ob_tfm), *N));
+ if((sd->object != OBJECT_NONE) || (sd->type == PRIMITIVE_LAMP)) {
+ *N = normalize(transform_direction_transposed_auto(&sd->ob_tfm, *N));
}
#else
- if(ccl_fetch(sd, object) != OBJECT_NONE) {
- Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
+ if(sd->object != OBJECT_NONE) {
+ Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
*N = normalize(transform_direction_transposed(&tfm, *N));
}
#endif
@@ -177,9 +177,9 @@ ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const
ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N)
{
#ifdef __OBJECT_MOTION__
- *N = normalize(transform_direction_transposed_auto(&ccl_fetch(sd, ob_itfm), *N));
+ *N = normalize(transform_direction_transposed_auto(&sd->ob_itfm, *N));
#else
- Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
+ Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
*N = normalize(transform_direction_transposed(&tfm, *N));
#endif
}
@@ -189,9 +189,9 @@ ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderDa
ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D)
{
#ifdef __OBJECT_MOTION__
- *D = transform_direction_auto(&ccl_fetch(sd, ob_tfm), *D);
+ *D = transform_direction_auto(&sd->ob_tfm, *D);
#else
- Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
+ Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
*D = transform_direction(&tfm, *D);
#endif
}
@@ -201,9 +201,9 @@ ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData
ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D)
{
#ifdef __OBJECT_MOTION__
- *D = transform_direction_auto(&ccl_fetch(sd, ob_itfm), *D);
+ *D = transform_direction_auto(&sd->ob_itfm, *D);
#else
- Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
+ Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
*D = transform_direction(&tfm, *D);
#endif
}
@@ -212,13 +212,13 @@ ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const Sha
ccl_device_inline float3 object_location(KernelGlobals *kg, const ShaderData *sd)
{
- if(ccl_fetch(sd, object) == OBJECT_NONE)
+ if(sd->object == OBJECT_NONE)
return make_float3(0.0f, 0.0f, 0.0f);
#ifdef __OBJECT_MOTION__
- return make_float3(ccl_fetch(sd, ob_tfm).x.w, ccl_fetch(sd, ob_tfm).y.w, ccl_fetch(sd, ob_tfm).z.w);
+ return make_float3(sd->ob_tfm.x.w, sd->ob_tfm.y.w, sd->ob_tfm.z.w);
#else
- Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
+ Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
return make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
#endif
}
@@ -326,7 +326,7 @@ ccl_device_inline uint object_patch_map_offset(KernelGlobals *kg, int object)
ccl_device int shader_pass_id(KernelGlobals *kg, const ShaderData *sd)
{
- return kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE + 1);
+ return kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE + 1);
}
/* Particle data from which object was instanced */
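
object_normal_transform above maps normals with the transposed inverse transform (ob_itfm, transposed) rather than the object transform itself. A 2D toy case sketches why normals need the inverse-transpose under non-uniform scale:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        /* Object transform: non-uniform scale diag(2, 1). Tangent (1, 1) maps to (2, 1). */
        float nx = 1.0f, ny = -1.0f;             /* normal of the unscaled surface */
        float tnx = 0.5f * nx, tny = 1.0f * ny;  /* inverse-transpose diag(0.5, 1) applied */
        float len = std::sqrt(tnx * tnx + tny * tny);
        /* dot((2, 1), (0.5, -1)) == 0: perpendicularity survives the scale. */
        printf("normal: (%g, %g)\n", tnx / len, tny / len);
        return 0;
    }
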
diff --git a/intern/cycles/kernel/geom/geom_patch.h b/intern/cycles/kernel/geom/geom_patch.h
index 6a0ff5a4a04..5663b598508 100644
--- a/intern/cycles/kernel/geom/geom_patch.h
+++ b/intern/cycles/kernel/geom/geom_patch.h
@@ -267,7 +267,7 @@ ccl_device float patch_eval_float(KernelGlobals *kg, const ShaderData *sd, int o
float weights_du[PATCH_MAX_CONTROL_VERTS];
float weights_dv[PATCH_MAX_CONTROL_VERTS];
- int num_control = patch_eval_control_verts(kg, ccl_fetch(sd, object), patch, u, v, channel,
+ int num_control = patch_eval_control_verts(kg, sd->object, patch, u, v, channel,
indices, weights, weights_du, weights_dv);
float val = 0.0f;
@@ -294,7 +294,7 @@ ccl_device float3 patch_eval_float3(KernelGlobals *kg, const ShaderData *sd, int
float weights_du[PATCH_MAX_CONTROL_VERTS];
float weights_dv[PATCH_MAX_CONTROL_VERTS];
- int num_control = patch_eval_control_verts(kg, ccl_fetch(sd, object), patch, u, v, channel,
+ int num_control = patch_eval_control_verts(kg, sd->object, patch, u, v, channel,
indices, weights, weights_du, weights_dv);
float3 val = make_float3(0.0f, 0.0f, 0.0f);
@@ -321,7 +321,7 @@ ccl_device float3 patch_eval_uchar4(KernelGlobals *kg, const ShaderData *sd, int
float weights_du[PATCH_MAX_CONTROL_VERTS];
float weights_dv[PATCH_MAX_CONTROL_VERTS];
- int num_control = patch_eval_control_verts(kg, ccl_fetch(sd, object), patch, u, v, channel,
+ int num_control = patch_eval_control_verts(kg, sd->object, patch, u, v, channel,
indices, weights, weights_du, weights_dv);
float3 val = make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/geom/geom_primitive.h b/intern/cycles/kernel/geom/geom_primitive.h
index 8a73bb2f78b..90a9c2147cc 100644
--- a/intern/cycles/kernel/geom/geom_primitive.h
+++ b/intern/cycles/kernel/geom/geom_primitive.h
@@ -28,19 +28,19 @@ ccl_device_inline float primitive_attribute_float(KernelGlobals *kg,
const AttributeDescriptor desc,
float *dx, float *dy)
{
- if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) {
+ if(sd->type & PRIMITIVE_ALL_TRIANGLE) {
if(subd_triangle_patch(kg, sd) == ~0)
return triangle_attribute_float(kg, sd, desc, dx, dy);
else
return subd_triangle_attribute_float(kg, sd, desc, dx, dy);
}
#ifdef __HAIR__
- else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
+ else if(sd->type & PRIMITIVE_ALL_CURVE) {
return curve_attribute_float(kg, sd, desc, dx, dy);
}
#endif
#ifdef __VOLUME__
- else if(ccl_fetch(sd, object) != OBJECT_NONE && desc.element == ATTR_ELEMENT_VOXEL) {
+ else if(sd->object != OBJECT_NONE && desc.element == ATTR_ELEMENT_VOXEL) {
return volume_attribute_float(kg, sd, desc, dx, dy);
}
#endif
@@ -56,19 +56,19 @@ ccl_device_inline float3 primitive_attribute_float3(KernelGlobals *kg,
const AttributeDescriptor desc,
float3 *dx, float3 *dy)
{
- if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) {
+ if(sd->type & PRIMITIVE_ALL_TRIANGLE) {
if(subd_triangle_patch(kg, sd) == ~0)
return triangle_attribute_float3(kg, sd, desc, dx, dy);
else
return subd_triangle_attribute_float3(kg, sd, desc, dx, dy);
}
#ifdef __HAIR__
- else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
+ else if(sd->type & PRIMITIVE_ALL_CURVE) {
return curve_attribute_float3(kg, sd, desc, dx, dy);
}
#endif
#ifdef __VOLUME__
- else if(ccl_fetch(sd, object) != OBJECT_NONE && desc.element == ATTR_ELEMENT_VOXEL) {
+ else if(sd->object != OBJECT_NONE && desc.element == ATTR_ELEMENT_VOXEL) {
return volume_attribute_float3(kg, sd, desc, dx, dy);
}
#endif
@@ -118,9 +118,9 @@ ccl_device bool primitive_ptex(KernelGlobals *kg, ShaderData *sd, float2 *uv, in
ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
{
#ifdef __HAIR__
- if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)
+ if(sd->type & PRIMITIVE_ALL_CURVE)
# ifdef __DPDU__
- return normalize(ccl_fetch(sd, dPdu));
+ return normalize(sd->dPdu);
# else
return make_float3(0.0f, 0.0f, 0.0f);
# endif
@@ -133,12 +133,12 @@ ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
float3 data = primitive_attribute_float3(kg, sd, desc, NULL, NULL);
data = make_float3(-(data.y - 0.5f), (data.x - 0.5f), 0.0f);
object_normal_transform(kg, sd, &data);
- return cross(ccl_fetch(sd, N), normalize(cross(data, ccl_fetch(sd, N))));
+ return cross(sd->N, normalize(cross(data, sd->N)));
}
else {
/* otherwise use surface derivatives */
#ifdef __DPDU__
- return normalize(ccl_fetch(sd, dPdu));
+ return normalize(sd->dPdu);
#else
return make_float3(0.0f, 0.0f, 0.0f);
#endif
@@ -153,17 +153,17 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *
float3 center;
#ifdef __HAIR__
- bool is_curve_primitive = ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE;
+ bool is_curve_primitive = sd->type & PRIMITIVE_ALL_CURVE;
if(is_curve_primitive) {
center = curve_motion_center_location(kg, sd);
- if(!(ccl_fetch(sd, object_flag) & SD_OBJECT_TRANSFORM_APPLIED)) {
+ if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
object_position_transform(kg, sd, &center);
}
}
else
#endif
- center = ccl_fetch(sd, P);
+ center = sd->P;
float3 motion_pre = center, motion_post = center;
@@ -173,16 +173,16 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *
if(desc.offset != ATTR_STD_NOT_FOUND) {
/* get motion info */
int numverts, numkeys;
- object_motion_info(kg, ccl_fetch(sd, object), NULL, &numverts, &numkeys);
+ object_motion_info(kg, sd->object, NULL, &numverts, &numkeys);
/* lookup attributes */
motion_pre = primitive_attribute_float3(kg, sd, desc, NULL, NULL);
- desc.offset += (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)? numverts: numkeys;
+ desc.offset += (sd->type & PRIMITIVE_ALL_TRIANGLE)? numverts: numkeys;
motion_post = primitive_attribute_float3(kg, sd, desc, NULL, NULL);
#ifdef __HAIR__
- if(is_curve_primitive && (ccl_fetch(sd, object_flag) & SD_OBJECT_HAS_VERTEX_MOTION) == 0) {
+ if(is_curve_primitive && (sd->object_flag & SD_OBJECT_HAS_VERTEX_MOTION) == 0) {
object_position_transform(kg, sd, &motion_pre);
object_position_transform(kg, sd, &motion_post);
}
@@ -193,10 +193,10 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *
* transformation was set match the world/object space of motion_pre/post */
Transform tfm;
- tfm = object_fetch_vector_transform(kg, ccl_fetch(sd, object), OBJECT_VECTOR_MOTION_PRE);
+ tfm = object_fetch_vector_transform(kg, sd->object, OBJECT_VECTOR_MOTION_PRE);
motion_pre = transform_point(&tfm, motion_pre);
- tfm = object_fetch_vector_transform(kg, ccl_fetch(sd, object), OBJECT_VECTOR_MOTION_POST);
+ tfm = object_fetch_vector_transform(kg, sd->object, OBJECT_VECTOR_MOTION_POST);
motion_post = transform_point(&tfm, motion_post);
float3 motion_center;
diff --git a/intern/cycles/kernel/geom/geom_subd_triangle.h b/intern/cycles/kernel/geom/geom_subd_triangle.h
index 647840dc696..044e82f03d4 100644
--- a/intern/cycles/kernel/geom/geom_subd_triangle.h
+++ b/intern/cycles/kernel/geom/geom_subd_triangle.h
@@ -22,14 +22,14 @@ CCL_NAMESPACE_BEGIN
ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const ShaderData *sd)
{
- return (ccl_fetch(sd, prim) != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, ccl_fetch(sd, prim)) : ~0;
+ return (sd->prim != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, sd->prim) : ~0;
}
/* UV coords of triangle within patch */
ccl_device_inline void subd_triangle_patch_uv(KernelGlobals *kg, const ShaderData *sd, float2 uv[3])
{
- uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
+ uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
uv[0] = kernel_tex_fetch(__tri_patch_uv, tri_vindex.x);
uv[1] = kernel_tex_fetch(__tri_patch_uv, tri_vindex.y);
@@ -110,7 +110,7 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const
float2 dpdv = uv[1] - uv[2];
/* p is [s, t] */
- float2 p = dpdu * ccl_fetch(sd, u) + dpdv * ccl_fetch(sd, v) + uv[2];
+ float2 p = dpdu * sd->u + dpdv * sd->v + uv[2];
float a, dads, dadt;
a = patch_eval_float(kg, sd, desc.offset, patch, p.x, p.y, 0, &dads, &dadt);
@@ -123,8 +123,8 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const
float dtdv = dpdv.y;
if(dx) {
- float dudx = ccl_fetch(sd, du).dx;
- float dvdx = ccl_fetch(sd, dv).dx;
+ float dudx = sd->du.dx;
+ float dvdx = sd->dv.dx;
float dsdx = dsdu*dudx + dsdv*dvdx;
float dtdx = dtdu*dudx + dtdv*dvdx;
@@ -132,8 +132,8 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const
*dx = dads*dsdx + dadt*dtdx;
}
if(dy) {
- float dudy = ccl_fetch(sd, du).dy;
- float dvdy = ccl_fetch(sd, dv).dy;
+ float dudy = sd->du.dy;
+ float dvdy = sd->dv.dy;
float dsdy = dsdu*dudy + dsdv*dvdy;
float dtdy = dtdu*dudy + dtdv*dvdy;
@@ -174,11 +174,11 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const
float c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y);
#ifdef __RAY_DIFFERENTIALS__
- if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c;
- if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c;
+ if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c;
+ if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c;
#endif
- return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c;
+ return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c;
}
else if(desc.element == ATTR_ELEMENT_CORNER) {
float2 uv[3];
@@ -202,11 +202,11 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const
float c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y);
#ifdef __RAY_DIFFERENTIALS__
- if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c;
- if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c;
+ if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c;
+ if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c;
#endif
- return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c;
+ return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c;
}
else {
if(dx) *dx = 0.0f;
@@ -229,7 +229,7 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con
float2 dpdv = uv[1] - uv[2];
/* p is [s, t] */
- float2 p = dpdu * ccl_fetch(sd, u) + dpdv * ccl_fetch(sd, v) + uv[2];
+ float2 p = dpdu * sd->u + dpdv * sd->v + uv[2];
float3 a, dads, dadt;
@@ -248,8 +248,8 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con
float dtdv = dpdv.y;
if(dx) {
- float dudx = ccl_fetch(sd, du).dx;
- float dvdx = ccl_fetch(sd, dv).dx;
+ float dudx = sd->du.dx;
+ float dvdx = sd->dv.dx;
float dsdx = dsdu*dudx + dsdv*dvdx;
float dtdx = dtdu*dudx + dtdv*dvdx;
@@ -257,8 +257,8 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con
*dx = dads*dsdx + dadt*dtdx;
}
if(dy) {
- float dudy = ccl_fetch(sd, du).dy;
- float dvdy = ccl_fetch(sd, dv).dy;
+ float dudy = sd->du.dy;
+ float dvdy = sd->dv.dy;
float dsdy = dsdu*dudy + dsdv*dvdy;
float dtdy = dtdu*dudy + dtdv*dvdy;
@@ -299,11 +299,11 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con
float3 c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y);
#ifdef __RAY_DIFFERENTIALS__
- if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c;
- if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c;
+ if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c;
+ if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c;
#endif
- return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c;
+ return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c;
}
else if(desc.element == ATTR_ELEMENT_CORNER || desc.element == ATTR_ELEMENT_CORNER_BYTE) {
float2 uv[3];
@@ -337,11 +337,11 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con
float3 c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y);
#ifdef __RAY_DIFFERENTIALS__
- if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c;
- if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c;
+ if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c;
+ if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c;
#endif
- return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c;
+ return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c;
}
else {
if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h
index 3229091bbb0..47778553b94 100644
--- a/intern/cycles/kernel/geom/geom_triangle.h
+++ b/intern/cycles/kernel/geom/geom_triangle.h
@@ -26,13 +26,13 @@ CCL_NAMESPACE_BEGIN
ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd)
{
/* load triangle vertices */
- const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
+ const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
const float3 v0 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+0));
const float3 v1 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+1));
const float3 v2 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+2));
/* return normal */
- if(ccl_fetch(sd, object_flag) & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
+ if(sd->object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
return normalize(cross(v2 - v0, v1 - v0));
}
else {
@@ -110,34 +110,34 @@ ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *s
if(dx) *dx = 0.0f;
if(dy) *dy = 0.0f;
- return kernel_tex_fetch(__attributes_float, desc.offset + ccl_fetch(sd, prim));
+ return kernel_tex_fetch(__attributes_float, desc.offset + sd->prim);
}
else if(desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) {
- uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
+ uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
float f0 = kernel_tex_fetch(__attributes_float, desc.offset + tri_vindex.x);
float f1 = kernel_tex_fetch(__attributes_float, desc.offset + tri_vindex.y);
float f2 = kernel_tex_fetch(__attributes_float, desc.offset + tri_vindex.z);
#ifdef __RAY_DIFFERENTIALS__
- if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
- if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
+ if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
+ if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
#endif
- return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
+ return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
}
else if(desc.element == ATTR_ELEMENT_CORNER) {
- int tri = desc.offset + ccl_fetch(sd, prim)*3;
+ int tri = desc.offset + sd->prim*3;
float f0 = kernel_tex_fetch(__attributes_float, tri + 0);
float f1 = kernel_tex_fetch(__attributes_float, tri + 1);
float f2 = kernel_tex_fetch(__attributes_float, tri + 2);
#ifdef __RAY_DIFFERENTIALS__
- if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
- if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
+ if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
+ if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
#endif
- return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
+ return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
}
else {
if(dx) *dx = 0.0f;
@@ -153,24 +153,24 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData
if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
- return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + ccl_fetch(sd, prim)));
+ return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + sd->prim));
}
else if(desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) {
- uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
+ uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.x));
float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.y));
float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.z));
#ifdef __RAY_DIFFERENTIALS__
- if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
- if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
+ if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
+ if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
#endif
- return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
+ return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
}
else if(desc.element == ATTR_ELEMENT_CORNER || desc.element == ATTR_ELEMENT_CORNER_BYTE) {
- int tri = desc.offset + ccl_fetch(sd, prim)*3;
+ int tri = desc.offset + sd->prim*3;
float3 f0, f1, f2;
if(desc.element == ATTR_ELEMENT_CORNER) {
@@ -185,11 +185,11 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData
}
#ifdef __RAY_DIFFERENTIALS__
- if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
- if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
+ if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
+ if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
#endif
- return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
+ return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
}
else {
if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h
index 4db121d94f4..4d234dd62bd 100644
--- a/intern/cycles/kernel/geom/geom_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h
@@ -457,7 +457,7 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg,
return P;
}
# ifdef __OBJECT_MOTION__
- Transform tfm = ccl_fetch(sd, ob_itfm);
+ Transform tfm = sd->ob_itfm;
# else
Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
# endif
@@ -491,7 +491,7 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg,
if(isect->object != OBJECT_NONE) {
# ifdef __OBJECT_MOTION__
- Transform tfm = ccl_fetch(sd, ob_tfm);
+ Transform tfm = sd->ob_tfm;
# else
Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
# endif
@@ -519,7 +519,7 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg,
if(isect->object != OBJECT_NONE) {
#ifdef __OBJECT_MOTION__
- Transform tfm = ccl_fetch(sd, ob_itfm);
+ Transform tfm = sd->ob_itfm;
#else
Transform tfm = object_fetch_transform(kg,
isect->object,
@@ -557,7 +557,7 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg,
if(isect->object != OBJECT_NONE) {
#ifdef __OBJECT_MOTION__
- Transform tfm = ccl_fetch(sd, ob_tfm);
+ Transform tfm = sd->ob_tfm;
#else
Transform tfm = object_fetch_transform(kg,
isect->object,
diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h
index 03724c955be..1e0ef5201c9 100644
--- a/intern/cycles/kernel/geom/geom_volume.h
+++ b/intern/cycles/kernel/geom/geom_volume.h
@@ -64,7 +64,7 @@ ccl_device_inline float3 volume_normalized_position(KernelGlobals *kg,
ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy)
{
- float3 P = volume_normalized_position(kg, sd, ccl_fetch(sd, P));
+ float3 P = volume_normalized_position(kg, sd, sd->P);
#ifdef __KERNEL_CUDA__
# if __CUDA_ARCH__ >= 300
CUtexObject tex = kernel_tex_fetch(__bindless_mapping, desc.offset);
@@ -91,7 +91,7 @@ ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd,
ccl_device float3 volume_attribute_float3(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float3 *dx, float3 *dy)
{
- float3 P = volume_normalized_position(kg, sd, ccl_fetch(sd, P));
+ float3 P = volume_normalized_position(kg, sd, sd->P);
#ifdef __KERNEL_CUDA__
# if __CUDA_ARCH__ >= 300
CUtexObject tex = kernel_tex_fetch(__bindless_mapping, desc.offset);
diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h
index 9279a94c13a..cd339e6237e 100644
--- a/intern/cycles/kernel/kernel.h
+++ b/intern/cycles/kernel/kernel.h
@@ -20,6 +20,7 @@
/* CPU Kernel Interface */
#include "util_types.h"
+#include "kernel_types.h"
CCL_NAMESPACE_BEGIN
@@ -28,6 +29,7 @@ CCL_NAMESPACE_BEGIN
#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)
struct KernelGlobals;
+struct KernelData;
KernelGlobals *kernel_globals_create();
void kernel_globals_free(KernelGlobals *kg);
diff --git a/intern/cycles/kernel/kernel_camera.h b/intern/cycles/kernel/kernel_camera.h
index dedac6b1465..0df5217d97a 100644
--- a/intern/cycles/kernel/kernel_camera.h
+++ b/intern/cycles/kernel/kernel_camera.h
@@ -457,7 +457,7 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd,
{
if(kernel_data.cam.type != CAMERA_PANORAMA) {
/* perspective / ortho */
- if(ccl_fetch(sd, object) == PRIM_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE)
+ if(sd->object == PRIM_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE)
P += camera_position(kg);
Transform tfm = kernel_data.cam.worldtondc;
@@ -467,7 +467,7 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd,
/* panorama */
Transform tfm = kernel_data.cam.worldtocamera;
- if(ccl_fetch(sd, object) != OBJECT_NONE)
+ if(sd->object != OBJECT_NONE)
P = normalize(transform_point(&tfm, P));
else
P = normalize(transform_direction(&tfm, P));
diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h
index 9d1f3bdc918..e347a1eca18 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/kernel_compat_cpu.h
@@ -44,6 +44,15 @@
#define ccl_addr_space
+#define ccl_local_id(d) 0
+#define ccl_global_id(d) (kg->global_id[d])
+
+#define ccl_local_size(d) 1
+#define ccl_global_size(d) (kg->global_size[d])
+
+#define ccl_group_id(d) ccl_global_id(d)
+#define ccl_num_groups(d) ccl_global_size(d)
+
/* On x86_64, versions of glibc < 2.16 have an issue where expf is
* much slower than the double version. This was fixed in glibc 2.16.
*/
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index e0c7b17c6a0..8fffe2a13c9 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -46,11 +46,58 @@
#define ccl_device_noinline __device__ __noinline__
#define ccl_global
#define ccl_constant
+#define ccl_local __shared__
+#define ccl_local_param
+#define ccl_private
#define ccl_may_alias
#define ccl_addr_space
#define ccl_restrict __restrict__
#define ccl_align(n) __align__(n)
+ccl_device_inline uint ccl_local_id(uint d)
+{
+ switch(d) {
+ case 0: return threadIdx.x;
+ case 1: return threadIdx.y;
+ case 2: return threadIdx.z;
+ default: return 0;
+ }
+}
+
+#define ccl_global_id(d) (ccl_group_id(d) * ccl_local_size(d) + ccl_local_id(d))
+
+ccl_device_inline uint ccl_local_size(uint d)
+{
+ switch(d) {
+ case 0: return blockDim.x;
+ case 1: return blockDim.y;
+ case 2: return blockDim.z;
+ default: return 0;
+ }
+}
+
+#define ccl_global_size(d) (ccl_num_groups(d) * ccl_local_size(d))
+
+ccl_device_inline uint ccl_group_id(uint d)
+{
+ switch(d) {
+ case 0: return blockIdx.x;
+ case 1: return blockIdx.y;
+ case 2: return blockIdx.z;
+ default: return 0;
+ }
+}
+
+ccl_device_inline uint ccl_num_groups(uint d)
+{
+ switch(d) {
+ case 0: return gridDim.x;
+ case 1: return gridDim.y;
+ case 2: return gridDim.z;
+ default: return 0;
+ }
+}
+
/* No assert supported for CUDA */
#define kernel_assert(cond)
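
Taken together with the CPU mapping in kernel_compat_cpu.h above, these CUDA definitions give the split kernel one indexing vocabulary: ccl_global_id(d) expands to blockIdx * blockDim + threadIdx on CUDA, to get_global_id(d) on OpenCL, and to a plain kg->global_id[d] lookup on the CPU, where the local size is fixed at 1. A minimal host-side sketch of that contract, using a hypothetical FakeKernelGlobals stand-in (the real struct carries far more state):

    #include <cstdio>

    /* Hypothetical stand-in for KernelGlobals; illustrative only. */
    struct FakeKernelGlobals {
      int global_id[2];
      int global_size[2];
    };

    /* Mirrors the CPU mapping: one work item per kernel invocation,
     * so the local size is 1 and the group id equals the global id. */
    #define ccl_local_id(d)    0
    #define ccl_local_size(d)  1
    #define ccl_global_id(d)   (kg->global_id[d])
    #define ccl_global_size(d) (kg->global_size[d])

    static void kernel_body(FakeKernelGlobals *kg)
    {
      /* The flattening the split kernel uses to index per-ray state. */
      int thread = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
      printf("thread %d\n", thread);
    }

    int main()
    {
      FakeKernelGlobals kg = {};
      kg.global_size[0] = 4;
      kg.global_size[1] = 2;
      /* The host driver walks the grid a GPU would cover in parallel. */
      for (int y = 0; y < kg.global_size[1]; y++) {
        for (int x = 0; x < kg.global_size[0]; x++) {
          kg.global_id[0] = x;
          kg.global_id[1] = y;
          kernel_body(&kg);
        }
      }
      return 0;
    }
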
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
index f076e3a7d37..6c963dea4f5 100644
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -39,6 +39,7 @@
#define ccl_constant __constant
#define ccl_global __global
#define ccl_local __local
+#define ccl_local_param __local
#define ccl_private __private
#define ccl_restrict restrict
#define ccl_align(n) __attribute__((aligned(n)))
@@ -49,6 +50,15 @@
# define ccl_addr_space
#endif
+#define ccl_local_id(d) get_local_id(d)
+#define ccl_global_id(d) get_global_id(d)
+
+#define ccl_local_size(d) get_local_size(d)
+#define ccl_global_size(d) get_global_size(d)
+
+#define ccl_group_id(d) get_group_id(d)
+#define ccl_num_groups(d) get_num_groups(d)
+
/* Selective nodes compilation. */
#ifndef __NODES_MAX_GROUP__
# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h
index 8c7c651a053..bc2d9604122 100644
--- a/intern/cycles/kernel/kernel_emission.h
+++ b/intern/cycles/kernel/kernel_emission.h
@@ -67,7 +67,7 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
ls->shader, ls->object, ls->prim,
ls->u, ls->v, t, time, false, ls->lamp);
- ls->Ng = ccl_fetch(emission_sd, Ng);
+ ls->Ng = emission_sd->Ng;
/* no path flag, we're evaluating this for all closures. that's weak but
* we'd have to do multiple evaluations otherwise */
@@ -76,7 +76,7 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
path_state_modify_bounce(state, false);
/* evaluate emissive closure */
- if(ccl_fetch(emission_sd, flag) & SD_EMISSION)
+ if(emission_sd->flag & SD_EMISSION)
eval = shader_emissive_eval(kg, emission_sd);
else
eval = make_float3(0.0f, 0.0f, 0.0f);
@@ -112,7 +112,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,
-ls->D,
dD,
ls->t,
- ccl_fetch(sd, time));
+ sd->time);
if(is_zero(light_eval))
return false;
@@ -120,7 +120,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,
/* evaluate BSDF at shading point */
#ifdef __VOLUME__
- if(ccl_fetch(sd, prim) != PRIM_NONE)
+ if(sd->prim != PRIM_NONE)
shader_bsdf_eval(kg, sd, ls->D, eval, ls->pdf, ls->shader & SHADER_USE_MIS);
else {
float bsdf_pdf;
@@ -168,8 +168,8 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,
if(ls->shader & SHADER_CAST_SHADOW) {
/* setup ray */
- bool transmit = (dot(ccl_fetch(sd, Ng), ls->D) < 0.0f);
- ray->P = ray_offset(ccl_fetch(sd, P), (transmit)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng));
+ bool transmit = (dot(sd->Ng, ls->D) < 0.0f);
+ ray->P = ray_offset(sd->P, (transmit)? -sd->Ng: sd->Ng);
if(ls->t == FLT_MAX) {
/* distant light */
@@ -182,7 +182,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,
ray->D = normalize_len(ray->D, &ray->t);
}
- ray->dP = ccl_fetch(sd, dP);
+ ray->dP = sd->dP;
ray->dD = differential3_zero();
}
else {
@@ -204,14 +204,14 @@ ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, Shader
float3 L = shader_emissive_eval(kg, sd);
#ifdef __HAIR__
- if(!(path_flag & PATH_RAY_MIS_SKIP) && (ccl_fetch(sd, flag) & SD_USE_MIS) && (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE))
+ if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) && (sd->type & PRIMITIVE_ALL_TRIANGLE))
#else
- if(!(path_flag & PATH_RAY_MIS_SKIP) && (ccl_fetch(sd, flag) & SD_USE_MIS))
+ if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS))
#endif
{
/* multiple importance sampling, get triangle light pdf,
* and compute weight with respect to BSDF pdf */
- float pdf = triangle_light_pdf(kg, ccl_fetch(sd, Ng), ccl_fetch(sd, I), t);
+ float pdf = triangle_light_pdf(kg, sd->Ng, sd->I, t);
float mis_weight = power_heuristic(bsdf_pdf, pdf);
return L*mis_weight;
diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h
index 2b52a2d2f48..1c3884890bf 100644
--- a/intern/cycles/kernel/kernel_globals.h
+++ b/intern/cycles/kernel/kernel_globals.h
@@ -16,6 +16,9 @@
/* Constant Globals */
+#ifndef __KERNEL_GLOBALS_H__
+#define __KERNEL_GLOBALS_H__
+
CCL_NAMESPACE_BEGIN
/* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in
@@ -64,6 +67,13 @@ typedef struct KernelGlobals {
/* Storage for decoupled volume steps. */
VolumeStep *decoupled_volume_steps[2];
int decoupled_volume_steps_index;
+
+ /* split kernel */
+ SplitData split_data;
+ SplitParams split_param_data;
+
+ int2 global_size;
+ int2 global_id;
} KernelGlobals;
#endif /* __KERNEL_CPU__ */
@@ -103,8 +113,8 @@ typedef ccl_addr_space struct KernelGlobals {
# include "kernel_textures.h"
# ifdef __SPLIT_KERNEL__
- ShaderData *sd_input;
- Intersection *isect_shadow;
+ SplitData split_data;
+ SplitParams split_param_data;
# endif
} KernelGlobals;
@@ -146,3 +156,4 @@ ccl_device float lookup_table_read_2D(KernelGlobals *kg, float x, float y, int o
CCL_NAMESPACE_END
+#endif /* __KERNEL_GLOBALS_H__ */
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index 7aec47e4957..ed523696571 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -19,16 +19,16 @@ CCL_NAMESPACE_BEGIN
ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, int sample, float value)
{
ccl_global float *buf = buffer;
-#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
+#if defined(__SPLIT_KERNEL__)
atomic_add_and_fetch_float(buf, value);
#else
*buf = (sample == 0)? value: *buf + value;
-#endif // __SPLIT_KERNEL__ && __WORK_STEALING__
+#endif /* __SPLIT_KERNEL__ */
}
ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, int sample, float3 value)
{
-#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
+#if defined(__SPLIT_KERNEL__)
ccl_global float *buf_x = buffer + 0;
ccl_global float *buf_y = buffer + 1;
ccl_global float *buf_z = buffer + 2;
@@ -39,12 +39,12 @@ ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, int sa
#else
ccl_global float3 *buf = (ccl_global float3*)buffer;
*buf = (sample == 0)? value: *buf + value;
-#endif // __SPLIT_KERNEL__ && __WORK_STEALING__
+#endif /* __SPLIT_KERNEL__ */
}
ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sample, float4 value)
{
-#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
+#if defined(__SPLIT_KERNEL__)
ccl_global float *buf_x = buffer + 0;
ccl_global float *buf_y = buffer + 1;
ccl_global float *buf_z = buffer + 2;
@@ -57,7 +57,7 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sa
#else
ccl_global float4 *buf = (ccl_global float4*)buffer;
*buf = (sample == 0)? value: *buf + value;
-#endif // __SPLIT_KERNEL__ && __WORK_STEALING__
+#endif /* __SPLIT_KERNEL__ */
}
ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L,
@@ -75,18 +75,18 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
return;
if(!(path_flag & PATH_RAY_SINGLE_PASS_DONE)) {
- if(!(ccl_fetch(sd, flag) & SD_TRANSPARENT) ||
+ if(!(sd->flag & SD_TRANSPARENT) ||
kernel_data.film.pass_alpha_threshold == 0.0f ||
average(shader_bsdf_alpha(kg, sd)) >= kernel_data.film.pass_alpha_threshold)
{
if(sample == 0) {
if(flag & PASS_DEPTH) {
- float depth = camera_distance(kg, ccl_fetch(sd, P));
+ float depth = camera_distance(kg, sd->P);
kernel_write_pass_float(buffer + kernel_data.film.pass_depth, sample, depth);
}
if(flag & PASS_OBJECT_ID) {
- float id = object_pass_id(kg, ccl_fetch(sd, object));
+ float id = object_pass_id(kg, sd->object);
kernel_write_pass_float(buffer + kernel_data.film.pass_object_id, sample, id);
}
if(flag & PASS_MATERIAL_ID) {
@@ -96,7 +96,7 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
}
if(flag & PASS_NORMAL) {
- float3 normal = ccl_fetch(sd, N);
+ float3 normal = sd->N;
kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, sample, normal);
}
if(flag & PASS_UV) {
@@ -127,7 +127,7 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
float mist_start = kernel_data.film.mist_start;
float mist_inv_depth = kernel_data.film.mist_inv_depth;
- float depth = camera_distance(kg, ccl_fetch(sd, P));
+ float depth = camera_distance(kg, sd->P);
float mist = saturate((depth - mist_start)*mist_inv_depth);
/* falloff */
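
With __WORK_STEALING__ gone as a separate toggle, every split-kernel pass write above funnels through atomic_add_and_fetch_float, since with work stealing several GPU threads can accumulate samples into the same pixel concurrently. A minimal sketch of the usual way such a float atomic add is built from a 32-bit compare-exchange loop when only integer atomics are available; this illustrates the technique, not Cycles' actual implementation:

    #include <atomic>
    #include <cstdint>
    #include <cstring>
    #include <cstdio>

    static float atomic_add_float(std::atomic<uint32_t> *addr, float value)
    {
      uint32_t old_bits = addr->load();
      for (;;) {
        float old_f;
        std::memcpy(&old_f, &old_bits, sizeof(float));
        float new_f = old_f + value;
        uint32_t new_bits;
        std::memcpy(&new_bits, &new_f, sizeof(float));
        /* Retries if another thread changed the value in between;
         * on failure old_bits is reloaded with the current contents. */
        if (addr->compare_exchange_weak(old_bits, new_bits))
          return new_f;
      }
    }

    int main()
    {
      std::atomic<uint32_t> buf(0);  /* bit pattern of 0.0f */
      atomic_add_float(&buf, 1.5f);
      atomic_add_float(&buf, 2.25f);
      uint32_t bits = buf.load();
      float result;
      std::memcpy(&result, &bits, sizeof(float));
      printf("%f\n", result);  /* prints 3.750000 */
      return 0;
    }
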
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index f90701a8260..95c27850513 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -75,17 +75,17 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
- if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
+ if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
Ray light_ray;
float3 ao_shadow;
- light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
+ light_ray.P = ray_offset(sd->P, sd->Ng);
light_ray.D = ao_D;
light_ray.t = kernel_data.background.ao_distance;
#ifdef __OBJECT_MOTION__
- light_ray.time = ccl_fetch(sd, time);
+ light_ray.time = sd->time;
#endif /* __OBJECT_MOTION__ */
- light_ray.dP = ccl_fetch(sd, dP);
+ light_ray.dP = sd->dP;
light_ray.dD = differential3_zero();
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
@@ -459,7 +459,7 @@ bool kernel_path_subsurface_scatter(
# ifdef __VOLUME__
ss_indirect->need_update_volume_stack =
kernel_data.integrator.use_volumes &&
- ccl_fetch(sd, object_flag) & SD_OBJECT_INTERSECTS_VOLUME;
+ sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME;
# endif /* __VOLUME__ */
/* compute lighting with the BSDF closure */
diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h
index ff2b828795d..d58960cae4e 100644
--- a/intern/cycles/kernel/kernel_path_branched.h
+++ b/intern/cycles/kernel/kernel_path_branched.h
@@ -42,17 +42,17 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
- if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
+ if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
Ray light_ray;
float3 ao_shadow;
- light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
+ light_ray.P = ray_offset(sd->P, sd->Ng);
light_ray.D = ao_D;
light_ray.t = kernel_data.background.ao_distance;
#ifdef __OBJECT_MOTION__
- light_ray.time = ccl_fetch(sd, time);
+ light_ray.time = sd->time;
#endif /* __OBJECT_MOTION__ */
- light_ray.dP = ccl_fetch(sd, dP);
+ light_ray.dP = sd->dP;
light_ray.dD = differential3_zero();
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow))
@@ -67,8 +67,8 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
RNG *rng, ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd,
float3 throughput, float num_samples_adjust, PathState *state, PathRadiance *L)
{
- for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
- const ShaderClosure *sc = &ccl_fetch(sd, closure)[i];
+ for(int i = 0; i < sd->num_closure; i++) {
+ const ShaderClosure *sc = &sd->closure[i];
if(!CLOSURE_IS_BSDF(sc->type))
continue;
@@ -140,8 +140,8 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
Ray *ray,
float3 throughput)
{
- for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
- ShaderClosure *sc = &ccl_fetch(sd, closure)[i];
+ for(int i = 0; i < sd->num_closure; i++) {
+ ShaderClosure *sc = &sd->closure[i];
if(!CLOSURE_IS_BSSRDF(sc->type))
continue;
@@ -169,7 +169,7 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
Ray volume_ray = *ray;
bool need_update_volume_stack =
kernel_data.integrator.use_volumes &&
- ccl_fetch(sd, object_flag) & SD_OBJECT_INTERSECTS_VOLUME;
+ sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME;
#endif /* __VOLUME__ */
/* compute lighting with the BSDF closure */
diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h
index fea503d06e5..34a78552c1d 100644
--- a/intern/cycles/kernel/kernel_path_surface.h
+++ b/intern/cycles/kernel/kernel_path_surface.h
@@ -25,7 +25,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
{
#ifdef __EMISSION__
/* sample illumination from lights to find path contribution */
- if(!(ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL))
+ if(!(sd->flag & SD_BSDF_HAS_EVAL))
return;
Ray light_ray;
@@ -33,7 +33,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
bool is_lamp;
# ifdef __OBJECT_MOTION__
- light_ray.time = ccl_fetch(sd, time);
+ light_ray.time = sd->time;
# endif
if(sample_all_lights) {
@@ -52,7 +52,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
float terminate = path_branched_rng_light_termination(kg, &lamp_rng, state, j, num_samples);
LightSample ls;
- if(lamp_light_sample(kg, i, light_u, light_v, ccl_fetch(sd, P), &ls)) {
+ if(lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls)) {
/* The sampling probability returned by lamp_light_sample assumes that all lights were sampled.
* However, this code only samples lamps, so if the scene also had mesh lights, the real probability is twice as high. */
if(kernel_data.integrator.pdf_triangles != 0.0f)
@@ -87,7 +87,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
light_t = 0.5f*light_t;
LightSample ls;
- if(light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls)) {
+ if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
/* Same as above, probability needs to be corrected since the sampling was forced to select a mesh light. */
if(kernel_data.integrator.num_all_lights)
ls.pdf *= 2.0f;
@@ -113,7 +113,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
float terminate = path_state_rng_light_termination(kg, rng, state);
LightSample ls;
- if(light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls)) {
+ if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
/* sample random light */
if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
/* trace shadow ray */
@@ -156,15 +156,15 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng,
path_state_next(kg, state, label);
/* setup ray */
- ray->P = ray_offset(ccl_fetch(sd, P), (label & LABEL_TRANSMIT)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng));
+ ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng);
ray->D = normalize(bsdf_omega_in);
ray->t = FLT_MAX;
#ifdef __RAY_DIFFERENTIALS__
- ray->dP = ccl_fetch(sd, dP);
+ ray->dP = sd->dP;
ray->dD = bsdf_domega_in;
#endif
#ifdef __OBJECT_MOTION__
- ray->time = ccl_fetch(sd, time);
+ ray->time = sd->time;
#endif
#ifdef __VOLUME__
@@ -195,7 +195,7 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ccl_
PathRadiance *L)
{
#ifdef __EMISSION__
- if(!(kernel_data.integrator.use_direct_light && (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL)))
+ if(!(kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL)))
return;
/* sample illumination from lights to find path contribution */
@@ -208,11 +208,11 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ccl_
bool is_lamp;
#ifdef __OBJECT_MOTION__
- light_ray.time = ccl_fetch(sd, time);
+ light_ray.time = sd->time;
#endif
LightSample ls;
- if(light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls)) {
+ if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
float terminate = path_state_rng_light_termination(kg, rng, state);
if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
/* trace shadow ray */
@@ -238,7 +238,7 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg,
ccl_addr_space Ray *ray)
{
/* no BSDF? we can stop here */
- if(ccl_fetch(sd, flag) & SD_BSDF) {
+ if(sd->flag & SD_BSDF) {
/* sample BSDF */
float bsdf_pdf;
BsdfEval bsdf_eval;
@@ -270,16 +270,16 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg,
path_state_next(kg, state, label);
/* setup ray */
- ray->P = ray_offset(ccl_fetch(sd, P), (label & LABEL_TRANSMIT)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng));
+ ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng);
ray->D = normalize(bsdf_omega_in);
if(state->bounce == 0)
- ray->t -= ccl_fetch(sd, ray_length); /* clipping works through transparent */
+ ray->t -= sd->ray_length; /* clipping works through transparent */
else
ray->t = FLT_MAX;
#ifdef __RAY_DIFFERENTIALS__
- ray->dP = ccl_fetch(sd, dP);
+ ray->dP = sd->dP;
ray->dD = bsdf_domega_in;
#endif
@@ -291,21 +291,21 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg,
return true;
}
#ifdef __VOLUME__
- else if(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME) {
+ else if(sd->flag & SD_HAS_ONLY_VOLUME) {
/* no surface shader but have a volume shader? act transparent */
/* update path state, count as transparent */
path_state_next(kg, state, LABEL_TRANSPARENT);
if(state->bounce == 0)
- ray->t -= ccl_fetch(sd, ray_length); /* clipping works through transparent */
+ ray->t -= sd->ray_length; /* clipping works through transparent */
else
ray->t = FLT_MAX;
/* setup ray position, direction stays unchanged */
- ray->P = ray_offset(ccl_fetch(sd, P), -ccl_fetch(sd, Ng));
+ ray->P = ray_offset(sd->P, -sd->Ng);
#ifdef __RAY_DIFFERENTIALS__
- ray->dP = ccl_fetch(sd, dP);
+ ray->dP = sd->dP;
#endif
/* enter/exit volume */
diff --git a/intern/cycles/kernel/kernel_queues.h b/intern/cycles/kernel/kernel_queues.h
index cf5614b8a86..2e63909a38c 100644
--- a/intern/cycles/kernel/kernel_queues.h
+++ b/intern/cycles/kernel/kernel_queues.h
@@ -17,6 +17,8 @@
#ifndef __KERNEL_QUEUE_H__
#define __KERNEL_QUEUE_H__
+CCL_NAMESPACE_BEGIN
+
/*
* Queue utility functions for split kernel
*/
@@ -35,7 +37,8 @@ ccl_device void enqueue_ray_index(
ccl_global int *queue_index) /* Array of size num_queues; Used for atomic increment. */
{
/* This thread's queue index. */
- int my_queue_index = atomic_inc(&queue_index[queue_number]) + (queue_number * queue_size);
+ int my_queue_index = atomic_fetch_and_inc_uint32((ccl_global uint*)&queue_index[queue_number])
+ + (queue_number * queue_size);
queues[my_queue_index] = ray_index;
}
@@ -47,6 +50,7 @@ ccl_device void enqueue_ray_index(
* is no more ray to allocate to other threads.
*/
ccl_device int get_ray_index(
+ KernelGlobals *kg,
int thread_index, /* Global thread index. */
int queue_number, /* Queue to operate on. */
ccl_global int *queues, /* Buffer of all queues. */
@@ -68,24 +72,25 @@ ccl_device void enqueue_ray_index_local(
int queue_number, /* Queue in which to enqueue ray index. */
char enqueue_flag, /* True for threads whose ray index has to be enqueued. */
int queuesize, /* queue size. */
- ccl_local unsigned int *local_queue_atomics, /* To to local queue atomics. */
+ ccl_local_param unsigned int *local_queue_atomics, /* To do local queue atomics. */
ccl_global int *Queue_data, /* Queues. */
ccl_global int *Queue_index) /* To do global queue atomics. */
{
- int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0);
+ int lidx = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0);
/* Get local queue id. */
unsigned int lqidx;
if(enqueue_flag) {
- lqidx = atomic_inc(local_queue_atomics);
+ lqidx = atomic_fetch_and_inc_uint32(local_queue_atomics);
}
- barrier(CLK_LOCAL_MEM_FENCE);
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
/* Get global queue offset. */
if(lidx == 0) {
- *local_queue_atomics = atomic_add(&Queue_index[queue_number], *local_queue_atomics);
+ *local_queue_atomics = atomic_fetch_and_add_uint32((ccl_global uint*)&Queue_index[queue_number],
+ *local_queue_atomics);
}
- barrier(CLK_LOCAL_MEM_FENCE);
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
/* Get global queue index and enqueue ray. */
if(enqueue_flag) {
@@ -96,19 +101,19 @@ ccl_device void enqueue_ray_index_local(
ccl_device unsigned int get_local_queue_index(
int queue_number, /* Queue in which to enqueue the ray; -1 if no queue */
- ccl_local unsigned int *local_queue_atomics)
+ ccl_local_param unsigned int *local_queue_atomics)
{
- int my_lqidx = atomic_inc(&local_queue_atomics[queue_number]);
+ int my_lqidx = atomic_fetch_and_inc_uint32(&local_queue_atomics[queue_number]);
return my_lqidx;
}
ccl_device unsigned int get_global_per_queue_offset(
int queue_number,
- ccl_local unsigned int *local_queue_atomics,
+ ccl_local_param unsigned int *local_queue_atomics,
ccl_global int* global_queue_atomics)
{
- unsigned int queue_offset = atomic_add(&global_queue_atomics[queue_number],
- local_queue_atomics[queue_number]);
+ unsigned int queue_offset = atomic_fetch_and_add_uint32((ccl_global uint*)&global_queue_atomics[queue_number],
+ local_queue_atomics[queue_number]);
return queue_offset;
}
@@ -116,10 +121,12 @@ ccl_device unsigned int get_global_queue_index(
int queue_number,
int queuesize,
unsigned int lqidx,
- ccl_local unsigned int * global_per_queue_offset)
+ ccl_local_param unsigned int * global_per_queue_offset)
{
int my_gqidx = queuesize * queue_number + lqidx + global_per_queue_offset[queue_number];
return my_gqidx;
}
+CCL_NAMESPACE_END
+
#endif // __KERNEL_QUEUE_H__
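
The rewritten enqueue_ray_index_local above amortizes global contention in two levels: each flagged thread first takes a slot from a workgroup-local counter, one thread then reserves a contiguous block in the global queue with a single atomic_fetch_and_add_uint32, and after a barrier every flagged thread writes its ray index at the group offset plus its local slot. A serial sketch of those three phases, with the barriers replaced by phase boundaries and all names illustrative:

    #include <atomic>
    #include <vector>
    #include <cstdio>

    int main()
    {
      const int group_size = 8;
      std::vector<int> ray_index = {3, 7, 11, 12, 20, 21, 30, 31};
      std::vector<char> enqueue_flag = {1, 0, 1, 1, 0, 1, 0, 1};

      std::atomic<unsigned> global_queue_index(16);  /* queue holds 16 already */
      std::vector<int> queue(64, -1);

      /* Phase 1: the local atomic hands each flagged thread a group slot. */
      unsigned local_count = 0;
      std::vector<unsigned> local_slot(group_size, 0);
      for (int t = 0; t < group_size; t++)
        if (enqueue_flag[t])
          local_slot[t] = local_count++;

      /* (barrier) Phase 2: one global atomic reserves the whole block. */
      unsigned group_offset = global_queue_index.fetch_add(local_count);

      /* (barrier) Phase 3: each flagged thread writes its reserved slot. */
      for (int t = 0; t < group_size; t++)
        if (enqueue_flag[t])
          queue[group_offset + local_slot[t]] = ray_index[t];

      for (unsigned i = group_offset; i < group_offset + local_count; i++)
        printf("queue[%u] = %d\n", i, queue[i]);
      return 0;
    }

This trades one global atomic per thread for one per workgroup, which is the point of the local_queue_atomics parameter.
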
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index 59c1331a63c..a2ab96b35e2 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -38,13 +38,13 @@ CCL_NAMESPACE_BEGIN
#ifdef __OBJECT_MOTION__
ccl_device void shader_setup_object_transforms(KernelGlobals *kg, ShaderData *sd, float time)
{
- if(ccl_fetch(sd, object_flag) & SD_OBJECT_MOTION) {
- ccl_fetch(sd, ob_tfm) = object_fetch_transform_motion(kg, ccl_fetch(sd, object), time);
- ccl_fetch(sd, ob_itfm) = transform_quick_inverse(ccl_fetch(sd, ob_tfm));
+ if(sd->object_flag & SD_OBJECT_MOTION) {
+ sd->ob_tfm = object_fetch_transform_motion(kg, sd->object, time);
+ sd->ob_itfm = transform_quick_inverse(sd->ob_tfm);
}
else {
- ccl_fetch(sd, ob_tfm) = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
- ccl_fetch(sd, ob_itfm) = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
+ sd->ob_tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+ sd->ob_itfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
}
}
#endif
@@ -55,55 +55,55 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg,
const Ray *ray)
{
#ifdef __INSTANCING__
- ccl_fetch(sd, object) = (isect->object == PRIM_NONE)? kernel_tex_fetch(__prim_object, isect->prim): isect->object;
+ sd->object = (isect->object == PRIM_NONE)? kernel_tex_fetch(__prim_object, isect->prim): isect->object;
#endif
- ccl_fetch(sd, type) = isect->type;
- ccl_fetch(sd, flag) = 0;
- ccl_fetch(sd, object_flag) = kernel_tex_fetch(__object_flag,
- ccl_fetch(sd, object));
+ sd->type = isect->type;
+ sd->flag = 0;
+ sd->object_flag = kernel_tex_fetch(__object_flag,
+ sd->object);
/* matrices and time */
#ifdef __OBJECT_MOTION__
shader_setup_object_transforms(kg, sd, ray->time);
- ccl_fetch(sd, time) = ray->time;
+ sd->time = ray->time;
#endif
- ccl_fetch(sd, prim) = kernel_tex_fetch(__prim_index, isect->prim);
- ccl_fetch(sd, ray_length) = isect->t;
+ sd->prim = kernel_tex_fetch(__prim_index, isect->prim);
+ sd->ray_length = isect->t;
#ifdef __UV__
- ccl_fetch(sd, u) = isect->u;
- ccl_fetch(sd, v) = isect->v;
+ sd->u = isect->u;
+ sd->v = isect->v;
#endif
#ifdef __HAIR__
- if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
+ if(sd->type & PRIMITIVE_ALL_CURVE) {
/* curve */
- float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
+ float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
- ccl_fetch(sd, shader) = __float_as_int(curvedata.z);
- ccl_fetch(sd, P) = bvh_curve_refine(kg, sd, isect, ray);
+ sd->shader = __float_as_int(curvedata.z);
+ sd->P = bvh_curve_refine(kg, sd, isect, ray);
}
else
#endif
- if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) {
+ if(sd->type & PRIMITIVE_TRIANGLE) {
/* static triangle */
float3 Ng = triangle_normal(kg, sd);
- ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim));
+ sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
/* vectors */
- ccl_fetch(sd, P) = triangle_refine(kg, sd, isect, ray);
- ccl_fetch(sd, Ng) = Ng;
- ccl_fetch(sd, N) = Ng;
+ sd->P = triangle_refine(kg, sd, isect, ray);
+ sd->Ng = Ng;
+ sd->N = Ng;
/* smooth normal */
- if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL)
- ccl_fetch(sd, N) = triangle_smooth_normal(kg, ccl_fetch(sd, prim), ccl_fetch(sd, u), ccl_fetch(sd, v));
+ if(sd->shader & SHADER_SMOOTH_NORMAL)
+ sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
#ifdef __DPDU__
/* dPdu/dPdv */
- triangle_dPdudv(kg, ccl_fetch(sd, prim), &ccl_fetch(sd, dPdu), &ccl_fetch(sd, dPdv));
+ triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
#endif
}
else {
@@ -111,40 +111,40 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg,
motion_triangle_shader_setup(kg, sd, isect, ray, false);
}
- ccl_fetch(sd, I) = -ray->D;
+ sd->I = -ray->D;
- ccl_fetch(sd, flag) |= kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE);
+ sd->flag |= kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE);
#ifdef __INSTANCING__
if(isect->object != OBJECT_NONE) {
/* instance transform */
- object_normal_transform_auto(kg, sd, &ccl_fetch(sd, N));
- object_normal_transform_auto(kg, sd, &ccl_fetch(sd, Ng));
+ object_normal_transform_auto(kg, sd, &sd->N);
+ object_normal_transform_auto(kg, sd, &sd->Ng);
# ifdef __DPDU__
- object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdu));
- object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdv));
+ object_dir_transform_auto(kg, sd, &sd->dPdu);
+ object_dir_transform_auto(kg, sd, &sd->dPdv);
# endif
}
#endif
/* backfacing test */
- bool backfacing = (dot(ccl_fetch(sd, Ng), ccl_fetch(sd, I)) < 0.0f);
+ bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
if(backfacing) {
- ccl_fetch(sd, flag) |= SD_BACKFACING;
- ccl_fetch(sd, Ng) = -ccl_fetch(sd, Ng);
- ccl_fetch(sd, N) = -ccl_fetch(sd, N);
+ sd->flag |= SD_BACKFACING;
+ sd->Ng = -sd->Ng;
+ sd->N = -sd->N;
#ifdef __DPDU__
- ccl_fetch(sd, dPdu) = -ccl_fetch(sd, dPdu);
- ccl_fetch(sd, dPdv) = -ccl_fetch(sd, dPdv);
+ sd->dPdu = -sd->dPdu;
+ sd->dPdv = -sd->dPdv;
#endif
}
#ifdef __RAY_DIFFERENTIALS__
/* differentials */
- differential_transfer(&ccl_fetch(sd, dP), ray->dP, ray->D, ray->dD, ccl_fetch(sd, Ng), isect->t);
- differential_incoming(&ccl_fetch(sd, dI), ray->dD);
- differential_dudv(&ccl_fetch(sd, du), &ccl_fetch(sd, dv), ccl_fetch(sd, dPdu), ccl_fetch(sd, dPdv), ccl_fetch(sd, dP), ccl_fetch(sd, Ng));
+ differential_transfer(&sd->dP, ray->dP, ray->D, ray->dD, sd->Ng, isect->t);
+ differential_incoming(&sd->dI, ray->dD);
+ differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng);
#endif
}
@@ -249,106 +249,106 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
int lamp)
{
/* vectors */
- ccl_fetch(sd, P) = P;
- ccl_fetch(sd, N) = Ng;
- ccl_fetch(sd, Ng) = Ng;
- ccl_fetch(sd, I) = I;
- ccl_fetch(sd, shader) = shader;
+ sd->P = P;
+ sd->N = Ng;
+ sd->Ng = Ng;
+ sd->I = I;
+ sd->shader = shader;
if(prim != PRIM_NONE)
- ccl_fetch(sd, type) = PRIMITIVE_TRIANGLE;
+ sd->type = PRIMITIVE_TRIANGLE;
else if(lamp != LAMP_NONE)
- ccl_fetch(sd, type) = PRIMITIVE_LAMP;
+ sd->type = PRIMITIVE_LAMP;
else
- ccl_fetch(sd, type) = PRIMITIVE_NONE;
+ sd->type = PRIMITIVE_NONE;
/* primitive */
#ifdef __INSTANCING__
- ccl_fetch(sd, object) = object;
+ sd->object = object;
#endif
/* currently no access to bvh prim index for strand sd->prim */
- ccl_fetch(sd, prim) = prim;
+ sd->prim = prim;
#ifdef __UV__
- ccl_fetch(sd, u) = u;
- ccl_fetch(sd, v) = v;
+ sd->u = u;
+ sd->v = v;
#endif
- ccl_fetch(sd, ray_length) = t;
+ sd->ray_length = t;
- ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE);
- ccl_fetch(sd, object_flag) = 0;
- if(ccl_fetch(sd, object) != OBJECT_NONE) {
- ccl_fetch(sd, object_flag) |= kernel_tex_fetch(__object_flag,
- ccl_fetch(sd, object));
+ sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE);
+ sd->object_flag = 0;
+ if(sd->object != OBJECT_NONE) {
+ sd->object_flag |= kernel_tex_fetch(__object_flag,
+ sd->object);
#ifdef __OBJECT_MOTION__
shader_setup_object_transforms(kg, sd, time);
- ccl_fetch(sd, time) = time;
+ sd->time = time;
}
else if(lamp != LAMP_NONE) {
- ccl_fetch(sd, ob_tfm) = lamp_fetch_transform(kg, lamp, false);
- ccl_fetch(sd, ob_itfm) = lamp_fetch_transform(kg, lamp, true);
+ sd->ob_tfm = lamp_fetch_transform(kg, lamp, false);
+ sd->ob_itfm = lamp_fetch_transform(kg, lamp, true);
#endif
}
/* transform into world space */
if(object_space) {
- object_position_transform_auto(kg, sd, &ccl_fetch(sd, P));
- object_normal_transform_auto(kg, sd, &ccl_fetch(sd, Ng));
- ccl_fetch(sd, N) = ccl_fetch(sd, Ng);
- object_dir_transform_auto(kg, sd, &ccl_fetch(sd, I));
+ object_position_transform_auto(kg, sd, &sd->P);
+ object_normal_transform_auto(kg, sd, &sd->Ng);
+ sd->N = sd->Ng;
+ object_dir_transform_auto(kg, sd, &sd->I);
}
- if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) {
+ if(sd->type & PRIMITIVE_TRIANGLE) {
/* smooth normal */
- if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) {
- ccl_fetch(sd, N) = triangle_smooth_normal(kg, ccl_fetch(sd, prim), ccl_fetch(sd, u), ccl_fetch(sd, v));
+ if(sd->shader & SHADER_SMOOTH_NORMAL) {
+ sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
#ifdef __INSTANCING__
- if(!(ccl_fetch(sd, object_flag) & SD_OBJECT_TRANSFORM_APPLIED)) {
- object_normal_transform_auto(kg, sd, &ccl_fetch(sd, N));
+ if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+ object_normal_transform_auto(kg, sd, &sd->N);
}
#endif
}
/* dPdu/dPdv */
#ifdef __DPDU__
- triangle_dPdudv(kg, ccl_fetch(sd, prim), &ccl_fetch(sd, dPdu), &ccl_fetch(sd, dPdv));
+ triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
# ifdef __INSTANCING__
- if(!(ccl_fetch(sd, object_flag) & SD_OBJECT_TRANSFORM_APPLIED)) {
- object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdu));
- object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdv));
+ if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+ object_dir_transform_auto(kg, sd, &sd->dPdu);
+ object_dir_transform_auto(kg, sd, &sd->dPdv);
}
# endif
#endif
}
else {
#ifdef __DPDU__
- ccl_fetch(sd, dPdu) = make_float3(0.0f, 0.0f, 0.0f);
- ccl_fetch(sd, dPdv) = make_float3(0.0f, 0.0f, 0.0f);
+ sd->dPdu = make_float3(0.0f, 0.0f, 0.0f);
+ sd->dPdv = make_float3(0.0f, 0.0f, 0.0f);
#endif
}
/* backfacing test */
- if(ccl_fetch(sd, prim) != PRIM_NONE) {
- bool backfacing = (dot(ccl_fetch(sd, Ng), ccl_fetch(sd, I)) < 0.0f);
+ if(sd->prim != PRIM_NONE) {
+ bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
if(backfacing) {
- ccl_fetch(sd, flag) |= SD_BACKFACING;
- ccl_fetch(sd, Ng) = -ccl_fetch(sd, Ng);
- ccl_fetch(sd, N) = -ccl_fetch(sd, N);
+ sd->flag |= SD_BACKFACING;
+ sd->Ng = -sd->Ng;
+ sd->N = -sd->N;
#ifdef __DPDU__
- ccl_fetch(sd, dPdu) = -ccl_fetch(sd, dPdu);
- ccl_fetch(sd, dPdv) = -ccl_fetch(sd, dPdv);
+ sd->dPdu = -sd->dPdu;
+ sd->dPdv = -sd->dPdv;
#endif
}
}
#ifdef __RAY_DIFFERENTIALS__
/* no ray differentials here yet */
- ccl_fetch(sd, dP) = differential3_zero();
- ccl_fetch(sd, dI) = differential3_zero();
- ccl_fetch(sd, du) = differential_zero();
- ccl_fetch(sd, dv) = differential_zero();
+ sd->dP = differential3_zero();
+ sd->dI = differential3_zero();
+ sd->du = differential_zero();
+ sd->dv = differential_zero();
#endif
}
@@ -378,39 +378,39 @@ ccl_device void shader_setup_from_displace(KernelGlobals *kg, ShaderData *sd,
ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderData *sd, const Ray *ray)
{
/* vectors */
- ccl_fetch(sd, P) = ray->D;
- ccl_fetch(sd, N) = -ray->D;
- ccl_fetch(sd, Ng) = -ray->D;
- ccl_fetch(sd, I) = -ray->D;
- ccl_fetch(sd, shader) = kernel_data.background.surface_shader;
- ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE);
- ccl_fetch(sd, object_flag) = 0;
+ sd->P = ray->D;
+ sd->N = -ray->D;
+ sd->Ng = -ray->D;
+ sd->I = -ray->D;
+ sd->shader = kernel_data.background.surface_shader;
+ sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE);
+ sd->object_flag = 0;
#ifdef __OBJECT_MOTION__
- ccl_fetch(sd, time) = ray->time;
+ sd->time = ray->time;
#endif
- ccl_fetch(sd, ray_length) = 0.0f;
+ sd->ray_length = 0.0f;
#ifdef __INSTANCING__
- ccl_fetch(sd, object) = PRIM_NONE;
+ sd->object = PRIM_NONE;
#endif
- ccl_fetch(sd, prim) = PRIM_NONE;
+ sd->prim = PRIM_NONE;
#ifdef __UV__
- ccl_fetch(sd, u) = 0.0f;
- ccl_fetch(sd, v) = 0.0f;
+ sd->u = 0.0f;
+ sd->v = 0.0f;
#endif
#ifdef __DPDU__
/* dPdu/dPdv */
- ccl_fetch(sd, dPdu) = make_float3(0.0f, 0.0f, 0.0f);
- ccl_fetch(sd, dPdv) = make_float3(0.0f, 0.0f, 0.0f);
+ sd->dPdu = make_float3(0.0f, 0.0f, 0.0f);
+ sd->dPdv = make_float3(0.0f, 0.0f, 0.0f);
#endif
#ifdef __RAY_DIFFERENTIALS__
/* differentials */
- ccl_fetch(sd, dP) = ray->dD;
- differential_incoming(&ccl_fetch(sd, dI), ccl_fetch(sd, dP));
- ccl_fetch(sd, du) = differential_zero();
- ccl_fetch(sd, dv) = differential_zero();
+ sd->dP = ray->dD;
+ differential_incoming(&sd->dI, sd->dP);
+ sd->du = differential_zero();
+ sd->dv = differential_zero();
#endif
}
@@ -505,11 +505,11 @@ ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg, ShaderData *sd
{
/* this is the veach one-sample model with balance heuristic, some pdf
* factors drop out when using balance heuristic weighting */
- for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+ for(int i = 0; i < sd->num_closure; i++) {
if(i == skip_bsdf)
continue;
- const ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+ const ShaderClosure *sc = &sd->closure[i];
if(CLOSURE_IS_BSDF(sc->type)) {
float bsdf_pdf = 0.0f;
@@ -535,8 +535,8 @@ ccl_device_inline void _shader_bsdf_multi_eval_branched(KernelGlobals *kg,
float light_pdf,
bool use_mis)
{
- for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
- const ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+ for(int i = 0; i < sd->num_closure; i++) {
+ const ShaderClosure *sc = &sd->closure[i];
if(CLOSURE_IS_BSDF(sc->type)) {
float bsdf_pdf = 0.0f;
float3 eval = bsdf_eval(kg, sd, sc, omega_in, &bsdf_pdf);
@@ -591,22 +591,22 @@ ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg,
{
int sampled = 0;
- if(ccl_fetch(sd, num_closure) > 1) {
+ if(sd->num_closure > 1) {
/* pick a BSDF closure based on sample weights */
float sum = 0.0f;
- for(sampled = 0; sampled < ccl_fetch(sd, num_closure); sampled++) {
- const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled);
+ for(sampled = 0; sampled < sd->num_closure; sampled++) {
+ const ShaderClosure *sc = &sd->closure[sampled];
if(CLOSURE_IS_BSDF(sc->type))
sum += sc->sample_weight;
}
- float r = ccl_fetch(sd, randb_closure)*sum;
+ float r = sd->randb_closure*sum;
sum = 0.0f;
- for(sampled = 0; sampled < ccl_fetch(sd, num_closure); sampled++) {
- const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled);
+ for(sampled = 0; sampled < sd->num_closure; sampled++) {
+ const ShaderClosure *sc = &sd->closure[sampled];
if(CLOSURE_IS_BSDF(sc->type)) {
sum += sc->sample_weight;
@@ -616,13 +616,13 @@ ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg,
}
}
- if(sampled == ccl_fetch(sd, num_closure)) {
+ if(sampled == sd->num_closure) {
*pdf = 0.0f;
return LABEL_NONE;
}
}
- const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled);
+ const ShaderClosure *sc = &sd->closure[sampled];
int label;
float3 eval;
@@ -633,7 +633,7 @@ ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg,
if(*pdf != 0.0f) {
bsdf_eval_init(bsdf_eval, sc->type, eval*sc->weight, kernel_data.film.use_light_pass);
- if(ccl_fetch(sd, num_closure) > 1) {
+ if(sd->num_closure > 1) {
float sweight = sc->sample_weight;
_shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sampled, bsdf_eval, *pdf*sweight, sweight);
}
@@ -660,8 +660,8 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg, ShaderData *sd,
ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughness)
{
- for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
- ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+ for(int i = 0; i < sd->num_closure; i++) {
+ ShaderClosure *sc = &sd->closure[i];
if(CLOSURE_IS_BSDF(sc->type))
bsdf_blur(kg, sc, roughness);
@@ -670,13 +670,13 @@ ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughn
ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd)
{
- if(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME)
+ if(sd->flag & SD_HAS_ONLY_VOLUME)
return make_float3(1.0f, 1.0f, 1.0f);
float3 eval = make_float3(0.0f, 0.0f, 0.0f);
- for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
- ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+ for(int i = 0; i < sd->num_closure; i++) {
+ ShaderClosure *sc = &sd->closure[i];
if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) // todo: make this work for osl
eval += sc->weight;
@@ -687,8 +687,8 @@ ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd)
ccl_device void shader_bsdf_disable_transparency(KernelGlobals *kg, ShaderData *sd)
{
- for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
- ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+ for(int i = 0; i < sd->num_closure; i++) {
+ ShaderClosure *sc = &sd->closure[i];
if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) {
sc->sample_weight = 0.0f;
@@ -711,8 +711,8 @@ ccl_device float3 shader_bsdf_diffuse(KernelGlobals *kg, ShaderData *sd)
{
float3 eval = make_float3(0.0f, 0.0f, 0.0f);
- for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
- ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+ for(int i = 0; i < sd->num_closure; i++) {
+ ShaderClosure *sc = &sd->closure[i];
if(CLOSURE_IS_BSDF_DIFFUSE(sc->type))
eval += sc->weight;
@@ -725,8 +725,8 @@ ccl_device float3 shader_bsdf_glossy(KernelGlobals *kg, ShaderData *sd)
{
float3 eval = make_float3(0.0f, 0.0f, 0.0f);
- for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
- ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+ for(int i = 0; i < sd->num_closure; i++) {
+ ShaderClosure *sc = &sd->closure[i];
if(CLOSURE_IS_BSDF_GLOSSY(sc->type))
eval += sc->weight;
@@ -739,8 +739,8 @@ ccl_device float3 shader_bsdf_transmission(KernelGlobals *kg, ShaderData *sd)
{
float3 eval = make_float3(0.0f, 0.0f, 0.0f);
- for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
- ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+ for(int i = 0; i < sd->num_closure; i++) {
+ ShaderClosure *sc = &sd->closure[i];
if(CLOSURE_IS_BSDF_TRANSMISSION(sc->type))
eval += sc->weight;
@@ -753,8 +753,8 @@ ccl_device float3 shader_bsdf_subsurface(KernelGlobals *kg, ShaderData *sd)
{
float3 eval = make_float3(0.0f, 0.0f, 0.0f);
- for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
- ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+ for(int i = 0; i < sd->num_closure; i++) {
+ ShaderClosure *sc = &sd->closure[i];
if(CLOSURE_IS_BSSRDF(sc->type) || CLOSURE_IS_BSDF_BSSRDF(sc->type))
eval += sc->weight;
@@ -768,8 +768,8 @@ ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_fac
float3 eval = make_float3(0.0f, 0.0f, 0.0f);
float3 N = make_float3(0.0f, 0.0f, 0.0f);
- for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
- ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+ for(int i = 0; i < sd->num_closure; i++) {
+ ShaderClosure *sc = &sd->closure[i];
if(CLOSURE_IS_BSDF_DIFFUSE(sc->type)) {
const DiffuseBsdf *bsdf = (const DiffuseBsdf*)sc;
@@ -778,12 +778,12 @@ ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_fac
}
else if(CLOSURE_IS_AMBIENT_OCCLUSION(sc->type)) {
eval += sc->weight;
- N += ccl_fetch(sd, N)*average(sc->weight);
+ N += sd->N*average(sc->weight);
}
}
if(is_zero(N))
- N = ccl_fetch(sd, N);
+ N = sd->N;
else
N = normalize(N);
@@ -798,8 +798,8 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b
float3 N = make_float3(0.0f, 0.0f, 0.0f);
float texture_blur = 0.0f, weight_sum = 0.0f;
- for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
- ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+ for(int i = 0; i < sd->num_closure; i++) {
+ ShaderClosure *sc = &sd->closure[i];
if(CLOSURE_IS_BSSRDF(sc->type)) {
const Bssrdf *bssrdf = (const Bssrdf*)sc;
@@ -813,7 +813,7 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b
}
if(N_)
- *N_ = (is_zero(N))? ccl_fetch(sd, N): normalize(N);
+ *N_ = (is_zero(N))? sd->N: normalize(N);
if(texture_blur_)
*texture_blur_ = texture_blur/weight_sum;
@@ -826,7 +826,7 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b
ccl_device float3 emissive_eval(KernelGlobals *kg, ShaderData *sd, ShaderClosure *sc)
{
- return emissive_simple_eval(ccl_fetch(sd, Ng), ccl_fetch(sd, I));
+ return emissive_simple_eval(sd->Ng, sd->I);
}
ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd)
@@ -834,8 +834,8 @@ ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd)
float3 eval;
eval = make_float3(0.0f, 0.0f, 0.0f);
- for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
- ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+ for(int i = 0; i < sd->num_closure; i++) {
+ ShaderClosure *sc = &sd->closure[i];
if(CLOSURE_IS_EMISSION(sc->type))
eval += emissive_eval(kg, sd, sc)*sc->weight;
@@ -850,8 +850,8 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd)
{
float3 weight = make_float3(0.0f, 0.0f, 0.0f);
- for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
- ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+ for(int i = 0; i < sd->num_closure; i++) {
+ ShaderClosure *sc = &sd->closure[i];
if(CLOSURE_IS_HOLDOUT(sc->type))
weight += sc->weight;
@@ -865,9 +865,9 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd)
ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, ccl_addr_space RNG *rng,
ccl_addr_space PathState *state, float randb, int path_flag, ShaderContext ctx)
{
- ccl_fetch(sd, num_closure) = 0;
- ccl_fetch(sd, num_closure_extra) = 0;
- ccl_fetch(sd, randb_closure) = randb;
+ sd->num_closure = 0;
+ sd->num_closure_extra = 0;
+ sd->randb_closure = randb;
#ifdef __OSL__
if(kg->osl)
@@ -881,13 +881,13 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, ccl_addr_
DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd,
sizeof(DiffuseBsdf),
make_float3(0.8f, 0.8f, 0.8f));
- bsdf->N = ccl_fetch(sd, N);
- ccl_fetch(sd, flag) |= bsdf_diffuse_setup(bsdf);
+ bsdf->N = sd->N;
+ sd->flag |= bsdf_diffuse_setup(bsdf);
#endif
}
- if(rng && (ccl_fetch(sd, flag) & SD_BSDF_NEEDS_LCG)) {
- ccl_fetch(sd, lcg_state) = lcg_state_init_addrspace(rng, state, 0xb4bc3953);
+ if(rng && (sd->flag & SD_BSDF_NEEDS_LCG)) {
+ sd->lcg_state = lcg_state_init_addrspace(rng, state, 0xb4bc3953);
}
}
@@ -896,9 +896,9 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, ccl_addr_
ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd,
ccl_addr_space PathState *state, int path_flag, ShaderContext ctx)
{
- ccl_fetch(sd, num_closure) = 0;
- ccl_fetch(sd, num_closure_extra) = 0;
- ccl_fetch(sd, randb_closure) = 0.0f;
+ sd->num_closure = 0;
+ sd->num_closure_extra = 0;
+ sd->randb_closure = 0.0f;
#ifdef __SVM__
#ifdef __OSL__
@@ -913,8 +913,8 @@ ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd,
float3 eval = make_float3(0.0f, 0.0f, 0.0f);
- for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
- const ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+ for(int i = 0; i < sd->num_closure; i++) {
+ const ShaderClosure *sc = &sd->closure[i];
if(CLOSURE_IS_BACKGROUND(sc->type))
eval += sc->weight;
@@ -1093,9 +1093,9 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, ShaderContext ctx)
{
- ccl_fetch(sd, num_closure) = 0;
- ccl_fetch(sd, num_closure_extra) = 0;
- ccl_fetch(sd, randb_closure) = 0.0f;
+ sd->num_closure = 0;
+ sd->num_closure_extra = 0;
+ sd->randb_closure = 0.0f;
/* this will modify sd->P */
#ifdef __SVM__
diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h
index 06a77a208cb..2483c5f9ae1 100644
--- a/intern/cycles/kernel/kernel_shadow.h
+++ b/intern/cycles/kernel/kernel_shadow.h
@@ -45,7 +45,7 @@ ccl_device_forceinline bool shadow_handle_transparent_isect(
/* Setup shader data at surface. */
shader_setup_from_ray(kg, shadow_sd, isect, ray);
/* Attenuation from transparent surface. */
- if(!(ccl_fetch(shadow_sd, flag) & SD_HAS_ONLY_VOLUME)) {
+ if(!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) {
path_state_modify_bounce(state, true);
shader_eval_surface(kg,
shadow_sd,
@@ -180,7 +180,7 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg,
return true;
}
/* Move ray forward. */
- ray->P = ccl_fetch(shadow_sd, P);
+ ray->P = shadow_sd->P;
if(ray->t != FLT_MAX) {
ray->D = normalize_len(Pend - ray->P, &ray->t);
}
@@ -248,7 +248,7 @@ ccl_device bool shadow_blocked_transparent_all(KernelGlobals *kg,
}
# endif /* __SHADOW_RECORD_ALL__ */
-# ifdef __KERNEL_GPU__
+# if defined(__KERNEL_GPU__) || !defined(__SHADOW_RECORD_ALL__)
/* Shadow function to compute how much light is blocked,
*
* Here we raytrace from one transparent surface to the next step by step.
@@ -308,7 +308,7 @@ ccl_device bool shadow_blocked_transparent_stepped_loop(
return true;
}
/* Move ray forward. */
- ray->P = ray_offset(ccl_fetch(shadow_sd, P), -ccl_fetch(shadow_sd, Ng));
+ ray->P = ray_offset(shadow_sd->P, -shadow_sd->Ng);
if(ray->t != FLT_MAX) {
ray->D = normalize_len(Pend - ray->P, &ray->t);
}
@@ -359,7 +359,7 @@ ccl_device bool shadow_blocked_transparent_stepped(
shadow);
}
-# endif /* __KERNEL_GPU__ */
+# endif /* __KERNEL_GPU__ || !__SHADOW_RECORD_ALL__ */
#endif /* __TRANSPARENT_SHADOWS__ */
ccl_device_inline bool shadow_blocked(KernelGlobals *kg,
@@ -374,7 +374,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg,
#ifdef __SPLIT_KERNEL__
Ray private_ray = *ray_input;
Ray *ray = &private_ray;
- Intersection *isect = &kg->isect_shadow[SD_THREAD];
+ Intersection *isect = &kernel_split_state.isect_shadow[ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0)];
#else /* __SPLIT_KERNEL__ */
Ray *ray = ray_input;
Intersection isect_object;
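
The replacement above spells out the linear per-thread index that the old SD_THREAD macro used to hide: each work-item in the 2D launch owns exactly one slot of the split-kernel state arrays. A minimal C++ sketch of that flattening, with an assumed 4x2 global work size standing in for ccl_global_size():

#include <cstdio>

/* Simulated launch dimensions (assumption for illustration only). */
static const unsigned global_size[2] = {4, 2};

int main()
{
    /* Each (x, y) work-item maps to one slot in a flat per-thread array,
     * mirroring kernel_split_state.isect_shadow[gid1 * gsize0 + gid0]. */
    for(unsigned y = 0; y < global_size[1]; y++) {
        for(unsigned x = 0; x < global_size[0]; x++) {
            unsigned linear = y * global_size[0] + x;
            printf("work-item (%u,%u) -> slot %u\n", x, y, linear);
        }
    }
    return 0;
}
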
diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h
index 52c05b85aee..a8fa6432542 100644
--- a/intern/cycles/kernel/kernel_subsurface.h
+++ b/intern/cycles/kernel/kernel_subsurface.h
@@ -298,20 +298,20 @@ ccl_device_inline int subsurface_scatter_multi_intersect(
for(int hit = 0; hit < num_eval_hits; hit++) {
/* Quickly retrieve P and Ng without setting up ShaderData. */
float3 hit_P;
- if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) {
+ if(sd->type & PRIMITIVE_TRIANGLE) {
hit_P = triangle_refine_subsurface(kg,
sd,
&ss_isect->hits[hit],
ray);
}
#ifdef __OBJECT_MOTION__
- else if(ccl_fetch(sd, type) & PRIMITIVE_MOTION_TRIANGLE) {
+ else if(sd->type & PRIMITIVE_MOTION_TRIANGLE) {
float3 verts[3];
motion_triangle_vertices(
kg,
- ccl_fetch(sd, object),
+ sd->object,
kernel_tex_fetch(__prim_index, ss_isect->hits[hit].prim),
- ccl_fetch(sd, time),
+ sd->time,
verts);
hit_P = motion_triangle_refine_subsurface(kg,
sd,
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 8250eaf6073..a7faaef89ca 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -32,6 +32,11 @@
# define ccl_addr_space
#endif
+#if defined(__SPLIT_KERNEL__) && !defined(__COMPUTE_DEVICE_GPU__)
+/* TODO(mai): need to investigate how this affects the kernel, as the CPU kernel crashes without this right now */
+#define __COMPUTE_DEVICE_GPU__
+#endif
+
CCL_NAMESPACE_BEGIN
/* constants */
@@ -56,6 +61,8 @@ CCL_NAMESPACE_BEGIN
#define VOLUME_STACK_SIZE 16
+#define WORK_POOL_SIZE 64
+
/* device capabilities */
#ifdef __KERNEL_CPU__
# ifdef __KERNEL_SSE2__
@@ -63,28 +70,36 @@ CCL_NAMESPACE_BEGIN
# endif
# define __KERNEL_SHADING__
# define __KERNEL_ADV_SHADING__
-# define __BRANCHED_PATH__
+# ifndef __SPLIT_KERNEL__
+# define __BRANCHED_PATH__
+# endif
# ifdef WITH_OSL
# define __OSL__
# endif
-# define __SUBSURFACE__
+# ifndef __SPLIT_KERNEL__
+# define __SUBSURFACE__
+# endif
# define __CMJ__
-# define __VOLUME__
-# define __VOLUME_DECOUPLED__
-# define __VOLUME_SCATTER__
-# define __SHADOW_RECORD_ALL__
-# define __VOLUME_RECORD_ALL__
+# ifndef __SPLIT_KERNEL__
+# define __VOLUME__
+# define __VOLUME_DECOUPLED__
+# define __VOLUME_SCATTER__
+# define __SHADOW_RECORD_ALL__
+# define __VOLUME_RECORD_ALL__
+# endif
#endif /* __KERNEL_CPU__ */
#ifdef __KERNEL_CUDA__
# define __KERNEL_SHADING__
# define __KERNEL_ADV_SHADING__
-# define __BRANCHED_PATH__
-# define __VOLUME__
-# define __VOLUME_SCATTER__
-# define __SUBSURFACE__
-# define __CMJ__
-# define __SHADOW_RECORD_ALL__
+# ifndef __SPLIT_KERNEL__
+# define __BRANCHED_PATH__
+# define __VOLUME__
+# define __VOLUME_SCATTER__
+# define __SUBSURFACE__
+# define __CMJ__
+# define __SHADOW_RECORD_ALL__
+# endif
#endif /* __KERNEL_CUDA__ */
#ifdef __KERNEL_OPENCL__
@@ -798,99 +813,77 @@ enum ShaderDataObjectFlag {
SD_OBJECT_INTERSECTS_VOLUME)
};
-#ifdef __SPLIT_KERNEL__
-# define SD_THREAD (get_global_id(1) * get_global_size(0) + get_global_id(0))
-# if !defined(__SPLIT_KERNEL_SOA__)
- /* ShaderData is stored as an Array-of-Structures */
-# define ccl_soa_member(type, name) type soa_##name
-# define ccl_fetch(s, t) (s[SD_THREAD].soa_##t)
-# define ccl_fetch_array(s, t, index) (&s[SD_THREAD].soa_##t[index])
-# else
- /* ShaderData is stored as an Structure-of-Arrays */
-# define SD_GLOBAL_SIZE (get_global_size(0) * get_global_size(1))
-# define SD_FIELD_SIZE(t) sizeof(((struct ShaderData*)0)->t)
-# define SD_OFFSETOF(t) ((char*)(&((struct ShaderData*)0)->t) - (char*)0)
-# define ccl_soa_member(type, name) type soa_##name
-# define ccl_fetch(s, t) (((ShaderData*)((ccl_addr_space char*)s + SD_GLOBAL_SIZE * SD_OFFSETOF(soa_##t) + SD_FIELD_SIZE(soa_##t) * SD_THREAD - SD_OFFSETOF(soa_##t)))->soa_##t)
-# define ccl_fetch_array(s, t, index) (&ccl_fetch(s, t)[index])
-# endif
-#else
-# define ccl_soa_member(type, name) type name
-# define ccl_fetch(s, t) (s->t)
-# define ccl_fetch_array(s, t, index) (&s->t[index])
-#endif
-
typedef ccl_addr_space struct ShaderData {
/* position */
- ccl_soa_member(float3, P);
+ float3 P;
/* smooth normal for shading */
- ccl_soa_member(float3, N);
+ float3 N;
/* true geometric normal */
- ccl_soa_member(float3, Ng);
+ float3 Ng;
/* view/incoming direction */
- ccl_soa_member(float3, I);
+ float3 I;
/* shader id */
- ccl_soa_member(int, shader);
+ int shader;
/* booleans describing shader, see ShaderDataFlag */
- ccl_soa_member(int, flag);
+ int flag;
/* booleans describing object of the shader, see ShaderDataObjectFlag */
- ccl_soa_member(int, object_flag);
+ int object_flag;
/* primitive id if there is one, ~0 otherwise */
- ccl_soa_member(int, prim);
+ int prim;
/* combined type and curve segment for hair */
- ccl_soa_member(int, type);
+ int type;
/* parametric coordinates
* - barycentric weights for triangles */
- ccl_soa_member(float, u);
- ccl_soa_member(float, v);
+ float u;
+ float v;
/* object id if there is one, ~0 otherwise */
- ccl_soa_member(int, object);
+ int object;
/* motion blur sample time */
- ccl_soa_member(float, time);
+ float time;
/* length of the ray being shaded */
- ccl_soa_member(float, ray_length);
+ float ray_length;
#ifdef __RAY_DIFFERENTIALS__
/* differential of P. these are orthogonal to Ng, not N */
- ccl_soa_member(differential3, dP);
+ differential3 dP;
/* differential of I */
- ccl_soa_member(differential3, dI);
+ differential3 dI;
/* differential of u, v */
- ccl_soa_member(differential, du);
- ccl_soa_member(differential, dv);
+ differential du;
+ differential dv;
#endif
#ifdef __DPDU__
/* differential of P w.r.t. parametric coordinates. note that dPdu is
* not readily suitable as a tangent for shading on triangles. */
- ccl_soa_member(float3, dPdu);
- ccl_soa_member(float3, dPdv);
+ float3 dPdu;
+ float3 dPdv;
#endif
#ifdef __OBJECT_MOTION__
/* object <-> world space transformations, cached to avoid
* re-interpolating them constantly for shading */
- ccl_soa_member(Transform, ob_tfm);
- ccl_soa_member(Transform, ob_itfm);
+ Transform ob_tfm;
+ Transform ob_itfm;
#endif
/* Closure data, we store a fixed array of closures */
- ccl_soa_member(struct ShaderClosure, closure[MAX_CLOSURE]);
- ccl_soa_member(int, num_closure);
- ccl_soa_member(int, num_closure_extra);
- ccl_soa_member(float, randb_closure);
- ccl_soa_member(float3, svm_closure_weight);
+ struct ShaderClosure closure[MAX_CLOSURE];
+ int num_closure;
+ int num_closure_extra;
+ float randb_closure;
+ float3 svm_closure_weight;
/* LCG state for closures that require additional random numbers. */
- ccl_soa_member(uint, lcg_state);
+ uint lcg_state;
/* ray start position, only set for backgrounds */
- ccl_soa_member(float3, ray_P);
- ccl_soa_member(differential3, ray_dP);
+ float3 ray_P;
+ differential3 ray_dP;
#ifdef __OSL__
struct KernelGlobals *osl_globals;
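
The deleted ccl_soa_member()/ccl_fetch() machinery existed so the same field accesses could compile against either an array-of-structures or a structure-of-arrays layout for ShaderData; with the new split-kernel state handling, plain member access suffices. A toy C++ sketch (invented types, not Cycles code) of the two layouts the macros used to abstract:

#include <cstdio>

#define NUM_THREADS 4

/* Array-of-Structures: one struct per thread. */
struct ShaderDataAOS { float u; float v; };
static ShaderDataAOS sd_aos[NUM_THREADS];

/* Structure-of-Arrays: one array per field. */
struct ShaderDataSOA { float u[NUM_THREADS]; float v[NUM_THREADS]; };
static ShaderDataSOA sd_soa;

int main()
{
    int thread = 2;            /* stand-in for the old SD_THREAD index */
    sd_aos[thread].u = 0.5f;   /* what ccl_fetch(sd, u) meant in the AoS branch */
    sd_soa.u[thread] = 0.5f;   /* what ccl_fetch(sd, u) meant in the SoA branch */
    printf("aos=%f soa=%f\n", sd_aos[thread].u, sd_soa.u[thread]);
    return 0;
}
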
diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h
index 7d559b1aa31..28fc5ce1c30 100644
--- a/intern/cycles/kernel/kernel_work_stealing.h
+++ b/intern/cycles/kernel/kernel_work_stealing.h
@@ -17,177 +17,102 @@
#ifndef __KERNEL_WORK_STEALING_H__
#define __KERNEL_WORK_STEALING_H__
+CCL_NAMESPACE_BEGIN
+
/*
* Utility functions for work stealing
*/
-#ifdef __WORK_STEALING__
-
#ifdef __KERNEL_OPENCL__
# pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#endif
-uint get_group_id_with_ray_index(uint ray_index,
- uint tile_dim_x,
- uint tile_dim_y,
- uint parallel_samples,
- int dim)
+ccl_device_inline uint kernel_total_work_size(KernelGlobals *kg)
+{
+ return kernel_split_params.w * kernel_split_params.h * kernel_split_params.num_samples;
+}
+
+ccl_device_inline uint kernel_num_work_pools(KernelGlobals *kg)
+{
+ return ccl_global_size(0) * ccl_global_size(1) / WORK_POOL_SIZE;
+}
+
+ccl_device_inline uint work_pool_from_ray_index(KernelGlobals *kg, uint ray_index)
+{
+ return ray_index / WORK_POOL_SIZE;
+}
+
+ccl_device_inline uint work_pool_work_size(KernelGlobals *kg, uint work_pool)
{
- if(dim == 0) {
- uint x_span = ray_index % (tile_dim_x * parallel_samples);
- return x_span / get_local_size(0);
+ uint total_work_size = kernel_total_work_size(kg);
+ uint num_pools = kernel_num_work_pools(kg);
+
+ if(work_pool >= num_pools || work_pool * WORK_POOL_SIZE >= total_work_size) {
+ return 0;
+ }
+
+ uint work_size = (total_work_size / (num_pools * WORK_POOL_SIZE)) * WORK_POOL_SIZE;
+
+ uint remainder = (total_work_size % (num_pools * WORK_POOL_SIZE));
+ if(work_pool < remainder / WORK_POOL_SIZE) {
+ work_size += WORK_POOL_SIZE;
}
- else /*if(dim == 1)*/ {
- kernel_assert(dim == 1);
- uint y_span = ray_index / (tile_dim_x * parallel_samples);
- return y_span / get_local_size(1);
+ else if(work_pool == remainder / WORK_POOL_SIZE) {
+ work_size += remainder % WORK_POOL_SIZE;
}
+
+ return work_size;
}
-uint get_total_work(uint tile_dim_x,
- uint tile_dim_y,
- uint grp_idx,
- uint grp_idy,
- uint num_samples)
+ccl_device_inline uint get_global_work_index(KernelGlobals *kg, uint work_index, uint ray_index)
{
- uint threads_within_tile_border_x =
- (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0)
- : get_local_size(0);
- uint threads_within_tile_border_y =
- (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1)
- : get_local_size(1);
-
- threads_within_tile_border_x =
- (threads_within_tile_border_x == 0) ? get_local_size(0)
- : threads_within_tile_border_x;
- threads_within_tile_border_y =
- (threads_within_tile_border_y == 0) ? get_local_size(1)
- : threads_within_tile_border_y;
-
- return threads_within_tile_border_x *
- threads_within_tile_border_y *
- num_samples;
+ uint num_pools = kernel_num_work_pools(kg);
+ uint pool = work_pool_from_ray_index(kg, ray_index);
+
+ return (work_index / WORK_POOL_SIZE) * (num_pools * WORK_POOL_SIZE)
+ + (pool * WORK_POOL_SIZE)
+ + (work_index % WORK_POOL_SIZE);
}
-/* Returns 0 in case there is no next work available */
-/* Returns 1 in case work assigned is valid */
-int get_next_work(ccl_global uint *work_pool,
- ccl_private uint *my_work,
- uint tile_dim_x,
- uint tile_dim_y,
- uint num_samples,
- uint parallel_samples,
- uint ray_index)
+/* Returns true if work was assigned; the assigned index is written to `work_index`. */
+ccl_device bool get_next_work(KernelGlobals *kg, ccl_private uint *work_index, uint ray_index)
{
- uint grp_idx = get_group_id_with_ray_index(ray_index,
- tile_dim_x,
- tile_dim_y,
- parallel_samples,
- 0);
- uint grp_idy = get_group_id_with_ray_index(ray_index,
- tile_dim_x,
- tile_dim_y,
- parallel_samples,
- 1);
- uint total_work = get_total_work(tile_dim_x,
- tile_dim_y,
- grp_idx,
- grp_idy,
- num_samples);
- uint group_index = grp_idy * get_num_groups(0) + grp_idx;
- *my_work = atomic_inc(&work_pool[group_index]);
- return (*my_work < total_work) ? 1 : 0;
+ uint work_pool = work_pool_from_ray_index(kg, ray_index);
+ uint pool_size = work_pool_work_size(kg, work_pool);
+
+ if(pool_size == 0) {
+ return false;
+ }
+
+ *work_index = atomic_fetch_and_inc_uint32(&kernel_split_params.work_pools[work_pool]);
+ return (*work_index < pool_size);
}
-/* This function assumes that the passed my_work is valid. */
-/* Decode sample number w.r.t. assigned my_work. */
-uint get_my_sample(uint my_work,
- uint tile_dim_x,
- uint tile_dim_y,
- uint parallel_samples,
- uint ray_index)
+/* This function assumes that the passed `work_index` is valid. */
+/* Decode the sample number w.r.t. the assigned `work_index`. */
+ccl_device uint get_work_sample(KernelGlobals *kg, uint work_index, uint ray_index)
{
- uint grp_idx = get_group_id_with_ray_index(ray_index,
- tile_dim_x,
- tile_dim_y,
- parallel_samples,
- 0);
- uint grp_idy = get_group_id_with_ray_index(ray_index,
- tile_dim_x,
- tile_dim_y,
- parallel_samples,
- 1);
- uint threads_within_tile_border_x =
- (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0)
- : get_local_size(0);
- uint threads_within_tile_border_y =
- (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1)
- : get_local_size(1);
-
- threads_within_tile_border_x =
- (threads_within_tile_border_x == 0) ? get_local_size(0)
- : threads_within_tile_border_x;
- threads_within_tile_border_y =
- (threads_within_tile_border_y == 0) ? get_local_size(1)
- : threads_within_tile_border_y;
-
- return my_work /
- (threads_within_tile_border_x * threads_within_tile_border_y);
+ return get_global_work_index(kg, work_index, ray_index) / (kernel_split_params.w * kernel_split_params.h);
}
-/* Decode pixel and tile position w.r.t. assigned my_work. */
-void get_pixel_tile_position(ccl_private uint *pixel_x,
+/* Decode the pixel and tile position w.r.t. the assigned `work_index`. */
+ccl_device void get_work_pixel_tile_position(KernelGlobals *kg,
+ ccl_private uint *pixel_x,
ccl_private uint *pixel_y,
ccl_private uint *tile_x,
ccl_private uint *tile_y,
- uint my_work,
- uint tile_dim_x,
- uint tile_dim_y,
- uint tile_offset_x,
- uint tile_offset_y,
- uint parallel_samples,
+ uint work_index,
uint ray_index)
{
- uint grp_idx = get_group_id_with_ray_index(ray_index,
- tile_dim_x,
- tile_dim_y,
- parallel_samples,
- 0);
- uint grp_idy = get_group_id_with_ray_index(ray_index,
- tile_dim_x,
- tile_dim_y,
- parallel_samples,
- 1);
- uint threads_within_tile_border_x =
- (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0)
- : get_local_size(0);
- uint threads_within_tile_border_y =
- (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1)
- : get_local_size(1);
-
- threads_within_tile_border_x =
- (threads_within_tile_border_x == 0) ? get_local_size(0)
- : threads_within_tile_border_x;
- threads_within_tile_border_y =
- (threads_within_tile_border_y == 0) ? get_local_size(1)
- : threads_within_tile_border_y;
-
- uint total_associated_pixels =
- threads_within_tile_border_x * threads_within_tile_border_y;
- uint work_group_pixel_index = my_work % total_associated_pixels;
- uint work_group_pixel_x =
- work_group_pixel_index % threads_within_tile_border_x;
- uint work_group_pixel_y =
- work_group_pixel_index / threads_within_tile_border_x;
-
- *pixel_x =
- tile_offset_x + (grp_idx * get_local_size(0)) + work_group_pixel_x;
- *pixel_y =
- tile_offset_y + (grp_idy * get_local_size(1)) + work_group_pixel_y;
- *tile_x = *pixel_x - tile_offset_x;
- *tile_y = *pixel_y - tile_offset_y;
+ uint pixel_index = get_global_work_index(kg, work_index, ray_index) % (kernel_split_params.w*kernel_split_params.h);
+
+ *tile_x = pixel_index % kernel_split_params.w;
+ *tile_y = pixel_index / kernel_split_params.w;
+
+ *pixel_x = *tile_x + kernel_split_params.x;
+ *pixel_y = *tile_y + kernel_split_params.y;
}
-#endif /* __WORK_STEALING__ */
+CCL_NAMESPACE_END
#endif /* __KERNEL_WORK_STEALING_H__ */
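
The new work-pool scheme divides the total work (w * h * num_samples) into pools of WORK_POOL_SIZE items, first giving every pool the same whole number of chunks and then handing the remainder out one chunk at a time, so no work is lost or duplicated. A standalone C++ replay of the work_pool_work_size() arithmetic, with assumed numbers (3 pools, 150 work items):

#include <cstdio>

static const unsigned WORK_POOL_SIZE = 64;

unsigned work_pool_work_size(unsigned total, unsigned num_pools, unsigned pool)
{
    if(pool >= num_pools || pool * WORK_POOL_SIZE >= total)
        return 0;
    /* Every pool gets the same whole multiple of WORK_POOL_SIZE... */
    unsigned work = (total / (num_pools * WORK_POOL_SIZE)) * WORK_POOL_SIZE;
    /* ...and the remainder is handed out in WORK_POOL_SIZE chunks, with one
     * pool receiving the final partial chunk. */
    unsigned remainder = total % (num_pools * WORK_POOL_SIZE);
    if(pool < remainder / WORK_POOL_SIZE)
        work += WORK_POOL_SIZE;
    else if(pool == remainder / WORK_POOL_SIZE)
        work += remainder % WORK_POOL_SIZE;
    return work;
}

int main()
{
    unsigned total = 0;
    for(unsigned pool = 0; pool < 3; pool++) {
        unsigned w = work_pool_work_size(150, 3, pool);
        printf("pool %u: %u items\n", pool, w);  /* 64, 64, 22 */
        total += w;
    }
    printf("total: %u\n", total);  /* 150: nothing lost or duplicated */
    return 0;
}
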
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
index 1a07c705f1c..deb872444d0 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
@@ -49,4 +49,39 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
int offset,
int sample);
+/* Split kernels */
+
+void KERNEL_FUNCTION_FULL_NAME(data_init)(
+ KernelGlobals *kg,
+ ccl_constant KernelData *data,
+ ccl_global void *split_data_buffer,
+ int num_elements,
+ ccl_global char *ray_state,
+ ccl_global uint *rng_state,
+ int start_sample,
+ int end_sample,
+ int sx, int sy, int sw, int sh, int offset, int stride,
+ ccl_global int *Queue_index,
+ int queuesize,
+ ccl_global char *use_queues_flag,
+ ccl_global unsigned int *work_pool_wgs,
+ unsigned int num_samples,
+ ccl_global float *buffer);
+
+#define DECLARE_SPLIT_KERNEL_FUNCTION(name) \
+ void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData *data);
+
+DECLARE_SPLIT_KERNEL_FUNCTION(path_init)
+DECLARE_SPLIT_KERNEL_FUNCTION(scene_intersect)
+DECLARE_SPLIT_KERNEL_FUNCTION(lamp_emission)
+DECLARE_SPLIT_KERNEL_FUNCTION(queue_enqueue)
+DECLARE_SPLIT_KERNEL_FUNCTION(background_buffer_update)
+DECLARE_SPLIT_KERNEL_FUNCTION(shader_eval)
+DECLARE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao)
+DECLARE_SPLIT_KERNEL_FUNCTION(direct_lighting)
+DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked)
+DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
+
+void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func));
+
#undef KERNEL_ARCH
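
The KERNEL_FUNCTION_FULL_NAME()/KERNEL_ARCH pair lets this one header be compiled repeatedly, once per instruction set, producing distinct symbols each time. A self-contained C++ sketch of the token pasting involved; the helper macros mimic the real naming scheme, but the exact expansion shown is an assumption for illustration:

#include <cstdio>

#define KERNEL_NAME_JOIN(kernel, arch, name) kernel_##arch##_##name
#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name)
#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)

#define KERNEL_ARCH cpu_sse2
/* Expands to: void kernel_cpu_sse2_path_init() */
void KERNEL_FUNCTION_FULL_NAME(path_init)()
{
    printf("sse2 path_init\n");
}
#undef KERNEL_ARCH

int main()
{
    kernel_cpu_sse2_path_init();
    return 0;
}
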
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
index ec82d4b4c22..d6d0db4e034 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
@@ -21,17 +21,39 @@
*/
#include "kernel_compat_cpu.h"
-#include "kernel_math.h"
-#include "kernel_types.h"
-#include "kernel_globals.h"
-#include "kernel_cpu_image.h"
-#include "kernel_film.h"
-#include "kernel_path.h"
-#include "kernel_path_branched.h"
-#include "kernel_bake.h"
+
+#ifndef __SPLIT_KERNEL__
+# include "kernel_math.h"
+# include "kernel_types.h"
+
+# include "split/kernel_split_data.h"
+# include "kernel_globals.h"
+
+# include "kernel_cpu_image.h"
+# include "kernel_film.h"
+# include "kernel_path.h"
+# include "kernel_path_branched.h"
+# include "kernel_bake.h"
+#else
+# include "split/kernel_split_common.h"
+
+# include "split/kernel_data_init.h"
+# include "split/kernel_path_init.h"
+# include "split/kernel_scene_intersect.h"
+# include "split/kernel_lamp_emission.h"
+# include "split/kernel_queue_enqueue.h"
+# include "split/kernel_background_buffer_update.h"
+# include "split/kernel_shader_eval.h"
+# include "split/kernel_holdout_emission_blurring_pathtermination_ao.h"
+# include "split/kernel_direct_lighting.h"
+# include "split/kernel_shadow_blocked.h"
+# include "split/kernel_next_iteration_setup.h"
+#endif
CCL_NAMESPACE_BEGIN
+#ifndef __SPLIT_KERNEL__
+
/* Path Tracing */
void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg,
@@ -131,4 +153,55 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
}
}
+#else /* __SPLIT_KERNEL__ */
+
+/* Split Kernel Path Tracing */
+
+#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
+ void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
+ { \
+ kernel_##name(kg); \
+ }
+
+DEFINE_SPLIT_KERNEL_FUNCTION(path_init)
+DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect)
+DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission)
+DEFINE_SPLIT_KERNEL_FUNCTION(queue_enqueue)
+DEFINE_SPLIT_KERNEL_FUNCTION(background_buffer_update)
+DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval)
+DEFINE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao)
+DEFINE_SPLIT_KERNEL_FUNCTION(direct_lighting)
+DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked)
+DEFINE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
+
+void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func))
+{
+#define REGISTER_NAME_STRING(name) #name
+#define REGISTER_EVAL_NAME(name) REGISTER_NAME_STRING(name)
+#define REGISTER(name) reg(REGISTER_EVAL_NAME(KERNEL_FUNCTION_FULL_NAME(name)), (void*)KERNEL_FUNCTION_FULL_NAME(name));
+
+ REGISTER(path_trace);
+ REGISTER(convert_to_byte);
+ REGISTER(convert_to_half_float);
+ REGISTER(shader);
+
+ REGISTER(data_init);
+ REGISTER(path_init);
+ REGISTER(scene_intersect);
+ REGISTER(lamp_emission);
+ REGISTER(queue_enqueue);
+ REGISTER(background_buffer_update);
+ REGISTER(shader_eval);
+ REGISTER(holdout_emission_blurring_pathtermination_ao);
+ REGISTER(direct_lighting);
+ REGISTER(shadow_blocked);
+ REGISTER(next_iteration_setup);
+
+#undef REGISTER
+#undef REGISTER_EVAL_NAME
+#undef REGISTER_NAME_STRING
+}
+
+#endif /* __SPLIT_KERNEL__ */
+
CCL_NAMESPACE_END
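
register_functions() hands each generated entry point to the caller as a (name, pointer) pair, so the CPU split device can look kernels up by string instead of hard-coding every architecture's symbols. A hypothetical host-side consumer of that callback; the kernel names and dispatch below are invented for illustration:

#include <cstdio>
#include <map>
#include <string>

static std::map<std::string, void*> kernel_registry;

/* Matches the reg(const char *name, void *func) callback signature above. */
static void reg(const char *name, void *func)
{
    kernel_registry[name] = func;
}

/* Stand-ins for two generated entry points (invented for the example). */
static void kernel_cpu_path_init() { printf("path_init\n"); }
static void kernel_cpu_scene_intersect() { printf("scene_intersect\n"); }

static void fake_register_functions(void (*r)(const char*, void*))
{
    r("kernel_cpu_path_init", (void*)kernel_cpu_path_init);
    r("kernel_cpu_scene_intersect", (void*)kernel_cpu_scene_intersect);
}

int main()
{
    fake_register_functions(reg);
    /* The device layer can now dispatch by name, one lookup per split stage.
     * The cast back to a function pointer mirrors what a dispatcher would do. */
    ((void (*)())kernel_registry["kernel_cpu_path_init"])();
    return 0;
}
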
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split.cpp
new file mode 100644
index 00000000000..30519dae53e
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CPU kernel entry points */
+
+/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this
+ * one with SSE2 intrinsics.
+ */
+#if defined(__x86_64__) || defined(_M_X64)
+# define __KERNEL_SSE2__
+#endif
+
+#define __SPLIT_KERNEL__
+
+/* When building the kernel for the native machine, detect kernel features from
+ * the flags set by the compiler.
+ */
+#ifdef WITH_KERNEL_NATIVE
+# ifdef __SSE2__
+# ifndef __KERNEL_SSE2__
+# define __KERNEL_SSE2__
+# endif
+# endif
+# ifdef __SSE3__
+# define __KERNEL_SSE3__
+# endif
+# ifdef __SSSE3__
+# define __KERNEL_SSSE3__
+# endif
+# ifdef __SSE4_1__
+# define __KERNEL_SSE41__
+# endif
+# ifdef __AVX__
+# define __KERNEL_AVX__
+# endif
+# ifdef __AVX2__
+# define __KERNEL_SSE__
+# define __KERNEL_AVX2__
+# endif
+#endif
+
+/* quiet unused define warnings */
+#if defined(__KERNEL_SSE2__)
+ /* do nothing */
+#endif
+
+#include "kernel.h"
+#define KERNEL_ARCH cpu
+#include "kernel_cpu_impl.h"
+
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
new file mode 100644
index 00000000000..335ad24bdc5
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them for other CPUs. */
+
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+#endif
+
+#define __SPLIT_KERNEL__
+
+#include "util_optimization.h"
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+# include "kernel.h"
+# define KERNEL_ARCH cpu_avx
+# include "kernel_cpu_impl.h"
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
new file mode 100644
index 00000000000..765ba96aba3
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2011-2014 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them for other CPUs. */
+
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE__
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+# define __KERNEL_AVX__
+# define __KERNEL_AVX2__
+#endif
+
+#define __SPLIT_KERNEL__
+
+#include "util_optimization.h"
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+# include "kernel.h"
+# define KERNEL_ARCH cpu_avx2
+# include "kernel_cpu_impl.h"
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
new file mode 100644
index 00000000000..af244c03929
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them for other CPUs. */
+
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+#endif
+
+#define __SPLIT_KERNEL__
+
+#include "util_optimization.h"
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+# include "kernel.h"
+# define KERNEL_ARCH cpu_sse2
+# include "kernel_cpu_impl.h"
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
new file mode 100644
index 00000000000..d1b579eeac5
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them for other CPUs. */
+
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+#endif
+
+#define __SPLIT_KERNEL__
+
+#include "util_optimization.h"
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+# include "kernel.h"
+# define KERNEL_ARCH cpu_sse3
+# include "kernel_cpu_impl.h"
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
new file mode 100644
index 00000000000..83d62de5aa5
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE4.1
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without them for other CPUs. */
+
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+# define __KERNEL_SSE2__
+# define __KERNEL_SSE3__
+# define __KERNEL_SSSE3__
+# define __KERNEL_SSE41__
+#endif
+
+#define __SPLIT_KERNEL__
+
+#include "util_optimization.h"
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+# include "kernel.h"
+# define KERNEL_ARCH cpu_sse41
+# include "kernel_cpu_impl.h"
+#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
index 090ab2c50c2..52e541321e3 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -16,7 +16,10 @@
/* CUDA kernel entry points */
+#ifdef __CUDA_ARCH__
+
#include "../../kernel_compat_cuda.h"
+#include "kernel_config.h"
#include "../../kernel_math.h"
#include "../../kernel_types.h"
#include "../../kernel_globals.h"
@@ -25,104 +28,7 @@
#include "../../kernel_path_branched.h"
#include "../../kernel_bake.h"
-/* device data taken from CUDA occupancy calculator */
-
-#ifdef __CUDA_ARCH__
-
-/* 2.0 and 2.1 */
-#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210
-# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
-# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8
-# define CUDA_BLOCK_MAX_THREADS 1024
-# define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-# define CUDA_THREADS_BLOCK_WIDTH 16
-# define CUDA_KERNEL_MAX_REGISTERS 32
-# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40
-
-/* 3.0 and 3.5 */
-#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
-# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-# define CUDA_BLOCK_MAX_THREADS 1024
-# define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-# define CUDA_THREADS_BLOCK_WIDTH 16
-# define CUDA_KERNEL_MAX_REGISTERS 63
-# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 3.2 */
-#elif __CUDA_ARCH__ == 320
-# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
-# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-# define CUDA_BLOCK_MAX_THREADS 1024
-# define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-# define CUDA_THREADS_BLOCK_WIDTH 16
-# define CUDA_KERNEL_MAX_REGISTERS 63
-# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 3.7 */
-#elif __CUDA_ARCH__ == 370
-# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-# define CUDA_BLOCK_MAX_THREADS 1024
-# define CUDA_THREAD_MAX_REGISTERS 255
-
-/* tunable parameters */
-# define CUDA_THREADS_BLOCK_WIDTH 16
-# define CUDA_KERNEL_MAX_REGISTERS 63
-# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 5.0, 5.2, 5.3, 6.0, 6.1 */
-#elif __CUDA_ARCH__ >= 500
-# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
-# define CUDA_BLOCK_MAX_THREADS 1024
-# define CUDA_THREAD_MAX_REGISTERS 255
-
-/* tunable parameters */
-# define CUDA_THREADS_BLOCK_WIDTH 16
-# define CUDA_KERNEL_MAX_REGISTERS 48
-# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* unknown architecture */
-#else
-# error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
-#endif
-
-/* compute number of threads per block and minimum blocks per multiprocessor
- * given the maximum number of registers per thread */
-
-#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
- __launch_bounds__( \
- threads_block_width*threads_block_width, \
- CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \
- )
-
-/* sanity checks */
-
-#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
-# error "Maximum number of threads per block exceeded"
-#endif
-
-#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS
-# error "Maximum number of blocks per multiprocessor exceeded"
-#endif
-
-#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
-# error "Maximum number of registers per thread exceeded"
-#endif
-
-#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
-# error "Maximum number of registers per thread exceeded"
-#endif
-
/* kernels */
-
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_config.h b/intern/cycles/kernel/kernels/cuda/kernel_config.h
new file mode 100644
index 00000000000..9fa39dc9ebb
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cuda/kernel_config.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* device data taken from CUDA occupancy calculator */
+
+/* 2.0 and 2.1 */
+#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210
+# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
+# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8
+# define CUDA_BLOCK_MAX_THREADS 1024
+# define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+# define CUDA_THREADS_BLOCK_WIDTH 16
+# define CUDA_KERNEL_MAX_REGISTERS 32
+# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40
+
+/* 3.0 and 3.5 */
+#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
+# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
+# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
+# define CUDA_BLOCK_MAX_THREADS 1024
+# define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+# define CUDA_THREADS_BLOCK_WIDTH 16
+# define CUDA_KERNEL_MAX_REGISTERS 63
+# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* 3.2 */
+#elif __CUDA_ARCH__ == 320
+# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
+# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
+# define CUDA_BLOCK_MAX_THREADS 1024
+# define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+# define CUDA_THREADS_BLOCK_WIDTH 16
+# define CUDA_KERNEL_MAX_REGISTERS 63
+# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* 3.7 */
+#elif __CUDA_ARCH__ == 370
+# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
+# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
+# define CUDA_BLOCK_MAX_THREADS 1024
+# define CUDA_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+# define CUDA_THREADS_BLOCK_WIDTH 16
+# define CUDA_KERNEL_MAX_REGISTERS 63
+# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* 5.0, 5.2, 5.3, 6.0, 6.1 */
+#elif __CUDA_ARCH__ >= 500
+# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
+# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
+# define CUDA_BLOCK_MAX_THREADS 1024
+# define CUDA_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+# define CUDA_THREADS_BLOCK_WIDTH 16
+# define CUDA_KERNEL_MAX_REGISTERS 48
+# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* unknown architecture */
+#else
+# error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
+#endif
+
+/* compute number of threads per block and minimum blocks per multiprocessor
+ * given the maximum number of registers per thread */
+
+#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
+ __launch_bounds__( \
+ threads_block_width*threads_block_width, \
+ CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \
+ )
+
+/* sanity checks */
+
+#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
+# error "Maximum number of threads per block exceeded"
+#endif
+
+#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS
+# error "Maximum number of blocks per multiprocessor exceeded"
+#endif
+
+#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
+# error "Maximum number of registers per thread exceeded"
+#endif
+
+#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
+# error "Maximum number of registers per thread exceeded"
+#endif
+
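
For the sm_5x+ branch, the CUDA_LAUNCH_BOUNDS() macro works out to __launch_bounds__(256, 5): 16 * 16 = 256 threads per block, and 65536 / (256 * 48) = 5 resident blocks per multiprocessor. A plain C++ replay of that arithmetic:

#include <cstdio>

int main()
{
    const int multiprocessor_max_registers = 65536;
    const int threads_block_width = 16;
    const int kernel_max_registers = 48;

    int threads_per_block = threads_block_width * threads_block_width;  /* 256 */
    int min_blocks = multiprocessor_max_registers /
                     (threads_per_block * kernel_max_registers);        /* 5 */

    /* __launch_bounds__(256, 5): the compiler must keep register usage low
     * enough that 5 blocks of 256 threads fit on one multiprocessor. */
    printf("__launch_bounds__(%d, %d)\n", threads_per_block, min_blocks);
    return 0;
}
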
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
new file mode 100644
index 00000000000..759475b175f
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
@@ -0,0 +1,125 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CUDA split kernel entry points */
+
+#ifdef __CUDA_ARCH__
+
+#define __SPLIT_KERNEL__
+
+#include "../../kernel_compat_cuda.h"
+#include "kernel_config.h"
+
+#include "../../split/kernel_split_common.h"
+#include "../../split/kernel_data_init.h"
+#include "../../split/kernel_path_init.h"
+#include "../../split/kernel_scene_intersect.h"
+#include "../../split/kernel_lamp_emission.h"
+#include "../../split/kernel_queue_enqueue.h"
+#include "../../split/kernel_background_buffer_update.h"
+#include "../../split/kernel_shader_eval.h"
+#include "../../split/kernel_holdout_emission_blurring_pathtermination_ao.h"
+#include "../../split/kernel_direct_lighting.h"
+#include "../../split/kernel_shadow_blocked.h"
+#include "../../split/kernel_next_iteration_setup.h"
+
+#include "../../kernel_film.h"
+
+/* kernels */
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_state_buffer_size(uint num_threads, uint *size)
+{
+ *size = split_data_buffer_size(NULL, num_threads);
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_path_trace_data_init(
+ ccl_global void *split_data_buffer,
+ int num_elements,
+ ccl_global char *ray_state,
+ ccl_global uint *rng_state,
+ int start_sample,
+ int end_sample,
+ int sx, int sy, int sw, int sh, int offset, int stride,
+ ccl_global int *Queue_index,
+ int queuesize,
+ ccl_global char *use_queues_flag,
+ ccl_global unsigned int *work_pool_wgs,
+ unsigned int num_samples,
+ ccl_global float *buffer)
+{
+ kernel_data_init(NULL,
+ NULL,
+ split_data_buffer,
+ num_elements,
+ ray_state,
+ rng_state,
+ start_sample,
+ end_sample,
+ sx, sy, sw, sh, offset, stride,
+ Queue_index,
+ queuesize,
+ use_queues_flag,
+ work_pool_wgs,
+ num_samples,
+ buffer);
+}
+
+#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
+ extern "C" __global__ void \
+ CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \
+ kernel_cuda_##name() \
+ { \
+ kernel_##name(NULL); \
+ }
+
+DEFINE_SPLIT_KERNEL_FUNCTION(path_init)
+DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect)
+DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission)
+DEFINE_SPLIT_KERNEL_FUNCTION(queue_enqueue)
+DEFINE_SPLIT_KERNEL_FUNCTION(background_buffer_update)
+DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval)
+DEFINE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao)
+DEFINE_SPLIT_KERNEL_FUNCTION(direct_lighting)
+DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked)
+DEFINE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+ int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
+ int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
+
+ if(x < sx + sw && y < sy + sh)
+ kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+ int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
+ int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
+
+ if(x < sx + sw && y < sy + sh)
+ kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+#endif
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl
index a68f97857b6..52406d2f548 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel.cl
@@ -67,8 +67,8 @@ __kernel void kernel_ocl_path_trace(
kg->name = name;
#include "../../kernel_textures.h"
- int x = sx + get_global_id(0);
- int y = sy + get_global_id(1);
+ int x = sx + ccl_global_id(0);
+ int y = sy + ccl_global_id(1);
if(x < sx + sw && y < sy + sh)
kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
@@ -96,7 +96,7 @@ __kernel void kernel_ocl_shader(
kg->name = name;
#include "../../kernel_textures.h"
- int x = sx + get_global_id(0);
+ int x = sx + ccl_global_id(0);
if(x < sx + sw) {
kernel_shader_evaluate(kg,
@@ -128,7 +128,7 @@ __kernel void kernel_ocl_bake(
kg->name = name;
#include "../../kernel_textures.h"
- int x = sx + get_global_id(0);
+ int x = sx + ccl_global_id(0);
if(x < sx + sw) {
#ifdef __NO_BAKING__
@@ -159,8 +159,8 @@ __kernel void kernel_ocl_convert_to_byte(
kg->name = name;
#include "../../kernel_textures.h"
- int x = sx + get_global_id(0);
- int y = sy + get_global_id(1);
+ int x = sx + ccl_global_id(0);
+ int y = sy + ccl_global_id(1);
if(x < sx + sw && y < sy + sh)
kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
@@ -186,11 +186,27 @@ __kernel void kernel_ocl_convert_to_half_float(
kg->name = name;
#include "../../kernel_textures.h"
- int x = sx + get_global_id(0);
- int y = sy + get_global_id(1);
+ int x = sx + ccl_global_id(0);
+ int y = sy + ccl_global_id(1);
if(x < sx + sw && y < sy + sh)
kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
}
+__kernel void kernel_ocl_zero_buffer(ccl_global float4 *buffer, ulong size, ulong offset)
+{
+ size_t i = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0);
+
+ if(i < size / sizeof(float4)) {
+ buffer[i+offset/sizeof(float4)] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ else if(i == size / sizeof(float4)) {
+ ccl_global uchar *b = (ccl_global uchar*)&buffer[i+offset/sizeof(float4)];
+
+ for(i = 0; i < size % sizeof(float4); i++) {
+ *(b++) = 0;
+ }
+ }
+}
+
#endif /* __COMPILE_ONLY_MEGAKERNEL__ */
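
kernel_ocl_zero_buffer() clears the buffer in float4-sized chunks, with the single work-item just past the last full chunk sweeping the remaining tail bytes. A CPU replay of that indexing under an assumed 40-byte size, so 2 full chunks plus an 8-byte tail:

#include <cstdio>
#include <cstring>

struct float4 { float x, y, z, w; };

int main()
{
    const size_t size = 40;                 /* bytes to zero (not 16-aligned) */
    alignas(16) unsigned char storage[64];
    memset(storage, 0xff, sizeof(storage)); /* poison to make the sweep visible */
    float4 *buffer = (float4*)storage;

    size_t chunks = size / sizeof(float4);  /* 2 full float4 chunks */
    for(size_t i = 0; i <= chunks; i++) {
        if(i < chunks) {
            buffer[i] = float4{0.0f, 0.0f, 0.0f, 0.0f};
        }
        else {
            /* Last "work-item": clear the 8 tail bytes one by one. */
            unsigned char *b = (unsigned char*)&buffer[i];
            for(size_t j = 0; j < size % sizeof(float4); j++)
                *(b++) = 0;
        }
    }

    /* Bytes 0..39 are zeroed; 40..63 keep the 0xff poison. */
    for(size_t i = 0; i < sizeof(storage); i++)
        printf("%02x%s", storage[i], (i + 1) % 16 ? " " : "\n");
    return 0;
}
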
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl b/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl
index 1914d241eb1..47e363f6e03 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl
@@ -14,112 +14,13 @@
* limitations under the License.
*/
+#include "kernel_compat_opencl.h"
+#include "split/kernel_split_common.h"
#include "split/kernel_background_buffer_update.h"
__kernel void kernel_ocl_path_trace_background_buffer_update(
- ccl_global char *kg,
- ccl_constant KernelData *data,
- ccl_global float *per_sample_output_buffers,
- ccl_global uint *rng_state,
- ccl_global uint *rng_coop, /* Required for buffer Update */
- ccl_global float3 *throughput_coop, /* Required for background hit processing */
- PathRadiance *PathRadiance_coop, /* Required for background hit processing and buffer Update */
- ccl_global Ray *Ray_coop, /* Required for background hit processing */
- ccl_global PathState *PathState_coop, /* Required for background hit processing */
- ccl_global float *L_transparent_coop, /* Required for background hit processing and buffer Update */
- ccl_global char *ray_state, /* Stores information on the current state of a ray */
- int sw, int sh, int sx, int sy, int stride,
- int rng_state_offset_x,
- int rng_state_offset_y,
- int rng_state_stride,
- ccl_global unsigned int *work_array, /* Denotes work of each ray */
- ccl_global int *Queue_data, /* Queues memory */
- ccl_global int *Queue_index, /* Tracks the number of elements in each queue */
- int queuesize, /* Size (capacity) of each queue */
- int end_sample,
- int start_sample,
-#ifdef __WORK_STEALING__
- ccl_global unsigned int *work_pool_wgs,
- unsigned int num_samples,
-#endif
-#ifdef __KERNEL_DEBUG__
- DebugData *debugdata_coop,
-#endif
- int parallel_samples) /* Number of samples to be processed in parallel */
+ KernelGlobals *kg,
+ ccl_constant KernelData *data)
{
- ccl_local unsigned int local_queue_atomics;
- if(get_local_id(0) == 0 && get_local_id(1) == 0) {
- local_queue_atomics = 0;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
- if(ray_index == 0) {
- /* We will empty this queue in this kernel. */
- Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
- }
- char enqueue_flag = 0;
- ray_index = get_ray_index(ray_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- Queue_data,
- queuesize,
- 1);
-
-#ifdef __COMPUTE_DEVICE_GPU__
- /* If we are executing on a GPU device, we exit all threads that are not
- * required.
- *
- * If we are executing on a CPU device, then we need to keep all threads
- * active since we have barrier() calls later in the kernel. CPU devices,
- * expect all threads to execute barrier statement.
- */
- if(ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
-#endif
-
-#ifndef __COMPUTE_DEVICE_GPU__
- if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
- enqueue_flag =
- kernel_background_buffer_update((KernelGlobals *)kg,
- per_sample_output_buffers,
- rng_state,
- rng_coop,
- throughput_coop,
- PathRadiance_coop,
- Ray_coop,
- PathState_coop,
- L_transparent_coop,
- ray_state,
- sw, sh, sx, sy, stride,
- rng_state_offset_x,
- rng_state_offset_y,
- rng_state_stride,
- work_array,
- end_sample,
- start_sample,
-#ifdef __WORK_STEALING__
- work_pool_wgs,
- num_samples,
-#endif
-#ifdef __KERNEL_DEBUG__
- debugdata_coop,
-#endif
- parallel_samples,
- ray_index);
-#ifndef __COMPUTE_DEVICE_GPU__
- }
-#endif
-
- /* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS;
- * These rays will be made active during next SceneIntersectkernel.
- */
- enqueue_ray_index_local(ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- enqueue_flag,
- queuesize,
- &local_queue_atomics,
- Queue_data,
- Queue_index);
+ kernel_background_buffer_update(kg);
}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
index 18139687eab..1e3c4fa28c7 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
@@ -14,77 +14,49 @@
* limitations under the License.
*/
+#include "kernel_compat_opencl.h"
+#include "split/kernel_split_common.h"
#include "split/kernel_data_init.h"
__kernel void kernel_ocl_path_trace_data_init(
- ccl_global char *globals,
- ccl_global char *sd_DL_shadow,
+ KernelGlobals *kg,
ccl_constant KernelData *data,
- ccl_global float *per_sample_output_buffers,
+ ccl_global void *split_data_buffer,
+ int num_elements,
+ ccl_global char *ray_state,
ccl_global uint *rng_state,
- ccl_global uint *rng_coop, /* rng array to store rng values for all rays */
- ccl_global float3 *throughput_coop, /* throughput array to store throughput values for all rays */
- ccl_global float *L_transparent_coop, /* L_transparent array to store L_transparent values for all rays */
- PathRadiance *PathRadiance_coop, /* PathRadiance array to store PathRadiance values for all rays */
- ccl_global Ray *Ray_coop, /* Ray array to store Ray information for all rays */
- ccl_global PathState *PathState_coop, /* PathState array to store PathState information for all rays */
- Intersection *Intersection_coop_shadow,
- ccl_global char *ray_state, /* Stores information on current state of a ray */
#define KERNEL_TEX(type, ttype, name) \
ccl_global type *name,
#include "../../kernel_textures.h"
- int start_sample, int sx, int sy, int sw, int sh, int offset, int stride,
- int rng_state_offset_x,
- int rng_state_offset_y,
- int rng_state_stride,
- ccl_global int *Queue_data, /* Memory for queues */
+ int start_sample,
+ int end_sample,
+ int sx, int sy, int sw, int sh, int offset, int stride,
ccl_global int *Queue_index, /* Tracks the number of elements in queues */
int queuesize, /* size (capacity) of the queue */
ccl_global char *use_queues_flag, /* flag to decide if scene-intersect kernel should use queues to fetch ray index */
- ccl_global unsigned int *work_array, /* work array to store which work each ray belongs to */
-#ifdef __WORK_STEALING__
ccl_global unsigned int *work_pool_wgs, /* Work pool for each work group */
unsigned int num_samples, /* Total number of samples per pixel */
-#endif
-#ifdef __KERNEL_DEBUG__
- DebugData *debugdata_coop,
-#endif
- int parallel_samples) /* Number of samples to be processed in parallel */
+ ccl_global float *buffer)
{
- kernel_data_init((KernelGlobals *)globals,
- (ShaderData *)sd_DL_shadow,
+ kernel_data_init(kg,
data,
- per_sample_output_buffers,
- rng_state,
- rng_coop,
- throughput_coop,
- L_transparent_coop,
- PathRadiance_coop,
- Ray_coop,
- PathState_coop,
- Intersection_coop_shadow,
+ split_data_buffer,
+ num_elements,
ray_state,
+ rng_state,
#define KERNEL_TEX(type, ttype, name) name,
#include "../../kernel_textures.h"
- start_sample, sx, sy, sw, sh, offset, stride,
- rng_state_offset_x,
- rng_state_offset_y,
- rng_state_stride,
- Queue_data,
+ start_sample,
+ end_sample,
+ sx, sy, sw, sh, offset, stride,
Queue_index,
queuesize,
use_queues_flag,
- work_array,
-#ifdef __WORK_STEALING__
work_pool_wgs,
num_samples,
-#endif
-#ifdef __KERNEL_DEBUG__
- debugdata_coop,
-#endif
- parallel_samples);
+ buffer);
}
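
The #define KERNEL_TEX(...) / #include "../../kernel_textures.h" pairs above are an X-macro: one central texture list expands into both the kernel's parameter list and the matching argument list, so the two can never drift apart. A toy C++ illustration of the pattern; the texture names here are invented:

#include <cstdio>

/* Stand-in for kernel_textures.h: the single source of truth. */
#define ALL_TEXTURES \
    KERNEL_TEX(float, tex_a) \
    KERNEL_TEX(int, tex_b)

/* Expansion 1: a struct holding one pointer per texture. */
struct Textures {
#define KERNEL_TEX(type, name) type *name;
    ALL_TEXTURES
#undef KERNEL_TEX
};

/* Expansion 2: code touching every texture without listing them by hand. */
void print_all(const Textures &t)
{
#define KERNEL_TEX(type, name) \
    printf(#name " -> %p\n", (const void*)t.name);
    ALL_TEXTURES
#undef KERNEL_TEX
}

int main()
{
    float a[4] = {0};
    int b[4] = {0};
    Textures t = {a, b};
    print_all(t);
    return 0;
}
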
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
index c6a2c8d050c..5d2f46b319d 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
@@ -14,74 +14,13 @@
* limitations under the License.
*/
+#include "kernel_compat_opencl.h"
+#include "split/kernel_split_common.h"
#include "split/kernel_direct_lighting.h"
__kernel void kernel_ocl_path_trace_direct_lighting(
- ccl_global char *kg,
- ccl_constant KernelData *data,
- ccl_global char *sd, /* Required for direct lighting */
- ccl_global uint *rng_coop, /* Required for direct lighting */
- ccl_global PathState *PathState_coop, /* Required for direct lighting */
- ccl_global int *ISLamp_coop, /* Required for direct lighting */
- ccl_global Ray *LightRay_coop, /* Required for direct lighting */
- ccl_global BsdfEval *BSDFEval_coop, /* Required for direct lighting */
- ccl_global char *ray_state, /* Denotes the state of each ray */
- ccl_global int *Queue_data, /* Queue memory */
- ccl_global int *Queue_index, /* Tracks the number of elements in each queue */
- int queuesize) /* Size (capacity) of each queue */
+ KernelGlobals *kg,
+ ccl_constant KernelData *data)
{
- ccl_local unsigned int local_queue_atomics;
- if(get_local_id(0) == 0 && get_local_id(1) == 0) {
- local_queue_atomics = 0;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- char enqueue_flag = 0;
- int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
- ray_index = get_ray_index(ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- Queue_data,
- queuesize,
- 0);
-
-#ifdef __COMPUTE_DEVICE_GPU__
- /* If we are executing on a GPU device, we exit all threads that are not
- * required.
- *
- * If we are executing on a CPU device, then we need to keep all threads
- * active since we have barrier() calls later in the kernel. CPU devices,
- * expect all threads to execute barrier statement.
- */
- if(ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
-#endif
-
-#ifndef __COMPUTE_DEVICE_GPU__
- if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
- enqueue_flag = kernel_direct_lighting((KernelGlobals *)kg,
- (ShaderData *)sd,
- rng_coop,
- PathState_coop,
- ISLamp_coop,
- LightRay_coop,
- BSDFEval_coop,
- ray_state,
- ray_index);
-
-#ifndef __COMPUTE_DEVICE_GPU__
- }
-#endif
-
-#ifdef __EMISSION__
- /* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */
- enqueue_ray_index_local(ray_index,
- QUEUE_SHADOW_RAY_CAST_DL_RAYS,
- enqueue_flag,
- queuesize,
- &local_queue_atomics,
- Queue_data,
- Queue_index);
-#endif
+ kernel_direct_lighting(kg);
}
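
Every wrapper in this patch follows the pattern above: the long per-resource argument list collapses to a KernelGlobals pointer plus the constant KernelData, with all per-ray state reached through the split state buffer instead. As a minimal host-side sketch of what that buys (handle names here are hypothetical; only the two-argument kernel signature comes from the patch), any of these split kernels can now be bound and launched by a single helper:

#include <CL/cl.h>

/* Sketch only, not patch code: kgbuffer and kdata are assumed cl_mem
 * handles for the device-side KernelGlobals and the constant KernelData. */
static void enqueue_split_kernel(cl_command_queue queue,
                                 cl_kernel kernel,
                                 cl_mem kgbuffer,
                                 cl_mem kdata,
                                 const size_t global_size[2],
                                 const size_t local_size[2])
{
	clSetKernelArg(kernel, 0, sizeof(cl_mem), &kgbuffer);
	clSetKernelArg(kernel, 1, sizeof(cl_mem), &kdata);
	clEnqueueNDRangeKernel(queue, kernel, 2, NULL,
	                       global_size, local_size, 0, NULL, NULL);
}
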
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
index e063614da1a..7724b8a0bdf 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
@@ -14,110 +14,13 @@
* limitations under the License.
*/
+#include "kernel_compat_opencl.h"
+#include "split/kernel_split_common.h"
#include "split/kernel_holdout_emission_blurring_pathtermination_ao.h"
__kernel void kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao(
- ccl_global char *kg,
- ccl_constant KernelData *data,
- ccl_global char *sd, /* Required throughout the kernel except probabilistic path termination and AO */
- ccl_global float *per_sample_output_buffers,
- ccl_global uint *rng_coop, /* Required for "kernel_write_data_passes" and AO */
- ccl_global float3 *throughput_coop, /* Required for handling holdout material and AO */
- ccl_global float *L_transparent_coop, /* Required for handling holdout material */
- PathRadiance *PathRadiance_coop, /* Required for "kernel_write_data_passes" and indirect primitive emission */
- ccl_global PathState *PathState_coop, /* Required throughout the kernel and AO */
- Intersection *Intersection_coop, /* Required for indirect primitive emission */
- ccl_global float3 *AOAlpha_coop, /* Required for AO */
- ccl_global float3 *AOBSDF_coop, /* Required for AO */
- ccl_global Ray *AOLightRay_coop, /* Required for AO */
- int sw, int sh, int sx, int sy, int stride,
- ccl_global char *ray_state, /* Denotes the state of each ray */
- ccl_global unsigned int *work_array, /* Denotes the work that each ray belongs to */
- ccl_global int *Queue_data, /* Queue memory */
- ccl_global int *Queue_index, /* Tracks the number of elements in each queue */
- int queuesize, /* Size (capacity) of each queue */
-#ifdef __WORK_STEALING__
- unsigned int start_sample,
-#endif
- int parallel_samples) /* Number of samples to be processed in parallel */
+ KernelGlobals *kg,
+ ccl_constant KernelData *data)
{
- ccl_local unsigned int local_queue_atomics_bg;
- ccl_local unsigned int local_queue_atomics_ao;
- if(get_local_id(0) == 0 && get_local_id(1) == 0) {
- local_queue_atomics_bg = 0;
- local_queue_atomics_ao = 0;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- char enqueue_flag = 0;
- char enqueue_flag_AO_SHADOW_RAY_CAST = 0;
- int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
- ray_index = get_ray_index(ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- Queue_data,
- queuesize,
- 0);
-
-#ifdef __COMPUTE_DEVICE_GPU__
- /* If we are executing on a GPU device, we exit all threads that are not
- * required.
- *
- * If we are executing on a CPU device, then we need to keep all threads
- * active since we have barrier() calls later in the kernel. CPU devices,
- * expect all threads to execute barrier statement.
- */
- if(ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
-#endif /* __COMPUTE_DEVICE_GPU__ */
-
-#ifndef __COMPUTE_DEVICE_GPU__
- if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
- kernel_holdout_emission_blurring_pathtermination_ao(
- (KernelGlobals *)kg,
- (ShaderData *)sd,
- per_sample_output_buffers,
- rng_coop,
- throughput_coop,
- L_transparent_coop,
- PathRadiance_coop,
- PathState_coop,
- Intersection_coop,
- AOAlpha_coop,
- AOBSDF_coop,
- AOLightRay_coop,
- sw, sh, sx, sy, stride,
- ray_state,
- work_array,
-#ifdef __WORK_STEALING__
- start_sample,
-#endif
- parallel_samples,
- ray_index,
- &enqueue_flag,
- &enqueue_flag_AO_SHADOW_RAY_CAST);
-#ifndef __COMPUTE_DEVICE_GPU__
- }
-#endif
-
- /* Enqueue RAY_UPDATE_BUFFER rays. */
- enqueue_ray_index_local(ray_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- enqueue_flag,
- queuesize,
- &local_queue_atomics_bg,
- Queue_data,
- Queue_index);
-
-#ifdef __AO__
- /* Enqueue to-shadow-ray-cast rays. */
- enqueue_ray_index_local(ray_index,
- QUEUE_SHADOW_RAY_CAST_AO_RAYS,
- enqueue_flag_AO_SHADOW_RAY_CAST,
- queuesize,
- &local_queue_atomics_ao,
- Queue_data,
- Queue_index);
-#endif
+ kernel_holdout_emission_blurring_pathtermination_ao(kg);
}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
index 267bddc2ffc..2b84d0ea43e 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
@@ -14,67 +14,13 @@
* limitations under the License.
*/
+#include "kernel_compat_opencl.h"
+#include "split/kernel_split_common.h"
#include "split/kernel_lamp_emission.h"
__kernel void kernel_ocl_path_trace_lamp_emission(
- ccl_global char *kg,
- ccl_constant KernelData *data,
- ccl_global float3 *throughput_coop, /* Required for lamp emission */
- PathRadiance *PathRadiance_coop, /* Required for lamp emission */
- ccl_global Ray *Ray_coop, /* Required for lamp emission */
- ccl_global PathState *PathState_coop, /* Required for lamp emission */
- Intersection *Intersection_coop, /* Required for lamp emission */
- ccl_global char *ray_state, /* Denotes the state of each ray */
- int sw, int sh,
- ccl_global int *Queue_data, /* Memory for queues */
- ccl_global int *Queue_index, /* Tracks the number of elements in queues */
- int queuesize, /* Size (capacity) of queues */
- ccl_global char *use_queues_flag, /* Used to decide if this kernel should use
- * queues to fetch ray index
- */
- int parallel_samples) /* Number of samples to be processed in parallel */
+ KernelGlobals *kg,
+ ccl_constant KernelData *data)
{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- /* We will empty this queue in this kernel. */
- if(get_global_id(0) == 0 && get_global_id(1) == 0) {
- Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
- }
- /* Fetch use_queues_flag. */
- ccl_local char local_use_queues_flag;
- if(get_local_id(0) == 0 && get_local_id(1) == 0) {
- local_use_queues_flag = use_queues_flag[0];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- int ray_index;
- if(local_use_queues_flag) {
- int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
- ray_index = get_ray_index(thread_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- Queue_data,
- queuesize,
- 1);
- if(ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
- } else {
- if(x < (sw * parallel_samples) && y < sh) {
- ray_index = x + y * (sw * parallel_samples);
- } else {
- return;
- }
- }
-
- kernel_lamp_emission((KernelGlobals *)kg,
- throughput_coop,
- PathRadiance_coop,
- Ray_coop,
- PathState_coop,
- Intersection_coop,
- ray_state,
- sw, sh,
- use_queues_flag,
- ray_index);
+ kernel_lamp_emission(kg);
}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
index 6d49b6294a8..e87e367fb9c 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
@@ -14,101 +14,13 @@
* limitations under the License.
*/
+#include "kernel_compat_opencl.h"
+#include "split/kernel_split_common.h"
#include "split/kernel_next_iteration_setup.h"
__kernel void kernel_ocl_path_trace_next_iteration_setup(
- ccl_global char *kg,
- ccl_constant KernelData *data,
- ccl_global char *sd, /* Required for setting up ray for next iteration */
- ccl_global uint *rng_coop, /* Required for setting up ray for next iteration */
- ccl_global float3 *throughput_coop, /* Required for setting up ray for next iteration */
- PathRadiance *PathRadiance_coop, /* Required for setting up ray for next iteration */
- ccl_global Ray *Ray_coop, /* Required for setting up ray for next iteration */
- ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */
- ccl_global Ray *LightRay_dl_coop, /* Required for radiance update - direct lighting */
- ccl_global int *ISLamp_coop, /* Required for radiance update - direct lighting */
- ccl_global BsdfEval *BSDFEval_coop, /* Required for radiance update - direct lighting */
- ccl_global Ray *LightRay_ao_coop, /* Required for radiance update - AO */
- ccl_global float3 *AOBSDF_coop, /* Required for radiance update - AO */
- ccl_global float3 *AOAlpha_coop, /* Required for radiance update - AO */
- ccl_global char *ray_state, /* Denotes the state of each ray */
- ccl_global int *Queue_data, /* Queue memory */
- ccl_global int *Queue_index, /* Tracks the number of elements in each queue */
- int queuesize, /* Size (capacity) of each queue */
- ccl_global char *use_queues_flag) /* flag to decide if scene_intersect kernel should
- * use queues to fetch ray index */
+ KernelGlobals *kg,
+ ccl_constant KernelData *data)
{
- ccl_local unsigned int local_queue_atomics;
- if(get_local_id(0) == 0 && get_local_id(1) == 0) {
- local_queue_atomics = 0;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if(get_global_id(0) == 0 && get_global_id(1) == 0) {
- /* If we are here, then it means that scene-intersect kernel
- * has already been executed atleast once. From the next time,
- * scene-intersect kernel may operate on queues to fetch ray index
- */
- use_queues_flag[0] = 1;
-
- /* Mark queue indices of QUEUE_SHADOW_RAY_CAST_AO_RAYS and
- * QUEUE_SHADOW_RAY_CAST_DL_RAYS queues that were made empty during the
- * previous kernel.
- */
- Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
- Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
- }
-
- char enqueue_flag = 0;
- int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
- ray_index = get_ray_index(ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- Queue_data,
- queuesize,
- 0);
-
-#ifdef __COMPUTE_DEVICE_GPU__
- /* If we are executing on a GPU device, we exit all threads that are not
- * required.
- *
- * If we are executing on a CPU device, then we need to keep all threads
- * active since we have barrier() calls later in the kernel. CPU devices,
- * expect all threads to execute barrier statement.
- */
- if(ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
-#endif
-
-#ifndef __COMPUTE_DEVICE_GPU__
- if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
- enqueue_flag = kernel_next_iteration_setup((KernelGlobals *)kg,
- (ShaderData *)sd,
- rng_coop,
- throughput_coop,
- PathRadiance_coop,
- Ray_coop,
- PathState_coop,
- LightRay_dl_coop,
- ISLamp_coop,
- BSDFEval_coop,
- LightRay_ao_coop,
- AOBSDF_coop,
- AOAlpha_coop,
- ray_state,
- use_queues_flag,
- ray_index);
-#ifndef __COMPUTE_DEVICE_GPU__
- }
-#endif
-
- /* Enqueue RAY_UPDATE_BUFFER rays. */
- enqueue_ray_index_local(ray_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- enqueue_flag,
- queuesize,
- &local_queue_atomics,
- Queue_data,
- Queue_index);
+ kernel_next_iteration_setup(kg);
}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl
new file mode 100644
index 00000000000..7e9e4a02529
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_compat_opencl.h"
+#include "split/kernel_split_common.h"
+#include "split/kernel_path_init.h"
+
+__kernel void kernel_ocl_path_trace_path_init(
+ KernelGlobals *kg,
+ ccl_constant KernelData *data)
+{
+ kernel_path_init(kg);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
index 3156dc255fb..9ceb6a5c3d8 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
@@ -14,93 +14,13 @@
* limitations under the License.
*/
-#include "../../kernel_compat_opencl.h"
-#include "../../kernel_math.h"
-#include "../../kernel_types.h"
-#include "../../kernel_globals.h"
-#include "../../kernel_queues.h"
+#include "kernel_compat_opencl.h"
+#include "split/kernel_split_common.h"
+#include "split/kernel_queue_enqueue.h"
-/*
- * The kernel "kernel_queue_enqueue" enqueues rays of
- * different ray state into their appropriate Queues;
- * 1. Rays that have been determined to hit the background from the
- * "kernel_scene_intersect" kernel
- * are enqueued in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
- * 2. Rays that have been determined to be actively participating in path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS.
- *
- * The input and output of the kernel is as follows,
- *
- * ray_state -------------------------------------------|--- kernel_queue_enqueue --|--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS)
- * Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------| |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS)
- * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---| |
- * queuesize -------------------------------------------| |
- *
- * Note on Queues :
- * State of queues during the first time this kernel is called :
- * At entry,
- * Both QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
- * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays
- * QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_HIT_BACKGROUND rays.
- *
- * State of queue during other times this kernel is called :
- * At entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty.
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
- * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays.
- */
__kernel void kernel_ocl_path_trace_queue_enqueue(
- ccl_global int *Queue_data, /* Queue memory */
- ccl_global int *Queue_index, /* Tracks the number of elements in each queue */
- ccl_global char *ray_state, /* Denotes the state of each ray */
- int queuesize) /* Size (capacity) of each queue */
+ KernelGlobals *kg,
+ ccl_constant KernelData *data)
{
- /* We have only 2 cases (Hit/Not-Hit) */
- ccl_local unsigned int local_queue_atomics[2];
-
- int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0);
- int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-
- if(lidx < 2 ) {
- local_queue_atomics[lidx] = 0;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- int queue_number = -1;
-
- if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
- queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
- }
- else if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS;
- }
-
- unsigned int my_lqidx;
- if(queue_number != -1) {
- my_lqidx = get_local_queue_index(queue_number, local_queue_atomics);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if(lidx == 0) {
- local_queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] =
- get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- local_queue_atomics,
- Queue_index);
- local_queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] =
- get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- local_queue_atomics,
- Queue_index);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- unsigned int my_gqidx;
- if(queue_number != -1) {
- my_gqidx = get_global_queue_index(queue_number,
- queuesize,
- my_lqidx,
- local_queue_atomics);
- Queue_data[my_gqidx] = ray_index;
- }
+ kernel_queue_enqueue(kg);
}
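
The inlined enqueue logic deleted above survives behind enqueue_ray_index_local, which the other kernels in this patch call. A sketch of the two-level scheme it implements (the signature is approximated from the call sites; the real helper lives in kernel_queues.h):

/* Sketch only: count per work-group in local memory, reserve a contiguous
 * block in the global queue with a single atomic per work-group, then
 * scatter. Assumes the caller zeroed *local_queue_atomics and issued a
 * barrier first, as every call site in this patch does. */
ccl_device void enqueue_ray_index_local_sketch(int ray_index,
                                               int queue_number,
                                               char enqueue_flag,
                                               int queuesize,
                                               ccl_local unsigned int *local_queue_atomics,
                                               ccl_global int *queue_data,
                                               ccl_global int *queue_index)
{
	unsigned int lqidx = 0;
	if(enqueue_flag) {
		lqidx = atomic_inc(local_queue_atomics);  /* slot within this work-group */
	}
	barrier(CLK_LOCAL_MEM_FENCE);

	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
		/* One global atomic: the old value is this work-group's base offset. */
		*local_queue_atomics = atomic_add(&queue_index[queue_number],
		                                  *local_queue_atomics);
	}
	barrier(CLK_LOCAL_MEM_FENCE);

	if(enqueue_flag) {
		queue_data[queue_number * queuesize + *local_queue_atomics + lqidx] = ray_index;
	}
}
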
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
index 7f3f433c7a6..4e083e87d1c 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
@@ -14,67 +14,13 @@
* limitations under the License.
*/
+#include "kernel_compat_opencl.h"
+#include "split/kernel_split_common.h"
#include "split/kernel_scene_intersect.h"
__kernel void kernel_ocl_path_trace_scene_intersect(
- ccl_global char *kg,
- ccl_constant KernelData *data,
- ccl_global uint *rng_coop,
- ccl_global Ray *Ray_coop, /* Required for scene_intersect */
- ccl_global PathState *PathState_coop, /* Required for scene_intersect */
- Intersection *Intersection_coop, /* Required for scene_intersect */
- ccl_global char *ray_state, /* Denotes the state of each ray */
- int sw, int sh,
- ccl_global int *Queue_data, /* Memory for queues */
- ccl_global int *Queue_index, /* Tracks the number of elements in queues */
- int queuesize, /* Size (capacity) of queues */
- ccl_global char *use_queues_flag, /* used to decide if this kernel should use
- * queues to fetch ray index */
-#ifdef __KERNEL_DEBUG__
- DebugData *debugdata_coop,
-#endif
- int parallel_samples) /* Number of samples to be processed in parallel */
+ KernelGlobals *kg,
+ ccl_constant KernelData *data)
{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- /* Fetch use_queues_flag */
- ccl_local char local_use_queues_flag;
- if(get_local_id(0) == 0 && get_local_id(1) == 0) {
- local_use_queues_flag = use_queues_flag[0];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- int ray_index;
- if(local_use_queues_flag) {
- int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
- ray_index = get_ray_index(thread_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- Queue_data,
- queuesize,
- 0);
-
- if(ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
- } else {
- if(x < (sw * parallel_samples) && y < sh) {
- ray_index = x + y * (sw * parallel_samples);
- } else {
- return;
- }
- }
-
- kernel_scene_intersect((KernelGlobals *)kg,
- rng_coop,
- Ray_coop,
- PathState_coop,
- Intersection_coop,
- ray_state,
- sw, sh,
- use_queues_flag,
-#ifdef __KERNEL_DEBUG__
- debugdata_coop,
-#endif
- ray_index);
+ kernel_scene_intersect(kg);
}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
index c37856c8f30..a2b48b15928 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
@@ -14,55 +14,13 @@
* limitations under the License.
*/
+#include "kernel_compat_opencl.h"
+#include "split/kernel_split_common.h"
#include "split/kernel_shader_eval.h"
__kernel void kernel_ocl_path_trace_shader_eval(
- ccl_global char *kg,
- ccl_constant KernelData *data,
- ccl_global char *sd, /* Output ShaderData structure to be filled */
- ccl_global uint *rng_coop, /* Required for rbsdf calculation */
- ccl_global Ray *Ray_coop, /* Required for setting up shader from ray */
- ccl_global PathState *PathState_coop, /* Required for all functions in this kernel */
- Intersection *Intersection_coop, /* Required for setting up shader from ray */
- ccl_global char *ray_state, /* Denotes the state of each ray */
- ccl_global int *Queue_data, /* queue memory */
- ccl_global int *Queue_index, /* Tracks the number of elements in each queue */
- int queuesize) /* Size (capacity) of each queue */
+ KernelGlobals *kg,
+ ccl_constant KernelData *data)
{
- /* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */
- ccl_local unsigned int local_queue_atomics;
- if(get_local_id(0) == 0 && get_local_id(1) == 0) {
- local_queue_atomics = 0;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
- ray_index = get_ray_index(ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- Queue_data,
- queuesize,
- 0);
-
- if(ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
-
- char enqueue_flag = (IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0;
- enqueue_ray_index_local(ray_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- enqueue_flag,
- queuesize,
- &local_queue_atomics,
- Queue_data,
- Queue_index);
-
- /* Continue on with shader evaluation. */
- kernel_shader_eval((KernelGlobals *)kg,
- (ShaderData *)sd,
- rng_coop,
- Ray_coop,
- PathState_coop,
- Intersection_coop,
- ray_state,
- ray_index);
+ kernel_shader_eval(kg);
}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl
index edf76fba714..3693f7f9c9d 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl
@@ -14,52 +14,13 @@
* limitations under the License.
*/
+#include "kernel_compat_opencl.h"
+#include "split/kernel_split_common.h"
#include "split/kernel_shadow_blocked.h"
__kernel void kernel_ocl_path_trace_shadow_blocked(
- ccl_global char *kg,
- ccl_constant KernelData *data,
- ccl_global PathState *PathState_coop, /* Required for shadow blocked */
- ccl_global Ray *LightRay_dl_coop, /* Required for direct lighting's shadow blocked */
- ccl_global Ray *LightRay_ao_coop, /* Required for AO's shadow blocked */
- ccl_global char *ray_state,
- ccl_global int *Queue_data, /* Queue memory */
- ccl_global int *Queue_index, /* Tracks the number of elements in each queue */
- int queuesize) /* Size (capacity) of each queue */
+ KernelGlobals *kg,
+ ccl_constant KernelData *data)
{
- int lidx = get_local_id(1) * get_local_id(0) + get_local_id(0);
-
- ccl_local unsigned int ao_queue_length;
- ccl_local unsigned int dl_queue_length;
- if(lidx == 0) {
- ao_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS];
- dl_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- /* flag determining if the current ray is to process shadow ray for AO or DL */
- char shadow_blocked_type = -1;
-
- int ray_index = QUEUE_EMPTY_SLOT;
- int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
- if(thread_index < ao_queue_length + dl_queue_length) {
- if(thread_index < ao_queue_length) {
- ray_index = get_ray_index(thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, Queue_data, queuesize, 1);
- shadow_blocked_type = RAY_SHADOW_RAY_CAST_AO;
- } else {
- ray_index = get_ray_index(thread_index - ao_queue_length, QUEUE_SHADOW_RAY_CAST_DL_RAYS, Queue_data, queuesize, 1);
- shadow_blocked_type = RAY_SHADOW_RAY_CAST_DL;
- }
- }
-
- if(ray_index == QUEUE_EMPTY_SLOT)
- return;
-
- kernel_shadow_blocked((KernelGlobals *)kg,
- PathState_coop,
- LightRay_dl_coop,
- LightRay_ao_coop,
- ray_state,
- shadow_blocked_type,
- ray_index);
+ kernel_shadow_blocked(kg);
}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl b/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl
new file mode 100644
index 00000000000..0a1843ff8bd
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_compat_opencl.h"
+#include "split/kernel_split_common.h"
+
+__kernel void kernel_ocl_path_trace_state_buffer_size(
+ KernelGlobals *kg,
+ ccl_constant KernelData *data,
+ uint num_threads,
+ ccl_global uint *size)
+{
+ kg->data = data;
+ *size = split_data_buffer_size(kg, num_threads);
+}
+
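
This new kernel lets the host size the per-ray split state allocation before launching anything else. A hedged host-side sketch of the usage (handles and helper name are hypothetical; only the argument order comes from the kernel above): run a single work item, read back the byte count, then allocate.

/* Sketch only, not patch code. */
static cl_uint query_split_state_size(cl_command_queue queue,
                                      cl_kernel size_kernel,
                                      cl_mem kgbuffer,
                                      cl_mem kdata,
                                      cl_uint num_threads,
                                      cl_mem size_buffer)
{
	size_t one = 1;
	cl_uint size = 0;
	clSetKernelArg(size_kernel, 0, sizeof(cl_mem), &kgbuffer);
	clSetKernelArg(size_kernel, 1, sizeof(cl_mem), &kdata);
	clSetKernelArg(size_kernel, 2, sizeof(cl_uint), &num_threads);
	clSetKernelArg(size_kernel, 3, sizeof(cl_mem), &size_buffer);
	clEnqueueNDRangeKernel(queue, size_kernel, 1, NULL, &one, NULL,
	                       0, NULL, NULL);
	clEnqueueReadBuffer(queue, size_buffer, CL_TRUE, 0, sizeof(cl_uint),
	                    &size, 0, NULL, NULL);
	return size;
}
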
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl b/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl
deleted file mode 100644
index 88a1ed830af..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "split/kernel_sum_all_radiance.h"
-
-__kernel void kernel_ocl_path_trace_sum_all_radiance(
- ccl_constant KernelData *data, /* To get pass_stride to offet into buffer */
- ccl_global float *buffer, /* Output buffer of RenderTile */
- ccl_global float *per_sample_output_buffer, /* Radiance contributed by all samples */
- int parallel_samples, int sw, int sh, int stride,
- int buffer_offset_x,
- int buffer_offset_y,
- int buffer_stride,
- int start_sample)
-{
- kernel_sum_all_radiance(data,
- buffer,
- per_sample_output_buffer,
- parallel_samples,
- sw, sh, stride,
- buffer_offset_x,
- buffer_offset_y,
- buffer_stride,
- start_sample);
-}
diff --git a/intern/cycles/kernel/osl/osl_bssrdf.cpp b/intern/cycles/kernel/osl/osl_bssrdf.cpp
index 3614717e28c..d3a69d39597 100644
--- a/intern/cycles/kernel/osl/osl_bssrdf.cpp
+++ b/intern/cycles/kernel/osl/osl_bssrdf.cpp
@@ -78,7 +78,7 @@ public:
bssrdf->albedo = albedo.x;
bssrdf->sharpness = sharpness;
bssrdf->N = params.N;
- ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type);
+ sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type);
}
bssrdf = bssrdf_alloc(sd, make_float3(0.0f, weight.y, 0.0f));
@@ -89,7 +89,7 @@ public:
bssrdf->albedo = albedo.y;
bssrdf->sharpness = sharpness;
bssrdf->N = params.N;
- ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type);
+ sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type);
}
bssrdf = bssrdf_alloc(sd, make_float3(0.0f, 0.0f, weight.z));
@@ -100,7 +100,7 @@ public:
bssrdf->albedo = albedo.z;
bssrdf->sharpness = sharpness;
bssrdf->N = params.N;
- ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type);
+ sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type);
}
}
}
diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp
index 94de782dca0..fe61587d179 100644
--- a/intern/cycles/kernel/osl/osl_closures.cpp
+++ b/intern/cycles/kernel/osl/osl_closures.cpp
@@ -42,6 +42,7 @@
#include "kernel_types.h"
#include "kernel_compat_cpu.h"
+#include "split/kernel_split_data_types.h"
#include "kernel_globals.h"
#include "kernel_montecarlo.h"
#include "kernel_random.h"
diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp
index 58bbdc33920..b08353e82d1 100644
--- a/intern/cycles/kernel/osl/osl_services.cpp
+++ b/intern/cycles/kernel/osl/osl_services.cpp
@@ -39,6 +39,7 @@
#include "util_string.h"
#include "kernel_compat_cpu.h"
+#include "split/kernel_split_data_types.h"
#include "kernel_globals.h"
#include "kernel_random.h"
#include "kernel_projection.h"
diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp
index 0d762bbdb38..c7e9f57b18a 100644
--- a/intern/cycles/kernel/osl/osl_shader.cpp
+++ b/intern/cycles/kernel/osl/osl_shader.cpp
@@ -19,6 +19,7 @@
#include "kernel_compat_cpu.h"
#include "kernel_montecarlo.h"
#include "kernel_types.h"
+#include "split/kernel_split_data_types.h"
#include "kernel_globals.h"
#include "geom/geom_object.h"
diff --git a/intern/cycles/kernel/split/kernel_background_buffer_update.h b/intern/cycles/kernel/split/kernel_background_buffer_update.h
index 9bfa71c75ef..04aaf1bbaad 100644
--- a/intern/cycles/kernel/split/kernel_background_buffer_update.h
+++ b/intern/cycles/kernel/split/kernel_background_buffer_update.h
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
/* Note on kernel_background_buffer_update kernel.
* This is the fourth kernel in the ray tracing logic, and the third
@@ -69,80 +69,77 @@
* QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
* QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty
*/
-ccl_device char kernel_background_buffer_update(
- KernelGlobals *kg,
- ccl_global float *per_sample_output_buffers,
- ccl_global uint *rng_state,
- ccl_global uint *rng_coop, /* Required for buffer Update */
- ccl_global float3 *throughput_coop, /* Required for background hit processing */
- PathRadiance *PathRadiance_coop, /* Required for background hit processing and buffer Update */
- ccl_global Ray *Ray_coop, /* Required for background hit processing */
- ccl_global PathState *PathState_coop, /* Required for background hit processing */
- ccl_global float *L_transparent_coop, /* Required for background hit processing and buffer Update */
- ccl_global char *ray_state, /* Stores information on the current state of a ray */
- int sw, int sh, int sx, int sy, int stride,
- int rng_state_offset_x,
- int rng_state_offset_y,
- int rng_state_stride,
- ccl_global unsigned int *work_array, /* Denotes work of each ray */
- int end_sample,
- int start_sample,
-#ifdef __WORK_STEALING__
- ccl_global unsigned int *work_pool_wgs,
- unsigned int num_samples,
-#endif
-#ifdef __KERNEL_DEBUG__
- DebugData *debugdata_coop,
-#endif
- int parallel_samples, /* Number of samples to be processed in parallel */
- int ray_index)
+ccl_device void kernel_background_buffer_update(KernelGlobals *kg)
{
+ ccl_local unsigned int local_queue_atomics;
+ if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+ local_queue_atomics = 0;
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ if(ray_index == 0) {
+ /* We will empty this queue in this kernel. */
+ kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
+ }
char enqueue_flag = 0;
+ ray_index = get_ray_index(kg, ray_index,
+ QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 1);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+ /* If we are executing on a GPU device, we exit all threads that are not
+ * required.
+ *
+ * If we are executing on a CPU device, then we need to keep all threads
+	 * active since we have barrier() calls later in the kernel. CPU devices
+	 * expect all threads to execute the barrier statement.
+ */
+ if(ray_index == QUEUE_EMPTY_SLOT) {
+ return;
+ }
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+ if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+
+ ccl_global uint *rng_state = kernel_split_params.rng_state;
+ int stride = kernel_split_params.stride;
+
+ ccl_global char *ray_state = kernel_split_state.ray_state;
#ifdef __KERNEL_DEBUG__
- DebugData *debug_data = &debugdata_coop[ray_index];
+ DebugData *debug_data = &kernel_split_state.debug_data[ray_index];
#endif
- ccl_global PathState *state = &PathState_coop[ray_index];
- PathRadiance *L = L = &PathRadiance_coop[ray_index];
- ccl_global Ray *ray = &Ray_coop[ray_index];
- ccl_global float3 *throughput = &throughput_coop[ray_index];
- ccl_global float *L_transparent = &L_transparent_coop[ray_index];
- ccl_global uint *rng = &rng_coop[ray_index];
-
-#ifdef __WORK_STEALING__
- unsigned int my_work;
- ccl_global float *initial_per_sample_output_buffers;
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+ ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+ ccl_global float *L_transparent = &kernel_split_state.L_transparent[ray_index];
+ ccl_global uint *rng = &kernel_split_state.rng[ray_index];
+ ccl_global float *buffer = kernel_split_params.buffer;
+
+ unsigned int work_index;
ccl_global uint *initial_rng;
-#endif
+
unsigned int sample;
unsigned int tile_x;
unsigned int tile_y;
unsigned int pixel_x;
unsigned int pixel_y;
- unsigned int my_sample_tile;
-#ifdef __WORK_STEALING__
- my_work = work_array[ray_index];
- sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
- get_pixel_tile_position(&pixel_x, &pixel_y,
+ work_index = kernel_split_state.work_array[ray_index];
+ sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
+ get_work_pixel_tile_position(kg, &pixel_x, &pixel_y,
&tile_x, &tile_y,
- my_work,
- sw, sh, sx, sy,
- parallel_samples,
+ work_index,
ray_index);
- my_sample_tile = 0;
- initial_per_sample_output_buffers = per_sample_output_buffers;
initial_rng = rng_state;
-#else /* __WORK_STEALING__ */
- sample = work_array[ray_index];
- int tile_index = ray_index / parallel_samples;
- /* buffer and rng_state's stride is "stride". Find x and y using ray_index */
- tile_x = tile_index % sw;
- tile_y = tile_index / sw;
- my_sample_tile = ray_index - (tile_index * parallel_samples);
-#endif /* __WORK_STEALING__ */
-
- rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride;
- per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
+
+ rng_state += kernel_split_params.offset + pixel_x + pixel_y*stride;
+ buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride;
if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
/* eval background shader if nothing hit */
@@ -157,7 +154,7 @@ ccl_device char kernel_background_buffer_update(
if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
#ifdef __BACKGROUND__
/* sample background shader */
- float3 L_background = indirect_background(kg, kg->sd_input, state, ray);
+ float3 L_background = indirect_background(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, ray);
path_radiance_accum_background(L, (*throughput), L_background, state->bounce);
#endif
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
@@ -166,55 +163,38 @@ ccl_device char kernel_background_buffer_update(
if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
float3 L_sum = path_radiance_clamp_and_sum(kg, L);
- kernel_write_light_passes(kg, per_sample_output_buffers, L, sample);
+ kernel_write_light_passes(kg, buffer, L, sample);
#ifdef __KERNEL_DEBUG__
- kernel_write_debug_passes(kg, per_sample_output_buffers, state, debug_data, sample);
+ kernel_write_debug_passes(kg, buffer, state, debug_data, sample);
#endif
float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent));
/* accumulate result in output buffer */
- kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad);
+ kernel_write_pass_float4(buffer, sample, L_rad);
path_rng_end(kg, rng_state, *rng);
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
}
if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
-#ifdef __WORK_STEALING__
/* We have completed current work; So get next work */
- int valid_work = get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index);
+ int valid_work = get_next_work(kg, &work_index, ray_index);
if(!valid_work) {
/* If work is invalid, this means no more work is available and the thread may exit */
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
}
-#else /* __WORK_STEALING__ */
- if((sample + parallel_samples) >= end_sample) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
- }
-#endif /* __WORK_STEALING__ */
if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
-#ifdef __WORK_STEALING__
- work_array[ray_index] = my_work;
+ kernel_split_state.work_array[ray_index] = work_index;
/* Get the sample associated with the current work */
- sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
+ sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
/* Get pixel and tile position associated with current work */
- get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index);
- my_sample_tile = 0;
+ get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, &tile_x, &tile_y, work_index, ray_index);
/* Remap rng_state according to the current work */
- rng_state = initial_rng + ((rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride);
- /* Remap per_sample_output_buffers according to the current work */
- per_sample_output_buffers = initial_per_sample_output_buffers
- + (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
-#else /* __WORK_STEALING__ */
- work_array[ray_index] = sample + parallel_samples;
- sample = work_array[ray_index];
-
- /* Get ray position from ray index */
- pixel_x = sx + ((ray_index / parallel_samples) % sw);
- pixel_y = sy + ((ray_index / parallel_samples) / sw);
-#endif /* __WORK_STEALING__ */
+ rng_state = initial_rng + kernel_split_params.offset + pixel_x + pixel_y*stride;
+ /* Remap buffer according to the current work */
+ buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride;
/* Initialize random numbers and ray. */
kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, rng, ray);
@@ -226,7 +206,7 @@ ccl_device char kernel_background_buffer_update(
*throughput = make_float3(1.0f, 1.0f, 1.0f);
*L_transparent = 0.0f;
path_radiance_init(L, kernel_data.film.use_light_pass);
- path_state_init(kg, kg->sd_input, state, rng, sample, ray);
+ path_state_init(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, rng, sample, ray);
#ifdef __KERNEL_DEBUG__
debug_data_init(debug_data);
#endif
@@ -237,12 +217,29 @@ ccl_device char kernel_background_buffer_update(
/* These rays do not participate in path-iteration. */
float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
/* Accumulate result in output buffer. */
- kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad);
+ kernel_write_pass_float4(buffer, sample, L_rad);
path_rng_end(kg, rng_state, *rng);
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
}
}
}
- return enqueue_flag;
+
+#ifndef __COMPUTE_DEVICE_GPU__
+ }
+#endif
+
+ /* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS;
+	 * These rays will be made active during the next scene_intersect kernel.
+ */
+ enqueue_ray_index_local(ray_index,
+ QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+ enqueue_flag,
+ kernel_split_params.queue_size,
+ &local_queue_atomics,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_index);
}
+
+CCL_NAMESPACE_END
+
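
Throughout the rewritten kernel, rng_state and the output buffer share one linearization: the pixel address offset + x + y*stride, scaled by kernel_data.film.pass_stride for the buffer. Restated in plain C for clarity (the helper names are ours, not the patch's):

static inline int pixel_index(int offset, int x, int y, int stride)
{
	return offset + x + y * stride;  /* per-pixel address, as used for rng_state */
}

static inline int buffer_index(int offset, int x, int y, int stride,
                               int pass_stride)
{
	return pixel_index(offset, x, y, stride) * pass_stride;  /* render buffer */
}
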
diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h
index 6e158d53d23..9b62d65ffd9 100644
--- a/intern/cycles/kernel/split/kernel_data_init.h
+++ b/intern/cycles/kernel/split/kernel_data_init.h
@@ -14,108 +14,105 @@
* limitations under the License.
*/
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
/* Note on kernel_data_initialization kernel
 * This kernel initializes structures needed in path-iteration kernels.
- * This is the first kernel in ray-tracing logic.
- *
- * Ray state of rays outside the tile-boundary will be marked RAY_INACTIVE
- *
- * Its input and output are as follows,
- *
- * Un-initialized rng---------------|--- kernel_data_initialization ---|--- Initialized rng
- * Un-initialized throughput -------| |--- Initialized throughput
- * Un-initialized L_transparent ----| |--- Initialized L_transparent
- * Un-initialized PathRadiance -----| |--- Initialized PathRadiance
- * Un-initialized Ray --------------| |--- Initialized Ray
- * Un-initialized PathState --------| |--- Initialized PathState
- * Un-initialized QueueData --------| |--- Initialized QueueData (to QUEUE_EMPTY_SLOT)
- * Un-initialized QueueIndex -------| |--- Initialized QueueIndex (to 0)
- * Un-initialized use_queues_flag---| |--- Initialized use_queues_flag (to false)
- * Un-initialized ray_state --------| |--- Initialized ray_state
- * parallel_samples --------------- | |--- Initialized per_sample_output_buffers
- * rng_state -----------------------| |--- Initialized work_array
- * data ----------------------------| |--- Initialized work_pool_wgs
- * start_sample --------------------| |
- * sx ------------------------------| |
- * sy ------------------------------| |
- * sw ------------------------------| |
- * sh ------------------------------| |
- * stride --------------------------| |
- * queuesize -----------------------| |
- * num_samples ---------------------| |
*
* Note on Queues :
* All slots in queues are initialized to queue empty slot;
* The number of elements in the queues is initialized to 0;
*/
+
+/* Distributes an amount of work across all threads.
+ * Note: work done inside the loop may not be visible to all threads until
+ * after the current kernel has completed.
+ */
+#define parallel_for(kg, iter_name, work_size) \
+ for(size_t _size = (work_size), \
+ _global_size = ccl_global_size(0) * ccl_global_size(1), \
+ _n = _size / _global_size, \
+ _thread = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0), \
+ iter_name = (_n > 0) ? (_thread * _n) : (_thread) \
+ ; \
+ (iter_name < (_thread+1) * _n) || (iter_name == _n * _global_size + _thread && _thread < _size % _global_size) \
+ ; \
+ iter_name = (iter_name != (_thread+1) * _n - 1) ? (iter_name + 1) : (_n * _global_size + _thread) \
+ )
+
+#ifndef __KERNEL_CPU__
ccl_device void kernel_data_init(
+#else
+void KERNEL_FUNCTION_FULL_NAME(data_init)(
+#endif
KernelGlobals *kg,
- ShaderData *sd_DL_shadow,
ccl_constant KernelData *data,
- ccl_global float *per_sample_output_buffers,
+ ccl_global void *split_data_buffer,
+ int num_elements,
+ ccl_global char *ray_state,
ccl_global uint *rng_state,
- ccl_global uint *rng_coop, /* rng array to store rng values for all rays */
- ccl_global float3 *throughput_coop, /* throughput array to store throughput values for all rays */
- ccl_global float *L_transparent_coop, /* L_transparent array to store L_transparent values for all rays */
- PathRadiance *PathRadiance_coop, /* PathRadiance array to store PathRadiance values for all rays */
- ccl_global Ray *Ray_coop, /* Ray array to store Ray information for all rays */
- ccl_global PathState *PathState_coop, /* PathState array to store PathState information for all rays */
- Intersection *Intersection_coop_shadow,
- ccl_global char *ray_state, /* Stores information on current state of a ray */
+#ifdef __KERNEL_OPENCL__
#define KERNEL_TEX(type, ttype, name) \
ccl_global type *name,
#include "../kernel_textures.h"
+#endif
- int start_sample, int sx, int sy, int sw, int sh, int offset, int stride,
- int rng_state_offset_x,
- int rng_state_offset_y,
- int rng_state_stride,
- ccl_global int *Queue_data, /* Memory for queues */
+ int start_sample,
+ int end_sample,
+ int sx, int sy, int sw, int sh, int offset, int stride,
ccl_global int *Queue_index, /* Tracks the number of elements in queues */
int queuesize, /* size (capacity) of the queue */
ccl_global char *use_queues_flag, /* flag to decide if scene-intersect kernel should use queues to fetch ray index */
- ccl_global unsigned int *work_array, /* work array to store which work each ray belongs to */
-#ifdef __WORK_STEALING__
- ccl_global unsigned int *work_pool_wgs, /* Work pool for each work group */
- unsigned int num_samples, /* Total number of samples per pixel */
-#endif
-#ifdef __KERNEL_DEBUG__
- DebugData *debugdata_coop,
-#endif
- int parallel_samples) /* Number of samples to be processed in parallel */
+ ccl_global unsigned int *work_pools, /* Work pool for each work group */
+ unsigned int num_samples,
+ ccl_global float *buffer)
{
+#ifdef __KERNEL_OPENCL__
kg->data = data;
- kg->sd_input = sd_DL_shadow;
- kg->isect_shadow = Intersection_coop_shadow;
+#endif
+
+ kernel_split_params.x = sx;
+ kernel_split_params.y = sy;
+ kernel_split_params.w = sw;
+ kernel_split_params.h = sh;
+
+ kernel_split_params.offset = offset;
+ kernel_split_params.stride = stride;
+
+ kernel_split_params.rng_state = rng_state;
+
+ kernel_split_params.start_sample = start_sample;
+ kernel_split_params.end_sample = end_sample;
+
+ kernel_split_params.work_pools = work_pools;
+ kernel_split_params.num_samples = num_samples;
+
+ kernel_split_params.queue_index = Queue_index;
+ kernel_split_params.queue_size = queuesize;
+ kernel_split_params.use_queues_flag = use_queues_flag;
+
+ kernel_split_params.buffer = buffer;
+
+ split_data_init(kg, &kernel_split_state, num_elements, split_data_buffer, ray_state);
+
+#ifdef __KERNEL_OPENCL__
#define KERNEL_TEX(type, ttype, name) \
kg->name = name;
#include "../kernel_textures.h"
+#endif
- int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-
-#ifdef __WORK_STEALING__
- int lid = get_local_id(1) * get_local_size(0) + get_local_id(0);
- /* Initialize work_pool_wgs */
- if(lid == 0) {
- int group_index = get_group_id(1) * get_num_groups(0) + get_group_id(0);
- work_pool_wgs[group_index] = 0;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-#endif /* __WORK_STEALING__ */
+ int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
/* Initialize queue data and queue index. */
if(thread_index < queuesize) {
/* Initialize active ray queue. */
- Queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+ kernel_split_state.queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
/* Initialize background and buffer update queue. */
- Queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+ kernel_split_state.queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
/* Initialize shadow ray cast of AO queue. */
- Queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+ kernel_split_state.queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
/* Initialize shadow ray cast of direct lighting queue. */
- Queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+ kernel_split_state.queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
}
if(thread_index == 0) {
@@ -126,109 +123,32 @@ ccl_device void kernel_data_init(
	/* The scene-intersect kernel should not use the queues the very first
	 * time, since the queue would be empty.
	 */
- use_queues_flag[0] = 0;
+ *use_queues_flag = 0;
}
- int x = get_global_id(0);
- int y = get_global_id(1);
+	/* Zero the tile's pixels and initialize rng_state if this is the first sample. */
+ if(start_sample == 0) {
+ parallel_for(kg, i, sw * sh * kernel_data.film.pass_stride) {
+ int pixel = i / kernel_data.film.pass_stride;
+ int pass = i % kernel_data.film.pass_stride;
- if(x < (sw * parallel_samples) && y < sh) {
- int ray_index = x + y * (sw * parallel_samples);
+ int x = sx + pixel % sw;
+ int y = sy + pixel / sw;
- /* This is the first assignment to ray_state;
- * So we dont use ASSIGN_RAY_STATE macro.
- */
- ray_state[ray_index] = RAY_ACTIVE;
-
- unsigned int my_sample;
- unsigned int pixel_x;
- unsigned int pixel_y;
- unsigned int tile_x;
- unsigned int tile_y;
- unsigned int my_sample_tile;
-
-#ifdef __WORK_STEALING__
- unsigned int my_work = 0;
- /* Get work. */
- get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index);
- /* Get the sample associated with the work. */
- my_sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
-
- my_sample_tile = 0;
-
- /* Get pixel and tile position associated with the work. */
- get_pixel_tile_position(&pixel_x, &pixel_y,
- &tile_x, &tile_y,
- my_work,
- sw, sh, sx, sy,
- parallel_samples,
- ray_index);
- work_array[ray_index] = my_work;
-#else /* __WORK_STEALING__ */
- unsigned int tile_index = ray_index / parallel_samples;
- tile_x = tile_index % sw;
- tile_y = tile_index / sw;
- my_sample_tile = ray_index - (tile_index * parallel_samples);
- my_sample = my_sample_tile + start_sample;
-
- /* Initialize work array. */
- work_array[ray_index] = my_sample ;
-
- /* Calculate pixel position of this ray. */
- pixel_x = sx + tile_x;
- pixel_y = sy + tile_y;
-#endif /* __WORK_STEALING__ */
-
- rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride;
-
- /* Initialise per_sample_output_buffers to all zeros. */
- per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + (my_sample_tile)) * kernel_data.film.pass_stride;
- int per_sample_output_buffers_iterator = 0;
- for(per_sample_output_buffers_iterator = 0;
- per_sample_output_buffers_iterator < kernel_data.film.pass_stride;
- per_sample_output_buffers_iterator++)
- {
- per_sample_output_buffers[per_sample_output_buffers_iterator] = 0.0f;
- }
+ int index = (offset + x + y*stride) * kernel_data.film.pass_stride + pass;
- /* Initialize random numbers and ray. */
- kernel_path_trace_setup(kg,
- rng_state,
- my_sample,
- pixel_x, pixel_y,
- &rng_coop[ray_index],
- &Ray_coop[ray_index]);
-
- if(Ray_coop[ray_index].t != 0.0f) {
- /* Initialize throughput, L_transparent, Ray, PathState;
- * These rays proceed with path-iteration.
- */
- throughput_coop[ray_index] = make_float3(1.0f, 1.0f, 1.0f);
- L_transparent_coop[ray_index] = 0.0f;
- path_radiance_init(&PathRadiance_coop[ray_index], kernel_data.film.use_light_pass);
- path_state_init(kg,
- kg->sd_input,
- &PathState_coop[ray_index],
- &rng_coop[ray_index],
- my_sample,
- &Ray_coop[ray_index]);
-#ifdef __KERNEL_DEBUG__
- debug_data_init(&debugdata_coop[ray_index]);
-#endif
- }
- else {
- /* These rays do not participate in path-iteration. */
- float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
- /* Accumulate result in output buffer. */
- kernel_write_pass_float4(per_sample_output_buffers, my_sample, L_rad);
- path_rng_end(kg, rng_state, rng_coop[ray_index]);
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
+ *(buffer + index) = 0.0f;
}
- }
- /* Mark rest of the ray-state indices as RAY_INACTIVE. */
- if(thread_index < (get_global_size(0) * get_global_size(1)) - (sh * (sw * parallel_samples))) {
- /* First assignment, hence we dont use ASSIGN_RAY_STATE macro */
- ray_state[((sw * parallel_samples) * sh) + thread_index] = RAY_INACTIVE;
+ parallel_for(kg, i, sw * sh) {
+ int x = sx + i % sw;
+ int y = sy + i / sw;
+
+ int index = (offset + x + y*stride);
+ *(rng_state + index) = hash_int_2d(x, y);
+ }
}
}
+
+CCL_NAMESPACE_END
+
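
The parallel_for macro above gives each thread a contiguous block of size/global_size iterations, and hands each of the first size%global_size threads one leftover iteration from the tail. A standalone C check of that split (illustrative program, not patch code):

#include <stdio.h>

int main(void)
{
	const int size = 10, nthreads = 4;
	for(int thread = 0; thread < nthreads; thread++) {
		int n = size / nthreads;
		printf("thread %d:", thread);
		for(int i = thread * n; i < (thread + 1) * n; i++)
			printf(" %d", i);                     /* contiguous block */
		if(thread < size % nthreads)
			printf(" %d", n * nthreads + thread); /* leftover iteration */
		printf("\n");
	}
	return 0;
}

For size = 10 over 4 threads this prints {0, 1, 8}, {2, 3, 9}, {4, 5} and {6, 7}: every index exactly once, matching the macro's loop bounds.
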
diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h
index 82ca18829d3..5163b8edc04 100644
--- a/intern/cycles/kernel/split/kernel_direct_lighting.h
+++ b/intern/cycles/kernel/split/kernel_direct_lighting.h
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
/* Note on kernel_direct_lighting kernel.
* This is the eighth kernel in the ray tracing logic. This is the seventh
@@ -47,28 +47,50 @@
* QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will be filled with rays for which a shadow_blocked function must be executed, after this
* kernel call. Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty.
*/
-ccl_device char kernel_direct_lighting(
- KernelGlobals *kg,
- ShaderData *sd, /* Required for direct lighting */
- ccl_global uint *rng_coop, /* Required for direct lighting */
- ccl_global PathState *PathState_coop, /* Required for direct lighting */
- ccl_global int *ISLamp_coop, /* Required for direct lighting */
- ccl_global Ray *LightRay_coop, /* Required for direct lighting */
- ccl_global BsdfEval *BSDFEval_coop, /* Required for direct lighting */
- ccl_global char *ray_state, /* Denotes the state of each ray */
- int ray_index)
+ccl_device void kernel_direct_lighting(KernelGlobals *kg)
{
+ ccl_local unsigned int local_queue_atomics;
+ if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+ local_queue_atomics = 0;
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
char enqueue_flag = 0;
- if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- ccl_global PathState *state = &PathState_coop[ray_index];
+ int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ ray_index = get_ray_index(kg, ray_index,
+ QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+ /* If we are executing on a GPU device, we exit all threads that are not
+ * required.
+ *
+ * If we are executing on a CPU device, then we need to keep all threads
+	 * active since we have barrier() calls later in the kernel. CPU devices
+	 * expect all threads to execute the barrier statement.
+ */
+ if(ray_index == QUEUE_EMPTY_SLOT) {
+ return;
+ }
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+ if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+
+ if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+ ShaderData *sd = &kernel_split_state.sd[ray_index];
/* direct lighting */
#ifdef __EMISSION__
if((kernel_data.integrator.use_direct_light &&
- (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL)))
+ (sd->flag & SD_BSDF_HAS_EVAL)))
{
/* Sample illumination from lights to find path contribution. */
- ccl_global RNG* rng = &rng_coop[ray_index];
+ ccl_global RNG* rng = &kernel_split_state.rng[ray_index];
float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
float light_u, light_v;
path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
@@ -77,32 +99,48 @@ ccl_device char kernel_direct_lighting(
LightSample ls;
if(light_sample(kg,
light_t, light_u, light_v,
- ccl_fetch(sd, time),
- ccl_fetch(sd, P),
+ sd->time,
+ sd->P,
state->bounce,
&ls)) {
Ray light_ray;
#ifdef __OBJECT_MOTION__
- light_ray.time = ccl_fetch(sd, time);
+ light_ray.time = sd->time;
#endif
BsdfEval L_light;
bool is_lamp;
- if(direct_emission(kg, sd, kg->sd_input, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
+ if(direct_emission(kg, sd, &kernel_split_state.sd_DL_shadow[ray_index], &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
/* Write intermediate data to global memory to access from
* the next kernel.
*/
- LightRay_coop[ray_index] = light_ray;
- BSDFEval_coop[ray_index] = L_light;
- ISLamp_coop[ray_index] = is_lamp;
+ kernel_split_state.light_ray[ray_index] = light_ray;
+ kernel_split_state.bsdf_eval[ray_index] = L_light;
+ kernel_split_state.is_lamp[ray_index] = is_lamp;
/* Mark ray state for next shadow kernel. */
- ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
+ ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
enqueue_flag = 1;
}
}
}
#endif /* __EMISSION__ */
}
- return enqueue_flag;
+
+#ifndef __COMPUTE_DEVICE_GPU__
+ }
+#endif
+
+#ifdef __EMISSION__
+ /* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */
+ enqueue_ray_index_local(ray_index,
+ QUEUE_SHADOW_RAY_CAST_DL_RAYS,
+ enqueue_flag,
+ kernel_split_params.queue_size,
+ &local_queue_atomics,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_index);
+#endif
}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
index 5d951b972ed..7168efa59ae 100644
--- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
+++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
/* Note on kernel_holdout_emission_blurring_pathtermination_ao kernel.
* This is the sixth kernel in the ray tracing logic. This is the fifth
@@ -70,101 +70,105 @@
* QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays
* QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with flag RAY_SHADOW_RAY_CAST_AO
*/
-ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
- KernelGlobals *kg,
- ShaderData *sd, /* Required throughout the kernel except probabilistic path termination and AO */
- ccl_global float *per_sample_output_buffers,
- ccl_global uint *rng_coop, /* Required for "kernel_write_data_passes" and AO */
- ccl_global float3 *throughput_coop, /* Required for handling holdout material and AO */
- ccl_global float *L_transparent_coop, /* Required for handling holdout material */
- PathRadiance *PathRadiance_coop, /* Required for "kernel_write_data_passes" and indirect primitive emission */
- ccl_global PathState *PathState_coop, /* Required throughout the kernel and AO */
- Intersection *Intersection_coop, /* Required for indirect primitive emission */
- ccl_global float3 *AOAlpha_coop, /* Required for AO */
- ccl_global float3 *AOBSDF_coop, /* Required for AO */
- ccl_global Ray *AOLightRay_coop, /* Required for AO */
- int sw, int sh, int sx, int sy, int stride,
- ccl_global char *ray_state, /* Denotes the state of each ray */
- ccl_global unsigned int *work_array, /* Denotes the work that each ray belongs to */
-#ifdef __WORK_STEALING__
- unsigned int start_sample,
-#endif
- int parallel_samples, /* Number of samples to be processed in parallel */
- int ray_index,
- char *enqueue_flag,
- char *enqueue_flag_AO_SHADOW_RAY_CAST)
+ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(KernelGlobals *kg)
{
-#ifdef __WORK_STEALING__
- unsigned int my_work;
+ ccl_local unsigned int local_queue_atomics_bg;
+ ccl_local unsigned int local_queue_atomics_ao;
+ if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+ local_queue_atomics_bg = 0;
+ local_queue_atomics_ao = 0;
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ char enqueue_flag = 0;
+ char enqueue_flag_AO_SHADOW_RAY_CAST = 0;
+ int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ ray_index = get_ray_index(kg, ray_index,
+ QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+ /* If we are executing on a GPU device, we exit all threads that are not
+ * required.
+ *
+ * If we are executing on a CPU device, then we need to keep all threads
+ * active since we have barrier() calls later in the kernel. CPU devices
+ * expect all threads to execute the barrier statement.
+ */
+ if(ray_index == QUEUE_EMPTY_SLOT) {
+ return;
+ }
+#endif /* __COMPUTE_DEVICE_GPU__ */
+
+#ifndef __COMPUTE_DEVICE_GPU__
+ if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+
+ int stride = kernel_split_params.stride;
+
+ unsigned int work_index;
unsigned int pixel_x;
unsigned int pixel_y;
-#endif
+
unsigned int tile_x;
unsigned int tile_y;
- int my_sample_tile;
unsigned int sample;
ccl_global RNG *rng = 0x0;
ccl_global PathState *state = 0x0;
float3 throughput;
+ ccl_global char *ray_state = kernel_split_state.ray_state;
+ ShaderData *sd = &kernel_split_state.sd[ray_index];
+ ccl_global float *buffer = kernel_split_params.buffer;
+
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- throughput = throughput_coop[ray_index];
- state = &PathState_coop[ray_index];
- rng = &rng_coop[ray_index];
-#ifdef __WORK_STEALING__
- my_work = work_array[ray_index];
- sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
- get_pixel_tile_position(&pixel_x, &pixel_y,
+ throughput = kernel_split_state.throughput[ray_index];
+ state = &kernel_split_state.path_state[ray_index];
+ rng = &kernel_split_state.rng[ray_index];
+
+ work_index = kernel_split_state.work_array[ray_index];
+ sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
+ get_work_pixel_tile_position(kg, &pixel_x, &pixel_y,
&tile_x, &tile_y,
- my_work,
- sw, sh, sx, sy,
- parallel_samples,
+ work_index,
ray_index);
- my_sample_tile = 0;
-#else /* __WORK_STEALING__ */
- sample = work_array[ray_index];
- /* Buffer's stride is "stride"; Find x and y using ray_index. */
- int tile_index = ray_index / parallel_samples;
- tile_x = tile_index % sw;
- tile_y = tile_index / sw;
- my_sample_tile = ray_index - (tile_index * parallel_samples);
-#endif /* __WORK_STEALING__ */
- per_sample_output_buffers +=
- (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) *
- kernel_data.film.pass_stride;
+
+ buffer += (kernel_split_params.offset + pixel_x + pixel_y * stride) * kernel_data.film.pass_stride;
/* holdout */
#ifdef __HOLDOUT__
- if(((ccl_fetch(sd, flag) & SD_HOLDOUT) ||
- (ccl_fetch(sd, object_flag) & SD_OBJECT_HOLDOUT_MASK)) &&
+ if(((sd->flag & SD_HOLDOUT) ||
+ (sd->object_flag & SD_OBJECT_HOLDOUT_MASK)) &&
(state->flag & PATH_RAY_CAMERA))
{
if(kernel_data.background.transparent) {
float3 holdout_weight;
- if(ccl_fetch(sd, object_flag) & SD_OBJECT_HOLDOUT_MASK) {
+ if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) {
holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
}
else {
holdout_weight = shader_holdout_eval(kg, sd);
}
/* any throughput is ok, should all be identical here */
- L_transparent_coop[ray_index] += average(holdout_weight*throughput);
+ kernel_split_state.L_transparent[ray_index] += average(holdout_weight*throughput);
}
- if(ccl_fetch(sd, object_flag) & SD_OBJECT_HOLDOUT_MASK) {
+ if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) {
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
- *enqueue_flag = 1;
+ enqueue_flag = 1;
}
}
#endif /* __HOLDOUT__ */
}
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- PathRadiance *L = &PathRadiance_coop[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
/* Holdout mask objects do not write data passes. */
kernel_write_data_passes(kg,
- per_sample_output_buffers,
+ buffer,
L,
sd,
sample,
@@ -183,12 +187,12 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
#ifdef __EMISSION__
/* emission */
- if(ccl_fetch(sd, flag) & SD_EMISSION) {
+ if(sd->flag & SD_EMISSION) {
/* TODO(sergey): is isect.t wrong here for transparent surfaces? */
float3 emission = indirect_primitive_emission(
kg,
sd,
- Intersection_coop[ray_index].t,
+ kernel_split_state.isect[ray_index].t,
state->flag,
state->ray_pdf);
path_radiance_accum_emission(L, throughput, emission, state->bounce);
@@ -203,7 +207,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
if(probability == 0.0f) {
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
- *enqueue_flag = 1;
+ enqueue_flag = 1;
}
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
@@ -211,10 +215,10 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE);
if(terminate >= probability) {
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
- *enqueue_flag = 1;
+ enqueue_flag = 1;
}
else {
- throughput_coop[ray_index] = throughput/probability;
+ kernel_split_state.throughput[ray_index] = throughput/probability;
}
}
}
@@ -224,7 +228,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
/* ambient occlusion */
if(kernel_data.integrator.use_ambient_occlusion ||
- (ccl_fetch(sd, flag) & SD_AO))
+ (sd->flag & SD_AO))
{
/* todo: solve correlation */
float bsdf_u, bsdf_v;
@@ -232,29 +236,56 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
float ao_factor = kernel_data.background.ao_factor;
float3 ao_N;
- AOBSDF_coop[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
- AOAlpha_coop[ray_index] = shader_bsdf_alpha(kg, sd);
+ kernel_split_state.ao_bsdf[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
+ kernel_split_state.ao_alpha[ray_index] = shader_bsdf_alpha(kg, sd);
float3 ao_D;
float ao_pdf;
sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
- if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
+ if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
Ray _ray;
- _ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
+ _ray.P = ray_offset(sd->P, sd->Ng);
_ray.D = ao_D;
_ray.t = kernel_data.background.ao_distance;
#ifdef __OBJECT_MOTION__
- _ray.time = ccl_fetch(sd, time);
+ _ray.time = sd->time;
#endif
- _ray.dP = ccl_fetch(sd, dP);
+ _ray.dP = sd->dP;
_ray.dD = differential3_zero();
- AOLightRay_coop[ray_index] = _ray;
+ kernel_split_state.ao_light_ray[ray_index] = _ray;
ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
- *enqueue_flag_AO_SHADOW_RAY_CAST = 1;
+ enqueue_flag_AO_SHADOW_RAY_CAST = 1;
}
}
}
#endif /* __AO__ */
+
+#ifndef __COMPUTE_DEVICE_GPU__
+ }
+#endif
+
+ /* Enqueue RAY_UPDATE_BUFFER rays. */
+ enqueue_ray_index_local(ray_index,
+ QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+ enqueue_flag,
+ kernel_split_params.queue_size,
+ &local_queue_atomics_bg,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_index);
+
+#ifdef __AO__
+ /* Enqueue to-shadow-ray-cast rays. */
+ enqueue_ray_index_local(ray_index,
+ QUEUE_SHADOW_RAY_CAST_AO_RAYS,
+ enqueue_flag_AO_SHADOW_RAY_CAST,
+ kernel_split_params.queue_size,
+ &local_queue_atomics_ao,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_index);
+#endif
}
+
+CCL_NAMESPACE_END
+
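
With the per-sample output buffers gone, the kernel writes straight into the tile's render
buffer using (offset + pixel_x + pixel_y * stride) * pass_stride addressing. A tiny worked
example with assumed numbers (in the kernel, offset and stride come from kernel_split_params
and pass_stride from kernel_data.film):

#include <stdio.h>

int main(void)
{
	/* Assumed example values. */
	int offset = 0;       /* shifts absolute pixel coords to this tile's buffer start */
	int stride = 1920;    /* buffer width in pixels */
	int pass_stride = 8;  /* floats per pixel across all passes */

	int pixel_x = 100, pixel_y = 50;
	long index = (long)(offset + pixel_x + pixel_y * stride) * pass_stride;

	/* First float of this pixel's pass data. */
	printf("float index = %ld\n", index);  /* (100 + 50*1920) * 8 = 768800 */
	return 0;
}
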
diff --git a/intern/cycles/kernel/split/kernel_lamp_emission.h b/intern/cycles/kernel/split/kernel_lamp_emission.h
index 3bd0e361078..261625da31d 100644
--- a/intern/cycles/kernel/split/kernel_lamp_emission.h
+++ b/intern/cycles/kernel/split/kernel_lamp_emission.h
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
/* Note on kernel_lamp_emission
* This is the 3rd kernel in the ray-tracing logic. This is the second of the
@@ -36,28 +36,39 @@
* sw -------------------------------------------------| |
* sh -------------------------------------------------| |
*/
-ccl_device void kernel_lamp_emission(
- KernelGlobals *kg,
- ccl_global float3 *throughput_coop, /* Required for lamp emission */
- PathRadiance *PathRadiance_coop, /* Required for lamp emission */
- ccl_global Ray *Ray_coop, /* Required for lamp emission */
- ccl_global PathState *PathState_coop, /* Required for lamp emission */
- Intersection *Intersection_coop, /* Required for lamp emission */
- ccl_global char *ray_state, /* Denotes the state of each ray */
- int sw, int sh,
- ccl_global char *use_queues_flag, /* Used to decide if this kernel should use
- * queues to fetch ray index
- */
- int ray_index)
+ccl_device void kernel_lamp_emission(KernelGlobals *kg)
{
- if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) ||
- IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND))
+ /* We will empty this queue in this kernel. */
+ if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
+ kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+ }
+ /* Fetch use_queues_flag. */
+ ccl_local char local_use_queues_flag;
+ if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+ local_use_queues_flag = *kernel_split_params.use_queues_flag;
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ if(local_use_queues_flag) {
+ ray_index = get_ray_index(kg, ray_index,
+ QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 1);
+ if(ray_index == QUEUE_EMPTY_SLOT) {
+ return;
+ }
+ }
+
+ if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) ||
+ IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND))
{
- PathRadiance *L = &PathRadiance_coop[ray_index];
- ccl_global PathState *state = &PathState_coop[ray_index];
+ PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- float3 throughput = throughput_coop[ray_index];
- Ray ray = Ray_coop[ray_index];
+ float3 throughput = kernel_split_state.throughput[ray_index];
+ Ray ray = kernel_split_state.ray[ray_index];
#ifdef __LAMP_MIS__
if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) {
@@ -65,7 +76,7 @@ ccl_device void kernel_lamp_emission(
Ray light_ray;
light_ray.P = ray.P - state->ray_t*ray.D;
- state->ray_t += Intersection_coop[ray_index].t;
+ state->ray_t += kernel_split_state.isect[ray_index].t;
light_ray.D = ray.D;
light_ray.t = state->ray_t;
light_ray.time = ray.time;
@@ -74,10 +85,13 @@ ccl_device void kernel_lamp_emission(
/* intersect with lamp */
float3 emission;
- if(indirect_lamp_emission(kg, kg->sd_input, state, &light_ray, &emission)) {
+ if(indirect_lamp_emission(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, &light_ray, &emission)) {
path_radiance_accum_emission(L, throughput, emission, state->bounce);
}
}
#endif /* __LAMP_MIS__ */
}
}
+
+CCL_NAMESPACE_END
+
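
The __LAMP_MIS__ branch above rebuilds the whole segment travelled so far: state->ray_t
accumulates distance across transparent hops, so ray.P - ray_t * ray.D recovers the segment
origin and ray_t + isect.t is the total length to test against lamps. A minimal sketch of
that arithmetic with a toy float3 type (the real kernel uses Cycles' float3 and Ray):

#include <stdio.h>

typedef struct { float x, y, z; } float3;

static float3 madd(float3 a, float t, float3 b)  /* a + t*b */
{
	float3 r = {a.x + t * b.x, a.y + t * b.y, a.z + t * b.z};
	return r;
}

int main(void)
{
	/* Toy values: a ray that already advanced ray_t through
	 * transparent surfaces and now hits something at isect_t. */
	float3 P = {0.0f, 0.0f, 5.0f};  /* current ray origin */
	float3 D = {0.0f, 0.0f, 1.0f};  /* unit direction */
	float ray_t = 5.0f;             /* distance already travelled */
	float isect_t = 3.0f;           /* distance to the new hit */

	float3 light_P = madd(P, -ray_t, D);  /* back at the segment start */
	float light_t = ray_t + isect_t;      /* total length for the lamp test */

	printf("light ray: origin z = %g, t = %g\n", light_P.z, light_t);
	return 0;
}
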
diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
index 816f3a6fbff..a6f26278116 100644
--- a/intern/cycles/kernel/split/kernel_next_iteration_setup.h
+++ b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
/* Note on kernel_next_iteration_setup kernel.
* This is the tenth kernel in the ray tracing logic. This is the ninth
@@ -59,47 +59,76 @@
* QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and more RAY_UPDATE_BUFFER rays.
* QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays
*/
-ccl_device char kernel_next_iteration_setup(
- KernelGlobals *kg,
- ShaderData *sd, /* Required for setting up ray for next iteration */
- ccl_global uint *rng_coop, /* Required for setting up ray for next iteration */
- ccl_global float3 *throughput_coop, /* Required for setting up ray for next iteration */
- PathRadiance *PathRadiance_coop, /* Required for setting up ray for next iteration */
- ccl_global Ray *Ray_coop, /* Required for setting up ray for next iteration */
- ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */
- ccl_global Ray *LightRay_dl_coop, /* Required for radiance update - direct lighting */
- ccl_global int *ISLamp_coop, /* Required for radiance update - direct lighting */
- ccl_global BsdfEval *BSDFEval_coop, /* Required for radiance update - direct lighting */
- ccl_global Ray *LightRay_ao_coop, /* Required for radiance update - AO */
- ccl_global float3 *AOBSDF_coop, /* Required for radiance update - AO */
- ccl_global float3 *AOAlpha_coop, /* Required for radiance update - AO */
- ccl_global char *ray_state, /* Denotes the state of each ray */
- ccl_global char *use_queues_flag, /* flag to decide if scene_intersect kernel should
- * use queues to fetch ray index */
- int ray_index)
+ccl_device void kernel_next_iteration_setup(KernelGlobals *kg)
{
+ ccl_local unsigned int local_queue_atomics;
+ if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+ local_queue_atomics = 0;
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
+ /* If we are here, then it means that the scene-intersect kernel
+ * has already been executed at least once. From now on, the
+ * scene-intersect kernel may operate on queues to fetch ray indices.
+ */
+ *kernel_split_params.use_queues_flag = 1;
+
+ /* Reset the queue indices of the QUEUE_SHADOW_RAY_CAST_AO_RAYS and
+ * QUEUE_SHADOW_RAY_CAST_DL_RAYS queues, which were emptied during the
+ * previous kernel.
+ */
+ kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
+ kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
+ }
+
char enqueue_flag = 0;
+ int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ ray_index = get_ray_index(kg, ray_index,
+ QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+ /* If we are executing on a GPU device, we exit all threads that are not
+ * required.
+ *
+ * If we are executing on a CPU device, then we need to keep all threads
+ * active since we have barrier() calls later in the kernel. CPU devices
+ * expect all threads to execute the barrier statement.
+ */
+ if(ray_index == QUEUE_EMPTY_SLOT) {
+ return;
+ }
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+ if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
/* Load ShaderData structure. */
PathRadiance *L = NULL;
ccl_global PathState *state = NULL;
+ ccl_global char *ray_state = kernel_split_state.ray_state;
/* Path radiance update for AO/Direct_lighting's shadow blocked. */
if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) ||
IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO))
{
- state = &PathState_coop[ray_index];
- L = &PathRadiance_coop[ray_index];
- float3 _throughput = throughput_coop[ray_index];
+ state = &kernel_split_state.path_state[ray_index];
+ L = &kernel_split_state.path_radiance[ray_index];
+ float3 _throughput = kernel_split_state.throughput[ray_index];
if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
- float3 shadow = LightRay_ao_coop[ray_index].P;
- char update_path_radiance = LightRay_ao_coop[ray_index].t;
+ float3 shadow = kernel_split_state.ao_light_ray[ray_index].P;
+ // TODO(mai): investigate correctness here
+ char update_path_radiance = (char)kernel_split_state.ao_light_ray[ray_index].t;
if(update_path_radiance) {
path_radiance_accum_ao(L,
_throughput,
- AOAlpha_coop[ray_index],
- AOBSDF_coop[ray_index],
+ kernel_split_state.ao_alpha[ray_index],
+ kernel_split_state.ao_bsdf[ray_index],
shadow,
state->bounce);
}
@@ -107,35 +136,50 @@ ccl_device char kernel_next_iteration_setup(
}
if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) {
- float3 shadow = LightRay_dl_coop[ray_index].P;
- char update_path_radiance = LightRay_dl_coop[ray_index].t;
+ float3 shadow = kernel_split_state.light_ray[ray_index].P;
+ // TODO(mai): investigate correctness here
+ char update_path_radiance = (char)kernel_split_state.light_ray[ray_index].t;
if(update_path_radiance) {
- BsdfEval L_light = BSDFEval_coop[ray_index];
+ BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index];
path_radiance_accum_light(L,
_throughput,
&L_light,
shadow,
1.0f,
state->bounce,
- ISLamp_coop[ray_index]);
+ kernel_split_state.is_lamp[ray_index]);
}
REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
}
}
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- ccl_global float3 *throughput = &throughput_coop[ray_index];
- ccl_global Ray *ray = &Ray_coop[ray_index];
- ccl_global RNG *rng = &rng_coop[ray_index];
- state = &PathState_coop[ray_index];
- L = &PathRadiance_coop[ray_index];
+ ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+ ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+ ccl_global RNG *rng = &kernel_split_state.rng[ray_index];
+ state = &kernel_split_state.path_state[ray_index];
+ L = &kernel_split_state.path_radiance[ray_index];
/* Compute direct lighting and next bounce. */
- if(!kernel_path_surface_bounce(kg, rng, sd, throughput, state, L, ray)) {
+ if(!kernel_path_surface_bounce(kg, rng, &kernel_split_state.sd[ray_index], throughput, state, L, ray)) {
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
enqueue_flag = 1;
}
}
- return enqueue_flag;
+#ifndef __COMPUTE_DEVICE_GPU__
+ }
+#endif
+
+ /* Enqueue RAY_UPDATE_BUFFER rays. */
+ enqueue_ray_index_local(ray_index,
+ QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+ enqueue_flag,
+ kernel_split_params.queue_size,
+ &local_queue_atomics,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_index);
}
+
+CCL_NAMESPACE_END
+
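
The char casts flagged with TODO(mai) above decode a small mailbox protocol: after the
shadow test, kernel_shadow_blocked reuses the stored light ray, writing the shadow colour
into light_ray.P and a 0/1 update flag into the float light_ray.t, which this kernel casts
back to char. A hedged sketch of that encode/decode (struct layout simplified):

#include <assert.h>

typedef struct { float x, y, z; } float3;
typedef struct { float3 P; float3 D; float t; float time; } Ray;

int main(void)
{
	Ray light_ray;

	/* Producer (shadow-blocked kernel): the ray's memory is dead after
	 * the intersection test, so reuse it to pass results along. */
	float3 shadow = {0.8f, 0.8f, 0.8f};  /* attenuated visibility */
	char update_path_radiance = 1;       /* light not fully blocked */
	light_ray.P = shadow;
	light_ray.t = (float)update_path_radiance;

	/* Consumer (next-iteration setup): decode in the same order. */
	float3 shadow_in = light_ray.P;
	char update = (char)light_ray.t;  /* 0.0f/1.0f survive the round trip */
	assert(update == 1);
	(void)shadow_in;
	return 0;
}
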
diff --git a/intern/cycles/kernel/split/kernel_path_init.h b/intern/cycles/kernel/split/kernel_path_init.h
new file mode 100644
index 00000000000..fe3c9e1e8a2
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_path_init.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* This kernel initializes structures needed in path-iteration kernels.
+ * This is the first kernel in the ray-tracing logic.
+ *
+ * Rays outside the tile boundary will be marked RAY_INACTIVE.
+ */
+
+ccl_device void kernel_path_init(KernelGlobals *kg)
+{
+ int ray_index = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0);
+
+ /* This is the first assignment to ray_state,
+ * so we don't use the ASSIGN_RAY_STATE macro.
+ */
+ kernel_split_state.ray_state[ray_index] = RAY_ACTIVE;
+
+ unsigned int my_sample;
+ unsigned int pixel_x;
+ unsigned int pixel_y;
+ unsigned int tile_x;
+ unsigned int tile_y;
+
+ unsigned int work_index = 0;
+ /* Get work. */
+ if(!get_next_work(kg, &work_index, ray_index)) {
+ /* No more work, mark ray as inactive */
+ kernel_split_state.ray_state[ray_index] = RAY_INACTIVE;
+
+ return;
+ }
+
+ /* Get the sample associated with the work. */
+ my_sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
+
+ /* Get pixel and tile position associated with the work. */
+ get_work_pixel_tile_position(kg, &pixel_x, &pixel_y,
+ &tile_x, &tile_y,
+ work_index,
+ ray_index);
+ kernel_split_state.work_array[ray_index] = work_index;
+
+ ccl_global uint *rng_state = kernel_split_params.rng_state;
+ rng_state += kernel_split_params.offset + pixel_x + pixel_y*kernel_split_params.stride;
+
+ ccl_global float *buffer = kernel_split_params.buffer;
+ buffer += (kernel_split_params.offset + pixel_x + pixel_y * kernel_split_params.stride) * kernel_data.film.pass_stride;
+
+ /* Initialize random numbers and ray. */
+ kernel_path_trace_setup(kg,
+ rng_state,
+ my_sample,
+ pixel_x, pixel_y,
+ &kernel_split_state.rng[ray_index],
+ &kernel_split_state.ray[ray_index]);
+
+ if(kernel_split_state.ray[ray_index].t != 0.0f) {
+ /* Initialize throughput, L_transparent, Ray, PathState;
+ * These rays proceed with path-iteration.
+ */
+ kernel_split_state.throughput[ray_index] = make_float3(1.0f, 1.0f, 1.0f);
+ kernel_split_state.L_transparent[ray_index] = 0.0f;
+ path_radiance_init(&kernel_split_state.path_radiance[ray_index], kernel_data.film.use_light_pass);
+ path_state_init(kg,
+ &kernel_split_state.sd_DL_shadow[ray_index],
+ &kernel_split_state.path_state[ray_index],
+ &kernel_split_state.rng[ray_index],
+ my_sample,
+ &kernel_split_state.ray[ray_index]);
+#ifdef __KERNEL_DEBUG__
+ debug_data_init(&kernel_split_state.debug_data[ray_index]);
+#endif
+ }
+ else {
+ /* These rays do not participate in path-iteration. */
+ float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ /* Accumulate result in output buffer. */
+ kernel_write_pass_float4(buffer, my_sample, L_rad);
+ path_rng_end(kg, rng_state, kernel_split_state.rng[ray_index]);
+ ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE);
+ }
+}
+
+CCL_NAMESPACE_END
+
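
Like every split kernel, kernel_path_init flattens the 2D launch grid row-major:
ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0). A trivial
self-check of that mapping with an assumed work size:

#include <assert.h>

static int ray_index_for(int gid_x, int gid_y, int global_size_x)
{
	return gid_y * global_size_x + gid_x;
}

int main(void)
{
	int w = 256;  /* assumed global work size in x */
	assert(ray_index_for(0, 0, w) == 0);
	assert(ray_index_for(255, 0, w) == 255);
	assert(ray_index_for(0, 1, w) == 256);    /* start of the next row */
	assert(ray_index_for(10, 3, w) == 778);   /* 3*256 + 10 */
	return 0;
}
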
diff --git a/intern/cycles/kernel/split/kernel_queue_enqueue.h b/intern/cycles/kernel/split/kernel_queue_enqueue.h
new file mode 100644
index 00000000000..66aad705bd4
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_queue_enqueue.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/*
+ * The kernel "kernel_queue_enqueue" enqueues rays of different
+ * ray states into their appropriate queues:
+ * 1. Rays that have been determined to hit the background by the
+ *    "kernel_scene_intersect" kernel are enqueued in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
+ * 2. Rays that have been determined to be actively participating in
+ *    path-iteration are enqueued in QUEUE_ACTIVE_AND_REGENERATED_RAYS.
+ *
+ * The input and output of the kernel is as follows,
+ *
+ * ray_state -------------------------------------------|--- kernel_queue_enqueue --|--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------| |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---| |
+ * queuesize -------------------------------------------| |
+ *
+ * Note on Queues :
+ * State of queues during the first time this kernel is called :
+ * At entry,
+ * Both QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_HIT_BACKGROUND rays.
+ *
+ * State of queue during other times this kernel is called :
+ * At entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty.
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays.
+ */
+ccl_device void kernel_queue_enqueue(KernelGlobals *kg)
+{
+ /* We have only 2 cases (Hit/Not-Hit) */
+ ccl_local unsigned int local_queue_atomics[2];
+
+ int lidx = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0);
+ int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+
+ if(lidx == 0) {
+ local_queue_atomics[0] = 0;
+ local_queue_atomics[1] = 0;
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ int queue_number = -1;
+
+ if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+ queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
+ }
+ else if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
+ queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS;
+ }
+
+ unsigned int my_lqidx;
+ if(queue_number != -1) {
+ my_lqidx = get_local_queue_index(queue_number, local_queue_atomics);
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ if(lidx == 0) {
+ local_queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] =
+ get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+ local_queue_atomics,
+ kernel_split_params.queue_index);
+ local_queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] =
+ get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+ local_queue_atomics,
+ kernel_split_params.queue_index);
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ unsigned int my_gqidx;
+ if(queue_number != -1) {
+ my_gqidx = get_global_queue_index(queue_number,
+ kernel_split_params.queue_size,
+ my_lqidx,
+ local_queue_atomics);
+ kernel_split_state.queue_data[my_gqidx] = ray_index;
+ }
+}
+
+CCL_NAMESPACE_END
+
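
The body above is a two-level stream compaction: each thread reserves a slot in a
workgroup-local counter, one thread then reserves a contiguous block in the global queue
with a single atomic, and every thread scatters its ray index to global_offset + local_slot.
The serial C model below mimics one workgroup for a single queue; plain increments stand in
for atomic_inc/atomic_add, and the three phases correspond to get_local_queue_index,
get_global_per_queue_offset and get_global_queue_index.

#include <stdio.h>

#define GROUP_SIZE 8

int main(void)
{
	int ray_state[GROUP_SIZE] = {1, 0, 1, 1, 0, 0, 1, 0};  /* 1 = active */
	int queue_data[64];
	int global_queue_index = 5;  /* entries enqueued by earlier workgroups */

	/* Phase 1: local slot reservation. */
	int local_atomic = 0;
	int local_slot[GROUP_SIZE];
	for(int tid = 0; tid < GROUP_SIZE; tid++)
		if(ray_state[tid])
			local_slot[tid] = local_atomic++;

	/* Phase 2: one thread reserves a block in the global queue. */
	int global_offset = global_queue_index;
	global_queue_index += local_atomic;

	/* Phase 3: every flagged thread scatters its ray index. */
	for(int tid = 0; tid < GROUP_SIZE; tid++)
		if(ray_state[tid])
			queue_data[global_offset + local_slot[tid]] = tid;

	for(int i = global_offset; i < global_queue_index; i++)
		printf("queue[%d] = ray %d\n", i, queue_data[i]);
	return 0;
}
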
diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h b/intern/cycles/kernel/split/kernel_scene_intersect.h
index 2388580051f..a7e0c7692a2 100644
--- a/intern/cycles/kernel/split/kernel_scene_intersect.h
+++ b/intern/cycles/kernel/split/kernel_scene_intersect.h
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
/* Note on kernel_scene_intersect kernel.
* This is the second kernel in the ray tracing logic. This is the first
@@ -61,34 +61,41 @@
* QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS - no change
*/
-ccl_device void kernel_scene_intersect(
- KernelGlobals *kg,
- ccl_global uint *rng_coop,
- ccl_global Ray *Ray_coop, /* Required for scene_intersect */
- ccl_global PathState *PathState_coop, /* Required for scene_intersect */
- Intersection *Intersection_coop, /* Required for scene_intersect */
- ccl_global char *ray_state, /* Denotes the state of each ray */
- int sw, int sh,
- ccl_global char *use_queues_flag, /* used to decide if this kernel should use
- * queues to fetch ray index */
-#ifdef __KERNEL_DEBUG__
- DebugData *debugdata_coop,
-#endif
- int ray_index)
+ccl_device void kernel_scene_intersect(KernelGlobals *kg)
{
+ /* Fetch use_queues_flag. */
+ ccl_local char local_use_queues_flag;
+ if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+ local_use_queues_flag = *kernel_split_params.use_queues_flag;
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ if(local_use_queues_flag) {
+ ray_index = get_ray_index(kg, ray_index,
+ QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 0);
+
+ if(ray_index == QUEUE_EMPTY_SLOT) {
+ return;
+ }
+ }
+
/* All regenerated rays become active here */
- if(IS_STATE(ray_state, ray_index, RAY_REGENERATED))
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_ACTIVE);
+ if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED))
+ ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE);
- if(!IS_STATE(ray_state, ray_index, RAY_ACTIVE))
+ if(!IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE))
return;
#ifdef __KERNEL_DEBUG__
- DebugData *debug_data = &debugdata_coop[ray_index];
+ DebugData *debug_data = &kernel_split_state.debug_data[ray_index];
#endif
- Intersection *isect = &Intersection_coop[ray_index];
- PathState state = PathState_coop[ray_index];
- Ray ray = Ray_coop[ray_index];
+ Intersection *isect = &kernel_split_state.isect[ray_index];
+ PathState state = kernel_split_state.path_state[ray_index];
+ Ray ray = kernel_split_state.ray[ray_index];
/* intersect scene */
uint visibility = path_state_ray_visibility(kg, &state);
@@ -96,7 +103,7 @@ ccl_device void kernel_scene_intersect(
#ifdef __HAIR__
float difl = 0.0f, extmax = 0.0f;
uint lcg_state = 0;
- RNG rng = rng_coop[ray_index];
+ RNG rng = kernel_split_state.rng[ray_index];
if(kernel_data.bvh.have_curves) {
if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) {
@@ -128,6 +135,9 @@ ccl_device void kernel_scene_intersect(
* These rays undergo special processing in the
* background_bufferUpdate kernel.
*/
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND);
+ ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND);
}
}
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h
index cef64bf5f36..35ee19ddf1b 100644
--- a/intern/cycles/kernel/split/kernel_shader_eval.h
+++ b/intern/cycles/kernel/split/kernel_shader_eval.h
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
/* Note on kernel_shader_eval kernel
* This kernel is the 5th kernel in the ray tracing logic. This is
@@ -44,27 +44,51 @@
* QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
* QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays
*/
-ccl_device void kernel_shader_eval(
- KernelGlobals *kg,
- ShaderData *sd, /* Output ShaderData structure to be filled */
- ccl_global uint *rng_coop, /* Required for rbsdf calculation */
- ccl_global Ray *Ray_coop, /* Required for setting up shader from ray */
- ccl_global PathState *PathState_coop, /* Required for all functions in this kernel */
- Intersection *Intersection_coop, /* Required for setting up shader from ray */
- ccl_global char *ray_state, /* Denotes the state of each ray */
- int ray_index)
+
+ccl_device void kernel_shader_eval(KernelGlobals *kg)
{
- if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- Intersection *isect = &Intersection_coop[ray_index];
- ccl_global uint *rng = &rng_coop[ray_index];
- ccl_global PathState *state = &PathState_coop[ray_index];
- Ray ray = Ray_coop[ray_index];
+ /* Enqueue RAY_TO_REGENERATE rays into the QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */
+ ccl_local unsigned int local_queue_atomics;
+ if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+ local_queue_atomics = 0;
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ ray_index = get_ray_index(kg, ray_index,
+ QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_size,
+ 0);
+
+ if(ray_index == QUEUE_EMPTY_SLOT) {
+ return;
+ }
+
+ char enqueue_flag = (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0;
+ enqueue_ray_index_local(ray_index,
+ QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+ enqueue_flag,
+ kernel_split_params.queue_size,
+ &local_queue_atomics,
+ kernel_split_state.queue_data,
+ kernel_split_params.queue_index);
+
+ /* Continue on with shader evaluation. */
+ if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
+ Intersection *isect = &kernel_split_state.isect[ray_index];
+ ccl_global uint *rng = &kernel_split_state.rng[ray_index];
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+ Ray ray = kernel_split_state.ray[ray_index];
shader_setup_from_ray(kg,
- sd,
+ &kernel_split_state.sd[ray_index],
isect,
&ray);
float rbsdf = path_state_rng_1D_for_decision(kg, rng, state, PRNG_BSDF);
- shader_eval_surface(kg, sd, rng, state, rbsdf, state->flag, SHADER_CONTEXT_MAIN);
+ shader_eval_surface(kg, &kernel_split_state.sd[ray_index], rng, state, rbsdf, state->flag, SHADER_CONTEXT_MAIN);
}
}
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked.h b/intern/cycles/kernel/split/kernel_shadow_blocked.h
index 6153af47f96..d532c7cf55b 100644
--- a/intern/cycles/kernel/split/kernel_shadow_blocked.h
+++ b/intern/cycles/kernel/split/kernel_shadow_blocked.h
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
/* Note on kernel_shadow_blocked kernel.
* This is the ninth kernel in the ray tracing logic. This is the eighth
@@ -45,24 +45,47 @@
* and RAY_SHADOW_RAY_CAST_DL respectively, during kernel entry.
* QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty at kernel exit.
*/
-ccl_device void kernel_shadow_blocked(
- KernelGlobals *kg,
- ccl_global PathState *PathState_coop, /* Required for shadow blocked */
- ccl_global Ray *LightRay_dl_coop, /* Required for direct lighting's shadow blocked */
- ccl_global Ray *LightRay_ao_coop, /* Required for AO's shadow blocked */
- ccl_global char *ray_state,
- char shadow_blocked_type,
- int ray_index)
+ccl_device void kernel_shadow_blocked(KernelGlobals *kg)
{
+ int lidx = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0);
+
+ ccl_local unsigned int ao_queue_length;
+ ccl_local unsigned int dl_queue_length;
+ if(lidx == 0) {
+ ao_queue_length = kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS];
+ dl_queue_length = kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS];
+ }
+ ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+ /* Flag determining whether the current ray processes an AO or a DL shadow ray. */
+ char shadow_blocked_type = -1;
+
+ int ray_index = QUEUE_EMPTY_SLOT;
+ int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+ if(thread_index < ao_queue_length + dl_queue_length) {
+ if(thread_index < ao_queue_length) {
+ ray_index = get_ray_index(kg, thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS,
+ kernel_split_state.queue_data, kernel_split_params.queue_size, 1);
+ shadow_blocked_type = RAY_SHADOW_RAY_CAST_AO;
+ }
+ else {
+ ray_index = get_ray_index(kg, thread_index - ao_queue_length, QUEUE_SHADOW_RAY_CAST_DL_RAYS,
+ kernel_split_state.queue_data, kernel_split_params.queue_size, 1);
+ shadow_blocked_type = RAY_SHADOW_RAY_CAST_DL;
+ }
+ }
+
+ if(ray_index == QUEUE_EMPTY_SLOT)
+ return;
+
/* Flag determining if we need to update L. */
char update_path_radiance = 0;
- if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) ||
- IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO))
+ if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) ||
+ IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO))
{
- ccl_global PathState *state = &PathState_coop[ray_index];
- ccl_global Ray *light_ray_dl_global = &LightRay_dl_coop[ray_index];
- ccl_global Ray *light_ray_ao_global = &LightRay_ao_coop[ray_index];
+ ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+ ccl_global Ray *light_ray_dl_global = &kernel_split_state.light_ray[ray_index];
+ ccl_global Ray *light_ray_ao_global = &kernel_split_state.ao_light_ray[ray_index];
ccl_global Ray *light_ray_global =
shadow_blocked_type == RAY_SHADOW_RAY_CAST_AO
@@ -71,7 +94,7 @@ ccl_device void kernel_shadow_blocked(
float3 shadow;
update_path_radiance = !(shadow_blocked(kg,
- kg->sd_input,
+ &kernel_split_state.sd_DL_shadow[thread_index],
state,
light_ray_global,
&shadow));
@@ -83,3 +106,6 @@ ccl_device void kernel_shadow_blocked(
light_ray_global->t = update_path_radiance;
}
}
+
+CCL_NAMESPACE_END
+
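
Draining both shadow queues in one dispatch works by partitioning the flat thread index:
the first ao_queue_length threads service the AO queue, the next dl_queue_length threads
service the DL queue (shifted down by ao_queue_length), and the rest exit through
QUEUE_EMPTY_SLOT. A small sketch of the partitioning with assumed queue lengths:

#include <stdio.h>

int main(void)
{
	int ao_queue_length = 3;  /* assumed */
	int dl_queue_length = 4;  /* assumed */

	for(int thread_index = 0; thread_index < 10; thread_index++) {
		if(thread_index < ao_queue_length)
			printf("thread %d -> AO queue slot %d\n",
			       thread_index, thread_index);
		else if(thread_index < ao_queue_length + dl_queue_length)
			printf("thread %d -> DL queue slot %d\n",
			       thread_index, thread_index - ao_queue_length);
		else
			printf("thread %d -> idle (QUEUE_EMPTY_SLOT)\n",
			       thread_index);
	}
	return 0;
}
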
diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h
index 2135ee22b2e..dd0c3f9c941 100644
--- a/intern/cycles/kernel/split/kernel_split_common.h
+++ b/intern/cycles/kernel/split/kernel_split_common.h
@@ -17,11 +17,23 @@
#ifndef __KERNEL_SPLIT_H__
#define __KERNEL_SPLIT_H__
-#include "kernel_compat_opencl.h"
#include "kernel_math.h"
#include "kernel_types.h"
+
+#include "kernel_split_data.h"
+
#include "kernel_globals.h"
-#include "kernel_image_opencl.h"
+
+#ifdef __OSL__
+# include "osl_shader.h"
+#endif
+
+#ifdef __KERNEL_OPENCL__
+# include "kernel_image_opencl.h"
+#endif
+#ifdef __KERNEL_CPU__
+# include "../kernels/cpu/kernel_cpu_image.h"
+#endif
#include "util_atomic.h"
diff --git a/intern/cycles/kernel/split/kernel_split_data.h b/intern/cycles/kernel/split/kernel_split_data.h
new file mode 100644
index 00000000000..5380c0c5de6
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_split_data.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __KERNEL_SPLIT_DATA_H__
+#define __KERNEL_SPLIT_DATA_H__
+
+#include "kernel_split_data_types.h"
+#include "kernel_globals.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline size_t split_data_buffer_size(KernelGlobals *kg, size_t num_elements)
+{
+ (void)kg; /* Unused on CPU. */
+
+ size_t size = 0;
+#define SPLIT_DATA_ENTRY(type, name, num) + align_up(num_elements * num * sizeof(type), 16)
+ size = size SPLIT_DATA_ENTRIES;
+#undef SPLIT_DATA_ENTRY
+
+ return size;
+}
+
+ccl_device_inline void split_data_init(KernelGlobals *kg,
+ ccl_global SplitData *split_data,
+ size_t num_elements,
+ ccl_global void *data,
+ ccl_global char *ray_state)
+{
+ (void)kg; /* Unused on CPU. */
+
+ ccl_global char *p = (ccl_global char*)data;
+
+#define SPLIT_DATA_ENTRY(type, name, num) \
+ split_data->name = (type*)p; p += align_up(num_elements * num * sizeof(type), 16);
+ SPLIT_DATA_ENTRIES
+#undef SPLIT_DATA_ENTRY
+
+ split_data->ray_state = ray_state;
+}
+
+CCL_NAMESPACE_END
+
+#endif /* __KERNEL_SPLIT_DATA_H__ */
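
Both helpers lean on the X-macro trick: SPLIT_DATA_ENTRIES is a single list of
SPLIT_DATA_ENTRY(type, name, num) invocations, and redefining SPLIT_DATA_ENTRY locally
turns that one list into a struct definition, a size expression, or a pointer-carving
sequence. A self-contained miniature of the same pattern, with two fake entries:

#include <stdio.h>
#include <stdlib.h>

#define align_up(n, a) (((n) + (a) - 1) & ~((size_t)(a) - 1))

/* Stand-in for SPLIT_DATA_ENTRIES; names and types are illustrative. */
#define DEMO_ENTRIES \
	DEMO_ENTRY(float, throughput, 1) \
	DEMO_ENTRY(int,   is_lamp,    1)

typedef struct DemoData {
#define DEMO_ENTRY(type, name, num) type *name;
	DEMO_ENTRIES
#undef DEMO_ENTRY
} DemoData;

static size_t demo_buffer_size(size_t num_elements)
{
	size_t size = 0;
	/* Expands to: size = size + align_up(...) + align_up(...); */
#define DEMO_ENTRY(type, name, num) + align_up(num_elements * (num) * sizeof(type), 16)
	size = size DEMO_ENTRIES;
#undef DEMO_ENTRY
	return size;
}

static void demo_init(DemoData *data, size_t num_elements, void *mem)
{
	char *p = (char*)mem;
#define DEMO_ENTRY(type, name, num) \
	data->name = (type*)p; p += align_up(num_elements * (num) * sizeof(type), 16);
	DEMO_ENTRIES
#undef DEMO_ENTRY
}

int main(void)
{
	size_t n = 100;
	size_t size = demo_buffer_size(n);
	void *mem = malloc(size);

	DemoData data;
	demo_init(&data, n, mem);
	data.throughput[0] = 1.0f;
	data.is_lamp[0] = 0;

	printf("buffer size for %zu rays: %zu bytes\n", n, size);  /* 800 */
	free(mem);
	return 0;
}
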
diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h
new file mode 100644
index 00000000000..62e3ea45ae2
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_split_data_types.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __KERNEL_SPLIT_DATA_TYPES_H__
+#define __KERNEL_SPLIT_DATA_TYPES_H__
+
+CCL_NAMESPACE_BEGIN
+
+/* Parameters used by the split kernels; we use a single struct to avoid passing these to each kernel. */
+
+typedef struct SplitParams {
+ int x;
+ int y;
+ int w;
+ int h;
+
+ int offset;
+ int stride;
+
+ ccl_global uint *rng_state;
+
+ int start_sample;
+ int end_sample;
+
+ ccl_global unsigned int *work_pools;
+ unsigned int num_samples;
+
+ ccl_global int *queue_index;
+ int queue_size;
+ ccl_global char *use_queues_flag;
+
+ ccl_global float *buffer;
+} SplitParams;
+
+/* Global memory variables [porting]; this memory is used for
+ * co-operation between different kernels. Data written by one
+ * kernel will be available to another kernel via this global
+ * memory.
+ */
+
+/* SPLIT_DATA_ENTRY(type, name, num) */
+
+#if defined(WITH_CYCLES_DEBUG) || defined(__KERNEL_DEBUG__)
+/* DebugData memory */
+# define SPLIT_DATA_DEBUG_ENTRIES \
+ SPLIT_DATA_ENTRY(DebugData, debug_data, 1)
+#else
+# define SPLIT_DATA_DEBUG_ENTRIES
+#endif
+
+#define SPLIT_DATA_ENTRIES \
+ SPLIT_DATA_ENTRY(ccl_global RNG, rng, 1) \
+ SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \
+ SPLIT_DATA_ENTRY(ccl_global float, L_transparent, 1) \
+ SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \
+ SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \
+ SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \
+ SPLIT_DATA_ENTRY(Intersection, isect, 1) \
+ SPLIT_DATA_ENTRY(ccl_global float3, ao_alpha, 1) \
+ SPLIT_DATA_ENTRY(ccl_global float3, ao_bsdf, 1) \
+ SPLIT_DATA_ENTRY(ccl_global Ray, ao_light_ray, 1) \
+ SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \
+ SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \
+ SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \
+ SPLIT_DATA_ENTRY(Intersection, isect_shadow, 2) \
+ SPLIT_DATA_ENTRY(ccl_global int, queue_data, (NUM_QUEUES*2)) /* TODO(mai): this is too large? */ \
+ SPLIT_DATA_ENTRY(ccl_global uint, work_array, 1) \
+ SPLIT_DATA_ENTRY(ShaderData, sd, 1) \
+ SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 2) \
+ SPLIT_DATA_DEBUG_ENTRIES \
+
+/* struct that holds pointers to data in the shared state buffer */
+typedef struct SplitData {
+#define SPLIT_DATA_ENTRY(type, name, num) type *name;
+ SPLIT_DATA_ENTRIES
+#undef SPLIT_DATA_ENTRY
+
+ /* ray_state is actually in a separate buffer from the rest of the split state data
+ * (so it can be read back from the host easily), but it is used in the same way as
+ * the other data, so we keep it in this struct as well.
+ */
+ ccl_global char *ray_state;
+} SplitData;
+
+#ifndef __KERNEL_CUDA__
+# define kernel_split_state (kg->split_data)
+# define kernel_split_params (kg->split_param_data)
+#else
+__device__ SplitData __split_data;
+# define kernel_split_state (__split_data)
+__device__ SplitParams __split_param_data;
+# define kernel_split_params (__split_param_data)
+#endif /* __KERNEL_CUDA__ */
+
+CCL_NAMESPACE_END
+
+#endif /* __KERNEL_SPLIT_DATA_TYPES_H__ */
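
The kernel_split_state / kernel_split_params macros are what let one kernel body compile
against both storage schemes: on CPU/OpenCL the split state hangs off KernelGlobals, while
on CUDA it lives in __device__ globals. A minimal sketch of the CPU-style expansion (demo
types only, not the real KernelGlobals):

#include <stdio.h>

typedef struct SplitDataDemo { float *throughput; } SplitDataDemo;
typedef struct KernelGlobalsDemo { SplitDataDemo split_data; } KernelGlobalsDemo;

/* CPU/OpenCL-style expansion; on CUDA the macro would instead alias a
 * __device__ global so no KernelGlobals pointer is needed. */
#define kernel_split_state (kg->split_data)

static void kernel_body(KernelGlobalsDemo *kg, int ray_index)
{
	/* Written once against the macro, independent of where the state lives. */
	kernel_split_state.throughput[ray_index] = 1.0f;
}

int main(void)
{
	float throughput[4] = {0};
	KernelGlobalsDemo kg_storage = {{throughput}};

	kernel_body(&kg_storage, 2);
	printf("throughput[2] = %g\n", kg_storage.split_data.throughput[2]);
	return 0;
}
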
diff --git a/intern/cycles/kernel/split/kernel_sum_all_radiance.h b/intern/cycles/kernel/split/kernel_sum_all_radiance.h
deleted file mode 100644
index a21e9b6a0b1..00000000000
--- a/intern/cycles/kernel/split/kernel_sum_all_radiance.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "../kernel_compat_opencl.h"
-#include "../kernel_math.h"
-#include "../kernel_types.h"
-#include "../kernel_globals.h"
-
-/* Since we process various samples in parallel; The output radiance of different samples
- * are stored in different locations; This kernel combines the output radiance contributed
- * by all different samples and stores them in the RenderTile's output buffer.
- */
-ccl_device void kernel_sum_all_radiance(
- ccl_constant KernelData *data, /* To get pass_stride to offet into buffer */
- ccl_global float *buffer, /* Output buffer of RenderTile */
- ccl_global float *per_sample_output_buffer, /* Radiance contributed by all samples */
- int parallel_samples, int sw, int sh, int stride,
- int buffer_offset_x,
- int buffer_offset_y,
- int buffer_stride,
- int start_sample)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if(x < sw && y < sh) {
- buffer += ((buffer_offset_x + x) + (buffer_offset_y + y) * buffer_stride) * (data->film.pass_stride);
- per_sample_output_buffer += ((x + y * stride) * parallel_samples) * (data->film.pass_stride);
-
- int sample_stride = (data->film.pass_stride);
-
- int sample_iterator = 0;
- int pass_stride_iterator = 0;
- int num_floats = data->film.pass_stride;
-
- for(sample_iterator = 0; sample_iterator < parallel_samples; sample_iterator++) {
- for(pass_stride_iterator = 0; pass_stride_iterator < num_floats; pass_stride_iterator++) {
- *(buffer + pass_stride_iterator) =
- (start_sample == 0 && sample_iterator == 0)
- ? *(per_sample_output_buffer + pass_stride_iterator)
- : *(buffer + pass_stride_iterator) + *(per_sample_output_buffer + pass_stride_iterator);
- }
- per_sample_output_buffer += sample_stride;
- }
- }
-}
diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h
index 88ec7fe6fcc..57ec9f94a3d 100644
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -192,7 +192,7 @@ CCL_NAMESPACE_BEGIN
ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, ShaderType type, int path_flag)
{
float stack[SVM_STACK_SIZE];
- int offset = ccl_fetch(sd, shader) & SHADER_MASK;
+ int offset = sd->shader & SHADER_MASK;
while(1) {
uint4 node = read_node(kg, &offset);
diff --git a/intern/cycles/kernel/svm/svm_attribute.h b/intern/cycles/kernel/svm/svm_attribute.h
index 0e55c99ae97..229a3f20421 100644
--- a/intern/cycles/kernel/svm/svm_attribute.h
+++ b/intern/cycles/kernel/svm/svm_attribute.h
@@ -27,7 +27,7 @@ ccl_device AttributeDescriptor svm_node_attr_init(KernelGlobals *kg, ShaderData
AttributeDescriptor desc;
- if(ccl_fetch(sd, object) != OBJECT_NONE) {
+ if(sd->object != OBJECT_NONE) {
desc = find_attribute(kg, sd, node.y);
if(desc.offset == ATTR_STD_NOT_FOUND) {
desc = attribute_not_found();
diff --git a/intern/cycles/kernel/svm/svm_bump.h b/intern/cycles/kernel/svm/svm_bump.h
index 04a8c7b64e5..610d9af9e1f 100644
--- a/intern/cycles/kernel/svm/svm_bump.h
+++ b/intern/cycles/kernel/svm/svm_bump.h
@@ -21,9 +21,9 @@ CCL_NAMESPACE_BEGIN
ccl_device void svm_node_enter_bump_eval(KernelGlobals *kg, ShaderData *sd, float *stack, uint offset)
{
/* save state */
- stack_store_float3(stack, offset+0, ccl_fetch(sd, P));
- stack_store_float3(stack, offset+3, ccl_fetch(sd, dP).dx);
- stack_store_float3(stack, offset+6, ccl_fetch(sd, dP).dy);
+ stack_store_float3(stack, offset+0, sd->P);
+ stack_store_float3(stack, offset+3, sd->dP.dx);
+ stack_store_float3(stack, offset+6, sd->dP.dy);
/* set state as if undisplaced */
const AttributeDescriptor desc = find_attribute(kg, sd, ATTR_STD_POSITION_UNDISPLACED);
@@ -36,18 +36,18 @@ ccl_device void svm_node_enter_bump_eval(KernelGlobals *kg, ShaderData *sd, floa
object_dir_transform(kg, sd, &dPdx);
object_dir_transform(kg, sd, &dPdy);
- ccl_fetch(sd, P) = P;
- ccl_fetch(sd, dP).dx = dPdx;
- ccl_fetch(sd, dP).dy = dPdy;
+ sd->P = P;
+ sd->dP.dx = dPdx;
+ sd->dP.dy = dPdy;
}
}
ccl_device void svm_node_leave_bump_eval(KernelGlobals *kg, ShaderData *sd, float *stack, uint offset)
{
/* restore state */
- ccl_fetch(sd, P) = stack_load_float3(stack, offset+0);
- ccl_fetch(sd, dP).dx = stack_load_float3(stack, offset+3);
- ccl_fetch(sd, dP).dy = stack_load_float3(stack, offset+6);
+ sd->P = stack_load_float3(stack, offset+0);
+ sd->dP.dx = stack_load_float3(stack, offset+3);
+ sd->dP.dy = stack_load_float3(stack, offset+6);
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_camera.h b/intern/cycles/kernel/svm/svm_camera.h
index 00678a49d70..90249dfd978 100644
--- a/intern/cycles/kernel/svm/svm_camera.h
+++ b/intern/cycles/kernel/svm/svm_camera.h
@@ -23,7 +23,7 @@ ccl_device void svm_node_camera(KernelGlobals *kg, ShaderData *sd, float *stack,
float3 vector;
Transform tfm = kernel_data.cam.worldtocamera;
- vector = transform_point(&tfm, ccl_fetch(sd, P));
+ vector = transform_point(&tfm, sd->P);
zdepth = vector.z;
distance = len(vector);
diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h
index 017d697f9f8..1885e1af851 100644
--- a/intern/cycles/kernel/svm/svm_closure.h
+++ b/intern/cycles/kernel/svm/svm_closure.h
@@ -25,13 +25,13 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, MicrofacetBsdf *bsdf, int t
bsdf->alpha_y = 0.0f;
bsdf->alpha_x = 0.0f;
bsdf->ior = eta;
- ccl_fetch(sd, flag) |= bsdf_refraction_setup(bsdf);
+ sd->flag |= bsdf_refraction_setup(bsdf);
}
else {
bsdf->alpha_y = 0.0f;
bsdf->alpha_x = 0.0f;
bsdf->ior = 0.0f;
- ccl_fetch(sd, flag) |= bsdf_reflection_setup(bsdf);
+ sd->flag |= bsdf_reflection_setup(bsdf);
}
}
else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID) {
@@ -40,9 +40,9 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, MicrofacetBsdf *bsdf, int t
bsdf->ior = eta;
if(refract)
- ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(bsdf);
+ sd->flag |= bsdf_microfacet_beckmann_refraction_setup(bsdf);
else
- ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(bsdf);
+ sd->flag |= bsdf_microfacet_beckmann_setup(bsdf);
}
else {
bsdf->alpha_x = roughness;
@@ -50,9 +50,9 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, MicrofacetBsdf *bsdf, int t
bsdf->ior = eta;
if(refract)
- ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(bsdf);
+ sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf);
else
- ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(bsdf);
+ sd->flag |= bsdf_microfacet_ggx_setup(bsdf);
}
}
@@ -70,14 +70,14 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
if(mix_weight == 0.0f)
return;
- float3 N = stack_valid(data_node.x)? stack_load_float3(stack, data_node.x): ccl_fetch(sd, N);
+ float3 N = stack_valid(data_node.x)? stack_load_float3(stack, data_node.x): sd->N;
float param1 = (stack_valid(param1_offset))? stack_load_float(stack, param1_offset): __uint_as_float(node.z);
float param2 = (stack_valid(param2_offset))? stack_load_float(stack, param2_offset): __uint_as_float(node.w);
switch(type) {
case CLOSURE_BSDF_DIFFUSE_ID: {
- float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+ float3 weight = sd->svm_closure_weight * mix_weight;
OrenNayarBsdf *bsdf = (OrenNayarBsdf*)bsdf_alloc(sd, sizeof(OrenNayarBsdf), weight);
if(bsdf) {
@@ -86,31 +86,31 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
float roughness = param1;
if(roughness == 0.0f) {
- ccl_fetch(sd, flag) |= bsdf_diffuse_setup((DiffuseBsdf*)bsdf);
+ sd->flag |= bsdf_diffuse_setup((DiffuseBsdf*)bsdf);
}
else {
bsdf->roughness = roughness;
- ccl_fetch(sd, flag) |= bsdf_oren_nayar_setup(bsdf);
+ sd->flag |= bsdf_oren_nayar_setup(bsdf);
}
}
break;
}
case CLOSURE_BSDF_TRANSLUCENT_ID: {
- float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+ float3 weight = sd->svm_closure_weight * mix_weight;
DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight);
if(bsdf) {
bsdf->N = N;
- ccl_fetch(sd, flag) |= bsdf_translucent_setup(bsdf);
+ sd->flag |= bsdf_translucent_setup(bsdf);
}
break;
}
case CLOSURE_BSDF_TRANSPARENT_ID: {
- float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+ float3 weight = sd->svm_closure_weight * mix_weight;
ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight);
if(bsdf) {
- ccl_fetch(sd, flag) |= bsdf_transparent_setup(bsdf);
+ sd->flag |= bsdf_transparent_setup(bsdf);
}
break;
}
@@ -123,7 +123,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE))
break;
#endif
- float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+ float3 weight = sd->svm_closure_weight * mix_weight;
MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight);
if(bsdf) {
@@ -135,21 +135,21 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
/* setup bsdf */
if(type == CLOSURE_BSDF_REFLECTION_ID)
- ccl_fetch(sd, flag) |= bsdf_reflection_setup(bsdf);
+ sd->flag |= bsdf_reflection_setup(bsdf);
else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ID)
- ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(bsdf);
+ sd->flag |= bsdf_microfacet_beckmann_setup(bsdf);
else if(type == CLOSURE_BSDF_MICROFACET_GGX_ID)
- ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(bsdf);
+ sd->flag |= bsdf_microfacet_ggx_setup(bsdf);
else if(type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID) {
kernel_assert(stack_valid(data_node.z));
bsdf->extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
if(bsdf->extra) {
bsdf->extra->color = stack_load_float3(stack, data_node.z);
- ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_setup(bsdf);
+ sd->flag |= bsdf_microfacet_multi_ggx_setup(bsdf);
}
}
else
- ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_setup(bsdf);
+ sd->flag |= bsdf_ashikhmin_shirley_setup(bsdf);
}
break;
@@ -161,7 +161,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
if(!kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE))
break;
#endif
- float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+ float3 weight = sd->svm_closure_weight * mix_weight;
MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight);
if(bsdf) {
@@ -169,7 +169,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
bsdf->extra = NULL;
float eta = fmaxf(param2, 1e-5f);
- eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;
+ eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
/* setup bsdf */
if(type == CLOSURE_BSDF_REFRACTION_ID) {
@@ -177,7 +177,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
bsdf->alpha_y = 0.0f;
bsdf->ior = eta;
- ccl_fetch(sd, flag) |= bsdf_refraction_setup(bsdf);
+ sd->flag |= bsdf_refraction_setup(bsdf);
}
else {
bsdf->alpha_x = param1;
@@ -185,9 +185,9 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
bsdf->ior = eta;
if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID)
- ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(bsdf);
+ sd->flag |= bsdf_microfacet_beckmann_refraction_setup(bsdf);
else
- ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(bsdf);
+ sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf);
}
}
@@ -203,14 +203,14 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
break;
}
#endif
- float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+ float3 weight = sd->svm_closure_weight * mix_weight;
/* index of refraction */
float eta = fmaxf(param2, 1e-5f);
- eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;
+ eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
/* fresnel */
- float cosNO = dot(N, ccl_fetch(sd, I));
+ float cosNO = dot(N, sd->I);
float fresnel = fresnel_dielectric_cos(cosNO, eta);
float roughness = param1;
@@ -249,7 +249,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
if(!kernel_data.integrator.caustics_reflective && !kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE))
break;
#endif
- float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+ float3 weight = sd->svm_closure_weight * mix_weight;
MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight);
MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
@@ -261,13 +261,13 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
bsdf->alpha_x = param1;
bsdf->alpha_y = param1;
float eta = fmaxf(param2, 1e-5f);
- bsdf->ior = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;
+ bsdf->ior = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
kernel_assert(stack_valid(data_node.z));
bsdf->extra->color = stack_load_float3(stack, data_node.z);
/* setup bsdf */
- ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_glass_setup(bsdf);
+ sd->flag |= bsdf_microfacet_multi_ggx_glass_setup(bsdf);
}
break;
@@ -280,7 +280,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE))
break;
#endif
- float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+ float3 weight = sd->svm_closure_weight * mix_weight;
MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight);
if(bsdf) {
@@ -310,33 +310,33 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
bsdf->ior = 0.0f;
if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID) {
- ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_aniso_setup(bsdf);
+ sd->flag |= bsdf_microfacet_beckmann_aniso_setup(bsdf);
}
else if(type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID) {
- ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_aniso_setup(bsdf);
+ sd->flag |= bsdf_microfacet_ggx_aniso_setup(bsdf);
}
else if(type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID) {
kernel_assert(stack_valid(data_node.w));
bsdf->extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
if(bsdf->extra) {
bsdf->extra->color = stack_load_float3(stack, data_node.w);
- ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_aniso_setup(bsdf);
+ sd->flag |= bsdf_microfacet_multi_ggx_aniso_setup(bsdf);
}
}
else
- ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_aniso_setup(bsdf);
+ sd->flag |= bsdf_ashikhmin_shirley_aniso_setup(bsdf);
}
break;
}
case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: {
- float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+ float3 weight = sd->svm_closure_weight * mix_weight;
VelvetBsdf *bsdf = (VelvetBsdf*)bsdf_alloc(sd, sizeof(VelvetBsdf), weight);
if(bsdf) {
bsdf->N = N;
bsdf->sigma = saturate(param1);
- ccl_fetch(sd, flag) |= bsdf_ashikhmin_velvet_setup(bsdf);
+ sd->flag |= bsdf_ashikhmin_velvet_setup(bsdf);
}
break;
}
@@ -346,7 +346,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
break;
#endif
case CLOSURE_BSDF_DIFFUSE_TOON_ID: {
- float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+ float3 weight = sd->svm_closure_weight * mix_weight;
ToonBsdf *bsdf = (ToonBsdf*)bsdf_alloc(sd, sizeof(ToonBsdf), weight);
if(bsdf) {
@@ -355,18 +355,18 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
bsdf->smooth = param2;
if(type == CLOSURE_BSDF_DIFFUSE_TOON_ID)
- ccl_fetch(sd, flag) |= bsdf_diffuse_toon_setup(bsdf);
+ sd->flag |= bsdf_diffuse_toon_setup(bsdf);
else
- ccl_fetch(sd, flag) |= bsdf_glossy_toon_setup(bsdf);
+ sd->flag |= bsdf_glossy_toon_setup(bsdf);
}
break;
}
#ifdef __HAIR__
case CLOSURE_BSDF_HAIR_REFLECTION_ID:
case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: {
- float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+ float3 weight = sd->svm_closure_weight * mix_weight;
- if(ccl_fetch(sd, flag) & SD_BACKFACING && ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
+ if(sd->flag & SD_BACKFACING && sd->type & PRIMITIVE_ALL_CURVE) {
ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight);
if(bsdf) {
@@ -376,7 +376,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
* better figure out a way to skip backfaces from rays
* spawned by transmission from the front */
bsdf->weight = make_float3(1.0f, 1.0f, 1.0f);
- ccl_fetch(sd, flag) |= bsdf_transparent_setup(bsdf);
+ sd->flag |= bsdf_transparent_setup(bsdf);
}
}
else {
@@ -390,18 +390,18 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
if(stack_valid(data_node.y)) {
bsdf->T = normalize(stack_load_float3(stack, data_node.y));
}
- else if(!(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)) {
- bsdf->T = normalize(ccl_fetch(sd, dPdv));
+ else if(!(sd->type & PRIMITIVE_ALL_CURVE)) {
+ bsdf->T = normalize(sd->dPdv);
bsdf->offset = 0.0f;
}
else
- bsdf->T = normalize(ccl_fetch(sd, dPdu));
+ bsdf->T = normalize(sd->dPdu);
if(type == CLOSURE_BSDF_HAIR_REFLECTION_ID) {
- ccl_fetch(sd, flag) |= bsdf_hair_reflection_setup(bsdf);
+ sd->flag |= bsdf_hair_reflection_setup(bsdf);
}
else {
- ccl_fetch(sd, flag) |= bsdf_hair_transmission_setup(bsdf);
+ sd->flag |= bsdf_hair_transmission_setup(bsdf);
}
}
}
@@ -414,8 +414,8 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
case CLOSURE_BSSRDF_CUBIC_ID:
case CLOSURE_BSSRDF_GAUSSIAN_ID:
case CLOSURE_BSSRDF_BURLEY_ID: {
- float3 albedo = ccl_fetch(sd, svm_closure_weight);
- float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+ float3 albedo = sd->svm_closure_weight;
+ float3 weight = sd->svm_closure_weight * mix_weight;
float sample_weight = fabsf(average(weight));
/* disable in case of diffuse ancestor, can't see it well then and
@@ -441,7 +441,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
bssrdf->albedo = albedo.x;
bssrdf->sharpness = sharpness;
bssrdf->N = N;
- ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type);
+ sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type);
}
bssrdf = bssrdf_alloc(sd, make_float3(0.0f, weight.y, 0.0f));
@@ -452,7 +452,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
bssrdf->albedo = albedo.y;
bssrdf->sharpness = sharpness;
bssrdf->N = N;
- ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type);
+ sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type);
}
bssrdf = bssrdf_alloc(sd, make_float3(0.0f, 0.0f, weight.z));
@@ -463,7 +463,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
bssrdf->albedo = albedo.z;
bssrdf->sharpness = sharpness;
bssrdf->N = N;
- ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type);
+ sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type);
}
}
@@ -493,21 +493,21 @@ ccl_device void svm_node_closure_volume(KernelGlobals *kg, ShaderData *sd, float
switch(type) {
case CLOSURE_VOLUME_ABSORPTION_ID: {
- float3 weight = (make_float3(1.0f, 1.0f, 1.0f) - ccl_fetch(sd, svm_closure_weight)) * mix_weight * density;
+ float3 weight = (make_float3(1.0f, 1.0f, 1.0f) - sd->svm_closure_weight) * mix_weight * density;
ShaderClosure *sc = closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_NONE_ID, weight);
if(sc) {
- ccl_fetch(sd, flag) |= volume_absorption_setup(sc);
+ sd->flag |= volume_absorption_setup(sc);
}
break;
}
case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: {
- float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight * density;
+ float3 weight = sd->svm_closure_weight * mix_weight * density;
HenyeyGreensteinVolume *volume = (HenyeyGreensteinVolume*)bsdf_alloc(sd, sizeof(HenyeyGreensteinVolume), weight);
if(volume) {
volume->g = param2; /* g */
- ccl_fetch(sd, flag) |= volume_henyey_greenstein_setup(volume);
+ sd->flag |= volume_henyey_greenstein_setup(volume);
}
break;
}
@@ -527,12 +527,12 @@ ccl_device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 no
if(mix_weight == 0.0f)
return;
- closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight);
+ closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, sd->svm_closure_weight * mix_weight);
}
else
- closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, ccl_fetch(sd, svm_closure_weight));
+ closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, sd->svm_closure_weight);
- ccl_fetch(sd, flag) |= SD_EMISSION;
+ sd->flag |= SD_EMISSION;
}
ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4 node)
@@ -545,10 +545,10 @@ ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4
if(mix_weight == 0.0f)
return;
- closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight);
+ closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, sd->svm_closure_weight * mix_weight);
}
else
- closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, ccl_fetch(sd, svm_closure_weight));
+ closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, sd->svm_closure_weight);
}
ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 node)
@@ -561,12 +561,12 @@ ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 nod
if(mix_weight == 0.0f)
return;
- closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight);
+ closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, sd->svm_closure_weight * mix_weight);
}
else
- closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, ccl_fetch(sd, svm_closure_weight));
+ closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, sd->svm_closure_weight);
- ccl_fetch(sd, flag) |= SD_HOLDOUT;
+ sd->flag |= SD_HOLDOUT;
}
ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack, uint4 node)
@@ -579,19 +579,19 @@ ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack,
if(mix_weight == 0.0f)
return;
- closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight);
+ closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, sd->svm_closure_weight * mix_weight);
}
else
- closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, ccl_fetch(sd, svm_closure_weight));
+ closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, sd->svm_closure_weight);
- ccl_fetch(sd, flag) |= SD_AO;
+ sd->flag |= SD_AO;
}
/* Closure Nodes */
ccl_device_inline void svm_node_closure_store_weight(ShaderData *sd, float3 weight)
{
- ccl_fetch(sd, svm_closure_weight) = weight;
+ sd->svm_closure_weight = weight;
}
ccl_device void svm_node_closure_set_weight(ShaderData *sd, uint r, uint g, uint b)
@@ -641,7 +641,7 @@ ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node)
ccl_device void svm_node_set_normal(KernelGlobals *kg, ShaderData *sd, float *stack, uint in_direction, uint out_normal)
{
float3 normal = stack_load_float3(stack, in_direction);
- ccl_fetch(sd, N) = normal;
+ sd->N = normal;
stack_store_float3(stack, out_normal, normal);
}
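
The long run of one-line substitutions above (and through the rest of these SVM headers) is mechanical: every `ccl_fetch(sd, field)` becomes a direct `sd->field` access. `ccl_fetch` was an accessor macro that allowed `ShaderData` to be addressed as a structure-of-arrays on some devices while remaining a plain struct on others; with that layout dropped in this merge, the indirection goes away. A minimal sketch of the idea the macro used to serve (the `WITH_SOA` switch and field names are illustrative only, not the real headers):

/* How one accessor macro can hide AoS vs. SoA ShaderData layouts. */
#include <cstdio>

static const int thread_id = 0; /* stand-in for the device thread index */

#ifdef WITH_SOA
struct ShaderData { float *u; int *flag; };            /* one array per field */
#define ccl_fetch(sd, field) ((sd)->field[thread_id])
#else
struct ShaderData { float u; int flag; };              /* one struct per thread */
#define ccl_fetch(sd, field) ((sd)->field)
#endif

int main()
{
#ifdef WITH_SOA
	float u_arr[1] = {0.25f}; int flag_arr[1] = {1};
	ShaderData sd = {u_arr, flag_arr};
#else
	ShaderData sd = {0.25f, 1};
#endif
	std::printf("u=%f flag=%d\n", double(ccl_fetch(&sd, u)), ccl_fetch(&sd, flag));
	return 0;
}
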
diff --git a/intern/cycles/kernel/svm/svm_displace.h b/intern/cycles/kernel/svm/svm_displace.h
index 890ab41aaaa..c94fa130af7 100644
--- a/intern/cycles/kernel/svm/svm_displace.h
+++ b/intern/cycles/kernel/svm/svm_displace.h
@@ -25,10 +25,10 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
uint normal_offset, distance_offset, invert, use_object_space;
decode_node_uchar4(node.y, &normal_offset, &distance_offset, &invert, &use_object_space);
- float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N);
+ float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): sd->N;
- float3 dPdx = ccl_fetch(sd, dP).dx;
- float3 dPdy = ccl_fetch(sd, dP).dy;
+ float3 dPdx = sd->dP.dx;
+ float3 dPdy = sd->dP.dy;
if(use_object_space) {
object_inverse_normal_transform(kg, sd, &normal_in);
@@ -80,14 +80,14 @@ ccl_device void svm_node_set_displacement(KernelGlobals *kg, ShaderData *sd, flo
{
float d = stack_load_float(stack, fac_offset);
- float3 dP = ccl_fetch(sd, N);
+ float3 dP = sd->N;
object_inverse_normal_transform(kg, sd, &dP);
dP *= d*0.1f; /* todo: get rid of this factor */
object_dir_transform(kg, sd, &dP);
- ccl_fetch(sd, P) += dP;
+ sd->P += dP;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_fresnel.h b/intern/cycles/kernel/svm/svm_fresnel.h
index 23c97d80cb0..3703ec55015 100644
--- a/intern/cycles/kernel/svm/svm_fresnel.h
+++ b/intern/cycles/kernel/svm/svm_fresnel.h
@@ -23,12 +23,12 @@ ccl_device void svm_node_fresnel(ShaderData *sd, float *stack, uint ior_offset,
uint normal_offset, out_offset;
decode_node_uchar4(node, &normal_offset, &out_offset, NULL, NULL);
float eta = (stack_valid(ior_offset))? stack_load_float(stack, ior_offset): __uint_as_float(ior_value);
- float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N);
+ float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): sd->N;
eta = fmaxf(eta, 1e-5f);
- eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;
+ eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
- float f = fresnel_dielectric_cos(dot(ccl_fetch(sd, I), normal_in), eta);
+ float f = fresnel_dielectric_cos(dot(sd->I, normal_in), eta);
stack_store_float(stack, out_offset, f);
}
@@ -44,18 +44,18 @@ ccl_device void svm_node_layer_weight(ShaderData *sd, float *stack, uint4 node)
decode_node_uchar4(node.w, &type, &normal_offset, &out_offset, NULL);
float blend = (stack_valid(blend_offset))? stack_load_float(stack, blend_offset): __uint_as_float(blend_value);
- float3 normal_in = (stack_valid(normal_offset))? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N);
+ float3 normal_in = (stack_valid(normal_offset))? stack_load_float3(stack, normal_offset): sd->N;
float f;
if(type == NODE_LAYER_WEIGHT_FRESNEL) {
float eta = fmaxf(1.0f - blend, 1e-5f);
- eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? eta: 1.0f/eta;
+ eta = (sd->flag & SD_BACKFACING)? eta: 1.0f/eta;
- f = fresnel_dielectric_cos(dot(ccl_fetch(sd, I), normal_in), eta);
+ f = fresnel_dielectric_cos(dot(sd->I, normal_in), eta);
}
else {
- f = fabsf(dot(ccl_fetch(sd, I), normal_in));
+ f = fabsf(dot(sd->I, normal_in));
if(blend != 0.5f) {
blend = clamp(blend, 0.0f, 1.0f-1e-5f);
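
Both hunks keep the same convention: `eta` is flipped to its reciprocal when the surface is backfacing, so the IOR ratio is always taken from the side the ray arrives on, and the cosine fed to `fresnel_dielectric_cos` comes from `dot(sd->I, normal_in)`. A stand-alone sketch of the standard unpolarized dielectric Fresnel this relies on (a reimplementation for illustration, not the Cycles header itself):

#include <cmath>
#include <cstdio>

static float fresnel_dielectric_cos(float cosi, float eta)
{
	float c = std::fabs(cosi);
	float g = eta * eta - 1.0f + c * c;
	if(g > 0.0f) {
		g = std::sqrt(g);
		float A = (g - c) / (g + c);
		float B = (c * (g + c) - 1.0f) / (c * (g - c) + 1.0f);
		return 0.5f * A * A * (1.0f + B * B);
	}
	return 1.0f; /* total internal reflection */
}

int main()
{
	float eta = 1.5f, cosNI = 0.5f;
	std::printf("front: %f\n", fresnel_dielectric_cos(cosNI, eta));        /* ~0.089 */
	std::printf("back:  %f\n", fresnel_dielectric_cos(cosNI, 1.0f / eta)); /* 1.0: TIR */
	return 0;
}
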
diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h
index 7d512f7ff4d..4a09d9f6653 100644
--- a/intern/cycles/kernel/svm/svm_geometry.h
+++ b/intern/cycles/kernel/svm/svm_geometry.h
@@ -27,15 +27,15 @@ ccl_device_inline void svm_node_geometry(KernelGlobals *kg,
float3 data;
switch(type) {
- case NODE_GEOM_P: data = ccl_fetch(sd, P); break;
- case NODE_GEOM_N: data = ccl_fetch(sd, N); break;
+ case NODE_GEOM_P: data = sd->P; break;
+ case NODE_GEOM_N: data = sd->N; break;
#ifdef __DPDU__
case NODE_GEOM_T: data = primitive_tangent(kg, sd); break;
#endif
- case NODE_GEOM_I: data = ccl_fetch(sd, I); break;
- case NODE_GEOM_Ng: data = ccl_fetch(sd, Ng); break;
+ case NODE_GEOM_I: data = sd->I; break;
+ case NODE_GEOM_Ng: data = sd->Ng; break;
#ifdef __UV__
- case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u), ccl_fetch(sd, v), 0.0f); break;
+ case NODE_GEOM_uv: data = make_float3(sd->u, sd->v, 0.0f); break;
#endif
}
@@ -48,8 +48,8 @@ ccl_device void svm_node_geometry_bump_dx(KernelGlobals *kg, ShaderData *sd, flo
float3 data;
switch(type) {
- case NODE_GEOM_P: data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx; break;
- case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u) + ccl_fetch(sd, du).dx, ccl_fetch(sd, v) + ccl_fetch(sd, dv).dx, 0.0f); break;
+ case NODE_GEOM_P: data = sd->P + sd->dP.dx; break;
+ case NODE_GEOM_uv: data = make_float3(sd->u + sd->du.dx, sd->v + sd->dv.dx, 0.0f); break;
default: svm_node_geometry(kg, sd, stack, type, out_offset); return;
}
@@ -65,8 +65,8 @@ ccl_device void svm_node_geometry_bump_dy(KernelGlobals *kg, ShaderData *sd, flo
float3 data;
switch(type) {
- case NODE_GEOM_P: data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy; break;
- case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u) + ccl_fetch(sd, du).dy, ccl_fetch(sd, v) + ccl_fetch(sd, dv).dy, 0.0f); break;
+ case NODE_GEOM_P: data = sd->P + sd->dP.dy; break;
+ case NODE_GEOM_uv: data = make_float3(sd->u + sd->du.dy, sd->v + sd->dv.dy, 0.0f); break;
default: svm_node_geometry(kg, sd, stack, type, out_offset); return;
}
@@ -87,9 +87,9 @@ ccl_device void svm_node_object_info(KernelGlobals *kg, ShaderData *sd, float *s
stack_store_float3(stack, out_offset, object_location(kg, sd));
return;
}
- case NODE_INFO_OB_INDEX: data = object_pass_id(kg, ccl_fetch(sd, object)); break;
+ case NODE_INFO_OB_INDEX: data = object_pass_id(kg, sd->object); break;
case NODE_INFO_MAT_INDEX: data = shader_pass_id(kg, sd); break;
- case NODE_INFO_OB_RANDOM: data = object_random_number(kg, ccl_fetch(sd, object)); break;
+ case NODE_INFO_OB_RANDOM: data = object_random_number(kg, sd->object); break;
default: data = 0.0f; break;
}
@@ -106,44 +106,44 @@ ccl_device void svm_node_particle_info(KernelGlobals *kg,
{
switch(type) {
case NODE_INFO_PAR_INDEX: {
- int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+ int particle_id = object_particle_id(kg, sd->object);
stack_store_float(stack, out_offset, particle_index(kg, particle_id));
break;
}
case NODE_INFO_PAR_AGE: {
- int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+ int particle_id = object_particle_id(kg, sd->object);
stack_store_float(stack, out_offset, particle_age(kg, particle_id));
break;
}
case NODE_INFO_PAR_LIFETIME: {
- int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+ int particle_id = object_particle_id(kg, sd->object);
stack_store_float(stack, out_offset, particle_lifetime(kg, particle_id));
break;
}
case NODE_INFO_PAR_LOCATION: {
- int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+ int particle_id = object_particle_id(kg, sd->object);
stack_store_float3(stack, out_offset, particle_location(kg, particle_id));
break;
}
#if 0 /* XXX float4 currently not supported in SVM stack */
case NODE_INFO_PAR_ROTATION: {
- int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+ int particle_id = object_particle_id(kg, sd->object);
stack_store_float4(stack, out_offset, particle_rotation(kg, particle_id));
break;
}
#endif
case NODE_INFO_PAR_SIZE: {
- int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+ int particle_id = object_particle_id(kg, sd->object);
stack_store_float(stack, out_offset, particle_size(kg, particle_id));
break;
}
case NODE_INFO_PAR_VELOCITY: {
- int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+ int particle_id = object_particle_id(kg, sd->object);
stack_store_float3(stack, out_offset, particle_velocity(kg, particle_id));
break;
}
case NODE_INFO_PAR_ANGULAR_VELOCITY: {
- int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+ int particle_id = object_particle_id(kg, sd->object);
stack_store_float3(stack, out_offset, particle_angular_velocity(kg, particle_id));
break;
}
@@ -165,7 +165,7 @@ ccl_device void svm_node_hair_info(KernelGlobals *kg,
switch(type) {
case NODE_INFO_CURVE_IS_STRAND: {
- data = (ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) != 0;
+ data = (sd->type & PRIMITIVE_ALL_CURVE) != 0;
stack_store_float(stack, out_offset, data);
break;
}
@@ -177,7 +177,7 @@ ccl_device void svm_node_hair_info(KernelGlobals *kg,
break;
}
/*case NODE_INFO_CURVE_FADE: {
- data = ccl_fetch(sd, curve_transparency);
+ data = sd->curve_transparency;
stack_store_float(stack, out_offset, data);
break;
}*/
diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h
index 0d6efb47223..76acc9253a1 100644
--- a/intern/cycles/kernel/svm/svm_image.h
+++ b/intern/cycles/kernel/svm/svm_image.h
@@ -237,9 +237,9 @@ ccl_device void svm_node_tex_image(KernelGlobals *kg, ShaderData *sd, float *sta
ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
{
/* get object space normal */
- float3 N = ccl_fetch(sd, N);
+ float3 N = sd->N;
- N = ccl_fetch(sd, N);
+ N = sd->N;
object_inverse_normal_transform(kg, sd, &N);
/* project from direction vector to barycentric coordinates in triangles */
diff --git a/intern/cycles/kernel/svm/svm_light_path.h b/intern/cycles/kernel/svm/svm_light_path.h
index 04f6f623f18..1492e358608 100644
--- a/intern/cycles/kernel/svm/svm_light_path.h
+++ b/intern/cycles/kernel/svm/svm_light_path.h
@@ -31,8 +31,8 @@ ccl_device void svm_node_light_path(ShaderData *sd, ccl_addr_space PathState *st
case NODE_LP_reflection: info = (path_flag & PATH_RAY_REFLECT)? 1.0f: 0.0f; break;
case NODE_LP_transmission: info = (path_flag & PATH_RAY_TRANSMIT)? 1.0f: 0.0f; break;
case NODE_LP_volume_scatter: info = (path_flag & PATH_RAY_VOLUME_SCATTER)? 1.0f: 0.0f; break;
- case NODE_LP_backfacing: info = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f: 0.0f; break;
- case NODE_LP_ray_length: info = ccl_fetch(sd, ray_length); break;
+ case NODE_LP_backfacing: info = (sd->flag & SD_BACKFACING)? 1.0f: 0.0f; break;
+ case NODE_LP_ray_length: info = sd->ray_length; break;
case NODE_LP_ray_depth: info = (float)state->bounce; break;
case NODE_LP_ray_diffuse: info = (float)state->diffuse_bounce; break;
case NODE_LP_ray_glossy: info = (float)state->glossy_bounce; break;
@@ -56,14 +56,14 @@ ccl_device void svm_node_light_falloff(ShaderData *sd, float *stack, uint4 node)
switch(type) {
case NODE_LIGHT_FALLOFF_QUADRATIC: break;
- case NODE_LIGHT_FALLOFF_LINEAR: strength *= ccl_fetch(sd, ray_length); break;
- case NODE_LIGHT_FALLOFF_CONSTANT: strength *= ccl_fetch(sd, ray_length)*ccl_fetch(sd, ray_length); break;
+ case NODE_LIGHT_FALLOFF_LINEAR: strength *= sd->ray_length; break;
+ case NODE_LIGHT_FALLOFF_CONSTANT: strength *= sd->ray_length*sd->ray_length; break;
}
float smooth = stack_load_float(stack, smooth_offset);
if(smooth > 0.0f) {
- float squared = ccl_fetch(sd, ray_length)*ccl_fetch(sd, ray_length);
+ float squared = sd->ray_length*sd->ray_length;
/* Distant lamps set the ray length to FLT_MAX, which causes squared to overflow. */
if(isfinite(squared)) {
strength *= squared/(smooth + squared);
diff --git a/intern/cycles/kernel/svm/svm_tex_coord.h b/intern/cycles/kernel/svm/svm_tex_coord.h
index c0b01262212..c94327401f5 100644
--- a/intern/cycles/kernel/svm/svm_tex_coord.h
+++ b/intern/cycles/kernel/svm/svm_tex_coord.h
@@ -31,9 +31,9 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg,
switch(type) {
case NODE_TEXCO_OBJECT: {
- data = ccl_fetch(sd, P);
+ data = sd->P;
if(node.w == 0) {
- if(ccl_fetch(sd, object) != OBJECT_NONE) {
+ if(sd->object != OBJECT_NONE) {
object_inverse_position_transform(kg, sd, &data);
}
}
@@ -48,47 +48,47 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg,
break;
}
case NODE_TEXCO_NORMAL: {
- data = ccl_fetch(sd, N);
+ data = sd->N;
object_inverse_normal_transform(kg, sd, &data);
break;
}
case NODE_TEXCO_CAMERA: {
Transform tfm = kernel_data.cam.worldtocamera;
- if(ccl_fetch(sd, object) != OBJECT_NONE)
- data = transform_point(&tfm, ccl_fetch(sd, P));
+ if(sd->object != OBJECT_NONE)
+ data = transform_point(&tfm, sd->P);
else
- data = transform_point(&tfm, ccl_fetch(sd, P) + camera_position(kg));
+ data = transform_point(&tfm, sd->P + camera_position(kg));
break;
}
case NODE_TEXCO_WINDOW: {
- if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
- data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P));
+ if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
+ data = camera_world_to_ndc(kg, sd, sd->ray_P);
else
- data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P));
+ data = camera_world_to_ndc(kg, sd, sd->P);
data.z = 0.0f;
break;
}
case NODE_TEXCO_REFLECTION: {
- if(ccl_fetch(sd, object) != OBJECT_NONE)
- data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I);
+ if(sd->object != OBJECT_NONE)
+ data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I;
else
- data = ccl_fetch(sd, I);
+ data = sd->I;
break;
}
case NODE_TEXCO_DUPLI_GENERATED: {
- data = object_dupli_generated(kg, ccl_fetch(sd, object));
+ data = object_dupli_generated(kg, sd->object);
break;
}
case NODE_TEXCO_DUPLI_UV: {
- data = object_dupli_uv(kg, ccl_fetch(sd, object));
+ data = object_dupli_uv(kg, sd->object);
break;
}
case NODE_TEXCO_VOLUME_GENERATED: {
- data = ccl_fetch(sd, P);
+ data = sd->P;
#ifdef __VOLUME__
- if(ccl_fetch(sd, object) != OBJECT_NONE)
+ if(sd->object != OBJECT_NONE)
data = volume_normalized_position(kg, sd, data);
#endif
break;
@@ -112,9 +112,9 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg,
switch(type) {
case NODE_TEXCO_OBJECT: {
- data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx;
+ data = sd->P + sd->dP.dx;
if(node.w == 0) {
- if(ccl_fetch(sd, object) != OBJECT_NONE) {
+ if(sd->object != OBJECT_NONE) {
object_inverse_position_transform(kg, sd, &data);
}
}
@@ -129,47 +129,47 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg,
break;
}
case NODE_TEXCO_NORMAL: {
- data = ccl_fetch(sd, N);
+ data = sd->N;
object_inverse_normal_transform(kg, sd, &data);
break;
}
case NODE_TEXCO_CAMERA: {
Transform tfm = kernel_data.cam.worldtocamera;
- if(ccl_fetch(sd, object) != OBJECT_NONE)
- data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx);
+ if(sd->object != OBJECT_NONE)
+ data = transform_point(&tfm, sd->P + sd->dP.dx);
else
- data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx + camera_position(kg));
+ data = transform_point(&tfm, sd->P + sd->dP.dx + camera_position(kg));
break;
}
case NODE_TEXCO_WINDOW: {
- if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
- data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P) + ccl_fetch(sd, ray_dP).dx);
+ if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
+ data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dx);
else
- data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx);
+ data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dx);
data.z = 0.0f;
break;
}
case NODE_TEXCO_REFLECTION: {
- if(ccl_fetch(sd, object) != OBJECT_NONE)
- data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I);
+ if(sd->object != OBJECT_NONE)
+ data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I;
else
- data = ccl_fetch(sd, I);
+ data = sd->I;
break;
}
case NODE_TEXCO_DUPLI_GENERATED: {
- data = object_dupli_generated(kg, ccl_fetch(sd, object));
+ data = object_dupli_generated(kg, sd->object);
break;
}
case NODE_TEXCO_DUPLI_UV: {
- data = object_dupli_uv(kg, ccl_fetch(sd, object));
+ data = object_dupli_uv(kg, sd->object);
break;
}
case NODE_TEXCO_VOLUME_GENERATED: {
- data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx;
+ data = sd->P + sd->dP.dx;
#ifdef __VOLUME__
- if(ccl_fetch(sd, object) != OBJECT_NONE)
+ if(sd->object != OBJECT_NONE)
data = volume_normalized_position(kg, sd, data);
#endif
break;
@@ -196,9 +196,9 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg,
switch(type) {
case NODE_TEXCO_OBJECT: {
- data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy;
+ data = sd->P + sd->dP.dy;
if(node.w == 0) {
- if(ccl_fetch(sd, object) != OBJECT_NONE) {
+ if(sd->object != OBJECT_NONE) {
object_inverse_position_transform(kg, sd, &data);
}
}
@@ -213,47 +213,47 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg,
break;
}
case NODE_TEXCO_NORMAL: {
- data = ccl_fetch(sd, N);
+ data = sd->N;
object_inverse_normal_transform(kg, sd, &data);
break;
}
case NODE_TEXCO_CAMERA: {
Transform tfm = kernel_data.cam.worldtocamera;
- if(ccl_fetch(sd, object) != OBJECT_NONE)
- data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy);
+ if(sd->object != OBJECT_NONE)
+ data = transform_point(&tfm, sd->P + sd->dP.dy);
else
- data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy + camera_position(kg));
+ data = transform_point(&tfm, sd->P + sd->dP.dy + camera_position(kg));
break;
}
case NODE_TEXCO_WINDOW: {
- if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
- data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P) + ccl_fetch(sd, ray_dP).dy);
+ if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
+ data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dy);
else
- data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy);
+ data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dy);
data.z = 0.0f;
break;
}
case NODE_TEXCO_REFLECTION: {
- if(ccl_fetch(sd, object) != OBJECT_NONE)
- data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I);
+ if(sd->object != OBJECT_NONE)
+ data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I;
else
- data = ccl_fetch(sd, I);
+ data = sd->I;
break;
}
case NODE_TEXCO_DUPLI_GENERATED: {
- data = object_dupli_generated(kg, ccl_fetch(sd, object));
+ data = object_dupli_generated(kg, sd->object);
break;
}
case NODE_TEXCO_DUPLI_UV: {
- data = object_dupli_uv(kg, ccl_fetch(sd, object));
+ data = object_dupli_uv(kg, sd->object);
break;
}
case NODE_TEXCO_VOLUME_GENERATED: {
- data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy;
+ data = sd->P + sd->dP.dy;
#ifdef __VOLUME__
- if(ccl_fetch(sd, object) != OBJECT_NONE)
+ if(sd->object != OBJECT_NONE)
data = volume_normalized_position(kg, sd, data);
#endif
break;
@@ -274,12 +274,12 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st
float3 color = stack_load_float3(stack, color_offset);
color = 2.0f*make_float3(color.x - 0.5f, color.y - 0.5f, color.z - 0.5f);
- bool is_backfacing = (ccl_fetch(sd, flag) & SD_BACKFACING) != 0;
+ bool is_backfacing = (sd->flag & SD_BACKFACING) != 0;
float3 N;
if(space == NODE_NORMAL_MAP_TANGENT) {
/* tangent space */
- if(ccl_fetch(sd, object) == OBJECT_NONE) {
+ if(sd->object == OBJECT_NONE) {
stack_store_float3(stack, normal_offset, make_float3(0.0f, 0.0f, 0.0f));
return;
}
@@ -299,11 +299,11 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st
float sign = primitive_attribute_float(kg, sd, attr_sign, NULL, NULL);
float3 normal;
- if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) {
+ if(sd->shader & SHADER_SMOOTH_NORMAL) {
normal = primitive_attribute_float3(kg, sd, attr_normal, NULL, NULL);
}
else {
- normal = ccl_fetch(sd, Ng);
+ normal = sd->Ng;
/* the normal is already inverted, which is too soon for the math here */
if(is_backfacing) {
@@ -345,11 +345,11 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st
if(strength != 1.0f) {
strength = max(strength, 0.0f);
- N = safe_normalize(ccl_fetch(sd, N) + (N - ccl_fetch(sd, N))*strength);
+ N = safe_normalize(sd->N + (N - sd->N)*strength);
}
if(is_zero(N)) {
- N = ccl_fetch(sd, N);
+ N = sd->N;
}
stack_store_float3(stack, normal_offset, N);
@@ -377,7 +377,7 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack
float3 generated;
if(desc.offset == ATTR_STD_NOT_FOUND)
- generated = ccl_fetch(sd, P);
+ generated = sd->P;
else
generated = primitive_attribute_float3(kg, sd, desc, NULL, NULL);
@@ -390,7 +390,7 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack
}
object_normal_transform(kg, sd, &tangent);
- tangent = cross(ccl_fetch(sd, N), normalize(cross(tangent, ccl_fetch(sd, N))));
+ tangent = cross(sd->N, normalize(cross(tangent, sd->N)));
stack_store_float3(stack, tangent_offset, tangent);
}
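
The `NODE_TEXCO_REFLECTION` branches above compute `2.0f*dot(N, I)*N - I`, the reflection of the (unit) incoming direction about the shading normal; the `bump_dx`/`bump_dy` variants repeat the same cases with `P` offset by `dP.dx` and `dP.dy`. A tiny numeric check of the reflection formula, with hypothetical vectors:

#include <cstdio>

struct float3 { float x, y, z; };
static float dot3(const float3 &a, const float3 &b) { return a.x*b.x + a.y*b.y + a.z*b.z; }

/* Reflect I about N; both are assumed normalized. */
static float3 reflect_about(const float3 &N, const float3 &I)
{
	float d = 2.0f * dot3(N, I);
	return {d*N.x - I.x, d*N.y - I.y, d*N.z - I.z};
}

int main()
{
	float3 N = {0.0f, 0.0f, 1.0f};
	float3 I = {0.0f, 0.70710678f, 0.70710678f};
	float3 R = reflect_about(N, I);
	std::printf("(%g, %g, %g)\n", R.x, R.y, R.z); /* (0, -0.707107, 0.707107) */
	return 0;
}
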
diff --git a/intern/cycles/kernel/svm/svm_vector_transform.h b/intern/cycles/kernel/svm/svm_vector_transform.h
index 4c32130d06d..4e92f27acdb 100644
--- a/intern/cycles/kernel/svm/svm_vector_transform.h
+++ b/intern/cycles/kernel/svm/svm_vector_transform.h
@@ -33,7 +33,7 @@ ccl_device void svm_node_vector_transform(KernelGlobals *kg, ShaderData *sd, flo
NodeVectorTransformConvertSpace to = (NodeVectorTransformConvertSpace)ito;
Transform tfm;
- bool is_object = (ccl_fetch(sd, object) != OBJECT_NONE);
+ bool is_object = (sd->object != OBJECT_NONE);
bool is_direction = (type == NODE_VECTOR_TRANSFORM_TYPE_VECTOR || type == NODE_VECTOR_TRANSFORM_TYPE_NORMAL);
/* From world */
diff --git a/intern/cycles/kernel/svm/svm_wireframe.h b/intern/cycles/kernel/svm/svm_wireframe.h
index 87e40791333..3c6353c8001 100644
--- a/intern/cycles/kernel/svm/svm_wireframe.h
+++ b/intern/cycles/kernel/svm/svm_wireframe.h
@@ -41,9 +41,9 @@ ccl_device_inline float wireframe(KernelGlobals *kg,
float3 *P)
{
#ifdef __HAIR__
- if(ccl_fetch(sd, prim) != PRIM_NONE && ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)
+ if(sd->prim != PRIM_NONE && sd->type & PRIMITIVE_ALL_TRIANGLE)
#else
- if(ccl_fetch(sd, prim) != PRIM_NONE)
+ if(sd->prim != PRIM_NONE)
#endif
{
float3 Co[3];
@@ -52,12 +52,12 @@ ccl_device_inline float wireframe(KernelGlobals *kg,
/* Triangles */
int np = 3;
- if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE)
- triangle_vertices(kg, ccl_fetch(sd, prim), Co);
+ if(sd->type & PRIMITIVE_TRIANGLE)
+ triangle_vertices(kg, sd->prim, Co);
else
- motion_triangle_vertices(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), Co);
+ motion_triangle_vertices(kg, sd->object, sd->prim, sd->time, Co);
- if(!(ccl_fetch(sd, object_flag) & SD_OBJECT_TRANSFORM_APPLIED)) {
+ if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
object_position_transform(kg, sd, &Co[0]);
object_position_transform(kg, sd, &Co[1]);
object_position_transform(kg, sd, &Co[2]);
@@ -66,8 +66,8 @@ ccl_device_inline float wireframe(KernelGlobals *kg,
if(pixel_size) {
// Project the derivatives of P to the viewing plane defined
// by I so we have a measure of how big a pixel is at this point
- float pixelwidth_x = len(ccl_fetch(sd, dP).dx - dot(ccl_fetch(sd, dP).dx, ccl_fetch(sd, I)) * ccl_fetch(sd, I));
- float pixelwidth_y = len(ccl_fetch(sd, dP).dy - dot(ccl_fetch(sd, dP).dy, ccl_fetch(sd, I)) * ccl_fetch(sd, I));
+ float pixelwidth_x = len(sd->dP.dx - dot(sd->dP.dx, sd->I) * sd->I);
+ float pixelwidth_y = len(sd->dP.dy - dot(sd->dP.dy, sd->I) * sd->I);
// Take the average of both axes' lengths
pixelwidth = (pixelwidth_x + pixelwidth_y) * 0.5f;
}
@@ -113,20 +113,20 @@ ccl_device void svm_node_wireframe(KernelGlobals *kg,
* With OpenCL 2.0 it's possible to avoid this change, but until
* then we'll be living with such an exception.
*/
- float3 P = ccl_fetch(sd, P);
+ float3 P = sd->P;
float f = wireframe(kg, sd, size, pixel_size, &P);
#else
- float f = wireframe(kg, sd, size, pixel_size, &ccl_fetch(sd, P));
+ float f = wireframe(kg, sd, size, pixel_size, &sd->P);
#endif
/* TODO(sergey): Think of faster way to calculate derivatives. */
if(bump_offset == NODE_BUMP_OFFSET_DX) {
- float3 Px = ccl_fetch(sd, P) - ccl_fetch(sd, dP).dx;
- f += (f - wireframe(kg, sd, size, pixel_size, &Px)) / len(ccl_fetch(sd, dP).dx);
+ float3 Px = sd->P - sd->dP.dx;
+ f += (f - wireframe(kg, sd, size, pixel_size, &Px)) / len(sd->dP.dx);
}
else if(bump_offset == NODE_BUMP_OFFSET_DY) {
- float3 Py = ccl_fetch(sd, P) - ccl_fetch(sd, dP).dy;
- f += (f - wireframe(kg, sd, size, pixel_size, &Py)) / len(ccl_fetch(sd, dP).dy);
+ float3 Py = sd->P - sd->dP.dy;
+ f += (f - wireframe(kg, sd, size, pixel_size, &Py)) / len(sd->dP.dy);
}
if(stack_valid(out_fac))
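
The pixel-width estimate above is a vector rejection: subtracting `dot(dP.dx, I) * I` from `dP.dx` removes the component along the view direction, leaving the derivative's extent in the viewing plane. A sketch of that step in isolation (I assumed unit length, values hypothetical):

#include <cmath>
#include <cstdio>

struct float3 { float x, y, z; };
static float dot(const float3 &a, const float3 &b) { return a.x*b.x + a.y*b.y + a.z*b.z; }
static float3 operator-(const float3 &a, const float3 &b) { return {a.x-b.x, a.y-b.y, a.z-b.z}; }
static float3 operator*(float s, const float3 &a) { return {s*a.x, s*a.y, s*a.z}; }
static float len(const float3 &a) { return std::sqrt(dot(a, a)); }

int main()
{
	float3 I = {0.0f, 0.0f, 1.0f};    /* view direction */
	float3 dPdx = {0.3f, 0.0f, 0.4f}; /* surface derivative of P */
	float w = len(dPdx - dot(dPdx, I) * I);
	std::printf("%f\n", w);           /* 0.3: the component along I is discarded */
	return 0;
}
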
diff --git a/intern/cycles/render/background.h b/intern/cycles/render/background.h
index 8029c6a9e80..deb22c9c2f2 100644
--- a/intern/cycles/render/background.h
+++ b/intern/cycles/render/background.h
@@ -30,7 +30,7 @@ class Shader;
class Background : public Node {
public:
- NODE_DECLARE;
+ NODE_DECLARE
float ao_factor;
float ao_distance;
diff --git a/intern/cycles/render/bake.cpp b/intern/cycles/render/bake.cpp
index d9a297002c6..c2f6293a50b 100644
--- a/intern/cycles/render/bake.cpp
+++ b/intern/cycles/render/bake.cpp
@@ -171,9 +171,9 @@ bool BakeManager::bake(Device *device, DeviceScene *dscene, Scene *scene, Progre
/* needs to be up to date for attribute access */
device->const_copy_to("__data", &dscene->data, sizeof(dscene->data));
- device->mem_alloc(d_input, MEM_READ_ONLY);
+ device->mem_alloc("bake_input", d_input, MEM_READ_ONLY);
device->mem_copy_to(d_input);
- device->mem_alloc(d_output, MEM_READ_WRITE);
+ device->mem_alloc("bake_output", d_output, MEM_READ_WRITE);
DeviceTask task(DeviceTask::SHADER);
task.shader_input = d_input.device_pointer;
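
`Device::mem_alloc` now takes a leading name string at every call site ("bake_input", "render_buffer", and so on), presumably so device memory logging can attribute each allocation. A sketch of the pattern, not the actual Device API:

#include <cstdio>
#include <cstdlib>

/* Thread a human-readable label through allocations so logs can say
 * which buffer the bytes belong to. */
static void *mem_alloc(const char *name, size_t size)
{
	std::printf("alloc %-16s %zu bytes\n", name, size);
	return std::malloc(size);
}

int main()
{
	void *input = mem_alloc("bake_input", 4096);
	void *output = mem_alloc("bake_output", 4096);
	std::free(input);
	std::free(output);
	return 0;
}
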
diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp
index f1692712d61..e3ef4bf13fb 100644
--- a/intern/cycles/render/buffers.cpp
+++ b/intern/cycles/render/buffers.cpp
@@ -129,13 +129,13 @@ void RenderBuffers::reset(Device *device, BufferParams& params_)
/* allocate buffer */
buffer.resize(params.width*params.height*params.get_passes_size());
- device->mem_alloc(buffer, MEM_READ_WRITE);
+ device->mem_alloc("render_buffer", buffer, MEM_READ_WRITE);
device->mem_zero(buffer);
/* allocate rng state */
rng_state.resize(params.width, params.height);
- device->mem_alloc(rng_state, MEM_READ_WRITE);
+ device->mem_alloc("rng_state", rng_state, MEM_READ_WRITE);
}
bool RenderBuffers::copy_from_device()
diff --git a/intern/cycles/render/camera.h b/intern/cycles/render/camera.h
index 141ef9cccef..655d74e42d8 100644
--- a/intern/cycles/render/camera.h
+++ b/intern/cycles/render/camera.h
@@ -39,7 +39,7 @@ class Scene;
class Camera : public Node {
public:
- NODE_DECLARE;
+ NODE_DECLARE
/* Specifies an offset for the shutter's time interval. */
enum MotionPosition {
diff --git a/intern/cycles/render/film.h b/intern/cycles/render/film.h
index 9fa51c51f52..d917057ed91 100644
--- a/intern/cycles/render/film.h
+++ b/intern/cycles/render/film.h
@@ -53,7 +53,7 @@ public:
class Film : public Node {
public:
- NODE_DECLARE;
+ NODE_DECLARE
float exposure;
array<Pass> passes;
diff --git a/intern/cycles/render/graph.h b/intern/cycles/render/graph.h
index 780fdf49ca4..06524d3fa13 100644
--- a/intern/cycles/render/graph.h
+++ b/intern/cycles/render/graph.h
@@ -201,14 +201,14 @@ public:
/* Node definition utility macros */
#define SHADER_NODE_CLASS(type) \
- NODE_DECLARE; \
+ NODE_DECLARE \
type(); \
virtual ShaderNode *clone() const { return new type(*this); } \
virtual void compile(SVMCompiler& compiler); \
virtual void compile(OSLCompiler& compiler); \
#define SHADER_NODE_NO_CLONE_CLASS(type) \
- NODE_DECLARE; \
+ NODE_DECLARE \
type(); \
virtual void compile(SVMCompiler& compiler); \
virtual void compile(OSLCompiler& compiler); \
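
The semicolon after `NODE_DECLARE` disappears here and at every class that uses it (Background, Camera, Film, and the rest below). A sketch of why, with a hypothetical macro body: when the macro already expands to complete declarations, writing `NODE_DECLARE;` leaves an extra empty declaration at class scope, which pedantic compilers warn about.

#define NODE_DECLARE \
	static const char *node_type_name();

class Background {
public:
	NODE_DECLARE  /* no semicolon at the use site any more */
	float ao_factor;
};

const char *Background::node_type_name() { return "background"; }

int main() { return Background::node_type_name()[0] == 'b' ? 0 : 1; }
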
diff --git a/intern/cycles/render/integrator.h b/intern/cycles/render/integrator.h
index 27fff4831e5..3ce41d5a185 100644
--- a/intern/cycles/render/integrator.h
+++ b/intern/cycles/render/integrator.h
@@ -29,7 +29,7 @@ class Scene;
class Integrator : public Node {
public:
- NODE_DECLARE;
+ NODE_DECLARE
int min_bounce;
int max_bounce;
diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp
index 6a4557506c3..fc6790dc022 100644
--- a/intern/cycles/render/light.cpp
+++ b/intern/cycles/render/light.cpp
@@ -57,9 +57,9 @@ static void shade_background_pixels(Device *device, DeviceScene *dscene, int res
device->const_copy_to("__data", &dscene->data, sizeof(dscene->data));
- device->mem_alloc(d_input, MEM_READ_ONLY);
+ device->mem_alloc("shade_background_pixels_input", d_input, MEM_READ_ONLY);
device->mem_copy_to(d_input);
- device->mem_alloc(d_output, MEM_WRITE_ONLY);
+ device->mem_alloc("shade_background_pixels_output", d_output, MEM_WRITE_ONLY);
DeviceTask main_task(DeviceTask::SHADER);
main_task.shader_input = d_input.device_pointer;
diff --git a/intern/cycles/render/mesh.h b/intern/cycles/render/mesh.h
index 5f33e30eac2..1f8b880c161 100644
--- a/intern/cycles/render/mesh.h
+++ b/intern/cycles/render/mesh.h
@@ -48,7 +48,7 @@ struct PackedPatchTable;
class Mesh : public Node {
public:
- NODE_DECLARE;
+ NODE_DECLARE
/* Mesh Triangle */
struct Triangle {
diff --git a/intern/cycles/render/mesh_displace.cpp b/intern/cycles/render/mesh_displace.cpp
index adc5b820298..4acb7911560 100644
--- a/intern/cycles/render/mesh_displace.cpp
+++ b/intern/cycles/render/mesh_displace.cpp
@@ -121,9 +121,9 @@ bool MeshManager::displace(Device *device, DeviceScene *dscene, Scene *scene, Me
/* needs to be up to date for attribute access */
device->const_copy_to("__data", &dscene->data, sizeof(dscene->data));
- device->mem_alloc(d_input, MEM_READ_ONLY);
+ device->mem_alloc("displace_input", d_input, MEM_READ_ONLY);
device->mem_copy_to(d_input);
- device->mem_alloc(d_output, MEM_WRITE_ONLY);
+ device->mem_alloc("displace_output", d_output, MEM_WRITE_ONLY);
DeviceTask task(DeviceTask::SHADER);
task.shader_input = d_input.device_pointer;
diff --git a/intern/cycles/render/object.h b/intern/cycles/render/object.h
index 7e306fab2a8..3495849d149 100644
--- a/intern/cycles/render/object.h
+++ b/intern/cycles/render/object.h
@@ -40,7 +40,7 @@ struct Transform;
class Object : public Node {
public:
- NODE_DECLARE;
+ NODE_DECLARE
Mesh *mesh;
Transform tfm;
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index 420866c9436..0c7bd271371 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -654,6 +654,8 @@ void Session::load_kernels()
if(!kernels_loaded) {
progress.set_status("Loading render kernels (may take a few minutes the first time)");
+ scoped_timer timer;
+
DeviceRequestedFeatures requested_features = get_requested_device_features();
VLOG(2) << "Requested features:\n" << requested_features;
if(!device->load_kernels(requested_features)) {
@@ -667,6 +669,9 @@ void Session::load_kernels()
return;
}
+ progress.add_skip_time(timer, false);
+ VLOG(1) << "Total time spent loading kernels: " << time_dt() - timer.get_start();
+
kernels_loaded = true;
}
}
@@ -887,6 +892,7 @@ void Session::path_trace()
task.need_finish_queue = params.progressive_refine;
task.integrator_branched = scene->integrator->method == Integrator::BRANCHED_PATH;
task.requested_tile_size = params.tile_size;
+ task.passes_size = tile_manager.params.get_passes_size();
device->task_add(task);
}
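
The new `scoped_timer` starts counting when kernel loading begins, and `progress.add_skip_time(timer, false)` subtracts that span from the render statistics so compile time isn't billed as render time. A stand-alone sketch of the scoped-timer pattern (the real class lives in the Cycles util headers; `elapsed()` here is a stand-in for its interface):

#include <chrono>
#include <iostream>

struct scoped_timer {
	std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
	double elapsed() const {
		return std::chrono::duration<double>(
		        std::chrono::steady_clock::now() - start).count();
	}
};

int main()
{
	scoped_timer timer;
	volatile long sink = 0;
	for(long i = 0; i < 1000000; i++) sink += i; /* stand-in for load_kernels() */
	std::cout << "Total time spent loading kernels: " << timer.elapsed() << "s\n";
	return 0;
}
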
diff --git a/intern/cycles/render/shader.h b/intern/cycles/render/shader.h
index 7d896652196..490c3f1c95d 100644
--- a/intern/cycles/render/shader.h
+++ b/intern/cycles/render/shader.h
@@ -82,7 +82,7 @@ enum DisplacementMethod {
class Shader : public Node {
public:
- NODE_DECLARE;
+ NODE_DECLARE
int pass_id;
diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h
index 433e41fbbb6..6c52117ef9a 100644
--- a/intern/cycles/util/util_atomic.h
+++ b/intern/cycles/util/util_atomic.h
@@ -32,6 +32,13 @@ ATOMIC_INLINE void atomic_update_max_z(size_t *maximum_value, size_t value)
}
}
+#define atomic_add_and_fetch_float(p, x) atomic_add_and_fetch_fl((p), (x))
+
+#define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1)
+
+#define CCL_LOCAL_MEM_FENCE 0
+#define ccl_barrier(flags) (void)0
+
#else /* __KERNEL_GPU__ */
#ifdef __KERNEL_OPENCL__
@@ -39,7 +46,7 @@ ATOMIC_INLINE void atomic_update_max_z(size_t *maximum_value, size_t value)
/* Float atomics implementation credits:
* http://suhorukov.blogspot.in/2011/12/opencl-11-atomic-operations-on-floating.html
*/
-ccl_device_inline void atomic_add_and_fetch_float(volatile ccl_global float *source,
+ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *source,
const float operand)
{
union {
@@ -56,10 +63,29 @@ ccl_device_inline void atomic_add_and_fetch_float(volatile ccl_global float *sou
} while(atomic_cmpxchg((volatile ccl_global unsigned int *)source,
prev_value.int_value,
new_value.int_value) != prev_value.int_value);
+ return new_value.float_value;
}
+#define atomic_fetch_and_add_uint32(p, x) atomic_add((p), (x))
+#define atomic_fetch_and_inc_uint32(p) atomic_inc((p))
+
+#define CCL_LOCAL_MEM_FENCE CLK_LOCAL_MEM_FENCE
+#define ccl_barrier(flags) barrier(flags)
+
#endif /* __KERNEL_OPENCL__ */
+#ifdef __KERNEL_CUDA__
+
+#define atomic_add_and_fetch_float(p, x) (atomicAdd((float*)(p), (float)(x)) + (float)(x))
+
+#define atomic_fetch_and_add_uint32(p, x) atomicAdd((unsigned int*)(p), (unsigned int)(x))
+#define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1)
+
+#define CCL_LOCAL_MEM_FENCE
+#define ccl_barrier(flags) __syncthreads()
+
+#endif /* __KERNEL_CUDA__ */
+
#endif /* __KERNEL_GPU__ */
#endif /* __UTIL_ATOMIC_H__ */
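
The OpenCL fix changes `atomic_add_and_fetch_float` to return `float` instead of `void`, so callers get the post-add value, consistent with the new CPU and CUDA macros. The loop is a classic compare-and-swap on the float's bit pattern; a host-side sketch of the same technique using C++ atomics (not the kernel code):

#include <atomic>
#include <cstdio>
#include <cstring>

static float atomic_add_and_fetch_float(std::atomic<unsigned int> *source, float operand)
{
	unsigned int prev = source->load(), next;
	float prev_f, next_f;
	do {
		std::memcpy(&prev_f, &prev, sizeof(float));
		next_f = prev_f + operand;
		std::memcpy(&next, &next_f, sizeof(float));
	} while(!source->compare_exchange_weak(prev, next)); /* prev reloaded on failure */
	return next_f;
}

int main()
{
	float zero = 0.0f;
	unsigned int bits;
	std::memcpy(&bits, &zero, sizeof(bits));
	std::atomic<unsigned int> v(bits);
	std::printf("%f\n", double(atomic_add_and_fetch_float(&v, 1.5f))); /* 1.500000 */
	return 0;
}
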
diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp
index 80d177d2cae..f12c5e28c80 100644
--- a/intern/cycles/util/util_debug.cpp
+++ b/intern/cycles/util/util_debug.cpp
@@ -29,7 +29,8 @@ DebugFlags::CPU::CPU()
sse41(true),
sse3(true),
sse2(true),
- qbvh(true)
+ qbvh(true),
+ split_kernel(false)
{
reset();
}
@@ -55,10 +56,12 @@ void DebugFlags::CPU::reset()
#undef CHECK_CPU_FLAGS
qbvh = true;
+ split_kernel = false;
}
DebugFlags::CUDA::CUDA()
- : adaptive_compile(false)
+ : adaptive_compile(false),
+ split_kernel(false)
{
reset();
}
@@ -67,6 +70,8 @@ void DebugFlags::CUDA::reset()
{
if(getenv("CYCLES_CUDA_ADAPTIVE_COMPILE") != NULL)
adaptive_compile = true;
+
+ split_kernel = false;
}
DebugFlags::OpenCL::OpenCL()
@@ -133,7 +138,9 @@ std::ostream& operator <<(std::ostream &os,
<< " AVX : " << string_from_bool(debug_flags.cpu.avx) << "\n"
<< " SSE4.1 : " << string_from_bool(debug_flags.cpu.sse41) << "\n"
<< " SSE3 : " << string_from_bool(debug_flags.cpu.sse3) << "\n"
- << " SSE2 : " << string_from_bool(debug_flags.cpu.sse2) << "\n";
+ << " SSE2 : " << string_from_bool(debug_flags.cpu.sse2) << "\n"
+ << " QBVH : " << string_from_bool(debug_flags.cpu.qbvh) << "\n"
+ << " Split : " << string_from_bool(debug_flags.cpu.split_kernel) << "\n";
os << "CUDA flags:\n"
<< " Adaptive Compile: " << string_from_bool(debug_flags.cuda.adaptive_compile) << "\n";
diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h
index 73fd228b5d9..911c95de4ab 100644
--- a/intern/cycles/util/util_debug.h
+++ b/intern/cycles/util/util_debug.h
@@ -46,6 +46,9 @@ public:
/* Whether QBVH usage is allowed or not. */
bool qbvh;
+
+ /* Whether split kernel is used */
+ bool split_kernel;
};
/* Descriptor of CUDA feature-set to be used. */
@@ -58,6 +61,9 @@ public:
/* Whether adaptive feature based runtime compile is enabled or not.
* Requires the CUDA Toolkit and only works on Linux atm. */
bool adaptive_compile;
+
+ /* Whether split kernel is used */
+ bool split_kernel;
};
/* Descriptor of OpenCL feature-set to be used. */
diff --git a/intern/cycles/util/util_logging.cpp b/intern/cycles/util/util_logging.cpp
index 03041723e15..6824f1ff83c 100644
--- a/intern/cycles/util/util_logging.cpp
+++ b/intern/cycles/util/util_logging.cpp
@@ -69,6 +69,15 @@ void util_logging_verbosity_set(int verbosity)
}
std::ostream& operator <<(std::ostream &os,
+ const int2 &value)
+{
+ os << "(" << value.x
+ << ", " << value.y
+ << ")";
+ return os;
+}
+
+std::ostream& operator <<(std::ostream &os,
const float3 &value)
{
os << "(" << value.x
diff --git a/intern/cycles/util/util_logging.h b/intern/cycles/util/util_logging.h
index 2aa9c25b1a0..ecf9c9cfee0 100644
--- a/intern/cycles/util/util_logging.h
+++ b/intern/cycles/util/util_logging.h
@@ -45,6 +45,7 @@ public:
#define VLOG_ONCE(level, flag) if(!flag) flag = true, VLOG(level)
+struct int2;
struct float3;
void util_logging_init(const char *argv0);
@@ -52,6 +53,8 @@ void util_logging_start(void);
void util_logging_verbosity_set(int verbosity);
std::ostream& operator <<(std::ostream &os,
+ const int2 &value);
+std::ostream& operator <<(std::ostream &os,
const float3 &value);
CCL_NAMESPACE_END
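
The new `operator<<` for `int2` mirrors the existing `float3` printer, presumably for logging tile and buffer dimensions from the split-kernel code. The whole pattern in miniature (a local `int2` stand-in for the util_types.h one):

#include <iostream>

struct int2 { int x, y; };

std::ostream& operator<<(std::ostream &os, const int2 &value)
{
	return os << "(" << value.x << ", " << value.y << ")";
}

int main()
{
	std::cout << int2{16, 16} << "\n"; /* prints (16, 16) */
	return 0;
}
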
diff --git a/intern/cycles/util/util_path.cpp b/intern/cycles/util/util_path.cpp
index 5df262fcbbb..1b2e8aace5b 100644
--- a/intern/cycles/util/util_path.cpp
+++ b/intern/cycles/util/util_path.cpp
@@ -814,7 +814,7 @@ string path_source_replace_includes(const string& source,
/* Use line directives for better error messages. */
line = line_directive(filepath, 1)
+ token.replace(0, n_end + 1, "\n" + text + "\n")
- + line_directive(path_join(path, source_filename), i);
+ + line_directive(path_join(path, source_filename), i + 1);
}
}
}
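
The `i + 1` fix corrects an off-by-one in the emitted `#line` markers: a `#line N` directive names the line that follows it, so after splicing in an include found on line `i`, the resumed source must be tagged `i + 1` for compiler errors to point at the right place. A minimal demonstration of that semantics:

#include <cstdio>

int main()
{
#line 100 "generated.cl"
	std::printf("%d\n", __LINE__); /* prints 100: the line after the directive */
	return 0;
}
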
diff --git a/intern/cycles/util/util_ssef.h b/intern/cycles/util/util_ssef.h
index 2f5295b5463..cf99a08efae 100644
--- a/intern/cycles/util/util_ssef.h
+++ b/intern/cycles/util/util_ssef.h
@@ -514,12 +514,12 @@ ccl_device_inline float len3(const ssef& a)
/* faster version for SSSE3 */
typedef ssei shuffle_swap_t;
-ccl_device_inline const shuffle_swap_t shuffle_swap_identity(void)
+ccl_device_inline shuffle_swap_t shuffle_swap_identity(void)
{
return _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
}
-ccl_device_inline const shuffle_swap_t shuffle_swap_swap(void)
+ccl_device_inline shuffle_swap_t shuffle_swap_swap(void)
{
return _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
}
@@ -534,12 +534,12 @@ ccl_device_inline const ssef shuffle_swap(const ssef& a, const shuffle_swap_t& s
/* somewhat slower version for SSE2 */
typedef int shuffle_swap_t;
-ccl_device_inline const shuffle_swap_t shuffle_swap_identity(void)
+ccl_device_inline shuffle_swap_t shuffle_swap_identity(void)
{
return 0;
}
-ccl_device_inline const shuffle_swap_t shuffle_swap_swap(void)
+ccl_device_inline shuffle_swap_t shuffle_swap_swap(void)
{
return 1;
}
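
Dropping `const` from these by-value return types is the idiomatic fix: for scalar and SSE-register returns the qualifier is simply ignored (compilers flag it with warnings along the lines of -Wignored-qualifiers), and for class types it would block move construction. A small illustration of the class-type case, with hypothetical functions:

#include <cstdio>
#include <vector>

static const std::vector<int> make_const() { return std::vector<int>(1000, 1); }
static std::vector<int> make_plain()       { return std::vector<int>(1000, 1); }

int main()
{
	std::vector<int> a = make_const(); /* const rvalue: must be copied */
	std::vector<int> b = make_plain(); /* non-const rvalue: can be moved */
	std::printf("%zu %zu\n", a.size(), b.size());
	return 0;
}
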
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index a000fae4bd6..36d2f1053c7 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -37,6 +37,9 @@
#define ccl_device_noinline static
#define ccl_global
#define ccl_constant
+#define ccl_local
+#define ccl_local_param
+#define ccl_private
#define ccl_restrict __restrict
#define __KERNEL_WITH_SSE_ALIGN__
@@ -397,11 +400,6 @@ ccl_device_inline float4 make_float4(float x, float y, float z, float w)
return a;
}
-ccl_device_inline int align_up(int offset, int alignment)
-{
- return (offset + alignment - 1) & ~(alignment - 1);
-}
-
ccl_device_inline int3 make_int3(int i)
{
#ifdef __KERNEL_SSE__
@@ -476,6 +474,21 @@ ccl_device_inline int4 make_int4(const float3& f)
#endif
+ccl_device_inline int align_up(int offset, int alignment)
+{
+ return (offset + alignment - 1) & ~(alignment - 1);
+}
+
+ccl_device_inline int round_up(int x, int multiple)
+{
+ return ((x + multiple - 1) / multiple) * multiple;
+}
+
+ccl_device_inline int round_down(int x, int multiple)
+{
+ return (x / multiple) * multiple;
+}
+
/* Interpolation types for textures
* cuda also use texture space to store other objects */
enum InterpolationType {
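
`align_up` moves below the `make_*` helpers and gains `round_up`/`round_down` siblings. The bitmask form assumes a power-of-two alignment, while the divide-and-multiply form works for any positive multiple; for example, `align_up(13, 8)` is `(13 + 7) & ~7 = 16`. A few checks:

#include <cassert>

static int align_up(int offset, int alignment)
{
	return (offset + alignment - 1) & ~(alignment - 1);
}

static int round_up(int x, int multiple)
{
	return ((x + multiple - 1) / multiple) * multiple;
}

static int round_down(int x, int multiple)
{
	return (x / multiple) * multiple;
}

int main()
{
	assert(align_up(13, 8) == 16);
	assert(round_up(13, 8) == 16);
	assert(round_down(13, 8) == 8);
	assert(round_up(13, 12) == 24); /* non-power-of-two multiple */
	return 0;
}
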
diff --git a/release/scripts/presets/interface_theme/back_to_black.xml b/release/scripts/presets/interface_theme/back_to_black.xml
index 915e9cb64f1..1636f5b5cf6 100644
--- a/release/scripts/presets/interface_theme/back_to_black.xml
+++ b/release/scripts/presets/interface_theme/back_to_black.xml
@@ -18,7 +18,7 @@
text_sel="#ffffff"
show_shaded="TRUE"
shadetop="-100"
- shadedown="0">
+ shadedown="5">
</ThemeWidgetColors>
</wcol_regular>
<wcol_tool>
@@ -30,19 +30,19 @@
text_sel="#ffffff"
show_shaded="TRUE"
shadetop="-100"
- shadedown="0">
+ shadedown="5">
</ThemeWidgetColors>
</wcol_tool>
<wcol_radio>
<ThemeWidgetColors outline="#2a2a2a"
inner="#111111ff"
inner_sel="#33406bff"
- item="#191919ff"
+ item="#444444ff"
text="#929292"
text_sel="#ffffff"
show_shaded="TRUE"
shadetop="-100"
- shadedown="0">
+ shadedown="5">
</ThemeWidgetColors>
</wcol_radio>
<wcol_text>
@@ -50,23 +50,23 @@
inner="#111111ff"
inner_sel="#33406bff"
item="#191919ff"
- text="#e4e4e4"
+ text="#929292"
text_sel="#ffffff"
show_shaded="TRUE"
shadetop="-100"
- shadedown="0">
+ shadedown="5">
</ThemeWidgetColors>
</wcol_text>
<wcol_option>
- <ThemeWidgetColors outline="#2a2a2a"
+ <ThemeWidgetColors outline="#535353"
inner="#111111ff"
inner_sel="#33406bff"
- item="#000000ff"
- text="#c7c7c7"
+ item="#a3a3a3ff"
+ text="#929292"
text_sel="#ffffff"
show_shaded="TRUE"
shadetop="-100"
- shadedown="0">
+ shadedown="5">
</ThemeWidgetColors>
</wcol_option>
<wcol_toggle>
@@ -78,7 +78,7 @@
text_sel="#ffffff"
show_shaded="TRUE"
shadetop="-100"
- shadedown="0">
+ shadedown="5">
</ThemeWidgetColors>
</wcol_toggle>
<wcol_num>
@@ -90,7 +90,7 @@
text_sel="#ffffff"
show_shaded="TRUE"
shadetop="-100"
- shadedown="0">
+ shadedown="5">
</ThemeWidgetColors>
</wcol_num>
<wcol_numslider>
@@ -102,7 +102,7 @@
text_sel="#ffffff"
show_shaded="TRUE"
shadetop="-100"
- shadedown="0">
+ shadedown="5">
</ThemeWidgetColors>
</wcol_numslider>
<wcol_box>
@@ -114,7 +114,7 @@
text_sel="#ffffff"
show_shaded="TRUE"
shadetop="-100"
- shadedown="0">
+ shadedown="5">
</ThemeWidgetColors>
</wcol_box>
<wcol_menu>
@@ -126,7 +126,7 @@
text_sel="#ffffff"
show_shaded="TRUE"
shadetop="-100"
- shadedown="0">
+ shadedown="5">
</ThemeWidgetColors>
</wcol_menu>
<wcol_pulldown>
@@ -138,7 +138,7 @@
text_sel="#ffffff"
show_shaded="TRUE"
shadetop="-100"
- shadedown="0">
+ shadedown="5">
</ThemeWidgetColors>
</wcol_pulldown>
<wcol_menu_back>
@@ -150,7 +150,7 @@
text_sel="#ffffff"
show_shaded="TRUE"
shadetop="-100"
- shadedown="0">
+ shadedown="5">
</ThemeWidgetColors>
</wcol_menu_back>
<wcol_pie_menu>
@@ -170,7 +170,7 @@
inner="#191919e6"
inner_sel="#2d2d2de6"
item="#646464ff"
- text="#ffffff"
+ text="#929292"
text_sel="#ffffff"
show_shaded="FALSE"
shadetop="25"
@@ -186,7 +186,7 @@
text_sel="#ffffff"
show_shaded="TRUE"
shadetop="-100"
- shadedown="0">
+ shadedown="5">
</ThemeWidgetColors>
</wcol_menu_item>
<wcol_scroll>
@@ -198,7 +198,7 @@
text_sel="#ffffff"
show_shaded="TRUE"
shadetop="-100"
- shadedown="0">
+ shadedown="5">
</ThemeWidgetColors>
</wcol_scroll>
<wcol_progress>
@@ -210,7 +210,7 @@
text_sel="#ffffff"
show_shaded="TRUE"
shadetop="-100"
- shadedown="0">
+ shadedown="5">
</ThemeWidgetColors>
</wcol_progress>
<wcol_list_item>
@@ -222,7 +222,7 @@
text_sel="#ffffff"
show_shaded="TRUE"
shadetop="-100"
- shadedown="0">
+ shadedown="5">
</ThemeWidgetColors>
</wcol_list_item>
<wcol_state>
@@ -239,32 +239,35 @@
</user_interface>
<view_3d>
<ThemeView3D grid="#222222"
+ clipping_border_3d="#313131ff"
wire="#888888"
- wire_edit="#000000"
+ wire_edit="#6c75ff"
gp_vertex="#000000"
gp_vertex_select="#ff8500"
gp_vertex_size="3"
- lamp="#c1d40028"
- speaker="#535353"
- camera="#000000"
- view_overlay="#000000"
- empty="#000000"
+ text_grease_pencil="#b5e61d"
object_selected="#f15800"
object_active="#ff8c19"
object_grouped="#083008"
object_grouped_active="#55bb55"
- transform="#ffffff"
+ text_keyframe="#ddd700"
+ camera="#535353"
+ empty="#535353"
+ lamp="#fff0d328"
+ speaker="#535353"
vertex="#72cfdd"
vertex_select="#ff8500"
vertex_size="3"
+ vertex_bevel="#00a5ff"
vertex_unreferenced="#000000"
edge_select="#ffa000"
edge_seam="#db2512"
edge_sharp="#ff2020"
edge_crease="#cc0099"
+ edge_bevel="#00a5ff"
edge_facesel="#6b6b6b"
freestyle_edge_mark="#7fff7f"
- face="#73828f12"
+ face="#73828f41"
face_select="#ffa4003c"
face_dot="#ffa900"
facedot_size="4"
@@ -291,19 +294,18 @@
normal="#22dddd"
vertex_normal="#2361dd"
split_normal="#dd23dd"
- bone_solid="#c8c8c8"
bone_pose="#50c8ff"
bone_pose_active="#8cffff"
- frame_current="#60c040"
- outline_width="1"
+ bone_solid="#c8c8c8"
bundle_solid="#c8c8c8"
camera_path="#5a5a5a"
skin_root="#000000"
- clipping_border_3d="#313131ff"
- text_keyframe="#ddd700"
- text_grease_pencil="#b5e61d"
+ view_overlay="#000000"
+ transform="#ffffff"
+ frame_current="#60c040"
paint_curve_handle="#7fff7f7f"
- paint_curve_pivot="#ff7f7f7f">
+ paint_curve_pivot="#ff7f7f7f"
+ outline_width="1">
<space>
<ThemeSpaceGradient title="#5d5d5d"
text="#7d7d7d"
@@ -312,23 +314,23 @@
header_text="#979797"
header_text_hi="#ffffff"
button="#00000057"
- button_title="#c5c5c5"
+ button_title="#929292"
button_text="#c3c3c3"
- button_text_hi="#ffffff"
+ button_text_hi="#e5e5e5"
tab_active="#212947"
tab_inactive="#000000"
tab_back="#060606ff"
tab_outline="#000000">
<gradients>
<ThemeGradientColors show_grad="TRUE"
- gradient="#0a0a0a"
+ gradient="#1d1d1d"
high_gradient="#000000">
</ThemeGradientColors>
</gradients>
<panelcolors>
<ThemePanelColors header="#00000019"
back="#72727280"
- show_header="FALSE"
+ show_header="TRUE"
show_back="FALSE">
</ThemePanelColors>
</panelcolors>
@@ -348,6 +350,7 @@
vertex="#ffffff"
vertex_select="#ff8500"
vertex_size="3"
+ vertex_bevel="#000000"
vertex_unreferenced="#000000"
handle_free="#808080"
handle_auto="#909000"
@@ -382,7 +385,7 @@
<panelcolors>
<ThemePanelColors header="#00000019"
back="#72727280"
- show_header="FALSE"
+ show_header="TRUE"
show_back="FALSE">
</ThemePanelColors>
</panelcolors>
@@ -418,7 +421,7 @@
<panelcolors>
<ThemePanelColors header="#00000019"
back="#72727280"
- show_header="FALSE"
+ show_header="TRUE"
show_back="FALSE">
</ThemePanelColors>
</panelcolors>
@@ -463,7 +466,7 @@
<panelcolors>
<ThemePanelColors header="#00000019"
back="#72727280"
- show_header="FALSE"
+ show_header="TRUE"
show_back="FALSE">
</ThemePanelColors>
</panelcolors>
@@ -501,6 +504,7 @@
keyframe_jitter_selected="#61c042"
keyframe_border="#000000ff"
keyframe_border_selected="#000000ff"
+ keyframe_scale_factor="1"
summary="#00000000">
<space>
<ThemeSpaceGeneric back="#080808"
@@ -521,7 +525,7 @@
<panelcolors>
<ThemePanelColors header="#00000019"
back="#72727280"
- show_header="FALSE"
+ show_header="TRUE"
show_back="FALSE">
</ThemePanelColors>
</panelcolors>
@@ -543,6 +547,7 @@
vertex="#0f13bb"
vertex_select="#ff8500"
vertex_size="3"
+ vertex_bevel="#000000"
vertex_unreferenced="#000000"
face="#ffffff0a"
face_select="#ff85003c"
@@ -596,7 +601,7 @@
<panelcolors>
<ThemePanelColors header="#00000019"
back="#72727280"
- show_header="FALSE"
+ show_header="TRUE"
show_back="FALSE">
</ThemePanelColors>
</panelcolors>
@@ -644,7 +649,7 @@
<panelcolors>
<ThemePanelColors header="#00000019"
back="#72727280"
- show_header="FALSE"
+ show_header="TRUE"
show_back="FALSE">
</ThemePanelColors>
</panelcolors>
@@ -673,7 +678,7 @@
<panelcolors>
<ThemePanelColors header="#00000019"
back="#72727280"
- show_header="FALSE"
+ show_header="TRUE"
show_back="FALSE">
</ThemePanelColors>
</panelcolors>
@@ -712,7 +717,7 @@
<panelcolors>
<ThemePanelColors header="#00000019"
back="#72727280"
- show_header="FALSE"
+ show_header="TRUE"
show_back="FALSE">
</ThemePanelColors>
</panelcolors>
@@ -744,7 +749,7 @@
<panelcolors>
<ThemePanelColors header="#00000019"
back="#72727280"
- show_header="FALSE"
+ show_header="TRUE"
show_back="FALSE">
</ThemePanelColors>
</panelcolors>
@@ -799,7 +804,7 @@
<panelcolors>
<ThemePanelColors header="#00000019"
back="#72727280"
- show_header="FALSE"
+ show_header="TRUE"
show_back="FALSE">
</ThemePanelColors>
</panelcolors>
@@ -835,7 +840,7 @@
<panelcolors>
<ThemePanelColors header="#00000019"
back="#72727280"
- show_header="FALSE"
+ show_header="TRUE"
show_back="FALSE">
</ThemePanelColors>
</panelcolors>
@@ -865,7 +870,7 @@
<panelcolors>
<ThemePanelColors header="#00000019"
back="#72727280"
- show_header="FALSE"
+ show_header="TRUE"
show_back="FALSE">
</ThemePanelColors>
</panelcolors>
@@ -903,7 +908,7 @@
<panelcolors>
<ThemePanelColors header="#00000019"
back="#72727280"
- show_header="FALSE"
+ show_header="TRUE"
show_back="FALSE">
</ThemePanelColors>
</panelcolors>
@@ -932,7 +937,7 @@
<panelcolors>
<ThemePanelColors header="#00000019"
back="#72727280"
- show_header="FALSE"
+ show_header="TRUE"
show_back="FALSE">
</ThemePanelColors>
</panelcolors>
@@ -966,7 +971,7 @@
<panelcolors>
<ThemePanelColors header="#00000019"
back="#72727280"
- show_header="FALSE"
+ show_header="TRUE"
show_back="FALSE">
</ThemePanelColors>
</panelcolors>
@@ -1019,7 +1024,7 @@
<panelcolors>
<ThemePanelColors header="#00000019"
back="#72727280"
- show_header="FALSE"
+ show_header="TRUE"
show_back="FALSE">
</ThemePanelColors>
</panelcolors>
diff --git a/release/scripts/presets/keyconfig/3dsmax.py b/release/scripts/presets/keyconfig/3dsmax.py
index b6b0a0c926f..a07a3a52caf 100644
--- a/release/scripts/presets/keyconfig/3dsmax.py
+++ b/release/scripts/presets/keyconfig/3dsmax.py
@@ -400,6 +400,12 @@ kmi = km.keymap_items.new('particle.hide', 'H', 'PRESS')
kmi.properties.unselected = False
kmi = km.keymap_items.new('particle.hide', 'H', 'PRESS', shift=True)
kmi.properties.unselected = True
+kmi = km.keymap_items.new('view3d.manipulator', 'LEFTMOUSE', 'PRESS', shift=True)
+kmi.properties.release_confirm = True
+kmi.properties.use_planar_constraint = True
+kmi = km.keymap_items.new('view3d.manipulator', 'LEFTMOUSE', 'PRESS', shift=True)
+kmi.properties.release_confirm = True
+kmi.properties.use_accurate = True
kmi = km.keymap_items.new('view3d.manipulator', 'LEFTMOUSE', 'PRESS', any=True)
kmi.properties.release_confirm = True
kmi = km.keymap_items.new('particle.brush_edit', 'LEFTMOUSE', 'PRESS')
@@ -421,6 +427,12 @@ kmi.properties.value_2 = 'ENABLED'
# Map 3D View
km = kc.keymaps.new('3D View', space_type='VIEW_3D', region_type='WINDOW', modal=False)
+kmi = km.keymap_items.new('view3d.manipulator', 'LEFTMOUSE', 'PRESS', shift=True)
+kmi.properties.release_confirm = True
+kmi.properties.use_planar_constraint = True
+kmi = km.keymap_items.new('view3d.manipulator', 'LEFTMOUSE', 'PRESS', shift=True)
+kmi.properties.release_confirm = True
+kmi.properties.use_accurate = True
kmi = km.keymap_items.new('view3d.manipulator', 'LEFTMOUSE', 'PRESS', any=True)
kmi.properties.release_confirm = True
kmi = km.keymap_items.new('view3d.cursor3d', 'ACTIONMOUSE', 'PRESS')
diff --git a/release/scripts/presets/keyconfig/maya.py b/release/scripts/presets/keyconfig/maya.py
index 3f4754863c6..cf213c1ddbd 100644
--- a/release/scripts/presets/keyconfig/maya.py
+++ b/release/scripts/presets/keyconfig/maya.py
@@ -935,6 +935,9 @@ kmi = km.keymap_items.new('view3d.rotate', 'LEFTMOUSE', 'PRESS', alt=True)
kmi = km.keymap_items.new('view3d.manipulator', 'LEFTMOUSE', 'PRESS', shift=True)
kmi.properties.release_confirm = True
kmi.properties.use_planar_constraint = True
+kmi = km.keymap_items.new('view3d.manipulator', 'LEFTMOUSE', 'PRESS', shift=True)
+kmi.properties.release_confirm = True
+kmi.properties.use_accurate = True
kmi = km.keymap_items.new('view3d.manipulator', 'LEFTMOUSE', 'PRESS', any=True)
kmi.properties.release_confirm = True
kmi = km.keymap_items.new('view3d.move', 'MIDDLEMOUSE', 'PRESS', alt=True)
diff --git a/source/blender/blenlib/BLI_rect.h b/source/blender/blenlib/BLI_rect.h
index 484a679f76a..041679ef876 100644
--- a/source/blender/blenlib/BLI_rect.h
+++ b/source/blender/blenlib/BLI_rect.h
@@ -47,8 +47,8 @@ bool BLI_rcti_is_empty(const struct rcti *rect);
bool BLI_rctf_is_empty(const struct rctf *rect);
void BLI_rctf_init(struct rctf *rect, float xmin, float xmax, float ymin, float ymax);
void BLI_rcti_init(struct rcti *rect, int xmin, int xmax, int ymin, int ymax);
-void BLI_rctf_init_pt_size(struct rctf *rect, const float xy[2], float size);
-void BLI_rcti_init_pt_size(struct rcti *rect, const int xy[2], int size);
+void BLI_rctf_init_pt_radius(struct rctf *rect, const float xy[2], float size);
+void BLI_rcti_init_pt_radius(struct rcti *rect, const int xy[2], int size);
void BLI_rcti_init_minmax(struct rcti *rect);
void BLI_rctf_init_minmax(struct rctf *rect);
void BLI_rcti_do_minmax_v(struct rcti *rect, const int xy[2]);
diff --git a/source/blender/blenlib/BLI_task.h b/source/blender/blenlib/BLI_task.h
index d27bf4dad20..c3c587275e1 100644
--- a/source/blender/blenlib/BLI_task.h
+++ b/source/blender/blenlib/BLI_task.h
@@ -81,6 +81,7 @@ typedef void (*TaskFreeFunction)(TaskPool *__restrict pool, void *taskdata, int
TaskPool *BLI_task_pool_create(TaskScheduler *scheduler, void *userdata);
TaskPool *BLI_task_pool_create_background(TaskScheduler *scheduler, void *userdata);
+TaskPool *BLI_task_pool_create_suspended(TaskScheduler *scheduler, void *userdata);
void BLI_task_pool_free(TaskPool *pool);
void BLI_task_pool_push_ex(
@@ -96,9 +97,6 @@ void BLI_task_pool_work_and_wait(TaskPool *pool);
/* cancel all tasks, keep worker threads running */
void BLI_task_pool_cancel(TaskPool *pool);
-/* set number of threads allowed to be used by this pool */
-void BLI_pool_set_num_threads(TaskPool *pool, int num_threads);
-
/* for worker threads, test if canceled */
bool BLI_task_pool_canceled(TaskPool *pool);
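
A minimal usage sketch for the suspended pool declared above, assuming the BLI_task_pool_push() signature from this header (run function, taskdata, free_taskdata flag, priority); demo_task_run and demo_push_many are illustrative names, not part of the patch:

#include "BLI_task.h"

static void demo_task_run(TaskPool *__restrict pool, void *taskdata, int threadid)
{
	int *value = taskdata;
	*value *= 2; /* Some tiny amount of work. */
	(void)pool;
	(void)threadid;
}

static void demo_push_many(int *demo_values, int count)
{
	TaskScheduler *scheduler = BLI_task_scheduler_get();
	/* Tasks pushed below sit in the pool's own queue; workers stay asleep. */
	TaskPool *pool = BLI_task_pool_create_suspended(scheduler, NULL);
	for (int i = 0; i < count; i++) {
		BLI_task_pool_push(pool, demo_task_run, &demo_values[i], false, TASK_PRIORITY_HIGH);
	}
	/* The whole suspended queue is handed to the scheduler here, in one go. */
	BLI_task_pool_work_and_wait(pool);
	BLI_task_pool_free(pool);
}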
diff --git a/source/blender/blenlib/intern/rct.c b/source/blender/blenlib/intern/rct.c
index e01a4714131..fd24a00156d 100644
--- a/source/blender/blenlib/intern/rct.c
+++ b/source/blender/blenlib/intern/rct.c
@@ -351,7 +351,7 @@ void BLI_rcti_init(rcti *rect, int xmin, int xmax, int ymin, int ymax)
}
}
-void BLI_rctf_init_pt_size(rctf *rect, const float xy[2], float size)
+void BLI_rctf_init_pt_radius(rctf *rect, const float xy[2], float size)
{
rect->xmin = xy[0] - size;
rect->xmax = xy[0] + size;
@@ -359,7 +359,7 @@ void BLI_rctf_init_pt_size(rctf *rect, const float xy[2], float size)
rect->ymax = xy[1] + size;
}
-void BLI_rcti_init_pt_size(rcti *rect, const int xy[2], int size)
+void BLI_rcti_init_pt_radius(rcti *rect, const int xy[2], int size)
{
rect->xmin = xy[0] - size;
rect->xmax = xy[0] + size;
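
The rename is purely for clarity: the third argument was always treated as a radius, so the resulting rect is 2*size wide and 2*size tall, centered on the point. A standalone sketch of the semantics (plain C, no Blender headers; DemoRcti mirrors rcti):

#include <assert.h>

typedef struct DemoRcti {
	int xmin, xmax, ymin, ymax;
} DemoRcti;

static void demo_rcti_init_pt_radius(DemoRcti *rect, const int xy[2], int radius)
{
	rect->xmin = xy[0] - radius;
	rect->xmax = xy[0] + radius;
	rect->ymin = xy[1] - radius;
	rect->ymax = xy[1] + radius;
}

int main(void)
{
	const int mval[2] = {100, 200};
	DemoRcti rect;
	demo_rcti_init_pt_radius(&rect, mval, 5);
	/* The picking rect is 10 px wide and 10 px tall, centered on the cursor. */
	assert(rect.xmax - rect.xmin == 10);
	assert(rect.ymax - rect.ymin == 10);
	return 0;
}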
diff --git a/source/blender/blenlib/intern/task.c b/source/blender/blenlib/intern/task.c
index 5d16fd9229c..49d2ee83a66 100644
--- a/source/blender/blenlib/intern/task.c
+++ b/source/blender/blenlib/intern/task.c
@@ -48,6 +48,32 @@
*/
#define MEMPOOL_SIZE 256
+/* Number of tasks which are pushed directly to the local thread queue.
+ *
+ * This allows a thread to fetch the next task without locking the whole queue.
+ */
+#define LOCALQUEUE_SIZE 1
+
+#ifndef NDEBUG
+# define ASSERT_THREAD_ID(scheduler, thread_id) \
+ do { \
+ if (!BLI_thread_is_main()) { \
+ TaskThread *thread = pthread_getspecific(scheduler->tls_id_key); \
+ if (thread == NULL) { \
+ BLI_assert(thread_id == 0); \
+ } \
+ else { \
+ BLI_assert(thread_id == thread->id); \
+ } \
+ } \
+ else { \
+ BLI_assert(thread_id == 0); \
+ } \
+ } while (false)
+#else
+# define ASSERT_THREAD_ID(scheduler, thread_id)
+#endif
+
typedef struct Task {
struct Task *next, *prev;
@@ -102,12 +128,16 @@ typedef struct TaskMemPoolStats {
} TaskMemPoolStats;
#endif
+typedef struct TaskThreadLocalStorage {
+ TaskMemPool task_mempool;
+ int num_local_queue;
+ Task *local_queue[LOCALQUEUE_SIZE];
+} TaskThreadLocalStorage;
+
struct TaskPool {
TaskScheduler *scheduler;
volatile size_t num;
- size_t num_threads;
- size_t currently_running_tasks;
ThreadMutex num_mutex;
ThreadCondition num_cond;
@@ -115,6 +145,11 @@ struct TaskPool {
ThreadMutex user_mutex;
volatile bool do_cancel;
+ volatile bool do_work;
+
+ volatile bool is_suspended;
+ ListBase suspended_queue;
+ size_t num_suspended;
/* If set, this pool may never be work_and_wait'ed, which means TaskScheduler
* has to use its special background fallback thread in case we are in
@@ -122,16 +157,10 @@ struct TaskPool {
*/
bool run_in_background;
- /* This pool is used for caching task pointers for thread id 0.
- * This could either point to a global scheduler's task_mempool[0] if the
- * pool is handled form the main thread or point to task_mempool_local
- * otherwise.
- *
- * This way we solve possible threading conflicts accessing same global
- * memory pool from multiple threads from which wait_work() is called.
+	/* This is the task scheduler's ID of the thread on which the pool was
+	 * constructed. It will be used to access the task TLS.
*/
- TaskMemPool *task_mempool;
- TaskMemPool task_mempool_local;
+ int thread_id;
#ifdef DEBUG_STATS
TaskMemPoolStats *mempool_stats;
@@ -141,7 +170,6 @@ struct TaskPool {
struct TaskScheduler {
pthread_t *threads;
struct TaskThread *task_threads;
- TaskMemPool *task_mempool;
int num_threads;
bool background_thread_only;
@@ -150,15 +178,19 @@ struct TaskScheduler {
ThreadCondition queue_cond;
volatile bool do_exit;
+
+ /* NOTE: In pthread's TLS we store the whole TaskThread structure. */
+ pthread_key_t tls_id_key;
};
typedef struct TaskThread {
TaskScheduler *scheduler;
int id;
+ TaskThreadLocalStorage tls;
} TaskThread;
/* Helper */
-static void task_data_free(Task *task, const int thread_id)
+BLI_INLINE void task_data_free(Task *task, const int thread_id)
{
if (task->free_taskdata) {
if (task->freedata) {
@@ -170,12 +202,24 @@ static void task_data_free(Task *task, const int thread_id)
}
}
-BLI_INLINE TaskMemPool *get_task_mempool(TaskPool *pool, const int thread_id)
+BLI_INLINE TaskThreadLocalStorage *get_task_tls(TaskPool *pool,
+ const int thread_id)
{
+ TaskScheduler *scheduler = pool->scheduler;
+ BLI_assert(thread_id >= 0);
+ BLI_assert(thread_id <= scheduler->num_threads);
if (thread_id == 0) {
- return pool->task_mempool;
+ return &scheduler->task_threads[pool->thread_id].tls;
+ }
+ return &scheduler->task_threads[thread_id].tls;
+}
+
+BLI_INLINE void free_task_tls(TaskThreadLocalStorage *tls)
+{
+ TaskMemPool *task_mempool = &tls->task_mempool;
+ for (int i = 0; i < task_mempool->num_tasks; ++i) {
+ MEM_freeN(task_mempool->tasks[i]);
}
- return &pool->scheduler->task_mempool[thread_id];
}
static Task *task_alloc(TaskPool *pool, const int thread_id)
@@ -183,15 +227,17 @@ static Task *task_alloc(TaskPool *pool, const int thread_id)
BLI_assert(thread_id <= pool->scheduler->num_threads);
if (thread_id != -1) {
BLI_assert(thread_id >= 0);
- TaskMemPool *mem_pool = get_task_mempool(pool, thread_id);
+ BLI_assert(thread_id <= pool->scheduler->num_threads);
+ TaskThreadLocalStorage *tls = get_task_tls(pool, thread_id);
+ TaskMemPool *task_mempool = &tls->task_mempool;
/* Try to re-use task memory from a thread local storage. */
- if (mem_pool->num_tasks > 0) {
- --mem_pool->num_tasks;
+ if (task_mempool->num_tasks > 0) {
+ --task_mempool->num_tasks;
/* Success! We've just avoided task allocation. */
#ifdef DEBUG_STATS
pool->mempool_stats[thread_id].num_reuse++;
#endif
- return mem_pool->tasks[mem_pool->num_tasks];
+ return task_mempool->tasks[task_mempool->num_tasks];
}
/* We are doomed to allocate new task data. */
#ifdef DEBUG_STATS
@@ -206,11 +252,12 @@ static void task_free(TaskPool *pool, Task *task, const int thread_id)
task_data_free(task, thread_id);
BLI_assert(thread_id >= 0);
BLI_assert(thread_id <= pool->scheduler->num_threads);
- TaskMemPool *mem_pool = get_task_mempool(pool, thread_id);
- if (mem_pool->num_tasks < MEMPOOL_SIZE - 1) {
+ TaskThreadLocalStorage *tls = get_task_tls(pool, thread_id);
+ TaskMemPool *task_mempool = &tls->task_mempool;
+ if (task_mempool->num_tasks < MEMPOOL_SIZE - 1) {
/* Successfully allowed the task to be re-used later. */
- mem_pool->tasks[mem_pool->num_tasks] = task;
- ++mem_pool->num_tasks;
+ task_mempool->tasks[task_mempool->num_tasks] = task;
+ ++task_mempool->num_tasks;
}
else {
/* Local storage saturated, no other way than just discard
@@ -236,7 +283,6 @@ static void task_pool_num_decrease(TaskPool *pool, size_t done)
BLI_assert(pool->num >= done);
pool->num -= done;
- atomic_sub_and_fetch_z(&pool->currently_running_tasks, done);
if (pool->num == 0)
BLI_condition_notify_all(&pool->num_cond);
@@ -244,11 +290,11 @@ static void task_pool_num_decrease(TaskPool *pool, size_t done)
BLI_mutex_unlock(&pool->num_mutex);
}
-static void task_pool_num_increase(TaskPool *pool)
+static void task_pool_num_increase(TaskPool *pool, size_t new)
{
BLI_mutex_lock(&pool->num_mutex);
- pool->num++;
+ pool->num += new;
BLI_condition_notify_all(&pool->num_cond);
BLI_mutex_unlock(&pool->num_mutex);
@@ -290,17 +336,10 @@ static bool task_scheduler_thread_wait_pop(TaskScheduler *scheduler, Task **task
continue;
}
- if (atomic_add_and_fetch_z(&pool->currently_running_tasks, 1) <= pool->num_threads ||
- pool->num_threads == 0)
- {
- *task = current_task;
- found_task = true;
- BLI_remlink(&scheduler->queue, *task);
- break;
- }
- else {
- atomic_sub_and_fetch_z(&pool->currently_running_tasks, 1);
- }
+ *task = current_task;
+ found_task = true;
+ BLI_remlink(&scheduler->queue, *task);
+ break;
}
if (!found_task)
BLI_condition_wait(&scheduler->queue_cond, &scheduler->queue_mutex);
@@ -311,13 +350,34 @@ static bool task_scheduler_thread_wait_pop(TaskScheduler *scheduler, Task **task
return true;
}
+BLI_INLINE void handle_local_queue(TaskThreadLocalStorage *tls,
+ const int thread_id)
+{
+ while (tls->num_local_queue > 0) {
+		/* We pop the task from the queue before handling it, so the task's
+		 * handler can push the next job to the local queue.
+ */
+ tls->num_local_queue--;
+ Task *local_task = tls->local_queue[tls->num_local_queue];
+		/* TODO(sergey): Double-check that work_and_wait() doesn't handle
+		 * other pools' tasks.
+ */
+ TaskPool *local_pool = local_task->pool;
+ local_task->run(local_pool, local_task->taskdata, thread_id);
+ task_free(local_pool, local_task, thread_id);
+ }
+}
+
static void *task_scheduler_thread_run(void *thread_p)
{
TaskThread *thread = (TaskThread *) thread_p;
+ TaskThreadLocalStorage *tls = &thread->tls;
TaskScheduler *scheduler = thread->scheduler;
int thread_id = thread->id;
Task *task;
+ pthread_setspecific(scheduler->tls_id_key, thread);
+
/* keep popping off tasks */
while (task_scheduler_thread_wait_pop(scheduler, &task)) {
TaskPool *pool = task->pool;
@@ -328,6 +388,9 @@ static void *task_scheduler_thread_run(void *thread_p)
/* delete task */
task_free(pool, task, thread_id);
+ /* Handle all tasks from local queue. */
+ handle_local_queue(tls, thread_id);
+
/* notify pool task was done */
task_pool_num_decrease(pool, 1);
}
@@ -361,16 +424,20 @@ TaskScheduler *BLI_task_scheduler_create(int num_threads)
num_threads = 1;
}
+ scheduler->task_threads = MEM_callocN(sizeof(TaskThread) * (num_threads + 1),
+ "TaskScheduler task threads");
+
+ pthread_key_create(&scheduler->tls_id_key, NULL);
+
/* launch threads that will be waiting for work */
if (num_threads > 0) {
int i;
scheduler->num_threads = num_threads;
scheduler->threads = MEM_callocN(sizeof(pthread_t) * num_threads, "TaskScheduler threads");
- scheduler->task_threads = MEM_callocN(sizeof(TaskThread) * num_threads, "TaskScheduler task threads");
for (i = 0; i < num_threads; i++) {
- TaskThread *thread = &scheduler->task_threads[i];
+ TaskThread *thread = &scheduler->task_threads[i + 1];
thread->scheduler = scheduler;
thread->id = i + 1;
@@ -378,9 +445,6 @@ TaskScheduler *BLI_task_scheduler_create(int num_threads)
fprintf(stderr, "TaskScheduler failed to launch thread %d/%d\n", i, num_threads);
}
}
-
- scheduler->task_mempool = MEM_callocN(sizeof(*scheduler->task_mempool) * (num_threads + 1),
- "TaskScheduler task_mempool");
}
return scheduler;
@@ -396,6 +460,8 @@ void BLI_task_scheduler_free(TaskScheduler *scheduler)
BLI_condition_notify_all(&scheduler->queue_cond);
BLI_mutex_unlock(&scheduler->queue_mutex);
+ pthread_key_delete(scheduler->tls_id_key);
+
/* delete threads */
if (scheduler->threads) {
int i;
@@ -410,17 +476,12 @@ void BLI_task_scheduler_free(TaskScheduler *scheduler)
/* Delete task thread data */
if (scheduler->task_threads) {
- MEM_freeN(scheduler->task_threads);
- }
-
- /* Delete task memory pool */
- if (scheduler->task_mempool) {
- for (int i = 0; i <= scheduler->num_threads; ++i) {
- for (int j = 0; j < scheduler->task_mempool[i].num_tasks; ++j) {
- MEM_freeN(scheduler->task_mempool[i].tasks[j]);
- }
+ for (int i = 0; i < scheduler->num_threads + 1; ++i) {
+ TaskThreadLocalStorage *tls = &scheduler->task_threads[i].tls;
+ free_task_tls(tls);
}
- MEM_freeN(scheduler->task_mempool);
+
+ MEM_freeN(scheduler->task_threads);
}
/* delete leftover tasks */
@@ -443,7 +504,7 @@ int BLI_task_scheduler_num_threads(TaskScheduler *scheduler)
static void task_scheduler_push(TaskScheduler *scheduler, Task *task, TaskPriority priority)
{
- task_pool_num_increase(task->pool);
+ task_pool_num_increase(task->pool, 1);
/* add task to queue */
BLI_mutex_lock(&scheduler->queue_mutex);
@@ -469,7 +530,7 @@ static void task_scheduler_clear(TaskScheduler *scheduler, TaskPool *pool)
nexttask = task->next;
if (task->pool == pool) {
- task_data_free(task, 0);
+ task_data_free(task, pool->thread_id);
BLI_freelinkN(&scheduler->queue, task);
done++;
@@ -484,7 +545,10 @@ static void task_scheduler_clear(TaskScheduler *scheduler, TaskPool *pool)
/* Task Pool */
-static TaskPool *task_pool_create_ex(TaskScheduler *scheduler, void *userdata, const bool is_background)
+static TaskPool *task_pool_create_ex(TaskScheduler *scheduler,
+ void *userdata,
+ const bool is_background,
+ const bool is_suspended)
{
TaskPool *pool = MEM_mallocN(sizeof(TaskPool), "TaskPool");
@@ -502,9 +566,11 @@ static TaskPool *task_pool_create_ex(TaskScheduler *scheduler, void *userdata, c
pool->scheduler = scheduler;
pool->num = 0;
- pool->num_threads = 0;
- pool->currently_running_tasks = 0;
pool->do_cancel = false;
+ pool->do_work = false;
+ pool->is_suspended = is_suspended;
+ pool->num_suspended = 0;
+ pool->suspended_queue.first = pool->suspended_queue.last = NULL;
pool->run_in_background = is_background;
BLI_mutex_init(&pool->num_mutex);
@@ -514,11 +580,21 @@ static TaskPool *task_pool_create_ex(TaskScheduler *scheduler, void *userdata, c
BLI_mutex_init(&pool->user_mutex);
if (BLI_thread_is_main()) {
- pool->task_mempool = scheduler->task_mempool;
+ pool->thread_id = 0;
}
else {
- pool->task_mempool = &pool->task_mempool_local;
- pool->task_mempool_local.num_tasks = 0;
+ TaskThread *thread = pthread_getspecific(scheduler->tls_id_key);
+		/* NOTE: It is possible that the pool is created from a non-main thread
+		 * which isn't a scheduler thread. In this case pthread's TLS will
+		 * be NULL and we can safely use thread id 0 for the main
+		 * thread of this pool (the one which does work_and_wait()).
+ */
+ if (thread == NULL) {
+ pool->thread_id = 0;
+ }
+ else {
+ pool->thread_id = thread->id;
+ }
}
#ifdef DEBUG_STATS
@@ -545,7 +621,7 @@ static TaskPool *task_pool_create_ex(TaskScheduler *scheduler, void *userdata, c
*/
TaskPool *BLI_task_pool_create(TaskScheduler *scheduler, void *userdata)
{
- return task_pool_create_ex(scheduler, userdata, false);
+ return task_pool_create_ex(scheduler, userdata, false, false);
}
/**
@@ -560,7 +636,17 @@ TaskPool *BLI_task_pool_create(TaskScheduler *scheduler, void *userdata)
*/
TaskPool *BLI_task_pool_create_background(TaskScheduler *scheduler, void *userdata)
{
- return task_pool_create_ex(scheduler, userdata, true);
+ return task_pool_create_ex(scheduler, userdata, true, false);
+}
+
+/**
+ * Similar to BLI_task_pool_create() but does not schedule any tasks for execution
+ * until BLI_task_pool_work_and_wait() is called. This helps reduce threading
+ * overhead when pushing a huge amount of small initial tasks from the main thread.
+ */
+TaskPool *BLI_task_pool_create_suspended(TaskScheduler *scheduler, void *userdata)
+{
+ return task_pool_create_ex(scheduler, userdata, false, true);
}
void BLI_task_pool_free(TaskPool *pool)
@@ -572,13 +658,6 @@ void BLI_task_pool_free(TaskPool *pool)
BLI_mutex_end(&pool->user_mutex);
- /* Free local memory pool, those pointers are lost forever. */
- if (pool->task_mempool == &pool->task_mempool_local) {
- for (int i = 0; i < pool->task_mempool_local.num_tasks; i++) {
- MEM_freeN(pool->task_mempool_local.tasks[i]);
- }
- }
-
#ifdef DEBUG_STATS
printf("Thread ID Allocated Reused Discarded\n");
for (int i = 0; i < pool->scheduler->num_threads + 1; ++i) {
@@ -609,6 +688,25 @@ static void task_pool_push(
task->freedata = freedata;
task->pool = pool;
+ if (pool->is_suspended) {
+ BLI_addhead(&pool->suspended_queue, task);
+ atomic_fetch_and_add_z(&pool->num_suspended, 1);
+ return;
+ }
+
+ if (thread_id != -1 &&
+ (thread_id != pool->thread_id || pool->do_work))
+ {
+ ASSERT_THREAD_ID(pool->scheduler, thread_id);
+
+ TaskThreadLocalStorage *tls = get_task_tls(pool, thread_id);
+ if (tls->num_local_queue < LOCALQUEUE_SIZE) {
+ tls->local_queue[tls->num_local_queue] = task;
+ tls->num_local_queue++;
+ return;
+ }
+ }
+
task_scheduler_push(pool->scheduler, task, priority);
}
@@ -633,8 +731,27 @@ void BLI_task_pool_push_from_thread(TaskPool *pool, TaskRunFunction run,
void BLI_task_pool_work_and_wait(TaskPool *pool)
{
+ TaskThreadLocalStorage *tls = get_task_tls(pool, pool->thread_id);
TaskScheduler *scheduler = pool->scheduler;
+ if (atomic_fetch_and_and_uint8((uint8_t*)&pool->is_suspended, 0)) {
+ if (pool->num_suspended) {
+ task_pool_num_increase(pool, pool->num_suspended);
+ BLI_mutex_lock(&scheduler->queue_mutex);
+
+ BLI_movelisttolist(&scheduler->queue, &pool->suspended_queue);
+
+ BLI_condition_notify_all(&scheduler->queue_cond);
+ BLI_mutex_unlock(&scheduler->queue_mutex);
+
+ }
+ pool->is_suspended = false;
+ }
+
+ pool->do_work = true;
+
+ ASSERT_THREAD_ID(pool->scheduler, pool->thread_id);
+
BLI_mutex_lock(&pool->num_mutex);
while (pool->num != 0) {
@@ -648,16 +765,12 @@ void BLI_task_pool_work_and_wait(TaskPool *pool)
/* find task from this pool. if we get a task from another pool,
* we can get into deadlock */
- if (pool->num_threads == 0 ||
- pool->currently_running_tasks < pool->num_threads)
- {
- for (task = scheduler->queue.first; task; task = task->next) {
- if (task->pool == pool) {
- work_task = task;
- found_task = true;
- BLI_remlink(&scheduler->queue, task);
- break;
- }
+ for (task = scheduler->queue.first; task; task = task->next) {
+ if (task->pool == pool) {
+ work_task = task;
+ found_task = true;
+ BLI_remlink(&scheduler->queue, task);
+ break;
}
}
@@ -666,11 +779,13 @@ void BLI_task_pool_work_and_wait(TaskPool *pool)
/* if found task, do it, otherwise wait until other tasks are done */
if (found_task) {
/* run task */
- atomic_add_and_fetch_z(&pool->currently_running_tasks, 1);
- work_task->run(pool, work_task->taskdata, 0);
+ work_task->run(pool, work_task->taskdata, pool->thread_id);
/* delete task */
- task_free(pool, task, 0);
+ task_free(pool, task, pool->thread_id);
+
+ /* Handle all tasks from local queue. */
+ handle_local_queue(tls, pool->thread_id);
/* notify pool task was done */
task_pool_num_decrease(pool, 1);
@@ -685,12 +800,8 @@ void BLI_task_pool_work_and_wait(TaskPool *pool)
}
BLI_mutex_unlock(&pool->num_mutex);
-}
-void BLI_pool_set_num_threads(TaskPool *pool, int num_threads)
-{
- /* NOTE: Don't try to modify threads while tasks are running! */
- pool->num_threads = num_threads;
+ handle_local_queue(tls, pool->thread_id);
}
void BLI_task_pool_cancel(TaskPool *pool)
@@ -893,7 +1004,8 @@ static void task_parallel_range_ex(
BLI_task_pool_push_from_thread(task_pool,
parallel_range_func,
userdata_chunk_local, false,
- TASK_PRIORITY_HIGH, 0);
+ TASK_PRIORITY_HIGH,
+ task_pool->thread_id);
}
BLI_task_pool_work_and_wait(task_pool);
@@ -1099,7 +1211,8 @@ void BLI_task_parallel_listbase(
BLI_task_pool_push_from_thread(task_pool,
parallel_listbase_func,
NULL, false,
- TASK_PRIORITY_HIGH, 0);
+ TASK_PRIORITY_HIGH,
+ task_pool->thread_id);
}
BLI_task_pool_work_and_wait(task_pool);
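
A standalone sketch of the local-queue technique introduced here (plain C with pthreads, independent of Blender's types; DemoTask, DemoTLS, and the demo_* functions are illustrative): a worker drains a small per-thread array before touching the mutex-protected global queue, which is what handle_local_queue() and the new branch in task_pool_push() implement.

#include <pthread.h>
#include <stddef.h>

#define DEMO_LOCALQUEUE_SIZE 1

typedef struct DemoTask {
	struct DemoTask *next;
	void (*run)(void *data);
	void *data;
} DemoTask;

typedef struct DemoTLS {
	DemoTask *local_queue[DEMO_LOCALQUEUE_SIZE];
	int num_local_queue;
} DemoTLS;

static DemoTask *demo_global_queue = NULL;
static pthread_mutex_t demo_queue_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Push from a worker: prefer the lock-free per-thread slot. */
static void demo_push(DemoTLS *tls, DemoTask *task)
{
	if (tls != NULL && tls->num_local_queue < DEMO_LOCALQUEUE_SIZE) {
		tls->local_queue[tls->num_local_queue++] = task;
		return;
	}
	pthread_mutex_lock(&demo_queue_mutex);
	task->next = demo_global_queue;
	demo_global_queue = task;
	pthread_mutex_unlock(&demo_queue_mutex);
}

/* Drain the local queue after each global task; the handler may push
 * follow-up work back into it without ever taking the mutex. */
static void demo_handle_local_queue(DemoTLS *tls)
{
	while (tls->num_local_queue > 0) {
		DemoTask *task = tls->local_queue[--tls->num_local_queue];
		task->run(task->data);
	}
}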
diff --git a/source/blender/compositor/operations/COM_ConvolutionEdgeFilterOperation.cpp b/source/blender/compositor/operations/COM_ConvolutionEdgeFilterOperation.cpp
index e1ada9a8c39..5f78067220a 100644
--- a/source/blender/compositor/operations/COM_ConvolutionEdgeFilterOperation.cpp
+++ b/source/blender/compositor/operations/COM_ConvolutionEdgeFilterOperation.cpp
@@ -94,4 +94,10 @@ void ConvolutionEdgeFilterOperation::executePixel(float output[4], int x, int y,
output[2] = output[2] * value[0] + in2[2] * mval;
output[3] = in2[3];
+
+ /* Make sure we don't return negative color. */
+ output[0] = max(output[0], 0.0f);
+ output[1] = max(output[1], 0.0f);
+ output[2] = max(output[2], 0.0f);
+ output[3] = max(output[3], 0.0f);
}
diff --git a/source/blender/compositor/operations/COM_ConvolutionFilterOperation.cpp b/source/blender/compositor/operations/COM_ConvolutionFilterOperation.cpp
index 68ec2be5ebd..6ac1ff9a1eb 100644
--- a/source/blender/compositor/operations/COM_ConvolutionFilterOperation.cpp
+++ b/source/blender/compositor/operations/COM_ConvolutionFilterOperation.cpp
@@ -107,6 +107,12 @@ void ConvolutionFilterOperation::executePixel(float output[4], int x, int y, voi
output[1] = output[1] * value[0] + in2[1] * mval;
output[2] = output[2] * value[0] + in2[2] * mval;
output[3] = output[3] * value[0] + in2[3] * mval;
+
+ /* Make sure we don't return negative color. */
+ output[0] = max(output[0], 0.0f);
+ output[1] = max(output[1], 0.0f);
+ output[2] = max(output[2], 0.0f);
+ output[3] = max(output[3], 0.0f);
}
bool ConvolutionFilterOperation::determineDependingAreaOfInterest(rcti *input, ReadBufferOperation *readOperation, rcti *output)
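
Why the clamp matters: edge-detection kernels have weights that sum to zero, so the weighted sum can legitimately go negative, and max(..., 0.0f) keeps the result a valid color. A standalone illustration (plain C; the kernel and pixel values are made up):

#include <stdio.h>

static float demo_max(float a, float b)
{
	return a > b ? a : b;
}

int main(void)
{
	/* 3x3 edge-detect kernel: the weights sum to zero, so flat areas map
	 * to 0 and a dark pixel among bright neighbors yields a negative sum. */
	const float kernel[9] = {-1, -1, -1, -1, 8, -1, -1, -1, -1};
	const float pixels[9] = {1.0f, 1.0f, 1.0f, 1.0f, 0.2f, 1.0f, 1.0f, 1.0f, 1.0f};
	float sum = 0.0f;
	for (int i = 0; i < 9; i++) {
		sum += kernel[i] * pixels[i];
	}
	/* Prints "raw: -6.400000 clamped: 0.000000". */
	printf("raw: %f clamped: %f\n", sum, demo_max(sum, 0.0f));
	return 0;
}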
diff --git a/source/blender/depsgraph/intern/eval/deg_eval.cc b/source/blender/depsgraph/intern/eval/deg_eval.cc
index 3a042535d26..e739bc9dbb5 100644
--- a/source/blender/depsgraph/intern/eval/deg_eval.cc
+++ b/source/blender/depsgraph/intern/eval/deg_eval.cc
@@ -95,105 +95,38 @@ static void deg_task_run_func(TaskPool *pool,
/* Should only be the case for NOOPs, which never get to this point. */
BLI_assert(node->evaluate);
- while (true) {
- /* Get context. */
- /* TODO: Who initialises this? "Init" operations aren't able to
- * initialise it!!!
- */
- /* TODO(sergey): We don't use component contexts at this moment. */
- /* ComponentDepsNode *comp = node->owner; */
- BLI_assert(node->owner != NULL);
-
- /* Since we're not leaving the thread for until the graph branches it is
- * possible to have NO-OP on the way. for which evaluate() will be NULL.
- * but that's all fine, we'll just scheduler it's children.
- */
- if (node->evaluate) {
+ /* Get context. */
+ /* TODO: Who initialises this? "Init" operations aren't able to
+ * initialise it!!!
+ */
+ /* TODO(sergey): We don't use component contexts at this moment. */
+ /* ComponentDepsNode *comp = node->owner; */
+ BLI_assert(node->owner != NULL);
+
+	/* Since we're not leaving the thread until the graph branches, it is
+	 * possible to have a NO-OP on the way, for which evaluate() will be NULL,
+	 * but that's all fine, we'll just schedule its children.
+ */
+ if (node->evaluate) {
/* Take note of current time. */
#ifdef USE_DEBUGGER
- double start_time = PIL_check_seconds_timer();
- DepsgraphDebug::task_started(state->graph, node);
+ double start_time = PIL_check_seconds_timer();
+ DepsgraphDebug::task_started(state->graph, node);
#endif
- /* Perform operation. */
- node->evaluate(state->eval_ctx);
+ /* Perform operation. */
+ node->evaluate(state->eval_ctx);
/* Note how long this took. */
#ifdef USE_DEBUGGER
- double end_time = PIL_check_seconds_timer();
- DepsgraphDebug::task_completed(state->graph,
- node,
- end_time - start_time);
+ double end_time = PIL_check_seconds_timer();
+ DepsgraphDebug::task_completed(state->graph,
+ node,
+ end_time - start_time);
#endif
- }
-
- /* If there's only one outgoing link we try to immediately switch to
- * that node evaluation, without leaving the thread.
- *
- * It's only doable if the child don't have extra relations or all they
- * are satisfied.
- *
- * TODO(sergey): Checks here can be de-duplicated with the ones from
- * schedule_node(), however, how to do it nicely?
- */
- if (node->outlinks.size() == 1) {
- DepsRelation *rel = node->outlinks[0];
- OperationDepsNode *child = (OperationDepsNode *)rel->to;
- BLI_assert(child->type == DEPSNODE_TYPE_OPERATION);
- if (!child->scheduled) {
- unsigned int id_layers = child->owner->owner->layers;
- if (!((child->flag & DEPSOP_FLAG_NEEDS_UPDATE) != 0 &&
- (id_layers & state->layers) != 0))
- {
- /* Node does not need an update, so can;t continue with the
- * chain and need to switch to another one by leaving the
- * thread.
- */
- break;
- }
- if ((rel->flag & DEPSREL_FLAG_CYCLIC) == 0) {
- BLI_assert(child->num_links_pending > 0);
- atomic_sub_and_fetch_uint32(&child->num_links_pending, 1);
- }
- if (child->num_links_pending == 0) {
- bool is_scheduled = atomic_fetch_and_or_uint8(
- (uint8_t *)&child->scheduled, (uint8_t)true);
- if (!is_scheduled) {
- /* Node was not scheduled, switch to it! */
- node = child;
- }
- else {
- /* Someone else scheduled the node, leaving us
- * unemployed in this thread, we're leaving.
- */
- break;
- }
- }
- else {
- /* There are other dependencies on the child, can't do
- * anything in the current thread.
- */
- break;
- }
- }
- else {
- /* Happens when having cyclic dependencies.
- *
- * Nothing to do here, single child was already scheduled, we
- * can leave the thread now.
- */
- break;
- }
- }
- else {
- /* TODO(sergey): It's possible to use one of the outgoing relations
- * as a chain which we'll try to keep alive, but it's a bit more
- * involved change.
- */
- schedule_children(pool, state->graph, node, state->layers, thread_id);
- break;
- }
}
+
+ schedule_children(pool, state->graph, node, state->layers, thread_id);
}
typedef struct CalculatePengindData {
@@ -378,12 +311,19 @@ void deg_evaluate_on_refresh(EvaluationContext *eval_ctx,
state.graph = graph;
state.layers = layers;
- TaskScheduler *task_scheduler = BLI_task_scheduler_get();
- TaskPool *task_pool = BLI_task_pool_create(task_scheduler, &state);
+ TaskScheduler *task_scheduler;
+ bool need_free_scheduler;
if (G.debug & G_DEBUG_DEPSGRAPH_NO_THREADS) {
- BLI_pool_set_num_threads(task_pool, 1);
+ task_scheduler = BLI_task_scheduler_create(1);
+ need_free_scheduler = true;
}
+ else {
+ task_scheduler = BLI_task_scheduler_get();
+ need_free_scheduler = false;
+ }
+
+ TaskPool *task_pool = BLI_task_pool_create_suspended(task_scheduler, &state);
calculate_pending_parents(graph, layers);
@@ -410,6 +350,10 @@ void deg_evaluate_on_refresh(EvaluationContext *eval_ctx,
/* Clear any uncleared tags - just in case. */
deg_graph_clear_tags(graph);
+
+ if (need_free_scheduler) {
+ BLI_task_scheduler_free(task_scheduler);
+ }
}
} // namespace DEG
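
The scheduler-ownership pattern used above, condensed into a sketch (demo_evaluate and single_threaded are illustrative stand-ins for deg_evaluate_on_refresh() and the G_DEBUG_DEPSGRAPH_NO_THREADS check): own a private one-thread scheduler when threading is disabled, borrow the global one otherwise, and free only what you own.

#include <stdbool.h>

#include "BLI_task.h"

static void demo_evaluate(void *state, bool single_threaded)
{
	TaskScheduler *task_scheduler;
	bool need_free_scheduler;
	if (single_threaded) {
		/* Private one-thread scheduler: same code paths, no parallelism. */
		task_scheduler = BLI_task_scheduler_create(1);
		need_free_scheduler = true;
	}
	else {
		task_scheduler = BLI_task_scheduler_get();
		need_free_scheduler = false;
	}
	TaskPool *task_pool = BLI_task_pool_create_suspended(task_scheduler, state);
	/* ... push root operations here, then flush them all at once ... */
	BLI_task_pool_work_and_wait(task_pool);
	BLI_task_pool_free(task_pool);
	/* Only tear down what we own; the global scheduler stays alive. */
	if (need_free_scheduler) {
		BLI_task_scheduler_free(task_scheduler);
	}
}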
diff --git a/source/blender/depsgraph/intern/eval/deg_eval_flush.cc b/source/blender/depsgraph/intern/eval/deg_eval_flush.cc
index 7c6c25bef0d..e10f86f6e95 100644
--- a/source/blender/depsgraph/intern/eval/deg_eval_flush.cc
+++ b/source/blender/depsgraph/intern/eval/deg_eval_flush.cc
@@ -180,6 +180,11 @@ void deg_graph_flush_updates(Main *bmain, Depsgraph *graph)
comp_node->done = 1;
/* Flush to nodes along links... */
+	/* TODO(sergey): This is mainly giving a speedup due to fewer queue pushes,
+	 * which reduces the number of memory allocations.
+	 *
+	 * We should try to solve the allocation issue instead of doing crazy things here.
+ */
if (node->outlinks.size() == 1) {
OperationDepsNode *to_node = (OperationDepsNode *)node->outlinks[0]->to;
if (to_node->scheduled == false) {
diff --git a/source/blender/editors/armature/armature_select.c b/source/blender/editors/armature/armature_select.c
index a9acb41fcf6..a6f2fa40f46 100644
--- a/source/blender/editors/armature/armature_select.c
+++ b/source/blender/editors/armature/armature_select.c
@@ -303,11 +303,11 @@ static EditBone *get_nearest_editbonepoint(
ebone_next_act = NULL;
}
- BLI_rcti_init_pt_size(&rect, mval, 5);
+ BLI_rcti_init_pt_radius(&rect, mval, 5);
hits = view3d_opengl_select(vc, buffer, MAXPICKBUF, &rect, true);
if (hits == 0) {
- BLI_rcti_init_pt_size(&rect, mval, 12);
+ BLI_rcti_init_pt_radius(&rect, mval, 12);
hits = view3d_opengl_select(vc, buffer, MAXPICKBUF, &rect, true);
}
/* See if there are any selected bones in this group */
diff --git a/source/blender/editors/armature/editarmature_sketch.c b/source/blender/editors/armature/editarmature_sketch.c
index c2813d8c5a7..c3b918ef64f 100644
--- a/source/blender/editors/armature/editarmature_sketch.c
+++ b/source/blender/editors/armature/editarmature_sketch.c
@@ -1929,7 +1929,7 @@ static bool sk_selectStroke(bContext *C, SK_Sketch *sketch, const int mval[2], c
view3d_set_viewcontext(C, &vc);
- BLI_rcti_init_pt_size(&rect, mval, 5);
+ BLI_rcti_init_pt_radius(&rect, mval, 5);
hits = view3d_opengl_select(&vc, buffer, MAXPICKBUF, &rect, true);
diff --git a/source/blender/editors/interface/interface_layout.c b/source/blender/editors/interface/interface_layout.c
index ad5f6279606..ce1153911da 100644
--- a/source/blender/editors/interface/interface_layout.c
+++ b/source/blender/editors/interface/interface_layout.c
@@ -2681,13 +2681,14 @@ static void ui_litem_layout_absolute(uiLayout *litem)
static void ui_litem_estimate_split(uiLayout *litem)
{
ui_litem_estimate_row(litem);
+ litem->item.flag &= ~UI_ITEM_MIN;
}
static void ui_litem_layout_split(uiLayout *litem)
{
uiLayoutItemSplit *split = (uiLayoutItemSplit *)litem;
uiItem *item;
- float percentage;
+ float percentage, extra_pixel = 0.0f;
const int tot = BLI_listbase_count(&litem->items);
int itemh, x, y, w, colw = 0;
@@ -2710,7 +2711,9 @@ static void ui_litem_layout_split(uiLayout *litem)
x += colw;
if (item->next) {
- colw = (w - (int)(w * percentage)) / (tot - 1);
+ const float width = extra_pixel + (w - (int)(w * percentage)) / ((float)tot - 1);
+ extra_pixel = width - (int)width;
+ colw = (int)width;
colw = MAX2(colw, 0);
x += litem->space;
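
The extra_pixel accumulator is classic error diffusion: each column's fractional remainder is carried into the next column, so the integer widths sum to the full row width instead of losing a pixel per column. A standalone illustration (plain C; the 10 px across 4 columns numbers are arbitrary):

#include <stdio.h>

int main(void)
{
	/* Split 10 px across 4 columns: 10/4 = 2.5, so plain truncation gives
	 * 4 * 2 = 8 px and silently drops 2. Carrying the fractional remainder
	 * into the next column recovers them. */
	const int w = 10, tot = 4;
	float extra_pixel = 0.0f;
	int used = 0;
	for (int i = 0; i < tot; i++) {
		const float width = extra_pixel + (float)w / (float)tot;
		extra_pixel = width - (int)width; /* Remainder carried forward. */
		used += (int)width;               /* This column's integer width. */
	}
	printf("used %d of %d px\n", used, w); /* Prints "used 10 of 10 px". */
	return 0;
}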
diff --git a/source/blender/editors/metaball/mball_edit.c b/source/blender/editors/metaball/mball_edit.c
index 9c42d3eb08f..fff53d6885e 100644
--- a/source/blender/editors/metaball/mball_edit.c
+++ b/source/blender/editors/metaball/mball_edit.c
@@ -592,7 +592,7 @@ bool ED_mball_select_pick(bContext *C, const int mval[2], bool extend, bool dese
view3d_set_viewcontext(C, &vc);
- BLI_rcti_init_pt_size(&rect, mval, 12);
+ BLI_rcti_init_pt_radius(&rect, mval, 12);
hits = view3d_opengl_select(&vc, buffer, MAXPICKBUF, &rect, true);
diff --git a/source/blender/editors/render/render_opengl.c b/source/blender/editors/render/render_opengl.c
index 21a7ec0d06c..1d870b89026 100644
--- a/source/blender/editors/render/render_opengl.c
+++ b/source/blender/editors/render/render_opengl.c
@@ -718,7 +718,6 @@ static bool screen_opengl_render_init(bContext *C, wmOperator *op)
oglrender->task_scheduler = task_scheduler;
oglrender->task_pool = BLI_task_pool_create_background(task_scheduler,
oglrender);
- BLI_pool_set_num_threads(oglrender->task_pool, 1);
}
else {
oglrender->task_scheduler = NULL;
@@ -750,6 +749,23 @@ static void screen_opengl_render_end(bContext *C, OGLRender *oglrender)
int i;
if (oglrender->is_animation) {
+ /* Trickery part for movie output:
+ *
+		 * We MUST write frames in an exact order, so we only let the
+		 * background thread work on that, while the main thread simply
+		 * waits for it to do all the dirty work.
+		 *
+		 * After this loop is done, work_and_wait() will have nothing to do,
+		 * so we don't run into a wrong order of frames written to the stream.
+ */
+ if (BKE_imtype_is_movie(scene->r.im_format.imtype)) {
+ BLI_mutex_lock(&oglrender->task_mutex);
+ while (oglrender->num_scheduled_frames > 0) {
+ BLI_condition_wait(&oglrender->task_condition,
+ &oglrender->task_mutex);
+ }
+ BLI_mutex_unlock(&oglrender->task_mutex);
+ }
BLI_task_pool_work_and_wait(oglrender->task_pool);
BLI_task_pool_free(oglrender->task_pool);
/* Depending on various things we might or might not use global scheduler. */
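
A standalone sketch of the wait pattern above (plain C with pthreads; the demo_* names are illustrative): the main thread sleeps on a condition variable until the writer thread has drained every scheduled frame, so frames reach the stream strictly in order before work_and_wait() runs.

#include <pthread.h>

static pthread_mutex_t demo_task_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t demo_task_condition = PTHREAD_COND_INITIALIZER;
static int demo_num_scheduled_frames = 0;

/* Writer thread: called after one frame has been written to the stream. */
static void demo_frame_written(void)
{
	pthread_mutex_lock(&demo_task_mutex);
	demo_num_scheduled_frames--;
	pthread_cond_signal(&demo_task_condition);
	pthread_mutex_unlock(&demo_task_mutex);
}

/* Main thread: block until every scheduled frame has been written. */
static void demo_wait_all_frames_written(void)
{
	pthread_mutex_lock(&demo_task_mutex);
	while (demo_num_scheduled_frames > 0) {
		/* Atomically releases the mutex and sleeps until signaled. */
		pthread_cond_wait(&demo_task_condition, &demo_task_mutex);
	}
	pthread_mutex_unlock(&demo_task_mutex);
}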
diff --git a/source/blender/editors/space_node/node_edit.c b/source/blender/editors/space_node/node_edit.c
index 54ddd9aed46..fdfe316f5ed 100644
--- a/source/blender/editors/space_node/node_edit.c
+++ b/source/blender/editors/space_node/node_edit.c
@@ -1070,7 +1070,7 @@ int node_find_indicated_socket(SpaceNode *snode, bNode **nodep, bNodeSocket **so
/* check if we click in a socket */
for (node = snode->edittree->nodes.first; node; node = node->next) {
- BLI_rctf_init_pt_size(&rect, cursor, NODE_SOCKSIZE + 4);
+ BLI_rctf_init_pt_radius(&rect, cursor, NODE_SOCKSIZE + 4);
if (!(node->flag & NODE_HIDDEN)) {
/* extra padding inside and out - allow dragging on the text areas too */
diff --git a/source/blender/editors/space_view3d/space_view3d.c b/source/blender/editors/space_view3d/space_view3d.c
index 7920631b100..37d300ca0a7 100644
--- a/source/blender/editors/space_view3d/space_view3d.c
+++ b/source/blender/editors/space_view3d/space_view3d.c
@@ -882,6 +882,7 @@ static void view3d_main_region_listener(bScreen *sc, ScrArea *sa, ARegion *ar, w
case ND_CONSTRAINT:
case ND_KEYS:
case ND_PARTICLE:
+ case ND_POINTCACHE:
case ND_LOD:
ED_region_tag_redraw(ar);
break;
diff --git a/source/blender/editors/space_view3d/view3d_edit.c b/source/blender/editors/space_view3d/view3d_edit.c
index 131095c8c47..b65e8e01768 100644
--- a/source/blender/editors/space_view3d/view3d_edit.c
+++ b/source/blender/editors/space_view3d/view3d_edit.c
@@ -4872,7 +4872,7 @@ static float view_autodist_depth_margin(ARegion *ar, const int mval[2], int marg
rect.ymax = mval[1] + 1;
}
else {
- BLI_rcti_init_pt_size(&rect, mval, margin);
+ BLI_rcti_init_pt_radius(&rect, mval, margin);
}
view3d_update_depths_rect(ar, &depth_temp, &rect);
diff --git a/source/blender/editors/space_view3d/view3d_intern.h b/source/blender/editors/space_view3d/view3d_intern.h
index 911bea78461..eb961c786af 100644
--- a/source/blender/editors/space_view3d/view3d_intern.h
+++ b/source/blender/editors/space_view3d/view3d_intern.h
@@ -258,7 +258,7 @@ void ED_view3d_smooth_view_force_finish(
struct bContext *C,
struct View3D *v3d, struct ARegion *ar);
-void view3d_winmatrix_set(ARegion *ar, const View3D *v3d, const rctf *rect);
+void view3d_winmatrix_set(ARegion *ar, const View3D *v3d, const rcti *rect);
void view3d_viewmatrix_set(Scene *scene, const View3D *v3d, RegionView3D *rv3d);
void fly_modal_keymap(struct wmKeyConfig *keyconf);
diff --git a/source/blender/editors/space_view3d/view3d_select.c b/source/blender/editors/space_view3d/view3d_select.c
index 921ac136dad..3f6afb2634d 100644
--- a/source/blender/editors/space_view3d/view3d_select.c
+++ b/source/blender/editors/space_view3d/view3d_select.c
@@ -1059,9 +1059,11 @@ static void deselectall_except(SceneLayer *sl, Base *b) /* deselect all except
}
}
-static Base *object_mouse_select_menu(bContext *C, ViewContext *vc, unsigned int *buffer, int hits, const int mval[2], short toggle)
+static Base *object_mouse_select_menu(
+ bContext *C, ViewContext *vc, unsigned int *buffer, int hits,
+ const int mval[2], bool toggle)
{
- short baseCount = 0;
+ int baseCount = 0;
bool ok;
LinkNode *linklist = NULL;
@@ -1151,19 +1153,19 @@ static bool selectbuffer_has_bones(const unsigned int *buffer, const unsigned in
}
/* utility function for mixed_bones_object_selectbuffer */
-static short selectbuffer_ret_hits_15(unsigned int *UNUSED(buffer), const short hits15)
+static int selectbuffer_ret_hits_15(unsigned int *UNUSED(buffer), const int hits15)
{
return hits15;
}
-static short selectbuffer_ret_hits_9(unsigned int *buffer, const short hits15, const short hits9)
+static int selectbuffer_ret_hits_9(unsigned int *buffer, const int hits15, const int hits9)
{
const int offs = 4 * hits15;
memcpy(buffer, buffer + offs, 4 * hits9 * sizeof(unsigned int));
return hits9;
}
-static short selectbuffer_ret_hits_5(unsigned int *buffer, const short hits15, const short hits9, const short hits5)
+static int selectbuffer_ret_hits_5(unsigned int *buffer, const int hits15, const int hits9, const int hits5)
{
const int offs = 4 * hits15 + 4 * hits9;
memcpy(buffer, buffer + offs, 4 * hits5 * sizeof(unsigned int));
@@ -1172,14 +1174,14 @@ static short selectbuffer_ret_hits_5(unsigned int *buffer, const short hits15, c
/* we want a select buffer with bones, if there are... */
/* so check three selection levels and compare */
-static short mixed_bones_object_selectbuffer(
+static int mixed_bones_object_selectbuffer(
ViewContext *vc, unsigned int *buffer, const int mval[2],
bool use_cycle, bool enumerate,
bool *r_do_nearest)
{
rcti rect;
int offs;
- short hits15, hits9 = 0, hits5 = 0;
+ int hits15, hits9 = 0, hits5 = 0;
bool has_bones15 = false, has_bones9 = false, has_bones5 = false;
static int last_mval[2] = {-100, -100};
bool do_nearest = false;
@@ -1207,7 +1209,7 @@ static short mixed_bones_object_selectbuffer(
do_nearest = do_nearest && !enumerate;
- BLI_rcti_init(&rect, mval[0] - 14, mval[0] + 14, mval[1] - 14, mval[1] + 14);
+ BLI_rcti_init_pt_radius(&rect, mval, 14);
hits15 = view3d_opengl_select(vc, buffer, MAXPICKBUF, &rect, do_nearest);
if (hits15 == 1) {
return selectbuffer_ret_hits_15(buffer, hits15);
@@ -1216,7 +1218,7 @@ static short mixed_bones_object_selectbuffer(
has_bones15 = selectbuffer_has_bones(buffer, hits15);
offs = 4 * hits15;
- BLI_rcti_init(&rect, mval[0] - 9, mval[0] + 9, mval[1] - 9, mval[1] + 9);
+ BLI_rcti_init_pt_radius(&rect, mval, 9);
hits9 = view3d_opengl_select(vc, buffer + offs, MAXPICKBUF - offs, &rect, do_nearest);
if (hits9 == 1) {
return selectbuffer_ret_hits_9(buffer, hits15, hits9);
@@ -1225,7 +1227,7 @@ static short mixed_bones_object_selectbuffer(
has_bones9 = selectbuffer_has_bones(buffer + offs, hits9);
offs += 4 * hits9;
- BLI_rcti_init(&rect, mval[0] - 5, mval[0] + 5, mval[1] - 5, mval[1] + 5);
+ BLI_rcti_init_pt_radius(&rect, mval, 5);
hits5 = view3d_opengl_select(vc, buffer + offs, MAXPICKBUF - offs, &rect, do_nearest);
if (hits5 == 1) {
return selectbuffer_ret_hits_5(buffer, hits15, hits9, hits5);
@@ -1384,7 +1386,7 @@ static bool ed_object_select_pick(
bool is_obedit;
float dist = ED_view3d_select_dist_px() * 1.3333f;
bool retval = false;
- short hits;
+ int hits;
const float mval_fl[2] = {(float)mval[0], (float)mval[1]};
@@ -1876,7 +1878,7 @@ static int do_meta_box_select(ViewContext *vc, rcti *rect, bool select, bool ext
int a;
unsigned int buffer[MAXPICKBUF];
- short hits;
+ int hits;
hits = view3d_opengl_select(vc, buffer, MAXPICKBUF, rect, false);
@@ -1910,7 +1912,7 @@ static int do_armature_box_select(ViewContext *vc, rcti *rect, bool select, bool
int a;
unsigned int buffer[MAXPICKBUF];
- short hits;
+ int hits;
hits = view3d_opengl_select(vc, buffer, MAXPICKBUF, rect, false);
@@ -1985,7 +1987,7 @@ static int do_object_pose_box_select(bContext *C, ViewContext *vc, rcti *rect, b
int bone_only;
int bone_selected = 0;
int totobj = MAXPICKBUF; /* XXX solve later */
- short hits;
+ int hits;
if ((ob) && (ob->mode & OB_MODE_POSE))
bone_only = 1;
@@ -2549,7 +2551,7 @@ static void lattice_circle_select(ViewContext *vc, const bool select, const int
/* NOTE: pose-bone case is copied from editbone case... */
-static short pchan_circle_doSelectJoint(void *userData, bPoseChannel *pchan, const float screen_co[2])
+static bool pchan_circle_doSelectJoint(void *userData, bPoseChannel *pchan, const float screen_co[2])
{
CircleSelectUserData *data = userData;
@@ -2627,7 +2629,7 @@ static void pose_circle_select(ViewContext *vc, const bool select, const int mva
}
}
-static short armature_circle_doSelectJoint(void *userData, EditBone *ebone, const float screen_co[2], short head)
+static bool armature_circle_doSelectJoint(void *userData, EditBone *ebone, const float screen_co[2], bool head)
{
CircleSelectUserData *data = userData;
diff --git a/source/blender/editors/space_view3d/view3d_view.c b/source/blender/editors/space_view3d/view3d_view.c
index 687a9a398d9..637479f9ee3 100644
--- a/source/blender/editors/space_view3d/view3d_view.c
+++ b/source/blender/editors/space_view3d/view3d_view.c
@@ -903,7 +903,7 @@ void ED_view3d_polygon_offset(const RegionView3D *rv3d, const float dist)
/**
* \param rect optional for picking (can be NULL).
*/
-void view3d_winmatrix_set(ARegion *ar, const View3D *v3d, const rctf *rect)
+void view3d_winmatrix_set(ARegion *ar, const View3D *v3d, const rcti *rect)
{
RegionView3D *rv3d = ar->regiondata;
rctf viewplane;
@@ -1170,7 +1170,7 @@ short view3d_opengl_select(ViewContext *vc, unsigned int *buffer, unsigned int b
SceneLayer *sl = vc->sl;
View3D *v3d = vc->v3d;
ARegion *ar = vc->ar;
- rctf rect;
+ rcti rect;
short hits;
const bool use_obedit_skip = (scene->obedit != NULL) && (vc->obedit == NULL);
const bool do_passes = do_nearest && GPU_select_query_check_active();
@@ -1180,10 +1180,10 @@ short view3d_opengl_select(ViewContext *vc, unsigned int *buffer, unsigned int b
/* case not a border select */
if (input->xmin == input->xmax) {
/* seems to be default value for bones only now */
- BLI_rctf_init_pt_size(&rect, (const float[2]){input->xmin, input->ymin}, 12);
+ BLI_rcti_init_pt_radius(&rect, (const int[2]){input->xmin, input->ymin}, 12);
}
else {
- BLI_rctf_rcti_copy(&rect, input);
+ rect = *input;
}
view3d_winmatrix_set(ar, v3d, &rect);
diff --git a/source/blender/editors/transform/transform_manipulator.c b/source/blender/editors/transform/transform_manipulator.c
index 489badf594b..a78bf1551bc 100644
--- a/source/blender/editors/transform/transform_manipulator.c
+++ b/source/blender/editors/transform/transform_manipulator.c
@@ -1768,14 +1768,14 @@ static int manipulator_selectbuf(ScrArea *sa, ARegion *ar, const int mval[2], fl
{
View3D *v3d = sa->spacedata.first;
RegionView3D *rv3d = ar->regiondata;
- rctf rect, selrect;
+ rcti rect;
GLuint buffer[64]; // max 4 items per select, so large enuf
short hits;
const bool is_picksel = true;
const bool do_passes = GPU_select_query_check_active();
/* XXX check a bit later on this... (ton) */
- extern void view3d_winmatrix_set(ARegion *ar, View3D *v3d, rctf *rect);
+ extern void view3d_winmatrix_set(ARegion *ar, View3D *v3d, const rcti *rect);
/* when looking through a selected camera, the manipulator can be at the
* exact same position as the view, skip so we don't break selection */
@@ -1787,15 +1787,13 @@ static int manipulator_selectbuf(ScrArea *sa, ARegion *ar, const int mval[2], fl
rect.ymin = mval[1] - hotspot;
rect.ymax = mval[1] + hotspot;
- selrect = rect;
-
view3d_winmatrix_set(ar, v3d, &rect);
mul_m4_m4m4(rv3d->persmat, rv3d->winmat, rv3d->viewmat);
if (do_passes)
- GPU_select_begin(buffer, 64, &selrect, GPU_SELECT_NEAREST_FIRST_PASS, 0);
+ GPU_select_begin(buffer, 64, &rect, GPU_SELECT_NEAREST_FIRST_PASS, 0);
else
- GPU_select_begin(buffer, 64, &selrect, GPU_SELECT_ALL, 0);
+ GPU_select_begin(buffer, 64, &rect, GPU_SELECT_ALL, 0);
/* do the drawing */
if (v3d->twtype & V3D_MANIP_ROTATE) {
@@ -1810,7 +1808,7 @@ static int manipulator_selectbuf(ScrArea *sa, ARegion *ar, const int mval[2], fl
hits = GPU_select_end();
if (do_passes) {
- GPU_select_begin(buffer, 64, &selrect, GPU_SELECT_NEAREST_SECOND_PASS, hits);
+ GPU_select_begin(buffer, 64, &rect, GPU_SELECT_NEAREST_SECOND_PASS, hits);
/* do the drawing */
if (v3d->twtype & V3D_MANIP_ROTATE) {
@@ -1916,10 +1914,8 @@ int BIF_do_manipulator(bContext *C, const struct wmEvent *event, wmOperator *op)
drawflags = manipulator_selectbuf(sa, ar, event->mval, 0.2f * (float)U.tw_hotspot);
if (drawflags == 0) drawflags = val;
- /* We are not doing translation but were requested to do planar constraints.
- * This wouldn't work, so we give other keymaps a chance.
- */
- if ((drawflags & MAN_TRANS_C) == 0 && use_planar) {
+	/* Planar constraint doesn't make sense for rotation; give other keymaps a chance. */
+ if ((drawflags & MAN_ROT_C) && use_planar) {
return 0;
}
diff --git a/source/blender/gpu/GPU_select.h b/source/blender/gpu/GPU_select.h
index d3cb914976e..93f5ce13bbd 100644
--- a/source/blender/gpu/GPU_select.h
+++ b/source/blender/gpu/GPU_select.h
@@ -32,7 +32,7 @@
#include "BLI_sys_types.h"
-struct rctf;
+struct rcti;
/* flags for mode of operation */
enum {
@@ -41,7 +41,7 @@ enum {
GPU_SELECT_NEAREST_SECOND_PASS = 3,
};
-void GPU_select_begin(unsigned int *buffer, unsigned int bufsize, const struct rctf *input, char mode, int oldhits);
+void GPU_select_begin(unsigned int *buffer, unsigned int bufsize, const struct rcti *input, char mode, int oldhits);
bool GPU_select_load_id(unsigned int id);
unsigned int GPU_select_end(void);
bool GPU_select_query_check_active(void);
diff --git a/source/blender/gpu/intern/gpu_select.c b/source/blender/gpu/intern/gpu_select.c
index f78191a6f6d..35944c455a5 100644
--- a/source/blender/gpu/intern/gpu_select.c
+++ b/source/blender/gpu/intern/gpu_select.c
@@ -37,6 +37,8 @@
#include "DNA_userdef_types.h"
+#include "BLI_rect.h"
+
#include "BLI_utildefines.h"
/* Ad hoc number of queries to allocate to skip doing many glGenQueries */
@@ -72,7 +74,7 @@ static GPUQueryState g_query_state = {0};
/**
* initialize and provide buffer for results
*/
-void GPU_select_begin(unsigned int *buffer, unsigned int bufsize, const rctf *input, char mode, int oldhits)
+void GPU_select_begin(unsigned int *buffer, unsigned int bufsize, const rcti *input, char mode, int oldhits)
{
g_query_state.select_is_active = true;
g_query_state.query_issued = false;
@@ -109,7 +111,7 @@ void GPU_select_begin(unsigned int *buffer, unsigned int bufsize, const rctf *in
* get rejected before the depth test. Should probably cull rect against
* scissor for viewport but this is a rare case I think */
glGetFloatv(GL_SCISSOR_BOX, viewport);
- glViewport(viewport[0], viewport[1], (int)(input->xmax - input->xmin), (int)(input->ymax - input->ymin));
+ glViewport(viewport[0], viewport[1], BLI_rcti_size_x(input), BLI_rcti_size_y(input));
/* occlusion queries operates on fragments that pass tests and since we are interested on all
* objects in the view frustum independently of their order, we need to disable the depth test */
diff --git a/source/blender/windowmanager/manipulators/intern/wm_manipulatormap.c b/source/blender/windowmanager/manipulators/intern/wm_manipulatormap.c
index 500092f5f2d..2dd02450dcf 100644
--- a/source/blender/windowmanager/manipulators/intern/wm_manipulatormap.c
+++ b/source/blender/windowmanager/manipulators/intern/wm_manipulatormap.c
@@ -279,35 +279,32 @@ static int manipulator_find_intersected_3D_intern(
ARegion *ar = CTX_wm_region(C);
View3D *v3d = sa->spacedata.first;
RegionView3D *rv3d = ar->regiondata;
- rctf rect, selrect;
+ rcti rect;
GLuint buffer[64]; // max 4 items per select, so large enuf
short hits;
const bool do_passes = GPU_select_query_check_active();
- extern void view3d_winmatrix_set(ARegion *ar, View3D *v3d, rctf *rect);
-
+ extern void view3d_winmatrix_set(ARegion *ar, View3D *v3d, const rcti *rect);
rect.xmin = co[0] - hotspot;
rect.xmax = co[0] + hotspot;
rect.ymin = co[1] - hotspot;
rect.ymax = co[1] + hotspot;
- selrect = rect;
-
view3d_winmatrix_set(ar, v3d, &rect);
mul_m4_m4m4(rv3d->persmat, rv3d->winmat, rv3d->viewmat);
if (do_passes)
- GPU_select_begin(buffer, ARRAY_SIZE(buffer), &selrect, GPU_SELECT_NEAREST_FIRST_PASS, 0);
+ GPU_select_begin(buffer, ARRAY_SIZE(buffer), &rect, GPU_SELECT_NEAREST_FIRST_PASS, 0);
else
- GPU_select_begin(buffer, ARRAY_SIZE(buffer), &selrect, GPU_SELECT_ALL, 0);
+ GPU_select_begin(buffer, ARRAY_SIZE(buffer), &rect, GPU_SELECT_ALL, 0);
/* do the drawing */
manipulator_find_active_3D_loop(C, visible_manipulators);
hits = GPU_select_end();
if (do_passes) {
- GPU_select_begin(buffer, ARRAY_SIZE(buffer), &selrect, GPU_SELECT_NEAREST_SECOND_PASS, hits);
+ GPU_select_begin(buffer, ARRAY_SIZE(buffer), &rect, GPU_SELECT_NEAREST_SECOND_PASS, hits);
manipulator_find_active_3D_loop(C, visible_manipulators);
GPU_select_end();
}
diff --git a/source/creator/creator_args.c b/source/creator/creator_args.c
index 5ee69aebd5d..f05dc59875f 100644
--- a/source/creator/creator_args.c
+++ b/source/creator/creator_args.c
@@ -943,7 +943,7 @@ static int arg_handle_native_pixels_set(int UNUSED(argc), const char **UNUSED(ar
}
static const char arg_handle_with_borders_doc[] =
-"\n\tForce opening without borders"
+"\n\tForce opening with borders"
;
static int arg_handle_with_borders(int UNUSED(argc), const char **UNUSED(argv), void *UNUSED(data))
{