git.blender.org/blender.git
-rw-r--r--  intern/cycles/device/device.h | 2
-rw-r--r--  intern/cycles/device/device_opencl.cpp | 2440
-rw-r--r--  intern/cycles/kernel/CMakeLists.txt | 28
-rw-r--r--  intern/cycles/kernel/closure/bsdf.h | 100
-rw-r--r--  intern/cycles/kernel/geom/geom_attribute.h | 8
-rw-r--r--  intern/cycles/kernel/geom/geom_bvh.h | 2
-rw-r--r--  intern/cycles/kernel/geom/geom_motion_triangle.h | 36
-rw-r--r--  intern/cycles/kernel/geom/geom_object.h | 69
-rw-r--r--  intern/cycles/kernel/geom/geom_primitive.h | 36
-rw-r--r--  intern/cycles/kernel/geom/geom_triangle.h | 42
-rw-r--r--  intern/cycles/kernel/kernel.cl | 61
-rw-r--r--  intern/cycles/kernel/kernel_accumulate.h | 2
-rw-r--r--  intern/cycles/kernel/kernel_background_buffer_update.cl | 282
-rw-r--r--  intern/cycles/kernel/kernel_camera.h | 13
-rw-r--r--  intern/cycles/kernel/kernel_compat_cpu.h | 2
-rw-r--r--  intern/cycles/kernel/kernel_compat_cuda.h | 1
-rw-r--r--  intern/cycles/kernel/kernel_compat_opencl.h | 6
-rw-r--r--  intern/cycles/kernel/kernel_data_init.cl | 384
-rw-r--r--  intern/cycles/kernel/kernel_debug.h | 2
-rw-r--r--  intern/cycles/kernel/kernel_differential.h | 6
-rw-r--r--  intern/cycles/kernel/kernel_direct_lighting.cl | 137
-rw-r--r--  intern/cycles/kernel/kernel_emission.h | 85
-rw-r--r--  intern/cycles/kernel/kernel_globals.h | 2
-rw-r--r--  intern/cycles/kernel/kernel_holdout_emission_blurring_pathtermination_ao.cl | 283
-rw-r--r--  intern/cycles/kernel/kernel_lamp_emission.cl | 209
-rw-r--r--  intern/cycles/kernel/kernel_next_iteration_setup.cl | 176
-rw-r--r--  intern/cycles/kernel/kernel_passes.h | 38
-rw-r--r--  intern/cycles/kernel/kernel_path.h | 55
-rw-r--r--  intern/cycles/kernel/kernel_path_common.h | 50
-rw-r--r--  intern/cycles/kernel/kernel_path_state.h | 6
-rw-r--r--  intern/cycles/kernel/kernel_path_surface.h | 48
-rw-r--r--  intern/cycles/kernel/kernel_queue_enqueue.cl | 98
-rw-r--r--  intern/cycles/kernel/kernel_queues.h | 132
-rw-r--r--  intern/cycles/kernel/kernel_random.h | 18
-rw-r--r--  intern/cycles/kernel/kernel_scene_intersect.cl | 164
-rw-r--r--  intern/cycles/kernel/kernel_shader.h | 297
-rw-r--r--  intern/cycles/kernel/kernel_shader_eval.cl | 93
-rw-r--r--  intern/cycles/kernel/kernel_shaderdata_vars.h | 99
-rw-r--r--  intern/cycles/kernel/kernel_shadow.h | 54
-rw-r--r--  intern/cycles/kernel/kernel_shadow_blocked.cl | 126
-rw-r--r--  intern/cycles/kernel/kernel_split.h | 87
-rw-r--r--  intern/cycles/kernel/kernel_sum_all_radiance.cl | 59
-rw-r--r--  intern/cycles/kernel/kernel_types.h | 187
-rw-r--r--  intern/cycles/kernel/kernel_work_stealing.h | 193
-rw-r--r--  intern/cycles/kernel/svm/svm.h | 2
-rw-r--r--  intern/cycles/kernel/svm/svm_attribute.h | 6
-rw-r--r--  intern/cycles/kernel/svm/svm_camera.h | 2
-rw-r--r--  intern/cycles/kernel/svm/svm_closure.h | 140
-rw-r--r--  intern/cycles/kernel/svm/svm_displace.h | 10
-rw-r--r--  intern/cycles/kernel/svm/svm_fresnel.h | 14
-rw-r--r--  intern/cycles/kernel/svm/svm_geometry.h | 42
-rw-r--r--  intern/cycles/kernel/svm/svm_image.h | 6
-rw-r--r--  intern/cycles/kernel/svm/svm_light_path.h | 14
-rw-r--r--  intern/cycles/kernel/svm/svm_tex_coord.h | 114
-rw-r--r--  intern/cycles/kernel/svm/svm_vector_transform.h | 2
-rw-r--r--  intern/cycles/kernel/svm/svm_wireframe.h | 37
-rw-r--r--  intern/cycles/render/session.cpp | 5
57 files changed, 5784 insertions(+), 828 deletions(-)
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 4d40518644e..162f51252b0 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -55,6 +55,7 @@ public:
bool advanced_shading;
bool pack_images;
bool extended_images; /* flag for GPU and Multi device */
+ bool use_split_kernel; /* Denotes if the device is going to run cycles using split-kernel */
vector<DeviceInfo> multi_devices;
DeviceInfo()
@@ -66,6 +67,7 @@ public:
advanced_shading = true;
pack_images = false;
extended_images = false;
+ use_split_kernel = false;
}
};
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index 1147cbd69b4..25eb160d71b 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -39,6 +39,30 @@
CCL_NAMESPACE_BEGIN
#define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))
+#define KERNEL_APPEND_ARG(kernel_name, arg) \
+ opencl_assert(clSetKernelArg(kernel_name, narg++, sizeof(arg), (void*)&arg))
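A minimal usage sketch of this macro (illustrative names, assuming a cl_kernel `kernel`, cl_mem arguments, and a `narg` counter in scope; the film-convert and shader code further down in this patch uses it the same way):

    cl_uint narg = 0;                     /* counter the macro increments */
    KERNEL_APPEND_ARG(kernel, d_data);    /* -> clSetKernelArg(kernel, 0, ...) */
    KERNEL_APPEND_ARG(kernel, d_buffer);  /* -> clSetKernelArg(kernel, 1, ...) */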
+
+/* Macro declarations used with split kernel */
+
+/* Macro to enable/disable work-stealing */
+#define __WORK_STEALING__
+
+#define SPLIT_KERNEL_LOCAL_SIZE_X 64
+#define SPLIT_KERNEL_LOCAL_SIZE_Y 1
+
+/* This value may be tuned according to the scene we are rendering.
+ *
+ * Modifying the PATH_ITER_INC_FACTOR value in proportion to the number of
+ * expected ray bounces will improve performance.
+ */
+#define PATH_ITER_INC_FACTOR 8
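A sketch of how this factor is meant to be used (assumed helper names; the actual host loop with PathIteration_times appears later in this patch): the host enqueues a budget of path iterations per launch and grows the budget to amortize host/device synchronization:

    unsigned int path_iteration_times = PATH_ITER_INC_FACTOR;
    while(!all_rays_terminated()) {  /* hypothetical termination check */
        /* One launch covers path_iteration_times bounces per ray. */
        enqueue_path_iteration_kernels(path_iteration_times);
        path_iteration_times += PATH_ITER_INC_FACTOR;
    }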
+
+/* We allocate global memory in chunks. We may not be able to
+ * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes per chunk,
+ * since some bytes may be needed for aligning chunks of memory.
+ * This is the amount of memory that we dedicate for that purpose.
+ */
+#define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB
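A sketch of the chunking arithmetic this factor supports (`total_bytes` and `max_alloc` are assumed names, not code from the patch):

    size_t max_alloc = 0;
    clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
                    sizeof(max_alloc), &max_alloc, NULL);
    /* Keep DATA_ALLOCATION_MEM_FACTOR bytes of slack per chunk for alignment. */
    size_t chunk_size = max_alloc - DATA_ALLOCATION_MEM_FACTOR;
    size_t num_chunks = (total_bytes + chunk_size - 1) / chunk_size;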
static cl_device_type opencl_device_type()
{
@@ -94,11 +118,11 @@ static string opencl_kernel_build_options(const string& platform, const string *
build_options += "-D__KERNEL_OPENCL_AMD__ ";
else if(platform == "Intel(R) OpenCL") {
- build_options += "-D__KERNEL_OPENCL_INTEL_CPU__";
+ build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ ";
/* options for gdb source level kernel debugging. this segfaults on linux currently */
if(opencl_kernel_use_debug() && debug_src)
- build_options += "-g -s \"" + *debug_src + "\"";
+ build_options += "-g -s \"" + *debug_src + "\" ";
}
if(opencl_kernel_use_debug())
@@ -118,14 +142,18 @@ class OpenCLCache
{
thread_mutex *mutex;
cl_context context;
- cl_program program;
+ /* cl_program for shader, bake, film_convert kernels (used in OpenCLDeviceBase) */
+ cl_program ocl_dev_base_program;
+ /* cl_program for megakernel (used in OpenCLDeviceMegaKernel) */
+ cl_program ocl_dev_megakernel_program;
- Slot() : mutex(NULL), context(NULL), program(NULL) {}
+ Slot() : mutex(NULL), context(NULL), ocl_dev_base_program(NULL), ocl_dev_megakernel_program(NULL) {}
Slot(const Slot &rhs)
: mutex(rhs.mutex)
, context(rhs.context)
- , program(rhs.program)
+ , ocl_dev_base_program(rhs.ocl_dev_base_program)
+ , ocl_dev_megakernel_program(rhs.ocl_dev_megakernel_program)
{
/* copy can only happen in map insert, assert that */
assert(mutex == NULL);
@@ -236,6 +264,12 @@ class OpenCLCache
}
public:
+
+ enum ProgramName {
+ OCL_DEV_BASE_PROGRAM,
+ OCL_DEV_MEGAKERNEL_PROGRAM,
+ };
+
/* see get_something comment */
static cl_context get_context(cl_platform_id platform, cl_device_id device,
thread_scoped_lock &slot_locker)
@@ -254,10 +288,21 @@ public:
}
/* see get_something comment */
- static cl_program get_program(cl_platform_id platform, cl_device_id device,
+ static cl_program get_program(cl_platform_id platform, cl_device_id device, ProgramName program_name,
thread_scoped_lock &slot_locker)
{
- cl_program program = get_something<cl_program>(platform, device, &Slot::program, slot_locker);
+ cl_program program = NULL;
+
+ if(program_name == OCL_DEV_BASE_PROGRAM) {
+ /* Get program related to OpenCLDeviceBase */
+ program = get_something<cl_program>(platform, device, &Slot::ocl_dev_base_program, slot_locker);
+ }
+ else if(program_name == OCL_DEV_MEGAKERNEL_PROGRAM) {
+ /* Get program related to megakernel */
+ program = get_something<cl_program>(platform, device, &Slot::ocl_dev_megakernel_program, slot_locker);
+ } else {
+ assert(!"Invalid program name");
+ }
if(!program)
return NULL;
@@ -284,10 +329,18 @@ public:
}
/* see store_something comment */
- static void store_program(cl_platform_id platform, cl_device_id device, cl_program program,
+ static void store_program(cl_platform_id platform, cl_device_id device, cl_program program, ProgramName program_name,
thread_scoped_lock &slot_locker)
{
- store_something<cl_program>(platform, device, program, &Slot::program, slot_locker);
+ if(program_name == OCL_DEV_BASE_PROGRAM) {
+ store_something<cl_program>(platform, device, program, &Slot::ocl_dev_base_program, slot_locker);
+ }
+ else if(program_name == OCL_DEV_MEGAKERNEL_PROGRAM) {
+ store_something<cl_program>(platform, device, program, &Slot::ocl_dev_megakernel_program, slot_locker);
+ } else {
+ assert(!"Invalid program name\n");
+ return;
+ }
/* increment reference count in OpenCL.
* The caller is going to release the object when done with it. */
@@ -304,8 +357,10 @@ public:
thread_scoped_lock cache_lock(self.cache_lock);
foreach(CacheMap::value_type &item, self.cache) {
- if(item.second.program != NULL)
- clReleaseProgram(item.second.program);
+ if(item.second.ocl_dev_base_program != NULL)
+ clReleaseProgram(item.second.ocl_dev_base_program);
+ if(item.second.ocl_dev_megakernel_program != NULL)
+ clReleaseProgram(item.second.ocl_dev_megakernel_program);
if(item.second.context != NULL)
clReleaseContext(item.second.context);
}
@@ -314,7 +369,7 @@ public:
}
};
-class OpenCLDevice : public Device
+class OpenCLDeviceBase : public Device
{
public:
DedicatedTaskPool task_pool;
@@ -323,7 +378,6 @@ public:
cl_platform_id cpPlatform;
cl_device_id cdDevice;
cl_program cpProgram;
- cl_kernel ckPathTraceKernel;
cl_kernel ckFilmConvertByteKernel;
cl_kernel ckFilmConvertHalfFloatKernel;
cl_kernel ckShaderKernel;
@@ -385,7 +439,7 @@ public:
}
}
- OpenCLDevice(DeviceInfo& info, Stats &stats, bool background_)
+ OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool background_)
: Device(info, stats, background_)
{
cpPlatform = NULL;
@@ -393,7 +447,6 @@ public:
cxContext = NULL;
cqCommandQueue = NULL;
cpProgram = NULL;
- ckPathTraceKernel = NULL;
ckFilmConvertByteKernel = NULL;
ckFilmConvertHalfFloatKernel = NULL;
ckShaderKernel = NULL;
@@ -501,7 +554,7 @@ public:
if(opencl_error(ciErr))
return;
- fprintf(stderr,"Device init succes\n");
+ fprintf(stderr, "Device init success\n");
device_initialized = true;
}
@@ -547,7 +600,11 @@ public:
return true;
}
- bool load_binary(const string& kernel_path, const string& clbin, const string *debug_src = NULL)
+ bool load_binary(const string& /*kernel_path*/,
+ const string& clbin,
+ string custom_kernel_build_options,
+ cl_program *program,
+ const string *debug_src = NULL)
{
/* read binary into memory */
vector<uint8_t> binary;
@@ -562,7 +619,7 @@ public:
size_t size = binary.size();
const uint8_t *bytes = &binary[0];
- cpProgram = clCreateProgramWithBinary(cxContext, 1, &cdDevice,
+ *program = clCreateProgramWithBinary(cxContext, 1, &cdDevice,
&size, &bytes, &status, &ciErr);
if(opencl_error(status) || opencl_error(ciErr)) {
@@ -570,16 +627,16 @@ public:
return false;
}
- if(!build_kernel(kernel_path, debug_src))
+ if(!build_kernel(program, custom_kernel_build_options, debug_src))
return false;
return true;
}
- bool save_binary(const string& clbin)
+ bool save_binary(cl_program *program, const string& clbin)
{
size_t size = 0;
- clGetProgramInfo(cpProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);
+ clGetProgramInfo(*program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);
if(!size)
return false;
@@ -587,7 +644,7 @@ public:
vector<uint8_t> binary(size);
uint8_t *bytes = &binary[0];
- clGetProgramInfo(cpProgram, CL_PROGRAM_BINARIES, sizeof(uint8_t*), &bytes, NULL);
+ clGetProgramInfo(*program, CL_PROGRAM_BINARIES, sizeof(uint8_t*), &bytes, NULL);
if(!path_write_binary(clbin, binary)) {
opencl_error(string_printf("OpenCL failed to write cached binary %s.", clbin.c_str()));
@@ -597,20 +654,23 @@ public:
return true;
}
- bool build_kernel(const string& /*kernel_path*/, const string *debug_src = NULL)
+ bool build_kernel(cl_program *kernel_program,
+ string custom_kernel_build_options,
+ const string *debug_src = NULL)
{
- string build_options = opencl_kernel_build_options(platform_name, debug_src);
-
- ciErr = clBuildProgram(cpProgram, 0, NULL, build_options.c_str(), NULL, NULL);
+ string build_options;
+ build_options = opencl_kernel_build_options(platform_name, debug_src) + custom_kernel_build_options;
+
+ ciErr = clBuildProgram(*kernel_program, 0, NULL, build_options.c_str(), NULL, NULL);
/* show warnings even if build is successful */
size_t ret_val_size = 0;
- clGetProgramBuildInfo(cpProgram, cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
+ clGetProgramBuildInfo(*kernel_program, cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
if(ret_val_size > 1) {
- vector<char> build_log(ret_val_size+1);
- clGetProgramBuildInfo(cpProgram, cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL);
+ vector<char> build_log(ret_val_size + 1);
+ clGetProgramBuildInfo(*kernel_program, cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL);
build_log[ret_val_size] = '\0';
fprintf(stderr, "OpenCL kernel build output:\n");
@@ -625,12 +685,15 @@ public:
return true;
}
- bool compile_kernel(const string& kernel_path, const string& kernel_md5, const string *debug_src = NULL)
+ bool compile_kernel(const string& kernel_path,
+ string source,
+ string custom_kernel_build_options,
+ cl_program *kernel_program,
+ const string *debug_src = NULL)
{
/* we compile kernels consisting of many files. unfortunately opencl
* kernel caches do not seem to recognize changes in included files.
* so we force recompile on changes by adding the md5 hash of all files */
- string source = "#include \"kernel.cl\" // " + kernel_md5 + "\n";
source = path_source_replace_includes(source, kernel_path);
if(debug_src)
@@ -639,7 +702,7 @@ public:
size_t source_len = source.size();
const char *source_str = source.c_str();
- cpProgram = clCreateProgramWithSource(cxContext, 1, &source_str, &source_len, &ciErr);
+ *kernel_program = clCreateProgramWithSource(cxContext, 1, &source_str, &source_len, &ciErr);
if(opencl_error(ciErr))
return false;
@@ -647,7 +710,7 @@ public:
double starttime = time_dt();
printf("Compiling OpenCL kernel ...\n");
- if(!build_kernel(kernel_path, debug_src))
+ if(!build_kernel(kernel_program, custom_kernel_build_options, debug_src))
return false;
printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
@@ -655,7 +718,7 @@ public:
return true;
}
- string device_md5_hash()
+ string device_md5_hash(string kernel_custom_build_options = "")
{
MD5Hash md5;
char version[256], driver[256], name[256], vendor[256];
@@ -671,12 +734,13 @@ public:
md5.append((uint8_t*)driver, strlen(driver));
string options = opencl_kernel_build_options(platform_name);
+ options += kernel_custom_build_options;
md5.append((uint8_t*)options.c_str(), options.size());
return md5.get_hex();
}
- bool load_kernels(bool /*experimental*/)
+ bool load_kernels(const DeviceRequestedFeatures& /*requested_features*/)
{
/* verify if device was initialized */
if(!device_initialized) {
@@ -686,7 +750,7 @@ public:
/* try to use cached kernel */
thread_scoped_lock cache_locker;
- cpProgram = OpenCLCache::get_program(cpPlatform, cdDevice, cache_locker);
+ cpProgram = OpenCLCache::get_program(cpPlatform, cdDevice, OpenCLCache::OCL_DEV_BASE_PROGRAM, cache_locker);
if(!cpProgram) {
/* verify we have right opencl version */
@@ -712,28 +776,27 @@ public:
}
/* if exists already, try use it */
- if(path_exists(clbin) && load_binary(kernel_path, clbin, debug_src)) {
+ if(path_exists(clbin) && load_binary(kernel_path, clbin, "", &cpProgram)) {
/* kernel loaded from binary */
}
else {
+
+ string init_kernel_source = "#include \"kernel.cl\" // " + kernel_md5 + "\n";
+
/* if does not exist or loading binary failed, compile kernel */
- if(!compile_kernel(kernel_path, kernel_md5, debug_src))
+ if(!compile_kernel(kernel_path, init_kernel_source, "", &cpProgram, debug_src))
return false;
/* save binary for reuse */
- if(!save_binary(clbin))
+ if(!save_binary(&cpProgram, clbin))
return false;
}
/* cache the program */
- OpenCLCache::store_program(cpPlatform, cdDevice, cpProgram, cache_locker);
+ OpenCLCache::store_program(cpPlatform, cdDevice, cpProgram, OpenCLCache::OCL_DEV_BASE_PROGRAM, cache_locker);
}
/* find kernels */
- ckPathTraceKernel = clCreateKernel(cpProgram, "kernel_ocl_path_trace", &ciErr);
- if(opencl_error(ciErr))
- return false;
-
ckFilmConvertByteKernel = clCreateKernel(cpProgram, "kernel_ocl_convert_to_byte", &ciErr);
if(opencl_error(ciErr))
return false;
@@ -753,7 +816,7 @@ public:
return true;
}
- ~OpenCLDevice()
+ ~OpenCLDeviceBase()
{
task_pool.stop();
@@ -766,12 +829,14 @@ public:
delete mt->second;
}
- if(ckPathTraceKernel)
- clReleaseKernel(ckPathTraceKernel);
if(ckFilmConvertByteKernel)
clReleaseKernel(ckFilmConvertByteKernel);
if(ckFilmConvertHalfFloatKernel)
clReleaseKernel(ckFilmConvertHalfFloatKernel);
+ if(ckShaderKernel)
+ clReleaseKernel(ckShaderKernel);
+ if(ckBakeKernel)
+ clReleaseKernel(ckBakeKernel);
if(cpProgram)
clReleaseProgram(cpProgram);
if(cqCommandQueue)
@@ -913,42 +978,6 @@ public:
opencl_assert(clFlush(cqCommandQueue));
}
- void path_trace(RenderTile& rtile, int sample)
- {
- /* cast arguments to cl types */
- cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
- cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
- cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
- cl_int d_x = rtile.x;
- cl_int d_y = rtile.y;
- cl_int d_w = rtile.w;
- cl_int d_h = rtile.h;
- cl_int d_sample = sample;
- cl_int d_offset = rtile.offset;
- cl_int d_stride = rtile.stride;
-
- /* sample arguments */
- cl_uint narg = 0;
-
- opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_data), (void*)&d_data));
- opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_buffer), (void*)&d_buffer));
- opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_rng_state), (void*)&d_rng_state));
-
-#define KERNEL_TEX(type, ttype, name) \
- set_kernel_arg_mem(ckPathTraceKernel, &narg, #name);
-#include "kernel_textures.h"
-
- opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_sample), (void*)&d_sample));
- opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_x), (void*)&d_x));
- opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_y), (void*)&d_y));
- opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_w), (void*)&d_w));
- opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_h), (void*)&d_h));
- opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_offset), (void*)&d_offset));
- opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_stride), (void*)&d_stride));
-
- enqueue_kernel(ckPathTraceKernel, d_w, d_h);
- }
-
void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name)
{
cl_mem ptr;
@@ -985,23 +1014,23 @@ public:
cl_kernel ckFilmConvertKernel = (rgba_byte)? ckFilmConvertByteKernel: ckFilmConvertHalfFloatKernel;
- opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_data), (void*)&d_data));
- opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_rgba), (void*)&d_rgba));
- opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_buffer), (void*)&d_buffer));
+ /* TODO : Make the kernel launch similar to Cuda */
+ KERNEL_APPEND_ARG(ckFilmConvertKernel, d_data);
+ KERNEL_APPEND_ARG(ckFilmConvertKernel, d_rgba);
+ KERNEL_APPEND_ARG(ckFilmConvertKernel, d_buffer);
#define KERNEL_TEX(type, ttype, name) \
set_kernel_arg_mem(ckFilmConvertKernel, &narg, #name);
#include "kernel_textures.h"
+#undef KERNEL_TEX
- opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_sample_scale), (void*)&d_sample_scale));
- opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_x), (void*)&d_x));
- opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_y), (void*)&d_y));
- opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_w), (void*)&d_w));
- opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_h), (void*)&d_h));
- opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_offset), (void*)&d_offset));
- opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_stride), (void*)&d_stride));
-
-
+ KERNEL_APPEND_ARG(ckFilmConvertKernel, d_sample_scale);
+ KERNEL_APPEND_ARG(ckFilmConvertKernel, d_x);
+ KERNEL_APPEND_ARG(ckFilmConvertKernel, d_y);
+ KERNEL_APPEND_ARG(ckFilmConvertKernel, d_w);
+ KERNEL_APPEND_ARG(ckFilmConvertKernel, d_h);
+ KERNEL_APPEND_ARG(ckFilmConvertKernel, d_offset);
+ KERNEL_APPEND_ARG(ckFilmConvertKernel, d_stride);
enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
}
@@ -1034,19 +1063,21 @@ public:
cl_int d_sample = sample;
- opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_data), (void*)&d_data));
- opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_input), (void*)&d_input));
- opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_output), (void*)&d_output));
+ /* TODO : Make the kernel launch similar to Cuda */
+ KERNEL_APPEND_ARG(kernel, d_data);
+ KERNEL_APPEND_ARG(kernel, d_input);
+ KERNEL_APPEND_ARG(kernel, d_output);
#define KERNEL_TEX(type, ttype, name) \
set_kernel_arg_mem(kernel, &narg, #name);
#include "kernel_textures.h"
+#undef KERNEL_TEX
- opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_eval_type), (void*)&d_shader_eval_type));
- opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_x), (void*)&d_shader_x));
- opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_w), (void*)&d_shader_w));
- opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_offset), (void*)&d_offset));
- opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_sample), (void*)&d_sample));
+ KERNEL_APPEND_ARG(kernel, d_shader_eval_type);
+ KERNEL_APPEND_ARG(kernel, d_shader_x);
+ KERNEL_APPEND_ARG(kernel, d_shader_w);
+ KERNEL_APPEND_ARG(kernel, d_offset);
+ KERNEL_APPEND_ARG(kernel, d_sample);
enqueue_kernel(kernel, task.shader_w, 1);
@@ -1054,6 +1085,305 @@ public:
}
}
+ class OpenCLDeviceTask : public DeviceTask {
+ public:
+ OpenCLDeviceTask(OpenCLDeviceBase *device, DeviceTask& task)
+ : DeviceTask(task)
+ {
+ run = function_bind(&OpenCLDeviceBase::thread_run,
+ device,
+ this);
+ }
+ };
+
+ int get_split_task_count(DeviceTask& /*task*/)
+ {
+ return 1;
+ }
+
+ void task_add(DeviceTask& task)
+ {
+ task_pool.push(new OpenCLDeviceTask(this, task));
+ }
+
+ void task_wait()
+ {
+ task_pool.wait();
+ }
+
+ void task_cancel()
+ {
+ task_pool.cancel();
+ }
+
+ virtual void thread_run(DeviceTask * /*task*/) = 0;
+
+protected:
+ class ArgumentWrapper {
+ public:
+ ArgumentWrapper() : size(0), pointer(NULL) {}
+ template <typename T>
+ ArgumentWrapper(T& argument) : size(sizeof(argument)),
+ pointer(&argument) { }
+ size_t size;
+ void *pointer;
+ };
+
+ /* TODO(sergey): In the future we can use variadic templates, once
+ * C++0x is allowed. Should allow to clean this up a bit.
+ */
+ int kernel_set_args(cl_kernel kernel,
+ int start_argument_index,
+ const ArgumentWrapper& arg1 = ArgumentWrapper(),
+ const ArgumentWrapper& arg2 = ArgumentWrapper(),
+ const ArgumentWrapper& arg3 = ArgumentWrapper(),
+ const ArgumentWrapper& arg4 = ArgumentWrapper(),
+ const ArgumentWrapper& arg5 = ArgumentWrapper(),
+ const ArgumentWrapper& arg6 = ArgumentWrapper(),
+ const ArgumentWrapper& arg7 = ArgumentWrapper(),
+ const ArgumentWrapper& arg8 = ArgumentWrapper(),
+ const ArgumentWrapper& arg9 = ArgumentWrapper(),
+ const ArgumentWrapper& arg10 = ArgumentWrapper(),
+ const ArgumentWrapper& arg11 = ArgumentWrapper(),
+ const ArgumentWrapper& arg12 = ArgumentWrapper(),
+ const ArgumentWrapper& arg13 = ArgumentWrapper(),
+ const ArgumentWrapper& arg14 = ArgumentWrapper(),
+ const ArgumentWrapper& arg15 = ArgumentWrapper(),
+ const ArgumentWrapper& arg16 = ArgumentWrapper(),
+ const ArgumentWrapper& arg17 = ArgumentWrapper(),
+ const ArgumentWrapper& arg18 = ArgumentWrapper(),
+ const ArgumentWrapper& arg19 = ArgumentWrapper(),
+ const ArgumentWrapper& arg20 = ArgumentWrapper(),
+ const ArgumentWrapper& arg21 = ArgumentWrapper(),
+ const ArgumentWrapper& arg22 = ArgumentWrapper(),
+ const ArgumentWrapper& arg23 = ArgumentWrapper(),
+ const ArgumentWrapper& arg24 = ArgumentWrapper(),
+ const ArgumentWrapper& arg25 = ArgumentWrapper(),
+ const ArgumentWrapper& arg26 = ArgumentWrapper(),
+ const ArgumentWrapper& arg27 = ArgumentWrapper(),
+ const ArgumentWrapper& arg28 = ArgumentWrapper(),
+ const ArgumentWrapper& arg29 = ArgumentWrapper(),
+ const ArgumentWrapper& arg30 = ArgumentWrapper(),
+ const ArgumentWrapper& arg31 = ArgumentWrapper(),
+ const ArgumentWrapper& arg32 = ArgumentWrapper(),
+ const ArgumentWrapper& arg33 = ArgumentWrapper())
+ {
+ int current_arg_index = 0;
+#define FAKE_VARARG_HANDLE_ARG(arg) \
+ do { \
+ if(arg.pointer != NULL) { \
+ opencl_assert(clSetKernelArg( \
+ kernel, \
+ start_argument_index + current_arg_index, \
+ arg.size, arg.pointer)); \
+ ++current_arg_index; \
+ } \
+ else { \
+ return current_arg_index; \
+ } \
+ } while(false)
+ FAKE_VARARG_HANDLE_ARG(arg1);
+ FAKE_VARARG_HANDLE_ARG(arg2);
+ FAKE_VARARG_HANDLE_ARG(arg3);
+ FAKE_VARARG_HANDLE_ARG(arg4);
+ FAKE_VARARG_HANDLE_ARG(arg5);
+ FAKE_VARARG_HANDLE_ARG(arg6);
+ FAKE_VARARG_HANDLE_ARG(arg7);
+ FAKE_VARARG_HANDLE_ARG(arg8);
+ FAKE_VARARG_HANDLE_ARG(arg9);
+ FAKE_VARARG_HANDLE_ARG(arg10);
+ FAKE_VARARG_HANDLE_ARG(arg11);
+ FAKE_VARARG_HANDLE_ARG(arg12);
+ FAKE_VARARG_HANDLE_ARG(arg13);
+ FAKE_VARARG_HANDLE_ARG(arg14);
+ FAKE_VARARG_HANDLE_ARG(arg15);
+ FAKE_VARARG_HANDLE_ARG(arg16);
+ FAKE_VARARG_HANDLE_ARG(arg17);
+ FAKE_VARARG_HANDLE_ARG(arg18);
+ FAKE_VARARG_HANDLE_ARG(arg19);
+ FAKE_VARARG_HANDLE_ARG(arg20);
+ FAKE_VARARG_HANDLE_ARG(arg21);
+ FAKE_VARARG_HANDLE_ARG(arg22);
+ FAKE_VARARG_HANDLE_ARG(arg23);
+ FAKE_VARARG_HANDLE_ARG(arg24);
+ FAKE_VARARG_HANDLE_ARG(arg25);
+ FAKE_VARARG_HANDLE_ARG(arg26);
+ FAKE_VARARG_HANDLE_ARG(arg27);
+ FAKE_VARARG_HANDLE_ARG(arg28);
+ FAKE_VARARG_HANDLE_ARG(arg29);
+ FAKE_VARARG_HANDLE_ARG(arg30);
+ FAKE_VARARG_HANDLE_ARG(arg31);
+ FAKE_VARARG_HANDLE_ARG(arg32);
+ FAKE_VARARG_HANDLE_ARG(arg33);
+#undef FAKE_VARARG_HANDLE_ARG
+ return current_arg_index;
+ }
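A usage sketch (illustrative, with `kernel` and `d_data` assumed in scope): trailing parameters default to an empty ArgumentWrapper whose NULL pointer stops the FAKE_VARARG_HANDLE_ARG chain, so call sites read like varargs:

    cl_int d_x = 0;
    cl_int d_w = 128;
    /* Sets arguments 0..2 on the kernel and returns 3. */
    int num_args_set = kernel_set_args(kernel, 0, d_data, d_x, d_w);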
+
+ inline void release_kernel_safe(cl_kernel kernel)
+ {
+ if(kernel) {
+ clReleaseKernel(kernel);
+ }
+ }
+
+ inline void release_mem_object_safe(cl_mem mem)
+ {
+ if(mem != NULL) {
+ clReleaseMemObject(mem);
+ }
+ }
+
+ inline void release_program_safe(cl_program program)
+ {
+ if(program) {
+ clReleaseProgram(program);
+ }
+ }
+};
+
+class OpenCLDeviceMegaKernel : public OpenCLDeviceBase
+{
+public:
+ cl_kernel ckPathTraceKernel;
+ cl_program path_trace_program;
+
+ OpenCLDeviceMegaKernel(DeviceInfo& info, Stats &stats, bool background_)
+ : OpenCLDeviceBase(info, stats, background_)
+ {
+ ckPathTraceKernel = NULL;
+ path_trace_program = NULL;
+ }
+
+ bool load_kernels(const DeviceRequestedFeatures& requested_features)
+ {
+ /* Get Shader, bake and film convert kernels.
+ * It'll also do verification of OpenCL actually initialized.
+ */
+ if(!OpenCLDeviceBase::load_kernels(requested_features)) {
+ return false;
+ }
+
+ /* Try to use cached kernel. */
+ thread_scoped_lock cache_locker;
+ path_trace_program = OpenCLCache::get_program(cpPlatform,
+ cdDevice,
+ OpenCLCache::OCL_DEV_MEGAKERNEL_PROGRAM,
+ cache_locker);
+
+ if(!path_trace_program) {
+ /* Verify we have right opencl version. */
+ if(!opencl_version_check())
+ return false;
+
+ /* Calculate md5 hash to detect changes. */
+ string kernel_path = path_get("kernel");
+ string kernel_md5 = path_files_md5_hash(kernel_path);
+ string custom_kernel_build_options = "-D__COMPILE_ONLY_MEGAKERNEL__ ";
+ string device_md5 = device_md5_hash(custom_kernel_build_options);
+
+ /* Path to cached binary. */
+ string clbin = string_printf("cycles_kernel_%s_%s.clbin",
+ device_md5.c_str(),
+ kernel_md5.c_str());
+ clbin = path_user_get(path_join("cache", clbin));
+
+ /* Path to preprocessed source for debugging. */
+ string clsrc, *debug_src = NULL;
+ if(opencl_kernel_use_debug()) {
+ clsrc = string_printf("cycles_kernel_%s_%s.cl",
+ device_md5.c_str(),
+ kernel_md5.c_str());
+ clsrc = path_user_get(path_join("cache", clsrc));
+ debug_src = &clsrc;
+ }
+
+ /* If exists already, try use it. */
+ if(path_exists(clbin) && load_binary(kernel_path,
+ clbin,
+ custom_kernel_build_options,
+ &path_trace_program,
+ debug_src)) {
+ /* Kernel loaded from binary, nothing to do. */
+ }
+ else {
+ string init_kernel_source = "#include \"kernel.cl\" // " +
+ kernel_md5 + "\n";
+ /* If does not exist or loading binary failed, compile kernel. */
+ if(!compile_kernel(kernel_path,
+ init_kernel_source,
+ custom_kernel_build_options,
+ &path_trace_program,
+ debug_src))
+ {
+ return false;
+ }
+ /* Save binary for reuse. */
+ if(!save_binary(&path_trace_program, clbin)) {
+ return false;
+ }
+ }
+ /* Cache the program. */
+ OpenCLCache::store_program(cpPlatform,
+ cdDevice,
+ path_trace_program,
+ OpenCLCache::OCL_DEV_MEGAKERNEL_PROGRAM,
+ cache_locker);
+ }
+
+ /* Find kernels. */
+ ckPathTraceKernel = clCreateKernel(path_trace_program,
+ "kernel_ocl_path_trace",
+ &ciErr);
+ if(opencl_error(ciErr))
+ return false;
+ return true;
+ }
+
+ ~OpenCLDeviceMegaKernel()
+ {
+ task_pool.stop();
+ release_kernel_safe(ckPathTraceKernel);
+ release_program_safe(path_trace_program);
+ }
+
+ void path_trace(RenderTile& rtile, int sample)
+ {
+ /* Cast arguments to cl types. */
+ cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
+ cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
+ cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
+ cl_int d_x = rtile.x;
+ cl_int d_y = rtile.y;
+ cl_int d_w = rtile.w;
+ cl_int d_h = rtile.h;
+ cl_int d_offset = rtile.offset;
+ cl_int d_stride = rtile.stride;
+
+ /* Sample arguments. */
+ cl_int d_sample = sample;
+ cl_uint narg = 0;
+
+ /* TODO : Make the kernel launch similar to Cuda. */
+ KERNEL_APPEND_ARG(ckPathTraceKernel, d_data);
+ KERNEL_APPEND_ARG(ckPathTraceKernel, d_buffer);
+ KERNEL_APPEND_ARG(ckPathTraceKernel, d_rng_state);
+
+#define KERNEL_TEX(type, ttype, name) \
+ set_kernel_arg_mem(ckPathTraceKernel, &narg, #name);
+#include "kernel_textures.h"
+#undef KERNEL_TEX
+
+ KERNEL_APPEND_ARG(ckPathTraceKernel, d_sample);
+ KERNEL_APPEND_ARG(ckPathTraceKernel, d_x);
+ KERNEL_APPEND_ARG(ckPathTraceKernel, d_y);
+ KERNEL_APPEND_ARG(ckPathTraceKernel, d_w);
+ KERNEL_APPEND_ARG(ckPathTraceKernel, d_h);
+ KERNEL_APPEND_ARG(ckPathTraceKernel, d_offset);
+ KERNEL_APPEND_ARG(ckPathTraceKernel, d_stride);
+
+ enqueue_kernel(ckPathTraceKernel, d_w, d_h);
+ }
+
void thread_run(DeviceTask *task)
{
if(task->type == DeviceTask::FILM_CONVERT) {
@@ -1064,8 +1394,7 @@ public:
}
else if(task->type == DeviceTask::PATH_TRACE) {
RenderTile tile;
-
- /* keep rendering tiles until done */
+ /* Keep rendering tiles until done. */
while(task->acquire_tile(this, tile)) {
int start_sample = tile.start_sample;
int end_sample = tile.start_sample + tile.num_samples;
@@ -1083,47 +1412,1908 @@ public:
task->update_progress(&tile);
}
+ /* Complete kernel execution before releasing the tile.
+ * This helps in multi-device render: the device that reaches the
+ * critical-section function release_tile waits (stalling other
+ * devices from entering release_tile) for all of its kernels to
+ * complete. If device1 (a slow-render device) reached release_tile
+ * first, it would otherwise stall device2 (a fast-render device)
+ * from proceeding to render the next tile.
+ */
+ clFinish(cqCommandQueue);
+
task->release_tile(tile);
}
}
}
+};
- class OpenCLDeviceTask : public DeviceTask {
- public:
- OpenCLDeviceTask(OpenCLDevice *device, DeviceTask& task)
- : DeviceTask(task)
+/* TODO(sergey): This is to keep tile split on OpenCL level working
+ * for now, since without this viewport render does not work as it
+ * should.
+ *
+ * Ideally it'll be done on the higher level, but we need to get ready
+ * for merge rather soon, so let's keep split logic private here in
+ * the file.
+ */
+class SplitRenderTile : public RenderTile {
+public:
+ SplitRenderTile()
+ : RenderTile(),
+ buffer_offset_x(0),
+ buffer_offset_y(0),
+ rng_state_offset_x(0),
+ rng_state_offset_y(0),
+ buffer_rng_state_stride(0) {}
+
+ explicit SplitRenderTile(RenderTile& tile)
+ : RenderTile(),
+ buffer_offset_x(0),
+ buffer_offset_y(0),
+ rng_state_offset_x(0),
+ rng_state_offset_y(0),
+ buffer_rng_state_stride(0)
+ {
+ x = tile.x;
+ y = tile.y;
+ w = tile.w;
+ h = tile.h;
+ start_sample = tile.start_sample;
+ num_samples = tile.num_samples;
+ sample = tile.sample;
+ resolution = tile.resolution;
+ offset = tile.offset;
+ stride = tile.stride;
+ buffer = tile.buffer;
+ rng_state = tile.rng_state;
+ buffers = tile.buffers;
+ }
+
+ /* The split kernel is constrained by device global memory; hence
+ * it can't render big tile sizes in one go. If the user sets a big
+ * tile size ("big" is relative to the available device global
+ * memory), we split the tile further and then call path_trace on
+ * each of those split tiles. The following variables assist in
+ * achieving that purpose.
+ */
+ int buffer_offset_x;
+ int buffer_offset_y;
+ int rng_state_offset_x;
+ int rng_state_offset_y;
+ int buffer_rng_state_stride;
+};
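A sketch of the further-splitting walk the comment above describes (illustrative only; the buffer/rng_state offset bookkeeping is omitted, and `max_tile_size` stands in for the render-feasible size computed elsewhere):

    for(int ty = 0; ty < tile.h; ty += max_tile_size.y) {
        for(int tx = 0; tx < tile.w; tx += max_tile_size.x) {
            SplitRenderTile split(tile);
            split.x = tile.x + tx;
            split.y = tile.y + ty;
            split.w = min(max_tile_size.x, tile.w - tx);
            split.h = min(max_tile_size.y, tile.h - ty);
            path_trace(split, max_tile_size);
        }
    }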
+
+/* OpenCLDeviceSplitKernel's declaration/definition. */
+class OpenCLDeviceSplitKernel : public OpenCLDeviceBase
+{
+public:
+ /* Kernel declaration. */
+ cl_kernel ckPathTraceKernel_data_init;
+ cl_kernel ckPathTraceKernel_scene_intersect;
+ cl_kernel ckPathTraceKernel_lamp_emission;
+ cl_kernel ckPathTraceKernel_queue_enqueue;
+ cl_kernel ckPathTraceKernel_background_buffer_update;
+ cl_kernel ckPathTraceKernel_shader_lighting;
+ cl_kernel ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao;
+ cl_kernel ckPathTraceKernel_direct_lighting;
+ cl_kernel ckPathTraceKernel_shadow_blocked_direct_lighting;
+ cl_kernel ckPathTraceKernel_setup_next_iteration;
+ cl_kernel ckPathTraceKernel_sum_all_radiance;
+
+ /* cl_program declaration. */
+ cl_program data_init_program;
+ cl_program scene_intersect_program;
+ cl_program lamp_emission_program;
+ cl_program queue_enqueue_program;
+ cl_program background_buffer_update_program;
+ cl_program shader_eval_program;
+ cl_program holdout_emission_blurring_termination_ao_program;
+ cl_program direct_lighting_program;
+ cl_program shadow_blocked_program;
+ cl_program next_iteration_setup_program;
+ cl_program sum_all_radiance_program;
+
+ /* Global memory variables [porting]; this memory is used for
+ * co-operation between different kernels. Data written by one
+ * kernel will be available to another kernel via this global
+ * memory.
+ */
+ cl_mem rng_coop;
+ cl_mem throughput_coop;
+ cl_mem L_transparent_coop;
+ cl_mem PathRadiance_coop;
+ cl_mem Ray_coop;
+ cl_mem PathState_coop;
+ cl_mem Intersection_coop;
+ cl_mem kgbuffer; /* KernelGlobals buffer. */
+
+ /* Global buffers for ShaderData. */
+ cl_mem sd; /* ShaderData used in the main path-iteration loop. */
+ cl_mem sd_DL_shadow; /* ShaderData used in Direct Lighting and
+ * shadow_blocked kernel.
+ */
+
+ /* Global buffers of each member of ShaderData. */
+ cl_mem P_sd;
+ cl_mem P_sd_DL_shadow;
+ cl_mem N_sd;
+ cl_mem N_sd_DL_shadow;
+ cl_mem Ng_sd;
+ cl_mem Ng_sd_DL_shadow;
+ cl_mem I_sd;
+ cl_mem I_sd_DL_shadow;
+ cl_mem shader_sd;
+ cl_mem shader_sd_DL_shadow;
+ cl_mem flag_sd;
+ cl_mem flag_sd_DL_shadow;
+ cl_mem prim_sd;
+ cl_mem prim_sd_DL_shadow;
+ cl_mem type_sd;
+ cl_mem type_sd_DL_shadow;
+ cl_mem u_sd;
+ cl_mem u_sd_DL_shadow;
+ cl_mem v_sd;
+ cl_mem v_sd_DL_shadow;
+ cl_mem object_sd;
+ cl_mem object_sd_DL_shadow;
+ cl_mem time_sd;
+ cl_mem time_sd_DL_shadow;
+ cl_mem ray_length_sd;
+ cl_mem ray_length_sd_DL_shadow;
+ cl_mem ray_depth_sd;
+ cl_mem ray_depth_sd_DL_shadow;
+ cl_mem transparent_depth_sd;
+ cl_mem transparent_depth_sd_DL_shadow;
+#ifdef __RAY_DIFFERENTIALS__
+ cl_mem dP_sd, dI_sd;
+ cl_mem dP_sd_DL_shadow, dI_sd_DL_shadow;
+ cl_mem du_sd, dv_sd;
+ cl_mem du_sd_DL_shadow, dv_sd_DL_shadow;
+#endif
+#ifdef __DPDU__
+ cl_mem dPdu_sd, dPdv_sd;
+ cl_mem dPdu_sd_DL_shadow, dPdv_sd_DL_shadow;
+#endif
+ cl_mem closure_sd;
+ cl_mem closure_sd_DL_shadow;
+ cl_mem num_closure_sd;
+ cl_mem num_closure_sd_DL_shadow;
+ cl_mem randb_closure_sd;
+ cl_mem randb_closure_sd_DL_shadow;
+ cl_mem ray_P_sd;
+ cl_mem ray_P_sd_DL_shadow;
+ cl_mem ray_dP_sd;
+ cl_mem ray_dP_sd_DL_shadow;
+
+ /* Global memory required for shadow blocked and accum_radiance. */
+ cl_mem BSDFEval_coop;
+ cl_mem ISLamp_coop;
+ cl_mem LightRay_coop;
+ cl_mem AOAlpha_coop;
+ cl_mem AOBSDF_coop;
+ cl_mem AOLightRay_coop;
+ cl_mem Intersection_coop_AO;
+ cl_mem Intersection_coop_DL;
+
+#ifdef WITH_CYCLES_DEBUG
+ /* DebugData memory */
+ cl_mem debugdata_coop;
+#endif
+
+ /* Global state array that tracks ray state. */
+ cl_mem ray_state;
+
+ /* Per sample buffers. */
+ cl_mem per_sample_output_buffers;
+
+ /* Denotes which sample each ray is being processed for. */
+ cl_mem work_array;
+
+ /* Queue */
+ cl_mem Queue_data; /* Array of size queuesize * num_queues * sizeof(int). */
+ cl_mem Queue_index; /* Array of size num_queues * sizeof(int);
+ * Tracks the size of each queue.
+ */
+
+ /* Flag to make scene_intersect and lamp_emission kernels use queues. */
+ cl_mem use_queues_flag;
+
+ /* Required-memory size. */
+ size_t throughput_size;
+ size_t L_transparent_size;
+ size_t rayState_size;
+ size_t hostRayState_size;
+ size_t work_element_size;
+ size_t ISLamp_size;
+
+ /* Sizes of memory required for shadow blocked function. */
+ size_t AOAlpha_size;
+ size_t AOBSDF_size;
+
+ /* Amount of memory in output buffer associated with one pixel/thread. */
+ size_t per_thread_output_buffer_size;
+
+ /* Total allocatable available device memory. */
+ size_t total_allocatable_memory;
+
+ /* Host version of ray_state; used in checking host-side
+ * path-iteration termination.
+ */
+ char *hostRayStateArray;
+
+ /* Number of path-iterations to be done in one shot. */
+ unsigned int PathIteration_times;
+
+#ifdef __WORK_STEALING__
+ /* Work pool with respect to each work group. */
+ cl_mem work_pool_wgs;
+
+ /* Denotes the maximum work groups possible w.r.t. current tile size. */
+ unsigned int max_work_groups;
+#endif
+
+ /* clos_max value for which the kernels have been loaded currently. */
+ int current_clos_max;
+
+ /* Marked true in the constructor and false at the end of path_trace(). */
+ bool first_tile;
+
+ OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_)
+ : OpenCLDeviceBase(info, stats, background_)
+ {
+
+ info.use_split_kernel = true;
+ background = background_;
+
+ /* Initialize kernels. */
+ ckPathTraceKernel_data_init = NULL;
+ ckPathTraceKernel_scene_intersect = NULL;
+ ckPathTraceKernel_lamp_emission = NULL;
+ ckPathTraceKernel_background_buffer_update = NULL;
+ ckPathTraceKernel_shader_lighting = NULL;
+ ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao = NULL;
+ ckPathTraceKernel_direct_lighting = NULL;
+ ckPathTraceKernel_shadow_blocked_direct_lighting = NULL;
+ ckPathTraceKernel_setup_next_iteration = NULL;
+ ckPathTraceKernel_sum_all_radiance = NULL;
+ ckPathTraceKernel_queue_enqueue = NULL;
+
+ /* Initialize program. */
+ data_init_program = NULL;
+ scene_intersect_program = NULL;
+ lamp_emission_program = NULL;
+ queue_enqueue_program = NULL;
+ background_buffer_update_program = NULL;
+ shader_eval_program = NULL;
+ holdout_emission_blurring_termination_ao_program = NULL;
+ direct_lighting_program = NULL;
+ shadow_blocked_program = NULL;
+ next_iteration_setup_program = NULL;
+ sum_all_radiance_program = NULL;
+
+ /* Initialize cl_mem variables. */
+ kgbuffer = NULL;
+ sd = NULL;
+ sd_DL_shadow = NULL;
+
+ P_sd = NULL;
+ P_sd_DL_shadow = NULL;
+ N_sd = NULL;
+ N_sd_DL_shadow = NULL;
+ Ng_sd = NULL;
+ Ng_sd_DL_shadow = NULL;
+ I_sd = NULL;
+ I_sd_DL_shadow = NULL;
+ shader_sd = NULL;
+ shader_sd_DL_shadow = NULL;
+ flag_sd = NULL;
+ flag_sd_DL_shadow = NULL;
+ prim_sd = NULL;
+ prim_sd_DL_shadow = NULL;
+ type_sd = NULL;
+ type_sd_DL_shadow = NULL;
+ u_sd = NULL;
+ u_sd_DL_shadow = NULL;
+ v_sd = NULL;
+ v_sd_DL_shadow = NULL;
+ object_sd = NULL;
+ object_sd_DL_shadow = NULL;
+ time_sd = NULL;
+ time_sd_DL_shadow = NULL;
+ ray_length_sd = NULL;
+ ray_length_sd_DL_shadow = NULL;
+ ray_depth_sd = NULL;
+ ray_depth_sd_DL_shadow = NULL;
+ transparent_depth_sd = NULL;
+ transparent_depth_sd_DL_shadow = NULL;
+#ifdef __RAY_DIFFERENTIALS__
+ dP_sd = NULL;
+ dI_sd = NULL;
+ dP_sd_DL_shadow = NULL;
+ dI_sd_DL_shadow = NULL;
+ du_sd = NULL;
+ dv_sd = NULL;
+ du_sd_DL_shadow = NULL;
+ dv_sd_DL_shadow = NULL;
+#endif
+#ifdef __DPDU__
+ dPdu_sd = NULL;
+ dPdv_sd = NULL;
+ dPdu_sd_DL_shadow = NULL;
+ dPdv_sd_DL_shadow = NULL;
+#endif
+ closure_sd = NULL;
+ closure_sd_DL_shadow = NULL;
+ num_closure_sd = NULL;
+ num_closure_sd_DL_shadow = NULL;
+ randb_closure_sd = NULL;
+ randb_closure_sd_DL_shadow = NULL;
+ ray_P_sd = NULL;
+ ray_P_sd_DL_shadow = NULL;
+ ray_dP_sd = NULL;
+ ray_dP_sd_DL_shadow = NULL;
+
+ rng_coop = NULL;
+ throughput_coop = NULL;
+ L_transparent_coop = NULL;
+ PathRadiance_coop = NULL;
+ Ray_coop = NULL;
+ PathState_coop = NULL;
+ Intersection_coop = NULL;
+ ray_state = NULL;
+
+ AOAlpha_coop = NULL;
+ AOBSDF_coop = NULL;
+ AOLightRay_coop = NULL;
+ BSDFEval_coop = NULL;
+ ISLamp_coop = NULL;
+ LightRay_coop = NULL;
+ Intersection_coop_AO = NULL;
+ Intersection_coop_DL = NULL;
+
+#ifdef WITH_CYCLES_DEBUG
+ debugdata_coop = NULL;
+#endif
+
+ work_array = NULL;
+
+ /* Queue. */
+ Queue_data = NULL;
+ Queue_index = NULL;
+ use_queues_flag = NULL;
+
+ per_sample_output_buffers = NULL;
+
+ /* Initialize required memory size. */
+ throughput_size = sizeof(float3);
+ L_transparent_size = sizeof(float);
+ rayState_size = sizeof(char);
+ hostRayState_size = sizeof(char);
+ work_element_size = sizeof(unsigned int);
+ ISLamp_size = sizeof(int);
+
+ /* Initialize sizes of memory required for shadow blocked function. */
+ AOAlpha_size = sizeof(float3);
+ AOBSDF_size = sizeof(float3);
+
+ per_thread_output_buffer_size = 0;
+ hostRayStateArray = NULL;
+ PathIteration_times = PATH_ITER_INC_FACTOR;
+#ifdef __WORK_STEALING__
+ work_pool_wgs = NULL;
+ max_work_groups = 0;
+#endif
+ current_clos_max = -1;
+ first_tile = true;
+
+ /* Get device's maximum memory that can be allocated. */
+ ciErr = clGetDeviceInfo(cdDevice,
+ CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+ sizeof(size_t),
+ &total_allocatable_memory,
+ NULL);
+ assert(ciErr == CL_SUCCESS);
+ if(platform_name == "AMD Accelerated Parallel Processing") {
+ /* This value is tweak-able; AMD platform does not seem to
+ * give maximum performance when all of CL_DEVICE_MAX_MEM_ALLOC_SIZE
+ * is considered for further computation.
+ */
+ total_allocatable_memory /= 2;
+ }
+ }
+
+ /* TODO(sergey): Seems really close to load_kernel(),
+ * could it be de-duplicated?
+ */
+ bool load_split_kernel(string kernel_path,
+ string kernel_init_source,
+ string clbin,
+ string custom_kernel_build_options,
+ cl_program *program)
+ {
+ if(!opencl_version_check())
+ return false;
+
+ clbin = path_user_get(path_join("cache", clbin));
+
+ /* Path to preprocessed source for debugging. */
+ string *debug_src = NULL;
+
+ /* If exists already, try use it. */
+ if(path_exists(clbin) && load_binary(kernel_path,
+ clbin,
+ custom_kernel_build_options,
+ program,
+ debug_src)) {
+ /* Kernel loaded from binary. */
+ }
+ else {
+ /* If does not exist or loading binary failed, compile kernel. */
+ if(!compile_kernel(kernel_path,
+ kernel_init_source,
+ custom_kernel_build_options,
+ program))
+ {
+ return false;
+ }
+ /* Save binary for reuse. */
+ if(!save_binary(program, clbin)) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /* Split kernel utility functions. */
+ size_t get_tex_size(const char *tex_name)
+ {
+ cl_mem ptr;
+ size_t ret_size = 0;
+ MemMap::iterator i = mem_map.find(tex_name);
+ if(i != mem_map.end()) {
+ ptr = CL_MEM_PTR(i->second);
+ ciErr = clGetMemObjectInfo(ptr,
+ CL_MEM_SIZE,
+ sizeof(ret_size),
+ &ret_size,
+ NULL);
+ assert(ciErr == CL_SUCCESS);
+ }
+ return ret_size;
+ }
+
+ size_t get_shader_closure_size(int max_closure)
+ {
+ return (sizeof(ShaderClosure) * max_closure);
+ }
+
+ size_t get_shader_data_size(size_t shader_closure_size)
+ {
+ /* ShaderData size without accounting for ShaderClosure array. */
+ size_t shader_data_size =
+ sizeof(ShaderData) - (sizeof(ShaderClosure) * MAX_CLOSURE);
+ return (shader_data_size + shader_closure_size);
+ }
+
+ /* Returns size of KernelGlobals structure associated with OpenCL. */
+ size_t get_KernelGlobals_size()
+ {
+ /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
+ * fetch its size.
+ */
+ typedef struct KernelGlobals {
+ ccl_constant KernelData *data;
+#define KERNEL_TEX(type, ttype, name) \
+ ccl_global type *name;
+#include "kernel_textures.h"
+#undef KERNEL_TEX
+ } KernelGlobals;
+
+ return sizeof(KernelGlobals);
+ }
+
+ /* Returns size of the structure-of-arrays implementation of ShaderData. */
+ size_t get_shaderdata_soa_size()
+ {
+ size_t shader_soa_size = 0;
+
+#define SD_VAR(type, what) \
+ shader_soa_size += sizeof(void *);
+#define SD_CLOSURE_VAR(type, what, max_closure) \
+ shader_soa_size += sizeof(void *);
+ #include "kernel_shaderdata_vars.h"
+#undef SD_VAR
+#undef SD_CLOSURE_VAR
+
+ return shader_soa_size;
+ }
+
+ bool load_kernels(const DeviceRequestedFeatures& requested_features)
+ {
+ /* If it is an interactive render, we ceil the clos_max value to a
+ * multiple of 5 in order to limit re-compilations.
+ */
+ /* TODO(sergey): Decision about this should be done on higher levels. */
+ int max_closure = requested_features.max_closure;
+ if(!background) {
+ assert((max_closure != 0) && "clos_max value is 0" );
+ max_closure = (((max_closure - 1) / 5) + 1) * 5;
+ /* clos_max value shouldn't be greater than MAX_CLOSURE. */
+ max_closure = (max_closure > MAX_CLOSURE) ? MAX_CLOSURE : max_closure;
+ if(current_clos_max == max_closure) {
+ /* Present kernels have been created with the same closure count
+ * build option.
+ */
+ return true;
+ }
+ }
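A quick check of the ceiling arithmetic above:

    /* max_closure = 7  -> ((7 - 1) / 5 + 1) * 5 = 10
     * max_closure = 10 -> ((10 - 1) / 5 + 1) * 5 = 10
     * max_closure = 11 -> ((11 - 1) / 5 + 1) * 5 = 15 */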
+ /* Get Shader, bake and film_convert kernels.
+ * It'll also do verification of OpenCL actually initialized.
+ */
+ if(!OpenCLDeviceBase::load_kernels(requested_features)) {
+ return false;
+ }
+
+ string svm_build_options = "";
+ string max_closure_build_option = "";
+ string compute_device_type_build_option = "";
+
+ /* Set svm_build_options. */
+ svm_build_options += " -D__NODES_MAX_GROUP__=" +
+ string_printf("%d", requested_features.max_nodes_group);
+ svm_build_options += " -D__NODES_FEATURES__=" +
+ string_printf("%d", requested_features.nodes_features);
+
+ /* Set max closure build option. */
+ max_closure_build_option += string_printf("-D__MAX_CLOSURE__=%d ",
+ max_closure);
+
+ /* Set compute device build option. */
+ cl_device_type device_type;
+ ciErr = clGetDeviceInfo(cdDevice,
+ CL_DEVICE_TYPE,
+ sizeof(cl_device_type),
+ &device_type,
+ NULL);
+ assert(ciErr == CL_SUCCESS);
+ if(device_type == CL_DEVICE_TYPE_GPU) {
+ compute_device_type_build_option = "-D__COMPUTE_DEVICE_GPU__ ";
+ }
+
+ string kernel_path = path_get("kernel");
+ string kernel_md5 = path_files_md5_hash(kernel_path);
+ string device_md5;
+ string custom_kernel_build_options;
+ string kernel_init_source;
+ string clbin;
+
+ string common_custom_build_options = "";
+ common_custom_build_options += "-D__SPLIT_KERNEL__ ";
+ common_custom_build_options += max_closure_build_option;
+#ifdef __WORK_STEALING__
+ common_custom_build_options += "-D__WORK_STEALING__ ";
+#endif
+
+#define LOAD_KERNEL(program, name) \
+ do { \
+ kernel_init_source = "#include \"kernel_" name ".cl\" // " + \
+ kernel_md5 + "\n"; \
+ custom_kernel_build_options = common_custom_build_options; \
+ device_md5 = device_md5_hash(custom_kernel_build_options); \
+ clbin = string_printf("cycles_kernel_%s_%s_" name ".clbin", \
+ device_md5.c_str(), kernel_md5.c_str()); \
+ if(!load_split_kernel(kernel_path, kernel_init_source, clbin, \
+ custom_kernel_build_options, &program)) \
+ { \
+ return false; \
+ } \
+ } while(false)
+
+ /* TODO(sergey): If names are unified we can save some more bits of
+ * code here.
+ */
+ LOAD_KERNEL(data_init_program, "data_init");
+ LOAD_KERNEL(scene_intersect_program, "scene_intersect");
+ LOAD_KERNEL(lamp_emission_program, "lamp_emission");
+ LOAD_KERNEL(queue_enqueue_program, "queue_enqueue");
+ LOAD_KERNEL(background_buffer_update_program, "background_buffer_update");
+ LOAD_KERNEL(shader_eval_program, "shader_eval");
+ LOAD_KERNEL(holdout_emission_blurring_termination_ao_program,
+ "holdout_emission_blurring_pathtermination_ao");
+ LOAD_KERNEL(direct_lighting_program, "direct_lighting");
+ LOAD_KERNEL(shadow_blocked_program, "shadow_blocked");
+ LOAD_KERNEL(next_iteration_setup_program, "next_iteration_setup");
+ LOAD_KERNEL(sum_all_radiance_program, "sum_all_radiance");
+
+#undef LOAD_KERNEL
+
+#define GLUE(a, b) a ## b
+#define FIND_KERNEL(kernel, program, function) \
+ do { \
+ GLUE(ckPathTraceKernel_, kernel) = \
+ clCreateKernel(GLUE(program, _program), \
+ "kernel_ocl_path_trace_" function, &ciErr); \
+ if(opencl_error(ciErr)) { \
+ return false; \
+ } \
+ } while(false)
+
+ FIND_KERNEL(data_init, data_init, "data_initialization");
+ FIND_KERNEL(scene_intersect, scene_intersect, "scene_intersect");
+ FIND_KERNEL(lamp_emission, lamp_emission, "lamp_emission");
+ FIND_KERNEL(queue_enqueue, queue_enqueue, "queue_enqueue");
+ FIND_KERNEL(background_buffer_update, background_buffer_update, "background_buffer_update");
+ FIND_KERNEL(shader_lighting, shader_eval, "shader_evaluation");
+ FIND_KERNEL(holdout_emission_blurring_pathtermination_ao,
+ holdout_emission_blurring_termination_ao,
+ "holdout_emission_blurring_pathtermination_ao");
+ FIND_KERNEL(direct_lighting, direct_lighting, "direct_lighting");
+ FIND_KERNEL(shadow_blocked_direct_lighting, shadow_blocked, "shadow_blocked_direct_lighting");
+ FIND_KERNEL(setup_next_iteration, next_iteration_setup, "setup_next_iteration");
+ FIND_KERNEL(sum_all_radiance, sum_all_radiance, "sum_all_radiance");
+#undef FIND_KERNEL
+#undef GLUE
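For reference, the token pasting above expands deterministically; the first FIND_KERNEL invocation becomes:

    ckPathTraceKernel_data_init =
        clCreateKernel(data_init_program,
                       "kernel_ocl_path_trace_data_initialization", &ciErr);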
+
+ current_clos_max = max_closure;
+
+ return true;
+ }
+
+ ~OpenCLDeviceSplitKernel()
+ {
+ task_pool.stop();
+
+ /* Release kernels */
+ release_kernel_safe(ckPathTraceKernel_data_init);
+ release_kernel_safe(ckPathTraceKernel_scene_intersect);
+ release_kernel_safe(ckPathTraceKernel_lamp_emission);
+ release_kernel_safe(ckPathTraceKernel_queue_enqueue);
+ release_kernel_safe(ckPathTraceKernel_background_buffer_update);
+ release_kernel_safe(ckPathTraceKernel_shader_lighting);
+ release_kernel_safe(ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao);
+ release_kernel_safe(ckPathTraceKernel_direct_lighting);
+ release_kernel_safe(ckPathTraceKernel_shadow_blocked_direct_lighting);
+ release_kernel_safe(ckPathTraceKernel_setup_next_iteration);
+ release_kernel_safe(ckPathTraceKernel_sum_all_radiance);
+
+ /* Release global memory */
+ release_mem_object_safe(P_sd);
+ release_mem_object_safe(P_sd_DL_shadow);
+ release_mem_object_safe(N_sd);
+ release_mem_object_safe(N_sd_DL_shadow);
+ release_mem_object_safe(Ng_sd);
+ release_mem_object_safe(Ng_sd_DL_shadow);
+ release_mem_object_safe(I_sd);
+ release_mem_object_safe(I_sd_DL_shadow);
+ release_mem_object_safe(shader_sd);
+ release_mem_object_safe(shader_sd_DL_shadow);
+ release_mem_object_safe(flag_sd);
+ release_mem_object_safe(flag_sd_DL_shadow);
+ release_mem_object_safe(prim_sd);
+ release_mem_object_safe(prim_sd_DL_shadow);
+ release_mem_object_safe(type_sd);
+ release_mem_object_safe(type_sd_DL_shadow);
+ release_mem_object_safe(u_sd);
+ release_mem_object_safe(u_sd_DL_shadow);
+ release_mem_object_safe(v_sd);
+ release_mem_object_safe(v_sd_DL_shadow);
+ release_mem_object_safe(object_sd);
+ release_mem_object_safe(object_sd_DL_shadow);
+ release_mem_object_safe(time_sd);
+ release_mem_object_safe(time_sd_DL_shadow);
+ release_mem_object_safe(ray_length_sd);
+ release_mem_object_safe(ray_length_sd_DL_shadow);
+ release_mem_object_safe(ray_depth_sd);
+ release_mem_object_safe(ray_depth_sd_DL_shadow);
+ release_mem_object_safe(transparent_depth_sd);
+ release_mem_object_safe(transparent_depth_sd_DL_shadow);
+#ifdef __RAY_DIFFERENTIALS__
+ release_mem_object_safe(dP_sd);
+ release_mem_object_safe(dP_sd_DL_shadow);
+ release_mem_object_safe(dI_sd);
+ release_mem_object_safe(dI_sd_DL_shadow);
+ release_mem_object_safe(du_sd);
+ release_mem_object_safe(du_sd_DL_shadow);
+ release_mem_object_safe(dv_sd);
+ release_mem_object_safe(dv_sd_DL_shadow);
+#endif
+#ifdef __DPDU__
+ release_mem_object_safe(dPdu_sd);
+ release_mem_object_safe(dPdu_sd_DL_shadow);
+ release_mem_object_safe(dPdv_sd);
+ release_mem_object_safe(dPdv_sd_DL_shadow);
+#endif
+ release_mem_object_safe(closure_sd);
+ release_mem_object_safe(closure_sd_DL_shadow);
+ release_mem_object_safe(num_closure_sd);
+ release_mem_object_safe(num_closure_sd_DL_shadow);
+ release_mem_object_safe(randb_closure_sd);
+ release_mem_object_safe(randb_closure_sd_DL_shadow);
+ release_mem_object_safe(ray_P_sd);
+ release_mem_object_safe(ray_P_sd_DL_shadow);
+ release_mem_object_safe(ray_dP_sd);
+ release_mem_object_safe(ray_dP_sd_DL_shadow);
+ release_mem_object_safe(rng_coop);
+ release_mem_object_safe(throughput_coop);
+ release_mem_object_safe(L_transparent_coop);
+ release_mem_object_safe(PathRadiance_coop);
+ release_mem_object_safe(Ray_coop);
+ release_mem_object_safe(PathState_coop);
+ release_mem_object_safe(Intersection_coop);
+ release_mem_object_safe(kgbuffer);
+ release_mem_object_safe(sd);
+ release_mem_object_safe(sd_DL_shadow);
+ release_mem_object_safe(ray_state);
+ release_mem_object_safe(AOAlpha_coop);
+ release_mem_object_safe(AOBSDF_coop);
+ release_mem_object_safe(AOLightRay_coop);
+ release_mem_object_safe(BSDFEval_coop);
+ release_mem_object_safe(ISLamp_coop);
+ release_mem_object_safe(LightRay_coop);
+ release_mem_object_safe(Intersection_coop_AO);
+ release_mem_object_safe(Intersection_coop_DL);
+#ifdef WITH_CYCLES_DEBUG
+ release_mem_object_safe(debugdata_coop);
+#endif
+ release_mem_object_safe(use_queues_flag);
+ release_mem_object_safe(Queue_data);
+ release_mem_object_safe(Queue_index);
+ release_mem_object_safe(work_array);
+#ifdef __WORK_STEALING__
+ release_mem_object_safe(work_pool_wgs);
+#endif
+ release_mem_object_safe(per_sample_output_buffers);
+
+ /* Release programs */
+ release_program_safe(data_init_program);
+ release_program_safe(scene_intersect_program);
+ release_program_safe(lamp_emission_program);
+ release_program_safe(queue_enqueue_program);
+ release_program_safe(background_buffer_update_program);
+ release_program_safe(shader_eval_program);
+ release_program_safe(holdout_emission_blurring_termination_ao_program);
+ release_program_safe(direct_lighting_program);
+ release_program_safe(shadow_blocked_program);
+ release_program_safe(next_iteration_setup_program);
+ release_program_safe(sum_all_radiance_program);
+
+ if(hostRayStateArray != NULL) {
+ free(hostRayStateArray);
+ }
+ }
+
+ void path_trace(SplitRenderTile& rtile, int2 max_render_feasible_tile_size)
+ {
+ /* cast arguments to cl types */
+ cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
+ cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
+ cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
+ cl_int d_x = rtile.x;
+ cl_int d_y = rtile.y;
+ cl_int d_w = rtile.w;
+ cl_int d_h = rtile.h;
+ cl_int d_offset = rtile.offset;
+ cl_int d_stride = rtile.stride;
+
+ /* Make sure that set render feasible tile size is a multiple of local
+ * work size dimensions.
+ */
+ assert(max_render_feasible_tile_size.x % SPLIT_KERNEL_LOCAL_SIZE_X == 0);
+ assert(max_render_feasible_tile_size.y % SPLIT_KERNEL_LOCAL_SIZE_Y == 0);
+
+ /* ray_state and hostRayStateArray should be of same size. */
+ assert(hostRayState_size == rayState_size);
+ assert(rayState_size == 1);
+
+ size_t global_size[2];
+ size_t local_size[2] = {SPLIT_KERNEL_LOCAL_SIZE_X,
+ SPLIT_KERNEL_LOCAL_SIZE_Y};
+
+ /* Set the range of samples to be processed for every ray in
+ * path-regeneration logic.
+ */
+ cl_int start_sample = rtile.start_sample;
+ cl_int end_sample = rtile.start_sample + rtile.num_samples;
+ cl_int num_samples = rtile.num_samples;
+
+#ifdef __WORK_STEALING__
+ global_size[0] = (((d_w - 1) / local_size[0]) + 1) * local_size[0];
+ global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
+ unsigned int num_parallel_samples = 1;
+#else
+ global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
+ unsigned int num_threads = max_render_feasible_tile_size.x *
+ max_render_feasible_tile_size.y;
+ unsigned int num_tile_columns_possible = num_threads / global_size[1];
+ /* Estimate number of parallel samples that can be
+ * processed in parallel.
+ */
+ unsigned int num_parallel_samples = min(num_tile_columns_possible / d_w,
+ rtile.num_samples);
+ /* Wavefront size in AMD is 64.
+ * TODO(sergey): What about other platforms?
+ */
+ if(num_parallel_samples >= 64) {
+ /* TODO(sergey): Could use generic round-up here. */
+ num_parallel_samples = (num_parallel_samples / 64) * 64;
+ }
+ assert(num_parallel_samples != 0);
+
+ global_size[0] = d_w * num_parallel_samples;
+#endif /* __WORK_STEALING__ */
+
+ assert(global_size[0] * global_size[1] <=
+ max_render_feasible_tile_size.x * max_render_feasible_tile_size.y);
+
+ /* Allocate all required global memory once. */
+ if(first_tile) {
+ size_t num_global_elements = max_render_feasible_tile_size.x *
+ max_render_feasible_tile_size.y;
+ /* TODO(sergey): This will actually over-allocate if a
+ * particular kernel does not support multiclosure.
+ */
+ size_t ShaderClosure_size = get_shader_closure_size(current_clos_max);
+
+#ifdef __WORK_STEALING__
+ /* Calculate max groups */
+ size_t max_global_size[2];
+ size_t tile_x = max_render_feasible_tile_size.x;
+ size_t tile_y = max_render_feasible_tile_size.y;
+ max_global_size[0] = (((tile_x - 1) / local_size[0]) + 1) * local_size[0];
+ max_global_size[1] = (((tile_y - 1) / local_size[1]) + 1) * local_size[1];
+ max_work_groups = (max_global_size[0] * max_global_size[1]) /
+ (local_size[0] * local_size[1]);
+ /* Allocate work_pool_wgs memory. */
+ work_pool_wgs = mem_alloc(max_work_groups * sizeof(unsigned int));
+#endif /* __WORK_STEALING__ */
+
+ /* Allocate queue_index memory only once. */
+ Queue_index = mem_alloc(NUM_QUEUES * sizeof(int));
+ use_queues_flag = mem_alloc(sizeof(char));
+ kgbuffer = mem_alloc(get_KernelGlobals_size());
+
+ /* Create global buffers for ShaderData. */
+ sd = mem_alloc(get_shaderdata_soa_size());
+ sd_DL_shadow = mem_alloc(get_shaderdata_soa_size());
+ P_sd = mem_alloc(num_global_elements * sizeof(float3));
+ P_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
+ N_sd = mem_alloc(num_global_elements * sizeof(float3));
+ N_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
+ Ng_sd = mem_alloc(num_global_elements * sizeof(float3));
+ Ng_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
+ I_sd = mem_alloc(num_global_elements * sizeof(float3));
+ I_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
+ shader_sd = mem_alloc(num_global_elements * sizeof(int));
+ shader_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
+ flag_sd = mem_alloc(num_global_elements * sizeof(int));
+ flag_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
+ prim_sd = mem_alloc(num_global_elements * sizeof(int));
+ prim_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
+ type_sd = mem_alloc(num_global_elements * sizeof(int));
+ type_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
+ u_sd = mem_alloc(num_global_elements * sizeof(float));
+ u_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
+ v_sd = mem_alloc(num_global_elements * sizeof(float));
+ v_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
+ object_sd = mem_alloc(num_global_elements * sizeof(int));
+ object_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
+ time_sd = mem_alloc(num_global_elements * sizeof(float));
+ time_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
+ ray_length_sd = mem_alloc(num_global_elements * sizeof(float));
+ ray_length_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
+ ray_depth_sd = mem_alloc(num_global_elements * sizeof(int));
+ ray_depth_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
+ transparent_depth_sd = mem_alloc(num_global_elements * sizeof(int));
+ transparent_depth_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
+
+#ifdef __RAY_DIFFERENTIALS__
+ dP_sd = mem_alloc(num_global_elements * sizeof(differential3));
+ dP_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential3));
+ dI_sd = mem_alloc(num_global_elements * sizeof(differential3));
+ dI_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential3));
+ du_sd = mem_alloc(num_global_elements * sizeof(differential));
+ du_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential));
+ dv_sd = mem_alloc(num_global_elements * sizeof(differential));
+ dv_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential));
+#endif
+
+#ifdef __DPDU__
+ dPdu_sd = mem_alloc(num_global_elements * sizeof(float3));
+ dPdu_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
+ dPdv_sd = mem_alloc(num_global_elements * sizeof(float3));
+ dPdv_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
+#endif
+ closure_sd = mem_alloc(num_global_elements * ShaderClosure_size);
+ closure_sd_DL_shadow = mem_alloc(num_global_elements * 2 * ShaderClosure_size);
+ num_closure_sd = mem_alloc(num_global_elements * sizeof(int));
+ num_closure_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
+ randb_closure_sd = mem_alloc(num_global_elements * sizeof(float));
+ randb_closure_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
+ ray_P_sd = mem_alloc(num_global_elements * sizeof(float3));
+ ray_P_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
+ ray_dP_sd = mem_alloc(num_global_elements * sizeof(differential3));
+ ray_dP_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential3));
+
+ /* Creation of global memory buffers which are shared among
+ * the kernels.
+ */
+ rng_coop = mem_alloc(num_global_elements * sizeof(RNG));
+ throughput_coop = mem_alloc(num_global_elements * throughput_size);
+ L_transparent_coop = mem_alloc(num_global_elements * L_transparent_size);
+ PathRadiance_coop = mem_alloc(num_global_elements * sizeof(PathRadiance));
+ Ray_coop = mem_alloc(num_global_elements * sizeof(Ray));
+ PathState_coop = mem_alloc(num_global_elements * sizeof(PathState));
+ Intersection_coop = mem_alloc(num_global_elements * sizeof(Intersection));
+ AOAlpha_coop = mem_alloc(num_global_elements * AOAlpha_size);
+ AOBSDF_coop = mem_alloc(num_global_elements * AOBSDF_size);
+ AOLightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
+ BSDFEval_coop = mem_alloc(num_global_elements * sizeof(BsdfEval));
+ ISLamp_coop = mem_alloc(num_global_elements * ISLamp_size);
+ LightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
+ Intersection_coop_AO = mem_alloc(num_global_elements * sizeof(Intersection));
+ Intersection_coop_DL = mem_alloc(num_global_elements * sizeof(Intersection));
+
+#ifdef WITH_CYCLES_DEBUG
+ debugdata_coop = mem_alloc(num_global_elements * sizeof(DebugData));
+#endif
+
+ ray_state = mem_alloc(num_global_elements * rayState_size);
+
+ hostRayStateArray = (char *)calloc(num_global_elements, hostRayState_size);
+ assert(hostRayStateArray != NULL && "Can't create hostRayStateArray memory");
+
+ Queue_data = mem_alloc(num_global_elements * (NUM_QUEUES * sizeof(int)+sizeof(int)));
+ work_array = mem_alloc(num_global_elements * work_element_size);
+ per_sample_output_buffers = mem_alloc(num_global_elements *
+ per_thread_output_buffer_size);
+ }
+
+ cl_int dQueue_size = global_size[0] * global_size[1];
+ cl_int total_num_rays = global_size[0] * global_size[1];
+
+ cl_uint start_arg_index =
+ kernel_set_args(ckPathTraceKernel_data_init,
+ 0,
+ kgbuffer,
+ sd,
+ sd_DL_shadow,
+ P_sd,
+ P_sd_DL_shadow,
+ N_sd,
+ N_sd_DL_shadow,
+ Ng_sd,
+ Ng_sd_DL_shadow,
+ I_sd,
+ I_sd_DL_shadow,
+ shader_sd,
+ shader_sd_DL_shadow,
+ flag_sd,
+ flag_sd_DL_shadow,
+ prim_sd,
+ prim_sd_DL_shadow,
+ type_sd,
+ type_sd_DL_shadow,
+ u_sd,
+ u_sd_DL_shadow,
+ v_sd,
+ v_sd_DL_shadow,
+ object_sd,
+ object_sd_DL_shadow,
+ time_sd,
+ time_sd_DL_shadow,
+ ray_length_sd,
+ ray_length_sd_DL_shadow,
+ ray_depth_sd,
+ ray_depth_sd_DL_shadow,
+ transparent_depth_sd,
+ transparent_depth_sd_DL_shadow);
+
+ start_arg_index +=
+ kernel_set_args(ckPathTraceKernel_data_init,
+#ifdef __RAY_DIFFERENTIALS__
+ start_arg_index,
+ dP_sd,
+ dP_sd_DL_shadow,
+ dI_sd,
+ dI_sd_DL_shadow,
+ du_sd,
+ du_sd_DL_shadow,
+ dv_sd,
+ dv_sd_DL_shadow,
+#endif
+#ifdef __DPDU__
+ dPdu_sd,
+ dPdu_sd_DL_shadow,
+ dPdv_sd,
+ dPdv_sd_DL_shadow,
+#endif
+ closure_sd,
+ closure_sd_DL_shadow,
+ num_closure_sd,
+ num_closure_sd_DL_shadow,
+ randb_closure_sd,
+ randb_closure_sd_DL_shadow,
+ ray_P_sd,
+ ray_P_sd_DL_shadow,
+ ray_dP_sd,
+ ray_dP_sd_DL_shadow,
+ d_data,
+ per_sample_output_buffers,
+ d_rng_state,
+ rng_coop,
+ throughput_coop,
+ L_transparent_coop,
+ PathRadiance_coop,
+ Ray_coop,
+ PathState_coop,
+ ray_state);
+
+/* TODO(sergey): Avoid map lookup here. */
+#define KERNEL_TEX(type, ttype, name) \
+ set_kernel_arg_mem(ckPathTraceKernel_data_init, &start_arg_index, #name);
+#include "kernel_textures.h"
+#undef KERNEL_TEX
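This is the classic X-macro pattern: kernel_textures.h is nothing but a list of KERNEL_TEX(type, ttype, name) entries, so re-including it under a local definition of KERNEL_TEX stamps out one set_kernel_arg_mem() call per texture without listing the textures twice. A minimal, self-contained illustration of the pattern (the entry names and the TEXTURE_LIST macro below are invented for the example; Cycles keeps the list in a header instead):

	#include <cstdio>

	/* The list of entries lives in exactly one place. */
	#define TEXTURE_LIST \
		KERNEL_TEX(float, __tri_verts) \
		KERNEL_TEX(int, __prim_type)

	/* Expanding the list once turns each entry into a statement. */
	#define KERNEL_TEX(type, name) printf("bind %s\n", #name);

	int main()
	{
		TEXTURE_LIST  /* prints "bind __tri_verts" and "bind __prim_type" */
		return 0;
	}
	#undef KERNEL_TEX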
+
+ start_arg_index +=
+ kernel_set_args(ckPathTraceKernel_data_init,
+ start_arg_index,
+ start_sample,
+ d_x,
+ d_y,
+ d_w,
+ d_h,
+ d_offset,
+ d_stride,
+ rtile.rng_state_offset_x,
+ rtile.rng_state_offset_y,
+ rtile.buffer_rng_state_stride,
+ Queue_data,
+ Queue_index,
+ dQueue_size,
+ use_queues_flag,
+ work_array,
+#ifdef __WORK_STEALING__
+ work_pool_wgs,
+ num_samples,
+#endif
+#ifdef WITH_CYCLES_DEBUG
+ debugdata_coop,
+#endif
+ num_parallel_samples);
+
+ kernel_set_args(ckPathTraceKernel_scene_intersect,
+ 0,
+ kgbuffer,
+ d_data,
+ rng_coop,
+ Ray_coop,
+ PathState_coop,
+ Intersection_coop,
+ ray_state,
+ d_w,
+ d_h,
+ Queue_data,
+ Queue_index,
+ dQueue_size,
+ use_queues_flag,
+#ifdef WITH_CYCLES_DEBUG
+ debugdata_coop,
+#endif
+ num_parallel_samples);
+
+ kernel_set_args(ckPathTraceKernel_lamp_emission,
+ 0,
+ kgbuffer,
+ d_data,
+ sd,
+ throughput_coop,
+ PathRadiance_coop,
+ Ray_coop,
+ PathState_coop,
+ Intersection_coop,
+ ray_state,
+ d_w,
+ d_h,
+ Queue_data,
+ Queue_index,
+ dQueue_size,
+ use_queues_flag,
+ num_parallel_samples);
+
+ kernel_set_args(ckPathTraceKernel_queue_enqueue,
+ 0,
+ Queue_data,
+ Queue_index,
+ ray_state,
+ dQueue_size);
+
+ kernel_set_args(ckPathTraceKernel_background_buffer_update,
+ 0,
+ kgbuffer,
+ d_data,
+ sd,
+ per_sample_output_buffers,
+ d_rng_state,
+ rng_coop,
+ throughput_coop,
+ PathRadiance_coop,
+ Ray_coop,
+ PathState_coop,
+ L_transparent_coop,
+ ray_state,
+ d_w,
+ d_h,
+ d_x,
+ d_y,
+ d_stride,
+ rtile.rng_state_offset_x,
+ rtile.rng_state_offset_y,
+ rtile.buffer_rng_state_stride,
+ work_array,
+ Queue_data,
+ Queue_index,
+ dQueue_size,
+ end_sample,
+ start_sample,
+#ifdef __WORK_STEALING__
+ work_pool_wgs,
+ num_samples,
+#endif
+#ifdef WITH_CYCLES_DEBUG
+ debugdata_coop,
+#endif
+ num_parallel_samples);
+
+ kernel_set_args(ckPathTraceKernel_shader_lighting,
+ 0,
+ kgbuffer,
+ d_data,
+ sd,
+ rng_coop,
+ Ray_coop,
+ PathState_coop,
+ Intersection_coop,
+ ray_state,
+ Queue_data,
+ Queue_index,
+ dQueue_size);
+
+ kernel_set_args(ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao,
+ 0,
+ kgbuffer,
+ d_data,
+ sd,
+ per_sample_output_buffers,
+ rng_coop,
+ throughput_coop,
+ L_transparent_coop,
+ PathRadiance_coop,
+ PathState_coop,
+ Intersection_coop,
+ AOAlpha_coop,
+ AOBSDF_coop,
+ AOLightRay_coop,
+ d_w,
+ d_h,
+ d_x,
+ d_y,
+ d_stride,
+ ray_state,
+ work_array,
+ Queue_data,
+ Queue_index,
+ dQueue_size,
+#ifdef __WORK_STEALING__
+ start_sample,
+#endif
+ num_parallel_samples);
+
+ kernel_set_args(ckPathTraceKernel_direct_lighting,
+ 0,
+ kgbuffer,
+ d_data,
+ sd,
+ sd_DL_shadow,
+ rng_coop,
+ PathState_coop,
+ ISLamp_coop,
+ LightRay_coop,
+ BSDFEval_coop,
+ ray_state,
+ Queue_data,
+ Queue_index,
+ dQueue_size);
+
+ kernel_set_args(ckPathTraceKernel_shadow_blocked_direct_lighting,
+ 0,
+ kgbuffer,
+ d_data,
+ sd_DL_shadow,
+ PathState_coop,
+ LightRay_coop,
+ AOLightRay_coop,
+ Intersection_coop_AO,
+ Intersection_coop_DL,
+ ray_state,
+ Queue_data,
+ Queue_index,
+ dQueue_size,
+ total_num_rays);
+
+ kernel_set_args(ckPathTraceKernel_setup_next_iteration,
+ 0,
+ kgbuffer,
+ d_data,
+ sd,
+ rng_coop,
+ throughput_coop,
+ PathRadiance_coop,
+ Ray_coop,
+ PathState_coop,
+ LightRay_coop,
+ ISLamp_coop,
+ BSDFEval_coop,
+ AOLightRay_coop,
+ AOBSDF_coop,
+ AOAlpha_coop,
+ ray_state,
+ Queue_data,
+ Queue_index,
+ dQueue_size,
+ use_queues_flag);
+
+ kernel_set_args(ckPathTraceKernel_sum_all_radiance,
+ 0,
+ d_data,
+ d_buffer,
+ per_sample_output_buffers,
+ num_parallel_samples,
+ d_w,
+ d_h,
+ d_stride,
+ rtile.buffer_offset_x,
+ rtile.buffer_offset_y,
+ rtile.buffer_rng_state_stride,
+ start_sample);
+
+ /* Macro for enqueuing split kernels. */
+#define GLUE(a, b) a ## b
+#define ENQUEUE_SPLIT_KERNEL(kernelName, globalSize, localSize) \
+ opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, \
+ GLUE(ckPathTraceKernel_, \
+ kernelName), \
+ 2, \
+ NULL, \
+ globalSize, \
+ localSize, \
+ 0, \
+ NULL, \
+ NULL))
+
+ /* Enqueue ckPathTraceKernel_data_init kernel. */
+ ENQUEUE_SPLIT_KERNEL(data_init, global_size, local_size);
+ bool activeRaysAvailable = true;
+
+ /* Record the number of times host intervention has been made. */
+ unsigned int numHostIntervention = 0;
+ unsigned int numNextPathIterTimes = PathIteration_times;
+ while(activeRaysAvailable) {
+ /* Twice the global work size of other kernels for
+ * ckPathTraceKernel_shadow_blocked_direct_lighting. */
+ size_t global_size_shadow_blocked[2];
+ global_size_shadow_blocked[0] = global_size[0] * 2;
+ global_size_shadow_blocked[1] = global_size[1];
+
+ /* Do path-iteration on the host [Enqueue path-iteration kernels]. */
+ for(int PathIter = 0; PathIter < PathIteration_times; PathIter++) {
+ ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(background_buffer_update, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(shader_lighting, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(shadow_blocked_direct_lighting, global_size_shadow_blocked, local_size);
+ ENQUEUE_SPLIT_KERNEL(setup_next_iteration, global_size, local_size);
+ }
+
+ /* Read ray state into host memory to decide whether we should exit
+ * path iteration on the host.
+ */
+ ciErr = clEnqueueReadBuffer(cqCommandQueue,
+ ray_state,
+ CL_TRUE,
+ 0,
+ global_size[0] * global_size[1] * sizeof(char),
+ hostRayStateArray,
+ 0,
+ NULL,
+ NULL);
+ assert(ciErr == CL_SUCCESS);
+
+ activeRaysAvailable = false;
+
+ for(int rayStateIter = 0;
+ rayStateIter < global_size[0] * global_size[1];
+ ++rayStateIter)
+ {
+ if(int8_t(hostRayStateArray[rayStateIter]) != RAY_INACTIVE) {
+ /* Not all rays are RAY_INACTIVE. */
+ activeRaysAvailable = true;
+ break;
+ }
+ }
+
+ if(activeRaysAvailable) {
+ numHostIntervention++;
+ PathIteration_times = PATH_ITER_INC_FACTOR;
+ /* Host intervention was done before all rays became RAY_INACTIVE;
+ * do more initial iterations for the next tile.
+ */
+ numNextPathIterTimes += PATH_ITER_INC_FACTOR;
+ }
+ }
+
+ /* Execute the sum_all_radiance kernel to accumulate the radiance
+ * computed in per_sample_output_buffers into the RenderTile's
+ * output buffer.
+ */
+ size_t sum_all_radiance_local_size[2] = {16, 16};
+ size_t sum_all_radiance_global_size[2];
+ sum_all_radiance_global_size[0] =
+ (((d_w - 1) / sum_all_radiance_local_size[0]) + 1) *
+ sum_all_radiance_local_size[0];
+ sum_all_radiance_global_size[1] =
+ (((d_h - 1) / sum_all_radiance_local_size[1]) + 1) *
+ sum_all_radiance_local_size[1];
+ ENQUEUE_SPLIT_KERNEL(sum_all_radiance,
+ sum_all_radiance_global_size,
+ sum_all_radiance_local_size);
+
+#undef ENQUEUE_SPLIT_KERNEL
+#undef GLUE
+
+ if(numHostIntervention == 0) {
+ /* This means that we are executing the kernel more times than
+ * required; avoid this for the next sample/tile.
+ */
+ PathIteration_times = ((numNextPathIterTimes - PATH_ITER_INC_FACTOR) <= 0) ?
+ PATH_ITER_INC_FACTOR : numNextPathIterTimes - PATH_ITER_INC_FACTOR;
+ }
+ else {
+ /* The number of path-iterations done for this tile is used as the
+ * initial path-iteration count for the next tile.
+ */
+ PathIteration_times = numNextPathIterTimes;
+ }
+
+ first_tile = false;
+ }
+
+ /* Calculates the amount of memory that always has to be
+ * allocated in order for the split kernel to function.
+ * This memory is tile/scene-property invariant (meaning, the
+ * value returned by this function does not depend on the
+ * user-set tile size or scene properties).
+ */
+ size_t get_invariable_mem_allocated()
+ {
+ size_t total_invariable_mem_allocated = 0;
+ size_t KernelGlobals_size = 0;
+ size_t ShaderData_SOA_size = 0;
+
+ KernelGlobals_size = get_KernelGlobals_size();
+ ShaderData_SOA_size = get_shaderdata_soa_size();
+
+ total_invariable_mem_allocated += KernelGlobals_size; /* KernelGlobals size */
+ total_invariable_mem_allocated += NUM_QUEUES * sizeof(unsigned int); /* Queue index size */
+ total_invariable_mem_allocated += sizeof(char); /* use_queues_flag size */
+ total_invariable_mem_allocated += ShaderData_SOA_size; /* sd size */
+ total_invariable_mem_allocated += ShaderData_SOA_size; /* sd_DL_shadow size */
+
+ return total_invariable_mem_allocated;
+ }
+
+ /* Calculate the tile-specific memory that has to be / has been
+ * allocated for the split kernel to function.
+ */
+ size_t get_tile_specific_mem_allocated(const int2 tile_size)
+ {
+ size_t tile_specific_mem_allocated = 0;
+
+ /* Get required tile info */
+ unsigned int user_set_tile_w = tile_size.x;
+ unsigned int user_set_tile_h = tile_size.y;
+
+#ifdef __WORK_STEALING__
+ /* Calculate memory to be allocated for work_pools in
+ * case of work_stealing.
+ */
+ size_t max_global_size[2];
+ size_t max_num_work_pools = 0;
+ max_global_size[0] =
+ (((user_set_tile_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
+ SPLIT_KERNEL_LOCAL_SIZE_X;
+ max_global_size[1] =
+ (((user_set_tile_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
+ SPLIT_KERNEL_LOCAL_SIZE_Y;
+ max_num_work_pools =
+ (max_global_size[0] * max_global_size[1]) /
+ (SPLIT_KERNEL_LOCAL_SIZE_X * SPLIT_KERNEL_LOCAL_SIZE_Y);
+ tile_specific_mem_allocated += max_num_work_pools * sizeof(unsigned int);
+#endif
+
+ tile_specific_mem_allocated +=
+ user_set_tile_w * user_set_tile_h * per_thread_output_buffer_size;
+ tile_specific_mem_allocated +=
+ user_set_tile_w * user_set_tile_h * sizeof(RNG);
+
+ return tile_specific_mem_allocated;
+ }
+
+ /* Calculates the texture memories and KernelData (d_data) memory
+ * that has been allocated.
+ */
+ size_t get_scene_specific_mem_allocated(cl_mem d_data)
+ {
+ size_t scene_specific_mem_allocated = 0;
+ /* Calculate texture memories. */
+#define KERNEL_TEX(type, ttype, name) \
+ scene_specific_mem_allocated += get_tex_size(#name);
+#include "kernel_textures.h"
+#undef KERNEL_TEX
+ size_t d_data_size;
+ ciErr = clGetMemObjectInfo(d_data,
+ CL_MEM_SIZE,
+ sizeof(d_data_size),
+ &d_data_size,
+ NULL);
+ assert(ciErr == CL_SUCCESS && "Can't get d_data mem object info");
+ scene_specific_mem_allocated += d_data_size;
+ return scene_specific_mem_allocated;
+ }
+
+ /* Calculate the memory required for one thread in split kernel. */
+ size_t get_per_thread_memory()
+ {
+ size_t shader_closure_size = 0;
+ size_t shaderdata_volume = 0;
+ shader_closure_size = get_shader_closure_size(current_clos_max);
+ /* TODO(sergey): This will actually over-allocate if a
+ * particular kernel does not support multiclosure.
+ */
+ shaderdata_volume = get_shader_data_size(shader_closure_size);
+ size_t retval = sizeof(RNG)
+ + throughput_size + L_transparent_size
+ + rayState_size + work_element_size
+ + ISLamp_size + sizeof(PathRadiance) + sizeof(Ray) + sizeof(PathState)
+ + sizeof(Intersection) /* Overall isect */
+ + sizeof(Intersection) /* Intersection_coop_AO */
+ + sizeof(Intersection) /* Intersection_coop_DL */
+ + shaderdata_volume /* Overall ShaderData */
+ + (shaderdata_volume * 2) /* ShaderData : DL and shadow */
+ + sizeof(Ray) + sizeof(BsdfEval) + AOAlpha_size + AOBSDF_size + sizeof(Ray)
+ + (sizeof(int)* NUM_QUEUES)
+ + per_thread_output_buffer_size;
+ return retval;
+ }
+
+ /* Considers the total memory available in the device
+ * and returns the maximum global work size possible.
+ */
+ size_t get_feasible_global_work_size(int2 tile_size, cl_mem d_data)
+ {
+ /* Calculate invariably allocated memory. */
+ size_t invariable_mem_allocated = get_invariable_mem_allocated();
+ /* Calculate tile specific allocated memory. */
+ size_t tile_specific_mem_allocated =
+ get_tile_specific_mem_allocated(tile_size);
+ /* Calculate scene specific allocated memory. */
+ size_t scene_specific_mem_allocated =
+ get_scene_specific_mem_allocated(d_data);
+ /* Calculate total memory available for the threads in global work size. */
+ size_t available_memory = total_allocatable_memory
+ - invariable_mem_allocated
+ - tile_specific_mem_allocated
+ - scene_specific_mem_allocated
+ - DATA_ALLOCATION_MEM_FACTOR;
+ size_t per_thread_memory_required = get_per_thread_memory();
+ return (available_memory / per_thread_memory_required);
+ }
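A back-of-the-envelope illustration of the arithmetic (all numbers hypothetical, not measured): with 1 GiB of allocatable device memory, roughly 100 MiB already claimed by the invariant, tile-specific and scene-specific allocations, and about 1 KiB required per thread, the feasible global work size comes out near a million threads:

	/* Hypothetical figures; the real values come from the queries above. */
	size_t total_allocatable = 1024ul * 1024 * 1024;   /* 1 GiB */
	size_t already_allocated = 100ul * 1024 * 1024;    /* invariant+tile+scene */
	size_t per_thread_required = 1024;                 /* ~1 KiB per thread */
	size_t feasible_threads =
	        (total_allocatable - already_allocated) / per_thread_required;
	/* feasible_threads == 946176, i.e. roughly a 972x972 tile. */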
+
+ /* Checks if the device has enough memory to render the whole tile;
+ * if not, we should split the single tile into multiple smaller
+ * tiles and process them all.
+ */
+ bool need_to_split_tile(unsigned int d_w,
+ unsigned int d_h,
+ int2 max_render_feasible_tile_size)
+ {
+ size_t global_size_estimate[2];
+ /* TODO(sergey): Such round-ups are in quite a few places, need to
+ * replace them with a utility macro (see the sketch after this
+ * function).
+ */
+ global_size_estimate[0] =
+ (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
+ SPLIT_KERNEL_LOCAL_SIZE_X;
+ global_size_estimate[1] =
+ (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
+ SPLIT_KERNEL_LOCAL_SIZE_Y;
+ if((global_size_estimate[0] * global_size_estimate[1]) >
+ (max_render_feasible_tile_size.x * max_render_feasible_tile_size.y))
{
- run = function_bind(&OpenCLDevice::thread_run, device, this);
+ return true;
}
- };
+ else {
+ return false;
+ }
+ }
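The `(((x - 1) / block) + 1) * block` ceil-round idiom above (and its floor counterpart `(x / block) * block`) recurs throughout this patch, which is what the TODOs are complaining about. A minimal sketch of the utility the TODOs ask for (the names are an assumption; no such helper exists in this commit):

	/* Round x up to the nearest multiple of block; equivalent to the
	 * (((x - 1) / block) + 1) * block idiom for x >= 1. */
	ccl_device_inline size_t round_up(size_t x, size_t block)
	{
		return ((x + block - 1) / block) * block;
	}

	/* Round x down to the nearest multiple of block. */
	ccl_device_inline size_t round_down(size_t x, size_t block)
	{
		return (x / block) * block;
	}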
- int get_split_task_count(DeviceTask& /*task*/)
+ /* Considers the scene properties and the global memory available in
+ * the device, and returns a rectangular tile dimension (approximately
+ * the maximum) that the split kernel should render.
+ */
+ int2 get_max_render_feasible_tile_size(size_t feasible_global_work_size)
{
- return 1;
+ int2 max_render_feasible_tile_size;
+ int square_root_val = (int)sqrt(feasible_global_work_size);
+ max_render_feasible_tile_size.x = square_root_val;
+ max_render_feasible_tile_size.y = square_root_val;
+ /* Ceil round-off max_render_feasible_tile_size. */
+ int2 ceil_render_feasible_tile_size;
+ ceil_render_feasible_tile_size.x =
+ (((max_render_feasible_tile_size.x - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
+ SPLIT_KERNEL_LOCAL_SIZE_X;
+ ceil_render_feasible_tile_size.y =
+ (((max_render_feasible_tile_size.y - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
+ SPLIT_KERNEL_LOCAL_SIZE_Y;
+ if(ceil_render_feasible_tile_size.x * ceil_render_feasible_tile_size.y <=
+ feasible_global_work_size)
+ {
+ return ceil_render_feasible_tile_size;
+ }
+ /* Floor round-off max_render_feasible_tile_size. */
+ int2 floor_render_feasible_tile_size;
+ floor_render_feasible_tile_size.x =
+ (max_render_feasible_tile_size.x / SPLIT_KERNEL_LOCAL_SIZE_X) *
+ SPLIT_KERNEL_LOCAL_SIZE_X;
+ floor_render_feasible_tile_size.y =
+ (max_render_feasible_tile_size.y / SPLIT_KERNEL_LOCAL_SIZE_Y) *
+ SPLIT_KERNEL_LOCAL_SIZE_Y;
+ return floor_render_feasible_tile_size;
}
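Worked example, assuming SPLIT_KERNEL_LOCAL_SIZE_X = 64 and SPLIT_KERNEL_LOCAL_SIZE_Y = 1 (an assumption for illustration) and the ~946K-thread budget from the earlier sketch:

	int2 tile = get_max_render_feasible_tile_size(946176);
	/* sqrt(946176) ~= 972, so the candidate square is 972x972.
	 * Ceil-aligned: 1024x972 = 995328 threads > 946176, too big.
	 * Floor-aligned: 960x972 = 933120 threads <= 946176; returned. */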
- void task_add(DeviceTask& task)
+ /* Try splitting the current tile into multiple smaller
+ * almost-square-tiles.
+ */
+ int2 get_split_tile_size(RenderTile rtile,
+ int2 max_render_feasible_tile_size)
{
- task_pool.push(new OpenCLDeviceTask(this, task));
+ int2 split_tile_size;
+ int num_global_threads = max_render_feasible_tile_size.x *
+ max_render_feasible_tile_size.y;
+ int d_w = rtile.w;
+ int d_h = rtile.h;
+ /* Ceil round off d_w and d_h */
+ d_w = (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
+ SPLIT_KERNEL_LOCAL_SIZE_X;
+ d_h = (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
+ SPLIT_KERNEL_LOCAL_SIZE_Y;
+ while(d_w * d_h > num_global_threads) {
+ /* Halve the longer dimension. */
+ if(d_w >= d_h) {
+ d_w = d_w / 2;
+ d_w = (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
+ SPLIT_KERNEL_LOCAL_SIZE_X;
+ }
+ else {
+ d_h = d_h / 2;
+ d_h = (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
+ SPLIT_KERNEL_LOCAL_SIZE_Y;
+ }
+ }
+ split_tile_size.x = d_w;
+ split_tile_size.y = d_h;
+ return split_tile_size;
}
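For instance, a 512x512 tile against a 256x256 = 65536-thread budget halves the longer side (re-aligning to the local size) until the area fits. A standalone trace of the loop, again assuming a 64x1 local size:

	int w = 512, h = 512;
	const int budget = 256 * 256;
	while(w * h > budget) {
		if(w >= h)
			w = (((w / 2) - 1) / 64 + 1) * 64;  /* halve, ceil-align x */
		else
			h = h / 2;                          /* y local size is 1 */
	}
	/* 512x512 -> 256x512 -> 256x256; loop ends with w = 256, h = 256. */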
- void task_wait()
+ /* Splits existing tile into multiple tiles of tile size split_tile_size. */
+ vector<SplitRenderTile> split_tiles(RenderTile rtile, int2 split_tile_size)
{
- task_pool.wait();
+ vector<SplitRenderTile> to_path_trace_rtile;
+ int d_w = rtile.w;
+ int d_h = rtile.h;
+ int num_tiles_x = (((d_w - 1) / split_tile_size.x) + 1);
+ int num_tiles_y = (((d_h - 1) / split_tile_size.y) + 1);
+ /* Buffer and rng_state offset calc. */
+ size_t offset_index = rtile.offset + (rtile.x + rtile.y * rtile.stride);
+ size_t offset_x = offset_index % rtile.stride;
+ size_t offset_y = offset_index / rtile.stride;
+ /* Resize to_path_trace_rtile. */
+ to_path_trace_rtile.resize(num_tiles_x * num_tiles_y);
+ for(int tile_iter_y = 0; tile_iter_y < num_tiles_y; tile_iter_y++) {
+ for(int tile_iter_x = 0; tile_iter_x < num_tiles_x; tile_iter_x++) {
+ int rtile_index = tile_iter_y * num_tiles_x + tile_iter_x;
+ to_path_trace_rtile[rtile_index].rng_state_offset_x = offset_x + tile_iter_x * split_tile_size.x;
+ to_path_trace_rtile[rtile_index].rng_state_offset_y = offset_y + tile_iter_y * split_tile_size.y;
+ to_path_trace_rtile[rtile_index].buffer_offset_x = offset_x + tile_iter_x * split_tile_size.x;
+ to_path_trace_rtile[rtile_index].buffer_offset_y = offset_y + tile_iter_y * split_tile_size.y;
+ to_path_trace_rtile[rtile_index].start_sample = rtile.start_sample;
+ to_path_trace_rtile[rtile_index].num_samples = rtile.num_samples;
+ to_path_trace_rtile[rtile_index].sample = rtile.sample;
+ to_path_trace_rtile[rtile_index].resolution = rtile.resolution;
+ to_path_trace_rtile[rtile_index].offset = rtile.offset;
+ to_path_trace_rtile[rtile_index].buffers = rtile.buffers;
+ to_path_trace_rtile[rtile_index].buffer = rtile.buffer;
+ to_path_trace_rtile[rtile_index].rng_state = rtile.rng_state;
+ to_path_trace_rtile[rtile_index].x = rtile.x + (tile_iter_x * split_tile_size.x);
+ to_path_trace_rtile[rtile_index].y = rtile.y + (tile_iter_y * split_tile_size.y);
+ to_path_trace_rtile[rtile_index].buffer_rng_state_stride = rtile.stride;
+ /* Fill width and height of the new render tile. */
+ to_path_trace_rtile[rtile_index].w = (tile_iter_x == (num_tiles_x - 1)) ?
+ (d_w - (tile_iter_x * split_tile_size.x)) /* Border tile */
+ : split_tile_size.x;
+ to_path_trace_rtile[rtile_index].h = (tile_iter_y == (num_tiles_y - 1)) ?
+ (d_h - (tile_iter_y * split_tile_size.y)) /* Border tile */
+ : split_tile_size.y;
+ to_path_trace_rtile[rtile_index].stride = to_path_trace_rtile[rtile_index].w;
+ }
+ }
+ return to_path_trace_rtile;
}
- void task_cancel()
+ void thread_run(DeviceTask *task)
{
- task_pool.cancel();
+ if(task->type == DeviceTask::FILM_CONVERT) {
+ film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
+ }
+ else if(task->type == DeviceTask::SHADER) {
+ shader(*task);
+ }
+ else if(task->type == DeviceTask::PATH_TRACE) {
+ RenderTile tile;
+ bool initialize_data_and_check_render_feasibility = false;
+ bool need_to_split_tiles_further = false;
+ int2 max_render_feasible_tile_size;
+ size_t feasible_global_work_size;
+ const int2 tile_size = task->requested_tile_size;
+ /* Keep rendering tiles until done. */
+ while(task->acquire_tile(this, tile)) {
+ if(!initialize_data_and_check_render_feasibility) {
+ /* Initialize data. */
+ /* Calculate per_thread_output_buffer_size. */
+ size_t output_buffer_size = 0;
+ ciErr = clGetMemObjectInfo((cl_mem)tile.buffer,
+ CL_MEM_SIZE,
+ sizeof(output_buffer_size),
+ &output_buffer_size,
+ NULL);
+ assert(ciErr == CL_SUCCESS && "Can't get tile.buffer mem object info");
+ /* This value is different when running on AMD and NV. */
+ if(background) {
+ /* In offline render the number of buffer elements
+ * associated with tile.buffer is the current tile size.
+ */
+ per_thread_output_buffer_size =
+ output_buffer_size / (tile.w * tile.h);
+ }
+ else {
+ /* In interactive rendering, unlike offline render, the number of
+ * buffer elements associated with tile.buffer is the entire
+ * viewport size.
+ */
+ per_thread_output_buffer_size =
+ output_buffer_size / (tile.buffers->params.width *
+ tile.buffers->params.height);
+ }
+ /* Check render feasibility. */
+ feasible_global_work_size = get_feasible_global_work_size(
+ tile_size,
+ CL_MEM_PTR(const_mem_map["__data"]->device_pointer));
+ max_render_feasible_tile_size =
+ get_max_render_feasible_tile_size(
+ feasible_global_work_size);
+ need_to_split_tiles_further =
+ need_to_split_tile(tile_size.x,
+ tile_size.y,
+ max_render_feasible_tile_size);
+ initialize_data_and_check_render_feasibility = true;
+ }
+ if(need_to_split_tiles_further) {
+ int2 split_tile_size =
+ get_split_tile_size(tile,
+ max_render_feasible_tile_size);
+ vector<SplitRenderTile> to_path_trace_render_tiles =
+ split_tiles(tile, split_tile_size);
+ /* Print message to console */
+ if(background && (to_path_trace_render_tiles.size() > 1)) {
+ fprintf(stderr, "Message : Tiles need to be split "
+ "further inside path trace (due to insufficient "
+ "device-global-memory for split kernel to "
+ "function) \n"
+ "The current tile of dimensions %dx%d is split "
+ "into tiles of dimension %dx%d for render \n",
+ tile.w, tile.h,
+ split_tile_size.x,
+ split_tile_size.y);
+ }
+ /* Process all split tiles. */
+ for(int tile_iter = 0;
+ tile_iter < to_path_trace_render_tiles.size();
+ ++tile_iter)
+ {
+ path_trace(to_path_trace_render_tiles[tile_iter],
+ max_render_feasible_tile_size);
+ }
+ }
+ else {
+ /* No splitting required; process the entire tile at once. */
+ /* Render feasible tile size is user-set-tile-size itself. */
+ max_render_feasible_tile_size.x =
+ (((tile_size.x - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
+ SPLIT_KERNEL_LOCAL_SIZE_X;
+ max_render_feasible_tile_size.y =
+ (((tile_size.y - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
+ SPLIT_KERNEL_LOCAL_SIZE_Y;
+ /* buffer_rng_state_stride is stride itself. */
+ SplitRenderTile split_tile(tile);
+ split_tile.buffer_rng_state_stride = tile.stride;
+ path_trace(split_tile, max_render_feasible_tile_size);
+ }
+ tile.sample = tile.start_sample + tile.num_samples;
+
+ /* Complete kernel execution before releasing the tile. */
+ /* This helps in multi-device render;
+ * the device that reaches the critical-section function
+ * release_tile waits (stalling other devices from entering
+ * release_tile) for all kernels to complete. If device1 (a
+ * slow-render device) reached release_tile first, it would
+ * stall device2 (a fast-render device) from proceeding to render
+ * the next tile.
+ */
+ clFinish(cqCommandQueue);
+
+ task->release_tile(tile);
+ }
+ }
+ }
+
+protected:
+ cl_mem mem_alloc(size_t bufsize, cl_mem_flags mem_flag = CL_MEM_READ_WRITE)
+ {
+ cl_mem ptr;
+ ptr = clCreateBuffer(cxContext, mem_flag, bufsize, NULL, &ciErr);
+ if(opencl_error(ciErr)) {
+ assert(0);
+ }
+ return ptr;
}
};
+/* Returns true if the platform and device type were detected
+ * successfully, false otherwise.
+ */
+static bool get_platform_and_devicetype(const DeviceInfo info,
+ string &platform_name,
+ cl_device_type &device_type)
+{
+ cl_platform_id platform_id;
+ cl_device_id device_id;
+ cl_uint num_platforms;
+ cl_int ciErr;
+
+ /* TODO(sergey): Use some generic error print helper function. */
+ ciErr = clGetPlatformIDs(0, NULL, &num_platforms);
+ if(ciErr != CL_SUCCESS) {
+ fprintf(stderr, "Can't getPlatformIds. file - %s, line - %d\n", __FILE__, __LINE__);
+ return false;
+ }
+
+ if(num_platforms == 0) {
+ fprintf(stderr, "No OpenCL platforms found. file - %s, line - %d\n", __FILE__, __LINE__);
+ return false;
+ }
+
+ vector<cl_platform_id> platforms(num_platforms, NULL);
+
+ ciErr = clGetPlatformIDs(num_platforms, &platforms[0], NULL);
+ if(ciErr != CL_SUCCESS) {
+ fprintf(stderr, "Can't getPlatformIds. file - %s, line - %d\n", __FILE__, __LINE__);
+ return false;
+ }
+
+ int num_base = 0;
+ int total_devices = 0;
+
+ for(int platform = 0; platform < num_platforms; platform++) {
+ cl_uint num_devices;
+
+ ciErr = clGetDeviceIDs(platforms[platform], opencl_device_type(), 0, NULL, &num_devices);
+ if(ciErr != CL_SUCCESS) {
+ fprintf(stderr, "Can't getDeviceIDs. file - %s, line - %d\n", __FILE__, __LINE__);
+ return false;
+ }
+
+ total_devices += num_devices;
+
+ if(info.num - num_base >= num_devices) {
+ /* num doesn't refer to a device in this platform */
+ num_base += num_devices;
+ continue;
+ }
+
+ /* device is in this platform */
+ platform_id = platforms[platform];
+
+ /* get devices */
+ vector<cl_device_id> device_ids(num_devices, NULL);
+
+ ciErr = clGetDeviceIDs(platform_id, opencl_device_type(), num_devices, &device_ids[0], NULL);
+ if(ciErr != CL_SUCCESS) {
+ fprintf(stderr, "Can't getDeviceIDs. file - %s, line - %d\n", __FILE__, __LINE__);
+ return false;
+ }
+
+ device_id = device_ids[info.num - num_base];
+
+ char name[256];
+ ciErr = clGetPlatformInfo(platform_id, CL_PLATFORM_NAME, sizeof(name), &name, NULL);
+ if(ciErr != CL_SUCCESS) {
+ fprintf(stderr, "Can't getPlatformIDs. file - %s, line - %d \n", __FILE__, __LINE__);
+ return false;
+ }
+ platform_name = name;
+
+ ciErr = clGetDeviceInfo(device_id, CL_DEVICE_TYPE, sizeof(cl_device_type), &device_type, NULL);
+ if(ciErr != CL_SUCCESS) {
+ fprintf(stderr, "Can't getDeviceInfo. file - %s, line - %d \n", __FILE__, __LINE__);
+ return false;
+ }
+
+ break;
+ }
+
+ if(total_devices == 0) {
+ fprintf(stderr, "No devices found. file - %s, line - %d \n", __FILE__, __LINE__);
+ return false;
+ }
+
+ return true;
+}
+
Device *device_opencl_create(DeviceInfo& info, Stats &stats, bool background)
{
- return new OpenCLDevice(info, stats, background);
+ string platform_name;
+ cl_device_type device_type;
+ if(get_platform_and_devicetype(info, platform_name, device_type)) {
+ const bool force_split_kernel =
+ getenv("CYCLES_OPENCL_SPLIT_KERNEL_TEST") != NULL;
+ /* TODO(sergey): Replace string lookups with a more enum-like API,
+ * similar to the device/vendor checks in blender's gpu module.
+ */
+ if(force_split_kernel ||
+ (platform_name == "AMD Accelerated Parallel Processing" &&
+ device_type == CL_DEVICE_TYPE_GPU))
+ {
+ /* If the device is an AMD GPU, take split kernel path. */
+ VLOG(1) << "Using split kernel";
+ return new OpenCLDeviceSplitKernel(info, stats, background);
+ }
+ else {
+ /* For any other device, take the megakernel path. */
+ VLOG(1) << "Using megakernel";
+ return new OpenCLDeviceMegaKernel(info, stats, background);
+ }
+ }
+ else {
+ /* If we can't retrieve platform and device type information for
+ * some reason, we default to the megakernel path.
+ */
+ VLOG(1) << "Failed to retrieve platform or device, using megakernel";
+ return new OpenCLDeviceMegaKernel(info, stats, background);
+ }
}
-bool device_opencl_init(void) {
+bool device_opencl_init(void)
+{
static bool initialized = false;
static bool result = false;
@@ -1132,13 +3322,7 @@ bool device_opencl_init(void) {
initialized = true;
- // OpenCL disabled for now, only works with this environment variable set
- if(!getenv("CYCLES_OPENCL_TEST")) {
- result = false;
- }
- else {
- result = clewInit() == CLEW_SUCCESS;
- }
+ result = clewInit() == CLEW_SUCCESS;
return result;
}
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 83b3450fc1c..85b2760073b 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -14,6 +14,17 @@ set(INC_SYS
set(SRC
kernel.cpp
kernel.cl
+ kernel_data_init.cl
+ kernel_queue_enqueue.cl
+ kernel_scene_intersect.cl
+ kernel_lamp_emission.cl
+ kernel_background_buffer_update.cl
+ kernel_shader_eval.cl
+ kernel_holdout_emission_blurring_pathtermination_ao.cl
+ kernel_direct_lighting.cl
+ kernel_shadow_blocked.cl
+ kernel_next_iteration_setup.cl
+ kernel_sum_all_radiance.cl
kernel.cu
)
@@ -36,17 +47,22 @@ set(SRC_HEADERS
kernel_montecarlo.h
kernel_passes.h
kernel_path.h
+ kernel_path_common.h
kernel_path_state.h
kernel_path_surface.h
kernel_path_volume.h
kernel_projection.h
+ kernel_queues.h
kernel_random.h
kernel_shader.h
+ kernel_shaderdata_vars.h
kernel_shadow.h
+ kernel_split.h
kernel_subsurface.h
kernel_textures.h
kernel_types.h
kernel_volume.h
+ kernel_work_stealing.h
)
set(SRC_CLOSURE_HEADERS
@@ -68,6 +84,7 @@ set(SRC_CLOSURE_HEADERS
closure/emissive.h
closure/volume.h
)
+
set(SRC_SVM_HEADERS
svm/svm.h
svm/svm_attribute.h
@@ -284,6 +301,17 @@ endif()
#delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${KERNEL_PREPROCESSED}" ${CYCLES_INSTALL_PATH}/kernel)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel.cl" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_data_init.cl" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_queue_enqueue.cl" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_scene_intersect.cl" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_lamp_emission.cl" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_background_buffer_update.cl" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_shader_eval.cl" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_holdout_emission_blurring_pathtermination_ao.cl" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_shadow_blocked.cl" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_sum_all_radiance.cl" ${CYCLES_INSTALL_PATH}/kernel)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel.cu" ${CYCLES_INSTALL_PATH}/kernel)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/closure)
diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h
index 2b9e2a4e44d..558aa0dc6a9 100644
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -47,79 +47,79 @@ ccl_device int bsdf_sample(KernelGlobals *kg, const ShaderData *sd, const Shader
switch(sc->type) {
case CLOSURE_BSDF_DIFFUSE_ID:
case CLOSURE_BSDF_BSSRDF_ID:
- label = bsdf_diffuse_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+ label = bsdf_diffuse_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
#ifdef __SVM__
case CLOSURE_BSDF_OREN_NAYAR_ID:
- label = bsdf_oren_nayar_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+ label = bsdf_oren_nayar_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
/*case CLOSURE_BSDF_PHONG_RAMP_ID:
- label = bsdf_phong_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+ label = bsdf_phong_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
case CLOSURE_BSDF_DIFFUSE_RAMP_ID:
- label = bsdf_diffuse_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+ label = bsdf_diffuse_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;*/
case CLOSURE_BSDF_TRANSLUCENT_ID:
- label = bsdf_translucent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+ label = bsdf_translucent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
case CLOSURE_BSDF_REFLECTION_ID:
- label = bsdf_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+ label = bsdf_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
case CLOSURE_BSDF_REFRACTION_ID:
- label = bsdf_refraction_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+ label = bsdf_refraction_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
case CLOSURE_BSDF_TRANSPARENT_ID:
- label = bsdf_transparent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+ label = bsdf_transparent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
case CLOSURE_BSDF_MICROFACET_GGX_ID:
case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
- label = bsdf_microfacet_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+ label = bsdf_microfacet_ggx_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
- label = bsdf_microfacet_beckmann_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+ label = bsdf_microfacet_beckmann_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
- label = bsdf_ashikhmin_shirley_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+ label = bsdf_ashikhmin_shirley_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
- label = bsdf_ashikhmin_velvet_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+ label = bsdf_ashikhmin_velvet_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
case CLOSURE_BSDF_DIFFUSE_TOON_ID:
- label = bsdf_diffuse_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+ label = bsdf_diffuse_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
case CLOSURE_BSDF_GLOSSY_TOON_ID:
- label = bsdf_glossy_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+ label = bsdf_glossy_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
case CLOSURE_BSDF_HAIR_REFLECTION_ID:
- label = bsdf_hair_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+ label = bsdf_hair_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
- label = bsdf_hair_transmission_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+ label = bsdf_hair_transmission_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
#endif
#ifdef __VOLUME__
case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
- label = volume_henyey_greenstein_sample(sc, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
+ label = volume_henyey_greenstein_sample(sc, ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
break;
#endif
default:
@@ -139,67 +139,67 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade
return OSLShader::bsdf_eval(sd, sc, omega_in, *pdf);
#endif
- if(dot(sd->Ng, omega_in) >= 0.0f) {
+ if(dot(ccl_fetch(sd, Ng), omega_in) >= 0.0f) {
switch(sc->type) {
case CLOSURE_BSDF_DIFFUSE_ID:
case CLOSURE_BSDF_BSSRDF_ID:
- eval = bsdf_diffuse_eval_reflect(sc, sd->I, omega_in, pdf);
+ eval = bsdf_diffuse_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
break;
#ifdef __SVM__
case CLOSURE_BSDF_OREN_NAYAR_ID:
- eval = bsdf_oren_nayar_eval_reflect(sc, sd->I, omega_in, pdf);
+ eval = bsdf_oren_nayar_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
break;
/*case CLOSURE_BSDF_PHONG_RAMP_ID:
- eval = bsdf_phong_ramp_eval_reflect(sc, sd->I, omega_in, pdf);
+ eval = bsdf_phong_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
break;
case CLOSURE_BSDF_DIFFUSE_RAMP_ID:
- eval = bsdf_diffuse_ramp_eval_reflect(sc, sd->I, omega_in, pdf);
+ eval = bsdf_diffuse_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
break;*/
case CLOSURE_BSDF_TRANSLUCENT_ID:
- eval = bsdf_translucent_eval_reflect(sc, sd->I, omega_in, pdf);
+ eval = bsdf_translucent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
break;
case CLOSURE_BSDF_REFLECTION_ID:
- eval = bsdf_reflection_eval_reflect(sc, sd->I, omega_in, pdf);
+ eval = bsdf_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
break;
case CLOSURE_BSDF_REFRACTION_ID:
- eval = bsdf_refraction_eval_reflect(sc, sd->I, omega_in, pdf);
+ eval = bsdf_refraction_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
break;
case CLOSURE_BSDF_TRANSPARENT_ID:
- eval = bsdf_transparent_eval_reflect(sc, sd->I, omega_in, pdf);
+ eval = bsdf_transparent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
break;
case CLOSURE_BSDF_MICROFACET_GGX_ID:
case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
- eval = bsdf_microfacet_ggx_eval_reflect(sc, sd->I, omega_in, pdf);
+ eval = bsdf_microfacet_ggx_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
break;
case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
- eval = bsdf_microfacet_beckmann_eval_reflect(sc, sd->I, omega_in, pdf);
+ eval = bsdf_microfacet_beckmann_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
break;
case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
- eval = bsdf_ashikhmin_shirley_eval_reflect(sc, sd->I, omega_in, pdf);
+ eval = bsdf_ashikhmin_shirley_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
break;
case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
- eval = bsdf_ashikhmin_velvet_eval_reflect(sc, sd->I, omega_in, pdf);
+ eval = bsdf_ashikhmin_velvet_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
break;
case CLOSURE_BSDF_DIFFUSE_TOON_ID:
- eval = bsdf_diffuse_toon_eval_reflect(sc, sd->I, omega_in, pdf);
+ eval = bsdf_diffuse_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
break;
case CLOSURE_BSDF_GLOSSY_TOON_ID:
- eval = bsdf_glossy_toon_eval_reflect(sc, sd->I, omega_in, pdf);
+ eval = bsdf_glossy_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
break;
case CLOSURE_BSDF_HAIR_REFLECTION_ID:
- eval = bsdf_hair_reflection_eval_reflect(sc, sd->I, omega_in, pdf);
+ eval = bsdf_hair_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
break;
case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
- eval = bsdf_hair_transmission_eval_reflect(sc, sd->I, omega_in, pdf);
+ eval = bsdf_hair_transmission_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
break;
#endif
#ifdef __VOLUME__
case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
- eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
+ eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf);
break;
#endif
default:
@@ -211,57 +211,57 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade
switch(sc->type) {
case CLOSURE_BSDF_DIFFUSE_ID:
case CLOSURE_BSDF_BSSRDF_ID:
- eval = bsdf_diffuse_eval_transmit(sc, sd->I, omega_in, pdf);
+ eval = bsdf_diffuse_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
break;
#ifdef __SVM__
case CLOSURE_BSDF_OREN_NAYAR_ID:
- eval = bsdf_oren_nayar_eval_transmit(sc, sd->I, omega_in, pdf);
+ eval = bsdf_oren_nayar_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
break;
case CLOSURE_BSDF_TRANSLUCENT_ID:
- eval = bsdf_translucent_eval_transmit(sc, sd->I, omega_in, pdf);
+ eval = bsdf_translucent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
break;
case CLOSURE_BSDF_REFLECTION_ID:
- eval = bsdf_reflection_eval_transmit(sc, sd->I, omega_in, pdf);
+ eval = bsdf_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
break;
case CLOSURE_BSDF_REFRACTION_ID:
- eval = bsdf_refraction_eval_transmit(sc, sd->I, omega_in, pdf);
+ eval = bsdf_refraction_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
break;
case CLOSURE_BSDF_TRANSPARENT_ID:
- eval = bsdf_transparent_eval_transmit(sc, sd->I, omega_in, pdf);
+ eval = bsdf_transparent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
break;
case CLOSURE_BSDF_MICROFACET_GGX_ID:
case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
- eval = bsdf_microfacet_ggx_eval_transmit(sc, sd->I, omega_in, pdf);
+ eval = bsdf_microfacet_ggx_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
break;
case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
- eval = bsdf_microfacet_beckmann_eval_transmit(sc, sd->I, omega_in, pdf);
+ eval = bsdf_microfacet_beckmann_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
break;
case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
- eval = bsdf_ashikhmin_shirley_eval_transmit(sc, sd->I, omega_in, pdf);
+ eval = bsdf_ashikhmin_shirley_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
break;
case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
- eval = bsdf_ashikhmin_velvet_eval_transmit(sc, sd->I, omega_in, pdf);
+ eval = bsdf_ashikhmin_velvet_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
break;
case CLOSURE_BSDF_DIFFUSE_TOON_ID:
- eval = bsdf_diffuse_toon_eval_transmit(sc, sd->I, omega_in, pdf);
+ eval = bsdf_diffuse_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
break;
case CLOSURE_BSDF_GLOSSY_TOON_ID:
- eval = bsdf_glossy_toon_eval_transmit(sc, sd->I, omega_in, pdf);
+ eval = bsdf_glossy_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
break;
case CLOSURE_BSDF_HAIR_REFLECTION_ID:
- eval = bsdf_hair_reflection_eval_transmit(sc, sd->I, omega_in, pdf);
+ eval = bsdf_hair_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
break;
case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
- eval = bsdf_hair_transmission_eval_transmit(sc, sd->I, omega_in, pdf);
+ eval = bsdf_hair_transmission_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
break;
#endif
#ifdef __VOLUME__
case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
- eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
+ eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf);
break;
#endif
default:
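The mechanical sd->field to ccl_fetch(sd, field) rewrite in this file (and in the geom_*.h hunks that follow) lets the same kernel source address ShaderData either as a plain struct (megakernel) or as a structure-of-arrays indexed by the global thread id (split kernel). A simplified sketch of how such a macro can switch layouts (the exact indexing expression is an assumption; the real definition lives in the split-kernel headers):

	#ifdef __SPLIT_KERNEL__
	/* SoA: every field is an array with one slot per global thread. */
	#  define ccl_fetch(sd, field) \
	        ((sd)->field[get_global_id(1) * get_global_size(0) + get_global_id(0)])
	#else
	/* AoS: plain member access, identical to the old sd->field. */
	#  define ccl_fetch(sd, field) ((sd)->field)
	#endif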
diff --git a/intern/cycles/kernel/geom/geom_attribute.h b/intern/cycles/kernel/geom/geom_attribute.h
index 9ac16e86085..c7364e9edac 100644
--- a/intern/cycles/kernel/geom/geom_attribute.h
+++ b/intern/cycles/kernel/geom/geom_attribute.h
@@ -29,13 +29,13 @@ CCL_NAMESPACE_BEGIN
ccl_device_inline int find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id, AttributeElement *elem)
{
- if(sd->object == PRIM_NONE)
+ if(ccl_fetch(sd, object) == PRIM_NONE)
return (int)ATTR_STD_NOT_FOUND;
/* for SVM, find attribute by unique id */
- uint attr_offset = sd->object*kernel_data.bvh.attributes_map_stride;
+ uint attr_offset = ccl_fetch(sd, object)*kernel_data.bvh.attributes_map_stride;
#ifdef __HAIR__
- attr_offset = (sd->type & PRIMITIVE_ALL_CURVE)? attr_offset + ATTR_PRIM_CURVE: attr_offset;
+ attr_offset = (ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)? attr_offset + ATTR_PRIM_CURVE: attr_offset;
#endif
uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
@@ -49,7 +49,7 @@ ccl_device_inline int find_attribute(KernelGlobals *kg, const ShaderData *sd, ui
*elem = (AttributeElement)attr_map.y;
- if(sd->prim == PRIM_NONE && (AttributeElement)attr_map.y != ATTR_ELEMENT_MESH)
+ if(ccl_fetch(sd, prim) == PRIM_NONE && (AttributeElement)attr_map.y != ATTR_ELEMENT_MESH)
return ATTR_STD_NOT_FOUND;
/* return result */
diff --git a/intern/cycles/kernel/geom/geom_bvh.h b/intern/cycles/kernel/geom/geom_bvh.h
index 2e8e27c709d..3d0d406dd0b 100644
--- a/intern/cycles/kernel/geom/geom_bvh.h
+++ b/intern/cycles/kernel/geom/geom_bvh.h
@@ -447,6 +447,7 @@ ccl_device_inline float3 ray_offset(float3 P, float3 Ng)
#endif
}
+#if defined(__SHADOW_RECORD_ALL__) || defined (__VOLUME_RECORD_ALL__)
/* ToDo: Move to another file? */
ccl_device int intersections_compare(const void *a, const void *b)
{
@@ -460,6 +461,7 @@ ccl_device int intersections_compare(const void *a, const void *b)
else
return 0;
}
+#endif
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle.h b/intern/cycles/kernel/geom/geom_motion_triangle.h
index a5a25f4a9ae..4ea9e4714c4 100644
--- a/intern/cycles/kernel/geom/geom_motion_triangle.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle.h
@@ -236,25 +236,25 @@ ccl_device_inline float3 motion_triangle_refine_subsurface(KernelGlobals *kg, Sh
ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, bool subsurface)
{
/* get shader */
- sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
+ ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim));
/* get motion info */
int numsteps, numverts;
- object_motion_info(kg, sd->object, &numsteps, &numverts, NULL);
+ object_motion_info(kg, ccl_fetch(sd, object), &numsteps, &numverts, NULL);
/* figure out which steps we need to fetch and their interpolation factor */
int maxstep = numsteps*2;
- int step = min((int)(sd->time*maxstep), maxstep-1);
- float t = sd->time*maxstep - step;
+ int step = min((int)(ccl_fetch(sd, time)*maxstep), maxstep-1);
+ float t = ccl_fetch(sd, time)*maxstep - step;
/* find attribute */
AttributeElement elem;
- int offset = find_attribute_motion(kg, sd->object, ATTR_STD_MOTION_VERTEX_POSITION, &elem);
+ int offset = find_attribute_motion(kg, ccl_fetch(sd, object), ATTR_STD_MOTION_VERTEX_POSITION, &elem);
kernel_assert(offset != ATTR_STD_NOT_FOUND);
/* fetch vertex coordinates */
float3 verts[3], next_verts[3];
- float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, sd->prim));
+ float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)));
motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts);
motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_verts);
@@ -268,33 +268,33 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderD
#ifdef __SUBSURFACE__
if(!subsurface)
#endif
- sd->P = motion_triangle_refine(kg, sd, isect, ray, verts);
+ ccl_fetch(sd, P) = motion_triangle_refine(kg, sd, isect, ray, verts);
#ifdef __SUBSURFACE__
else
- sd->P = motion_triangle_refine_subsurface(kg, sd, isect, ray, verts);
+ ccl_fetch(sd, P) = motion_triangle_refine_subsurface(kg, sd, isect, ray, verts);
#endif
/* compute face normal */
float3 Ng;
- if(sd->flag & SD_NEGATIVE_SCALE_APPLIED)
+ if(ccl_fetch(sd, flag) & SD_NEGATIVE_SCALE_APPLIED)
Ng = normalize(cross(verts[2] - verts[0], verts[1] - verts[0]));
else
Ng = normalize(cross(verts[1] - verts[0], verts[2] - verts[0]));
- sd->Ng = Ng;
- sd->N = Ng;
+ ccl_fetch(sd, Ng) = Ng;
+ ccl_fetch(sd, N) = Ng;
/* compute derivatives of P w.r.t. uv */
#ifdef __DPDU__
- sd->dPdu = (verts[0] - verts[2]);
- sd->dPdv = (verts[1] - verts[2]);
+ ccl_fetch(sd, dPdu) = (verts[0] - verts[2]);
+ ccl_fetch(sd, dPdv) = (verts[1] - verts[2]);
#endif
/* compute smooth normal */
- if(sd->shader & SHADER_SMOOTH_NORMAL) {
+ if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) {
/* find attribute */
AttributeElement elem;
- int offset = find_attribute_motion(kg, sd->object, ATTR_STD_MOTION_VERTEX_NORMAL, &elem);
+ int offset = find_attribute_motion(kg, ccl_fetch(sd, object), ATTR_STD_MOTION_VERTEX_NORMAL, &elem);
kernel_assert(offset != ATTR_STD_NOT_FOUND);
/* fetch vertex coordinates */
@@ -308,10 +308,10 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderD
normals[2] = (1.0f - t)*normals[2] + t*next_normals[2];
/* interpolate between vertices */
- float u = sd->u;
- float v = sd->v;
+ float u = ccl_fetch(sd, u);
+ float v = ccl_fetch(sd, v);
float w = 1.0f - u - v;
- sd->N = (u*normals[0] + v*normals[1] + w*normals[2]);
+ ccl_fetch(sd, N) = (u*normals[0] + v*normals[1] + w*normals[2]);
}
}
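
The mechanical change running through this file and the geometry headers below is the replacement of direct member access (sd->foo) with the ccl_fetch(sd, foo) macro, which lets the split kernel keep ShaderData as a structure-of-arrays with one slot per ray while the megakernel keeps the plain struct. A minimal sketch of the idea, assuming SD_THREAD-style indexing (the real definition lives in the kernel headers):

/* Sketch only: how ccl_fetch can switch between AoS and SoA layouts. */
#ifdef __SPLIT_KERNEL__
/* One global thread per ray; every ShaderData member is a per-ray array. */
#  define SD_THREAD (get_global_id(1) * get_global_size(0) + get_global_id(0))
#  define ccl_fetch(s, t) ((s)->t[SD_THREAD])
#else
/* Megakernel / CPU: ShaderData is an ordinary struct. */
#  define ccl_fetch(s, t) ((s)->t)
#endif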
diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h
index 7df71010232..40cbca243a7 100644
--- a/intern/cycles/kernel/geom/geom_object.h
+++ b/intern/cycles/kernel/geom/geom_object.h
@@ -123,9 +123,9 @@ ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg
ccl_device_inline void object_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P)
{
#ifdef __OBJECT_MOTION__
- *P = transform_point(&sd->ob_tfm, *P);
+ *P = transform_point(&ccl_fetch(sd, ob_tfm), *P);
#else
- Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+ Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
*P = transform_point(&tfm, *P);
#endif
}
@@ -135,9 +135,9 @@ ccl_device_inline void object_position_transform(KernelGlobals *kg, const Shader
ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P)
{
#ifdef __OBJECT_MOTION__
- *P = transform_point(&sd->ob_itfm, *P);
+ *P = transform_point(&ccl_fetch(sd, ob_itfm), *P);
#else
- Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
+ Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
*P = transform_point(&tfm, *P);
#endif
}
@@ -147,9 +147,9 @@ ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, cons
ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N)
{
#ifdef __OBJECT_MOTION__
- *N = normalize(transform_direction_transposed(&sd->ob_tfm, *N));
+ *N = normalize(transform_direction_transposed(&ccl_fetch(sd, ob_tfm), *N));
#else
- Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+ Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
*N = normalize(transform_direction_transposed(&tfm, *N));
#endif
}
@@ -159,9 +159,9 @@ ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const
ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N)
{
#ifdef __OBJECT_MOTION__
- *N = normalize(transform_direction_transposed(&sd->ob_itfm, *N));
+ *N = normalize(transform_direction_transposed(&ccl_fetch(sd, ob_itfm), *N));
#else
- Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
+ Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
*N = normalize(transform_direction_transposed(&tfm, *N));
#endif
}
@@ -171,9 +171,9 @@ ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderDa
ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D)
{
#ifdef __OBJECT_MOTION__
- *D = transform_direction(&sd->ob_tfm, *D);
+ *D = transform_direction(&ccl_fetch(sd, ob_tfm), *D);
#else
- Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+ Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
*D = transform_direction(&tfm, *D);
#endif
}
@@ -183,9 +183,9 @@ ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData
ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D)
{
#ifdef __OBJECT_MOTION__
- *D = transform_direction(&sd->ob_itfm, *D);
+ *D = transform_direction(&ccl_fetch(sd, ob_itfm), *D);
#else
- Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
+ Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
*D = transform_direction(&tfm, *D);
#endif
}
@@ -194,13 +194,13 @@ ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const Sha
ccl_device_inline float3 object_location(KernelGlobals *kg, const ShaderData *sd)
{
- if(sd->object == OBJECT_NONE)
+ if(ccl_fetch(sd, object) == OBJECT_NONE)
return make_float3(0.0f, 0.0f, 0.0f);
#ifdef __OBJECT_MOTION__
- return make_float3(sd->ob_tfm.x.w, sd->ob_tfm.y.w, sd->ob_tfm.z.w);
+ return make_float3(ccl_fetch(sd, ob_tfm).x.w, ccl_fetch(sd, ob_tfm).y.w, ccl_fetch(sd, ob_tfm).z.w);
#else
- Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+ Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
return make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
#endif
}
@@ -296,7 +296,7 @@ ccl_device_inline void object_motion_info(KernelGlobals *kg, int object, int *nu
ccl_device int shader_pass_id(KernelGlobals *kg, const ShaderData *sd)
{
- return kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2 + 1);
+ return kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*2 + 1);
}
/* Particle data from which object was instanced */
@@ -377,7 +377,7 @@ ccl_device_inline float3 bvh_inverse_direction(float3 dir)
/* Transform ray into object space to enter static object in BVH */
-ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t)
+ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t)
{
Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
@@ -425,7 +425,7 @@ ccl_device_inline void qbvh_instance_push(KernelGlobals *kg,
/* Transform ray to exit static object in BVH */
-ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t)
+ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t)
{
if(*t != FLT_MAX) {
Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
@@ -520,5 +520,38 @@ ccl_device_inline void bvh_instance_motion_pop_factor(KernelGlobals *kg, int obj
#endif
+/* TODO(sergey): This is only a stopgap until we've got OpenCL 2.0
+ * on all devices we consider supported. It'll be replaced with the
+ * generic address space.
+ */
+
+#ifdef __KERNEL_OPENCL__
+ccl_device_inline void object_dir_transform_addrspace(KernelGlobals *kg,
+ const ShaderData *sd,
+ ccl_addr_space float3 *D)
+{
+ float3 private_D = *D;
+ object_dir_transform(kg, sd, &private_D);
+ *D = private_D;
+}
+
+ccl_device_inline void object_normal_transform_addrspace(KernelGlobals *kg,
+ const ShaderData *sd,
+ ccl_addr_space float3 *N)
+{
+ float3 private_N = *N;
+ object_normal_transform(kg, sd, &private_N);
+ *N = private_N;
+}
+#endif
+
+#ifndef __KERNEL_OPENCL__
+# define object_dir_transform_auto object_dir_transform
+# define object_normal_transform_auto object_normal_transform
+#else
+# define object_dir_transform_auto object_dir_transform_addrspace
+# define object_normal_transform_auto object_normal_transform_addrspace
+#endif
+
CCL_NAMESPACE_END
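
The *_addrspace wrappers above exist because OpenCL C before 2.0 cannot pass a __global pointer to a function that expects a private one, so the value is round-tripped through a private copy. A hedged usage sketch (Ray_coop and ray_index are borrowed from the split kernels further down, purely for illustration):

/* Sketch: transforming a direction stored in __global split-kernel memory. */
ccl_addr_space float3 *D = &Ray_coop[ray_index].D;  /* __global in split kernel */
object_dir_transform_auto(kg, sd, D);  /* resolves to the addrspace wrapper */
/* Calling object_dir_transform(kg, sd, D) directly would not compile on
 * OpenCL 1.x, since it expects a private float3 pointer. */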
diff --git a/intern/cycles/kernel/geom/geom_primitive.h b/intern/cycles/kernel/geom/geom_primitive.h
index d2543c5943e..30f12d32355 100644
--- a/intern/cycles/kernel/geom/geom_primitive.h
+++ b/intern/cycles/kernel/geom/geom_primitive.h
@@ -25,16 +25,16 @@ CCL_NAMESPACE_BEGIN
ccl_device float primitive_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float *dx, float *dy)
{
- if(sd->type & PRIMITIVE_ALL_TRIANGLE) {
+ if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) {
return triangle_attribute_float(kg, sd, elem, offset, dx, dy);
}
#ifdef __HAIR__
- else if(sd->type & PRIMITIVE_ALL_CURVE) {
+ else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
return curve_attribute_float(kg, sd, elem, offset, dx, dy);
}
#endif
#ifdef __VOLUME__
- else if(sd->object != OBJECT_NONE && elem == ATTR_ELEMENT_VOXEL) {
+ else if(ccl_fetch(sd, object) != OBJECT_NONE && elem == ATTR_ELEMENT_VOXEL) {
return volume_attribute_float(kg, sd, elem, offset, dx, dy);
}
#endif
@@ -47,16 +47,16 @@ ccl_device float primitive_attribute_float(KernelGlobals *kg, const ShaderData *
ccl_device float3 primitive_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float3 *dx, float3 *dy)
{
- if(sd->type & PRIMITIVE_ALL_TRIANGLE) {
+ if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) {
return triangle_attribute_float3(kg, sd, elem, offset, dx, dy);
}
#ifdef __HAIR__
- else if(sd->type & PRIMITIVE_ALL_CURVE) {
+ else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
return curve_attribute_float3(kg, sd, elem, offset, dx, dy);
}
#endif
#ifdef __VOLUME__
- else if(sd->object != OBJECT_NONE && elem == ATTR_ELEMENT_VOXEL) {
+ else if(ccl_fetch(sd, object) != OBJECT_NONE && elem == ATTR_ELEMENT_VOXEL) {
return volume_attribute_float3(kg, sd, elem, offset, dx, dy);
}
#endif
@@ -108,9 +108,9 @@ ccl_device bool primitive_ptex(KernelGlobals *kg, ShaderData *sd, float2 *uv, in
ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
{
#ifdef __HAIR__
- if(sd->type & PRIMITIVE_ALL_CURVE)
+ if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)
#ifdef __DPDU__
- return normalize(sd->dPdu);
+ return normalize(ccl_fetch(sd, dPdu));
#else
return make_float3(0.0f, 0.0f, 0.0f);
#endif
@@ -124,12 +124,12 @@ ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
float3 data = primitive_attribute_float3(kg, sd, attr_elem, attr_offset, NULL, NULL);
data = make_float3(-(data.y - 0.5f), (data.x - 0.5f), 0.0f);
object_normal_transform(kg, sd, &data);
- return cross(sd->N, normalize(cross(data, sd->N)));
+ return cross(ccl_fetch(sd, N), normalize(cross(data, ccl_fetch(sd, N))));
}
else {
/* otherwise use surface derivatives */
#ifdef __DPDU__
- return normalize(sd->dPdu);
+ return normalize(ccl_fetch(sd, dPdu));
#else
return make_float3(0.0f, 0.0f, 0.0f);
#endif
@@ -144,16 +144,16 @@ ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
float3 center;
#ifdef __HAIR__
- bool is_curve_primitive = sd->type & PRIMITIVE_ALL_CURVE;
+ bool is_curve_primitive = ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE;
if(is_curve_primitive) {
center = curve_motion_center_location(kg, sd);
- if(!(sd->flag & SD_TRANSFORM_APPLIED))
+ if(!(ccl_fetch(sd, flag) & SD_TRANSFORM_APPLIED))
object_position_transform(kg, sd, &center);
}
else
#endif
- center = sd->P;
+ center = ccl_fetch(sd, P);
float3 motion_pre = center, motion_post = center;
@@ -164,16 +164,16 @@ ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
if(offset != ATTR_STD_NOT_FOUND) {
/* get motion info */
int numverts, numkeys;
- object_motion_info(kg, sd->object, NULL, &numverts, &numkeys);
+ object_motion_info(kg, ccl_fetch(sd, object), NULL, &numverts, &numkeys);
/* lookup attributes */
- int offset_next = (sd->type & PRIMITIVE_ALL_TRIANGLE)? offset + numverts: offset + numkeys;
+ int offset_next = (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)? offset + numverts: offset + numkeys;
motion_pre = primitive_attribute_float3(kg, sd, elem, offset, NULL, NULL);
motion_post = primitive_attribute_float3(kg, sd, elem, offset_next, NULL, NULL);
#ifdef __HAIR__
- if(is_curve_primitive && (sd->flag & SD_OBJECT_HAS_VERTEX_MOTION) == 0) {
+ if(is_curve_primitive && (ccl_fetch(sd, flag) & SD_OBJECT_HAS_VERTEX_MOTION) == 0) {
object_position_transform(kg, sd, &motion_pre);
object_position_transform(kg, sd, &motion_post);
}
@@ -184,10 +184,10 @@ ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
* transformation was set match the world/object space of motion_pre/post */
Transform tfm;
- tfm = object_fetch_vector_transform(kg, sd->object, OBJECT_VECTOR_MOTION_PRE);
+ tfm = object_fetch_vector_transform(kg, ccl_fetch(sd, object), OBJECT_VECTOR_MOTION_PRE);
motion_pre = transform_point(&tfm, motion_pre);
- tfm = object_fetch_vector_transform(kg, sd->object, OBJECT_VECTOR_MOTION_POST);
+ tfm = object_fetch_vector_transform(kg, ccl_fetch(sd, object), OBJECT_VECTOR_MOTION_POST);
motion_post = transform_point(&tfm, motion_post);
float3 motion_center;
diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h
index dd3928682e3..995dfac5b09 100644
--- a/intern/cycles/kernel/geom/geom_triangle.h
+++ b/intern/cycles/kernel/geom/geom_triangle.h
@@ -27,14 +27,14 @@ CCL_NAMESPACE_BEGIN
ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd)
{
/* load triangle vertices */
- float4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
+ float4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x)));
float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y)));
float3 v2 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z)));
/* return normal */
- if(sd->flag & SD_NEGATIVE_SCALE_APPLIED)
+ if(ccl_fetch(sd, flag) & SD_NEGATIVE_SCALE_APPLIED)
return normalize(cross(v2 - v0, v1 - v0));
else
return normalize(cross(v1 - v0, v2 - v0));
@@ -94,7 +94,7 @@ ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, flo
/* Ray differentials on triangle */
-ccl_device_inline void triangle_dPdudv(KernelGlobals *kg, int prim, float3 *dPdu, float3 *dPdv)
+ccl_device_inline void triangle_dPdudv(KernelGlobals *kg, int prim, ccl_addr_space float3 *dPdu, ccl_addr_space float3 *dPdv)
{
/* fetch triangle vertex coordinates */
float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
@@ -116,34 +116,34 @@ ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *s
if(dx) *dx = 0.0f;
if(dy) *dy = 0.0f;
- return kernel_tex_fetch(__attributes_float, offset + sd->prim);
+ return kernel_tex_fetch(__attributes_float, offset + ccl_fetch(sd, prim));
}
else if(elem == ATTR_ELEMENT_VERTEX || elem == ATTR_ELEMENT_VERTEX_MOTION) {
- float4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
+ float4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
float f0 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.x));
float f1 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.y));
float f2 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.z));
#ifdef __RAY_DIFFERENTIALS__
- if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
- if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
+ if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
+ if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
#endif
- return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
+ return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
}
else if(elem == ATTR_ELEMENT_CORNER) {
- int tri = offset + sd->prim*3;
+ int tri = offset + ccl_fetch(sd, prim)*3;
float f0 = kernel_tex_fetch(__attributes_float, tri + 0);
float f1 = kernel_tex_fetch(__attributes_float, tri + 1);
float f2 = kernel_tex_fetch(__attributes_float, tri + 2);
#ifdef __RAY_DIFFERENTIALS__
- if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
- if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
+ if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
+ if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
#endif
- return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
+ return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
}
else {
if(dx) *dx = 0.0f;
@@ -159,24 +159,24 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData
if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
- return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + sd->prim));
+ return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + ccl_fetch(sd, prim)));
}
else if(elem == ATTR_ELEMENT_VERTEX || elem == ATTR_ELEMENT_VERTEX_MOTION) {
- float4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
+ float4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.x)));
float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.y)));
float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.z)));
#ifdef __RAY_DIFFERENTIALS__
- if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
- if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
+ if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
+ if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
#endif
- return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
+ return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
}
else if(elem == ATTR_ELEMENT_CORNER || elem == ATTR_ELEMENT_CORNER_BYTE) {
- int tri = offset + sd->prim*3;
+ int tri = offset + ccl_fetch(sd, prim)*3;
float3 f0, f1, f2;
if(elem == ATTR_ELEMENT_CORNER) {
@@ -191,11 +191,11 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData
}
#ifdef __RAY_DIFFERENTIALS__
- if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
- if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
+ if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
+ if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
#endif
- return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
+ return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
}
else {
if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
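
The derivative expressions repeated throughout this file follow from differentiating the barycentric interpolation itself: with w = 1 - u - v the attribute is f = u*f0 + v*f1 + w*f2, so df/dx = du.dx*f0 + dv.dx*f1 - (du.dx + dv.dx)*f2, and likewise for dy. A hypothetical helper making that explicit (the kernel keeps the expression inlined):

/* Hypothetical helper: screen-space x-derivative of a barycentrically
 * interpolated attribute, matching the inlined expressions above. */
ccl_device_inline float attribute_ddx(float f0, float f1, float f2,
                                      differential du, differential dv)
{
	/* d/dx of u*f0 + v*f1 + (1 - u - v)*f2 */
	return du.dx*f0 + dv.dx*f1 - (du.dx + dv.dx)*f2;
}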
diff --git a/intern/cycles/kernel/kernel.cl b/intern/cycles/kernel/kernel.cl
index 5a47260a4ee..19e394936ee 100644
--- a/intern/cycles/kernel/kernel.cl
+++ b/intern/cycles/kernel/kernel.cl
@@ -25,6 +25,8 @@
#include "kernel_path.h"
#include "kernel_bake.h"
+#ifdef __COMPILE_ONLY_MEGAKERNEL__
+
__kernel void kernel_ocl_path_trace(
ccl_constant KernelData *data,
ccl_global float *buffer,
@@ -52,17 +54,18 @@ __kernel void kernel_ocl_path_trace(
kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
}
-__kernel void kernel_ocl_convert_to_byte(
+#else // __COMPILE_ONLY_MEGAKERNEL__
+
+__kernel void kernel_ocl_shader(
ccl_constant KernelData *data,
- ccl_global uchar4 *rgba,
- ccl_global float *buffer,
+ ccl_global uint4 *input,
+ ccl_global float4 *output,
#define KERNEL_TEX(type, ttype, name) \
ccl_global type *name,
#include "kernel_textures.h"
- float sample_scale,
- int sx, int sy, int sw, int sh, int offset, int stride)
+ int type, int sx, int sw, int offset, int sample)
{
KernelGlobals kglobals, *kg = &kglobals;
@@ -73,23 +76,21 @@ __kernel void kernel_ocl_convert_to_byte(
#include "kernel_textures.h"
int x = sx + get_global_id(0);
- int y = sy + get_global_id(1);
- if(x < sx + sw && y < sy + sh)
- kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+ if(x < sx + sw)
+ kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x, sample);
}
-__kernel void kernel_ocl_convert_to_half_float(
+__kernel void kernel_ocl_bake(
ccl_constant KernelData *data,
- ccl_global uchar4 *rgba,
- ccl_global float *buffer,
+ ccl_global uint4 *input,
+ ccl_global float4 *output,
#define KERNEL_TEX(type, ttype, name) \
ccl_global type *name,
#include "kernel_textures.h"
- float sample_scale,
- int sx, int sy, int sw, int sh, int offset, int stride)
+ int type, int sx, int sw, int offset, int sample)
{
KernelGlobals kglobals, *kg = &kglobals;
@@ -100,22 +101,22 @@ __kernel void kernel_ocl_convert_to_half_float(
#include "kernel_textures.h"
int x = sx + get_global_id(0);
- int y = sy + get_global_id(1);
- if(x < sx + sw && y < sy + sh)
- kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+ if(x < sx + sw)
+ kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, x, offset, sample);
}
-__kernel void kernel_ocl_shader(
+__kernel void kernel_ocl_convert_to_byte(
ccl_constant KernelData *data,
- ccl_global uint4 *input,
- ccl_global float4 *output,
+ ccl_global uchar4 *rgba,
+ ccl_global float *buffer,
#define KERNEL_TEX(type, ttype, name) \
ccl_global type *name,
#include "kernel_textures.h"
- int type, int sx, int sw, int offset, int sample)
+ float sample_scale,
+ int sx, int sy, int sw, int sh, int offset, int stride)
{
KernelGlobals kglobals, *kg = &kglobals;
@@ -126,21 +127,23 @@ __kernel void kernel_ocl_shader(
#include "kernel_textures.h"
int x = sx + get_global_id(0);
+ int y = sy + get_global_id(1);
- if(x < sx + sw)
- kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x, sample);
+ if(x < sx + sw && y < sy + sh)
+ kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
}
-__kernel void kernel_ocl_bake(
+__kernel void kernel_ocl_convert_to_half_float(
ccl_constant KernelData *data,
- ccl_global uint4 *input,
- ccl_global float4 *output,
+ ccl_global uchar4 *rgba,
+ ccl_global float *buffer,
#define KERNEL_TEX(type, ttype, name) \
ccl_global type *name,
#include "kernel_textures.h"
- int type, int sx, int sw, int offset, int sample)
+ float sample_scale,
+ int sx, int sy, int sw, int sh, int offset, int stride)
{
KernelGlobals kglobals, *kg = &kglobals;
@@ -151,8 +154,10 @@ __kernel void kernel_ocl_bake(
#include "kernel_textures.h"
int x = sx + get_global_id(0);
+ int y = sy + get_global_id(1);
- if(x < sx + sw)
- kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, x, offset, sample);
+ if(x < sx + sw && y < sy + sh)
+ kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
}
+#endif // __COMPILE_ONLY_MEGAKERNEL__
\ No newline at end of file
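
With the __COMPILE_ONLY_MEGAKERNEL__ guard, kernel.cl can now be compiled twice: once into the single megakernel program, and once into the program holding the shader, bake and film-convert entry points that accompany the split kernels. A host-side sketch using the standard OpenCL API (the program and device variable names are illustrative, not the ones used in device_opencl.cpp):

/* Sketch: building the two variants of kernel.cl on the host. */
cl_int err;
err = clBuildProgram(megakernel_program, 1, &device_id,
                     "-D__COMPILE_ONLY_MEGAKERNEL__", NULL, NULL);
/* Without the define, the #else branch with the shader/bake/film
 * entry points is compiled instead. */
err = clBuildProgram(base_program, 1, &device_id, "", NULL, NULL);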
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index 369c615eade..257728b6244 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -176,7 +176,7 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass)
#endif
}
-ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, float3 *throughput,
+ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space float3 *throughput,
BsdfEval *bsdf_eval, float bsdf_pdf, int bounce, int bsdf_label)
{
float inverse_pdf = 1.0f/bsdf_pdf;
diff --git a/intern/cycles/kernel/kernel_background_buffer_update.cl b/intern/cycles/kernel/kernel_background_buffer_update.cl
new file mode 100644
index 00000000000..bf08477cfbf
--- /dev/null
+++ b/intern/cycles/kernel/kernel_background_buffer_update.cl
@@ -0,0 +1,282 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split.h"
+
+/*
+ * Note on the kernel_ocl_path_trace_background_buffer_update kernel.
+ * This is the fourth kernel in the ray-tracing logic, and the third
+ * of the path-iteration kernels. This kernel takes care of rays that hit
+ * the background (as flagged by the scene-intersect kernel), and for rays in
+ * state RAY_UPDATE_BUFFER it updates the ray's accumulated radiance in
+ * the output buffer. This kernel also takes care of rays that have been marked
+ * for regeneration.
+ *
+ * We empty the QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue in this kernel.
+ *
+ * Typically, all rays in state RAY_HIT_BACKGROUND or RAY_UPDATE_BUFFER
+ * will eventually be set to the RAY_TO_REGENERATE state in this kernel. Finally, all rays with ray_state
+ * RAY_TO_REGENERATE are regenerated and put in the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS.
+ *
+ * The input and output are as follows,
+ *
+ * rng_coop ---------------------------------------------|--- kernel_ocl_path_trace_background_buffer_update --|--- PathRadiance_coop
+ * throughput_coop --------------------------------------| |--- L_transparent_coop
+ * per_sample_output_buffers ----------------------------| |--- per_sample_output_buffers
+ * Ray_coop ---------------------------------------------| |--- ray_state
+ * PathState_coop ---------------------------------------| |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * L_transparent_coop -----------------------------------| |--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS)
+ * ray_state --------------------------------------------| |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ----| |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS)
+ * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------| |--- work_array
+ * parallel_samples -------------------------------------| |--- PathState_coop
+ * end_sample -------------------------------------------| |--- throughput_coop
+ * kg (globals + data) ----------------------------------| |--- rng_coop
+ * rng_state --------------------------------------------| |--- Ray
+ * PathRadiance_coop ------------------------------------| |
+ * sw ---------------------------------------------------| |
+ * sh ---------------------------------------------------| |
+ * sx ---------------------------------------------------| |
+ * sy ---------------------------------------------------| |
+ * stride -----------------------------------------------| |
+ * work_array -------------------------------------------| |--- work_array
+ * queuesize --------------------------------------------| |
+ * start_sample -----------------------------------------| |--- work_pool_wgs
+ * work_pool_wgs ----------------------------------------| |
+ * num_samples ------------------------------------------| |
+ *
+ * Note on shader_data: the shader_data argument is neither an input nor an output of this kernel; it is filled and consumed within the kernel itself.
+ * Note on Queues:
+ * This kernel fetches rays from QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
+ *
+ * State of queues when this kernel is called :
+ * At entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND, RAY_TO_REGENERATE rays
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty
+ */
+__kernel void kernel_ocl_path_trace_background_buffer_update(
+ ccl_global char *globals,
+ ccl_constant KernelData *data,
+ ccl_global char *shader_data,
+ ccl_global float *per_sample_output_buffers,
+ ccl_global uint *rng_state,
+ ccl_global uint *rng_coop, /* Required for buffer Update */
+ ccl_global float3 *throughput_coop, /* Required for background hit processing */
+ PathRadiance *PathRadiance_coop, /* Required for background hit processing and buffer Update */
+ ccl_global Ray *Ray_coop, /* Required for background hit processing */
+ ccl_global PathState *PathState_coop, /* Required for background hit processing */
+ ccl_global float *L_transparent_coop, /* Required for background hit processing and buffer Update */
+ ccl_global char *ray_state, /* Stores information on the current state of a ray */
+ int sw, int sh, int sx, int sy, int stride,
+ int rng_state_offset_x,
+ int rng_state_offset_y,
+ int rng_state_stride,
+ ccl_global unsigned int *work_array, /* Denotes work of each ray */
+ ccl_global int *Queue_data, /* Queues memory */
+ ccl_global int *Queue_index, /* Tracks the number of elements in each queue */
+ int queuesize, /* Size (capacity) of each queue */
+ int end_sample,
+ int start_sample,
+#ifdef __WORK_STEALING__
+ ccl_global unsigned int *work_pool_wgs,
+ unsigned int num_samples,
+#endif
+#ifdef __KERNEL_DEBUG__
+ DebugData *debugdata_coop,
+#endif
+ int parallel_samples /* Number of samples to be processed in parallel */
+ )
+{
+ ccl_local unsigned int local_queue_atomics;
+ if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+ local_queue_atomics = 0;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+ if(ray_index == 0) {
+ /* We will empty this queue in this kernel */
+ Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
+ }
+ char enqueue_flag = 0;
+ ray_index = get_ray_index(ray_index, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, Queue_data, queuesize, 1);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+ /* If we are executing on a GPU device, we exit all threads that are not required.
+ * If we are executing on a CPU device, then we need to keep all threads active
+ * since we have barrier() calls later in the kernel. CPU devices
+ * expect all threads to execute the barrier statement.
+ */
+ if(ray_index == QUEUE_EMPTY_SLOT)
+ return;
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+ if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+ /* Load the kernel globals structure and the ShaderData structure */
+ KernelGlobals *kg = (KernelGlobals *)globals;
+ ShaderData *sd = (ShaderData *)shader_data;
+
+#ifdef __KERNEL_DEBUG__
+ DebugData *debug_data = &debugdata_coop[ray_index];
+#endif
+ ccl_global PathState *state = &PathState_coop[ray_index];
+ PathRadiance *L = &PathRadiance_coop[ray_index];
+ ccl_global Ray *ray = &Ray_coop[ray_index];
+ ccl_global float3 *throughput = &throughput_coop[ray_index];
+ ccl_global float *L_transparent = &L_transparent_coop[ray_index];
+ ccl_global uint *rng = &rng_coop[ray_index];
+
+#ifdef __WORK_STEALING__
+ unsigned int my_work;
+ ccl_global float *initial_per_sample_output_buffers;
+ ccl_global uint *initial_rng;
+#endif
+ unsigned int sample;
+ unsigned int tile_x;
+ unsigned int tile_y;
+ unsigned int pixel_x;
+ unsigned int pixel_y;
+ unsigned int my_sample_tile;
+
+#ifdef __WORK_STEALING__
+ my_work = work_array[ray_index];
+ sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
+ get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index);
+ my_sample_tile = 0;
+ initial_per_sample_output_buffers = per_sample_output_buffers;
+ initial_rng = rng_state;
+#else // __WORK_STEALING__
+ sample = work_array[ray_index];
+ int tile_index = ray_index / parallel_samples;
+ /* The buffer and rng_state strides are "stride"; find x and y using ray_index */
+ tile_x = tile_index % sw;
+ tile_y = tile_index / sw;
+ my_sample_tile = ray_index - (tile_index * parallel_samples);
+#endif
+ rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride;
+ per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
+
+ if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+ /* eval background shader if nothing hit */
+ if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) {
+ *L_transparent = (*L_transparent) + average((*throughput));
+#ifdef __PASSES__
+ if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
+#endif
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+ }
+
+ if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND))
+ {
+#ifdef __BACKGROUND__
+ /* sample background shader */
+ float3 L_background = indirect_background(kg, state, ray, sd);
+ path_radiance_accum_background(L, (*throughput), L_background, state->bounce);
+#endif
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+ }
+ }
+
+ if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
+ float3 L_sum = path_radiance_clamp_and_sum(kg, L);
+ kernel_write_light_passes(kg, per_sample_output_buffers, L, sample);
+#ifdef __KERNEL_DEBUG__
+ kernel_write_debug_passes(kg, per_sample_output_buffers, state, debug_data, sample);
+#endif
+ float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent));
+
+ /* accumulate result in output buffer */
+ kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad);
+ path_rng_end(kg, rng_state, *rng);
+
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
+ }
+
+ if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
+#ifdef __WORK_STEALING__
+ /* We have completed the current work, so get the next work item */
+ int valid_work = get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index);
+ if(!valid_work) {
+ /* If work is invalid, this means no more work is available and the thread may exit */
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
+ }
+#else
+ if((sample + parallel_samples) >= end_sample) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
+ }
+#endif
+ if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
+#ifdef __WORK_STEALING__
+ work_array[ray_index] = my_work;
+ /* Get the sample associated with the current work */
+ sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
+ /* Get pixel and tile position associated with current work */
+ get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index);
+ my_sample_tile = 0;
+
+ /* Remap rng_state according to the current work */
+ rng_state = initial_rng + ((rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride);
+ /* Remap per_sample_output_buffers according to the current work */
+ per_sample_output_buffers = initial_per_sample_output_buffers
+ + (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
+#else
+ work_array[ray_index] = sample + parallel_samples;
+ sample = work_array[ray_index];
+
+ /* Get ray position from ray index */
+ pixel_x = sx + ((ray_index / parallel_samples) % sw);
+ pixel_y = sy + ((ray_index / parallel_samples) / sw);
+#endif
+
+ /* initialize random numbers and ray */
+ kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, rng, ray);
+
+ if(ray->t != 0.0f) {
+ /* Initialize throughput, L_transparent, Ray, PathState; these rays proceed with path iteration */
+ *throughput = make_float3(1.0f, 1.0f, 1.0f);
+ *L_transparent = 0.0f;
+ path_radiance_init(L, kernel_data.film.use_light_pass);
+ path_state_init(kg, state, rng, sample, ray);
+#ifdef __KERNEL_DEBUG__
+ debug_data_init(debug_data);
+#endif
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+ enqueue_flag = 1;
+ } else {
+ /* These rays do not participate in path iteration */
+ float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ /* accumulate result in output buffer */
+ kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad);
+ path_rng_end(kg, rng_state, *rng);
+
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
+ }
+ }
+ }
+#ifndef __COMPUTE_DEVICE_GPU__
+ }
+#endif
+
+ /* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS; these rays
+ * will be made active during the next scene-intersect kernel invocation
+ */
+ enqueue_ray_index_local(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, enqueue_flag, queuesize, &local_queue_atomics, Queue_data, Queue_index);
+}
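
The enqueue_ray_index_local() call at the end is the work-group-level enqueue used by all split kernels: each enqueuing thread first takes a slot via a local atomic, one thread then reserves a contiguous block in the global queue, and every thread writes its ray index into its reserved slot. A minimal sketch of the pattern, assuming helper shapes similar to kernel_queues.h (the real implementation lives in that header):

ccl_device void enqueue_ray_index_local_sketch(int ray_index, int queue_number,
                                               char enqueue_flag, int queuesize,
                                               ccl_local unsigned int *local_queue_atomics,
                                               ccl_global int *queue_data,
                                               ccl_global int *queue_index)
{
	/* Per-thread slot within this work-group's batch of enqueues. */
	unsigned int my_lqidx = enqueue_flag ? atomic_inc(local_queue_atomics) : 0;
	barrier(CLK_LOCAL_MEM_FENCE);

	/* One thread reserves the group's block in the global queue; the old
	 * global count becomes the group's base offset. */
	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
		*local_queue_atomics = atomic_add(&queue_index[queue_number],
		                                  *local_queue_atomics);
	}
	barrier(CLK_LOCAL_MEM_FENCE);

	/* Write the ray index into the reserved slot. */
	if(enqueue_flag) {
		unsigned int my_gqidx = *local_queue_atomics + my_lqidx;
		queue_data[queue_number * queuesize + my_gqidx] = ray_index;
	}
}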
diff --git a/intern/cycles/kernel/kernel_camera.h b/intern/cycles/kernel/kernel_camera.h
index 1e81210007c..3ce5134181a 100644
--- a/intern/cycles/kernel/kernel_camera.h
+++ b/intern/cycles/kernel/kernel_camera.h
@@ -39,7 +39,7 @@ ccl_device float2 camera_sample_aperture(KernelGlobals *kg, float u, float v)
return bokeh;
}
-ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, Ray *ray)
+ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, ccl_addr_space Ray *ray)
{
/* create ray from raster position */
Transform rastertocamera = kernel_data.cam.rastertocamera;
@@ -108,8 +108,7 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, flo
}
/* Orthographic Camera */
-
-ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, Ray *ray)
+ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, ccl_addr_space Ray *ray)
{
/* create ray from raster position */
Transform rastertocamera = kernel_data.cam.rastertocamera;
@@ -175,7 +174,7 @@ ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, fl
/* Panorama Camera */
-ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, Ray *ray)
+ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, ccl_addr_space Ray *ray)
{
Transform rastertocamera = kernel_data.cam.rastertocamera;
float3 Pcamera = transform_perspective(&rastertocamera, make_float3(raster_x, raster_y, 0.0f));
@@ -256,7 +255,7 @@ ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float
/* Common */
ccl_device void camera_sample(KernelGlobals *kg, int x, int y, float filter_u, float filter_v,
- float lens_u, float lens_v, float time, Ray *ray)
+ float lens_u, float lens_v, float time, ccl_addr_space Ray *ray)
{
/* pixel filter */
int filter_table_offset = kernel_data.film.filter_table_offset;
@@ -319,7 +318,7 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd,
{
if(kernel_data.cam.type != CAMERA_PANORAMA) {
/* perspective / ortho */
- if(sd->object == PRIM_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE)
+ if(ccl_fetch(sd, object) == PRIM_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE)
P += camera_position(kg);
Transform tfm = kernel_data.cam.worldtondc;
@@ -329,7 +328,7 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd,
/* panorama */
Transform tfm = kernel_data.cam.worldtocamera;
- if(sd->object != OBJECT_NONE)
+ if(ccl_fetch(sd, object) != OBJECT_NONE)
P = normalize(transform_point(&tfm, P));
else
P = normalize(transform_direction(&tfm, P));
diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h
index b209fff88e6..7a5f70ff3da 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/kernel_compat_cpu.h
@@ -40,6 +40,8 @@
#include "util_half.h"
#include "util_types.h"
+#define ccl_addr_space
+
/* On x86_64, versions of glibc < 2.16 have an issue where expf is
* much slower than the double version. This was fixed in glibc 2.16.
*/
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index 61e208fcab3..9fdd3abfec3 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -41,6 +41,7 @@
#define ccl_global
#define ccl_constant
#define ccl_may_alias
+#define ccl_addr_space
/* No assert supported for CUDA */
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
index 12b0f117600..e8b36d2605d 100644
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -40,6 +40,12 @@
#define ccl_local __local
#define ccl_private __private
+#ifdef __SPLIT_KERNEL__
+#define ccl_addr_space __global
+#else
+#define ccl_addr_space
+#endif
+
/* Selective nodes compilation. */
#ifndef __NODES_MAX_GROUP__
# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
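
The practical effect of ccl_addr_space: one source-level function signature compiles to a __global-pointer signature inside the split kernel and to an ordinary private-pointer signature everywhere else. A small illustration (scale_throughput is invented for this example):

/* Sketch: one source signature, two compiled signatures. */
ccl_device void scale_throughput(ccl_addr_space float3 *throughput, float f)
{
	*throughput *= f;  /* valid for either address space */
}
/* __SPLIT_KERNEL__ defined:  void scale_throughput(__global float3 *t, float f)
 * otherwise:                 void scale_throughput(float3 *t, float f)       */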
diff --git a/intern/cycles/kernel/kernel_data_init.cl b/intern/cycles/kernel/kernel_data_init.cl
new file mode 100644
index 00000000000..dbf9e62ccbf
--- /dev/null
+++ b/intern/cycles/kernel/kernel_data_init.cl
@@ -0,0 +1,384 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split.h"
+
+/*
+ * Note on the kernel_ocl_path_trace_data_initialization kernel.
+ * This kernel initializes the structures needed by the path-iteration kernels.
+ * It is the first kernel in the ray-tracing logic.
+ *
+ * The ray state of rays outside the tile boundary will be marked RAY_INACTIVE.
+ *
+ * Its input and output are as follows,
+ *
+ * Un-initialized rng---------------|--- kernel_ocl_path_trace_data_initialization ---|--- Initialized rng
+ * Un-initialized throughput -------| |--- Initialized throughput
+ * Un-initialized L_transparent ----| |--- Initialized L_transparent
+ * Un-initialized PathRadiance -----| |--- Initialized PathRadiance
+ * Un-initialized Ray --------------| |--- Initialized Ray
+ * Un-initialized PathState --------| |--- Initialized PathState
+ * Un-initialized QueueData --------| |--- Initialized QueueData (to QUEUE_EMPTY_SLOT)
+ * Un-initialized QueueIndex -------| |--- Initialized QueueIndex (to 0)
+ * Un-initialized use_queues_flag---| |--- Initialized use_queues_flag (to false)
+ * Un-initialized ray_state --------| |--- Initialized ray_state
+ * parallel_samples ----------------| |--- Initialized per_sample_output_buffers
+ * rng_state -----------------------| |--- Initialized work_array
+ * data ----------------------------| |--- Initialized work_pool_wgs
+ * start_sample --------------------| |
+ * sx ------------------------------| |
+ * sy ------------------------------| |
+ * sw ------------------------------| |
+ * sh ------------------------------| |
+ * stride --------------------------| |
+ * queuesize -----------------------| |
+ * num_samples ---------------------| |
+ *
+ * Note on Queues:
+ * All slots in the queues are initialized to QUEUE_EMPTY_SLOT;
+ * the number of elements in each queue is initialized to 0.
+ */
+__kernel void kernel_ocl_path_trace_data_initialization(
+ ccl_global char *globals,
+ ccl_global char *shader_data_sd, /* Arguments related to ShaderData */
+ ccl_global char *shader_data_sd_DL_shadow, /* Arguments related to ShaderData */
+
+ ccl_global float3 *P_sd,
+ ccl_global float3 *P_sd_DL_shadow,
+
+ ccl_global float3 *N_sd,
+ ccl_global float3 *N_sd_DL_shadow,
+
+ ccl_global float3 *Ng_sd,
+ ccl_global float3 *Ng_sd_DL_shadow,
+
+ ccl_global float3 *I_sd,
+ ccl_global float3 *I_sd_DL_shadow,
+
+ ccl_global int *shader_sd,
+ ccl_global int *shader_sd_DL_shadow,
+
+ ccl_global int *flag_sd,
+ ccl_global int *flag_sd_DL_shadow,
+
+ ccl_global int *prim_sd,
+ ccl_global int *prim_sd_DL_shadow,
+
+ ccl_global int *type_sd,
+ ccl_global int *type_sd_DL_shadow,
+
+ ccl_global float *u_sd,
+ ccl_global float *u_sd_DL_shadow,
+
+ ccl_global float *v_sd,
+ ccl_global float *v_sd_DL_shadow,
+
+ ccl_global int *object_sd,
+ ccl_global int *object_sd_DL_shadow,
+
+ ccl_global float *time_sd,
+ ccl_global float *time_sd_DL_shadow,
+
+ ccl_global float *ray_length_sd,
+ ccl_global float *ray_length_sd_DL_shadow,
+
+ ccl_global int *ray_depth_sd,
+ ccl_global int *ray_depth_sd_DL_shadow,
+
+ ccl_global int *transparent_depth_sd,
+ ccl_global int *transparent_depth_sd_DL_shadow,
+ #ifdef __RAY_DIFFERENTIALS__
+ ccl_global differential3 *dP_sd,
+ ccl_global differential3 *dP_sd_DL_shadow,
+
+ ccl_global differential3 *dI_sd,
+ ccl_global differential3 *dI_sd_DL_shadow,
+
+ ccl_global differential *du_sd,
+ ccl_global differential *du_sd_DL_shadow,
+
+ ccl_global differential *dv_sd,
+ ccl_global differential *dv_sd_DL_shadow,
+ #endif
+ #ifdef __DPDU__
+ ccl_global float3 *dPdu_sd,
+ ccl_global float3 *dPdu_sd_DL_shadow,
+
+ ccl_global float3 *dPdv_sd,
+ ccl_global float3 *dPdv_sd_DL_shadow,
+ #endif
+ ShaderClosure *closure_sd,
+ ShaderClosure *closure_sd_DL_shadow,
+
+ ccl_global int *num_closure_sd,
+ ccl_global int *num_closure_sd_DL_shadow,
+
+ ccl_global float *randb_closure_sd,
+ ccl_global float *randb_closure_sd_DL_shadow,
+
+ ccl_global float3 *ray_P_sd,
+ ccl_global float3 *ray_P_sd_DL_shadow,
+
+ ccl_global differential3 *ray_dP_sd,
+ ccl_global differential3 *ray_dP_sd_DL_shadow,
+
+ ccl_constant KernelData *data,
+ ccl_global float *per_sample_output_buffers,
+ ccl_global uint *rng_state,
+ ccl_global uint *rng_coop, /* rng array to store rng values for all rays */
+ ccl_global float3 *throughput_coop, /* throughput array to store throughput values for all rays */
+ ccl_global float *L_transparent_coop, /* L_transparent array to store L_transparent values for all rays */
+ PathRadiance *PathRadiance_coop, /* PathRadiance array to store PathRadiance values for all rays */
+ ccl_global Ray *Ray_coop, /* Ray array to store Ray information for all rays */
+ ccl_global PathState *PathState_coop, /* PathState array to store PathState information for all rays */
+ ccl_global char *ray_state, /* Stores information on current state of a ray */
+
+#define KERNEL_TEX(type, ttype, name) \
+ ccl_global type *name,
+#include "kernel_textures.h"
+
+ int start_sample, int sx, int sy, int sw, int sh, int offset, int stride,
+ int rng_state_offset_x,
+ int rng_state_offset_y,
+ int rng_state_stride,
+ ccl_global int *Queue_data, /* Memory for queues */
+ ccl_global int *Queue_index, /* Tracks the number of elements in queues */
+ int queuesize, /* size (capacity) of the queue */
+ ccl_global char *use_queues_flag, /* flag to decide if scene-intersect kernel should use queues to fetch ray index */
+ ccl_global unsigned int *work_array, /* work array to store which work each ray belongs to */
+#ifdef __WORK_STEALING__
+ ccl_global unsigned int *work_pool_wgs, /* Work pool for each work group */
+ unsigned int num_samples, /* Total number of samples per pixel */
+#endif
+#ifdef __KERNEL_DEBUG__
+ DebugData *debugdata_coop,
+#endif
+ int parallel_samples /* Number of samples to be processed in parallel */
+ )
+{
+
+ /* Load kernel globals structure */
+ KernelGlobals *kg = (KernelGlobals *)globals;
+
+ kg->data = data;
+#define KERNEL_TEX(type, ttype, name) \
+ kg->name = name;
+#include "kernel_textures.h"
+
+ /* Load ShaderData structure */
+ ShaderData *sd = (ShaderData *)shader_data_sd;
+ ShaderData *sd_DL_shadow = (ShaderData *)shader_data_sd_DL_shadow;
+
+ sd->P = P_sd;
+ sd_DL_shadow->P = P_sd_DL_shadow;
+
+ sd->N = N_sd;
+ sd_DL_shadow->N = N_sd_DL_shadow;
+
+ sd->Ng = Ng_sd;
+ sd_DL_shadow->Ng = Ng_sd_DL_shadow;
+
+ sd->I = I_sd;
+ sd_DL_shadow->I = I_sd_DL_shadow;
+
+ sd->shader = shader_sd;
+ sd_DL_shadow->shader = shader_sd_DL_shadow;
+
+ sd->flag = flag_sd;
+ sd_DL_shadow->flag = flag_sd_DL_shadow;
+
+ sd->prim = prim_sd;
+ sd_DL_shadow->prim = prim_sd_DL_shadow;
+
+ sd->type = type_sd;
+ sd_DL_shadow->type = type_sd_DL_shadow;
+
+ sd->u = u_sd;
+ sd_DL_shadow->u = u_sd_DL_shadow;
+
+ sd->v = v_sd;
+ sd_DL_shadow->v = v_sd_DL_shadow;
+
+ sd->object = object_sd;
+ sd_DL_shadow->object = object_sd_DL_shadow;
+
+ sd->time = time_sd;
+ sd_DL_shadow->time = time_sd_DL_shadow;
+
+ sd->ray_length = ray_length_sd;
+ sd_DL_shadow->ray_length = ray_length_sd_DL_shadow;
+
+ sd->ray_depth = ray_depth_sd;
+ sd_DL_shadow->ray_depth = ray_depth_sd_DL_shadow;
+
+ sd->transparent_depth = transparent_depth_sd;
+ sd_DL_shadow->transparent_depth = transparent_depth_sd_DL_shadow;
+
+#ifdef __RAY_DIFFERENTIALS__
+ sd->dP = dP_sd;
+ sd_DL_shadow->dP = dP_sd_DL_shadow;
+
+ sd->dI = dI_sd;
+ sd_DL_shadow->dI = dI_sd_DL_shadow;
+
+ sd->du = du_sd;
+ sd_DL_shadow->du = du_sd_DL_shadow;
+
+ sd->dv = dv_sd;
+ sd_DL_shadow->dv = dv_sd_DL_shadow;
+#ifdef __DPDU__
+ sd->dPdu = dPdu_sd;
+ sd_DL_shadow->dPdu = dPdu_sd_DL_shadow;
+
+ sd->dPdv = dPdv_sd;
+ sd_DL_shadow->dPdv = dPdv_sd_DL_shadow;
+#endif
+#endif
+
+ sd->closure = closure_sd;
+ sd_DL_shadow->closure = closure_sd_DL_shadow;
+
+ sd->num_closure = num_closure_sd;
+ sd_DL_shadow->num_closure = num_closure_sd_DL_shadow;
+
+ sd->randb_closure = randb_closure_sd;
+ sd_DL_shadow->randb_closure = randb_closure_sd_DL_shadow;
+
+ sd->ray_P = ray_P_sd;
+ sd_DL_shadow->ray_P = ray_P_sd_DL_shadow;
+
+ sd->ray_dP = ray_dP_sd;
+ sd_DL_shadow->ray_dP = ray_dP_sd_DL_shadow;
+
+ int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+
+#ifdef __WORK_STEALING__
+ int lid = get_local_id(1) * get_local_size(0) + get_local_id(0);
+ /* Initialize work_pool_wgs */
+ if(lid == 0) {
+ int group_index = get_group_id(1) * get_num_groups(0) + get_group_id(0);
+ work_pool_wgs[group_index] = 0;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+#endif // __WORK_STEALING__
+
+ /* Initialize queue data and queue index */
+ if(thread_index < queuesize) {
+ /* Initialize active ray queue */
+ Queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+ /* Initialize background and buffer update queue */
+ Queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+ /* Initialize shadow ray cast of AO queue */
+ Queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+ /* Initialize shadow ray cast of direct lighting queue */
+ Queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+ }
+
+ if(thread_index == 0) {
+ Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+ Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
+ Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
+ Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
+ /* The scene-intersect kernel should not use the queues the very first time,
+ * since they would still be empty.
+ */
+ use_queues_flag[0] = 0;
+ }
+
+ int x = get_global_id(0);
+ int y = get_global_id(1);
+
+ if(x < (sw * parallel_samples) && y < sh) {
+
+ int ray_index = x + y * (sw * parallel_samples);
+
+ /* This is the first assignment to ray_state, so we don't use the ASSIGN_RAY_STATE macro */
+ ray_state[ray_index] = RAY_ACTIVE;
+
+ unsigned int my_sample;
+ unsigned int pixel_x;
+ unsigned int pixel_y;
+ unsigned int tile_x;
+ unsigned int tile_y;
+ unsigned int my_sample_tile;
+
+#ifdef __WORK_STEALING__
+ unsigned int my_work = 0;
+ /* get work */
+ get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index);
+ /* Get the sample associated with the work */
+ my_sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
+
+ my_sample_tile = 0;
+
+ /* Get pixel and tile position associated with the work */
+ get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index);
+ work_array[ray_index] = my_work;
+#else // __WORK_STEALING__
+
+ unsigned int tile_index = ray_index / parallel_samples;
+ tile_x = tile_index % sw;
+ tile_y = tile_index / sw;
+ my_sample_tile = ray_index - (tile_index * parallel_samples);
+ my_sample = my_sample_tile + start_sample;
+
+ /* Initialize work array */
+ work_array[ray_index] = my_sample;
+
+ /* Calculate pixel position of this ray */
+ pixel_x = sx + tile_x;
+ pixel_y = sy + tile_y;
+#endif // __WORK_STEALING__
+
+ rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride;
+
+ /* Initialize per_sample_output_buffers to all zeros */
+ per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + (my_sample_tile)) * kernel_data.film.pass_stride;
+ int per_sample_output_buffers_iterator = 0;
+ for(per_sample_output_buffers_iterator = 0; per_sample_output_buffers_iterator < kernel_data.film.pass_stride; per_sample_output_buffers_iterator++) {
+ per_sample_output_buffers[per_sample_output_buffers_iterator] = 0.0f;
+ }
+
+ /* initialize random numbers and ray */
+ kernel_path_trace_setup(kg, rng_state, my_sample, pixel_x, pixel_y, &rng_coop[ray_index], &Ray_coop[ray_index]);
+
+ if(Ray_coop[ray_index].t != 0.0f) {
+ /* Initialize throughput, L_transparent, Ray, PathState; these rays proceed with path iteration */
+ throughput_coop[ray_index] = make_float3(1.0f, 1.0f, 1.0f);
+ L_transparent_coop[ray_index] = 0.0f;
+ path_radiance_init(&PathRadiance_coop[ray_index], kernel_data.film.use_light_pass);
+ path_state_init(kg, &PathState_coop[ray_index], &rng_coop[ray_index], my_sample, &Ray_coop[ray_index]);
+#ifdef __KERNEL_DEBUG__
+ debug_data_init(&debugdata_coop[ray_index]);
+#endif
+ } else {
+ /* These rays do not participate in path iteration */
+
+ float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ /* accumulate result in output buffer */
+ kernel_write_pass_float4(per_sample_output_buffers, my_sample, L_rad);
+ path_rng_end(kg, rng_state, rng_coop[ray_index]);
+
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
+ }
+ }
+
+ /* Mark the rest of the ray-state indices as RAY_INACTIVE */
+ if(thread_index < (get_global_size(0) * get_global_size(1)) - (sh * (sw * parallel_samples))) {
+ /* First assignment, hence we don't use the ASSIGN_RAY_STATE macro */
+ ray_state[((sw * parallel_samples) * sh) + thread_index] = RAY_INACTIVE;
+ }
+}
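
What the long argument list above accomplishes: every ShaderData member is backed by its own per-ray __global array, and data_init stitches the pointers together exactly once, so that a later ccl_fetch(sd, X) resolves to an indexed load. Illustrated with the SD_THREAD indexing assumed in the earlier ccl_fetch sketch:

/* After the wiring above, sd->P names a per-ray array, so */
ccl_fetch(sd, P) = make_float3(0.0f, 0.0f, 0.0f);
/* expands under __SPLIT_KERNEL__ to (roughly) */
sd->P[SD_THREAD] = make_float3(0.0f, 0.0f, 0.0f);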
diff --git a/intern/cycles/kernel/kernel_debug.h b/intern/cycles/kernel/kernel_debug.h
index f532442ba41..94ede397848 100644
--- a/intern/cycles/kernel/kernel_debug.h
+++ b/intern/cycles/kernel/kernel_debug.h
@@ -23,7 +23,7 @@ ccl_device_inline void debug_data_init(DebugData *debug_data)
ccl_device_inline void kernel_write_debug_passes(KernelGlobals *kg,
ccl_global float *buffer,
- PathState *state,
+ ccl_addr_space PathState *state,
DebugData *debug_data,
int sample)
{
diff --git a/intern/cycles/kernel/kernel_differential.h b/intern/cycles/kernel/kernel_differential.h
index e5fbd5b450e..ae1e70f0167 100644
--- a/intern/cycles/kernel/kernel_differential.h
+++ b/intern/cycles/kernel/kernel_differential.h
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
/* See "Tracing Ray Differentials", Homan Igehy, 1999. */
-ccl_device void differential_transfer(differential3 *dP_, const differential3 dP, float3 D, const differential3 dD, float3 Ng, float t)
+ccl_device void differential_transfer(ccl_addr_space differential3 *dP_, const differential3 dP, float3 D, const differential3 dD, float3 Ng, float t)
{
/* ray differential transfer through homogeneous medium, to
* compute dPdx/dy at a shading point from the incoming ray */
@@ -31,7 +31,7 @@ ccl_device void differential_transfer(differential3 *dP_, const differential3 dP
dP_->dy = tmpy - dot(tmpy, Ng)*tmp;
}
-ccl_device void differential_incoming(differential3 *dI, const differential3 dD)
+ccl_device void differential_incoming(ccl_addr_space differential3 *dI, const differential3 dD)
{
/* compute dIdx/dy at a shading point, we just need to negate the
* differential of the ray direction */
@@ -40,7 +40,7 @@ ccl_device void differential_incoming(differential3 *dI, const differential3 dD)
dI->dy = -dD.dy;
}
-ccl_device void differential_dudv(differential *du, differential *dv, float3 dPdu, float3 dPdv, differential3 dP, float3 Ng)
+ccl_device void differential_dudv(ccl_addr_space differential *du, ccl_addr_space differential *dv, float3 dPdu, float3 dPdv, differential3 dP, float3 Ng)
{
/* now we have dPdx/dy from the ray differential transfer, and dPdu/dv
* from the primitive, we can compute dudx/dy and dvdx/dy. these are
diff --git a/intern/cycles/kernel/kernel_direct_lighting.cl b/intern/cycles/kernel/kernel_direct_lighting.cl
new file mode 100644
index 00000000000..8bdc7dc0fd1
--- /dev/null
+++ b/intern/cycles/kernel/kernel_direct_lighting.cl
@@ -0,0 +1,137 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split.h"
+
+/*
+ * Note on kernel_ocl_path_trace_direct_lighting kernel.
+ * This is the eighth kernel in the ray tracing logic. This is the seventh
+ * of the path iteration kernels. This kernel takes care of direct lighting
+ * logic. However, the "shadow ray cast" part of direct lighting is handled
+ * in the next kernel.
+ *
+ * This kernel determines the rays for which the shadow_blocked() function associated with direct lighting should be executed.
+ * Rays for which a direct-lighting shadow_blocked() must be executed are marked with the flag RAY_SHADOW_RAY_CAST_DL and
+ * enqueued into the queue QUEUE_SHADOW_RAY_CAST_DL_RAYS.
+ *
+ * The input and output are as follows,
+ *
+ * rng_coop -----------------------------------------|--- kernel_ocl_path_trace_direct_lighting --|--- BSDFEval_coop
+ * PathState_coop -----------------------------------| |--- ISLamp_coop
+ * shader_data --------------------------------------| |--- LightRay_coop
+ * ray_state ----------------------------------------| |--- ray_state
+ * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---| |
+ * kg (globals + data) ------------------------------| |
+ * queuesize ----------------------------------------| |
+ *
+ * Note on shader_DL: shader_DL is neither input nor output of this kernel; it is filled and consumed within this kernel itself.
+ * Note on queues:
+ * This kernel only reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes
+ * only the rays of state RAY_ACTIVE; if a ray needs to execute the corresponding shadow_blocked
+ * part after direct lighting, the ray is marked with the RAY_SHADOW_RAY_CAST_DL flag.
+ *
+ * State of queues when this kernel is called:
+ * The queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be the same
+ * before and after this kernel call.
+ * Before this kernel call, QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty; after the call, it will be
+ * filled with the rays for which a shadow_blocked function must be executed.
+ */
+__kernel void kernel_ocl_path_trace_direct_lighting(
+ ccl_global char *globals,
+ ccl_constant KernelData *data,
+ ccl_global char *shader_data, /* Required for direct lighting */
+ ccl_global char *shader_DL, /* Required for direct lighting */
+ ccl_global uint *rng_coop, /* Required for direct lighting */
+ ccl_global PathState *PathState_coop, /* Required for direct lighting */
+ ccl_global int *ISLamp_coop, /* Required for direct lighting */
+ ccl_global Ray *LightRay_coop, /* Required for direct lighting */
+ ccl_global BsdfEval *BSDFEval_coop, /* Required for direct lighting */
+ ccl_global char *ray_state, /* Denotes the state of each ray */
+ ccl_global int *Queue_data, /* Queue memory */
+ ccl_global int *Queue_index, /* Tracks the number of elements in each queue */
+ int queuesize /* Size (capacity) of each queue */
+ )
+{
+ ccl_local unsigned int local_queue_atomics;
+ if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+ local_queue_atomics = 0;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ char enqueue_flag = 0;
+ int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+ ray_index = get_ray_index(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+ /* If we are executing on a GPU device, we exit all threads that are not required.
+ * If we are executing on a CPU device, we need to keep all threads active,
+ * since we have barrier() calls later in the kernel; CPU devices
+ * expect all threads to execute the barrier statement.
+ */
+ if(ray_index == QUEUE_EMPTY_SLOT)
+ return;
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+ if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+ if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+ /* Load kernel globals structure and ShaderData structure */
+ KernelGlobals *kg = (KernelGlobals *)globals;
+ ShaderData *sd = (ShaderData *)shader_data;
+ ShaderData *sd_DL = (ShaderData *)shader_DL;
+
+ ccl_global PathState *state = &PathState_coop[ray_index];
+
+ /* direct lighting */
+#ifdef __EMISSION__
+ if((kernel_data.integrator.use_direct_light && (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL))) {
+ /* sample illumination from lights to find path contribution */
+ ccl_global RNG* rng = &rng_coop[ray_index];
+ float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
+ float light_u, light_v;
+ path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+
+ LightSample ls;
+ light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls);
+
+ Ray light_ray;
+#ifdef __OBJECT_MOTION__
+ light_ray.time = ccl_fetch(sd, time);
+#endif
+ BsdfEval L_light;
+ bool is_lamp;
+ if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce, sd_DL)) {
+ /* write intermediate data to global memory to access from the next kernel */
+ LightRay_coop[ray_index] = light_ray;
+ BSDFEval_coop[ray_index] = L_light;
+ ISLamp_coop[ray_index] = is_lamp;
+ /* Mark ray state for the next shadow kernel. */
+ ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
+ enqueue_flag = 1;
+ }
+ }
+#endif
+ }
+#ifndef __COMPUTE_DEVICE_GPU__
+ }
+#endif
+
+#ifdef __EMISSION__
+ /* Enqueue RAY_SHADOW_RAY_CAST_DL rays */
+ enqueue_ray_index_local(ray_index, QUEUE_SHADOW_RAY_CAST_DL_RAYS, enqueue_flag, queuesize, &local_queue_atomics, Queue_data, Queue_index);
+#endif
+}
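The enqueue at the end of this kernel is the two-level pattern shared by all the split kernels: flagged threads first reserve slots with a cheap work-group-local atomic, one thread then performs a single global atomic to claim a contiguous block of the queue, and each flagged thread finally writes its ray index into its reserved slot. A sketch, assuming kernel_queues.h implements it along these lines (the caller zeroes *local_queue_atomics before use, as in the kernel above):

ccl_device void enqueue_ray_index_local(int ray_index, int queue_number,
                                        char enqueue_flag, int queuesize,
                                        ccl_local unsigned int *local_queue_atomics,
                                        ccl_global int *queue_data,
                                        ccl_global int *queue_index)
{
	int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0);

	/* Reserve a per-work-group slot with a cheap local atomic. */
	unsigned int my_lqidx = 0;
	if(enqueue_flag)
		my_lqidx = atomic_inc(local_queue_atomics);
	barrier(CLK_LOCAL_MEM_FENCE);

	/* One global atomic per work-group claims a contiguous block;
	 * the old global count becomes this group's base offset. */
	if(lidx == 0)
		*local_queue_atomics = atomic_add(&queue_index[queue_number],
		                                  *local_queue_atomics);
	barrier(CLK_LOCAL_MEM_FENCE);

	/* Write the ray index into the reserved global slot. */
	if(enqueue_flag) {
		unsigned int my_gqidx = queue_number * queuesize +
		                        *local_queue_atomics + my_lqidx;
		queue_data[my_gqidx] = ray_index;
	}
}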
diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h
index 6c5a5fac8c5..1fee5835360 100644
--- a/intern/cycles/kernel/kernel_emission.h
+++ b/intern/cycles/kernel/kernel_emission.h
@@ -17,12 +17,20 @@
CCL_NAMESPACE_BEGIN
/* Direction Emission */
-
ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
- LightSample *ls, float3 I, differential3 dI, float t, float time, int bounce, int transparent_bounce)
+ LightSample *ls, float3 I, differential3 dI, float t, float time, int bounce, int transparent_bounce
+#ifdef __SPLIT_KERNEL__
+ ,ShaderData *sd_input
+#endif
+)
{
/* setup shading at emitter */
- ShaderData sd;
+#ifdef __SPLIT_KERNEL__
+ ShaderData *sd = sd_input;
+#else
+ ShaderData sd_object;
+ ShaderData *sd = &sd_object;
+#endif
float3 eval;
#ifdef __BACKGROUND_MIS__
@@ -37,23 +45,23 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
ray.dP = differential3_zero();
ray.dD = dI;
- shader_setup_from_background(kg, &sd, &ray, bounce+1, transparent_bounce);
- eval = shader_eval_background(kg, &sd, 0, SHADER_CONTEXT_EMISSION);
+ shader_setup_from_background(kg, sd, &ray, bounce+1, transparent_bounce);
+ eval = shader_eval_background(kg, sd, 0, SHADER_CONTEXT_EMISSION);
}
else
#endif
{
- shader_setup_from_sample(kg, &sd, ls->P, ls->Ng, I, ls->shader, ls->object, ls->prim, ls->u, ls->v, t, time, bounce+1, transparent_bounce);
+ shader_setup_from_sample(kg, sd, ls->P, ls->Ng, I, ls->shader, ls->object, ls->prim, ls->u, ls->v, t, time, bounce+1, transparent_bounce);
- ls->Ng = sd.Ng;
+ ls->Ng = ccl_fetch(sd, Ng);
/* no path flag, we're evaluating this for all closures. that's weak but
* we'd have to do multiple evaluations otherwise */
- shader_eval_surface(kg, &sd, 0.0f, 0, SHADER_CONTEXT_EMISSION);
+ shader_eval_surface(kg, sd, 0.0f, 0, SHADER_CONTEXT_EMISSION);
/* evaluate emissive closure */
- if(sd.flag & SD_EMISSION)
- eval = shader_emissive_eval(kg, &sd);
+ if(ccl_fetch(sd, flag) & SD_EMISSION)
+ eval = shader_emissive_eval(kg, sd);
else
eval = make_float3(0.0f, 0.0f, 0.0f);
}
@@ -63,9 +71,14 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
return eval;
}
+/* The argument sd_DL is meaningful only for the split kernel; other callers can pass NULL. */
ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd,
LightSample *ls, Ray *ray, BsdfEval *eval, bool *is_lamp,
- int bounce, int transparent_bounce)
+ int bounce, int transparent_bounce
+#ifdef __SPLIT_KERNEL__
+ , ShaderData *sd_DL
+#endif
+ )
{
if(ls->pdf == 0.0f)
return false;
@@ -74,7 +87,14 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd,
differential3 dD = differential3_zero();
/* evaluate closure */
- float3 light_eval = direct_emissive_eval(kg, ls, -ls->D, dD, ls->t, sd->time, bounce, transparent_bounce);
+
+ float3 light_eval = direct_emissive_eval(kg, ls, -ls->D, dD, ls->t, ccl_fetch(sd, time),
+ bounce,
+ transparent_bounce
+#ifdef __SPLIT_KERNEL__
+ ,sd_DL
+#endif
+ );
if(is_zero(light_eval))
return false;
@@ -83,7 +103,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd,
float bsdf_pdf;
#ifdef __VOLUME__
- if(sd->prim != PRIM_NONE)
+ if(ccl_fetch(sd, prim) != PRIM_NONE)
shader_bsdf_eval(kg, sd, ls->D, eval, &bsdf_pdf);
else
shader_volume_phase_eval(kg, sd, ls->D, eval, &bsdf_pdf);
@@ -118,8 +138,8 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd,
if(ls->shader & SHADER_CAST_SHADOW) {
/* setup ray */
- bool transmit = (dot(sd->Ng, ls->D) < 0.0f);
- ray->P = ray_offset(sd->P, (transmit)? -sd->Ng: sd->Ng);
+ bool transmit = (dot(ccl_fetch(sd, Ng), ls->D) < 0.0f);
+ ray->P = ray_offset(ccl_fetch(sd, P), (transmit)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng));
if(ls->t == FLT_MAX) {
/* distant light */
@@ -132,7 +152,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd,
ray->D = normalize_len(ray->D, &ray->t);
}
- ray->dP = sd->dP;
+ ray->dP = ccl_fetch(sd, dP);
ray->dD = differential3_zero();
}
else {
@@ -154,14 +174,14 @@ ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, Shader
float3 L = shader_emissive_eval(kg, sd);
#ifdef __HAIR__
- if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) && (sd->type & PRIMITIVE_ALL_TRIANGLE))
+ if(!(path_flag & PATH_RAY_MIS_SKIP) && (ccl_fetch(sd, flag) & SD_USE_MIS) && (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE))
#else
- if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS))
+ if(!(path_flag & PATH_RAY_MIS_SKIP) && (ccl_fetch(sd, flag) & SD_USE_MIS))
#endif
{
/* multiple importance sampling, get triangle light pdf,
* and compute weight with respect to BSDF pdf */
- float pdf = triangle_light_pdf(kg, sd->Ng, sd->I, t);
+ float pdf = triangle_light_pdf(kg, ccl_fetch(sd, Ng), ccl_fetch(sd, I), t);
float mis_weight = power_heuristic(bsdf_pdf, pdf);
return L*mis_weight;
@@ -172,7 +192,12 @@ ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, Shader
/* Indirect Lamp Emission */
-ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *state, Ray *ray, float3 *emission)
+/* The argument sd is meaningful only for the split kernel; other callers can pass NULL. */
+ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *state, Ray *ray, float3 *emission
+#ifdef __SPLIT_KERNEL__
+ ,ShaderData *sd
+#endif
+ )
{
bool hit_lamp = false;
@@ -196,7 +221,13 @@ ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *st
}
#endif
- float3 L = direct_emissive_eval(kg, &ls, -ray->D, ray->dD, ls.t, ray->time, state->bounce, state->transparent_bounce);
+ float3 L = direct_emissive_eval(kg, &ls, -ray->D, ray->dD, ls.t, ray->time,
+ state->bounce,
+ state->transparent_bounce
+#ifdef __SPLIT_KERNEL__
+ ,sd
+#endif
+ );
#ifdef __VOLUME__
if(state->volume_stack[0].shader != SHADER_NONE) {
@@ -225,7 +256,11 @@ ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *st
/* Indirect Background */
-ccl_device_noinline float3 indirect_background(KernelGlobals *kg, PathState *state, Ray *ray)
+ccl_device_noinline float3 indirect_background(KernelGlobals *kg, ccl_addr_space PathState *state, ccl_addr_space Ray *ray
+#ifdef __SPLIT_KERNEL__
+ ,ShaderData *sd_global
+#endif
+ )
{
#ifdef __BACKGROUND__
int shader = kernel_data.background.surface_shader;
@@ -241,11 +276,17 @@ ccl_device_noinline float3 indirect_background(KernelGlobals *kg, PathState *sta
return make_float3(0.0f, 0.0f, 0.0f);
}
+#ifdef __SPLIT_KERNEL__
/* evaluate background closure */
+ Ray priv_ray = *ray;
+ shader_setup_from_background(kg, sd_global, &priv_ray, state->bounce+1, state->transparent_bounce);
+ float3 L = shader_eval_background(kg, sd_global, state->flag, SHADER_CONTEXT_EMISSION);
+#else
ShaderData sd;
shader_setup_from_background(kg, &sd, ray, state->bounce+1, state->transparent_bounce);
float3 L = shader_eval_background(kg, &sd, state->flag, SHADER_CONTEXT_EMISSION);
+#endif
#ifdef __BACKGROUND_MIS__
/* check if background light exists or if we should skip pdf */
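The sd->member to ccl_fetch(sd, member) rewrites in this file exist so the same shading code can address ShaderData either as a single private struct (megakernel) or as a structure-of-arrays shared by all rays in flight (split kernel). A hypothetical sketch of the macro pair; the real definitions live in the kernel headers, and SD_THREAD_ID here is an invented name for the per-thread index:

#ifdef __SPLIT_KERNEL__
/* Each ShaderData member becomes a per-ray array. */
#  define ccl_fetch(sd, member) ((sd)->member[SD_THREAD_ID])
#else
/* Plain struct member access for the megakernel. */
#  define ccl_fetch(sd, member) ((sd)->member)
#endif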
diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h
index be2c879adb9..17fa18909c4 100644
--- a/intern/cycles/kernel/kernel_globals.h
+++ b/intern/cycles/kernel/kernel_globals.h
@@ -80,7 +80,7 @@ typedef struct KernelGlobals {} KernelGlobals;
#ifdef __KERNEL_OPENCL__
-typedef struct KernelGlobals {
+typedef ccl_addr_space struct KernelGlobals {
ccl_constant KernelData *data;
#define KERNEL_TEX(type, ttype, name) \
diff --git a/intern/cycles/kernel/kernel_holdout_emission_blurring_pathtermination_ao.cl b/intern/cycles/kernel/kernel_holdout_emission_blurring_pathtermination_ao.cl
new file mode 100644
index 00000000000..a2e57771522
--- /dev/null
+++ b/intern/cycles/kernel/kernel_holdout_emission_blurring_pathtermination_ao.cl
@@ -0,0 +1,283 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split.h"
+
+/*
+ * Note on kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao kernel.
+ * This is the sixth kernel in the ray tracing logic. This is the fifth
+ * of the path iteration kernels. This kernel takes care of the logic to process
+ * "material of type holdout", indirect primitive emission, bsdf blurring,
+ * probabilistic path termination and AO.
+ *
+ * This kernel determines the rays for which the shadow_blocked() function associated with AO should be executed.
+ * Rays for which an AO shadow_blocked() must be executed are marked with the flag RAY_SHADOW_RAY_CAST_AO and
+ * enqueued into the queue QUEUE_SHADOW_RAY_CAST_AO_RAYS.
+ *
+ * The ray state of rays terminated in this kernel is changed to RAY_UPDATE_BUFFER.
+ *
+ * The input and output are as follows,
+ *
+ * rng_coop ---------------------------------------------|--- kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao ---|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * throughput_coop --------------------------------------| |--- PathState_coop
+ * PathRadiance_coop ------------------------------------| |--- throughput_coop
+ * Intersection_coop ------------------------------------| |--- L_transparent_coop
+ * PathState_coop ---------------------------------------| |--- per_sample_output_buffers
+ * L_transparent_coop -----------------------------------| |--- PathRadiance_coop
+ * shader_data ------------------------------------------| |--- ShaderData
+ * ray_state --------------------------------------------| |--- ray_state
+ * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -------| |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---| |--- AOAlpha_coop
+ * kg (globals + data) ----------------------------------| |--- AOBSDF_coop
+ * parallel_samples -------------------------------------| |--- AOLightRay_coop
+ * per_sample_output_buffers ----------------------------| |
+ * sw ---------------------------------------------------| |
+ * sh ---------------------------------------------------| |
+ * sx ---------------------------------------------------| |
+ * sy ---------------------------------------------------| |
+ * stride -----------------------------------------------| |
+ * work_array -------------------------------------------| |
+ * queuesize --------------------------------------------| |
+ * start_sample -----------------------------------------| |
+ *
+ * Note on queues:
+ * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only
+ * the rays of state RAY_ACTIVE.
+ * There are different points in this kernel where a ray may terminate and reach the RAY_UPDATE_BUFFER
+ * state. These rays are enqueued into the QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will
+ * still be present in the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been
+ * changed to RAY_UPDATE_BUFFER, there is no problem.
+ *
+ * State of queues when this kernel is called:
+ * At entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays.
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays.
+ * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be empty.
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and RAY_UPDATE_BUFFER rays.
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
+ * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with the flag RAY_SHADOW_RAY_CAST_AO.
+ */
+
+__kernel void kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao(
+ ccl_global char *globals,
+ ccl_constant KernelData *data,
+ ccl_global char *shader_data, /* Required throughout the kernel except probabilistic path termination and AO */
+ ccl_global float *per_sample_output_buffers,
+ ccl_global uint *rng_coop, /* Required for "kernel_write_data_passes" and AO */
+ ccl_global float3 *throughput_coop, /* Required for handling holdout material and AO */
+ ccl_global float *L_transparent_coop, /* Required for handling holdout material */
+ PathRadiance *PathRadiance_coop, /* Required for "kernel_write_data_passes" and indirect primitive emission */
+ ccl_global PathState *PathState_coop, /* Required throughout the kernel and AO */
+ Intersection *Intersection_coop, /* Required for indirect primitive emission */
+ ccl_global float3 *AOAlpha_coop, /* Required for AO */
+ ccl_global float3 *AOBSDF_coop, /* Required for AO */
+ ccl_global Ray *AOLightRay_coop, /* Required for AO */
+ int sw, int sh, int sx, int sy, int stride,
+ ccl_global char *ray_state, /* Denotes the state of each ray */
+ ccl_global unsigned int *work_array, /* Denotes the work that each ray belongs to */
+ ccl_global int *Queue_data, /* Queue memory */
+ ccl_global int *Queue_index, /* Tracks the number of elements in each queue */
+ int queuesize, /* Size (capacity) of each queue */
+#ifdef __WORK_STEALING__
+ unsigned int start_sample,
+#endif
+ int parallel_samples /* Number of samples to be processed in parallel */
+ )
+{
+ ccl_local unsigned int local_queue_atomics_bg;
+ ccl_local unsigned int local_queue_atomics_ao;
+ if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+ local_queue_atomics_bg = 0;
+ local_queue_atomics_ao = 0;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ char enqueue_flag = 0;
+ char enqueue_flag_AO_SHADOW_RAY_CAST = 0;
+ int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+ ray_index = get_ray_index(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+ /* If we are executing on a GPU device, we exit all threads that are not required.
+ * If we are executing on a CPU device, we need to keep all threads active,
+ * since we have barrier() calls later in the kernel; CPU devices
+ * expect all threads to execute the barrier statement.
+ */
+ if(ray_index == QUEUE_EMPTY_SLOT)
+ return;
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+ if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+ /* Load kernel globals structure and ShaderData structure */
+ KernelGlobals *kg = (KernelGlobals *)globals;
+ ShaderData *sd = (ShaderData *)shader_data;
+
+#ifdef __WORK_STEALING__
+ unsigned int my_work;
+ unsigned int pixel_x;
+ unsigned int pixel_y;
+#endif
+ unsigned int tile_x;
+ unsigned int tile_y;
+ int my_sample_tile;
+ unsigned int sample;
+
+ ccl_global RNG *rng = 0x0;
+ ccl_global PathState *state = 0x0;
+ float3 throughput;
+
+ if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+
+ throughput = throughput_coop[ray_index];
+ state = &PathState_coop[ray_index];
+ rng = &rng_coop[ray_index];
+#ifdef __WORK_STEALING__
+ my_work = work_array[ray_index];
+ sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
+ get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index);
+ my_sample_tile = 0;
+#else // __WORK_STEALING__
+ sample = work_array[ray_index];
+ /* buffer's stride is "stride"; find x and y using ray_index */
+ int tile_index = ray_index / parallel_samples;
+ tile_x = tile_index % sw;
+ tile_y = tile_index / sw;
+ my_sample_tile = ray_index - (tile_index * parallel_samples);
+#endif // __WORK_STEALING__
+ per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
+
+ /* holdout */
+#ifdef __HOLDOUT__
+ if((ccl_fetch(sd, flag) & (SD_HOLDOUT|SD_HOLDOUT_MASK)) && (state->flag & PATH_RAY_CAMERA)) {
+ if(kernel_data.background.transparent) {
+ float3 holdout_weight;
+
+ if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK)
+ holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
+ else
+ holdout_weight = shader_holdout_eval(kg, sd);
+
+ /* any throughput is ok, should all be identical here */
+ L_transparent_coop[ray_index] += average(holdout_weight*throughput);
+ }
+
+ if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+ enqueue_flag = 1;
+ }
+ }
+#endif
+ }
+
+ if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+
+ PathRadiance *L = &PathRadiance_coop[ray_index];
+ /* holdout mask objects do not write data passes */
+ kernel_write_data_passes(kg, per_sample_output_buffers, L, sd, sample, state, throughput);
+
+ /* blurring of bsdf after bounces, for rays that have a small likelihood
+ * of following this particular path (diffuse, rough glossy) */
+ if(kernel_data.integrator.filter_glossy != FLT_MAX) {
+ float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf;
+
+ if(blur_pdf < 1.0f) {
+ float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
+ shader_bsdf_blur(kg, sd, blur_roughness);
+ }
+ }
+
+#ifdef __EMISSION__
+ /* emission */
+ if(ccl_fetch(sd, flag) & SD_EMISSION) {
+ /* todo: is isect.t wrong here for transparent surfaces? */
+ float3 emission = indirect_primitive_emission(kg, sd, Intersection_coop[ray_index].t, state->flag, state->ray_pdf);
+ path_radiance_accum_emission(L, throughput, emission, state->bounce);
+ }
+#endif
+
+ /* path termination. this is a strange place to put the termination, it's
+ * mainly due to the mixed in MIS that we use. gives too many unneeded
+ * shader evaluations, only need emission if we are going to terminate */
+ float probability = path_state_terminate_probability(kg, state, throughput);
+
+ if(probability == 0.0f) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+ enqueue_flag = 1;
+ }
+
+ if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+ if(probability != 1.0f) {
+ float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE);
+
+ if(terminate >= probability) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+ enqueue_flag = 1;
+ } else {
+ throughput_coop[ray_index] = throughput/probability;
+ }
+ }
+ }
+ }
+
+#ifdef __AO__
+ if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+ /* ambient occlusion */
+ if(kernel_data.integrator.use_ambient_occlusion || (ccl_fetch(sd, flag) & SD_AO)) {
+ /* todo: solve correlation */
+ float bsdf_u, bsdf_v;
+ path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+
+ float ao_factor = kernel_data.background.ao_factor;
+ float3 ao_N;
+ AOBSDF_coop[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
+ AOAlpha_coop[ray_index] = shader_bsdf_alpha(kg, sd);
+
+ float3 ao_D;
+ float ao_pdf;
+ sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
+
+ if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
+ Ray _ray;
+ _ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
+ _ray.D = ao_D;
+ _ray.t = kernel_data.background.ao_distance;
+#ifdef __OBJECT_MOTION__
+ _ray.time = ccl_fetch(sd, time);
+#endif
+ _ray.dP = ccl_fetch(sd, dP);
+ _ray.dD = differential3_zero();
+ AOLightRay_coop[ray_index] = _ray;
+
+ ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
+ enqueue_flag_AO_SHADOW_RAY_CAST = 1;
+ }
+ }
+ }
+#endif
+#ifndef __COMPUTE_DEVICE_GPU__
+ }
+#endif
+
+ /* Enqueue RAY_UPDATE_BUFFER rays */
+ enqueue_ray_index_local(ray_index, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, enqueue_flag, queuesize, &local_queue_atomics_bg, Queue_data, Queue_index);
+#ifdef __AO__
+ /* Enqueue to-shadow-ray-cast rays */
+ enqueue_ray_index_local(ray_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, enqueue_flag_AO_SHADOW_RAY_CAST, queuesize, &local_queue_atomics_ao, Queue_data, Queue_index);
+#endif
+}
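The probabilistic termination above is ordinary Russian roulette: a path survives with probability p, and a survivor's throughput is divided by p, so the estimator stays unbiased (E = p * (X/p) + (1 - p) * 0 = X). A small self-contained C sketch of that invariant, independent of the kernel code:

#include <stdio.h>
#include <stdlib.h>

/* Russian roulette demo: terminating with probability (1 - p) and
 * reweighting survivors by 1/p leaves the expected value unchanged. */
int main(void)
{
	const double contribution = 0.75;  /* some fixed path contribution */
	const double p = 0.4;              /* survival probability */
	const int n = 1000000;
	double sum = 0.0;

	srand(42);
	for(int i = 0; i < n; i++) {
		double terminate = (double)rand() / RAND_MAX;
		if(terminate < p)
			sum += contribution / p;  /* survivor: reweight */
		/* terminated paths contribute zero */
	}
	printf("mean = %f (expected %f)\n", sum / n, contribution);
	return 0;
}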
diff --git a/intern/cycles/kernel/kernel_lamp_emission.cl b/intern/cycles/kernel/kernel_lamp_emission.cl
new file mode 100644
index 00000000000..e7f8b227dd8
--- /dev/null
+++ b/intern/cycles/kernel/kernel_lamp_emission.cl
@@ -0,0 +1,209 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split.h"
+
+/*
+ * Note on the kernel_ocl_path_trace_lamp_emission kernel.
+ * This is the third kernel in the ray-tracing logic. This is the second of the
+ * path-iteration kernels. This kernel takes care of the indirect lamp emission logic.
+ * This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS. It processes rays of state RAY_ACTIVE
+ * and RAY_HIT_BACKGROUND.
+ * This kernel empties the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue.
+ * The input/output of the kernel is as follows,
+ * Throughput_coop ------------------------------------|--- kernel_ocl_path_trace_lamp_emission --|--- PathRadiance_coop
+ * Ray_coop -------------------------------------------| |--- Queue_data(QUEUE_ACTIVE_AND_REGENERATED_RAYS)
+ * PathState_coop -------------------------------------| |--- Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS)
+ * kg (globals + data) --------------------------------| |
+ * Intersection_coop ----------------------------------| |
+ * ray_state ------------------------------------------| |
+ * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -----| |
+ * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ----| |
+ * queuesize ------------------------------------------| |
+ * use_queues_flag ------------------------------------| |
+ * sw -------------------------------------------------| |
+ * sh -------------------------------------------------| |
+ * parallel_samples -----------------------------------| |
+ *
+ * Note: shader_data is neither input nor output; it is filled and consumed within this kernel itself.
+ */
+__kernel void kernel_ocl_path_trace_lamp_emission(
+ ccl_global char *globals,
+ ccl_constant KernelData *data,
+ ccl_global char *shader_data, /* Required for lamp emission */
+ ccl_global float3 *throughput_coop, /* Required for lamp emission */
+ PathRadiance *PathRadiance_coop, /* Required for lamp emission */
+ ccl_global Ray *Ray_coop, /* Required for lamp emission */
+ ccl_global PathState *PathState_coop, /* Required for lamp emission */
+ Intersection *Intersection_coop, /* Required for lamp emission */
+ ccl_global char *ray_state, /* Denotes the state of each ray */
+ int sw, int sh,
+ ccl_global int *Queue_data, /* Memory for queues */
+ ccl_global int *Queue_index, /* Tracks the number of elements in queues */
+ int queuesize, /* Size (capacity) of queues */
+ ccl_global char *use_queues_flag, /* used to decide if this kernel should use queues to fetch ray index */
+ int parallel_samples /* Number of samples to be processed in parallel */
+ )
+{
+ int x = get_global_id(0);
+ int y = get_global_id(1);
+
+ /* We will empty this queue in this kernel */
+ if(get_global_id(0) == 0 && get_global_id(1) == 0) {
+ Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+ }
+
+ /* Fetch use_queues_flag */
+ ccl_local char local_use_queues_flag;
+ if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+ local_use_queues_flag = use_queues_flag[0];
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ int ray_index;
+ if(local_use_queues_flag) {
+ int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+ ray_index = get_ray_index(thread_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 1);
+
+ if(ray_index == QUEUE_EMPTY_SLOT) {
+ return;
+ }
+ } else {
+ if(x < (sw * parallel_samples) && y < sh) {
+ ray_index = x + y * (sw * parallel_samples);
+ } else {
+ return;
+ }
+ }
+
+ if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) || IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+ KernelGlobals *kg = (KernelGlobals *)globals;
+ ShaderData *sd = (ShaderData *)shader_data;
+ PathRadiance *L = &PathRadiance_coop[ray_index];
+
+ float3 throughput = throughput_coop[ray_index];
+ Ray ray = Ray_coop[ray_index];
+ PathState state = PathState_coop[ray_index];
+
+#ifdef __LAMP_MIS__
+ if(kernel_data.integrator.use_lamp_mis && !(state.flag & PATH_RAY_CAMERA)) {
+ /* ray starting from previous non-transparent bounce */
+ Ray light_ray;
+
+ light_ray.P = ray.P - state.ray_t*ray.D;
+ state.ray_t += Intersection_coop[ray_index].t;
+ light_ray.D = ray.D;
+ light_ray.t = state.ray_t;
+ light_ray.time = ray.time;
+ light_ray.dD = ray.dD;
+ light_ray.dP = ray.dP;
+ /* intersect with lamp */
+ float3 emission;
+
+ if(indirect_lamp_emission(kg, &state, &light_ray, &emission, sd)) {
+ path_radiance_accum_emission(L, throughput, emission, state.bounce);
+ }
+ }
+#endif
+ /* __VOLUME__ feature is disabled */
+#if 0
+#ifdef __VOLUME__
+ /* volume attenuation, emission, scatter */
+ if(state.volume_stack[0].shader != SHADER_NONE) {
+ Ray volume_ray = ray;
+ volume_ray.t = (hit)? isect.t: FLT_MAX;
+
+ bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
+
+#ifdef __VOLUME_DECOUPLED__
+ int sampling_method = volume_stack_sampling_method(kg, state.volume_stack);
+ bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, true, sampling_method);
+
+ if(decoupled) {
+ /* cache steps along volume for repeated sampling */
+ VolumeSegment volume_segment;
+ ShaderData volume_sd;
+
+ shader_setup_from_volume(kg, &volume_sd, &volume_ray, state.bounce, state.transparent_bounce);
+ kernel_volume_decoupled_record(kg, &state,
+ &volume_ray, &volume_sd, &volume_segment, heterogeneous);
+
+ volume_segment.sampling_method = sampling_method;
+
+ /* emission */
+ if(volume_segment.closure_flag & SD_EMISSION)
+ path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
+
+ /* scattering */
+ VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
+
+ if(volume_segment.closure_flag & SD_SCATTER) {
+ bool all = false;
+
+ /* direct light sampling */
+ kernel_branched_path_volume_connect_light(kg, rng, &volume_sd,
+ throughput, &state, &L, 1.0f, all, &volume_ray, &volume_segment);
+
+ /* indirect sample. if we use distance sampling and take just
+ * one sample for direct and indirect light, we could share
+ * this computation, but makes code a bit complex */
+ float rphase = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_PHASE);
+ float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE);
+
+ result = kernel_volume_decoupled_scatter(kg,
+ &state, &volume_ray, &volume_sd, &throughput,
+ rphase, rscatter, &volume_segment, NULL, true);
+ }
+
+ if(result != VOLUME_PATH_SCATTERED)
+ throughput *= volume_segment.accum_transmittance;
+
+ /* free cached steps */
+ kernel_volume_decoupled_free(kg, &volume_segment);
+
+ if(result == VOLUME_PATH_SCATTERED) {
+ if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray))
+ continue;
+ else
+ break;
+ }
+ }
+ else
+#endif
+ {
+ /* integrate along volume segment with distance sampling */
+ ShaderData volume_sd;
+ VolumeIntegrateResult result = kernel_volume_integrate(
+ kg, &state, &volume_sd, &volume_ray, &L, &throughput, rng, heterogeneous);
+
+#ifdef __VOLUME_SCATTER__
+ if(result == VOLUME_PATH_SCATTERED) {
+ /* direct lighting */
+ kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, &L);
+
+ /* indirect light bounce */
+ if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray))
+ continue;
+ else
+ break;
+ }
+#endif
+ }
+ }
+#endif
+#endif
+ }
+}
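Both branches above funnel into get_ray_index(); when its last argument is 1, the fetched slot is also cleared, which is how this kernel leaves QUEUE_ACTIVE_AND_REGENERATED_RAYS empty as it drains it. A sketch, assuming kernel_queues.h implements it along these lines:

ccl_device int get_ray_index(int thread_index,
                             int queue_number,
                             ccl_global int *queue_data,
                             int queuesize,
                             char empty_queue)
{
	/* Each queue occupies a fixed-size slice of queue_data. */
	int ray_index = queue_data[queue_number * queuesize + thread_index];

	/* Optionally consume the slot so the queue ends up empty. */
	if(empty_queue && ray_index != QUEUE_EMPTY_SLOT)
		queue_data[queue_number * queuesize + thread_index] = QUEUE_EMPTY_SLOT;

	return ray_index;
}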
diff --git a/intern/cycles/kernel/kernel_next_iteration_setup.cl b/intern/cycles/kernel/kernel_next_iteration_setup.cl
new file mode 100644
index 00000000000..49562ca6ed5
--- /dev/null
+++ b/intern/cycles/kernel/kernel_next_iteration_setup.cl
@@ -0,0 +1,176 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split.h"
+
+/*
+ * Note on kernel_ocl_path_trace_setup_next_iteration kernel.
+ * This is the tenth kernel in the ray tracing logic. This is the ninth
+ * of the path iteration kernels. This kernel takes care of setting up the
+ * Ray for the next path iteration and of accumulating the radiance
+ * contributions from AO and direct lighting.
+ *
+ * The ray state of rays terminated in this kernel is changed to RAY_UPDATE_BUFFER.
+ *
+ * The input and output are as follows,
+ *
+ * rng_coop ---------------------------------------------|--- kernel_ocl_path_trace_setup_next_iteration -|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * throughput_coop --------------------------------------| |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * PathRadiance_coop ------------------------------------| |--- throughput_coop
+ * PathState_coop ---------------------------------------| |--- PathRadiance_coop
+ * shader_data ------------------------------------------| |--- PathState_coop
+ * ray_state --------------------------------------------| |--- ray_state
+ * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -------| |--- Ray_coop
+ * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---| |--- use_queues_flag
+ * Ray_coop ---------------------------------------------| |
+ * kg (globals + data) ----------------------------------| |
+ * LightRay_dl_coop -------------------------------------|
+ * ISLamp_coop ------------------------------------------|
+ * BSDFEval_coop ----------------------------------------|
+ * LightRay_ao_coop -------------------------------------|
+ * AOBSDF_coop ------------------------------------------|
+ * AOAlpha_coop -----------------------------------------|
+ *
+ * Note on queues:
+ * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only
+ * the rays of state RAY_ACTIVE.
+ * There are different points in this kernel where a ray may terminate and reach the RAY_UPDATE_BUFFER
+ * state. These rays are enqueued into the QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will
+ * still be present in the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been
+ * changed to RAY_UPDATE_BUFFER, there is no problem.
+ *
+ * State of queues when this kernel is called:
+ * At entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and RAY_UPDATE_BUFFER rays.
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and more RAY_UPDATE_BUFFER rays.
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays.
+ */
+
+__kernel void kernel_ocl_path_trace_setup_next_iteration(
+ ccl_global char *globals,
+ ccl_constant KernelData *data,
+ ccl_global char *shader_data, /* Required for setting up ray for next iteration */
+ ccl_global uint *rng_coop, /* Required for setting up ray for next iteration */
+ ccl_global float3 *throughput_coop, /* Required for setting up ray for next iteration */
+ PathRadiance *PathRadiance_coop, /* Required for setting up ray for next iteration */
+ ccl_global Ray *Ray_coop, /* Required for setting up ray for next iteration */
+ ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */
+ ccl_global Ray *LightRay_dl_coop, /* Required for radiance update - direct lighting */
+ ccl_global int *ISLamp_coop, /* Required for radiance update - direct lighting */
+ ccl_global BsdfEval *BSDFEval_coop, /* Required for radiance update - direct lighting */
+ ccl_global Ray *LightRay_ao_coop, /* Required for radiance update - AO */
+ ccl_global float3 *AOBSDF_coop, /* Required for radiance update - AO */
+ ccl_global float3 *AOAlpha_coop, /* Required for radiance update - AO */
+ ccl_global char *ray_state, /* Denotes the state of each ray */
+ ccl_global int *Queue_data, /* Queue memory */
+ ccl_global int *Queue_index, /* Tracks the number of elements in each queue */
+ int queuesize, /* Size (capacity) of each queue */
+ ccl_global char *use_queues_flag /* flag to decide if scene_intersect kernel should use queues to fetch ray index */
+ )
+{
+
+ ccl_local unsigned int local_queue_atomics;
+ if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+ local_queue_atomics = 0;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if(get_global_id(0) == 0 && get_global_id(1) == 0) {
+ /* If we are here, then it means that the scene-intersect kernel
+ * has already been executed at least once. From now on, the
+ * scene-intersect kernel may operate on queues to fetch the ray index.
+ */
+ use_queues_flag[0] = 1;
+
+ /* Reset the queue indices of QUEUE_SHADOW_RAY_CAST_AO_RAYS and
+ * QUEUE_SHADOW_RAY_CAST_DL_RAYS; these queues were emptied during the previous kernel.
+ */
+ Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
+ Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
+ }
+
+ char enqueue_flag = 0;
+ int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+ ray_index = get_ray_index(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+ /* If we are executing on a GPU device, we exit all threads that are not required.
+ * If we are executing on a CPU device, we need to keep all threads active,
+ * since we have barrier() calls later in the kernel; CPU devices
+ * expect all threads to execute the barrier statement.
+ */
+ if(ray_index == QUEUE_EMPTY_SLOT)
+ return;
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+ if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+ /* Load kernel globals structure and ShaderData structure */
+ KernelGlobals *kg = (KernelGlobals *)globals;
+ ShaderData *sd = (ShaderData *)shader_data;
+ PathRadiance *L = 0x0;
+ ccl_global PathState *state = 0x0;
+
+ /* Path radiance update for the shadow_blocked results of AO and direct lighting. */
+ if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
+ state = &PathState_coop[ray_index];
+ L = &PathRadiance_coop[ray_index];
+ float3 _throughput = throughput_coop[ray_index];
+
+ if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
+ float3 shadow = LightRay_ao_coop[ray_index].P;
+ char update_path_radiance = LightRay_ao_coop[ray_index].t;
+ if(update_path_radiance) {
+ path_radiance_accum_ao(L, _throughput, AOAlpha_coop[ray_index], AOBSDF_coop[ray_index], shadow, state->bounce);
+ }
+ REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
+ }
+
+ if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) {
+ float3 shadow = LightRay_dl_coop[ray_index].P;
+ char update_path_radiance = LightRay_dl_coop[ray_index].t;
+ if(update_path_radiance) {
+ BsdfEval L_light = BSDFEval_coop[ray_index];
+ path_radiance_accum_light(L, _throughput, &L_light, shadow, 1.0f, state->bounce, ISLamp_coop[ray_index]);
+ }
+ REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
+ }
+ }
+
+ if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+
+ ccl_global float3 *throughput = &throughput_coop[ray_index];
+ ccl_global Ray *ray = &Ray_coop[ray_index];
+ ccl_global RNG* rng = &rng_coop[ray_index];
+ state = &PathState_coop[ray_index];
+ L = &PathRadiance_coop[ray_index];
+
+ /* compute direct lighting and next bounce */
+ if(!kernel_path_surface_bounce(kg, rng, sd, throughput, state, L, ray)) {
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+ enqueue_flag = 1;
+ }
+ }
+#ifndef __COMPUTE_DEVICE_GPU__
+ }
+#endif
+
+ /* Enqueue RAY_UPDATE_BUFFER rays */
+ enqueue_ray_index_local(ray_index, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, enqueue_flag, queuesize, &local_queue_atomics, Queue_data, Queue_index);
+}
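Note how the radiance update at the top of this kernel reads its inputs: the shadow_blocked kernel that runs before it has no dedicated output buffers, so, on the convention assumed here, it reuses fields of the cooperative LightRay structures, passing the accumulated shadow color through .P and an "unblocked, so accumulate" flag through .t. A hypothetical sketch of the producing side (the real code is in kernel_shadow_blocked.cl):

/* Sketch of the hand-off written by the shadow-blocked kernel. */
float3 shadow;
char update_path_radiance = !shadow_blocked(kg, state, &light_ray, &shadow);

/* Reuse Ray fields as an ad-hoc result struct for the next kernel. */
LightRay_dl_coop[ray_index].P = shadow;               /* shadow color */
LightRay_dl_coop[ray_index].t = update_path_radiance; /* accumulate? */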
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index 8910e26fcc6..20cf3fa931b 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -19,23 +19,49 @@ CCL_NAMESPACE_BEGIN
ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, int sample, float value)
{
ccl_global float *buf = buffer;
+#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
+ atomic_add_float(buf, value);
+#else
*buf = (sample == 0)? value: *buf + value;
+#endif // __SPLIT_KERNEL__ && __WORK_STEALING__
}
ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, int sample, float3 value)
{
+#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
+ ccl_global float *buf_x = buffer + 0;
+ ccl_global float *buf_y = buffer + 1;
+ ccl_global float *buf_z = buffer + 2;
+
+ atomic_add_float(buf_x, value.x);
+ atomic_add_float(buf_y, value.y);
+ atomic_add_float(buf_z, value.z);
+#else
ccl_global float3 *buf = (ccl_global float3*)buffer;
*buf = (sample == 0)? value: *buf + value;
+#endif // __SPLIT_KERNEL__ && __WORK_STEALING__
}
ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sample, float4 value)
{
+#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
+ ccl_global float *buf_x = buffer + 0;
+ ccl_global float *buf_y = buffer + 1;
+ ccl_global float *buf_z = buffer + 2;
+ ccl_global float *buf_w = buffer + 3;
+
+ atomic_add_float(buf_x, value.x);
+ atomic_add_float(buf_y, value.y);
+ atomic_add_float(buf_z, value.z);
+ atomic_add_float(buf_w, value.w);
+#else
ccl_global float4 *buf = (ccl_global float4*)buffer;
*buf = (sample == 0)? value: *buf + value;
+#endif // __SPLIT_KERNEL__ && __WORK_STEALING__
}
ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L,
- ShaderData *sd, int sample, PathState *state, float3 throughput)
+ ShaderData *sd, int sample, ccl_addr_space PathState *state, float3 throughput)
{
#ifdef __PASSES__
int path_flag = state->flag;
@@ -49,18 +75,18 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
return;
if(!(path_flag & PATH_RAY_SINGLE_PASS_DONE)) {
- if(!(sd->flag & SD_TRANSPARENT) ||
+ if(!(ccl_fetch(sd, flag) & SD_TRANSPARENT) ||
kernel_data.film.pass_alpha_threshold == 0.0f ||
average(shader_bsdf_alpha(kg, sd)) >= kernel_data.film.pass_alpha_threshold)
{
if(sample == 0) {
if(flag & PASS_DEPTH) {
- float depth = camera_distance(kg, sd->P);
+ float depth = camera_distance(kg, ccl_fetch(sd, P));
kernel_write_pass_float(buffer + kernel_data.film.pass_depth, sample, depth);
}
if(flag & PASS_OBJECT_ID) {
- float id = object_pass_id(kg, sd->object);
+ float id = object_pass_id(kg, ccl_fetch(sd, object));
kernel_write_pass_float(buffer + kernel_data.film.pass_object_id, sample, id);
}
if(flag & PASS_MATERIAL_ID) {
@@ -70,7 +96,7 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
}
if(flag & PASS_NORMAL) {
- float3 normal = sd->N;
+ float3 normal = ccl_fetch(sd, N);
kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, sample, normal);
}
if(flag & PASS_UV) {
@@ -101,7 +127,7 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
float mist_start = kernel_data.film.mist_start;
float mist_inv_depth = kernel_data.film.mist_inv_depth;
- float depth = camera_distance(kg, sd->P);
+ float depth = camera_distance(kg, ccl_fetch(sd, P));
float mist = saturate((depth - mist_start)*mist_inv_depth);
/* falloff */
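With work stealing enabled, several threads rendering different samples of the same pixel can land on the same buffer element, so the plain read-modify-write above must become atomic. OpenCL 1.x has no native float atomics, so atomic_add_float is presumably a compare-and-swap loop over the float's bit pattern; a sketch of that standard pattern:

ccl_device_inline void atomic_add_float(volatile ccl_global float *source,
                                        const float operand)
{
	union { unsigned int int_value; float float_value; } prev_value, new_value;

	/* Retry until no other thread modified *source between our read
	 * and our compare-and-swap. */
	do {
		prev_value.float_value = *source;
		new_value.float_value = prev_value.float_value + operand;
	} while(atomic_cmpxchg((volatile ccl_global unsigned int *)source,
	                       prev_value.int_value,
	                       new_value.int_value) != prev_value.int_value);
}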
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index 0ba8f694592..e2dbf6e22f7 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -42,6 +42,7 @@
#include "kernel_path_state.h"
#include "kernel_shadow.h"
#include "kernel_emission.h"
+#include "kernel_path_common.h"
#include "kernel_path_surface.h"
#include "kernel_path_volume.h"
@@ -305,17 +306,17 @@ ccl_device void kernel_path_ao(KernelGlobals *kg, ShaderData *sd, PathRadiance *
sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
- if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
+ if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
Ray light_ray;
float3 ao_shadow;
- light_ray.P = ray_offset(sd->P, sd->Ng);
+ light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
light_ray.D = ao_D;
light_ray.t = kernel_data.background.ao_distance;
#ifdef __OBJECT_MOTION__
- light_ray.time = sd->time;
+ light_ray.time = ccl_fetch(sd, time);
#endif
- light_ray.dP = sd->dP;
+ light_ray.dP = ccl_fetch(sd, dP);
light_ray.dD = differential3_zero();
if(!shadow_blocked(kg, state, &light_ray, &ao_shadow))
@@ -341,17 +342,17 @@ ccl_device void kernel_branched_path_ao(KernelGlobals *kg, ShaderData *sd, PathR
sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
- if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
+ if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
Ray light_ray;
float3 ao_shadow;
- light_ray.P = ray_offset(sd->P, sd->Ng);
+ light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
light_ray.D = ao_D;
light_ray.t = kernel_data.background.ao_distance;
#ifdef __OBJECT_MOTION__
- light_ray.time = sd->time;
+ light_ray.time = ccl_fetch(sd, time);
#endif
- light_ray.dP = sd->dP;
+ light_ray.dP = ccl_fetch(sd, dP);
light_ray.dD = differential3_zero();
if(!shadow_blocked(kg, state, &light_ray, &ao_shadow))
@@ -381,7 +382,7 @@ ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd
#ifdef __VOLUME__
Ray volume_ray = *ray;
bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
- sd->flag & SD_OBJECT_INTERSECTS_VOLUME;
+ ccl_fetch(sd, flag) & SD_OBJECT_INTERSECTS_VOLUME;
#endif
/* compute lighting with the BSDF closure */
@@ -712,8 +713,8 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
RNG *rng, ShaderData *sd, float3 throughput, float num_samples_adjust,
PathState *state, PathRadiance *L)
{
- for(int i = 0; i< sd->num_closure; i++) {
- const ShaderClosure *sc = &sd->closure[i];
+ for(int i = 0; i< ccl_fetch(sd, num_closure); i++) {
+ const ShaderClosure *sc = &ccl_fetch(sd, closure)[i];
if(!CLOSURE_IS_BSDF(sc->type))
continue;
@@ -764,8 +765,8 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
Ray *ray,
float3 throughput)
{
- for(int i = 0; i< sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ for(int i = 0; i< ccl_fetch(sd, num_closure); i++) {
+ ShaderClosure *sc = &ccl_fetch(sd, closure)[i];
if(!CLOSURE_IS_BSSRDF(sc->type))
continue;
@@ -786,7 +787,7 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
#ifdef __VOLUME__
Ray volume_ray = *ray;
bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
- sd->flag & SD_OBJECT_INTERSECTS_VOLUME;
+ ccl_fetch(sd, flag) & SD_OBJECT_INTERSECTS_VOLUME;
#endif
/* compute lighting with the BSDF closure */
@@ -1143,32 +1144,6 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
#endif
-ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int x, int y, RNG *rng, Ray *ray)
-{
- float filter_u;
- float filter_v;
-
- int num_samples = kernel_data.integrator.aa_samples;
-
- path_rng_init(kg, rng_state, sample, num_samples, rng, x, y, &filter_u, &filter_v);
-
- /* sample camera ray */
-
- float lens_u = 0.0f, lens_v = 0.0f;
-
- if(kernel_data.cam.aperturesize > 0.0f)
- path_rng_2D(kg, rng, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v);
-
- float time = 0.0f;
-
-#ifdef __CAMERA_MOTION__
- if(kernel_data.cam.shuttertime != -1.0f)
- time = path_rng_1D(kg, rng, sample, num_samples, PRNG_TIME);
-#endif
-
- camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray);
-}
-
ccl_device void kernel_path_trace(KernelGlobals *kg,
ccl_global float *buffer, ccl_global uint *rng_state,
int sample, int x, int y, int offset, int stride)
diff --git a/intern/cycles/kernel/kernel_path_common.h b/intern/cycles/kernel/kernel_path_common.h
new file mode 100644
index 00000000000..1912dfa16ed
--- /dev/null
+++ b/intern/cycles/kernel/kernel_path_common.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg,
+ ccl_global uint *rng_state,
+ int sample,
+ int x, int y,
+ ccl_addr_space RNG *rng,
+ ccl_addr_space Ray *ray)
+{
+ float filter_u;
+ float filter_v;
+
+ int num_samples = kernel_data.integrator.aa_samples;
+
+ path_rng_init(kg, rng_state, sample, num_samples, rng, x, y, &filter_u, &filter_v);
+
+ /* sample camera ray */
+
+ float lens_u = 0.0f, lens_v = 0.0f;
+
+ if(kernel_data.cam.aperturesize > 0.0f)
+ path_rng_2D(kg, rng, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v);
+
+ float time = 0.0f;
+
+#ifdef __CAMERA_MOTION__
+ if(kernel_data.cam.shuttertime != -1.0f)
+ time = path_rng_1D(kg, rng, sample, num_samples, PRNG_TIME);
+#endif
+
+ camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h
index 45ea0e502ab..15efb2371de 100644
--- a/intern/cycles/kernel/kernel_path_state.h
+++ b/intern/cycles/kernel/kernel_path_state.h
@@ -16,7 +16,7 @@
CCL_NAMESPACE_BEGIN
-ccl_device_inline void path_state_init(KernelGlobals *kg, PathState *state, RNG *rng, int sample, Ray *ray)
+ccl_device_inline void path_state_init(KernelGlobals *kg, ccl_addr_space PathState *state, ccl_addr_space RNG *rng, int sample, ccl_addr_space Ray *ray)
{
state->flag = PATH_RAY_CAMERA|PATH_RAY_MIS_SKIP;
@@ -51,7 +51,7 @@ ccl_device_inline void path_state_init(KernelGlobals *kg, PathState *state, RNG
#endif
}
-ccl_device_inline void path_state_next(KernelGlobals *kg, PathState *state, int label)
+ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathState *state, int label)
{
/* ray through transparent keeps same flags from previous ray and is
* not counted as a regular bounce, transparent has separate max */
@@ -138,7 +138,7 @@ ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *s
return flag;
}
-ccl_device_inline float path_state_terminate_probability(KernelGlobals *kg, PathState *state, const float3 throughput)
+ccl_device_inline float path_state_terminate_probability(KernelGlobals *kg, ccl_addr_space PathState *state, const float3 throughput)
{
if(state->flag & PATH_RAY_TRANSPARENT) {
/* transparent rays treated separately */
diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h
index f0d4e98c5e0..fe85a6b6e4b 100644
--- a/intern/cycles/kernel/kernel_path_surface.h
+++ b/intern/cycles/kernel/kernel_path_surface.h
@@ -24,7 +24,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN
{
#ifdef __EMISSION__
/* sample illumination from lights to find path contribution */
- if(!(sd->flag & SD_BSDF_HAS_EVAL))
+ if(!(ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL))
return;
Ray light_ray;
@@ -32,7 +32,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN
bool is_lamp;
#ifdef __OBJECT_MOTION__
- light_ray.time = sd->time;
+ light_ray.time = ccl_fetch(sd, time);
#endif
if(sample_all_lights) {
@@ -53,7 +53,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN
path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
LightSample ls;
- lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls);
+ lamp_light_sample(kg, i, light_u, light_v, ccl_fetch(sd, P), &ls);
if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
/* trace shadow ray */
@@ -85,7 +85,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN
light_t = 0.5f*light_t;
LightSample ls;
- light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls);
+ light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls);
if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
/* trace shadow ray */
@@ -106,7 +106,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN
path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
LightSample ls;
- light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls);
+ light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls);
/* sample random light */
if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
@@ -149,15 +149,15 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng,
path_state_next(kg, state, label);
/* setup ray */
- ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng);
+ ray->P = ray_offset(ccl_fetch(sd, P), (label & LABEL_TRANSMIT)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng));
ray->D = bsdf_omega_in;
ray->t = FLT_MAX;
#ifdef __RAY_DIFFERENTIALS__
- ray->dP = sd->dP;
+ ray->dP = ccl_fetch(sd, dP);
ray->dD = bsdf_domega_in;
#endif
#ifdef __OBJECT_MOTION__
- ray->time = sd->time;
+ ray->time = ccl_fetch(sd, time);
#endif
#ifdef __VOLUME__
@@ -181,12 +181,13 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng,
#endif
+#ifndef __SPLIT_KERNEL__
/* path tracing: connect path directly to position on a light and add it to L */
-ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG *rng,
- ShaderData *sd, float3 throughput, PathState *state, PathRadiance *L)
+ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ccl_addr_space RNG *rng,
+ ShaderData *sd, float3 throughput, ccl_addr_space PathState *state, PathRadiance *L)
{
#ifdef __EMISSION__
- if(!(kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL)))
+ if(!(kernel_data.integrator.use_direct_light && (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL)))
return;
/* sample illumination from lights to find path contribution */
@@ -199,11 +200,11 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG
bool is_lamp;
#ifdef __OBJECT_MOTION__
- light_ray.time = sd->time;
+ light_ray.time = ccl_fetch(sd, time);
#endif
LightSample ls;
- light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls);
+ light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls);
if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) {
/* trace shadow ray */
@@ -216,13 +217,14 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG
}
#endif
}
+#endif
/* path tracing: bounce off or through surface to with new direction stored in ray */
-ccl_device_inline bool kernel_path_surface_bounce(KernelGlobals *kg, RNG *rng,
- ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, Ray *ray)
+ccl_device_inline bool kernel_path_surface_bounce(KernelGlobals *kg, ccl_addr_space RNG *rng,
+ ShaderData *sd, ccl_addr_space float3 *throughput, ccl_addr_space PathState *state, PathRadiance *L, ccl_addr_space Ray *ray)
{
/* no BSDF? we can stop here */
- if(sd->flag & SD_BSDF) {
+ if(ccl_fetch(sd, flag) & SD_BSDF) {
/* sample BSDF */
float bsdf_pdf;
BsdfEval bsdf_eval;
@@ -254,16 +256,16 @@ ccl_device_inline bool kernel_path_surface_bounce(KernelGlobals *kg, RNG *rng,
path_state_next(kg, state, label);
/* setup ray */
- ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng);
+ ray->P = ray_offset(ccl_fetch(sd, P), (label & LABEL_TRANSMIT)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng));
ray->D = bsdf_omega_in;
if(state->bounce == 0)
- ray->t -= sd->ray_length; /* clipping works through transparent */
+ ray->t -= ccl_fetch(sd, ray_length); /* clipping works through transparent */
else
ray->t = FLT_MAX;
#ifdef __RAY_DIFFERENTIALS__
- ray->dP = sd->dP;
+ ray->dP = ccl_fetch(sd, dP);
ray->dD = bsdf_domega_in;
#endif
@@ -275,21 +277,21 @@ ccl_device_inline bool kernel_path_surface_bounce(KernelGlobals *kg, RNG *rng,
return true;
}
#ifdef __VOLUME__
- else if(sd->flag & SD_HAS_ONLY_VOLUME) {
+ else if(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME) {
/* no surface shader but have a volume shader? act transparent */
/* update path state, count as transparent */
path_state_next(kg, state, LABEL_TRANSPARENT);
if(state->bounce == 0)
- ray->t -= sd->ray_length; /* clipping works through transparent */
+ ray->t -= ccl_fetch(sd, ray_length); /* clipping works through transparent */
else
ray->t = FLT_MAX;
/* setup ray position, direction stays unchanged */
- ray->P = ray_offset(sd->P, -sd->Ng);
+ ray->P = ray_offset(ccl_fetch(sd, P), -ccl_fetch(sd, Ng));
#ifdef __RAY_DIFFERENTIALS__
- ray->dP = sd->dP;
+ ray->dP = ccl_fetch(sd, dP);
#endif
/* enter/exit volume */
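The ccl_fetch() accessor introduced throughout these hunks is what lets the same shading code compile against either ShaderData layout: a plain struct for the regular kernels, or a per-ray structure-of-arrays for the split kernel. A minimal sketch of the idea, assuming hypothetical __SPLIT_KERNEL_SOA__ and SD_THREAD names rather than the commit's literal definitions:

/* Sketch only; names and indexing are assumptions, not the actual patch. */
#ifdef __SPLIT_KERNEL_SOA__
/* structure-of-arrays: every ShaderData member is an array indexed by ray */
#define ccl_fetch(s, t)           ((s)->t[SD_THREAD])
#define ccl_fetch_array(s, t, i)  (&(s)->t[SD_THREAD * MAX_CLOSURE + (i)])
#else
/* array-of-structs: identical to the old sd->t member access */
#define ccl_fetch(s, t)           ((s)->t)
#define ccl_fetch_array(s, t, i)  (&(s)->t[(i)])
#endif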
diff --git a/intern/cycles/kernel/kernel_queue_enqueue.cl b/intern/cycles/kernel/kernel_queue_enqueue.cl
new file mode 100644
index 00000000000..eee7860fb84
--- /dev/null
+++ b/intern/cycles/kernel/kernel_queue_enqueue.cl
@@ -0,0 +1,98 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_compat_opencl.h"
+#include "kernel_math.h"
+#include "kernel_types.h"
+#include "kernel_globals.h"
+#include "kernel_queues.h"
+
+/*
+ * The kernel "kernel_ocl_path_trace_queue_enqueue" enqueues rays of
+ * different ray state into their appropriate Queues;
+ * 1. Rays that have been determined to hit the background from the
+ * "kernel_ocl_path_trace_scene_intersect" kernel
+ * are enqueued in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
+ * 2. Rays that have been determined to be actively participating in path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS.
+ *
+ * The inputs and outputs of the kernel are as follows:
+ *
+ * ray_state -------------------------------------------|--- kernel_ocl_path_trace_queue_enqueue --|--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------| |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---| |
+ * queuesize -------------------------------------------| |
+ *
+ * Note on Queues :
+ * State of queues during the first time this kernel is called :
+ * At entry,
+ * Both QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_HIT_BACKGROUND rays.
+ *
+ * State of queue during other times this kernel is called :
+ * At entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty.
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays.
+ */
+
+__kernel void kernel_ocl_path_trace_queue_enqueue(
+ ccl_global int *Queue_data, /* Queue memory */
+ ccl_global int *Queue_index, /* Tracks the number of elements in each queue */
+ ccl_global char *ray_state, /* Denotes the state of each ray */
+ int queuesize /* Size (capacity) of each queue */
+ )
+{
+ /* We have only 2 cases (Hit/Not-Hit) */
+ ccl_local unsigned int local_queue_atomics[2];
+
+ int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0);
+ int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+
+ if(lidx < 2) {
+ local_queue_atomics[lidx] = 0;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ int queue_number = -1;
+
+ if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+ queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
+ } else if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+ queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS;
+ }
+
+ unsigned int my_lqidx;
+ if(queue_number != -1) {
+ my_lqidx = get_local_queue_index(queue_number, local_queue_atomics);
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if(lidx == 0) {
+ local_queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS, local_queue_atomics, Queue_index);
+ local_queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, local_queue_atomics, Queue_index);
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ unsigned int my_gqidx;
+ if(queue_number != -1) {
+ my_gqidx = get_global_queue_index(queue_number, queuesize, my_lqidx, local_queue_atomics);
+ Queue_data[my_gqidx] = ray_index;
+ }
+}
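The kernel above deliberately splits the enqueue into three phases so that global atomic traffic stays at two operations per work-group instead of one per thread: local atomics reserve slots within the group, thread 0 converts the per-queue counts into global base offsets, and every thread then scatters its ray index into its reserved slot. A single-threaded C model of the same arithmetic, a sketch with made-up sizes rather than anything from the patch:

#include <stdio.h>

#define NUM_QUEUES 2
#define QUEUE_SIZE 8
#define NUM_RAYS   6

int main(void)
{
	int queue_data[NUM_QUEUES * QUEUE_SIZE];
	int queue_index[NUM_QUEUES] = {0, 0};         /* global fill counts */
	int ray_queue[NUM_RAYS] = {0, 1, 0, 0, 1, 0}; /* target queue per ray */
	int local_count[NUM_QUEUES] = {0, 0};
	int local_slot[NUM_RAYS];

	/* phase 1: per-thread local atomics reserve a slot inside the group */
	for(int ray = 0; ray < NUM_RAYS; ray++)
		local_slot[ray] = local_count[ray_queue[ray]]++;

	/* phase 2: one global atomic per queue turns counts into base offsets */
	int base[NUM_QUEUES];
	for(int q = 0; q < NUM_QUEUES; q++) {
		base[q] = queue_index[q];
		queue_index[q] += local_count[q];
	}

	/* phase 3: each thread scatters its ray index into its reserved slot */
	for(int ray = 0; ray < NUM_RAYS; ray++) {
		int q = ray_queue[ray];
		queue_data[q * QUEUE_SIZE + base[q] + local_slot[ray]] = ray;
	}

	printf("queue 0 holds %d rays; queue 1 holds %d rays\n",
	       queue_index[0], queue_index[1]);
	return 0;
}

Running this prints "queue 0 holds 4 rays; queue 1 holds 2 rays", matching a hand count of the ray_queue array.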
diff --git a/intern/cycles/kernel/kernel_queues.h b/intern/cycles/kernel/kernel_queues.h
new file mode 100644
index 00000000000..9e65e2b0768
--- /dev/null
+++ b/intern/cycles/kernel/kernel_queues.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __KERNEL_QUEUE_H__
+#define __KERNEL_QUEUE_H__
+
+/*
+ * Queue utility functions for split kernel
+ */
+
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+
+/*
+ * Enqueue ray index into the queue
+ */
+ccl_device void enqueue_ray_index(
+ int ray_index, /* Ray index to be enqueued */
+ int queue_number, /* Queue in which the ray index should be enqueued */
+ ccl_global int *queues, /* Buffer of all queues */
+ int queue_size, /* Size of each queue */
+ ccl_global int *queue_index /* Array of size num_queues; Used for atomic increment */
+ )
+{
+ /* This thread's queue index */
+ int my_queue_index = atomic_inc(&queue_index[queue_number]) + (queue_number * queue_size);
+ queues[my_queue_index] = ray_index;
+}
+
+/*
+ * Get the ray index for this thread
+ * Returns a positive ray_index for threads that have some work to do;
+ * returns 'QUEUE_EMPTY_SLOT' for threads that don't have any work,
+ * i.e. all rays in the queue have already been handed out and there
+ * are no more rays to allocate to other threads.
+ */
+ccl_device int get_ray_index(
+ int thread_index, /* Global thread index */
+ int queue_number, /* Queue to operate on */
+ ccl_global int *queues, /* Buffer of all queues */
+ int queuesize, /* Size of a queue */
+ int empty_queue /* Empty the queue slot as soon as we fetch the ray index */
+ )
+{
+ int ray_index = queues[queue_number * queuesize + thread_index];
+
+ if(empty_queue && ray_index != QUEUE_EMPTY_SLOT) {
+ queues[queue_number * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+ }
+
+ return ray_index;
+}
+
+/* The following functions implement the local-memory variant of the enqueue_ray_index function */
+
+/* All threads should call this function */
+ccl_device void enqueue_ray_index_local(
+ int ray_index, /* Ray index to enqueue */
+ int queue_number, /* Queue in which to enqueue ray index */
+ char enqueue_flag, /* True for threads whose ray index has to be enqueued */
+ int queuesize, /* queue size */
+ ccl_local unsigned int *local_queue_atomics, /* To do local queue atomics */
+ ccl_global int *Queue_data, /* Queues */
+ ccl_global int *Queue_index /* To do global queue atomics */
+ )
+{
+ int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0);
+
+ /* Get local queue id */
+ unsigned int lqidx;
+ if(enqueue_flag) {
+ lqidx = atomic_inc(local_queue_atomics);
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ /* Get global queue offset */
+ if(lidx == 0) {
+ *local_queue_atomics = atomic_add(&Queue_index[queue_number], *local_queue_atomics);
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ /* Get global queue index and enqueue ray */
+ if(enqueue_flag) {
+ unsigned int my_gqidx = queue_number * queuesize + (*local_queue_atomics) + lqidx;
+ Queue_data[my_gqidx] = ray_index;
+ }
+}
+
+ccl_device unsigned int get_local_queue_index(
+ int queue_number, /* Queue in which to enqueue the ray; -1 if no queue */
+ ccl_local unsigned int *local_queue_atomics
+ )
+{
+ int my_lqidx = atomic_inc(&local_queue_atomics[queue_number]);
+ return my_lqidx;
+}
+
+ccl_device unsigned int get_global_per_queue_offset(
+ int queue_number,
+ ccl_local unsigned int *local_queue_atomics,
+ ccl_global int* global_queue_atomics
+ )
+{
+ unsigned int queue_offset = atomic_add((&global_queue_atomics[queue_number]), local_queue_atomics[queue_number]);
+ return queue_offset;
+}
+
+ccl_device unsigned int get_global_queue_index(
+ int queue_number,
+ int queuesize,
+ unsigned int lqidx,
+ ccl_local unsigned int *global_per_queue_offset
+ )
+{
+ int my_gqidx = queuesize * queue_number + lqidx + global_per_queue_offset[queue_number];
+ return my_gqidx;
+}
+
+#endif /* __KERNEL_QUEUE_H__ */
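For orientation, the helpers above are consumed in a fixed pattern by the split kernels that follow in this patch: compute a flat thread index, fetch a ray index from the queue, and bail out on empty slots. A representative consumer sketch (the same shape appears in kernel_scene_intersect.cl below):

int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
int ray_index = get_ray_index(thread_index,
                              QUEUE_ACTIVE_AND_REGENERATED_RAYS,
                              Queue_data, queuesize,
                              0 /* keep the slot; later kernels reread it */);
if(ray_index == QUEUE_EMPTY_SLOT)
	return;  /* more threads were launched than rays were enqueued */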
diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h
index 40767bac013..631a2cb75de 100644
--- a/intern/cycles/kernel/kernel_random.h
+++ b/intern/cycles/kernel/kernel_random.h
@@ -98,7 +98,7 @@ ccl_device uint sobol_lookup(const uint m, const uint frame, const uint ex, cons
return index;
}
-ccl_device_inline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension)
+ccl_device_inline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension)
{
#ifdef __CMJ__
if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
@@ -132,7 +132,7 @@ ccl_device_inline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int
#endif
}
-ccl_device_inline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
+ccl_device_inline void path_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
{
#ifdef __CMJ__
if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
@@ -149,7 +149,7 @@ ccl_device_inline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int
}
}
-ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, RNG *rng, int x, int y, float *fx, float *fy)
+ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, ccl_addr_space RNG *rng, int x, int y, float *fx, float *fy)
{
#ifdef __SOBOL_FULL_SCREEN__
uint px, py;
@@ -261,12 +261,12 @@ ccl_device uint lcg_init(uint seed)
* For branches in the path we must be careful not to reuse the same number
* in a sequence and offset accordingly. */
-ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension)
+ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension)
{
return path_rng_1D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension);
}
-ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension)
+ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension)
{
/* the rng_offset is not increased for transparent bounces. if we do then
* fully transparent objects can become subtly visible by the different
@@ -279,23 +279,23 @@ ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, RNG *r
return path_rng_1D(kg, rng, state->sample, state->num_samples, rng_offset + dimension);
}
-ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension, float *fx, float *fy)
+ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension, float *fx, float *fy)
{
path_rng_2D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension, fx, fy);
}
-ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
+ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
{
return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension);
}
-ccl_device_inline float path_branched_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
+ccl_device_inline float path_branched_rng_1D_for_decision(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
{
int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM;
return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, rng_offset + dimension);
}
-ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy)
+ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy)
{
path_rng_2D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension, fx, fy);
}
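Every signature change in this hunk threads ccl_addr_space through the RNG and PathState pointers so that one implementation serves both kernel flavours: the megakernel keeps this state in private memory, the split kernel in global buffers. A plausible compat-header sketch of the qualifier, an assumption rather than the commit's literal definition:

#ifdef __SPLIT_KERNEL__
#  define ccl_addr_space ccl_global  /* per-ray state lives in global buffers */
#else
#  define ccl_addr_space             /* per-ray state lives in private memory */
#endif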
diff --git a/intern/cycles/kernel/kernel_scene_intersect.cl b/intern/cycles/kernel/kernel_scene_intersect.cl
new file mode 100644
index 00000000000..6817e28a302
--- /dev/null
+++ b/intern/cycles/kernel/kernel_scene_intersect.cl
@@ -0,0 +1,164 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split.h"
+
+/*
+ * Note on kernel_ocl_path_trace_scene_intersect kernel.
+ * This is the second kernel in the ray tracing logic, and the first
+ * of the path iteration kernels. This kernel takes care of the scene_intersect function.
+ *
+ * This kernel changes the ray_state of RAY_REGENERATED rays to RAY_ACTIVE.
+ * This kernel processes rays of ray state RAY_ACTIVE.
+ * This kernel determines the rays that have hit the background and changes their ray state to RAY_HIT_BACKGROUND.
+ *
+ * The inputs and outputs are as follows:
+ *
+ * Ray_coop ---------------------------------------|--------- kernel_ocl_path_trace_scene_intersect----------|--- PathState
+ * PathState_coop ---------------------------------| |--- Intersection
+ * ray_state --------------------------------------| |--- ray_state
+ * use_queues_flag --------------------------------| |
+ * parallel_samples -------------------------------| |
+ * QueueData(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---| |
+ * kg (data + globals) ----------------------------| |
+ * rng_coop ---------------------------------------| |
+ * sw ---------------------------------------------| |
+ * sh ---------------------------------------------| |
+ * queuesize --------------------------------------| |
+ *
+ * Note on Queues :
+ * Ideally we would want kernel_ocl_path_trace_scene_intersect to work on queues.
+ * But the very first time this kernel runs, the queues will be empty, and hence we perform a direct mapping
+ * between ray-index and thread-index; from then onward, the queues will be filled and
+ * we may start operating on queues.
+ *
+ * State of queues during the first time this kernel is called :
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty before and after this kernel.
+ *
+ * State of queues during other times this kernel is called :
+ * At entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will have a mix of RAY_ACTIVE, RAY_UPDATE_BUFFER and RAY_REGENERATED rays;
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays;
+ * (The rays that are in the state RAY_UPDATE_BUFFER in both queues are actually the same rays; these
+ * are the rays that were in RAY_ACTIVE state during the initial enqueue but, on further processing
+ * by different kernels, have turned into RAY_UPDATE_BUFFER rays. Since all kernels, even after fetching from
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS, proceed further based on ray state information, RAY_UPDATE_BUFFER rays
+ * being present in QUEUE_ACTIVE_AND_REGENERATED_RAYS does not cause any logical issues.)
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS - all RAY_REGENERATED rays will have been converted to RAY_ACTIVE, and
+ * some rays in the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue will have moved to state RAY_HIT_BACKGROUND.
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS - no change.
+ */
+
+__kernel void kernel_ocl_path_trace_scene_intersect(
+ ccl_global char *globals,
+ ccl_constant KernelData *data,
+ ccl_global uint *rng_coop,
+ ccl_global Ray *Ray_coop, /* Required for scene_intersect */
+ ccl_global PathState *PathState_coop, /* Required for scene_intersect */
+ Intersection *Intersection_coop, /* Required for scene_intersect */
+ ccl_global char *ray_state, /* Denotes the state of each ray */
+ int sw, int sh,
+ ccl_global int *Queue_data, /* Memory for queues */
+ ccl_global int *Queue_index, /* Tracks the number of elements in queues */
+ int queuesize, /* Size (capacity) of queues */
+ ccl_global char *use_queues_flag, /* used to decide if this kernel should use queues to fetch ray index */
+#ifdef __KERNEL_DEBUG__
+ DebugData *debugdata_coop,
+#endif
+ int parallel_samples /* Number of samples to be processed in parallel */
+ )
+{
+ int x = get_global_id(0);
+ int y = get_global_id(1);
+
+ /* Fetch use_queues_flag */
+ ccl_local char local_use_queues_flag;
+ if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+ local_use_queues_flag = use_queues_flag[0];
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ int ray_index;
+ if(local_use_queues_flag) {
+ int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+ ray_index = get_ray_index(thread_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0);
+
+ if(ray_index == QUEUE_EMPTY_SLOT) {
+ return;
+ }
+ } else {
+ if(x < (sw * parallel_samples) && y < sh) {
+ ray_index = x + y * (sw * parallel_samples);
+ } else {
+ return;
+ }
+ }
+
+ /* All regenerated rays become active here */
+ if(IS_STATE(ray_state, ray_index, RAY_REGENERATED))
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_ACTIVE);
+
+ if(!IS_STATE(ray_state, ray_index, RAY_ACTIVE))
+ return;
+
+ /* Load kernel globals structure */
+ KernelGlobals *kg = (KernelGlobals *)globals;
+
+#ifdef __KERNEL_DEBUG__
+ DebugData *debug_data = &debugdata_coop[ray_index];
+#endif
+ Intersection *isect = &Intersection_coop[ray_index];
+ PathState state = PathState_coop[ray_index];
+ Ray ray = Ray_coop[ray_index];
+
+ /* intersect scene */
+ uint visibility = path_state_ray_visibility(kg, &state);
+
+#ifdef __HAIR__
+ float difl = 0.0f, extmax = 0.0f;
+ uint lcg_state = 0;
+ RNG rng = rng_coop[ray_index];
+
+ if(kernel_data.bvh.have_curves) {
+ if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) {
+ float3 pixdiff = ray.dD.dx + ray.dD.dy;
+ /*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/
+ difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f;
+ }
+
+ extmax = kernel_data.curve.maximum_width;
+ lcg_state = lcg_state_init(&rng, &state, 0x51633e2d);
+ }
+
+ bool hit = scene_intersect(kg, &ray, visibility, isect, &lcg_state, difl, extmax);
+#else
+ bool hit = scene_intersect(kg, &ray, visibility, isect, NULL, 0.0f, 0.0f);
+#endif
+
+#ifdef __KERNEL_DEBUG__
+ if(state.flag & PATH_RAY_CAMERA) {
+ debug_data->num_bvh_traversal_steps += isect->num_traversal_steps;
+ }
+#endif
+
+ if(!hit) {
+ /* Change the state of rays that hit the background;
+ * these rays undergo special processing in the
+ * background_buffer_update kernel. */
+ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND);
+ }
+}
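The IS_STATE and ASSIGN_RAY_STATE helpers this kernel leans on could, in their simplest form, be plain per-ray byte compares and stores against the ray_state buffer. A minimal sketch under that assumption (the authoritative macros ship with kernel_split.h and may additionally carry flag bits):

#define ASSIGN_RAY_STATE(ray_state, ray_index, state) \
	(ray_state[(ray_index)] = (state))
#define IS_STATE(ray_state, ray_index, state) \
	(ray_state[(ray_index)] == (state))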
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index e9d9f72dfcd..a12419624c3 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -52,55 +52,55 @@ ccl_device void shader_setup_from_ray(KernelGlobals *kg, ShaderData *sd,
const Intersection *isect, const Ray *ray, int bounce, int transparent_bounce)
{
#ifdef __INSTANCING__
- sd->object = (isect->object == PRIM_NONE)? kernel_tex_fetch(__prim_object, isect->prim): isect->object;
+ ccl_fetch(sd, object) = (isect->object == PRIM_NONE)? kernel_tex_fetch(__prim_object, isect->prim): isect->object;
#endif
- sd->type = isect->type;
- sd->flag = kernel_tex_fetch(__object_flag, sd->object);
+ ccl_fetch(sd, type) = isect->type;
+ ccl_fetch(sd, flag) = kernel_tex_fetch(__object_flag, ccl_fetch(sd, object));
/* matrices and time */
#ifdef __OBJECT_MOTION__
shader_setup_object_transforms(kg, sd, ray->time);
- sd->time = ray->time;
+ ccl_fetch(sd, time) = ray->time;
#endif
- sd->prim = kernel_tex_fetch(__prim_index, isect->prim);
- sd->ray_length = isect->t;
- sd->ray_depth = bounce;
- sd->transparent_depth = transparent_bounce;
+ ccl_fetch(sd, prim) = kernel_tex_fetch(__prim_index, isect->prim);
+ ccl_fetch(sd, ray_length) = isect->t;
+ ccl_fetch(sd, ray_depth) = bounce;
+ ccl_fetch(sd, transparent_depth) = transparent_bounce;
#ifdef __UV__
- sd->u = isect->u;
- sd->v = isect->v;
+ ccl_fetch(sd, u) = isect->u;
+ ccl_fetch(sd, v) = isect->v;
#endif
#ifdef __HAIR__
- if(sd->type & PRIMITIVE_ALL_CURVE) {
+ if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
/* curve */
- float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
+ float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
- sd->shader = __float_as_int(curvedata.z);
- sd->P = bvh_curve_refine(kg, sd, isect, ray);
+ ccl_fetch(sd, shader) = __float_as_int(curvedata.z);
+ ccl_fetch(sd, P) = bvh_curve_refine(kg, sd, isect, ray);
}
else
#endif
- if(sd->type & PRIMITIVE_TRIANGLE) {
+ if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) {
/* static triangle */
float3 Ng = triangle_normal(kg, sd);
- sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
+ ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim));
/* vectors */
- sd->P = triangle_refine(kg, sd, isect, ray);
- sd->Ng = Ng;
- sd->N = Ng;
+ ccl_fetch(sd, P) = triangle_refine(kg, sd, isect, ray);
+ ccl_fetch(sd, Ng) = Ng;
+ ccl_fetch(sd, N) = Ng;
/* smooth normal */
- if(sd->shader & SHADER_SMOOTH_NORMAL)
- sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
+ if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL)
+ ccl_fetch(sd, N) = triangle_smooth_normal(kg, ccl_fetch(sd, prim), ccl_fetch(sd, u), ccl_fetch(sd, v));
#ifdef __DPDU__
/* dPdu/dPdv */
- triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
+ triangle_dPdudv(kg, ccl_fetch(sd, prim), &ccl_fetch(sd, dPdu), &ccl_fetch(sd, dPdv));
#endif
}
else {
@@ -108,40 +108,40 @@ ccl_device void shader_setup_from_ray(KernelGlobals *kg, ShaderData *sd,
motion_triangle_shader_setup(kg, sd, isect, ray, false);
}
- sd->I = -ray->D;
+ ccl_fetch(sd, I) = -ray->D;
- sd->flag |= kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2);
+ ccl_fetch(sd, flag) |= kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*2);
#ifdef __INSTANCING__
if(isect->object != OBJECT_NONE) {
/* instance transform */
- object_normal_transform(kg, sd, &sd->N);
- object_normal_transform(kg, sd, &sd->Ng);
+ object_normal_transform_auto(kg, sd, &ccl_fetch(sd, N));
+ object_normal_transform_auto(kg, sd, &ccl_fetch(sd, Ng));
#ifdef __DPDU__
- object_dir_transform(kg, sd, &sd->dPdu);
- object_dir_transform(kg, sd, &sd->dPdv);
+ object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdu));
+ object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdv));
#endif
}
#endif
/* backfacing test */
- bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
+ bool backfacing = (dot(ccl_fetch(sd, Ng), ccl_fetch(sd, I)) < 0.0f);
if(backfacing) {
- sd->flag |= SD_BACKFACING;
- sd->Ng = -sd->Ng;
- sd->N = -sd->N;
+ ccl_fetch(sd, flag) |= SD_BACKFACING;
+ ccl_fetch(sd, Ng) = -ccl_fetch(sd, Ng);
+ ccl_fetch(sd, N) = -ccl_fetch(sd, N);
#ifdef __DPDU__
- sd->dPdu = -sd->dPdu;
- sd->dPdv = -sd->dPdv;
+ ccl_fetch(sd, dPdu) = -ccl_fetch(sd, dPdu);
+ ccl_fetch(sd, dPdv) = -ccl_fetch(sd, dPdv);
#endif
}
#ifdef __RAY_DIFFERENTIALS__
/* differentials */
- differential_transfer(&sd->dP, ray->dP, ray->D, ray->dD, sd->Ng, isect->t);
- differential_incoming(&sd->dI, ray->dD);
- differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng);
+ differential_transfer(&ccl_fetch(sd, dP), ray->dP, ray->D, ray->dD, ccl_fetch(sd, Ng), isect->t);
+ differential_incoming(&ccl_fetch(sd, dI), ray->dD);
+ differential_dudv(&ccl_fetch(sd, du), &ccl_fetch(sd, dv), ccl_fetch(sd, dPdu), ccl_fetch(sd, dPdv), ccl_fetch(sd, dP), ccl_fetch(sd, Ng));
#endif
}
@@ -230,105 +230,105 @@ ccl_device void shader_setup_from_sample(KernelGlobals *kg, ShaderData *sd,
int shader, int object, int prim, float u, float v, float t, float time, int bounce, int transparent_bounce)
{
/* vectors */
- sd->P = P;
- sd->N = Ng;
- sd->Ng = Ng;
- sd->I = I;
- sd->shader = shader;
- sd->type = (prim == PRIM_NONE)? PRIMITIVE_NONE: PRIMITIVE_TRIANGLE;
+ ccl_fetch(sd, P) = P;
+ ccl_fetch(sd, N) = Ng;
+ ccl_fetch(sd, Ng) = Ng;
+ ccl_fetch(sd, I) = I;
+ ccl_fetch(sd, shader) = shader;
+ ccl_fetch(sd, type) = (prim == PRIM_NONE)? PRIMITIVE_NONE: PRIMITIVE_TRIANGLE;
/* primitive */
#ifdef __INSTANCING__
- sd->object = object;
+ ccl_fetch(sd, object) = object;
#endif
/* currently no access to bvh prim index for strand sd->prim*/
- sd->prim = prim;
+ ccl_fetch(sd, prim) = prim;
#ifdef __UV__
- sd->u = u;
- sd->v = v;
+ ccl_fetch(sd, u) = u;
+ ccl_fetch(sd, v) = v;
#endif
- sd->ray_length = t;
- sd->ray_depth = bounce;
- sd->transparent_depth = transparent_bounce;
+ ccl_fetch(sd, ray_length) = t;
+ ccl_fetch(sd, ray_depth) = bounce;
+ ccl_fetch(sd, transparent_depth) = transparent_bounce;
/* detect instancing, for non-instanced the object index is -object-1 */
#ifdef __INSTANCING__
bool instanced = false;
- if(sd->prim != PRIM_NONE) {
- if(sd->object >= 0)
+ if(ccl_fetch(sd, prim) != PRIM_NONE) {
+ if(ccl_fetch(sd, object) >= 0)
instanced = true;
else
#endif
- sd->object = ~sd->object;
+ ccl_fetch(sd, object) = ~ccl_fetch(sd, object);
#ifdef __INSTANCING__
}
#endif
- sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2);
- if(sd->object != OBJECT_NONE) {
- sd->flag |= kernel_tex_fetch(__object_flag, sd->object);
+ ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*2);
+ if(ccl_fetch(sd, object) != OBJECT_NONE) {
+ ccl_fetch(sd, flag) |= kernel_tex_fetch(__object_flag, ccl_fetch(sd, object));
#ifdef __OBJECT_MOTION__
shader_setup_object_transforms(kg, sd, time);
}
- sd->time = time;
+ ccl_fetch(sd, time) = time;
#else
}
#endif
- if(sd->type & PRIMITIVE_TRIANGLE) {
+ if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) {
/* smooth normal */
- if(sd->shader & SHADER_SMOOTH_NORMAL) {
- sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
+ if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) {
+ ccl_fetch(sd, N) = triangle_smooth_normal(kg, ccl_fetch(sd, prim), ccl_fetch(sd, u), ccl_fetch(sd, v));
#ifdef __INSTANCING__
if(instanced)
- object_normal_transform(kg, sd, &sd->N);
+ object_normal_transform_auto(kg, sd, &ccl_fetch(sd, N));
#endif
}
/* dPdu/dPdv */
#ifdef __DPDU__
- triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
+ triangle_dPdudv(kg, ccl_fetch(sd, prim), &ccl_fetch(sd, dPdu), &ccl_fetch(sd, dPdv));
#ifdef __INSTANCING__
if(instanced) {
- object_dir_transform(kg, sd, &sd->dPdu);
- object_dir_transform(kg, sd, &sd->dPdv);
+ object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdu));
+ object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdv));
}
#endif
#endif
}
else {
#ifdef __DPDU__
- sd->dPdu = make_float3(0.0f, 0.0f, 0.0f);
- sd->dPdv = make_float3(0.0f, 0.0f, 0.0f);
+ ccl_fetch(sd, dPdu) = make_float3(0.0f, 0.0f, 0.0f);
+ ccl_fetch(sd, dPdv) = make_float3(0.0f, 0.0f, 0.0f);
#endif
}
/* backfacing test */
- if(sd->prim != PRIM_NONE) {
- bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
+ if(ccl_fetch(sd, prim) != PRIM_NONE) {
+ bool backfacing = (dot(ccl_fetch(sd, Ng), ccl_fetch(sd, I)) < 0.0f);
if(backfacing) {
- sd->flag |= SD_BACKFACING;
- sd->Ng = -sd->Ng;
- sd->N = -sd->N;
+ ccl_fetch(sd, flag) |= SD_BACKFACING;
+ ccl_fetch(sd, Ng) = -ccl_fetch(sd, Ng);
+ ccl_fetch(sd, N) = -ccl_fetch(sd, N);
#ifdef __DPDU__
- sd->dPdu = -sd->dPdu;
- sd->dPdv = -sd->dPdv;
+ ccl_fetch(sd, dPdu) = -ccl_fetch(sd, dPdu);
+ ccl_fetch(sd, dPdv) = -ccl_fetch(sd, dPdv);
#endif
}
}
#ifdef __RAY_DIFFERENTIALS__
/* no ray differentials here yet */
- sd->dP = differential3_zero();
- sd->dI = differential3_zero();
- sd->du = differential_zero();
- sd->dv = differential_zero();
+ ccl_fetch(sd, dP) = differential3_zero();
+ ccl_fetch(sd, dI) = differential3_zero();
+ ccl_fetch(sd, du) = differential_zero();
+ ccl_fetch(sd, dv) = differential_zero();
#endif
}
@@ -355,45 +355,46 @@ ccl_device void shader_setup_from_displace(KernelGlobals *kg, ShaderData *sd,
ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderData *sd, const Ray *ray, int bounce, int transparent_bounce)
{
/* vectors */
- sd->P = ray->D;
- sd->N = -ray->D;
- sd->Ng = -ray->D;
- sd->I = -ray->D;
- sd->shader = kernel_data.background.surface_shader;
- sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2);
+ ccl_fetch(sd, P) = ray->D;
+ ccl_fetch(sd, N) = -ray->D;
+ ccl_fetch(sd, Ng) = -ray->D;
+ ccl_fetch(sd, I) = -ray->D;
+ ccl_fetch(sd, shader) = kernel_data.background.surface_shader;
+ ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*2);
#ifdef __OBJECT_MOTION__
- sd->time = ray->time;
+ ccl_fetch(sd, time) = ray->time;
#endif
- sd->ray_length = 0.0f;
- sd->ray_depth = bounce;
- sd->transparent_depth = transparent_bounce;
+ ccl_fetch(sd, ray_length) = 0.0f;
+ ccl_fetch(sd, ray_depth) = bounce;
+ ccl_fetch(sd, transparent_depth) = transparent_bounce;
#ifdef __INSTANCING__
- sd->object = PRIM_NONE;
+ ccl_fetch(sd, object) = PRIM_NONE;
#endif
- sd->prim = PRIM_NONE;
+ ccl_fetch(sd, prim) = PRIM_NONE;
#ifdef __UV__
- sd->u = 0.0f;
- sd->v = 0.0f;
+ ccl_fetch(sd, u) = 0.0f;
+ ccl_fetch(sd, v) = 0.0f;
#endif
#ifdef __DPDU__
/* dPdu/dPdv */
- sd->dPdu = make_float3(0.0f, 0.0f, 0.0f);
- sd->dPdv = make_float3(0.0f, 0.0f, 0.0f);
+ ccl_fetch(sd, dPdu) = make_float3(0.0f, 0.0f, 0.0f);
+ ccl_fetch(sd, dPdv) = make_float3(0.0f, 0.0f, 0.0f);
#endif
#ifdef __RAY_DIFFERENTIALS__
/* differentials */
- sd->dP = ray->dD;
- differential_incoming(&sd->dI, sd->dP);
- sd->du = differential_zero();
- sd->dv = differential_zero();
+ ccl_fetch(sd, dP) = ray->dD;
+ differential_incoming(&ccl_fetch(sd, dI), ccl_fetch(sd, dP));
+ ccl_fetch(sd, du) = differential_zero();
+ ccl_fetch(sd, dv) = differential_zero();
#endif
}
/* ShaderData setup from point inside volume */
+#ifdef __VOLUME__
ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *sd, const Ray *ray, int bounce, int transparent_bounce)
{
/* vectors */
@@ -439,6 +440,7 @@ ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *s
sd->ray_P = ray->P;
sd->ray_dP = ray->dP;
}
+#endif
/* Merging */
@@ -491,11 +493,11 @@ ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg, const ShaderDa
{
/* this is the veach one-sample model with balance heuristic, some pdf
* factors drop out when using balance heuristic weighting */
- for(int i = 0; i< sd->num_closure; i++) {
+ for(int i = 0; i< ccl_fetch(sd, num_closure); i++) {
if(i == skip_bsdf)
continue;
- const ShaderClosure *sc = &sd->closure[i];
+ const ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
if(CLOSURE_IS_BSDF(sc->type)) {
float bsdf_pdf = 0.0f;
@@ -513,7 +515,7 @@ ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg, const ShaderDa
*pdf = (sum_sample_weight > 0.0f)? sum_pdf/sum_sample_weight: 0.0f;
}
-ccl_device void shader_bsdf_eval(KernelGlobals *kg, const ShaderData *sd,
+ccl_device void shader_bsdf_eval(KernelGlobals *kg, ShaderData *sd,
const float3 omega_in, BsdfEval *eval, float *pdf)
{
bsdf_eval_init(eval, NBUILTIN_CLOSURES, make_float3(0.0f, 0.0f, 0.0f), kernel_data.film.use_light_pass);
@@ -527,22 +529,22 @@ ccl_device int shader_bsdf_sample(KernelGlobals *kg, const ShaderData *sd,
{
int sampled = 0;
- if(sd->num_closure > 1) {
+ if(ccl_fetch(sd, num_closure) > 1) {
/* pick a BSDF closure based on sample weights */
float sum = 0.0f;
- for(sampled = 0; sampled < sd->num_closure; sampled++) {
- const ShaderClosure *sc = &sd->closure[sampled];
+ for(sampled = 0; sampled < ccl_fetch(sd, num_closure); sampled++) {
+ const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled);
if(CLOSURE_IS_BSDF(sc->type))
sum += sc->sample_weight;
}
- float r = sd->randb_closure*sum;
+ float r = ccl_fetch(sd, randb_closure)*sum;
sum = 0.0f;
- for(sampled = 0; sampled < sd->num_closure; sampled++) {
- const ShaderClosure *sc = &sd->closure[sampled];
+ for(sampled = 0; sampled < ccl_fetch(sd, num_closure); sampled++) {
+ const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled);
if(CLOSURE_IS_BSDF(sc->type)) {
sum += sc->sample_weight;
@@ -552,13 +554,14 @@ ccl_device int shader_bsdf_sample(KernelGlobals *kg, const ShaderData *sd,
}
}
- if(sampled == sd->num_closure) {
+ if(sampled == ccl_fetch(sd, num_closure)) {
*pdf = 0.0f;
return LABEL_NONE;
}
}
- const ShaderClosure *sc = &sd->closure[sampled];
+ const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled);
+
int label;
float3 eval;
@@ -568,7 +571,7 @@ ccl_device int shader_bsdf_sample(KernelGlobals *kg, const ShaderData *sd,
if(*pdf != 0.0f) {
bsdf_eval_init(bsdf_eval, sc->type, eval*sc->weight, kernel_data.film.use_light_pass);
- if(sd->num_closure > 1) {
+ if(ccl_fetch(sd, num_closure) > 1) {
float sweight = sc->sample_weight;
_shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sampled, bsdf_eval, *pdf*sweight, sweight);
}
@@ -595,8 +598,8 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg, const ShaderData *s
ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughness)
{
- for(int i = 0; i< sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ for(int i = 0; i< ccl_fetch(sd, num_closure); i++) {
+ ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
if(CLOSURE_IS_BSDF(sc->type))
bsdf_blur(kg, sc, roughness);
@@ -605,13 +608,13 @@ ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughn
ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd)
{
- if(sd->flag & SD_HAS_ONLY_VOLUME)
+ if(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME)
return make_float3(1.0f, 1.0f, 1.0f);
float3 eval = make_float3(0.0f, 0.0f, 0.0f);
- for(int i = 0; i< sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ for(int i = 0; i< ccl_fetch(sd, num_closure); i++) {
+ ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) // todo: make this work for osl
eval += sc->weight;
@@ -634,8 +637,8 @@ ccl_device float3 shader_bsdf_diffuse(KernelGlobals *kg, ShaderData *sd)
{
float3 eval = make_float3(0.0f, 0.0f, 0.0f);
- for(int i = 0; i< sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ for(int i = 0; i< ccl_fetch(sd, num_closure); i++) {
+ ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
if(CLOSURE_IS_BSDF_DIFFUSE(sc->type))
eval += sc->weight;
@@ -648,8 +651,8 @@ ccl_device float3 shader_bsdf_glossy(KernelGlobals *kg, ShaderData *sd)
{
float3 eval = make_float3(0.0f, 0.0f, 0.0f);
- for(int i = 0; i< sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ for(int i = 0; i< ccl_fetch(sd, num_closure); i++) {
+ ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
if(CLOSURE_IS_BSDF_GLOSSY(sc->type))
eval += sc->weight;
@@ -662,8 +665,8 @@ ccl_device float3 shader_bsdf_transmission(KernelGlobals *kg, ShaderData *sd)
{
float3 eval = make_float3(0.0f, 0.0f, 0.0f);
- for(int i = 0; i< sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ for(int i = 0; i< ccl_fetch(sd, num_closure); i++) {
+ ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
if(CLOSURE_IS_BSDF_TRANSMISSION(sc->type))
eval += sc->weight;
@@ -676,8 +679,8 @@ ccl_device float3 shader_bsdf_subsurface(KernelGlobals *kg, ShaderData *sd)
{
float3 eval = make_float3(0.0f, 0.0f, 0.0f);
- for(int i = 0; i< sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ for(int i = 0; i< ccl_fetch(sd, num_closure); i++) {
+ ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
if(CLOSURE_IS_BSSRDF(sc->type) || CLOSURE_IS_BSDF_BSSRDF(sc->type))
eval += sc->weight;
@@ -691,8 +694,8 @@ ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_fac
float3 eval = make_float3(0.0f, 0.0f, 0.0f);
float3 N = make_float3(0.0f, 0.0f, 0.0f);
- for(int i = 0; i< sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ for(int i = 0; i< ccl_fetch(sd, num_closure); i++) {
+ ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
if(CLOSURE_IS_BSDF_DIFFUSE(sc->type)) {
eval += sc->weight*ao_factor;
@@ -700,12 +703,12 @@ ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_fac
}
else if(CLOSURE_IS_AMBIENT_OCCLUSION(sc->type)) {
eval += sc->weight;
- N += sd->N*average(sc->weight);
+ N += ccl_fetch(sd, N)*average(sc->weight);
}
}
if(is_zero(N))
- N = sd->N;
+ N = ccl_fetch(sd, N);
else
N = normalize(N);
@@ -719,8 +722,8 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b
float3 N = make_float3(0.0f, 0.0f, 0.0f);
float texture_blur = 0.0f, weight_sum = 0.0f;
- for(int i = 0; i< sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ for(int i = 0; i< ccl_fetch(sd, num_closure); i++) {
+ ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
if(CLOSURE_IS_BSSRDF(sc->type)) {
float avg_weight = fabsf(average(sc->weight));
@@ -733,7 +736,7 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b
}
if(N_)
- *N_ = (is_zero(N))? sd->N: normalize(N);
+ *N_ = (is_zero(N))? ccl_fetch(sd, N): normalize(N);
if(texture_blur_)
*texture_blur_ = texture_blur/weight_sum;
@@ -745,7 +748,7 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b
ccl_device float3 emissive_eval(KernelGlobals *kg, ShaderData *sd, ShaderClosure *sc)
{
- return emissive_simple_eval(sd->Ng, sd->I);
+ return emissive_simple_eval(ccl_fetch(sd, Ng), ccl_fetch(sd, I));
}
ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd)
@@ -753,8 +756,8 @@ ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd)
float3 eval;
eval = make_float3(0.0f, 0.0f, 0.0f);
- for(int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+ ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
if(CLOSURE_IS_EMISSION(sc->type))
eval += emissive_eval(kg, sd, sc)*sc->weight;
@@ -769,8 +772,8 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd)
{
float3 weight = make_float3(0.0f, 0.0f, 0.0f);
- for(int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+ ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
if(CLOSURE_IS_HOLDOUT(sc->type))
weight += sc->weight;
@@ -784,8 +787,8 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd)
ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd,
float randb, int path_flag, ShaderContext ctx)
{
- sd->num_closure = 0;
- sd->randb_closure = randb;
+ ccl_fetch(sd, num_closure) = 0;
+ ccl_fetch(sd, randb_closure) = randb;
#ifdef __OSL__
if(kg->osl)
@@ -796,11 +799,11 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd,
#ifdef __SVM__
svm_eval_nodes(kg, sd, SHADER_TYPE_SURFACE, path_flag);
#else
- sd->closure->weight = make_float3(0.8f, 0.8f, 0.8f);
- sd->closure->N = sd->N;
- sd->closure->data0 = 0.0f;
- sd->closure->data1 = 0.0f;
- sd->flag |= bsdf_diffuse_setup(&sd->closure);
+ ccl_fetch_array(sd, closure, 0)->weight = make_float3(0.8f, 0.8f, 0.8f);
+ ccl_fetch_array(sd, closure, 0)->N = ccl_fetch(sd, N);
+ ccl_fetch_array(sd, closure, 0)->data0 = 0.0f;
+ ccl_fetch_array(sd, closure, 0)->data1 = 0.0f;
+ ccl_fetch(sd, flag) |= bsdf_diffuse_setup(ccl_fetch_array(sd, closure, 0));
#endif
}
}
@@ -809,8 +812,8 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd,
ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, int path_flag, ShaderContext ctx)
{
- sd->num_closure = 0;
- sd->randb_closure = 0.0f;
+ ccl_fetch(sd, num_closure) = 0;
+ ccl_fetch(sd, randb_closure) = 0.0f;
#ifdef __OSL__
if(kg->osl) {
@@ -825,8 +828,8 @@ ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, int
float3 eval = make_float3(0.0f, 0.0f, 0.0f);
- for(int i = 0; i< sd->num_closure; i++) {
- const ShaderClosure *sc = &sd->closure[i];
+ for(int i = 0; i< ccl_fetch(sd, num_closure); i++) {
+ const ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
if(CLOSURE_IS_BACKGROUND(sc->type))
eval += sc->weight;
@@ -999,8 +1002,8 @@ ccl_device void shader_eval_volume(KernelGlobals *kg, ShaderData *sd,
ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ShaderContext ctx)
{
- sd->num_closure = 0;
- sd->randb_closure = 0.0f;
+ ccl_fetch(sd, num_closure) = 0;
+ ccl_fetch(sd, randb_closure) = 0.0f;
/* this will modify sd->P */
#ifdef __SVM__
diff --git a/intern/cycles/kernel/kernel_shader_eval.cl b/intern/cycles/kernel/kernel_shader_eval.cl
new file mode 100644
index 00000000000..78cf19a3df8
--- /dev/null
+++ b/intern/cycles/kernel/kernel_shader_eval.cl
@@ -0,0 +1,93 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split.h"
+
+/*
+ * Note on kernel_ocl_path_trace_shader_evaluation kernel
+ * This kernel is the 5th kernel in the ray tracing logic. This is
+ * the 4th kernel in path iteration. This kernel sets up the ShaderData
+ * structure from the values computed by the previous kernels. It also identifies
+ * the rays of state RAY_TO_REGENERATE and enqueues them in the QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
+ *
+ * The inputs and outputs of the kernel are as follows:
+ * rng_coop -------------------------------------------|--- kernel_ocl_path_trace_shader_evaluation --|--- shader_data
+ * Ray_coop -------------------------------------------| |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * PathState_coop -------------------------------------| |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * Intersection_coop ----------------------------------| |
+ * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS)------| |
+ * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)---| |
+ * ray_state ------------------------------------------| |
+ * kg (globals + data) --------------------------------| |
+ * queuesize ------------------------------------------| |
+ *
+ * Note on Queues :
+ * This kernel reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes
+ * only the rays of state RAY_ACTIVE;
+ * State of queues when this kernel is called,
+ * at entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
+ * at exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays
+ */
+
+__kernel void kernel_ocl_path_trace_shader_evaluation(
+ ccl_global char *globals,
+ ccl_constant KernelData *data,
+ ccl_global char *shader_data, /* Output ShaderData structure to be filled */
+ ccl_global uint *rng_coop, /* Required for rbsdf calculation */
+ ccl_global Ray *Ray_coop, /* Required for setting up shader from ray */
+ ccl_global PathState *PathState_coop, /* Required for all functions in this kernel */
+ Intersection *Intersection_coop, /* Required for setting up shader from ray */
+ ccl_global char *ray_state, /* Denotes the state of each ray */
+ ccl_global int *Queue_data, /* queue memory */
+ ccl_global int *Queue_index, /* Tracks the number of elements in each queue */
+ int queuesize /* Size (capacity) of each queue */
+ )
+{
+ int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+ /* Enqueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue */
+ ccl_local unsigned int local_queue_atomics;
+ if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+ local_queue_atomics = 0;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ char enqueue_flag = (IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0;
+
+ enqueue_ray_index_local(ray_index, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, enqueue_flag, queuesize, &local_queue_atomics, Queue_data, Queue_index);
+
+ ray_index = get_ray_index(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0);
+
+ if(ray_index == QUEUE_EMPTY_SLOT)
+ return;
+
+ /* Continue on with shader evaluation */
+ if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+ KernelGlobals *kg = (KernelGlobals *)globals;
+ ShaderData *sd = (ShaderData *)shader_data;
+ Intersection *isect = &Intersection_coop[ray_index];
+ ccl_global uint *rng = &rng_coop[ray_index];
+ ccl_global PathState *state = &PathState_coop[ray_index];
+ Ray ray = Ray_coop[ray_index];
+
+ shader_setup_from_ray(kg, sd, isect, &ray, state->bounce, state->transparent_bounce);
+ float rbsdf = path_state_rng_1D_for_decision(kg, rng, state, PRNG_BSDF);
+ shader_eval_surface(kg, sd, rbsdf, state->flag, SHADER_CONTEXT_MAIN);
+ }
+}
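Note the ordering the kernel above depends on: enqueue_ray_index_local() performs work-group barriers internally, so every thread must reach that call before any thread early-outs on QUEUE_EMPTY_SLOT. The general shape of this discipline for a split kernel that both enqueues and consumes, sketched with a hypothetical should_enqueue() predicate and SOME_QUEUE constant:

char enqueue_flag = should_enqueue(ray_state, ray_index) ? 1 : 0; /* hypothetical */
/* ALL threads must make this call: it synchronizes the whole work-group */
enqueue_ray_index_local(ray_index, SOME_QUEUE, enqueue_flag, queuesize,
                        &local_queue_atomics, Queue_data, Queue_index);
/* only after the enqueue is it safe to drop the threads with no work */
ray_index = get_ray_index(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS,
                          Queue_data, queuesize, 0);
if(ray_index == QUEUE_EMPTY_SLOT)
	return;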
diff --git a/intern/cycles/kernel/kernel_shaderdata_vars.h b/intern/cycles/kernel/kernel_shaderdata_vars.h
new file mode 100644
index 00000000000..b157b82e023
--- /dev/null
+++ b/intern/cycles/kernel/kernel_shaderdata_vars.h
@@ -0,0 +1,99 @@
+/*
+* Copyright 2011-2015 Blender Foundation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#ifndef SD_VAR
+#define SD_VAR(type, what)
+#endif
+#ifndef SD_CLOSURE_VAR
+#define SD_CLOSURE_VAR(type, what, max_closure)
+#endif
+
+/* position */
+SD_VAR(float3, P)
+/* smooth normal for shading */
+SD_VAR(float3, N)
+/* true geometric normal */
+SD_VAR(float3, Ng)
+/* view/incoming direction */
+SD_VAR(float3, I)
+/* shader id */
+SD_VAR(int, shader)
+/* booleans describing shader, see ShaderDataFlag */
+SD_VAR(int, flag)
+
+/* primitive id if there is one, ~0 otherwise */
+SD_VAR(int, prim)
+
+/* combined type and curve segment for hair */
+SD_VAR(int, type)
+
+/* parametric coordinates
+* - barycentric weights for triangles */
+SD_VAR(float, u)
+SD_VAR(float, v)
+/* object id if there is one, ~0 otherwise */
+SD_VAR(int, object)
+
+/* motion blur sample time */
+SD_VAR(float, time)
+
+/* length of the ray being shaded */
+SD_VAR(float, ray_length)
+
+/* ray bounce depth */
+SD_VAR(int, ray_depth)
+
+/* ray transparent depth */
+SD_VAR(int, transparent_depth)
+
+#ifdef __RAY_DIFFERENTIALS__
+/* differential of P. these are orthogonal to Ng, not N */
+SD_VAR(differential3, dP)
+/* differential of I */
+SD_VAR(differential3, dI)
+/* differential of u, v */
+SD_VAR(differential, du)
+SD_VAR(differential, dv)
+#endif
+#ifdef __DPDU__
+/* differential of P w.r.t. parametric coordinates. note that dPdu is
+* not readily suitable as a tangent for shading on triangles. */
+SD_VAR(float3, dPdu)
+SD_VAR(float3, dPdv)
+#endif
+
+#ifdef __OBJECT_MOTION__
+/* object <-> world space transformations, cached to avoid
+* re-interpolating them constantly for shading */
+SD_VAR(Transform, ob_tfm)
+SD_VAR(Transform, ob_itfm)
+#endif
+
+/* Closure data, we store a fixed array of closures */
+SD_CLOSURE_VAR(ShaderClosure, closure, MAX_CLOSURE)
+SD_VAR(int, num_closure)
+SD_VAR(float, randb_closure)
+
+/* ray start position, only set for backgrounds */
+SD_VAR(float3, ray_P)
+SD_VAR(differential3, ray_dP)
+
+#ifdef __OSL__
+SD_VAR(struct KernelGlobals *, osl_globals)
+#endif
+
+#undef SD_VAR
+#undef SD_CLOSURE_VAR
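This header is an X-macro list: it expands to nothing unless the includer defines SD_VAR and SD_CLOSURE_VAR first, and it #undefs both at the end so it can be included repeatedly with different definitions. A sketch of two plausible expansions, with NUM_RAYS as a hypothetical ray count for the split layout:

/* Array-of-structs ShaderData, as the regular kernels would generate it: */
#define SD_VAR(type, what) type what;
#define SD_CLOSURE_VAR(type, what, max_closure) type what[max_closure];
typedef struct ShaderData {
#include "kernel_shaderdata_vars.h"
} ShaderData;

/* Structure-of-arrays variant for the split kernel (hypothetical NUM_RAYS): */
#define SD_VAR(type, what) type what[NUM_RAYS];
#define SD_CLOSURE_VAR(type, what, max_closure) type what[(max_closure) * NUM_RAYS];
typedef struct ShaderDataSOA {
#include "kernel_shaderdata_vars.h"
} ShaderDataSOA;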
diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h
index d7c4fa02bcf..5119b7a30c2 100644
--- a/intern/cycles/kernel/kernel_shadow.h
+++ b/intern/cycles/kernel/kernel_shadow.h
@@ -180,19 +180,37 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
* potentially transparent, and only in that case start marching. this gives
* one extra ray cast for the cases were we do want transparency. */
-ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *ray, float3 *shadow)
+/* The arguments sd_mem and isect_mem are meaningful only for the OpenCL split kernel; other callers can just pass NULL. */
+ccl_device_inline bool shadow_blocked(KernelGlobals *kg, ccl_addr_space PathState *state, ccl_addr_space Ray *ray_input, float3 *shadow
+#ifdef __SPLIT_KERNEL__
+ , ShaderData *sd_mem, Intersection *isect_mem
+#endif
+ )
{
*shadow = make_float3(1.0f, 1.0f, 1.0f);
- if(ray->t == 0.0f)
+ if(ray_input->t == 0.0f)
return false;
- Intersection isect;
- bool blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f);
+#ifdef __SPLIT_KERNEL__
+ Ray private_ray = *ray_input;
+ Ray *ray = &private_ray;
+#else
+ Ray *ray = ray_input;
+#endif
+
+#ifdef __SPLIT_KERNEL__
+ Intersection *isect = isect_mem;
+#else
+ Intersection isect_object;
+ Intersection *isect = &isect_object;
+#endif
+
+ bool blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, isect, NULL, 0.0f, 0.0f);
#ifdef __TRANSPARENT_SHADOWS__
if(blocked && kernel_data.integrator.transparent_shadows) {
- if(shader_transparent_shadow(kg, &isect)) {
+ if(shader_transparent_shadow(kg, isect)) {
float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
float3 Pend = ray->P + ray->D*ray->t;
int bounce = state->transparent_bounce;
@@ -204,9 +222,8 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
if(bounce >= kernel_data.integrator.transparent_max_bounce)
return true;
- if(!scene_intersect(kg, ray, PATH_RAY_SHADOW_TRANSPARENT, &isect, NULL, 0.0f, 0.0f))
+ if(!scene_intersect(kg, ray, PATH_RAY_SHADOW_TRANSPARENT, isect, NULL, 0.0f, 0.0f))
{
-
#ifdef __VOLUME__
/* attenuation for last line segment towards light */
if(ps.volume_stack[0].shader != SHADER_NONE)
@@ -218,39 +235,44 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *
return false;
}
- if(!shader_transparent_shadow(kg, &isect))
+ if(!shader_transparent_shadow(kg, isect))
return true;
#ifdef __VOLUME__
/* attenuation between last surface and next surface */
if(ps.volume_stack[0].shader != SHADER_NONE) {
Ray segment_ray = *ray;
- segment_ray.t = isect.t;
+ segment_ray.t = isect->t;
kernel_volume_shadow(kg, &ps, &segment_ray, &throughput);
}
#endif
/* setup shader data at surface */
- ShaderData sd;
- shader_setup_from_ray(kg, &sd, &isect, ray, state->bounce+1, bounce);
+#ifdef __SPLIT_KERNEL__
+ ShaderData *sd = sd_mem;
+#else
+ ShaderData sd_object;
+ ShaderData *sd = &sd_object;
+#endif
+ shader_setup_from_ray(kg, sd, isect, ray, state->bounce+1, bounce);
/* attenuation from transparent surface */
- if(!(sd.flag & SD_HAS_ONLY_VOLUME)) {
- shader_eval_surface(kg, &sd, 0.0f, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW);
- throughput *= shader_bsdf_transparency(kg, &sd);
+ if(!(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME)) {
+ shader_eval_surface(kg, sd, 0.0f, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW);
+ throughput *= shader_bsdf_transparency(kg, sd);
}
if(is_zero(throughput))
return true;
/* move ray forward */
- ray->P = ray_offset(sd.P, -sd.Ng);
+ ray->P = ray_offset(ccl_fetch(sd, P), -ccl_fetch(sd, Ng));
if(ray->t != FLT_MAX)
ray->D = normalize_len(Pend - ray->P, &ray->t);
#ifdef __VOLUME__
/* exit/enter volume */
- kernel_volume_stack_enter_exit(kg, &sd, ps.volume_stack);
+ kernel_volume_stack_enter_exit(kg, sd, ps.volume_stack);
#endif
bounce++;
diff --git a/intern/cycles/kernel/kernel_shadow_blocked.cl b/intern/cycles/kernel/kernel_shadow_blocked.cl
new file mode 100644
index 00000000000..72a2d0affb0
--- /dev/null
+++ b/intern/cycles/kernel/kernel_shadow_blocked.cl
@@ -0,0 +1,126 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split.h"
+
+/*
+ * Note on the kernel_ocl_path_trace_shadow_blocked kernel.
+ * This is the ninth kernel in the ray-tracing logic and the eighth of the
+ * path-iteration kernels. It takes care of the "shadow ray cast" logic of
+ * the direct-lighting and AO parts of ray tracing.
+ *
+ * The input and output are as follows:
+ *
+ * PathState_coop ----------------------------------|--- kernel_ocl_path_trace_shadow_blocked --|
+ * LightRay_dl_coop --------------------------------| |--- LightRay_dl_coop
+ * LightRay_ao_coop --------------------------------| |--- LightRay_ao_coop
+ * ray_state ---------------------------------------| |--- ray_state
+ * Queue_data(QUEUE_SHADOW_RAY_CAST_AO_RAYS &       |                                           |--- Queue_data (QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_DL_RAYS)
+ QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------| |
+ * Queue_index(QUEUE_SHADOW_RAY_CAST_AO_RAYS&
+ QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------| |
+ * kg (globals + data) -----------------------------| |
+ * queuesize ---------------------------------------| |
+ *
+ * Note on shader_shadow: shader_shadow is neither input nor output to this kernel; it is filled and consumed within this kernel itself.
+ * Note on queues:
+ * The kernel fetches from the QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS queues. We empty
+ * these queues in this kernel.
+ * State of queues when this kernel is called:
+ * The queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be the same
+ * before and after this kernel call.
+ * QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS will be filled with rays marked with the flags
+ * RAY_SHADOW_RAY_CAST_AO and RAY_SHADOW_RAY_CAST_DL, respectively, at kernel entry.
+ * QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty at kernel exit.
+ */
+
+__kernel void kernel_ocl_path_trace_shadow_blocked_direct_lighting(
+ ccl_global char *globals,
+ ccl_constant KernelData *data,
+ ccl_global char *shader_shadow, /* Required for shadow blocked */
+ ccl_global PathState *PathState_coop, /* Required for shadow blocked */
+ ccl_global Ray *LightRay_dl_coop, /* Required for direct lighting's shadow blocked */
+ ccl_global Ray *LightRay_ao_coop, /* Required for AO's shadow blocked */
+ Intersection *Intersection_coop_AO,
+ Intersection *Intersection_coop_DL,
+ ccl_global char *ray_state,
+ ccl_global int *Queue_data, /* Queue memory */
+ ccl_global int *Queue_index, /* Tracks the number of elements in each queue */
+ int queuesize, /* Size (capacity) of each queue */
+ int total_num_rays
+ )
+{
+#if 0
+ /* we will make the Queue_index entries '0' in the next kernel */
+ if(get_global_id(0) == 0 && get_global_id(1) == 0) {
+ /* We empty this queue here */
+ Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
+ Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
+ }
+#endif
+
+ int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0);
+
+ ccl_local unsigned int ao_queue_length;
+ ccl_local unsigned int dl_queue_length;
+ if(lidx == 0) {
+ ao_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS];
+ dl_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS];
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ /* flag determining whether the current ray processes the shadow ray for AO or for direct lighting */
+ char shadow_blocked_type = -1;
+ /* flag determining if we need to update L */
+ char update_path_radiance = 0;
+
+ int ray_index = QUEUE_EMPTY_SLOT;
+ int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+ if(thread_index < ao_queue_length + dl_queue_length) {
+ if(thread_index < ao_queue_length) {
+ ray_index = get_ray_index(thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, Queue_data, queuesize, 1);
+ shadow_blocked_type = RAY_SHADOW_RAY_CAST_AO;
+ } else {
+ ray_index = get_ray_index(thread_index - ao_queue_length, QUEUE_SHADOW_RAY_CAST_DL_RAYS, Queue_data, queuesize, 1);
+ shadow_blocked_type = RAY_SHADOW_RAY_CAST_DL;
+ }
+ }
+
+ if(ray_index == QUEUE_EMPTY_SLOT)
+ return;
+
+ if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
+ /* Load kernel global structure */
+ KernelGlobals *kg = (KernelGlobals *)globals;
+ ShaderData *sd_shadow = (ShaderData *)shader_shadow;
+
+ ccl_global PathState *state = &PathState_coop[ray_index];
+ ccl_global Ray *light_ray_dl_global = &LightRay_dl_coop[ray_index];
+ ccl_global Ray *light_ray_ao_global = &LightRay_ao_coop[ray_index];
+ Intersection *isect_ao_global = &Intersection_coop_AO[ray_index];
+ Intersection *isect_dl_global = &Intersection_coop_DL[ray_index];
+
+ ccl_global Ray *light_ray_global = shadow_blocked_type == RAY_SHADOW_RAY_CAST_AO ? light_ray_ao_global : light_ray_dl_global;
+ Intersection *isect_global = shadow_blocked_type == RAY_SHADOW_RAY_CAST_AO ? isect_ao_global : isect_dl_global;
+
+ float3 shadow;
+ update_path_radiance = !(shadow_blocked(kg, state, light_ray_global, &shadow, sd_shadow, isect_global));
+
+ /* We use light_ray_global's P and t to store shadow and update_path_radiance */
+ light_ray_global->P = shadow;
+ light_ray_global->t = update_path_radiance;
+ }
+}
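Note how the kernel returns its result without an extra output buffer: the shadow attenuation is packed into light_ray_global->P and the update flag into light_ray_global->t, as the comment above states. A hedged sketch of how a downstream kernel could unpack this (the names mirror this kernel; the consumer itself is illustrative):

    /* Hypothetical consumer, mirroring the packing above: */
    ccl_global Ray *light_ray = &LightRay_dl_coop[ray_index];
    float3 shadow = light_ray->P;                 /* attenuation stored above */
    int update_path_radiance = (int)light_ray->t; /* nonzero if light is visible */
    if(update_path_radiance) {
            /* accumulate throughput * light contribution * shadow into L */
    }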
diff --git a/intern/cycles/kernel/kernel_split.h b/intern/cycles/kernel/kernel_split.h
new file mode 100644
index 00000000000..eb386c2e5a7
--- /dev/null
+++ b/intern/cycles/kernel/kernel_split.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _KERNEL_SPLIT_H_
+#define _KERNEL_SPLIT_H_
+
+#include "kernel_compat_opencl.h"
+#include "kernel_math.h"
+#include "kernel_types.h"
+#include "kernel_globals.h"
+
+/* atomic_add_float function should be defined prior to its usage in kernel_passes.h */
+#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
+/* Utility functions for float atomics */
+/* float atomics impl credits : http://suhorukov.blogspot.in/2011/12/opencl-11-atomic-operations-on-floating.html */
+ccl_device_inline void atomic_add_float(volatile ccl_global float *source, const float operand) {
+ union {
+ unsigned int intVal;
+ float floatVal;
+ } newVal;
+ union {
+ unsigned int intVal;
+ float floatVal;
+ } prevVal;
+ do {
+ prevVal.floatVal = *source;
+ newVal.floatVal = prevVal.floatVal + operand;
+ } while (atomic_cmpxchg((volatile ccl_global unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);
+}
+#endif // __SPLIT_KERNEL__ && __WORK_STEALING__
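The loop above is the standard compare-and-swap emulation of a float atomic: snapshot the current value, compute the sum, and retry if another thread updated the location in between. For illustration only (not part of the patch), a host-side C11 analogue of the same pattern:

    #include <stdatomic.h>
    #include <string.h>

    /* Illustrative C11 analogue of the CAS loop above. */
    static void atomic_add_float_host(_Atomic unsigned int *source, float operand)
    {
        unsigned int prev = atomic_load(source), next;
        float f;
        do {
            memcpy(&f, &prev, sizeof(f));   /* reinterpret the bits as float */
            f += operand;
            memcpy(&next, &f, sizeof(next));
            /* on failure, prev is refreshed with the current value and we retry */
        } while(!atomic_compare_exchange_weak(source, &prev, next));
    }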
+
+#ifdef __OSL__
+#include "osl_shader.h"
+#endif
+
+#include "kernel_random.h"
+#include "kernel_projection.h"
+#include "kernel_montecarlo.h"
+#include "kernel_differential.h"
+#include "kernel_camera.h"
+
+#include "geom/geom.h"
+
+#include "kernel_accumulate.h"
+#include "kernel_shader.h"
+#include "kernel_light.h"
+#include "kernel_passes.h"
+
+#ifdef __SUBSURFACE__
+#include "kernel_subsurface.h"
+#endif
+
+#ifdef __VOLUME__
+#include "kernel_volume.h"
+#endif
+
+#include "kernel_path_state.h"
+#include "kernel_shadow.h"
+#include "kernel_emission.h"
+#include "kernel_path_common.h"
+#include "kernel_path_surface.h"
+#include "kernel_path_volume.h"
+
+#ifdef __KERNEL_DEBUG__
+#include "kernel_debug.h"
+#endif
+
+#include "kernel_queues.h"
+#include "kernel_work_stealing.h"
+
+#endif
diff --git a/intern/cycles/kernel/kernel_sum_all_radiance.cl b/intern/cycles/kernel/kernel_sum_all_radiance.cl
new file mode 100644
index 00000000000..739a85d4cc8
--- /dev/null
+++ b/intern/cycles/kernel/kernel_sum_all_radiance.cl
@@ -0,0 +1,59 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_compat_opencl.h"
+#include "kernel_math.h"
+#include "kernel_types.h"
+#include "kernel_globals.h"
+
+/*
+ * Since we process multiple samples in parallel, the output radiance of different samples
+ * is stored in different locations. This kernel combines the output radiance contributed
+ * by all samples and stores it in the RenderTile's output buffer.
+ */
+
+__kernel void kernel_ocl_path_trace_sum_all_radiance(
+ ccl_constant KernelData *data, /* To get pass_stride to offset into buffer */
+ ccl_global float *buffer, /* Output buffer of RenderTile */
+ ccl_global float *per_sample_output_buffer, /* Radiance contributed by all samples */
+ int parallel_samples, int sw, int sh, int stride,
+ int buffer_offset_x,
+ int buffer_offset_y,
+ int buffer_stride,
+ int start_sample)
+{
+ int x = get_global_id(0);
+ int y = get_global_id(1);
+
+ if(x < sw && y < sh) {
+ buffer += ((buffer_offset_x + x) + (buffer_offset_y + y) * buffer_stride) * (data->film.pass_stride);
+ per_sample_output_buffer += ((x + y * stride) * parallel_samples) * (data->film.pass_stride);
+
+ int sample_stride = (data->film.pass_stride);
+
+ int sample_iterator = 0;
+ int pass_stride_iterator = 0;
+ int num_floats = data->film.pass_stride;
+
+ for(sample_iterator = 0; sample_iterator < parallel_samples; sample_iterator++) {
+ for(pass_stride_iterator = 0; pass_stride_iterator < num_floats; pass_stride_iterator++) {
+ *(buffer + pass_stride_iterator) = (start_sample == 0 && sample_iterator == 0) ? *(per_sample_output_buffer + pass_stride_iterator)
+ : *(buffer + pass_stride_iterator) + *(per_sample_output_buffer + pass_stride_iterator);
+ }
+ per_sample_output_buffer += sample_stride;
+ }
+ }
+}
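The indexing above gives each pixel parallel_samples consecutive pass_stride-sized slots in per_sample_output_buffer, which are folded into the tile's buffer (initializing rather than accumulating when start_sample == 0 and the first slot is read). A hedged host-side reference of the per-pixel reduction, for illustration only:

    /* Reference reduction for one pixel (illustrative, mirrors the kernel): */
    void sum_pixel(float *dst, const float *per_sample,
                   int parallel_samples, int pass_stride, int start_sample)
    {
        for(int s = 0; s < parallel_samples; s++)
            for(int p = 0; p < pass_stride; p++)
                dst[p] = (start_sample == 0 && s == 0)
                             ? per_sample[s * pass_stride + p]
                             : dst[p] + per_sample[s * pass_stride + p];
    }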
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index daa5ec1b9f1..303a78d8ac0 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -24,6 +24,13 @@
#define __KERNEL_CPU__
#endif
+/* TODO(sergey): This is only to make it possible to include this header
+ * from outside of the kernel, but this could probably be done more cleanly.
+ */
+#ifndef ccl_addr_space
+#define ccl_addr_space
+#endif
+
CCL_NAMESPACE_BEGIN
/* constants */
@@ -90,7 +97,19 @@ CCL_NAMESPACE_BEGIN
#ifdef __KERNEL_OPENCL_NVIDIA__
#define __KERNEL_SHADING__
-#define __KERNEL_ADV_SHADING__
+/* TODO(sergey): Advanced shading code still requires work
+ * for split kernel.
+ */
+# ifndef __SPLIT_KERNEL__
+# define __KERNEL_ADV_SHADING__
+# else
+# define __MULTI_CLOSURE__
+# define __TRANSPARENT_SHADOWS__
+# define __PASSES__
+# define __BACKGROUND_MIS__
+# define __LAMP_MIS__
+# define __AO__
+# endif
#endif
#ifdef __KERNEL_OPENCL_APPLE__
@@ -103,7 +122,7 @@ CCL_NAMESPACE_BEGIN
#define __KERNEL_SHADING__
//__KERNEL_ADV_SHADING__
#define __MULTI_CLOSURE__
-#define __TRANSPARENT_SHADOWS__
+//#define __TRANSPARENT_SHADOWS__
#define __PASSES__
#define __BACKGROUND_MIS__
#define __LAMP_MIS__
@@ -117,10 +136,22 @@ CCL_NAMESPACE_BEGIN
#ifdef __KERNEL_OPENCL_INTEL_CPU__
#define __CL_USE_NATIVE__
#define __KERNEL_SHADING__
-#define __KERNEL_ADV_SHADING__
+/* TODO(sergey): Advanced shading code still requires work
+ * for split kernel.
+ */
+# ifndef __SPLIT_KERNEL__
+# define __KERNEL_ADV_SHADING__
+# else
+# define __MULTI_CLOSURE__
+# define __TRANSPARENT_SHADOWS__
+# define __PASSES__
+# define __BACKGROUND_MIS__
+# define __LAMP_MIS__
+# define __AO__
+# endif
#endif
-#endif
+#endif // __KERNEL_OPENCL__
/* kernel features */
#define __SOBOL__
@@ -322,7 +353,7 @@ typedef enum PassType {
#ifdef __PASSES__
-typedef struct PathRadiance {
+typedef ccl_addr_space struct PathRadiance {
int use_light_pass;
float3 emission;
@@ -374,7 +405,7 @@ typedef struct BsdfEval {
#else
-typedef float3 PathRadiance;
+typedef ccl_addr_space float3 PathRadiance;
typedef float3 BsdfEval;
#endif
@@ -441,9 +472,9 @@ typedef struct differential {
typedef struct Ray {
float3 P; /* origin */
float3 D; /* direction */
+
float t; /* length of the ray */
float time; /* time (for motion blur) */
-
#ifdef __RAY_DIFFERENTIALS__
differential3 dP;
differential3 dD;
@@ -452,7 +483,7 @@ typedef struct Ray {
/* Intersection */
-typedef struct Intersection {
+typedef ccl_addr_space struct Intersection {
float t, u, v;
int prim;
int object;
@@ -537,7 +568,11 @@ typedef enum AttributeStandard {
/* Closure data */
#ifdef __MULTI_CLOSURE__
-#define MAX_CLOSURE 64
+# ifndef __MAX_CLOSURE__
+# define MAX_CLOSURE 64
+# else
+# define MAX_CLOSURE __MAX_CLOSURE__
+# endif
#else
#define MAX_CLOSURE 1
#endif
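__MAX_CLOSURE__ is evidently meant to be injected as a compile-time define so the split kernel can shrink its per-thread closure storage. A hedged example of how a host might pass it when building the OpenCL program (the option string is an assumption, not taken from this patch):

    /* Hypothetical build options sizing MAX_CLOSURE at compile time: */
    const char *options = "-D__SPLIT_KERNEL__ -D__MAX_CLOSURE__=8";
    cl_int err = clBuildProgram(program, 1, &device, options, NULL, NULL);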
@@ -547,7 +582,7 @@ typedef enum AttributeStandard {
* does not put own padding trying to align this members.
* - We make sure OSL pointer is also 16 bytes aligned.
*/
-typedef struct ShaderClosure {
+typedef ccl_addr_space struct ShaderClosure {
float3 weight;
float3 N;
float3 T;
@@ -632,78 +667,23 @@ enum ShaderDataFlag {
struct KernelGlobals;
-typedef struct ShaderData {
- /* position */
- float3 P;
- /* smooth normal for shading */
- float3 N;
- /* true geometric normal */
- float3 Ng;
- /* view/incoming direction */
- float3 I;
- /* shader id */
- int shader;
- /* booleans describing shader, see ShaderDataFlag */
- int flag;
-
- /* primitive id if there is one, ~0 otherwise */
- int prim;
-
- /* combined type and curve segment for hair */
- int type;
-
- /* parametric coordinates
- * - barycentric weights for triangles */
- float u, v;
- /* object id if there is one, ~0 otherwise */
- int object;
-
- /* motion blur sample time */
- float time;
-
- /* length of the ray being shaded */
- float ray_length;
-
- /* ray bounce depth */
- int ray_depth;
-
- /* ray transparent depth */
- int transparent_depth;
-
-#ifdef __RAY_DIFFERENTIALS__
- /* differential of P. these are orthogonal to Ng, not N */
- differential3 dP;
- /* differential of I */
- differential3 dI;
- /* differential of u, v */
- differential du;
- differential dv;
-#endif
-#ifdef __DPDU__
- /* differential of P w.r.t. parametric coordinates. note that dPdu is
- * not readily suitable as a tangent for shading on triangles. */
- float3 dPdu, dPdv;
-#endif
-
-#ifdef __OBJECT_MOTION__
- /* object <-> world space transformations, cached to avoid
- * re-interpolating them constantly for shading */
- Transform ob_tfm;
- Transform ob_itfm;
+#ifdef __SPLIT_KERNEL__
+#define SD_VAR(type, what) ccl_global type *what;
+#define SD_CLOSURE_VAR(type, what, max_closure) type *what;
+#define TIDX (get_global_id(1) * get_global_size(0) + get_global_id(0))
+#define ccl_fetch(s, t) (s->t[TIDX])
+#define ccl_fetch_array(s, t, index) (&s->t[TIDX * MAX_CLOSURE + index])
+#else
+#define SD_VAR(type, what) type what;
+#define SD_CLOSURE_VAR(type, what, max_closure) type what[max_closure];
+#define ccl_fetch(s, t) (s->t)
+#define ccl_fetch_array(s, t, index) (&s->t[index])
#endif
- /* Closure data, we store a fixed array of closures */
- ShaderClosure closure[MAX_CLOSURE];
- int num_closure;
- float randb_closure;
+typedef ccl_addr_space struct ShaderData {
- /* ray start position, only set for backgrounds */
- float3 ray_P;
- differential3 ray_dP;
+#include "kernel_shaderdata_vars.h"
-#ifdef __OSL__
- struct KernelGlobals *osl_globals;
-#endif
} ShaderData;
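The SD_VAR/ccl_fetch machinery above is the heart of this patch: under __SPLIT_KERNEL__ every ShaderData member becomes a ccl_global per-thread array (a structure-of-arrays shared by all work items), while the regular build keeps plain members. What the same access expands to in each mode, per the macros above:

    /* ccl_fetch(sd, P):
     *   regular kernel: sd->P
     *   split kernel:   sd->P[TIDX]   (TIDX = global thread index)
     * ccl_fetch_array(sd, closure, i):
     *   regular kernel: &sd->closure[i]
     *   split kernel:   &sd->closure[TIDX * MAX_CLOSURE + i]
     * This is why the SVM hunks below mechanically rewrite sd->X
     * as ccl_fetch(sd, X). */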
/* Path State */
@@ -996,13 +976,62 @@ typedef struct KernelData {
} KernelData;
#ifdef __KERNEL_DEBUG__
-typedef struct DebugData {
+typedef ccl_addr_space struct DebugData {
// Total number of BVH node traversal steps and primitives intersections
// for the camera rays.
int num_bvh_traversal_steps;
} DebugData;
#endif
+/* Declarations required for split kernel */
+
+/* Macros for queues */
+/* Value marking a queue's empty slot */
+#define QUEUE_EMPTY_SLOT -1
+
+/*
+ * Queue 1 - Active rays
+ * Queue 2 - Background queue
+ * Queue 3 - Shadow ray cast kernel - AO
+ * Queue 4 - Shadow ray cast kernel - direct lighting
+ */
+#define NUM_QUEUES 4
+
+/* Queue names */
+enum QueueNumber {
+ QUEUE_ACTIVE_AND_REGENERATED_RAYS, /* All active rays and regenerated rays are enqueued here */
+ QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, /* All
+ * 1. Background-hit rays,
+ * 2. Rays that have exited path-iteration but need to update the output buffer,
+ * 3. Rays to be regenerated
+ * are enqueued here */
+ QUEUE_SHADOW_RAY_CAST_AO_RAYS, /* All rays for which a shadow ray should be cast to determine the radiance
 contribution for AO are enqueued here */
+ QUEUE_SHADOW_RAY_CAST_DL_RAYS, /* All rays for which a shadow ray should be cast to determine the radiance
 contribution for direct lighting are enqueued here */
+};
+
+/* We use RAY_STATE_MASK to get the ray state (enums 0 to 6) */
+#define RAY_STATE_MASK 0x007
+#define RAY_FLAG_MASK 0x0F8
+enum RayState {
+ RAY_ACTIVE = 0, // Denotes ray is actively involved in path-iteration
+ RAY_INACTIVE = 1, // Denotes ray has completed processing all samples and is inactive
+ RAY_UPDATE_BUFFER = 2, // Denotes ray has exited path-iteration and needs to update the output buffer
+ RAY_HIT_BACKGROUND = 3, // Denotes ray has hit the background
+ RAY_TO_REGENERATE = 4, // Denotes ray has to be regenerated
+ RAY_REGENERATED = 5, // Denotes ray has been regenerated
+ RAY_SKIP_DL = 6, // Denotes ray should skip direct lighting
+ RAY_SHADOW_RAY_CAST_AO = 16, // Flags that the ray has to execute the shadow_blocked function in the AO part
+ RAY_SHADOW_RAY_CAST_DL = 32 // Flags that the ray has to execute the shadow_blocked function in the direct-lighting part
+};
+
+#define ASSIGN_RAY_STATE(ray_state, ray_index, state) (ray_state[ray_index] = ((ray_state[ray_index] & RAY_FLAG_MASK) | state))
+#define IS_STATE(ray_state, ray_index, state) ((ray_state[ray_index] & RAY_STATE_MASK) == state)
+#define ADD_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] | flag))
+#define REMOVE_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] & (~flag)))
+#define IS_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] & flag)
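Each ray_state byte thus packs a state in bits 0-2 and flag bits in bits 3-7, manipulated only through the macros above. A hedged usage sketch mirroring the kernels in this patch:

    /* Illustrative only; mirrors the shadow-blocked flow above: */
    if(IS_STATE(ray_state, ray_index, RAY_ACTIVE))
            ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);

    if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) {
            /* ...cast the shadow ray... */
            REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
    }
    /* Swap the state while preserving any flag bits: */
    ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);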
+
CCL_NAMESPACE_END
#endif /* __KERNEL_TYPES_H__ */
diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h
new file mode 100644
index 00000000000..9b83d972e97
--- /dev/null
+++ b/intern/cycles/kernel/kernel_work_stealing.h
@@ -0,0 +1,193 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __KERNEL_WORK_STEALING_H__
+#define __KERNEL_WORK_STEALING_H__
+
+/*
+ * Utility functions for work stealing
+ */
+
+#ifdef __WORK_STEALING__
+
+#ifdef __KERNEL_OPENCL__
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#endif
+
+uint get_group_id_with_ray_index(uint ray_index,
+ uint tile_dim_x,
+ uint tile_dim_y,
+ uint parallel_samples,
+ int dim)
+{
+ if(dim == 0) {
+ uint x_span = ray_index % (tile_dim_x * parallel_samples);
+ return x_span / get_local_size(0);
+ }
+ else /*if(dim == 1)*/ {
+ kernel_assert(dim == 1);
+ uint y_span = ray_index / (tile_dim_x * parallel_samples);
+ return y_span / get_local_size(1);
+ }
+}
+
+uint get_total_work(uint tile_dim_x,
+ uint tile_dim_y,
+ uint grp_idx,
+ uint grp_idy,
+ uint num_samples)
+{
+ uint threads_within_tile_border_x =
+ (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0)
+ : get_local_size(0);
+ uint threads_within_tile_border_y =
+ (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1)
+ : get_local_size(1);
+
+ threads_within_tile_border_x =
+ (threads_within_tile_border_x == 0) ? get_local_size(0)
+ : threads_within_tile_border_x;
+ threads_within_tile_border_y =
+ (threads_within_tile_border_y == 0) ? get_local_size(1)
+ : threads_within_tile_border_y;
+
+ return threads_within_tile_border_x *
+ threads_within_tile_border_y *
+ num_samples;
+}
+
+/* Returns 0 in case there is no next work available,
+ * 1 in case the work assigned is valid. */
+int get_next_work(ccl_global uint *work_pool,
+ ccl_private uint *my_work,
+ uint tile_dim_x,
+ uint tile_dim_y,
+ uint num_samples,
+ uint parallel_samples,
+ uint ray_index)
+{
+ uint grp_idx = get_group_id_with_ray_index(ray_index,
+ tile_dim_x,
+ tile_dim_y,
+ parallel_samples,
+ 0);
+ uint grp_idy = get_group_id_with_ray_index(ray_index,
+ tile_dim_x,
+ tile_dim_y,
+ parallel_samples,
+ 1);
+ uint total_work = get_total_work(tile_dim_x,
+ tile_dim_y,
+ grp_idx,
+ grp_idy,
+ num_samples);
+ uint group_index = grp_idy * get_num_groups(0) + grp_idx;
+ *my_work = atomic_inc(&work_pool[group_index]);
+ return (*my_work < total_work) ? 1 : 0;
+}
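get_next_work() hands out work by atomically incrementing one counter per work group, so work_pool must hold get_num_groups(0) * get_num_groups(1) uints and be zeroed before each launch. A hedged host-side sketch (buffer and variable names are assumptions; clEnqueueFillBuffer requires OpenCL 1.2):

    /* Zero the per-workgroup counters before each launch (illustrative): */
    cl_uint zero = 0;
    size_t num_groups = num_groups_x * num_groups_y;
    clEnqueueFillBuffer(queue, work_pool_buf, &zero, sizeof(zero),
                        0, num_groups * sizeof(cl_uint), 0, NULL, NULL);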
+
+/* This function assumes that the passed my_work is valid. */
+/* Decode sample number w.r.t. assigned my_work. */
+uint get_my_sample(uint my_work,
+ uint tile_dim_x,
+ uint tile_dim_y,
+ uint parallel_samples,
+ uint ray_index)
+{
+ uint grp_idx = get_group_id_with_ray_index(ray_index,
+ tile_dim_x,
+ tile_dim_y,
+ parallel_samples,
+ 0);
+ uint grp_idy = get_group_id_with_ray_index(ray_index,
+ tile_dim_x,
+ tile_dim_y,
+ parallel_samples,
+ 1);
+ uint threads_within_tile_border_x =
+ (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0)
+ : get_local_size(0);
+ uint threads_within_tile_border_y =
+ (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1)
+ : get_local_size(1);
+
+ threads_within_tile_border_x =
+ (threads_within_tile_border_x == 0) ? get_local_size(0)
+ : threads_within_tile_border_x;
+ threads_within_tile_border_y =
+ (threads_within_tile_border_y == 0) ? get_local_size(1)
+ : threads_within_tile_border_y;
+
+ return my_work /
+ (threads_within_tile_border_x * threads_within_tile_border_y);
+}
+
+/* Decode pixel and tile position w.r.t. assigned my_work. */
+void get_pixel_tile_position(ccl_private uint *pixel_x,
+ ccl_private uint *pixel_y,
+ ccl_private uint *tile_x,
+ ccl_private uint *tile_y,
+ uint my_work,
+ uint tile_dim_x,
+ uint tile_dim_y,
+ uint tile_offset_x,
+ uint tile_offset_y,
+ uint parallel_samples,
+ uint ray_index)
+{
+ uint grp_idx = get_group_id_with_ray_index(ray_index,
+ tile_dim_x,
+ tile_dim_y,
+ parallel_samples,
+ 0);
+ uint grp_idy = get_group_id_with_ray_index(ray_index,
+ tile_dim_x,
+ tile_dim_y,
+ parallel_samples,
+ 1);
+ uint threads_within_tile_border_x =
+ (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0)
+ : get_local_size(0);
+ uint threads_within_tile_border_y =
+ (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1)
+ : get_local_size(1);
+
+ threads_within_tile_border_x =
+ (threads_within_tile_border_x == 0) ? get_local_size(0)
+ : threads_within_tile_border_x;
+ threads_within_tile_border_y =
+ (threads_within_tile_border_y == 0) ? get_local_size(1)
+ : threads_within_tile_border_y;
+
+ uint total_associated_pixels =
+ threads_within_tile_border_x * threads_within_tile_border_y;
+ uint work_group_pixel_index = my_work % total_associated_pixels;
+ uint work_group_pixel_x =
+ work_group_pixel_index % threads_within_tile_border_x;
+ uint work_group_pixel_y =
+ work_group_pixel_index / threads_within_tile_border_x;
+
+ *pixel_x =
+ tile_offset_x + (grp_idx * get_local_size(0)) + work_group_pixel_x;
+ *pixel_y =
+ tile_offset_y + (grp_idy * get_local_size(1)) + work_group_pixel_y;
+ *tile_x = *pixel_x - tile_offset_x;
+ *tile_y = *pixel_y - tile_offset_y;
+}
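Taken together, these helpers form the work-stealing loop: each thread repeatedly grabs a work index from its group's pool, decodes the sample and pixel it stands for, and renders until the pool runs dry. A hedged device-side sketch of the intended usage (variable names are illustrative):

    uint my_work;
    while(get_next_work(work_pool, &my_work, tile_dim_x, tile_dim_y,
                        num_samples, parallel_samples, ray_index))
    {
            uint sample = get_my_sample(my_work, tile_dim_x, tile_dim_y,
                                        parallel_samples, ray_index);
            uint pixel_x, pixel_y, tile_x, tile_y;
            get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y,
                                    my_work, tile_dim_x, tile_dim_y,
                                    tile_offset_x, tile_offset_y,
                                    parallel_samples, ray_index);
            /* ...seed the RNG with `sample` and trace pixel (pixel_x, pixel_y)... */
    }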
+
+#endif /* __WORK_STEALING__ */
+
+#endif /* __KERNEL_WORK_STEALING_H__ */
diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h
index b1561963e5d..e5e25eb6ca8 100644
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -189,7 +189,7 @@ CCL_NAMESPACE_BEGIN
ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ShaderType type, int path_flag)
{
float stack[SVM_STACK_SIZE];
- int offset = sd->shader & SHADER_MASK;
+ int offset = ccl_fetch(sd, shader) & SHADER_MASK;
while(1) {
uint4 node = read_node(kg, &offset);
diff --git a/intern/cycles/kernel/svm/svm_attribute.h b/intern/cycles/kernel/svm/svm_attribute.h
index b63978b6e1f..025ae96f59d 100644
--- a/intern/cycles/kernel/svm/svm_attribute.h
+++ b/intern/cycles/kernel/svm/svm_attribute.h
@@ -22,12 +22,12 @@ ccl_device void svm_node_attr_init(KernelGlobals *kg, ShaderData *sd,
uint4 node, NodeAttributeType *type,
NodeAttributeType *mesh_type, AttributeElement *elem, int *offset, uint *out_offset)
{
- if(sd->object != OBJECT_NONE) {
+ if(ccl_fetch(sd, object) != OBJECT_NONE) {
/* find attribute by unique id */
uint id = node.y;
- uint attr_offset = sd->object*kernel_data.bvh.attributes_map_stride;
+ uint attr_offset = ccl_fetch(sd, object)*kernel_data.bvh.attributes_map_stride;
#ifdef __HAIR__
- attr_offset = (sd->type & PRIMITIVE_ALL_CURVE)? attr_offset + ATTR_PRIM_CURVE: attr_offset;
+ attr_offset = (ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)? attr_offset + ATTR_PRIM_CURVE: attr_offset;
#endif
uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
diff --git a/intern/cycles/kernel/svm/svm_camera.h b/intern/cycles/kernel/svm/svm_camera.h
index 90249dfd978..00678a49d70 100644
--- a/intern/cycles/kernel/svm/svm_camera.h
+++ b/intern/cycles/kernel/svm/svm_camera.h
@@ -23,7 +23,7 @@ ccl_device void svm_node_camera(KernelGlobals *kg, ShaderData *sd, float *stack,
float3 vector;
Transform tfm = kernel_data.cam.worldtocamera;
- vector = transform_point(&tfm, sd->P);
+ vector = transform_point(&tfm, ccl_fetch(sd, P));
zdepth = vector.z;
distance = len(vector);
diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h
index 0d2d155f827..7cdcbc2d30c 100644
--- a/intern/cycles/kernel/svm/svm_closure.h
+++ b/intern/cycles/kernel/svm/svm_closure.h
@@ -25,12 +25,12 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type
sc->data0 = eta;
sc->data1 = 0.0f;
sc->data2 = 0.0f;
- sd->flag |= bsdf_refraction_setup(sc);
+ ccl_fetch(sd, flag) |= bsdf_refraction_setup(sc);
}
else {
sc->data0 = 0.0f;
sc->data1 = 0.0f;
- sd->flag |= bsdf_reflection_setup(sc);
+ ccl_fetch(sd, flag) |= bsdf_reflection_setup(sc);
}
}
else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID) {
@@ -39,9 +39,9 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type
sc->data2 = eta;
if(refract)
- sd->flag |= bsdf_microfacet_beckmann_refraction_setup(sc);
+ ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(sc);
else
- sd->flag |= bsdf_microfacet_beckmann_setup(sc);
+ ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(sc);
}
else {
sc->data0 = roughness;
@@ -49,23 +49,23 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type
sc->data2 = eta;
if(refract)
- sd->flag |= bsdf_microfacet_ggx_refraction_setup(sc);
+ ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(sc);
else
- sd->flag |= bsdf_microfacet_ggx_setup(sc);
+ ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(sc);
}
}
ccl_device_inline ShaderClosure *svm_node_closure_get_non_bsdf(ShaderData *sd, ClosureType type, float mix_weight)
{
- ShaderClosure *sc = &sd->closure[sd->num_closure];
+ ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
- if(sd->num_closure < MAX_CLOSURE) {
+ if(ccl_fetch(sd, num_closure) < MAX_CLOSURE) {
sc->weight *= mix_weight;
sc->type = type;
#ifdef __OSL__
sc->prim = NULL;
#endif
- sd->num_closure++;
+ ccl_fetch(sd, num_closure)++;
return sc;
}
@@ -74,14 +74,15 @@ ccl_device_inline ShaderClosure *svm_node_closure_get_non_bsdf(ShaderData *sd, C
ccl_device_inline ShaderClosure *svm_node_closure_get_bsdf(ShaderData *sd, float mix_weight)
{
- ShaderClosure *sc = &sd->closure[sd->num_closure];
+ ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
+
float3 weight = sc->weight * mix_weight;
float sample_weight = fabsf(average(weight));
- if(sample_weight > CLOSURE_WEIGHT_CUTOFF && sd->num_closure < MAX_CLOSURE) {
+ if(sample_weight > CLOSURE_WEIGHT_CUTOFF && ccl_fetch(sd, num_closure) < MAX_CLOSURE) {
sc->weight = weight;
sc->sample_weight = sample_weight;
- sd->num_closure++;
+ ccl_fetch(sd, num_closure)++;
#ifdef __OSL__
sc->prim = NULL;
#endif
@@ -93,14 +94,15 @@ ccl_device_inline ShaderClosure *svm_node_closure_get_bsdf(ShaderData *sd, float
ccl_device_inline ShaderClosure *svm_node_closure_get_absorption(ShaderData *sd, float mix_weight)
{
- ShaderClosure *sc = &sd->closure[sd->num_closure];
+ ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
+
float3 weight = (make_float3(1.0f, 1.0f, 1.0f) - sc->weight) * mix_weight;
float sample_weight = fabsf(average(weight));
- if(sample_weight > CLOSURE_WEIGHT_CUTOFF && sd->num_closure < MAX_CLOSURE) {
+ if(sample_weight > CLOSURE_WEIGHT_CUTOFF && ccl_fetch(sd, num_closure) < MAX_CLOSURE) {
sc->weight = weight;
sc->sample_weight = sample_weight;
- sd->num_closure++;
+ ccl_fetch(sd, num_closure)++;
#ifdef __OSL__
sc->prim = NULL;
#endif
@@ -124,7 +126,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
if(mix_weight == 0.0f)
return;
- float3 N = stack_valid(data_node.x)? stack_load_float3(stack, data_node.x): sd->N;
+ float3 N = stack_valid(data_node.x)? stack_load_float3(stack, data_node.x): ccl_fetch(sd, N);
float param1 = (stack_valid(param1_offset))? stack_load_float(stack, param1_offset): __uint_as_float(node.z);
float param2 = (stack_valid(param2_offset))? stack_load_float(stack, param2_offset): __uint_as_float(node.w);
@@ -142,13 +144,13 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
sc->data0 = 0.0f;
sc->data1 = 0.0f;
sc->data2 = 0.0f;
- sd->flag |= bsdf_diffuse_setup(sc);
+ ccl_fetch(sd, flag) |= bsdf_diffuse_setup(sc);
}
else {
sc->data0 = roughness;
sc->data1 = 0.0f;
sc->data2 = 0.0f;
- sd->flag |= bsdf_oren_nayar_setup(sc);
+ ccl_fetch(sd, flag) |= bsdf_oren_nayar_setup(sc);
}
}
break;
@@ -161,7 +163,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
sc->data1 = 0.0f;
sc->data2 = 0.0f;
sc->N = N;
- sd->flag |= bsdf_translucent_setup(sc);
+ ccl_fetch(sd, flag) |= bsdf_translucent_setup(sc);
}
break;
}
@@ -173,7 +175,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
sc->data1 = 0.0f;
sc->data2 = 0.0f;
sc->N = N;
- sd->flag |= bsdf_transparent_setup(sc);
+ ccl_fetch(sd, flag) |= bsdf_transparent_setup(sc);
}
break;
}
@@ -195,13 +197,13 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
/* setup bsdf */
if(type == CLOSURE_BSDF_REFLECTION_ID)
- sd->flag |= bsdf_reflection_setup(sc);
+ ccl_fetch(sd, flag) |= bsdf_reflection_setup(sc);
else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ID)
- sd->flag |= bsdf_microfacet_beckmann_setup(sc);
+ ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(sc);
else if(type == CLOSURE_BSDF_MICROFACET_GGX_ID)
- sd->flag |= bsdf_microfacet_ggx_setup(sc);
+ ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(sc);
else
- sd->flag |= bsdf_ashikhmin_shirley_setup(sc);
+ ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_setup(sc);
}
break;
@@ -219,7 +221,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
sc->N = N;
float eta = fmaxf(param2, 1e-5f);
- eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
+ eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;
/* setup bsdf */
if(type == CLOSURE_BSDF_REFRACTION_ID) {
@@ -227,7 +229,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
sc->data1 = 0.0f;
sc->data2 = 0.0f;
- sd->flag |= bsdf_refraction_setup(sc);
+ ccl_fetch(sd, flag) |= bsdf_refraction_setup(sc);
}
else {
sc->data0 = param1;
@@ -235,9 +237,9 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
sc->data2 = eta;
if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID)
- sd->flag |= bsdf_microfacet_beckmann_refraction_setup(sc);
+ ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(sc);
else
- sd->flag |= bsdf_microfacet_ggx_refraction_setup(sc);
+ ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(sc);
}
}
@@ -254,15 +256,15 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
#endif
/* index of refraction */
float eta = fmaxf(param2, 1e-5f);
- eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
+ eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;
/* fresnel */
- float cosNO = dot(N, sd->I);
+ float cosNO = dot(N, ccl_fetch(sd, I));
float fresnel = fresnel_dielectric_cos(cosNO, eta);
float roughness = param1;
/* reflection */
- ShaderClosure *sc = &sd->closure[sd->num_closure];
+ ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
float3 weight = sc->weight;
float sample_weight = sc->sample_weight;
@@ -283,7 +285,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
#endif
/* refraction */
- sc = &sd->closure[sd->num_closure];
+ sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
sc->weight = weight;
sc->sample_weight = sample_weight;
@@ -332,11 +334,11 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
sc->data2 = 0.0f;
if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID)
- sd->flag |= bsdf_microfacet_beckmann_aniso_setup(sc);
+ ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_aniso_setup(sc);
else if(type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID)
- sd->flag |= bsdf_microfacet_ggx_aniso_setup(sc);
+ ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_aniso_setup(sc);
else
- sd->flag |= bsdf_ashikhmin_shirley_aniso_setup(sc);
+ ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_aniso_setup(sc);
}
break;
}
@@ -350,7 +352,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
sc->data0 = saturate(param1);
sc->data1 = 0.0f;
sc->data2 = 0.0f;
- sd->flag |= bsdf_ashikhmin_velvet_setup(sc);
+ ccl_fetch(sd, flag) |= bsdf_ashikhmin_velvet_setup(sc);
}
break;
}
@@ -366,9 +368,9 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
sc->data2 = 0.0f;
if(type == CLOSURE_BSDF_DIFFUSE_TOON_ID)
- sd->flag |= bsdf_diffuse_toon_setup(sc);
+ ccl_fetch(sd, flag) |= bsdf_diffuse_toon_setup(sc);
else
- sd->flag |= bsdf_glossy_toon_setup(sc);
+ ccl_fetch(sd, flag) |= bsdf_glossy_toon_setup(sc);
}
break;
}
@@ -376,7 +378,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
case CLOSURE_BSDF_HAIR_REFLECTION_ID:
case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: {
- if(sd->flag & SD_BACKFACING && sd->type & PRIMITIVE_ALL_CURVE) {
+ if(ccl_fetch(sd, flag) & SD_BACKFACING && ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight);
if(sc) {
@@ -389,11 +391,11 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
sc->N = N;
sc->data0 = 0.0f;
sc->data1 = 0.0f;
- sd->flag |= bsdf_transparent_setup(sc);
+ ccl_fetch(sd, flag) |= bsdf_transparent_setup(sc);
}
}
else {
- ShaderClosure *sc = &sd->closure[sd->num_closure];
+ ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
sc = svm_node_closure_get_bsdf(sd, mix_weight);
if(sc) {
@@ -402,18 +404,18 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
sc->data1 = param2;
sc->data2 = -stack_load_float(stack, data_node.z);
- if(!(sd->type & PRIMITIVE_ALL_CURVE)) {
- sc->T = normalize(sd->dPdv);
+ if(!(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)) {
+ sc->T = normalize(ccl_fetch(sd, dPdv));
sc->data2 = 0.0f;
}
else
- sc->T = normalize(sd->dPdu);
+ sc->T = normalize(ccl_fetch(sd, dPdu));
if(type == CLOSURE_BSDF_HAIR_REFLECTION_ID) {
- sd->flag |= bsdf_hair_reflection_setup(sc);
+ ccl_fetch(sd, flag) |= bsdf_hair_reflection_setup(sc);
}
else {
- sd->flag |= bsdf_hair_transmission_setup(sc);
+ ccl_fetch(sd, flag) |= bsdf_hair_transmission_setup(sc);
}
}
}
@@ -423,9 +425,14 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
#endif
#ifdef __SUBSURFACE__
+#ifndef __SPLIT_KERNEL__
+# define sc_next(sc) sc++
+# else
+# define sc_next(sc) sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure))
+# endif
case CLOSURE_BSSRDF_CUBIC_ID:
case CLOSURE_BSSRDF_GAUSSIAN_ID: {
- ShaderClosure *sc = &sd->closure[sd->num_closure];
+ ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
float3 weight = sc->weight * mix_weight;
float sample_weight = fabsf(average(weight));
@@ -435,7 +442,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
if(path_flag & PATH_RAY_DIFFUSE_ANCESTOR)
param1 = 0.0f;
- if(sample_weight > CLOSURE_WEIGHT_CUTOFF && sd->num_closure+2 < MAX_CLOSURE) {
+ if(sample_weight > CLOSURE_WEIGHT_CUTOFF && ccl_fetch(sd, num_closure)+2 < MAX_CLOSURE) {
/* radius * scale */
float3 radius = stack_load_float3(stack, data_node.z)*param1;
/* sharpness */
@@ -455,10 +462,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
sc->prim = NULL;
#endif
sc->N = N;
- sd->flag |= bssrdf_setup(sc, (ClosureType)type);
+ ccl_fetch(sd, flag) |= bssrdf_setup(sc, (ClosureType)type);
- sd->num_closure++;
- sc++;
+ ccl_fetch(sd, num_closure)++;
+ sc_next(sc);
}
if(fabsf(weight.y) > 0.0f) {
@@ -472,10 +479,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
sc->prim = NULL;
#endif
sc->N = N;
- sd->flag |= bssrdf_setup(sc, (ClosureType)type);
+ ccl_fetch(sd, flag) |= bssrdf_setup(sc, (ClosureType)type);
- sd->num_closure++;
- sc++;
+ ccl_fetch(sd, num_closure)++;
+ sc_next(sc);
}
if(fabsf(weight.z) > 0.0f) {
@@ -489,15 +496,16 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
sc->prim = NULL;
#endif
sc->N = N;
- sd->flag |= bssrdf_setup(sc, (ClosureType)type);
+ ccl_fetch(sd, flag) |= bssrdf_setup(sc, (ClosureType)type);
- sd->num_closure++;
- sc++;
+ ccl_fetch(sd, num_closure)++;
+ sc_next(sc);
}
}
break;
}
+# undef sc_next
#endif
default:
break;
@@ -525,7 +533,7 @@ ccl_device void svm_node_closure_volume(KernelGlobals *kg, ShaderData *sd, float
ShaderClosure *sc = svm_node_closure_get_absorption(sd, mix_weight * density);
if(sc) {
- sd->flag |= volume_absorption_setup(sc);
+ ccl_fetch(sd, flag) |= volume_absorption_setup(sc);
}
break;
}
@@ -535,7 +543,7 @@ ccl_device void svm_node_closure_volume(KernelGlobals *kg, ShaderData *sd, float
if(sc) {
sc->data0 = param2; /* g */
sc->data1 = 0.0f;
- sd->flag |= volume_henyey_greenstein_setup(sc);
+ ccl_fetch(sd, flag) |= volume_henyey_greenstein_setup(sc);
}
break;
}
@@ -560,7 +568,7 @@ ccl_device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 no
else
svm_node_closure_get_non_bsdf(sd, CLOSURE_EMISSION_ID, 1.0f);
- sd->flag |= SD_EMISSION;
+ ccl_fetch(sd, flag) |= SD_EMISSION;
}
ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4 node)
@@ -594,7 +602,7 @@ ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 nod
else
svm_node_closure_get_non_bsdf(sd, CLOSURE_HOLDOUT_ID, 1.0f);
- sd->flag |= SD_HOLDOUT;
+ ccl_fetch(sd, flag) |= SD_HOLDOUT;
}
ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack, uint4 node)
@@ -612,15 +620,17 @@ ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack,
else
svm_node_closure_get_non_bsdf(sd, CLOSURE_AMBIENT_OCCLUSION_ID, 1.0f);
- sd->flag |= SD_AO;
+ ccl_fetch(sd, flag) |= SD_AO;
}
/* Closure Nodes */
ccl_device_inline void svm_node_closure_store_weight(ShaderData *sd, float3 weight)
{
- if(sd->num_closure < MAX_CLOSURE)
- sd->closure[sd->num_closure].weight = weight;
+ if(ccl_fetch(sd, num_closure) < MAX_CLOSURE) {
+ ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure));
+ sc->weight = weight;
+ }
}
ccl_device void svm_node_closure_set_weight(ShaderData *sd, uint r, uint g, uint b)
@@ -670,7 +680,7 @@ ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node)
ccl_device void svm_node_set_normal(KernelGlobals *kg, ShaderData *sd, float *stack, uint in_direction, uint out_normal)
{
float3 normal = stack_load_float3(stack, in_direction);
- sd->N = normal;
+ ccl_fetch(sd, N) = normal;
stack_store_float3(stack, out_normal, normal);
}
diff --git a/intern/cycles/kernel/svm/svm_displace.h b/intern/cycles/kernel/svm/svm_displace.h
index 4a058905a93..8d4b07c9973 100644
--- a/intern/cycles/kernel/svm/svm_displace.h
+++ b/intern/cycles/kernel/svm/svm_displace.h
@@ -25,11 +25,11 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
uint normal_offset, distance_offset, invert;
decode_node_uchar4(node.y, &normal_offset, &distance_offset, &invert, NULL);
- float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): sd->N;
+ float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N);
/* get surface tangents from normal */
- float3 Rx = cross(sd->dP.dy, normal_in);
- float3 Ry = cross(normal_in, sd->dP.dx);
+ float3 Rx = cross(ccl_fetch(sd, dP).dy, normal_in);
+ float3 Ry = cross(normal_in, ccl_fetch(sd, dP).dx);
/* get bump values */
uint c_offset, x_offset, y_offset, strength_offset;
@@ -40,7 +40,7 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
float h_y = stack_load_float(stack, y_offset);
/* compute surface gradient and determinant */
- float det = dot(sd->dP.dx, Rx);
+ float det = dot(ccl_fetch(sd, dP).dx, Rx);
float3 surfgrad = (h_x - h_c)*Rx + (h_y - h_c)*Ry;
float absdet = fabsf(det);
@@ -65,7 +65,7 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
ccl_device void svm_node_set_displacement(ShaderData *sd, float *stack, uint fac_offset)
{
float d = stack_load_float(stack, fac_offset);
- sd->P += sd->N*d*0.1f; /* todo: get rid of this factor */
+ ccl_fetch(sd, P) += ccl_fetch(sd, N)*d*0.1f; /* todo: get rid of this factor */
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_fresnel.h b/intern/cycles/kernel/svm/svm_fresnel.h
index 3703ec55015..23c97d80cb0 100644
--- a/intern/cycles/kernel/svm/svm_fresnel.h
+++ b/intern/cycles/kernel/svm/svm_fresnel.h
@@ -23,12 +23,12 @@ ccl_device void svm_node_fresnel(ShaderData *sd, float *stack, uint ior_offset,
uint normal_offset, out_offset;
decode_node_uchar4(node, &normal_offset, &out_offset, NULL, NULL);
float eta = (stack_valid(ior_offset))? stack_load_float(stack, ior_offset): __uint_as_float(ior_value);
- float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): sd->N;
+ float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N);
eta = fmaxf(eta, 1e-5f);
- eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
+ eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;
- float f = fresnel_dielectric_cos(dot(sd->I, normal_in), eta);
+ float f = fresnel_dielectric_cos(dot(ccl_fetch(sd, I), normal_in), eta);
stack_store_float(stack, out_offset, f);
}
@@ -44,18 +44,18 @@ ccl_device void svm_node_layer_weight(ShaderData *sd, float *stack, uint4 node)
decode_node_uchar4(node.w, &type, &normal_offset, &out_offset, NULL);
float blend = (stack_valid(blend_offset))? stack_load_float(stack, blend_offset): __uint_as_float(blend_value);
- float3 normal_in = (stack_valid(normal_offset))? stack_load_float3(stack, normal_offset): sd->N;
+ float3 normal_in = (stack_valid(normal_offset))? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N);
float f;
if(type == NODE_LAYER_WEIGHT_FRESNEL) {
float eta = fmaxf(1.0f - blend, 1e-5f);
- eta = (sd->flag & SD_BACKFACING)? eta: 1.0f/eta;
+ eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? eta: 1.0f/eta;
- f = fresnel_dielectric_cos(dot(sd->I, normal_in), eta);
+ f = fresnel_dielectric_cos(dot(ccl_fetch(sd, I), normal_in), eta);
}
else {
- f = fabsf(dot(sd->I, normal_in));
+ f = fabsf(dot(ccl_fetch(sd, I), normal_in));
if(blend != 0.5f) {
blend = clamp(blend, 0.0f, 1.0f-1e-5f);
diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h
index efbefa77d28..bb06254c3a9 100644
--- a/intern/cycles/kernel/svm/svm_geometry.h
+++ b/intern/cycles/kernel/svm/svm_geometry.h
@@ -23,15 +23,15 @@ ccl_device void svm_node_geometry(KernelGlobals *kg, ShaderData *sd, float *stac
float3 data;
switch(type) {
- case NODE_GEOM_P: data = sd->P; break;
- case NODE_GEOM_N: data = sd->N; break;
+ case NODE_GEOM_P: data = ccl_fetch(sd, P); break;
+ case NODE_GEOM_N: data = ccl_fetch(sd, N); break;
#ifdef __DPDU__
case NODE_GEOM_T: data = primitive_tangent(kg, sd); break;
#endif
- case NODE_GEOM_I: data = sd->I; break;
- case NODE_GEOM_Ng: data = sd->Ng; break;
+ case NODE_GEOM_I: data = ccl_fetch(sd, I); break;
+ case NODE_GEOM_Ng: data = ccl_fetch(sd, Ng); break;
#ifdef __UV__
- case NODE_GEOM_uv: data = make_float3(sd->u, sd->v, 0.0f); break;
+ case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u), ccl_fetch(sd, v), 0.0f); break;
#endif
}
@@ -44,8 +44,8 @@ ccl_device void svm_node_geometry_bump_dx(KernelGlobals *kg, ShaderData *sd, flo
float3 data;
switch(type) {
- case NODE_GEOM_P: data = sd->P + sd->dP.dx; break;
- case NODE_GEOM_uv: data = make_float3(sd->u + sd->du.dx, sd->v + sd->dv.dx, 0.0f); break;
+ case NODE_GEOM_P: data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx; break;
+ case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u) + ccl_fetch(sd, du).dx, ccl_fetch(sd, v) + ccl_fetch(sd, dv).dx, 0.0f); break;
default: svm_node_geometry(kg, sd, stack, type, out_offset); return;
}
@@ -61,8 +61,8 @@ ccl_device void svm_node_geometry_bump_dy(KernelGlobals *kg, ShaderData *sd, flo
float3 data;
switch(type) {
- case NODE_GEOM_P: data = sd->P + sd->dP.dy; break;
- case NODE_GEOM_uv: data = make_float3(sd->u + sd->du.dy, sd->v + sd->dv.dy, 0.0f); break;
+ case NODE_GEOM_P: data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy; break;
+ case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u) + ccl_fetch(sd, du).dy, ccl_fetch(sd, v) + ccl_fetch(sd, dv).dy, 0.0f); break;
default: svm_node_geometry(kg, sd, stack, type, out_offset); return;
}
@@ -83,9 +83,9 @@ ccl_device void svm_node_object_info(KernelGlobals *kg, ShaderData *sd, float *s
stack_store_float3(stack, out_offset, object_location(kg, sd));
return;
}
- case NODE_INFO_OB_INDEX: data = object_pass_id(kg, sd->object); break;
+ case NODE_INFO_OB_INDEX: data = object_pass_id(kg, ccl_fetch(sd, object)); break;
case NODE_INFO_MAT_INDEX: data = shader_pass_id(kg, sd); break;
- case NODE_INFO_OB_RANDOM: data = object_random_number(kg, sd->object); break;
+ case NODE_INFO_OB_RANDOM: data = object_random_number(kg, ccl_fetch(sd, object)); break;
default: data = 0.0f; break;
}
@@ -98,44 +98,44 @@ ccl_device void svm_node_particle_info(KernelGlobals *kg, ShaderData *sd, float
{
switch(type) {
case NODE_INFO_PAR_INDEX: {
- int particle_id = object_particle_id(kg, sd->object);
+ int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
stack_store_float(stack, out_offset, particle_index(kg, particle_id));
break;
}
case NODE_INFO_PAR_AGE: {
- int particle_id = object_particle_id(kg, sd->object);
+ int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
stack_store_float(stack, out_offset, particle_age(kg, particle_id));
break;
}
case NODE_INFO_PAR_LIFETIME: {
- int particle_id = object_particle_id(kg, sd->object);
+ int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
stack_store_float(stack, out_offset, particle_lifetime(kg, particle_id));
break;
}
case NODE_INFO_PAR_LOCATION: {
- int particle_id = object_particle_id(kg, sd->object);
+ int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
stack_store_float3(stack, out_offset, particle_location(kg, particle_id));
break;
}
#if 0 /* XXX float4 currently not supported in SVM stack */
case NODE_INFO_PAR_ROTATION: {
- int particle_id = object_particle_id(kg, sd->object);
+ int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
stack_store_float4(stack, out_offset, particle_rotation(kg, particle_id));
break;
}
#endif
case NODE_INFO_PAR_SIZE: {
- int particle_id = object_particle_id(kg, sd->object);
+ int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
stack_store_float(stack, out_offset, particle_size(kg, particle_id));
break;
}
case NODE_INFO_PAR_VELOCITY: {
- int particle_id = object_particle_id(kg, sd->object);
+ int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
stack_store_float3(stack, out_offset, particle_velocity(kg, particle_id));
break;
}
case NODE_INFO_PAR_ANGULAR_VELOCITY: {
- int particle_id = object_particle_id(kg, sd->object);
+ int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
stack_store_float3(stack, out_offset, particle_angular_velocity(kg, particle_id));
break;
}
@@ -153,7 +153,7 @@ ccl_device void svm_node_hair_info(KernelGlobals *kg, ShaderData *sd, float *sta
switch(type) {
case NODE_INFO_CURVE_IS_STRAND: {
- data = (sd->type & PRIMITIVE_ALL_CURVE) != 0;
+ data = (ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) != 0;
stack_store_float(stack, out_offset, data);
break;
}
@@ -165,7 +165,7 @@ ccl_device void svm_node_hair_info(KernelGlobals *kg, ShaderData *sd, float *sta
break;
}
/*case NODE_INFO_CURVE_FADE: {
- data = sd->curve_transparency;
+ data = ccl_fetch(sd, curve_transparency);
stack_store_float(stack, out_offset, data);
break;
}*/
diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h
index 08a6c01162c..caf0b37ba35 100644
--- a/intern/cycles/kernel/svm/svm_image.h
+++ b/intern/cycles/kernel/svm/svm_image.h
@@ -392,10 +392,10 @@ ccl_device void svm_node_tex_image(KernelGlobals *kg, ShaderData *sd, float *sta
ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
{
/* get object space normal */
- float3 N = sd->N;
+ float3 N = ccl_fetch(sd, N);
- N = sd->N;
- if(sd->object != OBJECT_NONE)
+ N = ccl_fetch(sd, N);
+ if(ccl_fetch(sd, object) != OBJECT_NONE)
object_inverse_normal_transform(kg, sd, &N);
/* project from direction vector to barycentric coordinates in triangles */
diff --git a/intern/cycles/kernel/svm/svm_light_path.h b/intern/cycles/kernel/svm/svm_light_path.h
index ffadafb1d0c..a235dd35224 100644
--- a/intern/cycles/kernel/svm/svm_light_path.h
+++ b/intern/cycles/kernel/svm/svm_light_path.h
@@ -31,10 +31,10 @@ ccl_device void svm_node_light_path(ShaderData *sd, float *stack, uint type, uin
case NODE_LP_reflection: info = (path_flag & PATH_RAY_REFLECT)? 1.0f: 0.0f; break;
case NODE_LP_transmission: info = (path_flag & PATH_RAY_TRANSMIT)? 1.0f: 0.0f; break;
case NODE_LP_volume_scatter: info = (path_flag & PATH_RAY_VOLUME_SCATTER)? 1.0f: 0.0f; break;
- case NODE_LP_backfacing: info = (sd->flag & SD_BACKFACING)? 1.0f: 0.0f; break;
- case NODE_LP_ray_length: info = sd->ray_length; break;
- case NODE_LP_ray_depth: info = (float)sd->ray_depth; break;
- case NODE_LP_ray_transparent: info = (float)sd->transparent_depth; break;
+ case NODE_LP_backfacing: info = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f: 0.0f; break;
+ case NODE_LP_ray_length: info = ccl_fetch(sd, ray_length); break;
+ case NODE_LP_ray_depth: info = (float)ccl_fetch(sd, ray_depth); break;
+ case NODE_LP_ray_transparent: info = (float)ccl_fetch(sd, transparent_depth); break;
}
stack_store_float(stack, out_offset, info);
@@ -53,14 +53,14 @@ ccl_device void svm_node_light_falloff(ShaderData *sd, float *stack, uint4 node)
switch(type) {
case NODE_LIGHT_FALLOFF_QUADRATIC: break;
- case NODE_LIGHT_FALLOFF_LINEAR: strength *= sd->ray_length; break;
- case NODE_LIGHT_FALLOFF_CONSTANT: strength *= sd->ray_length*sd->ray_length; break;
+ case NODE_LIGHT_FALLOFF_LINEAR: strength *= ccl_fetch(sd, ray_length); break;
+ case NODE_LIGHT_FALLOFF_CONSTANT: strength *= ccl_fetch(sd, ray_length)*ccl_fetch(sd, ray_length); break;
}
float smooth = stack_load_float(stack, smooth_offset);
if(smooth > 0.0f) {
- float squared = sd->ray_length*sd->ray_length;
+ float squared = ccl_fetch(sd, ray_length)*ccl_fetch(sd, ray_length);
strength *= squared/(smooth + squared);
}
diff --git a/intern/cycles/kernel/svm/svm_tex_coord.h b/intern/cycles/kernel/svm/svm_tex_coord.h
index a399acf3c0f..eebd9bee420 100644
--- a/intern/cycles/kernel/svm/svm_tex_coord.h
+++ b/intern/cycles/kernel/svm/svm_tex_coord.h
@@ -31,9 +31,9 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg,
switch(type) {
case NODE_TEXCO_OBJECT: {
- data = sd->P;
+ data = ccl_fetch(sd, P);
if(node.w == 0) {
- if(sd->object != OBJECT_NONE) {
+ if(ccl_fetch(sd, object) != OBJECT_NONE) {
object_inverse_position_transform(kg, sd, &data);
}
}
@@ -48,48 +48,48 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg,
break;
}
case NODE_TEXCO_NORMAL: {
- data = sd->N;
- if(sd->object != OBJECT_NONE)
+ data = ccl_fetch(sd, N);
+ if(ccl_fetch(sd, object) != OBJECT_NONE)
object_inverse_normal_transform(kg, sd, &data);
break;
}
case NODE_TEXCO_CAMERA: {
Transform tfm = kernel_data.cam.worldtocamera;
- if(sd->object != OBJECT_NONE)
- data = transform_point(&tfm, sd->P);
+ if(ccl_fetch(sd, object) != OBJECT_NONE)
+ data = transform_point(&tfm, ccl_fetch(sd, P));
else
- data = transform_point(&tfm, sd->P + camera_position(kg));
+ data = transform_point(&tfm, ccl_fetch(sd, P) + camera_position(kg));
break;
}
case NODE_TEXCO_WINDOW: {
- if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
- data = camera_world_to_ndc(kg, sd, sd->ray_P);
+ if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
+ data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P));
else
- data = camera_world_to_ndc(kg, sd, sd->P);
+ data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P));
data.z = 0.0f;
break;
}
case NODE_TEXCO_REFLECTION: {
- if(sd->object != OBJECT_NONE)
- data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I;
+ if(ccl_fetch(sd, object) != OBJECT_NONE)
+ data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I);
else
- data = sd->I;
+ data = ccl_fetch(sd, I);
break;
}
case NODE_TEXCO_DUPLI_GENERATED: {
- data = object_dupli_generated(kg, sd->object);
+ data = object_dupli_generated(kg, ccl_fetch(sd, object));
break;
}
case NODE_TEXCO_DUPLI_UV: {
- data = object_dupli_uv(kg, sd->object);
+ data = object_dupli_uv(kg, ccl_fetch(sd, object));
break;
}
case NODE_TEXCO_VOLUME_GENERATED: {
- data = sd->P;
+ data = ccl_fetch(sd, P);
#ifdef __VOLUME__
- if(sd->object != OBJECT_NONE)
+ if(ccl_fetch(sd, object) != OBJECT_NONE)
data = volume_normalized_position(kg, sd, data);
#endif
break;
@@ -113,9 +113,9 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg,
switch(type) {
case NODE_TEXCO_OBJECT: {
- data = sd->P + sd->dP.dx;
+ data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx;
if(node.w == 0) {
- if(sd->object != OBJECT_NONE) {
+ if(ccl_fetch(sd, object) != OBJECT_NONE) {
object_inverse_position_transform(kg, sd, &data);
}
}
@@ -130,48 +130,48 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg,
break;
}
case NODE_TEXCO_NORMAL: {
- data = sd->N;
- if(sd->object != OBJECT_NONE)
+ data = ccl_fetch(sd, N);
+ if(ccl_fetch(sd, object) != OBJECT_NONE)
object_inverse_normal_transform(kg, sd, &data);
break;
}
case NODE_TEXCO_CAMERA: {
Transform tfm = kernel_data.cam.worldtocamera;
- if(sd->object != OBJECT_NONE)
- data = transform_point(&tfm, sd->P + sd->dP.dx);
+ if(ccl_fetch(sd, object) != OBJECT_NONE)
+ data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx);
else
- data = transform_point(&tfm, sd->P + sd->dP.dx + camera_position(kg));
+ data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx + camera_position(kg));
break;
}
case NODE_TEXCO_WINDOW: {
- if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
- data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dx);
+ if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
+ data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P) + ccl_fetch(sd, ray_dP).dx);
else
- data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dx);
+ data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx);
data.z = 0.0f;
break;
}
case NODE_TEXCO_REFLECTION: {
- if(sd->object != OBJECT_NONE)
- data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I;
+ if(ccl_fetch(sd, object) != OBJECT_NONE)
+ data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I);
else
- data = sd->I;
+ data = ccl_fetch(sd, I);
break;
}
case NODE_TEXCO_DUPLI_GENERATED: {
- data = object_dupli_generated(kg, sd->object);
+ data = object_dupli_generated(kg, ccl_fetch(sd, object));
break;
}
case NODE_TEXCO_DUPLI_UV: {
- data = object_dupli_uv(kg, sd->object);
+ data = object_dupli_uv(kg, ccl_fetch(sd, object));
break;
}
case NODE_TEXCO_VOLUME_GENERATED: {
- data = sd->P + sd->dP.dx;
+ data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx;
#ifdef __VOLUME__
- if(sd->object != OBJECT_NONE)
+ if(ccl_fetch(sd, object) != OBJECT_NONE)
data = volume_normalized_position(kg, sd, data);
#endif
break;
@@ -198,9 +198,9 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg,
switch(type) {
case NODE_TEXCO_OBJECT: {
- data = sd->P + sd->dP.dy;
+ data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy;
if(node.w == 0) {
- if(sd->object != OBJECT_NONE) {
+ if(ccl_fetch(sd, object) != OBJECT_NONE) {
object_inverse_position_transform(kg, sd, &data);
}
}
@@ -215,48 +215,48 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg,
break;
}
case NODE_TEXCO_NORMAL: {
- data = sd->N;
- if(sd->object != OBJECT_NONE)
+ data = ccl_fetch(sd, N);
+ if(ccl_fetch(sd, object) != OBJECT_NONE)
object_inverse_normal_transform(kg, sd, &data);
break;
}
case NODE_TEXCO_CAMERA: {
Transform tfm = kernel_data.cam.worldtocamera;
- if(sd->object != OBJECT_NONE)
- data = transform_point(&tfm, sd->P + sd->dP.dy);
+ if(ccl_fetch(sd, object) != OBJECT_NONE)
+ data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy);
else
- data = transform_point(&tfm, sd->P + sd->dP.dy + camera_position(kg));
+ data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy + camera_position(kg));
break;
}
case NODE_TEXCO_WINDOW: {
- if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
- data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dy);
+ if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
+ data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P) + ccl_fetch(sd, ray_dP).dy);
else
- data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dy);
+ data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy);
data.z = 0.0f;
break;
}
case NODE_TEXCO_REFLECTION: {
- if(sd->object != OBJECT_NONE)
- data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I;
+ if(ccl_fetch(sd, object) != OBJECT_NONE)
+ data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I);
else
- data = sd->I;
+ data = ccl_fetch(sd, I);
break;
}
case NODE_TEXCO_DUPLI_GENERATED: {
- data = object_dupli_generated(kg, sd->object);
+ data = object_dupli_generated(kg, ccl_fetch(sd, object));
break;
}
case NODE_TEXCO_DUPLI_UV: {
- data = object_dupli_uv(kg, sd->object);
+ data = object_dupli_uv(kg, ccl_fetch(sd, object));
break;
}
case NODE_TEXCO_VOLUME_GENERATED: {
- data = sd->P + sd->dP.dy;
+ data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy;
#ifdef __VOLUME__
- if(sd->object != OBJECT_NONE)
+ if(ccl_fetch(sd, object) != OBJECT_NONE)
data = volume_normalized_position(kg, sd, data);
#endif
break;
@@ -281,7 +281,7 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st
if(space == NODE_NORMAL_MAP_TANGENT) {
/* tangent space */
- if(sd->object == OBJECT_NONE) {
+ if(ccl_fetch(sd, object) == OBJECT_NONE) {
stack_store_float3(stack, normal_offset, make_float3(0.0f, 0.0f, 0.0f));
return;
}
@@ -302,11 +302,11 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st
float sign = primitive_attribute_float(kg, sd, attr_sign_elem, attr_sign_offset, NULL, NULL);
float3 normal;
- if(sd->shader & SHADER_SMOOTH_NORMAL) {
+ if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) {
normal = primitive_attribute_float3(kg, sd, attr_normal_elem, attr_normal_offset, NULL, NULL);
}
else {
- normal = sd->Ng;
+ normal = ccl_fetch(sd, Ng);
object_inverse_normal_transform(kg, sd, &normal);
}
@@ -337,7 +337,7 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st
if(strength != 1.0f) {
strength = max(strength, 0.0f);
- N = normalize(sd->N + (N - sd->N)*strength);
+ N = normalize(ccl_fetch(sd, N) + (N - ccl_fetch(sd, N))*strength);
}
stack_store_float3(stack, normal_offset, N);
@@ -367,7 +367,7 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack
float3 generated;
if(attr_offset == ATTR_STD_NOT_FOUND)
- generated = sd->P;
+ generated = ccl_fetch(sd, P);
else
generated = primitive_attribute_float3(kg, sd, attr_elem, attr_offset, NULL, NULL);
@@ -380,7 +380,7 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack
}
object_normal_transform(kg, sd, &tangent);
- tangent = cross(sd->N, normalize(cross(tangent, sd->N)));
+ tangent = cross(ccl_fetch(sd, N), normalize(cross(tangent, ccl_fetch(sd, N))));
stack_store_float3(stack, tangent_offset, tangent);
}
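The mechanical change running through svm_tex_coord.h (and the other kernel headers in this commit) replaces direct member access like sd->P with the ccl_fetch() macro, so the same kernel source can address ShaderData either as an ordinary per-thread struct (megakernel) or as a structure-of-arrays shared by all work-items (split kernel). A minimal sketch of the kind of indirection such a macro can provide; the guard and index names here (__SPLIT_KERNEL_SOA__, SD_THREAD) are illustrative assumptions, not necessarily the exact definitions in the tree:

/* Illustrative sketch only -- the real definitions live in the kernel
 * compat headers; macro and symbol names here are assumptions. */
#ifdef __SPLIT_KERNEL_SOA__
/* Split kernel: each ShaderData member is an array indexed by the
 * thread id, so neighbouring work-items touch contiguous memory. */
#  define ccl_fetch(s, t) ((s)->t[SD_THREAD])
#else
/* Megakernel: ShaderData is a plain per-thread struct. */
#  define ccl_fetch(s, t) ((s)->t)
#endif

Either way the call sites read identically, which is why the diff can convert them wholesale without touching the surrounding logic.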
diff --git a/intern/cycles/kernel/svm/svm_vector_transform.h b/intern/cycles/kernel/svm/svm_vector_transform.h
index 4e92f27acdb..4c32130d06d 100644
--- a/intern/cycles/kernel/svm/svm_vector_transform.h
+++ b/intern/cycles/kernel/svm/svm_vector_transform.h
@@ -33,7 +33,7 @@ ccl_device void svm_node_vector_transform(KernelGlobals *kg, ShaderData *sd, flo
NodeVectorTransformConvertSpace to = (NodeVectorTransformConvertSpace)ito;
Transform tfm;
- bool is_object = (sd->object != OBJECT_NONE);
+ bool is_object = (ccl_fetch(sd, object) != OBJECT_NONE);
bool is_direction = (type == NODE_VECTOR_TRANSFORM_TYPE_VECTOR || type == NODE_VECTOR_TRANSFORM_TYPE_NORMAL);
/* From world */
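The two booleans set up above decide which transform path svm_node_vector_transform() takes: points pick up the translation component of the matrix, while vectors and normals must not (normals additionally need the inverse-transpose to stay perpendicular under non-uniform scale, which the node handles separately). A hedged sketch of that point/direction split, using the transform helpers from util_transform.h in a simplified dispatch that only approximates the actual node body:

/* Sketch of the point/direction distinction; transform_point() and
 * transform_direction() are real Cycles helpers, but this wrapper is
 * illustrative rather than the node's actual control flow. */
float3 apply_space_transform(const Transform *tfm, float3 v,
                             bool is_direction)
{
	if(is_direction)
		return normalize(transform_direction(tfm, v));  /* no translation */
	else
		return transform_point(tfm, v);                 /* with translation */
}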
diff --git a/intern/cycles/kernel/svm/svm_wireframe.h b/intern/cycles/kernel/svm/svm_wireframe.h
index eaa17f8ce57..30ccd523add 100644
--- a/intern/cycles/kernel/svm/svm_wireframe.h
+++ b/intern/cycles/kernel/svm/svm_wireframe.h
@@ -41,9 +41,9 @@ ccl_device float wireframe(KernelGlobals *kg,
float3 *P)
{
#ifdef __HAIR__
- if(sd->prim != PRIM_NONE && sd->type & PRIMITIVE_ALL_TRIANGLE)
+ if(ccl_fetch(sd, prim) != PRIM_NONE && ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)
#else
- if(sd->prim != PRIM_NONE)
+ if(ccl_fetch(sd, prim) != PRIM_NONE)
#endif
{
float3 Co[3];
@@ -52,12 +52,12 @@ ccl_device float wireframe(KernelGlobals *kg,
/* Triangles */
int np = 3;
- if(sd->type & PRIMITIVE_TRIANGLE)
- triangle_vertices(kg, sd->prim, Co);
+ if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE)
+ triangle_vertices(kg, ccl_fetch(sd, prim), Co);
else
- motion_triangle_vertices(kg, sd->object, sd->prim, sd->time, Co);
+ motion_triangle_vertices(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), Co);
- if(!(sd->flag & SD_TRANSFORM_APPLIED)) {
+ if(!(ccl_fetch(sd, flag) & SD_TRANSFORM_APPLIED)) {
object_position_transform(kg, sd, &Co[0]);
object_position_transform(kg, sd, &Co[1]);
object_position_transform(kg, sd, &Co[2]);
@@ -66,8 +66,8 @@ ccl_device float wireframe(KernelGlobals *kg,
if(pixel_size) {
// Project the derivatives of P onto the viewing plane defined
// by I so we have a measure of how big a pixel is at this point
- float pixelwidth_x = len(sd->dP.dx - dot(sd->dP.dx, sd->I) * sd->I);
- float pixelwidth_y = len(sd->dP.dy - dot(sd->dP.dy, sd->I) * sd->I);
+ float pixelwidth_x = len(ccl_fetch(sd, dP).dx - dot(ccl_fetch(sd, dP).dx, ccl_fetch(sd, I)) * ccl_fetch(sd, I));
+ float pixelwidth_y = len(ccl_fetch(sd, dP).dy - dot(ccl_fetch(sd, dP).dy, ccl_fetch(sd, I)) * ccl_fetch(sd, I));
// Take the average of both axes' lengths
pixelwidth = (pixelwidth_x + pixelwidth_y) * 0.5f;
}
@@ -106,16 +106,27 @@ ccl_device void svm_node_wireframe(KernelGlobals *kg,
int pixel_size = (int)use_pixel_size;
/* Calculate wireframe */
- float f = wireframe(kg, sd, size, pixel_size, &sd->P);
+#ifdef __SPLIT_KERNEL__
+ /* TODO(sergey): This is needed because sd actually resides in global
+ * address space, which makes it difficult to re-use the same
+ * wireframe() function.
+ *
+ * With OpenCL 2.0 it will be possible to avoid this copy, but until
+ * then we'll have to live with this exception.
+ */
+ float3 P = ccl_fetch(sd, P);
+ float f = wireframe(kg, sd, size, pixel_size, &P);
+#else
+ float f = wireframe(kg, sd, size, pixel_size, &ccl_fetch(sd, P));
+#endif
/* TODO(sergey): Think of faster way to calculate derivatives. */
if(bump_offset == NODE_BUMP_OFFSET_DX) {
- float3 Px = sd->P - sd->dP.dx;
- f += (f - wireframe(kg, sd, size, pixel_size, &Px)) / len(sd->dP.dx);
+ float3 Px = ccl_fetch(sd, P) - ccl_fetch(sd, dP).dx;
+ f += (f - wireframe(kg, sd, size, pixel_size, &Px)) / len(ccl_fetch(sd, dP).dx);
}
else if(bump_offset == NODE_BUMP_OFFSET_DY) {
- float3 Py = sd->P - sd->dP.dy;
- f += (f - wireframe(kg, sd, size, pixel_size, &Py)) / len(sd->dP.dy);
+ float3 Py = ccl_fetch(sd, P) - ccl_fetch(sd, dP).dy;
+ f += (f - wireframe(kg, sd, size, pixel_size, &Py)) / len(ccl_fetch(sd, dP).dy);
}
if(stack_valid(out_fac))
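The #ifdef __SPLIT_KERNEL__ branch added above exists because in OpenCL 1.x a pointer's address space is part of its type: when ShaderData lives in __global memory, &ccl_fetch(sd, P) yields a __global float3 *, which cannot be passed where wireframe() expects a default (private) pointer. The workaround is to copy the value into a private local and pass its address instead. A small OpenCL-C sketch of the same mismatch, with hypothetical function names:

/* Hypothetical sketch of the address-space mismatch the
 * __SPLIT_KERNEL__ branch works around. */
void takes_private(float3 *p);            /* unqualified = __private */

void caller(__global float3 *gp)
{
	/* takes_private(gp);  -- would not compile in OpenCL 1.x:
	 * __global float3 * is a distinct pointer type. */
	float3 tmp = *gp;                 /* copy into private memory */
	takes_private(&tmp);              /* address spaces now match */
	*gp = tmp;                        /* write back if needed */
}

OpenCL 2.0's generic address space removes the distinction, which is what the TODO in the diff refers to.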
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index aacb81faf83..7b329af008d 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -807,7 +807,10 @@ void Session::update_status_time(bool show_pause, bool show_done)
substatus = string_printf("Path Tracing Tile %d/%d", tile, num_tiles);
- if((is_gpu && !is_multidevice) || (is_cpu && num_tiles == 1)) {
+ if(((is_gpu && !is_multidevice) || (is_cpu && num_tiles == 1)) && !device->info.use_split_kernel) {
+ /* When using the split kernel (OpenCL), each thread in a tile works on a
+ * different sample, so the current sample number cannot be displayed.
+ */
/* when rendering on GPU, multithreading happens within a single tile, as in
 * tiles are handled sequentially, and in this case we can display the
 * currently rendering sample number
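The session.cpp change follows from the kernel design: the split kernel interleaves samples across work-items inside one tile, so a single per-tile sample counter is meaningless and the patch suppresses it. A hedged C++ sketch of the resulting status logic; string_printf() is a real Cycles utility, but the variable names (sample, num_samples) only approximate the surrounding session.cpp code:

/* Illustrative only -- approximates the logic around this hunk. */
string substatus = string_printf("Path Tracing Tile %d/%d", tile, num_tiles);
const bool single_consumer = (is_gpu && !is_multidevice) ||
                             (is_cpu && num_tiles == 1);
if(single_consumer && !device->info.use_split_kernel) {
	/* Only meaningful when the whole device walks one tile sample by
	 * sample; the split kernel spreads samples across threads. */
	substatus += string_printf(", Sample %d/%d", sample, num_samples);
}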