diff options
57 files changed, 5784 insertions, 828 deletions
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index 4d40518644e..162f51252b0 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -55,6 +55,7 @@ public: bool advanced_shading; bool pack_images; bool extended_images; /* flag for GPU and Multi device */ + bool use_split_kernel; /* Denotes if the device is going to run cycles using split-kernel */ vector<DeviceInfo> multi_devices; DeviceInfo() @@ -66,6 +67,7 @@ public: advanced_shading = true; pack_images = false; extended_images = false; + use_split_kernel = false; } }; diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp index 1147cbd69b4..25eb160d71b 100644 --- a/intern/cycles/device/device_opencl.cpp +++ b/intern/cycles/device/device_opencl.cpp @@ -39,6 +39,30 @@ CCL_NAMESPACE_BEGIN #define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p)) +#define KERNEL_APPEND_ARG(kernel_name, arg) \ + opencl_assert(clSetKernelArg(kernel_name, narg++, sizeof(arg), (void*)&arg)) + +/* Macro declarations used with split kernel */ + +/* Macro to enable/disable work-stealing */ +#define __WORK_STEALING__ + +#define SPLIT_KERNEL_LOCAL_SIZE_X 64 +#define SPLIT_KERNEL_LOCAL_SIZE_Y 1 + +/* This value may be tuned according to the scene we are rendering. + * + * Modifying PATH_ITER_INC_FACTOR value proportional to number of expected + * ray-bounces will improve performance. + */ +#define PATH_ITER_INC_FACTOR 8 + +/* When allocate global memory in chunks. We may not be able to + * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes in chunks; + * Since some bytes may be needed for aligning chunks of memory; + * This is the amount of memory that we dedicate for that purpose. + */ +#define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB static cl_device_type opencl_device_type() { @@ -94,11 +118,11 @@ static string opencl_kernel_build_options(const string& platform, const string * build_options += "-D__KERNEL_OPENCL_AMD__ "; else if(platform == "Intel(R) OpenCL") { - build_options += "-D__KERNEL_OPENCL_INTEL_CPU__"; + build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ "; /* options for gdb source level kernel debugging. this segfaults on linux currently */ if(opencl_kernel_use_debug() && debug_src) - build_options += "-g -s \"" + *debug_src + "\""; + build_options += "-g -s \"" + *debug_src + "\" "; } if(opencl_kernel_use_debug()) @@ -118,14 +142,18 @@ class OpenCLCache { thread_mutex *mutex; cl_context context; - cl_program program; + /* cl_program for shader, bake, film_convert kernels (used in OpenCLDeviceBase) */ + cl_program ocl_dev_base_program; + /* cl_program for megakernel (used in OpenCLDeviceMegaKernel) */ + cl_program ocl_dev_megakernel_program; - Slot() : mutex(NULL), context(NULL), program(NULL) {} + Slot() : mutex(NULL), context(NULL), ocl_dev_base_program(NULL), ocl_dev_megakernel_program(NULL) {} Slot(const Slot &rhs) : mutex(rhs.mutex) , context(rhs.context) - , program(rhs.program) + , ocl_dev_base_program(rhs.ocl_dev_base_program) + , ocl_dev_megakernel_program(rhs.ocl_dev_megakernel_program) { /* copy can only happen in map insert, assert that */ assert(mutex == NULL); @@ -236,6 +264,12 @@ class OpenCLCache } public: + + enum ProgramName { + OCL_DEV_BASE_PROGRAM, + OCL_DEV_MEGAKERNEL_PROGRAM, + }; + /* see get_something comment */ static cl_context get_context(cl_platform_id platform, cl_device_id device, thread_scoped_lock &slot_locker) @@ -254,10 +288,21 @@ public: } /* see get_something comment */ - static cl_program get_program(cl_platform_id platform, cl_device_id device, + static cl_program get_program(cl_platform_id platform, cl_device_id device, ProgramName program_name, thread_scoped_lock &slot_locker) { - cl_program program = get_something<cl_program>(platform, device, &Slot::program, slot_locker); + cl_program program = NULL; + + if(program_name == OCL_DEV_BASE_PROGRAM) { + /* Get program related to OpenCLDeviceBase */ + program = get_something<cl_program>(platform, device, &Slot::ocl_dev_base_program, slot_locker); + } + else if(program_name == OCL_DEV_MEGAKERNEL_PROGRAM) { + /* Get program related to megakernel */ + program = get_something<cl_program>(platform, device, &Slot::ocl_dev_megakernel_program, slot_locker); + } else { + assert(!"Invalid program name"); + } if(!program) return NULL; @@ -284,10 +329,18 @@ public: } /* see store_something comment */ - static void store_program(cl_platform_id platform, cl_device_id device, cl_program program, + static void store_program(cl_platform_id platform, cl_device_id device, cl_program program, ProgramName program_name, thread_scoped_lock &slot_locker) { - store_something<cl_program>(platform, device, program, &Slot::program, slot_locker); + if(program_name == OCL_DEV_BASE_PROGRAM) { + store_something<cl_program>(platform, device, program, &Slot::ocl_dev_base_program, slot_locker); + } + else if(program_name == OCL_DEV_MEGAKERNEL_PROGRAM) { + store_something<cl_program>(platform, device, program, &Slot::ocl_dev_megakernel_program, slot_locker); + } else { + assert(!"Invalid program name\n"); + return; + } /* increment reference count in OpenCL. * The caller is going to release the object when done with it. */ @@ -304,8 +357,10 @@ public: thread_scoped_lock cache_lock(self.cache_lock); foreach(CacheMap::value_type &item, self.cache) { - if(item.second.program != NULL) - clReleaseProgram(item.second.program); + if(item.second.ocl_dev_base_program != NULL) + clReleaseProgram(item.second.ocl_dev_base_program); + if(item.second.ocl_dev_megakernel_program != NULL) + clReleaseProgram(item.second.ocl_dev_megakernel_program); if(item.second.context != NULL) clReleaseContext(item.second.context); } @@ -314,7 +369,7 @@ public: } }; -class OpenCLDevice : public Device +class OpenCLDeviceBase : public Device { public: DedicatedTaskPool task_pool; @@ -323,7 +378,6 @@ public: cl_platform_id cpPlatform; cl_device_id cdDevice; cl_program cpProgram; - cl_kernel ckPathTraceKernel; cl_kernel ckFilmConvertByteKernel; cl_kernel ckFilmConvertHalfFloatKernel; cl_kernel ckShaderKernel; @@ -385,7 +439,7 @@ public: } } - OpenCLDevice(DeviceInfo& info, Stats &stats, bool background_) + OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool background_) : Device(info, stats, background_) { cpPlatform = NULL; @@ -393,7 +447,6 @@ public: cxContext = NULL; cqCommandQueue = NULL; cpProgram = NULL; - ckPathTraceKernel = NULL; ckFilmConvertByteKernel = NULL; ckFilmConvertHalfFloatKernel = NULL; ckShaderKernel = NULL; @@ -501,7 +554,7 @@ public: if(opencl_error(ciErr)) return; - fprintf(stderr,"Device init succes\n"); + fprintf(stderr, "Device init success\n"); device_initialized = true; } @@ -547,7 +600,11 @@ public: return true; } - bool load_binary(const string& kernel_path, const string& clbin, const string *debug_src = NULL) + bool load_binary(const string& /*kernel_path*/, + const string& clbin, + string custom_kernel_build_options, + cl_program *program, + const string *debug_src = NULL) { /* read binary into memory */ vector<uint8_t> binary; @@ -562,7 +619,7 @@ public: size_t size = binary.size(); const uint8_t *bytes = &binary[0]; - cpProgram = clCreateProgramWithBinary(cxContext, 1, &cdDevice, + *program = clCreateProgramWithBinary(cxContext, 1, &cdDevice, &size, &bytes, &status, &ciErr); if(opencl_error(status) || opencl_error(ciErr)) { @@ -570,16 +627,16 @@ public: return false; } - if(!build_kernel(kernel_path, debug_src)) + if(!build_kernel(program, custom_kernel_build_options, debug_src)) return false; return true; } - bool save_binary(const string& clbin) + bool save_binary(cl_program *program, const string& clbin) { size_t size = 0; - clGetProgramInfo(cpProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL); + clGetProgramInfo(*program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL); if(!size) return false; @@ -587,7 +644,7 @@ public: vector<uint8_t> binary(size); uint8_t *bytes = &binary[0]; - clGetProgramInfo(cpProgram, CL_PROGRAM_BINARIES, sizeof(uint8_t*), &bytes, NULL); + clGetProgramInfo(*program, CL_PROGRAM_BINARIES, sizeof(uint8_t*), &bytes, NULL); if(!path_write_binary(clbin, binary)) { opencl_error(string_printf("OpenCL failed to write cached binary %s.", clbin.c_str())); @@ -597,20 +654,23 @@ public: return true; } - bool build_kernel(const string& /*kernel_path*/, const string *debug_src = NULL) + bool build_kernel(cl_program *kernel_program, + string custom_kernel_build_options, + const string *debug_src = NULL) { - string build_options = opencl_kernel_build_options(platform_name, debug_src); - - ciErr = clBuildProgram(cpProgram, 0, NULL, build_options.c_str(), NULL, NULL); + string build_options; + build_options = opencl_kernel_build_options(platform_name, debug_src) + custom_kernel_build_options; + + ciErr = clBuildProgram(*kernel_program, 0, NULL, build_options.c_str(), NULL, NULL); /* show warnings even if build is successful */ size_t ret_val_size = 0; - clGetProgramBuildInfo(cpProgram, cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); + clGetProgramBuildInfo(*kernel_program, cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); if(ret_val_size > 1) { - vector<char> build_log(ret_val_size+1); - clGetProgramBuildInfo(cpProgram, cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL); + vector<char> build_log(ret_val_size + 1); + clGetProgramBuildInfo(*kernel_program, cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL); build_log[ret_val_size] = '\0'; fprintf(stderr, "OpenCL kernel build output:\n"); @@ -625,12 +685,15 @@ public: return true; } - bool compile_kernel(const string& kernel_path, const string& kernel_md5, const string *debug_src = NULL) + bool compile_kernel(const string& kernel_path, + string source, + string custom_kernel_build_options, + cl_program *kernel_program, + const string *debug_src = NULL) { /* we compile kernels consisting of many files. unfortunately opencl * kernel caches do not seem to recognize changes in included files. * so we force recompile on changes by adding the md5 hash of all files */ - string source = "#include \"kernel.cl\" // " + kernel_md5 + "\n"; source = path_source_replace_includes(source, kernel_path); if(debug_src) @@ -639,7 +702,7 @@ public: size_t source_len = source.size(); const char *source_str = source.c_str(); - cpProgram = clCreateProgramWithSource(cxContext, 1, &source_str, &source_len, &ciErr); + *kernel_program = clCreateProgramWithSource(cxContext, 1, &source_str, &source_len, &ciErr); if(opencl_error(ciErr)) return false; @@ -647,7 +710,7 @@ public: double starttime = time_dt(); printf("Compiling OpenCL kernel ...\n"); - if(!build_kernel(kernel_path, debug_src)) + if(!build_kernel(kernel_program, custom_kernel_build_options, debug_src)) return false; printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime); @@ -655,7 +718,7 @@ public: return true; } - string device_md5_hash() + string device_md5_hash(string kernel_custom_build_options = "") { MD5Hash md5; char version[256], driver[256], name[256], vendor[256]; @@ -671,12 +734,13 @@ public: md5.append((uint8_t*)driver, strlen(driver)); string options = opencl_kernel_build_options(platform_name); + options += kernel_custom_build_options; md5.append((uint8_t*)options.c_str(), options.size()); return md5.get_hex(); } - bool load_kernels(bool /*experimental*/) + bool load_kernels(const DeviceRequestedFeatures& /*requested_features*/) { /* verify if device was initialized */ if(!device_initialized) { @@ -686,7 +750,7 @@ public: /* try to use cached kernel */ thread_scoped_lock cache_locker; - cpProgram = OpenCLCache::get_program(cpPlatform, cdDevice, cache_locker); + cpProgram = OpenCLCache::get_program(cpPlatform, cdDevice, OpenCLCache::OCL_DEV_BASE_PROGRAM, cache_locker); if(!cpProgram) { /* verify we have right opencl version */ @@ -712,28 +776,27 @@ public: } /* if exists already, try use it */ - if(path_exists(clbin) && load_binary(kernel_path, clbin, debug_src)) { + if(path_exists(clbin) && load_binary(kernel_path, clbin, "", &cpProgram)) { /* kernel loaded from binary */ } else { + + string init_kernel_source = "#include \"kernel.cl\" // " + kernel_md5 + "\n"; + /* if does not exist or loading binary failed, compile kernel */ - if(!compile_kernel(kernel_path, kernel_md5, debug_src)) + if(!compile_kernel(kernel_path, init_kernel_source, "", &cpProgram, debug_src)) return false; /* save binary for reuse */ - if(!save_binary(clbin)) + if(!save_binary(&cpProgram, clbin)) return false; } /* cache the program */ - OpenCLCache::store_program(cpPlatform, cdDevice, cpProgram, cache_locker); + OpenCLCache::store_program(cpPlatform, cdDevice, cpProgram, OpenCLCache::OCL_DEV_BASE_PROGRAM, cache_locker); } /* find kernels */ - ckPathTraceKernel = clCreateKernel(cpProgram, "kernel_ocl_path_trace", &ciErr); - if(opencl_error(ciErr)) - return false; - ckFilmConvertByteKernel = clCreateKernel(cpProgram, "kernel_ocl_convert_to_byte", &ciErr); if(opencl_error(ciErr)) return false; @@ -753,7 +816,7 @@ public: return true; } - ~OpenCLDevice() + ~OpenCLDeviceBase() { task_pool.stop(); @@ -766,12 +829,14 @@ public: delete mt->second; } - if(ckPathTraceKernel) - clReleaseKernel(ckPathTraceKernel); if(ckFilmConvertByteKernel) clReleaseKernel(ckFilmConvertByteKernel); if(ckFilmConvertHalfFloatKernel) clReleaseKernel(ckFilmConvertHalfFloatKernel); + if(ckShaderKernel) + clReleaseKernel(ckShaderKernel); + if(ckBakeKernel) + clReleaseKernel(ckBakeKernel); if(cpProgram) clReleaseProgram(cpProgram); if(cqCommandQueue) @@ -913,42 +978,6 @@ public: opencl_assert(clFlush(cqCommandQueue)); } - void path_trace(RenderTile& rtile, int sample) - { - /* cast arguments to cl types */ - cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); - cl_mem d_buffer = CL_MEM_PTR(rtile.buffer); - cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state); - cl_int d_x = rtile.x; - cl_int d_y = rtile.y; - cl_int d_w = rtile.w; - cl_int d_h = rtile.h; - cl_int d_sample = sample; - cl_int d_offset = rtile.offset; - cl_int d_stride = rtile.stride; - - /* sample arguments */ - cl_uint narg = 0; - - opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_data), (void*)&d_data)); - opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_buffer), (void*)&d_buffer)); - opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_rng_state), (void*)&d_rng_state)); - -#define KERNEL_TEX(type, ttype, name) \ - set_kernel_arg_mem(ckPathTraceKernel, &narg, #name); -#include "kernel_textures.h" - - opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_sample), (void*)&d_sample)); - opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_x), (void*)&d_x)); - opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_y), (void*)&d_y)); - opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_w), (void*)&d_w)); - opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_h), (void*)&d_h)); - opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_offset), (void*)&d_offset)); - opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_stride), (void*)&d_stride)); - - enqueue_kernel(ckPathTraceKernel, d_w, d_h); - } - void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name) { cl_mem ptr; @@ -985,23 +1014,23 @@ public: cl_kernel ckFilmConvertKernel = (rgba_byte)? ckFilmConvertByteKernel: ckFilmConvertHalfFloatKernel; - opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_data), (void*)&d_data)); - opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_rgba), (void*)&d_rgba)); - opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_buffer), (void*)&d_buffer)); + /* TODO : Make the kernel launch similar to Cuda */ + KERNEL_APPEND_ARG(ckFilmConvertKernel, d_data); + KERNEL_APPEND_ARG(ckFilmConvertKernel, d_rgba); + KERNEL_APPEND_ARG(ckFilmConvertKernel, d_buffer); #define KERNEL_TEX(type, ttype, name) \ set_kernel_arg_mem(ckFilmConvertKernel, &narg, #name); #include "kernel_textures.h" +#undef KERNEL_TEX - opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_sample_scale), (void*)&d_sample_scale)); - opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_x), (void*)&d_x)); - opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_y), (void*)&d_y)); - opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_w), (void*)&d_w)); - opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_h), (void*)&d_h)); - opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_offset), (void*)&d_offset)); - opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_stride), (void*)&d_stride)); - - + KERNEL_APPEND_ARG(ckFilmConvertKernel, d_sample_scale); + KERNEL_APPEND_ARG(ckFilmConvertKernel, d_x); + KERNEL_APPEND_ARG(ckFilmConvertKernel, d_y); + KERNEL_APPEND_ARG(ckFilmConvertKernel, d_w); + KERNEL_APPEND_ARG(ckFilmConvertKernel, d_h); + KERNEL_APPEND_ARG(ckFilmConvertKernel, d_offset); + KERNEL_APPEND_ARG(ckFilmConvertKernel, d_stride); enqueue_kernel(ckFilmConvertKernel, d_w, d_h); } @@ -1034,19 +1063,21 @@ public: cl_int d_sample = sample; - opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_data), (void*)&d_data)); - opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_input), (void*)&d_input)); - opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_output), (void*)&d_output)); + /* TODO : Make the kernel launch similar to Cuda */ + KERNEL_APPEND_ARG(kernel, d_data); + KERNEL_APPEND_ARG(kernel, d_input); + KERNEL_APPEND_ARG(kernel, d_output); #define KERNEL_TEX(type, ttype, name) \ set_kernel_arg_mem(kernel, &narg, #name); #include "kernel_textures.h" +#undef KERNEL_TEX - opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_eval_type), (void*)&d_shader_eval_type)); - opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_x), (void*)&d_shader_x)); - opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_w), (void*)&d_shader_w)); - opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_offset), (void*)&d_offset)); - opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_sample), (void*)&d_sample)); + KERNEL_APPEND_ARG(kernel, d_shader_eval_type); + KERNEL_APPEND_ARG(kernel, d_shader_x); + KERNEL_APPEND_ARG(kernel, d_shader_w); + KERNEL_APPEND_ARG(kernel, d_offset); + KERNEL_APPEND_ARG(kernel, d_sample); enqueue_kernel(kernel, task.shader_w, 1); @@ -1054,6 +1085,305 @@ public: } } + class OpenCLDeviceTask : public DeviceTask { + public: + OpenCLDeviceTask(OpenCLDeviceBase *device, DeviceTask& task) + : DeviceTask(task) + { + run = function_bind(&OpenCLDeviceBase::thread_run, + device, + this); + } + }; + + int get_split_task_count(DeviceTask& /*task*/) + { + return 1; + } + + void task_add(DeviceTask& task) + { + task_pool.push(new OpenCLDeviceTask(this, task)); + } + + void task_wait() + { + task_pool.wait(); + } + + void task_cancel() + { + task_pool.cancel(); + } + + virtual void thread_run(DeviceTask * /*task*/) = 0; + +protected: + class ArgumentWrapper { + public: + ArgumentWrapper() : size(0), pointer(NULL) {} + template <typename T> + ArgumentWrapper(T& argument) : size(sizeof(argument)), + pointer(&argument) { } + size_t size; + void *pointer; + }; + + /* TODO(sergey): In the future we can use variadic templates, once + * C++0x is allowed. Should allow to clean this up a bit. + */ + int kernel_set_args(cl_kernel kernel, + int start_argument_index, + const ArgumentWrapper& arg1 = ArgumentWrapper(), + const ArgumentWrapper& arg2 = ArgumentWrapper(), + const ArgumentWrapper& arg3 = ArgumentWrapper(), + const ArgumentWrapper& arg4 = ArgumentWrapper(), + const ArgumentWrapper& arg5 = ArgumentWrapper(), + const ArgumentWrapper& arg6 = ArgumentWrapper(), + const ArgumentWrapper& arg7 = ArgumentWrapper(), + const ArgumentWrapper& arg8 = ArgumentWrapper(), + const ArgumentWrapper& arg9 = ArgumentWrapper(), + const ArgumentWrapper& arg10 = ArgumentWrapper(), + const ArgumentWrapper& arg11 = ArgumentWrapper(), + const ArgumentWrapper& arg12 = ArgumentWrapper(), + const ArgumentWrapper& arg13 = ArgumentWrapper(), + const ArgumentWrapper& arg14 = ArgumentWrapper(), + const ArgumentWrapper& arg15 = ArgumentWrapper(), + const ArgumentWrapper& arg16 = ArgumentWrapper(), + const ArgumentWrapper& arg17 = ArgumentWrapper(), + const ArgumentWrapper& arg18 = ArgumentWrapper(), + const ArgumentWrapper& arg19 = ArgumentWrapper(), + const ArgumentWrapper& arg20 = ArgumentWrapper(), + const ArgumentWrapper& arg21 = ArgumentWrapper(), + const ArgumentWrapper& arg22 = ArgumentWrapper(), + const ArgumentWrapper& arg23 = ArgumentWrapper(), + const ArgumentWrapper& arg24 = ArgumentWrapper(), + const ArgumentWrapper& arg25 = ArgumentWrapper(), + const ArgumentWrapper& arg26 = ArgumentWrapper(), + const ArgumentWrapper& arg27 = ArgumentWrapper(), + const ArgumentWrapper& arg28 = ArgumentWrapper(), + const ArgumentWrapper& arg29 = ArgumentWrapper(), + const ArgumentWrapper& arg30 = ArgumentWrapper(), + const ArgumentWrapper& arg31 = ArgumentWrapper(), + const ArgumentWrapper& arg32 = ArgumentWrapper(), + const ArgumentWrapper& arg33 = ArgumentWrapper()) + { + int current_arg_index = 0; +#define FAKE_VARARG_HANDLE_ARG(arg) \ + do { \ + if(arg.pointer != NULL) { \ + opencl_assert(clSetKernelArg( \ + kernel, \ + start_argument_index + current_arg_index, \ + arg.size, arg.pointer)); \ + ++current_arg_index; \ + } \ + else { \ + return current_arg_index; \ + } \ + } while(false) + FAKE_VARARG_HANDLE_ARG(arg1); + FAKE_VARARG_HANDLE_ARG(arg2); + FAKE_VARARG_HANDLE_ARG(arg3); + FAKE_VARARG_HANDLE_ARG(arg4); + FAKE_VARARG_HANDLE_ARG(arg5); + FAKE_VARARG_HANDLE_ARG(arg6); + FAKE_VARARG_HANDLE_ARG(arg7); + FAKE_VARARG_HANDLE_ARG(arg8); + FAKE_VARARG_HANDLE_ARG(arg9); + FAKE_VARARG_HANDLE_ARG(arg10); + FAKE_VARARG_HANDLE_ARG(arg11); + FAKE_VARARG_HANDLE_ARG(arg12); + FAKE_VARARG_HANDLE_ARG(arg13); + FAKE_VARARG_HANDLE_ARG(arg14); + FAKE_VARARG_HANDLE_ARG(arg15); + FAKE_VARARG_HANDLE_ARG(arg16); + FAKE_VARARG_HANDLE_ARG(arg17); + FAKE_VARARG_HANDLE_ARG(arg18); + FAKE_VARARG_HANDLE_ARG(arg19); + FAKE_VARARG_HANDLE_ARG(arg20); + FAKE_VARARG_HANDLE_ARG(arg21); + FAKE_VARARG_HANDLE_ARG(arg22); + FAKE_VARARG_HANDLE_ARG(arg23); + FAKE_VARARG_HANDLE_ARG(arg24); + FAKE_VARARG_HANDLE_ARG(arg25); + FAKE_VARARG_HANDLE_ARG(arg26); + FAKE_VARARG_HANDLE_ARG(arg27); + FAKE_VARARG_HANDLE_ARG(arg28); + FAKE_VARARG_HANDLE_ARG(arg29); + FAKE_VARARG_HANDLE_ARG(arg30); + FAKE_VARARG_HANDLE_ARG(arg31); + FAKE_VARARG_HANDLE_ARG(arg32); + FAKE_VARARG_HANDLE_ARG(arg33); +#undef FAKE_VARARG_HANDLE_ARG + return current_arg_index; + } + + inline void release_kernel_safe(cl_kernel kernel) + { + if(kernel) { + clReleaseKernel(kernel); + } + } + + inline void release_mem_object_safe(cl_mem mem) + { + if(mem != NULL) { + clReleaseMemObject(mem); + } + } + + inline void release_program_safe(cl_program program) + { + if(program) { + clReleaseProgram(program); + } + } +}; + +class OpenCLDeviceMegaKernel : public OpenCLDeviceBase +{ +public: + cl_kernel ckPathTraceKernel; + cl_program path_trace_program; + + OpenCLDeviceMegaKernel(DeviceInfo& info, Stats &stats, bool background_) + : OpenCLDeviceBase(info, stats, background_) + { + ckPathTraceKernel = NULL; + path_trace_program = NULL; + } + + bool load_kernels(const DeviceRequestedFeatures& requested_features) + { + /* Get Shader, bake and film convert kernels. + * It'll also do verification of OpenCL actually initialized. + */ + if(!OpenCLDeviceBase::load_kernels(requested_features)) { + return false; + } + + /* Try to use cached kernel. */ + thread_scoped_lock cache_locker; + path_trace_program = OpenCLCache::get_program(cpPlatform, + cdDevice, + OpenCLCache::OCL_DEV_MEGAKERNEL_PROGRAM, + cache_locker); + + if(!path_trace_program) { + /* Verify we have right opencl version. */ + if(!opencl_version_check()) + return false; + + /* Calculate md5 hash to detect changes. */ + string kernel_path = path_get("kernel"); + string kernel_md5 = path_files_md5_hash(kernel_path); + string custom_kernel_build_options = "-D__COMPILE_ONLY_MEGAKERNEL__ "; + string device_md5 = device_md5_hash(custom_kernel_build_options); + + /* Path to cached binary. */ + string clbin = string_printf("cycles_kernel_%s_%s.clbin", + device_md5.c_str(), + kernel_md5.c_str()); + clbin = path_user_get(path_join("cache", clbin)); + + /* Path to preprocessed source for debugging. */ + string clsrc, *debug_src = NULL; + if(opencl_kernel_use_debug()) { + clsrc = string_printf("cycles_kernel_%s_%s.cl", + device_md5.c_str(), + kernel_md5.c_str()); + clsrc = path_user_get(path_join("cache", clsrc)); + debug_src = &clsrc; + } + + /* If exists already, try use it. */ + if(path_exists(clbin) && load_binary(kernel_path, + clbin, + custom_kernel_build_options, + &path_trace_program, + debug_src)) { + /* Kernel loaded from binary, nothing to do. */ + } + else { + string init_kernel_source = "#include \"kernel.cl\" // " + + kernel_md5 + "\n"; + /* If does not exist or loading binary failed, compile kernel. */ + if(!compile_kernel(kernel_path, + init_kernel_source, + custom_kernel_build_options, + &path_trace_program, + debug_src)) + { + return false; + } + /* Save binary for reuse. */ + if(!save_binary(&path_trace_program, clbin)) { + return false; + } + } + /* Cache the program. */ + OpenCLCache::store_program(cpPlatform, + cdDevice, + path_trace_program, + OpenCLCache::OCL_DEV_MEGAKERNEL_PROGRAM, + cache_locker); + } + + /* Find kernels. */ + ckPathTraceKernel = clCreateKernel(path_trace_program, + "kernel_ocl_path_trace", + &ciErr); + if(opencl_error(ciErr)) + return false; + return true; + } + + ~OpenCLDeviceMegaKernel() + { + task_pool.stop(); + release_kernel_safe(ckPathTraceKernel); + release_program_safe(path_trace_program); + } + + void path_trace(RenderTile& rtile, int sample) + { + /* Cast arguments to cl types. */ + cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); + cl_mem d_buffer = CL_MEM_PTR(rtile.buffer); + cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state); + cl_int d_x = rtile.x; + cl_int d_y = rtile.y; + cl_int d_w = rtile.w; + cl_int d_h = rtile.h; + cl_int d_offset = rtile.offset; + cl_int d_stride = rtile.stride; + + /* Sample arguments. */ + cl_int d_sample = sample; + cl_uint narg = 0; + + /* TODO : Make the kernel launch similar to Cuda. */ + KERNEL_APPEND_ARG(ckPathTraceKernel, d_data); + KERNEL_APPEND_ARG(ckPathTraceKernel, d_buffer); + KERNEL_APPEND_ARG(ckPathTraceKernel, d_rng_state); + +#define KERNEL_TEX(type, ttype, name) \ + set_kernel_arg_mem(ckPathTraceKernel, &narg, #name); +#include "kernel_textures.h" +#undef KERNEL_TEX + + KERNEL_APPEND_ARG(ckPathTraceKernel, d_sample); + KERNEL_APPEND_ARG(ckPathTraceKernel, d_x); + KERNEL_APPEND_ARG(ckPathTraceKernel, d_y); + KERNEL_APPEND_ARG(ckPathTraceKernel, d_w); + KERNEL_APPEND_ARG(ckPathTraceKernel, d_h); + KERNEL_APPEND_ARG(ckPathTraceKernel, d_offset); + KERNEL_APPEND_ARG(ckPathTraceKernel, d_stride); + + enqueue_kernel(ckPathTraceKernel, d_w, d_h); + } + void thread_run(DeviceTask *task) { if(task->type == DeviceTask::FILM_CONVERT) { @@ -1064,8 +1394,7 @@ public: } else if(task->type == DeviceTask::PATH_TRACE) { RenderTile tile; - - /* keep rendering tiles until done */ + /* Keep rendering tiles until done. */ while(task->acquire_tile(this, tile)) { int start_sample = tile.start_sample; int end_sample = tile.start_sample + tile.num_samples; @@ -1083,47 +1412,1908 @@ public: task->update_progress(&tile); } + /* Complete kernel execution before release tile */ + /* This helps in multi-device render; + * The device that reaches the critical-section function + * release_tile waits (stalling other devices from entering + * release_tile) for all kernels to complete. If device1 (a + * slow-render device) reaches release_tile first then it would + * stall device2 (a fast-render device) from proceeding to render + * next tile. + */ + clFinish(cqCommandQueue); + task->release_tile(tile); } } } +}; - class OpenCLDeviceTask : public DeviceTask { - public: - OpenCLDeviceTask(OpenCLDevice *device, DeviceTask& task) - : DeviceTask(task) +/* TODO(sergey): This is to keep tile split on OpenCL level working + * for now, since withotu this viewport render does not work as it + * should. + * + * Ideally it'll be done on the higher level, but we need to get ready + * for merge rather soon, so let's keep split logic private here in + * the file. + */ +class SplitRenderTile : public RenderTile { +public: + SplitRenderTile() + : RenderTile(), + buffer_offset_x(0), + buffer_offset_y(0), + rng_state_offset_x(0), + rng_state_offset_y(0), + buffer_rng_state_stride(0) {} + + explicit SplitRenderTile(RenderTile& tile) + : RenderTile(), + buffer_offset_x(0), + buffer_offset_y(0), + rng_state_offset_x(0), + rng_state_offset_y(0), + buffer_rng_state_stride(0) + { + x = tile.x; + y = tile.y; + w = tile.w; + h = tile.h; + start_sample = tile.start_sample; + num_samples = tile.num_samples; + sample = tile.sample; + resolution = tile.resolution; + offset = tile.offset; + stride = tile.stride; + buffer = tile.buffer; + rng_state = tile.rng_state; + buffers = tile.buffers; + } + + /* Split kernel is device global memory constained; + * hence split kernel cant render big tile size's in + * one go. If the user sets a big tile size (big tile size + * is a term relative to the available device global memory), + * we split the tile further and then call path_trace on + * each of those split tiles. The following variables declared, + * assist in achieving that purpose + */ + int buffer_offset_x; + int buffer_offset_y; + int rng_state_offset_x; + int rng_state_offset_y; + int buffer_rng_state_stride; +}; + +/* OpenCLDeviceSplitKernel's declaration/definition. */ +class OpenCLDeviceSplitKernel : public OpenCLDeviceBase +{ +public: + /* Kernel declaration. */ + cl_kernel ckPathTraceKernel_data_init; + cl_kernel ckPathTraceKernel_scene_intersect; + cl_kernel ckPathTraceKernel_lamp_emission; + cl_kernel ckPathTraceKernel_queue_enqueue; + cl_kernel ckPathTraceKernel_background_buffer_update; + cl_kernel ckPathTraceKernel_shader_lighting; + cl_kernel ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao; + cl_kernel ckPathTraceKernel_direct_lighting; + cl_kernel ckPathTraceKernel_shadow_blocked_direct_lighting; + cl_kernel ckPathTraceKernel_setup_next_iteration; + cl_kernel ckPathTraceKernel_sum_all_radiance; + + /* cl_program declaration. */ + cl_program data_init_program; + cl_program scene_intersect_program; + cl_program lamp_emission_program; + cl_program queue_enqueue_program; + cl_program background_buffer_update_program; + cl_program shader_eval_program; + cl_program holdout_emission_blurring_termination_ao_program; + cl_program direct_lighting_program; + cl_program shadow_blocked_program; + cl_program next_iteration_setup_program; + cl_program sum_all_radiance_program; + + /* Global memory variables [porting]; These memory is used for + * co-operation between different kernels; Data written by one + * kernel will be avaible to another kernel via this global + * memory. + */ + cl_mem rng_coop; + cl_mem throughput_coop; + cl_mem L_transparent_coop; + cl_mem PathRadiance_coop; + cl_mem Ray_coop; + cl_mem PathState_coop; + cl_mem Intersection_coop; + cl_mem kgbuffer; /* KernelGlobals buffer. */ + + /* Global buffers for ShaderData. */ + cl_mem sd; /* ShaderData used in the main path-iteration loop. */ + cl_mem sd_DL_shadow; /* ShaderData used in Direct Lighting and + * shadow_blocked kernel. + */ + + /* Global buffers of each member of ShaderData. */ + cl_mem P_sd; + cl_mem P_sd_DL_shadow; + cl_mem N_sd; + cl_mem N_sd_DL_shadow; + cl_mem Ng_sd; + cl_mem Ng_sd_DL_shadow; + cl_mem I_sd; + cl_mem I_sd_DL_shadow; + cl_mem shader_sd; + cl_mem shader_sd_DL_shadow; + cl_mem flag_sd; + cl_mem flag_sd_DL_shadow; + cl_mem prim_sd; + cl_mem prim_sd_DL_shadow; + cl_mem type_sd; + cl_mem type_sd_DL_shadow; + cl_mem u_sd; + cl_mem u_sd_DL_shadow; + cl_mem v_sd; + cl_mem v_sd_DL_shadow; + cl_mem object_sd; + cl_mem object_sd_DL_shadow; + cl_mem time_sd; + cl_mem time_sd_DL_shadow; + cl_mem ray_length_sd; + cl_mem ray_length_sd_DL_shadow; + cl_mem ray_depth_sd; + cl_mem ray_depth_sd_DL_shadow; + cl_mem transparent_depth_sd; + cl_mem transparent_depth_sd_DL_shadow; +#ifdef __RAY_DIFFERENTIALS__ + cl_mem dP_sd, dI_sd; + cl_mem dP_sd_DL_shadow, dI_sd_DL_shadow; + cl_mem du_sd, dv_sd; + cl_mem du_sd_DL_shadow, dv_sd_DL_shadow; +#endif +#ifdef __DPDU__ + cl_mem dPdu_sd, dPdv_sd; + cl_mem dPdu_sd_DL_shadow, dPdv_sd_DL_shadow; +#endif + cl_mem closure_sd; + cl_mem closure_sd_DL_shadow; + cl_mem num_closure_sd; + cl_mem num_closure_sd_DL_shadow; + cl_mem randb_closure_sd; + cl_mem randb_closure_sd_DL_shadow; + cl_mem ray_P_sd; + cl_mem ray_P_sd_DL_shadow; + cl_mem ray_dP_sd; + cl_mem ray_dP_sd_DL_shadow; + + /* Global memory required for shadow blocked and accum_radiance. */ + cl_mem BSDFEval_coop; + cl_mem ISLamp_coop; + cl_mem LightRay_coop; + cl_mem AOAlpha_coop; + cl_mem AOBSDF_coop; + cl_mem AOLightRay_coop; + cl_mem Intersection_coop_AO; + cl_mem Intersection_coop_DL; + +#ifdef WITH_CYCLES_DEBUG + /* DebugData memory */ + cl_mem debugdata_coop; +#endif + + /* Global state array that tracks ray state. */ + cl_mem ray_state; + + /* Per sample buffers. */ + cl_mem per_sample_output_buffers; + + /* Denotes which sample each ray is being processed for. */ + cl_mem work_array; + + /* Queue */ + cl_mem Queue_data; /* Array of size queuesize * num_queues * sizeof(int). */ + cl_mem Queue_index; /* Array of size num_queues * sizeof(int); + * Tracks the size of each queue. + */ + + /* Flag to make sceneintersect and lampemission kernel use queues. */ + cl_mem use_queues_flag; + + /* Required-memory size. */ + size_t throughput_size; + size_t L_transparent_size; + size_t rayState_size; + size_t hostRayState_size; + size_t work_element_size; + size_t ISLamp_size; + + /* Sizes of memory required for shadow blocked function. */ + size_t AOAlpha_size; + size_t AOBSDF_size; + + /* Amount of memory in output buffer associated with one pixel/thread. */ + size_t per_thread_output_buffer_size; + + /* Total allocatable available device memory. */ + size_t total_allocatable_memory; + + /* host version of ray_state; Used in checking host path-iteration + * termination. + */ + char *hostRayStateArray; + + /* Number of path-iterations to be done in one shot. */ + unsigned int PathIteration_times; + +#ifdef __WORK_STEALING__ + /* Work pool with respect to each work group. */ + cl_mem work_pool_wgs; + + /* Denotes the maximum work groups possible w.r.t. current tile size. */ + unsigned int max_work_groups; +#endif + + /* clos_max value for which the kernels have been loaded currently. */ + int current_clos_max; + + /* Marked True in constructor and marked false at the end of path_trace(). */ + bool first_tile; + + OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_) + : OpenCLDeviceBase(info, stats, background_) + { + + info.use_split_kernel = true; + background = background_; + + /* Initialize kernels. */ + ckPathTraceKernel_data_init = NULL; + ckPathTraceKernel_scene_intersect = NULL; + ckPathTraceKernel_lamp_emission = NULL; + ckPathTraceKernel_background_buffer_update = NULL; + ckPathTraceKernel_shader_lighting = NULL; + ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao = NULL; + ckPathTraceKernel_direct_lighting = NULL; + ckPathTraceKernel_shadow_blocked_direct_lighting = NULL; + ckPathTraceKernel_setup_next_iteration = NULL; + ckPathTraceKernel_sum_all_radiance = NULL; + ckPathTraceKernel_queue_enqueue = NULL; + + /* Initialize program. */ + data_init_program = NULL; + scene_intersect_program = NULL; + lamp_emission_program = NULL; + queue_enqueue_program = NULL; + background_buffer_update_program = NULL; + shader_eval_program = NULL; + holdout_emission_blurring_termination_ao_program = NULL; + direct_lighting_program = NULL; + shadow_blocked_program = NULL; + next_iteration_setup_program = NULL; + sum_all_radiance_program = NULL; + + /* Initialize cl_mem variables. */ + kgbuffer = NULL; + sd = NULL; + sd_DL_shadow = NULL; + + P_sd = NULL; + P_sd_DL_shadow = NULL; + N_sd = NULL; + N_sd_DL_shadow = NULL; + Ng_sd = NULL; + Ng_sd_DL_shadow = NULL; + I_sd = NULL; + I_sd_DL_shadow = NULL; + shader_sd = NULL; + shader_sd_DL_shadow = NULL; + flag_sd = NULL; + flag_sd_DL_shadow = NULL; + prim_sd = NULL; + prim_sd_DL_shadow = NULL; + type_sd = NULL; + type_sd_DL_shadow = NULL; + u_sd = NULL; + u_sd_DL_shadow = NULL; + v_sd = NULL; + v_sd_DL_shadow = NULL; + object_sd = NULL; + object_sd_DL_shadow = NULL; + time_sd = NULL; + time_sd_DL_shadow = NULL; + ray_length_sd = NULL; + ray_length_sd_DL_shadow = NULL; + ray_depth_sd = NULL; + ray_depth_sd_DL_shadow = NULL; + transparent_depth_sd = NULL; + transparent_depth_sd_DL_shadow = NULL; +#ifdef __RAY_DIFFERENTIALS__ + dP_sd = NULL; + dI_sd = NULL; + dP_sd_DL_shadow = NULL; + dI_sd_DL_shadow = NULL; + du_sd = NULL; + dv_sd = NULL; + du_sd_DL_shadow = NULL; + dv_sd_DL_shadow = NULL; +#endif +#ifdef __DPDU__ + dPdu_sd = NULL; + dPdv_sd = NULL; + dPdu_sd_DL_shadow = NULL; + dPdv_sd_DL_shadow = NULL; +#endif + closure_sd = NULL; + closure_sd_DL_shadow = NULL; + num_closure_sd = NULL; + num_closure_sd_DL_shadow = NULL; + randb_closure_sd = NULL; + randb_closure_sd_DL_shadow = NULL; + ray_P_sd = NULL; + ray_P_sd_DL_shadow = NULL; + ray_dP_sd = NULL; + ray_dP_sd_DL_shadow = NULL; + + rng_coop = NULL; + throughput_coop = NULL; + L_transparent_coop = NULL; + PathRadiance_coop = NULL; + Ray_coop = NULL; + PathState_coop = NULL; + Intersection_coop = NULL; + ray_state = NULL; + + AOAlpha_coop = NULL; + AOBSDF_coop = NULL; + AOLightRay_coop = NULL; + BSDFEval_coop = NULL; + ISLamp_coop = NULL; + LightRay_coop = NULL; + Intersection_coop_AO = NULL; + Intersection_coop_DL = NULL; + +#ifdef WITH_CYCLES_DEBUG + debugdata_coop = NULL; +#endif + + work_array = NULL; + + /* Queue. */ + Queue_data = NULL; + Queue_index = NULL; + use_queues_flag = NULL; + + per_sample_output_buffers = NULL; + + /* Initialize required memory size. */ + throughput_size = sizeof(float3); + L_transparent_size = sizeof(float); + rayState_size = sizeof(char); + hostRayState_size = sizeof(char); + work_element_size = sizeof(unsigned int); + ISLamp_size = sizeof(int); + + /* Initialize sizes of memory required for shadow blocked function. */ + AOAlpha_size = sizeof(float3); + AOBSDF_size = sizeof(float3); + + per_thread_output_buffer_size = 0; + hostRayStateArray = NULL; + PathIteration_times = PATH_ITER_INC_FACTOR; +#ifdef __WORK_STEALING__ + work_pool_wgs = NULL; + max_work_groups = 0; +#endif + current_clos_max = -1; + first_tile = true; + + /* Get device's maximum memory that can be allocated. */ + ciErr = clGetDeviceInfo(cdDevice, + CL_DEVICE_MAX_MEM_ALLOC_SIZE, + sizeof(size_t), + &total_allocatable_memory, + NULL); + assert(ciErr == CL_SUCCESS); + if(platform_name == "AMD Accelerated Parallel Processing") { + /* This value is tweak-able; AMD platform does not seem to + * give maximum performance when all of CL_DEVICE_MAX_MEM_ALLOC_SIZE + * is considered for further computation. + */ + total_allocatable_memory /= 2; + } + } + + /* TODO(sergey): Seems really close to load_kernel(), + * could it be de-duplicated? + */ + bool load_split_kernel(string kernel_path, + string kernel_init_source, + string clbin, + string custom_kernel_build_options, + cl_program *program) + { + if(!opencl_version_check()) + return false; + + clbin = path_user_get(path_join("cache", clbin)); + + /* Path to preprocessed source for debugging. */ + string *debug_src = NULL; + + /* If exists already, try use it. */ + if(path_exists(clbin) && load_binary(kernel_path, + clbin, + custom_kernel_build_options, + program, + debug_src)) { + /* Kernel loaded from binary. */ + } + else { + /* If does not exist or loading binary failed, compile kernel. */ + if(!compile_kernel(kernel_path, + kernel_init_source, + custom_kernel_build_options, + program)) + { + return false; + } + /* Save binary for reuse. */ + if(!save_binary(program, clbin)) { + return false; + } + } + return true; + } + + /* Split kernel utility functions. */ + size_t get_tex_size(const char *tex_name) + { + cl_mem ptr; + size_t ret_size = 0; + MemMap::iterator i = mem_map.find(tex_name); + if(i != mem_map.end()) { + ptr = CL_MEM_PTR(i->second); + ciErr = clGetMemObjectInfo(ptr, + CL_MEM_SIZE, + sizeof(ret_size), + &ret_size, + NULL); + assert(ciErr == CL_SUCCESS); + } + return ret_size; + } + + size_t get_shader_closure_size(int max_closure) + { + return (sizeof(ShaderClosure)* max_closure); + } + + size_t get_shader_data_size(size_t shader_closure_size) + { + /* ShaderData size without accounting for ShaderClosure array. */ + size_t shader_data_size = + sizeof(ShaderData) - (sizeof(ShaderClosure) * MAX_CLOSURE); + return (shader_data_size + shader_closure_size); + } + + /* Returns size of KernelGlobals structure associated with OpenCL. */ + size_t get_KernelGlobals_size() + { + /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to + * fetch its size. + */ + typedef struct KernelGlobals { + ccl_constant KernelData *data; +#define KERNEL_TEX(type, ttype, name) \ + ccl_global type *name; +#include "kernel_textures.h" +#undef KERNEL_TEX + } KernelGlobals; + + return sizeof(KernelGlobals); + } + + /* Returns size of Structure of arrays implementation of. */ + size_t get_shaderdata_soa_size() + { + size_t shader_soa_size = 0; + +#define SD_VAR(type, what) \ + shader_soa_size += sizeof(void *); +#define SD_CLOSURE_VAR(type, what, max_closure) + shader_soa_size += sizeof(void *); + #include "kernel_shaderdata_vars.h" +#undef SD_VAR +#undef SD_CLOSURE_VAR + + return shader_soa_size; + } + + bool load_kernels(const DeviceRequestedFeatures& requested_features) + { + /* If it is an interactive render; we ceil clos_max value to a multiple + * of 5 in order to limit re-compilations. + */ + /* TODO(sergey): Decision about this should be done on higher levels. */ + int max_closure = requested_features.max_closure; + if(!background) { + assert((max_closure != 0) && "clos_max value is 0" ); + max_closure = (((max_closure - 1) / 5) + 1) * 5; + /* clos_max value shouldn't be greater than MAX_CLOSURE. */ + max_closure = (max_closure > MAX_CLOSURE) ? MAX_CLOSURE : max_closure; + if(current_clos_max == max_closure) { + /* Present kernels have been created with the same closure count + * build option. + */ + return true; + } + } + /* Get Shader, bake and film_convert kernels. + * It'll also do verification of OpenCL actually initialized. + */ + if(!OpenCLDeviceBase::load_kernels(requested_features)) { + return false; + } + + string svm_build_options = ""; + string max_closure_build_option = ""; + string compute_device_type_build_option = ""; + + /* Set svm_build_options. */ + svm_build_options += " -D__NODES_MAX_GROUP__=" + + string_printf("%d", requested_features.max_nodes_group); + svm_build_options += " -D__NODES_FEATURES__=" + + string_printf("%d", requested_features.nodes_features); + + /* Set max closure build option. */ + max_closure_build_option += string_printf("-D__MAX_CLOSURE__=%d ", + max_closure); + + /* Set compute device build option. */ + cl_device_type device_type; + ciErr = clGetDeviceInfo(cdDevice, + CL_DEVICE_TYPE, + sizeof(cl_device_type), + &device_type, + NULL); + assert(ciErr == CL_SUCCESS); + if(device_type == CL_DEVICE_TYPE_GPU) { + compute_device_type_build_option = "-D__COMPUTE_DEVICE_GPU__ "; + } + + string kernel_path = path_get("kernel"); + string kernel_md5 = path_files_md5_hash(kernel_path); + string device_md5; + string custom_kernel_build_options; + string kernel_init_source; + string clbin; + + string common_custom_build_options = ""; + common_custom_build_options += "-D__SPLIT_KERNEL__ "; + common_custom_build_options += max_closure_build_option;; +#ifdef __WORK_STEALING__ + common_custom_build_options += "-D__WORK_STEALING__ "; +#endif + +#define LOAD_KERNEL(program, name) \ + do { \ + kernel_init_source = "#include \"kernel_" name ".cl\" // " + \ + kernel_md5 + "\n"; \ + custom_kernel_build_options = common_custom_build_options; \ + device_md5 = device_md5_hash(custom_kernel_build_options); \ + clbin = string_printf("cycles_kernel_%s_%s_" name ".clbin", \ + device_md5.c_str(), kernel_md5.c_str()); \ + if(!load_split_kernel(kernel_path, kernel_init_source, clbin, \ + custom_kernel_build_options, &program)) \ + { \ + return false; \ + } \ + } while(false) + + /* TODO(sergey): If names are unified we can save some more bits of + * code here. + */ + LOAD_KERNEL(data_init_program, "data_init"); + LOAD_KERNEL(scene_intersect_program, "scene_intersect"); + LOAD_KERNEL(lamp_emission_program, "lamp_emission"); + LOAD_KERNEL(queue_enqueue_program, "queue_enqueue"); + LOAD_KERNEL(background_buffer_update_program, "background_buffer_update"); + LOAD_KERNEL(shader_eval_program, "shader_eval"); + LOAD_KERNEL(holdout_emission_blurring_termination_ao_program, + "holdout_emission_blurring_pathtermination_ao"); + LOAD_KERNEL(direct_lighting_program, "direct_lighting"); + LOAD_KERNEL(shadow_blocked_program, "shadow_blocked"); + LOAD_KERNEL(next_iteration_setup_program, "next_iteration_setup"); + LOAD_KERNEL(sum_all_radiance_program, "sum_all_radiance"); + +#undef LOAD_KERNEL + +#define GLUE(a, b) a ## b +#define FIND_KERNEL(kernel, program, function) \ + do { \ + GLUE(ckPathTraceKernel_, kernel) = \ + clCreateKernel(GLUE(program, _program), \ + "kernel_ocl_path_trace_" function, &ciErr); \ + if(opencl_error(ciErr)) { \ + return false; \ + } \ + } while(false) + + FIND_KERNEL(data_init, data_init, "data_initialization"); + FIND_KERNEL(scene_intersect, scene_intersect, "scene_intersect"); + FIND_KERNEL(lamp_emission, lamp_emission, "lamp_emission"); + FIND_KERNEL(queue_enqueue, queue_enqueue, "queue_enqueue"); + FIND_KERNEL(background_buffer_update, background_buffer_update, "background_buffer_update"); + FIND_KERNEL(shader_lighting, shader_eval, "shader_evaluation"); + FIND_KERNEL(holdout_emission_blurring_pathtermination_ao, + holdout_emission_blurring_termination_ao, + "holdout_emission_blurring_pathtermination_ao"); + FIND_KERNEL(direct_lighting, direct_lighting, "direct_lighting"); + FIND_KERNEL(shadow_blocked_direct_lighting, shadow_blocked, "shadow_blocked_direct_lighting"); + FIND_KERNEL(setup_next_iteration, next_iteration_setup, "setup_next_iteration"); + FIND_KERNEL(sum_all_radiance, sum_all_radiance, "sum_all_radiance"); +#undef FIND_KERNEL +#undef GLUE + + current_clos_max = max_closure; + + return true; + } + + ~OpenCLDeviceSplitKernel() + { + task_pool.stop(); + + /* Release kernels */ + release_kernel_safe(ckPathTraceKernel_data_init); + release_kernel_safe(ckPathTraceKernel_scene_intersect); + release_kernel_safe(ckPathTraceKernel_lamp_emission); + release_kernel_safe(ckPathTraceKernel_queue_enqueue); + release_kernel_safe(ckPathTraceKernel_background_buffer_update); + release_kernel_safe(ckPathTraceKernel_shader_lighting); + release_kernel_safe(ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao); + release_kernel_safe(ckPathTraceKernel_direct_lighting); + release_kernel_safe(ckPathTraceKernel_shadow_blocked_direct_lighting); + release_kernel_safe(ckPathTraceKernel_setup_next_iteration); + release_kernel_safe(ckPathTraceKernel_sum_all_radiance); + + /* Release global memory */ + release_mem_object_safe(P_sd); + release_mem_object_safe(P_sd_DL_shadow); + release_mem_object_safe(N_sd); + release_mem_object_safe(N_sd_DL_shadow); + release_mem_object_safe(Ng_sd); + release_mem_object_safe(Ng_sd_DL_shadow); + release_mem_object_safe(I_sd); + release_mem_object_safe(I_sd_DL_shadow); + release_mem_object_safe(shader_sd); + release_mem_object_safe(shader_sd_DL_shadow); + release_mem_object_safe(flag_sd); + release_mem_object_safe(flag_sd_DL_shadow); + release_mem_object_safe(prim_sd); + release_mem_object_safe(prim_sd_DL_shadow); + release_mem_object_safe(type_sd); + release_mem_object_safe(type_sd_DL_shadow); + release_mem_object_safe(u_sd); + release_mem_object_safe(u_sd_DL_shadow); + release_mem_object_safe(v_sd); + release_mem_object_safe(v_sd_DL_shadow); + release_mem_object_safe(object_sd); + release_mem_object_safe(object_sd_DL_shadow); + release_mem_object_safe(time_sd); + release_mem_object_safe(time_sd_DL_shadow); + release_mem_object_safe(ray_length_sd); + release_mem_object_safe(ray_length_sd_DL_shadow); + release_mem_object_safe(ray_depth_sd); + release_mem_object_safe(ray_depth_sd_DL_shadow); + release_mem_object_safe(transparent_depth_sd); + release_mem_object_safe(transparent_depth_sd_DL_shadow); +#ifdef __RAY_DIFFERENTIALS__ + release_mem_object_safe(dP_sd); + release_mem_object_safe(dP_sd_DL_shadow); + release_mem_object_safe(dI_sd); + release_mem_object_safe(dI_sd_DL_shadow); + release_mem_object_safe(du_sd); + release_mem_object_safe(du_sd_DL_shadow); + release_mem_object_safe(dv_sd); + release_mem_object_safe(dv_sd_DL_shadow); +#endif +#ifdef __DPDU__ + release_mem_object_safe(dPdu_sd); + release_mem_object_safe(dPdu_sd_DL_shadow); + release_mem_object_safe(dPdv_sd); + release_mem_object_safe(dPdv_sd_DL_shadow); +#endif + release_mem_object_safe(closure_sd); + release_mem_object_safe(closure_sd_DL_shadow); + release_mem_object_safe(num_closure_sd); + release_mem_object_safe(num_closure_sd_DL_shadow); + release_mem_object_safe(randb_closure_sd); + release_mem_object_safe(randb_closure_sd_DL_shadow); + release_mem_object_safe(ray_P_sd); + release_mem_object_safe(ray_P_sd_DL_shadow); + release_mem_object_safe(ray_dP_sd); + release_mem_object_safe(ray_dP_sd_DL_shadow); + release_mem_object_safe(rng_coop); + release_mem_object_safe(throughput_coop); + release_mem_object_safe(L_transparent_coop); + release_mem_object_safe(PathRadiance_coop); + release_mem_object_safe(Ray_coop); + release_mem_object_safe(PathState_coop); + release_mem_object_safe(Intersection_coop); + release_mem_object_safe(kgbuffer); + release_mem_object_safe(sd); + release_mem_object_safe(sd_DL_shadow); + release_mem_object_safe(ray_state); + release_mem_object_safe(AOAlpha_coop); + release_mem_object_safe(AOBSDF_coop); + release_mem_object_safe(AOLightRay_coop); + release_mem_object_safe(BSDFEval_coop); + release_mem_object_safe(ISLamp_coop); + release_mem_object_safe(LightRay_coop); + release_mem_object_safe(Intersection_coop_AO); + release_mem_object_safe(Intersection_coop_DL); +#ifdef WITH_CYCLES_DEBUG + release_mem_object_safe(debugdata_coop); +#endif + release_mem_object_safe(use_queues_flag); + release_mem_object_safe(Queue_data); + release_mem_object_safe(Queue_index); + release_mem_object_safe(work_array); +#ifdef __WORK_STEALING__ + release_mem_object_safe(work_pool_wgs); +#endif + release_mem_object_safe(per_sample_output_buffers); + + /* Release programs */ + release_program_safe(data_init_program); + release_program_safe(scene_intersect_program); + release_program_safe(lamp_emission_program); + release_program_safe(queue_enqueue_program); + release_program_safe(background_buffer_update_program); + release_program_safe(shader_eval_program); + release_program_safe(holdout_emission_blurring_termination_ao_program); + release_program_safe(direct_lighting_program); + release_program_safe(shadow_blocked_program); + release_program_safe(next_iteration_setup_program); + release_program_safe(sum_all_radiance_program); + + if(hostRayStateArray != NULL) { + free(hostRayStateArray); + } + } + + void path_trace(SplitRenderTile& rtile, int2 max_render_feasible_tile_size) + { + /* cast arguments to cl types */ + cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); + cl_mem d_buffer = CL_MEM_PTR(rtile.buffer); + cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state); + cl_int d_x = rtile.x; + cl_int d_y = rtile.y; + cl_int d_w = rtile.w; + cl_int d_h = rtile.h; + cl_int d_offset = rtile.offset; + cl_int d_stride = rtile.stride; + + /* Make sure that set render feasible tile size is a multiple of local + * work size dimensions. + */ + assert(max_render_feasible_tile_size.x % SPLIT_KERNEL_LOCAL_SIZE_X == 0); + assert(max_render_feasible_tile_size.y % SPLIT_KERNEL_LOCAL_SIZE_Y == 0); + + /* ray_state and hostRayStateArray should be of same size. */ + assert(hostRayState_size == rayState_size); + assert(rayState_size == 1); + + size_t global_size[2]; + size_t local_size[2] = {SPLIT_KERNEL_LOCAL_SIZE_X, + SPLIT_KERNEL_LOCAL_SIZE_Y}; + + /* Set the range of samples to be processed for every ray in + * path-regeneration logic. + */ + cl_int start_sample = rtile.start_sample; + cl_int end_sample = rtile.start_sample + rtile.num_samples; + cl_int num_samples = rtile.num_samples; + +#ifdef __WORK_STEALING__ + global_size[0] = (((d_w - 1) / local_size[0]) + 1) * local_size[0]; + global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1]; + unsigned int num_parallel_samples = 1; +#else + global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1]; + unsigned int num_threads = max_render_feasible_tile_size.x * + max_render_feasible_tile_size.y; + unsigned int num_tile_columns_possible = num_threads / global_size[1]; + /* Estimate number of parallel samples that can be + * processed in parallel. + */ + unsigned int num_parallel_samples = min(num_tile_columns_possible / d_w, + rtile.num_samples); + /* Wavefront size in AMD is 64. + * TODO(sergey): What about other platforms? + */ + if(num_parallel_samples >= 64) { + /* TODO(sergey): Could use generic round-up here. */ + num_parallel_samples = (num_parallel_samples / 64) * 64 + } + assert(num_parallel_samples != 0); + + global_size[0] = d_w * num_parallel_samples; +#endif /* __WORK_STEALING__ */ + + assert(global_size[0] * global_size[1] <= + max_render_feasible_tile_size.x * max_render_feasible_tile_size.y); + + /* Allocate all required global memory once. */ + if(first_tile) { + size_t num_global_elements = max_render_feasible_tile_size.x * + max_render_feasible_tile_size.y; + /* TODO(sergey): This will actually over-allocate if + * particular kernel does not support multiclosure. + */ + size_t ShaderClosure_size = get_shader_closure_size(current_clos_max); + +#ifdef __WORK_STEALING__ + /* Calculate max groups */ + size_t max_global_size[2]; + size_t tile_x = max_render_feasible_tile_size.x; + size_t tile_y = max_render_feasible_tile_size.y; + max_global_size[0] = (((tile_x - 1) / local_size[0]) + 1) * local_size[0]; + max_global_size[1] = (((tile_y - 1) / local_size[1]) + 1) * local_size[1]; + max_work_groups = (max_global_size[0] * max_global_size[1]) / + (local_size[0] * local_size[1]); + /* Allocate work_pool_wgs memory. */ + work_pool_wgs = mem_alloc(max_work_groups * sizeof(unsigned int)); +#endif /* __WORK_STEALING__ */ + + /* Allocate queue_index memory only once. */ + Queue_index = mem_alloc(NUM_QUEUES * sizeof(int)); + use_queues_flag = mem_alloc(sizeof(char)); + kgbuffer = mem_alloc(get_KernelGlobals_size()); + + /* Create global buffers for ShaderData. */ + sd = mem_alloc(get_shaderdata_soa_size()); + sd_DL_shadow = mem_alloc(get_shaderdata_soa_size()); + P_sd = mem_alloc(num_global_elements * sizeof(float3)); + P_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3)); + N_sd = mem_alloc(num_global_elements * sizeof(float3)); + N_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3)); + Ng_sd = mem_alloc(num_global_elements * sizeof(float3)); + Ng_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3)); + I_sd = mem_alloc(num_global_elements * sizeof(float3)); + I_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3)); + shader_sd = mem_alloc(num_global_elements * sizeof(int)); + shader_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int)); + flag_sd = mem_alloc(num_global_elements * sizeof(int)); + flag_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int)); + prim_sd = mem_alloc(num_global_elements * sizeof(int)); + prim_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int)); + type_sd = mem_alloc(num_global_elements * sizeof(int)); + type_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int)); + u_sd = mem_alloc(num_global_elements * sizeof(float)); + u_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float)); + v_sd = mem_alloc(num_global_elements * sizeof(float)); + v_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float)); + object_sd = mem_alloc(num_global_elements * sizeof(int)); + object_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int)); + time_sd = mem_alloc(num_global_elements * sizeof(float)); + time_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float)); + ray_length_sd = mem_alloc(num_global_elements * sizeof(float)); + ray_length_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float)); + ray_depth_sd = mem_alloc(num_global_elements * sizeof(int)); + ray_depth_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int)); + transparent_depth_sd = mem_alloc(num_global_elements * sizeof(int)); + transparent_depth_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int)); + +#ifdef __RAY_DIFFERENTIALS__ + dP_sd = mem_alloc(num_global_elements * sizeof(differential3)); + dP_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential3)); + dI_sd = mem_alloc(num_global_elements * sizeof(differential3)); + dI_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential3)); + du_sd = mem_alloc(num_global_elements * sizeof(differential)); + du_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential)); + dv_sd = mem_alloc(num_global_elements * sizeof(differential)); + dv_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential)); +#endif + +#ifdef __DPDU__ + dPdu_sd = mem_alloc(num_global_elements * sizeof(float3)); + dPdu_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3)); + dPdv_sd = mem_alloc(num_global_elements * sizeof(float3)); + dPdv_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3)); +#endif + closure_sd = mem_alloc(num_global_elements * ShaderClosure_size); + closure_sd_DL_shadow = mem_alloc(num_global_elements * 2 * ShaderClosure_size); + num_closure_sd = mem_alloc(num_global_elements * sizeof(int)); + num_closure_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int)); + randb_closure_sd = mem_alloc(num_global_elements * sizeof(float)); + randb_closure_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float)); + ray_P_sd = mem_alloc(num_global_elements * sizeof(float3)); + ray_P_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3)); + ray_dP_sd = mem_alloc(num_global_elements * sizeof(differential3)); + ray_dP_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential3)); + + /* Creation of global memory buffers which are shared among + * the kernels. + */ + rng_coop = mem_alloc(num_global_elements * sizeof(RNG)); + throughput_coop = mem_alloc(num_global_elements * throughput_size); + L_transparent_coop = mem_alloc(num_global_elements * L_transparent_size); + PathRadiance_coop = mem_alloc(num_global_elements * sizeof(PathRadiance)); + Ray_coop = mem_alloc(num_global_elements * sizeof(Ray)); + PathState_coop = mem_alloc(num_global_elements * sizeof(PathState)); + Intersection_coop = mem_alloc(num_global_elements * sizeof(Intersection)); + AOAlpha_coop = mem_alloc(num_global_elements * AOAlpha_size); + AOBSDF_coop = mem_alloc(num_global_elements * AOBSDF_size); + AOLightRay_coop = mem_alloc(num_global_elements * sizeof(Ray)); + BSDFEval_coop = mem_alloc(num_global_elements * sizeof(BsdfEval)); + ISLamp_coop = mem_alloc(num_global_elements * ISLamp_size); + LightRay_coop = mem_alloc(num_global_elements * sizeof(Ray)); + Intersection_coop_AO = mem_alloc(num_global_elements * sizeof(Intersection)); + Intersection_coop_DL = mem_alloc(num_global_elements * sizeof(Intersection)); + +#ifdef WITH_CYCLES_DEBUG + debugdata_coop = mem_alloc(num_global_elements * sizeof(DebugData)); +#endif + + ray_state = mem_alloc(num_global_elements * rayState_size); + + hostRayStateArray = (char *)calloc(num_global_elements, hostRayState_size); + assert(hostRayStateArray != NULL && "Can't create hostRayStateArray memory"); + + Queue_data = mem_alloc(num_global_elements * (NUM_QUEUES * sizeof(int)+sizeof(int))); + work_array = mem_alloc(num_global_elements * work_element_size); + per_sample_output_buffers = mem_alloc(num_global_elements * + per_thread_output_buffer_size); + } + + cl_int dQueue_size = global_size[0] * global_size[1]; + cl_int total_num_rays = global_size[0] * global_size[1]; + + cl_uint start_arg_index = + kernel_set_args(ckPathTraceKernel_data_init, + 0, + kgbuffer, + sd, + sd_DL_shadow, + P_sd, + P_sd_DL_shadow, + N_sd, + N_sd_DL_shadow, + Ng_sd, + Ng_sd_DL_shadow, + I_sd, + I_sd_DL_shadow, + shader_sd, + shader_sd_DL_shadow, + flag_sd, + flag_sd_DL_shadow, + prim_sd, + prim_sd_DL_shadow, + type_sd, + type_sd_DL_shadow, + u_sd, + u_sd_DL_shadow, + v_sd, + v_sd_DL_shadow, + object_sd, + object_sd_DL_shadow, + time_sd, + time_sd_DL_shadow, + ray_length_sd, + ray_length_sd_DL_shadow, + ray_depth_sd, + ray_depth_sd_DL_shadow, + transparent_depth_sd, + transparent_depth_sd_DL_shadow); + + start_arg_index += + kernel_set_args(ckPathTraceKernel_data_init, +#ifdef __RAY_DIFFERENTIALS__ + start_arg_index, + dP_sd, + dP_sd_DL_shadow, + dI_sd, + dI_sd_DL_shadow, + du_sd, + du_sd_DL_shadow, + dv_sd, + dv_sd_DL_shadow, +#endif +#ifdef __DPDU__ + dPdu_sd, + dPdu_sd_DL_shadow, + dPdv_sd, + dPdv_sd_DL_shadow, +#endif + closure_sd, + closure_sd_DL_shadow, + num_closure_sd, + num_closure_sd_DL_shadow, + randb_closure_sd, + randb_closure_sd_DL_shadow, + ray_P_sd, + ray_P_sd_DL_shadow, + ray_dP_sd, + ray_dP_sd_DL_shadow, + d_data, + per_sample_output_buffers, + d_rng_state, + rng_coop, + throughput_coop, + L_transparent_coop, + PathRadiance_coop, + Ray_coop, + PathState_coop, + ray_state); + +/* TODO(segrey): Avoid map lookup here. */ +#define KERNEL_TEX(type, ttype, name) \ + set_kernel_arg_mem(ckPathTraceKernel_data_init, &start_arg_index, #name); +#include "kernel_textures.h" +#undef KERNEL_TEX + + start_arg_index += + kernel_set_args(ckPathTraceKernel_data_init, + start_arg_index, + start_sample, + d_x, + d_y, + d_w, + d_h, + d_offset, + d_stride, + rtile.rng_state_offset_x, + rtile.rng_state_offset_y, + rtile.buffer_rng_state_stride, + Queue_data, + Queue_index, + dQueue_size, + use_queues_flag, + work_array, +#ifdef __WORK_STEALING__ + work_pool_wgs, + num_samples, +#endif +#ifdef WITH_CYCLES_DEBUG + debugdata_coop, +#endif + num_parallel_samples); + + kernel_set_args(ckPathTraceKernel_scene_intersect, + 0, + kgbuffer, + d_data, + rng_coop, + Ray_coop, + PathState_coop, + Intersection_coop, + ray_state, + d_w, + d_h, + Queue_data, + Queue_index, + dQueue_size, + use_queues_flag, +#ifdef WITH_CYCLES_DEBUG + debugdata_coop, +#endif + num_parallel_samples); + + kernel_set_args(ckPathTraceKernel_lamp_emission, + 0, + kgbuffer, + d_data, + sd, + throughput_coop, + PathRadiance_coop, + Ray_coop, + PathState_coop, + Intersection_coop, + ray_state, + d_w, + d_h, + Queue_data, + Queue_index, + dQueue_size, + use_queues_flag, + num_parallel_samples); + + kernel_set_args(ckPathTraceKernel_queue_enqueue, + 0, + Queue_data, + Queue_index, + ray_state, + dQueue_size); + + kernel_set_args(ckPathTraceKernel_background_buffer_update, + 0, + kgbuffer, + d_data, + sd, + per_sample_output_buffers, + d_rng_state, + rng_coop, + throughput_coop, + PathRadiance_coop, + Ray_coop, + PathState_coop, + L_transparent_coop, + ray_state, + d_w, + d_h, + d_x, + d_y, + d_stride, + rtile.rng_state_offset_x, + rtile.rng_state_offset_y, + rtile.buffer_rng_state_stride, + work_array, + Queue_data, + Queue_index, + dQueue_size, + end_sample, + start_sample, +#ifdef __WORK_STEALING__ + work_pool_wgs, + num_samples, +#endif +#ifdef WITH_CYCLES_DEBUG + debugdata_coop, +#endif + num_parallel_samples); + + kernel_set_args(ckPathTraceKernel_shader_lighting, + 0, + kgbuffer, + d_data, + sd, + rng_coop, + Ray_coop, + PathState_coop, + Intersection_coop, + ray_state, + Queue_data, + Queue_index, + dQueue_size); + + kernel_set_args(ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao, + 0, + kgbuffer, + d_data, + sd, + per_sample_output_buffers, + rng_coop, + throughput_coop, + L_transparent_coop, + PathRadiance_coop, + PathState_coop, + Intersection_coop, + AOAlpha_coop, + AOBSDF_coop, + AOLightRay_coop, + d_w, + d_h, + d_x, + d_y, + d_stride, + ray_state, + work_array, + Queue_data, + Queue_index, + dQueue_size, +#ifdef __WORK_STEALING__ + start_sample, +#endif + num_parallel_samples); + + kernel_set_args(ckPathTraceKernel_direct_lighting, + 0, + kgbuffer, + d_data, + sd, + sd_DL_shadow, + rng_coop, + PathState_coop, + ISLamp_coop, + LightRay_coop, + BSDFEval_coop, + ray_state, + Queue_data, + Queue_index, + dQueue_size); + + kernel_set_args(ckPathTraceKernel_shadow_blocked_direct_lighting, + 0, + kgbuffer, + d_data, + sd_DL_shadow, + PathState_coop, + LightRay_coop, + AOLightRay_coop, + Intersection_coop_AO, + Intersection_coop_DL, + ray_state, + Queue_data, + Queue_index, + dQueue_size, + total_num_rays); + + kernel_set_args(ckPathTraceKernel_setup_next_iteration, + 0, + kgbuffer, + d_data, + sd, + rng_coop, + throughput_coop, + PathRadiance_coop, + Ray_coop, + PathState_coop, + LightRay_coop, + ISLamp_coop, + BSDFEval_coop, + AOLightRay_coop, + AOBSDF_coop, + AOAlpha_coop, + ray_state, + Queue_data, + Queue_index, + dQueue_size, + use_queues_flag); + + kernel_set_args(ckPathTraceKernel_sum_all_radiance, + 0, + d_data, + d_buffer, + per_sample_output_buffers, + num_parallel_samples, + d_w, + d_h, + d_stride, + rtile.buffer_offset_x, + rtile.buffer_offset_y, + rtile.buffer_rng_state_stride, + start_sample); + + /* Macro for Enqueuing split kernels. */ +#define GLUE(a, b) a ## b +#define ENQUEUE_SPLIT_KERNEL(kernelName, globalSize, localSize) \ + opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, \ + GLUE(ckPathTraceKernel_, \ + kernelName), \ + 2, \ + NULL, \ + globalSize, \ + localSize, \ + 0, \ + NULL, \ + NULL)) + + /* Enqueue ckPathTraceKernel_data_init kernel. */ + ENQUEUE_SPLIT_KERNEL(data_init, global_size, local_size); + bool activeRaysAvailable = true; + + /* Record number of time host intervention has been made */ + unsigned int numHostIntervention = 0; + unsigned int numNextPathIterTimes = PathIteration_times; + while(activeRaysAvailable) { + /* Twice the global work size of other kernels for + * ckPathTraceKernel_shadow_blocked_direct_lighting. */ + size_t global_size_shadow_blocked[2]; + global_size_shadow_blocked[0] = global_size[0] * 2; + global_size_shadow_blocked[1] = global_size[1]; + + /* Do path-iteration in host [Enqueue Path-iteration kernels. */ + for(int PathIter = 0; PathIter < PathIteration_times; PathIter++) { + ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(background_buffer_update, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(shader_lighting, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(shadow_blocked_direct_lighting, global_size_shadow_blocked, local_size); + ENQUEUE_SPLIT_KERNEL(setup_next_iteration, global_size, local_size); + } + + /* Read ray-state into Host memory to decide if we should exit + * path-iteration in host. + */ + ciErr = clEnqueueReadBuffer(cqCommandQueue, + ray_state, + CL_TRUE, + 0, + global_size[0] * global_size[1] * sizeof(char), + hostRayStateArray, + 0, + NULL, + NULL); + assert(ciErr == CL_SUCCESS); + + activeRaysAvailable = false; + + for(int rayStateIter = 0; + rayStateIter < global_size[0] * global_size[1]; + ++rayStateIter) + { + if(int8_t(hostRayStateArray[rayStateIter]) != RAY_INACTIVE) { + /* Not all rays are RAY_INACTIVE. */ + activeRaysAvailable = true; + break; + } + } + + if(activeRaysAvailable) { + numHostIntervention++; + PathIteration_times = PATH_ITER_INC_FACTOR; + /* Host intervention done before all rays become RAY_INACTIVE; + * Set do more initial iterations for the next tile. + */ + numNextPathIterTimes += PATH_ITER_INC_FACTOR; + } + } + + /* Execute SumALLRadiance kernel to accumulate radiance calculated in + * per_sample_output_buffers into RenderTile's output buffer. + */ + size_t sum_all_radiance_local_size[2] = {16, 16}; + size_t sum_all_radiance_global_size[2]; + sum_all_radiance_global_size[0] = + (((d_w - 1) / sum_all_radiance_local_size[0]) + 1) * + sum_all_radiance_local_size[0]; + sum_all_radiance_global_size[1] = + (((d_h - 1) / sum_all_radiance_local_size[1]) + 1) * + sum_all_radiance_local_size[1]; + ENQUEUE_SPLIT_KERNEL(sum_all_radiance, + sum_all_radiance_global_size, + sum_all_radiance_local_size); + +#undef ENQUEUE_SPLIT_KERNEL +#undef GLUE + + if(numHostIntervention == 0) { + /* This means that we are executing kernel more than required + * Must avoid this for the next sample/tile. + */ + PathIteration_times = ((numNextPathIterTimes - PATH_ITER_INC_FACTOR) <= 0) ? + PATH_ITER_INC_FACTOR : numNextPathIterTimes - PATH_ITER_INC_FACTOR; + } + else { + /* Number of path-iterations done for this tile is set as + * Initial path-iteration times for the next tile + */ + PathIteration_times = numNextPathIterTimes; + } + + first_tile = false; + } + + /* Calculates the amount of memory that has to be always + * allocated in order for the split kernel to function. + * This memory is tile/scene-property invariant (meaning, + * the value returned by this function does not depend + * on the user set tile size or scene properties. + */ + size_t get_invariable_mem_allocated() + { + size_t total_invariable_mem_allocated = 0; + size_t KernelGlobals_size = 0; + size_t ShaderData_SOA_size = 0; + + KernelGlobals_size = get_KernelGlobals_size(); + ShaderData_SOA_size = get_shaderdata_soa_size(); + + total_invariable_mem_allocated += KernelGlobals_size; /* KernelGlobals size */ + total_invariable_mem_allocated += NUM_QUEUES * sizeof(unsigned int); /* Queue index size */ + total_invariable_mem_allocated += sizeof(char); /* use_queues_flag size */ + total_invariable_mem_allocated += ShaderData_SOA_size; /* sd size */ + total_invariable_mem_allocated += ShaderData_SOA_size; /* sd_DL_shadow size */ + + return total_invariable_mem_allocated; + } + + /* Calculate the memory that has-to-be/has-been allocated for + * the split kernel to function. + */ + size_t get_tile_specific_mem_allocated(const int2 tile_size) + { + size_t tile_specific_mem_allocated = 0; + + /* Get required tile info */ + unsigned int user_set_tile_w = tile_size.x; + unsigned int user_set_tile_h = tile_size.y; + +#ifdef __WORK_STEALING__ + /* Calculate memory to be allocated for work_pools in + * case of work_stealing. + */ + size_t max_global_size[2]; + size_t max_num_work_pools = 0; + max_global_size[0] = + (((user_set_tile_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) * + SPLIT_KERNEL_LOCAL_SIZE_X; + max_global_size[1] = + (((user_set_tile_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) * + SPLIT_KERNEL_LOCAL_SIZE_Y; + max_num_work_pools = + (max_global_size[0] * max_global_size[1]) / + (SPLIT_KERNEL_LOCAL_SIZE_X * SPLIT_KERNEL_LOCAL_SIZE_Y); + tile_specific_mem_allocated += max_num_work_pools * sizeof(unsigned int); +#endif + + tile_specific_mem_allocated += + user_set_tile_w * user_set_tile_h * per_thread_output_buffer_size; + tile_specific_mem_allocated += + user_set_tile_w * user_set_tile_h * sizeof(RNG); + + return tile_specific_mem_allocated; + } + + /* Calculates the texture memories and KernelData (d_data) memory + * that has been allocated. + */ + size_t get_scene_specific_mem_allocated(cl_mem d_data) + { + size_t scene_specific_mem_allocated = 0; + /* Calculate texture memories. */ +#define KERNEL_TEX(type, ttype, name) \ + scene_specific_mem_allocated += get_tex_size(#name); +#include "kernel_textures.h" +#undef KERNEL_TEX + size_t d_data_size; + ciErr = clGetMemObjectInfo(d_data, + CL_MEM_SIZE, + sizeof(d_data_size), + &d_data_size, + NULL); + assert(ciErr == CL_SUCCESS && "Can't get d_data mem object info"); + scene_specific_mem_allocated += d_data_size; + return scene_specific_mem_allocated; + } + + /* Calculate the memory required for one thread in split kernel. */ + size_t get_per_thread_memory() + { + size_t shader_closure_size = 0; + size_t shaderdata_volume = 0; + shader_closure_size = get_shader_closure_size(current_clos_max); + /* TODO(sergey): This will actually over-allocate if + * particular kernel does not support multiclosure. + */ + shaderdata_volume = get_shader_data_size(shader_closure_size); + size_t retval = sizeof(RNG) + + throughput_size + L_transparent_size + + rayState_size + work_element_size + + ISLamp_size + sizeof(PathRadiance) + sizeof(Ray) + sizeof(PathState) + + sizeof(Intersection) /* Overall isect */ + + sizeof(Intersection) /* Instersection_coop_AO */ + + sizeof(Intersection) /* Intersection coop DL */ + + shaderdata_volume /* Overall ShaderData */ + + (shaderdata_volume * 2) /* ShaderData : DL and shadow */ + + sizeof(Ray) + sizeof(BsdfEval) + AOAlpha_size + AOBSDF_size + sizeof(Ray) + + (sizeof(int)* NUM_QUEUES) + + per_thread_output_buffer_size; + return retval; + } + + /* Considers the total memory available in the device and + * and returns the maximum global work size possible. + */ + size_t get_feasible_global_work_size(int2 tile_size, cl_mem d_data) + { + /* Calculate invariably allocated memory. */ + size_t invariable_mem_allocated = get_invariable_mem_allocated(); + /* Calculate tile specific allocated memory. */ + size_t tile_specific_mem_allocated = + get_tile_specific_mem_allocated(tile_size); + /* Calculate scene specific allocated memory. */ + size_t scene_specific_mem_allocated = + get_scene_specific_mem_allocated(d_data); + /* Calculate total memory available for the threads in global work size. */ + size_t available_memory = total_allocatable_memory + - invariable_mem_allocated + - tile_specific_mem_allocated + - scene_specific_mem_allocated + - DATA_ALLOCATION_MEM_FACTOR; + size_t per_thread_memory_required = get_per_thread_memory(); + return (available_memory / per_thread_memory_required); + } + + /* Checks if the device has enough memory to render the whole tile; + * If not, we should split single tile into multiple tiles of small size + * and process them all. + */ + bool need_to_split_tile(unsigned int d_w, + unsigned int d_h, + int2 max_render_feasible_tile_size) + { + size_t global_size_estimate[2]; + /* TODO(sergey): Such round-ups are in quite few places, need to replace + * them with an utility macro. + */ + global_size_estimate[0] = + (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) * + SPLIT_KERNEL_LOCAL_SIZE_X; + global_size_estimate[1] = + (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) * + SPLIT_KERNEL_LOCAL_SIZE_Y; + if((global_size_estimate[0] * global_size_estimate[1]) > + (max_render_feasible_tile_size.x * max_render_feasible_tile_size.y)) { - run = function_bind(&OpenCLDevice::thread_run, device, this); + return true; } - }; + else { + return false; + } + } - int get_split_task_count(DeviceTask& /*task*/) + /* Considers the scene properties, global memory available in the device + * and returns a rectanglular tile dimension (approx the maximum) + * that should render on split kernel. + */ + int2 get_max_render_feasible_tile_size(size_t feasible_global_work_size) { - return 1; + int2 max_render_feasible_tile_size; + int square_root_val = (int)sqrt(feasible_global_work_size); + max_render_feasible_tile_size.x = square_root_val; + max_render_feasible_tile_size.y = square_root_val; + /* Ciel round-off max_render_feasible_tile_size. */ + int2 ceil_render_feasible_tile_size; + ceil_render_feasible_tile_size.x = + (((max_render_feasible_tile_size.x - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) * + SPLIT_KERNEL_LOCAL_SIZE_X; + ceil_render_feasible_tile_size.y = + (((max_render_feasible_tile_size.y - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) * + SPLIT_KERNEL_LOCAL_SIZE_Y; + if(ceil_render_feasible_tile_size.x * ceil_render_feasible_tile_size.y <= + feasible_global_work_size) + { + return ceil_render_feasible_tile_size; + } + /* Floor round-off max_render_feasible_tile_size. */ + int2 floor_render_feasible_tile_size; + floor_render_feasible_tile_size.x = + (max_render_feasible_tile_size.x / SPLIT_KERNEL_LOCAL_SIZE_X) * + SPLIT_KERNEL_LOCAL_SIZE_X; + floor_render_feasible_tile_size.y = + (max_render_feasible_tile_size.y / SPLIT_KERNEL_LOCAL_SIZE_Y) * + SPLIT_KERNEL_LOCAL_SIZE_Y; + return floor_render_feasible_tile_size; } - void task_add(DeviceTask& task) + /* Try splitting the current tile into multiple smaller + * almost-square-tiles. + */ + int2 get_split_tile_size(RenderTile rtile, + int2 max_render_feasible_tile_size) { - task_pool.push(new OpenCLDeviceTask(this, task)); + int2 split_tile_size; + int num_global_threads = max_render_feasible_tile_size.x * + max_render_feasible_tile_size.y; + int d_w = rtile.w; + int d_h = rtile.h; + /* Ceil round off d_w and d_h */ + d_w = (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) * + SPLIT_KERNEL_LOCAL_SIZE_X; + d_h = (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) * + SPLIT_KERNEL_LOCAL_SIZE_Y; + while(d_w * d_h > num_global_threads) { + /* Halve the longer dimension. */ + if(d_w >= d_h) { + d_w = d_w / 2; + d_w = (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) * + SPLIT_KERNEL_LOCAL_SIZE_X; + } + else { + d_h = d_h / 2; + d_h = (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) * + SPLIT_KERNEL_LOCAL_SIZE_Y; + } + } + split_tile_size.x = d_w; + split_tile_size.y = d_h; + return split_tile_size; } - void task_wait() + /* Splits existing tile into multiple tiles of tile size split_tile_size. */ + vector<SplitRenderTile> split_tiles(RenderTile rtile, int2 split_tile_size) { - task_pool.wait(); + vector<SplitRenderTile> to_path_trace_rtile; + int d_w = rtile.w; + int d_h = rtile.h; + int num_tiles_x = (((d_w - 1) / split_tile_size.x) + 1); + int num_tiles_y = (((d_h - 1) / split_tile_size.y) + 1); + /* Buffer and rng_state offset calc. */ + size_t offset_index = rtile.offset + (rtile.x + rtile.y * rtile.stride); + size_t offset_x = offset_index % rtile.stride; + size_t offset_y = offset_index / rtile.stride; + /* Resize to_path_trace_rtile. */ + to_path_trace_rtile.resize(num_tiles_x * num_tiles_y); + for(int tile_iter_y = 0; tile_iter_y < num_tiles_y; tile_iter_y++) { + for(int tile_iter_x = 0; tile_iter_x < num_tiles_x; tile_iter_x++) { + int rtile_index = tile_iter_y * num_tiles_x + tile_iter_x; + to_path_trace_rtile[rtile_index].rng_state_offset_x = offset_x + tile_iter_x * split_tile_size.x; + to_path_trace_rtile[rtile_index].rng_state_offset_y = offset_y + tile_iter_y * split_tile_size.y; + to_path_trace_rtile[rtile_index].buffer_offset_x = offset_x + tile_iter_x * split_tile_size.x; + to_path_trace_rtile[rtile_index].buffer_offset_y = offset_y + tile_iter_y * split_tile_size.y; + to_path_trace_rtile[rtile_index].start_sample = rtile.start_sample; + to_path_trace_rtile[rtile_index].num_samples = rtile.num_samples; + to_path_trace_rtile[rtile_index].sample = rtile.sample; + to_path_trace_rtile[rtile_index].resolution = rtile.resolution; + to_path_trace_rtile[rtile_index].offset = rtile.offset; + to_path_trace_rtile[rtile_index].buffers = rtile.buffers; + to_path_trace_rtile[rtile_index].buffer = rtile.buffer; + to_path_trace_rtile[rtile_index].rng_state = rtile.rng_state; + to_path_trace_rtile[rtile_index].x = rtile.x + (tile_iter_x * split_tile_size.x); + to_path_trace_rtile[rtile_index].y = rtile.y + (tile_iter_y * split_tile_size.y); + to_path_trace_rtile[rtile_index].buffer_rng_state_stride = rtile.stride; + /* Fill width and height of the new render tile. */ + to_path_trace_rtile[rtile_index].w = (tile_iter_x == (num_tiles_x - 1)) ? + (d_w - (tile_iter_x * split_tile_size.x)) /* Border tile */ + : split_tile_size.x; + to_path_trace_rtile[rtile_index].h = (tile_iter_y == (num_tiles_y - 1)) ? + (d_h - (tile_iter_y * split_tile_size.y)) /* Border tile */ + : split_tile_size.y; + to_path_trace_rtile[rtile_index].stride = to_path_trace_rtile[rtile_index].w; + } + } + return to_path_trace_rtile; } - void task_cancel() + void thread_run(DeviceTask *task) { - task_pool.cancel(); + if(task->type == DeviceTask::FILM_CONVERT) { + film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half); + } + else if(task->type == DeviceTask::SHADER) { + shader(*task); + } + else if(task->type == DeviceTask::PATH_TRACE) { + RenderTile tile; + bool initialize_data_and_check_render_feasibility = false; + bool need_to_split_tiles_further = false; + int2 max_render_feasible_tile_size; + size_t feasible_global_work_size; + const int2 tile_size = task->requested_tile_size; + /* Keep rendering tiles until done. */ + while(task->acquire_tile(this, tile)) { + if(!initialize_data_and_check_render_feasibility) { + /* Initialize data. */ + /* Calculate per_thread_output_buffer_size. */ + size_t output_buffer_size = 0; + ciErr = clGetMemObjectInfo((cl_mem)tile.buffer, + CL_MEM_SIZE, + sizeof(output_buffer_size), + &output_buffer_size, + NULL); + assert(ciErr == CL_SUCCESS && "Can't get tile.buffer mem object info"); + /* This value is different when running on AMD and NV. */ + if(background) { + /* In offline render the number of buffer elements + * associated with tile.buffer is the current tile size. + */ + per_thread_output_buffer_size = + output_buffer_size / (tile.w * tile.h); + } + else { + /* interactive rendering, unlike offline render, the number of buffer elements + * associated with tile.buffer is the entire viewport size. + */ + per_thread_output_buffer_size = + output_buffer_size / (tile.buffers->params.width * + tile.buffers->params.height); + } + /* Check render feasibility. */ + feasible_global_work_size = get_feasible_global_work_size( + tile_size, + CL_MEM_PTR(const_mem_map["__data"]->device_pointer)); + max_render_feasible_tile_size = + get_max_render_feasible_tile_size( + feasible_global_work_size); + need_to_split_tiles_further = + need_to_split_tile(tile_size.x, + tile_size.y, + max_render_feasible_tile_size); + initialize_data_and_check_render_feasibility = true; + } + if(need_to_split_tiles_further) { + int2 split_tile_size = + get_split_tile_size(tile, + max_render_feasible_tile_size); + vector<SplitRenderTile> to_path_trace_render_tiles = + split_tiles(tile, split_tile_size); + /* Print message to console */ + if(background && (to_path_trace_render_tiles.size() > 1)) { + fprintf(stderr, "Message : Tiles need to be split " + "further inside path trace (due to insufficient " + "device-global-memory for split kernel to " + "function) \n" + "The current tile of dimensions %dx%d is split " + "into tiles of dimension %dx%d for render \n", + tile.w, tile.h, + split_tile_size.x, + split_tile_size.y); + } + /* Process all split tiles. */ + for(int tile_iter = 0; + tile_iter < to_path_trace_render_tiles.size(); + ++tile_iter) + { + path_trace(to_path_trace_render_tiles[tile_iter], + max_render_feasible_tile_size); + } + } + else { + /* No splitting required; process the entire tile at once. */ + /* Render feasible tile size is user-set-tile-size itself. */ + max_render_feasible_tile_size.x = + (((tile_size.x - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) * + SPLIT_KERNEL_LOCAL_SIZE_X; + max_render_feasible_tile_size.y = + (((tile_size.y - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) * + SPLIT_KERNEL_LOCAL_SIZE_Y; + /* buffer_rng_state_stride is stride itself. */ + SplitRenderTile split_tile(tile); + split_tile.buffer_rng_state_stride = tile.stride; + path_trace(split_tile, max_render_feasible_tile_size); + } + tile.sample = tile.start_sample + tile.num_samples; + + /* Complete kernel execution before release tile. */ + /* This helps in multi-device render; + * The device that reaches the critical-section function + * release_tile waits (stalling other devices from entering + * release_tile) for all kernels to complete. If device1 (a + * slow-render device) reaches release_tile first then it would + * stall device2 (a fast-render device) from proceeding to render + * next tile. + */ + clFinish(cqCommandQueue); + + task->release_tile(tile); + } + } + } + +protected: + cl_mem mem_alloc(size_t bufsize, cl_mem_flags mem_flag = CL_MEM_READ_WRITE) + { + cl_mem ptr; + ptr = clCreateBuffer(cxContext, mem_flag, bufsize, NULL, &ciErr); + if(opencl_error(ciErr)) { + assert(0); + } + return ptr; } }; +/* Returns true in case of successful detection of platform and device type, + * else returns false. + */ +static bool get_platform_and_devicetype(const DeviceInfo info, + string &platform_name, + cl_device_type &device_type) +{ + cl_platform_id platform_id; + cl_device_id device_id; + cl_uint num_platforms; + cl_int ciErr; + + /* TODO(sergey): Use some generic error print helper function/ */ + ciErr = clGetPlatformIDs(0, NULL, &num_platforms); + if(ciErr != CL_SUCCESS) { + fprintf(stderr, "Can't getPlatformIds. file - %s, line - %d\n", __FILE__, __LINE__); + return false; + } + + if(num_platforms == 0) { + fprintf(stderr, "No OpenCL platforms found. file - %s, line - %d\n", __FILE__, __LINE__); + return false; + } + + vector<cl_platform_id> platforms(num_platforms, NULL); + + ciErr = clGetPlatformIDs(num_platforms, &platforms[0], NULL); + if(ciErr != CL_SUCCESS) { + fprintf(stderr, "Can't getPlatformIds. file - %s, line - %d\n", __FILE__, __LINE__); + return false; + } + + int num_base = 0; + int total_devices = 0; + + for(int platform = 0; platform < num_platforms; platform++) { + cl_uint num_devices; + + ciErr = clGetDeviceIDs(platforms[platform], opencl_device_type(), 0, NULL, &num_devices); + if(ciErr != CL_SUCCESS) { + fprintf(stderr, "Can't getDeviceIDs. file - %s, line - %d\n", __FILE__, __LINE__); + return false; + } + + total_devices += num_devices; + + if(info.num - num_base >= num_devices) { + /* num doesn't refer to a device in this platform */ + num_base += num_devices; + continue; + } + + /* device is in this platform */ + platform_id = platforms[platform]; + + /* get devices */ + vector<cl_device_id> device_ids(num_devices, NULL); + + ciErr = clGetDeviceIDs(platform_id, opencl_device_type(), num_devices, &device_ids[0], NULL); + if(ciErr != CL_SUCCESS) { + fprintf(stderr, "Can't getDeviceIDs. file - %s, line - %d\n", __FILE__, __LINE__); + return false; + } + + device_id = device_ids[info.num - num_base]; + + char name[256]; + ciErr = clGetPlatformInfo(platform_id, CL_PLATFORM_NAME, sizeof(name), &name, NULL); + if(ciErr != CL_SUCCESS) { + fprintf(stderr, "Can't getPlatformIDs. file - %s, line - %d \n", __FILE__, __LINE__); + return false; + } + platform_name = name; + + ciErr = clGetDeviceInfo(device_id, CL_DEVICE_TYPE, sizeof(cl_device_type), &device_type, NULL); + if(ciErr != CL_SUCCESS) { + fprintf(stderr, "Can't getDeviceInfo. file - %s, line - %d \n", __FILE__, __LINE__); + return false; + } + + break; + } + + if(total_devices == 0) { + fprintf(stderr, "No devices found. file - %s, line - %d \n", __FILE__, __LINE__); + return false; + } + + return true; +} + Device *device_opencl_create(DeviceInfo& info, Stats &stats, bool background) { - return new OpenCLDevice(info, stats, background); + string platform_name; + cl_device_type device_type; + if(get_platform_and_devicetype(info, platform_name, device_type)) { + const bool force_split_kernel = + getenv("CYCLES_OPENCL_SPLIT_KERNEL_TEST") != NULL; + /* TODO(sergey): Replace string lookups with more enum-like API, + * similar to device/venfdor checks blender's gpu. + */ + if(force_split_kernel || + (platform_name == "AMD Accelerated Parallel Processing" && + device_type == CL_DEVICE_TYPE_GPU)) + { + /* If the device is an AMD GPU, take split kernel path. */ + VLOG(1) << "Using split kernel"; + return new OpenCLDeviceSplitKernel(info, stats, background); + } else { + /* For any other device, take megakernel path. */ + VLOG(1) << "Using megekernel"; + return new OpenCLDeviceMegaKernel(info, stats, background); + } + } else { + /* If we can't retrieve platform and device type information for some + * reason, we default to megakernel path. + */ + VLOG(1) << "Failed to rertieve platform or device, using megakernel"; + return new OpenCLDeviceMegaKernel(info, stats, background); + } } -bool device_opencl_init(void) { +bool device_opencl_init(void) +{ static bool initialized = false; static bool result = false; @@ -1132,13 +3322,7 @@ bool device_opencl_init(void) { initialized = true; - // OpenCL disabled for now, only works with this environment variable set - if(!getenv("CYCLES_OPENCL_TEST")) { - result = false; - } - else { - result = clewInit() == CLEW_SUCCESS; - } + result = clewInit() == CLEW_SUCCESS; return result; } diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 83b3450fc1c..85b2760073b 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -14,6 +14,17 @@ set(INC_SYS set(SRC kernel.cpp kernel.cl + kernel_data_init.cl + kernel_queue_enqueue.cl + kernel_scene_intersect.cl + kernel_lamp_emission.cl + kernel_background_buffer_update.cl + kernel_shader_eval.cl + kernel_holdout_emission_blurring_pathtermination_ao.cl + kernel_direct_lighting.cl + kernel_shadow_blocked.cl + kernel_next_iteration_setup.cl + kernel_sum_all_radiance.cl kernel.cu ) @@ -36,17 +47,22 @@ set(SRC_HEADERS kernel_montecarlo.h kernel_passes.h kernel_path.h + kernel_path_common.h kernel_path_state.h kernel_path_surface.h kernel_path_volume.h kernel_projection.h + kernel_queues.h kernel_random.h kernel_shader.h + kernel_shaderdata_vars.h kernel_shadow.h + kernel_split.h kernel_subsurface.h kernel_textures.h kernel_types.h kernel_volume.h + kernel_work_stealing.h ) set(SRC_CLOSURE_HEADERS @@ -68,6 +84,7 @@ set(SRC_CLOSURE_HEADERS closure/emissive.h closure/volume.h ) + set(SRC_SVM_HEADERS svm/svm.h svm/svm_attribute.h @@ -284,6 +301,17 @@ endif() #delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${KERNEL_PREPROCESSED}" ${CYCLES_INSTALL_PATH}/kernel) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel.cl" ${CYCLES_INSTALL_PATH}/kernel) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_data_init.cl" ${CYCLES_INSTALL_PATH}/kernel) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_queue_enqueue.cl" ${CYCLES_INSTALL_PATH}/kernel) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_scene_intersect.cl" ${CYCLES_INSTALL_PATH}/kernel) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_lamp_emission.cl" ${CYCLES_INSTALL_PATH}/kernel) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_background_buffer_update.cl" ${CYCLES_INSTALL_PATH}/kernel) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_shader_eval.cl" ${CYCLES_INSTALL_PATH}/kernel) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_holdout_emission_blurring_pathtermination_ao.cl" ${CYCLES_INSTALL_PATH}/kernel) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/kernel) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_shadow_blocked.cl" ${CYCLES_INSTALL_PATH}/kernel) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/kernel) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_sum_all_radiance.cl" ${CYCLES_INSTALL_PATH}/kernel) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel.cu" ${CYCLES_INSTALL_PATH}/kernel) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/closure) diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h index 2b9e2a4e44d..558aa0dc6a9 100644 --- a/intern/cycles/kernel/closure/bsdf.h +++ b/intern/cycles/kernel/closure/bsdf.h @@ -47,79 +47,79 @@ ccl_device int bsdf_sample(KernelGlobals *kg, const ShaderData *sd, const Shader switch(sc->type) { case CLOSURE_BSDF_DIFFUSE_ID: case CLOSURE_BSDF_BSSRDF_ID: - label = bsdf_diffuse_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + label = bsdf_diffuse_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; #ifdef __SVM__ case CLOSURE_BSDF_OREN_NAYAR_ID: - label = bsdf_oren_nayar_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + label = bsdf_oren_nayar_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; /*case CLOSURE_BSDF_PHONG_RAMP_ID: - label = bsdf_phong_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + label = bsdf_phong_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_DIFFUSE_RAMP_ID: - label = bsdf_diffuse_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + label = bsdf_diffuse_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break;*/ case CLOSURE_BSDF_TRANSLUCENT_ID: - label = bsdf_translucent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + label = bsdf_translucent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_REFLECTION_ID: - label = bsdf_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + label = bsdf_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_REFRACTION_ID: - label = bsdf_refraction_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + label = bsdf_refraction_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_TRANSPARENT_ID: - label = bsdf_transparent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + label = bsdf_transparent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: - label = bsdf_microfacet_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + label = bsdf_microfacet_ggx_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: - label = bsdf_microfacet_beckmann_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + label = bsdf_microfacet_beckmann_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID: - label = bsdf_ashikhmin_shirley_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + label = bsdf_ashikhmin_shirley_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: - label = bsdf_ashikhmin_velvet_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + label = bsdf_ashikhmin_velvet_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_DIFFUSE_TOON_ID: - label = bsdf_diffuse_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + label = bsdf_diffuse_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_GLOSSY_TOON_ID: - label = bsdf_glossy_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + label = bsdf_glossy_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_HAIR_REFLECTION_ID: - label = bsdf_hair_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + label = bsdf_hair_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: - label = bsdf_hair_transmission_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + label = bsdf_hair_transmission_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; #endif #ifdef __VOLUME__ case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: - label = volume_henyey_greenstein_sample(sc, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); + label = volume_henyey_greenstein_sample(sc, ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; #endif default: @@ -139,67 +139,67 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade return OSLShader::bsdf_eval(sd, sc, omega_in, *pdf); #endif - if(dot(sd->Ng, omega_in) >= 0.0f) { + if(dot(ccl_fetch(sd, Ng), omega_in) >= 0.0f) { switch(sc->type) { case CLOSURE_BSDF_DIFFUSE_ID: case CLOSURE_BSDF_BSSRDF_ID: - eval = bsdf_diffuse_eval_reflect(sc, sd->I, omega_in, pdf); + eval = bsdf_diffuse_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); break; #ifdef __SVM__ case CLOSURE_BSDF_OREN_NAYAR_ID: - eval = bsdf_oren_nayar_eval_reflect(sc, sd->I, omega_in, pdf); + eval = bsdf_oren_nayar_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); break; /*case CLOSURE_BSDF_PHONG_RAMP_ID: - eval = bsdf_phong_ramp_eval_reflect(sc, sd->I, omega_in, pdf); + eval = bsdf_phong_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_DIFFUSE_RAMP_ID: - eval = bsdf_diffuse_ramp_eval_reflect(sc, sd->I, omega_in, pdf); + eval = bsdf_diffuse_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); break;*/ case CLOSURE_BSDF_TRANSLUCENT_ID: - eval = bsdf_translucent_eval_reflect(sc, sd->I, omega_in, pdf); + eval = bsdf_translucent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_REFLECTION_ID: - eval = bsdf_reflection_eval_reflect(sc, sd->I, omega_in, pdf); + eval = bsdf_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_REFRACTION_ID: - eval = bsdf_refraction_eval_reflect(sc, sd->I, omega_in, pdf); + eval = bsdf_refraction_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_TRANSPARENT_ID: - eval = bsdf_transparent_eval_reflect(sc, sd->I, omega_in, pdf); + eval = bsdf_transparent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: - eval = bsdf_microfacet_ggx_eval_reflect(sc, sd->I, omega_in, pdf); + eval = bsdf_microfacet_ggx_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: - eval = bsdf_microfacet_beckmann_eval_reflect(sc, sd->I, omega_in, pdf); + eval = bsdf_microfacet_beckmann_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID: - eval = bsdf_ashikhmin_shirley_eval_reflect(sc, sd->I, omega_in, pdf); + eval = bsdf_ashikhmin_shirley_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: - eval = bsdf_ashikhmin_velvet_eval_reflect(sc, sd->I, omega_in, pdf); + eval = bsdf_ashikhmin_velvet_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_DIFFUSE_TOON_ID: - eval = bsdf_diffuse_toon_eval_reflect(sc, sd->I, omega_in, pdf); + eval = bsdf_diffuse_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_GLOSSY_TOON_ID: - eval = bsdf_glossy_toon_eval_reflect(sc, sd->I, omega_in, pdf); + eval = bsdf_glossy_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_HAIR_REFLECTION_ID: - eval = bsdf_hair_reflection_eval_reflect(sc, sd->I, omega_in, pdf); + eval = bsdf_hair_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: - eval = bsdf_hair_transmission_eval_reflect(sc, sd->I, omega_in, pdf); + eval = bsdf_hair_transmission_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); break; #endif #ifdef __VOLUME__ case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: - eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf); + eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf); break; #endif default: @@ -211,57 +211,57 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade switch(sc->type) { case CLOSURE_BSDF_DIFFUSE_ID: case CLOSURE_BSDF_BSSRDF_ID: - eval = bsdf_diffuse_eval_transmit(sc, sd->I, omega_in, pdf); + eval = bsdf_diffuse_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); break; #ifdef __SVM__ case CLOSURE_BSDF_OREN_NAYAR_ID: - eval = bsdf_oren_nayar_eval_transmit(sc, sd->I, omega_in, pdf); + eval = bsdf_oren_nayar_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_TRANSLUCENT_ID: - eval = bsdf_translucent_eval_transmit(sc, sd->I, omega_in, pdf); + eval = bsdf_translucent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_REFLECTION_ID: - eval = bsdf_reflection_eval_transmit(sc, sd->I, omega_in, pdf); + eval = bsdf_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_REFRACTION_ID: - eval = bsdf_refraction_eval_transmit(sc, sd->I, omega_in, pdf); + eval = bsdf_refraction_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_TRANSPARENT_ID: - eval = bsdf_transparent_eval_transmit(sc, sd->I, omega_in, pdf); + eval = bsdf_transparent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: - eval = bsdf_microfacet_ggx_eval_transmit(sc, sd->I, omega_in, pdf); + eval = bsdf_microfacet_ggx_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: - eval = bsdf_microfacet_beckmann_eval_transmit(sc, sd->I, omega_in, pdf); + eval = bsdf_microfacet_beckmann_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID: - eval = bsdf_ashikhmin_shirley_eval_transmit(sc, sd->I, omega_in, pdf); + eval = bsdf_ashikhmin_shirley_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: - eval = bsdf_ashikhmin_velvet_eval_transmit(sc, sd->I, omega_in, pdf); + eval = bsdf_ashikhmin_velvet_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_DIFFUSE_TOON_ID: - eval = bsdf_diffuse_toon_eval_transmit(sc, sd->I, omega_in, pdf); + eval = bsdf_diffuse_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_GLOSSY_TOON_ID: - eval = bsdf_glossy_toon_eval_transmit(sc, sd->I, omega_in, pdf); + eval = bsdf_glossy_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_HAIR_REFLECTION_ID: - eval = bsdf_hair_reflection_eval_transmit(sc, sd->I, omega_in, pdf); + eval = bsdf_hair_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); break; case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: - eval = bsdf_hair_transmission_eval_transmit(sc, sd->I, omega_in, pdf); + eval = bsdf_hair_transmission_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); break; #endif #ifdef __VOLUME__ case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: - eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf); + eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf); break; #endif default: diff --git a/intern/cycles/kernel/geom/geom_attribute.h b/intern/cycles/kernel/geom/geom_attribute.h index 9ac16e86085..c7364e9edac 100644 --- a/intern/cycles/kernel/geom/geom_attribute.h +++ b/intern/cycles/kernel/geom/geom_attribute.h @@ -29,13 +29,13 @@ CCL_NAMESPACE_BEGIN ccl_device_inline int find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id, AttributeElement *elem) { - if(sd->object == PRIM_NONE) + if(ccl_fetch(sd, object) == PRIM_NONE) return (int)ATTR_STD_NOT_FOUND; /* for SVM, find attribute by unique id */ - uint attr_offset = sd->object*kernel_data.bvh.attributes_map_stride; + uint attr_offset = ccl_fetch(sd, object)*kernel_data.bvh.attributes_map_stride; #ifdef __HAIR__ - attr_offset = (sd->type & PRIMITIVE_ALL_CURVE)? attr_offset + ATTR_PRIM_CURVE: attr_offset; + attr_offset = (ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)? attr_offset + ATTR_PRIM_CURVE: attr_offset; #endif uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset); @@ -49,7 +49,7 @@ ccl_device_inline int find_attribute(KernelGlobals *kg, const ShaderData *sd, ui *elem = (AttributeElement)attr_map.y; - if(sd->prim == PRIM_NONE && (AttributeElement)attr_map.y != ATTR_ELEMENT_MESH) + if(ccl_fetch(sd, prim) == PRIM_NONE && (AttributeElement)attr_map.y != ATTR_ELEMENT_MESH) return ATTR_STD_NOT_FOUND; /* return result */ diff --git a/intern/cycles/kernel/geom/geom_bvh.h b/intern/cycles/kernel/geom/geom_bvh.h index 2e8e27c709d..3d0d406dd0b 100644 --- a/intern/cycles/kernel/geom/geom_bvh.h +++ b/intern/cycles/kernel/geom/geom_bvh.h @@ -447,6 +447,7 @@ ccl_device_inline float3 ray_offset(float3 P, float3 Ng) #endif } +#if defined(__SHADOW_RECORD_ALL__) || defined (__VOLUME_RECORD_ALL__) /* ToDo: Move to another file? */ ccl_device int intersections_compare(const void *a, const void *b) { @@ -460,6 +461,7 @@ ccl_device int intersections_compare(const void *a, const void *b) else return 0; } +#endif CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/geom/geom_motion_triangle.h b/intern/cycles/kernel/geom/geom_motion_triangle.h index a5a25f4a9ae..4ea9e4714c4 100644 --- a/intern/cycles/kernel/geom/geom_motion_triangle.h +++ b/intern/cycles/kernel/geom/geom_motion_triangle.h @@ -236,25 +236,25 @@ ccl_device_inline float3 motion_triangle_refine_subsurface(KernelGlobals *kg, Sh ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, bool subsurface) { /* get shader */ - sd->shader = kernel_tex_fetch(__tri_shader, sd->prim); + ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim)); /* get motion info */ int numsteps, numverts; - object_motion_info(kg, sd->object, &numsteps, &numverts, NULL); + object_motion_info(kg, ccl_fetch(sd, object), &numsteps, &numverts, NULL); /* figure out which steps we need to fetch and their interpolation factor */ int maxstep = numsteps*2; - int step = min((int)(sd->time*maxstep), maxstep-1); - float t = sd->time*maxstep - step; + int step = min((int)(ccl_fetch(sd, time)*maxstep), maxstep-1); + float t = ccl_fetch(sd, time)*maxstep - step; /* find attribute */ AttributeElement elem; - int offset = find_attribute_motion(kg, sd->object, ATTR_STD_MOTION_VERTEX_POSITION, &elem); + int offset = find_attribute_motion(kg, ccl_fetch(sd, object), ATTR_STD_MOTION_VERTEX_POSITION, &elem); kernel_assert(offset != ATTR_STD_NOT_FOUND); /* fetch vertex coordinates */ float3 verts[3], next_verts[3]; - float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, sd->prim)); + float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim))); motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts); motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_verts); @@ -268,33 +268,33 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderD #ifdef __SUBSURFACE__ if(!subsurface) #endif - sd->P = motion_triangle_refine(kg, sd, isect, ray, verts); + ccl_fetch(sd, P) = motion_triangle_refine(kg, sd, isect, ray, verts); #ifdef __SUBSURFACE__ else - sd->P = motion_triangle_refine_subsurface(kg, sd, isect, ray, verts); + ccl_fetch(sd, P) = motion_triangle_refine_subsurface(kg, sd, isect, ray, verts); #endif /* compute face normal */ float3 Ng; - if(sd->flag & SD_NEGATIVE_SCALE_APPLIED) + if(ccl_fetch(sd, flag) & SD_NEGATIVE_SCALE_APPLIED) Ng = normalize(cross(verts[2] - verts[0], verts[1] - verts[0])); else Ng = normalize(cross(verts[1] - verts[0], verts[2] - verts[0])); - sd->Ng = Ng; - sd->N = Ng; + ccl_fetch(sd, Ng) = Ng; + ccl_fetch(sd, N) = Ng; /* compute derivatives of P w.r.t. uv */ #ifdef __DPDU__ - sd->dPdu = (verts[0] - verts[2]); - sd->dPdv = (verts[1] - verts[2]); + ccl_fetch(sd, dPdu) = (verts[0] - verts[2]); + ccl_fetch(sd, dPdv) = (verts[1] - verts[2]); #endif /* compute smooth normal */ - if(sd->shader & SHADER_SMOOTH_NORMAL) { + if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) { /* find attribute */ AttributeElement elem; - int offset = find_attribute_motion(kg, sd->object, ATTR_STD_MOTION_VERTEX_NORMAL, &elem); + int offset = find_attribute_motion(kg, ccl_fetch(sd, object), ATTR_STD_MOTION_VERTEX_NORMAL, &elem); kernel_assert(offset != ATTR_STD_NOT_FOUND); /* fetch vertex coordinates */ @@ -308,10 +308,10 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderD normals[2] = (1.0f - t)*normals[2] + t*next_normals[2]; /* interpolate between vertices */ - float u = sd->u; - float v = sd->v; + float u = ccl_fetch(sd, u); + float v = ccl_fetch(sd, v); float w = 1.0f - u - v; - sd->N = (u*normals[0] + v*normals[1] + w*normals[2]); + ccl_fetch(sd, N) = (u*normals[0] + v*normals[1] + w*normals[2]); } } diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h index 7df71010232..40cbca243a7 100644 --- a/intern/cycles/kernel/geom/geom_object.h +++ b/intern/cycles/kernel/geom/geom_object.h @@ -123,9 +123,9 @@ ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg ccl_device_inline void object_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P) { #ifdef __OBJECT_MOTION__ - *P = transform_point(&sd->ob_tfm, *P); + *P = transform_point(&ccl_fetch(sd, ob_tfm), *P); #else - Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); + Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM); *P = transform_point(&tfm, *P); #endif } @@ -135,9 +135,9 @@ ccl_device_inline void object_position_transform(KernelGlobals *kg, const Shader ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P) { #ifdef __OBJECT_MOTION__ - *P = transform_point(&sd->ob_itfm, *P); + *P = transform_point(&ccl_fetch(sd, ob_itfm), *P); #else - Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); + Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM); *P = transform_point(&tfm, *P); #endif } @@ -147,9 +147,9 @@ ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, cons ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N) { #ifdef __OBJECT_MOTION__ - *N = normalize(transform_direction_transposed(&sd->ob_tfm, *N)); + *N = normalize(transform_direction_transposed(&ccl_fetch(sd, ob_tfm), *N)); #else - Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); + Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM); *N = normalize(transform_direction_transposed(&tfm, *N)); #endif } @@ -159,9 +159,9 @@ ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N) { #ifdef __OBJECT_MOTION__ - *N = normalize(transform_direction_transposed(&sd->ob_itfm, *N)); + *N = normalize(transform_direction_transposed(&ccl_fetch(sd, ob_itfm), *N)); #else - Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); + Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM); *N = normalize(transform_direction_transposed(&tfm, *N)); #endif } @@ -171,9 +171,9 @@ ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderDa ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D) { #ifdef __OBJECT_MOTION__ - *D = transform_direction(&sd->ob_tfm, *D); + *D = transform_direction(&ccl_fetch(sd, ob_tfm), *D); #else - Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); + Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM); *D = transform_direction(&tfm, *D); #endif } @@ -183,9 +183,9 @@ ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D) { #ifdef __OBJECT_MOTION__ - *D = transform_direction(&sd->ob_itfm, *D); + *D = transform_direction(&ccl_fetch(sd, ob_itfm), *D); #else - Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); + Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM); *D = transform_direction(&tfm, *D); #endif } @@ -194,13 +194,13 @@ ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const Sha ccl_device_inline float3 object_location(KernelGlobals *kg, const ShaderData *sd) { - if(sd->object == OBJECT_NONE) + if(ccl_fetch(sd, object) == OBJECT_NONE) return make_float3(0.0f, 0.0f, 0.0f); #ifdef __OBJECT_MOTION__ - return make_float3(sd->ob_tfm.x.w, sd->ob_tfm.y.w, sd->ob_tfm.z.w); + return make_float3(ccl_fetch(sd, ob_tfm).x.w, ccl_fetch(sd, ob_tfm).y.w, ccl_fetch(sd, ob_tfm).z.w); #else - Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); + Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM); return make_float3(tfm.x.w, tfm.y.w, tfm.z.w); #endif } @@ -296,7 +296,7 @@ ccl_device_inline void object_motion_info(KernelGlobals *kg, int object, int *nu ccl_device int shader_pass_id(KernelGlobals *kg, const ShaderData *sd) { - return kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2 + 1); + return kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*2 + 1); } /* Particle data from which object was instanced */ @@ -377,7 +377,7 @@ ccl_device_inline float3 bvh_inverse_direction(float3 dir) /* Transform ray into object space to enter static object in BVH */ -ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t) +ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t) { Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM); @@ -425,7 +425,7 @@ ccl_device_inline void qbvh_instance_push(KernelGlobals *kg, /* Transorm ray to exit static object in BVH */ -ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t) +ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t) { if(*t != FLT_MAX) { Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM); @@ -520,5 +520,38 @@ ccl_device_inline void bvh_instance_motion_pop_factor(KernelGlobals *kg, int obj #endif +/* TODO(sergey): This is only for until we've got OpenCL 2.0 + * on all devices we consider supported. It'll be replaced with + * generic address space. + */ + +#ifdef __KERNEL_OPENCL__ +ccl_device_inline void object_dir_transform_addrspace(KernelGlobals *kg, + const ShaderData *sd, + ccl_addr_space float3 *D) +{ + float3 private_D = *D; + object_dir_transform(kg, sd, &private_D); + *D = private_D; +} + +ccl_device_inline void object_normal_transform_addrspace(KernelGlobals *kg, + const ShaderData *sd, + ccl_addr_space float3 *N) +{ + float3 private_N = *N; + object_dir_transform(kg, sd, &private_N); + *N = private_N; +} +#endif + +#ifndef __KERNEL_OPENCL__ +# define object_dir_transform_auto object_dir_transform +# define object_normal_transform_auto object_normal_transform +#else +# define object_dir_transform_auto object_dir_transform_addrspace +# define object_normal_transform_auto object_normal_transform_addrspace +#endif + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/geom/geom_primitive.h b/intern/cycles/kernel/geom/geom_primitive.h index d2543c5943e..30f12d32355 100644 --- a/intern/cycles/kernel/geom/geom_primitive.h +++ b/intern/cycles/kernel/geom/geom_primitive.h @@ -25,16 +25,16 @@ CCL_NAMESPACE_BEGIN ccl_device float primitive_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float *dx, float *dy) { - if(sd->type & PRIMITIVE_ALL_TRIANGLE) { + if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) { return triangle_attribute_float(kg, sd, elem, offset, dx, dy); } #ifdef __HAIR__ - else if(sd->type & PRIMITIVE_ALL_CURVE) { + else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { return curve_attribute_float(kg, sd, elem, offset, dx, dy); } #endif #ifdef __VOLUME__ - else if(sd->object != OBJECT_NONE && elem == ATTR_ELEMENT_VOXEL) { + else if(ccl_fetch(sd, object) != OBJECT_NONE && elem == ATTR_ELEMENT_VOXEL) { return volume_attribute_float(kg, sd, elem, offset, dx, dy); } #endif @@ -47,16 +47,16 @@ ccl_device float primitive_attribute_float(KernelGlobals *kg, const ShaderData * ccl_device float3 primitive_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float3 *dx, float3 *dy) { - if(sd->type & PRIMITIVE_ALL_TRIANGLE) { + if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) { return triangle_attribute_float3(kg, sd, elem, offset, dx, dy); } #ifdef __HAIR__ - else if(sd->type & PRIMITIVE_ALL_CURVE) { + else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { return curve_attribute_float3(kg, sd, elem, offset, dx, dy); } #endif #ifdef __VOLUME__ - else if(sd->object != OBJECT_NONE && elem == ATTR_ELEMENT_VOXEL) { + else if(ccl_fetch(sd, object) != OBJECT_NONE && elem == ATTR_ELEMENT_VOXEL) { return volume_attribute_float3(kg, sd, elem, offset, dx, dy); } #endif @@ -108,9 +108,9 @@ ccl_device bool primitive_ptex(KernelGlobals *kg, ShaderData *sd, float2 *uv, in ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd) { #ifdef __HAIR__ - if(sd->type & PRIMITIVE_ALL_CURVE) + if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) #ifdef __DPDU__ - return normalize(sd->dPdu); + return normalize(ccl_fetch(sd, dPdu)); #else return make_float3(0.0f, 0.0f, 0.0f); #endif @@ -124,12 +124,12 @@ ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd) float3 data = primitive_attribute_float3(kg, sd, attr_elem, attr_offset, NULL, NULL); data = make_float3(-(data.y - 0.5f), (data.x - 0.5f), 0.0f); object_normal_transform(kg, sd, &data); - return cross(sd->N, normalize(cross(data, sd->N))); + return cross(ccl_fetch(sd, N), normalize(cross(data, ccl_fetch(sd, N)))); } else { /* otherwise use surface derivatives */ #ifdef __DPDU__ - return normalize(sd->dPdu); + return normalize(ccl_fetch(sd, dPdu)); #else return make_float3(0.0f, 0.0f, 0.0f); #endif @@ -144,16 +144,16 @@ ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd) float3 center; #ifdef __HAIR__ - bool is_curve_primitive = sd->type & PRIMITIVE_ALL_CURVE; + bool is_curve_primitive = ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE; if(is_curve_primitive) { center = curve_motion_center_location(kg, sd); - if(!(sd->flag & SD_TRANSFORM_APPLIED)) + if(!(ccl_fetch(sd, flag) & SD_TRANSFORM_APPLIED)) object_position_transform(kg, sd, ¢er); } else #endif - center = sd->P; + center = ccl_fetch(sd, P); float3 motion_pre = center, motion_post = center; @@ -164,16 +164,16 @@ ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd) if(offset != ATTR_STD_NOT_FOUND) { /* get motion info */ int numverts, numkeys; - object_motion_info(kg, sd->object, NULL, &numverts, &numkeys); + object_motion_info(kg, ccl_fetch(sd, object), NULL, &numverts, &numkeys); /* lookup attributes */ - int offset_next = (sd->type & PRIMITIVE_ALL_TRIANGLE)? offset + numverts: offset + numkeys; + int offset_next = (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)? offset + numverts: offset + numkeys; motion_pre = primitive_attribute_float3(kg, sd, elem, offset, NULL, NULL); motion_post = primitive_attribute_float3(kg, sd, elem, offset_next, NULL, NULL); #ifdef __HAIR__ - if(is_curve_primitive && (sd->flag & SD_OBJECT_HAS_VERTEX_MOTION) == 0) { + if(is_curve_primitive && (ccl_fetch(sd, flag) & SD_OBJECT_HAS_VERTEX_MOTION) == 0) { object_position_transform(kg, sd, &motion_pre); object_position_transform(kg, sd, &motion_post); } @@ -184,10 +184,10 @@ ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd) * transformation was set match the world/object space of motion_pre/post */ Transform tfm; - tfm = object_fetch_vector_transform(kg, sd->object, OBJECT_VECTOR_MOTION_PRE); + tfm = object_fetch_vector_transform(kg, ccl_fetch(sd, object), OBJECT_VECTOR_MOTION_PRE); motion_pre = transform_point(&tfm, motion_pre); - tfm = object_fetch_vector_transform(kg, sd->object, OBJECT_VECTOR_MOTION_POST); + tfm = object_fetch_vector_transform(kg, ccl_fetch(sd, object), OBJECT_VECTOR_MOTION_POST); motion_post = transform_point(&tfm, motion_post); float3 motion_center; diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h index dd3928682e3..995dfac5b09 100644 --- a/intern/cycles/kernel/geom/geom_triangle.h +++ b/intern/cycles/kernel/geom/geom_triangle.h @@ -27,14 +27,14 @@ CCL_NAMESPACE_BEGIN ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd) { /* load triangle vertices */ - float4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); + float4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x))); float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y))); float3 v2 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z))); /* return normal */ - if(sd->flag & SD_NEGATIVE_SCALE_APPLIED) + if(ccl_fetch(sd, flag) & SD_NEGATIVE_SCALE_APPLIED) return normalize(cross(v2 - v0, v1 - v0)); else return normalize(cross(v1 - v0, v2 - v0)); @@ -94,7 +94,7 @@ ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, flo /* Ray differentials on triangle */ -ccl_device_inline void triangle_dPdudv(KernelGlobals *kg, int prim, float3 *dPdu, float3 *dPdv) +ccl_device_inline void triangle_dPdudv(KernelGlobals *kg, int prim, ccl_addr_space float3 *dPdu, ccl_addr_space float3 *dPdv) { /* fetch triangle vertex coordinates */ float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim); @@ -116,34 +116,34 @@ ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *s if(dx) *dx = 0.0f; if(dy) *dy = 0.0f; - return kernel_tex_fetch(__attributes_float, offset + sd->prim); + return kernel_tex_fetch(__attributes_float, offset + ccl_fetch(sd, prim)); } else if(elem == ATTR_ELEMENT_VERTEX || elem == ATTR_ELEMENT_VERTEX_MOTION) { - float4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); + float4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); float f0 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.x)); float f1 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.y)); float f2 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.z)); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2; - if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2; + if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2; + if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2; #endif - return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2; + return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2; } else if(elem == ATTR_ELEMENT_CORNER) { - int tri = offset + sd->prim*3; + int tri = offset + ccl_fetch(sd, prim)*3; float f0 = kernel_tex_fetch(__attributes_float, tri + 0); float f1 = kernel_tex_fetch(__attributes_float, tri + 1); float f2 = kernel_tex_fetch(__attributes_float, tri + 2); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2; - if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2; + if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2; + if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2; #endif - return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2; + return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2; } else { if(dx) *dx = 0.0f; @@ -159,24 +159,24 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f); if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f); - return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + sd->prim)); + return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + ccl_fetch(sd, prim))); } else if(elem == ATTR_ELEMENT_VERTEX || elem == ATTR_ELEMENT_VERTEX_MOTION) { - float4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); + float4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.x))); float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.y))); float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.z))); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2; - if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2; + if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2; + if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2; #endif - return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2; + return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2; } else if(elem == ATTR_ELEMENT_CORNER || elem == ATTR_ELEMENT_CORNER_BYTE) { - int tri = offset + sd->prim*3; + int tri = offset + ccl_fetch(sd, prim)*3; float3 f0, f1, f2; if(elem == ATTR_ELEMENT_CORNER) { @@ -191,11 +191,11 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData } #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2; - if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2; + if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2; + if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2; #endif - return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2; + return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2; } else { if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f); diff --git a/intern/cycles/kernel/kernel.cl b/intern/cycles/kernel/kernel.cl index 5a47260a4ee..19e394936ee 100644 --- a/intern/cycles/kernel/kernel.cl +++ b/intern/cycles/kernel/kernel.cl @@ -25,6 +25,8 @@ #include "kernel_path.h" #include "kernel_bake.h" +#ifdef __COMPILE_ONLY_MEGAKERNEL__ + __kernel void kernel_ocl_path_trace( ccl_constant KernelData *data, ccl_global float *buffer, @@ -52,17 +54,18 @@ __kernel void kernel_ocl_path_trace( kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride); } -__kernel void kernel_ocl_convert_to_byte( +#else // __COMPILE_ONLY_MEGAKERNEL__ + +__kernel void kernel_ocl_shader( ccl_constant KernelData *data, - ccl_global uchar4 *rgba, - ccl_global float *buffer, + ccl_global uint4 *input, + ccl_global float4 *output, #define KERNEL_TEX(type, ttype, name) \ ccl_global type *name, #include "kernel_textures.h" - float sample_scale, - int sx, int sy, int sw, int sh, int offset, int stride) + int type, int sx, int sw, int offset, int sample) { KernelGlobals kglobals, *kg = &kglobals; @@ -73,23 +76,21 @@ __kernel void kernel_ocl_convert_to_byte( #include "kernel_textures.h" int x = sx + get_global_id(0); - int y = sy + get_global_id(1); - if(x < sx + sw && y < sy + sh) - kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride); + if(x < sx + sw) + kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x, sample); } -__kernel void kernel_ocl_convert_to_half_float( +__kernel void kernel_ocl_bake( ccl_constant KernelData *data, - ccl_global uchar4 *rgba, - ccl_global float *buffer, + ccl_global uint4 *input, + ccl_global float4 *output, #define KERNEL_TEX(type, ttype, name) \ ccl_global type *name, #include "kernel_textures.h" - float sample_scale, - int sx, int sy, int sw, int sh, int offset, int stride) + int type, int sx, int sw, int offset, int sample) { KernelGlobals kglobals, *kg = &kglobals; @@ -100,22 +101,22 @@ __kernel void kernel_ocl_convert_to_half_float( #include "kernel_textures.h" int x = sx + get_global_id(0); - int y = sy + get_global_id(1); - if(x < sx + sw && y < sy + sh) - kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride); + if(x < sx + sw) + kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, x, offset, sample); } -__kernel void kernel_ocl_shader( +__kernel void kernel_ocl_convert_to_byte( ccl_constant KernelData *data, - ccl_global uint4 *input, - ccl_global float4 *output, + ccl_global uchar4 *rgba, + ccl_global float *buffer, #define KERNEL_TEX(type, ttype, name) \ ccl_global type *name, #include "kernel_textures.h" - int type, int sx, int sw, int offset, int sample) + float sample_scale, + int sx, int sy, int sw, int sh, int offset, int stride) { KernelGlobals kglobals, *kg = &kglobals; @@ -126,21 +127,23 @@ __kernel void kernel_ocl_shader( #include "kernel_textures.h" int x = sx + get_global_id(0); + int y = sy + get_global_id(1); - if(x < sx + sw) - kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x, sample); + if(x < sx + sw && y < sy + sh) + kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride); } -__kernel void kernel_ocl_bake( +__kernel void kernel_ocl_convert_to_half_float( ccl_constant KernelData *data, - ccl_global uint4 *input, - ccl_global float4 *output, + ccl_global uchar4 *rgba, + ccl_global float *buffer, #define KERNEL_TEX(type, ttype, name) \ ccl_global type *name, #include "kernel_textures.h" - int type, int sx, int sw, int offset, int sample) + float sample_scale, + int sx, int sy, int sw, int sh, int offset, int stride) { KernelGlobals kglobals, *kg = &kglobals; @@ -151,8 +154,10 @@ __kernel void kernel_ocl_bake( #include "kernel_textures.h" int x = sx + get_global_id(0); + int y = sy + get_global_id(1); - if(x < sx + sw) - kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, x, offset, sample); + if(x < sx + sw && y < sy + sh) + kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride); } +#endif // __COMPILE_ONLY_MEGAKERNEL__
\ No newline at end of file diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h index 369c615eade..257728b6244 100644 --- a/intern/cycles/kernel/kernel_accumulate.h +++ b/intern/cycles/kernel/kernel_accumulate.h @@ -176,7 +176,7 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass) #endif } -ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, float3 *throughput, +ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space float3 *throughput, BsdfEval *bsdf_eval, float bsdf_pdf, int bounce, int bsdf_label) { float inverse_pdf = 1.0f/bsdf_pdf; diff --git a/intern/cycles/kernel/kernel_background_buffer_update.cl b/intern/cycles/kernel/kernel_background_buffer_update.cl new file mode 100644 index 00000000000..bf08477cfbf --- /dev/null +++ b/intern/cycles/kernel/kernel_background_buffer_update.cl @@ -0,0 +1,282 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_split.h" + +/* + * Note on kernel_ocl_path_trace_background_buffer_update kernel. + * This is the fourth kernel in the ray tracing logic, and the third + * of the path iteration kernels. This kernel takes care of rays that hit + * the background (sceneintersect kernel), and for the rays of + * state RAY_UPDATE_BUFFER it updates the ray's accumulated radiance in + * the output buffer. This kernel also takes care of rays that have been determined + * to-be-regenerated. + * + * We will empty QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue in this kernel + * + * Typically all rays that are in state RAY_HIT_BACKGROUND, RAY_UPDATE_BUFFER + * will be eventually set to RAY_TO_REGENERATE state in this kernel. Finally all rays of ray_state + * RAY_TO_REGENERATE will be regenerated and put in queue QUEUE_ACTIVE_AND_REGENERATED_RAYS. + * + * The input and output are as follows, + * + * rng_coop ---------------------------------------------|--- kernel_ocl_path_trace_background_buffer_update --|--- PathRadiance_coop + * throughput_coop --------------------------------------| |--- L_transparent_coop + * per_sample_output_buffers ----------------------------| |--- per_sample_output_buffers + * Ray_coop ---------------------------------------------| |--- ray_state + * PathState_coop ---------------------------------------| |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) + * L_transparent_coop -----------------------------------| |--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) + * ray_state --------------------------------------------| |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) + * Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ----| |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) + * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------| |--- work_array + * parallel_samples -------------------------------------| |--- PathState_coop + * end_sample -------------------------------------------| |--- throughput_coop + * kg (globals + data) ----------------------------------| |--- rng_coop + * rng_state --------------------------------------------| |--- Ray + * PathRadiance_coop ------------------------------------| | + * sw ---------------------------------------------------| | + * sh ---------------------------------------------------| | + * sx ---------------------------------------------------| | + * sy ---------------------------------------------------| | + * stride -----------------------------------------------| | + * work_array -------------------------------------------| |--- work_array + * queuesize --------------------------------------------| | + * start_sample -----------------------------------------| |--- work_pool_wgs + * work_pool_wgs ----------------------------------------| | + * num_samples ------------------------------------------| | + * + * note on shader_data : shader_data argument is neither an input nor an output for this kernel. It is just filled and consumed here itself. + * Note on Queues : + * This kernel fetches rays from QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. + * + * State of queues when this kernel is called : + * At entry, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND, RAY_TO_REGENERATE rays + * At exit, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty + */ +__kernel void kernel_ocl_path_trace_background_buffer_update( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global char *shader_data, + ccl_global float *per_sample_output_buffers, + ccl_global uint *rng_state, + ccl_global uint *rng_coop, /* Required for buffer Update */ + ccl_global float3 *throughput_coop, /* Required for background hit processing */ + PathRadiance *PathRadiance_coop, /* Required for background hit processing and buffer Update */ + ccl_global Ray *Ray_coop, /* Required for background hit processing */ + ccl_global PathState *PathState_coop, /* Required for background hit processing */ + ccl_global float *L_transparent_coop, /* Required for background hit processing and buffer Update */ + ccl_global char *ray_state, /* Stores information on the current state of a ray */ + int sw, int sh, int sx, int sy, int stride, + int rng_state_offset_x, + int rng_state_offset_y, + int rng_state_stride, + ccl_global unsigned int *work_array, /* Denotes work of each ray */ + ccl_global int *Queue_data, /* Queues memory */ + ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ + int queuesize, /* Size (capacity) of each queue */ + int end_sample, + int start_sample, +#ifdef __WORK_STEALING__ + ccl_global unsigned int *work_pool_wgs, + unsigned int num_samples, +#endif +#ifdef __KERNEL_DEBUG__ + DebugData *debugdata_coop, +#endif + int parallel_samples /* Number of samples to be processed in parallel */ + ) +{ + ccl_local unsigned int local_queue_atomics; + if(get_local_id(0) == 0 && get_local_id(1) == 0) { + local_queue_atomics = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); + if(ray_index == 0) { + /* We will empty this queue in this kernel */ + Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; + } + char enqueue_flag = 0; + ray_index = get_ray_index(ray_index, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, Queue_data, queuesize, 1); + +#ifdef __COMPUTE_DEVICE_GPU__ + /* If we are executing on a GPU device, we exit all threads that are not required. + * If we are executing on a CPU device, then we need to keep all threads active + * since we have barrier() calls later in the kernel. CPU devices + * expect all threads to execute barrier statement. + */ + if(ray_index == QUEUE_EMPTY_SLOT) + return; +#endif + +#ifndef __COMPUTE_DEVICE_GPU__ + if(ray_index != QUEUE_EMPTY_SLOT) { +#endif + /* Load kernel globals structure and ShaderData strucuture */ + KernelGlobals *kg = (KernelGlobals *)globals; + ShaderData *sd = (ShaderData *)shader_data; + +#ifdef __KERNEL_DEBUG__ + DebugData *debug_data = &debugdata_coop[ray_index]; +#endif + ccl_global PathState *state = &PathState_coop[ray_index]; + PathRadiance *L = L = &PathRadiance_coop[ray_index]; + ccl_global Ray *ray = &Ray_coop[ray_index]; + ccl_global float3 *throughput = &throughput_coop[ray_index]; + ccl_global float *L_transparent = &L_transparent_coop[ray_index]; + ccl_global uint *rng = &rng_coop[ray_index]; + +#ifdef __WORK_STEALING__ + unsigned int my_work; + ccl_global float *initial_per_sample_output_buffers; + ccl_global uint *initial_rng; +#endif + unsigned int sample; + unsigned int tile_x; + unsigned int tile_y; + unsigned int pixel_x; + unsigned int pixel_y; + unsigned int my_sample_tile; + +#ifdef __WORK_STEALING__ + my_work = work_array[ray_index]; + sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; + get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index); + my_sample_tile = 0; + initial_per_sample_output_buffers = per_sample_output_buffers; + initial_rng = rng_state; +#else // __WORK_STEALING__ + sample = work_array[ray_index]; + int tile_index = ray_index / parallel_samples; + /* buffer and rng_state's stride is "stride". Find x and y using ray_index */ + tile_x = tile_index % sw; + tile_y = tile_index / sw; + my_sample_tile = ray_index - (tile_index * parallel_samples); +#endif + rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride; + per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride; + + if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { + /* eval background shader if nothing hit */ + if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) { + *L_transparent = (*L_transparent) + average((*throughput)); +#ifdef __PASSES__ + if(!(kernel_data.film.pass_flag & PASS_BACKGROUND)) +#endif + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + } + + if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) + { +#ifdef __BACKGROUND__ + /* sample background shader */ + float3 L_background = indirect_background(kg, state, ray, sd); + path_radiance_accum_background(L, (*throughput), L_background, state->bounce); +#endif + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + } + } + + if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { + float3 L_sum = path_radiance_clamp_and_sum(kg, L); + kernel_write_light_passes(kg, per_sample_output_buffers, L, sample); +#ifdef __KERNEL_DEBUG__ + kernel_write_debug_passes(kg, per_sample_output_buffers, state, debug_data, sample); +#endif + float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent)); + + /* accumulate result in output buffer */ + kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad); + path_rng_end(kg, rng_state, *rng); + + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); + } + + if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { +#ifdef __WORK_STEALING__ + /* We have completed current work; So get next work */ + int valid_work = get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index); + if(!valid_work) { + /* If work is invalid, this means no more work is available and the thread may exit */ + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE); + } +#else + if((sample + parallel_samples) >= end_sample) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE); + } +#endif + if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { +#ifdef __WORK_STEALING__ + work_array[ray_index] = my_work; + /* Get the sample associated with the current work */ + sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; + /* Get pixel and tile position associated with current work */ + get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index); + my_sample_tile = 0; + + /* Remap rng_state according to the current work */ + rng_state = initial_rng + ((rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride); + /* Remap per_sample_output_buffers according to the current work */ + per_sample_output_buffers = initial_per_sample_output_buffers + + (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride; +#else + work_array[ray_index] = sample + parallel_samples; + sample = work_array[ray_index]; + + /* Get ray position from ray index */ + pixel_x = sx + ((ray_index / parallel_samples) % sw); + pixel_y = sy + ((ray_index / parallel_samples) / sw); +#endif + + /* initialize random numbers and ray */ + kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, rng, ray); + + if(ray->t != 0.0f) { + /* Initialize throughput, L_transparent, Ray, PathState; These rays proceed with path-iteration*/ + *throughput = make_float3(1.0f, 1.0f, 1.0f); + *L_transparent = 0.0f; + path_radiance_init(L, kernel_data.film.use_light_pass); + path_state_init(kg, state, rng, sample, ray); +#ifdef __KERNEL_DEBUG__ + debug_data_init(debug_data); +#endif + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED); + enqueue_flag = 1; + } else { + /*These rays do not participate in path-iteration */ + float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + /* accumulate result in output buffer */ + kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad); + path_rng_end(kg, rng_state, *rng); + + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); + } + } + } +#ifndef __COMPUTE_DEVICE_GPU__ + } +#endif + + /* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS; These rays + * will be made active during next SceneIntersectkernel + */ + enqueue_ray_index_local(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, enqueue_flag, queuesize, &local_queue_atomics, Queue_data, Queue_index); +} diff --git a/intern/cycles/kernel/kernel_camera.h b/intern/cycles/kernel/kernel_camera.h index 1e81210007c..3ce5134181a 100644 --- a/intern/cycles/kernel/kernel_camera.h +++ b/intern/cycles/kernel/kernel_camera.h @@ -39,7 +39,7 @@ ccl_device float2 camera_sample_aperture(KernelGlobals *kg, float u, float v) return bokeh; } -ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, Ray *ray) +ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, ccl_addr_space Ray *ray) { /* create ray form raster position */ Transform rastertocamera = kernel_data.cam.rastertocamera; @@ -108,8 +108,7 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, flo } /* Orthographic Camera */ - -ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, Ray *ray) +ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, ccl_addr_space Ray *ray) { /* create ray form raster position */ Transform rastertocamera = kernel_data.cam.rastertocamera; @@ -175,7 +174,7 @@ ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, fl /* Panorama Camera */ -ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, Ray *ray) +ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, ccl_addr_space Ray *ray) { Transform rastertocamera = kernel_data.cam.rastertocamera; float3 Pcamera = transform_perspective(&rastertocamera, make_float3(raster_x, raster_y, 0.0f)); @@ -256,7 +255,7 @@ ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float /* Common */ ccl_device void camera_sample(KernelGlobals *kg, int x, int y, float filter_u, float filter_v, - float lens_u, float lens_v, float time, Ray *ray) + float lens_u, float lens_v, float time, ccl_addr_space Ray *ray) { /* pixel filter */ int filter_table_offset = kernel_data.film.filter_table_offset; @@ -319,7 +318,7 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd, { if(kernel_data.cam.type != CAMERA_PANORAMA) { /* perspective / ortho */ - if(sd->object == PRIM_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE) + if(ccl_fetch(sd, object) == PRIM_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE) P += camera_position(kg); Transform tfm = kernel_data.cam.worldtondc; @@ -329,7 +328,7 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd, /* panorama */ Transform tfm = kernel_data.cam.worldtocamera; - if(sd->object != OBJECT_NONE) + if(ccl_fetch(sd, object) != OBJECT_NONE) P = normalize(transform_point(&tfm, P)); else P = normalize(transform_direction(&tfm, P)); diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h index b209fff88e6..7a5f70ff3da 100644 --- a/intern/cycles/kernel/kernel_compat_cpu.h +++ b/intern/cycles/kernel/kernel_compat_cpu.h @@ -40,6 +40,8 @@ #include "util_half.h" #include "util_types.h" +#define ccl_addr_space + /* On x86_64, versions of glibc < 2.16 have an issue where expf is * much slower than the double version. This was fixed in glibc 2.16. */ diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index 61e208fcab3..9fdd3abfec3 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -41,6 +41,7 @@ #define ccl_global #define ccl_constant #define ccl_may_alias +#define ccl_addr_space /* No assert supported for CUDA */ diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h index 12b0f117600..e8b36d2605d 100644 --- a/intern/cycles/kernel/kernel_compat_opencl.h +++ b/intern/cycles/kernel/kernel_compat_opencl.h @@ -40,6 +40,12 @@ #define ccl_local __local #define ccl_private __private +#ifdef __SPLIT_KERNEL__ +#define ccl_addr_space __global +#else +#define ccl_addr_space +#endif + /* Selective nodes compilation. */ #ifndef __NODES_MAX_GROUP__ # define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX diff --git a/intern/cycles/kernel/kernel_data_init.cl b/intern/cycles/kernel/kernel_data_init.cl new file mode 100644 index 00000000000..dbf9e62ccbf --- /dev/null +++ b/intern/cycles/kernel/kernel_data_init.cl @@ -0,0 +1,384 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_split.h" + +/* + * Note on kernel_ocl_path_trace_data_initialization kernel + * This kernel Initializes structures needed in path-iteration kernels. + * This is the first kernel in ray-tracing logic. + * + * Ray state of rays outside the tile-boundary will be marked RAY_INACTIVE + * + * Its input and output are as follows, + * + * Un-initialized rng---------------|--- kernel_ocl_path_trace_data_initialization ---|--- Initialized rng + * Un-initialized throughput -------| |--- Initialized throughput + * Un-initialized L_transparent ----| |--- Initialized L_transparent + * Un-initialized PathRadiance -----| |--- Initialized PathRadiance + * Un-initialized Ray --------------| |--- Initialized Ray + * Un-initialized PathState --------| |--- Initialized PathState + * Un-initialized QueueData --------| |--- Initialized QueueData (to QUEUE_EMPTY_SLOT) + * Un-initilaized QueueIndex -------| |--- Initialized QueueIndex (to 0) + * Un-initialized use_queues_flag---| |--- Initialized use_queues_flag (to false) + * Un-initialized ray_state --------| |--- Initialized ray_state + * parallel_samples --------------- | |--- Initialized per_sample_output_buffers + * rng_state -----------------------| |--- Initialized work_array + * data ----------------------------| |--- Initialized work_pool_wgs + * start_sample --------------------| | + * sx ------------------------------| | + * sy ------------------------------| | + * sw ------------------------------| | + * sh ------------------------------| | + * stride --------------------------| | + * queuesize -----------------------| | + * num_samples ---------------------| | + * + * Note on Queues : + * All slots in queues are initialized to queue empty slot; + * The number of elements in the queues is initialized to 0; + */ +__kernel void kernel_ocl_path_trace_data_initialization( + ccl_global char *globals, + ccl_global char *shader_data_sd, /* Arguments related to ShaderData */ + ccl_global char *shader_data_sd_DL_shadow, /* Arguments related to ShaderData */ + + ccl_global float3 *P_sd, + ccl_global float3 *P_sd_DL_shadow, + + ccl_global float3 *N_sd, + ccl_global float3 *N_sd_DL_shadow, + + ccl_global float3 *Ng_sd, + ccl_global float3 *Ng_sd_DL_shadow, + + ccl_global float3 *I_sd, + ccl_global float3 *I_sd_DL_shadow, + + ccl_global int *shader_sd, + ccl_global int *shader_sd_DL_shadow, + + ccl_global int *flag_sd, + ccl_global int *flag_sd_DL_shadow, + + ccl_global int *prim_sd, + ccl_global int *prim_sd_DL_shadow, + + ccl_global int *type_sd, + ccl_global int *type_sd_DL_shadow, + + ccl_global float *u_sd, + ccl_global float *u_sd_DL_shadow, + + ccl_global float *v_sd, + ccl_global float *v_sd_DL_shadow, + + ccl_global int *object_sd, + ccl_global int *object_sd_DL_shadow, + + ccl_global float *time_sd, + ccl_global float *time_sd_DL_shadow, + + ccl_global float *ray_length_sd, + ccl_global float *ray_length_sd_DL_shadow, + + ccl_global int *ray_depth_sd, + ccl_global int *ray_depth_sd_DL_shadow, + + ccl_global int *transparent_depth_sd, + ccl_global int *transparent_depth_sd_DL_shadow, + #ifdef __RAY_DIFFERENTIALS__ + ccl_global differential3 *dP_sd, + ccl_global differential3 *dP_sd_DL_shadow, + + ccl_global differential3 *dI_sd, + ccl_global differential3 *dI_sd_DL_shadow, + + ccl_global differential *du_sd, + ccl_global differential *du_sd_DL_shadow, + + ccl_global differential *dv_sd, + ccl_global differential *dv_sd_DL_shadow, + #endif + #ifdef __DPDU__ + ccl_global float3 *dPdu_sd, + ccl_global float3 *dPdu_sd_DL_shadow, + + ccl_global float3 *dPdv_sd, + ccl_global float3 *dPdv_sd_DL_shadow, + #endif + ShaderClosure *closure_sd, + ShaderClosure *closure_sd_DL_shadow, + + ccl_global int *num_closure_sd, + ccl_global int *num_closure_sd_DL_shadow, + + ccl_global float *randb_closure_sd, + ccl_global float *randb_closure_sd_DL_shadow, + + ccl_global float3 *ray_P_sd, + ccl_global float3 *ray_P_sd_DL_shadow, + + ccl_global differential3 *ray_dP_sd, + ccl_global differential3 *ray_dP_sd_DL_shadow, + + ccl_constant KernelData *data, + ccl_global float *per_sample_output_buffers, + ccl_global uint *rng_state, + ccl_global uint *rng_coop, /* rng array to store rng values for all rays */ + ccl_global float3 *throughput_coop, /* throughput array to store throughput values for all rays */ + ccl_global float *L_transparent_coop, /* L_transparent array to store L_transparent values for all rays */ + PathRadiance *PathRadiance_coop, /* PathRadiance array to store PathRadiance values for all rays */ + ccl_global Ray *Ray_coop, /* Ray array to store Ray information for all rays */ + ccl_global PathState *PathState_coop, /* PathState array to store PathState information for all rays */ + ccl_global char *ray_state, /* Stores information on current state of a ray */ + +#define KERNEL_TEX(type, ttype, name) \ + ccl_global type *name, +#include "kernel_textures.h" + + int start_sample, int sx, int sy, int sw, int sh, int offset, int stride, + int rng_state_offset_x, + int rng_state_offset_y, + int rng_state_stride, + ccl_global int *Queue_data, /* Memory for queues */ + ccl_global int *Queue_index, /* Tracks the number of elements in queues */ + int queuesize, /* size (capacity) of the queue */ + ccl_global char *use_queues_flag, /* flag to decide if scene-intersect kernel should use queues to fetch ray index */ + ccl_global unsigned int *work_array, /* work array to store which work each ray belongs to */ +#ifdef __WORK_STEALING__ + ccl_global unsigned int *work_pool_wgs, /* Work pool for each work group */ + unsigned int num_samples, /* Total number of samples per pixel */ +#endif +#ifdef __KERNEL_DEBUG__ + DebugData *debugdata_coop, +#endif + int parallel_samples /* Number of samples to be processed in parallel */ + ) +{ + + /* Load kernel globals structure */ + KernelGlobals *kg = (KernelGlobals *)globals; + + kg->data = data; +#define KERNEL_TEX(type, ttype, name) \ + kg->name = name; +#include "kernel_textures.h" + + /* Load ShaderData structure */ + ShaderData *sd = (ShaderData *)shader_data_sd; + ShaderData *sd_DL_shadow = (ShaderData *)shader_data_sd_DL_shadow; + + sd->P = P_sd; + sd_DL_shadow->P = P_sd_DL_shadow; + + sd->N = N_sd; + sd_DL_shadow->N = N_sd_DL_shadow; + + sd->Ng = Ng_sd; + sd_DL_shadow->Ng = Ng_sd_DL_shadow; + + sd->I = I_sd; + sd_DL_shadow->I = I_sd_DL_shadow; + + sd->shader = shader_sd; + sd_DL_shadow->shader = shader_sd_DL_shadow; + + sd->flag = flag_sd; + sd_DL_shadow->flag = flag_sd_DL_shadow; + + sd->prim = prim_sd; + sd_DL_shadow->prim = prim_sd_DL_shadow; + + sd->type = type_sd; + sd_DL_shadow->type = type_sd_DL_shadow; + + sd->u = u_sd; + sd_DL_shadow->u = u_sd_DL_shadow; + + sd->v = v_sd; + sd_DL_shadow->v = v_sd_DL_shadow; + + sd->object = object_sd; + sd_DL_shadow->object = object_sd_DL_shadow; + + sd->time = time_sd; + sd_DL_shadow->time = time_sd_DL_shadow; + + sd->ray_length = ray_length_sd; + sd_DL_shadow->ray_length = ray_length_sd_DL_shadow; + + sd->ray_depth = ray_depth_sd; + sd_DL_shadow->ray_depth = ray_depth_sd_DL_shadow; + + sd->transparent_depth = transparent_depth_sd; + sd_DL_shadow->transparent_depth = transparent_depth_sd_DL_shadow; + +#ifdef __RAY_DIFFERENTIALS__ + sd->dP = dP_sd; + sd_DL_shadow->dP = dP_sd_DL_shadow; + + sd->dI = dI_sd; + sd_DL_shadow->dI = dI_sd_DL_shadow; + + sd->du = du_sd; + sd_DL_shadow->du = du_sd_DL_shadow; + + sd->dv = dv_sd; + sd_DL_shadow->dv = dv_sd_DL_shadow; +#ifdef __DPDU__ + sd->dPdu = dPdu_sd; + sd_DL_shadow->dPdu = dPdu_sd_DL_shadow; + + sd->dPdv = dPdv_sd; + sd_DL_shadow->dPdv = dPdv_sd_DL_shadow; +#endif +#endif + + sd->closure = closure_sd; + sd_DL_shadow->closure = closure_sd_DL_shadow; + + sd->num_closure = num_closure_sd; + sd_DL_shadow->num_closure = num_closure_sd_DL_shadow; + + sd->randb_closure = randb_closure_sd; + sd_DL_shadow->randb_closure = randb_closure_sd_DL_shadow; + + sd->ray_P = ray_P_sd; + sd_DL_shadow->ray_P = ray_P_sd_DL_shadow; + + sd->ray_dP = ray_dP_sd; + sd_DL_shadow->ray_dP = ray_dP_sd_DL_shadow; + + int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); + +#ifdef __WORK_STEALING__ + int lid = get_local_id(1) * get_local_size(0) + get_local_id(0); + /* Initialize work_pool_wgs */ + if(lid == 0) { + int group_index = get_group_id(1) * get_num_groups(0) + get_group_id(0); + work_pool_wgs[group_index] = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); +#endif // __WORK_STEALING__ + + /* Initialize queue data and queue index */ + if(thread_index < queuesize) { + /* Initialize active ray queue */ + Queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; + /* Initialize background and buffer update queue */ + Queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; + /* Initialize shadow ray cast of AO queue */ + Queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; + /* Initialize shadow ray cast of direct lighting queue */ + Queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; + } + + if(thread_index == 0) { + Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; + Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; + Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0; + Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0; + /* The scene-intersect kernel should not use the queues very first time. + * since the queue would be empty. + */ + use_queues_flag[0] = 0; + } + + int x = get_global_id(0); + int y = get_global_id(1); + + if(x < (sw * parallel_samples) && y < sh) { + + int ray_index = x + y * (sw * parallel_samples); + + /* This is the first assignment to ray_state; So we dont use ASSIGN_RAY_STATE macro */ + ray_state[ray_index] = RAY_ACTIVE; + + unsigned int my_sample; + unsigned int pixel_x; + unsigned int pixel_y; + unsigned int tile_x; + unsigned int tile_y; + unsigned int my_sample_tile; + +#ifdef __WORK_STEALING__ + unsigned int my_work = 0; + /* get work */ + get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index); + /* Get the sample associated with the work */ + my_sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; + + my_sample_tile = 0; + + /* Get pixel and tile position associated with the work */ + get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index); + work_array[ray_index] = my_work; +#else // __WORK_STEALING__ + + unsigned int tile_index = ray_index / parallel_samples; + tile_x = tile_index % sw; + tile_y = tile_index / sw; + my_sample_tile = ray_index - (tile_index * parallel_samples); + my_sample = my_sample_tile + start_sample; + + /* Initialize work array */ + work_array[ray_index] = my_sample ; + + /* Calculate pixel position of this ray */ + pixel_x = sx + tile_x; + pixel_y = sy + tile_y; +#endif // __WORK_STEALING__ + + rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride; + + /* Initialise per_sample_output_buffers to all zeros */ + per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + (my_sample_tile)) * kernel_data.film.pass_stride; + int per_sample_output_buffers_iterator = 0; + for(per_sample_output_buffers_iterator = 0; per_sample_output_buffers_iterator < kernel_data.film.pass_stride; per_sample_output_buffers_iterator++) { + per_sample_output_buffers[per_sample_output_buffers_iterator] = 0.0f; + } + + /* initialize random numbers and ray */ + kernel_path_trace_setup(kg, rng_state, my_sample, pixel_x, pixel_y, &rng_coop[ray_index], &Ray_coop[ray_index]); + + if(Ray_coop[ray_index].t != 0.0f) { + /* Initialize throuput, L_transparent, Ray, PathState; These rays proceed with path-iteration*/ + throughput_coop[ray_index] = make_float3(1.0f, 1.0f, 1.0f); + L_transparent_coop[ray_index] = 0.0f; + path_radiance_init(&PathRadiance_coop[ray_index], kernel_data.film.use_light_pass); + path_state_init(kg, &PathState_coop[ray_index], &rng_coop[ray_index], my_sample, &Ray_coop[ray_index]); +#ifdef __KERNEL_DEBUG__ + debug_data_init(&debugdata_coop[ray_index]); +#endif + } else { + /*These rays do not participate in path-iteration */ + + float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + /* accumulate result in output buffer */ + kernel_write_pass_float4(per_sample_output_buffers, my_sample, L_rad); + path_rng_end(kg, rng_state, rng_coop[ray_index]); + + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); + } + } + + /* Mark rest of the ray-state indices as RAY_INACTIVE */ + if(thread_index < (get_global_size(0) * get_global_size(1)) - (sh * (sw * parallel_samples))) { + /* First assignment, hence we dont use ASSIGN_RAY_STATE macro */ + ray_state[((sw * parallel_samples) * sh) + thread_index] = RAY_INACTIVE; + } +} diff --git a/intern/cycles/kernel/kernel_debug.h b/intern/cycles/kernel/kernel_debug.h index f532442ba41..94ede397848 100644 --- a/intern/cycles/kernel/kernel_debug.h +++ b/intern/cycles/kernel/kernel_debug.h @@ -23,7 +23,7 @@ ccl_device_inline void debug_data_init(DebugData *debug_data) ccl_device_inline void kernel_write_debug_passes(KernelGlobals *kg, ccl_global float *buffer, - PathState *state, + ccl_addr_space PathState *state, DebugData *debug_data, int sample) { diff --git a/intern/cycles/kernel/kernel_differential.h b/intern/cycles/kernel/kernel_differential.h index e5fbd5b450e..ae1e70f0167 100644 --- a/intern/cycles/kernel/kernel_differential.h +++ b/intern/cycles/kernel/kernel_differential.h @@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN /* See "Tracing Ray Differentials", Homan Igehy, 1999. */ -ccl_device void differential_transfer(differential3 *dP_, const differential3 dP, float3 D, const differential3 dD, float3 Ng, float t) +ccl_device void differential_transfer(ccl_addr_space differential3 *dP_, const differential3 dP, float3 D, const differential3 dD, float3 Ng, float t) { /* ray differential transfer through homogeneous medium, to * compute dPdx/dy at a shading point from the incoming ray */ @@ -31,7 +31,7 @@ ccl_device void differential_transfer(differential3 *dP_, const differential3 dP dP_->dy = tmpy - dot(tmpy, Ng)*tmp; } -ccl_device void differential_incoming(differential3 *dI, const differential3 dD) +ccl_device void differential_incoming(ccl_addr_space differential3 *dI, const differential3 dD) { /* compute dIdx/dy at a shading point, we just need to negate the * differential of the ray direction */ @@ -40,7 +40,7 @@ ccl_device void differential_incoming(differential3 *dI, const differential3 dD) dI->dy = -dD.dy; } -ccl_device void differential_dudv(differential *du, differential *dv, float3 dPdu, float3 dPdv, differential3 dP, float3 Ng) +ccl_device void differential_dudv(ccl_addr_space differential *du, ccl_addr_space differential *dv, float3 dPdu, float3 dPdv, differential3 dP, float3 Ng) { /* now we have dPdx/dy from the ray differential transfer, and dPdu/dv * from the primitive, we can compute dudx/dy and dvdx/dy. these are diff --git a/intern/cycles/kernel/kernel_direct_lighting.cl b/intern/cycles/kernel/kernel_direct_lighting.cl new file mode 100644 index 00000000000..8bdc7dc0fd1 --- /dev/null +++ b/intern/cycles/kernel/kernel_direct_lighting.cl @@ -0,0 +1,137 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_split.h" + +/* + * Note on kernel_ocl_path_trace_direct_lighting kernel. + * This is the eighth kernel in the ray tracing logic. This is the seventh + * of the path iteration kernels. This kernel takes care of direct lighting + * logic. However, the "shadow ray cast" part of direct lighting is handled + * in the next kernel. + * + * This kernels determines the rays for which a shadow_blocked() function associated with direct lighting should be executed. + * Those rays for which a shadow_blocked() function for direct-lighting must be executed, are marked with flag RAY_SHADOW_RAY_CAST_DL and + * enqueued into the queue QUEUE_SHADOW_RAY_CAST_DL_RAYS + * + * The input and output are as follows, + * + * rng_coop -----------------------------------------|--- kernel_ocl_path_trace_direct_lighting --|--- BSDFEval_coop + * PathState_coop -----------------------------------| |--- ISLamp_coop + * shader_data --------------------------------------| |--- LightRay_coop + * ray_state ----------------------------------------| |--- ray_state + * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---| | + * kg (globals + data) ------------------------------| | + * queuesize ----------------------------------------| | + * + * note on shader_DL : shader_DL is neither input nor output to this kernel; shader_DL is filled and consumed in this kernel itself. + * Note on Queues : + * This kernel only reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes + * only the rays of state RAY_ACTIVE; If a ray needs to execute the corresponding shadow_blocked + * part, after direct lighting, the ray is marked with RAY_SHADOW_RAY_CAST_DL flag. + * + * State of queues when this kernel is called : + * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same + * before and after this kernel call. + * QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will be filled with rays for which a shadow_blocked function must be executed, after this + * kernel call. Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty. + */ +__kernel void kernel_ocl_path_trace_direct_lighting( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global char *shader_data, /* Required for direct lighting */ + ccl_global char *shader_DL, /* Required for direct lighting */ + ccl_global uint *rng_coop, /* Required for direct lighting */ + ccl_global PathState *PathState_coop, /* Required for direct lighting */ + ccl_global int *ISLamp_coop, /* Required for direct lighting */ + ccl_global Ray *LightRay_coop, /* Required for direct lighting */ + ccl_global BsdfEval *BSDFEval_coop, /* Required for direct lighting */ + ccl_global char *ray_state, /* Denotes the state of each ray */ + ccl_global int *Queue_data, /* Queue memory */ + ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ + int queuesize /* Size (capacity) of each queue */ + ) +{ + ccl_local unsigned int local_queue_atomics; + if(get_local_id(0) == 0 && get_local_id(1) == 0) { + local_queue_atomics = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + char enqueue_flag = 0; + int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); + ray_index = get_ray_index(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0); + +#ifdef __COMPUTE_DEVICE_GPU__ + /* If we are executing on a GPU device, we exit all threads that are not required + * If we are executing on a CPU device, then we need to keep all threads active + * since we have barrier() calls later in the kernel. CPU devices, + * expect all threads to execute barrier statement. + */ + if(ray_index == QUEUE_EMPTY_SLOT) + return; +#endif + +#ifndef __COMPUTE_DEVICE_GPU__ + if(ray_index != QUEUE_EMPTY_SLOT) { +#endif + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + /* Load kernel globals structure and ShaderData structure */ + KernelGlobals *kg = (KernelGlobals *)globals; + ShaderData *sd = (ShaderData *)shader_data; + ShaderData *sd_DL = (ShaderData *)shader_DL; + + ccl_global PathState *state = &PathState_coop[ray_index]; + + /* direct lighting */ +#ifdef __EMISSION__ + if((kernel_data.integrator.use_direct_light && (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL))) { + /* sample illumination from lights to find path contribution */ + ccl_global RNG* rng = &rng_coop[ray_index]; + float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); + float light_u, light_v; + path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); + +#ifdef __OBJECT_MOTION__ + light_ray.time = sd->time; +#endif + LightSample ls; + light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls); + + Ray light_ray; + BsdfEval L_light; + bool is_lamp; + if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce, sd_DL)) { + /* write intermediate data to global memory to access from the next kernel */ + LightRay_coop[ray_index] = light_ray; + BSDFEval_coop[ray_index] = L_light; + ISLamp_coop[ray_index] = is_lamp; + /// mark ray state for next shadow kernel + ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL); + enqueue_flag = 1; + } + } +#endif + } +#ifndef __COMPUTE_DEVICE_GPU__ + } +#endif + +#ifdef __EMISSION__ + /* Enqueue RAY_SHADOW_RAY_CAST_DL rays */ + enqueue_ray_index_local(ray_index, QUEUE_SHADOW_RAY_CAST_DL_RAYS, enqueue_flag, queuesize, &local_queue_atomics, Queue_data, Queue_index); +#endif +} diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h index 6c5a5fac8c5..1fee5835360 100644 --- a/intern/cycles/kernel/kernel_emission.h +++ b/intern/cycles/kernel/kernel_emission.h @@ -17,12 +17,20 @@ CCL_NAMESPACE_BEGIN /* Direction Emission */ - ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg, - LightSample *ls, float3 I, differential3 dI, float t, float time, int bounce, int transparent_bounce) + LightSample *ls, float3 I, differential3 dI, float t, float time, int bounce, int transparent_bounce +#ifdef __SPLIT_KERNEL__ + ,ShaderData *sd_input +#endif +) { /* setup shading at emitter */ - ShaderData sd; +#ifdef __SPLIT_KERNEL__ + ShaderData *sd = sd_input; +#else + ShaderData sd_object; + ShaderData *sd = &sd_object; +#endif float3 eval; #ifdef __BACKGROUND_MIS__ @@ -37,23 +45,23 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg, ray.dP = differential3_zero(); ray.dD = dI; - shader_setup_from_background(kg, &sd, &ray, bounce+1, transparent_bounce); - eval = shader_eval_background(kg, &sd, 0, SHADER_CONTEXT_EMISSION); + shader_setup_from_background(kg, sd, &ray, bounce+1, transparent_bounce); + eval = shader_eval_background(kg, sd, 0, SHADER_CONTEXT_EMISSION); } else #endif { - shader_setup_from_sample(kg, &sd, ls->P, ls->Ng, I, ls->shader, ls->object, ls->prim, ls->u, ls->v, t, time, bounce+1, transparent_bounce); + shader_setup_from_sample(kg, sd, ls->P, ls->Ng, I, ls->shader, ls->object, ls->prim, ls->u, ls->v, t, time, bounce+1, transparent_bounce); - ls->Ng = sd.Ng; + ls->Ng = ccl_fetch(sd, Ng); /* no path flag, we're evaluating this for all closures. that's weak but * we'd have to do multiple evaluations otherwise */ - shader_eval_surface(kg, &sd, 0.0f, 0, SHADER_CONTEXT_EMISSION); + shader_eval_surface(kg, sd, 0.0f, 0, SHADER_CONTEXT_EMISSION); /* evaluate emissive closure */ - if(sd.flag & SD_EMISSION) - eval = shader_emissive_eval(kg, &sd); + if(ccl_fetch(sd, flag) & SD_EMISSION) + eval = shader_emissive_eval(kg, sd); else eval = make_float3(0.0f, 0.0f, 0.0f); } @@ -63,9 +71,14 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg, return eval; } +/* The argument sd_DL is meaningful only for split kernel. Other uses can just pass NULL */ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd, LightSample *ls, Ray *ray, BsdfEval *eval, bool *is_lamp, - int bounce, int transparent_bounce) + int bounce, int transparent_bounce +#ifdef __SPLIT_KERNEL__ + , ShaderData *sd_DL +#endif + ) { if(ls->pdf == 0.0f) return false; @@ -74,7 +87,14 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd, differential3 dD = differential3_zero(); /* evaluate closure */ - float3 light_eval = direct_emissive_eval(kg, ls, -ls->D, dD, ls->t, sd->time, bounce, transparent_bounce); + + float3 light_eval = direct_emissive_eval(kg, ls, -ls->D, dD, ls->t, ccl_fetch(sd, time), + bounce, + transparent_bounce +#ifdef __SPLIT_KERNEL__ + ,sd_DL +#endif + ); if(is_zero(light_eval)) return false; @@ -83,7 +103,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd, float bsdf_pdf; #ifdef __VOLUME__ - if(sd->prim != PRIM_NONE) + if(ccl_fetch(sd, prim) != PRIM_NONE) shader_bsdf_eval(kg, sd, ls->D, eval, &bsdf_pdf); else shader_volume_phase_eval(kg, sd, ls->D, eval, &bsdf_pdf); @@ -118,8 +138,8 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd, if(ls->shader & SHADER_CAST_SHADOW) { /* setup ray */ - bool transmit = (dot(sd->Ng, ls->D) < 0.0f); - ray->P = ray_offset(sd->P, (transmit)? -sd->Ng: sd->Ng); + bool transmit = (dot(ccl_fetch(sd, Ng), ls->D) < 0.0f); + ray->P = ray_offset(ccl_fetch(sd, P), (transmit)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng)); if(ls->t == FLT_MAX) { /* distant light */ @@ -132,7 +152,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd, ray->D = normalize_len(ray->D, &ray->t); } - ray->dP = sd->dP; + ray->dP = ccl_fetch(sd, dP); ray->dD = differential3_zero(); } else { @@ -154,14 +174,14 @@ ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, Shader float3 L = shader_emissive_eval(kg, sd); #ifdef __HAIR__ - if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) && (sd->type & PRIMITIVE_ALL_TRIANGLE)) + if(!(path_flag & PATH_RAY_MIS_SKIP) && (ccl_fetch(sd, flag) & SD_USE_MIS) && (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)) #else - if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS)) + if(!(path_flag & PATH_RAY_MIS_SKIP) && (ccl_fetch(sd, flag) & SD_USE_MIS)) #endif { /* multiple importance sampling, get triangle light pdf, * and compute weight with respect to BSDF pdf */ - float pdf = triangle_light_pdf(kg, sd->Ng, sd->I, t); + float pdf = triangle_light_pdf(kg, ccl_fetch(sd, Ng), ccl_fetch(sd, I), t); float mis_weight = power_heuristic(bsdf_pdf, pdf); return L*mis_weight; @@ -172,7 +192,12 @@ ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, Shader /* Indirect Lamp Emission */ -ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *state, Ray *ray, float3 *emission) +/* The argument sd is meaningful only for split kernel. Other uses can just pass NULL */ +ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *state, Ray *ray, float3 *emission +#ifdef __SPLIT_KERNEL__ + ,ShaderData *sd +#endif + ) { bool hit_lamp = false; @@ -196,7 +221,13 @@ ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *st } #endif - float3 L = direct_emissive_eval(kg, &ls, -ray->D, ray->dD, ls.t, ray->time, state->bounce, state->transparent_bounce); + float3 L = direct_emissive_eval(kg, &ls, -ray->D, ray->dD, ls.t, ray->time, + state->bounce, + state->transparent_bounce +#ifdef __SPLIT_KERNEL__ + ,sd +#endif + ); #ifdef __VOLUME__ if(state->volume_stack[0].shader != SHADER_NONE) { @@ -225,7 +256,11 @@ ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *st /* Indirect Background */ -ccl_device_noinline float3 indirect_background(KernelGlobals *kg, PathState *state, Ray *ray) +ccl_device_noinline float3 indirect_background(KernelGlobals *kg, ccl_addr_space PathState *state, ccl_addr_space Ray *ray +#ifdef __SPLIT_KERNEL__ + ,ShaderData *sd_global +#endif + ) { #ifdef __BACKGROUND__ int shader = kernel_data.background.surface_shader; @@ -241,11 +276,17 @@ ccl_device_noinline float3 indirect_background(KernelGlobals *kg, PathState *sta return make_float3(0.0f, 0.0f, 0.0f); } +#ifdef __SPLIT_KERNEL__ /* evaluate background closure */ + Ray priv_ray = *ray; + shader_setup_from_background(kg, sd_global, &priv_ray, state->bounce+1, state->transparent_bounce); + float3 L = shader_eval_background(kg, sd_global, state->flag, SHADER_CONTEXT_EMISSION); +#else ShaderData sd; shader_setup_from_background(kg, &sd, ray, state->bounce+1, state->transparent_bounce); float3 L = shader_eval_background(kg, &sd, state->flag, SHADER_CONTEXT_EMISSION); +#endif #ifdef __BACKGROUND_MIS__ /* check if background light exists or if we should skip pdf */ diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h index be2c879adb9..17fa18909c4 100644 --- a/intern/cycles/kernel/kernel_globals.h +++ b/intern/cycles/kernel/kernel_globals.h @@ -80,7 +80,7 @@ typedef struct KernelGlobals {} KernelGlobals; #ifdef __KERNEL_OPENCL__ -typedef struct KernelGlobals { +typedef ccl_addr_space struct KernelGlobals { ccl_constant KernelData *data; #define KERNEL_TEX(type, ttype, name) \ diff --git a/intern/cycles/kernel/kernel_holdout_emission_blurring_pathtermination_ao.cl b/intern/cycles/kernel/kernel_holdout_emission_blurring_pathtermination_ao.cl new file mode 100644 index 00000000000..a2e57771522 --- /dev/null +++ b/intern/cycles/kernel/kernel_holdout_emission_blurring_pathtermination_ao.cl @@ -0,0 +1,283 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_split.h" + +/* + * Note on kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao kernel. + * This is the sixth kernel in the ray tracing logic. This is the fifth + * of the path iteration kernels. This kernel takes care of the logic to process + * "material of type holdout", indirect primitive emission, bsdf blurring, + * probabilistic path termination and AO. + * + * This kernels determines the rays for which a shadow_blocked() function associated with AO should be executed. + * Those rays for which a shadow_blocked() function for AO must be executed are marked with flag RAY_SHADOW_RAY_CAST_ao and + * enqueued into the queue QUEUE_SHADOW_RAY_CAST_AO_RAYS + * + * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER + * + * The input and output are as follows, + * + * rng_coop ---------------------------------------------|--- kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao ---|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) + * throughput_coop --------------------------------------| |--- PathState_coop + * PathRadiance_coop ------------------------------------| |--- throughput_coop + * Intersection_coop ------------------------------------| |--- L_transparent_coop + * PathState_coop ---------------------------------------| |--- per_sample_output_buffers + * L_transparent_coop -----------------------------------| |--- PathRadiance_coop + * shader_data ------------------------------------------| |--- ShaderData + * ray_state --------------------------------------------| |--- ray_state + * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -------| |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) + * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---| |--- AOAlpha_coop + * kg (globals + data) ----------------------------------| |--- AOBSDF_coop + * parallel_samples -------------------------------------| |--- AOLightRay_coop + * per_sample_output_buffers ----------------------------| | + * sw ---------------------------------------------------| | + * sh ---------------------------------------------------| | + * sx ---------------------------------------------------| | + * sy ---------------------------------------------------| | + * stride -----------------------------------------------| | + * work_array -------------------------------------------| | + * queuesize --------------------------------------------| | + * start_sample -----------------------------------------| | + * + * Note on Queues : + * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only + * the rays of state RAY_ACTIVE. + * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFFER + * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will + * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been + * changed to RAY_UPDATE_BUFFER, there is no problem. + * + * State of queues when this kernel is called : + * At entry, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays. + * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be empty. + * At exit, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and RAY_UPDATE_BUFFER rays + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays + * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with flag RAY_SHADOW_RAY_CAST_AO + */ + +__kernel void kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global char *shader_data, /* Required throughout the kernel except probabilistic path termination and AO */ + ccl_global float *per_sample_output_buffers, + ccl_global uint *rng_coop, /* Required for "kernel_write_data_passes" and AO */ + ccl_global float3 *throughput_coop, /* Required for handling holdout material and AO */ + ccl_global float *L_transparent_coop, /* Required for handling holdout material */ + PathRadiance *PathRadiance_coop, /* Required for "kernel_write_data_passes" and indirect primitive emission */ + ccl_global PathState *PathState_coop, /* Required throughout the kernel and AO */ + Intersection *Intersection_coop, /* Required for indirect primitive emission */ + ccl_global float3 *AOAlpha_coop, /* Required for AO */ + ccl_global float3 *AOBSDF_coop, /* Required for AO */ + ccl_global Ray *AOLightRay_coop, /* Required for AO */ + int sw, int sh, int sx, int sy, int stride, + ccl_global char *ray_state, /* Denotes the state of each ray */ + ccl_global unsigned int *work_array, /* Denotes the work that each ray belongs to */ + ccl_global int *Queue_data, /* Queue memory */ + ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ + int queuesize, /* Size (capacity) of each queue */ +#ifdef __WORK_STEALING__ + unsigned int start_sample, +#endif + int parallel_samples /* Number of samples to be processed in parallel */ + ) +{ + ccl_local unsigned int local_queue_atomics_bg; + ccl_local unsigned int local_queue_atomics_ao; + if(get_local_id(0) == 0 && get_local_id(1) == 0) { + local_queue_atomics_bg = 0; + local_queue_atomics_ao = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + char enqueue_flag = 0; + char enqueue_flag_AO_SHADOW_RAY_CAST = 0; + int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); + ray_index = get_ray_index(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0); + +#ifdef __COMPUTE_DEVICE_GPU__ + /* If we are executing on a GPU device, we exit all threads that are not required + * If we are executing on a CPU device, then we need to keep all threads active + * since we have barrier() calls later in the kernel. CPU devices + * expect all threads to execute barrier statement. + */ + if(ray_index == QUEUE_EMPTY_SLOT) + return; +#endif + +#ifndef __COMPUTE_DEVICE_GPU__ + if(ray_index != QUEUE_EMPTY_SLOT) { +#endif + /* Load kernel globals structure and ShaderData structure */ + KernelGlobals *kg = (KernelGlobals *)globals; + ShaderData *sd = (ShaderData *)shader_data; + +#ifdef __WORK_STEALING__ + unsigned int my_work; + unsigned int pixel_x; + unsigned int pixel_y; +#endif + unsigned int tile_x; + unsigned int tile_y; + int my_sample_tile; + unsigned int sample; + + ccl_global RNG *rng = 0x0; + ccl_global PathState *state = 0x0; + float3 throughput; + + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + + throughput = throughput_coop[ray_index]; + state = &PathState_coop[ray_index]; + rng = &rng_coop[ray_index]; +#ifdef __WORK_STEALING__ + my_work = work_array[ray_index]; + sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; + get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index); + my_sample_tile = 0; +#else // __WORK_STEALING__ + sample = work_array[ray_index]; + /* buffer's stride is "stride"; Find x and y using ray_index */ + int tile_index = ray_index / parallel_samples; + tile_x = tile_index % sw; + tile_y = tile_index / sw; + my_sample_tile = ray_index - (tile_index * parallel_samples); +#endif // __WORK_STEALING__ + per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride; + + /* holdout */ +#ifdef __HOLDOUT__ + if((ccl_fetch(sd, flag) & (SD_HOLDOUT|SD_HOLDOUT_MASK)) && (state->flag & PATH_RAY_CAMERA)) { + if(kernel_data.background.transparent) { + float3 holdout_weight; + + if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK) + holdout_weight = make_float3(1.0f, 1.0f, 1.0f); + else + holdout_weight = shader_holdout_eval(kg, sd); + + /* any throughput is ok, should all be identical here */ + L_transparent_coop[ray_index] += average(holdout_weight*throughput); + } + + if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + enqueue_flag = 1; + } + } +#endif + } + + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + + PathRadiance *L = &PathRadiance_coop[ray_index]; + /* holdout mask objects do not write data passes */ + kernel_write_data_passes(kg, per_sample_output_buffers, L, sd, sample, state, throughput); + + /* blurring of bsdf after bounces, for rays that have a small likelihood + * of following this particular path (diffuse, rough glossy) */ + if(kernel_data.integrator.filter_glossy != FLT_MAX) { + float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf; + + if(blur_pdf < 1.0f) { + float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f; + shader_bsdf_blur(kg, sd, blur_roughness); + } + } + +#ifdef __EMISSION__ + /* emission */ + if(ccl_fetch(sd, flag) & SD_EMISSION) { + /* todo: is isect.t wrong here for transparent surfaces? */ + float3 emission = indirect_primitive_emission(kg, sd, Intersection_coop[ray_index].t, state->flag, state->ray_pdf); + path_radiance_accum_emission(L, throughput, emission, state->bounce); + } +#endif + + /* path termination. this is a strange place to put the termination, it's + * mainly due to the mixed in MIS that we use. gives too many unneeded + * shader evaluations, only need emission if we are going to terminate */ + float probability = path_state_terminate_probability(kg, state, throughput); + + if(probability == 0.0f) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + enqueue_flag = 1; + } + + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + if(probability != 1.0f) { + float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE); + + if(terminate >= probability) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + enqueue_flag = 1; + } else { + throughput_coop[ray_index] = throughput/probability; + } + } + } + } + +#ifdef __AO__ + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + /* ambient occlusion */ + if(kernel_data.integrator.use_ambient_occlusion || (ccl_fetch(sd, flag) & SD_AO)) { + /* todo: solve correlation */ + float bsdf_u, bsdf_v; + path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v); + + float ao_factor = kernel_data.background.ao_factor; + float3 ao_N; + AOBSDF_coop[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N); + AOAlpha_coop[ray_index] = shader_bsdf_alpha(kg, sd); + + float3 ao_D; + float ao_pdf; + sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); + + if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) { + Ray _ray; + _ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng)); + _ray.D = ao_D; + _ray.t = kernel_data.background.ao_distance; +#ifdef __OBJECT_MOTION__ + _ray.time = ccl_fetch(sd, time); +#endif + _ray.dP = ccl_fetch(sd, dP); + _ray.dD = differential3_zero(); + AOLightRay_coop[ray_index] = _ray; + + ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO); + enqueue_flag_AO_SHADOW_RAY_CAST = 1; + } + } + } +#endif +#ifndef __COMPUTE_DEVICE_GPU__ + } +#endif + + /* Enqueue RAY_UPDATE_BUFFER rays */ + enqueue_ray_index_local(ray_index, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, enqueue_flag, queuesize, &local_queue_atomics_bg, Queue_data, Queue_index); +#ifdef __AO__ + /* Enqueue to-shadow-ray-cast rays */ + enqueue_ray_index_local(ray_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, enqueue_flag_AO_SHADOW_RAY_CAST, queuesize, &local_queue_atomics_ao, Queue_data, Queue_index); +#endif +} diff --git a/intern/cycles/kernel/kernel_lamp_emission.cl b/intern/cycles/kernel/kernel_lamp_emission.cl new file mode 100644 index 00000000000..e7f8b227dd8 --- /dev/null +++ b/intern/cycles/kernel/kernel_lamp_emission.cl @@ -0,0 +1,209 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_split.h" + +/* + * Note on kernel_ocl_path_trace_lamp_emission + * This is the 3rd kernel in the ray-tracing logic. This is the second of the + * path-iteration kernels. This kernel takes care of the indirect lamp emission logic. + * This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS. It processes rays of state RAY_ACTIVE + * and RAY_HIT_BACKGROUND. + * We will empty QUEUE_ACTIVE_AND_REGENERATED_RAYS queue in this kernel. + * The input/output of the kernel is as follows, + * Throughput_coop ------------------------------------|--- kernel_ocl_path_trace_lamp_emission --|--- PathRadiance_coop + * Ray_coop -------------------------------------------| |--- Queue_data(QUEUE_ACTIVE_AND_REGENERATED_RAYS) + * PathState_coop -------------------------------------| |--- Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS) + * kg (globals + data) --------------------------------| | + * Intersection_coop ----------------------------------| | + * ray_state ------------------------------------------| | + * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -----| | + * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ----| | + * queuesize ------------------------------------------| | + * use_queues_flag ------------------------------------| | + * sw -------------------------------------------------| | + * sh -------------------------------------------------| | + * parallel_samples -----------------------------------| | + * + * note : shader_data is neither input nor output. Its just filled and consumed in the same, kernel_ocl_path_trace_lamp_emission, kernel. + */ +__kernel void kernel_ocl_path_trace_lamp_emission( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global char *shader_data, /* Required for lamp emission */ + ccl_global float3 *throughput_coop, /* Required for lamp emission */ + PathRadiance *PathRadiance_coop, /* Required for lamp emission */ + ccl_global Ray *Ray_coop, /* Required for lamp emission */ + ccl_global PathState *PathState_coop, /* Required for lamp emission */ + Intersection *Intersection_coop, /* Required for lamp emission */ + ccl_global char *ray_state, /* Denotes the state of each ray */ + int sw, int sh, + ccl_global int *Queue_data, /* Memory for queues */ + ccl_global int *Queue_index, /* Tracks the number of elements in queues */ + int queuesize, /* Size (capacity) of queues */ + ccl_global char *use_queues_flag, /* used to decide if this kernel should use queues to fetch ray index */ + int parallel_samples /* Number of samples to be processed in parallel */ + ) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + /* We will empty this queue in this kernel */ + if(get_global_id(0) == 0 && get_global_id(1) == 0) { + Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; + } + + /* Fetch use_queues_flag */ + ccl_local char local_use_queues_flag; + if(get_local_id(0) == 0 && get_local_id(1) == 0) { + local_use_queues_flag = use_queues_flag[0]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + int ray_index; + if(local_use_queues_flag) { + int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); + ray_index = get_ray_index(thread_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 1); + + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + } else { + if(x < (sw * parallel_samples) && y < sh){ + ray_index = x + y * (sw * parallel_samples); + } else { + return; + } + } + + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) || IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { + KernelGlobals *kg = (KernelGlobals *)globals; + ShaderData *sd = (ShaderData *)shader_data; + PathRadiance *L = &PathRadiance_coop[ray_index]; + + float3 throughput = throughput_coop[ray_index]; + Ray ray = Ray_coop[ray_index]; + PathState state = PathState_coop[ray_index]; + +#ifdef __LAMP_MIS__ + if(kernel_data.integrator.use_lamp_mis && !(state.flag & PATH_RAY_CAMERA)) { + /* ray starting from previous non-transparent bounce */ + Ray light_ray; + + light_ray.P = ray.P - state.ray_t*ray.D; + state.ray_t += Intersection_coop[ray_index].t; + light_ray.D = ray.D; + light_ray.t = state.ray_t; + light_ray.time = ray.time; + light_ray.dD = ray.dD; + light_ray.dP = ray.dP; + /* intersect with lamp */ + float3 emission; + + if(indirect_lamp_emission(kg, &state, &light_ray, &emission, sd)) { + path_radiance_accum_emission(L, throughput, emission, state.bounce); + } + } +#endif + /* __VOLUME__ feature is disabled */ +#if 0 +#ifdef __VOLUME__ + /* volume attenuation, emission, scatter */ + if(state.volume_stack[0].shader != SHADER_NONE) { + Ray volume_ray = ray; + volume_ray.t = (hit)? isect.t: FLT_MAX; + + bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack); + +#ifdef __VOLUME_DECOUPLED__ + int sampling_method = volume_stack_sampling_method(kg, state.volume_stack); + bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, true, sampling_method); + + if(decoupled) { + /* cache steps along volume for repeated sampling */ + VolumeSegment volume_segment; + ShaderData volume_sd; + + shader_setup_from_volume(kg, &volume_sd, &volume_ray, state.bounce, state.transparent_bounce); + kernel_volume_decoupled_record(kg, &state, + &volume_ray, &volume_sd, &volume_segment, heterogeneous); + + volume_segment.sampling_method = sampling_method; + + /* emission */ + if(volume_segment.closure_flag & SD_EMISSION) + path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce); + + /* scattering */ + VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED; + + if(volume_segment.closure_flag & SD_SCATTER) { + bool all = false; + + /* direct light sampling */ + kernel_branched_path_volume_connect_light(kg, rng, &volume_sd, + throughput, &state, &L, 1.0f, all, &volume_ray, &volume_segment); + + /* indirect sample. if we use distance sampling and take just + * one sample for direct and indirect light, we could share + * this computation, but makes code a bit complex */ + float rphase = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_PHASE); + float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE); + + result = kernel_volume_decoupled_scatter(kg, + &state, &volume_ray, &volume_sd, &throughput, + rphase, rscatter, &volume_segment, NULL, true); + } + + if(result != VOLUME_PATH_SCATTERED) + throughput *= volume_segment.accum_transmittance; + + /* free cached steps */ + kernel_volume_decoupled_free(kg, &volume_segment); + + if(result == VOLUME_PATH_SCATTERED) { + if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray)) + continue; + else + break; + } + } + else +#endif + { + /* integrate along volume segment with distance sampling */ + ShaderData volume_sd; + VolumeIntegrateResult result = kernel_volume_integrate( + kg, &state, &volume_sd, &volume_ray, &L, &throughput, rng, heterogeneous); + +#ifdef __VOLUME_SCATTER__ + if(result == VOLUME_PATH_SCATTERED) { + /* direct lighting */ + kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, &L); + + /* indirect light bounce */ + if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray)) + continue; + else + break; + } +#endif + } + } +#endif +#endif + } +} diff --git a/intern/cycles/kernel/kernel_next_iteration_setup.cl b/intern/cycles/kernel/kernel_next_iteration_setup.cl new file mode 100644 index 00000000000..49562ca6ed5 --- /dev/null +++ b/intern/cycles/kernel/kernel_next_iteration_setup.cl @@ -0,0 +1,176 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_split.h" + +/* + * Note on kernel_ocl_path_trace_setup_next_iteration kernel. + * This is the tenth kernel in the ray tracing logic. This is the ninth + * of the path iteration kernels. This kernel takes care of setting up + * Ray for the next iteration of path-iteration and accumulating radiance + * corresponding to AO and direct-lighting + * + * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER + * + * The input and output are as follows, + * + * rng_coop ---------------------------------------------|--- kernel_ocl_path_trace_setup_next_iteration -|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) + * throughput_coop --------------------------------------| |--- Queue_data (QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS) + * PathRadiance_coop ------------------------------------| |--- throughput_coop + * PathState_coop ---------------------------------------| |--- PathRadiance_coop + * shader_data ------------------------------------------| |--- PathState_coop + * ray_state --------------------------------------------| |--- ray_state + * Queue_data (QUEUE_ACTIVE_AND_REGENERATD_RAYS) --------| |--- Ray_coop + * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---| |--- use_queues_flag + * Ray_coop ---------------------------------------------| | + * kg (globals + data) ----------------------------------| | + * LightRay_dl_coop -------------------------------------| + * ISLamp_coop ------------------------------------------| + * BSDFEval_coop ----------------------------------------| + * LightRay_ao_coop -------------------------------------| + * AOBSDF_coop ------------------------------------------| + * AOAlpha_coop -----------------------------------------| + * + * Note on queues, + * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only + * the rays of state RAY_ACTIVE. + * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFF + * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will + * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been + * changed to RAY_UPDATE_BUFF, there is no problem. + * + * State of queues when this kernel is called : + * At entry, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED, RAY_UPDATE_BUFFER rays. + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays + * At exit, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and more RAY_UPDATE_BUFFER rays. + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays + */ + +__kernel void kernel_ocl_path_trace_setup_next_iteration( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global char *shader_data, /* Required for setting up ray for next iteration */ + ccl_global uint *rng_coop, /* Required for setting up ray for next iteration */ + ccl_global float3 *throughput_coop, /* Required for setting up ray for next iteration */ + PathRadiance *PathRadiance_coop, /* Required for setting up ray for next iteration */ + ccl_global Ray *Ray_coop, /* Required for setting up ray for next iteration */ + ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */ + ccl_global Ray *LightRay_dl_coop, /* Required for radiance update - direct lighting */ + ccl_global int *ISLamp_coop, /* Required for radiance update - direct lighting */ + ccl_global BsdfEval *BSDFEval_coop, /* Required for radiance update - direct lighting */ + ccl_global Ray *LightRay_ao_coop, /* Required for radiance update - AO */ + ccl_global float3 *AOBSDF_coop, /* Required for radiance update - AO */ + ccl_global float3 *AOAlpha_coop, /* Required for radiance update - AO */ + ccl_global char *ray_state, /* Denotes the state of each ray */ + ccl_global int *Queue_data, /* Queue memory */ + ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ + int queuesize, /* Size (capacity) of each queue */ + ccl_global char *use_queues_flag /* flag to decide if scene_intersect kernel should use queues to fetch ray index */ + ) +{ + + ccl_local unsigned int local_queue_atomics; + if(get_local_id(0) == 0 && get_local_id(1) == 0) { + local_queue_atomics = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(get_global_id(0) == 0 && get_global_id(1) == 0) { + /* If we are here, then it means that scene-intersect kernel + * has already been executed atleast once. From the next time, + * scene-intersect kernel may operate on queues to fetch ray index + */ + use_queues_flag[0] = 1; + + /* Mark queue indices of QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS + * queues that were made empty during the previous kernel + */ + Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0; + Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0; + } + + char enqueue_flag = 0; + int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); + ray_index = get_ray_index(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0); + +#ifdef __COMPUTE_DEVICE_GPU__ + /* If we are executing on a GPU device, we exit all threads that are not required + * If we are executing on a CPU device, then we need to keep all threads active + * since we have barrier() calls later in the kernel. CPU devices, + * expect all threads to execute barrier statement. + */ + if(ray_index == QUEUE_EMPTY_SLOT) + return; +#endif + +#ifndef __COMPUTE_DEVICE_GPU__ + if(ray_index != QUEUE_EMPTY_SLOT) { +#endif + /* Load kernel globals structure and ShaderData structure */ + KernelGlobals *kg = (KernelGlobals *)globals; + ShaderData *sd = (ShaderData *)shader_data; + PathRadiance *L = 0x0; + ccl_global PathState *state = 0x0; + + /* Path radiance update for AO/Direct_lighting's shadow blocked */ + if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) { + state = &PathState_coop[ray_index]; + L = &PathRadiance_coop[ray_index]; + float3 _throughput = throughput_coop[ray_index]; + + if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) { + float3 shadow = LightRay_ao_coop[ray_index].P; + char update_path_radiance = LightRay_ao_coop[ray_index].t; + if(update_path_radiance) { + path_radiance_accum_ao(L, _throughput, AOAlpha_coop[ray_index], AOBSDF_coop[ray_index], shadow, state->bounce); + } + REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO); + } + + if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) { + float3 shadow = LightRay_dl_coop[ray_index].P; + char update_path_radiance = LightRay_dl_coop[ray_index].t; + if(update_path_radiance) { + BsdfEval L_light = BSDFEval_coop[ray_index]; + path_radiance_accum_light(L, _throughput, &L_light, shadow, 1.0f, state->bounce, ISLamp_coop[ray_index]); + } + REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL); + } + } + + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + + ccl_global float3 *throughput = &throughput_coop[ray_index]; + ccl_global Ray *ray = &Ray_coop[ray_index]; + ccl_global RNG* rng = &rng_coop[ray_index]; + state = &PathState_coop[ray_index]; + L = &PathRadiance_coop[ray_index]; + + /* compute direct lighting and next bounce */ + if(!kernel_path_surface_bounce(kg, rng, sd, throughput, state, L, ray)) { + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); + enqueue_flag = 1; + } + } +#ifndef __COMPUTE_DEVICE_GPU__ + } +#endif + + /* Enqueue RAY_UPDATE_BUFFER rays */ + enqueue_ray_index_local(ray_index, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, enqueue_flag, queuesize, &local_queue_atomics, Queue_data, Queue_index); +} diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h index 8910e26fcc6..20cf3fa931b 100644 --- a/intern/cycles/kernel/kernel_passes.h +++ b/intern/cycles/kernel/kernel_passes.h @@ -19,23 +19,49 @@ CCL_NAMESPACE_BEGIN ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, int sample, float value) { ccl_global float *buf = buffer; +#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__) + atomic_add_float(buf, value); +#else *buf = (sample == 0)? value: *buf + value; +#endif // __SPLIT_KERNEL__ && __WORK_STEALING__ } ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, int sample, float3 value) { +#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__) + ccl_global float *buf_x = buffer + 0; + ccl_global float *buf_y = buffer + 1; + ccl_global float *buf_z = buffer + 2; + + atomic_add_float(buf_x, value.x); + atomic_add_float(buf_y, value.y); + atomic_add_float(buf_z, value.z); +#else ccl_global float3 *buf = (ccl_global float3*)buffer; *buf = (sample == 0)? value: *buf + value; +#endif // __SPLIT_KERNEL__ && __WORK_STEALING__ } ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sample, float4 value) { +#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__) + ccl_global float *buf_x = buffer + 0; + ccl_global float *buf_y = buffer + 1; + ccl_global float *buf_z = buffer + 2; + ccl_global float *buf_w = buffer + 3; + + atomic_add_float(buf_x, value.x); + atomic_add_float(buf_y, value.y); + atomic_add_float(buf_z, value.z); + atomic_add_float(buf_w, value.w); +#else ccl_global float4 *buf = (ccl_global float4*)buffer; *buf = (sample == 0)? value: *buf + value; +#endif // __SPLIT_KERNEL__ && __WORK_STEALING__ } ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L, - ShaderData *sd, int sample, PathState *state, float3 throughput) + ShaderData *sd, int sample, ccl_addr_space PathState *state, float3 throughput) { #ifdef __PASSES__ int path_flag = state->flag; @@ -49,18 +75,18 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl return; if(!(path_flag & PATH_RAY_SINGLE_PASS_DONE)) { - if(!(sd->flag & SD_TRANSPARENT) || + if(!(ccl_fetch(sd, flag) & SD_TRANSPARENT) || kernel_data.film.pass_alpha_threshold == 0.0f || average(shader_bsdf_alpha(kg, sd)) >= kernel_data.film.pass_alpha_threshold) { if(sample == 0) { if(flag & PASS_DEPTH) { - float depth = camera_distance(kg, sd->P); + float depth = camera_distance(kg, ccl_fetch(sd, P)); kernel_write_pass_float(buffer + kernel_data.film.pass_depth, sample, depth); } if(flag & PASS_OBJECT_ID) { - float id = object_pass_id(kg, sd->object); + float id = object_pass_id(kg, ccl_fetch(sd, object)); kernel_write_pass_float(buffer + kernel_data.film.pass_object_id, sample, id); } if(flag & PASS_MATERIAL_ID) { @@ -70,7 +96,7 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl } if(flag & PASS_NORMAL) { - float3 normal = sd->N; + float3 normal = ccl_fetch(sd, N); kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, sample, normal); } if(flag & PASS_UV) { @@ -101,7 +127,7 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl float mist_start = kernel_data.film.mist_start; float mist_inv_depth = kernel_data.film.mist_inv_depth; - float depth = camera_distance(kg, sd->P); + float depth = camera_distance(kg, ccl_fetch(sd, P)); float mist = saturate((depth - mist_start)*mist_inv_depth); /* falloff */ diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index 0ba8f694592..e2dbf6e22f7 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -42,6 +42,7 @@ #include "kernel_path_state.h" #include "kernel_shadow.h" #include "kernel_emission.h" +#include "kernel_path_common.h" #include "kernel_path_surface.h" #include "kernel_path_volume.h" @@ -305,17 +306,17 @@ ccl_device void kernel_path_ao(KernelGlobals *kg, ShaderData *sd, PathRadiance * sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); - if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) { + if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) { Ray light_ray; float3 ao_shadow; - light_ray.P = ray_offset(sd->P, sd->Ng); + light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng)); light_ray.D = ao_D; light_ray.t = kernel_data.background.ao_distance; #ifdef __OBJECT_MOTION__ - light_ray.time = sd->time; + light_ray.time = ccl_fetch(sd, time); #endif - light_ray.dP = sd->dP; + light_ray.dP = ccl_fetch(sd, dP); light_ray.dD = differential3_zero(); if(!shadow_blocked(kg, state, &light_ray, &ao_shadow)) @@ -341,17 +342,17 @@ ccl_device void kernel_branched_path_ao(KernelGlobals *kg, ShaderData *sd, PathR sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); - if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) { + if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) { Ray light_ray; float3 ao_shadow; - light_ray.P = ray_offset(sd->P, sd->Ng); + light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng)); light_ray.D = ao_D; light_ray.t = kernel_data.background.ao_distance; #ifdef __OBJECT_MOTION__ - light_ray.time = sd->time; + light_ray.time = ccl_fetch(sd, time); #endif - light_ray.dP = sd->dP; + light_ray.dP = ccl_fetch(sd, dP); light_ray.dD = differential3_zero(); if(!shadow_blocked(kg, state, &light_ray, &ao_shadow)) @@ -381,7 +382,7 @@ ccl_device bool kernel_path_subsurface_scatter(KernelGlobals *kg, ShaderData *sd #ifdef __VOLUME__ Ray volume_ray = *ray; bool need_update_volume_stack = kernel_data.integrator.use_volumes && - sd->flag & SD_OBJECT_INTERSECTS_VOLUME; + ccl_fetch(sd, flag) & SD_OBJECT_INTERSECTS_VOLUME; #endif /* compute lighting with the BSDF closure */ @@ -712,8 +713,8 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba RNG *rng, ShaderData *sd, float3 throughput, float num_samples_adjust, PathState *state, PathRadiance *L) { - for(int i = 0; i< sd->num_closure; i++) { - const ShaderClosure *sc = &sd->closure[i]; + for(int i = 0; i< ccl_fetch(sd, num_closure); i++) { + const ShaderClosure *sc = &ccl_fetch(sd, closure)[i]; if(!CLOSURE_IS_BSDF(sc->type)) continue; @@ -764,8 +765,8 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, Ray *ray, float3 throughput) { - for(int i = 0; i< sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + for(int i = 0; i< ccl_fetch(sd, num_closure); i++) { + ShaderClosure *sc = &ccl_fetch(sd, closure)[i]; if(!CLOSURE_IS_BSSRDF(sc->type)) continue; @@ -786,7 +787,7 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, #ifdef __VOLUME__ Ray volume_ray = *ray; bool need_update_volume_stack = kernel_data.integrator.use_volumes && - sd->flag & SD_OBJECT_INTERSECTS_VOLUME; + ccl_fetch(sd, flag) & SD_OBJECT_INTERSECTS_VOLUME; #endif /* compute lighting with the BSDF closure */ @@ -1143,32 +1144,6 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in #endif -ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int x, int y, RNG *rng, Ray *ray) -{ - float filter_u; - float filter_v; - - int num_samples = kernel_data.integrator.aa_samples; - - path_rng_init(kg, rng_state, sample, num_samples, rng, x, y, &filter_u, &filter_v); - - /* sample camera ray */ - - float lens_u = 0.0f, lens_v = 0.0f; - - if(kernel_data.cam.aperturesize > 0.0f) - path_rng_2D(kg, rng, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v); - - float time = 0.0f; - -#ifdef __CAMERA_MOTION__ - if(kernel_data.cam.shuttertime != -1.0f) - time = path_rng_1D(kg, rng, sample, num_samples, PRNG_TIME); -#endif - - camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray); -} - ccl_device void kernel_path_trace(KernelGlobals *kg, ccl_global float *buffer, ccl_global uint *rng_state, int sample, int x, int y, int offset, int stride) diff --git a/intern/cycles/kernel/kernel_path_common.h b/intern/cycles/kernel/kernel_path_common.h new file mode 100644 index 00000000000..1912dfa16ed --- /dev/null +++ b/intern/cycles/kernel/kernel_path_common.h @@ -0,0 +1,50 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg, + ccl_global uint *rng_state, + int sample, + int x, int y, + ccl_addr_space RNG *rng, + ccl_addr_space Ray *ray) +{ + float filter_u; + float filter_v; + + int num_samples = kernel_data.integrator.aa_samples; + + path_rng_init(kg, rng_state, sample, num_samples, rng, x, y, &filter_u, &filter_v); + + /* sample camera ray */ + + float lens_u = 0.0f, lens_v = 0.0f; + + if(kernel_data.cam.aperturesize > 0.0f) + path_rng_2D(kg, rng, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v); + + float time = 0.0f; + +#ifdef __CAMERA_MOTION__ + if(kernel_data.cam.shuttertime != -1.0f) + time = path_rng_1D(kg, rng, sample, num_samples, PRNG_TIME); +#endif + + camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h index 45ea0e502ab..15efb2371de 100644 --- a/intern/cycles/kernel/kernel_path_state.h +++ b/intern/cycles/kernel/kernel_path_state.h @@ -16,7 +16,7 @@ CCL_NAMESPACE_BEGIN -ccl_device_inline void path_state_init(KernelGlobals *kg, PathState *state, RNG *rng, int sample, Ray *ray) +ccl_device_inline void path_state_init(KernelGlobals *kg, ccl_addr_space PathState *state, ccl_addr_space RNG *rng, int sample, ccl_addr_space Ray *ray) { state->flag = PATH_RAY_CAMERA|PATH_RAY_MIS_SKIP; @@ -51,7 +51,7 @@ ccl_device_inline void path_state_init(KernelGlobals *kg, PathState *state, RNG #endif } -ccl_device_inline void path_state_next(KernelGlobals *kg, PathState *state, int label) +ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathState *state, int label) { /* ray through transparent keeps same flags from previous ray and is * not counted as a regular bounce, transparent has separate max */ @@ -138,7 +138,7 @@ ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *s return flag; } -ccl_device_inline float path_state_terminate_probability(KernelGlobals *kg, PathState *state, const float3 throughput) +ccl_device_inline float path_state_terminate_probability(KernelGlobals *kg, ccl_addr_space PathState *state, const float3 throughput) { if(state->flag & PATH_RAY_TRANSPARENT) { /* transparent rays treated separately */ diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h index f0d4e98c5e0..fe85a6b6e4b 100644 --- a/intern/cycles/kernel/kernel_path_surface.h +++ b/intern/cycles/kernel/kernel_path_surface.h @@ -24,7 +24,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN { #ifdef __EMISSION__ /* sample illumination from lights to find path contribution */ - if(!(sd->flag & SD_BSDF_HAS_EVAL)) + if(!(ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL)) return; Ray light_ray; @@ -32,7 +32,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN bool is_lamp; #ifdef __OBJECT_MOTION__ - light_ray.time = sd->time; + light_ray.time = ccl_fetch(sd, time); #endif if(sample_all_lights) { @@ -53,7 +53,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v); LightSample ls; - lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls); + lamp_light_sample(kg, i, light_u, light_v, ccl_fetch(sd, P), &ls); if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) { /* trace shadow ray */ @@ -85,7 +85,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN light_t = 0.5f*light_t; LightSample ls; - light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls); + light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls); if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) { /* trace shadow ray */ @@ -106,7 +106,7 @@ ccl_device void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RN path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); LightSample ls; - light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls); + light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls); /* sample random light */ if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) { @@ -149,15 +149,15 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng, path_state_next(kg, state, label); /* setup ray */ - ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng); + ray->P = ray_offset(ccl_fetch(sd, P), (label & LABEL_TRANSMIT)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng)); ray->D = bsdf_omega_in; ray->t = FLT_MAX; #ifdef __RAY_DIFFERENTIALS__ - ray->dP = sd->dP; + ray->dP = ccl_fetch(sd, dP); ray->dD = bsdf_domega_in; #endif #ifdef __OBJECT_MOTION__ - ray->time = sd->time; + ray->time = ccl_fetch(sd, time); #endif #ifdef __VOLUME__ @@ -181,12 +181,13 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng, #endif +#ifndef __SPLIT_KERNEL__ /* path tracing: connect path directly to position on a light and add it to L */ -ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG *rng, - ShaderData *sd, float3 throughput, PathState *state, PathRadiance *L) +ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ccl_addr_space RNG *rng, + ShaderData *sd, float3 throughput, ccl_addr_space PathState *state, PathRadiance *L) { #ifdef __EMISSION__ - if(!(kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL))) + if(!(kernel_data.integrator.use_direct_light && (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL))) return; /* sample illumination from lights to find path contribution */ @@ -199,11 +200,11 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG bool is_lamp; #ifdef __OBJECT_MOTION__ - light_ray.time = sd->time; + light_ray.time = ccl_fetch(sd, time); #endif LightSample ls; - light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls); + light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls); if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce)) { /* trace shadow ray */ @@ -216,13 +217,14 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG } #endif } +#endif /* path tracing: bounce off or through surface to with new direction stored in ray */ -ccl_device_inline bool kernel_path_surface_bounce(KernelGlobals *kg, RNG *rng, - ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, Ray *ray) +ccl_device_inline bool kernel_path_surface_bounce(KernelGlobals *kg, ccl_addr_space RNG *rng, + ShaderData *sd, ccl_addr_space float3 *throughput, ccl_addr_space PathState *state, PathRadiance *L, ccl_addr_space Ray *ray) { /* no BSDF? we can stop here */ - if(sd->flag & SD_BSDF) { + if(ccl_fetch(sd, flag) & SD_BSDF) { /* sample BSDF */ float bsdf_pdf; BsdfEval bsdf_eval; @@ -254,16 +256,16 @@ ccl_device_inline bool kernel_path_surface_bounce(KernelGlobals *kg, RNG *rng, path_state_next(kg, state, label); /* setup ray */ - ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng); + ray->P = ray_offset(ccl_fetch(sd, P), (label & LABEL_TRANSMIT)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng)); ray->D = bsdf_omega_in; if(state->bounce == 0) - ray->t -= sd->ray_length; /* clipping works through transparent */ + ray->t -= ccl_fetch(sd, ray_length); /* clipping works through transparent */ else ray->t = FLT_MAX; #ifdef __RAY_DIFFERENTIALS__ - ray->dP = sd->dP; + ray->dP = ccl_fetch(sd, dP); ray->dD = bsdf_domega_in; #endif @@ -275,21 +277,21 @@ ccl_device_inline bool kernel_path_surface_bounce(KernelGlobals *kg, RNG *rng, return true; } #ifdef __VOLUME__ - else if(sd->flag & SD_HAS_ONLY_VOLUME) { + else if(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME) { /* no surface shader but have a volume shader? act transparent */ /* update path state, count as transparent */ path_state_next(kg, state, LABEL_TRANSPARENT); if(state->bounce == 0) - ray->t -= sd->ray_length; /* clipping works through transparent */ + ray->t -= ccl_fetch(sd, ray_length); /* clipping works through transparent */ else ray->t = FLT_MAX; /* setup ray position, direction stays unchanged */ - ray->P = ray_offset(sd->P, -sd->Ng); + ray->P = ray_offset(ccl_fetch(sd, P), -ccl_fetch(sd, Ng)); #ifdef __RAY_DIFFERENTIALS__ - ray->dP = sd->dP; + ray->dP = ccl_fetch(sd, dP); #endif /* enter/exit volume */ diff --git a/intern/cycles/kernel/kernel_queue_enqueue.cl b/intern/cycles/kernel/kernel_queue_enqueue.cl new file mode 100644 index 00000000000..eee7860fb84 --- /dev/null +++ b/intern/cycles/kernel/kernel_queue_enqueue.cl @@ -0,0 +1,98 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_compat_opencl.h" +#include "kernel_math.h" +#include "kernel_types.h" +#include "kernel_globals.h" +#include "kernel_queues.h" + +/* + * The kernel "kernel_ocl_path_trace_queue_enqueue" enqueues rays of + * different ray state into their appropriate Queues; + * 1. Rays that have been determined to hit the background from the + * "kernel_ocl_path_trace_scene_intersect" kernel + * are enqueued in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; + * 2. Rays that have been determined to be actively participating in path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS. + * + * The input and output of the kernel is as follows, + * + * ray_state -------------------------------------------|--- kernel_ocl_path_trace_queue_enqueue --|--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS) + * Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------| |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS) + * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---| | + * queuesize -------------------------------------------| | + * + * Note on Queues : + * State of queues during the first time this kernel is called : + * At entry, + * Both QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty. + * At exit, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays + * QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_HIT_BACKGROUND rays. + * + * State of queue during other times this kernel is called : + * At entry, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty. + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays. + * At exit, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays. + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays. + */ + +__kernel void kernel_ocl_path_trace_queue_enqueue( + ccl_global int *Queue_data, /* Queue memory */ + ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ + ccl_global char *ray_state, /* Denotes the state of each ray */ + int queuesize /* Size (capacity) of each queue */ + ) +{ + /* We have only 2 cases (Hit/Not-Hit) */ + ccl_local unsigned int local_queue_atomics[2]; + + int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0); + int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); + + if(lidx < 2 ) { + local_queue_atomics[lidx] = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + int queue_number = -1; + + if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { + queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; + } else if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS; + } + + unsigned int my_lqidx; + if(queue_number != -1) { + my_lqidx = get_local_queue_index(queue_number, local_queue_atomics); + } + barrier(CLK_LOCAL_MEM_FENCE); + + if(lidx == 0) { + local_queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS, local_queue_atomics, Queue_index); + local_queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, local_queue_atomics, Queue_index); + } + barrier(CLK_LOCAL_MEM_FENCE); + + unsigned int my_gqidx; + if(queue_number != -1) { + my_gqidx = get_global_queue_index(queue_number, queuesize, my_lqidx, local_queue_atomics); + Queue_data[my_gqidx] = ray_index; + } +} diff --git a/intern/cycles/kernel/kernel_queues.h b/intern/cycles/kernel/kernel_queues.h new file mode 100644 index 00000000000..9e65e2b0768 --- /dev/null +++ b/intern/cycles/kernel/kernel_queues.h @@ -0,0 +1,132 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __KERNEL_QUEUE_H__ +#define __KERNEL_QUEUE_H__ + +/* + * Queue utility functions for split kernel + */ + +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable + +/* + * Enqueue ray index into the queue + */ +ccl_device void enqueue_ray_index ( + int ray_index, /* Ray index to be enqueued */ + int queue_number, /* Queue in which the ray index should be enqueued*/ + ccl_global int *queues, /* Buffer of all queues */ + int queue_size, /* Size of each queue */ + ccl_global int *queue_index /* Array of size num_queues; Used for atomic increment */ + ) +{ + /* This thread's queue index */ + int my_queue_index = atomic_inc(&queue_index[queue_number]) + (queue_number * queue_size); + queues[my_queue_index] = ray_index; +} + +/* + * Get the ray index for this thread + * Returns a positive ray_index for threads that have to do some work; + * Returns 'QUEUE_EMPTY_SLOT' for threads that don't have any work + * i.e All ray's in the queue has been successfully allocated and there + * is no more ray to allocate to other threads. + */ +ccl_device int get_ray_index ( + int thread_index, /* Global thread index */ + int queue_number, /* Queue to operate on */ + ccl_global int *queues, /* Buffer of all queues */ + int queuesize, /* Size of a queue */ + int empty_queue /* Empty the queue slot as soon as we fetch the ray index */ + ) +{ + int ray_index = queues[queue_number * queuesize + thread_index]; + + if(empty_queue && ray_index != QUEUE_EMPTY_SLOT) { + queues[queue_number * queuesize + thread_index] = QUEUE_EMPTY_SLOT; + } + + return ray_index; +} + +/* The following functions are to realize Local memory variant of enqueue ray index function */ + +/* All threads should call this function */ +ccl_device void enqueue_ray_index_local( + int ray_index, /* Ray index to enqueue*/ + int queue_number, /* Queue in which to enqueue ray index */ + char enqueue_flag, /* True for threads whose ray index has to be enqueued */ + int queuesize, /* queue size */ + ccl_local unsigned int *local_queue_atomics, /* To to local queue atomics */ + ccl_global int *Queue_data, /* Queues */ + ccl_global int *Queue_index /* To do global queue atomics */ + ) +{ + int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0); + + /* Get local queue id */ + unsigned int lqidx; + if(enqueue_flag) { + lqidx = atomic_inc(local_queue_atomics); + } + barrier(CLK_LOCAL_MEM_FENCE); + + /* Get global queue offset */ + if(lidx == 0) { + *local_queue_atomics = atomic_add(&Queue_index[queue_number], *local_queue_atomics); + } + barrier(CLK_LOCAL_MEM_FENCE); + + /* Get global queue index and enqueue ray */ + if(enqueue_flag) { + unsigned int my_gqidx = queue_number * queuesize + (*local_queue_atomics) + lqidx; + Queue_data[my_gqidx] = ray_index; + } +} + +ccl_device unsigned int get_local_queue_index( + int queue_number, /* Queue in which to enqueue the ray; -1 if no queue */ + ccl_local unsigned int *local_queue_atomics + ) +{ + int my_lqidx = atomic_inc(&local_queue_atomics[queue_number]); + return my_lqidx; +} + +ccl_device unsigned int get_global_per_queue_offset( + int queue_number, + ccl_local unsigned int *local_queue_atomics, + ccl_global int* global_queue_atomics + ) +{ + unsigned int queue_offset = atomic_add((&global_queue_atomics[queue_number]), local_queue_atomics[queue_number]); + return queue_offset; +} + +ccl_device unsigned int get_global_queue_index( + int queue_number, + int queuesize, + unsigned int lqidx, + ccl_local unsigned int * global_per_queue_offset + ) +{ + int my_gqidx = queuesize * queue_number + lqidx + global_per_queue_offset[queue_number]; + return my_gqidx; +} + +#endif // __KERNEL_QUEUE_H__ diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h index 40767bac013..631a2cb75de 100644 --- a/intern/cycles/kernel/kernel_random.h +++ b/intern/cycles/kernel/kernel_random.h @@ -98,7 +98,7 @@ ccl_device uint sobol_lookup(const uint m, const uint frame, const uint ex, cons return index; } -ccl_device_inline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension) +ccl_device_inline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension) { #ifdef __CMJ__ if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) { @@ -132,7 +132,7 @@ ccl_device_inline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int #endif } -ccl_device_inline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy) +ccl_device_inline void path_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy) { #ifdef __CMJ__ if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) { @@ -149,7 +149,7 @@ ccl_device_inline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int } } -ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, RNG *rng, int x, int y, float *fx, float *fy) +ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, ccl_addr_space RNG *rng, int x, int y, float *fx, float *fy) { #ifdef __SOBOL_FULL_SCREEN__ uint px, py; @@ -261,12 +261,12 @@ ccl_device uint lcg_init(uint seed) * For branches in the path we must be careful not to reuse the same number * in a sequence and offset accordingly. */ -ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension) +ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension) { return path_rng_1D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension); } -ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension) +ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension) { /* the rng_offset is not increased for transparent bounces. if we do then * fully transparent objects can become subtly visible by the different @@ -279,23 +279,23 @@ ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, RNG *r return path_rng_1D(kg, rng, state->sample, state->num_samples, rng_offset + dimension); } -ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, RNG *rng, const PathState *state, int dimension, float *fx, float *fy) +ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension, float *fx, float *fy) { path_rng_2D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension, fx, fy); } -ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension) +ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension) { return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension); } -ccl_device_inline float path_branched_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension) +ccl_device_inline float path_branched_rng_1D_for_decision(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension) { int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM; return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, rng_offset + dimension); } -ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, RNG *rng, const PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy) +ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy) { path_rng_2D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension, fx, fy); } diff --git a/intern/cycles/kernel/kernel_scene_intersect.cl b/intern/cycles/kernel/kernel_scene_intersect.cl new file mode 100644 index 00000000000..6817e28a302 --- /dev/null +++ b/intern/cycles/kernel/kernel_scene_intersect.cl @@ -0,0 +1,164 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_split.h" + +/* + * Note on kernel_ocl_path_trace_scene_intersect kernel. + * This is the second kernel in the ray tracing logic. This is the first + * of the path iteration kernels. This kernel takes care of scene_intersect function. + * + * This kernel changes the ray_state of RAY_REGENERATED rays to RAY_ACTIVE. + * This kernel processes rays of ray state RAY_ACTIVE + * This kernel determines the rays that have hit the background and changes their ray state to RAY_HIT_BACKGROUND. + * + * The input and output are as follows, + * + * Ray_coop ---------------------------------------|--------- kernel_ocl_path_trace_scene_intersect----------|--- PathState + * PathState_coop ---------------------------------| |--- Intersection + * ray_state --------------------------------------| |--- ray_state + * use_queues_flag --------------------------------| | + * parallel_samples -------------------------------| | + * QueueData(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---| | + * kg (data + globals) ----------------------------| | + * rng_coop ---------------------------------------| | + * sw ---------------------------------------------| | + * sh ---------------------------------------------| | + * queuesize --------------------------------------| | + * + * Note on Queues : + * Ideally we would want kernel_ocl_path_trace_scene_intersect to work on queues. + * But during the very first time, the queues wil be empty and hence we perform a direct mapping + * between ray-index and thread-index; From the next time onward, the queue will be filled and + * we may start operating on queues. + * + * State of queue during the first time this kernel is called : + * QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.before and after this kernel + * + * State of queues during other times this kernel is called : + * At entry, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will have a mix of RAY_ACTIVE, RAY_UPDATE_BUFFER and RAY_REGENERATED rays; + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays ; + * (The rays that are in the state RAY_UPDATE_BUFFER in both the queues are actually the same rays; These + * are the rays that were in RAY_ACTIVE state during the initial enqueue but on further processing + * , by different kernels, have turned into RAY_UPDATE_BUFFER rays. Since all kernel, even after fetching from + * QUEUE_ACTIVE_AND_REGENERATED_RAYS, proceed further based on ray state information, RAY_UPDATE_BUFFER rays + * being present in QUEUE_ACTIVE_AND_REGENERATED_RAYS does not cause any logical issues) + * At exit, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS - All RAY_REGENERATED rays will have been converted to RAY_ACTIVE and + * Some rays in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue will move to state RAY_HIT_BACKGROUND + * QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS - no change + */ + +__kernel void kernel_ocl_path_trace_scene_intersect( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global uint *rng_coop, + ccl_global Ray *Ray_coop, /* Required for scene_intersect */ + ccl_global PathState *PathState_coop, /* Required for scene_intersect */ + Intersection *Intersection_coop, /* Required for scene_intersect */ + ccl_global char *ray_state, /* Denotes the state of each ray */ + int sw, int sh, + ccl_global int *Queue_data, /* Memory for queues */ + ccl_global int *Queue_index, /* Tracks the number of elements in queues */ + int queuesize, /* Size (capacity) of queues */ + ccl_global char *use_queues_flag, /* used to decide if this kernel should use queues to fetch ray index */ +#ifdef __KERNEL_DEBUG__ + DebugData *debugdata_coop, +#endif + int parallel_samples /* Number of samples to be processed in parallel */ + ) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + /* Fetch use_queues_flag */ + ccl_local char local_use_queues_flag; + if(get_local_id(0) == 0 && get_local_id(1) == 0) { + local_use_queues_flag = use_queues_flag[0]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + int ray_index; + if(local_use_queues_flag) { + int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); + ray_index = get_ray_index(thread_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0); + + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + } else { + if(x < (sw * parallel_samples) && y < sh){ + ray_index = x + y * (sw * parallel_samples); + } else { + return; + } + } + + /* All regenerated rays become active here */ + if(IS_STATE(ray_state, ray_index, RAY_REGENERATED)) + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_ACTIVE); + + if(!IS_STATE(ray_state, ray_index, RAY_ACTIVE)) + return; + + /* Load kernel globals structure */ + KernelGlobals *kg = (KernelGlobals *)globals; + +#ifdef __KERNEL_DEBUG__ + DebugData *debug_data = &debugdata_coop[ray_index]; +#endif + Intersection *isect = &Intersection_coop[ray_index]; + PathState state = PathState_coop[ray_index]; + Ray ray = Ray_coop[ray_index]; + + /* intersect scene */ + uint visibility = path_state_ray_visibility(kg, &state); + +#ifdef __HAIR__ + float difl = 0.0f, extmax = 0.0f; + uint lcg_state = 0; + RNG rng = rng_coop[ray_index]; + + if(kernel_data.bvh.have_curves) { + if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) { + float3 pixdiff = ray.dD.dx + ray.dD.dy; + /*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/ + difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f; + } + + extmax = kernel_data.curve.maximum_width; + lcg_state = lcg_state_init(&rng, &state, 0x51633e2d); + } + + bool hit = scene_intersect(kg, &ray, visibility, isect, &lcg_state, difl, extmax); +#else + bool hit = scene_intersect(kg, &ray, visibility, isect, NULL, 0.0f, 0.0f); +#endif + +#ifdef __KERNEL_DEBUG__ + if(state.flag & PATH_RAY_CAMERA) { + debug_data->num_bvh_traversal_steps += isect->num_traversal_steps; + } +#endif + + if(!hit) { + /* Change the state of rays that hit the background; + * These rays undergo special processing in the + * background_bufferUpdate kernel*/ + ASSIGN_RAY_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND); + } +} diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h index e9d9f72dfcd..a12419624c3 100644 --- a/intern/cycles/kernel/kernel_shader.h +++ b/intern/cycles/kernel/kernel_shader.h @@ -52,55 +52,55 @@ ccl_device void shader_setup_from_ray(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, int bounce, int transparent_bounce) { #ifdef __INSTANCING__ - sd->object = (isect->object == PRIM_NONE)? kernel_tex_fetch(__prim_object, isect->prim): isect->object; + ccl_fetch(sd, object) = (isect->object == PRIM_NONE)? kernel_tex_fetch(__prim_object, isect->prim): isect->object; #endif - sd->type = isect->type; - sd->flag = kernel_tex_fetch(__object_flag, sd->object); + ccl_fetch(sd, type) = isect->type; + ccl_fetch(sd, flag) = kernel_tex_fetch(__object_flag, ccl_fetch(sd, object)); /* matrices and time */ #ifdef __OBJECT_MOTION__ shader_setup_object_transforms(kg, sd, ray->time); - sd->time = ray->time; + ccl_fetch(sd, time) = ray->time; #endif - sd->prim = kernel_tex_fetch(__prim_index, isect->prim); - sd->ray_length = isect->t; - sd->ray_depth = bounce; - sd->transparent_depth = transparent_bounce; + ccl_fetch(sd, prim) = kernel_tex_fetch(__prim_index, isect->prim); + ccl_fetch(sd, ray_length) = isect->t; + ccl_fetch(sd, ray_depth) = bounce; + ccl_fetch(sd, transparent_depth) = transparent_bounce; #ifdef __UV__ - sd->u = isect->u; - sd->v = isect->v; + ccl_fetch(sd, u) = isect->u; + ccl_fetch(sd, v) = isect->v; #endif #ifdef __HAIR__ - if(sd->type & PRIMITIVE_ALL_CURVE) { + if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { /* curve */ - float4 curvedata = kernel_tex_fetch(__curves, sd->prim); + float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim)); - sd->shader = __float_as_int(curvedata.z); - sd->P = bvh_curve_refine(kg, sd, isect, ray); + ccl_fetch(sd, shader) = __float_as_int(curvedata.z); + ccl_fetch(sd, P) = bvh_curve_refine(kg, sd, isect, ray); } else #endif - if(sd->type & PRIMITIVE_TRIANGLE) { + if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) { /* static triangle */ float3 Ng = triangle_normal(kg, sd); - sd->shader = kernel_tex_fetch(__tri_shader, sd->prim); + ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim)); /* vectors */ - sd->P = triangle_refine(kg, sd, isect, ray); - sd->Ng = Ng; - sd->N = Ng; + ccl_fetch(sd, P) = triangle_refine(kg, sd, isect, ray); + ccl_fetch(sd, Ng) = Ng; + ccl_fetch(sd, N) = Ng; /* smooth normal */ - if(sd->shader & SHADER_SMOOTH_NORMAL) - sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v); + if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) + ccl_fetch(sd, N) = triangle_smooth_normal(kg, ccl_fetch(sd, prim), ccl_fetch(sd, u), ccl_fetch(sd, v)); #ifdef __DPDU__ /* dPdu/dPdv */ - triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv); + triangle_dPdudv(kg, ccl_fetch(sd, prim), &ccl_fetch(sd, dPdu), &ccl_fetch(sd, dPdv)); #endif } else { @@ -108,40 +108,40 @@ ccl_device void shader_setup_from_ray(KernelGlobals *kg, ShaderData *sd, motion_triangle_shader_setup(kg, sd, isect, ray, false); } - sd->I = -ray->D; + ccl_fetch(sd, I) = -ray->D; - sd->flag |= kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2); + ccl_fetch(sd, flag) |= kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*2); #ifdef __INSTANCING__ if(isect->object != OBJECT_NONE) { /* instance transform */ - object_normal_transform(kg, sd, &sd->N); - object_normal_transform(kg, sd, &sd->Ng); + object_normal_transform_auto(kg, sd, &ccl_fetch(sd, N)); + object_normal_transform_auto(kg, sd, &ccl_fetch(sd, Ng)); #ifdef __DPDU__ - object_dir_transform(kg, sd, &sd->dPdu); - object_dir_transform(kg, sd, &sd->dPdv); + object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdu)); + object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdv)); #endif } #endif /* backfacing test */ - bool backfacing = (dot(sd->Ng, sd->I) < 0.0f); + bool backfacing = (dot(ccl_fetch(sd, Ng), ccl_fetch(sd, I)) < 0.0f); if(backfacing) { - sd->flag |= SD_BACKFACING; - sd->Ng = -sd->Ng; - sd->N = -sd->N; + ccl_fetch(sd, flag) |= SD_BACKFACING; + ccl_fetch(sd, Ng) = -ccl_fetch(sd, Ng); + ccl_fetch(sd, N) = -ccl_fetch(sd, N); #ifdef __DPDU__ - sd->dPdu = -sd->dPdu; - sd->dPdv = -sd->dPdv; + ccl_fetch(sd, dPdu) = -ccl_fetch(sd, dPdu); + ccl_fetch(sd, dPdv) = -ccl_fetch(sd, dPdv); #endif } #ifdef __RAY_DIFFERENTIALS__ /* differentials */ - differential_transfer(&sd->dP, ray->dP, ray->D, ray->dD, sd->Ng, isect->t); - differential_incoming(&sd->dI, ray->dD); - differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng); + differential_transfer(&ccl_fetch(sd, dP), ray->dP, ray->D, ray->dD, ccl_fetch(sd, Ng), isect->t); + differential_incoming(&ccl_fetch(sd, dI), ray->dD); + differential_dudv(&ccl_fetch(sd, du), &ccl_fetch(sd, dv), ccl_fetch(sd, dPdu), ccl_fetch(sd, dPdv), ccl_fetch(sd, dP), ccl_fetch(sd, Ng)); #endif } @@ -230,105 +230,105 @@ ccl_device void shader_setup_from_sample(KernelGlobals *kg, ShaderData *sd, int shader, int object, int prim, float u, float v, float t, float time, int bounce, int transparent_bounce) { /* vectors */ - sd->P = P; - sd->N = Ng; - sd->Ng = Ng; - sd->I = I; - sd->shader = shader; - sd->type = (prim == PRIM_NONE)? PRIMITIVE_NONE: PRIMITIVE_TRIANGLE; + ccl_fetch(sd, P) = P; + ccl_fetch(sd, N) = Ng; + ccl_fetch(sd, Ng) = Ng; + ccl_fetch(sd, I) = I; + ccl_fetch(sd, shader) = shader; + ccl_fetch(sd, type) = (prim == PRIM_NONE)? PRIMITIVE_NONE: PRIMITIVE_TRIANGLE; /* primitive */ #ifdef __INSTANCING__ - sd->object = object; + ccl_fetch(sd, object) = object; #endif /* currently no access to bvh prim index for strand sd->prim*/ - sd->prim = prim; + ccl_fetch(sd, prim) = prim; #ifdef __UV__ - sd->u = u; - sd->v = v; + ccl_fetch(sd, u) = u; + ccl_fetch(sd, v) = v; #endif - sd->ray_length = t; - sd->ray_depth = bounce; - sd->transparent_depth = transparent_bounce; + ccl_fetch(sd, ray_length) = t; + ccl_fetch(sd, ray_depth) = bounce; + ccl_fetch(sd, transparent_depth) = transparent_bounce; /* detect instancing, for non-instanced the object index is -object-1 */ #ifdef __INSTANCING__ bool instanced = false; - if(sd->prim != PRIM_NONE) { - if(sd->object >= 0) + if(ccl_fetch(sd, prim) != PRIM_NONE) { + if(ccl_fetch(sd, object) >= 0) instanced = true; else #endif - sd->object = ~sd->object; + ccl_fetch(sd, object) = ~ccl_fetch(sd, object); #ifdef __INSTANCING__ } #endif - sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2); - if(sd->object != OBJECT_NONE) { - sd->flag |= kernel_tex_fetch(__object_flag, sd->object); + ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*2); + if(ccl_fetch(sd, object) != OBJECT_NONE) { + ccl_fetch(sd, flag) |= kernel_tex_fetch(__object_flag, ccl_fetch(sd, object)); #ifdef __OBJECT_MOTION__ shader_setup_object_transforms(kg, sd, time); } - sd->time = time; + ccl_fetch(sd, time) = time; #else } #endif - if(sd->type & PRIMITIVE_TRIANGLE) { + if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) { /* smooth normal */ - if(sd->shader & SHADER_SMOOTH_NORMAL) { - sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v); + if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) { + ccl_fetch(sd, N) = triangle_smooth_normal(kg, ccl_fetch(sd, prim), ccl_fetch(sd, u), ccl_fetch(sd, v)); #ifdef __INSTANCING__ if(instanced) - object_normal_transform(kg, sd, &sd->N); + object_normal_transform_auto(kg, sd, &ccl_fetch(sd, N)); #endif } /* dPdu/dPdv */ #ifdef __DPDU__ - triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv); + triangle_dPdudv(kg, ccl_fetch(sd, prim), &ccl_fetch(sd, dPdu), &ccl_fetch(sd, dPdv)); #ifdef __INSTANCING__ if(instanced) { - object_dir_transform(kg, sd, &sd->dPdu); - object_dir_transform(kg, sd, &sd->dPdv); + object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdu)); + object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdv)); } #endif #endif } else { #ifdef __DPDU__ - sd->dPdu = make_float3(0.0f, 0.0f, 0.0f); - sd->dPdv = make_float3(0.0f, 0.0f, 0.0f); + ccl_fetch(sd, dPdu) = make_float3(0.0f, 0.0f, 0.0f); + ccl_fetch(sd, dPdv) = make_float3(0.0f, 0.0f, 0.0f); #endif } /* backfacing test */ - if(sd->prim != PRIM_NONE) { - bool backfacing = (dot(sd->Ng, sd->I) < 0.0f); + if(ccl_fetch(sd, prim) != PRIM_NONE) { + bool backfacing = (dot(ccl_fetch(sd, Ng), ccl_fetch(sd, I)) < 0.0f); if(backfacing) { - sd->flag |= SD_BACKFACING; - sd->Ng = -sd->Ng; - sd->N = -sd->N; + ccl_fetch(sd, flag) |= SD_BACKFACING; + ccl_fetch(sd, Ng) = -ccl_fetch(sd, Ng); + ccl_fetch(sd, N) = -ccl_fetch(sd, N); #ifdef __DPDU__ - sd->dPdu = -sd->dPdu; - sd->dPdv = -sd->dPdv; + ccl_fetch(sd, dPdu) = -ccl_fetch(sd, dPdu); + ccl_fetch(sd, dPdv) = -ccl_fetch(sd, dPdv); #endif } } #ifdef __RAY_DIFFERENTIALS__ /* no ray differentials here yet */ - sd->dP = differential3_zero(); - sd->dI = differential3_zero(); - sd->du = differential_zero(); - sd->dv = differential_zero(); + ccl_fetch(sd, dP) = differential3_zero(); + ccl_fetch(sd, dI) = differential3_zero(); + ccl_fetch(sd, du) = differential_zero(); + ccl_fetch(sd, dv) = differential_zero(); #endif } @@ -355,45 +355,46 @@ ccl_device void shader_setup_from_displace(KernelGlobals *kg, ShaderData *sd, ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderData *sd, const Ray *ray, int bounce, int transparent_bounce) { /* vectors */ - sd->P = ray->D; - sd->N = -ray->D; - sd->Ng = -ray->D; - sd->I = -ray->D; - sd->shader = kernel_data.background.surface_shader; - sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2); + ccl_fetch(sd, P) = ray->D; + ccl_fetch(sd, N) = -ray->D; + ccl_fetch(sd, Ng) = -ray->D; + ccl_fetch(sd, I) = -ray->D; + ccl_fetch(sd, shader) = kernel_data.background.surface_shader; + ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*2); #ifdef __OBJECT_MOTION__ - sd->time = ray->time; + ccl_fetch(sd, time) = ray->time; #endif - sd->ray_length = 0.0f; - sd->ray_depth = bounce; - sd->transparent_depth = transparent_bounce; + ccl_fetch(sd, ray_length) = 0.0f; + ccl_fetch(sd, ray_depth) = bounce; + ccl_fetch(sd, transparent_depth) = transparent_bounce; #ifdef __INSTANCING__ - sd->object = PRIM_NONE; + ccl_fetch(sd, object) = PRIM_NONE; #endif - sd->prim = PRIM_NONE; + ccl_fetch(sd, prim) = PRIM_NONE; #ifdef __UV__ - sd->u = 0.0f; - sd->v = 0.0f; + ccl_fetch(sd, u) = 0.0f; + ccl_fetch(sd, v) = 0.0f; #endif #ifdef __DPDU__ /* dPdu/dPdv */ - sd->dPdu = make_float3(0.0f, 0.0f, 0.0f); - sd->dPdv = make_float3(0.0f, 0.0f, 0.0f); + ccl_fetch(sd, dPdu) = make_float3(0.0f, 0.0f, 0.0f); + ccl_fetch(sd, dPdv) = make_float3(0.0f, 0.0f, 0.0f); #endif #ifdef __RAY_DIFFERENTIALS__ /* differentials */ - sd->dP = ray->dD; - differential_incoming(&sd->dI, sd->dP); - sd->du = differential_zero(); - sd->dv = differential_zero(); + ccl_fetch(sd, dP) = ray->dD; + differential_incoming(&ccl_fetch(sd, dI), ccl_fetch(sd, dP)); + ccl_fetch(sd, du) = differential_zero(); + ccl_fetch(sd, dv) = differential_zero(); #endif } /* ShaderData setup from point inside volume */ +#ifdef __VOLUME__ ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *sd, const Ray *ray, int bounce, int transparent_bounce) { /* vectors */ @@ -439,6 +440,7 @@ ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *s sd->ray_P = ray->P; sd->ray_dP = ray->dP; } +#endif /* Merging */ @@ -491,11 +493,11 @@ ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg, const ShaderDa { /* this is the veach one-sample model with balance heuristic, some pdf * factors drop out when using balance heuristic weighting */ - for(int i = 0; i< sd->num_closure; i++) { + for(int i = 0; i< ccl_fetch(sd, num_closure); i++) { if(i == skip_bsdf) continue; - const ShaderClosure *sc = &sd->closure[i]; + const ShaderClosure *sc = ccl_fetch_array(sd, closure, i); if(CLOSURE_IS_BSDF(sc->type)) { float bsdf_pdf = 0.0f; @@ -513,7 +515,7 @@ ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg, const ShaderDa *pdf = (sum_sample_weight > 0.0f)? sum_pdf/sum_sample_weight: 0.0f; } -ccl_device void shader_bsdf_eval(KernelGlobals *kg, const ShaderData *sd, +ccl_device void shader_bsdf_eval(KernelGlobals *kg, ShaderData *sd, const float3 omega_in, BsdfEval *eval, float *pdf) { bsdf_eval_init(eval, NBUILTIN_CLOSURES, make_float3(0.0f, 0.0f, 0.0f), kernel_data.film.use_light_pass); @@ -527,22 +529,22 @@ ccl_device int shader_bsdf_sample(KernelGlobals *kg, const ShaderData *sd, { int sampled = 0; - if(sd->num_closure > 1) { + if(ccl_fetch(sd, num_closure) > 1) { /* pick a BSDF closure based on sample weights */ float sum = 0.0f; - for(sampled = 0; sampled < sd->num_closure; sampled++) { - const ShaderClosure *sc = &sd->closure[sampled]; + for(sampled = 0; sampled < ccl_fetch(sd, num_closure); sampled++) { + const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled); if(CLOSURE_IS_BSDF(sc->type)) sum += sc->sample_weight; } - float r = sd->randb_closure*sum; + float r = ccl_fetch(sd, randb_closure)*sum; sum = 0.0f; - for(sampled = 0; sampled < sd->num_closure; sampled++) { - const ShaderClosure *sc = &sd->closure[sampled]; + for(sampled = 0; sampled < ccl_fetch(sd, num_closure); sampled++) { + const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled); if(CLOSURE_IS_BSDF(sc->type)) { sum += sc->sample_weight; @@ -552,13 +554,14 @@ ccl_device int shader_bsdf_sample(KernelGlobals *kg, const ShaderData *sd, } } - if(sampled == sd->num_closure) { + if(sampled == ccl_fetch(sd, num_closure)) { *pdf = 0.0f; return LABEL_NONE; } } - const ShaderClosure *sc = &sd->closure[sampled]; + const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled); + int label; float3 eval; @@ -568,7 +571,7 @@ ccl_device int shader_bsdf_sample(KernelGlobals *kg, const ShaderData *sd, if(*pdf != 0.0f) { bsdf_eval_init(bsdf_eval, sc->type, eval*sc->weight, kernel_data.film.use_light_pass); - if(sd->num_closure > 1) { + if(ccl_fetch(sd, num_closure) > 1) { float sweight = sc->sample_weight; _shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sampled, bsdf_eval, *pdf*sweight, sweight); } @@ -595,8 +598,8 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg, const ShaderData *s ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughness) { - for(int i = 0; i< sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + for(int i = 0; i< ccl_fetch(sd, num_closure); i++) { + ShaderClosure *sc = ccl_fetch_array(sd, closure, i); if(CLOSURE_IS_BSDF(sc->type)) bsdf_blur(kg, sc, roughness); @@ -605,13 +608,13 @@ ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughn ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd) { - if(sd->flag & SD_HAS_ONLY_VOLUME) + if(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME) return make_float3(1.0f, 1.0f, 1.0f); float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i< sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + for(int i = 0; i< ccl_fetch(sd, num_closure); i++) { + ShaderClosure *sc = ccl_fetch_array(sd, closure, i); if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) // todo: make this work for osl eval += sc->weight; @@ -634,8 +637,8 @@ ccl_device float3 shader_bsdf_diffuse(KernelGlobals *kg, ShaderData *sd) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i< sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + for(int i = 0; i< ccl_fetch(sd, num_closure); i++) { + ShaderClosure *sc = ccl_fetch_array(sd, closure, i); if(CLOSURE_IS_BSDF_DIFFUSE(sc->type)) eval += sc->weight; @@ -648,8 +651,8 @@ ccl_device float3 shader_bsdf_glossy(KernelGlobals *kg, ShaderData *sd) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i< sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + for(int i = 0; i< ccl_fetch(sd, num_closure); i++) { + ShaderClosure *sc = ccl_fetch_array(sd, closure, i); if(CLOSURE_IS_BSDF_GLOSSY(sc->type)) eval += sc->weight; @@ -662,8 +665,8 @@ ccl_device float3 shader_bsdf_transmission(KernelGlobals *kg, ShaderData *sd) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i< sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + for(int i = 0; i< ccl_fetch(sd, num_closure); i++) { + ShaderClosure *sc = ccl_fetch_array(sd, closure, i); if(CLOSURE_IS_BSDF_TRANSMISSION(sc->type)) eval += sc->weight; @@ -676,8 +679,8 @@ ccl_device float3 shader_bsdf_subsurface(KernelGlobals *kg, ShaderData *sd) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i< sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + for(int i = 0; i< ccl_fetch(sd, num_closure); i++) { + ShaderClosure *sc = ccl_fetch_array(sd, closure, i); if(CLOSURE_IS_BSSRDF(sc->type) || CLOSURE_IS_BSDF_BSSRDF(sc->type)) eval += sc->weight; @@ -691,8 +694,8 @@ ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_fac float3 eval = make_float3(0.0f, 0.0f, 0.0f); float3 N = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i< sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + for(int i = 0; i< ccl_fetch(sd, num_closure); i++) { + ShaderClosure *sc = ccl_fetch_array(sd, closure, i); if(CLOSURE_IS_BSDF_DIFFUSE(sc->type)) { eval += sc->weight*ao_factor; @@ -700,12 +703,12 @@ ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_fac } else if(CLOSURE_IS_AMBIENT_OCCLUSION(sc->type)) { eval += sc->weight; - N += sd->N*average(sc->weight); + N += ccl_fetch(sd, N)*average(sc->weight); } } if(is_zero(N)) - N = sd->N; + N = ccl_fetch(sd, N); else N = normalize(N); @@ -719,8 +722,8 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b float3 N = make_float3(0.0f, 0.0f, 0.0f); float texture_blur = 0.0f, weight_sum = 0.0f; - for(int i = 0; i< sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + for(int i = 0; i< ccl_fetch(sd, num_closure); i++) { + ShaderClosure *sc = ccl_fetch_array(sd, closure, i); if(CLOSURE_IS_BSSRDF(sc->type)) { float avg_weight = fabsf(average(sc->weight)); @@ -733,7 +736,7 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b } if(N_) - *N_ = (is_zero(N))? sd->N: normalize(N); + *N_ = (is_zero(N))? ccl_fetch(sd, N): normalize(N); if(texture_blur_) *texture_blur_ = texture_blur/weight_sum; @@ -745,7 +748,7 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b ccl_device float3 emissive_eval(KernelGlobals *kg, ShaderData *sd, ShaderClosure *sc) { - return emissive_simple_eval(sd->Ng, sd->I); + return emissive_simple_eval(ccl_fetch(sd, Ng), ccl_fetch(sd, I)); } ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd) @@ -753,8 +756,8 @@ ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd) float3 eval; eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { + ShaderClosure *sc = ccl_fetch_array(sd, closure, i); if(CLOSURE_IS_EMISSION(sc->type)) eval += emissive_eval(kg, sd, sc)*sc->weight; @@ -769,8 +772,8 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd) { float3 weight = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; + for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { + ShaderClosure *sc = ccl_fetch_array(sd, closure, i); if(CLOSURE_IS_HOLDOUT(sc->type)) weight += sc->weight; @@ -784,8 +787,8 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd) ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, float randb, int path_flag, ShaderContext ctx) { - sd->num_closure = 0; - sd->randb_closure = randb; + ccl_fetch(sd, num_closure) = 0; + ccl_fetch(sd, randb_closure) = randb; #ifdef __OSL__ if(kg->osl) @@ -796,11 +799,11 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, #ifdef __SVM__ svm_eval_nodes(kg, sd, SHADER_TYPE_SURFACE, path_flag); #else - sd->closure->weight = make_float3(0.8f, 0.8f, 0.8f); - sd->closure->N = sd->N; - sd->closure->data0 = 0.0f; - sd->closure->data1 = 0.0f; - sd->flag |= bsdf_diffuse_setup(&sd->closure); + ccl_fetch_array(sd, closure, 0)->weight = make_float3(0.8f, 0.8f, 0.8f); + ccl_fetch_array(sd, closure, 0)->N = ccl_fetch(sd, N); + ccl_fetch_array(sd, closure, 0)->data0 = 0.0f; + ccl_fetch_array(sd, closure, 0)->data1 = 0.0f; + ccl_fetch(sd, flag) |= bsdf_diffuse_setup(ccl_fetch_array(sd, closure, 0)); #endif } } @@ -809,8 +812,8 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, int path_flag, ShaderContext ctx) { - sd->num_closure = 0; - sd->randb_closure = 0.0f; + ccl_fetch(sd, num_closure) = 0; + ccl_fetch(sd, randb_closure) = 0.0f; #ifdef __OSL__ if(kg->osl) { @@ -825,8 +828,8 @@ ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, int float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i< sd->num_closure; i++) { - const ShaderClosure *sc = &sd->closure[i]; + for(int i = 0; i< ccl_fetch(sd, num_closure); i++) { + const ShaderClosure *sc = ccl_fetch_array(sd, closure, i); if(CLOSURE_IS_BACKGROUND(sc->type)) eval += sc->weight; @@ -999,8 +1002,8 @@ ccl_device void shader_eval_volume(KernelGlobals *kg, ShaderData *sd, ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ShaderContext ctx) { - sd->num_closure = 0; - sd->randb_closure = 0.0f; + ccl_fetch(sd, num_closure) = 0; + ccl_fetch(sd, randb_closure) = 0.0f; /* this will modify sd->P */ #ifdef __SVM__ diff --git a/intern/cycles/kernel/kernel_shader_eval.cl b/intern/cycles/kernel/kernel_shader_eval.cl new file mode 100644 index 00000000000..78cf19a3df8 --- /dev/null +++ b/intern/cycles/kernel/kernel_shader_eval.cl @@ -0,0 +1,93 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_split.h" + +/* + * Note on kernel_ocl_path_trace_shader_evaluation kernel + * This kernel is the 5th kernel in the ray tracing logic. This is + * the 4rd kernel in path iteration. This kernel sets up the ShaderData + * structure from the values computed by the previous kernels. It also identifies + * the rays of state RAY_TO_REGENERATE and enqueues them in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. + * + * The input and output of the kernel is as follows, + * rng_coop -------------------------------------------|--- kernel_ocl_path_trace_shader_evaluation --|--- shader_data + * Ray_coop -------------------------------------------| |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) + * PathState_coop -------------------------------------| |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) + * Intersection_coop ----------------------------------| | + * Queue_data (QUEUE_ACTIVE_AND_REGENERATD_RAYS)-------| | + * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)---| | + * ray_state ------------------------------------------| | + * kg (globals + data) --------------------------------| | + * queuesize ------------------------------------------| | + * + * Note on Queues : + * This kernel reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes + * only the rays of state RAY_ACTIVE; + * State of queues when this kernel is called, + * at entry, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty. + * at exit, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays + */ + +__kernel void kernel_ocl_path_trace_shader_evaluation( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global char *shader_data, /* Output ShaderData structure to be filled */ + ccl_global uint *rng_coop, /* Required for rbsdf calculation */ + ccl_global Ray *Ray_coop, /* Required for setting up shader from ray */ + ccl_global PathState *PathState_coop, /* Required for all functions in this kernel */ + Intersection *Intersection_coop, /* Required for setting up shader from ray */ + ccl_global char *ray_state, /* Denotes the state of each ray */ + ccl_global int *Queue_data, /* queue memory */ + ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ + int queuesize /* Size (capacity) of each queue */ + ) +{ + int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); + /* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue */ + ccl_local unsigned int local_queue_atomics; + if(get_local_id(0) == 0 && get_local_id(1) == 0) { + local_queue_atomics = 0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + char enqueue_flag = (IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0; + + enqueue_ray_index_local(ray_index, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, enqueue_flag, queuesize, &local_queue_atomics, Queue_data, Queue_index); + + ray_index = get_ray_index(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0); + + if(ray_index == QUEUE_EMPTY_SLOT) + return; + + /* Continue on with shader evaluation */ + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { + KernelGlobals *kg = (KernelGlobals *)globals; + ShaderData *sd = (ShaderData *)shader_data; + Intersection *isect = &Intersection_coop[ray_index]; + ccl_global uint *rng = &rng_coop[ray_index]; + ccl_global PathState *state = &PathState_coop[ray_index]; + Ray ray = Ray_coop[ray_index]; + + shader_setup_from_ray(kg, sd, isect, &ray, state->bounce, state->transparent_bounce); + float rbsdf = path_state_rng_1D_for_decision(kg, rng, state, PRNG_BSDF); + shader_eval_surface(kg, sd, rbsdf, state->flag, SHADER_CONTEXT_MAIN); + } +} diff --git a/intern/cycles/kernel/kernel_shaderdata_vars.h b/intern/cycles/kernel/kernel_shaderdata_vars.h new file mode 100644 index 00000000000..b157b82e023 --- /dev/null +++ b/intern/cycles/kernel/kernel_shaderdata_vars.h @@ -0,0 +1,99 @@ +/* +* Copyright 2011-2015 Blender Foundation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +#ifndef SD_VAR +#define SD_VAR(type, what) +#endif +#ifndef SD_CLOSURE_VAR +#define SD_CLOSURE_VAR(type, what, max_closure) +#endif + +/* position */ +SD_VAR(float3, P) +/* smooth normal for shading */ +SD_VAR(float3, N) +/* true geometric normal */ +SD_VAR(float3, Ng) +/* view/incoming direction */ +SD_VAR(float3, I) +/* shader id */ +SD_VAR(int, shader) +/* booleans describing shader, see ShaderDataFlag */ +SD_VAR(int, flag) + +/* primitive id if there is one, ~0 otherwise */ +SD_VAR(int, prim) + +/* combined type and curve segment for hair */ +SD_VAR(int, type) + +/* parametric coordinates +* - barycentric weights for triangles */ +SD_VAR(float, u) +SD_VAR(float, v) +/* object id if there is one, ~0 otherwise */ +SD_VAR(int, object) + +/* motion blur sample time */ +SD_VAR(float, time) + +/* length of the ray being shaded */ +SD_VAR(float, ray_length) + +/* ray bounce depth */ +SD_VAR(int, ray_depth) + +/* ray transparent depth */ +SD_VAR(int, transparent_depth) + +#ifdef __RAY_DIFFERENTIALS__ +/* differential of P. these are orthogonal to Ng, not N */ +SD_VAR(differential3, dP) +/* differential of I */ +SD_VAR(differential3, dI) +/* differential of u, v */ +SD_VAR(differential, du) +SD_VAR(differential, dv) +#endif +#ifdef __DPDU__ +/* differential of P w.r.t. parametric coordinates. note that dPdu is +* not readily suitable as a tangent for shading on triangles. */ +SD_VAR(float3, dPdu) +SD_VAR(float3, dPdv) +#endif + +#ifdef __OBJECT_MOTION__ +/* object <-> world space transformations, cached to avoid +* re-interpolating them constantly for shading */ +SD_VAR(Transform, ob_tfm) +SD_VAR(Transform, ob_itfm) +#endif + +/* Closure data, we store a fixed array of closures */ +SD_CLOSURE_VAR(ShaderClosure, closure, MAX_CLOSURE) +SD_VAR(int, num_closure) +SD_VAR(float, randb_closure) + +/* ray start position, only set for backgrounds */ +SD_VAR(float3, ray_P) +SD_VAR(differential3, ray_dP) + +#ifdef __OSL__ +SD_VAR(struct KernelGlobals *, osl_globals) +#endif + +#undef SD_VAR +#undef SD_CLOSURE_VAR diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h index d7c4fa02bcf..5119b7a30c2 100644 --- a/intern/cycles/kernel/kernel_shadow.h +++ b/intern/cycles/kernel/kernel_shadow.h @@ -180,19 +180,37 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray * * potentially transparent, and only in that case start marching. this gives * one extra ray cast for the cases were we do want transparency. */ -ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray *ray, float3 *shadow) +/* The arguments sd_mem and isect_mem are meaningful only for OpenCL split kernel. Other uses can just pass a NULL */ +ccl_device_inline bool shadow_blocked(KernelGlobals *kg, ccl_addr_space PathState *state, ccl_addr_space Ray *ray_input, float3 *shadow +#ifdef __SPLIT_KERNEL__ + , ShaderData *sd_mem, Intersection *isect_mem +#endif + ) { *shadow = make_float3(1.0f, 1.0f, 1.0f); - if(ray->t == 0.0f) + if(ray_input->t == 0.0f) return false; - Intersection isect; - bool blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f); +#ifdef __SPLIT_KERNEL__ + Ray private_ray = *ray_input; + Ray *ray = &private_ray; +#else + Ray *ray = ray_input; +#endif + +#ifdef __SPLIT_KERNEL__ + Intersection *isect = isect_mem; +#else + Intersection isect_object; + Intersection *isect = &isect_object; +#endif + + bool blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, isect, NULL, 0.0f, 0.0f); #ifdef __TRANSPARENT_SHADOWS__ if(blocked && kernel_data.integrator.transparent_shadows) { - if(shader_transparent_shadow(kg, &isect)) { + if(shader_transparent_shadow(kg, isect)) { float3 throughput = make_float3(1.0f, 1.0f, 1.0f); float3 Pend = ray->P + ray->D*ray->t; int bounce = state->transparent_bounce; @@ -204,9 +222,8 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray * if(bounce >= kernel_data.integrator.transparent_max_bounce) return true; - if(!scene_intersect(kg, ray, PATH_RAY_SHADOW_TRANSPARENT, &isect, NULL, 0.0f, 0.0f)) + if(!scene_intersect(kg, ray, PATH_RAY_SHADOW_TRANSPARENT, isect, NULL, 0.0f, 0.0f)) { - #ifdef __VOLUME__ /* attenuation for last line segment towards light */ if(ps.volume_stack[0].shader != SHADER_NONE) @@ -218,39 +235,44 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, PathState *state, Ray * return false; } - if(!shader_transparent_shadow(kg, &isect)) + if(!shader_transparent_shadow(kg, isect)) return true; #ifdef __VOLUME__ /* attenuation between last surface and next surface */ if(ps.volume_stack[0].shader != SHADER_NONE) { Ray segment_ray = *ray; - segment_ray.t = isect.t; + segment_ray.t = isect->t; kernel_volume_shadow(kg, &ps, &segment_ray, &throughput); } #endif /* setup shader data at surface */ - ShaderData sd; - shader_setup_from_ray(kg, &sd, &isect, ray, state->bounce+1, bounce); +#ifdef __SPLIT_KERNEL__ + ShaderData *sd = sd_mem; +#else + ShaderData sd_object; + ShaderData *sd = &sd_object; +#endif + shader_setup_from_ray(kg, sd, isect, ray, state->bounce+1, bounce); /* attenuation from transparent surface */ - if(!(sd.flag & SD_HAS_ONLY_VOLUME)) { - shader_eval_surface(kg, &sd, 0.0f, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW); - throughput *= shader_bsdf_transparency(kg, &sd); + if(!(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME)) { + shader_eval_surface(kg, sd, 0.0f, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW); + throughput *= shader_bsdf_transparency(kg, sd); } if(is_zero(throughput)) return true; /* move ray forward */ - ray->P = ray_offset(sd.P, -sd.Ng); + ray->P = ray_offset(ccl_fetch(sd, P), -ccl_fetch(sd, Ng)); if(ray->t != FLT_MAX) ray->D = normalize_len(Pend - ray->P, &ray->t); #ifdef __VOLUME__ /* exit/enter volume */ - kernel_volume_stack_enter_exit(kg, &sd, ps.volume_stack); + kernel_volume_stack_enter_exit(kg, sd, ps.volume_stack); #endif bounce++; diff --git a/intern/cycles/kernel/kernel_shadow_blocked.cl b/intern/cycles/kernel/kernel_shadow_blocked.cl new file mode 100644 index 00000000000..72a2d0affb0 --- /dev/null +++ b/intern/cycles/kernel/kernel_shadow_blocked.cl @@ -0,0 +1,126 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_split.h" + +/* + * Note on kernel_ocl_path_trace_shadow_blocked kernel. + * This is the ninth kernel in the ray tracing logic. This is the eighth + * of the path iteration kernels. This kernel takes care of "shadow ray cast" + * logic of the direct lighting and AO part of ray tracing. + * + * The input and output are as follows, + * + * PathState_coop ----------------------------------|--- kernel_ocl_path_trace_shadow_blocked --| + * LightRay_dl_coop --------------------------------| |--- LightRay_dl_coop + * LightRay_ao_coop --------------------------------| |--- LightRay_ao_coop + * ray_state ---------------------------------------| |--- ray_state + * Queue_data(QUEUE_SHADOW_RAY_CAST_AO_RAYS & | |--- Queue_data (QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_AO_RAYS) + QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------| | + * Queue_index(QUEUE_SHADOW_RAY_CAST_AO_RAYS& + QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------| | + * kg (globals + data) -----------------------------| | + * queuesize ---------------------------------------| | + * + * Note on shader_shadow : shader_shadow is neither input nor output to this kernel. shader_shadow is filled and consumed in this kernel itself. + * Note on queues : + * The kernel fetches from QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS queues. We will empty + * these queues this kernel. + * State of queues when this kernel is called : + * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same + * before and after this kernel call. + * QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_DL_RAYS will be filled with rays marked with flags RAY_SHADOW_RAY_CAST_AO + * and RAY_SHADOW_RAY_CAST_DL respectively, during kernel entry. + * QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty at kernel exit. + */ + +__kernel void kernel_ocl_path_trace_shadow_blocked_direct_lighting( + ccl_global char *globals, + ccl_constant KernelData *data, + ccl_global char *shader_shadow, /* Required for shadow blocked */ + ccl_global PathState *PathState_coop, /* Required for shadow blocked */ + ccl_global Ray *LightRay_dl_coop, /* Required for direct lighting's shadow blocked */ + ccl_global Ray *LightRay_ao_coop, /* Required for AO's shadow blocked */ + Intersection *Intersection_coop_AO, + Intersection *Intersection_coop_DL, + ccl_global char *ray_state, + ccl_global int *Queue_data, /* Queue memory */ + ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ + int queuesize, /* Size (capacity) of each queue */ + int total_num_rays + ) +{ +#if 0 + /* we will make the Queue_index entries '0' in the next kernel */ + if(get_global_id(0) == 0 && get_global_id(1) == 0) { + /* We empty this queue here */ + Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0; + Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0; + } +#endif + + int lidx = get_local_id(1) * get_local_id(0) + get_local_id(0); + + ccl_local unsigned int ao_queue_length; + ccl_local unsigned int dl_queue_length; + if(lidx == 0) { + ao_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS]; + dl_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + /* flag determining if the current ray is to process shadow ray for AO or DL */ + char shadow_blocked_type = -1; + /* flag determining if we need to update L */ + char update_path_radiance = 0; + + int ray_index = QUEUE_EMPTY_SLOT; + int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); + if(thread_index < ao_queue_length + dl_queue_length) { + if(thread_index < ao_queue_length) { + ray_index = get_ray_index(thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, Queue_data, queuesize, 1); + shadow_blocked_type = RAY_SHADOW_RAY_CAST_AO; + } else { + ray_index = get_ray_index(thread_index - ao_queue_length, QUEUE_SHADOW_RAY_CAST_DL_RAYS, Queue_data, queuesize, 1); + shadow_blocked_type = RAY_SHADOW_RAY_CAST_DL; + } + } + + if(ray_index == QUEUE_EMPTY_SLOT) + return; + + if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) { + /* Load kernel global structure */ + KernelGlobals *kg = (KernelGlobals *)globals; + ShaderData *sd_shadow = (ShaderData *)shader_shadow; + + ccl_global PathState *state = &PathState_coop[ray_index]; + ccl_global Ray *light_ray_dl_global = &LightRay_dl_coop[ray_index]; + ccl_global Ray *light_ray_ao_global = &LightRay_ao_coop[ray_index]; + Intersection *isect_ao_global = &Intersection_coop_AO[ray_index]; + Intersection *isect_dl_global = &Intersection_coop_DL[ray_index]; + + ccl_global Ray *light_ray_global = shadow_blocked_type == RAY_SHADOW_RAY_CAST_AO ? light_ray_ao_global : light_ray_dl_global; + Intersection *isect_global = RAY_SHADOW_RAY_CAST_AO ? isect_ao_global : isect_dl_global; + + float3 shadow; + update_path_radiance = !(shadow_blocked(kg, state, light_ray_global, &shadow, sd_shadow, isect_global)); + + /* We use light_ray_global's P and t to store shadow and update_path_radiance */ + light_ray_global->P = shadow; + light_ray_global->t = update_path_radiance; + } +} diff --git a/intern/cycles/kernel/kernel_split.h b/intern/cycles/kernel/kernel_split.h new file mode 100644 index 00000000000..eb386c2e5a7 --- /dev/null +++ b/intern/cycles/kernel/kernel_split.h @@ -0,0 +1,87 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _KERNEL_SPLIT_H_ +#define _KERNEL_SPLIT_H_ + +#include "kernel_compat_opencl.h" +#include "kernel_math.h" +#include "kernel_types.h" +#include "kernel_globals.h" + +/* atomic_add_float function should be defined prior to its usage in kernel_passes.h */ +#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__) +/* Utility functions for float atomics */ +/* float atomics impl credits : http://suhorukov.blogspot.in/2011/12/opencl-11-atomic-operations-on-floating.html */ +ccl_device_inline void atomic_add_float(volatile ccl_global float *source, const float operand) { + union { + unsigned int intVal; + float floatVal; + + } newVal; + union { + unsigned int intVal; + float floatVal; + + } prevVal; + do { + prevVal.floatVal = *source; + newVal.floatVal = prevVal.floatVal + operand; + + } while (atomic_cmpxchg((volatile ccl_global unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal); +} +#endif // __SPLIT_KERNEL__ && __WORK_STEALING__ + +#ifdef __OSL__ +#include "osl_shader.h" +#endif + +#include "kernel_random.h" +#include "kernel_projection.h" +#include "kernel_montecarlo.h" +#include "kernel_differential.h" +#include "kernel_camera.h" + +#include "geom/geom.h" + +#include "kernel_accumulate.h" +#include "kernel_shader.h" +#include "kernel_light.h" +#include "kernel_passes.h" + +#ifdef __SUBSURFACE__ +#include "kernel_subsurface.h" +#endif + +#ifdef __VOLUME__ +#include "kernel_volume.h" +#endif + +#include "kernel_path_state.h" +#include "kernel_shadow.h" +#include "kernel_emission.h" +#include "kernel_path_common.h" +#include "kernel_path_surface.h" +#include "kernel_path_volume.h" + +#ifdef __KERNEL_DEBUG__ +#include "kernel_debug.h" +#endif + +#include "kernel_queues.h" +#include "kernel_work_stealing.h" + +#endif diff --git a/intern/cycles/kernel/kernel_sum_all_radiance.cl b/intern/cycles/kernel/kernel_sum_all_radiance.cl new file mode 100644 index 00000000000..739a85d4cc8 --- /dev/null +++ b/intern/cycles/kernel/kernel_sum_all_radiance.cl @@ -0,0 +1,59 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_compat_opencl.h" +#include "kernel_math.h" +#include "kernel_types.h" +#include "kernel_globals.h" + +/* +* Since we process various samples in parallel; The output radiance of different samples +* are stored in different locations; This kernel combines the output radiance contributed +* by all different samples and stores them in the RenderTile's output buffer. +*/ + +__kernel void kernel_ocl_path_trace_sum_all_radiance( + ccl_constant KernelData *data, /* To get pass_stride to offet into buffer */ + ccl_global float *buffer, /* Output buffer of RenderTile */ + ccl_global float *per_sample_output_buffer, /* Radiance contributed by all samples */ + int parallel_samples, int sw, int sh, int stride, + int buffer_offset_x, + int buffer_offset_y, + int buffer_stride, + int start_sample) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + if(x < sw && y < sh) { + buffer += ((buffer_offset_x + x) + (buffer_offset_y + y) * buffer_stride) * (data->film.pass_stride); + per_sample_output_buffer += ((x + y * stride) * parallel_samples) * (data->film.pass_stride); + + int sample_stride = (data->film.pass_stride); + + int sample_iterator = 0; + int pass_stride_iterator = 0; + int num_floats = data->film.pass_stride; + + for(sample_iterator = 0; sample_iterator < parallel_samples; sample_iterator++) { + for(pass_stride_iterator = 0; pass_stride_iterator < num_floats; pass_stride_iterator++) { + *(buffer + pass_stride_iterator) = (start_sample == 0 && sample_iterator == 0) ? *(per_sample_output_buffer + pass_stride_iterator) + : *(buffer + pass_stride_iterator) + *(per_sample_output_buffer + pass_stride_iterator); + } + per_sample_output_buffer += sample_stride; + } + } +} diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index daa5ec1b9f1..303a78d8ac0 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -24,6 +24,13 @@ #define __KERNEL_CPU__ #endif +/* TODO(sergey): This is only to make it possible to include this header + * from outside of the kernel. but this could be done somewhat cleaner? + */ +#ifndef ccl_addr_space +#define ccl_addr_space +#endif + CCL_NAMESPACE_BEGIN /* constants */ @@ -90,7 +97,19 @@ CCL_NAMESPACE_BEGIN #ifdef __KERNEL_OPENCL_NVIDIA__ #define __KERNEL_SHADING__ -#define __KERNEL_ADV_SHADING__ +/* TODO(sergey): Advanced shading code still requires work + * for split kernel. + */ +# ifndef __SPLIT_KERNEL__ +# define __KERNEL_ADV_SHADING__ +# else +# define __MULTI_CLOSURE__ +# define __TRANSPARENT_SHADOWS__ +# define __PASSES__ +# define __BACKGROUND_MIS__ +# define __LAMP_MIS__ +# define __AO__ +# endif #endif #ifdef __KERNEL_OPENCL_APPLE__ @@ -103,7 +122,7 @@ CCL_NAMESPACE_BEGIN #define __KERNEL_SHADING__ //__KERNEL_ADV_SHADING__ #define __MULTI_CLOSURE__ -#define __TRANSPARENT_SHADOWS__ +//#define __TRANSPARENT_SHADOWS__ #define __PASSES__ #define __BACKGROUND_MIS__ #define __LAMP_MIS__ @@ -117,10 +136,22 @@ CCL_NAMESPACE_BEGIN #ifdef __KERNEL_OPENCL_INTEL_CPU__ #define __CL_USE_NATIVE__ #define __KERNEL_SHADING__ -#define __KERNEL_ADV_SHADING__ +/* TODO(sergey): Advanced shading code still requires work + * for split kernel. + */ +# ifndef __SPLIT_KERNEL__ +# define __KERNEL_ADV_SHADING__ +# else +# define __MULTI_CLOSURE__ +# define __TRANSPARENT_SHADOWS__ +# define __PASSES__ +# define __BACKGROUND_MIS__ +# define __LAMP_MIS__ +# define __AO__ +# endif #endif -#endif +#endif // __KERNEL_OPENCL__ /* kernel features */ #define __SOBOL__ @@ -322,7 +353,7 @@ typedef enum PassType { #ifdef __PASSES__ -typedef struct PathRadiance { +typedef ccl_addr_space struct PathRadiance { int use_light_pass; float3 emission; @@ -374,7 +405,7 @@ typedef struct BsdfEval { #else -typedef float3 PathRadiance; +typedef ccl_addr_space float3 PathRadiance; typedef float3 BsdfEval; #endif @@ -441,9 +472,9 @@ typedef struct differential { typedef struct Ray { float3 P; /* origin */ float3 D; /* direction */ + float t; /* length of the ray */ float time; /* time (for motion blur) */ - #ifdef __RAY_DIFFERENTIALS__ differential3 dP; differential3 dD; @@ -452,7 +483,7 @@ typedef struct Ray { /* Intersection */ -typedef struct Intersection { +typedef ccl_addr_space struct Intersection { float t, u, v; int prim; int object; @@ -537,7 +568,11 @@ typedef enum AttributeStandard { /* Closure data */ #ifdef __MULTI_CLOSURE__ -#define MAX_CLOSURE 64 +# ifndef __MAX_CLOSURE__ +# define MAX_CLOSURE 64 +# else +# define MAX_CLOSURE __MAX_CLOSURE__ +# endif #else #define MAX_CLOSURE 1 #endif @@ -547,7 +582,7 @@ typedef enum AttributeStandard { * does not put own padding trying to align this members. * - We make sure OSL pointer is also 16 bytes aligned. */ -typedef struct ShaderClosure { +typedef ccl_addr_space struct ShaderClosure { float3 weight; float3 N; float3 T; @@ -632,78 +667,23 @@ enum ShaderDataFlag { struct KernelGlobals; -typedef struct ShaderData { - /* position */ - float3 P; - /* smooth normal for shading */ - float3 N; - /* true geometric normal */ - float3 Ng; - /* view/incoming direction */ - float3 I; - /* shader id */ - int shader; - /* booleans describing shader, see ShaderDataFlag */ - int flag; - - /* primitive id if there is one, ~0 otherwise */ - int prim; - - /* combined type and curve segment for hair */ - int type; - - /* parametric coordinates - * - barycentric weights for triangles */ - float u, v; - /* object id if there is one, ~0 otherwise */ - int object; - - /* motion blur sample time */ - float time; - - /* length of the ray being shaded */ - float ray_length; - - /* ray bounce depth */ - int ray_depth; - - /* ray transparent depth */ - int transparent_depth; - -#ifdef __RAY_DIFFERENTIALS__ - /* differential of P. these are orthogonal to Ng, not N */ - differential3 dP; - /* differential of I */ - differential3 dI; - /* differential of u, v */ - differential du; - differential dv; -#endif -#ifdef __DPDU__ - /* differential of P w.r.t. parametric coordinates. note that dPdu is - * not readily suitable as a tangent for shading on triangles. */ - float3 dPdu, dPdv; -#endif - -#ifdef __OBJECT_MOTION__ - /* object <-> world space transformations, cached to avoid - * re-interpolating them constantly for shading */ - Transform ob_tfm; - Transform ob_itfm; +#ifdef __SPLIT_KERNEL__ +#define SD_VAR(type, what) ccl_global type *what; +#define SD_CLOSURE_VAR(type, what, max_closure) type *what; +#define TIDX (get_global_id(1) * get_global_size(0) + get_global_id(0)) +#define ccl_fetch(s, t) (s->t[TIDX]) +#define ccl_fetch_array(s, t, index) (&s->t[TIDX * MAX_CLOSURE + index]) +#else +#define SD_VAR(type, what) type what; +#define SD_CLOSURE_VAR(type, what, max_closure) type what[max_closure]; +#define ccl_fetch(s, t) (s->t) +#define ccl_fetch_array(s, t, index) (&s->t[index]) #endif - /* Closure data, we store a fixed array of closures */ - ShaderClosure closure[MAX_CLOSURE]; - int num_closure; - float randb_closure; +typedef ccl_addr_space struct ShaderData { - /* ray start position, only set for backgrounds */ - float3 ray_P; - differential3 ray_dP; +#include "kernel_shaderdata_vars.h" -#ifdef __OSL__ - struct KernelGlobals *osl_globals; -#endif } ShaderData; /* Path State */ @@ -996,13 +976,62 @@ typedef struct KernelData { } KernelData; #ifdef __KERNEL_DEBUG__ -typedef struct DebugData { +typedef ccl_addr_space struct DebugData { // Total number of BVH node traversal steps and primitives intersections // for the camera rays. int num_bvh_traversal_steps; } DebugData; #endif +/* Declarations required for split kernel */ + +/* Macro for queues */ +/* Value marking queue's empty slot */ +#define QUEUE_EMPTY_SLOT -1 + +/* +* Queue 1 - Active rays +* Queue 2 - Background queue +* Queue 3 - Shadow ray cast kernel - AO +* Queeu 4 - Shadow ray cast kernel - direct lighting +*/ +#define NUM_QUEUES 4 + +/* Queue names */ +enum QueueNumber { + QUEUE_ACTIVE_AND_REGENERATED_RAYS, /* All active rays and regenerated rays are enqueued here */ + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, /* All + * 1.Background-hit rays, + * 2.Rays that has exited path-iteration but needs to update output buffer + * 3.Rays to be regenerated + * are enqueued here */ + QUEUE_SHADOW_RAY_CAST_AO_RAYS, /* All rays for which a shadow ray should be cast to determine radiance + contribution for AO are enqueued here */ + QUEUE_SHADOW_RAY_CAST_DL_RAYS, /* All rays for which a shadow ray should be cast to determine radiance + contributuin for direct lighting are enqueued here */ +}; + +/* We use RAY_STATE_MASK to get ray_state (enums 0 to 5) */ +#define RAY_STATE_MASK 0x007 +#define RAY_FLAG_MASK 0x0F8 +enum RayState { + RAY_ACTIVE = 0, // Denotes ray is actively involved in path-iteration + RAY_INACTIVE = 1, // Denotes ray has completed processing all samples and is inactive + RAY_UPDATE_BUFFER = 2, // Denoted ray has exited path-iteration and needs to update output buffer + RAY_HIT_BACKGROUND = 3, // Donotes ray has hit background + RAY_TO_REGENERATE = 4, // Denotes ray has to be regenerated + RAY_REGENERATED = 5, // Denotes ray has been regenerated + RAY_SKIP_DL = 6, // Denotes ray should skip direct lighting + RAY_SHADOW_RAY_CAST_AO = 16, // Flag's ray has to execute shadow blocked function in AO part + RAY_SHADOW_RAY_CAST_DL = 32 // Flag's ray has to execute shadow blocked function in direct lighting part +}; + +#define ASSIGN_RAY_STATE(ray_state, ray_index, state) (ray_state[ray_index] = ((ray_state[ray_index] & RAY_FLAG_MASK) | state)) +#define IS_STATE(ray_state, ray_index, state) ((ray_state[ray_index] & RAY_STATE_MASK) == state) +#define ADD_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] | flag)) +#define REMOVE_RAY_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] = (ray_state[ray_index] & (~flag))) +#define IS_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] & flag) + CCL_NAMESPACE_END #endif /* __KERNEL_TYPES_H__ */ diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h new file mode 100644 index 00000000000..9b83d972e97 --- /dev/null +++ b/intern/cycles/kernel/kernel_work_stealing.h @@ -0,0 +1,193 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __KERNEL_WORK_STEALING_H__ +#define __KERNEL_WORK_STEALING_H__ + +/* + * Utility functions for work stealing + */ + +#ifdef __WORK_STEALING__ + +#ifdef __KERNEL_OPENCL__ +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable +#endif + +uint get_group_id_with_ray_index(uint ray_index, + uint tile_dim_x, + uint tile_dim_y, + uint parallel_samples, + int dim) +{ + if(dim == 0) { + uint x_span = ray_index % (tile_dim_x * parallel_samples); + return x_span / get_local_size(0); + } + else /*if(dim == 1)*/ { + kernel_assert(dim == 1); + uint y_span = ray_index / (tile_dim_x * parallel_samples); + return y_span / get_local_size(1); + } +} + +uint get_total_work(uint tile_dim_x, + uint tile_dim_y, + uint grp_idx, + uint grp_idy, + uint num_samples) +{ + uint threads_within_tile_border_x = + (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0) + : get_local_size(0); + uint threads_within_tile_border_y = + (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1) + : get_local_size(1); + + threads_within_tile_border_x = + (threads_within_tile_border_x == 0) ? get_local_size(0) + : threads_within_tile_border_x; + threads_within_tile_border_y = + (threads_within_tile_border_y == 0) ? get_local_size(1) + : threads_within_tile_border_y; + + return threads_within_tile_border_x * + threads_within_tile_border_y * + num_samples; +} + +/* Returns 0 in case there is no next work available */ +/* Returns 1 in case work assigned is valid */ +int get_next_work(ccl_global uint *work_pool, + ccl_private uint *my_work, + uint tile_dim_x, + uint tile_dim_y, + uint num_samples, + uint parallel_samples, + uint ray_index) +{ + uint grp_idx = get_group_id_with_ray_index(ray_index, + tile_dim_x, + tile_dim_y, + parallel_samples, + 0); + uint grp_idy = get_group_id_with_ray_index(ray_index, + tile_dim_x, + tile_dim_y, + parallel_samples, + 1); + uint total_work = get_total_work(tile_dim_x, + tile_dim_y, + grp_idx, + grp_idy, + num_samples); + uint group_index = grp_idy * get_num_groups(0) + grp_idx; + *my_work = atomic_inc(&work_pool[group_index]); + return (*my_work < total_work) ? 1 : 0; +} + +/* This function assumes that the passed my_work is valid. */ +/* Decode sample number w.r.t. assigned my_work. */ +uint get_my_sample(uint my_work, + uint tile_dim_x, + uint tile_dim_y, + uint parallel_samples, + uint ray_index) +{ + uint grp_idx = get_group_id_with_ray_index(ray_index, + tile_dim_x, + tile_dim_y, + parallel_samples, + 0); + uint grp_idy = get_group_id_with_ray_index(ray_index, + tile_dim_x, + tile_dim_y, + parallel_samples, + 1); + uint threads_within_tile_border_x = + (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0) + : get_local_size(0); + uint threads_within_tile_border_y = + (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1) + : get_local_size(1); + + threads_within_tile_border_x = + (threads_within_tile_border_x == 0) ? get_local_size(0) + : threads_within_tile_border_x; + threads_within_tile_border_y = + (threads_within_tile_border_y == 0) ? get_local_size(1) + : threads_within_tile_border_y; + + return my_work / + (threads_within_tile_border_x * threads_within_tile_border_y); +} + +/* Decode pixel and tile position w.r.t. assigned my_work. */ +void get_pixel_tile_position(ccl_private uint *pixel_x, + ccl_private uint *pixel_y, + ccl_private uint *tile_x, + ccl_private uint *tile_y, + uint my_work, + uint tile_dim_x, + uint tile_dim_y, + uint tile_offset_x, + uint tile_offset_y, + uint parallel_samples, + uint ray_index) +{ + uint grp_idx = get_group_id_with_ray_index(ray_index, + tile_dim_x, + tile_dim_y, + parallel_samples, + 0); + uint grp_idy = get_group_id_with_ray_index(ray_index, + tile_dim_x, + tile_dim_y, + parallel_samples, + 1); + uint threads_within_tile_border_x = + (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0) + : get_local_size(0); + uint threads_within_tile_border_y = + (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1) + : get_local_size(1); + + threads_within_tile_border_x = + (threads_within_tile_border_x == 0) ? get_local_size(0) + : threads_within_tile_border_x; + threads_within_tile_border_y = + (threads_within_tile_border_y == 0) ? get_local_size(1) + : threads_within_tile_border_y; + + uint total_associated_pixels = + threads_within_tile_border_x * threads_within_tile_border_y; + uint work_group_pixel_index = my_work % total_associated_pixels; + uint work_group_pixel_x = + work_group_pixel_index % threads_within_tile_border_x; + uint work_group_pixel_y = + work_group_pixel_index / threads_within_tile_border_x; + + *pixel_x = + tile_offset_x + (grp_idx * get_local_size(0)) + work_group_pixel_x; + *pixel_y = + tile_offset_y + (grp_idy * get_local_size(1)) + work_group_pixel_y; + *tile_x = *pixel_x - tile_offset_x; + *tile_y = *pixel_y - tile_offset_y; +} + +#endif /* __WORK_STEALING__ */ + +#endif /* __KERNEL_WORK_STEALING_H__ */ diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h index b1561963e5d..e5e25eb6ca8 100644 --- a/intern/cycles/kernel/svm/svm.h +++ b/intern/cycles/kernel/svm/svm.h @@ -189,7 +189,7 @@ CCL_NAMESPACE_BEGIN ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ShaderType type, int path_flag) { float stack[SVM_STACK_SIZE]; - int offset = sd->shader & SHADER_MASK; + int offset = ccl_fetch(sd, shader) & SHADER_MASK; while(1) { uint4 node = read_node(kg, &offset); diff --git a/intern/cycles/kernel/svm/svm_attribute.h b/intern/cycles/kernel/svm/svm_attribute.h index b63978b6e1f..025ae96f59d 100644 --- a/intern/cycles/kernel/svm/svm_attribute.h +++ b/intern/cycles/kernel/svm/svm_attribute.h @@ -22,12 +22,12 @@ ccl_device void svm_node_attr_init(KernelGlobals *kg, ShaderData *sd, uint4 node, NodeAttributeType *type, NodeAttributeType *mesh_type, AttributeElement *elem, int *offset, uint *out_offset) { - if(sd->object != OBJECT_NONE) { + if(ccl_fetch(sd, object) != OBJECT_NONE) { /* find attribute by unique id */ uint id = node.y; - uint attr_offset = sd->object*kernel_data.bvh.attributes_map_stride; + uint attr_offset = ccl_fetch(sd, object)*kernel_data.bvh.attributes_map_stride; #ifdef __HAIR__ - attr_offset = (sd->type & PRIMITIVE_ALL_CURVE)? attr_offset + ATTR_PRIM_CURVE: attr_offset; + attr_offset = (ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)? attr_offset + ATTR_PRIM_CURVE: attr_offset; #endif uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset); diff --git a/intern/cycles/kernel/svm/svm_camera.h b/intern/cycles/kernel/svm/svm_camera.h index 90249dfd978..00678a49d70 100644 --- a/intern/cycles/kernel/svm/svm_camera.h +++ b/intern/cycles/kernel/svm/svm_camera.h @@ -23,7 +23,7 @@ ccl_device void svm_node_camera(KernelGlobals *kg, ShaderData *sd, float *stack, float3 vector; Transform tfm = kernel_data.cam.worldtocamera; - vector = transform_point(&tfm, sd->P); + vector = transform_point(&tfm, ccl_fetch(sd, P)); zdepth = vector.z; distance = len(vector); diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h index 0d2d155f827..7cdcbc2d30c 100644 --- a/intern/cycles/kernel/svm/svm_closure.h +++ b/intern/cycles/kernel/svm/svm_closure.h @@ -25,12 +25,12 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type sc->data0 = eta; sc->data1 = 0.0f; sc->data2 = 0.0f; - sd->flag |= bsdf_refraction_setup(sc); + ccl_fetch(sd, flag) |= bsdf_refraction_setup(sc); } else { sc->data0 = 0.0f; sc->data1 = 0.0f; - sd->flag |= bsdf_reflection_setup(sc); + ccl_fetch(sd, flag) |= bsdf_reflection_setup(sc); } } else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID) { @@ -39,9 +39,9 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type sc->data2 = eta; if(refract) - sd->flag |= bsdf_microfacet_beckmann_refraction_setup(sc); + ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(sc); else - sd->flag |= bsdf_microfacet_beckmann_setup(sc); + ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(sc); } else { sc->data0 = roughness; @@ -49,23 +49,23 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, ShaderClosure *sc, int type sc->data2 = eta; if(refract) - sd->flag |= bsdf_microfacet_ggx_refraction_setup(sc); + ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(sc); else - sd->flag |= bsdf_microfacet_ggx_setup(sc); + ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(sc); } } ccl_device_inline ShaderClosure *svm_node_closure_get_non_bsdf(ShaderData *sd, ClosureType type, float mix_weight) { - ShaderClosure *sc = &sd->closure[sd->num_closure]; + ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure)); - if(sd->num_closure < MAX_CLOSURE) { + if(ccl_fetch(sd, num_closure) < MAX_CLOSURE) { sc->weight *= mix_weight; sc->type = type; #ifdef __OSL__ sc->prim = NULL; #endif - sd->num_closure++; + ccl_fetch(sd, num_closure)++; return sc; } @@ -74,14 +74,15 @@ ccl_device_inline ShaderClosure *svm_node_closure_get_non_bsdf(ShaderData *sd, C ccl_device_inline ShaderClosure *svm_node_closure_get_bsdf(ShaderData *sd, float mix_weight) { - ShaderClosure *sc = &sd->closure[sd->num_closure]; + ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure)); + float3 weight = sc->weight * mix_weight; float sample_weight = fabsf(average(weight)); - if(sample_weight > CLOSURE_WEIGHT_CUTOFF && sd->num_closure < MAX_CLOSURE) { + if(sample_weight > CLOSURE_WEIGHT_CUTOFF && ccl_fetch(sd, num_closure) < MAX_CLOSURE) { sc->weight = weight; sc->sample_weight = sample_weight; - sd->num_closure++; + ccl_fetch(sd, num_closure)++; #ifdef __OSL__ sc->prim = NULL; #endif @@ -93,14 +94,15 @@ ccl_device_inline ShaderClosure *svm_node_closure_get_bsdf(ShaderData *sd, float ccl_device_inline ShaderClosure *svm_node_closure_get_absorption(ShaderData *sd, float mix_weight) { - ShaderClosure *sc = &sd->closure[sd->num_closure]; + ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure)); + float3 weight = (make_float3(1.0f, 1.0f, 1.0f) - sc->weight) * mix_weight; float sample_weight = fabsf(average(weight)); - if(sample_weight > CLOSURE_WEIGHT_CUTOFF && sd->num_closure < MAX_CLOSURE) { + if(sample_weight > CLOSURE_WEIGHT_CUTOFF && ccl_fetch(sd, num_closure) < MAX_CLOSURE) { sc->weight = weight; sc->sample_weight = sample_weight; - sd->num_closure++; + ccl_fetch(sd, num_closure)++; #ifdef __OSL__ sc->prim = NULL; #endif @@ -124,7 +126,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(mix_weight == 0.0f) return; - float3 N = stack_valid(data_node.x)? stack_load_float3(stack, data_node.x): sd->N; + float3 N = stack_valid(data_node.x)? stack_load_float3(stack, data_node.x): ccl_fetch(sd, N); float param1 = (stack_valid(param1_offset))? stack_load_float(stack, param1_offset): __uint_as_float(node.z); float param2 = (stack_valid(param2_offset))? stack_load_float(stack, param2_offset): __uint_as_float(node.w); @@ -142,13 +144,13 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->data0 = 0.0f; sc->data1 = 0.0f; sc->data2 = 0.0f; - sd->flag |= bsdf_diffuse_setup(sc); + ccl_fetch(sd, flag) |= bsdf_diffuse_setup(sc); } else { sc->data0 = roughness; sc->data1 = 0.0f; sc->data2 = 0.0f; - sd->flag |= bsdf_oren_nayar_setup(sc); + ccl_fetch(sd, flag) |= bsdf_oren_nayar_setup(sc); } } break; @@ -161,7 +163,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->data1 = 0.0f; sc->data2 = 0.0f; sc->N = N; - sd->flag |= bsdf_translucent_setup(sc); + ccl_fetch(sd, flag) |= bsdf_translucent_setup(sc); } break; } @@ -173,7 +175,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->data1 = 0.0f; sc->data2 = 0.0f; sc->N = N; - sd->flag |= bsdf_transparent_setup(sc); + ccl_fetch(sd, flag) |= bsdf_transparent_setup(sc); } break; } @@ -195,13 +197,13 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * /* setup bsdf */ if(type == CLOSURE_BSDF_REFLECTION_ID) - sd->flag |= bsdf_reflection_setup(sc); + ccl_fetch(sd, flag) |= bsdf_reflection_setup(sc); else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ID) - sd->flag |= bsdf_microfacet_beckmann_setup(sc); + ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(sc); else if(type == CLOSURE_BSDF_MICROFACET_GGX_ID) - sd->flag |= bsdf_microfacet_ggx_setup(sc); + ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(sc); else - sd->flag |= bsdf_ashikhmin_shirley_setup(sc); + ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_setup(sc); } break; @@ -219,7 +221,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->N = N; float eta = fmaxf(param2, 1e-5f); - eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta; + eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta; /* setup bsdf */ if(type == CLOSURE_BSDF_REFRACTION_ID) { @@ -227,7 +229,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->data1 = 0.0f; sc->data2 = 0.0f; - sd->flag |= bsdf_refraction_setup(sc); + ccl_fetch(sd, flag) |= bsdf_refraction_setup(sc); } else { sc->data0 = param1; @@ -235,9 +237,9 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->data2 = eta; if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID) - sd->flag |= bsdf_microfacet_beckmann_refraction_setup(sc); + ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(sc); else - sd->flag |= bsdf_microfacet_ggx_refraction_setup(sc); + ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(sc); } } @@ -254,15 +256,15 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * #endif /* index of refraction */ float eta = fmaxf(param2, 1e-5f); - eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta; + eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta; /* fresnel */ - float cosNO = dot(N, sd->I); + float cosNO = dot(N, ccl_fetch(sd, I)); float fresnel = fresnel_dielectric_cos(cosNO, eta); float roughness = param1; /* reflection */ - ShaderClosure *sc = &sd->closure[sd->num_closure]; + ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure)); float3 weight = sc->weight; float sample_weight = sc->sample_weight; @@ -283,7 +285,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * #endif /* refraction */ - sc = &sd->closure[sd->num_closure]; + sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure)); sc->weight = weight; sc->sample_weight = sample_weight; @@ -332,11 +334,11 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->data2 = 0.0f; if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID) - sd->flag |= bsdf_microfacet_beckmann_aniso_setup(sc); + ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_aniso_setup(sc); else if(type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID) - sd->flag |= bsdf_microfacet_ggx_aniso_setup(sc); + ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_aniso_setup(sc); else - sd->flag |= bsdf_ashikhmin_shirley_aniso_setup(sc); + ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_aniso_setup(sc); } break; } @@ -350,7 +352,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->data0 = saturate(param1); sc->data1 = 0.0f; sc->data2 = 0.0f; - sd->flag |= bsdf_ashikhmin_velvet_setup(sc); + ccl_fetch(sd, flag) |= bsdf_ashikhmin_velvet_setup(sc); } break; } @@ -366,9 +368,9 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->data2 = 0.0f; if(type == CLOSURE_BSDF_DIFFUSE_TOON_ID) - sd->flag |= bsdf_diffuse_toon_setup(sc); + ccl_fetch(sd, flag) |= bsdf_diffuse_toon_setup(sc); else - sd->flag |= bsdf_glossy_toon_setup(sc); + ccl_fetch(sd, flag) |= bsdf_glossy_toon_setup(sc); } break; } @@ -376,7 +378,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * case CLOSURE_BSDF_HAIR_REFLECTION_ID: case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: { - if(sd->flag & SD_BACKFACING && sd->type & PRIMITIVE_ALL_CURVE) { + if(ccl_fetch(sd, flag) & SD_BACKFACING && ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { ShaderClosure *sc = svm_node_closure_get_bsdf(sd, mix_weight); if(sc) { @@ -389,11 +391,11 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->N = N; sc->data0 = 0.0f; sc->data1 = 0.0f; - sd->flag |= bsdf_transparent_setup(sc); + ccl_fetch(sd, flag) |= bsdf_transparent_setup(sc); } } else { - ShaderClosure *sc = &sd->closure[sd->num_closure]; + ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure)); sc = svm_node_closure_get_bsdf(sd, mix_weight); if(sc) { @@ -402,18 +404,18 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->data1 = param2; sc->data2 = -stack_load_float(stack, data_node.z); - if(!(sd->type & PRIMITIVE_ALL_CURVE)) { - sc->T = normalize(sd->dPdv); + if(!(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)) { + sc->T = normalize(ccl_fetch(sd, dPdv)); sc->data2 = 0.0f; } else - sc->T = normalize(sd->dPdu); + sc->T = normalize(ccl_fetch(sd, dPdu)); if(type == CLOSURE_BSDF_HAIR_REFLECTION_ID) { - sd->flag |= bsdf_hair_reflection_setup(sc); + ccl_fetch(sd, flag) |= bsdf_hair_reflection_setup(sc); } else { - sd->flag |= bsdf_hair_transmission_setup(sc); + ccl_fetch(sd, flag) |= bsdf_hair_transmission_setup(sc); } } } @@ -423,9 +425,14 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * #endif #ifdef __SUBSURFACE__ +#ifndef __SPLIT_KERNEL__ +# define sc_next(sc) sc++ +# else +# define sc_next(sc) sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure)) +# endif case CLOSURE_BSSRDF_CUBIC_ID: case CLOSURE_BSSRDF_GAUSSIAN_ID: { - ShaderClosure *sc = &sd->closure[sd->num_closure]; + ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure)); float3 weight = sc->weight * mix_weight; float sample_weight = fabsf(average(weight)); @@ -435,7 +442,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(path_flag & PATH_RAY_DIFFUSE_ANCESTOR) param1 = 0.0f; - if(sample_weight > CLOSURE_WEIGHT_CUTOFF && sd->num_closure+2 < MAX_CLOSURE) { + if(sample_weight > CLOSURE_WEIGHT_CUTOFF && ccl_fetch(sd, num_closure)+2 < MAX_CLOSURE) { /* radius * scale */ float3 radius = stack_load_float3(stack, data_node.z)*param1; /* sharpness */ @@ -455,10 +462,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->prim = NULL; #endif sc->N = N; - sd->flag |= bssrdf_setup(sc, (ClosureType)type); + ccl_fetch(sd, flag) |= bssrdf_setup(sc, (ClosureType)type); - sd->num_closure++; - sc++; + ccl_fetch(sd, num_closure)++; + sc_next(sc); } if(fabsf(weight.y) > 0.0f) { @@ -472,10 +479,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->prim = NULL; #endif sc->N = N; - sd->flag |= bssrdf_setup(sc, (ClosureType)type); + ccl_fetch(sd, flag) |= bssrdf_setup(sc, (ClosureType)type); - sd->num_closure++; - sc++; + ccl_fetch(sd, num_closure)++; + sc_next(sc); } if(fabsf(weight.z) > 0.0f) { @@ -489,15 +496,16 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * sc->prim = NULL; #endif sc->N = N; - sd->flag |= bssrdf_setup(sc, (ClosureType)type); + ccl_fetch(sd, flag) |= bssrdf_setup(sc, (ClosureType)type); - sd->num_closure++; - sc++; + ccl_fetch(sd, num_closure)++; + sc_next(sc); } } break; } +# undef sc_next #endif default: break; @@ -525,7 +533,7 @@ ccl_device void svm_node_closure_volume(KernelGlobals *kg, ShaderData *sd, float ShaderClosure *sc = svm_node_closure_get_absorption(sd, mix_weight * density); if(sc) { - sd->flag |= volume_absorption_setup(sc); + ccl_fetch(sd, flag) |= volume_absorption_setup(sc); } break; } @@ -535,7 +543,7 @@ ccl_device void svm_node_closure_volume(KernelGlobals *kg, ShaderData *sd, float if(sc) { sc->data0 = param2; /* g */ sc->data1 = 0.0f; - sd->flag |= volume_henyey_greenstein_setup(sc); + ccl_fetch(sd, flag) |= volume_henyey_greenstein_setup(sc); } break; } @@ -560,7 +568,7 @@ ccl_device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 no else svm_node_closure_get_non_bsdf(sd, CLOSURE_EMISSION_ID, 1.0f); - sd->flag |= SD_EMISSION; + ccl_fetch(sd, flag) |= SD_EMISSION; } ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4 node) @@ -594,7 +602,7 @@ ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 nod else svm_node_closure_get_non_bsdf(sd, CLOSURE_HOLDOUT_ID, 1.0f); - sd->flag |= SD_HOLDOUT; + ccl_fetch(sd, flag) |= SD_HOLDOUT; } ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack, uint4 node) @@ -612,15 +620,17 @@ ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack, else svm_node_closure_get_non_bsdf(sd, CLOSURE_AMBIENT_OCCLUSION_ID, 1.0f); - sd->flag |= SD_AO; + ccl_fetch(sd, flag) |= SD_AO; } /* Closure Nodes */ ccl_device_inline void svm_node_closure_store_weight(ShaderData *sd, float3 weight) { - if(sd->num_closure < MAX_CLOSURE) - sd->closure[sd->num_closure].weight = weight; + if(ccl_fetch(sd, num_closure) < MAX_CLOSURE) { + ShaderClosure *sc = ccl_fetch_array(sd, closure, ccl_fetch(sd, num_closure)); + sc->weight = weight; + } } ccl_device void svm_node_closure_set_weight(ShaderData *sd, uint r, uint g, uint b) @@ -670,7 +680,7 @@ ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node) ccl_device void svm_node_set_normal(KernelGlobals *kg, ShaderData *sd, float *stack, uint in_direction, uint out_normal) { float3 normal = stack_load_float3(stack, in_direction); - sd->N = normal; + ccl_fetch(sd, N) = normal; stack_store_float3(stack, out_normal, normal); } diff --git a/intern/cycles/kernel/svm/svm_displace.h b/intern/cycles/kernel/svm/svm_displace.h index 4a058905a93..8d4b07c9973 100644 --- a/intern/cycles/kernel/svm/svm_displace.h +++ b/intern/cycles/kernel/svm/svm_displace.h @@ -25,11 +25,11 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac uint normal_offset, distance_offset, invert; decode_node_uchar4(node.y, &normal_offset, &distance_offset, &invert, NULL); - float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): sd->N; + float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N); /* get surface tangents from normal */ - float3 Rx = cross(sd->dP.dy, normal_in); - float3 Ry = cross(normal_in, sd->dP.dx); + float3 Rx = cross(ccl_fetch(sd, dP).dy, normal_in); + float3 Ry = cross(normal_in, ccl_fetch(sd, dP).dx); /* get bump values */ uint c_offset, x_offset, y_offset, strength_offset; @@ -40,7 +40,7 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac float h_y = stack_load_float(stack, y_offset); /* compute surface gradient and determinant */ - float det = dot(sd->dP.dx, Rx); + float det = dot(ccl_fetch(sd, dP).dx, Rx); float3 surfgrad = (h_x - h_c)*Rx + (h_y - h_c)*Ry; float absdet = fabsf(det); @@ -65,7 +65,7 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac ccl_device void svm_node_set_displacement(ShaderData *sd, float *stack, uint fac_offset) { float d = stack_load_float(stack, fac_offset); - sd->P += sd->N*d*0.1f; /* todo: get rid of this factor */ + ccl_fetch(sd, P) += ccl_fetch(sd, N)*d*0.1f; /* todo: get rid of this factor */ } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_fresnel.h b/intern/cycles/kernel/svm/svm_fresnel.h index 3703ec55015..23c97d80cb0 100644 --- a/intern/cycles/kernel/svm/svm_fresnel.h +++ b/intern/cycles/kernel/svm/svm_fresnel.h @@ -23,12 +23,12 @@ ccl_device void svm_node_fresnel(ShaderData *sd, float *stack, uint ior_offset, uint normal_offset, out_offset; decode_node_uchar4(node, &normal_offset, &out_offset, NULL, NULL); float eta = (stack_valid(ior_offset))? stack_load_float(stack, ior_offset): __uint_as_float(ior_value); - float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): sd->N; + float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N); eta = fmaxf(eta, 1e-5f); - eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta; + eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta; - float f = fresnel_dielectric_cos(dot(sd->I, normal_in), eta); + float f = fresnel_dielectric_cos(dot(ccl_fetch(sd, I), normal_in), eta); stack_store_float(stack, out_offset, f); } @@ -44,18 +44,18 @@ ccl_device void svm_node_layer_weight(ShaderData *sd, float *stack, uint4 node) decode_node_uchar4(node.w, &type, &normal_offset, &out_offset, NULL); float blend = (stack_valid(blend_offset))? stack_load_float(stack, blend_offset): __uint_as_float(blend_value); - float3 normal_in = (stack_valid(normal_offset))? stack_load_float3(stack, normal_offset): sd->N; + float3 normal_in = (stack_valid(normal_offset))? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N); float f; if(type == NODE_LAYER_WEIGHT_FRESNEL) { float eta = fmaxf(1.0f - blend, 1e-5f); - eta = (sd->flag & SD_BACKFACING)? eta: 1.0f/eta; + eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? eta: 1.0f/eta; - f = fresnel_dielectric_cos(dot(sd->I, normal_in), eta); + f = fresnel_dielectric_cos(dot(ccl_fetch(sd, I), normal_in), eta); } else { - f = fabsf(dot(sd->I, normal_in)); + f = fabsf(dot(ccl_fetch(sd, I), normal_in)); if(blend != 0.5f) { blend = clamp(blend, 0.0f, 1.0f-1e-5f); diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h index efbefa77d28..bb06254c3a9 100644 --- a/intern/cycles/kernel/svm/svm_geometry.h +++ b/intern/cycles/kernel/svm/svm_geometry.h @@ -23,15 +23,15 @@ ccl_device void svm_node_geometry(KernelGlobals *kg, ShaderData *sd, float *stac float3 data; switch(type) { - case NODE_GEOM_P: data = sd->P; break; - case NODE_GEOM_N: data = sd->N; break; + case NODE_GEOM_P: data = ccl_fetch(sd, P); break; + case NODE_GEOM_N: data = ccl_fetch(sd, N); break; #ifdef __DPDU__ case NODE_GEOM_T: data = primitive_tangent(kg, sd); break; #endif - case NODE_GEOM_I: data = sd->I; break; - case NODE_GEOM_Ng: data = sd->Ng; break; + case NODE_GEOM_I: data = ccl_fetch(sd, I); break; + case NODE_GEOM_Ng: data = ccl_fetch(sd, Ng); break; #ifdef __UV__ - case NODE_GEOM_uv: data = make_float3(sd->u, sd->v, 0.0f); break; + case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u), ccl_fetch(sd, v), 0.0f); break; #endif } @@ -44,8 +44,8 @@ ccl_device void svm_node_geometry_bump_dx(KernelGlobals *kg, ShaderData *sd, flo float3 data; switch(type) { - case NODE_GEOM_P: data = sd->P + sd->dP.dx; break; - case NODE_GEOM_uv: data = make_float3(sd->u + sd->du.dx, sd->v + sd->dv.dx, 0.0f); break; + case NODE_GEOM_P: data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx; break; + case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u) + ccl_fetch(sd, du).dx, ccl_fetch(sd, v) + ccl_fetch(sd, dv).dx, 0.0f); break; default: svm_node_geometry(kg, sd, stack, type, out_offset); return; } @@ -61,8 +61,8 @@ ccl_device void svm_node_geometry_bump_dy(KernelGlobals *kg, ShaderData *sd, flo float3 data; switch(type) { - case NODE_GEOM_P: data = sd->P + sd->dP.dy; break; - case NODE_GEOM_uv: data = make_float3(sd->u + sd->du.dy, sd->v + sd->dv.dy, 0.0f); break; + case NODE_GEOM_P: data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy; break; + case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u) + ccl_fetch(sd, du).dy, ccl_fetch(sd, v) + ccl_fetch(sd, dv).dy, 0.0f); break; default: svm_node_geometry(kg, sd, stack, type, out_offset); return; } @@ -83,9 +83,9 @@ ccl_device void svm_node_object_info(KernelGlobals *kg, ShaderData *sd, float *s stack_store_float3(stack, out_offset, object_location(kg, sd)); return; } - case NODE_INFO_OB_INDEX: data = object_pass_id(kg, sd->object); break; + case NODE_INFO_OB_INDEX: data = object_pass_id(kg, ccl_fetch(sd, object)); break; case NODE_INFO_MAT_INDEX: data = shader_pass_id(kg, sd); break; - case NODE_INFO_OB_RANDOM: data = object_random_number(kg, sd->object); break; + case NODE_INFO_OB_RANDOM: data = object_random_number(kg, ccl_fetch(sd, object)); break; default: data = 0.0f; break; } @@ -98,44 +98,44 @@ ccl_device void svm_node_particle_info(KernelGlobals *kg, ShaderData *sd, float { switch(type) { case NODE_INFO_PAR_INDEX: { - int particle_id = object_particle_id(kg, sd->object); + int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); stack_store_float(stack, out_offset, particle_index(kg, particle_id)); break; } case NODE_INFO_PAR_AGE: { - int particle_id = object_particle_id(kg, sd->object); + int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); stack_store_float(stack, out_offset, particle_age(kg, particle_id)); break; } case NODE_INFO_PAR_LIFETIME: { - int particle_id = object_particle_id(kg, sd->object); + int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); stack_store_float(stack, out_offset, particle_lifetime(kg, particle_id)); break; } case NODE_INFO_PAR_LOCATION: { - int particle_id = object_particle_id(kg, sd->object); + int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); stack_store_float3(stack, out_offset, particle_location(kg, particle_id)); break; } #if 0 /* XXX float4 currently not supported in SVM stack */ case NODE_INFO_PAR_ROTATION: { - int particle_id = object_particle_id(kg, sd->object); + int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); stack_store_float4(stack, out_offset, particle_rotation(kg, particle_id)); break; } #endif case NODE_INFO_PAR_SIZE: { - int particle_id = object_particle_id(kg, sd->object); + int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); stack_store_float(stack, out_offset, particle_size(kg, particle_id)); break; } case NODE_INFO_PAR_VELOCITY: { - int particle_id = object_particle_id(kg, sd->object); + int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); stack_store_float3(stack, out_offset, particle_velocity(kg, particle_id)); break; } case NODE_INFO_PAR_ANGULAR_VELOCITY: { - int particle_id = object_particle_id(kg, sd->object); + int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); stack_store_float3(stack, out_offset, particle_angular_velocity(kg, particle_id)); break; } @@ -153,7 +153,7 @@ ccl_device void svm_node_hair_info(KernelGlobals *kg, ShaderData *sd, float *sta switch(type) { case NODE_INFO_CURVE_IS_STRAND: { - data = (sd->type & PRIMITIVE_ALL_CURVE) != 0; + data = (ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) != 0; stack_store_float(stack, out_offset, data); break; } @@ -165,7 +165,7 @@ ccl_device void svm_node_hair_info(KernelGlobals *kg, ShaderData *sd, float *sta break; } /*case NODE_INFO_CURVE_FADE: { - data = sd->curve_transparency; + data = ccl_fetch(sd, curve_transparency); stack_store_float(stack, out_offset, data); break; }*/ diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h index 08a6c01162c..caf0b37ba35 100644 --- a/intern/cycles/kernel/svm/svm_image.h +++ b/intern/cycles/kernel/svm/svm_image.h @@ -392,10 +392,10 @@ ccl_device void svm_node_tex_image(KernelGlobals *kg, ShaderData *sd, float *sta ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node) { /* get object space normal */ - float3 N = sd->N; + float3 N = ccl_fetch(sd, N); - N = sd->N; - if(sd->object != OBJECT_NONE) + N = ccl_fetch(sd, N); + if(ccl_fetch(sd, object) != OBJECT_NONE) object_inverse_normal_transform(kg, sd, &N); /* project from direction vector to barycentric coordinates in triangles */ diff --git a/intern/cycles/kernel/svm/svm_light_path.h b/intern/cycles/kernel/svm/svm_light_path.h index ffadafb1d0c..a235dd35224 100644 --- a/intern/cycles/kernel/svm/svm_light_path.h +++ b/intern/cycles/kernel/svm/svm_light_path.h @@ -31,10 +31,10 @@ ccl_device void svm_node_light_path(ShaderData *sd, float *stack, uint type, uin case NODE_LP_reflection: info = (path_flag & PATH_RAY_REFLECT)? 1.0f: 0.0f; break; case NODE_LP_transmission: info = (path_flag & PATH_RAY_TRANSMIT)? 1.0f: 0.0f; break; case NODE_LP_volume_scatter: info = (path_flag & PATH_RAY_VOLUME_SCATTER)? 1.0f: 0.0f; break; - case NODE_LP_backfacing: info = (sd->flag & SD_BACKFACING)? 1.0f: 0.0f; break; - case NODE_LP_ray_length: info = sd->ray_length; break; - case NODE_LP_ray_depth: info = (float)sd->ray_depth; break; - case NODE_LP_ray_transparent: info = (float)sd->transparent_depth; break; + case NODE_LP_backfacing: info = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f: 0.0f; break; + case NODE_LP_ray_length: info = ccl_fetch(sd, ray_length); break; + case NODE_LP_ray_depth: info = (float)ccl_fetch(sd, ray_depth); break; + case NODE_LP_ray_transparent: info = (float)ccl_fetch(sd, transparent_depth); break; } stack_store_float(stack, out_offset, info); @@ -53,14 +53,14 @@ ccl_device void svm_node_light_falloff(ShaderData *sd, float *stack, uint4 node) switch(type) { case NODE_LIGHT_FALLOFF_QUADRATIC: break; - case NODE_LIGHT_FALLOFF_LINEAR: strength *= sd->ray_length; break; - case NODE_LIGHT_FALLOFF_CONSTANT: strength *= sd->ray_length*sd->ray_length; break; + case NODE_LIGHT_FALLOFF_LINEAR: strength *= ccl_fetch(sd, ray_length); break; + case NODE_LIGHT_FALLOFF_CONSTANT: strength *= ccl_fetch(sd, ray_length)*ccl_fetch(sd, ray_length); break; } float smooth = stack_load_float(stack, smooth_offset); if(smooth > 0.0f) { - float squared = sd->ray_length*sd->ray_length; + float squared = ccl_fetch(sd, ray_length)*ccl_fetch(sd, ray_length); strength *= squared/(smooth + squared); } diff --git a/intern/cycles/kernel/svm/svm_tex_coord.h b/intern/cycles/kernel/svm/svm_tex_coord.h index a399acf3c0f..eebd9bee420 100644 --- a/intern/cycles/kernel/svm/svm_tex_coord.h +++ b/intern/cycles/kernel/svm/svm_tex_coord.h @@ -31,9 +31,9 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg, switch(type) { case NODE_TEXCO_OBJECT: { - data = sd->P; + data = ccl_fetch(sd, P); if(node.w == 0) { - if(sd->object != OBJECT_NONE) { + if(ccl_fetch(sd, object) != OBJECT_NONE) { object_inverse_position_transform(kg, sd, &data); } } @@ -48,48 +48,48 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg, break; } case NODE_TEXCO_NORMAL: { - data = sd->N; - if(sd->object != OBJECT_NONE) + data = ccl_fetch(sd, N); + if(ccl_fetch(sd, object) != OBJECT_NONE) object_inverse_normal_transform(kg, sd, &data); break; } case NODE_TEXCO_CAMERA: { Transform tfm = kernel_data.cam.worldtocamera; - if(sd->object != OBJECT_NONE) - data = transform_point(&tfm, sd->P); + if(ccl_fetch(sd, object) != OBJECT_NONE) + data = transform_point(&tfm, ccl_fetch(sd, P)); else - data = transform_point(&tfm, sd->P + camera_position(kg)); + data = transform_point(&tfm, ccl_fetch(sd, P) + camera_position(kg)); break; } case NODE_TEXCO_WINDOW: { - if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) - data = camera_world_to_ndc(kg, sd, sd->ray_P); + if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) + data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P)); else - data = camera_world_to_ndc(kg, sd, sd->P); + data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P)); data.z = 0.0f; break; } case NODE_TEXCO_REFLECTION: { - if(sd->object != OBJECT_NONE) - data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I; + if(ccl_fetch(sd, object) != OBJECT_NONE) + data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I); else - data = sd->I; + data = ccl_fetch(sd, I); break; } case NODE_TEXCO_DUPLI_GENERATED: { - data = object_dupli_generated(kg, sd->object); + data = object_dupli_generated(kg, ccl_fetch(sd, object)); break; } case NODE_TEXCO_DUPLI_UV: { - data = object_dupli_uv(kg, sd->object); + data = object_dupli_uv(kg, ccl_fetch(sd, object)); break; } case NODE_TEXCO_VOLUME_GENERATED: { - data = sd->P; + data = ccl_fetch(sd, P); #ifdef __VOLUME__ - if(sd->object != OBJECT_NONE) + if(ccl_fetch(sd, object) != OBJECT_NONE) data = volume_normalized_position(kg, sd, data); #endif break; @@ -113,9 +113,9 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg, switch(type) { case NODE_TEXCO_OBJECT: { - data = sd->P + sd->dP.dx; + data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx; if(node.w == 0) { - if(sd->object != OBJECT_NONE) { + if(ccl_fetch(sd, object) != OBJECT_NONE) { object_inverse_position_transform(kg, sd, &data); } } @@ -130,48 +130,48 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg, break; } case NODE_TEXCO_NORMAL: { - data = sd->N; - if(sd->object != OBJECT_NONE) + data = ccl_fetch(sd, N); + if(ccl_fetch(sd, object) != OBJECT_NONE) object_inverse_normal_transform(kg, sd, &data); break; } case NODE_TEXCO_CAMERA: { Transform tfm = kernel_data.cam.worldtocamera; - if(sd->object != OBJECT_NONE) - data = transform_point(&tfm, sd->P + sd->dP.dx); + if(ccl_fetch(sd, object) != OBJECT_NONE) + data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx); else - data = transform_point(&tfm, sd->P + sd->dP.dx + camera_position(kg)); + data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx + camera_position(kg)); break; } case NODE_TEXCO_WINDOW: { - if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) - data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dx); + if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) + data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P) + ccl_fetch(sd, ray_dP).dx); else - data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dx); + data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx); data.z = 0.0f; break; } case NODE_TEXCO_REFLECTION: { - if(sd->object != OBJECT_NONE) - data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I; + if(ccl_fetch(sd, object) != OBJECT_NONE) + data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I); else - data = sd->I; + data = ccl_fetch(sd, I); break; } case NODE_TEXCO_DUPLI_GENERATED: { - data = object_dupli_generated(kg, sd->object); + data = object_dupli_generated(kg, ccl_fetch(sd, object)); break; } case NODE_TEXCO_DUPLI_UV: { - data = object_dupli_uv(kg, sd->object); + data = object_dupli_uv(kg, ccl_fetch(sd, object)); break; } case NODE_TEXCO_VOLUME_GENERATED: { - data = sd->P + sd->dP.dx; + data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx; #ifdef __VOLUME__ - if(sd->object != OBJECT_NONE) + if(ccl_fetch(sd, object) != OBJECT_NONE) data = volume_normalized_position(kg, sd, data); #endif break; @@ -198,9 +198,9 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg, switch(type) { case NODE_TEXCO_OBJECT: { - data = sd->P + sd->dP.dy; + data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy; if(node.w == 0) { - if(sd->object != OBJECT_NONE) { + if(ccl_fetch(sd, object) != OBJECT_NONE) { object_inverse_position_transform(kg, sd, &data); } } @@ -215,48 +215,48 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg, break; } case NODE_TEXCO_NORMAL: { - data = sd->N; - if(sd->object != OBJECT_NONE) + data = ccl_fetch(sd, N); + if(ccl_fetch(sd, object) != OBJECT_NONE) object_inverse_normal_transform(kg, sd, &data); break; } case NODE_TEXCO_CAMERA: { Transform tfm = kernel_data.cam.worldtocamera; - if(sd->object != OBJECT_NONE) - data = transform_point(&tfm, sd->P + sd->dP.dy); + if(ccl_fetch(sd, object) != OBJECT_NONE) + data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy); else - data = transform_point(&tfm, sd->P + sd->dP.dy + camera_position(kg)); + data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy + camera_position(kg)); break; } case NODE_TEXCO_WINDOW: { - if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) - data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dy); + if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) + data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P) + ccl_fetch(sd, ray_dP).dy); else - data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dy); + data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy); data.z = 0.0f; break; } case NODE_TEXCO_REFLECTION: { - if(sd->object != OBJECT_NONE) - data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I; + if(ccl_fetch(sd, object) != OBJECT_NONE) + data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I); else - data = sd->I; + data = ccl_fetch(sd, I); break; } case NODE_TEXCO_DUPLI_GENERATED: { - data = object_dupli_generated(kg, sd->object); + data = object_dupli_generated(kg, ccl_fetch(sd, object)); break; } case NODE_TEXCO_DUPLI_UV: { - data = object_dupli_uv(kg, sd->object); + data = object_dupli_uv(kg, ccl_fetch(sd, object)); break; } case NODE_TEXCO_VOLUME_GENERATED: { - data = sd->P + sd->dP.dy; + data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy; #ifdef __VOLUME__ - if(sd->object != OBJECT_NONE) + if(ccl_fetch(sd, object) != OBJECT_NONE) data = volume_normalized_position(kg, sd, data); #endif break; @@ -281,7 +281,7 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st if(space == NODE_NORMAL_MAP_TANGENT) { /* tangent space */ - if(sd->object == OBJECT_NONE) { + if(ccl_fetch(sd, object) == OBJECT_NONE) { stack_store_float3(stack, normal_offset, make_float3(0.0f, 0.0f, 0.0f)); return; } @@ -302,11 +302,11 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st float sign = primitive_attribute_float(kg, sd, attr_sign_elem, attr_sign_offset, NULL, NULL); float3 normal; - if(sd->shader & SHADER_SMOOTH_NORMAL) { + if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) { normal = primitive_attribute_float3(kg, sd, attr_normal_elem, attr_normal_offset, NULL, NULL); } else { - normal = sd->Ng; + normal = ccl_fetch(sd, Ng); object_inverse_normal_transform(kg, sd, &normal); } @@ -337,7 +337,7 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st if(strength != 1.0f) { strength = max(strength, 0.0f); - N = normalize(sd->N + (N - sd->N)*strength); + N = normalize(ccl_fetch(sd, N) + (N - ccl_fetch(sd, N))*strength); } stack_store_float3(stack, normal_offset, N); @@ -367,7 +367,7 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack float3 generated; if(attr_offset == ATTR_STD_NOT_FOUND) - generated = sd->P; + generated = ccl_fetch(sd, P); else generated = primitive_attribute_float3(kg, sd, attr_elem, attr_offset, NULL, NULL); @@ -380,7 +380,7 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack } object_normal_transform(kg, sd, &tangent); - tangent = cross(sd->N, normalize(cross(tangent, sd->N))); + tangent = cross(ccl_fetch(sd, N), normalize(cross(tangent, ccl_fetch(sd, N)))); stack_store_float3(stack, tangent_offset, tangent); } diff --git a/intern/cycles/kernel/svm/svm_vector_transform.h b/intern/cycles/kernel/svm/svm_vector_transform.h index 4e92f27acdb..4c32130d06d 100644 --- a/intern/cycles/kernel/svm/svm_vector_transform.h +++ b/intern/cycles/kernel/svm/svm_vector_transform.h @@ -33,7 +33,7 @@ ccl_device void svm_node_vector_transform(KernelGlobals *kg, ShaderData *sd, flo NodeVectorTransformConvertSpace to = (NodeVectorTransformConvertSpace)ito; Transform tfm; - bool is_object = (sd->object != OBJECT_NONE); + bool is_object = (ccl_fetch(sd, object) != OBJECT_NONE); bool is_direction = (type == NODE_VECTOR_TRANSFORM_TYPE_VECTOR || type == NODE_VECTOR_TRANSFORM_TYPE_NORMAL); /* From world */ diff --git a/intern/cycles/kernel/svm/svm_wireframe.h b/intern/cycles/kernel/svm/svm_wireframe.h index eaa17f8ce57..30ccd523add 100644 --- a/intern/cycles/kernel/svm/svm_wireframe.h +++ b/intern/cycles/kernel/svm/svm_wireframe.h @@ -41,9 +41,9 @@ ccl_device float wireframe(KernelGlobals *kg, float3 *P) { #ifdef __HAIR__ - if(sd->prim != PRIM_NONE && sd->type & PRIMITIVE_ALL_TRIANGLE) + if(ccl_fetch(sd, prim) != PRIM_NONE && ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) #else - if(sd->prim != PRIM_NONE) + if(ccl_fetch(sd, prim) != PRIM_NONE) #endif { float3 Co[3]; @@ -52,12 +52,12 @@ ccl_device float wireframe(KernelGlobals *kg, /* Triangles */ int np = 3; - if(sd->type & PRIMITIVE_TRIANGLE) - triangle_vertices(kg, sd->prim, Co); + if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) + triangle_vertices(kg, ccl_fetch(sd, prim), Co); else - motion_triangle_vertices(kg, sd->object, sd->prim, sd->time, Co); + motion_triangle_vertices(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), Co); - if(!(sd->flag & SD_TRANSFORM_APPLIED)) { + if(!(ccl_fetch(sd, flag) & SD_TRANSFORM_APPLIED)) { object_position_transform(kg, sd, &Co[0]); object_position_transform(kg, sd, &Co[1]); object_position_transform(kg, sd, &Co[2]); @@ -66,8 +66,8 @@ ccl_device float wireframe(KernelGlobals *kg, if(pixel_size) { // Project the derivatives of P to the viewing plane defined // by I so we have a measure of how big is a pixel at this point - float pixelwidth_x = len(sd->dP.dx - dot(sd->dP.dx, sd->I) * sd->I); - float pixelwidth_y = len(sd->dP.dy - dot(sd->dP.dy, sd->I) * sd->I); + float pixelwidth_x = len(ccl_fetch(sd, dP).dx - dot(ccl_fetch(sd, dP).dx, ccl_fetch(sd, I)) * ccl_fetch(sd, I)); + float pixelwidth_y = len(ccl_fetch(sd, dP).dy - dot(ccl_fetch(sd, dP).dy, ccl_fetch(sd, I)) * ccl_fetch(sd, I)); // Take the average of both axis' length pixelwidth = (pixelwidth_x + pixelwidth_y) * 0.5f; } @@ -106,16 +106,27 @@ ccl_device void svm_node_wireframe(KernelGlobals *kg, int pixel_size = (int)use_pixel_size; /* Calculate wireframe */ - float f = wireframe(kg, sd, size, pixel_size, &sd->P); +#ifdef __SPLIT_KERNEL__ + /* TODO(sergey): This is because sd is actually a global space, + * which makes it difficult to re-use same wireframe() function. + * + * With OpenCL 2.0 it's possible to avoid this change, but for until + * then we'll be living with such an exception. + */ + float3 P = ccl_fetch(sd, P); + float f = wireframe(kg, sd, size, pixel_size, &P); +#else + float f = wireframe(kg, sd, size, pixel_size, &ccl_fetch(sd, P)); +#endif /* TODO(sergey): Think of faster way to calculate derivatives. */ if(bump_offset == NODE_BUMP_OFFSET_DX) { - float3 Px = sd->P - sd->dP.dx; - f += (f - wireframe(kg, sd, size, pixel_size, &Px)) / len(sd->dP.dx); + float3 Px = ccl_fetch(sd, P) - ccl_fetch(sd, dP).dx; + f += (f - wireframe(kg, sd, size, pixel_size, &Px)) / len(ccl_fetch(sd, dP).dx); } else if(bump_offset == NODE_BUMP_OFFSET_DY) { - float3 Py = sd->P - sd->dP.dy; - f += (f - wireframe(kg, sd, size, pixel_size, &Py)) / len(sd->dP.dy); + float3 Py = ccl_fetch(sd, P) - ccl_fetch(sd, dP).dy; + f += (f - wireframe(kg, sd, size, pixel_size, &Py)) / len(ccl_fetch(sd, dP).dy); } if(stack_valid(out_fac)) diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp index aacb81faf83..7b329af008d 100644 --- a/intern/cycles/render/session.cpp +++ b/intern/cycles/render/session.cpp @@ -807,7 +807,10 @@ void Session::update_status_time(bool show_pause, bool show_done) substatus = string_printf("Path Tracing Tile %d/%d", tile, num_tiles); - if((is_gpu && !is_multidevice) || (is_cpu && num_tiles == 1)) { + if(((is_gpu && !is_multidevice) || (is_cpu && num_tiles == 1)) && !device->info.use_split_kernel) { + /* When using split-kernel (OpenCL) each thread in a tile will be working on a different + * sample. Can't display sample number when device uses split-kernel + */ /* when rendering on GPU multithreading happens within single tile, as in * tiles are handling sequentially and in this case we could display * currently rendering sample number |