156 files changed, 4671 insertions, 3896 deletions
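Most of the host-side scheduling logic in this change lands in the new intern/cycles/device/device_split_kernel.cpp further down: between tile updates it keeps a rolling average of the time spent per sample and uses it to decide how many samples the next batch of split-kernel launches should cover. The following standalone C++ sketch (an editor's illustration, not Cycles source; the launch timings are made-up inputs) shows that adaptive batching in isolation.

    // Simplified, self-contained illustration of the adaptive sample-batching
    // logic introduced in device_split_kernel.cpp: an exponential moving
    // average of time-per-sample drives the size of the next sample batch.
    #include <algorithm>
    #include <cstdio>

    int main()
    {
        const double alpha = 0.1;        /* weight of the newest measurement */
        double avg_time_per_sample = 0.0;
        int time_multiplier = 1;         /* grows so tile updates become less frequent */

        /* Hypothetical measured batch times (seconds) and their sample counts. */
        const double batch_time[]    = {0.50, 0.45, 0.90, 1.70};
        const int    batch_samples[] = {1, 2, 4, 8};

        for(int i = 0; i < 4; i++) {
            /* Samples to request for the next batch, based on the average so far. */
            int samples_per_batch = (avg_time_per_sample > 0.0)
                    ? int(double(time_multiplier) / avg_time_per_sample) + 1
                    : 1;

            double time_per_sample = batch_time[i] / batch_samples[i];

            /* Rolling average update, as in the patch. */
            avg_time_per_sample = (avg_time_per_sample == 0.0)
                    ? time_per_sample
                    : alpha*time_per_sample + (1.0 - alpha)*avg_time_per_sample;

            /* Exponential back-off of update frequency, capped at 10. */
            time_multiplier = std::min(time_multiplier << 1, 10);

            printf("batch %d: next batch would be %d samples, avg %.3f s/sample\n",
                   i, samples_per_batch, avg_time_per_sample);
        }
        return 0;
    }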
diff --git a/extern/cuew/include/cuew.h b/extern/cuew/include/cuew.h index 19087117667..4cce29d38ab 100644 --- a/extern/cuew/include/cuew.h +++ b/extern/cuew/include/cuew.h @@ -114,7 +114,7 @@ extern "C" { #define cuGLGetDevices cuGLGetDevices_v2 /* Types. */ -#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64) +#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64) || defined (__aarch64__) typedef unsigned long long CUdeviceptr; #else typedef unsigned int CUdeviceptr; diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index 5c51f9afc28..ca109734314 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -665,8 +665,10 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): cls.debug_use_cpu_sse3 = BoolProperty(name="SSE3", default=True) cls.debug_use_cpu_sse2 = BoolProperty(name="SSE2", default=True) cls.debug_use_qbvh = BoolProperty(name="QBVH", default=True) + cls.debug_use_cpu_split_kernel = BoolProperty(name="Split Kernel", default=False) cls.debug_use_cuda_adaptive_compile = BoolProperty(name="Adaptive Compile", default=False) + cls.debug_use_cuda_split_kernel = BoolProperty(name="Split Kernel", default=False) cls.debug_opencl_kernel_type = EnumProperty( name="OpenCL Kernel Type", diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index eb89f0b1efa..688d025e7d7 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -1518,10 +1518,12 @@ class CyclesRender_PT_debug(CyclesButtonsPanel, Panel): row.prop(cscene, "debug_use_cpu_avx", toggle=True) row.prop(cscene, "debug_use_cpu_avx2", toggle=True) col.prop(cscene, "debug_use_qbvh") + col.prop(cscene, "debug_use_cpu_split_kernel") col = layout.column() col.label('CUDA Flags:') col.prop(cscene, "debug_use_cuda_adaptive_compile") + col.prop(cscene, "debug_use_cuda_split_kernel") col = layout.column() col.label('OpenCL Flags:') diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp index 438abc49f88..75118c43747 100644 --- a/intern/cycles/blender/blender_python.cpp +++ b/intern/cycles/blender/blender_python.cpp @@ -67,8 +67,10 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene) flags.cpu.sse3 = get_boolean(cscene, "debug_use_cpu_sse3"); flags.cpu.sse2 = get_boolean(cscene, "debug_use_cpu_sse2"); flags.cpu.qbvh = get_boolean(cscene, "debug_use_qbvh"); + flags.cpu.split_kernel = get_boolean(cscene, "debug_use_cpu_split_kernel"); /* Synchronize CUDA flags. */ flags.cuda.adaptive_compile = get_boolean(cscene, "debug_use_cuda_adaptive_compile"); + flags.cuda.split_kernel = get_boolean(cscene, "debug_use_cuda_split_kernel"); /* Synchronize OpenCL kernel type. */ switch(get_enum(cscene, "debug_opencl_kernel_type")) { case 0: diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt index 966ff5e52ba..a2373451696 100644 --- a/intern/cycles/device/CMakeLists.txt +++ b/intern/cycles/device/CMakeLists.txt @@ -3,6 +3,7 @@ set(INC . 
../graph ../kernel + ../kernel/split ../kernel/svm ../kernel/osl ../util @@ -33,6 +34,7 @@ set(SRC device_cuda.cpp device_multi.cpp device_opencl.cpp + device_split_kernel.cpp device_task.cpp ) @@ -56,6 +58,7 @@ set(SRC_HEADERS device_memory.h device_intern.h device_network.h + device_split_kernel.h device_task.h ) diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index 31c99f49d6d..6b07b9d04bd 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -80,7 +80,7 @@ Device::~Device() void Device::pixels_alloc(device_memory& mem) { - mem_alloc(mem, MEM_READ_WRITE); + mem_alloc("pixels", mem, MEM_READ_WRITE); } void Device::pixels_copy_from(device_memory& mem, int y, int w, int h) diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index ccee25ae34e..c740cada98b 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -234,7 +234,7 @@ public: Stats &stats; /* regular memory */ - virtual void mem_alloc(device_memory& mem, MemoryType type) = 0; + virtual void mem_alloc(const char *name, device_memory& mem, MemoryType type) = 0; virtual void mem_copy_to(device_memory& mem) = 0; virtual void mem_copy_from(device_memory& mem, int y, int w, int h, int elem) = 0; diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index c8e001ec2fd..06a1568b4d6 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -26,10 +26,12 @@ #include "device.h" #include "device_intern.h" +#include "device_split_kernel.h" #include "kernel.h" #include "kernel_compat_cpu.h" #include "kernel_types.h" +#include "split/kernel_split_data.h" #include "kernel_globals.h" #include "osl_shader.h" @@ -41,6 +43,7 @@ #include "util_foreach.h" #include "util_function.h" #include "util_logging.h" +#include "util_map.h" #include "util_opengl.h" #include "util_progress.h" #include "util_system.h" @@ -48,8 +51,93 @@ CCL_NAMESPACE_BEGIN +class CPUDevice; + +class CPUSplitKernel : public DeviceSplitKernel { + CPUDevice *device; +public: + explicit CPUSplitKernel(CPUDevice *device); + + virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, + RenderTile& rtile, + int num_global_elements, + device_memory& kernel_globals, + device_memory& kernel_data_, + device_memory& split_data, + device_memory& ray_state, + device_memory& queue_index, + device_memory& use_queues_flag, + device_memory& work_pool_wgs); + + virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&); + virtual int2 split_kernel_local_size(); + virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task); + virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads); +}; + class CPUDevice : public Device { + static unordered_map<string, void*> kernel_functions; + + static void register_kernel_function(const char* name, void* func) + { + kernel_functions[name] = func; + } + + static const char* get_arch_name() + { +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 + if(system_cpu_support_avx2()) { + return "cpu_avx2"; + } + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX + if(system_cpu_support_avx()) { + return "cpu_avx"; + } + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 + if(system_cpu_support_sse41()) { + return "cpu_sse41"; + } + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 + if(system_cpu_support_sse3()) { + return "cpu_sse3"; + } + else +#endif +#ifdef 
WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 + if(system_cpu_support_sse2()) { + return "cpu_sse2"; + } + else +#endif + { + return "cpu"; + } + } + + template<typename F> + static F get_kernel_function(string name) + { + name = string("kernel_") + get_arch_name() + "_" + name; + + unordered_map<string, void*>::iterator it = kernel_functions.find(name); + + if(it == kernel_functions.end()) { + assert(!"kernel function not found"); + return NULL; + } + + return (F)it->second; + } + + friend class CPUSplitKernel; + public: TaskPool task_pool; KernelGlobals kernel_globals; @@ -57,10 +145,15 @@ public: #ifdef WITH_OSL OSLGlobals osl_globals; #endif + + bool use_split_kernel; + + DeviceRequestedFeatures requested_features; CPUDevice(DeviceInfo& info, Stats &stats, bool background) : Device(info, stats, background) { + #ifdef WITH_OSL kernel_globals.osl = &osl_globals; #endif @@ -105,6 +198,28 @@ public: { VLOG(1) << "Will be using regular kernels."; } + + use_split_kernel = DebugFlags().cpu.split_kernel; + if(use_split_kernel) { + VLOG(1) << "Will be using split kernel."; + } + + kernel_cpu_register_functions(register_kernel_function); +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 + kernel_cpu_sse2_register_functions(register_kernel_function); +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 + kernel_cpu_sse3_register_functions(register_kernel_function); +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 + kernel_cpu_sse41_register_functions(register_kernel_function); +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX + kernel_cpu_avx_register_functions(register_kernel_function); +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 + kernel_cpu_avx2_register_functions(register_kernel_function); +#endif } ~CPUDevice() @@ -117,9 +232,20 @@ public: return (TaskScheduler::num_threads() == 1); } - void mem_alloc(device_memory& mem, MemoryType /*type*/) + void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/) { + if(name) { + VLOG(1) << "Buffer allocate: " << name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. 
(" + << string_human_readable_size(mem.memory_size()) << ")"; + } + mem.device_pointer = mem.data_pointer; + + if(!mem.device_pointer) { + mem.device_pointer = (device_ptr)malloc(mem.memory_size()); + } + mem.device_size = mem.memory_size(); stats.mem_alloc(mem.device_size); } @@ -144,6 +270,10 @@ public: void mem_free(device_memory& mem) { if(mem.device_pointer) { + if(!mem.data_pointer) { + free((void*)mem.device_pointer); + } + mem.device_pointer = 0; stats.mem_free(mem.device_size); mem.device_size = 0; @@ -196,8 +326,14 @@ public: void thread_run(DeviceTask *task) { - if(task->type == DeviceTask::PATH_TRACE) - thread_path_trace(*task); + if(task->type == DeviceTask::PATH_TRACE) { + if(!use_split_kernel) { + thread_path_trace(*task); + } + else { + thread_path_trace_split(*task); + } + } else if(task->type == DeviceTask::FILM_CONVERT) thread_film_convert(*task); else if(task->type == DeviceTask::SHADER) @@ -258,7 +394,7 @@ public: { path_trace_kernel = kernel_cpu_path_trace; } - + while(task.acquire_tile(this, tile)) { float *render_buffer = (float*)tile.buffer; uint *rng_state = (uint*)tile.rng_state; @@ -294,6 +430,49 @@ public: thread_kernel_globals_free(&kg); } + void thread_path_trace_split(DeviceTask& task) + { + if(task_pool.canceled()) { + if(task.need_finish_queue == false) + return; + } + + RenderTile tile; + + CPUSplitKernel split_kernel(this); + + /* allocate buffer for kernel globals */ + device_memory kgbuffer; + kgbuffer.resize(sizeof(KernelGlobals)); + mem_alloc("kernel_globals", kgbuffer, MEM_READ_WRITE); + + KernelGlobals *kg = (KernelGlobals*)kgbuffer.device_pointer; + *kg = thread_kernel_globals_init(); + + requested_features.max_closure = MAX_CLOSURE; + if(!split_kernel.load_kernels(requested_features)) { + thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer); + mem_free(kgbuffer); + + return; + } + + while(task.acquire_tile(this, tile)) { + device_memory data; + split_kernel.path_trace(&task, tile, kgbuffer, data); + + task.release_tile(tile); + + if(task_pool.canceled()) { + if(task.need_finish_queue == false) + break; + } + } + + thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer); + mem_free(kgbuffer); + } + void thread_film_convert(DeviceTask& task) { float sample_scale = 1.0f/(task.sample + 1); @@ -501,6 +680,10 @@ protected: inline void thread_kernel_globals_free(KernelGlobals *kg) { + if(kg == NULL) { + return; + } + if(kg->transparent_shadow_intersections != NULL) { free(kg->transparent_shadow_intersections); } @@ -515,8 +698,176 @@ protected: OSLShader::thread_free(kg); #endif } + + virtual bool load_kernels(DeviceRequestedFeatures& requested_features_) { + requested_features = requested_features_; + + return true; + } +}; + +/* split kernel */ + +class CPUSplitKernelFunction : public SplitKernelFunction { +public: + CPUDevice* device; + void (*func)(KernelGlobals *kg, KernelData *data); + + CPUSplitKernelFunction(CPUDevice* device) : device(device), func(NULL) {} + ~CPUSplitKernelFunction() {} + + virtual bool enqueue(const KernelDimensions& dim, device_memory& kernel_globals, device_memory& data) + { + if(!func) { + return false; + } + + KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer; + kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]); + + for(int y = 0; y < dim.global_size[1]; y++) { + for(int x = 0; x < dim.global_size[0]; x++) { + kg->global_id = make_int2(x, y); + + func(kg, (KernelData*)data.device_pointer); + } + } + + return true; + } }; +CPUSplitKernel::CPUSplitKernel(CPUDevice 
*device) : DeviceSplitKernel(device), device(device) +{ +} + +bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim, + RenderTile& rtile, + int num_global_elements, + device_memory& kernel_globals, + device_memory& data, + device_memory& split_data, + device_memory& ray_state, + device_memory& queue_index, + device_memory& use_queues_flags, + device_memory& work_pool_wgs) +{ + typedef void(*data_init_t)(KernelGlobals *kg, + ccl_constant KernelData *data, + ccl_global void *split_data_buffer, + int num_elements, + ccl_global char *ray_state, + ccl_global uint *rng_state, + int start_sample, + int end_sample, + int sx, int sy, int sw, int sh, int offset, int stride, + ccl_global int *Queue_index, + int queuesize, + ccl_global char *use_queues_flag, + ccl_global unsigned int *work_pool_wgs, + unsigned int num_samples, + ccl_global float *buffer); + + data_init_t data_init; + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 + if(system_cpu_support_avx2()) { + data_init = kernel_cpu_avx2_data_init; + } + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX + if(system_cpu_support_avx()) { + data_init = kernel_cpu_avx_data_init; + } + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 + if(system_cpu_support_sse41()) { + data_init = kernel_cpu_sse41_data_init; + } + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 + if(system_cpu_support_sse3()) { + data_init = kernel_cpu_sse3_data_init; + } + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 + if(system_cpu_support_sse2()) { + data_init = kernel_cpu_sse2_data_init; + } + else +#endif + { + data_init = kernel_cpu_data_init; + } + + KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer; + kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]); + + for(int y = 0; y < dim.global_size[1]; y++) { + for(int x = 0; x < dim.global_size[0]; x++) { + kg->global_id = make_int2(x, y); + + data_init((KernelGlobals*)kernel_globals.device_pointer, + (KernelData*)data.device_pointer, + (void*)split_data.device_pointer, + num_global_elements, + (char*)ray_state.device_pointer, + (uint*)rtile.rng_state, + rtile.start_sample, + rtile.start_sample + rtile.num_samples, + rtile.x, + rtile.y, + rtile.w, + rtile.h, + rtile.offset, + rtile.stride, + (int*)queue_index.device_pointer, + dim.global_size[0] * dim.global_size[1], + (char*)use_queues_flags.device_pointer, + (uint*)work_pool_wgs.device_pointer, + rtile.num_samples, + (float*)rtile.buffer); + } + } + + return true; +} + +SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&) +{ + CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device); + + kernel->func = device->get_kernel_function<void(*)(KernelGlobals*, KernelData*)>(kernel_name); + if(!kernel->func) { + delete kernel; + return NULL; + } + + return kernel; +} + +int2 CPUSplitKernel::split_kernel_local_size() +{ + return make_int2(1, 1); +} + +int2 CPUSplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask *task) { + /* TODO(mai): this needs investigation but cpu gives incorrect render if global size doesnt match tile size */ + return task->requested_tile_size; +} + +size_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device_memory& /*data*/, size_t num_threads) { + KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer; + + return split_data_buffer_size(kg, num_threads); +} + +unordered_map<string, void*> CPUDevice::kernel_functions; + Device 
*device_cpu_create(DeviceInfo& info, Stats &stats, bool background) { return new CPUDevice(info, stats, background); diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 5e87f9ec895..a630a3d1183 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -22,6 +22,7 @@ #include "device.h" #include "device_intern.h" +#include "device_split_kernel.h" #include "buffers.h" @@ -43,6 +44,8 @@ #include "util_types.h" #include "util_time.h" +#include "split/kernel_split_data_types.h" + CCL_NAMESPACE_BEGIN #ifndef WITH_CUDA_DYNLOAD @@ -79,6 +82,31 @@ int cuewCompilerVersion(void) } /* namespace */ #endif /* WITH_CUDA_DYNLOAD */ +class CUDADevice; + +class CUDASplitKernel : public DeviceSplitKernel { + CUDADevice *device; +public: + explicit CUDASplitKernel(CUDADevice *device); + + virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads); + + virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, + RenderTile& rtile, + int num_global_elements, + device_memory& kernel_globals, + device_memory& kernel_data_, + device_memory& split_data, + device_memory& ray_state, + device_memory& queue_index, + device_memory& use_queues_flag, + device_memory& work_pool_wgs); + + virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&); + virtual int2 split_kernel_local_size(); + virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task); +}; + class CUDADevice : public Device { public: @@ -259,11 +287,16 @@ public: return DebugFlags().cuda.adaptive_compile; } + bool use_split_kernel() + { + return DebugFlags().cuda.split_kernel; + } + /* Common NVCC flags which stays the same regardless of shading model, * kernel sources md5 and only depends on compiler or compilation settings. */ string compile_kernel_get_common_cflags( - const DeviceRequestedFeatures& requested_features) + const DeviceRequestedFeatures& requested_features, bool split=false) { const int cuda_version = cuewCompilerVersion(); const int machine = system_cpu_bits(); @@ -288,6 +321,11 @@ public: #ifdef WITH_CYCLES_DEBUG cflags += " -D__KERNEL_DEBUG__"; #endif + + if(split) { + cflags += " -D__SPLIT__"; + } + return cflags; } @@ -321,7 +359,7 @@ public: return true; } - string compile_kernel(const DeviceRequestedFeatures& requested_features) + string compile_kernel(const DeviceRequestedFeatures& requested_features, bool split=false) { /* Compute cubin name. */ int major, minor; @@ -330,7 +368,8 @@ public: /* Attempt to use kernel provided with Blender. */ if(!use_adaptive_compilation()) { - const string cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin", + const string cubin = path_get(string_printf(split ? "lib/kernel_split_sm_%d%d.cubin" + : "lib/kernel_sm_%d%d.cubin", major, minor)); VLOG(1) << "Testing for pre-compiled kernel " << cubin << "."; if(path_exists(cubin)) { @@ -340,7 +379,7 @@ public: } const string common_cflags = - compile_kernel_get_common_cflags(requested_features); + compile_kernel_get_common_cflags(requested_features, split); /* Try to use locally compiled kernel. */ const string kernel_path = path_get("kernel"); @@ -351,7 +390,8 @@ public: */ const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags); - const string cubin_file = string_printf("cycles_kernel_sm%d%d_%s.cubin", + const string cubin_file = string_printf(split ? 
"cycles_kernel_split_sm%d%d_%s.cubin" + : "cycles_kernel_sm%d%d_%s.cubin", major, minor, cubin_md5.c_str()); const string cubin = path_cache_get(path_join("kernels", cubin_file)); @@ -386,7 +426,7 @@ public: const char *nvcc = cuewCompilerPath(); const string kernel = path_join(kernel_path, path_join("kernels", - path_join("cuda", "kernel.cu"))); + path_join("cuda", split ? "kernel_split.cu" : "kernel.cu"))); double starttime = time_dt(); printf("Compiling CUDA kernel ...\n"); @@ -434,7 +474,7 @@ public: return false; /* get kernel */ - string cubin = compile_kernel(requested_features); + string cubin = compile_kernel(requested_features, use_split_kernel()); if(cubin == "") return false; @@ -467,8 +507,14 @@ public: } } - void mem_alloc(device_memory& mem, MemoryType /*type*/) + void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/) { + if(name) { + VLOG(1) << "Buffer allocate: " << name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + } + cuda_push_context(); CUdeviceptr device_pointer; size_t size = mem.memory_size(); @@ -505,7 +551,9 @@ public: void mem_zero(device_memory& mem) { - memset((void*)mem.data_pointer, 0, mem.memory_size()); + if(mem.data_pointer) { + memset((void*)mem.data_pointer, 0, mem.memory_size()); + } cuda_push_context(); if(mem.device_pointer) @@ -618,7 +666,7 @@ public: /* Data Storage */ if(interpolation == INTERPOLATION_NONE) { if(has_bindless_textures) { - mem_alloc(mem, MEM_READ_ONLY); + mem_alloc(NULL, mem, MEM_READ_ONLY); mem_copy_to(mem); cuda_push_context(); @@ -642,7 +690,7 @@ public: cuda_pop_context(); } else { - mem_alloc(mem, MEM_READ_ONLY); + mem_alloc(NULL, mem, MEM_READ_ONLY); mem_copy_to(mem); cuda_push_context(); @@ -1259,25 +1307,48 @@ public: /* Upload Bindless Mapping */ load_bindless_mapping(); - /* keep rendering tiles until done */ - while(task->acquire_tile(this, tile)) { - int start_sample = tile.start_sample; - int end_sample = tile.start_sample + tile.num_samples; + if(!use_split_kernel()) { + /* keep rendering tiles until done */ + while(task->acquire_tile(this, tile)) { + int start_sample = tile.start_sample; + int end_sample = tile.start_sample + tile.num_samples; - for(int sample = start_sample; sample < end_sample; sample++) { - if(task->get_cancel()) { - if(task->need_finish_queue == false) - break; - } + for(int sample = start_sample; sample < end_sample; sample++) { + if(task->get_cancel()) { + if(task->need_finish_queue == false) + break; + } - path_trace(tile, sample, branched); + path_trace(tile, sample, branched); - tile.sample = sample + 1; + tile.sample = sample + 1; - task->update_progress(&tile, tile.w*tile.h); + task->update_progress(&tile, tile.w*tile.h); + } + + task->release_tile(tile); + } + } + else { + DeviceRequestedFeatures requested_features; + if(!use_adaptive_compilation()) { + requested_features.max_closure = 64; } - task->release_tile(tile); + CUDASplitKernel split_kernel(this); + split_kernel.load_kernels(requested_features); + + while(task->acquire_tile(this, tile)) { + device_memory void_buffer; + split_kernel.path_trace(task, tile, void_buffer, void_buffer); + + task->release_tile(tile); + + if(task->get_cancel()) { + if(task->need_finish_queue == false) + break; + } + } } } else if(task->type == DeviceTask::SHADER) { @@ -1330,8 +1401,223 @@ public: { task_pool.cancel(); } + + friend class CUDASplitKernelFunction; + friend class CUDASplitKernel; }; +/* redefine the cuda_assert macro so it can be used 
outside of the CUDADevice class + * now that the definition of that class is complete + */ +#undef cuda_assert +#define cuda_assert(stmt) \ + { \ + CUresult result = stmt; \ + \ + if(result != CUDA_SUCCESS) { \ + string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \ + if(device->error_msg == "") \ + device->error_msg = message; \ + fprintf(stderr, "%s\n", message.c_str()); \ + /*cuda_abort();*/ \ + device->cuda_error_documentation(); \ + } \ + } (void)0 + +/* split kernel */ + +class CUDASplitKernelFunction : public SplitKernelFunction{ + CUDADevice* device; + CUfunction func; +public: + CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func) {} + + /* enqueue the kernel, returns false if there is an error */ + bool enqueue(const KernelDimensions &dim, device_memory &/*kg*/, device_memory &/*data*/) + { + return enqueue(dim, NULL); + } + + /* enqueue the kernel, returns false if there is an error */ + bool enqueue(const KernelDimensions &dim, void *args[]) + { + device->cuda_push_context(); + + if(device->have_error()) + return false; + + /* we ignore dim.local_size for now, as this is faster */ + int threads_per_block; + cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); + + int xthreads = (int)sqrt(threads_per_block); + int ythreads = (int)sqrt(threads_per_block); + + int xblocks = (dim.global_size[0] + xthreads - 1)/xthreads; + int yblocks = (dim.global_size[1] + ythreads - 1)/ythreads; + + cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); + + cuda_assert(cuLaunchKernel(func, + xblocks , yblocks, 1, /* blocks */ + xthreads, ythreads, 1, /* threads */ + 0, 0, args, 0)); + + device->cuda_pop_context(); + + return !device->have_error(); + } +}; + +CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device) +{ +} + +size_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory& /*data*/, size_t num_threads) +{ + device_vector<uint> size_buffer; + size_buffer.resize(1); + device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE); + + device->cuda_push_context(); + + uint threads = num_threads; + CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer); + + struct args_t { + uint* num_threads; + CUdeviceptr* size; + }; + + args_t args = { + &threads, + &d_size + }; + + CUfunction state_buffer_size; + cuda_assert(cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size")); + + cuda_assert(cuLaunchKernel(state_buffer_size, + 1, 1, 1, + 1, 1, 1, + 0, 0, &args, 0)); + + device->cuda_pop_context(); + + device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint)); + device->mem_free(size_buffer); + + return *size_buffer.get_data(); +} + +bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim, + RenderTile& rtile, + int num_global_elements, + device_memory& /*kernel_globals*/, + device_memory& /*kernel_data*/, + device_memory& split_data, + device_memory& ray_state, + device_memory& queue_index, + device_memory& use_queues_flag, + device_memory& work_pool_wgs) +{ + device->cuda_push_context(); + + CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer); + CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer); + CUdeviceptr d_queue_index = device->cuda_device_ptr(queue_index.device_pointer); + CUdeviceptr d_use_queues_flag = device->cuda_device_ptr(use_queues_flag.device_pointer); + CUdeviceptr 
d_work_pool_wgs = device->cuda_device_ptr(work_pool_wgs.device_pointer); + + CUdeviceptr d_rng_state = device->cuda_device_ptr(rtile.rng_state); + CUdeviceptr d_buffer = device->cuda_device_ptr(rtile.buffer); + + int end_sample = rtile.start_sample + rtile.num_samples; + int queue_size = dim.global_size[0] * dim.global_size[1]; + + struct args_t { + CUdeviceptr* split_data_buffer; + int* num_elements; + CUdeviceptr* ray_state; + CUdeviceptr* rng_state; + int* start_sample; + int* end_sample; + int* sx; + int* sy; + int* sw; + int* sh; + int* offset; + int* stride; + CUdeviceptr* queue_index; + int* queuesize; + CUdeviceptr* use_queues_flag; + CUdeviceptr* work_pool_wgs; + int* num_samples; + CUdeviceptr* buffer; + }; + + args_t args = { + &d_split_data, + &num_global_elements, + &d_ray_state, + &d_rng_state, + &rtile.start_sample, + &end_sample, + &rtile.x, + &rtile.y, + &rtile.w, + &rtile.h, + &rtile.offset, + &rtile.stride, + &d_queue_index, + &queue_size, + &d_use_queues_flag, + &d_work_pool_wgs, + &rtile.num_samples, + &d_buffer + }; + + CUfunction data_init; + cuda_assert(cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init")); + if(device->have_error()) { + return false; + } + + CUDASplitKernelFunction(device, data_init).enqueue(dim, (void**)&args); + + device->cuda_pop_context(); + + return !device->have_error(); +} + +SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&) +{ + CUfunction func; + + device->cuda_push_context(); + + cuda_assert(cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data())); + if(device->have_error()) { + device->cuda_error_message(string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data())); + return NULL; + } + + device->cuda_pop_context(); + + return new CUDASplitKernelFunction(device, func); +} + +int2 CUDASplitKernel::split_kernel_local_size() +{ + return make_int2(32, 1); +} + +int2 CUDASplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask */*task*/) +{ + /* TODO(mai): implement something here to detect ideal work size */ + return make_int2(256, 256); +} + bool device_cuda_init(void) { #ifdef WITH_CUDA_DYNLOAD diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h index 5b5b4dc6802..b69c3dad604 100644 --- a/intern/cycles/device/device_memory.h +++ b/intern/cycles/device/device_memory.h @@ -180,10 +180,27 @@ public: /* device pointer */ device_ptr device_pointer; -protected: - device_memory() {} + device_memory() + { + data_type = device_type_traits<uchar>::data_type; + data_elements = device_type_traits<uchar>::num_elements; + data_pointer = 0; + data_size = 0; + device_size = 0; + data_width = 0; + data_height = 0; + data_depth = 0; + device_pointer = 0; + } virtual ~device_memory() { assert(!device_pointer); } + void resize(size_t size) + { + data_size = size; + data_width = size; + } + +protected: /* no copying */ device_memory(const device_memory&); device_memory& operator = (const device_memory&); @@ -198,16 +215,8 @@ public: { data_type = device_type_traits<T>::data_type; data_elements = device_type_traits<T>::num_elements; - data_pointer = 0; - data_size = 0; - device_size = 0; - data_width = 0; - data_height = 0; - data_depth = 0; assert(data_elements > 0); - - device_pointer = 0; } virtual ~device_vector() {} @@ -266,6 +275,7 @@ public: data_height = 0; data_depth = 0; data_size = 0; + device_pointer = 0; } size_t 
size() diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp index 61d78ee65de..3368fd3d756 100644 --- a/intern/cycles/device/device_multi.cpp +++ b/intern/cycles/device/device_multi.cpp @@ -106,11 +106,11 @@ public: return true; } - void mem_alloc(device_memory& mem, MemoryType type) + void mem_alloc(const char *name, device_memory& mem, MemoryType type) { foreach(SubDevice& sub, devices) { mem.device_pointer = 0; - sub.device->mem_alloc(mem, type); + sub.device->mem_alloc(name, mem, type); sub.ptr_map[unique_ptr] = mem.device_pointer; } diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp index 53eef6cf199..6dc4aecbc50 100644 --- a/intern/cycles/device/device_network.cpp +++ b/intern/cycles/device/device_network.cpp @@ -87,8 +87,14 @@ public: snd.write(); } - void mem_alloc(device_memory& mem, MemoryType type) + void mem_alloc(const char *name, device_memory& mem, MemoryType type) { + if(name) { + VLOG(1) << "Buffer allocate: " << name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + } + thread_scoped_lock lock(rpc_lock); mem.device_pointer = ++mem_counter; @@ -481,7 +487,7 @@ protected: mem.data_pointer = 0; /* perform the allocation on the actual device */ - device->mem_alloc(mem, type); + device->mem_alloc(NULL, mem, type); /* store a mapping to/from client_pointer and real device pointer */ pointer_mapping_insert(client_pointer, mem.device_pointer); diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp new file mode 100644 index 00000000000..b9705077fbf --- /dev/null +++ b/intern/cycles/device/device_split_kernel.cpp @@ -0,0 +1,281 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "device_split_kernel.h" + +#include "kernel_types.h" +#include "kernel_split_data_types.h" + +#include "util_time.h" + +CCL_NAMESPACE_BEGIN + +static const double alpha = 0.1; /* alpha for rolling average */ + +DeviceSplitKernel::DeviceSplitKernel(Device *device) : device(device) +{ + current_max_closure = -1; + first_tile = true; + + avg_time_per_sample = 0.0; + + kernel_path_init = NULL; + kernel_scene_intersect = NULL; + kernel_lamp_emission = NULL; + kernel_queue_enqueue = NULL; + kernel_background_buffer_update = NULL; + kernel_shader_eval = NULL; + kernel_holdout_emission_blurring_pathtermination_ao = NULL; + kernel_direct_lighting = NULL; + kernel_shadow_blocked = NULL; + kernel_next_iteration_setup = NULL; +} + +DeviceSplitKernel::~DeviceSplitKernel() +{ + device->mem_free(split_data); + device->mem_free(ray_state); + device->mem_free(use_queues_flag); + device->mem_free(queue_index); + device->mem_free(work_pool_wgs); + + delete kernel_path_init; + delete kernel_scene_intersect; + delete kernel_lamp_emission; + delete kernel_queue_enqueue; + delete kernel_background_buffer_update; + delete kernel_shader_eval; + delete kernel_holdout_emission_blurring_pathtermination_ao; + delete kernel_direct_lighting; + delete kernel_shadow_blocked; + delete kernel_next_iteration_setup; +} + +bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_features) +{ +#define LOAD_KERNEL(name) \ + kernel_##name = get_split_kernel_function(#name, requested_features); \ + if(!kernel_##name) { \ + return false; \ + } + + LOAD_KERNEL(path_init); + LOAD_KERNEL(scene_intersect); + LOAD_KERNEL(lamp_emission); + LOAD_KERNEL(queue_enqueue); + LOAD_KERNEL(background_buffer_update); + LOAD_KERNEL(shader_eval); + LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao); + LOAD_KERNEL(direct_lighting); + LOAD_KERNEL(shadow_blocked); + LOAD_KERNEL(next_iteration_setup); + +#undef LOAD_KERNEL + + current_max_closure = requested_features.max_closure; + + return true; +} + +size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, size_t max_buffer_size) +{ + size_t size_per_element = state_buffer_size(kg, data, 1024) / 1024; + return max_buffer_size / size_per_element; +} + +bool DeviceSplitKernel::path_trace(DeviceTask *task, + RenderTile& tile, + device_memory& kgbuffer, + device_memory& kernel_data) +{ + if(device->have_error()) { + return false; + } + + /* Get local size */ + size_t local_size[2]; + { + int2 lsize = split_kernel_local_size(); + local_size[0] = lsize[0]; + local_size[1] = lsize[1]; + } + + /* Set gloabl size */ + size_t global_size[2]; + { + int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task); + + /* Make sure that set work size is a multiple of local + * work size dimensions. + */ + global_size[0] = round_up(gsize[0], local_size[0]); + global_size[1] = round_up(gsize[1], local_size[1]); + } + + /* Number of elements in the global state buffer */ + int num_global_elements = global_size[0] * global_size[1]; + + /* Allocate all required global memory once. */ + if(first_tile) { + first_tile = false; + + /* Calculate max groups */ + + /* Denotes the maximum work groups possible w.r.t. current requested tile size. */ + unsigned int max_work_groups = num_global_elements / WORK_POOL_SIZE + 1; + + /* Allocate work_pool_wgs memory. 
*/ + work_pool_wgs.resize(max_work_groups * sizeof(unsigned int)); + device->mem_alloc("work_pool_wgs", work_pool_wgs, MEM_READ_WRITE); + + queue_index.resize(NUM_QUEUES * sizeof(int)); + device->mem_alloc("queue_index", queue_index, MEM_READ_WRITE); + + use_queues_flag.resize(sizeof(char)); + device->mem_alloc("use_queues_flag", use_queues_flag, MEM_READ_WRITE); + + ray_state.resize(num_global_elements); + device->mem_alloc("ray_state", ray_state, MEM_READ_WRITE); + + split_data.resize(state_buffer_size(kgbuffer, kernel_data, num_global_elements)); + device->mem_alloc("split_data", split_data, MEM_READ_WRITE); + } + +#define ENQUEUE_SPLIT_KERNEL(name, global_size, local_size) \ + if(device->have_error()) { \ + return false; \ + } \ + if(!kernel_##name->enqueue(KernelDimensions(global_size, local_size), kgbuffer, kernel_data)) { \ + return false; \ + } + + tile.sample = tile.start_sample; + + /* for exponential increase between tile updates */ + int time_multiplier = 1; + + while(tile.sample < tile.start_sample + tile.num_samples) { + /* to keep track of how long it takes to run a number of samples */ + double start_time = time_dt(); + + /* initial guess to start rolling average */ + const int initial_num_samples = 1; + /* approx number of samples per second */ + int samples_per_second = (avg_time_per_sample > 0.0) ? + int(double(time_multiplier) / avg_time_per_sample) + 1 : initial_num_samples; + + RenderTile subtile = tile; + subtile.start_sample = tile.sample; + subtile.num_samples = min(samples_per_second, tile.start_sample + tile.num_samples - tile.sample); + + if(device->have_error()) { + return false; + } + + /* reset state memory here as global size for data_init + * kernel might not be large enough to do in kernel + */ + device->mem_zero(work_pool_wgs); + device->mem_zero(split_data); + + if(!enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size), + subtile, + num_global_elements, + kgbuffer, + kernel_data, + split_data, + ray_state, + queue_index, + use_queues_flag, + work_pool_wgs)) + { + return false; + } + + ENQUEUE_SPLIT_KERNEL(path_init, global_size, local_size); + + bool activeRaysAvailable = true; + + while(activeRaysAvailable) { + /* Twice the global work size of other kernels for + * ckPathTraceKernel_shadow_blocked_direct_lighting. */ + size_t global_size_shadow_blocked[2]; + global_size_shadow_blocked[0] = global_size[0] * 2; + global_size_shadow_blocked[1] = global_size[1]; + + /* Do path-iteration in host [Enqueue Path-iteration kernels. */ + for(int PathIter = 0; PathIter < 16; PathIter++) { + ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(background_buffer_update, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(shadow_blocked, global_size_shadow_blocked, local_size); + ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size); + + if(task->get_cancel()) { + return true; + } + } + + /* Decide if we should exit path-iteration in host. 
*/ + device->mem_copy_from(ray_state, 0, global_size[0] * global_size[1] * sizeof(char), 1, 1); + + activeRaysAvailable = false; + + for(int rayStateIter = 0; rayStateIter < global_size[0] * global_size[1]; ++rayStateIter) { + if(int8_t(ray_state.get_data()[rayStateIter]) != RAY_INACTIVE) { + /* Not all rays are RAY_INACTIVE. */ + activeRaysAvailable = true; + break; + } + } + + if(task->get_cancel()) { + return true; + } + } + + double time_per_sample = ((time_dt()-start_time) / subtile.num_samples); + + if(avg_time_per_sample == 0.0) { + /* start rolling average */ + avg_time_per_sample = time_per_sample; + } + else { + avg_time_per_sample = alpha*time_per_sample + (1.0-alpha)*avg_time_per_sample; + } + +#undef ENQUEUE_SPLIT_KERNEL + + tile.sample += subtile.num_samples; + task->update_progress(&tile, tile.w*tile.h*subtile.num_samples); + + time_multiplier = min(time_multiplier << 1, 10); + + if(task->get_cancel()) { + return true; + } + } + + return true; +} + +CCL_NAMESPACE_END + + diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h new file mode 100644 index 00000000000..cc3e1aa26ae --- /dev/null +++ b/intern/cycles/device/device_split_kernel.h @@ -0,0 +1,127 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __DEVICE_SPLIT_KERNEL_H__ +#define __DEVICE_SPLIT_KERNEL_H__ + +#include "device.h" +#include "buffers.h" + +CCL_NAMESPACE_BEGIN + +/* When allocate global memory in chunks. We may not be able to + * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes in chunks; + * Since some bytes may be needed for aligning chunks of memory; + * This is the amount of memory that we dedicate for that purpose. 
+ */ +#define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB + +/* Types used for split kernel */ + +class KernelDimensions { +public: + size_t global_size[2]; + size_t local_size[2]; + + KernelDimensions(size_t global_size_[2], size_t local_size_[2]) + { + memcpy(global_size, global_size_, sizeof(global_size)); + memcpy(local_size, local_size_, sizeof(local_size)); + } +}; + +class SplitKernelFunction { +public: + virtual ~SplitKernelFunction() {} + + /* enqueue the kernel, returns false if there is an error */ + virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data) = 0; +}; + +class DeviceSplitKernel { +private: + Device *device; + + SplitKernelFunction *kernel_path_init; + SplitKernelFunction *kernel_scene_intersect; + SplitKernelFunction *kernel_lamp_emission; + SplitKernelFunction *kernel_queue_enqueue; + SplitKernelFunction *kernel_background_buffer_update; + SplitKernelFunction *kernel_shader_eval; + SplitKernelFunction *kernel_holdout_emission_blurring_pathtermination_ao; + SplitKernelFunction *kernel_direct_lighting; + SplitKernelFunction *kernel_shadow_blocked; + SplitKernelFunction *kernel_next_iteration_setup; + + /* Global memory variables [porting]; These memory is used for + * co-operation between different kernels; Data written by one + * kernel will be available to another kernel via this global + * memory. + */ + device_memory split_data; + device_vector<uchar> ray_state; + device_memory queue_index; /* Array of size num_queues * sizeof(int) that tracks the size of each queue. */ + + /* Flag to make sceneintersect and lampemission kernel use queues. */ + device_memory use_queues_flag; + + /* Approximate time it takes to complete one sample */ + double avg_time_per_sample; + + /* Work pool with respect to each work group. */ + device_memory work_pool_wgs; + + /* clos_max value for which the kernels have been loaded currently. */ + int current_max_closure; + + /* Marked True in constructor and marked false at the end of path_trace(). 
*/ + bool first_tile; + +public: + explicit DeviceSplitKernel(Device* device); + virtual ~DeviceSplitKernel(); + + bool load_kernels(const DeviceRequestedFeatures& requested_features); + bool path_trace(DeviceTask *task, + RenderTile& rtile, + device_memory& kgbuffer, + device_memory& kernel_data); + + virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads) = 0; + size_t max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, size_t max_buffer_size); + + virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, + RenderTile& rtile, + int num_global_elements, + device_memory& kernel_globals, + device_memory& kernel_data_, + device_memory& split_data, + device_memory& ray_state, + device_memory& queue_index, + device_memory& use_queues_flag, + device_memory& work_pool_wgs) = 0; + + virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&) = 0; + virtual int2 split_kernel_local_size() = 0; + virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task) = 0; +}; + +CCL_NAMESPACE_END + +#endif /* __DEVICE_SPLIT_KERNEL_H__ */ + + + diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h index 8bd54c3d2b0..f31092fd9d2 100644 --- a/intern/cycles/device/device_task.h +++ b/intern/cycles/device/device_task.h @@ -51,6 +51,8 @@ public: int shader_filter; int shader_x, shader_w; + int passes_size; + explicit DeviceTask(Type type = PATH_TRACE); int get_subtask_count(int num, int max_size = 0); diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h index 4023ba89a10..6470cb8ff7e 100644 --- a/intern/cycles/device/opencl/opencl.h +++ b/intern/cycles/device/opencl/opencl.h @@ -26,29 +26,29 @@ CCL_NAMESPACE_BEGIN -#define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p)) - -/* Macro declarations used with split kernel */ - -/* Macro to enable/disable work-stealing */ -#define __WORK_STEALING__ - -#define SPLIT_KERNEL_LOCAL_SIZE_X 64 -#define SPLIT_KERNEL_LOCAL_SIZE_Y 1 - -/* This value may be tuned according to the scene we are rendering. - * - * Modifying PATH_ITER_INC_FACTOR value proportional to number of expected - * ray-bounces will improve performance. - */ -#define PATH_ITER_INC_FACTOR 8 +/* Define CYCLES_DISABLE_DRIVER_WORKAROUNDS to disable workaounds for testing */ +#ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS +/* Work around AMD driver hangs by ensuring each command is finished before doing anything else. */ +# undef clEnqueueNDRangeKernel +# define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \ + clFinish(a); \ + CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i); \ + clFinish(a); + +# undef clEnqueueWriteBuffer +# define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \ + clFinish(a); \ + CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i); \ + clFinish(a); + +# undef clEnqueueReadBuffer +# define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \ + clFinish(a); \ + CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i); \ + clFinish(a); +#endif /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */ -/* When allocate global memory in chunks. We may not be able to - * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes in chunks; - * Since some bytes may be needed for aligning chunks of memory; - * This is the amount of memory that we dedicate for that purpose. 
- */ -#define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB +#define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p)) struct OpenCLPlatformDevice { OpenCLPlatformDevice(cl_platform_id platform_id, @@ -248,6 +248,7 @@ public: bool device_initialized; string platform_name; + string device_name; bool opencl_error(cl_int err); void opencl_error(const string& message); @@ -266,10 +267,10 @@ public: /* Has to be implemented by the real device classes. * The base device will then load all these programs. */ - virtual void load_kernels(const DeviceRequestedFeatures& requested_features, + virtual bool load_kernels(const DeviceRequestedFeatures& requested_features, vector<OpenCLProgram*> &programs) = 0; - void mem_alloc(device_memory& mem, MemoryType type); + void mem_alloc(const char *name, device_memory& mem, MemoryType type); void mem_copy_to(device_memory& mem); void mem_copy_from(device_memory& mem, int y, int w, int h, int elem); void mem_zero(device_memory& mem); @@ -326,16 +327,39 @@ protected: class ArgumentWrapper { public: - ArgumentWrapper() : size(0), pointer(NULL) {} - template <typename T> + ArgumentWrapper() : size(0), pointer(NULL) + { + } + + ArgumentWrapper(device_memory& argument) : size(sizeof(void*)), + pointer((void*)(&argument.device_pointer)) + { + } + + template<typename T> + ArgumentWrapper(device_vector<T>& argument) : size(sizeof(void*)), + pointer((void*)(&argument.device_pointer)) + { + } + + template<typename T> ArgumentWrapper(T& argument) : size(sizeof(argument)), - pointer(&argument) { } + pointer(&argument) + { + } + ArgumentWrapper(int argument) : size(sizeof(int)), int_value(argument), - pointer(&int_value) { } + pointer(&int_value) + { + } + ArgumentWrapper(float argument) : size(sizeof(float)), float_value(argument), - pointer(&float_value) { } + pointer(&float_value) + { + } + size_t size; int int_value; float float_value; diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp index a2b900312e7..c5f44f84e8c 100644 --- a/intern/cycles/device/opencl/opencl_base.cpp +++ b/intern/cycles/device/opencl/opencl_base.cpp @@ -82,9 +82,10 @@ OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool backgrou cpPlatform = platform_device.platform_id; cdDevice = platform_device.device_id; platform_name = platform_device.platform_name; + device_name = platform_device.device_name; VLOG(2) << "Creating new Cycles device for OpenCL platform " << platform_name << ", device " - << platform_device.device_name << "."; + << device_name << "."; { /* try to use cached context */ @@ -113,12 +114,16 @@ OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool backgrou } cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr); - if(opencl_error(ciErr)) + if(opencl_error(ciErr)) { + opencl_error("OpenCL: Error creating command queue"); return; + } null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr); - if(opencl_error(ciErr)) + if(opencl_error(ciErr)) { + opencl_error("OpenCL: Error creating memory buffer for NULL"); return; + } fprintf(stderr, "Device init success\n"); device_initialized = true; @@ -191,6 +196,8 @@ string OpenCLDeviceBase::device_md5_hash(string kernel_custom_build_options) bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_features) { + VLOG(2) << "Loading kernels for platform " << platform_name + << ", device " << device_name << "."; /* Verify if device was initialized. 
*/ if(!device_initialized) { fprintf(stderr, "OpenCL: failed to initialize device.\n"); @@ -206,11 +213,14 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea base_program.add_kernel(ustring("convert_to_half_float")); base_program.add_kernel(ustring("shader")); base_program.add_kernel(ustring("bake")); + base_program.add_kernel(ustring("zero_buffer")); vector<OpenCLProgram*> programs; programs.push_back(&base_program); /* Call actual class to fill the vector with its programs. */ - load_kernels(requested_features, programs); + if(!load_kernels(requested_features, programs)) { + return false; + } /* Parallel compilation is supported by Cycles, but currently all OpenCL frameworks * serialize the calls internally, so it's not much use right now. @@ -242,8 +252,14 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea return true; } -void OpenCLDeviceBase::mem_alloc(device_memory& mem, MemoryType type) +void OpenCLDeviceBase::mem_alloc(const char *name, device_memory& mem, MemoryType type) { + if(name) { + VLOG(1) << "Buffer allocate: " << name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + } + size_t size = mem.memory_size(); cl_mem_flags mem_flag; @@ -311,8 +327,61 @@ void OpenCLDeviceBase::mem_copy_from(device_memory& mem, int y, int w, int h, in void OpenCLDeviceBase::mem_zero(device_memory& mem) { if(mem.device_pointer) { - memset((void*)mem.data_pointer, 0, mem.memory_size()); - mem_copy_to(mem); + if(base_program.is_loaded()) { + cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer")); + + size_t global_size[] = {1024, 1024}; + size_t num_threads = global_size[0] * global_size[1]; + + cl_mem d_buffer = CL_MEM_PTR(mem.device_pointer); + unsigned long long d_offset = 0; + unsigned long long d_size = 0; + + while(d_offset < mem.memory_size()) { + d_size = std::min<unsigned long long>(num_threads*sizeof(float4), mem.memory_size() - d_offset); + + kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset); + + ciErr = clEnqueueNDRangeKernel(cqCommandQueue, + ckZeroBuffer, + 2, + NULL, + global_size, + NULL, + 0, + NULL, + NULL); + opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); + + d_offset += d_size; + } + } + + if(mem.data_pointer) { + memset((void*)mem.data_pointer, 0, mem.memory_size()); + } + + if(!base_program.is_loaded()) { + void* zero = (void*)mem.data_pointer; + + if(!mem.data_pointer) { + zero = util_aligned_malloc(mem.memory_size(), 16); + memset(zero, 0, mem.memory_size()); + } + + opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, + CL_MEM_PTR(mem.device_pointer), + CL_TRUE, + 0, + mem.memory_size(), + zero, + 0, + NULL, NULL)); + + if(!mem.data_pointer) { + util_aligned_free(zero); + } + } } } @@ -337,7 +406,7 @@ void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size) device_vector<uchar> *data = new device_vector<uchar>(); data->copy((uchar*)host, size); - mem_alloc(*data, MEM_READ_ONLY); + mem_alloc(name, *data, MEM_READ_ONLY); i = const_mem_map.insert(ConstMemMap::value_type(name, data)).first; } else { @@ -356,7 +425,7 @@ void OpenCLDeviceBase::tex_alloc(const char *name, VLOG(1) << "Texture allocate: " << name << ", " << string_human_readable_number(mem.memory_size()) << " bytes. 
(" << string_human_readable_size(mem.memory_size()) << ")"; - mem_alloc(mem, MEM_READ_ONLY); + mem_alloc(NULL, mem, MEM_READ_ONLY); mem_copy_to(mem); assert(mem_map.find(name) == mem_map.end()); mem_map.insert(MemMap::value_type(name, mem.device_pointer)); diff --git a/intern/cycles/device/opencl/opencl_mega.cpp b/intern/cycles/device/opencl/opencl_mega.cpp index 6ea7619e022..049e332272b 100644 --- a/intern/cycles/device/opencl/opencl_mega.cpp +++ b/intern/cycles/device/opencl/opencl_mega.cpp @@ -43,11 +43,12 @@ public: return true; } - virtual void load_kernels(const DeviceRequestedFeatures& /*requested_features*/, + virtual bool load_kernels(const DeviceRequestedFeatures& /*requested_features*/, vector<OpenCLProgram*> &programs) { path_trace_program.add_kernel(ustring("path_trace")); programs.push_back(&path_trace_program); + return true; } ~OpenCLDeviceMegaKernel() diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp index 3c3c2150128..b651b4a848e 100644 --- a/intern/cycles/device/opencl/opencl_split.cpp +++ b/intern/cycles/device/opencl/opencl_split.cpp @@ -21,325 +21,48 @@ #include "buffers.h" #include "kernel_types.h" +#include "kernel_split_data_types.h" +#include "device_split_kernel.h" + +#include "util_logging.h" #include "util_md5.h" #include "util_path.h" #include "util_time.h" CCL_NAMESPACE_BEGIN -/* TODO(sergey): This is to keep tile split on OpenCL level working - * for now, since without this view-port render does not work as it - * should. - * - * Ideally it'll be done on the higher level, but we need to get ready - * for merge rather soon, so let's keep split logic private here in - * the file. - */ -class SplitRenderTile : public RenderTile { -public: - SplitRenderTile() - : RenderTile(), - buffer_offset_x(0), - buffer_offset_y(0), - rng_state_offset_x(0), - rng_state_offset_y(0), - buffer_rng_state_stride(0) {} - - explicit SplitRenderTile(RenderTile& tile) - : RenderTile(), - buffer_offset_x(0), - buffer_offset_y(0), - rng_state_offset_x(0), - rng_state_offset_y(0), - buffer_rng_state_stride(0) - { - x = tile.x; - y = tile.y; - w = tile.w; - h = tile.h; - start_sample = tile.start_sample; - num_samples = tile.num_samples; - sample = tile.sample; - resolution = tile.resolution; - offset = tile.offset; - stride = tile.stride; - buffer = tile.buffer; - rng_state = tile.rng_state; - buffers = tile.buffers; +class OpenCLSplitKernel; + +static string get_build_options(OpenCLDeviceBase *device, const DeviceRequestedFeatures& requested_features) +{ + string build_options = "-D__SPLIT_KERNEL__ "; + build_options += requested_features.get_build_options(); + + /* Set compute device build option. */ + cl_device_type device_type; + device->ciErr = clGetDeviceInfo(device->cdDevice, + CL_DEVICE_TYPE, + sizeof(cl_device_type), + &device_type, + NULL); + assert(device->ciErr == CL_SUCCESS); + if(device_type == CL_DEVICE_TYPE_GPU) { + build_options += " -D__COMPUTE_DEVICE_GPU__"; } - /* Split kernel is device global memory constrained; - * hence split kernel cant render big tile size's in - * one go. If the user sets a big tile size (big tile size - * is a term relative to the available device global memory), - * we split the tile further and then call path_trace on - * each of those split tiles. 
The following variables declared, - * assist in achieving that purpose - */ - int buffer_offset_x; - int buffer_offset_y; - int rng_state_offset_x; - int rng_state_offset_y; - int buffer_rng_state_stride; -}; + return build_options; +} /* OpenCLDeviceSplitKernel's declaration/definition. */ class OpenCLDeviceSplitKernel : public OpenCLDeviceBase { public: - /* Kernel declaration. */ + DeviceSplitKernel *split_kernel; OpenCLProgram program_data_init; - OpenCLProgram program_scene_intersect; - OpenCLProgram program_lamp_emission; - OpenCLProgram program_queue_enqueue; - OpenCLProgram program_background_buffer_update; - OpenCLProgram program_shader_eval; - OpenCLProgram program_holdout_emission_blurring_pathtermination_ao; - OpenCLProgram program_direct_lighting; - OpenCLProgram program_shadow_blocked; - OpenCLProgram program_next_iteration_setup; - OpenCLProgram program_sum_all_radiance; - - /* Global memory variables [porting]; These memory is used for - * co-operation between different kernels; Data written by one - * kernel will be available to another kernel via this global - * memory. - */ - cl_mem rng_coop; - cl_mem throughput_coop; - cl_mem L_transparent_coop; - cl_mem PathRadiance_coop; - cl_mem Ray_coop; - cl_mem PathState_coop; - cl_mem Intersection_coop; - cl_mem kgbuffer; /* KernelGlobals buffer. */ - - /* Global buffers for ShaderData. */ - cl_mem sd; /* ShaderData used in the main path-iteration loop. */ - cl_mem sd_DL_shadow; /* ShaderData used in Direct Lighting and - * shadow_blocked kernel. - */ - - /* Global memory required for shadow blocked and accum_radiance. */ - cl_mem BSDFEval_coop; - cl_mem ISLamp_coop; - cl_mem LightRay_coop; - cl_mem AOAlpha_coop; - cl_mem AOBSDF_coop; - cl_mem AOLightRay_coop; - cl_mem Intersection_coop_shadow; - -#ifdef WITH_CYCLES_DEBUG - /* DebugData memory */ - cl_mem debugdata_coop; -#endif - - /* Global state array that tracks ray state. */ - cl_mem ray_state; - - /* Per sample buffers. */ - cl_mem per_sample_output_buffers; - - /* Denotes which sample each ray is being processed for. */ - cl_mem work_array; - - /* Queue */ - cl_mem Queue_data; /* Array of size queuesize * num_queues * sizeof(int). */ - cl_mem Queue_index; /* Array of size num_queues * sizeof(int); - * Tracks the size of each queue. - */ - - /* Flag to make sceneintersect and lampemission kernel use queues. */ - cl_mem use_queues_flag; - - /* Amount of memory in output buffer associated with one pixel/thread. */ - size_t per_thread_output_buffer_size; - - /* Total allocatable available device memory. */ - size_t total_allocatable_memory; - - /* host version of ray_state; Used in checking host path-iteration - * termination. - */ - char *hostRayStateArray; - - /* Number of path-iterations to be done in one shot. */ - unsigned int PathIteration_times; - -#ifdef __WORK_STEALING__ - /* Work pool with respect to each work group. */ - cl_mem work_pool_wgs; - - /* Denotes the maximum work groups possible w.r.t. current tile size. */ - unsigned int max_work_groups; -#endif - - /* clos_max value for which the kernels have been loaded currently. */ - int current_max_closure; - - /* Marked True in constructor and marked false at the end of path_trace(). */ - bool first_tile; - - OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_) - : OpenCLDeviceBase(info, stats, background_) - { - background = background_; - - /* Initialize cl_mem variables. 
*/ - kgbuffer = NULL; - sd = NULL; - sd_DL_shadow = NULL; - - rng_coop = NULL; - throughput_coop = NULL; - L_transparent_coop = NULL; - PathRadiance_coop = NULL; - Ray_coop = NULL; - PathState_coop = NULL; - Intersection_coop = NULL; - ray_state = NULL; - - AOAlpha_coop = NULL; - AOBSDF_coop = NULL; - AOLightRay_coop = NULL; - BSDFEval_coop = NULL; - ISLamp_coop = NULL; - LightRay_coop = NULL; - Intersection_coop_shadow = NULL; - -#ifdef WITH_CYCLES_DEBUG - debugdata_coop = NULL; -#endif - - work_array = NULL; - - /* Queue. */ - Queue_data = NULL; - Queue_index = NULL; - use_queues_flag = NULL; - - per_sample_output_buffers = NULL; - - per_thread_output_buffer_size = 0; - hostRayStateArray = NULL; - PathIteration_times = PATH_ITER_INC_FACTOR; -#ifdef __WORK_STEALING__ - work_pool_wgs = NULL; - max_work_groups = 0; -#endif - current_max_closure = -1; - first_tile = true; - - /* Get device's maximum memory that can be allocated. */ - ciErr = clGetDeviceInfo(cdDevice, - CL_DEVICE_MAX_MEM_ALLOC_SIZE, - sizeof(size_t), - &total_allocatable_memory, - NULL); - assert(ciErr == CL_SUCCESS); - if(platform_name == "AMD Accelerated Parallel Processing") { - /* This value is tweak-able; AMD platform does not seem to - * give maximum performance when all of CL_DEVICE_MAX_MEM_ALLOC_SIZE - * is considered for further computation. - */ - total_allocatable_memory /= 2; - } - } - - virtual bool show_samples() const { - return false; - } - - /* Split kernel utility functions. */ - size_t get_tex_size(const char *tex_name) - { - cl_mem ptr; - size_t ret_size = 0; - MemMap::iterator i = mem_map.find(tex_name); - if(i != mem_map.end()) { - ptr = CL_MEM_PTR(i->second); - ciErr = clGetMemObjectInfo(ptr, - CL_MEM_SIZE, - sizeof(ret_size), - &ret_size, - NULL); - assert(ciErr == CL_SUCCESS); - } - return ret_size; - } - - size_t get_shader_data_size(size_t max_closure) - { - /* ShaderData size with variable size ShaderClosure array */ - return sizeof(ShaderData) - (sizeof(ShaderClosure) * (MAX_CLOSURE - max_closure)); - } - - /* Returns size of KernelGlobals structure associated with OpenCL. */ - size_t get_KernelGlobals_size() - { - /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to - * fetch its size. - */ - typedef struct KernelGlobals { - ccl_constant KernelData *data; -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name; -#include "kernel_textures.h" -#undef KERNEL_TEX - void *sd_input; - void *isect_shadow; - } KernelGlobals; - - return sizeof(KernelGlobals); - } + OpenCLProgram program_state_buffer_size; - virtual void load_kernels(const DeviceRequestedFeatures& requested_features, - vector<OpenCLProgram*> &programs) - { - string build_options = "-D__SPLIT_KERNEL__ "; -#ifdef __WORK_STEALING__ - build_options += "-D__WORK_STEALING__ "; -#endif - build_options += requested_features.get_build_options(); - - /* Set compute device build option. 
*/ - cl_device_type device_type; - ciErr = clGetDeviceInfo(cdDevice, - CL_DEVICE_TYPE, - sizeof(cl_device_type), - &device_type, - NULL); - assert(ciErr == CL_SUCCESS); - if(device_type == CL_DEVICE_TYPE_GPU) { - build_options += " -D__COMPUTE_DEVICE_GPU__"; - } - -#define GLUE(a, b) a ## b -#define LOAD_KERNEL(name) \ - do { \ - GLUE(program_, name) = OpenCLProgram(this, "split_" #name, "kernel_" #name ".cl", build_options); \ - GLUE(program_, name).add_kernel(ustring("path_trace_" #name)); \ - programs.push_back(&GLUE(program_, name)); \ - } while(false) - - LOAD_KERNEL(data_init); - LOAD_KERNEL(scene_intersect); - LOAD_KERNEL(lamp_emission); - LOAD_KERNEL(queue_enqueue); - LOAD_KERNEL(background_buffer_update); - LOAD_KERNEL(shader_eval); - LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao); - LOAD_KERNEL(direct_lighting); - LOAD_KERNEL(shadow_blocked); - LOAD_KERNEL(next_iteration_setup); - LOAD_KERNEL(sum_all_radiance); - -#undef FIND_KERNEL -#undef GLUE - - current_max_closure = requested_features.max_closure; - } + OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_); ~OpenCLDeviceSplitKernel() { @@ -347,960 +70,298 @@ public: /* Release kernels */ program_data_init.release(); - program_scene_intersect.release(); - program_lamp_emission.release(); - program_queue_enqueue.release(); - program_background_buffer_update.release(); - program_shader_eval.release(); - program_holdout_emission_blurring_pathtermination_ao.release(); - program_direct_lighting.release(); - program_shadow_blocked.release(); - program_next_iteration_setup.release(); - program_sum_all_radiance.release(); - - /* Release global memory */ - release_mem_object_safe(rng_coop); - release_mem_object_safe(throughput_coop); - release_mem_object_safe(L_transparent_coop); - release_mem_object_safe(PathRadiance_coop); - release_mem_object_safe(Ray_coop); - release_mem_object_safe(PathState_coop); - release_mem_object_safe(Intersection_coop); - release_mem_object_safe(kgbuffer); - release_mem_object_safe(sd); - release_mem_object_safe(sd_DL_shadow); - release_mem_object_safe(ray_state); - release_mem_object_safe(AOAlpha_coop); - release_mem_object_safe(AOBSDF_coop); - release_mem_object_safe(AOLightRay_coop); - release_mem_object_safe(BSDFEval_coop); - release_mem_object_safe(ISLamp_coop); - release_mem_object_safe(LightRay_coop); - release_mem_object_safe(Intersection_coop_shadow); -#ifdef WITH_CYCLES_DEBUG - release_mem_object_safe(debugdata_coop); -#endif - release_mem_object_safe(use_queues_flag); - release_mem_object_safe(Queue_data); - release_mem_object_safe(Queue_index); - release_mem_object_safe(work_array); -#ifdef __WORK_STEALING__ - release_mem_object_safe(work_pool_wgs); -#endif - release_mem_object_safe(per_sample_output_buffers); - - if(hostRayStateArray != NULL) { - free(hostRayStateArray); - } + + delete split_kernel; } - void path_trace(DeviceTask *task, - SplitRenderTile& rtile, - int2 max_render_feasible_tile_size) + virtual bool load_kernels(const DeviceRequestedFeatures& requested_features, + vector<OpenCLDeviceBase::OpenCLProgram*> &programs) { - /* cast arguments to cl types */ - cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); - cl_mem d_buffer = CL_MEM_PTR(rtile.buffer); - cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state); - cl_int d_x = rtile.x; - cl_int d_y = rtile.y; - cl_int d_w = rtile.w; - cl_int d_h = rtile.h; - cl_int d_offset = rtile.offset; - cl_int d_stride = rtile.stride; - - /* Make sure that set render feasible tile size is a multiple of 
local - * work size dimensions. - */ - assert(max_render_feasible_tile_size.x % SPLIT_KERNEL_LOCAL_SIZE_X == 0); - assert(max_render_feasible_tile_size.y % SPLIT_KERNEL_LOCAL_SIZE_Y == 0); - - size_t global_size[2]; - size_t local_size[2] = {SPLIT_KERNEL_LOCAL_SIZE_X, - SPLIT_KERNEL_LOCAL_SIZE_Y}; + program_data_init = OpenCLDeviceBase::OpenCLProgram(this, + "split_data_init", + "kernel_data_init.cl", + get_build_options(this, requested_features)); + program_data_init.add_kernel(ustring("path_trace_data_init")); + programs.push_back(&program_data_init); + + program_state_buffer_size = OpenCLDeviceBase::OpenCLProgram(this, + "split_state_buffer_size", + "kernel_state_buffer_size.cl", + get_build_options(this, requested_features)); + program_state_buffer_size.add_kernel(ustring("path_trace_state_buffer_size")); + programs.push_back(&program_state_buffer_size); + + return split_kernel->load_kernels(requested_features); + } - /* Set the range of samples to be processed for every ray in - * path-regeneration logic. - */ - cl_int start_sample = rtile.start_sample; - cl_int end_sample = rtile.start_sample + rtile.num_samples; - cl_int num_samples = rtile.num_samples; - -#ifdef __WORK_STEALING__ - global_size[0] = (((d_w - 1) / local_size[0]) + 1) * local_size[0]; - global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1]; - unsigned int num_parallel_samples = 1; -#else - global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1]; - unsigned int num_threads = max_render_feasible_tile_size.x * - max_render_feasible_tile_size.y; - unsigned int num_tile_columns_possible = num_threads / global_size[1]; - /* Estimate number of parallel samples that can be - * processed in parallel. - */ - unsigned int num_parallel_samples = min(num_tile_columns_possible / d_w, - rtile.num_samples); - /* Wavefront size in AMD is 64. - * TODO(sergey): What about other platforms? - */ - if(num_parallel_samples >= 64) { - /* TODO(sergey): Could use generic round-up here. */ - num_parallel_samples = (num_parallel_samples / 64) * 64; + void thread_run(DeviceTask *task) + { + if(task->type == DeviceTask::FILM_CONVERT) { + film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half); } - assert(num_parallel_samples != 0); - - global_size[0] = d_w * num_parallel_samples; -#endif /* __WORK_STEALING__ */ - - assert(global_size[0] * global_size[1] <= - max_render_feasible_tile_size.x * max_render_feasible_tile_size.y); - - /* Allocate all required global memory once. */ - if(first_tile) { - size_t num_global_elements = max_render_feasible_tile_size.x * - max_render_feasible_tile_size.y; - /* TODO(sergey): This will actually over-allocate if - * particular kernel does not support multiclosure. - */ - size_t shaderdata_size = get_shader_data_size(current_max_closure); - -#ifdef __WORK_STEALING__ - /* Calculate max groups */ - size_t max_global_size[2]; - size_t tile_x = max_render_feasible_tile_size.x; - size_t tile_y = max_render_feasible_tile_size.y; - max_global_size[0] = (((tile_x - 1) / local_size[0]) + 1) * local_size[0]; - max_global_size[1] = (((tile_y - 1) / local_size[1]) + 1) * local_size[1]; - max_work_groups = (max_global_size[0] * max_global_size[1]) / - (local_size[0] * local_size[1]); - /* Allocate work_pool_wgs memory. */ - work_pool_wgs = mem_alloc(max_work_groups * sizeof(unsigned int)); -#endif /* __WORK_STEALING__ */ - - /* Allocate queue_index memory only once. 
*/ - Queue_index = mem_alloc(NUM_QUEUES * sizeof(int)); - use_queues_flag = mem_alloc(sizeof(char)); - kgbuffer = mem_alloc(get_KernelGlobals_size()); - - /* Create global buffers for ShaderData. */ - sd = mem_alloc(num_global_elements * shaderdata_size); - sd_DL_shadow = mem_alloc(num_global_elements * 2 * shaderdata_size); - - /* Creation of global memory buffers which are shared among - * the kernels. - */ - rng_coop = mem_alloc(num_global_elements * sizeof(RNG)); - throughput_coop = mem_alloc(num_global_elements * sizeof(float3)); - L_transparent_coop = mem_alloc(num_global_elements * sizeof(float)); - PathRadiance_coop = mem_alloc(num_global_elements * sizeof(PathRadiance)); - Ray_coop = mem_alloc(num_global_elements * sizeof(Ray)); - PathState_coop = mem_alloc(num_global_elements * sizeof(PathState)); - Intersection_coop = mem_alloc(num_global_elements * sizeof(Intersection)); - AOAlpha_coop = mem_alloc(num_global_elements * sizeof(float3)); - AOBSDF_coop = mem_alloc(num_global_elements * sizeof(float3)); - AOLightRay_coop = mem_alloc(num_global_elements * sizeof(Ray)); - BSDFEval_coop = mem_alloc(num_global_elements * sizeof(BsdfEval)); - ISLamp_coop = mem_alloc(num_global_elements * sizeof(int)); - LightRay_coop = mem_alloc(num_global_elements * sizeof(Ray)); - Intersection_coop_shadow = mem_alloc(2 * num_global_elements * sizeof(Intersection)); - -#ifdef WITH_CYCLES_DEBUG - debugdata_coop = mem_alloc(num_global_elements * sizeof(DebugData)); -#endif - - ray_state = mem_alloc(num_global_elements * sizeof(char)); - - hostRayStateArray = (char *)calloc(num_global_elements, sizeof(char)); - assert(hostRayStateArray != NULL && "Can't create hostRayStateArray memory"); - - Queue_data = mem_alloc(num_global_elements * (NUM_QUEUES * sizeof(int)+sizeof(int))); - work_array = mem_alloc(num_global_elements * sizeof(unsigned int)); - per_sample_output_buffers = mem_alloc(num_global_elements * - per_thread_output_buffer_size); + else if(task->type == DeviceTask::SHADER) { + shader(*task); } + else if(task->type == DeviceTask::PATH_TRACE) { + RenderTile tile; - cl_int dQueue_size = global_size[0] * global_size[1]; - - cl_uint start_arg_index = - kernel_set_args(program_data_init(), - 0, - kgbuffer, - sd_DL_shadow, - d_data, - per_sample_output_buffers, - d_rng_state, - rng_coop, - throughput_coop, - L_transparent_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - Intersection_coop_shadow, - ray_state); - -/* TODO(sergey): Avoid map lookup here. */ + /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to + * fetch its size. 
+ */ + typedef struct KernelGlobals { + ccl_constant KernelData *data; #define KERNEL_TEX(type, ttype, name) \ - set_kernel_arg_mem(program_data_init(), &start_arg_index, #name); + ccl_global type *name; #include "kernel_textures.h" #undef KERNEL_TEX + SplitData split_data; + SplitParams split_param_data; + } KernelGlobals; - start_arg_index += - kernel_set_args(program_data_init(), - start_arg_index, - start_sample, - d_x, - d_y, - d_w, - d_h, - d_offset, - d_stride, - rtile.rng_state_offset_x, - rtile.rng_state_offset_y, - rtile.buffer_rng_state_stride, - Queue_data, - Queue_index, - dQueue_size, - use_queues_flag, - work_array, -#ifdef __WORK_STEALING__ - work_pool_wgs, - num_samples, -#endif -#ifdef WITH_CYCLES_DEBUG - debugdata_coop, -#endif - num_parallel_samples); - - kernel_set_args(program_scene_intersect(), - 0, - kgbuffer, - d_data, - rng_coop, - Ray_coop, - PathState_coop, - Intersection_coop, - ray_state, - d_w, - d_h, - Queue_data, - Queue_index, - dQueue_size, - use_queues_flag, -#ifdef WITH_CYCLES_DEBUG - debugdata_coop, -#endif - num_parallel_samples); - - kernel_set_args(program_lamp_emission(), - 0, - kgbuffer, - d_data, - throughput_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - Intersection_coop, - ray_state, - d_w, - d_h, - Queue_data, - Queue_index, - dQueue_size, - use_queues_flag, - num_parallel_samples); - - kernel_set_args(program_queue_enqueue(), - 0, - Queue_data, - Queue_index, - ray_state, - dQueue_size); - - kernel_set_args(program_background_buffer_update(), - 0, - kgbuffer, - d_data, - per_sample_output_buffers, - d_rng_state, - rng_coop, - throughput_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - L_transparent_coop, - ray_state, - d_w, - d_h, - d_x, - d_y, - d_stride, - rtile.rng_state_offset_x, - rtile.rng_state_offset_y, - rtile.buffer_rng_state_stride, - work_array, - Queue_data, - Queue_index, - dQueue_size, - end_sample, - start_sample, -#ifdef __WORK_STEALING__ - work_pool_wgs, - num_samples, -#endif -#ifdef WITH_CYCLES_DEBUG - debugdata_coop, -#endif - num_parallel_samples); - - kernel_set_args(program_shader_eval(), - 0, - kgbuffer, - d_data, - sd, - rng_coop, - Ray_coop, - PathState_coop, - Intersection_coop, - ray_state, - Queue_data, - Queue_index, - dQueue_size); - - kernel_set_args(program_holdout_emission_blurring_pathtermination_ao(), - 0, - kgbuffer, - d_data, - sd, - per_sample_output_buffers, - rng_coop, - throughput_coop, - L_transparent_coop, - PathRadiance_coop, - PathState_coop, - Intersection_coop, - AOAlpha_coop, - AOBSDF_coop, - AOLightRay_coop, - d_w, - d_h, - d_x, - d_y, - d_stride, - ray_state, - work_array, - Queue_data, - Queue_index, - dQueue_size, -#ifdef __WORK_STEALING__ - start_sample, -#endif - num_parallel_samples); - - kernel_set_args(program_direct_lighting(), - 0, - kgbuffer, - d_data, - sd, - rng_coop, - PathState_coop, - ISLamp_coop, - LightRay_coop, - BSDFEval_coop, - ray_state, - Queue_data, - Queue_index, - dQueue_size); - - kernel_set_args(program_shadow_blocked(), - 0, - kgbuffer, - d_data, - PathState_coop, - LightRay_coop, - AOLightRay_coop, - ray_state, - Queue_data, - Queue_index, - dQueue_size); - - kernel_set_args(program_next_iteration_setup(), - 0, - kgbuffer, - d_data, - sd, - rng_coop, - throughput_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - LightRay_coop, - ISLamp_coop, - BSDFEval_coop, - AOLightRay_coop, - AOBSDF_coop, - AOAlpha_coop, - ray_state, - Queue_data, - Queue_index, - dQueue_size, - use_queues_flag); - - kernel_set_args(program_sum_all_radiance(), - 
0, - d_data, - d_buffer, - per_sample_output_buffers, - num_parallel_samples, - d_w, - d_h, - d_stride, - rtile.buffer_offset_x, - rtile.buffer_offset_y, - rtile.buffer_rng_state_stride, - start_sample); - - /* Macro for Enqueuing split kernels. */ -#define GLUE(a, b) a ## b -#define ENQUEUE_SPLIT_KERNEL(kernelName, globalSize, localSize) \ - { \ - ciErr = clEnqueueNDRangeKernel(cqCommandQueue, \ - GLUE(program_, \ - kernelName)(), \ - 2, \ - NULL, \ - globalSize, \ - localSize, \ - 0, \ - NULL, \ - NULL); \ - opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); \ - if(ciErr != CL_SUCCESS) { \ - string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", \ - clewErrorString(ciErr)); \ - opencl_error(message); \ - return; \ - } \ - } (void) 0 - - /* Enqueue ckPathTraceKernel_data_init kernel. */ - ENQUEUE_SPLIT_KERNEL(data_init, global_size, local_size); - bool activeRaysAvailable = true; - - /* Record number of time host intervention has been made */ - unsigned int numHostIntervention = 0; - unsigned int numNextPathIterTimes = PathIteration_times; - bool canceled = false; - while(activeRaysAvailable) { - /* Twice the global work size of other kernels for - * ckPathTraceKernel_shadow_blocked_direct_lighting. */ - size_t global_size_shadow_blocked[2]; - global_size_shadow_blocked[0] = global_size[0] * 2; - global_size_shadow_blocked[1] = global_size[1]; - - /* Do path-iteration in host [Enqueue Path-iteration kernels. */ - for(int PathIter = 0; PathIter < PathIteration_times; PathIter++) { - ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(background_buffer_update, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shadow_blocked, global_size_shadow_blocked, local_size); - ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size); - - if(task->get_cancel()) { - canceled = true; - break; - } - } + /* Allocate buffer for kernel globals */ + device_memory kgbuffer; + kgbuffer.resize(sizeof(KernelGlobals)); + mem_alloc("kernel_globals", kgbuffer, MEM_READ_WRITE); - /* Read ray-state into Host memory to decide if we should exit - * path-iteration in host. - */ - ciErr = clEnqueueReadBuffer(cqCommandQueue, - ray_state, - CL_TRUE, - 0, - global_size[0] * global_size[1] * sizeof(char), - hostRayStateArray, - 0, - NULL, - NULL); - assert(ciErr == CL_SUCCESS); - - activeRaysAvailable = false; - - for(int rayStateIter = 0; - rayStateIter < global_size[0] * global_size[1]; - ++rayStateIter) - { - if(int8_t(hostRayStateArray[rayStateIter]) != RAY_INACTIVE) { - /* Not all rays are RAY_INACTIVE. */ - activeRaysAvailable = true; - break; - } - } + /* Keep rendering tiles until done. */ + while(task->acquire_tile(this, tile)) { + split_kernel->path_trace(task, + tile, + kgbuffer, + *const_mem_map["__data"]); - if(activeRaysAvailable) { - numHostIntervention++; - PathIteration_times = PATH_ITER_INC_FACTOR; - /* Host intervention done before all rays become RAY_INACTIVE; - * Set do more initial iterations for the next tile. + /* Complete kernel execution before release tile. 
*/ + /* This helps in multi-device render; + * The device that reaches the critical-section function + * release_tile waits (stalling other devices from entering + * release_tile) for all kernels to complete. If device1 (a + * slow-render device) reaches release_tile first then it would + * stall device2 (a fast-render device) from proceeding to render + * next tile. */ - numNextPathIterTimes += PATH_ITER_INC_FACTOR; - } + clFinish(cqCommandQueue); - if(task->get_cancel()) { - canceled = true; - break; + task->release_tile(tile); } - } - /* Execute SumALLRadiance kernel to accumulate radiance calculated in - * per_sample_output_buffers into RenderTile's output buffer. - */ - if(!canceled) { - size_t sum_all_radiance_local_size[2] = {16, 16}; - size_t sum_all_radiance_global_size[2]; - sum_all_radiance_global_size[0] = - (((d_w - 1) / sum_all_radiance_local_size[0]) + 1) * - sum_all_radiance_local_size[0]; - sum_all_radiance_global_size[1] = - (((d_h - 1) / sum_all_radiance_local_size[1]) + 1) * - sum_all_radiance_local_size[1]; - ENQUEUE_SPLIT_KERNEL(sum_all_radiance, - sum_all_radiance_global_size, - sum_all_radiance_local_size); - } - -#undef ENQUEUE_SPLIT_KERNEL -#undef GLUE - - if(numHostIntervention == 0) { - /* This means that we are executing kernel more than required - * Must avoid this for the next sample/tile. - */ - PathIteration_times = ((numNextPathIterTimes - PATH_ITER_INC_FACTOR) <= 0) ? - PATH_ITER_INC_FACTOR : numNextPathIterTimes - PATH_ITER_INC_FACTOR; + mem_free(kgbuffer); } - else { - /* Number of path-iterations done for this tile is set as - * Initial path-iteration times for the next tile - */ - PathIteration_times = numNextPathIterTimes; - } - - first_tile = false; } - /* Calculates the amount of memory that has to be always - * allocated in order for the split kernel to function. - * This memory is tile/scene-property invariant (meaning, - * the value returned by this function does not depend - * on the user set tile size or scene properties. - */ - size_t get_invariable_mem_allocated() - { - size_t total_invariable_mem_allocated = 0; - size_t KernelGlobals_size = 0; - - KernelGlobals_size = get_KernelGlobals_size(); - - total_invariable_mem_allocated += KernelGlobals_size; /* KernelGlobals size */ - total_invariable_mem_allocated += NUM_QUEUES * sizeof(unsigned int); /* Queue index size */ - total_invariable_mem_allocated += sizeof(char); /* use_queues_flag size */ - - return total_invariable_mem_allocated; - } +protected: + /* ** Those guys are for workign around some compiler-specific bugs ** */ - /* Calculate the memory that has-to-be/has-been allocated for - * the split kernel to function. - */ - size_t get_tile_specific_mem_allocated(const int2 tile_size) + string build_options_for_base_program( + const DeviceRequestedFeatures& requested_features) { - size_t tile_specific_mem_allocated = 0; - - /* Get required tile info */ - unsigned int user_set_tile_w = tile_size.x; - unsigned int user_set_tile_h = tile_size.y; - -#ifdef __WORK_STEALING__ - /* Calculate memory to be allocated for work_pools in - * case of work_stealing. 
- */ - size_t max_global_size[2]; - size_t max_num_work_pools = 0; - max_global_size[0] = - (((user_set_tile_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_X; - max_global_size[1] = - (((user_set_tile_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_Y; - max_num_work_pools = - (max_global_size[0] * max_global_size[1]) / - (SPLIT_KERNEL_LOCAL_SIZE_X * SPLIT_KERNEL_LOCAL_SIZE_Y); - tile_specific_mem_allocated += max_num_work_pools * sizeof(unsigned int); -#endif - - tile_specific_mem_allocated += - user_set_tile_w * user_set_tile_h * per_thread_output_buffer_size; - tile_specific_mem_allocated += - user_set_tile_w * user_set_tile_h * sizeof(RNG); - - return tile_specific_mem_allocated; + return requested_features.get_build_options(); } - /* Calculates the texture memories and KernelData (d_data) memory - * that has been allocated. - */ - size_t get_scene_specific_mem_allocated(cl_mem d_data) - { - size_t scene_specific_mem_allocated = 0; - /* Calculate texture memories. */ -#define KERNEL_TEX(type, ttype, name) \ - scene_specific_mem_allocated += get_tex_size(#name); -#include "kernel_textures.h" -#undef KERNEL_TEX - size_t d_data_size; - ciErr = clGetMemObjectInfo(d_data, - CL_MEM_SIZE, - sizeof(d_data_size), - &d_data_size, - NULL); - assert(ciErr == CL_SUCCESS && "Can't get d_data mem object info"); - scene_specific_mem_allocated += d_data_size; - return scene_specific_mem_allocated; - } + friend class OpenCLSplitKernel; + friend class OpenCLSplitKernelFunction; +}; - /* Calculate the memory required for one thread in split kernel. */ - size_t get_per_thread_memory() - { - size_t shaderdata_size = 0; - /* TODO(sergey): This will actually over-allocate if - * particular kernel does not support multiclosure. - */ - shaderdata_size = get_shader_data_size(current_max_closure); - size_t retval = sizeof(RNG) - + sizeof(float3) /* Throughput size */ - + sizeof(float) /* L transparent size */ - + sizeof(char) /* Ray state size */ - + sizeof(unsigned int) /* Work element size */ - + sizeof(int) /* ISLamp_size */ - + sizeof(PathRadiance) + sizeof(Ray) + sizeof(PathState) - + sizeof(Intersection) /* Overall isect */ - + sizeof(Intersection) /* Instersection_coop_AO */ - + sizeof(Intersection) /* Intersection coop DL */ - + shaderdata_size /* Overall ShaderData */ - + (shaderdata_size * 2) /* ShaderData : DL and shadow */ - + sizeof(Ray) + sizeof(BsdfEval) - + sizeof(float3) /* AOAlpha size */ - + sizeof(float3) /* AOBSDF size */ - + sizeof(Ray) - + (sizeof(int) * NUM_QUEUES) - + per_thread_output_buffer_size; - return retval; - } +class OpenCLSplitKernelFunction : public SplitKernelFunction { +public: + OpenCLDeviceSplitKernel* device; + OpenCLDeviceBase::OpenCLProgram program; - /* Considers the total memory available in the device and - * and returns the maximum global work size possible. - */ - size_t get_feasible_global_work_size(int2 tile_size, cl_mem d_data) - { - /* Calculate invariably allocated memory. */ - size_t invariable_mem_allocated = get_invariable_mem_allocated(); - /* Calculate tile specific allocated memory. */ - size_t tile_specific_mem_allocated = - get_tile_specific_mem_allocated(tile_size); - /* Calculate scene specific allocated memory. */ - size_t scene_specific_mem_allocated = - get_scene_specific_mem_allocated(d_data); - /* Calculate total memory available for the threads in global work size. 
*/ - size_t available_memory = total_allocatable_memory - - invariable_mem_allocated - - tile_specific_mem_allocated - - scene_specific_mem_allocated - - DATA_ALLOCATION_MEM_FACTOR; - size_t per_thread_memory_required = get_per_thread_memory(); - return (available_memory / per_thread_memory_required); - } + OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device) : device(device) {} + ~OpenCLSplitKernelFunction() { program.release(); } - /* Checks if the device has enough memory to render the whole tile; - * If not, we should split single tile into multiple tiles of small size - * and process them all. - */ - bool need_to_split_tile(unsigned int d_w, - unsigned int d_h, - int2 max_render_feasible_tile_size) + virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data) { - size_t global_size_estimate[2]; - /* TODO(sergey): Such round-ups are in quite few places, need to replace - * them with an utility macro. - */ - global_size_estimate[0] = - (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_X; - global_size_estimate[1] = - (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_Y; - if((global_size_estimate[0] * global_size_estimate[1]) > - (max_render_feasible_tile_size.x * max_render_feasible_tile_size.y)) - { - return true; - } - else { + device->kernel_set_args(program(), 0, kg, data); + + device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, + program(), + 2, + NULL, + dim.global_size, + dim.local_size, + 0, + NULL, + NULL); + + device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); + + if(device->ciErr != CL_SUCCESS) { + string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", + clewErrorString(device->ciErr)); + device->opencl_error(message); return false; } + + return true; } +}; - /* Considers the scene properties, global memory available in the device - * and returns a rectanglular tile dimension (approx the maximum) - * that should render on split kernel. - */ - int2 get_max_render_feasible_tile_size(size_t feasible_global_work_size) - { - int2 max_render_feasible_tile_size; - int square_root_val = (int)sqrt(feasible_global_work_size); - max_render_feasible_tile_size.x = square_root_val; - max_render_feasible_tile_size.y = square_root_val; - /* Ciel round-off max_render_feasible_tile_size. */ - int2 ceil_render_feasible_tile_size; - ceil_render_feasible_tile_size.x = - (((max_render_feasible_tile_size.x - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_X; - ceil_render_feasible_tile_size.y = - (((max_render_feasible_tile_size.y - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_Y; - if(ceil_render_feasible_tile_size.x * ceil_render_feasible_tile_size.y <= - feasible_global_work_size) - { - return ceil_render_feasible_tile_size; - } - /* Floor round-off max_render_feasible_tile_size. */ - int2 floor_render_feasible_tile_size; - floor_render_feasible_tile_size.x = - (max_render_feasible_tile_size.x / SPLIT_KERNEL_LOCAL_SIZE_X) * - SPLIT_KERNEL_LOCAL_SIZE_X; - floor_render_feasible_tile_size.y = - (max_render_feasible_tile_size.y / SPLIT_KERNEL_LOCAL_SIZE_Y) * - SPLIT_KERNEL_LOCAL_SIZE_Y; - return floor_render_feasible_tile_size; +class OpenCLSplitKernel : public DeviceSplitKernel { + OpenCLDeviceSplitKernel *device; +public: + explicit OpenCLSplitKernel(OpenCLDeviceSplitKernel *device) : DeviceSplitKernel(device), device(device) { } - /* Try splitting the current tile into multiple smaller - * almost-square-tiles. 
- */ - int2 get_split_tile_size(RenderTile rtile, - int2 max_render_feasible_tile_size) + virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, + const DeviceRequestedFeatures& requested_features) { - int2 split_tile_size; - int num_global_threads = max_render_feasible_tile_size.x * - max_render_feasible_tile_size.y; - int d_w = rtile.w; - int d_h = rtile.h; - /* Ceil round off d_w and d_h */ - d_w = (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_X; - d_h = (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_Y; - while(d_w * d_h > num_global_threads) { - /* Halve the longer dimension. */ - if(d_w >= d_h) { - d_w = d_w / 2; - d_w = (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_X; - } - else { - d_h = d_h / 2; - d_h = (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_Y; - } + OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device); + + kernel->program = OpenCLDeviceBase::OpenCLProgram(device, + "split_" + kernel_name, + "kernel_" + kernel_name + ".cl", + get_build_options(device, requested_features)); + kernel->program.add_kernel(ustring("path_trace_" + kernel_name)); + kernel->program.load(); + + if(!kernel->program.is_loaded()) { + delete kernel; + return NULL; } - split_tile_size.x = d_w; - split_tile_size.y = d_h; - return split_tile_size; + + return kernel; } - /* Splits existing tile into multiple tiles of tile size split_tile_size. */ - vector<SplitRenderTile> split_tiles(RenderTile rtile, int2 split_tile_size) + virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads) { - vector<SplitRenderTile> to_path_trace_rtile; - int d_w = rtile.w; - int d_h = rtile.h; - int num_tiles_x = (((d_w - 1) / split_tile_size.x) + 1); - int num_tiles_y = (((d_h - 1) / split_tile_size.y) + 1); - /* Buffer and rng_state offset calc. */ - size_t offset_index = rtile.offset + (rtile.x + rtile.y * rtile.stride); - size_t offset_x = offset_index % rtile.stride; - size_t offset_y = offset_index / rtile.stride; - /* Resize to_path_trace_rtile. 
*/ - to_path_trace_rtile.resize(num_tiles_x * num_tiles_y); - for(int tile_iter_y = 0; tile_iter_y < num_tiles_y; tile_iter_y++) { - for(int tile_iter_x = 0; tile_iter_x < num_tiles_x; tile_iter_x++) { - int rtile_index = tile_iter_y * num_tiles_x + tile_iter_x; - to_path_trace_rtile[rtile_index].rng_state_offset_x = offset_x + tile_iter_x * split_tile_size.x; - to_path_trace_rtile[rtile_index].rng_state_offset_y = offset_y + tile_iter_y * split_tile_size.y; - to_path_trace_rtile[rtile_index].buffer_offset_x = offset_x + tile_iter_x * split_tile_size.x; - to_path_trace_rtile[rtile_index].buffer_offset_y = offset_y + tile_iter_y * split_tile_size.y; - to_path_trace_rtile[rtile_index].start_sample = rtile.start_sample; - to_path_trace_rtile[rtile_index].num_samples = rtile.num_samples; - to_path_trace_rtile[rtile_index].sample = rtile.sample; - to_path_trace_rtile[rtile_index].resolution = rtile.resolution; - to_path_trace_rtile[rtile_index].offset = rtile.offset; - to_path_trace_rtile[rtile_index].buffers = rtile.buffers; - to_path_trace_rtile[rtile_index].buffer = rtile.buffer; - to_path_trace_rtile[rtile_index].rng_state = rtile.rng_state; - to_path_trace_rtile[rtile_index].x = rtile.x + (tile_iter_x * split_tile_size.x); - to_path_trace_rtile[rtile_index].y = rtile.y + (tile_iter_y * split_tile_size.y); - to_path_trace_rtile[rtile_index].buffer_rng_state_stride = rtile.stride; - /* Fill width and height of the new render tile. */ - to_path_trace_rtile[rtile_index].w = (tile_iter_x == (num_tiles_x - 1)) ? - (d_w - (tile_iter_x * split_tile_size.x)) /* Border tile */ - : split_tile_size.x; - to_path_trace_rtile[rtile_index].h = (tile_iter_y == (num_tiles_y - 1)) ? - (d_h - (tile_iter_y * split_tile_size.y)) /* Border tile */ - : split_tile_size.y; - to_path_trace_rtile[rtile_index].stride = to_path_trace_rtile[rtile_index].w; - } + device_vector<uint> size_buffer; + size_buffer.resize(1); + device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE); + + uint threads = num_threads; + device->kernel_set_args(device->program_state_buffer_size(), 0, kg, data, threads, size_buffer); + + size_t global_size = 64; + device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, + device->program_state_buffer_size(), + 1, + NULL, + &global_size, + NULL, + 0, + NULL, + NULL); + + device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); + + device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint)); + device->mem_free(size_buffer); + + if(device->ciErr != CL_SUCCESS) { + string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", + clewErrorString(device->ciErr)); + device->opencl_error(message); + return 0; } - return to_path_trace_rtile; + + return *size_buffer.get_data(); } - void thread_run(DeviceTask *task) + virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, + RenderTile& rtile, + int num_global_elements, + device_memory& kernel_globals, + device_memory& kernel_data, + device_memory& split_data, + device_memory& ray_state, + device_memory& queue_index, + device_memory& use_queues_flag, + device_memory& work_pool_wgs + ) { - if(task->type == DeviceTask::FILM_CONVERT) { - film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half); - } - else if(task->type == DeviceTask::SHADER) { - shader(*task); - } - else if(task->type == DeviceTask::PATH_TRACE) { - RenderTile tile; - bool initialize_data_and_check_render_feasibility = false; - bool need_to_split_tiles_further = false; - int2 max_render_feasible_tile_size; - size_t feasible_global_work_size; 
- const int2 tile_size = task->requested_tile_size; - /* Keep rendering tiles until done. */ - while(task->acquire_tile(this, tile)) { - if(!initialize_data_and_check_render_feasibility) { - /* Initialize data. */ - /* Calculate per_thread_output_buffer_size. */ - size_t output_buffer_size = 0; - ciErr = clGetMemObjectInfo((cl_mem)tile.buffer, - CL_MEM_SIZE, - sizeof(output_buffer_size), - &output_buffer_size, - NULL); - assert(ciErr == CL_SUCCESS && "Can't get tile.buffer mem object info"); - /* This value is different when running on AMD and NV. */ - if(background) { - /* In offline render the number of buffer elements - * associated with tile.buffer is the current tile size. - */ - per_thread_output_buffer_size = - output_buffer_size / (tile.w * tile.h); - } - else { - /* interactive rendering, unlike offline render, the number of buffer elements - * associated with tile.buffer is the entire viewport size. - */ - per_thread_output_buffer_size = - output_buffer_size / (tile.buffers->params.width * - tile.buffers->params.height); - } - /* Check render feasibility. */ - feasible_global_work_size = get_feasible_global_work_size( - tile_size, - CL_MEM_PTR(const_mem_map["__data"]->device_pointer)); - max_render_feasible_tile_size = - get_max_render_feasible_tile_size( - feasible_global_work_size); - need_to_split_tiles_further = - need_to_split_tile(tile_size.x, - tile_size.y, - max_render_feasible_tile_size); - initialize_data_and_check_render_feasibility = true; - } - if(need_to_split_tiles_further) { - int2 split_tile_size = - get_split_tile_size(tile, - max_render_feasible_tile_size); - vector<SplitRenderTile> to_path_trace_render_tiles = - split_tiles(tile, split_tile_size); - /* Print message to console */ - if(background && (to_path_trace_render_tiles.size() > 1)) { - fprintf(stderr, "Message : Tiles need to be split " - "further inside path trace (due to insufficient " - "device-global-memory for split kernel to " - "function) \n" - "The current tile of dimensions %dx%d is split " - "into tiles of dimension %dx%d for render \n", - tile.w, tile.h, - split_tile_size.x, - split_tile_size.y); - } - /* Process all split tiles. */ - for(int tile_iter = 0; - tile_iter < to_path_trace_render_tiles.size(); - ++tile_iter) - { - path_trace(task, - to_path_trace_render_tiles[tile_iter], - max_render_feasible_tile_size); - } - } - else { - /* No splitting required; process the entire tile at once. */ - /* Render feasible tile size is user-set-tile-size itself. */ - max_render_feasible_tile_size.x = - (((tile_size.x - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_X; - max_render_feasible_tile_size.y = - (((tile_size.y - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_Y; - /* buffer_rng_state_stride is stride itself. */ - SplitRenderTile split_tile(tile); - split_tile.buffer_rng_state_stride = tile.stride; - path_trace(task, split_tile, max_render_feasible_tile_size); - } - tile.sample = tile.start_sample + tile.num_samples; + cl_int dQueue_size = dim.global_size[0] * dim.global_size[1]; - /* Complete kernel execution before release tile. */ - /* This helps in multi-device render; - * The device that reaches the critical-section function - * release_tile waits (stalling other devices from entering - * release_tile) for all kernels to complete. If device1 (a - * slow-render device) reaches release_tile first then it would - * stall device2 (a fast-render device) from proceeding to render - * next tile. 
- */ - clFinish(cqCommandQueue); + /* Set the range of samples to be processed for every ray in + * path-regeneration logic. + */ + cl_int start_sample = rtile.start_sample; + cl_int end_sample = rtile.start_sample + rtile.num_samples; - task->release_tile(tile); - } + cl_uint start_arg_index = + device->kernel_set_args(device->program_data_init(), + 0, + kernel_globals, + kernel_data, + split_data, + num_global_elements, + ray_state, + rtile.rng_state); + +/* TODO(sergey): Avoid map lookup here. */ +#define KERNEL_TEX(type, ttype, name) \ + device->set_kernel_arg_mem(device->program_data_init(), &start_arg_index, #name); +#include "kernel_textures.h" +#undef KERNEL_TEX + + start_arg_index += + device->kernel_set_args(device->program_data_init(), + start_arg_index, + start_sample, + end_sample, + rtile.x, + rtile.y, + rtile.w, + rtile.h, + rtile.offset, + rtile.stride, + queue_index, + dQueue_size, + use_queues_flag, + work_pool_wgs, + rtile.num_samples, + rtile.buffer); + + /* Enqueue ckPathTraceKernel_data_init kernel. */ + device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, + device->program_data_init(), + 2, + NULL, + dim.global_size, + dim.local_size, + 0, + NULL, + NULL); + + device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); + + if(device->ciErr != CL_SUCCESS) { + string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", + clewErrorString(device->ciErr)); + device->opencl_error(message); + return false; } + + return true; } -protected: - cl_mem mem_alloc(size_t bufsize, cl_mem_flags mem_flag = CL_MEM_READ_WRITE) + virtual int2 split_kernel_local_size() { - cl_mem ptr; - assert(bufsize != 0); - ptr = clCreateBuffer(cxContext, mem_flag, bufsize, NULL, &ciErr); - opencl_assert_err(ciErr, "clCreateBuffer"); - return ptr; + return make_int2(64, 1); } - /* ** Those guys are for workign around some compiler-specific bugs ** */ - - string build_options_for_base_program( - const DeviceRequestedFeatures& requested_features) + virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask */*task*/) { - return requested_features.get_build_options(); + size_t max_buffer_size; + clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &max_buffer_size, NULL); + VLOG(1) << "Maximum device allocation side: " + << string_human_readable_number(max_buffer_size) << " bytes. 
(" + << string_human_readable_size(max_buffer_size) << ")."; + + size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size / 2); + int2 global_size = make_int2(round_down((int)sqrt(num_elements), 64), (int)sqrt(num_elements)); + VLOG(1) << "Global size: " << global_size << "."; + return global_size; } }; +OpenCLDeviceSplitKernel::OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_) +: OpenCLDeviceBase(info, stats, background_) +{ + split_kernel = new OpenCLSplitKernel(this); + + background = background_; +} + Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, bool background) { return new OpenCLDeviceSplitKernel(info, stats, background); diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp index c7760e075cb..d5c19bf5386 100644 --- a/intern/cycles/device/opencl/opencl_util.cpp +++ b/intern/cycles/device/opencl/opencl_util.cpp @@ -19,6 +19,7 @@ #include "opencl.h" #include "util_logging.h" +#include "util_md5.h" #include "util_path.h" #include "util_time.h" @@ -338,12 +339,13 @@ bool OpenCLDeviceBase::OpenCLProgram::build_kernel(const string *debug_src) bool OpenCLDeviceBase::OpenCLProgram::compile_kernel(const string *debug_src) { - string source = "#include \"kernels/opencl/" + kernel_file + "\" // " + OpenCLCache::get_kernel_md5() + "\n"; + string source = "#include \"kernels/opencl/" + kernel_file + "\"\n"; /* We compile kernels consisting of many files. unfortunately OpenCL * kernel caches do not seem to recognize changes in included files. * so we force recompile on changes by adding the md5 hash of all files. */ source = path_source_replace_includes(source, path_get("kernel")); + source += "\n// " + util_md5_string(source) + "\n"; if(debug_src) { path_write_text(*debug_src, source); @@ -440,7 +442,11 @@ void OpenCLDeviceBase::OpenCLProgram::load() if(!program) { add_log(string("OpenCL program ") + program_name + " not found in cache.", true); - string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + OpenCLCache::get_kernel_md5(); + /* need to create source to get md5 */ + string source = "#include \"kernels/opencl/" + kernel_file + "\"\n"; + source = path_source_replace_includes(source, path_get("kernel")); + + string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + util_md5_string(source); basename = path_cache_get(path_join("kernels", basename)); string clbin = basename + ".clbin"; diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 29e0f44841e..1c740b5c6eb 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -13,8 +13,11 @@ set(INC_SYS set(SRC kernels/cpu/kernel.cpp + kernels/cpu/kernel_split.cpp kernels/opencl/kernel.cl + kernels/opencl/kernel_state_buffer_size.cl kernels/opencl/kernel_data_init.cl + kernels/opencl/kernel_path_init.cl kernels/opencl/kernel_queue_enqueue.cl kernels/opencl/kernel_scene_intersect.cl kernels/opencl/kernel_lamp_emission.cl @@ -24,8 +27,8 @@ set(SRC kernels/opencl/kernel_direct_lighting.cl kernels/opencl/kernel_shadow_blocked.cl kernels/opencl/kernel_next_iteration_setup.cl - kernels/opencl/kernel_sum_all_radiance.cl kernels/cuda/kernel.cu + kernels/cuda/kernel_split.cu ) set(SRC_BVH_HEADERS @@ -88,6 +91,10 @@ set(SRC_KERNELS_CPU_HEADERS kernels/cpu/kernel_cpu_image.h ) +set(SRC_KERNELS_CUDA_HEADERS + kernels/cuda/kernel_config.h +) + set(SRC_CLOSURE_HEADERS closure/alloc.h closure/bsdf.h @@ -195,11 +202,14 @@ 
set(SRC_SPLIT_HEADERS split/kernel_holdout_emission_blurring_pathtermination_ao.h split/kernel_lamp_emission.h split/kernel_next_iteration_setup.h + split/kernel_path_init.h + split/kernel_queue_enqueue.h split/kernel_scene_intersect.h split/kernel_shader_eval.h split/kernel_shadow_blocked.h split/kernel_split_common.h - split/kernel_sum_all_radiance.h + split/kernel_split_data.h + split/kernel_split_data_types.h ) # CUDA module @@ -227,8 +237,9 @@ if(WITH_CYCLES_CUDA_BINARIES) endif() # build for each arch - set(cuda_sources kernels/cuda/kernel.cu + set(cuda_sources kernels/cuda/kernel.cu kernels/cuda/kernel_split.cu ${SRC_HEADERS} + ${SRC_KERNELS_CUDA_HEADERS} ${SRC_BVH_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} @@ -237,15 +248,22 @@ if(WITH_CYCLES_CUDA_BINARIES) ) set(cuda_cubins) - macro(CYCLES_CUDA_KERNEL_ADD arch experimental) - if(${experimental}) - set(cuda_extra_flags "-D__KERNEL_EXPERIMENTAL__") - set(cuda_cubin kernel_experimental_${arch}.cubin) + macro(CYCLES_CUDA_KERNEL_ADD arch split experimental) + if(${split}) + set(cuda_extra_flags "-D__SPLIT__") + set(cuda_cubin kernel_split) else() set(cuda_extra_flags "") - set(cuda_cubin kernel_${arch}.cubin) + set(cuda_cubin kernel) + endif() + + if(${experimental}) + set(cuda_extra_flags ${cuda_extra_flags} -D__KERNEL_EXPERIMENTAL__) + set(cuda_cubin ${cuda_cubin}_experimental) endif() + set(cuda_cubin ${cuda_cubin}_${arch}.cubin) + if(WITH_CYCLES_DEBUG) set(cuda_debug_flags "-D__KERNEL_DEBUG__") else() @@ -258,13 +276,19 @@ if(WITH_CYCLES_CUDA_BINARIES) set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}") set(cuda_math_flags "--use_fast_math") + if(split) + set(cuda_kernel_src "/kernels/cuda/kernel_split.cu") + else() + set(cuda_kernel_src "/kernels/cuda/kernel.cu") + endif() + add_custom_command( OUTPUT ${cuda_cubin} COMMAND ${cuda_nvcc_command} -arch=${arch} ${CUDA_NVCC_FLAGS} -m${CUDA_BITS} - --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda/kernel.cu + --cubin ${CMAKE_CURRENT_SOURCE_DIR}${cuda_kernel_src} -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} --ptxas-options="-v" ${cuda_arch_flags} @@ -291,7 +315,12 @@ if(WITH_CYCLES_CUDA_BINARIES) foreach(arch ${CYCLES_CUDA_BINARIES_ARCH}) # Compile regular kernel - CYCLES_CUDA_KERNEL_ADD(${arch} FALSE) + CYCLES_CUDA_KERNEL_ADD(${arch} FALSE FALSE) + + if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES) + # Compile split kernel + CYCLES_CUDA_KERNEL_ADD(${arch} TRUE FALSE) + endif() endforeach() add_custom_target(cycles_kernel_cuda ALL DEPENDS ${cuda_cubins}) @@ -314,31 +343,42 @@ if(CXX_HAS_SSE) kernels/cpu/kernel_sse2.cpp kernels/cpu/kernel_sse3.cpp kernels/cpu/kernel_sse41.cpp + kernels/cpu/kernel_split_sse2.cpp + kernels/cpu/kernel_split_sse3.cpp + kernels/cpu/kernel_split_sse41.cpp ) set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") endif() if(CXX_HAS_AVX) list(APPEND SRC kernels/cpu/kernel_avx.cpp + 
kernels/cpu/kernel_split_avx.cpp ) set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") endif() if(CXX_HAS_AVX2) list(APPEND SRC kernels/cpu/kernel_avx2.cpp + kernels/cpu/kernel_split_avx2.cpp ) set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") endif() add_library(cycles_kernel ${SRC} ${SRC_HEADERS} ${SRC_KERNELS_CPU_HEADERS} + ${SRC_KERNELS_CUDA_HEADERS} ${SRC_BVH_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_SVM_HEADERS} @@ -361,7 +401,9 @@ endif() #delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${KERNEL_PREPROCESSED}" ${CYCLES_INSTALL_PATH}/kernel) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_state_buffer_size.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_data_init.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_path_init.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_queue_enqueue.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_scene_intersect.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_lamp_emission.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) @@ -371,9 +413,10 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_holdout_emiss delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_sum_all_radiance.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/bvh) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/closure) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/svm) diff --git a/intern/cycles/kernel/closure/alloc.h b/intern/cycles/kernel/closure/alloc.h index b7abc1ec507..4894ea58dba 100644 --- a/intern/cycles/kernel/closure/alloc.h +++ b/intern/cycles/kernel/closure/alloc.h @@ -20,17 +20,17 @@ ccl_device ShaderClosure *closure_alloc(ShaderData *sd, int size, ClosureType ty { kernel_assert(size <= 
sizeof(ShaderClosure)); - int num_closure = ccl_fetch(sd, num_closure); - int num_closure_extra = ccl_fetch(sd, num_closure_extra); + int num_closure = sd->num_closure; + int num_closure_extra = sd->num_closure_extra; if(num_closure + num_closure_extra >= MAX_CLOSURE) return NULL; - ShaderClosure *sc = &ccl_fetch(sd, closure)[num_closure]; + ShaderClosure *sc = &sd->closure[num_closure]; sc->type = type; sc->weight = weight; - ccl_fetch(sd, num_closure)++; + sd->num_closure++; return sc; } @@ -44,18 +44,18 @@ ccl_device ccl_addr_space void *closure_alloc_extra(ShaderData *sd, int size) * This lets us keep the same fast array iteration over closures, as we * found linked list iteration and iteration with skipping to be slower. */ int num_extra = ((size + sizeof(ShaderClosure) - 1) / sizeof(ShaderClosure)); - int num_closure = ccl_fetch(sd, num_closure); - int num_closure_extra = ccl_fetch(sd, num_closure_extra) + num_extra; + int num_closure = sd->num_closure; + int num_closure_extra = sd->num_closure_extra + num_extra; if(num_closure + num_closure_extra > MAX_CLOSURE) { /* Remove previous closure. */ - ccl_fetch(sd, num_closure)--; - ccl_fetch(sd, num_closure_extra)++; + sd->num_closure--; + sd->num_closure_extra++; return NULL; } - ccl_fetch(sd, num_closure_extra) = num_closure_extra; - return (ccl_addr_space void*)(ccl_fetch(sd, closure) + MAX_CLOSURE - num_closure_extra); + sd->num_closure_extra = num_closure_extra; + return (ccl_addr_space void*)(sd->closure + MAX_CLOSURE - num_closure_extra); } ccl_device_inline ShaderClosure *bsdf_alloc(ShaderData *sd, int size, float3 weight) diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h index 7e4d5fe2e37..a44b9e2d9b9 100644 --- a/intern/cycles/kernel/closure/bsdf.h +++ b/intern/cycles/kernel/closure/bsdf.h @@ -51,89 +51,89 @@ ccl_device_forceinline int bsdf_sample(KernelGlobals *kg, switch(sc->type) { case CLOSURE_BSDF_DIFFUSE_ID: case CLOSURE_BSDF_BSSRDF_ID: - label = bsdf_diffuse_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_diffuse_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; #ifdef __SVM__ case CLOSURE_BSDF_OREN_NAYAR_ID: - label = bsdf_oren_nayar_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_oren_nayar_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; #ifdef __OSL__ case CLOSURE_BSDF_PHONG_RAMP_ID: - label = bsdf_phong_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_phong_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_DIFFUSE_RAMP_ID: - label = bsdf_diffuse_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_diffuse_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; #endif case CLOSURE_BSDF_TRANSLUCENT_ID: - label = bsdf_translucent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_translucent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; 
case CLOSURE_BSDF_REFLECTION_ID: - label = bsdf_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_REFRACTION_ID: - label = bsdf_refraction_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_refraction_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_TRANSPARENT_ID: - label = bsdf_transparent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_transparent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: - label = bsdf_microfacet_ggx_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_microfacet_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID: - label = bsdf_microfacet_multi_ggx_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, - eval, omega_in, &domega_in->dx, &domega_in->dy, pdf, &ccl_fetch(sd, lcg_state)); + label = bsdf_microfacet_multi_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + eval, omega_in, &domega_in->dx, &domega_in->dy, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID: - label = bsdf_microfacet_multi_ggx_glass_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, - eval, omega_in, &domega_in->dx, &domega_in->dy, pdf, &ccl_fetch(sd, lcg_state)); + label = bsdf_microfacet_multi_ggx_glass_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + eval, omega_in, &domega_in->dx, &domega_in->dy, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: - label = bsdf_microfacet_beckmann_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_microfacet_beckmann_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID: - label = bsdf_ashikhmin_shirley_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_ashikhmin_shirley_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: - label = bsdf_ashikhmin_velvet_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_ashikhmin_velvet_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_DIFFUSE_TOON_ID: - label = bsdf_diffuse_toon_sample(sc, ccl_fetch(sd, Ng), 
ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_diffuse_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_GLOSSY_TOON_ID: - label = bsdf_glossy_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_glossy_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_HAIR_REFLECTION_ID: - label = bsdf_hair_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_hair_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: - label = bsdf_hair_transmission_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_hair_transmission_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; #endif #ifdef __VOLUME__ case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: - label = volume_henyey_greenstein_sample(sc, ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); + label = volume_henyey_greenstein_sample(sc, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; #endif default: @@ -157,75 +157,75 @@ float3 bsdf_eval(KernelGlobals *kg, { float3 eval; - if(dot(ccl_fetch(sd, Ng), omega_in) >= 0.0f) { + if(dot(sd->Ng, omega_in) >= 0.0f) { switch(sc->type) { case CLOSURE_BSDF_DIFFUSE_ID: case CLOSURE_BSDF_BSSRDF_ID: - eval = bsdf_diffuse_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_diffuse_eval_reflect(sc, sd->I, omega_in, pdf); break; #ifdef __SVM__ case CLOSURE_BSDF_OREN_NAYAR_ID: - eval = bsdf_oren_nayar_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_oren_nayar_eval_reflect(sc, sd->I, omega_in, pdf); break; #ifdef __OSL__ case CLOSURE_BSDF_PHONG_RAMP_ID: - eval = bsdf_phong_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_phong_ramp_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_DIFFUSE_RAMP_ID: - eval = bsdf_diffuse_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_diffuse_ramp_eval_reflect(sc, sd->I, omega_in, pdf); break; #endif case CLOSURE_BSDF_TRANSLUCENT_ID: - eval = bsdf_translucent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_translucent_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_REFLECTION_ID: - eval = bsdf_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_reflection_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_REFRACTION_ID: - eval = bsdf_refraction_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_refraction_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_TRANSPARENT_ID: - eval = bsdf_transparent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_transparent_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: - eval = bsdf_microfacet_ggx_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = 
bsdf_microfacet_ggx_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID: - eval = bsdf_microfacet_multi_ggx_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state)); + eval = bsdf_microfacet_multi_ggx_eval_reflect(sc, sd->I, omega_in, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID: - eval = bsdf_microfacet_multi_ggx_glass_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state)); + eval = bsdf_microfacet_multi_ggx_glass_eval_reflect(sc, sd->I, omega_in, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: - eval = bsdf_microfacet_beckmann_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_microfacet_beckmann_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID: - eval = bsdf_ashikhmin_shirley_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_ashikhmin_shirley_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: - eval = bsdf_ashikhmin_velvet_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_ashikhmin_velvet_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_DIFFUSE_TOON_ID: - eval = bsdf_diffuse_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_diffuse_toon_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_GLOSSY_TOON_ID: - eval = bsdf_glossy_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_glossy_toon_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_HAIR_REFLECTION_ID: - eval = bsdf_hair_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_hair_reflection_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: - eval = bsdf_hair_transmission_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_hair_transmission_eval_reflect(sc, sd->I, omega_in, pdf); break; #endif #ifdef __VOLUME__ case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: - eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf); break; #endif default: @@ -237,63 +237,63 @@ float3 bsdf_eval(KernelGlobals *kg, switch(sc->type) { case CLOSURE_BSDF_DIFFUSE_ID: case CLOSURE_BSDF_BSSRDF_ID: - eval = bsdf_diffuse_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_diffuse_eval_transmit(sc, sd->I, omega_in, pdf); break; #ifdef __SVM__ case CLOSURE_BSDF_OREN_NAYAR_ID: - eval = bsdf_oren_nayar_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_oren_nayar_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_TRANSLUCENT_ID: - eval = bsdf_translucent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_translucent_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_REFLECTION_ID: - eval = bsdf_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_reflection_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_REFRACTION_ID: - eval = bsdf_refraction_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_refraction_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_TRANSPARENT_ID: - eval = bsdf_transparent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_transparent_eval_transmit(sc, 
sd->I, omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: - eval = bsdf_microfacet_ggx_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_microfacet_ggx_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID: - eval = bsdf_microfacet_multi_ggx_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state)); + eval = bsdf_microfacet_multi_ggx_eval_transmit(sc, sd->I, omega_in, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID: - eval = bsdf_microfacet_multi_ggx_glass_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state)); + eval = bsdf_microfacet_multi_ggx_glass_eval_transmit(sc, sd->I, omega_in, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: - eval = bsdf_microfacet_beckmann_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_microfacet_beckmann_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID: - eval = bsdf_ashikhmin_shirley_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_ashikhmin_shirley_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: - eval = bsdf_ashikhmin_velvet_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_ashikhmin_velvet_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_DIFFUSE_TOON_ID: - eval = bsdf_diffuse_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_diffuse_toon_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_GLOSSY_TOON_ID: - eval = bsdf_glossy_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_glossy_toon_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_HAIR_REFLECTION_ID: - eval = bsdf_hair_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_hair_reflection_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: - eval = bsdf_hair_transmission_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_hair_transmission_eval_transmit(sc, sd->I, omega_in, pdf); break; #endif #ifdef __VOLUME__ case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: - eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf); break; #endif default: diff --git a/intern/cycles/kernel/geom/geom_attribute.h b/intern/cycles/kernel/geom/geom_attribute.h index 08ccee56335..cc62192ef21 100644 --- a/intern/cycles/kernel/geom/geom_attribute.h +++ b/intern/cycles/kernel/geom/geom_attribute.h @@ -30,7 +30,7 @@ ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const ShaderData * ccl_device_inline uint attribute_primitive_type(KernelGlobals *kg, const ShaderData *sd) { #ifdef __HAIR__ - if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { + if(sd->type & PRIMITIVE_ALL_CURVE) { return ATTR_PRIM_CURVE; } else @@ -53,12 +53,12 @@ ccl_device_inline AttributeDescriptor attribute_not_found() ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id) { - if(ccl_fetch(sd, object) == PRIM_NONE) { + if(sd->object == PRIM_NONE) { return attribute_not_found(); } /* for SVM, find attribute by unique id */ - uint attr_offset = 
ccl_fetch(sd, object)*kernel_data.bvh.attributes_map_stride; + uint attr_offset = sd->object*kernel_data.bvh.attributes_map_stride; attr_offset += attribute_primitive_type(kg, sd); uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset); @@ -73,7 +73,7 @@ ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals *kg, const Sh AttributeDescriptor desc; desc.element = (AttributeElement)attr_map.y; - if(ccl_fetch(sd, prim) == PRIM_NONE && + if(sd->prim == PRIM_NONE && desc.element != ATTR_ELEMENT_MESH && desc.element != ATTR_ELEMENT_VOXEL && desc.element != ATTR_ELEMENT_OBJECT) diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h index 712b67a1b55..7cc840ce78d 100644 --- a/intern/cycles/kernel/geom/geom_curve.h +++ b/intern/cycles/kernel/geom/geom_curve.h @@ -32,22 +32,22 @@ ccl_device float curve_attribute_float(KernelGlobals *kg, const ShaderData *sd, if(dy) *dy = 0.0f; #endif - return kernel_tex_fetch(__attributes_float, desc.offset + ccl_fetch(sd, prim)); + return kernel_tex_fetch(__attributes_float, desc.offset + sd->prim); } else if(desc.element == ATTR_ELEMENT_CURVE_KEY || desc.element == ATTR_ELEMENT_CURVE_KEY_MOTION) { - float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim)); - int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type)); + float4 curvedata = kernel_tex_fetch(__curves, sd->prim); + int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); int k1 = k0 + 1; float f0 = kernel_tex_fetch(__attributes_float, desc.offset + k0); float f1 = kernel_tex_fetch(__attributes_float, desc.offset + k1); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*(f1 - f0); + if(dx) *dx = sd->du.dx*(f1 - f0); if(dy) *dy = 0.0f; #endif - return (1.0f - ccl_fetch(sd, u))*f0 + ccl_fetch(sd, u)*f1; + return (1.0f - sd->u)*f0 + sd->u*f1; } else { #ifdef __RAY_DIFFERENTIALS__ @@ -71,22 +71,22 @@ ccl_device float3 curve_attribute_float3(KernelGlobals *kg, const ShaderData *sd if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f); #endif - return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + ccl_fetch(sd, prim))); + return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + sd->prim)); } else if(desc.element == ATTR_ELEMENT_CURVE_KEY || desc.element == ATTR_ELEMENT_CURVE_KEY_MOTION) { - float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim)); - int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type)); + float4 curvedata = kernel_tex_fetch(__curves, sd->prim); + int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); int k1 = k0 + 1; float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + k0)); float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + k1)); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*(f1 - f0); + if(dx) *dx = sd->du.dx*(f1 - f0); if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f); #endif - return (1.0f - ccl_fetch(sd, u))*f0 + ccl_fetch(sd, u)*f1; + return (1.0f - sd->u)*f0 + sd->u*f1; } else { #ifdef __RAY_DIFFERENTIALS__ @@ -104,22 +104,22 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd) { float r = 0.0f; - if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { - float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim)); - int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type)); + if(sd->type & PRIMITIVE_ALL_CURVE) { + float4 curvedata = kernel_tex_fetch(__curves, 
sd->prim); + int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); int k1 = k0 + 1; float4 P_curve[2]; - if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) { + if(sd->type & PRIMITIVE_CURVE) { P_curve[0]= kernel_tex_fetch(__curve_keys, k0); P_curve[1]= kernel_tex_fetch(__curve_keys, k1); } else { - motion_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), k0, k1, P_curve); + motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve); } - r = (P_curve[1].w - P_curve[0].w) * ccl_fetch(sd, u) + P_curve[0].w; + r = (P_curve[1].w - P_curve[0].w) * sd->u + P_curve[0].w; } return r*2.0f; @@ -130,8 +130,8 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd) ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd) { - float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim)); - int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type)); + float4 curvedata = kernel_tex_fetch(__curves, sd->prim); + int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); int k1 = k0 + 1; float4 P_curve[2]; @@ -139,7 +139,7 @@ ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd P_curve[0]= kernel_tex_fetch(__curve_keys, k0); P_curve[1]= kernel_tex_fetch(__curve_keys, k1); - return float4_to_float3(P_curve[1]) * ccl_fetch(sd, u) + float4_to_float3(P_curve[0]) * (1.0f - ccl_fetch(sd, u)); + return float4_to_float3(P_curve[1]) * sd->u + float4_to_float3(P_curve[0]) * (1.0f - sd->u); } /* Curve tangent normal */ @@ -148,14 +148,14 @@ ccl_device float3 curve_tangent_normal(KernelGlobals *kg, ShaderData *sd) { float3 tgN = make_float3(0.0f,0.0f,0.0f); - if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { + if(sd->type & PRIMITIVE_ALL_CURVE) { - tgN = -(-ccl_fetch(sd, I) - ccl_fetch(sd, dPdu) * (dot(ccl_fetch(sd, dPdu),-ccl_fetch(sd, I)) / len_squared(ccl_fetch(sd, dPdu)))); + tgN = -(-sd->I - sd->dPdu * (dot(sd->dPdu,-sd->I) / len_squared(sd->dPdu))); tgN = normalize(tgN); /* need to find suitable scaled gd for corrected normal */ #if 0 - tgN = normalize(tgN - gd * ccl_fetch(sd, dPdu)); + tgN = normalize(tgN - gd * sd->dPdu); #endif } @@ -966,7 +966,7 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con if(isect->object != OBJECT_NONE) { #ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_itfm); + Transform tfm = sd->ob_itfm; #else Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); #endif @@ -979,7 +979,7 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con int prim = kernel_tex_fetch(__prim_index, isect->prim); float4 v00 = kernel_tex_fetch(__curves, prim); - int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type)); + int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); int k1 = k0 + 1; float3 tg; @@ -990,14 +990,14 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con float4 P_curve[4]; - if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) { + if(sd->type & PRIMITIVE_CURVE) { P_curve[0] = kernel_tex_fetch(__curve_keys, ka); P_curve[1] = kernel_tex_fetch(__curve_keys, k0); P_curve[2] = kernel_tex_fetch(__curve_keys, k1); P_curve[3] = kernel_tex_fetch(__curve_keys, kb); } else { - motion_cardinal_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), ka, k0, k1, kb, P_curve); + motion_cardinal_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve); } 
float3 p[4]; @@ -1009,43 +1009,43 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con P = P + D*t; #ifdef __UV__ - ccl_fetch(sd, u) = isect->u; - ccl_fetch(sd, v) = 0.0f; + sd->u = isect->u; + sd->v = 0.0f; #endif tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3])); if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) { - ccl_fetch(sd, Ng) = normalize(-(D - tg * (dot(tg, D)))); + sd->Ng = normalize(-(D - tg * (dot(tg, D)))); } else { /* direction from inside to surface of curve */ float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]); - ccl_fetch(sd, Ng) = normalize(P - p_curr); + sd->Ng = normalize(P - p_curr); /* adjustment for changing radius */ float gd = isect->v; if(gd != 0.0f) { - ccl_fetch(sd, Ng) = ccl_fetch(sd, Ng) - gd * tg; - ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng)); + sd->Ng = sd->Ng - gd * tg; + sd->Ng = normalize(sd->Ng); } } /* todo: sometimes the normal is still so that this is detected as * backfacing even if cull backfaces is enabled */ - ccl_fetch(sd, N) = ccl_fetch(sd, Ng); + sd->N = sd->Ng; } else { float4 P_curve[2]; - if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) { + if(sd->type & PRIMITIVE_CURVE) { P_curve[0]= kernel_tex_fetch(__curve_keys, k0); P_curve[1]= kernel_tex_fetch(__curve_keys, k1); } else { - motion_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), k0, k1, P_curve); + motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve); } float l = 1.0f; @@ -1056,39 +1056,39 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con float3 dif = P - float4_to_float3(P_curve[0]); #ifdef __UV__ - ccl_fetch(sd, u) = dot(dif,tg)/l; - ccl_fetch(sd, v) = 0.0f; + sd->u = dot(dif,tg)/l; + sd->v = 0.0f; #endif if(flag & CURVE_KN_TRUETANGENTGNORMAL) { - ccl_fetch(sd, Ng) = -(D - tg * dot(tg, D)); - ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng)); + sd->Ng = -(D - tg * dot(tg, D)); + sd->Ng = normalize(sd->Ng); } else { float gd = isect->v; /* direction from inside to surface of curve */ - ccl_fetch(sd, Ng) = (dif - tg * ccl_fetch(sd, u) * l) / (P_curve[0].w + ccl_fetch(sd, u) * l * gd); + sd->Ng = (dif - tg * sd->u * l) / (P_curve[0].w + sd->u * l * gd); /* adjustment for changing radius */ if(gd != 0.0f) { - ccl_fetch(sd, Ng) = ccl_fetch(sd, Ng) - gd * tg; - ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng)); + sd->Ng = sd->Ng - gd * tg; + sd->Ng = normalize(sd->Ng); } } - ccl_fetch(sd, N) = ccl_fetch(sd, Ng); + sd->N = sd->Ng; } #ifdef __DPDU__ /* dPdu/dPdv */ - ccl_fetch(sd, dPdu) = tg; - ccl_fetch(sd, dPdv) = cross(tg, ccl_fetch(sd, Ng)); + sd->dPdu = tg; + sd->dPdv = cross(tg, sd->Ng); #endif if(isect->object != OBJECT_NONE) { #ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_tfm); + Transform tfm = sd->ob_tfm; #else Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); #endif diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h index d57d74ea882..2500228281e 100644 --- a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h +++ b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h @@ -48,7 +48,7 @@ ccl_device_inline float3 motion_triangle_refine(KernelGlobals *kg, return P; } # ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_itfm); + Transform tfm = sd->ob_itfm; # else Transform tfm = object_fetch_transform(kg, isect->object, @@ -77,7 +77,7 @@ ccl_device_inline float3 motion_triangle_refine(KernelGlobals *kg, 
if(isect->object != OBJECT_NONE) { # ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_tfm); + Transform tfm = sd->ob_tfm; # else Transform tfm = object_fetch_transform(kg, isect->object, @@ -116,7 +116,7 @@ float3 motion_triangle_refine_subsurface(KernelGlobals *kg, # ifdef __INTERSECTION_REFINE__ if(isect->object != OBJECT_NONE) { # ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_itfm); + Transform tfm = sd->ob_itfm; # else Transform tfm = object_fetch_transform(kg, isect->object, @@ -144,7 +144,7 @@ float3 motion_triangle_refine_subsurface(KernelGlobals *kg, if(isect->object != OBJECT_NONE) { # ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_tfm); + Transform tfm = sd->ob_tfm; # else Transform tfm = object_fetch_transform(kg, isect->object, diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_shader.h b/intern/cycles/kernel/geom/geom_motion_triangle_shader.h index 0e024a05db6..cb456056e20 100644 --- a/intern/cycles/kernel/geom/geom_motion_triangle_shader.h +++ b/intern/cycles/kernel/geom/geom_motion_triangle_shader.h @@ -39,26 +39,26 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, bool subsurface) { /* Get shader. */ - ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim)); + sd->shader = kernel_tex_fetch(__tri_shader, sd->prim); /* Get motion info. */ /* TODO(sergey): This logic is really similar to motion_triangle_vertices(), * can we de-duplicate something here? */ int numsteps, numverts; - object_motion_info(kg, ccl_fetch(sd, object), &numsteps, &numverts, NULL); + object_motion_info(kg, sd->object, &numsteps, &numverts, NULL); /* Figure out which steps we need to fetch and their interpolation factor. */ int maxstep = numsteps*2; - int step = min((int)(ccl_fetch(sd, time)*maxstep), maxstep-1); - float t = ccl_fetch(sd, time)*maxstep - step; + int step = min((int)(sd->time*maxstep), maxstep-1); + float t = sd->time*maxstep - step; /* Find attribute. */ AttributeElement elem; - int offset = find_attribute_motion(kg, ccl_fetch(sd, object), + int offset = find_attribute_motion(kg, sd->object, ATTR_STD_MOTION_VERTEX_POSITION, &elem); kernel_assert(offset != ATTR_STD_NOT_FOUND); /* Fetch vertex coordinates. */ float3 verts[3], next_verts[3]; - uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); + uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts); motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_verts); /* Interpolate between steps. */ @@ -68,7 +68,7 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, /* Compute refined position. */ #ifdef __SUBSURFACE__ if(subsurface) { - ccl_fetch(sd, P) = motion_triangle_refine_subsurface(kg, + sd->P = motion_triangle_refine_subsurface(kg, sd, isect, ray, @@ -77,29 +77,29 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, else #endif /* __SUBSURFACE__*/ { - ccl_fetch(sd, P) = motion_triangle_refine(kg, sd, isect, ray, verts); + sd->P = motion_triangle_refine(kg, sd, isect, ray, verts); } /* Compute face normal. 
*/ float3 Ng; - if(ccl_fetch(sd, object_flag) & SD_OBJECT_NEGATIVE_SCALE_APPLIED) { + if(sd->object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) { Ng = normalize(cross(verts[2] - verts[0], verts[1] - verts[0])); } else { Ng = normalize(cross(verts[1] - verts[0], verts[2] - verts[0])); } - ccl_fetch(sd, Ng) = Ng; - ccl_fetch(sd, N) = Ng; + sd->Ng = Ng; + sd->N = Ng; /* Compute derivatives of P w.r.t. uv. */ #ifdef __DPDU__ - ccl_fetch(sd, dPdu) = (verts[0] - verts[2]); - ccl_fetch(sd, dPdv) = (verts[1] - verts[2]); + sd->dPdu = (verts[0] - verts[2]); + sd->dPdv = (verts[1] - verts[2]); #endif /* Compute smooth normal. */ - if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) { + if(sd->shader & SHADER_SMOOTH_NORMAL) { /* Find attribute. */ AttributeElement elem; int offset = find_attribute_motion(kg, - ccl_fetch(sd, object), + sd->object, ATTR_STD_MOTION_VERTEX_NORMAL, &elem); kernel_assert(offset != ATTR_STD_NOT_FOUND); @@ -112,10 +112,10 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, normals[1] = (1.0f - t)*normals[1] + t*next_normals[1]; normals[2] = (1.0f - t)*normals[2] + t*next_normals[2]; /* Interpolate between vertices. */ - float u = ccl_fetch(sd, u); - float v = ccl_fetch(sd, v); + float u = sd->u; + float v = sd->v; float w = 1.0f - u - v; - ccl_fetch(sd, N) = (u*normals[0] + v*normals[1] + w*normals[2]); + sd->N = (u*normals[0] + v*normals[1] + w*normals[2]); } } diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h index f51b2d18657..5a04be8b0bf 100644 --- a/intern/cycles/kernel/geom/geom_object.h +++ b/intern/cycles/kernel/geom/geom_object.h @@ -137,9 +137,9 @@ ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg ccl_device_inline void object_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P) { #ifdef __OBJECT_MOTION__ - *P = transform_point_auto(&ccl_fetch(sd, ob_tfm), *P); + *P = transform_point_auto(&sd->ob_tfm, *P); #else - Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM); + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); *P = transform_point(&tfm, *P); #endif } @@ -149,9 +149,9 @@ ccl_device_inline void object_position_transform(KernelGlobals *kg, const Shader ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P) { #ifdef __OBJECT_MOTION__ - *P = transform_point_auto(&ccl_fetch(sd, ob_itfm), *P); + *P = transform_point_auto(&sd->ob_itfm, *P); #else - Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM); + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); *P = transform_point(&tfm, *P); #endif } @@ -161,12 +161,12 @@ ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, cons ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N) { #ifdef __OBJECT_MOTION__ - if((ccl_fetch(sd, object) != OBJECT_NONE) || (ccl_fetch(sd, type) == PRIMITIVE_LAMP)) { - *N = normalize(transform_direction_transposed_auto(&ccl_fetch(sd, ob_tfm), *N)); + if((sd->object != OBJECT_NONE) || (sd->type == PRIMITIVE_LAMP)) { + *N = normalize(transform_direction_transposed_auto(&sd->ob_tfm, *N)); } #else - if(ccl_fetch(sd, object) != OBJECT_NONE) { - Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM); + if(sd->object != OBJECT_NONE) { + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); 
*N = normalize(transform_direction_transposed(&tfm, *N)); } #endif @@ -177,9 +177,9 @@ ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N) { #ifdef __OBJECT_MOTION__ - *N = normalize(transform_direction_transposed_auto(&ccl_fetch(sd, ob_itfm), *N)); + *N = normalize(transform_direction_transposed_auto(&sd->ob_itfm, *N)); #else - Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM); + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); *N = normalize(transform_direction_transposed(&tfm, *N)); #endif } @@ -189,9 +189,9 @@ ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderDa ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D) { #ifdef __OBJECT_MOTION__ - *D = transform_direction_auto(&ccl_fetch(sd, ob_tfm), *D); + *D = transform_direction_auto(&sd->ob_tfm, *D); #else - Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM); + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); *D = transform_direction(&tfm, *D); #endif } @@ -201,9 +201,9 @@ ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D) { #ifdef __OBJECT_MOTION__ - *D = transform_direction_auto(&ccl_fetch(sd, ob_itfm), *D); + *D = transform_direction_auto(&sd->ob_itfm, *D); #else - Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM); + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); *D = transform_direction(&tfm, *D); #endif } @@ -212,13 +212,13 @@ ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const Sha ccl_device_inline float3 object_location(KernelGlobals *kg, const ShaderData *sd) { - if(ccl_fetch(sd, object) == OBJECT_NONE) + if(sd->object == OBJECT_NONE) return make_float3(0.0f, 0.0f, 0.0f); #ifdef __OBJECT_MOTION__ - return make_float3(ccl_fetch(sd, ob_tfm).x.w, ccl_fetch(sd, ob_tfm).y.w, ccl_fetch(sd, ob_tfm).z.w); + return make_float3(sd->ob_tfm.x.w, sd->ob_tfm.y.w, sd->ob_tfm.z.w); #else - Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM); + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); return make_float3(tfm.x.w, tfm.y.w, tfm.z.w); #endif } @@ -326,7 +326,7 @@ ccl_device_inline uint object_patch_map_offset(KernelGlobals *kg, int object) ccl_device int shader_pass_id(KernelGlobals *kg, const ShaderData *sd) { - return kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE + 1); + return kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE + 1); } /* Particle data from which object was instanced */ diff --git a/intern/cycles/kernel/geom/geom_patch.h b/intern/cycles/kernel/geom/geom_patch.h index 6a0ff5a4a04..5663b598508 100644 --- a/intern/cycles/kernel/geom/geom_patch.h +++ b/intern/cycles/kernel/geom/geom_patch.h @@ -267,7 +267,7 @@ ccl_device float patch_eval_float(KernelGlobals *kg, const ShaderData *sd, int o float weights_du[PATCH_MAX_CONTROL_VERTS]; float weights_dv[PATCH_MAX_CONTROL_VERTS]; - int num_control = patch_eval_control_verts(kg, ccl_fetch(sd, object), patch, u, v, channel, + int num_control = patch_eval_control_verts(kg, sd->object, patch, u, v, channel, indices, 
weights, weights_du, weights_dv); float val = 0.0f; @@ -294,7 +294,7 @@ ccl_device float3 patch_eval_float3(KernelGlobals *kg, const ShaderData *sd, int float weights_du[PATCH_MAX_CONTROL_VERTS]; float weights_dv[PATCH_MAX_CONTROL_VERTS]; - int num_control = patch_eval_control_verts(kg, ccl_fetch(sd, object), patch, u, v, channel, + int num_control = patch_eval_control_verts(kg, sd->object, patch, u, v, channel, indices, weights, weights_du, weights_dv); float3 val = make_float3(0.0f, 0.0f, 0.0f); @@ -321,7 +321,7 @@ ccl_device float3 patch_eval_uchar4(KernelGlobals *kg, const ShaderData *sd, int float weights_du[PATCH_MAX_CONTROL_VERTS]; float weights_dv[PATCH_MAX_CONTROL_VERTS]; - int num_control = patch_eval_control_verts(kg, ccl_fetch(sd, object), patch, u, v, channel, + int num_control = patch_eval_control_verts(kg, sd->object, patch, u, v, channel, indices, weights, weights_du, weights_dv); float3 val = make_float3(0.0f, 0.0f, 0.0f); diff --git a/intern/cycles/kernel/geom/geom_primitive.h b/intern/cycles/kernel/geom/geom_primitive.h index 8a73bb2f78b..90a9c2147cc 100644 --- a/intern/cycles/kernel/geom/geom_primitive.h +++ b/intern/cycles/kernel/geom/geom_primitive.h @@ -28,19 +28,19 @@ ccl_device_inline float primitive_attribute_float(KernelGlobals *kg, const AttributeDescriptor desc, float *dx, float *dy) { - if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) { + if(sd->type & PRIMITIVE_ALL_TRIANGLE) { if(subd_triangle_patch(kg, sd) == ~0) return triangle_attribute_float(kg, sd, desc, dx, dy); else return subd_triangle_attribute_float(kg, sd, desc, dx, dy); } #ifdef __HAIR__ - else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { + else if(sd->type & PRIMITIVE_ALL_CURVE) { return curve_attribute_float(kg, sd, desc, dx, dy); } #endif #ifdef __VOLUME__ - else if(ccl_fetch(sd, object) != OBJECT_NONE && desc.element == ATTR_ELEMENT_VOXEL) { + else if(sd->object != OBJECT_NONE && desc.element == ATTR_ELEMENT_VOXEL) { return volume_attribute_float(kg, sd, desc, dx, dy); } #endif @@ -56,19 +56,19 @@ ccl_device_inline float3 primitive_attribute_float3(KernelGlobals *kg, const AttributeDescriptor desc, float3 *dx, float3 *dy) { - if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) { + if(sd->type & PRIMITIVE_ALL_TRIANGLE) { if(subd_triangle_patch(kg, sd) == ~0) return triangle_attribute_float3(kg, sd, desc, dx, dy); else return subd_triangle_attribute_float3(kg, sd, desc, dx, dy); } #ifdef __HAIR__ - else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { + else if(sd->type & PRIMITIVE_ALL_CURVE) { return curve_attribute_float3(kg, sd, desc, dx, dy); } #endif #ifdef __VOLUME__ - else if(ccl_fetch(sd, object) != OBJECT_NONE && desc.element == ATTR_ELEMENT_VOXEL) { + else if(sd->object != OBJECT_NONE && desc.element == ATTR_ELEMENT_VOXEL) { return volume_attribute_float3(kg, sd, desc, dx, dy); } #endif @@ -118,9 +118,9 @@ ccl_device bool primitive_ptex(KernelGlobals *kg, ShaderData *sd, float2 *uv, in ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd) { #ifdef __HAIR__ - if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) + if(sd->type & PRIMITIVE_ALL_CURVE) # ifdef __DPDU__ - return normalize(ccl_fetch(sd, dPdu)); + return normalize(sd->dPdu); # else return make_float3(0.0f, 0.0f, 0.0f); # endif @@ -133,12 +133,12 @@ ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd) float3 data = primitive_attribute_float3(kg, sd, desc, NULL, NULL); data = make_float3(-(data.y - 0.5f), (data.x - 0.5f), 0.0f); object_normal_transform(kg, sd, &data); - return cross(ccl_fetch(sd, N), 
normalize(cross(data, ccl_fetch(sd, N)))); + return cross(sd->N, normalize(cross(data, sd->N))); } else { /* otherwise use surface derivatives */ #ifdef __DPDU__ - return normalize(ccl_fetch(sd, dPdu)); + return normalize(sd->dPdu); #else return make_float3(0.0f, 0.0f, 0.0f); #endif @@ -153,17 +153,17 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData * float3 center; #ifdef __HAIR__ - bool is_curve_primitive = ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE; + bool is_curve_primitive = sd->type & PRIMITIVE_ALL_CURVE; if(is_curve_primitive) { center = curve_motion_center_location(kg, sd); - if(!(ccl_fetch(sd, object_flag) & SD_OBJECT_TRANSFORM_APPLIED)) { + if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { object_position_transform(kg, sd, ¢er); } } else #endif - center = ccl_fetch(sd, P); + center = sd->P; float3 motion_pre = center, motion_post = center; @@ -173,16 +173,16 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData * if(desc.offset != ATTR_STD_NOT_FOUND) { /* get motion info */ int numverts, numkeys; - object_motion_info(kg, ccl_fetch(sd, object), NULL, &numverts, &numkeys); + object_motion_info(kg, sd->object, NULL, &numverts, &numkeys); /* lookup attributes */ motion_pre = primitive_attribute_float3(kg, sd, desc, NULL, NULL); - desc.offset += (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)? numverts: numkeys; + desc.offset += (sd->type & PRIMITIVE_ALL_TRIANGLE)? numverts: numkeys; motion_post = primitive_attribute_float3(kg, sd, desc, NULL, NULL); #ifdef __HAIR__ - if(is_curve_primitive && (ccl_fetch(sd, object_flag) & SD_OBJECT_HAS_VERTEX_MOTION) == 0) { + if(is_curve_primitive && (sd->object_flag & SD_OBJECT_HAS_VERTEX_MOTION) == 0) { object_position_transform(kg, sd, &motion_pre); object_position_transform(kg, sd, &motion_post); } @@ -193,10 +193,10 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData * * transformation was set match the world/object space of motion_pre/post */ Transform tfm; - tfm = object_fetch_vector_transform(kg, ccl_fetch(sd, object), OBJECT_VECTOR_MOTION_PRE); + tfm = object_fetch_vector_transform(kg, sd->object, OBJECT_VECTOR_MOTION_PRE); motion_pre = transform_point(&tfm, motion_pre); - tfm = object_fetch_vector_transform(kg, ccl_fetch(sd, object), OBJECT_VECTOR_MOTION_POST); + tfm = object_fetch_vector_transform(kg, sd->object, OBJECT_VECTOR_MOTION_POST); motion_post = transform_point(&tfm, motion_post); float3 motion_center; diff --git a/intern/cycles/kernel/geom/geom_subd_triangle.h b/intern/cycles/kernel/geom/geom_subd_triangle.h index 647840dc696..044e82f03d4 100644 --- a/intern/cycles/kernel/geom/geom_subd_triangle.h +++ b/intern/cycles/kernel/geom/geom_subd_triangle.h @@ -22,14 +22,14 @@ CCL_NAMESPACE_BEGIN ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const ShaderData *sd) { - return (ccl_fetch(sd, prim) != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, ccl_fetch(sd, prim)) : ~0; + return (sd->prim != PRIM_NONE) ? 
kernel_tex_fetch(__tri_patch, sd->prim) : ~0; } /* UV coords of triangle within patch */ ccl_device_inline void subd_triangle_patch_uv(KernelGlobals *kg, const ShaderData *sd, float2 uv[3]) { - uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); + uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); uv[0] = kernel_tex_fetch(__tri_patch_uv, tri_vindex.x); uv[1] = kernel_tex_fetch(__tri_patch_uv, tri_vindex.y); @@ -110,7 +110,7 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const float2 dpdv = uv[1] - uv[2]; /* p is [s, t] */ - float2 p = dpdu * ccl_fetch(sd, u) + dpdv * ccl_fetch(sd, v) + uv[2]; + float2 p = dpdu * sd->u + dpdv * sd->v + uv[2]; float a, dads, dadt; a = patch_eval_float(kg, sd, desc.offset, patch, p.x, p.y, 0, &dads, &dadt); @@ -123,8 +123,8 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const float dtdv = dpdv.y; if(dx) { - float dudx = ccl_fetch(sd, du).dx; - float dvdx = ccl_fetch(sd, dv).dx; + float dudx = sd->du.dx; + float dvdx = sd->dv.dx; float dsdx = dsdu*dudx + dsdv*dvdx; float dtdx = dtdu*dudx + dtdv*dvdx; @@ -132,8 +132,8 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const *dx = dads*dsdx + dadt*dtdx; } if(dy) { - float dudy = ccl_fetch(sd, du).dy; - float dvdy = ccl_fetch(sd, dv).dy; + float dudy = sd->du.dy; + float dvdy = sd->dv.dy; float dsdy = dsdu*dudy + dsdv*dvdy; float dtdy = dtdu*dudy + dtdv*dvdy; @@ -174,11 +174,11 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const float c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c; - if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c; + if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c; + if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c; #endif - return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c; + return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c; } else if(desc.element == ATTR_ELEMENT_CORNER) { float2 uv[3]; @@ -202,11 +202,11 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const float c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c; - if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c; + if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c; + if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c; #endif - return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c; + return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c; } else { if(dx) *dx = 0.0f; @@ -229,7 +229,7 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con float2 dpdv = uv[1] - uv[2]; /* p is [s, t] */ - float2 p = dpdu * ccl_fetch(sd, u) + dpdv * ccl_fetch(sd, v) + uv[2]; + float2 p = dpdu * sd->u + dpdv * sd->v + uv[2]; float3 a, dads, dadt; @@ -248,8 +248,8 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con float dtdv = dpdv.y; if(dx) { - float dudx = ccl_fetch(sd, du).dx; - float dvdx = ccl_fetch(sd, dv).dx; + float dudx = sd->du.dx; + 
float dvdx = sd->dv.dx; float dsdx = dsdu*dudx + dsdv*dvdx; float dtdx = dtdu*dudx + dtdv*dvdx; @@ -257,8 +257,8 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con *dx = dads*dsdx + dadt*dtdx; } if(dy) { - float dudy = ccl_fetch(sd, du).dy; - float dvdy = ccl_fetch(sd, dv).dy; + float dudy = sd->du.dy; + float dvdy = sd->dv.dy; float dsdy = dsdu*dudy + dsdv*dvdy; float dtdy = dtdu*dudy + dtdv*dvdy; @@ -299,11 +299,11 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con float3 c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c; - if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c; + if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c; + if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c; #endif - return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c; + return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c; } else if(desc.element == ATTR_ELEMENT_CORNER || desc.element == ATTR_ELEMENT_CORNER_BYTE) { float2 uv[3]; @@ -337,11 +337,11 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con float3 c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c; - if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c; + if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c; + if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c; #endif - return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c; + return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c; } else { if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f); diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h index 3229091bbb0..47778553b94 100644 --- a/intern/cycles/kernel/geom/geom_triangle.h +++ b/intern/cycles/kernel/geom/geom_triangle.h @@ -26,13 +26,13 @@ CCL_NAMESPACE_BEGIN ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd) { /* load triangle vertices */ - const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); + const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); const float3 v0 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+0)); const float3 v1 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+1)); const float3 v2 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+2)); /* return normal */ - if(ccl_fetch(sd, object_flag) & SD_OBJECT_NEGATIVE_SCALE_APPLIED) { + if(sd->object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) { return normalize(cross(v2 - v0, v1 - v0)); } else { @@ -110,34 +110,34 @@ ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *s if(dx) *dx = 0.0f; if(dy) *dy = 0.0f; - return kernel_tex_fetch(__attributes_float, desc.offset + ccl_fetch(sd, prim)); + return kernel_tex_fetch(__attributes_float, desc.offset + sd->prim); } else if(desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) { - uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); + uint4 tri_vindex = 
kernel_tex_fetch(__tri_vindex, sd->prim); float f0 = kernel_tex_fetch(__attributes_float, desc.offset + tri_vindex.x); float f1 = kernel_tex_fetch(__attributes_float, desc.offset + tri_vindex.y); float f2 = kernel_tex_fetch(__attributes_float, desc.offset + tri_vindex.z); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2; - if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2; + if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2; + if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2; #endif - return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2; + return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2; } else if(desc.element == ATTR_ELEMENT_CORNER) { - int tri = desc.offset + ccl_fetch(sd, prim)*3; + int tri = desc.offset + sd->prim*3; float f0 = kernel_tex_fetch(__attributes_float, tri + 0); float f1 = kernel_tex_fetch(__attributes_float, tri + 1); float f2 = kernel_tex_fetch(__attributes_float, tri + 2); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2; - if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2; + if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2; + if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2; #endif - return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2; + return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2; } else { if(dx) *dx = 0.0f; @@ -153,24 +153,24 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f); if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f); - return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + ccl_fetch(sd, prim))); + return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + sd->prim)); } else if(desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) { - uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); + uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.x)); float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.y)); float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.z)); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2; - if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2; + if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2; + if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2; #endif - return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2; + return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2; } else if(desc.element == ATTR_ELEMENT_CORNER || desc.element == ATTR_ELEMENT_CORNER_BYTE) { - int tri = desc.offset + ccl_fetch(sd, prim)*3; + int tri = desc.offset + sd->prim*3; float3 f0, f1, f2; if(desc.element == ATTR_ELEMENT_CORNER) { @@ -185,11 +185,11 @@ ccl_device float3 
triangle_attribute_float3(KernelGlobals *kg, const ShaderData } #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2; - if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2; + if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2; + if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2; #endif - return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2; + return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2; } else { if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f); diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h index 4db121d94f4..4d234dd62bd 100644 --- a/intern/cycles/kernel/geom/geom_triangle_intersect.h +++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h @@ -457,7 +457,7 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg, return P; } # ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_itfm); + Transform tfm = sd->ob_itfm; # else Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); # endif @@ -491,7 +491,7 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg, if(isect->object != OBJECT_NONE) { # ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_tfm); + Transform tfm = sd->ob_tfm; # else Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); # endif @@ -519,7 +519,7 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg, if(isect->object != OBJECT_NONE) { #ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_itfm); + Transform tfm = sd->ob_itfm; #else Transform tfm = object_fetch_transform(kg, isect->object, @@ -557,7 +557,7 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg, if(isect->object != OBJECT_NONE) { #ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_tfm); + Transform tfm = sd->ob_tfm; #else Transform tfm = object_fetch_transform(kg, isect->object, diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h index 03724c955be..1e0ef5201c9 100644 --- a/intern/cycles/kernel/geom/geom_volume.h +++ b/intern/cycles/kernel/geom/geom_volume.h @@ -64,7 +64,7 @@ ccl_device_inline float3 volume_normalized_position(KernelGlobals *kg, ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy) { - float3 P = volume_normalized_position(kg, sd, ccl_fetch(sd, P)); + float3 P = volume_normalized_position(kg, sd, sd->P); #ifdef __KERNEL_CUDA__ # if __CUDA_ARCH__ >= 300 CUtexObject tex = kernel_tex_fetch(__bindless_mapping, desc.offset); @@ -91,7 +91,7 @@ ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd, ccl_device float3 volume_attribute_float3(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float3 *dx, float3 *dy) { - float3 P = volume_normalized_position(kg, sd, ccl_fetch(sd, P)); + float3 P = volume_normalized_position(kg, sd, sd->P); #ifdef __KERNEL_CUDA__ # if __CUDA_ARCH__ >= 300 CUtexObject tex = kernel_tex_fetch(__bindless_mapping, desc.offset); diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h index 9279a94c13a..cd339e6237e 100644 --- a/intern/cycles/kernel/kernel.h +++ b/intern/cycles/kernel/kernel.h @@ -20,6 +20,7 @@ /* CPU Kernel Interface */ #include 
"util_types.h" +#include "kernel_types.h" CCL_NAMESPACE_BEGIN @@ -28,6 +29,7 @@ CCL_NAMESPACE_BEGIN #define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name) struct KernelGlobals; +struct KernelData; KernelGlobals *kernel_globals_create(); void kernel_globals_free(KernelGlobals *kg); diff --git a/intern/cycles/kernel/kernel_camera.h b/intern/cycles/kernel/kernel_camera.h index dedac6b1465..0df5217d97a 100644 --- a/intern/cycles/kernel/kernel_camera.h +++ b/intern/cycles/kernel/kernel_camera.h @@ -457,7 +457,7 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd, { if(kernel_data.cam.type != CAMERA_PANORAMA) { /* perspective / ortho */ - if(ccl_fetch(sd, object) == PRIM_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE) + if(sd->object == PRIM_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE) P += camera_position(kg); Transform tfm = kernel_data.cam.worldtondc; @@ -467,7 +467,7 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd, /* panorama */ Transform tfm = kernel_data.cam.worldtocamera; - if(ccl_fetch(sd, object) != OBJECT_NONE) + if(sd->object != OBJECT_NONE) P = normalize(transform_point(&tfm, P)); else P = normalize(transform_direction(&tfm, P)); diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h index 9d1f3bdc918..e347a1eca18 100644 --- a/intern/cycles/kernel/kernel_compat_cpu.h +++ b/intern/cycles/kernel/kernel_compat_cpu.h @@ -44,6 +44,15 @@ #define ccl_addr_space +#define ccl_local_id(d) 0 +#define ccl_global_id(d) (kg->global_id[d]) + +#define ccl_local_size(d) 1 +#define ccl_global_size(d) (kg->global_size[d]) + +#define ccl_group_id(d) ccl_global_id(d) +#define ccl_num_groups(d) ccl_global_size(d) + /* On x86_64, versions of glibc < 2.16 have an issue where expf is * much slower than the double version. This was fixed in glibc 2.16. 
*/ diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index e0c7b17c6a0..8fffe2a13c9 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -46,11 +46,58 @@ #define ccl_device_noinline __device__ __noinline__ #define ccl_global #define ccl_constant +#define ccl_local __shared__ +#define ccl_local_param +#define ccl_private #define ccl_may_alias #define ccl_addr_space #define ccl_restrict __restrict__ #define ccl_align(n) __align__(n) +ccl_device_inline uint ccl_local_id(uint d) +{ + switch(d) { + case 0: return threadIdx.x; + case 1: return threadIdx.y; + case 2: return threadIdx.z; + default: return 0; + } +} + +#define ccl_global_id(d) (ccl_group_id(d) * ccl_local_size(d) + ccl_local_id(d)) + +ccl_device_inline uint ccl_local_size(uint d) +{ + switch(d) { + case 0: return blockDim.x; + case 1: return blockDim.y; + case 2: return blockDim.z; + default: return 0; + } +} + +#define ccl_global_size(d) (ccl_num_groups(d) * ccl_local_size(d)) + +ccl_device_inline uint ccl_group_id(uint d) +{ + switch(d) { + case 0: return blockIdx.x; + case 1: return blockIdx.y; + case 2: return blockIdx.z; + default: return 0; + } +} + +ccl_device_inline uint ccl_num_groups(uint d) +{ + switch(d) { + case 0: return gridDim.x; + case 1: return gridDim.y; + case 2: return gridDim.z; + default: return 0; + } +} + /* No assert supported for CUDA */ #define kernel_assert(cond) diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h index f076e3a7d37..6c963dea4f5 100644 --- a/intern/cycles/kernel/kernel_compat_opencl.h +++ b/intern/cycles/kernel/kernel_compat_opencl.h @@ -39,6 +39,7 @@ #define ccl_constant __constant #define ccl_global __global #define ccl_local __local +#define ccl_local_param __local #define ccl_private __private #define ccl_restrict restrict #define ccl_align(n) __attribute__((aligned(n))) @@ -49,6 +50,15 @@ # define ccl_addr_space #endif +#define ccl_local_id(d) get_local_id(d) +#define ccl_global_id(d) get_global_id(d) + +#define ccl_local_size(d) get_local_size(d) +#define ccl_global_size(d) get_global_size(d) + +#define ccl_group_id(d) get_group_id(d) +#define ccl_num_groups(d) get_num_groups(d) + /* Selective nodes compilation. */ #ifndef __NODES_MAX_GROUP__ # define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h index 8c7c651a053..bc2d9604122 100644 --- a/intern/cycles/kernel/kernel_emission.h +++ b/intern/cycles/kernel/kernel_emission.h @@ -67,7 +67,7 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg, ls->shader, ls->object, ls->prim, ls->u, ls->v, t, time, false, ls->lamp); - ls->Ng = ccl_fetch(emission_sd, Ng); + ls->Ng = emission_sd->Ng; /* no path flag, we're evaluating this for all closures. 
that's weak but * we'd have to do multiple evaluations otherwise */ @@ -76,7 +76,7 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg, path_state_modify_bounce(state, false); /* evaluate emissive closure */ - if(ccl_fetch(emission_sd, flag) & SD_EMISSION) + if(emission_sd->flag & SD_EMISSION) eval = shader_emissive_eval(kg, emission_sd); else eval = make_float3(0.0f, 0.0f, 0.0f); @@ -112,7 +112,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, -ls->D, dD, ls->t, - ccl_fetch(sd, time)); + sd->time); if(is_zero(light_eval)) return false; @@ -120,7 +120,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, /* evaluate BSDF at shading point */ #ifdef __VOLUME__ - if(ccl_fetch(sd, prim) != PRIM_NONE) + if(sd->prim != PRIM_NONE) shader_bsdf_eval(kg, sd, ls->D, eval, ls->pdf, ls->shader & SHADER_USE_MIS); else { float bsdf_pdf; @@ -168,8 +168,8 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, if(ls->shader & SHADER_CAST_SHADOW) { /* setup ray */ - bool transmit = (dot(ccl_fetch(sd, Ng), ls->D) < 0.0f); - ray->P = ray_offset(ccl_fetch(sd, P), (transmit)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng)); + bool transmit = (dot(sd->Ng, ls->D) < 0.0f); + ray->P = ray_offset(sd->P, (transmit)? -sd->Ng: sd->Ng); if(ls->t == FLT_MAX) { /* distant light */ @@ -182,7 +182,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ray->D = normalize_len(ray->D, &ray->t); } - ray->dP = ccl_fetch(sd, dP); + ray->dP = sd->dP; ray->dD = differential3_zero(); } else { @@ -204,14 +204,14 @@ ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, Shader float3 L = shader_emissive_eval(kg, sd); #ifdef __HAIR__ - if(!(path_flag & PATH_RAY_MIS_SKIP) && (ccl_fetch(sd, flag) & SD_USE_MIS) && (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)) + if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) && (sd->type & PRIMITIVE_ALL_TRIANGLE)) #else - if(!(path_flag & PATH_RAY_MIS_SKIP) && (ccl_fetch(sd, flag) & SD_USE_MIS)) + if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS)) #endif { /* multiple importance sampling, get triangle light pdf, * and compute weight with respect to BSDF pdf */ - float pdf = triangle_light_pdf(kg, ccl_fetch(sd, Ng), ccl_fetch(sd, I), t); + float pdf = triangle_light_pdf(kg, sd->Ng, sd->I, t); float mis_weight = power_heuristic(bsdf_pdf, pdf); return L*mis_weight; diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h index 2b52a2d2f48..1c3884890bf 100644 --- a/intern/cycles/kernel/kernel_globals.h +++ b/intern/cycles/kernel/kernel_globals.h @@ -16,6 +16,9 @@ /* Constant Globals */ +#ifndef __KERNEL_GLOBALS_H__ +#define __KERNEL_GLOBALS_H__ + CCL_NAMESPACE_BEGIN /* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in @@ -64,6 +67,13 @@ typedef struct KernelGlobals { /* Storage for decoupled volume steps. 
*/ VolumeStep *decoupled_volume_steps[2]; int decoupled_volume_steps_index; + + /* split kernel */ + SplitData split_data; + SplitParams split_param_data; + + int2 global_size; + int2 global_id; } KernelGlobals; #endif /* __KERNEL_CPU__ */ @@ -103,8 +113,8 @@ typedef ccl_addr_space struct KernelGlobals { # include "kernel_textures.h" # ifdef __SPLIT_KERNEL__ - ShaderData *sd_input; - Intersection *isect_shadow; + SplitData split_data; + SplitParams split_param_data; # endif } KernelGlobals; @@ -146,3 +156,4 @@ ccl_device float lookup_table_read_2D(KernelGlobals *kg, float x, float y, int o CCL_NAMESPACE_END +#endif /* __KERNEL_GLOBALS_H__ */ diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h index 7aec47e4957..ed523696571 100644 --- a/intern/cycles/kernel/kernel_passes.h +++ b/intern/cycles/kernel/kernel_passes.h @@ -19,16 +19,16 @@ CCL_NAMESPACE_BEGIN ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, int sample, float value) { ccl_global float *buf = buffer; -#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__) +#if defined(__SPLIT_KERNEL__) atomic_add_and_fetch_float(buf, value); #else *buf = (sample == 0)? value: *buf + value; -#endif // __SPLIT_KERNEL__ && __WORK_STEALING__ +#endif /* __SPLIT_KERNEL__ */ } ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, int sample, float3 value) { -#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__) +#if defined(__SPLIT_KERNEL__) ccl_global float *buf_x = buffer + 0; ccl_global float *buf_y = buffer + 1; ccl_global float *buf_z = buffer + 2; @@ -39,12 +39,12 @@ ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, int sa #else ccl_global float3 *buf = (ccl_global float3*)buffer; *buf = (sample == 0)? value: *buf + value; -#endif // __SPLIT_KERNEL__ && __WORK_STEALING__ +#endif /* __SPLIT_KERNEL__ */ } ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sample, float4 value) { -#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__) +#if defined(__SPLIT_KERNEL__) ccl_global float *buf_x = buffer + 0; ccl_global float *buf_y = buffer + 1; ccl_global float *buf_z = buffer + 2; @@ -57,7 +57,7 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sa #else ccl_global float4 *buf = (ccl_global float4*)buffer; *buf = (sample == 0)? 
value: *buf + value; -#endif // __SPLIT_KERNEL__ && __WORK_STEALING__ +#endif /* __SPLIT_KERNEL__ */ } ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L, @@ -75,18 +75,18 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl return; if(!(path_flag & PATH_RAY_SINGLE_PASS_DONE)) { - if(!(ccl_fetch(sd, flag) & SD_TRANSPARENT) || + if(!(sd->flag & SD_TRANSPARENT) || kernel_data.film.pass_alpha_threshold == 0.0f || average(shader_bsdf_alpha(kg, sd)) >= kernel_data.film.pass_alpha_threshold) { if(sample == 0) { if(flag & PASS_DEPTH) { - float depth = camera_distance(kg, ccl_fetch(sd, P)); + float depth = camera_distance(kg, sd->P); kernel_write_pass_float(buffer + kernel_data.film.pass_depth, sample, depth); } if(flag & PASS_OBJECT_ID) { - float id = object_pass_id(kg, ccl_fetch(sd, object)); + float id = object_pass_id(kg, sd->object); kernel_write_pass_float(buffer + kernel_data.film.pass_object_id, sample, id); } if(flag & PASS_MATERIAL_ID) { @@ -96,7 +96,7 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl } if(flag & PASS_NORMAL) { - float3 normal = ccl_fetch(sd, N); + float3 normal = sd->N; kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, sample, normal); } if(flag & PASS_UV) { @@ -127,7 +127,7 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl float mist_start = kernel_data.film.mist_start; float mist_inv_depth = kernel_data.film.mist_inv_depth; - float depth = camera_distance(kg, ccl_fetch(sd, P)); + float depth = camera_distance(kg, sd->P); float mist = saturate((depth - mist_start)*mist_inv_depth); /* falloff */ diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index f90701a8260..95c27850513 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -75,17 +75,17 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg, sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); - if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) { + if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) { Ray light_ray; float3 ao_shadow; - light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng)); + light_ray.P = ray_offset(sd->P, sd->Ng); light_ray.D = ao_D; light_ray.t = kernel_data.background.ao_distance; #ifdef __OBJECT_MOTION__ - light_ray.time = ccl_fetch(sd, time); + light_ray.time = sd->time; #endif /* __OBJECT_MOTION__ */ - light_ray.dP = ccl_fetch(sd, dP); + light_ray.dP = sd->dP; light_ray.dD = differential3_zero(); if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) { @@ -459,7 +459,7 @@ bool kernel_path_subsurface_scatter( # ifdef __VOLUME__ ss_indirect->need_update_volume_stack = kernel_data.integrator.use_volumes && - ccl_fetch(sd, object_flag) & SD_OBJECT_INTERSECTS_VOLUME; + sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME; # endif /* __VOLUME__ */ /* compute lighting with the BSDF closure */ diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h index ff2b828795d..d58960cae4e 100644 --- a/intern/cycles/kernel/kernel_path_branched.h +++ b/intern/cycles/kernel/kernel_path_branched.h @@ -42,17 +42,17 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg, sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); - if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) { + if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) { Ray light_ray; float3 ao_shadow; - light_ray.P 
= ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng)); + light_ray.P = ray_offset(sd->P, sd->Ng); light_ray.D = ao_D; light_ray.t = kernel_data.background.ao_distance; #ifdef __OBJECT_MOTION__ - light_ray.time = ccl_fetch(sd, time); + light_ray.time = sd->time; #endif /* __OBJECT_MOTION__ */ - light_ray.dP = ccl_fetch(sd, dP); + light_ray.dP = sd->dP; light_ray.dD = differential3_zero(); if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) @@ -67,8 +67,8 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba RNG *rng, ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd, float3 throughput, float num_samples_adjust, PathState *state, PathRadiance *L) { - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - const ShaderClosure *sc = &ccl_fetch(sd, closure)[i]; + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; if(!CLOSURE_IS_BSDF(sc->type)) continue; @@ -140,8 +140,8 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, Ray *ray, float3 throughput) { - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = &ccl_fetch(sd, closure)[i]; + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(!CLOSURE_IS_BSSRDF(sc->type)) continue; @@ -169,7 +169,7 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, Ray volume_ray = *ray; bool need_update_volume_stack = kernel_data.integrator.use_volumes && - ccl_fetch(sd, object_flag) & SD_OBJECT_INTERSECTS_VOLUME; + sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME; #endif /* __VOLUME__ */ /* compute lighting with the BSDF closure */ diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h index fea503d06e5..34a78552c1d 100644 --- a/intern/cycles/kernel/kernel_path_surface.h +++ b/intern/cycles/kernel/kernel_path_surface.h @@ -25,7 +25,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal { #ifdef __EMISSION__ /* sample illumination from lights to find path contribution */ - if(!(ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL)) + if(!(sd->flag & SD_BSDF_HAS_EVAL)) return; Ray light_ray; @@ -33,7 +33,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal bool is_lamp; # ifdef __OBJECT_MOTION__ - light_ray.time = ccl_fetch(sd, time); + light_ray.time = sd->time; # endif if(sample_all_lights) { @@ -52,7 +52,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal float terminate = path_branched_rng_light_termination(kg, &lamp_rng, state, j, num_samples); LightSample ls; - if(lamp_light_sample(kg, i, light_u, light_v, ccl_fetch(sd, P), &ls)) { + if(lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls)) { /* The sampling probability returned by lamp_light_sample assumes that all lights were sampled. * However, this code only samples lamps, so if the scene also had mesh lights, the real probability is twice as high. */ if(kernel_data.integrator.pdf_triangles != 0.0f) @@ -87,7 +87,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal light_t = 0.5f*light_t; LightSample ls; - if(light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls)) { + if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { /* Same as above, probability needs to be corrected since the sampling was forced to select a mesh light. 
*/ if(kernel_data.integrator.num_all_lights) ls.pdf *= 2.0f; @@ -113,7 +113,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal float terminate = path_state_rng_light_termination(kg, rng, state); LightSample ls; - if(light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls)) { + if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { /* sample random light */ if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { /* trace shadow ray */ @@ -156,15 +156,15 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng, path_state_next(kg, state, label); /* setup ray */ - ray->P = ray_offset(ccl_fetch(sd, P), (label & LABEL_TRANSMIT)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng)); + ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng); ray->D = normalize(bsdf_omega_in); ray->t = FLT_MAX; #ifdef __RAY_DIFFERENTIALS__ - ray->dP = ccl_fetch(sd, dP); + ray->dP = sd->dP; ray->dD = bsdf_domega_in; #endif #ifdef __OBJECT_MOTION__ - ray->time = ccl_fetch(sd, time); + ray->time = sd->time; #endif #ifdef __VOLUME__ @@ -195,7 +195,7 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ccl_ PathRadiance *L) { #ifdef __EMISSION__ - if(!(kernel_data.integrator.use_direct_light && (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL))) + if(!(kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL))) return; /* sample illumination from lights to find path contribution */ @@ -208,11 +208,11 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ccl_ bool is_lamp; #ifdef __OBJECT_MOTION__ - light_ray.time = ccl_fetch(sd, time); + light_ray.time = sd->time; #endif LightSample ls; - if(light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls)) { + if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { float terminate = path_state_rng_light_termination(kg, rng, state); if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { /* trace shadow ray */ @@ -238,7 +238,7 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg, ccl_addr_space Ray *ray) { /* no BSDF? we can stop here */ - if(ccl_fetch(sd, flag) & SD_BSDF) { + if(sd->flag & SD_BSDF) { /* sample BSDF */ float bsdf_pdf; BsdfEval bsdf_eval; @@ -270,16 +270,16 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg, path_state_next(kg, state, label); /* setup ray */ - ray->P = ray_offset(ccl_fetch(sd, P), (label & LABEL_TRANSMIT)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng)); + ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng); ray->D = normalize(bsdf_omega_in); if(state->bounce == 0) - ray->t -= ccl_fetch(sd, ray_length); /* clipping works through transparent */ + ray->t -= sd->ray_length; /* clipping works through transparent */ else ray->t = FLT_MAX; #ifdef __RAY_DIFFERENTIALS__ - ray->dP = ccl_fetch(sd, dP); + ray->dP = sd->dP; ray->dD = bsdf_domega_in; #endif @@ -291,21 +291,21 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg, return true; } #ifdef __VOLUME__ - else if(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME) { + else if(sd->flag & SD_HAS_ONLY_VOLUME) { /* no surface shader but have a volume shader? 
act transparent */ /* update path state, count as transparent */ path_state_next(kg, state, LABEL_TRANSPARENT); if(state->bounce == 0) - ray->t -= ccl_fetch(sd, ray_length); /* clipping works through transparent */ + ray->t -= sd->ray_length; /* clipping works through transparent */ else ray->t = FLT_MAX; /* setup ray position, direction stays unchanged */ - ray->P = ray_offset(ccl_fetch(sd, P), -ccl_fetch(sd, Ng)); + ray->P = ray_offset(sd->P, -sd->Ng); #ifdef __RAY_DIFFERENTIALS__ - ray->dP = ccl_fetch(sd, dP); + ray->dP = sd->dP; #endif /* enter/exit volume */ diff --git a/intern/cycles/kernel/kernel_queues.h b/intern/cycles/kernel/kernel_queues.h index cf5614b8a86..2e63909a38c 100644 --- a/intern/cycles/kernel/kernel_queues.h +++ b/intern/cycles/kernel/kernel_queues.h @@ -17,6 +17,8 @@ #ifndef __KERNEL_QUEUE_H__ #define __KERNEL_QUEUE_H__ +CCL_NAMESPACE_BEGIN + /* * Queue utility functions for split kernel */ @@ -35,7 +37,8 @@ ccl_device void enqueue_ray_index( ccl_global int *queue_index) /* Array of size num_queues; Used for atomic increment. */ { /* This thread's queue index. */ - int my_queue_index = atomic_inc(&queue_index[queue_number]) + (queue_number * queue_size); + int my_queue_index = atomic_fetch_and_inc_uint32((ccl_global uint*)&queue_index[queue_number]) + + (queue_number * queue_size); queues[my_queue_index] = ray_index; } @@ -47,6 +50,7 @@ ccl_device void enqueue_ray_index( * is no more ray to allocate to other threads. */ ccl_device int get_ray_index( + KernelGlobals *kg, int thread_index, /* Global thread index. */ int queue_number, /* Queue to operate on. */ ccl_global int *queues, /* Buffer of all queues. */ @@ -68,24 +72,25 @@ ccl_device void enqueue_ray_index_local( int queue_number, /* Queue in which to enqueue ray index. */ char enqueue_flag, /* True for threads whose ray index has to be enqueued. */ int queuesize, /* queue size. */ - ccl_local unsigned int *local_queue_atomics, /* To to local queue atomics. */ + ccl_local_param unsigned int *local_queue_atomics, /* To to local queue atomics. */ ccl_global int *Queue_data, /* Queues. */ ccl_global int *Queue_index) /* To do global queue atomics. */ { - int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0); + int lidx = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0); /* Get local queue id .*/ unsigned int lqidx; if(enqueue_flag) { - lqidx = atomic_inc(local_queue_atomics); + lqidx = atomic_fetch_and_inc_uint32(local_queue_atomics); } - barrier(CLK_LOCAL_MEM_FENCE); + ccl_barrier(CCL_LOCAL_MEM_FENCE); /* Get global queue offset. */ if(lidx == 0) { - *local_queue_atomics = atomic_add(&Queue_index[queue_number], *local_queue_atomics); + *local_queue_atomics = atomic_fetch_and_add_uint32((ccl_global uint*)&Queue_index[queue_number], + *local_queue_atomics); } - barrier(CLK_LOCAL_MEM_FENCE); + ccl_barrier(CCL_LOCAL_MEM_FENCE); /* Get global queue index and enqueue ray. 
*/ if(enqueue_flag) { @@ -96,19 +101,19 @@ ccl_device void enqueue_ray_index_local( ccl_device unsigned int get_local_queue_index( int queue_number, /* Queue in which to enqueue the ray; -1 if no queue */ - ccl_local unsigned int *local_queue_atomics) + ccl_local_param unsigned int *local_queue_atomics) { - int my_lqidx = atomic_inc(&local_queue_atomics[queue_number]); + int my_lqidx = atomic_fetch_and_inc_uint32(&local_queue_atomics[queue_number]); return my_lqidx; } ccl_device unsigned int get_global_per_queue_offset( int queue_number, - ccl_local unsigned int *local_queue_atomics, + ccl_local_param unsigned int *local_queue_atomics, ccl_global int* global_queue_atomics) { - unsigned int queue_offset = atomic_add(&global_queue_atomics[queue_number], - local_queue_atomics[queue_number]); + unsigned int queue_offset = atomic_fetch_and_add_uint32((ccl_global uint*)&global_queue_atomics[queue_number], + local_queue_atomics[queue_number]); return queue_offset; } @@ -116,10 +121,12 @@ ccl_device unsigned int get_global_queue_index( int queue_number, int queuesize, unsigned int lqidx, - ccl_local unsigned int * global_per_queue_offset) + ccl_local_param unsigned int * global_per_queue_offset) { int my_gqidx = queuesize * queue_number + lqidx + global_per_queue_offset[queue_number]; return my_gqidx; } +CCL_NAMESPACE_END + #endif // __KERNEL_QUEUE_H__ diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h index 59c1331a63c..a2ab96b35e2 100644 --- a/intern/cycles/kernel/kernel_shader.h +++ b/intern/cycles/kernel/kernel_shader.h @@ -38,13 +38,13 @@ CCL_NAMESPACE_BEGIN #ifdef __OBJECT_MOTION__ ccl_device void shader_setup_object_transforms(KernelGlobals *kg, ShaderData *sd, float time) { - if(ccl_fetch(sd, object_flag) & SD_OBJECT_MOTION) { - ccl_fetch(sd, ob_tfm) = object_fetch_transform_motion(kg, ccl_fetch(sd, object), time); - ccl_fetch(sd, ob_itfm) = transform_quick_inverse(ccl_fetch(sd, ob_tfm)); + if(sd->object_flag & SD_OBJECT_MOTION) { + sd->ob_tfm = object_fetch_transform_motion(kg, sd->object, time); + sd->ob_itfm = transform_quick_inverse(sd->ob_tfm); } else { - ccl_fetch(sd, ob_tfm) = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM); - ccl_fetch(sd, ob_itfm) = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM); + sd->ob_tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); + sd->ob_itfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); } } #endif @@ -55,55 +55,55 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg, const Ray *ray) { #ifdef __INSTANCING__ - ccl_fetch(sd, object) = (isect->object == PRIM_NONE)? kernel_tex_fetch(__prim_object, isect->prim): isect->object; + sd->object = (isect->object == PRIM_NONE)? 
kernel_tex_fetch(__prim_object, isect->prim): isect->object; #endif - ccl_fetch(sd, type) = isect->type; - ccl_fetch(sd, flag) = 0; - ccl_fetch(sd, object_flag) = kernel_tex_fetch(__object_flag, - ccl_fetch(sd, object)); + sd->type = isect->type; + sd->flag = 0; + sd->object_flag = kernel_tex_fetch(__object_flag, + sd->object); /* matrices and time */ #ifdef __OBJECT_MOTION__ shader_setup_object_transforms(kg, sd, ray->time); - ccl_fetch(sd, time) = ray->time; + sd->time = ray->time; #endif - ccl_fetch(sd, prim) = kernel_tex_fetch(__prim_index, isect->prim); - ccl_fetch(sd, ray_length) = isect->t; + sd->prim = kernel_tex_fetch(__prim_index, isect->prim); + sd->ray_length = isect->t; #ifdef __UV__ - ccl_fetch(sd, u) = isect->u; - ccl_fetch(sd, v) = isect->v; + sd->u = isect->u; + sd->v = isect->v; #endif #ifdef __HAIR__ - if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { + if(sd->type & PRIMITIVE_ALL_CURVE) { /* curve */ - float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim)); + float4 curvedata = kernel_tex_fetch(__curves, sd->prim); - ccl_fetch(sd, shader) = __float_as_int(curvedata.z); - ccl_fetch(sd, P) = bvh_curve_refine(kg, sd, isect, ray); + sd->shader = __float_as_int(curvedata.z); + sd->P = bvh_curve_refine(kg, sd, isect, ray); } else #endif - if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) { + if(sd->type & PRIMITIVE_TRIANGLE) { /* static triangle */ float3 Ng = triangle_normal(kg, sd); - ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim)); + sd->shader = kernel_tex_fetch(__tri_shader, sd->prim); /* vectors */ - ccl_fetch(sd, P) = triangle_refine(kg, sd, isect, ray); - ccl_fetch(sd, Ng) = Ng; - ccl_fetch(sd, N) = Ng; + sd->P = triangle_refine(kg, sd, isect, ray); + sd->Ng = Ng; + sd->N = Ng; /* smooth normal */ - if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) - ccl_fetch(sd, N) = triangle_smooth_normal(kg, ccl_fetch(sd, prim), ccl_fetch(sd, u), ccl_fetch(sd, v)); + if(sd->shader & SHADER_SMOOTH_NORMAL) + sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v); #ifdef __DPDU__ /* dPdu/dPdv */ - triangle_dPdudv(kg, ccl_fetch(sd, prim), &ccl_fetch(sd, dPdu), &ccl_fetch(sd, dPdv)); + triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv); #endif } else { @@ -111,40 +111,40 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg, motion_triangle_shader_setup(kg, sd, isect, ray, false); } - ccl_fetch(sd, I) = -ray->D; + sd->I = -ray->D; - ccl_fetch(sd, flag) |= kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE); + sd->flag |= kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE); #ifdef __INSTANCING__ if(isect->object != OBJECT_NONE) { /* instance transform */ - object_normal_transform_auto(kg, sd, &ccl_fetch(sd, N)); - object_normal_transform_auto(kg, sd, &ccl_fetch(sd, Ng)); + object_normal_transform_auto(kg, sd, &sd->N); + object_normal_transform_auto(kg, sd, &sd->Ng); # ifdef __DPDU__ - object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdu)); - object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdv)); + object_dir_transform_auto(kg, sd, &sd->dPdu); + object_dir_transform_auto(kg, sd, &sd->dPdv); # endif } #endif /* backfacing test */ - bool backfacing = (dot(ccl_fetch(sd, Ng), ccl_fetch(sd, I)) < 0.0f); + bool backfacing = (dot(sd->Ng, sd->I) < 0.0f); if(backfacing) { - ccl_fetch(sd, flag) |= SD_BACKFACING; - ccl_fetch(sd, Ng) = -ccl_fetch(sd, Ng); - ccl_fetch(sd, N) = -ccl_fetch(sd, N); + sd->flag |= SD_BACKFACING; + sd->Ng = -sd->Ng; + sd->N = -sd->N; #ifdef __DPDU__ - ccl_fetch(sd, 
dPdu) = -ccl_fetch(sd, dPdu); - ccl_fetch(sd, dPdv) = -ccl_fetch(sd, dPdv); + sd->dPdu = -sd->dPdu; + sd->dPdv = -sd->dPdv; #endif } #ifdef __RAY_DIFFERENTIALS__ /* differentials */ - differential_transfer(&ccl_fetch(sd, dP), ray->dP, ray->D, ray->dD, ccl_fetch(sd, Ng), isect->t); - differential_incoming(&ccl_fetch(sd, dI), ray->dD); - differential_dudv(&ccl_fetch(sd, du), &ccl_fetch(sd, dv), ccl_fetch(sd, dPdu), ccl_fetch(sd, dPdv), ccl_fetch(sd, dP), ccl_fetch(sd, Ng)); + differential_transfer(&sd->dP, ray->dP, ray->D, ray->dD, sd->Ng, isect->t); + differential_incoming(&sd->dI, ray->dD); + differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng); #endif } @@ -249,106 +249,106 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg, int lamp) { /* vectors */ - ccl_fetch(sd, P) = P; - ccl_fetch(sd, N) = Ng; - ccl_fetch(sd, Ng) = Ng; - ccl_fetch(sd, I) = I; - ccl_fetch(sd, shader) = shader; + sd->P = P; + sd->N = Ng; + sd->Ng = Ng; + sd->I = I; + sd->shader = shader; if(prim != PRIM_NONE) - ccl_fetch(sd, type) = PRIMITIVE_TRIANGLE; + sd->type = PRIMITIVE_TRIANGLE; else if(lamp != LAMP_NONE) - ccl_fetch(sd, type) = PRIMITIVE_LAMP; + sd->type = PRIMITIVE_LAMP; else - ccl_fetch(sd, type) = PRIMITIVE_NONE; + sd->type = PRIMITIVE_NONE; /* primitive */ #ifdef __INSTANCING__ - ccl_fetch(sd, object) = object; + sd->object = object; #endif /* currently no access to bvh prim index for strand sd->prim*/ - ccl_fetch(sd, prim) = prim; + sd->prim = prim; #ifdef __UV__ - ccl_fetch(sd, u) = u; - ccl_fetch(sd, v) = v; + sd->u = u; + sd->v = v; #endif - ccl_fetch(sd, ray_length) = t; + sd->ray_length = t; - ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE); - ccl_fetch(sd, object_flag) = 0; - if(ccl_fetch(sd, object) != OBJECT_NONE) { - ccl_fetch(sd, object_flag) |= kernel_tex_fetch(__object_flag, - ccl_fetch(sd, object)); + sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE); + sd->object_flag = 0; + if(sd->object != OBJECT_NONE) { + sd->object_flag |= kernel_tex_fetch(__object_flag, + sd->object); #ifdef __OBJECT_MOTION__ shader_setup_object_transforms(kg, sd, time); - ccl_fetch(sd, time) = time; + sd->time = time; } else if(lamp != LAMP_NONE) { - ccl_fetch(sd, ob_tfm) = lamp_fetch_transform(kg, lamp, false); - ccl_fetch(sd, ob_itfm) = lamp_fetch_transform(kg, lamp, true); + sd->ob_tfm = lamp_fetch_transform(kg, lamp, false); + sd->ob_itfm = lamp_fetch_transform(kg, lamp, true); #endif } /* transform into world space */ if(object_space) { - object_position_transform_auto(kg, sd, &ccl_fetch(sd, P)); - object_normal_transform_auto(kg, sd, &ccl_fetch(sd, Ng)); - ccl_fetch(sd, N) = ccl_fetch(sd, Ng); - object_dir_transform_auto(kg, sd, &ccl_fetch(sd, I)); + object_position_transform_auto(kg, sd, &sd->P); + object_normal_transform_auto(kg, sd, &sd->Ng); + sd->N = sd->Ng; + object_dir_transform_auto(kg, sd, &sd->I); } - if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) { + if(sd->type & PRIMITIVE_TRIANGLE) { /* smooth normal */ - if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) { - ccl_fetch(sd, N) = triangle_smooth_normal(kg, ccl_fetch(sd, prim), ccl_fetch(sd, u), ccl_fetch(sd, v)); + if(sd->shader & SHADER_SMOOTH_NORMAL) { + sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v); #ifdef __INSTANCING__ - if(!(ccl_fetch(sd, object_flag) & SD_OBJECT_TRANSFORM_APPLIED)) { - object_normal_transform_auto(kg, sd, &ccl_fetch(sd, N)); + if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { + 
object_normal_transform_auto(kg, sd, &sd->N); } #endif } /* dPdu/dPdv */ #ifdef __DPDU__ - triangle_dPdudv(kg, ccl_fetch(sd, prim), &ccl_fetch(sd, dPdu), &ccl_fetch(sd, dPdv)); + triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv); # ifdef __INSTANCING__ - if(!(ccl_fetch(sd, object_flag) & SD_OBJECT_TRANSFORM_APPLIED)) { - object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdu)); - object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdv)); + if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { + object_dir_transform_auto(kg, sd, &sd->dPdu); + object_dir_transform_auto(kg, sd, &sd->dPdv); } # endif #endif } else { #ifdef __DPDU__ - ccl_fetch(sd, dPdu) = make_float3(0.0f, 0.0f, 0.0f); - ccl_fetch(sd, dPdv) = make_float3(0.0f, 0.0f, 0.0f); + sd->dPdu = make_float3(0.0f, 0.0f, 0.0f); + sd->dPdv = make_float3(0.0f, 0.0f, 0.0f); #endif } /* backfacing test */ - if(ccl_fetch(sd, prim) != PRIM_NONE) { - bool backfacing = (dot(ccl_fetch(sd, Ng), ccl_fetch(sd, I)) < 0.0f); + if(sd->prim != PRIM_NONE) { + bool backfacing = (dot(sd->Ng, sd->I) < 0.0f); if(backfacing) { - ccl_fetch(sd, flag) |= SD_BACKFACING; - ccl_fetch(sd, Ng) = -ccl_fetch(sd, Ng); - ccl_fetch(sd, N) = -ccl_fetch(sd, N); + sd->flag |= SD_BACKFACING; + sd->Ng = -sd->Ng; + sd->N = -sd->N; #ifdef __DPDU__ - ccl_fetch(sd, dPdu) = -ccl_fetch(sd, dPdu); - ccl_fetch(sd, dPdv) = -ccl_fetch(sd, dPdv); + sd->dPdu = -sd->dPdu; + sd->dPdv = -sd->dPdv; #endif } } #ifdef __RAY_DIFFERENTIALS__ /* no ray differentials here yet */ - ccl_fetch(sd, dP) = differential3_zero(); - ccl_fetch(sd, dI) = differential3_zero(); - ccl_fetch(sd, du) = differential_zero(); - ccl_fetch(sd, dv) = differential_zero(); + sd->dP = differential3_zero(); + sd->dI = differential3_zero(); + sd->du = differential_zero(); + sd->dv = differential_zero(); #endif } @@ -378,39 +378,39 @@ ccl_device void shader_setup_from_displace(KernelGlobals *kg, ShaderData *sd, ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderData *sd, const Ray *ray) { /* vectors */ - ccl_fetch(sd, P) = ray->D; - ccl_fetch(sd, N) = -ray->D; - ccl_fetch(sd, Ng) = -ray->D; - ccl_fetch(sd, I) = -ray->D; - ccl_fetch(sd, shader) = kernel_data.background.surface_shader; - ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE); - ccl_fetch(sd, object_flag) = 0; + sd->P = ray->D; + sd->N = -ray->D; + sd->Ng = -ray->D; + sd->I = -ray->D; + sd->shader = kernel_data.background.surface_shader; + sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE); + sd->object_flag = 0; #ifdef __OBJECT_MOTION__ - ccl_fetch(sd, time) = ray->time; + sd->time = ray->time; #endif - ccl_fetch(sd, ray_length) = 0.0f; + sd->ray_length = 0.0f; #ifdef __INSTANCING__ - ccl_fetch(sd, object) = PRIM_NONE; + sd->object = PRIM_NONE; #endif - ccl_fetch(sd, prim) = PRIM_NONE; + sd->prim = PRIM_NONE; #ifdef __UV__ - ccl_fetch(sd, u) = 0.0f; - ccl_fetch(sd, v) = 0.0f; + sd->u = 0.0f; + sd->v = 0.0f; #endif #ifdef __DPDU__ /* dPdu/dPdv */ - ccl_fetch(sd, dPdu) = make_float3(0.0f, 0.0f, 0.0f); - ccl_fetch(sd, dPdv) = make_float3(0.0f, 0.0f, 0.0f); + sd->dPdu = make_float3(0.0f, 0.0f, 0.0f); + sd->dPdv = make_float3(0.0f, 0.0f, 0.0f); #endif #ifdef __RAY_DIFFERENTIALS__ /* differentials */ - ccl_fetch(sd, dP) = ray->dD; - differential_incoming(&ccl_fetch(sd, dI), ccl_fetch(sd, dP)); - ccl_fetch(sd, du) = differential_zero(); - ccl_fetch(sd, dv) = differential_zero(); + sd->dP = ray->dD; + differential_incoming(&sd->dI, sd->dP); + sd->du = 
differential_zero(); + sd->dv = differential_zero(); #endif } @@ -505,11 +505,11 @@ ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg, ShaderData *sd { /* this is the veach one-sample model with balance heuristic, some pdf * factors drop out when using balance heuristic weighting */ - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { + for(int i = 0; i < sd->num_closure; i++) { if(i == skip_bsdf) continue; - const ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + const ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSDF(sc->type)) { float bsdf_pdf = 0.0f; @@ -535,8 +535,8 @@ ccl_device_inline void _shader_bsdf_multi_eval_branched(KernelGlobals *kg, float light_pdf, bool use_mis) { - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - const ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSDF(sc->type)) { float bsdf_pdf = 0.0f; float3 eval = bsdf_eval(kg, sd, sc, omega_in, &bsdf_pdf); @@ -591,22 +591,22 @@ ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg, { int sampled = 0; - if(ccl_fetch(sd, num_closure) > 1) { + if(sd->num_closure > 1) { /* pick a BSDF closure based on sample weights */ float sum = 0.0f; - for(sampled = 0; sampled < ccl_fetch(sd, num_closure); sampled++) { - const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled); + for(sampled = 0; sampled < sd->num_closure; sampled++) { + const ShaderClosure *sc = &sd->closure[sampled]; if(CLOSURE_IS_BSDF(sc->type)) sum += sc->sample_weight; } - float r = ccl_fetch(sd, randb_closure)*sum; + float r = sd->randb_closure*sum; sum = 0.0f; - for(sampled = 0; sampled < ccl_fetch(sd, num_closure); sampled++) { - const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled); + for(sampled = 0; sampled < sd->num_closure; sampled++) { + const ShaderClosure *sc = &sd->closure[sampled]; if(CLOSURE_IS_BSDF(sc->type)) { sum += sc->sample_weight; @@ -616,13 +616,13 @@ ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg, } } - if(sampled == ccl_fetch(sd, num_closure)) { + if(sampled == sd->num_closure) { *pdf = 0.0f; return LABEL_NONE; } } - const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled); + const ShaderClosure *sc = &sd->closure[sampled]; int label; float3 eval; @@ -633,7 +633,7 @@ ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg, if(*pdf != 0.0f) { bsdf_eval_init(bsdf_eval, sc->type, eval*sc->weight, kernel_data.film.use_light_pass); - if(ccl_fetch(sd, num_closure) > 1) { + if(sd->num_closure > 1) { float sweight = sc->sample_weight; _shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sampled, bsdf_eval, *pdf*sweight, sweight); } @@ -660,8 +660,8 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg, ShaderData *sd, ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughness) { - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSDF(sc->type)) bsdf_blur(kg, sc, roughness); @@ -670,13 +670,13 @@ ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughn ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd) { - if(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME) + if(sd->flag & SD_HAS_ONLY_VOLUME) return make_float3(1.0f, 1.0f, 1.0f); float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - 
ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) // todo: make this work for osl eval += sc->weight; @@ -687,8 +687,8 @@ ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd) ccl_device void shader_bsdf_disable_transparency(KernelGlobals *kg, ShaderData *sd) { - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) { sc->sample_weight = 0.0f; @@ -711,8 +711,8 @@ ccl_device float3 shader_bsdf_diffuse(KernelGlobals *kg, ShaderData *sd) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSDF_DIFFUSE(sc->type)) eval += sc->weight; @@ -725,8 +725,8 @@ ccl_device float3 shader_bsdf_glossy(KernelGlobals *kg, ShaderData *sd) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSDF_GLOSSY(sc->type)) eval += sc->weight; @@ -739,8 +739,8 @@ ccl_device float3 shader_bsdf_transmission(KernelGlobals *kg, ShaderData *sd) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSDF_TRANSMISSION(sc->type)) eval += sc->weight; @@ -753,8 +753,8 @@ ccl_device float3 shader_bsdf_subsurface(KernelGlobals *kg, ShaderData *sd) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSSRDF(sc->type) || CLOSURE_IS_BSDF_BSSRDF(sc->type)) eval += sc->weight; @@ -768,8 +768,8 @@ ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_fac float3 eval = make_float3(0.0f, 0.0f, 0.0f); float3 N = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSDF_DIFFUSE(sc->type)) { const DiffuseBsdf *bsdf = (const DiffuseBsdf*)sc; @@ -778,12 +778,12 @@ ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_fac } else if(CLOSURE_IS_AMBIENT_OCCLUSION(sc->type)) { eval += sc->weight; - N += ccl_fetch(sd, N)*average(sc->weight); + N += sd->N*average(sc->weight); } } if(is_zero(N)) - N = ccl_fetch(sd, N); + N = sd->N; else N = normalize(N); @@ -798,8 +798,8 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b float3 N = make_float3(0.0f, 0.0f, 0.0f); float texture_blur = 0.0f, weight_sum = 0.0f; - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSSRDF(sc->type)) { const Bssrdf *bssrdf = (const 
Bssrdf*)sc; @@ -813,7 +813,7 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b } if(N_) - *N_ = (is_zero(N))? ccl_fetch(sd, N): normalize(N); + *N_ = (is_zero(N))? sd->N: normalize(N); if(texture_blur_) *texture_blur_ = texture_blur/weight_sum; @@ -826,7 +826,7 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b ccl_device float3 emissive_eval(KernelGlobals *kg, ShaderData *sd, ShaderClosure *sc) { - return emissive_simple_eval(ccl_fetch(sd, Ng), ccl_fetch(sd, I)); + return emissive_simple_eval(sd->Ng, sd->I); } ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd) @@ -834,8 +834,8 @@ ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd) float3 eval; eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_EMISSION(sc->type)) eval += emissive_eval(kg, sd, sc)*sc->weight; @@ -850,8 +850,8 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd) { float3 weight = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_HOLDOUT(sc->type)) weight += sc->weight; @@ -865,9 +865,9 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd) ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, ccl_addr_space RNG *rng, ccl_addr_space PathState *state, float randb, int path_flag, ShaderContext ctx) { - ccl_fetch(sd, num_closure) = 0; - ccl_fetch(sd, num_closure_extra) = 0; - ccl_fetch(sd, randb_closure) = randb; + sd->num_closure = 0; + sd->num_closure_extra = 0; + sd->randb_closure = randb; #ifdef __OSL__ if(kg->osl) @@ -881,13 +881,13 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, ccl_addr_ DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd, sizeof(DiffuseBsdf), make_float3(0.8f, 0.8f, 0.8f)); - bsdf->N = ccl_fetch(sd, N); - ccl_fetch(sd, flag) |= bsdf_diffuse_setup(bsdf); + bsdf->N = sd->N; + sd->flag |= bsdf_diffuse_setup(bsdf); #endif } - if(rng && (ccl_fetch(sd, flag) & SD_BSDF_NEEDS_LCG)) { - ccl_fetch(sd, lcg_state) = lcg_state_init_addrspace(rng, state, 0xb4bc3953); + if(rng && (sd->flag & SD_BSDF_NEEDS_LCG)) { + sd->lcg_state = lcg_state_init_addrspace(rng, state, 0xb4bc3953); } } @@ -896,9 +896,9 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, ccl_addr_ ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, int path_flag, ShaderContext ctx) { - ccl_fetch(sd, num_closure) = 0; - ccl_fetch(sd, num_closure_extra) = 0; - ccl_fetch(sd, randb_closure) = 0.0f; + sd->num_closure = 0; + sd->num_closure_extra = 0; + sd->randb_closure = 0.0f; #ifdef __SVM__ #ifdef __OSL__ @@ -913,8 +913,8 @@ ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - const ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BACKGROUND(sc->type)) eval += sc->weight; @@ -1093,9 +1093,9 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg, ccl_device void 
shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, ShaderContext ctx) { - ccl_fetch(sd, num_closure) = 0; - ccl_fetch(sd, num_closure_extra) = 0; - ccl_fetch(sd, randb_closure) = 0.0f; + sd->num_closure = 0; + sd->num_closure_extra = 0; + sd->randb_closure = 0.0f; /* this will modify sd->P */ #ifdef __SVM__ diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h index 06a77a208cb..2483c5f9ae1 100644 --- a/intern/cycles/kernel/kernel_shadow.h +++ b/intern/cycles/kernel/kernel_shadow.h @@ -45,7 +45,7 @@ ccl_device_forceinline bool shadow_handle_transparent_isect( /* Setup shader data at surface. */ shader_setup_from_ray(kg, shadow_sd, isect, ray); /* Attenuation from transparent surface. */ - if(!(ccl_fetch(shadow_sd, flag) & SD_HAS_ONLY_VOLUME)) { + if(!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) { path_state_modify_bounce(state, true); shader_eval_surface(kg, shadow_sd, @@ -180,7 +180,7 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg, return true; } /* Move ray forward. */ - ray->P = ccl_fetch(shadow_sd, P); + ray->P = shadow_sd->P; if(ray->t != FLT_MAX) { ray->D = normalize_len(Pend - ray->P, &ray->t); } @@ -248,7 +248,7 @@ ccl_device bool shadow_blocked_transparent_all(KernelGlobals *kg, } # endif /* __SHADOW_RECORD_ALL__ */ -# ifdef __KERNEL_GPU__ +# if defined(__KERNEL_GPU__) || !defined(__SHADOW_RECORD_ALL__) /* Shadow function to compute how much light is blocked, * * Here we raytrace from one transparent surface to the next step by step. @@ -308,7 +308,7 @@ ccl_device bool shadow_blocked_transparent_stepped_loop( return true; } /* Move ray forward. */ - ray->P = ray_offset(ccl_fetch(shadow_sd, P), -ccl_fetch(shadow_sd, Ng)); + ray->P = ray_offset(shadow_sd->P, -shadow_sd->Ng); if(ray->t != FLT_MAX) { ray->D = normalize_len(Pend - ray->P, &ray->t); } @@ -359,7 +359,7 @@ ccl_device bool shadow_blocked_transparent_stepped( shadow); } -# endif /* __KERNEL_GPU__ */ +# endif /* __KERNEL_GPU__ || !__SHADOW_RECORD_ALL__ */ #endif /* __TRANSPARENT_SHADOWS__ */ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, @@ -374,7 +374,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, #ifdef __SPLIT_KERNEL__ Ray private_ray = *ray_input; Ray *ray = &private_ray; - Intersection *isect = &kg->isect_shadow[SD_THREAD]; + Intersection *isect = &kernel_split_state.isect_shadow[ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0)]; #else /* __SPLIT_KERNEL__ */ Ray *ray = ray_input; Intersection isect_object; diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h index 52c05b85aee..a8fa6432542 100644 --- a/intern/cycles/kernel/kernel_subsurface.h +++ b/intern/cycles/kernel/kernel_subsurface.h @@ -298,20 +298,20 @@ ccl_device_inline int subsurface_scatter_multi_intersect( for(int hit = 0; hit < num_eval_hits; hit++) { /* Quickly retrieve P and Ng without setting up ShaderData. 
*/ float3 hit_P; - if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) { + if(sd->type & PRIMITIVE_TRIANGLE) { hit_P = triangle_refine_subsurface(kg, sd, &ss_isect->hits[hit], ray); } #ifdef __OBJECT_MOTION__ - else if(ccl_fetch(sd, type) & PRIMITIVE_MOTION_TRIANGLE) { + else if(sd->type & PRIMITIVE_MOTION_TRIANGLE) { float3 verts[3]; motion_triangle_vertices( kg, - ccl_fetch(sd, object), + sd->object, kernel_tex_fetch(__prim_index, ss_isect->hits[hit].prim), - ccl_fetch(sd, time), + sd->time, verts); hit_P = motion_triangle_refine_subsurface(kg, sd, diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index 8250eaf6073..a7faaef89ca 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -32,6 +32,11 @@ # define ccl_addr_space #endif +#if defined(__SPLIT_KERNEL__) && !defined(__COMPUTE_DEVICE_GPU__) +/* TODO(mai): need to investigate how this effects the kernel, as cpu kernel crashes without this right now */ +#define __COMPUTE_DEVICE_GPU__ +#endif + CCL_NAMESPACE_BEGIN /* constants */ @@ -56,6 +61,8 @@ CCL_NAMESPACE_BEGIN #define VOLUME_STACK_SIZE 16 +#define WORK_POOL_SIZE 64 + /* device capabilities */ #ifdef __KERNEL_CPU__ # ifdef __KERNEL_SSE2__ @@ -63,28 +70,36 @@ CCL_NAMESPACE_BEGIN # endif # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ -# define __BRANCHED_PATH__ +# ifndef __SPLIT_KERNEL__ +# define __BRANCHED_PATH__ +# endif # ifdef WITH_OSL # define __OSL__ # endif -# define __SUBSURFACE__ +# ifndef __SPLIT_KERNEL__ +# define __SUBSURFACE__ +# endif # define __CMJ__ -# define __VOLUME__ -# define __VOLUME_DECOUPLED__ -# define __VOLUME_SCATTER__ -# define __SHADOW_RECORD_ALL__ -# define __VOLUME_RECORD_ALL__ +# ifndef __SPLIT_KERNEL__ +# define __VOLUME__ +# define __VOLUME_DECOUPLED__ +# define __VOLUME_SCATTER__ +# define __SHADOW_RECORD_ALL__ +# define __VOLUME_RECORD_ALL__ +# endif #endif /* __KERNEL_CPU__ */ #ifdef __KERNEL_CUDA__ # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ -# define __BRANCHED_PATH__ -# define __VOLUME__ -# define __VOLUME_SCATTER__ -# define __SUBSURFACE__ -# define __CMJ__ -# define __SHADOW_RECORD_ALL__ +# ifndef __SPLIT_KERNEL__ +# define __BRANCHED_PATH__ +# define __VOLUME__ +# define __VOLUME_SCATTER__ +# define __SUBSURFACE__ +# define __CMJ__ +# define __SHADOW_RECORD_ALL__ +# endif #endif /* __KERNEL_CUDA__ */ #ifdef __KERNEL_OPENCL__ @@ -798,99 +813,77 @@ enum ShaderDataObjectFlag { SD_OBJECT_INTERSECTS_VOLUME) }; -#ifdef __SPLIT_KERNEL__ -# define SD_THREAD (get_global_id(1) * get_global_size(0) + get_global_id(0)) -# if !defined(__SPLIT_KERNEL_SOA__) - /* ShaderData is stored as an Array-of-Structures */ -# define ccl_soa_member(type, name) type soa_##name -# define ccl_fetch(s, t) (s[SD_THREAD].soa_##t) -# define ccl_fetch_array(s, t, index) (&s[SD_THREAD].soa_##t[index]) -# else - /* ShaderData is stored as an Structure-of-Arrays */ -# define SD_GLOBAL_SIZE (get_global_size(0) * get_global_size(1)) -# define SD_FIELD_SIZE(t) sizeof(((struct ShaderData*)0)->t) -# define SD_OFFSETOF(t) ((char*)(&((struct ShaderData*)0)->t) - (char*)0) -# define ccl_soa_member(type, name) type soa_##name -# define ccl_fetch(s, t) (((ShaderData*)((ccl_addr_space char*)s + SD_GLOBAL_SIZE * SD_OFFSETOF(soa_##t) + SD_FIELD_SIZE(soa_##t) * SD_THREAD - SD_OFFSETOF(soa_##t)))->soa_##t) -# define ccl_fetch_array(s, t, index) (&ccl_fetch(s, t)[index]) -# endif -#else -# define ccl_soa_member(type, name) type name -# define ccl_fetch(s, t) (s->t) -# define ccl_fetch_array(s, t, 
index) (&s->t[index]) -#endif - typedef ccl_addr_space struct ShaderData { /* position */ - ccl_soa_member(float3, P); + float3 P; /* smooth normal for shading */ - ccl_soa_member(float3, N); + float3 N; /* true geometric normal */ - ccl_soa_member(float3, Ng); + float3 Ng; /* view/incoming direction */ - ccl_soa_member(float3, I); + float3 I; /* shader id */ - ccl_soa_member(int, shader); + int shader; /* booleans describing shader, see ShaderDataFlag */ - ccl_soa_member(int, flag); + int flag; /* booleans describing object of the shader, see ShaderDataObjectFlag */ - ccl_soa_member(int, object_flag); + int object_flag; /* primitive id if there is one, ~0 otherwise */ - ccl_soa_member(int, prim); + int prim; /* combined type and curve segment for hair */ - ccl_soa_member(int, type); + int type; /* parametric coordinates * - barycentric weights for triangles */ - ccl_soa_member(float, u); - ccl_soa_member(float, v); + float u; + float v; /* object id if there is one, ~0 otherwise */ - ccl_soa_member(int, object); + int object; /* motion blur sample time */ - ccl_soa_member(float, time); + float time; /* length of the ray being shaded */ - ccl_soa_member(float, ray_length); + float ray_length; #ifdef __RAY_DIFFERENTIALS__ /* differential of P. these are orthogonal to Ng, not N */ - ccl_soa_member(differential3, dP); + differential3 dP; /* differential of I */ - ccl_soa_member(differential3, dI); + differential3 dI; /* differential of u, v */ - ccl_soa_member(differential, du); - ccl_soa_member(differential, dv); + differential du; + differential dv; #endif #ifdef __DPDU__ /* differential of P w.r.t. parametric coordinates. note that dPdu is * not readily suitable as a tangent for shading on triangles. */ - ccl_soa_member(float3, dPdu); - ccl_soa_member(float3, dPdv); + float3 dPdu; + float3 dPdv; #endif #ifdef __OBJECT_MOTION__ /* object <-> world space transformations, cached to avoid * re-interpolating them constantly for shading */ - ccl_soa_member(Transform, ob_tfm); - ccl_soa_member(Transform, ob_itfm); + Transform ob_tfm; + Transform ob_itfm; #endif /* Closure data, we store a fixed array of closures */ - ccl_soa_member(struct ShaderClosure, closure[MAX_CLOSURE]); - ccl_soa_member(int, num_closure); - ccl_soa_member(int, num_closure_extra); - ccl_soa_member(float, randb_closure); - ccl_soa_member(float3, svm_closure_weight); + struct ShaderClosure closure[MAX_CLOSURE]; + int num_closure; + int num_closure_extra; + float randb_closure; + float3 svm_closure_weight; /* LCG state for closures that require additional random numbers. 
*/ - ccl_soa_member(uint, lcg_state); + uint lcg_state; /* ray start position, only set for backgrounds */ - ccl_soa_member(float3, ray_P); - ccl_soa_member(differential3, ray_dP); + float3 ray_P; + differential3 ray_dP; #ifdef __OSL__ struct KernelGlobals *osl_globals; diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h index 7d559b1aa31..28fc5ce1c30 100644 --- a/intern/cycles/kernel/kernel_work_stealing.h +++ b/intern/cycles/kernel/kernel_work_stealing.h @@ -17,177 +17,102 @@ #ifndef __KERNEL_WORK_STEALING_H__ #define __KERNEL_WORK_STEALING_H__ +CCL_NAMESPACE_BEGIN + /* * Utility functions for work stealing */ -#ifdef __WORK_STEALING__ - #ifdef __KERNEL_OPENCL__ # pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable #endif -uint get_group_id_with_ray_index(uint ray_index, - uint tile_dim_x, - uint tile_dim_y, - uint parallel_samples, - int dim) +ccl_device_inline uint kernel_total_work_size(KernelGlobals *kg) +{ + return kernel_split_params.w * kernel_split_params.h * kernel_split_params.num_samples; +} + +ccl_device_inline uint kernel_num_work_pools(KernelGlobals *kg) +{ + return ccl_global_size(0) * ccl_global_size(1) / WORK_POOL_SIZE; +} + +ccl_device_inline uint work_pool_from_ray_index(KernelGlobals *kg, uint ray_index) +{ + return ray_index / WORK_POOL_SIZE; +} + +ccl_device_inline uint work_pool_work_size(KernelGlobals *kg, uint work_pool) { - if(dim == 0) { - uint x_span = ray_index % (tile_dim_x * parallel_samples); - return x_span / get_local_size(0); + uint total_work_size = kernel_total_work_size(kg); + uint num_pools = kernel_num_work_pools(kg); + + if(work_pool >= num_pools || work_pool * WORK_POOL_SIZE >= total_work_size) { + return 0; + } + + uint work_size = (total_work_size / (num_pools * WORK_POOL_SIZE)) * WORK_POOL_SIZE; + + uint remainder = (total_work_size % (num_pools * WORK_POOL_SIZE)); + if(work_pool < remainder / WORK_POOL_SIZE) { + work_size += WORK_POOL_SIZE; } - else /*if(dim == 1)*/ { - kernel_assert(dim == 1); - uint y_span = ray_index / (tile_dim_x * parallel_samples); - return y_span / get_local_size(1); + else if(work_pool == remainder / WORK_POOL_SIZE) { + work_size += remainder % WORK_POOL_SIZE; } + + return work_size; } -uint get_total_work(uint tile_dim_x, - uint tile_dim_y, - uint grp_idx, - uint grp_idy, - uint num_samples) +ccl_device_inline uint get_global_work_index(KernelGlobals *kg, uint work_index, uint ray_index) { - uint threads_within_tile_border_x = - (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0) - : get_local_size(0); - uint threads_within_tile_border_y = - (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1) - : get_local_size(1); - - threads_within_tile_border_x = - (threads_within_tile_border_x == 0) ? get_local_size(0) - : threads_within_tile_border_x; - threads_within_tile_border_y = - (threads_within_tile_border_y == 0) ? 
get_local_size(1) - : threads_within_tile_border_y; - - return threads_within_tile_border_x * - threads_within_tile_border_y * - num_samples; + uint num_pools = kernel_num_work_pools(kg); + uint pool = work_pool_from_ray_index(kg, ray_index); + + return (work_index / WORK_POOL_SIZE) * (num_pools * WORK_POOL_SIZE) + + (pool * WORK_POOL_SIZE) + + (work_index % WORK_POOL_SIZE); } -/* Returns 0 in case there is no next work available */ -/* Returns 1 in case work assigned is valid */ -int get_next_work(ccl_global uint *work_pool, - ccl_private uint *my_work, - uint tile_dim_x, - uint tile_dim_y, - uint num_samples, - uint parallel_samples, - uint ray_index) +/* Returns true if there is work */ +ccl_device bool get_next_work(KernelGlobals *kg, ccl_private uint *work_index, uint ray_index) { - uint grp_idx = get_group_id_with_ray_index(ray_index, - tile_dim_x, - tile_dim_y, - parallel_samples, - 0); - uint grp_idy = get_group_id_with_ray_index(ray_index, - tile_dim_x, - tile_dim_y, - parallel_samples, - 1); - uint total_work = get_total_work(tile_dim_x, - tile_dim_y, - grp_idx, - grp_idy, - num_samples); - uint group_index = grp_idy * get_num_groups(0) + grp_idx; - *my_work = atomic_inc(&work_pool[group_index]); - return (*my_work < total_work) ? 1 : 0; + uint work_pool = work_pool_from_ray_index(kg, ray_index); + uint pool_size = work_pool_work_size(kg, work_pool); + + if(pool_size == 0) { + return false; + } + + *work_index = atomic_fetch_and_inc_uint32(&kernel_split_params.work_pools[work_pool]); + return (*work_index < pool_size); } -/* This function assumes that the passed my_work is valid. */ -/* Decode sample number w.r.t. assigned my_work. */ -uint get_my_sample(uint my_work, - uint tile_dim_x, - uint tile_dim_y, - uint parallel_samples, - uint ray_index) +/* This function assumes that the passed `work` is valid. */ +/* Decode sample number w.r.t. assigned `work`. */ +ccl_device uint get_work_sample(KernelGlobals *kg, uint work_index, uint ray_index) { - uint grp_idx = get_group_id_with_ray_index(ray_index, - tile_dim_x, - tile_dim_y, - parallel_samples, - 0); - uint grp_idy = get_group_id_with_ray_index(ray_index, - tile_dim_x, - tile_dim_y, - parallel_samples, - 1); - uint threads_within_tile_border_x = - (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0) - : get_local_size(0); - uint threads_within_tile_border_y = - (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1) - : get_local_size(1); - - threads_within_tile_border_x = - (threads_within_tile_border_x == 0) ? get_local_size(0) - : threads_within_tile_border_x; - threads_within_tile_border_y = - (threads_within_tile_border_y == 0) ? get_local_size(1) - : threads_within_tile_border_y; - - return my_work / - (threads_within_tile_border_x * threads_within_tile_border_y); + return get_global_work_index(kg, work_index, ray_index) / (kernel_split_params.w * kernel_split_params.h); } -/* Decode pixel and tile position w.r.t. assigned my_work. */ -void get_pixel_tile_position(ccl_private uint *pixel_x, +/* Decode pixel and tile position w.r.t. assigned `work`. 
*/ +ccl_device void get_work_pixel_tile_position(KernelGlobals *kg, + ccl_private uint *pixel_x, ccl_private uint *pixel_y, ccl_private uint *tile_x, ccl_private uint *tile_y, - uint my_work, - uint tile_dim_x, - uint tile_dim_y, - uint tile_offset_x, - uint tile_offset_y, - uint parallel_samples, + uint work_index, uint ray_index) { - uint grp_idx = get_group_id_with_ray_index(ray_index, - tile_dim_x, - tile_dim_y, - parallel_samples, - 0); - uint grp_idy = get_group_id_with_ray_index(ray_index, - tile_dim_x, - tile_dim_y, - parallel_samples, - 1); - uint threads_within_tile_border_x = - (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0) - : get_local_size(0); - uint threads_within_tile_border_y = - (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1) - : get_local_size(1); - - threads_within_tile_border_x = - (threads_within_tile_border_x == 0) ? get_local_size(0) - : threads_within_tile_border_x; - threads_within_tile_border_y = - (threads_within_tile_border_y == 0) ? get_local_size(1) - : threads_within_tile_border_y; - - uint total_associated_pixels = - threads_within_tile_border_x * threads_within_tile_border_y; - uint work_group_pixel_index = my_work % total_associated_pixels; - uint work_group_pixel_x = - work_group_pixel_index % threads_within_tile_border_x; - uint work_group_pixel_y = - work_group_pixel_index / threads_within_tile_border_x; - - *pixel_x = - tile_offset_x + (grp_idx * get_local_size(0)) + work_group_pixel_x; - *pixel_y = - tile_offset_y + (grp_idy * get_local_size(1)) + work_group_pixel_y; - *tile_x = *pixel_x - tile_offset_x; - *tile_y = *pixel_y - tile_offset_y; + uint pixel_index = get_global_work_index(kg, work_index, ray_index) % (kernel_split_params.w*kernel_split_params.h); + + *tile_x = pixel_index % kernel_split_params.w; + *tile_y = pixel_index / kernel_split_params.w; + + *pixel_x = *tile_x + kernel_split_params.x; + *pixel_y = *tile_y + kernel_split_params.y; } -#endif /* __WORK_STEALING__ */ +CCL_NAMESPACE_END #endif /* __KERNEL_WORK_STEALING_H__ */ diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h index 1a07c705f1c..deb872444d0 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h @@ -49,4 +49,39 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, int offset, int sample); +/* Split kernels */ + +void KERNEL_FUNCTION_FULL_NAME(data_init)( + KernelGlobals *kg, + ccl_constant KernelData *data, + ccl_global void *split_data_buffer, + int num_elements, + ccl_global char *ray_state, + ccl_global uint *rng_state, + int start_sample, + int end_sample, + int sx, int sy, int sw, int sh, int offset, int stride, + ccl_global int *Queue_index, + int queuesize, + ccl_global char *use_queues_flag, + ccl_global unsigned int *work_pool_wgs, + unsigned int num_samples, + ccl_global float *buffer); + +#define DECLARE_SPLIT_KERNEL_FUNCTION(name) \ + void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData *data); + +DECLARE_SPLIT_KERNEL_FUNCTION(path_init) +DECLARE_SPLIT_KERNEL_FUNCTION(scene_intersect) +DECLARE_SPLIT_KERNEL_FUNCTION(lamp_emission) +DECLARE_SPLIT_KERNEL_FUNCTION(queue_enqueue) +DECLARE_SPLIT_KERNEL_FUNCTION(background_buffer_update) +DECLARE_SPLIT_KERNEL_FUNCTION(shader_eval) +DECLARE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao) +DECLARE_SPLIT_KERNEL_FUNCTION(direct_lighting) +DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked) 
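[Editor's note] The reworked work stealing helpers above drop the per-work-group bookkeeping in favour of flat work pools of WORK_POOL_SIZE threads, with get_next_work() handing out slots through an atomic increment of kernel_split_params.work_pools[pool]. As a hedged illustration only (not part of the patch), the following standalone C function mirrors the partitioning math of work_pool_work_size(); the parameter names are hypothetical.

/* Sketch of the work-pool partitioning used by work_pool_work_size(). */
static unsigned int sketch_work_pool_size(unsigned int total_work,
                                          unsigned int num_pools,
                                          unsigned int pool,
                                          unsigned int work_pool_size)
{
	if(pool >= num_pools || pool * work_pool_size >= total_work) {
		return 0;
	}

	/* Every pool receives the same number of full WORK_POOL_SIZE slices. */
	unsigned int size = (total_work / (num_pools * work_pool_size)) * work_pool_size;

	/* The leftover work is handed out one slice at a time to the first
	 * pools; exactly one pool may receive a partial slice. */
	unsigned int remainder = total_work % (num_pools * work_pool_size);
	if(pool < remainder / work_pool_size) {
		size += work_pool_size;
	}
	else if(pool == remainder / work_pool_size) {
		size += remainder % work_pool_size;
	}
	return size;
}

/* Example with made-up numbers: total_work = 1000, num_pools = 3,
 * work_pool_size = 64 -> pool 0 gets 360, pools 1 and 2 get 320 each,
 * which sums back to 1000. */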
+DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup) + +void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func)); + #undef KERNEL_ARCH diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h index ec82d4b4c22..d6d0db4e034 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h @@ -21,17 +21,39 @@ */ #include "kernel_compat_cpu.h" -#include "kernel_math.h" -#include "kernel_types.h" -#include "kernel_globals.h" -#include "kernel_cpu_image.h" -#include "kernel_film.h" -#include "kernel_path.h" -#include "kernel_path_branched.h" -#include "kernel_bake.h" + +#ifndef __SPLIT_KERNEL__ +# include "kernel_math.h" +# include "kernel_types.h" + +# include "split/kernel_split_data.h" +# include "kernel_globals.h" + +# include "kernel_cpu_image.h" +# include "kernel_film.h" +# include "kernel_path.h" +# include "kernel_path_branched.h" +# include "kernel_bake.h" +#else +# include "split/kernel_split_common.h" + +# include "split/kernel_data_init.h" +# include "split/kernel_path_init.h" +# include "split/kernel_scene_intersect.h" +# include "split/kernel_lamp_emission.h" +# include "split/kernel_queue_enqueue.h" +# include "split/kernel_background_buffer_update.h" +# include "split/kernel_shader_eval.h" +# include "split/kernel_holdout_emission_blurring_pathtermination_ao.h" +# include "split/kernel_direct_lighting.h" +# include "split/kernel_shadow_blocked.h" +# include "split/kernel_next_iteration_setup.h" +#endif CCL_NAMESPACE_BEGIN +#ifndef __SPLIT_KERNEL__ + /* Path Tracing */ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg, @@ -131,4 +153,55 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, } } +#else /* __SPLIT_KERNEL__ */ + +/* Split Kernel Path Tracing */ + +#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ + void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \ + { \ + kernel_##name(kg); \ + } + +DEFINE_SPLIT_KERNEL_FUNCTION(path_init) +DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect) +DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission) +DEFINE_SPLIT_KERNEL_FUNCTION(queue_enqueue) +DEFINE_SPLIT_KERNEL_FUNCTION(background_buffer_update) +DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval) +DEFINE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao) +DEFINE_SPLIT_KERNEL_FUNCTION(direct_lighting) +DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked) +DEFINE_SPLIT_KERNEL_FUNCTION(next_iteration_setup) + +void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func)) +{ +#define REGISTER_NAME_STRING(name) #name +#define REGISTER_EVAL_NAME(name) REGISTER_NAME_STRING(name) +#define REGISTER(name) reg(REGISTER_EVAL_NAME(KERNEL_FUNCTION_FULL_NAME(name)), (void*)KERNEL_FUNCTION_FULL_NAME(name)); + + REGISTER(path_trace); + REGISTER(convert_to_byte); + REGISTER(convert_to_half_float); + REGISTER(shader); + + REGISTER(data_init); + REGISTER(path_init); + REGISTER(scene_intersect); + REGISTER(lamp_emission); + REGISTER(queue_enqueue); + REGISTER(background_buffer_update); + REGISTER(shader_eval); + REGISTER(holdout_emission_blurring_pathtermination_ao); + REGISTER(direct_lighting); + REGISTER(shadow_blocked); + REGISTER(next_iteration_setup); + +#undef REGISTER +#undef REGISTER_EVAL_NAME +#undef REGISTER_NAME_STRING +} + +#endif /* __SPLIT_KERNEL__ */ + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split.cpp 
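[Editor's note] register_functions() above hands every split kernel entry point to a caller-supplied callback of type void(*)(const char *name, void *func). Below is a minimal sketch of a possible consumer, based only on that declaration; the registry type, the helper name and the kernel_<arch>_<name> naming pattern are illustrative assumptions, not taken from the patch.

#include <string>
#include <unordered_map>

/* Hypothetical host-side registry populated through the callback. */
static std::unordered_map<std::string, void*> sketch_registry;

static void sketch_register(const char *name, void *func)
{
	sketch_registry[name] = func;
}

/* Usage sketch, assuming KERNEL_FUNCTION_FULL_NAME expands to
 * kernel_<arch>_<name> for the architecture the library was built for:
 *
 *   kernel_cpu_avx2_register_functions(sketch_register);
 *   void *fn = sketch_registry["kernel_cpu_avx2_path_init"];
 */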
new file mode 100644 index 00000000000..30519dae53e --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/kernel_split.cpp @@ -0,0 +1,63 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CPU kernel entry points */ + +/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this + * one with SSE2 intrinsics. + */ +#if defined(__x86_64__) || defined(_M_X64) +# define __KERNEL_SSE2__ +#endif + +#define __SPLIT_KERNEL__ + +/* When building kernel for native machine detect kernel features from the flags + * set by compiler. + */ +#ifdef WITH_KERNEL_NATIVE +# ifdef __SSE2__ +# ifndef __KERNEL_SSE2__ +# define __KERNEL_SSE2__ +# endif +# endif +# ifdef __SSE3__ +# define __KERNEL_SSE3__ +# endif +# ifdef __SSSE3__ +# define __KERNEL_SSSE3__ +# endif +# ifdef __SSE4_1__ +# define __KERNEL_SSE41__ +# endif +# ifdef __AVX__ +# define __KERNEL_AVX__ +# endif +# ifdef __AVX2__ +# define __KERNEL_SSE__ +# define __KERNEL_AVX2__ +# endif +#endif + +/* quiet unused define warnings */ +#if defined(__KERNEL_SSE2__) + /* do nothing */ +#endif + +#include "kernel.h" +#define KERNEL_ARCH cpu +#include "kernel_cpu_impl.h" + diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp new file mode 100644 index 00000000000..335ad24bdc5 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp @@ -0,0 +1,38 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with AVX + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. 
*/ + +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +#endif + +#define __SPLIT_KERNEL__ + +#include "util_optimization.h" + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX +# include "kernel.h" +# define KERNEL_ARCH cpu_avx +# include "kernel_cpu_impl.h" +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp new file mode 100644 index 00000000000..765ba96aba3 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp @@ -0,0 +1,40 @@ +/* + * Copyright 2011-2014 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with AVX2 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# define __KERNEL_AVX2__ +#endif + +#define __SPLIT_KERNEL__ + +#include "util_optimization.h" + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 +# include "kernel.h" +# define KERNEL_ARCH cpu_avx2 +# include "kernel_cpu_impl.h" +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp new file mode 100644 index 00000000000..af244c03929 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp @@ -0,0 +1,34 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE2 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. 
*/ + +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +#endif + +#define __SPLIT_KERNEL__ + +#include "util_optimization.h" + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 +# include "kernel.h" +# define KERNEL_ARCH cpu_sse2 +# include "kernel_cpu_impl.h" +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp new file mode 100644 index 00000000000..d1b579eeac5 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp @@ -0,0 +1,36 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. */ + +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +#endif + +#define __SPLIT_KERNEL__ + +#include "util_optimization.h" + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 +# include "kernel.h" +# define KERNEL_ARCH cpu_sse3 +# include "kernel_cpu_impl.h" +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp new file mode 100644 index 00000000000..83d62de5aa5 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp @@ -0,0 +1,37 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. 
*/ + +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +#endif + +#define __SPLIT_KERNEL__ + +#include "util_optimization.h" + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 +# include "kernel.h" +# define KERNEL_ARCH cpu_sse41 +# include "kernel_cpu_impl.h" +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu index 090ab2c50c2..52e541321e3 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel.cu +++ b/intern/cycles/kernel/kernels/cuda/kernel.cu @@ -16,7 +16,10 @@ /* CUDA kernel entry points */ +#ifdef __CUDA_ARCH__ + #include "../../kernel_compat_cuda.h" +#include "kernel_config.h" #include "../../kernel_math.h" #include "../../kernel_types.h" #include "../../kernel_globals.h" @@ -25,104 +28,7 @@ #include "../../kernel_path_branched.h" #include "../../kernel_bake.h" -/* device data taken from CUDA occupancy calculator */ - -#ifdef __CUDA_ARCH__ - -/* 2.0 and 2.1 */ -#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 63 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 32 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40 - -/* 3.0 and 3.5 */ -#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 63 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 63 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* 3.2 */ -#elif __CUDA_ARCH__ == 320 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 63 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 63 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* 3.7 */ -#elif __CUDA_ARCH__ == 370 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 255 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 63 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* 5.0, 5.2, 5.3, 6.0, 6.1 */ -#elif __CUDA_ARCH__ >= 500 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 255 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 48 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* unknown architecture */ -#else -# error "Unknown or unsupported CUDA architecture, can't determine launch bounds" -#endif - -/* compute number of threads per block and minimum blocks per multiprocessor - * given the maximum number of registers per thread */ - -#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \ - __launch_bounds__( \ - threads_block_width*threads_block_width, \ - CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \ - ) - -/* 
sanity checks */ - -#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS -# error "Maximum number of threads per block exceeded" -#endif - -#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS -# error "Maximum number of blocks per multiprocessor exceeded" -#endif - -#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS -# error "Maximum number of registers per thread exceeded" -#endif - -#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS -# error "Maximum number of registers per thread exceeded" -#endif - /* kernels */ - extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride) diff --git a/intern/cycles/kernel/kernels/cuda/kernel_config.h b/intern/cycles/kernel/kernels/cuda/kernel_config.h new file mode 100644 index 00000000000..9fa39dc9ebb --- /dev/null +++ b/intern/cycles/kernel/kernels/cuda/kernel_config.h @@ -0,0 +1,110 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* device data taken from CUDA occupancy calculator */ + +/* 2.0 and 2.1 */ +#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210 +# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768 +# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8 +# define CUDA_BLOCK_MAX_THREADS 1024 +# define CUDA_THREAD_MAX_REGISTERS 63 + +/* tunable parameters */ +# define CUDA_THREADS_BLOCK_WIDTH 16 +# define CUDA_KERNEL_MAX_REGISTERS 32 +# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40 + +/* 3.0 and 3.5 */ +#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350 +# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 +# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 +# define CUDA_BLOCK_MAX_THREADS 1024 +# define CUDA_THREAD_MAX_REGISTERS 63 + +/* tunable parameters */ +# define CUDA_THREADS_BLOCK_WIDTH 16 +# define CUDA_KERNEL_MAX_REGISTERS 63 +# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 + +/* 3.2 */ +#elif __CUDA_ARCH__ == 320 +# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768 +# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 +# define CUDA_BLOCK_MAX_THREADS 1024 +# define CUDA_THREAD_MAX_REGISTERS 63 + +/* tunable parameters */ +# define CUDA_THREADS_BLOCK_WIDTH 16 +# define CUDA_KERNEL_MAX_REGISTERS 63 +# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 + +/* 3.7 */ +#elif __CUDA_ARCH__ == 370 +# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 +# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 +# define CUDA_BLOCK_MAX_THREADS 1024 +# define CUDA_THREAD_MAX_REGISTERS 255 + +/* tunable parameters */ +# define CUDA_THREADS_BLOCK_WIDTH 16 +# define CUDA_KERNEL_MAX_REGISTERS 63 +# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 + +/* 5.0, 5.2, 5.3, 6.0, 6.1 */ +#elif __CUDA_ARCH__ >= 500 +# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 +# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32 +# define CUDA_BLOCK_MAX_THREADS 1024 +# define 
CUDA_THREAD_MAX_REGISTERS 255 + +/* tunable parameters */ +# define CUDA_THREADS_BLOCK_WIDTH 16 +# define CUDA_KERNEL_MAX_REGISTERS 48 +# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 + +/* unknown architecture */ +#else +# error "Unknown or unsupported CUDA architecture, can't determine launch bounds" +#endif + +/* compute number of threads per block and minimum blocks per multiprocessor + * given the maximum number of registers per thread */ + +#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \ + __launch_bounds__( \ + threads_block_width*threads_block_width, \ + CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \ + ) + +/* sanity checks */ + +#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS +# error "Maximum number of threads per block exceeded" +#endif + +#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS +# error "Maximum number of blocks per multiprocessor exceeded" +#endif + +#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS +# error "Maximum number of registers per thread exceeded" +#endif + +#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS +# error "Maximum number of registers per thread exceeded" +#endif + diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu new file mode 100644 index 00000000000..759475b175f --- /dev/null +++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu @@ -0,0 +1,125 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
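[Editor's note] For a concrete sense of the launch-bounds macro above, here is the expansion worked out for one architecture bucket of kernel_config.h; this is arithmetic only and not part of the patch.

/* Values for __CUDA_ARCH__ >= 500:
 *   CUDA_MULTIPRESSOR_MAX_REGISTERS = 65536
 *   CUDA_THREADS_BLOCK_WIDTH        = 16
 *   CUDA_KERNEL_MAX_REGISTERS       = 48
 *
 *   CUDA_LAUNCH_BOUNDS(16, 48)
 *     -> __launch_bounds__(16*16, 65536 / (16*16*48))
 *     -> __launch_bounds__(256, 5)       (integer division: 65536/12288 = 5)
 *
 * i.e. 256 threads per block and at least 5 resident blocks requested per
 * multiprocessor, comfortably within the 32-block limit that the sanity
 * checks above guard against exceeding. */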
+ */ + +/* CUDA split kernel entry points */ + +#ifdef __CUDA_ARCH__ + +#define __SPLIT_KERNEL__ + +#include "../../kernel_compat_cuda.h" +#include "kernel_config.h" + +#include "../../split/kernel_split_common.h" +#include "../../split/kernel_data_init.h" +#include "../../split/kernel_path_init.h" +#include "../../split/kernel_scene_intersect.h" +#include "../../split/kernel_lamp_emission.h" +#include "../../split/kernel_queue_enqueue.h" +#include "../../split/kernel_background_buffer_update.h" +#include "../../split/kernel_shader_eval.h" +#include "../../split/kernel_holdout_emission_blurring_pathtermination_ao.h" +#include "../../split/kernel_direct_lighting.h" +#include "../../split/kernel_shadow_blocked.h" +#include "../../split/kernel_next_iteration_setup.h" + +#include "../../kernel_film.h" + +/* kernels */ +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_state_buffer_size(uint num_threads, uint *size) +{ + *size = split_data_buffer_size(NULL, num_threads); +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_path_trace_data_init( + ccl_global void *split_data_buffer, + int num_elements, + ccl_global char *ray_state, + ccl_global uint *rng_state, + int start_sample, + int end_sample, + int sx, int sy, int sw, int sh, int offset, int stride, + ccl_global int *Queue_index, + int queuesize, + ccl_global char *use_queues_flag, + ccl_global unsigned int *work_pool_wgs, + unsigned int num_samples, + ccl_global float *buffer) +{ + kernel_data_init(NULL, + NULL, + split_data_buffer, + num_elements, + ray_state, + rng_state, + start_sample, + end_sample, + sx, sy, sw, sh, offset, stride, + Queue_index, + queuesize, + use_queues_flag, + work_pool_wgs, + num_samples, + buffer); +} + +#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ + extern "C" __global__ void \ + CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \ + kernel_cuda_##name() \ + { \ + kernel_##name(NULL); \ + } + +DEFINE_SPLIT_KERNEL_FUNCTION(path_init) +DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect) +DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission) +DEFINE_SPLIT_KERNEL_FUNCTION(queue_enqueue) +DEFINE_SPLIT_KERNEL_FUNCTION(background_buffer_update) +DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval) +DEFINE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao) +DEFINE_SPLIT_KERNEL_FUNCTION(direct_lighting) +DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked) +DEFINE_SPLIT_KERNEL_FUNCTION(next_iteration_setup) + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) +{ + int x = sx + blockDim.x*blockIdx.x + threadIdx.x; + int y = sy + blockDim.y*blockIdx.y + threadIdx.y; + + if(x < sx + sw && y < sy + sh) + kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride); +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) +{ + int x = sx + blockDim.x*blockIdx.x + threadIdx.x; + int y = sy + blockDim.y*blockIdx.y + threadIdx.y; + + if(x < sx + sw && y < sy + sh) + kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride); +} + +#endif + diff --git 
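[Editor's note] DEFINE_SPLIT_KERNEL_FUNCTION() in kernel_split.cu above generates one global entry point per split-kernel stage. Expanded by hand for path_init, purely for illustration (the expansion follows directly from the macro text):

extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_path_init()
{
	/* Forwards to the shared split-kernel implementation. */
	kernel_path_init(NULL);
}

Passing NULL for the KernelGlobals pointer suggests the CUDA backend reaches its globals through constant/texture memory rather than a per-call argument; that reading is an inference from the code above, not something the patch states.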
a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl index a68f97857b6..52406d2f548 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel.cl @@ -67,8 +67,8 @@ __kernel void kernel_ocl_path_trace( kg->name = name; #include "../../kernel_textures.h" - int x = sx + get_global_id(0); - int y = sy + get_global_id(1); + int x = sx + ccl_global_id(0); + int y = sy + ccl_global_id(1); if(x < sx + sw && y < sy + sh) kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride); @@ -96,7 +96,7 @@ __kernel void kernel_ocl_shader( kg->name = name; #include "../../kernel_textures.h" - int x = sx + get_global_id(0); + int x = sx + ccl_global_id(0); if(x < sx + sw) { kernel_shader_evaluate(kg, @@ -128,7 +128,7 @@ __kernel void kernel_ocl_bake( kg->name = name; #include "../../kernel_textures.h" - int x = sx + get_global_id(0); + int x = sx + ccl_global_id(0); if(x < sx + sw) { #ifdef __NO_BAKING__ @@ -159,8 +159,8 @@ __kernel void kernel_ocl_convert_to_byte( kg->name = name; #include "../../kernel_textures.h" - int x = sx + get_global_id(0); - int y = sy + get_global_id(1); + int x = sx + ccl_global_id(0); + int y = sy + ccl_global_id(1); if(x < sx + sw && y < sy + sh) kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride); @@ -186,11 +186,27 @@ __kernel void kernel_ocl_convert_to_half_float( kg->name = name; #include "../../kernel_textures.h" - int x = sx + get_global_id(0); - int y = sy + get_global_id(1); + int x = sx + ccl_global_id(0); + int y = sy + ccl_global_id(1); if(x < sx + sw && y < sy + sh) kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride); } +__kernel void kernel_ocl_zero_buffer(ccl_global float4 *buffer, ulong size, ulong offset) +{ + size_t i = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0); + + if(i < size / sizeof(float4)) { + buffer[i+offset/sizeof(float4)] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + else if(i == size / sizeof(float4)) { + ccl_global uchar *b = (ccl_global uchar*)&buffer[i+offset/sizeof(float4)]; + + for(i = 0; i < size % sizeof(float4); i++) { + *(b++) = 0; + } + } +} + #endif /* __COMPILE_ONLY_MEGAKERNEL__ */ diff --git a/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl b/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl index 1914d241eb1..47e363f6e03 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl @@ -14,112 +14,13 @@ * limitations under the License. 
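[Editor's note] The new kernel_ocl_zero_buffer kernel above clears `size` bytes starting `offset` bytes into the buffer, one float4 store per work item plus a byte-wise tail. A worked example with made-up numbers, not from the patch:

/* size = 100, offset = 0, sizeof(float4) = 16:
 *   - work items i = 0..5 each store one zeroed float4 (96 bytes total);
 *   - work item i = 6 (== size / sizeof(float4)) clears the remaining
 *     size % sizeof(float4) = 4 bytes one uchar at a time;
 *   - work items with i > 6 write nothing. */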
*/ +#include "kernel_compat_opencl.h" +#include "split/kernel_split_common.h" #include "split/kernel_background_buffer_update.h" __kernel void kernel_ocl_path_trace_background_buffer_update( - ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global float *per_sample_output_buffers, - ccl_global uint *rng_state, - ccl_global uint *rng_coop, /* Required for buffer Update */ - ccl_global float3 *throughput_coop, /* Required for background hit processing */ - PathRadiance *PathRadiance_coop, /* Required for background hit processing and buffer Update */ - ccl_global Ray *Ray_coop, /* Required for background hit processing */ - ccl_global PathState *PathState_coop, /* Required for background hit processing */ - ccl_global float *L_transparent_coop, /* Required for background hit processing and buffer Update */ - ccl_global char *ray_state, /* Stores information on the current state of a ray */ - int sw, int sh, int sx, int sy, int stride, - int rng_state_offset_x, - int rng_state_offset_y, - int rng_state_stride, - ccl_global unsigned int *work_array, /* Denotes work of each ray */ - ccl_global int *Queue_data, /* Queues memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - int queuesize, /* Size (capacity) of each queue */ - int end_sample, - int start_sample, -#ifdef __WORK_STEALING__ - ccl_global unsigned int *work_pool_wgs, - unsigned int num_samples, -#endif -#ifdef __KERNEL_DEBUG__ - DebugData *debugdata_coop, -#endif - int parallel_samples) /* Number of samples to be processed in parallel */ + KernelGlobals *kg, + ccl_constant KernelData *data) { - ccl_local unsigned int local_queue_atomics; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_queue_atomics = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - if(ray_index == 0) { - /* We will empty this queue in this kernel. */ - Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; - } - char enqueue_flag = 0; - ray_index = get_ray_index(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - Queue_data, - queuesize, - 1); - -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not - * required. - * - * If we are executing on a CPU device, then we need to keep all threads - * active since we have barrier() calls later in the kernel. CPU devices, - * expect all threads to execute barrier statement. - */ - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } -#endif - -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif - enqueue_flag = - kernel_background_buffer_update((KernelGlobals *)kg, - per_sample_output_buffers, - rng_state, - rng_coop, - throughput_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - L_transparent_coop, - ray_state, - sw, sh, sx, sy, stride, - rng_state_offset_x, - rng_state_offset_y, - rng_state_stride, - work_array, - end_sample, - start_sample, -#ifdef __WORK_STEALING__ - work_pool_wgs, - num_samples, -#endif -#ifdef __KERNEL_DEBUG__ - debugdata_coop, -#endif - parallel_samples, - ray_index); -#ifndef __COMPUTE_DEVICE_GPU__ - } -#endif - - /* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS; - * These rays will be made active during next SceneIntersectkernel. 
- */ - enqueue_ray_index_local(ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - enqueue_flag, - queuesize, - &local_queue_atomics, - Queue_data, - Queue_index); + kernel_background_buffer_update(kg); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl index 18139687eab..1e3c4fa28c7 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl @@ -14,77 +14,49 @@ * limitations under the License. */ +#include "kernel_compat_opencl.h" +#include "split/kernel_split_common.h" #include "split/kernel_data_init.h" __kernel void kernel_ocl_path_trace_data_init( - ccl_global char *globals, - ccl_global char *sd_DL_shadow, + KernelGlobals *kg, ccl_constant KernelData *data, - ccl_global float *per_sample_output_buffers, + ccl_global void *split_data_buffer, + int num_elements, + ccl_global char *ray_state, ccl_global uint *rng_state, - ccl_global uint *rng_coop, /* rng array to store rng values for all rays */ - ccl_global float3 *throughput_coop, /* throughput array to store throughput values for all rays */ - ccl_global float *L_transparent_coop, /* L_transparent array to store L_transparent values for all rays */ - PathRadiance *PathRadiance_coop, /* PathRadiance array to store PathRadiance values for all rays */ - ccl_global Ray *Ray_coop, /* Ray array to store Ray information for all rays */ - ccl_global PathState *PathState_coop, /* PathState array to store PathState information for all rays */ - Intersection *Intersection_coop_shadow, - ccl_global char *ray_state, /* Stores information on current state of a ray */ #define KERNEL_TEX(type, ttype, name) \ ccl_global type *name, #include "../../kernel_textures.h" - int start_sample, int sx, int sy, int sw, int sh, int offset, int stride, - int rng_state_offset_x, - int rng_state_offset_y, - int rng_state_stride, - ccl_global int *Queue_data, /* Memory for queues */ + int start_sample, + int end_sample, + int sx, int sy, int sw, int sh, int offset, int stride, ccl_global int *Queue_index, /* Tracks the number of elements in queues */ int queuesize, /* size (capacity) of the queue */ ccl_global char *use_queues_flag, /* flag to decide if scene-intersect kernel should use queues to fetch ray index */ - ccl_global unsigned int *work_array, /* work array to store which work each ray belongs to */ -#ifdef __WORK_STEALING__ ccl_global unsigned int *work_pool_wgs, /* Work pool for each work group */ unsigned int num_samples, /* Total number of samples per pixel */ -#endif -#ifdef __KERNEL_DEBUG__ - DebugData *debugdata_coop, -#endif - int parallel_samples) /* Number of samples to be processed in parallel */ + ccl_global float *buffer) { - kernel_data_init((KernelGlobals *)globals, - (ShaderData *)sd_DL_shadow, + kernel_data_init(kg, data, - per_sample_output_buffers, - rng_state, - rng_coop, - throughput_coop, - L_transparent_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - Intersection_coop_shadow, + split_data_buffer, + num_elements, ray_state, + rng_state, #define KERNEL_TEX(type, ttype, name) name, #include "../../kernel_textures.h" - start_sample, sx, sy, sw, sh, offset, stride, - rng_state_offset_x, - rng_state_offset_y, - rng_state_stride, - Queue_data, + start_sample, + end_sample, + sx, sy, sw, sh, offset, stride, Queue_index, queuesize, use_queues_flag, - work_array, -#ifdef __WORK_STEALING__ work_pool_wgs, num_samples, -#endif -#ifdef __KERNEL_DEBUG__ - debugdata_coop, -#endif - 
parallel_samples); + buffer); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl index c6a2c8d050c..5d2f46b319d 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl @@ -14,74 +14,13 @@ * limitations under the License. */ +#include "kernel_compat_opencl.h" +#include "split/kernel_split_common.h" #include "split/kernel_direct_lighting.h" __kernel void kernel_ocl_path_trace_direct_lighting( - ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global char *sd, /* Required for direct lighting */ - ccl_global uint *rng_coop, /* Required for direct lighting */ - ccl_global PathState *PathState_coop, /* Required for direct lighting */ - ccl_global int *ISLamp_coop, /* Required for direct lighting */ - ccl_global Ray *LightRay_coop, /* Required for direct lighting */ - ccl_global BsdfEval *BSDFEval_coop, /* Required for direct lighting */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - ccl_global int *Queue_data, /* Queue memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - int queuesize) /* Size (capacity) of each queue */ + KernelGlobals *kg, + ccl_constant KernelData *data) { - ccl_local unsigned int local_queue_atomics; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_queue_atomics = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - char enqueue_flag = 0; - int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - ray_index = get_ray_index(ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - Queue_data, - queuesize, - 0); - -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not - * required. - * - * If we are executing on a CPU device, then we need to keep all threads - * active since we have barrier() calls later in the kernel. CPU devices, - * expect all threads to execute barrier statement. - */ - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } -#endif - -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif - enqueue_flag = kernel_direct_lighting((KernelGlobals *)kg, - (ShaderData *)sd, - rng_coop, - PathState_coop, - ISLamp_coop, - LightRay_coop, - BSDFEval_coop, - ray_state, - ray_index); - -#ifndef __COMPUTE_DEVICE_GPU__ - } -#endif - -#ifdef __EMISSION__ - /* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */ - enqueue_ray_index_local(ray_index, - QUEUE_SHADOW_RAY_CAST_DL_RAYS, - enqueue_flag, - queuesize, - &local_queue_atomics, - Queue_data, - Queue_index); -#endif + kernel_direct_lighting(kg); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl index e063614da1a..7724b8a0bdf 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl @@ -14,110 +14,13 @@ * limitations under the License. 
*/ +#include "kernel_compat_opencl.h" +#include "split/kernel_split_common.h" #include "split/kernel_holdout_emission_blurring_pathtermination_ao.h" __kernel void kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao( - ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global char *sd, /* Required throughout the kernel except probabilistic path termination and AO */ - ccl_global float *per_sample_output_buffers, - ccl_global uint *rng_coop, /* Required for "kernel_write_data_passes" and AO */ - ccl_global float3 *throughput_coop, /* Required for handling holdout material and AO */ - ccl_global float *L_transparent_coop, /* Required for handling holdout material */ - PathRadiance *PathRadiance_coop, /* Required for "kernel_write_data_passes" and indirect primitive emission */ - ccl_global PathState *PathState_coop, /* Required throughout the kernel and AO */ - Intersection *Intersection_coop, /* Required for indirect primitive emission */ - ccl_global float3 *AOAlpha_coop, /* Required for AO */ - ccl_global float3 *AOBSDF_coop, /* Required for AO */ - ccl_global Ray *AOLightRay_coop, /* Required for AO */ - int sw, int sh, int sx, int sy, int stride, - ccl_global char *ray_state, /* Denotes the state of each ray */ - ccl_global unsigned int *work_array, /* Denotes the work that each ray belongs to */ - ccl_global int *Queue_data, /* Queue memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - int queuesize, /* Size (capacity) of each queue */ -#ifdef __WORK_STEALING__ - unsigned int start_sample, -#endif - int parallel_samples) /* Number of samples to be processed in parallel */ + KernelGlobals *kg, + ccl_constant KernelData *data) { - ccl_local unsigned int local_queue_atomics_bg; - ccl_local unsigned int local_queue_atomics_ao; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_queue_atomics_bg = 0; - local_queue_atomics_ao = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - char enqueue_flag = 0; - char enqueue_flag_AO_SHADOW_RAY_CAST = 0; - int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - ray_index = get_ray_index(ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - Queue_data, - queuesize, - 0); - -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not - * required. - * - * If we are executing on a CPU device, then we need to keep all threads - * active since we have barrier() calls later in the kernel. CPU devices, - * expect all threads to execute barrier statement. - */ - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } -#endif /* __COMPUTE_DEVICE_GPU__ */ - -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif - kernel_holdout_emission_blurring_pathtermination_ao( - (KernelGlobals *)kg, - (ShaderData *)sd, - per_sample_output_buffers, - rng_coop, - throughput_coop, - L_transparent_coop, - PathRadiance_coop, - PathState_coop, - Intersection_coop, - AOAlpha_coop, - AOBSDF_coop, - AOLightRay_coop, - sw, sh, sx, sy, stride, - ray_state, - work_array, -#ifdef __WORK_STEALING__ - start_sample, -#endif - parallel_samples, - ray_index, - &enqueue_flag, - &enqueue_flag_AO_SHADOW_RAY_CAST); -#ifndef __COMPUTE_DEVICE_GPU__ - } -#endif - - /* Enqueue RAY_UPDATE_BUFFER rays. */ - enqueue_ray_index_local(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - enqueue_flag, - queuesize, - &local_queue_atomics_bg, - Queue_data, - Queue_index); - -#ifdef __AO__ - /* Enqueue to-shadow-ray-cast rays. 
*/ - enqueue_ray_index_local(ray_index, - QUEUE_SHADOW_RAY_CAST_AO_RAYS, - enqueue_flag_AO_SHADOW_RAY_CAST, - queuesize, - &local_queue_atomics_ao, - Queue_data, - Queue_index); -#endif + kernel_holdout_emission_blurring_pathtermination_ao(kg); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl index 267bddc2ffc..2b84d0ea43e 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl @@ -14,67 +14,13 @@ * limitations under the License. */ +#include "kernel_compat_opencl.h" +#include "split/kernel_split_common.h" #include "split/kernel_lamp_emission.h" __kernel void kernel_ocl_path_trace_lamp_emission( - ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global float3 *throughput_coop, /* Required for lamp emission */ - PathRadiance *PathRadiance_coop, /* Required for lamp emission */ - ccl_global Ray *Ray_coop, /* Required for lamp emission */ - ccl_global PathState *PathState_coop, /* Required for lamp emission */ - Intersection *Intersection_coop, /* Required for lamp emission */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int sw, int sh, - ccl_global int *Queue_data, /* Memory for queues */ - ccl_global int *Queue_index, /* Tracks the number of elements in queues */ - int queuesize, /* Size (capacity) of queues */ - ccl_global char *use_queues_flag, /* Used to decide if this kernel should use - * queues to fetch ray index - */ - int parallel_samples) /* Number of samples to be processed in parallel */ + KernelGlobals *kg, + ccl_constant KernelData *data) { - int x = get_global_id(0); - int y = get_global_id(1); - - /* We will empty this queue in this kernel. */ - if(get_global_id(0) == 0 && get_global_id(1) == 0) { - Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; - } - /* Fetch use_queues_flag. */ - ccl_local char local_use_queues_flag; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_use_queues_flag = use_queues_flag[0]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - int ray_index; - if(local_use_queues_flag) { - int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - ray_index = get_ray_index(thread_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - Queue_data, - queuesize, - 1); - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } - } else { - if(x < (sw * parallel_samples) && y < sh) { - ray_index = x + y * (sw * parallel_samples); - } else { - return; - } - } - - kernel_lamp_emission((KernelGlobals *)kg, - throughput_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - Intersection_coop, - ray_state, - sw, sh, - use_queues_flag, - ray_index); + kernel_lamp_emission(kg); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl index 6d49b6294a8..e87e367fb9c 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl @@ -14,101 +14,13 @@ * limitations under the License. 
*/ +#include "kernel_compat_opencl.h" +#include "split/kernel_split_common.h" #include "split/kernel_next_iteration_setup.h" __kernel void kernel_ocl_path_trace_next_iteration_setup( - ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global char *sd, /* Required for setting up ray for next iteration */ - ccl_global uint *rng_coop, /* Required for setting up ray for next iteration */ - ccl_global float3 *throughput_coop, /* Required for setting up ray for next iteration */ - PathRadiance *PathRadiance_coop, /* Required for setting up ray for next iteration */ - ccl_global Ray *Ray_coop, /* Required for setting up ray for next iteration */ - ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */ - ccl_global Ray *LightRay_dl_coop, /* Required for radiance update - direct lighting */ - ccl_global int *ISLamp_coop, /* Required for radiance update - direct lighting */ - ccl_global BsdfEval *BSDFEval_coop, /* Required for radiance update - direct lighting */ - ccl_global Ray *LightRay_ao_coop, /* Required for radiance update - AO */ - ccl_global float3 *AOBSDF_coop, /* Required for radiance update - AO */ - ccl_global float3 *AOAlpha_coop, /* Required for radiance update - AO */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - ccl_global int *Queue_data, /* Queue memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - int queuesize, /* Size (capacity) of each queue */ - ccl_global char *use_queues_flag) /* flag to decide if scene_intersect kernel should - * use queues to fetch ray index */ + KernelGlobals *kg, + ccl_constant KernelData *data) { - ccl_local unsigned int local_queue_atomics; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_queue_atomics = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - if(get_global_id(0) == 0 && get_global_id(1) == 0) { - /* If we are here, then it means that scene-intersect kernel - * has already been executed atleast once. From the next time, - * scene-intersect kernel may operate on queues to fetch ray index - */ - use_queues_flag[0] = 1; - - /* Mark queue indices of QUEUE_SHADOW_RAY_CAST_AO_RAYS and - * QUEUE_SHADOW_RAY_CAST_DL_RAYS queues that were made empty during the - * previous kernel. - */ - Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0; - Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0; - } - - char enqueue_flag = 0; - int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - ray_index = get_ray_index(ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - Queue_data, - queuesize, - 0); - -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not - * required. - * - * If we are executing on a CPU device, then we need to keep all threads - * active since we have barrier() calls later in the kernel. CPU devices, - * expect all threads to execute barrier statement. - */ - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } -#endif - -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif - enqueue_flag = kernel_next_iteration_setup((KernelGlobals *)kg, - (ShaderData *)sd, - rng_coop, - throughput_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - LightRay_dl_coop, - ISLamp_coop, - BSDFEval_coop, - LightRay_ao_coop, - AOBSDF_coop, - AOAlpha_coop, - ray_state, - use_queues_flag, - ray_index); -#ifndef __COMPUTE_DEVICE_GPU__ - } -#endif - - /* Enqueue RAY_UPDATE_BUFFER rays. 
*/ - enqueue_ray_index_local(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - enqueue_flag, - queuesize, - &local_queue_atomics, - Queue_data, - Queue_index); + kernel_next_iteration_setup(kg); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl new file mode 100644 index 00000000000..7e9e4a02529 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl @@ -0,0 +1,26 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_compat_opencl.h" +#include "split/kernel_split_common.h" +#include "split/kernel_path_init.h" + +__kernel void kernel_ocl_path_trace_path_init( + KernelGlobals *kg, + ccl_constant KernelData *data) +{ + kernel_path_init(kg); +} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl index 3156dc255fb..9ceb6a5c3d8 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl @@ -14,93 +14,13 @@ * limitations under the License. */ -#include "../../kernel_compat_opencl.h" -#include "../../kernel_math.h" -#include "../../kernel_types.h" -#include "../../kernel_globals.h" -#include "../../kernel_queues.h" +#include "kernel_compat_opencl.h" +#include "split/kernel_split_common.h" +#include "split/kernel_queue_enqueue.h" -/* - * The kernel "kernel_queue_enqueue" enqueues rays of - * different ray state into their appropriate Queues; - * 1. Rays that have been determined to hit the background from the - * "kernel_scene_intersect" kernel - * are enqueued in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; - * 2. Rays that have been determined to be actively participating in path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS. - * - * The input and output of the kernel is as follows, - * - * ray_state -------------------------------------------|--- kernel_queue_enqueue --|--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS) - * Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------| |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS) - * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---| | - * queuesize -------------------------------------------| | - * - * Note on Queues : - * State of queues during the first time this kernel is called : - * At entry, - * Both QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty. - * At exit, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays - * QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_HIT_BACKGROUND rays. - * - * State of queue during other times this kernel is called : - * At entry, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty. - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays. 
- * At exit, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays. - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays. - */ __kernel void kernel_ocl_path_trace_queue_enqueue( - ccl_global int *Queue_data, /* Queue memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int queuesize) /* Size (capacity) of each queue */ + KernelGlobals *kg, + ccl_constant KernelData *data) { - /* We have only 2 cases (Hit/Not-Hit) */ - ccl_local unsigned int local_queue_atomics[2]; - - int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0); - int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - - if(lidx < 2 ) { - local_queue_atomics[lidx] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - int queue_number = -1; - - if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { - queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; - } - else if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS; - } - - unsigned int my_lqidx; - if(queue_number != -1) { - my_lqidx = get_local_queue_index(queue_number, local_queue_atomics); - } - barrier(CLK_LOCAL_MEM_FENCE); - - if(lidx == 0) { - local_queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = - get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS, - local_queue_atomics, - Queue_index); - local_queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = - get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - local_queue_atomics, - Queue_index); - } - barrier(CLK_LOCAL_MEM_FENCE); - - unsigned int my_gqidx; - if(queue_number != -1) { - my_gqidx = get_global_queue_index(queue_number, - queuesize, - my_lqidx, - local_queue_atomics); - Queue_data[my_gqidx] = ray_index; - } + kernel_queue_enqueue(kg); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl index 7f3f433c7a6..4e083e87d1c 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl @@ -14,67 +14,13 @@ * limitations under the License. 
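The queue handling stripped out of the wrapper above (and the enqueue_ray_index_local helper the split headers now call instead) follows a two-level pattern: threads in a work-group first count their entries with a local atomic, one thread reserves a contiguous block of the global queue, and each enqueuing thread then writes its ray index into its slot of that block. A minimal OpenCL sketch of that pattern, with illustrative names and an illustrative enqueue predicate (not code from this patch):

__kernel void sketch_enqueue(__global int *queue_data,   /* slots of one queue */
                             __global int *queue_index,  /* number of used slots */
                             __global const char *ray_state)
{
	__local unsigned int local_count;   /* entries produced by this work-group */
	__local unsigned int group_offset;  /* base slot reserved in the global queue */

	int lid = get_local_id(1) * get_local_size(0) + get_local_id(0);
	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);

	if(lid == 0) {
		local_count = 0;
	}
	barrier(CLK_LOCAL_MEM_FENCE);

	/* Each thread that has something to enqueue takes a slot within its group. */
	int do_enqueue = (ray_state[ray_index] != 0);  /* placeholder predicate */
	unsigned int my_local_slot = 0;
	if(do_enqueue) {
		my_local_slot = atomic_inc(&local_count);
	}
	barrier(CLK_LOCAL_MEM_FENCE);

	/* One thread reserves a contiguous range of global slots for the whole group. */
	if(lid == 0) {
		group_offset = atomic_add(queue_index, (int)local_count);
	}
	barrier(CLK_LOCAL_MEM_FENCE);

	/* Every enqueuing thread writes its ray index at its reserved global slot. */
	if(do_enqueue) {
		queue_data[group_offset + my_local_slot] = ray_index;
	}
}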
*/ +#include "kernel_compat_opencl.h" +#include "split/kernel_split_common.h" #include "split/kernel_scene_intersect.h" __kernel void kernel_ocl_path_trace_scene_intersect( - ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global uint *rng_coop, - ccl_global Ray *Ray_coop, /* Required for scene_intersect */ - ccl_global PathState *PathState_coop, /* Required for scene_intersect */ - Intersection *Intersection_coop, /* Required for scene_intersect */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int sw, int sh, - ccl_global int *Queue_data, /* Memory for queues */ - ccl_global int *Queue_index, /* Tracks the number of elements in queues */ - int queuesize, /* Size (capacity) of queues */ - ccl_global char *use_queues_flag, /* used to decide if this kernel should use - * queues to fetch ray index */ -#ifdef __KERNEL_DEBUG__ - DebugData *debugdata_coop, -#endif - int parallel_samples) /* Number of samples to be processed in parallel */ + KernelGlobals *kg, + ccl_constant KernelData *data) { - int x = get_global_id(0); - int y = get_global_id(1); - - /* Fetch use_queues_flag */ - ccl_local char local_use_queues_flag; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_use_queues_flag = use_queues_flag[0]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - int ray_index; - if(local_use_queues_flag) { - int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - ray_index = get_ray_index(thread_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - Queue_data, - queuesize, - 0); - - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } - } else { - if(x < (sw * parallel_samples) && y < sh) { - ray_index = x + y * (sw * parallel_samples); - } else { - return; - } - } - - kernel_scene_intersect((KernelGlobals *)kg, - rng_coop, - Ray_coop, - PathState_coop, - Intersection_coop, - ray_state, - sw, sh, - use_queues_flag, -#ifdef __KERNEL_DEBUG__ - debugdata_coop, -#endif - ray_index); + kernel_scene_intersect(kg); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl index c37856c8f30..a2b48b15928 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl @@ -14,55 +14,13 @@ * limitations under the License. */ +#include "kernel_compat_opencl.h" +#include "split/kernel_split_common.h" #include "split/kernel_shader_eval.h" __kernel void kernel_ocl_path_trace_shader_eval( - ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global char *sd, /* Output ShaderData structure to be filled */ - ccl_global uint *rng_coop, /* Required for rbsdf calculation */ - ccl_global Ray *Ray_coop, /* Required for setting up shader from ray */ - ccl_global PathState *PathState_coop, /* Required for all functions in this kernel */ - Intersection *Intersection_coop, /* Required for setting up shader from ray */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - ccl_global int *Queue_data, /* queue memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - int queuesize) /* Size (capacity) of each queue */ + KernelGlobals *kg, + ccl_constant KernelData *data) { - /* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. 
*/ - ccl_local unsigned int local_queue_atomics; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_queue_atomics = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - ray_index = get_ray_index(ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - Queue_data, - queuesize, - 0); - - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } - - char enqueue_flag = (IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0; - enqueue_ray_index_local(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - enqueue_flag, - queuesize, - &local_queue_atomics, - Queue_data, - Queue_index); - - /* Continue on with shader evaluation. */ - kernel_shader_eval((KernelGlobals *)kg, - (ShaderData *)sd, - rng_coop, - Ray_coop, - PathState_coop, - Intersection_coop, - ray_state, - ray_index); + kernel_shader_eval(kg); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl index edf76fba714..3693f7f9c9d 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl @@ -14,52 +14,13 @@ * limitations under the License. */ +#include "kernel_compat_opencl.h" +#include "split/kernel_split_common.h" #include "split/kernel_shadow_blocked.h" __kernel void kernel_ocl_path_trace_shadow_blocked( - ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global PathState *PathState_coop, /* Required for shadow blocked */ - ccl_global Ray *LightRay_dl_coop, /* Required for direct lighting's shadow blocked */ - ccl_global Ray *LightRay_ao_coop, /* Required for AO's shadow blocked */ - ccl_global char *ray_state, - ccl_global int *Queue_data, /* Queue memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - int queuesize) /* Size (capacity) of each queue */ + KernelGlobals *kg, + ccl_constant KernelData *data) { - int lidx = get_local_id(1) * get_local_id(0) + get_local_id(0); - - ccl_local unsigned int ao_queue_length; - ccl_local unsigned int dl_queue_length; - if(lidx == 0) { - ao_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS]; - dl_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - /* flag determining if the current ray is to process shadow ray for AO or DL */ - char shadow_blocked_type = -1; - - int ray_index = QUEUE_EMPTY_SLOT; - int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - if(thread_index < ao_queue_length + dl_queue_length) { - if(thread_index < ao_queue_length) { - ray_index = get_ray_index(thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, Queue_data, queuesize, 1); - shadow_blocked_type = RAY_SHADOW_RAY_CAST_AO; - } else { - ray_index = get_ray_index(thread_index - ao_queue_length, QUEUE_SHADOW_RAY_CAST_DL_RAYS, Queue_data, queuesize, 1); - shadow_blocked_type = RAY_SHADOW_RAY_CAST_DL; - } - } - - if(ray_index == QUEUE_EMPTY_SLOT) - return; - - kernel_shadow_blocked((KernelGlobals *)kg, - PathState_coop, - LightRay_dl_coop, - LightRay_ao_coop, - ray_state, - shadow_blocked_type, - ray_index); + kernel_shadow_blocked(kg); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl b/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl new file mode 100644 index 00000000000..0a1843ff8bd --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl @@ -0,0 +1,29 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * 
Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_compat_opencl.h" +#include "split/kernel_split_common.h" + +__kernel void kernel_ocl_path_trace_state_buffer_size( + KernelGlobals *kg, + ccl_constant KernelData *data, + uint num_threads, + ccl_global uint *size) +{ + kg->data = data; + *size = split_data_buffer_size(kg, num_threads); +} + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl b/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl deleted file mode 100644 index 88a1ed830af..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
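The new state-buffer-size kernel above lets the host ask the device how much memory the shared split state needs for a given thread count before it allocates split_data_buffer. The real split_data_buffer_size is presumably driven by the same list of per-ray members that split_data_init lays out; conceptually it sums one entry per ray per member, roughly as in this hypothetical sketch (the members listed are examples, not the actual list):

size_t sketch_split_data_buffer_size(size_t num_threads)
{
	size_t size = 0;
	size += num_threads * sizeof(PathRadiance);  /* path_radiance */
	size += num_threads * sizeof(PathState);     /* path_state */
	size += num_threads * sizeof(Ray);           /* ray */
	size += num_threads * sizeof(float3);        /* throughput */
	/* ...one term per split-state member; ray_state is allocated separately. */
	return size;
}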
- */ - -#include "split/kernel_sum_all_radiance.h" - -__kernel void kernel_ocl_path_trace_sum_all_radiance( - ccl_constant KernelData *data, /* To get pass_stride to offet into buffer */ - ccl_global float *buffer, /* Output buffer of RenderTile */ - ccl_global float *per_sample_output_buffer, /* Radiance contributed by all samples */ - int parallel_samples, int sw, int sh, int stride, - int buffer_offset_x, - int buffer_offset_y, - int buffer_stride, - int start_sample) -{ - kernel_sum_all_radiance(data, - buffer, - per_sample_output_buffer, - parallel_samples, - sw, sh, stride, - buffer_offset_x, - buffer_offset_y, - buffer_stride, - start_sample); -} diff --git a/intern/cycles/kernel/osl/osl_bssrdf.cpp b/intern/cycles/kernel/osl/osl_bssrdf.cpp index 3614717e28c..d3a69d39597 100644 --- a/intern/cycles/kernel/osl/osl_bssrdf.cpp +++ b/intern/cycles/kernel/osl/osl_bssrdf.cpp @@ -78,7 +78,7 @@ public: bssrdf->albedo = albedo.x; bssrdf->sharpness = sharpness; bssrdf->N = params.N; - ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type); + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type); } bssrdf = bssrdf_alloc(sd, make_float3(0.0f, weight.y, 0.0f)); @@ -89,7 +89,7 @@ public: bssrdf->albedo = albedo.y; bssrdf->sharpness = sharpness; bssrdf->N = params.N; - ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type); + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type); } bssrdf = bssrdf_alloc(sd, make_float3(0.0f, 0.0f, weight.z)); @@ -100,7 +100,7 @@ public: bssrdf->albedo = albedo.z; bssrdf->sharpness = sharpness; bssrdf->N = params.N; - ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type); + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type); } } } diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp index 94de782dca0..fe61587d179 100644 --- a/intern/cycles/kernel/osl/osl_closures.cpp +++ b/intern/cycles/kernel/osl/osl_closures.cpp @@ -42,6 +42,7 @@ #include "kernel_types.h" #include "kernel_compat_cpu.h" +#include "split/kernel_split_data_types.h" #include "kernel_globals.h" #include "kernel_montecarlo.h" #include "kernel_random.h" diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp index 58bbdc33920..b08353e82d1 100644 --- a/intern/cycles/kernel/osl/osl_services.cpp +++ b/intern/cycles/kernel/osl/osl_services.cpp @@ -39,6 +39,7 @@ #include "util_string.h" #include "kernel_compat_cpu.h" +#include "split/kernel_split_data_types.h" #include "kernel_globals.h" #include "kernel_random.h" #include "kernel_projection.h" diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp index 0d762bbdb38..c7e9f57b18a 100644 --- a/intern/cycles/kernel/osl/osl_shader.cpp +++ b/intern/cycles/kernel/osl/osl_shader.cpp @@ -19,6 +19,7 @@ #include "kernel_compat_cpu.h" #include "kernel_montecarlo.h" #include "kernel_types.h" +#include "split/kernel_split_data_types.h" #include "kernel_globals.h" #include "geom/geom_object.h" diff --git a/intern/cycles/kernel/split/kernel_background_buffer_update.h b/intern/cycles/kernel/split/kernel_background_buffer_update.h index 9bfa71c75ef..04aaf1bbaad 100644 --- a/intern/cycles/kernel/split/kernel_background_buffer_update.h +++ b/intern/cycles/kernel/split/kernel_background_buffer_update.h @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN /* Note on kernel_background_buffer_update kernel. 
* This is the fourth kernel in the ray tracing logic, and the third @@ -69,80 +69,77 @@ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty */ -ccl_device char kernel_background_buffer_update( - KernelGlobals *kg, - ccl_global float *per_sample_output_buffers, - ccl_global uint *rng_state, - ccl_global uint *rng_coop, /* Required for buffer Update */ - ccl_global float3 *throughput_coop, /* Required for background hit processing */ - PathRadiance *PathRadiance_coop, /* Required for background hit processing and buffer Update */ - ccl_global Ray *Ray_coop, /* Required for background hit processing */ - ccl_global PathState *PathState_coop, /* Required for background hit processing */ - ccl_global float *L_transparent_coop, /* Required for background hit processing and buffer Update */ - ccl_global char *ray_state, /* Stores information on the current state of a ray */ - int sw, int sh, int sx, int sy, int stride, - int rng_state_offset_x, - int rng_state_offset_y, - int rng_state_stride, - ccl_global unsigned int *work_array, /* Denotes work of each ray */ - int end_sample, - int start_sample, -#ifdef __WORK_STEALING__ - ccl_global unsigned int *work_pool_wgs, - unsigned int num_samples, -#endif -#ifdef __KERNEL_DEBUG__ - DebugData *debugdata_coop, -#endif - int parallel_samples, /* Number of samples to be processed in parallel */ - int ray_index) +ccl_device void kernel_background_buffer_update(KernelGlobals *kg) { + ccl_local unsigned int local_queue_atomics; + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if(ray_index == 0) { + /* We will empty this queue in this kernel. */ + kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; + } char enqueue_flag = 0; + ray_index = get_ray_index(kg, ray_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + +#ifdef __COMPUTE_DEVICE_GPU__ + /* If we are executing on a GPU device, we exit all threads that are not + * required. + * + * If we are executing on a CPU device, then we need to keep all threads + * active since we have barrier() calls later in the kernel. CPU devices, + * expect all threads to execute barrier statement. 
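Every per-ray kernel now starts by mapping its thread index to a ray index with get_ray_index, as above; on GPU devices threads whose slot is empty return immediately, while on CPU devices they stay alive for the later barrier() calls. Assuming the helper simply reads its slot of the requested queue and optionally clears it for reuse, it behaves roughly like this sketch (names and the clearing detail are assumptions, not the patch's code):

ccl_device int sketch_get_ray_index(int thread_index,
                                    int queue_number,
                                    ccl_global int *queue_data,
                                    int queue_size,
                                    int empty_the_slot)
{
	int ray_index = queue_data[queue_number * queue_size + thread_index];
	if(empty_the_slot && ray_index != QUEUE_EMPTY_SLOT) {
		queue_data[queue_number * queue_size + thread_index] = QUEUE_EMPTY_SLOT;
	}
	return ray_index;
}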
+ */ + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } +#endif + +#ifndef __COMPUTE_DEVICE_GPU__ + if(ray_index != QUEUE_EMPTY_SLOT) { +#endif + + ccl_global uint *rng_state = kernel_split_params.rng_state; + int stride = kernel_split_params.stride; + + ccl_global char *ray_state = kernel_split_state.ray_state; #ifdef __KERNEL_DEBUG__ - DebugData *debug_data = &debugdata_coop[ray_index]; + DebugData *debug_data = &kernel_split_state.debug_data[ray_index]; #endif - ccl_global PathState *state = &PathState_coop[ray_index]; - PathRadiance *L = L = &PathRadiance_coop[ray_index]; - ccl_global Ray *ray = &Ray_coop[ray_index]; - ccl_global float3 *throughput = &throughput_coop[ray_index]; - ccl_global float *L_transparent = &L_transparent_coop[ray_index]; - ccl_global uint *rng = &rng_coop[ray_index]; - -#ifdef __WORK_STEALING__ - unsigned int my_work; - ccl_global float *initial_per_sample_output_buffers; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; + ccl_global float *L_transparent = &kernel_split_state.L_transparent[ray_index]; + ccl_global uint *rng = &kernel_split_state.rng[ray_index]; + ccl_global float *buffer = kernel_split_params.buffer; + + unsigned int work_index; ccl_global uint *initial_rng; -#endif + unsigned int sample; unsigned int tile_x; unsigned int tile_y; unsigned int pixel_x; unsigned int pixel_y; - unsigned int my_sample_tile; -#ifdef __WORK_STEALING__ - my_work = work_array[ray_index]; - sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; - get_pixel_tile_position(&pixel_x, &pixel_y, + work_index = kernel_split_state.work_array[ray_index]; + sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample; + get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, &tile_x, &tile_y, - my_work, - sw, sh, sx, sy, - parallel_samples, + work_index, ray_index); - my_sample_tile = 0; - initial_per_sample_output_buffers = per_sample_output_buffers; initial_rng = rng_state; -#else /* __WORK_STEALING__ */ - sample = work_array[ray_index]; - int tile_index = ray_index / parallel_samples; - /* buffer and rng_state's stride is "stride". 
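With per_sample_output_buffers gone, each ray addresses the tile's render buffer directly: a pixel's passes start (offset + x + y*stride) * pass_stride floats into the buffer, and its RNG state sits at offset + x + y*stride. A worked example with illustrative numbers:

/* offset = 0, stride = 64 pixels per row, pass_stride = 4 floats per pixel.
 * For pixel (x = 10, y = 3):
 *   rng_state slot = offset + x + y*stride                  = 0 + 10 + 3*64 = 202
 *   buffer slot    = (offset + x + y*stride) * pass_stride  = 202 * 4       = 808
 * so this ray's passes occupy buffer[808..811]. */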
Find x and y using ray_index */ - tile_x = tile_index % sw; - tile_y = tile_index / sw; - my_sample_tile = ray_index - (tile_index * parallel_samples); -#endif /* __WORK_STEALING__ */ - - rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride; - per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride; + + rng_state += kernel_split_params.offset + pixel_x + pixel_y*stride; + buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride; if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { /* eval background shader if nothing hit */ @@ -157,7 +154,7 @@ ccl_device char kernel_background_buffer_update( if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { #ifdef __BACKGROUND__ /* sample background shader */ - float3 L_background = indirect_background(kg, kg->sd_input, state, ray); + float3 L_background = indirect_background(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, ray); path_radiance_accum_background(L, (*throughput), L_background, state->bounce); #endif ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); @@ -166,55 +163,38 @@ ccl_device char kernel_background_buffer_update( if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { float3 L_sum = path_radiance_clamp_and_sum(kg, L); - kernel_write_light_passes(kg, per_sample_output_buffers, L, sample); + kernel_write_light_passes(kg, buffer, L, sample); #ifdef __KERNEL_DEBUG__ - kernel_write_debug_passes(kg, per_sample_output_buffers, state, debug_data, sample); + kernel_write_debug_passes(kg, buffer, state, debug_data, sample); #endif float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent)); /* accumulate result in output buffer */ - kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad); + kernel_write_pass_float4(buffer, sample, L_rad); path_rng_end(kg, rng_state, *rng); ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); } if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { -#ifdef __WORK_STEALING__ /* We have completed current work; So get next work */ - int valid_work = get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index); + int valid_work = get_next_work(kg, &work_index, ray_index); if(!valid_work) { /* If work is invalid, this means no more work is available and the thread may exit */ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE); } -#else /* __WORK_STEALING__ */ - if((sample + parallel_samples) >= end_sample) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE); - } -#endif /* __WORK_STEALING__ */ if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { -#ifdef __WORK_STEALING__ - work_array[ray_index] = my_work; + kernel_split_state.work_array[ray_index] = work_index; /* Get the sample associated with the current work */ - sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; + sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample; /* Get pixel and tile position associated with current work */ - get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index); - my_sample_tile = 0; + get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, &tile_x, &tile_y, work_index, ray_index); /* Remap rng_state according to the current work */ - rng_state = initial_rng + ((rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride); - /* Remap 
per_sample_output_buffers according to the current work */ - per_sample_output_buffers = initial_per_sample_output_buffers - + (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride; -#else /* __WORK_STEALING__ */ - work_array[ray_index] = sample + parallel_samples; - sample = work_array[ray_index]; - - /* Get ray position from ray index */ - pixel_x = sx + ((ray_index / parallel_samples) % sw); - pixel_y = sy + ((ray_index / parallel_samples) / sw); -#endif /* __WORK_STEALING__ */ + rng_state = initial_rng + kernel_split_params.offset + pixel_x + pixel_y*stride; + /* Remap buffer according to the current work */ + buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride; /* Initialize random numbers and ray. */ kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, rng, ray); @@ -226,7 +206,7 @@ ccl_device char kernel_background_buffer_update( *throughput = make_float3(1.0f, 1.0f, 1.0f); *L_transparent = 0.0f; path_radiance_init(L, kernel_data.film.use_light_pass); - path_state_init(kg, kg->sd_input, state, rng, sample, ray); + path_state_init(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, rng, sample, ray); #ifdef __KERNEL_DEBUG__ debug_data_init(debug_data); #endif @@ -237,12 +217,29 @@ ccl_device char kernel_background_buffer_update( /* These rays do not participate in path-iteration. */ float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f); /* Accumulate result in output buffer. */ - kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad); + kernel_write_pass_float4(buffer, sample, L_rad); path_rng_end(kg, rng_state, *rng); ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); } } } - return enqueue_flag; + +#ifndef __COMPUTE_DEVICE_GPU__ + } +#endif + + /* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS; + * These rays will be made active during next SceneIntersectkernel. + */ + enqueue_ray_index_local(ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + enqueue_flag, + kernel_split_params.queue_size, + &local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); } + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h index 6e158d53d23..9b62d65ffd9 100644 --- a/intern/cycles/kernel/split/kernel_data_init.h +++ b/intern/cycles/kernel/split/kernel_data_init.h @@ -14,108 +14,105 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN /* Note on kernel_data_initialization kernel * This kernel Initializes structures needed in path-iteration kernels. - * This is the first kernel in ray-tracing logic. 
- * - * Ray state of rays outside the tile-boundary will be marked RAY_INACTIVE - * - * Its input and output are as follows, - * - * Un-initialized rng---------------|--- kernel_data_initialization ---|--- Initialized rng - * Un-initialized throughput -------| |--- Initialized throughput - * Un-initialized L_transparent ----| |--- Initialized L_transparent - * Un-initialized PathRadiance -----| |--- Initialized PathRadiance - * Un-initialized Ray --------------| |--- Initialized Ray - * Un-initialized PathState --------| |--- Initialized PathState - * Un-initialized QueueData --------| |--- Initialized QueueData (to QUEUE_EMPTY_SLOT) - * Un-initialized QueueIndex -------| |--- Initialized QueueIndex (to 0) - * Un-initialized use_queues_flag---| |--- Initialized use_queues_flag (to false) - * Un-initialized ray_state --------| |--- Initialized ray_state - * parallel_samples --------------- | |--- Initialized per_sample_output_buffers - * rng_state -----------------------| |--- Initialized work_array - * data ----------------------------| |--- Initialized work_pool_wgs - * start_sample --------------------| | - * sx ------------------------------| | - * sy ------------------------------| | - * sw ------------------------------| | - * sh ------------------------------| | - * stride --------------------------| | - * queuesize -----------------------| | - * num_samples ---------------------| | * * Note on Queues : * All slots in queues are initialized to queue empty slot; * The number of elements in the queues is initialized to 0; */ + +/* distributes an amount of work across all threads + * note: work done inside the loop may not show up to all threads till after the current kernel has completed + */ +#define parallel_for(kg, iter_name, work_size) \ + for(size_t _size = (work_size), \ + _global_size = ccl_global_size(0) * ccl_global_size(1), \ + _n = _size / _global_size, \ + _thread = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0), \ + iter_name = (_n > 0) ? (_thread * _n) : (_thread) \ + ; \ + (iter_name < (_thread+1) * _n) || (iter_name == _n * _global_size + _thread && _thread < _size % _global_size) \ + ; \ + iter_name = (iter_name != (_thread+1) * _n - 1) ? 
(iter_name + 1) : (_n * _global_size + _thread) \ + ) + +#ifndef __KERNEL_CPU__ ccl_device void kernel_data_init( +#else +void KERNEL_FUNCTION_FULL_NAME(data_init)( +#endif KernelGlobals *kg, - ShaderData *sd_DL_shadow, ccl_constant KernelData *data, - ccl_global float *per_sample_output_buffers, + ccl_global void *split_data_buffer, + int num_elements, + ccl_global char *ray_state, ccl_global uint *rng_state, - ccl_global uint *rng_coop, /* rng array to store rng values for all rays */ - ccl_global float3 *throughput_coop, /* throughput array to store throughput values for all rays */ - ccl_global float *L_transparent_coop, /* L_transparent array to store L_transparent values for all rays */ - PathRadiance *PathRadiance_coop, /* PathRadiance array to store PathRadiance values for all rays */ - ccl_global Ray *Ray_coop, /* Ray array to store Ray information for all rays */ - ccl_global PathState *PathState_coop, /* PathState array to store PathState information for all rays */ - Intersection *Intersection_coop_shadow, - ccl_global char *ray_state, /* Stores information on current state of a ray */ +#ifdef __KERNEL_OPENCL__ #define KERNEL_TEX(type, ttype, name) \ ccl_global type *name, #include "../kernel_textures.h" +#endif - int start_sample, int sx, int sy, int sw, int sh, int offset, int stride, - int rng_state_offset_x, - int rng_state_offset_y, - int rng_state_stride, - ccl_global int *Queue_data, /* Memory for queues */ + int start_sample, + int end_sample, + int sx, int sy, int sw, int sh, int offset, int stride, ccl_global int *Queue_index, /* Tracks the number of elements in queues */ int queuesize, /* size (capacity) of the queue */ ccl_global char *use_queues_flag, /* flag to decide if scene-intersect kernel should use queues to fetch ray index */ - ccl_global unsigned int *work_array, /* work array to store which work each ray belongs to */ -#ifdef __WORK_STEALING__ - ccl_global unsigned int *work_pool_wgs, /* Work pool for each work group */ - unsigned int num_samples, /* Total number of samples per pixel */ -#endif -#ifdef __KERNEL_DEBUG__ - DebugData *debugdata_coop, -#endif - int parallel_samples) /* Number of samples to be processed in parallel */ + ccl_global unsigned int *work_pools, /* Work pool for each work group */ + unsigned int num_samples, + ccl_global float *buffer) { +#ifdef __KERNEL_OPENCL__ kg->data = data; - kg->sd_input = sd_DL_shadow; - kg->isect_shadow = Intersection_coop_shadow; +#endif + + kernel_split_params.x = sx; + kernel_split_params.y = sy; + kernel_split_params.w = sw; + kernel_split_params.h = sh; + + kernel_split_params.offset = offset; + kernel_split_params.stride = stride; + + kernel_split_params.rng_state = rng_state; + + kernel_split_params.start_sample = start_sample; + kernel_split_params.end_sample = end_sample; + + kernel_split_params.work_pools = work_pools; + kernel_split_params.num_samples = num_samples; + + kernel_split_params.queue_index = Queue_index; + kernel_split_params.queue_size = queuesize; + kernel_split_params.use_queues_flag = use_queues_flag; + + kernel_split_params.buffer = buffer; + + split_data_init(kg, &kernel_split_state, num_elements, split_data_buffer, ray_state); + +#ifdef __KERNEL_OPENCL__ #define KERNEL_TEX(type, ttype, name) \ kg->name = name; #include "../kernel_textures.h" +#endif - int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - -#ifdef __WORK_STEALING__ - int lid = get_local_id(1) * get_local_size(0) + get_local_id(0); - /* Initialize work_pool_wgs */ - if(lid == 0) { - int 
group_index = get_group_id(1) * get_num_groups(0) + get_group_id(0); - work_pool_wgs[group_index] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); -#endif /* __WORK_STEALING__ */ + int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); /* Initialize queue data and queue index. */ if(thread_index < queuesize) { /* Initialize active ray queue. */ - Queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; + kernel_split_state.queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; /* Initialize background and buffer update queue. */ - Queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; + kernel_split_state.queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; /* Initialize shadow ray cast of AO queue. */ - Queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; + kernel_split_state.queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; /* Initialize shadow ray cast of direct lighting queue. */ - Queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; + kernel_split_state.queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; } if(thread_index == 0) { @@ -126,109 +123,32 @@ ccl_device void kernel_data_init( /* The scene-intersect kernel should not use the queues very first time. * since the queue would be empty. */ - use_queues_flag[0] = 0; + *use_queues_flag = 0; } - int x = get_global_id(0); - int y = get_global_id(1); + /* zero the tiles pixels and initialize rng_state if this is the first sample */ + if(start_sample == 0) { + parallel_for(kg, i, sw * sh * kernel_data.film.pass_stride) { + int pixel = i / kernel_data.film.pass_stride; + int pass = i % kernel_data.film.pass_stride; - if(x < (sw * parallel_samples) && y < sh) { - int ray_index = x + y * (sw * parallel_samples); + int x = sx + pixel % sw; + int y = sy + pixel / sw; - /* This is the first assignment to ray_state; - * So we dont use ASSIGN_RAY_STATE macro. - */ - ray_state[ray_index] = RAY_ACTIVE; - - unsigned int my_sample; - unsigned int pixel_x; - unsigned int pixel_y; - unsigned int tile_x; - unsigned int tile_y; - unsigned int my_sample_tile; - -#ifdef __WORK_STEALING__ - unsigned int my_work = 0; - /* Get work. */ - get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index); - /* Get the sample associated with the work. */ - my_sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; - - my_sample_tile = 0; - - /* Get pixel and tile position associated with the work. */ - get_pixel_tile_position(&pixel_x, &pixel_y, - &tile_x, &tile_y, - my_work, - sw, sh, sx, sy, - parallel_samples, - ray_index); - work_array[ray_index] = my_work; -#else /* __WORK_STEALING__ */ - unsigned int tile_index = ray_index / parallel_samples; - tile_x = tile_index % sw; - tile_y = tile_index / sw; - my_sample_tile = ray_index - (tile_index * parallel_samples); - my_sample = my_sample_tile + start_sample; - - /* Initialize work array. */ - work_array[ray_index] = my_sample ; - - /* Calculate pixel position of this ray. */ - pixel_x = sx + tile_x; - pixel_y = sy + tile_y; -#endif /* __WORK_STEALING__ */ - - rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride; - - /* Initialise per_sample_output_buffers to all zeros. 
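The parallel_for macro defined above spreads work_size iterations across every thread of the launch (and, as its comment warns, writes made inside the loop are only guaranteed visible to other threads once the kernel finishes). Unrolled for a single thread it is equivalent to roughly the following sketch, where do_work stands in for the loop body:

void sketch_parallel_for_one_thread(int thread, int global_size, int work_size)
{
	int n = work_size / global_size;

	/* Contiguous block of n iterations owned by this thread. */
	for(int i = thread * n; i < (thread + 1) * n; i++) {
		/* do_work(i); */
	}

	/* The first (work_size % global_size) threads pick up one extra
	 * iteration each from the tail of the range. */
	if(thread < work_size % global_size) {
		/* do_work(n * global_size + thread); */
	}
}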
*/ - per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + (my_sample_tile)) * kernel_data.film.pass_stride; - int per_sample_output_buffers_iterator = 0; - for(per_sample_output_buffers_iterator = 0; - per_sample_output_buffers_iterator < kernel_data.film.pass_stride; - per_sample_output_buffers_iterator++) - { - per_sample_output_buffers[per_sample_output_buffers_iterator] = 0.0f; - } + int index = (offset + x + y*stride) * kernel_data.film.pass_stride + pass; - /* Initialize random numbers and ray. */ - kernel_path_trace_setup(kg, - rng_state, - my_sample, - pixel_x, pixel_y, - &rng_coop[ray_index], - &Ray_coop[ray_index]); - - if(Ray_coop[ray_index].t != 0.0f) { - /* Initialize throughput, L_transparent, Ray, PathState; - * These rays proceed with path-iteration. - */ - throughput_coop[ray_index] = make_float3(1.0f, 1.0f, 1.0f); - L_transparent_coop[ray_index] = 0.0f; - path_radiance_init(&PathRadiance_coop[ray_index], kernel_data.film.use_light_pass); - path_state_init(kg, - kg->sd_input, - &PathState_coop[ray_index], - &rng_coop[ray_index], - my_sample, - &Ray_coop[ray_index]); -#ifdef __KERNEL_DEBUG__ - debug_data_init(&debugdata_coop[ray_index]); -#endif - } - else { - /* These rays do not participate in path-iteration. */ - float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f); - /* Accumulate result in output buffer. */ - kernel_write_pass_float4(per_sample_output_buffers, my_sample, L_rad); - path_rng_end(kg, rng_state, rng_coop[ray_index]); - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); + *(buffer + index) = 0.0f; } - } - /* Mark rest of the ray-state indices as RAY_INACTIVE. */ - if(thread_index < (get_global_size(0) * get_global_size(1)) - (sh * (sw * parallel_samples))) { - /* First assignment, hence we dont use ASSIGN_RAY_STATE macro */ - ray_state[((sw * parallel_samples) * sh) + thread_index] = RAY_INACTIVE; + parallel_for(kg, i, sw * sh) { + int x = sx + i % sw; + int y = sy + i / sw; + + int index = (offset + x + y*stride); + *(rng_state + index) = hash_int_2d(x, y); + } } } + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h index 82ca18829d3..5163b8edc04 100644 --- a/intern/cycles/kernel/split/kernel_direct_lighting.h +++ b/intern/cycles/kernel/split/kernel_direct_lighting.h @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN /* Note on kernel_direct_lighting kernel. * This is the eighth kernel in the ray tracing logic. This is the seventh @@ -47,28 +47,50 @@ * QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will be filled with rays for which a shadow_blocked function must be executed, after this * kernel call. Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty. 
*/ -ccl_device char kernel_direct_lighting( - KernelGlobals *kg, - ShaderData *sd, /* Required for direct lighting */ - ccl_global uint *rng_coop, /* Required for direct lighting */ - ccl_global PathState *PathState_coop, /* Required for direct lighting */ - ccl_global int *ISLamp_coop, /* Required for direct lighting */ - ccl_global Ray *LightRay_coop, /* Required for direct lighting */ - ccl_global BsdfEval *BSDFEval_coop, /* Required for direct lighting */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int ray_index) +ccl_device void kernel_direct_lighting(KernelGlobals *kg) { + ccl_local unsigned int local_queue_atomics; + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + char enqueue_flag = 0; - if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - ccl_global PathState *state = &PathState_coop[ray_index]; + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); + +#ifdef __COMPUTE_DEVICE_GPU__ + /* If we are executing on a GPU device, we exit all threads that are not + * required. + * + * If we are executing on a CPU device, then we need to keep all threads + * active since we have barrier() calls later in the kernel. CPU devices, + * expect all threads to execute barrier statement. + */ + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } +#endif + +#ifndef __COMPUTE_DEVICE_GPU__ + if(ray_index != QUEUE_EMPTY_SLOT) { +#endif + + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) { + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + ShaderData *sd = &kernel_split_state.sd[ray_index]; /* direct lighting */ #ifdef __EMISSION__ if((kernel_data.integrator.use_direct_light && - (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL))) + (sd->flag & SD_BSDF_HAS_EVAL))) { /* Sample illumination from lights to find path contribution. */ - ccl_global RNG* rng = &rng_coop[ray_index]; + ccl_global RNG* rng = &kernel_split_state.rng[ray_index]; float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); float light_u, light_v; path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); @@ -77,32 +99,48 @@ ccl_device char kernel_direct_lighting( LightSample ls; if(light_sample(kg, light_t, light_u, light_v, - ccl_fetch(sd, time), - ccl_fetch(sd, P), + sd->time, + sd->P, state->bounce, &ls)) { Ray light_ray; #ifdef __OBJECT_MOTION__ - light_ray.time = ccl_fetch(sd, time); + light_ray.time = sd->time; #endif BsdfEval L_light; bool is_lamp; - if(direct_emission(kg, sd, kg->sd_input, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { + if(direct_emission(kg, sd, &kernel_split_state.sd_DL_shadow[ray_index], &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { /* Write intermediate data to global memory to access from * the next kernel. */ - LightRay_coop[ray_index] = light_ray; - BSDFEval_coop[ray_index] = L_light; - ISLamp_coop[ray_index] = is_lamp; + kernel_split_state.light_ray[ray_index] = light_ray; + kernel_split_state.bsdf_eval[ray_index] = L_light; + kernel_split_state.is_lamp[ray_index] = is_lamp; /* Mark ray state for next shadow kernel. 
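Ray bookkeeping in all of these kernels goes through one byte per ray in kernel_split_state.ray_state: a state value (RAY_ACTIVE, RAY_UPDATE_BUFFER, ...) plus extra flag bits such as RAY_SHADOW_RAY_CAST_DL. Assuming the usual split of low bits for the state and high bits for the flags, the helper macros amount to roughly the following sketch (the SKETCH_* mask values are placeholders, not the real definitions):

#define SKETCH_RAY_STATE_MASK 0x0F
#define SKETCH_RAY_FLAG_MASK  0xF0

#define SKETCH_ASSIGN_RAY_STATE(ray_state, i, state) \
	(ray_state[i] = (ray_state[i] & SKETCH_RAY_FLAG_MASK) | (state))
#define SKETCH_IS_STATE(ray_state, i, state) \
	((ray_state[i] & SKETCH_RAY_STATE_MASK) == (state))
#define SKETCH_ADD_RAY_FLAG(ray_state, i, flag)    (ray_state[i] |= (flag))
#define SKETCH_REMOVE_RAY_FLAG(ray_state, i, flag) (ray_state[i] &= ~(flag))
#define SKETCH_IS_FLAG(ray_state, i, flag)         (ray_state[i] & (flag))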
*/ - ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL); + ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL); enqueue_flag = 1; } } } #endif /* __EMISSION__ */ } - return enqueue_flag; + +#ifndef __COMPUTE_DEVICE_GPU__ + } +#endif + +#ifdef __EMISSION__ + /* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */ + enqueue_ray_index_local(ray_index, + QUEUE_SHADOW_RAY_CAST_DL_RAYS, + enqueue_flag, + kernel_split_params.queue_size, + &local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); +#endif } + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h index 5d951b972ed..7168efa59ae 100644 --- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h +++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN /* Note on kernel_holdout_emission_blurring_pathtermination_ao kernel. * This is the sixth kernel in the ray tracing logic. This is the fifth @@ -70,101 +70,105 @@ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with flag RAY_SHADOW_RAY_CAST_AO */ -ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( - KernelGlobals *kg, - ShaderData *sd, /* Required throughout the kernel except probabilistic path termination and AO */ - ccl_global float *per_sample_output_buffers, - ccl_global uint *rng_coop, /* Required for "kernel_write_data_passes" and AO */ - ccl_global float3 *throughput_coop, /* Required for handling holdout material and AO */ - ccl_global float *L_transparent_coop, /* Required for handling holdout material */ - PathRadiance *PathRadiance_coop, /* Required for "kernel_write_data_passes" and indirect primitive emission */ - ccl_global PathState *PathState_coop, /* Required throughout the kernel and AO */ - Intersection *Intersection_coop, /* Required for indirect primitive emission */ - ccl_global float3 *AOAlpha_coop, /* Required for AO */ - ccl_global float3 *AOBSDF_coop, /* Required for AO */ - ccl_global Ray *AOLightRay_coop, /* Required for AO */ - int sw, int sh, int sx, int sy, int stride, - ccl_global char *ray_state, /* Denotes the state of each ray */ - ccl_global unsigned int *work_array, /* Denotes the work that each ray belongs to */ -#ifdef __WORK_STEALING__ - unsigned int start_sample, -#endif - int parallel_samples, /* Number of samples to be processed in parallel */ - int ray_index, - char *enqueue_flag, - char *enqueue_flag_AO_SHADOW_RAY_CAST) +ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(KernelGlobals *kg) { -#ifdef __WORK_STEALING__ - unsigned int my_work; + ccl_local unsigned int local_queue_atomics_bg; + ccl_local unsigned int local_queue_atomics_ao; + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + local_queue_atomics_bg = 0; + local_queue_atomics_ao = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + char enqueue_flag = 0; + char enqueue_flag_AO_SHADOW_RAY_CAST = 0; + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); + +#ifdef __COMPUTE_DEVICE_GPU__ + /* If we are 
executing on a GPU device, we exit all threads that are not + * required. + * + * If we are executing on a CPU device, then we need to keep all threads + * active since we have barrier() calls later in the kernel. CPU devices, + * expect all threads to execute barrier statement. + */ + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } +#endif /* __COMPUTE_DEVICE_GPU__ */ + +#ifndef __COMPUTE_DEVICE_GPU__ + if(ray_index != QUEUE_EMPTY_SLOT) { +#endif + + int stride = kernel_split_params.stride; + + unsigned int work_index; unsigned int pixel_x; unsigned int pixel_y; -#endif + unsigned int tile_x; unsigned int tile_y; - int my_sample_tile; unsigned int sample; ccl_global RNG *rng = 0x0; ccl_global PathState *state = 0x0; float3 throughput; + ccl_global char *ray_state = kernel_split_state.ray_state; + ShaderData *sd = &kernel_split_state.sd[ray_index]; + ccl_global float *buffer = kernel_split_params.buffer; + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - throughput = throughput_coop[ray_index]; - state = &PathState_coop[ray_index]; - rng = &rng_coop[ray_index]; -#ifdef __WORK_STEALING__ - my_work = work_array[ray_index]; - sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; - get_pixel_tile_position(&pixel_x, &pixel_y, + throughput = kernel_split_state.throughput[ray_index]; + state = &kernel_split_state.path_state[ray_index]; + rng = &kernel_split_state.rng[ray_index]; + + work_index = kernel_split_state.work_array[ray_index]; + sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample; + get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, &tile_x, &tile_y, - my_work, - sw, sh, sx, sy, - parallel_samples, + work_index, ray_index); - my_sample_tile = 0; -#else /* __WORK_STEALING__ */ - sample = work_array[ray_index]; - /* Buffer's stride is "stride"; Find x and y using ray_index. */ - int tile_index = ray_index / parallel_samples; - tile_x = tile_index % sw; - tile_y = tile_index / sw; - my_sample_tile = ray_index - (tile_index * parallel_samples); -#endif /* __WORK_STEALING__ */ - per_sample_output_buffers += - (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * - kernel_data.film.pass_stride; + + buffer += (kernel_split_params.offset + pixel_x + pixel_y * stride) * kernel_data.film.pass_stride; /* holdout */ #ifdef __HOLDOUT__ - if(((ccl_fetch(sd, flag) & SD_HOLDOUT) || - (ccl_fetch(sd, object_flag) & SD_OBJECT_HOLDOUT_MASK)) && + if(((sd->flag & SD_HOLDOUT) || + (sd->object_flag & SD_OBJECT_HOLDOUT_MASK)) && (state->flag & PATH_RAY_CAMERA)) { if(kernel_data.background.transparent) { float3 holdout_weight; - if(ccl_fetch(sd, object_flag) & SD_OBJECT_HOLDOUT_MASK) { + if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) { holdout_weight = make_float3(1.0f, 1.0f, 1.0f); } else { holdout_weight = shader_holdout_eval(kg, sd); } /* any throughput is ok, should all be identical here */ - L_transparent_coop[ray_index] += average(holdout_weight*throughput); + kernel_split_state.L_transparent[ray_index] += average(holdout_weight*throughput); } - if(ccl_fetch(sd, object_flag) & SD_OBJECT_HOLDOUT_MASK) { + if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) { ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - *enqueue_flag = 1; + enqueue_flag = 1; } } #endif /* __HOLDOUT__ */ } if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - PathRadiance *L = &PathRadiance_coop[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; /* Holdout mask objects do not write data passes. 
*/ kernel_write_data_passes(kg, - per_sample_output_buffers, + buffer, L, sd, sample, @@ -183,12 +187,12 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( #ifdef __EMISSION__ /* emission */ - if(ccl_fetch(sd, flag) & SD_EMISSION) { + if(sd->flag & SD_EMISSION) { /* TODO(sergey): is isect.t wrong here for transparent surfaces? */ float3 emission = indirect_primitive_emission( kg, sd, - Intersection_coop[ray_index].t, + kernel_split_state.isect[ray_index].t, state->flag, state->ray_pdf); path_radiance_accum_emission(L, throughput, emission, state->bounce); @@ -203,7 +207,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( if(probability == 0.0f) { ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - *enqueue_flag = 1; + enqueue_flag = 1; } if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { @@ -211,10 +215,10 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE); if(terminate >= probability) { ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - *enqueue_flag = 1; + enqueue_flag = 1; } else { - throughput_coop[ray_index] = throughput/probability; + kernel_split_state.throughput[ray_index] = throughput/probability; } } } @@ -224,7 +228,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || - (ccl_fetch(sd, flag) & SD_AO)) + (sd->flag & SD_AO)) { /* todo: solve correlation */ float bsdf_u, bsdf_v; @@ -232,29 +236,56 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( float ao_factor = kernel_data.background.ao_factor; float3 ao_N; - AOBSDF_coop[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N); - AOAlpha_coop[ray_index] = shader_bsdf_alpha(kg, sd); + kernel_split_state.ao_bsdf[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N); + kernel_split_state.ao_alpha[ray_index] = shader_bsdf_alpha(kg, sd); float3 ao_D; float ao_pdf; sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); - if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) { + if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) { Ray _ray; - _ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng)); + _ray.P = ray_offset(sd->P, sd->Ng); _ray.D = ao_D; _ray.t = kernel_data.background.ao_distance; #ifdef __OBJECT_MOTION__ - _ray.time = ccl_fetch(sd, time); + _ray.time = sd->time; #endif - _ray.dP = ccl_fetch(sd, dP); + _ray.dP = sd->dP; _ray.dD = differential3_zero(); - AOLightRay_coop[ray_index] = _ray; + kernel_split_state.ao_light_ray[ray_index] = _ray; ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO); - *enqueue_flag_AO_SHADOW_RAY_CAST = 1; + enqueue_flag_AO_SHADOW_RAY_CAST = 1; } } } #endif /* __AO__ */ + +#ifndef __COMPUTE_DEVICE_GPU__ + } +#endif + + /* Enqueue RAY_UPDATE_BUFFER rays. */ + enqueue_ray_index_local(ray_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + enqueue_flag, + kernel_split_params.queue_size, + &local_queue_atomics_bg, + kernel_split_state.queue_data, + kernel_split_params.queue_index); + +#ifdef __AO__ + /* Enqueue to-shadow-ray-cast rays. 
*/ + enqueue_ray_index_local(ray_index, + QUEUE_SHADOW_RAY_CAST_AO_RAYS, + enqueue_flag_AO_SHADOW_RAY_CAST, + kernel_split_params.queue_size, + &local_queue_atomics_ao, + kernel_split_state.queue_data, + kernel_split_params.queue_index); +#endif } + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/split/kernel_lamp_emission.h b/intern/cycles/kernel/split/kernel_lamp_emission.h index 3bd0e361078..261625da31d 100644 --- a/intern/cycles/kernel/split/kernel_lamp_emission.h +++ b/intern/cycles/kernel/split/kernel_lamp_emission.h @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN /* Note on kernel_lamp_emission * This is the 3rd kernel in the ray-tracing logic. This is the second of the @@ -36,28 +36,39 @@ * sw -------------------------------------------------| | * sh -------------------------------------------------| | */ -ccl_device void kernel_lamp_emission( - KernelGlobals *kg, - ccl_global float3 *throughput_coop, /* Required for lamp emission */ - PathRadiance *PathRadiance_coop, /* Required for lamp emission */ - ccl_global Ray *Ray_coop, /* Required for lamp emission */ - ccl_global PathState *PathState_coop, /* Required for lamp emission */ - Intersection *Intersection_coop, /* Required for lamp emission */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int sw, int sh, - ccl_global char *use_queues_flag, /* Used to decide if this kernel should use - * queues to fetch ray index - */ - int ray_index) +ccl_device void kernel_lamp_emission(KernelGlobals *kg) { - if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) || - IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) + /* We will empty this queue in this kernel. */ + if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { + kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; + } + /* Fetch use_queues_flag. 
*/ + ccl_local char local_use_queues_flag; + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + local_use_queues_flag = *kernel_split_params.use_queues_flag; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if(local_use_queues_flag) { + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + } + + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) || + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND)) { - PathRadiance *L = &PathRadiance_coop[ray_index]; - ccl_global PathState *state = &PathState_coop[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - float3 throughput = throughput_coop[ray_index]; - Ray ray = Ray_coop[ray_index]; + float3 throughput = kernel_split_state.throughput[ray_index]; + Ray ray = kernel_split_state.ray[ray_index]; #ifdef __LAMP_MIS__ if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) { @@ -65,7 +76,7 @@ ccl_device void kernel_lamp_emission( Ray light_ray; light_ray.P = ray.P - state->ray_t*ray.D; - state->ray_t += Intersection_coop[ray_index].t; + state->ray_t += kernel_split_state.isect[ray_index].t; light_ray.D = ray.D; light_ray.t = state->ray_t; light_ray.time = ray.time; @@ -74,10 +85,13 @@ ccl_device void kernel_lamp_emission( /* intersect with lamp */ float3 emission; - if(indirect_lamp_emission(kg, kg->sd_input, state, &light_ray, &emission)) { + if(indirect_lamp_emission(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, &light_ray, &emission)) { path_radiance_accum_emission(L, throughput, emission, state->bounce); } } #endif /* __LAMP_MIS__ */ } } + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h index 816f3a6fbff..a6f26278116 100644 --- a/intern/cycles/kernel/split/kernel_next_iteration_setup.h +++ b/intern/cycles/kernel/split/kernel_next_iteration_setup.h @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN /* Note on kernel_setup_next_iteration kernel. * This is the tenth kernel in the ray tracing logic. This is the ninth @@ -59,47 +59,76 @@ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and more RAY_UPDATE_BUFFER rays. 
* QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays */ -ccl_device char kernel_next_iteration_setup( - KernelGlobals *kg, - ShaderData *sd, /* Required for setting up ray for next iteration */ - ccl_global uint *rng_coop, /* Required for setting up ray for next iteration */ - ccl_global float3 *throughput_coop, /* Required for setting up ray for next iteration */ - PathRadiance *PathRadiance_coop, /* Required for setting up ray for next iteration */ - ccl_global Ray *Ray_coop, /* Required for setting up ray for next iteration */ - ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */ - ccl_global Ray *LightRay_dl_coop, /* Required for radiance update - direct lighting */ - ccl_global int *ISLamp_coop, /* Required for radiance update - direct lighting */ - ccl_global BsdfEval *BSDFEval_coop, /* Required for radiance update - direct lighting */ - ccl_global Ray *LightRay_ao_coop, /* Required for radiance update - AO */ - ccl_global float3 *AOBSDF_coop, /* Required for radiance update - AO */ - ccl_global float3 *AOAlpha_coop, /* Required for radiance update - AO */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - ccl_global char *use_queues_flag, /* flag to decide if scene_intersect kernel should - * use queues to fetch ray index */ - int ray_index) +ccl_device void kernel_next_iteration_setup(KernelGlobals *kg) { + ccl_local unsigned int local_queue_atomics; + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { + /* If we are here, then it means that scene-intersect kernel + * has already been executed atleast once. From the next time, + * scene-intersect kernel may operate on queues to fetch ray index + */ + *kernel_split_params.use_queues_flag = 1; + + /* Mark queue indices of QUEUE_SHADOW_RAY_CAST_AO_RAYS and + * QUEUE_SHADOW_RAY_CAST_DL_RAYS queues that were made empty during the + * previous kernel. + */ + kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0; + kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0; + } + char enqueue_flag = 0; + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); + +#ifdef __COMPUTE_DEVICE_GPU__ + /* If we are executing on a GPU device, we exit all threads that are not + * required. + * + * If we are executing on a CPU device, then we need to keep all threads + * active since we have barrier() calls later in the kernel. CPU devices, + * expect all threads to execute barrier statement. + */ + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } +#endif + +#ifndef __COMPUTE_DEVICE_GPU__ + if(ray_index != QUEUE_EMPTY_SLOT) { +#endif /* Load ShaderData structure. */ PathRadiance *L = NULL; ccl_global PathState *state = NULL; + ccl_global char *ray_state = kernel_split_state.ray_state; /* Path radiance update for AO/Direct_lighting's shadow blocked. 
*/ if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) { - state = &PathState_coop[ray_index]; - L = &PathRadiance_coop[ray_index]; - float3 _throughput = throughput_coop[ray_index]; + state = &kernel_split_state.path_state[ray_index]; + L = &kernel_split_state.path_radiance[ray_index]; + float3 _throughput = kernel_split_state.throughput[ray_index]; if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) { - float3 shadow = LightRay_ao_coop[ray_index].P; - char update_path_radiance = LightRay_ao_coop[ray_index].t; + float3 shadow = kernel_split_state.ao_light_ray[ray_index].P; + // TODO(mai): investigate correctness here + char update_path_radiance = (char)kernel_split_state.ao_light_ray[ray_index].t; if(update_path_radiance) { path_radiance_accum_ao(L, _throughput, - AOAlpha_coop[ray_index], - AOBSDF_coop[ray_index], + kernel_split_state.ao_alpha[ray_index], + kernel_split_state.ao_bsdf[ray_index], shadow, state->bounce); } @@ -107,35 +136,50 @@ ccl_device char kernel_next_iteration_setup( } if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) { - float3 shadow = LightRay_dl_coop[ray_index].P; - char update_path_radiance = LightRay_dl_coop[ray_index].t; + float3 shadow = kernel_split_state.light_ray[ray_index].P; + // TODO(mai): investigate correctness here + char update_path_radiance = (char)kernel_split_state.light_ray[ray_index].t; if(update_path_radiance) { - BsdfEval L_light = BSDFEval_coop[ray_index]; + BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index]; path_radiance_accum_light(L, _throughput, &L_light, shadow, 1.0f, state->bounce, - ISLamp_coop[ray_index]); + kernel_split_state.is_lamp[ray_index]); } REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL); } } if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - ccl_global float3 *throughput = &throughput_coop[ray_index]; - ccl_global Ray *ray = &Ray_coop[ray_index]; - ccl_global RNG *rng = &rng_coop[ray_index]; - state = &PathState_coop[ray_index]; - L = &PathRadiance_coop[ray_index]; + ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + ccl_global RNG *rng = &kernel_split_state.rng[ray_index]; + state = &kernel_split_state.path_state[ray_index]; + L = &kernel_split_state.path_radiance[ray_index]; /* Compute direct lighting and next bounce. */ - if(!kernel_path_surface_bounce(kg, rng, sd, throughput, state, L, ray)) { + if(!kernel_path_surface_bounce(kg, rng, &kernel_split_state.sd[ray_index], throughput, state, L, ray)) { ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); enqueue_flag = 1; } } - return enqueue_flag; +#ifndef __COMPUTE_DEVICE_GPU__ + } +#endif + + /* Enqueue RAY_UPDATE_BUFFER rays. */ + enqueue_ray_index_local(ray_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + enqueue_flag, + kernel_split_params.queue_size, + &local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); } + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/split/kernel_path_init.h b/intern/cycles/kernel/split/kernel_path_init.h new file mode 100644 index 00000000000..fe3c9e1e8a2 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_path_init.h @@ -0,0 +1,100 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* This kernel initializes structures needed in path-iteration kernels. + * This is the first kernel in ray-tracing logic. + * + * Ray state of rays outside the tile-boundary will be marked RAY_INACTIVE + */ + +ccl_device void kernel_path_init(KernelGlobals *kg) { + int ray_index = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0); + + /* This is the first assignment to ray_state; + * So we dont use ASSIGN_RAY_STATE macro. + */ + kernel_split_state.ray_state[ray_index] = RAY_ACTIVE; + + unsigned int my_sample; + unsigned int pixel_x; + unsigned int pixel_y; + unsigned int tile_x; + unsigned int tile_y; + + unsigned int work_index = 0; + /* Get work. */ + if(!get_next_work(kg, &work_index, ray_index)) { + /* No more work, mark ray as inactive */ + kernel_split_state.ray_state[ray_index] = RAY_INACTIVE; + + return; + } + + /* Get the sample associated with the work. */ + my_sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample; + + /* Get pixel and tile position associated with the work. */ + get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, + &tile_x, &tile_y, + work_index, + ray_index); + kernel_split_state.work_array[ray_index] = work_index; + + ccl_global uint *rng_state = kernel_split_params.rng_state; + rng_state += kernel_split_params.offset + pixel_x + pixel_y*kernel_split_params.stride; + + ccl_global float *buffer = kernel_split_params.buffer; + buffer += (kernel_split_params.offset + pixel_x + pixel_y * kernel_split_params.stride) * kernel_data.film.pass_stride; + + /* Initialize random numbers and ray. */ + kernel_path_trace_setup(kg, + rng_state, + my_sample, + pixel_x, pixel_y, + &kernel_split_state.rng[ray_index], + &kernel_split_state.ray[ray_index]); + + if(kernel_split_state.ray[ray_index].t != 0.0f) { + /* Initialize throughput, L_transparent, Ray, PathState; + * These rays proceed with path-iteration. + */ + kernel_split_state.throughput[ray_index] = make_float3(1.0f, 1.0f, 1.0f); + kernel_split_state.L_transparent[ray_index] = 0.0f; + path_radiance_init(&kernel_split_state.path_radiance[ray_index], kernel_data.film.use_light_pass); + path_state_init(kg, + &kernel_split_state.sd_DL_shadow[ray_index], + &kernel_split_state.path_state[ray_index], + &kernel_split_state.rng[ray_index], + my_sample, + &kernel_split_state.ray[ray_index]); +#ifdef __KERNEL_DEBUG__ + debug_data_init(&kernel_split_state.debug_data[ray_index]); +#endif + } + else { + /* These rays do not participate in path-iteration. */ + float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + /* Accumulate result in output buffer. 
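kernel_path_init above addresses both per-pixel arrays the same way: rng_state holds one entry per pixel at offset + pixel_x + pixel_y*stride, while the film buffer stores pass_stride floats per pixel, so the same pixel index is additionally scaled by kernel_data.film.pass_stride. A tiny worked example with made-up numbers:

/* Worked example of the buffer addressing used above (values are invented).
 * rng_state is one uint per pixel; the film buffer stores pass_stride floats
 * per pixel, so its index is scaled by pass_stride. */
#include <cstdio>

int main()
{
    int offset = 0, stride = 1920;     /* full-frame stride */
    int pass_stride = 8;               /* floats per pixel in the film buffer */
    int pixel_x = 100, pixel_y = 50;

    int pixel_index = offset + pixel_x + pixel_y * stride;
    int rng_index = pixel_index;                   /* index into rng_state */
    int buffer_index = pixel_index * pass_stride;  /* index into float buffer */

    printf("rng_state[%d], buffer[%d..%d]\n",
           rng_index, buffer_index, buffer_index + pass_stride - 1);
    return 0;
}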
*/ + kernel_write_pass_float4(buffer, my_sample, L_rad); + path_rng_end(kg, rng_state, kernel_split_state.rng[ray_index]); + ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE); + } +} + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/split/kernel_queue_enqueue.h b/intern/cycles/kernel/split/kernel_queue_enqueue.h new file mode 100644 index 00000000000..66aad705bd4 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_queue_enqueue.h @@ -0,0 +1,102 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* + * The kernel "kernel_queue_enqueue" enqueues rays of + * different ray state into their appropriate Queues; + * 1. Rays that have been determined to hit the background from the + * "kernel_scene_intersect" kernel + * are enqueued in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; + * 2. Rays that have been determined to be actively participating in path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS. + * + * The input and output of the kernel is as follows, + * + * ray_state -------------------------------------------|--- kernel_queue_enqueue --|--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS) + * Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------| |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS) + * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---| | + * queuesize -------------------------------------------| | + * + * Note on Queues : + * State of queues during the first time this kernel is called : + * At entry, + * Both QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty. + * At exit, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays + * QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_HIT_BACKGROUND rays. + * + * State of queue during other times this kernel is called : + * At entry, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty. + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays. + * At exit, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays. + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays. 
+ */ +ccl_device void kernel_queue_enqueue(KernelGlobals *kg) +{ + /* We have only 2 cases (Hit/Not-Hit) */ + ccl_local unsigned int local_queue_atomics[2]; + + int lidx = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0); + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + + if(lidx == 0) { + local_queue_atomics[0] = 0; + local_queue_atomics[1] = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int queue_number = -1; + + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND)) { + queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; + } + else if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) { + queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS; + } + + unsigned int my_lqidx; + if(queue_number != -1) { + my_lqidx = get_local_queue_index(queue_number, local_queue_atomics); + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + if(lidx == 0) { + local_queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = + get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS, + local_queue_atomics, + kernel_split_params.queue_index); + local_queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = + get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + local_queue_atomics, + kernel_split_params.queue_index); + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + unsigned int my_gqidx; + if(queue_number != -1) { + my_gqidx = get_global_queue_index(queue_number, + kernel_split_params.queue_size, + my_lqidx, + local_queue_atomics); + kernel_split_state.queue_data[my_gqidx] = ray_index; + } +} + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h b/intern/cycles/kernel/split/kernel_scene_intersect.h index 2388580051f..a7e0c7692a2 100644 --- a/intern/cycles/kernel/split/kernel_scene_intersect.h +++ b/intern/cycles/kernel/split/kernel_scene_intersect.h @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN /* Note on kernel_scene_intersect kernel. * This is the second kernel in the ray tracing logic. 
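kernel_queue_enqueue above reserves queue slots in two levels: each thread first takes a slot from a counter in local memory, one thread per work group then atomically adds the group totals to the global queue_index array, and finally every thread scatters its ray index at the group's global base plus its local slot. The serial model below walks through that arithmetic for a single group; the names and sizes are illustrative only. Reserving global space once per work group rather than once per ray keeps contention on the global counters low.

/* Serial model of the two-level slot reservation above (illustrative names).
 * Threads of one "work group" first take local slots, the group then reserves
 * a contiguous range in the global queue, and each thread writes at
 * global_base + local_slot. */
#include <cstdio>

int main()
{
    const int queue_size = 16;
    int queue_data[queue_size];
    int global_queue_index = 3;             /* 3 rays already queued globally */

    /* phase 1: local slot per thread (modelled serially) */
    int rays_in_group[4] = {11, 12, 13, 14};
    int local_slot[4];
    int local_count = 0;
    for(int t = 0; t < 4; t++) local_slot[t] = local_count++;

    /* phase 2: one atomic add reserves the group's range globally */
    int global_base = global_queue_index;
    global_queue_index += local_count;

    /* phase 3: scatter ray indices into the reserved range */
    for(int t = 0; t < 4; t++) queue_data[global_base + local_slot[t]] = rays_in_group[t];

    for(int t = 0; t < 4; t++)
        printf("queue_data[%d] = %d\n", global_base + t, queue_data[global_base + t]);
    return 0;
}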
This is the first @@ -61,34 +61,41 @@ * QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS - no change */ -ccl_device void kernel_scene_intersect( - KernelGlobals *kg, - ccl_global uint *rng_coop, - ccl_global Ray *Ray_coop, /* Required for scene_intersect */ - ccl_global PathState *PathState_coop, /* Required for scene_intersect */ - Intersection *Intersection_coop, /* Required for scene_intersect */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int sw, int sh, - ccl_global char *use_queues_flag, /* used to decide if this kernel should use - * queues to fetch ray index */ -#ifdef __KERNEL_DEBUG__ - DebugData *debugdata_coop, -#endif - int ray_index) +ccl_device void kernel_scene_intersect(KernelGlobals *kg) { + /* Fetch use_queues_flag */ + ccl_local char local_use_queues_flag; + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + local_use_queues_flag = *kernel_split_params.use_queues_flag; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if(local_use_queues_flag) { + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); + + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + } + /* All regenerated rays become active here */ - if(IS_STATE(ray_state, ray_index, RAY_REGENERATED)) - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_ACTIVE); + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) + ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE); - if(!IS_STATE(ray_state, ray_index, RAY_ACTIVE)) + if(!IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) return; #ifdef __KERNEL_DEBUG__ - DebugData *debug_data = &debugdata_coop[ray_index]; + DebugData *debug_data = &kernel_split_state.debug_data[ray_index]; #endif - Intersection *isect = &Intersection_coop[ray_index]; - PathState state = PathState_coop[ray_index]; - Ray ray = Ray_coop[ray_index]; + Intersection *isect = &kernel_split_state.isect[ray_index]; + PathState state = kernel_split_state.path_state[ray_index]; + Ray ray = kernel_split_state.ray[ray_index]; /* intersect scene */ uint visibility = path_state_ray_visibility(kg, &state); @@ -96,7 +103,7 @@ ccl_device void kernel_scene_intersect( #ifdef __HAIR__ float difl = 0.0f, extmax = 0.0f; uint lcg_state = 0; - RNG rng = rng_coop[ray_index]; + RNG rng = kernel_split_state.rng[ray_index]; if(kernel_data.bvh.have_curves) { if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) { @@ -128,6 +135,9 @@ ccl_device void kernel_scene_intersect( * These rays undergo special processing in the * background_bufferUpdate kernel. */ - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND); + ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND); } } + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h index cef64bf5f36..35ee19ddf1b 100644 --- a/intern/cycles/kernel/split/kernel_shader_eval.h +++ b/intern/cycles/kernel/split/kernel_shader_eval.h @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN /* Note on kernel_shader_eval kernel * This kernel is the 5th kernel in the ray tracing logic. 
This is @@ -44,27 +44,51 @@ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays */ -ccl_device void kernel_shader_eval( - KernelGlobals *kg, - ShaderData *sd, /* Output ShaderData structure to be filled */ - ccl_global uint *rng_coop, /* Required for rbsdf calculation */ - ccl_global Ray *Ray_coop, /* Required for setting up shader from ray */ - ccl_global PathState *PathState_coop, /* Required for all functions in this kernel */ - Intersection *Intersection_coop, /* Required for setting up shader from ray */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int ray_index) + +ccl_device void kernel_shader_eval(KernelGlobals *kg) { - if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - Intersection *isect = &Intersection_coop[ray_index]; - ccl_global uint *rng = &rng_coop[ray_index]; - ccl_global PathState *state = &PathState_coop[ray_index]; - Ray ray = Ray_coop[ray_index]; + /* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */ + ccl_local unsigned int local_queue_atomics; + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); + + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + + char enqueue_flag = (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0; + enqueue_ray_index_local(ray_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + enqueue_flag, + kernel_split_params.queue_size, + &local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); + + /* Continue on with shader evaluation. */ + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) { + Intersection *isect = &kernel_split_state.isect[ray_index]; + ccl_global uint *rng = &kernel_split_state.rng[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + Ray ray = kernel_split_state.ray[ray_index]; shader_setup_from_ray(kg, - sd, + &kernel_split_state.sd[ray_index], isect, &ray); float rbsdf = path_state_rng_1D_for_decision(kg, rng, state, PRNG_BSDF); - shader_eval_surface(kg, sd, rng, state, rbsdf, state->flag, SHADER_CONTEXT_MAIN); + shader_eval_surface(kg, &kernel_split_state.sd[ray_index], rng, state, rbsdf, state->flag, SHADER_CONTEXT_MAIN); } } + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked.h b/intern/cycles/kernel/split/kernel_shadow_blocked.h index 6153af47f96..d532c7cf55b 100644 --- a/intern/cycles/kernel/split/kernel_shadow_blocked.h +++ b/intern/cycles/kernel/split/kernel_shadow_blocked.h @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN /* Note on kernel_shadow_blocked kernel. * This is the ninth kernel in the ray tracing logic. This is the eighth @@ -45,24 +45,47 @@ * and RAY_SHADOW_RAY_CAST_DL respectively, during kernel entry. * QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty at kernel exit. 
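In the kernel_shadow_blocked changes that follow, both shadow queues are serviced by a single launch: thread indices below ao_queue_length pull from QUEUE_SHADOW_RAY_CAST_AO_RAYS, the next dl_queue_length threads pull from QUEUE_SHADOW_RAY_CAST_DL_RAYS at thread_index - ao_queue_length, and anything beyond that is idle. A small sketch of the mapping, with invented queue lengths:

/* Model of the flat thread -> (queue, slot) mapping used below.
 * The first ao_queue_length threads service the AO shadow queue, the next
 * dl_queue_length threads service the direct-lighting shadow queue. */
#include <cstdio>

int main()
{
    int ao_queue_length = 3;
    int dl_queue_length = 2;

    for(int thread_index = 0; thread_index < 7; thread_index++) {
        if(thread_index < ao_queue_length) {
            printf("thread %d -> AO queue slot %d\n", thread_index, thread_index);
        }
        else if(thread_index < ao_queue_length + dl_queue_length) {
            printf("thread %d -> DL queue slot %d\n",
                   thread_index, thread_index - ao_queue_length);
        }
        else {
            printf("thread %d -> idle\n", thread_index);
        }
    }
    return 0;
}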
*/ -ccl_device void kernel_shadow_blocked( - KernelGlobals *kg, - ccl_global PathState *PathState_coop, /* Required for shadow blocked */ - ccl_global Ray *LightRay_dl_coop, /* Required for direct lighting's shadow blocked */ - ccl_global Ray *LightRay_ao_coop, /* Required for AO's shadow blocked */ - ccl_global char *ray_state, - char shadow_blocked_type, - int ray_index) +ccl_device void kernel_shadow_blocked(KernelGlobals *kg) { + int lidx = ccl_local_id(1) * ccl_local_id(0) + ccl_local_id(0); + + ccl_local unsigned int ao_queue_length; + ccl_local unsigned int dl_queue_length; + if(lidx == 0) { + ao_queue_length = kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS]; + dl_queue_length = kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS]; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + /* flag determining if the current ray is to process shadow ray for AO or DL */ + char shadow_blocked_type = -1; + + int ray_index = QUEUE_EMPTY_SLOT; + int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if(thread_index < ao_queue_length + dl_queue_length) { + if(thread_index < ao_queue_length) { + ray_index = get_ray_index(kg, thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, + kernel_split_state.queue_data, kernel_split_params.queue_size, 1); + shadow_blocked_type = RAY_SHADOW_RAY_CAST_AO; + } else { + ray_index = get_ray_index(kg, thread_index - ao_queue_length, QUEUE_SHADOW_RAY_CAST_DL_RAYS, + kernel_split_state.queue_data, kernel_split_params.queue_size, 1); + shadow_blocked_type = RAY_SHADOW_RAY_CAST_DL; + } + } + + if(ray_index == QUEUE_EMPTY_SLOT) + return; + /* Flag determining if we need to update L. */ char update_path_radiance = 0; - if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || - IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) + if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || + IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) { - ccl_global PathState *state = &PathState_coop[ray_index]; - ccl_global Ray *light_ray_dl_global = &LightRay_dl_coop[ray_index]; - ccl_global Ray *light_ray_ao_global = &LightRay_ao_coop[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + ccl_global Ray *light_ray_dl_global = &kernel_split_state.light_ray[ray_index]; + ccl_global Ray *light_ray_ao_global = &kernel_split_state.ao_light_ray[ray_index]; ccl_global Ray *light_ray_global = shadow_blocked_type == RAY_SHADOW_RAY_CAST_AO @@ -71,7 +94,7 @@ ccl_device void kernel_shadow_blocked( float3 shadow; update_path_radiance = !(shadow_blocked(kg, - kg->sd_input, + &kernel_split_state.sd_DL_shadow[thread_index], state, light_ray_global, &shadow)); @@ -83,3 +106,6 @@ ccl_device void kernel_shadow_blocked( light_ray_global->t = update_path_radiance; } } + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h index 2135ee22b2e..dd0c3f9c941 100644 --- a/intern/cycles/kernel/split/kernel_split_common.h +++ b/intern/cycles/kernel/split/kernel_split_common.h @@ -17,11 +17,23 @@ #ifndef __KERNEL_SPLIT_H__ #define __KERNEL_SPLIT_H__ -#include "kernel_compat_opencl.h" #include "kernel_math.h" #include "kernel_types.h" + +#include "kernel_split_data.h" + #include "kernel_globals.h" -#include "kernel_image_opencl.h" + +#ifdef __OSL__ +# include "osl_shader.h" +#endif + +#ifdef __KERNEL_OPENCL__ +# include "kernel_image_opencl.h" +#endif +#ifdef __KERNEL_CPU__ +# include 
"../kernels/cpu/kernel_cpu_image.h" +#endif #include "util_atomic.h" diff --git a/intern/cycles/kernel/split/kernel_split_data.h b/intern/cycles/kernel/split/kernel_split_data.h new file mode 100644 index 00000000000..5380c0c5de6 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_split_data.h @@ -0,0 +1,57 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __KERNEL_SPLIT_DATA_H__ +#define __KERNEL_SPLIT_DATA_H__ + +#include "kernel_split_data_types.h" +#include "kernel_globals.h" + +CCL_NAMESPACE_BEGIN + +ccl_device_inline size_t split_data_buffer_size(KernelGlobals *kg, size_t num_elements) +{ + (void)kg; /* Unused on CPU. */ + + size_t size = 0; +#define SPLIT_DATA_ENTRY(type, name, num) + align_up(num_elements * num * sizeof(type), 16) + size = size SPLIT_DATA_ENTRIES; +#undef SPLIT_DATA_ENTRY + + return size; +} + +ccl_device_inline void split_data_init(KernelGlobals *kg, + ccl_global SplitData *split_data, + size_t num_elements, + ccl_global void *data, + ccl_global char *ray_state) +{ + (void)kg; /* Unused on CPU. */ + + ccl_global char *p = (ccl_global char*)data; + +#define SPLIT_DATA_ENTRY(type, name, num) \ + split_data->name = (type*)p; p += align_up(num_elements * num * sizeof(type), 16); + SPLIT_DATA_ENTRIES +#undef SPLIT_DATA_ENTRY + + split_data->ray_state = ray_state; +} + +CCL_NAMESPACE_END + +#endif /* __KERNEL_SPLIT_DATA_H__ */ diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h new file mode 100644 index 00000000000..62e3ea45ae2 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_split_data_types.h @@ -0,0 +1,109 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __KERNEL_SPLIT_DATA_TYPES_H__ +#define __KERNEL_SPLIT_DATA_TYPES_H__ + +CCL_NAMESPACE_BEGIN + +/* parameters used by the split kernels, we use a single struct to avoid passing these to each kernel */ + +typedef struct SplitParams { + int x; + int y; + int w; + int h; + + int offset; + int stride; + + ccl_global uint *rng_state; + + int start_sample; + int end_sample; + + ccl_global unsigned int *work_pools; + unsigned int num_samples; + + ccl_global int *queue_index; + int queue_size; + ccl_global char *use_queues_flag; + + ccl_global float *buffer; +} SplitParams; + +/* Global memory variables [porting]; These memory is used for + * co-operation between different kernels; Data written by one + * kernel will be available to another kernel via this global + * memory. + */ + +/* SPLIT_DATA_ENTRY(type, name, num) */ + +#if defined(WITH_CYCLES_DEBUG) || defined(__KERNEL_DEBUG__) +/* DebugData memory */ +# define SPLIT_DATA_DEBUG_ENTRIES \ + SPLIT_DATA_ENTRY(DebugData, debug_data, 1) +#else +# define SPLIT_DATA_DEBUG_ENTRIES +#endif + +#define SPLIT_DATA_ENTRIES \ + SPLIT_DATA_ENTRY(ccl_global RNG, rng, 1) \ + SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \ + SPLIT_DATA_ENTRY(ccl_global float, L_transparent, 1) \ + SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \ + SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \ + SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \ + SPLIT_DATA_ENTRY(Intersection, isect, 1) \ + SPLIT_DATA_ENTRY(ccl_global float3, ao_alpha, 1) \ + SPLIT_DATA_ENTRY(ccl_global float3, ao_bsdf, 1) \ + SPLIT_DATA_ENTRY(ccl_global Ray, ao_light_ray, 1) \ + SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \ + SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \ + SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \ + SPLIT_DATA_ENTRY(Intersection, isect_shadow, 2) \ + SPLIT_DATA_ENTRY(ccl_global int, queue_data, (NUM_QUEUES*2)) /* TODO(mai): this is too large? */ \ + SPLIT_DATA_ENTRY(ccl_global uint, work_array, 1) \ + SPLIT_DATA_ENTRY(ShaderData, sd, 1) \ + SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 2) \ + SPLIT_DATA_DEBUG_ENTRIES \ + +/* struct that holds pointers to data in the shared state buffer */ +typedef struct SplitData { +#define SPLIT_DATA_ENTRY(type, name, num) type *name; + SPLIT_DATA_ENTRIES +#undef SPLIT_DATA_ENTRY + + /* this is actually in a separate buffer from the rest of the split state data (so it can be read back from + * the host easily) but is still used the same as the other data so we have it here in this struct as well + */ + ccl_global char *ray_state; +} SplitData; + +#ifndef __KERNEL_CUDA__ +# define kernel_split_state (kg->split_data) +# define kernel_split_params (kg->split_param_data) +#else +__device__ SplitData __split_data; +# define kernel_split_state (__split_data) +__device__ SplitParams __split_param_data; +# define kernel_split_params (__split_param_data) +#endif /* __KERNEL_CUDA__ */ + +CCL_NAMESPACE_END + +#endif /* __KERNEL_SPLIT_DATA_TYPES_H__ */ diff --git a/intern/cycles/kernel/split/kernel_sum_all_radiance.h b/intern/cycles/kernel/split/kernel_sum_all_radiance.h deleted file mode 100644 index a21e9b6a0b1..00000000000 --- a/intern/cycles/kernel/split/kernel_sum_all_radiance.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "../kernel_compat_opencl.h" -#include "../kernel_math.h" -#include "../kernel_types.h" -#include "../kernel_globals.h" - -/* Since we process various samples in parallel; The output radiance of different samples - * are stored in different locations; This kernel combines the output radiance contributed - * by all different samples and stores them in the RenderTile's output buffer. - */ -ccl_device void kernel_sum_all_radiance( - ccl_constant KernelData *data, /* To get pass_stride to offet into buffer */ - ccl_global float *buffer, /* Output buffer of RenderTile */ - ccl_global float *per_sample_output_buffer, /* Radiance contributed by all samples */ - int parallel_samples, int sw, int sh, int stride, - int buffer_offset_x, - int buffer_offset_y, - int buffer_stride, - int start_sample) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if(x < sw && y < sh) { - buffer += ((buffer_offset_x + x) + (buffer_offset_y + y) * buffer_stride) * (data->film.pass_stride); - per_sample_output_buffer += ((x + y * stride) * parallel_samples) * (data->film.pass_stride); - - int sample_stride = (data->film.pass_stride); - - int sample_iterator = 0; - int pass_stride_iterator = 0; - int num_floats = data->film.pass_stride; - - for(sample_iterator = 0; sample_iterator < parallel_samples; sample_iterator++) { - for(pass_stride_iterator = 0; pass_stride_iterator < num_floats; pass_stride_iterator++) { - *(buffer + pass_stride_iterator) = - (start_sample == 0 && sample_iterator == 0) - ? 
*(per_sample_output_buffer + pass_stride_iterator) - : *(buffer + pass_stride_iterator) + *(per_sample_output_buffer + pass_stride_iterator); - } - per_sample_output_buffer += sample_stride; - } - } -} diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h index 88ec7fe6fcc..57ec9f94a3d 100644 --- a/intern/cycles/kernel/svm/svm.h +++ b/intern/cycles/kernel/svm/svm.h @@ -192,7 +192,7 @@ CCL_NAMESPACE_BEGIN ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, ShaderType type, int path_flag) { float stack[SVM_STACK_SIZE]; - int offset = ccl_fetch(sd, shader) & SHADER_MASK; + int offset = sd->shader & SHADER_MASK; while(1) { uint4 node = read_node(kg, &offset); diff --git a/intern/cycles/kernel/svm/svm_attribute.h b/intern/cycles/kernel/svm/svm_attribute.h index 0e55c99ae97..229a3f20421 100644 --- a/intern/cycles/kernel/svm/svm_attribute.h +++ b/intern/cycles/kernel/svm/svm_attribute.h @@ -27,7 +27,7 @@ ccl_device AttributeDescriptor svm_node_attr_init(KernelGlobals *kg, ShaderData AttributeDescriptor desc; - if(ccl_fetch(sd, object) != OBJECT_NONE) { + if(sd->object != OBJECT_NONE) { desc = find_attribute(kg, sd, node.y); if(desc.offset == ATTR_STD_NOT_FOUND) { desc = attribute_not_found(); diff --git a/intern/cycles/kernel/svm/svm_bump.h b/intern/cycles/kernel/svm/svm_bump.h index 04a8c7b64e5..610d9af9e1f 100644 --- a/intern/cycles/kernel/svm/svm_bump.h +++ b/intern/cycles/kernel/svm/svm_bump.h @@ -21,9 +21,9 @@ CCL_NAMESPACE_BEGIN ccl_device void svm_node_enter_bump_eval(KernelGlobals *kg, ShaderData *sd, float *stack, uint offset) { /* save state */ - stack_store_float3(stack, offset+0, ccl_fetch(sd, P)); - stack_store_float3(stack, offset+3, ccl_fetch(sd, dP).dx); - stack_store_float3(stack, offset+6, ccl_fetch(sd, dP).dy); + stack_store_float3(stack, offset+0, sd->P); + stack_store_float3(stack, offset+3, sd->dP.dx); + stack_store_float3(stack, offset+6, sd->dP.dy); /* set state as if undisplaced */ const AttributeDescriptor desc = find_attribute(kg, sd, ATTR_STD_POSITION_UNDISPLACED); @@ -36,18 +36,18 @@ ccl_device void svm_node_enter_bump_eval(KernelGlobals *kg, ShaderData *sd, floa object_dir_transform(kg, sd, &dPdx); object_dir_transform(kg, sd, &dPdy); - ccl_fetch(sd, P) = P; - ccl_fetch(sd, dP).dx = dPdx; - ccl_fetch(sd, dP).dy = dPdy; + sd->P = P; + sd->dP.dx = dPdx; + sd->dP.dy = dPdy; } } ccl_device void svm_node_leave_bump_eval(KernelGlobals *kg, ShaderData *sd, float *stack, uint offset) { /* restore state */ - ccl_fetch(sd, P) = stack_load_float3(stack, offset+0); - ccl_fetch(sd, dP).dx = stack_load_float3(stack, offset+3); - ccl_fetch(sd, dP).dy = stack_load_float3(stack, offset+6); + sd->P = stack_load_float3(stack, offset+0); + sd->dP.dx = stack_load_float3(stack, offset+3); + sd->dP.dy = stack_load_float3(stack, offset+6); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_camera.h b/intern/cycles/kernel/svm/svm_camera.h index 00678a49d70..90249dfd978 100644 --- a/intern/cycles/kernel/svm/svm_camera.h +++ b/intern/cycles/kernel/svm/svm_camera.h @@ -23,7 +23,7 @@ ccl_device void svm_node_camera(KernelGlobals *kg, ShaderData *sd, float *stack, float3 vector; Transform tfm = kernel_data.cam.worldtocamera; - vector = transform_point(&tfm, ccl_fetch(sd, P)); + vector = transform_point(&tfm, sd->P); zdepth = vector.z; distance = len(vector); diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h index 017d697f9f8..1885e1af851 100644 --- 
a/intern/cycles/kernel/svm/svm_closure.h +++ b/intern/cycles/kernel/svm/svm_closure.h @@ -25,13 +25,13 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, MicrofacetBsdf *bsdf, int t bsdf->alpha_y = 0.0f; bsdf->alpha_x = 0.0f; bsdf->ior = eta; - ccl_fetch(sd, flag) |= bsdf_refraction_setup(bsdf); + sd->flag |= bsdf_refraction_setup(bsdf); } else { bsdf->alpha_y = 0.0f; bsdf->alpha_x = 0.0f; bsdf->ior = 0.0f; - ccl_fetch(sd, flag) |= bsdf_reflection_setup(bsdf); + sd->flag |= bsdf_reflection_setup(bsdf); } } else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID) { @@ -40,9 +40,9 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, MicrofacetBsdf *bsdf, int t bsdf->ior = eta; if(refract) - ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(bsdf); + sd->flag |= bsdf_microfacet_beckmann_refraction_setup(bsdf); else - ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(bsdf); + sd->flag |= bsdf_microfacet_beckmann_setup(bsdf); } else { bsdf->alpha_x = roughness; @@ -50,9 +50,9 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, MicrofacetBsdf *bsdf, int t bsdf->ior = eta; if(refract) - ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(bsdf); + sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf); else - ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(bsdf); + sd->flag |= bsdf_microfacet_ggx_setup(bsdf); } } @@ -70,14 +70,14 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(mix_weight == 0.0f) return; - float3 N = stack_valid(data_node.x)? stack_load_float3(stack, data_node.x): ccl_fetch(sd, N); + float3 N = stack_valid(data_node.x)? stack_load_float3(stack, data_node.x): sd->N; float param1 = (stack_valid(param1_offset))? stack_load_float(stack, param1_offset): __uint_as_float(node.z); float param2 = (stack_valid(param2_offset))? 
stack_load_float(stack, param2_offset): __uint_as_float(node.w); switch(type) { case CLOSURE_BSDF_DIFFUSE_ID: { - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; OrenNayarBsdf *bsdf = (OrenNayarBsdf*)bsdf_alloc(sd, sizeof(OrenNayarBsdf), weight); if(bsdf) { @@ -86,31 +86,31 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * float roughness = param1; if(roughness == 0.0f) { - ccl_fetch(sd, flag) |= bsdf_diffuse_setup((DiffuseBsdf*)bsdf); + sd->flag |= bsdf_diffuse_setup((DiffuseBsdf*)bsdf); } else { bsdf->roughness = roughness; - ccl_fetch(sd, flag) |= bsdf_oren_nayar_setup(bsdf); + sd->flag |= bsdf_oren_nayar_setup(bsdf); } } break; } case CLOSURE_BSDF_TRANSLUCENT_ID: { - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight); if(bsdf) { bsdf->N = N; - ccl_fetch(sd, flag) |= bsdf_translucent_setup(bsdf); + sd->flag |= bsdf_translucent_setup(bsdf); } break; } case CLOSURE_BSDF_TRANSPARENT_ID: { - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight); if(bsdf) { - ccl_fetch(sd, flag) |= bsdf_transparent_setup(bsdf); + sd->flag |= bsdf_transparent_setup(bsdf); } break; } @@ -123,7 +123,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE)) break; #endif - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight); if(bsdf) { @@ -135,21 +135,21 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * /* setup bsdf */ if(type == CLOSURE_BSDF_REFLECTION_ID) - ccl_fetch(sd, flag) |= bsdf_reflection_setup(bsdf); + sd->flag |= bsdf_reflection_setup(bsdf); else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ID) - ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(bsdf); + sd->flag |= bsdf_microfacet_beckmann_setup(bsdf); else if(type == CLOSURE_BSDF_MICROFACET_GGX_ID) - ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(bsdf); + sd->flag |= bsdf_microfacet_ggx_setup(bsdf); else if(type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID) { kernel_assert(stack_valid(data_node.z)); bsdf->extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); if(bsdf->extra) { bsdf->extra->color = stack_load_float3(stack, data_node.z); - ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_setup(bsdf); + sd->flag |= bsdf_microfacet_multi_ggx_setup(bsdf); } } else - ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_setup(bsdf); + sd->flag |= bsdf_ashikhmin_shirley_setup(bsdf); } break; @@ -161,7 +161,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(!kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE)) break; #endif - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight); if(bsdf) { @@ -169,7 +169,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->extra = NULL; float eta = fmaxf(param2, 1e-5f); - eta = 
(ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta; + eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta; /* setup bsdf */ if(type == CLOSURE_BSDF_REFRACTION_ID) { @@ -177,7 +177,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->alpha_y = 0.0f; bsdf->ior = eta; - ccl_fetch(sd, flag) |= bsdf_refraction_setup(bsdf); + sd->flag |= bsdf_refraction_setup(bsdf); } else { bsdf->alpha_x = param1; @@ -185,9 +185,9 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->ior = eta; if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID) - ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(bsdf); + sd->flag |= bsdf_microfacet_beckmann_refraction_setup(bsdf); else - ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(bsdf); + sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf); } } @@ -203,14 +203,14 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * break; } #endif - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; /* index of refraction */ float eta = fmaxf(param2, 1e-5f); - eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta; + eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta; /* fresnel */ - float cosNO = dot(N, ccl_fetch(sd, I)); + float cosNO = dot(N, sd->I); float fresnel = fresnel_dielectric_cos(cosNO, eta); float roughness = param1; @@ -249,7 +249,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(!kernel_data.integrator.caustics_reflective && !kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE)) break; #endif - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight); MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); @@ -261,13 +261,13 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->alpha_x = param1; bsdf->alpha_y = param1; float eta = fmaxf(param2, 1e-5f); - bsdf->ior = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta; + bsdf->ior = (sd->flag & SD_BACKFACING)? 
1.0f/eta: eta; kernel_assert(stack_valid(data_node.z)); bsdf->extra->color = stack_load_float3(stack, data_node.z); /* setup bsdf */ - ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_glass_setup(bsdf); + sd->flag |= bsdf_microfacet_multi_ggx_glass_setup(bsdf); } break; @@ -280,7 +280,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE)) break; #endif - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight); if(bsdf) { @@ -310,33 +310,33 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->ior = 0.0f; if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID) { - ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_aniso_setup(bsdf); + sd->flag |= bsdf_microfacet_beckmann_aniso_setup(bsdf); } else if(type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID) { - ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_aniso_setup(bsdf); + sd->flag |= bsdf_microfacet_ggx_aniso_setup(bsdf); } else if(type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID) { kernel_assert(stack_valid(data_node.w)); bsdf->extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); if(bsdf->extra) { bsdf->extra->color = stack_load_float3(stack, data_node.w); - ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_aniso_setup(bsdf); + sd->flag |= bsdf_microfacet_multi_ggx_aniso_setup(bsdf); } } else - ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_aniso_setup(bsdf); + sd->flag |= bsdf_ashikhmin_shirley_aniso_setup(bsdf); } break; } case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: { - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; VelvetBsdf *bsdf = (VelvetBsdf*)bsdf_alloc(sd, sizeof(VelvetBsdf), weight); if(bsdf) { bsdf->N = N; bsdf->sigma = saturate(param1); - ccl_fetch(sd, flag) |= bsdf_ashikhmin_velvet_setup(bsdf); + sd->flag |= bsdf_ashikhmin_velvet_setup(bsdf); } break; } @@ -346,7 +346,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * break; #endif case CLOSURE_BSDF_DIFFUSE_TOON_ID: { - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; ToonBsdf *bsdf = (ToonBsdf*)bsdf_alloc(sd, sizeof(ToonBsdf), weight); if(bsdf) { @@ -355,18 +355,18 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->smooth = param2; if(type == CLOSURE_BSDF_DIFFUSE_TOON_ID) - ccl_fetch(sd, flag) |= bsdf_diffuse_toon_setup(bsdf); + sd->flag |= bsdf_diffuse_toon_setup(bsdf); else - ccl_fetch(sd, flag) |= bsdf_glossy_toon_setup(bsdf); + sd->flag |= bsdf_glossy_toon_setup(bsdf); } break; } #ifdef __HAIR__ case CLOSURE_BSDF_HAIR_REFLECTION_ID: case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: { - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; - if(ccl_fetch(sd, flag) & SD_BACKFACING && ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { + if(sd->flag & SD_BACKFACING && sd->type & PRIMITIVE_ALL_CURVE) { ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight); if(bsdf) { @@ -376,7 +376,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * * better figure out a way to skip backfaces from rays * spawned by transmission from the front */ bsdf->weight = make_float3(1.0f, 1.0f, 1.0f); - 
ccl_fetch(sd, flag) |= bsdf_transparent_setup(bsdf); + sd->flag |= bsdf_transparent_setup(bsdf); } } else { @@ -390,18 +390,18 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(stack_valid(data_node.y)) { bsdf->T = normalize(stack_load_float3(stack, data_node.y)); } - else if(!(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)) { - bsdf->T = normalize(ccl_fetch(sd, dPdv)); + else if(!(sd->type & PRIMITIVE_ALL_CURVE)) { + bsdf->T = normalize(sd->dPdv); bsdf->offset = 0.0f; } else - bsdf->T = normalize(ccl_fetch(sd, dPdu)); + bsdf->T = normalize(sd->dPdu); if(type == CLOSURE_BSDF_HAIR_REFLECTION_ID) { - ccl_fetch(sd, flag) |= bsdf_hair_reflection_setup(bsdf); + sd->flag |= bsdf_hair_reflection_setup(bsdf); } else { - ccl_fetch(sd, flag) |= bsdf_hair_transmission_setup(bsdf); + sd->flag |= bsdf_hair_transmission_setup(bsdf); } } } @@ -414,8 +414,8 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * case CLOSURE_BSSRDF_CUBIC_ID: case CLOSURE_BSSRDF_GAUSSIAN_ID: case CLOSURE_BSSRDF_BURLEY_ID: { - float3 albedo = ccl_fetch(sd, svm_closure_weight); - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 albedo = sd->svm_closure_weight; + float3 weight = sd->svm_closure_weight * mix_weight; float sample_weight = fabsf(average(weight)); /* disable in case of diffuse ancestor, can't see it well then and @@ -441,7 +441,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bssrdf->albedo = albedo.x; bssrdf->sharpness = sharpness; bssrdf->N = N; - ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type); + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type); } bssrdf = bssrdf_alloc(sd, make_float3(0.0f, weight.y, 0.0f)); @@ -452,7 +452,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bssrdf->albedo = albedo.y; bssrdf->sharpness = sharpness; bssrdf->N = N; - ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type); + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type); } bssrdf = bssrdf_alloc(sd, make_float3(0.0f, 0.0f, weight.z)); @@ -463,7 +463,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bssrdf->albedo = albedo.z; bssrdf->sharpness = sharpness; bssrdf->N = N; - ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type); + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type); } } @@ -493,21 +493,21 @@ ccl_device void svm_node_closure_volume(KernelGlobals *kg, ShaderData *sd, float switch(type) { case CLOSURE_VOLUME_ABSORPTION_ID: { - float3 weight = (make_float3(1.0f, 1.0f, 1.0f) - ccl_fetch(sd, svm_closure_weight)) * mix_weight * density; + float3 weight = (make_float3(1.0f, 1.0f, 1.0f) - sd->svm_closure_weight) * mix_weight * density; ShaderClosure *sc = closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_NONE_ID, weight); if(sc) { - ccl_fetch(sd, flag) |= volume_absorption_setup(sc); + sd->flag |= volume_absorption_setup(sc); } break; } case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: { - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight * density; + float3 weight = sd->svm_closure_weight * mix_weight * density; HenyeyGreensteinVolume *volume = (HenyeyGreensteinVolume*)bsdf_alloc(sd, sizeof(HenyeyGreensteinVolume), weight); if(volume) { volume->g = param2; /* g */ - ccl_fetch(sd, flag) |= volume_henyey_greenstein_setup(volume); + sd->flag |= volume_henyey_greenstein_setup(volume); } break; } @@ -527,12 +527,12 @@ ccl_device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 no 
if(mix_weight == 0.0f) return; - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight); + closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, sd->svm_closure_weight * mix_weight); } else - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, ccl_fetch(sd, svm_closure_weight)); + closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, sd->svm_closure_weight); - ccl_fetch(sd, flag) |= SD_EMISSION; + sd->flag |= SD_EMISSION; } ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4 node) @@ -545,10 +545,10 @@ ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4 if(mix_weight == 0.0f) return; - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight); + closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, sd->svm_closure_weight * mix_weight); } else - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, ccl_fetch(sd, svm_closure_weight)); + closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, sd->svm_closure_weight); } ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 node) @@ -561,12 +561,12 @@ ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 nod if(mix_weight == 0.0f) return; - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight); + closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, sd->svm_closure_weight * mix_weight); } else - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, ccl_fetch(sd, svm_closure_weight)); + closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, sd->svm_closure_weight); - ccl_fetch(sd, flag) |= SD_HOLDOUT; + sd->flag |= SD_HOLDOUT; } ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack, uint4 node) @@ -579,19 +579,19 @@ ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack, if(mix_weight == 0.0f) return; - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight); + closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, sd->svm_closure_weight * mix_weight); } else - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, ccl_fetch(sd, svm_closure_weight)); + closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, sd->svm_closure_weight); - ccl_fetch(sd, flag) |= SD_AO; + sd->flag |= SD_AO; } /* Closure Nodes */ ccl_device_inline void svm_node_closure_store_weight(ShaderData *sd, float3 weight) { - ccl_fetch(sd, svm_closure_weight) = weight; + sd->svm_closure_weight = weight; } ccl_device void svm_node_closure_set_weight(ShaderData *sd, uint r, uint g, uint b) @@ -641,7 +641,7 @@ ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node) ccl_device void svm_node_set_normal(KernelGlobals *kg, ShaderData *sd, float *stack, uint in_direction, uint out_normal) { float3 normal = stack_load_float3(stack, in_direction); - ccl_fetch(sd, N) = normal; + sd->N = normal; stack_store_float3(stack, out_normal, normal); } diff --git a/intern/cycles/kernel/svm/svm_displace.h b/intern/cycles/kernel/svm/svm_displace.h index 890ab41aaaa..c94fa130af7 100644 --- a/intern/cycles/kernel/svm/svm_displace.h +++ b/intern/cycles/kernel/svm/svm_displace.h @@ -25,10 +25,10 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac uint 
normal_offset, distance_offset, invert, use_object_space; decode_node_uchar4(node.y, &normal_offset, &distance_offset, &invert, &use_object_space); - float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N); + float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): sd->N; - float3 dPdx = ccl_fetch(sd, dP).dx; - float3 dPdy = ccl_fetch(sd, dP).dy; + float3 dPdx = sd->dP.dx; + float3 dPdy = sd->dP.dy; if(use_object_space) { object_inverse_normal_transform(kg, sd, &normal_in); @@ -80,14 +80,14 @@ ccl_device void svm_node_set_displacement(KernelGlobals *kg, ShaderData *sd, flo { float d = stack_load_float(stack, fac_offset); - float3 dP = ccl_fetch(sd, N); + float3 dP = sd->N; object_inverse_normal_transform(kg, sd, &dP); dP *= d*0.1f; /* todo: get rid of this factor */ object_dir_transform(kg, sd, &dP); - ccl_fetch(sd, P) += dP; + sd->P += dP; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_fresnel.h b/intern/cycles/kernel/svm/svm_fresnel.h index 23c97d80cb0..3703ec55015 100644 --- a/intern/cycles/kernel/svm/svm_fresnel.h +++ b/intern/cycles/kernel/svm/svm_fresnel.h @@ -23,12 +23,12 @@ ccl_device void svm_node_fresnel(ShaderData *sd, float *stack, uint ior_offset, uint normal_offset, out_offset; decode_node_uchar4(node, &normal_offset, &out_offset, NULL, NULL); float eta = (stack_valid(ior_offset))? stack_load_float(stack, ior_offset): __uint_as_float(ior_value); - float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N); + float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): sd->N; eta = fmaxf(eta, 1e-5f); - eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta; + eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta; - float f = fresnel_dielectric_cos(dot(ccl_fetch(sd, I), normal_in), eta); + float f = fresnel_dielectric_cos(dot(sd->I, normal_in), eta); stack_store_float(stack, out_offset, f); } @@ -44,18 +44,18 @@ ccl_device void svm_node_layer_weight(ShaderData *sd, float *stack, uint4 node) decode_node_uchar4(node.w, &type, &normal_offset, &out_offset, NULL); float blend = (stack_valid(blend_offset))? stack_load_float(stack, blend_offset): __uint_as_float(blend_value); - float3 normal_in = (stack_valid(normal_offset))? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N); + float3 normal_in = (stack_valid(normal_offset))? stack_load_float3(stack, normal_offset): sd->N; float f; if(type == NODE_LAYER_WEIGHT_FRESNEL) { float eta = fmaxf(1.0f - blend, 1e-5f); - eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? eta: 1.0f/eta; + eta = (sd->flag & SD_BACKFACING)? 
eta: 1.0f/eta; - f = fresnel_dielectric_cos(dot(ccl_fetch(sd, I), normal_in), eta); + f = fresnel_dielectric_cos(dot(sd->I, normal_in), eta); } else { - f = fabsf(dot(ccl_fetch(sd, I), normal_in)); + f = fabsf(dot(sd->I, normal_in)); if(blend != 0.5f) { blend = clamp(blend, 0.0f, 1.0f-1e-5f); diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h index 7d512f7ff4d..4a09d9f6653 100644 --- a/intern/cycles/kernel/svm/svm_geometry.h +++ b/intern/cycles/kernel/svm/svm_geometry.h @@ -27,15 +27,15 @@ ccl_device_inline void svm_node_geometry(KernelGlobals *kg, float3 data; switch(type) { - case NODE_GEOM_P: data = ccl_fetch(sd, P); break; - case NODE_GEOM_N: data = ccl_fetch(sd, N); break; + case NODE_GEOM_P: data = sd->P; break; + case NODE_GEOM_N: data = sd->N; break; #ifdef __DPDU__ case NODE_GEOM_T: data = primitive_tangent(kg, sd); break; #endif - case NODE_GEOM_I: data = ccl_fetch(sd, I); break; - case NODE_GEOM_Ng: data = ccl_fetch(sd, Ng); break; + case NODE_GEOM_I: data = sd->I; break; + case NODE_GEOM_Ng: data = sd->Ng; break; #ifdef __UV__ - case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u), ccl_fetch(sd, v), 0.0f); break; + case NODE_GEOM_uv: data = make_float3(sd->u, sd->v, 0.0f); break; #endif } @@ -48,8 +48,8 @@ ccl_device void svm_node_geometry_bump_dx(KernelGlobals *kg, ShaderData *sd, flo float3 data; switch(type) { - case NODE_GEOM_P: data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx; break; - case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u) + ccl_fetch(sd, du).dx, ccl_fetch(sd, v) + ccl_fetch(sd, dv).dx, 0.0f); break; + case NODE_GEOM_P: data = sd->P + sd->dP.dx; break; + case NODE_GEOM_uv: data = make_float3(sd->u + sd->du.dx, sd->v + sd->dv.dx, 0.0f); break; default: svm_node_geometry(kg, sd, stack, type, out_offset); return; } @@ -65,8 +65,8 @@ ccl_device void svm_node_geometry_bump_dy(KernelGlobals *kg, ShaderData *sd, flo float3 data; switch(type) { - case NODE_GEOM_P: data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy; break; - case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u) + ccl_fetch(sd, du).dy, ccl_fetch(sd, v) + ccl_fetch(sd, dv).dy, 0.0f); break; + case NODE_GEOM_P: data = sd->P + sd->dP.dy; break; + case NODE_GEOM_uv: data = make_float3(sd->u + sd->du.dy, sd->v + sd->dv.dy, 0.0f); break; default: svm_node_geometry(kg, sd, stack, type, out_offset); return; } @@ -87,9 +87,9 @@ ccl_device void svm_node_object_info(KernelGlobals *kg, ShaderData *sd, float *s stack_store_float3(stack, out_offset, object_location(kg, sd)); return; } - case NODE_INFO_OB_INDEX: data = object_pass_id(kg, ccl_fetch(sd, object)); break; + case NODE_INFO_OB_INDEX: data = object_pass_id(kg, sd->object); break; case NODE_INFO_MAT_INDEX: data = shader_pass_id(kg, sd); break; - case NODE_INFO_OB_RANDOM: data = object_random_number(kg, ccl_fetch(sd, object)); break; + case NODE_INFO_OB_RANDOM: data = object_random_number(kg, sd->object); break; default: data = 0.0f; break; } @@ -106,44 +106,44 @@ ccl_device void svm_node_particle_info(KernelGlobals *kg, { switch(type) { case NODE_INFO_PAR_INDEX: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float(stack, out_offset, particle_index(kg, particle_id)); break; } case NODE_INFO_PAR_AGE: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float(stack, out_offset, particle_age(kg, particle_id)); break; } case NODE_INFO_PAR_LIFETIME: 
{ - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float(stack, out_offset, particle_lifetime(kg, particle_id)); break; } case NODE_INFO_PAR_LOCATION: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float3(stack, out_offset, particle_location(kg, particle_id)); break; } #if 0 /* XXX float4 currently not supported in SVM stack */ case NODE_INFO_PAR_ROTATION: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float4(stack, out_offset, particle_rotation(kg, particle_id)); break; } #endif case NODE_INFO_PAR_SIZE: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float(stack, out_offset, particle_size(kg, particle_id)); break; } case NODE_INFO_PAR_VELOCITY: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float3(stack, out_offset, particle_velocity(kg, particle_id)); break; } case NODE_INFO_PAR_ANGULAR_VELOCITY: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float3(stack, out_offset, particle_angular_velocity(kg, particle_id)); break; } @@ -165,7 +165,7 @@ ccl_device void svm_node_hair_info(KernelGlobals *kg, switch(type) { case NODE_INFO_CURVE_IS_STRAND: { - data = (ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) != 0; + data = (sd->type & PRIMITIVE_ALL_CURVE) != 0; stack_store_float(stack, out_offset, data); break; } @@ -177,7 +177,7 @@ ccl_device void svm_node_hair_info(KernelGlobals *kg, break; } /*case NODE_INFO_CURVE_FADE: { - data = ccl_fetch(sd, curve_transparency); + data = sd->curve_transparency; stack_store_float(stack, out_offset, data); break; }*/ diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h index 0d6efb47223..76acc9253a1 100644 --- a/intern/cycles/kernel/svm/svm_image.h +++ b/intern/cycles/kernel/svm/svm_image.h @@ -237,9 +237,9 @@ ccl_device void svm_node_tex_image(KernelGlobals *kg, ShaderData *sd, float *sta ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node) { /* get object space normal */ - float3 N = ccl_fetch(sd, N); + float3 N = sd->N; - N = ccl_fetch(sd, N); + N = sd->N; object_inverse_normal_transform(kg, sd, &N); /* project from direction vector to barycentric coordinates in triangles */ diff --git a/intern/cycles/kernel/svm/svm_light_path.h b/intern/cycles/kernel/svm/svm_light_path.h index 04f6f623f18..1492e358608 100644 --- a/intern/cycles/kernel/svm/svm_light_path.h +++ b/intern/cycles/kernel/svm/svm_light_path.h @@ -31,8 +31,8 @@ ccl_device void svm_node_light_path(ShaderData *sd, ccl_addr_space PathState *st case NODE_LP_reflection: info = (path_flag & PATH_RAY_REFLECT)? 1.0f: 0.0f; break; case NODE_LP_transmission: info = (path_flag & PATH_RAY_TRANSMIT)? 1.0f: 0.0f; break; case NODE_LP_volume_scatter: info = (path_flag & PATH_RAY_VOLUME_SCATTER)? 1.0f: 0.0f; break; - case NODE_LP_backfacing: info = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f: 0.0f; break; - case NODE_LP_ray_length: info = ccl_fetch(sd, ray_length); break; + case NODE_LP_backfacing: info = (sd->flag & SD_BACKFACING)? 
1.0f: 0.0f; break; + case NODE_LP_ray_length: info = sd->ray_length; break; case NODE_LP_ray_depth: info = (float)state->bounce; break; case NODE_LP_ray_diffuse: info = (float)state->diffuse_bounce; break; case NODE_LP_ray_glossy: info = (float)state->glossy_bounce; break; @@ -56,14 +56,14 @@ ccl_device void svm_node_light_falloff(ShaderData *sd, float *stack, uint4 node) switch(type) { case NODE_LIGHT_FALLOFF_QUADRATIC: break; - case NODE_LIGHT_FALLOFF_LINEAR: strength *= ccl_fetch(sd, ray_length); break; - case NODE_LIGHT_FALLOFF_CONSTANT: strength *= ccl_fetch(sd, ray_length)*ccl_fetch(sd, ray_length); break; + case NODE_LIGHT_FALLOFF_LINEAR: strength *= sd->ray_length; break; + case NODE_LIGHT_FALLOFF_CONSTANT: strength *= sd->ray_length*sd->ray_length; break; } float smooth = stack_load_float(stack, smooth_offset); if(smooth > 0.0f) { - float squared = ccl_fetch(sd, ray_length)*ccl_fetch(sd, ray_length); + float squared = sd->ray_length*sd->ray_length; /* Distant lamps set the ray length to FLT_MAX, which causes squared to overflow. */ if(isfinite(squared)) { strength *= squared/(smooth + squared); diff --git a/intern/cycles/kernel/svm/svm_tex_coord.h b/intern/cycles/kernel/svm/svm_tex_coord.h index c0b01262212..c94327401f5 100644 --- a/intern/cycles/kernel/svm/svm_tex_coord.h +++ b/intern/cycles/kernel/svm/svm_tex_coord.h @@ -31,9 +31,9 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg, switch(type) { case NODE_TEXCO_OBJECT: { - data = ccl_fetch(sd, P); + data = sd->P; if(node.w == 0) { - if(ccl_fetch(sd, object) != OBJECT_NONE) { + if(sd->object != OBJECT_NONE) { object_inverse_position_transform(kg, sd, &data); } } @@ -48,47 +48,47 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg, break; } case NODE_TEXCO_NORMAL: { - data = ccl_fetch(sd, N); + data = sd->N; object_inverse_normal_transform(kg, sd, &data); break; } case NODE_TEXCO_CAMERA: { Transform tfm = kernel_data.cam.worldtocamera; - if(ccl_fetch(sd, object) != OBJECT_NONE) - data = transform_point(&tfm, ccl_fetch(sd, P)); + if(sd->object != OBJECT_NONE) + data = transform_point(&tfm, sd->P); else - data = transform_point(&tfm, ccl_fetch(sd, P) + camera_position(kg)); + data = transform_point(&tfm, sd->P + camera_position(kg)); break; } case NODE_TEXCO_WINDOW: { - if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) - data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P)); + if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) + data = camera_world_to_ndc(kg, sd, sd->ray_P); else - data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P)); + data = camera_world_to_ndc(kg, sd, sd->P); data.z = 0.0f; break; } case NODE_TEXCO_REFLECTION: { - if(ccl_fetch(sd, object) != OBJECT_NONE) - data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I); + if(sd->object != OBJECT_NONE) + data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I; else - data = ccl_fetch(sd, I); + data = sd->I; break; } case NODE_TEXCO_DUPLI_GENERATED: { - data = object_dupli_generated(kg, ccl_fetch(sd, object)); + data = object_dupli_generated(kg, sd->object); break; } case NODE_TEXCO_DUPLI_UV: { - data = object_dupli_uv(kg, ccl_fetch(sd, object)); + data = object_dupli_uv(kg, sd->object); break; } case NODE_TEXCO_VOLUME_GENERATED: { - data = ccl_fetch(sd, P); + data = sd->P; #ifdef __VOLUME__ - if(ccl_fetch(sd, object) != OBJECT_NONE) + if(sd->object != OBJECT_NONE) data = volume_normalized_position(kg, sd, 
data); #endif break; @@ -112,9 +112,9 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg, switch(type) { case NODE_TEXCO_OBJECT: { - data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx; + data = sd->P + sd->dP.dx; if(node.w == 0) { - if(ccl_fetch(sd, object) != OBJECT_NONE) { + if(sd->object != OBJECT_NONE) { object_inverse_position_transform(kg, sd, &data); } } @@ -129,47 +129,47 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg, break; } case NODE_TEXCO_NORMAL: { - data = ccl_fetch(sd, N); + data = sd->N; object_inverse_normal_transform(kg, sd, &data); break; } case NODE_TEXCO_CAMERA: { Transform tfm = kernel_data.cam.worldtocamera; - if(ccl_fetch(sd, object) != OBJECT_NONE) - data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx); + if(sd->object != OBJECT_NONE) + data = transform_point(&tfm, sd->P + sd->dP.dx); else - data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx + camera_position(kg)); + data = transform_point(&tfm, sd->P + sd->dP.dx + camera_position(kg)); break; } case NODE_TEXCO_WINDOW: { - if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) - data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P) + ccl_fetch(sd, ray_dP).dx); + if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) + data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dx); else - data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx); + data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dx); data.z = 0.0f; break; } case NODE_TEXCO_REFLECTION: { - if(ccl_fetch(sd, object) != OBJECT_NONE) - data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I); + if(sd->object != OBJECT_NONE) + data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I; else - data = ccl_fetch(sd, I); + data = sd->I; break; } case NODE_TEXCO_DUPLI_GENERATED: { - data = object_dupli_generated(kg, ccl_fetch(sd, object)); + data = object_dupli_generated(kg, sd->object); break; } case NODE_TEXCO_DUPLI_UV: { - data = object_dupli_uv(kg, ccl_fetch(sd, object)); + data = object_dupli_uv(kg, sd->object); break; } case NODE_TEXCO_VOLUME_GENERATED: { - data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx; + data = sd->P + sd->dP.dx; #ifdef __VOLUME__ - if(ccl_fetch(sd, object) != OBJECT_NONE) + if(sd->object != OBJECT_NONE) data = volume_normalized_position(kg, sd, data); #endif break; @@ -196,9 +196,9 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg, switch(type) { case NODE_TEXCO_OBJECT: { - data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy; + data = sd->P + sd->dP.dy; if(node.w == 0) { - if(ccl_fetch(sd, object) != OBJECT_NONE) { + if(sd->object != OBJECT_NONE) { object_inverse_position_transform(kg, sd, &data); } } @@ -213,47 +213,47 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg, break; } case NODE_TEXCO_NORMAL: { - data = ccl_fetch(sd, N); + data = sd->N; object_inverse_normal_transform(kg, sd, &data); break; } case NODE_TEXCO_CAMERA: { Transform tfm = kernel_data.cam.worldtocamera; - if(ccl_fetch(sd, object) != OBJECT_NONE) - data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy); + if(sd->object != OBJECT_NONE) + data = transform_point(&tfm, sd->P + sd->dP.dy); else - data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy + camera_position(kg)); + data = transform_point(&tfm, sd->P + sd->dP.dy + camera_position(kg)); break; } case NODE_TEXCO_WINDOW: { - if((path_flag & 
PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) - data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P) + ccl_fetch(sd, ray_dP).dy); + if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) + data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dy); else - data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy); + data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dy); data.z = 0.0f; break; } case NODE_TEXCO_REFLECTION: { - if(ccl_fetch(sd, object) != OBJECT_NONE) - data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I); + if(sd->object != OBJECT_NONE) + data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I; else - data = ccl_fetch(sd, I); + data = sd->I; break; } case NODE_TEXCO_DUPLI_GENERATED: { - data = object_dupli_generated(kg, ccl_fetch(sd, object)); + data = object_dupli_generated(kg, sd->object); break; } case NODE_TEXCO_DUPLI_UV: { - data = object_dupli_uv(kg, ccl_fetch(sd, object)); + data = object_dupli_uv(kg, sd->object); break; } case NODE_TEXCO_VOLUME_GENERATED: { - data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy; + data = sd->P + sd->dP.dy; #ifdef __VOLUME__ - if(ccl_fetch(sd, object) != OBJECT_NONE) + if(sd->object != OBJECT_NONE) data = volume_normalized_position(kg, sd, data); #endif break; @@ -274,12 +274,12 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st float3 color = stack_load_float3(stack, color_offset); color = 2.0f*make_float3(color.x - 0.5f, color.y - 0.5f, color.z - 0.5f); - bool is_backfacing = (ccl_fetch(sd, flag) & SD_BACKFACING) != 0; + bool is_backfacing = (sd->flag & SD_BACKFACING) != 0; float3 N; if(space == NODE_NORMAL_MAP_TANGENT) { /* tangent space */ - if(ccl_fetch(sd, object) == OBJECT_NONE) { + if(sd->object == OBJECT_NONE) { stack_store_float3(stack, normal_offset, make_float3(0.0f, 0.0f, 0.0f)); return; } @@ -299,11 +299,11 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st float sign = primitive_attribute_float(kg, sd, attr_sign, NULL, NULL); float3 normal; - if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) { + if(sd->shader & SHADER_SMOOTH_NORMAL) { normal = primitive_attribute_float3(kg, sd, attr_normal, NULL, NULL); } else { - normal = ccl_fetch(sd, Ng); + normal = sd->Ng; /* the normal is already inverted, which is too soon for the math here */ if(is_backfacing) { @@ -345,11 +345,11 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st if(strength != 1.0f) { strength = max(strength, 0.0f); - N = safe_normalize(ccl_fetch(sd, N) + (N - ccl_fetch(sd, N))*strength); + N = safe_normalize(sd->N + (N - sd->N)*strength); } if(is_zero(N)) { - N = ccl_fetch(sd, N); + N = sd->N; } stack_store_float3(stack, normal_offset, N); @@ -377,7 +377,7 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack float3 generated; if(desc.offset == ATTR_STD_NOT_FOUND) - generated = ccl_fetch(sd, P); + generated = sd->P; else generated = primitive_attribute_float3(kg, sd, desc, NULL, NULL); @@ -390,7 +390,7 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack } object_normal_transform(kg, sd, &tangent); - tangent = cross(ccl_fetch(sd, N), normalize(cross(tangent, ccl_fetch(sd, N)))); + tangent = cross(sd->N, normalize(cross(tangent, sd->N))); stack_store_float3(stack, tangent_offset, tangent); } diff --git a/intern/cycles/kernel/svm/svm_vector_transform.h 
b/intern/cycles/kernel/svm/svm_vector_transform.h index 4c32130d06d..4e92f27acdb 100644 --- a/intern/cycles/kernel/svm/svm_vector_transform.h +++ b/intern/cycles/kernel/svm/svm_vector_transform.h @@ -33,7 +33,7 @@ ccl_device void svm_node_vector_transform(KernelGlobals *kg, ShaderData *sd, flo NodeVectorTransformConvertSpace to = (NodeVectorTransformConvertSpace)ito; Transform tfm; - bool is_object = (ccl_fetch(sd, object) != OBJECT_NONE); + bool is_object = (sd->object != OBJECT_NONE); bool is_direction = (type == NODE_VECTOR_TRANSFORM_TYPE_VECTOR || type == NODE_VECTOR_TRANSFORM_TYPE_NORMAL); /* From world */ diff --git a/intern/cycles/kernel/svm/svm_wireframe.h b/intern/cycles/kernel/svm/svm_wireframe.h index 87e40791333..3c6353c8001 100644 --- a/intern/cycles/kernel/svm/svm_wireframe.h +++ b/intern/cycles/kernel/svm/svm_wireframe.h @@ -41,9 +41,9 @@ ccl_device_inline float wireframe(KernelGlobals *kg, float3 *P) { #ifdef __HAIR__ - if(ccl_fetch(sd, prim) != PRIM_NONE && ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) + if(sd->prim != PRIM_NONE && sd->type & PRIMITIVE_ALL_TRIANGLE) #else - if(ccl_fetch(sd, prim) != PRIM_NONE) + if(sd->prim != PRIM_NONE) #endif { float3 Co[3]; @@ -52,12 +52,12 @@ ccl_device_inline float wireframe(KernelGlobals *kg, /* Triangles */ int np = 3; - if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) - triangle_vertices(kg, ccl_fetch(sd, prim), Co); + if(sd->type & PRIMITIVE_TRIANGLE) + triangle_vertices(kg, sd->prim, Co); else - motion_triangle_vertices(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), Co); + motion_triangle_vertices(kg, sd->object, sd->prim, sd->time, Co); - if(!(ccl_fetch(sd, object_flag) & SD_OBJECT_TRANSFORM_APPLIED)) { + if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { object_position_transform(kg, sd, &Co[0]); object_position_transform(kg, sd, &Co[1]); object_position_transform(kg, sd, &Co[2]); @@ -66,8 +66,8 @@ ccl_device_inline float wireframe(KernelGlobals *kg, if(pixel_size) { // Project the derivatives of P to the viewing plane defined // by I so we have a measure of how big is a pixel at this point - float pixelwidth_x = len(ccl_fetch(sd, dP).dx - dot(ccl_fetch(sd, dP).dx, ccl_fetch(sd, I)) * ccl_fetch(sd, I)); - float pixelwidth_y = len(ccl_fetch(sd, dP).dy - dot(ccl_fetch(sd, dP).dy, ccl_fetch(sd, I)) * ccl_fetch(sd, I)); + float pixelwidth_x = len(sd->dP.dx - dot(sd->dP.dx, sd->I) * sd->I); + float pixelwidth_y = len(sd->dP.dy - dot(sd->dP.dy, sd->I) * sd->I); // Take the average of both axis' length pixelwidth = (pixelwidth_x + pixelwidth_y) * 0.5f; } @@ -113,20 +113,20 @@ ccl_device void svm_node_wireframe(KernelGlobals *kg, * With OpenCL 2.0 it's possible to avoid this change, but for until * then we'll be living with such an exception. */ - float3 P = ccl_fetch(sd, P); + float3 P = sd->P; float f = wireframe(kg, sd, size, pixel_size, &P); #else - float f = wireframe(kg, sd, size, pixel_size, &ccl_fetch(sd, P)); + float f = wireframe(kg, sd, size, pixel_size, &sd->P); #endif /* TODO(sergey): Think of faster way to calculate derivatives. 
*/ if(bump_offset == NODE_BUMP_OFFSET_DX) { - float3 Px = ccl_fetch(sd, P) - ccl_fetch(sd, dP).dx; - f += (f - wireframe(kg, sd, size, pixel_size, &Px)) / len(ccl_fetch(sd, dP).dx); + float3 Px = sd->P - sd->dP.dx; + f += (f - wireframe(kg, sd, size, pixel_size, &Px)) / len(sd->dP.dx); } else if(bump_offset == NODE_BUMP_OFFSET_DY) { - float3 Py = ccl_fetch(sd, P) - ccl_fetch(sd, dP).dy; - f += (f - wireframe(kg, sd, size, pixel_size, &Py)) / len(ccl_fetch(sd, dP).dy); + float3 Py = sd->P - sd->dP.dy; + f += (f - wireframe(kg, sd, size, pixel_size, &Py)) / len(sd->dP.dy); } if(stack_valid(out_fac)) diff --git a/intern/cycles/render/background.h b/intern/cycles/render/background.h index 8029c6a9e80..deb22c9c2f2 100644 --- a/intern/cycles/render/background.h +++ b/intern/cycles/render/background.h @@ -30,7 +30,7 @@ class Shader; class Background : public Node { public: - NODE_DECLARE; + NODE_DECLARE float ao_factor; float ao_distance; diff --git a/intern/cycles/render/bake.cpp b/intern/cycles/render/bake.cpp index d9a297002c6..c2f6293a50b 100644 --- a/intern/cycles/render/bake.cpp +++ b/intern/cycles/render/bake.cpp @@ -171,9 +171,9 @@ bool BakeManager::bake(Device *device, DeviceScene *dscene, Scene *scene, Progre /* needs to be up to data for attribute access */ device->const_copy_to("__data", &dscene->data, sizeof(dscene->data)); - device->mem_alloc(d_input, MEM_READ_ONLY); + device->mem_alloc("bake_input", d_input, MEM_READ_ONLY); device->mem_copy_to(d_input); - device->mem_alloc(d_output, MEM_READ_WRITE); + device->mem_alloc("bake_output", d_output, MEM_READ_WRITE); DeviceTask task(DeviceTask::SHADER); task.shader_input = d_input.device_pointer; diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp index f1692712d61..e3ef4bf13fb 100644 --- a/intern/cycles/render/buffers.cpp +++ b/intern/cycles/render/buffers.cpp @@ -129,13 +129,13 @@ void RenderBuffers::reset(Device *device, BufferParams& params_) /* allocate buffer */ buffer.resize(params.width*params.height*params.get_passes_size()); - device->mem_alloc(buffer, MEM_READ_WRITE); + device->mem_alloc("render_buffer", buffer, MEM_READ_WRITE); device->mem_zero(buffer); /* allocate rng state */ rng_state.resize(params.width, params.height); - device->mem_alloc(rng_state, MEM_READ_WRITE); + device->mem_alloc("rng_state", rng_state, MEM_READ_WRITE); } bool RenderBuffers::copy_from_device() diff --git a/intern/cycles/render/camera.h b/intern/cycles/render/camera.h index 141ef9cccef..655d74e42d8 100644 --- a/intern/cycles/render/camera.h +++ b/intern/cycles/render/camera.h @@ -39,7 +39,7 @@ class Scene; class Camera : public Node { public: - NODE_DECLARE; + NODE_DECLARE /* Specifies an offset for the shutter's time interval. 
*/ enum MotionPosition { diff --git a/intern/cycles/render/film.h b/intern/cycles/render/film.h index 9fa51c51f52..d917057ed91 100644 --- a/intern/cycles/render/film.h +++ b/intern/cycles/render/film.h @@ -53,7 +53,7 @@ public: class Film : public Node { public: - NODE_DECLARE; + NODE_DECLARE float exposure; array<Pass> passes; diff --git a/intern/cycles/render/graph.h b/intern/cycles/render/graph.h index 780fdf49ca4..06524d3fa13 100644 --- a/intern/cycles/render/graph.h +++ b/intern/cycles/render/graph.h @@ -201,14 +201,14 @@ public: /* Node definition utility macros */ #define SHADER_NODE_CLASS(type) \ - NODE_DECLARE; \ + NODE_DECLARE \ type(); \ virtual ShaderNode *clone() const { return new type(*this); } \ virtual void compile(SVMCompiler& compiler); \ virtual void compile(OSLCompiler& compiler); \ #define SHADER_NODE_NO_CLONE_CLASS(type) \ - NODE_DECLARE; \ + NODE_DECLARE \ type(); \ virtual void compile(SVMCompiler& compiler); \ virtual void compile(OSLCompiler& compiler); \ diff --git a/intern/cycles/render/integrator.h b/intern/cycles/render/integrator.h index 27fff4831e5..3ce41d5a185 100644 --- a/intern/cycles/render/integrator.h +++ b/intern/cycles/render/integrator.h @@ -29,7 +29,7 @@ class Scene; class Integrator : public Node { public: - NODE_DECLARE; + NODE_DECLARE int min_bounce; int max_bounce; diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp index 6a4557506c3..fc6790dc022 100644 --- a/intern/cycles/render/light.cpp +++ b/intern/cycles/render/light.cpp @@ -57,9 +57,9 @@ static void shade_background_pixels(Device *device, DeviceScene *dscene, int res device->const_copy_to("__data", &dscene->data, sizeof(dscene->data)); - device->mem_alloc(d_input, MEM_READ_ONLY); + device->mem_alloc("shade_background_pixels_input", d_input, MEM_READ_ONLY); device->mem_copy_to(d_input); - device->mem_alloc(d_output, MEM_WRITE_ONLY); + device->mem_alloc("shade_background_pixels_output", d_output, MEM_WRITE_ONLY); DeviceTask main_task(DeviceTask::SHADER); main_task.shader_input = d_input.device_pointer; diff --git a/intern/cycles/render/mesh.h b/intern/cycles/render/mesh.h index 5f33e30eac2..1f8b880c161 100644 --- a/intern/cycles/render/mesh.h +++ b/intern/cycles/render/mesh.h @@ -48,7 +48,7 @@ struct PackedPatchTable; class Mesh : public Node { public: - NODE_DECLARE; + NODE_DECLARE /* Mesh Triangle */ struct Triangle { diff --git a/intern/cycles/render/mesh_displace.cpp b/intern/cycles/render/mesh_displace.cpp index adc5b820298..4acb7911560 100644 --- a/intern/cycles/render/mesh_displace.cpp +++ b/intern/cycles/render/mesh_displace.cpp @@ -121,9 +121,9 @@ bool MeshManager::displace(Device *device, DeviceScene *dscene, Scene *scene, Me /* needs to be up to data for attribute access */ device->const_copy_to("__data", &dscene->data, sizeof(dscene->data)); - device->mem_alloc(d_input, MEM_READ_ONLY); + device->mem_alloc("displace_input", d_input, MEM_READ_ONLY); device->mem_copy_to(d_input); - device->mem_alloc(d_output, MEM_WRITE_ONLY); + device->mem_alloc("displace_output", d_output, MEM_WRITE_ONLY); DeviceTask task(DeviceTask::SHADER); task.shader_input = d_input.device_pointer; diff --git a/intern/cycles/render/object.h b/intern/cycles/render/object.h index 7e306fab2a8..3495849d149 100644 --- a/intern/cycles/render/object.h +++ b/intern/cycles/render/object.h @@ -40,7 +40,7 @@ struct Transform; class Object : public Node { public: - NODE_DECLARE; + NODE_DECLARE Mesh *mesh; Transform tfm; diff --git a/intern/cycles/render/session.cpp 
b/intern/cycles/render/session.cpp index 420866c9436..0c7bd271371 100644 --- a/intern/cycles/render/session.cpp +++ b/intern/cycles/render/session.cpp @@ -654,6 +654,8 @@ void Session::load_kernels() if(!kernels_loaded) { progress.set_status("Loading render kernels (may take a few minutes the first time)"); + scoped_timer timer; + DeviceRequestedFeatures requested_features = get_requested_device_features(); VLOG(2) << "Requested features:\n" << requested_features; if(!device->load_kernels(requested_features)) { @@ -667,6 +669,9 @@ void Session::load_kernels() return; } + progress.add_skip_time(timer, false); + VLOG(1) << "Total time spent loading kernels: " << time_dt() - timer.get_start(); + kernels_loaded = true; } } @@ -887,6 +892,7 @@ void Session::path_trace() task.need_finish_queue = params.progressive_refine; task.integrator_branched = scene->integrator->method == Integrator::BRANCHED_PATH; task.requested_tile_size = params.tile_size; + task.passes_size = tile_manager.params.get_passes_size(); device->task_add(task); } diff --git a/intern/cycles/render/shader.h b/intern/cycles/render/shader.h index 7d896652196..490c3f1c95d 100644 --- a/intern/cycles/render/shader.h +++ b/intern/cycles/render/shader.h @@ -82,7 +82,7 @@ enum DisplacementMethod { class Shader : public Node { public: - NODE_DECLARE; + NODE_DECLARE int pass_id; diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h index 433e41fbbb6..6c52117ef9a 100644 --- a/intern/cycles/util/util_atomic.h +++ b/intern/cycles/util/util_atomic.h @@ -32,6 +32,13 @@ ATOMIC_INLINE void atomic_update_max_z(size_t *maximum_value, size_t value) } } +#define atomic_add_and_fetch_float(p, x) atomic_add_and_fetch_fl((p), (x)) + +#define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1) + +#define CCL_LOCAL_MEM_FENCE 0 +#define ccl_barrier(flags) (void)0 + #else /* __KERNEL_GPU__ */ #ifdef __KERNEL_OPENCL__ @@ -39,7 +46,7 @@ ATOMIC_INLINE void atomic_update_max_z(size_t *maximum_value, size_t value) /* Float atomics implementation credits: * http://suhorukov.blogspot.in/2011/12/opencl-11-atomic-operations-on-floating.html */ -ccl_device_inline void atomic_add_and_fetch_float(volatile ccl_global float *source, +ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *source, const float operand) { union { @@ -56,10 +63,29 @@ ccl_device_inline void atomic_add_and_fetch_float(volatile ccl_global float *sou } while(atomic_cmpxchg((volatile ccl_global unsigned int *)source, prev_value.int_value, new_value.int_value) != prev_value.int_value); + return new_value.float_value; } +#define atomic_fetch_and_add_uint32(p, x) atomic_add((p), (x)) +#define atomic_fetch_and_inc_uint32(p) atomic_inc((p)) + +#define CCL_LOCAL_MEM_FENCE CLK_LOCAL_MEM_FENCE +#define ccl_barrier(flags) barrier(flags) + #endif /* __KERNEL_OPENCL__ */ +#ifdef __KERNEL_CUDA__ + +#define atomic_add_and_fetch_float(p, x) (atomicAdd((float*)(p), (float)(x)) + (float)(x)) + +#define atomic_fetch_and_add_uint32(p, x) atomicAdd((unsigned int*)(p), (unsigned int)(x)) +#define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1) + +#define CCL_LOCAL_MEM_FENCE +#define ccl_barrier(flags) __syncthreads() + +#endif /* __KERNEL_CUDA__ */ + #endif /* __KERNEL_GPU__ */ #endif /* __UTIL_ATOMIC_H__ */ diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp index 80d177d2cae..f12c5e28c80 100644 --- a/intern/cycles/util/util_debug.cpp +++ b/intern/cycles/util/util_debug.cpp @@ -29,7 +29,8 @@ 
DebugFlags::CPU::CPU() sse41(true), sse3(true), sse2(true), - qbvh(true) + qbvh(true), + split_kernel(false) { reset(); } @@ -55,10 +56,12 @@ void DebugFlags::CPU::reset() #undef CHECK_CPU_FLAGS qbvh = true; + split_kernel = false; } DebugFlags::CUDA::CUDA() - : adaptive_compile(false) + : adaptive_compile(false), + split_kernel(false) { reset(); } @@ -67,6 +70,8 @@ void DebugFlags::CUDA::reset() { if(getenv("CYCLES_CUDA_ADAPTIVE_COMPILE") != NULL) adaptive_compile = true; + + split_kernel = false; } DebugFlags::OpenCL::OpenCL() @@ -133,7 +138,9 @@ std::ostream& operator <<(std::ostream &os, << " AVX : " << string_from_bool(debug_flags.cpu.avx) << "\n" << " SSE4.1 : " << string_from_bool(debug_flags.cpu.sse41) << "\n" << " SSE3 : " << string_from_bool(debug_flags.cpu.sse3) << "\n" - << " SSE2 : " << string_from_bool(debug_flags.cpu.sse2) << "\n"; + << " SSE2 : " << string_from_bool(debug_flags.cpu.sse2) << "\n" + << " QBVH : " << string_from_bool(debug_flags.cpu.qbvh) << "\n" + << " Split : " << string_from_bool(debug_flags.cpu.split_kernel) << "\n"; os << "CUDA flags:\n" << " Adaptive Compile: " << string_from_bool(debug_flags.cuda.adaptive_compile) << "\n"; diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h index 73fd228b5d9..911c95de4ab 100644 --- a/intern/cycles/util/util_debug.h +++ b/intern/cycles/util/util_debug.h @@ -46,6 +46,9 @@ public: /* Whether QBVH usage is allowed or not. */ bool qbvh; + + /* Whether split kernel is used */ + bool split_kernel; }; /* Descriptor of CUDA feature-set to be used. */ @@ -58,6 +61,9 @@ public: /* Whether adaptive feature based runtime compile is enabled or not. * Requires the CUDA Toolkit and only works on Linux atm. */ bool adaptive_compile; + + /* Whether split kernel is used */ + bool split_kernel; }; /* Descriptor of OpenCL feature-set to be used. */ diff --git a/intern/cycles/util/util_logging.cpp b/intern/cycles/util/util_logging.cpp index 03041723e15..6824f1ff83c 100644 --- a/intern/cycles/util/util_logging.cpp +++ b/intern/cycles/util/util_logging.cpp @@ -69,6 +69,15 @@ void util_logging_verbosity_set(int verbosity) } std::ostream& operator <<(std::ostream &os, + const int2 &value) +{ + os << "(" << value.x + << ", " << value.y + << ")"; + return os; +} + +std::ostream& operator <<(std::ostream &os, const float3 &value) { os << "(" << value.x diff --git a/intern/cycles/util/util_logging.h b/intern/cycles/util/util_logging.h index 2aa9c25b1a0..ecf9c9cfee0 100644 --- a/intern/cycles/util/util_logging.h +++ b/intern/cycles/util/util_logging.h @@ -45,6 +45,7 @@ public: #define VLOG_ONCE(level, flag) if(!flag) flag = true, VLOG(level) +struct int2; struct float3; void util_logging_init(const char *argv0); @@ -52,6 +53,8 @@ void util_logging_start(void); void util_logging_verbosity_set(int verbosity); std::ostream& operator <<(std::ostream &os, + const int2 &value); +std::ostream& operator <<(std::ostream &os, const float3 &value); CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_path.cpp b/intern/cycles/util/util_path.cpp index 5df262fcbbb..1b2e8aace5b 100644 --- a/intern/cycles/util/util_path.cpp +++ b/intern/cycles/util/util_path.cpp @@ -814,7 +814,7 @@ string path_source_replace_includes(const string& source, /* Use line directives for better error messages. 
*/ line = line_directive(filepath, 1) + token.replace(0, n_end + 1, "\n" + text + "\n") - + line_directive(path_join(path, source_filename), i); + + line_directive(path_join(path, source_filename), i + 1); } } } diff --git a/intern/cycles/util/util_ssef.h b/intern/cycles/util/util_ssef.h index 2f5295b5463..cf99a08efae 100644 --- a/intern/cycles/util/util_ssef.h +++ b/intern/cycles/util/util_ssef.h @@ -514,12 +514,12 @@ ccl_device_inline float len3(const ssef& a) /* faster version for SSSE3 */ typedef ssei shuffle_swap_t; -ccl_device_inline const shuffle_swap_t shuffle_swap_identity(void) +ccl_device_inline shuffle_swap_t shuffle_swap_identity(void) { return _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); } -ccl_device_inline const shuffle_swap_t shuffle_swap_swap(void) +ccl_device_inline shuffle_swap_t shuffle_swap_swap(void) { return _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); } @@ -534,12 +534,12 @@ ccl_device_inline const ssef shuffle_swap(const ssef& a, const shuffle_swap_t& s /* somewhat slower version for SSE2 */ typedef int shuffle_swap_t; -ccl_device_inline const shuffle_swap_t shuffle_swap_identity(void) +ccl_device_inline shuffle_swap_t shuffle_swap_identity(void) { return 0; } -ccl_device_inline const shuffle_swap_t shuffle_swap_swap(void) +ccl_device_inline shuffle_swap_t shuffle_swap_swap(void) { return 1; } diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h index a000fae4bd6..36d2f1053c7 100644 --- a/intern/cycles/util/util_types.h +++ b/intern/cycles/util/util_types.h @@ -37,6 +37,9 @@ #define ccl_device_noinline static #define ccl_global #define ccl_constant +#define ccl_local +#define ccl_local_param +#define ccl_private #define ccl_restrict __restrict #define __KERNEL_WITH_SSE_ALIGN__ @@ -397,11 +400,6 @@ ccl_device_inline float4 make_float4(float x, float y, float z, float w) return a; } -ccl_device_inline int align_up(int offset, int alignment) -{ - return (offset + alignment - 1) & ~(alignment - 1); -} - ccl_device_inline int3 make_int3(int i) { #ifdef __KERNEL_SSE__ @@ -476,6 +474,21 @@ ccl_device_inline int4 make_int4(const float3& f) #endif +ccl_device_inline int align_up(int offset, int alignment) +{ + return (offset + alignment - 1) & ~(alignment - 1); +} + +ccl_device_inline int round_up(int x, int multiple) +{ + return ((x + multiple - 1) / multiple) * multiple; +} + +ccl_device_inline int round_down(int x, int multiple) +{ + return (x / multiple) * multiple; +} + /* Interpolation types for textures * cuda also use texture space to store other objects */ enum InterpolationType { diff --git a/release/scripts/presets/interface_theme/back_to_black.xml b/release/scripts/presets/interface_theme/back_to_black.xml index 915e9cb64f1..1636f5b5cf6 100644 --- a/release/scripts/presets/interface_theme/back_to_black.xml +++ b/release/scripts/presets/interface_theme/back_to_black.xml @@ -18,7 +18,7 @@ text_sel="#ffffff" show_shaded="TRUE" shadetop="-100" - shadedown="0"> + shadedown="5"> </ThemeWidgetColors> </wcol_regular> <wcol_tool> @@ -30,19 +30,19 @@ text_sel="#ffffff" show_shaded="TRUE" shadetop="-100" - shadedown="0"> + shadedown="5"> </ThemeWidgetColors> </wcol_tool> <wcol_radio> <ThemeWidgetColors outline="#2a2a2a" inner="#111111ff" inner_sel="#33406bff" - item="#191919ff" + item="#444444ff" text="#929292" text_sel="#ffffff" show_shaded="TRUE" shadetop="-100" - shadedown="0"> + shadedown="5"> </ThemeWidgetColors> </wcol_radio> <wcol_text> @@ -50,23 +50,23 @@ inner="#111111ff" 
inner_sel="#33406bff" item="#191919ff" - text="#e4e4e4" + text="#929292" text_sel="#ffffff" show_shaded="TRUE" shadetop="-100" - shadedown="0"> + shadedown="5"> </ThemeWidgetColors> </wcol_text> <wcol_option> - <ThemeWidgetColors outline="#2a2a2a" + <ThemeWidgetColors outline="#535353" inner="#111111ff" inner_sel="#33406bff" - item="#000000ff" - text="#c7c7c7" + item="#a3a3a3ff" + text="#929292" text_sel="#ffffff" show_shaded="TRUE" shadetop="-100" - shadedown="0"> + shadedown="5"> </ThemeWidgetColors> </wcol_option> <wcol_toggle> @@ -78,7 +78,7 @@ text_sel="#ffffff" show_shaded="TRUE" shadetop="-100" - shadedown="0"> + shadedown="5"> </ThemeWidgetColors> </wcol_toggle> <wcol_num> @@ -90,7 +90,7 @@ text_sel="#ffffff" show_shaded="TRUE" shadetop="-100" - shadedown="0"> + shadedown="5"> </ThemeWidgetColors> </wcol_num> <wcol_numslider> @@ -102,7 +102,7 @@ text_sel="#ffffff" show_shaded="TRUE" shadetop="-100" - shadedown="0"> + shadedown="5"> </ThemeWidgetColors> </wcol_numslider> <wcol_box> @@ -114,7 +114,7 @@ text_sel="#ffffff" show_shaded="TRUE" shadetop="-100" - shadedown="0"> + shadedown="5"> </ThemeWidgetColors> </wcol_box> <wcol_menu> @@ -126,7 +126,7 @@ text_sel="#ffffff" show_shaded="TRUE" shadetop="-100" - shadedown="0"> + shadedown="5"> </ThemeWidgetColors> </wcol_menu> <wcol_pulldown> @@ -138,7 +138,7 @@ text_sel="#ffffff" show_shaded="TRUE" shadetop="-100" - shadedown="0"> + shadedown="5"> </ThemeWidgetColors> </wcol_pulldown> <wcol_menu_back> @@ -150,7 +150,7 @@ text_sel="#ffffff" show_shaded="TRUE" shadetop="-100" - shadedown="0"> + shadedown="5"> </ThemeWidgetColors> </wcol_menu_back> <wcol_pie_menu> @@ -170,7 +170,7 @@ inner="#191919e6" inner_sel="#2d2d2de6" item="#646464ff" - text="#ffffff" + text="#929292" text_sel="#ffffff" show_shaded="FALSE" shadetop="25" @@ -186,7 +186,7 @@ text_sel="#ffffff" show_shaded="TRUE" shadetop="-100" - shadedown="0"> + shadedown="5"> </ThemeWidgetColors> </wcol_menu_item> <wcol_scroll> @@ -198,7 +198,7 @@ text_sel="#ffffff" show_shaded="TRUE" shadetop="-100" - shadedown="0"> + shadedown="5"> </ThemeWidgetColors> </wcol_scroll> <wcol_progress> @@ -210,7 +210,7 @@ text_sel="#ffffff" show_shaded="TRUE" shadetop="-100" - shadedown="0"> + shadedown="5"> </ThemeWidgetColors> </wcol_progress> <wcol_list_item> @@ -222,7 +222,7 @@ text_sel="#ffffff" show_shaded="TRUE" shadetop="-100" - shadedown="0"> + shadedown="5"> </ThemeWidgetColors> </wcol_list_item> <wcol_state> @@ -239,32 +239,35 @@ </user_interface> <view_3d> <ThemeView3D grid="#222222" + clipping_border_3d="#313131ff" wire="#888888" - wire_edit="#000000" + wire_edit="#6c75ff" gp_vertex="#000000" gp_vertex_select="#ff8500" gp_vertex_size="3" - lamp="#c1d40028" - speaker="#535353" - camera="#000000" - view_overlay="#000000" - empty="#000000" + text_grease_pencil="#b5e61d" object_selected="#f15800" object_active="#ff8c19" object_grouped="#083008" object_grouped_active="#55bb55" - transform="#ffffff" + text_keyframe="#ddd700" + camera="#535353" + empty="#535353" + lamp="#fff0d328" + speaker="#535353" vertex="#72cfdd" vertex_select="#ff8500" vertex_size="3" + vertex_bevel="#00a5ff" vertex_unreferenced="#000000" edge_select="#ffa000" edge_seam="#db2512" edge_sharp="#ff2020" edge_crease="#cc0099" + edge_bevel="#00a5ff" edge_facesel="#6b6b6b" freestyle_edge_mark="#7fff7f" - face="#73828f12" + face="#73828f41" face_select="#ffa4003c" face_dot="#ffa900" facedot_size="4" @@ -291,19 +294,18 @@ normal="#22dddd" vertex_normal="#2361dd" split_normal="#dd23dd" - bone_solid="#c8c8c8" bone_pose="#50c8ff" 
bone_pose_active="#8cffff" - frame_current="#60c040" - outline_width="1" + bone_solid="#c8c8c8" bundle_solid="#c8c8c8" camera_path="#5a5a5a" skin_root="#000000" - clipping_border_3d="#313131ff" - text_keyframe="#ddd700" - text_grease_pencil="#b5e61d" + view_overlay="#000000" + transform="#ffffff" + frame_current="#60c040" paint_curve_handle="#7fff7f7f" - paint_curve_pivot="#ff7f7f7f"> + paint_curve_pivot="#ff7f7f7f" + outline_width="1"> <space> <ThemeSpaceGradient title="#5d5d5d" text="#7d7d7d" @@ -312,23 +314,23 @@ header_text="#979797" header_text_hi="#ffffff" button="#00000057" - button_title="#c5c5c5" + button_title="#929292" button_text="#c3c3c3" - button_text_hi="#ffffff" + button_text_hi="#e5e5e5" tab_active="#212947" tab_inactive="#000000" tab_back="#060606ff" tab_outline="#000000"> <gradients> <ThemeGradientColors show_grad="TRUE" - gradient="#0a0a0a" + gradient="#1d1d1d" high_gradient="#000000"> </ThemeGradientColors> </gradients> <panelcolors> <ThemePanelColors header="#00000019" back="#72727280" - show_header="FALSE" + show_header="TRUE" show_back="FALSE"> </ThemePanelColors> </panelcolors> @@ -348,6 +350,7 @@ vertex="#ffffff" vertex_select="#ff8500" vertex_size="3" + vertex_bevel="#000000" vertex_unreferenced="#000000" handle_free="#808080" handle_auto="#909000" @@ -382,7 +385,7 @@ <panelcolors> <ThemePanelColors header="#00000019" back="#72727280" - show_header="FALSE" + show_header="TRUE" show_back="FALSE"> </ThemePanelColors> </panelcolors> @@ -418,7 +421,7 @@ <panelcolors> <ThemePanelColors header="#00000019" back="#72727280" - show_header="FALSE" + show_header="TRUE" show_back="FALSE"> </ThemePanelColors> </panelcolors> @@ -463,7 +466,7 @@ <panelcolors> <ThemePanelColors header="#00000019" back="#72727280" - show_header="FALSE" + show_header="TRUE" show_back="FALSE"> </ThemePanelColors> </panelcolors> @@ -501,6 +504,7 @@ keyframe_jitter_selected="#61c042" keyframe_border="#000000ff" keyframe_border_selected="#000000ff" + keyframe_scale_factor="1" summary="#00000000"> <space> <ThemeSpaceGeneric back="#080808" @@ -521,7 +525,7 @@ <panelcolors> <ThemePanelColors header="#00000019" back="#72727280" - show_header="FALSE" + show_header="TRUE" show_back="FALSE"> </ThemePanelColors> </panelcolors> @@ -543,6 +547,7 @@ vertex="#0f13bb" vertex_select="#ff8500" vertex_size="3" + vertex_bevel="#000000" vertex_unreferenced="#000000" face="#ffffff0a" face_select="#ff85003c" @@ -596,7 +601,7 @@ <panelcolors> <ThemePanelColors header="#00000019" back="#72727280" - show_header="FALSE" + show_header="TRUE" show_back="FALSE"> </ThemePanelColors> </panelcolors> @@ -644,7 +649,7 @@ <panelcolors> <ThemePanelColors header="#00000019" back="#72727280" - show_header="FALSE" + show_header="TRUE" show_back="FALSE"> </ThemePanelColors> </panelcolors> @@ -673,7 +678,7 @@ <panelcolors> <ThemePanelColors header="#00000019" back="#72727280" - show_header="FALSE" + show_header="TRUE" show_back="FALSE"> </ThemePanelColors> </panelcolors> @@ -712,7 +717,7 @@ <panelcolors> <ThemePanelColors header="#00000019" back="#72727280" - show_header="FALSE" + show_header="TRUE" show_back="FALSE"> </ThemePanelColors> </panelcolors> @@ -744,7 +749,7 @@ <panelcolors> <ThemePanelColors header="#00000019" back="#72727280" - show_header="FALSE" + show_header="TRUE" show_back="FALSE"> </ThemePanelColors> </panelcolors> @@ -799,7 +804,7 @@ <panelcolors> <ThemePanelColors header="#00000019" back="#72727280" - show_header="FALSE" + show_header="TRUE" show_back="FALSE"> </ThemePanelColors> </panelcolors> @@ -835,7 +840,7 @@ 
<panelcolors> <ThemePanelColors header="#00000019" back="#72727280" - show_header="FALSE" + show_header="TRUE" show_back="FALSE"> </ThemePanelColors> </panelcolors> @@ -865,7 +870,7 @@ <panelcolors> <ThemePanelColors header="#00000019" back="#72727280" - show_header="FALSE" + show_header="TRUE" show_back="FALSE"> </ThemePanelColors> </panelcolors> @@ -903,7 +908,7 @@ <panelcolors> <ThemePanelColors header="#00000019" back="#72727280" - show_header="FALSE" + show_header="TRUE" show_back="FALSE"> </ThemePanelColors> </panelcolors> @@ -932,7 +937,7 @@ <panelcolors> <ThemePanelColors header="#00000019" back="#72727280" - show_header="FALSE" + show_header="TRUE" show_back="FALSE"> </ThemePanelColors> </panelcolors> @@ -966,7 +971,7 @@ <panelcolors> <ThemePanelColors header="#00000019" back="#72727280" - show_header="FALSE" + show_header="TRUE" show_back="FALSE"> </ThemePanelColors> </panelcolors> @@ -1019,7 +1024,7 @@ <panelcolors> <ThemePanelColors header="#00000019" back="#72727280" - show_header="FALSE" + show_header="TRUE" show_back="FALSE"> </ThemePanelColors> </panelcolors> diff --git a/release/scripts/presets/keyconfig/3dsmax.py b/release/scripts/presets/keyconfig/3dsmax.py index b6b0a0c926f..a07a3a52caf 100644 --- a/release/scripts/presets/keyconfig/3dsmax.py +++ b/release/scripts/presets/keyconfig/3dsmax.py @@ -400,6 +400,12 @@ kmi = km.keymap_items.new('particle.hide', 'H', 'PRESS') kmi.properties.unselected = False kmi = km.keymap_items.new('particle.hide', 'H', 'PRESS', shift=True) kmi.properties.unselected = True +kmi = km.keymap_items.new('view3d.manipulator', 'LEFTMOUSE', 'PRESS', shift=True) +kmi.properties.release_confirm = True +kmi.properties.use_planar_constraint= True +kmi = km.keymap_items.new('view3d.manipulator', 'LEFTMOUSE', 'PRESS', shift=True) +kmi.properties.release_confirm = True +kmi.properties.use_accurate= True kmi = km.keymap_items.new('view3d.manipulator', 'LEFTMOUSE', 'PRESS', any=True) kmi.properties.release_confirm = True kmi = km.keymap_items.new('particle.brush_edit', 'LEFTMOUSE', 'PRESS') @@ -421,6 +427,12 @@ kmi.properties.value_2 = 'ENABLED' # Map 3D View km = kc.keymaps.new('3D View', space_type='VIEW_3D', region_type='WINDOW', modal=False) +kmi = km.keymap_items.new('view3d.manipulator', 'LEFTMOUSE', 'PRESS', shift=True) +kmi.properties.release_confirm = True +kmi.properties.use_planar_constraint= True +kmi = km.keymap_items.new('view3d.manipulator', 'LEFTMOUSE', 'PRESS', shift=True) +kmi.properties.release_confirm = True +kmi.properties.use_accurate= True kmi = km.keymap_items.new('view3d.manipulator', 'LEFTMOUSE', 'PRESS', any=True) kmi.properties.release_confirm = True kmi = km.keymap_items.new('view3d.cursor3d', 'ACTIONMOUSE', 'PRESS') diff --git a/release/scripts/presets/keyconfig/maya.py b/release/scripts/presets/keyconfig/maya.py index 3f4754863c6..cf213c1ddbd 100644 --- a/release/scripts/presets/keyconfig/maya.py +++ b/release/scripts/presets/keyconfig/maya.py @@ -935,6 +935,9 @@ kmi = km.keymap_items.new('view3d.rotate', 'LEFTMOUSE', 'PRESS', alt=True) kmi = km.keymap_items.new('view3d.manipulator', 'LEFTMOUSE', 'PRESS', shift=True) kmi.properties.release_confirm = True kmi.properties.use_planar_constraint = True +kmi = km.keymap_items.new('view3d.manipulator', 'LEFTMOUSE', 'PRESS', shift=True) +kmi.properties.release_confirm = True +kmi.properties.use_accurate = True kmi = km.keymap_items.new('view3d.manipulator', 'LEFTMOUSE', 'PRESS', any=True) kmi.properties.release_confirm = True kmi = km.keymap_items.new('view3d.move', 'MIDDLEMOUSE', 
'PRESS', alt=True) diff --git a/source/blender/blenlib/BLI_rect.h b/source/blender/blenlib/BLI_rect.h index 484a679f76a..041679ef876 100644 --- a/source/blender/blenlib/BLI_rect.h +++ b/source/blender/blenlib/BLI_rect.h @@ -47,8 +47,8 @@ bool BLI_rcti_is_empty(const struct rcti *rect); bool BLI_rctf_is_empty(const struct rctf *rect); void BLI_rctf_init(struct rctf *rect, float xmin, float xmax, float ymin, float ymax); void BLI_rcti_init(struct rcti *rect, int xmin, int xmax, int ymin, int ymax); -void BLI_rctf_init_pt_size(struct rctf *rect, const float xy[2], float size); -void BLI_rcti_init_pt_size(struct rcti *rect, const int xy[2], int size); +void BLI_rctf_init_pt_radius(struct rctf *rect, const float xy[2], float size); +void BLI_rcti_init_pt_radius(struct rcti *rect, const int xy[2], int size); void BLI_rcti_init_minmax(struct rcti *rect); void BLI_rctf_init_minmax(struct rctf *rect); void BLI_rcti_do_minmax_v(struct rcti *rect, const int xy[2]); diff --git a/source/blender/blenlib/BLI_task.h b/source/blender/blenlib/BLI_task.h index d27bf4dad20..c3c587275e1 100644 --- a/source/blender/blenlib/BLI_task.h +++ b/source/blender/blenlib/BLI_task.h @@ -81,6 +81,7 @@ typedef void (*TaskFreeFunction)(TaskPool *__restrict pool, void *taskdata, int TaskPool *BLI_task_pool_create(TaskScheduler *scheduler, void *userdata); TaskPool *BLI_task_pool_create_background(TaskScheduler *scheduler, void *userdata); +TaskPool *BLI_task_pool_create_suspended(TaskScheduler *scheduler, void *userdata); void BLI_task_pool_free(TaskPool *pool); void BLI_task_pool_push_ex( @@ -96,9 +97,6 @@ void BLI_task_pool_work_and_wait(TaskPool *pool); /* cancel all tasks, keep worker threads running */ void BLI_task_pool_cancel(TaskPool *pool); -/* set number of threads allowed to be used by this pool */ -void BLI_pool_set_num_threads(TaskPool *pool, int num_threads); - /* for worker threads, test if canceled */ bool BLI_task_pool_canceled(TaskPool *pool); diff --git a/source/blender/blenlib/intern/rct.c b/source/blender/blenlib/intern/rct.c index e01a4714131..fd24a00156d 100644 --- a/source/blender/blenlib/intern/rct.c +++ b/source/blender/blenlib/intern/rct.c @@ -351,7 +351,7 @@ void BLI_rcti_init(rcti *rect, int xmin, int xmax, int ymin, int ymax) } } -void BLI_rctf_init_pt_size(rctf *rect, const float xy[2], float size) +void BLI_rctf_init_pt_radius(rctf *rect, const float xy[2], float size) { rect->xmin = xy[0] - size; rect->xmax = xy[0] + size; @@ -359,7 +359,7 @@ void BLI_rctf_init_pt_size(rctf *rect, const float xy[2], float size) rect->ymax = xy[1] + size; } -void BLI_rcti_init_pt_size(rcti *rect, const int xy[2], int size) +void BLI_rcti_init_pt_radius(rcti *rect, const int xy[2], int size) { rect->xmin = xy[0] - size; rect->xmax = xy[0] + size; diff --git a/source/blender/blenlib/intern/task.c b/source/blender/blenlib/intern/task.c index 5d16fd9229c..49d2ee83a66 100644 --- a/source/blender/blenlib/intern/task.c +++ b/source/blender/blenlib/intern/task.c @@ -48,6 +48,32 @@ */ #define MEMPOOL_SIZE 256 +/* Number of tasks which are pushed directly to the local thread queue. + * + * This allows a thread to fetch the next task without locking the whole queue.
+ */ +#define LOCALQUEUE_SIZE 1 + +#ifndef NDEBUG +# define ASSERT_THREAD_ID(scheduler, thread_id) \ + do { \ + if (!BLI_thread_is_main()) { \ + TaskThread *thread = pthread_getspecific(scheduler->tls_id_key); \ + if (thread == NULL) { \ + BLI_assert(thread_id == 0); \ + } \ + else { \ + BLI_assert(thread_id == thread->id); \ + } \ + } \ + else { \ + BLI_assert(thread_id == 0); \ + } \ + } while (false) +#else +# define ASSERT_THREAD_ID(scheduler, thread_id) +#endif + typedef struct Task { struct Task *next, *prev; @@ -102,12 +128,16 @@ typedef struct TaskMemPoolStats { } TaskMemPoolStats; #endif +typedef struct TaskThreadLocalStorage { + TaskMemPool task_mempool; + int num_local_queue; + Task *local_queue[LOCALQUEUE_SIZE]; +} TaskThreadLocalStorage; + struct TaskPool { TaskScheduler *scheduler; volatile size_t num; - size_t num_threads; - size_t currently_running_tasks; ThreadMutex num_mutex; ThreadCondition num_cond; @@ -115,6 +145,11 @@ struct TaskPool { ThreadMutex user_mutex; volatile bool do_cancel; + volatile bool do_work; + + volatile bool is_suspended; + ListBase suspended_queue; + size_t num_suspended; /* If set, this pool may never be work_and_wait'ed, which means TaskScheduler * has to use its special background fallback thread in case we are in @@ -122,16 +157,10 @@ struct TaskPool { */ bool run_in_background; - /* This pool is used for caching task pointers for thread id 0. - * This could either point to a global scheduler's task_mempool[0] if the - * pool is handled form the main thread or point to task_mempool_local - * otherwise. - * - * This way we solve possible threading conflicts accessing same global - * memory pool from multiple threads from which wait_work() is called. + /* This is a task scheduler's ID of a thread at which pool was constructed. + * It will be used to access task TLS. */ - TaskMemPool *task_mempool; - TaskMemPool task_mempool_local; + int thread_id; #ifdef DEBUG_STATS TaskMemPoolStats *mempool_stats; @@ -141,7 +170,6 @@ struct TaskPool { struct TaskScheduler { pthread_t *threads; struct TaskThread *task_threads; - TaskMemPool *task_mempool; int num_threads; bool background_thread_only; @@ -150,15 +178,19 @@ struct TaskScheduler { ThreadCondition queue_cond; volatile bool do_exit; + + /* NOTE: In pthread's TLS we store the whole TaskThread structure. 
*/ + pthread_key_t tls_id_key; }; typedef struct TaskThread { TaskScheduler *scheduler; int id; + TaskThreadLocalStorage tls; } TaskThread; /* Helper */ -static void task_data_free(Task *task, const int thread_id) +BLI_INLINE void task_data_free(Task *task, const int thread_id) { if (task->free_taskdata) { if (task->freedata) { @@ -170,12 +202,24 @@ static void task_data_free(Task *task, const int thread_id) } } -BLI_INLINE TaskMemPool *get_task_mempool(TaskPool *pool, const int thread_id) +BLI_INLINE TaskThreadLocalStorage *get_task_tls(TaskPool *pool, + const int thread_id) { + TaskScheduler *scheduler = pool->scheduler; + BLI_assert(thread_id >= 0); + BLI_assert(thread_id <= scheduler->num_threads); if (thread_id == 0) { - return pool->task_mempool; + return &scheduler->task_threads[pool->thread_id].tls; + } + return &scheduler->task_threads[thread_id].tls; +} + +BLI_INLINE void free_task_tls(TaskThreadLocalStorage *tls) +{ + TaskMemPool *task_mempool = &tls->task_mempool; + for (int i = 0; i < task_mempool->num_tasks; ++i) { + MEM_freeN(task_mempool->tasks[i]); } - return &pool->scheduler->task_mempool[thread_id]; } static Task *task_alloc(TaskPool *pool, const int thread_id) @@ -183,15 +227,17 @@ static Task *task_alloc(TaskPool *pool, const int thread_id) BLI_assert(thread_id <= pool->scheduler->num_threads); if (thread_id != -1) { BLI_assert(thread_id >= 0); - TaskMemPool *mem_pool = get_task_mempool(pool, thread_id); + BLI_assert(thread_id <= pool->scheduler->num_threads); + TaskThreadLocalStorage *tls = get_task_tls(pool, thread_id); + TaskMemPool *task_mempool = &tls->task_mempool; /* Try to re-use task memory from a thread local storage. */ - if (mem_pool->num_tasks > 0) { - --mem_pool->num_tasks; + if (task_mempool->num_tasks > 0) { + --task_mempool->num_tasks; /* Success! We've just avoided task allocation. */ #ifdef DEBUG_STATS pool->mempool_stats[thread_id].num_reuse++; #endif - return mem_pool->tasks[mem_pool->num_tasks]; + return task_mempool->tasks[task_mempool->num_tasks]; } /* We are doomed to allocate new task data. */ #ifdef DEBUG_STATS @@ -206,11 +252,12 @@ static void task_free(TaskPool *pool, Task *task, const int thread_id) task_data_free(task, thread_id); BLI_assert(thread_id >= 0); BLI_assert(thread_id <= pool->scheduler->num_threads); - TaskMemPool *mem_pool = get_task_mempool(pool, thread_id); - if (mem_pool->num_tasks < MEMPOOL_SIZE - 1) { + TaskThreadLocalStorage *tls = get_task_tls(pool, thread_id); + TaskMemPool *task_mempool = &tls->task_mempool; + if (task_mempool->num_tasks < MEMPOOL_SIZE - 1) { /* Successfully allowed the task to be re-used later. 
*/ - mem_pool->tasks[mem_pool->num_tasks] = task; - ++mem_pool->num_tasks; + task_mempool->tasks[task_mempool->num_tasks] = task; + ++task_mempool->num_tasks; } else { /* Local storage saturated, no other way than just discard @@ -236,7 +283,6 @@ static void task_pool_num_decrease(TaskPool *pool, size_t done) BLI_assert(pool->num >= done); pool->num -= done; - atomic_sub_and_fetch_z(&pool->currently_running_tasks, done); if (pool->num == 0) BLI_condition_notify_all(&pool->num_cond); @@ -244,11 +290,11 @@ static void task_pool_num_decrease(TaskPool *pool, size_t done) BLI_mutex_unlock(&pool->num_mutex); } -static void task_pool_num_increase(TaskPool *pool) +static void task_pool_num_increase(TaskPool *pool, size_t new) { BLI_mutex_lock(&pool->num_mutex); - pool->num++; + pool->num += new; BLI_condition_notify_all(&pool->num_cond); BLI_mutex_unlock(&pool->num_mutex); @@ -290,17 +336,10 @@ static bool task_scheduler_thread_wait_pop(TaskScheduler *scheduler, Task **task continue; } - if (atomic_add_and_fetch_z(&pool->currently_running_tasks, 1) <= pool->num_threads || - pool->num_threads == 0) - { - *task = current_task; - found_task = true; - BLI_remlink(&scheduler->queue, *task); - break; - } - else { - atomic_sub_and_fetch_z(&pool->currently_running_tasks, 1); - } + *task = current_task; + found_task = true; + BLI_remlink(&scheduler->queue, *task); + break; } if (!found_task) BLI_condition_wait(&scheduler->queue_cond, &scheduler->queue_mutex); @@ -311,13 +350,34 @@ static bool task_scheduler_thread_wait_pop(TaskScheduler *scheduler, Task **task return true; } +BLI_INLINE void handle_local_queue(TaskThreadLocalStorage *tls, + const int thread_id) +{ + while (tls->num_local_queue > 0) { + /* We pop task from queue before handling it so handler of the task can + * push next job to the local queue. + */ + tls->num_local_queue--; + Task *local_task = tls->local_queue[tls->num_local_queue]; + /* TODO(sergey): Double-check work_and_wait() doesn't handle other's + * pool tasks. + */ + TaskPool *local_pool = local_task->pool; + local_task->run(local_pool, local_task->taskdata, thread_id); + task_free(local_pool, local_task, thread_id); + } +} + static void *task_scheduler_thread_run(void *thread_p) { TaskThread *thread = (TaskThread *) thread_p; + TaskThreadLocalStorage *tls = &thread->tls; TaskScheduler *scheduler = thread->scheduler; int thread_id = thread->id; Task *task; + pthread_setspecific(scheduler->tls_id_key, thread); + /* keep popping off tasks */ while (task_scheduler_thread_wait_pop(scheduler, &task)) { TaskPool *pool = task->pool; @@ -328,6 +388,9 @@ static void *task_scheduler_thread_run(void *thread_p) /* delete task */ task_free(pool, task, thread_id); + /* Handle all tasks from local queue. 
*/ + handle_local_queue(tls, thread_id); + /* notify pool task was done */ task_pool_num_decrease(pool, 1); } @@ -361,16 +424,20 @@ TaskScheduler *BLI_task_scheduler_create(int num_threads) num_threads = 1; } + scheduler->task_threads = MEM_callocN(sizeof(TaskThread) * (num_threads + 1), + "TaskScheduler task threads"); + + pthread_key_create(&scheduler->tls_id_key, NULL); + /* launch threads that will be waiting for work */ if (num_threads > 0) { int i; scheduler->num_threads = num_threads; scheduler->threads = MEM_callocN(sizeof(pthread_t) * num_threads, "TaskScheduler threads"); - scheduler->task_threads = MEM_callocN(sizeof(TaskThread) * num_threads, "TaskScheduler task threads"); for (i = 0; i < num_threads; i++) { - TaskThread *thread = &scheduler->task_threads[i]; + TaskThread *thread = &scheduler->task_threads[i + 1]; thread->scheduler = scheduler; thread->id = i + 1; @@ -378,9 +445,6 @@ TaskScheduler *BLI_task_scheduler_create(int num_threads) fprintf(stderr, "TaskScheduler failed to launch thread %d/%d\n", i, num_threads); } } - - scheduler->task_mempool = MEM_callocN(sizeof(*scheduler->task_mempool) * (num_threads + 1), - "TaskScheduler task_mempool"); } return scheduler; @@ -396,6 +460,8 @@ void BLI_task_scheduler_free(TaskScheduler *scheduler) BLI_condition_notify_all(&scheduler->queue_cond); BLI_mutex_unlock(&scheduler->queue_mutex); + pthread_key_delete(scheduler->tls_id_key); + /* delete threads */ if (scheduler->threads) { int i; @@ -410,17 +476,12 @@ void BLI_task_scheduler_free(TaskScheduler *scheduler) /* Delete task thread data */ if (scheduler->task_threads) { - MEM_freeN(scheduler->task_threads); - } - - /* Delete task memory pool */ - if (scheduler->task_mempool) { - for (int i = 0; i <= scheduler->num_threads; ++i) { - for (int j = 0; j < scheduler->task_mempool[i].num_tasks; ++j) { - MEM_freeN(scheduler->task_mempool[i].tasks[j]); - } + for (int i = 0; i < scheduler->num_threads + 1; ++i) { + TaskThreadLocalStorage *tls = &scheduler->task_threads[i].tls; + free_task_tls(tls); } - MEM_freeN(scheduler->task_mempool); + + MEM_freeN(scheduler->task_threads); } /* delete leftover tasks */ @@ -443,7 +504,7 @@ int BLI_task_scheduler_num_threads(TaskScheduler *scheduler) static void task_scheduler_push(TaskScheduler *scheduler, Task *task, TaskPriority priority) { - task_pool_num_increase(task->pool); + task_pool_num_increase(task->pool, 1); /* add task to queue */ BLI_mutex_lock(&scheduler->queue_mutex); @@ -469,7 +530,7 @@ static void task_scheduler_clear(TaskScheduler *scheduler, TaskPool *pool) nexttask = task->next; if (task->pool == pool) { - task_data_free(task, 0); + task_data_free(task, pool->thread_id); BLI_freelinkN(&scheduler->queue, task); done++; @@ -484,7 +545,10 @@ static void task_scheduler_clear(TaskScheduler *scheduler, TaskPool *pool) /* Task Pool */ -static TaskPool *task_pool_create_ex(TaskScheduler *scheduler, void *userdata, const bool is_background) +static TaskPool *task_pool_create_ex(TaskScheduler *scheduler, + void *userdata, + const bool is_background, + const bool is_suspended) { TaskPool *pool = MEM_mallocN(sizeof(TaskPool), "TaskPool"); @@ -502,9 +566,11 @@ static TaskPool *task_pool_create_ex(TaskScheduler *scheduler, void *userdata, c pool->scheduler = scheduler; pool->num = 0; - pool->num_threads = 0; - pool->currently_running_tasks = 0; pool->do_cancel = false; + pool->do_work = false; + pool->is_suspended = is_suspended; + pool->num_suspended = 0; + pool->suspended_queue.first = pool->suspended_queue.last = NULL; 
pool->run_in_background = is_background;
BLI_mutex_init(&pool->num_mutex);
@@ -514,11 +580,21 @@ static TaskPool *task_pool_create_ex(TaskScheduler *scheduler, void *userdata, c
BLI_mutex_init(&pool->user_mutex);
if (BLI_thread_is_main()) {
- pool->task_mempool = scheduler->task_mempool;
+ pool->thread_id = 0;
}
else {
- pool->task_mempool = &pool->task_mempool_local;
- pool->task_mempool_local.num_tasks = 0;
+ TaskThread *thread = pthread_getspecific(scheduler->tls_id_key);
+ /* NOTE: It is possible that the pool is created from a non-main thread
+ * which isn't a scheduler thread. In this case pthread's TLS will
+ * be NULL and we can safely consider thread id 0 for the main
+ * thread of this pool (the one which does work_and_wait()).
+ */
+ if (thread == NULL) {
+ pool->thread_id = 0;
+ }
+ else {
+ pool->thread_id = thread->id;
+ }
}
#ifdef DEBUG_STATS
@@ -545,7 +621,7 @@ static TaskPool *task_pool_create_ex(TaskScheduler *scheduler, void *userdata, c
 */
TaskPool *BLI_task_pool_create(TaskScheduler *scheduler, void *userdata)
{
- return task_pool_create_ex(scheduler, userdata, false);
+ return task_pool_create_ex(scheduler, userdata, false, false);
}
/**
@@ -560,7 +636,17 @@ TaskPool *BLI_task_pool_create(TaskScheduler *scheduler, void *userdata)
 */
TaskPool *BLI_task_pool_create_background(TaskScheduler *scheduler, void *userdata)
{
- return task_pool_create_ex(scheduler, userdata, true);
+ return task_pool_create_ex(scheduler, userdata, true, false);
+}
+
+/**
+ * Similar to BLI_task_pool_create() but does not schedule any tasks for execution
+ * until BLI_task_pool_work_and_wait() is called. This helps reduce threading
+ * overhead when pushing a huge amount of small initial tasks from the main thread.
+ */
+TaskPool *BLI_task_pool_create_suspended(TaskScheduler *scheduler, void *userdata)
+{
+ return task_pool_create_ex(scheduler, userdata, false, true);
+}
void BLI_task_pool_free(TaskPool *pool)
@@ -572,13 +658,6 @@ void BLI_task_pool_free(TaskPool *pool)
BLI_mutex_end(&pool->user_mutex);
- /* Free local memory pool, those pointers are lost forever.
*/ - if (pool->task_mempool == &pool->task_mempool_local) { - for (int i = 0; i < pool->task_mempool_local.num_tasks; i++) { - MEM_freeN(pool->task_mempool_local.tasks[i]); - } - } - #ifdef DEBUG_STATS printf("Thread ID Allocated Reused Discarded\n"); for (int i = 0; i < pool->scheduler->num_threads + 1; ++i) { @@ -609,6 +688,25 @@ static void task_pool_push( task->freedata = freedata; task->pool = pool; + if (pool->is_suspended) { + BLI_addhead(&pool->suspended_queue, task); + atomic_fetch_and_add_z(&pool->num_suspended, 1); + return; + } + + if (thread_id != -1 && + (thread_id != pool->thread_id || pool->do_work)) + { + ASSERT_THREAD_ID(pool->scheduler, thread_id); + + TaskThreadLocalStorage *tls = get_task_tls(pool, thread_id); + if (tls->num_local_queue < LOCALQUEUE_SIZE) { + tls->local_queue[tls->num_local_queue] = task; + tls->num_local_queue++; + return; + } + } + task_scheduler_push(pool->scheduler, task, priority); } @@ -633,8 +731,27 @@ void BLI_task_pool_push_from_thread(TaskPool *pool, TaskRunFunction run, void BLI_task_pool_work_and_wait(TaskPool *pool) { + TaskThreadLocalStorage *tls = get_task_tls(pool, pool->thread_id); TaskScheduler *scheduler = pool->scheduler; + if (atomic_fetch_and_and_uint8((uint8_t*)&pool->is_suspended, 0)) { + if (pool->num_suspended) { + task_pool_num_increase(pool, pool->num_suspended); + BLI_mutex_lock(&scheduler->queue_mutex); + + BLI_movelisttolist(&scheduler->queue, &pool->suspended_queue); + + BLI_condition_notify_all(&scheduler->queue_cond); + BLI_mutex_unlock(&scheduler->queue_mutex); + + } + pool->is_suspended = false; + } + + pool->do_work = true; + + ASSERT_THREAD_ID(pool->scheduler, pool->thread_id); + BLI_mutex_lock(&pool->num_mutex); while (pool->num != 0) { @@ -648,16 +765,12 @@ void BLI_task_pool_work_and_wait(TaskPool *pool) /* find task from this pool. if we get a task from another pool, * we can get into deadlock */ - if (pool->num_threads == 0 || - pool->currently_running_tasks < pool->num_threads) - { - for (task = scheduler->queue.first; task; task = task->next) { - if (task->pool == pool) { - work_task = task; - found_task = true; - BLI_remlink(&scheduler->queue, task); - break; - } + for (task = scheduler->queue.first; task; task = task->next) { + if (task->pool == pool) { + work_task = task; + found_task = true; + BLI_remlink(&scheduler->queue, task); + break; } } @@ -666,11 +779,13 @@ void BLI_task_pool_work_and_wait(TaskPool *pool) /* if found task, do it, otherwise wait until other tasks are done */ if (found_task) { /* run task */ - atomic_add_and_fetch_z(&pool->currently_running_tasks, 1); - work_task->run(pool, work_task->taskdata, 0); + work_task->run(pool, work_task->taskdata, pool->thread_id); /* delete task */ - task_free(pool, task, 0); + task_free(pool, task, pool->thread_id); + + /* Handle all tasks from local queue. */ + handle_local_queue(tls, pool->thread_id); /* notify pool task was done */ task_pool_num_decrease(pool, 1); @@ -685,12 +800,8 @@ void BLI_task_pool_work_and_wait(TaskPool *pool) } BLI_mutex_unlock(&pool->num_mutex); -} -void BLI_pool_set_num_threads(TaskPool *pool, int num_threads) -{ - /* NOTE: Don't try to modify threads while tasks are running! 
*/ - pool->num_threads = num_threads; + handle_local_queue(tls, pool->thread_id); } void BLI_task_pool_cancel(TaskPool *pool) @@ -893,7 +1004,8 @@ static void task_parallel_range_ex( BLI_task_pool_push_from_thread(task_pool, parallel_range_func, userdata_chunk_local, false, - TASK_PRIORITY_HIGH, 0); + TASK_PRIORITY_HIGH, + task_pool->thread_id); } BLI_task_pool_work_and_wait(task_pool); @@ -1099,7 +1211,8 @@ void BLI_task_parallel_listbase( BLI_task_pool_push_from_thread(task_pool, parallel_listbase_func, NULL, false, - TASK_PRIORITY_HIGH, 0); + TASK_PRIORITY_HIGH, + task_pool->thread_id); } BLI_task_pool_work_and_wait(task_pool); diff --git a/source/blender/compositor/operations/COM_ConvolutionEdgeFilterOperation.cpp b/source/blender/compositor/operations/COM_ConvolutionEdgeFilterOperation.cpp index e1ada9a8c39..5f78067220a 100644 --- a/source/blender/compositor/operations/COM_ConvolutionEdgeFilterOperation.cpp +++ b/source/blender/compositor/operations/COM_ConvolutionEdgeFilterOperation.cpp @@ -94,4 +94,10 @@ void ConvolutionEdgeFilterOperation::executePixel(float output[4], int x, int y, output[2] = output[2] * value[0] + in2[2] * mval; output[3] = in2[3]; + + /* Make sure we don't return negative color. */ + output[0] = max(output[0], 0.0f); + output[1] = max(output[1], 0.0f); + output[2] = max(output[2], 0.0f); + output[3] = max(output[3], 0.0f); } diff --git a/source/blender/compositor/operations/COM_ConvolutionFilterOperation.cpp b/source/blender/compositor/operations/COM_ConvolutionFilterOperation.cpp index 68ec2be5ebd..6ac1ff9a1eb 100644 --- a/source/blender/compositor/operations/COM_ConvolutionFilterOperation.cpp +++ b/source/blender/compositor/operations/COM_ConvolutionFilterOperation.cpp @@ -107,6 +107,12 @@ void ConvolutionFilterOperation::executePixel(float output[4], int x, int y, voi output[1] = output[1] * value[0] + in2[1] * mval; output[2] = output[2] * value[0] + in2[2] * mval; output[3] = output[3] * value[0] + in2[3] * mval; + + /* Make sure we don't return negative color. */ + output[0] = max(output[0], 0.0f); + output[1] = max(output[1], 0.0f); + output[2] = max(output[2], 0.0f); + output[3] = max(output[3], 0.0f); } bool ConvolutionFilterOperation::determineDependingAreaOfInterest(rcti *input, ReadBufferOperation *readOperation, rcti *output) diff --git a/source/blender/depsgraph/intern/eval/deg_eval.cc b/source/blender/depsgraph/intern/eval/deg_eval.cc index 3a042535d26..e739bc9dbb5 100644 --- a/source/blender/depsgraph/intern/eval/deg_eval.cc +++ b/source/blender/depsgraph/intern/eval/deg_eval.cc @@ -95,105 +95,38 @@ static void deg_task_run_func(TaskPool *pool, /* Should only be the case for NOOPs, which never get to this point. */ BLI_assert(node->evaluate); - while (true) { - /* Get context. */ - /* TODO: Who initialises this? "Init" operations aren't able to - * initialise it!!! - */ - /* TODO(sergey): We don't use component contexts at this moment. */ - /* ComponentDepsNode *comp = node->owner; */ - BLI_assert(node->owner != NULL); - - /* Since we're not leaving the thread for until the graph branches it is - * possible to have NO-OP on the way. for which evaluate() will be NULL. - * but that's all fine, we'll just scheduler it's children. - */ - if (node->evaluate) { + /* Get context. */ + /* TODO: Who initialises this? "Init" operations aren't able to + * initialise it!!! + */ + /* TODO(sergey): We don't use component contexts at this moment. 
*/ + /* ComponentDepsNode *comp = node->owner; */ + BLI_assert(node->owner != NULL); + + /* Since we're not leaving the thread for until the graph branches it is + * possible to have NO-OP on the way. for which evaluate() will be NULL. + * but that's all fine, we'll just scheduler it's children. + */ + if (node->evaluate) { /* Take note of current time. */ #ifdef USE_DEBUGGER - double start_time = PIL_check_seconds_timer(); - DepsgraphDebug::task_started(state->graph, node); + double start_time = PIL_check_seconds_timer(); + DepsgraphDebug::task_started(state->graph, node); #endif - /* Perform operation. */ - node->evaluate(state->eval_ctx); + /* Perform operation. */ + node->evaluate(state->eval_ctx); /* Note how long this took. */ #ifdef USE_DEBUGGER - double end_time = PIL_check_seconds_timer(); - DepsgraphDebug::task_completed(state->graph, - node, - end_time - start_time); + double end_time = PIL_check_seconds_timer(); + DepsgraphDebug::task_completed(state->graph, + node, + end_time - start_time); #endif - } - - /* If there's only one outgoing link we try to immediately switch to - * that node evaluation, without leaving the thread. - * - * It's only doable if the child don't have extra relations or all they - * are satisfied. - * - * TODO(sergey): Checks here can be de-duplicated with the ones from - * schedule_node(), however, how to do it nicely? - */ - if (node->outlinks.size() == 1) { - DepsRelation *rel = node->outlinks[0]; - OperationDepsNode *child = (OperationDepsNode *)rel->to; - BLI_assert(child->type == DEPSNODE_TYPE_OPERATION); - if (!child->scheduled) { - unsigned int id_layers = child->owner->owner->layers; - if (!((child->flag & DEPSOP_FLAG_NEEDS_UPDATE) != 0 && - (id_layers & state->layers) != 0)) - { - /* Node does not need an update, so can;t continue with the - * chain and need to switch to another one by leaving the - * thread. - */ - break; - } - if ((rel->flag & DEPSREL_FLAG_CYCLIC) == 0) { - BLI_assert(child->num_links_pending > 0); - atomic_sub_and_fetch_uint32(&child->num_links_pending, 1); - } - if (child->num_links_pending == 0) { - bool is_scheduled = atomic_fetch_and_or_uint8( - (uint8_t *)&child->scheduled, (uint8_t)true); - if (!is_scheduled) { - /* Node was not scheduled, switch to it! */ - node = child; - } - else { - /* Someone else scheduled the node, leaving us - * unemployed in this thread, we're leaving. - */ - break; - } - } - else { - /* There are other dependencies on the child, can't do - * anything in the current thread. - */ - break; - } - } - else { - /* Happens when having cyclic dependencies. - * - * Nothing to do here, single child was already scheduled, we - * can leave the thread now. - */ - break; - } - } - else { - /* TODO(sergey): It's possible to use one of the outgoing relations - * as a chain which we'll try to keep alive, but it's a bit more - * involved change. 
 */
- schedule_children(pool, state->graph, node, state->layers, thread_id);
- break;
- }
}
+
+ schedule_children(pool, state->graph, node, state->layers, thread_id);
}
typedef struct CalculatePengindData {
@@ -378,12 +311,19 @@ void deg_evaluate_on_refresh(EvaluationContext *eval_ctx,
state.graph = graph;
state.layers = layers;
- TaskScheduler *task_scheduler = BLI_task_scheduler_get();
- TaskPool *task_pool = BLI_task_pool_create(task_scheduler, &state);
+ TaskScheduler *task_scheduler;
+ bool need_free_scheduler;
if (G.debug & G_DEBUG_DEPSGRAPH_NO_THREADS) {
- BLI_pool_set_num_threads(task_pool, 1);
+ task_scheduler = BLI_task_scheduler_create(1);
+ need_free_scheduler = true;
}
+ else {
+ task_scheduler = BLI_task_scheduler_get();
+ need_free_scheduler = false;
+ }
+
+ TaskPool *task_pool = BLI_task_pool_create_suspended(task_scheduler, &state);
calculate_pending_parents(graph, layers);
@@ -410,6 +350,10 @@ void deg_evaluate_on_refresh(EvaluationContext *eval_ctx,
/* Clear any uncleared tags - just in case. */
deg_graph_clear_tags(graph);
+
+ if (need_free_scheduler) {
+ BLI_task_scheduler_free(task_scheduler);
+ }
}
}  // namespace DEG
diff --git a/source/blender/depsgraph/intern/eval/deg_eval_flush.cc b/source/blender/depsgraph/intern/eval/deg_eval_flush.cc
index 7c6c25bef0d..e10f86f6e95 100644
--- a/source/blender/depsgraph/intern/eval/deg_eval_flush.cc
+++ b/source/blender/depsgraph/intern/eval/deg_eval_flush.cc
@@ -180,6 +180,11 @@ void deg_graph_flush_updates(Main *bmain, Depsgraph *graph)
comp_node->done = 1;
/* Flush to nodes along links... */
+ /* TODO(sergey): This is mainly giving a speedup due to fewer queue pushes, which
+ * reduces the number of memory allocations.
+ *
+ * We should try to solve the allocation issue instead of doing crazy things here.
+ */ if (node->outlinks.size() == 1) { OperationDepsNode *to_node = (OperationDepsNode *)node->outlinks[0]->to; if (to_node->scheduled == false) { diff --git a/source/blender/editors/armature/armature_select.c b/source/blender/editors/armature/armature_select.c index a9acb41fcf6..a6f2fa40f46 100644 --- a/source/blender/editors/armature/armature_select.c +++ b/source/blender/editors/armature/armature_select.c @@ -303,11 +303,11 @@ static EditBone *get_nearest_editbonepoint( ebone_next_act = NULL; } - BLI_rcti_init_pt_size(&rect, mval, 5); + BLI_rcti_init_pt_radius(&rect, mval, 5); hits = view3d_opengl_select(vc, buffer, MAXPICKBUF, &rect, true); if (hits == 0) { - BLI_rcti_init_pt_size(&rect, mval, 12); + BLI_rcti_init_pt_radius(&rect, mval, 12); hits = view3d_opengl_select(vc, buffer, MAXPICKBUF, &rect, true); } /* See if there are any selected bones in this group */ diff --git a/source/blender/editors/armature/editarmature_sketch.c b/source/blender/editors/armature/editarmature_sketch.c index c2813d8c5a7..c3b918ef64f 100644 --- a/source/blender/editors/armature/editarmature_sketch.c +++ b/source/blender/editors/armature/editarmature_sketch.c @@ -1929,7 +1929,7 @@ static bool sk_selectStroke(bContext *C, SK_Sketch *sketch, const int mval[2], c view3d_set_viewcontext(C, &vc); - BLI_rcti_init_pt_size(&rect, mval, 5); + BLI_rcti_init_pt_radius(&rect, mval, 5); hits = view3d_opengl_select(&vc, buffer, MAXPICKBUF, &rect, true); diff --git a/source/blender/editors/interface/interface_layout.c b/source/blender/editors/interface/interface_layout.c index ad5f6279606..ce1153911da 100644 --- a/source/blender/editors/interface/interface_layout.c +++ b/source/blender/editors/interface/interface_layout.c @@ -2681,13 +2681,14 @@ static void ui_litem_layout_absolute(uiLayout *litem) static void ui_litem_estimate_split(uiLayout *litem) { ui_litem_estimate_row(litem); + litem->item.flag &= ~UI_ITEM_MIN; } static void ui_litem_layout_split(uiLayout *litem) { uiLayoutItemSplit *split = (uiLayoutItemSplit *)litem; uiItem *item; - float percentage; + float percentage, extra_pixel = 0.0f; const int tot = BLI_listbase_count(&litem->items); int itemh, x, y, w, colw = 0; @@ -2710,7 +2711,9 @@ static void ui_litem_layout_split(uiLayout *litem) x += colw; if (item->next) { - colw = (w - (int)(w * percentage)) / (tot - 1); + const float width = extra_pixel + (w - (int)(w * percentage)) / ((float)tot - 1); + extra_pixel = width - (int)width; + colw = (int)width; colw = MAX2(colw, 0); x += litem->space; diff --git a/source/blender/editors/metaball/mball_edit.c b/source/blender/editors/metaball/mball_edit.c index 9c42d3eb08f..fff53d6885e 100644 --- a/source/blender/editors/metaball/mball_edit.c +++ b/source/blender/editors/metaball/mball_edit.c @@ -592,7 +592,7 @@ bool ED_mball_select_pick(bContext *C, const int mval[2], bool extend, bool dese view3d_set_viewcontext(C, &vc); - BLI_rcti_init_pt_size(&rect, mval, 12); + BLI_rcti_init_pt_radius(&rect, mval, 12); hits = view3d_opengl_select(&vc, buffer, MAXPICKBUF, &rect, true); diff --git a/source/blender/editors/render/render_opengl.c b/source/blender/editors/render/render_opengl.c index 21a7ec0d06c..1d870b89026 100644 --- a/source/blender/editors/render/render_opengl.c +++ b/source/blender/editors/render/render_opengl.c @@ -718,7 +718,6 @@ static bool screen_opengl_render_init(bContext *C, wmOperator *op) oglrender->task_scheduler = task_scheduler; oglrender->task_pool = BLI_task_pool_create_background(task_scheduler, oglrender); - 
BLI_pool_set_num_threads(oglrender->task_pool, 1); } else { oglrender->task_scheduler = NULL; @@ -750,6 +749,23 @@ static void screen_opengl_render_end(bContext *C, OGLRender *oglrender) int i; if (oglrender->is_animation) { + /* Trickery part for movie output: + * + * We MUST write frames in an exact order, so we only let background + * thread to work on that, and main thread is simply waits for that + * thread to do all the dirty work. + * + * After this loop is done work_and_wait() will have nothing to do, + * so we don't run into wrong order of frames written to the stream. + */ + if (BKE_imtype_is_movie(scene->r.im_format.imtype)) { + BLI_mutex_lock(&oglrender->task_mutex); + while (oglrender->num_scheduled_frames > 0) { + BLI_condition_wait(&oglrender->task_condition, + &oglrender->task_mutex); + } + BLI_mutex_unlock(&oglrender->task_mutex); + } BLI_task_pool_work_and_wait(oglrender->task_pool); BLI_task_pool_free(oglrender->task_pool); /* Depending on various things we might or might not use global scheduler. */ diff --git a/source/blender/editors/space_node/node_edit.c b/source/blender/editors/space_node/node_edit.c index 54ddd9aed46..fdfe316f5ed 100644 --- a/source/blender/editors/space_node/node_edit.c +++ b/source/blender/editors/space_node/node_edit.c @@ -1070,7 +1070,7 @@ int node_find_indicated_socket(SpaceNode *snode, bNode **nodep, bNodeSocket **so /* check if we click in a socket */ for (node = snode->edittree->nodes.first; node; node = node->next) { - BLI_rctf_init_pt_size(&rect, cursor, NODE_SOCKSIZE + 4); + BLI_rctf_init_pt_radius(&rect, cursor, NODE_SOCKSIZE + 4); if (!(node->flag & NODE_HIDDEN)) { /* extra padding inside and out - allow dragging on the text areas too */ diff --git a/source/blender/editors/space_view3d/space_view3d.c b/source/blender/editors/space_view3d/space_view3d.c index 7920631b100..37d300ca0a7 100644 --- a/source/blender/editors/space_view3d/space_view3d.c +++ b/source/blender/editors/space_view3d/space_view3d.c @@ -882,6 +882,7 @@ static void view3d_main_region_listener(bScreen *sc, ScrArea *sa, ARegion *ar, w case ND_CONSTRAINT: case ND_KEYS: case ND_PARTICLE: + case ND_POINTCACHE: case ND_LOD: ED_region_tag_redraw(ar); break; diff --git a/source/blender/editors/space_view3d/view3d_edit.c b/source/blender/editors/space_view3d/view3d_edit.c index 131095c8c47..b65e8e01768 100644 --- a/source/blender/editors/space_view3d/view3d_edit.c +++ b/source/blender/editors/space_view3d/view3d_edit.c @@ -4872,7 +4872,7 @@ static float view_autodist_depth_margin(ARegion *ar, const int mval[2], int marg rect.ymax = mval[1] + 1; } else { - BLI_rcti_init_pt_size(&rect, mval, margin); + BLI_rcti_init_pt_radius(&rect, mval, margin); } view3d_update_depths_rect(ar, &depth_temp, &rect); diff --git a/source/blender/editors/space_view3d/view3d_intern.h b/source/blender/editors/space_view3d/view3d_intern.h index 911bea78461..eb961c786af 100644 --- a/source/blender/editors/space_view3d/view3d_intern.h +++ b/source/blender/editors/space_view3d/view3d_intern.h @@ -258,7 +258,7 @@ void ED_view3d_smooth_view_force_finish( struct bContext *C, struct View3D *v3d, struct ARegion *ar); -void view3d_winmatrix_set(ARegion *ar, const View3D *v3d, const rctf *rect); +void view3d_winmatrix_set(ARegion *ar, const View3D *v3d, const rcti *rect); void view3d_viewmatrix_set(Scene *scene, const View3D *v3d, RegionView3D *rv3d); void fly_modal_keymap(struct wmKeyConfig *keyconf); diff --git a/source/blender/editors/space_view3d/view3d_select.c 
b/source/blender/editors/space_view3d/view3d_select.c index 921ac136dad..3f6afb2634d 100644 --- a/source/blender/editors/space_view3d/view3d_select.c +++ b/source/blender/editors/space_view3d/view3d_select.c @@ -1059,9 +1059,11 @@ static void deselectall_except(SceneLayer *sl, Base *b) /* deselect all except } } -static Base *object_mouse_select_menu(bContext *C, ViewContext *vc, unsigned int *buffer, int hits, const int mval[2], short toggle) +static Base *object_mouse_select_menu( + bContext *C, ViewContext *vc, unsigned int *buffer, int hits, + const int mval[2], bool toggle) { - short baseCount = 0; + int baseCount = 0; bool ok; LinkNode *linklist = NULL; @@ -1151,19 +1153,19 @@ static bool selectbuffer_has_bones(const unsigned int *buffer, const unsigned in } /* utility function for mixed_bones_object_selectbuffer */ -static short selectbuffer_ret_hits_15(unsigned int *UNUSED(buffer), const short hits15) +static int selectbuffer_ret_hits_15(unsigned int *UNUSED(buffer), const int hits15) { return hits15; } -static short selectbuffer_ret_hits_9(unsigned int *buffer, const short hits15, const short hits9) +static int selectbuffer_ret_hits_9(unsigned int *buffer, const int hits15, const int hits9) { const int offs = 4 * hits15; memcpy(buffer, buffer + offs, 4 * hits9 * sizeof(unsigned int)); return hits9; } -static short selectbuffer_ret_hits_5(unsigned int *buffer, const short hits15, const short hits9, const short hits5) +static int selectbuffer_ret_hits_5(unsigned int *buffer, const int hits15, const int hits9, const int hits5) { const int offs = 4 * hits15 + 4 * hits9; memcpy(buffer, buffer + offs, 4 * hits5 * sizeof(unsigned int)); @@ -1172,14 +1174,14 @@ static short selectbuffer_ret_hits_5(unsigned int *buffer, const short hits15, c /* we want a select buffer with bones, if there are... 
*/ /* so check three selection levels and compare */ -static short mixed_bones_object_selectbuffer( +static int mixed_bones_object_selectbuffer( ViewContext *vc, unsigned int *buffer, const int mval[2], bool use_cycle, bool enumerate, bool *r_do_nearest) { rcti rect; int offs; - short hits15, hits9 = 0, hits5 = 0; + int hits15, hits9 = 0, hits5 = 0; bool has_bones15 = false, has_bones9 = false, has_bones5 = false; static int last_mval[2] = {-100, -100}; bool do_nearest = false; @@ -1207,7 +1209,7 @@ static short mixed_bones_object_selectbuffer( do_nearest = do_nearest && !enumerate; - BLI_rcti_init(&rect, mval[0] - 14, mval[0] + 14, mval[1] - 14, mval[1] + 14); + BLI_rcti_init_pt_radius(&rect, mval, 14); hits15 = view3d_opengl_select(vc, buffer, MAXPICKBUF, &rect, do_nearest); if (hits15 == 1) { return selectbuffer_ret_hits_15(buffer, hits15); @@ -1216,7 +1218,7 @@ static short mixed_bones_object_selectbuffer( has_bones15 = selectbuffer_has_bones(buffer, hits15); offs = 4 * hits15; - BLI_rcti_init(&rect, mval[0] - 9, mval[0] + 9, mval[1] - 9, mval[1] + 9); + BLI_rcti_init_pt_radius(&rect, mval, 9); hits9 = view3d_opengl_select(vc, buffer + offs, MAXPICKBUF - offs, &rect, do_nearest); if (hits9 == 1) { return selectbuffer_ret_hits_9(buffer, hits15, hits9); @@ -1225,7 +1227,7 @@ static short mixed_bones_object_selectbuffer( has_bones9 = selectbuffer_has_bones(buffer + offs, hits9); offs += 4 * hits9; - BLI_rcti_init(&rect, mval[0] - 5, mval[0] + 5, mval[1] - 5, mval[1] + 5); + BLI_rcti_init_pt_radius(&rect, mval, 5); hits5 = view3d_opengl_select(vc, buffer + offs, MAXPICKBUF - offs, &rect, do_nearest); if (hits5 == 1) { return selectbuffer_ret_hits_5(buffer, hits15, hits9, hits5); @@ -1384,7 +1386,7 @@ static bool ed_object_select_pick( bool is_obedit; float dist = ED_view3d_select_dist_px() * 1.3333f; bool retval = false; - short hits; + int hits; const float mval_fl[2] = {(float)mval[0], (float)mval[1]}; @@ -1876,7 +1878,7 @@ static int do_meta_box_select(ViewContext *vc, rcti *rect, bool select, bool ext int a; unsigned int buffer[MAXPICKBUF]; - short hits; + int hits; hits = view3d_opengl_select(vc, buffer, MAXPICKBUF, rect, false); @@ -1910,7 +1912,7 @@ static int do_armature_box_select(ViewContext *vc, rcti *rect, bool select, bool int a; unsigned int buffer[MAXPICKBUF]; - short hits; + int hits; hits = view3d_opengl_select(vc, buffer, MAXPICKBUF, rect, false); @@ -1985,7 +1987,7 @@ static int do_object_pose_box_select(bContext *C, ViewContext *vc, rcti *rect, b int bone_only; int bone_selected = 0; int totobj = MAXPICKBUF; /* XXX solve later */ - short hits; + int hits; if ((ob) && (ob->mode & OB_MODE_POSE)) bone_only = 1; @@ -2549,7 +2551,7 @@ static void lattice_circle_select(ViewContext *vc, const bool select, const int /* NOTE: pose-bone case is copied from editbone case... 
*/ -static short pchan_circle_doSelectJoint(void *userData, bPoseChannel *pchan, const float screen_co[2]) +static bool pchan_circle_doSelectJoint(void *userData, bPoseChannel *pchan, const float screen_co[2]) { CircleSelectUserData *data = userData; @@ -2627,7 +2629,7 @@ static void pose_circle_select(ViewContext *vc, const bool select, const int mva } } -static short armature_circle_doSelectJoint(void *userData, EditBone *ebone, const float screen_co[2], short head) +static bool armature_circle_doSelectJoint(void *userData, EditBone *ebone, const float screen_co[2], bool head) { CircleSelectUserData *data = userData; diff --git a/source/blender/editors/space_view3d/view3d_view.c b/source/blender/editors/space_view3d/view3d_view.c index 687a9a398d9..637479f9ee3 100644 --- a/source/blender/editors/space_view3d/view3d_view.c +++ b/source/blender/editors/space_view3d/view3d_view.c @@ -903,7 +903,7 @@ void ED_view3d_polygon_offset(const RegionView3D *rv3d, const float dist) /** * \param rect optional for picking (can be NULL). */ -void view3d_winmatrix_set(ARegion *ar, const View3D *v3d, const rctf *rect) +void view3d_winmatrix_set(ARegion *ar, const View3D *v3d, const rcti *rect) { RegionView3D *rv3d = ar->regiondata; rctf viewplane; @@ -1170,7 +1170,7 @@ short view3d_opengl_select(ViewContext *vc, unsigned int *buffer, unsigned int b SceneLayer *sl = vc->sl; View3D *v3d = vc->v3d; ARegion *ar = vc->ar; - rctf rect; + rcti rect; short hits; const bool use_obedit_skip = (scene->obedit != NULL) && (vc->obedit == NULL); const bool do_passes = do_nearest && GPU_select_query_check_active(); @@ -1180,10 +1180,10 @@ short view3d_opengl_select(ViewContext *vc, unsigned int *buffer, unsigned int b /* case not a border select */ if (input->xmin == input->xmax) { /* seems to be default value for bones only now */ - BLI_rctf_init_pt_size(&rect, (const float[2]){input->xmin, input->ymin}, 12); + BLI_rcti_init_pt_radius(&rect, (const int[2]){input->xmin, input->ymin}, 12); } else { - BLI_rctf_rcti_copy(&rect, input); + rect = *input; } view3d_winmatrix_set(ar, v3d, &rect); diff --git a/source/blender/editors/transform/transform_manipulator.c b/source/blender/editors/transform/transform_manipulator.c index 489badf594b..a78bf1551bc 100644 --- a/source/blender/editors/transform/transform_manipulator.c +++ b/source/blender/editors/transform/transform_manipulator.c @@ -1768,14 +1768,14 @@ static int manipulator_selectbuf(ScrArea *sa, ARegion *ar, const int mval[2], fl { View3D *v3d = sa->spacedata.first; RegionView3D *rv3d = ar->regiondata; - rctf rect, selrect; + rcti rect; GLuint buffer[64]; // max 4 items per select, so large enuf short hits; const bool is_picksel = true; const bool do_passes = GPU_select_query_check_active(); /* XXX check a bit later on this... 
(ton) */ - extern void view3d_winmatrix_set(ARegion *ar, View3D *v3d, rctf *rect); + extern void view3d_winmatrix_set(ARegion *ar, View3D *v3d, const rcti *rect); /* when looking through a selected camera, the manipulator can be at the * exact same position as the view, skip so we don't break selection */ @@ -1787,15 +1787,13 @@ static int manipulator_selectbuf(ScrArea *sa, ARegion *ar, const int mval[2], fl rect.ymin = mval[1] - hotspot; rect.ymax = mval[1] + hotspot; - selrect = rect; - view3d_winmatrix_set(ar, v3d, &rect); mul_m4_m4m4(rv3d->persmat, rv3d->winmat, rv3d->viewmat); if (do_passes) - GPU_select_begin(buffer, 64, &selrect, GPU_SELECT_NEAREST_FIRST_PASS, 0); + GPU_select_begin(buffer, 64, &rect, GPU_SELECT_NEAREST_FIRST_PASS, 0); else - GPU_select_begin(buffer, 64, &selrect, GPU_SELECT_ALL, 0); + GPU_select_begin(buffer, 64, &rect, GPU_SELECT_ALL, 0); /* do the drawing */ if (v3d->twtype & V3D_MANIP_ROTATE) { @@ -1810,7 +1808,7 @@ static int manipulator_selectbuf(ScrArea *sa, ARegion *ar, const int mval[2], fl hits = GPU_select_end(); if (do_passes) { - GPU_select_begin(buffer, 64, &selrect, GPU_SELECT_NEAREST_SECOND_PASS, hits); + GPU_select_begin(buffer, 64, &rect, GPU_SELECT_NEAREST_SECOND_PASS, hits); /* do the drawing */ if (v3d->twtype & V3D_MANIP_ROTATE) { @@ -1916,10 +1914,8 @@ int BIF_do_manipulator(bContext *C, const struct wmEvent *event, wmOperator *op) drawflags = manipulator_selectbuf(sa, ar, event->mval, 0.2f * (float)U.tw_hotspot); if (drawflags == 0) drawflags = val; - /* We are not doing translation but were requested to do planar constraints. - * This wouldn't work, so we give other keymaps a chance. - */ - if ((drawflags & MAN_TRANS_C) == 0 && use_planar) { + /* Planar constraint doesn't make sense for rotation, give other keymaps a chance */ + if ((drawflags & MAN_ROT_C) && use_planar) { return 0; } diff --git a/source/blender/gpu/GPU_select.h b/source/blender/gpu/GPU_select.h index d3cb914976e..93f5ce13bbd 100644 --- a/source/blender/gpu/GPU_select.h +++ b/source/blender/gpu/GPU_select.h @@ -32,7 +32,7 @@ #include "BLI_sys_types.h" -struct rctf; +struct rcti; /* flags for mode of operation */ enum { @@ -41,7 +41,7 @@ enum { GPU_SELECT_NEAREST_SECOND_PASS = 3, }; -void GPU_select_begin(unsigned int *buffer, unsigned int bufsize, const struct rctf *input, char mode, int oldhits); +void GPU_select_begin(unsigned int *buffer, unsigned int bufsize, const struct rcti *input, char mode, int oldhits); bool GPU_select_load_id(unsigned int id); unsigned int GPU_select_end(void); bool GPU_select_query_check_active(void); diff --git a/source/blender/gpu/intern/gpu_select.c b/source/blender/gpu/intern/gpu_select.c index f78191a6f6d..35944c455a5 100644 --- a/source/blender/gpu/intern/gpu_select.c +++ b/source/blender/gpu/intern/gpu_select.c @@ -37,6 +37,8 @@ #include "DNA_userdef_types.h" +#include "BLI_rect.h" + #include "BLI_utildefines.h" /* Ad hoc number of queries to allocate to skip doing many glGenQueries */ @@ -72,7 +74,7 @@ static GPUQueryState g_query_state = {0}; /** * initialize and provide buffer for results */ -void GPU_select_begin(unsigned int *buffer, unsigned int bufsize, const rctf *input, char mode, int oldhits) +void GPU_select_begin(unsigned int *buffer, unsigned int bufsize, const rcti *input, char mode, int oldhits) { g_query_state.select_is_active = true; g_query_state.query_issued = false; @@ -109,7 +111,7 @@ void GPU_select_begin(unsigned int *buffer, unsigned int bufsize, const rctf *in * get rejected before the depth test. 
Should probably cull rect against * scissor for viewport but this is a rare case I think */ glGetFloatv(GL_SCISSOR_BOX, viewport); - glViewport(viewport[0], viewport[1], (int)(input->xmax - input->xmin), (int)(input->ymax - input->ymin)); + glViewport(viewport[0], viewport[1], BLI_rcti_size_x(input), BLI_rcti_size_y(input)); /* occlusion queries operates on fragments that pass tests and since we are interested on all * objects in the view frustum independently of their order, we need to disable the depth test */ diff --git a/source/blender/windowmanager/manipulators/intern/wm_manipulatormap.c b/source/blender/windowmanager/manipulators/intern/wm_manipulatormap.c index 500092f5f2d..2dd02450dcf 100644 --- a/source/blender/windowmanager/manipulators/intern/wm_manipulatormap.c +++ b/source/blender/windowmanager/manipulators/intern/wm_manipulatormap.c @@ -279,35 +279,32 @@ static int manipulator_find_intersected_3D_intern( ARegion *ar = CTX_wm_region(C); View3D *v3d = sa->spacedata.first; RegionView3D *rv3d = ar->regiondata; - rctf rect, selrect; + rcti rect; GLuint buffer[64]; // max 4 items per select, so large enuf short hits; const bool do_passes = GPU_select_query_check_active(); - extern void view3d_winmatrix_set(ARegion *ar, View3D *v3d, rctf *rect); - + extern void view3d_winmatrix_set(ARegion *ar, View3D *v3d, const rcti *rect); rect.xmin = co[0] - hotspot; rect.xmax = co[0] + hotspot; rect.ymin = co[1] - hotspot; rect.ymax = co[1] + hotspot; - selrect = rect; - view3d_winmatrix_set(ar, v3d, &rect); mul_m4_m4m4(rv3d->persmat, rv3d->winmat, rv3d->viewmat); if (do_passes) - GPU_select_begin(buffer, ARRAY_SIZE(buffer), &selrect, GPU_SELECT_NEAREST_FIRST_PASS, 0); + GPU_select_begin(buffer, ARRAY_SIZE(buffer), &rect, GPU_SELECT_NEAREST_FIRST_PASS, 0); else - GPU_select_begin(buffer, ARRAY_SIZE(buffer), &selrect, GPU_SELECT_ALL, 0); + GPU_select_begin(buffer, ARRAY_SIZE(buffer), &rect, GPU_SELECT_ALL, 0); /* do the drawing */ manipulator_find_active_3D_loop(C, visible_manipulators); hits = GPU_select_end(); if (do_passes) { - GPU_select_begin(buffer, ARRAY_SIZE(buffer), &selrect, GPU_SELECT_NEAREST_SECOND_PASS, hits); + GPU_select_begin(buffer, ARRAY_SIZE(buffer), &rect, GPU_SELECT_NEAREST_SECOND_PASS, hits); manipulator_find_active_3D_loop(C, visible_manipulators); GPU_select_end(); } diff --git a/source/creator/creator_args.c b/source/creator/creator_args.c index 5ee69aebd5d..f05dc59875f 100644 --- a/source/creator/creator_args.c +++ b/source/creator/creator_args.c @@ -943,7 +943,7 @@ static int arg_handle_native_pixels_set(int UNUSED(argc), const char **UNUSED(ar } static const char arg_handle_with_borders_doc[] = -"\n\tForce opening without borders" +"\n\tForce opening with borders" ; static int arg_handle_with_borders(int UNUSED(argc), const char **UNUSED(argv), void *UNUSED(data)) { |
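The task.c changes in this diff replace the per-pool thread limit with thread-local task queues and add suspended pools. The sketch below is not part of the patch; it only illustrates how a caller might use the new entry point. ObjectTaskData and draw_object_task are hypothetical names, and the BLI_task_pool_push()/TASK_PRIORITY_HIGH signatures are assumed to match the BLI_task API visible elsewhere in this diff.

/* Hypothetical caller of the suspended-pool API introduced in this patch.
 * Tasks pushed before work_and_wait() stay on the pool's suspended queue,
 * so the scheduler lock and thread wake-ups are paid once instead of per task. */
#include "BLI_task.h"

typedef struct ObjectTaskData {
	int object_index;
} ObjectTaskData;

static void draw_object_task(TaskPool *pool, void *taskdata, int threadid)
{
	ObjectTaskData *data = (ObjectTaskData *)taskdata;
	(void)pool; (void)threadid;
	/* ... per-object work using data->object_index ... */
}

static void process_objects(ObjectTaskData *items, int count)
{
	TaskScheduler *scheduler = BLI_task_scheduler_get();
	TaskPool *pool = BLI_task_pool_create_suspended(scheduler, NULL);
	for (int i = 0; i < count; i++) {
		/* Queued on the pool's suspended list; nothing reaches the scheduler yet. */
		BLI_task_pool_push(pool, draw_object_task, &items[i], false, TASK_PRIORITY_HIGH);
	}
	/* Moves the whole suspended queue to the scheduler in one go and then
	 * participates in the work, handling its own local queue as well. */
	BLI_task_pool_work_and_wait(pool);
	BLI_task_pool_free(pool);
}

This mirrors the depsgraph change above, where deg_evaluate_on_refresh() now creates the pool with BLI_task_pool_create_suspended() so the initial flood of small operations is handed to the scheduler in a single batch.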
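Several hunks in this diff convert manual xmin/xmax/ymin/ymax picking-rectangle setup to BLI_rcti_init_pt_radius(). A small illustrative sketch of that equivalence follows, assuming the BLI_rect API as used at the call sites above; init_pick_rect is a hypothetical helper, not code from the patch.

/* Illustrative only: both calls below describe the same 2*r x 2*r picking
 * region centred on mval, matching the call-site conversions in
 * view3d_select.c and friends. */
#include "DNA_vec_types.h"
#include "BLI_rect.h"

static void init_pick_rect(rcti *rect, const int mval[2])
{
	const int r = 14;
	/* Old style, as removed by this patch: */
	BLI_rcti_init(rect, mval[0] - r, mval[0] + r, mval[1] - r, mval[1] + r);
	/* New style, one call expressing centre plus radius: */
	BLI_rcti_init_pt_radius(rect, mval, r);
}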