author     Mai Lavelle <mai.lavelle@gmail.com>  2017-02-14 13:50:29 +0300
committer  Mai Lavelle <mai.lavelle@gmail.com>  2017-03-08 09:24:53 +0300
commit     817873cc83034c460f1be6bf410c95ff009f3ae2 (patch)
tree       d50373c256ff02d5f12b067be50c7401c326332b
parent     0892352bfe6d5a9aa6ec4c088e67f8bbbbfae610 (diff)
Cycles: CUDA implementation of split kernel
-rw-r--r--  intern/cycles/blender/addon/properties.py          |   1
-rw-r--r--  intern/cycles/blender/addon/ui.py                  |   1
-rw-r--r--  intern/cycles/blender/blender_python.cpp           |   1
-rw-r--r--  intern/cycles/device/device_cuda.cpp               | 279
-rw-r--r--  intern/cycles/kernel/CMakeLists.txt                |  43
-rw-r--r--  intern/cycles/kernel/kernel_compat_cuda.h          |  47
-rw-r--r--  intern/cycles/kernel/kernel_types.h                |  14
-rw-r--r--  intern/cycles/kernel/kernels/cuda/kernel.cu        | 100
-rw-r--r--  intern/cycles/kernel/kernels/cuda/kernel_config.h  | 110
-rw-r--r--  intern/cycles/kernel/kernels/cuda/kernel_split.cu  | 118
-rw-r--r--  intern/cycles/kernel/split/kernel_split_data.h     |  11
-rw-r--r--  intern/cycles/util/util_debug.cpp                  |   5
-rw-r--r--  intern/cycles/util/util_debug.h                    |   3

13 files changed, 599 insertions(+), 134 deletions(-)
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 1f0b712c93e..ca109734314 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -668,6 +668,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
cls.debug_use_cpu_split_kernel = BoolProperty(name="Split Kernel", default=False)
cls.debug_use_cuda_adaptive_compile = BoolProperty(name="Adaptive Compile", default=False)
+ cls.debug_use_cuda_split_kernel = BoolProperty(name="Split Kernel", default=False)
cls.debug_opencl_kernel_type = EnumProperty(
name="OpenCL Kernel Type",
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 8d3fe877597..7c1e3e270fb 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -1523,6 +1523,7 @@ class CyclesRender_PT_debug(CyclesButtonsPanel, Panel):
col = layout.column()
col.label('CUDA Flags:')
col.prop(cscene, "debug_use_cuda_adaptive_compile")
+ col.prop(cscene, "debug_use_cuda_split_kernel")
col = layout.column()
col.label('OpenCL Flags:')
diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp
index ed410e15e7b..75118c43747 100644
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -70,6 +70,7 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene)
flags.cpu.split_kernel = get_boolean(cscene, "debug_use_cpu_split_kernel");
/* Synchronize CUDA flags. */
flags.cuda.adaptive_compile = get_boolean(cscene, "debug_use_cuda_adaptive_compile");
+ flags.cuda.split_kernel = get_boolean(cscene, "debug_use_cuda_split_kernel");
/* Synchronize OpenCL kernel type. */
switch(get_enum(cscene, "debug_opencl_kernel_type")) {
case 0:
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 1e5ce7875b1..74f36022b30 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -22,6 +22,7 @@
#include "device.h"
#include "device_intern.h"
+#include "device_split_kernel.h"
#include "buffers.h"
@@ -43,6 +44,8 @@
#include "util_types.h"
#include "util_time.h"
+#include "split/kernel_split_data.h"
+
CCL_NAMESPACE_BEGIN
#ifndef WITH_CUDA_DYNLOAD
@@ -79,6 +82,29 @@ int cuewCompilerVersion(void)
} /* namespace */
#endif /* WITH_CUDA_DYNLOAD */
+class CUDADevice;
+
+class CUDASplitKernel : public DeviceSplitKernel {
+ CUDADevice *device;
+public:
+ explicit CUDASplitKernel(CUDADevice *device);
+
+ virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
+ RenderTile& rtile,
+ int num_global_elements,
+ device_memory& kernel_globals,
+ device_memory& kernel_data_,
+ device_memory& split_data,
+ device_memory& ray_state,
+ device_memory& queue_index,
+ device_memory& use_queues_flag,
+ device_memory& work_pool_wgs);
+
+ virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
+ virtual int2 split_kernel_local_size();
+ virtual int2 split_kernel_global_size(DeviceTask *task);
+};
+
class CUDADevice : public Device
{
public:
@@ -259,11 +285,16 @@ public:
return DebugFlags().cuda.adaptive_compile;
}
+ bool use_split_kernel()
+ {
+ return DebugFlags().cuda.split_kernel;
+ }
+
/* Common NVCC flags which stays the same regardless of shading model,
* kernel sources md5 and only depends on compiler or compilation settings.
*/
string compile_kernel_get_common_cflags(
- const DeviceRequestedFeatures& requested_features)
+ const DeviceRequestedFeatures& requested_features, bool split=false)
{
const int cuda_version = cuewCompilerVersion();
const int machine = system_cpu_bits();
@@ -288,6 +319,11 @@ public:
#ifdef WITH_CYCLES_DEBUG
cflags += " -D__KERNEL_DEBUG__";
#endif
+
+ if(split) {
+ cflags += " -D__SPLIT__";
+ }
+
return cflags;
}
@@ -321,7 +357,7 @@ public:
return true;
}
- string compile_kernel(const DeviceRequestedFeatures& requested_features)
+ string compile_kernel(const DeviceRequestedFeatures& requested_features, bool split=false)
{
/* Compute cubin name. */
int major, minor;
@@ -330,7 +366,8 @@ public:
/* Attempt to use kernel provided with Blender. */
if(!use_adaptive_compilation()) {
- const string cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin",
+ const string cubin = path_get(string_printf(split ? "lib/kernel_split_sm_%d%d.cubin"
+ : "lib/kernel_sm_%d%d.cubin",
major, minor));
VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
if(path_exists(cubin)) {
@@ -340,7 +377,7 @@ public:
}
const string common_cflags =
- compile_kernel_get_common_cflags(requested_features);
+ compile_kernel_get_common_cflags(requested_features, split);
/* Try to use locally compiled kernel. */
const string kernel_path = path_get("kernel");
@@ -351,7 +388,8 @@ public:
*/
const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags);
- const string cubin_file = string_printf("cycles_kernel_sm%d%d_%s.cubin",
+ const string cubin_file = string_printf(split ? "cycles_kernel_split_sm%d%d_%s.cubin"
+ : "cycles_kernel_sm%d%d_%s.cubin",
major, minor,
cubin_md5.c_str());
const string cubin = path_cache_get(path_join("kernels", cubin_file));
@@ -386,7 +424,7 @@ public:
const char *nvcc = cuewCompilerPath();
const string kernel = path_join(kernel_path,
path_join("kernels",
- path_join("cuda", "kernel.cu")));
+ path_join("cuda", split ? "kernel_split.cu" : "kernel.cu")));
double starttime = time_dt();
printf("Compiling CUDA kernel ...\n");
@@ -434,7 +472,7 @@ public:
return false;
/* get kernel */
- string cubin = compile_kernel(requested_features);
+ string cubin = compile_kernel(requested_features, use_split_kernel());
if(cubin == "")
return false;
@@ -1261,25 +1299,48 @@ public:
/* Upload Bindless Mapping */
load_bindless_mapping();
- /* keep rendering tiles until done */
- while(task->acquire_tile(this, tile)) {
- int start_sample = tile.start_sample;
- int end_sample = tile.start_sample + tile.num_samples;
+ if(!use_split_kernel()) {
+ /* keep rendering tiles until done */
+ while(task->acquire_tile(this, tile)) {
+ int start_sample = tile.start_sample;
+ int end_sample = tile.start_sample + tile.num_samples;
- for(int sample = start_sample; sample < end_sample; sample++) {
- if(task->get_cancel()) {
- if(task->need_finish_queue == false)
- break;
- }
+ for(int sample = start_sample; sample < end_sample; sample++) {
+ if(task->get_cancel()) {
+ if(task->need_finish_queue == false)
+ break;
+ }
- path_trace(tile, sample, branched);
+ path_trace(tile, sample, branched);
- tile.sample = sample + 1;
+ tile.sample = sample + 1;
- task->update_progress(&tile, tile.w*tile.h);
+ task->update_progress(&tile, tile.w*tile.h);
+ }
+
+ task->release_tile(tile);
}
+ }
+ else {
+ DeviceRequestedFeatures requested_features;
+ if(!use_adaptive_compilation()) {
+ requested_features.max_closure = 64;
+ }
+
+ CUDASplitKernel split_kernel(this);
+ split_kernel.load_kernels(requested_features);
+
+ while(task->acquire_tile(this, tile)) {
+ device_memory void_buffer;
+ split_kernel.path_trace(task, tile, void_buffer, void_buffer);
+
+ task->release_tile(tile);
- task->release_tile(tile);
+ if(task->get_cancel()) {
+ if(task->need_finish_queue == false)
+ break;
+ }
+ }
}
}
else if(task->type == DeviceTask::SHADER) {
@@ -1332,8 +1393,186 @@ public:
{
task_pool.cancel();
}
+
+ friend class CUDASplitKernelFunction;
+ friend class CUDASplitKernel;
};
+/* redefine the cuda_assert macro so it can be used outside of the CUDADevice class
+ * now that the definition of that class is complete
+ */
+#undef cuda_assert
+#define cuda_assert(stmt) \
+ { \
+ CUresult result = stmt; \
+ \
+ if(result != CUDA_SUCCESS) { \
+ string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
+ if(device->error_msg == "") \
+ device->error_msg = message; \
+ fprintf(stderr, "%s\n", message.c_str()); \
+ /*cuda_abort();*/ \
+ device->cuda_error_documentation(); \
+ } \
+ } (void)0
+
+/* split kernel */
+
+class CUDASplitKernelFunction : public SplitKernelFunction{
+ CUDADevice* device;
+ CUfunction func;
+public:
+ CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func) {}
+
+ /* enqueue the kernel, returns false if there is an error */
+ bool enqueue(const KernelDimensions &dim, device_memory &/*kg*/, device_memory &/*data*/)
+ {
+ return enqueue(dim, NULL);
+ }
+
+ /* enqueue the kernel, returns false if there is an error */
+ bool enqueue(const KernelDimensions &dim, void *args[])
+ {
+ device->cuda_push_context();
+
+ if(device->have_error())
+ return false;
+
+ /* we ignore dim.local_size for now, as this is faster */
+ int threads_per_block;
+ cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
+
+ int xthreads = (int)sqrt(threads_per_block);
+ int ythreads = (int)sqrt(threads_per_block);
+
+ int xblocks = (dim.global_size[0] + xthreads - 1)/xthreads;
+ int yblocks = (dim.global_size[1] + ythreads - 1)/ythreads;
+
+ cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
+
+ cuda_assert(cuLaunchKernel(func,
+ xblocks , yblocks, 1, /* blocks */
+ xthreads, ythreads, 1, /* threads */
+ 0, 0, args, 0));
+
+ device->cuda_pop_context();
+
+ return !device->have_error();
+ }
+};
+
+CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device)
+{
+}
+
+bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
+ RenderTile& rtile,
+ int num_global_elements,
+ device_memory& /*kernel_globals*/,
+ device_memory& /*kernel_data*/,
+ device_memory& split_data,
+ device_memory& ray_state,
+ device_memory& queue_index,
+ device_memory& use_queues_flag,
+ device_memory& work_pool_wgs)
+{
+ device->cuda_push_context();
+
+ CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer);
+ CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer);
+ CUdeviceptr d_queue_index = device->cuda_device_ptr(queue_index.device_pointer);
+ CUdeviceptr d_use_queues_flag = device->cuda_device_ptr(use_queues_flag.device_pointer);
+ CUdeviceptr d_work_pool_wgs = device->cuda_device_ptr(work_pool_wgs.device_pointer);
+
+ CUdeviceptr d_rng_state = device->cuda_device_ptr(rtile.rng_state);
+ CUdeviceptr d_buffer = device->cuda_device_ptr(rtile.buffer);
+
+ int end_sample = rtile.start_sample + rtile.num_samples;
+ int queue_size = dim.global_size[0] * dim.global_size[1];
+
+ struct args_t {
+ CUdeviceptr* split_data_buffer;
+ int* num_elements;
+ CUdeviceptr* ray_state;
+ CUdeviceptr* rng_state;
+ int* start_sample;
+ int* end_sample;
+ int* sx;
+ int* sy;
+ int* sw;
+ int* sh;
+ int* offset;
+ int* stride;
+ CUdeviceptr* queue_index;
+ int* queuesize;
+ CUdeviceptr* use_queues_flag;
+ CUdeviceptr* work_pool_wgs;
+ int* num_samples;
+ CUdeviceptr* buffer;
+ };
+
+ args_t args = {
+ &d_split_data,
+ &num_global_elements,
+ &d_ray_state,
+ &d_rng_state,
+ &rtile.start_sample,
+ &end_sample,
+ &rtile.x,
+ &rtile.y,
+ &rtile.w,
+ &rtile.h,
+ &rtile.offset,
+ &rtile.stride,
+ &d_queue_index,
+ &queue_size,
+ &d_use_queues_flag,
+ &d_work_pool_wgs,
+ &rtile.num_samples,
+ &d_buffer
+ };
+
+ CUfunction data_init;
+ cuda_assert(cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init"));
+ if(device->have_error()) {
+ return false;
+ }
+
+ CUDASplitKernelFunction(device, data_init).enqueue(dim, (void**)&args);
+
+ device->cuda_pop_context();
+
+ return !device->have_error();
+}
+
+SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&)
+{
+ CUfunction func;
+
+ device->cuda_push_context();
+
+ cuda_assert(cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data()));
+ if(device->have_error()) {
+ device->cuda_error_message(string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data()));
+ return NULL;
+ }
+
+ device->cuda_pop_context();
+
+ return new CUDASplitKernelFunction(device, func);
+}
+
+int2 CUDASplitKernel::split_kernel_local_size()
+{
+ return make_int2(32, 1);
+}
+
+int2 CUDASplitKernel::split_kernel_global_size(DeviceTask */*task*/)
+{
+ /* TODO(mai): implement something here to detect ideal work size */
+ return make_int2(256, 256);
+}
+
bool device_cuda_init(void)
{
#ifdef WITH_CUDA_DYNLOAD
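[note] For illustration, a minimal sketch (not part of the commit) of how CUDASplitKernelFunction::enqueue() above picks launch dimensions: it ignores dim.local_size, queries the kernel's maximum threads per block, derives a square block from it, and sizes the grid with ceiling division. The value 1024 is an assumption for this example; the real value comes from cuFuncGetAttribute().

    /* sketch of the enqueue() launch-dimension math; 1024 is an assumed
     * CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK result for this example */
    int threads_per_block = 1024;
    int xthreads = (int)sqrt(threads_per_block);  /* 32 */
    int ythreads = (int)sqrt(threads_per_block);  /* 32 */
    /* ceiling division so the grid covers all of dim.global_size */
    int xblocks = (dim.global_size[0] + xthreads - 1)/xthreads;  /* 256 -> 8 */
    int yblocks = (dim.global_size[1] + ythreads - 1)/ythreads;  /* 256 -> 8 */

With the default 256x256 global size returned by split_kernel_global_size(), this launches an 8x8 grid of 32x32 blocks.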
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index d844ba007aa..685955170b5 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -27,6 +27,7 @@ set(SRC
kernels/opencl/kernel_next_iteration_setup.cl
kernels/opencl/kernel_sum_all_radiance.cl
kernels/cuda/kernel.cu
+ kernels/cuda/kernel_split.cu
)
set(SRC_BVH_HEADERS
@@ -89,6 +90,10 @@ set(SRC_KERNELS_CPU_HEADERS
kernels/cpu/kernel_cpu_image.h
)
+set(SRC_KERNELS_CUDA_HEADERS
+ kernels/cuda/kernel_config.h
+)
+
set(SRC_CLOSURE_HEADERS
closure/alloc.h
closure/bsdf.h
@@ -230,8 +235,9 @@ if(WITH_CYCLES_CUDA_BINARIES)
endif()
# build for each arch
- set(cuda_sources kernels/cuda/kernel.cu
+ set(cuda_sources kernels/cuda/kernel.cu kernels/cuda/kernel_split.cu
${SRC_HEADERS}
+ ${SRC_KERNELS_CUDA_HEADERS}
${SRC_BVH_HEADERS}
${SRC_SVM_HEADERS}
${SRC_GEOM_HEADERS}
@@ -240,15 +246,22 @@ if(WITH_CYCLES_CUDA_BINARIES)
)
set(cuda_cubins)
- macro(CYCLES_CUDA_KERNEL_ADD arch experimental)
- if(${experimental})
- set(cuda_extra_flags "-D__KERNEL_EXPERIMENTAL__")
- set(cuda_cubin kernel_experimental_${arch}.cubin)
+ macro(CYCLES_CUDA_KERNEL_ADD arch split experimental)
+ if(${split})
+ set(cuda_extra_flags "-D__SPLIT__")
+ set(cuda_cubin kernel_split)
else()
set(cuda_extra_flags "")
- set(cuda_cubin kernel_${arch}.cubin)
+ set(cuda_cubin kernel)
+ endif()
+
+ if(${experimental})
+ set(cuda_extra_flags ${cuda_extra_flags} -D__KERNEL_EXPERIMENTAL__)
+ set(cuda_cubin ${cuda_cubin}_experimental)
endif()
+ set(cuda_cubin ${cuda_cubin}_${arch}.cubin)
+
if(WITH_CYCLES_DEBUG)
set(cuda_debug_flags "-D__KERNEL_DEBUG__")
else()
@@ -261,13 +274,19 @@ if(WITH_CYCLES_CUDA_BINARIES)
set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}")
set(cuda_math_flags "--use_fast_math")
+ if(split)
+ set(cuda_kernel_src "/kernels/cuda/kernel_split.cu")
+ else()
+ set(cuda_kernel_src "/kernels/cuda/kernel.cu")
+ endif()
+
add_custom_command(
OUTPUT ${cuda_cubin}
COMMAND ${cuda_nvcc_command}
-arch=${arch}
${CUDA_NVCC_FLAGS}
-m${CUDA_BITS}
- --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda/kernel.cu
+ --cubin ${CMAKE_CURRENT_SOURCE_DIR}${cuda_kernel_src}
-o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin}
--ptxas-options="-v"
${cuda_arch_flags}
@@ -294,7 +313,12 @@ if(WITH_CYCLES_CUDA_BINARIES)
foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
# Compile regular kernel
- CYCLES_CUDA_KERNEL_ADD(${arch} FALSE)
+ CYCLES_CUDA_KERNEL_ADD(${arch} FALSE FALSE)
+
+ if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES)
+ # Compile split kernel
+ CYCLES_CUDA_KERNEL_ADD(${arch} TRUE FALSE)
+ endif()
endforeach()
add_custom_target(cycles_kernel_cuda ALL DEPENDS ${cuda_cubins})
@@ -352,6 +376,7 @@ add_library(cycles_kernel
${SRC}
${SRC_HEADERS}
${SRC_KERNELS_CPU_HEADERS}
+ ${SRC_KERNELS_CUDA_HEADERS}
${SRC_BVH_HEADERS}
${SRC_CLOSURE_HEADERS}
${SRC_SVM_HEADERS}
@@ -386,7 +411,9 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocke
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_sum_all_radiance.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/bvh)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/closure)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/svm)
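[note] With the split argument added above, CYCLES_CUDA_KERNEL_ADD composes the cubin name from its parts, so a build for e.g. sm_52 (an assumed entry in CYCLES_CUDA_BINARIES_ARCH) can emit both kernel_sm_52.cubin and kernel_split_sm_52.cubin, with _experimental variants in between when requested. This matches the runtime probe in compile_kernel() in device_cuda.cpp:

    /* runtime lookup of the pre-built cubin, as in compile_kernel() */
    const string cubin = path_get(string_printf(
            split ? "lib/kernel_split_sm_%d%d.cubin"
                  : "lib/kernel_sm_%d%d.cubin",
            major, minor));  /* e.g. lib/kernel_split_sm_52.cubin */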
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index e0c7b17c6a0..8fffe2a13c9 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -46,11 +46,58 @@
#define ccl_device_noinline __device__ __noinline__
#define ccl_global
#define ccl_constant
+#define ccl_local __shared__
+#define ccl_local_param
+#define ccl_private
#define ccl_may_alias
#define ccl_addr_space
#define ccl_restrict __restrict__
#define ccl_align(n) __align__(n)
+ccl_device_inline uint ccl_local_id(uint d)
+{
+ switch(d) {
+ case 0: return threadIdx.x;
+ case 1: return threadIdx.y;
+ case 2: return threadIdx.z;
+ default: return 0;
+ }
+}
+
+#define ccl_global_id(d) (ccl_group_id(d) * ccl_local_size(d) + ccl_local_id(d))
+
+ccl_device_inline uint ccl_local_size(uint d)
+{
+ switch(d) {
+ case 0: return blockDim.x;
+ case 1: return blockDim.y;
+ case 2: return blockDim.z;
+ default: return 0;
+ }
+}
+
+#define ccl_global_size(d) (ccl_num_groups(d) * ccl_local_size(d))
+
+ccl_device_inline uint ccl_group_id(uint d)
+{
+ switch(d) {
+ case 0: return blockIdx.x;
+ case 1: return blockIdx.y;
+ case 2: return blockIdx.z;
+ default: return 0;
+ }
+}
+
+ccl_device_inline uint ccl_num_groups(uint d)
+{
+ switch(d) {
+ case 0: return gridDim.x;
+ case 1: return gridDim.y;
+ case 2: return gridDim.z;
+ default: return 0;
+ }
+}
+
/* No assert supported for CUDA */
#define kernel_assert(cond)
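[note] These helpers give the split kernel sources the OpenCL-style work-item indexing they were written against, mapped onto CUDA's built-in variables. As a sketch, for dimension 0 the macros reduce to:

    /* ccl_global_id(0) = ccl_group_id(0)*ccl_local_size(0) + ccl_local_id(0) */
    uint gid   = blockIdx.x*blockDim.x + threadIdx.x;  /* OpenCL: get_global_id(0) */
    /* ccl_global_size(0) = ccl_num_groups(0)*ccl_local_size(0) */
    uint gsize = gridDim.x*blockDim.x;                 /* OpenCL: get_global_size(0) */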
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index a016e5293ca..a7faaef89ca 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -92,12 +92,14 @@ CCL_NAMESPACE_BEGIN
#ifdef __KERNEL_CUDA__
# define __KERNEL_SHADING__
# define __KERNEL_ADV_SHADING__
-# define __BRANCHED_PATH__
-# define __VOLUME__
-# define __VOLUME_SCATTER__
-# define __SUBSURFACE__
-# define __CMJ__
-# define __SHADOW_RECORD_ALL__
+# ifndef __SPLIT_KERNEL__
+# define __BRANCHED_PATH__
+# define __VOLUME__
+# define __VOLUME_SCATTER__
+# define __SUBSURFACE__
+# define __CMJ__
+# define __SHADOW_RECORD_ALL__
+# endif
#endif /* __KERNEL_CUDA__ */
#ifdef __KERNEL_OPENCL__
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
index 090ab2c50c2..52e541321e3 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -16,7 +16,10 @@
/* CUDA kernel entry points */
+#ifdef __CUDA_ARCH__
+
#include "../../kernel_compat_cuda.h"
+#include "kernel_config.h"
#include "../../kernel_math.h"
#include "../../kernel_types.h"
#include "../../kernel_globals.h"
@@ -25,104 +28,7 @@
#include "../../kernel_path_branched.h"
#include "../../kernel_bake.h"
-/* device data taken from CUDA occupancy calculator */
-
-#ifdef __CUDA_ARCH__
-
-/* 2.0 and 2.1 */
-#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210
-# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
-# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8
-# define CUDA_BLOCK_MAX_THREADS 1024
-# define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-# define CUDA_THREADS_BLOCK_WIDTH 16
-# define CUDA_KERNEL_MAX_REGISTERS 32
-# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40
-
-/* 3.0 and 3.5 */
-#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
-# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-# define CUDA_BLOCK_MAX_THREADS 1024
-# define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-# define CUDA_THREADS_BLOCK_WIDTH 16
-# define CUDA_KERNEL_MAX_REGISTERS 63
-# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 3.2 */
-#elif __CUDA_ARCH__ == 320
-# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
-# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-# define CUDA_BLOCK_MAX_THREADS 1024
-# define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-# define CUDA_THREADS_BLOCK_WIDTH 16
-# define CUDA_KERNEL_MAX_REGISTERS 63
-# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 3.7 */
-#elif __CUDA_ARCH__ == 370
-# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-# define CUDA_BLOCK_MAX_THREADS 1024
-# define CUDA_THREAD_MAX_REGISTERS 255
-
-/* tunable parameters */
-# define CUDA_THREADS_BLOCK_WIDTH 16
-# define CUDA_KERNEL_MAX_REGISTERS 63
-# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 5.0, 5.2, 5.3, 6.0, 6.1 */
-#elif __CUDA_ARCH__ >= 500
-# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
-# define CUDA_BLOCK_MAX_THREADS 1024
-# define CUDA_THREAD_MAX_REGISTERS 255
-
-/* tunable parameters */
-# define CUDA_THREADS_BLOCK_WIDTH 16
-# define CUDA_KERNEL_MAX_REGISTERS 48
-# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* unknown architecture */
-#else
-# error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
-#endif
-
-/* compute number of threads per block and minimum blocks per multiprocessor
- * given the maximum number of registers per thread */
-
-#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
- __launch_bounds__( \
- threads_block_width*threads_block_width, \
- CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \
- )
-
-/* sanity checks */
-
-#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
-# error "Maximum number of threads per block exceeded"
-#endif
-
-#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS
-# error "Maximum number of blocks per multiprocessor exceeded"
-#endif
-
-#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
-# error "Maximum number of registers per thread exceeded"
-#endif
-
-#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
-# error "Maximum number of registers per thread exceeded"
-#endif
-
/* kernels */
-
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_config.h b/intern/cycles/kernel/kernels/cuda/kernel_config.h
new file mode 100644
index 00000000000..9fa39dc9ebb
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cuda/kernel_config.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* device data taken from CUDA occupancy calculator */
+
+/* 2.0 and 2.1 */
+#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210
+# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
+# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8
+# define CUDA_BLOCK_MAX_THREADS 1024
+# define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+# define CUDA_THREADS_BLOCK_WIDTH 16
+# define CUDA_KERNEL_MAX_REGISTERS 32
+# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40
+
+/* 3.0 and 3.5 */
+#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
+# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
+# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
+# define CUDA_BLOCK_MAX_THREADS 1024
+# define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+# define CUDA_THREADS_BLOCK_WIDTH 16
+# define CUDA_KERNEL_MAX_REGISTERS 63
+# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* 3.2 */
+#elif __CUDA_ARCH__ == 320
+# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
+# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
+# define CUDA_BLOCK_MAX_THREADS 1024
+# define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+# define CUDA_THREADS_BLOCK_WIDTH 16
+# define CUDA_KERNEL_MAX_REGISTERS 63
+# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* 3.7 */
+#elif __CUDA_ARCH__ == 370
+# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
+# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
+# define CUDA_BLOCK_MAX_THREADS 1024
+# define CUDA_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+# define CUDA_THREADS_BLOCK_WIDTH 16
+# define CUDA_KERNEL_MAX_REGISTERS 63
+# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* 5.0, 5.2, 5.3, 6.0, 6.1 */
+#elif __CUDA_ARCH__ >= 500
+# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
+# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
+# define CUDA_BLOCK_MAX_THREADS 1024
+# define CUDA_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+# define CUDA_THREADS_BLOCK_WIDTH 16
+# define CUDA_KERNEL_MAX_REGISTERS 48
+# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* unknown architecture */
+#else
+# error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
+#endif
+
+/* compute number of threads per block and minimum blocks per multiprocessor
+ * given the maximum number of registers per thread */
+
+#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
+ __launch_bounds__( \
+ threads_block_width*threads_block_width, \
+ CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \
+ )
+
+/* sanity checks */
+
+#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
+# error "Maximum number of threads per block exceeded"
+#endif
+
+#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS
+# error "Maximum number of blocks per multiprocessor exceeded"
+#endif
+
+#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
+# error "Maximum number of registers per thread exceeded"
+#endif
+
+#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
+# error "Maximum number of registers per thread exceeded"
+#endif
+
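[note] As a worked example of the CUDA_LAUNCH_BOUNDS arithmetic above, the sm_50+ values (65536 registers per multiprocessor, a 16x16 block, 48 registers per thread) give, by integer division:

    /* CUDA_LAUNCH_BOUNDS(16, 48) on sm_50 and newer expands to: */
    __launch_bounds__(16*16,              /* 256 threads per block */
                      65536/(16*16*48))   /* = 5 min. blocks per multiprocessor */

The result, 5, satisfies the sanity check against CUDA_MULTIPROCESSOR_MAX_BLOCKS (32).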
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
new file mode 100644
index 00000000000..441cd96fafa
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
@@ -0,0 +1,118 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CUDA split kernel entry points */
+
+#ifdef __CUDA_ARCH__
+
+#define __SPLIT_KERNEL__
+
+#include "../../kernel_compat_cuda.h"
+#include "kernel_config.h"
+
+#include "../../split/kernel_split_common.h"
+#include "../../split/kernel_data_init.h"
+#include "../../split/kernel_scene_intersect.h"
+#include "../../split/kernel_lamp_emission.h"
+#include "../../split/kernel_queue_enqueue.h"
+#include "../../split/kernel_background_buffer_update.h"
+#include "../../split/kernel_shader_eval.h"
+#include "../../split/kernel_holdout_emission_blurring_pathtermination_ao.h"
+#include "../../split/kernel_direct_lighting.h"
+#include "../../split/kernel_shadow_blocked.h"
+#include "../../split/kernel_next_iteration_setup.h"
+#include "../../split/kernel_sum_all_radiance.h"
+
+#include "../../kernel_film.h"
+
+/* kernels */
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_path_trace_data_init(
+ ccl_global void *split_data_buffer,
+ int num_elements,
+ ccl_global char *ray_state,
+ ccl_global uint *rng_state,
+ int start_sample,
+ int end_sample,
+ int sx, int sy, int sw, int sh, int offset, int stride,
+ ccl_global int *Queue_index,
+ int queuesize,
+ ccl_global char *use_queues_flag,
+ ccl_global unsigned int *work_pool_wgs,
+ unsigned int num_samples,
+ ccl_global float *buffer)
+{
+ kernel_data_init(NULL,
+ NULL,
+ split_data_buffer,
+ num_elements,
+ ray_state,
+ rng_state,
+ start_sample,
+ end_sample,
+ sx, sy, sw, sh, offset, stride,
+ Queue_index,
+ queuesize,
+ use_queues_flag,
+ work_pool_wgs,
+ num_samples,
+ buffer);
+}
+
+#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
+ extern "C" __global__ void \
+ CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \
+ kernel_cuda_##name() \
+ { \
+ kernel_##name(NULL); \
+ }
+
+DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect)
+DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission)
+DEFINE_SPLIT_KERNEL_FUNCTION(queue_enqueue)
+DEFINE_SPLIT_KERNEL_FUNCTION(background_buffer_update)
+DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval)
+DEFINE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao)
+DEFINE_SPLIT_KERNEL_FUNCTION(direct_lighting)
+DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked)
+DEFINE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
+DEFINE_SPLIT_KERNEL_FUNCTION(sum_all_radiance)
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+ int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
+ int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
+
+ if(x < sx + sw && y < sy + sh)
+ kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+ int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
+ int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
+
+ if(x < sx + sw && y < sy + sh)
+ kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+#endif
+
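[note] For reference, DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect) above expands to the entry point below; the NULL stands in for the KernelGlobals pointer, which the CUDA path keeps in device globals instead of passing as an argument:

    extern "C" __global__ void
    CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
    kernel_cuda_scene_intersect()
    {
        kernel_scene_intersect(NULL);
    }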
diff --git a/intern/cycles/kernel/split/kernel_split_data.h b/intern/cycles/kernel/split/kernel_split_data.h
index 18f062ef682..7e88b6f5168 100644
--- a/intern/cycles/kernel/split/kernel_split_data.h
+++ b/intern/cycles/kernel/split/kernel_split_data.h
@@ -142,8 +142,15 @@ ccl_device_inline void split_data_init(ccl_global SplitData *split_data,
split_data->ray_state = ray_state;
}
-#define kernel_split_state (kg->split_data)
-#define kernel_split_params (kg->split_param_data)
+#ifndef __KERNEL_CUDA__
+# define kernel_split_state (kg->split_data)
+# define kernel_split_params (kg->split_param_data)
+#else
+__device__ SplitData __split_data;
+# define kernel_split_state (__split_data)
+__device__ SplitParams __split_param_data;
+# define kernel_split_params (__split_param_data)
+#endif /* __KERNEL_CUDA__ */
CCL_NAMESPACE_END
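[note] With this change the shared split kernel code keeps a single spelling for its state on every backend. A sketch of what one access resolves to (ray_state is a real SplitData member; ray_index is illustrative):

    /* in shared split kernel code: */
    kernel_split_state.ray_state[ray_index];
    /* CPU/OpenCL: kg->split_data.ray_state[ray_index]               */
    /* CUDA:       __split_data.ray_state[ray_index] (device global) */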
diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp
index 318248998aa..f12c5e28c80 100644
--- a/intern/cycles/util/util_debug.cpp
+++ b/intern/cycles/util/util_debug.cpp
@@ -60,7 +60,8 @@ void DebugFlags::CPU::reset()
}
DebugFlags::CUDA::CUDA()
- : adaptive_compile(false)
+ : adaptive_compile(false),
+ split_kernel(false)
{
reset();
}
@@ -69,6 +70,8 @@ void DebugFlags::CUDA::reset()
{
if(getenv("CYCLES_CUDA_ADAPTIVE_COMPILE") != NULL)
adaptive_compile = true;
+
+ split_kernel = false;
}
DebugFlags::OpenCL::OpenCL()
diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h
index 171e43ec32a..911c95de4ab 100644
--- a/intern/cycles/util/util_debug.h
+++ b/intern/cycles/util/util_debug.h
@@ -61,6 +61,9 @@ public:
/* Whether adaptive feature based runtime compile is enabled or not.
* Requires the CUDA Toolkit and only works on Linux atm. */
bool adaptive_compile;
+
+ /* Whether split kernel is used */
+ bool split_kernel;
};
/* Descriptor of OpenCL feature-set to be used. */