From fb3f32760d68134aadb7978922360857f0ecccb7 Mon Sep 17 00:00:00 2001 From: Thomas Dinges Date: Tue, 26 Aug 2014 17:02:03 +0200 Subject: Cycles: Add an experimental CUDA kernel. Now we build 2 .cubins per architecture (e.g. kernel_sm_21.cubin, kernel_experimental_sm_21.cubin). The experimental kernel can be used by switching to the Experimental Feature Set: http://wiki.blender.org/index.php/Doc:2.6/Manual/Render/Cycles/Experimental_Features This enables Subsurface Scattering and Correlated Multi Jitter Sampling on GPU, while keeping the stability and performance of the regular kernel. Differential Revision: https://developer.blender.org/D762 Patch by Sergey and myself. Developer / Builder Note: CUDA Toolkit 6.5 is highly recommended for this, also note that building the experimental kernel requires a lot of system memory (~7-8GB). --- intern/cycles/blender/addon/ui.py | 2 +- intern/cycles/device/device_cuda.cpp | 18 +++++++--- intern/cycles/kernel/CMakeLists.txt | 67 +++++++++++++++++++++--------------- intern/cycles/kernel/SConscript | 49 ++++++++++++++------------ intern/cycles/kernel/kernel_types.h | 7 ++-- 5 files changed, 87 insertions(+), 56 deletions(-) (limited to 'intern') diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index d2d2a2038d7..fa827c3b1dc 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -154,7 +154,7 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel): sub.prop(cscene, "subsurface_samples", text="Subsurface") sub.prop(cscene, "volume_samples", text="Volume") - if use_cpu(context): + if use_cpu(context) or cscene.feature_set == 'EXPERIMENTAL': layout.row().prop(cscene, "sampling_pattern", text="Pattern") for rl in scene.render.layers: diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index f2e470c21d5..1ed26717f4b 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -197,14 
+197,18 @@ public: return true; } - string compile_kernel() + string compile_kernel(bool experimental) { /* compute cubin name */ int major, minor; cuDeviceComputeCapability(&major, &minor, cuDevId); /* attempt to use kernel provided with blender */ - string cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin", major, minor)); + string cubin; + if(experimental) + cubin = path_get(string_printf("lib/kernel_experimental_sm_%d%d.cubin", major, minor)); + else + cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin", major, minor)); if(path_exists(cubin)) return cubin; @@ -212,7 +216,10 @@ public: string kernel_path = path_get("kernel"); string md5 = path_files_md5_hash(kernel_path); - cubin = string_printf("cycles_kernel_sm%d%d_%s.cubin", major, minor, md5.c_str()); + if(experimental) + cubin = string_printf("cycles_kernel_experimental_sm%d%d_%s.cubin", major, minor, md5.c_str()); + else + cubin = string_printf("cycles_kernel_sm%d%d_%s.cubin", major, minor, md5.c_str()); cubin = path_user_get(path_join("cache", cubin)); /* if exists already, use it */ @@ -263,6 +270,9 @@ public: string command = string_printf("\"%s\" -arch=sm_%d%d -m%d --cubin \"%s\" " "-o \"%s\" --ptxas-options=\"-v\" -I\"%s\" -DNVCC -D__KERNEL_CUDA_VERSION__=%d", nvcc, major, minor, machine, kernel.c_str(), cubin.c_str(), include.c_str(), cuda_version); + + if(experimental) + command += " -D__KERNEL_CUDA_EXPERIMENTAL__"; printf("%s\n", command.c_str()); @@ -293,7 +303,7 @@ public: return false; /* get kernel */ - string cubin = compile_kernel(); + string cubin = compile_kernel(experimental); if(cubin == "") return false; diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 34ef9e5a387..8857f86890c 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -160,37 +160,50 @@ if(WITH_CYCLES_CUDA_BINARIES) set(cuda_sources kernel.cu ${SRC_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} ${SRC_CLOSURE_HEADERS} 
${SRC_UTIL_HEADERS}) set(cuda_cubins) - foreach(arch ${CYCLES_CUDA_BINARIES_ARCH}) - set(cuda_cubin kernel_${arch}.cubin) + macro(CYCLES_CUDA_KERNEL_ADD arch experimental) + if(${experimental}) + set(cuda_extra_flags "-D__KERNEL_CUDA_EXPERIMENTAL__") + set(cuda_cubin kernel_experimental_${arch}.cubin) + else() + set(cuda_extra_flags "") + set(cuda_cubin kernel_${arch}.cubin) + endif() set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${CUDA_VERSION}") set(cuda_math_flags "--use_fast_math") - if(CUDA_VERSION LESS 60 AND ${arch} MATCHES "sm_50") - message(WARNING "Can't build kernel for CUDA sm_50 architecture, skipping") - else() - add_custom_command( - OUTPUT ${cuda_cubin} - COMMAND ${CUDA_NVCC_EXECUTABLE} - -arch=${arch} - -m${CUDA_BITS} - --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu - -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} - --ptxas-options="-v" - ${cuda_arch_flags} - ${cuda_version_flags} - ${cuda_math_flags} - -I${CMAKE_CURRENT_SOURCE_DIR}/../util - -I${CMAKE_CURRENT_SOURCE_DIR}/svm - -DCCL_NAMESPACE_BEGIN= - -DCCL_NAMESPACE_END= - -DNVCC - - DEPENDS ${cuda_sources}) - - delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib) - list(APPEND cuda_cubins ${cuda_cubin}) - endif() + add_custom_command( + OUTPUT ${cuda_cubin} + COMMAND ${CUDA_NVCC_EXECUTABLE} + -arch=${arch} + -m${CUDA_BITS} + --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu + -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} + --ptxas-options="-v" + ${cuda_arch_flags} + ${cuda_version_flags} + ${cuda_math_flags} + ${cuda_extra_flags} + -I${CMAKE_CURRENT_SOURCE_DIR}/../util + -I${CMAKE_CURRENT_SOURCE_DIR}/svm + -DCCL_NAMESPACE_BEGIN= + -DCCL_NAMESPACE_END= + -DNVCC + + DEPENDS ${cuda_sources}) + + delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib) + list(APPEND cuda_cubins ${cuda_cubin}) + + unset(cuda_extra_flags) + endmacro() + + foreach(arch ${CYCLES_CUDA_BINARIES_ARCH}) + # Compile regular kernel + 
CYCLES_CUDA_KERNEL_ADD(${arch} FALSE) + + # Compile experimental kernel + CYCLES_CUDA_KERNEL_ADD(${arch} TRUE) endforeach() add_custom_target(cycles_kernel_cuda ALL DEPENDS ${cuda_cubins}) diff --git a/intern/cycles/kernel/SConscript b/intern/cycles/kernel/SConscript index 39f0dd44774..5a9e57c5342 100644 --- a/intern/cycles/kernel/SConscript +++ b/intern/cycles/kernel/SConscript @@ -83,30 +83,35 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']: dependencies = ['kernel.cu'] + kernel.Glob('*.h') + kernel.Glob('../util/*.h') + kernel.Glob('svm/*.h') + kernel.Glob('geom/*.h') + kernel.Glob('closure/*.h') last_cubin_file = None + configs = (("kernel_%s.cubin", ''), + ("kernel_experimental_%s.cubin", ' -D__KERNEL_CUDA_EXPERIMENTAL__')) + # add command for each cuda architecture for arch in cuda_archs: - if cuda_version < 60 and arch == "sm_50": - print("Can't build kernel for CUDA sm_50 architecture, skipping") - continue - - cubin_file = os.path.join(build_dir, "kernel_%s.cubin" % arch) - - if env['BF_CYCLES_CUDA_ENV']: - MS_SDK = "C:\\Program Files\\Microsoft SDKs\\Windows\\v7.1\\Bin\\SetEnv.cmd" - command = "\"%s\" & \"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (MS_SDK, nvcc, arch, nvcc_flags, kernel_file, cubin_file) - else: - command = "\"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (nvcc, arch, nvcc_flags, kernel_file, cubin_file) - - kernel.Command(cubin_file, 'kernel.cu', command) - kernel.Depends(cubin_file, dependencies) - - kernel_binaries.append(cubin_file) - - if not env['WITH_BF_CYCLES_CUDA_THREADED_COMPILE']: - # trick to compile one kernel at a time to reduce memory usage - if last_cubin_file: - kernel.Depends(cubin_file, last_cubin_file) - last_cubin_file = cubin_file + for config in configs: + # TODO(sergey): Use a dict instead of a tuple in order to increase readability? 
+ name = config[0] + extra_flags = config[1] + + cubin_file = os.path.join(build_dir, name % arch) + current_flags = nvcc_flags + extra_flags + + if env['BF_CYCLES_CUDA_ENV']: + MS_SDK = "C:\\Program Files\\Microsoft SDKs\\Windows\\v7.1\\Bin\\SetEnv.cmd" + command = "\"%s\" & \"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (MS_SDK, nvcc, arch, current_flags, kernel_file, cubin_file) + else: + command = "\"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (nvcc, arch, current_flags, kernel_file, cubin_file) + + kernel.Command(cubin_file, 'kernel.cu', command) + kernel.Depends(cubin_file, dependencies) + + kernel_binaries.append(cubin_file) + + if not env['WITH_BF_CYCLES_CUDA_THREADED_COMPILE']: + # trick to compile one kernel at a time to reduce memory usage + if last_cubin_file: + kernel.Depends(cubin_file, last_cubin_file) + last_cubin_file = cubin_file Return('kernel_binaries') diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index 1ed4281ea75..292283cbbfd 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -79,8 +79,11 @@ CCL_NAMESPACE_BEGIN #define __VOLUME_SCATTER__ /* Experimental on GPU */ -//#define __VOLUME_DECOUPLED__ -//#define __SUBSURFACE__ +#ifdef __KERNEL_CUDA_EXPERIMENTAL__ +#define __SUBSURFACE__ +#define __CMJ__ +#endif + #endif #ifdef __KERNEL_OPENCL__ -- cgit v1.2.3