From fb3f32760d68134aadb7978922360857f0ecccb7 Mon Sep 17 00:00:00 2001
From: Thomas Dinges <blender@dingto.org>
Date: Tue, 26 Aug 2014 17:02:03 +0200
Subject: Cycles: Add an experimental CUDA kernel.

Now we build 2 .cubins per architecture (e.g. kernel_sm_21.cubin, kernel_experimental_sm_21.cubin).
The experimental kernel can be used by switching to the Experimental Feature Set: http://wiki.blender.org/index.php/Doc:2.6/Manual/Render/Cycles/Experimental_Features

This enables Subsurface Scattering and Correlated Multi Jitter Sampling on GPU, while keeping the stability and performance of the regular kernel.

Differential Revision: https://developer.blender.org/D762
Patch by Sergey and myself.

Developer / Builder Note:
CUDA Toolkit 6.5 is highly recommended for this, also note that building the experimental kernel requires a lot of system memory (~7-8GB).
---
 intern/cycles/blender/addon/ui.py    |  2 +-
 intern/cycles/device/device_cuda.cpp | 18 +++++++---
 intern/cycles/kernel/CMakeLists.txt  | 67 +++++++++++++++++++++---------------
 intern/cycles/kernel/SConscript      | 49 ++++++++++++++------------
 intern/cycles/kernel/kernel_types.h  |  7 ++--
 5 files changed, 87 insertions(+), 56 deletions(-)

(limited to 'intern')

diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index d2d2a2038d7..fa827c3b1dc 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -154,7 +154,7 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
             sub.prop(cscene, "subsurface_samples", text="Subsurface")
             sub.prop(cscene, "volume_samples", text="Volume")
 
-        if use_cpu(context):
+        if use_cpu(context) or cscene.feature_set == 'EXPERIMENTAL':
             layout.row().prop(cscene, "sampling_pattern", text="Pattern")
 
         for rl in scene.render.layers:
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index f2e470c21d5..1ed26717f4b 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -197,14 +197,18 @@ public:
 		return true;
 	}
 
-	string compile_kernel()
+	string compile_kernel(bool experimental)
 	{
 		/* compute cubin name */
 		int major, minor;
 		cuDeviceComputeCapability(&major, &minor, cuDevId);
 
 		/* attempt to use kernel provided with blender */
-		string cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin", major, minor));
+		string cubin;
+		if(experimental)
+			cubin = path_get(string_printf("lib/kernel_experimental_sm_%d%d.cubin", major, minor));
+		else
+			cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin", major, minor));
 		if(path_exists(cubin))
 			return cubin;
 
@@ -212,7 +216,10 @@ public:
 		string kernel_path = path_get("kernel");
 		string md5 = path_files_md5_hash(kernel_path);
 
-		cubin = string_printf("cycles_kernel_sm%d%d_%s.cubin", major, minor, md5.c_str());
+		if(experimental)
+			cubin = string_printf("cycles_kernel_experimental_sm%d%d_%s.cubin", major, minor, md5.c_str());
+		else
+			cubin = string_printf("cycles_kernel_sm%d%d_%s.cubin", major, minor, md5.c_str());
 		cubin = path_user_get(path_join("cache", cubin));
 
 		/* if exists already, use it */
@@ -263,6 +270,9 @@ public:
 		string command = string_printf("\"%s\" -arch=sm_%d%d -m%d --cubin \"%s\" "
 			"-o \"%s\" --ptxas-options=\"-v\" -I\"%s\" -DNVCC -D__KERNEL_CUDA_VERSION__=%d",
 			nvcc, major, minor, machine, kernel.c_str(), cubin.c_str(), include.c_str(), cuda_version);
+		
+		if(experimental)
+			command += " -D__KERNEL_CUDA_EXPERIMENTAL__";
 
 		printf("%s\n", command.c_str());
 
@@ -293,7 +303,7 @@ public:
 			return false;
 
 		/* get kernel */
-		string cubin = compile_kernel();
+		string cubin = compile_kernel(experimental);
 
 		if(cubin == "")
 			return false;
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 34ef9e5a387..8857f86890c 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -160,37 +160,50 @@ if(WITH_CYCLES_CUDA_BINARIES)
 	set(cuda_sources kernel.cu ${SRC_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS})
 	set(cuda_cubins)
 
-	foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
-		set(cuda_cubin kernel_${arch}.cubin)
+	macro(CYCLES_CUDA_KERNEL_ADD arch experimental)
+		if(${experimental})
+			set(cuda_extra_flags "-D__KERNEL_CUDA_EXPERIMENTAL__")
+			set(cuda_cubin kernel_experimental_${arch}.cubin)
+		else()
+			set(cuda_extra_flags "")
+			set(cuda_cubin kernel_${arch}.cubin)
+		endif()
 
 		set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${CUDA_VERSION}")
 		set(cuda_math_flags "--use_fast_math")
 
-		if(CUDA_VERSION LESS 60 AND ${arch} MATCHES "sm_50")
-			message(WARNING "Can't build kernel for CUDA sm_50 architecture, skipping")
-		else()
-			add_custom_command(
-				OUTPUT ${cuda_cubin}
-				COMMAND ${CUDA_NVCC_EXECUTABLE}
-				        -arch=${arch}
-				        -m${CUDA_BITS}
-				        --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu
-				        -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin}
-				        --ptxas-options="-v"
-				        ${cuda_arch_flags}
-				        ${cuda_version_flags}
-				        ${cuda_math_flags}
-				        -I${CMAKE_CURRENT_SOURCE_DIR}/../util
-				        -I${CMAKE_CURRENT_SOURCE_DIR}/svm
-				        -DCCL_NAMESPACE_BEGIN=
-				        -DCCL_NAMESPACE_END=
-				        -DNVCC
-
-				DEPENDS ${cuda_sources})
-
-			delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)
-			list(APPEND cuda_cubins ${cuda_cubin})
-		endif()
+		add_custom_command(
+			OUTPUT ${cuda_cubin}
+			COMMAND ${CUDA_NVCC_EXECUTABLE}
+					-arch=${arch}
+					-m${CUDA_BITS}
+					--cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu
+					-o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin}
+					--ptxas-options="-v"
+					${cuda_arch_flags}
+					${cuda_version_flags}
+					${cuda_math_flags}
+					${cuda_extra_flags}
+					-I${CMAKE_CURRENT_SOURCE_DIR}/../util
+					-I${CMAKE_CURRENT_SOURCE_DIR}/svm
+					-DCCL_NAMESPACE_BEGIN=
+					-DCCL_NAMESPACE_END=
+					-DNVCC
+
+			DEPENDS ${cuda_sources})
+
+		delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)
+		list(APPEND cuda_cubins ${cuda_cubin})
+
+		unset(cuda_extra_flags)
+	endmacro()
+
+	foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
+		# Compile regular kernel
+		CYCLES_CUDA_KERNEL_ADD(${arch} FALSE)
+
+		# Compile experimental kernel
+		CYCLES_CUDA_KERNEL_ADD(${arch} TRUE)
 	endforeach()
 
 	add_custom_target(cycles_kernel_cuda ALL DEPENDS ${cuda_cubins})
diff --git a/intern/cycles/kernel/SConscript b/intern/cycles/kernel/SConscript
index 39f0dd44774..5a9e57c5342 100644
--- a/intern/cycles/kernel/SConscript
+++ b/intern/cycles/kernel/SConscript
@@ -83,30 +83,35 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']:
     dependencies = ['kernel.cu'] + kernel.Glob('*.h') + kernel.Glob('../util/*.h') + kernel.Glob('svm/*.h') + kernel.Glob('geom/*.h') + kernel.Glob('closure/*.h')
     last_cubin_file = None
 
+    configs = (("kernel_%s.cubin", ''),
+               ("kernel_experimental_%s.cubin", ' -D__KERNEL_CUDA_EXPERIMENTAL__'))
+
     # add command for each cuda architecture
     for arch in cuda_archs:
-        if cuda_version < 60 and arch == "sm_50":
-            print("Can't build kernel for CUDA sm_50 architecture, skipping")
-            continue
-
-        cubin_file = os.path.join(build_dir, "kernel_%s.cubin" % arch)
-
-        if env['BF_CYCLES_CUDA_ENV']:
-            MS_SDK = "C:\\Program Files\\Microsoft SDKs\\Windows\\v7.1\\Bin\\SetEnv.cmd"
-            command = "\"%s\" & \"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (MS_SDK, nvcc, arch, nvcc_flags, kernel_file, cubin_file)
-        else:
-            command = "\"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (nvcc, arch, nvcc_flags, kernel_file, cubin_file)
-
-        kernel.Command(cubin_file, 'kernel.cu', command)
-        kernel.Depends(cubin_file, dependencies)
-
-        kernel_binaries.append(cubin_file)
-        
-        if not env['WITH_BF_CYCLES_CUDA_THREADED_COMPILE']:
-            # trick to compile one kernel at a time to reduce memory usage
-            if last_cubin_file:
-                kernel.Depends(cubin_file, last_cubin_file)
-            last_cubin_file = cubin_file
+        for config in configs:
+            # TODO(sergey): Use dict instead ocouple in order to increase readability?
+            name = config[0]
+            extra_flags = config[1]
+
+            cubin_file = os.path.join(build_dir, name % arch)
+            current_flags = nvcc_flags + extra_flags
+
+            if env['BF_CYCLES_CUDA_ENV']:
+                MS_SDK = "C:\\Program Files\\Microsoft SDKs\\Windows\\v7.1\\Bin\\SetEnv.cmd"
+                command = "\"%s\" & \"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (MS_SDK, nvcc, arch, current_flags, kernel_file, cubin_file)
+            else:
+                command = "\"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (nvcc, arch, current_flags, kernel_file, cubin_file)
+
+            kernel.Command(cubin_file, 'kernel.cu', command)
+            kernel.Depends(cubin_file, dependencies)
+
+            kernel_binaries.append(cubin_file)
+
+            if not env['WITH_BF_CYCLES_CUDA_THREADED_COMPILE']:
+                # trick to compile one kernel at a time to reduce memory usage
+                if last_cubin_file:
+                    kernel.Depends(cubin_file, last_cubin_file)
+                last_cubin_file = cubin_file
 
 Return('kernel_binaries')
 
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 1ed4281ea75..292283cbbfd 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -79,8 +79,11 @@ CCL_NAMESPACE_BEGIN
 #define __VOLUME_SCATTER__
 
 /* Experimental on GPU */
-//#define __VOLUME_DECOUPLED__
-//#define __SUBSURFACE__
+#ifdef __KERNEL_CUDA_EXPERIMENTAL__
+#define __SUBSURFACE__
+#define __CMJ__
+#endif
+
 #endif
 
 #ifdef __KERNEL_OPENCL__
-- 
cgit v1.2.3