Cycles: prepare to make CUDA 5.0 the official version we use

* Add CUDA compiler version detection to cmake/scons/runtime.
* Remove noinline in kernel_shader.h and re-enable --use_fast_math if CUDA 5.x is used; these were workarounds for CUDA 4.2 bugs.
* Change max number of registers to 32 for sm 2.x (based on performance tests from Martijn Berger and confirmed here), and also for NVIDIA OpenCL.

Overall it seems that, with these changes and the latest CUDA 5.0 download, performance is as good as or better than the 2.67b release with the scenes and graphics cards I tested.
author: Brecht Van Lommel <brechtvanlommel@pandora.be> 2013-06-19 21:54:23 +0400
committer: Brecht Van Lommel <brechtvanlommel@pandora.be> 2013-06-19 21:54:23 +0400
commit: 16204bd64759fddc940800f39fc91461ee340424 (patch)
tree: e2f9f61775df316001e0bb328b15bf97fdf6555d /intern
parent: a7416641e6cb634e4ac5fd3ad14be57e308b52fa (diff)
8 files changed, 209 insertions, 22 deletions
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index f32c6dde639..1f96ed0ae83 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -271,11 +271,53 @@ public:
 			return "";
 		}
 
+		int cuda_version = cuCompilerVersion();
+
+		if(cuda_version == 0) {
+			cuda_error_message("CUDA nvcc compiler version could not be parsed.");
+			return "";
+		}
+
+		if(cuda_version != 50)
+			printf("CUDA version %d.%d detected, build may succeed but only CUDA 5.0 is officially supported.\n", cuda_version/10, cuda_version%10);
+
 		/* compile */
 		string kernel = path_join(kernel_path, "kernel.cu");
 		string include = kernel_path;
 		const int machine = system_cpu_bits();
-		const int maxreg = 24;
+		string arch_flags;
+
+		/* build flags depending on CUDA version and arch */
+		if(cuda_version < 50) {
+			/* CUDA 4.x */
+			if(major == 1) {
+				/* sm_1x */
+				arch_flags = "--maxrregcount=24 --opencc-options -OPT:Olimit=0";
+			}
+			else if(major == 2) {
+				/* sm_2x */
+				arch_flags = "--maxrregcount=24";
+			}
+			else {
+				/* sm_3x */
+				arch_flags = "--maxrregcount=32";
+			}
+		}
+		else {
+			/* CUDA 5.x */
+			if(major == 1) {
+				/* sm_1x */
+				arch_flags = "--maxrregcount=24 --opencc-options -OPT:Olimit=0 --use_fast_math";
+			}
+			else if(major == 2) {
+				/* sm_2x */
+				arch_flags = "--maxrregcount=32 --use_fast_math";
+			}
+			else {
+				/* sm_3x */
+				arch_flags = "--maxrregcount=32 --use_fast_math";
+			}
+		}
 
 		double starttime = time_dt();
 		printf("Compiling CUDA kernel ...\n");
@@ -283,8 +325,10 @@ public:
 		path_create_directories(cubin);
 
 		string command = string_printf("\"%s\" -arch=sm_%d%d -m%d --cubin \"%s\" "
-			"-o \"%s\" --ptxas-options=\"-v\" --maxrregcount=%d --opencc-options -OPT:Olimit=0 -I\"%s\" -DNVCC",
-			nvcc.c_str(), major, minor, machine, kernel.c_str(), cubin.c_str(), maxreg, include.c_str());
+			"-o \"%s\" --ptxas-options=\"-v\" %s -I\"%s\" -DNVCC -D__KERNEL_CUDA_VERSION__=%d",
+			nvcc.c_str(), major, minor, machine, kernel.c_str(), cubin.c_str(), arch_flags.c_str(), include.c_str(), cuda_version);
+
+		printf("%s\n", command.c_str());
 
 		if(system(command.c_str()) == -1) {
 			cuda_error_message("Failed to execute compilation command, see console for details.");
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index 0fb5c7abafb..2ee4ffaca17 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -85,7 +85,7 @@ static string opencl_kernel_build_options(const string& platform, const string *
 	string build_options = " -cl-fast-relaxed-math ";
 
 	if(platform == "NVIDIA CUDA")
-		build_options += "-D__KERNEL_OPENCL_NVIDIA__ -cl-nv-maxrregcount=24 -cl-nv-verbose ";
+		build_options += "-D__KERNEL_OPENCL_NVIDIA__ -cl-nv-maxrregcount=32 -cl-nv-verbose ";
 
 	else if(platform == "Apple")
 		build_options += "-D__KERNEL_OPENCL_APPLE__ -Wno-missing-prototypes ";
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 8b4466863e0..5e9dd15b812 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -117,32 +117,68 @@ set(SRC_UTIL_HEADERS
 # CUDA module
 
 if(WITH_CYCLES_CUDA_BINARIES)
+	# 32 bit or 64 bit
 	if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
 		set(CUDA_BITS 64)
 	else()
 		set(CUDA_BITS 32)
 	endif()
 
+	# CUDA version
+	execute_process (COMMAND ${CUDA_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT)
+	string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\1" CUDA_VERSION_MAJOR ${NVCC_OUT})
+	string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\2" CUDA_VERSION_MINOR ${NVCC_OUT})
+	set(CUDA_VERSION "${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}")
+
+	# build for each arch
 	set(cuda_sources kernel.cu ${SRC_HEADERS} ${SRC_SVM_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_UTIL_HEADERS})
 	set(cuda_cubins)
 
 	foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
 		set(cuda_cubin kernel_${arch}.cubin)
 
-		if(${arch} MATCHES "sm_1[0-9]")
-			# sm_1x
-			set(cuda_arch_flags "--maxrregcount=24 --opencc-options -OPT:Olimit=0")
-		elseif(${arch} MATCHES "sm_2[0-9]")
-			# sm_2x
-			set(cuda_arch_flags "--maxrregcount=24")
+		set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${CUDA_VERSION}")
+
+		# warn for other versions
+		if(CUDA_VERSION MATCHES "50")
+		else()
+			message(STATUS "CUDA version ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} detected, build may succeed but only CUDA 5.0 is officially supported")
+		endif()
+
+		# build flags depending on CUDA version and arch
+		if(CUDA_VERSION LESS 50)
+			# CUDA 4.x
+			if(${arch} MATCHES "sm_1[0-9]")
+				# sm_1x
+				set(cuda_arch_flags "--maxrregcount=24 --opencc-options -OPT:Olimit=0")
+			elseif(${arch} MATCHES "sm_2[0-9]")
+				# sm_2x
+				set(cuda_arch_flags "--maxrregcount=24")
+			else()
+				# sm_3x
+				set(cuda_arch_flags "--maxrregcount=32")
+			endif()
+
+			set(cuda_math_flags "")
 		else()
-			# sm_3x
-			set(cuda_arch_flags "--maxrregcount=32")
+			# CUDA 5.x
+			if(${arch} MATCHES "sm_1[0-9]")
+				# sm_1x
+				set(cuda_arch_flags "--maxrregcount=24 --opencc-options -OPT:Olimit=0")
+			elseif(${arch} MATCHES "sm_2[0-9]")
+				# sm_2x
+				set(cuda_arch_flags "--maxrregcount=32")
+			else()
+				# sm_3x
+				set(cuda_arch_flags "--maxrregcount=32")
+			endif()
+
+			set(cuda_math_flags "--use_fast_math")
 		endif()
 		
 		add_custom_command(
 			OUTPUT ${cuda_cubin}
-			COMMAND ${CUDA_NVCC_EXECUTABLE} -arch=${arch} -m${CUDA_BITS} --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} --ptxas-options="-v" ${cuda_arch_flags} -I${CMAKE_CURRENT_SOURCE_DIR}/../util -I${CMAKE_CURRENT_SOURCE_DIR}/svm -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC
+			COMMAND ${CUDA_NVCC_EXECUTABLE} -arch=${arch} -m${CUDA_BITS} --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} --ptxas-options="-v" ${cuda_arch_flags} ${cuda_version_flags} ${cuda_math_flags} -I${CMAKE_CURRENT_SOURCE_DIR}/../util -I${CMAKE_CURRENT_SOURCE_DIR}/svm -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC
 			DEPENDS ${cuda_sources})
 
 		delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)
diff --git a/intern/cycles/kernel/SConscript b/intern/cycles/kernel/SConscript
index 3a46d10dee1..353ec1ce9d8 100644
--- a/intern/cycles/kernel/SConscript
+++ b/intern/cycles/kernel/SConscript
@@ -25,6 +25,8 @@
 #
 # ***** END GPL LICENSE BLOCK *****
 
+import re
+import subprocess
 import sys
 import os
 import Blender as B
@@ -60,10 +62,19 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']:
     svm_dir = os.path.join(source_dir, "../svm")
     closure_dir = os.path.join(source_dir, "../closure")
 
+    # get CUDA version
+    nvcc_pipe = subprocess.Popen([nvcc, "--version"],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
+    output, erroroutput = nvcc_pipe.communicate()
+    cuda_major_minor = re.findall(r'release (\d+).(\d+)', output)[0]
+    cuda_version = int(cuda_major_minor[0])*10 + int(cuda_major_minor[1])
+
+    if cuda_version != 50:
+        print("CUDA version %d.%d detected, build may succeed but only CUDA 5.0 is officially supported." % (cuda_version/10, cuda_version%10))
+
     # nvcc flags
     nvcc_flags = "-m%s" % (bits)
-    nvcc_flags += " --cubin --ptxas-options=\"-v\" --maxrregcount=24"
-    nvcc_flags += " --opencc-options -OPT:Olimit=0"
+    nvcc_flags += " --cubin --ptxas-options=\"-v\""
+    nvcc_flags += " -D__KERNEL_CUDA_VERSION__=%d" % (cuda_version)
     nvcc_flags += " -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC"
     nvcc_flags += " -I \"%s\" -I \"%s\" -I \"%s\"" % (util_dir, svm_dir, closure_dir)
 
@@ -75,7 +86,31 @@ if env['WITH_BF_CYCLES_CUDA_BINARIES']:
     for arch in cuda_archs:
         cubin_file = os.path.join(build_dir, "kernel_%s.cubin" % arch)
 
-        command = "\"%s\" -arch=%s %s \"%s\" -o \"%s\"" % (nvcc, arch, nvcc_flags, kernel_file, cubin_file)
+        # build flags depending on CUDA version and arch
+        if cuda_version < 50:
+            # CUDA 4.x
+            if arch.startswith("sm_1"):
+                # sm_1x
+                cuda_arch_flags = "--maxrregcount=24 --opencc-options -OPT:Olimit=0"
+            elif arch.startswith("sm_2"):
+                # sm_2x
+                cuda_arch_flags = "--maxrregcount=24"
+            else:
+                # sm_3x
+                cuda_arch_flags = "--maxrregcount=32"
+        else:
+            # CUDA 5.x
+            if arch.startswith("sm_1"):
+                # sm_1x
+                cuda_arch_flags = "--maxrregcount=24 --opencc-options -OPT:Olimit=0 --use_fast_math"
+            elif arch.startswith("sm_2"):
+                # sm_2x
+                cuda_arch_flags = "--maxrregcount=32 --use_fast_math"
+            else:
+                # sm_3x
+                cuda_arch_flags = "--maxrregcount=32 --use_fast_math"
+
+        command = "\"%s\" -arch=%s %s %s \"%s\" -o \"%s\"" % (nvcc, arch, nvcc_flags, cuda_arch_flags, kernel_file, cubin_file)
 
         kernel.Command(cubin_file, 'kernel.cu', command)
         kernel.Depends(cubin_file, dependencies)
diff --git a/intern/cycles/kernel/kernel_jitter.h b/intern/cycles/kernel/kernel_jitter.h
index 17704b48cee..3e1a18ab469 100644
--- a/intern/cycles/kernel/kernel_jitter.h
+++ b/intern/cycles/kernel/kernel_jitter.h
@@ -137,7 +137,7 @@ __device_inline float cmj_randfloat(uint i, uint p)
 }
 
 #ifdef __CMJ__
-__device_noinline float cmj_sample_1D(int s, int N, int p)
+__device float cmj_sample_1D(int s, int N, int p)
 {
 	uint x = cmj_permute(s, N, p * 0x68bc21eb);
 	float jx = cmj_randfloat(s, p * 0x967a889b);
@@ -146,7 +146,7 @@ __device_noinline float cmj_sample_1D(int s, int N, int p)
 	return (x + jx)*invN;
 }
 
-__device_noinline void cmj_sample_2D(int s, int N, int p, float *fx, float *fy)
+__device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy)
 {
 	int m = float_to_int(sqrtf(N));
 	int n = (N + m - 1)/m;
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index 7e41ee35ae0..039981a031a 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -38,7 +38,12 @@ CCL_NAMESPACE_BEGIN
 /* ShaderData setup from incoming ray */
 
 #ifdef __OBJECT_MOTION__
-__device_noinline void shader_setup_object_transforms(KernelGlobals *kg, ShaderData *sd, float time)
+#if defined(__KERNEL_CUDA_VERSION__) && __KERNEL_CUDA_VERSION__ <= 42
+__device_noinline
+#else
+__device
+#endif
+void shader_setup_object_transforms(KernelGlobals *kg, ShaderData *sd, float time)
 {
 	/* note that this is a separate non-inlined function to work around crash
 	 * on CUDA sm 2.0, otherwise kernel execution crashes (compiler bug?) */
@@ -53,7 +58,12 @@ __device_noinline void shader_setup_object_transforms(KernelGlobals *kg, ShaderD
 }
 #endif
 
-__device_noinline void shader_setup_from_ray(KernelGlobals *kg, ShaderData *sd,
+#if defined(__KERNEL_CUDA_VERSION__) && __KERNEL_CUDA_VERSION__ <= 42
+__device_noinline
+#else
+__device
+#endif
+void shader_setup_from_ray(KernelGlobals *kg, ShaderData *sd,
 	const Intersection *isect, const Ray *ray)
 {
 #ifdef __INSTANCING__
@@ -260,7 +270,12 @@ __device_inline void shader_setup_from_subsurface(KernelGlobals *kg, ShaderData
 
 /* ShaderData setup from position sampled on mesh */
 
-__device_noinline void shader_setup_from_sample(KernelGlobals *kg, ShaderData *sd,
+#if defined(__KERNEL_CUDA_VERSION__) && __KERNEL_CUDA_VERSION__ <= 42
+__device_noinline
+#else
+__device
+#endif
+void shader_setup_from_sample(KernelGlobals *kg, ShaderData *sd,
 	const float3 P, const float3 Ng, const float3 I,
 	int shader, int object, int prim, float u, float v, float t, float time, int segment)
 {
diff --git a/intern/cycles/util/util_cuda.cpp b/intern/cycles/util/util_cuda.cpp
index 6c9ee7c548f..42ffb04a793 100644
--- a/intern/cycles/util/util_cuda.cpp
+++ b/intern/cycles/util/util_cuda.cpp
@@ -16,6 +16,8 @@
  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  */
 
+#include <iostream>
+
 #include <stdlib.h>
 #include <stdio.h>
 
@@ -25,6 +27,11 @@
 #include "util_path.h"
 #include "util_string.h"
 
+#ifdef _WIN32
+#define popen _popen
+#define pclose _pclose
+#endif
+
 /* function defininitions */
 
 tcuInit *cuInit;
@@ -399,7 +406,15 @@ string cuCompilerPath()
 	const char *defaultpaths[] = {"C:/CUDA/bin", NULL};
 	const char *executable = "nvcc.exe";
 #else
-	const char *defaultpaths[] = {"/Developer/NVIDIA/CUDA-4.2/bin", "/usr/local/cuda-4.2/bin", "/usr/local/cuda/bin", NULL};
+	const char *defaultpaths[] = {
+		"/Developer/NVIDIA/CUDA-5.0/bin",
+		"/usr/local/cuda-5.0/bin",
+		"/usr/local/cuda/bin",
+		"/Developer/NVIDIA/CUDA-4.2/bin",
+		"/usr/local/cuda-4.2/bin", 
+		"/Developer/NVIDIA/CUDA-5.5/bin",
+		"/usr/local/cuda-5.5/bin",
+		NULL};
 	const char *executable = "nvcc";
 #endif
 
@@ -437,5 +452,46 @@ string cuCompilerPath()
 	return "";
 }
 
+/* Run the nvcc located by cuCompilerPath() with --version and parse the
+ * "Cuda compilation tools, release X.Y" part of its output.
+ *
+ * Returns 10*major + minor (e.g. 50 for release 5.0), or 0 when no compiler
+ * path was found, the compiler could not be run, or the version could not be
+ * parsed from its output. */
+int cuCompilerVersion()
+{
+	string path = cuCompilerPath();
+	if(path == "")
+		return 0;
+
+	/* get --version output; the path is quoted so that install locations
+	 * containing spaces reach the shell as a single word, same as the
+	 * nvcc compile command does */
+	string command = "\"" + path + "\" --version";
+	FILE *pipe = popen(command.c_str(), "r");
+	if(!pipe) {
+		fprintf(stderr, "CUDA: failed to run compiler to retrieve version\n");
+		return 0;
+	}
+
+	char buf[128];
+	string output = "";
+
+	/* fgets returns NULL on end of file as well as on a read error, so
+	 * looping on its result terminates in both cases (testing feof alone
+	 * would spin forever after a read error) */
+	while(fgets(buf, sizeof(buf), pipe) != NULL)
+		output += buf;
+
+	pclose(pipe);
+
+	/* parse version number */
+	string marker = "Cuda compilation tools, release ";
+	size_t offset = output.find(marker);
+	if(offset == string::npos) {
+		fprintf(stderr, "CUDA: failed to find version number in:\n\n%s\n", output.c_str());
+		return 0;
+	}
+
+	/* everything after the marker; sscanf only consumes the leading "X.Y" */
+	string versionstr = output.substr(offset + marker.size(), string::npos);
+	int major, minor;
+
+	if(sscanf(versionstr.c_str(), "%d.%d", &major, &minor) < 2) {
+		fprintf(stderr, "CUDA: failed to parse version number from:\n\n%s\n", output.c_str());
+		return 0;
+	}
+
+	return 10*major + minor;
+}
+
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/util/util_cuda.h b/intern/cycles/util/util_cuda.h
index 55feb3f84cb..ba4df0de90a 100644
--- a/intern/cycles/util/util_cuda.h
+++ b/intern/cycles/util/util_cuda.h
@@ -32,6 +32,7 @@ CCL_NAMESPACE_BEGIN
 bool cuLibraryInit();
 bool cuHavePrecompiledKernels();
 string cuCompilerPath();
+int cuCompilerVersion();
 
 CCL_NAMESPACE_END
author	Brecht Van Lommel <brechtvanlommel@pandora.be>	2013-06-19 21:54:23 +0400
committer	Brecht Van Lommel <brechtvanlommel@pandora.be>	2013-06-19 21:54:23 +0400
commit	16204bd64759fddc940800f39fc91461ee340424 (patch)
tree	e2f9f61775df316001e0bb328b15bf97fdf6555d /intern
parent	a7416641e6cb634e4ac5fd3ad14be57e308b52fa (diff)