Cycles:

* OpenCL now only uses GPU/Accelerator devices, it's only confusing if CPU device is used, easy to enable in the code for debugging. * OpenCL kernel binaries are now cached for faster startup after the first time compiling. * CUDA kernels can now be compiled and cached at runtime if the CUDA toolkit is installed. This means that even if the build does not have CUDA enabled, it's still possible to use it as long as you install the toolkit.
author: Brecht Van Lommel <brechtvanlommel@pandora.be> 2011-09-09 16:04:39 +0400
committer: Brecht Van Lommel <brechtvanlommel@pandora.be> 2011-09-09 16:04:39 +0400
commit: cfbd6cf154bbc653422f30b2bf8077545f5fb99c (patch)
tree: b32a82345cf6d87f56ede78a8bdf1cd18ca2f0f3 /intern/cycles
parent: 9b31cba74e2bd84e9988ebdab723e6e43f9b8357 (diff)
11 files changed, 317 insertions, 46 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index 5d76f57fa73..05b9d4ad3bb 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -46,7 +46,7 @@ if(WITH_CYCLES_MULTI)
 endif()
 
 if(WITH_CYCLES_CUDA)
-  add_definitions(-DWITH_CUDA)
+  add_definitions(-DWITH_CUDA_BINARIES)
 endif()
 
 if(WITH_CYCLES_OSL)
@@ -58,6 +58,7 @@ if(WITH_CYCLES_PARTIO)
 endif()
 
 add_definitions(-DWITH_OPENCL)
+add_definitions(-DWITH_CUDA)
 
 include_directories(
 	${BOOST_INCLUDE_DIR}
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index d6ea15a435f..8cd7be567e6 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -21,7 +21,11 @@ import bpy
 def init():
     import libcycles_blender as lib
     import os.path
-    lib.init(os.path.dirname(__file__))
+
+    path = os.path.dirname(__file__)
+    user_path = os.path.dirname(os.path.abspath(bpy.utils.user_resource('CONFIG', '')))
+
+    lib.init(path, user_path)
 
 def create(engine, data, scene, region = 0, v3d = 0, rv3d = 0):
     import libcycles_blender as lib
diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp
index b40f5206fea..1026d420c02 100644
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -28,12 +28,12 @@ CCL_NAMESPACE_BEGIN
 
 static PyObject *init_func(PyObject *self, PyObject *args)
 {
-	const char *path;
+	const char *path, *user_path;
 
-	if(!PyArg_ParseTuple(args, "s", &path))
+	if(!PyArg_ParseTuple(args, "ss", &path, &user_path))
 		return NULL;
 	
-	path_init(path);
+	path_init(path, user_path);
 
 	Py_INCREF(Py_None);
 	return Py_None;
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index a6a66873b92..49ffd3d0834 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -28,7 +28,9 @@
 #include "util_map.h"
 #include "util_opengl.h"
 #include "util_path.h"
+#include "util_system.h"
 #include "util_types.h"
+#include "util_time.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -125,6 +127,15 @@ public:
 		} \
 	}
 
+	bool cuda_error(CUresult result)
+	{
+		if(result == CUDA_SUCCESS)
+			return false;
+
+		fprintf(stderr, "CUDA error: %s\n", cuda_error_string(result));
+		return true;
+	}
+
 	void cuda_push_context()
 	{
 		cuda_assert(cuCtxSetCurrent(cuContext))
@@ -140,17 +151,26 @@ public:
 		background = background_;
 
 		cuDevId = 0;
+		cuDevice = 0;
+		cuContext = 0;
 
 		/* intialize */
-		cuda_assert(cuInit(0))
+		if(cuda_error(cuInit(0)))
+			return;
 
 		/* setup device and context */
-		cuda_assert(cuDeviceGet(&cuDevice, cuDevId))
+		if(cuda_error(cuDeviceGet(&cuDevice, cuDevId)))
+			return;
+
+		CUresult result;
 
 		if(background)
-			cuda_assert(cuCtxCreate(&cuContext, 0, cuDevice))
+			result = cuCtxCreate(&cuContext, 0, cuDevice);
 		else
-			cuda_assert(cuGLCtxCreate(&cuContext, 0, cuDevice))
+			result = cuGLCtxCreate(&cuContext, 0, cuDevice);
+
+		if(cuda_error(result))
+			return;
 
 		cuda_pop_context();
 	}
@@ -173,21 +193,80 @@ public:
 		return string("CUDA ") + deviceName;
 	}
 
+	string compile_kernel()
+	{
+		/* compute cubin name */
+		int major, minor;
+		cuDeviceComputeCapability(&major, &minor, cuDevId);
+
+		/* attempt to use kernel provided with blender */
+		string cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin", major, minor));
+		if(path_exists(cubin))
+			return cubin;
+
+		/* not found, try to use locally compiled kernel */
+		string kernel_path = path_get("kernel");
+		string md5 = path_files_md5_hash(kernel_path);
+
+		cubin = string_printf("cycles_kernel_sm%d%d_%s.cubin", major, minor, md5.c_str());;
+		cubin = path_user_get(path_join("cache", cubin));
+
+		/* if exists already, use it */
+		if(path_exists(cubin))
+			return cubin;
+
+		/* if not, find CUDA compiler */
+		string nvcc = cuCompilerPath();
+
+		if(nvcc == "") {
+			fprintf(stderr, "CUDA nvcc compiler not found. Install CUDA toolkit in default location.\n");
+			return "";
+		}
+
+		/* compile */
+		string kernel = path_join(kernel_path, "kernel.cu");
+		string include = kernel_path;
+		const int machine = system_cpu_bits();
+		const int maxreg = 24;
+
+		double starttime = time_dt();
+		printf("Compiling CUDA kernel ...\n");
+
+		string command = string_printf("%s -arch=sm_%d%d -m%d --cubin \"%s\" --use_fast_math "
+			"-o \"%s\" --ptxas-options=\"-v\" --maxrregcount=%d --opencc-options -OPT:Olimit=0 -I\"%s\" -DNVCC",
+			nvcc.c_str(), major, minor, machine, kernel.c_str(), cubin.c_str(), maxreg, include.c_str());
+
+		system(command.c_str());
+
+		/* verify if compilation succeeded */
+		if(!path_exists(cubin)) {
+			fprintf(stderr, "CUDA kernel compilation failed.\n");
+			return "";
+		}
+
+		printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
+
+		return cubin;
+	}
 
 	bool load_kernels()
 	{
-		CUresult result;
-		int major, minor;
+		/* check if cuda init succeeded */
+		if(cuContext == 0)
+			return false;
 
-		cuda_push_context();
+		/* get kernel */
+		string cubin = compile_kernel();
+
+		if(cubin == "")
+			return false;
 
 		/* open module */
-		cuDeviceComputeCapability(&major, &minor, cuDevId);
-		string cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin", major, minor));
+		cuda_push_context();
 
-		result = cuModuleLoad(&cuModule, cubin.c_str());
-		if(result != CUDA_SUCCESS)
-			fprintf(stderr, "Failed loading CUDA kernel %s (%s).\n", cubin.c_str(), cuda_error_string(result));
+		CUresult result = cuModuleLoad(&cuModule, cubin.c_str());
+		if(cuda_error(result))
+			fprintf(stderr, "Failed loading CUDA kernel %s.\n", cubin.c_str());
 
 		cuda_pop_context();
 
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index c142701c873..a41238ffe09 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -27,6 +27,7 @@
 
 #include "util_map.h"
 #include "util_math.h"
+#include "util_md5.h"
 #include "util_opencl.h"
 #include "util_opengl.h"
 #include "util_path.h"
@@ -118,7 +119,7 @@ public:
 	void opencl_assert(cl_int err)
 	{
 		if(err != CL_SUCCESS) {
-			printf("error (%d): %s\n", err, opencl_error_string(err));
+			fprintf(stderr, "OpenCL error (%d): %s\n", err, opencl_error_string(err));
 #ifndef NDEBUG
 			abort();
 #endif
@@ -157,7 +158,7 @@ public:
 
 		cpPlatform = platform_ids[0]; /* todo: pick specified platform && device */
 
-		ciErr = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_ALL, 1, &cdDevice, NULL);
+		ciErr = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU|CL_DEVICE_TYPE_ACCELERATOR, 1, &cdDevice, NULL);
 		if(opencl_error(ciErr))
 			return;
 
@@ -208,38 +209,67 @@ public:
 		return true;
 	}
 
-	bool load_kernels()
+	bool load_binary(const string& kernel_path, const string& clbin)
 	{
-		/* verify if device was initialized */
-		if(!device_initialized)
+		/* read binary into memory */
+		vector<uint8_t> binary;
+
+		if(!path_read_binary(clbin, binary)) {
+			fprintf(stderr, "OpenCL failed to read cached binary %s.\n", clbin.c_str());
 			return false;
+		}
 
-		/* verify we have right opencl version */
-		if(!opencl_version_check())
+		/* create program */
+		cl_int status;
+		size_t size = binary.size();
+		const uint8_t *bytes = &binary[0];
+
+		cpProgram = clCreateProgramWithBinary(cxContext, 1, &cdDevice,
+			&size, &bytes, &status, &ciErr);
+
+		if(opencl_error(status) || opencl_error(ciErr)) {
+			fprintf(stderr, "OpenCL failed create program from cached binary %s.\n", clbin.c_str());
 			return false;
+		}
 
-		/* we compile kernels consisting of many files. unfortunately opencl
-		   kernel caches do not seem to recognize changes in included files.
-		   so we force recompile on changes by adding the md5 hash of all files */
-		string kernel_path = path_get("kernel");
-		string kernel_md5 = path_files_md5_hash(kernel_path);
+		if(!build_kernel(kernel_path))
+			return false;
 
-		string source = "#include \"kernel.cl\" // " + kernel_md5 + "\n";
-		size_t source_len = source.size();
-		const char *source_str = source.c_str();
+		return true;
+	}
+
+	bool save_binary(const string& clbin)
+	{
+		size_t size = 0;
+		clGetProgramInfo(cpProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);
+
+		if(!size)
+			return false;
+
+		vector<uint8_t> binary(size);
+		uint8_t *bytes = &binary[0];
+
+		clGetProgramInfo(cpProgram, CL_PROGRAM_BINARIES, sizeof(uint8_t*), &bytes, NULL);
+
+		if(!path_write_binary(clbin, binary)) {
+			fprintf(stderr, "OpenCL failed to write cached binary %s.\n", clbin.c_str());
+			return false;
+		}
+
+		return true;
+	}
 
+	bool build_kernel(const string& kernel_path)
+	{
 		string build_options = "";
 
 		build_options += "-I " + kernel_path + ""; /* todo: escape path */
 		build_options += " -cl-fast-relaxed-math -cl-strict-aliasing";
 
-		cpProgram = clCreateProgramWithSource(cxContext, 1, &source_str, &source_len, &ciErr);
-		if(opencl_error(ciErr))
-			return false;
-
 		ciErr = clBuildProgram(cpProgram, 0, NULL, build_options.c_str(), NULL, NULL);
 
 		if(ciErr != CL_SUCCESS) {
+			/* show build errors */
 			char *build_log;
 			size_t ret_val_size;
 
@@ -256,6 +286,87 @@ public:
 			return false;
 		}
 
+		return true;
+	}
+
+	bool compile_kernel(const string& kernel_path, const string& kernel_md5)
+	{
+		/* we compile kernels consisting of many files. unfortunately opencl
+		   kernel caches do not seem to recognize changes in included files.
+		   so we force recompile on changes by adding the md5 hash of all files */
+		string source = "#include \"kernel.cl\" // " + kernel_md5 + "\n";
+		size_t source_len = source.size();
+		const char *source_str = source.c_str();
+
+		cpProgram = clCreateProgramWithSource(cxContext, 1, &source_str, &source_len, &ciErr);
+
+		if(opencl_error(ciErr))
+			return false;
+
+		double starttime = time_dt();
+		printf("Compiling OpenCL kernel ...\n");
+
+		if(!build_kernel(kernel_path))
+			return false;
+
+		printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
+
+		return true;
+	}
+
+	string device_md5_hash()
+	{
+		MD5Hash md5;
+		char version[256], driver[256], name[256], vendor[256];
+
+		clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL);
+		clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
+		clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL);
+		clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL);
+
+		md5.append((uint8_t*)vendor, strlen(vendor));
+		md5.append((uint8_t*)version, strlen(version));
+		md5.append((uint8_t*)name, strlen(name));
+		md5.append((uint8_t*)driver, strlen(driver));
+
+		return md5.get_hex();
+	}
+
+	bool load_kernels()
+	{
+		/* verify if device was initialized */
+		if(!device_initialized) {
+			fprintf(stderr, "OpenCL: failed to initialize device.\n");
+			return false;
+		}
+
+		/* verify we have right opencl version */
+		if(!opencl_version_check())
+			return false;
+
+		/* md5 hash to detect changes */
+		string kernel_path = path_get("kernel");
+		string kernel_md5 = path_files_md5_hash(kernel_path);
+		string device_md5 = device_md5_hash();
+
+		/* try to use cache binary */
+		string clbin = string_printf("cycles_kernel_%s_%s.clbin", device_md5.c_str(), kernel_md5.c_str());;
+		clbin = path_user_get(path_join("cache", clbin));
+
+		if(path_exists(clbin)) {
+			/* if exists already, try use it */
+			if(!load_binary(kernel_path, clbin))
+				return false;
+		}
+		else {
+			/* compile kernel */
+			if(!compile_kernel(kernel_path, kernel_md5))
+				return false;
+
+			/* save binary for reuse */
+			save_binary(clbin);
+		}
+
 		/* find kernels */
 		ckPathTraceKernel = clCreateKernel(cpProgram, "kernel_ocl_path_trace", &ciErr);
 		if(opencl_error(ciErr))
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 31fe6994a1a..9714f941082 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -132,6 +132,7 @@ endif()
 #delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${kernel_preprocessed}" ${CYCLES_INSTALL_PATH}/kernel)
 
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel.cl" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel.cu" ${CYCLES_INSTALL_PATH}/kernel)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${headers}" ${CYCLES_INSTALL_PATH}/kernel)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${svm_headers}" ${CYCLES_INSTALL_PATH}/kernel/svm)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${util_headers}" ${CYCLES_INSTALL_PATH}/kernel)
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index 3a0eff5210c..72aef463cab 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -21,14 +21,14 @@
 
 #define __KERNEL_GPU__
 #define __KERNEL_CUDA__
+#define CCL_NAMESPACE_BEGIN
+#define CCL_NAMESPACE_END
 
 #include <cuda.h>
 #include <float.h>
 
 #include "util_types.h"
 
-CCL_NAMESPACE_BEGIN
-
 /* Qualifier wrappers for different names on different devices */
 
 #define __device  __device__ __inline__
@@ -60,7 +60,5 @@ typedef texture<uchar4, 2, cudaReadModeNormalizedFloat> texture_image_uchar4;
 
 #define kernel_data __data
 
-CCL_NAMESPACE_END
-
 #endif /* __KERNEL_COMPAT_CUDA_H__ */
 
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index f66c4a64b0b..5fb687971ef 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -384,7 +384,7 @@ void Session::run_cpu()
 void Session::run()
 {
 	/* load kernels */
-	progress.set_status("Loading render kernels (may take a few minutes)");
+	progress.set_status("Loading render kernels (may take a few minutes the first time)");
 
 	if(!device->load_kernels()) {
 		progress.set_status("Failed loading render kernel, see console for errors");
diff --git a/intern/cycles/util/util_cuda.cpp b/intern/cycles/util/util_cuda.cpp
index b371f4e9499..14e3f9a159b 100644
--- a/intern/cycles/util/util_cuda.cpp
+++ b/intern/cycles/util/util_cuda.cpp
@@ -16,6 +16,8 @@
  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  */
 
+#include <stdlib.h>
+
 #include "util_cuda.h"
 #include "util_debug.h"
 #include "util_dynlib.h"
@@ -371,6 +373,11 @@ bool cuLibraryInit()
 	/* cuda 4.0 */
 	CUDA_LIBRARY_FIND(cuCtxSetCurrent);
 
+#ifndef WITH_CUDA_BINARIES
+	if(cuCompilerPath() == "")
+		return false;
+#endif
+
 	/* success */
 	result = true;
 
@@ -379,13 +386,23 @@ bool cuLibraryInit()
 
 string cuCompilerPath()
 {
-		/* todo: better nvcc detection */
 #ifdef _WIN32
-	string nvcc = "C:/CUDA/bin/nvcc.exe";
+	const char *defaultpath = "C:/CUDA/bin";
+	const char *executable = "nvcc.exe";
 #else
-	string nvcc = "/usr/local/cuda/bin/nvcc";
+	const char *defaultpath = "/usr/local/cuda/bin";
+	const char *executable = "nvcc";
 #endif
 
+	const char *binpath = getenv("CUDA_BIN_PATH");
+
+	string nvcc;
+
+	if(binpath)
+		nvcc = path_join(binpath, executable);
+	else
+		nvcc = path_join(defaultpath, executable);
+
 	return (path_exists(nvcc))? nvcc: "";
 }
 
diff --git a/intern/cycles/util/util_path.cpp b/intern/cycles/util/util_path.cpp
index 086063bcb81..90093e32d58 100644
--- a/intern/cycles/util/util_path.cpp
+++ b/intern/cycles/util/util_path.cpp
@@ -24,6 +24,8 @@
 #include <OpenImageIO/sysutil.h>
 OIIO_NAMESPACE_USING
 
+#include <stdio.h>
+
 #define BOOST_FILESYSTEM_VERSION 2
 
 #include <boost/filesystem.hpp> 
@@ -32,10 +34,12 @@ OIIO_NAMESPACE_USING
 CCL_NAMESPACE_BEGIN
 
 static string cached_path = "";
+static string cached_user_path = "";
 
-void path_init(const string& path)
+void path_init(const string& path, const string& user_path)
 {
 	cached_path = path;
+	cached_user_path = user_path;
 }
 
 string path_get(const string& sub)
@@ -46,6 +50,14 @@ string path_get(const string& sub)
 	return path_join(cached_path, sub);
 }
 
+string path_user_get(const string& sub)
+{
+	if(cached_user_path == "")
+		cached_user_path = path_dirname(Sysutil::this_program_path());
+
+	return path_join(cached_user_path, sub);
+}
+
 string path_filename(const string& path)
 {
 	return boost::filesystem::path(path).filename();
@@ -97,5 +109,48 @@ string path_files_md5_hash(const string& dir)
 	return hash.get_hex();
 }
 
+bool path_write_binary(const string& path, const vector<uint8_t>& binary)
+{
+	/* write binary file from memory */
+	boost::filesystem::create_directories(path_dirname(path));
+
+	FILE *f = fopen(path.c_str(), "wb");
+
+	if(!f)
+		return false;
+
+	if(binary.size() > 0)
+		fwrite(&binary[0], sizeof(uint8_t), binary.size(), f);
+
+	fclose(f);
+
+	return true;
+}
+
+bool path_read_binary(const string& path, vector<uint8_t>& binary)
+{
+	binary.resize(boost::filesystem::file_size(path));
+
+	/* read binary file into memory */
+	FILE *f = fopen(path.c_str(), "rb");
+
+	if(!f)
+		return false;
+
+	if(binary.size() == 0) {
+		fclose(f);
+		return false;
+	}
+
+	if(fread(&binary[0], sizeof(uint8_t), binary.size(), f) != binary.size()) {
+		fclose(f);
+		return false;
+	}
+
+	fclose(f);
+
+	return true;
+}
+
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/util/util_path.h b/intern/cycles/util/util_path.h
index 3e07f9524bf..ec3524d362a 100644
--- a/intern/cycles/util/util_path.h
+++ b/intern/cycles/util/util_path.h
@@ -25,11 +25,13 @@
  * then makes all paths relative to that. */
 
 #include "util_string.h"
+#include "util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
-void path_init(const string& path = "");
+void path_init(const string& path = "", const string& user_path = "");
 string path_get(const string& sub = "");
+string path_user_get(const string& sub = "");
 
 string path_filename(const string& path);
 string path_dirname(const string& path);
@@ -39,6 +41,9 @@ string path_escape(const string& path);
 bool path_exists(const string& path);
 string path_files_md5_hash(const string& dir);
 
+bool path_write_binary(const string& path, const vector<uint8_t>& binary);
+bool path_read_binary(const string& path, vector<uint8_t>& binary);
+
 CCL_NAMESPACE_END
 
 #endif
author	Brecht Van Lommel <brechtvanlommel@pandora.be>	2011-09-09 16:04:39 +0400
committer	Brecht Van Lommel <brechtvanlommel@pandora.be>	2011-09-09 16:04:39 +0400
commit	cfbd6cf154bbc653422f30b2bf8077545f5fb99c (patch)
tree	b32a82345cf6d87f56ede78a8bdf1cd18ca2f0f3 /intern/cycles
parent	9b31cba74e2bd84e9988ebdab723e6e43f9b8357 (diff)