Cycles: Quick experiment with using feature-adaptive kernels for CUDA

Gives a few percent of memory improvement for the regular feature set kernel and could give a significant memory improvement for the Experimental kernel. It could also give some degree of performance improvement, but I haven't measured this reliably yet. The code is ifdef-ed for now, since it only works on Linux and requires the CUDA toolkit to be installed (other platforms only use precompiled kernels). This is just an experiment for now and a base for proper feature support in the future (with runtime compilation using CUDA 7?).
author: Sergey Sharybin <sergey.vfx@gmail.com> 2015-11-21 20:16:01 +0300
committer: Sergey Sharybin <sergey.vfx@gmail.com> 2015-11-21 20:16:01 +0300
commit: c08727ebabae1d84c9d4e3096a61ef131177d4d5 (patch)
tree: 57d093f0a17d9ef6d20ed7236922ec0aabf30a88
parent: f8ab3fd30f163fad9f96c889d1989faab5a471e0 (diff)
1 files changed, 22 insertions, 1 deletions
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 7f21cc5e036..ea023b4af66 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -27,6 +27,7 @@
 #include "util_debug.h"
 #include "util_logging.h"
 #include "util_map.h"
+#include "util_md5.h"
 #include "util_opengl.h"
 #include "util_path.h"
 #include "util_string.h"
@@ -34,6 +35,11 @@
 #include "util_types.h"
 #include "util_time.h"
 
+/* Use feature-adaptive kernel compilation.
+ * Requires the CUDA toolkit to be installed and currently only works on Linux.
+ */
+/* #define KERNEL_USE_ADAPTIVE */
+
 CCL_NAMESPACE_BEGIN
 
 class CUDADevice : public Device
@@ -221,10 +227,20 @@ public:
 		string kernel_path = path_get("kernel");
 		string md5 = path_files_md5_hash(kernel_path);
 
+#ifdef KERNEL_USE_ADAPTIVE
+		string feature_build_options = requested_features.get_build_options();
+		string device_md5 = util_md5_string(feature_build_options);
+		cubin = string_printf("cycles_kernel_%s_sm%d%d_%s.cubin",
+		                      device_md5.c_str(),
+		                      major, minor,
+		                      md5.c_str());
+#else
 		if(requested_features.experimental)
 			cubin = string_printf("cycles_kernel_experimental_sm%d%d_%s.cubin", major, minor, md5.c_str());
 		else
 			cubin = string_printf("cycles_kernel_sm%d%d_%s.cubin", major, minor, md5.c_str());
+#endif
+
 		cubin = path_user_get(path_join("cache", cubin));
 		VLOG(1) << "Testing for locally compiled kernel " << cubin;
 		/* if exists already, use it */
@@ -280,8 +296,13 @@ public:
 			"-DNVCC -D__KERNEL_CUDA_VERSION__=%d",
 			nvcc, major, minor, machine, kernel.c_str(), cubin.c_str(), include.c_str(), cuda_version);
 
-		if(requested_features.experimental)
+#ifdef KERNEL_USE_ADAPTIVE
+		command += " " + feature_build_options;
+#else
+		if(requested_features.experimental) {
 			command += " -D__KERNEL_EXPERIMENTAL__";
+		}
+#endif
 
 		if(getenv("CYCLES_CUDA_EXTRA_CFLAGS")) {
 			command += string(" ") + getenv("CYCLES_CUDA_EXTRA_CFLAGS");
author	Sergey Sharybin <sergey.vfx@gmail.com>	2015-11-21 20:16:01 +0300
committer	Sergey Sharybin <sergey.vfx@gmail.com>	2015-11-21 20:16:01 +0300
commit	c08727ebabae1d84c9d4e3096a61ef131177d4d5 (patch)
tree	57d093f0a17d9ef6d20ed7236922ec0aabf30a88
parent	f8ab3fd30f163fad9f96c889d1989faab5a471e0 (diff)