
git.blender.org/blender.git
author    Brecht Van Lommel <brechtvanlommel@pandora.be>  2013-05-31 20:19:03 +0400
committer Brecht Van Lommel <brechtvanlommel@pandora.be>  2013-05-31 20:19:03 +0400
commit    2d0a586c29e482d646dec0cf62880f577ff77657 (patch)
tree      e8ae7b03728ce3215592b7e76567c46facddf91e /intern/cycles/device/device_opencl.cpp
parent    db42a596aafeb7b33bee63c6bc8da205582b5257 (diff)
Cycles OpenCL: keep the OpenCL context and program around for quicker rendering
the second time, as for example Intel CPU startup time is 9 seconds.

* Add a cache for contexts and programs for each platform and device pair,
  which also ensures that no two threads try to compile and write the binary
  cache file at the same time.
* Change clFinish to clFlush so we don't block until the result is done;
  instead we block at the moment we copy back memory.
* Fix an error in the Cycles time_sleep implementation; it does not affect
  any active code though.
* Add some (disabled) debugging code in the task scheduler.

Patch #35559 by Doug Gale.
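
The cache added below uses a two-level locking pattern: a global lock guards the map of per-(platform, device) slots, and each slot has its own lock that get_context()/get_program() hand back to the caller when the entry is empty, so exactly one thread builds the context or compiles the program while the others wait on the slot. A minimal standalone sketch of that pattern, using std::mutex/std::map and a string payload in place of Cycles' thread_mutex/thread_scoped_lock and the cl_context/cl_program handles (all names below are illustrative, not taken from the patch):

#include <cassert>
#include <map>
#include <memory>
#include <mutex>
#include <string>
#include <utility>

/* Illustrative stand-ins: the real key is a (cl_platform_id, cl_device_id)
 * pair and the payload is a retained cl_context or cl_program. */
typedef std::pair<int, int> Key;

struct Slot {
    std::unique_ptr<std::mutex> mutex; /* guards creation of the payload */
    std::string payload;               /* empty means "not cached yet" */
};

class Cache {
    std::map<Key, Slot> slots;
    std::mutex cache_lock; /* guards the map itself */

public:
    /* Returns the cached payload, or an empty string with slot_locker left
     * holding the slot lock so the caller can build and store the payload. */
    std::string get(const Key &key, std::unique_lock<std::mutex> &slot_locker)
    {
        std::unique_lock<std::mutex> lock(cache_lock);
        Slot &slot = slots[key]; /* inserts an empty slot if missing */
        if(!slot.mutex)
            slot.mutex = std::unique_ptr<std::mutex>(new std::mutex);
        lock.unlock(); /* release the map lock before taking the slot lock */

        slot_locker = std::unique_lock<std::mutex>(*slot.mutex);
        if(slot.payload.empty())
            return ""; /* caller keeps the slot lock and must call store() */

        slot_locker.unlock();
        return slot.payload;
    }

    /* Must only be called with the lock handed out by an empty get(). */
    void store(const Key &key, const std::string &payload,
               std::unique_lock<std::mutex> &slot_locker)
    {
        std::unique_lock<std::mutex> lock(cache_lock);
        Slot &slot = slots.at(key); /* the slot exists: get() inserted it */
        lock.unlock();

        assert(slot.payload.empty());
        slot.payload = payload;
        slot_locker.unlock(); /* threads waiting in get() now see the value */
    }
};

A caller default-constructs the lock, calls get(), and if the result is empty it builds the object and calls store() with the same lock still held; that is the shape of how the patch wraps clCreateContext and the kernel build in the diff below.
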
Diffstat (limited to 'intern/cycles/device/device_opencl.cpp')
-rw-r--r--  intern/cycles/device/device_opencl.cpp  315
1 file changed, 267 insertions(+), 48 deletions(-)
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index 1cd538d655f..0b9881c0eb5 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -104,12 +104,194 @@ static string opencl_kernel_build_options(const string& platform, const string *
if(opencl_kernel_use_debug())
build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
- if (opencl_kernel_use_advanced_shading(platform))
+ if(opencl_kernel_use_advanced_shading(platform))
build_options += "-D__KERNEL_OPENCL_NEED_ADVANCED_SHADING__ ";
return build_options;
}
+/* thread safe cache for contexts and programs */
+class OpenCLCache
+{
+ struct Slot
+ {
+ thread_mutex *mutex;
+ cl_context context;
+ cl_program program;
+
+ Slot() : mutex(NULL), context(NULL), program(NULL) {}
+
+ Slot(const Slot &rhs)
+ : mutex(rhs.mutex)
+ , context(rhs.context)
+ , program(rhs.program)
+ {
+ /* copy can only happen in map insert, assert that */
+ assert(mutex == NULL);
+ }
+
+ ~Slot()
+ {
+ delete mutex;
+ mutex = NULL;
+ }
+ };
+
+ /* key is combination of platform ID and device ID */
+ typedef pair<cl_platform_id, cl_device_id> PlatformDevicePair;
+
+ /* map of Slot objects */
+ typedef map<PlatformDevicePair, Slot> CacheMap;
+ CacheMap cache;
+
+ thread_mutex cache_lock;
+
+ /* lazy instantiate */
+ static OpenCLCache &global_instance()
+ {
+ static OpenCLCache instance;
+ return instance;
+ }
+
+ OpenCLCache()
+ {
+ }
+
+ ~OpenCLCache()
+ {
+ /* Intel OpenCL bug raises SIGABRT due to pure virtual call
+ * so this is disabled. It's not necessary to free objects
+ * at process exit anyway.
+ * http://software.intel.com/en-us/forums/topic/370083#comments */
+
+ //flush();
+ }
+
+ /* lookup something in the cache. If this returns NULL, slot_locker
+ * will be holding a lock for the cache. slot_locker should refer to a
+ * default constructed thread_scoped_lock */
+ template<typename T>
+ static T get_something(cl_platform_id platform, cl_device_id device,
+ T Slot::*member, cl_int (*retain_func)(T), thread_scoped_lock &slot_locker)
+ {
+ assert(platform != NULL);
+
+ OpenCLCache &self = global_instance();
+
+ thread_scoped_lock cache_lock(self.cache_lock);
+
+ pair<CacheMap::iterator,bool> ins = self.cache.insert(
+ CacheMap::value_type(PlatformDevicePair(platform, device), Slot()));
+
+ Slot &slot = ins.first->second;
+
+ /* create slot lock only while holding cache lock */
+ if(!slot.mutex)
+ slot.mutex = new thread_mutex;
+
+ /* need to unlock cache before locking slot, to allow store to complete */
+ cache_lock.unlock();
+
+ /* lock the slot */
+ slot_locker = thread_scoped_lock(*slot.mutex);
+
+ /* If the thing isn't cached */
+ if(slot.*member == NULL) {
+ /* return with the caller's lock holder holding the slot lock */
+ return NULL;
+ }
+
+ /* the item was already cached, release the slot lock */
+ slot_locker.unlock();
+
+ /* caller is going to release it when done with it, so retain it */
+ cl_int ciErr = retain_func(slot.*member);
+ assert(ciErr == CL_SUCCESS);
+ (void)ciErr;
+
+ return slot.*member;
+ }
+
+ /* store something in the cache. you MUST have tried to get the item before storing to it */
+ template<typename T>
+ static void store_something(cl_platform_id platform, cl_device_id device, T thing,
+ T Slot::*member, cl_int (*retain_func)(T), thread_scoped_lock &slot_locker)
+ {
+ assert(platform != NULL);
+ assert(device != NULL);
+ assert(thing != NULL);
+
+ OpenCLCache &self = global_instance();
+
+ thread_scoped_lock cache_lock(self.cache_lock);
+ CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device));
+ cache_lock.unlock();
+
+ Slot &slot = i->second;
+
+ /* sanity check */
+ assert(i != self.cache.end());
+ assert(slot.*member == NULL);
+
+ slot.*member = thing;
+
+ /* unlock the slot */
+ slot_locker.unlock();
+
+ /* increment reference count in OpenCL.
+ * The caller is going to release the object when done with it. */
+ cl_int ciErr = retain_func(thing);
+ assert(ciErr == CL_SUCCESS);
+ (void)ciErr;
+ }
+
+public:
+ /* see get_something comment */
+ static cl_context get_context(cl_platform_id platform, cl_device_id device,
+ thread_scoped_lock &slot_locker)
+ {
+ return get_something(platform, device, &Slot::context, clRetainContext, slot_locker);
+ }
+
+ /* see get_something comment */
+ static cl_program get_program(cl_platform_id platform, cl_device_id device,
+ thread_scoped_lock &slot_locker)
+ {
+ return get_something(platform, device, &Slot::program, clRetainProgram, slot_locker);
+ }
+
+ /* see store_something comment */
+ static void store_context(cl_platform_id platform, cl_device_id device, cl_context context,
+ thread_scoped_lock &slot_locker)
+ {
+ store_something(platform, device, context, &Slot::context, clRetainContext, slot_locker);
+ }
+
+ /* see store_something comment */
+ static void store_program(cl_platform_id platform, cl_device_id device, cl_program program,
+ thread_scoped_lock &slot_locker)
+ {
+ store_something(platform, device, program, &Slot::program, clRetainProgram, slot_locker);
+ }
+
+ /* discard all cached contexts and programs
+ * the parameter is a temporary workaround. See OpenCLCache::~OpenCLCache */
+ static void flush()
+ {
+ OpenCLCache &self = global_instance();
+ thread_scoped_lock cache_lock(self.cache_lock);
+
+ foreach(CacheMap::value_type &item, self.cache) {
+ if(item.second.program != NULL)
+ clReleaseProgram(item.second.program);
+ if(item.second.context != NULL)
+ clReleaseContext(item.second.context);
+ }
+
+ self.cache.clear();
+ }
+};
+
class OpenCLDevice : public Device
{
public:
@@ -290,21 +472,34 @@ public:
opencl_error("OpenCL: no devices found.");
return;
}
- else if (!cdDevice) {
+ else if(!cdDevice) {
opencl_error("OpenCL: specified device not found.");
return;
}
- /* Create context properties array to specify platform */
- const cl_context_properties context_props[] = {
- CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform,
- 0, 0
- };
-
- /* create context */
- cxContext = clCreateContext(context_props, 1, &cdDevice, NULL, NULL, &ciErr);
- if(opencl_error(ciErr))
- return;
+ {
+ /* try to use cached context */
+ thread_scoped_lock cache_locker;
+ cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker);
+
+ if(cxContext == NULL) {
+ /* create context properties array to specify platform */
+ const cl_context_properties context_props[] = {
+ CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform,
+ 0, 0
+ };
+
+ /* create context */
+ cxContext = clCreateContext(context_props, 1, &cdDevice,
+ context_notify_callback, cdDevice, &ciErr);
+
+ if(opencl_error(ciErr))
+ return;
+
+ /* cache it */
+ OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker);
+ }
+ }
cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
if(opencl_error(ciErr))
@@ -317,6 +512,15 @@ public:
device_initialized = true;
}
+ static void context_notify_callback(const char *err_info,
+ const void *private_info, size_t cb, void *user_data)
+ {
+ char name[256];
+ clGetDeviceInfo((cl_device_id)user_data, CL_DEVICE_NAME, sizeof(name), &name, NULL);
+
+ fprintf(stderr, "OpenCL error (%s): %s\n", name, err_info);
+ }
+
bool opencl_version_check()
{
char version[256];
@@ -436,7 +640,7 @@ public:
string source = "#include \"kernel.cl\" // " + kernel_md5 + "\n";
source = path_source_replace_includes(source, kernel_path);
- if (debug_src)
+ if(debug_src)
path_write_text(*debug_src, source);
size_t source_len = source.size();
@@ -487,39 +691,49 @@ public:
return false;
}
- /* verify we have right opencl version */
- if(!opencl_version_check())
- return false;
+ /* try to use cached kernel */
+ thread_scoped_lock cache_locker;
+ cpProgram = OpenCLCache::get_program(cpPlatform, cdDevice, cache_locker);
- /* md5 hash to detect changes */
- string kernel_path = path_get("kernel");
- string kernel_md5 = path_files_md5_hash(kernel_path);
- string device_md5 = device_md5_hash();
+ if(!cpProgram) {
+ /* verify we have right opencl version */
+ if(!opencl_version_check())
+ return false;
- /* path to cached binary */
- string clbin = string_printf("cycles_kernel_%s_%s.clbin", device_md5.c_str(), kernel_md5.c_str());
- clbin = path_user_get(path_join("cache", clbin));
+ /* md5 hash to detect changes */
+ string kernel_path = path_get("kernel");
+ string kernel_md5 = path_files_md5_hash(kernel_path);
+ string device_md5 = device_md5_hash();
- /* path to preprocessed source for debugging */
- string clsrc, *debug_src = NULL;
-
- if (opencl_kernel_use_debug()) {
- clsrc = string_printf("cycles_kernel_%s_%s.cl", device_md5.c_str(), kernel_md5.c_str());
- clsrc = path_user_get(path_join("cache", clsrc));
- debug_src = &clsrc;
- }
+ /* path to cached binary */
+ string clbin = string_printf("cycles_kernel_%s_%s.clbin", device_md5.c_str(), kernel_md5.c_str());
+ clbin = path_user_get(path_join("cache", clbin));
- /* if exists already, try use it */
- if(path_exists(clbin) && load_binary(kernel_path, clbin, debug_src)) {
- /* kernel loaded from binary */
- }
- else {
- /* if does not exist or loading binary failed, compile kernel */
- if(!compile_kernel(kernel_path, kernel_md5, debug_src))
- return false;
+ /* path to preprocessed source for debugging */
+ string clsrc, *debug_src = NULL;
+
+ if(opencl_kernel_use_debug()) {
+ clsrc = string_printf("cycles_kernel_%s_%s.cl", device_md5.c_str(), kernel_md5.c_str());
+ clsrc = path_user_get(path_join("cache", clsrc));
+ debug_src = &clsrc;
+ }
- /* save binary for reuse */
- save_binary(clbin);
+ /* if exists already, try use it */
+ if(path_exists(clbin) && load_binary(kernel_path, clbin, debug_src)) {
+ /* kernel loaded from binary */
+ }
+ else {
+ /* if does not exist or loading binary failed, compile kernel */
+ if(!compile_kernel(kernel_path, kernel_md5, debug_src))
+ return false;
+
+ /* save binary for reuse */
+ if(!save_binary(clbin))
+ return false;
+ }
+
+ /* cache the program */
+ OpenCLCache::store_program(cpPlatform, cdDevice, cpProgram, cache_locker);
}
/* find kernels */
@@ -563,12 +777,17 @@ public:
{
size_t size = mem.memory_size();
+ cl_mem_flags mem_flag;
+ void *mem_ptr = NULL;
+
if(type == MEM_READ_ONLY)
- mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, size, NULL, &ciErr);
+ mem_flag = CL_MEM_READ_ONLY;
else if(type == MEM_WRITE_ONLY)
- mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, CL_MEM_WRITE_ONLY, size, NULL, &ciErr);
+ mem_flag = CL_MEM_WRITE_ONLY;
else
- mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_WRITE, size, NULL, &ciErr);
+ mem_flag = CL_MEM_READ_WRITE;
+
+ mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, mem_flag, size, mem_ptr, &ciErr);
opencl_assert(ciErr);
@@ -664,7 +883,7 @@ public:
size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};
/* some implementations have max size 1 on 2nd dimension */
- if (local_size[1] > max_work_items[1]) {
+ if(local_size[1] > max_work_items[1]) {
local_size[0] = workgroup_size/max_work_items[1];
local_size[1] = max_work_items[1];
}
@@ -674,7 +893,7 @@ public:
/* run kernel */
ciErr = clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, local_size, 0, NULL, NULL);
opencl_assert(ciErr);
- opencl_assert(clFinish(cqCommandQueue));
+ opencl_assert(clFlush(cqCommandQueue));
}
void path_trace(RenderTile& rtile, int sample)
@@ -789,7 +1008,7 @@ public:
int end_sample = tile.start_sample + tile.num_samples;
for(int sample = start_sample; sample < end_sample; sample++) {
- if (task->get_cancel()) {
+ if(task->get_cancel()) {
if(task->need_finish_queue == false)
break;
}
@@ -798,7 +1017,7 @@ public:
tile.sample = sample + 1;
- task->update_progress(tile);
+ //task->update_progress(tile);
}
task->release_tile(tile);
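
One other change worth flagging from the hunks above: the kernel enqueue now ends with clFlush() instead of clFinish(), so submitting work no longer waits for the device; per the commit message, the host only blocks later, when memory is copied back. A hedged sketch of that submit-then-blocking-read pattern against the standard OpenCL API (the function and its parameters are placeholders, not code from the patch):

#include <CL/cl.h> /* <OpenCL/opencl.h> on macOS */

/* Sketch under the assumption that the caller already created a valid queue,
 * kernel and output buffer; error handling is reduced to returning the code. */
static cl_int enqueue_and_read(cl_command_queue queue, cl_kernel kernel,
                               const size_t global_size[2],
                               const size_t local_size[2],
                               cl_mem out_buf, size_t out_size, void *out_host)
{
    cl_int err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL,
                                        global_size, local_size, 0, NULL, NULL);
    if(err != CL_SUCCESS)
        return err;

    /* clFlush submits the batch without waiting; clFinish would stall here. */
    err = clFlush(queue);
    if(err != CL_SUCCESS)
        return err;

    /* The blocking read (CL_TRUE) is where the host actually waits. */
    return clEnqueueReadBuffer(queue, out_buf, CL_TRUE, 0, out_size,
                               out_host, 0, NULL, NULL);
}

This is why the swap is safe in principle: a later blocking readback still synchronizes with the flushed work, so the enqueue path stops stalling without changing the results that are copied back.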