Cuda use streams and async to avoid busywaiting

This switches api usage for cuda towards using more of the Async calls. Updating only once every second is sufficiently cheap that I don't think it is worth doing it less often. Reviewed By: brecht Differential Revision: https://developer.blender.org/D262
author: Martijn Berger <martijn.berger@gmail.com> 2014-03-06 23:51:13 +0400
committer: Martijn Berger <martijn.berger@gmail.com> 2014-03-06 23:51:46 +0400
commit: 1d016758330b7e328758b3df28ea93a19d47fcdc (patch)
tree: e69b3194d8b51ec284a47ad79b2e7f3a17fe424b /intern/cycles/device
parent: 03afa6f9e7546d26766f0ac7bdb23da56a708306 (diff)
1 files changed, 37 insertions, 4 deletions
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 0fbb48cf431..932fdc303a5 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -41,11 +41,14 @@ public:
 	CUdevice cuDevice;
 	CUcontext cuContext;
 	CUmodule cuModule;
+	CUstream cuStream;
+	CUevent tileDone;
 	map<device_ptr, bool> tex_interp_map;
 	int cuDevId;
 	int cuDevArchitecture;
 	bool first_error;
 	bool use_texture_storage;
+	unsigned int target_update_frequency;
 
 	struct PixelMem {
 		GLuint cuPBO;
@@ -177,6 +180,8 @@ public:
 		first_error = true;
 		background = background_;
 		use_texture_storage = true;
+		/* we try an update / sync every 1000 ms */
+		target_update_frequency = 1000;
 
 		cuDevId = info.num;
 		cuDevice = 0;
@@ -207,6 +212,9 @@ public:
 		if(cuda_error_(result, "cuCtxCreate"))
 			return;
 
+		cuda_assert(cuStreamCreate(&cuStream, 0))
+		cuda_assert(cuEventCreate(&tileDone, 0x1))
+
 		int major, minor;
 		cuDeviceComputeCapability(&major, &minor, cuDevId);
 		cuDevArchitecture = major*100 + minor*10;
@@ -223,6 +231,8 @@ public:
 	{
 		task_pool.stop();
 
+		cuda_assert(cuEventDestroy(tileDone))
+		cuda_assert(cuStreamDestroy(cuStream))
 		cuda_assert(cuCtxDestroy(cuContext))
 	}
 
@@ -645,9 +655,7 @@ public:
 
 		cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1))
 		cuda_assert(cuFuncSetBlockShape(cuPathTrace, xthreads, ythreads, 1))
-		cuda_assert(cuLaunchGrid(cuPathTrace, xblocks, yblocks))
-
-		cuda_assert(cuCtxSynchronize())
+		cuda_assert(cuLaunchGridAsync(cuPathTrace, xblocks, yblocks, cuStream))
 
 		cuda_pop_context();
 	}
@@ -964,11 +972,16 @@ public:
 			
 			bool branched = task->integrator_branched;
 			
+
 			/* keep rendering tiles until done */
 			while(task->acquire_tile(this, tile)) {
 				int start_sample = tile.start_sample;
 				int end_sample = tile.start_sample + tile.num_samples;
 
+				boost::posix_time::ptime start_time(boost::posix_time::microsec_clock::local_time());
+				boost::posix_time::ptime last_time = start_time;
+				int sync_sample = 10;
+
 				for(int sample = start_sample; sample < end_sample; sample++) {
 					if (task->get_cancel()) {
 						if(task->need_finish_queue == false)
@@ -978,8 +991,28 @@ public:
 					path_trace(tile, sample, branched);
 
 					tile.sample = sample + 1;
-
 					task->update_progress(tile);
+
+					if(sample == sync_sample){
+						cuda_push_context();
+						cuda_assert(cuEventRecord(tileDone, cuStream ))
+						cuda_assert(cuEventSynchronize(tileDone))
+
+						/* Do some time keeping to find out if we need to sync less */
+						boost::posix_time::ptime current_time(boost::posix_time::microsec_clock::local_time());
+						boost::posix_time::time_duration sample_duration = current_time - last_time;
+
+						long msec = sample_duration.total_milliseconds();
+						float scaling_factor = (float)target_update_frequency / (float)msec;
+
+						/* sync at earliest next sample and probably later */
+						sync_sample = (sample + 1) + sync_sample * ceil(scaling_factor);
+
+						sync_sample = min(end_sample - 1, sync_sample); // make sure we sync the last sample always
+
+						last_time = current_time;
+						cuda_pop_context();
+					}
 				}
 
 				task->release_tile(tile);
author	Martijn Berger <martijn.berger@gmail.com>	2014-03-06 23:51:13 +0400
committer	Martijn Berger <martijn.berger@gmail.com>	2014-03-06 23:51:46 +0400
commit	1d016758330b7e328758b3df28ea93a19d47fcdc (patch)
tree	e69b3194d8b51ec284a47ad79b2e7f3a17fe424b /intern/cycles/device
parent	03afa6f9e7546d26766f0ac7bdb23da56a708306 (diff)