diff options
author | Martijn Berger <martijn.berger@gmail.com> | 2014-03-06 23:51:13 +0400 |
---|---|---|
committer | Martijn Berger <martijn.berger@gmail.com> | 2014-03-06 23:51:46 +0400 |
commit | 1d016758330b7e328758b3df28ea93a19d47fcdc (patch) | |
tree | e69b3194d8b51ec284a47ad79b2e7f3a17fe424b /intern/cycles/device | |
parent | 03afa6f9e7546d26766f0ac7bdb23da56a708306 (diff) |
CUDA: use streams and async calls to avoid busy-waiting
This switches the CUDA API usage towards using more of the Async calls.
Updating only once every second is sufficiently cheap that I don't think it is worth doing it less often.
Reviewed By: brecht
Differential Revision: https://developer.blender.org/D262
Diffstat (limited to 'intern/cycles/device')
-rw-r--r-- | intern/cycles/device/device_cuda.cpp | 41 |
1 files changed, 37 insertions, 4 deletions
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 0fbb48cf431..932fdc303a5 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -41,11 +41,14 @@ public: CUdevice cuDevice; CUcontext cuContext; CUmodule cuModule; + CUstream cuStream; + CUevent tileDone; map<device_ptr, bool> tex_interp_map; int cuDevId; int cuDevArchitecture; bool first_error; bool use_texture_storage; + unsigned int target_update_frequency; struct PixelMem { GLuint cuPBO; @@ -177,6 +180,8 @@ public: first_error = true; background = background_; use_texture_storage = true; + /* we try an update / sync every 1000 ms */ + target_update_frequency = 1000; cuDevId = info.num; cuDevice = 0; @@ -207,6 +212,9 @@ public: if(cuda_error_(result, "cuCtxCreate")) return; + cuda_assert(cuStreamCreate(&cuStream, 0)) + cuda_assert(cuEventCreate(&tileDone, 0x1)) + int major, minor; cuDeviceComputeCapability(&major, &minor, cuDevId); cuDevArchitecture = major*100 + minor*10; @@ -223,6 +231,8 @@ public: { task_pool.stop(); + cuda_assert(cuEventDestroy(tileDone)) + cuda_assert(cuStreamDestroy(cuStream)) cuda_assert(cuCtxDestroy(cuContext)) } @@ -645,9 +655,7 @@ public: cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1)) cuda_assert(cuFuncSetBlockShape(cuPathTrace, xthreads, ythreads, 1)) - cuda_assert(cuLaunchGrid(cuPathTrace, xblocks, yblocks)) - - cuda_assert(cuCtxSynchronize()) + cuda_assert(cuLaunchGridAsync(cuPathTrace, xblocks, yblocks, cuStream)) cuda_pop_context(); } @@ -964,11 +972,16 @@ public: bool branched = task->integrator_branched; + /* keep rendering tiles until done */ while(task->acquire_tile(this, tile)) { int start_sample = tile.start_sample; int end_sample = tile.start_sample + tile.num_samples; + boost::posix_time::ptime start_time(boost::posix_time::microsec_clock::local_time()); + boost::posix_time::ptime last_time = start_time; + int sync_sample = 10; + for(int sample = start_sample; 
sample < end_sample; sample++) { if (task->get_cancel()) { if(task->need_finish_queue == false) @@ -978,8 +991,28 @@ public: path_trace(tile, sample, branched); tile.sample = sample + 1; - task->update_progress(tile); + + if(sample == sync_sample){ + cuda_push_context(); + cuda_assert(cuEventRecord(tileDone, cuStream )) + cuda_assert(cuEventSynchronize(tileDone)) + + /* Do some time keeping to find out if we need to sync less */ + boost::posix_time::ptime current_time(boost::posix_time::microsec_clock::local_time()); + boost::posix_time::time_duration sample_duration = current_time - last_time; + + long msec = sample_duration.total_milliseconds(); + float scaling_factor = (float)target_update_frequency / (float)msec; + + /* sync at earliest next sample and probably later */ + sync_sample = (sample + 1) + sync_sample * ceil(scaling_factor); + + sync_sample = min(end_sample - 1, sync_sample); // make sure we sync the last sample always + + last_time = current_time; + cuda_pop_context(); + } } task->release_tile(tile); |