git.blender.org/blender.git

author    Brecht Van Lommel <brechtvanlommel@gmail.com>  2017-11-04 20:06:48 +0300
committer Brecht Van Lommel <brechtvanlommel@gmail.com>  2017-11-05 17:22:04 +0300
commit    5475314f4955dbc3af305577a26fe0b537380313 (patch)
tree      69797e49a7fb18f5c4b1239c28f74d99da2f70db /intern
parent    18d7fbe4f58c86f6f91b1e9e5c1f7f0c593a1d0f (diff)

Cycles: reserve CUDA local memory ahead of time.
This way we can log the amount of memory used, and it will be important for host mapped memory support.
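
For illustration, a minimal standalone sketch of the same approach, using only
the CUDA driver API. The no-op PTX kernel and file name are hypothetical
stand-ins for the real Cycles kernels, and error handling is omitted:

/* reserve_sketch.cpp - build with: nvcc reserve_sketch.cpp -lcuda */
#include <cuda.h>
#include <cstdio>

/* Hypothetical no-op kernel, JIT-compiled from PTX so the sketch is
 * self-contained. A real kernel that spills registers to local memory
 * would show a much larger reservation. */
static const char *kWarmupPTX =
    ".version 6.0\n"
    ".target sm_30\n"
    ".address_size 64\n"
    ".visible .entry warmup() { ret; }\n";

int main()
{
    cuInit(0);

    CUdevice device;
    cuDeviceGet(&device, 0);

    /* CU_CTX_LMEM_RESIZE_TO_MAX keeps local memory allocated across
     * launches instead of resizing it per launch, so one warm-up launch
     * reserves it for the lifetime of the context. */
    CUcontext context;
    cuCtxCreate(&context, CU_CTX_LMEM_RESIZE_TO_MAX, device);

    CUmodule module;
    CUfunction warmup;
    cuModuleLoadData(&module, kWarmupPTX);
    cuModuleGetFunction(&warmup, module, "warmup");

    /* Measure free memory around the launch; the difference is what the
     * driver set aside for local memory. */
    size_t total = 0, free_before = 0, free_after = 0;
    cuMemGetInfo(&free_before, &total);

    cuLaunchKernel(warmup, 1, 1, 1, 1, 1, 1, 0, NULL, NULL, NULL);
    cuCtxSynchronize();

    cuMemGetInfo(&free_after, &total);
    printf("Local memory reserved: %zu bytes\n", free_before - free_after);

    cuCtxDestroy(context);
    return 0;
}
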
Diffstat (limited to 'intern')
-rw-r--r--  intern/cycles/device/device_cuda.cpp | 72
1 file changed, 67 insertions(+), 5 deletions(-)
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 278fff02ae1..59d4fb055d0 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -234,24 +234,29 @@ public:
need_texture_info = false;
- /* intialize */
+ /* Initialize CUDA. */
if(cuda_error(cuInit(0)))
return;
- /* setup device and context */
+ /* Setup device and context. */
if(cuda_error(cuDeviceGet(&cuDevice, cuDevId)))
return;
+ /* CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
+ * so we can predict which memory to map to host. */
+ unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
+
+ /* Create context. */
CUresult result;
if(background) {
- result = cuCtxCreate(&cuContext, 0, cuDevice);
+ result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
}
else {
- result = cuGLCtxCreate(&cuContext, 0, cuDevice);
+ result = cuGLCtxCreate(&cuContext, ctx_flags, cuDevice);
if(result != CUDA_SUCCESS) {
- result = cuCtxCreate(&cuContext, 0, cuDevice);
+ result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
background = true;
}
}
@@ -542,9 +547,66 @@ public:
if(cuda_error_(result, "cuModuleLoad"))
cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str()));
+ if(result == CUDA_SUCCESS) {
+ reserve_local_memory(requested_features);
+ }
+
return (result == CUDA_SUCCESS);
}
+ void reserve_local_memory(const DeviceRequestedFeatures& requested_features)
+ {
+ if(use_split_kernel()) {
+ /* The split kernel mostly uses global memory and adaptive compilation,
+ * so it is currently difficult to predict how much is needed. */
+ return;
+ }
+
+ /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory
+ * needed for kernel launches, so that we can reliably figure out when
+ * to allocate scene data in mapped host memory. */
+ CUDAContextScope scope(this);
+
+ size_t total = 0, free_before = 0, free_after = 0;
+ cuMemGetInfo(&free_before, &total);
+
+ /* Get kernel function. */
+ CUfunction cuPathTrace;
+
+ if(requested_features.use_integrator_branched) {
+ cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
+ }
+ else {
+ cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
+ }
+
+ cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
+
+ int min_blocks, num_threads_per_block;
+ cuda_assert(cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0));
+
+ /* Launch the kernel; using just 1 block appears sufficient to
+ * reserve memory for all multiprocessors. It would still be good
+ * to do this in parallel for the multi-GPU case to make it faster. */
+ CUdeviceptr d_work_tiles = 0;
+ uint total_work_size = 0;
+
+ void *args[] = {&d_work_tiles,
+ &total_work_size};
+
+ cuda_assert(cuLaunchKernel(cuPathTrace,
+ 1, 1, 1,
+ num_threads_per_block, 1, 1,
+ 0, 0, args, 0));
+
+ cuda_assert(cuCtxSynchronize());
+
+ cuMemGetInfo(&free_after, &total);
+ VLOG(1) << "Local memory reserved "
+ << string_human_readable_number(free_before - free_after) << " bytes. ("
+ << string_human_readable_size(free_before - free_after) << ")";
+ }
+
void load_texture_info()
{
if(!info.has_fermi_limits && need_texture_info) {
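
For reference, the block size used in the warm-up launch above comes from the
CUDA occupancy API. A standalone sketch of that query, again with a
hypothetical no-op PTX kernel in place of a real Cycles kernel:

/* occupancy_sketch.cpp - build with: nvcc occupancy_sketch.cpp -lcuda */
#include <cuda.h>
#include <cstdio>

static const char *kPTX =
    ".version 6.0\n"
    ".target sm_30\n"
    ".address_size 64\n"
    ".visible .entry warmup() { ret; }\n";

int main()
{
    cuInit(0);

    CUdevice device;
    cuDeviceGet(&device, 0);

    CUcontext context;
    cuCtxCreate(&context, 0, device);

    CUmodule module;
    CUfunction kernel;
    cuModuleLoadData(&module, kPTX);
    cuModuleGetFunction(&kernel, module, "warmup");

    /* Ask the driver for the block size that maximizes occupancy for this
     * kernel, and the minimum grid size needed to saturate the device. */
    int min_blocks = 0, threads_per_block = 0;
    cuOccupancyMaxPotentialBlockSize(&min_blocks, &threads_per_block,
                                     kernel, NULL, 0, 0);

    printf("min grid size: %d, block size: %d\n",
           min_blocks, threads_per_block);

    cuCtxDestroy(context);
    return 0;
}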