From 88520dd5b6ff5bf310421ab6b9a30d7d49425685 Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel
Date: Sun, 24 Sep 2017 00:18:28 +0200
Subject: Code refactor: simplify CUDA context push/pop.

Makes it possible to call a function like mem_alloc() when the context is
already active. Also fixes some missing pops in case of errors.
---
 intern/cycles/device/device_cuda.cpp | 178 ++++++++++++++---------------------
 1 file changed, 69 insertions(+), 109 deletions(-)

diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 3b75142ee67..29b5bd70789 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -111,6 +111,16 @@ public:
 	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
 };
 
+/* Utility to push/pop CUDA context. */
+class CUDAContextScope {
+public:
+	CUDAContextScope(CUDADevice *device);
+	~CUDAContextScope();
+
+private:
+	CUDADevice *device;
+};
+
 class CUDADevice : public Device
 {
 public:
@@ -206,16 +216,6 @@ public:
 		cuda_error_documentation();
 	}
 
-	void cuda_push_context()
-	{
-		cuda_assert(cuCtxSetCurrent(cuContext));
-	}
-
-	void cuda_pop_context()
-	{
-		cuda_assert(cuCtxSetCurrent(NULL));
-	}
-
 	CUDADevice(DeviceInfo& info, Stats &stats, bool background_)
 	: Device(info, stats, background_)
 	{
@@ -263,7 +263,8 @@ public:
 		cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
 		cuDevArchitecture = major*100 + minor*10;
 
-		cuda_pop_context();
+		/* Pop context set by cuCtxCreate. */
+		cuCtxPopCurrent(NULL);
 	}
 
 	~CUDADevice()
@@ -519,7 +520,7 @@ public:
 			return false;
 
 		/* open module */
-		cuda_push_context();
+		CUDAContextScope scope(this);
 
 		string cubin_data;
 		CUresult result;
@@ -540,8 +541,6 @@ public:
 		if(cuda_error_(result, "cuModuleLoad"))
 			cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str()));
 
-		cuda_pop_context();
-
 		return (result == CUDA_SUCCESS);
 	}
 
@@ -556,36 +555,36 @@ public:
 
 	void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/)
 	{
+		CUDAContextScope scope(this);
+
 		if(name) {
 			VLOG(1) << "Buffer allocate: " << name << ", "
 			        << string_human_readable_number(mem.memory_size()) << " bytes. ("
 			        << string_human_readable_size(mem.memory_size()) << ")";
 		}
 
-		cuda_push_context();
 		CUdeviceptr device_pointer;
 		size_t size = mem.memory_size();
 		cuda_assert(cuMemAlloc(&device_pointer, size));
 		mem.device_pointer = (device_ptr)device_pointer;
 		mem.device_size = size;
 		stats.mem_alloc(size);
-		cuda_pop_context();
 	}
 
 	void mem_copy_to(device_memory& mem)
 	{
-		cuda_push_context();
+		CUDAContextScope scope(this);
+
 		if(mem.device_pointer)
 			cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), (void*)mem.data_pointer, mem.memory_size()));
-		cuda_pop_context();
 	}
 
 	void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
 	{
+		CUDAContextScope scope(this);
 		size_t offset = elem*y*w;
 		size_t size = elem*w*h;
 
-		cuda_push_context();
 		if(mem.device_pointer) {
 			cuda_assert(cuMemcpyDtoH((uchar*)mem.data_pointer + offset,
 			                         (CUdeviceptr)(mem.device_pointer + offset), size));
@@ -593,7 +592,6 @@ public:
 		else {
 			memset((char*)mem.data_pointer + offset, 0, size);
 		}
-		cuda_pop_context();
 	}
 
 	void mem_zero(device_memory& mem)
@@ -602,18 +600,17 @@ public:
 			memset((void*)mem.data_pointer, 0, mem.memory_size());
 		}
 
-		cuda_push_context();
-		if(mem.device_pointer)
+		if(mem.device_pointer) {
+			CUDAContextScope scope(this);
 			cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()));
-		cuda_pop_context();
+		}
 	}
 
 	void mem_free(device_memory& mem)
 	{
 		if(mem.device_pointer) {
-			cuda_push_context();
+			CUDAContextScope scope(this);
 			cuda_assert(cuMemFree(cuda_device_ptr(mem.device_pointer)));
-			cuda_pop_context();
 
 			mem.device_pointer = 0;
 
@@ -629,14 +626,13 @@ public:
 
 	void const_copy_to(const char *name, void *host, size_t size)
 	{
+		CUDAContextScope scope(this);
 		CUdeviceptr mem;
 		size_t bytes;
 
-		cuda_push_context();
 		cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
 		//assert(bytes == size);
 		cuda_assert(cuMemcpyHtoD(mem, host, size));
-		cuda_pop_context();
 	}
 
 	void tex_alloc(const char *name,
@@ -644,6 +640,8 @@ public:
 	               InterpolationType interpolation,
 	               ExtensionType extension)
 	{
+		CUDAContextScope scope(this);
+
 		VLOG(1) << "Texture allocate: " << name << ", "
 		        << string_human_readable_number(mem.memory_size()) << " bytes. ("
 		        << string_human_readable_size(mem.memory_size()) << ")";
@@ -706,9 +704,7 @@ public:
 			                          tokens[3].c_str());
 		}
 
-		cuda_push_context();
 		cuda_assert(cuModuleGetTexRef(&texref, cuModule, bind_name.c_str()));
-		cuda_pop_context();
 
 		if(!texref) {
 			return;
@@ -721,8 +717,6 @@ public:
 				mem_alloc(NULL, mem, MEM_READ_ONLY);
 				mem_copy_to(mem);
 
-				cuda_push_context();
-
 				CUdeviceptr cumem;
 				size_t cubytes;
 
@@ -738,28 +732,20 @@ public:
 					uint32_t ptr = (uint32_t)mem.device_pointer;
 					cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
 				}
-
-				cuda_pop_context();
 			}
 			else {
 				mem_alloc(NULL, mem, MEM_READ_ONLY);
 				mem_copy_to(mem);
 
-				cuda_push_context();
-
 				cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size));
 				cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT));
 				cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER));
-
-				cuda_pop_context();
 			}
 		}
 		/* Texture Storage */
 		else {
 			CUarray handle = NULL;
 
-			cuda_push_context();
-
 			if(mem.data_depth > 1) {
 				CUDA_ARRAY3D_DESCRIPTOR desc;
 
@@ -784,7 +770,6 @@ public:
 			}
 
 			if(!handle) {
-				cuda_pop_context();
 				return;
 			}
 
@@ -877,14 +862,10 @@ public:
 				cuda_assert(cuTexRefSetFilterMode(texref, filter_mode));
 				cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES));
 			}
-
-			cuda_pop_context();
 		}
 
 		/* Fermi, Data and Image Textures */
 		if(!has_bindless_textures) {
-			cuda_push_context();
-
 			cuda_assert(cuTexRefSetAddressMode(texref, 0, address_mode));
 			cuda_assert(cuTexRefSetAddressMode(texref, 1, address_mode));
 			if(mem.data_depth > 1) {
@@ -892,8 +873,6 @@ public:
 			}
 
 			cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements));
-
-			cuda_pop_context();
 		}
 
 		/* Fermi and Kepler */
@@ -904,9 +883,8 @@ public:
 	{
 		if(mem.device_pointer) {
 			if(tex_interp_map[mem.device_pointer]) {
-				cuda_push_context();
+				CUDAContextScope scope(this);
 				cuArrayDestroy((CUarray)mem.device_pointer);
-				cuda_pop_context();
 
 				/* Free CUtexObject (Bindless Textures) */
 				if(info.has_bindless_textures && tex_bindless_map[mem.device_pointer]) {
@@ -960,7 +938,7 @@ public:
 		if(have_error())
 			return false;
 
-		cuda_push_context();
+		CUDAContextScope scope(this);
 
 		int4 rect = task->rect;
 		int w = align_up(rect.z-rect.x, 4);
@@ -1017,7 +995,6 @@ public:
 		CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
 		cuda_assert(cuCtxSynchronize());
 
-		cuda_pop_context();
 		return !have_error();
 	}
 
@@ -1026,7 +1003,7 @@ public:
 		if(have_error())
 			return false;
 
-		cuda_push_context();
+		CUDAContextScope scope(this);
 
 		CUfunction cuFilterConstructTransform;
 		cuda_assert(cuModuleGetFunction(&cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform"));
@@ -1046,7 +1023,6 @@ public:
 		CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
 		cuda_assert(cuCtxSynchronize());
 
-		cuda_pop_context();
 		return !have_error();
 	}
 
@@ -1058,11 +1034,11 @@ public:
 		if(have_error())
 			return false;
 
+		CUDAContextScope scope(this);
+
 		mem_zero(task->storage.XtWX);
 		mem_zero(task->storage.XtWY);
 
-		cuda_push_context();
-
 		CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian, cuFinalize;
 		cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
 		cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
@@ -1150,7 +1126,6 @@ public:
 		CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
 		cuda_assert(cuCtxSynchronize());
 
-		cuda_pop_context();
 		return !have_error();
 	}
 
@@ -1161,7 +1136,7 @@ public:
 		if(have_error())
 			return false;
 
-		cuda_push_context();
+		CUDAContextScope scope(this);
 
 		CUfunction cuFilterCombineHalves;
 		cuda_assert(cuModuleGetFunction(&cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves"));
@@ -1179,7 +1154,6 @@ public:
 		CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
 		cuda_assert(cuCtxSynchronize());
 
-		cuda_pop_context();
 		return !have_error();
 	}
 
@@ -1190,7 +1164,7 @@ public:
 		if(have_error())
 			return false;
 
-		cuda_push_context();
+		CUDAContextScope scope(this);
 
 		CUfunction cuFilterDivideShadow;
 		cuda_assert(cuModuleGetFunction(&cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow"));
@@ -1214,7 +1188,6 @@ public:
 		CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
 		cuda_assert(cuCtxSynchronize());
 
-		cuda_pop_context();
 		return !have_error();
 	}
 
@@ -1227,7 +1200,7 @@ public:
 		if(have_error())
 			return false;
 
-		cuda_push_context();
+		CUDAContextScope scope(this);
 
 		CUfunction cuFilterGetFeature;
 		cuda_assert(cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature"));
@@ -1250,7 +1223,6 @@ public:
 		CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
 		cuda_assert(cuCtxSynchronize());
 
-		cuda_pop_context();
 		return !have_error();
 	}
 
@@ -1263,7 +1235,7 @@ public:
 		if(have_error())
 			return false;
 
-		cuda_push_context();
+		CUDAContextScope scope(this);
 
 		CUfunction cuFilterDetectOutliers;
 		cuda_assert(cuModuleGetFunction(&cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers"));
@@ -1282,7 +1254,6 @@ public:
 		CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args);
 		cuda_assert(cuCtxSynchronize());
 
-		cuda_pop_context();
 		return !have_error();
 	}
 
@@ -1319,7 +1290,7 @@ public:
 		if(have_error())
 			return;
 
-		cuda_push_context();
+		CUDAContextScope scope(this);
 
 		CUfunction cuPathTrace;
 		CUdeviceptr d_buffer = cuda_device_ptr(rtile.buffer);
@@ -1333,8 +1304,9 @@ public:
 			cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
 		}
 
-		if(have_error())
+		if(have_error()) {
 			return;
+		}
 
 		/* pass in parameters */
 		void *args[] = {&d_buffer,
@@ -1370,8 +1342,6 @@ public:
 		                           0, 0, args, 0));
 
 		cuda_assert(cuCtxSynchronize());
-
-		cuda_pop_context();
 	}
 
 	void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
@@ -1379,7 +1349,7 @@ public:
 		if(have_error())
 			return;
 
-		cuda_push_context();
+		CUDAContextScope scope(this);
 
 		CUfunction cuFilmConvert;
 		CUdeviceptr d_rgba = map_pixels((rgba_byte)? rgba_byte: rgba_half);
@@ -1424,8 +1394,6 @@ public:
 		                           0, 0, args, 0));
 
 		unmap_pixels((rgba_byte)? rgba_byte: rgba_half);
-
-		cuda_pop_context();
 	}
 
 	void shader(DeviceTask& task)
@@ -1433,7 +1401,7 @@ public:
 		if(have_error())
 			return;
 
-		cuda_push_context();
+		CUDAContextScope scope(this);
 
 		CUfunction cuShader;
 		CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
@@ -1498,8 +1466,6 @@ public:
 			task.update_progress(NULL);
 		}
-
-		cuda_pop_context();
 	}
 
 	CUdeviceptr map_pixels(device_ptr mem)
 	{
@@ -1535,7 +1501,7 @@ public:
 		pmem.w = mem.data_width;
 		pmem.h = mem.data_height;
 
-		cuda_push_context();
+		CUDAContextScope scope(this);
 
 		glGenBuffers(1, &pmem.cuPBO);
 		glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
@@ -1559,8 +1525,6 @@ public:
 		CUresult result = cuGraphicsGLRegisterBuffer(&pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
 
 		if(result == CUDA_SUCCESS) {
-			cuda_pop_context();
-
 			mem.device_pointer = pmem.cuTexId;
 			pixel_mem_map[mem.device_pointer] = pmem;
@@ -1574,8 +1538,6 @@ public:
 			glDeleteBuffers(1, &pmem.cuPBO);
 			glDeleteTextures(1, &pmem.cuTexId);
 
-			cuda_pop_context();
-
 			background = true;
 		}
 	}
@@ -1588,7 +1550,7 @@ public:
 		if(!background) {
 			PixelMem pmem = pixel_mem_map[mem.device_pointer];
 
-			cuda_push_context();
+			CUDAContextScope scope(this);
 
 			glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
 			uchar *pixels = (uchar*)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
@@ -1597,8 +1559,6 @@ public:
 			glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
 			glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
 
-			cuda_pop_context();
-
 			return;
 		}
@@ -1611,14 +1571,12 @@ public:
 		if(!background) {
 			PixelMem pmem = pixel_mem_map[mem.device_pointer];
 
-			cuda_push_context();
+			CUDAContextScope scope(this);
 
 			cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource));
 			glDeleteBuffers(1, &pmem.cuPBO);
 			glDeleteTextures(1, &pmem.cuTexId);
 
-			cuda_pop_context();
-
 			pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
 			mem.device_pointer = 0;
@@ -1639,7 +1597,7 @@ public:
 		PixelMem pmem = pixel_mem_map[mem.device_pointer];
 		float *vpointer;
 
-		cuda_push_context();
+		CUDAContextScope scope(this);
 
 		/* for multi devices, this assumes the inefficient method that we allocate
 		 * all pixels on the device even though we only render to a subset */
@@ -1728,8 +1686,6 @@ public:
 		glBindTexture(GL_TEXTURE_2D, 0);
 		glDisable(GL_TEXTURE_2D);
 
-		cuda_pop_context();
-
 		return;
 	}
@@ -1738,6 +1694,8 @@ public:
 
 	void thread_run(DeviceTask *task)
 	{
+		CUDAContextScope scope(this);
+
 		if(task->type == DeviceTask::RENDER) {
 			RenderTile tile;
@@ -1805,9 +1763,7 @@ public:
 
 			shader(*task);
 
-			cuda_push_context();
 			cuda_assert(cuCtxSynchronize());
-			cuda_pop_context();
 		}
 	}
@@ -1828,12 +1784,11 @@ public:
 	void task_add(DeviceTask& task)
 	{
 		if(task.type == DeviceTask::FILM_CONVERT) {
+			CUDAContextScope scope(this);
+
 			/* must be done in main thread due to opengl access */
 			film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
-
-			cuda_push_context();
 			cuda_assert(cuCtxSynchronize());
-			cuda_pop_context();
 		}
 		else {
 			task_pool.push(new CUDADeviceTask(this, task));
@@ -1852,6 +1807,7 @@ public:
 
 	friend class CUDASplitKernelFunction;
 	friend class CUDASplitKernel;
+	friend class CUDAContextScope;
 };
 
 /* redefine the cuda_assert macro so it can be used outside of the CUDADevice class
@@ -1872,6 +1828,20 @@ public:
 		} \
 	} (void)0
 
+
+/* CUDA context scope. */
+
+CUDAContextScope::CUDAContextScope(CUDADevice *device)
+: device(device)
+{
+	cuda_assert(cuCtxPushCurrent(device->cuContext));
+}
+
+CUDAContextScope::~CUDAContextScope()
+{
+	cuda_assert(cuCtxPopCurrent(NULL));
+}
+
 /* split kernel */
 
 class CUDASplitKernelFunction : public SplitKernelFunction{
@@ -1889,11 +1859,11 @@ public:
 	/* enqueue the kernel, returns false if there is an error */
 	bool enqueue(const KernelDimensions &dim, void *args[])
 	{
-		device->cuda_push_context();
-
 		if(device->have_error())
 			return false;
 
+		CUDAContextScope scope(device);
+
 		/* we ignore dim.local_size for now, as this is faster */
 		int threads_per_block;
 		cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
@@ -1907,8 +1877,6 @@ public:
 		                           threads_per_block, 1, 1, /* threads */
 		                           0, 0, args, 0));
 
-		device->cuda_pop_context();
-
 		return !device->have_error();
 	}
 };
@@ -1919,12 +1887,12 @@ CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device)
 
 uint64_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory& /*data*/, size_t num_threads)
 {
+	CUDAContextScope scope(device);
+
 	device_vector<uint64_t> size_buffer;
 	size_buffer.resize(1);
 	device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE);
 
-	device->cuda_push_context();
-
 	uint threads = num_threads;
 	CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer);
 
@@ -1946,8 +1914,6 @@ uint64_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory
 	                           1, 1, 1,
 	                           0, 0, (void**)&args, 0));
 
-	device->cuda_pop_context();
-
 	device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint64_t));
 	device->mem_free(size_buffer);
@@ -1965,7 +1931,7 @@ bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim
                                                      device_memory& use_queues_flag,
                                                      device_memory& work_pool_wgs)
 {
-	device->cuda_push_context();
+	CUDAContextScope scope(device);
 
 	CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer);
 	CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer);
@@ -2029,26 +1995,21 @@ bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim
 
 	CUDASplitKernelFunction(device, data_init).enqueue(dim, (void**)&args);
 
-	device->cuda_pop_context();
-
 	return !device->have_error();
 }
 
 SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(const string& kernel_name,
                                                                 const DeviceRequestedFeatures&)
 {
+	CUDAContextScope scope(device);
	CUfunction func;
 
-	device->cuda_push_context();
-
 	cuda_assert(cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data()));
 
 	if(device->have_error()) {
 		device->cuda_error_message(string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data()));
 		return NULL;
 	}
 
-	device->cuda_pop_context();
-
 	return new CUDASplitKernelFunction(device, func);
 }
 
@@ -2059,12 +2020,11 @@ int2 CUDASplitKernel::split_kernel_local_size()
 
 int2 CUDASplitKernel::split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask * /*task*/)
 {
+	CUDAContextScope scope(device);
 	size_t free;
 	size_t total;
 
-	device->cuda_push_context();
 	cuda_assert(cuMemGetInfo(&free, &total));
-	device->cuda_pop_context();
 
 	VLOG(1) << "Maximum device allocation size: "
 	        << string_human_readable_number(free) << " bytes. ("
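
For readers unfamiliar with the RAII pattern this patch introduces, below is a minimal, self-contained sketch of the same idea written against the plain CUDA driver API, outside of Cycles. The names ContextScope, check() and main() are illustrative only and do not appear in the patch; only the cuInit/cuDeviceGet/cuCtxCreate/cuCtxPushCurrent/cuCtxPopCurrent/cuMemAlloc calls are real driver entry points.

#include <cuda.h>
#include <cstdio>
#include <cstdlib>

/* Abort on any driver-API failure; a stand-in for Cycles' cuda_assert(). */
static void check(CUresult result, const char *call)
{
	if(result != CUDA_SUCCESS) {
		fprintf(stderr, "%s failed (CUresult %d)\n", call, (int)result);
		exit(EXIT_FAILURE);
	}
}

/* Push the context on construction, pop it on destruction. The driver keeps
 * a per-thread stack of current contexts, so scopes nest safely, and the
 * destructor runs on every path out of the enclosing block: an early return
 * or error can no longer leak a pushed context, which is the class of bug
 * the commit message mentions. */
class ContextScope {
public:
	explicit ContextScope(CUcontext context)
	{
		check(cuCtxPushCurrent(context), "cuCtxPushCurrent");
	}
	~ContextScope()
	{
		cuCtxPopCurrent(NULL);
	}
};

int main()
{
	CUdevice device;
	CUcontext context;

	check(cuInit(0), "cuInit");
	check(cuDeviceGet(&device, 0), "cuDeviceGet");
	check(cuCtxCreate(&context, 0, device), "cuCtxCreate");
	/* cuCtxCreate leaves the new context current; pop it so all later use
	 * goes through a scope, mirroring the patched CUDADevice constructor. */
	cuCtxPopCurrent(NULL);

	{
		ContextScope scope(context);  /* context is current within this block */

		CUdeviceptr ptr;
		check(cuMemAlloc(&ptr, 1024), "cuMemAlloc");
		check(cuMemFree(ptr), "cuMemFree");
	}  /* popped here, on success and error paths alike */

	cuCtxDestroy(context);
	return 0;
}

Because the pop lives in a destructor and the push/pop pair maintains a stack, a function like tex_alloc() that takes a scope can freely call mem_alloc(), which takes its own: the same context is simply pushed and popped twice. That is what lets the patch drop the explicit push/pop pairs around every call site.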