diff options
Diffstat (limited to 'intern/cycles/device/cuda/device_cuda_impl.cpp')
-rw-r--r-- | intern/cycles/device/cuda/device_cuda_impl.cpp | 220 |
1 files changed, 106 insertions, 114 deletions
diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp index 64c7f5e7d34..3a2eb8df95b 100644 --- a/intern/cycles/device/cuda/device_cuda_impl.cpp +++ b/intern/cycles/device/cuda/device_cuda_impl.cpp @@ -105,7 +105,7 @@ class CUDASplitKernel : public DeviceSplitKernel { virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name, const DeviceRequestedFeatures &); virtual int2 split_kernel_local_size(); - virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask *task); + virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask &task); }; /* Utility to push/pop CUDA context. */ @@ -135,8 +135,10 @@ BVHLayoutMask CUDADevice::get_bvh_layout_mask() const return BVH_LAYOUT_BVH2; } -void CUDADevice::cuda_error_documentation() +void CUDADevice::set_error(const string &error) { + Device::set_error(error); + if (first_error) { fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n"); fprintf(stderr, @@ -148,42 +150,13 @@ void CUDADevice::cuda_error_documentation() # define cuda_assert(stmt) \ { \ CUresult result = stmt; \ -\ if (result != CUDA_SUCCESS) { \ - string message = string_printf( \ - "CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \ - if (error_msg == "") \ - error_msg = message; \ - fprintf(stderr, "%s\n", message.c_str()); \ - /*cuda_abort();*/ \ - cuda_error_documentation(); \ + const char *name = cuewErrorString(result); \ + set_error(string_printf("%s in %s (device_cuda_impl.cpp:%d)", name, #stmt, __LINE__)); \ } \ } \ (void)0 -bool CUDADevice::cuda_error_(CUresult result, const string &stmt) -{ - if (result == CUDA_SUCCESS) - return false; - - string message = string_printf("CUDA error at %s: %s", stmt.c_str(), cuewErrorString(result)); - if (error_msg == "") - error_msg = message; - fprintf(stderr, "%s\n", message.c_str()); - cuda_error_documentation(); - return true; -} - -# define cuda_error(stmt) cuda_error_(stmt, # stmt) - -void CUDADevice::cuda_error_message(const string &message) -{ - if (error_msg == "") - error_msg = message; - fprintf(stderr, "%s\n", message.c_str()); - cuda_error_documentation(); -} - CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_) : Device(info, stats, profiler, background_), texture_info(this, "__texture_info", MEM_GLOBAL) { @@ -212,12 +185,19 @@ CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool functions.loaded = false; /* Intialize CUDA. */ - if (cuda_error(cuInit(0))) + CUresult result = cuInit(0); + if (result != CUDA_SUCCESS) { + set_error(string_printf("Failed to initialize CUDA runtime (%s)", cuewErrorString(result))); return; + } /* Setup device and context. */ - if (cuda_error(cuDeviceGet(&cuDevice, cuDevId))) + result = cuDeviceGet(&cuDevice, cuDevId); + if (result != CUDA_SUCCESS) { + set_error(string_printf("Failed to get CUDA device handle from ordinal (%s)", + cuewErrorString(result))); return; + } /* CU_CTX_MAP_HOST for mapping host memory when out of device memory. * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render, @@ -235,8 +215,6 @@ CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool } /* Create context. */ - CUresult result; - if (background) { result = cuCtxCreate(&cuContext, ctx_flags, cuDevice); } @@ -249,8 +227,10 @@ CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool } } - if (cuda_error_(result, "cuCtxCreate")) + if (result != CUDA_SUCCESS) { + set_error(string_printf("Failed to create CUDA context (%s)", cuewErrorString(result))); return; + } int major, minor; cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); @@ -263,7 +243,7 @@ CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool CUDADevice::~CUDADevice() { - task_pool.stop(); + task_pool.cancel(); delete split_kernel; @@ -280,10 +260,8 @@ bool CUDADevice::support_device(const DeviceRequestedFeatures & /*requested_feat /* We only support sm_30 and above */ if (major < 3) { - cuda_error_message( - string_printf("CUDA device supported only with compute capability 3.0 or up, found %d.%d.", - major, - minor)); + set_error(string_printf( + "CUDA backend requires compute capability 3.0 or up, but found %d.%d.", major, minor)); return false; } @@ -319,13 +297,19 @@ bool CUDADevice::check_peer_access(Device *peer_device) // Enable peer access in both directions { const CUDAContextScope scope(this); - if (cuda_error(cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0))) { + CUresult result = cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0); + if (result != CUDA_SUCCESS) { + set_error(string_printf("Failed to enable peer access on CUDA context (%s)", + cuewErrorString(result))); return false; } } { const CUDAContextScope scope(peer_device_cuda); - if (cuda_error(cuCtxEnablePeerAccess(cuContext, 0))) { + CUresult result = cuCtxEnablePeerAccess(cuContext, 0); + if (result != CUDA_SUCCESS) { + set_error(string_printf("Failed to enable peer access on CUDA context (%s)", + cuewErrorString(result))); return false; } } @@ -399,11 +383,24 @@ string CUDADevice::compile_kernel(const DeviceRequestedFeatures &requested_featu } } - const string ptx = path_get(string_printf("lib/%s_compute_%d%d.ptx", name, major, minor)); - VLOG(1) << "Testing for pre-compiled kernel " << ptx << "."; - if (path_exists(ptx)) { - VLOG(1) << "Using precompiled kernel."; - return ptx; + /* The driver can JIT-compile PTX generated for older generations, so find the closest one. */ + int ptx_major = major, ptx_minor = minor; + while (ptx_major >= 3) { + const string ptx = path_get( + string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor)); + VLOG(1) << "Testing for pre-compiled kernel " << ptx << "."; + if (path_exists(ptx)) { + VLOG(1) << "Using precompiled kernel."; + return ptx; + } + + if (ptx_minor > 0) { + ptx_minor--; + } + else { + ptx_major--; + ptx_minor = 9; + } } } @@ -432,14 +429,14 @@ string CUDADevice::compile_kernel(const DeviceRequestedFeatures &requested_featu # ifdef _WIN32 if (!use_adaptive_compilation() && have_precompiled_kernels()) { if (major < 3) { - cuda_error_message( - string_printf("CUDA device requires compute capability 3.0 or up, " - "found %d.%d. Your GPU is not supported.", + set_error( + string_printf("CUDA backend requires compute capability 3.0 or up, but found %d.%d. " + "Your GPU is not supported.", major, minor)); } else { - cuda_error_message( + set_error( string_printf("CUDA binary kernel for this graphics card compute " "capability (%d.%d) not found.", major, @@ -452,7 +449,7 @@ string CUDADevice::compile_kernel(const DeviceRequestedFeatures &requested_featu /* Compile. */ const char *const nvcc = cuewCompilerPath(); if (nvcc == NULL) { - cuda_error_message( + set_error( "CUDA nvcc compiler not found. " "Install CUDA toolkit in default location."); return string(); @@ -504,7 +501,7 @@ string CUDADevice::compile_kernel(const DeviceRequestedFeatures &requested_featu command = "call " + command; # endif if (system(command.c_str()) != 0) { - cuda_error_message( + set_error( "Failed to execute compilation command, " "see console for details."); return string(); @@ -512,7 +509,7 @@ string CUDADevice::compile_kernel(const DeviceRequestedFeatures &requested_featu /* Verify if compilation succeeded */ if (!path_exists(cubin)) { - cuda_error_message( + set_error( "CUDA kernel compilation failed, " "see console for details."); return string(); @@ -565,16 +562,19 @@ bool CUDADevice::load_kernels(const DeviceRequestedFeatures &requested_features) else result = CUDA_ERROR_FILE_NOT_FOUND; - if (cuda_error_(result, "cuModuleLoad")) - cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str())); + if (result != CUDA_SUCCESS) + set_error(string_printf( + "Failed to load CUDA kernel from '%s' (%s)", cubin.c_str(), cuewErrorString(result))); if (path_read_text(filter_cubin, cubin_data)) result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str()); else result = CUDA_ERROR_FILE_NOT_FOUND; - if (cuda_error_(result, "cuModuleLoad")) - cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str())); + if (result != CUDA_SUCCESS) + set_error(string_printf("Failed to load CUDA kernel from '%s' (%s)", + filter_cubin.c_str(), + cuewErrorString(result))); if (result == CUDA_SUCCESS) { reserve_local_memory(requested_features); @@ -870,7 +870,7 @@ CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_ if (mem_alloc_result != CUDA_SUCCESS) { status = " failed, out of device and host memory"; - cuda_assert(mem_alloc_result); + set_error("System is out of GPU and shared host memory"); } if (mem.name) { @@ -1773,7 +1773,7 @@ void CUDADevice::denoise(RenderTile &rtile, DenoisingTask &denoising) denoising.render_buffer.samples = rtile.sample; denoising.buffer.gpu_temporary_mem = true; - denoising.run_denoising(&rtile); + denoising.run_denoising(rtile); } void CUDADevice::adaptive_sampling_filter(uint filter_sample, @@ -2339,11 +2339,11 @@ void CUDADevice::draw_pixels(device_memory &mem, Device::draw_pixels(mem, y, w, h, width, height, dx, dy, dw, dh, transparent, draw_params); } -void CUDADevice::thread_run(DeviceTask *task) +void CUDADevice::thread_run(DeviceTask &task) { CUDAContextScope scope(this); - if (task->type == DeviceTask::RENDER) { + if (task.type == DeviceTask::RENDER) { DeviceRequestedFeatures requested_features; if (use_split_kernel()) { if (split_kernel == NULL) { @@ -2356,72 +2356,64 @@ void CUDADevice::thread_run(DeviceTask *task) /* keep rendering tiles until done */ RenderTile tile; - DenoisingTask denoising(this, *task); + DenoisingTask denoising(this, task); - while (task->acquire_tile(this, tile, task->tile_types)) { + while (task.acquire_tile(this, tile, task.tile_types)) { if (tile.task == RenderTile::PATH_TRACE) { if (use_split_kernel()) { device_only_memory<uchar> void_buffer(this, "void_buffer"); split_kernel->path_trace(task, tile, void_buffer, void_buffer); } else { - render(*task, tile, work_tiles); + render(task, tile, work_tiles); } } else if (tile.task == RenderTile::BAKE) { - render(*task, tile, work_tiles); + render(task, tile, work_tiles); } else if (tile.task == RenderTile::DENOISE) { tile.sample = tile.start_sample + tile.num_samples; denoise(tile, denoising); - task->update_progress(&tile, tile.w * tile.h); + task.update_progress(&tile, tile.w * tile.h); } - task->release_tile(tile); + task.release_tile(tile); - if (task->get_cancel()) { - if (task->need_finish_queue == false) + if (task.get_cancel()) { + if (task.need_finish_queue == false) break; } } work_tiles.free(); } - else if (task->type == DeviceTask::SHADER) { - shader(*task); + else if (task.type == DeviceTask::SHADER) { + shader(task); cuda_assert(cuCtxSynchronize()); } - else if (task->type == DeviceTask::DENOISE_BUFFER) { + else if (task.type == DeviceTask::DENOISE_BUFFER) { RenderTile tile; - tile.x = task->x; - tile.y = task->y; - tile.w = task->w; - tile.h = task->h; - tile.buffer = task->buffer; - tile.sample = task->sample + task->num_samples; - tile.num_samples = task->num_samples; - tile.start_sample = task->sample; - tile.offset = task->offset; - tile.stride = task->stride; - tile.buffers = task->buffers; - - DenoisingTask denoising(this, *task); + tile.x = task.x; + tile.y = task.y; + tile.w = task.w; + tile.h = task.h; + tile.buffer = task.buffer; + tile.sample = task.sample + task.num_samples; + tile.num_samples = task.num_samples; + tile.start_sample = task.sample; + tile.offset = task.offset; + tile.stride = task.stride; + tile.buffers = task.buffers; + + DenoisingTask denoising(this, task); denoise(tile, denoising); - task->update_progress(&tile, tile.w * tile.h); + task.update_progress(&tile, tile.w * tile.h); } } -class CUDADeviceTask : public DeviceTask { - public: - CUDADeviceTask(CUDADevice *device, DeviceTask &task) : DeviceTask(task) - { - run = function_bind(&CUDADevice::thread_run, device, this); - } -}; - void CUDADevice::task_add(DeviceTask &task) { CUDAContextScope scope(this); @@ -2437,7 +2429,10 @@ void CUDADevice::task_add(DeviceTask &task) film_convert(task, task.buffer, task.rgba_byte, task.rgba_half); } else { - task_pool.push(new CUDADeviceTask(this, task)); + task_pool.push([=] { + DeviceTask task_copy = task; + thread_run(task_copy); + }); } } @@ -2458,14 +2453,10 @@ void CUDADevice::task_cancel() # define cuda_assert(stmt) \ { \ CUresult result = stmt; \ -\ if (result != CUDA_SUCCESS) { \ - string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \ - if (device->error_msg == "") \ - device->error_msg = message; \ - fprintf(stderr, "%s\n", message.c_str()); \ - /*cuda_abort();*/ \ - device->cuda_error_documentation(); \ + const char *name = cuewErrorString(result); \ + device->set_error( \ + string_printf("%s in %s (device_cuda_impl.cpp:%d)", name, #stmt, __LINE__)); \ } \ } \ (void)0 @@ -2647,14 +2638,15 @@ bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim SplitKernelFunction *CUDASplitKernel::get_split_kernel_function(const string &kernel_name, const DeviceRequestedFeatures &) { - CUDAContextScope scope(device); - CUfunction func; + const CUDAContextScope scope(device); - cuda_assert( - cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data())); - if (device->have_error()) { - device->cuda_error_message( - string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data())); + CUfunction func; + const CUresult result = cuModuleGetFunction( + &func, device->cuModule, (string("kernel_cuda_") + kernel_name).data()); + if (result != CUDA_SUCCESS) { + device->set_error(string_printf("Could not find kernel \"kernel_cuda_%s\" in module (%s)", + kernel_name.data(), + cuewErrorString(result))); return NULL; } @@ -2668,7 +2660,7 @@ int2 CUDASplitKernel::split_kernel_local_size() int2 CUDASplitKernel::split_kernel_global_size(device_memory &kg, device_memory &data, - DeviceTask * /*task*/) + DeviceTask & /*task*/) { CUDAContextScope scope(device); size_t free; |