Diffstat (limited to 'intern/cycles/device/cuda')
 intern/cycles/device/cuda/device_impl.cpp      | 140
 intern/cycles/device/cuda/device_impl.h        |   5
 intern/cycles/device/cuda/graphics_interop.cpp |  15
 intern/cycles/device/cuda/graphics_interop.h   |   2
 intern/cycles/device/cuda/queue.cpp            |  54
 intern/cycles/device/cuda/queue.h              |   2
6 files changed, 47 insertions, 171 deletions
diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp
index 37fab8f8293..5e1a63c04df 100644
--- a/intern/cycles/device/cuda/device_impl.cpp
+++ b/intern/cycles/device/cuda/device_impl.cpp
@@ -31,7 +31,6 @@
 #  include "util/util_logging.h"
 #  include "util/util_map.h"
 #  include "util/util_md5.h"
-#  include "util/util_opengl.h"
 #  include "util/util_path.h"
 #  include "util/util_string.h"
 #  include "util/util_system.h"
@@ -837,7 +836,7 @@ void CUDADevice::mem_copy_to(device_memory &mem)
   }
 }
 
-void CUDADevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
+void CUDADevice::mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem)
 {
   if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
     assert(!"mem_copy_from not supported for textures.");
@@ -891,7 +890,7 @@ void CUDADevice::mem_free(device_memory &mem)
   }
 }
 
-device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/)
+device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/)
 {
   return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
 }
@@ -1169,141 +1168,6 @@ void CUDADevice::tex_free(device_texture &mem)
   }
 }
 
-#  if 0
-void CUDADevice::render(DeviceTask &task,
-                        RenderTile &rtile,
-                        device_vector<KernelWorkTile> &work_tiles)
-{
-  scoped_timer timer(&rtile.buffers->render_time);
-
-  if (have_error())
-    return;
-
-  CUDAContextScope scope(this);
-  CUfunction cuRender;
-
-  /* Get kernel function. */
-  if (rtile.task == RenderTile::BAKE) {
-    cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
-  }
-  else {
-    cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));
-  }
-
-  if (have_error()) {
-    return;
-  }
-
-  cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1));
-
-  /* Allocate work tile. */
-  work_tiles.alloc(1);
-
-  KernelWorkTile *wtile = work_tiles.data();
-  wtile->x = rtile.x;
-  wtile->y = rtile.y;
-  wtile->w = rtile.w;
-  wtile->h = rtile.h;
-  wtile->offset = rtile.offset;
-  wtile->stride = rtile.stride;
-  wtile->buffer = (float *)(CUdeviceptr)rtile.buffer;
-
-  /* Prepare work size. More step samples render faster, but for now we
-   * remain conservative for GPUs connected to a display to avoid driver
-   * timeouts and display freezing. */
-  int min_blocks, num_threads_per_block;
-  cuda_assert(
-      cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0));
-  if (!info.display_device) {
-    min_blocks *= 8;
-  }
-
-  uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
-
-  /* Render all samples. */
-  uint start_sample = rtile.start_sample;
-  uint end_sample = rtile.start_sample + rtile.num_samples;
-
-  for (int sample = start_sample; sample < end_sample;) {
-    /* Setup and copy work tile to device. */
-    wtile->start_sample = sample;
-    wtile->num_samples = step_samples;
-    if (task.adaptive_sampling.use) {
-      wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
-    }
-    wtile->num_samples = min(wtile->num_samples, end_sample - sample);
-    work_tiles.copy_to_device();
-
-    CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
-    uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
-    uint num_blocks = divide_up(total_work_size, num_threads_per_block);
-
-    /* Launch kernel. */
-    void *args[] = {&d_work_tiles, &total_work_size};
-
-    cuda_assert(
-        cuLaunchKernel(cuRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
-
-    /* Run the adaptive sampling kernels at selected samples aligned to step samples. */
-    uint filter_sample = sample + wtile->num_samples - 1;
-    if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
-      adaptive_sampling_filter(filter_sample, wtile, d_work_tiles);
-    }
-
-    cuda_assert(cuCtxSynchronize());
-
-    /* Update progress. */
-    sample += wtile->num_samples;
-    rtile.sample = sample;
-    task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
-
-    if (task.get_cancel()) {
-      if (task.need_finish_queue == false)
-        break;
-    }
-  }
-
-  /* Finalize adaptive sampling. */
-  if (task.adaptive_sampling.use) {
-    CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
-    adaptive_sampling_post(rtile, wtile, d_work_tiles);
-    cuda_assert(cuCtxSynchronize());
-    task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
-  }
-}
-
-void CUDADevice::thread_run(DeviceTask &task)
-{
-  CUDAContextScope scope(this);
-
-  if (task.type == DeviceTask::RENDER) {
-    device_vector<KernelWorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
-
-    /* keep rendering tiles until done */
-    RenderTile tile;
-    DenoisingTask denoising(this, task);
-
-    while (task.acquire_tile(this, tile, task.tile_types)) {
-      if (tile.task == RenderTile::PATH_TRACE) {
-        render(task, tile, work_tiles);
-      }
-      else if (tile.task == RenderTile::BAKE) {
-        render(task, tile, work_tiles);
-      }
-
-      task.release_tile(tile);
-
-      if (task.get_cancel()) {
-        if (task.need_finish_queue == false)
-          break;
-      }
-    }
-
-    work_tiles.free();
-  }
-}
-#  endif
-
 unique_ptr<DeviceQueue> CUDADevice::gpu_queue_create()
 {
   return make_unique<CUDADeviceQueue>(this);
diff --git a/intern/cycles/device/cuda/device_impl.h b/intern/cycles/device/cuda/device_impl.h
index 6b27db54ab4..c0316d18ba0 100644
--- a/intern/cycles/device/cuda/device_impl.h
+++ b/intern/cycles/device/cuda/device_impl.h
@@ -26,7 +26,6 @@
 #  ifdef WITH_CUDA_DYNLOAD
 #    include "cuew.h"
 #  else
-#    include "util/util_opengl.h"
 #    include <cuda.h>
 #    include <cudaGL.h>
 #  endif
@@ -120,13 +119,13 @@ class CUDADevice : public Device {
 
   void mem_copy_to(device_memory &mem) override;
 
-  void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override;
+  void mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) override;
 
   void mem_zero(device_memory &mem) override;
 
   void mem_free(device_memory &mem) override;
 
-  device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override;
+  device_ptr mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/) override;
 
   virtual void const_copy_to(const char *name, void *host, size_t size) override;
 
diff --git a/intern/cycles/device/cuda/graphics_interop.cpp b/intern/cycles/device/cuda/graphics_interop.cpp
index e8ca8b90eae..30efefd9b6b 100644
--- a/intern/cycles/device/cuda/graphics_interop.cpp
+++ b/intern/cycles/device/cuda/graphics_interop.cpp
@@ -37,14 +37,15 @@ CUDADeviceGraphicsInterop::~CUDADeviceGraphicsInterop()
   }
 }
 
-void CUDADeviceGraphicsInterop::set_destination(
-    const DeviceGraphicsInteropDestination &destination)
+void CUDADeviceGraphicsInterop::set_display_interop(
+    const DisplayDriver::GraphicsInterop &display_interop)
 {
-  const int64_t new_buffer_area = int64_t(destination.buffer_width) * destination.buffer_height;
+  const int64_t new_buffer_area = int64_t(display_interop.buffer_width) *
+                                  display_interop.buffer_height;
 
-  need_clear_ = destination.need_clear;
+  need_clear_ = display_interop.need_clear;
 
-  if (opengl_pbo_id_ == destination.opengl_pbo_id && buffer_area_ == new_buffer_area) {
+  if (opengl_pbo_id_ == display_interop.opengl_pbo_id && buffer_area_ == new_buffer_area) {
     return;
   }
 
@@ -55,12 +56,12 @@ void CUDADeviceGraphicsInterop::set_destination(
   }
 
   const CUresult result = cuGraphicsGLRegisterBuffer(
-      &cu_graphics_resource_, destination.opengl_pbo_id, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
+      &cu_graphics_resource_, display_interop.opengl_pbo_id, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
 
   if (result != CUDA_SUCCESS) {
     LOG(ERROR) << "Error registering OpenGL buffer: " << cuewErrorString(result);
   }
 
-  opengl_pbo_id_ = destination.opengl_pbo_id;
+  opengl_pbo_id_ = display_interop.opengl_pbo_id;
   buffer_area_ = new_buffer_area;
 }
diff --git a/intern/cycles/device/cuda/graphics_interop.h b/intern/cycles/device/cuda/graphics_interop.h
index 8a70c8aa71d..ec480f20c86 100644
--- a/intern/cycles/device/cuda/graphics_interop.h
+++ b/intern/cycles/device/cuda/graphics_interop.h
@@ -41,7 +41,7 @@ class CUDADeviceGraphicsInterop : public DeviceGraphicsInterop {
   CUDADeviceGraphicsInterop &operator=(const CUDADeviceGraphicsInterop &other) = delete;
   CUDADeviceGraphicsInterop &operator=(CUDADeviceGraphicsInterop &&other) = delete;
 
-  virtual void set_destination(const DeviceGraphicsInteropDestination &destination) override;
+  virtual void set_display_interop(const DisplayDriver::GraphicsInterop &display_interop) override;
 
   virtual device_ptr map() override;
   virtual void unmap() override;
diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp
index b7f86c10553..1149a835b14 100644
--- a/intern/cycles/device/cuda/queue.cpp
+++ b/intern/cycles/device/cuda/queue.cpp
@@ -116,18 +116,18 @@ bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *ar
   }
 
   /* Launch kernel. */
-  cuda_device_assert(cuda_device_,
-                     cuLaunchKernel(cuda_kernel.function,
-                                    num_blocks,
-                                    1,
-                                    1,
-                                    num_threads_per_block,
-                                    1,
-                                    1,
-                                    shared_mem_bytes,
-                                    cuda_stream_,
-                                    args,
-                                    0));
+  assert_success(cuLaunchKernel(cuda_kernel.function,
+                                num_blocks,
+                                1,
+                                1,
+                                num_threads_per_block,
+                                1,
+                                1,
+                                shared_mem_bytes,
+                                cuda_stream_,
+                                args,
+                                0),
+                 "enqueue");
 
   return !(cuda_device_->have_error());
 }
@@ -139,7 +139,8 @@ bool CUDADeviceQueue::synchronize()
   }
 
   const CUDAContextScope scope(cuda_device_);
-  cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_));
+  assert_success(cuStreamSynchronize(cuda_stream_), "synchronize");
+
   debug_synchronize();
 
   return !(cuda_device_->have_error());
@@ -162,9 +163,9 @@ void CUDADeviceQueue::zero_to_device(device_memory &mem)
   assert(mem.device_pointer != 0);
 
   const CUDAContextScope scope(cuda_device_);
-  cuda_device_assert(
-      cuda_device_,
-      cuMemsetD8Async((CUdeviceptr)mem.device_pointer, 0, mem.memory_size(), cuda_stream_));
+  assert_success(
+      cuMemsetD8Async((CUdeviceptr)mem.device_pointer, 0, mem.memory_size(), cuda_stream_),
+      "zero_to_device");
 }
 
 void CUDADeviceQueue::copy_to_device(device_memory &mem)
@@ -185,10 +186,10 @@
 
   /* Copy memory to device. */
   const CUDAContextScope scope(cuda_device_);
-  cuda_device_assert(
-      cuda_device_,
+  assert_success(
       cuMemcpyHtoDAsync(
-          (CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size(), cuda_stream_));
+          (CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size(), cuda_stream_),
+      "copy_to_device");
 }
 
 void CUDADeviceQueue::copy_from_device(device_memory &mem)
@@ -204,10 +205,19 @@
 
   /* Copy memory from device. */
   const CUDAContextScope scope(cuda_device_);
-  cuda_device_assert(
-      cuda_device_,
+  assert_success(
       cuMemcpyDtoHAsync(
-          mem.host_pointer, (CUdeviceptr)mem.device_pointer, mem.memory_size(), cuda_stream_));
+          mem.host_pointer, (CUdeviceptr)mem.device_pointer, mem.memory_size(), cuda_stream_),
+      "copy_from_device");
+}
+
+void CUDADeviceQueue::assert_success(CUresult result, const char *operation)
+{
+  if (result != CUDA_SUCCESS) {
+    const char *name = cuewErrorString(result);
+    cuda_device_->set_error(string_printf(
+        "%s in CUDA queue %s (%s)", name, operation, debug_active_kernels().c_str()));
+  }
 }
 
 unique_ptr<DeviceGraphicsInterop> CUDADeviceQueue::graphics_interop_create()
diff --git a/intern/cycles/device/cuda/queue.h b/intern/cycles/device/cuda/queue.h
index 62e3aa3d6c2..4d1995ed69e 100644
--- a/intern/cycles/device/cuda/queue.h
+++ b/intern/cycles/device/cuda/queue.h
@@ -60,6 +60,8 @@ class CUDADeviceQueue : public DeviceQueue {
  protected:
   CUDADevice *cuda_device_;
   CUstream cuda_stream_;
+
+  void assert_success(CUresult result, const char *operation);
 };
 
 CCL_NAMESPACE_END
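Editor's note, not part of the diff above: the queue.cpp changes replace the `cuda_device_assert` macro with a `CUDADeviceQueue::assert_success()` helper that names the failing queue operation. The sketch below is a minimal standalone illustration of the same CUresult-checking idiom, written against the plain CUDA driver API; the helper name `report_failure()` and the `main()` scaffolding are hypothetical, while Cycles itself routes the message through `Device::set_error()` and `cuewErrorString()` as shown in the diff.

```cpp
/* Illustrative only: check a CUresult and report which operation failed,
 * mirroring the assert_success() pattern above. cuGetErrorString() is the
 * driver-API counterpart of the cuewErrorString() wrapper used by Cycles. */
#include <cuda.h>

#include <cstdio>

static bool report_failure(const CUresult result, const char *operation)
{
  if (result == CUDA_SUCCESS) {
    return false;
  }
  const char *name = nullptr;
  cuGetErrorString(result, &name); /* Human-readable name, e.g. "CUDA_ERROR_INVALID_VALUE". */
  fprintf(stderr, "%s in CUDA queue %s\n", name ? name : "unknown CUDA error", operation);
  return true;
}

int main()
{
  if (report_failure(cuInit(0), "init")) {
    return 1;
  }
  /* Context/stream creation and kernel launches would be checked the same way,
   * e.g. report_failure(cuStreamSynchronize(stream), "synchronize"). */
  return 0;
}
```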