Diffstat (limited to 'intern/cycles/device/cuda/queue.cpp')
-rw-r--r--  intern/cycles/device/cuda/queue.cpp  220
1 file changed, 220 insertions, 0 deletions
diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp
new file mode 100644
index 00000000000..b7f86c10553
--- /dev/null
+++ b/intern/cycles/device/cuda/queue.cpp
@@ -0,0 +1,220 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+#  include "device/cuda/queue.h"
+
+#  include "device/cuda/device_impl.h"
+#  include "device/cuda/graphics_interop.h"
+#  include "device/cuda/kernel.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* CUDADeviceQueue */
+
+CUDADeviceQueue::CUDADeviceQueue(CUDADevice *device)
+    : DeviceQueue(device), cuda_device_(device), cuda_stream_(nullptr)
+{
+  const CUDAContextScope scope(cuda_device_);
+  cuda_device_assert(cuda_device_, cuStreamCreate(&cuda_stream_, CU_STREAM_NON_BLOCKING));
+}
+
+CUDADeviceQueue::~CUDADeviceQueue()
+{
+  const CUDAContextScope scope(cuda_device_);
+  cuStreamDestroy(cuda_stream_);
+}
+
+int CUDADeviceQueue::num_concurrent_states(const size_t state_size) const
+{
+  int num_states = max(cuda_device_->get_num_multiprocessors() *
+                           cuda_device_->get_max_num_threads_per_multiprocessor() * 16,
+                       1048576);
+
+  const char *factor_str = getenv("CYCLES_CONCURRENT_STATES_FACTOR");
+  if (factor_str) {
+    num_states = max((int)(num_states * atof(factor_str)), 1024);
+  }
+
+  VLOG(3) << "GPU queue concurrent states: " << num_states << ", using up to "
+          << string_human_readable_size(num_states * state_size);
+
+  return num_states;
+}
+
+int CUDADeviceQueue::num_concurrent_busy_states() const
+{
+  const int max_num_threads = cuda_device_->get_num_multiprocessors() *
+                              cuda_device_->get_max_num_threads_per_multiprocessor();
+
+  if (max_num_threads == 0) {
+    return 65536;
+  }
+
+  return 4 * max_num_threads;
+}
+
+void CUDADeviceQueue::init_execution()
+{
+  /* Synchronize all textures and memory copies before executing task. */
+  CUDAContextScope scope(cuda_device_);
+  cuda_device_->load_texture_info();
+  cuda_device_assert(cuda_device_, cuCtxSynchronize());
+
+  debug_init_execution();
+}
+
+bool CUDADeviceQueue::kernel_available(DeviceKernel kernel) const
+{
+  return cuda_device_->kernels.available(kernel);
+}
+
+bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[])
+{
+  if (cuda_device_->have_error()) {
+    return false;
+  }
+
+  debug_enqueue(kernel, work_size);
+
+  const CUDAContextScope scope(cuda_device_);
+  const CUDADeviceKernel &cuda_kernel = cuda_device_->kernels.get(kernel);
+
+  /* Compute kernel launch parameters. */
+  const int num_threads_per_block = cuda_kernel.num_threads_per_block;
+  const int num_blocks = divide_up(work_size, num_threads_per_block);
+
+  int shared_mem_bytes = 0;
+
+  switch (kernel) {
+    case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
+      /* See parallel_active_index.h for why this amount of shared memory is needed. */
+      shared_mem_bytes = (num_threads_per_block + 1) * sizeof(int);
+      break;
+
+    default:
+      break;
+  }
+
+  /* Launch kernel. */
+  cuda_device_assert(cuda_device_,
+                     cuLaunchKernel(cuda_kernel.function,
+                                    num_blocks,
+                                    1,
+                                    1,
+                                    num_threads_per_block,
+                                    1,
+                                    1,
+                                    shared_mem_bytes,
+                                    cuda_stream_,
+                                    args,
+                                    0));
+
+  return !(cuda_device_->have_error());
+}
+
+bool CUDADeviceQueue::synchronize()
+{
+  if (cuda_device_->have_error()) {
+    return false;
+  }
+
+  const CUDAContextScope scope(cuda_device_);
+  cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_));
+  debug_synchronize();
+
+  return !(cuda_device_->have_error());
+}
+
+void CUDADeviceQueue::zero_to_device(device_memory &mem)
+{
+  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
+
+  if (mem.memory_size() == 0) {
+    return;
+  }
+
+  /* Allocate on demand. */
+  if (mem.device_pointer == 0) {
+    cuda_device_->mem_alloc(mem);
+  }
+
+  /* Zero memory on device. */
+  assert(mem.device_pointer != 0);
+
+  const CUDAContextScope scope(cuda_device_);
+  cuda_device_assert(
+      cuda_device_,
+      cuMemsetD8Async((CUdeviceptr)mem.device_pointer, 0, mem.memory_size(), cuda_stream_));
+}
+
+void CUDADeviceQueue::copy_to_device(device_memory &mem)
+{
+  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
+
+  if (mem.memory_size() == 0) {
+    return;
+  }
+
+  /* Allocate on demand. */
+  if (mem.device_pointer == 0) {
+    cuda_device_->mem_alloc(mem);
+  }
+
+  assert(mem.device_pointer != 0);
+  assert(mem.host_pointer != nullptr);
+
+  /* Copy memory to device. */
+  const CUDAContextScope scope(cuda_device_);
+  cuda_device_assert(
+      cuda_device_,
+      cuMemcpyHtoDAsync(
+          (CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size(), cuda_stream_));
+}
+
+void CUDADeviceQueue::copy_from_device(device_memory &mem)
+{
+  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
+
+  if (mem.memory_size() == 0) {
+    return;
+  }
+
+  assert(mem.device_pointer != 0);
+  assert(mem.host_pointer != nullptr);
+
+  /* Copy memory from device. */
+  const CUDAContextScope scope(cuda_device_);
+  cuda_device_assert(
+      cuda_device_,
+      cuMemcpyDtoHAsync(
+          mem.host_pointer, (CUdeviceptr)mem.device_pointer, mem.memory_size(), cuda_stream_));
+}
+
+unique_ptr<DeviceGraphicsInterop> CUDADeviceQueue::graphics_interop_create()
+{
+  return make_unique<CUDADeviceGraphicsInterop>(this);
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_CUDA */
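As an aside for readers new to the CUDA driver API: the stand-alone sketch below reproduces the launch pattern used by CUDADeviceQueue::enqueue() and synchronize() above (a non-blocking stream, a 1D grid sized with divide_up, an argument pointer array passed to cuLaunchKernel) outside of the Cycles codebase. It is illustrative only and not part of the patch; the module path "kernels.ptx" and kernel name "kernel_1d" are placeholders, and error checking on the driver calls is omitted for brevity.

// Minimal driver-API analogue of the launch/synchronize pattern above.
// Placeholder module and kernel names; real code should check every CUresult.
#include <cuda.h>

/* Same grid-sizing helper Cycles uses for a 1D work size. */
static int divide_up(int x, int y)
{
  return (x + y - 1) / y;
}

int main()
{
  cuInit(0);

  CUdevice device;
  cuDeviceGet(&device, 0);

  CUcontext context;
  cuCtxCreate(&context, 0, device);

  /* Non-blocking stream, as created in the CUDADeviceQueue constructor. */
  CUstream stream;
  cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING);

  /* Load a compiled module and look up a kernel (placeholder names). */
  CUmodule module;
  cuModuleLoad(&module, "kernels.ptx");
  CUfunction function;
  cuModuleGetFunction(&function, module, "kernel_1d");

  /* Launch parameters derived from a 1D work size, as in enqueue(). */
  const int work_size = 1 << 20;
  const int num_threads_per_block = 256;
  const int num_blocks = divide_up(work_size, num_threads_per_block);

  int n = work_size;
  void *args[] = {&n}; /* One kernel parameter: the work size. */

  cuLaunchKernel(function,
                 num_blocks, 1, 1,            /* Grid dimensions. */
                 num_threads_per_block, 1, 1, /* Block dimensions. */
                 0,                           /* Shared memory bytes. */
                 stream, args, nullptr);

  /* Equivalent of CUDADeviceQueue::synchronize(). */
  cuStreamSynchronize(stream);

  cuStreamDestroy(stream);
  cuCtxDestroy(context);
  return 0;
}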