/*
 * Copyright 2011-2013 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifdef WITH_CUDA

#  include "device/cuda/queue.h"

#  include "device/cuda/device_impl.h"
#  include "device/cuda/graphics_interop.h"
#  include "device/cuda/kernel.h"

CCL_NAMESPACE_BEGIN

/* CUDADeviceQueue */

CUDADeviceQueue::CUDADeviceQueue(CUDADevice *device)
    : DeviceQueue(device), cuda_device_(device), cuda_stream_(nullptr)
{
  const CUDAContextScope scope(cuda_device_);
  cuda_device_assert(cuda_device_, cuStreamCreate(&cuda_stream_, CU_STREAM_NON_BLOCKING));
}

CUDADeviceQueue::~CUDADeviceQueue()
{
  const CUDAContextScope scope(cuda_device_);
  cuStreamDestroy(cuda_stream_);
}

int CUDADeviceQueue::num_concurrent_states(const size_t state_size) const
{
  /* Use a multiple of the maximum number of resident threads, with a lower bound,
   * to keep the GPU busy. */
  int num_states = max(cuda_device_->get_num_multiprocessors() *
                           cuda_device_->get_max_num_threads_per_multiprocessor() * 16,
                       1048576);

  const char *factor_str = getenv("CYCLES_CONCURRENT_STATES_FACTOR");
  if (factor_str) {
    num_states = max((int)(num_states * atof(factor_str)), 1024);
  }

  VLOG(3) << "GPU queue concurrent states: " << num_states << ", using up to "
          << string_human_readable_size(num_states * state_size);

  return num_states;
}

int CUDADeviceQueue::num_concurrent_busy_states() const
{
  const int max_num_threads = cuda_device_->get_num_multiprocessors() *
                              cuda_device_->get_max_num_threads_per_multiprocessor();

  /* Fall back to a fixed default when the device has not reported its thread counts. */
  if (max_num_threads == 0) {
    return 65536;
  }

  return 4 * max_num_threads;
}

void CUDADeviceQueue::init_execution()
{
  /* Synchronize all textures and memory copies before executing task. */
  CUDAContextScope scope(cuda_device_);
  cuda_device_->load_texture_info();
  cuda_device_assert(cuda_device_, cuCtxSynchronize());

  debug_init_execution();
}

bool CUDADeviceQueue::kernel_available(DeviceKernel kernel) const
{
  return cuda_device_->kernels.available(kernel);
}

bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[])
{
  if (cuda_device_->have_error()) {
    return false;
  }

  debug_enqueue(kernel, work_size);

  const CUDAContextScope scope(cuda_device_);
  const CUDADeviceKernel &cuda_kernel = cuda_device_->kernels.get(kernel);

  /* Compute kernel launch parameters. */
  const int num_threads_per_block = cuda_kernel.num_threads_per_block;
  const int num_blocks = divide_up(work_size, num_threads_per_block);

  int shared_mem_bytes = 0;

  switch (kernel) {
    case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
      /* See parallel_active_index.h for why this amount of shared memory is needed. */
      shared_mem_bytes = (num_threads_per_block + 1) * sizeof(int);
      break;
    default:
      break;
  }

  /* Launch kernel. */
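  /* The launch is one-dimensional: work items are distributed along X only, so the
   * Y and Z dimensions of both the grid and the block stay at 1. */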
  cuda_device_assert(cuda_device_,
                     cuLaunchKernel(cuda_kernel.function,
                                    num_blocks,
                                    1,
                                    1,
                                    num_threads_per_block,
                                    1,
                                    1,
                                    shared_mem_bytes,
                                    cuda_stream_,
                                    args,
                                    0));

  return !(cuda_device_->have_error());
}

bool CUDADeviceQueue::synchronize()
{
  if (cuda_device_->have_error()) {
    return false;
  }

  const CUDAContextScope scope(cuda_device_);
  cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_));
  debug_synchronize();

  return !(cuda_device_->have_error());
}

void CUDADeviceQueue::zero_to_device(device_memory &mem)
{
  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);

  if (mem.memory_size() == 0) {
    return;
  }

  /* Allocate on demand. */
  if (mem.device_pointer == 0) {
    cuda_device_->mem_alloc(mem);
  }

  /* Zero memory on device. */
  assert(mem.device_pointer != 0);

  const CUDAContextScope scope(cuda_device_);
  cuda_device_assert(
      cuda_device_,
      cuMemsetD8Async((CUdeviceptr)mem.device_pointer, 0, mem.memory_size(), cuda_stream_));
}

void CUDADeviceQueue::copy_to_device(device_memory &mem)
{
  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);

  if (mem.memory_size() == 0) {
    return;
  }

  /* Allocate on demand. */
  if (mem.device_pointer == 0) {
    cuda_device_->mem_alloc(mem);
  }

  assert(mem.device_pointer != 0);
  assert(mem.host_pointer != nullptr);

  /* Copy memory to device. */
  const CUDAContextScope scope(cuda_device_);
  cuda_device_assert(
      cuda_device_,
      cuMemcpyHtoDAsync(
          (CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size(), cuda_stream_));
}

void CUDADeviceQueue::copy_from_device(device_memory &mem)
{
  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);

  if (mem.memory_size() == 0) {
    return;
  }

  assert(mem.device_pointer != 0);
  assert(mem.host_pointer != nullptr);

  /* Copy memory from device. */
  const CUDAContextScope scope(cuda_device_);
  cuda_device_assert(
      cuda_device_,
      cuMemcpyDtoHAsync(
          mem.host_pointer, (CUdeviceptr)mem.device_pointer, mem.memory_size(), cuda_stream_));
}

unique_ptr<DeviceGraphicsInterop> CUDADeviceQueue::graphics_interop_create()
{
  return make_unique<CUDADeviceGraphicsInterop>(this);
}

CCL_NAMESPACE_END

#endif /* WITH_CUDA */