Diffstat (limited to 'intern/cycles/device/cuda/queue.cpp')
-rw-r--r--  intern/cycles/device/cuda/queue.cpp  220
1 file changed, 220 insertions, 0 deletions
diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp
new file mode 100644
index 00000000000..b7f86c10553
--- /dev/null
+++ b/intern/cycles/device/cuda/queue.cpp
@@ -0,0 +1,220 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+#  include "device/cuda/queue.h"
+
+#  include "device/cuda/device_impl.h"
+#  include "device/cuda/graphics_interop.h"
+#  include "device/cuda/kernel.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* CUDADeviceQueue */
+
+CUDADeviceQueue::CUDADeviceQueue(CUDADevice *device)
+    : DeviceQueue(device), cuda_device_(device), cuda_stream_(nullptr)
+{
+  const CUDAContextScope scope(cuda_device_);
+  cuda_device_assert(cuda_device_, cuStreamCreate(&cuda_stream_, CU_STREAM_NON_BLOCKING));
+}
+
+CUDADeviceQueue::~CUDADeviceQueue()
+{
+  const CUDAContextScope scope(cuda_device_);
+  cuStreamDestroy(cuda_stream_);
+}
+
+int CUDADeviceQueue::num_concurrent_states(const size_t state_size) const
+{
+  int num_states = max(cuda_device_->get_num_multiprocessors() *
+                           cuda_device_->get_max_num_threads_per_multiprocessor() * 16,
+                       1048576);
+
+  const char *factor_str = getenv("CYCLES_CONCURRENT_STATES_FACTOR");
+  if (factor_str) {
+    num_states = max((int)(num_states * atof(factor_str)), 1024);
+  }
+
+  VLOG(3) << "GPU queue concurrent states: " << num_states << ", using up to "
+          << string_human_readable_size(num_states * state_size);
+
+  return num_states;
+}
+
+int CUDADeviceQueue::num_concurrent_busy_states() const
+{
+  const int max_num_threads = cuda_device_->get_num_multiprocessors() *
+                              cuda_device_->get_max_num_threads_per_multiprocessor();
+
+  if (max_num_threads == 0) {
+    return 65536;
+  }
+
+  return 4 * max_num_threads;
+}
+
+void CUDADeviceQueue::init_execution()
+{
+  /* Synchronize all textures and memory copies before executing task. */
+  CUDAContextScope scope(cuda_device_);
+  cuda_device_->load_texture_info();
+  cuda_device_assert(cuda_device_, cuCtxSynchronize());
+
+  debug_init_execution();
+}
+
+bool CUDADeviceQueue::kernel_available(DeviceKernel kernel) const
+{
+  return cuda_device_->kernels.available(kernel);
+}
+
+bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[])
+{
+  if (cuda_device_->have_error()) {
+    return false;
+  }
+
+  debug_enqueue(kernel, work_size);
+
+  const CUDAContextScope scope(cuda_device_);
+  const CUDADeviceKernel &cuda_kernel = cuda_device_->kernels.get(kernel);
+
+  /* Compute kernel launch parameters. */
+  const int num_threads_per_block = cuda_kernel.num_threads_per_block;
+  const int num_blocks = divide_up(work_size, num_threads_per_block);
+
+  int shared_mem_bytes = 0;
+
+  switch (kernel) {
+    case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
+      /* See parallel_active_index.h for why this amount of shared memory is needed. */
+      shared_mem_bytes = (num_threads_per_block + 1) * sizeof(int);
+      break;
+
+    default:
+      break;
+  }
+
+  /* Launch kernel. */
+  cuda_device_assert(cuda_device_,
+                     cuLaunchKernel(cuda_kernel.function,
+                                    num_blocks,
+                                    1,
+                                    1,
+                                    num_threads_per_block,
+                                    1,
+                                    1,
+                                    shared_mem_bytes,
+                                    cuda_stream_,
+                                    args,
+                                    0));
+
+  return !(cuda_device_->have_error());
+}
+
+bool CUDADeviceQueue::synchronize()
+{
+  if (cuda_device_->have_error()) {
+    return false;
+  }
+
+  const CUDAContextScope scope(cuda_device_);
+  cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_));
+  debug_synchronize();
+
+  return !(cuda_device_->have_error());
+}
+
+void CUDADeviceQueue::zero_to_device(device_memory &mem)
+{
+  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
+
+  if (mem.memory_size() == 0) {
+    return;
+  }
+
+  /* Allocate on demand. */
+  if (mem.device_pointer == 0) {
+    cuda_device_->mem_alloc(mem);
+  }
+
+  /* Zero memory on device. */
+  assert(mem.device_pointer != 0);
+
+  const CUDAContextScope scope(cuda_device_);
+  cuda_device_assert(
+      cuda_device_,
+      cuMemsetD8Async((CUdeviceptr)mem.device_pointer, 0, mem.memory_size(), cuda_stream_));
+}
+
+void CUDADeviceQueue::copy_to_device(device_memory &mem)
+{
+  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
+
+  if (mem.memory_size() == 0) {
+    return;
+  }
+
+  /* Allocate on demand. */
+  if (mem.device_pointer == 0) {
+    cuda_device_->mem_alloc(mem);
+  }
+
+  assert(mem.device_pointer != 0);
+  assert(mem.host_pointer != nullptr);
+
+  /* Copy memory to device. */
+  const CUDAContextScope scope(cuda_device_);
+  cuda_device_assert(
+      cuda_device_,
+      cuMemcpyHtoDAsync(
+          (CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size(), cuda_stream_));
+}
+
+void CUDADeviceQueue::copy_from_device(device_memory &mem)
+{
+  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
+
+  if (mem.memory_size() == 0) {
+    return;
+  }
+
+  assert(mem.device_pointer != 0);
+  assert(mem.host_pointer != nullptr);
+
+  /* Copy memory from device. */
+  const CUDAContextScope scope(cuda_device_);
+  cuda_device_assert(
+      cuda_device_,
+      cuMemcpyDtoHAsync(
+          mem.host_pointer, (CUdeviceptr)mem.device_pointer, mem.memory_size(), cuda_stream_));
+}
+
+unique_ptr<DeviceGraphicsInterop> CUDADeviceQueue::graphics_interop_create()
+{
+  return make_unique<CUDADeviceGraphicsInterop>(this);
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_CUDA */
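As an aside for readers new to the CUDA driver API: the stand-alone sketch below reproduces the launch pattern used by CUDADeviceQueue::enqueue() and synchronize() above (a non-blocking stream, a 1D grid sized with divide_up, an argument pointer array passed to cuLaunchKernel) outside of the Cycles codebase. It is illustrative only and not part of the patch; the module path "kernels.ptx" and kernel name "kernel_1d" are placeholders, and error checking on the driver calls is omitted for brevity.

// Minimal driver-API analogue of the launch/synchronize pattern above.
// Placeholder module and kernel names; real code should check every CUresult.
#include <cuda.h>

/* Same grid-sizing helper Cycles uses for a 1D work size. */
static int divide_up(int x, int y)
{
  return (x + y - 1) / y;
}

int main()
{
  cuInit(0);

  CUdevice device;
  cuDeviceGet(&device, 0);

  CUcontext context;
  cuCtxCreate(&context, 0, device);

  /* Non-blocking stream, as created in the CUDADeviceQueue constructor. */
  CUstream stream;
  cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING);

  /* Load a compiled module and look up a kernel (placeholder names). */
  CUmodule module;
  cuModuleLoad(&module, "kernels.ptx");
  CUfunction function;
  cuModuleGetFunction(&function, module, "kernel_1d");

  /* Launch parameters derived from a 1D work size, as in enqueue(). */
  const int work_size = 1 << 20;
  const int num_threads_per_block = 256;
  const int num_blocks = divide_up(work_size, num_threads_per_block);

  int n = work_size;
  void *args[] = {&n}; /* One kernel parameter: the work size. */

  cuLaunchKernel(function,
                 num_blocks, 1, 1,            /* Grid dimensions. */
                 num_threads_per_block, 1, 1, /* Block dimensions. */
                 0,                           /* Shared memory bytes. */
                 stream, args, nullptr);

  /* Equivalent of CUDADeviceQueue::synchronize(). */
  cuStreamSynchronize(stream);

  cuStreamDestroy(stream);
  cuCtxDestroy(context);
  return 0;
}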