Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMai Lavelle <mai.lavelle@gmail.com>2017-03-04 14:29:01 +0300
committerMai Lavelle <mai.lavelle@gmail.com>2017-03-04 14:37:56 +0300
commit60ad21badbeff1584418f632b90254943a8b5bf8 (patch)
tree30df8312be14f1ca472a747b78d2aaa11e42c5ff /intern/cycles/device/device_cuda.cpp
parent574c12adfadb7ffbd9fecfd9888e485c48dd6c7b (diff)
Cycles: Calculate size of split state buffer kernel side
By calculating the size of the state buffer in the kernel rather than on the host, less code is needed and the size actually reflects the requested features. Will also be a little faster in some cases because of larger global work size.
Diffstat (limited to 'intern/cycles/device/device_cuda.cpp')
-rw-r--r--intern/cycles/device/device_cuda.cpp43
1 file changed, 41 insertions, 2 deletions
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 52f1b2a2a15..e6596a624d0 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -88,6 +88,8 @@ class CUDASplitKernel : public DeviceSplitKernel {
public:
explicit CUDASplitKernel(CUDADevice *device);
+ virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
+
virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
RenderTile& rtile,
int num_global_elements,
@@ -101,7 +103,7 @@ public:
virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
virtual int2 split_kernel_local_size();
- virtual int2 split_kernel_global_size(DeviceTask *task);
+ virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
};
class CUDADevice : public Device
@@ -1470,6 +1472,43 @@ CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device)
{
}
+size_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory& /*data*/, size_t num_threads)
+{
+ device_vector<uint> size_buffer;
+ size_buffer.resize(1);
+ device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE);
+
+ device->cuda_push_context();
+
+ uint threads = num_threads;
+ CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer);
+
+ struct args_t {
+ uint* num_threads;
+ CUdeviceptr* size;
+ };
+
+ args_t args = {
+ &threads,
+ &d_size
+ };
+
+ CUfunction state_buffer_size;
+ cuda_assert(cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size"));
+
+ cuda_assert(cuLaunchKernel(state_buffer_size,
+ 1, 1, 1,
+ 1, 1, 1,
+ 0, 0, &args, 0));
+
+ device->cuda_pop_context();
+
+ device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint));
+ device->mem_free(size_buffer);
+
+ return *size_buffer.get_data();
+}
+
bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
RenderTile& rtile,
int num_global_elements,
@@ -1572,7 +1611,7 @@ int2 CUDASplitKernel::split_kernel_local_size()
return make_int2(32, 1);
}
-int2 CUDASplitKernel::split_kernel_global_size(DeviceTask */*task*/)
+int2 CUDASplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask */*task*/)
{
/* TODO(mai): implement something here to detect ideal work size */
return make_int2(256, 256);