Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMai Lavelle <mai.lavelle@gmail.com>2017-03-04 14:29:01 +0300
committerMai Lavelle <mai.lavelle@gmail.com>2017-03-08 09:31:30 +0300
commit306034790fb75ca2c182d55132287cc0ed4b9c2f (patch)
tree272a582174d84ebcb25c0bf3a4922d29b6c0b96f /intern/cycles/device
parent997e345bd25bf28a8a5d67d5ffcb5b70ff52ecdd (diff)
Cycles: Calculate size of split state buffer kernel side
By calculating the size of the state buffer in the kernel rather than the host less code is needed and the size actually reflects the requested features. Will also be a little faster in some cases because of larger global work size.
Diffstat (limited to 'intern/cycles/device')
-rw-r--r--intern/cycles/device/device_cpu.cpp11
-rw-r--r--intern/cycles/device/device_cuda.cpp43
-rw-r--r--intern/cycles/device/device_split_kernel.cpp13
-rw-r--r--intern/cycles/device/device_split_kernel.h5
-rw-r--r--intern/cycles/device/opencl/opencl_split.cpp47
5 files changed, 102 insertions, 17 deletions
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index b4d470747c2..1589bbe1a33 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -71,7 +71,8 @@ public:
virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
virtual int2 split_kernel_local_size();
- virtual int2 split_kernel_global_size(DeviceTask *task);
+ virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
+ virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
};
class CPUDevice : public Device
@@ -854,11 +855,17 @@ int2 CPUSplitKernel::split_kernel_local_size()
return make_int2(1, 1);
}
-int2 CPUSplitKernel::split_kernel_global_size(DeviceTask *task) {
+int2 CPUSplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask *task) {
/* TODO(mai): this needs investigation but cpu gives incorrect render if global size doesnt match tile size */
return task->requested_tile_size;
}
+size_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device_memory& /*data*/, size_t num_threads) {
+ KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
+
+ return split_data_buffer_size(kg, num_threads);
+}
+
unordered_map<string, void*> CPUDevice::kernel_functions;
Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 9776e82ca2a..0204f0ed960 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -89,6 +89,8 @@ class CUDASplitKernel : public DeviceSplitKernel {
public:
explicit CUDASplitKernel(CUDADevice *device);
+ virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
+
virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
RenderTile& rtile,
int num_global_elements,
@@ -102,7 +104,7 @@ public:
virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
virtual int2 split_kernel_local_size();
- virtual int2 split_kernel_global_size(DeviceTask *task);
+ virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
};
class CUDADevice : public Device
@@ -1471,6 +1473,43 @@ CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device)
{
}
+size_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory& /*data*/, size_t num_threads)
+{
+ device_vector<uint> size_buffer;
+ size_buffer.resize(1);
+ device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE);
+
+ device->cuda_push_context();
+
+ uint threads = num_threads;
+ CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer);
+
+ struct args_t {
+ uint* num_threads;
+ CUdeviceptr* size;
+ };
+
+ args_t args = {
+ &threads,
+ &d_size
+ };
+
+ CUfunction state_buffer_size;
+ cuda_assert(cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size"));
+
+ cuda_assert(cuLaunchKernel(state_buffer_size,
+ 1, 1, 1,
+ 1, 1, 1,
+ 0, 0, &args, 0));
+
+ device->cuda_pop_context();
+
+ device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint));
+ device->mem_free(size_buffer);
+
+ return *size_buffer.get_data();
+}
+
bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
RenderTile& rtile,
int num_global_elements,
@@ -1573,7 +1612,7 @@ int2 CUDASplitKernel::split_kernel_local_size()
return make_int2(32, 1);
}
-int2 CUDASplitKernel::split_kernel_global_size(DeviceTask */*task*/)
+int2 CUDASplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask */*task*/)
{
/* TODO(mai): implement something here to detect ideal work size */
return make_int2(256, 256);
diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp
index f16fb6a1ea1..799479ddb6a 100644
--- a/intern/cycles/device/device_split_kernel.cpp
+++ b/intern/cycles/device/device_split_kernel.cpp
@@ -90,9 +90,9 @@ bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_fe
return true;
}
-size_t DeviceSplitKernel::max_elements_for_max_buffer_size(size_t max_buffer_size, size_t passes_size)
+size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, size_t max_buffer_size)
{
- size_t size_per_element = split_data_buffer_size(1024, current_max_closure, passes_size) / 1024;
+ size_t size_per_element = state_buffer_size(kg, data, 1024) / 1024;
return max_buffer_size / size_per_element;
}
@@ -113,13 +113,10 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
local_size[1] = lsize[1];
}
- /* Calculate per_thread_output_buffer_size. */
- size_t per_thread_output_buffer_size = task->passes_size;
-
/* Set gloabl size */
size_t global_size[2];
{
- int2 gsize = split_kernel_global_size(task);
+ int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task);
/* Make sure that set work size is a multiple of local
* work size dimensions.
@@ -153,9 +150,7 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
ray_state.resize(num_global_elements);
device->mem_alloc("ray_state", ray_state, MEM_READ_WRITE);
- split_data.resize(split_data_buffer_size(num_global_elements,
- current_max_closure,
- per_thread_output_buffer_size));
+ split_data.resize(state_buffer_size(kgbuffer, kernel_data, num_global_elements));
device->mem_alloc("split_data", split_data, MEM_READ_WRITE);
}
diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h
index 1c6a2709cf2..cc3e1aa26ae 100644
--- a/intern/cycles/device/device_split_kernel.h
+++ b/intern/cycles/device/device_split_kernel.h
@@ -100,7 +100,8 @@ public:
device_memory& kgbuffer,
device_memory& kernel_data);
- size_t max_elements_for_max_buffer_size(size_t max_buffer_size, size_t passes_size);
+ virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads) = 0;
+ size_t max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, size_t max_buffer_size);
virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
RenderTile& rtile,
@@ -115,7 +116,7 @@ public:
virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&) = 0;
virtual int2 split_kernel_local_size() = 0;
- virtual int2 split_kernel_global_size(DeviceTask *task) = 0;
+ virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task) = 0;
};
CCL_NAMESPACE_END
diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp
index 7e04c6fac2c..a44f5da3a32 100644
--- a/intern/cycles/device/opencl/opencl_split.cpp
+++ b/intern/cycles/device/opencl/opencl_split.cpp
@@ -60,6 +60,7 @@ class OpenCLDeviceSplitKernel : public OpenCLDeviceBase
public:
DeviceSplitKernel *split_kernel;
OpenCLProgram program_data_init;
+ OpenCLProgram program_state_buffer_size;
OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_);
@@ -83,6 +84,13 @@ public:
program_data_init.add_kernel(ustring("path_trace_data_init"));
programs.push_back(&program_data_init);
+ program_state_buffer_size = OpenCLDeviceBase::OpenCLProgram(this,
+ "split_state_buffer_size",
+ "kernel_state_buffer_size.cl",
+ get_build_options(this, requested_features));
+ program_state_buffer_size.add_kernel(ustring("path_trace_state_buffer_size"));
+ programs.push_back(&program_state_buffer_size);
+
return split_kernel->load_kernels(requested_features);
}
@@ -216,6 +224,41 @@ public:
return kernel;
}
+ virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads)
+ {
+ device_vector<uint> size_buffer;
+ size_buffer.resize(1);
+ device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE);
+
+ uint threads = num_threads;
+ device->kernel_set_args(device->program_state_buffer_size(), 0, kg, data, threads, size_buffer);
+
+ size_t global_size = 64;
+ device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
+ device->program_state_buffer_size(),
+ 1,
+ NULL,
+ &global_size,
+ NULL,
+ 0,
+ NULL,
+ NULL);
+
+ device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
+
+ device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint));
+ device->mem_free(size_buffer);
+
+ if(device->ciErr != CL_SUCCESS) {
+ string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
+ clewErrorString(device->ciErr));
+ device->opencl_error(message);
+ return 0;
+ }
+
+ return *size_buffer.get_data();
+ }
+
virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
RenderTile& rtile,
int num_global_elements,
@@ -298,7 +341,7 @@ public:
return make_int2(64, 1);
}
- virtual int2 split_kernel_global_size(DeviceTask *task)
+ virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask */*task*/)
{
size_t max_buffer_size;
clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &max_buffer_size, NULL);
@@ -306,7 +349,7 @@ public:
<< string_human_readable_number(max_buffer_size) << " bytes. ("
<< string_human_readable_size(max_buffer_size) << ").";
- size_t num_elements = max_elements_for_max_buffer_size(max_buffer_size / 2, task->passes_size);
+ size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size / 2);
int2 global_size = make_int2(round_down((int)sqrt(num_elements), 64), (int)sqrt(num_elements));
VLOG(1) << "Global size: " << global_size << ".";
return global_size;