Cycles: Pass all buffers to each kernel call for OpenCL

Technically not passing all buffers used by a kernel is undefined behavior. We haven't had any issues with this so far on AMD or Nvidia, but it's known to be a problem with Intel and we received a report from AMD that this is a problem on newer hardware, so we need to make this change at some point. Unfortunately there a cost to being correct, about 5% for the benchmark scenes. For low sample counts it's even worse, I've seen up to 50% slowdown. For the latter case I think adjusting tile updating logic can help, but not sure what that would look like yet (it would be just a few lines change however).
author: Mai Lavelle <mai.lavelle@gmail.com> 2017-05-12 02:23:49 +0300
committer: Mai Lavelle <mai.lavelle@gmail.com> 2017-06-10 11:08:49 +0300
commit: eb293f59f2eb9847b8fd593ac2dde2781ac8ace1 (patch)
tree: 7a939dbac4abcb34e10b9c5e1ac2c9c09dcc9b36 /intern/cycles/device
parent: 6238214159a4229ded91cd36d14c8e55ff427c28 (diff)
1 files changed, 59 insertions, 4 deletions
diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp
index 76dcbd6fc9a..08b632ee9d3 100644
--- a/intern/cycles/device/opencl/opencl_split.cpp
+++ b/intern/cycles/device/opencl/opencl_split.cpp
@@ -176,17 +176,62 @@ protected:
 	friend class OpenCLSplitKernelFunction;
 };
 
+struct CachedSplitMemory {
+	int id;
+	device_memory *split_data;
+	device_memory *ray_state;
+	device_ptr *rng_state;
+	device_memory *queue_index;
+	device_memory *use_queues_flag;
+	device_memory *work_pools;
+	device_ptr *buffer;
+};
+
 class OpenCLSplitKernelFunction : public SplitKernelFunction {
 public:
 	OpenCLDeviceSplitKernel* device;
 	OpenCLDeviceBase::OpenCLProgram program;
+	CachedSplitMemory& cached_memory;
+	int cached_id;
+
+	OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device, CachedSplitMemory& cached_memory) :
+			device(device), cached_memory(cached_memory), cached_id(cached_memory.id-1)
+	{
+	}
 
-	OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device) : device(device) {}
-	~OpenCLSplitKernelFunction() { program.release(); }
+	~OpenCLSplitKernelFunction()
+	{
+		program.release();
+	}
 
 	virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data)
 	{
-		device->kernel_set_args(program(), 0, kg, data);
+		if(cached_id != cached_memory.id) {
+			cl_uint start_arg_index =
+				device->kernel_set_args(program(),
+					            0,
+					            kg,
+					            data,
+					            *cached_memory.split_data,
+					            *cached_memory.ray_state,
+					            *cached_memory.rng_state);
+
+/* TODO(sergey): Avoid map lookup here. */
+#define KERNEL_TEX(type, ttype, name) \
+				device->set_kernel_arg_mem(program(), &start_arg_index, #name);
+#include "kernel/kernel_textures.h"
+#undef KERNEL_TEX
+
+			start_arg_index +=
+				device->kernel_set_args(program(),
+					            start_arg_index,
+					            *cached_memory.queue_index,
+					            *cached_memory.use_queues_flag,
+					            *cached_memory.work_pools,
+					            *cached_memory.buffer);
+
+			cached_id = cached_memory.id;
+		}
 
 		device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
 		                                       program(),
@@ -213,6 +258,7 @@ public:
 
 class OpenCLSplitKernel : public DeviceSplitKernel {
 	OpenCLDeviceSplitKernel *device;
+	CachedSplitMemory cached_memory;
 public:
 	explicit OpenCLSplitKernel(OpenCLDeviceSplitKernel *device) : DeviceSplitKernel(device), device(device) {
 	}
@@ -220,7 +266,7 @@ public:
 	virtual SplitKernelFunction* get_split_kernel_function(string kernel_name,
 	                                                       const DeviceRequestedFeatures& requested_features)
 	{
-		OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device);
+		OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device, cached_memory);
 
 		bool single_program = OpenCLInfo::use_single_program();
 		kernel->program =
@@ -349,6 +395,15 @@ public:
 			return false;
 		}
 
+		cached_memory.split_data = &split_data;
+		cached_memory.ray_state = &ray_state;
+		cached_memory.rng_state = &rtile.rng_state;
+		cached_memory.queue_index = &queue_index;
+		cached_memory.use_queues_flag = &use_queues_flag;
+		cached_memory.work_pools = &work_pool_wgs;
+		cached_memory.buffer = &rtile.buffer;
+		cached_memory.id++;
+
 		return true;
 	}
author	Mai Lavelle <mai.lavelle@gmail.com>	2017-05-12 02:23:49 +0300
committer	Mai Lavelle <mai.lavelle@gmail.com>	2017-06-10 11:08:49 +0300
commit	eb293f59f2eb9847b8fd593ac2dde2781ac8ace1 (patch)
tree	7a939dbac4abcb34e10b9c5e1ac2c9c09dcc9b36 /intern/cycles/device
parent	6238214159a4229ded91cd36d14c8e55ff427c28 (diff)