Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--intern/cycles/device/device_cpu.cpp11
-rw-r--r--intern/cycles/device/device_cuda.cpp43
-rw-r--r--intern/cycles/device/device_split_kernel.cpp13
-rw-r--r--intern/cycles/device/device_split_kernel.h5
-rw-r--r--intern/cycles/device/opencl/opencl_split.cpp47
-rw-r--r--intern/cycles/kernel/CMakeLists.txt2
-rw-r--r--intern/cycles/kernel/kernels/cuda/kernel_split.cu7
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl29
-rw-r--r--intern/cycles/kernel/split/kernel_data_init.h2
-rw-r--r--intern/cycles/kernel/split/kernel_split_data.h28
10 files changed, 147 insertions, 40 deletions
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index b4d470747c2..1589bbe1a33 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -71,7 +71,8 @@ public:
virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
virtual int2 split_kernel_local_size();
- virtual int2 split_kernel_global_size(DeviceTask *task);
+ virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
+ virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
};
class CPUDevice : public Device
@@ -854,11 +855,17 @@ int2 CPUSplitKernel::split_kernel_local_size()
return make_int2(1, 1);
}
-int2 CPUSplitKernel::split_kernel_global_size(DeviceTask *task) {
+int2 CPUSplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask *task) {
/* TODO(mai): this needs investigation but cpu gives incorrect render if global size doesnt match tile size */
return task->requested_tile_size;
}
+size_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device_memory& /*data*/, size_t num_threads) {
+ KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
+
+ return split_data_buffer_size(kg, num_threads);
+}
+
unordered_map<string, void*> CPUDevice::kernel_functions;
Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 9776e82ca2a..0204f0ed960 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -89,6 +89,8 @@ class CUDASplitKernel : public DeviceSplitKernel {
public:
explicit CUDASplitKernel(CUDADevice *device);
+ virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
+
virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
RenderTile& rtile,
int num_global_elements,
@@ -102,7 +104,7 @@ public:
virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
virtual int2 split_kernel_local_size();
- virtual int2 split_kernel_global_size(DeviceTask *task);
+ virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
};
class CUDADevice : public Device
@@ -1471,6 +1473,43 @@ CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device)
{
}
+size_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory& /*data*/, size_t num_threads)
+{
+ device_vector<uint> size_buffer;
+ size_buffer.resize(1);
+ device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE);
+
+ device->cuda_push_context();
+
+ uint threads = num_threads;
+ CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer);
+
+ struct args_t {
+ uint* num_threads;
+ CUdeviceptr* size;
+ };
+
+ args_t args = {
+ &threads,
+ &d_size
+ };
+
+ CUfunction state_buffer_size;
+ cuda_assert(cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size"));
+
+ cuda_assert(cuLaunchKernel(state_buffer_size,
+ 1, 1, 1,
+ 1, 1, 1,
+ 0, 0, &args, 0));
+
+ device->cuda_pop_context();
+
+ device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint));
+ device->mem_free(size_buffer);
+
+ return *size_buffer.get_data();
+}
+
bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
RenderTile& rtile,
int num_global_elements,
@@ -1573,7 +1612,7 @@ int2 CUDASplitKernel::split_kernel_local_size()
return make_int2(32, 1);
}
-int2 CUDASplitKernel::split_kernel_global_size(DeviceTask */*task*/)
+int2 CUDASplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask */*task*/)
{
/* TODO(mai): implement something here to detect ideal work size */
return make_int2(256, 256);
diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp
index f16fb6a1ea1..799479ddb6a 100644
--- a/intern/cycles/device/device_split_kernel.cpp
+++ b/intern/cycles/device/device_split_kernel.cpp
@@ -90,9 +90,9 @@ bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_fe
return true;
}
-size_t DeviceSplitKernel::max_elements_for_max_buffer_size(size_t max_buffer_size, size_t passes_size)
+size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, size_t max_buffer_size)
{
- size_t size_per_element = split_data_buffer_size(1024, current_max_closure, passes_size) / 1024;
+ size_t size_per_element = state_buffer_size(kg, data, 1024) / 1024;
return max_buffer_size / size_per_element;
}
@@ -113,13 +113,10 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
local_size[1] = lsize[1];
}
- /* Calculate per_thread_output_buffer_size. */
- size_t per_thread_output_buffer_size = task->passes_size;
-
/* Set gloabl size */
size_t global_size[2];
{
- int2 gsize = split_kernel_global_size(task);
+ int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task);
/* Make sure that set work size is a multiple of local
* work size dimensions.
@@ -153,9 +150,7 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
ray_state.resize(num_global_elements);
device->mem_alloc("ray_state", ray_state, MEM_READ_WRITE);
- split_data.resize(split_data_buffer_size(num_global_elements,
- current_max_closure,
- per_thread_output_buffer_size));
+ split_data.resize(state_buffer_size(kgbuffer, kernel_data, num_global_elements));
device->mem_alloc("split_data", split_data, MEM_READ_WRITE);
}
diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h
index 1c6a2709cf2..cc3e1aa26ae 100644
--- a/intern/cycles/device/device_split_kernel.h
+++ b/intern/cycles/device/device_split_kernel.h
@@ -100,7 +100,8 @@ public:
device_memory& kgbuffer,
device_memory& kernel_data);
- size_t max_elements_for_max_buffer_size(size_t max_buffer_size, size_t passes_size);
+ virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads) = 0;
+ size_t max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, size_t max_buffer_size);
virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
RenderTile& rtile,
@@ -115,7 +116,7 @@ public:
virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&) = 0;
virtual int2 split_kernel_local_size() = 0;
- virtual int2 split_kernel_global_size(DeviceTask *task) = 0;
+ virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task) = 0;
};
CCL_NAMESPACE_END
diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp
index 7e04c6fac2c..a44f5da3a32 100644
--- a/intern/cycles/device/opencl/opencl_split.cpp
+++ b/intern/cycles/device/opencl/opencl_split.cpp
@@ -60,6 +60,7 @@ class OpenCLDeviceSplitKernel : public OpenCLDeviceBase
public:
DeviceSplitKernel *split_kernel;
OpenCLProgram program_data_init;
+ OpenCLProgram program_state_buffer_size;
OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_);
@@ -83,6 +84,13 @@ public:
program_data_init.add_kernel(ustring("path_trace_data_init"));
programs.push_back(&program_data_init);
+ program_state_buffer_size = OpenCLDeviceBase::OpenCLProgram(this,
+ "split_state_buffer_size",
+ "kernel_state_buffer_size.cl",
+ get_build_options(this, requested_features));
+ program_state_buffer_size.add_kernel(ustring("path_trace_state_buffer_size"));
+ programs.push_back(&program_state_buffer_size);
+
return split_kernel->load_kernels(requested_features);
}
@@ -216,6 +224,41 @@ public:
return kernel;
}
+ virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads)
+ {
+ device_vector<uint> size_buffer;
+ size_buffer.resize(1);
+ device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE);
+
+ uint threads = num_threads;
+ device->kernel_set_args(device->program_state_buffer_size(), 0, kg, data, threads, size_buffer);
+
+ size_t global_size = 64;
+ device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
+ device->program_state_buffer_size(),
+ 1,
+ NULL,
+ &global_size,
+ NULL,
+ 0,
+ NULL,
+ NULL);
+
+ device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
+
+ device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint));
+ device->mem_free(size_buffer);
+
+ if(device->ciErr != CL_SUCCESS) {
+ string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
+ clewErrorString(device->ciErr));
+ device->opencl_error(message);
+ return 0;
+ }
+
+ return *size_buffer.get_data();
+ }
+
virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
RenderTile& rtile,
int num_global_elements,
@@ -298,7 +341,7 @@ public:
return make_int2(64, 1);
}
- virtual int2 split_kernel_global_size(DeviceTask *task)
+ virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask */*task*/)
{
size_t max_buffer_size;
clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &max_buffer_size, NULL);
@@ -306,7 +349,7 @@ public:
<< string_human_readable_number(max_buffer_size) << " bytes. ("
<< string_human_readable_size(max_buffer_size) << ").";
- size_t num_elements = max_elements_for_max_buffer_size(max_buffer_size / 2, task->passes_size);
+ size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size / 2);
int2 global_size = make_int2(round_down((int)sqrt(num_elements), 64), (int)sqrt(num_elements));
VLOG(1) << "Global size: " << global_size << ".";
return global_size;
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index df40c3a0e8e..6867ab02318 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -15,6 +15,7 @@ set(SRC
kernels/cpu/kernel.cpp
kernels/cpu/kernel_split.cpp
kernels/opencl/kernel.cl
+ kernels/opencl/kernel_state_buffer_size.cl
kernels/opencl/kernel_data_init.cl
kernels/opencl/kernel_path_init.cl
kernels/opencl/kernel_queue_enqueue.cl
@@ -399,6 +400,7 @@ endif()
#delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${KERNEL_PREPROCESSED}" ${CYCLES_INSTALL_PATH}/kernel)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_state_buffer_size.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_data_init.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_path_init.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_queue_enqueue.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
index 53a36b15e40..759475b175f 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
@@ -41,6 +41,13 @@
/* kernels */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_state_buffer_size(uint num_threads, uint *size)
+{
+ *size = split_data_buffer_size(NULL, num_threads);
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_path_trace_data_init(
ccl_global void *split_data_buffer,
int num_elements,
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl b/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl
new file mode 100644
index 00000000000..0a1843ff8bd
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_compat_opencl.h"
+#include "split/kernel_split_common.h"
+
+__kernel void kernel_ocl_path_trace_state_buffer_size(
+ KernelGlobals *kg,
+ ccl_constant KernelData *data,
+ uint num_threads,
+ ccl_global uint *size)
+{
+ kg->data = data;
+ *size = split_data_buffer_size(kg, num_threads);
+}
+
diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h
index 785103a79ac..9b62d65ffd9 100644
--- a/intern/cycles/kernel/split/kernel_data_init.h
+++ b/intern/cycles/kernel/split/kernel_data_init.h
@@ -93,7 +93,7 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)(
kernel_split_params.buffer = buffer;
- split_data_init(&kernel_split_state, num_elements, split_data_buffer, ray_state);
+ split_data_init(kg, &kernel_split_state, num_elements, split_data_buffer, ray_state);
#ifdef __KERNEL_OPENCL__
#define KERNEL_TEX(type, ttype, name) \
diff --git a/intern/cycles/kernel/split/kernel_split_data.h b/intern/cycles/kernel/split/kernel_split_data.h
index 5dd53f42478..0a2ba8d1e1a 100644
--- a/intern/cycles/kernel/split/kernel_split_data.h
+++ b/intern/cycles/kernel/split/kernel_split_data.h
@@ -78,6 +78,8 @@ typedef struct SplitParams {
SPLIT_DATA_ENTRY(Intersection, isect_shadow, 2) \
SPLIT_DATA_ENTRY(ccl_global int, queue_data, (NUM_QUEUES*2)) /* TODO(mai): this is too large? */ \
SPLIT_DATA_ENTRY(ccl_global uint, work_array, 1) \
+ SPLIT_DATA_ENTRY(ShaderData, sd, 1) \
+ SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 2) \
SPLIT_DATA_DEBUG_ENTRIES \
/* struct that holds pointers to data in the shared state buffer */
@@ -86,37 +88,25 @@ typedef struct SplitData {
SPLIT_DATA_ENTRIES
#undef SPLIT_DATA_ENTRY
- /* size calculation for these is non trivial, so they are left out of SPLIT_DATA_ENTRIES and handled separately */
- ShaderData *sd;
- ShaderData *sd_DL_shadow;
-
/* this is actually in a separate buffer from the rest of the split state data (so it can be read back from
* the host easily) but is still used the same as the other data so we have it here in this struct as well
*/
ccl_global char *ray_state;
} SplitData;
-#define SIZEOF_SD(max_closure) (sizeof(ShaderData) - (sizeof(ShaderClosure) * (MAX_CLOSURE - (max_closure))))
-
-ccl_device_inline size_t split_data_buffer_size(size_t num_elements,
- size_t max_closure,
- size_t per_thread_output_buffer_size)
+/* TODO: find a way to get access to kg here */
+ccl_device_inline size_t split_data_buffer_size(ccl_global void *kg, size_t num_elements)
{
size_t size = 0;
#define SPLIT_DATA_ENTRY(type, name, num) + align_up(num_elements * num * sizeof(type), 16)
size = size SPLIT_DATA_ENTRIES;
#undef SPLIT_DATA_ENTRY
- /* TODO(sergey): This will actually over-allocate if
- * particular kernel does not support multiclosure.
- */
- size += align_up(num_elements * SIZEOF_SD(max_closure), 16); /* sd */
- size += align_up(2 * num_elements * SIZEOF_SD(max_closure), 16); /* sd_DL_shadow */
-
return size;
}
-ccl_device_inline void split_data_init(ccl_global SplitData *split_data,
+ccl_device_inline void split_data_init(ccl_global void *kg,
+ ccl_global SplitData *split_data,
size_t num_elements,
ccl_global void *data,
ccl_global char *ray_state)
@@ -128,12 +118,6 @@ ccl_device_inline void split_data_init(ccl_global SplitData *split_data,
SPLIT_DATA_ENTRIES
#undef SPLIT_DATA_ENTRY
- split_data->sd = (ShaderData*)p;
- p += align_up(num_elements * SIZEOF_SD(MAX_CLOSURE), 16);
-
- split_data->sd_DL_shadow = (ShaderData*)p;
- p += align_up(2 * num_elements * SIZEOF_SD(MAX_CLOSURE), 16);
-
split_data->ray_state = ray_state;
}