diff options
-rw-r--r-- | intern/cycles/device/device_cpu.cpp | 11 | ||||
-rw-r--r-- | intern/cycles/device/device_cuda.cpp | 43 | ||||
-rw-r--r-- | intern/cycles/device/device_split_kernel.cpp | 13 | ||||
-rw-r--r-- | intern/cycles/device/device_split_kernel.h | 5 | ||||
-rw-r--r-- | intern/cycles/device/opencl/opencl_split.cpp | 47 | ||||
-rw-r--r-- | intern/cycles/kernel/CMakeLists.txt | 2 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/cuda/kernel_split.cu | 7 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl | 29 | ||||
-rw-r--r-- | intern/cycles/kernel/split/kernel_data_init.h | 2 | ||||
-rw-r--r-- | intern/cycles/kernel/split/kernel_split_data.h | 28 |
10 files changed, 147 insertions, 40 deletions
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index b4d470747c2..1589bbe1a33 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -71,7 +71,8 @@ public: virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&); virtual int2 split_kernel_local_size(); - virtual int2 split_kernel_global_size(DeviceTask *task); + virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task); + virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads); }; class CPUDevice : public Device @@ -854,11 +855,17 @@ int2 CPUSplitKernel::split_kernel_local_size() return make_int2(1, 1); } -int2 CPUSplitKernel::split_kernel_global_size(DeviceTask *task) { +int2 CPUSplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask *task) { /* TODO(mai): this needs investigation but cpu gives incorrect render if global size doesnt match tile size */ return task->requested_tile_size; } +size_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device_memory& /*data*/, size_t num_threads) { + KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer; + + return split_data_buffer_size(kg, num_threads); +} + unordered_map<string, void*> CPUDevice::kernel_functions; Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background) diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 9776e82ca2a..0204f0ed960 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -89,6 +89,8 @@ class CUDASplitKernel : public DeviceSplitKernel { public: explicit CUDASplitKernel(CUDADevice *device); + virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads); + virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, RenderTile& rtile, int num_global_elements, @@ -102,7 +104,7 @@ public: virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&); virtual int2 split_kernel_local_size(); - virtual int2 split_kernel_global_size(DeviceTask *task); + virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task); }; class CUDADevice : public Device @@ -1471,6 +1473,43 @@ CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device) { } +size_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory& /*data*/, size_t num_threads) +{ + device_vector<uint> size_buffer; + size_buffer.resize(1); + device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE); + + device->cuda_push_context(); + + uint threads = num_threads; + CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer); + + struct args_t { + uint* num_threads; + CUdeviceptr* size; + }; + + args_t args = { + &threads, + &d_size + }; + + CUfunction state_buffer_size; + cuda_assert(cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size")); + + cuda_assert(cuLaunchKernel(state_buffer_size, + 1, 1, 1, + 1, 1, 1, + 0, 0, &args, 0)); + + device->cuda_pop_context(); + + device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint)); + device->mem_free(size_buffer); + + return *size_buffer.get_data(); +} + bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim, RenderTile& rtile, int num_global_elements, @@ -1573,7 +1612,7 @@ int2 CUDASplitKernel::split_kernel_local_size() return make_int2(32, 1); } -int2 CUDASplitKernel::split_kernel_global_size(DeviceTask */*task*/) +int2 CUDASplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask */*task*/) { /* TODO(mai): implement something here to detect ideal work size */ return make_int2(256, 256); diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp index f16fb6a1ea1..799479ddb6a 100644 --- a/intern/cycles/device/device_split_kernel.cpp +++ b/intern/cycles/device/device_split_kernel.cpp @@ -90,9 +90,9 @@ bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_fe return true; } -size_t DeviceSplitKernel::max_elements_for_max_buffer_size(size_t max_buffer_size, size_t passes_size) +size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, size_t max_buffer_size) { - size_t size_per_element = split_data_buffer_size(1024, current_max_closure, passes_size) / 1024; + size_t size_per_element = state_buffer_size(kg, data, 1024) / 1024; return max_buffer_size / size_per_element; } @@ -113,13 +113,10 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task, local_size[1] = lsize[1]; } - /* Calculate per_thread_output_buffer_size. */ - size_t per_thread_output_buffer_size = task->passes_size; - /* Set gloabl size */ size_t global_size[2]; { - int2 gsize = split_kernel_global_size(task); + int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task); /* Make sure that set work size is a multiple of local * work size dimensions. @@ -153,9 +150,7 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task, ray_state.resize(num_global_elements); device->mem_alloc("ray_state", ray_state, MEM_READ_WRITE); - split_data.resize(split_data_buffer_size(num_global_elements, - current_max_closure, - per_thread_output_buffer_size)); + split_data.resize(state_buffer_size(kgbuffer, kernel_data, num_global_elements)); device->mem_alloc("split_data", split_data, MEM_READ_WRITE); } diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h index 1c6a2709cf2..cc3e1aa26ae 100644 --- a/intern/cycles/device/device_split_kernel.h +++ b/intern/cycles/device/device_split_kernel.h @@ -100,7 +100,8 @@ public: device_memory& kgbuffer, device_memory& kernel_data); - size_t max_elements_for_max_buffer_size(size_t max_buffer_size, size_t passes_size); + virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads) = 0; + size_t max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, size_t max_buffer_size); virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, RenderTile& rtile, @@ -115,7 +116,7 @@ public: virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&) = 0; virtual int2 split_kernel_local_size() = 0; - virtual int2 split_kernel_global_size(DeviceTask *task) = 0; + virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task) = 0; }; CCL_NAMESPACE_END diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp index 7e04c6fac2c..a44f5da3a32 100644 --- a/intern/cycles/device/opencl/opencl_split.cpp +++ b/intern/cycles/device/opencl/opencl_split.cpp @@ -60,6 +60,7 @@ class OpenCLDeviceSplitKernel : public OpenCLDeviceBase public: DeviceSplitKernel *split_kernel; OpenCLProgram program_data_init; + OpenCLProgram program_state_buffer_size; OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_); @@ -83,6 +84,13 @@ public: program_data_init.add_kernel(ustring("path_trace_data_init")); programs.push_back(&program_data_init); + program_state_buffer_size = OpenCLDeviceBase::OpenCLProgram(this, + "split_state_buffer_size", + "kernel_state_buffer_size.cl", + get_build_options(this, requested_features)); + program_state_buffer_size.add_kernel(ustring("path_trace_state_buffer_size")); + programs.push_back(&program_state_buffer_size); + return split_kernel->load_kernels(requested_features); } @@ -216,6 +224,41 @@ public: return kernel; } + virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads) + { + device_vector<uint> size_buffer; + size_buffer.resize(1); + device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE); + + uint threads = num_threads; + device->kernel_set_args(device->program_state_buffer_size(), 0, kg, data, threads, size_buffer); + + size_t global_size = 64; + device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, + device->program_state_buffer_size(), + 1, + NULL, + &global_size, + NULL, + 0, + NULL, + NULL); + + device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); + + device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint)); + device->mem_free(size_buffer); + + if(device->ciErr != CL_SUCCESS) { + string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", + clewErrorString(device->ciErr)); + device->opencl_error(message); + return 0; + } + + return *size_buffer.get_data(); + } + virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, RenderTile& rtile, int num_global_elements, @@ -298,7 +341,7 @@ public: return make_int2(64, 1); } - virtual int2 split_kernel_global_size(DeviceTask *task) + virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask */*task*/) { size_t max_buffer_size; clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &max_buffer_size, NULL); @@ -306,7 +349,7 @@ public: << string_human_readable_number(max_buffer_size) << " bytes. (" << string_human_readable_size(max_buffer_size) << ")."; - size_t num_elements = max_elements_for_max_buffer_size(max_buffer_size / 2, task->passes_size); + size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size / 2); int2 global_size = make_int2(round_down((int)sqrt(num_elements), 64), (int)sqrt(num_elements)); VLOG(1) << "Global size: " << global_size << "."; return global_size; diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index df40c3a0e8e..6867ab02318 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -15,6 +15,7 @@ set(SRC kernels/cpu/kernel.cpp kernels/cpu/kernel_split.cpp kernels/opencl/kernel.cl + kernels/opencl/kernel_state_buffer_size.cl kernels/opencl/kernel_data_init.cl kernels/opencl/kernel_path_init.cl kernels/opencl/kernel_queue_enqueue.cl @@ -399,6 +400,7 @@ endif() #delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${KERNEL_PREPROCESSED}" ${CYCLES_INSTALL_PATH}/kernel) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_state_buffer_size.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_data_init.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_path_init.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_queue_enqueue.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu index 53a36b15e40..759475b175f 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu +++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu @@ -41,6 +41,13 @@ /* kernels */ extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_state_buffer_size(uint num_threads, uint *size) +{ + *size = split_data_buffer_size(NULL, num_threads); +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_path_trace_data_init( ccl_global void *split_data_buffer, int num_elements, diff --git a/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl b/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl new file mode 100644 index 00000000000..0a1843ff8bd --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl @@ -0,0 +1,29 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_compat_opencl.h" +#include "split/kernel_split_common.h" + +__kernel void kernel_ocl_path_trace_state_buffer_size( + KernelGlobals *kg, + ccl_constant KernelData *data, + uint num_threads, + ccl_global uint *size) +{ + kg->data = data; + *size = split_data_buffer_size(kg, num_threads); +} + diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h index 785103a79ac..9b62d65ffd9 100644 --- a/intern/cycles/kernel/split/kernel_data_init.h +++ b/intern/cycles/kernel/split/kernel_data_init.h @@ -93,7 +93,7 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)( kernel_split_params.buffer = buffer; - split_data_init(&kernel_split_state, num_elements, split_data_buffer, ray_state); + split_data_init(kg, &kernel_split_state, num_elements, split_data_buffer, ray_state); #ifdef __KERNEL_OPENCL__ #define KERNEL_TEX(type, ttype, name) \ diff --git a/intern/cycles/kernel/split/kernel_split_data.h b/intern/cycles/kernel/split/kernel_split_data.h index 5dd53f42478..0a2ba8d1e1a 100644 --- a/intern/cycles/kernel/split/kernel_split_data.h +++ b/intern/cycles/kernel/split/kernel_split_data.h @@ -78,6 +78,8 @@ typedef struct SplitParams { SPLIT_DATA_ENTRY(Intersection, isect_shadow, 2) \ SPLIT_DATA_ENTRY(ccl_global int, queue_data, (NUM_QUEUES*2)) /* TODO(mai): this is too large? */ \ SPLIT_DATA_ENTRY(ccl_global uint, work_array, 1) \ + SPLIT_DATA_ENTRY(ShaderData, sd, 1) \ + SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 2) \ SPLIT_DATA_DEBUG_ENTRIES \ /* struct that holds pointers to data in the shared state buffer */ @@ -86,37 +88,25 @@ typedef struct SplitData { SPLIT_DATA_ENTRIES #undef SPLIT_DATA_ENTRY - /* size calculation for these is non trivial, so they are left out of SPLIT_DATA_ENTRIES and handled separately */ - ShaderData *sd; - ShaderData *sd_DL_shadow; - /* this is actually in a separate buffer from the rest of the split state data (so it can be read back from * the host easily) but is still used the same as the other data so we have it here in this struct as well */ ccl_global char *ray_state; } SplitData; -#define SIZEOF_SD(max_closure) (sizeof(ShaderData) - (sizeof(ShaderClosure) * (MAX_CLOSURE - (max_closure)))) - -ccl_device_inline size_t split_data_buffer_size(size_t num_elements, - size_t max_closure, - size_t per_thread_output_buffer_size) +/* TODO: find a way to get access to kg here */ +ccl_device_inline size_t split_data_buffer_size(ccl_global void *kg, size_t num_elements) { size_t size = 0; #define SPLIT_DATA_ENTRY(type, name, num) + align_up(num_elements * num * sizeof(type), 16) size = size SPLIT_DATA_ENTRIES; #undef SPLIT_DATA_ENTRY - /* TODO(sergey): This will actually over-allocate if - * particular kernel does not support multiclosure. - */ - size += align_up(num_elements * SIZEOF_SD(max_closure), 16); /* sd */ - size += align_up(2 * num_elements * SIZEOF_SD(max_closure), 16); /* sd_DL_shadow */ - return size; } -ccl_device_inline void split_data_init(ccl_global SplitData *split_data, +ccl_device_inline void split_data_init(ccl_global void *kg, + ccl_global SplitData *split_data, size_t num_elements, ccl_global void *data, ccl_global char *ray_state) @@ -128,12 +118,6 @@ ccl_device_inline void split_data_init(ccl_global SplitData *split_data, SPLIT_DATA_ENTRIES #undef SPLIT_DATA_ENTRY - split_data->sd = (ShaderData*)p; - p += align_up(num_elements * SIZEOF_SD(MAX_CLOSURE), 16); - - split_data->sd_DL_shadow = (ShaderData*)p; - p += align_up(2 * num_elements * SIZEOF_SD(MAX_CLOSURE), 16); - split_data->ray_state = ray_state; } |