/* * Copyright 2011-2013 Blender Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifdef WITH_OPENCL #include "opencl.h" #include "buffers.h" #include "kernel_types.h" #include "kernel_split_data_types.h" #include "device_split_kernel.h" #include "util_logging.h" #include "util_md5.h" #include "util_path.h" #include "util_time.h" CCL_NAMESPACE_BEGIN class OpenCLSplitKernel; static string get_build_options(OpenCLDeviceBase *device, const DeviceRequestedFeatures& requested_features) { string build_options = "-D__SPLIT_KERNEL__ "; build_options += requested_features.get_build_options(); /* Set compute device build option. */ cl_device_type device_type; OpenCLInfo::get_device_type(device->cdDevice, &device_type, &device->ciErr); assert(device->ciErr == CL_SUCCESS); if(device_type == CL_DEVICE_TYPE_GPU) { build_options += " -D__COMPUTE_DEVICE_GPU__"; } return build_options; } /* OpenCLDeviceSplitKernel's declaration/definition. */ class OpenCLDeviceSplitKernel : public OpenCLDeviceBase { public: DeviceSplitKernel *split_kernel; OpenCLProgram program_data_init; OpenCLProgram program_state_buffer_size; OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_); ~OpenCLDeviceSplitKernel() { task_pool.stop(); /* Release kernels */ program_data_init.release(); delete split_kernel; } virtual bool load_kernels(const DeviceRequestedFeatures& requested_features, vector &programs) { bool single_program = OpenCLInfo::use_single_program(); program_data_init = OpenCLDeviceBase::OpenCLProgram(this, single_program ? "split" : "split_data_init", single_program ? "kernel_split.cl" : "kernel_data_init.cl", get_build_options(this, requested_features)); program_data_init.add_kernel(ustring("path_trace_data_init")); programs.push_back(&program_data_init); program_state_buffer_size = OpenCLDeviceBase::OpenCLProgram(this, single_program ? "split" : "split_state_buffer_size", single_program ? "kernel_split.cl" : "kernel_state_buffer_size.cl", get_build_options(this, requested_features)); program_state_buffer_size.add_kernel(ustring("path_trace_state_buffer_size")); programs.push_back(&program_state_buffer_size); return split_kernel->load_kernels(requested_features); } void thread_run(DeviceTask *task) { if(task->type == DeviceTask::FILM_CONVERT) { film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half); } else if(task->type == DeviceTask::SHADER) { shader(*task); } else if(task->type == DeviceTask::PATH_TRACE) { RenderTile tile; /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to * fetch its size. */ typedef struct KernelGlobals { ccl_constant KernelData *data; #define KERNEL_TEX(type, ttype, name) \ ccl_global type *name; #include "kernel_textures.h" #undef KERNEL_TEX SplitData split_data; SplitParams split_param_data; } KernelGlobals; /* Allocate buffer for kernel globals */ device_memory kgbuffer; kgbuffer.resize(sizeof(KernelGlobals)); mem_alloc("kernel_globals", kgbuffer, MEM_READ_WRITE); /* Keep rendering tiles until done. */ while(task->acquire_tile(this, tile)) { split_kernel->path_trace(task, tile, kgbuffer, *const_mem_map["__data"]); /* Complete kernel execution before release tile. */ /* This helps in multi-device render; * The device that reaches the critical-section function * release_tile waits (stalling other devices from entering * release_tile) for all kernels to complete. If device1 (a * slow-render device) reaches release_tile first then it would * stall device2 (a fast-render device) from proceeding to render * next tile. */ clFinish(cqCommandQueue); task->release_tile(tile); } mem_free(kgbuffer); } } protected: /* ** Those guys are for workign around some compiler-specific bugs ** */ string build_options_for_base_program( const DeviceRequestedFeatures& requested_features) { return requested_features.get_build_options(); } friend class OpenCLSplitKernel; friend class OpenCLSplitKernelFunction; }; class OpenCLSplitKernelFunction : public SplitKernelFunction { public: OpenCLDeviceSplitKernel* device; OpenCLDeviceBase::OpenCLProgram program; OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device) : device(device) {} ~OpenCLSplitKernelFunction() { program.release(); } virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data) { device->kernel_set_args(program(), 0, kg, data); device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, program(), 2, NULL, dim.global_size, dim.local_size, 0, NULL, NULL); device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); if(device->ciErr != CL_SUCCESS) { string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", clewErrorString(device->ciErr)); device->opencl_error(message); return false; } return true; } }; class OpenCLSplitKernel : public DeviceSplitKernel { OpenCLDeviceSplitKernel *device; public: explicit OpenCLSplitKernel(OpenCLDeviceSplitKernel *device) : DeviceSplitKernel(device), device(device) { } virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures& requested_features) { OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device); bool single_program = OpenCLInfo::use_single_program(); kernel->program = OpenCLDeviceBase::OpenCLProgram(device, single_program ? "split" : "split_" + kernel_name, single_program ? "kernel_split.cl" : "kernel_" + kernel_name + ".cl", get_build_options(device, requested_features)); kernel->program.add_kernel(ustring("path_trace_" + kernel_name)); kernel->program.load(); if(!kernel->program.is_loaded()) { delete kernel; return NULL; } return kernel; } virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads) { device_vector size_buffer; size_buffer.resize(1); device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE); uint threads = num_threads; device->kernel_set_args(device->program_state_buffer_size(), 0, kg, data, threads, size_buffer); size_t global_size = 64; device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, device->program_state_buffer_size(), 1, NULL, &global_size, NULL, 0, NULL, NULL); device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint64_t)); device->mem_free(size_buffer); if(device->ciErr != CL_SUCCESS) { string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", clewErrorString(device->ciErr)); device->opencl_error(message); return 0; } return *size_buffer.get_data(); } virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, RenderTile& rtile, int num_global_elements, device_memory& kernel_globals, device_memory& kernel_data, device_memory& split_data, device_memory& ray_state, device_memory& queue_index, device_memory& use_queues_flag, device_memory& work_pool_wgs ) { cl_int dQueue_size = dim.global_size[0] * dim.global_size[1]; /* Set the range of samples to be processed for every ray in * path-regeneration logic. */ cl_int start_sample = rtile.start_sample; cl_int end_sample = rtile.start_sample + rtile.num_samples; cl_uint start_arg_index = device->kernel_set_args(device->program_data_init(), 0, kernel_globals, kernel_data, split_data, num_global_elements, ray_state, rtile.rng_state); /* TODO(sergey): Avoid map lookup here. */ #define KERNEL_TEX(type, ttype, name) \ device->set_kernel_arg_mem(device->program_data_init(), &start_arg_index, #name); #include "kernel_textures.h" #undef KERNEL_TEX start_arg_index += device->kernel_set_args(device->program_data_init(), start_arg_index, start_sample, end_sample, rtile.x, rtile.y, rtile.w, rtile.h, rtile.offset, rtile.stride, queue_index, dQueue_size, use_queues_flag, work_pool_wgs, rtile.num_samples, rtile.buffer); /* Enqueue ckPathTraceKernel_data_init kernel. */ device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, device->program_data_init(), 2, NULL, dim.global_size, dim.local_size, 0, NULL, NULL); device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); if(device->ciErr != CL_SUCCESS) { string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", clewErrorString(device->ciErr)); device->opencl_error(message); return false; } return true; } virtual int2 split_kernel_local_size() { return make_int2(64, 1); } virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask */*task*/) { cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice); /* Use small global size on CPU devices as it seems to be much faster. */ if(type == CL_DEVICE_TYPE_CPU) { VLOG(1) << "Global size: (64, 64)."; return make_int2(64, 64); } cl_ulong max_buffer_size; clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL); VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(max_buffer_size) << " bytes. (" << string_human_readable_size(max_buffer_size) << ")."; size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size / 2); int2 global_size = make_int2(round_down((int)sqrt(num_elements), 64), (int)sqrt(num_elements)); VLOG(1) << "Global size: " << global_size << "."; return global_size; } }; OpenCLDeviceSplitKernel::OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_) : OpenCLDeviceBase(info, stats, background_) { split_kernel = new OpenCLSplitKernel(this); background = background_; } Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, bool background) { return new OpenCLDeviceSplitKernel(info, stats, background); } CCL_NAMESPACE_END #endif /* WITH_OPENCL */