diff options
Diffstat (limited to 'intern/cycles/device')
-rw-r--r-- | intern/cycles/device/cuda/device_cuda.h | 2 | ||||
-rw-r--r-- | intern/cycles/device/cuda/device_cuda_impl.cpp | 52 | ||||
-rw-r--r-- | intern/cycles/device/device_cpu.cpp | 28 | ||||
-rw-r--r-- | intern/cycles/device/opencl/device_opencl.h | 1 | ||||
-rw-r--r-- | intern/cycles/device/opencl/device_opencl_impl.cpp | 53 |
5 files changed, 100 insertions, 36 deletions
diff --git a/intern/cycles/device/cuda/device_cuda.h b/intern/cycles/device/cuda/device_cuda.h index 3e397da895b..3f23f0fe4c5 100644 --- a/intern/cycles/device/cuda/device_cuda.h +++ b/intern/cycles/device/cuda/device_cuda.h @@ -223,7 +223,7 @@ class CUDADevice : public Device { CUdeviceptr d_wtile, CUstream stream = 0); - void path_trace(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles); + void render(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles); void film_convert(DeviceTask &task, device_ptr buffer, diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp index ba5d479e0e7..acf53c3eb1b 100644 --- a/intern/cycles/device/cuda/device_cuda_impl.cpp +++ b/intern/cycles/device/cuda/device_cuda_impl.cpp @@ -586,20 +586,23 @@ void CUDADevice::reserve_local_memory(const DeviceRequestedFeatures &requested_f cuMemGetInfo(&free_before, &total); /* Get kernel function. */ - CUfunction cuPathTrace; + CUfunction cuRender; - if (requested_features.use_integrator_branched) { - cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace")); + if (requested_features.use_baking) { + cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake")); + } + else if (requested_features.use_integrator_branched) { + cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace")); } else { - cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace")); + cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace")); } - cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1)); int min_blocks, num_threads_per_block; - cuda_assert(cuOccupancyMaxPotentialBlockSize( - &min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0)); + cuda_assert( + cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0)); /* Launch kernel, using just 1 block appears sufficient to reserve * memory for all multiprocessors. It would be good to do this in @@ -609,7 +612,7 @@ void CUDADevice::reserve_local_memory(const DeviceRequestedFeatures &requested_f void *args[] = {&d_work_tiles, &total_work_size}; - cuda_assert(cuLaunchKernel(cuPathTrace, 1, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); + cuda_assert(cuLaunchKernel(cuRender, 1, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); cuda_assert(cuCtxSynchronize()); @@ -1780,9 +1783,7 @@ void CUDADevice::adaptive_sampling_post(RenderTile &rtile, 0)); } -void CUDADevice::path_trace(DeviceTask &task, - RenderTile &rtile, - device_vector<WorkTile> &work_tiles) +void CUDADevice::render(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles) { scoped_timer timer(&rtile.buffers->render_time); @@ -1790,21 +1791,24 @@ void CUDADevice::path_trace(DeviceTask &task, return; CUDAContextScope scope(this); - CUfunction cuPathTrace; + CUfunction cuRender; /* Get kernel function. */ - if (task.integrator_branched) { - cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace")); + if (rtile.task == RenderTile::BAKE) { + cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake")); + } + else if (task.integrator_branched) { + cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace")); } else { - cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace")); + cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace")); } if (have_error()) { return; } - cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1)); /* Allocate work tile. */ work_tiles.alloc(1); @@ -1822,8 +1826,8 @@ void CUDADevice::path_trace(DeviceTask &task, * remain conservative for GPUs connected to a display to avoid driver * timeouts and display freezing. */ int min_blocks, num_threads_per_block; - cuda_assert(cuOccupancyMaxPotentialBlockSize( - &min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0)); + cuda_assert( + cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0)); if (!info.display_device) { min_blocks *= 8; } @@ -1851,7 +1855,7 @@ void CUDADevice::path_trace(DeviceTask &task, void *args[] = {&d_work_tiles, &total_work_size}; cuda_assert( - cuLaunchKernel(cuPathTrace, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); + cuLaunchKernel(cuRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); /* Run the adaptive sampling kernels at selected samples aligned to step samples. */ uint filter_sample = sample + wtile->num_samples - 1; @@ -1957,10 +1961,7 @@ void CUDADevice::shader(DeviceTask &task) CUdeviceptr d_output = (CUdeviceptr)task.shader_output; /* get kernel function */ - if (task.shader_eval_type >= SHADER_EVAL_BAKE) { - cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_bake")); - } - else if (task.shader_eval_type == SHADER_EVAL_DISPLACE) { + if (task.shader_eval_type == SHADER_EVAL_DISPLACE) { cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace")); } else { @@ -2297,9 +2298,12 @@ void CUDADevice::thread_run(DeviceTask *task) split_kernel->path_trace(task, tile, void_buffer, void_buffer); } else { - path_trace(*task, tile, work_tiles); + render(*task, tile, work_tiles); } } + else if (tile.task == RenderTile::BAKE) { + render(*task, tile, work_tiles); + } else if (tile.task == RenderTile::DENOISE) { tile.sample = tile.start_sample + tile.num_samples; diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index c701c14318f..fc6febd8cee 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -188,6 +188,7 @@ class CPUDevice : public Device { convert_to_byte_kernel; KernelFunctions<void (*)(KernelGlobals *, uint4 *, float4 *, int, int, int, int, int)> shader_kernel; + KernelFunctions<void (*)(KernelGlobals *, float *, int, int, int, int, int)> bake_kernel; KernelFunctions<void (*)( int, TileInfo *, int, int, float *, float *, float *, float *, float *, int *, int, int)> @@ -270,6 +271,7 @@ class CPUDevice : public Device { REGISTER_KERNEL(convert_to_half_float), REGISTER_KERNEL(convert_to_byte), REGISTER_KERNEL(shader), + REGISTER_KERNEL(bake), REGISTER_KERNEL(filter_divide_shadow), REGISTER_KERNEL(filter_get_feature), REGISTER_KERNEL(filter_write_feature), @@ -895,7 +897,7 @@ class CPUDevice : public Device { } } - void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg) + void render(DeviceTask &task, RenderTile &tile, KernelGlobals *kg) { const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE; @@ -919,12 +921,21 @@ class CPUDevice : public Device { break; } - for (int y = tile.y; y < tile.y + tile.h; y++) { - for (int x = tile.x; x < tile.x + tile.w; x++) { - if (use_coverage) { - coverage.init_pixel(x, y); + if (tile.task == RenderTile::PATH_TRACE) { + for (int y = tile.y; y < tile.y + tile.h; y++) { + for (int x = tile.x; x < tile.x + tile.w; x++) { + if (use_coverage) { + coverage.init_pixel(x, y); + } + path_trace_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride); + } + } + } + else { + for (int y = tile.y; y < tile.y + tile.h; y++) { + for (int x = tile.x; x < tile.x + tile.w; x++) { + bake_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride); } - path_trace_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride); } } tile.sample = sample + 1; @@ -1019,9 +1030,12 @@ class CPUDevice : public Device { split_kernel->path_trace(&task, tile, kgbuffer, void_buffer); } else { - path_trace(task, tile, kg); + render(task, tile, kg); } } + else if (tile.task == RenderTile::BAKE) { + render(task, tile, kg); + } else if (tile.task == RenderTile::DENOISE) { denoise(denoising, tile); task.update_progress(&tile, tile.w * tile.h); diff --git a/intern/cycles/device/opencl/device_opencl.h b/intern/cycles/device/opencl/device_opencl.h index d6f4fb43061..389268e1c2a 100644 --- a/intern/cycles/device/opencl/device_opencl.h +++ b/intern/cycles/device/opencl/device_opencl.h @@ -451,6 +451,7 @@ class OpenCLDevice : public Device { device_ptr rgba_half); void shader(DeviceTask &task); void update_adaptive(DeviceTask &task, RenderTile &tile, int sample); + void bake(DeviceTask &task, RenderTile &tile); void denoise(RenderTile &tile, DenoisingTask &denoising); diff --git a/intern/cycles/device/opencl/device_opencl_impl.cpp b/intern/cycles/device/opencl/device_opencl_impl.cpp index 2766f85d17c..beb3174b111 100644 --- a/intern/cycles/device/opencl/device_opencl_impl.cpp +++ b/intern/cycles/device/opencl/device_opencl_impl.cpp @@ -1367,6 +1367,9 @@ void OpenCLDevice::thread_run(DeviceTask *task) */ clFinish(cqCommandQueue); } + else if (tile.task == RenderTile::BAKE) { + bake(*task, tile); + } else if (tile.task == RenderTile::DENOISE) { tile.sample = tile.start_sample + tile.num_samples; denoise(tile, denoising); @@ -1858,10 +1861,7 @@ void OpenCLDevice::shader(DeviceTask &task) cl_int d_offset = task.offset; OpenCLDevice::OpenCLProgram *program = &background_program; - if (task.shader_eval_type >= SHADER_EVAL_BAKE) { - program = &bake_program; - } - else if (task.shader_eval_type == SHADER_EVAL_DISPLACE) { + if (task.shader_eval_type == SHADER_EVAL_DISPLACE) { program = &displace_program; } program->wait_for_availability(); @@ -1892,6 +1892,51 @@ void OpenCLDevice::shader(DeviceTask &task) } } +void OpenCLDevice::bake(DeviceTask &task, RenderTile &rtile) +{ + scoped_timer timer(&rtile.buffers->render_time); + + /* Cast arguments to cl types. */ + cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); + cl_mem d_buffer = CL_MEM_PTR(rtile.buffer); + cl_int d_x = rtile.x; + cl_int d_y = rtile.y; + cl_int d_w = rtile.w; + cl_int d_h = rtile.h; + cl_int d_offset = rtile.offset; + cl_int d_stride = rtile.stride; + + bake_program.wait_for_availability(); + cl_kernel kernel = bake_program(); + + cl_uint start_arg_index = kernel_set_args(kernel, 0, d_data, d_buffer); + + set_kernel_arg_buffers(kernel, &start_arg_index); + + start_arg_index += kernel_set_args( + kernel, start_arg_index, d_x, d_y, d_w, d_h, d_offset, d_stride); + + int start_sample = rtile.start_sample; + int end_sample = rtile.start_sample + rtile.num_samples; + + for (int sample = start_sample; sample < end_sample; sample++) { + if (task.get_cancel()) { + if (task.need_finish_queue == false) + break; + } + + kernel_set_args(kernel, start_arg_index, sample); + + enqueue_kernel(kernel, d_w, d_h); + + rtile.sample = sample + 1; + + task.update_progress(&rtile, rtile.w * rtile.h); + } + + clFinish(cqCommandQueue); +} + string OpenCLDevice::kernel_build_options(const string *debug_src) { string build_options = "-cl-no-signed-zeros -cl-mad-enable "; |