diff options
Diffstat (limited to 'intern/cycles/device/device_cuda.cpp')
-rw-r--r-- | intern/cycles/device/device_cuda.cpp | 495 |
1 files changed, 448 insertions, 47 deletions
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index a971170318e..968ee5bc487 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -21,11 +21,14 @@ #include <string.h> #include "device/device.h" +#include "device/device_denoising.h" #include "device/device_intern.h" #include "device/device_split_kernel.h" #include "render/buffers.h" +#include "kernel/filter/filter_defines.h" + #ifdef WITH_CUDA_DYNLOAD # include "cuew.h" #else @@ -113,7 +116,7 @@ public: DedicatedTaskPool task_pool; CUdevice cuDevice; CUcontext cuContext; - CUmodule cuModule; + CUmodule cuModule, cuFilterModule; map<device_ptr, bool> tex_interp_map; map<device_ptr, uint> tex_bindless_map; int cuDevId; @@ -170,7 +173,7 @@ public: CUresult result = stmt; \ \ if(result != CUDA_SUCCESS) { \ - string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \ + string message = string_printf("CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \ if(error_msg == "") \ error_msg = message; \ fprintf(stderr, "%s\n", message.c_str()); \ @@ -301,7 +304,8 @@ public: * kernel sources md5 and only depends on compiler or compilation settings. */ string compile_kernel_get_common_cflags( - const DeviceRequestedFeatures& requested_features, bool split=false) + const DeviceRequestedFeatures& requested_features, + bool filter=false, bool split=false) { const int cuda_version = cuewCompilerVersion(); const int machine = system_cpu_bits(); @@ -316,7 +320,7 @@ public: machine, cuda_version, include_path.c_str()); - if(use_adaptive_compilation()) { + if(!filter && use_adaptive_compilation()) { cflags += " " + requested_features.get_build_options(); } const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS"); @@ -364,8 +368,22 @@ public: return true; } - string compile_kernel(const DeviceRequestedFeatures& requested_features, bool split=false) + string compile_kernel(const DeviceRequestedFeatures& requested_features, + bool filter=false, bool split=false) { + const char *name, *source; + if(filter) { + name = "filter"; + source = "filter.cu"; + } + else if(split) { + name = "kernel_split"; + source = "kernel_split.cu"; + } + else { + name = "kernel"; + source = "kernel.cu"; + } /* Compute cubin name. */ int major, minor; cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); @@ -373,9 +391,8 @@ public: /* Attempt to use kernel provided with Blender. */ if(!use_adaptive_compilation()) { - const string cubin = path_get(string_printf(split ? "lib/kernel_split_sm_%d%d.cubin" - : "lib/kernel_sm_%d%d.cubin", - major, minor)); + const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", + name, major, minor)); VLOG(1) << "Testing for pre-compiled kernel " << cubin << "."; if(path_exists(cubin)) { VLOG(1) << "Using precompiled kernel."; @@ -384,7 +401,7 @@ public: } const string common_cflags = - compile_kernel_get_common_cflags(requested_features, split); + compile_kernel_get_common_cflags(requested_features, filter, split); /* Try to use locally compiled kernel. */ const string source_path = path_get("source"); @@ -395,9 +412,8 @@ public: */ const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags); - const string cubin_file = string_printf(split ? "cycles_kernel_split_sm%d%d_%s.cubin" - : "cycles_kernel_sm%d%d_%s.cubin", - major, minor, + const string cubin_file = string_printf("cycles_%s_sm%d%d_%s.cubin", + name, major, minor, cubin_md5.c_str()); const string cubin = path_cache_get(path_join("kernels", cubin_file)); VLOG(1) << "Testing for locally compiled kernel " << cubin << "."; @@ -432,7 +448,7 @@ public: const string kernel = path_join( path_join(source_path, "kernel"), path_join("kernels", - path_join("cuda", split ? "kernel_split.cu" : "kernel.cu"))); + path_join("cuda", source))); double starttime = time_dt(); printf("Compiling CUDA kernel ...\n"); @@ -480,11 +496,14 @@ public: return false; /* get kernel */ - string cubin = compile_kernel(requested_features, use_split_kernel()); - + string cubin = compile_kernel(requested_features, false, use_split_kernel()); if(cubin == "") return false; + string filter_cubin = compile_kernel(requested_features, true, false); + if(filter_cubin == "") + return false; + /* open module */ cuda_push_context(); @@ -499,6 +518,14 @@ public: if(cuda_error_(result, "cuModuleLoad")) cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str())); + if(path_read_text(filter_cubin, cubin_data)) + result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str()); + else + result = CUDA_ERROR_FILE_NOT_FOUND; + + if(cuda_error_(result, "cuModuleLoad")) + cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str())); + cuda_pop_context(); return (result == CUDA_SUCCESS); @@ -581,6 +608,11 @@ public: } } + virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/, MemoryType /*type*/) + { + return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset)); + } + void const_copy_to(const char *name, void *host, size_t size) { CUdeviceptr mem; @@ -881,6 +913,368 @@ public: } } + bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task) + { + mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_ONLY); + + TilesInfo *tiles = (TilesInfo*) task->tiles_mem.data_pointer; + for(int i = 0; i < 9; i++) { + tiles->buffers[i] = buffers[i]; + } + + mem_copy_to(task->tiles_mem); + + return !have_error(); + } + +#define CUDA_GET_BLOCKSIZE(func, w, h) \ + int threads_per_block; \ + cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ + int threads = (int)sqrt((float)threads_per_block); \ + int xblocks = ((w) + threads - 1)/threads; \ + int yblocks = ((h) + threads - 1)/threads; + +#define CUDA_LAUNCH_KERNEL(func, args) \ + cuda_assert(cuLaunchKernel(func, \ + xblocks, yblocks, 1, \ + threads, threads, 1, \ + 0, 0, args, 0)); + + bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr, + DenoisingTask *task) + { + if(have_error()) + return false; + + cuda_push_context(); + + int4 rect = task->rect; + int w = rect.z-rect.x; + int h = rect.w-rect.y; + int r = task->nlm_state.r; + int f = task->nlm_state.f; + float a = task->nlm_state.a; + float k_2 = task->nlm_state.k_2; + + CUdeviceptr difference = task->nlm_state.temporary_1_ptr; + CUdeviceptr blurDifference = task->nlm_state.temporary_2_ptr; + CUdeviceptr weightAccum = task->nlm_state.temporary_3_ptr; + + cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*w*h)); + cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*w*h)); + + CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput, cuNLMNormalize; + cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); + cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); + cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); + cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output")); + cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize")); + + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1)); + + CUDA_GET_BLOCKSIZE(cuNLMCalcDifference, rect.z-rect.x, rect.w-rect.y); + + int dx, dy; + int4 local_rect; + int channel_offset = 0; + void *calc_difference_args[] = {&dx, &dy, &guide_ptr, &variance_ptr, &difference, &local_rect, &w, &channel_offset, &a, &k_2}; + void *blur_args[] = {&difference, &blurDifference, &local_rect, &w, &f}; + void *calc_weight_args[] = {&blurDifference, &difference, &local_rect, &w, &f}; + void *update_output_args[] = {&dx, &dy, &blurDifference, &image_ptr, &out_ptr, &weightAccum, &local_rect, &w, &f}; + + for(int i = 0; i < (2*r+1)*(2*r+1); i++) { + dy = i / (2*r+1) - r; + dx = i % (2*r+1) - r; + local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)); + + CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args); + CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args); + CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL(cuNLMUpdateOutput, update_output_args); + } + + local_rect = make_int4(0, 0, rect.z-rect.x, rect.w-rect.y); + void *normalize_args[] = {&out_ptr, &weightAccum, &local_rect, &w}; + CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args); + cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + return !have_error(); + } + + bool denoising_construct_transform(DenoisingTask *task) + { + if(have_error()) + return false; + + cuda_push_context(); + + CUfunction cuFilterConstructTransform; + cuda_assert(cuModuleGetFunction(&cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform")); + cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED)); + CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, + task->storage.w, + task->storage.h); + + void *args[] = {&task->buffer.mem.device_pointer, + &task->storage.transform.device_pointer, + &task->storage.rank.device_pointer, + &task->filter_area, + &task->rect, + &task->radius, + &task->pca_threshold, + &task->buffer.pass_stride}; + CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args); + cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + return !have_error(); + } + + bool denoising_reconstruct(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr guide_ptr, + device_ptr guide_variance_ptr, + device_ptr output_ptr, + DenoisingTask *task) + { + if(have_error()) + return false; + + mem_zero(task->storage.XtWX); + mem_zero(task->storage.XtWY); + + cuda_push_context(); + + CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian, cuFinalize; + cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); + cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); + cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); + cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian")); + cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize")); + + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED)); + cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1)); + + CUDA_GET_BLOCKSIZE(cuNLMCalcDifference, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + + CUdeviceptr difference = task->reconstruction_state.temporary_1_ptr; + CUdeviceptr blurDifference = task->reconstruction_state.temporary_2_ptr; + + int r = task->radius; + int f = 4; + float a = 1.0f; + for(int i = 0; i < (2*r+1)*(2*r+1); i++) { + int dy = i / (2*r+1) - r; + int dx = i % (2*r+1) - r; + + int local_rect[4] = {max(0, -dx), max(0, -dy), + task->reconstruction_state.source_w - max(0, dx), + task->reconstruction_state.source_h - max(0, dy)}; + + void *calc_difference_args[] = {&dx, &dy, + &guide_ptr, + &guide_variance_ptr, + &difference, + &local_rect, + &task->buffer.w, + &task->buffer.pass_stride, + &a, + &task->nlm_k_2}; + CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args); + + void *blur_args[] = {&difference, + &blurDifference, + &local_rect, + &task->buffer.w, + &f}; + CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args); + + void *calc_weight_args[] = {&blurDifference, + &difference, + &local_rect, + &task->buffer.w, + &f}; + CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args); + + /* Reuse previous arguments. */ + CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args); + + void *construct_gramian_args[] = {&dx, &dy, + &blurDifference, + &task->buffer.mem.device_pointer, + &color_ptr, + &color_variance_ptr, + &task->storage.transform.device_pointer, + &task->storage.rank.device_pointer, + &task->storage.XtWX.device_pointer, + &task->storage.XtWY.device_pointer, + &local_rect, + &task->reconstruction_state.filter_rect, + &task->buffer.w, + &task->buffer.h, + &f, + &task->buffer.pass_stride}; + CUDA_LAUNCH_KERNEL(cuNLMConstructGramian, construct_gramian_args); + } + + void *finalize_args[] = {&task->buffer.w, + &task->buffer.h, + &output_ptr, + &task->storage.rank.device_pointer, + &task->storage.XtWX.device_pointer, + &task->storage.XtWY.device_pointer, + &task->filter_area, + &task->reconstruction_state.buffer_params.x, + &task->render_buffer.samples}; + CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args); + cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + return !have_error(); + } + + bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr, + device_ptr mean_ptr, device_ptr variance_ptr, + int r, int4 rect, DenoisingTask *task) + { + (void) task; + + if(have_error()) + return false; + + cuda_push_context(); + + CUfunction cuFilterCombineHalves; + cuda_assert(cuModuleGetFunction(&cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves")); + cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE(cuFilterCombineHalves, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + void *args[] = {&mean_ptr, + &variance_ptr, + &a_ptr, + &b_ptr, + &rect, + &r}; + CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args); + cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + return !have_error(); + } + + bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr, + device_ptr sample_variance_ptr, device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, DenoisingTask *task) + { + (void) task; + + if(have_error()) + return false; + + cuda_push_context(); + + CUfunction cuFilterDivideShadow; + cuda_assert(cuModuleGetFunction(&cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow")); + cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE(cuFilterDivideShadow, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + bool use_split_variance = use_split_kernel(); + void *args[] = {&task->render_buffer.samples, + &task->tiles_mem.device_pointer, + &a_ptr, + &b_ptr, + &sample_variance_ptr, + &sv_variance_ptr, + &buffer_variance_ptr, + &task->rect, + &task->render_buffer.pass_stride, + &task->render_buffer.denoising_data_offset, + &use_split_variance}; + CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args); + cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + return !have_error(); + } + + bool denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + DenoisingTask *task) + { + if(have_error()) + return false; + + cuda_push_context(); + + CUfunction cuFilterGetFeature; + cuda_assert(cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature")); + cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE(cuFilterGetFeature, + task->rect.z-task->rect.x, + task->rect.w-task->rect.y); + + bool use_split_variance = use_split_kernel(); + void *args[] = {&task->render_buffer.samples, + &task->tiles_mem.device_pointer, + &mean_offset, + &variance_offset, + &mean_ptr, + &variance_ptr, + &task->rect, + &task->render_buffer.pass_stride, + &task->render_buffer.denoising_data_offset, + &use_split_variance}; + CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args); + cuda_assert(cuCtxSynchronize()); + + cuda_pop_context(); + return !have_error(); + } + + void denoise(RenderTile &rtile, const DeviceTask &task) + { + DenoisingTask denoising(this); + + denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising); + denoising.functions.reconstruct = function_bind(&CUDADevice::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); + denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); + denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising); + denoising.functions.set_tiles = function_bind(&CUDADevice::denoising_set_tiles, this, _1, &denoising); + + denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); + denoising.render_buffer.samples = rtile.sample; + + RenderTile rtiles[9]; + rtiles[4] = rtile; + task.map_neighbor_tiles(rtiles, this); + denoising.tiles_from_rendertiles(rtiles); + + denoising.init_from_devicetask(task); + + denoising.run_denoising(); + + task.unmap_neighbor_tiles(rtiles, this); + } + void path_trace(RenderTile& rtile, int sample, bool branched) { if(have_error()) @@ -1305,7 +1699,7 @@ public: void thread_run(DeviceTask *task) { - if(task->type == DeviceTask::PATH_TRACE) { + if(task->type == DeviceTask::RENDER) { RenderTile tile; bool branched = task->integrator_branched; @@ -1313,30 +1707,8 @@ public: /* Upload Bindless Mapping */ load_bindless_mapping(); - if(!use_split_kernel()) { - /* keep rendering tiles until done */ - while(task->acquire_tile(this, tile)) { - int start_sample = tile.start_sample; - int end_sample = tile.start_sample + tile.num_samples; - - for(int sample = start_sample; sample < end_sample; sample++) { - if(task->get_cancel()) { - if(task->need_finish_queue == false) - break; - } - - path_trace(tile, sample, branched); - - tile.sample = sample + 1; - - task->update_progress(&tile, tile.w*tile.h); - } - - task->release_tile(tile); - } - } - else { - DeviceRequestedFeatures requested_features; + DeviceRequestedFeatures requested_features; + if(use_split_kernel()) { if(!use_adaptive_compilation()) { requested_features.max_closure = 64; } @@ -1345,18 +1717,47 @@ public: split_kernel = new CUDASplitKernel(this); split_kernel->load_kernels(requested_features); } + } + + /* keep rendering tiles until done */ + while(task->acquire_tile(this, tile)) { + if(tile.task == RenderTile::PATH_TRACE) { + if(use_split_kernel()) { + device_memory void_buffer; + split_kernel->path_trace(task, tile, void_buffer, void_buffer); + } + else { + int start_sample = tile.start_sample; + int end_sample = tile.start_sample + tile.num_samples; + + for(int sample = start_sample; sample < end_sample; sample++) { + if(task->get_cancel()) { + if(task->need_finish_queue == false) + break; + } - while(task->acquire_tile(this, tile)) { - device_memory void_buffer; - split_kernel->path_trace(task, tile, void_buffer, void_buffer); + path_trace(tile, sample, branched); - task->release_tile(tile); + tile.sample = sample + 1; - if(task->get_cancel()) { - if(task->need_finish_queue == false) - break; + task->update_progress(&tile, tile.w*tile.h); + } } } + else if(tile.task == RenderTile::DENOISE) { + tile.sample = tile.start_sample + tile.num_samples; + + denoise(tile, *task); + + task->update_progress(&tile, tile.w*tile.h); + } + + task->release_tile(tile); + + if(task->get_cancel()) { + if(task->need_finish_queue == false) + break; + } } } else if(task->type == DeviceTask::SHADER) { |