Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'intern/cycles/device/device_cuda.cpp')
-rw-r--r--intern/cycles/device/device_cuda.cpp549
1 files changed, 499 insertions, 50 deletions
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index ef283c9d455..3a29538aa13 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -21,11 +21,14 @@
#include <string.h>
#include "device/device.h"
+#include "device/device_denoising.h"
#include "device/device_intern.h"
#include "device/device_split_kernel.h"
#include "render/buffers.h"
+#include "kernel/filter/filter_defines.h"
+
#ifdef WITH_CUDA_DYNLOAD
# include "cuew.h"
#else
@@ -102,7 +105,8 @@ public:
device_memory& use_queues_flag,
device_memory& work_pool_wgs);
- virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
+ virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name,
+ const DeviceRequestedFeatures&);
virtual int2 split_kernel_local_size();
virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
};
@@ -113,12 +117,13 @@ public:
DedicatedTaskPool task_pool;
CUdevice cuDevice;
CUcontext cuContext;
- CUmodule cuModule;
+ CUmodule cuModule, cuFilterModule;
map<device_ptr, bool> tex_interp_map;
map<device_ptr, uint> tex_bindless_map;
int cuDevId;
int cuDevArchitecture;
bool first_error;
+ CUDASplitKernel *split_kernel;
struct PixelMem {
GLuint cuPBO;
@@ -169,7 +174,7 @@ public:
CUresult result = stmt; \
\
if(result != CUDA_SUCCESS) { \
- string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
+ string message = string_printf("CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \
if(error_msg == "") \
error_msg = message; \
fprintf(stderr, "%s\n", message.c_str()); \
@@ -221,6 +226,11 @@ public:
cuDevice = 0;
cuContext = 0;
+ cuModule = 0;
+ cuFilterModule = 0;
+
+ split_kernel = NULL;
+
need_bindless_mapping = false;
/* intialize */
@@ -260,6 +270,8 @@ public:
{
task_pool.stop();
+ delete split_kernel;
+
if(info.has_bindless_textures) {
tex_free(bindless_mapping);
}
@@ -296,7 +308,8 @@ public:
* kernel sources md5 and only depends on compiler or compilation settings.
*/
string compile_kernel_get_common_cflags(
- const DeviceRequestedFeatures& requested_features, bool split=false)
+ const DeviceRequestedFeatures& requested_features,
+ bool filter=false, bool split=false)
{
const int cuda_version = cuewCompilerVersion();
const int machine = system_cpu_bits();
@@ -311,7 +324,7 @@ public:
machine,
cuda_version,
include_path.c_str());
- if(use_adaptive_compilation()) {
+ if(!filter && use_adaptive_compilation()) {
cflags += " " + requested_features.get_build_options();
}
const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
@@ -359,8 +372,22 @@ public:
return true;
}
- string compile_kernel(const DeviceRequestedFeatures& requested_features, bool split=false)
+ string compile_kernel(const DeviceRequestedFeatures& requested_features,
+ bool filter=false, bool split=false)
{
+ const char *name, *source;
+ if(filter) {
+ name = "filter";
+ source = "filter.cu";
+ }
+ else if(split) {
+ name = "kernel_split";
+ source = "kernel_split.cu";
+ }
+ else {
+ name = "kernel";
+ source = "kernel.cu";
+ }
/* Compute cubin name. */
int major, minor;
cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
@@ -368,9 +395,8 @@ public:
/* Attempt to use kernel provided with Blender. */
if(!use_adaptive_compilation()) {
- const string cubin = path_get(string_printf(split ? "lib/kernel_split_sm_%d%d.cubin"
- : "lib/kernel_sm_%d%d.cubin",
- major, minor));
+ const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin",
+ name, major, minor));
VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
if(path_exists(cubin)) {
VLOG(1) << "Using precompiled kernel.";
@@ -379,7 +405,7 @@ public:
}
const string common_cflags =
- compile_kernel_get_common_cflags(requested_features, split);
+ compile_kernel_get_common_cflags(requested_features, filter, split);
/* Try to use locally compiled kernel. */
const string source_path = path_get("source");
@@ -390,9 +416,8 @@ public:
*/
const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags);
- const string cubin_file = string_printf(split ? "cycles_kernel_split_sm%d%d_%s.cubin"
- : "cycles_kernel_sm%d%d_%s.cubin",
- major, minor,
+ const string cubin_file = string_printf("cycles_%s_sm%d%d_%s.cubin",
+ name, major, minor,
cubin_md5.c_str());
const string cubin = path_cache_get(path_join("kernels", cubin_file));
VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
@@ -427,7 +452,7 @@ public:
const string kernel = path_join(
path_join(source_path, "kernel"),
path_join("kernels",
- path_join("cuda", split ? "kernel_split.cu" : "kernel.cu")));
+ path_join("cuda", source)));
double starttime = time_dt();
printf("Compiling CUDA kernel ...\n");
@@ -466,6 +491,16 @@ public:
bool load_kernels(const DeviceRequestedFeatures& requested_features)
{
+ /* TODO(sergey): Support kernels re-load for CUDA devices.
+ *
+ * Currently re-loading kernel will invalidate memory pointers,
+ * causing problems in cuCtxSynchronize.
+ */
+ if(cuFilterModule && cuModule) {
+ VLOG(1) << "Skipping kernel reload, not currently supported.";
+ return true;
+ }
+
/* check if cuda init succeeded */
if(cuContext == 0)
return false;
@@ -475,11 +510,14 @@ public:
return false;
/* get kernel */
- string cubin = compile_kernel(requested_features, use_split_kernel());
-
+ string cubin = compile_kernel(requested_features, false, use_split_kernel());
if(cubin == "")
return false;
+ string filter_cubin = compile_kernel(requested_features, true, false);
+ if(filter_cubin == "")
+ return false;
+
/* open module */
cuda_push_context();
@@ -494,6 +532,14 @@ public:
if(cuda_error_(result, "cuModuleLoad"))
cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str()));
+ if(path_read_text(filter_cubin, cubin_data))
+ result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str());
+ else
+ result = CUDA_ERROR_FILE_NOT_FOUND;
+
+ if(cuda_error_(result, "cuModuleLoad"))
+ cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str()));
+
cuda_pop_context();
return (result == CUDA_SUCCESS);
@@ -576,6 +622,11 @@ public:
}
}
+ virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/, MemoryType /*type*/)
+ {
+ return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset));
+ }
+
void const_copy_to(const char *name, void *host, size_t size)
{
CUdeviceptr mem;
@@ -876,6 +927,393 @@ public:
}
}
+ bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task)
+ {
+ mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_ONLY);
+
+ TilesInfo *tiles = (TilesInfo*) task->tiles_mem.data_pointer;
+ for(int i = 0; i < 9; i++) {
+ tiles->buffers[i] = buffers[i];
+ }
+
+ mem_copy_to(task->tiles_mem);
+
+ return !have_error();
+ }
+
+#define CUDA_GET_BLOCKSIZE(func, w, h) \
+ int threads_per_block; \
+ cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
+ int threads = (int)sqrt((float)threads_per_block); \
+ int xblocks = ((w) + threads - 1)/threads; \
+ int yblocks = ((h) + threads - 1)/threads;
+
+#define CUDA_LAUNCH_KERNEL(func, args) \
+ cuda_assert(cuLaunchKernel(func, \
+ xblocks, yblocks, 1, \
+ threads, threads, 1, \
+ 0, 0, args, 0));
+
+ bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
+ DenoisingTask *task)
+ {
+ if(have_error())
+ return false;
+
+ cuda_push_context();
+
+ int4 rect = task->rect;
+ int w = align_up(rect.z-rect.x, 4);
+ int h = rect.w-rect.y;
+ int r = task->nlm_state.r;
+ int f = task->nlm_state.f;
+ float a = task->nlm_state.a;
+ float k_2 = task->nlm_state.k_2;
+
+ CUdeviceptr difference = task->nlm_state.temporary_1_ptr;
+ CUdeviceptr blurDifference = task->nlm_state.temporary_2_ptr;
+ CUdeviceptr weightAccum = task->nlm_state.temporary_3_ptr;
+
+ cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*w*h));
+ cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*w*h));
+
+ CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput, cuNLMNormalize;
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+ cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+ cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
+ cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
+
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
+
+ CUDA_GET_BLOCKSIZE(cuNLMCalcDifference, rect.z-rect.x, rect.w-rect.y);
+
+ int dx, dy;
+ int4 local_rect;
+ int channel_offset = 0;
+ void *calc_difference_args[] = {&dx, &dy, &guide_ptr, &variance_ptr, &difference, &local_rect, &w, &channel_offset, &a, &k_2};
+ void *blur_args[] = {&difference, &blurDifference, &local_rect, &w, &f};
+ void *calc_weight_args[] = {&blurDifference, &difference, &local_rect, &w, &f};
+ void *update_output_args[] = {&dx, &dy, &blurDifference, &image_ptr, &out_ptr, &weightAccum, &local_rect, &w, &f};
+
+ for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+ dy = i / (2*r+1) - r;
+ dx = i % (2*r+1) - r;
+ local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy));
+
+ CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args);
+ CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
+ CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args);
+ CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
+ CUDA_LAUNCH_KERNEL(cuNLMUpdateOutput, update_output_args);
+ }
+
+ local_rect = make_int4(0, 0, rect.z-rect.x, rect.w-rect.y);
+ void *normalize_args[] = {&out_ptr, &weightAccum, &local_rect, &w};
+ CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
+ cuda_assert(cuCtxSynchronize());
+
+ cuda_pop_context();
+ return !have_error();
+ }
+
+ bool denoising_construct_transform(DenoisingTask *task)
+ {
+ if(have_error())
+ return false;
+
+ cuda_push_context();
+
+ CUfunction cuFilterConstructTransform;
+ cuda_assert(cuModuleGetFunction(&cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
+ CUDA_GET_BLOCKSIZE(cuFilterConstructTransform,
+ task->storage.w,
+ task->storage.h);
+
+ void *args[] = {&task->buffer.mem.device_pointer,
+ &task->storage.transform.device_pointer,
+ &task->storage.rank.device_pointer,
+ &task->filter_area,
+ &task->rect,
+ &task->radius,
+ &task->pca_threshold,
+ &task->buffer.pass_stride};
+ CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
+ cuda_assert(cuCtxSynchronize());
+
+ cuda_pop_context();
+ return !have_error();
+ }
+
+ bool denoising_reconstruct(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr output_ptr,
+ DenoisingTask *task)
+ {
+ if(have_error())
+ return false;
+
+ mem_zero(task->storage.XtWX);
+ mem_zero(task->storage.XtWY);
+
+ cuda_push_context();
+
+ CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian, cuFinalize;
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+ cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+ cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
+ cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
+
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
+ cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
+
+ CUDA_GET_BLOCKSIZE(cuNLMCalcDifference,
+ task->reconstruction_state.source_w,
+ task->reconstruction_state.source_h);
+
+ CUdeviceptr difference = task->reconstruction_state.temporary_1_ptr;
+ CUdeviceptr blurDifference = task->reconstruction_state.temporary_2_ptr;
+
+ int r = task->radius;
+ int f = 4;
+ float a = 1.0f;
+ for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+ int dy = i / (2*r+1) - r;
+ int dx = i % (2*r+1) - r;
+
+ int local_rect[4] = {max(0, -dx), max(0, -dy),
+ task->reconstruction_state.source_w - max(0, dx),
+ task->reconstruction_state.source_h - max(0, dy)};
+
+ void *calc_difference_args[] = {&dx, &dy,
+ &color_ptr,
+ &color_variance_ptr,
+ &difference,
+ &local_rect,
+ &task->buffer.w,
+ &task->buffer.pass_stride,
+ &a,
+ &task->nlm_k_2};
+ CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args);
+
+ void *blur_args[] = {&difference,
+ &blurDifference,
+ &local_rect,
+ &task->buffer.w,
+ &f};
+ CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
+
+ void *calc_weight_args[] = {&blurDifference,
+ &difference,
+ &local_rect,
+ &task->buffer.w,
+ &f};
+ CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args);
+
+ /* Reuse previous arguments. */
+ CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
+
+ void *construct_gramian_args[] = {&dx, &dy,
+ &blurDifference,
+ &task->buffer.mem.device_pointer,
+ &task->storage.transform.device_pointer,
+ &task->storage.rank.device_pointer,
+ &task->storage.XtWX.device_pointer,
+ &task->storage.XtWY.device_pointer,
+ &local_rect,
+ &task->reconstruction_state.filter_rect,
+ &task->buffer.w,
+ &task->buffer.h,
+ &f,
+ &task->buffer.pass_stride};
+ CUDA_LAUNCH_KERNEL(cuNLMConstructGramian, construct_gramian_args);
+ }
+
+ void *finalize_args[] = {&task->buffer.w,
+ &task->buffer.h,
+ &output_ptr,
+ &task->storage.rank.device_pointer,
+ &task->storage.XtWX.device_pointer,
+ &task->storage.XtWY.device_pointer,
+ &task->filter_area,
+ &task->reconstruction_state.buffer_params.x,
+ &task->render_buffer.samples};
+ CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
+ cuda_assert(cuCtxSynchronize());
+
+ cuda_pop_context();
+ return !have_error();
+ }
+
+ bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr,
+ device_ptr mean_ptr, device_ptr variance_ptr,
+ int r, int4 rect, DenoisingTask *task)
+ {
+ if(have_error())
+ return false;
+
+ cuda_push_context();
+
+ CUfunction cuFilterCombineHalves;
+ cuda_assert(cuModuleGetFunction(&cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(cuFilterCombineHalves,
+ task->rect.z-task->rect.x,
+ task->rect.w-task->rect.y);
+
+ void *args[] = {&mean_ptr,
+ &variance_ptr,
+ &a_ptr,
+ &b_ptr,
+ &rect,
+ &r};
+ CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
+ cuda_assert(cuCtxSynchronize());
+
+ cuda_pop_context();
+ return !have_error();
+ }
+
+ bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr,
+ device_ptr sample_variance_ptr, device_ptr sv_variance_ptr,
+ device_ptr buffer_variance_ptr, DenoisingTask *task)
+ {
+ if(have_error())
+ return false;
+
+ cuda_push_context();
+
+ CUfunction cuFilterDivideShadow;
+ cuda_assert(cuModuleGetFunction(&cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(cuFilterDivideShadow,
+ task->rect.z-task->rect.x,
+ task->rect.w-task->rect.y);
+
+ bool use_split_variance = use_split_kernel();
+ void *args[] = {&task->render_buffer.samples,
+ &task->tiles_mem.device_pointer,
+ &a_ptr,
+ &b_ptr,
+ &sample_variance_ptr,
+ &sv_variance_ptr,
+ &buffer_variance_ptr,
+ &task->rect,
+ &task->render_buffer.pass_stride,
+ &task->render_buffer.denoising_data_offset,
+ &use_split_variance};
+ CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
+ cuda_assert(cuCtxSynchronize());
+
+ cuda_pop_context();
+ return !have_error();
+ }
+
+ bool denoising_get_feature(int mean_offset,
+ int variance_offset,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ DenoisingTask *task)
+ {
+ if(have_error())
+ return false;
+
+ cuda_push_context();
+
+ CUfunction cuFilterGetFeature;
+ cuda_assert(cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(cuFilterGetFeature,
+ task->rect.z-task->rect.x,
+ task->rect.w-task->rect.y);
+
+ bool use_split_variance = use_split_kernel();
+ void *args[] = {&task->render_buffer.samples,
+ &task->tiles_mem.device_pointer,
+ &mean_offset,
+ &variance_offset,
+ &mean_ptr,
+ &variance_ptr,
+ &task->rect,
+ &task->render_buffer.pass_stride,
+ &task->render_buffer.denoising_data_offset,
+ &use_split_variance};
+ CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
+ cuda_assert(cuCtxSynchronize());
+
+ cuda_pop_context();
+ return !have_error();
+ }
+
+ bool denoising_detect_outliers(device_ptr image_ptr,
+ device_ptr variance_ptr,
+ device_ptr depth_ptr,
+ device_ptr output_ptr,
+ DenoisingTask *task)
+ {
+ if(have_error())
+ return false;
+
+ cuda_push_context();
+
+ CUfunction cuFilterDetectOutliers;
+ cuda_assert(cuModuleGetFunction(&cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(cuFilterDetectOutliers,
+ task->rect.z-task->rect.x,
+ task->rect.w-task->rect.y);
+
+ void *args[] = {&image_ptr,
+ &variance_ptr,
+ &depth_ptr,
+ &output_ptr,
+ &task->rect,
+ &task->buffer.pass_stride};
+
+ CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args);
+ cuda_assert(cuCtxSynchronize());
+
+ cuda_pop_context();
+ return !have_error();
+ }
+
+ void denoise(RenderTile &rtile, const DeviceTask &task)
+ {
+ DenoisingTask denoising(this);
+
+ denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising);
+ denoising.functions.reconstruct = function_bind(&CUDADevice::denoising_reconstruct, this, _1, _2, _3, &denoising);
+ denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
+ denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.detect_outliers = function_bind(&CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.set_tiles = function_bind(&CUDADevice::denoising_set_tiles, this, _1, &denoising);
+
+ denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
+ denoising.render_buffer.samples = rtile.sample;
+
+ RenderTile rtiles[9];
+ rtiles[4] = rtile;
+ task.map_neighbor_tiles(rtiles, this);
+ denoising.tiles_from_rendertiles(rtiles);
+
+ denoising.init_from_devicetask(task);
+
+ denoising.run_denoising();
+
+ task.unmap_neighbor_tiles(rtiles, this);
+ }
+
void path_trace(RenderTile& rtile, int sample, bool branched)
{
if(have_error())
@@ -1300,7 +1738,7 @@ public:
void thread_run(DeviceTask *task)
{
- if(task->type == DeviceTask::PATH_TRACE) {
+ if(task->type == DeviceTask::RENDER) {
RenderTile tile;
bool branched = task->integrator_branched;
@@ -1308,47 +1746,56 @@ public:
/* Upload Bindless Mapping */
load_bindless_mapping();
- if(!use_split_kernel()) {
- /* keep rendering tiles until done */
- while(task->acquire_tile(this, tile)) {
- int start_sample = tile.start_sample;
- int end_sample = tile.start_sample + tile.num_samples;
+ DeviceRequestedFeatures requested_features;
+ if(use_split_kernel()) {
+ if(!use_adaptive_compilation()) {
+ requested_features.max_closure = 64;
+ }
+
+ if(split_kernel == NULL) {
+ split_kernel = new CUDASplitKernel(this);
+ split_kernel->load_kernels(requested_features);
+ }
+ }
- for(int sample = start_sample; sample < end_sample; sample++) {
- if(task->get_cancel()) {
- if(task->need_finish_queue == false)
- break;
- }
+ /* keep rendering tiles until done */
+ while(task->acquire_tile(this, tile)) {
+ if(tile.task == RenderTile::PATH_TRACE) {
+ if(use_split_kernel()) {
+ device_memory void_buffer;
+ split_kernel->path_trace(task, tile, void_buffer, void_buffer);
+ }
+ else {
+ int start_sample = tile.start_sample;
+ int end_sample = tile.start_sample + tile.num_samples;
- path_trace(tile, sample, branched);
+ for(int sample = start_sample; sample < end_sample; sample++) {
+ if(task->get_cancel()) {
+ if(task->need_finish_queue == false)
+ break;
+ }
- tile.sample = sample + 1;
+ path_trace(tile, sample, branched);
- task->update_progress(&tile, tile.w*tile.h);
- }
+ tile.sample = sample + 1;
- task->release_tile(tile);
- }
- }
- else {
- DeviceRequestedFeatures requested_features;
- if(!use_adaptive_compilation()) {
- requested_features.max_closure = 64;
+ task->update_progress(&tile, tile.w*tile.h);
+ }
+ }
}
+ else if(tile.task == RenderTile::DENOISE) {
+ tile.sample = tile.start_sample + tile.num_samples;
- CUDASplitKernel split_kernel(this);
- split_kernel.load_kernels(requested_features);
+ denoise(tile, *task);
- while(task->acquire_tile(this, tile)) {
- device_memory void_buffer;
- split_kernel.path_trace(task, tile, void_buffer, void_buffer);
+ task->update_progress(&tile, tile.w*tile.h);
+ }
- task->release_tile(tile);
+ task->release_tile(tile);
- if(task->get_cancel()) {
- if(task->need_finish_queue == false)
- break;
- }
+ if(task->get_cancel()) {
+ if(task->need_finish_queue == false)
+ break;
}
}
}
@@ -1591,7 +2038,8 @@ bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim
return !device->have_error();
}
-SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&)
+SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(const string& kernel_name,
+ const DeviceRequestedFeatures&)
{
CUfunction func;
@@ -1627,7 +2075,8 @@ int2 CUDASplitKernel::split_kernel_global_size(device_memory& kg, device_memory&
<< string_human_readable_size(free) << ").";
size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2);
- int2 global_size = make_int2(round_down((int)sqrt(num_elements), 32), (int)sqrt(num_elements));
+ size_t side = round_down((int)sqrt(num_elements), 32);
+ int2 global_size = make_int2(side, round_down(num_elements / side, 16));
VLOG(1) << "Global size: " << global_size << ".";
return global_size;
}