Diffstat (limited to 'intern/cycles/device/device_cuda.cpp')
-rw-r--r-- | intern/cycles/device/device_cuda.cpp | 1224
1 file changed, 976 insertions, 248 deletions
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index fbb97f78e70..216c85f24e7 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -15,32 +15,39 @@
  */
 
 #include <climits>
+#include <limits.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
-#include "device.h"
-#include "device_intern.h"
+#include "device/device.h"
+#include "device/device_denoising.h"
+#include "device/device_intern.h"
+#include "device/device_split_kernel.h"
 
-#include "buffers.h"
+#include "render/buffers.h"
+
+#include "kernel/filter/filter_defines.h"
 
 #ifdef WITH_CUDA_DYNLOAD
 #  include "cuew.h"
 #else
-#  include "util_opengl.h"
+#  include "util/util_opengl.h"
 #  include <cuda.h>
 #  include <cudaGL.h>
 #endif
 
-#include "util_debug.h"
-#include "util_logging.h"
-#include "util_map.h"
-#include "util_md5.h"
-#include "util_opengl.h"
-#include "util_path.h"
-#include "util_string.h"
-#include "util_system.h"
-#include "util_types.h"
-#include "util_time.h"
+#include "util/util_debug.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_md5.h"
+#include "util/util_opengl.h"
+#include "util/util_path.h"
+#include "util/util_string.h"
+#include "util/util_system.h"
+#include "util/util_types.h"
+#include "util/util_time.h"
+
+#include "kernel/split/kernel_split_data_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -78,18 +85,55 @@ int cuewCompilerVersion(void)
 }  /* namespace */
 #endif  /* WITH_CUDA_DYNLOAD */
 
+class CUDADevice;
+
+class CUDASplitKernel : public DeviceSplitKernel {
+	CUDADevice *device;
+public:
+	explicit CUDASplitKernel(CUDADevice *device);
+
+	virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
+
+	virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
+	                                            RenderTile& rtile,
+	                                            int num_global_elements,
+	                                            device_memory& kernel_globals,
+	                                            device_memory& kernel_data_,
+	                                            device_memory& split_data,
+	                                            device_memory& ray_state,
+	                                            device_memory& queue_index,
+	                                            device_memory& use_queues_flag,
+	                                            device_memory& work_pool_wgs);
+
+	virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name,
+	                                                       const DeviceRequestedFeatures&);
+	virtual int2 split_kernel_local_size();
+	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
+};
+
+/* Utility to push/pop CUDA context. */
+class CUDAContextScope {
+public:
+	CUDAContextScope(CUDADevice *device);
+	~CUDAContextScope();
+
+private:
+	CUDADevice *device;
+};
+
 class CUDADevice : public Device
 {
 public:
 	DedicatedTaskPool task_pool;
 	CUdevice cuDevice;
 	CUcontext cuContext;
-	CUmodule cuModule;
+	CUmodule cuModule, cuFilterModule;
 	map<device_ptr, bool> tex_interp_map;
-	map<device_ptr, uint> tex_bindless_map;
+	map<device_ptr, CUtexObject> tex_bindless_map;
 	int cuDevId;
 	int cuDevArchitecture;
 	bool first_error;
+	CUDASplitKernel *split_kernel;
 
 	struct PixelMem {
 		GLuint cuPBO;
@@ -101,8 +145,8 @@ public:
 	map<device_ptr, PixelMem> pixel_mem_map;
 
 	/* Bindless Textures */
-	device_vector<uint> bindless_mapping;
-	bool need_bindless_mapping;
+	device_vector<TextureInfo> texture_info;
+	bool need_texture_info;
 
 	CUdeviceptr cuda_device_ptr(device_ptr mem)
 	{
@@ -115,6 +159,12 @@ public:
 		return path_exists(cubins_path);
 	}
 
+	virtual bool show_samples() const
+	{
+		/* The CUDADevice only processes one tile at a time, so showing samples is fine. */
+		return true;
+	}
+
 /*#ifdef NDEBUG
 #define cuda_abort()
 #else
 #define cuda_abort() abort()
 #endif*/
@@ -124,7 +174,7 @@ public:
 	{
 		if(first_error) {
 			fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
-			fprintf(stderr, "http://www.blender.org/manual/render/cycles/gpu_rendering.html\n\n");
+			fprintf(stderr, "https://docs.blender.org/manual/en/dev/render/cycles/gpu_rendering.html\n\n");
 			first_error = false;
 		}
 	}
@@ -134,7 +184,7 @@ public:
 		CUresult result = stmt; \
 		\
 		if(result != CUDA_SUCCESS) { \
-			string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
+			string message = string_printf("CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \
 			if(error_msg == "") \
 				error_msg = message; \
 			fprintf(stderr, "%s\n", message.c_str()); \
@@ -166,16 +216,6 @@ public:
 		cuda_error_documentation();
 	}
 
-	void cuda_push_context()
-	{
-		cuda_assert(cuCtxSetCurrent(cuContext));
-	}
-
-	void cuda_pop_context()
-	{
-		cuda_assert(cuCtxSetCurrent(NULL));
-	}
-
 	CUDADevice(DeviceInfo& info, Stats &stats, bool background_)
 	: Device(info, stats, background_)
 	{
@@ -186,7 +226,12 @@ public:
 		cuDevice = 0;
 		cuContext = 0;
 
-		need_bindless_mapping = false;
+		cuModule = 0;
+		cuFilterModule = 0;
+
+		split_kernel = NULL;
+
+		need_texture_info = false;
 
 		/* intialize */
 		if(cuda_error(cuInit(0)))
@@ -218,15 +263,18 @@ public:
 		cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
 		cuDevArchitecture = major*100 + minor*10;
 
-		cuda_pop_context();
+		/* Pop context set by cuCtxCreate. */
+		cuCtxPopCurrent(NULL);
 	}
 
 	~CUDADevice()
 	{
 		task_pool.stop();
 
+		delete split_kernel;
+
 		if(info.has_bindless_textures) {
-			tex_free(bindless_mapping);
+			tex_free(texture_info);
 		}
 
 		cuda_assert(cuCtxDestroy(cuContext));
@@ -252,16 +300,22 @@ public:
 		return DebugFlags().cuda.adaptive_compile;
	}
 
+	bool use_split_kernel()
+	{
+		return DebugFlags().cuda.split_kernel;
+	}
+
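
The `cuda_assert` macro above records the first driver-API error and now includes the source line in the message. A minimal standalone sketch of the same pattern, assuming plain `cuGetErrorString()` in place of cuew's `cuewErrorString()` and stderr in place of the device's `error_msg` bookkeeping:

    #include <cuda.h>
    #include <stdio.h>

    /* Sketch of the error-check macro pattern above: run the driver-API call,
     * and on failure report the statement text, error string and source line. */
    #define check_cuda(stmt) \
    	{ \
    		CUresult result_ = (stmt); \
    		if(result_ != CUDA_SUCCESS) { \
    			const char *err_ = NULL; \
    			cuGetErrorString(result_, &err_); \
    			fprintf(stderr, "CUDA error: %s in %s, line %d\n", \
    			        err_ ? err_ : "unknown", #stmt, __LINE__); \
    		} \
    	} (void)0
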
 	/* Common NVCC flags which stays the same regardless of shading model,
 	 * kernel sources md5 and only depends on compiler or compilation settings.
 	 */
 	string compile_kernel_get_common_cflags(
-	        const DeviceRequestedFeatures& requested_features)
+	        const DeviceRequestedFeatures& requested_features,
+	        bool filter=false, bool split=false)
 	{
 		const int cuda_version = cuewCompilerVersion();
 		const int machine = system_cpu_bits();
-		const string kernel_path = path_get("kernel");
-		const string include = kernel_path;
+		const string source_path = path_get("source");
+		const string include_path = source_path;
 		string cflags = string_printf("-m%d "
 		                              "--ptxas-options=\"-v\" "
 		                              "--use_fast_math "
@@ -270,8 +324,8 @@ public:
 		                              "-I\"%s\"",
 		                              machine,
 		                              cuda_version,
-		                              include.c_str());
-		if(use_adaptive_compilation()) {
+		                              include_path.c_str());
+		if(!filter && use_adaptive_compilation()) {
 			cflags += " " + requested_features.get_build_options();
 		}
 		const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
@@ -281,6 +335,11 @@ public:
 #ifdef WITH_CYCLES_DEBUG
 		cflags += " -D__KERNEL_DEBUG__";
 #endif
+
+		if(split) {
+			cflags += " -D__SPLIT__";
+		}
+
 		return cflags;
 	}
 
@@ -300,22 +359,36 @@ public:
 			cuda_error_message("CUDA nvcc compiler version could not be parsed.");
 			return false;
 		}
-		if(cuda_version < 75) {
+		if(cuda_version < 80) {
 			printf("Unsupported CUDA version %d.%d detected, "
-			       "you need CUDA 7.5 or newer.\n",
+			       "you need CUDA 8.0 or newer.\n",
 			       major, minor);
 			return false;
 		}
-		else if(cuda_version != 75 && cuda_version != 80) {
+		else if(cuda_version != 80) {
 			printf("CUDA version %d.%d detected, build may succeed but only "
-			       "CUDA 7.5 and 8.0 are officially supported.\n",
+			       "CUDA 8.0 is officially supported.\n",
 			       major, minor);
 		}
 		return true;
 	}
 
-	string compile_kernel(const DeviceRequestedFeatures& requested_features)
+	string compile_kernel(const DeviceRequestedFeatures& requested_features,
+	                      bool filter=false, bool split=false)
 	{
+		const char *name, *source;
+		if(filter) {
+			name = "filter";
+			source = "filter.cu";
+		}
+		else if(split) {
+			name = "kernel_split";
+			source = "kernel_split.cu";
+		}
+		else {
+			name = "kernel";
+			source = "kernel.cu";
+		}
+
 		/* Compute cubin name. */
 		int major, minor;
 		cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
@@ -323,8 +396,8 @@ public:
 
 		/* Attempt to use kernel provided with Blender. */
 		if(!use_adaptive_compilation()) {
-			const string cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin",
-			                                             major, minor));
+			const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin",
+			                                            name, major, minor));
 			VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
 			if(path_exists(cubin)) {
 				VLOG(1) << "Using precompiled kernel.";
@@ -333,19 +406,19 @@ public:
 		}
 
 		const string common_cflags =
-		        compile_kernel_get_common_cflags(requested_features);
+		        compile_kernel_get_common_cflags(requested_features, filter, split);
 
 		/* Try to use locally compiled kernel. */
-		const string kernel_path = path_get("kernel");
-		const string kernel_md5 = path_files_md5_hash(kernel_path);
+		const string source_path = path_get("source");
+		const string kernel_md5 = path_files_md5_hash(source_path);
 
 		/* We include cflags into md5 so changing cuda toolkit or changing other
 		 * compiler command line arguments makes sure cubin gets re-built.
 		 */
 		const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags);
 
-		const string cubin_file = string_printf("cycles_kernel_sm%d%d_%s.cubin",
-		                                        major, minor,
+		const string cubin_file = string_printf("cycles_%s_sm%d%d_%s.cubin",
+		                                        name, major, minor,
 		                                        cubin_md5.c_str());
 		const string cubin = path_cache_get(path_join("kernels", cubin_file));
 		VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
@@ -377,9 +450,10 @@ public:
 			return "";
 		}
 		const char *nvcc = cuewCompilerPath();
-		const string kernel = path_join(kernel_path,
-		                                path_join("kernels",
-		                                          path_join("cuda", "kernel.cu")));
+		const string kernel = path_join(
+		        path_join(source_path, "kernel"),
+		        path_join("kernels",
+		                  path_join("cuda", source)));
 		double starttime = time_dt();
 		printf("Compiling CUDA kernel ...\n");
 
@@ -418,6 +492,16 @@ public:
 
 	bool load_kernels(const DeviceRequestedFeatures& requested_features)
 	{
+		/* TODO(sergey): Support kernels re-load for CUDA devices.
+		 *
+		 * Currently re-loading kernel will invalidate memory pointers,
+		 * causing problems in cuCtxSynchronize.
+		 */
+		if(cuFilterModule && cuModule) {
+			VLOG(1) << "Skipping kernel reload, not currently supported.";
+			return true;
+		}
+
 		/* check if cuda init succeeded */
 		if(cuContext == 0)
 			return false;
@@ -427,13 +511,16 @@ public:
 			return false;
 
 		/* get kernel */
-		string cubin = compile_kernel(requested_features);
-
+		string cubin = compile_kernel(requested_features, false, use_split_kernel());
 		if(cubin == "")
 			return false;
 
+		string filter_cubin = compile_kernel(requested_features, true, false);
+		if(filter_cubin == "")
+			return false;
+
 		/* open module */
-		cuda_push_context();
+		CUDAContextScope scope(this);
 
 		string cubin_data;
 		CUresult result;
@@ -446,46 +533,58 @@ public:
 		if(cuda_error_(result, "cuModuleLoad"))
 			cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str()));
 
-		cuda_pop_context();
+		if(path_read_text(filter_cubin, cubin_data))
+			result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str());
+		else
+			result = CUDA_ERROR_FILE_NOT_FOUND;
+
+		if(cuda_error_(result, "cuModuleLoad"))
+			cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str()));
 
 		return (result == CUDA_SUCCESS);
 	}
 
-	void load_bindless_mapping()
+	void load_texture_info()
 	{
-		if(info.has_bindless_textures && need_bindless_mapping) {
-			tex_free(bindless_mapping);
-			tex_alloc("__bindless_mapping", bindless_mapping, INTERPOLATION_NONE, EXTENSION_REPEAT);
-			need_bindless_mapping = false;
+		if(info.has_bindless_textures && need_texture_info) {
+			tex_free(texture_info);
+			tex_alloc("__texture_info", texture_info, INTERPOLATION_NONE, EXTENSION_REPEAT);
+			need_texture_info = false;
 		}
 	}
 
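
`load_texture_info()` is the upload half of a dirty-flag pattern: `tex_alloc()` only mutates the host-side table and sets `need_texture_info`, and the table is re-uploaded at most once per task. A reduced sketch of the idea, where `Entry` and the upload step are hypothetical stand-ins for `TextureInfo` and `tex_alloc()`:

    #include <vector>

    /* Dirty-flag pattern sketch: mutate the host table freely, upload to the
     * device only once, right before kernels that read it are launched. */
    struct Entry { unsigned long long data; };

    struct TextureTable {
    	std::vector<Entry> entries;
    	bool dirty = false;

    	void set(size_t slot, Entry e)
    	{
    		if(slot >= entries.size())
    			entries.resize(slot + 128);  /* grow in chunks, as above */
    		entries[slot] = e;
    		dirty = true;
    	}

    	void flush()  /* called once per task, like load_texture_info() */
    	{
    		if(dirty) {
    			/* the device upload of 'entries' would go here */
    			dirty = false;
    		}
    	}
    };
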
-	void mem_alloc(device_memory& mem, MemoryType /*type*/)
+	void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/)
 	{
-		cuda_push_context();
+		CUDAContextScope scope(this);
+
+		if(name) {
+			VLOG(1) << "Buffer allocate: " << name << ", "
+			        << string_human_readable_number(mem.memory_size()) << " bytes. ("
+			        << string_human_readable_size(mem.memory_size()) << ")";
+		}
+
 		CUdeviceptr device_pointer;
 		size_t size = mem.memory_size();
 		cuda_assert(cuMemAlloc(&device_pointer, size));
 		mem.device_pointer = (device_ptr)device_pointer;
 		mem.device_size = size;
 		stats.mem_alloc(size);
-		cuda_pop_context();
 	}
 
 	void mem_copy_to(device_memory& mem)
 	{
-		cuda_push_context();
+		CUDAContextScope scope(this);
+
 		if(mem.device_pointer)
 			cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), (void*)mem.data_pointer, mem.memory_size()));
-		cuda_pop_context();
 	}
 
 	void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
 	{
+		CUDAContextScope scope(this);
 		size_t offset = elem*y*w;
 		size_t size = elem*w*h;
 
-		cuda_push_context();
 		if(mem.device_pointer) {
 			cuda_assert(cuMemcpyDtoH((uchar*)mem.data_pointer + offset,
 			                         (CUdeviceptr)(mem.device_pointer + offset), size));
@@ -493,25 +592,25 @@ public:
 		else {
 			memset((char*)mem.data_pointer + offset, 0, size);
 		}
-		cuda_pop_context();
 	}
 
 	void mem_zero(device_memory& mem)
 	{
-		memset((void*)mem.data_pointer, 0, mem.memory_size());
+		if(mem.data_pointer) {
+			memset((void*)mem.data_pointer, 0, mem.memory_size());
+		}
 
-		cuda_push_context();
-		if(mem.device_pointer)
+		if(mem.device_pointer) {
+			CUDAContextScope scope(this);
 			cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()));
-		cuda_pop_context();
+		}
 	}
 
 	void mem_free(device_memory& mem)
 	{
 		if(mem.device_pointer) {
-			cuda_push_context();
+			CUDAContextScope scope(this);
 			cuda_assert(cuMemFree(cuda_device_ptr(mem.device_pointer)));
-			cuda_pop_context();
 
 			mem.device_pointer = 0;
 
@@ -520,16 +619,20 @@ public:
 		}
 	}
 
+	virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/, MemoryType /*type*/)
+	{
+		return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset));
+	}
+
 	void const_copy_to(const char *name, void *host, size_t size)
 	{
+		CUDAContextScope scope(this);
 		CUdeviceptr mem;
 		size_t bytes;
 
-		cuda_push_context();
 		cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
 		//assert(bytes == size);
 		cuda_assert(cuMemcpyHtoD(mem, host, size));
-		cuda_pop_context();
 	}
 
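
`mem_alloc_sub_ptr()` hands out a view into an existing allocation instead of calling `cuMemAlloc()` again; the offset is given in elements and scaled to bytes by `memory_elements_size()`. The arithmetic, reduced to a sketch:

    /* A sub-pointer is plain offset arithmetic on the parent allocation,
     * scaled from elements to bytes; nothing is allocated, so nothing extra
     * is freed later. E.g. for 16-byte elements, offset 10 -> base + 160. */
    CUdeviceptr sub_ptr(CUdeviceptr base, size_t elem_offset, size_t elem_size)
    {
    	return base + elem_offset*elem_size;
    }
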
 	void tex_alloc(const char *name,
 	               device_memory& mem,
 	               InterpolationType interpolation,
 	               ExtensionType extension)
 	{
+		CUDAContextScope scope(this);
+
 		VLOG(1) << "Texture allocate: " << name << ", "
 		        << string_human_readable_number(mem.memory_size()) << " bytes. ("
 		        << string_human_readable_size(mem.memory_size()) << ")";
 
-		/* Check if we are on sm_30 or above.
-		 * We use arrays and bindles textures for storage there */
+		/* Check if we are on sm_30 or above, for bindless textures. */
 		bool has_bindless_textures = info.has_bindless_textures;
 
 		/* General variables for both architectures */
@@ -574,20 +678,10 @@ public:
 			filter_mode = CU_TR_FILTER_MODE_LINEAR;
 		}
 
-		CUarray_format_enum format;
-		switch(mem.data_type) {
-			case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
-			case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
-			case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
-			case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
-			case TYPE_HALF: format = CU_AD_FORMAT_HALF; break;
-			default: assert(0); return;
-		}
-
 		/* General variables for Fermi */
 		CUtexref texref = NULL;
 
-		if(!has_bindless_textures) {
+		if(!has_bindless_textures && interpolation != INTERPOLATION_NONE) {
 			if(mem.data_depth > 1) {
 				/* Kernel uses different bind names for 2d and 3d float textures,
 				 * so we have to adjust couple of things here.
@@ -599,59 +693,47 @@ public:
 				                           tokens[3].c_str());
 			}
 
-			cuda_push_context();
 			cuda_assert(cuModuleGetTexRef(&texref, cuModule, bind_name.c_str()));
-			cuda_pop_context();
 
 			if(!texref) {
 				return;
 			}
 		}
 
-		/* Data Storage */
 		if(interpolation == INTERPOLATION_NONE) {
-			if(has_bindless_textures) {
-				mem_alloc(mem, MEM_READ_ONLY);
-				mem_copy_to(mem);
-
-				cuda_push_context();
+			/* Data Storage */
+			mem_alloc(NULL, mem, MEM_READ_ONLY);
+			mem_copy_to(mem);
 
-				CUdeviceptr cumem;
-				size_t cubytes;
+			CUdeviceptr cumem;
+			size_t cubytes;
 
-				cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str()));
+			cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str()));
 
-				if(cubytes == 8) {
-					/* 64 bit device pointer */
-					uint64_t ptr = mem.device_pointer;
-					cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
-				}
-				else {
-					/* 32 bit device pointer */
-					uint32_t ptr = (uint32_t)mem.device_pointer;
-					cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
-				}
-
-				cuda_pop_context();
+			if(cubytes == 8) {
+				/* 64 bit device pointer */
+				uint64_t ptr = mem.device_pointer;
+				cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
 			}
 			else {
-				mem_alloc(mem, MEM_READ_ONLY);
-				mem_copy_to(mem);
-
-				cuda_push_context();
-
-				cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size));
-				cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT));
-				cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER));
-
-				cuda_pop_context();
+				/* 32 bit device pointer */
+				uint32_t ptr = (uint32_t)mem.device_pointer;
+				cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
 			}
 		}
-		/* Texture Storage */
 		else {
+			/* Texture Storage */
 			CUarray handle = NULL;
 
-			cuda_push_context();
+			CUarray_format_enum format;
+			switch(mem.data_type) {
+				case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
+				case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
+				case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
+				case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
+				case TYPE_HALF: format = CU_AD_FORMAT_HALF; break;
+				default: assert(0); return;
+			}
 
 			if(mem.data_depth > 1) {
 				CUDA_ARRAY3D_DESCRIPTOR desc;
@@ -677,7 +759,6 @@ public:
 			}
 
 			if(!handle) {
-				cuda_pop_context();
 				return;
 			}
 
@@ -718,8 +799,8 @@ public:
 
 			stats.mem_alloc(size);
 
-			/* Bindless Textures - Kepler */
 			if(has_bindless_textures) {
+				/* Bindless Textures - Kepler */
 				int flat_slot = 0;
 				if(string_startswith(name, "__tex_image")) {
 					int pos = string(name).rfind("_");
@@ -752,41 +833,39 @@ public:
 				}
 
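
The `CUtexObject` stored as `tex` below is created from the `CUarray` in a hunk elided from this diff; the driver-API shape of that step is roughly the following sketch (descriptor details in the actual Cycles code may differ):

    #include <string.h>

    /* Hedged sketch of bindless texture-object creation from a CUarray. */
    CUtexObject make_tex_object(CUarray array,
                                CUfilter_mode filter_mode,
                                CUaddress_mode address_mode)
    {
    	CUDA_RESOURCE_DESC res_desc;
    	memset(&res_desc, 0, sizeof(res_desc));
    	res_desc.resType = CU_RESOURCE_TYPE_ARRAY;
    	res_desc.res.array.hArray = array;

    	CUDA_TEXTURE_DESC tex_desc;
    	memset(&tex_desc, 0, sizeof(tex_desc));
    	tex_desc.filterMode = filter_mode;
    	tex_desc.addressMode[0] = address_mode;
    	tex_desc.addressMode[1] = address_mode;
    	tex_desc.addressMode[2] = address_mode;
    	tex_desc.flags = CU_TRSF_NORMALIZED_COORDINATES;

    	CUtexObject tex = 0;
    	cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL);
    	return tex;
    }
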
 				/* Resize once */
-				if(flat_slot >= bindless_mapping.size()) {
+				if(flat_slot >= texture_info.size()) {
 					/* Allocate some slots in advance, to reduce amount
-					 * of re-allocations.
-					 */
-					bindless_mapping.resize(flat_slot + 128);
+					 * of re-allocations. */
+					texture_info.resize(flat_slot + 128);
 				}
 
 				/* Set Mapping and tag that we need to (re-)upload to device */
-				bindless_mapping.get_data()[flat_slot] = (uint)tex;
-				tex_bindless_map[mem.device_pointer] = (uint)tex;
-				need_bindless_mapping = true;
+				TextureInfo& info = texture_info.get_data()[flat_slot];
+				info.data = (uint64_t)tex;
+				info.cl_buffer = 0;
+				info.interpolation = interpolation;
+				info.extension = extension;
+				info.width = mem.data_width;
+				info.height = mem.data_height;
+				info.depth = mem.data_depth;
+
+				tex_bindless_map[mem.device_pointer] = tex;
+				need_texture_info = true;
 			}
-			/* Regular Textures - Fermi */
 			else {
+				/* Regular Textures - Fermi */
 				cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT));
 				cuda_assert(cuTexRefSetFilterMode(texref, filter_mode));
 				cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES));
-			}
-
-			cuda_pop_context();
-		}
 
-		/* Fermi, Data and Image Textures */
-		if(!has_bindless_textures) {
-			cuda_push_context();
+				cuda_assert(cuTexRefSetAddressMode(texref, 0, address_mode));
+				cuda_assert(cuTexRefSetAddressMode(texref, 1, address_mode));
+				if(mem.data_depth > 1) {
+					cuda_assert(cuTexRefSetAddressMode(texref, 2, address_mode));
+				}
 
-			cuda_assert(cuTexRefSetAddressMode(texref, 0, address_mode));
-			cuda_assert(cuTexRefSetAddressMode(texref, 1, address_mode));
-			if(mem.data_depth > 1) {
-				cuda_assert(cuTexRefSetAddressMode(texref, 2, address_mode));
+				cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements));
 			}
-
-			cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements));
-
-			cuda_pop_context();
 		}
 
 		/* Fermi and Kepler */
@@ -797,14 +876,13 @@ public:
 	{
 		if(mem.device_pointer) {
 			if(tex_interp_map[mem.device_pointer]) {
-				cuda_push_context();
+				CUDAContextScope scope(this);
 				cuArrayDestroy((CUarray)mem.device_pointer);
-				cuda_pop_context();
 
 				/* Free CUtexObject (Bindless Textures) */
 				if(info.has_bindless_textures && tex_bindless_map[mem.device_pointer]) {
-					uint flat_slot = tex_bindless_map[mem.device_pointer];
-					cuTexObjectDestroy(flat_slot);
+					CUtexObject tex = tex_bindless_map[mem.device_pointer];
+					cuTexObjectDestroy(tex);
 				}
 
 				tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
@@ -820,64 +898,466 @@ public:
 		}
 	}
 
-	void path_trace(RenderTile& rtile, int sample, bool branched)
+	bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task)
+	{
+		mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_ONLY);
+
+		TilesInfo *tiles = (TilesInfo*) task->tiles_mem.data_pointer;
+		for(int i = 0; i < 9; i++) {
+			tiles->buffers[i] = buffers[i];
+		}
+
+		mem_copy_to(task->tiles_mem);
+
+		return !have_error();
+	}
+
+#define CUDA_GET_BLOCKSIZE(func, w, h) \
+	int threads_per_block; \
+	cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
+	int threads = (int)sqrt((float)threads_per_block); \
+	int xblocks = ((w) + threads - 1)/threads; \
+	int yblocks = ((h) + threads - 1)/threads;
+
+#define CUDA_LAUNCH_KERNEL(func, args) \
+	cuda_assert(cuLaunchKernel(func, \
+	                           xblocks, yblocks, 1, \
+	                           threads, threads, 1, \
+	                           0, 0, args, 0));
+
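
The `CUDA_GET_BLOCKSIZE`/`CUDA_LAUNCH_KERNEL` pair above derives a roughly square 2-D launch configuration from the kernel's own max-threads-per-block limit; the second macro reuses the names declared by the first. Expanded by hand for a hypothetical 1024-thread kernel over a 640x480 rect:

    #include <math.h>

    /* Hand-expansion of the two macros above, with hypothetical numbers. */
    void launch_2d_example(CUfunction func, void **args)
    {
    	int threads_per_block = 1024;  /* from cuFuncGetAttribute() */
    	int threads = (int)sqrt((float)threads_per_block);  /* 32 */
    	int xblocks = (640 + threads - 1)/threads;          /* 20 */
    	int yblocks = (480 + threads - 1)/threads;          /* 15 */
    	cuLaunchKernel(func,
    	               xblocks, yblocks, 1,   /* 20 x 15 x 1 grid */
    	               threads, threads, 1,   /* 32 x 32 x 1 block */
    	               0, 0, args, 0);
    }
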
+	bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
+	                               DenoisingTask *task)
 	{
 		if(have_error())
-			return;
+			return false;
+
+		CUDAContextScope scope(this);
+
+		int4 rect = task->rect;
+		int w = align_up(rect.z-rect.x, 4);
+		int h = rect.w-rect.y;
+		int r = task->nlm_state.r;
+		int f = task->nlm_state.f;
+		float a = task->nlm_state.a;
+		float k_2 = task->nlm_state.k_2;
+
+		CUdeviceptr difference = task->nlm_state.temporary_1_ptr;
+		CUdeviceptr blurDifference = task->nlm_state.temporary_2_ptr;
+		CUdeviceptr weightAccum = task->nlm_state.temporary_3_ptr;
+
+		cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*w*h));
+		cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*w*h));
+
+		CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput, cuNLMNormalize;
+		cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+		cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+		cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+		cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
+		cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
+
+		cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+		cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
+		cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
+		cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
+		cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
+
+		CUDA_GET_BLOCKSIZE(cuNLMCalcDifference, rect.z-rect.x, rect.w-rect.y);
+
+		int dx, dy;
+		int4 local_rect;
+		int channel_offset = 0;
+		void *calc_difference_args[] = {&dx, &dy, &guide_ptr, &variance_ptr, &difference, &local_rect, &w, &channel_offset, &a, &k_2};
+		void *blur_args[] = {&difference, &blurDifference, &local_rect, &w, &f};
+		void *calc_weight_args[] = {&blurDifference, &difference, &local_rect, &w, &f};
+		void *update_output_args[] = {&dx, &dy, &blurDifference, &image_ptr, &out_ptr, &weightAccum, &local_rect, &w, &f};
+
+		for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+			dy = i / (2*r+1) - r;
+			dx = i % (2*r+1) - r;
+			local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy));
+
+			CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args);
+			CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
+			CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args);
+			CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
+			CUDA_LAUNCH_KERNEL(cuNLMUpdateOutput, update_output_args);
+		}
+
+		local_rect = make_int4(0, 0, rect.z-rect.x, rect.w-rect.y);
+		void *normalize_args[] = {&out_ptr, &weightAccum, &local_rect, &w};
+		CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
+		cuda_assert(cuCtxSynchronize());
+
+		return !have_error();
+	}
+
+	bool denoising_construct_transform(DenoisingTask *task)
+	{
+		if(have_error())
+			return false;
+
+		CUDAContextScope scope(this);
+
+		CUfunction cuFilterConstructTransform;
+		cuda_assert(cuModuleGetFunction(&cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform"));
+		cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
+		CUDA_GET_BLOCKSIZE(cuFilterConstructTransform,
+		                   task->storage.w,
+		                   task->storage.h);
+
+		void *args[] = {&task->buffer.mem.device_pointer,
+		                &task->storage.transform.device_pointer,
+		                &task->storage.rank.device_pointer,
+		                &task->filter_area,
+		                &task->rect,
+		                &task->radius,
+		                &task->pca_threshold,
+		                &task->buffer.pass_stride};
+		CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
+		cuda_assert(cuCtxSynchronize());
+
+		return !have_error();
+	}
+
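
Both NLM loops linearize the (2r+1) x (2r+1) search window into a single index and decode it back. For r = 2 the 25 iterations map row-major to offsets (-2,-2) through (2,2), with i = 12 landing on the centre pixel:

    /* Window-offset decoding used by the NLM loops above, shown for r = 2. */
    void decode_window_offsets(int r /* e.g. 2 */)
    {
    	for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
    		int dy = i / (2*r+1) - r;  /* window row:    i = 12 -> 0 */
    		int dx = i % (2*r+1) - r;  /* window column: i = 12 -> 0 */
    		(void)dx; (void)dy;
    	}
    }
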
+	bool denoising_reconstruct(device_ptr color_ptr,
+	                           device_ptr color_variance_ptr,
+	                           device_ptr output_ptr,
+	                           DenoisingTask *task)
+	{
+		if(have_error())
+			return false;
+
+		CUDAContextScope scope(this);
+
+		mem_zero(task->storage.XtWX);
+		mem_zero(task->storage.XtWY);
+
+		CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian, cuFinalize;
+		cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+		cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+		cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+		cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
+		cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
+
+		cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+		cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
+		cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
+		cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
+		cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
+
+		CUDA_GET_BLOCKSIZE(cuNLMCalcDifference,
+		                   task->reconstruction_state.source_w,
+		                   task->reconstruction_state.source_h);
+
+		CUdeviceptr difference = task->reconstruction_state.temporary_1_ptr;
+		CUdeviceptr blurDifference = task->reconstruction_state.temporary_2_ptr;
+
+		int r = task->radius;
+		int f = 4;
+		float a = 1.0f;
+		for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+			int dy = i / (2*r+1) - r;
+			int dx = i % (2*r+1) - r;
+
+			int local_rect[4] = {max(0, -dx), max(0, -dy),
+			                     task->reconstruction_state.source_w - max(0, dx),
+			                     task->reconstruction_state.source_h - max(0, dy)};
+
+			void *calc_difference_args[] = {&dx, &dy,
+			                                &color_ptr,
+			                                &color_variance_ptr,
+			                                &difference,
+			                                &local_rect,
+			                                &task->buffer.w,
+			                                &task->buffer.pass_stride,
+			                                &a,
+			                                &task->nlm_k_2};
+			CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args);
+
+			void *blur_args[] = {&difference,
+			                     &blurDifference,
+			                     &local_rect,
+			                     &task->buffer.w,
+			                     &f};
+			CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
+
+			void *calc_weight_args[] = {&blurDifference,
+			                            &difference,
+			                            &local_rect,
+			                            &task->buffer.w,
+			                            &f};
+			CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args);
+
+			/* Reuse previous arguments. */
+			CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
+
+			void *construct_gramian_args[] = {&dx, &dy,
+			                                  &blurDifference,
+			                                  &task->buffer.mem.device_pointer,
+			                                  &task->storage.transform.device_pointer,
+			                                  &task->storage.rank.device_pointer,
+			                                  &task->storage.XtWX.device_pointer,
+			                                  &task->storage.XtWY.device_pointer,
+			                                  &local_rect,
+			                                  &task->reconstruction_state.filter_rect,
+			                                  &task->buffer.w,
+			                                  &task->buffer.h,
+			                                  &f,
+			                                  &task->buffer.pass_stride};
+			CUDA_LAUNCH_KERNEL(cuNLMConstructGramian, construct_gramian_args);
+		}
+
+		void *finalize_args[] = {&task->buffer.w,
+		                         &task->buffer.h,
+		                         &output_ptr,
+		                         &task->storage.rank.device_pointer,
+		                         &task->storage.XtWX.device_pointer,
+		                         &task->storage.XtWY.device_pointer,
+		                         &task->filter_area,
+		                         &task->reconstruction_state.buffer_params.x,
+		                         &task->render_buffer.samples};
+		CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
+		cuda_assert(cuCtxSynchronize());
+
+		return !have_error();
+	}
+
+	bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr,
+	                              device_ptr mean_ptr, device_ptr variance_ptr,
+	                              int r, int4 rect, DenoisingTask *task)
+	{
+		if(have_error())
+			return false;
+
+		CUDAContextScope scope(this);
+
+		CUfunction cuFilterCombineHalves;
+		cuda_assert(cuModuleGetFunction(&cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves"));
+		cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
+		CUDA_GET_BLOCKSIZE(cuFilterCombineHalves,
+		                   task->rect.z-task->rect.x,
+		                   task->rect.w-task->rect.y);
+
+		void *args[] = {&mean_ptr,
+		                &variance_ptr,
+		                &a_ptr,
+		                &b_ptr,
+		                &rect,
+		                &r};
+		CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
+		cuda_assert(cuCtxSynchronize());
+
+		return !have_error();
+	}
+
+	bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr,
+	                             device_ptr sample_variance_ptr, device_ptr sv_variance_ptr,
+	                             device_ptr buffer_variance_ptr, DenoisingTask *task)
+	{
+		if(have_error())
+			return false;
+
+		CUDAContextScope scope(this);
+
+		CUfunction cuFilterDivideShadow;
+		cuda_assert(cuModuleGetFunction(&cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow"));
+		cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
+		CUDA_GET_BLOCKSIZE(cuFilterDivideShadow,
+		                   task->rect.z-task->rect.x,
+		                   task->rect.w-task->rect.y);
+
+		void *args[] = {&task->render_buffer.samples,
+		                &task->tiles_mem.device_pointer,
+		                &a_ptr,
+		                &b_ptr,
+		                &sample_variance_ptr,
+		                &sv_variance_ptr,
+		                &buffer_variance_ptr,
+		                &task->rect,
+		                &task->render_buffer.pass_stride,
+		                &task->render_buffer.denoising_data_offset};
+		CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
+		cuda_assert(cuCtxSynchronize());
+
+		return !have_error();
+	}
+
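
All of these launches use the same driver-API convention: `cuLaunchKernel()` receives an array of pointers to the argument values, one per kernel parameter, in declaration order. A minimal sketch of the convention, with the kernel handle passed in as an assumption:

    /* Sketch of the argument-array convention used by every launch above. */
    void launch_example(CUfunction kernel /* e.g. __global__ void k(int, float*) */,
                        CUdeviceptr data  /* assumed allocated with cuMemAlloc() */)
    {
    	int n = 4096;
    	void *args[] = {&n, &data};  /* pointers to values, in parameter order */
    	cuLaunchKernel(kernel,
    	               16, 1, 1,     /* grid:  16 blocks */
    	               256, 1, 1,    /* block: 256 threads -> 16*256 = 4096 */
    	               0,            /* dynamic shared memory bytes */
    	               0,            /* stream (default) */
    	               args, 0);
    }
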
+	bool denoising_get_feature(int mean_offset,
+	                           int variance_offset,
+	                           device_ptr mean_ptr,
+	                           device_ptr variance_ptr,
+	                           DenoisingTask *task)
+	{
+		if(have_error())
+			return false;
+
+		CUDAContextScope scope(this);
+
+		CUfunction cuFilterGetFeature;
+		cuda_assert(cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature"));
+		cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
+		CUDA_GET_BLOCKSIZE(cuFilterGetFeature,
+		                   task->rect.z-task->rect.x,
+		                   task->rect.w-task->rect.y);
+
+		void *args[] = {&task->render_buffer.samples,
+		                &task->tiles_mem.device_pointer,
+		                &mean_offset,
+		                &variance_offset,
+		                &mean_ptr,
+		                &variance_ptr,
+		                &task->rect,
+		                &task->render_buffer.pass_stride,
+		                &task->render_buffer.denoising_data_offset};
+		CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
+		cuda_assert(cuCtxSynchronize());
+
+		return !have_error();
+	}
+
+	bool denoising_detect_outliers(device_ptr image_ptr,
+	                               device_ptr variance_ptr,
+	                               device_ptr depth_ptr,
+	                               device_ptr output_ptr,
+	                               DenoisingTask *task)
+	{
+		if(have_error())
+			return false;
+
+		CUDAContextScope scope(this);
+
+		CUfunction cuFilterDetectOutliers;
+		cuda_assert(cuModuleGetFunction(&cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers"));
+		cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1));
+		CUDA_GET_BLOCKSIZE(cuFilterDetectOutliers,
+		                   task->rect.z-task->rect.x,
+		                   task->rect.w-task->rect.y);
+
+		void *args[] = {&image_ptr,
+		                &variance_ptr,
+		                &depth_ptr,
+		                &output_ptr,
+		                &task->rect,
+		                &task->buffer.pass_stride};
+
+		CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args);
+		cuda_assert(cuCtxSynchronize());
+
+		return !have_error();
+	}
+
+	void denoise(RenderTile &rtile, const DeviceTask &task)
+	{
+		DenoisingTask denoising(this);
+
+		denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising);
+		denoising.functions.reconstruct = function_bind(&CUDADevice::denoising_reconstruct, this, _1, _2, _3, &denoising);
+		denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
+		denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
+		denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
+		denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+		denoising.functions.detect_outliers = function_bind(&CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
+		denoising.functions.set_tiles = function_bind(&CUDADevice::denoising_set_tiles, this, _1, &denoising);
+
+		denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
+		denoising.render_buffer.samples = rtile.sample;
+
+		RenderTile rtiles[9];
+		rtiles[4] = rtile;
+		task.map_neighbor_tiles(rtiles, this);
+		denoising.tiles_from_rendertiles(rtiles);
+
+		denoising.init_from_devicetask(task);
 
-		cuda_push_context();
+		denoising.run_denoising();
 
+		task.unmap_neighbor_tiles(rtiles, this);
+	}
+
+	void path_trace(DeviceTask& task, RenderTile& rtile)
+	{
+		if(have_error())
+			return;
+
+		CUDAContextScope scope(this);
 
 		CUfunction cuPathTrace;
 
-		CUdeviceptr d_buffer = cuda_device_ptr(rtile.buffer);
-		CUdeviceptr d_rng_state = cuda_device_ptr(rtile.rng_state);
-
-		/* get kernel function */
-		if(branched) {
+		/* Get kernel function. */
+		if(task.integrator_branched) {
 			cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
 		}
 		else {
 			cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
 		}
 
-		if(have_error())
+		if(have_error()) {
 			return;
+		}
 
-		/* pass in parameters */
-		void *args[] = {&d_buffer,
-		                &d_rng_state,
-		                &sample,
-		                &rtile.x,
-		                &rtile.y,
-		                &rtile.w,
-		                &rtile.h,
-		                &rtile.offset,
-		                &rtile.stride};
+		cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
 
-		/* launch kernel */
-		int threads_per_block;
-		cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuPathTrace));
 
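
`denoise()` adapts the member functions above to the device-independent `DenoisingTask` callback table: each `function_bind` fixes `this` and the task pointer and leaves the placeholders `_1.._N` open for per-call arguments. The same shape, sketched with `std::bind` and a hypothetical `Denoiser` type:

    #include <functional>

    /* Sketch of the callback wiring above; Denoiser stands in for CUDADevice,
     * std::bind for Cycles' function_bind. */
    struct Denoiser {
    	bool get_feature(int mean_ofs, int var_ofs, void *mean, void *var);
    };

    std::function<bool(int, int, void*, void*)> make_get_feature(Denoiser *d)
    {
    	using namespace std::placeholders;
    	/* Fix the object; forward the four per-call arguments. */
    	return std::bind(&Denoiser::get_feature, d, _1, _2, _3, _4);
    }
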
+		/* Allocate work tile. */
+		device_vector<WorkTile> work_tiles;
+		work_tiles.resize(1);
+
+		WorkTile *wtile = work_tiles.get_data();
+		wtile->x = rtile.x;
+		wtile->y = rtile.y;
+		wtile->w = rtile.w;
+		wtile->h = rtile.h;
+		wtile->offset = rtile.offset;
+		wtile->stride = rtile.stride;
+		wtile->buffer = (float*)cuda_device_ptr(rtile.buffer);
+		mem_alloc("work_tiles", work_tiles, MEM_READ_ONLY);
+
+		CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer);
+
+		/* Prepare work size. More step samples render faster, but for now we
+		 * remain conservative for GPUs connected to a display to avoid driver
+		 * timeouts and display freezing. */
+		int min_blocks, num_threads_per_block;
+		cuda_assert(cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0));
+		if(!info.display_device) {
+			min_blocks *= 8;
+		}
 
-		/*int num_registers;
-		cuda_assert(cuFuncGetAttribute(&num_registers, CU_FUNC_ATTRIBUTE_NUM_REGS, cuPathTrace));
+		uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
 
-		printf("threads_per_block %d\n", threads_per_block);
-		printf("num_registers %d\n", num_registers);*/
+		/* Render all samples. */
+		int start_sample = rtile.start_sample;
+		int end_sample = rtile.start_sample + rtile.num_samples;
 
-		int xthreads = (int)sqrt(threads_per_block);
-		int ythreads = (int)sqrt(threads_per_block);
-		int xblocks = (rtile.w + xthreads - 1)/xthreads;
-		int yblocks = (rtile.h + ythreads - 1)/ythreads;
+		for(int sample = start_sample; sample < end_sample; sample += step_samples) {
+			/* Setup and copy work tile to device. */
+			wtile->start_sample = sample;
+			wtile->num_samples = min(step_samples, end_sample - sample);
+			mem_copy_to(work_tiles);
 
-		cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
+			uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
+			uint num_blocks = divide_up(total_work_size, num_threads_per_block);
 
-		cuda_assert(cuLaunchKernel(cuPathTrace,
-		                           xblocks , yblocks, 1, /* blocks */
-		                           xthreads, ythreads, 1, /* threads */
-		                           0, 0, args, 0));
+			/* Launch kernel. */
+			void *args[] = {&d_work_tiles,
+			                &total_work_size};
 
-		cuda_assert(cuCtxSynchronize());
+			cuda_assert(cuLaunchKernel(cuPathTrace,
+			                           num_blocks, 1, 1,
			                           num_threads_per_block, 1, 1,
+			                           0, 0, args, 0));
+
+			cuda_assert(cuCtxSynchronize());
+
+			/* Update progress. */
+			rtile.sample = sample + wtile->num_samples;
+			task.update_progress(&rtile, rtile.w*rtile.h*wtile->num_samples);
+
+			if(task.get_cancel()) {
+				if(task.need_finish_queue == false)
+					break;
+			}
+		}
 
-		cuda_pop_context();
+		mem_free(work_tiles);
 	}
 
 	void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
@@ -885,7 +1365,7 @@ public:
 		if(have_error())
 			return;
 
-		cuda_push_context();
+		CUDAContextScope scope(this);
 
 		CUfunction cuFilmConvert;
 		CUdeviceptr d_rgba = map_pixels((rgba_byte)? rgba_byte: rgba_half);
@@ -930,8 +1410,6 @@ public:
 		                           0, 0, args, 0));
 
 		unmap_pixels((rgba_byte)? rgba_byte: rgba_half);
-
-		cuda_pop_context();
 	}
 
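
The sample loop above sizes each launch from occupancy: `cuOccupancyMaxPotentialBlockSize()` reports how many blocks and threads keep the GPU busy, and `step_samples` is chosen so one launch roughly fills that. A worked example with hypothetical occupancy numbers:

    /* Step-samples sizing as above, for a 256x256 tile on a non-display
     * device; the occupancy numbers are hypothetical. */
    void step_samples_example(void)
    {
    	int min_blocks = 40, num_threads_per_block = 256;  /* from occupancy API */
    	bool display_device = false;
    	if(!display_device)
    		min_blocks *= 8;                               /* 320 blocks */

    	unsigned w = 256, h = 256;
    	unsigned step_samples = (min_blocks*num_threads_per_block + w*h - 1)/(w*h);
    	/* (320*256 + 65535)/65536 = 2 samples per launch */
    	(void)step_samples;
    }
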
 	void shader(DeviceTask& task)
@@ -939,19 +1417,21 @@ public:
 	{
 		if(have_error())
 			return;
 
-		cuda_push_context();
+		CUDAContextScope scope(this);
 
 		CUfunction cuShader;
 		CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
 		CUdeviceptr d_output = cuda_device_ptr(task.shader_output);
-		CUdeviceptr d_output_luma = cuda_device_ptr(task.shader_output_luma);
 
 		/* get kernel function */
 		if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
 			cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_bake"));
 		}
+		else if(task.shader_eval_type == SHADER_EVAL_DISPLACE) {
+			cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace"));
+		}
 		else {
-			cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_shader"));
+			cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background"));
 		}
 
 		/* do tasks in smaller chunks, so we can cancel it */
@@ -970,9 +1450,6 @@ public:
 			int arg = 0;
 			args[arg++] = &d_input;
 			args[arg++] = &d_output;
-			if(task.shader_eval_type < SHADER_EVAL_BAKE) {
-				args[arg++] = &d_output_luma;
-			}
 			args[arg++] = &task.shader_eval_type;
 			if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
 				args[arg++] = &task.shader_filter;
@@ -1004,8 +1481,6 @@ public:
 
 			task.update_progress(NULL);
 		}
-
-		cuda_pop_context();
 	}
 
 	CUdeviceptr map_pixels(device_ptr mem)
@@ -1041,7 +1516,7 @@ public:
 			pmem.w = mem.data_width;
 			pmem.h = mem.data_height;
 
-			cuda_push_context();
+			CUDAContextScope scope(this);
 
 			glGenBuffers(1, &pmem.cuPBO);
 			glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
@@ -1065,8 +1540,6 @@ public:
 			CUresult result = cuGraphicsGLRegisterBuffer(&pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
 
 			if(result == CUDA_SUCCESS) {
-				cuda_pop_context();
-
 				mem.device_pointer = pmem.cuTexId;
 				pixel_mem_map[mem.device_pointer] = pmem;
 
@@ -1080,8 +1553,6 @@ public:
 				glDeleteBuffers(1, &pmem.cuPBO);
 				glDeleteTextures(1, &pmem.cuTexId);
 
-				cuda_pop_context();
-
 				background = true;
 			}
 		}
@@ -1094,7 +1565,7 @@ public:
 		if(!background) {
 			PixelMem pmem = pixel_mem_map[mem.device_pointer];
 
-			cuda_push_context();
+			CUDAContextScope scope(this);
 
 			glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
 			uchar *pixels = (uchar*)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
@@ -1103,8 +1574,6 @@ public:
 			glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
 			glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
 
-			cuda_pop_context();
-
 			return;
 		}
 
@@ -1117,14 +1586,12 @@ public:
 		if(!background) {
 			PixelMem pmem = pixel_mem_map[mem.device_pointer];
 
-			cuda_push_context();
+			CUDAContextScope scope(this);
 
 			cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource));
 			glDeleteBuffers(1, &pmem.cuPBO);
 			glDeleteTextures(1, &pmem.cuTexId);
 
-			cuda_pop_context();
-
 			pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
 			mem.device_pointer = 0;
 
@@ -1145,7 +1612,7 @@ public:
 			PixelMem pmem = pixel_mem_map[mem.device_pointer];
 			float *vpointer;
 
-			cuda_push_context();
+			CUDAContextScope scope(this);
 
 			/* for multi devices, this assumes the inefficient method that we allocate
 			 * all pixels on the device even though we only render to a subset */
@@ -1234,8 +1701,6 @@ public:
 			glBindTexture(GL_TEXTURE_2D, 0);
 			glDisable(GL_TEXTURE_2D);
 
-			cuda_pop_context();
-
 			return;
 		}
 
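
The pixel-buffer code above follows the standard CUDA/OpenGL interop pattern: create a GL pixel buffer, register it once with `cuGraphicsGLRegisterBuffer()`, then map it per frame to get a `CUdeviceptr`. A condensed sketch of the per-frame map/unmap, assuming the resource was registered at allocation time:

    /* Per-frame map/unmap of a registered GL pixel buffer; 'resource' is
     * assumed to come from cuGraphicsGLRegisterBuffer(). */
    CUdeviceptr map_pbo(CUgraphicsResource resource)
    {
    	CUdeviceptr ptr = 0;
    	size_t bytes = 0;
    	cuGraphicsMapResources(1, &resource, 0);
    	cuGraphicsResourceGetMappedPointer(&ptr, &bytes, resource);
    	return ptr;  /* valid until cuGraphicsUnmapResources() */
    }

    void unmap_pbo(CUgraphicsResource resource)
    {
    	cuGraphicsUnmapResources(1, &resource, 0);
    }
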
@@ -1244,44 +1709,54 @@ public:
 
 	void thread_run(DeviceTask *task)
 	{
-		if(task->type == DeviceTask::PATH_TRACE) {
+		CUDAContextScope scope(this);
+
+		if(task->type == DeviceTask::RENDER) {
 			RenderTile tile;
 
-			bool branched = task->integrator_branched;
+			DeviceRequestedFeatures requested_features;
+			if(use_split_kernel()) {
+				if(!use_adaptive_compilation()) {
+					requested_features.max_closure = 64;
+				}
 
-			/* Upload Bindless Mapping */
-			load_bindless_mapping();
+				if(split_kernel == NULL) {
+					split_kernel = new CUDASplitKernel(this);
+					split_kernel->load_kernels(requested_features);
+				}
+			}
 
 			/* keep rendering tiles until done */
 			while(task->acquire_tile(this, tile)) {
-				int start_sample = tile.start_sample;
-				int end_sample = tile.start_sample + tile.num_samples;
-
-				for(int sample = start_sample; sample < end_sample; sample++) {
-					if(task->get_cancel()) {
-						if(task->need_finish_queue == false)
-							break;
+				if(tile.task == RenderTile::PATH_TRACE) {
+					if(use_split_kernel()) {
+						device_memory void_buffer;
+						split_kernel->path_trace(task, tile, void_buffer, void_buffer);
 					}
+					else {
+						path_trace(*task, tile);
+					}
+				}
+				else if(tile.task == RenderTile::DENOISE) {
+					tile.sample = tile.start_sample + tile.num_samples;
 
-					path_trace(tile, sample, branched);
-
-					tile.sample = sample + 1;
+					denoise(tile, *task);
 
-					task->update_progress(&tile);
+					task->update_progress(&tile, tile.w*tile.h);
 				}
 
 				task->release_tile(tile);
+
+				if(task->get_cancel()) {
+					if(task->need_finish_queue == false)
+						break;
+				}
 			}
 		}
 		else if(task->type == DeviceTask::SHADER) {
-			/* Upload Bindless Mapping */
-			load_bindless_mapping();
-
 			shader(*task);
 
-			cuda_push_context();
 			cuda_assert(cuCtxSynchronize());
-			cuda_pop_context();
 		}
 	}
 
@@ -1301,13 +1776,15 @@ public:
 
 	void task_add(DeviceTask& task)
 	{
+		CUDAContextScope scope(this);
+
+		/* Load texture info. */
+		load_texture_info();
+
 		if(task.type == DeviceTask::FILM_CONVERT) {
 			/* must be done in main thread due to opengl access */
 			film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
-
-			cuda_push_context();
 			cuda_assert(cuCtxSynchronize());
-			cuda_pop_context();
 		}
 		else {
 			task_pool.push(new CUDADeviceTask(this, task));
@@ -1323,8 +1800,236 @@ public:
 	{
 		task_pool.cancel();
 	}
+
+	friend class CUDASplitKernelFunction;
+	friend class CUDASplitKernel;
+	friend class CUDAContextScope;
 };
 
+/* redefine the cuda_assert macro so it can be used outside of the CUDADevice class
+ * now that the definition of that class is complete
+ */
+#undef cuda_assert
+#define cuda_assert(stmt) \
+	{ \
+		CUresult result = stmt; \
+		\
+		if(result != CUDA_SUCCESS) { \
+			string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
+			if(device->error_msg == "") \
+				device->error_msg = message; \
+			fprintf(stderr, "%s\n", message.c_str()); \
+			/*cuda_abort();*/ \
+			device->cuda_error_documentation(); \
+		} \
+	} (void)0
+
+
+/* CUDA context scope. */
+
+CUDAContextScope::CUDAContextScope(CUDADevice *device)
+: device(device)
+{
+	cuda_assert(cuCtxPushCurrent(device->cuContext));
+}
+
+CUDAContextScope::~CUDAContextScope()
+{
+	cuda_assert(cuCtxPopCurrent(NULL));
+}
+
+/* split kernel */
+
+class CUDASplitKernelFunction : public SplitKernelFunction {
+	CUDADevice* device;
+	CUfunction func;
+public:
+	CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func) {}
+
+	/* enqueue the kernel, returns false if there is an error */
+	bool enqueue(const KernelDimensions &dim, device_memory &/*kg*/, device_memory &/*data*/)
+	{
+		return enqueue(dim, NULL);
+	}
+
+	/* enqueue the kernel, returns false if there is an error */
+	bool enqueue(const KernelDimensions &dim, void *args[])
+	{
+		if(device->have_error())
+			return false;
+
+		CUDAContextScope scope(device);
+
+		/* we ignore dim.local_size for now, as this is faster */
+		int threads_per_block;
+		cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
+
+		int xblocks = (dim.global_size[0]*dim.global_size[1] + threads_per_block - 1)/threads_per_block;
+
+		cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
+
+		cuda_assert(cuLaunchKernel(func,
+		                           xblocks, 1, 1, /* blocks */
+		                           threads_per_block, 1, 1, /* threads */
+		                           0, 0, args, 0));
+
+		return !device->have_error();
+	}
+};
+
+CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device)
+{
+}
+
+uint64_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory& /*data*/, size_t num_threads)
+{
+	CUDAContextScope scope(device);
+
+	device_vector<uint64_t> size_buffer;
+	size_buffer.resize(1);
+	device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE);
+
+	uint threads = num_threads;
+	CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer);
+
+	struct args_t {
+		uint* num_threads;
+		CUdeviceptr* size;
+	};
+
+	args_t args = {
+		&threads,
+		&d_size
+	};
+
+	CUfunction state_buffer_size;
+	cuda_assert(cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size"));
+
+	cuda_assert(cuLaunchKernel(state_buffer_size,
+	                           1, 1, 1,
+	                           1, 1, 1,
+	                           0, 0, (void**)&args, 0));
+
+	device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint64_t));
+	device->mem_free(size_buffer);
+
+	return *size_buffer.get_data();
+}
+
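
`CUDASplitKernelFunction::enqueue()` above collapses the logical 2-D global size into a 1-D grid, since the split kernels index work items linearly; the rounding guarantees every work item is covered:

    /* Grid sizing used by enqueue() above, with hypothetical numbers. */
    void flatten_example(void)
    {
    	int global_size[2] = {1024, 512};  /* 524288 work items */
    	int threads_per_block = 256;
    	int xblocks = (global_size[0]*global_size[1] + threads_per_block - 1)
    	              /threads_per_block;  /* 2048 blocks in x, 1 in y and z */
    	(void)xblocks;
    }
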
+bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
+                                                     RenderTile& rtile,
+                                                     int num_global_elements,
+                                                     device_memory& /*kernel_globals*/,
+                                                     device_memory& /*kernel_data*/,
+                                                     device_memory& split_data,
+                                                     device_memory& ray_state,
+                                                     device_memory& queue_index,
+                                                     device_memory& use_queues_flag,
+                                                     device_memory& work_pool_wgs)
+{
+	CUDAContextScope scope(device);
+
+	CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer);
+	CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer);
+	CUdeviceptr d_queue_index = device->cuda_device_ptr(queue_index.device_pointer);
+	CUdeviceptr d_use_queues_flag = device->cuda_device_ptr(use_queues_flag.device_pointer);
+	CUdeviceptr d_work_pool_wgs = device->cuda_device_ptr(work_pool_wgs.device_pointer);
+
+	CUdeviceptr d_buffer = device->cuda_device_ptr(rtile.buffer);
+
+	int end_sample = rtile.start_sample + rtile.num_samples;
+	int queue_size = dim.global_size[0] * dim.global_size[1];
+
+	struct args_t {
+		CUdeviceptr* split_data_buffer;
+		int* num_elements;
+		CUdeviceptr* ray_state;
+		int* start_sample;
+		int* end_sample;
+		int* sx;
+		int* sy;
+		int* sw;
+		int* sh;
+		int* offset;
+		int* stride;
+		CUdeviceptr* queue_index;
+		int* queuesize;
+		CUdeviceptr* use_queues_flag;
+		CUdeviceptr* work_pool_wgs;
+		int* num_samples;
+		CUdeviceptr* buffer;
+	};
+
+	args_t args = {
+		&d_split_data,
+		&num_global_elements,
+		&d_ray_state,
+		&rtile.start_sample,
+		&end_sample,
+		&rtile.x,
+		&rtile.y,
+		&rtile.w,
+		&rtile.h,
+		&rtile.offset,
+		&rtile.stride,
+		&d_queue_index,
+		&queue_size,
+		&d_use_queues_flag,
+		&d_work_pool_wgs,
+		&rtile.num_samples,
+		&d_buffer
+	};
+
+	CUfunction data_init;
+	cuda_assert(cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init"));
+	if(device->have_error()) {
+		return false;
+	}
+
+	CUDASplitKernelFunction(device, data_init).enqueue(dim, (void**)&args);
+
+	return !device->have_error();
+}
+
+SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(const string& kernel_name,
+                                                                const DeviceRequestedFeatures&)
+{
+	CUDAContextScope scope(device);
+	CUfunction func;
+
+	cuda_assert(cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data()));
+	if(device->have_error()) {
+		device->cuda_error_message(string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data()));
+		return NULL;
+	}
+
+	return new CUDASplitKernelFunction(device, func);
+}
+
+int2 CUDASplitKernel::split_kernel_local_size()
+{
+	return make_int2(32, 1);
+}
+
+int2 CUDASplitKernel::split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask * /*task*/)
+{
+	CUDAContextScope scope(device);
+	size_t free;
+	size_t total;
+
+	cuda_assert(cuMemGetInfo(&free, &total));
+
+	VLOG(1) << "Maximum device allocation size: "
+	        << string_human_readable_number(free) << " bytes. ("
+	        << string_human_readable_size(free) << ").";
+
+	size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2);
+	size_t side = round_down((int)sqrt(num_elements), 32);
+	int2 global_size = make_int2(side, round_down(num_elements / side, 16));
+	VLOG(1) << "Global size: " << global_size << ".";
+	return global_size;
+}
+
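
`split_kernel_global_size()` budgets half of the reported free device memory for path state and shapes the result into a roughly square 2-D range, rounded to warp-friendly multiples. The shaping step, sketched with a hypothetical element count:

    #include <math.h>

    /* Shaping step of split_kernel_global_size() above; numbers hypothetical. */
    void shape_global_size(size_t num_elements /* e.g. 1100000 */)
    {
    	size_t side = ((size_t)sqrt((double)num_elements)/32)*32;  /* 1048 -> 1024 */
    	size_t other = ((num_elements/side)/16)*16;                /* 1074 -> 1072 */
    	/* global size 1024 x 1072 = 1097728 elements, within the budget */
    	(void)other;
    }
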
 bool device_cuda_init(void)
 {
 #ifdef WITH_CUDA_DYNLOAD
@@ -1371,18 +2076,34 @@ Device *device_cuda_create(DeviceInfo& info, Stats &stats, bool background)
 	return new CUDADevice(info, stats, background);
 }
 
-void device_cuda_info(vector<DeviceInfo>& devices)
+static CUresult device_cuda_safe_init()
 {
-	CUresult result;
-	int count = 0;
+#ifdef _WIN32
+	__try {
+		return cuInit(0);
+	}
+	__except(EXCEPTION_EXECUTE_HANDLER) {
+		/* Ignore crashes inside the CUDA driver and hope we can
+		 * survive even with corrupted CUDA installs. */
+		fprintf(stderr, "Cycles CUDA: driver crashed, continuing without CUDA.\n");
+	}
+
+	return CUDA_ERROR_NO_DEVICE;
+#else
+	return cuInit(0);
+#endif
+}
 
-	result = cuInit(0);
+void device_cuda_info(vector<DeviceInfo>& devices)
+{
+	CUresult result = device_cuda_safe_init();
 
 	if(result != CUDA_SUCCESS) {
 		if(result != CUDA_ERROR_NO_DEVICE)
 			fprintf(stderr, "CUDA cuInit: %s\n", cuewErrorString(result));
 		return;
 	}
 
+	int count = 0;
 	result = cuDeviceGetCount(&count);
 	if(result != CUDA_SUCCESS) {
 		fprintf(stderr, "CUDA cuDeviceGetCount: %s\n", cuewErrorString(result));
@@ -1393,7 +2114,6 @@ void device_cuda_info(vector<DeviceInfo>& devices)
 
 	for(int num = 0; num < count; num++) {
 		char name[256];
-		int attr;
 
 		if(cuDeviceGetName(name, 256, num) != CUDA_SUCCESS)
 			continue;
@@ -1412,7 +2132,8 @@ void device_cuda_info(vector<DeviceInfo>& devices)
 		info.advanced_shading = (major >= 2);
 		info.has_bindless_textures = (major >= 3);
-		info.pack_images = false;
+		info.has_volume_decoupled = false;
+		info.has_qbvh = false;
 
 		int pci_location[3] = {0, 0, 0};
 		cuDeviceGetAttribute(&pci_location[0], CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, num);
@@ -1424,14 +2145,21 @@ void device_cuda_info(vector<DeviceInfo>& devices)
 		                              (unsigned int)pci_location[1],
 		                              (unsigned int)pci_location[2]);
 
-		/* if device has a kernel timeout, assume it is used for display */
-		if(cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, num) == CUDA_SUCCESS && attr == 1) {
+		/* If device has a kernel timeout and no compute preemption, we assume
+		 * it is connected to a display and will freeze the display while doing
+		 * computations. */
+		int timeout_attr = 0, preempt_attr = 0;
+		cuDeviceGetAttribute(&timeout_attr, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, num);
+		cuDeviceGetAttribute(&preempt_attr, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, num);
+
+		if(timeout_attr && !preempt_attr) {
 			info.description += " (Display)";
 			info.display_device = true;
 			display_devices.push_back(info);
 		}
-		else
+		else {
 			devices.push_back(info);
+		}
 	}
 
 	if(!display_devices.empty())
@@ -1440,7 +2168,7 @@ void device_cuda_info(vector<DeviceInfo>& devices)
 
 string device_cuda_capabilities(void)
 {
-	CUresult result = cuInit(0);
+	CUresult result = device_cuda_safe_init();
 	if(result != CUDA_SUCCESS) {
 		if(result != CUDA_ERROR_NO_DEVICE) {
 			return string("Error initializing CUDA: ") + cuewErrorString(result);
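
The display-device heuristic above combines two attributes: a watchdog timeout means long kernels can be killed by the OS, while compute preemption (Pascal and newer hardware) lets such kernels coexist with a display. Reduced to its core:

    /* Core of the heuristic above: treat a GPU as driving a display only if
     * kernels can time out AND the hardware cannot preempt compute work. */
    bool is_display_device(CUdevice dev)
    {
    	int timeout = 0, preempt = 0;
    	cuDeviceGetAttribute(&timeout, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev);
    	cuDeviceGetAttribute(&preempt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, dev);
    	return timeout && !preempt;
    }
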