author     Campbell Barton <ideasman42@gmail.com>  2019-04-17 07:17:24 +0300
committer  Campbell Barton <ideasman42@gmail.com>  2019-04-17 07:21:24 +0300
commit     e12c08e8d170b7ca40f204a5b0423c23a9fbc2c1
tree       8cf3453d12edb177a218ef8009357518ec6cab6a  /intern/cycles/device/device_cuda.cpp
parent     b3dabc200a4b0399ec6b81f2ff2730d07b44fcaa
ClangFormat: apply to source, most of intern
Apply clang format as proposed in T53211.
For details on usage and instructions for migrating branches
without conflicts, see:
https://wiki.blender.org/wiki/Tools/ClangFormat
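
As a rough sketch of the kind of commands involved (the wiki page above documents the supported procedure; the file path and branch name here are only illustrative), sources can be reformatted in-place, or a branch can be reformatted only on the lines it touches:

    # Reformat a file in-place using the project's .clang-format settings.
    clang-format -i intern/cycles/device/device_cuda.cpp

    # Reformat only the lines your branch changed relative to master,
    # to keep a rebase across the formatting commit conflict-free.
    git clang-format master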
Diffstat (limited to 'intern/cycles/device/device_cuda.cpp')
-rw-r--r--  intern/cycles/device/device_cuda.cpp  5137
1 file changed, 2620 insertions, 2517 deletions
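
Since the change is formatting-only, much of the diff below collapses when whitespace differences are ignored (re-wrapped lines still show). For example, one way to review it:

    # Show this commit ignoring whitespace-only changes.
    git show -w e12c08e8d170b7ca40f204a5b0423c23a9fbc2c1 -- intern/cycles/device/device_cuda.cpp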
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 3aa6bce155e..68bc3bd4045 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -62,2144 +62,2242 @@ namespace { const char *cuewErrorString(CUresult result) { - /* We can only give error code here without major code duplication, that - * should be enough since dynamic loading is only being disabled by folks - * who knows what they're doing anyway. - * - * NOTE: Avoid call from several threads. - */ - static string error; - error = string_printf("%d", result); - return error.c_str(); + /* We can only give error code here without major code duplication, that + * should be enough since dynamic loading is only being disabled by folks + * who knows what they're doing anyway. + * + * NOTE: Avoid call from several threads. + */ + static string error; + error = string_printf("%d", result); + return error.c_str(); } const char *cuewCompilerPath() { - return CYCLES_CUDA_NVCC_EXECUTABLE; + return CYCLES_CUDA_NVCC_EXECUTABLE; } int cuewCompilerVersion() { - return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10); + return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10); } -} /* namespace */ -#endif /* WITH_CUDA_DYNLOAD */ +} /* namespace */ +#endif /* WITH_CUDA_DYNLOAD */ class CUDADevice; class CUDASplitKernel : public DeviceSplitKernel { - CUDADevice *device; -public: - explicit CUDASplitKernel(CUDADevice *device); - - virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads); - - virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, - RenderTile& rtile, - int num_global_elements, - device_memory& kernel_globals, - device_memory& kernel_data_, - device_memory& split_data, - device_memory& ray_state, - device_memory& queue_index, - device_memory& use_queues_flag, - device_memory& work_pool_wgs); - - virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name, - const DeviceRequestedFeatures&); - virtual int2 split_kernel_local_size(); - virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task); + CUDADevice *device; + + public: + explicit CUDASplitKernel(CUDADevice *device); + + virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads); + + virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim, + RenderTile &rtile, + int num_global_elements, + device_memory &kernel_globals, + device_memory &kernel_data_, + device_memory &split_data, + device_memory &ray_state, + device_memory &queue_index, + device_memory &use_queues_flag, + device_memory &work_pool_wgs); + + virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name, + const DeviceRequestedFeatures &); + virtual int2 split_kernel_local_size(); + virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask *task); }; /* Utility to push/pop CUDA context. 
*/ class CUDAContextScope { -public: - CUDAContextScope(CUDADevice *device); - ~CUDAContextScope(); + public: + CUDAContextScope(CUDADevice *device); + ~CUDAContextScope(); -private: - CUDADevice *device; + private: + CUDADevice *device; }; -class CUDADevice : public Device -{ -public: - DedicatedTaskPool task_pool; - CUdevice cuDevice; - CUcontext cuContext; - CUmodule cuModule, cuFilterModule; - size_t device_texture_headroom; - size_t device_working_headroom; - bool move_texture_to_host; - size_t map_host_used; - size_t map_host_limit; - int can_map_host; - int cuDevId; - int cuDevArchitecture; - bool first_error; - CUDASplitKernel *split_kernel; - - struct CUDAMem { - CUDAMem() - : texobject(0), array(0), map_host_pointer(0), free_map_host(false) {} - - CUtexObject texobject; - CUarray array; - void *map_host_pointer; - bool free_map_host; - }; - typedef map<device_memory*, CUDAMem> CUDAMemMap; - CUDAMemMap cuda_mem_map; - - struct PixelMem { - GLuint cuPBO; - CUgraphicsResource cuPBOresource; - GLuint cuTexId; - int w, h; - }; - map<device_ptr, PixelMem> pixel_mem_map; - - /* Bindless Textures */ - device_vector<TextureInfo> texture_info; - bool need_texture_info; - - CUdeviceptr cuda_device_ptr(device_ptr mem) - { - return (CUdeviceptr)mem; - } - - static bool have_precompiled_kernels() - { - string cubins_path = path_get("lib"); - return path_exists(cubins_path); - } - - virtual bool show_samples() const - { - /* The CUDADevice only processes one tile at a time, so showing samples is fine. */ - return true; - } - - virtual BVHLayoutMask get_bvh_layout_mask() const { - return BVH_LAYOUT_BVH2; - } - -/*#ifdef NDEBUG +class CUDADevice : public Device { + public: + DedicatedTaskPool task_pool; + CUdevice cuDevice; + CUcontext cuContext; + CUmodule cuModule, cuFilterModule; + size_t device_texture_headroom; + size_t device_working_headroom; + bool move_texture_to_host; + size_t map_host_used; + size_t map_host_limit; + int can_map_host; + int cuDevId; + int cuDevArchitecture; + bool first_error; + CUDASplitKernel *split_kernel; + + struct CUDAMem { + CUDAMem() : texobject(0), array(0), map_host_pointer(0), free_map_host(false) + { + } + + CUtexObject texobject; + CUarray array; + void *map_host_pointer; + bool free_map_host; + }; + typedef map<device_memory *, CUDAMem> CUDAMemMap; + CUDAMemMap cuda_mem_map; + + struct PixelMem { + GLuint cuPBO; + CUgraphicsResource cuPBOresource; + GLuint cuTexId; + int w, h; + }; + map<device_ptr, PixelMem> pixel_mem_map; + + /* Bindless Textures */ + device_vector<TextureInfo> texture_info; + bool need_texture_info; + + CUdeviceptr cuda_device_ptr(device_ptr mem) + { + return (CUdeviceptr)mem; + } + + static bool have_precompiled_kernels() + { + string cubins_path = path_get("lib"); + return path_exists(cubins_path); + } + + virtual bool show_samples() const + { + /* The CUDADevice only processes one tile at a time, so showing samples is fine. 
*/ + return true; + } + + virtual BVHLayoutMask get_bvh_layout_mask() const + { + return BVH_LAYOUT_BVH2; + } + + /*#ifdef NDEBUG #define cuda_abort() #else #define cuda_abort() abort() #endif*/ - void cuda_error_documentation() - { - if(first_error) { - fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n"); - fprintf(stderr, "https://docs.blender.org/manual/en/dev/render/cycles/gpu_rendering.html\n\n"); - first_error = false; - } - } + void cuda_error_documentation() + { + if (first_error) { + fprintf(stderr, + "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n"); + fprintf(stderr, + "https://docs.blender.org/manual/en/dev/render/cycles/gpu_rendering.html\n\n"); + first_error = false; + } + } #define cuda_assert(stmt) \ - { \ - CUresult result = stmt; \ - \ - if(result != CUDA_SUCCESS) { \ - string message = string_printf("CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \ - if(error_msg == "") \ - error_msg = message; \ - fprintf(stderr, "%s\n", message.c_str()); \ - /*cuda_abort();*/ \ - cuda_error_documentation(); \ - } \ - } (void) 0 - - bool cuda_error_(CUresult result, const string& stmt) - { - if(result == CUDA_SUCCESS) - return false; - - string message = string_printf("CUDA error at %s: %s", stmt.c_str(), cuewErrorString(result)); - if(error_msg == "") - error_msg = message; - fprintf(stderr, "%s\n", message.c_str()); - cuda_error_documentation(); - return true; - } + { \ + CUresult result = stmt; \ +\ + if (result != CUDA_SUCCESS) { \ + string message = string_printf( \ + "CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \ + if (error_msg == "") \ + error_msg = message; \ + fprintf(stderr, "%s\n", message.c_str()); \ + /*cuda_abort();*/ \ + cuda_error_documentation(); \ + } \ + } \ + (void)0 + + bool cuda_error_(CUresult result, const string &stmt) + { + if (result == CUDA_SUCCESS) + return false; + + string message = string_printf("CUDA error at %s: %s", stmt.c_str(), cuewErrorString(result)); + if (error_msg == "") + error_msg = message; + fprintf(stderr, "%s\n", message.c_str()); + cuda_error_documentation(); + return true; + } #define cuda_error(stmt) cuda_error_(stmt, #stmt) - void cuda_error_message(const string& message) - { - if(error_msg == "") - error_msg = message; - fprintf(stderr, "%s\n", message.c_str()); - cuda_error_documentation(); - } - - CUDADevice(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background_) - : Device(info, stats, profiler, background_), - texture_info(this, "__texture_info", MEM_TEXTURE) - { - first_error = true; - background = background_; - - cuDevId = info.num; - cuDevice = 0; - cuContext = 0; - - cuModule = 0; - cuFilterModule = 0; - - split_kernel = NULL; - - need_texture_info = false; - - device_texture_headroom = 0; - device_working_headroom = 0; - move_texture_to_host = false; - map_host_limit = 0; - map_host_used = 0; - can_map_host = 0; - - /* Intialize CUDA. */ - if(cuda_error(cuInit(0))) - return; - - /* Setup device and context. */ - if(cuda_error(cuDeviceGet(&cuDevice, cuDevId))) - return; - - /* CU_CTX_MAP_HOST for mapping host memory when out of device memory. - * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render, - * so we can predict which memory to map to host. 
*/ - cuda_assert(cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice)); - - unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX; - if(can_map_host) { - ctx_flags |= CU_CTX_MAP_HOST; - init_host_memory(); - } - - /* Create context. */ - CUresult result; - - if(background) { - result = cuCtxCreate(&cuContext, ctx_flags, cuDevice); - } - else { - result = cuGLCtxCreate(&cuContext, ctx_flags, cuDevice); - - if(result != CUDA_SUCCESS) { - result = cuCtxCreate(&cuContext, ctx_flags, cuDevice); - background = true; - } - } - - if(cuda_error_(result, "cuCtxCreate")) - return; - - int major, minor; - cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); - cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); - cuDevArchitecture = major*100 + minor*10; - - /* Pop context set by cuCtxCreate. */ - cuCtxPopCurrent(NULL); - } - - ~CUDADevice() - { - task_pool.stop(); - - delete split_kernel; - - texture_info.free(); - - cuda_assert(cuCtxDestroy(cuContext)); - } - - bool support_device(const DeviceRequestedFeatures& /*requested_features*/) - { - int major, minor; - cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); - cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); - - /* We only support sm_30 and above */ - if(major < 3) { - cuda_error_message(string_printf("CUDA device supported only with compute capability 3.0 or up, found %d.%d.", major, minor)); - return false; - } - - return true; - } - - bool use_adaptive_compilation() - { - return DebugFlags().cuda.adaptive_compile; - } - - bool use_split_kernel() - { - return DebugFlags().cuda.split_kernel; - } - - /* Common NVCC flags which stays the same regardless of shading model, - * kernel sources md5 and only depends on compiler or compilation settings. - */ - string compile_kernel_get_common_cflags( - const DeviceRequestedFeatures& requested_features, - bool filter=false, bool split=false) - { - const int machine = system_cpu_bits(); - const string source_path = path_get("source"); - const string include_path = source_path; - string cflags = string_printf("-m%d " - "--ptxas-options=\"-v\" " - "--use_fast_math " - "-DNVCC " - "-I\"%s\"", - machine, - include_path.c_str()); - if(!filter && use_adaptive_compilation()) { - cflags += " " + requested_features.get_build_options(); - } - const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS"); - if(extra_cflags) { - cflags += string(" ") + string(extra_cflags); - } + void cuda_error_message(const string &message) + { + if (error_msg == "") + error_msg = message; + fprintf(stderr, "%s\n", message.c_str()); + cuda_error_documentation(); + } + + CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_) + : Device(info, stats, profiler, background_), + texture_info(this, "__texture_info", MEM_TEXTURE) + { + first_error = true; + background = background_; + + cuDevId = info.num; + cuDevice = 0; + cuContext = 0; + + cuModule = 0; + cuFilterModule = 0; + + split_kernel = NULL; + + need_texture_info = false; + + device_texture_headroom = 0; + device_working_headroom = 0; + move_texture_to_host = false; + map_host_limit = 0; + map_host_used = 0; + can_map_host = 0; + + /* Intialize CUDA. */ + if (cuda_error(cuInit(0))) + return; + + /* Setup device and context. */ + if (cuda_error(cuDeviceGet(&cuDevice, cuDevId))) + return; + + /* CU_CTX_MAP_HOST for mapping host memory when out of device memory. 
+ * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render, + * so we can predict which memory to map to host. */ + cuda_assert( + cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice)); + + unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX; + if (can_map_host) { + ctx_flags |= CU_CTX_MAP_HOST; + init_host_memory(); + } + + /* Create context. */ + CUresult result; + + if (background) { + result = cuCtxCreate(&cuContext, ctx_flags, cuDevice); + } + else { + result = cuGLCtxCreate(&cuContext, ctx_flags, cuDevice); + + if (result != CUDA_SUCCESS) { + result = cuCtxCreate(&cuContext, ctx_flags, cuDevice); + background = true; + } + } + + if (cuda_error_(result, "cuCtxCreate")) + return; + + int major, minor; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); + cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); + cuDevArchitecture = major * 100 + minor * 10; + + /* Pop context set by cuCtxCreate. */ + cuCtxPopCurrent(NULL); + } + + ~CUDADevice() + { + task_pool.stop(); + + delete split_kernel; + + texture_info.free(); + + cuda_assert(cuCtxDestroy(cuContext)); + } + + bool support_device(const DeviceRequestedFeatures & /*requested_features*/) + { + int major, minor; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); + cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); + + /* We only support sm_30 and above */ + if (major < 3) { + cuda_error_message(string_printf( + "CUDA device supported only with compute capability 3.0 or up, found %d.%d.", + major, + minor)); + return false; + } + + return true; + } + + bool use_adaptive_compilation() + { + return DebugFlags().cuda.adaptive_compile; + } + + bool use_split_kernel() + { + return DebugFlags().cuda.split_kernel; + } + + /* Common NVCC flags which stays the same regardless of shading model, + * kernel sources md5 and only depends on compiler or compilation settings. + */ + string compile_kernel_get_common_cflags(const DeviceRequestedFeatures &requested_features, + bool filter = false, + bool split = false) + { + const int machine = system_cpu_bits(); + const string source_path = path_get("source"); + const string include_path = source_path; + string cflags = string_printf( + "-m%d " + "--ptxas-options=\"-v\" " + "--use_fast_math " + "-DNVCC " + "-I\"%s\"", + machine, + include_path.c_str()); + if (!filter && use_adaptive_compilation()) { + cflags += " " + requested_features.get_build_options(); + } + const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS"); + if (extra_cflags) { + cflags += string(" ") + string(extra_cflags); + } #ifdef WITH_CYCLES_DEBUG - cflags += " -D__KERNEL_DEBUG__"; + cflags += " -D__KERNEL_DEBUG__"; #endif - if(split) { - cflags += " -D__SPLIT__"; - } - - return cflags; - } - - bool compile_check_compiler() { - const char *nvcc = cuewCompilerPath(); - if(nvcc == NULL) { - cuda_error_message("CUDA nvcc compiler not found. 
" - "Install CUDA toolkit in default location."); - return false; - } - const int cuda_version = cuewCompilerVersion(); - VLOG(1) << "Found nvcc " << nvcc - << ", CUDA version " << cuda_version - << "."; - const int major = cuda_version / 10, minor = cuda_version % 10; - if(cuda_version == 0) { - cuda_error_message("CUDA nvcc compiler version could not be parsed."); - return false; - } - if(cuda_version < 80) { - printf("Unsupported CUDA version %d.%d detected, " - "you need CUDA 8.0 or newer.\n", - major, minor); - return false; - } - else if(cuda_version != 101) { - printf("CUDA version %d.%d detected, build may succeed but only " - "CUDA 10.1 is officially supported.\n", - major, minor); - } - return true; - } - - string compile_kernel(const DeviceRequestedFeatures& requested_features, - bool filter=false, bool split=false) - { - const char *name, *source; - if(filter) { - name = "filter"; - source = "filter.cu"; - } - else if(split) { - name = "kernel_split"; - source = "kernel_split.cu"; - } - else { - name = "kernel"; - source = "kernel.cu"; - } - /* Compute cubin name. */ - int major, minor; - cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); - cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); - - /* Attempt to use kernel provided with Blender. */ - if(!use_adaptive_compilation()) { - const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", - name, major, minor)); - VLOG(1) << "Testing for pre-compiled kernel " << cubin << "."; - if(path_exists(cubin)) { - VLOG(1) << "Using precompiled kernel."; - return cubin; - } - } - - const string common_cflags = - compile_kernel_get_common_cflags(requested_features, filter, split); - - /* Try to use locally compiled kernel. */ - const string source_path = path_get("source"); - const string kernel_md5 = path_files_md5_hash(source_path); - - /* We include cflags into md5 so changing cuda toolkit or changing other - * compiler command line arguments makes sure cubin gets re-built. - */ - const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags); - - const string cubin_file = string_printf("cycles_%s_sm%d%d_%s.cubin", - name, major, minor, - cubin_md5.c_str()); - const string cubin = path_cache_get(path_join("kernels", cubin_file)); - VLOG(1) << "Testing for locally compiled kernel " << cubin << "."; - if(path_exists(cubin)) { - VLOG(1) << "Using locally compiled kernel."; - return cubin; - } + if (split) { + cflags += " -D__SPLIT__"; + } + + return cflags; + } + + bool compile_check_compiler() + { + const char *nvcc = cuewCompilerPath(); + if (nvcc == NULL) { + cuda_error_message( + "CUDA nvcc compiler not found. 
" + "Install CUDA toolkit in default location."); + return false; + } + const int cuda_version = cuewCompilerVersion(); + VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << cuda_version << "."; + const int major = cuda_version / 10, minor = cuda_version % 10; + if (cuda_version == 0) { + cuda_error_message("CUDA nvcc compiler version could not be parsed."); + return false; + } + if (cuda_version < 80) { + printf( + "Unsupported CUDA version %d.%d detected, " + "you need CUDA 8.0 or newer.\n", + major, + minor); + return false; + } + else if (cuda_version != 101) { + printf( + "CUDA version %d.%d detected, build may succeed but only " + "CUDA 10.1 is officially supported.\n", + major, + minor); + } + return true; + } + + string compile_kernel(const DeviceRequestedFeatures &requested_features, + bool filter = false, + bool split = false) + { + const char *name, *source; + if (filter) { + name = "filter"; + source = "filter.cu"; + } + else if (split) { + name = "kernel_split"; + source = "kernel_split.cu"; + } + else { + name = "kernel"; + source = "kernel.cu"; + } + /* Compute cubin name. */ + int major, minor; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); + cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); + + /* Attempt to use kernel provided with Blender. */ + if (!use_adaptive_compilation()) { + const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor)); + VLOG(1) << "Testing for pre-compiled kernel " << cubin << "."; + if (path_exists(cubin)) { + VLOG(1) << "Using precompiled kernel."; + return cubin; + } + } + + const string common_cflags = compile_kernel_get_common_cflags( + requested_features, filter, split); + + /* Try to use locally compiled kernel. */ + const string source_path = path_get("source"); + const string kernel_md5 = path_files_md5_hash(source_path); + + /* We include cflags into md5 so changing cuda toolkit or changing other + * compiler command line arguments makes sure cubin gets re-built. + */ + const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags); + + const string cubin_file = string_printf( + "cycles_%s_sm%d%d_%s.cubin", name, major, minor, cubin_md5.c_str()); + const string cubin = path_cache_get(path_join("kernels", cubin_file)); + VLOG(1) << "Testing for locally compiled kernel " << cubin << "."; + if (path_exists(cubin)) { + VLOG(1) << "Using locally compiled kernel."; + return cubin; + } #ifdef _WIN32 - if(have_precompiled_kernels()) { - if(major < 3) { - cuda_error_message(string_printf( - "CUDA device requires compute capability 3.0 or up, " - "found %d.%d. Your GPU is not supported.", - major, minor)); - } - else { - cuda_error_message(string_printf( - "CUDA binary kernel for this graphics card compute " - "capability (%d.%d) not found.", - major, minor)); - } - return ""; - } + if (have_precompiled_kernels()) { + if (major < 3) { + cuda_error_message( + string_printf("CUDA device requires compute capability 3.0 or up, " + "found %d.%d. Your GPU is not supported.", + major, + minor)); + } + else { + cuda_error_message( + string_printf("CUDA binary kernel for this graphics card compute " + "capability (%d.%d) not found.", + major, + minor)); + } + return ""; + } #endif - /* Compile. 
*/ - if(!compile_check_compiler()) { - return ""; - } - const char *nvcc = cuewCompilerPath(); - const string kernel = path_join( - path_join(source_path, "kernel"), - path_join("kernels", - path_join("cuda", source))); - double starttime = time_dt(); - printf("Compiling CUDA kernel ...\n"); - - path_create_directories(cubin); - - string command = string_printf("\"%s\" " - "-arch=sm_%d%d " - "--cubin \"%s\" " - "-o \"%s\" " - "%s ", - nvcc, - major, minor, - kernel.c_str(), - cubin.c_str(), - common_cflags.c_str()); - - printf("%s\n", command.c_str()); - - if(system(command.c_str()) == -1) { - cuda_error_message("Failed to execute compilation command, " - "see console for details."); - return ""; - } - - /* Verify if compilation succeeded */ - if(!path_exists(cubin)) { - cuda_error_message("CUDA kernel compilation failed, " - "see console for details."); - return ""; - } - - printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime); - - return cubin; - } - - bool load_kernels(const DeviceRequestedFeatures& requested_features) - { - /* TODO(sergey): Support kernels re-load for CUDA devices. - * - * Currently re-loading kernel will invalidate memory pointers, - * causing problems in cuCtxSynchronize. - */ - if(cuFilterModule && cuModule) { - VLOG(1) << "Skipping kernel reload, not currently supported."; - return true; - } - - /* check if cuda init succeeded */ - if(cuContext == 0) - return false; - - /* check if GPU is supported */ - if(!support_device(requested_features)) - return false; - - /* get kernel */ - string cubin = compile_kernel(requested_features, false, use_split_kernel()); - if(cubin == "") - return false; - - string filter_cubin = compile_kernel(requested_features, true, false); - if(filter_cubin == "") - return false; - - /* open module */ - CUDAContextScope scope(this); - - string cubin_data; - CUresult result; - - if(path_read_text(cubin, cubin_data)) - result = cuModuleLoadData(&cuModule, cubin_data.c_str()); - else - result = CUDA_ERROR_FILE_NOT_FOUND; - - if(cuda_error_(result, "cuModuleLoad")) - cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str())); - - if(path_read_text(filter_cubin, cubin_data)) - result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str()); - else - result = CUDA_ERROR_FILE_NOT_FOUND; - - if(cuda_error_(result, "cuModuleLoad")) - cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str())); - - if(result == CUDA_SUCCESS) { - reserve_local_memory(requested_features); - } - - return (result == CUDA_SUCCESS); - } - - void reserve_local_memory(const DeviceRequestedFeatures& requested_features) - { - if(use_split_kernel()) { - /* Split kernel mostly uses global memory and adaptive compilation, - * difficult to predict how much is needed currently. */ - return; - } - - /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory - * needed for kernel launches, so that we can reliably figure out when - * to allocate scene data in mapped host memory. */ - CUDAContextScope scope(this); - - size_t total = 0, free_before = 0, free_after = 0; - cuMemGetInfo(&free_before, &total); - - /* Get kernel function. 
*/ - CUfunction cuPathTrace; - - if(requested_features.use_integrator_branched) { - cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace")); - } - else { - cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace")); - } - - cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1)); - - int min_blocks, num_threads_per_block; - cuda_assert(cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0)); - - /* Launch kernel, using just 1 block appears sufficient to reserve - * memory for all multiprocessors. It would be good to do this in - * parallel for the multi GPU case still to make it faster. */ - CUdeviceptr d_work_tiles = 0; - uint total_work_size = 0; - - void *args[] = {&d_work_tiles, - &total_work_size}; - - cuda_assert(cuLaunchKernel(cuPathTrace, - 1, 1, 1, - num_threads_per_block, 1, 1, - 0, 0, args, 0)); - - cuda_assert(cuCtxSynchronize()); - - cuMemGetInfo(&free_after, &total); - VLOG(1) << "Local memory reserved " - << string_human_readable_number(free_before - free_after) << " bytes. (" - << string_human_readable_size(free_before - free_after) << ")"; + /* Compile. */ + if (!compile_check_compiler()) { + return ""; + } + const char *nvcc = cuewCompilerPath(); + const string kernel = path_join(path_join(source_path, "kernel"), + path_join("kernels", path_join("cuda", source))); + double starttime = time_dt(); + printf("Compiling CUDA kernel ...\n"); + + path_create_directories(cubin); + + string command = string_printf( + "\"%s\" " + "-arch=sm_%d%d " + "--cubin \"%s\" " + "-o \"%s\" " + "%s ", + nvcc, + major, + minor, + kernel.c_str(), + cubin.c_str(), + common_cflags.c_str()); + + printf("%s\n", command.c_str()); + + if (system(command.c_str()) == -1) { + cuda_error_message( + "Failed to execute compilation command, " + "see console for details."); + return ""; + } + + /* Verify if compilation succeeded */ + if (!path_exists(cubin)) { + cuda_error_message( + "CUDA kernel compilation failed, " + "see console for details."); + return ""; + } + + printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime); + + return cubin; + } + + bool load_kernels(const DeviceRequestedFeatures &requested_features) + { + /* TODO(sergey): Support kernels re-load for CUDA devices. + * + * Currently re-loading kernel will invalidate memory pointers, + * causing problems in cuCtxSynchronize. 
+ */ + if (cuFilterModule && cuModule) { + VLOG(1) << "Skipping kernel reload, not currently supported."; + return true; + } + + /* check if cuda init succeeded */ + if (cuContext == 0) + return false; + + /* check if GPU is supported */ + if (!support_device(requested_features)) + return false; + + /* get kernel */ + string cubin = compile_kernel(requested_features, false, use_split_kernel()); + if (cubin == "") + return false; + + string filter_cubin = compile_kernel(requested_features, true, false); + if (filter_cubin == "") + return false; + + /* open module */ + CUDAContextScope scope(this); + + string cubin_data; + CUresult result; + + if (path_read_text(cubin, cubin_data)) + result = cuModuleLoadData(&cuModule, cubin_data.c_str()); + else + result = CUDA_ERROR_FILE_NOT_FOUND; + + if (cuda_error_(result, "cuModuleLoad")) + cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str())); + + if (path_read_text(filter_cubin, cubin_data)) + result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str()); + else + result = CUDA_ERROR_FILE_NOT_FOUND; + + if (cuda_error_(result, "cuModuleLoad")) + cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str())); + + if (result == CUDA_SUCCESS) { + reserve_local_memory(requested_features); + } + + return (result == CUDA_SUCCESS); + } + + void reserve_local_memory(const DeviceRequestedFeatures &requested_features) + { + if (use_split_kernel()) { + /* Split kernel mostly uses global memory and adaptive compilation, + * difficult to predict how much is needed currently. */ + return; + } + + /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory + * needed for kernel launches, so that we can reliably figure out when + * to allocate scene data in mapped host memory. */ + CUDAContextScope scope(this); + + size_t total = 0, free_before = 0, free_after = 0; + cuMemGetInfo(&free_before, &total); + + /* Get kernel function. */ + CUfunction cuPathTrace; + + if (requested_features.use_integrator_branched) { + cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace")); + } + else { + cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace")); + } + + cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1)); + + int min_blocks, num_threads_per_block; + cuda_assert(cuOccupancyMaxPotentialBlockSize( + &min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0)); + + /* Launch kernel, using just 1 block appears sufficient to reserve + * memory for all multiprocessors. It would be good to do this in + * parallel for the multi GPU case still to make it faster. */ + CUdeviceptr d_work_tiles = 0; + uint total_work_size = 0; + + void *args[] = {&d_work_tiles, &total_work_size}; + + cuda_assert(cuLaunchKernel(cuPathTrace, 1, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); + + cuda_assert(cuCtxSynchronize()); + + cuMemGetInfo(&free_after, &total); + VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after) + << " bytes. (" << string_human_readable_size(free_before - free_after) << ")"; #if 0 - /* For testing mapped host memory, fill up device memory. */ - const size_t keep_mb = 1024; - - while(free_after > keep_mb * 1024 * 1024LL) { - CUdeviceptr tmp; - cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL)); - cuMemGetInfo(&free_after, &total); - } + /* For testing mapped host memory, fill up device memory. 
*/ + const size_t keep_mb = 1024; + + while(free_after > keep_mb * 1024 * 1024LL) { + CUdeviceptr tmp; + cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL)); + cuMemGetInfo(&free_after, &total); + } #endif - } - - void init_host_memory() - { - /* Limit amount of host mapped memory, because allocating too much can - * cause system instability. Leave at least half or 4 GB of system - * memory free, whichever is smaller. */ - size_t default_limit = 4 * 1024 * 1024 * 1024LL; - size_t system_ram = system_physical_ram(); - - if(system_ram > 0) { - if(system_ram / 2 > default_limit) { - map_host_limit = system_ram - default_limit; - } - else { - map_host_limit = system_ram / 2; - } - } - else { - VLOG(1) << "Mapped host memory disabled, failed to get system RAM"; - map_host_limit = 0; - } - - /* Amount of device memory to keep is free after texture memory - * and working memory allocations respectively. We set the working - * memory limit headroom lower so that some space is left after all - * texture memory allocations. */ - device_working_headroom = 32 * 1024 * 1024LL; // 32MB - device_texture_headroom = 128 * 1024 * 1024LL; // 128MB - - VLOG(1) << "Mapped host memory limit set to " - << string_human_readable_number(map_host_limit) << " bytes. (" - << string_human_readable_size(map_host_limit) << ")"; - } - - void load_texture_info() - { - if(need_texture_info) { - texture_info.copy_to_device(); - need_texture_info = false; - } - } - - void move_textures_to_host(size_t size, bool for_texture) - { - /* Signal to reallocate textures in host memory only. */ - move_texture_to_host = true; - - while(size > 0) { - /* Find suitable memory allocation to move. */ - device_memory *max_mem = NULL; - size_t max_size = 0; - bool max_is_image = false; - - foreach(CUDAMemMap::value_type& pair, cuda_mem_map) { - device_memory& mem = *pair.first; - CUDAMem *cmem = &pair.second; - - bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info); - bool is_image = is_texture && (mem.data_height > 1); - - /* Can't move this type of memory. */ - if(!is_texture || cmem->array) { - continue; - } - - /* Already in host memory. */ - if(cmem->map_host_pointer) { - continue; - } - - /* For other textures, only move image textures. */ - if(for_texture && !is_image) { - continue; - } - - /* Try to move largest allocation, prefer moving images. */ - if(is_image > max_is_image || - (is_image == max_is_image && mem.device_size > max_size)) { - max_is_image = is_image; - max_size = mem.device_size; - max_mem = &mem; - } - } - - /* Move to host memory. This part is mutex protected since - * multiple CUDA devices could be moving the memory. The - * first one will do it, and the rest will adopt the pointer. */ - if(max_mem) { - VLOG(1) << "Move memory from device to host: " << max_mem->name; - - static thread_mutex move_mutex; - thread_scoped_lock lock(move_mutex); - - /* Preserve the original device pointer, in case of multi device - * we can't change it because the pointer mapping would break. */ - device_ptr prev_pointer = max_mem->device_pointer; - size_t prev_size = max_mem->device_size; - - tex_free(*max_mem); - tex_alloc(*max_mem); - size = (max_size >= size)? 0: size - max_size; - - max_mem->device_pointer = prev_pointer; - max_mem->device_size = prev_size; - } - else { - break; - } - } - - /* Update texture info array with new pointers. 
*/ - load_texture_info(); - - move_texture_to_host = false; - } - - CUDAMem *generic_alloc(device_memory& mem, size_t pitch_padding = 0) - { - CUDAContextScope scope(this); - - CUdeviceptr device_pointer = 0; - size_t size = mem.memory_size() + pitch_padding; - - CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY; - const char *status = ""; - - /* First try allocating in device memory, respecting headroom. We make - * an exception for texture info. It is small and frequently accessed, - * so treat it as working memory. - * - * If there is not enough room for working memory, we will try to move - * textures to host memory, assuming the performance impact would have - * been worse for working memory. */ - bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info); - bool is_image = is_texture && (mem.data_height > 1); - - size_t headroom = (is_texture)? device_texture_headroom: - device_working_headroom; - - size_t total = 0, free = 0; - cuMemGetInfo(&free, &total); - - /* Move textures to host memory if needed. */ - if(!move_texture_to_host && !is_image && (size + headroom) >= free) { - move_textures_to_host(size + headroom - free, is_texture); - cuMemGetInfo(&free, &total); - } - - /* Allocate in device memory. */ - if(!move_texture_to_host && (size + headroom) < free) { - mem_alloc_result = cuMemAlloc(&device_pointer, size); - if(mem_alloc_result == CUDA_SUCCESS) { - status = " in device memory"; - } - } - - /* Fall back to mapped host memory if needed and possible. */ - void *map_host_pointer = 0; - bool free_map_host = false; - - if(mem_alloc_result != CUDA_SUCCESS && can_map_host && - map_host_used + size < map_host_limit) { - if(mem.shared_pointer) { - /* Another device already allocated host memory. */ - mem_alloc_result = CUDA_SUCCESS; - map_host_pointer = mem.shared_pointer; - } - else { - /* Allocate host memory ourselves. */ - mem_alloc_result = cuMemHostAlloc(&map_host_pointer, size, - CU_MEMHOSTALLOC_DEVICEMAP | - CU_MEMHOSTALLOC_WRITECOMBINED); - mem.shared_pointer = map_host_pointer; - free_map_host = true; - } - - if(mem_alloc_result == CUDA_SUCCESS) { - cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, mem.shared_pointer, 0)); - map_host_used += size; - status = " in host memory"; - - /* Replace host pointer with our host allocation. Only works if - * CUDA memory layout is the same and has no pitch padding. Also - * does not work if we move textures to host during a render, - * since other devices might be using the memory. */ - if(!move_texture_to_host && pitch_padding == 0 && - mem.host_pointer && mem.host_pointer != mem.shared_pointer) { - memcpy(mem.shared_pointer, mem.host_pointer, size); - mem.host_free(); - mem.host_pointer = mem.shared_pointer; - } - } - else { - status = " failed, out of host memory"; - } - } - else if(mem_alloc_result != CUDA_SUCCESS) { - status = " failed, out of device and host memory"; - } - - if(mem_alloc_result != CUDA_SUCCESS) { - cuda_assert(mem_alloc_result); - } - - if(mem.name) { - VLOG(1) << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")" - << status; - } - - mem.device_pointer = (device_ptr)device_pointer; - mem.device_size = size; - stats.mem_alloc(size); - - if(!mem.device_pointer) { - return NULL; - } - - /* Insert into map of allocations. 
*/ - CUDAMem *cmem = &cuda_mem_map[&mem]; - cmem->map_host_pointer = map_host_pointer; - cmem->free_map_host = free_map_host; - return cmem; - } - - void generic_copy_to(device_memory& mem) - { - if(mem.host_pointer && mem.device_pointer) { - CUDAContextScope scope(this); - - if(mem.host_pointer != mem.shared_pointer) { - cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), - mem.host_pointer, - mem.memory_size())); - } - } - } - - void generic_free(device_memory& mem) - { - if(mem.device_pointer) { - CUDAContextScope scope(this); - const CUDAMem& cmem = cuda_mem_map[&mem]; - - if(cmem.map_host_pointer) { - /* Free host memory. */ - if(cmem.free_map_host) { - cuMemFreeHost(cmem.map_host_pointer); - if(mem.host_pointer == mem.shared_pointer) { - mem.host_pointer = 0; - } - mem.shared_pointer = 0; - } - - map_host_used -= mem.device_size; - } - else { - /* Free device memory. */ - cuMemFree(mem.device_pointer); - } - - stats.mem_free(mem.device_size); - mem.device_pointer = 0; - mem.device_size = 0; - - cuda_mem_map.erase(cuda_mem_map.find(&mem)); - } - } - - void mem_alloc(device_memory& mem) - { - if(mem.type == MEM_PIXELS && !background) { - pixels_alloc(mem); - } - else if(mem.type == MEM_TEXTURE) { - assert(!"mem_alloc not supported for textures."); - } - else { - generic_alloc(mem); - } - } - - void mem_copy_to(device_memory& mem) - { - if(mem.type == MEM_PIXELS) { - assert(!"mem_copy_to not supported for pixels."); - } - else if(mem.type == MEM_TEXTURE) { - tex_free(mem); - tex_alloc(mem); - } - else { - if(!mem.device_pointer) { - generic_alloc(mem); - } - - generic_copy_to(mem); - } - } - - void mem_copy_from(device_memory& mem, int y, int w, int h, int elem) - { - if(mem.type == MEM_PIXELS && !background) { - pixels_copy_from(mem, y, w, h); - } - else if(mem.type == MEM_TEXTURE) { - assert(!"mem_copy_from not supported for textures."); - } - else { - CUDAContextScope scope(this); - size_t offset = elem*y*w; - size_t size = elem*w*h; - - if(mem.host_pointer && mem.device_pointer) { - cuda_assert(cuMemcpyDtoH((uchar*)mem.host_pointer + offset, - (CUdeviceptr)(mem.device_pointer + offset), size)); - } - else if(mem.host_pointer) { - memset((char*)mem.host_pointer + offset, 0, size); - } - } - } - - void mem_zero(device_memory& mem) - { - if(!mem.device_pointer) { - mem_alloc(mem); - } - - if(mem.host_pointer) { - memset(mem.host_pointer, 0, mem.memory_size()); - } - - if(mem.device_pointer && - (!mem.host_pointer || mem.host_pointer != mem.shared_pointer)) { - CUDAContextScope scope(this); - cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size())); - } - } - - void mem_free(device_memory& mem) - { - if(mem.type == MEM_PIXELS && !background) { - pixels_free(mem); - } - else if(mem.type == MEM_TEXTURE) { - tex_free(mem); - } - else { - generic_free(mem); - } - } - - virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/) - { - return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset)); - } - - void const_copy_to(const char *name, void *host, size_t size) - { - CUDAContextScope scope(this); - CUdeviceptr mem; - size_t bytes; - - cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name)); - //assert(bytes == size); - cuda_assert(cuMemcpyHtoD(mem, host, size)); - } - - void tex_alloc(device_memory& mem) - { - CUDAContextScope scope(this); - - /* General variables for both architectures */ - string bind_name = mem.name; - size_t dsize = datatype_size(mem.data_type); - size_t size = mem.memory_size(); 
- - CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP; - switch(mem.extension) { - case EXTENSION_REPEAT: - address_mode = CU_TR_ADDRESS_MODE_WRAP; - break; - case EXTENSION_EXTEND: - address_mode = CU_TR_ADDRESS_MODE_CLAMP; - break; - case EXTENSION_CLIP: - address_mode = CU_TR_ADDRESS_MODE_BORDER; - break; - default: - assert(0); - break; - } - - CUfilter_mode filter_mode; - if(mem.interpolation == INTERPOLATION_CLOSEST) { - filter_mode = CU_TR_FILTER_MODE_POINT; - } - else { - filter_mode = CU_TR_FILTER_MODE_LINEAR; - } - - /* Data Storage */ - if(mem.interpolation == INTERPOLATION_NONE) { - generic_alloc(mem); - generic_copy_to(mem); - - CUdeviceptr cumem; - size_t cubytes; - - cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str())); - - if(cubytes == 8) { - /* 64 bit device pointer */ - uint64_t ptr = mem.device_pointer; - cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes)); - } - else { - /* 32 bit device pointer */ - uint32_t ptr = (uint32_t)mem.device_pointer; - cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes)); - } - return; - } - - /* Image Texture Storage */ - CUarray_format_enum format; - switch(mem.data_type) { - case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break; - case TYPE_UINT16: format = CU_AD_FORMAT_UNSIGNED_INT16; break; - case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break; - case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break; - case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break; - case TYPE_HALF: format = CU_AD_FORMAT_HALF; break; - default: assert(0); return; - } - - CUDAMem *cmem = NULL; - CUarray array_3d = NULL; - size_t src_pitch = mem.data_width * dsize * mem.data_elements; - size_t dst_pitch = src_pitch; - - if(mem.data_depth > 1) { - /* 3D texture using array, there is no API for linear memory. */ - CUDA_ARRAY3D_DESCRIPTOR desc; - - desc.Width = mem.data_width; - desc.Height = mem.data_height; - desc.Depth = mem.data_depth; - desc.Format = format; - desc.NumChannels = mem.data_elements; - desc.Flags = 0; - - VLOG(1) << "Array 3D allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - - cuda_assert(cuArray3DCreate(&array_3d, &desc)); - - if(!array_3d) { - return; - } - - CUDA_MEMCPY3D param; - memset(¶m, 0, sizeof(param)); - param.dstMemoryType = CU_MEMORYTYPE_ARRAY; - param.dstArray = array_3d; - param.srcMemoryType = CU_MEMORYTYPE_HOST; - param.srcHost = mem.host_pointer; - param.srcPitch = src_pitch; - param.WidthInBytes = param.srcPitch; - param.Height = mem.data_height; - param.Depth = mem.data_depth; - - cuda_assert(cuMemcpy3D(¶m)); - - mem.device_pointer = (device_ptr)array_3d; - mem.device_size = size; - stats.mem_alloc(size); - - cmem = &cuda_mem_map[&mem]; - cmem->texobject = 0; - cmem->array = array_3d; - } - else if(mem.data_height > 0) { - /* 2D texture, using pitch aligned linear memory. 
*/ - int alignment = 0; - cuda_assert(cuDeviceGetAttribute(&alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice)); - dst_pitch = align_up(src_pitch, alignment); - size_t dst_size = dst_pitch * mem.data_height; - - cmem = generic_alloc(mem, dst_size - mem.memory_size()); - if(!cmem) { - return; - } - - CUDA_MEMCPY2D param; - memset(¶m, 0, sizeof(param)); - param.dstMemoryType = CU_MEMORYTYPE_DEVICE; - param.dstDevice = mem.device_pointer; - param.dstPitch = dst_pitch; - param.srcMemoryType = CU_MEMORYTYPE_HOST; - param.srcHost = mem.host_pointer; - param.srcPitch = src_pitch; - param.WidthInBytes = param.srcPitch; - param.Height = mem.data_height; - - cuda_assert(cuMemcpy2DUnaligned(¶m)); - } - else { - /* 1D texture, using linear memory. */ - cmem = generic_alloc(mem); - if(!cmem) { - return; - } - - cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size)); - } - - /* Kepler+, bindless textures. */ - int flat_slot = 0; - if(string_startswith(mem.name, "__tex_image")) { - int pos = string(mem.name).rfind("_"); - flat_slot = atoi(mem.name + pos + 1); - } - else { - assert(0); - } - - CUDA_RESOURCE_DESC resDesc; - memset(&resDesc, 0, sizeof(resDesc)); - - if(array_3d) { - resDesc.resType = CU_RESOURCE_TYPE_ARRAY; - resDesc.res.array.hArray = array_3d; - resDesc.flags = 0; - } - else if(mem.data_height > 0) { - resDesc.resType = CU_RESOURCE_TYPE_PITCH2D; - resDesc.res.pitch2D.devPtr = mem.device_pointer; - resDesc.res.pitch2D.format = format; - resDesc.res.pitch2D.numChannels = mem.data_elements; - resDesc.res.pitch2D.height = mem.data_height; - resDesc.res.pitch2D.width = mem.data_width; - resDesc.res.pitch2D.pitchInBytes = dst_pitch; - } - else { - resDesc.resType = CU_RESOURCE_TYPE_LINEAR; - resDesc.res.linear.devPtr = mem.device_pointer; - resDesc.res.linear.format = format; - resDesc.res.linear.numChannels = mem.data_elements; - resDesc.res.linear.sizeInBytes = mem.device_size; - } - - CUDA_TEXTURE_DESC texDesc; - memset(&texDesc, 0, sizeof(texDesc)); - texDesc.addressMode[0] = address_mode; - texDesc.addressMode[1] = address_mode; - texDesc.addressMode[2] = address_mode; - texDesc.filterMode = filter_mode; - texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES; - - cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL)); - - /* Resize once */ - if(flat_slot >= texture_info.size()) { - /* Allocate some slots in advance, to reduce amount - * of re-allocations. */ - texture_info.resize(flat_slot + 128); - } - - /* Set Mapping and tag that we need to (re-)upload to device */ - TextureInfo& info = texture_info[flat_slot]; - info.data = (uint64_t)cmem->texobject; - info.cl_buffer = 0; - info.interpolation = mem.interpolation; - info.extension = mem.extension; - info.width = mem.data_width; - info.height = mem.data_height; - info.depth = mem.data_depth; - need_texture_info = true; - } - - void tex_free(device_memory& mem) - { - if(mem.device_pointer) { - CUDAContextScope scope(this); - const CUDAMem& cmem = cuda_mem_map[&mem]; - - if(cmem.texobject) { - /* Free bindless texture. */ - cuTexObjectDestroy(cmem.texobject); - } - - if(cmem.array) { - /* Free array. 
*/ - cuArrayDestroy(cmem.array); - stats.mem_free(mem.device_size); - mem.device_pointer = 0; - mem.device_size = 0; - - cuda_mem_map.erase(cuda_mem_map.find(&mem)); - } - else { - generic_free(mem); - } - } - } - -#define CUDA_GET_BLOCKSIZE(func, w, h) \ - int threads_per_block; \ - cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ - int threads = (int)sqrt((float)threads_per_block); \ - int xblocks = ((w) + threads - 1)/threads; \ - int yblocks = ((h) + threads - 1)/threads; - -#define CUDA_LAUNCH_KERNEL(func, args) \ - cuda_assert(cuLaunchKernel(func, \ - xblocks, yblocks, 1, \ - threads, threads, 1, \ - 0, 0, args, 0)); + } + + void init_host_memory() + { + /* Limit amount of host mapped memory, because allocating too much can + * cause system instability. Leave at least half or 4 GB of system + * memory free, whichever is smaller. */ + size_t default_limit = 4 * 1024 * 1024 * 1024LL; + size_t system_ram = system_physical_ram(); + + if (system_ram > 0) { + if (system_ram / 2 > default_limit) { + map_host_limit = system_ram - default_limit; + } + else { + map_host_limit = system_ram / 2; + } + } + else { + VLOG(1) << "Mapped host memory disabled, failed to get system RAM"; + map_host_limit = 0; + } + + /* Amount of device memory to keep is free after texture memory + * and working memory allocations respectively. We set the working + * memory limit headroom lower so that some space is left after all + * texture memory allocations. */ + device_working_headroom = 32 * 1024 * 1024LL; // 32MB + device_texture_headroom = 128 * 1024 * 1024LL; // 128MB + + VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit) + << " bytes. (" << string_human_readable_size(map_host_limit) << ")"; + } + + void load_texture_info() + { + if (need_texture_info) { + texture_info.copy_to_device(); + need_texture_info = false; + } + } + + void move_textures_to_host(size_t size, bool for_texture) + { + /* Signal to reallocate textures in host memory only. */ + move_texture_to_host = true; + + while (size > 0) { + /* Find suitable memory allocation to move. */ + device_memory *max_mem = NULL; + size_t max_size = 0; + bool max_is_image = false; + + foreach (CUDAMemMap::value_type &pair, cuda_mem_map) { + device_memory &mem = *pair.first; + CUDAMem *cmem = &pair.second; + + bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info); + bool is_image = is_texture && (mem.data_height > 1); + + /* Can't move this type of memory. */ + if (!is_texture || cmem->array) { + continue; + } + + /* Already in host memory. */ + if (cmem->map_host_pointer) { + continue; + } + + /* For other textures, only move image textures. */ + if (for_texture && !is_image) { + continue; + } + + /* Try to move largest allocation, prefer moving images. */ + if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) { + max_is_image = is_image; + max_size = mem.device_size; + max_mem = &mem; + } + } + + /* Move to host memory. This part is mutex protected since + * multiple CUDA devices could be moving the memory. The + * first one will do it, and the rest will adopt the pointer. */ + if (max_mem) { + VLOG(1) << "Move memory from device to host: " << max_mem->name; + + static thread_mutex move_mutex; + thread_scoped_lock lock(move_mutex); + + /* Preserve the original device pointer, in case of multi device + * we can't change it because the pointer mapping would break. 
*/ + device_ptr prev_pointer = max_mem->device_pointer; + size_t prev_size = max_mem->device_size; + + tex_free(*max_mem); + tex_alloc(*max_mem); + size = (max_size >= size) ? 0 : size - max_size; + + max_mem->device_pointer = prev_pointer; + max_mem->device_size = prev_size; + } + else { + break; + } + } + + /* Update texture info array with new pointers. */ + load_texture_info(); + + move_texture_to_host = false; + } + + CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0) + { + CUDAContextScope scope(this); + + CUdeviceptr device_pointer = 0; + size_t size = mem.memory_size() + pitch_padding; + + CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY; + const char *status = ""; + + /* First try allocating in device memory, respecting headroom. We make + * an exception for texture info. It is small and frequently accessed, + * so treat it as working memory. + * + * If there is not enough room for working memory, we will try to move + * textures to host memory, assuming the performance impact would have + * been worse for working memory. */ + bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info); + bool is_image = is_texture && (mem.data_height > 1); + + size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom; + + size_t total = 0, free = 0; + cuMemGetInfo(&free, &total); + + /* Move textures to host memory if needed. */ + if (!move_texture_to_host && !is_image && (size + headroom) >= free) { + move_textures_to_host(size + headroom - free, is_texture); + cuMemGetInfo(&free, &total); + } + + /* Allocate in device memory. */ + if (!move_texture_to_host && (size + headroom) < free) { + mem_alloc_result = cuMemAlloc(&device_pointer, size); + if (mem_alloc_result == CUDA_SUCCESS) { + status = " in device memory"; + } + } + + /* Fall back to mapped host memory if needed and possible. */ + void *map_host_pointer = 0; + bool free_map_host = false; + + if (mem_alloc_result != CUDA_SUCCESS && can_map_host && + map_host_used + size < map_host_limit) { + if (mem.shared_pointer) { + /* Another device already allocated host memory. */ + mem_alloc_result = CUDA_SUCCESS; + map_host_pointer = mem.shared_pointer; + } + else { + /* Allocate host memory ourselves. */ + mem_alloc_result = cuMemHostAlloc( + &map_host_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED); + mem.shared_pointer = map_host_pointer; + free_map_host = true; + } + + if (mem_alloc_result == CUDA_SUCCESS) { + cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, mem.shared_pointer, 0)); + map_host_used += size; + status = " in host memory"; + + /* Replace host pointer with our host allocation. Only works if + * CUDA memory layout is the same and has no pitch padding. Also + * does not work if we move textures to host during a render, + * since other devices might be using the memory. */ + if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer && + mem.host_pointer != mem.shared_pointer) { + memcpy(mem.shared_pointer, mem.host_pointer, size); + mem.host_free(); + mem.host_pointer = mem.shared_pointer; + } + } + else { + status = " failed, out of host memory"; + } + } + else if (mem_alloc_result != CUDA_SUCCESS) { + status = " failed, out of device and host memory"; + } + + if (mem_alloc_result != CUDA_SUCCESS) { + cuda_assert(mem_alloc_result); + } + + if (mem.name) { + VLOG(1) << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. 
(" + << string_human_readable_size(mem.memory_size()) << ")" << status; + } + + mem.device_pointer = (device_ptr)device_pointer; + mem.device_size = size; + stats.mem_alloc(size); + + if (!mem.device_pointer) { + return NULL; + } + + /* Insert into map of allocations. */ + CUDAMem *cmem = &cuda_mem_map[&mem]; + cmem->map_host_pointer = map_host_pointer; + cmem->free_map_host = free_map_host; + return cmem; + } + + void generic_copy_to(device_memory &mem) + { + if (mem.host_pointer && mem.device_pointer) { + CUDAContextScope scope(this); + + if (mem.host_pointer != mem.shared_pointer) { + cuda_assert(cuMemcpyHtoD( + cuda_device_ptr(mem.device_pointer), mem.host_pointer, mem.memory_size())); + } + } + } + + void generic_free(device_memory &mem) + { + if (mem.device_pointer) { + CUDAContextScope scope(this); + const CUDAMem &cmem = cuda_mem_map[&mem]; + + if (cmem.map_host_pointer) { + /* Free host memory. */ + if (cmem.free_map_host) { + cuMemFreeHost(cmem.map_host_pointer); + if (mem.host_pointer == mem.shared_pointer) { + mem.host_pointer = 0; + } + mem.shared_pointer = 0; + } + + map_host_used -= mem.device_size; + } + else { + /* Free device memory. */ + cuMemFree(mem.device_pointer); + } + + stats.mem_free(mem.device_size); + mem.device_pointer = 0; + mem.device_size = 0; + + cuda_mem_map.erase(cuda_mem_map.find(&mem)); + } + } + + void mem_alloc(device_memory &mem) + { + if (mem.type == MEM_PIXELS && !background) { + pixels_alloc(mem); + } + else if (mem.type == MEM_TEXTURE) { + assert(!"mem_alloc not supported for textures."); + } + else { + generic_alloc(mem); + } + } + + void mem_copy_to(device_memory &mem) + { + if (mem.type == MEM_PIXELS) { + assert(!"mem_copy_to not supported for pixels."); + } + else if (mem.type == MEM_TEXTURE) { + tex_free(mem); + tex_alloc(mem); + } + else { + if (!mem.device_pointer) { + generic_alloc(mem); + } + + generic_copy_to(mem); + } + } + + void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) + { + if (mem.type == MEM_PIXELS && !background) { + pixels_copy_from(mem, y, w, h); + } + else if (mem.type == MEM_TEXTURE) { + assert(!"mem_copy_from not supported for textures."); + } + else { + CUDAContextScope scope(this); + size_t offset = elem * y * w; + size_t size = elem * w * h; + + if (mem.host_pointer && mem.device_pointer) { + cuda_assert(cuMemcpyDtoH( + (uchar *)mem.host_pointer + offset, (CUdeviceptr)(mem.device_pointer + offset), size)); + } + else if (mem.host_pointer) { + memset((char *)mem.host_pointer + offset, 0, size); + } + } + } + + void mem_zero(device_memory &mem) + { + if (!mem.device_pointer) { + mem_alloc(mem); + } + + if (mem.host_pointer) { + memset(mem.host_pointer, 0, mem.memory_size()); + } + + if (mem.device_pointer && (!mem.host_pointer || mem.host_pointer != mem.shared_pointer)) { + CUDAContextScope scope(this); + cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size())); + } + } + + void mem_free(device_memory &mem) + { + if (mem.type == MEM_PIXELS && !background) { + pixels_free(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free(mem); + } + else { + generic_free(mem); + } + } + + virtual device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) + { + return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset)); + } + + void const_copy_to(const char *name, void *host, size_t size) + { + CUDAContextScope scope(this); + CUdeviceptr mem; + size_t bytes; + + cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name)); + //assert(bytes == 
size); + cuda_assert(cuMemcpyHtoD(mem, host, size)); + } + + void tex_alloc(device_memory &mem) + { + CUDAContextScope scope(this); + + /* General variables for both architectures */ + string bind_name = mem.name; + size_t dsize = datatype_size(mem.data_type); + size_t size = mem.memory_size(); + + CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP; + switch (mem.extension) { + case EXTENSION_REPEAT: + address_mode = CU_TR_ADDRESS_MODE_WRAP; + break; + case EXTENSION_EXTEND: + address_mode = CU_TR_ADDRESS_MODE_CLAMP; + break; + case EXTENSION_CLIP: + address_mode = CU_TR_ADDRESS_MODE_BORDER; + break; + default: + assert(0); + break; + } + + CUfilter_mode filter_mode; + if (mem.interpolation == INTERPOLATION_CLOSEST) { + filter_mode = CU_TR_FILTER_MODE_POINT; + } + else { + filter_mode = CU_TR_FILTER_MODE_LINEAR; + } + + /* Data Storage */ + if (mem.interpolation == INTERPOLATION_NONE) { + generic_alloc(mem); + generic_copy_to(mem); + + CUdeviceptr cumem; + size_t cubytes; + + cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str())); + + if (cubytes == 8) { + /* 64 bit device pointer */ + uint64_t ptr = mem.device_pointer; + cuda_assert(cuMemcpyHtoD(cumem, (void *)&ptr, cubytes)); + } + else { + /* 32 bit device pointer */ + uint32_t ptr = (uint32_t)mem.device_pointer; + cuda_assert(cuMemcpyHtoD(cumem, (void *)&ptr, cubytes)); + } + return; + } + + /* Image Texture Storage */ + CUarray_format_enum format; + switch (mem.data_type) { + case TYPE_UCHAR: + format = CU_AD_FORMAT_UNSIGNED_INT8; + break; + case TYPE_UINT16: + format = CU_AD_FORMAT_UNSIGNED_INT16; + break; + case TYPE_UINT: + format = CU_AD_FORMAT_UNSIGNED_INT32; + break; + case TYPE_INT: + format = CU_AD_FORMAT_SIGNED_INT32; + break; + case TYPE_FLOAT: + format = CU_AD_FORMAT_FLOAT; + break; + case TYPE_HALF: + format = CU_AD_FORMAT_HALF; + break; + default: + assert(0); + return; + } + + CUDAMem *cmem = NULL; + CUarray array_3d = NULL; + size_t src_pitch = mem.data_width * dsize * mem.data_elements; + size_t dst_pitch = src_pitch; + + if (mem.data_depth > 1) { + /* 3D texture using array, there is no API for linear memory. */ + CUDA_ARRAY3D_DESCRIPTOR desc; + + desc.Width = mem.data_width; + desc.Height = mem.data_height; + desc.Depth = mem.data_depth; + desc.Format = format; + desc.NumChannels = mem.data_elements; + desc.Flags = 0; + + VLOG(1) << "Array 3D allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + + cuda_assert(cuArray3DCreate(&array_3d, &desc)); + + if (!array_3d) { + return; + } + + CUDA_MEMCPY3D param; + memset(&param, 0, sizeof(param)); + param.dstMemoryType = CU_MEMORYTYPE_ARRAY; + param.dstArray = array_3d; + param.srcMemoryType = CU_MEMORYTYPE_HOST; + param.srcHost = mem.host_pointer; + param.srcPitch = src_pitch; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + param.Depth = mem.data_depth; + + cuda_assert(cuMemcpy3D(&param)); + + mem.device_pointer = (device_ptr)array_3d; + mem.device_size = size; + stats.mem_alloc(size); + + cmem = &cuda_mem_map[&mem]; + cmem->texobject = 0; + cmem->array = array_3d; + } + else if (mem.data_height > 0) { + /* 2D texture, using pitch aligned linear memory. 
*/ + int alignment = 0; + cuda_assert( + cuDeviceGetAttribute(&alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice)); + dst_pitch = align_up(src_pitch, alignment); + size_t dst_size = dst_pitch * mem.data_height; + + cmem = generic_alloc(mem, dst_size - mem.memory_size()); + if (!cmem) { + return; + } + + CUDA_MEMCPY2D param; + memset(&param, 0, sizeof(param)); + param.dstMemoryType = CU_MEMORYTYPE_DEVICE; + param.dstDevice = mem.device_pointer; + param.dstPitch = dst_pitch; + param.srcMemoryType = CU_MEMORYTYPE_HOST; + param.srcHost = mem.host_pointer; + param.srcPitch = src_pitch; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + + cuda_assert(cuMemcpy2DUnaligned(&param)); + } + else { + /* 1D texture, using linear memory. */ + cmem = generic_alloc(mem); + if (!cmem) { + return; + } + + cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size)); + } + + /* Kepler+, bindless textures. */ + int flat_slot = 0; + if (string_startswith(mem.name, "__tex_image")) { + int pos = string(mem.name).rfind("_"); + flat_slot = atoi(mem.name + pos + 1); + } + else { + assert(0); + } + + CUDA_RESOURCE_DESC resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + + if (array_3d) { + resDesc.resType = CU_RESOURCE_TYPE_ARRAY; + resDesc.res.array.hArray = array_3d; + resDesc.flags = 0; + } + else if (mem.data_height > 0) { + resDesc.resType = CU_RESOURCE_TYPE_PITCH2D; + resDesc.res.pitch2D.devPtr = mem.device_pointer; + resDesc.res.pitch2D.format = format; + resDesc.res.pitch2D.numChannels = mem.data_elements; + resDesc.res.pitch2D.height = mem.data_height; + resDesc.res.pitch2D.width = mem.data_width; + resDesc.res.pitch2D.pitchInBytes = dst_pitch; + } + else { + resDesc.resType = CU_RESOURCE_TYPE_LINEAR; + resDesc.res.linear.devPtr = mem.device_pointer; + resDesc.res.linear.format = format; + resDesc.res.linear.numChannels = mem.data_elements; + resDesc.res.linear.sizeInBytes = mem.device_size; + } + + CUDA_TEXTURE_DESC texDesc; + memset(&texDesc, 0, sizeof(texDesc)); + texDesc.addressMode[0] = address_mode; + texDesc.addressMode[1] = address_mode; + texDesc.addressMode[2] = address_mode; + texDesc.filterMode = filter_mode; + texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES; + + cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL)); + + /* Resize once */ + if (flat_slot >= texture_info.size()) { + /* Allocate some slots in advance, to reduce amount + * of re-allocations. */ + texture_info.resize(flat_slot + 128); + } + + /* Set Mapping and tag that we need to (re-)upload to device */ + TextureInfo &info = texture_info[flat_slot]; + info.data = (uint64_t)cmem->texobject; + info.cl_buffer = 0; + info.interpolation = mem.interpolation; + info.extension = mem.extension; + info.width = mem.data_width; + info.height = mem.data_height; + info.depth = mem.data_depth; + need_texture_info = true; + } + + void tex_free(device_memory &mem) + { + if (mem.device_pointer) { + CUDAContextScope scope(this); + const CUDAMem &cmem = cuda_mem_map[&mem]; + + if (cmem.texobject) { + /* Free bindless texture. */ + cuTexObjectDestroy(cmem.texobject); + } + + if (cmem.array) { + /* Free array. 
*/ + cuArrayDestroy(cmem.array); + stats.mem_free(mem.device_size); + mem.device_pointer = 0; + mem.device_size = 0; + + cuda_mem_map.erase(cuda_mem_map.find(&mem)); + } + else { + generic_free(mem); + } + } + } + +#define CUDA_GET_BLOCKSIZE(func, w, h) \ + int threads_per_block; \ + cuda_assert( \ + cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ + int threads = (int)sqrt((float)threads_per_block); \ + int xblocks = ((w) + threads - 1) / threads; \ + int yblocks = ((h) + threads - 1) / threads; + +#define CUDA_LAUNCH_KERNEL(func, args) \ + cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0)); /* Similar as above, but for 1-dimensional blocks. */ -#define CUDA_GET_BLOCKSIZE_1D(func, w, h) \ - int threads_per_block; \ - cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ - int xblocks = ((w) + threads_per_block - 1)/threads_per_block; \ - int yblocks = h; - -#define CUDA_LAUNCH_KERNEL_1D(func, args) \ - cuda_assert(cuLaunchKernel(func, \ - xblocks, yblocks, 1, \ - threads_per_block, 1, 1, \ - 0, 0, args, 0)); - - bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr, - DenoisingTask *task) - { - if(have_error()) - return false; - - CUDAContextScope scope(this); - - int stride = task->buffer.stride; - int w = task->buffer.width; - int h = task->buffer.h; - int r = task->nlm_state.r; - int f = task->nlm_state.f; - float a = task->nlm_state.a; - float k_2 = task->nlm_state.k_2; - - int pass_stride = task->buffer.pass_stride; - int num_shifts = (2*r+1)*(2*r+1); - int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0; - int frame_offset = 0; - - if(have_error()) - return false; - - CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer); - CUdeviceptr blurDifference = difference + sizeof(float)*pass_stride*num_shifts; - CUdeviceptr weightAccum = difference + 2*sizeof(float)*pass_stride*num_shifts; - CUdeviceptr scale_ptr = 0; - - cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*pass_stride)); - cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*pass_stride)); - - { - CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput; - cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); - cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); - cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); - cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output")); - - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1)); - - CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w*h, num_shifts); - - void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &scale_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &channel_offset, &frame_offset, &a, &k_2}; - void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; - void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; - void *update_output_args[] = {&blurDifference, 
&image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &pass_stride, &channel_offset, &r, &f}; - - CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args); - } - - { - CUfunction cuNLMNormalize; - cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize")); - cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1)); - void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride}; - CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h); - CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args); - cuda_assert(cuCtxSynchronize()); - } - - return !have_error(); - } - - bool denoising_construct_transform(DenoisingTask *task) - { - if(have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterConstructTransform; - cuda_assert(cuModuleGetFunction(&cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform")); - cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED)); - CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, - task->storage.w, - task->storage.h); - - void *args[] = {&task->buffer.mem.device_pointer, - &task->tile_info_mem.device_pointer, - &task->storage.transform.device_pointer, - &task->storage.rank.device_pointer, - &task->filter_area, - &task->rect, - &task->radius, - &task->pca_threshold, - &task->buffer.pass_stride, - &task->buffer.frame_stride, - &task->buffer.use_time}; - CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); - } - - bool denoising_accumulate(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr scale_ptr, - int frame, - DenoisingTask *task) - { - if(have_error()) - return false; - - CUDAContextScope scope(this); - - int r = task->radius; - int f = 4; - float a = 1.0f; - float k_2 = task->nlm_k_2; - - int w = task->reconstruction_state.source_w; - int h = task->reconstruction_state.source_h; - int stride = task->buffer.stride; - int frame_offset = frame * task->buffer.frame_stride; - int t = task->tile_info->frames[frame]; - - int pass_stride = task->buffer.pass_stride; - int num_shifts = (2*r+1)*(2*r+1); - - if(have_error()) - return false; - - CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer); - CUdeviceptr blurDifference = difference + sizeof(float)*pass_stride*num_shifts; - - CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian; - cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); - cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); - cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); - cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian")); - - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED)); - - CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, - task->reconstruction_state.source_w * 
task->reconstruction_state.source_h, - num_shifts); - - void *calc_difference_args[] = {&color_ptr, - &color_variance_ptr, - &scale_ptr, - &difference, - &w, &h, - &stride, &pass_stride, - &r, &pass_stride, - &frame_offset, - &a, &k_2}; - void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; - void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; - void *construct_gramian_args[] = {&t, - &blurDifference, - &task->buffer.mem.device_pointer, - &task->storage.transform.device_pointer, - &task->storage.rank.device_pointer, - &task->storage.XtWX.device_pointer, - &task->storage.XtWY.device_pointer, - &task->reconstruction_state.filter_window, - &w, &h, &stride, - &pass_stride, &r, - &f, - &frame_offset, - &task->buffer.use_time}; - - CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); - } - - bool denoising_solve(device_ptr output_ptr, - DenoisingTask *task) - { - CUfunction cuFinalize; - cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize")); - cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1)); - void *finalize_args[] = {&output_ptr, - &task->storage.rank.device_pointer, - &task->storage.XtWX.device_pointer, - &task->storage.XtWY.device_pointer, - &task->filter_area, - &task->reconstruction_state.buffer_params.x, - &task->render_buffer.samples}; - CUDA_GET_BLOCKSIZE(cuFinalize, - task->reconstruction_state.source_w, - task->reconstruction_state.source_h); - CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); - } - - bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr, - device_ptr mean_ptr, device_ptr variance_ptr, - int r, int4 rect, DenoisingTask *task) - { - if(have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterCombineHalves; - cuda_assert(cuModuleGetFunction(&cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves")); - cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE(cuFilterCombineHalves, - task->rect.z-task->rect.x, - task->rect.w-task->rect.y); - - void *args[] = {&mean_ptr, - &variance_ptr, - &a_ptr, - &b_ptr, - &rect, - &r}; - CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); - } - - bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr, - device_ptr sample_variance_ptr, device_ptr sv_variance_ptr, - device_ptr buffer_variance_ptr, DenoisingTask *task) - { - if(have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterDivideShadow; - cuda_assert(cuModuleGetFunction(&cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow")); - cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE(cuFilterDivideShadow, - task->rect.z-task->rect.x, - task->rect.w-task->rect.y); - - void *args[] = {&task->render_buffer.samples, - &task->tile_info_mem.device_pointer, - &a_ptr, - &b_ptr, - &sample_variance_ptr, - &sv_variance_ptr, - &buffer_variance_ptr, - &task->rect, - &task->render_buffer.pass_stride, - &task->render_buffer.offset}; - 
CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); - } - - bool denoising_get_feature(int mean_offset, - int variance_offset, - device_ptr mean_ptr, - device_ptr variance_ptr, - float scale, - DenoisingTask *task) - { - if(have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterGetFeature; - cuda_assert(cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature")); - cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE(cuFilterGetFeature, - task->rect.z-task->rect.x, - task->rect.w-task->rect.y); - - void *args[] = {&task->render_buffer.samples, - &task->tile_info_mem.device_pointer, - &mean_offset, - &variance_offset, - &mean_ptr, - &variance_ptr, - &scale, - &task->rect, - &task->render_buffer.pass_stride, - &task->render_buffer.offset}; - CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); - } - - bool denoising_write_feature(int out_offset, - device_ptr from_ptr, - device_ptr buffer_ptr, - DenoisingTask *task) - { - if(have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterWriteFeature; - cuda_assert(cuModuleGetFunction(&cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature")); - cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, - task->filter_area.z, - task->filter_area.w); - - void *args[] = {&task->render_buffer.samples, - &task->reconstruction_state.buffer_params, - &task->filter_area, - &from_ptr, - &buffer_ptr, - &out_offset, - &task->rect}; - CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); - } - - bool denoising_detect_outliers(device_ptr image_ptr, - device_ptr variance_ptr, - device_ptr depth_ptr, - device_ptr output_ptr, - DenoisingTask *task) - { - if(have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterDetectOutliers; - cuda_assert(cuModuleGetFunction(&cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers")); - cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE(cuFilterDetectOutliers, - task->rect.z-task->rect.x, - task->rect.w-task->rect.y); - - void *args[] = {&image_ptr, - &variance_ptr, - &depth_ptr, - &output_ptr, - &task->rect, - &task->buffer.pass_stride}; - - CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); - } - - void denoise(RenderTile &rtile, DenoisingTask& denoising) - { - denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising); - denoising.functions.accumulate = function_bind(&CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); - denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising); - denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); - denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); - denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, 
_5, &denoising); - denoising.functions.write_feature = function_bind(&CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising); - denoising.functions.detect_outliers = function_bind(&CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); - - denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); - denoising.render_buffer.samples = rtile.sample; - denoising.buffer.gpu_temporary_mem = true; - - denoising.run_denoising(&rtile); - } - - void path_trace(DeviceTask& task, RenderTile& rtile, device_vector<WorkTile>& work_tiles) - { - scoped_timer timer(&rtile.buffers->render_time); - - if(have_error()) - return; - - CUDAContextScope scope(this); - CUfunction cuPathTrace; - - /* Get kernel function. */ - if(task.integrator_branched) { - cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace")); - } - else { - cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace")); - } - - if(have_error()) { - return; - } - - cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1)); - - /* Allocate work tile. */ - work_tiles.alloc(1); - - WorkTile *wtile = work_tiles.data(); - wtile->x = rtile.x; - wtile->y = rtile.y; - wtile->w = rtile.w; - wtile->h = rtile.h; - wtile->offset = rtile.offset; - wtile->stride = rtile.stride; - wtile->buffer = (float*)cuda_device_ptr(rtile.buffer); - - /* Prepare work size. More step samples render faster, but for now we - * remain conservative for GPUs connected to a display to avoid driver - * timeouts and display freezing. */ - int min_blocks, num_threads_per_block; - cuda_assert(cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0)); - if(!info.display_device) { - min_blocks *= 8; - } - - uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h); - - /* Render all samples. */ - int start_sample = rtile.start_sample; - int end_sample = rtile.start_sample + rtile.num_samples; - - for(int sample = start_sample; sample < end_sample; sample += step_samples) { - /* Setup and copy work tile to device. */ - wtile->start_sample = sample; - wtile->num_samples = min(step_samples, end_sample - sample); - work_tiles.copy_to_device(); - - CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer); - uint total_work_size = wtile->w * wtile->h * wtile->num_samples; - uint num_blocks = divide_up(total_work_size, num_threads_per_block); - - /* Launch kernel. */ - void *args[] = {&d_work_tiles, - &total_work_size}; - - cuda_assert(cuLaunchKernel(cuPathTrace, - num_blocks, 1, 1, - num_threads_per_block, 1, 1, - 0, 0, args, 0)); - - cuda_assert(cuCtxSynchronize()); - - /* Update progress. */ - rtile.sample = sample + wtile->num_samples; - task.update_progress(&rtile, rtile.w*rtile.h*wtile->num_samples); - - if(task.get_cancel()) { - if(task.need_finish_queue == false) - break; - } - } - } - - void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half) - { - if(have_error()) - return; - - CUDAContextScope scope(this); - - CUfunction cuFilmConvert; - CUdeviceptr d_rgba = map_pixels((rgba_byte)? 
rgba_byte: rgba_half); - CUdeviceptr d_buffer = cuda_device_ptr(buffer); - - /* get kernel function */ - if(rgba_half) { - cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float")); - } - else { - cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte")); - } - - - float sample_scale = 1.0f/(task.sample + 1); - - /* pass in parameters */ - void *args[] = {&d_rgba, - &d_buffer, - &sample_scale, - &task.x, - &task.y, - &task.w, - &task.h, - &task.offset, - &task.stride}; - - /* launch kernel */ - int threads_per_block; - cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert)); - - int xthreads = (int)sqrt(threads_per_block); - int ythreads = (int)sqrt(threads_per_block); - int xblocks = (task.w + xthreads - 1)/xthreads; - int yblocks = (task.h + ythreads - 1)/ythreads; - - cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1)); - - cuda_assert(cuLaunchKernel(cuFilmConvert, - xblocks , yblocks, 1, /* blocks */ - xthreads, ythreads, 1, /* threads */ - 0, 0, args, 0)); - - unmap_pixels((rgba_byte)? rgba_byte: rgba_half); - - cuda_assert(cuCtxSynchronize()); - } - - void shader(DeviceTask& task) - { - if(have_error()) - return; - - CUDAContextScope scope(this); - - CUfunction cuShader; - CUdeviceptr d_input = cuda_device_ptr(task.shader_input); - CUdeviceptr d_output = cuda_device_ptr(task.shader_output); - - /* get kernel function */ - if(task.shader_eval_type >= SHADER_EVAL_BAKE) { - cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_bake")); - } - else if(task.shader_eval_type == SHADER_EVAL_DISPLACE) { - cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace")); - } - else { - cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background")); - } - - /* do tasks in smaller chunks, so we can cancel it */ - const int shader_chunk_size = 65536; - const int start = task.shader_x; - const int end = task.shader_x + task.shader_w; - int offset = task.offset; - - bool canceled = false; - for(int sample = 0; sample < task.num_samples && !canceled; sample++) { - for(int shader_x = start; shader_x < end; shader_x += shader_chunk_size) { - int shader_w = min(shader_chunk_size, end - shader_x); - - /* pass in parameters */ - void *args[8]; - int arg = 0; - args[arg++] = &d_input; - args[arg++] = &d_output; - args[arg++] = &task.shader_eval_type; - if(task.shader_eval_type >= SHADER_EVAL_BAKE) { - args[arg++] = &task.shader_filter; - } - args[arg++] = &shader_x; - args[arg++] = &shader_w; - args[arg++] = &offset; - args[arg++] = &sample; - - /* launch kernel */ - int threads_per_block; - cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader)); - - int xblocks = (shader_w + threads_per_block - 1)/threads_per_block; - - cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuLaunchKernel(cuShader, - xblocks , 1, 1, /* blocks */ - threads_per_block, 1, 1, /* threads */ - 0, 0, args, 0)); - - cuda_assert(cuCtxSynchronize()); - - if(task.get_cancel()) { - canceled = true; - break; - } - } - - task.update_progress(NULL); - } - } - - CUdeviceptr map_pixels(device_ptr mem) - { - if(!background) { - PixelMem pmem = pixel_mem_map[mem]; - CUdeviceptr buffer; - - size_t bytes; - cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0)); - cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource)); - - return buffer; - } - - 
return cuda_device_ptr(mem); - } - - void unmap_pixels(device_ptr mem) - { - if(!background) { - PixelMem pmem = pixel_mem_map[mem]; - - cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0)); - } - } - - void pixels_alloc(device_memory& mem) - { - PixelMem pmem; - - pmem.w = mem.data_width; - pmem.h = mem.data_height; - - CUDAContextScope scope(this); - - glGenBuffers(1, &pmem.cuPBO); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); - if(mem.data_type == TYPE_HALF) - glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(GLhalf)*4, NULL, GL_DYNAMIC_DRAW); - else - glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(uint8_t)*4, NULL, GL_DYNAMIC_DRAW); - - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - - glActiveTexture(GL_TEXTURE0); - glGenTextures(1, &pmem.cuTexId); - glBindTexture(GL_TEXTURE_2D, pmem.cuTexId); - if(mem.data_type == TYPE_HALF) - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL); - else - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); - glBindTexture(GL_TEXTURE_2D, 0); - - CUresult result = cuGraphicsGLRegisterBuffer(&pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE); - - if(result == CUDA_SUCCESS) { - mem.device_pointer = pmem.cuTexId; - pixel_mem_map[mem.device_pointer] = pmem; - - mem.device_size = mem.memory_size(); - stats.mem_alloc(mem.device_size); - - return; - } - else { - /* failed to register buffer, fallback to no interop */ - glDeleteBuffers(1, &pmem.cuPBO); - glDeleteTextures(1, &pmem.cuTexId); - - background = true; - } - } - - void pixels_copy_from(device_memory& mem, int y, int w, int h) - { - PixelMem pmem = pixel_mem_map[mem.device_pointer]; - - CUDAContextScope scope(this); - - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); - uchar *pixels = (uchar*)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY); - size_t offset = sizeof(uchar)*4*y*w; - memcpy((uchar*)mem.host_pointer + offset, pixels + offset, sizeof(uchar)*4*w*h); - glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - } - - void pixels_free(device_memory& mem) - { - if(mem.device_pointer) { - PixelMem pmem = pixel_mem_map[mem.device_pointer]; - - CUDAContextScope scope(this); - - cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource)); - glDeleteBuffers(1, &pmem.cuPBO); - glDeleteTextures(1, &pmem.cuTexId); - - pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer)); - mem.device_pointer = 0; - - stats.mem_free(mem.device_size); - mem.device_size = 0; - } - } - - void draw_pixels( - device_memory& mem, int y, - int w, int h, int width, int height, - int dx, int dy, int dw, int dh, bool transparent, - const DeviceDrawParams &draw_params) - { - assert(mem.type == MEM_PIXELS); - - if(!background) { - const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL); - PixelMem pmem = pixel_mem_map[mem.device_pointer]; - float *vpointer; - - CUDAContextScope scope(this); - - /* for multi devices, this assumes the inefficient method that we allocate - * all pixels on the device even though we only render to a subset */ - size_t offset = 4*y*w; - - if(mem.data_type == TYPE_HALF) - offset *= sizeof(GLhalf); - else - offset *= sizeof(uint8_t); - - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); - glActiveTexture(GL_TEXTURE0); - glBindTexture(GL_TEXTURE_2D, pmem.cuTexId); - 
if(mem.data_type == TYPE_HALF) { - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void*)offset); - } - else { - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void*)offset); - } - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - - if(transparent) { - glEnable(GL_BLEND); - glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA); - } - - GLint shader_program; - if(use_fallback_shader) { - if(!bind_fallback_display_space_shader(dw, dh)) { - return; - } - shader_program = fallback_shader_program; - } - else { - draw_params.bind_display_space_shader_cb(); - glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program); - } - - if(!vertex_buffer) { - glGenBuffers(1, &vertex_buffer); - } - - glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer); - /* invalidate old contents - avoids stalling if buffer is still waiting in queue to be rendered */ - glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW); - - vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY); - - if(vpointer) { - /* texture coordinate - vertex pair */ - vpointer[0] = 0.0f; - vpointer[1] = 0.0f; - vpointer[2] = dx; - vpointer[3] = dy; - - vpointer[4] = (float)w/(float)pmem.w; - vpointer[5] = 0.0f; - vpointer[6] = (float)width + dx; - vpointer[7] = dy; - - vpointer[8] = (float)w/(float)pmem.w; - vpointer[9] = (float)h/(float)pmem.h; - vpointer[10] = (float)width + dx; - vpointer[11] = (float)height + dy; - - vpointer[12] = 0.0f; - vpointer[13] = (float)h/(float)pmem.h; - vpointer[14] = dx; - vpointer[15] = (float)height + dy; - - glUnmapBuffer(GL_ARRAY_BUFFER); - } - - GLuint vertex_array_object; - GLuint position_attribute, texcoord_attribute; - - glGenVertexArrays(1, &vertex_array_object); - glBindVertexArray(vertex_array_object); - - texcoord_attribute = glGetAttribLocation(shader_program, "texCoord"); - position_attribute = glGetAttribLocation(shader_program, "pos"); - - glEnableVertexAttribArray(texcoord_attribute); - glEnableVertexAttribArray(position_attribute); - - glVertexAttribPointer(texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0); - glVertexAttribPointer(position_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)(sizeof(float) * 2)); - - glDrawArrays(GL_TRIANGLE_FAN, 0, 4); - - if(use_fallback_shader) { - glUseProgram(0); - } - else { - draw_params.unbind_display_space_shader_cb(); - } - - if(transparent) { - glDisable(GL_BLEND); - } - - glBindTexture(GL_TEXTURE_2D, 0); - - return; - } - - Device::draw_pixels(mem, y, w, h, width, height, dx, dy, dw, dh, transparent, draw_params); - } - - void thread_run(DeviceTask *task) - { - CUDAContextScope scope(this); - - if(task->type == DeviceTask::RENDER) { - DeviceRequestedFeatures requested_features; - if(use_split_kernel()) { - if(split_kernel == NULL) { - split_kernel = new CUDASplitKernel(this); - split_kernel->load_kernels(requested_features); - } - } - - device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY); - - /* keep rendering tiles until done */ - RenderTile tile; - DenoisingTask denoising(this, *task); - - while(task->acquire_tile(this, tile)) { - if(tile.task == RenderTile::PATH_TRACE) { - if(use_split_kernel()) { - device_only_memory<uchar> void_buffer(this, "void_buffer"); - split_kernel->path_trace(task, tile, void_buffer, void_buffer); - } - else { - path_trace(*task, tile, work_tiles); - } - } - else if(tile.task == RenderTile::DENOISE) { - tile.sample = tile.start_sample + tile.num_samples; - - denoise(tile, denoising); - - task->update_progress(&tile, 
tile.w*tile.h); - } - - task->release_tile(tile); - - if(task->get_cancel()) { - if(task->need_finish_queue == false) - break; - } - } - - work_tiles.free(); - } - else if(task->type == DeviceTask::SHADER) { - shader(*task); - - cuda_assert(cuCtxSynchronize()); - } - } - - class CUDADeviceTask : public DeviceTask { - public: - CUDADeviceTask(CUDADevice *device, DeviceTask& task) - : DeviceTask(task) - { - run = function_bind(&CUDADevice::thread_run, device, this); - } - }; - - int get_split_task_count(DeviceTask& /*task*/) - { - return 1; - } - - void task_add(DeviceTask& task) - { - CUDAContextScope scope(this); - - /* Load texture info. */ - load_texture_info(); - - /* Synchronize all memory copies before executing task. */ - cuda_assert(cuCtxSynchronize()); - - if(task.type == DeviceTask::FILM_CONVERT) { - /* must be done in main thread due to opengl access */ - film_convert(task, task.buffer, task.rgba_byte, task.rgba_half); - } - else { - task_pool.push(new CUDADeviceTask(this, task)); - } - } - - void task_wait() - { - task_pool.wait(); - } - - void task_cancel() - { - task_pool.cancel(); - } - - friend class CUDASplitKernelFunction; - friend class CUDASplitKernel; - friend class CUDAContextScope; +#define CUDA_GET_BLOCKSIZE_1D(func, w, h) \ + int threads_per_block; \ + cuda_assert( \ + cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ + int xblocks = ((w) + threads_per_block - 1) / threads_per_block; \ + int yblocks = h; + +#define CUDA_LAUNCH_KERNEL_1D(func, args) \ + cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads_per_block, 1, 1, 0, 0, args, 0)); + + bool denoising_non_local_means(device_ptr image_ptr, + device_ptr guide_ptr, + device_ptr variance_ptr, + device_ptr out_ptr, + DenoisingTask *task) + { + if (have_error()) + return false; + + CUDAContextScope scope(this); + + int stride = task->buffer.stride; + int w = task->buffer.width; + int h = task->buffer.h; + int r = task->nlm_state.r; + int f = task->nlm_state.f; + float a = task->nlm_state.a; + float k_2 = task->nlm_state.k_2; + + int pass_stride = task->buffer.pass_stride; + int num_shifts = (2 * r + 1) * (2 * r + 1); + int channel_offset = task->nlm_state.is_color ? 
task->buffer.pass_stride : 0; + int frame_offset = 0; + + if (have_error()) + return false; + + CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer); + CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts; + CUdeviceptr weightAccum = difference + 2 * sizeof(float) * pass_stride * num_shifts; + CUdeviceptr scale_ptr = 0; + + cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float) * pass_stride)); + cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float) * pass_stride)); + + { + CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput; + cuda_assert(cuModuleGetFunction( + &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); + cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); + cuda_assert(cuModuleGetFunction( + &cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); + cuda_assert(cuModuleGetFunction( + &cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output")); + + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1)); + + CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w * h, num_shifts); + + void *calc_difference_args[] = {&guide_ptr, + &variance_ptr, + &scale_ptr, + &difference, + &w, + &h, + &stride, + &pass_stride, + &r, + &channel_offset, + &frame_offset, + &a, + &k_2}; + void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; + void *calc_weight_args[] = { + &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; + void *update_output_args[] = {&blurDifference, + &image_ptr, + &out_ptr, + &weightAccum, + &w, + &h, + &stride, + &pass_stride, + &channel_offset, + &r, + &f}; + + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args); + } + + { + CUfunction cuNLMNormalize; + cuda_assert(cuModuleGetFunction( + &cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize")); + cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1)); + void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride}; + CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h); + CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args); + cuda_assert(cuCtxSynchronize()); + } + + return !have_error(); + } + + bool denoising_construct_transform(DenoisingTask *task) + { + if (have_error()) + return false; + + CUDAContextScope scope(this); + + CUfunction cuFilterConstructTransform; + cuda_assert(cuModuleGetFunction( + &cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform")); + cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED)); + CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, task->storage.w, task->storage.h); + + void *args[] = {&task->buffer.mem.device_pointer, + &task->tile_info_mem.device_pointer, + &task->storage.transform.device_pointer, + &task->storage.rank.device_pointer, + &task->filter_area, + &task->rect, + &task->radius, + &task->pca_threshold, + &task->buffer.pass_stride, + &task->buffer.frame_stride, + &task->buffer.use_time}; 
+ CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); + } + + bool denoising_accumulate(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr scale_ptr, + int frame, + DenoisingTask *task) + { + if (have_error()) + return false; + + CUDAContextScope scope(this); + + int r = task->radius; + int f = 4; + float a = 1.0f; + float k_2 = task->nlm_k_2; + + int w = task->reconstruction_state.source_w; + int h = task->reconstruction_state.source_h; + int stride = task->buffer.stride; + int frame_offset = frame * task->buffer.frame_stride; + int t = task->tile_info->frames[frame]; + + int pass_stride = task->buffer.pass_stride; + int num_shifts = (2 * r + 1) * (2 * r + 1); + + if (have_error()) + return false; + + CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer); + CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts; + + CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian; + cuda_assert(cuModuleGetFunction( + &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); + cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); + cuda_assert(cuModuleGetFunction( + &cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); + cuda_assert(cuModuleGetFunction( + &cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian")); + + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED)); + + CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, + task->reconstruction_state.source_w * + task->reconstruction_state.source_h, + num_shifts); + + void *calc_difference_args[] = {&color_ptr, + &color_variance_ptr, + &scale_ptr, + &difference, + &w, + &h, + &stride, + &pass_stride, + &r, + &pass_stride, + &frame_offset, + &a, + &k_2}; + void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; + void *calc_weight_args[] = { + &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; + void *construct_gramian_args[] = {&t, + &blurDifference, + &task->buffer.mem.device_pointer, + &task->storage.transform.device_pointer, + &task->storage.rank.device_pointer, + &task->storage.XtWX.device_pointer, + &task->storage.XtWY.device_pointer, + &task->reconstruction_state.filter_window, + &w, + &h, + &stride, + &pass_stride, + &r, + &f, + &frame_offset, + &task->buffer.use_time}; + + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); + } + + bool denoising_solve(device_ptr output_ptr, DenoisingTask *task) + { + CUfunction cuFinalize; + cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize")); + cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1)); + void *finalize_args[] = {&output_ptr, + &task->storage.rank.device_pointer, + &task->storage.XtWX.device_pointer, + &task->storage.XtWY.device_pointer, + &task->filter_area, + 
&task->reconstruction_state.buffer_params.x, + &task->render_buffer.samples}; + CUDA_GET_BLOCKSIZE( + cuFinalize, task->reconstruction_state.source_w, task->reconstruction_state.source_h); + CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); + } + + bool denoising_combine_halves(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr mean_ptr, + device_ptr variance_ptr, + int r, + int4 rect, + DenoisingTask *task) + { + if (have_error()) + return false; + + CUDAContextScope scope(this); + + CUfunction cuFilterCombineHalves; + cuda_assert(cuModuleGetFunction( + &cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves")); + cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE( + cuFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y); + + void *args[] = {&mean_ptr, &variance_ptr, &a_ptr, &b_ptr, &rect, &r}; + CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); + } + + bool denoising_divide_shadow(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr sample_variance_ptr, + device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, + DenoisingTask *task) + { + if (have_error()) + return false; + + CUDAContextScope scope(this); + + CUfunction cuFilterDivideShadow; + cuda_assert(cuModuleGetFunction( + &cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow")); + cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE( + cuFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y); + + void *args[] = {&task->render_buffer.samples, + &task->tile_info_mem.device_pointer, + &a_ptr, + &b_ptr, + &sample_variance_ptr, + &sv_variance_ptr, + &buffer_variance_ptr, + &task->rect, + &task->render_buffer.pass_stride, + &task->render_buffer.offset}; + CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); + } + + bool denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + float scale, + DenoisingTask *task) + { + if (have_error()) + return false; + + CUDAContextScope scope(this); + + CUfunction cuFilterGetFeature; + cuda_assert(cuModuleGetFunction( + &cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature")); + cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE( + cuFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y); + + void *args[] = {&task->render_buffer.samples, + &task->tile_info_mem.device_pointer, + &mean_offset, + &variance_offset, + &mean_ptr, + &variance_ptr, + &scale, + &task->rect, + &task->render_buffer.pass_stride, + &task->render_buffer.offset}; + CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); + } + + bool denoising_write_feature(int out_offset, + device_ptr from_ptr, + device_ptr buffer_ptr, + DenoisingTask *task) + { + if (have_error()) + return false; + + CUDAContextScope scope(this); + + CUfunction cuFilterWriteFeature; + cuda_assert(cuModuleGetFunction( + &cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature")); + cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, task->filter_area.z, task->filter_area.w); + + void *args[] = {&task->render_buffer.samples, + 
&task->reconstruction_state.buffer_params, + &task->filter_area, + &from_ptr, + &buffer_ptr, + &out_offset, + &task->rect}; + CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); + } + + bool denoising_detect_outliers(device_ptr image_ptr, + device_ptr variance_ptr, + device_ptr depth_ptr, + device_ptr output_ptr, + DenoisingTask *task) + { + if (have_error()) + return false; + + CUDAContextScope scope(this); + + CUfunction cuFilterDetectOutliers; + cuda_assert(cuModuleGetFunction( + &cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers")); + cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE( + cuFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y); + + void *args[] = {&image_ptr, + &variance_ptr, + &depth_ptr, + &output_ptr, + &task->rect, + &task->buffer.pass_stride}; + + CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); + } + + void denoise(RenderTile &rtile, DenoisingTask &denoising) + { + denoising.functions.construct_transform = function_bind( + &CUDADevice::denoising_construct_transform, this, &denoising); + denoising.functions.accumulate = function_bind( + &CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); + denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising); + denoising.functions.divide_shadow = function_bind( + &CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.non_local_means = function_bind( + &CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); + denoising.functions.combine_halves = function_bind( + &CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); + denoising.functions.get_feature = function_bind( + &CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.write_feature = function_bind( + &CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising); + denoising.functions.detect_outliers = function_bind( + &CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); + + denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); + denoising.render_buffer.samples = rtile.sample; + denoising.buffer.gpu_temporary_mem = true; + + denoising.run_denoising(&rtile); + } + + void path_trace(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles) + { + scoped_timer timer(&rtile.buffers->render_time); + + if (have_error()) + return; + + CUDAContextScope scope(this); + CUfunction cuPathTrace; + + /* Get kernel function. */ + if (task.integrator_branched) { + cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace")); + } + else { + cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace")); + } + + if (have_error()) { + return; + } + + cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1)); + + /* Allocate work tile. */ + work_tiles.alloc(1); + + WorkTile *wtile = work_tiles.data(); + wtile->x = rtile.x; + wtile->y = rtile.y; + wtile->w = rtile.w; + wtile->h = rtile.h; + wtile->offset = rtile.offset; + wtile->stride = rtile.stride; + wtile->buffer = (float *)cuda_device_ptr(rtile.buffer); + + /* Prepare work size. 
More step samples render faster, but for now we + * remain conservative for GPUs connected to a display to avoid driver + * timeouts and display freezing. */ + int min_blocks, num_threads_per_block; + cuda_assert(cuOccupancyMaxPotentialBlockSize( + &min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0)); + if (!info.display_device) { + min_blocks *= 8; + } + + uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h); + + /* Render all samples. */ + int start_sample = rtile.start_sample; + int end_sample = rtile.start_sample + rtile.num_samples; + + for (int sample = start_sample; sample < end_sample; sample += step_samples) { + /* Setup and copy work tile to device. */ + wtile->start_sample = sample; + wtile->num_samples = min(step_samples, end_sample - sample); + work_tiles.copy_to_device(); + + CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer); + uint total_work_size = wtile->w * wtile->h * wtile->num_samples; + uint num_blocks = divide_up(total_work_size, num_threads_per_block); + + /* Launch kernel. */ + void *args[] = {&d_work_tiles, &total_work_size}; + + cuda_assert(cuLaunchKernel( + cuPathTrace, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); + + cuda_assert(cuCtxSynchronize()); + + /* Update progress. */ + rtile.sample = sample + wtile->num_samples; + task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); + + if (task.get_cancel()) { + if (task.need_finish_queue == false) + break; + } + } + } + + void film_convert(DeviceTask &task, + device_ptr buffer, + device_ptr rgba_byte, + device_ptr rgba_half) + { + if (have_error()) + return; + + CUDAContextScope scope(this); + + CUfunction cuFilmConvert; + CUdeviceptr d_rgba = map_pixels((rgba_byte) ? rgba_byte : rgba_half); + CUdeviceptr d_buffer = cuda_device_ptr(buffer); + + /* get kernel function */ + if (rgba_half) { + cuda_assert( + cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float")); + } + else { + cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte")); + } + + float sample_scale = 1.0f / (task.sample + 1); + + /* pass in parameters */ + void *args[] = {&d_rgba, + &d_buffer, + &sample_scale, + &task.x, + &task.y, + &task.w, + &task.h, + &task.offset, + &task.stride}; + + /* launch kernel */ + int threads_per_block; + cuda_assert(cuFuncGetAttribute( + &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert)); + + int xthreads = (int)sqrt(threads_per_block); + int ythreads = (int)sqrt(threads_per_block); + int xblocks = (task.w + xthreads - 1) / xthreads; + int yblocks = (task.h + ythreads - 1) / ythreads; + + cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1)); + + cuda_assert(cuLaunchKernel(cuFilmConvert, + xblocks, + yblocks, + 1, /* blocks */ + xthreads, + ythreads, + 1, /* threads */ + 0, + 0, + args, + 0)); + + unmap_pixels((rgba_byte) ? 
rgba_byte : rgba_half); + + cuda_assert(cuCtxSynchronize()); + } + + void shader(DeviceTask &task) + { + if (have_error()) + return; + + CUDAContextScope scope(this); + + CUfunction cuShader; + CUdeviceptr d_input = cuda_device_ptr(task.shader_input); + CUdeviceptr d_output = cuda_device_ptr(task.shader_output); + + /* get kernel function */ + if (task.shader_eval_type >= SHADER_EVAL_BAKE) { + cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_bake")); + } + else if (task.shader_eval_type == SHADER_EVAL_DISPLACE) { + cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace")); + } + else { + cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background")); + } + + /* do tasks in smaller chunks, so we can cancel it */ + const int shader_chunk_size = 65536; + const int start = task.shader_x; + const int end = task.shader_x + task.shader_w; + int offset = task.offset; + + bool canceled = false; + for (int sample = 0; sample < task.num_samples && !canceled; sample++) { + for (int shader_x = start; shader_x < end; shader_x += shader_chunk_size) { + int shader_w = min(shader_chunk_size, end - shader_x); + + /* pass in parameters */ + void *args[8]; + int arg = 0; + args[arg++] = &d_input; + args[arg++] = &d_output; + args[arg++] = &task.shader_eval_type; + if (task.shader_eval_type >= SHADER_EVAL_BAKE) { + args[arg++] = &task.shader_filter; + } + args[arg++] = &shader_x; + args[arg++] = &shader_w; + args[arg++] = &offset; + args[arg++] = &sample; + + /* launch kernel */ + int threads_per_block; + cuda_assert(cuFuncGetAttribute( + &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader)); + + int xblocks = (shader_w + threads_per_block - 1) / threads_per_block; + + cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuLaunchKernel(cuShader, + xblocks, + 1, + 1, /* blocks */ + threads_per_block, + 1, + 1, /* threads */ + 0, + 0, + args, + 0)); + + cuda_assert(cuCtxSynchronize()); + + if (task.get_cancel()) { + canceled = true; + break; + } + } + + task.update_progress(NULL); + } + } + + CUdeviceptr map_pixels(device_ptr mem) + { + if (!background) { + PixelMem pmem = pixel_mem_map[mem]; + CUdeviceptr buffer; + + size_t bytes; + cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0)); + cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource)); + + return buffer; + } + + return cuda_device_ptr(mem); + } + + void unmap_pixels(device_ptr mem) + { + if (!background) { + PixelMem pmem = pixel_mem_map[mem]; + + cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0)); + } + } + + void pixels_alloc(device_memory &mem) + { + PixelMem pmem; + + pmem.w = mem.data_width; + pmem.h = mem.data_height; + + CUDAContextScope scope(this); + + glGenBuffers(1, &pmem.cuPBO); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); + if (mem.data_type == TYPE_HALF) + glBufferData( + GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(GLhalf) * 4, NULL, GL_DYNAMIC_DRAW); + else + glBufferData( + GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(uint8_t) * 4, NULL, GL_DYNAMIC_DRAW); + + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + + glActiveTexture(GL_TEXTURE0); + glGenTextures(1, &pmem.cuTexId); + glBindTexture(GL_TEXTURE_2D, pmem.cuTexId); + if (mem.data_type == TYPE_HALF) + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL); + else + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); + 
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + glBindTexture(GL_TEXTURE_2D, 0); + + CUresult result = cuGraphicsGLRegisterBuffer( + &pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE); + + if (result == CUDA_SUCCESS) { + mem.device_pointer = pmem.cuTexId; + pixel_mem_map[mem.device_pointer] = pmem; + + mem.device_size = mem.memory_size(); + stats.mem_alloc(mem.device_size); + + return; + } + else { + /* failed to register buffer, fallback to no interop */ + glDeleteBuffers(1, &pmem.cuPBO); + glDeleteTextures(1, &pmem.cuTexId); + + background = true; + } + } + + void pixels_copy_from(device_memory &mem, int y, int w, int h) + { + PixelMem pmem = pixel_mem_map[mem.device_pointer]; + + CUDAContextScope scope(this); + + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); + uchar *pixels = (uchar *)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY); + size_t offset = sizeof(uchar) * 4 * y * w; + memcpy((uchar *)mem.host_pointer + offset, pixels + offset, sizeof(uchar) * 4 * w * h); + glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + } + + void pixels_free(device_memory &mem) + { + if (mem.device_pointer) { + PixelMem pmem = pixel_mem_map[mem.device_pointer]; + + CUDAContextScope scope(this); + + cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource)); + glDeleteBuffers(1, &pmem.cuPBO); + glDeleteTextures(1, &pmem.cuTexId); + + pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer)); + mem.device_pointer = 0; + + stats.mem_free(mem.device_size); + mem.device_size = 0; + } + } + + void draw_pixels(device_memory &mem, + int y, + int w, + int h, + int width, + int height, + int dx, + int dy, + int dw, + int dh, + bool transparent, + const DeviceDrawParams &draw_params) + { + assert(mem.type == MEM_PIXELS); + + if (!background) { + const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL); + PixelMem pmem = pixel_mem_map[mem.device_pointer]; + float *vpointer; + + CUDAContextScope scope(this); + + /* for multi devices, this assumes the inefficient method that we allocate + * all pixels on the device even though we only render to a subset */ + size_t offset = 4 * y * w; + + if (mem.data_type == TYPE_HALF) + offset *= sizeof(GLhalf); + else + offset *= sizeof(uint8_t); + + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); + glActiveTexture(GL_TEXTURE0); + glBindTexture(GL_TEXTURE_2D, pmem.cuTexId); + if (mem.data_type == TYPE_HALF) { + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void *)offset); + } + else { + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void *)offset); + } + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + + if (transparent) { + glEnable(GL_BLEND); + glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA); + } + + GLint shader_program; + if (use_fallback_shader) { + if (!bind_fallback_display_space_shader(dw, dh)) { + return; + } + shader_program = fallback_shader_program; + } + else { + draw_params.bind_display_space_shader_cb(); + glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program); + } + + if (!vertex_buffer) { + glGenBuffers(1, &vertex_buffer); + } + + glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer); + /* invalidate old contents - avoids stalling if buffer is still waiting in queue to be rendered */ + glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW); + + vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY); + + if (vpointer) { + /* texture 
coordinate - vertex pair */ + vpointer[0] = 0.0f; + vpointer[1] = 0.0f; + vpointer[2] = dx; + vpointer[3] = dy; + + vpointer[4] = (float)w / (float)pmem.w; + vpointer[5] = 0.0f; + vpointer[6] = (float)width + dx; + vpointer[7] = dy; + + vpointer[8] = (float)w / (float)pmem.w; + vpointer[9] = (float)h / (float)pmem.h; + vpointer[10] = (float)width + dx; + vpointer[11] = (float)height + dy; + + vpointer[12] = 0.0f; + vpointer[13] = (float)h / (float)pmem.h; + vpointer[14] = dx; + vpointer[15] = (float)height + dy; + + glUnmapBuffer(GL_ARRAY_BUFFER); + } + + GLuint vertex_array_object; + GLuint position_attribute, texcoord_attribute; + + glGenVertexArrays(1, &vertex_array_object); + glBindVertexArray(vertex_array_object); + + texcoord_attribute = glGetAttribLocation(shader_program, "texCoord"); + position_attribute = glGetAttribLocation(shader_program, "pos"); + + glEnableVertexAttribArray(texcoord_attribute); + glEnableVertexAttribArray(position_attribute); + + glVertexAttribPointer( + texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0); + glVertexAttribPointer(position_attribute, + 2, + GL_FLOAT, + GL_FALSE, + 4 * sizeof(float), + (const GLvoid *)(sizeof(float) * 2)); + + glDrawArrays(GL_TRIANGLE_FAN, 0, 4); + + if (use_fallback_shader) { + glUseProgram(0); + } + else { + draw_params.unbind_display_space_shader_cb(); + } + + if (transparent) { + glDisable(GL_BLEND); + } + + glBindTexture(GL_TEXTURE_2D, 0); + + return; + } + + Device::draw_pixels(mem, y, w, h, width, height, dx, dy, dw, dh, transparent, draw_params); + } + + void thread_run(DeviceTask *task) + { + CUDAContextScope scope(this); + + if (task->type == DeviceTask::RENDER) { + DeviceRequestedFeatures requested_features; + if (use_split_kernel()) { + if (split_kernel == NULL) { + split_kernel = new CUDASplitKernel(this); + split_kernel->load_kernels(requested_features); + } + } + + device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY); + + /* keep rendering tiles until done */ + RenderTile tile; + DenoisingTask denoising(this, *task); + + while (task->acquire_tile(this, tile)) { + if (tile.task == RenderTile::PATH_TRACE) { + if (use_split_kernel()) { + device_only_memory<uchar> void_buffer(this, "void_buffer"); + split_kernel->path_trace(task, tile, void_buffer, void_buffer); + } + else { + path_trace(*task, tile, work_tiles); + } + } + else if (tile.task == RenderTile::DENOISE) { + tile.sample = tile.start_sample + tile.num_samples; + + denoise(tile, denoising); + + task->update_progress(&tile, tile.w * tile.h); + } + + task->release_tile(tile); + + if (task->get_cancel()) { + if (task->need_finish_queue == false) + break; + } + } + + work_tiles.free(); + } + else if (task->type == DeviceTask::SHADER) { + shader(*task); + + cuda_assert(cuCtxSynchronize()); + } + } + + class CUDADeviceTask : public DeviceTask { + public: + CUDADeviceTask(CUDADevice *device, DeviceTask &task) : DeviceTask(task) + { + run = function_bind(&CUDADevice::thread_run, device, this); + } + }; + + int get_split_task_count(DeviceTask & /*task*/) + { + return 1; + } + + void task_add(DeviceTask &task) + { + CUDAContextScope scope(this); + + /* Load texture info. */ + load_texture_info(); + + /* Synchronize all memory copies before executing task. 
*/ + cuda_assert(cuCtxSynchronize()); + + if (task.type == DeviceTask::FILM_CONVERT) { + /* must be done in main thread due to opengl access */ + film_convert(task, task.buffer, task.rgba_byte, task.rgba_half); + } + else { + task_pool.push(new CUDADeviceTask(this, task)); + } + } + + void task_wait() + { + task_pool.wait(); + } + + void task_cancel() + { + task_pool.cancel(); + } + + friend class CUDASplitKernelFunction; + friend class CUDASplitKernel; + friend class CUDAContextScope; }; /* redefine the cuda_assert macro so it can be used outside of the CUDADevice class @@ -2207,496 +2305,501 @@ public: */ #undef cuda_assert #define cuda_assert(stmt) \ - { \ - CUresult result = stmt; \ - \ - if(result != CUDA_SUCCESS) { \ - string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \ - if(device->error_msg == "") \ - device->error_msg = message; \ - fprintf(stderr, "%s\n", message.c_str()); \ - /*cuda_abort();*/ \ - device->cuda_error_documentation(); \ - } \ - } (void) 0 - + { \ + CUresult result = stmt; \ +\ + if (result != CUDA_SUCCESS) { \ + string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \ + if (device->error_msg == "") \ + device->error_msg = message; \ + fprintf(stderr, "%s\n", message.c_str()); \ + /*cuda_abort();*/ \ + device->cuda_error_documentation(); \ + } \ + } \ + (void)0 /* CUDA context scope. */ -CUDAContextScope::CUDAContextScope(CUDADevice *device) -: device(device) +CUDAContextScope::CUDAContextScope(CUDADevice *device) : device(device) { - cuda_assert(cuCtxPushCurrent(device->cuContext)); + cuda_assert(cuCtxPushCurrent(device->cuContext)); } CUDAContextScope::~CUDAContextScope() { - cuda_assert(cuCtxPopCurrent(NULL)); + cuda_assert(cuCtxPopCurrent(NULL)); } /* split kernel */ -class CUDASplitKernelFunction : public SplitKernelFunction{ - CUDADevice* device; - CUfunction func; -public: - CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func) {} - - /* enqueue the kernel, returns false if there is an error */ - bool enqueue(const KernelDimensions &dim, device_memory &/*kg*/, device_memory &/*data*/) - { - return enqueue(dim, NULL); - } - - /* enqueue the kernel, returns false if there is an error */ - bool enqueue(const KernelDimensions &dim, void *args[]) - { - if(device->have_error()) - return false; - - CUDAContextScope scope(device); - - /* we ignore dim.local_size for now, as this is faster */ - int threads_per_block; - cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); - - int xblocks = (dim.global_size[0]*dim.global_size[1] + threads_per_block - 1)/threads_per_block; - - cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); - - cuda_assert(cuLaunchKernel(func, - xblocks, 1, 1, /* blocks */ - threads_per_block, 1, 1, /* threads */ - 0, 0, args, 0)); - - return !device->have_error(); - } +class CUDASplitKernelFunction : public SplitKernelFunction { + CUDADevice *device; + CUfunction func; + + public: + CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func) + { + } + + /* enqueue the kernel, returns false if there is an error */ + bool enqueue(const KernelDimensions &dim, device_memory & /*kg*/, device_memory & /*data*/) + { + return enqueue(dim, NULL); + } + + /* enqueue the kernel, returns false if there is an error */ + bool enqueue(const KernelDimensions &dim, void *args[]) + { + if (device->have_error()) + return false; + + CUDAContextScope scope(device); + + /* we 
ignore dim.local_size for now, as this is faster */ + int threads_per_block; + cuda_assert( + cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); + + int xblocks = (dim.global_size[0] * dim.global_size[1] + threads_per_block - 1) / + threads_per_block; + + cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); + + cuda_assert(cuLaunchKernel(func, + xblocks, + 1, + 1, /* blocks */ + threads_per_block, + 1, + 1, /* threads */ + 0, + 0, + args, + 0)); + + return !device->have_error(); + } }; CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device) { } -uint64_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory& /*data*/, size_t num_threads) +uint64_t CUDASplitKernel::state_buffer_size(device_memory & /*kg*/, + device_memory & /*data*/, + size_t num_threads) { - CUDAContextScope scope(device); + CUDAContextScope scope(device); - device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE); - size_buffer.alloc(1); - size_buffer.zero_to_device(); + device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE); + size_buffer.alloc(1); + size_buffer.zero_to_device(); - uint threads = num_threads; - CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer); + uint threads = num_threads; + CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer); - struct args_t { - uint* num_threads; - CUdeviceptr* size; - }; + struct args_t { + uint *num_threads; + CUdeviceptr *size; + }; - args_t args = { - &threads, - &d_size - }; + args_t args = {&threads, &d_size}; - CUfunction state_buffer_size; - cuda_assert(cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size")); + CUfunction state_buffer_size; + cuda_assert( + cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size")); - cuda_assert(cuLaunchKernel(state_buffer_size, - 1, 1, 1, - 1, 1, 1, - 0, 0, (void**)&args, 0)); + cuda_assert(cuLaunchKernel(state_buffer_size, 1, 1, 1, 1, 1, 1, 0, 0, (void **)&args, 0)); - size_buffer.copy_from_device(0, 1, 1); - size_t size = size_buffer[0]; - size_buffer.free(); + size_buffer.copy_from_device(0, 1, 1); + size_t size = size_buffer[0]; + size_buffer.free(); - return size; + return size; } -bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim, - RenderTile& rtile, - int num_global_elements, - device_memory& /*kernel_globals*/, - device_memory& /*kernel_data*/, - device_memory& split_data, - device_memory& ray_state, - device_memory& queue_index, - device_memory& use_queues_flag, - device_memory& work_pool_wgs) +bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim, + RenderTile &rtile, + int num_global_elements, + device_memory & /*kernel_globals*/, + device_memory & /*kernel_data*/, + device_memory &split_data, + device_memory &ray_state, + device_memory &queue_index, + device_memory &use_queues_flag, + device_memory &work_pool_wgs) { - CUDAContextScope scope(device); - - CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer); - CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer); - CUdeviceptr d_queue_index = device->cuda_device_ptr(queue_index.device_pointer); - CUdeviceptr d_use_queues_flag = device->cuda_device_ptr(use_queues_flag.device_pointer); - CUdeviceptr d_work_pool_wgs = device->cuda_device_ptr(work_pool_wgs.device_pointer); - - CUdeviceptr d_buffer = 
device->cuda_device_ptr(rtile.buffer); - - int end_sample = rtile.start_sample + rtile.num_samples; - int queue_size = dim.global_size[0] * dim.global_size[1]; - - struct args_t { - CUdeviceptr* split_data_buffer; - int* num_elements; - CUdeviceptr* ray_state; - int* start_sample; - int* end_sample; - int* sx; - int* sy; - int* sw; - int* sh; - int* offset; - int* stride; - CUdeviceptr* queue_index; - int* queuesize; - CUdeviceptr* use_queues_flag; - CUdeviceptr* work_pool_wgs; - int* num_samples; - CUdeviceptr* buffer; - }; - - args_t args = { - &d_split_data, - &num_global_elements, - &d_ray_state, - &rtile.start_sample, - &end_sample, - &rtile.x, - &rtile.y, - &rtile.w, - &rtile.h, - &rtile.offset, - &rtile.stride, - &d_queue_index, - &queue_size, - &d_use_queues_flag, - &d_work_pool_wgs, - &rtile.num_samples, - &d_buffer - }; - - CUfunction data_init; - cuda_assert(cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init")); - if(device->have_error()) { - return false; - } - - CUDASplitKernelFunction(device, data_init).enqueue(dim, (void**)&args); - - return !device->have_error(); + CUDAContextScope scope(device); + + CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer); + CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer); + CUdeviceptr d_queue_index = device->cuda_device_ptr(queue_index.device_pointer); + CUdeviceptr d_use_queues_flag = device->cuda_device_ptr(use_queues_flag.device_pointer); + CUdeviceptr d_work_pool_wgs = device->cuda_device_ptr(work_pool_wgs.device_pointer); + + CUdeviceptr d_buffer = device->cuda_device_ptr(rtile.buffer); + + int end_sample = rtile.start_sample + rtile.num_samples; + int queue_size = dim.global_size[0] * dim.global_size[1]; + + struct args_t { + CUdeviceptr *split_data_buffer; + int *num_elements; + CUdeviceptr *ray_state; + int *start_sample; + int *end_sample; + int *sx; + int *sy; + int *sw; + int *sh; + int *offset; + int *stride; + CUdeviceptr *queue_index; + int *queuesize; + CUdeviceptr *use_queues_flag; + CUdeviceptr *work_pool_wgs; + int *num_samples; + CUdeviceptr *buffer; + }; + + args_t args = {&d_split_data, + &num_global_elements, + &d_ray_state, + &rtile.start_sample, + &end_sample, + &rtile.x, + &rtile.y, + &rtile.w, + &rtile.h, + &rtile.offset, + &rtile.stride, + &d_queue_index, + &queue_size, + &d_use_queues_flag, + &d_work_pool_wgs, + &rtile.num_samples, + &d_buffer}; + + CUfunction data_init; + cuda_assert( + cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init")); + if (device->have_error()) { + return false; + } + + CUDASplitKernelFunction(device, data_init).enqueue(dim, (void **)&args); + + return !device->have_error(); } -SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(const string& kernel_name, - const DeviceRequestedFeatures&) +SplitKernelFunction *CUDASplitKernel::get_split_kernel_function(const string &kernel_name, + const DeviceRequestedFeatures &) { - CUDAContextScope scope(device); - CUfunction func; - - cuda_assert(cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data())); - if(device->have_error()) { - device->cuda_error_message(string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data())); - return NULL; - } - - return new CUDASplitKernelFunction(device, func); + CUDAContextScope scope(device); + CUfunction func; + + cuda_assert( + cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data())); + if 
(device->have_error()) { + device->cuda_error_message( + string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data())); + return NULL; + } + + return new CUDASplitKernelFunction(device, func); } int2 CUDASplitKernel::split_kernel_local_size() { - return make_int2(32, 1); + return make_int2(32, 1); } -int2 CUDASplitKernel::split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask * /*task*/) +int2 CUDASplitKernel::split_kernel_global_size(device_memory &kg, + device_memory &data, + DeviceTask * /*task*/) { - CUDAContextScope scope(device); - size_t free; - size_t total; + CUDAContextScope scope(device); + size_t free; + size_t total; - cuda_assert(cuMemGetInfo(&free, &total)); + cuda_assert(cuMemGetInfo(&free, &total)); - VLOG(1) << "Maximum device allocation size: " - << string_human_readable_number(free) << " bytes. (" - << string_human_readable_size(free) << ")."; + VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(free) + << " bytes. (" << string_human_readable_size(free) << ")."; - size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2); - size_t side = round_down((int)sqrt(num_elements), 32); - int2 global_size = make_int2(side, round_down(num_elements / side, 16)); - VLOG(1) << "Global size: " << global_size << "."; - return global_size; + size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2); + size_t side = round_down((int)sqrt(num_elements), 32); + int2 global_size = make_int2(side, round_down(num_elements / side, 16)); + VLOG(1) << "Global size: " << global_size << "."; + return global_size; } bool device_cuda_init() { #ifdef WITH_CUDA_DYNLOAD - static bool initialized = false; - static bool result = false; - - if(initialized) - return result; - - initialized = true; - int cuew_result = cuewInit(CUEW_INIT_CUDA); - if(cuew_result == CUEW_SUCCESS) { - VLOG(1) << "CUEW initialization succeeded"; - if(CUDADevice::have_precompiled_kernels()) { - VLOG(1) << "Found precompiled kernels"; - result = true; - } -#ifndef _WIN32 - else if(cuewCompilerPath() != NULL) { - VLOG(1) << "Found CUDA compiler " << cuewCompilerPath(); - result = true; - } - else { - VLOG(1) << "Neither precompiled kernels nor CUDA compiler was found," - << " unable to use CUDA"; - } -#endif - } - else { - VLOG(1) << "CUEW initialization failed: " - << ((cuew_result == CUEW_ERROR_ATEXIT_FAILED) - ? "Error setting up atexit() handler" - : "Error opening the library"); - } - - return result; + static bool initialized = false; + static bool result = false; + + if (initialized) + return result; + + initialized = true; + int cuew_result = cuewInit(CUEW_INIT_CUDA); + if (cuew_result == CUEW_SUCCESS) { + VLOG(1) << "CUEW initialization succeeded"; + if (CUDADevice::have_precompiled_kernels()) { + VLOG(1) << "Found precompiled kernels"; + result = true; + } +# ifndef _WIN32 + else if (cuewCompilerPath() != NULL) { + VLOG(1) << "Found CUDA compiler " << cuewCompilerPath(); + result = true; + } + else { + VLOG(1) << "Neither precompiled kernels nor CUDA compiler was found," + << " unable to use CUDA"; + } +# endif + } + else { + VLOG(1) << "CUEW initialization failed: " + << ((cuew_result == CUEW_ERROR_ATEXIT_FAILED) ? 
"Error setting up atexit() handler" : + "Error opening the library"); + } + + return result; #else /* WITH_CUDA_DYNLOAD */ - return true; -#endif /* WITH_CUDA_DYNLOAD */ + return true; +#endif /* WITH_CUDA_DYNLOAD */ } -Device *device_cuda_create(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background) +Device *device_cuda_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) { - return new CUDADevice(info, stats, profiler, background); + return new CUDADevice(info, stats, profiler, background); } static CUresult device_cuda_safe_init() { #ifdef _WIN32 - __try { - return cuInit(0); - } - __except(EXCEPTION_EXECUTE_HANDLER) { - /* Ignore crashes inside the CUDA driver and hope we can - * survive even with corrupted CUDA installs. */ - fprintf(stderr, "Cycles CUDA: driver crashed, continuing without CUDA.\n"); - } - - return CUDA_ERROR_NO_DEVICE; + __try { + return cuInit(0); + } + __except (EXCEPTION_EXECUTE_HANDLER) { + /* Ignore crashes inside the CUDA driver and hope we can + * survive even with corrupted CUDA installs. */ + fprintf(stderr, "Cycles CUDA: driver crashed, continuing without CUDA.\n"); + } + + return CUDA_ERROR_NO_DEVICE; #else - return cuInit(0); + return cuInit(0); #endif } -void device_cuda_info(vector<DeviceInfo>& devices) +void device_cuda_info(vector<DeviceInfo> &devices) { - CUresult result = device_cuda_safe_init(); - if(result != CUDA_SUCCESS) { - if(result != CUDA_ERROR_NO_DEVICE) - fprintf(stderr, "CUDA cuInit: %s\n", cuewErrorString(result)); - return; - } - - int count = 0; - result = cuDeviceGetCount(&count); - if(result != CUDA_SUCCESS) { - fprintf(stderr, "CUDA cuDeviceGetCount: %s\n", cuewErrorString(result)); - return; - } - - vector<DeviceInfo> display_devices; - - for(int num = 0; num < count; num++) { - char name[256]; - - result = cuDeviceGetName(name, 256, num); - if(result != CUDA_SUCCESS) { - fprintf(stderr, "CUDA cuDeviceGetName: %s\n", cuewErrorString(result)); - continue; - } - - int major; - cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, num); - if(major < 3) { - VLOG(1) << "Ignoring device \"" << name - << "\", this graphics card is no longer supported."; - continue; - } - - DeviceInfo info; - - info.type = DEVICE_CUDA; - info.description = string(name); - info.num = num; - - info.has_half_images = (major >= 3); - info.has_volume_decoupled = false; - - int pci_location[3] = {0, 0, 0}; - cuDeviceGetAttribute(&pci_location[0], CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, num); - cuDeviceGetAttribute(&pci_location[1], CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, num); - cuDeviceGetAttribute(&pci_location[2], CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, num); - info.id = string_printf("CUDA_%s_%04x:%02x:%02x", - name, - (unsigned int)pci_location[0], - (unsigned int)pci_location[1], - (unsigned int)pci_location[2]); - - /* If device has a kernel timeout and no compute preemption, we assume - * it is connected to a display and will freeze the display while doing - * computations. 
*/ - int timeout_attr = 0, preempt_attr = 0; - cuDeviceGetAttribute(&timeout_attr, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, num); - cuDeviceGetAttribute(&preempt_attr, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, num); - - if(timeout_attr && !preempt_attr) { - VLOG(1) << "Device is recognized as display."; - info.description += " (Display)"; - info.display_device = true; - display_devices.push_back(info); - } - else { - devices.push_back(info); - } - VLOG(1) << "Added device \"" << name << "\" with id \"" << info.id << "\"."; - } - - if(!display_devices.empty()) - devices.insert(devices.end(), display_devices.begin(), display_devices.end()); + CUresult result = device_cuda_safe_init(); + if (result != CUDA_SUCCESS) { + if (result != CUDA_ERROR_NO_DEVICE) + fprintf(stderr, "CUDA cuInit: %s\n", cuewErrorString(result)); + return; + } + + int count = 0; + result = cuDeviceGetCount(&count); + if (result != CUDA_SUCCESS) { + fprintf(stderr, "CUDA cuDeviceGetCount: %s\n", cuewErrorString(result)); + return; + } + + vector<DeviceInfo> display_devices; + + for (int num = 0; num < count; num++) { + char name[256]; + + result = cuDeviceGetName(name, 256, num); + if (result != CUDA_SUCCESS) { + fprintf(stderr, "CUDA cuDeviceGetName: %s\n", cuewErrorString(result)); + continue; + } + + int major; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, num); + if (major < 3) { + VLOG(1) << "Ignoring device \"" << name << "\", this graphics card is no longer supported."; + continue; + } + + DeviceInfo info; + + info.type = DEVICE_CUDA; + info.description = string(name); + info.num = num; + + info.has_half_images = (major >= 3); + info.has_volume_decoupled = false; + + int pci_location[3] = {0, 0, 0}; + cuDeviceGetAttribute(&pci_location[0], CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, num); + cuDeviceGetAttribute(&pci_location[1], CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, num); + cuDeviceGetAttribute(&pci_location[2], CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, num); + info.id = string_printf("CUDA_%s_%04x:%02x:%02x", + name, + (unsigned int)pci_location[0], + (unsigned int)pci_location[1], + (unsigned int)pci_location[2]); + + /* If device has a kernel timeout and no compute preemption, we assume + * it is connected to a display and will freeze the display while doing + * computations. 
*/ + int timeout_attr = 0, preempt_attr = 0; + cuDeviceGetAttribute(&timeout_attr, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, num); + cuDeviceGetAttribute(&preempt_attr, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, num); + + if (timeout_attr && !preempt_attr) { + VLOG(1) << "Device is recognized as display."; + info.description += " (Display)"; + info.display_device = true; + display_devices.push_back(info); + } + else { + devices.push_back(info); + } + VLOG(1) << "Added device \"" << name << "\" with id \"" << info.id << "\"."; + } + + if (!display_devices.empty()) + devices.insert(devices.end(), display_devices.begin(), display_devices.end()); } string device_cuda_capabilities() { - CUresult result = device_cuda_safe_init(); - if(result != CUDA_SUCCESS) { - if(result != CUDA_ERROR_NO_DEVICE) { - return string("Error initializing CUDA: ") + cuewErrorString(result); - } - return "No CUDA device found\n"; - } - - int count; - result = cuDeviceGetCount(&count); - if(result != CUDA_SUCCESS) { - return string("Error getting devices: ") + cuewErrorString(result); - } - - string capabilities = ""; - for(int num = 0; num < count; num++) { - char name[256]; - if(cuDeviceGetName(name, 256, num) != CUDA_SUCCESS) { - continue; - } - capabilities += string("\t") + name + "\n"; - int value; + CUresult result = device_cuda_safe_init(); + if (result != CUDA_SUCCESS) { + if (result != CUDA_ERROR_NO_DEVICE) { + return string("Error initializing CUDA: ") + cuewErrorString(result); + } + return "No CUDA device found\n"; + } + + int count; + result = cuDeviceGetCount(&count); + if (result != CUDA_SUCCESS) { + return string("Error getting devices: ") + cuewErrorString(result); + } + + string capabilities = ""; + for (int num = 0; num < count; num++) { + char name[256]; + if (cuDeviceGetName(name, 256, num) != CUDA_SUCCESS) { + continue; + } + capabilities += string("\t") + name + "\n"; + int value; #define GET_ATTR(attr) \ - { \ - if(cuDeviceGetAttribute(&value, \ - CU_DEVICE_ATTRIBUTE_##attr, \ - num) == CUDA_SUCCESS) \ - { \ - capabilities += string_printf("\t\tCU_DEVICE_ATTRIBUTE_" #attr "\t\t\t%d\n", \ - value); \ - } \ - } (void) 0 - /* TODO(sergey): Strip all attributes which are not useful for us - * or does not depend on the driver. 
- */ - GET_ATTR(MAX_THREADS_PER_BLOCK); - GET_ATTR(MAX_BLOCK_DIM_X); - GET_ATTR(MAX_BLOCK_DIM_Y); - GET_ATTR(MAX_BLOCK_DIM_Z); - GET_ATTR(MAX_GRID_DIM_X); - GET_ATTR(MAX_GRID_DIM_Y); - GET_ATTR(MAX_GRID_DIM_Z); - GET_ATTR(MAX_SHARED_MEMORY_PER_BLOCK); - GET_ATTR(SHARED_MEMORY_PER_BLOCK); - GET_ATTR(TOTAL_CONSTANT_MEMORY); - GET_ATTR(WARP_SIZE); - GET_ATTR(MAX_PITCH); - GET_ATTR(MAX_REGISTERS_PER_BLOCK); - GET_ATTR(REGISTERS_PER_BLOCK); - GET_ATTR(CLOCK_RATE); - GET_ATTR(TEXTURE_ALIGNMENT); - GET_ATTR(GPU_OVERLAP); - GET_ATTR(MULTIPROCESSOR_COUNT); - GET_ATTR(KERNEL_EXEC_TIMEOUT); - GET_ATTR(INTEGRATED); - GET_ATTR(CAN_MAP_HOST_MEMORY); - GET_ATTR(COMPUTE_MODE); - GET_ATTR(MAXIMUM_TEXTURE1D_WIDTH); - GET_ATTR(MAXIMUM_TEXTURE2D_WIDTH); - GET_ATTR(MAXIMUM_TEXTURE2D_HEIGHT); - GET_ATTR(MAXIMUM_TEXTURE3D_WIDTH); - GET_ATTR(MAXIMUM_TEXTURE3D_HEIGHT); - GET_ATTR(MAXIMUM_TEXTURE3D_DEPTH); - GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_WIDTH); - GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_HEIGHT); - GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_LAYERS); - GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_WIDTH); - GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_HEIGHT); - GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES); - GET_ATTR(SURFACE_ALIGNMENT); - GET_ATTR(CONCURRENT_KERNELS); - GET_ATTR(ECC_ENABLED); - GET_ATTR(TCC_DRIVER); - GET_ATTR(MEMORY_CLOCK_RATE); - GET_ATTR(GLOBAL_MEMORY_BUS_WIDTH); - GET_ATTR(L2_CACHE_SIZE); - GET_ATTR(MAX_THREADS_PER_MULTIPROCESSOR); - GET_ATTR(ASYNC_ENGINE_COUNT); - GET_ATTR(UNIFIED_ADDRESSING); - GET_ATTR(MAXIMUM_TEXTURE1D_LAYERED_WIDTH); - GET_ATTR(MAXIMUM_TEXTURE1D_LAYERED_LAYERS); - GET_ATTR(CAN_TEX2D_GATHER); - GET_ATTR(MAXIMUM_TEXTURE2D_GATHER_WIDTH); - GET_ATTR(MAXIMUM_TEXTURE2D_GATHER_HEIGHT); - GET_ATTR(MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE); - GET_ATTR(MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE); - GET_ATTR(MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE); - GET_ATTR(TEXTURE_PITCH_ALIGNMENT); - GET_ATTR(MAXIMUM_TEXTURECUBEMAP_WIDTH); - GET_ATTR(MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH); - GET_ATTR(MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS); - GET_ATTR(MAXIMUM_SURFACE1D_WIDTH); - GET_ATTR(MAXIMUM_SURFACE2D_WIDTH); - GET_ATTR(MAXIMUM_SURFACE2D_HEIGHT); - GET_ATTR(MAXIMUM_SURFACE3D_WIDTH); - GET_ATTR(MAXIMUM_SURFACE3D_HEIGHT); - GET_ATTR(MAXIMUM_SURFACE3D_DEPTH); - GET_ATTR(MAXIMUM_SURFACE1D_LAYERED_WIDTH); - GET_ATTR(MAXIMUM_SURFACE1D_LAYERED_LAYERS); - GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_WIDTH); - GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_HEIGHT); - GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_LAYERS); - GET_ATTR(MAXIMUM_SURFACECUBEMAP_WIDTH); - GET_ATTR(MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH); - GET_ATTR(MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS); - GET_ATTR(MAXIMUM_TEXTURE1D_LINEAR_WIDTH); - GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_WIDTH); - GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_HEIGHT); - GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_PITCH); - GET_ATTR(MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH); - GET_ATTR(MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT); - GET_ATTR(COMPUTE_CAPABILITY_MAJOR); - GET_ATTR(COMPUTE_CAPABILITY_MINOR); - GET_ATTR(MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH); - GET_ATTR(STREAM_PRIORITIES_SUPPORTED); - GET_ATTR(GLOBAL_L1_CACHE_SUPPORTED); - GET_ATTR(LOCAL_L1_CACHE_SUPPORTED); - GET_ATTR(MAX_SHARED_MEMORY_PER_MULTIPROCESSOR); - GET_ATTR(MAX_REGISTERS_PER_MULTIPROCESSOR); - GET_ATTR(MANAGED_MEMORY); - GET_ATTR(MULTI_GPU_BOARD); - GET_ATTR(MULTI_GPU_BOARD_GROUP_ID); + { \ + if (cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_##attr, num) == CUDA_SUCCESS) { \ + capabilities += string_printf("\t\tCU_DEVICE_ATTRIBUTE_" #attr "\t\t\t%d\n", value); \ + } \ + } \ + (void)0 + /* TODO(sergey): Strip all 
attributes which are not useful for us + * or do not depend on the driver. + */ + GET_ATTR(MAX_THREADS_PER_BLOCK); + GET_ATTR(MAX_BLOCK_DIM_X); + GET_ATTR(MAX_BLOCK_DIM_Y); + GET_ATTR(MAX_BLOCK_DIM_Z); + GET_ATTR(MAX_GRID_DIM_X); + GET_ATTR(MAX_GRID_DIM_Y); + GET_ATTR(MAX_GRID_DIM_Z); + GET_ATTR(MAX_SHARED_MEMORY_PER_BLOCK); + GET_ATTR(SHARED_MEMORY_PER_BLOCK); + GET_ATTR(TOTAL_CONSTANT_MEMORY); + GET_ATTR(WARP_SIZE); + GET_ATTR(MAX_PITCH); + GET_ATTR(MAX_REGISTERS_PER_BLOCK); + GET_ATTR(REGISTERS_PER_BLOCK); + GET_ATTR(CLOCK_RATE); + GET_ATTR(TEXTURE_ALIGNMENT); + GET_ATTR(GPU_OVERLAP); + GET_ATTR(MULTIPROCESSOR_COUNT); + GET_ATTR(KERNEL_EXEC_TIMEOUT); + GET_ATTR(INTEGRATED); + GET_ATTR(CAN_MAP_HOST_MEMORY); + GET_ATTR(COMPUTE_MODE); + GET_ATTR(MAXIMUM_TEXTURE1D_WIDTH); + GET_ATTR(MAXIMUM_TEXTURE2D_WIDTH); + GET_ATTR(MAXIMUM_TEXTURE2D_HEIGHT); + GET_ATTR(MAXIMUM_TEXTURE3D_WIDTH); + GET_ATTR(MAXIMUM_TEXTURE3D_HEIGHT); + GET_ATTR(MAXIMUM_TEXTURE3D_DEPTH); + GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_WIDTH); + GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_HEIGHT); + GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_LAYERS); + GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_WIDTH); + GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_HEIGHT); + GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES); + GET_ATTR(SURFACE_ALIGNMENT); + GET_ATTR(CONCURRENT_KERNELS); + GET_ATTR(ECC_ENABLED); + GET_ATTR(TCC_DRIVER); + GET_ATTR(MEMORY_CLOCK_RATE); + GET_ATTR(GLOBAL_MEMORY_BUS_WIDTH); + GET_ATTR(L2_CACHE_SIZE); + GET_ATTR(MAX_THREADS_PER_MULTIPROCESSOR); + GET_ATTR(ASYNC_ENGINE_COUNT); + GET_ATTR(UNIFIED_ADDRESSING); + GET_ATTR(MAXIMUM_TEXTURE1D_LAYERED_WIDTH); + GET_ATTR(MAXIMUM_TEXTURE1D_LAYERED_LAYERS); + GET_ATTR(CAN_TEX2D_GATHER); + GET_ATTR(MAXIMUM_TEXTURE2D_GATHER_WIDTH); + GET_ATTR(MAXIMUM_TEXTURE2D_GATHER_HEIGHT); + GET_ATTR(MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE); + GET_ATTR(MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE); + GET_ATTR(MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE); + GET_ATTR(TEXTURE_PITCH_ALIGNMENT); + GET_ATTR(MAXIMUM_TEXTURECUBEMAP_WIDTH); + GET_ATTR(MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH); + GET_ATTR(MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS); + GET_ATTR(MAXIMUM_SURFACE1D_WIDTH); + GET_ATTR(MAXIMUM_SURFACE2D_WIDTH); + GET_ATTR(MAXIMUM_SURFACE2D_HEIGHT); + GET_ATTR(MAXIMUM_SURFACE3D_WIDTH); + GET_ATTR(MAXIMUM_SURFACE3D_HEIGHT); + GET_ATTR(MAXIMUM_SURFACE3D_DEPTH); + GET_ATTR(MAXIMUM_SURFACE1D_LAYERED_WIDTH); + GET_ATTR(MAXIMUM_SURFACE1D_LAYERED_LAYERS); + GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_WIDTH); + GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_HEIGHT); + GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_LAYERS); + GET_ATTR(MAXIMUM_SURFACECUBEMAP_WIDTH); + GET_ATTR(MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH); + GET_ATTR(MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS); + GET_ATTR(MAXIMUM_TEXTURE1D_LINEAR_WIDTH); + GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_WIDTH); + GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_HEIGHT); + GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_PITCH); + GET_ATTR(MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH); + GET_ATTR(MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT); + GET_ATTR(COMPUTE_CAPABILITY_MAJOR); + GET_ATTR(COMPUTE_CAPABILITY_MINOR); + GET_ATTR(MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH); + GET_ATTR(STREAM_PRIORITIES_SUPPORTED); + GET_ATTR(GLOBAL_L1_CACHE_SUPPORTED); + GET_ATTR(LOCAL_L1_CACHE_SUPPORTED); + GET_ATTR(MAX_SHARED_MEMORY_PER_MULTIPROCESSOR); + GET_ATTR(MAX_REGISTERS_PER_MULTIPROCESSOR); + GET_ATTR(MANAGED_MEMORY); + GET_ATTR(MULTI_GPU_BOARD); + GET_ATTR(MULTI_GPU_BOARD_GROUP_ID); #undef GET_ATTR - capabilities += "\n"; - } + capabilities += "\n"; + } - return capabilities; + return capabilities; } CCL_NAMESPACE_END
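
The work sizing at the top of CUDADevice::path_trace() in the diff can be read as a small standalone helper. The sketch below is not the Cycles code itself: pick_step_samples() and its tile_w/tile_h parameters are hypothetical names, and a current CUDA context plus a kernel loaded via cuModuleGetFunction() are assumed.

  #include <cuda.h>

  /* Minimal sketch of the occupancy-driven sample stepping above. Returns
   * how many samples to fold into one launch so a tile of tile_w x tile_h
   * pixels keeps the whole device busy. */
  static int pick_step_samples(CUfunction func, int tile_w, int tile_h, bool display_device)
  {
    int min_blocks = 0;
    int threads_per_block = 0;

    /* The driver suggests the block size with maximum occupancy, plus the
     * smallest grid that saturates the device at that block size. */
    cuOccupancyMaxPotentialBlockSize(&min_blocks, &threads_per_block, func, NULL, 0, 0);

    /* Headless devices have no watchdog timer, so aim for a launch roughly
     * 8x larger, as the device code above does for non-display GPUs. */
    if (!display_device) {
      min_blocks *= 8;
    }

    /* divide_up(saturating thread count, pixels in the tile). */
    const int num_pixels = tile_w * tile_h;
    return (min_blocks * threads_per_block + num_pixels - 1) / num_pixels;
  }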
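
film_convert() derives a square block shape instead of asking the occupancy API: the square root of the kernel's maximum threads per block gives the block edge, and divide_up covers the image per axis. A hedged standalone sketch, with launch_image_kernel() as a hypothetical name and func/args assumed valid:

  #include <math.h>
  #include <cuda.h>

  /* Sketch of the square 2D launch shape used by film_convert() above. */
  static CUresult launch_image_kernel(CUfunction func, int w, int h, void *args[])
  {
    int threads_per_block = 0;
    cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func);

    const int xthreads = (int)sqrt(threads_per_block); /* e.g. 1024 -> 32 */
    const int ythreads = xthreads;
    const int xblocks = (w + xthreads - 1) / xthreads;
    const int yblocks = (h + ythreads - 1) / ythreads;

    cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1);
    return cuLaunchKernel(func, xblocks, yblocks, 1, xthreads, ythreads, 1, 0, NULL, args, NULL);
  }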
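
CUDAContextScope, constructed at the top of nearly every method in the diff, is a plain RAII push/pop pair. A minimal sketch of the same pattern, under the name ScopedContext to make clear it is illustrative rather than the Cycles class:

  #include <cuda.h>

  /* Push the context on construction, pop on destruction, so every early
   * return in the enclosing function leaves the context stack balanced. */
  class ScopedContext {
   public:
    explicit ScopedContext(CUcontext context)
    {
      cuCtxPushCurrent(context);
    }
    ~ScopedContext()
    {
      cuCtxPopCurrent(NULL);
    }

    /* Copying would pop the context twice; forbid it. */
    ScopedContext(const ScopedContext &) = delete;
    ScopedContext &operator=(const ScopedContext &) = delete;
  };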
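
The cuda_assert() macro redefined mid-diff follows a common driver-API pattern: evaluate the call once, record the first failure, print it, and keep going rather than abort. CU_CHECK below is a hypothetical stand-in with the same shape, minus the device->error_msg bookkeeping that ties the real macro to CUDADevice:

  #include <stdio.h>
  #include <cuda.h>

  /* Evaluate a driver call once and report failures; the trailing (void)0
   * makes the macro demand a semicolon like a normal statement. */
  #define CU_CHECK(stmt) \
    { \
      CUresult result_ = (stmt); \
      if (result_ != CUDA_SUCCESS) { \
        fprintf(stderr, "CUDA error: %d in %s\n", (int)result_, #stmt); \
      } \
    } \
    (void)0

  /* Usage:
   *   CU_CHECK(cuCtxSynchronize());
   */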
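
Finally, CUDASplitKernel::split_kernel_global_size() sizes the split-kernel grid from free VRAM: budget half of it, convert bytes to path-state elements, then shape the count into a warp-friendly 2D grid. In this assumed sketch, pick_global_size() and state_size_per_element are hypothetical, the latter standing in for what max_elements_for_max_buffer_size() derives from the kernel; the zero-size fallback is purely defensive and not in the original:

  #include <math.h>
  #include <cuda.h>

  /* Shape a free-memory budget into a (x, y) global work size. */
  static void pick_global_size(size_t state_size_per_element, int *out_x, int *out_y)
  {
    size_t free_mem = 0, total_mem = 0;
    cuMemGetInfo(&free_mem, &total_mem);

    /* Budget half of free device memory for split-kernel state. */
    const size_t num_elements = (free_mem / 2) / state_size_per_element;

    /* x rounded down to the 32-wide warp, y rounded down to 16,
     * mirroring the round_down() calls above. */
    size_t side = (size_t)sqrt((double)num_elements);
    size_t x = side - side % 32;
    if (x == 0) {
      x = 32; /* Fallback for unrealistically small budgets; sketch only. */
    }
    const size_t y = (num_elements / x) - (num_elements / x) % 16;

    *out_x = (int)x;
    *out_y = (int)y;
  }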