-rw-r--r-- | intern/cycles/device/CMakeLists.txt | 14
-rw-r--r-- | intern/cycles/device/cuda/device_cuda.h | 248
-rw-r--r-- | intern/cycles/device/cuda/device_cuda_impl.cpp | 2502
-rw-r--r-- | intern/cycles/device/device_cuda.cpp | 2571
-rw-r--r-- | intern/cycles/device/device_opencl.cpp | 4
-rw-r--r-- | intern/cycles/device/device_optix.cpp | 1273
-rw-r--r-- | intern/cycles/device/opencl/device_opencl.h (renamed from intern/cycles/device/opencl/opencl.h) | 0
-rw-r--r-- | intern/cycles/device/opencl/device_opencl_impl.cpp (renamed from intern/cycles/device/opencl/opencl_split.cpp) | 2
-rw-r--r-- | intern/cycles/device/opencl/memory_manager.cpp | 2
-rw-r--r-- | intern/cycles/device/opencl/opencl_util.cpp | 2
10 files changed, 2829 insertions, 3789 deletions
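
Throughout the new device_cuda_impl.cpp below, sequences of CUDA driver-API calls are bracketed by a CUDAContextScope, declared in device_cuda.h and described in the diff only as a "utility to push/pop CUDA context"; its definition is not part of the hunks shown here. A minimal sketch of the push/pop pattern such a scope presumably implements, using only the CUDA driver API (the class and member names here are illustrative, not the commit's):

#include <cuda.h>

/* RAII sketch: make a context current on construction and restore the
 * previous one on destruction, so that driver calls in between target
 * the intended device even when several CUDA devices are in use. */
class ScopedCUDAContext {
 public:
  explicit ScopedCUDAContext(CUcontext context)
  {
    cuCtxPushCurrent(context); /* Becomes current for this CPU thread. */
  }
  ~ScopedCUDAContext()
  {
    cuCtxPopCurrent(NULL); /* Restore whatever context was current before. */
  }
};

This matches how the implementation below uses the real class, e.g. "CUDAContextScope scope(this);" at the top of each method that touches the driver API, and why the CUDADevice constructor ends with cuCtxPopCurrent(NULL) after cuCtxCreate.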
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt index 35a79356957..aa5b65a2b73 100644 --- a/intern/cycles/device/CMakeLists.txt +++ b/intern/cycles/device/CMakeLists.txt @@ -34,13 +34,17 @@ set(SRC device_task.cpp ) +set(SRC_CUDA + cuda/device_cuda.h + cuda/device_cuda_impl.cpp +) + set(SRC_OPENCL - opencl/opencl.h + opencl/device_opencl.h + opencl/device_opencl_impl.cpp opencl/memory_manager.h - - opencl/opencl_split.cpp - opencl/opencl_util.cpp opencl/memory_manager.cpp + opencl/opencl_util.cpp ) if(WITH_CYCLES_NETWORK) @@ -98,4 +102,4 @@ endif() include_directories(${INC}) include_directories(SYSTEM ${INC_SYS}) -cycles_add_library(cycles_device "${LIB}" ${SRC} ${SRC_OPENCL} ${SRC_HEADERS}) +cycles_add_library(cycles_device "${LIB}" ${SRC} ${SRC_CUDA} ${SRC_OPENCL} ${SRC_HEADERS}) diff --git a/intern/cycles/device/cuda/device_cuda.h b/intern/cycles/device/cuda/device_cuda.h new file mode 100644 index 00000000000..3d29d13a781 --- /dev/null +++ b/intern/cycles/device/cuda/device_cuda.h @@ -0,0 +1,248 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_CUDA + +# include "device/device.h" +# include "device/device_denoising.h" +# include "device/device_split_kernel.h" + +# include "util/util_map.h" + +# ifdef WITH_CUDA_DYNLOAD +# include "cuew.h" +# else +# include "util/util_opengl.h" +# include <cuda.h> +# include <cudaGL.h> +# endif + +CCL_NAMESPACE_BEGIN + +class CUDADevice : public Device { + + friend class CUDASplitKernelFunction; + friend class CUDASplitKernel; + friend class CUDAContextScope; + + public: + DedicatedTaskPool task_pool; + CUdevice cuDevice; + CUcontext cuContext; + CUmodule cuModule, cuFilterModule; + size_t device_texture_headroom; + size_t device_working_headroom; + bool move_texture_to_host; + size_t map_host_used; + size_t map_host_limit; + int can_map_host; + int cuDevId; + int cuDevArchitecture; + bool first_error; + CUDASplitKernel *split_kernel; + + struct CUDAMem { + CUDAMem() : texobject(0), array(0), use_mapped_host(false) + { + } + + CUtexObject texobject; + CUarray array; + + /* If true, a mapped host memory in shared_pointer is being used. 
*/ + bool use_mapped_host; + }; + typedef map<device_memory *, CUDAMem> CUDAMemMap; + CUDAMemMap cuda_mem_map; + + struct PixelMem { + GLuint cuPBO; + CUgraphicsResource cuPBOresource; + GLuint cuTexId; + int w, h; + }; + map<device_ptr, PixelMem> pixel_mem_map; + + /* Bindless Textures */ + device_vector<TextureInfo> texture_info; + bool need_texture_info; + + CUdeviceptr cuda_device_ptr(device_ptr mem) + { + return (CUdeviceptr)mem; + } + + static bool have_precompiled_kernels(); + + virtual bool show_samples() const; + + virtual BVHLayoutMask get_bvh_layout_mask() const; + + void cuda_error_documentation(); + + bool cuda_error_(CUresult result, const string &stmt); + + void cuda_error_message(const string &message); + + CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_); + + virtual ~CUDADevice(); + + bool support_device(const DeviceRequestedFeatures & /*requested_features*/); + + bool use_adaptive_compilation(); + + bool use_split_kernel(); + + string compile_kernel_get_common_cflags(const DeviceRequestedFeatures &requested_features, + bool filter = false, + bool split = false); + + bool compile_check_compiler(); + + string compile_kernel(const DeviceRequestedFeatures &requested_features, + bool filter = false, + bool split = false); + + virtual bool load_kernels(const DeviceRequestedFeatures &requested_features); + + void reserve_local_memory(const DeviceRequestedFeatures &requested_features); + + void init_host_memory(); + + void load_texture_info(); + + void move_textures_to_host(size_t size, bool for_texture); + + CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0); + + void generic_copy_to(device_memory &mem); + + void generic_free(device_memory &mem); + + void mem_alloc(device_memory &mem); + + void mem_copy_to(device_memory &mem); + + void mem_copy_from(device_memory &mem, int y, int w, int h, int elem); + + void mem_zero(device_memory &mem); + + void mem_free(device_memory &mem); + + device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/); + + virtual void const_copy_to(const char *name, void *host, size_t size); + + void tex_alloc(device_memory &mem); + + void tex_free(device_memory &mem); + + bool denoising_non_local_means(device_ptr image_ptr, + device_ptr guide_ptr, + device_ptr variance_ptr, + device_ptr out_ptr, + DenoisingTask *task); + + bool denoising_construct_transform(DenoisingTask *task); + + bool denoising_accumulate(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr scale_ptr, + int frame, + DenoisingTask *task); + + bool denoising_solve(device_ptr output_ptr, DenoisingTask *task); + + bool denoising_combine_halves(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr mean_ptr, + device_ptr variance_ptr, + int r, + int4 rect, + DenoisingTask *task); + + bool denoising_divide_shadow(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr sample_variance_ptr, + device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, + DenoisingTask *task); + + bool denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + float scale, + DenoisingTask *task); + + bool denoising_write_feature(int out_offset, + device_ptr from_ptr, + device_ptr buffer_ptr, + DenoisingTask *task); + + bool denoising_detect_outliers(device_ptr image_ptr, + device_ptr variance_ptr, + device_ptr depth_ptr, + device_ptr output_ptr, + DenoisingTask *task); + + void denoise(RenderTile &rtile, DenoisingTask &denoising); + + void path_trace(DeviceTask &task, RenderTile 
&rtile, device_vector<WorkTile> &work_tiles);
+
+  void film_convert(DeviceTask &task,
+                    device_ptr buffer,
+                    device_ptr rgba_byte,
+                    device_ptr rgba_half);
+
+  void shader(DeviceTask &task);
+
+  CUdeviceptr map_pixels(device_ptr mem);
+
+  void unmap_pixels(device_ptr mem);
+
+  void pixels_alloc(device_memory &mem);
+
+  void pixels_copy_from(device_memory &mem, int y, int w, int h);
+
+  void pixels_free(device_memory &mem);
+
+  void draw_pixels(device_memory &mem,
+                   int y,
+                   int w,
+                   int h,
+                   int width,
+                   int height,
+                   int dx,
+                   int dy,
+                   int dw,
+                   int dh,
+                   bool transparent,
+                   const DeviceDrawParams &draw_params);
+
+  void thread_run(DeviceTask *task);
+
+  virtual void task_add(DeviceTask &task);
+
+  virtual void task_wait();
+
+  virtual void task_cancel();
+};
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp
new file mode 100644
index 00000000000..a4e1c026263
--- /dev/null
+++ b/intern/cycles/device/cuda/device_cuda_impl.cpp
@@ -0,0 +1,2502 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+#  include <climits>
+#  include <limits.h>
+#  include <stdio.h>
+#  include <stdlib.h>
+#  include <string.h>
+
+#  include "device/cuda/device_cuda.h"
+#  include "device/device_intern.h"
+#  include "device/device_split_kernel.h"
+
+#  include "render/buffers.h"
+
+#  include "kernel/filter/filter_defines.h"
+
+#  include "util/util_debug.h"
+#  include "util/util_foreach.h"
+#  include "util/util_logging.h"
+#  include "util/util_map.h"
+#  include "util/util_md5.h"
+#  include "util/util_opengl.h"
+#  include "util/util_path.h"
+#  include "util/util_string.h"
+#  include "util/util_system.h"
+#  include "util/util_types.h"
+#  include "util/util_time.h"
+#  include "util/util_windows.h"
+
+#  include "kernel/split/kernel_split_data_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+#  ifndef WITH_CUDA_DYNLOAD
+
+/* Transparently implement some functions, so the majority of the file does not need
+ * to worry about the difference between dynamically loaded and linked CUDA at all.
+ */
+
+namespace {
+
+const char *cuewErrorString(CUresult result)
+{
+  /* We can only give the error code here without major code duplication; that
+   * should be enough, since dynamic loading is only disabled by folks
+   * who know what they're doing anyway.
+   *
+   * NOTE: Avoid call from several threads.
+ */ + static string error; + error = string_printf("%d", result); + return error.c_str(); +} + +const char *cuewCompilerPath() +{ + return CYCLES_CUDA_NVCC_EXECUTABLE; +} + +int cuewCompilerVersion() +{ + return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10); +} + +} /* namespace */ +# endif /* WITH_CUDA_DYNLOAD */ + +class CUDADevice; + +class CUDASplitKernel : public DeviceSplitKernel { + CUDADevice *device; + + public: + explicit CUDASplitKernel(CUDADevice *device); + + virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads); + + virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim, + RenderTile &rtile, + int num_global_elements, + device_memory &kernel_globals, + device_memory &kernel_data_, + device_memory &split_data, + device_memory &ray_state, + device_memory &queue_index, + device_memory &use_queues_flag, + device_memory &work_pool_wgs); + + virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name, + const DeviceRequestedFeatures &); + virtual int2 split_kernel_local_size(); + virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask *task); +}; + +/* Utility to push/pop CUDA context. */ +class CUDAContextScope { + public: + CUDAContextScope(CUDADevice *device); + ~CUDAContextScope(); + + private: + CUDADevice *device; +}; + +bool CUDADevice::have_precompiled_kernels() +{ + string cubins_path = path_get("lib"); + return path_exists(cubins_path); +} + +bool CUDADevice::show_samples() const +{ + /* The CUDADevice only processes one tile at a time, so showing samples is fine. */ + return true; +} + +BVHLayoutMask CUDADevice::get_bvh_layout_mask() const +{ + return BVH_LAYOUT_BVH2; +} + +void CUDADevice::cuda_error_documentation() +{ + if (first_error) { + fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n"); + fprintf(stderr, + "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n"); + first_error = false; + } +} + +# define cuda_assert(stmt) \ + { \ + CUresult result = stmt; \ +\ + if (result != CUDA_SUCCESS) { \ + string message = string_printf( \ + "CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \ + if (error_msg == "") \ + error_msg = message; \ + fprintf(stderr, "%s\n", message.c_str()); \ + /*cuda_abort();*/ \ + cuda_error_documentation(); \ + } \ + } \ + (void)0 + +bool CUDADevice::cuda_error_(CUresult result, const string &stmt) +{ + if (result == CUDA_SUCCESS) + return false; + + string message = string_printf("CUDA error at %s: %s", stmt.c_str(), cuewErrorString(result)); + if (error_msg == "") + error_msg = message; + fprintf(stderr, "%s\n", message.c_str()); + cuda_error_documentation(); + return true; +} + +# define cuda_error(stmt) cuda_error_(stmt, # stmt) + +void CUDADevice::cuda_error_message(const string &message) +{ + if (error_msg == "") + error_msg = message; + fprintf(stderr, "%s\n", message.c_str()); + cuda_error_documentation(); +} + +CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_) + : Device(info, stats, profiler, background_), texture_info(this, "__texture_info", MEM_TEXTURE) +{ + first_error = true; + background = background_; + + cuDevId = info.num; + cuDevice = 0; + cuContext = 0; + + cuModule = 0; + cuFilterModule = 0; + + split_kernel = NULL; + + need_texture_info = false; + + device_texture_headroom = 0; + device_working_headroom = 0; + move_texture_to_host = false; + map_host_limit = 0; + 
map_host_used = 0;
+  can_map_host = 0;
+
+  /* Initialize CUDA. */
+  if (cuda_error(cuInit(0)))
+    return;
+
+  /* Setup device and context. */
+  if (cuda_error(cuDeviceGet(&cuDevice, cuDevId)))
+    return;
+
+  /* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
+   * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
+   * so we can predict which memory to map to host. */
+  cuda_assert(
+      cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
+
+  unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
+  if (can_map_host) {
+    ctx_flags |= CU_CTX_MAP_HOST;
+    init_host_memory();
+  }
+
+  /* Create context. */
+  CUresult result;
+
+  if (background) {
+    result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
+  }
+  else {
+    result = cuGLCtxCreate(&cuContext, ctx_flags, cuDevice);
+
+    if (result != CUDA_SUCCESS) {
+      result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
+      background = true;
+    }
+  }
+
+  if (cuda_error_(result, "cuCtxCreate"))
+    return;
+
+  int major, minor;
+  cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
+  cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
+  cuDevArchitecture = major * 100 + minor * 10;
+
+  /* Pop context set by cuCtxCreate. */
+  cuCtxPopCurrent(NULL);
+}
+
+CUDADevice::~CUDADevice()
+{
+  task_pool.stop();
+
+  delete split_kernel;
+
+  texture_info.free();
+
+  cuda_assert(cuCtxDestroy(cuContext));
+}
+
+bool CUDADevice::support_device(const DeviceRequestedFeatures & /*requested_features*/)
+{
+  int major, minor;
+  cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
+  cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
+
+  /* We only support sm_30 and above */
+  if (major < 3) {
+    cuda_error_message(
+        string_printf("CUDA device supported only with compute capability 3.0 or up, found %d.%d.",
+                      major,
+                      minor));
+    return false;
+  }
+
+  return true;
+}
+
+bool CUDADevice::use_adaptive_compilation()
+{
+  return DebugFlags().cuda.adaptive_compile;
+}
+
+bool CUDADevice::use_split_kernel()
+{
+  return DebugFlags().cuda.split_kernel;
+}
+
+/* Common NVCC flags, which stay the same regardless of shading model or
+ * kernel source md5, and depend only on the compiler and compilation settings.
+ */
+string CUDADevice::compile_kernel_get_common_cflags(
+    const DeviceRequestedFeatures &requested_features, bool filter, bool split)
+{
+  const int machine = system_cpu_bits();
+  const string source_path = path_get("source");
+  const string include_path = source_path;
+  string cflags = string_printf(
+      "-m%d "
+      "--ptxas-options=\"-v\" "
+      "--use_fast_math "
+      "-DNVCC "
+      "-I\"%s\"",
+      machine,
+      include_path.c_str());
+  if (!filter && use_adaptive_compilation()) {
+    cflags += " " + requested_features.get_build_options();
+  }
+  const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
+  if (extra_cflags) {
+    cflags += string(" ") + string(extra_cflags);
+  }
+#  ifdef WITH_CYCLES_DEBUG
+  cflags += " -D__KERNEL_DEBUG__";
+#  endif
+
+  if (split) {
+    cflags += " -D__SPLIT__";
+  }
+
+  return cflags;
+}
+
+bool CUDADevice::compile_check_compiler()
+{
+  const char *nvcc = cuewCompilerPath();
+  if (nvcc == NULL) {
+    cuda_error_message(
+        "CUDA nvcc compiler not found.
" + "Install CUDA toolkit in default location."); + return false; + } + const int cuda_version = cuewCompilerVersion(); + VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << cuda_version << "."; + const int major = cuda_version / 10, minor = cuda_version % 10; + if (cuda_version == 0) { + cuda_error_message("CUDA nvcc compiler version could not be parsed."); + return false; + } + if (cuda_version < 80) { + printf( + "Unsupported CUDA version %d.%d detected, " + "you need CUDA 8.0 or newer.\n", + major, + minor); + return false; + } + else if (cuda_version != 101) { + printf( + "CUDA version %d.%d detected, build may succeed but only " + "CUDA 10.1 is officially supported.\n", + major, + minor); + } + return true; +} + +string CUDADevice::compile_kernel(const DeviceRequestedFeatures &requested_features, + bool filter, + bool split) +{ + const char *name, *source; + if (filter) { + name = "filter"; + source = "filter.cu"; + } + else if (split) { + name = "kernel_split"; + source = "kernel_split.cu"; + } + else { + name = "kernel"; + source = "kernel.cu"; + } + /* Compute cubin name. */ + int major, minor; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); + cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); + + /* Attempt to use kernel provided with Blender. */ + if (!use_adaptive_compilation()) { + const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor)); + VLOG(1) << "Testing for pre-compiled kernel " << cubin << "."; + if (path_exists(cubin)) { + VLOG(1) << "Using precompiled kernel."; + return cubin; + } + const string ptx = path_get(string_printf("lib/%s_compute_%d%d.ptx", name, major, minor)); + VLOG(1) << "Testing for pre-compiled kernel " << ptx << "."; + if (path_exists(ptx)) { + VLOG(1) << "Using precompiled kernel."; + return ptx; + } + } + + const string common_cflags = compile_kernel_get_common_cflags(requested_features, filter, split); + + /* Try to use locally compiled kernel. */ + const string source_path = path_get("source"); + const string kernel_md5 = path_files_md5_hash(source_path); + + /* We include cflags into md5 so changing cuda toolkit or changing other + * compiler command line arguments makes sure cubin gets re-built. + */ + const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags); + + const string cubin_file = string_printf( + "cycles_%s_sm%d%d_%s.cubin", name, major, minor, cubin_md5.c_str()); + const string cubin = path_cache_get(path_join("kernels", cubin_file)); + VLOG(1) << "Testing for locally compiled kernel " << cubin << "."; + if (path_exists(cubin)) { + VLOG(1) << "Using locally compiled kernel."; + return cubin; + } + +# ifdef _WIN32 + if (have_precompiled_kernels()) { + if (major < 3) { + cuda_error_message( + string_printf("CUDA device requires compute capability 3.0 or up, " + "found %d.%d. Your GPU is not supported.", + major, + minor)); + } + else { + cuda_error_message( + string_printf("CUDA binary kernel for this graphics card compute " + "capability (%d.%d) not found.", + major, + minor)); + } + return ""; + } +# endif + + /* Compile. 
*/ + if (!compile_check_compiler()) { + return ""; + } + const char *nvcc = cuewCompilerPath(); + const string kernel = path_join(path_join(source_path, "kernel"), + path_join("kernels", path_join("cuda", source))); + double starttime = time_dt(); + printf("Compiling CUDA kernel ...\n"); + + path_create_directories(cubin); + + string command = string_printf( + "\"%s\" " + "-arch=sm_%d%d " + "--cubin \"%s\" " + "-o \"%s\" " + "%s ", + nvcc, + major, + minor, + kernel.c_str(), + cubin.c_str(), + common_cflags.c_str()); + + printf("%s\n", command.c_str()); + + if (system(command.c_str()) == -1) { + cuda_error_message( + "Failed to execute compilation command, " + "see console for details."); + return ""; + } + + /* Verify if compilation succeeded */ + if (!path_exists(cubin)) { + cuda_error_message( + "CUDA kernel compilation failed, " + "see console for details."); + return ""; + } + + printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime); + + return cubin; +} + +bool CUDADevice::load_kernels(const DeviceRequestedFeatures &requested_features) +{ + /* TODO(sergey): Support kernels re-load for CUDA devices. + * + * Currently re-loading kernel will invalidate memory pointers, + * causing problems in cuCtxSynchronize. + */ + if (cuFilterModule && cuModule) { + VLOG(1) << "Skipping kernel reload, not currently supported."; + return true; + } + + /* check if cuda init succeeded */ + if (cuContext == 0) + return false; + + /* check if GPU is supported */ + if (!support_device(requested_features)) + return false; + + /* get kernel */ + string cubin = compile_kernel(requested_features, false, use_split_kernel()); + if (cubin == "") + return false; + + string filter_cubin = compile_kernel(requested_features, true, false); + if (filter_cubin == "") + return false; + + /* open module */ + CUDAContextScope scope(this); + + string cubin_data; + CUresult result; + + if (path_read_text(cubin, cubin_data)) + result = cuModuleLoadData(&cuModule, cubin_data.c_str()); + else + result = CUDA_ERROR_FILE_NOT_FOUND; + + if (cuda_error_(result, "cuModuleLoad")) + cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str())); + + if (path_read_text(filter_cubin, cubin_data)) + result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str()); + else + result = CUDA_ERROR_FILE_NOT_FOUND; + + if (cuda_error_(result, "cuModuleLoad")) + cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str())); + + if (result == CUDA_SUCCESS) { + reserve_local_memory(requested_features); + } + + return (result == CUDA_SUCCESS); +} + +void CUDADevice::reserve_local_memory(const DeviceRequestedFeatures &requested_features) +{ + if (use_split_kernel()) { + /* Split kernel mostly uses global memory and adaptive compilation, + * difficult to predict how much is needed currently. */ + return; + } + + /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory + * needed for kernel launches, so that we can reliably figure out when + * to allocate scene data in mapped host memory. */ + CUDAContextScope scope(this); + + size_t total = 0, free_before = 0, free_after = 0; + cuMemGetInfo(&free_before, &total); + + /* Get kernel function. 
*/
+  CUfunction cuPathTrace;
+
+  if (requested_features.use_integrator_branched) {
+    cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
+  }
+  else {
+    cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
+  }
+
+  cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
+
+  int min_blocks, num_threads_per_block;
+  cuda_assert(cuOccupancyMaxPotentialBlockSize(
+      &min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0));
+
+  /* Launch kernel; using just one block appears sufficient to reserve
+   * memory for all multiprocessors. It would still be good to do this in
+   * parallel for the multi-GPU case, to make it faster. */
+  CUdeviceptr d_work_tiles = 0;
+  uint total_work_size = 0;
+
+  void *args[] = {&d_work_tiles, &total_work_size};
+
+  cuda_assert(cuLaunchKernel(cuPathTrace, 1, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
+
+  cuda_assert(cuCtxSynchronize());
+
+  cuMemGetInfo(&free_after, &total);
+  VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after)
+          << " bytes. (" << string_human_readable_size(free_before - free_after) << ")";
+
+#  if 0
+  /* For testing mapped host memory, fill up device memory. */
+  const size_t keep_mb = 1024;
+
+  while (free_after > keep_mb * 1024 * 1024LL) {
+    CUdeviceptr tmp;
+    cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
+    cuMemGetInfo(&free_after, &total);
+  }
+#  endif
+}
+
+void CUDADevice::init_host_memory()
+{
+  /* Limit amount of host mapped memory, because allocating too much can
+   * cause system instability. Leave at least half or 4 GB of system
+   * memory free, whichever is smaller. */
+  size_t default_limit = 4 * 1024 * 1024 * 1024LL;
+  size_t system_ram = system_physical_ram();
+
+  if (system_ram > 0) {
+    if (system_ram / 2 > default_limit) {
+      map_host_limit = system_ram - default_limit;
+    }
+    else {
+      map_host_limit = system_ram / 2;
+    }
+  }
+  else {
+    VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
+    map_host_limit = 0;
+  }
+
+  /* Amount of device memory to keep free after texture memory
+   * and working memory allocations, respectively. We set the working
+   * memory limit headroom lower so that some space is left after all
+   * texture memory allocations. */
+  device_working_headroom = 32 * 1024 * 1024LL;   // 32MB
+  device_texture_headroom = 128 * 1024 * 1024LL;  // 128MB
+
+  VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
+          << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
+}
+
+void CUDADevice::load_texture_info()
+{
+  if (need_texture_info) {
+    texture_info.copy_to_device();
+    need_texture_info = false;
+  }
+}
+
+void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
+{
+  /* Signal to reallocate textures in host memory only. */
+  move_texture_to_host = true;
+
+  while (size > 0) {
+    /* Find suitable memory allocation to move. */
+    device_memory *max_mem = NULL;
+    size_t max_size = 0;
+    bool max_is_image = false;
+
+    foreach (CUDAMemMap::value_type &pair, cuda_mem_map) {
+      device_memory &mem = *pair.first;
+      CUDAMem *cmem = &pair.second;
+
+      bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
+      bool is_image = is_texture && (mem.data_height > 1);
+
+      /* Can't move this type of memory. */
+      if (!is_texture || cmem->array) {
+        continue;
+      }
+
+      /* Already in host memory. */
+      if (cmem->use_mapped_host) {
+        continue;
+      }
+
+      /* For other textures, only move image textures.
*/ + if (for_texture && !is_image) { + continue; + } + + /* Try to move largest allocation, prefer moving images. */ + if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) { + max_is_image = is_image; + max_size = mem.device_size; + max_mem = &mem; + } + } + + /* Move to host memory. This part is mutex protected since + * multiple CUDA devices could be moving the memory. The + * first one will do it, and the rest will adopt the pointer. */ + if (max_mem) { + VLOG(1) << "Move memory from device to host: " << max_mem->name; + + static thread_mutex move_mutex; + thread_scoped_lock lock(move_mutex); + + /* Preserve the original device pointer, in case of multi device + * we can't change it because the pointer mapping would break. */ + device_ptr prev_pointer = max_mem->device_pointer; + size_t prev_size = max_mem->device_size; + + tex_free(*max_mem); + tex_alloc(*max_mem); + size = (max_size >= size) ? 0 : size - max_size; + + max_mem->device_pointer = prev_pointer; + max_mem->device_size = prev_size; + } + else { + break; + } + } + + /* Update texture info array with new pointers. */ + load_texture_info(); + + move_texture_to_host = false; +} + +CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding) +{ + CUDAContextScope scope(this); + + CUdeviceptr device_pointer = 0; + size_t size = mem.memory_size() + pitch_padding; + + CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY; + const char *status = ""; + + /* First try allocating in device memory, respecting headroom. We make + * an exception for texture info. It is small and frequently accessed, + * so treat it as working memory. + * + * If there is not enough room for working memory, we will try to move + * textures to host memory, assuming the performance impact would have + * been worse for working memory. */ + bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info); + bool is_image = is_texture && (mem.data_height > 1); + + size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom; + + size_t total = 0, free = 0; + cuMemGetInfo(&free, &total); + + /* Move textures to host memory if needed. */ + if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) { + move_textures_to_host(size + headroom - free, is_texture); + cuMemGetInfo(&free, &total); + } + + /* Allocate in device memory. */ + if (!move_texture_to_host && (size + headroom) < free) { + mem_alloc_result = cuMemAlloc(&device_pointer, size); + if (mem_alloc_result == CUDA_SUCCESS) { + status = " in device memory"; + } + } + + /* Fall back to mapped host memory if needed and possible. */ + + void *shared_pointer = 0; + + if (mem_alloc_result != CUDA_SUCCESS && can_map_host) { + if (mem.shared_pointer) { + /* Another device already allocated host memory. */ + mem_alloc_result = CUDA_SUCCESS; + shared_pointer = mem.shared_pointer; + } + else if (map_host_used + size < map_host_limit) { + /* Allocate host memory ourselves. 
*/ + mem_alloc_result = cuMemHostAlloc( + &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED); + + assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) || + (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0)); + } + + if (mem_alloc_result == CUDA_SUCCESS) { + cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0)); + map_host_used += size; + status = " in host memory"; + } + else { + status = " failed, out of host memory"; + } + } + + if (mem_alloc_result != CUDA_SUCCESS) { + status = " failed, out of device and host memory"; + cuda_assert(mem_alloc_result); + } + + if (mem.name) { + VLOG(1) << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")" << status; + } + + mem.device_pointer = (device_ptr)device_pointer; + mem.device_size = size; + stats.mem_alloc(size); + + if (!mem.device_pointer) { + return NULL; + } + + /* Insert into map of allocations. */ + CUDAMem *cmem = &cuda_mem_map[&mem]; + if (shared_pointer != 0) { + /* Replace host pointer with our host allocation. Only works if + * CUDA memory layout is the same and has no pitch padding. Also + * does not work if we move textures to host during a render, + * since other devices might be using the memory. */ + + if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer && + mem.host_pointer != shared_pointer) { + memcpy(shared_pointer, mem.host_pointer, size); + + /* A Call to device_memory::host_free() should be preceded by + * a call to device_memory::device_free() for host memory + * allocated by a device to be handled properly. Two exceptions + * are here and a call in OptiXDevice::generic_alloc(), where + * the current host memory can be assumed to be allocated by + * device_memory::host_alloc(), not by a device */ + + mem.host_free(); + mem.host_pointer = shared_pointer; + } + mem.shared_pointer = shared_pointer; + mem.shared_counter++; + cmem->use_mapped_host = true; + } + else { + cmem->use_mapped_host = false; + } + + return cmem; +} + +void CUDADevice::generic_copy_to(device_memory &mem) +{ + if (mem.host_pointer && mem.device_pointer) { + CUDAContextScope scope(this); + + /* If use_mapped_host of mem is false, the current device only + * uses device memory allocated by cuMemAlloc regardless of + * mem.host_pointer and mem.shared_pointer, and should copy + * data from mem.host_pointer. */ + + if (cuda_mem_map[&mem].use_mapped_host == false || mem.host_pointer != mem.shared_pointer) { + cuda_assert( + cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), mem.host_pointer, mem.memory_size())); + } + } +} + +void CUDADevice::generic_free(device_memory &mem) +{ + if (mem.device_pointer) { + CUDAContextScope scope(this); + const CUDAMem &cmem = cuda_mem_map[&mem]; + + /* If cmem.use_mapped_host is true, reference counting is used + * to safely free a mapped host memory. */ + + if (cmem.use_mapped_host) { + assert(mem.shared_pointer); + if (mem.shared_pointer) { + assert(mem.shared_counter > 0); + if (--mem.shared_counter == 0) { + if (mem.host_pointer == mem.shared_pointer) { + mem.host_pointer = 0; + } + cuMemFreeHost(mem.shared_pointer); + mem.shared_pointer = 0; + } + } + map_host_used -= mem.device_size; + } + else { + /* Free device memory. 
*/ + cuMemFree(mem.device_pointer); + } + + stats.mem_free(mem.device_size); + mem.device_pointer = 0; + mem.device_size = 0; + + cuda_mem_map.erase(cuda_mem_map.find(&mem)); + } +} + +void CUDADevice::mem_alloc(device_memory &mem) +{ + if (mem.type == MEM_PIXELS && !background) { + pixels_alloc(mem); + } + else if (mem.type == MEM_TEXTURE) { + assert(!"mem_alloc not supported for textures."); + } + else { + generic_alloc(mem); + } +} + +void CUDADevice::mem_copy_to(device_memory &mem) +{ + if (mem.type == MEM_PIXELS) { + assert(!"mem_copy_to not supported for pixels."); + } + else if (mem.type == MEM_TEXTURE) { + tex_free(mem); + tex_alloc(mem); + } + else { + if (!mem.device_pointer) { + generic_alloc(mem); + } + + generic_copy_to(mem); + } +} + +void CUDADevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem) +{ + if (mem.type == MEM_PIXELS && !background) { + pixels_copy_from(mem, y, w, h); + } + else if (mem.type == MEM_TEXTURE) { + assert(!"mem_copy_from not supported for textures."); + } + else if (mem.host_pointer) { + const size_t size = elem * w * h; + const size_t offset = elem * y * w; + + if (mem.device_pointer) { + const CUDAContextScope scope(this); + cuda_assert(cuMemcpyDtoH( + (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size)); + } + else { + memset((char *)mem.host_pointer + offset, 0, size); + } + } +} + +void CUDADevice::mem_zero(device_memory &mem) +{ + if (!mem.device_pointer) { + mem_alloc(mem); + } + if (!mem.device_pointer) { + return; + } + + /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory + * regardless of mem.host_pointer and mem.shared_pointer. */ + if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) { + const CUDAContextScope scope(this); + cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size())); + } + else if (mem.host_pointer) { + memset(mem.host_pointer, 0, mem.memory_size()); + } +} + +void CUDADevice::mem_free(device_memory &mem) +{ + if (mem.type == MEM_PIXELS && !background) { + pixels_free(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free(mem); + } + else { + generic_free(mem); + } +} + +device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) +{ + return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset)); +} + +void CUDADevice::const_copy_to(const char *name, void *host, size_t size) +{ + CUDAContextScope scope(this); + CUdeviceptr mem; + size_t bytes; + + cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name)); + // assert(bytes == size); + cuda_assert(cuMemcpyHtoD(mem, host, size)); +} + +void CUDADevice::tex_alloc(device_memory &mem) +{ + CUDAContextScope scope(this); + + /* General variables for both architectures */ + string bind_name = mem.name; + size_t dsize = datatype_size(mem.data_type); + size_t size = mem.memory_size(); + + CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP; + switch (mem.extension) { + case EXTENSION_REPEAT: + address_mode = CU_TR_ADDRESS_MODE_WRAP; + break; + case EXTENSION_EXTEND: + address_mode = CU_TR_ADDRESS_MODE_CLAMP; + break; + case EXTENSION_CLIP: + address_mode = CU_TR_ADDRESS_MODE_BORDER; + break; + default: + assert(0); + break; + } + + CUfilter_mode filter_mode; + if (mem.interpolation == INTERPOLATION_CLOSEST) { + filter_mode = CU_TR_FILTER_MODE_POINT; + } + else { + filter_mode = CU_TR_FILTER_MODE_LINEAR; + } + + /* Data Storage */ + if (mem.interpolation == INTERPOLATION_NONE) { + 
generic_alloc(mem); + generic_copy_to(mem); + + const_copy_to(bind_name.c_str(), &mem.device_pointer, sizeof(mem.device_pointer)); + return; + } + + /* Image Texture Storage */ + CUarray_format_enum format; + switch (mem.data_type) { + case TYPE_UCHAR: + format = CU_AD_FORMAT_UNSIGNED_INT8; + break; + case TYPE_UINT16: + format = CU_AD_FORMAT_UNSIGNED_INT16; + break; + case TYPE_UINT: + format = CU_AD_FORMAT_UNSIGNED_INT32; + break; + case TYPE_INT: + format = CU_AD_FORMAT_SIGNED_INT32; + break; + case TYPE_FLOAT: + format = CU_AD_FORMAT_FLOAT; + break; + case TYPE_HALF: + format = CU_AD_FORMAT_HALF; + break; + default: + assert(0); + return; + } + + CUDAMem *cmem = NULL; + CUarray array_3d = NULL; + size_t src_pitch = mem.data_width * dsize * mem.data_elements; + size_t dst_pitch = src_pitch; + + if (mem.data_depth > 1) { + /* 3D texture using array, there is no API for linear memory. */ + CUDA_ARRAY3D_DESCRIPTOR desc; + + desc.Width = mem.data_width; + desc.Height = mem.data_height; + desc.Depth = mem.data_depth; + desc.Format = format; + desc.NumChannels = mem.data_elements; + desc.Flags = 0; + + VLOG(1) << "Array 3D allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + + cuda_assert(cuArray3DCreate(&array_3d, &desc)); + + if (!array_3d) { + return; + } + + CUDA_MEMCPY3D param; + memset(¶m, 0, sizeof(param)); + param.dstMemoryType = CU_MEMORYTYPE_ARRAY; + param.dstArray = array_3d; + param.srcMemoryType = CU_MEMORYTYPE_HOST; + param.srcHost = mem.host_pointer; + param.srcPitch = src_pitch; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + param.Depth = mem.data_depth; + + cuda_assert(cuMemcpy3D(¶m)); + + mem.device_pointer = (device_ptr)array_3d; + mem.device_size = size; + stats.mem_alloc(size); + + cmem = &cuda_mem_map[&mem]; + cmem->texobject = 0; + cmem->array = array_3d; + } + else if (mem.data_height > 0) { + /* 2D texture, using pitch aligned linear memory. */ + int alignment = 0; + cuda_assert( + cuDeviceGetAttribute(&alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice)); + dst_pitch = align_up(src_pitch, alignment); + size_t dst_size = dst_pitch * mem.data_height; + + cmem = generic_alloc(mem, dst_size - mem.memory_size()); + if (!cmem) { + return; + } + + CUDA_MEMCPY2D param; + memset(¶m, 0, sizeof(param)); + param.dstMemoryType = CU_MEMORYTYPE_DEVICE; + param.dstDevice = mem.device_pointer; + param.dstPitch = dst_pitch; + param.srcMemoryType = CU_MEMORYTYPE_HOST; + param.srcHost = mem.host_pointer; + param.srcPitch = src_pitch; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + + cuda_assert(cuMemcpy2DUnaligned(¶m)); + } + else { + /* 1D texture, using linear memory. */ + cmem = generic_alloc(mem); + if (!cmem) { + return; + } + + cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size)); + } + + /* Kepler+, bindless textures. 
*/ + int flat_slot = 0; + if (string_startswith(mem.name, "__tex_image")) { + int pos = string(mem.name).rfind("_"); + flat_slot = atoi(mem.name + pos + 1); + } + else { + assert(0); + } + + CUDA_RESOURCE_DESC resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + + if (array_3d) { + resDesc.resType = CU_RESOURCE_TYPE_ARRAY; + resDesc.res.array.hArray = array_3d; + resDesc.flags = 0; + } + else if (mem.data_height > 0) { + resDesc.resType = CU_RESOURCE_TYPE_PITCH2D; + resDesc.res.pitch2D.devPtr = mem.device_pointer; + resDesc.res.pitch2D.format = format; + resDesc.res.pitch2D.numChannels = mem.data_elements; + resDesc.res.pitch2D.height = mem.data_height; + resDesc.res.pitch2D.width = mem.data_width; + resDesc.res.pitch2D.pitchInBytes = dst_pitch; + } + else { + resDesc.resType = CU_RESOURCE_TYPE_LINEAR; + resDesc.res.linear.devPtr = mem.device_pointer; + resDesc.res.linear.format = format; + resDesc.res.linear.numChannels = mem.data_elements; + resDesc.res.linear.sizeInBytes = mem.device_size; + } + + CUDA_TEXTURE_DESC texDesc; + memset(&texDesc, 0, sizeof(texDesc)); + texDesc.addressMode[0] = address_mode; + texDesc.addressMode[1] = address_mode; + texDesc.addressMode[2] = address_mode; + texDesc.filterMode = filter_mode; + texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES; + + cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL)); + + /* Resize once */ + if (flat_slot >= texture_info.size()) { + /* Allocate some slots in advance, to reduce amount + * of re-allocations. */ + texture_info.resize(flat_slot + 128); + } + + /* Set Mapping and tag that we need to (re-)upload to device */ + TextureInfo &info = texture_info[flat_slot]; + info.data = (uint64_t)cmem->texobject; + info.cl_buffer = 0; + info.interpolation = mem.interpolation; + info.extension = mem.extension; + info.width = mem.data_width; + info.height = mem.data_height; + info.depth = mem.data_depth; + need_texture_info = true; +} + +void CUDADevice::tex_free(device_memory &mem) +{ + if (mem.device_pointer) { + CUDAContextScope scope(this); + const CUDAMem &cmem = cuda_mem_map[&mem]; + + if (cmem.texobject) { + /* Free bindless texture. */ + cuTexObjectDestroy(cmem.texobject); + } + + if (cmem.array) { + /* Free array. */ + cuArrayDestroy(cmem.array); + stats.mem_free(mem.device_size); + mem.device_pointer = 0; + mem.device_size = 0; + + cuda_mem_map.erase(cuda_mem_map.find(&mem)); + } + else { + generic_free(mem); + } + } +} + +# define CUDA_GET_BLOCKSIZE(func, w, h) \ + int threads_per_block; \ + cuda_assert( \ + cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ + int threads = (int)sqrt((float)threads_per_block); \ + int xblocks = ((w) + threads - 1) / threads; \ + int yblocks = ((h) + threads - 1) / threads; + +# define CUDA_LAUNCH_KERNEL(func, args) \ + cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0)); + +/* Similar as above, but for 1-dimensional blocks. 
*/ +# define CUDA_GET_BLOCKSIZE_1D(func, w, h) \ + int threads_per_block; \ + cuda_assert( \ + cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ + int xblocks = ((w) + threads_per_block - 1) / threads_per_block; \ + int yblocks = h; + +# define CUDA_LAUNCH_KERNEL_1D(func, args) \ + cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads_per_block, 1, 1, 0, 0, args, 0)); + +bool CUDADevice::denoising_non_local_means(device_ptr image_ptr, + device_ptr guide_ptr, + device_ptr variance_ptr, + device_ptr out_ptr, + DenoisingTask *task) +{ + if (have_error()) + return false; + + CUDAContextScope scope(this); + + int stride = task->buffer.stride; + int w = task->buffer.width; + int h = task->buffer.h; + int r = task->nlm_state.r; + int f = task->nlm_state.f; + float a = task->nlm_state.a; + float k_2 = task->nlm_state.k_2; + + int pass_stride = task->buffer.pass_stride; + int num_shifts = (2 * r + 1) * (2 * r + 1); + int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0; + int frame_offset = 0; + + if (have_error()) + return false; + + CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer); + CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts; + CUdeviceptr weightAccum = difference + 2 * sizeof(float) * pass_stride * num_shifts; + CUdeviceptr scale_ptr = 0; + + cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float) * pass_stride)); + cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float) * pass_stride)); + + { + CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput; + cuda_assert(cuModuleGetFunction( + &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); + cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); + cuda_assert(cuModuleGetFunction( + &cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); + cuda_assert(cuModuleGetFunction( + &cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output")); + + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1)); + + CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w * h, num_shifts); + + void *calc_difference_args[] = {&guide_ptr, + &variance_ptr, + &scale_ptr, + &difference, + &w, + &h, + &stride, + &pass_stride, + &r, + &channel_offset, + &frame_offset, + &a, + &k_2}; + void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; + void *calc_weight_args[] = { + &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; + void *update_output_args[] = {&blurDifference, + &image_ptr, + &out_ptr, + &weightAccum, + &w, + &h, + &stride, + &pass_stride, + &channel_offset, + &r, + &f}; + + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args); + } + + { + CUfunction cuNLMNormalize; + cuda_assert( + cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize")); + cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1)); + void *normalize_args[] = {&out_ptr, 
&weightAccum, &w, &h, &stride}; + CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h); + CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args); + cuda_assert(cuCtxSynchronize()); + } + + return !have_error(); +} + +bool CUDADevice::denoising_construct_transform(DenoisingTask *task) +{ + if (have_error()) + return false; + + CUDAContextScope scope(this); + + CUfunction cuFilterConstructTransform; + cuda_assert(cuModuleGetFunction( + &cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform")); + cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED)); + CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, task->storage.w, task->storage.h); + + void *args[] = {&task->buffer.mem.device_pointer, + &task->tile_info_mem.device_pointer, + &task->storage.transform.device_pointer, + &task->storage.rank.device_pointer, + &task->filter_area, + &task->rect, + &task->radius, + &task->pca_threshold, + &task->buffer.pass_stride, + &task->buffer.frame_stride, + &task->buffer.use_time}; + CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); +} + +bool CUDADevice::denoising_accumulate(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr scale_ptr, + int frame, + DenoisingTask *task) +{ + if (have_error()) + return false; + + CUDAContextScope scope(this); + + int r = task->radius; + int f = 4; + float a = 1.0f; + float k_2 = task->nlm_k_2; + + int w = task->reconstruction_state.source_w; + int h = task->reconstruction_state.source_h; + int stride = task->buffer.stride; + int frame_offset = frame * task->buffer.frame_stride; + int t = task->tile_info->frames[frame]; + + int pass_stride = task->buffer.pass_stride; + int num_shifts = (2 * r + 1) * (2 * r + 1); + + if (have_error()) + return false; + + CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer); + CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts; + + CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian; + cuda_assert(cuModuleGetFunction( + &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); + cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); + cuda_assert( + cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); + cuda_assert(cuModuleGetFunction( + &cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian")); + + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED)); + + CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, + task->reconstruction_state.source_w * task->reconstruction_state.source_h, + num_shifts); + + void *calc_difference_args[] = {&color_ptr, + &color_variance_ptr, + &scale_ptr, + &difference, + &w, + &h, + &stride, + &pass_stride, + &r, + &pass_stride, + &frame_offset, + &a, + &k_2}; + void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; + void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; + void *construct_gramian_args[] = {&t, + &blurDifference, + &task->buffer.mem.device_pointer, + &task->storage.transform.device_pointer, + 
&task->storage.rank.device_pointer, + &task->storage.XtWX.device_pointer, + &task->storage.XtWY.device_pointer, + &task->reconstruction_state.filter_window, + &w, + &h, + &stride, + &pass_stride, + &r, + &f, + &frame_offset, + &task->buffer.use_time}; + + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); +} + +bool CUDADevice::denoising_solve(device_ptr output_ptr, DenoisingTask *task) +{ + CUfunction cuFinalize; + cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize")); + cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1)); + void *finalize_args[] = {&output_ptr, + &task->storage.rank.device_pointer, + &task->storage.XtWX.device_pointer, + &task->storage.XtWY.device_pointer, + &task->filter_area, + &task->reconstruction_state.buffer_params.x, + &task->render_buffer.samples}; + CUDA_GET_BLOCKSIZE( + cuFinalize, task->reconstruction_state.source_w, task->reconstruction_state.source_h); + CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); +} + +bool CUDADevice::denoising_combine_halves(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr mean_ptr, + device_ptr variance_ptr, + int r, + int4 rect, + DenoisingTask *task) +{ + if (have_error()) + return false; + + CUDAContextScope scope(this); + + CUfunction cuFilterCombineHalves; + cuda_assert(cuModuleGetFunction( + &cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves")); + cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE( + cuFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y); + + void *args[] = {&mean_ptr, &variance_ptr, &a_ptr, &b_ptr, &rect, &r}; + CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); +} + +bool CUDADevice::denoising_divide_shadow(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr sample_variance_ptr, + device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, + DenoisingTask *task) +{ + if (have_error()) + return false; + + CUDAContextScope scope(this); + + CUfunction cuFilterDivideShadow; + cuda_assert(cuModuleGetFunction( + &cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow")); + cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE( + cuFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y); + + void *args[] = {&task->render_buffer.samples, + &task->tile_info_mem.device_pointer, + &a_ptr, + &b_ptr, + &sample_variance_ptr, + &sv_variance_ptr, + &buffer_variance_ptr, + &task->rect, + &task->render_buffer.pass_stride, + &task->render_buffer.offset}; + CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); +} + +bool CUDADevice::denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + float scale, + DenoisingTask *task) +{ + if (have_error()) + return false; + + CUDAContextScope scope(this); + + CUfunction cuFilterGetFeature; + cuda_assert( + cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature")); + 
cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE(cuFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y); + + void *args[] = {&task->render_buffer.samples, + &task->tile_info_mem.device_pointer, + &mean_offset, + &variance_offset, + &mean_ptr, + &variance_ptr, + &scale, + &task->rect, + &task->render_buffer.pass_stride, + &task->render_buffer.offset}; + CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); +} + +bool CUDADevice::denoising_write_feature(int out_offset, + device_ptr from_ptr, + device_ptr buffer_ptr, + DenoisingTask *task) +{ + if (have_error()) + return false; + + CUDAContextScope scope(this); + + CUfunction cuFilterWriteFeature; + cuda_assert(cuModuleGetFunction( + &cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature")); + cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, task->filter_area.z, task->filter_area.w); + + void *args[] = {&task->render_buffer.samples, + &task->reconstruction_state.buffer_params, + &task->filter_area, + &from_ptr, + &buffer_ptr, + &out_offset, + &task->rect}; + CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); +} + +bool CUDADevice::denoising_detect_outliers(device_ptr image_ptr, + device_ptr variance_ptr, + device_ptr depth_ptr, + device_ptr output_ptr, + DenoisingTask *task) +{ + if (have_error()) + return false; + + CUDAContextScope scope(this); + + CUfunction cuFilterDetectOutliers; + cuda_assert(cuModuleGetFunction( + &cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers")); + cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE( + cuFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y); + + void *args[] = { + &image_ptr, &variance_ptr, &depth_ptr, &output_ptr, &task->rect, &task->buffer.pass_stride}; + + CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); +} + +void CUDADevice::denoise(RenderTile &rtile, DenoisingTask &denoising) +{ + denoising.functions.construct_transform = function_bind( + &CUDADevice::denoising_construct_transform, this, &denoising); + denoising.functions.accumulate = function_bind( + &CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); + denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising); + denoising.functions.divide_shadow = function_bind( + &CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.non_local_means = function_bind( + &CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); + denoising.functions.combine_halves = function_bind( + &CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); + denoising.functions.get_feature = function_bind( + &CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.write_feature = function_bind( + &CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising); + denoising.functions.detect_outliers = function_bind( + &CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); + + denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); + denoising.render_buffer.samples = rtile.sample; + denoising.buffer.gpu_temporary_mem = true; 
+
+  denoising.run_denoising(&rtile);
+}
+
+void CUDADevice::path_trace(DeviceTask &task,
+                            RenderTile &rtile,
+                            device_vector<WorkTile> &work_tiles)
+{
+  scoped_timer timer(&rtile.buffers->render_time);
+
+  if (have_error())
+    return;
+
+  CUDAContextScope scope(this);
+  CUfunction cuPathTrace;
+
+  /* Get kernel function. */
+  if (task.integrator_branched) {
+    cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
+  }
+  else {
+    cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
+  }
+
+  if (have_error()) {
+    return;
+  }
+
+  cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
+
+  /* Allocate work tile. */
+  work_tiles.alloc(1);
+
+  WorkTile *wtile = work_tiles.data();
+  wtile->x = rtile.x;
+  wtile->y = rtile.y;
+  wtile->w = rtile.w;
+  wtile->h = rtile.h;
+  wtile->offset = rtile.offset;
+  wtile->stride = rtile.stride;
+  wtile->buffer = (float *)cuda_device_ptr(rtile.buffer);
+
+  /* Prepare work size. More step samples render faster, but for now we
+   * remain conservative for GPUs connected to a display to avoid driver
+   * timeouts and display freezing. */
+  int min_blocks, num_threads_per_block;
+  cuda_assert(cuOccupancyMaxPotentialBlockSize(
+      &min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0));
+  if (!info.display_device) {
+    min_blocks *= 8;
+  }
+
+  uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
+
+  /* Render all samples. */
+  int start_sample = rtile.start_sample;
+  int end_sample = rtile.start_sample + rtile.num_samples;
+
+  for (int sample = start_sample; sample < end_sample; sample += step_samples) {
+    /* Setup and copy work tile to device. */
+    wtile->start_sample = sample;
+    wtile->num_samples = min(step_samples, end_sample - sample);
+    work_tiles.copy_to_device();
+
+    CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer);
+    uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
+    uint num_blocks = divide_up(total_work_size, num_threads_per_block);
+
+    /* Launch kernel. */
+    void *args[] = {&d_work_tiles, &total_work_size};
+
+    cuda_assert(
+        cuLaunchKernel(cuPathTrace, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
+
+    cuda_assert(cuCtxSynchronize());
+
+    /* Update progress. */
+    rtile.sample = sample + wtile->num_samples;
+    task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
+
+    if (task.get_cancel()) {
+      if (task.need_finish_queue == false)
+        break;
+    }
+  }
+}
+
+void CUDADevice::film_convert(DeviceTask &task,
+                              device_ptr buffer,
+                              device_ptr rgba_byte,
+                              device_ptr rgba_half)
+{
+  if (have_error())
+    return;
+
+  CUDAContextScope scope(this);
+
+  CUfunction cuFilmConvert;
+  CUdeviceptr d_rgba = map_pixels((rgba_byte) ? rgba_byte : rgba_half);
+  CUdeviceptr d_buffer = cuda_device_ptr(buffer);
+
+  /* get kernel function */
+  if (rgba_half) {
+    cuda_assert(
+        cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float"));
+  }
+  else {
+    cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte"));
+  }
+
+  float sample_scale = 1.0f / (task.sample + 1);
+
+  /* pass in parameters */
+  void *args[] = {&d_rgba,
+                  &d_buffer,
+                  &sample_scale,
+                  &task.x,
+                  &task.y,
+                  &task.w,
+                  &task.h,
+                  &task.offset,
+                  &task.stride};
+
+  /* launch kernel */
+  int threads_per_block;
+  cuda_assert(cuFuncGetAttribute(
+      &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert));
+
+  int xthreads = (int)sqrt(threads_per_block);
+  int ythreads = (int)sqrt(threads_per_block);
+  int xblocks = (task.w + xthreads - 1) / xthreads;
+  int yblocks = (task.h + ythreads - 1) / ythreads;
+
+  cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1));
+
+  cuda_assert(cuLaunchKernel(cuFilmConvert,
+                             xblocks,
+                             yblocks,
+                             1, /* blocks */
+                             xthreads,
+                             ythreads,
+                             1, /* threads */
+                             0,
+                             0,
+                             args,
+                             0));
+
+  unmap_pixels((rgba_byte) ? rgba_byte : rgba_half);
+
+  cuda_assert(cuCtxSynchronize());
+}
+
+void CUDADevice::shader(DeviceTask &task)
+{
+  if (have_error())
+    return;
+
+  CUDAContextScope scope(this);
+
+  CUfunction cuShader;
+  CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
+  CUdeviceptr d_output = cuda_device_ptr(task.shader_output);
+
+  /* get kernel function */
+  if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
+    cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_bake"));
+  }
+  else if (task.shader_eval_type == SHADER_EVAL_DISPLACE) {
+    cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace"));
+  }
+  else {
+    cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background"));
+  }
+
+  /* do tasks in smaller chunks, so we can cancel it */
+  const int shader_chunk_size = 65536;
+  const int start = task.shader_x;
+  const int end = task.shader_x + task.shader_w;
+  int offset = task.offset;
+
+  bool canceled = false;
+  for (int sample = 0; sample < task.num_samples && !canceled; sample++) {
+    for (int shader_x = start; shader_x < end; shader_x += shader_chunk_size) {
+      int shader_w = min(shader_chunk_size, end - shader_x);
+
+      /* pass in parameters */
+      void *args[8];
+      int arg = 0;
+      args[arg++] = &d_input;
+      args[arg++] = &d_output;
+      args[arg++] = &task.shader_eval_type;
+      if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
+        args[arg++] = &task.shader_filter;
+      }
+      args[arg++] = &shader_x;
+      args[arg++] = &shader_w;
+      args[arg++] = &offset;
+      args[arg++] = &sample;
+
+      /* launch kernel */
+      int threads_per_block;
+      cuda_assert(cuFuncGetAttribute(
+          &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader));
+
+      int xblocks = (shader_w + threads_per_block - 1) / threads_per_block;
+
+      cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1));
+      cuda_assert(cuLaunchKernel(cuShader,
+                                 xblocks,
+                                 1,
+                                 1, /* blocks */
+                                 threads_per_block,
+                                 1,
+                                 1, /* threads */
+                                 0,
+                                 0,
+                                 args,
+                                 0));
+
+      cuda_assert(cuCtxSynchronize());
+
+      if (task.get_cancel()) {
+        canceled = true;
+        break;
+      }
+    }
+
+    task.update_progress(NULL);
+  }
+}
+
+CUdeviceptr CUDADevice::map_pixels(device_ptr mem)
+{
+  if (!background) {
+    PixelMem pmem = pixel_mem_map[mem];
+    CUdeviceptr buffer;
+
+    size_t bytes;
+    cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0));
+
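+    /* While the PBO resource is mapped for CUDA access, OpenGL must not use
+     * it; kernels write through the device pointer queried below, and
+     * unmap_pixels() releases the resource back to OpenGL afterwards. */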
+    cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource));
+
+    return buffer;
+  }
+
+  return cuda_device_ptr(mem);
+}
+
+void CUDADevice::unmap_pixels(device_ptr mem)
+{
+  if (!background) {
+    PixelMem pmem = pixel_mem_map[mem];
+
+    cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0));
+  }
+}
+
+void CUDADevice::pixels_alloc(device_memory &mem)
+{
+  PixelMem pmem;
+
+  pmem.w = mem.data_width;
+  pmem.h = mem.data_height;
+
+  CUDAContextScope scope(this);
+
+  glGenBuffers(1, &pmem.cuPBO);
+  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
+  if (mem.data_type == TYPE_HALF)
+    glBufferData(
+        GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(GLhalf) * 4, NULL, GL_DYNAMIC_DRAW);
+  else
+    glBufferData(
+        GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(uint8_t) * 4, NULL, GL_DYNAMIC_DRAW);
+
+  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+
+  glActiveTexture(GL_TEXTURE0);
+  glGenTextures(1, &pmem.cuTexId);
+  glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
+  if (mem.data_type == TYPE_HALF)
+    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL);
+  else
+    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
+  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+  glBindTexture(GL_TEXTURE_2D, 0);
+
+  CUresult result = cuGraphicsGLRegisterBuffer(
+      &pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
+
+  if (result == CUDA_SUCCESS) {
+    mem.device_pointer = pmem.cuTexId;
+    pixel_mem_map[mem.device_pointer] = pmem;
+
+    mem.device_size = mem.memory_size();
+    stats.mem_alloc(mem.device_size);
+
+    return;
+  }
+  else {
+    /* failed to register buffer, fallback to no interop */
+    glDeleteBuffers(1, &pmem.cuPBO);
+    glDeleteTextures(1, &pmem.cuTexId);
+
+    background = true;
+  }
+}
+
+void CUDADevice::pixels_copy_from(device_memory &mem, int y, int w, int h)
+{
+  PixelMem pmem = pixel_mem_map[mem.device_pointer];
+
+  CUDAContextScope scope(this);
+
+  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
+  uchar *pixels = (uchar *)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
+  size_t offset = sizeof(uchar) * 4 * y * w;
+  memcpy((uchar *)mem.host_pointer + offset, pixels + offset, sizeof(uchar) * 4 * w * h);
+  glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
+  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+}
+
+void CUDADevice::pixels_free(device_memory &mem)
+{
+  if (mem.device_pointer) {
+    PixelMem pmem = pixel_mem_map[mem.device_pointer];
+
+    CUDAContextScope scope(this);
+
+    cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource));
+    glDeleteBuffers(1, &pmem.cuPBO);
+    glDeleteTextures(1, &pmem.cuTexId);
+
+    pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
+    mem.device_pointer = 0;
+
+    stats.mem_free(mem.device_size);
+    mem.device_size = 0;
+  }
+}
+
+void CUDADevice::draw_pixels(device_memory &mem,
+                             int y,
+                             int w,
+                             int h,
+                             int width,
+                             int height,
+                             int dx,
+                             int dy,
+                             int dw,
+                             int dh,
+                             bool transparent,
+                             const DeviceDrawParams &draw_params)
+{
+  assert(mem.type == MEM_PIXELS);
+
+  if (!background) {
+    const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL);
+    PixelMem pmem = pixel_mem_map[mem.device_pointer];
+    float *vpointer;
+
+    CUDAContextScope scope(this);
+
+    /* for multi devices, this assumes the inefficient method that we allocate
+     * all pixels on the device even though we only render to a subset */
+    size_t offset = 4 * y * w;
+
+    if (mem.data_type == TYPE_HALF)
+      offset *= sizeof(GLhalf);
+    else
+      offset *= sizeof(uint8_t);
+
+    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
+    glActiveTexture(GL_TEXTURE0);
+    glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
+    if (mem.data_type == TYPE_HALF) {
+      glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void *)offset);
+    }
+    else {
+      glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void *)offset);
+    }
+    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+
+    if (transparent) {
+      glEnable(GL_BLEND);
+      glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
+    }
+
+    GLint shader_program;
+    if (use_fallback_shader) {
+      if (!bind_fallback_display_space_shader(dw, dh)) {
+        return;
+      }
+      shader_program = fallback_shader_program;
+    }
+    else {
+      draw_params.bind_display_space_shader_cb();
+      glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program);
+    }
+
+    if (!vertex_buffer) {
+      glGenBuffers(1, &vertex_buffer);
+    }
+
+    glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
+    /* invalidate old contents -
+     * avoids stalling if buffer is still waiting in queue to be rendered */
+    glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
+
+    vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
+
+    if (vpointer) {
+      /* texture coordinate - vertex pair */
+      vpointer[0] = 0.0f;
+      vpointer[1] = 0.0f;
+      vpointer[2] = dx;
+      vpointer[3] = dy;
+
+      vpointer[4] = (float)w / (float)pmem.w;
+      vpointer[5] = 0.0f;
+      vpointer[6] = (float)width + dx;
+      vpointer[7] = dy;
+
+      vpointer[8] = (float)w / (float)pmem.w;
+      vpointer[9] = (float)h / (float)pmem.h;
+      vpointer[10] = (float)width + dx;
+      vpointer[11] = (float)height + dy;
+
+      vpointer[12] = 0.0f;
+      vpointer[13] = (float)h / (float)pmem.h;
+      vpointer[14] = dx;
+      vpointer[15] = (float)height + dy;
+
+      glUnmapBuffer(GL_ARRAY_BUFFER);
+    }
+
+    GLuint vertex_array_object;
+    GLuint position_attribute, texcoord_attribute;
+
+    glGenVertexArrays(1, &vertex_array_object);
+    glBindVertexArray(vertex_array_object);
+
+    texcoord_attribute = glGetAttribLocation(shader_program, "texCoord");
+    position_attribute = glGetAttribLocation(shader_program, "pos");
+
+    glEnableVertexAttribArray(texcoord_attribute);
+    glEnableVertexAttribArray(position_attribute);
+
+    glVertexAttribPointer(
+        texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0);
+    glVertexAttribPointer(position_attribute,
+                          2,
+                          GL_FLOAT,
+                          GL_FALSE,
+                          4 * sizeof(float),
+                          (const GLvoid *)(sizeof(float) * 2));
+
+    glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
+
+    if (use_fallback_shader) {
+      glUseProgram(0);
+    }
+    else {
+      draw_params.unbind_display_space_shader_cb();
+    }
+
+    if (transparent) {
+      glDisable(GL_BLEND);
+    }
+
+    glBindTexture(GL_TEXTURE_2D, 0);
+
+    return;
+  }
+
+  Device::draw_pixels(mem, y, w, h, width, height, dx, dy, dw, dh, transparent, draw_params);
+}
+
+void CUDADevice::thread_run(DeviceTask *task)
+{
+  CUDAContextScope scope(this);
+
+  if (task->type == DeviceTask::RENDER || task->type == DeviceTask::DENOISE) {
+    DeviceRequestedFeatures requested_features;
+    if (use_split_kernel()) {
+      if (split_kernel == NULL) {
+        split_kernel = new CUDASplitKernel(this);
+        split_kernel->load_kernels(requested_features);
+      }
+    }
+
+    device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
+
+    /* keep rendering tiles until done */
+    RenderTile tile;
+    DenoisingTask denoising(this, *task);
+
+    while (task->acquire_tile(this, tile)) {
+      if (tile.task == RenderTile::PATH_TRACE) {
+        if (use_split_kernel()) {
+          device_only_memory<uchar> void_buffer(this, "void_buffer");
"void_buffer"); + split_kernel->path_trace(task, tile, void_buffer, void_buffer); + } + else { + path_trace(*task, tile, work_tiles); + } + } + else if (tile.task == RenderTile::DENOISE) { + tile.sample = tile.start_sample + tile.num_samples; + + denoise(tile, denoising); + + task->update_progress(&tile, tile.w * tile.h); + } + + task->release_tile(tile); + + if (task->get_cancel()) { + if (task->need_finish_queue == false) + break; + } + } + + work_tiles.free(); + } + else if (task->type == DeviceTask::SHADER) { + shader(*task); + + cuda_assert(cuCtxSynchronize()); + } + else if (task->type == DeviceTask::DENOISE_BUFFER) { + RenderTile tile; + tile.x = task->x; + tile.y = task->y; + tile.w = task->w; + tile.h = task->h; + tile.buffer = task->buffer; + tile.sample = task->sample + task->num_samples; + tile.num_samples = task->num_samples; + tile.start_sample = task->sample; + tile.offset = task->offset; + tile.stride = task->stride; + tile.buffers = task->buffers; + + DenoisingTask denoising(this, *task); + denoise(tile, denoising); + task->update_progress(&tile, tile.w * tile.h); + } +} + +class CUDADeviceTask : public DeviceTask { + public: + CUDADeviceTask(CUDADevice *device, DeviceTask &task) : DeviceTask(task) + { + run = function_bind(&CUDADevice::thread_run, device, this); + } +}; + +void CUDADevice::task_add(DeviceTask &task) +{ + CUDAContextScope scope(this); + + /* Load texture info. */ + load_texture_info(); + + /* Synchronize all memory copies before executing task. */ + cuda_assert(cuCtxSynchronize()); + + if (task.type == DeviceTask::FILM_CONVERT) { + /* must be done in main thread due to opengl access */ + film_convert(task, task.buffer, task.rgba_byte, task.rgba_half); + } + else { + task_pool.push(new CUDADeviceTask(this, task)); + } +} + +void CUDADevice::task_wait() +{ + task_pool.wait(); +} + +void CUDADevice::task_cancel() +{ + task_pool.cancel(); +} + +/* redefine the cuda_assert macro so it can be used outside of the CUDADevice class + * now that the definition of that class is complete + */ +# undef cuda_assert +# define cuda_assert(stmt) \ + { \ + CUresult result = stmt; \ +\ + if (result != CUDA_SUCCESS) { \ + string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \ + if (device->error_msg == "") \ + device->error_msg = message; \ + fprintf(stderr, "%s\n", message.c_str()); \ + /*cuda_abort();*/ \ + device->cuda_error_documentation(); \ + } \ + } \ + (void)0 + +/* CUDA context scope. 
*/ + +CUDAContextScope::CUDAContextScope(CUDADevice *device) : device(device) +{ + cuda_assert(cuCtxPushCurrent(device->cuContext)); +} + +CUDAContextScope::~CUDAContextScope() +{ + cuda_assert(cuCtxPopCurrent(NULL)); +} + +/* split kernel */ + +class CUDASplitKernelFunction : public SplitKernelFunction { + CUDADevice *device; + CUfunction func; + + public: + CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func) + { + } + + /* enqueue the kernel, returns false if there is an error */ + bool enqueue(const KernelDimensions &dim, device_memory & /*kg*/, device_memory & /*data*/) + { + return enqueue(dim, NULL); + } + + /* enqueue the kernel, returns false if there is an error */ + bool enqueue(const KernelDimensions &dim, void *args[]) + { + if (device->have_error()) + return false; + + CUDAContextScope scope(device); + + /* we ignore dim.local_size for now, as this is faster */ + int threads_per_block; + cuda_assert( + cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); + + int xblocks = (dim.global_size[0] * dim.global_size[1] + threads_per_block - 1) / + threads_per_block; + + cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); + + cuda_assert(cuLaunchKernel(func, + xblocks, + 1, + 1, /* blocks */ + threads_per_block, + 1, + 1, /* threads */ + 0, + 0, + args, + 0)); + + return !device->have_error(); + } +}; + +CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device) +{ +} + +uint64_t CUDASplitKernel::state_buffer_size(device_memory & /*kg*/, + device_memory & /*data*/, + size_t num_threads) +{ + CUDAContextScope scope(device); + + device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE); + size_buffer.alloc(1); + size_buffer.zero_to_device(); + + uint threads = num_threads; + CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer); + + struct args_t { + uint *num_threads; + CUdeviceptr *size; + }; + + args_t args = {&threads, &d_size}; + + CUfunction state_buffer_size; + cuda_assert( + cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size")); + + cuda_assert(cuLaunchKernel(state_buffer_size, 1, 1, 1, 1, 1, 1, 0, 0, (void **)&args, 0)); + + size_buffer.copy_from_device(0, 1, 1); + size_t size = size_buffer[0]; + size_buffer.free(); + + return size; +} + +bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim, + RenderTile &rtile, + int num_global_elements, + device_memory & /*kernel_globals*/, + device_memory & /*kernel_data*/, + device_memory &split_data, + device_memory &ray_state, + device_memory &queue_index, + device_memory &use_queues_flag, + device_memory &work_pool_wgs) +{ + CUDAContextScope scope(device); + + CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer); + CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer); + CUdeviceptr d_queue_index = device->cuda_device_ptr(queue_index.device_pointer); + CUdeviceptr d_use_queues_flag = device->cuda_device_ptr(use_queues_flag.device_pointer); + CUdeviceptr d_work_pool_wgs = device->cuda_device_ptr(work_pool_wgs.device_pointer); + + CUdeviceptr d_buffer = device->cuda_device_ptr(rtile.buffer); + + int end_sample = rtile.start_sample + rtile.num_samples; + int queue_size = dim.global_size[0] * dim.global_size[1]; + + struct args_t { + CUdeviceptr *split_data_buffer; + int *num_elements; + CUdeviceptr *ray_state; + int *start_sample; + int *end_sample; + int *sx; + int *sy; 
+ int *sw; + int *sh; + int *offset; + int *stride; + CUdeviceptr *queue_index; + int *queuesize; + CUdeviceptr *use_queues_flag; + CUdeviceptr *work_pool_wgs; + int *num_samples; + CUdeviceptr *buffer; + }; + + args_t args = {&d_split_data, + &num_global_elements, + &d_ray_state, + &rtile.start_sample, + &end_sample, + &rtile.x, + &rtile.y, + &rtile.w, + &rtile.h, + &rtile.offset, + &rtile.stride, + &d_queue_index, + &queue_size, + &d_use_queues_flag, + &d_work_pool_wgs, + &rtile.num_samples, + &d_buffer}; + + CUfunction data_init; + cuda_assert( + cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init")); + if (device->have_error()) { + return false; + } + + CUDASplitKernelFunction(device, data_init).enqueue(dim, (void **)&args); + + return !device->have_error(); +} + +SplitKernelFunction *CUDASplitKernel::get_split_kernel_function(const string &kernel_name, + const DeviceRequestedFeatures &) +{ + CUDAContextScope scope(device); + CUfunction func; + + cuda_assert( + cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data())); + if (device->have_error()) { + device->cuda_error_message( + string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data())); + return NULL; + } + + return new CUDASplitKernelFunction(device, func); +} + +int2 CUDASplitKernel::split_kernel_local_size() +{ + return make_int2(32, 1); +} + +int2 CUDASplitKernel::split_kernel_global_size(device_memory &kg, + device_memory &data, + DeviceTask * /*task*/) +{ + CUDAContextScope scope(device); + size_t free; + size_t total; + + cuda_assert(cuMemGetInfo(&free, &total)); + + VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(free) + << " bytes. (" << string_human_readable_size(free) << ")."; + + size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2); + size_t side = round_down((int)sqrt(num_elements), 32); + int2 global_size = make_int2(side, round_down(num_elements / side, 16)); + VLOG(1) << "Global size: " << global_size << "."; + return global_size; +} + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index e3c737cc2e7..b24b3ab6567 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -14,2576 +14,15 @@ * limitations under the License. 
*/ -#include <climits> -#include <limits.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> +#ifdef WITH_CUDA -#include "device/device.h" -#include "device/device_denoising.h" -#include "device/device_intern.h" -#include "device/device_split_kernel.h" +# include "device/cuda/device_cuda.h" -#include "render/buffers.h" - -#include "kernel/filter/filter_defines.h" - -#ifdef WITH_CUDA_DYNLOAD -# include "cuew.h" -#else -# include "util/util_opengl.h" -# include <cuda.h> -# include <cudaGL.h> -#endif -#include "util/util_debug.h" -#include "util/util_foreach.h" -#include "util/util_logging.h" -#include "util/util_map.h" -#include "util/util_md5.h" -#include "util/util_opengl.h" -#include "util/util_path.h" -#include "util/util_string.h" -#include "util/util_system.h" -#include "util/util_types.h" -#include "util/util_time.h" -#include "util/util_windows.h" - -#include "kernel/split/kernel_split_data_types.h" +# include "util/util_logging.h" +# include "util/util_string.h" CCL_NAMESPACE_BEGIN -#ifndef WITH_CUDA_DYNLOAD - -/* Transparently implement some functions, so majority of the file does not need - * to worry about difference between dynamically loaded and linked CUDA at all. - */ - -namespace { - -const char *cuewErrorString(CUresult result) -{ - /* We can only give error code here without major code duplication, that - * should be enough since dynamic loading is only being disabled by folks - * who knows what they're doing anyway. - * - * NOTE: Avoid call from several threads. - */ - static string error; - error = string_printf("%d", result); - return error.c_str(); -} - -const char *cuewCompilerPath() -{ - return CYCLES_CUDA_NVCC_EXECUTABLE; -} - -int cuewCompilerVersion() -{ - return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10); -} - -} /* namespace */ -#endif /* WITH_CUDA_DYNLOAD */ - -class CUDADevice; - -class CUDASplitKernel : public DeviceSplitKernel { - CUDADevice *device; - - public: - explicit CUDASplitKernel(CUDADevice *device); - - virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads); - - virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim, - RenderTile &rtile, - int num_global_elements, - device_memory &kernel_globals, - device_memory &kernel_data_, - device_memory &split_data, - device_memory &ray_state, - device_memory &queue_index, - device_memory &use_queues_flag, - device_memory &work_pool_wgs); - - virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name, - const DeviceRequestedFeatures &); - virtual int2 split_kernel_local_size(); - virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask *task); -}; - -/* Utility to push/pop CUDA context. */ -class CUDAContextScope { - public: - CUDAContextScope(CUDADevice *device); - ~CUDAContextScope(); - - private: - CUDADevice *device; -}; - -class CUDADevice : public Device { - public: - DedicatedTaskPool task_pool; - CUdevice cuDevice; - CUcontext cuContext; - CUmodule cuModule, cuFilterModule; - size_t device_texture_headroom; - size_t device_working_headroom; - bool move_texture_to_host; - size_t map_host_used; - size_t map_host_limit; - int can_map_host; - int cuDevId; - int cuDevArchitecture; - bool first_error; - CUDASplitKernel *split_kernel; - - struct CUDAMem { - CUDAMem() : texobject(0), array(0), use_mapped_host(false) - { - } - - CUtexObject texobject; - CUarray array; - - /* If true, a mapped host memory in shared_pointer is being used. 
*/ - bool use_mapped_host; - }; - typedef map<device_memory *, CUDAMem> CUDAMemMap; - CUDAMemMap cuda_mem_map; - - struct PixelMem { - GLuint cuPBO; - CUgraphicsResource cuPBOresource; - GLuint cuTexId; - int w, h; - }; - map<device_ptr, PixelMem> pixel_mem_map; - - /* Bindless Textures */ - device_vector<TextureInfo> texture_info; - bool need_texture_info; - - CUdeviceptr cuda_device_ptr(device_ptr mem) - { - return (CUdeviceptr)mem; - } - - static bool have_precompiled_kernels() - { - string cubins_path = path_get("lib"); - return path_exists(cubins_path); - } - - virtual bool show_samples() const - { - /* The CUDADevice only processes one tile at a time, so showing samples is fine. */ - return true; - } - - virtual BVHLayoutMask get_bvh_layout_mask() const - { - return BVH_LAYOUT_BVH2; - } - - /*#ifdef NDEBUG -#define cuda_abort() -#else -#define cuda_abort() abort() -#endif*/ - void cuda_error_documentation() - { - if (first_error) { - fprintf(stderr, - "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n"); - fprintf(stderr, - "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n"); - first_error = false; - } - } - -#define cuda_assert(stmt) \ - { \ - CUresult result = stmt; \ -\ - if (result != CUDA_SUCCESS) { \ - string message = string_printf( \ - "CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \ - if (error_msg == "") \ - error_msg = message; \ - fprintf(stderr, "%s\n", message.c_str()); \ - /*cuda_abort();*/ \ - cuda_error_documentation(); \ - } \ - } \ - (void)0 - - bool cuda_error_(CUresult result, const string &stmt) - { - if (result == CUDA_SUCCESS) - return false; - - string message = string_printf("CUDA error at %s: %s", stmt.c_str(), cuewErrorString(result)); - if (error_msg == "") - error_msg = message; - fprintf(stderr, "%s\n", message.c_str()); - cuda_error_documentation(); - return true; - } - -#define cuda_error(stmt) cuda_error_(stmt, #stmt) - - void cuda_error_message(const string &message) - { - if (error_msg == "") - error_msg = message; - fprintf(stderr, "%s\n", message.c_str()); - cuda_error_documentation(); - } - - CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_) - : Device(info, stats, profiler, background_), - texture_info(this, "__texture_info", MEM_TEXTURE) - { - first_error = true; - background = background_; - - cuDevId = info.num; - cuDevice = 0; - cuContext = 0; - - cuModule = 0; - cuFilterModule = 0; - - split_kernel = NULL; - - need_texture_info = false; - - device_texture_headroom = 0; - device_working_headroom = 0; - move_texture_to_host = false; - map_host_limit = 0; - map_host_used = 0; - can_map_host = 0; - - /* Intialize CUDA. */ - if (cuda_error(cuInit(0))) - return; - - /* Setup device and context. */ - if (cuda_error(cuDeviceGet(&cuDevice, cuDevId))) - return; - - /* CU_CTX_MAP_HOST for mapping host memory when out of device memory. - * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render, - * so we can predict which memory to map to host. */ - cuda_assert( - cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice)); - - unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX; - if (can_map_host) { - ctx_flags |= CU_CTX_MAP_HOST; - init_host_memory(); - } - - /* Create context. 
*/ - CUresult result; - - if (background) { - result = cuCtxCreate(&cuContext, ctx_flags, cuDevice); - } - else { - result = cuGLCtxCreate(&cuContext, ctx_flags, cuDevice); - - if (result != CUDA_SUCCESS) { - result = cuCtxCreate(&cuContext, ctx_flags, cuDevice); - background = true; - } - } - - if (cuda_error_(result, "cuCtxCreate")) - return; - - int major, minor; - cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); - cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); - cuDevArchitecture = major * 100 + minor * 10; - - /* Pop context set by cuCtxCreate. */ - cuCtxPopCurrent(NULL); - } - - ~CUDADevice() - { - task_pool.stop(); - - delete split_kernel; - - texture_info.free(); - - cuda_assert(cuCtxDestroy(cuContext)); - } - - bool support_device(const DeviceRequestedFeatures & /*requested_features*/) - { - int major, minor; - cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); - cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); - - /* We only support sm_30 and above */ - if (major < 3) { - cuda_error_message(string_printf( - "CUDA device supported only with compute capability 3.0 or up, found %d.%d.", - major, - minor)); - return false; - } - - return true; - } - - bool use_adaptive_compilation() - { - return DebugFlags().cuda.adaptive_compile; - } - - bool use_split_kernel() - { - return DebugFlags().cuda.split_kernel; - } - - /* Common NVCC flags which stays the same regardless of shading model, - * kernel sources md5 and only depends on compiler or compilation settings. - */ - string compile_kernel_get_common_cflags(const DeviceRequestedFeatures &requested_features, - bool filter = false, - bool split = false) - { - const int machine = system_cpu_bits(); - const string source_path = path_get("source"); - const string include_path = source_path; - string cflags = string_printf( - "-m%d " - "--ptxas-options=\"-v\" " - "--use_fast_math " - "-DNVCC " - "-I\"%s\"", - machine, - include_path.c_str()); - if (!filter && use_adaptive_compilation()) { - cflags += " " + requested_features.get_build_options(); - } - const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS"); - if (extra_cflags) { - cflags += string(" ") + string(extra_cflags); - } -#ifdef WITH_CYCLES_DEBUG - cflags += " -D__KERNEL_DEBUG__"; -#endif - - if (split) { - cflags += " -D__SPLIT__"; - } - - return cflags; - } - - bool compile_check_compiler() - { - const char *nvcc = cuewCompilerPath(); - if (nvcc == NULL) { - cuda_error_message( - "CUDA nvcc compiler not found. 
" - "Install CUDA toolkit in default location."); - return false; - } - const int cuda_version = cuewCompilerVersion(); - VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << cuda_version << "."; - const int major = cuda_version / 10, minor = cuda_version % 10; - if (cuda_version == 0) { - cuda_error_message("CUDA nvcc compiler version could not be parsed."); - return false; - } - if (cuda_version < 80) { - printf( - "Unsupported CUDA version %d.%d detected, " - "you need CUDA 8.0 or newer.\n", - major, - minor); - return false; - } - else if (cuda_version != 101) { - printf( - "CUDA version %d.%d detected, build may succeed but only " - "CUDA 10.1 is officially supported.\n", - major, - minor); - } - return true; - } - - string compile_kernel(const DeviceRequestedFeatures &requested_features, - bool filter = false, - bool split = false) - { - const char *name, *source; - if (filter) { - name = "filter"; - source = "filter.cu"; - } - else if (split) { - name = "kernel_split"; - source = "kernel_split.cu"; - } - else { - name = "kernel"; - source = "kernel.cu"; - } - /* Compute cubin name. */ - int major, minor; - cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); - cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); - - /* Attempt to use kernel provided with Blender. */ - if (!use_adaptive_compilation()) { - const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor)); - VLOG(1) << "Testing for pre-compiled kernel " << cubin << "."; - if (path_exists(cubin)) { - VLOG(1) << "Using precompiled kernel."; - return cubin; - } - const string ptx = path_get(string_printf("lib/%s_compute_%d%d.ptx", name, major, minor)); - VLOG(1) << "Testing for pre-compiled kernel " << ptx << "."; - if (path_exists(ptx)) { - VLOG(1) << "Using precompiled kernel."; - return ptx; - } - } - - const string common_cflags = compile_kernel_get_common_cflags( - requested_features, filter, split); - - /* Try to use locally compiled kernel. */ - const string source_path = path_get("source"); - const string kernel_md5 = path_files_md5_hash(source_path); - - /* We include cflags into md5 so changing cuda toolkit or changing other - * compiler command line arguments makes sure cubin gets re-built. - */ - const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags); - - const string cubin_file = string_printf( - "cycles_%s_sm%d%d_%s.cubin", name, major, minor, cubin_md5.c_str()); - const string cubin = path_cache_get(path_join("kernels", cubin_file)); - VLOG(1) << "Testing for locally compiled kernel " << cubin << "."; - if (path_exists(cubin)) { - VLOG(1) << "Using locally compiled kernel."; - return cubin; - } - -#ifdef _WIN32 - if (have_precompiled_kernels()) { - if (major < 3) { - cuda_error_message( - string_printf("CUDA device requires compute capability 3.0 or up, " - "found %d.%d. Your GPU is not supported.", - major, - minor)); - } - else { - cuda_error_message( - string_printf("CUDA binary kernel for this graphics card compute " - "capability (%d.%d) not found.", - major, - minor)); - } - return ""; - } -#endif - - /* Compile. 
*/ - if (!compile_check_compiler()) { - return ""; - } - const char *nvcc = cuewCompilerPath(); - const string kernel = path_join(path_join(source_path, "kernel"), - path_join("kernels", path_join("cuda", source))); - double starttime = time_dt(); - printf("Compiling CUDA kernel ...\n"); - - path_create_directories(cubin); - - string command = string_printf( - "\"%s\" " - "-arch=sm_%d%d " - "--cubin \"%s\" " - "-o \"%s\" " - "%s ", - nvcc, - major, - minor, - kernel.c_str(), - cubin.c_str(), - common_cflags.c_str()); - - printf("%s\n", command.c_str()); - - if (system(command.c_str()) == -1) { - cuda_error_message( - "Failed to execute compilation command, " - "see console for details."); - return ""; - } - - /* Verify if compilation succeeded */ - if (!path_exists(cubin)) { - cuda_error_message( - "CUDA kernel compilation failed, " - "see console for details."); - return ""; - } - - printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime); - - return cubin; - } - - bool load_kernels(const DeviceRequestedFeatures &requested_features) - { - /* TODO(sergey): Support kernels re-load for CUDA devices. - * - * Currently re-loading kernel will invalidate memory pointers, - * causing problems in cuCtxSynchronize. - */ - if (cuFilterModule && cuModule) { - VLOG(1) << "Skipping kernel reload, not currently supported."; - return true; - } - - /* check if cuda init succeeded */ - if (cuContext == 0) - return false; - - /* check if GPU is supported */ - if (!support_device(requested_features)) - return false; - - /* get kernel */ - string cubin = compile_kernel(requested_features, false, use_split_kernel()); - if (cubin == "") - return false; - - string filter_cubin = compile_kernel(requested_features, true, false); - if (filter_cubin == "") - return false; - - /* open module */ - CUDAContextScope scope(this); - - string cubin_data; - CUresult result; - - if (path_read_text(cubin, cubin_data)) - result = cuModuleLoadData(&cuModule, cubin_data.c_str()); - else - result = CUDA_ERROR_FILE_NOT_FOUND; - - if (cuda_error_(result, "cuModuleLoad")) - cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str())); - - if (path_read_text(filter_cubin, cubin_data)) - result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str()); - else - result = CUDA_ERROR_FILE_NOT_FOUND; - - if (cuda_error_(result, "cuModuleLoad")) - cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str())); - - if (result == CUDA_SUCCESS) { - reserve_local_memory(requested_features); - } - - return (result == CUDA_SUCCESS); - } - - void reserve_local_memory(const DeviceRequestedFeatures &requested_features) - { - if (use_split_kernel()) { - /* Split kernel mostly uses global memory and adaptive compilation, - * difficult to predict how much is needed currently. */ - return; - } - - /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory - * needed for kernel launches, so that we can reliably figure out when - * to allocate scene data in mapped host memory. */ - CUDAContextScope scope(this); - - size_t total = 0, free_before = 0, free_after = 0; - cuMemGetInfo(&free_before, &total); - - /* Get kernel function. 
*/ - CUfunction cuPathTrace; - - if (requested_features.use_integrator_branched) { - cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace")); - } - else { - cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace")); - } - - cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1)); - - int min_blocks, num_threads_per_block; - cuda_assert(cuOccupancyMaxPotentialBlockSize( - &min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0)); - - /* Launch kernel, using just 1 block appears sufficient to reserve - * memory for all multiprocessors. It would be good to do this in - * parallel for the multi GPU case still to make it faster. */ - CUdeviceptr d_work_tiles = 0; - uint total_work_size = 0; - - void *args[] = {&d_work_tiles, &total_work_size}; - - cuda_assert(cuLaunchKernel(cuPathTrace, 1, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); - - cuda_assert(cuCtxSynchronize()); - - cuMemGetInfo(&free_after, &total); - VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after) - << " bytes. (" << string_human_readable_size(free_before - free_after) << ")"; - -#if 0 - /* For testing mapped host memory, fill up device memory. */ - const size_t keep_mb = 1024; - - while (free_after > keep_mb * 1024 * 1024LL) { - CUdeviceptr tmp; - cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL)); - cuMemGetInfo(&free_after, &total); - } -#endif - } - - void init_host_memory() - { - /* Limit amount of host mapped memory, because allocating too much can - * cause system instability. Leave at least half or 4 GB of system - * memory free, whichever is smaller. */ - size_t default_limit = 4 * 1024 * 1024 * 1024LL; - size_t system_ram = system_physical_ram(); - - if (system_ram > 0) { - if (system_ram / 2 > default_limit) { - map_host_limit = system_ram - default_limit; - } - else { - map_host_limit = system_ram / 2; - } - } - else { - VLOG(1) << "Mapped host memory disabled, failed to get system RAM"; - map_host_limit = 0; - } - - /* Amount of device memory to keep is free after texture memory - * and working memory allocations respectively. We set the working - * memory limit headroom lower so that some space is left after all - * texture memory allocations. */ - device_working_headroom = 32 * 1024 * 1024LL; // 32MB - device_texture_headroom = 128 * 1024 * 1024LL; // 128MB - - VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit) - << " bytes. (" << string_human_readable_size(map_host_limit) << ")"; - } - - void load_texture_info() - { - if (need_texture_info) { - texture_info.copy_to_device(); - need_texture_info = false; - } - } - - void move_textures_to_host(size_t size, bool for_texture) - { - /* Signal to reallocate textures in host memory only. */ - move_texture_to_host = true; - - while (size > 0) { - /* Find suitable memory allocation to move. */ - device_memory *max_mem = NULL; - size_t max_size = 0; - bool max_is_image = false; - - foreach (CUDAMemMap::value_type &pair, cuda_mem_map) { - device_memory &mem = *pair.first; - CUDAMem *cmem = &pair.second; - - bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info); - bool is_image = is_texture && (mem.data_height > 1); - - /* Can't move this type of memory. */ - if (!is_texture || cmem->array) { - continue; - } - - /* Already in host memory. */ - if (cmem->use_mapped_host) { - continue; - } - - /* For other textures, only move image textures. 
*/ - if (for_texture && !is_image) { - continue; - } - - /* Try to move largest allocation, prefer moving images. */ - if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) { - max_is_image = is_image; - max_size = mem.device_size; - max_mem = &mem; - } - } - - /* Move to host memory. This part is mutex protected since - * multiple CUDA devices could be moving the memory. The - * first one will do it, and the rest will adopt the pointer. */ - if (max_mem) { - VLOG(1) << "Move memory from device to host: " << max_mem->name; - - static thread_mutex move_mutex; - thread_scoped_lock lock(move_mutex); - - /* Preserve the original device pointer, in case of multi device - * we can't change it because the pointer mapping would break. */ - device_ptr prev_pointer = max_mem->device_pointer; - size_t prev_size = max_mem->device_size; - - tex_free(*max_mem); - tex_alloc(*max_mem); - size = (max_size >= size) ? 0 : size - max_size; - - max_mem->device_pointer = prev_pointer; - max_mem->device_size = prev_size; - } - else { - break; - } - } - - /* Update texture info array with new pointers. */ - load_texture_info(); - - move_texture_to_host = false; - } - - CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0) - { - CUDAContextScope scope(this); - - CUdeviceptr device_pointer = 0; - size_t size = mem.memory_size() + pitch_padding; - - CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY; - const char *status = ""; - - /* First try allocating in device memory, respecting headroom. We make - * an exception for texture info. It is small and frequently accessed, - * so treat it as working memory. - * - * If there is not enough room for working memory, we will try to move - * textures to host memory, assuming the performance impact would have - * been worse for working memory. */ - bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info); - bool is_image = is_texture && (mem.data_height > 1); - - size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom; - - size_t total = 0, free = 0; - cuMemGetInfo(&free, &total); - - /* Move textures to host memory if needed. */ - if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) { - move_textures_to_host(size + headroom - free, is_texture); - cuMemGetInfo(&free, &total); - } - - /* Allocate in device memory. */ - if (!move_texture_to_host && (size + headroom) < free) { - mem_alloc_result = cuMemAlloc(&device_pointer, size); - if (mem_alloc_result == CUDA_SUCCESS) { - status = " in device memory"; - } - } - - /* Fall back to mapped host memory if needed and possible. */ - - void *shared_pointer = 0; - - if (mem_alloc_result != CUDA_SUCCESS && can_map_host) { - if (mem.shared_pointer) { - /* Another device already allocated host memory. */ - mem_alloc_result = CUDA_SUCCESS; - shared_pointer = mem.shared_pointer; - } - else if (map_host_used + size < map_host_limit) { - /* Allocate host memory ourselves. 
*/ - mem_alloc_result = cuMemHostAlloc( - &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED); - - assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) || - (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0)); - } - - if (mem_alloc_result == CUDA_SUCCESS) { - cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0)); - map_host_used += size; - status = " in host memory"; - } - else { - status = " failed, out of host memory"; - } - } - - if (mem_alloc_result != CUDA_SUCCESS) { - status = " failed, out of device and host memory"; - cuda_assert(mem_alloc_result); - } - - if (mem.name) { - VLOG(1) << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")" << status; - } - - mem.device_pointer = (device_ptr)device_pointer; - mem.device_size = size; - stats.mem_alloc(size); - - if (!mem.device_pointer) { - return NULL; - } - - /* Insert into map of allocations. */ - CUDAMem *cmem = &cuda_mem_map[&mem]; - if (shared_pointer != 0) { - /* Replace host pointer with our host allocation. Only works if - * CUDA memory layout is the same and has no pitch padding. Also - * does not work if we move textures to host during a render, - * since other devices might be using the memory. */ - - if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer && - mem.host_pointer != shared_pointer) { - memcpy(shared_pointer, mem.host_pointer, size); - - /* A Call to device_memory::host_free() should be preceded by - * a call to device_memory::device_free() for host memory - * allocated by a device to be handled properly. Two exceptions - * are here and a call in OptiXDevice::generic_alloc(), where - * the current host memory can be assumed to be allocated by - * device_memory::host_alloc(), not by a device */ - - mem.host_free(); - mem.host_pointer = shared_pointer; - } - mem.shared_pointer = shared_pointer; - mem.shared_counter++; - cmem->use_mapped_host = true; - } - else { - cmem->use_mapped_host = false; - } - - return cmem; - } - - void generic_copy_to(device_memory &mem) - { - if (mem.host_pointer && mem.device_pointer) { - CUDAContextScope scope(this); - - /* If use_mapped_host of mem is false, the current device only - * uses device memory allocated by cuMemAlloc regardless of - * mem.host_pointer and mem.shared_pointer, and should copy - * data from mem.host_pointer. */ - - if (cuda_mem_map[&mem].use_mapped_host == false || mem.host_pointer != mem.shared_pointer) { - cuda_assert(cuMemcpyHtoD( - cuda_device_ptr(mem.device_pointer), mem.host_pointer, mem.memory_size())); - } - } - } - - void generic_free(device_memory &mem) - { - if (mem.device_pointer) { - CUDAContextScope scope(this); - const CUDAMem &cmem = cuda_mem_map[&mem]; - - /* If cmem.use_mapped_host is true, reference counting is used - * to safely free a mapped host memory. */ - - if (cmem.use_mapped_host) { - assert(mem.shared_pointer); - if (mem.shared_pointer) { - assert(mem.shared_counter > 0); - if (--mem.shared_counter == 0) { - if (mem.host_pointer == mem.shared_pointer) { - mem.host_pointer = 0; - } - cuMemFreeHost(mem.shared_pointer); - mem.shared_pointer = 0; - } - } - map_host_used -= mem.device_size; - } - else { - /* Free device memory. 
*/ - cuMemFree(mem.device_pointer); - } - - stats.mem_free(mem.device_size); - mem.device_pointer = 0; - mem.device_size = 0; - - cuda_mem_map.erase(cuda_mem_map.find(&mem)); - } - } - - void mem_alloc(device_memory &mem) - { - if (mem.type == MEM_PIXELS && !background) { - pixels_alloc(mem); - } - else if (mem.type == MEM_TEXTURE) { - assert(!"mem_alloc not supported for textures."); - } - else { - generic_alloc(mem); - } - } - - void mem_copy_to(device_memory &mem) - { - if (mem.type == MEM_PIXELS) { - assert(!"mem_copy_to not supported for pixels."); - } - else if (mem.type == MEM_TEXTURE) { - tex_free(mem); - tex_alloc(mem); - } - else { - if (!mem.device_pointer) { - generic_alloc(mem); - } - - generic_copy_to(mem); - } - } - - void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) - { - if (mem.type == MEM_PIXELS && !background) { - pixels_copy_from(mem, y, w, h); - } - else if (mem.type == MEM_TEXTURE) { - assert(!"mem_copy_from not supported for textures."); - } - else if (mem.host_pointer) { - const size_t size = elem * w * h; - const size_t offset = elem * y * w; - - if (mem.device_pointer) { - const CUDAContextScope scope(this); - cuda_assert(cuMemcpyDtoH( - (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size)); - } - else { - memset((char *)mem.host_pointer + offset, 0, size); - } - } - } - - void mem_zero(device_memory &mem) - { - if (!mem.device_pointer) { - mem_alloc(mem); - } - if (!mem.device_pointer) { - return; - } - - /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory - * regardless of mem.host_pointer and mem.shared_pointer. */ - if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) { - const CUDAContextScope scope(this); - cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size())); - } - else if (mem.host_pointer) { - memset(mem.host_pointer, 0, mem.memory_size()); - } - } - - void mem_free(device_memory &mem) - { - if (mem.type == MEM_PIXELS && !background) { - pixels_free(mem); - } - else if (mem.type == MEM_TEXTURE) { - tex_free(mem); - } - else { - generic_free(mem); - } - } - - virtual device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) - { - return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset)); - } - - void const_copy_to(const char *name, void *host, size_t size) - { - CUDAContextScope scope(this); - CUdeviceptr mem; - size_t bytes; - - cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name)); - // assert(bytes == size); - cuda_assert(cuMemcpyHtoD(mem, host, size)); - } - - void tex_alloc(device_memory &mem) - { - CUDAContextScope scope(this); - - /* General variables for both architectures */ - string bind_name = mem.name; - size_t dsize = datatype_size(mem.data_type); - size_t size = mem.memory_size(); - - CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP; - switch (mem.extension) { - case EXTENSION_REPEAT: - address_mode = CU_TR_ADDRESS_MODE_WRAP; - break; - case EXTENSION_EXTEND: - address_mode = CU_TR_ADDRESS_MODE_CLAMP; - break; - case EXTENSION_CLIP: - address_mode = CU_TR_ADDRESS_MODE_BORDER; - break; - default: - assert(0); - break; - } - - CUfilter_mode filter_mode; - if (mem.interpolation == INTERPOLATION_CLOSEST) { - filter_mode = CU_TR_FILTER_MODE_POINT; - } - else { - filter_mode = CU_TR_FILTER_MODE_LINEAR; - } - - /* Data Storage */ - if (mem.interpolation == INTERPOLATION_NONE) { - generic_alloc(mem); - generic_copy_to(mem); - - CUdeviceptr cumem; 
- size_t cubytes; - - cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str())); - - if (cubytes == 8) { - /* 64 bit device pointer */ - uint64_t ptr = mem.device_pointer; - cuda_assert(cuMemcpyHtoD(cumem, (void *)&ptr, cubytes)); - } - else { - /* 32 bit device pointer */ - uint32_t ptr = (uint32_t)mem.device_pointer; - cuda_assert(cuMemcpyHtoD(cumem, (void *)&ptr, cubytes)); - } - return; - } - - /* Image Texture Storage */ - CUarray_format_enum format; - switch (mem.data_type) { - case TYPE_UCHAR: - format = CU_AD_FORMAT_UNSIGNED_INT8; - break; - case TYPE_UINT16: - format = CU_AD_FORMAT_UNSIGNED_INT16; - break; - case TYPE_UINT: - format = CU_AD_FORMAT_UNSIGNED_INT32; - break; - case TYPE_INT: - format = CU_AD_FORMAT_SIGNED_INT32; - break; - case TYPE_FLOAT: - format = CU_AD_FORMAT_FLOAT; - break; - case TYPE_HALF: - format = CU_AD_FORMAT_HALF; - break; - default: - assert(0); - return; - } - - CUDAMem *cmem = NULL; - CUarray array_3d = NULL; - size_t src_pitch = mem.data_width * dsize * mem.data_elements; - size_t dst_pitch = src_pitch; - - if (mem.data_depth > 1) { - /* 3D texture using array, there is no API for linear memory. */ - CUDA_ARRAY3D_DESCRIPTOR desc; - - desc.Width = mem.data_width; - desc.Height = mem.data_height; - desc.Depth = mem.data_depth; - desc.Format = format; - desc.NumChannels = mem.data_elements; - desc.Flags = 0; - - VLOG(1) << "Array 3D allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - - cuda_assert(cuArray3DCreate(&array_3d, &desc)); - - if (!array_3d) { - return; - } - - CUDA_MEMCPY3D param; - memset(¶m, 0, sizeof(param)); - param.dstMemoryType = CU_MEMORYTYPE_ARRAY; - param.dstArray = array_3d; - param.srcMemoryType = CU_MEMORYTYPE_HOST; - param.srcHost = mem.host_pointer; - param.srcPitch = src_pitch; - param.WidthInBytes = param.srcPitch; - param.Height = mem.data_height; - param.Depth = mem.data_depth; - - cuda_assert(cuMemcpy3D(¶m)); - - mem.device_pointer = (device_ptr)array_3d; - mem.device_size = size; - stats.mem_alloc(size); - - cmem = &cuda_mem_map[&mem]; - cmem->texobject = 0; - cmem->array = array_3d; - } - else if (mem.data_height > 0) { - /* 2D texture, using pitch aligned linear memory. */ - int alignment = 0; - cuda_assert( - cuDeviceGetAttribute(&alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice)); - dst_pitch = align_up(src_pitch, alignment); - size_t dst_size = dst_pitch * mem.data_height; - - cmem = generic_alloc(mem, dst_size - mem.memory_size()); - if (!cmem) { - return; - } - - CUDA_MEMCPY2D param; - memset(¶m, 0, sizeof(param)); - param.dstMemoryType = CU_MEMORYTYPE_DEVICE; - param.dstDevice = mem.device_pointer; - param.dstPitch = dst_pitch; - param.srcMemoryType = CU_MEMORYTYPE_HOST; - param.srcHost = mem.host_pointer; - param.srcPitch = src_pitch; - param.WidthInBytes = param.srcPitch; - param.Height = mem.data_height; - - cuda_assert(cuMemcpy2DUnaligned(¶m)); - } - else { - /* 1D texture, using linear memory. */ - cmem = generic_alloc(mem); - if (!cmem) { - return; - } - - cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size)); - } - - /* Kepler+, bindless textures. 
*/ - int flat_slot = 0; - if (string_startswith(mem.name, "__tex_image")) { - int pos = string(mem.name).rfind("_"); - flat_slot = atoi(mem.name + pos + 1); - } - else { - assert(0); - } - - CUDA_RESOURCE_DESC resDesc; - memset(&resDesc, 0, sizeof(resDesc)); - - if (array_3d) { - resDesc.resType = CU_RESOURCE_TYPE_ARRAY; - resDesc.res.array.hArray = array_3d; - resDesc.flags = 0; - } - else if (mem.data_height > 0) { - resDesc.resType = CU_RESOURCE_TYPE_PITCH2D; - resDesc.res.pitch2D.devPtr = mem.device_pointer; - resDesc.res.pitch2D.format = format; - resDesc.res.pitch2D.numChannels = mem.data_elements; - resDesc.res.pitch2D.height = mem.data_height; - resDesc.res.pitch2D.width = mem.data_width; - resDesc.res.pitch2D.pitchInBytes = dst_pitch; - } - else { - resDesc.resType = CU_RESOURCE_TYPE_LINEAR; - resDesc.res.linear.devPtr = mem.device_pointer; - resDesc.res.linear.format = format; - resDesc.res.linear.numChannels = mem.data_elements; - resDesc.res.linear.sizeInBytes = mem.device_size; - } - - CUDA_TEXTURE_DESC texDesc; - memset(&texDesc, 0, sizeof(texDesc)); - texDesc.addressMode[0] = address_mode; - texDesc.addressMode[1] = address_mode; - texDesc.addressMode[2] = address_mode; - texDesc.filterMode = filter_mode; - texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES; - - cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL)); - - /* Resize once */ - if (flat_slot >= texture_info.size()) { - /* Allocate some slots in advance, to reduce amount - * of re-allocations. */ - texture_info.resize(flat_slot + 128); - } - - /* Set Mapping and tag that we need to (re-)upload to device */ - TextureInfo &info = texture_info[flat_slot]; - info.data = (uint64_t)cmem->texobject; - info.cl_buffer = 0; - info.interpolation = mem.interpolation; - info.extension = mem.extension; - info.width = mem.data_width; - info.height = mem.data_height; - info.depth = mem.data_depth; - need_texture_info = true; - } - - void tex_free(device_memory &mem) - { - if (mem.device_pointer) { - CUDAContextScope scope(this); - const CUDAMem &cmem = cuda_mem_map[&mem]; - - if (cmem.texobject) { - /* Free bindless texture. */ - cuTexObjectDestroy(cmem.texobject); - } - - if (cmem.array) { - /* Free array. */ - cuArrayDestroy(cmem.array); - stats.mem_free(mem.device_size); - mem.device_pointer = 0; - mem.device_size = 0; - - cuda_mem_map.erase(cuda_mem_map.find(&mem)); - } - else { - generic_free(mem); - } - } - } - -#define CUDA_GET_BLOCKSIZE(func, w, h) \ - int threads_per_block; \ - cuda_assert( \ - cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ - int threads = (int)sqrt((float)threads_per_block); \ - int xblocks = ((w) + threads - 1) / threads; \ - int yblocks = ((h) + threads - 1) / threads; - -#define CUDA_LAUNCH_KERNEL(func, args) \ - cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0)); - -/* Similar as above, but for 1-dimensional blocks. 
*/ -#define CUDA_GET_BLOCKSIZE_1D(func, w, h) \ - int threads_per_block; \ - cuda_assert( \ - cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ - int xblocks = ((w) + threads_per_block - 1) / threads_per_block; \ - int yblocks = h; - -#define CUDA_LAUNCH_KERNEL_1D(func, args) \ - cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads_per_block, 1, 1, 0, 0, args, 0)); - - bool denoising_non_local_means(device_ptr image_ptr, - device_ptr guide_ptr, - device_ptr variance_ptr, - device_ptr out_ptr, - DenoisingTask *task) - { - if (have_error()) - return false; - - CUDAContextScope scope(this); - - int stride = task->buffer.stride; - int w = task->buffer.width; - int h = task->buffer.h; - int r = task->nlm_state.r; - int f = task->nlm_state.f; - float a = task->nlm_state.a; - float k_2 = task->nlm_state.k_2; - - int pass_stride = task->buffer.pass_stride; - int num_shifts = (2 * r + 1) * (2 * r + 1); - int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0; - int frame_offset = 0; - - if (have_error()) - return false; - - CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer); - CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts; - CUdeviceptr weightAccum = difference + 2 * sizeof(float) * pass_stride * num_shifts; - CUdeviceptr scale_ptr = 0; - - cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float) * pass_stride)); - cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float) * pass_stride)); - - { - CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput; - cuda_assert(cuModuleGetFunction( - &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); - cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); - cuda_assert(cuModuleGetFunction( - &cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); - cuda_assert(cuModuleGetFunction( - &cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output")); - - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1)); - - CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w * h, num_shifts); - - void *calc_difference_args[] = {&guide_ptr, - &variance_ptr, - &scale_ptr, - &difference, - &w, - &h, - &stride, - &pass_stride, - &r, - &channel_offset, - &frame_offset, - &a, - &k_2}; - void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; - void *calc_weight_args[] = { - &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; - void *update_output_args[] = {&blurDifference, - &image_ptr, - &out_ptr, - &weightAccum, - &w, - &h, - &stride, - &pass_stride, - &channel_offset, - &r, - &f}; - - CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args); - } - - { - CUfunction cuNLMNormalize; - cuda_assert(cuModuleGetFunction( - &cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize")); - cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1)); - void *normalize_args[] = {&out_ptr, 
&weightAccum, &w, &h, &stride}; - CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h); - CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args); - cuda_assert(cuCtxSynchronize()); - } - - return !have_error(); - } - - bool denoising_construct_transform(DenoisingTask *task) - { - if (have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterConstructTransform; - cuda_assert(cuModuleGetFunction( - &cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform")); - cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED)); - CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, task->storage.w, task->storage.h); - - void *args[] = {&task->buffer.mem.device_pointer, - &task->tile_info_mem.device_pointer, - &task->storage.transform.device_pointer, - &task->storage.rank.device_pointer, - &task->filter_area, - &task->rect, - &task->radius, - &task->pca_threshold, - &task->buffer.pass_stride, - &task->buffer.frame_stride, - &task->buffer.use_time}; - CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); - } - - bool denoising_accumulate(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr scale_ptr, - int frame, - DenoisingTask *task) - { - if (have_error()) - return false; - - CUDAContextScope scope(this); - - int r = task->radius; - int f = 4; - float a = 1.0f; - float k_2 = task->nlm_k_2; - - int w = task->reconstruction_state.source_w; - int h = task->reconstruction_state.source_h; - int stride = task->buffer.stride; - int frame_offset = frame * task->buffer.frame_stride; - int t = task->tile_info->frames[frame]; - - int pass_stride = task->buffer.pass_stride; - int num_shifts = (2 * r + 1) * (2 * r + 1); - - if (have_error()) - return false; - - CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer); - CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts; - - CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian; - cuda_assert(cuModuleGetFunction( - &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); - cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); - cuda_assert(cuModuleGetFunction( - &cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); - cuda_assert(cuModuleGetFunction( - &cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian")); - - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED)); - - CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, - task->reconstruction_state.source_w * - task->reconstruction_state.source_h, - num_shifts); - - void *calc_difference_args[] = {&color_ptr, - &color_variance_ptr, - &scale_ptr, - &difference, - &w, - &h, - &stride, - &pass_stride, - &r, - &pass_stride, - &frame_offset, - &a, - &k_2}; - void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; - void *calc_weight_args[] = { - &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; - void *construct_gramian_args[] = {&t, - &blurDifference, - &task->buffer.mem.device_pointer, - &task->storage.transform.device_pointer, - &task->storage.rank.device_pointer, - 
&task->storage.XtWX.device_pointer, - &task->storage.XtWY.device_pointer, - &task->reconstruction_state.filter_window, - &w, - &h, - &stride, - &pass_stride, - &r, - &f, - &frame_offset, - &task->buffer.use_time}; - - CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); - } - - bool denoising_solve(device_ptr output_ptr, DenoisingTask *task) - { - CUfunction cuFinalize; - cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize")); - cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1)); - void *finalize_args[] = {&output_ptr, - &task->storage.rank.device_pointer, - &task->storage.XtWX.device_pointer, - &task->storage.XtWY.device_pointer, - &task->filter_area, - &task->reconstruction_state.buffer_params.x, - &task->render_buffer.samples}; - CUDA_GET_BLOCKSIZE( - cuFinalize, task->reconstruction_state.source_w, task->reconstruction_state.source_h); - CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); - } - - bool denoising_combine_halves(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr mean_ptr, - device_ptr variance_ptr, - int r, - int4 rect, - DenoisingTask *task) - { - if (have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterCombineHalves; - cuda_assert(cuModuleGetFunction( - &cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves")); - cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE( - cuFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y); - - void *args[] = {&mean_ptr, &variance_ptr, &a_ptr, &b_ptr, &rect, &r}; - CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); - } - - bool denoising_divide_shadow(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr sample_variance_ptr, - device_ptr sv_variance_ptr, - device_ptr buffer_variance_ptr, - DenoisingTask *task) - { - if (have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterDivideShadow; - cuda_assert(cuModuleGetFunction( - &cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow")); - cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE( - cuFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y); - - void *args[] = {&task->render_buffer.samples, - &task->tile_info_mem.device_pointer, - &a_ptr, - &b_ptr, - &sample_variance_ptr, - &sv_variance_ptr, - &buffer_variance_ptr, - &task->rect, - &task->render_buffer.pass_stride, - &task->render_buffer.offset}; - CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); - } - - bool denoising_get_feature(int mean_offset, - int variance_offset, - device_ptr mean_ptr, - device_ptr variance_ptr, - float scale, - DenoisingTask *task) - { - if (have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterGetFeature; - cuda_assert(cuModuleGetFunction( - &cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature")); - cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1)); - 
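    /* CUDA_GET_BLOCKSIZE launches square blocks over the filter rect: it reads
     * the kernel's maximum threads per block and takes the integer square root
     * as the block edge. Illustrative numbers only: a 1024-thread limit gives
     * 32x32 blocks, so a 960x540 rect would launch a 30x17 grid. */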
CUDA_GET_BLOCKSIZE( - cuFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y); - - void *args[] = {&task->render_buffer.samples, - &task->tile_info_mem.device_pointer, - &mean_offset, - &variance_offset, - &mean_ptr, - &variance_ptr, - &scale, - &task->rect, - &task->render_buffer.pass_stride, - &task->render_buffer.offset}; - CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); - } - - bool denoising_write_feature(int out_offset, - device_ptr from_ptr, - device_ptr buffer_ptr, - DenoisingTask *task) - { - if (have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterWriteFeature; - cuda_assert(cuModuleGetFunction( - &cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature")); - cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, task->filter_area.z, task->filter_area.w); - - void *args[] = {&task->render_buffer.samples, - &task->reconstruction_state.buffer_params, - &task->filter_area, - &from_ptr, - &buffer_ptr, - &out_offset, - &task->rect}; - CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); - } - - bool denoising_detect_outliers(device_ptr image_ptr, - device_ptr variance_ptr, - device_ptr depth_ptr, - device_ptr output_ptr, - DenoisingTask *task) - { - if (have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterDetectOutliers; - cuda_assert(cuModuleGetFunction( - &cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers")); - cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE( - cuFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y); - - void *args[] = {&image_ptr, - &variance_ptr, - &depth_ptr, - &output_ptr, - &task->rect, - &task->buffer.pass_stride}; - - CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); - } - - void denoise(RenderTile &rtile, DenoisingTask &denoising) - { - denoising.functions.construct_transform = function_bind( - &CUDADevice::denoising_construct_transform, this, &denoising); - denoising.functions.accumulate = function_bind( - &CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); - denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising); - denoising.functions.divide_shadow = function_bind( - &CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.non_local_means = function_bind( - &CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); - denoising.functions.combine_halves = function_bind( - &CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); - denoising.functions.get_feature = function_bind( - &CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.write_feature = function_bind( - &CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising); - denoising.functions.detect_outliers = function_bind( - &CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); - - denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); - denoising.render_buffer.samples = rtile.sample; - denoising.buffer.gpu_temporary_mem = true; - - denoising.run_denoising(&rtile); - } - - void path_trace(DeviceTask &task, RenderTile &rtile, 
device_vector<WorkTile> &work_tiles) - { - scoped_timer timer(&rtile.buffers->render_time); - - if (have_error()) - return; - - CUDAContextScope scope(this); - CUfunction cuPathTrace; - - /* Get kernel function. */ - if (task.integrator_branched) { - cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace")); - } - else { - cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace")); - } - - if (have_error()) { - return; - } - - cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1)); - - /* Allocate work tile. */ - work_tiles.alloc(1); - - WorkTile *wtile = work_tiles.data(); - wtile->x = rtile.x; - wtile->y = rtile.y; - wtile->w = rtile.w; - wtile->h = rtile.h; - wtile->offset = rtile.offset; - wtile->stride = rtile.stride; - wtile->buffer = (float *)cuda_device_ptr(rtile.buffer); - - /* Prepare work size. More step samples render faster, but for now we - * remain conservative for GPUs connected to a display to avoid driver - * timeouts and display freezing. */ - int min_blocks, num_threads_per_block; - cuda_assert(cuOccupancyMaxPotentialBlockSize( - &min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0)); - if (!info.display_device) { - min_blocks *= 8; - } - - uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h); - - /* Render all samples. */ - int start_sample = rtile.start_sample; - int end_sample = rtile.start_sample + rtile.num_samples; - - for (int sample = start_sample; sample < end_sample; sample += step_samples) { - /* Setup and copy work tile to device. */ - wtile->start_sample = sample; - wtile->num_samples = min(step_samples, end_sample - sample); - work_tiles.copy_to_device(); - - CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer); - uint total_work_size = wtile->w * wtile->h * wtile->num_samples; - uint num_blocks = divide_up(total_work_size, num_threads_per_block); - - /* Launch kernel. */ - void *args[] = {&d_work_tiles, &total_work_size}; - - cuda_assert(cuLaunchKernel( - cuPathTrace, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); - - cuda_assert(cuCtxSynchronize()); - - /* Update progress. */ - rtile.sample = sample + wtile->num_samples; - task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); - - if (task.get_cancel()) { - if (task.need_finish_queue == false) - break; - } - } - } - - void film_convert(DeviceTask &task, - device_ptr buffer, - device_ptr rgba_byte, - device_ptr rgba_half) - { - if (have_error()) - return; - - CUDAContextScope scope(this); - - CUfunction cuFilmConvert; - CUdeviceptr d_rgba = map_pixels((rgba_byte) ? 
rgba_byte : rgba_half); - CUdeviceptr d_buffer = cuda_device_ptr(buffer); - - /* get kernel function */ - if (rgba_half) { - cuda_assert( - cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float")); - } - else { - cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte")); - } - - float sample_scale = 1.0f / (task.sample + 1); - - /* pass in parameters */ - void *args[] = {&d_rgba, - &d_buffer, - &sample_scale, - &task.x, - &task.y, - &task.w, - &task.h, - &task.offset, - &task.stride}; - - /* launch kernel */ - int threads_per_block; - cuda_assert(cuFuncGetAttribute( - &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert)); - - int xthreads = (int)sqrt(threads_per_block); - int ythreads = (int)sqrt(threads_per_block); - int xblocks = (task.w + xthreads - 1) / xthreads; - int yblocks = (task.h + ythreads - 1) / ythreads; - - cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1)); - - cuda_assert(cuLaunchKernel(cuFilmConvert, - xblocks, - yblocks, - 1, /* blocks */ - xthreads, - ythreads, - 1, /* threads */ - 0, - 0, - args, - 0)); - - unmap_pixels((rgba_byte) ? rgba_byte : rgba_half); - - cuda_assert(cuCtxSynchronize()); - } - - void shader(DeviceTask &task) - { - if (have_error()) - return; - - CUDAContextScope scope(this); - - CUfunction cuShader; - CUdeviceptr d_input = cuda_device_ptr(task.shader_input); - CUdeviceptr d_output = cuda_device_ptr(task.shader_output); - - /* get kernel function */ - if (task.shader_eval_type >= SHADER_EVAL_BAKE) { - cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_bake")); - } - else if (task.shader_eval_type == SHADER_EVAL_DISPLACE) { - cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace")); - } - else { - cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background")); - } - - /* do tasks in smaller chunks, so we can cancel it */ - const int shader_chunk_size = 65536; - const int start = task.shader_x; - const int end = task.shader_x + task.shader_w; - int offset = task.offset; - - bool canceled = false; - for (int sample = 0; sample < task.num_samples && !canceled; sample++) { - for (int shader_x = start; shader_x < end; shader_x += shader_chunk_size) { - int shader_w = min(shader_chunk_size, end - shader_x); - - /* pass in parameters */ - void *args[8]; - int arg = 0; - args[arg++] = &d_input; - args[arg++] = &d_output; - args[arg++] = &task.shader_eval_type; - if (task.shader_eval_type >= SHADER_EVAL_BAKE) { - args[arg++] = &task.shader_filter; - } - args[arg++] = &shader_x; - args[arg++] = &shader_w; - args[arg++] = &offset; - args[arg++] = &sample; - - /* launch kernel */ - int threads_per_block; - cuda_assert(cuFuncGetAttribute( - &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader)); - - int xblocks = (shader_w + threads_per_block - 1) / threads_per_block; - - cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuLaunchKernel(cuShader, - xblocks, - 1, - 1, /* blocks */ - threads_per_block, - 1, - 1, /* threads */ - 0, - 0, - args, - 0)); - - cuda_assert(cuCtxSynchronize()); - - if (task.get_cancel()) { - canceled = true; - break; - } - } - - task.update_progress(NULL); - } - } - - CUdeviceptr map_pixels(device_ptr mem) - { - if (!background) { - PixelMem pmem = pixel_mem_map[mem]; - CUdeviceptr buffer; - - size_t bytes; - cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0)); - cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, 
&bytes, pmem.cuPBOresource)); - - return buffer; - } - - return cuda_device_ptr(mem); - } - - void unmap_pixels(device_ptr mem) - { - if (!background) { - PixelMem pmem = pixel_mem_map[mem]; - - cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0)); - } - } - - void pixels_alloc(device_memory &mem) - { - PixelMem pmem; - - pmem.w = mem.data_width; - pmem.h = mem.data_height; - - CUDAContextScope scope(this); - - glGenBuffers(1, &pmem.cuPBO); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); - if (mem.data_type == TYPE_HALF) - glBufferData( - GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(GLhalf) * 4, NULL, GL_DYNAMIC_DRAW); - else - glBufferData( - GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(uint8_t) * 4, NULL, GL_DYNAMIC_DRAW); - - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - - glActiveTexture(GL_TEXTURE0); - glGenTextures(1, &pmem.cuTexId); - glBindTexture(GL_TEXTURE_2D, pmem.cuTexId); - if (mem.data_type == TYPE_HALF) - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL); - else - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); - glBindTexture(GL_TEXTURE_2D, 0); - - CUresult result = cuGraphicsGLRegisterBuffer( - &pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE); - - if (result == CUDA_SUCCESS) { - mem.device_pointer = pmem.cuTexId; - pixel_mem_map[mem.device_pointer] = pmem; - - mem.device_size = mem.memory_size(); - stats.mem_alloc(mem.device_size); - - return; - } - else { - /* failed to register buffer, fallback to no interop */ - glDeleteBuffers(1, &pmem.cuPBO); - glDeleteTextures(1, &pmem.cuTexId); - - background = true; - } - } - - void pixels_copy_from(device_memory &mem, int y, int w, int h) - { - PixelMem pmem = pixel_mem_map[mem.device_pointer]; - - CUDAContextScope scope(this); - - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); - uchar *pixels = (uchar *)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY); - size_t offset = sizeof(uchar) * 4 * y * w; - memcpy((uchar *)mem.host_pointer + offset, pixels + offset, sizeof(uchar) * 4 * w * h); - glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - } - - void pixels_free(device_memory &mem) - { - if (mem.device_pointer) { - PixelMem pmem = pixel_mem_map[mem.device_pointer]; - - CUDAContextScope scope(this); - - cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource)); - glDeleteBuffers(1, &pmem.cuPBO); - glDeleteTextures(1, &pmem.cuTexId); - - pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer)); - mem.device_pointer = 0; - - stats.mem_free(mem.device_size); - mem.device_size = 0; - } - } - - void draw_pixels(device_memory &mem, - int y, - int w, - int h, - int width, - int height, - int dx, - int dy, - int dw, - int dh, - bool transparent, - const DeviceDrawParams &draw_params) - { - assert(mem.type == MEM_PIXELS); - - if (!background) { - const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL); - PixelMem pmem = pixel_mem_map[mem.device_pointer]; - float *vpointer; - - CUDAContextScope scope(this); - - /* for multi devices, this assumes the inefficient method that we allocate - * all pixels on the device even though we only render to a subset */ - size_t offset = 4 * y * w; - - if (mem.data_type == TYPE_HALF) - offset *= sizeof(GLhalf); - else - offset *= sizeof(uint8_t); - - 
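      /* While a pixel unpack buffer is bound, the data argument of
       * glTexSubImage2D is interpreted as a byte offset into the PBO rather
       * than a client memory pointer, hence the (void *)offset casts below. */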
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); - glActiveTexture(GL_TEXTURE0); - glBindTexture(GL_TEXTURE_2D, pmem.cuTexId); - if (mem.data_type == TYPE_HALF) { - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void *)offset); - } - else { - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void *)offset); - } - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - - if (transparent) { - glEnable(GL_BLEND); - glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA); - } - - GLint shader_program; - if (use_fallback_shader) { - if (!bind_fallback_display_space_shader(dw, dh)) { - return; - } - shader_program = fallback_shader_program; - } - else { - draw_params.bind_display_space_shader_cb(); - glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program); - } - - if (!vertex_buffer) { - glGenBuffers(1, &vertex_buffer); - } - - glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer); - /* invalidate old contents - - * avoids stalling if buffer is still waiting in queue to be rendered */ - glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW); - - vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY); - - if (vpointer) { - /* texture coordinate - vertex pair */ - vpointer[0] = 0.0f; - vpointer[1] = 0.0f; - vpointer[2] = dx; - vpointer[3] = dy; - - vpointer[4] = (float)w / (float)pmem.w; - vpointer[5] = 0.0f; - vpointer[6] = (float)width + dx; - vpointer[7] = dy; - - vpointer[8] = (float)w / (float)pmem.w; - vpointer[9] = (float)h / (float)pmem.h; - vpointer[10] = (float)width + dx; - vpointer[11] = (float)height + dy; - - vpointer[12] = 0.0f; - vpointer[13] = (float)h / (float)pmem.h; - vpointer[14] = dx; - vpointer[15] = (float)height + dy; - - glUnmapBuffer(GL_ARRAY_BUFFER); - } - - GLuint vertex_array_object; - GLuint position_attribute, texcoord_attribute; - - glGenVertexArrays(1, &vertex_array_object); - glBindVertexArray(vertex_array_object); - - texcoord_attribute = glGetAttribLocation(shader_program, "texCoord"); - position_attribute = glGetAttribLocation(shader_program, "pos"); - - glEnableVertexAttribArray(texcoord_attribute); - glEnableVertexAttribArray(position_attribute); - - glVertexAttribPointer( - texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0); - glVertexAttribPointer(position_attribute, - 2, - GL_FLOAT, - GL_FALSE, - 4 * sizeof(float), - (const GLvoid *)(sizeof(float) * 2)); - - glDrawArrays(GL_TRIANGLE_FAN, 0, 4); - - if (use_fallback_shader) { - glUseProgram(0); - } - else { - draw_params.unbind_display_space_shader_cb(); - } - - if (transparent) { - glDisable(GL_BLEND); - } - - glBindTexture(GL_TEXTURE_2D, 0); - - return; - } - - Device::draw_pixels(mem, y, w, h, width, height, dx, dy, dw, dh, transparent, draw_params); - } - - void thread_run(DeviceTask *task) - { - CUDAContextScope scope(this); - - if (task->type == DeviceTask::RENDER || task->type == DeviceTask::DENOISE) { - DeviceRequestedFeatures requested_features; - if (use_split_kernel()) { - if (split_kernel == NULL) { - split_kernel = new CUDASplitKernel(this); - split_kernel->load_kernels(requested_features); - } - } - - device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY); - - /* keep rendering tiles until done */ - RenderTile tile; - DenoisingTask denoising(this, *task); - - while (task->acquire_tile(this, tile)) { - if (tile.task == RenderTile::PATH_TRACE) { - if (use_split_kernel()) { - device_only_memory<uchar> void_buffer(this, "void_buffer"); - split_kernel->path_trace(task, tile, void_buffer, void_buffer); - } - else { - 
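          /* Megakernel path: render the tile with the single monolithic
           * path tracing kernel. */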
path_trace(*task, tile, work_tiles); - } - } - else if (tile.task == RenderTile::DENOISE) { - tile.sample = tile.start_sample + tile.num_samples; - - denoise(tile, denoising); - - task->update_progress(&tile, tile.w * tile.h); - } - - task->release_tile(tile); - - if (task->get_cancel()) { - if (task->need_finish_queue == false) - break; - } - } - - work_tiles.free(); - } - else if (task->type == DeviceTask::SHADER) { - shader(*task); - - cuda_assert(cuCtxSynchronize()); - } - else if (task->type == DeviceTask::DENOISE_BUFFER) { - RenderTile tile; - tile.x = task->x; - tile.y = task->y; - tile.w = task->w; - tile.h = task->h; - tile.buffer = task->buffer; - tile.sample = task->sample + task->num_samples; - tile.num_samples = task->num_samples; - tile.start_sample = task->sample; - tile.offset = task->offset; - tile.stride = task->stride; - tile.buffers = task->buffers; - - DenoisingTask denoising(this, *task); - denoise(tile, denoising); - task->update_progress(&tile, tile.w * tile.h); - } - } - - class CUDADeviceTask : public DeviceTask { - public: - CUDADeviceTask(CUDADevice *device, DeviceTask &task) : DeviceTask(task) - { - run = function_bind(&CUDADevice::thread_run, device, this); - } - }; - - void task_add(DeviceTask &task) - { - CUDAContextScope scope(this); - - /* Load texture info. */ - load_texture_info(); - - /* Synchronize all memory copies before executing task. */ - cuda_assert(cuCtxSynchronize()); - - if (task.type == DeviceTask::FILM_CONVERT) { - /* must be done in main thread due to opengl access */ - film_convert(task, task.buffer, task.rgba_byte, task.rgba_half); - } - else { - task_pool.push(new CUDADeviceTask(this, task)); - } - } - - void task_wait() - { - task_pool.wait(); - } - - void task_cancel() - { - task_pool.cancel(); - } - - friend class CUDASplitKernelFunction; - friend class CUDASplitKernel; - friend class CUDAContextScope; -}; - -/* redefine the cuda_assert macro so it can be used outside of the CUDADevice class - * now that the definition of that class is complete - */ -#undef cuda_assert -#define cuda_assert(stmt) \ - { \ - CUresult result = stmt; \ -\ - if (result != CUDA_SUCCESS) { \ - string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \ - if (device->error_msg == "") \ - device->error_msg = message; \ - fprintf(stderr, "%s\n", message.c_str()); \ - /*cuda_abort();*/ \ - device->cuda_error_documentation(); \ - } \ - } \ - (void)0 - -/* CUDA context scope. 
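 *
 * RAII helper that pushes the device's CUcontext on construction and pops it
 * again on destruction, so driver API calls always run with the right context
 * current. A minimal usage sketch:
 *
 *   {
 *     CUDAContextScope scope(device);
 *     cuda_assert(cuCtxSynchronize());
 *   }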
*/ - -CUDAContextScope::CUDAContextScope(CUDADevice *device) : device(device) -{ - cuda_assert(cuCtxPushCurrent(device->cuContext)); -} - -CUDAContextScope::~CUDAContextScope() -{ - cuda_assert(cuCtxPopCurrent(NULL)); -} - -/* split kernel */ - -class CUDASplitKernelFunction : public SplitKernelFunction { - CUDADevice *device; - CUfunction func; - - public: - CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func) - { - } - - /* enqueue the kernel, returns false if there is an error */ - bool enqueue(const KernelDimensions &dim, device_memory & /*kg*/, device_memory & /*data*/) - { - return enqueue(dim, NULL); - } - - /* enqueue the kernel, returns false if there is an error */ - bool enqueue(const KernelDimensions &dim, void *args[]) - { - if (device->have_error()) - return false; - - CUDAContextScope scope(device); - - /* we ignore dim.local_size for now, as this is faster */ - int threads_per_block; - cuda_assert( - cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); - - int xblocks = (dim.global_size[0] * dim.global_size[1] + threads_per_block - 1) / - threads_per_block; - - cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); - - cuda_assert(cuLaunchKernel(func, - xblocks, - 1, - 1, /* blocks */ - threads_per_block, - 1, - 1, /* threads */ - 0, - 0, - args, - 0)); - - return !device->have_error(); - } -}; - -CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device) -{ -} - -uint64_t CUDASplitKernel::state_buffer_size(device_memory & /*kg*/, - device_memory & /*data*/, - size_t num_threads) -{ - CUDAContextScope scope(device); - - device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE); - size_buffer.alloc(1); - size_buffer.zero_to_device(); - - uint threads = num_threads; - CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer); - - struct args_t { - uint *num_threads; - CUdeviceptr *size; - }; - - args_t args = {&threads, &d_size}; - - CUfunction state_buffer_size; - cuda_assert( - cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size")); - - cuda_assert(cuLaunchKernel(state_buffer_size, 1, 1, 1, 1, 1, 1, 0, 0, (void **)&args, 0)); - - size_buffer.copy_from_device(0, 1, 1); - size_t size = size_buffer[0]; - size_buffer.free(); - - return size; -} - -bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim, - RenderTile &rtile, - int num_global_elements, - device_memory & /*kernel_globals*/, - device_memory & /*kernel_data*/, - device_memory &split_data, - device_memory &ray_state, - device_memory &queue_index, - device_memory &use_queues_flag, - device_memory &work_pool_wgs) -{ - CUDAContextScope scope(device); - - CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer); - CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer); - CUdeviceptr d_queue_index = device->cuda_device_ptr(queue_index.device_pointer); - CUdeviceptr d_use_queues_flag = device->cuda_device_ptr(use_queues_flag.device_pointer); - CUdeviceptr d_work_pool_wgs = device->cuda_device_ptr(work_pool_wgs.device_pointer); - - CUdeviceptr d_buffer = device->cuda_device_ptr(rtile.buffer); - - int end_sample = rtile.start_sample + rtile.num_samples; - int queue_size = dim.global_size[0] * dim.global_size[1]; - - struct args_t { - CUdeviceptr *split_data_buffer; - int *num_elements; - CUdeviceptr *ray_state; - int *start_sample; - int *end_sample; - int *sx; - int *sy; 
- int *sw; - int *sh; - int *offset; - int *stride; - CUdeviceptr *queue_index; - int *queuesize; - CUdeviceptr *use_queues_flag; - CUdeviceptr *work_pool_wgs; - int *num_samples; - CUdeviceptr *buffer; - }; - - args_t args = {&d_split_data, - &num_global_elements, - &d_ray_state, - &rtile.start_sample, - &end_sample, - &rtile.x, - &rtile.y, - &rtile.w, - &rtile.h, - &rtile.offset, - &rtile.stride, - &d_queue_index, - &queue_size, - &d_use_queues_flag, - &d_work_pool_wgs, - &rtile.num_samples, - &d_buffer}; - - CUfunction data_init; - cuda_assert( - cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init")); - if (device->have_error()) { - return false; - } - - CUDASplitKernelFunction(device, data_init).enqueue(dim, (void **)&args); - - return !device->have_error(); -} - -SplitKernelFunction *CUDASplitKernel::get_split_kernel_function(const string &kernel_name, - const DeviceRequestedFeatures &) -{ - CUDAContextScope scope(device); - CUfunction func; - - cuda_assert( - cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data())); - if (device->have_error()) { - device->cuda_error_message( - string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data())); - return NULL; - } - - return new CUDASplitKernelFunction(device, func); -} - -int2 CUDASplitKernel::split_kernel_local_size() -{ - return make_int2(32, 1); -} - -int2 CUDASplitKernel::split_kernel_global_size(device_memory &kg, - device_memory &data, - DeviceTask * /*task*/) -{ - CUDAContextScope scope(device); - size_t free; - size_t total; - - cuda_assert(cuMemGetInfo(&free, &total)); - - VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(free) - << " bytes. (" << string_human_readable_size(free) << ")."; - - size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2); - size_t side = round_down((int)sqrt(num_elements), 32); - int2 global_size = make_int2(side, round_down(num_elements / side, 16)); - VLOG(1) << "Global size: " << global_size << "."; - return global_size; -} - bool device_cuda_init() { #ifdef WITH_CUDA_DYNLOAD @@ -2861,3 +300,5 @@ string device_cuda_capabilities() } CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp index b07596c60ff..83f3598df3c 100644 --- a/intern/cycles/device/device_opencl.cpp +++ b/intern/cycles/device/device_opencl.cpp @@ -16,9 +16,7 @@ #ifdef WITH_OPENCL -# include "device/opencl/opencl.h" - -# include "device/device_intern.h" +# include "device/opencl/device_opencl.h" # include "util/util_foreach.h" # include "util/util_logging.h" diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp index ac119a723e3..4a711c50731 100644 --- a/intern/cycles/device/device_optix.cpp +++ b/intern/cycles/device/device_optix.cpp @@ -17,7 +17,7 @@ #ifdef WITH_OPTIX -# include "device/device.h" +# include "device/cuda/device_cuda.h" # include "device/device_intern.h" # include "device/device_denoising.h" # include "bvh/bvh.h" @@ -120,19 +120,7 @@ struct KernelParams { check_result_cuda_ret(cuLaunchKernel( \ func, xblocks, yblocks, 1, threads, threads, 1, 0, cuda_stream[thread_index], args, 0)); -/* Similar as above, but for 1-dimensional blocks. 
*/ -# define CUDA_GET_BLOCKSIZE_1D(func, w, h) \ - int threads; \ - check_result_cuda_ret( \ - cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ - int xblocks = ((w) + threads - 1) / threads; \ - int yblocks = h; - -# define CUDA_LAUNCH_KERNEL_1D(func, args) \ - check_result_cuda_ret(cuLaunchKernel( \ - func, xblocks, yblocks, 1, threads, 1, 1, 0, cuda_stream[thread_index], args, 0)); - -class OptiXDevice : public Device { +class OptiXDevice : public CUDADevice { // List of OptiX program groups enum { @@ -181,78 +169,36 @@ class OptiXDevice : public Device { // Use a pool with multiple threads to support launches with multiple CUDA streams TaskPool task_pool; - // CUDA/OptiX context handles - CUdevice cuda_device = 0; - CUcontext cuda_context = NULL; vector<CUstream> cuda_stream; OptixDeviceContext context = NULL; - // Need CUDA kernel module for some utility functions - CUmodule cuda_module = NULL; - CUmodule cuda_filter_module = NULL; - // All necessary OptiX kernels are in one module - OptixModule optix_module = NULL; + OptixModule optix_module = NULL; // All necessary OptiX kernels are in one module OptixPipeline pipelines[NUM_PIPELINES] = {}; bool motion_blur = false; - bool need_texture_info = false; device_vector<SbtRecord> sbt_data; - device_vector<TextureInfo> texture_info; device_only_memory<KernelParams> launch_params; vector<CUdeviceptr> as_mem; OptixTraversableHandle tlas_handle = 0; - // TODO(pmours): This is copied from device_cuda.cpp, so move to common code eventually - int can_map_host = 0; - size_t map_host_used = 0; - size_t map_host_limit = 0; - size_t device_working_headroom = 32 * 1024 * 1024LL; // 32MB - size_t device_texture_headroom = 128 * 1024 * 1024LL; // 128MB - map<device_memory *, CUDAMem> cuda_mem_map; - bool move_texture_to_host = false; - OptixDenoiser denoiser = NULL; vector<pair<int2, CUdeviceptr>> denoiser_state; int denoiser_input_passes = 0; public: OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_) - : Device(info_, stats_, profiler_, background_), + : CUDADevice(info_, stats_, profiler_, background_), sbt_data(this, "__sbt", MEM_READ_ONLY), - texture_info(this, "__texture_info", MEM_TEXTURE), launch_params(this, "__params") { // Store number of CUDA streams in device info info.cpu_threads = DebugFlags().optix.cuda_streams; - // Initialize CUDA driver API - check_result_cuda(cuInit(0)); - - // Retrieve the primary CUDA context for this device - check_result_cuda(cuDeviceGet(&cuda_device, info.num)); - check_result_cuda(cuDevicePrimaryCtxRetain(&cuda_context, cuda_device)); - - // Make that CUDA context current - const CUDAContextScope scope(cuda_context); - - // Limit amount of host mapped memory (see init_host_memory in device_cuda.cpp) - size_t default_limit = 4 * 1024 * 1024 * 1024LL; - size_t system_ram = system_physical_ram(); - if (system_ram > 0) { - if (system_ram / 2 > default_limit) { - map_host_limit = system_ram - default_limit; - } - else { - map_host_limit = system_ram / 2; - } - } - else { - VLOG(1) << "Mapped host memory disabled, failed to get system RAM"; + // Make the CUDA context current + if (!cuContext) { + return; } - - // Check device support for pinned host memory - check_result_cuda( - cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuda_device)); + const CUDAContextScope scope(cuContext); // Create OptiX context for this device OptixDeviceContextOptions options = {}; @@ -276,7 +222,7 @@ class OptiXDevice : public Device { } }; # 
endif - check_result_optix(optixDeviceContextCreate(cuda_context, &options, &context)); + check_result_optix(optixDeviceContextCreate(cuContext, &options, &context)); # ifdef WITH_CYCLES_LOGGING check_result_optix(optixDeviceContextSetLogCallback( context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel)); @@ -300,6 +246,9 @@ class OptiXDevice : public Device { // Stop processing any more tasks task_pool.stop(); + // Make CUDA context current + const CUDAContextScope scope(cuContext); + // Free all acceleration structures for (CUdeviceptr mem : as_mem) { cuMemFree(mem); @@ -314,14 +263,7 @@ class OptiXDevice : public Device { texture_info.free(); launch_params.free(); - // Make CUDA context current - const CUDAContextScope scope(cuda_context); - // Unload modules - if (cuda_module != NULL) - cuModuleUnload(cuda_module); - if (cuda_filter_module != NULL) - cuModuleUnload(cuda_filter_module); if (optix_module != NULL) optixModuleDestroy(optix_module); for (unsigned int i = 0; i < NUM_PIPELINES; ++i) @@ -335,9 +277,7 @@ class OptiXDevice : public Device { if (denoiser != NULL) optixDenoiserDestroy(denoiser); - // Destroy OptiX and CUDA context optixDeviceContextDestroy(context); - cuDevicePrimaryCtxRelease(cuda_device); } private: @@ -355,8 +295,15 @@ class OptiXDevice : public Device { bool load_kernels(const DeviceRequestedFeatures &requested_features) override { - if (have_error()) - return false; // Abort early if context creation failed already + if (have_error()) { + // Abort early if context creation failed already + return false; + } + + // Load CUDA modules because we need some of the utility kernels + if (!CUDADevice::load_kernels(requested_features)) { + return false; + } // Disable baking for now, since its kernel is not well-suited for inlining and is very slow if (requested_features.use_baking) { @@ -369,7 +316,7 @@ class OptiXDevice : public Device { return false; } - const CUDAContextScope scope(cuda_context); + const CUDAContextScope scope(cuContext); // Unload existing OptiX module and pipelines first if (optix_module != NULL) { @@ -437,34 +384,6 @@ class OptiXDevice : public Device { &optix_module)); } - { // Load CUDA modules because we need some of the utility kernels - int major, minor; - cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, info.num); - cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, info.num); - - if (cuda_module == NULL) { // Avoid reloading module if it was already loaded - string cubin_data; - const string cubin_filename = string_printf("lib/kernel_sm_%d%d.cubin", major, minor); - if (!path_read_text(path_get(cubin_filename), cubin_data)) { - set_error("Failed loading pre-compiled CUDA kernel " + cubin_filename + "."); - return false; - } - - check_result_cuda_ret(cuModuleLoadData(&cuda_module, cubin_data.data())); - } - - if (requested_features.use_denoising && cuda_filter_module == NULL) { - string filter_data; - const string filter_filename = string_printf("lib/filter_sm_%d%d.cubin", major, minor); - if (!path_read_text(path_get(filter_filename), filter_data)) { - set_error("Failed loading pre-compiled CUDA filter kernel " + filter_filename + "."); - return false; - } - - check_result_cuda_ret(cuModuleLoadData(&cuda_filter_module, filter_data.data())); - } - } - // Create program groups OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {}; OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {}; @@ -650,9 +569,6 @@ class OptiXDevice : public Device { else if (task.type 
== DeviceTask::SHADER) { launch_shader_eval(task, thread_index); } - else if (task.type == DeviceTask::FILM_CONVERT) { - launch_film_convert(task, thread_index); - } else if (task.type == DeviceTask::DENOISE_BUFFER) { // Set up a single tile that covers the whole task and denoise it RenderTile tile; @@ -694,7 +610,7 @@ class OptiXDevice : public Device { device_ptr launch_params_ptr = launch_params.device_pointer + thread_index * launch_params.data_elements; - const CUDAContextScope scope(cuda_context); + const CUDAContextScope scope(cuContext); for (int sample = rtile.start_sample; sample < end_sample; sample += step_samples) { // Copy work tile information to device @@ -745,7 +661,7 @@ class OptiXDevice : public Device { { int total_samples = rtile.start_sample + rtile.num_samples; - const CUDAContextScope scope(cuda_context); + const CUDAContextScope scope(cuContext); // Choose between OptiX and NLM denoising if (task.denoising_use_optix) { @@ -826,7 +742,7 @@ class OptiXDevice : public Device { CUfunction filter_copy_func; check_result_cuda_ret(cuModuleGetFunction( - &filter_copy_func, cuda_filter_module, "kernel_cuda_filter_copy_input")); + &filter_copy_func, cuFilterModule, "kernel_cuda_filter_copy_input")); check_result_cuda_ret(cuFuncSetCacheConfig(filter_copy_func, CU_FUNC_CACHE_PREFER_L1)); void *args[] = { @@ -843,7 +759,7 @@ class OptiXDevice : public Device { CUfunction convert_to_rgb_func; check_result_cuda_ret(cuModuleGetFunction( - &convert_to_rgb_func, cuda_filter_module, "kernel_cuda_filter_convert_to_rgb")); + &convert_to_rgb_func, cuFilterModule, "kernel_cuda_filter_convert_to_rgb")); check_result_cuda_ret(cuFuncSetCacheConfig(convert_to_rgb_func, CU_FUNC_CACHE_PREFER_L1)); void *args[] = {&input_rgb.device_pointer, @@ -971,7 +887,7 @@ class OptiXDevice : public Device { { CUfunction convert_from_rgb_func; check_result_cuda_ret(cuModuleGetFunction( - &convert_from_rgb_func, cuda_filter_module, "kernel_cuda_filter_convert_from_rgb")); + &convert_from_rgb_func, cuFilterModule, "kernel_cuda_filter_convert_from_rgb")); check_result_cuda_ret( cuFuncSetCacheConfig(convert_from_rgb_func, CU_FUNC_CACHE_PREFER_L1)); @@ -998,47 +914,10 @@ class OptiXDevice : public Device { task.unmap_neighbor_tiles(rtiles, this); } else { + assert(thread_index == 0); // Run CUDA denoising kernels DenoisingTask denoising(this, task); - denoising.functions.construct_transform = function_bind( - &OptiXDevice::denoising_construct_transform, this, &denoising, thread_index); - denoising.functions.accumulate = function_bind( - &OptiXDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising, thread_index); - denoising.functions.solve = function_bind( - &OptiXDevice::denoising_solve, this, _1, &denoising, thread_index); - denoising.functions.divide_shadow = function_bind(&OptiXDevice::denoising_divide_shadow, - this, - _1, - _2, - _3, - _4, - _5, - &denoising, - thread_index); - denoising.functions.non_local_means = function_bind( - &OptiXDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising, thread_index); - denoising.functions.combine_halves = function_bind(&OptiXDevice::denoising_combine_halves, - this, - _1, - _2, - _3, - _4, - _5, - _6, - &denoising, - thread_index); - denoising.functions.get_feature = function_bind( - &OptiXDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising, thread_index); - denoising.functions.write_feature = function_bind( - &OptiXDevice::denoising_write_feature, this, _1, _2, _3, &denoising, thread_index); - denoising.functions.detect_outliers 
= function_bind( - &OptiXDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising, thread_index); - - denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); - denoising.render_buffer.samples = total_samples; - denoising.buffer.gpu_temporary_mem = true; - - denoising.run_denoising(&rtile); + CUDADevice::denoise(rtile, denoising); } // Update current sample, so it is displayed correctly @@ -1057,7 +936,7 @@ class OptiXDevice : public Device { if (task.shader_eval_type == SHADER_EVAL_DISPLACE) rgen_index = PG_DISP; - const CUDAContextScope scope(cuda_context); + const CUDAContextScope scope(cuContext); device_ptr launch_params_ptr = launch_params.device_pointer + thread_index * launch_params.data_elements; @@ -1104,62 +983,13 @@ class OptiXDevice : public Device { } } - void launch_film_convert(DeviceTask &task, int thread_index) - { - const CUDAContextScope scope(cuda_context); - - CUfunction film_convert_func; - check_result_cuda(cuModuleGetFunction(&film_convert_func, - cuda_module, - task.rgba_byte ? "kernel_cuda_convert_to_byte" : - "kernel_cuda_convert_to_half_float")); - - float sample_scale = 1.0f / (task.sample + 1); - CUdeviceptr rgba = (task.rgba_byte ? task.rgba_byte : task.rgba_half); - - void *args[] = {&rgba, - &task.buffer, - &sample_scale, - &task.x, - &task.y, - &task.w, - &task.h, - &task.offset, - &task.stride}; - - int threads_per_block; - check_result_cuda(cuFuncGetAttribute( - &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, film_convert_func)); - - const int num_threads_x = (int)sqrt(threads_per_block); - const int num_blocks_x = (task.w + num_threads_x - 1) / num_threads_x; - const int num_threads_y = (int)sqrt(threads_per_block); - const int num_blocks_y = (task.h + num_threads_y - 1) / num_threads_y; - - check_result_cuda(cuLaunchKernel(film_convert_func, - num_blocks_x, - num_blocks_y, - 1, /* blocks */ - num_threads_x, - num_threads_y, - 1, /* threads */ - 0, - cuda_stream[thread_index], - args, - 0)); - - check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index])); - - task.update_progress(NULL); - } - bool build_optix_bvh(const OptixBuildInput &build_input, uint16_t num_motion_steps, OptixTraversableHandle &out_handle) { out_handle = 0; - const CUDAContextScope scope(cuda_context); + const CUDAContextScope scope(cuContext); // Compute memory usage OptixAccelBufferSizes sizes = {}; @@ -1477,7 +1307,7 @@ class OptiXDevice : public Device { size_t motion_transform_size = sizeof(OptixSRTMotionTransform) + motion_keys * sizeof(OptixSRTData); - const CUDAContextScope scope(cuda_context); + const CUDAContextScope scope(cuContext); CUdeviceptr motion_transform_gpu = 0; check_result_cuda_ret(cuMemAlloc(&motion_transform_gpu, motion_transform_size)); @@ -1574,621 +1404,13 @@ class OptiXDevice : public Device { return build_optix_bvh(build_input, 0, tlas_handle); } - void update_texture_info() - { - if (need_texture_info) { - texture_info.copy_to_device(); - need_texture_info = false; - } - } - - void update_launch_params(const char *name, size_t offset, void *data, size_t data_size) + void const_copy_to(const char *name, void *host, size_t size) override { - const CUDAContextScope scope(cuda_context); - - for (int i = 0; i < info.cpu_threads; ++i) - check_result_cuda( - cuMemcpyHtoD(launch_params.device_pointer + i * launch_params.data_elements + offset, - data, - data_size)); - // Set constant memory for CUDA module - // TODO(pmours): This is only used for tonemapping (see 'launch_film_convert'). 
+ // TODO(pmours): This is only used for tonemapping (see 'film_convert'). // Could be removed by moving those functions to filter CUDA module. - size_t bytes = 0; - CUdeviceptr mem = 0; - check_result_cuda(cuModuleGetGlobal(&mem, &bytes, cuda_module, name)); - assert(mem != 0 && bytes == data_size); - check_result_cuda(cuMemcpyHtoD(mem, data, data_size)); - } - - void mem_alloc(device_memory &mem) override - { - if (mem.type == MEM_PIXELS && !background) { - // Always fall back to no interop for now - // TODO(pmours): Support OpenGL interop when moving CUDA memory management to common code - background = true; - } - else if (mem.type == MEM_TEXTURE) { - assert(!"mem_alloc not supported for textures."); - return; - } - - generic_alloc(mem); - } - - CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0) - { - CUDAContextScope scope(cuda_context); - - CUdeviceptr device_pointer = 0; - size_t size = mem.memory_size() + pitch_padding; - - CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY; - const char *status = ""; - - /* First try allocating in device memory, respecting headroom. We make - * an exception for texture info. It is small and frequently accessed, - * so treat it as working memory. - * - * If there is not enough room for working memory, we will try to move - * textures to host memory, assuming the performance impact would have - * been worse for working memory. */ - bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info); - bool is_image = is_texture && (mem.data_height > 1); - - size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom; - - size_t total = 0, free = 0; - cuMemGetInfo(&free, &total); - - /* Move textures to host memory if needed. */ - if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) { - move_textures_to_host(size + headroom - free, is_texture); - cuMemGetInfo(&free, &total); - } - - /* Allocate in device memory. */ - if (!move_texture_to_host && (size + headroom) < free) { - mem_alloc_result = cuMemAlloc(&device_pointer, size); - if (mem_alloc_result == CUDA_SUCCESS) { - status = " in device memory"; - } - } - - /* Fall back to mapped host memory if needed and possible. */ - void *shared_pointer = 0; - - if (mem_alloc_result != CUDA_SUCCESS && can_map_host) { - if (mem.shared_pointer) { - /* Another device already allocated host memory. */ - mem_alloc_result = CUDA_SUCCESS; - shared_pointer = mem.shared_pointer; - } - else if (map_host_used + size < map_host_limit) { - /* Allocate host memory ourselves. */ - mem_alloc_result = cuMemHostAlloc( - &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED); - - assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) || - (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0)); - } - - if (mem_alloc_result == CUDA_SUCCESS) { - cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0); - map_host_used += size; - status = " in host memory"; - } - else { - status = " failed, out of host memory"; - } - } - else if (mem_alloc_result != CUDA_SUCCESS) { - status = " failed, out of device and host memory"; - } - - if (mem.name) { - VLOG(1) << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. 
(" - << string_human_readable_size(mem.memory_size()) << ")" << status; - } - - if (mem_alloc_result != CUDA_SUCCESS) { - set_error(string_printf("Buffer allocate %s", status)); - return NULL; - } - - mem.device_pointer = (device_ptr)device_pointer; - mem.device_size = size; - stats.mem_alloc(size); - - if (!mem.device_pointer) { - return NULL; - } - - /* Insert into map of allocations. */ - CUDAMem *cmem = &cuda_mem_map[&mem]; - if (shared_pointer != 0) { - /* Replace host pointer with our host allocation. Only works if - * CUDA memory layout is the same and has no pitch padding. Also - * does not work if we move textures to host during a render, - * since other devices might be using the memory. */ - - if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer && - mem.host_pointer != shared_pointer) { - memcpy(shared_pointer, mem.host_pointer, size); - - /* A call to device_memory::host_free() should be preceded by - * a call to device_memory::device_free() for host memory - * allocated by a device to be handled properly. Two exceptions - * are here and a call in CUDADevice::generic_alloc(), where - * the current host memory can be assumed to be allocated by - * device_memory::host_alloc(), not by a device */ - - mem.host_free(); - mem.host_pointer = shared_pointer; - } - mem.shared_pointer = shared_pointer; - mem.shared_counter++; - cmem->use_mapped_host = true; - } - else { - cmem->use_mapped_host = false; - } - - return cmem; - } - - void tex_alloc(device_memory &mem) - { - CUDAContextScope scope(cuda_context); - - /* General variables for both architectures */ - string bind_name = mem.name; - size_t dsize = datatype_size(mem.data_type); - size_t size = mem.memory_size(); - - CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP; - switch (mem.extension) { - case EXTENSION_REPEAT: - address_mode = CU_TR_ADDRESS_MODE_WRAP; - break; - case EXTENSION_EXTEND: - address_mode = CU_TR_ADDRESS_MODE_CLAMP; - break; - case EXTENSION_CLIP: - address_mode = CU_TR_ADDRESS_MODE_BORDER; - break; - default: - assert(0); - break; - } - - CUfilter_mode filter_mode; - if (mem.interpolation == INTERPOLATION_CLOSEST) { - filter_mode = CU_TR_FILTER_MODE_POINT; - } - else { - filter_mode = CU_TR_FILTER_MODE_LINEAR; - } - - /* Data Storage */ - if (mem.interpolation == INTERPOLATION_NONE) { - generic_alloc(mem); - generic_copy_to(mem); - - // Update data storage pointers in launch parameters -# define KERNEL_TEX(data_type, tex_name) \ - if (strcmp(mem.name, #tex_name) == 0) \ - update_launch_params( \ - mem.name, offsetof(KernelParams, tex_name), &mem.device_pointer, sizeof(device_ptr)); -# include "kernel/kernel_textures.h" -# undef KERNEL_TEX - return; - } - - /* Image Texture Storage */ - CUarray_format_enum format; - switch (mem.data_type) { - case TYPE_UCHAR: - format = CU_AD_FORMAT_UNSIGNED_INT8; - break; - case TYPE_UINT16: - format = CU_AD_FORMAT_UNSIGNED_INT16; - break; - case TYPE_UINT: - format = CU_AD_FORMAT_UNSIGNED_INT32; - break; - case TYPE_INT: - format = CU_AD_FORMAT_SIGNED_INT32; - break; - case TYPE_FLOAT: - format = CU_AD_FORMAT_FLOAT; - break; - case TYPE_HALF: - format = CU_AD_FORMAT_HALF; - break; - default: - assert(0); - return; - } - - CUDAMem *cmem = NULL; - CUarray array_3d = NULL; - size_t src_pitch = mem.data_width * dsize * mem.data_elements; - size_t dst_pitch = src_pitch; - - if (mem.data_depth > 1) { - /* 3D texture using array, there is no API for linear memory. 
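 * For memory that is not a CUarray, cuTexObjectCreate only understands linear
 * 1D (CU_RESOURCE_TYPE_LINEAR) and pitched 2D (CU_RESOURCE_TYPE_PITCH2D)
 * layouts, so volume data has to be backed by a CUarray created with
 * cuArray3DCreate and filled via cuMemcpy3D, as done here.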
-      CUDA_ARRAY3D_DESCRIPTOR desc;
-
-      desc.Width = mem.data_width;
-      desc.Height = mem.data_height;
-      desc.Depth = mem.data_depth;
-      desc.Format = format;
-      desc.NumChannels = mem.data_elements;
-      desc.Flags = 0;
-
-      VLOG(1) << "Array 3D allocate: " << mem.name << ", "
-              << string_human_readable_number(mem.memory_size()) << " bytes. ("
-              << string_human_readable_size(mem.memory_size()) << ")";
-
-      check_result_cuda(cuArray3DCreate(&array_3d, &desc));
-
-      if (!array_3d) {
-        return;
-      }
-
-      CUDA_MEMCPY3D param;
-      memset(&param, 0, sizeof(param));
-      param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
-      param.dstArray = array_3d;
-      param.srcMemoryType = CU_MEMORYTYPE_HOST;
-      param.srcHost = mem.host_pointer;
-      param.srcPitch = src_pitch;
-      param.WidthInBytes = param.srcPitch;
-      param.Height = mem.data_height;
-      param.Depth = mem.data_depth;
-
-      check_result_cuda(cuMemcpy3D(&param));
-
-      mem.device_pointer = (device_ptr)array_3d;
-      mem.device_size = size;
-      stats.mem_alloc(size);
-
-      cmem = &cuda_mem_map[&mem];
-      cmem->texobject = 0;
-      cmem->array = array_3d;
-    }
-    else if (mem.data_height > 0) {
-      /* 2D texture, using pitch aligned linear memory. */
-      int alignment = 0;
-      check_result_cuda(cuDeviceGetAttribute(
-          &alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuda_device));
-      dst_pitch = align_up(src_pitch, alignment);
-      size_t dst_size = dst_pitch * mem.data_height;
-
-      cmem = generic_alloc(mem, dst_size - mem.memory_size());
-      if (!cmem) {
-        return;
-      }
-
-      CUDA_MEMCPY2D param;
-      memset(&param, 0, sizeof(param));
-      param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
-      param.dstDevice = mem.device_pointer;
-      param.dstPitch = dst_pitch;
-      param.srcMemoryType = CU_MEMORYTYPE_HOST;
-      param.srcHost = mem.host_pointer;
-      param.srcPitch = src_pitch;
-      param.WidthInBytes = param.srcPitch;
-      param.Height = mem.data_height;
-
-      check_result_cuda(cuMemcpy2DUnaligned(&param));
-    }
-    else {
-      /* 1D texture, using linear memory. */
-      cmem = generic_alloc(mem);
-      if (!cmem) {
-        return;
-      }
-
-      check_result_cuda(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
-    }
-
-    /* Kepler+, bindless textures. */
-    int flat_slot = 0;
-    if (string_startswith(mem.name, "__tex_image")) {
-      int pos = string(mem.name).rfind("_");
-      flat_slot = atoi(mem.name + pos + 1);
-    }
-    else {
-      assert(0);
-    }
-
-    CUDA_RESOURCE_DESC resDesc;
-    memset(&resDesc, 0, sizeof(resDesc));
-
-    if (array_3d) {
-      resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
-      resDesc.res.array.hArray = array_3d;
-      resDesc.flags = 0;
-    }
-    else if (mem.data_height > 0) {
-      resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
-      resDesc.res.pitch2D.devPtr = mem.device_pointer;
-      resDesc.res.pitch2D.format = format;
-      resDesc.res.pitch2D.numChannels = mem.data_elements;
-      resDesc.res.pitch2D.height = mem.data_height;
-      resDesc.res.pitch2D.width = mem.data_width;
-      resDesc.res.pitch2D.pitchInBytes = dst_pitch;
-    }
-    else {
-      resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
-      resDesc.res.linear.devPtr = mem.device_pointer;
-      resDesc.res.linear.format = format;
-      resDesc.res.linear.numChannels = mem.data_elements;
-      resDesc.res.linear.sizeInBytes = mem.device_size;
-    }
-
-    CUDA_TEXTURE_DESC texDesc;
-    memset(&texDesc, 0, sizeof(texDesc));
-    texDesc.addressMode[0] = address_mode;
-    texDesc.addressMode[1] = address_mode;
-    texDesc.addressMode[2] = address_mode;
-    texDesc.filterMode = filter_mode;
-    texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
-
-    check_result_cuda(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
-
-    /* Resize once */
-    if (flat_slot >= texture_info.size()) {
-      /* Allocate some slots in advance, to reduce amount
-       * of re-allocations. */
-      texture_info.resize(flat_slot + 128);
-    }
-
-    /* Set Mapping and tag that we need to (re-)upload to device */
-    TextureInfo &info = texture_info[flat_slot];
-    info.data = (uint64_t)cmem->texobject;
-    info.cl_buffer = 0;
-    info.interpolation = mem.interpolation;
-    info.extension = mem.extension;
-    info.width = mem.data_width;
-    info.height = mem.data_height;
-    info.depth = mem.data_depth;
-    need_texture_info = true;
-  }
-
-  void mem_copy_to(device_memory &mem) override
-  {
-    if (mem.type == MEM_PIXELS) {
-      assert(!"mem_copy_to not supported for pixels.");
-    }
-    else if (mem.type == MEM_TEXTURE) {
-      tex_free(mem);
-      tex_alloc(mem);
-    }
-    else {
-      if (!mem.device_pointer) {
-        generic_alloc(mem);
-      }
-
-      generic_copy_to(mem);
-    }
-  }
-
-  void generic_copy_to(device_memory &mem)
-  {
-    if (mem.host_pointer && mem.device_pointer) {
-      CUDAContextScope scope(cuda_context);
-
-      /* If use_mapped_host of mem is false, the current device only
-       * uses device memory allocated by cuMemAlloc regardless of
-       * mem.host_pointer and mem.shared_pointer, and should copy
-       * data from mem.host_pointer. */
-
-      if (cuda_mem_map[&mem].use_mapped_host == false || mem.host_pointer != mem.shared_pointer) {
-        check_result_cuda(
-            cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size()));
-      }
-    }
-  }
-
-  void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override
-  {
-    if (mem.type == MEM_PIXELS && !background) {
-      assert(!"mem_copy_from not supported for pixels.");
-    }
-    else if (mem.type == MEM_TEXTURE) {
-      assert(!"mem_copy_from not supported for textures.");
-    }
-    else if (mem.host_pointer) {
-      // Calculate linear memory offset and size
-      const size_t size = elem * w * h;
-      const size_t offset = elem * y * w;
-
-      if (mem.device_pointer) {
-        const CUDAContextScope scope(cuda_context);
-        check_result_cuda(cuMemcpyDtoH(
-            (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size));
-      }
-      else {
-        memset((char *)mem.host_pointer + offset, 0, size);
-      }
-    }
-  }
-
-  void mem_zero(device_memory &mem) override
-  {
-    if (!mem.device_pointer) {
-      mem_alloc(mem);  // Need to allocate memory first if it does not exist yet
-    }
-    if (!mem.device_pointer) {
-      return;
-    }
-
-    /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
-     * regardless of mem.host_pointer and mem.shared_pointer. */
-    if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
-      const CUDAContextScope scope(cuda_context);
-      check_result_cuda(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
-    }
-    else if (mem.host_pointer) {
-      memset(mem.host_pointer, 0, mem.memory_size());
-    }
-  }
-
-  void mem_free(device_memory &mem) override
-  {
-    if (mem.type == MEM_PIXELS && !background) {
-      assert(!"mem_free not supported for pixels.");
-    }
-    else if (mem.type == MEM_TEXTURE) {
-      tex_free(mem);
-    }
-    else {
-      generic_free(mem);
-    }
-  }
+    CUDADevice::const_copy_to(name, host, size);
-  void generic_free(device_memory &mem)
-  {
-    if (mem.device_pointer) {
-      CUDAContextScope scope(cuda_context);
-      const CUDAMem &cmem = cuda_mem_map[&mem];
-
-      /* If cmem.use_mapped_host is true, reference counting is used
-       * to safely free a mapped host memory. */
-
-      if (cmem.use_mapped_host) {
-        assert(mem.shared_pointer);
-        if (mem.shared_pointer) {
-          assert(mem.shared_counter > 0);
-          if (--mem.shared_counter == 0) {
-            if (mem.host_pointer == mem.shared_pointer) {
-              mem.host_pointer = 0;
-            }
-            cuMemFreeHost(mem.shared_pointer);
-            mem.shared_pointer = 0;
-          }
-        }
-        map_host_used -= mem.device_size;
-      }
-      else {
-        /* Free device memory. */
-        cuMemFree(mem.device_pointer);
-      }
-
-      stats.mem_free(mem.device_size);
-      mem.device_pointer = 0;
-      mem.device_size = 0;
-
-      cuda_mem_map.erase(cuda_mem_map.find(&mem));
-    }
-  }
-
-  void tex_free(device_memory &mem)
-  {
-    if (mem.device_pointer) {
-      CUDAContextScope scope(cuda_context);
-      const CUDAMem &cmem = cuda_mem_map[&mem];
-
-      if (cmem.texobject) {
-        /* Free bindless texture. */
-        cuTexObjectDestroy(cmem.texobject);
-      }
-
-      if (cmem.array) {
-        /* Free array. */
-        cuArrayDestroy(cmem.array);
-        stats.mem_free(mem.device_size);
-        mem.device_pointer = 0;
-        mem.device_size = 0;
-
-        cuda_mem_map.erase(cuda_mem_map.find(&mem));
-      }
-      else {
-        generic_free(mem);
-      }
-    }
-  }
-
-  void move_textures_to_host(size_t size, bool for_texture)
-  {
-    /* Signal to reallocate textures in host memory only. */
-    move_texture_to_host = true;
-
-    while (size > 0) {
-      /* Find suitable memory allocation to move. */
-      device_memory *max_mem = NULL;
-      size_t max_size = 0;
-      bool max_is_image = false;
-
-      foreach (auto &pair, cuda_mem_map) {
-        device_memory &mem = *pair.first;
-        CUDAMem *cmem = &pair.second;
-
-        bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
-        bool is_image = is_texture && (mem.data_height > 1);
-
-        /* Can't move this type of memory. */
-        if (!is_texture || cmem->array) {
-          continue;
-        }
-
-        /* Already in host memory. */
-        if (cmem->use_mapped_host) {
-          continue;
-        }
-
-        /* For other textures, only move image textures. */
-        if (for_texture && !is_image) {
-          continue;
-        }
-
-        /* Try to move largest allocation, prefer moving images. */
-        if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
-          max_is_image = is_image;
-          max_size = mem.device_size;
-          max_mem = &mem;
-        }
-      }
-
-      /* Move to host memory. This part is mutex protected since
-       * multiple CUDA devices could be moving the memory. The
-       * first one will do it, and the rest will adopt the pointer. */
-      if (max_mem) {
-        VLOG(1) << "Move memory from device to host: " << max_mem->name;
-
-        static thread_mutex move_mutex;
-        thread_scoped_lock lock(move_mutex);
-
-        /* Preserve the original device pointer, in case of multi device
-         * we can't change it because the pointer mapping would break. */
-        device_ptr prev_pointer = max_mem->device_pointer;
-        size_t prev_size = max_mem->device_size;
-
-        tex_free(*max_mem);
-        tex_alloc(*max_mem);
-        size = (max_size >= size) ? 0 : size - max_size;
-
-        max_mem->device_pointer = prev_pointer;
-        max_mem->device_size = prev_size;
-      }
-      else {
-        break;
-      }
-    }
-
-    /* Update texture info array with new pointers. */
-    update_texture_info();
-
-    move_texture_to_host = false;
-  }
-
-  void const_copy_to(const char *name, void *host, size_t size) override
-  {
     if (strcmp(name, "__data") == 0) {
       assert(size <= sizeof(KernelData));
@@ -2197,18 +1419,40 @@ class OptiXDevice : public Device {
       *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle;
       update_launch_params(name, offsetof(KernelParams, data), host, size);
+      return;
     }
+
+    // Update data storage pointers in launch parameters
+# define KERNEL_TEX(data_type, tex_name) \
+  if (strcmp(name, #tex_name) == 0) { \
+    update_launch_params(name, offsetof(KernelParams, tex_name), host, size); \
+    return; \
+  }
+# include "kernel/kernel_textures.h"
+# undef KERNEL_TEX
   }
 
-  device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override
+  void update_launch_params(const char *name, size_t offset, void *data, size_t data_size)
   {
-    return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
+    const CUDAContextScope scope(cuContext);
+
+    for (int i = 0; i < info.cpu_threads; ++i)
+      check_result_cuda(
+          cuMemcpyHtoD(launch_params.device_pointer + i * launch_params.data_elements + offset,
+                       data,
+                       data_size));
   }
 
   void task_add(DeviceTask &task) override
   {
     // Upload texture information to device if it has changed since last launch
-    update_texture_info();
+    load_texture_info();
+
+    if (task.type == DeviceTask::FILM_CONVERT) {
+      // Execute in main thread because of OpenGL access
+      film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
+      return;
+    }
 
     // Split task into smaller ones
     list<DeviceTask> tasks;
@@ -2240,403 +1484,6 @@ class OptiXDevice : public Device {
     // Cancel any remaining tasks in the internal pool
     task_pool.cancel();
   }
-
-  bool denoising_non_local_means(device_ptr image_ptr,
-                                 device_ptr guide_ptr,
-                                 device_ptr variance_ptr,
-                                 device_ptr out_ptr,
-                                 DenoisingTask *task,
-                                 int thread_index)
-  {
-    if (have_error())
-      return false;
-
-    int stride = task->buffer.stride;
-    int w = task->buffer.width;
-    int h = task->buffer.h;
-    int r = task->nlm_state.r;
-    int f = task->nlm_state.f;
-    float a = task->nlm_state.a;
-    float k_2 = task->nlm_state.k_2;
-
-    int pass_stride = task->buffer.pass_stride;
-    int num_shifts = (2 * r + 1) * (2 * r + 1);
-    int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
-    int frame_offset = 0;
-
-    CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer;
-    CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
-    CUdeviceptr weightAccum = difference + 2 * sizeof(float) * pass_stride * num_shifts;
-    CUdeviceptr scale_ptr = 0;
-
-    check_result_cuda_ret(
-        cuMemsetD8Async(weightAccum, 0, sizeof(float) * pass_stride, cuda_stream[thread_index]));
-    check_result_cuda_ret(
-        cuMemsetD8Async(out_ptr, 0, sizeof(float) * pass_stride, cuda_stream[thread_index]));
-
-    {
-      CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput;
-      check_result_cuda_ret(cuModuleGetFunction(
-          &cuNLMCalcDifference, cuda_filter_module, "kernel_cuda_filter_nlm_calc_difference"));
-      check_result_cuda_ret(
-          cuModuleGetFunction(&cuNLMBlur, cuda_filter_module, "kernel_cuda_filter_nlm_blur"));
-      check_result_cuda_ret(cuModuleGetFunction(
-          &cuNLMCalcWeight, cuda_filter_module, "kernel_cuda_filter_nlm_calc_weight"));
-      check_result_cuda_ret(cuModuleGetFunction(
-          &cuNLMUpdateOutput, cuda_filter_module, "kernel_cuda_filter_nlm_update_output"));
-
-      check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
-      check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
-      check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
-      check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
-
-      CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w * h, num_shifts);
-
-      void *calc_difference_args[] = {&guide_ptr,
-                                      &variance_ptr,
-                                      &scale_ptr,
-                                      &difference,
-                                      &w,
-                                      &h,
-                                      &stride,
-                                      &pass_stride,
-                                      &r,
-                                      &channel_offset,
-                                      &frame_offset,
-                                      &a,
-                                      &k_2};
-      void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
-      void *calc_weight_args[] = {
-          &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
-      void *update_output_args[] = {&blurDifference,
-                                    &image_ptr,
-                                    &out_ptr,
-                                    &weightAccum,
-                                    &w,
-                                    &h,
-                                    &stride,
-                                    &pass_stride,
-                                    &channel_offset,
-                                    &r,
-                                    &f};
-
-      CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
-      CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
-      CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
-      CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
-      CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args);
-    }
-
-    {
-      CUfunction cuNLMNormalize;
-      check_result_cuda_ret(cuModuleGetFunction(
-          &cuNLMNormalize, cuda_filter_module, "kernel_cuda_filter_nlm_normalize"));
-      check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
-      void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride};
-      CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h);
-      CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
-      check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));
-    }
-
-    return !have_error();
-  }
-
-  bool denoising_construct_transform(DenoisingTask *task, int thread_index)
-  {
-    if (have_error())
-      return false;
-
-    CUfunction cuFilterConstructTransform;
-    check_result_cuda_ret(cuModuleGetFunction(&cuFilterConstructTransform,
-                                              cuda_filter_module,
-                                              "kernel_cuda_filter_construct_transform"));
-    check_result_cuda_ret(
-        cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
-    CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, task->storage.w, task->storage.h);
-
-    void *args[] = {&task->buffer.mem.device_pointer,
-                    &task->tile_info_mem.device_pointer,
-                    &task->storage.transform.device_pointer,
-                    &task->storage.rank.device_pointer,
-                    &task->filter_area,
-                    &task->rect,
-                    &task->radius,
-                    &task->pca_threshold,
-                    &task->buffer.pass_stride,
-                    &task->buffer.frame_stride,
-                    &task->buffer.use_time};
-    CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
-    check_result_cuda_ret(cuCtxSynchronize());
-
-    return !have_error();
-  }
-
-  bool denoising_accumulate(device_ptr color_ptr,
-                            device_ptr color_variance_ptr,
-                            device_ptr scale_ptr,
-                            int frame,
-                            DenoisingTask *task,
-                            int thread_index)
-  {
-    if (have_error())
-      return false;
-
-    int r = task->radius;
-    int f = 4;
-    float a = 1.0f;
-    float k_2 = task->nlm_k_2;
-
-    int w = task->reconstruction_state.source_w;
-    int h = task->reconstruction_state.source_h;
-    int stride = task->buffer.stride;
-    int frame_offset = frame * task->buffer.frame_stride;
-    int t = task->tile_info->frames[frame];
-
-    int pass_stride = task->buffer.pass_stride;
-    int num_shifts = (2 * r + 1) * (2 * r + 1);
-
-    CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer;
-    CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
-
-    CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
-    check_result_cuda_ret(cuModuleGetFunction(
-        &cuNLMCalcDifference, cuda_filter_module, "kernel_cuda_filter_nlm_calc_difference"));
-    check_result_cuda_ret(
-        cuModuleGetFunction(&cuNLMBlur, cuda_filter_module, "kernel_cuda_filter_nlm_blur"));
-    check_result_cuda_ret(cuModuleGetFunction(
-        &cuNLMCalcWeight, cuda_filter_module, "kernel_cuda_filter_nlm_calc_weight"));
-    check_result_cuda_ret(cuModuleGetFunction(
-        &cuNLMConstructGramian, cuda_filter_module, "kernel_cuda_filter_nlm_construct_gramian"));
-
-    check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
-    check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
-    check_result_cuda_ret(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
-    check_result_cuda_ret(
-        cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
-
-    CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
-                          task->reconstruction_state.source_w *
-                              task->reconstruction_state.source_h,
-                          num_shifts);
-
-    void *calc_difference_args[] = {&color_ptr,
-                                    &color_variance_ptr,
-                                    &scale_ptr,
-                                    &difference,
-                                    &w,
-                                    &h,
-                                    &stride,
-                                    &pass_stride,
-                                    &r,
-                                    &pass_stride,
-                                    &frame_offset,
-                                    &a,
-                                    &k_2};
-    void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
-    void *calc_weight_args[] = {
-        &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
-    void *construct_gramian_args[] = {&t,
-                                      &blurDifference,
-                                      &task->buffer.mem.device_pointer,
-                                      &task->storage.transform.device_pointer,
-                                      &task->storage.rank.device_pointer,
-                                      &task->storage.XtWX.device_pointer,
-                                      &task->storage.XtWY.device_pointer,
-                                      &task->reconstruction_state.filter_window,
-                                      &w,
-                                      &h,
-                                      &stride,
-                                      &pass_stride,
-                                      &r,
-                                      &f,
-                                      &frame_offset,
-                                      &task->buffer.use_time};
-
-    CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
-    CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
-    CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
-    CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
-    CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
-    check_result_cuda_ret(cuCtxSynchronize());
-
-    return !have_error();
-  }
-
-  bool denoising_solve(device_ptr output_ptr, DenoisingTask *task, int thread_index)
-  {
-    if (have_error())
-      return false;
-
-    CUfunction cuFinalize;
-    check_result_cuda_ret(
-        cuModuleGetFunction(&cuFinalize, cuda_filter_module, "kernel_cuda_filter_finalize"));
-    check_result_cuda_ret(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
-    void *finalize_args[] = {&output_ptr,
-                             &task->storage.rank.device_pointer,
-                             &task->storage.XtWX.device_pointer,
-                             &task->storage.XtWY.device_pointer,
-                             &task->filter_area,
-                             &task->reconstruction_state.buffer_params.x,
-                             &task->render_buffer.samples};
-    CUDA_GET_BLOCKSIZE(
-        cuFinalize, task->reconstruction_state.source_w, task->reconstruction_state.source_h);
-    CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
-    check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));
-
-    return !have_error();
-  }
-
-  bool denoising_combine_halves(device_ptr a_ptr,
-                                device_ptr b_ptr,
-                                device_ptr mean_ptr,
-                                device_ptr variance_ptr,
-                                int r,
-                                int4 rect,
-                                DenoisingTask *task,
-                                int thread_index)
-  {
-    if (have_error())
-      return false;
-
-    CUfunction cuFilterCombineHalves;
-    check_result_cuda_ret(cuModuleGetFunction(
-        &cuFilterCombineHalves, cuda_filter_module, "kernel_cuda_filter_combine_halves"));
-    check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
-    CUDA_GET_BLOCKSIZE(
-        cuFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
-    void *args[] = {&mean_ptr, &variance_ptr, &a_ptr, &b_ptr, &rect, &r};
-    CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
-    check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));
-
-    return !have_error();
-  }
-
-  bool denoising_divide_shadow(device_ptr a_ptr,
-                               device_ptr b_ptr,
-                               device_ptr sample_variance_ptr,
-                               device_ptr sv_variance_ptr,
-                               device_ptr buffer_variance_ptr,
-                               DenoisingTask *task,
-                               int thread_index)
-  {
-    if (have_error())
-      return false;
-
-    CUfunction cuFilterDivideShadow;
-    check_result_cuda_ret(cuModuleGetFunction(
-        &cuFilterDivideShadow, cuda_filter_module, "kernel_cuda_filter_divide_shadow"));
-    check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
-    CUDA_GET_BLOCKSIZE(
-        cuFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
-    void *args[] = {&task->render_buffer.samples,
-                    &task->tile_info_mem.device_pointer,
-                    &a_ptr,
-                    &b_ptr,
-                    &sample_variance_ptr,
-                    &sv_variance_ptr,
-                    &buffer_variance_ptr,
-                    &task->rect,
-                    &task->render_buffer.pass_stride,
-                    &task->render_buffer.offset};
-    CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
-    check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));
-
-    return !have_error();
-  }
-
-  bool denoising_get_feature(int mean_offset,
-                             int variance_offset,
-                             device_ptr mean_ptr,
-                             device_ptr variance_ptr,
-                             float scale,
-                             DenoisingTask *task,
-                             int thread_index)
-  {
-    if (have_error())
-      return false;
-
-    CUfunction cuFilterGetFeature;
-    check_result_cuda_ret(cuModuleGetFunction(
-        &cuFilterGetFeature, cuda_filter_module, "kernel_cuda_filter_get_feature"));
-    check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
-    CUDA_GET_BLOCKSIZE(
-        cuFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
-    void *args[] = {&task->render_buffer.samples,
-                    &task->tile_info_mem.device_pointer,
-                    &mean_offset,
-                    &variance_offset,
-                    &mean_ptr,
-                    &variance_ptr,
-                    &scale,
-                    &task->rect,
-                    &task->render_buffer.pass_stride,
-                    &task->render_buffer.offset};
-    CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
-    check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));
-
-    return !have_error();
-  }
-
-  bool denoising_write_feature(int out_offset,
-                               device_ptr from_ptr,
-                               device_ptr buffer_ptr,
-                               DenoisingTask *task,
-                               int thread_index)
-  {
-    if (have_error())
-      return false;
-
-    CUfunction cuFilterWriteFeature;
-    check_result_cuda_ret(cuModuleGetFunction(
-        &cuFilterWriteFeature, cuda_filter_module, "kernel_cuda_filter_write_feature"));
-    check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1));
-    CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, task->filter_area.z, task->filter_area.w);
-
-    void *args[] = {&task->render_buffer.samples,
-                    &task->reconstruction_state.buffer_params,
-                    &task->filter_area,
-                    &from_ptr,
-                    &buffer_ptr,
-                    &out_offset,
-                    &task->rect};
-    CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args);
-    check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));
-
-    return !have_error();
-  }
-
-  bool denoising_detect_outliers(device_ptr image_ptr,
-                                 device_ptr variance_ptr,
-                                 device_ptr depth_ptr,
-                                 device_ptr output_ptr,
-                                 DenoisingTask *task,
-                                 int thread_index)
-  {
-    if (have_error())
-      return false;
-
-    CUfunction cuFilterDetectOutliers;
-    check_result_cuda_ret(cuModuleGetFunction(
-        &cuFilterDetectOutliers, cuda_filter_module, "kernel_cuda_filter_detect_outliers"));
-    check_result_cuda_ret(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1));
-    CUDA_GET_BLOCKSIZE(
-        cuFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
-    void *args[] = {&image_ptr,
-                    &variance_ptr,
-                    &depth_ptr,
-                    &output_ptr,
-                    &task->rect,
-                    &task->buffer.pass_stride};
-
-    CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args);
-    check_result_cuda_ret(cuStreamSynchronize(cuda_stream[thread_index]));
-
-    return !have_error();
-  }
 };
 
 bool device_optix_init()
diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/device_opencl.h
index 61b1e3e3b6b..61b1e3e3b6b 100644
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/device_opencl.h
diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/device_opencl_impl.cpp
index af40aa89db4..012f6dbe114 100644
--- a/intern/cycles/device/opencl/opencl_split.cpp
+++ b/intern/cycles/device/opencl/device_opencl_impl.cpp
@@ -16,7 +16,7 @@
 
 #ifdef WITH_OPENCL
 
-# include "device/opencl/opencl.h"
+# include "device/opencl/device_opencl.h"
 
 # include "kernel/kernel_types.h"
 # include "kernel/split/kernel_split_data_types.h"
diff --git a/intern/cycles/device/opencl/memory_manager.cpp b/intern/cycles/device/opencl/memory_manager.cpp
index 06d4746a86e..fedb3ea8c6a 100644
--- a/intern/cycles/device/opencl/memory_manager.cpp
+++ b/intern/cycles/device/opencl/memory_manager.cpp
@@ -18,7 +18,7 @@
 
 # include "util/util_foreach.h"
 
-# include "device/opencl/opencl.h"
+# include "device/opencl/device_opencl.h"
 # include "device/opencl/memory_manager.h"
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp
index 3eeff31f8c2..f59f109db70 100644
--- a/intern/cycles/device/opencl/opencl_util.cpp
+++ b/intern/cycles/device/opencl/opencl_util.cpp
@@ -16,7 +16,7 @@
 
 #ifdef WITH_OPENCL
 
-# include "device/opencl/opencl.h"
+# include "device/opencl/device_opencl.h"
 
 # include "device/device_intern.h"
 # include "util/util_debug.h"
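
Note on the allocation strategy consolidated by this commit: the generic_alloc() code removed from device_optix.cpp above (now shared via CUDADevice) first tries cuMemAlloc while keeping a headroom of device memory free, then falls back to pinned, device-mapped host memory via cuMemHostAlloc. The following is a minimal standalone sketch of that pattern against the CUDA driver API; it is not Cycles code, and the headroom value and the check()/alloc_with_fallback() helper names are illustrative assumptions only.

#include <cstdio>
#include <cstdlib>
#include <cuda.h>

static void check(CUresult result, const char *what)
{
  if (result != CUDA_SUCCESS) {
    const char *name = nullptr;
    cuGetErrorName(result, &name);
    fprintf(stderr, "%s failed: %s\n", what, name ? name : "unknown");
    exit(EXIT_FAILURE);
  }
}

/* Returns a device-usable pointer. *mapped_host is set when the allocation
 * actually lives in pinned host memory instead of device memory. */
static CUdeviceptr alloc_with_fallback(size_t size, size_t headroom, void **mapped_host)
{
  *mapped_host = nullptr;

  size_t free_mem = 0, total_mem = 0;
  check(cuMemGetInfo(&free_mem, &total_mem), "cuMemGetInfo");

  /* Prefer device memory, but only if the headroom stays free afterwards. */
  CUdeviceptr ptr = 0;
  if (size + headroom < free_mem && cuMemAlloc(&ptr, size) == CUDA_SUCCESS) {
    return ptr;
  }

  /* Fall back to pinned host memory that the device can address directly. */
  check(cuMemHostAlloc(
            mapped_host, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED),
        "cuMemHostAlloc");
  check(cuMemHostGetDevicePointer(&ptr, *mapped_host, 0), "cuMemHostGetDevicePointer");
  return ptr;
}

int main()
{
  check(cuInit(0), "cuInit");
  CUdevice device;
  check(cuDeviceGet(&device, 0), "cuDeviceGet");
  /* CU_CTX_MAP_HOST is required for device-mapped host allocations. */
  CUcontext context;
  check(cuCtxCreate(&context, CU_CTX_MAP_HOST, device), "cuCtxCreate");

  void *mapped_host = nullptr;
  CUdeviceptr ptr = alloc_with_fallback(64 << 20, 256 << 20, &mapped_host);
  printf("allocated in %s memory\n", mapped_host ? "mapped host" : "device");

  if (mapped_host)
    cuMemFreeHost(mapped_host);
  else
    cuMemFree(ptr);
  cuCtxDestroy(context);
  return 0;
}

The write-combined flag mirrors the removed Cycles code: such memory is fast for the host to fill and for the device to read, at the cost of slow host reads, which fits one-way texture uploads. The real implementation additionally reference-counts the shared host pointer across devices and moves textures to host memory under pressure, as seen in generic_free() and move_textures_to_host() above.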