git.blender.org/blender.git
Diffstat (limited to 'intern/cycles/device/device_cuda.cpp')
 intern/cycles/device/device_cuda.cpp | 1224 ++++++++++++++++++++++++--------
 1 file changed, 976 insertions(+), 248 deletions(-)
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index fbb97f78e70..216c85f24e7 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -15,32 +15,39 @@
*/
#include <climits>
+#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#include "device.h"
-#include "device_intern.h"
+#include "device/device.h"
+#include "device/device_denoising.h"
+#include "device/device_intern.h"
+#include "device/device_split_kernel.h"
-#include "buffers.h"
+#include "render/buffers.h"
+
+#include "kernel/filter/filter_defines.h"
#ifdef WITH_CUDA_DYNLOAD
# include "cuew.h"
#else
-# include "util_opengl.h"
+# include "util/util_opengl.h"
# include <cuda.h>
# include <cudaGL.h>
#endif
-#include "util_debug.h"
-#include "util_logging.h"
-#include "util_map.h"
-#include "util_md5.h"
-#include "util_opengl.h"
-#include "util_path.h"
-#include "util_string.h"
-#include "util_system.h"
-#include "util_types.h"
-#include "util_time.h"
+#include "util/util_debug.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_md5.h"
+#include "util/util_opengl.h"
+#include "util/util_path.h"
+#include "util/util_string.h"
+#include "util/util_system.h"
+#include "util/util_types.h"
+#include "util/util_time.h"
+
+#include "kernel/split/kernel_split_data_types.h"
CCL_NAMESPACE_BEGIN
@@ -78,18 +85,55 @@ int cuewCompilerVersion(void)
} /* namespace */
#endif /* WITH_CUDA_DYNLOAD */
+class CUDADevice;
+
+class CUDASplitKernel : public DeviceSplitKernel {
+ CUDADevice *device;
+public:
+ explicit CUDASplitKernel(CUDADevice *device);
+
+ virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
+
+ virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
+ RenderTile& rtile,
+ int num_global_elements,
+ device_memory& kernel_globals,
+ device_memory& kernel_data_,
+ device_memory& split_data,
+ device_memory& ray_state,
+ device_memory& queue_index,
+ device_memory& use_queues_flag,
+ device_memory& work_pool_wgs);
+
+ virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name,
+ const DeviceRequestedFeatures&);
+ virtual int2 split_kernel_local_size();
+ virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
+};
+
+/* Utility to push/pop CUDA context. */
+class CUDAContextScope {
+public:
+ CUDAContextScope(CUDADevice *device);
+ ~CUDAContextScope();
+
+private:
+ CUDADevice *device;
+};
+
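The CUDAContextScope declared above replaces the explicit cuda_push_context()/cuda_pop_context() pairs that the rest of this patch removes. A minimal usage sketch of the intended RAII pattern (some_work() is a hypothetical member function, shown only for illustration; the actual cuCtxPushCurrent/cuCtxPopCurrent calls appear near the end of this diff):

	void CUDADevice::some_work()
	{
		CUDAContextScope scope(this);    /* pushes cuContext onto this thread */
		cuda_assert(cuCtxSynchronize()); /* any CUDA calls go here */
		/* the context is popped automatically when 'scope' is destroyed,
		 * including on early returns. */
	}
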
class CUDADevice : public Device
{
public:
DedicatedTaskPool task_pool;
CUdevice cuDevice;
CUcontext cuContext;
- CUmodule cuModule;
+ CUmodule cuModule, cuFilterModule;
map<device_ptr, bool> tex_interp_map;
- map<device_ptr, uint> tex_bindless_map;
+ map<device_ptr, CUtexObject> tex_bindless_map;
int cuDevId;
int cuDevArchitecture;
bool first_error;
+ CUDASplitKernel *split_kernel;
struct PixelMem {
GLuint cuPBO;
@@ -101,8 +145,8 @@ public:
map<device_ptr, PixelMem> pixel_mem_map;
/* Bindless Textures */
- device_vector<uint> bindless_mapping;
- bool need_bindless_mapping;
+ device_vector<TextureInfo> texture_info;
+ bool need_texture_info;
CUdeviceptr cuda_device_ptr(device_ptr mem)
{
@@ -115,6 +159,12 @@ public:
return path_exists(cubins_path);
}
+ virtual bool show_samples() const
+ {
+ /* The CUDADevice only processes one tile at a time, so showing samples is fine. */
+ return true;
+ }
+
/*#ifdef NDEBUG
#define cuda_abort()
#else
@@ -124,7 +174,7 @@ public:
{
if(first_error) {
fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
- fprintf(stderr, "http://www.blender.org/manual/render/cycles/gpu_rendering.html\n\n");
+ fprintf(stderr, "https://docs.blender.org/manual/en/dev/render/cycles/gpu_rendering.html\n\n");
first_error = false;
}
}
@@ -134,7 +184,7 @@ public:
CUresult result = stmt; \
\
if(result != CUDA_SUCCESS) { \
- string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
+ string message = string_printf("CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \
if(error_msg == "") \
error_msg = message; \
fprintf(stderr, "%s\n", message.c_str()); \
@@ -166,16 +216,6 @@ public:
cuda_error_documentation();
}
- void cuda_push_context()
- {
- cuda_assert(cuCtxSetCurrent(cuContext));
- }
-
- void cuda_pop_context()
- {
- cuda_assert(cuCtxSetCurrent(NULL));
- }
-
CUDADevice(DeviceInfo& info, Stats &stats, bool background_)
: Device(info, stats, background_)
{
@@ -186,7 +226,12 @@ public:
cuDevice = 0;
cuContext = 0;
- need_bindless_mapping = false;
+ cuModule = 0;
+ cuFilterModule = 0;
+
+ split_kernel = NULL;
+
+ need_texture_info = false;
/* initialize */
if(cuda_error(cuInit(0)))
@@ -218,15 +263,18 @@ public:
cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
cuDevArchitecture = major*100 + minor*10;
- cuda_pop_context();
+ /* Pop context set by cuCtxCreate. */
+ cuCtxPopCurrent(NULL);
}
~CUDADevice()
{
task_pool.stop();
+ delete split_kernel;
+
if(info.has_bindless_textures) {
- tex_free(bindless_mapping);
+ tex_free(texture_info);
}
cuda_assert(cuCtxDestroy(cuContext));
@@ -252,16 +300,22 @@ public:
return DebugFlags().cuda.adaptive_compile;
}
+ bool use_split_kernel()
+ {
+ return DebugFlags().cuda.split_kernel;
+ }
+
/* Common NVCC flags which stay the same regardless of shading model
 * and kernel sources md5, and only depend on compiler or compilation settings.
 */
string compile_kernel_get_common_cflags(
- const DeviceRequestedFeatures& requested_features)
+ const DeviceRequestedFeatures& requested_features,
+ bool filter=false, bool split=false)
{
const int cuda_version = cuewCompilerVersion();
const int machine = system_cpu_bits();
- const string kernel_path = path_get("kernel");
- const string include = kernel_path;
+ const string source_path = path_get("source");
+ const string include_path = source_path;
string cflags = string_printf("-m%d "
"--ptxas-options=\"-v\" "
"--use_fast_math "
@@ -270,8 +324,8 @@ public:
"-I\"%s\"",
machine,
cuda_version,
- include.c_str());
- if(use_adaptive_compilation()) {
+ include_path.c_str());
+ if(!filter && use_adaptive_compilation()) {
cflags += " " + requested_features.get_build_options();
}
const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
@@ -281,6 +335,11 @@ public:
#ifdef WITH_CYCLES_DEBUG
cflags += " -D__KERNEL_DEBUG__";
#endif
+
+ if(split) {
+ cflags += " -D__SPLIT__";
+ }
+
return cflags;
}
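For illustration only: on a 64-bit host with nvcc 8.0 and split=true, the assembled flags would look roughly like the line below (the middle of the format string lies outside this hunk and is elided with "...", and <source_path> stands in for the real include path); adaptive compilation and CYCLES_CUDA_EXTRA_CFLAGS append further options when enabled.

	-m64 --ptxas-options="-v" --use_fast_math ... -I"<source_path>" -D__SPLIT__
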
@@ -300,22 +359,36 @@ public:
cuda_error_message("CUDA nvcc compiler version could not be parsed.");
return false;
}
- if(cuda_version < 75) {
+ if(cuda_version < 80) {
printf("Unsupported CUDA version %d.%d detected, "
- "you need CUDA 7.5 or newer.\n",
+ "you need CUDA 8.0 or newer.\n",
major, minor);
return false;
}
- else if(cuda_version != 75 && cuda_version != 80) {
+ else if(cuda_version != 80) {
printf("CUDA version %d.%d detected, build may succeed but only "
- "CUDA 7.5 and 8.0 are officially supported.\n",
+ "CUDA 8.0 is officially supported.\n",
major, minor);
}
return true;
}
- string compile_kernel(const DeviceRequestedFeatures& requested_features)
+ string compile_kernel(const DeviceRequestedFeatures& requested_features,
+ bool filter=false, bool split=false)
{
+ const char *name, *source;
+ if(filter) {
+ name = "filter";
+ source = "filter.cu";
+ }
+ else if(split) {
+ name = "kernel_split";
+ source = "kernel_split.cu";
+ }
+ else {
+ name = "kernel";
+ source = "kernel.cu";
+ }
/* Compute cubin name. */
int major, minor;
cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
@@ -323,8 +396,8 @@ public:
/* Attempt to use kernel provided with Blender. */
if(!use_adaptive_compilation()) {
- const string cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin",
- major, minor));
+ const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin",
+ name, major, minor));
VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
if(path_exists(cubin)) {
VLOG(1) << "Using precompiled kernel.";
@@ -333,19 +406,19 @@ public:
}
const string common_cflags =
- compile_kernel_get_common_cflags(requested_features);
+ compile_kernel_get_common_cflags(requested_features, filter, split);
/* Try to use locally compiled kernel. */
- const string kernel_path = path_get("kernel");
- const string kernel_md5 = path_files_md5_hash(kernel_path);
+ const string source_path = path_get("source");
+ const string kernel_md5 = path_files_md5_hash(source_path);
/* We include cflags into md5 so changing cuda toolkit or changing other
* compiler command line arguments makes sure cubin gets re-built.
*/
const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags);
- const string cubin_file = string_printf("cycles_kernel_sm%d%d_%s.cubin",
- major, minor,
+ const string cubin_file = string_printf("cycles_%s_sm%d%d_%s.cubin",
+ name, major, minor,
cubin_md5.c_str());
const string cubin = path_cache_get(path_join("kernels", cubin_file));
VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
@@ -377,9 +450,10 @@ public:
return "";
}
const char *nvcc = cuewCompilerPath();
- const string kernel = path_join(kernel_path,
- path_join("kernels",
- path_join("cuda", "kernel.cu")));
+ const string kernel = path_join(
+ path_join(source_path, "kernel"),
+ path_join("kernels",
+ path_join("cuda", source)));
double starttime = time_dt();
printf("Compiling CUDA kernel ...\n");
@@ -418,6 +492,16 @@ public:
bool load_kernels(const DeviceRequestedFeatures& requested_features)
{
+ /* TODO(sergey): Support kernels re-load for CUDA devices.
+ *
+ * Currently re-loading kernel will invalidate memory pointers,
+ * causing problems in cuCtxSynchronize.
+ */
+ if(cuFilterModule && cuModule) {
+ VLOG(1) << "Skipping kernel reload, not currently supported.";
+ return true;
+ }
+
/* check if cuda init succeeded */
if(cuContext == 0)
return false;
@@ -427,13 +511,16 @@ public:
return false;
/* get kernel */
- string cubin = compile_kernel(requested_features);
-
+ string cubin = compile_kernel(requested_features, false, use_split_kernel());
if(cubin == "")
return false;
+ string filter_cubin = compile_kernel(requested_features, true, false);
+ if(filter_cubin == "")
+ return false;
+
/* open module */
- cuda_push_context();
+ CUDAContextScope scope(this);
string cubin_data;
CUresult result;
@@ -446,46 +533,58 @@ public:
if(cuda_error_(result, "cuModuleLoad"))
cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str()));
- cuda_pop_context();
+ if(path_read_text(filter_cubin, cubin_data))
+ result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str());
+ else
+ result = CUDA_ERROR_FILE_NOT_FOUND;
+
+ if(cuda_error_(result, "cuModuleLoad"))
+ cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str()));
return (result == CUDA_SUCCESS);
}
- void load_bindless_mapping()
+ void load_texture_info()
{
- if(info.has_bindless_textures && need_bindless_mapping) {
- tex_free(bindless_mapping);
- tex_alloc("__bindless_mapping", bindless_mapping, INTERPOLATION_NONE, EXTENSION_REPEAT);
- need_bindless_mapping = false;
+ if(info.has_bindless_textures && need_texture_info) {
+ tex_free(texture_info);
+ tex_alloc("__texture_info", texture_info, INTERPOLATION_NONE, EXTENSION_REPEAT);
+ need_texture_info = false;
}
}
- void mem_alloc(device_memory& mem, MemoryType /*type*/)
+ void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/)
{
- cuda_push_context();
+ CUDAContextScope scope(this);
+
+ if(name) {
+ VLOG(1) << "Buffer allocate: " << name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+ }
+
CUdeviceptr device_pointer;
size_t size = mem.memory_size();
cuda_assert(cuMemAlloc(&device_pointer, size));
mem.device_pointer = (device_ptr)device_pointer;
mem.device_size = size;
stats.mem_alloc(size);
- cuda_pop_context();
}
void mem_copy_to(device_memory& mem)
{
- cuda_push_context();
+ CUDAContextScope scope(this);
+
if(mem.device_pointer)
cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), (void*)mem.data_pointer, mem.memory_size()));
- cuda_pop_context();
}
void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
{
+ CUDAContextScope scope(this);
size_t offset = elem*y*w;
size_t size = elem*w*h;
- cuda_push_context();
if(mem.device_pointer) {
cuda_assert(cuMemcpyDtoH((uchar*)mem.data_pointer + offset,
(CUdeviceptr)(mem.device_pointer + offset), size));
@@ -493,25 +592,25 @@ public:
else {
memset((char*)mem.data_pointer + offset, 0, size);
}
- cuda_pop_context();
}
void mem_zero(device_memory& mem)
{
- memset((void*)mem.data_pointer, 0, mem.memory_size());
+ if(mem.data_pointer) {
+ memset((void*)mem.data_pointer, 0, mem.memory_size());
+ }
- cuda_push_context();
- if(mem.device_pointer)
+ if(mem.device_pointer) {
+ CUDAContextScope scope(this);
cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()));
- cuda_pop_context();
+ }
}
void mem_free(device_memory& mem)
{
if(mem.device_pointer) {
- cuda_push_context();
+ CUDAContextScope scope(this);
cuda_assert(cuMemFree(cuda_device_ptr(mem.device_pointer)));
- cuda_pop_context();
mem.device_pointer = 0;
@@ -520,16 +619,20 @@ public:
}
}
+ virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/, MemoryType /*type*/)
+ {
+ return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset));
+ }
+
void const_copy_to(const char *name, void *host, size_t size)
{
+ CUDAContextScope scope(this);
CUdeviceptr mem;
size_t bytes;
- cuda_push_context();
cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
//assert(bytes == size);
cuda_assert(cuMemcpyHtoD(mem, host, size));
- cuda_pop_context();
}
void tex_alloc(const char *name,
@@ -537,12 +640,13 @@ public:
InterpolationType interpolation,
ExtensionType extension)
{
+ CUDAContextScope scope(this);
+
VLOG(1) << "Texture allocate: " << name << ", "
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ")";
- /* Check if we are on sm_30 or above.
- * We use arrays and bindles textures for storage there */
+ /* Check if we are on sm_30 or above, for bindless textures. */
bool has_bindless_textures = info.has_bindless_textures;
/* General variables for both architectures */
@@ -574,20 +678,10 @@ public:
filter_mode = CU_TR_FILTER_MODE_LINEAR;
}
- CUarray_format_enum format;
- switch(mem.data_type) {
- case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
- case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
- case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
- case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
- case TYPE_HALF: format = CU_AD_FORMAT_HALF; break;
- default: assert(0); return;
- }
-
/* General variables for Fermi */
CUtexref texref = NULL;
- if(!has_bindless_textures) {
+ if(!has_bindless_textures && interpolation != INTERPOLATION_NONE) {
if(mem.data_depth > 1) {
/* Kernel uses different bind names for 2d and 3d float textures,
* so we have to adjust couple of things here.
@@ -599,59 +693,47 @@ public:
tokens[3].c_str());
}
- cuda_push_context();
cuda_assert(cuModuleGetTexRef(&texref, cuModule, bind_name.c_str()));
- cuda_pop_context();
if(!texref) {
return;
}
}
- /* Data Storage */
if(interpolation == INTERPOLATION_NONE) {
- if(has_bindless_textures) {
- mem_alloc(mem, MEM_READ_ONLY);
- mem_copy_to(mem);
-
- cuda_push_context();
+ /* Data Storage */
+ mem_alloc(NULL, mem, MEM_READ_ONLY);
+ mem_copy_to(mem);
- CUdeviceptr cumem;
- size_t cubytes;
+ CUdeviceptr cumem;
+ size_t cubytes;
- cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str()));
+ cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str()));
- if(cubytes == 8) {
- /* 64 bit device pointer */
- uint64_t ptr = mem.device_pointer;
- cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
- }
- else {
- /* 32 bit device pointer */
- uint32_t ptr = (uint32_t)mem.device_pointer;
- cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
- }
-
- cuda_pop_context();
+ if(cubytes == 8) {
+ /* 64 bit device pointer */
+ uint64_t ptr = mem.device_pointer;
+ cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
}
else {
- mem_alloc(mem, MEM_READ_ONLY);
- mem_copy_to(mem);
-
- cuda_push_context();
-
- cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size));
- cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT));
- cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER));
-
- cuda_pop_context();
+ /* 32 bit device pointer */
+ uint32_t ptr = (uint32_t)mem.device_pointer;
+ cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
}
}
- /* Texture Storage */
else {
+ /* Texture Storage */
CUarray handle = NULL;
- cuda_push_context();
+ CUarray_format_enum format;
+ switch(mem.data_type) {
+ case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
+ case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
+ case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
+ case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
+ case TYPE_HALF: format = CU_AD_FORMAT_HALF; break;
+ default: assert(0); return;
+ }
if(mem.data_depth > 1) {
CUDA_ARRAY3D_DESCRIPTOR desc;
@@ -677,7 +759,6 @@ public:
}
if(!handle) {
- cuda_pop_context();
return;
}
@@ -718,8 +799,8 @@ public:
stats.mem_alloc(size);
- /* Bindless Textures - Kepler */
if(has_bindless_textures) {
+ /* Bindless Textures - Kepler */
int flat_slot = 0;
if(string_startswith(name, "__tex_image")) {
int pos = string(name).rfind("_");
@@ -752,41 +833,39 @@ public:
}
/* Resize once */
- if(flat_slot >= bindless_mapping.size()) {
+ if(flat_slot >= texture_info.size()) {
/* Allocate some slots in advance, to reduce amount
- * of re-allocations.
- */
- bindless_mapping.resize(flat_slot + 128);
+ * of re-allocations. */
+ texture_info.resize(flat_slot + 128);
}
/* Set Mapping and tag that we need to (re-)upload to device */
- bindless_mapping.get_data()[flat_slot] = (uint)tex;
- tex_bindless_map[mem.device_pointer] = (uint)tex;
- need_bindless_mapping = true;
+ TextureInfo& info = texture_info.get_data()[flat_slot];
+ info.data = (uint64_t)tex;
+ info.cl_buffer = 0;
+ info.interpolation = interpolation;
+ info.extension = extension;
+ info.width = mem.data_width;
+ info.height = mem.data_height;
+ info.depth = mem.data_depth;
+
+ tex_bindless_map[mem.device_pointer] = tex;
+ need_texture_info = true;
}
- /* Regular Textures - Fermi */
else {
+ /* Regular Textures - Fermi */
cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT));
cuda_assert(cuTexRefSetFilterMode(texref, filter_mode));
cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES));
- }
-
- cuda_pop_context();
- }
- /* Fermi, Data and Image Textures */
- if(!has_bindless_textures) {
- cuda_push_context();
+ cuda_assert(cuTexRefSetAddressMode(texref, 0, address_mode));
+ cuda_assert(cuTexRefSetAddressMode(texref, 1, address_mode));
+ if(mem.data_depth > 1) {
+ cuda_assert(cuTexRefSetAddressMode(texref, 2, address_mode));
+ }
- cuda_assert(cuTexRefSetAddressMode(texref, 0, address_mode));
- cuda_assert(cuTexRefSetAddressMode(texref, 1, address_mode));
- if(mem.data_depth > 1) {
- cuda_assert(cuTexRefSetAddressMode(texref, 2, address_mode));
+ cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements));
}
-
- cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements));
-
- cuda_pop_context();
}
/* Fermi and Kepler */
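Judging purely from the assignments in the bindless branch above, the new TextureInfo record carries at least the fields sketched below. This is an inference from usage, not the real definition, which lives in the kernel headers and may differ in types, enum names and ordering:

	struct TextureInfo {
		uint64_t data;       /* here: the CUtexObject handle */
		uint cl_buffer;      /* unused on CUDA, set to 0 */
		uint interpolation;  /* InterpolationType */
		uint extension;      /* ExtensionType */
		uint width, height, depth;
	};
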
@@ -797,14 +876,13 @@ public:
{
if(mem.device_pointer) {
if(tex_interp_map[mem.device_pointer]) {
- cuda_push_context();
+ CUDAContextScope scope(this);
cuArrayDestroy((CUarray)mem.device_pointer);
- cuda_pop_context();
/* Free CUtexObject (Bindless Textures) */
if(info.has_bindless_textures && tex_bindless_map[mem.device_pointer]) {
- uint flat_slot = tex_bindless_map[mem.device_pointer];
- cuTexObjectDestroy(flat_slot);
+ CUtexObject tex = tex_bindless_map[mem.device_pointer];
+ cuTexObjectDestroy(tex);
}
tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
@@ -820,64 +898,466 @@ public:
}
}
- void path_trace(RenderTile& rtile, int sample, bool branched)
+ bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task)
+ {
+ mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_ONLY);
+
+ TilesInfo *tiles = (TilesInfo*) task->tiles_mem.data_pointer;
+ for(int i = 0; i < 9; i++) {
+ tiles->buffers[i] = buffers[i];
+ }
+
+ mem_copy_to(task->tiles_mem);
+
+ return !have_error();
+ }
+
+#define CUDA_GET_BLOCKSIZE(func, w, h) \
+ int threads_per_block; \
+ cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
+ int threads = (int)sqrt((float)threads_per_block); \
+ int xblocks = ((w) + threads - 1)/threads; \
+ int yblocks = ((h) + threads - 1)/threads;
+
+#define CUDA_LAUNCH_KERNEL(func, args) \
+ cuda_assert(cuLaunchKernel(func, \
+ xblocks, yblocks, 1, \
+ threads, threads, 1, \
+ 0, 0, args, 0));
+
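A quick worked example of the two helper macros above, assuming the device reports 1024 threads per block and the filter rect is 1920x1080:

	/* threads_per_block = 1024  ->  threads = (int)sqrt(1024) = 32
	 * xblocks = (1920 + 32 - 1) / 32 = 60
	 * yblocks = (1080 + 32 - 1) / 32 = 34
	 * => cuLaunchKernel(func, 60, 34, 1,  32, 32, 1,  0, 0, args, 0)
	 */
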
+ bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
+ DenoisingTask *task)
{
if(have_error())
- return;
+ return false;
+
+ CUDAContextScope scope(this);
+
+ int4 rect = task->rect;
+ int w = align_up(rect.z-rect.x, 4);
+ int h = rect.w-rect.y;
+ int r = task->nlm_state.r;
+ int f = task->nlm_state.f;
+ float a = task->nlm_state.a;
+ float k_2 = task->nlm_state.k_2;
+
+ CUdeviceptr difference = task->nlm_state.temporary_1_ptr;
+ CUdeviceptr blurDifference = task->nlm_state.temporary_2_ptr;
+ CUdeviceptr weightAccum = task->nlm_state.temporary_3_ptr;
+
+ cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*w*h));
+ cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*w*h));
+
+ CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput, cuNLMNormalize;
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+ cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+ cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
+ cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
+
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
+
+ CUDA_GET_BLOCKSIZE(cuNLMCalcDifference, rect.z-rect.x, rect.w-rect.y);
+
+ int dx, dy;
+ int4 local_rect;
+ int channel_offset = 0;
+ void *calc_difference_args[] = {&dx, &dy, &guide_ptr, &variance_ptr, &difference, &local_rect, &w, &channel_offset, &a, &k_2};
+ void *blur_args[] = {&difference, &blurDifference, &local_rect, &w, &f};
+ void *calc_weight_args[] = {&blurDifference, &difference, &local_rect, &w, &f};
+ void *update_output_args[] = {&dx, &dy, &blurDifference, &image_ptr, &out_ptr, &weightAccum, &local_rect, &w, &f};
+
+ for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+ dy = i / (2*r+1) - r;
+ dx = i % (2*r+1) - r;
+ local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy));
+
+ CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args);
+ CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
+ CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args);
+ CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
+ CUDA_LAUNCH_KERNEL(cuNLMUpdateOutput, update_output_args);
+ }
+
+ local_rect = make_int4(0, 0, rect.z-rect.x, rect.w-rect.y);
+ void *normalize_args[] = {&out_ptr, &weightAccum, &local_rect, &w};
+ CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
+ cuda_assert(cuCtxSynchronize());
+
+ return !have_error();
+ }
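The loop in denoising_non_local_means() above enumerates every (dx, dy) offset of the search window from a single index i. With an assumed radius r = 4 that is (2*4+1)^2 = 81 iterations, for example:

	/* i = 0   ->  dy = 0/9  - 4 = -4,  dx = 0%9  - 4 = -4   (upper-left offset)
	 * i = 40  ->  dy = 40/9 - 4 =  0,  dx = 40%9 - 4 =  0   (center, no offset)
	 * i = 80  ->  dy = 80/9 - 4 =  4,  dx = 80%9 - 4 =  4   (lower-right offset)
	 */
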
+
+ bool denoising_construct_transform(DenoisingTask *task)
+ {
+ if(have_error())
+ return false;
+
+ CUDAContextScope scope(this);
+
+ CUfunction cuFilterConstructTransform;
+ cuda_assert(cuModuleGetFunction(&cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
+ CUDA_GET_BLOCKSIZE(cuFilterConstructTransform,
+ task->storage.w,
+ task->storage.h);
+
+ void *args[] = {&task->buffer.mem.device_pointer,
+ &task->storage.transform.device_pointer,
+ &task->storage.rank.device_pointer,
+ &task->filter_area,
+ &task->rect,
+ &task->radius,
+ &task->pca_threshold,
+ &task->buffer.pass_stride};
+ CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
+ cuda_assert(cuCtxSynchronize());
+
+ return !have_error();
+ }
+
+ bool denoising_reconstruct(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr output_ptr,
+ DenoisingTask *task)
+ {
+ if(have_error())
+ return false;
+
+ CUDAContextScope scope(this);
+
+ mem_zero(task->storage.XtWX);
+ mem_zero(task->storage.XtWY);
+
+ CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian, cuFinalize;
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+ cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+ cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+ cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
+ cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
+
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
+ cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
+
+ CUDA_GET_BLOCKSIZE(cuNLMCalcDifference,
+ task->reconstruction_state.source_w,
+ task->reconstruction_state.source_h);
+
+ CUdeviceptr difference = task->reconstruction_state.temporary_1_ptr;
+ CUdeviceptr blurDifference = task->reconstruction_state.temporary_2_ptr;
+
+ int r = task->radius;
+ int f = 4;
+ float a = 1.0f;
+ for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
+ int dy = i / (2*r+1) - r;
+ int dx = i % (2*r+1) - r;
+
+ int local_rect[4] = {max(0, -dx), max(0, -dy),
+ task->reconstruction_state.source_w - max(0, dx),
+ task->reconstruction_state.source_h - max(0, dy)};
+
+ void *calc_difference_args[] = {&dx, &dy,
+ &color_ptr,
+ &color_variance_ptr,
+ &difference,
+ &local_rect,
+ &task->buffer.w,
+ &task->buffer.pass_stride,
+ &a,
+ &task->nlm_k_2};
+ CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args);
+
+ void *blur_args[] = {&difference,
+ &blurDifference,
+ &local_rect,
+ &task->buffer.w,
+ &f};
+ CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
+
+ void *calc_weight_args[] = {&blurDifference,
+ &difference,
+ &local_rect,
+ &task->buffer.w,
+ &f};
+ CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args);
+
+ /* Reuse previous arguments. */
+ CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
+
+ void *construct_gramian_args[] = {&dx, &dy,
+ &blurDifference,
+ &task->buffer.mem.device_pointer,
+ &task->storage.transform.device_pointer,
+ &task->storage.rank.device_pointer,
+ &task->storage.XtWX.device_pointer,
+ &task->storage.XtWY.device_pointer,
+ &local_rect,
+ &task->reconstruction_state.filter_rect,
+ &task->buffer.w,
+ &task->buffer.h,
+ &f,
+ &task->buffer.pass_stride};
+ CUDA_LAUNCH_KERNEL(cuNLMConstructGramian, construct_gramian_args);
+ }
+
+ void *finalize_args[] = {&task->buffer.w,
+ &task->buffer.h,
+ &output_ptr,
+ &task->storage.rank.device_pointer,
+ &task->storage.XtWX.device_pointer,
+ &task->storage.XtWY.device_pointer,
+ &task->filter_area,
+ &task->reconstruction_state.buffer_params.x,
+ &task->render_buffer.samples};
+ CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
+ cuda_assert(cuCtxSynchronize());
+
+ return !have_error();
+ }
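Reading the kernel and buffer names above (an interpretation, not something stated in this diff): XtWX and XtWY accumulate, per pixel, the normal equations of a weighted least-squares fit over the denoising features,

	(X^T W X) a = X^T W y

which kernel_cuda_filter_finalize presumably solves for the coefficients a to produce the reconstructed pixel value.
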
+
+ bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr,
+ device_ptr mean_ptr, device_ptr variance_ptr,
+ int r, int4 rect, DenoisingTask *task)
+ {
+ if(have_error())
+ return false;
+
+ CUDAContextScope scope(this);
+
+ CUfunction cuFilterCombineHalves;
+ cuda_assert(cuModuleGetFunction(&cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(cuFilterCombineHalves,
+ task->rect.z-task->rect.x,
+ task->rect.w-task->rect.y);
+
+ void *args[] = {&mean_ptr,
+ &variance_ptr,
+ &a_ptr,
+ &b_ptr,
+ &rect,
+ &r};
+ CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
+ cuda_assert(cuCtxSynchronize());
+
+ return !have_error();
+ }
+
+ bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr,
+ device_ptr sample_variance_ptr, device_ptr sv_variance_ptr,
+ device_ptr buffer_variance_ptr, DenoisingTask *task)
+ {
+ if(have_error())
+ return false;
+
+ CUDAContextScope scope(this);
+
+ CUfunction cuFilterDivideShadow;
+ cuda_assert(cuModuleGetFunction(&cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(cuFilterDivideShadow,
+ task->rect.z-task->rect.x,
+ task->rect.w-task->rect.y);
+
+ void *args[] = {&task->render_buffer.samples,
+ &task->tiles_mem.device_pointer,
+ &a_ptr,
+ &b_ptr,
+ &sample_variance_ptr,
+ &sv_variance_ptr,
+ &buffer_variance_ptr,
+ &task->rect,
+ &task->render_buffer.pass_stride,
+ &task->render_buffer.denoising_data_offset};
+ CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
+ cuda_assert(cuCtxSynchronize());
+
+ return !have_error();
+ }
+
+ bool denoising_get_feature(int mean_offset,
+ int variance_offset,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ DenoisingTask *task)
+ {
+ if(have_error())
+ return false;
+
+ CUDAContextScope scope(this);
+
+ CUfunction cuFilterGetFeature;
+ cuda_assert(cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(cuFilterGetFeature,
+ task->rect.z-task->rect.x,
+ task->rect.w-task->rect.y);
+
+ void *args[] = {&task->render_buffer.samples,
+ &task->tiles_mem.device_pointer,
+ &mean_offset,
+ &variance_offset,
+ &mean_ptr,
+ &variance_ptr,
+ &task->rect,
+ &task->render_buffer.pass_stride,
+ &task->render_buffer.denoising_data_offset};
+ CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
+ cuda_assert(cuCtxSynchronize());
+
+ return !have_error();
+ }
+
+ bool denoising_detect_outliers(device_ptr image_ptr,
+ device_ptr variance_ptr,
+ device_ptr depth_ptr,
+ device_ptr output_ptr,
+ DenoisingTask *task)
+ {
+ if(have_error())
+ return false;
+
+ CUDAContextScope scope(this);
+
+ CUfunction cuFilterDetectOutliers;
+ cuda_assert(cuModuleGetFunction(&cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(cuFilterDetectOutliers,
+ task->rect.z-task->rect.x,
+ task->rect.w-task->rect.y);
+
+ void *args[] = {&image_ptr,
+ &variance_ptr,
+ &depth_ptr,
+ &output_ptr,
+ &task->rect,
+ &task->buffer.pass_stride};
+
+ CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args);
+ cuda_assert(cuCtxSynchronize());
+
+ return !have_error();
+ }
+
+ void denoise(RenderTile &rtile, const DeviceTask &task)
+ {
+ DenoisingTask denoising(this);
+
+ denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising);
+ denoising.functions.reconstruct = function_bind(&CUDADevice::denoising_reconstruct, this, _1, _2, _3, &denoising);
+ denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
+ denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.detect_outliers = function_bind(&CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.set_tiles = function_bind(&CUDADevice::denoising_set_tiles, this, _1, &denoising);
+
+ denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
+ denoising.render_buffer.samples = rtile.sample;
+
+ RenderTile rtiles[9];
+ rtiles[4] = rtile;
+ task.map_neighbor_tiles(rtiles, this);
+ denoising.tiles_from_rendertiles(rtiles);
+
+ denoising.init_from_devicetask(task);
- cuda_push_context();
+ denoising.run_denoising();
+ task.unmap_neighbor_tiles(rtiles, this);
+ }
+
+ void path_trace(DeviceTask& task, RenderTile& rtile)
+ {
+ if(have_error())
+ return;
+
+ CUDAContextScope scope(this);
CUfunction cuPathTrace;
- CUdeviceptr d_buffer = cuda_device_ptr(rtile.buffer);
- CUdeviceptr d_rng_state = cuda_device_ptr(rtile.rng_state);
- /* get kernel function */
- if(branched) {
+ /* Get kernel function. */
+ if(task.integrator_branched) {
cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
}
else {
cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
}
- if(have_error())
+ if(have_error()) {
return;
+ }
- /* pass in parameters */
- void *args[] = {&d_buffer,
- &d_rng_state,
- &sample,
- &rtile.x,
- &rtile.y,
- &rtile.w,
- &rtile.h,
- &rtile.offset,
- &rtile.stride};
+ cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
- /* launch kernel */
- int threads_per_block;
- cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuPathTrace));
+ /* Allocate work tile. */
+ device_vector<WorkTile> work_tiles;
+ work_tiles.resize(1);
+
+ WorkTile *wtile = work_tiles.get_data();
+ wtile->x = rtile.x;
+ wtile->y = rtile.y;
+ wtile->w = rtile.w;
+ wtile->h = rtile.h;
+ wtile->offset = rtile.offset;
+ wtile->stride = rtile.stride;
+ wtile->buffer = (float*)cuda_device_ptr(rtile.buffer);
+ mem_alloc("work_tiles", work_tiles, MEM_READ_ONLY);
+
+ CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer);
+
+ /* Prepare work size. More step samples render faster, but for now we
+ * remain conservative for GPUs connected to a display to avoid driver
+ * timeouts and display freezing. */
+ int min_blocks, num_threads_per_block;
+ cuda_assert(cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0));
+ if(!info.display_device) {
+ min_blocks *= 8;
+ }
- /*int num_registers;
- cuda_assert(cuFuncGetAttribute(&num_registers, CU_FUNC_ATTRIBUTE_NUM_REGS, cuPathTrace));
+ uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
- printf("threads_per_block %d\n", threads_per_block);
- printf("num_registers %d\n", num_registers);*/
+ /* Render all samples. */
+ int start_sample = rtile.start_sample;
+ int end_sample = rtile.start_sample + rtile.num_samples;
- int xthreads = (int)sqrt(threads_per_block);
- int ythreads = (int)sqrt(threads_per_block);
- int xblocks = (rtile.w + xthreads - 1)/xthreads;
- int yblocks = (rtile.h + ythreads - 1)/ythreads;
+ for(int sample = start_sample; sample < end_sample; sample += step_samples) {
+ /* Setup and copy work tile to device. */
+ wtile->start_sample = sample;
+ wtile->num_samples = min(step_samples, end_sample - sample);
+ mem_copy_to(work_tiles);
- cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
+ uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
+ uint num_blocks = divide_up(total_work_size, num_threads_per_block);
- cuda_assert(cuLaunchKernel(cuPathTrace,
- xblocks , yblocks, 1, /* blocks */
- xthreads, ythreads, 1, /* threads */
- 0, 0, args, 0));
+ /* Launch kernel. */
+ void *args[] = {&d_work_tiles,
+ &total_work_size};
- cuda_assert(cuCtxSynchronize());
+ cuda_assert(cuLaunchKernel(cuPathTrace,
+ num_blocks, 1, 1,
+ num_threads_per_block, 1, 1,
+ 0, 0, args, 0));
+
+ cuda_assert(cuCtxSynchronize());
+
+ /* Update progress. */
+ rtile.sample = sample + wtile->num_samples;
+ task.update_progress(&rtile, rtile.w*rtile.h*wtile->num_samples);
+
+ if(task.get_cancel()) {
+ if(task.need_finish_queue == false)
+ break;
+ }
+ }
- cuda_pop_context();
+ mem_free(work_tiles);
}
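To make the work-size logic above concrete (all numbers assumed): say cuOccupancyMaxPotentialBlockSize suggests min_blocks = 40 and num_threads_per_block = 256 on a non-display device, and the tile is 256x256 pixels. Then:

	/* min_blocks      = 40 * 8                      = 320     (non-display boost)
	 * step_samples    = divide_up(320*256, 256*256) = divide_up(81920, 65536) = 2
	 * total_work_size = 256 * 256 * 2               = 131072  work items per launch
	 * num_blocks      = divide_up(131072, 256)      = 512
	 * i.e. each cuLaunchKernel call renders 2 samples across the whole tile.
	 */
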
void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
@@ -885,7 +1365,7 @@ public:
if(have_error())
return;
- cuda_push_context();
+ CUDAContextScope scope(this);
CUfunction cuFilmConvert;
CUdeviceptr d_rgba = map_pixels((rgba_byte)? rgba_byte: rgba_half);
@@ -930,8 +1410,6 @@ public:
0, 0, args, 0));
unmap_pixels((rgba_byte)? rgba_byte: rgba_half);
-
- cuda_pop_context();
}
void shader(DeviceTask& task)
@@ -939,19 +1417,21 @@ public:
if(have_error())
return;
- cuda_push_context();
+ CUDAContextScope scope(this);
CUfunction cuShader;
CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
CUdeviceptr d_output = cuda_device_ptr(task.shader_output);
- CUdeviceptr d_output_luma = cuda_device_ptr(task.shader_output_luma);
/* get kernel function */
if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_bake"));
}
+ else if(task.shader_eval_type == SHADER_EVAL_DISPLACE) {
+ cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace"));
+ }
else {
- cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_shader"));
+ cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background"));
}
/* do tasks in smaller chunks, so we can cancel it */
@@ -970,9 +1450,6 @@ public:
int arg = 0;
args[arg++] = &d_input;
args[arg++] = &d_output;
- if(task.shader_eval_type < SHADER_EVAL_BAKE) {
- args[arg++] = &d_output_luma;
- }
args[arg++] = &task.shader_eval_type;
if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
args[arg++] = &task.shader_filter;
@@ -1004,8 +1481,6 @@ public:
task.update_progress(NULL);
}
-
- cuda_pop_context();
}
CUdeviceptr map_pixels(device_ptr mem)
@@ -1041,7 +1516,7 @@ public:
pmem.w = mem.data_width;
pmem.h = mem.data_height;
- cuda_push_context();
+ CUDAContextScope scope(this);
glGenBuffers(1, &pmem.cuPBO);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
@@ -1065,8 +1540,6 @@ public:
CUresult result = cuGraphicsGLRegisterBuffer(&pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
if(result == CUDA_SUCCESS) {
- cuda_pop_context();
-
mem.device_pointer = pmem.cuTexId;
pixel_mem_map[mem.device_pointer] = pmem;
@@ -1080,8 +1553,6 @@ public:
glDeleteBuffers(1, &pmem.cuPBO);
glDeleteTextures(1, &pmem.cuTexId);
- cuda_pop_context();
-
background = true;
}
}
@@ -1094,7 +1565,7 @@ public:
if(!background) {
PixelMem pmem = pixel_mem_map[mem.device_pointer];
- cuda_push_context();
+ CUDAContextScope scope(this);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
uchar *pixels = (uchar*)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
@@ -1103,8 +1574,6 @@ public:
glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
- cuda_pop_context();
-
return;
}
@@ -1117,14 +1586,12 @@ public:
if(!background) {
PixelMem pmem = pixel_mem_map[mem.device_pointer];
- cuda_push_context();
+ CUDAContextScope scope(this);
cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource));
glDeleteBuffers(1, &pmem.cuPBO);
glDeleteTextures(1, &pmem.cuTexId);
- cuda_pop_context();
-
pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
mem.device_pointer = 0;
@@ -1145,7 +1612,7 @@ public:
PixelMem pmem = pixel_mem_map[mem.device_pointer];
float *vpointer;
- cuda_push_context();
+ CUDAContextScope scope(this);
/* for multi devices, this assumes the inefficient method that we allocate
* all pixels on the device even though we only render to a subset */
@@ -1234,8 +1701,6 @@ public:
glBindTexture(GL_TEXTURE_2D, 0);
glDisable(GL_TEXTURE_2D);
- cuda_pop_context();
-
return;
}
@@ -1244,44 +1709,54 @@ public:
void thread_run(DeviceTask *task)
{
- if(task->type == DeviceTask::PATH_TRACE) {
+ CUDAContextScope scope(this);
+
+ if(task->type == DeviceTask::RENDER) {
RenderTile tile;
- bool branched = task->integrator_branched;
+ DeviceRequestedFeatures requested_features;
+ if(use_split_kernel()) {
+ if(!use_adaptive_compilation()) {
+ requested_features.max_closure = 64;
+ }
- /* Upload Bindless Mapping */
- load_bindless_mapping();
+ if(split_kernel == NULL) {
+ split_kernel = new CUDASplitKernel(this);
+ split_kernel->load_kernels(requested_features);
+ }
+ }
/* keep rendering tiles until done */
while(task->acquire_tile(this, tile)) {
- int start_sample = tile.start_sample;
- int end_sample = tile.start_sample + tile.num_samples;
-
- for(int sample = start_sample; sample < end_sample; sample++) {
- if(task->get_cancel()) {
- if(task->need_finish_queue == false)
- break;
+ if(tile.task == RenderTile::PATH_TRACE) {
+ if(use_split_kernel()) {
+ device_memory void_buffer;
+ split_kernel->path_trace(task, tile, void_buffer, void_buffer);
}
+ else {
+ path_trace(*task, tile);
+ }
+ }
+ else if(tile.task == RenderTile::DENOISE) {
+ tile.sample = tile.start_sample + tile.num_samples;
- path_trace(tile, sample, branched);
-
- tile.sample = sample + 1;
+ denoise(tile, *task);
- task->update_progress(&tile);
+ task->update_progress(&tile, tile.w*tile.h);
}
task->release_tile(tile);
+
+ if(task->get_cancel()) {
+ if(task->need_finish_queue == false)
+ break;
+ }
}
}
else if(task->type == DeviceTask::SHADER) {
- /* Upload Bindless Mapping */
- load_bindless_mapping();
-
shader(*task);
- cuda_push_context();
cuda_assert(cuCtxSynchronize());
- cuda_pop_context();
}
}
@@ -1301,13 +1776,15 @@ public:
void task_add(DeviceTask& task)
{
+ CUDAContextScope scope(this);
+
+ /* Load texture info. */
+ load_texture_info();
+
if(task.type == DeviceTask::FILM_CONVERT) {
/* must be done in main thread due to opengl access */
film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
-
- cuda_push_context();
cuda_assert(cuCtxSynchronize());
- cuda_pop_context();
}
else {
task_pool.push(new CUDADeviceTask(this, task));
@@ -1323,8 +1800,236 @@ public:
{
task_pool.cancel();
}
+
+ friend class CUDASplitKernelFunction;
+ friend class CUDASplitKernel;
+ friend class CUDAContextScope;
+};
+
+/* redefine the cuda_assert macro so it can be used outside of the CUDADevice class
+ * now that the definition of that class is complete
+ */
+#undef cuda_assert
+#define cuda_assert(stmt) \
+ { \
+ CUresult result = stmt; \
+ \
+ if(result != CUDA_SUCCESS) { \
+ string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
+ if(device->error_msg == "") \
+ device->error_msg = message; \
+ fprintf(stderr, "%s\n", message.c_str()); \
+ /*cuda_abort();*/ \
+ device->cuda_error_documentation(); \
+ } \
+ } (void)0
+
+
+/* CUDA context scope. */
+
+CUDAContextScope::CUDAContextScope(CUDADevice *device)
+: device(device)
+{
+ cuda_assert(cuCtxPushCurrent(device->cuContext));
+}
+
+CUDAContextScope::~CUDAContextScope()
+{
+ cuda_assert(cuCtxPopCurrent(NULL));
+}
+
+/* split kernel */
+
+class CUDASplitKernelFunction : public SplitKernelFunction{
+ CUDADevice* device;
+ CUfunction func;
+public:
+ CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func) {}
+
+ /* enqueue the kernel, returns false if there is an error */
+ bool enqueue(const KernelDimensions &dim, device_memory &/*kg*/, device_memory &/*data*/)
+ {
+ return enqueue(dim, NULL);
+ }
+
+ /* enqueue the kernel, returns false if there is an error */
+ bool enqueue(const KernelDimensions &dim, void *args[])
+ {
+ if(device->have_error())
+ return false;
+
+ CUDAContextScope scope(device);
+
+ /* we ignore dim.local_size for now, as this is faster */
+ int threads_per_block;
+ cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
+
+ int xblocks = (dim.global_size[0]*dim.global_size[1] + threads_per_block - 1)/threads_per_block;
+
+ cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
+
+ cuda_assert(cuLaunchKernel(func,
+ xblocks, 1, 1, /* blocks */
+ threads_per_block, 1, 1, /* threads */
+ 0, 0, args, 0));
+
+ return !device->have_error();
+ }
};
+CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device)
+{
+}
+
+uint64_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory& /*data*/, size_t num_threads)
+{
+ CUDAContextScope scope(device);
+
+ device_vector<uint64_t> size_buffer;
+ size_buffer.resize(1);
+ device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE);
+
+ uint threads = num_threads;
+ CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer);
+
+ struct args_t {
+ uint* num_threads;
+ CUdeviceptr* size;
+ };
+
+ args_t args = {
+ &threads,
+ &d_size
+ };
+
+ CUfunction state_buffer_size;
+ cuda_assert(cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size"));
+
+ cuda_assert(cuLaunchKernel(state_buffer_size,
+ 1, 1, 1,
+ 1, 1, 1,
+ 0, 0, (void**)&args, 0));
+
+ device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint64_t));
+ device->mem_free(size_buffer);
+
+ return *size_buffer.get_data();
+}
+
+bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
+ RenderTile& rtile,
+ int num_global_elements,
+ device_memory& /*kernel_globals*/,
+ device_memory& /*kernel_data*/,
+ device_memory& split_data,
+ device_memory& ray_state,
+ device_memory& queue_index,
+ device_memory& use_queues_flag,
+ device_memory& work_pool_wgs)
+{
+ CUDAContextScope scope(device);
+
+ CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer);
+ CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer);
+ CUdeviceptr d_queue_index = device->cuda_device_ptr(queue_index.device_pointer);
+ CUdeviceptr d_use_queues_flag = device->cuda_device_ptr(use_queues_flag.device_pointer);
+ CUdeviceptr d_work_pool_wgs = device->cuda_device_ptr(work_pool_wgs.device_pointer);
+
+ CUdeviceptr d_buffer = device->cuda_device_ptr(rtile.buffer);
+
+ int end_sample = rtile.start_sample + rtile.num_samples;
+ int queue_size = dim.global_size[0] * dim.global_size[1];
+
+ struct args_t {
+ CUdeviceptr* split_data_buffer;
+ int* num_elements;
+ CUdeviceptr* ray_state;
+ int* start_sample;
+ int* end_sample;
+ int* sx;
+ int* sy;
+ int* sw;
+ int* sh;
+ int* offset;
+ int* stride;
+ CUdeviceptr* queue_index;
+ int* queuesize;
+ CUdeviceptr* use_queues_flag;
+ CUdeviceptr* work_pool_wgs;
+ int* num_samples;
+ CUdeviceptr* buffer;
+ };
+
+ args_t args = {
+ &d_split_data,
+ &num_global_elements,
+ &d_ray_state,
+ &rtile.start_sample,
+ &end_sample,
+ &rtile.x,
+ &rtile.y,
+ &rtile.w,
+ &rtile.h,
+ &rtile.offset,
+ &rtile.stride,
+ &d_queue_index,
+ &queue_size,
+ &d_use_queues_flag,
+ &d_work_pool_wgs,
+ &rtile.num_samples,
+ &d_buffer
+ };
+
+ CUfunction data_init;
+ cuda_assert(cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init"));
+ if(device->have_error()) {
+ return false;
+ }
+
+ CUDASplitKernelFunction(device, data_init).enqueue(dim, (void**)&args);
+
+ return !device->have_error();
+}
+
+SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(const string& kernel_name,
+ const DeviceRequestedFeatures&)
+{
+ CUDAContextScope scope(device);
+ CUfunction func;
+
+ cuda_assert(cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data()));
+ if(device->have_error()) {
+ device->cuda_error_message(string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data()));
+ return NULL;
+ }
+
+ return new CUDASplitKernelFunction(device, func);
+}
+
+int2 CUDASplitKernel::split_kernel_local_size()
+{
+ return make_int2(32, 1);
+}
+
+int2 CUDASplitKernel::split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask * /*task*/)
+{
+ CUDAContextScope scope(device);
+ size_t free;
+ size_t total;
+
+ cuda_assert(cuMemGetInfo(&free, &total));
+
+ VLOG(1) << "Maximum device allocation size: "
+ << string_human_readable_number(free) << " bytes. ("
+ << string_human_readable_size(free) << ").";
+
+ size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2);
+ size_t side = round_down((int)sqrt(num_elements), 32);
+ int2 global_size = make_int2(side, round_down(num_elements / side, 16));
+ VLOG(1) << "Global size: " << global_size << ".";
+ return global_size;
+}
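+
+/* A worked example of the sizing above (the free-memory figure is assumed):
+ * with 4 GiB reported free, half of it (2 GiB) is offered to
+ * max_elements_for_max_buffer_size(); if that returns, say, 4194304 elements:
+ *
+ *   side        = round_down((int)sqrt(4194304), 32) = round_down(2048, 32) = 2048
+ *   global_size = make_int2(2048, round_down(4194304 / 2048, 16))
+ *               = make_int2(2048, 2048)
+ */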
+
bool device_cuda_init(void)
{
#ifdef WITH_CUDA_DYNLOAD
@@ -1371,18 +2076,34 @@ Device *device_cuda_create(DeviceInfo& info, Stats &stats, bool background)
return new CUDADevice(info, stats, background);
}
-void device_cuda_info(vector<DeviceInfo>& devices)
+static CUresult device_cuda_safe_init()
{
- CUresult result;
- int count = 0;
+#ifdef _WIN32
+ __try {
+ return cuInit(0);
+ }
+ __except(EXCEPTION_EXECUTE_HANDLER) {
+ /* Ignore crashes inside the CUDA driver and hope we can
+ * survive even with corrupted CUDA installs. */
+ fprintf(stderr, "Cycles CUDA: driver crashed, continuing without CUDA.\n");
+ }
+
+ return CUDA_ERROR_NO_DEVICE;
+#else
+ return cuInit(0);
+#endif
+}
- result = cuInit(0);
+void device_cuda_info(vector<DeviceInfo>& devices)
+{
+ CUresult result = device_cuda_safe_init();
if(result != CUDA_SUCCESS) {
if(result != CUDA_ERROR_NO_DEVICE)
fprintf(stderr, "CUDA cuInit: %s\n", cuewErrorString(result));
return;
}
+ int count = 0;
result = cuDeviceGetCount(&count);
if(result != CUDA_SUCCESS) {
fprintf(stderr, "CUDA cuDeviceGetCount: %s\n", cuewErrorString(result));
@@ -1393,7 +2114,6 @@ void device_cuda_info(vector<DeviceInfo>& devices)
for(int num = 0; num < count; num++) {
char name[256];
- int attr;
if(cuDeviceGetName(name, 256, num) != CUDA_SUCCESS)
continue;
@@ -1412,7 +2132,8 @@ void device_cuda_info(vector<DeviceInfo>& devices)
info.advanced_shading = (major >= 2);
info.has_bindless_textures = (major >= 3);
- info.pack_images = false;
+ info.has_volume_decoupled = false;
+ info.has_qbvh = false;
int pci_location[3] = {0, 0, 0};
cuDeviceGetAttribute(&pci_location[0], CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, num);
@@ -1424,14 +2145,21 @@ void device_cuda_info(vector<DeviceInfo>& devices)
(unsigned int)pci_location[1],
(unsigned int)pci_location[2]);
- /* if device has a kernel timeout, assume it is used for display */
- if(cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, num) == CUDA_SUCCESS && attr == 1) {
+ /* If device has a kernel timeout and no compute preemption, we assume
+ * it is connected to a display and will freeze the display while doing
+ * computations. */
+ int timeout_attr = 0, preempt_attr = 0;
+ cuDeviceGetAttribute(&timeout_attr, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, num);
+ cuDeviceGetAttribute(&preempt_attr, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, num);
+
+ if(timeout_attr && !preempt_attr) {
info.description += " (Display)";
info.display_device = true;
display_devices.push_back(info);
}
- else
+ else {
devices.push_back(info);
+ }
}
if(!display_devices.empty())
@@ -1440,7 +2168,7 @@ void device_cuda_info(vector<DeviceInfo>& devices)
string device_cuda_capabilities(void)
{
- CUresult result = cuInit(0);
+ CUresult result = device_cuda_safe_init();
if(result != CUDA_SUCCESS) {
if(result != CUDA_ERROR_NO_DEVICE) {
return string("Error initializing CUDA: ") + cuewErrorString(result);