diff options
author | Thomas Dinges <blender@dingto.org> | 2016-05-19 13:47:41 +0300 |
---|---|---|
committer | Thomas Dinges <blender@dingto.org> | 2016-05-19 14:14:37 +0300 |
commit | c9f1ed1e4c22728b1f711656ab4de56c11c65e35 (patch) | |
tree | d64b736497f386827ef327d8b35fd5d9ce91588a /intern/cycles/device/device_cuda.cpp | |
parent | 03f846ea12ba38d4686edfeef01a571329bd9385 (diff) |
Cycles: Add support for bindless textures.
This adds support for CUDA Texture objects (also known as Bindless textures) for Kepler GPUs (GeForce 6xx and above).
This is used for all 2D/3D textures, data still uses arrays as before.
User benefits:
* No more limits of image textures on Kepler.
We had 5 float4 and 145 byte4 slots there before, now we have 1024 float4 and 1024 byte4.
This can be extended further if we need to (just change the define).
* Single channel texture slots (byte and float) are now supported on Kepler as well (1024 slots for each type).
ToDo / Issues:
* 3D textures don't work yet; at least they don't show up during render. I have no idea what's wrong yet.
* Dynamically allocate bindless_mapping array?
I hope Fermi still works fine, but that should be tested on a Fermi card before pushing to master.
Part of my GSoC 2016.
Reviewers: sergey, #cycles, brecht
Subscribers: swerner, jtheninja, brecht, sergey
Differential Revision: https://developer.blender.org/D1999
Diffstat (limited to 'intern/cycles/device/device_cuda.cpp')
-rw-r--r-- | intern/cycles/device/device_cuda.cpp | 365 |
1 files changed, 227 insertions, 138 deletions
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 12c62c0702c..39bb4426826 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -85,10 +85,10 @@ public: CUcontext cuContext; CUmodule cuModule; map<device_ptr, bool> tex_interp_map; + map<device_ptr, uint> tex_bindless_map; int cuDevId; int cuDevArchitecture; bool first_error; - bool use_texture_storage; struct PixelMem { GLuint cuPBO; @@ -99,6 +99,10 @@ public: map<device_ptr, PixelMem> pixel_mem_map; + /* Bindless Textures */ + device_vector<uint> bindless_mapping; + bool need_bindless_mapping; + CUdeviceptr cuda_device_ptr(device_ptr mem) { return (CUdeviceptr)mem; @@ -176,12 +180,13 @@ public: { first_error = true; background = background_; - use_texture_storage = true; cuDevId = info.num; cuDevice = 0; cuContext = 0; + need_bindless_mapping = false; + /* intialize */ if(cuda_error(cuInit(0))) return; @@ -211,11 +216,6 @@ public: cuDeviceComputeCapability(&major, &minor, cuDevId); cuDevArchitecture = major*100 + minor*10; - /* In order to use full 6GB of memory on Titan cards, use arrays instead - * of textures. On earlier cards this seems slower, but on Titan it is - * actually slightly faster in tests. 
*/ - use_texture_storage = (cuDevArchitecture < 300); - cuda_pop_context(); } @@ -223,6 +223,10 @@ public: { task_pool.stop(); + if(info.has_bindless_textures) { + tex_free(bindless_mapping); + } + cuda_assert(cuCtxDestroy(cuContext)); } @@ -400,6 +404,15 @@ public: return (result == CUDA_SUCCESS); } + void load_bindless_mapping() + { + if(info.has_bindless_textures && need_bindless_mapping) { + tex_free(bindless_mapping); + tex_alloc("__bindless_mapping", bindless_mapping, INTERPOLATION_NONE, EXTENSION_REPEAT); + need_bindless_mapping = false; + } + } + void mem_alloc(device_memory& mem, MemoryType /*type*/) { cuda_push_context(); @@ -479,126 +492,99 @@ public: { VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes."; + /* Check if we are on sm_30 or above. + * We use arrays and bindles textures for storage there */ + bool has_bindless_textures = info.has_bindless_textures; + + /* General variables for both architectures */ string bind_name = name; - if(mem.data_depth > 1) { - /* Kernel uses different bind names for 2d and 3d float textures, - * so we have to adjust couple of things here. 
- */ - vector<string> tokens; - string_split(tokens, name, "_"); - bind_name = string_printf("__tex_image_%s_3d_%s", - tokens[2].c_str(), - tokens[3].c_str()); + size_t dsize = datatype_size(mem.data_type); + size_t size = mem.memory_size(); + + CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP; + switch(extension) { + case EXTENSION_REPEAT: + address_mode = CU_TR_ADDRESS_MODE_WRAP; + break; + case EXTENSION_EXTEND: + address_mode = CU_TR_ADDRESS_MODE_CLAMP; + break; + case EXTENSION_CLIP: + address_mode = CU_TR_ADDRESS_MODE_BORDER; + break; + default: + assert(0); + break; + } + + CUfilter_mode filter_mode; + if(interpolation == INTERPOLATION_CLOSEST) { + filter_mode = CU_TR_FILTER_MODE_POINT; + } + else { + filter_mode = CU_TR_FILTER_MODE_LINEAR; } - /* determine format */ CUarray_format_enum format; - size_t dsize = datatype_size(mem.data_type); - size_t size = mem.memory_size(); - bool use_texture = (interpolation != INTERPOLATION_NONE) || use_texture_storage; + switch(mem.data_type) { + case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break; + case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break; + case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break; + case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break; + default: assert(0); return; + } - if(use_texture) { + /* General variables for Fermi */ + CUtexref texref = NULL; - switch(mem.data_type) { - case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break; - case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break; - case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break; - case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break; - default: assert(0); return; + if(!has_bindless_textures) { + if(mem.data_depth > 1) { + /* Kernel uses different bind names for 2d and 3d float textures, + * so we have to adjust couple of things here. 
+ */ + vector<string> tokens; + string_split(tokens, name, "_"); + bind_name = string_printf("__tex_image_%s_3d_%s", + tokens[2].c_str(), + tokens[3].c_str()); } - CUtexref texref = NULL; - cuda_push_context(); cuda_assert(cuModuleGetTexRef(&texref, cuModule, bind_name.c_str())); + cuda_pop_context(); if(!texref) { - cuda_pop_context(); return; } + } - if(interpolation != INTERPOLATION_NONE) { - CUarray handle = NULL; - - if(mem.data_depth > 1) { - CUDA_ARRAY3D_DESCRIPTOR desc; - - desc.Width = mem.data_width; - desc.Height = mem.data_height; - desc.Depth = mem.data_depth; - desc.Format = format; - desc.NumChannels = mem.data_elements; - desc.Flags = 0; - - cuda_assert(cuArray3DCreate(&handle, &desc)); - } - else { - CUDA_ARRAY_DESCRIPTOR desc; - - desc.Width = mem.data_width; - desc.Height = mem.data_height; - desc.Format = format; - desc.NumChannels = mem.data_elements; - - cuda_assert(cuArrayCreate(&handle, &desc)); - } + /* Data Storage */ + if(interpolation == INTERPOLATION_NONE) { + if(has_bindless_textures) { + mem_alloc(mem, MEM_READ_ONLY); + mem_copy_to(mem); - if(!handle) { - cuda_pop_context(); - return; - } + cuda_push_context(); - if(mem.data_depth > 1) { - CUDA_MEMCPY3D param; - memset(&param, 0, sizeof(param)); - param.dstMemoryType = CU_MEMORYTYPE_ARRAY; - param.dstArray = handle; - param.srcMemoryType = CU_MEMORYTYPE_HOST; - param.srcHost = (void*)mem.data_pointer; - param.srcPitch = mem.data_width*dsize*mem.data_elements; - param.WidthInBytes = param.srcPitch; - param.Height = mem.data_height; - param.Depth = mem.data_depth; - - cuda_assert(cuMemcpy3D(&param)); - } - else if(mem.data_height > 1) { - CUDA_MEMCPY2D param; - memset(&param, 0, sizeof(param)); - param.dstMemoryType = CU_MEMORYTYPE_ARRAY; - param.dstArray = handle; - param.srcMemoryType = CU_MEMORYTYPE_HOST; - param.srcHost = (void*)mem.data_pointer; - param.srcPitch = mem.data_width*dsize*mem.data_elements; - param.WidthInBytes = param.srcPitch; - param.Height = mem.data_height; - - 
cuda_assert(cuMemcpy2D(&param)); - } - else - cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size)); + CUdeviceptr cumem; + size_t cubytes; - cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT)); + cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str())); - if(interpolation == INTERPOLATION_CLOSEST) { - cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT)); + if(cubytes == 8) { + /* 64 bit device pointer */ + uint64_t ptr = mem.device_pointer; + cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes)); } - else if(interpolation == INTERPOLATION_LINEAR) { - cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR)); - } - else {/* CUBIC and SMART are unsupported for CUDA */ - cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR)); + else { + /* 32 bit device pointer */ + uint32_t ptr = (uint32_t)mem.device_pointer; + cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes)); } - cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES)); - - mem.device_pointer = (device_ptr)handle; - mem.device_size = size; - stats.mem_alloc(size); + cuda_pop_context(); } else { - cuda_pop_context(); - mem_alloc(mem, MEM_READ_ONLY); mem_copy_to(mem); @@ -607,58 +593,149 @@ public: cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size)); cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT)); cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER)); + + cuda_pop_context(); } + } + /* Texture Storage */ + else { + CUarray handle = NULL; - CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP; - switch(extension) { - case EXTENSION_REPEAT: - address_mode = CU_TR_ADDRESS_MODE_WRAP; - break; - case EXTENSION_EXTEND: - address_mode = CU_TR_ADDRESS_MODE_CLAMP; - break; - case EXTENSION_CLIP: - address_mode = CU_TR_ADDRESS_MODE_BORDER; - break; - default: - assert(0); - break; + cuda_push_context(); + + if(mem.data_depth > 1) { + 
CUDA_ARRAY3D_DESCRIPTOR desc; + + desc.Width = mem.data_width; + desc.Height = mem.data_height; + desc.Depth = mem.data_depth; + desc.Format = format; + desc.NumChannels = mem.data_elements; + desc.Flags = 0; + + cuda_assert(cuArray3DCreate(&handle, &desc)); } - cuda_assert(cuTexRefSetAddressMode(texref, 0, address_mode)); - cuda_assert(cuTexRefSetAddressMode(texref, 1, address_mode)); + else { + CUDA_ARRAY_DESCRIPTOR desc; + + desc.Width = mem.data_width; + desc.Height = mem.data_height; + desc.Format = format; + desc.NumChannels = mem.data_elements; + + cuda_assert(cuArrayCreate(&handle, &desc)); + } + + if(!handle) { + cuda_pop_context(); + return; + } + + /* Allocate 3D, 2D or 1D memory */ if(mem.data_depth > 1) { - cuda_assert(cuTexRefSetAddressMode(texref, 2, address_mode)); + CUDA_MEMCPY3D param; + memset(&param, 0, sizeof(param)); + param.dstMemoryType = CU_MEMORYTYPE_ARRAY; + param.dstArray = handle; + param.srcMemoryType = CU_MEMORYTYPE_HOST; + param.srcHost = (void*)mem.data_pointer; + param.srcPitch = mem.data_width*dsize*mem.data_elements; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + param.Depth = mem.data_depth; + + cuda_assert(cuMemcpy3D(&param)); } + else if(mem.data_height > 1) { + CUDA_MEMCPY2D param; + memset(&param, 0, sizeof(param)); + param.dstMemoryType = CU_MEMORYTYPE_ARRAY; + param.dstArray = handle; + param.srcMemoryType = CU_MEMORYTYPE_HOST; + param.srcHost = (void*)mem.data_pointer; + param.srcPitch = mem.data_width*dsize*mem.data_elements; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + + cuda_assert(cuMemcpy2D(&param)); + } + else + cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size)); - cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements)); + /* Fermi and Kepler */ + mem.device_pointer = (device_ptr)handle; + mem.device_size = size; - cuda_pop_context(); - } - else { - mem_alloc(mem, MEM_READ_ONLY); - mem_copy_to(mem); + stats.mem_alloc(size); - cuda_push_context(); + 
/* Bindless Textures - Kepler */ + if(has_bindless_textures) { + int flat_slot = 0; + if(string_startswith(name, "__tex_image")) { + int pos = string(name).rfind("_"); + flat_slot = atoi(name + pos + 1); + } + else { + assert(0); + } - CUdeviceptr cumem; - size_t cubytes; + CUDA_RESOURCE_DESC resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + resDesc.resType = CU_RESOURCE_TYPE_ARRAY; + resDesc.res.array.hArray = handle; + resDesc.flags = 0; + + CUDA_TEXTURE_DESC texDesc; + memset(&texDesc, 0, sizeof(texDesc)); + texDesc.addressMode[0] = address_mode; + texDesc.addressMode[1] = address_mode; + texDesc.addressMode[2] = address_mode; + texDesc.filterMode = filter_mode; + texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES; + + CUtexObject tex = 0; + cuda_assert(cuTexObjectCreate(&tex, &resDesc, &texDesc, NULL)); + + /* Safety check */ + if((uint)tex > UINT_MAX) { + assert(0); + } - cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str())); + /* Resize once */ + if(flat_slot >= bindless_mapping.size()) + bindless_mapping.resize(4096); /*TODO(dingto): Make this a variable */ - if(cubytes == 8) { - /* 64 bit device pointer */ - uint64_t ptr = mem.device_pointer; - cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes)); + /* Set Mapping and tag that we need to (re-)upload to device */ + bindless_mapping.get_data()[flat_slot] = (uint)tex; + tex_bindless_map[mem.device_pointer] = (uint)tex; + need_bindless_mapping = true; } + /* Regular Textures - Fermi */ else { - /* 32 bit device pointer */ - uint32_t ptr = (uint32_t)mem.device_pointer; - cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes)); + cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT)); + cuda_assert(cuTexRefSetFilterMode(texref, filter_mode)); + cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES)); + } + + cuda_pop_context(); + } + + /* Fermi, Data and Image Textures */ + if(!has_bindless_textures) { + cuda_push_context(); + + 
cuda_assert(cuTexRefSetAddressMode(texref, 0, address_mode)); + cuda_assert(cuTexRefSetAddressMode(texref, 1, address_mode)); + if(mem.data_depth > 1) { + cuda_assert(cuTexRefSetAddressMode(texref, 2, address_mode)); } + cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements)); + cuda_pop_context(); } + /* Fermi and Kepler */ tex_interp_map[mem.device_pointer] = (interpolation != INTERPOLATION_NONE); } @@ -670,6 +747,12 @@ public: cuArrayDestroy((CUarray)mem.device_pointer); cuda_pop_context(); + /* Free CUtexObject (Bindless Textures) */ + if(info.has_bindless_textures && tex_bindless_map[mem.device_pointer]) { + uint flat_slot = tex_bindless_map[mem.device_pointer]; + cuTexObjectDestroy(flat_slot); + } + tex_interp_map.erase(tex_interp_map.find(mem.device_pointer)); mem.device_pointer = 0; @@ -1111,6 +1194,9 @@ public: RenderTile tile; bool branched = task->integrator_branched; + + /* Upload Bindless Mapping */ + load_bindless_mapping(); /* keep rendering tiles until done */ while(task->acquire_tile(this, tile)) { @@ -1134,6 +1220,9 @@ public: } } else if(task->type == DeviceTask::SHADER) { + /* Upload Bindless Mapping */ + load_bindless_mapping(); + shader(*task); cuda_push_context(); @@ -1269,7 +1358,7 @@ void device_cuda_info(vector<DeviceInfo>& devices) info.num = num; info.advanced_shading = (major >= 2); - info.extended_images = (major >= 3); + info.has_bindless_textures = (major >= 3); info.pack_images = false; /* if device has a kernel timeout, assume it is used for display */ |