diff options
-rw-r--r-- | intern/cycles/device/device.h | 5 | ||||
-rw-r--r-- | intern/cycles/device/device_cuda.cpp | 365 | ||||
-rw-r--r-- | intern/cycles/device/device_multi.cpp | 4 | ||||
-rw-r--r-- | intern/cycles/kernel/geom/geom_volume.h | 13 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_compat_cuda.h | 25 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_textures.h | 68 | ||||
-rw-r--r-- | intern/cycles/kernel/svm/svm_image.h | 82 | ||||
-rw-r--r-- | intern/cycles/kernel/svm/svm_voxel.h | 17 | ||||
-rw-r--r-- | intern/cycles/render/image.cpp | 6 | ||||
-rw-r--r-- | intern/cycles/util/util_texture.h | 8 |
10 files changed, 307 insertions, 286 deletions
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index 4c1b7224837..e11bb7f76af 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -54,7 +54,7 @@ public: bool display_device; bool advanced_shading; bool pack_images; - bool extended_images; /* flag for GPU and Multi device */ + bool has_bindless_textures; /* flag for GPU and Multi device */ bool use_split_kernel; /* Denotes if the device is going to run cycles using split-kernel */ vector<DeviceInfo> multi_devices; @@ -66,7 +66,7 @@ public: display_device = false; advanced_shading = true; pack_images = false; - extended_images = false; + has_bindless_textures = false; use_split_kernel = false; } }; @@ -230,6 +230,7 @@ public: (void)interpolation; /* Ignored. */ (void)extension; /* Ignored. */ }; + virtual void tex_free(device_memory& /*mem*/) {}; /* pixel memory */ diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 12c62c0702c..39bb4426826 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -85,10 +85,10 @@ public: CUcontext cuContext; CUmodule cuModule; map<device_ptr, bool> tex_interp_map; + map<device_ptr, uint> tex_bindless_map; int cuDevId; int cuDevArchitecture; bool first_error; - bool use_texture_storage; struct PixelMem { GLuint cuPBO; @@ -99,6 +99,10 @@ public: map<device_ptr, PixelMem> pixel_mem_map; + /* Bindless Textures */ + device_vector<uint> bindless_mapping; + bool need_bindless_mapping; + CUdeviceptr cuda_device_ptr(device_ptr mem) { return (CUdeviceptr)mem; @@ -176,12 +180,13 @@ public: { first_error = true; background = background_; - use_texture_storage = true; cuDevId = info.num; cuDevice = 0; cuContext = 0; + need_bindless_mapping = false; + /* intialize */ if(cuda_error(cuInit(0))) return; @@ -211,11 +216,6 @@ public: cuDeviceComputeCapability(&major, &minor, cuDevId); cuDevArchitecture = major*100 + minor*10; - /* In order to use full 6GB of memory 
on Titan cards, use arrays instead - * of textures. On earlier cards this seems slower, but on Titan it is - * actually slightly faster in tests. */ - use_texture_storage = (cuDevArchitecture < 300); - cuda_pop_context(); } @@ -223,6 +223,10 @@ public: { task_pool.stop(); + if(info.has_bindless_textures) { + tex_free(bindless_mapping); + } + cuda_assert(cuCtxDestroy(cuContext)); } @@ -400,6 +404,15 @@ public: return (result == CUDA_SUCCESS); } + void load_bindless_mapping() + { + if(info.has_bindless_textures && need_bindless_mapping) { + tex_free(bindless_mapping); + tex_alloc("__bindless_mapping", bindless_mapping, INTERPOLATION_NONE, EXTENSION_REPEAT); + need_bindless_mapping = false; + } + } + void mem_alloc(device_memory& mem, MemoryType /*type*/) { cuda_push_context(); @@ -479,126 +492,99 @@ public: { VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes."; + /* Check if we are on sm_30 or above. + * We use arrays and bindles textures for storage there */ + bool has_bindless_textures = info.has_bindless_textures; + + /* General variables for both architectures */ string bind_name = name; - if(mem.data_depth > 1) { - /* Kernel uses different bind names for 2d and 3d float textures, - * so we have to adjust couple of things here. 
- */ - vector<string> tokens; - string_split(tokens, name, "_"); - bind_name = string_printf("__tex_image_%s_3d_%s", - tokens[2].c_str(), - tokens[3].c_str()); + size_t dsize = datatype_size(mem.data_type); + size_t size = mem.memory_size(); + + CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP; + switch(extension) { + case EXTENSION_REPEAT: + address_mode = CU_TR_ADDRESS_MODE_WRAP; + break; + case EXTENSION_EXTEND: + address_mode = CU_TR_ADDRESS_MODE_CLAMP; + break; + case EXTENSION_CLIP: + address_mode = CU_TR_ADDRESS_MODE_BORDER; + break; + default: + assert(0); + break; + } + + CUfilter_mode filter_mode; + if(interpolation == INTERPOLATION_CLOSEST) { + filter_mode = CU_TR_FILTER_MODE_POINT; + } + else { + filter_mode = CU_TR_FILTER_MODE_LINEAR; } - /* determine format */ CUarray_format_enum format; - size_t dsize = datatype_size(mem.data_type); - size_t size = mem.memory_size(); - bool use_texture = (interpolation != INTERPOLATION_NONE) || use_texture_storage; + switch(mem.data_type) { + case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break; + case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break; + case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break; + case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break; + default: assert(0); return; + } - if(use_texture) { + /* General variables for Fermi */ + CUtexref texref = NULL; - switch(mem.data_type) { - case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break; - case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break; - case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break; - case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break; - default: assert(0); return; + if(!has_bindless_textures) { + if(mem.data_depth > 1) { + /* Kernel uses different bind names for 2d and 3d float textures, + * so we have to adjust couple of things here. 
+ */ + vector<string> tokens; + string_split(tokens, name, "_"); + bind_name = string_printf("__tex_image_%s_3d_%s", + tokens[2].c_str(), + tokens[3].c_str()); } - CUtexref texref = NULL; - cuda_push_context(); cuda_assert(cuModuleGetTexRef(&texref, cuModule, bind_name.c_str())); + cuda_pop_context(); if(!texref) { - cuda_pop_context(); return; } + } - if(interpolation != INTERPOLATION_NONE) { - CUarray handle = NULL; - - if(mem.data_depth > 1) { - CUDA_ARRAY3D_DESCRIPTOR desc; - - desc.Width = mem.data_width; - desc.Height = mem.data_height; - desc.Depth = mem.data_depth; - desc.Format = format; - desc.NumChannels = mem.data_elements; - desc.Flags = 0; - - cuda_assert(cuArray3DCreate(&handle, &desc)); - } - else { - CUDA_ARRAY_DESCRIPTOR desc; - - desc.Width = mem.data_width; - desc.Height = mem.data_height; - desc.Format = format; - desc.NumChannels = mem.data_elements; - - cuda_assert(cuArrayCreate(&handle, &desc)); - } + /* Data Storage */ + if(interpolation == INTERPOLATION_NONE) { + if(has_bindless_textures) { + mem_alloc(mem, MEM_READ_ONLY); + mem_copy_to(mem); - if(!handle) { - cuda_pop_context(); - return; - } + cuda_push_context(); - if(mem.data_depth > 1) { - CUDA_MEMCPY3D param; - memset(&param, 0, sizeof(param)); - param.dstMemoryType = CU_MEMORYTYPE_ARRAY; - param.dstArray = handle; - param.srcMemoryType = CU_MEMORYTYPE_HOST; - param.srcHost = (void*)mem.data_pointer; - param.srcPitch = mem.data_width*dsize*mem.data_elements; - param.WidthInBytes = param.srcPitch; - param.Height = mem.data_height; - param.Depth = mem.data_depth; - - cuda_assert(cuMemcpy3D(&param)); - } - else if(mem.data_height > 1) { - CUDA_MEMCPY2D param; - memset(&param, 0, sizeof(param)); - param.dstMemoryType = CU_MEMORYTYPE_ARRAY; - param.dstArray = handle; - param.srcMemoryType = CU_MEMORYTYPE_HOST; - param.srcHost = (void*)mem.data_pointer; - param.srcPitch = mem.data_width*dsize*mem.data_elements; - param.WidthInBytes = param.srcPitch; - param.Height = mem.data_height; - - 
cuda_assert(cuMemcpy2D(&param)); - } - else - cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size)); + CUdeviceptr cumem; + size_t cubytes; - cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT)); + cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str())); - if(interpolation == INTERPOLATION_CLOSEST) { - cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT)); + if(cubytes == 8) { + /* 64 bit device pointer */ + uint64_t ptr = mem.device_pointer; + cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes)); } - else if(interpolation == INTERPOLATION_LINEAR) { - cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR)); - } - else {/* CUBIC and SMART are unsupported for CUDA */ - cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR)); + else { + /* 32 bit device pointer */ + uint32_t ptr = (uint32_t)mem.device_pointer; + cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes)); } - cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES)); - - mem.device_pointer = (device_ptr)handle; - mem.device_size = size; - stats.mem_alloc(size); + cuda_pop_context(); } else { - cuda_pop_context(); - mem_alloc(mem, MEM_READ_ONLY); mem_copy_to(mem); @@ -607,58 +593,149 @@ public: cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size)); cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT)); cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER)); + + cuda_pop_context(); } + } + /* Texture Storage */ + else { + CUarray handle = NULL; - CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP; - switch(extension) { - case EXTENSION_REPEAT: - address_mode = CU_TR_ADDRESS_MODE_WRAP; - break; - case EXTENSION_EXTEND: - address_mode = CU_TR_ADDRESS_MODE_CLAMP; - break; - case EXTENSION_CLIP: - address_mode = CU_TR_ADDRESS_MODE_BORDER; - break; - default: - assert(0); - break; + cuda_push_context(); + + if(mem.data_depth > 1) { + 
CUDA_ARRAY3D_DESCRIPTOR desc; + + desc.Width = mem.data_width; + desc.Height = mem.data_height; + desc.Depth = mem.data_depth; + desc.Format = format; + desc.NumChannels = mem.data_elements; + desc.Flags = 0; + + cuda_assert(cuArray3DCreate(&handle, &desc)); } - cuda_assert(cuTexRefSetAddressMode(texref, 0, address_mode)); - cuda_assert(cuTexRefSetAddressMode(texref, 1, address_mode)); + else { + CUDA_ARRAY_DESCRIPTOR desc; + + desc.Width = mem.data_width; + desc.Height = mem.data_height; + desc.Format = format; + desc.NumChannels = mem.data_elements; + + cuda_assert(cuArrayCreate(&handle, &desc)); + } + + if(!handle) { + cuda_pop_context(); + return; + } + + /* Allocate 3D, 2D or 1D memory */ if(mem.data_depth > 1) { - cuda_assert(cuTexRefSetAddressMode(texref, 2, address_mode)); + CUDA_MEMCPY3D param; + memset(&param, 0, sizeof(param)); + param.dstMemoryType = CU_MEMORYTYPE_ARRAY; + param.dstArray = handle; + param.srcMemoryType = CU_MEMORYTYPE_HOST; + param.srcHost = (void*)mem.data_pointer; + param.srcPitch = mem.data_width*dsize*mem.data_elements; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + param.Depth = mem.data_depth; + + cuda_assert(cuMemcpy3D(&param)); } + else if(mem.data_height > 1) { + CUDA_MEMCPY2D param; + memset(&param, 0, sizeof(param)); + param.dstMemoryType = CU_MEMORYTYPE_ARRAY; + param.dstArray = handle; + param.srcMemoryType = CU_MEMORYTYPE_HOST; + param.srcHost = (void*)mem.data_pointer; + param.srcPitch = mem.data_width*dsize*mem.data_elements; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + + cuda_assert(cuMemcpy2D(&param)); + } + else + cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size)); - cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements)); + /* Fermi and Kepler */ + mem.device_pointer = (device_ptr)handle; + mem.device_size = size; - cuda_pop_context(); - } - else { - mem_alloc(mem, MEM_READ_ONLY); - mem_copy_to(mem); + stats.mem_alloc(size); - cuda_push_context(); + 
/* Bindless Textures - Kepler */ + if(has_bindless_textures) { + int flat_slot = 0; + if(string_startswith(name, "__tex_image")) { + int pos = string(name).rfind("_"); + flat_slot = atoi(name + pos + 1); + } + else { + assert(0); + } - CUdeviceptr cumem; - size_t cubytes; + CUDA_RESOURCE_DESC resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + resDesc.resType = CU_RESOURCE_TYPE_ARRAY; + resDesc.res.array.hArray = handle; + resDesc.flags = 0; + + CUDA_TEXTURE_DESC texDesc; + memset(&texDesc, 0, sizeof(texDesc)); + texDesc.addressMode[0] = address_mode; + texDesc.addressMode[1] = address_mode; + texDesc.addressMode[2] = address_mode; + texDesc.filterMode = filter_mode; + texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES; + + CUtexObject tex = 0; + cuda_assert(cuTexObjectCreate(&tex, &resDesc, &texDesc, NULL)); + + /* Safety check */ + if((uint)tex > UINT_MAX) { + assert(0); + } - cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str())); + /* Resize once */ + if(flat_slot >= bindless_mapping.size()) + bindless_mapping.resize(4096); /*TODO(dingto): Make this a variable */ - if(cubytes == 8) { - /* 64 bit device pointer */ - uint64_t ptr = mem.device_pointer; - cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes)); + /* Set Mapping and tag that we need to (re-)upload to device */ + bindless_mapping.get_data()[flat_slot] = (uint)tex; + tex_bindless_map[mem.device_pointer] = (uint)tex; + need_bindless_mapping = true; } + /* Regular Textures - Fermi */ else { - /* 32 bit device pointer */ - uint32_t ptr = (uint32_t)mem.device_pointer; - cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes)); + cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT)); + cuda_assert(cuTexRefSetFilterMode(texref, filter_mode)); + cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES)); + } + + cuda_pop_context(); + } + + /* Fermi, Data and Image Textures */ + if(!has_bindless_textures) { + cuda_push_context(); + + 
cuda_assert(cuTexRefSetAddressMode(texref, 0, address_mode)); + cuda_assert(cuTexRefSetAddressMode(texref, 1, address_mode)); + if(mem.data_depth > 1) { + cuda_assert(cuTexRefSetAddressMode(texref, 2, address_mode)); } + cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements)); + cuda_pop_context(); } + /* Fermi and Kepler */ tex_interp_map[mem.device_pointer] = (interpolation != INTERPOLATION_NONE); } @@ -670,6 +747,12 @@ public: cuArrayDestroy((CUarray)mem.device_pointer); cuda_pop_context(); + /* Free CUtexObject (Bindless Textures) */ + if(info.has_bindless_textures && tex_bindless_map[mem.device_pointer]) { + uint flat_slot = tex_bindless_map[mem.device_pointer]; + cuTexObjectDestroy(flat_slot); + } + tex_interp_map.erase(tex_interp_map.find(mem.device_pointer)); mem.device_pointer = 0; @@ -1111,6 +1194,9 @@ public: RenderTile tile; bool branched = task->integrator_branched; + + /* Upload Bindless Mapping */ + load_bindless_mapping(); /* keep rendering tiles until done */ while(task->acquire_tile(this, tile)) { @@ -1134,6 +1220,9 @@ public: } } else if(task->type == DeviceTask::SHADER) { + /* Upload Bindless Mapping */ + load_bindless_mapping(); + shader(*task); cuda_push_context(); @@ -1269,7 +1358,7 @@ void device_cuda_info(vector<DeviceInfo>& devices) info.num = num; info.advanced_shading = (major >= 2); - info.extended_images = (major >= 3); + info.has_bindless_textures = (major >= 3); info.pack_images = false; /* if device has a kernel timeout, assume it is used for display */ diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp index 6141f9af50f..434d0085d39 100644 --- a/intern/cycles/device/device_multi.cpp +++ b/intern/cycles/device/device_multi.cpp @@ -352,7 +352,7 @@ static bool device_multi_add(vector<DeviceInfo>& devices, DeviceType type, bool info.advanced_shading = with_advanced_shading; info.pack_images = false; - info.extended_images = true; + info.has_bindless_textures = true; foreach(DeviceInfo& 
subinfo, devices) { if(subinfo.type == type) { @@ -376,7 +376,7 @@ static bool device_multi_add(vector<DeviceInfo>& devices, DeviceType type, bool if(subinfo.display_device) info.display_device = true; info.pack_images = info.pack_images || subinfo.pack_images; - info.extended_images = info.extended_images && subinfo.extended_images; + info.has_bindless_textures = info.has_bindless_textures && subinfo.has_bindless_textures; num_added++; } } diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h index ef02c01dec6..2044aafc877 100644 --- a/intern/cycles/kernel/geom/geom_volume.h +++ b/intern/cycles/kernel/geom/geom_volume.h @@ -29,7 +29,7 @@ CCL_NAMESPACE_BEGIN /* Return position normalized to 0..1 in mesh bounds */ -#ifdef __KERNEL_GPU__ +#if defined(__KERNEL_GPU__) && __CUDA_ARCH__ < 300 ccl_device float4 volume_image_texture_3d(int id, float x, float y, float z) { float4 r; @@ -65,7 +65,13 @@ ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd, { float3 P = volume_normalized_position(kg, sd, sd->P); #ifdef __KERNEL_GPU__ +# if __CUDA_ARCH__ >= 300 + CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id); + float f = kernel_tex_image_interp_3d_float(tex, P.x, P.y, P.z); + float4 r = make_float4(f, f, f, 1.0); +# else float4 r = volume_image_texture_3d(id, P.x, P.y, P.z); +# endif #else float4 r; if(sd->flag & SD_VOLUME_CUBIC) @@ -84,7 +90,12 @@ ccl_device float3 volume_attribute_float3(KernelGlobals *kg, const ShaderData *s { float3 P = volume_normalized_position(kg, sd, sd->P); #ifdef __KERNEL_GPU__ +# if __CUDA_ARCH__ >= 300 + CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id); + float4 r = kernel_tex_image_interp_3d_float4(tex, P.x, P.y, P.z); +# else float4 r = volume_image_texture_3d(id, P.x, P.y, P.z); +# endif #else float4 r; if(sd->flag & SD_VOLUME_CUBIC) diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index 
d10d3255e1b..42314756f02 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -67,20 +67,29 @@ typedef texture<uchar4, 2, cudaReadModeNormalizedFloat> texture_image_uchar4; /* Macros to handle different memory storage on different devices */ -/* In order to use full 6GB of memory on Titan cards, use arrays instead - * of textures. On earlier cards this seems slower, but on Titan it is - * actually slightly faster in tests. */ +/* On Fermi cards (4xx and 5xx), we use regular textures for both data and images. + * On Kepler (6xx) and above, we use Bindless Textures for images and arrays for data. + * + * Arrays are necessary in order to use the full VRAM on newer cards, and it's slightly faster. + * Using Arrays on Fermi turned out to be slower.*/ + +/* Fermi */ #if __CUDA_ARCH__ < 300 # define __KERNEL_CUDA_TEX_STORAGE__ -#endif - -#ifdef __KERNEL_CUDA_TEX_STORAGE__ # define kernel_tex_fetch(t, index) tex1Dfetch(t, index) + +# define kernel_tex_image_interp(t, x, y) tex2D(t, x, y) +# define kernel_tex_image_interp_3d(t, x, y, z) tex3D(t, x, y, z) + +/* Kepler */ #else # define kernel_tex_fetch(t, index) t[(index)] + +# define kernel_tex_image_interp_float4(t, x, y) tex2D<float4>(t, x, y) +# define kernel_tex_image_interp_float(t, x, y) tex2D<float>(t, x, y) +# define kernel_tex_image_interp_3d_float4(t, x, y, z) tex3D<float4>(t, x, y, z) +# define kernel_tex_image_interp_3d_float(t, x, y, z) tex3D<float>(t, x, y, z) #endif -#define kernel_tex_image_interp(t, x, y) tex2D(t, x, y) -#define kernel_tex_image_interp_3d(t, x, y, z) tex3D(t, x, y, z) #define kernel_data __data diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h index 62b0a6f2923..245d236ff97 100644 --- a/intern/cycles/kernel/kernel_textures.h +++ b/intern/cycles/kernel/kernel_textures.h @@ -72,6 +72,8 @@ KERNEL_TEX(float, texture_float, __lookup_table) /* sobol */ KERNEL_TEX(uint, texture_uint, __sobol_directions) 
+#ifdef __KERNEL_CUDA__ +# if __CUDA_ARCH__ < 300 /* full-float image */ KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_000) KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_001) @@ -174,66 +176,12 @@ KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_089) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_090) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_091) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_092) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_093) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_094) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_095) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_096) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_097) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_098) - -/* Kepler and above */ -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_099) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_100) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_101) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_102) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_103) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_104) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_105) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_106) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_107) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_108) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_109) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_110) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_111) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_112) -KERNEL_IMAGE_TEX(uchar4, 
texture_image_uchar4, __tex_image_byte4_113) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_114) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_115) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_116) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_117) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_118) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_119) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_120) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_121) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_122) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_123) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_124) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_125) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_126) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_127) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_128) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_129) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_130) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_131) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_132) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_133) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_134) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_135) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_136) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_137) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_138) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_139) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_140) -KERNEL_IMAGE_TEX(uchar4, 
texture_image_uchar4, __tex_image_byte4_141) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_142) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_143) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_144) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_145) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_146) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_147) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_148) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_149) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_150) + +# else +/* bindless textures */ +KERNEL_TEX(uint, texture_uint, __bindless_mapping) +# endif +#endif /* packed image (opencl) */ KERNEL_TEX(uchar4, texture_uchar4, __tex_image_byte4_packed) diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h index faff4ce3e6d..92d2b36bbb1 100644 --- a/intern/cycles/kernel/svm/svm_image.h +++ b/intern/cycles/kernel/svm/svm_image.h @@ -18,11 +18,15 @@ CCL_NAMESPACE_BEGIN /* Float4 textures on various devices. */ #if defined(__KERNEL_CPU__) - #define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_IMAGES_CPU +# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_IMAGES_CPU #elif defined(__KERNEL_CUDA__) - #define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_IMAGES_CUDA +# if __CUDA_ARCH__ < 300 +# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_IMAGES_CUDA +# else +# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_IMAGES_CUDA_KEPLER +# endif #else - #define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_IMAGES_OPENCL +# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_IMAGES_OPENCL #endif #ifdef __KERNEL_OPENCL__ @@ -151,6 +155,7 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, #else float4 r; +# if __CUDA_ARCH__ < 300 /* not particularly proud of this massive switch, what are the * alternatives? 
* - use a single big 1D texture, and do our own lookup/filtering @@ -254,72 +259,19 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, case 90: r = kernel_tex_image_interp(__tex_image_byte4_090, x, y); break; case 91: r = kernel_tex_image_interp(__tex_image_byte4_091, x, y); break; case 92: r = kernel_tex_image_interp(__tex_image_byte4_092, x, y); break; - -# if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300) - case 93: r = kernel_tex_image_interp(__tex_image_byte4_093, x, y); break; - case 94: r = kernel_tex_image_interp(__tex_image_byte4_094, x, y); break; - case 95: r = kernel_tex_image_interp(__tex_image_byte4_095, x, y); break; - case 96: r = kernel_tex_image_interp(__tex_image_byte4_096, x, y); break; - case 97: r = kernel_tex_image_interp(__tex_image_byte4_097, x, y); break; - case 98: r = kernel_tex_image_interp(__tex_image_byte4_098, x, y); break; - case 99: r = kernel_tex_image_interp(__tex_image_byte4_099, x, y); break; - case 100: r = kernel_tex_image_interp(__tex_image_byte4_100, x, y); break; - case 101: r = kernel_tex_image_interp(__tex_image_byte4_101, x, y); break; - case 102: r = kernel_tex_image_interp(__tex_image_byte4_102, x, y); break; - case 103: r = kernel_tex_image_interp(__tex_image_byte4_103, x, y); break; - case 104: r = kernel_tex_image_interp(__tex_image_byte4_104, x, y); break; - case 105: r = kernel_tex_image_interp(__tex_image_byte4_105, x, y); break; - case 106: r = kernel_tex_image_interp(__tex_image_byte4_106, x, y); break; - case 107: r = kernel_tex_image_interp(__tex_image_byte4_107, x, y); break; - case 108: r = kernel_tex_image_interp(__tex_image_byte4_108, x, y); break; - case 109: r = kernel_tex_image_interp(__tex_image_byte4_109, x, y); break; - case 110: r = kernel_tex_image_interp(__tex_image_byte4_110, x, y); break; - case 111: r = kernel_tex_image_interp(__tex_image_byte4_111, x, y); break; - case 112: r = kernel_tex_image_interp(__tex_image_byte4_112, x, y); break; - case 113: r = 
kernel_tex_image_interp(__tex_image_byte4_113, x, y); break; - case 114: r = kernel_tex_image_interp(__tex_image_byte4_114, x, y); break; - case 115: r = kernel_tex_image_interp(__tex_image_byte4_115, x, y); break; - case 116: r = kernel_tex_image_interp(__tex_image_byte4_116, x, y); break; - case 117: r = kernel_tex_image_interp(__tex_image_byte4_117, x, y); break; - case 118: r = kernel_tex_image_interp(__tex_image_byte4_118, x, y); break; - case 119: r = kernel_tex_image_interp(__tex_image_byte4_119, x, y); break; - case 120: r = kernel_tex_image_interp(__tex_image_byte4_120, x, y); break; - case 121: r = kernel_tex_image_interp(__tex_image_byte4_121, x, y); break; - case 122: r = kernel_tex_image_interp(__tex_image_byte4_122, x, y); break; - case 123: r = kernel_tex_image_interp(__tex_image_byte4_123, x, y); break; - case 124: r = kernel_tex_image_interp(__tex_image_byte4_124, x, y); break; - case 125: r = kernel_tex_image_interp(__tex_image_byte4_125, x, y); break; - case 126: r = kernel_tex_image_interp(__tex_image_byte4_126, x, y); break; - case 127: r = kernel_tex_image_interp(__tex_image_byte4_127, x, y); break; - case 128: r = kernel_tex_image_interp(__tex_image_byte4_128, x, y); break; - case 129: r = kernel_tex_image_interp(__tex_image_byte4_129, x, y); break; - case 130: r = kernel_tex_image_interp(__tex_image_byte4_130, x, y); break; - case 131: r = kernel_tex_image_interp(__tex_image_byte4_131, x, y); break; - case 132: r = kernel_tex_image_interp(__tex_image_byte4_132, x, y); break; - case 133: r = kernel_tex_image_interp(__tex_image_byte4_133, x, y); break; - case 134: r = kernel_tex_image_interp(__tex_image_byte4_134, x, y); break; - case 135: r = kernel_tex_image_interp(__tex_image_byte4_135, x, y); break; - case 136: r = kernel_tex_image_interp(__tex_image_byte4_136, x, y); break; - case 137: r = kernel_tex_image_interp(__tex_image_byte4_137, x, y); break; - case 138: r = kernel_tex_image_interp(__tex_image_byte4_138, x, y); break; - case 139: r 
= kernel_tex_image_interp(__tex_image_byte4_139, x, y); break; - case 140: r = kernel_tex_image_interp(__tex_image_byte4_140, x, y); break; - case 141: r = kernel_tex_image_interp(__tex_image_byte4_141, x, y); break; - case 142: r = kernel_tex_image_interp(__tex_image_byte4_142, x, y); break; - case 143: r = kernel_tex_image_interp(__tex_image_byte4_143, x, y); break; - case 144: r = kernel_tex_image_interp(__tex_image_byte4_144, x, y); break; - case 145: r = kernel_tex_image_interp(__tex_image_byte4_145, x, y); break; - case 146: r = kernel_tex_image_interp(__tex_image_byte4_146, x, y); break; - case 147: r = kernel_tex_image_interp(__tex_image_byte4_147, x, y); break; - case 148: r = kernel_tex_image_interp(__tex_image_byte4_148, x, y); break; - case 149: r = kernel_tex_image_interp(__tex_image_byte4_149, x, y); break; - case 150: r = kernel_tex_image_interp(__tex_image_byte4_150, x, y); break; -# endif - default: kernel_assert(0); return make_float4(0.0f, 0.0f, 0.0f, 0.0f); } +# else + CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id); + if(id < 2048) /* TODO(dingto): Make this a variable */ + r = kernel_tex_image_interp_float4(tex, x, y); + else { + float f = kernel_tex_image_interp_float(tex, x, y); + r = make_float4(f, f, f, 1.0); + } +# endif #endif #ifdef __KERNEL_SSE2__ diff --git a/intern/cycles/kernel/svm/svm_voxel.h b/intern/cycles/kernel/svm/svm_voxel.h index 85ba2f906fa..d2cc2c3730e 100644 --- a/intern/cycles/kernel/svm/svm_voxel.h +++ b/intern/cycles/kernel/svm/svm_voxel.h @@ -42,10 +42,21 @@ ccl_device void svm_node_tex_voxel(KernelGlobals *kg, tfm.w = read_node_float(kg, offset); co = transform_point(&tfm, co); } + float4 r; # if defined(__KERNEL_GPU__) - float4 r = volume_image_texture_3d(id, co.x, co.y, co.z); -# else - float4 r = kernel_tex_image_interp_3d(id, co.x, co.y, co.z); +# if __CUDA_ARCH__ >= 300 + CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id); + if(id < 2048) /* TODO(dingto): Make this a variable */ + r = 
kernel_tex_image_interp_3d_float4(tex, co.x, co.y, co.z); + else { + float f = kernel_tex_image_interp_3d_float(tex, co.x, co.y, co.z); + r = make_float4(f, f, f, 1.0); + } +# else /* __CUDA_ARCH__ >= 300 */ + r = volume_image_texture_3d(id, co.x, co.y, co.z); +# endif +# else /* __KERNEL_GPU__ */ + r = kernel_tex_image_interp_3d(id, co.x, co.y, co.z); # endif #else float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f); diff --git a/intern/cycles/render/image.cpp b/intern/cycles/render/image.cpp index 898e00fdcd9..9f40e561f59 100644 --- a/intern/cycles/render/image.cpp +++ b/intern/cycles/render/image.cpp @@ -49,7 +49,7 @@ ImageManager::ImageManager(const DeviceInfo& info) tex_image_byte_start = TEX_IMAGE_BYTE_START_CPU; } /* CUDA (Fermi) */ - else if((info.type == DEVICE_CUDA || info.type == DEVICE_MULTI) && !info.extended_images) { + else if((info.type == DEVICE_CUDA || info.type == DEVICE_MULTI) && !info.has_bindless_textures) { tex_num_images[IMAGE_DATA_TYPE_BYTE4] = TEX_NUM_BYTE4_IMAGES_CUDA; tex_num_images[IMAGE_DATA_TYPE_FLOAT4] = TEX_NUM_FLOAT4_IMAGES_CUDA; tex_num_images[IMAGE_DATA_TYPE_FLOAT] = TEX_NUM_FLOAT_IMAGES_CUDA; @@ -59,7 +59,7 @@ ImageManager::ImageManager(const DeviceInfo& info) tex_image_byte_start = TEX_IMAGE_BYTE_START_CUDA; } /* CUDA (Kepler and above) */ - else if((info.type == DEVICE_CUDA || info.type == DEVICE_MULTI) && info.extended_images) { + else if((info.type == DEVICE_CUDA || info.type == DEVICE_MULTI) && info.has_bindless_textures) { tex_num_images[IMAGE_DATA_TYPE_BYTE4] = TEX_NUM_BYTE4_IMAGES_CUDA_KEPLER; tex_num_images[IMAGE_DATA_TYPE_FLOAT4] = TEX_NUM_FLOAT4_IMAGES_CUDA_KEPLER; tex_num_images[IMAGE_DATA_TYPE_FLOAT] = TEX_NUM_FLOAT_IMAGES_CUDA_KEPLER; @@ -294,7 +294,7 @@ int ImageManager::add_image(const string& filename, if(type == IMAGE_DATA_TYPE_FLOAT || type == IMAGE_DATA_TYPE_FLOAT4) is_float = true; - /* No float and byte textures on GPU yet */ + /* No single channel textures on Fermi GPUs, use available slots */ if(type == 
IMAGE_DATA_TYPE_FLOAT && tex_num_images[type] == 0) type = IMAGE_DATA_TYPE_FLOAT4; if(type == IMAGE_DATA_TYPE_BYTE && tex_num_images[type] == 0) diff --git a/intern/cycles/util/util_texture.h b/intern/cycles/util/util_texture.h index 2a1cfca4fdd..6da47858133 100644 --- a/intern/cycles/util/util_texture.h +++ b/intern/cycles/util/util_texture.h @@ -40,10 +40,10 @@ CCL_NAMESPACE_BEGIN #define TEX_IMAGE_BYTE_START_CUDA (TEX_NUM_FLOAT4_IMAGES_CUDA + TEX_NUM_BYTE4_IMAGES_CUDA + TEX_NUM_BYTE_IMAGES_CUDA) /* CUDA (KEPLER and above) */ -#define TEX_NUM_BYTE4_IMAGES_CUDA_KEPLER 145 -#define TEX_NUM_FLOAT4_IMAGES_CUDA_KEPLER 5 -#define TEX_NUM_FLOAT_IMAGES_CUDA_KEPLER 0 -#define TEX_NUM_BYTE_IMAGES_CUDA_KEPLER 0 +#define TEX_NUM_BYTE4_IMAGES_CUDA_KEPLER 1024 +#define TEX_NUM_FLOAT4_IMAGES_CUDA_KEPLER 1024 +#define TEX_NUM_FLOAT_IMAGES_CUDA_KEPLER 1024 +#define TEX_NUM_BYTE_IMAGES_CUDA_KEPLER 1024 #define TEX_IMAGE_BYTE4_START_CUDA_KEPLER TEX_NUM_FLOAT4_IMAGES_CUDA_KEPLER #define TEX_IMAGE_FLOAT_START_CUDA_KEPLER (TEX_NUM_FLOAT4_IMAGES_CUDA_KEPLER + TEX_NUM_BYTE4_IMAGES_CUDA_KEPLER) #define TEX_IMAGE_BYTE_START_CUDA_KEPLER (TEX_NUM_FLOAT4_IMAGES_CUDA_KEPLER + TEX_NUM_BYTE4_IMAGES_CUDA_KEPLER + TEX_NUM_BYTE_IMAGES_CUDA_KEPLER) |