git.blender.org/blender.git

 intern/cycles/device/device.h             |   5
 intern/cycles/device/device_cuda.cpp      | 365
 intern/cycles/device/device_multi.cpp     |   4
 intern/cycles/kernel/geom/geom_volume.h   |  13
 intern/cycles/kernel/kernel_compat_cuda.h |  25
 intern/cycles/kernel/kernel_textures.h    |  68
 intern/cycles/kernel/svm/svm_image.h      |  82
 intern/cycles/kernel/svm/svm_voxel.h      |  17
 intern/cycles/render/image.cpp            |   6
 intern/cycles/util/util_texture.h         |   8
 10 files changed, 307 insertions(+), 286 deletions(-)
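
The patch below moves Cycles' CUDA backend from statically bound texture references to CUtexObject handles ("bindless textures") on sm_30 and newer devices. For orientation, this is a minimal, self-contained driver-API sketch of the creation pattern that tex_alloc() in device_cuda.cpp adopts; the helper name, image size and pixel buffer are hypothetical, error handling is reduced to asserts, and a current context (cuInit()/cuCtxCreate()) is assumed.

#include <assert.h>
#include <string.h>
#include <cuda.h>

/* Build a 2D float4 bindless texture from host pixels (hypothetical helper). */
static CUtexObject create_bindless_float4_tex(const float *pixels, int width, int height)
{
	/* Back the image by a CUDA array. */
	CUarray handle = NULL;
	CUDA_ARRAY_DESCRIPTOR desc;
	desc.Width = width;
	desc.Height = height;
	desc.Format = CU_AD_FORMAT_FLOAT;
	desc.NumChannels = 4;
	assert(cuArrayCreate(&handle, &desc) == CUDA_SUCCESS);

	/* Copy the host pixels into the array. */
	CUDA_MEMCPY2D param;
	memset(&param, 0, sizeof(param));
	param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
	param.dstArray = handle;
	param.srcMemoryType = CU_MEMORYTYPE_HOST;
	param.srcHost = pixels;
	param.srcPitch = width*sizeof(float)*4;
	param.WidthInBytes = param.srcPitch;
	param.Height = height;
	assert(cuMemcpy2D(&param) == CUDA_SUCCESS);

	/* Describe the resource and the sampling state, then create the texture object. */
	CUDA_RESOURCE_DESC resDesc;
	memset(&resDesc, 0, sizeof(resDesc));
	resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
	resDesc.res.array.hArray = handle;

	CUDA_TEXTURE_DESC texDesc;
	memset(&texDesc, 0, sizeof(texDesc));
	texDesc.addressMode[0] = CU_TR_ADDRESS_MODE_WRAP;
	texDesc.addressMode[1] = CU_TR_ADDRESS_MODE_WRAP;
	texDesc.filterMode = CU_TR_FILTER_MODE_LINEAR;
	texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;

	CUtexObject tex = 0;
	assert(cuTexObjectCreate(&tex, &resDesc, &texDesc, NULL) == CUDA_SUCCESS);
	return tex;  /* 64-bit handle; the patch stores it as a uint in __bindless_mapping */
}
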
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 4c1b7224837..e11bb7f76af 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -54,7 +54,7 @@ public:
bool display_device;
bool advanced_shading;
bool pack_images;
- bool extended_images; /* flag for GPU and Multi device */
+ bool has_bindless_textures; /* flag for GPU and Multi device */
bool use_split_kernel; /* Denotes if the device is going to run cycles using split-kernel */
vector<DeviceInfo> multi_devices;
@@ -66,7 +66,7 @@ public:
display_device = false;
advanced_shading = true;
pack_images = false;
- extended_images = false;
+ has_bindless_textures = false;
use_split_kernel = false;
}
};
@@ -230,6 +230,7 @@ public:
(void)interpolation; /* Ignored. */
(void)extension; /* Ignored. */
};
+
virtual void tex_free(device_memory& /*mem*/) {};
/* pixel memory */
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 12c62c0702c..39bb4426826 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -85,10 +85,10 @@ public:
CUcontext cuContext;
CUmodule cuModule;
map<device_ptr, bool> tex_interp_map;
+ map<device_ptr, uint> tex_bindless_map;
int cuDevId;
int cuDevArchitecture;
bool first_error;
- bool use_texture_storage;
struct PixelMem {
GLuint cuPBO;
@@ -99,6 +99,10 @@ public:
map<device_ptr, PixelMem> pixel_mem_map;
+ /* Bindless Textures */
+ device_vector<uint> bindless_mapping;
+ bool need_bindless_mapping;
+
CUdeviceptr cuda_device_ptr(device_ptr mem)
{
return (CUdeviceptr)mem;
@@ -176,12 +180,13 @@ public:
{
first_error = true;
background = background_;
- use_texture_storage = true;
cuDevId = info.num;
cuDevice = 0;
cuContext = 0;
+ need_bindless_mapping = false;
+
/* intialize */
if(cuda_error(cuInit(0)))
return;
@@ -211,11 +216,6 @@ public:
cuDeviceComputeCapability(&major, &minor, cuDevId);
cuDevArchitecture = major*100 + minor*10;
- /* In order to use full 6GB of memory on Titan cards, use arrays instead
- * of textures. On earlier cards this seems slower, but on Titan it is
- * actually slightly faster in tests. */
- use_texture_storage = (cuDevArchitecture < 300);
-
cuda_pop_context();
}
@@ -223,6 +223,10 @@ public:
{
task_pool.stop();
+ if(info.has_bindless_textures) {
+ tex_free(bindless_mapping);
+ }
+
cuda_assert(cuCtxDestroy(cuContext));
}
@@ -400,6 +404,15 @@ public:
return (result == CUDA_SUCCESS);
}
+ void load_bindless_mapping()
+ {
+ if(info.has_bindless_textures && need_bindless_mapping) {
+ tex_free(bindless_mapping);
+ tex_alloc("__bindless_mapping", bindless_mapping, INTERPOLATION_NONE, EXTENSION_REPEAT);
+ need_bindless_mapping = false;
+ }
+ }
+
void mem_alloc(device_memory& mem, MemoryType /*type*/)
{
cuda_push_context();
@@ -479,126 +492,99 @@ public:
{
VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
+ /* Check if we are on sm_30 or above.
+	 * We use arrays and bindless textures for storage there. */
+ bool has_bindless_textures = info.has_bindless_textures;
+
+ /* General variables for both architectures */
string bind_name = name;
- if(mem.data_depth > 1) {
- /* Kernel uses different bind names for 2d and 3d float textures,
- * so we have to adjust couple of things here.
- */
- vector<string> tokens;
- string_split(tokens, name, "_");
- bind_name = string_printf("__tex_image_%s_3d_%s",
- tokens[2].c_str(),
- tokens[3].c_str());
+ size_t dsize = datatype_size(mem.data_type);
+ size_t size = mem.memory_size();
+
+ CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
+ switch(extension) {
+ case EXTENSION_REPEAT:
+ address_mode = CU_TR_ADDRESS_MODE_WRAP;
+ break;
+ case EXTENSION_EXTEND:
+ address_mode = CU_TR_ADDRESS_MODE_CLAMP;
+ break;
+ case EXTENSION_CLIP:
+ address_mode = CU_TR_ADDRESS_MODE_BORDER;
+ break;
+ default:
+ assert(0);
+ break;
+ }
+
+ CUfilter_mode filter_mode;
+ if(interpolation == INTERPOLATION_CLOSEST) {
+ filter_mode = CU_TR_FILTER_MODE_POINT;
+ }
+ else {
+ filter_mode = CU_TR_FILTER_MODE_LINEAR;
}
- /* determine format */
CUarray_format_enum format;
- size_t dsize = datatype_size(mem.data_type);
- size_t size = mem.memory_size();
- bool use_texture = (interpolation != INTERPOLATION_NONE) || use_texture_storage;
+ switch(mem.data_type) {
+ case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
+ case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
+ case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
+ case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
+ default: assert(0); return;
+ }
- if(use_texture) {
+ /* General variables for Fermi */
+ CUtexref texref = NULL;
- switch(mem.data_type) {
- case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
- case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
- case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
- case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
- default: assert(0); return;
+ if(!has_bindless_textures) {
+ if(mem.data_depth > 1) {
+ /* Kernel uses different bind names for 2d and 3d float textures,
+	 * so we have to adjust a couple of things here.
+ */
+ vector<string> tokens;
+ string_split(tokens, name, "_");
+ bind_name = string_printf("__tex_image_%s_3d_%s",
+ tokens[2].c_str(),
+ tokens[3].c_str());
}
- CUtexref texref = NULL;
-
cuda_push_context();
cuda_assert(cuModuleGetTexRef(&texref, cuModule, bind_name.c_str()));
+ cuda_pop_context();
if(!texref) {
- cuda_pop_context();
return;
}
+ }
- if(interpolation != INTERPOLATION_NONE) {
- CUarray handle = NULL;
-
- if(mem.data_depth > 1) {
- CUDA_ARRAY3D_DESCRIPTOR desc;
-
- desc.Width = mem.data_width;
- desc.Height = mem.data_height;
- desc.Depth = mem.data_depth;
- desc.Format = format;
- desc.NumChannels = mem.data_elements;
- desc.Flags = 0;
-
- cuda_assert(cuArray3DCreate(&handle, &desc));
- }
- else {
- CUDA_ARRAY_DESCRIPTOR desc;
-
- desc.Width = mem.data_width;
- desc.Height = mem.data_height;
- desc.Format = format;
- desc.NumChannels = mem.data_elements;
-
- cuda_assert(cuArrayCreate(&handle, &desc));
- }
+ /* Data Storage */
+ if(interpolation == INTERPOLATION_NONE) {
+ if(has_bindless_textures) {
+ mem_alloc(mem, MEM_READ_ONLY);
+ mem_copy_to(mem);
- if(!handle) {
- cuda_pop_context();
- return;
- }
+ cuda_push_context();
- if(mem.data_depth > 1) {
- CUDA_MEMCPY3D param;
- memset(&param, 0, sizeof(param));
- param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
- param.dstArray = handle;
- param.srcMemoryType = CU_MEMORYTYPE_HOST;
- param.srcHost = (void*)mem.data_pointer;
- param.srcPitch = mem.data_width*dsize*mem.data_elements;
- param.WidthInBytes = param.srcPitch;
- param.Height = mem.data_height;
- param.Depth = mem.data_depth;
-
- cuda_assert(cuMemcpy3D(&param));
- }
- else if(mem.data_height > 1) {
- CUDA_MEMCPY2D param;
- memset(&param, 0, sizeof(param));
- param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
- param.dstArray = handle;
- param.srcMemoryType = CU_MEMORYTYPE_HOST;
- param.srcHost = (void*)mem.data_pointer;
- param.srcPitch = mem.data_width*dsize*mem.data_elements;
- param.WidthInBytes = param.srcPitch;
- param.Height = mem.data_height;
-
- cuda_assert(cuMemcpy2D(&param));
- }
- else
- cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size));
+ CUdeviceptr cumem;
+ size_t cubytes;
- cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT));
+ cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str()));
- if(interpolation == INTERPOLATION_CLOSEST) {
- cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT));
+ if(cubytes == 8) {
+ /* 64 bit device pointer */
+ uint64_t ptr = mem.device_pointer;
+ cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
}
- else if(interpolation == INTERPOLATION_LINEAR) {
- cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR));
- }
- else {/* CUBIC and SMART are unsupported for CUDA */
- cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR));
+ else {
+ /* 32 bit device pointer */
+ uint32_t ptr = (uint32_t)mem.device_pointer;
+ cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
}
- cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES));
-
- mem.device_pointer = (device_ptr)handle;
- mem.device_size = size;
- stats.mem_alloc(size);
+ cuda_pop_context();
}
else {
- cuda_pop_context();
-
mem_alloc(mem, MEM_READ_ONLY);
mem_copy_to(mem);
@@ -607,58 +593,149 @@ public:
cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size));
cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT));
cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER));
+
+ cuda_pop_context();
}
+ }
+ /* Texture Storage */
+ else {
+ CUarray handle = NULL;
- CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
- switch(extension) {
- case EXTENSION_REPEAT:
- address_mode = CU_TR_ADDRESS_MODE_WRAP;
- break;
- case EXTENSION_EXTEND:
- address_mode = CU_TR_ADDRESS_MODE_CLAMP;
- break;
- case EXTENSION_CLIP:
- address_mode = CU_TR_ADDRESS_MODE_BORDER;
- break;
- default:
- assert(0);
- break;
+ cuda_push_context();
+
+ if(mem.data_depth > 1) {
+ CUDA_ARRAY3D_DESCRIPTOR desc;
+
+ desc.Width = mem.data_width;
+ desc.Height = mem.data_height;
+ desc.Depth = mem.data_depth;
+ desc.Format = format;
+ desc.NumChannels = mem.data_elements;
+ desc.Flags = 0;
+
+ cuda_assert(cuArray3DCreate(&handle, &desc));
}
- cuda_assert(cuTexRefSetAddressMode(texref, 0, address_mode));
- cuda_assert(cuTexRefSetAddressMode(texref, 1, address_mode));
+ else {
+ CUDA_ARRAY_DESCRIPTOR desc;
+
+ desc.Width = mem.data_width;
+ desc.Height = mem.data_height;
+ desc.Format = format;
+ desc.NumChannels = mem.data_elements;
+
+ cuda_assert(cuArrayCreate(&handle, &desc));
+ }
+
+ if(!handle) {
+ cuda_pop_context();
+ return;
+ }
+
+ /* Allocate 3D, 2D or 1D memory */
if(mem.data_depth > 1) {
- cuda_assert(cuTexRefSetAddressMode(texref, 2, address_mode));
+ CUDA_MEMCPY3D param;
+ memset(&param, 0, sizeof(param));
+ param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+ param.dstArray = handle;
+ param.srcMemoryType = CU_MEMORYTYPE_HOST;
+ param.srcHost = (void*)mem.data_pointer;
+ param.srcPitch = mem.data_width*dsize*mem.data_elements;
+ param.WidthInBytes = param.srcPitch;
+ param.Height = mem.data_height;
+ param.Depth = mem.data_depth;
+
+ cuda_assert(cuMemcpy3D(&param));
}
+ else if(mem.data_height > 1) {
+ CUDA_MEMCPY2D param;
+ memset(&param, 0, sizeof(param));
+ param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+ param.dstArray = handle;
+ param.srcMemoryType = CU_MEMORYTYPE_HOST;
+ param.srcHost = (void*)mem.data_pointer;
+ param.srcPitch = mem.data_width*dsize*mem.data_elements;
+ param.WidthInBytes = param.srcPitch;
+ param.Height = mem.data_height;
+
+ cuda_assert(cuMemcpy2D(&param));
+ }
+ else
+ cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size));
- cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements));
+ /* Fermi and Kepler */
+ mem.device_pointer = (device_ptr)handle;
+ mem.device_size = size;
- cuda_pop_context();
- }
- else {
- mem_alloc(mem, MEM_READ_ONLY);
- mem_copy_to(mem);
+ stats.mem_alloc(size);
- cuda_push_context();
+ /* Bindless Textures - Kepler */
+ if(has_bindless_textures) {
+ int flat_slot = 0;
+ if(string_startswith(name, "__tex_image")) {
+ int pos = string(name).rfind("_");
+ flat_slot = atoi(name + pos + 1);
+ }
+ else {
+ assert(0);
+ }
- CUdeviceptr cumem;
- size_t cubytes;
+ CUDA_RESOURCE_DESC resDesc;
+ memset(&resDesc, 0, sizeof(resDesc));
+ resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
+ resDesc.res.array.hArray = handle;
+ resDesc.flags = 0;
+
+ CUDA_TEXTURE_DESC texDesc;
+ memset(&texDesc, 0, sizeof(texDesc));
+ texDesc.addressMode[0] = address_mode;
+ texDesc.addressMode[1] = address_mode;
+ texDesc.addressMode[2] = address_mode;
+ texDesc.filterMode = filter_mode;
+ texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
+
+ CUtexObject tex = 0;
+ cuda_assert(cuTexObjectCreate(&tex, &resDesc, &texDesc, NULL));
+
+ /* Safety check */
+ if((uint)tex > UINT_MAX) {
+ assert(0);
+ }
- cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str()));
+ /* Resize once */
+ if(flat_slot >= bindless_mapping.size())
+ bindless_mapping.resize(4096); /*TODO(dingto): Make this a variable */
- if(cubytes == 8) {
- /* 64 bit device pointer */
- uint64_t ptr = mem.device_pointer;
- cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
+ /* Set Mapping and tag that we need to (re-)upload to device */
+ bindless_mapping.get_data()[flat_slot] = (uint)tex;
+ tex_bindless_map[mem.device_pointer] = (uint)tex;
+ need_bindless_mapping = true;
}
+ /* Regular Textures - Fermi */
else {
- /* 32 bit device pointer */
- uint32_t ptr = (uint32_t)mem.device_pointer;
- cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
+ cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT));
+ cuda_assert(cuTexRefSetFilterMode(texref, filter_mode));
+ cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES));
+ }
+
+ cuda_pop_context();
+ }
+
+ /* Fermi, Data and Image Textures */
+ if(!has_bindless_textures) {
+ cuda_push_context();
+
+ cuda_assert(cuTexRefSetAddressMode(texref, 0, address_mode));
+ cuda_assert(cuTexRefSetAddressMode(texref, 1, address_mode));
+ if(mem.data_depth > 1) {
+ cuda_assert(cuTexRefSetAddressMode(texref, 2, address_mode));
}
+ cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements));
+
cuda_pop_context();
}
+ /* Fermi and Kepler */
tex_interp_map[mem.device_pointer] = (interpolation != INTERPOLATION_NONE);
}
@@ -670,6 +747,12 @@ public:
cuArrayDestroy((CUarray)mem.device_pointer);
cuda_pop_context();
+ /* Free CUtexObject (Bindless Textures) */
+ if(info.has_bindless_textures && tex_bindless_map[mem.device_pointer]) {
+ uint flat_slot = tex_bindless_map[mem.device_pointer];
+ cuTexObjectDestroy(flat_slot);
+ }
+
tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
mem.device_pointer = 0;
@@ -1111,6 +1194,9 @@ public:
RenderTile tile;
bool branched = task->integrator_branched;
+
+ /* Upload Bindless Mapping */
+ load_bindless_mapping();
/* keep rendering tiles until done */
while(task->acquire_tile(this, tile)) {
@@ -1134,6 +1220,9 @@ public:
}
}
else if(task->type == DeviceTask::SHADER) {
+ /* Upload Bindless Mapping */
+ load_bindless_mapping();
+
shader(*task);
cuda_push_context();
@@ -1269,7 +1358,7 @@ void device_cuda_info(vector<DeviceInfo>& devices)
info.num = num;
info.advanced_shading = (major >= 2);
- info.extended_images = (major >= 3);
+ info.has_bindless_textures = (major >= 3);
info.pack_images = false;
/* if device has a kernel timeout, assume it is used for display */
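
A condensed sketch of the host-side bookkeeping added above: each created CUtexObject is truncated to a uint and written into a flat slot of bindless_mapping, and load_bindless_mapping() re-uploads that table and points the kernel's __bindless_mapping global at it before PATH_TRACE and SHADER tasks. Names follow the patch, but the device_vector/tex_alloc plumbing is replaced by raw driver-API calls, a current context is assumed, and freeing of the previous upload is omitted.

#include <assert.h>
#include <stdint.h>
#include <vector>
#include <cuda.h>

static std::vector<unsigned int> bindless_mapping(4096, 0);  /* flat slot -> (uint)CUtexObject */
static bool need_bindless_mapping = false;

static void register_image(int flat_slot, CUtexObject tex)
{
	bindless_mapping[flat_slot] = (unsigned int)tex;
	need_bindless_mapping = true;  /* tag for (re-)upload before the next launch */
}

static void load_bindless_mapping(CUmodule cuModule)
{
	if(!need_bindless_mapping)
		return;

	/* Copy the table to the device. */
	size_t bytes = bindless_mapping.size()*sizeof(unsigned int);
	CUdeviceptr d_mapping;
	assert(cuMemAlloc(&d_mapping, bytes) == CUDA_SUCCESS);
	assert(cuMemcpyHtoD(d_mapping, bindless_mapping.data(), bytes) == CUDA_SUCCESS);

	/* Write its address into the kernel's __bindless_mapping global, the same way
	 * tex_alloc() handles INTERPOLATION_NONE data on sm_30+. */
	CUdeviceptr cumem;
	size_t cubytes;
	assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, "__bindless_mapping") == CUDA_SUCCESS);
	uint64_t ptr = (uint64_t)d_mapping;
	assert(cubytes == sizeof(ptr));
	assert(cuMemcpyHtoD(cumem, &ptr, cubytes) == CUDA_SUCCESS);

	need_bindless_mapping = false;
}
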
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index 6141f9af50f..434d0085d39 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -352,7 +352,7 @@ static bool device_multi_add(vector<DeviceInfo>& devices, DeviceType type, bool
info.advanced_shading = with_advanced_shading;
info.pack_images = false;
- info.extended_images = true;
+ info.has_bindless_textures = true;
foreach(DeviceInfo& subinfo, devices) {
if(subinfo.type == type) {
@@ -376,7 +376,7 @@ static bool device_multi_add(vector<DeviceInfo>& devices, DeviceType type, bool
if(subinfo.display_device)
info.display_device = true;
info.pack_images = info.pack_images || subinfo.pack_images;
- info.extended_images = info.extended_images && subinfo.extended_images;
+ info.has_bindless_textures = info.has_bindless_textures && subinfo.has_bindless_textures;
num_added++;
}
}
diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h
index ef02c01dec6..2044aafc877 100644
--- a/intern/cycles/kernel/geom/geom_volume.h
+++ b/intern/cycles/kernel/geom/geom_volume.h
@@ -29,7 +29,7 @@ CCL_NAMESPACE_BEGIN
/* Return position normalized to 0..1 in mesh bounds */
-#ifdef __KERNEL_GPU__
+#if defined(__KERNEL_GPU__) && __CUDA_ARCH__ < 300
ccl_device float4 volume_image_texture_3d(int id, float x, float y, float z)
{
float4 r;
@@ -65,7 +65,13 @@ ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd,
{
float3 P = volume_normalized_position(kg, sd, sd->P);
#ifdef __KERNEL_GPU__
+# if __CUDA_ARCH__ >= 300
+ CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id);
+ float f = kernel_tex_image_interp_3d_float(tex, P.x, P.y, P.z);
+ float4 r = make_float4(f, f, f, 1.0);
+# else
float4 r = volume_image_texture_3d(id, P.x, P.y, P.z);
+# endif
#else
float4 r;
if(sd->flag & SD_VOLUME_CUBIC)
@@ -84,7 +90,12 @@ ccl_device float3 volume_attribute_float3(KernelGlobals *kg, const ShaderData *s
{
float3 P = volume_normalized_position(kg, sd, sd->P);
#ifdef __KERNEL_GPU__
+# if __CUDA_ARCH__ >= 300
+ CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id);
+ float4 r = kernel_tex_image_interp_3d_float4(tex, P.x, P.y, P.z);
+# else
float4 r = volume_image_texture_3d(id, P.x, P.y, P.z);
+# endif
#else
float4 r;
if(sd->flag & SD_VOLUME_CUBIC)
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index d10d3255e1b..42314756f02 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -67,20 +67,29 @@ typedef texture<uchar4, 2, cudaReadModeNormalizedFloat> texture_image_uchar4;
/* Macros to handle different memory storage on different devices */
-/* In order to use full 6GB of memory on Titan cards, use arrays instead
- * of textures. On earlier cards this seems slower, but on Titan it is
- * actually slightly faster in tests. */
+/* On Fermi cards (4xx and 5xx), we use regular textures for both data and images.
+ * On Kepler (6xx) and above, we use Bindless Textures for images and arrays for data.
+ *
+ * Arrays are necessary in order to use the full VRAM on newer cards, and it's slightly faster.
+ * Using arrays on Fermi turned out to be slower. */
+
+/* Fermi */
#if __CUDA_ARCH__ < 300
# define __KERNEL_CUDA_TEX_STORAGE__
-#endif
-
-#ifdef __KERNEL_CUDA_TEX_STORAGE__
# define kernel_tex_fetch(t, index) tex1Dfetch(t, index)
+
+# define kernel_tex_image_interp(t, x, y) tex2D(t, x, y)
+# define kernel_tex_image_interp_3d(t, x, y, z) tex3D(t, x, y, z)
+
+/* Kepler */
#else
# define kernel_tex_fetch(t, index) t[(index)]
+
+# define kernel_tex_image_interp_float4(t, x, y) tex2D<float4>(t, x, y)
+# define kernel_tex_image_interp_float(t, x, y) tex2D<float>(t, x, y)
+# define kernel_tex_image_interp_3d_float4(t, x, y, z) tex3D<float4>(t, x, y, z)
+# define kernel_tex_image_interp_3d_float(t, x, y, z) tex3D<float>(t, x, y, z)
#endif
-#define kernel_tex_image_interp(t, x, y) tex2D(t, x, y)
-#define kernel_tex_image_interp_3d(t, x, y, z) tex3D(t, x, y, z)
#define kernel_data __data
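
On the device side, the net effect of the Kepler macro set above is that an image fetch goes through a handle read from __bindless_mapping instead of a fixed texture reference. A stripped-down, self-contained CUDA kernel showing the same two steps (hypothetical kernel; in Cycles this is hidden behind kernel_tex_fetch() and kernel_tex_image_interp_float4() at the call sites in svm_image.h and geom_volume.h):

#include <cuda_runtime.h>

/* sm_30+ only: sample image 'id' through its bindless handle. */
__global__ void sample_image(const unsigned int *bindless_mapping, int id,
                             float u, float v, float4 *out)
{
	cudaTextureObject_t tex = bindless_mapping[id];  /* 32-bit handle stored by tex_alloc() */
	*out = tex2D<float4>(tex, u, v);
}
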
diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h
index 62b0a6f2923..245d236ff97 100644
--- a/intern/cycles/kernel/kernel_textures.h
+++ b/intern/cycles/kernel/kernel_textures.h
@@ -72,6 +72,8 @@ KERNEL_TEX(float, texture_float, __lookup_table)
/* sobol */
KERNEL_TEX(uint, texture_uint, __sobol_directions)
+#ifdef __KERNEL_CUDA__
+# if __CUDA_ARCH__ < 300
/* full-float image */
KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_000)
KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_001)
@@ -174,66 +176,12 @@ KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_089)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_090)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_091)
KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_092)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_093)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_094)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_095)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_096)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_097)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_098)
-
-/* Kepler and above */
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_099)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_100)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_101)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_102)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_103)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_104)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_105)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_106)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_107)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_108)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_109)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_110)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_111)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_112)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_113)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_114)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_115)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_116)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_117)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_118)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_119)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_120)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_121)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_122)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_123)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_124)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_125)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_126)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_127)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_128)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_129)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_130)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_131)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_132)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_133)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_134)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_135)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_136)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_137)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_138)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_139)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_140)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_141)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_142)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_143)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_144)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_145)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_146)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_147)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_148)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_149)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_150)
+
+# else
+/* bindless textures */
+KERNEL_TEX(uint, texture_uint, __bindless_mapping)
+# endif
+#endif
/* packed image (opencl) */
KERNEL_TEX(uchar4, texture_uchar4, __tex_image_byte4_packed)
diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h
index faff4ce3e6d..92d2b36bbb1 100644
--- a/intern/cycles/kernel/svm/svm_image.h
+++ b/intern/cycles/kernel/svm/svm_image.h
@@ -18,11 +18,15 @@ CCL_NAMESPACE_BEGIN
/* Float4 textures on various devices. */
#if defined(__KERNEL_CPU__)
- #define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_IMAGES_CPU
+# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_IMAGES_CPU
#elif defined(__KERNEL_CUDA__)
- #define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_IMAGES_CUDA
+# if __CUDA_ARCH__ < 300
+# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_IMAGES_CUDA
+# else
+# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_IMAGES_CUDA_KEPLER
+# endif
#else
- #define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_IMAGES_OPENCL
+# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_IMAGES_OPENCL
#endif
#ifdef __KERNEL_OPENCL__
@@ -151,6 +155,7 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
#else
float4 r;
+# if __CUDA_ARCH__ < 300
/* not particularly proud of this massive switch, what are the
* alternatives?
* - use a single big 1D texture, and do our own lookup/filtering
@@ -254,72 +259,19 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
case 90: r = kernel_tex_image_interp(__tex_image_byte4_090, x, y); break;
case 91: r = kernel_tex_image_interp(__tex_image_byte4_091, x, y); break;
case 92: r = kernel_tex_image_interp(__tex_image_byte4_092, x, y); break;
-
-# if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
- case 93: r = kernel_tex_image_interp(__tex_image_byte4_093, x, y); break;
- case 94: r = kernel_tex_image_interp(__tex_image_byte4_094, x, y); break;
- case 95: r = kernel_tex_image_interp(__tex_image_byte4_095, x, y); break;
- case 96: r = kernel_tex_image_interp(__tex_image_byte4_096, x, y); break;
- case 97: r = kernel_tex_image_interp(__tex_image_byte4_097, x, y); break;
- case 98: r = kernel_tex_image_interp(__tex_image_byte4_098, x, y); break;
- case 99: r = kernel_tex_image_interp(__tex_image_byte4_099, x, y); break;
- case 100: r = kernel_tex_image_interp(__tex_image_byte4_100, x, y); break;
- case 101: r = kernel_tex_image_interp(__tex_image_byte4_101, x, y); break;
- case 102: r = kernel_tex_image_interp(__tex_image_byte4_102, x, y); break;
- case 103: r = kernel_tex_image_interp(__tex_image_byte4_103, x, y); break;
- case 104: r = kernel_tex_image_interp(__tex_image_byte4_104, x, y); break;
- case 105: r = kernel_tex_image_interp(__tex_image_byte4_105, x, y); break;
- case 106: r = kernel_tex_image_interp(__tex_image_byte4_106, x, y); break;
- case 107: r = kernel_tex_image_interp(__tex_image_byte4_107, x, y); break;
- case 108: r = kernel_tex_image_interp(__tex_image_byte4_108, x, y); break;
- case 109: r = kernel_tex_image_interp(__tex_image_byte4_109, x, y); break;
- case 110: r = kernel_tex_image_interp(__tex_image_byte4_110, x, y); break;
- case 111: r = kernel_tex_image_interp(__tex_image_byte4_111, x, y); break;
- case 112: r = kernel_tex_image_interp(__tex_image_byte4_112, x, y); break;
- case 113: r = kernel_tex_image_interp(__tex_image_byte4_113, x, y); break;
- case 114: r = kernel_tex_image_interp(__tex_image_byte4_114, x, y); break;
- case 115: r = kernel_tex_image_interp(__tex_image_byte4_115, x, y); break;
- case 116: r = kernel_tex_image_interp(__tex_image_byte4_116, x, y); break;
- case 117: r = kernel_tex_image_interp(__tex_image_byte4_117, x, y); break;
- case 118: r = kernel_tex_image_interp(__tex_image_byte4_118, x, y); break;
- case 119: r = kernel_tex_image_interp(__tex_image_byte4_119, x, y); break;
- case 120: r = kernel_tex_image_interp(__tex_image_byte4_120, x, y); break;
- case 121: r = kernel_tex_image_interp(__tex_image_byte4_121, x, y); break;
- case 122: r = kernel_tex_image_interp(__tex_image_byte4_122, x, y); break;
- case 123: r = kernel_tex_image_interp(__tex_image_byte4_123, x, y); break;
- case 124: r = kernel_tex_image_interp(__tex_image_byte4_124, x, y); break;
- case 125: r = kernel_tex_image_interp(__tex_image_byte4_125, x, y); break;
- case 126: r = kernel_tex_image_interp(__tex_image_byte4_126, x, y); break;
- case 127: r = kernel_tex_image_interp(__tex_image_byte4_127, x, y); break;
- case 128: r = kernel_tex_image_interp(__tex_image_byte4_128, x, y); break;
- case 129: r = kernel_tex_image_interp(__tex_image_byte4_129, x, y); break;
- case 130: r = kernel_tex_image_interp(__tex_image_byte4_130, x, y); break;
- case 131: r = kernel_tex_image_interp(__tex_image_byte4_131, x, y); break;
- case 132: r = kernel_tex_image_interp(__tex_image_byte4_132, x, y); break;
- case 133: r = kernel_tex_image_interp(__tex_image_byte4_133, x, y); break;
- case 134: r = kernel_tex_image_interp(__tex_image_byte4_134, x, y); break;
- case 135: r = kernel_tex_image_interp(__tex_image_byte4_135, x, y); break;
- case 136: r = kernel_tex_image_interp(__tex_image_byte4_136, x, y); break;
- case 137: r = kernel_tex_image_interp(__tex_image_byte4_137, x, y); break;
- case 138: r = kernel_tex_image_interp(__tex_image_byte4_138, x, y); break;
- case 139: r = kernel_tex_image_interp(__tex_image_byte4_139, x, y); break;
- case 140: r = kernel_tex_image_interp(__tex_image_byte4_140, x, y); break;
- case 141: r = kernel_tex_image_interp(__tex_image_byte4_141, x, y); break;
- case 142: r = kernel_tex_image_interp(__tex_image_byte4_142, x, y); break;
- case 143: r = kernel_tex_image_interp(__tex_image_byte4_143, x, y); break;
- case 144: r = kernel_tex_image_interp(__tex_image_byte4_144, x, y); break;
- case 145: r = kernel_tex_image_interp(__tex_image_byte4_145, x, y); break;
- case 146: r = kernel_tex_image_interp(__tex_image_byte4_146, x, y); break;
- case 147: r = kernel_tex_image_interp(__tex_image_byte4_147, x, y); break;
- case 148: r = kernel_tex_image_interp(__tex_image_byte4_148, x, y); break;
- case 149: r = kernel_tex_image_interp(__tex_image_byte4_149, x, y); break;
- case 150: r = kernel_tex_image_interp(__tex_image_byte4_150, x, y); break;
-# endif
-
default:
kernel_assert(0);
return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
}
+# else
+ CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id);
+ if(id < 2048) /* TODO(dingto): Make this a variable */
+ r = kernel_tex_image_interp_float4(tex, x, y);
+ else {
+ float f = kernel_tex_image_interp_float(tex, x, y);
+ r = make_float4(f, f, f, 1.0);
+ }
+# endif
#endif
#ifdef __KERNEL_SSE2__
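
The "id < 2048" test above relies on the Kepler slot layout defined in util_texture.h at the end of this patch: float4 images occupy flat slots 0..1023, byte4 images 1024..2047, and single-channel float images start at 2048, so everything below the float start can be fetched as float4. A small sketch of that decision in one place (hypothetical helper; 2048 is the hardcoded value the patch's TODO wants to turn into a variable):

/* Flat slot ranges implied by the *_CUDA_KEPLER limits in util_texture.h. */
enum {
	KEPLER_FLOAT4_START = 0,     /* 1024 float4 slots */
	KEPLER_BYTE4_START  = 1024,  /* TEX_IMAGE_BYTE4_START_CUDA_KEPLER */
	KEPLER_FLOAT_START  = 2048,  /* TEX_IMAGE_FLOAT_START_CUDA_KEPLER */
};

/* float4 and byte4 textures are both sampled with a float4 fetch; anything at or
 * above the float start is a single-channel texture and returns a scalar. */
static bool slot_sampled_as_float4(int flat_slot)
{
	return flat_slot < KEPLER_FLOAT_START;
}
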
diff --git a/intern/cycles/kernel/svm/svm_voxel.h b/intern/cycles/kernel/svm/svm_voxel.h
index 85ba2f906fa..d2cc2c3730e 100644
--- a/intern/cycles/kernel/svm/svm_voxel.h
+++ b/intern/cycles/kernel/svm/svm_voxel.h
@@ -42,10 +42,21 @@ ccl_device void svm_node_tex_voxel(KernelGlobals *kg,
tfm.w = read_node_float(kg, offset);
co = transform_point(&tfm, co);
}
+ float4 r;
# if defined(__KERNEL_GPU__)
- float4 r = volume_image_texture_3d(id, co.x, co.y, co.z);
-# else
- float4 r = kernel_tex_image_interp_3d(id, co.x, co.y, co.z);
+# if __CUDA_ARCH__ >= 300
+ CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id);
+ if(id < 2048) /* TODO(dingto): Make this a variable */
+ r = kernel_tex_image_interp_3d_float4(tex, co.x, co.y, co.z);
+ else {
+ float f = kernel_tex_image_interp_3d_float(tex, co.x, co.y, co.z);
+ r = make_float4(f, f, f, 1.0);
+ }
+# else /* __CUDA_ARCH__ >= 300 */
+ r = volume_image_texture_3d(id, co.x, co.y, co.z);
+# endif
+# else /* __KERNEL_GPU__ */
+ r = kernel_tex_image_interp_3d(id, co.x, co.y, co.z);
# endif
#else
float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
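
The voxel node above is the 3D counterpart of the 2D image fetch: single-channel volumes (slots at or above the float start) are read as float and promoted to float4. A self-contained CUDA sketch of that branch, with a hypothetical kernel standing in for the svm_node_tex_voxel() code path:

#include <cuda_runtime.h>

/* sm_30+ only: sample a 3D texture through its bindless handle. */
__global__ void sample_volume(const unsigned int *bindless_mapping, int id,
                              float x, float y, float z, float4 *out)
{
	cudaTextureObject_t tex = bindless_mapping[id];
	if(id < 2048) {
		*out = tex3D<float4>(tex, x, y, z);
	}
	else {
		float f = tex3D<float>(tex, x, y, z);
		*out = make_float4(f, f, f, 1.0f);
	}
}
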
diff --git a/intern/cycles/render/image.cpp b/intern/cycles/render/image.cpp
index 898e00fdcd9..9f40e561f59 100644
--- a/intern/cycles/render/image.cpp
+++ b/intern/cycles/render/image.cpp
@@ -49,7 +49,7 @@ ImageManager::ImageManager(const DeviceInfo& info)
tex_image_byte_start = TEX_IMAGE_BYTE_START_CPU;
}
/* CUDA (Fermi) */
- else if((info.type == DEVICE_CUDA || info.type == DEVICE_MULTI) && !info.extended_images) {
+ else if((info.type == DEVICE_CUDA || info.type == DEVICE_MULTI) && !info.has_bindless_textures) {
tex_num_images[IMAGE_DATA_TYPE_BYTE4] = TEX_NUM_BYTE4_IMAGES_CUDA;
tex_num_images[IMAGE_DATA_TYPE_FLOAT4] = TEX_NUM_FLOAT4_IMAGES_CUDA;
tex_num_images[IMAGE_DATA_TYPE_FLOAT] = TEX_NUM_FLOAT_IMAGES_CUDA;
@@ -59,7 +59,7 @@ ImageManager::ImageManager(const DeviceInfo& info)
tex_image_byte_start = TEX_IMAGE_BYTE_START_CUDA;
}
/* CUDA (Kepler and above) */
- else if((info.type == DEVICE_CUDA || info.type == DEVICE_MULTI) && info.extended_images) {
+ else if((info.type == DEVICE_CUDA || info.type == DEVICE_MULTI) && info.has_bindless_textures) {
tex_num_images[IMAGE_DATA_TYPE_BYTE4] = TEX_NUM_BYTE4_IMAGES_CUDA_KEPLER;
tex_num_images[IMAGE_DATA_TYPE_FLOAT4] = TEX_NUM_FLOAT4_IMAGES_CUDA_KEPLER;
tex_num_images[IMAGE_DATA_TYPE_FLOAT] = TEX_NUM_FLOAT_IMAGES_CUDA_KEPLER;
@@ -294,7 +294,7 @@ int ImageManager::add_image(const string& filename,
if(type == IMAGE_DATA_TYPE_FLOAT || type == IMAGE_DATA_TYPE_FLOAT4)
is_float = true;
- /* No float and byte textures on GPU yet */
+ /* No single channel textures on Fermi GPUs, use available slots */
if(type == IMAGE_DATA_TYPE_FLOAT && tex_num_images[type] == 0)
type = IMAGE_DATA_TYPE_FLOAT4;
if(type == IMAGE_DATA_TYPE_BYTE && tex_num_images[type] == 0)
diff --git a/intern/cycles/util/util_texture.h b/intern/cycles/util/util_texture.h
index 2a1cfca4fdd..6da47858133 100644
--- a/intern/cycles/util/util_texture.h
+++ b/intern/cycles/util/util_texture.h
@@ -40,10 +40,10 @@ CCL_NAMESPACE_BEGIN
#define TEX_IMAGE_BYTE_START_CUDA (TEX_NUM_FLOAT4_IMAGES_CUDA + TEX_NUM_BYTE4_IMAGES_CUDA + TEX_NUM_BYTE_IMAGES_CUDA)
/* CUDA (KEPLER and above) */
-#define TEX_NUM_BYTE4_IMAGES_CUDA_KEPLER 145
-#define TEX_NUM_FLOAT4_IMAGES_CUDA_KEPLER 5
-#define TEX_NUM_FLOAT_IMAGES_CUDA_KEPLER 0
-#define TEX_NUM_BYTE_IMAGES_CUDA_KEPLER 0
+#define TEX_NUM_BYTE4_IMAGES_CUDA_KEPLER 1024
+#define TEX_NUM_FLOAT4_IMAGES_CUDA_KEPLER 1024
+#define TEX_NUM_FLOAT_IMAGES_CUDA_KEPLER 1024
+#define TEX_NUM_BYTE_IMAGES_CUDA_KEPLER 1024
#define TEX_IMAGE_BYTE4_START_CUDA_KEPLER TEX_NUM_FLOAT4_IMAGES_CUDA_KEPLER
#define TEX_IMAGE_FLOAT_START_CUDA_KEPLER (TEX_NUM_FLOAT4_IMAGES_CUDA_KEPLER + TEX_NUM_BYTE4_IMAGES_CUDA_KEPLER)
#define TEX_IMAGE_BYTE_START_CUDA_KEPLER (TEX_NUM_FLOAT4_IMAGES_CUDA_KEPLER + TEX_NUM_BYTE4_IMAGES_CUDA_KEPLER + TEX_NUM_BYTE_IMAGES_CUDA_KEPLER)