diff options
Diffstat (limited to 'intern/cycles/kernel')
-rw-r--r-- | intern/cycles/kernel/geom/geom_volume.h | 13 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_compat_cuda.h | 25 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_textures.h | 68 | ||||
-rw-r--r-- | intern/cycles/kernel/svm/svm_image.h | 82 | ||||
-rw-r--r-- | intern/cycles/kernel/svm/svm_voxel.h | 17 |
5 files changed, 68 insertions, 137 deletions
diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h index ef02c01dec6..2044aafc877 100644 --- a/intern/cycles/kernel/geom/geom_volume.h +++ b/intern/cycles/kernel/geom/geom_volume.h @@ -29,7 +29,7 @@ CCL_NAMESPACE_BEGIN /* Return position normalized to 0..1 in mesh bounds */ -#ifdef __KERNEL_GPU__ +#if defined(__KERNEL_GPU__) && __CUDA_ARCH__ < 300 ccl_device float4 volume_image_texture_3d(int id, float x, float y, float z) { float4 r; @@ -65,7 +65,13 @@ ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd, { float3 P = volume_normalized_position(kg, sd, sd->P); #ifdef __KERNEL_GPU__ +# if __CUDA_ARCH__ >= 300 + CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id); + float f = kernel_tex_image_interp_3d_float(tex, P.x, P.y, P.z); + float4 r = make_float4(f, f, f, 1.0); +# else float4 r = volume_image_texture_3d(id, P.x, P.y, P.z); +# endif #else float4 r; if(sd->flag & SD_VOLUME_CUBIC) @@ -84,7 +90,12 @@ ccl_device float3 volume_attribute_float3(KernelGlobals *kg, const ShaderData *s { float3 P = volume_normalized_position(kg, sd, sd->P); #ifdef __KERNEL_GPU__ +# if __CUDA_ARCH__ >= 300 + CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id); + float4 r = kernel_tex_image_interp_3d_float4(tex, P.x, P.y, P.z); +# else float4 r = volume_image_texture_3d(id, P.x, P.y, P.z); +# endif #else float4 r; if(sd->flag & SD_VOLUME_CUBIC) diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index d10d3255e1b..42314756f02 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -67,20 +67,29 @@ typedef texture<uchar4, 2, cudaReadModeNormalizedFloat> texture_image_uchar4; /* Macros to handle different memory storage on different devices */ -/* In order to use full 6GB of memory on Titan cards, use arrays instead - * of textures. On earlier cards this seems slower, but on Titan it is - * actually slightly faster in tests. */ +/* On Fermi cards (4xx and 5xx), we use regular textures for both data and images. + * On Kepler (6xx) and above, we use Bindless Textures for images and arrays for data. + * + * Arrays are necessary in order to use the full VRAM on newer cards, and it's slightly faster. + * Using Arrays on Fermi turned out to be slower.*/ + +/* Fermi */ #if __CUDA_ARCH__ < 300 # define __KERNEL_CUDA_TEX_STORAGE__ -#endif - -#ifdef __KERNEL_CUDA_TEX_STORAGE__ # define kernel_tex_fetch(t, index) tex1Dfetch(t, index) + +# define kernel_tex_image_interp(t, x, y) tex2D(t, x, y) +# define kernel_tex_image_interp_3d(t, x, y, z) tex3D(t, x, y, z) + +/* Kepler */ #else # define kernel_tex_fetch(t, index) t[(index)] + +# define kernel_tex_image_interp_float4(t, x, y) tex2D<float4>(t, x, y) +# define kernel_tex_image_interp_float(t, x, y) tex2D<float>(t, x, y) +# define kernel_tex_image_interp_3d_float4(t, x, y, z) tex3D<float4>(t, x, y, z) +# define kernel_tex_image_interp_3d_float(t, x, y, z) tex3D<float>(t, x, y, z) #endif -#define kernel_tex_image_interp(t, x, y) tex2D(t, x, y) -#define kernel_tex_image_interp_3d(t, x, y, z) tex3D(t, x, y, z) #define kernel_data __data diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h index 62b0a6f2923..245d236ff97 100644 --- a/intern/cycles/kernel/kernel_textures.h +++ b/intern/cycles/kernel/kernel_textures.h @@ -72,6 +72,8 @@ KERNEL_TEX(float, texture_float, __lookup_table) /* sobol */ KERNEL_TEX(uint, texture_uint, __sobol_directions) +#ifdef __KERNEL_CUDA__ +# if __CUDA_ARCH__ < 300 /* full-float image */ KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_000) KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_001) @@ -174,66 +176,12 @@ KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_089) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_090) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_091) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_092) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_093) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_094) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_095) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_096) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_097) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_098) - -/* Kepler and above */ -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_099) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_100) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_101) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_102) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_103) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_104) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_105) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_106) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_107) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_108) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_109) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_110) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_111) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_112) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_113) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_114) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_115) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_116) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_117) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_118) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_119) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_120) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_121) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_122) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_123) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_124) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_125) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_126) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_127) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_128) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_129) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_130) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_131) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_132) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_133) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_134) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_135) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_136) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_137) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_138) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_139) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_140) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_141) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_142) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_143) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_144) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_145) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_146) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_147) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_148) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_149) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_150) + +# else +/* bindless textures */ +KERNEL_TEX(uint, texture_uint, __bindless_mapping) +# endif +#endif /* packed image (opencl) */ KERNEL_TEX(uchar4, texture_uchar4, __tex_image_byte4_packed) diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h index faff4ce3e6d..92d2b36bbb1 100644 --- a/intern/cycles/kernel/svm/svm_image.h +++ b/intern/cycles/kernel/svm/svm_image.h @@ -18,11 +18,15 @@ CCL_NAMESPACE_BEGIN /* Float4 textures on various devices. */ #if defined(__KERNEL_CPU__) - #define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_IMAGES_CPU +# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_IMAGES_CPU #elif defined(__KERNEL_CUDA__) - #define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_IMAGES_CUDA +# if __CUDA_ARCH__ < 300 +# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_IMAGES_CUDA +# else +# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_IMAGES_CUDA_KEPLER +# endif #else - #define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_IMAGES_OPENCL +# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_IMAGES_OPENCL #endif #ifdef __KERNEL_OPENCL__ @@ -151,6 +155,7 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, #else float4 r; +# if __CUDA_ARCH__ < 300 /* not particularly proud of this massive switch, what are the * alternatives? * - use a single big 1D texture, and do our own lookup/filtering @@ -254,72 +259,19 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, case 90: r = kernel_tex_image_interp(__tex_image_byte4_090, x, y); break; case 91: r = kernel_tex_image_interp(__tex_image_byte4_091, x, y); break; case 92: r = kernel_tex_image_interp(__tex_image_byte4_092, x, y); break; - -# if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300) - case 93: r = kernel_tex_image_interp(__tex_image_byte4_093, x, y); break; - case 94: r = kernel_tex_image_interp(__tex_image_byte4_094, x, y); break; - case 95: r = kernel_tex_image_interp(__tex_image_byte4_095, x, y); break; - case 96: r = kernel_tex_image_interp(__tex_image_byte4_096, x, y); break; - case 97: r = kernel_tex_image_interp(__tex_image_byte4_097, x, y); break; - case 98: r = kernel_tex_image_interp(__tex_image_byte4_098, x, y); break; - case 99: r = kernel_tex_image_interp(__tex_image_byte4_099, x, y); break; - case 100: r = kernel_tex_image_interp(__tex_image_byte4_100, x, y); break; - case 101: r = kernel_tex_image_interp(__tex_image_byte4_101, x, y); break; - case 102: r = kernel_tex_image_interp(__tex_image_byte4_102, x, y); break; - case 103: r = kernel_tex_image_interp(__tex_image_byte4_103, x, y); break; - case 104: r = kernel_tex_image_interp(__tex_image_byte4_104, x, y); break; - case 105: r = kernel_tex_image_interp(__tex_image_byte4_105, x, y); break; - case 106: r = kernel_tex_image_interp(__tex_image_byte4_106, x, y); break; - case 107: r = kernel_tex_image_interp(__tex_image_byte4_107, x, y); break; - case 108: r = kernel_tex_image_interp(__tex_image_byte4_108, x, y); break; - case 109: r = kernel_tex_image_interp(__tex_image_byte4_109, x, y); break; - case 110: r = kernel_tex_image_interp(__tex_image_byte4_110, x, y); break; - case 111: r = kernel_tex_image_interp(__tex_image_byte4_111, x, y); break; - case 112: r = kernel_tex_image_interp(__tex_image_byte4_112, x, y); break; - case 113: r = kernel_tex_image_interp(__tex_image_byte4_113, x, y); break; - case 114: r = kernel_tex_image_interp(__tex_image_byte4_114, x, y); break; - case 115: r = kernel_tex_image_interp(__tex_image_byte4_115, x, y); break; - case 116: r = kernel_tex_image_interp(__tex_image_byte4_116, x, y); break; - case 117: r = kernel_tex_image_interp(__tex_image_byte4_117, x, y); break; - case 118: r = kernel_tex_image_interp(__tex_image_byte4_118, x, y); break; - case 119: r = kernel_tex_image_interp(__tex_image_byte4_119, x, y); break; - case 120: r = kernel_tex_image_interp(__tex_image_byte4_120, x, y); break; - case 121: r = kernel_tex_image_interp(__tex_image_byte4_121, x, y); break; - case 122: r = kernel_tex_image_interp(__tex_image_byte4_122, x, y); break; - case 123: r = kernel_tex_image_interp(__tex_image_byte4_123, x, y); break; - case 124: r = kernel_tex_image_interp(__tex_image_byte4_124, x, y); break; - case 125: r = kernel_tex_image_interp(__tex_image_byte4_125, x, y); break; - case 126: r = kernel_tex_image_interp(__tex_image_byte4_126, x, y); break; - case 127: r = kernel_tex_image_interp(__tex_image_byte4_127, x, y); break; - case 128: r = kernel_tex_image_interp(__tex_image_byte4_128, x, y); break; - case 129: r = kernel_tex_image_interp(__tex_image_byte4_129, x, y); break; - case 130: r = kernel_tex_image_interp(__tex_image_byte4_130, x, y); break; - case 131: r = kernel_tex_image_interp(__tex_image_byte4_131, x, y); break; - case 132: r = kernel_tex_image_interp(__tex_image_byte4_132, x, y); break; - case 133: r = kernel_tex_image_interp(__tex_image_byte4_133, x, y); break; - case 134: r = kernel_tex_image_interp(__tex_image_byte4_134, x, y); break; - case 135: r = kernel_tex_image_interp(__tex_image_byte4_135, x, y); break; - case 136: r = kernel_tex_image_interp(__tex_image_byte4_136, x, y); break; - case 137: r = kernel_tex_image_interp(__tex_image_byte4_137, x, y); break; - case 138: r = kernel_tex_image_interp(__tex_image_byte4_138, x, y); break; - case 139: r = kernel_tex_image_interp(__tex_image_byte4_139, x, y); break; - case 140: r = kernel_tex_image_interp(__tex_image_byte4_140, x, y); break; - case 141: r = kernel_tex_image_interp(__tex_image_byte4_141, x, y); break; - case 142: r = kernel_tex_image_interp(__tex_image_byte4_142, x, y); break; - case 143: r = kernel_tex_image_interp(__tex_image_byte4_143, x, y); break; - case 144: r = kernel_tex_image_interp(__tex_image_byte4_144, x, y); break; - case 145: r = kernel_tex_image_interp(__tex_image_byte4_145, x, y); break; - case 146: r = kernel_tex_image_interp(__tex_image_byte4_146, x, y); break; - case 147: r = kernel_tex_image_interp(__tex_image_byte4_147, x, y); break; - case 148: r = kernel_tex_image_interp(__tex_image_byte4_148, x, y); break; - case 149: r = kernel_tex_image_interp(__tex_image_byte4_149, x, y); break; - case 150: r = kernel_tex_image_interp(__tex_image_byte4_150, x, y); break; -# endif - default: kernel_assert(0); return make_float4(0.0f, 0.0f, 0.0f, 0.0f); } +# else + CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id); + if(id < 2048) /* TODO(dingto): Make this a variable */ + r = kernel_tex_image_interp_float4(tex, x, y); + else { + float f = kernel_tex_image_interp_float(tex, x, y); + r = make_float4(f, f, f, 1.0); + } +# endif #endif #ifdef __KERNEL_SSE2__ diff --git a/intern/cycles/kernel/svm/svm_voxel.h b/intern/cycles/kernel/svm/svm_voxel.h index 85ba2f906fa..d2cc2c3730e 100644 --- a/intern/cycles/kernel/svm/svm_voxel.h +++ b/intern/cycles/kernel/svm/svm_voxel.h @@ -42,10 +42,21 @@ ccl_device void svm_node_tex_voxel(KernelGlobals *kg, tfm.w = read_node_float(kg, offset); co = transform_point(&tfm, co); } + float4 r; # if defined(__KERNEL_GPU__) - float4 r = volume_image_texture_3d(id, co.x, co.y, co.z); -# else - float4 r = kernel_tex_image_interp_3d(id, co.x, co.y, co.z); +# if __CUDA_ARCH__ >= 300 + CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id); + if(id < 2048) /* TODO(dingto): Make this a variable */ + r = kernel_tex_image_interp_3d_float4(tex, co.x, co.y, co.z); + else { + float f = kernel_tex_image_interp_3d_float(tex, co.x, co.y, co.z); + r = make_float4(f, f, f, 1.0); + } +# else /* __CUDA_ARCH__ >= 300 */ + r = volume_image_texture_3d(id, co.x, co.y, co.z); +# endif +# else /* __KERNEL_GPU__ */ + r = kernel_tex_image_interp_3d(id, co.x, co.y, co.z); # endif #else float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f); |