From c08c931fb6f57bdca7865d48ac09a0775590f3ce Mon Sep 17 00:00:00 2001 From: Thomas Dinges Date: Sun, 11 May 2014 03:38:39 +0200 Subject: Cycles / CUDA: Increase maximum image textures on GPU. Instead of 95, we can use 145 images now. This only affects Kepler and above (sm_30, sm_35 and sm_50). This can be increased further if needed, but let's first test if this does not come with a performance impact. Originally developed during my GSoC 2013. --- intern/cycles/device/device.h | 2 ++ intern/cycles/device/device_cuda.cpp | 1 + intern/cycles/device/device_multi.cpp | 2 ++ intern/cycles/kernel/kernel_textures.h | 55 ++++++++++++++++++++++++++++++ intern/cycles/kernel/svm/svm_image.h | 61 ++++++++++++++++++++++++++++++++-- intern/cycles/render/image.cpp | 13 +++++--- intern/cycles/render/image.h | 10 ++++-- intern/cycles/render/scene.cpp | 4 +-- intern/cycles/render/scene.h | 4 +-- 9 files changed, 139 insertions(+), 13 deletions(-) (limited to 'intern/cycles') diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index cbabcb1e20e..bcddd4f73e2 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -54,6 +54,7 @@ public: bool display_device; bool advanced_shading; bool pack_images; + bool extended_images; /* flag for GPU and Multi device */ vector multi_devices; DeviceInfo() @@ -64,6 +65,7 @@ public: display_device = false; advanced_shading = true; pack_images = false; + extended_images = false; } }; diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 9139a75ef3e..68955211146 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -1150,6 +1150,7 @@ void device_cuda_info(vector& devices) int major, minor; cuDeviceComputeCapability(&major, &minor, num); info.advanced_shading = (major >= 2); + info.extended_images = (major >= 3); info.pack_images = false; /* if device has a kernel timeout, assume it is used for display */ diff --git 
a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp index 59bbf3b9d5a..c866ebaaea2 100644 --- a/intern/cycles/device/device_multi.cpp +++ b/intern/cycles/device/device_multi.cpp @@ -328,6 +328,7 @@ static bool device_multi_add(vector& devices, DeviceType type, bool info.advanced_shading = with_advanced_shading; info.pack_images = false; + info.extended_images = true; foreach(DeviceInfo& subinfo, devices) { if(subinfo.type == type) { @@ -351,6 +352,7 @@ static bool device_multi_add(vector& devices, DeviceType type, bool if(subinfo.display_device) info.display_device = true; info.pack_images = info.pack_images || subinfo.pack_images; + info.extended_images = info.extended_images && subinfo.extended_images; num_added++; } } diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h index c8734d67c3b..94115fd388c 100644 --- a/intern/cycles/kernel/kernel_textures.h +++ b/intern/cycles/kernel/kernel_textures.h @@ -174,6 +174,61 @@ KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_097) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_098) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_099) +/* Kepler and above */ +#if __CUDA_ARCH__ >= 300 +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_100) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_101) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_102) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_103) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_104) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_105) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_106) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_107) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_108) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_109) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_110) +KERNEL_IMAGE_TEX(uchar4, 
texture_image_uchar4, __tex_image_111) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_112) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_113) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_114) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_115) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_116) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_117) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_118) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_119) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_120) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_121) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_122) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_123) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_124) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_125) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_126) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_127) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_128) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_129) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_130) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_131) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_132) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_133) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_134) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_135) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_136) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_137) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_138) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_139) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_140) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_141) 
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_142) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_143) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_144) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_145) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_146) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_147) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_148) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_149) +KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_150) +#endif + /* packed image (opencl) */ KERNEL_TEX(uchar4, texture_uchar4, __tex_image_packed) KERNEL_TEX(uint4, texture_uint4, __tex_image_packed_info) diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h index 6627786725f..6c658afb9df 100644 --- a/intern/cycles/kernel/svm/svm_image.h +++ b/intern/cycles/kernel/svm/svm_image.h @@ -149,8 +149,8 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, * - group by size and use a 3d texture, performance impact * - group into larger texture with some padding for correct lerp * - * also note that cuda has 128 textures limit, we use 100 now, since - * we still need some for other storage */ + * also note that cuda has a textures limit (128 for Fermi, 256 for Kepler), + * and we cannot use all since we still need some for other storage */ switch(id) { case 0: r = kernel_tex_image_interp(__tex_image_float_000, x, y); break; @@ -253,7 +253,62 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, case 97: r = kernel_tex_image_interp(__tex_image_097, x, y); break; case 98: r = kernel_tex_image_interp(__tex_image_098, x, y); break; case 99: r = kernel_tex_image_interp(__tex_image_099, x, y); break; - default: + +#if __CUDA_ARCH__ >= 300 + case 100: r = kernel_tex_image_interp(__tex_image_100, x, y); break; + case 101: r = kernel_tex_image_interp(__tex_image_101, 
x, y); break; + case 102: r = kernel_tex_image_interp(__tex_image_102, x, y); break; + case 103: r = kernel_tex_image_interp(__tex_image_103, x, y); break; + case 104: r = kernel_tex_image_interp(__tex_image_104, x, y); break; + case 105: r = kernel_tex_image_interp(__tex_image_105, x, y); break; + case 106: r = kernel_tex_image_interp(__tex_image_106, x, y); break; + case 107: r = kernel_tex_image_interp(__tex_image_107, x, y); break; + case 108: r = kernel_tex_image_interp(__tex_image_108, x, y); break; + case 109: r = kernel_tex_image_interp(__tex_image_109, x, y); break; + case 110: r = kernel_tex_image_interp(__tex_image_110, x, y); break; + case 111: r = kernel_tex_image_interp(__tex_image_111, x, y); break; + case 112: r = kernel_tex_image_interp(__tex_image_112, x, y); break; + case 113: r = kernel_tex_image_interp(__tex_image_113, x, y); break; + case 114: r = kernel_tex_image_interp(__tex_image_114, x, y); break; + case 115: r = kernel_tex_image_interp(__tex_image_115, x, y); break; + case 116: r = kernel_tex_image_interp(__tex_image_116, x, y); break; + case 117: r = kernel_tex_image_interp(__tex_image_117, x, y); break; + case 118: r = kernel_tex_image_interp(__tex_image_118, x, y); break; + case 119: r = kernel_tex_image_interp(__tex_image_119, x, y); break; + case 120: r = kernel_tex_image_interp(__tex_image_120, x, y); break; + case 121: r = kernel_tex_image_interp(__tex_image_121, x, y); break; + case 122: r = kernel_tex_image_interp(__tex_image_122, x, y); break; + case 123: r = kernel_tex_image_interp(__tex_image_123, x, y); break; + case 124: r = kernel_tex_image_interp(__tex_image_124, x, y); break; + case 125: r = kernel_tex_image_interp(__tex_image_125, x, y); break; + case 126: r = kernel_tex_image_interp(__tex_image_126, x, y); break; + case 127: r = kernel_tex_image_interp(__tex_image_127, x, y); break; + case 128: r = kernel_tex_image_interp(__tex_image_128, x, y); break; + case 129: r = kernel_tex_image_interp(__tex_image_129, x, y); 
break; + case 130: r = kernel_tex_image_interp(__tex_image_130, x, y); break; + case 131: r = kernel_tex_image_interp(__tex_image_131, x, y); break; + case 132: r = kernel_tex_image_interp(__tex_image_132, x, y); break; + case 133: r = kernel_tex_image_interp(__tex_image_133, x, y); break; + case 134: r = kernel_tex_image_interp(__tex_image_134, x, y); break; + case 135: r = kernel_tex_image_interp(__tex_image_135, x, y); break; + case 136: r = kernel_tex_image_interp(__tex_image_136, x, y); break; + case 137: r = kernel_tex_image_interp(__tex_image_137, x, y); break; + case 138: r = kernel_tex_image_interp(__tex_image_138, x, y); break; + case 139: r = kernel_tex_image_interp(__tex_image_139, x, y); break; + case 140: r = kernel_tex_image_interp(__tex_image_140, x, y); break; + case 141: r = kernel_tex_image_interp(__tex_image_141, x, y); break; + case 142: r = kernel_tex_image_interp(__tex_image_142, x, y); break; + case 143: r = kernel_tex_image_interp(__tex_image_143, x, y); break; + case 144: r = kernel_tex_image_interp(__tex_image_144, x, y); break; + case 145: r = kernel_tex_image_interp(__tex_image_145, x, y); break; + case 146: r = kernel_tex_image_interp(__tex_image_146, x, y); break; + case 147: r = kernel_tex_image_interp(__tex_image_147, x, y); break; + case 148: r = kernel_tex_image_interp(__tex_image_148, x, y); break; + case 149: r = kernel_tex_image_interp(__tex_image_149, x, y); break; + case 150: r = kernel_tex_image_interp(__tex_image_150, x, y); break; +#endif + + default: kernel_assert(0); return make_float4(0.0f, 0.0f, 0.0f, 0.0f); } diff --git a/intern/cycles/render/image.cpp b/intern/cycles/render/image.cpp index 8dde642f70b..86755badc42 100644 --- a/intern/cycles/render/image.cpp +++ b/intern/cycles/render/image.cpp @@ -59,11 +59,16 @@ void ImageManager::set_osl_texture_system(void *texture_system) osl_texture_system = texture_system; } -void ImageManager::set_extended_image_limits(void) +void ImageManager::set_extended_image_limits(const 
DeviceInfo& info) { - tex_num_images = TEX_EXTENDED_NUM_IMAGES; - tex_num_float_images = TEX_EXTENDED_NUM_FLOAT_IMAGES; - tex_image_byte_start = TEX_EXTENDED_IMAGE_BYTE_START; + if(info.type == DEVICE_CPU) { + tex_num_images = TEX_EXTENDED_NUM_IMAGES_CPU; + tex_num_float_images = TEX_EXTENDED_NUM_FLOAT_IMAGES; + tex_image_byte_start = TEX_EXTENDED_IMAGE_BYTE_START; + } + else if((info.type == DEVICE_CUDA || info.type == DEVICE_MULTI) && info.extended_images) { + tex_num_images = TEX_EXTENDED_NUM_IMAGES_GPU; + } } bool ImageManager::set_animation_frame_update(int frame) diff --git a/intern/cycles/render/image.h b/intern/cycles/render/image.h index a862ffce5c3..561550fe0d2 100644 --- a/intern/cycles/render/image.h +++ b/intern/cycles/render/image.h @@ -17,6 +17,7 @@ #ifndef __IMAGE_H__ #define __IMAGE_H__ +#include "device.h" #include "device_memory.h" #include "util_string.h" @@ -27,11 +28,16 @@ CCL_NAMESPACE_BEGIN +/* generic */ #define TEX_NUM_IMAGES 95 #define TEX_IMAGE_BYTE_START TEX_NUM_FLOAT_IMAGES +/* extended gpu */ +#define TEX_EXTENDED_NUM_IMAGES_GPU 145 + +/* extended cpu */ #define TEX_EXTENDED_NUM_FLOAT_IMAGES 1024 -#define TEX_EXTENDED_NUM_IMAGES 1024 +#define TEX_EXTENDED_NUM_IMAGES_CPU 1024 #define TEX_EXTENDED_IMAGE_BYTE_START TEX_EXTENDED_NUM_FLOAT_IMAGES /* color to use when textures are not found */ @@ -59,7 +65,7 @@ public: void set_osl_texture_system(void *texture_system); void set_pack_images(bool pack_images_); - void set_extended_image_limits(void); + void set_extended_image_limits(const DeviceInfo& info); bool set_animation_frame_update(int frame); bool need_update; diff --git a/intern/cycles/render/scene.cpp b/intern/cycles/render/scene.cpp index 4a42b2c9e35..686a1bc406e 100644 --- a/intern/cycles/render/scene.cpp +++ b/intern/cycles/render/scene.cpp @@ -63,8 +63,8 @@ Scene::Scene(const SceneParams& params_, const DeviceInfo& device_info_) else shader_manager = ShaderManager::create(this, SceneParams::SVM); - if (device_info_.type == 
DEVICE_CPU) - image_manager->set_extended_image_limits(); + /* Extended image limits for CPU and GPUs */ + image_manager->set_extended_image_limits(device_info_); } Scene::~Scene() diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h index c913b4c59da..0f0bb725823 100644 --- a/intern/cycles/render/scene.h +++ b/intern/cycles/render/scene.h @@ -105,8 +105,8 @@ public: /* integrator */ device_vector sobol_directions; - /* images */ - device_vector tex_image[TEX_EXTENDED_NUM_IMAGES]; + /* cpu images */ + device_vector tex_image[TEX_EXTENDED_NUM_IMAGES_CPU]; device_vector tex_float_image[TEX_EXTENDED_NUM_FLOAT_IMAGES]; /* opencl images */ -- cgit v1.2.3