From 9d236ac06c2b6511365eb53f84bbe366c76acc72 Mon Sep 17 00:00:00 2001 From: Thomas Dinges Date: Thu, 11 Aug 2016 22:47:53 +0200 Subject: Cycles: Enable half float support (4 channels and 1 channel) on CUDA. Atm OpenEXR half files benefit from this and will use only 1/2 of the memory now. More space for HDRs! Part of my GSoC 2016. --- intern/cycles/device/device_cuda.cpp | 9 +++++++-- intern/cycles/kernel/kernel_compat_cuda.h | 1 + intern/cycles/kernel/svm/svm_image.h | 4 +++- intern/cycles/util/util_half.h | 12 ++++++++---- intern/cycles/util/util_texture.h | 4 ++-- 5 files changed, 21 insertions(+), 9 deletions(-) (limited to 'intern') diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index a85f34082db..76e52498b42 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -576,6 +576,7 @@ public: case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break; case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break; case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break; + case TYPE_HALF: format = CU_AD_FORMAT_HALF; break; default: assert(0); return; } @@ -747,8 +748,12 @@ public: } /* Resize once */ - if(flat_slot >= bindless_mapping.size()) - bindless_mapping.resize(4096); /*TODO(dingto): Make this a variable */ + if(flat_slot >= bindless_mapping.size()) { + /* Allocate some slots in advance, to reduce amount + * of re-allocations. + */ + bindless_mapping.resize(flat_slot + 128); + } /* Set Mapping and tag that we need to (re-)upload to device */ bindless_mapping.get_data()[flat_slot] = (uint)tex; diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index 063220b542e..d656fac81e6 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -31,6 +31,7 @@ #endif #include +#include #include /* Qualifier wrappers for different names on different devices */ diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h index f359829374d..9050ce93951 100644 --- a/intern/cycles/kernel/svm/svm_image.h +++ b/intern/cycles/kernel/svm/svm_image.h @@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN /* Float4 textures on various devices. */ #if defined(__KERNEL_CPU__) -# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_CPU +# define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_CPU #elif defined(__KERNEL_CUDA__) # if __CUDA_ARCH__ < 300 # define TEX_NUM_FLOAT4_IMAGES TEX_NUM_FLOAT4_CUDA @@ -277,8 +277,10 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, } # else CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id); + /* float4, byte4 and half4 */ if(id < TEX_START_FLOAT_CUDA_KEPLER) r = kernel_tex_image_interp_float4(tex, x, y); + /* float, byte and half */ else { float f = kernel_tex_image_interp_float(tex, x, y); r = make_float4(f, f, f, 1.0); diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h index ae85ab3a915..5db3384cda4 100644 --- a/intern/cycles/util/util_half.h +++ b/intern/cycles/util/util_half.h @@ -33,17 +33,21 @@ CCL_NAMESPACE_BEGIN #else +/* CUDA has its own half data type, no need to define then */ +#ifndef __KERNEL_CUDA__ typedef unsigned short half; +#endif + struct half4 { half x, y, z, w; }; #ifdef __KERNEL_CUDA__ ccl_device_inline void float4_store_half(half *h, float4 f, float scale) { - h[0] = __float2half_rn(f.x * scale); - h[1] = __float2half_rn(f.y * scale); - h[2] = __float2half_rn(f.z * scale); - h[3] = __float2half_rn(f.w * scale); + h[0] = __float2half(f.x * scale); + h[1] = __float2half(f.y * scale); + h[2] = __float2half(f.z * scale); + h[3] = __float2half(f.w * scale); } #else diff --git a/intern/cycles/util/util_texture.h b/intern/cycles/util/util_texture.h index ec3ee2b8191..be1177d3be9 100644 --- a/intern/cycles/util/util_texture.h +++ b/intern/cycles/util/util_texture.h @@ -52,10 +52,10 @@ CCL_NAMESPACE_BEGIN /* CUDA (Kepler, Geforce 6xx and above) */ #define TEX_NUM_FLOAT4_CUDA_KEPLER 1024 #define TEX_NUM_BYTE4_CUDA_KEPLER 1024 -#define TEX_NUM_HALF4_CUDA_KEPLER 0 +#define TEX_NUM_HALF4_CUDA_KEPLER 1024 #define TEX_NUM_FLOAT_CUDA_KEPLER 1024 #define TEX_NUM_BYTE_CUDA_KEPLER 1024 -#define TEX_NUM_HALF_CUDA_KEPLER 0 +#define TEX_NUM_HALF_CUDA_KEPLER 1024 #define TEX_START_FLOAT4_CUDA_KEPLER 0 #define TEX_START_BYTE4_CUDA_KEPLER TEX_NUM_FLOAT4_CUDA_KEPLER #define TEX_START_HALF4_CUDA_KEPLER (TEX_NUM_FLOAT4_CUDA_KEPLER + TEX_NUM_BYTE4_CUDA_KEPLER) -- cgit v1.2.3