Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--intern/cycles/device/device_cuda.cpp165
-rw-r--r--intern/cycles/kernel/kernel_bvh.h19
-rw-r--r--intern/cycles/kernel/kernel_compat_cuda.h11
-rw-r--r--intern/cycles/kernel/kernel_globals.h4
-rw-r--r--intern/cycles/kernel/kernel_primitive.h8
5 files changed, 140 insertions, 67 deletions
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index b5eaa69bf0e..5440bd91987 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -43,7 +43,9 @@ public:
CUmodule cuModule;
map<device_ptr, bool> tex_interp_map;
int cuDevId;
+ int cuDevArchitecture;
bool first_error;
+ bool use_texture_storage;
struct PixelMem {
GLuint cuPBO;
@@ -173,6 +175,7 @@ public:
{
first_error = true;
background = background_;
+ use_texture_storage = true;
cuDevId = info.num;
cuDevice = 0;
@@ -203,6 +206,15 @@ public:
if(cuda_error_(result, "cuCtxCreate"))
return;
+ int major, minor;
+ cuDeviceComputeCapability(&major, &minor, cuDevId);
+ cuDevArchitecture = major*100 + minor*10;
+
+ /* In order to use full 6GB of memory on Titan cards, use arrays instead
+ * of textures. On earlier cards this seems slower, but on Titan it is
+ * actually slightly faster in tests. */
+ use_texture_storage = (cuDevArchitecture < 350);
+
cuda_pop_context();
}
@@ -210,8 +222,7 @@ public:
{
task_pool.stop();
- cuda_push_context();
- cuda_assert(cuCtxDetach(cuContext))
+ cuda_assert(cuCtxDestroy(cuContext))
}
bool support_device(bool experimental)
@@ -448,90 +459,118 @@ public:
CUarray_format_enum format;
size_t dsize = datatype_size(mem.data_type);
size_t size = mem.memory_size();
+ bool use_texture = interpolation || use_texture_storage;
- switch(mem.data_type) {
- case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
- case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
- case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
- case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
- default: assert(0); return;
- }
-
- CUtexref texref = NULL;
-
- cuda_push_context();
- cuda_assert(cuModuleGetTexRef(&texref, cuModule, name))
-
- if(!texref) {
- cuda_pop_context();
- return;
- }
+ if(use_texture) {
- if(interpolation) {
- CUarray handle = NULL;
- CUDA_ARRAY_DESCRIPTOR desc;
+ switch(mem.data_type) {
+ case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
+ case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
+ case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
+ case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
+ default: assert(0); return;
+ }
- desc.Width = mem.data_width;
- desc.Height = mem.data_height;
- desc.Format = format;
- desc.NumChannels = mem.data_elements;
+ CUtexref texref = NULL;
- cuda_assert(cuArrayCreate(&handle, &desc))
+ cuda_push_context();
+ cuda_assert(cuModuleGetTexRef(&texref, cuModule, name))
- if(!handle) {
+ if(!texref) {
cuda_pop_context();
return;
}
- if(mem.data_height > 1) {
- CUDA_MEMCPY2D param;
- memset(&param, 0, sizeof(param));
- param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
- param.dstArray = handle;
- param.srcMemoryType = CU_MEMORYTYPE_HOST;
- param.srcHost = (void*)mem.data_pointer;
- param.srcPitch = mem.data_width*dsize*mem.data_elements;
- param.WidthInBytes = param.srcPitch;
- param.Height = mem.data_height;
-
- cuda_assert(cuMemcpy2D(&param))
+ if(interpolation) {
+ CUarray handle = NULL;
+ CUDA_ARRAY_DESCRIPTOR desc;
+
+ desc.Width = mem.data_width;
+ desc.Height = mem.data_height;
+ desc.Format = format;
+ desc.NumChannels = mem.data_elements;
+
+ cuda_assert(cuArrayCreate(&handle, &desc))
+
+ if(!handle) {
+ cuda_pop_context();
+ return;
+ }
+
+ if(mem.data_height > 1) {
+ CUDA_MEMCPY2D param;
+ memset(&param, 0, sizeof(param));
+ param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+ param.dstArray = handle;
+ param.srcMemoryType = CU_MEMORYTYPE_HOST;
+ param.srcHost = (void*)mem.data_pointer;
+ param.srcPitch = mem.data_width*dsize*mem.data_elements;
+ param.WidthInBytes = param.srcPitch;
+ param.Height = mem.data_height;
+
+ cuda_assert(cuMemcpy2D(&param))
+ }
+ else
+ cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size))
+
+ cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT))
+
+ cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR))
+ cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES))
+
+ mem.device_pointer = (device_ptr)handle;
+
+ stats.mem_alloc(size);
}
- else
- cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size))
+ else {
+ cuda_pop_context();
+
+ mem_alloc(mem, MEM_READ_ONLY);
+ mem_copy_to(mem);
- cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT))
+ cuda_push_context();
- cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR))
- cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES))
+ cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size))
+ cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT))
+ cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER))
+ }
- mem.device_pointer = (device_ptr)handle;
+ if(periodic) {
+ cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_WRAP))
+ cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_WRAP))
+ }
+ else {
+ cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP))
+ cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP))
+ }
+ cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements))
- stats.mem_alloc(size);
+ cuda_pop_context();
}
else {
- cuda_pop_context();
-
mem_alloc(mem, MEM_READ_ONLY);
mem_copy_to(mem);
cuda_push_context();
- cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size))
- cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT))
- cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER))
- }
+ CUdeviceptr cumem;
+ size_t cubytes;
- if(periodic) {
- cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_WRAP))
- cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_WRAP))
- }
- else {
- cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP))
- cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP))
- }
- cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements))
+ cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, name))
- cuda_pop_context();
+ if(cubytes == 8) {
+ /* 64 bit device pointer */
+ uint64_t ptr = mem.device_pointer;
+ cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes))
+ }
+ else {
+ /* 32 bit device pointer */
+ uint32_t ptr = (uint32_t)mem.device_pointer;
+ cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes))
+ }
+
+ cuda_pop_context();
+ }
tex_interp_map[mem.device_pointer] = interpolation;
}
diff --git a/intern/cycles/kernel/kernel_bvh.h b/intern/cycles/kernel/kernel_bvh.h
index 4b01f2eebcd..44a9822c103 100644
--- a/intern/cycles/kernel/kernel_bvh.h
+++ b/intern/cycles/kernel/kernel_bvh.h
@@ -809,11 +809,16 @@ __device_inline void bvh_triangle_intersect_subsurface(KernelGlobals *kg, Inters
#include "kernel_bvh_subsurface.h"
#endif
-
+/* to work around titan bug when using arrays instead of textures */
+#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
+__device_inline
+#else
+__device_noinline
+#endif
#ifdef __HAIR__
-__device_inline bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect, uint *lcg_state, float difl, float extmax)
+bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect, uint *lcg_state, float difl, float extmax)
#else
-__device_inline bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect)
+bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect)
#endif
{
#ifdef __OBJECT_MOTION__
@@ -851,8 +856,14 @@ __device_inline bool scene_intersect(KernelGlobals *kg, const Ray *ray, const ui
#endif /* __KERNEL_CPU__ */
}
+/* to work around titan bug when using arrays instead of textures */
#ifdef __SUBSURFACE__
-__device_inline uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection *isect, int subsurface_object, uint *lcg_state, int max_hits)
+#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
+__device_inline
+#else
+__device_noinline
+#endif
+uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection *isect, int subsurface_object, uint *lcg_state, int max_hits)
{
#ifdef __OBJECT_MOTION__
if(kernel_data.bvh.have_motion) {
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index cb86ce8c4ae..44c2b9effe9 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -57,7 +57,18 @@ typedef texture<uchar4, 2, cudaReadModeNormalizedFloat> texture_image_uchar4;
/* Macros to handle different memory storage on different devices */
+/* In order to use full 6GB of memory on Titan cards, use arrays instead
+ * of textures. On earlier cards this seems slower, but on Titan it is
+ * actually slightly faster in tests. */
+#if __CUDA_ARCH__ < 350
+#define __KERNEL_CUDA_TEX_STORAGE__
+#endif
+
+#ifdef __KERNEL_CUDA_TEX_STORAGE__
#define kernel_tex_fetch(t, index) tex1Dfetch(t, index)
+#else
+#define kernel_tex_fetch(t, index) t[(index)]
+#endif
#define kernel_tex_image_interp(t, x, y) tex2D(t, x, y)
#define kernel_data __data
diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h
index ab0a717b592..b5e691eb615 100644
--- a/intern/cycles/kernel/kernel_globals.h
+++ b/intern/cycles/kernel/kernel_globals.h
@@ -66,7 +66,11 @@ typedef struct KernelGlobals {
__constant__ KernelData __data;
typedef struct KernelGlobals {} KernelGlobals;
+#ifdef __KERNEL_CUDA_TEX_STORAGE__
#define KERNEL_TEX(type, ttype, name) ttype name;
+#else
+#define KERNEL_TEX(type, ttype, name) const __constant__ __device__ type *name;
+#endif
#define KERNEL_IMAGE_TEX(type, ttype, name) ttype name;
#include "kernel_textures.h"
diff --git a/intern/cycles/kernel/kernel_primitive.h b/intern/cycles/kernel/kernel_primitive.h
index 4a06dff84bf..636cfd06532 100644
--- a/intern/cycles/kernel/kernel_primitive.h
+++ b/intern/cycles/kernel/kernel_primitive.h
@@ -93,7 +93,11 @@ __device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
{
#ifdef __HAIR__
if(sd->segment != ~0)
+#ifdef __DPDU__
return normalize(sd->dPdu);
+#else
+ return make_float3(0.0f, 0.0f, 0.0f);
+#endif
#endif
/* try to create spherical tangent from generated coordinates */
@@ -108,7 +112,11 @@ __device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
}
else {
/* otherwise use surface derivatives */
+#ifdef __DPDU__
return normalize(sd->dPdu);
+#else
+ return make_float3(0.0f, 0.0f, 0.0f);
+#endif
}
}