Cycles: fixes to make CUDA 4.2 work. Compiling gave errors in shadows and other places; this was mainly due to instancing not working, but issues were also found in procedural textures.

The problem was with --use_fast_math, which now seems to have much lower precision for some operations. This flag is now disabled and fast math functions are used selectively instead. No performance regression was found on a GTX 460 after doing this.
author: Brecht Van Lommel <brechtvanlommel@pandora.be> 2012-05-28 23:21:13 +0400
committer: Brecht Van Lommel <brechtvanlommel@pandora.be> 2012-05-28 23:21:13 +0400
commit: 131de4352b9e8f3e4ce77d2d9dc145f5db461aed (patch)
tree: c796dc4d9697a1122383dea92aab31493e9c0e1a /intern
parent: c2be2fd4083845a381c2462a65dc0856dc6efb8e (diff)
8 files changed, 34 insertions, 22 deletions
diff --git a/intern/cycles/blender/blender_mesh.cpp b/intern/cycles/blender/blender_mesh.cpp
index d7003729c46..ad91022de07 100644
--- a/intern/cycles/blender/blender_mesh.cpp
+++ b/intern/cycles/blender/blender_mesh.cpp
@@ -304,7 +304,6 @@ Mesh *BlenderSync::sync_mesh(BL::Object b_ob, bool object_updated)
 void BlenderSync::sync_mesh_motion(BL::Object b_ob, Mesh *mesh, int motion)
 {
 	/* todo: displacement, subdivision */
-	BL::ID b_ob_data = b_ob.data();
 	size_t size = mesh->verts.size();
 
 	/* skip objects without deforming modifiers. this is not a totally reliable,
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index c0b6e210bb1..488fea8d12b 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -135,7 +135,6 @@ void BlenderSync::sync_data(BL::SpaceView3D b_v3d, BL::Object b_override, const
 
 void BlenderSync::sync_integrator()
 {
-	BL::RenderSettings r = b_scene.render();
 	PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
 
 	experimental = (RNA_enum_get(&cscene, "feature_set") != 0);
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 23ad5e5ff92..491a63a7cf2 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -259,7 +259,7 @@ public:
 
 		path_create_directories(cubin);
 
-		string command = string_printf("\"%s\" -arch=sm_%d%d -m%d --cubin \"%s\" --use_fast_math "
+		string command = string_printf("\"%s\" -arch=sm_%d%d -m%d --cubin \"%s\" "
 			"-o \"%s\" --ptxas-options=\"-v\" --maxrregcount=%d --opencc-options -OPT:Olimit=0 -I\"%s\" -DNVCC",
 			nvcc.c_str(), major, minor, machine, kernel.c_str(), cubin.c_str(), maxreg, include.c_str());
 
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index d165716aaca..98cb16d5dfc 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -114,7 +114,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
 
 		add_custom_command(
 			OUTPUT ${cuda_cubin}
-			COMMAND ${CUDA_NVCC_EXECUTABLE} -arch=${arch} -m${CUDA_BITS} --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu --use_fast_math -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} --ptxas-options="-v" --maxrregcount=24 --opencc-options -OPT:Olimit=0 -I${CMAKE_CURRENT_SOURCE_DIR}/../util -I${CMAKE_CURRENT_SOURCE_DIR}/svm -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC
+			COMMAND ${CUDA_NVCC_EXECUTABLE} -arch=${arch} -m${CUDA_BITS} --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} --ptxas-options="-v" --maxrregcount=24 --opencc-options -OPT:Olimit=0 -I${CMAKE_CURRENT_SOURCE_DIR}/../util -I${CMAKE_CURRENT_SOURCE_DIR}/svm -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC
 			DEPENDS ${cuda_sources})
 
 		delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)
diff --git a/intern/cycles/kernel/kernel_bvh.h b/intern/cycles/kernel/kernel_bvh.h
index 5da4253bd86..522f9861c35 100644
--- a/intern/cycles/kernel/kernel_bvh.h
+++ b/intern/cycles/kernel/kernel_bvh.h
@@ -74,10 +74,10 @@ __device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray
 
 __device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *idir, float *t, const float tmax)
 {
-	Transform tfm = object_fetch_transform(kg, object, ray->time, OBJECT_TRANSFORM);
-
-	if(*t != FLT_MAX)
+	if(*t != FLT_MAX) {
+		Transform tfm = object_fetch_transform(kg, object, ray->time, OBJECT_TRANSFORM);
 		*t *= len(transform_direction(&tfm, 1.0f/(*idir)));
+	}
 
 	*P = ray->P;
 	*idir = bvh_inverse_direction(ray->D);
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index 2f9f2c45e88..06bdce6c35c 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -62,5 +62,15 @@ typedef texture<uchar4, 2, cudaReadModeNormalizedFloat> texture_image_uchar4;
 
 #define kernel_data __data
 
+/* Use fast math functions: map selected libm calls to CUDA intrinsics. Each
+ * argument is parenthesized so the float cast covers the whole expression. */
+
+#define cosf(x) __cosf(((float)(x)))
+#define sinf(x) __sinf(((float)(x)))
+#define powf(x, y) __powf(((float)(x)), ((float)(y)))
+#define tanf(x) __tanf(((float)(x)))
+#define logf(x) __logf(((float)(x)))
+#define expf(x) __expf(((float)(x)))
+
 #endif /* __KERNEL_COMPAT_CUDA_H__ */
 
diff --git a/intern/cycles/kernel/kernel_projection.h b/intern/cycles/kernel/kernel_projection.h
index a5735920cd9..72d36811626 100644
--- a/intern/cycles/kernel/kernel_projection.h
+++ b/intern/cycles/kernel/kernel_projection.h
@@ -69,20 +69,20 @@ __device float3 equirectangular_to_direction(float u, float v)
 	float theta = M_PI_F*(1.0f - v);
 
 	return make_float3(
-		sin(theta)*cos(phi),
-		sin(theta)*sin(phi),
-		cos(theta));
+		sinf(theta)*cosf(phi),
+		sinf(theta)*sinf(phi),
+		cosf(theta));
 }
 
 /* Fisheye <-> Cartesian direction */
 
 __device float2 direction_to_fisheye(float3 dir, float fov)
 {
-	float r = atan2f(sqrt(dir.y*dir.y +  dir.z*dir.z), dir.x) / fov;
-	float phi = atan2(dir.z, dir.y);
+	float r = atan2f(sqrtf(dir.y*dir.y +  dir.z*dir.z), dir.x) / fov;
+	float phi = atan2f(dir.z, dir.y);
 
-	float u = r * cos(phi) + 0.5f;
-	float v = r * sin(phi) + 0.5f;
+	float u = r * cosf(phi) + 0.5f;
+	float v = r * sinf(phi) + 0.5f;
 
 	return make_float2(u, v);
 }
@@ -92,7 +92,7 @@ __device float3 fisheye_to_direction(float u, float v, float fov)
 	u = (u - 0.5f) * 2.0f;
 	v = (v - 0.5f) * 2.0f;
 
-	float r = sqrt(u*u + v*v);
+	float r = sqrtf(u*u + v*v);
 
 	if(r > 1.0f)
 		return make_float3(0.0f, 0.0f, 0.0f);
@@ -127,7 +127,7 @@ __device float3 fisheye_equisolid_to_direction(float u, float v, float lens, flo
 	v = (v - 0.5f) * height;
 
 	float rmax = 2.0f * lens * sinf(fov * 0.25f);
-	float r = sqrt(u*u + v*v);
+	float r = sqrtf(u*u + v*v);
 
 	if(r > rmax)
 		return make_float3(0.0f, 0.0f, 0.0f);
@@ -153,7 +153,7 @@ __device float3 mirrorball_to_direction(float u, float v)
 
 	dir.x = 2.0f*u - 1.0f;
 	dir.z = 2.0f*v - 1.0f;
-	dir.y = -sqrt(max(1.0f - dir.x*dir.x - dir.z*dir.z, 0.0f));
+	dir.y = -sqrtf(max(1.0f - dir.x*dir.x - dir.z*dir.z, 0.0f));
 
 	/* reflection */
 	float3 I = make_float3(0.0f, -1.0f, 0.0f);
@@ -166,7 +166,7 @@ __device float2 direction_to_mirrorball(float3 dir)
 	/* inverse of mirrorball_to_direction */
 	dir.y -= 1.0f;
 
-	float div = 2.0f*sqrt(max(-0.5f*dir.y, 0.0f));
+	float div = 2.0f*sqrtf(max(-0.5f*dir.y, 0.0f));
 	if(div > 0.0f)
 		dir /= div;
 
diff --git a/intern/cycles/util/util_transform.h b/intern/cycles/util/util_transform.h
index e4897ee6787..b460c4c87a2 100644
--- a/intern/cycles/util/util_transform.h
+++ b/intern/cycles/util/util_transform.h
@@ -61,16 +61,20 @@ __device_inline float3 transform_perspective(const Transform *t, const float3 a)
 
 __device_inline float3 transform_point(const Transform *t, const float3 a)
 {
-	float4 b = make_float4(a.x, a.y, a.z, 1.0f);
-	float3 c = make_float3(dot(t->x, b), dot(t->y, b), dot(t->z, b));
+	float3 c = make_float3(
+		a.x*t->x.x + a.y*t->x.y + a.z*t->x.z + t->x.w,
+		a.x*t->y.x + a.y*t->y.y + a.z*t->y.z + t->y.w,
+		a.x*t->z.x + a.y*t->z.y + a.z*t->z.z + t->z.w);
 
 	return c;
 }
 
 __device_inline float3 transform_direction(const Transform *t, const float3 a)
 {
-	float4 b = make_float4(a.x, a.y, a.z, 0.0f);
-	float3 c = make_float3(dot(t->x, b), dot(t->y, b), dot(t->z, b));
+	float3 c = make_float3(
+		a.x*t->x.x + a.y*t->x.y + a.z*t->x.z,
+		a.x*t->y.x + a.y*t->y.y + a.z*t->y.z,
+		a.x*t->z.x + a.y*t->z.y + a.z*t->z.z);
 
 	return c;
 }
author	Brecht Van Lommel <brechtvanlommel@pandora.be>	2012-05-28 23:21:13 +0400
committer	Brecht Van Lommel <brechtvanlommel@pandora.be>	2012-05-28 23:21:13 +0400
commit	131de4352b9e8f3e4ce77d2d9dc145f5db461aed (patch)
tree	c796dc4d9697a1122383dea92aab31493e9c0e1a /intern
parent	c2be2fd4083845a381c2462a65dc0856dc6efb8e (diff)