Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/intern
diff options
context:
space:
mode:
authorSv. Lockal <lockalsash@gmail.com>2014-02-27 14:49:21 +0400
committerSv. Lockal <lockalsash@gmail.com>2014-02-27 15:01:20 +0400
commit7808360c5f35c5a6371c3627758a70db431955a2 (patch)
tree0a3cb25e749917dd0021f308a5627442888b1ecb /intern
parent8badec14f97df80b328a6d8106c6cdbcafb171c0 (diff)
Cycles: fix crash in SSE hair and half-floats on x86+vc2008
MSVC 2008 ignores the alignment attribute when assigning from an unaligned float4 vector returned from another function. Cycles now uses unaligned loads instead of casts on win32 in x86 mode.
Diffstat (limited to 'intern')
-rw-r--r--intern/cycles/kernel/kernel_bvh.h11
-rw-r--r--intern/cycles/kernel/kernel_bvh_subsurface.h4
-rw-r--r--intern/cycles/kernel/kernel_bvh_traversal.h4
-rw-r--r--intern/cycles/kernel/kernel_compat_cpu.h2
-rw-r--r--intern/cycles/kernel/kernel_film.h4
-rw-r--r--intern/cycles/util/util_half.h18
-rw-r--r--intern/cycles/util/util_simd.h24
-rw-r--r--intern/cycles/util/util_types.h2
8 files changed, 48 insertions, 21 deletions
diff --git a/intern/cycles/kernel/kernel_bvh.h b/intern/cycles/kernel/kernel_bvh.h
index 93e546eaece..17791f4f35a 100644
--- a/intern/cycles/kernel/kernel_bvh.h
+++ b/intern/cycles/kernel/kernel_bvh.h
@@ -235,7 +235,7 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
int prim = kernel_tex_fetch(__prim_index, curveAddr);
#ifdef __KERNEL_SSE2__
- __m128 vdir = _mm_div_ps(_mm_set1_ps(1.0f), (__m128 &)idir);
+ __m128 vdir = _mm_div_ps(_mm_set1_ps(1.0f), load_m128(idir));
__m128 vcurve_coef[4];
const float3 *curve_coef = (float3 *)vcurve_coef;
@@ -268,10 +268,11 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
__m128 htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0);
__m128 htfm[] = { htfm0, htfm1, htfm2 };
- __m128 p0 = transform_point_T3(htfm, _mm_sub_ps(P0, (__m128 &)P));
- __m128 p1 = transform_point_T3(htfm, _mm_sub_ps(P1, (__m128 &)P));
- __m128 p2 = transform_point_T3(htfm, _mm_sub_ps(P2, (__m128 &)P));
- __m128 p3 = transform_point_T3(htfm, _mm_sub_ps(P3, (__m128 &)P));
+ __m128 vP = load_m128(P);
+ __m128 p0 = transform_point_T3(htfm, _mm_sub_ps(P0, vP));
+ __m128 p1 = transform_point_T3(htfm, _mm_sub_ps(P1, vP));
+ __m128 p2 = transform_point_T3(htfm, _mm_sub_ps(P2, vP));
+ __m128 p3 = transform_point_T3(htfm, _mm_sub_ps(P3, vP));
float fc = 0.71f;
__m128 vfc = _mm_set1_ps(fc);
diff --git a/intern/cycles/kernel/kernel_bvh_subsurface.h b/intern/cycles/kernel/kernel_bvh_subsurface.h
index 6b691f3364b..40683a2da57 100644
--- a/intern/cycles/kernel/kernel_bvh_subsurface.h
+++ b/intern/cycles/kernel/kernel_bvh_subsurface.h
@@ -49,8 +49,8 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
/* ray parameters in registers */
const float tmax = ray->t;
- ccl_align(16) float3 P = ray->P;
- ccl_align(16) float3 idir = bvh_inverse_direction(ray->D);
+ float3 P = ray->P;
+ float3 idir = bvh_inverse_direction(ray->D);
int object = ~0;
float isect_t = tmax;
diff --git a/intern/cycles/kernel/kernel_bvh_traversal.h b/intern/cycles/kernel/kernel_bvh_traversal.h
index bfd72b0aa16..0515a9e0fa7 100644
--- a/intern/cycles/kernel/kernel_bvh_traversal.h
+++ b/intern/cycles/kernel/kernel_bvh_traversal.h
@@ -55,8 +55,8 @@ ccl_device bool BVH_FUNCTION_NAME
/* ray parameters in registers */
const float tmax = ray->t;
- ccl_align(16) float3 P = ray->P;
- ccl_align(16) float3 idir = bvh_inverse_direction(ray->D);
+ float3 P = ray->P;
+ float3 idir = bvh_inverse_direction(ray->D);
int object = ~0;
#if FEATURE(BVH_MOTION)
diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h
index b213e91274d..a9c66ec2d68 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/kernel_compat_cpu.h
@@ -20,9 +20,9 @@
#define __KERNEL_CPU__
#include "util_debug.h"
-#include "util_half.h"
#include "util_math.h"
#include "util_simd.h"
+#include "util_half.h"
#include "util_types.h"
CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/kernel_film.h b/intern/cycles/kernel/kernel_film.h
index cbd875e994c..dc5f6e7ce38 100644
--- a/intern/cycles/kernel/kernel_film.h
+++ b/intern/cycles/kernel/kernel_film.h
@@ -75,7 +75,7 @@ ccl_device void kernel_film_convert_to_half_float(KernelGlobals *kg,
float exposure = kernel_data.film.exposure;
- ccl_align(16) float4 rgba_in = *in;
+ float4 rgba_in = *in;
if(exposure != 1.0f) {
rgba_in.x *= exposure;
@@ -83,7 +83,7 @@ ccl_device void kernel_film_convert_to_half_float(KernelGlobals *kg,
rgba_in.z *= exposure;
}
- float4_store_half(out, &rgba_in, sample_scale);
+ float4_store_half(out, rgba_in, sample_scale);
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h
index 21192024f7f..88709955b32 100644
--- a/intern/cycles/util/util_half.h
+++ b/intern/cycles/util/util_half.h
@@ -25,7 +25,7 @@ CCL_NAMESPACE_BEGIN
#ifdef __KERNEL_OPENCL__
-#define float4_store_half(h, f, scale) vstore_half4(*(f) * (scale), 0, h);
+#define float4_store_half(h, f, scale) vstore_half4(f * (scale), 0, h);
#else
@@ -34,24 +34,24 @@ struct half4 { half x, y, z, w; };
#ifdef __KERNEL_CUDA__
-ccl_device_inline void float4_store_half(half *h, const float4 *f, float scale)
+ccl_device_inline void float4_store_half(half *h, float4 f, float scale)
{
- h[0] = __float2half_rn(f->x * scale);
- h[1] = __float2half_rn(f->y * scale);
- h[2] = __float2half_rn(f->z * scale);
- h[3] = __float2half_rn(f->w * scale);
+ h[0] = __float2half_rn(f.x * scale);
+ h[1] = __float2half_rn(f.y * scale);
+ h[2] = __float2half_rn(f.z * scale);
+ h[3] = __float2half_rn(f.w * scale);
}
#else
-ccl_device_inline void float4_store_half(half *h, const float4 *f, float scale)
+ccl_device_inline void float4_store_half(half *h, float4 f, float scale)
{
#ifndef __KERNEL_SSE2__
for(int i = 0; i < 4; i++) {
/* optimized float to half for pixels:
* assumes no negative, no nan, no inf, and sets denormal to 0 */
union { uint i; float f; } in;
- float fscale = (*f)[i] * scale;
+ float fscale = f[i] * scale;
in.f = (fscale > 0.0f)? ((fscale < 65500.0f)? fscale: 65500.0f): 0.0f;
int x = in.i;
@@ -70,7 +70,7 @@ ccl_device_inline void float4_store_half(half *h, const float4 *f, float scale)
const __m128i mm_7FFFFFFF = _mm_set1_epi32(0x7FFFFFFF);
const __m128i mm_C8000000 = _mm_set1_epi32(0xC8000000);
- __m128 mm_fscale = _mm_mul_ps(*(__m128*)f, mm_scale);
+ __m128 mm_fscale = _mm_mul_ps(load_m128(f), mm_scale);
__m128i x = _mm_castps_si128(_mm_min_ps(_mm_max_ps(mm_fscale, _mm_set_ps1(0.0f)), _mm_set_ps1(65500.0f)));
__m128i absolute = _mm_and_si128(x, mm_7FFFFFFF);
__m128i Z = _mm_add_epi32(absolute, mm_C8000000);
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index fd5ba1de37b..fff682bb436 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -180,6 +180,30 @@ ccl_device_inline const __m128 set_sign_bit(const __m128 &a)
return _mm_xor_ps(a, _mm_castsi128_ps(_mm_setr_epi32(S1 << 31, S2 << 31, S3 << 31, S4 << 31)));
}
+#ifdef __KERNEL_WITH_SSE_ALIGN__
+ccl_device_inline const __m128 load_m128(const float4 &vec)
+{
+ return _mm_load_ps(&vec.x);
+}
+
+ccl_device_inline const __m128 load_m128(const float3 &vec)
+{
+ return _mm_load_ps(&vec.x);
+}
+
+#else
+
+ccl_device_inline const __m128 load_m128(const float4 &vec)
+{
+ return _mm_loadu_ps(&vec.x);
+}
+
+ccl_device_inline const __m128 load_m128(const float3 &vec)
+{
+ return _mm_loadu_ps(&vec.x);
+}
+#endif /* __KERNEL_WITH_SSE_ALIGN__ */
+
#endif /* __KERNEL_SSE2__ */
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index c770931c69b..241da40975c 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -37,6 +37,7 @@
#define ccl_device_noinline static
#define ccl_global
#define ccl_constant
+#define __KERNEL_WITH_SSE_ALIGN__
#if defined(_WIN32) && !defined(FREE_WINDOWS)
@@ -45,6 +46,7 @@
#ifdef __KERNEL_64_BIT__
#define ccl_try_align(...) __declspec(align(__VA_ARGS__))
#else
+#undef __KERNEL_WITH_SSE_ALIGN__
#define ccl_try_align(...) /* not support for function arguments (error C2719) */
#endif
#define ccl_may_alias