From 4a04f7206914a49f5f95adc5eb786237f1a9f547 Mon Sep 17 00:00:00 2001
From: Campbell Barton
Date: Sun, 23 Oct 2011 17:52:20 +0000
Subject: remove $Id: tags after discussion on the mailing list:
 http://markmail.org/message/fp7ozcywxum3ar7n

---
 extern/Eigen3/Eigen/src/Core/arch/SSE/Complex.h | 447 ++++++++++++++++++++++++
 1 file changed, 447 insertions(+)
 create mode 100644 extern/Eigen3/Eigen/src/Core/arch/SSE/Complex.h

(limited to 'extern/Eigen3/Eigen/src/Core/arch/SSE/Complex.h')

diff --git a/extern/Eigen3/Eigen/src/Core/arch/SSE/Complex.h b/extern/Eigen3/Eigen/src/Core/arch/SSE/Complex.h
new file mode 100644
index 00000000000..c352bb3e6cf
--- /dev/null
+++ b/extern/Eigen3/Eigen/src/Core/arch/SSE/Complex.h
@@ -0,0 +1,447 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_COMPLEX_SSE_H
+#define EIGEN_COMPLEX_SSE_H
+
+namespace internal {
+
+//---------- float ----------
+struct Packet2cf
+{
+  EIGEN_STRONG_INLINE Packet2cf() {}
+  EIGEN_STRONG_INLINE explicit Packet2cf(const __m128& a) : v(a) {}
+  __m128 v;
+};
+
+template<> struct packet_traits<std::complex<float> > : default_packet_traits
+{
+  typedef Packet2cf type;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
+
+template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a)
+{
+  const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
+  return Packet2cf(_mm_xor_ps(a.v,mask));
+}
+template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
+{
+  const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
+  return Packet2cf(_mm_xor_ps(a.v,mask));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  // TODO optimize it for SSE3 and 4
+  #ifdef EIGEN_VECTORIZE_SSE3
+  return Packet2cf(_mm_addsub_ps(_mm_mul_ps(_mm_moveldup_ps(a.v), b.v),
+                                 _mm_mul_ps(_mm_movehdup_ps(a.v),
+                                            vec4f_swizzle1(b.v, 1, 0, 3, 2))));
+//   return Packet2cf(_mm_addsub_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
+//                                  _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
+//                                             vec4f_swizzle1(b.v, 1, 0, 3, 2))));
+  #else
+  const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x00000000,0x80000000,0x00000000));
+  return Packet2cf(_mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
+                              _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
+                                                    vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask)));
+  #endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(a.v,b.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(&real_ref(*from))); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>(&real_ref(*from))); }
+
+template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
+{
+  Packet2cf res;
+  #if EIGEN_GNUC_AT_MOST(4,2)
+  // workaround annoying "may be used uninitialized in this function" warning with gcc 4.2
+  res.v = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)&from);
+  #else
+  res.v = _mm_loadl_pi(res.v, (const __m64*)&from);
+  #endif
+  return Packet2cf(_mm_movelh_ps(res.v,res.v));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&real_ref(*to), from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&real_ref(*to), from.v); }
+
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float>* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+
+template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
+{
+  #if EIGEN_GNUC_AT_MOST(4,3)
+  // Workaround gcc 4.2 ICE - this is not performance wise ideal, but who cares...
+  // This workaround also fix invalid code generation with gcc 4.3
+  EIGEN_ALIGN16 std::complex<float> res[2];
+  _mm_store_ps((float*)res, a.v);
+  return res[0];
+  #else
+  std::complex<float> res;
+  _mm_storel_pi((__m64*)&res, a.v);
+  return res;
+  #endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(_mm_castps_pd(a.v)))); }
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
+{
+  return pfirst(Packet2cf(_mm_add_ps(a.v, _mm_movehl_ps(a.v,a.v))));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
+{
+  return Packet2cf(_mm_add_ps(_mm_movelh_ps(vecs[0].v,vecs[1].v), _mm_movehl_ps(vecs[1].v,vecs[0].v)));
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
+{
+  return pfirst(pmul(a, Packet2cf(_mm_movehl_ps(a.v,a.v))));
+}
+
+template<int Offset>
+struct palign_impl<Offset,Packet2cf>
+{
+  EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second)
+  {
+    if (Offset==1)
+    {
+      first.v = _mm_movehl_ps(first.v, first.v);
+      first.v = _mm_movelh_ps(first.v, second.v);
+    }
+  }
+};
+
+template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    #ifdef EIGEN_VECTORIZE_SSE3
+    return internal::pmul(a, pconj(b));
+    #else
+    const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
+    return Packet2cf(_mm_add_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask),
+                                _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
+                                           vec4f_swizzle1(b.v, 1, 0, 3, 2))));
+    #endif
+  }
+};
+
+template<> struct conj_helper<Packet2cf, Packet2cf, true,false>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    #ifdef EIGEN_VECTORIZE_SSE3
+    return internal::pmul(pconj(a), b);
+    #else
+    const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
+    return Packet2cf(_mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
+                                _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
+                                                      vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask)));
+    #endif
+  }
+};
+
+template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
+  {
+    #ifdef EIGEN_VECTORIZE_SSE3
+    return pconj(internal::pmul(a, b));
+    #else
+    const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
+    return Packet2cf(_mm_sub_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask),
+                                _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
+                                           vec4f_swizzle1(b.v, 1, 0, 3, 2))));
+    #endif
+  }
+};
+
+template<> struct conj_helper<Packet4f, Packet2cf, false,false>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const
+  { return padd(c, pmul(x,y)); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
+  { return Packet2cf(Eigen::internal::pmul(x, y.v)); }
+};
+
+template<> struct conj_helper<Packet2cf, Packet4f, false,false>
+{
+  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const
+  { return padd(c, pmul(x,y)); }
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
+  { return Packet2cf(Eigen::internal::pmul(x.v, y)); }
+};
+
+template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  // TODO optimize it for SSE3 and 4
+  Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b);
+  __m128 s = _mm_mul_ps(b.v,b.v);
+  return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(s), 0xb1)))));
+}
+
+EIGEN_STRONG_INLINE Packet2cf pcplxflip/*<Packet2cf>*/(const Packet2cf& x)
+{
+  return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2));
+}
+
+
+//---------- double ----------
+struct Packet1cd
+{
+  EIGEN_STRONG_INLINE Packet1cd() {}
+  EIGEN_STRONG_INLINE explicit Packet1cd(const __m128d& a) : v(a) {}
+  __m128d v;
+};
+
+template<> struct packet_traits<std::complex<double> > : default_packet_traits
+{
+  typedef Packet1cd type;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 0,
+    size = 1,
+
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasSetLinear = 0
+  };
+};
+
+template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; };
+
+template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a)
+{
+  const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
+  return Packet1cd(_mm_xor_pd(a.v,mask));
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  // TODO optimize it for SSE3 and 4
+  #ifdef EIGEN_VECTORIZE_SSE3
+  return Packet1cd(_mm_addsub_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v),
+                                 _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
+                                            vec2d_swizzle1(b.v, 1, 0))));
+  #else
+  const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0));
+  return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v),
+                              _mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
+                                                    vec2d_swizzle1(b.v, 1, 0)), mask)));
+  #endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(a.v,b.v)); }
+
+// FIXME force unaligned load, this is a temporary fix
+template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from)
+{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from)
+{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from)
+{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }
+
+// FIXME force unaligned store, this is a temporary fix
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double>* to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
+
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double>* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+
+template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
+{
+  EIGEN_ALIGN16 double res[2];
+  _mm_store_pd(res, a.v);
+  return std::complex<double>(res[0],res[1]);
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a)
+{
+  return pfirst(a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)
+{
+  return vecs[0];
+}
+
+template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
+{
+  return pfirst(a);
+}
+
+template<int Offset>
+struct palign_impl<Offset,Packet1cd>
+{
+  EIGEN_STRONG_INLINE static void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
+  {
+    // FIXME is it sure we never have to align a Packet1cd?
+    // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary...
+  }
+};
+
+template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+  {
+    #ifdef EIGEN_VECTORIZE_SSE3
+    return internal::pmul(a, pconj(b));
+    #else
+    const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
+    return Packet1cd(_mm_add_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask),
+                                _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
+                                           vec2d_swizzle1(b.v, 1, 0))));
+    #endif
+  }
+};
+
+template<> struct conj_helper<Packet1cd, Packet1cd, true,false>
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+  {
+    #ifdef EIGEN_VECTORIZE_SSE3
+    return internal::pmul(pconj(a), b);
+    #else
+    const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
+    return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v),
+                                _mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
+                                                      vec2d_swizzle1(b.v, 1, 0)), mask)));
+    #endif
+  }
+};
+
+template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
+  { return padd(pmul(x,y),c); }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
+  {
+    #ifdef EIGEN_VECTORIZE_SSE3
+    return pconj(internal::pmul(a, b));
+    #else
+    const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
+    return Packet1cd(_mm_sub_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask),
+                                _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
+                                           vec2d_swizzle1(b.v, 1, 0))));
+    #endif
+  }
+};
+
+template<> struct conj_helper<Packet2d, Packet1cd, false,false>
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const
+  { return padd(c, pmul(x,y)); }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
+  { return Packet1cd(Eigen::internal::pmul(x, y.v)); }
+};
+
+template<> struct conj_helper<Packet1cd, Packet2d, false,false>
+{
+  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const
+  { return padd(c, pmul(x,y)); }
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
+  { return Packet1cd(Eigen::internal::pmul(x.v, y)); }
+};
+
+template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  // TODO optimize it for SSE3 and 4
+  Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
+  __m128d s = _mm_mul_pd(b.v,b.v);
+  return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s,_mm_shuffle_pd(s, s, 0x1))));
+}
+
+EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
+{
+  return Packet1cd(preverse(x.v));
+}
+
+} // end namespace internal
+
+#endif // EIGEN_COMPLEX_SSE_H
--
cgit v1.2.3
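
For reference, the non-SSE3 pmul path in the file above computes, for each of the two interleaved std::complex<float> values packed in an __m128, the product (ar + ai*i)(br + bi*i) = (ar*br - ai*bi) + (ar*bi + ai*br)*i by duplicating real and imaginary parts with swizzles and flipping the sign of the ai*bi terms with an XOR mask. Below is a minimal standalone sketch of that scheme, not part of the commit; the SWIZZLE macro is a hypothetical stand-in for Eigen's vec4f_swizzle1 helper, and all names in the snippet are illustrative only.

// complex_mul_sketch.cpp -- illustrative sketch of the non-SSE3 complex
// multiply used in Eigen's SSE Complex.h (assumed semantics, not Eigen code).
#include <complex>
#include <cstdio>
#include <emmintrin.h>  // SSE2 intrinsics

// Stand-in for Eigen's vec4f_swizzle1: pick lanes p0..p3 of v (p0 -> lane 0, etc.)
#define SWIZZLE(v, p0, p1, p2, p3) \
  _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v), _MM_SHUFFLE(p3, p2, p1, p0)))

int main()
{
  // Two complex<float> values per register, stored interleaved: [ar0, ai0, ar1, ai1]
  std::complex<float> a[2] = { {1.f, 2.f}, {3.f, 4.f} };
  std::complex<float> b[2] = { {5.f, 6.f}, {7.f, 8.f} };
  __m128 av = _mm_loadu_ps(reinterpret_cast<const float*>(a));
  __m128 bv = _mm_loadu_ps(reinterpret_cast<const float*>(b));

  // re = [ar*br, ar*bi, ...], im = [ai*bi, ai*br, ...]
  __m128 re = _mm_mul_ps(SWIZZLE(av, 0, 0, 2, 2), bv);
  __m128 im = _mm_mul_ps(SWIZZLE(av, 1, 1, 3, 3), SWIZZLE(bv, 1, 0, 3, 2));

  // Flip the sign of lanes 0 and 2 (the ai*bi terms), then add:
  // result = [ar*br - ai*bi, ar*bi + ai*br, ...]
  const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000, 0, 0x80000000, 0));
  __m128 prod = _mm_add_ps(re, _mm_xor_ps(im, mask));

  std::complex<float> r[2];
  _mm_storeu_ps(reinterpret_cast<float*>(r), prod);
  // Expected output: (-7,16) (-11,52), matching a[0]*b[0] and a[1]*b[1].
  std::printf("(%g,%g) (%g,%g)\n", r[0].real(), r[0].imag(), r[1].real(), r[1].imag());
  return 0;
}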