From 4a04f7206914a49f5f95adc5eb786237f1a9f547 Mon Sep 17 00:00:00 2001 From: Campbell Barton Date: Sun, 23 Oct 2011 17:52:20 +0000 Subject: remove $Id: tags after discussion on the mailign list: http://markmail.org/message/fp7ozcywxum3ar7n --- .../Eigen/src/Core/products/CoeffBasedProduct.h | 452 +++++++ .../src/Core/products/GeneralBlockPanelKernel.h | 1285 ++++++++++++++++++++ .../Eigen/src/Core/products/GeneralMatrixMatrix.h | 439 +++++++ .../Core/products/GeneralMatrixMatrixTriangular.h | 225 ++++ .../Eigen/src/Core/products/GeneralMatrixVector.h | 559 +++++++++ .../Eigen3/Eigen/src/Core/products/Parallelizer.h | 154 +++ .../src/Core/products/SelfadjointMatrixMatrix.h | 427 +++++++ .../src/Core/products/SelfadjointMatrixVector.h | 278 +++++ .../Eigen/src/Core/products/SelfadjointProduct.h | 136 +++ .../src/Core/products/SelfadjointRank2Update.h | 104 ++ .../src/Core/products/TriangularMatrixMatrix.h | 403 ++++++ .../src/Core/products/TriangularMatrixVector.h | 325 +++++ .../src/Core/products/TriangularSolverMatrix.h | 319 +++++ .../src/Core/products/TriangularSolverVector.h | 150 +++ 14 files changed, 5256 insertions(+) create mode 100644 extern/Eigen3/Eigen/src/Core/products/CoeffBasedProduct.h create mode 100644 extern/Eigen3/Eigen/src/Core/products/GeneralBlockPanelKernel.h create mode 100644 extern/Eigen3/Eigen/src/Core/products/GeneralMatrixMatrix.h create mode 100644 extern/Eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h create mode 100644 extern/Eigen3/Eigen/src/Core/products/GeneralMatrixVector.h create mode 100644 extern/Eigen3/Eigen/src/Core/products/Parallelizer.h create mode 100644 extern/Eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix.h create mode 100644 extern/Eigen3/Eigen/src/Core/products/SelfadjointMatrixVector.h create mode 100644 extern/Eigen3/Eigen/src/Core/products/SelfadjointProduct.h create mode 100644 extern/Eigen3/Eigen/src/Core/products/SelfadjointRank2Update.h create mode 100644 extern/Eigen3/Eigen/src/Core/products/TriangularMatrixMatrix.h create mode 100644 extern/Eigen3/Eigen/src/Core/products/TriangularMatrixVector.h create mode 100644 extern/Eigen3/Eigen/src/Core/products/TriangularSolverMatrix.h create mode 100644 extern/Eigen3/Eigen/src/Core/products/TriangularSolverVector.h (limited to 'extern/Eigen3/Eigen/src/Core/products') diff --git a/extern/Eigen3/Eigen/src/Core/products/CoeffBasedProduct.h b/extern/Eigen3/Eigen/src/Core/products/CoeffBasedProduct.h new file mode 100644 index 00000000000..dc20f7e1e29 --- /dev/null +++ b/extern/Eigen3/Eigen/src/Core/products/CoeffBasedProduct.h @@ -0,0 +1,452 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2006-2008 Benoit Jacob +// Copyright (C) 2008-2010 Gael Guennebaud +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see . + +#ifndef EIGEN_COEFFBASED_PRODUCT_H +#define EIGEN_COEFFBASED_PRODUCT_H + +namespace internal { + +/********************************************************************************* +* Coefficient based product implementation. +* It is designed for the following use cases: +* - small fixed sizes +* - lazy products +*********************************************************************************/ + +/* Since the all the dimensions of the product are small, here we can rely + * on the generic Assign mechanism to evaluate the product per coeff (or packet). + * + * Note that here the inner-loops should always be unrolled. + */ + +template +struct product_coeff_impl; + +template +struct product_packet_impl; + +template +struct traits > +{ + typedef MatrixXpr XprKind; + typedef typename remove_all::type _LhsNested; + typedef typename remove_all::type _RhsNested; + typedef typename scalar_product_traits::ReturnType Scalar; + typedef typename promote_storage_type::StorageKind, + typename traits<_RhsNested>::StorageKind>::ret StorageKind; + typedef typename promote_index_type::Index, + typename traits<_RhsNested>::Index>::type Index; + + enum { + LhsCoeffReadCost = _LhsNested::CoeffReadCost, + RhsCoeffReadCost = _RhsNested::CoeffReadCost, + LhsFlags = _LhsNested::Flags, + RhsFlags = _RhsNested::Flags, + + RowsAtCompileTime = _LhsNested::RowsAtCompileTime, + ColsAtCompileTime = _RhsNested::ColsAtCompileTime, + InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(_LhsNested::ColsAtCompileTime, _RhsNested::RowsAtCompileTime), + + MaxRowsAtCompileTime = _LhsNested::MaxRowsAtCompileTime, + MaxColsAtCompileTime = _RhsNested::MaxColsAtCompileTime, + + LhsRowMajor = LhsFlags & RowMajorBit, + RhsRowMajor = RhsFlags & RowMajorBit, + + SameType = is_same::value, + + CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit) + && (ColsAtCompileTime == Dynamic + || ( (ColsAtCompileTime % packet_traits::size) == 0 + && (RhsFlags&AlignedBit) + ) + ), + + CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit) + && (RowsAtCompileTime == Dynamic + || ( (RowsAtCompileTime % packet_traits::size) == 0 + && (LhsFlags&AlignedBit) + ) + ), + + EvalToRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1 + : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0 + : (RhsRowMajor && !CanVectorizeLhs), + + Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & ~RowMajorBit) + | (EvalToRowMajor ? RowMajorBit : 0) + | NestingFlags + | (LhsFlags & RhsFlags & AlignedBit) + // TODO enable vectorization for mixed types + | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0), + + CoeffReadCost = InnerSize == Dynamic ? Dynamic + : InnerSize * (NumTraits::MulCost + LhsCoeffReadCost + RhsCoeffReadCost) + + (InnerSize - 1) * NumTraits::AddCost, + + /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside + * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner + * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect + * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI. 
+ */ + CanVectorizeInner = SameType + && LhsRowMajor + && (!RhsRowMajor) + && (LhsFlags & RhsFlags & ActualPacketAccessBit) + && (LhsFlags & RhsFlags & AlignedBit) + && (InnerSize % packet_traits::size == 0) + }; +}; + +} // end namespace internal + +template +class CoeffBasedProduct + : internal::no_assignment_operator, + public MatrixBase > +{ + public: + + typedef MatrixBase Base; + EIGEN_DENSE_PUBLIC_INTERFACE(CoeffBasedProduct) + typedef typename Base::PlainObject PlainObject; + + private: + + typedef typename internal::traits::_LhsNested _LhsNested; + typedef typename internal::traits::_RhsNested _RhsNested; + + enum { + PacketSize = internal::packet_traits::size, + InnerSize = internal::traits::InnerSize, + Unroll = CoeffReadCost != Dynamic && CoeffReadCost <= EIGEN_UNROLLING_LIMIT, + CanVectorizeInner = internal::traits::CanVectorizeInner + }; + + typedef internal::product_coeff_impl ScalarCoeffImpl; + + typedef CoeffBasedProduct LazyCoeffBasedProductType; + + public: + + inline CoeffBasedProduct(const CoeffBasedProduct& other) + : Base(), m_lhs(other.m_lhs), m_rhs(other.m_rhs) + {} + + template + inline CoeffBasedProduct(const Lhs& lhs, const Rhs& rhs) + : m_lhs(lhs), m_rhs(rhs) + { + // we don't allow taking products of matrices of different real types, as that wouldn't be vectorizable. + // We still allow to mix T and complex. + EIGEN_STATIC_ASSERT((internal::is_same::value), + YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) + eigen_assert(lhs.cols() == rhs.rows() + && "invalid matrix product" + && "if you wanted a coeff-wise or a dot product use the respective explicit functions"); + } + + EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); } + EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); } + + EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const + { + Scalar res; + ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res); + return res; + } + + /* Allow index-based non-packet access. It is impossible though to allow index-based packed access, + * which is why we don't set the LinearAccessBit. + */ + EIGEN_STRONG_INLINE const Scalar coeff(Index index) const + { + Scalar res; + const Index row = RowsAtCompileTime == 1 ? 0 : index; + const Index col = RowsAtCompileTime == 1 ? 
index : 0; + ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res); + return res; + } + + template + EIGEN_STRONG_INLINE const PacketScalar packet(Index row, Index col) const + { + PacketScalar res; + internal::product_packet_impl + ::run(row, col, m_lhs, m_rhs, res); + return res; + } + + // Implicit conversion to the nested type (trigger the evaluation of the product) + EIGEN_STRONG_INLINE operator const PlainObject& () const + { + m_result.lazyAssign(*this); + return m_result; + } + + const _LhsNested& lhs() const { return m_lhs; } + const _RhsNested& rhs() const { return m_rhs; } + + const Diagonal diagonal() const + { return reinterpret_cast(*this); } + + template + const Diagonal diagonal() const + { return reinterpret_cast(*this); } + + const Diagonal diagonal(Index index) const + { return reinterpret_cast(*this).diagonal(index); } + + protected: + const LhsNested m_lhs; + const RhsNested m_rhs; + + mutable PlainObject m_result; +}; + +namespace internal { + +// here we need to overload the nested rule for products +// such that the nested type is a const reference to a plain matrix +template +struct nested, N, PlainObject> +{ + typedef PlainObject const& type; +}; + +/*************************************************************************** +* Normal product .coeff() implementation (with meta-unrolling) +***************************************************************************/ + +/************************************** +*** Scalar path - no vectorization *** +**************************************/ + +template +struct product_coeff_impl +{ + typedef typename Lhs::Index Index; + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res) + { + product_coeff_impl::run(row, col, lhs, rhs, res); + res += lhs.coeff(row, UnrollingIndex) * rhs.coeff(UnrollingIndex, col); + } +}; + +template +struct product_coeff_impl +{ + typedef typename Lhs::Index Index; + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res) + { + res = lhs.coeff(row, 0) * rhs.coeff(0, col); + } +}; + +template +struct product_coeff_impl +{ + typedef typename Lhs::Index Index; + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar& res) + { + eigen_assert(lhs.cols()>0 && "you are using a non initialized matrix"); + res = lhs.coeff(row, 0) * rhs.coeff(0, col); + for(Index i = 1; i < lhs.cols(); ++i) + res += lhs.coeff(row, i) * rhs.coeff(i, col); + } +}; + +/******************************************* +*** Scalar path with inner vectorization *** +*******************************************/ + +template +struct product_coeff_vectorized_unroller +{ + typedef typename Lhs::Index Index; + enum { PacketSize = packet_traits::size }; + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres) + { + product_coeff_vectorized_unroller::run(row, col, lhs, rhs, pres); + pres = padd(pres, pmul( lhs.template packet(row, UnrollingIndex) , rhs.template packet(UnrollingIndex, col) )); + } +}; + +template +struct product_coeff_vectorized_unroller<0, Lhs, Rhs, Packet> +{ + typedef typename Lhs::Index Index; + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres) + { + pres = pmul(lhs.template packet(row, 0) , rhs.template packet(0, col)); + } +}; + +template +struct product_coeff_impl +{ + typedef typename Lhs::PacketScalar Packet; + typedef typename Lhs::Index 
Index; + enum { PacketSize = packet_traits::size }; + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res) + { + Packet pres; + product_coeff_vectorized_unroller::run(row, col, lhs, rhs, pres); + product_coeff_impl::run(row, col, lhs, rhs, res); + res = predux(pres); + } +}; + +template +struct product_coeff_vectorized_dyn_selector +{ + typedef typename Lhs::Index Index; + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) + { + res = lhs.row(row).transpose().cwiseProduct(rhs.col(col)).sum(); + } +}; + +// NOTE the 3 following specializations are because taking .col(0) on a vector is a bit slower +// NOTE maybe they are now useless since we have a specialization for Block +template +struct product_coeff_vectorized_dyn_selector +{ + typedef typename Lhs::Index Index; + EIGEN_STRONG_INLINE static void run(Index /*row*/, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) + { + res = lhs.transpose().cwiseProduct(rhs.col(col)).sum(); + } +}; + +template +struct product_coeff_vectorized_dyn_selector +{ + typedef typename Lhs::Index Index; + EIGEN_STRONG_INLINE static void run(Index row, Index /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) + { + res = lhs.row(row).transpose().cwiseProduct(rhs).sum(); + } +}; + +template +struct product_coeff_vectorized_dyn_selector +{ + typedef typename Lhs::Index Index; + EIGEN_STRONG_INLINE static void run(Index /*row*/, Index /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) + { + res = lhs.transpose().cwiseProduct(rhs).sum(); + } +}; + +template +struct product_coeff_impl +{ + typedef typename Lhs::Index Index; + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) + { + product_coeff_vectorized_dyn_selector::run(row, col, lhs, rhs, res); + } +}; + +/******************* +*** Packet path *** +*******************/ + +template +struct product_packet_impl +{ + typedef typename Lhs::Index Index; + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res) + { + product_packet_impl::run(row, col, lhs, rhs, res); + res = pmadd(pset1(lhs.coeff(row, UnrollingIndex)), rhs.template packet(UnrollingIndex, col), res); + } +}; + +template +struct product_packet_impl +{ + typedef typename Lhs::Index Index; + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res) + { + product_packet_impl::run(row, col, lhs, rhs, res); + res = pmadd(lhs.template packet(row, UnrollingIndex), pset1(rhs.coeff(UnrollingIndex, col)), res); + } +}; + +template +struct product_packet_impl +{ + typedef typename Lhs::Index Index; + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res) + { + res = pmul(pset1(lhs.coeff(row, 0)),rhs.template packet(0, col)); + } +}; + +template +struct product_packet_impl +{ + typedef typename Lhs::Index Index; + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res) + { + res = pmul(lhs.template packet(row, 0), pset1(rhs.coeff(0, col))); + } +}; + +template +struct product_packet_impl +{ + typedef typename Lhs::Index Index; + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet& res) + { + eigen_assert(lhs.cols()>0 && "you are using a non initialized matrix"); + res = pmul(pset1(lhs.coeff(row, 
0)),rhs.template packet(0, col)); + for(Index i = 1; i < lhs.cols(); ++i) + res = pmadd(pset1(lhs.coeff(row, i)), rhs.template packet(i, col), res); + } +}; + +template +struct product_packet_impl +{ + typedef typename Lhs::Index Index; + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet& res) + { + eigen_assert(lhs.cols()>0 && "you are using a non initialized matrix"); + res = pmul(lhs.template packet(row, 0), pset1(rhs.coeff(0, col))); + for(Index i = 1; i < lhs.cols(); ++i) + res = pmadd(lhs.template packet(row, i), pset1(rhs.coeff(i, col)), res); + } +}; + +} // end namespace internal + +#endif // EIGEN_COEFFBASED_PRODUCT_H diff --git a/extern/Eigen3/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/extern/Eigen3/Eigen/src/Core/products/GeneralBlockPanelKernel.h new file mode 100644 index 00000000000..6f3f2717007 --- /dev/null +++ b/extern/Eigen3/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -0,0 +1,1285 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2009 Gael Guennebaud +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see . + +#ifndef EIGEN_GENERAL_BLOCK_PANEL_H +#define EIGEN_GENERAL_BLOCK_PANEL_H + +namespace internal { + +template +class gebp_traits; + +/** \internal */ +inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0) +{ + static std::ptrdiff_t m_l1CacheSize = 0; + static std::ptrdiff_t m_l2CacheSize = 0; + if(m_l1CacheSize==0) + { + m_l1CacheSize = queryL1CacheSize(); + m_l2CacheSize = queryTopLevelCacheSize(); + + if(m_l1CacheSize<=0) m_l1CacheSize = 8 * 1024; + if(m_l2CacheSize<=0) m_l2CacheSize = 1 * 1024 * 1024; + } + + if(action==SetAction) + { + // set the cpu cache size and cache all block sizes from a global cache size in byte + eigen_internal_assert(l1!=0 && l2!=0); + m_l1CacheSize = *l1; + m_l2CacheSize = *l2; + } + else if(action==GetAction) + { + eigen_internal_assert(l1!=0 && l2!=0); + *l1 = m_l1CacheSize; + *l2 = m_l2CacheSize; + } + else + { + eigen_internal_assert(false); + } +} + +/** \brief Computes the blocking parameters for a m x k times k x n matrix product + * + * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension. + * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension. + * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension. 
+ * + * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar, + * this function computes the blocking size parameters along the respective dimensions + * for matrix products and related algorithms. The blocking sizes depends on various + * parameters: + * - the L1 and L2 cache sizes, + * - the register level blocking sizes defined by gebp_traits, + * - the number of scalars that fit into a packet (when vectorization is enabled). + * + * \sa setCpuCacheSizes */ +template +void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrdiff_t& n) +{ + EIGEN_UNUSED_VARIABLE(n); + // Explanations: + // Let's recall the product algorithms form kc x nc horizontal panels B' on the rhs and + // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed + // per kc x nr vertical small panels where nr is the blocking size along the n dimension + // at the register level. For vectorization purpose, these small vertical panels are unpacked, + // e.g., each coefficient is replicated to fit a packet. This small vertical panel has to + // stay in L1 cache. + std::ptrdiff_t l1, l2; + + typedef gebp_traits Traits; + enum { + kdiv = KcFactor * 2 * Traits::nr + * Traits::RhsProgress * sizeof(RhsScalar), + mr = gebp_traits::mr, + mr_mask = (0xffffffff/mr)*mr + }; + + manage_caching_sizes(GetAction, &l1, &l2); + k = std::min(k, l1/kdiv); + std::ptrdiff_t _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0; + if(_m +inline void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrdiff_t& n) +{ + computeProductBlockingSizes(k, m, n); +} + +#ifdef EIGEN_HAS_FUSE_CJMADD + #define MADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C); +#else + + // FIXME (a bit overkill maybe ?) + + template struct gebp_madd_selector { + EIGEN_STRONG_INLINE EIGEN_ALWAYS_INLINE_ATTRIB static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/) + { + c = cj.pmadd(a,b,c); + } + }; + + template struct gebp_madd_selector { + EIGEN_STRONG_INLINE EIGEN_ALWAYS_INLINE_ATTRIB static void run(const CJ& cj, T& a, T& b, T& c, T& t) + { + t = b; t = cj.pmul(a,t); c = padd(c,t); + } + }; + + template + EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t) + { + gebp_madd_selector::run(cj,a,b,c,t); + } + + #define MADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T); +// #define MADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T); +#endif + +/* Vectorization logic + * real*real: unpack rhs to constant packets, ... + * + * cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i), + * storing each res packet into two packets (2x2), + * at the end combine them: swap the second and addsub them + * cf*cf : same but with 2x4 blocks + * cplx*real : unpack rhs to constant packets, ... + * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual + */ +template +class gebp_traits +{ +public: + typedef _LhsScalar LhsScalar; + typedef _RhsScalar RhsScalar; + typedef typename scalar_product_traits::ReturnType ResScalar; + + enum { + ConjLhs = _ConjLhs, + ConjRhs = _ConjRhs, + Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable, + LhsPacketSize = Vectorizable ? packet_traits::size : 1, + RhsPacketSize = Vectorizable ? packet_traits::size : 1, + ResPacketSize = Vectorizable ? 
packet_traits::size : 1, + + NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, + + // register block size along the N direction (must be either 2 or 4) + nr = NumberOfRegisters/4, + + // register block size along the M direction (currently, this one cannot be modified) + mr = 2 * LhsPacketSize, + + WorkSpaceFactor = nr * RhsPacketSize, + + LhsProgress = LhsPacketSize, + RhsProgress = RhsPacketSize + }; + + typedef typename packet_traits::type _LhsPacket; + typedef typename packet_traits::type _RhsPacket; + typedef typename packet_traits::type _ResPacket; + + typedef typename conditional::type LhsPacket; + typedef typename conditional::type RhsPacket; + typedef typename conditional::type ResPacket; + + typedef ResPacket AccPacket; + + EIGEN_STRONG_INLINE void initAcc(AccPacket& p) + { + p = pset1(ResScalar(0)); + } + + EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) + { + for(DenseIndex k=0; k(&b[k*RhsPacketSize], rhs[k]); + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = pload(b); + } + + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const + { + dest = pload(a); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, AccPacket& tmp) const + { + tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp); + } + + EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const + { + r = pmadd(c,alpha,r); + } + +protected: +// conj_helper cj; +// conj_helper pcj; +}; + +template +class gebp_traits, RealScalar, _ConjLhs, false> +{ +public: + typedef std::complex LhsScalar; + typedef RealScalar RhsScalar; + typedef typename scalar_product_traits::ReturnType ResScalar; + + enum { + ConjLhs = _ConjLhs, + ConjRhs = false, + Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable, + LhsPacketSize = Vectorizable ? packet_traits::size : 1, + RhsPacketSize = Vectorizable ? packet_traits::size : 1, + ResPacketSize = Vectorizable ? 
packet_traits::size : 1, + + NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, + nr = NumberOfRegisters/4, + mr = 2 * LhsPacketSize, + WorkSpaceFactor = nr*RhsPacketSize, + + LhsProgress = LhsPacketSize, + RhsProgress = RhsPacketSize + }; + + typedef typename packet_traits::type _LhsPacket; + typedef typename packet_traits::type _RhsPacket; + typedef typename packet_traits::type _ResPacket; + + typedef typename conditional::type LhsPacket; + typedef typename conditional::type RhsPacket; + typedef typename conditional::type ResPacket; + + typedef ResPacket AccPacket; + + EIGEN_STRONG_INLINE void initAcc(AccPacket& p) + { + p = pset1(ResScalar(0)); + } + + EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) + { + for(DenseIndex k=0; k(&b[k*RhsPacketSize], rhs[k]); + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = pload(b); + } + + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const + { + dest = pload(a); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const + { + madd_impl(a, b, c, tmp, typename conditional::type()); + } + + EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const + { + tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp); + } + + EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const + { + c += a * b; + } + + EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const + { + r = cj.pmadd(c,alpha,r); + } + +protected: + conj_helper cj; +}; + +template +class gebp_traits, std::complex, _ConjLhs, _ConjRhs > +{ +public: + typedef std::complex Scalar; + typedef std::complex LhsScalar; + typedef std::complex RhsScalar; + typedef std::complex ResScalar; + + enum { + ConjLhs = _ConjLhs, + ConjRhs = _ConjRhs, + Vectorizable = packet_traits::Vectorizable + && packet_traits::Vectorizable, + RealPacketSize = Vectorizable ? packet_traits::size : 1, + ResPacketSize = Vectorizable ? packet_traits::size : 1, + + nr = 2, + mr = 2 * ResPacketSize, + WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr, + + LhsProgress = ResPacketSize, + RhsProgress = Vectorizable ? 
2*ResPacketSize : 1 + }; + + typedef typename packet_traits::type RealPacket; + typedef typename packet_traits::type ScalarPacket; + struct DoublePacket + { + RealPacket first; + RealPacket second; + }; + + typedef typename conditional::type LhsPacket; + typedef typename conditional::type RhsPacket; + typedef typename conditional::type ResPacket; + typedef typename conditional::type AccPacket; + + EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); } + + EIGEN_STRONG_INLINE void initAcc(DoublePacket& p) + { + p.first = pset1(RealScalar(0)); + p.second = pset1(RealScalar(0)); + } + + /* Unpack the rhs coeff such that each complex coefficient is spread into + * two packects containing respectively the real and imaginary coefficient + * duplicated as many time as needed: (x+iy) => [x, ..., x] [y, ..., y] + */ + EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const Scalar* rhs, Scalar* b) + { + for(DenseIndex k=0; k((RealScalar*)&b[k*ResPacketSize*2+0], real(rhs[k])); + pstore1((RealScalar*)&b[k*ResPacketSize*2+ResPacketSize], imag(rhs[k])); + } + else + b[k] = rhs[k]; + } + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { dest = *b; } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const + { + dest.first = pload((const RealScalar*)b); + dest.second = pload((const RealScalar*)(b+ResPacketSize)); + } + + // nothing special here + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const + { + dest = pload((const typename unpacket_traits::type*)(a)); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacket& c, RhsPacket& /*tmp*/) const + { + c.first = padd(pmul(a,b.first), c.first); + c.second = padd(pmul(a,b.second),c.second); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const + { + c = cj.pmadd(a,b,c); + } + + EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; } + + EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacket& alpha, ResPacket& r) const + { + // assemble c + ResPacket tmp; + if((!ConjLhs)&&(!ConjRhs)) + { + tmp = pcplxflip(pconj(ResPacket(c.second))); + tmp = padd(ResPacket(c.first),tmp); + } + else if((!ConjLhs)&&(ConjRhs)) + { + tmp = pconj(pcplxflip(ResPacket(c.second))); + tmp = padd(ResPacket(c.first),tmp); + } + else if((ConjLhs)&&(!ConjRhs)) + { + tmp = pcplxflip(ResPacket(c.second)); + tmp = padd(pconj(ResPacket(c.first)),tmp); + } + else if((ConjLhs)&&(ConjRhs)) + { + tmp = pcplxflip(ResPacket(c.second)); + tmp = psub(pconj(ResPacket(c.first)),tmp); + } + + r = pmadd(tmp,alpha,r); + } + +protected: + conj_helper cj; +}; + +template +class gebp_traits, false, _ConjRhs > +{ +public: + typedef std::complex Scalar; + typedef RealScalar LhsScalar; + typedef Scalar RhsScalar; + typedef Scalar ResScalar; + + enum { + ConjLhs = false, + ConjRhs = _ConjRhs, + Vectorizable = packet_traits::Vectorizable + && packet_traits::Vectorizable, + LhsPacketSize = Vectorizable ? packet_traits::size : 1, + RhsPacketSize = Vectorizable ? packet_traits::size : 1, + ResPacketSize = Vectorizable ? 
packet_traits::size : 1, + + NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, + nr = 4, + mr = 2*ResPacketSize, + WorkSpaceFactor = nr*RhsPacketSize, + + LhsProgress = ResPacketSize, + RhsProgress = ResPacketSize + }; + + typedef typename packet_traits::type _LhsPacket; + typedef typename packet_traits::type _RhsPacket; + typedef typename packet_traits::type _ResPacket; + + typedef typename conditional::type LhsPacket; + typedef typename conditional::type RhsPacket; + typedef typename conditional::type ResPacket; + + typedef ResPacket AccPacket; + + EIGEN_STRONG_INLINE void initAcc(AccPacket& p) + { + p = pset1(ResScalar(0)); + } + + EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) + { + for(DenseIndex k=0; k(&b[k*RhsPacketSize], rhs[k]); + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = pload(b); + } + + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const + { + dest = ploaddup(a); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const + { + madd_impl(a, b, c, tmp, typename conditional::type()); + } + + EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const + { + tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp); + } + + EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const + { + c += a * b; + } + + EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const + { + r = cj.pmadd(alpha,c,r); + } + +protected: + conj_helper cj; +}; + +/* optimized GEneral packed Block * packed Panel product kernel + * + * Mixing type logic: C += A * B + * | A | B | comments + * |real |cplx | no vectorization yet, would require to pack A with duplication + * |cplx |real | easy vectorization + */ +template +struct gebp_kernel +{ + typedef gebp_traits Traits; + typedef typename Traits::ResScalar ResScalar; + typedef typename Traits::LhsPacket LhsPacket; + typedef typename Traits::RhsPacket RhsPacket; + typedef typename Traits::ResPacket ResPacket; + typedef typename Traits::AccPacket AccPacket; + + enum { + Vectorizable = Traits::Vectorizable, + LhsProgress = Traits::LhsProgress, + RhsProgress = Traits::RhsProgress, + ResPacketSize = Traits::ResPacketSize + }; + + EIGEN_FLATTEN_ATTRIB + void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB = 0) + { + Traits traits; + + if(strideA==-1) strideA = depth; + if(strideB==-1) strideB = depth; + conj_helper cj; +// conj_helper pcj; + Index packet_cols = (cols/nr) * nr; + const Index peeled_mc = (rows/mr)*mr; + // FIXME: + const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsProgress ? LhsProgress : 0); + const Index peeled_kc = (depth/4)*4; + + if(unpackedB==0) + unpackedB = const_cast(blockB - strideB * nr * RhsProgress); + + // loops on each micro vertical panel of rhs (depth x nr) + for(Index j2=0; j2 we select a mr x nr micro block of res which is entirely + // stored into mr/packet_size x nr registers. 
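// ---------------------------------------------------------------------------
// Illustrative sketch only (not Eigen code, not part of this patch): the idea
// behind the mr x nr register blocking used in the loop below, written as a
// plain scalar micro-kernel. In the real gebp kernel each of the mr*nr
// accumulators lives in a vector register and is updated with packet madd
// operations; the packed layouts assumed here (A as [k*mr + i], B as
// [k*nr + j]) roughly follow the pack routines above, ignoring the
// per-packet unpacking of B.

template<int mr, int nr>
void micro_kernel_sketch(const float* A, const float* B, float* C, int ldc,
                         int depth, float alpha)
{
  float acc[mr][nr] = {};                  // mr*nr accumulators, kept in registers
  for (int k = 0; k < depth; ++k)          // one pass over the packed panels
    for (int i = 0; i < mr; ++i)
      for (int j = 0; j < nr; ++j)
        acc[i][j] += A[k*mr + i] * B[k*nr + j];
  for (int j = 0; j < nr; ++j)             // scale once, accumulate into column-major C
    for (int i = 0; i < mr; ++i)
      C[j*ldc + i] += alpha * acc[i][j];
}
// ---------------------------------------------------------------------------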
+ for(Index i=0; i(alpha); + + R0 = ploadu(r0); + R1 = ploadu(r1); + R2 = ploadu(r2); + R3 = ploadu(r3); + R4 = ploadu(r0 + ResPacketSize); + R5 = ploadu(r1 + ResPacketSize); + R6 = ploadu(r2 + ResPacketSize); + traits.acc(C0, alphav, R0); + pstoreu(r0, R0); + R0 = ploadu(r3 + ResPacketSize); + + traits.acc(C1, alphav, R1); + traits.acc(C2, alphav, R2); + traits.acc(C3, alphav, R3); + traits.acc(C4, alphav, R4); + traits.acc(C5, alphav, R5); + traits.acc(C6, alphav, R6); + traits.acc(C7, alphav, R0); + + pstoreu(r1, R1); + pstoreu(r2, R2); + pstoreu(r3, R3); + pstoreu(r0 + ResPacketSize, R4); + pstoreu(r1 + ResPacketSize, R5); + pstoreu(r2 + ResPacketSize, R6); + pstoreu(r3 + ResPacketSize, R0); + } + else + { + ResPacket R0, R1, R4; + ResPacket alphav = pset1(alpha); + + R0 = ploadu(r0); + R1 = ploadu(r1); + R4 = ploadu(r0 + ResPacketSize); + traits.acc(C0, alphav, R0); + pstoreu(r0, R0); + R0 = ploadu(r1 + ResPacketSize); + traits.acc(C1, alphav, R1); + traits.acc(C4, alphav, R4); + traits.acc(C5, alphav, R0); + pstoreu(r1, R1); + pstoreu(r0 + ResPacketSize, R4); + pstoreu(r1 + ResPacketSize, R0); + } + + } + + if(rows-peeled_mc>=LhsProgress) + { + Index i = peeled_mc; + const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress]; + prefetch(&blA[0]); + + // gets res block as register + AccPacket C0, C1, C2, C3; + traits.initAcc(C0); + traits.initAcc(C1); + if(nr==4) traits.initAcc(C2); + if(nr==4) traits.initAcc(C3); + + // performs "inner" product + const RhsScalar* blB = unpackedB; + for(Index k=0; k(alpha); + + ResScalar* r0 = &res[(j2+0)*resStride + i]; + ResScalar* r1 = r0 + resStride; + ResScalar* r2 = r1 + resStride; + ResScalar* r3 = r2 + resStride; + + R0 = ploadu(r0); + R1 = ploadu(r1); + if(nr==4) R2 = ploadu(r2); + if(nr==4) R3 = ploadu(r3); + + traits.acc(C0, alphav, R0); + traits.acc(C1, alphav, R1); + if(nr==4) traits.acc(C2, alphav, R2); + if(nr==4) traits.acc(C3, alphav, R3); + + pstoreu(r0, R0); + pstoreu(r1, R1); + if(nr==4) pstoreu(r2, R2); + if(nr==4) pstoreu(r3, R3); + } + for(Index i=peeled_mc2; i do the same but with nr==1 + for(Index j2=packet_cols; j2(alpha); + + ResScalar* r0 = &res[(j2+0)*resStride + i]; + + R0 = ploadu(r0); + R4 = ploadu(r0+ResPacketSize); + + traits.acc(C0, alphav, R0); + traits.acc(C4, alphav, R4); + + pstoreu(r0, R0); + pstoreu(r0+ResPacketSize, R4); + } + if(rows-peeled_mc>=LhsProgress) + { + Index i = peeled_mc; + const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress]; + prefetch(&blA[0]); + + AccPacket C0; + traits.initAcc(C0); + + const RhsScalar* blB = unpackedB; + for(Index k=0; k(alpha); + ResPacket R0 = ploadu(&res[(j2+0)*resStride + i]); + traits.acc(C0, alphav, R0); + pstoreu(&res[(j2+0)*resStride + i], R0); + } + for(Index i=peeled_mc2; i +struct gemm_pack_lhs +{ + void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, + Index stride=0, Index offset=0) + { +// enum { PacketSize = packet_traits::size }; + eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); + conj_if::IsComplex && Conjugate> cj; + const_blas_data_mapper lhs(_lhs,lhsStride); + Index count = 0; + Index peeled_mc = (rows/Pack1)*Pack1; + for(Index i=0; i=Pack2) + { + if(PanelMode) count += Pack2*offset; + for(Index k=0; k +struct gemm_pack_rhs +{ + typedef typename packet_traits::type Packet; + enum { PacketSize = packet_traits::size }; + void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, + Index stride=0, 
Index offset=0) + { + eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); + conj_if::IsComplex && Conjugate> cj; + Index packet_cols = (cols/nr) * nr; + Index count = 0; + for(Index j2=0; j2 +struct gemm_pack_rhs +{ + enum { PacketSize = packet_traits::size }; + void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, + Index stride=0, Index offset=0) + { + eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); + conj_if::IsComplex && Conjugate> cj; + Index packet_cols = (cols/nr) * nr; + Index count = 0; + for(Index j2=0; j2 +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see . + +#ifndef EIGEN_GENERAL_MATRIX_MATRIX_H +#define EIGEN_GENERAL_MATRIX_MATRIX_H + +namespace internal { + +template class level3_blocking; + +/* Specialization for a row-major destination matrix => simple transposition of the product */ +template< + typename Index, + typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, + typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs> +struct general_matrix_matrix_product +{ + typedef typename scalar_product_traits::ReturnType ResScalar; + static EIGEN_STRONG_INLINE void run( + Index rows, Index cols, Index depth, + const LhsScalar* lhs, Index lhsStride, + const RhsScalar* rhs, Index rhsStride, + ResScalar* res, Index resStride, + ResScalar alpha, + level3_blocking& blocking, + GemmParallelInfo* info = 0) + { + // transpose the product such that the result is column major + general_matrix_matrix_product + ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking,info); + } +}; + +/* Specialization for a col-major destination matrix + * => Blocking algorithm following Goto's paper */ +template< + typename Index, + typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, + typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs> +struct general_matrix_matrix_product +{ +typedef typename scalar_product_traits::ReturnType ResScalar; +static void run(Index rows, Index cols, Index depth, + const LhsScalar* _lhs, Index lhsStride, + const RhsScalar* _rhs, Index rhsStride, + ResScalar* res, Index resStride, + ResScalar alpha, + level3_blocking& blocking, + GemmParallelInfo* info = 0) +{ + const_blas_data_mapper lhs(_lhs,lhsStride); + const_blas_data_mapper rhs(_rhs,rhsStride); + + typedef gebp_traits Traits; + + Index kc = blocking.kc(); // cache block size along the K direction + Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction + //Index nc = blocking.nc(); // cache block size along 
the N direction + + gemm_pack_lhs pack_lhs; + gemm_pack_rhs pack_rhs; + gebp_kernel gebp; + +#ifdef EIGEN_HAS_OPENMP + if(info) + { + // this is the parallel version! + Index tid = omp_get_thread_num(); + Index threads = omp_get_num_threads(); + + std::size_t sizeA = kc*mc; + std::size_t sizeW = kc*Traits::WorkSpaceFactor; + ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, 0); + ei_declare_aligned_stack_constructed_variable(RhsScalar, w, sizeW, 0); + + RhsScalar* blockB = blocking.blockB(); + eigen_internal_assert(blockB!=0); + + // For each horizontal panel of the rhs, and corresponding vertical panel of the lhs... + for(Index k=0; k rows of B', and cols of the A' + + // In order to reduce the chance that a thread has to wait for the other, + // let's start by packing A'. + pack_lhs(blockA, &lhs(0,k), lhsStride, actual_kc, mc); + + // Pack B_k to B' in a parallel fashion: + // each thread packs the sub block B_k,j to B'_j where j is the thread id. + + // However, before copying to B'_j, we have to make sure that no other thread is still using it, + // i.e., we test that info[tid].users equals 0. + // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it. + while(info[tid].users!=0) {} + info[tid].users += threads; + + pack_rhs(blockB+info[tid].rhs_start*actual_kc, &rhs(k,info[tid].rhs_start), rhsStride, actual_kc, info[tid].rhs_length); + + // Notify the other threads that the part B'_j is ready to go. + info[tid].sync = k; + + // Computes C_i += A' * B' per B'_j + for(Index shift=0; shift0) + while(info[j].sync!=k) {} + + gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*actual_kc, mc, actual_kc, info[j].rhs_length, alpha, -1,-1,0,0, w); + } + + // Then keep going as usual with the remaining A' + for(Index i=mc; i Pack rhs's panel into a sequential chunk of memory (L2 caching) + // Note that this panel will be read as many times as the number of blocks in the lhs's + // vertical panel which is, in practice, a very low number. + pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols); + + + // For each mc x kc block of the lhs's vertical panel... 
+ // (==GEPP_VAR1) + for(Index i2=0; i2 for "large" GEMM, i.e., +* implementation of the high level wrapper to general_matrix_matrix_product +**********************************************************************************/ + +template +struct traits > + : traits, Lhs, Rhs> > +{}; + +template +struct gemm_functor +{ + gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, Scalar actualAlpha, + BlockingType& blocking) + : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha), m_blocking(blocking) + {} + + void initParallelSession() const + { + m_blocking.allocateB(); + } + + void operator() (Index row, Index rows, Index col=0, Index cols=-1, GemmParallelInfo* info=0) const + { + if(cols==-1) + cols = m_rhs.cols(); + + Gemm::run(rows, cols, m_lhs.cols(), + /*(const Scalar*)*/&m_lhs.coeffRef(row,0), m_lhs.outerStride(), + /*(const Scalar*)*/&m_rhs.coeffRef(0,col), m_rhs.outerStride(), + (Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(), + m_actualAlpha, m_blocking, info); + } + + protected: + const Lhs& m_lhs; + const Rhs& m_rhs; + Dest& m_dest; + Scalar m_actualAlpha; + BlockingType& m_blocking; +}; + +template class gemm_blocking_space; + +template +class level3_blocking +{ + typedef _LhsScalar LhsScalar; + typedef _RhsScalar RhsScalar; + + protected: + LhsScalar* m_blockA; + RhsScalar* m_blockB; + RhsScalar* m_blockW; + + DenseIndex m_mc; + DenseIndex m_nc; + DenseIndex m_kc; + + public: + + level3_blocking() + : m_blockA(0), m_blockB(0), m_blockW(0), m_mc(0), m_nc(0), m_kc(0) + {} + + inline DenseIndex mc() const { return m_mc; } + inline DenseIndex nc() const { return m_nc; } + inline DenseIndex kc() const { return m_kc; } + + inline LhsScalar* blockA() { return m_blockA; } + inline RhsScalar* blockB() { return m_blockB; } + inline RhsScalar* blockW() { return m_blockW; } +}; + +template +class gemm_blocking_space + : public level3_blocking< + typename conditional::type, + typename conditional::type> +{ + enum { + Transpose = StorageOrder==RowMajor, + ActualRows = Transpose ? MaxCols : MaxRows, + ActualCols = Transpose ? MaxRows : MaxCols + }; + typedef typename conditional::type LhsScalar; + typedef typename conditional::type RhsScalar; + typedef gebp_traits Traits; + enum { + SizeA = ActualRows * MaxDepth, + SizeB = ActualCols * MaxDepth, + SizeW = MaxDepth * Traits::WorkSpaceFactor + }; + + EIGEN_ALIGN16 LhsScalar m_staticA[SizeA]; + EIGEN_ALIGN16 RhsScalar m_staticB[SizeB]; + EIGEN_ALIGN16 RhsScalar m_staticW[SizeW]; + + public: + + gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/) + { + this->m_mc = ActualRows; + this->m_nc = ActualCols; + this->m_kc = MaxDepth; + this->m_blockA = m_staticA; + this->m_blockB = m_staticB; + this->m_blockW = m_staticW; + } + + inline void allocateA() {} + inline void allocateB() {} + inline void allocateW() {} + inline void allocateAll() {} +}; + +template +class gemm_blocking_space + : public level3_blocking< + typename conditional::type, + typename conditional::type> +{ + enum { + Transpose = StorageOrder==RowMajor + }; + typedef typename conditional::type LhsScalar; + typedef typename conditional::type RhsScalar; + typedef gebp_traits Traits; + + DenseIndex m_sizeA; + DenseIndex m_sizeB; + DenseIndex m_sizeW; + + public: + + gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth) + { + this->m_mc = Transpose ? cols : rows; + this->m_nc = Transpose ? 
rows : cols; + this->m_kc = depth; + + computeProductBlockingSizes(this->m_kc, this->m_mc, this->m_nc); + m_sizeA = this->m_mc * this->m_kc; + m_sizeB = this->m_kc * this->m_nc; + m_sizeW = this->m_kc*Traits::WorkSpaceFactor; + } + + void allocateA() + { + if(this->m_blockA==0) + this->m_blockA = aligned_new(m_sizeA); + } + + void allocateB() + { + if(this->m_blockB==0) + this->m_blockB = aligned_new(m_sizeB); + } + + void allocateW() + { + if(this->m_blockW==0) + this->m_blockW = aligned_new(m_sizeW); + } + + void allocateAll() + { + allocateA(); + allocateB(); + allocateW(); + } + + ~gemm_blocking_space() + { + aligned_delete(this->m_blockA, m_sizeA); + aligned_delete(this->m_blockB, m_sizeB); + aligned_delete(this->m_blockW, m_sizeW); + } +}; + +} // end namespace internal + +template +class GeneralProduct + : public ProductBase, Lhs, Rhs> +{ + enum { + MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(Lhs::MaxColsAtCompileTime,Rhs::MaxRowsAtCompileTime) + }; + public: + EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct) + + typedef typename Lhs::Scalar LhsScalar; + typedef typename Rhs::Scalar RhsScalar; + typedef Scalar ResScalar; + + GeneralProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) + { + typedef internal::scalar_product_op BinOp; + EIGEN_CHECK_BINARY_COMPATIBILIY(BinOp,LhsScalar,RhsScalar); + } + + template void scaleAndAddTo(Dest& dst, Scalar alpha) const + { + eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols()); + + const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs); + const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs); + + Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) + * RhsBlasTraits::extractScalarFactor(m_rhs); + + typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,LhsScalar,RhsScalar, + Dest::MaxRowsAtCompileTime,Dest::MaxColsAtCompileTime,MaxDepthAtCompileTime> BlockingType; + + typedef internal::gemm_functor< + Scalar, Index, + internal::general_matrix_matrix_product< + Index, + LhsScalar, (_ActualLhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate), + RhsScalar, (_ActualRhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate), + (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>, + _ActualLhsType, _ActualRhsType, Dest, BlockingType> GemmFunctor; + + BlockingType blocking(dst.rows(), dst.cols(), lhs.cols()); + + internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), this->rows(), this->cols(), Dest::Flags&RowMajorBit); + } +}; + +#endif // EIGEN_GENERAL_MATRIX_MATRIX_H diff --git a/extern/Eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/extern/Eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h new file mode 100644 index 00000000000..5043b64fe2e --- /dev/null +++ b/extern/Eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -0,0 +1,225 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009-2010 Gael Guennebaud +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. 
+// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see . + +#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H +#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H + +namespace internal { + +/********************************************************************** +* This file implements a general A * B product while +* evaluating only one triangular part of the product. +* This is more general version of self adjoint product (C += A A^T) +* as the level 3 SYRK Blas routine. +**********************************************************************/ + +// forward declarations (defined at the end of this file) +template +struct tribb_kernel; + +/* Optimized matrix-matrix product evaluating only one triangular half */ +template +struct general_matrix_matrix_triangular_product; + +// as usual if the result is row major => we transpose the product +template +struct general_matrix_matrix_triangular_product +{ + typedef typename scalar_product_traits::ReturnType ResScalar; + static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride, + const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride, ResScalar alpha) + { + general_matrix_matrix_triangular_product + ::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha); + } +}; + +template +struct general_matrix_matrix_triangular_product +{ + typedef typename scalar_product_traits::ReturnType ResScalar; + static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride, + const RhsScalar* _rhs, Index rhsStride, ResScalar* res, Index resStride, ResScalar alpha) + { + const_blas_data_mapper lhs(_lhs,lhsStride); + const_blas_data_mapper rhs(_rhs,rhsStride); + + typedef gebp_traits Traits; + + Index kc = depth; // cache block size along the K direction + Index mc = size; // cache block size along the M direction + Index nc = size; // cache block size along the N direction + computeProductBlockingSizes(kc, mc, nc); + // !!! 
mc must be a multiple of nr: + if(mc > Traits::nr) + mc = (mc/Traits::nr)*Traits::nr; + + std::size_t sizeW = kc*Traits::WorkSpaceFactor; + std::size_t sizeB = sizeW + kc*size; + ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0); + ei_declare_aligned_stack_constructed_variable(RhsScalar, allocatedBlockB, sizeB, 0); + RhsScalar* blockB = allocatedBlockB + sizeW; + + gemm_pack_lhs pack_lhs; + gemm_pack_rhs pack_rhs; + gebp_kernel gebp; + tribb_kernel sybb; + + for(Index k2=0; k2 processed with gebp or skipped + // 2 - the actual_mc x actual_mc symmetric block => processed with a special kernel + // 3 - after the diagonal => processed with gebp or skipped + if (UpLo==Lower) + gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha, + -1, -1, 0, 0, allocatedBlockB); + + sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha, allocatedBlockB); + + if (UpLo==Upper) + { + Index j2 = i2+actual_mc; + gebp(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha, + -1, -1, 0, 0, allocatedBlockB); + } + } + } + } +}; + +// Optimized packed Block * packed Block product kernel evaluating only one given triangular part +// This kernel is built on top of the gebp kernel: +// - the current destination block is processed per panel of actual_mc x BlockSize +// where BlockSize is set to the minimal value allowing gebp to be as fast as possible +// - then, as usual, each panel is split into three parts along the diagonal, +// the sub blocks above and below the diagonal are processed as usual, +// while the triangular block overlapping the diagonal is evaluated into a +// small temporary buffer which is then accumulated into the result using a +// triangular traversal. +template +struct tribb_kernel +{ + typedef gebp_traits Traits; + typedef typename Traits::ResScalar ResScalar; + + enum { + BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr) + }; + void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, ResScalar alpha, RhsScalar* workspace) + { + gebp_kernel gebp_kernel; + Matrix buffer; + + // let's process the block per panel of actual_mc x BlockSize, + // again, each is split into three parts, etc. 
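// ---------------------------------------------------------------------------
// Illustrative sketch only (not Eigen code, not part of this patch): the
// "triangular accumulation" step described above. The sub-block overlapping
// the diagonal is first computed densely into a small temporary buffer by the
// gebp kernel; afterwards only the requested triangular half is added back
// into the result. The helper below shows the Lower case for a column-major
// bs x bs block; the Upper case would keep i <= j instead.

inline void accumulate_lower_triangle_sketch(const double* buffer, int bs,
                                             double* res, int resStride)
{
  // alpha is assumed to have been applied when the buffer was filled
  for (int j = 0; j < bs; ++j)        // column by column
    for (int i = j; i < bs; ++i)      // keep only the lower (i >= j) entries
      res[j*resStride + i] += buffer[j*bs + i];
}
// ---------------------------------------------------------------------------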
+ for (Index j=0; j(BlockSize,size - j); + const RhsScalar* actual_b = blockB+j*depth; + + if(UpLo==Upper) + gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha, + -1, -1, 0, 0, workspace); + + // selfadjoint micro block + { + Index i = j; + buffer.setZero(); + // 1 - apply the kernel on the temporary buffer + gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, + -1, -1, 0, 0, workspace); + // 2 - triangular accumulation + for(Index j1=0; j1 +template +TriangularView& TriangularView::assignProduct(const ProductBase& prod, const Scalar& alpha) +{ + typedef typename internal::remove_all::type Lhs; + typedef internal::blas_traits LhsBlasTraits; + typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhs; + typedef typename internal::remove_all::type _ActualLhs; + const ActualLhs actualLhs = LhsBlasTraits::extract(prod.lhs()); + + typedef typename internal::remove_all::type Rhs; + typedef internal::blas_traits RhsBlasTraits; + typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhs; + typedef typename internal::remove_all::type _ActualRhs; + const ActualRhs actualRhs = RhsBlasTraits::extract(prod.rhs()); + + typename ProductDerived::Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived()) * RhsBlasTraits::extractScalarFactor(prod.rhs().derived()); + + internal::general_matrix_matrix_triangular_product + ::run(m_matrix.cols(), actualLhs.cols(), + &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &actualRhs.coeffRef(0,0), actualRhs.outerStride(), + const_cast(m_matrix.data()), m_matrix.outerStride(), actualAlpha); + + return *this; +} + +#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H diff --git a/extern/Eigen3/Eigen/src/Core/products/GeneralMatrixVector.h b/extern/Eigen3/Eigen/src/Core/products/GeneralMatrixVector.h new file mode 100644 index 00000000000..e0e2cbf8f62 --- /dev/null +++ b/extern/Eigen3/Eigen/src/Core/products/GeneralMatrixVector.h @@ -0,0 +1,559 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2009 Gael Guennebaud +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see . + +#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H +#define EIGEN_GENERAL_MATRIX_VECTOR_H + +namespace internal { + +/* Optimized col-major matrix * vector product: + * This algorithm processes 4 columns at onces that allows to both reduce + * the number of load/stores of the result by a factor 4 and to reduce + * the instruction dependency. Moreover, we know that all bands have the + * same alignment pattern. 
+ * + * Mixing type logic: C += alpha * A * B + * | A | B |alpha| comments + * |real |cplx |cplx | no vectorization + * |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization + * |cplx |real |cplx | invalid, the caller has to do tmp: = A * B; C += alpha*tmp + * |cplx |real |real | optimal case, vectorization possible via real-cplx mul + */ +template +struct general_matrix_vector_product +{ +typedef typename scalar_product_traits::ReturnType ResScalar; + +enum { + Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable + && int(packet_traits::size)==int(packet_traits::size), + LhsPacketSize = Vectorizable ? packet_traits::size : 1, + RhsPacketSize = Vectorizable ? packet_traits::size : 1, + ResPacketSize = Vectorizable ? packet_traits::size : 1 +}; + +typedef typename packet_traits::type _LhsPacket; +typedef typename packet_traits::type _RhsPacket; +typedef typename packet_traits::type _ResPacket; + +typedef typename conditional::type LhsPacket; +typedef typename conditional::type RhsPacket; +typedef typename conditional::type ResPacket; + +EIGEN_DONT_INLINE static void run( + Index rows, Index cols, + const LhsScalar* lhs, Index lhsStride, + const RhsScalar* rhs, Index rhsIncr, + ResScalar* res, Index + #ifdef EIGEN_INTERNAL_DEBUGGING + resIncr + #endif + , RhsScalar alpha) +{ + eigen_internal_assert(resIncr==1); + #ifdef _EIGEN_ACCUMULATE_PACKETS + #error _EIGEN_ACCUMULATE_PACKETS has already been defined + #endif + #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \ + pstore(&res[j], \ + padd(pload(&res[j]), \ + padd( \ + padd(pcj.pmul(EIGEN_CAT(ploa , A0)(&lhs0[j]), ptmp0), \ + pcj.pmul(EIGEN_CAT(ploa , A13)(&lhs1[j]), ptmp1)), \ + padd(pcj.pmul(EIGEN_CAT(ploa , A2)(&lhs2[j]), ptmp2), \ + pcj.pmul(EIGEN_CAT(ploa , A13)(&lhs3[j]), ptmp3)) ))) + + conj_helper cj; + conj_helper pcj; + if(ConjugateRhs) + alpha = conj(alpha); + + enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned }; + const Index columnsAtOnce = 4; + const Index peels = 2; + const Index LhsPacketAlignedMask = LhsPacketSize-1; + const Index ResPacketAlignedMask = ResPacketSize-1; + const Index PeelAlignedMask = ResPacketSize*peels-1; + const Index size = rows; + + // How many coeffs of the result do we have to skip to be aligned. + // Here we assume data are at least aligned on the base scalar type. + Index alignedStart = first_aligned(res,size); + Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0; + const Index peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart; + + const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0; + Index alignmentPattern = alignmentStep==0 ? AllAligned + : alignmentStep==(LhsPacketSize/2) ? EvenAligned + : FirstAligned; + + // we cannot assume the first element is aligned because of sub-matrices + const Index lhsAlignmentOffset = first_aligned(lhs,size); + + // find how many columns do we have to skip to be aligned with the result (if possible) + Index skipColumns = 0; + // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. 
for floats) + if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) ) + { + alignedSize = 0; + alignedStart = 0; + } + else if (LhsPacketSize>1) + { + eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size= cols) + || LhsPacketSize > size + || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0); + } + else if(Vectorizable) + { + alignedStart = 0; + alignedSize = size; + alignmentPattern = AllAligned; + } + + Index offset1 = (FirstAligned && alignmentStep==1?3:1); + Index offset3 = (FirstAligned && alignmentStep==1?1:3); + + Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns; + for (Index i=skipColumns; i(alpha*rhs[i*rhsIncr]), + ptmp1 = pset1(alpha*rhs[(i+offset1)*rhsIncr]), + ptmp2 = pset1(alpha*rhs[(i+2)*rhsIncr]), + ptmp3 = pset1(alpha*rhs[(i+offset3)*rhsIncr]); + + // this helps a lot generating better binary code + const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, + *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; + + if (Vectorizable) + { + /* explicit vectorization */ + // process initial unaligned coeffs + for (Index j=0; jalignedStart) + { + switch(alignmentPattern) + { + case AllAligned: + for (Index j = alignedStart; j1) + { + LhsPacket A00, A01, A02, A03, A10, A11, A12, A13; + ResPacket T0, T1; + + A01 = pload(&lhs1[alignedStart-1]); + A02 = pload(&lhs2[alignedStart-2]); + A03 = pload(&lhs3[alignedStart-3]); + + for (Index j = alignedStart; j(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11); + A12 = pload(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12); + A13 = pload(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13); + + A00 = pload(&lhs0[j]); + A10 = pload(&lhs0[j+LhsPacketSize]); + T0 = pcj.pmadd(A00, ptmp0, pload(&res[j])); + T1 = pcj.pmadd(A10, ptmp0, pload(&res[j+ResPacketSize])); + + T0 = pcj.pmadd(A01, ptmp1, T0); + A01 = pload(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01); + T0 = pcj.pmadd(A02, ptmp2, T0); + A02 = pload(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02); + T0 = pcj.pmadd(A03, ptmp3, T0); + pstore(&res[j],T0); + A03 = pload(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03); + T1 = pcj.pmadd(A11, ptmp1, T1); + T1 = pcj.pmadd(A12, ptmp2, T1); + T1 = pcj.pmadd(A13, ptmp3, T1); + pstore(&res[j+ResPacketSize],T1); + } + } + for (Index j = peeledSize; j(alpha*rhs[k*rhsIncr]); + const LhsScalar* lhs0 = lhs + k*lhsStride; + + if (Vectorizable) + { + /* explicit vectorization */ + // process first unaligned result's coeffs + for (Index j=0; j(&lhs0[i]), ptmp0, pload(&res[i]))); + else + for (Index i = alignedStart;i(&lhs0[i]), ptmp0, pload(&res[i]))); + } + + // process remaining scalars (or all if no explicit vectorization) + for (Index i=alignedSize; i +struct general_matrix_vector_product +{ +typedef typename scalar_product_traits::ReturnType ResScalar; + +enum { + Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable + && int(packet_traits::size)==int(packet_traits::size), + LhsPacketSize = Vectorizable ? packet_traits::size : 1, + RhsPacketSize = Vectorizable ? packet_traits::size : 1, + ResPacketSize = Vectorizable ? 
packet_traits::size : 1 +}; + +typedef typename packet_traits::type _LhsPacket; +typedef typename packet_traits::type _RhsPacket; +typedef typename packet_traits::type _ResPacket; + +typedef typename conditional::type LhsPacket; +typedef typename conditional::type RhsPacket; +typedef typename conditional::type ResPacket; + +EIGEN_DONT_INLINE static void run( + Index rows, Index cols, + const LhsScalar* lhs, Index lhsStride, + const RhsScalar* rhs, Index rhsIncr, + ResScalar* res, Index resIncr, + ResScalar alpha) +{ + EIGEN_UNUSED_VARIABLE(rhsIncr); + eigen_internal_assert(rhsIncr==1); + #ifdef _EIGEN_ACCUMULATE_PACKETS + #error _EIGEN_ACCUMULATE_PACKETS has already been defined + #endif + + #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\ + RhsPacket b = pload(&rhs[j]); \ + ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) (&lhs0[j]), b, ptmp0); \ + ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)(&lhs1[j]), b, ptmp1); \ + ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) (&lhs2[j]), b, ptmp2); \ + ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)(&lhs3[j]), b, ptmp3); } + + conj_helper cj; + conj_helper pcj; + + enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 }; + const Index rowsAtOnce = 4; + const Index peels = 2; + const Index RhsPacketAlignedMask = RhsPacketSize-1; + const Index LhsPacketAlignedMask = LhsPacketSize-1; + const Index PeelAlignedMask = RhsPacketSize*peels-1; + const Index depth = cols; + + // How many coeffs of the result do we have to skip to be aligned. + // Here we assume data are at least aligned on the base scalar type + // if that's not the case then vectorization is discarded, see below. + Index alignedStart = first_aligned(rhs, depth); + Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0; + const Index peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart; + + const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0; + Index alignmentPattern = alignmentStep==0 ? AllAligned + : alignmentStep==(LhsPacketSize/2) ? EvenAligned + : FirstAligned; + + // we cannot assume the first element is aligned because of sub-matrices + const Index lhsAlignmentOffset = first_aligned(lhs,depth); + + // find how many rows do we have to skip to be aligned with rhs (if possible) + Index skipRows = 0; + // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats) + if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) ) + { + alignedSize = 0; + alignedStart = 0; + } + else if (LhsPacketSize>1) + { + eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth= rows) + || LhsPacketSize > depth + || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0); + } + else if(Vectorizable) + { + alignedStart = 0; + alignedSize = depth; + alignmentPattern = AllAligned; + } + + Index offset1 = (FirstAligned && alignmentStep==1?3:1); + Index offset3 = (FirstAligned && alignmentStep==1?1:3); + + Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows; + for (Index i=skipRows; i(ResScalar(0)), ptmp1 = pset1(ResScalar(0)), + ptmp2 = pset1(ResScalar(0)), ptmp3 = pset1(ResScalar(0)); + + // process initial unaligned coeffs + // FIXME this loop get vectorized by the compiler ! 
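+    // the depth is traversed in three phases: a scalar prologue up to alignedStart,
+    // a packetized main loop up to alignedSize (optionally peeling two packets per
+    // iteration to hide the latency of the unaligned lhs loads), and a scalar
+    // epilogue for the remaining tail.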
+ for (Index j=0; jalignedStart) + { + switch(alignmentPattern) + { + case AllAligned: + for (Index j = alignedStart; j1) + { + /* Here we proccess 4 rows with with two peeled iterations to hide + * tghe overhead of unaligned loads. Moreover unaligned loads are handled + * using special shift/move operations between the two aligned packets + * overlaping the desired unaligned packet. This is *much* more efficient + * than basic unaligned loads. + */ + LhsPacket A01, A02, A03, A11, A12, A13; + A01 = pload(&lhs1[alignedStart-1]); + A02 = pload(&lhs2[alignedStart-2]); + A03 = pload(&lhs3[alignedStart-3]); + + for (Index j = alignedStart; j(&rhs[j]); + A11 = pload(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11); + A12 = pload(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12); + A13 = pload(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13); + + ptmp0 = pcj.pmadd(pload(&lhs0[j]), b, ptmp0); + ptmp1 = pcj.pmadd(A01, b, ptmp1); + A01 = pload(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01); + ptmp2 = pcj.pmadd(A02, b, ptmp2); + A02 = pload(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02); + ptmp3 = pcj.pmadd(A03, b, ptmp3); + A03 = pload(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03); + + b = pload(&rhs[j+RhsPacketSize]); + ptmp0 = pcj.pmadd(pload(&lhs0[j+LhsPacketSize]), b, ptmp0); + ptmp1 = pcj.pmadd(A11, b, ptmp1); + ptmp2 = pcj.pmadd(A12, b, ptmp2); + ptmp3 = pcj.pmadd(A13, b, ptmp3); + } + } + for (Index j = peeledSize; j(tmp0); + const LhsScalar* lhs0 = lhs + i*lhsStride; + // process first unaligned result's coeffs + // FIXME this loop get vectorized by the compiler ! + for (Index j=0; jalignedStart) + { + // process aligned rhs coeffs + if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0) + for (Index j = alignedStart;j(&lhs0[j]), pload(&rhs[j]), ptmp0); + else + for (Index j = alignedStart;j(&lhs0[j]), pload(&rhs[j]), ptmp0); + tmp0 += predux(ptmp0); + } + + // process remaining scalars + // FIXME this loop get vectorized by the compiler ! + for (Index j=alignedSize; j +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see . 
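+
+// Illustrative usage sketch (A, B, C are dense matrices declared elsewhere; the
+// namespace of these helpers differs across Eigen 3 releases, in this snapshot they
+// live in internal, and OpenMP support requires EIGEN_HAS_OPENMP):
+//   Eigen::internal::setNbThreads(4);      // cap the number of threads used by Eigen
+//   int n = Eigen::internal::nbThreads();  // query the current limit (or OpenMP default)
+//   C.noalias() = A * B;                   // large GEMMs may then be split across threads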
+ +#ifndef EIGEN_PARALLELIZER_H +#define EIGEN_PARALLELIZER_H + +namespace internal { + +/** \internal */ +inline void manage_multi_threading(Action action, int* v) +{ + static EIGEN_UNUSED int m_maxThreads = -1; + + if(action==SetAction) + { + eigen_internal_assert(v!=0); + m_maxThreads = *v; + } + else if(action==GetAction) + { + eigen_internal_assert(v!=0); + #ifdef EIGEN_HAS_OPENMP + if(m_maxThreads>0) + *v = m_maxThreads; + else + *v = omp_get_max_threads(); + #else + *v = 1; + #endif + } + else + { + eigen_internal_assert(false); + } +} + +/** \returns the max number of threads reserved for Eigen + * \sa setNbThreads */ +inline int nbThreads() +{ + int ret; + manage_multi_threading(GetAction, &ret); + return ret; +} + +/** Sets the max number of threads reserved for Eigen + * \sa nbThreads */ +inline void setNbThreads(int v) +{ + manage_multi_threading(SetAction, &v); +} + +template struct GemmParallelInfo +{ + GemmParallelInfo() : sync(-1), users(0), rhs_start(0), rhs_length(0) {} + + int volatile sync; + int volatile users; + + Index rhs_start; + Index rhs_length; +}; + +template +void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpose) +{ +#ifndef EIGEN_HAS_OPENMP + // FIXME the transpose variable is only needed to properly split + // the matrix product when multithreading is enabled. This is a temporary + // fix to support row-major destination matrices. This whole + // parallelizer mechanism has to be redisigned anyway. + EIGEN_UNUSED_VARIABLE(transpose); + func(0,rows, 0,cols); +#else + + // Dynamically check whether we should enable or disable OpenMP. + // The conditions are: + // - the max number of threads we can create is greater than 1 + // - we are not already in a parallel code + // - the sizes are large enough + + // 1- are we already in a parallel session? + // FIXME omp_get_num_threads()>1 only works for openmp, what if the user does not use openmp? + if((!Condition) || (omp_get_num_threads()>1)) + return func(0,rows, 0,cols); + + Index size = transpose ? cols : rows; + + // 2- compute the maximal number of threads from the size of the product: + // FIXME this has to be fine tuned + Index max_threads = std::max(1,size / 32); + + // 3 - compute the number of threads we are going to use + Index threads = std::min(nbThreads(), max_threads); + + if(threads==1) + return func(0,rows, 0,cols); + + func.initParallelSession(); + + if(transpose) + std::swap(rows,cols); + + Index blockCols = (cols / threads) & ~Index(0x3); + Index blockRows = (rows / threads) & ~Index(0x7); + + GemmParallelInfo* info = new GemmParallelInfo[threads]; + + #pragma omp parallel for schedule(static,1) num_threads(threads) + for(Index i=0; i +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see . + +#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_H +#define EIGEN_SELFADJOINT_MATRIX_MATRIX_H + +namespace internal { + +// pack a selfadjoint block diagonal for use with the gebp_kernel +template +struct symm_pack_lhs +{ + template inline + void pack(Scalar* blockA, const const_blas_data_mapper& lhs, Index cols, Index i, Index& count) + { + // normal copy + for(Index k=0; k lhs(_lhs,lhsStride); + Index count = 0; + Index peeled_mc = (rows/Pack1)*Pack1; + for(Index i=0; i(blockA, lhs, cols, i, count); + } + + if(rows-peeled_mc>=Pack2) + { + pack(blockA, lhs, cols, peeled_mc, count); + peeled_mc += Pack2; + } + + // do the same with mr==1 + for(Index i=peeled_mc; i +struct symm_pack_rhs +{ + enum { PacketSize = packet_traits::size }; + void operator()(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2) + { + Index end_k = k2 + rows; + Index count = 0; + const_blas_data_mapper rhs(_rhs,rhsStride); + Index packet_cols = (cols/nr)*nr; + + // first part: normal case + for(Index j2=0; j2 the same with nr==1) + for(Index j2=packet_cols; j2 +struct product_selfadjoint_matrix; + +template +struct product_selfadjoint_matrix +{ + + static EIGEN_STRONG_INLINE void run( + Index rows, Index cols, + const Scalar* lhs, Index lhsStride, + const Scalar* rhs, Index rhsStride, + Scalar* res, Index resStride, + Scalar alpha) + { + product_selfadjoint_matrix::IsComplex && EIGEN_LOGICAL_XOR(RhsSelfAdjoint,ConjugateRhs), + EIGEN_LOGICAL_XOR(LhsSelfAdjoint,LhsStorageOrder==RowMajor) ? ColMajor : RowMajor, + LhsSelfAdjoint, NumTraits::IsComplex && EIGEN_LOGICAL_XOR(LhsSelfAdjoint,ConjugateLhs), + ColMajor> + ::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha); + } +}; + +template +struct product_selfadjoint_matrix +{ + + static EIGEN_DONT_INLINE void run( + Index rows, Index cols, + const Scalar* _lhs, Index lhsStride, + const Scalar* _rhs, Index rhsStride, + Scalar* res, Index resStride, + Scalar alpha) + { + Index size = rows; + + const_blas_data_mapper lhs(_lhs,lhsStride); + const_blas_data_mapper rhs(_rhs,rhsStride); + + typedef gebp_traits Traits; + + Index kc = size; // cache block size along the K direction + Index mc = rows; // cache block size along the M direction + Index nc = cols; // cache block size along the N direction + computeProductBlockingSizes(kc, mc, nc); + // kc must smaller than mc + kc = (std::min)(kc,mc); + + std::size_t sizeW = kc*Traits::WorkSpaceFactor; + std::size_t sizeB = sizeW + kc*cols; + ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); + ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); + Scalar* blockB = allocatedBlockB + sizeW; + + gebp_kernel gebp_kernel; + symm_pack_lhs pack_lhs; + gemm_pack_rhs pack_rhs; + gemm_pack_lhs pack_lhs_transposed; + + for(Index k2=0; k2 transposed packed copy + // 2 - the diagonal block => special packed copy + // 3 - the panel below the diagonal block => generic packed copy + for(Index i2=0; i2() + (blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc); + + gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); + } + } + } +}; + +// matrix * selfadjoint product +template +struct product_selfadjoint_matrix +{ + + static EIGEN_DONT_INLINE void run( + Index rows, Index cols, + const Scalar* _lhs, Index lhsStride, + const Scalar* _rhs, Index rhsStride, + 
Scalar* res, Index resStride, + Scalar alpha) + { + Index size = cols; + + const_blas_data_mapper lhs(_lhs,lhsStride); + + typedef gebp_traits Traits; + + Index kc = size; // cache block size along the K direction + Index mc = rows; // cache block size along the M direction + Index nc = cols; // cache block size along the N direction + computeProductBlockingSizes(kc, mc, nc); + std::size_t sizeW = kc*Traits::WorkSpaceFactor; + std::size_t sizeB = sizeW + kc*cols; + ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); + ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); + Scalar* blockB = allocatedBlockB + sizeW; + + gebp_kernel gebp_kernel; + gemm_pack_lhs pack_lhs; + symm_pack_rhs pack_rhs; + + for(Index k2=0; k2 GEPP + for(Index i2=0; i2 +struct traits > + : traits, Lhs, Rhs> > +{}; +} + +template +struct SelfadjointProductMatrix + : public ProductBase, Lhs, Rhs > +{ + EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix) + + SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {} + + enum { + LhsIsUpper = (LhsMode&(Upper|Lower))==Upper, + LhsIsSelfAdjoint = (LhsMode&SelfAdjoint)==SelfAdjoint, + RhsIsUpper = (RhsMode&(Upper|Lower))==Upper, + RhsIsSelfAdjoint = (RhsMode&SelfAdjoint)==SelfAdjoint + }; + + template void scaleAndAddTo(Dest& dst, Scalar alpha) const + { + eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols()); + + const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs); + const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs); + + Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) + * RhsBlasTraits::extractScalarFactor(m_rhs); + + internal::product_selfadjoint_matrix::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint, + NumTraits::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper,bool(LhsBlasTraits::NeedToConjugate)), + EIGEN_LOGICAL_XOR(RhsIsUpper, + internal::traits::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint, + NumTraits::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)), + internal::traits::Flags&RowMajorBit ? RowMajor : ColMajor> + ::run( + lhs.rows(), rhs.cols(), // sizes + &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info + &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info + &dst.coeffRef(0,0), dst.outerStride(), // result info + actualAlpha // alpha + ); + } +}; + +#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_H diff --git a/extern/Eigen3/Eigen/src/Core/products/SelfadjointMatrixVector.h b/extern/Eigen3/Eigen/src/Core/products/SelfadjointMatrixVector.h new file mode 100644 index 00000000000..d6121fc07bd --- /dev/null +++ b/extern/Eigen3/Eigen/src/Core/products/SelfadjointMatrixVector.h @@ -0,0 +1,278 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2009 Gael Guennebaud +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. 
+// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see . + +#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_H +#define EIGEN_SELFADJOINT_MATRIX_VECTOR_H + +namespace internal { + +/* Optimized selfadjoint matrix * vector product: + * This algorithm processes 2 columns at onces that allows to both reduce + * the number of load/stores of the result by a factor 2 and to reduce + * the instruction dependency. + */ +template +static EIGEN_DONT_INLINE void product_selfadjoint_vector( + Index size, + const Scalar* lhs, Index lhsStride, + const Scalar* _rhs, Index rhsIncr, + Scalar* res, + Scalar alpha) +{ + typedef typename packet_traits::type Packet; + typedef typename NumTraits::Real RealScalar; + const Index PacketSize = sizeof(Packet)/sizeof(Scalar); + + enum { + IsRowMajor = StorageOrder==RowMajor ? 1 : 0, + IsLower = UpLo == Lower ? 1 : 0, + FirstTriangular = IsRowMajor == IsLower + }; + + conj_helper::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, IsRowMajor), ConjugateRhs> cj0; + conj_helper::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> cj1; + conj_helper::IsComplex, ConjugateRhs> cjd; + + conj_helper::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, IsRowMajor), ConjugateRhs> pcj0; + conj_helper::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> pcj1; + + Scalar cjAlpha = ConjugateRhs ? conj(alpha) : alpha; + + // FIXME this copy is now handled outside product_selfadjoint_vector, so it could probably be removed. + // if the rhs is not sequentially stored in memory we copy it to a temporary buffer, + // this is because we need to extract packets + ei_declare_aligned_stack_constructed_variable(Scalar,rhs,size,rhsIncr==1 ? const_cast(_rhs) : 0); + if (rhsIncr!=1) + { + const Scalar* it = _rhs; + for (Index i=0; i(t0); + Scalar t1 = cjAlpha * rhs[j+1]; + Packet ptmp1 = pset1(t1); + + Scalar t2 = 0; + Packet ptmp2 = pset1(t2); + Scalar t3 = 0; + Packet ptmp3 = pset1(t3); + + size_t starti = FirstTriangular ? 0 : j+2; + size_t endi = FirstTriangular ? j : size; + size_t alignedStart = (starti) + first_aligned(&res[starti], endi-starti); + size_t alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize); + + // TODO make sure this product is a real * complex and that the rhs is properly conjugated if needed + res[j] += cjd.pmul(internal::real(A0[j]), t0); + res[j+1] += cjd.pmul(internal::real(A1[j+1]), t1); + if(FirstTriangular) + { + res[j] += cj0.pmul(A1[j], t1); + t3 += cj1.pmul(A1[j], rhs[j]); + } + else + { + res[j+1] += cj0.pmul(A0[j+1],t0); + t2 += cj1.pmul(A0[j+1], rhs[j+1]); + } + + for (size_t i=starti; i huge speed up) + // gcc 4.2 does this optimization automatically. 
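+    // in this main loop each packet of A0/A1 is read exactly once and used twice: it
+    // updates the res packet (column contribution weighted by t0/t1) and it is
+    // accumulated into ptmp2/ptmp3, the symmetric row contributions added to res[j]
+    // and res[j+1] after the loop; this reuse is the point of processing two columns
+    // of the selfadjoint matrix at once.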
+ const Scalar* EIGEN_RESTRICT a0It = A0 + alignedStart; + const Scalar* EIGEN_RESTRICT a1It = A1 + alignedStart; + const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart; + Scalar* EIGEN_RESTRICT resIt = res + alignedStart; + for (size_t i=alignedStart; i(a0It); a0It += PacketSize; + Packet A1i = ploadu(a1It); a1It += PacketSize; + Packet Bi = ploadu(rhsIt); rhsIt += PacketSize; // FIXME should be aligned in most cases + Packet Xi = pload (resIt); + + Xi = pcj0.pmadd(A0i,ptmp0, pcj0.pmadd(A1i,ptmp1,Xi)); + ptmp2 = pcj1.pmadd(A0i, Bi, ptmp2); + ptmp3 = pcj1.pmadd(A1i, Bi, ptmp3); + pstore(resIt,Xi); resIt += PacketSize; + } + for (size_t i=alignedEnd; i +struct traits > + : traits, Lhs, Rhs> > +{}; +} + +template +struct SelfadjointProductMatrix + : public ProductBase, Lhs, Rhs > +{ + EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix) + + enum { + LhsUpLo = LhsMode&(Upper|Lower) + }; + + SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {} + + template void scaleAndAddTo(Dest& dest, Scalar alpha) const + { + typedef typename Dest::Scalar ResScalar; + typedef typename Base::RhsScalar RhsScalar; + typedef Map, Aligned> MappedDest; + + eigen_assert(dest.rows()==m_lhs.rows() && dest.cols()==m_rhs.cols()); + + const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs); + const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs); + + Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) + * RhsBlasTraits::extractScalarFactor(m_rhs); + + enum { + EvalToDest = (Dest::InnerStrideAtCompileTime==1), + UseRhs = (_ActualRhsType::InnerStrideAtCompileTime==1) + }; + + internal::gemv_static_vector_if static_dest; + internal::gemv_static_vector_if static_rhs; + + ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(), + EvalToDest ? dest.data() : static_dest.data()); + + ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhsPtr,rhs.size(), + UseRhs ? const_cast(rhs.data()) : static_rhs.data()); + + if(!EvalToDest) + { + #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN + int size = dest.size(); + EIGEN_DENSE_STORAGE_CTOR_PLUGIN + #endif + MappedDest(actualDestPtr, dest.size()) = dest; + } + + if(!UseRhs) + { + #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN + int size = rhs.size(); + EIGEN_DENSE_STORAGE_CTOR_PLUGIN + #endif + Map(actualRhsPtr, rhs.size()) = rhs; + } + + + internal::product_selfadjoint_vector::Flags&RowMajorBit) ? RowMajor : ColMajor, int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate), bool(RhsBlasTraits::NeedToConjugate)> + ( + lhs.rows(), // size + &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info + actualRhsPtr, 1, // rhs info + actualDestPtr, // result info + actualAlpha // scale factor + ); + + if(!EvalToDest) + dest = MappedDest(actualDestPtr, dest.size()); + } +}; + +namespace internal { +template +struct traits > + : traits, Lhs, Rhs> > +{}; +} + +template +struct SelfadjointProductMatrix + : public ProductBase, Lhs, Rhs > +{ + EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix) + + enum { + RhsUpLo = RhsMode&(Upper|Lower) + }; + + SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {} + + template void scaleAndAddTo(Dest& dest, Scalar alpha) const + { + // let's simply transpose the product + Transpose destT(dest); + SelfadjointProductMatrix, int(RhsUpLo)==Upper ? 
Lower : Upper, false, + Transpose, 0, true>(m_rhs.transpose(), m_lhs.transpose()).scaleAndAddTo(destT, alpha); + } +}; + + +#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_H diff --git a/extern/Eigen3/Eigen/src/Core/products/SelfadjointProduct.h b/extern/Eigen3/Eigen/src/Core/products/SelfadjointProduct.h new file mode 100644 index 00000000000..3a4523fa4a9 --- /dev/null +++ b/extern/Eigen3/Eigen/src/Core/products/SelfadjointProduct.h @@ -0,0 +1,136 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Gael Guennebaud +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see . + +#ifndef EIGEN_SELFADJOINT_PRODUCT_H +#define EIGEN_SELFADJOINT_PRODUCT_H + +/********************************************************************** +* This file implements a self adjoint product: C += A A^T updating only +* half of the selfadjoint matrix C. +* It corresponds to the level 3 SYRK and level 2 SYR Blas routines. +**********************************************************************/ + +template +struct selfadjoint_rank1_update; + +template +struct selfadjoint_rank1_update +{ + static void run(Index size, Scalar* mat, Index stride, const Scalar* vec, Scalar alpha) + { + internal::conj_if cj; + typedef Map > OtherMap; + typedef typename internal::conditional::type ConjRhsType; + for (Index i=0; i >(mat+stride*i+(UpLo==Lower ? i : 0), (UpLo==Lower ? size-i : (i+1))) + += (alpha * cj(vec[i])) * ConjRhsType(OtherMap(vec+(UpLo==Lower ? i : 0),UpLo==Lower ? size-i : (i+1))); + } + } +}; + +template +struct selfadjoint_rank1_update +{ + static void run(Index size, Scalar* mat, Index stride, const Scalar* vec, Scalar alpha) + { + selfadjoint_rank1_update::run(size,mat,stride,vec,alpha); + } +}; + +template +struct selfadjoint_product_selector; + +template +struct selfadjoint_product_selector +{ + static void run(MatrixType& mat, const OtherType& other, typename MatrixType::Scalar alpha) + { + typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::Index Index; + typedef internal::blas_traits OtherBlasTraits; + typedef typename OtherBlasTraits::DirectLinearAccessType ActualOtherType; + typedef typename internal::remove_all::type _ActualOtherType; + const ActualOtherType actualOther = OtherBlasTraits::extract(other.derived()); + + Scalar actualAlpha = alpha * OtherBlasTraits::extractScalarFactor(other.derived()); + + enum { + StorageOrder = (internal::traits::Flags&RowMajorBit) ? 
RowMajor : ColMajor, + UseOtherDirectly = _ActualOtherType::InnerStrideAtCompileTime==1 + }; + internal::gemv_static_vector_if static_other; + + ei_declare_aligned_stack_constructed_variable(Scalar, actualOtherPtr, other.size(), + (UseOtherDirectly ? const_cast(actualOther.data()) : static_other.data())); + + if(!UseOtherDirectly) + Map(actualOtherPtr, actualOther.size()) = actualOther; + + selfadjoint_rank1_update::IsComplex, + (!OtherBlasTraits::NeedToConjugate) && NumTraits::IsComplex> + ::run(other.size(), mat.data(), mat.outerStride(), actualOtherPtr, actualAlpha); + } +}; + +template +struct selfadjoint_product_selector +{ + static void run(MatrixType& mat, const OtherType& other, typename MatrixType::Scalar alpha) + { + typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::Index Index; + typedef internal::blas_traits OtherBlasTraits; + typedef typename OtherBlasTraits::DirectLinearAccessType ActualOtherType; + typedef typename internal::remove_all::type _ActualOtherType; + const ActualOtherType actualOther = OtherBlasTraits::extract(other.derived()); + + Scalar actualAlpha = alpha * OtherBlasTraits::extractScalarFactor(other.derived()); + + enum { IsRowMajor = (internal::traits::Flags&RowMajorBit) ? 1 : 0 }; + + internal::general_matrix_matrix_triangular_product::IsComplex, + Scalar, _ActualOtherType::Flags&RowMajorBit ? ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits::IsComplex, + MatrixType::Flags&RowMajorBit ? RowMajor : ColMajor, UpLo> + ::run(mat.cols(), actualOther.cols(), + &actualOther.coeffRef(0,0), actualOther.outerStride(), &actualOther.coeffRef(0,0), actualOther.outerStride(), + mat.data(), mat.outerStride(), actualAlpha); + } +}; + +// high level API + +template +template +SelfAdjointView& SelfAdjointView +::rankUpdate(const MatrixBase& u, Scalar alpha) +{ + selfadjoint_product_selector::run(_expression().const_cast_derived(), u.derived(), alpha); + + return *this; +} + +#endif // EIGEN_SELFADJOINT_PRODUCT_H diff --git a/extern/Eigen3/Eigen/src/Core/products/SelfadjointRank2Update.h b/extern/Eigen3/Eigen/src/Core/products/SelfadjointRank2Update.h new file mode 100644 index 00000000000..9f8b8438a5d --- /dev/null +++ b/extern/Eigen3/Eigen/src/Core/products/SelfadjointRank2Update.h @@ -0,0 +1,104 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Gael Guennebaud +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see . 
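+
+// Illustrative usage sketch (n, u, v and alpha are placeholders):
+//   Eigen::MatrixXcd A = Eigen::MatrixXcd::Zero(n,n);
+//   Eigen::VectorXcd u = Eigen::VectorXcd::Random(n), v = Eigen::VectorXcd::Random(n);
+//   std::complex<double> alpha(2.0, 0.5);
+//   // updates only the lower triangle with alpha*u*v.adjoint() + conj(alpha)*v*u.adjoint()
+//   A.selfadjointView<Eigen::Lower>().rankUpdate(u, v, alpha);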
+ +#ifndef EIGEN_SELFADJOINTRANK2UPTADE_H +#define EIGEN_SELFADJOINTRANK2UPTADE_H + +namespace internal { + +/* Optimized selfadjoint matrix += alpha * uv' + conj(alpha)*vu' + * It corresponds to the Level2 syr2 BLAS routine + */ + +template +struct selfadjoint_rank2_update_selector; + +template +struct selfadjoint_rank2_update_selector +{ + static void run(Scalar* mat, Index stride, const UType& u, const VType& v, Scalar alpha) + { + const Index size = u.size(); + for (Index i=0; i >(mat+stride*i+i, size-i) += + (conj(alpha) * conj(u.coeff(i))) * v.tail(size-i) + + (alpha * conj(v.coeff(i))) * u.tail(size-i); + } + } +}; + +template +struct selfadjoint_rank2_update_selector +{ + static void run(Scalar* mat, Index stride, const UType& u, const VType& v, Scalar alpha) + { + const Index size = u.size(); + for (Index i=0; i >(mat+stride*i, i+1) += + (conj(alpha) * conj(u.coeff(i))) * v.head(i+1) + + (alpha * conj(v.coeff(i))) * u.head(i+1); + } +}; + +template struct conj_expr_if + : conditional::Scalar>,T> > {}; + +} // end namespace internal + +template +template +SelfAdjointView& SelfAdjointView +::rankUpdate(const MatrixBase& u, const MatrixBase& v, Scalar alpha) +{ + typedef internal::blas_traits UBlasTraits; + typedef typename UBlasTraits::DirectLinearAccessType ActualUType; + typedef typename internal::remove_all::type _ActualUType; + const ActualUType actualU = UBlasTraits::extract(u.derived()); + + typedef internal::blas_traits VBlasTraits; + typedef typename VBlasTraits::DirectLinearAccessType ActualVType; + typedef typename internal::remove_all::type _ActualVType; + const ActualVType actualV = VBlasTraits::extract(v.derived()); + + // If MatrixType is row major, then we use the routine for lower triangular in the upper triangular case and + // vice versa, and take the complex conjugate of all coefficients and vector entries. + + enum { IsRowMajor = (internal::traits::Flags&RowMajorBit) ? 1 : 0 }; + Scalar actualAlpha = alpha * UBlasTraits::extractScalarFactor(u.derived()) + * internal::conj(VBlasTraits::extractScalarFactor(v.derived())); + if (IsRowMajor) + actualAlpha = internal::conj(actualAlpha); + + internal::selfadjoint_rank2_update_selector::type>::type, + typename internal::remove_all::type>::type, + (IsRowMajor ? int(UpLo==Upper ? Lower : Upper) : UpLo)> + ::run(_expression().const_cast_derived().data(),_expression().outerStride(),actualU,actualV,actualAlpha); + + return *this; +} + +#endif // EIGEN_SELFADJOINTRANK2UPTADE_H diff --git a/extern/Eigen3/Eigen/src/Core/products/TriangularMatrixMatrix.h b/extern/Eigen3/Eigen/src/Core/products/TriangularMatrixMatrix.h new file mode 100644 index 00000000000..0c48d2efb75 --- /dev/null +++ b/extern/Eigen3/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -0,0 +1,403 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Gael Guennebaud +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. 
+// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see . + +#ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_H +#define EIGEN_TRIANGULAR_MATRIX_MATRIX_H + +namespace internal { + +// template +// struct gemm_pack_lhs_triangular +// { +// Matrix::IsComplex && Conjugate> cj; +// const_blas_data_mapper lhs(_lhs,lhsStride); +// int count = 0; +// const int peeled_mc = (rows/mr)*mr; +// for(int i=0; i +struct product_triangular_matrix_matrix; + +template +struct product_triangular_matrix_matrix +{ + static EIGEN_STRONG_INLINE void run( + Index rows, Index cols, Index depth, + const Scalar* lhs, Index lhsStride, + const Scalar* rhs, Index rhsStride, + Scalar* res, Index resStride, + Scalar alpha) + { + product_triangular_matrix_matrix + ::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha); + } +}; + +// implements col-major += alpha * op(triangular) * op(general) +template +struct product_triangular_matrix_matrix +{ + + typedef gebp_traits Traits; + enum { + SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr), + IsLower = (Mode&Lower) == Lower, + SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1 + }; + + static EIGEN_DONT_INLINE void run( + Index _rows, Index _cols, Index _depth, + const Scalar* _lhs, Index lhsStride, + const Scalar* _rhs, Index rhsStride, + Scalar* res, Index resStride, + Scalar alpha) + { + // strip zeros + Index diagSize = (std::min)(_rows,_depth); + Index rows = IsLower ? _rows : diagSize; + Index depth = IsLower ? diagSize : _depth; + Index cols = _cols; + + const_blas_data_mapper lhs(_lhs,lhsStride); + const_blas_data_mapper rhs(_rhs,rhsStride); + + Index kc = depth; // cache block size along the K direction + Index mc = rows; // cache block size along the M direction + Index nc = cols; // cache block size along the N direction + computeProductBlockingSizes(kc, mc, nc); + std::size_t sizeW = kc*Traits::WorkSpaceFactor; + std::size_t sizeB = sizeW + kc*cols; + ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); + ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); + Scalar* blockB = allocatedBlockB + sizeW; + + Matrix triangularBuffer; + triangularBuffer.setZero(); + if((Mode&ZeroDiag)==ZeroDiag) + triangularBuffer.diagonal().setZero(); + else + triangularBuffer.diagonal().setOnes(); + + gebp_kernel gebp_kernel; + gemm_pack_lhs pack_lhs; + gemm_pack_rhs pack_rhs; + + for(Index k2=IsLower ? depth : 0; + IsLower ? k2>0 : k2rows)) + { + actual_kc = rows-k2; + k2 = k2+actual_kc-kc; + } + + pack_rhs(blockB, &rhs(actual_k2,0), rhsStride, actual_kc, cols); + + // the selected lhs's panel has to be split in three different parts: + // 1 - the part which is zero => skip it + // 2 - the diagonal block => special kernel + // 3 - the dense panel below (lower case) or above (upper case) the diagonal block => GEPP + + // the block diagonal, if any: + if(IsLower || actual_k2(actual_kc-k1, SmallPanelWidth); + Index lengthTarget = IsLower ? 
actual_kc-k1-actualPanelWidth : k1; + Index startBlock = actual_k2+k1; + Index blockBOffset = k1; + + // => GEBP with the micro triangular block + // The trick is to pack this micro block while filling the opposite triangular part with zeros. + // To this end we do an extra triangular copy to a small temporary buffer + for (Index k=0;k0) + { + Index startTarget = IsLower ? actual_k2+k1+actualPanelWidth : actual_k2; + + pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget); + + gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, alpha, + actualPanelWidth, actual_kc, 0, blockBOffset); + } + } + } + // the part below (lower case) or above (upper case) the diagonal => GEPP + { + Index start = IsLower ? k2 : 0; + Index end = IsLower ? rows : (std::min)(actual_k2,rows); + for(Index i2=start; i2() + (blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc); + + gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); + } + } + } + } +}; + +// implements col-major += alpha * op(general) * op(triangular) +template +struct product_triangular_matrix_matrix +{ + typedef gebp_traits Traits; + enum { + SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr), + IsLower = (Mode&Lower) == Lower, + SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1 + }; + + static EIGEN_DONT_INLINE void run( + Index _rows, Index _cols, Index _depth, + const Scalar* _lhs, Index lhsStride, + const Scalar* _rhs, Index rhsStride, + Scalar* res, Index resStride, + Scalar alpha) + { + // strip zeros + Index diagSize = (std::min)(_cols,_depth); + Index rows = _rows; + Index depth = IsLower ? _depth : diagSize; + Index cols = IsLower ? diagSize : _cols; + + const_blas_data_mapper lhs(_lhs,lhsStride); + const_blas_data_mapper rhs(_rhs,rhsStride); + + Index kc = depth; // cache block size along the K direction + Index mc = rows; // cache block size along the M direction + Index nc = cols; // cache block size along the N direction + computeProductBlockingSizes(kc, mc, nc); + + std::size_t sizeW = kc*Traits::WorkSpaceFactor; + std::size_t sizeB = sizeW + kc*cols; + ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); + ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); + Scalar* blockB = allocatedBlockB + sizeW; + + Matrix triangularBuffer; + triangularBuffer.setZero(); + if((Mode&ZeroDiag)==ZeroDiag) + triangularBuffer.diagonal().setZero(); + else + triangularBuffer.diagonal().setOnes(); + + gebp_kernel gebp_kernel; + gemm_pack_lhs pack_lhs; + gemm_pack_rhs pack_rhs; + gemm_pack_rhs pack_rhs_panel; + + for(Index k2=IsLower ? 0 : depth; + IsLower ? k20; + IsLower ? k2+=kc : k2-=kc) + { + Index actual_kc = (std::min)(IsLower ? depth-k2 : k2, kc); + Index actual_k2 = IsLower ? k2 : k2-actual_kc; + + // align blocks with the end of the triangular part for trapezoidal rhs + if(IsLower && (k2cols)) + { + actual_kc = cols-k2; + k2 = actual_k2 + actual_kc - kc; + } + + // remaining size + Index rs = IsLower ? (std::min)(cols,actual_k2) : cols - k2; + // size of the triangular part + Index ts = (IsLower && actual_k2>=cols) ? 0 : actual_kc; + + Scalar* geb = blockB+ts*ts; + + pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, actual_kc, rs); + + // pack the triangular part of the rhs padding the unrolled blocks with zeros + if(ts>0) + { + for (Index j2=0; j2(actual_kc-j2, SmallPanelWidth); + Index actual_j2 = actual_k2 + j2; + Index panelOffset = IsLower ? 
j2+actualPanelWidth : 0; + Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2; + // general part + pack_rhs_panel(blockB+j2*actual_kc, + &rhs(actual_k2+panelOffset, actual_j2), rhsStride, + panelLength, actualPanelWidth, + actual_kc, panelOffset); + + // append the triangular part via a temporary buffer + for (Index j=0;j0) + { + for (Index j2=0; j2(actual_kc-j2, SmallPanelWidth); + Index panelLength = IsLower ? actual_kc-j2 : j2+actualPanelWidth; + Index blockOffset = IsLower ? j2 : 0; + + gebp_kernel(res+i2+(actual_k2+j2)*resStride, resStride, + blockA, blockB+j2*actual_kc, + actual_mc, panelLength, actualPanelWidth, + alpha, + actual_kc, actual_kc, // strides + blockOffset, blockOffset,// offsets + allocatedBlockB); // workspace + } + } + gebp_kernel(res+i2+(IsLower ? 0 : k2)*resStride, resStride, + blockA, geb, actual_mc, actual_kc, rs, + alpha, + -1, -1, 0, 0, allocatedBlockB); + } + } + } +}; + +/*************************************************************************** +* Wrapper to product_triangular_matrix_matrix +***************************************************************************/ + +template +struct traits > + : traits, Lhs, Rhs> > +{}; + +} // end namespace internal + +template +struct TriangularProduct + : public ProductBase, Lhs, Rhs > +{ + EIGEN_PRODUCT_PUBLIC_INTERFACE(TriangularProduct) + + TriangularProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {} + + template void scaleAndAddTo(Dest& dst, Scalar alpha) const + { + const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs); + const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs); + + Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) + * RhsBlasTraits::extractScalarFactor(m_rhs); + + internal::product_triangular_matrix_matrix::Flags&RowMajorBit) ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate, + (internal::traits<_ActualRhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate, + (internal::traits::Flags&RowMajorBit) ? RowMajor : ColMajor> + ::run( + lhs.rows(), rhs.cols(), lhs.cols(),// LhsIsTriangular ? rhs.cols() : lhs.rows(), // sizes + &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info + &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info + &dst.coeffRef(0,0), dst.outerStride(), // result info + actualAlpha // alpha + ); + } +}; + + +#endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_H diff --git a/extern/Eigen3/Eigen/src/Core/products/TriangularMatrixVector.h b/extern/Eigen3/Eigen/src/Core/products/TriangularMatrixVector.h new file mode 100644 index 00000000000..71b4a52ab80 --- /dev/null +++ b/extern/Eigen3/Eigen/src/Core/products/TriangularMatrixVector.h @@ -0,0 +1,325 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Gael Guennebaud +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see . + +#ifndef EIGEN_TRIANGULARMATRIXVECTOR_H +#define EIGEN_TRIANGULARMATRIXVECTOR_H + +namespace internal { + +template +struct product_triangular_matrix_vector; + +template +struct product_triangular_matrix_vector +{ + typedef typename scalar_product_traits::ReturnType ResScalar; + enum { + IsLower = ((Mode&Lower)==Lower), + HasUnitDiag = (Mode & UnitDiag)==UnitDiag + }; + static EIGEN_DONT_INLINE void run(Index rows, Index cols, const LhsScalar* _lhs, Index lhsStride, + const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, ResScalar alpha) + { + static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH; + + typedef Map, 0, OuterStride<> > LhsMap; + const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride)); + typename conj_expr_if::type cjLhs(lhs); + + typedef Map, 0, InnerStride<> > RhsMap; + const RhsMap rhs(_rhs,cols,InnerStride<>(rhsIncr)); + typename conj_expr_if::type cjRhs(rhs); + + typedef Map > ResMap; + ResMap res(_res,rows); + + for (Index pi=0; pi0) + res.segment(s,r) += (alpha * cjRhs.coeff(i)) * cjLhs.col(i).segment(s,r); + if (HasUnitDiag) + res.coeffRef(i) += alpha * cjRhs.coeff(i); + } + Index r = IsLower ? cols - pi - actualPanelWidth : pi; + if (r>0) + { + Index s = IsLower ? pi+actualPanelWidth : 0; + general_matrix_vector_product::run( + r, actualPanelWidth, + &lhs.coeffRef(s,pi), lhsStride, + &rhs.coeffRef(pi), rhsIncr, + &res.coeffRef(s), resIncr, alpha); + } + } + } +}; + +template +struct product_triangular_matrix_vector +{ + typedef typename scalar_product_traits::ReturnType ResScalar; + enum { + IsLower = ((Mode&Lower)==Lower), + HasUnitDiag = (Mode & UnitDiag)==UnitDiag + }; + static void run(Index rows, Index cols, const LhsScalar* _lhs, Index lhsStride, + const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, ResScalar alpha) + { + static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH; + + typedef Map, 0, OuterStride<> > LhsMap; + const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride)); + typename conj_expr_if::type cjLhs(lhs); + + typedef Map > RhsMap; + const RhsMap rhs(_rhs,cols); + typename conj_expr_if::type cjRhs(rhs); + + typedef Map, 0, InnerStride<> > ResMap; + ResMap res(_res,rows,InnerStride<>(resIncr)); + + for (Index pi=0; pi0) + res.coeffRef(i) += alpha * (cjLhs.row(i).segment(s,r).cwiseProduct(cjRhs.segment(s,r).transpose())).sum(); + if (HasUnitDiag) + res.coeffRef(i) += alpha * cjRhs.coeff(i); + } + Index r = IsLower ? pi : cols - pi - actualPanelWidth; + if (r>0) + { + Index s = IsLower ? 
0 : pi + actualPanelWidth; + general_matrix_vector_product::run( + actualPanelWidth, r, + &lhs.coeffRef(pi,s), lhsStride, + &rhs.coeffRef(s), rhsIncr, + &res.coeffRef(pi), resIncr, alpha); + } + } + } +}; + +/*************************************************************************** +* Wrapper to product_triangular_vector +***************************************************************************/ + +template +struct traits > + : traits, Lhs, Rhs> > +{}; + +template +struct traits > + : traits, Lhs, Rhs> > +{}; + + +template +struct trmv_selector; + +} // end namespace internal + +template +struct TriangularProduct + : public ProductBase, Lhs, Rhs > +{ + EIGEN_PRODUCT_PUBLIC_INTERFACE(TriangularProduct) + + TriangularProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {} + + template void scaleAndAddTo(Dest& dst, Scalar alpha) const + { + eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols()); + + internal::trmv_selector<(int(internal::traits::Flags)&RowMajorBit) ? RowMajor : ColMajor>::run(*this, dst, alpha); + } +}; + +template +struct TriangularProduct + : public ProductBase, Lhs, Rhs > +{ + EIGEN_PRODUCT_PUBLIC_INTERFACE(TriangularProduct) + + TriangularProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {} + + template void scaleAndAddTo(Dest& dst, Scalar alpha) const + { + eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols()); + + typedef TriangularProduct<(Mode & UnitDiag) | ((Mode & Lower) ? Upper : Lower),true,Transpose,false,Transpose,true> TriangularProductTranspose; + Transpose dstT(dst); + internal::trmv_selector<(int(internal::traits::Flags)&RowMajorBit) ? ColMajor : RowMajor>::run( + TriangularProductTranspose(m_rhs.transpose(),m_lhs.transpose()), dstT, alpha); + } +}; + +namespace internal { + +// TODO: find a way to factorize this piece of code with gemv_selector since the logic is exactly the same. + +template<> struct trmv_selector +{ + template + static void run(const TriangularProduct& prod, Dest& dest, typename TriangularProduct::Scalar alpha) + { + typedef TriangularProduct ProductType; + typedef typename ProductType::Index Index; + typedef typename ProductType::LhsScalar LhsScalar; + typedef typename ProductType::RhsScalar RhsScalar; + typedef typename ProductType::Scalar ResScalar; + typedef typename ProductType::RealScalar RealScalar; + typedef typename ProductType::ActualLhsType ActualLhsType; + typedef typename ProductType::ActualRhsType ActualRhsType; + typedef typename ProductType::LhsBlasTraits LhsBlasTraits; + typedef typename ProductType::RhsBlasTraits RhsBlasTraits; + typedef Map, Aligned> MappedDest; + + const ActualLhsType actualLhs = LhsBlasTraits::extract(prod.lhs()); + const ActualRhsType actualRhs = RhsBlasTraits::extract(prod.rhs()); + + ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs()) + * RhsBlasTraits::extractScalarFactor(prod.rhs()); + + enum { + // FIXME find a way to allow an inner stride on the result if packet_traits::size==1 + // on, the other hand it is good for the cache to pack the vector anyways... 
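+      // a temporary destination is needed either when dest is not stored contiguously
+      // (inner stride != 1) or when alpha is complex while the product itself is real,
+      // because the kernel below can only be fed a RhsScalar scale factor.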
+ EvalToDestAtCompileTime = Dest::InnerStrideAtCompileTime==1, + ComplexByReal = (NumTraits::IsComplex) && (!NumTraits::IsComplex), + MightCannotUseDest = (Dest::InnerStrideAtCompileTime!=1) || ComplexByReal + }; + + gemv_static_vector_if static_dest; + + bool alphaIsCompatible = (!ComplexByReal) || (imag(actualAlpha)==RealScalar(0)); + bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible; + + RhsScalar compatibleAlpha = get_factor::run(actualAlpha); + + ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(), + evalToDest ? dest.data() : static_dest.data()); + + if(!evalToDest) + { + #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN + int size = dest.size(); + EIGEN_DENSE_STORAGE_CTOR_PLUGIN + #endif + if(!alphaIsCompatible) + { + MappedDest(actualDestPtr, dest.size()).setZero(); + compatibleAlpha = RhsScalar(1); + } + else + MappedDest(actualDestPtr, dest.size()) = dest; + } + + internal::product_triangular_matrix_vector + + ::run(actualLhs.rows(),actualLhs.cols(), + actualLhs.data(),actualLhs.outerStride(), + actualRhs.data(),actualRhs.innerStride(), + actualDestPtr,1,compatibleAlpha); + + if (!evalToDest) + { + if(!alphaIsCompatible) + dest += actualAlpha * MappedDest(actualDestPtr, dest.size()); + else + dest = MappedDest(actualDestPtr, dest.size()); + } + } +}; + +template<> struct trmv_selector +{ + template + static void run(const TriangularProduct& prod, Dest& dest, typename TriangularProduct::Scalar alpha) + { + typedef TriangularProduct ProductType; + typedef typename ProductType::LhsScalar LhsScalar; + typedef typename ProductType::RhsScalar RhsScalar; + typedef typename ProductType::Scalar ResScalar; + typedef typename ProductType::Index Index; + typedef typename ProductType::ActualLhsType ActualLhsType; + typedef typename ProductType::ActualRhsType ActualRhsType; + typedef typename ProductType::_ActualRhsType _ActualRhsType; + typedef typename ProductType::LhsBlasTraits LhsBlasTraits; + typedef typename ProductType::RhsBlasTraits RhsBlasTraits; + + typename add_const::type actualLhs = LhsBlasTraits::extract(prod.lhs()); + typename add_const::type actualRhs = RhsBlasTraits::extract(prod.rhs()); + + ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs()) + * RhsBlasTraits::extractScalarFactor(prod.rhs()); + + enum { + DirectlyUseRhs = _ActualRhsType::InnerStrideAtCompileTime==1 + }; + + gemv_static_vector_if static_rhs; + + ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhsPtr,actualRhs.size(), + DirectlyUseRhs ? const_cast(actualRhs.data()) : static_rhs.data()); + + if(!DirectlyUseRhs) + { + #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN + int size = actualRhs.size(); + EIGEN_DENSE_STORAGE_CTOR_PLUGIN + #endif + Map(actualRhsPtr, actualRhs.size()) = actualRhs; + } + + internal::product_triangular_matrix_vector + + ::run(actualLhs.rows(),actualLhs.cols(), + actualLhs.data(),actualLhs.outerStride(), + actualRhsPtr,1, + dest.data(),dest.innerStride(), + actualAlpha); + } +}; + +} // end namespace internal + +#endif // EIGEN_TRIANGULARMATRIXVECTOR_H diff --git a/extern/Eigen3/Eigen/src/Core/products/TriangularSolverMatrix.h b/extern/Eigen3/Eigen/src/Core/products/TriangularSolverMatrix.h new file mode 100644 index 00000000000..4dced6b0eb9 --- /dev/null +++ b/extern/Eigen3/Eigen/src/Core/products/TriangularSolverMatrix.h @@ -0,0 +1,319 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2009 Gael Guennebaud +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see . + +#ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_H +#define EIGEN_TRIANGULAR_SOLVER_MATRIX_H + +namespace internal { + +// if the rhs is row major, let's transpose the product +template +struct triangular_solve_matrix +{ + static EIGEN_DONT_INLINE void run( + Index size, Index cols, + const Scalar* tri, Index triStride, + Scalar* _other, Index otherStride) + { + triangular_solve_matrix< + Scalar, Index, Side==OnTheLeft?OnTheRight:OnTheLeft, + (Mode&UnitDiag) | ((Mode&Upper) ? Lower : Upper), + NumTraits::IsComplex && Conjugate, + TriStorageOrder==RowMajor ? ColMajor : RowMajor, ColMajor> + ::run(size, cols, tri, triStride, _other, otherStride); + } +}; + +/* Optimized triangular solver with multiple right hand side and the triangular matrix on the left + */ +template +struct triangular_solve_matrix +{ + static EIGEN_DONT_INLINE void run( + Index size, Index otherSize, + const Scalar* _tri, Index triStride, + Scalar* _other, Index otherStride) + { + Index cols = otherSize; + const_blas_data_mapper tri(_tri,triStride); + blas_data_mapper other(_other,otherStride); + + typedef gebp_traits Traits; + enum { + SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr), + IsLower = (Mode&Lower) == Lower + }; + + Index kc = size; // cache block size along the K direction + Index mc = size; // cache block size along the M direction + Index nc = cols; // cache block size along the N direction + computeProductBlockingSizes(kc, mc, nc); + + std::size_t sizeW = kc*Traits::WorkSpaceFactor; + std::size_t sizeB = sizeW + kc*cols; + ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); + ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); + Scalar* blockB = allocatedBlockB + sizeW; + + conj_if conj; + gebp_kernel gebp_kernel; + gemm_pack_lhs pack_lhs; + gemm_pack_rhs pack_rhs; + + for(Index k2=IsLower ? 0 : size; + IsLower ? k20; + IsLower ? k2+=kc : k2-=kc) + { + const Index actual_kc = (std::min)(IsLower ? size-k2 : k2, kc); + + // We have selected and packed a big horizontal panel R1 of rhs. Let B be the packed copy of this panel, + // and R2 the remaining part of rhs. The corresponding vertical panel of lhs is split into + // A11 (the triangular part) and A21 the remaining rectangular part. 
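+ // (In the steps below, B denotes the packed copy of R1, L1 the triangular block A11, and L2 the rectangular block A21.)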
+ // Then the high level algorithm is: + // - B = R1 => general block copy (done during the next step) + // - R1 = L1^-1 B => tricky part + // - update B from the new R1 => actually this has to be performed continuously during the above step + // - R2 = L2 * B => GEPP + + // The tricky part: compute R1 = L1^-1 B while updating B from R1 + // The idea is to split L1 into multiple small vertical panels. + // Each panel can be split into a small triangular part A1 which is processed without optimization, + // and the remaining small part A2 which is processed using gebp with appropriate block strides + { + // for each small vertical panels of lhs + for (Index k1=0; k1(actual_kc-k1, SmallPanelWidth); + // tr solve + for (Index k=0; k0) + { + Index startTarget = IsLower ? k2+k1+actualPanelWidth : k2-actual_kc; + + pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget); + + gebp_kernel(_other+startTarget, otherStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, Scalar(-1), + actualPanelWidth, actual_kc, 0, blockBOffset); + } + } + } + + // R2 = A2 * B => GEPP + { + Index start = IsLower ? k2+kc : 0; + Index end = IsLower ? size : k2-kc; + for(Index i2=start; i20) + { + pack_lhs(blockA, &tri(i2, IsLower ? k2 : k2-kc), triStride, actual_kc, actual_mc); + + gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1)); + } + } + } + } + } +}; + +/* Optimized triangular solver with multiple left hand sides and the trinagular matrix on the right + */ +template +struct triangular_solve_matrix +{ + static EIGEN_DONT_INLINE void run( + Index size, Index otherSize, + const Scalar* _tri, Index triStride, + Scalar* _other, Index otherStride) + { + Index rows = otherSize; + const_blas_data_mapper rhs(_tri,triStride); + blas_data_mapper lhs(_other,otherStride); + + typedef gebp_traits Traits; + enum { + RhsStorageOrder = TriStorageOrder, + SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr), + IsLower = (Mode&Lower) == Lower + }; + +// Index kc = std::min(Traits::Max_kc/4,size); // cache block size along the K direction +// Index mc = std::min(Traits::Max_mc,size); // cache block size along the M direction + // check that !!!! + Index kc = size; // cache block size along the K direction + Index mc = size; // cache block size along the M direction + Index nc = rows; // cache block size along the N direction + computeProductBlockingSizes(kc, mc, nc); + + std::size_t sizeW = kc*Traits::WorkSpaceFactor; + std::size_t sizeB = sizeW + kc*size; + ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); + ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); + Scalar* blockB = allocatedBlockB + sizeW; + + conj_if conj; + gebp_kernel gebp_kernel; + gemm_pack_rhs pack_rhs; + gemm_pack_rhs pack_rhs_panel; + gemm_pack_lhs pack_lhs_panel; + + for(Index k2=IsLower ? size : 0; + IsLower ? k2>0 : k20) pack_rhs(geb, &rhs(actual_k2,startPanel), triStride, actual_kc, rs); + + // triangular packing (we only pack the panels off the diagonal, + // neglecting the blocks overlapping the diagonal + { + for (Index j2=0; j2(actual_kc-j2, SmallPanelWidth); + Index actual_j2 = actual_k2 + j2; + Index panelOffset = IsLower ? j2+actualPanelWidth : 0; + Index panelLength = IsLower ? 
actual_kc-j2-actualPanelWidth : j2; + + if (panelLength>0) + pack_rhs_panel(blockB+j2*actual_kc, + &rhs(actual_k2+panelOffset, actual_j2), triStride, + panelLength, actualPanelWidth, + actual_kc, panelOffset); + } + } + + for(Index i2=0; i2 vertical panels of rhs) + for (Index j2 = IsLower + ? (actual_kc - ((actual_kc%SmallPanelWidth) ? Index(actual_kc%SmallPanelWidth) + : Index(SmallPanelWidth))) + : 0; + IsLower ? j2>=0 : j2(actual_kc-j2, SmallPanelWidth); + Index absolute_j2 = actual_k2 + j2; + Index panelOffset = IsLower ? j2+actualPanelWidth : 0; + Index panelLength = IsLower ? actual_kc - j2 - actualPanelWidth : j2; + + // GEBP + if(panelLength>0) + { + gebp_kernel(&lhs(i2,absolute_j2), otherStride, + blockA, blockB+j2*actual_kc, + actual_mc, panelLength, actualPanelWidth, + Scalar(-1), + actual_kc, actual_kc, // strides + panelOffset, panelOffset, // offsets + allocatedBlockB); // workspace + } + + // unblocked triangular solve + for (Index k=0; k0) + gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb, + actual_mc, actual_kc, rs, Scalar(-1), + -1, -1, 0, 0, allocatedBlockB); + } + } + } +}; + +} // end namespace internal + +#endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_H diff --git a/extern/Eigen3/Eigen/src/Core/products/TriangularSolverVector.h b/extern/Eigen3/Eigen/src/Core/products/TriangularSolverVector.h new file mode 100644 index 00000000000..639d4a5b476 --- /dev/null +++ b/extern/Eigen3/Eigen/src/Core/products/TriangularSolverVector.h @@ -0,0 +1,150 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2010 Gael Guennebaud +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see . + +#ifndef EIGEN_TRIANGULAR_SOLVER_VECTOR_H +#define EIGEN_TRIANGULAR_SOLVER_VECTOR_H + +namespace internal { + +template +struct triangular_solve_vector +{ + static void run(Index size, const LhsScalar* _lhs, Index lhsStride, RhsScalar* rhs) + { + triangular_solve_vector::run(size, _lhs, lhsStride, rhs); + } +}; + +// forward and backward substitution, row-major, rhs is a vector +template +struct triangular_solve_vector +{ + enum { + IsLower = ((Mode&Lower)==Lower) + }; + static void run(Index size, const LhsScalar* _lhs, Index lhsStride, RhsScalar* rhs) + { + typedef Map, 0, OuterStride<> > LhsMap; + const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride)); + typename internal::conditional< + Conjugate, + const CwiseUnaryOp,LhsMap>, + const LhsMap&> + ::type cjLhs(lhs); + static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH; + for(Index pi=IsLower ? 0 : size; + IsLower ? pi0; + IsLower ? 
pi+=PanelWidth : pi-=PanelWidth) + { + Index actualPanelWidth = (std::min)(IsLower ? size - pi : pi, PanelWidth); + + Index r = IsLower ? pi : size - pi; // remaining size + if (r > 0) + { + // let's directly call the low level product function because: + // 1 - it is faster to compile + // 2 - it is slighlty faster at runtime + Index startRow = IsLower ? pi : pi-actualPanelWidth; + Index startCol = IsLower ? 0 : pi; + + general_matrix_vector_product::run( + actualPanelWidth, r, + &lhs.coeffRef(startRow,startCol), lhsStride, + rhs + startCol, 1, + rhs + startRow, 1, + RhsScalar(-1)); + } + + for(Index k=0; k0) + rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map >(rhs+s,k))).sum(); + + if(!(Mode & UnitDiag)) + rhs[i] /= cjLhs(i,i); + } + } + } +}; + +// forward and backward substitution, column-major, rhs is a vector +template +struct triangular_solve_vector +{ + enum { + IsLower = ((Mode&Lower)==Lower) + }; + static void run(Index size, const LhsScalar* _lhs, Index lhsStride, RhsScalar* rhs) + { + typedef Map, 0, OuterStride<> > LhsMap; + const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride)); + typename internal::conditional,LhsMap>, + const LhsMap& + >::type cjLhs(lhs); + static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH; + + for(Index pi=IsLower ? 0 : size; + IsLower ? pi0; + IsLower ? pi+=PanelWidth : pi-=PanelWidth) + { + Index actualPanelWidth = (std::min)(IsLower ? size - pi : pi, PanelWidth); + Index startBlock = IsLower ? pi : pi-actualPanelWidth; + Index endBlock = IsLower ? pi + actualPanelWidth : 0; + + for(Index k=0; k0) + Map >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r); + } + Index r = IsLower ? size - endBlock : startBlock; // remaining size + if (r > 0) + { + // let's directly call the low level product function because: + // 1 - it is faster to compile + // 2 - it is slighlty faster at runtime + general_matrix_vector_product::run( + r, actualPanelWidth, + &lhs.coeffRef(endBlock,startBlock), lhsStride, + rhs+startBlock, 1, + rhs+endBlock, 1, RhsScalar(-1)); + } + } + } +}; + +} // end namespace internal + +#endif // EIGEN_TRIANGULAR_SOLVER_VECTOR_H -- cgit v1.2.3
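For context, the kernels introduced by this patch are the low-level back ends of Eigen's triangular-view API; the exact kernel chosen depends on sizes, storage order and scalar types. A minimal usage sketch, illustrative only and not part of the patch itself:

    #include <Eigen/Dense>
    using namespace Eigen;

    int main()
    {
      MatrixXd L = MatrixXd::Random(100,100); // only its lower triangle is referenced below
      VectorXd b = VectorXd::Random(100);
      MatrixXd B = MatrixXd::Random(100,4);

      // triangular matrix * vector (TriangularMatrixVector.h path)
      VectorXd y = L.triangularView<Lower>() * b;

      // in-place triangular solve with a vector right-hand side (TriangularSolverVector.h path)
      L.triangularView<Lower>().solveInPlace(b);

      // triangular solve with a matrix right-hand side (TriangularSolverMatrix.h path)
      MatrixXd X = L.triangularView<Lower>().solve(B);

      return 0;
    }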