diff options
author | Campbell Barton <ideasman42@gmail.com> | 2011-10-23 21:52:20 +0400 |
---|---|---|
committer | Campbell Barton <ideasman42@gmail.com> | 2011-10-23 21:52:20 +0400 |
commit | 4a04f7206914a49f5f95adc5eb786237f1a9f547 (patch) | |
tree | 78aed2fa481f972fac0965f814bebebe9d71ae65 /extern/Eigen3/Eigen/src/Core/products | |
parent | f1cea89d99f0c80bdccd2ba1359142b5ff14cdb9 (diff) |
remove $Id: tags after discussion on the mailign list: http://markmail.org/message/fp7ozcywxum3ar7n
Diffstat (limited to 'extern/Eigen3/Eigen/src/Core/products')
14 files changed, 5256 insertions, 0 deletions
diff --git a/extern/Eigen3/Eigen/src/Core/products/CoeffBasedProduct.h b/extern/Eigen3/Eigen/src/Core/products/CoeffBasedProduct.h new file mode 100644 index 00000000000..dc20f7e1e29 --- /dev/null +++ b/extern/Eigen3/Eigen/src/Core/products/CoeffBasedProduct.h @@ -0,0 +1,452 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com> +// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see <http://www.gnu.org/licenses/>. + +#ifndef EIGEN_COEFFBASED_PRODUCT_H +#define EIGEN_COEFFBASED_PRODUCT_H + +namespace internal { + +/********************************************************************************* +* Coefficient based product implementation. +* It is designed for the following use cases: +* - small fixed sizes +* - lazy products +*********************************************************************************/ + +/* Since the all the dimensions of the product are small, here we can rely + * on the generic Assign mechanism to evaluate the product per coeff (or packet). + * + * Note that here the inner-loops should always be unrolled. + */ + +template<int Traversal, int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar> +struct product_coeff_impl; + +template<int StorageOrder, int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode> +struct product_packet_impl; + +template<typename LhsNested, typename RhsNested, int NestingFlags> +struct traits<CoeffBasedProduct<LhsNested,RhsNested,NestingFlags> > +{ + typedef MatrixXpr XprKind; + typedef typename remove_all<LhsNested>::type _LhsNested; + typedef typename remove_all<RhsNested>::type _RhsNested; + typedef typename scalar_product_traits<typename _LhsNested::Scalar, typename _RhsNested::Scalar>::ReturnType Scalar; + typedef typename promote_storage_type<typename traits<_LhsNested>::StorageKind, + typename traits<_RhsNested>::StorageKind>::ret StorageKind; + typedef typename promote_index_type<typename traits<_LhsNested>::Index, + typename traits<_RhsNested>::Index>::type Index; + + enum { + LhsCoeffReadCost = _LhsNested::CoeffReadCost, + RhsCoeffReadCost = _RhsNested::CoeffReadCost, + LhsFlags = _LhsNested::Flags, + RhsFlags = _RhsNested::Flags, + + RowsAtCompileTime = _LhsNested::RowsAtCompileTime, + ColsAtCompileTime = _RhsNested::ColsAtCompileTime, + InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(_LhsNested::ColsAtCompileTime, _RhsNested::RowsAtCompileTime), + + MaxRowsAtCompileTime = _LhsNested::MaxRowsAtCompileTime, + MaxColsAtCompileTime = _RhsNested::MaxColsAtCompileTime, + + LhsRowMajor = LhsFlags & RowMajorBit, + RhsRowMajor = RhsFlags & RowMajorBit, + + SameType = is_same<typename _LhsNested::Scalar,typename _RhsNested::Scalar>::value, + + CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit) + && (ColsAtCompileTime == Dynamic + || ( (ColsAtCompileTime % packet_traits<Scalar>::size) == 0 + && (RhsFlags&AlignedBit) + ) + ), + + CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit) + && (RowsAtCompileTime == Dynamic + || ( (RowsAtCompileTime % packet_traits<Scalar>::size) == 0 + && (LhsFlags&AlignedBit) + ) + ), + + EvalToRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1 + : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0 + : (RhsRowMajor && !CanVectorizeLhs), + + Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & ~RowMajorBit) + | (EvalToRowMajor ? RowMajorBit : 0) + | NestingFlags + | (LhsFlags & RhsFlags & AlignedBit) + // TODO enable vectorization for mixed types + | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0), + + CoeffReadCost = InnerSize == Dynamic ? Dynamic + : InnerSize * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost) + + (InnerSize - 1) * NumTraits<Scalar>::AddCost, + + /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside + * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner + * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect + * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI. + */ + CanVectorizeInner = SameType + && LhsRowMajor + && (!RhsRowMajor) + && (LhsFlags & RhsFlags & ActualPacketAccessBit) + && (LhsFlags & RhsFlags & AlignedBit) + && (InnerSize % packet_traits<Scalar>::size == 0) + }; +}; + +} // end namespace internal + +template<typename LhsNested, typename RhsNested, int NestingFlags> +class CoeffBasedProduct + : internal::no_assignment_operator, + public MatrixBase<CoeffBasedProduct<LhsNested, RhsNested, NestingFlags> > +{ + public: + + typedef MatrixBase<CoeffBasedProduct> Base; + EIGEN_DENSE_PUBLIC_INTERFACE(CoeffBasedProduct) + typedef typename Base::PlainObject PlainObject; + + private: + + typedef typename internal::traits<CoeffBasedProduct>::_LhsNested _LhsNested; + typedef typename internal::traits<CoeffBasedProduct>::_RhsNested _RhsNested; + + enum { + PacketSize = internal::packet_traits<Scalar>::size, + InnerSize = internal::traits<CoeffBasedProduct>::InnerSize, + Unroll = CoeffReadCost != Dynamic && CoeffReadCost <= EIGEN_UNROLLING_LIMIT, + CanVectorizeInner = internal::traits<CoeffBasedProduct>::CanVectorizeInner + }; + + typedef internal::product_coeff_impl<CanVectorizeInner ? InnerVectorizedTraversal : DefaultTraversal, + Unroll ? InnerSize-1 : Dynamic, + _LhsNested, _RhsNested, Scalar> ScalarCoeffImpl; + + typedef CoeffBasedProduct<LhsNested,RhsNested,NestByRefBit> LazyCoeffBasedProductType; + + public: + + inline CoeffBasedProduct(const CoeffBasedProduct& other) + : Base(), m_lhs(other.m_lhs), m_rhs(other.m_rhs) + {} + + template<typename Lhs, typename Rhs> + inline CoeffBasedProduct(const Lhs& lhs, const Rhs& rhs) + : m_lhs(lhs), m_rhs(rhs) + { + // we don't allow taking products of matrices of different real types, as that wouldn't be vectorizable. + // We still allow to mix T and complex<T>. + EIGEN_STATIC_ASSERT((internal::is_same<typename Lhs::RealScalar, typename Rhs::RealScalar>::value), + YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) + eigen_assert(lhs.cols() == rhs.rows() + && "invalid matrix product" + && "if you wanted a coeff-wise or a dot product use the respective explicit functions"); + } + + EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); } + EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); } + + EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const + { + Scalar res; + ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res); + return res; + } + + /* Allow index-based non-packet access. It is impossible though to allow index-based packed access, + * which is why we don't set the LinearAccessBit. + */ + EIGEN_STRONG_INLINE const Scalar coeff(Index index) const + { + Scalar res; + const Index row = RowsAtCompileTime == 1 ? 0 : index; + const Index col = RowsAtCompileTime == 1 ? index : 0; + ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res); + return res; + } + + template<int LoadMode> + EIGEN_STRONG_INLINE const PacketScalar packet(Index row, Index col) const + { + PacketScalar res; + internal::product_packet_impl<Flags&RowMajorBit ? RowMajor : ColMajor, + Unroll ? InnerSize-1 : Dynamic, + _LhsNested, _RhsNested, PacketScalar, LoadMode> + ::run(row, col, m_lhs, m_rhs, res); + return res; + } + + // Implicit conversion to the nested type (trigger the evaluation of the product) + EIGEN_STRONG_INLINE operator const PlainObject& () const + { + m_result.lazyAssign(*this); + return m_result; + } + + const _LhsNested& lhs() const { return m_lhs; } + const _RhsNested& rhs() const { return m_rhs; } + + const Diagonal<const LazyCoeffBasedProductType,0> diagonal() const + { return reinterpret_cast<const LazyCoeffBasedProductType&>(*this); } + + template<int DiagonalIndex> + const Diagonal<const LazyCoeffBasedProductType,DiagonalIndex> diagonal() const + { return reinterpret_cast<const LazyCoeffBasedProductType&>(*this); } + + const Diagonal<const LazyCoeffBasedProductType,Dynamic> diagonal(Index index) const + { return reinterpret_cast<const LazyCoeffBasedProductType&>(*this).diagonal(index); } + + protected: + const LhsNested m_lhs; + const RhsNested m_rhs; + + mutable PlainObject m_result; +}; + +namespace internal { + +// here we need to overload the nested rule for products +// such that the nested type is a const reference to a plain matrix +template<typename Lhs, typename Rhs, int N, typename PlainObject> +struct nested<CoeffBasedProduct<Lhs,Rhs,EvalBeforeNestingBit|EvalBeforeAssigningBit>, N, PlainObject> +{ + typedef PlainObject const& type; +}; + +/*************************************************************************** +* Normal product .coeff() implementation (with meta-unrolling) +***************************************************************************/ + +/************************************** +*** Scalar path - no vectorization *** +**************************************/ + +template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar> +struct product_coeff_impl<DefaultTraversal, UnrollingIndex, Lhs, Rhs, RetScalar> +{ + typedef typename Lhs::Index Index; + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res) + { + product_coeff_impl<DefaultTraversal, UnrollingIndex-1, Lhs, Rhs, RetScalar>::run(row, col, lhs, rhs, res); + res += lhs.coeff(row, UnrollingIndex) * rhs.coeff(UnrollingIndex, col); + } +}; + +template<typename Lhs, typename Rhs, typename RetScalar> +struct product_coeff_impl<DefaultTraversal, 0, Lhs, Rhs, RetScalar> +{ + typedef typename Lhs::Index Index; + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res) + { + res = lhs.coeff(row, 0) * rhs.coeff(0, col); + } +}; + +template<typename Lhs, typename Rhs, typename RetScalar> +struct product_coeff_impl<DefaultTraversal, Dynamic, Lhs, Rhs, RetScalar> +{ + typedef typename Lhs::Index Index; + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar& res) + { + eigen_assert(lhs.cols()>0 && "you are using a non initialized matrix"); + res = lhs.coeff(row, 0) * rhs.coeff(0, col); + for(Index i = 1; i < lhs.cols(); ++i) + res += lhs.coeff(row, i) * rhs.coeff(i, col); + } +}; + +/******************************************* +*** Scalar path with inner vectorization *** +*******************************************/ + +template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet> +struct product_coeff_vectorized_unroller +{ + typedef typename Lhs::Index Index; + enum { PacketSize = packet_traits<typename Lhs::Scalar>::size }; + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres) + { + product_coeff_vectorized_unroller<UnrollingIndex-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres); + pres = padd(pres, pmul( lhs.template packet<Aligned>(row, UnrollingIndex) , rhs.template packet<Aligned>(UnrollingIndex, col) )); + } +}; + +template<typename Lhs, typename Rhs, typename Packet> +struct product_coeff_vectorized_unroller<0, Lhs, Rhs, Packet> +{ + typedef typename Lhs::Index Index; + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres) + { + pres = pmul(lhs.template packet<Aligned>(row, 0) , rhs.template packet<Aligned>(0, col)); + } +}; + +template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar> +struct product_coeff_impl<InnerVectorizedTraversal, UnrollingIndex, Lhs, Rhs, RetScalar> +{ + typedef typename Lhs::PacketScalar Packet; + typedef typename Lhs::Index Index; + enum { PacketSize = packet_traits<typename Lhs::Scalar>::size }; + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res) + { + Packet pres; + product_coeff_vectorized_unroller<UnrollingIndex+1-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres); + product_coeff_impl<DefaultTraversal,UnrollingIndex,Lhs,Rhs,RetScalar>::run(row, col, lhs, rhs, res); + res = predux(pres); + } +}; + +template<typename Lhs, typename Rhs, int LhsRows = Lhs::RowsAtCompileTime, int RhsCols = Rhs::ColsAtCompileTime> +struct product_coeff_vectorized_dyn_selector +{ + typedef typename Lhs::Index Index; + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) + { + res = lhs.row(row).transpose().cwiseProduct(rhs.col(col)).sum(); + } +}; + +// NOTE the 3 following specializations are because taking .col(0) on a vector is a bit slower +// NOTE maybe they are now useless since we have a specialization for Block<Matrix> +template<typename Lhs, typename Rhs, int RhsCols> +struct product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,RhsCols> +{ + typedef typename Lhs::Index Index; + EIGEN_STRONG_INLINE static void run(Index /*row*/, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) + { + res = lhs.transpose().cwiseProduct(rhs.col(col)).sum(); + } +}; + +template<typename Lhs, typename Rhs, int LhsRows> +struct product_coeff_vectorized_dyn_selector<Lhs,Rhs,LhsRows,1> +{ + typedef typename Lhs::Index Index; + EIGEN_STRONG_INLINE static void run(Index row, Index /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) + { + res = lhs.row(row).transpose().cwiseProduct(rhs).sum(); + } +}; + +template<typename Lhs, typename Rhs> +struct product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,1> +{ + typedef typename Lhs::Index Index; + EIGEN_STRONG_INLINE static void run(Index /*row*/, Index /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) + { + res = lhs.transpose().cwiseProduct(rhs).sum(); + } +}; + +template<typename Lhs, typename Rhs, typename RetScalar> +struct product_coeff_impl<InnerVectorizedTraversal, Dynamic, Lhs, Rhs, RetScalar> +{ + typedef typename Lhs::Index Index; + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) + { + product_coeff_vectorized_dyn_selector<Lhs,Rhs>::run(row, col, lhs, rhs, res); + } +}; + +/******************* +*** Packet path *** +*******************/ + +template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode> +struct product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode> +{ + typedef typename Lhs::Index Index; + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res) + { + product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, res); + res = pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex)), rhs.template packet<LoadMode>(UnrollingIndex, col), res); + } +}; + +template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode> +struct product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode> +{ + typedef typename Lhs::Index Index; + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res) + { + product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, res); + res = pmadd(lhs.template packet<LoadMode>(row, UnrollingIndex), pset1<Packet>(rhs.coeff(UnrollingIndex, col)), res); + } +}; + +template<typename Lhs, typename Rhs, typename Packet, int LoadMode> +struct product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode> +{ + typedef typename Lhs::Index Index; + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res) + { + res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col)); + } +}; + +template<typename Lhs, typename Rhs, typename Packet, int LoadMode> +struct product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode> +{ + typedef typename Lhs::Index Index; + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res) + { + res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col))); + } +}; + +template<typename Lhs, typename Rhs, typename Packet, int LoadMode> +struct product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode> +{ + typedef typename Lhs::Index Index; + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet& res) + { + eigen_assert(lhs.cols()>0 && "you are using a non initialized matrix"); + res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col)); + for(Index i = 1; i < lhs.cols(); ++i) + res = pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res); + } +}; + +template<typename Lhs, typename Rhs, typename Packet, int LoadMode> +struct product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode> +{ + typedef typename Lhs::Index Index; + EIGEN_STRONG_INLINE static void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet& res) + { + eigen_assert(lhs.cols()>0 && "you are using a non initialized matrix"); + res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col))); + for(Index i = 1; i < lhs.cols(); ++i) + res = pmadd(lhs.template packet<LoadMode>(row, i), pset1<Packet>(rhs.coeff(i, col)), res); + } +}; + +} // end namespace internal + +#endif // EIGEN_COEFFBASED_PRODUCT_H diff --git a/extern/Eigen3/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/extern/Eigen3/Eigen/src/Core/products/GeneralBlockPanelKernel.h new file mode 100644 index 00000000000..6f3f2717007 --- /dev/null +++ b/extern/Eigen3/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -0,0 +1,1285 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see <http://www.gnu.org/licenses/>. + +#ifndef EIGEN_GENERAL_BLOCK_PANEL_H +#define EIGEN_GENERAL_BLOCK_PANEL_H + +namespace internal { + +template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false> +class gebp_traits; + +/** \internal */ +inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0) +{ + static std::ptrdiff_t m_l1CacheSize = 0; + static std::ptrdiff_t m_l2CacheSize = 0; + if(m_l1CacheSize==0) + { + m_l1CacheSize = queryL1CacheSize(); + m_l2CacheSize = queryTopLevelCacheSize(); + + if(m_l1CacheSize<=0) m_l1CacheSize = 8 * 1024; + if(m_l2CacheSize<=0) m_l2CacheSize = 1 * 1024 * 1024; + } + + if(action==SetAction) + { + // set the cpu cache size and cache all block sizes from a global cache size in byte + eigen_internal_assert(l1!=0 && l2!=0); + m_l1CacheSize = *l1; + m_l2CacheSize = *l2; + } + else if(action==GetAction) + { + eigen_internal_assert(l1!=0 && l2!=0); + *l1 = m_l1CacheSize; + *l2 = m_l2CacheSize; + } + else + { + eigen_internal_assert(false); + } +} + +/** \brief Computes the blocking parameters for a m x k times k x n matrix product + * + * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension. + * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension. + * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension. + * + * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar, + * this function computes the blocking size parameters along the respective dimensions + * for matrix products and related algorithms. The blocking sizes depends on various + * parameters: + * - the L1 and L2 cache sizes, + * - the register level blocking sizes defined by gebp_traits, + * - the number of scalars that fit into a packet (when vectorization is enabled). + * + * \sa setCpuCacheSizes */ +template<typename LhsScalar, typename RhsScalar, int KcFactor> +void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrdiff_t& n) +{ + EIGEN_UNUSED_VARIABLE(n); + // Explanations: + // Let's recall the product algorithms form kc x nc horizontal panels B' on the rhs and + // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed + // per kc x nr vertical small panels where nr is the blocking size along the n dimension + // at the register level. For vectorization purpose, these small vertical panels are unpacked, + // e.g., each coefficient is replicated to fit a packet. This small vertical panel has to + // stay in L1 cache. + std::ptrdiff_t l1, l2; + + typedef gebp_traits<LhsScalar,RhsScalar> Traits; + enum { + kdiv = KcFactor * 2 * Traits::nr + * Traits::RhsProgress * sizeof(RhsScalar), + mr = gebp_traits<LhsScalar,RhsScalar>::mr, + mr_mask = (0xffffffff/mr)*mr + }; + + manage_caching_sizes(GetAction, &l1, &l2); + k = std::min<std::ptrdiff_t>(k, l1/kdiv); + std::ptrdiff_t _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0; + if(_m<m) m = _m & mr_mask; +} + +template<typename LhsScalar, typename RhsScalar> +inline void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrdiff_t& n) +{ + computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n); +} + +#ifdef EIGEN_HAS_FUSE_CJMADD + #define MADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C); +#else + + // FIXME (a bit overkill maybe ?) + + template<typename CJ, typename A, typename B, typename C, typename T> struct gebp_madd_selector { + EIGEN_STRONG_INLINE EIGEN_ALWAYS_INLINE_ATTRIB static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/) + { + c = cj.pmadd(a,b,c); + } + }; + + template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> { + EIGEN_STRONG_INLINE EIGEN_ALWAYS_INLINE_ATTRIB static void run(const CJ& cj, T& a, T& b, T& c, T& t) + { + t = b; t = cj.pmul(a,t); c = padd(c,t); + } + }; + + template<typename CJ, typename A, typename B, typename C, typename T> + EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t) + { + gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t); + } + + #define MADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T); +// #define MADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T); +#endif + +/* Vectorization logic + * real*real: unpack rhs to constant packets, ... + * + * cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i), + * storing each res packet into two packets (2x2), + * at the end combine them: swap the second and addsub them + * cf*cf : same but with 2x4 blocks + * cplx*real : unpack rhs to constant packets, ... + * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual + */ +template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs> +class gebp_traits +{ +public: + typedef _LhsScalar LhsScalar; + typedef _RhsScalar RhsScalar; + typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; + + enum { + ConjLhs = _ConjLhs, + ConjRhs = _ConjRhs, + Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable, + LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1, + RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1, + ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1, + + NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, + + // register block size along the N direction (must be either 2 or 4) + nr = NumberOfRegisters/4, + + // register block size along the M direction (currently, this one cannot be modified) + mr = 2 * LhsPacketSize, + + WorkSpaceFactor = nr * RhsPacketSize, + + LhsProgress = LhsPacketSize, + RhsProgress = RhsPacketSize + }; + + typedef typename packet_traits<LhsScalar>::type _LhsPacket; + typedef typename packet_traits<RhsScalar>::type _RhsPacket; + typedef typename packet_traits<ResScalar>::type _ResPacket; + + typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket; + typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket; + typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; + + typedef ResPacket AccPacket; + + EIGEN_STRONG_INLINE void initAcc(AccPacket& p) + { + p = pset1<ResPacket>(ResScalar(0)); + } + + EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) + { + for(DenseIndex k=0; k<n; k++) + pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]); + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = pload<RhsPacket>(b); + } + + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const + { + dest = pload<LhsPacket>(a); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, AccPacket& tmp) const + { + tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp); + } + + EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const + { + r = pmadd(c,alpha,r); + } + +protected: +// conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj; +// conj_helper<LhsPacket,RhsPacket,ConjLhs,ConjRhs> pcj; +}; + +template<typename RealScalar, bool _ConjLhs> +class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false> +{ +public: + typedef std::complex<RealScalar> LhsScalar; + typedef RealScalar RhsScalar; + typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; + + enum { + ConjLhs = _ConjLhs, + ConjRhs = false, + Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable, + LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1, + RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1, + ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1, + + NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, + nr = NumberOfRegisters/4, + mr = 2 * LhsPacketSize, + WorkSpaceFactor = nr*RhsPacketSize, + + LhsProgress = LhsPacketSize, + RhsProgress = RhsPacketSize + }; + + typedef typename packet_traits<LhsScalar>::type _LhsPacket; + typedef typename packet_traits<RhsScalar>::type _RhsPacket; + typedef typename packet_traits<ResScalar>::type _ResPacket; + + typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket; + typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket; + typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; + + typedef ResPacket AccPacket; + + EIGEN_STRONG_INLINE void initAcc(AccPacket& p) + { + p = pset1<ResPacket>(ResScalar(0)); + } + + EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) + { + for(DenseIndex k=0; k<n; k++) + pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]); + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = pload<RhsPacket>(b); + } + + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const + { + dest = pload<LhsPacket>(a); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const + { + madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type()); + } + + EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const + { + tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp); + } + + EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const + { + c += a * b; + } + + EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const + { + r = cj.pmadd(c,alpha,r); + } + +protected: + conj_helper<ResPacket,ResPacket,ConjLhs,false> cj; +}; + +template<typename RealScalar, bool _ConjLhs, bool _ConjRhs> +class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs > +{ +public: + typedef std::complex<RealScalar> Scalar; + typedef std::complex<RealScalar> LhsScalar; + typedef std::complex<RealScalar> RhsScalar; + typedef std::complex<RealScalar> ResScalar; + + enum { + ConjLhs = _ConjLhs, + ConjRhs = _ConjRhs, + Vectorizable = packet_traits<RealScalar>::Vectorizable + && packet_traits<Scalar>::Vectorizable, + RealPacketSize = Vectorizable ? packet_traits<RealScalar>::size : 1, + ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1, + + nr = 2, + mr = 2 * ResPacketSize, + WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr, + + LhsProgress = ResPacketSize, + RhsProgress = Vectorizable ? 2*ResPacketSize : 1 + }; + + typedef typename packet_traits<RealScalar>::type RealPacket; + typedef typename packet_traits<Scalar>::type ScalarPacket; + struct DoublePacket + { + RealPacket first; + RealPacket second; + }; + + typedef typename conditional<Vectorizable,RealPacket, Scalar>::type LhsPacket; + typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type RhsPacket; + typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket; + typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type AccPacket; + + EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); } + + EIGEN_STRONG_INLINE void initAcc(DoublePacket& p) + { + p.first = pset1<RealPacket>(RealScalar(0)); + p.second = pset1<RealPacket>(RealScalar(0)); + } + + /* Unpack the rhs coeff such that each complex coefficient is spread into + * two packects containing respectively the real and imaginary coefficient + * duplicated as many time as needed: (x+iy) => [x, ..., x] [y, ..., y] + */ + EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const Scalar* rhs, Scalar* b) + { + for(DenseIndex k=0; k<n; k++) + { + if(Vectorizable) + { + pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+0], real(rhs[k])); + pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+ResPacketSize], imag(rhs[k])); + } + else + b[k] = rhs[k]; + } + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { dest = *b; } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const + { + dest.first = pload<RealPacket>((const RealScalar*)b); + dest.second = pload<RealPacket>((const RealScalar*)(b+ResPacketSize)); + } + + // nothing special here + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const + { + dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a)); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacket& c, RhsPacket& /*tmp*/) const + { + c.first = padd(pmul(a,b.first), c.first); + c.second = padd(pmul(a,b.second),c.second); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const + { + c = cj.pmadd(a,b,c); + } + + EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; } + + EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacket& alpha, ResPacket& r) const + { + // assemble c + ResPacket tmp; + if((!ConjLhs)&&(!ConjRhs)) + { + tmp = pcplxflip(pconj(ResPacket(c.second))); + tmp = padd(ResPacket(c.first),tmp); + } + else if((!ConjLhs)&&(ConjRhs)) + { + tmp = pconj(pcplxflip(ResPacket(c.second))); + tmp = padd(ResPacket(c.first),tmp); + } + else if((ConjLhs)&&(!ConjRhs)) + { + tmp = pcplxflip(ResPacket(c.second)); + tmp = padd(pconj(ResPacket(c.first)),tmp); + } + else if((ConjLhs)&&(ConjRhs)) + { + tmp = pcplxflip(ResPacket(c.second)); + tmp = psub(pconj(ResPacket(c.first)),tmp); + } + + r = pmadd(tmp,alpha,r); + } + +protected: + conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj; +}; + +template<typename RealScalar, bool _ConjRhs> +class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs > +{ +public: + typedef std::complex<RealScalar> Scalar; + typedef RealScalar LhsScalar; + typedef Scalar RhsScalar; + typedef Scalar ResScalar; + + enum { + ConjLhs = false, + ConjRhs = _ConjRhs, + Vectorizable = packet_traits<RealScalar>::Vectorizable + && packet_traits<Scalar>::Vectorizable, + LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1, + RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1, + ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1, + + NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, + nr = 4, + mr = 2*ResPacketSize, + WorkSpaceFactor = nr*RhsPacketSize, + + LhsProgress = ResPacketSize, + RhsProgress = ResPacketSize + }; + + typedef typename packet_traits<LhsScalar>::type _LhsPacket; + typedef typename packet_traits<RhsScalar>::type _RhsPacket; + typedef typename packet_traits<ResScalar>::type _ResPacket; + + typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket; + typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket; + typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; + + typedef ResPacket AccPacket; + + EIGEN_STRONG_INLINE void initAcc(AccPacket& p) + { + p = pset1<ResPacket>(ResScalar(0)); + } + + EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) + { + for(DenseIndex k=0; k<n; k++) + pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]); + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + { + dest = pload<RhsPacket>(b); + } + + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const + { + dest = ploaddup<LhsPacket>(a); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const + { + madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type()); + } + + EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const + { + tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp); + } + + EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const + { + c += a * b; + } + + EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const + { + r = cj.pmadd(alpha,c,r); + } + +protected: + conj_helper<ResPacket,ResPacket,false,ConjRhs> cj; +}; + +/* optimized GEneral packed Block * packed Panel product kernel + * + * Mixing type logic: C += A * B + * | A | B | comments + * |real |cplx | no vectorization yet, would require to pack A with duplication + * |cplx |real | easy vectorization + */ +template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> +struct gebp_kernel +{ + typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits; + typedef typename Traits::ResScalar ResScalar; + typedef typename Traits::LhsPacket LhsPacket; + typedef typename Traits::RhsPacket RhsPacket; + typedef typename Traits::ResPacket ResPacket; + typedef typename Traits::AccPacket AccPacket; + + enum { + Vectorizable = Traits::Vectorizable, + LhsProgress = Traits::LhsProgress, + RhsProgress = Traits::RhsProgress, + ResPacketSize = Traits::ResPacketSize + }; + + EIGEN_FLATTEN_ATTRIB + void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB = 0) + { + Traits traits; + + if(strideA==-1) strideA = depth; + if(strideB==-1) strideB = depth; + conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj; +// conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj; + Index packet_cols = (cols/nr) * nr; + const Index peeled_mc = (rows/mr)*mr; + // FIXME: + const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsProgress ? LhsProgress : 0); + const Index peeled_kc = (depth/4)*4; + + if(unpackedB==0) + unpackedB = const_cast<RhsScalar*>(blockB - strideB * nr * RhsProgress); + + // loops on each micro vertical panel of rhs (depth x nr) + for(Index j2=0; j2<packet_cols; j2+=nr) + { + traits.unpackRhs(depth*nr,&blockB[j2*strideB+offsetB*nr],unpackedB); + + // loops on each largest micro horizontal panel of lhs (mr x depth) + // => we select a mr x nr micro block of res which is entirely + // stored into mr/packet_size x nr registers. + for(Index i=0; i<peeled_mc; i+=mr) + { + const LhsScalar* blA = &blockA[i*strideA+offsetA*mr]; + prefetch(&blA[0]); + + // gets res block as register + AccPacket C0, C1, C2, C3, C4, C5, C6, C7; + traits.initAcc(C0); + traits.initAcc(C1); + if(nr==4) traits.initAcc(C2); + if(nr==4) traits.initAcc(C3); + traits.initAcc(C4); + traits.initAcc(C5); + if(nr==4) traits.initAcc(C6); + if(nr==4) traits.initAcc(C7); + + ResScalar* r0 = &res[(j2+0)*resStride + i]; + ResScalar* r1 = r0 + resStride; + ResScalar* r2 = r1 + resStride; + ResScalar* r3 = r2 + resStride; + + prefetch(r0+16); + prefetch(r1+16); + prefetch(r2+16); + prefetch(r3+16); + + // performs "inner" product + // TODO let's check wether the folowing peeled loop could not be + // optimized via optimal prefetching from one loop to the other + const RhsScalar* blB = unpackedB; + for(Index k=0; k<peeled_kc; k+=4) + { + if(nr==2) + { + LhsPacket A0, A1; + RhsPacket B0; + RhsPacket T0; + +EIGEN_ASM_COMMENT("mybegin2"); + traits.loadLhs(&blA[0*LhsProgress], A0); + traits.loadLhs(&blA[1*LhsProgress], A1); + traits.loadRhs(&blB[0*RhsProgress], B0); + traits.madd(A0,B0,C0,T0); + traits.madd(A1,B0,C4,B0); + traits.loadRhs(&blB[1*RhsProgress], B0); + traits.madd(A0,B0,C1,T0); + traits.madd(A1,B0,C5,B0); + + traits.loadLhs(&blA[2*LhsProgress], A0); + traits.loadLhs(&blA[3*LhsProgress], A1); + traits.loadRhs(&blB[2*RhsProgress], B0); + traits.madd(A0,B0,C0,T0); + traits.madd(A1,B0,C4,B0); + traits.loadRhs(&blB[3*RhsProgress], B0); + traits.madd(A0,B0,C1,T0); + traits.madd(A1,B0,C5,B0); + + traits.loadLhs(&blA[4*LhsProgress], A0); + traits.loadLhs(&blA[5*LhsProgress], A1); + traits.loadRhs(&blB[4*RhsProgress], B0); + traits.madd(A0,B0,C0,T0); + traits.madd(A1,B0,C4,B0); + traits.loadRhs(&blB[5*RhsProgress], B0); + traits.madd(A0,B0,C1,T0); + traits.madd(A1,B0,C5,B0); + + traits.loadLhs(&blA[6*LhsProgress], A0); + traits.loadLhs(&blA[7*LhsProgress], A1); + traits.loadRhs(&blB[6*RhsProgress], B0); + traits.madd(A0,B0,C0,T0); + traits.madd(A1,B0,C4,B0); + traits.loadRhs(&blB[7*RhsProgress], B0); + traits.madd(A0,B0,C1,T0); + traits.madd(A1,B0,C5,B0); +EIGEN_ASM_COMMENT("myend"); + } + else + { +EIGEN_ASM_COMMENT("mybegin4"); + LhsPacket A0, A1; + RhsPacket B0, B1, B2, B3; + RhsPacket T0; + + traits.loadLhs(&blA[0*LhsProgress], A0); + traits.loadLhs(&blA[1*LhsProgress], A1); + traits.loadRhs(&blB[0*RhsProgress], B0); + traits.loadRhs(&blB[1*RhsProgress], B1); + + traits.madd(A0,B0,C0,T0); + traits.loadRhs(&blB[2*RhsProgress], B2); + traits.madd(A1,B0,C4,B0); + traits.loadRhs(&blB[3*RhsProgress], B3); + traits.loadRhs(&blB[4*RhsProgress], B0); + traits.madd(A0,B1,C1,T0); + traits.madd(A1,B1,C5,B1); + traits.loadRhs(&blB[5*RhsProgress], B1); + traits.madd(A0,B2,C2,T0); + traits.madd(A1,B2,C6,B2); + traits.loadRhs(&blB[6*RhsProgress], B2); + traits.madd(A0,B3,C3,T0); + traits.loadLhs(&blA[2*LhsProgress], A0); + traits.madd(A1,B3,C7,B3); + traits.loadLhs(&blA[3*LhsProgress], A1); + traits.loadRhs(&blB[7*RhsProgress], B3); + traits.madd(A0,B0,C0,T0); + traits.madd(A1,B0,C4,B0); + traits.loadRhs(&blB[8*RhsProgress], B0); + traits.madd(A0,B1,C1,T0); + traits.madd(A1,B1,C5,B1); + traits.loadRhs(&blB[9*RhsProgress], B1); + traits.madd(A0,B2,C2,T0); + traits.madd(A1,B2,C6,B2); + traits.loadRhs(&blB[10*RhsProgress], B2); + traits.madd(A0,B3,C3,T0); + traits.loadLhs(&blA[4*LhsProgress], A0); + traits.madd(A1,B3,C7,B3); + traits.loadLhs(&blA[5*LhsProgress], A1); + traits.loadRhs(&blB[11*RhsProgress], B3); + + traits.madd(A0,B0,C0,T0); + traits.madd(A1,B0,C4,B0); + traits.loadRhs(&blB[12*RhsProgress], B0); + traits.madd(A0,B1,C1,T0); + traits.madd(A1,B1,C5,B1); + traits.loadRhs(&blB[13*RhsProgress], B1); + traits.madd(A0,B2,C2,T0); + traits.madd(A1,B2,C6,B2); + traits.loadRhs(&blB[14*RhsProgress], B2); + traits.madd(A0,B3,C3,T0); + traits.loadLhs(&blA[6*LhsProgress], A0); + traits.madd(A1,B3,C7,B3); + traits.loadLhs(&blA[7*LhsProgress], A1); + traits.loadRhs(&blB[15*RhsProgress], B3); + traits.madd(A0,B0,C0,T0); + traits.madd(A1,B0,C4,B0); + traits.madd(A0,B1,C1,T0); + traits.madd(A1,B1,C5,B1); + traits.madd(A0,B2,C2,T0); + traits.madd(A1,B2,C6,B2); + traits.madd(A0,B3,C3,T0); + traits.madd(A1,B3,C7,B3); + } + + blB += 4*nr*RhsProgress; + blA += 4*mr; + } + // process remaining peeled loop + for(Index k=peeled_kc; k<depth; k++) + { + if(nr==2) + { + LhsPacket A0, A1; + RhsPacket B0; + RhsPacket T0; + + traits.loadLhs(&blA[0*LhsProgress], A0); + traits.loadLhs(&blA[1*LhsProgress], A1); + traits.loadRhs(&blB[0*RhsProgress], B0); + traits.madd(A0,B0,C0,T0); + traits.madd(A1,B0,C4,B0); + traits.loadRhs(&blB[1*RhsProgress], B0); + traits.madd(A0,B0,C1,T0); + traits.madd(A1,B0,C5,B0); + } + else + { + LhsPacket A0, A1; + RhsPacket B0, B1, B2, B3; + RhsPacket T0; + + traits.loadLhs(&blA[0*LhsProgress], A0); + traits.loadLhs(&blA[1*LhsProgress], A1); + traits.loadRhs(&blB[0*RhsProgress], B0); + traits.loadRhs(&blB[1*RhsProgress], B1); + + traits.madd(A0,B0,C0,T0); + traits.loadRhs(&blB[2*RhsProgress], B2); + traits.madd(A1,B0,C4,B0); + traits.loadRhs(&blB[3*RhsProgress], B3); + traits.madd(A0,B1,C1,T0); + traits.madd(A1,B1,C5,B1); + traits.madd(A0,B2,C2,T0); + traits.madd(A1,B2,C6,B2); + traits.madd(A0,B3,C3,T0); + traits.madd(A1,B3,C7,B3); + } + + blB += nr*RhsProgress; + blA += mr; + } + + if(nr==4) + { + ResPacket R0, R1, R2, R3, R4, R5, R6; + ResPacket alphav = pset1<ResPacket>(alpha); + + R0 = ploadu<ResPacket>(r0); + R1 = ploadu<ResPacket>(r1); + R2 = ploadu<ResPacket>(r2); + R3 = ploadu<ResPacket>(r3); + R4 = ploadu<ResPacket>(r0 + ResPacketSize); + R5 = ploadu<ResPacket>(r1 + ResPacketSize); + R6 = ploadu<ResPacket>(r2 + ResPacketSize); + traits.acc(C0, alphav, R0); + pstoreu(r0, R0); + R0 = ploadu<ResPacket>(r3 + ResPacketSize); + + traits.acc(C1, alphav, R1); + traits.acc(C2, alphav, R2); + traits.acc(C3, alphav, R3); + traits.acc(C4, alphav, R4); + traits.acc(C5, alphav, R5); + traits.acc(C6, alphav, R6); + traits.acc(C7, alphav, R0); + + pstoreu(r1, R1); + pstoreu(r2, R2); + pstoreu(r3, R3); + pstoreu(r0 + ResPacketSize, R4); + pstoreu(r1 + ResPacketSize, R5); + pstoreu(r2 + ResPacketSize, R6); + pstoreu(r3 + ResPacketSize, R0); + } + else + { + ResPacket R0, R1, R4; + ResPacket alphav = pset1<ResPacket>(alpha); + + R0 = ploadu<ResPacket>(r0); + R1 = ploadu<ResPacket>(r1); + R4 = ploadu<ResPacket>(r0 + ResPacketSize); + traits.acc(C0, alphav, R0); + pstoreu(r0, R0); + R0 = ploadu<ResPacket>(r1 + ResPacketSize); + traits.acc(C1, alphav, R1); + traits.acc(C4, alphav, R4); + traits.acc(C5, alphav, R0); + pstoreu(r1, R1); + pstoreu(r0 + ResPacketSize, R4); + pstoreu(r1 + ResPacketSize, R0); + } + + } + + if(rows-peeled_mc>=LhsProgress) + { + Index i = peeled_mc; + const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress]; + prefetch(&blA[0]); + + // gets res block as register + AccPacket C0, C1, C2, C3; + traits.initAcc(C0); + traits.initAcc(C1); + if(nr==4) traits.initAcc(C2); + if(nr==4) traits.initAcc(C3); + + // performs "inner" product + const RhsScalar* blB = unpackedB; + for(Index k=0; k<peeled_kc; k+=4) + { + if(nr==2) + { + LhsPacket A0; + RhsPacket B0, B1; + + traits.loadLhs(&blA[0*LhsProgress], A0); + traits.loadRhs(&blB[0*RhsProgress], B0); + traits.loadRhs(&blB[1*RhsProgress], B1); + traits.madd(A0,B0,C0,B0); + traits.loadRhs(&blB[2*RhsProgress], B0); + traits.madd(A0,B1,C1,B1); + traits.loadLhs(&blA[1*LhsProgress], A0); + traits.loadRhs(&blB[3*RhsProgress], B1); + traits.madd(A0,B0,C0,B0); + traits.loadRhs(&blB[4*RhsProgress], B0); + traits.madd(A0,B1,C1,B1); + traits.loadLhs(&blA[2*LhsProgress], A0); + traits.loadRhs(&blB[5*RhsProgress], B1); + traits.madd(A0,B0,C0,B0); + traits.loadRhs(&blB[6*RhsProgress], B0); + traits.madd(A0,B1,C1,B1); + traits.loadLhs(&blA[3*LhsProgress], A0); + traits.loadRhs(&blB[7*RhsProgress], B1); + traits.madd(A0,B0,C0,B0); + traits.madd(A0,B1,C1,B1); + } + else + { + LhsPacket A0; + RhsPacket B0, B1, B2, B3; + + traits.loadLhs(&blA[0*LhsProgress], A0); + traits.loadRhs(&blB[0*RhsProgress], B0); + traits.loadRhs(&blB[1*RhsProgress], B1); + + traits.madd(A0,B0,C0,B0); + traits.loadRhs(&blB[2*RhsProgress], B2); + traits.loadRhs(&blB[3*RhsProgress], B3); + traits.loadRhs(&blB[4*RhsProgress], B0); + traits.madd(A0,B1,C1,B1); + traits.loadRhs(&blB[5*RhsProgress], B1); + traits.madd(A0,B2,C2,B2); + traits.loadRhs(&blB[6*RhsProgress], B2); + traits.madd(A0,B3,C3,B3); + traits.loadLhs(&blA[1*LhsProgress], A0); + traits.loadRhs(&blB[7*RhsProgress], B3); + traits.madd(A0,B0,C0,B0); + traits.loadRhs(&blB[8*RhsProgress], B0); + traits.madd(A0,B1,C1,B1); + traits.loadRhs(&blB[9*RhsProgress], B1); + traits.madd(A0,B2,C2,B2); + traits.loadRhs(&blB[10*RhsProgress], B2); + traits.madd(A0,B3,C3,B3); + traits.loadLhs(&blA[2*LhsProgress], A0); + traits.loadRhs(&blB[11*RhsProgress], B3); + + traits.madd(A0,B0,C0,B0); + traits.loadRhs(&blB[12*RhsProgress], B0); + traits.madd(A0,B1,C1,B1); + traits.loadRhs(&blB[13*RhsProgress], B1); + traits.madd(A0,B2,C2,B2); + traits.loadRhs(&blB[14*RhsProgress], B2); + traits.madd(A0,B3,C3,B3); + + traits.loadLhs(&blA[3*LhsProgress], A0); + traits.loadRhs(&blB[15*RhsProgress], B3); + traits.madd(A0,B0,C0,B0); + traits.madd(A0,B1,C1,B1); + traits.madd(A0,B2,C2,B2); + traits.madd(A0,B3,C3,B3); + } + + blB += nr*4*RhsProgress; + blA += 4*LhsProgress; + } + // process remaining peeled loop + for(Index k=peeled_kc; k<depth; k++) + { + if(nr==2) + { + LhsPacket A0; + RhsPacket B0, B1; + + traits.loadLhs(&blA[0*LhsProgress], A0); + traits.loadRhs(&blB[0*RhsProgress], B0); + traits.loadRhs(&blB[1*RhsProgress], B1); + traits.madd(A0,B0,C0,B0); + traits.madd(A0,B1,C1,B1); + } + else + { + LhsPacket A0; + RhsPacket B0, B1, B2, B3; + + traits.loadLhs(&blA[0*LhsProgress], A0); + traits.loadRhs(&blB[0*RhsProgress], B0); + traits.loadRhs(&blB[1*RhsProgress], B1); + traits.loadRhs(&blB[2*RhsProgress], B2); + traits.loadRhs(&blB[3*RhsProgress], B3); + + traits.madd(A0,B0,C0,B0); + traits.madd(A0,B1,C1,B1); + traits.madd(A0,B2,C2,B2); + traits.madd(A0,B3,C3,B3); + } + + blB += nr*RhsProgress; + blA += LhsProgress; + } + + ResPacket R0, R1, R2, R3; + ResPacket alphav = pset1<ResPacket>(alpha); + + ResScalar* r0 = &res[(j2+0)*resStride + i]; + ResScalar* r1 = r0 + resStride; + ResScalar* r2 = r1 + resStride; + ResScalar* r3 = r2 + resStride; + + R0 = ploadu<ResPacket>(r0); + R1 = ploadu<ResPacket>(r1); + if(nr==4) R2 = ploadu<ResPacket>(r2); + if(nr==4) R3 = ploadu<ResPacket>(r3); + + traits.acc(C0, alphav, R0); + traits.acc(C1, alphav, R1); + if(nr==4) traits.acc(C2, alphav, R2); + if(nr==4) traits.acc(C3, alphav, R3); + + pstoreu(r0, R0); + pstoreu(r1, R1); + if(nr==4) pstoreu(r2, R2); + if(nr==4) pstoreu(r3, R3); + } + for(Index i=peeled_mc2; i<rows; i++) + { + const LhsScalar* blA = &blockA[i*strideA+offsetA]; + prefetch(&blA[0]); + + // gets a 1 x nr res block as registers + ResScalar C0(0), C1(0), C2(0), C3(0); + // TODO directly use blockB ??? + const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; + for(Index k=0; k<depth; k++) + { + if(nr==2) + { + LhsScalar A0; + RhsScalar B0, B1; + + A0 = blA[k]; + B0 = blB[0]; + B1 = blB[1]; + MADD(cj,A0,B0,C0,B0); + MADD(cj,A0,B1,C1,B1); + } + else + { + LhsScalar A0; + RhsScalar B0, B1, B2, B3; + + A0 = blA[k]; + B0 = blB[0]; + B1 = blB[1]; + B2 = blB[2]; + B3 = blB[3]; + + MADD(cj,A0,B0,C0,B0); + MADD(cj,A0,B1,C1,B1); + MADD(cj,A0,B2,C2,B2); + MADD(cj,A0,B3,C3,B3); + } + + blB += nr; + } + res[(j2+0)*resStride + i] += alpha*C0; + res[(j2+1)*resStride + i] += alpha*C1; + if(nr==4) res[(j2+2)*resStride + i] += alpha*C2; + if(nr==4) res[(j2+3)*resStride + i] += alpha*C3; + } + } + // process remaining rhs/res columns one at a time + // => do the same but with nr==1 + for(Index j2=packet_cols; j2<cols; j2++) + { + // unpack B + traits.unpackRhs(depth, &blockB[j2*strideB+offsetB], unpackedB); + + for(Index i=0; i<peeled_mc; i+=mr) + { + const LhsScalar* blA = &blockA[i*strideA+offsetA*mr]; + prefetch(&blA[0]); + + // TODO move the res loads to the stores + + // get res block as registers + AccPacket C0, C4; + traits.initAcc(C0); + traits.initAcc(C4); + + const RhsScalar* blB = unpackedB; + for(Index k=0; k<depth; k++) + { + LhsPacket A0, A1; + RhsPacket B0; + RhsPacket T0; + + traits.loadLhs(&blA[0*LhsProgress], A0); + traits.loadLhs(&blA[1*LhsProgress], A1); + traits.loadRhs(&blB[0*RhsProgress], B0); + traits.madd(A0,B0,C0,T0); + traits.madd(A1,B0,C4,B0); + + blB += RhsProgress; + blA += 2*LhsProgress; + } + ResPacket R0, R4; + ResPacket alphav = pset1<ResPacket>(alpha); + + ResScalar* r0 = &res[(j2+0)*resStride + i]; + + R0 = ploadu<ResPacket>(r0); + R4 = ploadu<ResPacket>(r0+ResPacketSize); + + traits.acc(C0, alphav, R0); + traits.acc(C4, alphav, R4); + + pstoreu(r0, R0); + pstoreu(r0+ResPacketSize, R4); + } + if(rows-peeled_mc>=LhsProgress) + { + Index i = peeled_mc; + const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress]; + prefetch(&blA[0]); + + AccPacket C0; + traits.initAcc(C0); + + const RhsScalar* blB = unpackedB; + for(Index k=0; k<depth; k++) + { + LhsPacket A0; + RhsPacket B0; + traits.loadLhs(blA, A0); + traits.loadRhs(blB, B0); + traits.madd(A0, B0, C0, B0); + blB += RhsProgress; + blA += LhsProgress; + } + + ResPacket alphav = pset1<ResPacket>(alpha); + ResPacket R0 = ploadu<ResPacket>(&res[(j2+0)*resStride + i]); + traits.acc(C0, alphav, R0); + pstoreu(&res[(j2+0)*resStride + i], R0); + } + for(Index i=peeled_mc2; i<rows; i++) + { + const LhsScalar* blA = &blockA[i*strideA+offsetA]; + prefetch(&blA[0]); + + // gets a 1 x 1 res block as registers + ResScalar C0(0); + // FIXME directly use blockB ?? + const RhsScalar* blB = &blockB[j2*strideB+offsetB]; + for(Index k=0; k<depth; k++) + { + LhsScalar A0 = blA[k]; + RhsScalar B0 = blB[k]; + MADD(cj, A0, B0, C0, B0); + } + res[(j2+0)*resStride + i] += alpha*C0; + } + } + } +}; + +#undef CJMADD + +// pack a block of the lhs +// The travesal is as follow (mr==4): +// 0 4 8 12 ... +// 1 5 9 13 ... +// 2 6 10 14 ... +// 3 7 11 15 ... +// +// 16 20 24 28 ... +// 17 21 25 29 ... +// 18 22 26 30 ... +// 19 23 27 31 ... +// +// 32 33 34 35 ... +// 36 36 38 39 ... +template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode> +struct gemm_pack_lhs +{ + void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, + Index stride=0, Index offset=0) + { +// enum { PacketSize = packet_traits<Scalar>::size }; + eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); + conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; + const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs,lhsStride); + Index count = 0; + Index peeled_mc = (rows/Pack1)*Pack1; + for(Index i=0; i<peeled_mc; i+=Pack1) + { + if(PanelMode) count += Pack1 * offset; + for(Index k=0; k<depth; k++) + for(Index w=0; w<Pack1; w++) + blockA[count++] = cj(lhs(i+w, k)); + if(PanelMode) count += Pack1 * (stride-offset-depth); + } + if(rows-peeled_mc>=Pack2) + { + if(PanelMode) count += Pack2*offset; + for(Index k=0; k<depth; k++) + for(Index w=0; w<Pack2; w++) + blockA[count++] = cj(lhs(peeled_mc+w, k)); + if(PanelMode) count += Pack2 * (stride-offset-depth); + peeled_mc += Pack2; + } + for(Index i=peeled_mc; i<rows; i++) + { + if(PanelMode) count += offset; + for(Index k=0; k<depth; k++) + blockA[count++] = cj(lhs(i, k)); + if(PanelMode) count += (stride-offset-depth); + } + } +}; + +// copy a complete panel of the rhs +// this version is optimized for column major matrices +// The traversal order is as follow: (nr==4): +// 0 1 2 3 12 13 14 15 24 27 +// 4 5 6 7 16 17 18 19 25 28 +// 8 9 10 11 20 21 22 23 26 29 +// . . . . . . . . . . +template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode> +struct gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode> +{ + typedef typename packet_traits<Scalar>::type Packet; + enum { PacketSize = packet_traits<Scalar>::size }; + void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, + Index stride=0, Index offset=0) + { + eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); + conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; + Index packet_cols = (cols/nr) * nr; + Index count = 0; + for(Index j2=0; j2<packet_cols; j2+=nr) + { + // skip what we have before + if(PanelMode) count += nr * offset; + const Scalar* b0 = &rhs[(j2+0)*rhsStride]; + const Scalar* b1 = &rhs[(j2+1)*rhsStride]; + const Scalar* b2 = &rhs[(j2+2)*rhsStride]; + const Scalar* b3 = &rhs[(j2+3)*rhsStride]; + for(Index k=0; k<depth; k++) + { + blockB[count+0] = cj(b0[k]); + blockB[count+1] = cj(b1[k]); + if(nr==4) blockB[count+2] = cj(b2[k]); + if(nr==4) blockB[count+3] = cj(b3[k]); + count += nr; + } + // skip what we have after + if(PanelMode) count += nr * (stride-offset-depth); + } + + // copy the remaining columns one at a time (nr==1) + for(Index j2=packet_cols; j2<cols; ++j2) + { + if(PanelMode) count += offset; + const Scalar* b0 = &rhs[(j2+0)*rhsStride]; + for(Index k=0; k<depth; k++) + { + blockB[count] = cj(b0[k]); + count += 1; + } + if(PanelMode) count += (stride-offset-depth); + } + } +}; + +// this version is optimized for row major matrices +template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode> +struct gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode> +{ + enum { PacketSize = packet_traits<Scalar>::size }; + void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, + Index stride=0, Index offset=0) + { + eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); + conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; + Index packet_cols = (cols/nr) * nr; + Index count = 0; + for(Index j2=0; j2<packet_cols; j2+=nr) + { + // skip what we have before + if(PanelMode) count += nr * offset; + for(Index k=0; k<depth; k++) + { + const Scalar* b0 = &rhs[k*rhsStride + j2]; + blockB[count+0] = cj(b0[0]); + blockB[count+1] = cj(b0[1]); + if(nr==4) blockB[count+2] = cj(b0[2]); + if(nr==4) blockB[count+3] = cj(b0[3]); + count += nr; + } + // skip what we have after + if(PanelMode) count += nr * (stride-offset-depth); + } + // copy the remaining columns one at a time (nr==1) + for(Index j2=packet_cols; j2<cols; ++j2) + { + if(PanelMode) count += offset; + const Scalar* b0 = &rhs[j2]; + for(Index k=0; k<depth; k++) + { + blockB[count] = cj(b0[k*rhsStride]); + count += 1; + } + if(PanelMode) count += stride-offset-depth; + } + } +}; + +} // end namespace internal + +/** \returns the currently set level 1 cpu cache size (in bytes) used to estimate the ideal blocking size parameters. + * \sa setCpuCacheSize */ +inline std::ptrdiff_t l1CacheSize() +{ + std::ptrdiff_t l1, l2; + internal::manage_caching_sizes(GetAction, &l1, &l2); + return l1; +} + +/** \returns the currently set level 2 cpu cache size (in bytes) used to estimate the ideal blocking size parameters. + * \sa setCpuCacheSize */ +inline std::ptrdiff_t l2CacheSize() +{ + std::ptrdiff_t l1, l2; + internal::manage_caching_sizes(GetAction, &l1, &l2); + return l2; +} + +/** Set the cpu L1 and L2 cache sizes (in bytes). + * These values are use to adjust the size of the blocks + * for the algorithms working per blocks. + * + * \sa computeProductBlockingSizes */ +inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2) +{ + internal::manage_caching_sizes(SetAction, &l1, &l2); +} + +#endif // EIGEN_GENERAL_BLOCK_PANEL_H diff --git a/extern/Eigen3/Eigen/src/Core/products/GeneralMatrixMatrix.h b/extern/Eigen3/Eigen/src/Core/products/GeneralMatrixMatrix.h new file mode 100644 index 00000000000..ae94a27953b --- /dev/null +++ b/extern/Eigen3/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -0,0 +1,439 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see <http://www.gnu.org/licenses/>. + +#ifndef EIGEN_GENERAL_MATRIX_MATRIX_H +#define EIGEN_GENERAL_MATRIX_MATRIX_H + +namespace internal { + +template<typename _LhsScalar, typename _RhsScalar> class level3_blocking; + +/* Specialization for a row-major destination matrix => simple transposition of the product */ +template< + typename Index, + typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, + typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs> +struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor> +{ + typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; + static EIGEN_STRONG_INLINE void run( + Index rows, Index cols, Index depth, + const LhsScalar* lhs, Index lhsStride, + const RhsScalar* rhs, Index rhsStride, + ResScalar* res, Index resStride, + ResScalar alpha, + level3_blocking<RhsScalar,LhsScalar>& blocking, + GemmParallelInfo<Index>* info = 0) + { + // transpose the product such that the result is column major + general_matrix_matrix_product<Index, + RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs, + LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs, + ColMajor> + ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking,info); + } +}; + +/* Specialization for a col-major destination matrix + * => Blocking algorithm following Goto's paper */ +template< + typename Index, + typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, + typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs> +struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor> +{ +typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; +static void run(Index rows, Index cols, Index depth, + const LhsScalar* _lhs, Index lhsStride, + const RhsScalar* _rhs, Index rhsStride, + ResScalar* res, Index resStride, + ResScalar alpha, + level3_blocking<LhsScalar,RhsScalar>& blocking, + GemmParallelInfo<Index>* info = 0) +{ + const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); + const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); + + typedef gebp_traits<LhsScalar,RhsScalar> Traits; + + Index kc = blocking.kc(); // cache block size along the K direction + Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction + //Index nc = blocking.nc(); // cache block size along the N direction + + gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; + gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs; + gebp_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp; + +#ifdef EIGEN_HAS_OPENMP + if(info) + { + // this is the parallel version! + Index tid = omp_get_thread_num(); + Index threads = omp_get_num_threads(); + + std::size_t sizeA = kc*mc; + std::size_t sizeW = kc*Traits::WorkSpaceFactor; + ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, 0); + ei_declare_aligned_stack_constructed_variable(RhsScalar, w, sizeW, 0); + + RhsScalar* blockB = blocking.blockB(); + eigen_internal_assert(blockB!=0); + + // For each horizontal panel of the rhs, and corresponding vertical panel of the lhs... + for(Index k=0; k<depth; k+=kc) + { + const Index actual_kc = (std::min)(k+kc,depth)-k; // => rows of B', and cols of the A' + + // In order to reduce the chance that a thread has to wait for the other, + // let's start by packing A'. + pack_lhs(blockA, &lhs(0,k), lhsStride, actual_kc, mc); + + // Pack B_k to B' in a parallel fashion: + // each thread packs the sub block B_k,j to B'_j where j is the thread id. + + // However, before copying to B'_j, we have to make sure that no other thread is still using it, + // i.e., we test that info[tid].users equals 0. + // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it. + while(info[tid].users!=0) {} + info[tid].users += threads; + + pack_rhs(blockB+info[tid].rhs_start*actual_kc, &rhs(k,info[tid].rhs_start), rhsStride, actual_kc, info[tid].rhs_length); + + // Notify the other threads that the part B'_j is ready to go. + info[tid].sync = k; + + // Computes C_i += A' * B' per B'_j + for(Index shift=0; shift<threads; ++shift) + { + Index j = (tid+shift)%threads; + + // At this point we have to make sure that B'_j has been updated by the thread j, + // we use testAndSetOrdered to mimic a volatile access. + // However, no need to wait for the B' part which has been updated by the current thread! + if(shift>0) + while(info[j].sync!=k) {} + + gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*actual_kc, mc, actual_kc, info[j].rhs_length, alpha, -1,-1,0,0, w); + } + + // Then keep going as usual with the remaining A' + for(Index i=mc; i<rows; i+=mc) + { + const Index actual_mc = (std::min)(i+mc,rows)-i; + + // pack A_i,k to A' + pack_lhs(blockA, &lhs(i,k), lhsStride, actual_kc, actual_mc); + + // C_i += A' * B' + gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1,-1,0,0, w); + } + + // Release all the sub blocks B'_j of B' for the current thread, + // i.e., we simply decrement the number of users by 1 + for(Index j=0; j<threads; ++j) + #pragma omp atomic + --(info[j].users); + } + } + else +#endif // EIGEN_HAS_OPENMP + { + EIGEN_UNUSED_VARIABLE(info); + + // this is the sequential version! + std::size_t sizeA = kc*mc; + std::size_t sizeB = kc*cols; + std::size_t sizeW = kc*Traits::WorkSpaceFactor; + + ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA()); + ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB()); + ei_declare_aligned_stack_constructed_variable(RhsScalar, blockW, sizeW, blocking.blockW()); + + // For each horizontal panel of the rhs, and corresponding panel of the lhs... + // (==GEMM_VAR1) + for(Index k2=0; k2<depth; k2+=kc) + { + const Index actual_kc = (std::min)(k2+kc,depth)-k2; + + // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs. + // => Pack rhs's panel into a sequential chunk of memory (L2 caching) + // Note that this panel will be read as many times as the number of blocks in the lhs's + // vertical panel which is, in practice, a very low number. + pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols); + + + // For each mc x kc block of the lhs's vertical panel... + // (==GEPP_VAR1) + for(Index i2=0; i2<rows; i2+=mc) + { + const Index actual_mc = (std::min)(i2+mc,rows)-i2; + + // We pack the lhs's block into a sequential chunk of memory (L1 caching) + // Note that this block will be read a very high number of times, which is equal to the number of + // micro vertical panel of the large rhs's panel (e.g., cols/4 times). + pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc); + + // Everything is packed, we can now call the block * panel kernel: + gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW); + + } + } + } +} + +}; + +/********************************************************************************* +* Specialization of GeneralProduct<> for "large" GEMM, i.e., +* implementation of the high level wrapper to general_matrix_matrix_product +**********************************************************************************/ + +template<typename Lhs, typename Rhs> +struct traits<GeneralProduct<Lhs,Rhs,GemmProduct> > + : traits<ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs> > +{}; + +template<typename Scalar, typename Index, typename Gemm, typename Lhs, typename Rhs, typename Dest, typename BlockingType> +struct gemm_functor +{ + gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, Scalar actualAlpha, + BlockingType& blocking) + : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha), m_blocking(blocking) + {} + + void initParallelSession() const + { + m_blocking.allocateB(); + } + + void operator() (Index row, Index rows, Index col=0, Index cols=-1, GemmParallelInfo<Index>* info=0) const + { + if(cols==-1) + cols = m_rhs.cols(); + + Gemm::run(rows, cols, m_lhs.cols(), + /*(const Scalar*)*/&m_lhs.coeffRef(row,0), m_lhs.outerStride(), + /*(const Scalar*)*/&m_rhs.coeffRef(0,col), m_rhs.outerStride(), + (Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(), + m_actualAlpha, m_blocking, info); + } + + protected: + const Lhs& m_lhs; + const Rhs& m_rhs; + Dest& m_dest; + Scalar m_actualAlpha; + BlockingType& m_blocking; +}; + +template<int StorageOrder, typename LhsScalar, typename RhsScalar, int MaxRows, int MaxCols, int MaxDepth, +bool FiniteAtCompileTime = MaxRows!=Dynamic && MaxCols!=Dynamic && MaxDepth != Dynamic> class gemm_blocking_space; + +template<typename _LhsScalar, typename _RhsScalar> +class level3_blocking +{ + typedef _LhsScalar LhsScalar; + typedef _RhsScalar RhsScalar; + + protected: + LhsScalar* m_blockA; + RhsScalar* m_blockB; + RhsScalar* m_blockW; + + DenseIndex m_mc; + DenseIndex m_nc; + DenseIndex m_kc; + + public: + + level3_blocking() + : m_blockA(0), m_blockB(0), m_blockW(0), m_mc(0), m_nc(0), m_kc(0) + {} + + inline DenseIndex mc() const { return m_mc; } + inline DenseIndex nc() const { return m_nc; } + inline DenseIndex kc() const { return m_kc; } + + inline LhsScalar* blockA() { return m_blockA; } + inline RhsScalar* blockB() { return m_blockB; } + inline RhsScalar* blockW() { return m_blockW; } +}; + +template<int StorageOrder, typename _LhsScalar, typename _RhsScalar, int MaxRows, int MaxCols, int MaxDepth> +class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, true> + : public level3_blocking< + typename conditional<StorageOrder==RowMajor,_RhsScalar,_LhsScalar>::type, + typename conditional<StorageOrder==RowMajor,_LhsScalar,_RhsScalar>::type> +{ + enum { + Transpose = StorageOrder==RowMajor, + ActualRows = Transpose ? MaxCols : MaxRows, + ActualCols = Transpose ? MaxRows : MaxCols + }; + typedef typename conditional<Transpose,_RhsScalar,_LhsScalar>::type LhsScalar; + typedef typename conditional<Transpose,_LhsScalar,_RhsScalar>::type RhsScalar; + typedef gebp_traits<LhsScalar,RhsScalar> Traits; + enum { + SizeA = ActualRows * MaxDepth, + SizeB = ActualCols * MaxDepth, + SizeW = MaxDepth * Traits::WorkSpaceFactor + }; + + EIGEN_ALIGN16 LhsScalar m_staticA[SizeA]; + EIGEN_ALIGN16 RhsScalar m_staticB[SizeB]; + EIGEN_ALIGN16 RhsScalar m_staticW[SizeW]; + + public: + + gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/) + { + this->m_mc = ActualRows; + this->m_nc = ActualCols; + this->m_kc = MaxDepth; + this->m_blockA = m_staticA; + this->m_blockB = m_staticB; + this->m_blockW = m_staticW; + } + + inline void allocateA() {} + inline void allocateB() {} + inline void allocateW() {} + inline void allocateAll() {} +}; + +template<int StorageOrder, typename _LhsScalar, typename _RhsScalar, int MaxRows, int MaxCols, int MaxDepth> +class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, false> + : public level3_blocking< + typename conditional<StorageOrder==RowMajor,_RhsScalar,_LhsScalar>::type, + typename conditional<StorageOrder==RowMajor,_LhsScalar,_RhsScalar>::type> +{ + enum { + Transpose = StorageOrder==RowMajor + }; + typedef typename conditional<Transpose,_RhsScalar,_LhsScalar>::type LhsScalar; + typedef typename conditional<Transpose,_LhsScalar,_RhsScalar>::type RhsScalar; + typedef gebp_traits<LhsScalar,RhsScalar> Traits; + + DenseIndex m_sizeA; + DenseIndex m_sizeB; + DenseIndex m_sizeW; + + public: + + gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth) + { + this->m_mc = Transpose ? cols : rows; + this->m_nc = Transpose ? rows : cols; + this->m_kc = depth; + + computeProductBlockingSizes<LhsScalar,RhsScalar>(this->m_kc, this->m_mc, this->m_nc); + m_sizeA = this->m_mc * this->m_kc; + m_sizeB = this->m_kc * this->m_nc; + m_sizeW = this->m_kc*Traits::WorkSpaceFactor; + } + + void allocateA() + { + if(this->m_blockA==0) + this->m_blockA = aligned_new<LhsScalar>(m_sizeA); + } + + void allocateB() + { + if(this->m_blockB==0) + this->m_blockB = aligned_new<RhsScalar>(m_sizeB); + } + + void allocateW() + { + if(this->m_blockW==0) + this->m_blockW = aligned_new<RhsScalar>(m_sizeW); + } + + void allocateAll() + { + allocateA(); + allocateB(); + allocateW(); + } + + ~gemm_blocking_space() + { + aligned_delete(this->m_blockA, m_sizeA); + aligned_delete(this->m_blockB, m_sizeB); + aligned_delete(this->m_blockW, m_sizeW); + } +}; + +} // end namespace internal + +template<typename Lhs, typename Rhs> +class GeneralProduct<Lhs, Rhs, GemmProduct> + : public ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs> +{ + enum { + MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(Lhs::MaxColsAtCompileTime,Rhs::MaxRowsAtCompileTime) + }; + public: + EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct) + + typedef typename Lhs::Scalar LhsScalar; + typedef typename Rhs::Scalar RhsScalar; + typedef Scalar ResScalar; + + GeneralProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) + { + typedef internal::scalar_product_op<LhsScalar,RhsScalar> BinOp; + EIGEN_CHECK_BINARY_COMPATIBILIY(BinOp,LhsScalar,RhsScalar); + } + + template<typename Dest> void scaleAndAddTo(Dest& dst, Scalar alpha) const + { + eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols()); + + const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs); + const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs); + + Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) + * RhsBlasTraits::extractScalarFactor(m_rhs); + + typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,LhsScalar,RhsScalar, + Dest::MaxRowsAtCompileTime,Dest::MaxColsAtCompileTime,MaxDepthAtCompileTime> BlockingType; + + typedef internal::gemm_functor< + Scalar, Index, + internal::general_matrix_matrix_product< + Index, + LhsScalar, (_ActualLhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate), + RhsScalar, (_ActualRhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate), + (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>, + _ActualLhsType, _ActualRhsType, Dest, BlockingType> GemmFunctor; + + BlockingType blocking(dst.rows(), dst.cols(), lhs.cols()); + + internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), this->rows(), this->cols(), Dest::Flags&RowMajorBit); + } +}; + +#endif // EIGEN_GENERAL_MATRIX_MATRIX_H diff --git a/extern/Eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/extern/Eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h new file mode 100644 index 00000000000..5043b64fe2e --- /dev/null +++ b/extern/Eigen3/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -0,0 +1,225 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see <http://www.gnu.org/licenses/>. + +#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H +#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H + +namespace internal { + +/********************************************************************** +* This file implements a general A * B product while +* evaluating only one triangular part of the product. +* This is more general version of self adjoint product (C += A A^T) +* as the level 3 SYRK Blas routine. +**********************************************************************/ + +// forward declarations (defined at the end of this file) +template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int UpLo> +struct tribb_kernel; + +/* Optimized matrix-matrix product evaluating only one triangular half */ +template <typename Index, + typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, + typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, + int ResStorageOrder, int UpLo> +struct general_matrix_matrix_triangular_product; + +// as usual if the result is row major => we transpose the product +template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, + typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, int UpLo> +struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor,UpLo> +{ + typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; + static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride, + const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride, ResScalar alpha) + { + general_matrix_matrix_triangular_product<Index, + RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs, + LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs, + ColMajor, UpLo==Lower?Upper:Lower> + ::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha); + } +}; + +template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, + typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, int UpLo> +struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,UpLo> +{ + typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; + static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride, + const RhsScalar* _rhs, Index rhsStride, ResScalar* res, Index resStride, ResScalar alpha) + { + const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); + const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); + + typedef gebp_traits<LhsScalar,RhsScalar> Traits; + + Index kc = depth; // cache block size along the K direction + Index mc = size; // cache block size along the M direction + Index nc = size; // cache block size along the N direction + computeProductBlockingSizes<LhsScalar,RhsScalar>(kc, mc, nc); + // !!! mc must be a multiple of nr: + if(mc > Traits::nr) + mc = (mc/Traits::nr)*Traits::nr; + + std::size_t sizeW = kc*Traits::WorkSpaceFactor; + std::size_t sizeB = sizeW + kc*size; + ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0); + ei_declare_aligned_stack_constructed_variable(RhsScalar, allocatedBlockB, sizeB, 0); + RhsScalar* blockB = allocatedBlockB + sizeW; + + gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; + gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs; + gebp_kernel <LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp; + tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, UpLo> sybb; + + for(Index k2=0; k2<depth; k2+=kc) + { + const Index actual_kc = (std::min)(k2+kc,depth)-k2; + + // note that the actual rhs is the transpose/adjoint of mat + pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, size); + + for(Index i2=0; i2<size; i2+=mc) + { + const Index actual_mc = (std::min)(i2+mc,size)-i2; + + pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc); + + // the selected actual_mc * size panel of res is split into three different part: + // 1 - before the diagonal => processed with gebp or skipped + // 2 - the actual_mc x actual_mc symmetric block => processed with a special kernel + // 3 - after the diagonal => processed with gebp or skipped + if (UpLo==Lower) + gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha, + -1, -1, 0, 0, allocatedBlockB); + + sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha, allocatedBlockB); + + if (UpLo==Upper) + { + Index j2 = i2+actual_mc; + gebp(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha, + -1, -1, 0, 0, allocatedBlockB); + } + } + } + } +}; + +// Optimized packed Block * packed Block product kernel evaluating only one given triangular part +// This kernel is built on top of the gebp kernel: +// - the current destination block is processed per panel of actual_mc x BlockSize +// where BlockSize is set to the minimal value allowing gebp to be as fast as possible +// - then, as usual, each panel is split into three parts along the diagonal, +// the sub blocks above and below the diagonal are processed as usual, +// while the triangular block overlapping the diagonal is evaluated into a +// small temporary buffer which is then accumulated into the result using a +// triangular traversal. +template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int UpLo> +struct tribb_kernel +{ + typedef gebp_traits<LhsScalar,RhsScalar,ConjLhs,ConjRhs> Traits; + typedef typename Traits::ResScalar ResScalar; + + enum { + BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr) + }; + void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, ResScalar alpha, RhsScalar* workspace) + { + gebp_kernel<LhsScalar, RhsScalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel; + Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer; + + // let's process the block per panel of actual_mc x BlockSize, + // again, each is split into three parts, etc. + for (Index j=0; j<size; j+=BlockSize) + { + Index actualBlockSize = std::min<Index>(BlockSize,size - j); + const RhsScalar* actual_b = blockB+j*depth; + + if(UpLo==Upper) + gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha, + -1, -1, 0, 0, workspace); + + // selfadjoint micro block + { + Index i = j; + buffer.setZero(); + // 1 - apply the kernel on the temporary buffer + gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, + -1, -1, 0, 0, workspace); + // 2 - triangular accumulation + for(Index j1=0; j1<actualBlockSize; ++j1) + { + ResScalar* r = res + (j+j1)*resStride + i; + for(Index i1=UpLo==Lower ? j1 : 0; + UpLo==Lower ? i1<actualBlockSize : i1<=j1; ++i1) + r[i1] += buffer(i1,j1); + } + } + + if(UpLo==Lower) + { + Index i = j+actualBlockSize; + gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize, alpha, + -1, -1, 0, 0, workspace); + } + } + } +}; + +} // end namespace internal + +// high level API + +template<typename MatrixType, unsigned int UpLo> +template<typename ProductDerived, typename _Lhs, typename _Rhs> +TriangularView<MatrixType,UpLo>& TriangularView<MatrixType,UpLo>::assignProduct(const ProductBase<ProductDerived, _Lhs,_Rhs>& prod, const Scalar& alpha) +{ + typedef typename internal::remove_all<typename ProductDerived::LhsNested>::type Lhs; + typedef internal::blas_traits<Lhs> LhsBlasTraits; + typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhs; + typedef typename internal::remove_all<ActualLhs>::type _ActualLhs; + const ActualLhs actualLhs = LhsBlasTraits::extract(prod.lhs()); + + typedef typename internal::remove_all<typename ProductDerived::RhsNested>::type Rhs; + typedef internal::blas_traits<Rhs> RhsBlasTraits; + typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhs; + typedef typename internal::remove_all<ActualRhs>::type _ActualRhs; + const ActualRhs actualRhs = RhsBlasTraits::extract(prod.rhs()); + + typename ProductDerived::Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived()) * RhsBlasTraits::extractScalarFactor(prod.rhs().derived()); + + internal::general_matrix_matrix_triangular_product<Index, + typename Lhs::Scalar, _ActualLhs::Flags&RowMajorBit ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate, + typename Rhs::Scalar, _ActualRhs::Flags&RowMajorBit ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate, + MatrixType::Flags&RowMajorBit ? RowMajor : ColMajor, UpLo> + ::run(m_matrix.cols(), actualLhs.cols(), + &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &actualRhs.coeffRef(0,0), actualRhs.outerStride(), + const_cast<Scalar*>(m_matrix.data()), m_matrix.outerStride(), actualAlpha); + + return *this; +} + +#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H diff --git a/extern/Eigen3/Eigen/src/Core/products/GeneralMatrixVector.h b/extern/Eigen3/Eigen/src/Core/products/GeneralMatrixVector.h new file mode 100644 index 00000000000..e0e2cbf8f62 --- /dev/null +++ b/extern/Eigen3/Eigen/src/Core/products/GeneralMatrixVector.h @@ -0,0 +1,559 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see <http://www.gnu.org/licenses/>. + +#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H +#define EIGEN_GENERAL_MATRIX_VECTOR_H + +namespace internal { + +/* Optimized col-major matrix * vector product: + * This algorithm processes 4 columns at onces that allows to both reduce + * the number of load/stores of the result by a factor 4 and to reduce + * the instruction dependency. Moreover, we know that all bands have the + * same alignment pattern. + * + * Mixing type logic: C += alpha * A * B + * | A | B |alpha| comments + * |real |cplx |cplx | no vectorization + * |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization + * |cplx |real |cplx | invalid, the caller has to do tmp: = A * B; C += alpha*tmp + * |cplx |real |real | optimal case, vectorization possible via real-cplx mul + */ +template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs> +struct general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs> +{ +typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; + +enum { + Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable + && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size), + LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1, + RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1, + ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1 +}; + +typedef typename packet_traits<LhsScalar>::type _LhsPacket; +typedef typename packet_traits<RhsScalar>::type _RhsPacket; +typedef typename packet_traits<ResScalar>::type _ResPacket; + +typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket; +typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket; +typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; + +EIGEN_DONT_INLINE static void run( + Index rows, Index cols, + const LhsScalar* lhs, Index lhsStride, + const RhsScalar* rhs, Index rhsIncr, + ResScalar* res, Index + #ifdef EIGEN_INTERNAL_DEBUGGING + resIncr + #endif + , RhsScalar alpha) +{ + eigen_internal_assert(resIncr==1); + #ifdef _EIGEN_ACCUMULATE_PACKETS + #error _EIGEN_ACCUMULATE_PACKETS has already been defined + #endif + #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \ + pstore(&res[j], \ + padd(pload<ResPacket>(&res[j]), \ + padd( \ + padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]), ptmp0), \ + pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]), ptmp1)), \ + padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]), ptmp2), \ + pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]), ptmp3)) ))) + + conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj; + conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj; + if(ConjugateRhs) + alpha = conj(alpha); + + enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned }; + const Index columnsAtOnce = 4; + const Index peels = 2; + const Index LhsPacketAlignedMask = LhsPacketSize-1; + const Index ResPacketAlignedMask = ResPacketSize-1; + const Index PeelAlignedMask = ResPacketSize*peels-1; + const Index size = rows; + + // How many coeffs of the result do we have to skip to be aligned. + // Here we assume data are at least aligned on the base scalar type. + Index alignedStart = first_aligned(res,size); + Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0; + const Index peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart; + + const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0; + Index alignmentPattern = alignmentStep==0 ? AllAligned + : alignmentStep==(LhsPacketSize/2) ? EvenAligned + : FirstAligned; + + // we cannot assume the first element is aligned because of sub-matrices + const Index lhsAlignmentOffset = first_aligned(lhs,size); + + // find how many columns do we have to skip to be aligned with the result (if possible) + Index skipColumns = 0; + // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats) + if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) ) + { + alignedSize = 0; + alignedStart = 0; + } + else if (LhsPacketSize>1) + { + eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize); + + while (skipColumns<LhsPacketSize && + alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize)) + ++skipColumns; + if (skipColumns==LhsPacketSize) + { + // nothing can be aligned, no need to skip any column + alignmentPattern = NoneAligned; + skipColumns = 0; + } + else + { + skipColumns = (std::min)(skipColumns,cols); + // note that the skiped columns are processed later. + } + + eigen_internal_assert( (alignmentPattern==NoneAligned) + || (skipColumns + columnsAtOnce >= cols) + || LhsPacketSize > size + || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0); + } + else if(Vectorizable) + { + alignedStart = 0; + alignedSize = size; + alignmentPattern = AllAligned; + } + + Index offset1 = (FirstAligned && alignmentStep==1?3:1); + Index offset3 = (FirstAligned && alignmentStep==1?1:3); + + Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns; + for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce) + { + RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]), + ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]), + ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]), + ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]); + + // this helps a lot generating better binary code + const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, + *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; + + if (Vectorizable) + { + /* explicit vectorization */ + // process initial unaligned coeffs + for (Index j=0; j<alignedStart; ++j) + { + res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]); + res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]); + res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]); + res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]); + } + + if (alignedSize>alignedStart) + { + switch(alignmentPattern) + { + case AllAligned: + for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize) + _EIGEN_ACCUMULATE_PACKETS(d,d,d); + break; + case EvenAligned: + for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize) + _EIGEN_ACCUMULATE_PACKETS(d,du,d); + break; + case FirstAligned: + if(peels>1) + { + LhsPacket A00, A01, A02, A03, A10, A11, A12, A13; + ResPacket T0, T1; + + A01 = pload<LhsPacket>(&lhs1[alignedStart-1]); + A02 = pload<LhsPacket>(&lhs2[alignedStart-2]); + A03 = pload<LhsPacket>(&lhs3[alignedStart-3]); + + for (Index j = alignedStart; j<peeledSize; j+=peels*ResPacketSize) + { + A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11); + A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12); + A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13); + + A00 = pload<LhsPacket>(&lhs0[j]); + A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]); + T0 = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j])); + T1 = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize])); + + T0 = pcj.pmadd(A01, ptmp1, T0); + A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01); + T0 = pcj.pmadd(A02, ptmp2, T0); + A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02); + T0 = pcj.pmadd(A03, ptmp3, T0); + pstore(&res[j],T0); + A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03); + T1 = pcj.pmadd(A11, ptmp1, T1); + T1 = pcj.pmadd(A12, ptmp2, T1); + T1 = pcj.pmadd(A13, ptmp3, T1); + pstore(&res[j+ResPacketSize],T1); + } + } + for (Index j = peeledSize; j<alignedSize; j+=ResPacketSize) + _EIGEN_ACCUMULATE_PACKETS(d,du,du); + break; + default: + for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize) + _EIGEN_ACCUMULATE_PACKETS(du,du,du); + break; + } + } + } // end explicit vectorization + + /* process remaining coeffs (or all if there is no explicit vectorization) */ + for (Index j=alignedSize; j<size; ++j) + { + res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]); + res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]); + res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]); + res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]); + } + } + + // process remaining first and last columns (at most columnsAtOnce-1) + Index end = cols; + Index start = columnBound; + do + { + for (Index k=start; k<end; ++k) + { + RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]); + const LhsScalar* lhs0 = lhs + k*lhsStride; + + if (Vectorizable) + { + /* explicit vectorization */ + // process first unaligned result's coeffs + for (Index j=0; j<alignedStart; ++j) + res[j] += cj.pmul(lhs0[j], pfirst(ptmp0)); + // process aligned result's coeffs + if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0) + for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize) + pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i]))); + else + for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize) + pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i]))); + } + + // process remaining scalars (or all if no explicit vectorization) + for (Index i=alignedSize; i<size; ++i) + res[i] += cj.pmul(lhs0[i], pfirst(ptmp0)); + } + if (skipColumns) + { + start = 0; + end = skipColumns; + skipColumns = 0; + } + else + break; + } while(Vectorizable); + #undef _EIGEN_ACCUMULATE_PACKETS +} +}; + +/* Optimized row-major matrix * vector product: + * This algorithm processes 4 rows at onces that allows to both reduce + * the number of load/stores of the result by a factor 4 and to reduce + * the instruction dependency. Moreover, we know that all bands have the + * same alignment pattern. + * + * Mixing type logic: + * - alpha is always a complex (or converted to a complex) + * - no vectorization + */ +template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs> +struct general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs> +{ +typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; + +enum { + Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable + && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size), + LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1, + RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1, + ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1 +}; + +typedef typename packet_traits<LhsScalar>::type _LhsPacket; +typedef typename packet_traits<RhsScalar>::type _RhsPacket; +typedef typename packet_traits<ResScalar>::type _ResPacket; + +typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket; +typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket; +typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; + +EIGEN_DONT_INLINE static void run( + Index rows, Index cols, + const LhsScalar* lhs, Index lhsStride, + const RhsScalar* rhs, Index rhsIncr, + ResScalar* res, Index resIncr, + ResScalar alpha) +{ + EIGEN_UNUSED_VARIABLE(rhsIncr); + eigen_internal_assert(rhsIncr==1); + #ifdef _EIGEN_ACCUMULATE_PACKETS + #error _EIGEN_ACCUMULATE_PACKETS has already been defined + #endif + + #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\ + RhsPacket b = pload<RhsPacket>(&rhs[j]); \ + ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \ + ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \ + ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \ + ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); } + + conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj; + conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj; + + enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 }; + const Index rowsAtOnce = 4; + const Index peels = 2; + const Index RhsPacketAlignedMask = RhsPacketSize-1; + const Index LhsPacketAlignedMask = LhsPacketSize-1; + const Index PeelAlignedMask = RhsPacketSize*peels-1; + const Index depth = cols; + + // How many coeffs of the result do we have to skip to be aligned. + // Here we assume data are at least aligned on the base scalar type + // if that's not the case then vectorization is discarded, see below. + Index alignedStart = first_aligned(rhs, depth); + Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0; + const Index peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart; + + const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0; + Index alignmentPattern = alignmentStep==0 ? AllAligned + : alignmentStep==(LhsPacketSize/2) ? EvenAligned + : FirstAligned; + + // we cannot assume the first element is aligned because of sub-matrices + const Index lhsAlignmentOffset = first_aligned(lhs,depth); + + // find how many rows do we have to skip to be aligned with rhs (if possible) + Index skipRows = 0; + // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats) + if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) ) + { + alignedSize = 0; + alignedStart = 0; + } + else if (LhsPacketSize>1) + { + eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize); + + while (skipRows<LhsPacketSize && + alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize)) + ++skipRows; + if (skipRows==LhsPacketSize) + { + // nothing can be aligned, no need to skip any column + alignmentPattern = NoneAligned; + skipRows = 0; + } + else + { + skipRows = (std::min)(skipRows,Index(rows)); + // note that the skiped columns are processed later. + } + eigen_internal_assert( alignmentPattern==NoneAligned + || LhsPacketSize==1 + || (skipRows + rowsAtOnce >= rows) + || LhsPacketSize > depth + || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0); + } + else if(Vectorizable) + { + alignedStart = 0; + alignedSize = depth; + alignmentPattern = AllAligned; + } + + Index offset1 = (FirstAligned && alignmentStep==1?3:1); + Index offset3 = (FirstAligned && alignmentStep==1?1:3); + + Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows; + for (Index i=skipRows; i<rowBound; i+=rowsAtOnce) + { + EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0); + ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0); + + // this helps the compiler generating good binary code + const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, + *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; + + if (Vectorizable) + { + /* explicit vectorization */ + ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)), + ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0)); + + // process initial unaligned coeffs + // FIXME this loop get vectorized by the compiler ! + for (Index j=0; j<alignedStart; ++j) + { + RhsScalar b = rhs[j]; + tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b); + tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b); + } + + if (alignedSize>alignedStart) + { + switch(alignmentPattern) + { + case AllAligned: + for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize) + _EIGEN_ACCUMULATE_PACKETS(d,d,d); + break; + case EvenAligned: + for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize) + _EIGEN_ACCUMULATE_PACKETS(d,du,d); + break; + case FirstAligned: + if (peels>1) + { + /* Here we proccess 4 rows with with two peeled iterations to hide + * tghe overhead of unaligned loads. Moreover unaligned loads are handled + * using special shift/move operations between the two aligned packets + * overlaping the desired unaligned packet. This is *much* more efficient + * than basic unaligned loads. + */ + LhsPacket A01, A02, A03, A11, A12, A13; + A01 = pload<LhsPacket>(&lhs1[alignedStart-1]); + A02 = pload<LhsPacket>(&lhs2[alignedStart-2]); + A03 = pload<LhsPacket>(&lhs3[alignedStart-3]); + + for (Index j = alignedStart; j<peeledSize; j+=peels*RhsPacketSize) + { + RhsPacket b = pload<RhsPacket>(&rhs[j]); + A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11); + A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12); + A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13); + + ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0); + ptmp1 = pcj.pmadd(A01, b, ptmp1); + A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01); + ptmp2 = pcj.pmadd(A02, b, ptmp2); + A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02); + ptmp3 = pcj.pmadd(A03, b, ptmp3); + A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03); + + b = pload<RhsPacket>(&rhs[j+RhsPacketSize]); + ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0); + ptmp1 = pcj.pmadd(A11, b, ptmp1); + ptmp2 = pcj.pmadd(A12, b, ptmp2); + ptmp3 = pcj.pmadd(A13, b, ptmp3); + } + } + for (Index j = peeledSize; j<alignedSize; j+=RhsPacketSize) + _EIGEN_ACCUMULATE_PACKETS(d,du,du); + break; + default: + for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize) + _EIGEN_ACCUMULATE_PACKETS(du,du,du); + break; + } + tmp0 += predux(ptmp0); + tmp1 += predux(ptmp1); + tmp2 += predux(ptmp2); + tmp3 += predux(ptmp3); + } + } // end explicit vectorization + + // process remaining coeffs (or all if no explicit vectorization) + // FIXME this loop get vectorized by the compiler ! + for (Index j=alignedSize; j<depth; ++j) + { + RhsScalar b = rhs[j]; + tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b); + tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b); + } + res[i*resIncr] += alpha*tmp0; + res[(i+offset1)*resIncr] += alpha*tmp1; + res[(i+2)*resIncr] += alpha*tmp2; + res[(i+offset3)*resIncr] += alpha*tmp3; + } + + // process remaining first and last rows (at most columnsAtOnce-1) + Index end = rows; + Index start = rowBound; + do + { + for (Index i=start; i<end; ++i) + { + EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0); + ResPacket ptmp0 = pset1<ResPacket>(tmp0); + const LhsScalar* lhs0 = lhs + i*lhsStride; + // process first unaligned result's coeffs + // FIXME this loop get vectorized by the compiler ! + for (Index j=0; j<alignedStart; ++j) + tmp0 += cj.pmul(lhs0[j], rhs[j]); + + if (alignedSize>alignedStart) + { + // process aligned rhs coeffs + if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0) + for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize) + ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0); + else + for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize) + ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0); + tmp0 += predux(ptmp0); + } + + // process remaining scalars + // FIXME this loop get vectorized by the compiler ! + for (Index j=alignedSize; j<depth; ++j) + tmp0 += cj.pmul(lhs0[j], rhs[j]); + res[i*resIncr] += alpha*tmp0; + } + if (skipRows) + { + start = 0; + end = skipRows; + skipRows = 0; + } + else + break; + } while(Vectorizable); + + #undef _EIGEN_ACCUMULATE_PACKETS +} +}; + +} // end namespace internal + +#endif // EIGEN_GENERAL_MATRIX_VECTOR_H diff --git a/extern/Eigen3/Eigen/src/Core/products/Parallelizer.h b/extern/Eigen3/Eigen/src/Core/products/Parallelizer.h new file mode 100644 index 00000000000..ecdedc363ce --- /dev/null +++ b/extern/Eigen3/Eigen/src/Core/products/Parallelizer.h @@ -0,0 +1,154 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see <http://www.gnu.org/licenses/>. + +#ifndef EIGEN_PARALLELIZER_H +#define EIGEN_PARALLELIZER_H + +namespace internal { + +/** \internal */ +inline void manage_multi_threading(Action action, int* v) +{ + static EIGEN_UNUSED int m_maxThreads = -1; + + if(action==SetAction) + { + eigen_internal_assert(v!=0); + m_maxThreads = *v; + } + else if(action==GetAction) + { + eigen_internal_assert(v!=0); + #ifdef EIGEN_HAS_OPENMP + if(m_maxThreads>0) + *v = m_maxThreads; + else + *v = omp_get_max_threads(); + #else + *v = 1; + #endif + } + else + { + eigen_internal_assert(false); + } +} + +/** \returns the max number of threads reserved for Eigen + * \sa setNbThreads */ +inline int nbThreads() +{ + int ret; + manage_multi_threading(GetAction, &ret); + return ret; +} + +/** Sets the max number of threads reserved for Eigen + * \sa nbThreads */ +inline void setNbThreads(int v) +{ + manage_multi_threading(SetAction, &v); +} + +template<typename Index> struct GemmParallelInfo +{ + GemmParallelInfo() : sync(-1), users(0), rhs_start(0), rhs_length(0) {} + + int volatile sync; + int volatile users; + + Index rhs_start; + Index rhs_length; +}; + +template<bool Condition, typename Functor, typename Index> +void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpose) +{ +#ifndef EIGEN_HAS_OPENMP + // FIXME the transpose variable is only needed to properly split + // the matrix product when multithreading is enabled. This is a temporary + // fix to support row-major destination matrices. This whole + // parallelizer mechanism has to be redisigned anyway. + EIGEN_UNUSED_VARIABLE(transpose); + func(0,rows, 0,cols); +#else + + // Dynamically check whether we should enable or disable OpenMP. + // The conditions are: + // - the max number of threads we can create is greater than 1 + // - we are not already in a parallel code + // - the sizes are large enough + + // 1- are we already in a parallel session? + // FIXME omp_get_num_threads()>1 only works for openmp, what if the user does not use openmp? + if((!Condition) || (omp_get_num_threads()>1)) + return func(0,rows, 0,cols); + + Index size = transpose ? cols : rows; + + // 2- compute the maximal number of threads from the size of the product: + // FIXME this has to be fine tuned + Index max_threads = std::max<Index>(1,size / 32); + + // 3 - compute the number of threads we are going to use + Index threads = std::min<Index>(nbThreads(), max_threads); + + if(threads==1) + return func(0,rows, 0,cols); + + func.initParallelSession(); + + if(transpose) + std::swap(rows,cols); + + Index blockCols = (cols / threads) & ~Index(0x3); + Index blockRows = (rows / threads) & ~Index(0x7); + + GemmParallelInfo<Index>* info = new GemmParallelInfo<Index>[threads]; + + #pragma omp parallel for schedule(static,1) num_threads(threads) + for(Index i=0; i<threads; ++i) + { + Index r0 = i*blockRows; + Index actualBlockRows = (i+1==threads) ? rows-r0 : blockRows; + + Index c0 = i*blockCols; + Index actualBlockCols = (i+1==threads) ? cols-c0 : blockCols; + + info[i].rhs_start = c0; + info[i].rhs_length = actualBlockCols; + + if(transpose) + func(0, cols, r0, actualBlockRows, info); + else + func(r0, actualBlockRows, 0,cols, info); + } + + delete[] info; +#endif +} + +} // end namespace internal + +#endif // EIGEN_PARALLELIZER_H diff --git a/extern/Eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/extern/Eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix.h new file mode 100644 index 00000000000..ccd757cfaf8 --- /dev/null +++ b/extern/Eigen3/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -0,0 +1,427 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see <http://www.gnu.org/licenses/>. + +#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_H +#define EIGEN_SELFADJOINT_MATRIX_MATRIX_H + +namespace internal { + +// pack a selfadjoint block diagonal for use with the gebp_kernel +template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder> +struct symm_pack_lhs +{ + template<int BlockRows> inline + void pack(Scalar* blockA, const const_blas_data_mapper<Scalar,Index,StorageOrder>& lhs, Index cols, Index i, Index& count) + { + // normal copy + for(Index k=0; k<i; k++) + for(Index w=0; w<BlockRows; w++) + blockA[count++] = lhs(i+w,k); // normal + // symmetric copy + Index h = 0; + for(Index k=i; k<i+BlockRows; k++) + { + for(Index w=0; w<h; w++) + blockA[count++] = conj(lhs(k, i+w)); // transposed + + blockA[count++] = real(lhs(k,k)); // real (diagonal) + + for(Index w=h+1; w<BlockRows; w++) + blockA[count++] = lhs(i+w, k); // normal + ++h; + } + // transposed copy + for(Index k=i+BlockRows; k<cols; k++) + for(Index w=0; w<BlockRows; w++) + blockA[count++] = conj(lhs(k, i+w)); // transposed + } + void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows) + { + const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride); + Index count = 0; + Index peeled_mc = (rows/Pack1)*Pack1; + for(Index i=0; i<peeled_mc; i+=Pack1) + { + pack<Pack1>(blockA, lhs, cols, i, count); + } + + if(rows-peeled_mc>=Pack2) + { + pack<Pack2>(blockA, lhs, cols, peeled_mc, count); + peeled_mc += Pack2; + } + + // do the same with mr==1 + for(Index i=peeled_mc; i<rows; i++) + { + for(Index k=0; k<i; k++) + blockA[count++] = lhs(i, k); // normal + + blockA[count++] = real(lhs(i, i)); // real (diagonal) + + for(Index k=i+1; k<cols; k++) + blockA[count++] = conj(lhs(k, i)); // transposed + } + } +}; + +template<typename Scalar, typename Index, int nr, int StorageOrder> +struct symm_pack_rhs +{ + enum { PacketSize = packet_traits<Scalar>::size }; + void operator()(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2) + { + Index end_k = k2 + rows; + Index count = 0; + const_blas_data_mapper<Scalar,Index,StorageOrder> rhs(_rhs,rhsStride); + Index packet_cols = (cols/nr)*nr; + + // first part: normal case + for(Index j2=0; j2<k2; j2+=nr) + { + for(Index k=k2; k<end_k; k++) + { + blockB[count+0] = rhs(k,j2+0); + blockB[count+1] = rhs(k,j2+1); + if (nr==4) + { + blockB[count+2] = rhs(k,j2+2); + blockB[count+3] = rhs(k,j2+3); + } + count += nr; + } + } + + // second part: diagonal block + for(Index j2=k2; j2<(std::min)(k2+rows,packet_cols); j2+=nr) + { + // again we can split vertically in three different parts (transpose, symmetric, normal) + // transpose + for(Index k=k2; k<j2; k++) + { + blockB[count+0] = conj(rhs(j2+0,k)); + blockB[count+1] = conj(rhs(j2+1,k)); + if (nr==4) + { + blockB[count+2] = conj(rhs(j2+2,k)); + blockB[count+3] = conj(rhs(j2+3,k)); + } + count += nr; + } + // symmetric + Index h = 0; + for(Index k=j2; k<j2+nr; k++) + { + // normal + for (Index w=0 ; w<h; ++w) + blockB[count+w] = rhs(k,j2+w); + + blockB[count+h] = real(rhs(k,k)); + + // transpose + for (Index w=h+1 ; w<nr; ++w) + blockB[count+w] = conj(rhs(j2+w,k)); + count += nr; + ++h; + } + // normal + for(Index k=j2+nr; k<end_k; k++) + { + blockB[count+0] = rhs(k,j2+0); + blockB[count+1] = rhs(k,j2+1); + if (nr==4) + { + blockB[count+2] = rhs(k,j2+2); + blockB[count+3] = rhs(k,j2+3); + } + count += nr; + } + } + + // third part: transposed + for(Index j2=k2+rows; j2<packet_cols; j2+=nr) + { + for(Index k=k2; k<end_k; k++) + { + blockB[count+0] = conj(rhs(j2+0,k)); + blockB[count+1] = conj(rhs(j2+1,k)); + if (nr==4) + { + blockB[count+2] = conj(rhs(j2+2,k)); + blockB[count+3] = conj(rhs(j2+3,k)); + } + count += nr; + } + } + + // copy the remaining columns one at a time (=> the same with nr==1) + for(Index j2=packet_cols; j2<cols; ++j2) + { + // transpose + Index half = (std::min)(end_k,j2); + for(Index k=k2; k<half; k++) + { + blockB[count] = conj(rhs(j2,k)); + count += 1; + } + + if(half==j2 && half<k2+rows) + { + blockB[count] = real(rhs(j2,j2)); + count += 1; + } + else + half--; + + // normal + for(Index k=half+1; k<k2+rows; k++) + { + blockB[count] = rhs(k,j2); + count += 1; + } + } + } +}; + +/* Optimized selfadjoint matrix * matrix (_SYMM) product built on top of + * the general matrix matrix product. + */ +template <typename Scalar, typename Index, + int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs, + int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs, + int ResStorageOrder> +struct product_selfadjoint_matrix; + +template <typename Scalar, typename Index, + int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs, + int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs> +struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,ConjugateLhs, RhsStorageOrder,RhsSelfAdjoint,ConjugateRhs,RowMajor> +{ + + static EIGEN_STRONG_INLINE void run( + Index rows, Index cols, + const Scalar* lhs, Index lhsStride, + const Scalar* rhs, Index rhsStride, + Scalar* res, Index resStride, + Scalar alpha) + { + product_selfadjoint_matrix<Scalar, Index, + EIGEN_LOGICAL_XOR(RhsSelfAdjoint,RhsStorageOrder==RowMajor) ? ColMajor : RowMajor, + RhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsSelfAdjoint,ConjugateRhs), + EIGEN_LOGICAL_XOR(LhsSelfAdjoint,LhsStorageOrder==RowMajor) ? ColMajor : RowMajor, + LhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsSelfAdjoint,ConjugateLhs), + ColMajor> + ::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha); + } +}; + +template <typename Scalar, typename Index, + int LhsStorageOrder, bool ConjugateLhs, + int RhsStorageOrder, bool ConjugateRhs> +struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor> +{ + + static EIGEN_DONT_INLINE void run( + Index rows, Index cols, + const Scalar* _lhs, Index lhsStride, + const Scalar* _rhs, Index rhsStride, + Scalar* res, Index resStride, + Scalar alpha) + { + Index size = rows; + + const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); + const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); + + typedef gebp_traits<Scalar,Scalar> Traits; + + Index kc = size; // cache block size along the K direction + Index mc = rows; // cache block size along the M direction + Index nc = cols; // cache block size along the N direction + computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc); + // kc must smaller than mc + kc = (std::min)(kc,mc); + + std::size_t sizeW = kc*Traits::WorkSpaceFactor; + std::size_t sizeB = sizeW + kc*cols; + ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); + ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); + Scalar* blockB = allocatedBlockB + sizeW; + + gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; + symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; + gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs; + gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed; + + for(Index k2=0; k2<size; k2+=kc) + { + const Index actual_kc = (std::min)(k2+kc,size)-k2; + + // we have selected one row panel of rhs and one column panel of lhs + // pack rhs's panel into a sequential chunk of memory + // and expand each coeff to a constant packet for further reuse + pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols); + + // the select lhs's panel has to be split in three different parts: + // 1 - the transposed panel above the diagonal block => transposed packed copy + // 2 - the diagonal block => special packed copy + // 3 - the panel below the diagonal block => generic packed copy + for(Index i2=0; i2<k2; i2+=mc) + { + const Index actual_mc = (std::min)(i2+mc,k2)-i2; + // transposed packed copy + pack_lhs_transposed(blockA, &lhs(k2, i2), lhsStride, actual_kc, actual_mc); + + gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); + } + // the block diagonal + { + const Index actual_mc = (std::min)(k2+kc,size)-k2; + // symmetric packed copy + pack_lhs(blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc); + + gebp_kernel(res+k2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); + } + + for(Index i2=k2+kc; i2<size; i2+=mc) + { + const Index actual_mc = (std::min)(i2+mc,size)-i2; + gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>() + (blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc); + + gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); + } + } + } +}; + +// matrix * selfadjoint product +template <typename Scalar, typename Index, + int LhsStorageOrder, bool ConjugateLhs, + int RhsStorageOrder, bool ConjugateRhs> +struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor> +{ + + static EIGEN_DONT_INLINE void run( + Index rows, Index cols, + const Scalar* _lhs, Index lhsStride, + const Scalar* _rhs, Index rhsStride, + Scalar* res, Index resStride, + Scalar alpha) + { + Index size = cols; + + const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); + + typedef gebp_traits<Scalar,Scalar> Traits; + + Index kc = size; // cache block size along the K direction + Index mc = rows; // cache block size along the M direction + Index nc = cols; // cache block size along the N direction + computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc); + std::size_t sizeW = kc*Traits::WorkSpaceFactor; + std::size_t sizeB = sizeW + kc*cols; + ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); + ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); + Scalar* blockB = allocatedBlockB + sizeW; + + gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; + gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; + symm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs; + + for(Index k2=0; k2<size; k2+=kc) + { + const Index actual_kc = (std::min)(k2+kc,size)-k2; + + pack_rhs(blockB, _rhs, rhsStride, actual_kc, cols, k2); + + // => GEPP + for(Index i2=0; i2<rows; i2+=mc) + { + const Index actual_mc = (std::min)(i2+mc,rows)-i2; + pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc); + + gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); + } + } + } +}; + +} // end namespace internal + +/*************************************************************************** +* Wrapper to product_selfadjoint_matrix +***************************************************************************/ + +namespace internal { +template<typename Lhs, int LhsMode, typename Rhs, int RhsMode> +struct traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false> > + : traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs> > +{}; +} + +template<typename Lhs, int LhsMode, typename Rhs, int RhsMode> +struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false> + : public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs > +{ + EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix) + + SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {} + + enum { + LhsIsUpper = (LhsMode&(Upper|Lower))==Upper, + LhsIsSelfAdjoint = (LhsMode&SelfAdjoint)==SelfAdjoint, + RhsIsUpper = (RhsMode&(Upper|Lower))==Upper, + RhsIsSelfAdjoint = (RhsMode&SelfAdjoint)==SelfAdjoint + }; + + template<typename Dest> void scaleAndAddTo(Dest& dst, Scalar alpha) const + { + eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols()); + + const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs); + const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs); + + Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) + * RhsBlasTraits::extractScalarFactor(m_rhs); + + internal::product_selfadjoint_matrix<Scalar, Index, + EIGEN_LOGICAL_XOR(LhsIsUpper, + internal::traits<Lhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint, + NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper,bool(LhsBlasTraits::NeedToConjugate)), + EIGEN_LOGICAL_XOR(RhsIsUpper, + internal::traits<Rhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint, + NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)), + internal::traits<Dest>::Flags&RowMajorBit ? RowMajor : ColMajor> + ::run( + lhs.rows(), rhs.cols(), // sizes + &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info + &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info + &dst.coeffRef(0,0), dst.outerStride(), // result info + actualAlpha // alpha + ); + } +}; + +#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_H diff --git a/extern/Eigen3/Eigen/src/Core/products/SelfadjointMatrixVector.h b/extern/Eigen3/Eigen/src/Core/products/SelfadjointMatrixVector.h new file mode 100644 index 00000000000..d6121fc07bd --- /dev/null +++ b/extern/Eigen3/Eigen/src/Core/products/SelfadjointMatrixVector.h @@ -0,0 +1,278 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see <http://www.gnu.org/licenses/>. + +#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_H +#define EIGEN_SELFADJOINT_MATRIX_VECTOR_H + +namespace internal { + +/* Optimized selfadjoint matrix * vector product: + * This algorithm processes 2 columns at onces that allows to both reduce + * the number of load/stores of the result by a factor 2 and to reduce + * the instruction dependency. + */ +template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs> +static EIGEN_DONT_INLINE void product_selfadjoint_vector( + Index size, + const Scalar* lhs, Index lhsStride, + const Scalar* _rhs, Index rhsIncr, + Scalar* res, + Scalar alpha) +{ + typedef typename packet_traits<Scalar>::type Packet; + typedef typename NumTraits<Scalar>::Real RealScalar; + const Index PacketSize = sizeof(Packet)/sizeof(Scalar); + + enum { + IsRowMajor = StorageOrder==RowMajor ? 1 : 0, + IsLower = UpLo == Lower ? 1 : 0, + FirstTriangular = IsRowMajor == IsLower + }; + + conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, IsRowMajor), ConjugateRhs> cj0; + conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> cj1; + conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex, ConjugateRhs> cjd; + + conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, IsRowMajor), ConjugateRhs> pcj0; + conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> pcj1; + + Scalar cjAlpha = ConjugateRhs ? conj(alpha) : alpha; + + // FIXME this copy is now handled outside product_selfadjoint_vector, so it could probably be removed. + // if the rhs is not sequentially stored in memory we copy it to a temporary buffer, + // this is because we need to extract packets + ei_declare_aligned_stack_constructed_variable(Scalar,rhs,size,rhsIncr==1 ? const_cast<Scalar*>(_rhs) : 0); + if (rhsIncr!=1) + { + const Scalar* it = _rhs; + for (Index i=0; i<size; ++i, it+=rhsIncr) + rhs[i] = *it; + } + + Index bound = (std::max)(Index(0),size-8) & 0xfffffffe; + if (FirstTriangular) + bound = size - bound; + + for (Index j=FirstTriangular ? bound : 0; + j<(FirstTriangular ? size : bound);j+=2) + { + register const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride; + register const Scalar* EIGEN_RESTRICT A1 = lhs + (j+1)*lhsStride; + + Scalar t0 = cjAlpha * rhs[j]; + Packet ptmp0 = pset1<Packet>(t0); + Scalar t1 = cjAlpha * rhs[j+1]; + Packet ptmp1 = pset1<Packet>(t1); + + Scalar t2 = 0; + Packet ptmp2 = pset1<Packet>(t2); + Scalar t3 = 0; + Packet ptmp3 = pset1<Packet>(t3); + + size_t starti = FirstTriangular ? 0 : j+2; + size_t endi = FirstTriangular ? j : size; + size_t alignedStart = (starti) + first_aligned(&res[starti], endi-starti); + size_t alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize); + + // TODO make sure this product is a real * complex and that the rhs is properly conjugated if needed + res[j] += cjd.pmul(internal::real(A0[j]), t0); + res[j+1] += cjd.pmul(internal::real(A1[j+1]), t1); + if(FirstTriangular) + { + res[j] += cj0.pmul(A1[j], t1); + t3 += cj1.pmul(A1[j], rhs[j]); + } + else + { + res[j+1] += cj0.pmul(A0[j+1],t0); + t2 += cj1.pmul(A0[j+1], rhs[j+1]); + } + + for (size_t i=starti; i<alignedStart; ++i) + { + res[i] += t0 * A0[i] + t1 * A1[i]; + t2 += conj(A0[i]) * rhs[i]; + t3 += conj(A1[i]) * rhs[i]; + } + // Yes this an optimization for gcc 4.3 and 4.4 (=> huge speed up) + // gcc 4.2 does this optimization automatically. + const Scalar* EIGEN_RESTRICT a0It = A0 + alignedStart; + const Scalar* EIGEN_RESTRICT a1It = A1 + alignedStart; + const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart; + Scalar* EIGEN_RESTRICT resIt = res + alignedStart; + for (size_t i=alignedStart; i<alignedEnd; i+=PacketSize) + { + Packet A0i = ploadu<Packet>(a0It); a0It += PacketSize; + Packet A1i = ploadu<Packet>(a1It); a1It += PacketSize; + Packet Bi = ploadu<Packet>(rhsIt); rhsIt += PacketSize; // FIXME should be aligned in most cases + Packet Xi = pload <Packet>(resIt); + + Xi = pcj0.pmadd(A0i,ptmp0, pcj0.pmadd(A1i,ptmp1,Xi)); + ptmp2 = pcj1.pmadd(A0i, Bi, ptmp2); + ptmp3 = pcj1.pmadd(A1i, Bi, ptmp3); + pstore(resIt,Xi); resIt += PacketSize; + } + for (size_t i=alignedEnd; i<endi; i++) + { + res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i],t1); + t2 += cj1.pmul(A0[i], rhs[i]); + t3 += cj1.pmul(A1[i], rhs[i]); + } + + res[j] += alpha * (t2 + predux(ptmp2)); + res[j+1] += alpha * (t3 + predux(ptmp3)); + } + for (Index j=FirstTriangular ? 0 : bound;j<(FirstTriangular ? bound : size);j++) + { + register const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride; + + Scalar t1 = cjAlpha * rhs[j]; + Scalar t2 = 0; + // TODO make sure this product is a real * complex and that the rhs is properly conjugated if needed + res[j] += cjd.pmul(internal::real(A0[j]), t1); + for (Index i=FirstTriangular ? 0 : j+1; i<(FirstTriangular ? j : size); i++) + { + res[i] += cj0.pmul(A0[i], t1); + t2 += cj1.pmul(A0[i], rhs[i]); + } + res[j] += alpha * t2; + } +} + +} // end namespace internal + +/*************************************************************************** +* Wrapper to product_selfadjoint_vector +***************************************************************************/ + +namespace internal { +template<typename Lhs, int LhsMode, typename Rhs> +struct traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true> > + : traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs> > +{}; +} + +template<typename Lhs, int LhsMode, typename Rhs> +struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true> + : public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs > +{ + EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix) + + enum { + LhsUpLo = LhsMode&(Upper|Lower) + }; + + SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {} + + template<typename Dest> void scaleAndAddTo(Dest& dest, Scalar alpha) const + { + typedef typename Dest::Scalar ResScalar; + typedef typename Base::RhsScalar RhsScalar; + typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest; + + eigen_assert(dest.rows()==m_lhs.rows() && dest.cols()==m_rhs.cols()); + + const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs); + const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs); + + Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) + * RhsBlasTraits::extractScalarFactor(m_rhs); + + enum { + EvalToDest = (Dest::InnerStrideAtCompileTime==1), + UseRhs = (_ActualRhsType::InnerStrideAtCompileTime==1) + }; + + internal::gemv_static_vector_if<ResScalar,Dest::SizeAtCompileTime,Dest::MaxSizeAtCompileTime,!EvalToDest> static_dest; + internal::gemv_static_vector_if<RhsScalar,_ActualRhsType::SizeAtCompileTime,_ActualRhsType::MaxSizeAtCompileTime,!UseRhs> static_rhs; + + ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(), + EvalToDest ? dest.data() : static_dest.data()); + + ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhsPtr,rhs.size(), + UseRhs ? const_cast<RhsScalar*>(rhs.data()) : static_rhs.data()); + + if(!EvalToDest) + { + #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN + int size = dest.size(); + EIGEN_DENSE_STORAGE_CTOR_PLUGIN + #endif + MappedDest(actualDestPtr, dest.size()) = dest; + } + + if(!UseRhs) + { + #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN + int size = rhs.size(); + EIGEN_DENSE_STORAGE_CTOR_PLUGIN + #endif + Map<typename _ActualRhsType::PlainObject>(actualRhsPtr, rhs.size()) = rhs; + } + + + internal::product_selfadjoint_vector<Scalar, Index, (internal::traits<_ActualLhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate), bool(RhsBlasTraits::NeedToConjugate)> + ( + lhs.rows(), // size + &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info + actualRhsPtr, 1, // rhs info + actualDestPtr, // result info + actualAlpha // scale factor + ); + + if(!EvalToDest) + dest = MappedDest(actualDestPtr, dest.size()); + } +}; + +namespace internal { +template<typename Lhs, typename Rhs, int RhsMode> +struct traits<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false> > + : traits<ProductBase<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>, Lhs, Rhs> > +{}; +} + +template<typename Lhs, typename Rhs, int RhsMode> +struct SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false> + : public ProductBase<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>, Lhs, Rhs > +{ + EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix) + + enum { + RhsUpLo = RhsMode&(Upper|Lower) + }; + + SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {} + + template<typename Dest> void scaleAndAddTo(Dest& dest, Scalar alpha) const + { + // let's simply transpose the product + Transpose<Dest> destT(dest); + SelfadjointProductMatrix<Transpose<const Rhs>, int(RhsUpLo)==Upper ? Lower : Upper, false, + Transpose<const Lhs>, 0, true>(m_rhs.transpose(), m_lhs.transpose()).scaleAndAddTo(destT, alpha); + } +}; + + +#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_H diff --git a/extern/Eigen3/Eigen/src/Core/products/SelfadjointProduct.h b/extern/Eigen3/Eigen/src/Core/products/SelfadjointProduct.h new file mode 100644 index 00000000000..3a4523fa4a9 --- /dev/null +++ b/extern/Eigen3/Eigen/src/Core/products/SelfadjointProduct.h @@ -0,0 +1,136 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see <http://www.gnu.org/licenses/>. + +#ifndef EIGEN_SELFADJOINT_PRODUCT_H +#define EIGEN_SELFADJOINT_PRODUCT_H + +/********************************************************************** +* This file implements a self adjoint product: C += A A^T updating only +* half of the selfadjoint matrix C. +* It corresponds to the level 3 SYRK and level 2 SYR Blas routines. +**********************************************************************/ + +template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjLhs, bool ConjRhs> +struct selfadjoint_rank1_update; + +template<typename Scalar, typename Index, int UpLo, bool ConjLhs, bool ConjRhs> +struct selfadjoint_rank1_update<Scalar,Index,ColMajor,UpLo,ConjLhs,ConjRhs> +{ + static void run(Index size, Scalar* mat, Index stride, const Scalar* vec, Scalar alpha) + { + internal::conj_if<ConjRhs> cj; + typedef Map<const Matrix<Scalar,Dynamic,1> > OtherMap; + typedef typename internal::conditional<ConjLhs,typename OtherMap::ConjugateReturnType,const OtherMap&>::type ConjRhsType; + for (Index i=0; i<size; ++i) + { + Map<Matrix<Scalar,Dynamic,1> >(mat+stride*i+(UpLo==Lower ? i : 0), (UpLo==Lower ? size-i : (i+1))) + += (alpha * cj(vec[i])) * ConjRhsType(OtherMap(vec+(UpLo==Lower ? i : 0),UpLo==Lower ? size-i : (i+1))); + } + } +}; + +template<typename Scalar, typename Index, int UpLo, bool ConjLhs, bool ConjRhs> +struct selfadjoint_rank1_update<Scalar,Index,RowMajor,UpLo,ConjLhs,ConjRhs> +{ + static void run(Index size, Scalar* mat, Index stride, const Scalar* vec, Scalar alpha) + { + selfadjoint_rank1_update<Scalar,Index,ColMajor,UpLo==Lower?Upper:Lower,ConjRhs,ConjLhs>::run(size,mat,stride,vec,alpha); + } +}; + +template<typename MatrixType, typename OtherType, int UpLo, bool OtherIsVector = OtherType::IsVectorAtCompileTime> +struct selfadjoint_product_selector; + +template<typename MatrixType, typename OtherType, int UpLo> +struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,true> +{ + static void run(MatrixType& mat, const OtherType& other, typename MatrixType::Scalar alpha) + { + typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::Index Index; + typedef internal::blas_traits<OtherType> OtherBlasTraits; + typedef typename OtherBlasTraits::DirectLinearAccessType ActualOtherType; + typedef typename internal::remove_all<ActualOtherType>::type _ActualOtherType; + const ActualOtherType actualOther = OtherBlasTraits::extract(other.derived()); + + Scalar actualAlpha = alpha * OtherBlasTraits::extractScalarFactor(other.derived()); + + enum { + StorageOrder = (internal::traits<MatrixType>::Flags&RowMajorBit) ? RowMajor : ColMajor, + UseOtherDirectly = _ActualOtherType::InnerStrideAtCompileTime==1 + }; + internal::gemv_static_vector_if<Scalar,OtherType::SizeAtCompileTime,OtherType::MaxSizeAtCompileTime,!UseOtherDirectly> static_other; + + ei_declare_aligned_stack_constructed_variable(Scalar, actualOtherPtr, other.size(), + (UseOtherDirectly ? const_cast<Scalar*>(actualOther.data()) : static_other.data())); + + if(!UseOtherDirectly) + Map<typename _ActualOtherType::PlainObject>(actualOtherPtr, actualOther.size()) = actualOther; + + selfadjoint_rank1_update<Scalar,Index,StorageOrder,UpLo, + OtherBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex, + (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex> + ::run(other.size(), mat.data(), mat.outerStride(), actualOtherPtr, actualAlpha); + } +}; + +template<typename MatrixType, typename OtherType, int UpLo> +struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,false> +{ + static void run(MatrixType& mat, const OtherType& other, typename MatrixType::Scalar alpha) + { + typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::Index Index; + typedef internal::blas_traits<OtherType> OtherBlasTraits; + typedef typename OtherBlasTraits::DirectLinearAccessType ActualOtherType; + typedef typename internal::remove_all<ActualOtherType>::type _ActualOtherType; + const ActualOtherType actualOther = OtherBlasTraits::extract(other.derived()); + + Scalar actualAlpha = alpha * OtherBlasTraits::extractScalarFactor(other.derived()); + + enum { IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0 }; + + internal::general_matrix_matrix_triangular_product<Index, + Scalar, _ActualOtherType::Flags&RowMajorBit ? RowMajor : ColMajor, OtherBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex, + Scalar, _ActualOtherType::Flags&RowMajorBit ? ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex, + MatrixType::Flags&RowMajorBit ? RowMajor : ColMajor, UpLo> + ::run(mat.cols(), actualOther.cols(), + &actualOther.coeffRef(0,0), actualOther.outerStride(), &actualOther.coeffRef(0,0), actualOther.outerStride(), + mat.data(), mat.outerStride(), actualAlpha); + } +}; + +// high level API + +template<typename MatrixType, unsigned int UpLo> +template<typename DerivedU> +SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo> +::rankUpdate(const MatrixBase<DerivedU>& u, Scalar alpha) +{ + selfadjoint_product_selector<MatrixType,DerivedU,UpLo>::run(_expression().const_cast_derived(), u.derived(), alpha); + + return *this; +} + +#endif // EIGEN_SELFADJOINT_PRODUCT_H diff --git a/extern/Eigen3/Eigen/src/Core/products/SelfadjointRank2Update.h b/extern/Eigen3/Eigen/src/Core/products/SelfadjointRank2Update.h new file mode 100644 index 00000000000..9f8b8438a5d --- /dev/null +++ b/extern/Eigen3/Eigen/src/Core/products/SelfadjointRank2Update.h @@ -0,0 +1,104 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see <http://www.gnu.org/licenses/>. + +#ifndef EIGEN_SELFADJOINTRANK2UPTADE_H +#define EIGEN_SELFADJOINTRANK2UPTADE_H + +namespace internal { + +/* Optimized selfadjoint matrix += alpha * uv' + conj(alpha)*vu' + * It corresponds to the Level2 syr2 BLAS routine + */ + +template<typename Scalar, typename Index, typename UType, typename VType, int UpLo> +struct selfadjoint_rank2_update_selector; + +template<typename Scalar, typename Index, typename UType, typename VType> +struct selfadjoint_rank2_update_selector<Scalar,Index,UType,VType,Lower> +{ + static void run(Scalar* mat, Index stride, const UType& u, const VType& v, Scalar alpha) + { + const Index size = u.size(); + for (Index i=0; i<size; ++i) + { + Map<Matrix<Scalar,Dynamic,1> >(mat+stride*i+i, size-i) += + (conj(alpha) * conj(u.coeff(i))) * v.tail(size-i) + + (alpha * conj(v.coeff(i))) * u.tail(size-i); + } + } +}; + +template<typename Scalar, typename Index, typename UType, typename VType> +struct selfadjoint_rank2_update_selector<Scalar,Index,UType,VType,Upper> +{ + static void run(Scalar* mat, Index stride, const UType& u, const VType& v, Scalar alpha) + { + const Index size = u.size(); + for (Index i=0; i<size; ++i) + Map<Matrix<Scalar,Dynamic,1> >(mat+stride*i, i+1) += + (conj(alpha) * conj(u.coeff(i))) * v.head(i+1) + + (alpha * conj(v.coeff(i))) * u.head(i+1); + } +}; + +template<bool Cond, typename T> struct conj_expr_if + : conditional<!Cond, const T&, + CwiseUnaryOp<scalar_conjugate_op<typename traits<T>::Scalar>,T> > {}; + +} // end namespace internal + +template<typename MatrixType, unsigned int UpLo> +template<typename DerivedU, typename DerivedV> +SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo> +::rankUpdate(const MatrixBase<DerivedU>& u, const MatrixBase<DerivedV>& v, Scalar alpha) +{ + typedef internal::blas_traits<DerivedU> UBlasTraits; + typedef typename UBlasTraits::DirectLinearAccessType ActualUType; + typedef typename internal::remove_all<ActualUType>::type _ActualUType; + const ActualUType actualU = UBlasTraits::extract(u.derived()); + + typedef internal::blas_traits<DerivedV> VBlasTraits; + typedef typename VBlasTraits::DirectLinearAccessType ActualVType; + typedef typename internal::remove_all<ActualVType>::type _ActualVType; + const ActualVType actualV = VBlasTraits::extract(v.derived()); + + // If MatrixType is row major, then we use the routine for lower triangular in the upper triangular case and + // vice versa, and take the complex conjugate of all coefficients and vector entries. + + enum { IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0 }; + Scalar actualAlpha = alpha * UBlasTraits::extractScalarFactor(u.derived()) + * internal::conj(VBlasTraits::extractScalarFactor(v.derived())); + if (IsRowMajor) + actualAlpha = internal::conj(actualAlpha); + + internal::selfadjoint_rank2_update_selector<Scalar, Index, + typename internal::remove_all<typename internal::conj_expr_if<IsRowMajor ^ UBlasTraits::NeedToConjugate,_ActualUType>::type>::type, + typename internal::remove_all<typename internal::conj_expr_if<IsRowMajor ^ VBlasTraits::NeedToConjugate,_ActualVType>::type>::type, + (IsRowMajor ? int(UpLo==Upper ? Lower : Upper) : UpLo)> + ::run(_expression().const_cast_derived().data(),_expression().outerStride(),actualU,actualV,actualAlpha); + + return *this; +} + +#endif // EIGEN_SELFADJOINTRANK2UPTADE_H diff --git a/extern/Eigen3/Eigen/src/Core/products/TriangularMatrixMatrix.h b/extern/Eigen3/Eigen/src/Core/products/TriangularMatrixMatrix.h new file mode 100644 index 00000000000..0c48d2efb75 --- /dev/null +++ b/extern/Eigen3/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -0,0 +1,403 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see <http://www.gnu.org/licenses/>. + +#ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_H +#define EIGEN_TRIANGULAR_MATRIX_MATRIX_H + +namespace internal { + +// template<typename Scalar, int mr, int StorageOrder, bool Conjugate, int Mode> +// struct gemm_pack_lhs_triangular +// { +// Matrix<Scalar,mr,mr, +// void operator()(Scalar* blockA, const EIGEN_RESTRICT Scalar* _lhs, int lhsStride, int depth, int rows) +// { +// conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; +// const_blas_data_mapper<Scalar, StorageOrder> lhs(_lhs,lhsStride); +// int count = 0; +// const int peeled_mc = (rows/mr)*mr; +// for(int i=0; i<peeled_mc; i+=mr) +// { +// for(int k=0; k<depth; k++) +// for(int w=0; w<mr; w++) +// blockA[count++] = cj(lhs(i+w, k)); +// } +// for(int i=peeled_mc; i<rows; i++) +// { +// for(int k=0; k<depth; k++) +// blockA[count++] = cj(lhs(i, k)); +// } +// } +// }; + +/* Optimized triangular matrix * matrix (_TRMM++) product built on top of + * the general matrix matrix product. + */ +template <typename Scalar, typename Index, + int Mode, bool LhsIsTriangular, + int LhsStorageOrder, bool ConjugateLhs, + int RhsStorageOrder, bool ConjugateRhs, + int ResStorageOrder> +struct product_triangular_matrix_matrix; + +template <typename Scalar, typename Index, + int Mode, bool LhsIsTriangular, + int LhsStorageOrder, bool ConjugateLhs, + int RhsStorageOrder, bool ConjugateRhs> +struct product_triangular_matrix_matrix<Scalar,Index,Mode,LhsIsTriangular, + LhsStorageOrder,ConjugateLhs, + RhsStorageOrder,ConjugateRhs,RowMajor> +{ + static EIGEN_STRONG_INLINE void run( + Index rows, Index cols, Index depth, + const Scalar* lhs, Index lhsStride, + const Scalar* rhs, Index rhsStride, + Scalar* res, Index resStride, + Scalar alpha) + { + product_triangular_matrix_matrix<Scalar, Index, + (Mode&(UnitDiag|ZeroDiag)) | ((Mode&Upper) ? Lower : Upper), + (!LhsIsTriangular), + RhsStorageOrder==RowMajor ? ColMajor : RowMajor, + ConjugateRhs, + LhsStorageOrder==RowMajor ? ColMajor : RowMajor, + ConjugateLhs, + ColMajor> + ::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha); + } +}; + +// implements col-major += alpha * op(triangular) * op(general) +template <typename Scalar, typename Index, int Mode, + int LhsStorageOrder, bool ConjugateLhs, + int RhsStorageOrder, bool ConjugateRhs> +struct product_triangular_matrix_matrix<Scalar,Index,Mode,true, + LhsStorageOrder,ConjugateLhs, + RhsStorageOrder,ConjugateRhs,ColMajor> +{ + + typedef gebp_traits<Scalar,Scalar> Traits; + enum { + SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr), + IsLower = (Mode&Lower) == Lower, + SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1 + }; + + static EIGEN_DONT_INLINE void run( + Index _rows, Index _cols, Index _depth, + const Scalar* _lhs, Index lhsStride, + const Scalar* _rhs, Index rhsStride, + Scalar* res, Index resStride, + Scalar alpha) + { + // strip zeros + Index diagSize = (std::min)(_rows,_depth); + Index rows = IsLower ? _rows : diagSize; + Index depth = IsLower ? diagSize : _depth; + Index cols = _cols; + + const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); + const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); + + Index kc = depth; // cache block size along the K direction + Index mc = rows; // cache block size along the M direction + Index nc = cols; // cache block size along the N direction + computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc); + std::size_t sizeW = kc*Traits::WorkSpaceFactor; + std::size_t sizeB = sizeW + kc*cols; + ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); + ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); + Scalar* blockB = allocatedBlockB + sizeW; + + Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer; + triangularBuffer.setZero(); + if((Mode&ZeroDiag)==ZeroDiag) + triangularBuffer.diagonal().setZero(); + else + triangularBuffer.diagonal().setOnes(); + + gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; + gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; + gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs; + + for(Index k2=IsLower ? depth : 0; + IsLower ? k2>0 : k2<depth; + IsLower ? k2-=kc : k2+=kc) + { + Index actual_kc = (std::min)(IsLower ? k2 : depth-k2, kc); + Index actual_k2 = IsLower ? k2-actual_kc : k2; + + // align blocks with the end of the triangular part for trapezoidal lhs + if((!IsLower)&&(k2<rows)&&(k2+actual_kc>rows)) + { + actual_kc = rows-k2; + k2 = k2+actual_kc-kc; + } + + pack_rhs(blockB, &rhs(actual_k2,0), rhsStride, actual_kc, cols); + + // the selected lhs's panel has to be split in three different parts: + // 1 - the part which is zero => skip it + // 2 - the diagonal block => special kernel + // 3 - the dense panel below (lower case) or above (upper case) the diagonal block => GEPP + + // the block diagonal, if any: + if(IsLower || actual_k2<rows) + { + // for each small vertical panels of lhs + for (Index k1=0; k1<actual_kc; k1+=SmallPanelWidth) + { + Index actualPanelWidth = std::min<Index>(actual_kc-k1, SmallPanelWidth); + Index lengthTarget = IsLower ? actual_kc-k1-actualPanelWidth : k1; + Index startBlock = actual_k2+k1; + Index blockBOffset = k1; + + // => GEBP with the micro triangular block + // The trick is to pack this micro block while filling the opposite triangular part with zeros. + // To this end we do an extra triangular copy to a small temporary buffer + for (Index k=0;k<actualPanelWidth;++k) + { + if (SetDiag) + triangularBuffer.coeffRef(k,k) = lhs(startBlock+k,startBlock+k); + for (Index i=IsLower ? k+1 : 0; IsLower ? i<actualPanelWidth : i<k; ++i) + triangularBuffer.coeffRef(i,k) = lhs(startBlock+i,startBlock+k); + } + pack_lhs(blockA, triangularBuffer.data(), triangularBuffer.outerStride(), actualPanelWidth, actualPanelWidth); + + gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols, alpha, + actualPanelWidth, actual_kc, 0, blockBOffset); + + // GEBP with remaining micro panel + if (lengthTarget>0) + { + Index startTarget = IsLower ? actual_k2+k1+actualPanelWidth : actual_k2; + + pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget); + + gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, alpha, + actualPanelWidth, actual_kc, 0, blockBOffset); + } + } + } + // the part below (lower case) or above (upper case) the diagonal => GEPP + { + Index start = IsLower ? k2 : 0; + Index end = IsLower ? rows : (std::min)(actual_k2,rows); + for(Index i2=start; i2<end; i2+=mc) + { + const Index actual_mc = (std::min)(i2+mc,end)-i2; + gemm_pack_lhs<Scalar, Index, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>() + (blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc); + + gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); + } + } + } + } +}; + +// implements col-major += alpha * op(general) * op(triangular) +template <typename Scalar, typename Index, int Mode, + int LhsStorageOrder, bool ConjugateLhs, + int RhsStorageOrder, bool ConjugateRhs> +struct product_triangular_matrix_matrix<Scalar,Index,Mode,false, + LhsStorageOrder,ConjugateLhs, + RhsStorageOrder,ConjugateRhs,ColMajor> +{ + typedef gebp_traits<Scalar,Scalar> Traits; + enum { + SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr), + IsLower = (Mode&Lower) == Lower, + SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1 + }; + + static EIGEN_DONT_INLINE void run( + Index _rows, Index _cols, Index _depth, + const Scalar* _lhs, Index lhsStride, + const Scalar* _rhs, Index rhsStride, + Scalar* res, Index resStride, + Scalar alpha) + { + // strip zeros + Index diagSize = (std::min)(_cols,_depth); + Index rows = _rows; + Index depth = IsLower ? _depth : diagSize; + Index cols = IsLower ? diagSize : _cols; + + const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); + const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); + + Index kc = depth; // cache block size along the K direction + Index mc = rows; // cache block size along the M direction + Index nc = cols; // cache block size along the N direction + computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc); + + std::size_t sizeW = kc*Traits::WorkSpaceFactor; + std::size_t sizeB = sizeW + kc*cols; + ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); + ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); + Scalar* blockB = allocatedBlockB + sizeW; + + Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer; + triangularBuffer.setZero(); + if((Mode&ZeroDiag)==ZeroDiag) + triangularBuffer.diagonal().setZero(); + else + triangularBuffer.diagonal().setOnes(); + + gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; + gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; + gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs; + gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel; + + for(Index k2=IsLower ? 0 : depth; + IsLower ? k2<depth : k2>0; + IsLower ? k2+=kc : k2-=kc) + { + Index actual_kc = (std::min)(IsLower ? depth-k2 : k2, kc); + Index actual_k2 = IsLower ? k2 : k2-actual_kc; + + // align blocks with the end of the triangular part for trapezoidal rhs + if(IsLower && (k2<cols) && (actual_k2+actual_kc>cols)) + { + actual_kc = cols-k2; + k2 = actual_k2 + actual_kc - kc; + } + + // remaining size + Index rs = IsLower ? (std::min)(cols,actual_k2) : cols - k2; + // size of the triangular part + Index ts = (IsLower && actual_k2>=cols) ? 0 : actual_kc; + + Scalar* geb = blockB+ts*ts; + + pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, actual_kc, rs); + + // pack the triangular part of the rhs padding the unrolled blocks with zeros + if(ts>0) + { + for (Index j2=0; j2<actual_kc; j2+=SmallPanelWidth) + { + Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth); + Index actual_j2 = actual_k2 + j2; + Index panelOffset = IsLower ? j2+actualPanelWidth : 0; + Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2; + // general part + pack_rhs_panel(blockB+j2*actual_kc, + &rhs(actual_k2+panelOffset, actual_j2), rhsStride, + panelLength, actualPanelWidth, + actual_kc, panelOffset); + + // append the triangular part via a temporary buffer + for (Index j=0;j<actualPanelWidth;++j) + { + if (SetDiag) + triangularBuffer.coeffRef(j,j) = rhs(actual_j2+j,actual_j2+j); + for (Index k=IsLower ? j+1 : 0; IsLower ? k<actualPanelWidth : k<j; ++k) + triangularBuffer.coeffRef(k,j) = rhs(actual_j2+k,actual_j2+j); + } + + pack_rhs_panel(blockB+j2*actual_kc, + triangularBuffer.data(), triangularBuffer.outerStride(), + actualPanelWidth, actualPanelWidth, + actual_kc, j2); + } + } + + for (Index i2=0; i2<rows; i2+=mc) + { + const Index actual_mc = (std::min)(mc,rows-i2); + pack_lhs(blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc); + + // triangular kernel + if(ts>0) + { + for (Index j2=0; j2<actual_kc; j2+=SmallPanelWidth) + { + Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth); + Index panelLength = IsLower ? actual_kc-j2 : j2+actualPanelWidth; + Index blockOffset = IsLower ? j2 : 0; + + gebp_kernel(res+i2+(actual_k2+j2)*resStride, resStride, + blockA, blockB+j2*actual_kc, + actual_mc, panelLength, actualPanelWidth, + alpha, + actual_kc, actual_kc, // strides + blockOffset, blockOffset,// offsets + allocatedBlockB); // workspace + } + } + gebp_kernel(res+i2+(IsLower ? 0 : k2)*resStride, resStride, + blockA, geb, actual_mc, actual_kc, rs, + alpha, + -1, -1, 0, 0, allocatedBlockB); + } + } + } +}; + +/*************************************************************************** +* Wrapper to product_triangular_matrix_matrix +***************************************************************************/ + +template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs> +struct traits<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false> > + : traits<ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>, Lhs, Rhs> > +{}; + +} // end namespace internal + +template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs> +struct TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false> + : public ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>, Lhs, Rhs > +{ + EIGEN_PRODUCT_PUBLIC_INTERFACE(TriangularProduct) + + TriangularProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {} + + template<typename Dest> void scaleAndAddTo(Dest& dst, Scalar alpha) const + { + const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs); + const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs); + + Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) + * RhsBlasTraits::extractScalarFactor(m_rhs); + + internal::product_triangular_matrix_matrix<Scalar, Index, + Mode, LhsIsTriangular, + (internal::traits<_ActualLhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate, + (internal::traits<_ActualRhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate, + (internal::traits<Dest >::Flags&RowMajorBit) ? RowMajor : ColMajor> + ::run( + lhs.rows(), rhs.cols(), lhs.cols(),// LhsIsTriangular ? rhs.cols() : lhs.rows(), // sizes + &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info + &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info + &dst.coeffRef(0,0), dst.outerStride(), // result info + actualAlpha // alpha + ); + } +}; + + +#endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_H diff --git a/extern/Eigen3/Eigen/src/Core/products/TriangularMatrixVector.h b/extern/Eigen3/Eigen/src/Core/products/TriangularMatrixVector.h new file mode 100644 index 00000000000..71b4a52ab80 --- /dev/null +++ b/extern/Eigen3/Eigen/src/Core/products/TriangularMatrixVector.h @@ -0,0 +1,325 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see <http://www.gnu.org/licenses/>. + +#ifndef EIGEN_TRIANGULARMATRIXVECTOR_H +#define EIGEN_TRIANGULARMATRIXVECTOR_H + +namespace internal { + +template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int StorageOrder> +struct product_triangular_matrix_vector; + +template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs> +struct product_triangular_matrix_vector<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor> +{ + typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; + enum { + IsLower = ((Mode&Lower)==Lower), + HasUnitDiag = (Mode & UnitDiag)==UnitDiag + }; + static EIGEN_DONT_INLINE void run(Index rows, Index cols, const LhsScalar* _lhs, Index lhsStride, + const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, ResScalar alpha) + { + static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH; + + typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap; + const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride)); + typename conj_expr_if<ConjLhs,LhsMap>::type cjLhs(lhs); + + typedef Map<const Matrix<RhsScalar,Dynamic,1>, 0, InnerStride<> > RhsMap; + const RhsMap rhs(_rhs,cols,InnerStride<>(rhsIncr)); + typename conj_expr_if<ConjRhs,RhsMap>::type cjRhs(rhs); + + typedef Map<Matrix<ResScalar,Dynamic,1> > ResMap; + ResMap res(_res,rows); + + for (Index pi=0; pi<cols; pi+=PanelWidth) + { + Index actualPanelWidth = (std::min)(PanelWidth, cols-pi); + for (Index k=0; k<actualPanelWidth; ++k) + { + Index i = pi + k; + Index s = IsLower ? (HasUnitDiag ? i+1 : i ) : pi; + Index r = IsLower ? actualPanelWidth-k : k+1; + if ((!HasUnitDiag) || (--r)>0) + res.segment(s,r) += (alpha * cjRhs.coeff(i)) * cjLhs.col(i).segment(s,r); + if (HasUnitDiag) + res.coeffRef(i) += alpha * cjRhs.coeff(i); + } + Index r = IsLower ? cols - pi - actualPanelWidth : pi; + if (r>0) + { + Index s = IsLower ? pi+actualPanelWidth : 0; + general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjLhs,RhsScalar,ConjRhs>::run( + r, actualPanelWidth, + &lhs.coeffRef(s,pi), lhsStride, + &rhs.coeffRef(pi), rhsIncr, + &res.coeffRef(s), resIncr, alpha); + } + } + } +}; + +template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs> +struct product_triangular_matrix_vector<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,RowMajor> +{ + typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; + enum { + IsLower = ((Mode&Lower)==Lower), + HasUnitDiag = (Mode & UnitDiag)==UnitDiag + }; + static void run(Index rows, Index cols, const LhsScalar* _lhs, Index lhsStride, + const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, ResScalar alpha) + { + static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH; + + typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,RowMajor>, 0, OuterStride<> > LhsMap; + const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride)); + typename conj_expr_if<ConjLhs,LhsMap>::type cjLhs(lhs); + + typedef Map<const Matrix<RhsScalar,Dynamic,1> > RhsMap; + const RhsMap rhs(_rhs,cols); + typename conj_expr_if<ConjRhs,RhsMap>::type cjRhs(rhs); + + typedef Map<Matrix<ResScalar,Dynamic,1>, 0, InnerStride<> > ResMap; + ResMap res(_res,rows,InnerStride<>(resIncr)); + + for (Index pi=0; pi<cols; pi+=PanelWidth) + { + Index actualPanelWidth = (std::min)(PanelWidth, cols-pi); + for (Index k=0; k<actualPanelWidth; ++k) + { + Index i = pi + k; + Index s = IsLower ? pi : (HasUnitDiag ? i+1 : i); + Index r = IsLower ? k+1 : actualPanelWidth-k; + if ((!HasUnitDiag) || (--r)>0) + res.coeffRef(i) += alpha * (cjLhs.row(i).segment(s,r).cwiseProduct(cjRhs.segment(s,r).transpose())).sum(); + if (HasUnitDiag) + res.coeffRef(i) += alpha * cjRhs.coeff(i); + } + Index r = IsLower ? pi : cols - pi - actualPanelWidth; + if (r>0) + { + Index s = IsLower ? 0 : pi + actualPanelWidth; + general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjLhs,RhsScalar,ConjRhs>::run( + actualPanelWidth, r, + &lhs.coeffRef(pi,s), lhsStride, + &rhs.coeffRef(s), rhsIncr, + &res.coeffRef(pi), resIncr, alpha); + } + } + } +}; + +/*************************************************************************** +* Wrapper to product_triangular_vector +***************************************************************************/ + +template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs> +struct traits<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,true> > + : traits<ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,true>, Lhs, Rhs> > +{}; + +template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs> +struct traits<TriangularProduct<Mode,LhsIsTriangular,Lhs,true,Rhs,false> > + : traits<ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,true,Rhs,false>, Lhs, Rhs> > +{}; + + +template<int StorageOrder> +struct trmv_selector; + +} // end namespace internal + +template<int Mode, typename Lhs, typename Rhs> +struct TriangularProduct<Mode,true,Lhs,false,Rhs,true> + : public ProductBase<TriangularProduct<Mode,true,Lhs,false,Rhs,true>, Lhs, Rhs > +{ + EIGEN_PRODUCT_PUBLIC_INTERFACE(TriangularProduct) + + TriangularProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {} + + template<typename Dest> void scaleAndAddTo(Dest& dst, Scalar alpha) const + { + eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols()); + + internal::trmv_selector<(int(internal::traits<Lhs>::Flags)&RowMajorBit) ? RowMajor : ColMajor>::run(*this, dst, alpha); + } +}; + +template<int Mode, typename Lhs, typename Rhs> +struct TriangularProduct<Mode,false,Lhs,true,Rhs,false> + : public ProductBase<TriangularProduct<Mode,false,Lhs,true,Rhs,false>, Lhs, Rhs > +{ + EIGEN_PRODUCT_PUBLIC_INTERFACE(TriangularProduct) + + TriangularProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {} + + template<typename Dest> void scaleAndAddTo(Dest& dst, Scalar alpha) const + { + eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols()); + + typedef TriangularProduct<(Mode & UnitDiag) | ((Mode & Lower) ? Upper : Lower),true,Transpose<const Rhs>,false,Transpose<const Lhs>,true> TriangularProductTranspose; + Transpose<Dest> dstT(dst); + internal::trmv_selector<(int(internal::traits<Rhs>::Flags)&RowMajorBit) ? ColMajor : RowMajor>::run( + TriangularProductTranspose(m_rhs.transpose(),m_lhs.transpose()), dstT, alpha); + } +}; + +namespace internal { + +// TODO: find a way to factorize this piece of code with gemv_selector since the logic is exactly the same. + +template<> struct trmv_selector<ColMajor> +{ + template<int Mode, typename Lhs, typename Rhs, typename Dest> + static void run(const TriangularProduct<Mode,true,Lhs,false,Rhs,true>& prod, Dest& dest, typename TriangularProduct<Mode,true,Lhs,false,Rhs,true>::Scalar alpha) + { + typedef TriangularProduct<Mode,true,Lhs,false,Rhs,true> ProductType; + typedef typename ProductType::Index Index; + typedef typename ProductType::LhsScalar LhsScalar; + typedef typename ProductType::RhsScalar RhsScalar; + typedef typename ProductType::Scalar ResScalar; + typedef typename ProductType::RealScalar RealScalar; + typedef typename ProductType::ActualLhsType ActualLhsType; + typedef typename ProductType::ActualRhsType ActualRhsType; + typedef typename ProductType::LhsBlasTraits LhsBlasTraits; + typedef typename ProductType::RhsBlasTraits RhsBlasTraits; + typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest; + + const ActualLhsType actualLhs = LhsBlasTraits::extract(prod.lhs()); + const ActualRhsType actualRhs = RhsBlasTraits::extract(prod.rhs()); + + ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs()) + * RhsBlasTraits::extractScalarFactor(prod.rhs()); + + enum { + // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1 + // on, the other hand it is good for the cache to pack the vector anyways... + EvalToDestAtCompileTime = Dest::InnerStrideAtCompileTime==1, + ComplexByReal = (NumTraits<LhsScalar>::IsComplex) && (!NumTraits<RhsScalar>::IsComplex), + MightCannotUseDest = (Dest::InnerStrideAtCompileTime!=1) || ComplexByReal + }; + + gemv_static_vector_if<ResScalar,Dest::SizeAtCompileTime,Dest::MaxSizeAtCompileTime,MightCannotUseDest> static_dest; + + bool alphaIsCompatible = (!ComplexByReal) || (imag(actualAlpha)==RealScalar(0)); + bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible; + + RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha); + + ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(), + evalToDest ? dest.data() : static_dest.data()); + + if(!evalToDest) + { + #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN + int size = dest.size(); + EIGEN_DENSE_STORAGE_CTOR_PLUGIN + #endif + if(!alphaIsCompatible) + { + MappedDest(actualDestPtr, dest.size()).setZero(); + compatibleAlpha = RhsScalar(1); + } + else + MappedDest(actualDestPtr, dest.size()) = dest; + } + + internal::product_triangular_matrix_vector + <Index,Mode, + LhsScalar, LhsBlasTraits::NeedToConjugate, + RhsScalar, RhsBlasTraits::NeedToConjugate, + ColMajor> + ::run(actualLhs.rows(),actualLhs.cols(), + actualLhs.data(),actualLhs.outerStride(), + actualRhs.data(),actualRhs.innerStride(), + actualDestPtr,1,compatibleAlpha); + + if (!evalToDest) + { + if(!alphaIsCompatible) + dest += actualAlpha * MappedDest(actualDestPtr, dest.size()); + else + dest = MappedDest(actualDestPtr, dest.size()); + } + } +}; + +template<> struct trmv_selector<RowMajor> +{ + template<int Mode, typename Lhs, typename Rhs, typename Dest> + static void run(const TriangularProduct<Mode,true,Lhs,false,Rhs,true>& prod, Dest& dest, typename TriangularProduct<Mode,true,Lhs,false,Rhs,true>::Scalar alpha) + { + typedef TriangularProduct<Mode,true,Lhs,false,Rhs,true> ProductType; + typedef typename ProductType::LhsScalar LhsScalar; + typedef typename ProductType::RhsScalar RhsScalar; + typedef typename ProductType::Scalar ResScalar; + typedef typename ProductType::Index Index; + typedef typename ProductType::ActualLhsType ActualLhsType; + typedef typename ProductType::ActualRhsType ActualRhsType; + typedef typename ProductType::_ActualRhsType _ActualRhsType; + typedef typename ProductType::LhsBlasTraits LhsBlasTraits; + typedef typename ProductType::RhsBlasTraits RhsBlasTraits; + + typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(prod.lhs()); + typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(prod.rhs()); + + ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs()) + * RhsBlasTraits::extractScalarFactor(prod.rhs()); + + enum { + DirectlyUseRhs = _ActualRhsType::InnerStrideAtCompileTime==1 + }; + + gemv_static_vector_if<RhsScalar,_ActualRhsType::SizeAtCompileTime,_ActualRhsType::MaxSizeAtCompileTime,!DirectlyUseRhs> static_rhs; + + ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhsPtr,actualRhs.size(), + DirectlyUseRhs ? const_cast<RhsScalar*>(actualRhs.data()) : static_rhs.data()); + + if(!DirectlyUseRhs) + { + #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN + int size = actualRhs.size(); + EIGEN_DENSE_STORAGE_CTOR_PLUGIN + #endif + Map<typename _ActualRhsType::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs; + } + + internal::product_triangular_matrix_vector + <Index,Mode, + LhsScalar, LhsBlasTraits::NeedToConjugate, + RhsScalar, RhsBlasTraits::NeedToConjugate, + RowMajor> + ::run(actualLhs.rows(),actualLhs.cols(), + actualLhs.data(),actualLhs.outerStride(), + actualRhsPtr,1, + dest.data(),dest.innerStride(), + actualAlpha); + } +}; + +} // end namespace internal + +#endif // EIGEN_TRIANGULARMATRIXVECTOR_H diff --git a/extern/Eigen3/Eigen/src/Core/products/TriangularSolverMatrix.h b/extern/Eigen3/Eigen/src/Core/products/TriangularSolverMatrix.h new file mode 100644 index 00000000000..4dced6b0eb9 --- /dev/null +++ b/extern/Eigen3/Eigen/src/Core/products/TriangularSolverMatrix.h @@ -0,0 +1,319 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see <http://www.gnu.org/licenses/>. + +#ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_H +#define EIGEN_TRIANGULAR_SOLVER_MATRIX_H + +namespace internal { + +// if the rhs is row major, let's transpose the product +template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder> +struct triangular_solve_matrix<Scalar,Index,Side,Mode,Conjugate,TriStorageOrder,RowMajor> +{ + static EIGEN_DONT_INLINE void run( + Index size, Index cols, + const Scalar* tri, Index triStride, + Scalar* _other, Index otherStride) + { + triangular_solve_matrix< + Scalar, Index, Side==OnTheLeft?OnTheRight:OnTheLeft, + (Mode&UnitDiag) | ((Mode&Upper) ? Lower : Upper), + NumTraits<Scalar>::IsComplex && Conjugate, + TriStorageOrder==RowMajor ? ColMajor : RowMajor, ColMajor> + ::run(size, cols, tri, triStride, _other, otherStride); + } +}; + +/* Optimized triangular solver with multiple right hand side and the triangular matrix on the left + */ +template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder> +struct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor> +{ + static EIGEN_DONT_INLINE void run( + Index size, Index otherSize, + const Scalar* _tri, Index triStride, + Scalar* _other, Index otherStride) + { + Index cols = otherSize; + const_blas_data_mapper<Scalar, Index, TriStorageOrder> tri(_tri,triStride); + blas_data_mapper<Scalar, Index, ColMajor> other(_other,otherStride); + + typedef gebp_traits<Scalar,Scalar> Traits; + enum { + SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr), + IsLower = (Mode&Lower) == Lower + }; + + Index kc = size; // cache block size along the K direction + Index mc = size; // cache block size along the M direction + Index nc = cols; // cache block size along the N direction + computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc); + + std::size_t sizeW = kc*Traits::WorkSpaceFactor; + std::size_t sizeB = sizeW + kc*cols; + ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); + ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); + Scalar* blockB = allocatedBlockB + sizeW; + + conj_if<Conjugate> conj; + gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel; + gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs; + gemm_pack_rhs<Scalar, Index, Traits::nr, ColMajor, false, true> pack_rhs; + + for(Index k2=IsLower ? 0 : size; + IsLower ? k2<size : k2>0; + IsLower ? k2+=kc : k2-=kc) + { + const Index actual_kc = (std::min)(IsLower ? size-k2 : k2, kc); + + // We have selected and packed a big horizontal panel R1 of rhs. Let B be the packed copy of this panel, + // and R2 the remaining part of rhs. The corresponding vertical panel of lhs is split into + // A11 (the triangular part) and A21 the remaining rectangular part. + // Then the high level algorithm is: + // - B = R1 => general block copy (done during the next step) + // - R1 = L1^-1 B => tricky part + // - update B from the new R1 => actually this has to be performed continuously during the above step + // - R2 = L2 * B => GEPP + + // The tricky part: compute R1 = L1^-1 B while updating B from R1 + // The idea is to split L1 into multiple small vertical panels. + // Each panel can be split into a small triangular part A1 which is processed without optimization, + // and the remaining small part A2 which is processed using gebp with appropriate block strides + { + // for each small vertical panels of lhs + for (Index k1=0; k1<actual_kc; k1+=SmallPanelWidth) + { + Index actualPanelWidth = std::min<Index>(actual_kc-k1, SmallPanelWidth); + // tr solve + for (Index k=0; k<actualPanelWidth; ++k) + { + // TODO write a small kernel handling this (can be shared with trsv) + Index i = IsLower ? k2+k1+k : k2-k1-k-1; + Index s = IsLower ? k2+k1 : i+1; + Index rs = actualPanelWidth - k - 1; // remaining size + + Scalar a = (Mode & UnitDiag) ? Scalar(1) : Scalar(1)/conj(tri(i,i)); + for (Index j=0; j<cols; ++j) + { + if (TriStorageOrder==RowMajor) + { + Scalar b = 0; + const Scalar* l = &tri(i,s); + Scalar* r = &other(s,j); + for (Index i3=0; i3<k; ++i3) + b += conj(l[i3]) * r[i3]; + + other(i,j) = (other(i,j) - b)*a; + } + else + { + Index s = IsLower ? i+1 : i-rs; + Scalar b = (other(i,j) *= a); + Scalar* r = &other(s,j); + const Scalar* l = &tri(s,i); + for (Index i3=0;i3<rs;++i3) + r[i3] -= b * conj(l[i3]); + } + } + } + + Index lengthTarget = actual_kc-k1-actualPanelWidth; + Index startBlock = IsLower ? k2+k1 : k2-k1-actualPanelWidth; + Index blockBOffset = IsLower ? k1 : lengthTarget; + + // update the respective rows of B from other + pack_rhs(blockB, _other+startBlock, otherStride, actualPanelWidth, cols, actual_kc, blockBOffset); + + // GEBP + if (lengthTarget>0) + { + Index startTarget = IsLower ? k2+k1+actualPanelWidth : k2-actual_kc; + + pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget); + + gebp_kernel(_other+startTarget, otherStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, Scalar(-1), + actualPanelWidth, actual_kc, 0, blockBOffset); + } + } + } + + // R2 = A2 * B => GEPP + { + Index start = IsLower ? k2+kc : 0; + Index end = IsLower ? size : k2-kc; + for(Index i2=start; i2<end; i2+=mc) + { + const Index actual_mc = (std::min)(mc,end-i2); + if (actual_mc>0) + { + pack_lhs(blockA, &tri(i2, IsLower ? k2 : k2-kc), triStride, actual_kc, actual_mc); + + gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1)); + } + } + } + } + } +}; + +/* Optimized triangular solver with multiple left hand sides and the trinagular matrix on the right + */ +template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder> +struct triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor> +{ + static EIGEN_DONT_INLINE void run( + Index size, Index otherSize, + const Scalar* _tri, Index triStride, + Scalar* _other, Index otherStride) + { + Index rows = otherSize; + const_blas_data_mapper<Scalar, Index, TriStorageOrder> rhs(_tri,triStride); + blas_data_mapper<Scalar, Index, ColMajor> lhs(_other,otherStride); + + typedef gebp_traits<Scalar,Scalar> Traits; + enum { + RhsStorageOrder = TriStorageOrder, + SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr), + IsLower = (Mode&Lower) == Lower + }; + +// Index kc = std::min<Index>(Traits::Max_kc/4,size); // cache block size along the K direction +// Index mc = std::min<Index>(Traits::Max_mc,size); // cache block size along the M direction + // check that !!!! + Index kc = size; // cache block size along the K direction + Index mc = size; // cache block size along the M direction + Index nc = rows; // cache block size along the N direction + computeProductBlockingSizes<Scalar,Scalar,4>(kc, mc, nc); + + std::size_t sizeW = kc*Traits::WorkSpaceFactor; + std::size_t sizeB = sizeW + kc*size; + ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); + ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); + Scalar* blockB = allocatedBlockB + sizeW; + + conj_if<Conjugate> conj; + gebp_kernel<Scalar,Scalar, Index, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel; + gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs; + gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel; + gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel; + + for(Index k2=IsLower ? size : 0; + IsLower ? k2>0 : k2<size; + IsLower ? k2-=kc : k2+=kc) + { + const Index actual_kc = (std::min)(IsLower ? k2 : size-k2, kc); + Index actual_k2 = IsLower ? k2-actual_kc : k2 ; + + Index startPanel = IsLower ? 0 : k2+actual_kc; + Index rs = IsLower ? actual_k2 : size - actual_k2 - actual_kc; + Scalar* geb = blockB+actual_kc*actual_kc; + + if (rs>0) pack_rhs(geb, &rhs(actual_k2,startPanel), triStride, actual_kc, rs); + + // triangular packing (we only pack the panels off the diagonal, + // neglecting the blocks overlapping the diagonal + { + for (Index j2=0; j2<actual_kc; j2+=SmallPanelWidth) + { + Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth); + Index actual_j2 = actual_k2 + j2; + Index panelOffset = IsLower ? j2+actualPanelWidth : 0; + Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2; + + if (panelLength>0) + pack_rhs_panel(blockB+j2*actual_kc, + &rhs(actual_k2+panelOffset, actual_j2), triStride, + panelLength, actualPanelWidth, + actual_kc, panelOffset); + } + } + + for(Index i2=0; i2<rows; i2+=mc) + { + const Index actual_mc = (std::min)(mc,rows-i2); + + // triangular solver kernel + { + // for each small block of the diagonal (=> vertical panels of rhs) + for (Index j2 = IsLower + ? (actual_kc - ((actual_kc%SmallPanelWidth) ? Index(actual_kc%SmallPanelWidth) + : Index(SmallPanelWidth))) + : 0; + IsLower ? j2>=0 : j2<actual_kc; + IsLower ? j2-=SmallPanelWidth : j2+=SmallPanelWidth) + { + Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth); + Index absolute_j2 = actual_k2 + j2; + Index panelOffset = IsLower ? j2+actualPanelWidth : 0; + Index panelLength = IsLower ? actual_kc - j2 - actualPanelWidth : j2; + + // GEBP + if(panelLength>0) + { + gebp_kernel(&lhs(i2,absolute_j2), otherStride, + blockA, blockB+j2*actual_kc, + actual_mc, panelLength, actualPanelWidth, + Scalar(-1), + actual_kc, actual_kc, // strides + panelOffset, panelOffset, // offsets + allocatedBlockB); // workspace + } + + // unblocked triangular solve + for (Index k=0; k<actualPanelWidth; ++k) + { + Index j = IsLower ? absolute_j2+actualPanelWidth-k-1 : absolute_j2+k; + + Scalar* r = &lhs(i2,j); + for (Index k3=0; k3<k; ++k3) + { + Scalar b = conj(rhs(IsLower ? j+1+k3 : absolute_j2+k3,j)); + Scalar* a = &lhs(i2,IsLower ? j+1+k3 : absolute_j2+k3); + for (Index i=0; i<actual_mc; ++i) + r[i] -= a[i] * b; + } + Scalar b = (Mode & UnitDiag) ? Scalar(1) : Scalar(1)/conj(rhs(j,j)); + for (Index i=0; i<actual_mc; ++i) + r[i] *= b; + } + + // pack the just computed part of lhs to A + pack_lhs_panel(blockA, _other+absolute_j2*otherStride+i2, otherStride, + actualPanelWidth, actual_mc, + actual_kc, j2); + } + } + + if (rs>0) + gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb, + actual_mc, actual_kc, rs, Scalar(-1), + -1, -1, 0, 0, allocatedBlockB); + } + } + } +}; + +} // end namespace internal + +#endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_H diff --git a/extern/Eigen3/Eigen/src/Core/products/TriangularSolverVector.h b/extern/Eigen3/Eigen/src/Core/products/TriangularSolverVector.h new file mode 100644 index 00000000000..639d4a5b476 --- /dev/null +++ b/extern/Eigen3/Eigen/src/Core/products/TriangularSolverVector.h @@ -0,0 +1,150 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// Eigen is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 3 of the License, or (at your option) any later version. +// +// Alternatively, you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of +// the License, or (at your option) any later version. +// +// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License and a copy of the GNU General Public License along with +// Eigen. If not, see <http://www.gnu.org/licenses/>. + +#ifndef EIGEN_TRIANGULAR_SOLVER_VECTOR_H +#define EIGEN_TRIANGULAR_SOLVER_VECTOR_H + +namespace internal { + +template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate, int StorageOrder> +struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheRight, Mode, Conjugate, StorageOrder> +{ + static void run(Index size, const LhsScalar* _lhs, Index lhsStride, RhsScalar* rhs) + { + triangular_solve_vector<LhsScalar,RhsScalar,Index,OnTheLeft, + ((Mode&Upper)==Upper ? Lower : Upper) | (Mode&UnitDiag), + Conjugate,StorageOrder==RowMajor?ColMajor:RowMajor + >::run(size, _lhs, lhsStride, rhs); + } +}; + +// forward and backward substitution, row-major, rhs is a vector +template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate> +struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Conjugate, RowMajor> +{ + enum { + IsLower = ((Mode&Lower)==Lower) + }; + static void run(Index size, const LhsScalar* _lhs, Index lhsStride, RhsScalar* rhs) + { + typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,RowMajor>, 0, OuterStride<> > LhsMap; + const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride)); + typename internal::conditional< + Conjugate, + const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>, + const LhsMap&> + ::type cjLhs(lhs); + static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH; + for(Index pi=IsLower ? 0 : size; + IsLower ? pi<size : pi>0; + IsLower ? pi+=PanelWidth : pi-=PanelWidth) + { + Index actualPanelWidth = (std::min)(IsLower ? size - pi : pi, PanelWidth); + + Index r = IsLower ? pi : size - pi; // remaining size + if (r > 0) + { + // let's directly call the low level product function because: + // 1 - it is faster to compile + // 2 - it is slighlty faster at runtime + Index startRow = IsLower ? pi : pi-actualPanelWidth; + Index startCol = IsLower ? 0 : pi; + + general_matrix_vector_product<Index,LhsScalar,RowMajor,Conjugate,RhsScalar,false>::run( + actualPanelWidth, r, + &lhs.coeffRef(startRow,startCol), lhsStride, + rhs + startCol, 1, + rhs + startRow, 1, + RhsScalar(-1)); + } + + for(Index k=0; k<actualPanelWidth; ++k) + { + Index i = IsLower ? pi+k : pi-k-1; + Index s = IsLower ? pi : i+1; + if (k>0) + rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map<const Matrix<RhsScalar,Dynamic,1> >(rhs+s,k))).sum(); + + if(!(Mode & UnitDiag)) + rhs[i] /= cjLhs(i,i); + } + } + } +}; + +// forward and backward substitution, column-major, rhs is a vector +template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate> +struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Conjugate, ColMajor> +{ + enum { + IsLower = ((Mode&Lower)==Lower) + }; + static void run(Index size, const LhsScalar* _lhs, Index lhsStride, RhsScalar* rhs) + { + typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap; + const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride)); + typename internal::conditional<Conjugate, + const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>, + const LhsMap& + >::type cjLhs(lhs); + static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH; + + for(Index pi=IsLower ? 0 : size; + IsLower ? pi<size : pi>0; + IsLower ? pi+=PanelWidth : pi-=PanelWidth) + { + Index actualPanelWidth = (std::min)(IsLower ? size - pi : pi, PanelWidth); + Index startBlock = IsLower ? pi : pi-actualPanelWidth; + Index endBlock = IsLower ? pi + actualPanelWidth : 0; + + for(Index k=0; k<actualPanelWidth; ++k) + { + Index i = IsLower ? pi+k : pi-k-1; + if(!(Mode & UnitDiag)) + rhs[i] /= cjLhs.coeff(i,i); + + Index r = actualPanelWidth - k - 1; // remaining size + Index s = IsLower ? i+1 : i-r; + if (r>0) + Map<Matrix<RhsScalar,Dynamic,1> >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r); + } + Index r = IsLower ? size - endBlock : startBlock; // remaining size + if (r > 0) + { + // let's directly call the low level product function because: + // 1 - it is faster to compile + // 2 - it is slighlty faster at runtime + general_matrix_vector_product<Index,LhsScalar,ColMajor,Conjugate,RhsScalar,false>::run( + r, actualPanelWidth, + &lhs.coeffRef(endBlock,startBlock), lhsStride, + rhs+startBlock, 1, + rhs+endBlock, 1, RhsScalar(-1)); + } + } + } +}; + +} // end namespace internal + +#endif // EIGEN_TRIANGULAR_SOLVER_VECTOR_H |