From 4a04f7206914a49f5f95adc5eb786237f1a9f547 Mon Sep 17 00:00:00 2001
From: Campbell Barton
Date: Sun, 23 Oct 2011 17:52:20 +0000
Subject: remove $Id: tags after discussion on the mailing list:
 http://markmail.org/message/fp7ozcywxum3ar7n

---
 .../src/Core/products/GeneralBlockPanelKernel.h    | 1285 ++++++++++++++++++++
 1 file changed, 1285 insertions(+)
 create mode 100644 extern/Eigen3/Eigen/src/Core/products/GeneralBlockPanelKernel.h

(limited to 'extern/Eigen3/Eigen/src/Core/products/GeneralBlockPanelKernel.h')

diff --git a/extern/Eigen3/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/extern/Eigen3/Eigen/src/Core/products/GeneralBlockPanelKernel.h
new file mode 100644
index 00000000000..6f3f2717007
--- /dev/null
+++ b/extern/Eigen3/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -0,0 +1,1285 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
+#define EIGEN_GENERAL_BLOCK_PANEL_H
+
+namespace internal {
+
+template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
+class gebp_traits;
+
+/** \internal */
+inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0)
+{
+  static std::ptrdiff_t m_l1CacheSize = 0;
+  static std::ptrdiff_t m_l2CacheSize = 0;
+  if(m_l1CacheSize==0)
+  {
+    m_l1CacheSize = queryL1CacheSize();
+    m_l2CacheSize = queryTopLevelCacheSize();
+
+    if(m_l1CacheSize<=0) m_l1CacheSize = 8 * 1024;
+    if(m_l2CacheSize<=0) m_l2CacheSize = 1 * 1024 * 1024;
+  }
+
+  if(action==SetAction)
+  {
+    // set the cpu cache size and cache all block sizes from a global cache size in bytes
+    eigen_internal_assert(l1!=0 && l2!=0);
+    m_l1CacheSize = *l1;
+    m_l2CacheSize = *l2;
+  }
+  else if(action==GetAction)
+  {
+    eigen_internal_assert(l1!=0 && l2!=0);
+    *l1 = m_l1CacheSize;
+    *l2 = m_l2CacheSize;
+  }
+  else
+  {
+    eigen_internal_assert(false);
+  }
+}
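The static values cached above are what Eigen's public cache-size helpers operate on. A minimal usage sketch, assuming this Eigen 3 snapshot where l1CacheSize(), l2CacheSize() and setCpuCacheSizes() are declared elsewhere in the library:

#include <Eigen/Core>
#include <iostream>

int main()
{
  // Read the detected (or defaulted) sizes; internally this ends up in
  // manage_caching_sizes(GetAction, ...).
  std::cout << "L1: " << Eigen::l1CacheSize() << " bytes\n";
  std::cout << "L2: " << Eigen::l2CacheSize() << " bytes\n";

  // Override them, e.g. to tune blocking for a known target CPU; this
  // ends up in manage_caching_sizes(SetAction, ...).
  Eigen::setCpuCacheSizes(32*1024, 1024*1024);
}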
+
+/** \brief Computes the blocking parameters for an m x k times k x n matrix product
+  *
+  * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension.
+  * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension.
+  * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension.
+  *
+  * Given an m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
+  * this function computes the blocking size parameters along the respective dimensions
+  * for matrix products and related algorithms. The blocking sizes depend on various
+  * parameters:
+  * - the L1 and L2 cache sizes,
+  * - the register level blocking sizes defined by gebp_traits,
+  * - the number of scalars that fit into a packet (when vectorization is enabled).
+  *
+  * \sa setCpuCacheSizes */
+template<typename LhsScalar, typename RhsScalar, int KcFactor>
+void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrdiff_t& n)
+{
+  EIGEN_UNUSED_VARIABLE(n);
+  // Explanations:
+  // Let's recall that the product algorithm forms kc x nc horizontal panels B' on the rhs and
+  // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed
+  // per kc x nr vertical small panels where nr is the blocking size along the n dimension
+  // at the register level. For vectorization purposes, these small vertical panels are unpacked,
+  // e.g., each coefficient is replicated to fit a packet. This small vertical panel has to
+  // stay in L1 cache.
+  std::ptrdiff_t l1, l2;
+
+  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+  enum {
+    kdiv = KcFactor * 2 * Traits::nr
+         * Traits::RhsProgress * sizeof(RhsScalar),
+    mr = gebp_traits<LhsScalar,RhsScalar>::mr,
+    mr_mask = (0xffffffff/mr)*mr
+  };
+
+  manage_caching_sizes(GetAction, &l1, &l2);
+  k = std::min<std::ptrdiff_t>(k, l1/kdiv);
+  std::ptrdiff_t _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0;
+  if(_m<m) m = _m & mr_mask;
+}
+
+template<typename LhsScalar, typename RhsScalar>
+inline void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrdiff_t& n)
+{
+  computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n);
+}
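A usage sketch for the function above, assuming Eigen 3's internal API as shown (k, m and n are passed by reference and clamped in place):

#include <Eigen/Core>
#include <iostream>

int main()
{
  // Problem dimensions of a large float GEMM; the call below shrinks them
  // to blocking sizes: k so that one unpacked rhs panel stays in L1, and m
  // so that an m x k lhs block fits in L2. n is currently left untouched.
  std::ptrdiff_t k = 10000, m = 10000, n = 10000;
  Eigen::internal::computeProductBlockingSizes<float,float>(k, m, n);
  std::cout << "kc=" << k << " mc=" << m << " nc=" << n << "\n";
}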
+
+#ifdef EIGEN_HAS_FUSE_CJMADD
+  #define MADD(CJ,A,B,C,T)  C = CJ.pmadd(A,B,C);
+#else
+
+  // FIXME (a bit overkill maybe ?)
+
+  template<typename CJ, typename A, typename B, typename C, typename T> struct gebp_madd_selector {
+    EIGEN_STRONG_INLINE EIGEN_ALWAYS_INLINE_ATTRIB static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/)
+    {
+      c = cj.pmadd(a,b,c);
+    }
+  };
+
+  template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> {
+    EIGEN_STRONG_INLINE EIGEN_ALWAYS_INLINE_ATTRIB static void run(const CJ& cj, T& a, T& b, T& c, T& t)
+    {
+      t = b; t = cj.pmul(a,t); c = padd(c,t);
+    }
+  };
+
+  template<typename CJ, typename A, typename B, typename C, typename T>
+  EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t)
+  {
+    gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
+  }
+
+  #define MADD(CJ,A,B,C,T)  gebp_madd(CJ,A,B,C,T);
+//   #define MADD(CJ,A,B,C,T)  T = B; T = CJ.pmul(A,T); C = padd(C,T);
+#endif
+
+/* Vectorization logic
+ *  real*real: unpack rhs to constant packets, ...
+ *
+ *  cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i),
+ *          storing each res packet into two packets (2x2),
+ *          at the end combine them: swap the second and addsub them
+ *  cf*cf : same but with 2x4 blocks
+ *  cplx*real : unpack rhs to constant packets, ...
+ *  real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
+ */
+template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
+class gebp_traits
+{
+public:
+  typedef _LhsScalar LhsScalar;
+  typedef _RhsScalar RhsScalar;
+  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+  enum {
+    ConjLhs = _ConjLhs,
+    ConjRhs = _ConjRhs,
+    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
+    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
+    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
+    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
+
+    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
+
+    // register block size along the N direction (must be either 2 or 4)
+    nr = NumberOfRegisters/4,
+
+    // register block size along the M direction (currently, this one cannot be modified)
+    mr = 2 * LhsPacketSize,
+
+    WorkSpaceFactor = nr * RhsPacketSize,
+
+    LhsProgress = LhsPacketSize,
+    RhsProgress = RhsPacketSize
+  };
+
+  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
+  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
+  typedef typename packet_traits<ResScalar>::type  _ResPacket;
+
+  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
+  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
+  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+
+  typedef ResPacket AccPacket;
+
+  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
+  {
+    p = pset1<ResPacket>(ResScalar(0));
+  }
+
+  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
+  {
+    for(DenseIndex k=0; k<n; k++)
+      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
+  }
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+  {
+    dest = pload<RhsPacket>(b);
+  }
+
+  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
+  {
+    dest = pload<LhsPacket>(a);
+  }
+
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, AccPacket& tmp) const
+  {
+    tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);
+  }
+
+  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
+  {
+    r = pmadd(c,alpha,r);
+  }
+
+protected:
+//   conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
+//   conj_helper<LhsPacket,RhsPacket,ConjLhs,ConjRhs> pcj;
+};
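As a scalar illustration of the unpackRhs()/madd() pair in the generic traits above, a sketch with a 4-wide SIMD packet modeled as four consecutive floats:

#include <cstddef>

// Model of unpackRhs(): every rhs coefficient is replicated into a full
// "packet" so the kernel's inner loop can use one aligned load per
// coefficient instead of a broadcast.
enum { PacketSize = 4 };

void unpackRhs_model(std::size_t n, const float* rhs, float* b)
{
  for(std::size_t k=0; k<n; k++)          // mimics pstore1<RhsPacket>(&b[k*PacketSize], rhs[k])
    for(int i=0; i<PacketSize; i++)
      b[k*PacketSize + i] = rhs[k];
}

// Model of madd() on one packet: c += a*b lane by lane, spelled with the
// same explicit temporary as "tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp)".
void madd_model(const float* a, const float* b, float* c)
{
  for(int i=0; i<PacketSize; i++)
  {
    float tmp = b[i];
    tmp = a[i]*tmp;
    c[i] = c[i] + tmp;
  }
}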
+
+template<typename RealScalar, bool _ConjLhs>
+class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
+{
+public:
+  typedef std::complex<RealScalar> LhsScalar;
+  typedef RealScalar RhsScalar;
+  typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType ResScalar;
+
+  enum {
+    ConjLhs = _ConjLhs,
+    ConjRhs = false,
+    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
+    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
+    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
+    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
+
+    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
+    nr = NumberOfRegisters/4,
+    mr = 2 * LhsPacketSize,
+    WorkSpaceFactor = nr*RhsPacketSize,
+
+    LhsProgress = LhsPacketSize,
+    RhsProgress = RhsPacketSize
+  };
+
+  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
+  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
+  typedef typename packet_traits<ResScalar>::type  _ResPacket;
+
+  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
+  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
+  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+
+  typedef ResPacket AccPacket;
+
+  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
+  {
+    p = pset1<ResPacket>(ResScalar(0));
+  }
+
+  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
+  {
+    for(DenseIndex k=0; k<n; k++)
+      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
+  }
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+  {
+    dest = pload<RhsPacket>(b);
+  }
+
+  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
+  {
+    dest = pload<LhsPacket>(a);
+  }
+
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
+  {
+    madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
+  }
+
+  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
+  {
+    tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
+  }
+
+  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
+  {
+    c += a * b;
+  }
+
+  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
+  {
+    r = cj.pmadd(c,alpha,r);
+  }
+
+protected:
+  conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
+};
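The madd()/madd_impl() split above is compile-time tag dispatch: the Vectorizable flag is turned into a type so overload resolution picks the packet or the scalar path with no runtime branch. A minimal standalone model of the idiom, using std::conditional in place of Eigen's internal conditional (names are illustrative):

#include <type_traits>
#include <iostream>

template<bool Vectorizable>
struct kernel_model
{
  // The bool is mapped to true_type/false_type, and the right madd_impl
  // overload is selected at compile time.
  static void madd()
  {
    madd_impl(typename std::conditional<Vectorizable, std::true_type, std::false_type>::type());
  }
  static void madd_impl(std::true_type)  { std::cout << "packet path\n"; }
  static void madd_impl(std::false_type) { std::cout << "scalar path\n"; }
};

int main()
{
  kernel_model<true>::madd();   // prints "packet path"
  kernel_model<false>::madd();  // prints "scalar path"
}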
+
+template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
+class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
+{
+public:
+  typedef std::complex<RealScalar>  Scalar;
+  typedef std::complex<RealScalar>  LhsScalar;
+  typedef std::complex<RealScalar>  RhsScalar;
+  typedef std::complex<RealScalar>  ResScalar;
+
+  enum {
+    ConjLhs = _ConjLhs,
+    ConjRhs = _ConjRhs,
+    Vectorizable = packet_traits<RealScalar>::Vectorizable
+                && packet_traits<Scalar>::Vectorizable,
+    RealPacketSize  = Vectorizable ? packet_traits<RealScalar>::size : 1,
+    ResPacketSize   = Vectorizable ? packet_traits<ResScalar>::size : 1,
+
+    nr = 2,
+    mr = 2 * ResPacketSize,
+    WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr,
+
+    LhsProgress = ResPacketSize,
+    RhsProgress = Vectorizable ? 2*ResPacketSize : 1
+  };
+
+  typedef typename packet_traits<RealScalar>::type RealPacket;
+  typedef typename packet_traits<Scalar>::type     ScalarPacket;
+  struct DoublePacket
+  {
+    RealPacket first;
+    RealPacket second;
+  };
+
+  typedef typename conditional<Vectorizable,RealPacket,  Scalar>::type LhsPacket;
+  typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type RhsPacket;
+  typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
+  typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type AccPacket;
+
+  EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
+
+  EIGEN_STRONG_INLINE void initAcc(DoublePacket& p)
+  {
+    p.first  = pset1<RealPacket>(RealScalar(0));
+    p.second = pset1<RealPacket>(RealScalar(0));
+  }
+
+  /* Unpack the rhs coeff such that each complex coefficient is spread into
+   * two packets containing respectively the real and imaginary coefficient
+   * duplicated as many times as needed: (x+iy) => [x, ..., x] [y, ..., y]
+   */
+  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const Scalar* rhs, Scalar* b)
+  {
+    for(DenseIndex k=0; k<n; k++)
+    {
+      if(Vectorizable)
+      {
+        pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+0],             real(rhs[k]));
+        pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+ResPacketSize], imag(rhs[k]));
+      }
+      else
+        b[k] = rhs[k];
+    }
+  }
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { dest = *b; }
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const
+  {
+    dest.first  = pload<RealPacket>((const RealScalar*)b);
+    dest.second = pload<RealPacket>((const RealScalar*)(b+ResPacketSize));
+  }
+
+  // nothing special here
+  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
+  {
+    dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
+  }
+
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacket& c, RhsPacket& /*tmp*/) const
+  {
+    c.first  = padd(pmul(a,b.first), c.first);
+    c.second = padd(pmul(a,b.second),c.second);
+  }
+
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const
+  {
+    c = cj.pmadd(a,b,c);
+  }
+
+  EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
+
+  EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacket& alpha, ResPacket& r) const
+  {
+    // assemble c
+    ResPacket tmp;
+    if((!ConjLhs)&&(!ConjRhs))
+    {
+      tmp = pcplxflip(pconj(ResPacket(c.second)));
+      tmp = padd(ResPacket(c.first),tmp);
+    }
+    else if((!ConjLhs)&&(ConjRhs))
+    {
+      tmp = pconj(pcplxflip(ResPacket(c.second)));
+      tmp = padd(ResPacket(c.first),tmp);
+    }
+    else if((ConjLhs)&&(!ConjRhs))
+    {
+      tmp = pcplxflip(ResPacket(c.second));
+      tmp = padd(pconj(ResPacket(c.first)),tmp);
+    }
+    else if((ConjLhs)&&(ConjRhs))
+    {
+      tmp = pcplxflip(ResPacket(c.second));
+      tmp = psub(pconj(ResPacket(c.first)),tmp);
+    }
+
+    r = pmadd(tmp,alpha,r);
+  }
+
+protected:
+  conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
+};
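A scalar check of the acc() branch above for the no-conjugation case: c.first holds (a_r*b_r, a_i*b_r), c.second holds (a_r*b_i, a_i*b_i), and flipping the conjugated second packet then adding the first reassembles a*b. Packets are modeled by single complex numbers:

#include <cassert>
#include <complex>

// Scalar model of acc() in the (!ConjLhs && !ConjRhs) branch.
std::complex<float> assemble(std::complex<float> first, std::complex<float> second)
{
  // pconj:     (x, y) -> (x, -y)
  std::complex<float> conj2(second.real(), -second.imag());
  // pcplxflip: (x, y) -> (y, x)
  std::complex<float> flipped(conj2.imag(), conj2.real());
  // padd with c.first gives (a_r*b_r - a_i*b_i, a_i*b_r + a_r*b_i) = a*b
  return first + flipped;
}

int main()
{
  std::complex<float> a(2.0f, 3.0f), b(5.0f, 7.0f);
  std::complex<float> first (a.real()*b.real(), a.imag()*b.real()); // (a_r b_r, a_i b_r)
  std::complex<float> second(a.real()*b.imag(), a.imag()*b.imag()); // (a_r b_i, a_i b_i)
  assert(assemble(first, second) == a*b);  // both equal (-11, 29)
}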
+
+template<typename RealScalar, bool _ConjRhs>
+class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
+{
+public:
+  typedef std::complex<RealScalar>  Scalar;
+  typedef RealScalar  LhsScalar;
+  typedef Scalar      RhsScalar;
+  typedef Scalar      ResScalar;
+
+  enum {
+    ConjLhs = false,
+    ConjRhs = _ConjRhs,
+    Vectorizable = packet_traits<RealScalar>::Vectorizable
+                && packet_traits<Scalar>::Vectorizable,
+    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
+    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
+    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
+
+    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
+    nr = 4,
+    mr = 2*ResPacketSize,
+    WorkSpaceFactor = nr*RhsPacketSize,
+
+    LhsProgress = ResPacketSize,
+    RhsProgress = ResPacketSize
+  };
+
+  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
+  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
+  typedef typename packet_traits<ResScalar>::type  _ResPacket;
+
+  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
+  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
+  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+
+  typedef ResPacket AccPacket;
+
+  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
+  {
+    p = pset1<ResPacket>(ResScalar(0));
+  }
+
+  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
+  {
+    for(DenseIndex k=0; k<n; k++)
+      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
+  }
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
+  {
+    dest = pload<RhsPacket>(b);
+  }
+
+  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
+  {
+    dest = ploaddup<LhsPacket>(a);
+  }
+
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
+  {
+    madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
+  }
+
+  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
+  {
+    tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
+  }
+
+  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
+  {
+    c += a * b;
+  }
+
+  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
+  {
+    r = cj.pmadd(alpha,c,r);
+  }
+
+protected:
+  conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
+};
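For the real*cplx scheme above ("load lhs as (a0,a0,a1,a1)"), loadLhs() relies on ploaddup, which loads half a packet's worth of real coefficients and duplicates each one so that a single real lhs value faces both the real and imaginary lanes of an interleaved complex packet. A scalar model for a 4-wide packet:

// Model of ploaddup<LhsPacket>(a) for a 4-wide packet: two real lhs
// coefficients, each duplicated, giving (a0, a0, a1, a1).
void ploaddup_model(const float* a, float dest[4])
{
  dest[0] = a[0]; dest[1] = a[0];
  dest[2] = a[1]; dest[3] = a[1];
}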
+
+/* optimized GEneral packed Block * packed Panel product kernel
+ *
+ * Mixing type logic: C += A * B
+ *  |  A  |  B  | comments
+ *  |real |cplx | no vectorization yet, would require to pack A with duplication
+ *  |cplx |real | easy vectorization
+ */
+template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel
+{
+  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
+  typedef typename Traits::ResScalar ResScalar;
+  typedef typename Traits::LhsPacket LhsPacket;
+  typedef typename Traits::RhsPacket RhsPacket;
+  typedef typename Traits::ResPacket ResPacket;
+  typedef typename Traits::AccPacket AccPacket;
+
+  enum {
+    Vectorizable  = Traits::Vectorizable,
+    LhsProgress   = Traits::LhsProgress,
+    RhsProgress   = Traits::RhsProgress,
+    ResPacketSize = Traits::ResPacketSize
+  };
+
+  EIGEN_FLATTEN_ATTRIB
+  void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
+                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB = 0)
+  {
+    Traits traits;
+
+    if(strideA==-1) strideA = depth;
+    if(strideB==-1) strideB = depth;
+    conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
+//     conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
+    Index packet_cols = (cols/nr) * nr;
+    const Index peeled_mc = (rows/mr)*mr;
+    // FIXME:
+    const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsProgress ? LhsProgress : 0);
+    const Index peeled_kc = (depth/4)*4;
+
+    if(unpackedB==0)
+      unpackedB = const_cast<RhsScalar*>(blockB - strideB * nr * RhsProgress);
+
+    // loops on each micro vertical panel of rhs (depth x nr)
+    for(Index j2=0; j2<packet_cols; j2+=nr)
+    {
+      // unpack B
+      traits.unpackRhs(depth*nr,&blockB[j2*strideB+offsetB*nr],unpackedB);
+
+      // loops on each micro horizontal panel of lhs (mr x depth)
+      // => we select a mr x nr micro block of res which is entirely
+      //    stored into mr/packet_size x nr registers.
+      for(Index i=0; i<peeled_mc; i+=mr)
+      {
+        const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
+        prefetch(&blA[0]);
+
+        // gets res block as register
+        AccPacket C0, C1, C2, C3, C4, C5, C6, C7;
+        traits.initAcc(C0);
+        traits.initAcc(C1);
+        if(nr==4) traits.initAcc(C2);
+        if(nr==4) traits.initAcc(C3);
+        traits.initAcc(C4);
+        traits.initAcc(C5);
+        if(nr==4) traits.initAcc(C6);
+        if(nr==4) traits.initAcc(C7);
+
+        // performs "inner" product
+        const RhsScalar* blB = unpackedB;
+        for(Index k=0; k<peeled_kc; k+=4)
+        {
+          // ... (4x unrolled traits.madd() steps accumulating C0..C7)
+        }
+        for(Index k=peeled_kc; k<depth; k++)
+        {
+          // ... (remaining traits.madd() steps)
+        }
+
+        ResScalar* r0 = &res[(j2+0)*resStride + i];
+        ResScalar* r1 = r0 + resStride;
+        ResScalar* r2 = r1 + resStride;
+        ResScalar* r3 = r2 + resStride;
+
+        if(nr==4)
+        {
+          ResPacket R0, R1, R2, R3, R4, R5, R6;
+          ResPacket alphav = pset1<ResPacket>(alpha);
+
+          R0 = ploadu<ResPacket>(r0);
+          R1 = ploadu<ResPacket>(r1);
+          R2 = ploadu<ResPacket>(r2);
+          R3 = ploadu<ResPacket>(r3);
+          R4 = ploadu<ResPacket>(r0 + ResPacketSize);
+          R5 = ploadu<ResPacket>(r1 + ResPacketSize);
+          R6 = ploadu<ResPacket>(r2 + ResPacketSize);
+          traits.acc(C0, alphav, R0);
+          pstoreu(r0, R0);
+          R0 = ploadu<ResPacket>(r3 + ResPacketSize);
+
+          traits.acc(C1, alphav, R1);
+          traits.acc(C2, alphav, R2);
+          traits.acc(C3, alphav, R3);
+          traits.acc(C4, alphav, R4);
+          traits.acc(C5, alphav, R5);
+          traits.acc(C6, alphav, R6);
+          traits.acc(C7, alphav, R0);
+
+          pstoreu(r1, R1);
+          pstoreu(r2, R2);
+          pstoreu(r3, R3);
+          pstoreu(r0 + ResPacketSize, R4);
+          pstoreu(r1 + ResPacketSize, R5);
+          pstoreu(r2 + ResPacketSize, R6);
+          pstoreu(r3 + ResPacketSize, R0);
+        }
+        else
+        {
+          ResPacket R0, R1, R4;
+          ResPacket alphav = pset1<ResPacket>(alpha);
+
+          R0 = ploadu<ResPacket>(r0);
+          R1 = ploadu<ResPacket>(r1);
+          R4 = ploadu<ResPacket>(r0 + ResPacketSize);
+          traits.acc(C0, alphav, R0);
+          pstoreu(r0, R0);
+          R0 = ploadu<ResPacket>(r1 + ResPacketSize);
+          traits.acc(C1, alphav, R1);
+          traits.acc(C4, alphav, R4);
+          traits.acc(C5, alphav, R0);
+          pstoreu(r1, R1);
+          pstoreu(r0 + ResPacketSize, R4);
+          pstoreu(r1 + ResPacketSize, R0);
+        }
+
+      }
+
+      if(rows-peeled_mc>=LhsProgress)
+      {
+        Index i = peeled_mc;
+        const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
+        prefetch(&blA[0]);
+
+        // gets res block as register
+        AccPacket C0, C1, C2, C3;
+        traits.initAcc(C0);
+        traits.initAcc(C1);
+        if(nr==4) traits.initAcc(C2);
+        if(nr==4) traits.initAcc(C3);
+
+        // performs "inner" product
+        const RhsScalar* blB = unpackedB;
+        for(Index k=0; k<depth; k++)
+        {
+          // ... (traits.madd() steps accumulating C0..C3)
+        }
+
+        ResPacket R0, R1, R2, R3;
+        ResPacket alphav = pset1<ResPacket>(alpha);
+
+        ResScalar* r0 = &res[(j2+0)*resStride + i];
+        ResScalar* r1 = r0 + resStride;
+        ResScalar* r2 = r1 + resStride;
+        ResScalar* r3 = r2 + resStride;
+
+        R0 = ploadu<ResPacket>(r0);
+        R1 = ploadu<ResPacket>(r1);
+        if(nr==4) R2 = ploadu<ResPacket>(r2);
+        if(nr==4) R3 = ploadu<ResPacket>(r3);
+
+        traits.acc(C0, alphav, R0);
+        traits.acc(C1, alphav, R1);
+        if(nr==4) traits.acc(C2, alphav, R2);
+        if(nr==4) traits.acc(C3, alphav, R3);
+
+        pstoreu(r0, R0);
+        pstoreu(r1, R1);
+        if(nr==4) pstoreu(r2, R2);
+        if(nr==4) pstoreu(r3, R3);
+      }
+      for(Index i=peeled_mc2; i<rows; i++)
+      {
+        // ... (scalar tail for the last rows)
+      }
+    }
+
+    // process remaining rhs/res columns one at a time
+    // => do the same but with nr==1
+    for(Index j2=packet_cols; j2<cols; j2++)
+    {
+      // unpack B
+      traits.unpackRhs(depth, &blockB[j2*strideB+offsetB], unpackedB);
+
+      for(Index i=0; i<peeled_mc; i+=mr)
+      {
+        const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
+        prefetch(&blA[0]);
+
+        // gets res block as register
+        AccPacket C0, C4;
+        traits.initAcc(C0);
+        traits.initAcc(C4);
+
+        const RhsScalar* blB = unpackedB;
+        for(Index k=0; k<depth; k++)
+        {
+          // ... (traits.madd() steps accumulating C0 and C4)
+        }
+
+        ResPacket R0, R4;
+        ResPacket alphav = pset1<ResPacket>(alpha);
+
+        ResScalar* r0 = &res[(j2+0)*resStride + i];
+
+        R0 = ploadu<ResPacket>(r0);
+        R4 = ploadu<ResPacket>(r0+ResPacketSize);
+
+        traits.acc(C0, alphav, R0);
+        traits.acc(C4, alphav, R4);
+
+        pstoreu(r0, R0);
+        pstoreu(r0+ResPacketSize, R4);
+      }
+      if(rows-peeled_mc>=LhsProgress)
+      {
+        Index i = peeled_mc;
+        const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
+        prefetch(&blA[0]);
+
+        AccPacket C0;
+        traits.initAcc(C0);
+
+        const RhsScalar* blB = unpackedB;
+        for(Index k=0; k<depth; k++)
+        {
+          // ... (traits.madd() steps accumulating C0)
+        }
+
+        ResPacket alphav = pset1<ResPacket>(alpha);
+        ResPacket R0 = ploadu<ResPacket>(&res[(j2+0)*resStride + i]);
+        traits.acc(C0, alphav, R0);
+        pstoreu(&res[(j2+0)*resStride + i], R0);
+      }
+      for(Index i=peeled_mc2; i<rows; i++)
+      {
+        // ... (scalar tail for the last rows)
+      }
+    }
+  }
+};
+
+template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs
+{
+  void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows,
+                  Index stride=0, Index offset=0)
+  {
+//     enum { PacketSize = packet_traits<Scalar>::size };
+    eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
+    conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+    const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs,lhsStride);
+    Index count = 0;
+    Index peeled_mc = (rows/Pack1)*Pack1;
+    for(Index i=0; i<peeled_mc; i+=Pack1)
+    {
+      // ... (copies a Pack1 x depth panel of the lhs into blockA)
+    }
+    if(rows-peeled_mc>=Pack2)
+    {
+      if(PanelMode) count += Pack2*offset;
+      for(Index k=0; k<depth; k++)
+      {
+        // ... (copies a Pack2 x depth panel of the lhs into blockA)
+      }
+      // ...
+    }
+    // ... (remaining rows, copied one by one)
+  }
+};
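A scalar sketch of the layout gemm_pack_lhs above produces for a column-major lhs, with conjugation and panel mode ignored and Pack1 playing the role of mr (the helper name is illustrative):

#include <cstddef>

// For each Pack1-row panel, the Pack1 rows of column k are stored
// contiguously, then the rows of column k+1, and so on, so the kernel can
// walk blockA with unit stride. Leftover rows are copied one by one.
void pack_lhs_model(float* blockA, const float* lhs, std::ptrdiff_t lhsStride,
                    std::ptrdiff_t depth, std::ptrdiff_t rows, std::ptrdiff_t mr)
{
  std::ptrdiff_t count = 0;
  std::ptrdiff_t peeled = (rows/mr)*mr;
  for(std::ptrdiff_t i=0; i<peeled; i+=mr)        // one mr-row panel at a time
    for(std::ptrdiff_t k=0; k<depth; k++)
      for(std::ptrdiff_t w=0; w<mr; w++)
        blockA[count++] = lhs[(i+w) + k*lhsStride];
  for(std::ptrdiff_t i=peeled; i<rows; i++)       // leftover rows
    for(std::ptrdiff_t k=0; k<depth; k++)
      blockA[count++] = lhs[i + k*lhsStride];
}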
+
+template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
+{
+  typedef typename packet_traits<Scalar>::type Packet;
+  enum { PacketSize = packet_traits<Scalar>::size };
+  void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols,
+                  Index stride=0, Index offset=0)
+  {
+    eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
+    conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+    Index packet_cols = (cols/nr) * nr;
+    Index count = 0;
+    for(Index j2=0; j2<packet_cols; j2+=nr)
+    {
+      // ... (interleaves nr rhs columns into blockB)
+    }
+    // ... (remaining columns, one at a time)
+  }
+};
+
+template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
+{
+  enum { PacketSize = packet_traits<Scalar>::size };
+  void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols,
+                  Index stride=0, Index offset=0)
+  {
+    eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
+    conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+    Index packet_cols = (cols/nr) * nr;
+    Index count = 0;
+    for(Index j2=0; j2<packet_cols; j2+=nr)