diff options
Diffstat (limited to 'extern/Eigen3/Eigen/src/Core/products/GeneralBlockPanelKernel.h')
-rw-r--r-- | extern/Eigen3/Eigen/src/Core/products/GeneralBlockPanelKernel.h | 302 |
1 files changed, 162 insertions, 140 deletions
diff --git a/extern/Eigen3/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/extern/Eigen3/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 5eb03c98ccf..bcdca5b0d23 100644 --- a/extern/Eigen3/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/extern/Eigen3/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -69,8 +69,8 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdi * - the number of scalars that fit into a packet (when vectorization is enabled). * * \sa setCpuCacheSizes */ -template<typename LhsScalar, typename RhsScalar, int KcFactor> -void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrdiff_t& n) +template<typename LhsScalar, typename RhsScalar, int KcFactor, typename SizeType> +void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) { EIGEN_UNUSED_VARIABLE(n); // Explanations: @@ -91,13 +91,13 @@ void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrd }; manage_caching_sizes(GetAction, &l1, &l2); - k = std::min<std::ptrdiff_t>(k, l1/kdiv); - std::ptrdiff_t _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0; + k = std::min<SizeType>(k, l1/kdiv); + SizeType _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0; if(_m<m) m = _m & mr_mask; } -template<typename LhsScalar, typename RhsScalar> -inline void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrdiff_t& n) +template<typename LhsScalar, typename RhsScalar, typename SizeType> +inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) { computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n); } @@ -527,9 +527,16 @@ struct gebp_kernel ResPacketSize = Traits::ResPacketSize }; - EIGEN_DONT_INLINE EIGEN_FLATTEN_ATTRIB + EIGEN_DONT_INLINE void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, - Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB = 0) + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB=0); +}; + +template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> +EIGEN_DONT_INLINE +void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> + ::operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, + Index strideA, Index strideB, Index offsetA, Index offsetB, RhsScalar* unpackedB) { Traits traits; @@ -1089,7 +1096,7 @@ EIGEN_ASM_COMMENT("mybegin4"); } } } -}; + #undef CJMADD @@ -1110,80 +1117,85 @@ EIGEN_ASM_COMMENT("mybegin4"); template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode> struct gemm_pack_lhs { - EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, - Index stride=0, Index offset=0) + EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0); +}; + +template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode> +EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, StorageOrder, Conjugate, PanelMode> + ::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset) +{ + typedef typename packet_traits<Scalar>::type Packet; + enum { PacketSize = packet_traits<Scalar>::size }; + + EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); + EIGEN_UNUSED_VARIABLE(stride) + EIGEN_UNUSED_VARIABLE(offset) + eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); + eigen_assert( (StorageOrder==RowMajor) || ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) ); + conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; + const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs,lhsStride); + Index count = 0; + Index peeled_mc = (rows/Pack1)*Pack1; + for(Index i=0; i<peeled_mc; i+=Pack1) { - typedef typename packet_traits<Scalar>::type Packet; - enum { PacketSize = packet_traits<Scalar>::size }; - - EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); - eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); - eigen_assert( (StorageOrder==RowMajor) || ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) ); - conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; - const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs,lhsStride); - Index count = 0; - Index peeled_mc = (rows/Pack1)*Pack1; - for(Index i=0; i<peeled_mc; i+=Pack1) - { - if(PanelMode) count += Pack1 * offset; + if(PanelMode) count += Pack1 * offset; - if(StorageOrder==ColMajor) + if(StorageOrder==ColMajor) + { + for(Index k=0; k<depth; k++) { - for(Index k=0; k<depth; k++) - { - Packet A, B, C, D; - if(Pack1>=1*PacketSize) A = ploadu<Packet>(&lhs(i+0*PacketSize, k)); - if(Pack1>=2*PacketSize) B = ploadu<Packet>(&lhs(i+1*PacketSize, k)); - if(Pack1>=3*PacketSize) C = ploadu<Packet>(&lhs(i+2*PacketSize, k)); - if(Pack1>=4*PacketSize) D = ploadu<Packet>(&lhs(i+3*PacketSize, k)); - if(Pack1>=1*PacketSize) { pstore(blockA+count, cj.pconj(A)); count+=PacketSize; } - if(Pack1>=2*PacketSize) { pstore(blockA+count, cj.pconj(B)); count+=PacketSize; } - if(Pack1>=3*PacketSize) { pstore(blockA+count, cj.pconj(C)); count+=PacketSize; } - if(Pack1>=4*PacketSize) { pstore(blockA+count, cj.pconj(D)); count+=PacketSize; } - } + Packet A, B, C, D; + if(Pack1>=1*PacketSize) A = ploadu<Packet>(&lhs(i+0*PacketSize, k)); + if(Pack1>=2*PacketSize) B = ploadu<Packet>(&lhs(i+1*PacketSize, k)); + if(Pack1>=3*PacketSize) C = ploadu<Packet>(&lhs(i+2*PacketSize, k)); + if(Pack1>=4*PacketSize) D = ploadu<Packet>(&lhs(i+3*PacketSize, k)); + if(Pack1>=1*PacketSize) { pstore(blockA+count, cj.pconj(A)); count+=PacketSize; } + if(Pack1>=2*PacketSize) { pstore(blockA+count, cj.pconj(B)); count+=PacketSize; } + if(Pack1>=3*PacketSize) { pstore(blockA+count, cj.pconj(C)); count+=PacketSize; } + if(Pack1>=4*PacketSize) { pstore(blockA+count, cj.pconj(D)); count+=PacketSize; } } - else + } + else + { + for(Index k=0; k<depth; k++) { - for(Index k=0; k<depth; k++) + // TODO add a vectorized transpose here + Index w=0; + for(; w<Pack1-3; w+=4) { - // TODO add a vectorized transpose here - Index w=0; - for(; w<Pack1-3; w+=4) - { - Scalar a(cj(lhs(i+w+0, k))), - b(cj(lhs(i+w+1, k))), - c(cj(lhs(i+w+2, k))), - d(cj(lhs(i+w+3, k))); - blockA[count++] = a; - blockA[count++] = b; - blockA[count++] = c; - blockA[count++] = d; - } - if(Pack1%4) - for(;w<Pack1;++w) - blockA[count++] = cj(lhs(i+w, k)); + Scalar a(cj(lhs(i+w+0, k))), + b(cj(lhs(i+w+1, k))), + c(cj(lhs(i+w+2, k))), + d(cj(lhs(i+w+3, k))); + blockA[count++] = a; + blockA[count++] = b; + blockA[count++] = c; + blockA[count++] = d; } + if(Pack1%4) + for(;w<Pack1;++w) + blockA[count++] = cj(lhs(i+w, k)); } - if(PanelMode) count += Pack1 * (stride-offset-depth); - } - if(rows-peeled_mc>=Pack2) - { - if(PanelMode) count += Pack2*offset; - for(Index k=0; k<depth; k++) - for(Index w=0; w<Pack2; w++) - blockA[count++] = cj(lhs(peeled_mc+w, k)); - if(PanelMode) count += Pack2 * (stride-offset-depth); - peeled_mc += Pack2; - } - for(Index i=peeled_mc; i<rows; i++) - { - if(PanelMode) count += offset; - for(Index k=0; k<depth; k++) - blockA[count++] = cj(lhs(i, k)); - if(PanelMode) count += (stride-offset-depth); } + if(PanelMode) count += Pack1 * (stride-offset-depth); } -}; + if(rows-peeled_mc>=Pack2) + { + if(PanelMode) count += Pack2*offset; + for(Index k=0; k<depth; k++) + for(Index w=0; w<Pack2; w++) + blockA[count++] = cj(lhs(peeled_mc+w, k)); + if(PanelMode) count += Pack2 * (stride-offset-depth); + peeled_mc += Pack2; + } + for(Index i=peeled_mc; i<rows; i++) + { + if(PanelMode) count += offset; + for(Index k=0; k<depth; k++) + blockA[count++] = cj(lhs(i, k)); + if(PanelMode) count += (stride-offset-depth); + } +} // copy a complete panel of the rhs // this version is optimized for column major matrices @@ -1197,92 +1209,102 @@ struct gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode> { typedef typename packet_traits<Scalar>::type Packet; enum { PacketSize = packet_traits<Scalar>::size }; - EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, - Index stride=0, Index offset=0) + EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0); +}; + +template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode> +EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode> + ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset) +{ + EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR"); + EIGEN_UNUSED_VARIABLE(stride) + EIGEN_UNUSED_VARIABLE(offset) + eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); + conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; + Index packet_cols = (cols/nr) * nr; + Index count = 0; + for(Index j2=0; j2<packet_cols; j2+=nr) { - EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR"); - eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); - conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; - Index packet_cols = (cols/nr) * nr; - Index count = 0; - for(Index j2=0; j2<packet_cols; j2+=nr) + // skip what we have before + if(PanelMode) count += nr * offset; + const Scalar* b0 = &rhs[(j2+0)*rhsStride]; + const Scalar* b1 = &rhs[(j2+1)*rhsStride]; + const Scalar* b2 = &rhs[(j2+2)*rhsStride]; + const Scalar* b3 = &rhs[(j2+3)*rhsStride]; + for(Index k=0; k<depth; k++) { - // skip what we have before - if(PanelMode) count += nr * offset; - const Scalar* b0 = &rhs[(j2+0)*rhsStride]; - const Scalar* b1 = &rhs[(j2+1)*rhsStride]; - const Scalar* b2 = &rhs[(j2+2)*rhsStride]; - const Scalar* b3 = &rhs[(j2+3)*rhsStride]; - for(Index k=0; k<depth; k++) - { - blockB[count+0] = cj(b0[k]); - blockB[count+1] = cj(b1[k]); - if(nr==4) blockB[count+2] = cj(b2[k]); - if(nr==4) blockB[count+3] = cj(b3[k]); - count += nr; - } - // skip what we have after - if(PanelMode) count += nr * (stride-offset-depth); + blockB[count+0] = cj(b0[k]); + blockB[count+1] = cj(b1[k]); + if(nr==4) blockB[count+2] = cj(b2[k]); + if(nr==4) blockB[count+3] = cj(b3[k]); + count += nr; } + // skip what we have after + if(PanelMode) count += nr * (stride-offset-depth); + } - // copy the remaining columns one at a time (nr==1) - for(Index j2=packet_cols; j2<cols; ++j2) + // copy the remaining columns one at a time (nr==1) + for(Index j2=packet_cols; j2<cols; ++j2) + { + if(PanelMode) count += offset; + const Scalar* b0 = &rhs[(j2+0)*rhsStride]; + for(Index k=0; k<depth; k++) { - if(PanelMode) count += offset; - const Scalar* b0 = &rhs[(j2+0)*rhsStride]; - for(Index k=0; k<depth; k++) - { - blockB[count] = cj(b0[k]); - count += 1; - } - if(PanelMode) count += (stride-offset-depth); + blockB[count] = cj(b0[k]); + count += 1; } + if(PanelMode) count += (stride-offset-depth); } -}; +} // this version is optimized for row major matrices template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode> struct gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode> { enum { PacketSize = packet_traits<Scalar>::size }; - EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, - Index stride=0, Index offset=0) + EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0); +}; + +template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode> +EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode> + ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset) +{ + EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR"); + EIGEN_UNUSED_VARIABLE(stride) + EIGEN_UNUSED_VARIABLE(offset) + eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); + conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; + Index packet_cols = (cols/nr) * nr; + Index count = 0; + for(Index j2=0; j2<packet_cols; j2+=nr) { - EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR"); - eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); - conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; - Index packet_cols = (cols/nr) * nr; - Index count = 0; - for(Index j2=0; j2<packet_cols; j2+=nr) + // skip what we have before + if(PanelMode) count += nr * offset; + for(Index k=0; k<depth; k++) { - // skip what we have before - if(PanelMode) count += nr * offset; - for(Index k=0; k<depth; k++) - { - const Scalar* b0 = &rhs[k*rhsStride + j2]; - blockB[count+0] = cj(b0[0]); - blockB[count+1] = cj(b0[1]); - if(nr==4) blockB[count+2] = cj(b0[2]); - if(nr==4) blockB[count+3] = cj(b0[3]); - count += nr; - } - // skip what we have after - if(PanelMode) count += nr * (stride-offset-depth); + const Scalar* b0 = &rhs[k*rhsStride + j2]; + blockB[count+0] = cj(b0[0]); + blockB[count+1] = cj(b0[1]); + if(nr==4) blockB[count+2] = cj(b0[2]); + if(nr==4) blockB[count+3] = cj(b0[3]); + count += nr; } - // copy the remaining columns one at a time (nr==1) - for(Index j2=packet_cols; j2<cols; ++j2) + // skip what we have after + if(PanelMode) count += nr * (stride-offset-depth); + } + // copy the remaining columns one at a time (nr==1) + for(Index j2=packet_cols; j2<cols; ++j2) + { + if(PanelMode) count += offset; + const Scalar* b0 = &rhs[j2]; + for(Index k=0; k<depth; k++) { - if(PanelMode) count += offset; - const Scalar* b0 = &rhs[j2]; - for(Index k=0; k<depth; k++) - { - blockB[count] = cj(b0[k*rhsStride]); - count += 1; - } - if(PanelMode) count += stride-offset-depth; + blockB[count] = cj(b0[k*rhsStride]); + count += 1; } + if(PanelMode) count += stride-offset-depth; } -}; +} } // end namespace internal |