Diffstat (limited to 'extern/Eigen2/Eigen/src/Core/CacheFriendlyProduct.h')
-rw-r--r-- | extern/Eigen2/Eigen/src/Core/CacheFriendlyProduct.h | 753 |
1 file changed, 753 insertions, 0 deletions
diff --git a/extern/Eigen2/Eigen/src/Core/CacheFriendlyProduct.h b/extern/Eigen2/Eigen/src/Core/CacheFriendlyProduct.h
new file mode 100644
index 00000000000..b1362b0a80c
--- /dev/null
+++ b/extern/Eigen2/Eigen/src/Core/CacheFriendlyProduct.h
@@ -0,0 +1,753 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra. Eigen itself is part of the KDE project.
+//
+// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
+//
+// Eigen is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 3 of the License, or (at your option) any later version.
+//
+// Alternatively, you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation; either version 2 of
+// the License, or (at your option) any later version.
+//
+// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License and a copy of the GNU General Public License along with
+// Eigen. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef EIGEN_CACHE_FRIENDLY_PRODUCT_H
+#define EIGEN_CACHE_FRIENDLY_PRODUCT_H
+
+template <int L2MemorySize,typename Scalar>
+struct ei_L2_block_traits {
+  enum {width = 8 * ei_meta_sqrt<L2MemorySize/(64*sizeof(Scalar))>::ret };
+};
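A note on this trait: since `width = 8*sqrt(L2MemorySize/(64*sizeof(Scalar)))`, squaring gives `width*width*sizeof(Scalar) == L2MemorySize` up to the flooring of the integer square root, i.e. one width-by-width block of Scalars is sized to fill the L2 budget. A standalone sketch of the computation, with `ei_meta_sqrt` replaced by an ordinary constexpr integer square root (assumed to round down, like Eigen's) and a 256KB cache chosen purely as an example value:

```cpp
#include <cstdio>

// constexpr stand-in for ei_meta_sqrt: largest r with r*r <= n
constexpr int isqrt(int n, int r = 0) {
    return (r + 1) * (r + 1) > n ? r : isqrt(n, r + 1);
}

template <int L2MemorySize, typename Scalar>
struct block_traits {                    // illustrative name, not Eigen's
    enum { width = 8 * isqrt(L2MemorySize / (64 * int(sizeof(Scalar)))) };
};

int main() {
    // with a hypothetical 256KB L2 cache:
    std::printf("float : %d\n", block_traits<262144, float>::width);   // 8*isqrt(1024) = 256
    std::printf("double: %d\n", block_traits<262144, double>::width);  // 8*isqrt(512)  = 176
    return 0;
}
```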
+
+#ifndef EIGEN_EXTERN_INSTANTIATIONS
+
+template<typename Scalar>
+static void ei_cache_friendly_product(
+  int _rows, int _cols, int depth,
+  bool _lhsRowMajor, const Scalar* _lhs, int _lhsStride,
+  bool _rhsRowMajor, const Scalar* _rhs, int _rhsStride,
+  bool resRowMajor, Scalar* res, int resStride)
+{
+  const Scalar* EIGEN_RESTRICT lhs;
+  const Scalar* EIGEN_RESTRICT rhs;
+  int lhsStride, rhsStride, rows, cols;
+  bool lhsRowMajor;
+
+  if (resRowMajor)
+  {
+    lhs = _rhs;
+    rhs = _lhs;
+    lhsStride = _rhsStride;
+    rhsStride = _lhsStride;
+    cols = _rows;
+    rows = _cols;
+    lhsRowMajor = !_rhsRowMajor;
+    ei_assert(_lhsRowMajor);
+  }
+  else
+  {
+    lhs = _lhs;
+    rhs = _rhs;
+    lhsStride = _lhsStride;
+    rhsStride = _rhsStride;
+    rows = _rows;
+    cols = _cols;
+    lhsRowMajor = _lhsRowMajor;
+    ei_assert(!_rhsRowMajor);
+  }
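This branch reduces a row-major result to the column-major case through the identity (A·B)ᵀ = Bᵀ·Aᵀ: because reading a row-major buffer as column-major data yields the transpose, it suffices to swap the operands and flip their storage-order flags, with no data movement. A minimal scalar illustration (not part of the patch; all names illustrative):

```cpp
#include <cassert>

int main() {
    const double A[2][3] = {{1, 2, 3}, {4, 5, 6}};       // 2x3
    const double B[3][2] = {{7, 8}, {9, 10}, {11, 12}};  // 3x2

    double C[2][2]  = {};  // C = A*B, row-major
    double Ct[2][2] = {};  // B^T * A^T = C^T, row-major

    for (int i = 0; i < 2; ++i)
      for (int j = 0; j < 2; ++j)
        for (int k = 0; k < 3; ++k) {
          C[i][j]  += A[i][k] * B[k][j];
          Ct[j][i] += B[k][j] * A[i][k];  // operands swapped, indices transposed
        }

    // row-major C coincides element-wise with Ct read with transposed
    // indices, i.e. with the column-major storage of C
    for (int i = 0; i < 2; ++i)
      for (int j = 0; j < 2; ++j)
        assert(C[i][j] == Ct[j][i]);
    return 0;
}
```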
+
+  typedef typename ei_packet_traits<Scalar>::type PacketType;
+
+  enum {
+    PacketSize = sizeof(PacketType)/sizeof(Scalar),
+    #if (defined __i386__)
+    // the i386 architecture provides only 8 xmm registers,
+    // so let's reduce the max number of rows processed at once.
+    MaxBlockRows = 4,
+    MaxBlockRows_ClampingMask = 0xFFFFFC,
+    #else
+    MaxBlockRows = 8,
+    MaxBlockRows_ClampingMask = 0xFFFFF8,
+    #endif
+    // maximal size of the blocks fitted in L2 cache
+    MaxL2BlockSize = ei_L2_block_traits<EIGEN_TUNE_FOR_CPU_CACHE_SIZE,Scalar>::width
+  };
+
+  const bool resIsAligned = (PacketSize==1) || (((resStride%PacketSize) == 0) && (size_t(res)%16==0));
+
+  const int remainingSize = depth % PacketSize;
+  const int size = depth - remainingSize; // third dimension of the product clamped to packet boundaries
+  const int l2BlockRows = MaxL2BlockSize > rows ? rows : MaxL2BlockSize;
+  const int l2BlockCols = MaxL2BlockSize > cols ? cols : MaxL2BlockSize;
+  const int l2BlockSize = MaxL2BlockSize > size ? size : MaxL2BlockSize;
+  const int l2BlockSizeAligned = (1 + std::max(l2BlockSize,l2BlockCols)/PacketSize)*PacketSize;
+  const bool needRhsCopy = (PacketSize>1) && ((rhsStride%PacketSize!=0) || (size_t(rhs)%16!=0));
+  Scalar* EIGEN_RESTRICT block = 0;
+  const int allocBlockSize = l2BlockRows*size;
+  block = ei_aligned_stack_new(Scalar, allocBlockSize);
+  Scalar* EIGEN_RESTRICT rhsCopy
+    = ei_aligned_stack_new(Scalar, l2BlockSizeAligned*l2BlockSizeAligned);
+
+  // loop over each L2-cache-friendly block of the result
+  for(int l2i=0; l2i<rows; l2i+=l2BlockRows)
+  {
+    const int l2blockRowEnd = std::min(l2i+l2BlockRows, rows);
+    const int l2blockRowEndBW = l2blockRowEnd & MaxBlockRows_ClampingMask; // end of the rows aligned to bw
+    const int l2blockRemainingRows = l2blockRowEnd - l2blockRowEndBW;      // number of remaining rows
+    //const int l2blockRowEndBWPlusOne = l2blockRowEndBW + (l2blockRemainingRows?0:MaxBlockRows);
+
+    // build a cache-friendly blocky matrix
+    int count = 0;
+
+    // copy l2BlockSize rows of the lhs into blocks of PacketSize x MaxBlockRows (ps x bw)
+    for(int l2k=0; l2k<size; l2k+=l2BlockSize)
+    {
+      const int l2blockSizeEnd = std::min(l2k+l2BlockSize, size);
+
+      for (int i = l2i; i<l2blockRowEndBW/*PlusOne*/; i+=MaxBlockRows)
+      {
+        // TODO merge the "if l2blockRemainingRows" using something like:
+        // const int blockRows = std::min(i+MaxBlockRows, rows) - i;
+
+        for (int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
+        {
+          // TODO write these loops using meta unrolling
+          // negligible for large matrices but useful for small ones
+          if (lhsRowMajor)
+          {
+            for (int w=0; w<MaxBlockRows; ++w)
+              for (int s=0; s<PacketSize; ++s)
+                block[count++] = lhs[(i+w)*lhsStride + (k+s)];
+          }
+          else
+          {
+            for (int w=0; w<MaxBlockRows; ++w)
+              for (int s=0; s<PacketSize; ++s)
+                block[count++] = lhs[(i+w) + (k+s)*lhsStride];
+          }
+        }
+      }
+      if (l2blockRemainingRows>0)
+      {
+        for (int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
+        {
+          if (lhsRowMajor)
+          {
+            for (int w=0; w<l2blockRemainingRows; ++w)
+              for (int s=0; s<PacketSize; ++s)
+                block[count++] = lhs[(l2blockRowEndBW+w)*lhsStride + (k+s)];
+          }
+          else
+          {
+            for (int w=0; w<l2blockRemainingRows; ++w)
+              for (int s=0; s<PacketSize; ++s)
+                block[count++] = lhs[(l2blockRowEndBW+w) + (k+s)*lhsStride];
+          }
+        }
+      }
+    }
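For reference, this is the layout the packing loop produces for a column-major lhs: each strip of MaxBlockRows rows is stored as interleaved PacketSize-wide slices of the depth dimension, so every `ei_pload` in the kernel below reads contiguous memory. A standalone sketch (not part of the patch) with the example values MaxBlockRows=4 and PacketSize=2:

```cpp
#include <cstdio>

int main() {
    const int rows = 4, depth = 4, MaxBlockRows = 4, PacketSize = 2;
    double lhs[rows * depth];                 // column-major: lhs[r + c*rows]
    for (int i = 0; i < rows * depth; ++i) lhs[i] = i;

    double block[rows * depth];
    int count = 0;
    for (int i = 0; i < rows; i += MaxBlockRows)
      for (int k = 0; k < depth; k += PacketSize)
        for (int w = 0; w < MaxBlockRows; ++w)    // row within the strip
          for (int s = 0; s < PacketSize; ++s)    // depth within the packet
            block[count++] = lhs[(i + w) + (k + s) * rows];

    // prints "0 4 1 5 2 6 3 7 8 12 9 13 10 14 11 15": each consecutive pair
    // such as {0,4} is one packet = one row, two consecutive depth indices
    for (int i = 0; i < count; ++i) std::printf("%g ", block[i]);
    std::printf("\n");
    return 0;
}
```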
+
+    for(int l2j=0; l2j<cols; l2j+=l2BlockCols)
+    {
+      int l2blockColEnd = std::min(l2j+l2BlockCols, cols);
+
+      for(int l2k=0; l2k<size; l2k+=l2BlockSize)
+      {
+        // accumulate bw rows of lhs times a single column of rhs into a bw x 1 block of res
+        int l2blockSizeEnd = std::min(l2k+l2BlockSize, size);
+
+        // if not aligned, copy the rhs block
+        if (needRhsCopy)
+          for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
+          {
+            ei_internal_assert(l2BlockSizeAligned*(l1j-l2j)+(l2blockSizeEnd-l2k) < l2BlockSizeAligned*l2BlockSizeAligned);
+            memcpy(rhsCopy+l2BlockSizeAligned*(l1j-l2j),&(rhs[l1j*rhsStride+l2k]),(l2blockSizeEnd-l2k)*sizeof(Scalar));
+          }
+
+        // for each bw x 1 block of the result
+        for(int l1i=l2i; l1i<l2blockRowEndBW; l1i+=MaxBlockRows)
+        {
+          int offsetblock = l2k * (l2blockRowEnd-l2i) + (l1i-l2i)*(l2blockSizeEnd-l2k) - l2k*MaxBlockRows;
+          const Scalar* EIGEN_RESTRICT localB = &block[offsetblock];
+
+          for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
+          {
+            const Scalar* EIGEN_RESTRICT rhsColumn;
+            if (needRhsCopy)
+              rhsColumn = &(rhsCopy[l2BlockSizeAligned*(l1j-l2j)-l2k]);
+            else
+              rhsColumn = &(rhs[l1j*rhsStride]);
+
+            PacketType dst[MaxBlockRows];
+            dst[3] = dst[2] = dst[1] = dst[0] = ei_pset1(Scalar(0.));
+            if (MaxBlockRows==8)
+              dst[7] = dst[6] = dst[5] = dst[4] = dst[0];
+
+            PacketType tmp;
+
+            for(int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
+            {
+              tmp = ei_ploadu(&rhsColumn[k]);
+              PacketType A0, A1, A2, A3, A4, A5;
+              A0 = ei_pload(localB + k*MaxBlockRows);
+              A1 = ei_pload(localB + k*MaxBlockRows+1*PacketSize);
+              A2 = ei_pload(localB + k*MaxBlockRows+2*PacketSize);
+              A3 = ei_pload(localB + k*MaxBlockRows+3*PacketSize);
+              if (MaxBlockRows==8) A4 = ei_pload(localB + k*MaxBlockRows+4*PacketSize);
+              if (MaxBlockRows==8) A5 = ei_pload(localB + k*MaxBlockRows+5*PacketSize);
+              dst[0] = ei_pmadd(tmp, A0, dst[0]);
+              if (MaxBlockRows==8) A0 = ei_pload(localB + k*MaxBlockRows+6*PacketSize);
+              dst[1] = ei_pmadd(tmp, A1, dst[1]);
+              if (MaxBlockRows==8) A1 = ei_pload(localB + k*MaxBlockRows+7*PacketSize);
+              dst[2] = ei_pmadd(tmp, A2, dst[2]);
+              dst[3] = ei_pmadd(tmp, A3, dst[3]);
+              if (MaxBlockRows==8)
+              {
+                dst[4] = ei_pmadd(tmp, A4, dst[4]);
+                dst[5] = ei_pmadd(tmp, A5, dst[5]);
+                dst[6] = ei_pmadd(tmp, A0, dst[6]);
+                dst[7] = ei_pmadd(tmp, A1, dst[7]);
+              }
+            }
+
+            Scalar* EIGEN_RESTRICT localRes = &(res[l1i + l1j*resStride]);
+
+            if (PacketSize>1 && resIsAligned)
+            {
+              // the result is aligned: let's do packet reduction
+              ei_pstore(&(localRes[0]), ei_padd(ei_pload(&(localRes[0])), ei_preduxp(&dst[0])));
+              if (PacketSize==2)
+                ei_pstore(&(localRes[2]), ei_padd(ei_pload(&(localRes[2])), ei_preduxp(&(dst[2]))));
+              if (MaxBlockRows==8)
+              {
+                ei_pstore(&(localRes[4]), ei_padd(ei_pload(&(localRes[4])), ei_preduxp(&(dst[4]))));
+                if (PacketSize==2)
+                  ei_pstore(&(localRes[6]), ei_padd(ei_pload(&(localRes[6])), ei_preduxp(&(dst[6]))));
+              }
+            }
+            else
+            {
+              // not aligned => per coeff packet reduction
+              localRes[0] += ei_predux(dst[0]);
+              localRes[1] += ei_predux(dst[1]);
+              localRes[2] += ei_predux(dst[2]);
+              localRes[3] += ei_predux(dst[3]);
+              if (MaxBlockRows==8)
+              {
+                localRes[4] += ei_predux(dst[4]);
+                localRes[5] += ei_predux(dst[5]);
+                localRes[6] += ei_predux(dst[6]);
+                localRes[7] += ei_predux(dst[7]);
+              }
+            }
+          }
+        }
+        if (l2blockRemainingRows>0)
+        {
+          int offsetblock = l2k * (l2blockRowEnd-l2i) + (l2blockRowEndBW-l2i)*(l2blockSizeEnd-l2k) - l2k*l2blockRemainingRows;
+          const Scalar* localB = &block[offsetblock];
+
+          for(int l1j=l2j; l1j<l2blockColEnd; l1j+=1)
+          {
+            const Scalar* EIGEN_RESTRICT rhsColumn;
+            if (needRhsCopy)
+              rhsColumn = &(rhsCopy[l2BlockSizeAligned*(l1j-l2j)-l2k]);
+            else
+              rhsColumn = &(rhs[l1j*rhsStride]);
+
+            PacketType dst[MaxBlockRows];
+            dst[3] = dst[2] = dst[1] = dst[0] = ei_pset1(Scalar(0.));
+            if (MaxBlockRows==8)
+              dst[7] = dst[6] = dst[5] = dst[4] = dst[0];
+
+            // let's declare a few other temporary registers
+            PacketType tmp;
+
+            for(int k=l2k; k<l2blockSizeEnd; k+=PacketSize)
+            {
+              tmp = ei_pload(&rhsColumn[k]);
+
+              dst[0] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows             ])), dst[0]);
+              if (l2blockRemainingRows>=2) dst[1] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+  PacketSize])), dst[1]);
+              if (l2blockRemainingRows>=3) dst[2] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+2*PacketSize])), dst[2]);
+              if (l2blockRemainingRows>=4) dst[3] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+3*PacketSize])), dst[3]);
+              if (MaxBlockRows==8)
+              {
+                if (l2blockRemainingRows>=5) dst[4] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+4*PacketSize])), dst[4]);
+                if (l2blockRemainingRows>=6) dst[5] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+5*PacketSize])), dst[5]);
+                if (l2blockRemainingRows>=7) dst[6] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+6*PacketSize])), dst[6]);
+                if (l2blockRemainingRows>=8) dst[7] = ei_pmadd(tmp, ei_pload(&(localB[k*l2blockRemainingRows+7*PacketSize])), dst[7]);
+              }
+            }
+
+            Scalar* EIGEN_RESTRICT localRes = &(res[l2blockRowEndBW + l1j*resStride]);
+
+            // process the remaining rows one at a time
+            localRes[0] += ei_predux(dst[0]);
+            if (l2blockRemainingRows>=2) localRes[1] += ei_predux(dst[1]);
+            if (l2blockRemainingRows>=3) localRes[2] += ei_predux(dst[2]);
+            if (l2blockRemainingRows>=4) localRes[3] += ei_predux(dst[3]);
+            if (MaxBlockRows==8)
+            {
+              if (l2blockRemainingRows>=5) localRes[4] += ei_predux(dst[4]);
+              if (l2blockRemainingRows>=6) localRes[5] += ei_predux(dst[5]);
+              if (l2blockRemainingRows>=7) localRes[6] += ei_predux(dst[6]);
+              if (l2blockRemainingRows>=8) localRes[7] += ei_predux(dst[7]);
+            }
+
+          }
+        }
+      }
+    }
+  }
+  if (PacketSize>1 && remainingSize)
+  {
+    if (lhsRowMajor)
+    {
+      for (int j=0; j<cols; ++j)
+        for (int i=0; i<rows; ++i)
+        {
+          Scalar tmp = lhs[i*lhsStride+size] * rhs[j*rhsStride+size];
+          // FIXME this loop gets vectorized by the compiler!
+          for (int k=1; k<remainingSize; ++k)
+            tmp += lhs[i*lhsStride+size+k] * rhs[j*rhsStride+size+k];
+          res[i+j*resStride] += tmp;
+        }
+    }
+    else
+    {
+      for (int j=0; j<cols; ++j)
+        for (int i=0; i<rows; ++i)
+        {
+          Scalar tmp = lhs[i+size*lhsStride] * rhs[j*rhsStride+size];
+          for (int k=1; k<remainingSize; ++k)
+            tmp += lhs[i+(size+k)*lhsStride] * rhs[j*rhsStride+size+k];
+          res[i+j*resStride] += tmp;
+        }
+    }
+  }
+
+  ei_aligned_stack_delete(Scalar, block, allocBlockSize);
+  ei_aligned_stack_delete(Scalar, rhsCopy, l2BlockSizeAligned*l2BlockSizeAligned);
+}
+
+#endif // EIGEN_EXTERN_INSTANTIATIONS
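Read as a contract, this kernel accumulates into `res` (note the `+=` stores, so the caller must zero-initialize it), takes strides in units of scalars, and accepts either storage order per operand. A naive reference with the same contract, purely illustrative and not Eigen code, makes a useful mental model and test oracle:

```cpp
#include <cassert>

template <typename Scalar>
void reference_product(int rows, int cols, int depth,
                       bool lhsRowMajor, const Scalar* lhs, int lhsStride,
                       bool rhsRowMajor, const Scalar* rhs, int rhsStride,
                       bool resRowMajor, Scalar* res, int resStride) {
    for (int i = 0; i < rows; ++i)
      for (int j = 0; j < cols; ++j) {
        Scalar tmp = Scalar(0);
        for (int k = 0; k < depth; ++k) {
          Scalar a = lhsRowMajor ? lhs[i * lhsStride + k] : lhs[i + k * lhsStride];
          Scalar b = rhsRowMajor ? rhs[k * rhsStride + j] : rhs[k + j * rhsStride];
          tmp += a * b;
        }
        // res is accumulated into, matching the kernel above
        if (resRowMajor) res[i * resStride + j] += tmp;
        else             res[i + j * resStride] += tmp;
      }
}

int main() {
    // 2x2 column-major example; C must be zeroed by the caller
    const double A[4] = {1, 3, 2, 4};   // [[1,2],[3,4]] column-major
    const double B[4] = {5, 7, 6, 8};   // [[5,6],[7,8]] column-major
    double C[4] = {};
    reference_product(2, 2, 2, false, A, 2, false, B, 2, false, C, 2);
    assert(C[0] == 19 && C[1] == 43 && C[2] == 22 && C[3] == 50);
    return 0;
}
```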
+
+/* Optimized col-major matrix * vector product:
+ * This algorithm processes 4 columns at once, which both reduces the
+ * number of loads/stores of the result by a factor of 4 and reduces
+ * instruction dependencies. Moreover, we know that all bands have the
+ * same alignment pattern.
+ * TODO: since rhs gets evaluated only once, no need to evaluate it
+ */
+template<typename Scalar, typename RhsType>
+static EIGEN_DONT_INLINE void ei_cache_friendly_product_colmajor_times_vector(
+  int size,
+  const Scalar* lhs, int lhsStride,
+  const RhsType& rhs,
+  Scalar* res)
+{
+  #ifdef _EIGEN_ACCUMULATE_PACKETS
+  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
+  #endif
+  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \
+    ei_pstore(&res[j], \
+      ei_padd(ei_pload(&res[j]), \
+        ei_padd( \
+          ei_padd(ei_pmul(ptmp0,EIGEN_CAT(ei_ploa , A0)(&lhs0[j])), \
+                  ei_pmul(ptmp1,EIGEN_CAT(ei_ploa , A13)(&lhs1[j]))), \
+          ei_padd(ei_pmul(ptmp2,EIGEN_CAT(ei_ploa , A2)(&lhs2[j])), \
+                  ei_pmul(ptmp3,EIGEN_CAT(ei_ploa , A13)(&lhs3[j]))) )))
+
+  typedef typename ei_packet_traits<Scalar>::type Packet;
+  const int PacketSize = sizeof(Packet)/sizeof(Scalar);
+
+  enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
+  const int columnsAtOnce = 4;
+  const int peels = 2;
+  const int PacketAlignedMask = PacketSize-1;
+  const int PeelAlignedMask = PacketSize*peels-1;
+
+  // How many coeffs of the result do we have to skip to be aligned?
+  // Here we assume the data is at least aligned on the base scalar type, which is mandatory anyway.
+  const int alignedStart = ei_alignmentOffset(res,size);
+  const int alignedSize = PacketSize>1 ? alignedStart + ((size-alignedStart) & ~PacketAlignedMask) : 0;
+  const int peeledSize  = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart;
+
+  const int alignmentStep = PacketSize>1 ? (PacketSize - lhsStride % PacketSize) & PacketAlignedMask : 0;
+  int alignmentPattern = alignmentStep==0 ? AllAligned
+                       : alignmentStep==(PacketSize/2) ? EvenAligned
+                       : FirstAligned;
+
+  // we cannot assume the first element is aligned because of sub-matrices
+  const int lhsAlignmentOffset = ei_alignmentOffset(lhs,size);
+
+  // find how many columns we have to skip to be aligned with the result (if possible)
+  int skipColumns = 0;
+  if (PacketSize>1)
+  {
+    ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0 || size<PacketSize);
+
+    while (skipColumns<PacketSize &&
+           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%PacketSize))
+      ++skipColumns;
+    if (skipColumns==PacketSize)
+    {
+      // nothing can be aligned, no need to skip any column
+      alignmentPattern = NoneAligned;
+      skipColumns = 0;
+    }
+    else
+    {
+      skipColumns = std::min(skipColumns,rhs.size());
+      // note that the skipped columns are processed later.
+    }
+
+    ei_internal_assert((alignmentPattern==NoneAligned) || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(Packet))==0);
+  }
+
+  int offset1 = (FirstAligned && alignmentStep==1?3:1);
+  int offset3 = (FirstAligned && alignmentStep==1?1:3);
+
+  int columnBound = ((rhs.size()-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
+  for (int i=skipColumns; i<columnBound; i+=columnsAtOnce)
+  {
+    Packet ptmp0 = ei_pset1(rhs[i]),   ptmp1 = ei_pset1(rhs[i+offset1]),
+           ptmp2 = ei_pset1(rhs[i+2]), ptmp3 = ei_pset1(rhs[i+offset3]);
+
+    // this helps a lot in generating better binary code
+    const Scalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
+                 *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
+
+    if (PacketSize>1)
+    {
+      /* explicit vectorization */
+      // process initial unaligned coeffs
+      for (int j=0; j<alignedStart; ++j)
+        res[j] += ei_pfirst(ptmp0)*lhs0[j] + ei_pfirst(ptmp1)*lhs1[j] + ei_pfirst(ptmp2)*lhs2[j] + ei_pfirst(ptmp3)*lhs3[j];
+
+      if (alignedSize>alignedStart)
+      {
+        switch(alignmentPattern)
+        {
+          case AllAligned:
+            for (int j = alignedStart; j<alignedSize; j+=PacketSize)
+              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
+            break;
+          case EvenAligned:
+            for (int j = alignedStart; j<alignedSize; j+=PacketSize)
+              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
+            break;
+          case FirstAligned:
+            if(peels>1)
+            {
+              Packet A00, A01, A02, A03, A10, A11, A12, A13;
+
+              A01 = ei_pload(&lhs1[alignedStart-1]);
+              A02 = ei_pload(&lhs2[alignedStart-2]);
+              A03 = ei_pload(&lhs3[alignedStart-3]);
+
+              for (int j = alignedStart; j<peeledSize; j+=peels*PacketSize)
+              {
+                A11 = ei_pload(&lhs1[j-1+PacketSize]);  ei_palign<1>(A01,A11);
+                A12 = ei_pload(&lhs2[j-2+PacketSize]);  ei_palign<2>(A02,A12);
+                A13 = ei_pload(&lhs3[j-3+PacketSize]);  ei_palign<3>(A03,A13);
+
+                A00 = ei_pload (&lhs0[j]);
+                A10 = ei_pload (&lhs0[j+PacketSize]);
+                A00 = ei_pmadd(ptmp0, A00, ei_pload(&res[j]));
+                A10 = ei_pmadd(ptmp0, A10, ei_pload(&res[j+PacketSize]));
+
+                A00 = ei_pmadd(ptmp1, A01, A00);
+                A01 = ei_pload(&lhs1[j-1+2*PacketSize]);  ei_palign<1>(A11,A01);
+                A00 = ei_pmadd(ptmp2, A02, A00);
+                A02 = ei_pload(&lhs2[j-2+2*PacketSize]);  ei_palign<2>(A12,A02);
+                A00 = ei_pmadd(ptmp3, A03, A00);
+                ei_pstore(&res[j],A00);
+                A03 = ei_pload(&lhs3[j-3+2*PacketSize]);  ei_palign<3>(A13,A03);
+                A10 = ei_pmadd(ptmp1, A11, A10);
+                A10 = ei_pmadd(ptmp2, A12, A10);
+                A10 = ei_pmadd(ptmp3, A13, A10);
+                ei_pstore(&res[j+PacketSize],A10);
+              }
+            }
+            for (int j = peeledSize; j<alignedSize; j+=PacketSize)
+              _EIGEN_ACCUMULATE_PACKETS(d,du,du);
+            break;
+          default:
+            for (int j = alignedStart; j<alignedSize; j+=PacketSize)
+              _EIGEN_ACCUMULATE_PACKETS(du,du,du);
+            break;
+        }
+      }
+    } // end explicit vectorization
+
+    /* process remaining coeffs (or all if there is no explicit vectorization) */
+    for (int j=alignedSize; j<size; ++j)
+      res[j] += ei_pfirst(ptmp0)*lhs0[j] + ei_pfirst(ptmp1)*lhs1[j] + ei_pfirst(ptmp2)*lhs2[j] + ei_pfirst(ptmp3)*lhs3[j];
+  }
+
+  // process remaining first and last columns (at most columnsAtOnce-1)
+  int end = rhs.size();
+  int start = columnBound;
+  do
+  {
+    for (int i=start; i<end; ++i)
+    {
+      Packet ptmp0 = ei_pset1(rhs[i]);
+      const Scalar* lhs0 = lhs + i*lhsStride;
+
+      if (PacketSize>1)
+      {
+        /* explicit vectorization */
+        // process first unaligned result's coeffs
+        for (int j=0; j<alignedStart; ++j)
+          res[j] += ei_pfirst(ptmp0) * lhs0[j];
+
+        // process aligned result's coeffs
+        if ((size_t(lhs0+alignedStart)%sizeof(Packet))==0)
+          for (int j = alignedStart;j<alignedSize;j+=PacketSize)
+            ei_pstore(&res[j], ei_pmadd(ptmp0,ei_pload(&lhs0[j]),ei_pload(&res[j])));
+        else
+          for (int j = alignedStart;j<alignedSize;j+=PacketSize)
+            ei_pstore(&res[j], ei_pmadd(ptmp0,ei_ploadu(&lhs0[j]),ei_pload(&res[j])));
+      }
+
+      // process remaining scalars (or all if no explicit vectorization)
+      for (int j=alignedSize; j<size; ++j)
+        res[j] += ei_pfirst(ptmp0) * lhs0[j];
+    }
+    if (skipColumns)
+    {
+      start = 0;
+      end = skipColumns;
+      skipColumns = 0;
+    }
+    else
+      break;
+  } while(PacketSize>1);
+  #undef _EIGEN_ACCUMULATE_PACKETS
+}
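Stripped of packets, alignment handling and peeling, the loop structure above is the classic column-major matrix*vector product as a sequence of axpy updates, four columns per pass so that each `res[j]` is loaded and stored once per four multiply-adds. A scalar skeleton under those simplifications (not part of the patch; names illustrative):

```cpp
#include <cassert>

void colmajor_times_vector_ref(int size, int cols, const double* lhs,
                               int lhsStride, const double* rhs, double* res) {
    int i = 0;
    for (; i + 4 <= cols; i += 4) {           // 4 columns at once
        const double *c0 = lhs + (i + 0) * lhsStride, *c1 = lhs + (i + 1) * lhsStride,
                     *c2 = lhs + (i + 2) * lhsStride, *c3 = lhs + (i + 3) * lhsStride;
        for (int j = 0; j < size; ++j)        // one load/store of res[j] per 4 madds
            res[j] += rhs[i] * c0[j] + rhs[i + 1] * c1[j]
                    + rhs[i + 2] * c2[j] + rhs[i + 3] * c3[j];
    }
    for (; i < cols; ++i)                     // leftover columns, one at a time
        for (int j = 0; j < size; ++j)
            res[j] += rhs[i] * lhs[i * lhsStride + j];
}

int main() {
    const double lhs[6] = {1, 2, 3, 4, 5, 6}; // 3x2, column-major, stride 3
    const double rhs[2] = {10, 20};
    double res[3] = {};
    colmajor_times_vector_ref(3, 2, lhs, 3, rhs, res);
    assert(res[0] == 90 && res[1] == 120 && res[2] == 150);
    return 0;
}
```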
+
+// TODO add peeling to mask unaligned load/stores
+template<typename Scalar, typename ResType>
+static EIGEN_DONT_INLINE void ei_cache_friendly_product_rowmajor_times_vector(
+  const Scalar* lhs, int lhsStride,
+  const Scalar* rhs, int rhsSize,
+  ResType& res)
+{
+  #ifdef _EIGEN_ACCUMULATE_PACKETS
+  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
+  #endif
+
+  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\
+    Packet b = ei_pload(&rhs[j]); \
+    ptmp0 = ei_pmadd(b, EIGEN_CAT(ei_ploa,A0) (&lhs0[j]), ptmp0); \
+    ptmp1 = ei_pmadd(b, EIGEN_CAT(ei_ploa,A13)(&lhs1[j]), ptmp1); \
+    ptmp2 = ei_pmadd(b, EIGEN_CAT(ei_ploa,A2) (&lhs2[j]), ptmp2); \
+    ptmp3 = ei_pmadd(b, EIGEN_CAT(ei_ploa,A13)(&lhs3[j]), ptmp3); }
+
+  typedef typename ei_packet_traits<Scalar>::type Packet;
+  const int PacketSize = sizeof(Packet)/sizeof(Scalar);
+
+  enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
+  const int rowsAtOnce = 4;
+  const int peels = 2;
+  const int PacketAlignedMask = PacketSize-1;
+  const int PeelAlignedMask = PacketSize*peels-1;
+  const int size = rhsSize;
+
+  // How many coeffs of the result do we have to skip to be aligned?
+  // Here we assume the data is at least aligned on the base scalar type, which is mandatory anyway.
+  const int alignedStart = ei_alignmentOffset(rhs, size);
+  const int alignedSize = PacketSize>1 ? alignedStart + ((size-alignedStart) & ~PacketAlignedMask) : 0;
+  const int peeledSize  = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart;
+
+  const int alignmentStep = PacketSize>1 ? (PacketSize - lhsStride % PacketSize) & PacketAlignedMask : 0;
+  int alignmentPattern = alignmentStep==0 ? AllAligned
+                       : alignmentStep==(PacketSize/2) ? EvenAligned
+                       : FirstAligned;
+
+  // we cannot assume the first element is aligned because of sub-matrices
+  const int lhsAlignmentOffset = ei_alignmentOffset(lhs,size);
+
+  // find how many rows we have to skip to be aligned with rhs (if possible)
+  int skipRows = 0;
+  if (PacketSize>1)
+  {
+    ei_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(Packet)==0 || size<PacketSize);
+
+    while (skipRows<PacketSize &&
+           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%PacketSize))
+      ++skipRows;
+    if (skipRows==PacketSize)
+    {
+      // nothing can be aligned, no need to skip any row
+      alignmentPattern = NoneAligned;
+      skipRows = 0;
+    }
+    else
+    {
+      skipRows = std::min(skipRows,res.size());
+      // note that the skipped rows are processed later.
+    }
+    ei_internal_assert((alignmentPattern==NoneAligned) || PacketSize==1
+                       || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(Packet))==0);
+  }
+
+  int offset1 = (FirstAligned && alignmentStep==1?3:1);
+  int offset3 = (FirstAligned && alignmentStep==1?1:3);
+
+  int rowBound = ((res.size()-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
+  for (int i=skipRows; i<rowBound; i+=rowsAtOnce)
+  {
+    Scalar tmp0 = Scalar(0), tmp1 = Scalar(0), tmp2 = Scalar(0), tmp3 = Scalar(0);
+
+    // this helps the compiler generate good binary code
+    const Scalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
+                 *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
+
+    if (PacketSize>1)
+    {
+      /* explicit vectorization */
+      Packet ptmp0 = ei_pset1(Scalar(0)), ptmp1 = ei_pset1(Scalar(0)), ptmp2 = ei_pset1(Scalar(0)), ptmp3 = ei_pset1(Scalar(0));
+
+      // process initial unaligned coeffs
+      // FIXME this loop gets vectorized by the compiler!
+      for (int j=0; j<alignedStart; ++j)
+      {
+        Scalar b = rhs[j];
+        tmp0 += b*lhs0[j]; tmp1 += b*lhs1[j]; tmp2 += b*lhs2[j]; tmp3 += b*lhs3[j];
+      }
+
+      if (alignedSize>alignedStart)
+      {
+        switch(alignmentPattern)
+        {
+          case AllAligned:
+            for (int j = alignedStart; j<alignedSize; j+=PacketSize)
+              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
+            break;
+          case EvenAligned:
+            for (int j = alignedStart; j<alignedSize; j+=PacketSize)
+              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
+            break;
+          case FirstAligned:
+            if (peels>1)
+            {
+              /* Here we process 4 rows with two peeled iterations to hide
+               * the overhead of unaligned loads. Moreover, unaligned loads are
+               * handled using special shift/move operations between the two
+               * aligned packets overlapping the desired unaligned packet.
+               * This is *much* more efficient than basic unaligned loads.
+               */
+              Packet A01, A02, A03, b, A11, A12, A13;
+              A01 = ei_pload(&lhs1[alignedStart-1]);
+              A02 = ei_pload(&lhs2[alignedStart-2]);
+              A03 = ei_pload(&lhs3[alignedStart-3]);
+
+              for (int j = alignedStart; j<peeledSize; j+=peels*PacketSize)
+              {
+                b = ei_pload(&rhs[j]);
+                A11 = ei_pload(&lhs1[j-1+PacketSize]);  ei_palign<1>(A01,A11);
+                A12 = ei_pload(&lhs2[j-2+PacketSize]);  ei_palign<2>(A02,A12);
+                A13 = ei_pload(&lhs3[j-3+PacketSize]);  ei_palign<3>(A03,A13);
+
+                ptmp0 = ei_pmadd(b, ei_pload (&lhs0[j]), ptmp0);
+                ptmp1 = ei_pmadd(b, A01, ptmp1);
+                A01 = ei_pload(&lhs1[j-1+2*PacketSize]);  ei_palign<1>(A11,A01);
+                ptmp2 = ei_pmadd(b, A02, ptmp2);
+                A02 = ei_pload(&lhs2[j-2+2*PacketSize]);  ei_palign<2>(A12,A02);
+                ptmp3 = ei_pmadd(b, A03, ptmp3);
+                A03 = ei_pload(&lhs3[j-3+2*PacketSize]);  ei_palign<3>(A13,A03);
+
+                b = ei_pload(&rhs[j+PacketSize]);
+                ptmp0 = ei_pmadd(b, ei_pload (&lhs0[j+PacketSize]), ptmp0);
+                ptmp1 = ei_pmadd(b, A11, ptmp1);
+                ptmp2 = ei_pmadd(b, A12, ptmp2);
+                ptmp3 = ei_pmadd(b, A13, ptmp3);
+              }
+            }
+            for (int j = peeledSize; j<alignedSize; j+=PacketSize)
+              _EIGEN_ACCUMULATE_PACKETS(d,du,du);
+            break;
+          default:
+            for (int j = alignedStart; j<alignedSize; j+=PacketSize)
+              _EIGEN_ACCUMULATE_PACKETS(du,du,du);
+            break;
+        }
+        tmp0 += ei_predux(ptmp0);
+        tmp1 += ei_predux(ptmp1);
+        tmp2 += ei_predux(ptmp2);
+        tmp3 += ei_predux(ptmp3);
+      }
+    } // end explicit vectorization
+
+    // process remaining coeffs (or all if no explicit vectorization)
+    // FIXME this loop gets vectorized by the compiler!
+    for (int j=alignedSize; j<size; ++j)
+    {
+      Scalar b = rhs[j];
+      tmp0 += b*lhs0[j]; tmp1 += b*lhs1[j]; tmp2 += b*lhs2[j]; tmp3 += b*lhs3[j];
+    }
+    res[i] += tmp0; res[i+offset1] += tmp1; res[i+2] += tmp2; res[i+offset3] += tmp3;
+  }
+
+  // process remaining first and last rows (at most rowsAtOnce-1)
+  int end = res.size();
+  int start = rowBound;
+  do
+  {
+    for (int i=start; i<end; ++i)
+    {
+      Scalar tmp0 = Scalar(0);
+      Packet ptmp0 = ei_pset1(tmp0);
+      const Scalar* lhs0 = lhs + i*lhsStride;
+      // process first unaligned result's coeffs
+      // FIXME this loop gets vectorized by the compiler!
+      for (int j=0; j<alignedStart; ++j)
+        tmp0 += rhs[j] * lhs0[j];
+
+      if (alignedSize>alignedStart)
+      {
+        // process aligned rhs coeffs
+        if ((size_t(lhs0+alignedStart)%sizeof(Packet))==0)
+          for (int j = alignedStart;j<alignedSize;j+=PacketSize)
+            ptmp0 = ei_pmadd(ei_pload(&rhs[j]), ei_pload(&lhs0[j]), ptmp0);
+        else
+          for (int j = alignedStart;j<alignedSize;j+=PacketSize)
+            ptmp0 = ei_pmadd(ei_pload(&rhs[j]), ei_ploadu(&lhs0[j]), ptmp0);
+        tmp0 += ei_predux(ptmp0);
+      }
+
+      // process remaining scalars
+      // FIXME this loop gets vectorized by the compiler!
+      for (int j=alignedSize; j<size; ++j)
+        tmp0 += rhs[j] * lhs0[j];
+      res[i] += tmp0;
+    }
+    if (skipRows)
+    {
+      start = 0;
+      end = skipRows;
+      skipRows = 0;
+    }
+    else
+      break;
+  } while(PacketSize>1);
+
+  #undef _EIGEN_ACCUMULATE_PACKETS
+}
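The `ei_palign<Offset>` calls in both peeled loops synthesize an unaligned packet from the two aligned packets that overlap it, as the comment in the row-major kernel describes. A lane-by-lane emulation of that trick; note that Eigen's real `ei_palign` operates on SIMD registers and updates its first argument in place, whereas this standalone sketch writes to a separate buffer for clarity:

```cpp
#include <cassert>

const int PacketSize = 4;  // example: 4 lanes

// first  = lanes [base .. base+3] (aligned load)
// second = lanes [base+4 .. base+7] (next aligned load)
// out receives lanes [base+Shift .. base+Shift+3], i.e. an "unaligned load"
template <int Shift>
void palign(const double* first, const double* second, double* out) {
    for (int s = 0; s < PacketSize; ++s)
        out[s] = (s + Shift < PacketSize) ? first[s + Shift]
                                          : second[s + Shift - PacketSize];
}

int main() {
    double data[8] = {0, 1, 2, 3, 4, 5, 6, 7};  // two aligned packets
    double unaligned[4];
    palign<3>(data, data + 4, unaligned);       // packet starting at index 3
    for (int s = 0; s < 4; ++s) assert(unaligned[s] == 3 + s);
    return 0;
}
```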
+
+#endif // EIGEN_CACHE_FRIENDLY_PRODUCT_H
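Both vector kernels are driven by `ei_alignmentOffset(ptr, size)`. A sketch of the quantity it appears to compute, judging from its use above: the number of leading scalars to handle before `ptr` reaches a 16-byte packet boundary, capped by the size. This is an assumption about the helper's behavior, not Eigen's actual implementation:

```cpp
#include <cassert>
#include <cstdint>

template <typename Scalar>
int alignment_offset(const Scalar* ptr, int maxOffset) {
    const int PacketSize = 16 / int(sizeof(Scalar));  // 16-byte SSE packets
    // assumes ptr is at least Scalar-aligned, as the kernel comments require
    const int misaligned = int((std::uintptr_t(ptr) / sizeof(Scalar)) % PacketSize);
    const int off = (PacketSize - misaligned) % PacketSize;
    return off < maxOffset ? off : maxOffset;
}

int main() {
    alignas(16) float buf[8] = {};
    assert(alignment_offset(buf, 8) == 0);      // already 16-byte aligned
    assert(alignment_offset(buf + 1, 8) == 3);  // 3 floats to the next boundary
    assert(alignment_offset(buf + 1, 2) == 2);  // capped by the array size
    return 0;
}
```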