/*************************************************************************
 * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ************************************************************************/

#ifndef COMMON_KERNEL_H_
#define COMMON_KERNEL_H_

#include <cstdio>
#include <cstdint>

#include <cuda_runtime.h>

// BAR macro and helpers
#define WARP_SIZE 32
#define ROUNDUP(x, y) \
    (((((x) + (y) - 1) / (y))) * (y))
#define BAR_EXEC(type, barid, nthreads) \
    asm("bar." #type " " #barid ", " #nthreads ";\n\t")
#define BAR_EXPAND(type, barid, nthreads) \
    BAR_EXEC(type, barid, (nthreads))

// Named barrier macro.
// Expands to asm("bar.type barid, nthreads") where
// nthreads has been rounded up to WARP_SIZE.
#define BAR(type, barid, nthreads) \
    BAR_EXPAND(type, barid, ROUNDUP(nthreads, WARP_SIZE))
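// Illustrative note (not from the original source): PTX requires the thread
// count of a named barrier to be a multiple of the warp size, which is why
// BAR rounds nthreads up. For example, BAR(sync, 1, 150) synchronizes on
// barrier 1 with a thread count of ROUNDUP(150, WARP_SIZE) = 160.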
__device__ unsigned int spinct;

// Spin wait until func evaluates to true
template<typename FUNC>
__device__ inline void Wait(const FUNC& func) {
  while (!func()) {
    // waste time
    atomicInc(&spinct, 10);
  }
}

typedef uint64_t PackType;

// unpack x and y to elements of type T and apply FUNC to each element
template<class FUNC, typename T>
struct MULTI {
  __device__ PackType operator()(const PackType x, const PackType y) const;
};

template<class FUNC>
struct MULTI<FUNC, char> {
  static_assert(sizeof(PackType) == 2 * sizeof(uint32_t),
      "PackType must be twice the size of uint32_t.");
  union converter {
    PackType storage;
    struct {
      uint32_t a, b;
    };
  };

  __device__ PackType operator()(const PackType x, const PackType y) const {
    converter cx, cy, cr;
    cx.storage = x;
    cy.storage = y;

    // for char, we do these as vector ops
    cr.a = FUNC()(cx.a, cy.a);
    cr.b = FUNC()(cx.b, cy.b);

    return cr.storage;
  }
};

template<class FUNC>
struct MULTI<FUNC, int> {
  static_assert(sizeof(PackType) == 2 * sizeof(int),
      "PackType must be twice the size of int.");
  union converter {
    PackType storage;
    struct {
      int a, b;
    };
  };

  __device__ PackType operator()(const PackType x, const PackType y) const {
    converter cx, cy, cr;
    cx.storage = x;
    cy.storage = y;

    cr.a = FUNC()(cx.a, cy.a);
    cr.b = FUNC()(cx.b, cy.b);

    return cr.storage;
  }
};

#ifdef CUDA_HAS_HALF
template<class FUNC>
struct MULTI<FUNC, half> {
  static_assert(sizeof(PackType) == 2 * sizeof(float),
      "PackType must be twice the size of float.");
  union converter {
    PackType storage;
    struct {
      half2 a, b;
    };
  };

  __device__ PackType operator()(const PackType x, const PackType y) const {
    converter cx, cy, cr;
    cx.storage = x;
    cy.storage = y;

    cr.a = FUNC()(cx.a, cy.a);
    cr.b = FUNC()(cx.b, cy.b);

    return cr.storage;
  }
};
#endif

template<class FUNC>
struct MULTI<FUNC, float> {
  static_assert(sizeof(PackType) == 2 * sizeof(float),
      "PackType must be twice the size of float.");
  union converter {
    PackType storage;
    struct {
      float a, b;
    };
  };

  __device__ PackType operator()(const PackType x, const PackType y) const {
    converter cx, cy, cr;
    cx.storage = x;
    cy.storage = y;

    cr.a = FUNC()(cx.a, cy.a);
    cr.b = FUNC()(cx.b, cy.b);

    return cr.storage;
  }
};

template<class FUNC>
struct MULTI<FUNC, double> {
  static_assert(sizeof(PackType) == sizeof(double),
      "PackType must be the same size as double.");
  __device__ PackType operator()(const PackType x, const PackType y) const {
    double rv = FUNC()(__longlong_as_double(x), __longlong_as_double(y));
    return __double_as_longlong(rv);
  }
};

template<class FUNC>
struct MULTI<FUNC, unsigned long long> {
  static_assert(sizeof(PackType) == sizeof(unsigned long long),
      "PackType must be the same size as unsigned long long.");
  __device__ PackType operator()(const PackType x, const PackType y) const {
    unsigned long long rv = FUNC()(x, y);
    return rv;
  }
};

template<class FUNC>
struct MULTI<FUNC, long long> {
  static_assert(sizeof(PackType) == sizeof(long long),
      "PackType must be the same size as long long.");
  __device__ PackType operator()(const PackType x, const PackType y) const {
    long long rv = FUNC()((long long)x, (long long)y);
    return rv;
  }
};
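// Illustrative only (not part of the original header): ExampleSum is the
// kind of element-wise functor FUNC is expected to be, and ExamplePackedSum
// shows MULTI applying it to both elements packed into one 64-bit PackType.
// Both names are hypothetical.
template<typename T>
struct ExampleSum {
  __device__ T operator()(const T x, const T y) const {
    return x + y;
  }
};

// Adds the two floats packed in each of x and y in a single call.
__device__ inline PackType ExamplePackedSum(const PackType x,
    const PackType y) {
  return MULTI<ExampleSum<float>, float>()(x, y);
}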
template<typename T, bool FETCHTWO>
__device__ inline void FetchOneOrTwo64b(PackType& s0,
    const volatile T * __restrict__ const src0, PackType& s1,
    const volatile T * __restrict__ const src1, const int idx) {
  s0 = (reinterpret_cast<const volatile PackType *>(src0))[idx];
  if (FETCHTWO) {
    s1 = (reinterpret_cast<const volatile PackType *>(src1))[idx];
  }
}

template<typename T, bool STORETWO>
__device__ inline void StoreOneOrTwo64b(volatile T * __restrict__ const dest0,
    volatile T * __restrict__ const dest1, PackType val, const int idx) {
  (reinterpret_cast<volatile PackType *>(dest0))[idx] = val;
  if (STORETWO) {
    (reinterpret_cast<volatile PackType *>(dest1))[idx] = val;
  }
}

template<class FUNC, typename T, bool ISREDUCE>
__device__ inline PackType ReduceOrCopy64b(const PackType s0,
    const PackType s1) {
  if (ISREDUCE) {
    return MULTI<FUNC, T>()(s0, s1);
  } else {
    return s0;
  }
}

#define ALIGNUP(x, a)   ((((x)-1) & ~((a)-1)) + (a))

template<typename T>
__device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) {
  size_t ptrval = reinterpret_cast<size_t>(ptr);
  return reinterpret_cast<volatile T*>(ALIGNUP(ptrval, align));
}

template<typename T> inline __device__
T vFetch(const volatile T* ptr) {
  return *ptr;
}

#ifdef CUDA_HAS_HALF
template<> inline __device__
half vFetch<half>(const volatile half* ptr) {
  half r;
  r.x = ptr->x;
  return r;
}
#endif

template<typename T> inline __device__
void vStore(volatile T* ptr, const T val) {
  *ptr = val;
}

#ifdef CUDA_HAS_HALF
template<> inline __device__
void vStore<half>(volatile half* ptr, const half val) {
  ptr->x = val.x;
}
#endif
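// Illustrative note (not from the original source): AlignUp is used below to
// count how many leading elements must be handled one at a time before
// 64-bit accesses become legal. For example, for a float buffer that starts
// 4 bytes past an 8-byte boundary, AlignUp(ptr, alignof(PackType)) - ptr
// evaluates to 1, so exactly one scalar element lands in the preamble.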
// Assumptions:
// - there is exactly 1 block
// - THREADS is the number of producer threads
// - this function is called by all producer threads
template<int UNROLL, int THREADS, class FUNC, typename T, bool HAS_DEST1,
    bool HAS_SRC1>
__device__ inline void ReduceOrCopy(const int tid,
    volatile T * __restrict__ dest0, volatile T * __restrict__ dest1,
    const volatile T * __restrict__ src0, const volatile T * __restrict__ src1,
    int N) {
  if (N==0) {
    return;
  }

  const int UNROLL2 = (UNROLL >= 2) ? (UNROLL / 2) : 1;
  const bool NOUNROLL2 = ((UNROLL / 2) == 0);

  int Npreamble = AlignUp(dest0, alignof(PackType)) - dest0;

  // stage 0: check if we'll be able to use the fast, 64-bit aligned path.
  // If not, we'll just use the slow preamble path for the whole operation
  bool alignable = (((AlignUp(src0,  alignof(PackType)) == src0  + Npreamble)) &&
      (!HAS_DEST1 || (AlignUp(dest1, alignof(PackType)) == dest1 + Npreamble)) &&
      (!HAS_SRC1  || (AlignUp(src1,  alignof(PackType)) == src1  + Npreamble)));

  if (!alignable) {
    Npreamble = N;
  }

/*
  if (threadIdx.x == 0) {
    printf("** alignable: %s", (alignable ? "YES" : " NO"));
    printf(", dest0 = 0x%08X", dest0);
    printf(", src0 = 0x%08X", src0);
    if (HAS_DEST1) printf(", dest1 = 0x%08X", dest1);
    if (HAS_SRC1) printf(", src1 = 0x%08X", src1);
    printf("\n");
  }
*/

  // stage 1: preamble: handle any elements up to the point of everything
  // coming into alignment
  for (int idx = tid; idx < Npreamble; idx += THREADS) {
    // ought to be no way this is ever more than one iteration, except when
    // alignable is false
    T val = vFetch(src0+idx);
    if (HAS_SRC1) {
      val = FUNC()(val, vFetch(src1+idx));
    }

    vStore(dest0+idx, val);
    if (HAS_DEST1) {
      vStore(dest1+idx, val);
    }
  }

  // reduce N by however many elements we've handled already
  int Ndone = Npreamble;
  int Nrem = N - Ndone;

  // stage 2: fast path: use 64b loads/stores to do the bulk of the work,
  // assuming the pointers we have are all 64-bit alignable.
  if (alignable) {
    if (Ndone > 0) {
      // align up pointers
      dest0 += Ndone; if (HAS_DEST1) { dest1 += Ndone; }
      src0  += Ndone; if (HAS_SRC1)  { src1  += Ndone; }
    }

    // stage 2a: main loop
    int Nalign = (Nrem / (sizeof(PackType) / sizeof(T)) / (UNROLL * THREADS))
        * (UNROLL * THREADS); // round down

    #pragma unroll 1 // don't unroll this loop
    for (int idx = tid; idx < Nalign; idx += UNROLL * THREADS) {
      PackType t0[UNROLL2];
      PackType t1[UNROLL2];
      PackType t2[UNROLL2];

      #pragma unroll
      for (int j = 0; j < UNROLL2; ++j)
        FetchOneOrTwo64b<T, HAS_SRC1>(t0[j], src0, t1[j], src1,
            idx + j * THREADS);

      #pragma unroll
      for (int j = 0; j < UNROLL2; ++j)
        t2[j] = ReduceOrCopy64b<FUNC, T, HAS_SRC1>(t0[j], t1[j]);

      if (!NOUNROLL2) {
        #pragma unroll
        for (int j = 0; j < UNROLL2; ++j)
          FetchOneOrTwo64b<T, HAS_SRC1>(t0[j], src0, t1[j], src1,
              idx + (UNROLL2 + j) * THREADS);
      }

      #pragma unroll
      for (int j = 0; j < UNROLL2; ++j)
        StoreOneOrTwo64b<T, HAS_DEST1>(dest0, dest1, t2[j],
            idx + j * THREADS);

      if (!NOUNROLL2) {
        #pragma unroll
        for (int j = 0; j < UNROLL2; ++j)
          t2[j] = ReduceOrCopy64b<FUNC, T, HAS_SRC1>(t0[j], t1[j]);

        #pragma unroll
        for (int j = 0; j < UNROLL2; ++j)
          StoreOneOrTwo64b<T, HAS_DEST1>(dest0, dest1, t2[j],
              idx + (UNROLL2 + j) * THREADS);
      }
    }

    // stage 2b: slightly less optimized section for when we don't have full
    // UNROLLs
    int Ndone2a = Nalign * (sizeof(PackType)/sizeof(T));
    Ndone += Ndone2a;
    Nrem = N - Ndone;

    // TODO: This kind of pointer update arithmetic is expensive. Should
    // probably find a better way.
    if (Nrem > 0) {
      dest0 += Ndone2a; if (HAS_DEST1) { dest1 += Ndone2a; }
      src0  += Ndone2a; if (HAS_SRC1)  { src1  += Ndone2a; }
    }

    Nalign = Nrem / (sizeof(PackType)/sizeof(T));

    #pragma unroll 4
    for (int idx = tid; idx < Nalign; idx += THREADS) {
      PackType t0, t1, t2;

      FetchOneOrTwo64b<T, HAS_SRC1>(t0, src0, t1, src1, idx);
      t2 = ReduceOrCopy64b<FUNC, T, HAS_SRC1>(t0, t1);
      StoreOneOrTwo64b<T, HAS_DEST1>(dest0, dest1, t2, idx);
    }

    // stage 2c: tail
    int Ndone2b = Nalign * (sizeof(PackType)/sizeof(T));
    Ndone += Nalign * (sizeof(PackType)/sizeof(T));
    Nrem = N - Ndone;

    if (Nrem > 0) {
      dest0 += Ndone2b; if (HAS_DEST1) { dest1 += Ndone2b; }
      src0  += Ndone2b; if (HAS_SRC1)  { src1  += Ndone2b; }
    }

    for (int idx = tid; idx < Nrem; idx += THREADS) {
      // never ought to make it more than one time through this loop. only a
      // few threads should even participate
      T val = vFetch(src0+idx);
      if (HAS_SRC1) {
        val = FUNC()(val, vFetch(src1+idx));
      }

      vStore(dest0+idx, val);
      if (HAS_DEST1) {
        vStore(dest1+idx, val);
      }
    }
  } // done fast path
}
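// Illustrative only: a minimal sketch (not part of the original header) of
// how ReduceOrCopy is meant to be driven, reducing two source buffers into a
// single destination with the hypothetical ExampleSum functor defined above.
// A plain copy would pass HAS_SRC1=false instead, in which case FUNC is
// never invoked. The kernel assumes a single block of THREADS threads.
template<int THREADS, int UNROLL, typename T>
__global__ void ExampleReduceKernel(volatile T* dst, const volatile T* srcA,
    const volatile T* srcB, int N) {
  ReduceOrCopy<UNROLL, THREADS, ExampleSum<T>, T,
      /*HAS_DEST1=*/false, /*HAS_SRC1=*/true>(
      threadIdx.x, dst, nullptr, srcA, srcB, N);
}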
template<int THREADS, int UNROLL, typename T>
__device__ inline void CalcLastChunk(int * const bigSliceN,
    int * const smallSliceN, int * const lastSliceN, int * const numSlices,
    int * const numBigSlices, int * const numSmallSlices, const int N,
    const int numChunks, const int chunkSize) {
  int Nleft = N - ((numChunks - 1) * chunkSize);
  // semi-equally split up the remaining work into numSlices slices.
  // it's "semi"-equal because we want the divisions to land as neatly as we
  // can on alignable boundaries
  int NperTile = UNROLL * THREADS * (sizeof(PackType)/sizeof(T));
  int numTiles = (Nleft + NperTile - 1) / NperTile;
  int numTilesPerBigSlice = (numTiles + *numSlices - 1) / *numSlices;
  int numTilesPerSmallSlice = numTiles / *numSlices;

  *bigSliceN   = NperTile * numTilesPerBigSlice;
  *smallSliceN = NperTile * numTilesPerSmallSlice;
  *numBigSlices   = numTiles % *numSlices;
  *numSmallSlices = (*smallSliceN > 0) ? *numSlices - *numBigSlices : 0;

  // the lastSlice will take the place of one of the small slices unless
  // there are no small slices (because this is a very small reduction), in
  // which case we replace one of the big slices and leave the small slices
  // as 0.
  if (*numSmallSlices > 0) {
    --*numSmallSlices;
    if (*numSmallSlices == 0)
      *smallSliceN = 0;
  } else {
    --*numBigSlices;
    if (*numBigSlices == 0)
      *bigSliceN = 0;
  }

  *lastSliceN = Nleft -
      (*numBigSlices * *bigSliceN + *numSmallSlices * *smallSliceN);

  // in cases where args.N % numSlices is pretty small, we'd rather have one
  // slightly big last slice than one big slice, a bunch of small slices,
  // and one smaller last slice
  if ((*numBigSlices == 1) &&
      (*numSmallSlices == *numSlices - 2) &&
      (*lastSliceN < *smallSliceN)) {
    *numBigSlices += *numSmallSlices;
    *numSmallSlices = 0;
    *bigSliceN = *smallSliceN;
    *smallSliceN = 0;
    *lastSliceN = Nleft - *numBigSlices * *bigSliceN;
  }

  // done recalculating
  *numSlices = *numBigSlices + *numSmallSlices + 1;
}

#endif // COMMON_KERNEL_H_
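// Illustrative worked example for CalcLastChunk (not from the original
// source), assuming THREADS=256, UNROLL=8 and T=float, so
// NperTile = 8*256*2 = 4096 elements: with Nleft = 10000 and *numSlices = 4
// on entry, numTiles = 3, numTilesPerBigSlice = 1, numTilesPerSmallSlice = 0,
// giving bigSliceN = 4096, smallSliceN = 0, numBigSlices = 3 and
// numSmallSlices = 0. Since there are no small slices, the last slice
// replaces a big one: numBigSlices becomes 2,
// lastSliceN = 10000 - 2*4096 = 1808, and *numSlices is recomputed as
// 2 + 0 + 1 = 3.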