diff options
author | Soumith Chintala <soumith@gmail.com> | 2017-03-03 23:37:26 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-03-03 23:37:26 +0300 |
commit | 226484acc0e27fcd132ab28a7c3c794ce8e1299b (patch) | |
tree | 7e735a103a68d53ff77339766bb4fa4c0dabe50d | |
parent | 5e235ad7978389fa11865f8fa02f53d74b3bc361 (diff) | |
parent | 948e753486468adcd4985a5301ce32668d5acaca (diff) |
Merge pull request #961 from torch/avxfix
TH CMake cleanup + AVX/AVX2 only on appropriate files
-rw-r--r-- | lib/TH/CMakeLists.txt | 207 | ||||
-rw-r--r-- | lib/TH/THGeneral.h.in | 7 | ||||
-rw-r--r-- | lib/TH/THVector.c | 8 | ||||
-rw-r--r-- | lib/TH/generic/THVectorDispatch.c | 18 | ||||
-rw-r--r-- | lib/TH/generic/simd/convolve.c | 6 | ||||
-rw-r--r-- | lib/TH/vector/AVX.c | 37 | ||||
-rw-r--r-- | lib/TH/vector/AVX.h | 23 | ||||
-rw-r--r-- | lib/TH/vector/AVX2.c | 7 | ||||
-rw-r--r-- | lib/TH/vector/AVX2.h | 9 |
9 files changed, 172 insertions, 150 deletions
diff --git a/lib/TH/CMakeLists.txt b/lib/TH/CMakeLists.txt index 20f6bd6..351594d 100644 --- a/lib/TH/CMakeLists.txt +++ b/lib/TH/CMakeLists.txt @@ -20,11 +20,21 @@ IF(NOT TH_INSTALL_BIN_SUBDIR SET(TH_INSTALL_CMAKE_SUBDIR "share/cmake/TH" CACHE PATH "TH install cmake subdirectory") ENDIF() -# flags +####################################################################### +##### flags section +###################################################################### IF(MSVC) - # respect the standard - ADD_DEFINITIONS(-D_CRT_SECURE_NO_DEPRECATE=1) + # MSVC now supports C99 since VS2013/VS2015 + SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /std:c99") +ELSE(MSVC) + # enable gnu99 and not c99 because we use + # gnu extensions like posix_memalign + SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=gnu99") +ENDIF(MSVC) + +IF(MSVC) + ADD_DEFINITIONS(-D_CRT_SECURE_NO_DEPRECATE=1) # respect the standard ENDIF(MSVC) IF(UNIX) @@ -114,82 +124,25 @@ IF(NOT NO_GCC_EBX_FPIC_BUG) ENDIF(NOT NO_GCC_EBX_FPIC_BUG) -FIND_PACKAGE(SSE) +FIND_PACKAGE(SSE) # checks SSE, AVX and AVX2 IF(C_SSE2_FOUND) + MESSAGE(STATUS "SSE2 Found") SET(CMAKE_C_FLAGS "${C_SSE2_FLAGS} -DUSE_SSE2 ${CMAKE_C_FLAGS}") ENDIF(C_SSE2_FOUND) IF(C_SSE3_FOUND) + MESSAGE(STATUS "SSE3 Found") SET(CMAKE_C_FLAGS "${C_SSE3_FLAGS} -DUSE_SSE3 ${CMAKE_C_FLAGS}") ENDIF(C_SSE3_FOUND) +# we dont set AVX and AVX2 flags globally, but only for specific files IF(C_AVX_FOUND) - SET(CMAKE_C_FLAGS "${C_AVX_FLAGS} -DUSE_AVX ${CMAKE_C_FLAGS}") + MESSAGE(STATUS "AVX Found") + # SET(CMAKE_C_FLAGS "${C_AVX_FLAGS} ${CMAKE_C_FLAGS}") ENDIF(C_AVX_FOUND) IF(C_AVX2_FOUND) - SET(CMAKE_C_FLAGS "${C_AVX2_FLAGS} -DUSE_AVX2 ${CMAKE_C_FLAGS}") + MESSAGE(STATUS "AVX2 Found") + # SET(CMAKE_C_FLAGS "${C_AVX2_FLAGS} ${CMAKE_C_FLAGS}") ENDIF(C_AVX2_FOUND) -IF(C_AVX2_FOUND OR C_AVX_FOUND OR C_SSE4_2_FOUND OR C_SSE4_1_FOUND) - SET(simd generic/simd/convolve.c) - IF(MSVC) - SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve.c PROPERTIES COMPILE_FLAGS "/std:c99") - ELSE(MSVC) - SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve.c PROPERTIES COMPILE_FLAGS "-std=c99") - ENDIF(MSVC) -ENDIF(C_AVX2_FOUND OR C_AVX_FOUND OR C_SSE4_2_FOUND OR C_SSE4_1_FOUND) - -IF(C_SSE4_1_FOUND) - SET(CMAKE_C_FLAGS "${C_SSE4_1_FLAGS} -DUSE_SSE4_1 ${CMAKE_C_FLAGS}") -ENDIF(C_SSE4_1_FOUND) -IF(C_SSE4_2_FOUND) - SET(CMAKE_C_FLAGS "${C_SSE4_2_FLAGS} -DUSE_SSE4_2 ${CMAKE_C_FLAGS}") -ENDIF(C_SSE4_2_FOUND) - -IF(C_SSE4_1_FOUND OR C_SSE4_2_FOUND) - IF(MSVC) - SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve5x5_sse.c PROPERTIES COMPILE_FLAGS "/Ox /fp:fast /std:c99") - ELSE(MSVC) - SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve5x5_sse.c PROPERTIES COMPILE_FLAGS "-O3 -ffast-math -std=c99") - ENDIF(MSVC) - SET(simd ${simd} generic/simd/convolve5x5_sse.c) -ENDIF(C_SSE4_1_FOUND OR C_SSE4_2_FOUND) - -IF(MSVC) - SET_SOURCE_FILES_PROPERTIES(generic/THTensorMath.c PROPERTIES COMPILE_FLAGS "/std:c99") -ELSE(MSVC) - SET_SOURCE_FILES_PROPERTIES(generic/THTensorMath.c PROPERTIES COMPILE_FLAGS "-std=c99") -ENDIF(MSVC) - -IF(C_AVX_FOUND OR C_AVX2_FOUND) - SET(CMAKE_C_FLAGS "-DUSE_AVX ${CMAKE_C_FLAGS}") - IF(MSVC) - SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve5x5_avx.c PROPERTIES COMPILE_FLAGS "/Ox /fp:fast /arch:AVX /std:c99") - ELSE(MSVC) - SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve5x5_avx.c PROPERTIES COMPILE_FLAGS "-O3 -ffast-math -mavx -std=c99") - ENDIF(MSVC) - SET(simd ${simd} generic/simd/convolve5x5_avx.c) -ENDIF(C_AVX_FOUND OR C_AVX2_FOUND) - -SET(hdr - THGeneral.h THHalf.h THAllocator.h THStorage.h THTensor.h THTensorApply.h THBlas.h THMath.h - THLapack.h THLogAdd.h THRandom.h THVector.h THAtomic.h ) - -SET(src - THGeneral.c THHalf.c THAllocator.c THStorage.c THTensor.c THBlas.c THLapack.c - THLogAdd.c THRandom.c THFile.c THDiskFile.c THMemoryFile.c THAtomic.c THVector.c) - -SET(src ${src} ${hdr} ${simd}) -ADD_LIBRARY(TH SHARED ${src}) -if(BUILD_STATIC) - ADD_LIBRARY(TH_static STATIC ${src}) -endif() - -IF(NOT TH_SO_VERSION) - SET(TH_SO_VERSION 0) -ENDIF(NOT TH_SO_VERSION) -MESSAGE(STATUS "TH_SO_VERSION: ${TH_SO_VERSION}") -SET_TARGET_PROPERTIES(TH PROPERTIES - VERSION ${TH_SO_VERSION} - SOVERSION ${TH_SO_VERSION}) CHECK_C_SOURCE_RUNS(" #include <stdatomic.h> @@ -233,6 +186,72 @@ int main() " HAS_GCC_ATOMICS) ENDIF() +####################################################################### +##### sources section +###################################################################### + +# IF ANY SIMD FOUND +IF(C_AVX2_FOUND OR C_AVX_FOUND OR C_SSE4_2_FOUND OR C_SSE4_1_FOUND) + SET(simd generic/simd/convolve.c) +ENDIF(C_AVX2_FOUND OR C_AVX_FOUND OR C_SSE4_2_FOUND OR C_SSE4_1_FOUND) + +# IF SSE4 FOUND +IF(C_SSE4_1_FOUND AND C_SSE4_2_FOUND) + SET(CMAKE_C_FLAGS "${C_SSE4_1_FLAGS} -DUSE_SSE4_1 ${C_SSE4_2_FLAGS} -DUSE_SSE4_2 ${CMAKE_C_FLAGS}") + IF(MSVC) + SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve5x5_sse.c PROPERTIES COMPILE_FLAGS "/Ox /fp:fast") + ELSE(MSVC) + SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve5x5_sse.c PROPERTIES COMPILE_FLAGS "-O3 -ffast-math") + ENDIF(MSVC) + SET(simd ${simd} generic/simd/convolve5x5_sse.c) +ENDIF(C_SSE4_1_FOUND AND C_SSE4_2_FOUND) + +# IF AVX FOUND +IF(C_AVX_FOUND) + IF(MSVC) + SET_SOURCE_FILES_PROPERTIES(vector/AVX.c generic/simd/convolve5x5_avx.c PROPERTIES COMPILE_FLAGS "/Ox /fp:fast ${C_AVX_FLAGS}") + ELSE(MSVC) + SET_SOURCE_FILES_PROPERTIES(vector/AVX.c generic/simd/convolve5x5_avx.c PROPERTIES COMPILE_FLAGS "-O3 -ffast-math ${C_AVX_FLAGS}") + ENDIF(MSVC) + SET(simd ${simd} vector/AVX.c generic/simd/convolve5x5_avx.c) +ENDIF(C_AVX_FOUND) + +IF(C_AVX2_FOUND) + IF(MSVC) + SET_SOURCE_FILES_PROPERTIES(vector/AVX2.c PROPERTIES COMPILE_FLAGS "/Ox /fp:fast ${C_AVX2_FLAGS}") + ELSE(MSVC) + SET_SOURCE_FILES_PROPERTIES(vector/AVX2.c PROPERTIES COMPILE_FLAGS "-O3 -ffast-math ${C_AVX2_FLAGS}") + ENDIF(MSVC) + SET(simd ${simd} vector/AVX2.c) +ENDIF(C_AVX2_FOUND) + +SET(hdr + THGeneral.h THHalf.h THAllocator.h THStorage.h THTensor.h THTensorApply.h THBlas.h THMath.h + THLapack.h THLogAdd.h THRandom.h THVector.h THAtomic.h ) + +SET(src + THGeneral.c THHalf.c THAllocator.c THStorage.c THTensor.c THBlas.c THLapack.c + THLogAdd.c THRandom.c THFile.c THDiskFile.c THMemoryFile.c THAtomic.c THVector.c) + +SET(src ${src} ${hdr} ${simd}) + +####################################################################### +##### build section +###################################################################### + +ADD_LIBRARY(TH SHARED ${src}) +if(BUILD_STATIC) + ADD_LIBRARY(TH_static STATIC ${src}) +endif() + +IF(NOT TH_SO_VERSION) + SET(TH_SO_VERSION 0) +ENDIF(NOT TH_SO_VERSION) +MESSAGE(STATUS "TH_SO_VERSION: ${TH_SO_VERSION}") +SET_TARGET_PROPERTIES(TH PROPERTIES + VERSION ${TH_SO_VERSION} + SOVERSION ${TH_SO_VERSION}) + IF(HAS_C11_ATOMICS) ADD_DEFINITIONS(-DUSE_C11_ATOMICS=1) MESSAGE(STATUS "Atomics: using C11 intrinsics") @@ -264,10 +283,6 @@ IF(LAPACK_FOUND) TARGET_LINK_LIBRARIES(TH ${LAPACK_LIBRARIES}) ENDIF(LAPACK_FOUND) -IF(BLAS_IS_ACCELERATE) - MESSAGE(STATUS "BLAS FOUND IS ACCELERATE: Fix for sdot") -ENDIF() - IF (UNIX AND NOT APPLE) INCLUDE(CheckLibraryExists) # https://github.com/libgit2/libgit2/issues/2128#issuecomment-35649830 @@ -284,6 +299,7 @@ IF(UNIX) IF(HAVE_MMAP) ADD_DEFINITIONS(-DHAVE_MMAP=1) ENDIF(HAVE_MMAP) + # done for lseek: https://www.gnu.org/software/libc/manual/html_node/File-Position-Primitive.html ADD_DEFINITIONS(-D_FILE_OFFSET_BITS=64) CHECK_FUNCTION_EXISTS(shm_open HAVE_SHM_OPEN) IF(HAVE_SHM_OPEN) @@ -299,47 +315,10 @@ IF(UNIX) ENDIF(HAVE_MALLOC_USABLE_SIZE) ENDIF(UNIX) - - IF(NOT MSVC) TARGET_LINK_LIBRARIES(TH m) ENDIF(NOT MSVC) -SET(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS}) -FOREACH(KEYWORD "inline" "__inline__" "__inline") - IF(NOT DEFINED C_INLINE) - - SET(CMAKE_REQUIRED_FLAGS "-Dinline=${KEYWORD} ${CMAKE_C_FLAGS}") - CHECK_C_SOURCE_RUNS(" - static inline int static_foo() - { - return 0; - } - - int main(int argc, char *argv[]) - { - static_foo(); - return 0; - }" C_HAS_${KEYWORD}) - - IF(C_HAS_${KEYWORD}) - SET(C_INLINE TRUE) -# Right now i put it in THGeneral.h -- debatable -# ADD_DEFINITIONS("-Dinline=${KEYWORD}") - SET(TH_INLINE ${KEYWORD}) - MESSAGE(STATUS "C inline is supported (${KEYWORD})") - ENDIF(C_HAS_${KEYWORD}) - ENDIF(NOT DEFINED C_INLINE) -ENDFOREACH(KEYWORD) -SET(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE}) - -IF(NOT DEFINED C_INLINE) - MESSAGE(STATUS "C inline seems not supported") -# Right now i put it in THGeneral.h -- debatable -# ADD_DEFINITIONS("-Dinline=") -SET(TH_INLINE "") -ENDIF(NOT DEFINED C_INLINE) - # Is __thread supported? IF(NOT MSVC) CHECK_C_SOURCE_COMPILES("static __thread int x = 1; int main() { return x; }" C_HAS_THREAD) @@ -355,6 +334,11 @@ ENDIF(NOT C_HAS_THREAD) INCLUDE_DIRECTORIES("${CMAKE_CURRENT_BINARY_DIR}") CONFIGURE_FILE(THGeneral.h.in "${CMAKE_CURRENT_BINARY_DIR}/THGeneral.h") + +####################################################################### +##### install section +###################################################################### + INSTALL(TARGETS TH EXPORT TH-exports RUNTIME DESTINATION "${TH_INSTALL_BIN_SUBDIR}" @@ -389,6 +373,11 @@ INSTALL(FILES DESTINATION "${TH_INSTALL_INCLUDE_SUBDIR}/TH") INSTALL(FILES + vector/AVX.h + vector/AVX2.h + DESTINATION "${TH_INSTALL_INCLUDE_SUBDIR}/TH/vector") + +INSTALL(FILES generic/THBlas.c generic/THBlas.h generic/THLapack.c diff --git a/lib/TH/THGeneral.h.in b/lib/TH/THGeneral.h.in index bc7e448..de11f1b 100644 --- a/lib/TH/THGeneral.h.in +++ b/lib/TH/THGeneral.h.in @@ -13,7 +13,6 @@ #cmakedefine USE_BLAS #cmakedefine USE_LAPACK -#cmakedefine BLAS_IS_ACCELERATE #cmakedefine BLAS_F2C #ifdef __cplusplus @@ -32,12 +31,6 @@ # define TH_API TH_EXTERNC #endif -#define TH_INLINE @TH_INLINE@ - -#ifndef __cplusplus -#define inline @TH_INLINE@ -#endif - #ifndef M_PI # define M_PI 3.14159265358979323846 #endif diff --git a/lib/TH/THVector.c b/lib/TH/THVector.c index 1c9ea24..d8493a5 100644 --- a/lib/TH/THVector.c +++ b/lib/TH/THVector.c @@ -15,12 +15,12 @@ #include "vector/SSE.c" #endif -#if defined(USE_AVX) || defined(USE_AVX2) -#include "vector/AVX.c" +#if defined(__AVX__) || defined(__AVX2__) +#include "vector/AVX.h" #endif -#if defined(USE_AVX2) -#include "vector/AVX2.c" +#if defined(__AVX2__) +#include "vector/AVX2.h" #endif #include "generic/THVectorDefault.c" diff --git a/lib/TH/generic/THVectorDispatch.c b/lib/TH/generic/THVectorDispatch.c index 5b88852..6220dd6 100644 --- a/lib/TH/generic/THVectorDispatch.c +++ b/lib/TH/generic/THVectorDispatch.c @@ -26,7 +26,7 @@ static FunctionDescription THVector_(fill_DISPATCHTABLE)[] = { #endif #endif - #if defined(USE_AVX) + #if defined(__AVX__) #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) FUNCTION_IMPL(THVector_(fill_AVX), SIMDExtension_AVX), #endif @@ -52,13 +52,13 @@ static FunctionDescription THVector_(cadd_DISPATCHTABLE)[] = { #endif #endif - #if defined(USE_AVX2) + #if defined(__AVX2__) #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) FUNCTION_IMPL(THVector_(cadd_AVX2), SIMDExtension_AVX2), #endif #endif - #if defined(USE_AVX) + #if defined(__AVX__) #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) FUNCTION_IMPL(THVector_(cadd_AVX), SIMDExtension_AVX), #endif @@ -91,7 +91,7 @@ static FunctionDescription THVector_(adds_DISPATCHTABLE)[] = { #endif #endif - #if defined(USE_AVX) + #if defined(__AVX__) #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) FUNCTION_IMPL(THVector_(adds_AVX), SIMDExtension_AVX), #endif @@ -119,7 +119,7 @@ static FunctionDescription THVector_(cmul_DISPATCHTABLE)[] = { #endif #endif - #if defined(USE_AVX) + #if defined(__AVX__) #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) FUNCTION_IMPL(THVector_(cmul_AVX), SIMDExtension_AVX), #endif @@ -152,7 +152,7 @@ static FunctionDescription THVector_(muls_DISPATCHTABLE)[] = { #endif #endif - #if defined(USE_AVX) + #if defined(__AVX__) #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) FUNCTION_IMPL(THVector_(muls_AVX), SIMDExtension_AVX), #endif @@ -179,7 +179,7 @@ static FunctionDescription THVector_(cdiv_DISPATCHTABLE)[] = { #endif #endif - #if defined(USE_AVX) + #if defined(__AVX__) #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) FUNCTION_IMPL(THVector_(cdiv_AVX), SIMDExtension_AVX), #endif @@ -206,7 +206,7 @@ static FunctionDescription THVector_(divs_DISPATCHTABLE)[] = { #endif #endif - #if defined(USE_AVX) + #if defined(__AVX__) #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) FUNCTION_IMPL(THVector_(divs_AVX), SIMDExtension_AVX), #endif @@ -227,7 +227,7 @@ void THVector_(divs)(real *y, const real *x, const real c, const ptrdiff_t n) { static void (*THVector_(copy_DISPATCHPTR))(real *, const real *, const ptrdiff_t) = &THVector_(copy_DEFAULT); static FunctionDescription THVector_(copy_DISPATCHTABLE)[] = { - #if defined(USE_AVX) + #if defined(__AVX__) #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) FUNCTION_IMPL(THVector_(copy_AVX), SIMDExtension_AVX), #endif diff --git a/lib/TH/generic/simd/convolve.c b/lib/TH/generic/simd/convolve.c index 842af17..bf07bbe 100644 --- a/lib/TH/generic/simd/convolve.c +++ b/lib/TH/generic/simd/convolve.c @@ -1,4 +1,4 @@ -#if defined(USE_AVX) +#if defined(__AVX__) #ifdef _MSC_VER #include <intrin.h> @@ -113,7 +113,7 @@ void convolve_5x5_sse(float* output, float* input, float* kernel, long outRows, void convolve_5x5_avx(float* output, float* input, float* kernel, long outRows, long outCols, long outStride, long inCols); void convolve_5x5(float* output, float* input, float* kernel, long outRows, long outCols, long inCols) { -#if defined(USE_AVX) +#if defined(__AVX__) int avx = haveCPUFeature(kCPUFeature_AVX); if (avx) { @@ -124,4 +124,4 @@ void convolve_5x5(float* output, float* input, float* kernel, long outRows, long { convolve_5x5_sse(output, input, kernel, outRows, outCols, outCols, inCols); } -}
\ No newline at end of file +} diff --git a/lib/TH/vector/AVX.c b/lib/TH/vector/AVX.c index 1f902cc..b7d5dd1 100644 --- a/lib/TH/vector/AVX.c +++ b/lib/TH/vector/AVX.c @@ -1,10 +1,13 @@ +#if defined(__AVX__) #ifndef _MSC_VER #include <x86intrin.h> #else #include <intrin.h> #endif -static void THDoubleVector_copy_AVX(double *y, const double *x, const ptrdiff_t n) { +#include "AVX.h" + +void THDoubleVector_copy_AVX(double *y, const double *x, const ptrdiff_t n) { ptrdiff_t i; ptrdiff_t off; for (i=0; i<=((n)-8); i+=8) { @@ -17,7 +20,7 @@ static void THDoubleVector_copy_AVX(double *y, const double *x, const ptrdiff_t } } -static void THDoubleVector_fill_AVX(double *x, const double c, const ptrdiff_t n) { +void THDoubleVector_fill_AVX(double *x, const double c, const ptrdiff_t n) { ptrdiff_t i; ptrdiff_t off; __m256d YMM0 = _mm256_set_pd(c, c, c, c); @@ -33,7 +36,7 @@ static void THDoubleVector_fill_AVX(double *x, const double c, const ptrdiff_t n } } -static void THDoubleVector_cdiv_AVX(double *z, const double *x, const double *y, const ptrdiff_t n) { +void THDoubleVector_cdiv_AVX(double *z, const double *x, const double *y, const ptrdiff_t n) { ptrdiff_t i; __m256d YMM0, YMM1, YMM2, YMM3; for (i=0; i<=((n)-8); i+=8) { @@ -51,7 +54,7 @@ static void THDoubleVector_cdiv_AVX(double *z, const double *x, const double *y, } } -static void THDoubleVector_divs_AVX(double *y, const double *x, const double c, const ptrdiff_t n) { +void THDoubleVector_divs_AVX(double *y, const double *x, const double c, const ptrdiff_t n) { ptrdiff_t i; __m256d YMM15 = _mm256_set_pd(c, c, c, c); __m256d YMM0, YMM1; @@ -68,7 +71,7 @@ static void THDoubleVector_divs_AVX(double *y, const double *x, const double c, } } -static void THDoubleVector_cmul_AVX(double *z, const double *x, const double *y, const ptrdiff_t n) { +void THDoubleVector_cmul_AVX(double *z, const double *x, const double *y, const ptrdiff_t n) { ptrdiff_t i; __m256d YMM0, YMM1, YMM2, YMM3; for (i=0; i<=((n)-8); i+=8) { @@ -86,7 +89,7 @@ static void THDoubleVector_cmul_AVX(double *z, const double *x, const double *y, } } -static void THDoubleVector_muls_AVX(double *y, const double *x, const double c, const ptrdiff_t n) { +void THDoubleVector_muls_AVX(double *y, const double *x, const double c, const ptrdiff_t n) { ptrdiff_t i; __m256d YMM15 = _mm256_set_pd(c, c, c, c); __m256d YMM0, YMM1; @@ -103,7 +106,7 @@ static void THDoubleVector_muls_AVX(double *y, const double *x, const double c, } } -static void THDoubleVector_cadd_AVX(double *z, const double *x, const double *y, const double c, const ptrdiff_t n) { +void THDoubleVector_cadd_AVX(double *z, const double *x, const double *y, const double c, const ptrdiff_t n) { ptrdiff_t i; __m256d YMM15 = _mm256_set_pd(c, c, c, c); __m256d YMM0, YMM1, YMM2, YMM3; @@ -119,7 +122,7 @@ static void THDoubleVector_cadd_AVX(double *z, const double *x, const double *y, } } -static void THDoubleVector_adds_AVX(double *y, const double *x, const double c, const ptrdiff_t n) { +void THDoubleVector_adds_AVX(double *y, const double *x, const double c, const ptrdiff_t n) { ptrdiff_t i; __m256d YMM15 = _mm256_set_pd(c, c, c, c); __m256d YMM0, YMM1; @@ -136,7 +139,7 @@ static void THDoubleVector_adds_AVX(double *y, const double *x, const double c, } } -static void THFloatVector_copy_AVX(float *y, const float *x, const ptrdiff_t n) { +void THFloatVector_copy_AVX(float *y, const float *x, const ptrdiff_t n) { ptrdiff_t i; ptrdiff_t off; for (i=0; i<=((n)-16); i+=16) { @@ -149,7 +152,7 @@ static void THFloatVector_copy_AVX(float *y, const float *x, const ptrdiff_t n) } } -static void THFloatVector_fill_AVX(float *x, const float c, const ptrdiff_t n) { +void THFloatVector_fill_AVX(float *x, const float c, const ptrdiff_t n) { ptrdiff_t i; ptrdiff_t off; __m256 YMM0 = _mm256_set_ps(c, c, c, c, c, c, c, c); @@ -165,7 +168,7 @@ static void THFloatVector_fill_AVX(float *x, const float c, const ptrdiff_t n) { } } -static void THFloatVector_cdiv_AVX(float *z, const float *x, const float *y, const ptrdiff_t n) { +void THFloatVector_cdiv_AVX(float *z, const float *x, const float *y, const ptrdiff_t n) { ptrdiff_t i; __m256 YMM0, YMM1, YMM2, YMM3; for (i=0; i<=((n)-16); i+=16) { @@ -183,7 +186,7 @@ static void THFloatVector_cdiv_AVX(float *z, const float *x, const float *y, con } } -static void THFloatVector_divs_AVX(float *y, const float *x, const float c, const ptrdiff_t n) { +void THFloatVector_divs_AVX(float *y, const float *x, const float c, const ptrdiff_t n) { ptrdiff_t i; __m256 YMM15 = _mm256_set_ps(c, c, c, c, c, c, c, c); __m256 YMM0, YMM1; @@ -200,7 +203,7 @@ static void THFloatVector_divs_AVX(float *y, const float *x, const float c, cons } } -static void THFloatVector_cmul_AVX(float *z, const float *x, const float *y, const ptrdiff_t n) { +void THFloatVector_cmul_AVX(float *z, const float *x, const float *y, const ptrdiff_t n) { ptrdiff_t i; __m256 YMM0, YMM1, YMM2, YMM3; for (i=0; i<=((n)-16); i+=16) { @@ -218,7 +221,7 @@ static void THFloatVector_cmul_AVX(float *z, const float *x, const float *y, con } } -static void THFloatVector_muls_AVX(float *y, const float *x, const float c, const ptrdiff_t n) { +void THFloatVector_muls_AVX(float *y, const float *x, const float c, const ptrdiff_t n) { ptrdiff_t i; __m256 YMM15 = _mm256_set_ps(c, c, c, c, c, c, c, c); __m256 YMM0, YMM1; @@ -235,7 +238,7 @@ static void THFloatVector_muls_AVX(float *y, const float *x, const float c, cons } } -static void THFloatVector_cadd_AVX(float *z, const float *x, const float *y, const float c, const ptrdiff_t n) { +void THFloatVector_cadd_AVX(float *z, const float *x, const float *y, const float c, const ptrdiff_t n) { ptrdiff_t i; __m256 YMM15 = _mm256_set_ps(c, c, c, c, c, c, c, c); __m256 YMM0, YMM1, YMM2, YMM3; @@ -251,7 +254,7 @@ static void THFloatVector_cadd_AVX(float *z, const float *x, const float *y, con } } -static void THFloatVector_adds_AVX(float *y, const float *x, const float c, const ptrdiff_t n) { +void THFloatVector_adds_AVX(float *y, const float *x, const float c, const ptrdiff_t n) { ptrdiff_t i; __m256 YMM15 = _mm256_set_ps(c, c, c, c, c, c, c, c); __m256 YMM0, YMM1; @@ -267,3 +270,5 @@ static void THFloatVector_adds_AVX(float *y, const float *x, const float c, cons y[i] = x[i] + c; } } + +#endif // defined(__AVX__) diff --git a/lib/TH/vector/AVX.h b/lib/TH/vector/AVX.h new file mode 100644 index 0000000..bfaeaa6 --- /dev/null +++ b/lib/TH/vector/AVX.h @@ -0,0 +1,23 @@ +#ifndef TH_AVX_H +#define TH_AVX_H + +#include <stddef.h> + +void THDoubleVector_copy_AVX(double *y, const double *x, const ptrdiff_t n); +void THDoubleVector_fill_AVX(double *x, const double c, const ptrdiff_t n); +void THDoubleVector_cdiv_AVX(double *z, const double *x, const double *y, const ptrdiff_t n); +void THDoubleVector_divs_AVX(double *y, const double *x, const double c, const ptrdiff_t n); +void THDoubleVector_cmul_AVX(double *z, const double *x, const double *y, const ptrdiff_t n); +void THDoubleVector_muls_AVX(double *y, const double *x, const double c, const ptrdiff_t n); +void THDoubleVector_cadd_AVX(double *z, const double *x, const double *y, const double c, const ptrdiff_t n); +void THDoubleVector_adds_AVX(double *y, const double *x, const double c, const ptrdiff_t n); +void THFloatVector_copy_AVX(float *y, const float *x, const ptrdiff_t n); +void THFloatVector_fill_AVX(float *x, const float c, const ptrdiff_t n); +void THFloatVector_cdiv_AVX(float *z, const float *x, const float *y, const ptrdiff_t n); +void THFloatVector_divs_AVX(float *y, const float *x, const float c, const ptrdiff_t n); +void THFloatVector_cmul_AVX(float *z, const float *x, const float *y, const ptrdiff_t n); +void THFloatVector_muls_AVX(float *y, const float *x, const float c, const ptrdiff_t n); +void THFloatVector_cadd_AVX(float *z, const float *x, const float *y, const float c, const ptrdiff_t n); +void THFloatVector_adds_AVX(float *y, const float *x, const float c, const ptrdiff_t n); + +#endif diff --git a/lib/TH/vector/AVX2.c b/lib/TH/vector/AVX2.c index 3ccfc82..082a680 100644 --- a/lib/TH/vector/AVX2.c +++ b/lib/TH/vector/AVX2.c @@ -1,10 +1,12 @@ +#if defined(__AVX2__) #ifndef _MSC_VER #include <x86intrin.h> #else #include <intrin.h> #endif +#include "AVX2.h" -static void THDoubleVector_cadd_AVX2(double *z, const double *x, const double *y, const double c, const ptrdiff_t n) { +void THDoubleVector_cadd_AVX2(double *z, const double *x, const double *y, const double c, const ptrdiff_t n) { ptrdiff_t i; __m256d YMM15 = _mm256_set_pd(c, c, c, c); __m256d YMM0, YMM1, YMM2, YMM3; @@ -23,7 +25,7 @@ static void THDoubleVector_cadd_AVX2(double *z, const double *x, const double *y } } -static void THFloatVector_cadd_AVX2(float *z, const float *x, const float *y, const float c, const ptrdiff_t n) { +void THFloatVector_cadd_AVX2(float *z, const float *x, const float *y, const float c, const ptrdiff_t n) { ptrdiff_t i; __m256 YMM15 = _mm256_set_ps(c, c, c, c, c, c, c, c); __m256 YMM0, YMM1, YMM2, YMM3; @@ -42,3 +44,4 @@ static void THFloatVector_cadd_AVX2(float *z, const float *x, const float *y, co } } +#endif // defined(__AVX2__) diff --git a/lib/TH/vector/AVX2.h b/lib/TH/vector/AVX2.h new file mode 100644 index 0000000..85a9e93 --- /dev/null +++ b/lib/TH/vector/AVX2.h @@ -0,0 +1,9 @@ +#ifndef TH_AVX2_H +#define TH_AVX2_H + +#include <stddef.h> + +void THDoubleVector_cadd_AVX2(double *z, const double *x, const double *y, const double c, const ptrdiff_t n); +void THFloatVector_cadd_AVX2(float *z, const float *x, const float *y, const float c, const ptrdiff_t n); + +#endif |