diff options
author | Soumith Chintala <soumith@gmail.com> | 2016-11-15 23:57:57 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2016-11-15 23:57:57 +0300 |
commit | 97341e27ae35fb3bcfabbad0796dc1a60e0c2cde (patch) | |
tree | 01a184fb843f8c4051959877f2c76f7412e00455 | |
parent | a7d9af35d034a9cc6c1ff4e58e4154a1cfe8464a (diff) | |
parent | 01b0f5ad33e36e0e74c86a8c44aa3e30dfee617a (diff) |
Merge pull request #839 from Atcold/fix_ASIMD
Fix compilation for ASIMD, fix #766
-rw-r--r-- | lib/TH/CMakeLists.txt | 7 | ||||
-rw-r--r-- | lib/TH/cmake/FindARM.cmake | 9 | ||||
-rw-r--r-- | lib/TH/vector/NEON.c | 296 |
3 files changed, 75 insertions, 237 deletions
diff --git a/lib/TH/CMakeLists.txt b/lib/TH/CMakeLists.txt index 29343c7..e6cf91d 100644 --- a/lib/TH/CMakeLists.txt +++ b/lib/TH/CMakeLists.txt @@ -62,10 +62,13 @@ ENDIF (WITH_OPENMP) # ARM specific flags FIND_PACKAGE(ARM) -IF (NEON_FOUND) +IF (ASIMD_FOUND) + MESSAGE(STATUS "asimd/Neon found with compiler flag : -D__NEON__") + SET(CMAKE_C_FLAGS "-D__NEON__ ${CMAKE_C_FLAGS}") +ELSEIF (NEON_FOUND) MESSAGE(STATUS "Neon found with compiler flag : -mfpu=neon -D__NEON__") SET(CMAKE_C_FLAGS "-mfpu=neon -D__NEON__ ${CMAKE_C_FLAGS}") -ENDIF (NEON_FOUND) +ENDIF (ASIMD_FOUND) IF (CORTEXA8_FOUND) MESSAGE(STATUS "Cortex-A8 Found with compiler flag : -mcpu=cortex-a8") SET(CMAKE_C_FLAGS "-mcpu=cortex-a8 -fprefetch-loop-arrays ${CMAKE_C_FLAGS}") diff --git a/lib/TH/cmake/FindARM.cmake b/lib/TH/cmake/FindARM.cmake index cf1f8fd..59c78d8 100644 --- a/lib/TH/cmake/FindARM.cmake +++ b/lib/TH/cmake/FindARM.cmake @@ -13,6 +13,15 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux") set(NEON_FOUND false CACHE BOOL "NEON available on host") ENDIF (NEON_TRUE) + # on ARMv8, neon is inherit and instead listed as 'asimd' in /proc/cpuinfo + STRING(REGEX REPLACE "^.*(asimd).*$" "\\1" ASIMD_THERE ${CPUINFO}) + STRING(COMPARE EQUAL "asimd" "${ASIMD_THERE}" ASIMD_TRUE) + IF (ASIMD_TRUE) + set(ASIMD_FOUND true CACHE BOOL "ASIMD/NEON available on host") + ELSE (ASIMD_TRUE) + set(ASIMD_FOUND false CACHE BOOL "ASIMD/NEON available on host") + ENDIF (ASIMD_TRUE) + #Find the processor type (for now OMAP3 or OMAP4) STRING(REGEX REPLACE "^.*(OMAP3).*$" "\\1" OMAP3_THERE ${CPUINFO}) STRING(COMPARE EQUAL "OMAP3" "${OMAP3_THERE}" OMAP3_TRUE) diff --git a/lib/TH/vector/NEON.c b/lib/TH/vector/NEON.c index bc7cb2b..327b006 100644 --- a/lib/TH/vector/NEON.c +++ b/lib/TH/vector/NEON.c @@ -1,252 +1,78 @@ static void THFloatVector_fill_NEON(float *x, const float c, const ptrdiff_t n) { - float ctemp = c; - float * caddr = &ctemp; - __asm__ __volatile__ ( - "mov r0, %0 @ \n\t" - "ldr r4, [%1] @ \n\t" - "vdup.32 q12, r4 @ \n\t" - "vdup.32 q13, r4 @ \n\t" - "lsrs r4, %2, #3 @ \n\t" - "beq 3f @ \n\t" - "1: @ \n\t" - "vst1.32 {d24-d27}, [r0]! @ \n\t" - "subs r4, r4, #1 @ \n\t" - "bne 1b @ \n\t" - "3: @ \n\t" - "ands r4, %2, #7 @ \n\t" - "beq 5f @ \n\t" - "4: @ \n\t" - "subs r4, r4, #1 @ \n\t" - "vst1.32 {d24[0]}, [r0]! @ \n\t" - "bne 4b @ \n\t" - "5: @ " - : - :"r" (x), "r"(caddr),"r"(n) - : "cc", "r0", "r4", "memory", - "q12", - "d24", "d25", "d26", "d27" - ); + long i = 0; + + for(; i < n-4; i += 4) + { + x[i] = c; + x[i+1] = c; + x[i+2] = c; + x[i+3] = c; + } + + for(; i < n; i++) + x[i] = c; + } static void THFloatVector_diff_NEON(float *z, const float *x, const float *y, const ptrdiff_t n) { - __asm__ __volatile__ ( - "mov r0, %2 @ \n\t" - "mov r1, %1 @ \n\t" - "mov r2, %0 @ \n\t" - "lsrs r4, %3, #3 @ \n\t" - "beq 3f @ \n\t" - "vld1.32 {d16-d19}, [r1]! @ \n\t" - "vld1.32 {d0-d3}, [r0]! @ \n\t" - "1: @ \n\t" - "vsub.f32 q12, q8, q0 @ \n\t" - "vsub.f32 q13, q9, q1 @ \n\t" - "subs r4, r4, #1 @ \n\t" - "beq 2f @ \n\t" - "vld1.32 {d16-d19}, [r1]! @ \n\t" - "vld1.32 {d0-d3}, [r0]! @ \n\t" - "vst1.32 {d24-d27}, [r2]! @ \n\t" - "b 1b @ \n\t" - "2: @ \n\t" - "vst1.32 {d24-d27}, [r2]! @ \n\t" - "3: @ \n\t" - "ands r4, %3, #7 @ \n\t" - "beq 5f @ \n\t" - "4: @ \n\t" - "subs r4, r4, #1 @ \n\t" - "vld1.32 {d16[0]}, [r1]! @ \n\t" - "vld1.32 {d0[0]}, [r0]! @ \n\t" - "vsub.f32 d24, d16, d0 @ \n\t" - "vst1.32 {d24[0]}, [r2]! @ \n\t" - "bne 4b @ \n\t" - "5: @ " - : - :"r" (z), "r" (x),"r" (y), "r"(n) - : "cc", "r0", "r1", "r2", "r4", "memory", - "q0", "q1", "q8", "q9", "q12", "q13", - "d0", "d1", "d2", "d3", - "d16", "d17", "d18", "d19", "d24", "d25", "d26", "d27" - ); + long i = 0; + + for(; i < n-4; i += 4) + { + z[i] = x[i] - y[i]; + z[i+1] = x[i+1] - y[i+1]; + z[i+2] = x[i+2] - y[i+2]; + z[i+3] = x[i+3] - y[i+3]; + } + + for(; i < n; i++) + z[i] = x[i] - y[i]; + } static void THFloatVector_scale_NEON(float *y, const float c, const ptrdiff_t n) { - float ctemp = c; - float * caddr = &ctemp; - __asm__ __volatile__ ( - "mov r0, %0 @ \n\t" - "mov r2, r0 @ \n\t" - "ldr r5, [%1] @ \n\t" - "vdup.32 q14, r5 @ \n\t" - "lsrs r5, %2, #5 @ \n\t" - "beq 3f @ \n\t" - "vld1.32 {d0-d3}, [r0]! @ \n\t" - "vld1.32 {d4-d7}, [r0]! @ \n\t" - "vld1.32 {d8-d11}, [r0]! @ \n\t" - "vld1.32 {d12-d15}, [r0]! @ \n\t" - "1: @ \n\t" - "vmul.f32 q0, q0, q14 @ \n\t" - "vmul.f32 q1, q1, q14 @ \n\t" - "vmul.f32 q2, q2, q14 @ \n\t" - "vmul.f32 q3, q3, q14 @ \n\t" - "vmul.f32 q4, q4, q14 @ \n\t" - "vmul.f32 q5, q5, q14 @ \n\t" - "vmul.f32 q6, q6, q14 @ \n\t" - "vmul.f32 q7, q7, q14 @ \n\t" - "subs r5, r5, #1 @ \n\t" - "beq 2f @ \n\t" - "vst1.32 {d0-d3}, [r2]! @ \n\t" - "vld1.32 {d0-d3}, [r0]! @ \n\t" - "vst1.32 {d4-d7}, [r2]! @ \n\t" - "vld1.32 {d4-d7}, [r0]! @ \n\t" - "vst1.32 {d8-d11}, [r2]! @ \n\t" - "vld1.32 {d8-d11}, [r0]! @ \n\t" - "vst1.32 {d12-d15}, [r2]! @ \n\t" - "vld1.32 {d12-d15}, [r0]! @ \n\t" - "b 1b @ \n\t" - "2: @ \n\t" - "vst1.32 {d0-d3}, [r2]! @ \n\t" - "vst1.32 {d4-d7}, [r2]! @ \n\t" - "vst1.32 {d8-d11}, [r2]! @ \n\t" - "vst1.32 {d12-d15}, [r2]! @ \n\t" - "3: @ \n\t" - "lsrs r5, %2, #4 @ \n\t" - "ands r5, r5, #1 @ \n\t" - "beq 4f @ \n\t" - "vld1.32 {d0-d3}, [r0]! @ \n\t" - "vld1.32 {d4-d7}, [r0]! @ \n\t" - "vmul.f32 q0, q0, q14 @ \n\t" - "vmul.f32 q1, q1, q14 @ \n\t" - "vmul.f32 q2, q2, q14 @ \n\t" - "vmul.f32 q3, q3, q14 @ \n\t" - "vst1.32 {d0-d3}, [r2]! @ \n\t" - "vst1.32 {d4-d7}, [r2]! @ \n\t" - "4: @ \n\t" - "lsrs r5, %2, #3 @ \n\t" - "ands r5, r5, #1 @ \n\t" - "beq 5f @ \n\t" - "vld1.32 {d0-d3}, [r0]! @ \n\t" - "vmul.f32 q0, q0, q14 @ \n\t" - "vmul.f32 q1, q1, q14 @ \n\t" - "vst1.32 {d0-d3}, [r2]! @ \n\t" - "5: @ \n\t" - "ands r5, %2, #7 @ \n\t" - "beq 7f @ \n\t" - "6: @ \n\t" - "subs r5, r5, #1 @ \n\t" - "vld1.32 d0[0], [r0]! @ \n\t" - "vmul.f32 d0, d0, d28 @ \n\t" - "vst1.32 d0[0], [r2]! @ \n\t" - "bne 6b @ \n\t" - "7: @ " - : - :"r" (y), "r"(caddr),"r"(n) - : "cc", "r0", "r2", "r5", "memory", - "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q14", - "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", - "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", - "d28", "d29" - ); + long i = 0; + for(; i < n-4; i +=4) + { + y[i] *= c; + y[i+1] *= c; + y[i+2] *= c; + y[i+3] *= c; + } + + for(; i < n; i++) + y[i] *= c; } static void THFloatVector_mul_NEON(float *y, const float *x, const ptrdiff_t n) { - __asm__ __volatile__ ( - "mov r0, %0 @ \n\t" - "mov r1, %1 @ \n\t" - "mov r2, r0 @ \n\t" - "lsrs r4, %2, #3 @ \n\t" - "beq 3f @ \n\t" - "vld1.32 {d16-d19}, [r1]! @ \n\t" - "vld1.32 {d0-d3}, [r0]! @ \n\t" - "1: @ \n\t" - "vmul.f32 q12, q8, q0 @ \n\t" - "vmul.f32 q13, q9, q1 @ \n\t" - "subs r4, r4, #1 @ \n\t" - "beq 2f @ \n\t" - "vld1.32 {d16-d19}, [r1]! @ \n\t" - "vld1.32 {d0-d3}, [r0]! @ \n\t" - "vst1.32 {d24-d27}, [r2]! @ \n\t" - "b 1b @ \n\t" - "2: @ \n\t" - "vst1.32 {d24-d27}, [r2]! @ \n\t" - "3: @ \n\t" - "ands r4, %2, #7 @ \n\t" - "beq 5f @ \n\t" - "4: @ \n\t" - "subs r4, r4, #1 @ \n\t" - "vld1.32 {d16[0]}, [r1]! @ \n\t" - "vld1.32 {d0[0]}, [r0]! @ \n\t" - "vmul.f32 q12, q8, q0 @ \n\t" - "vst1.32 {d24[0]}, [r2]! @ \n\t" - "bne 4b @ \n\t" - "5: @ " - : - :"r" (y),"r" (x),"r"(n) - : "cc", "r0", "r1", "r2", "r4", "memory", - "q0", "q1", "q8", "q9", "q12", "q13", - "d0", "d1", "d2", "d3", - "d16", "d17", "d18", "d19", "d24", "d25", "d26", "d27" - ); + long i = 0; + + for(; i < n-4; i += 4) + { + y[i] *= x[i]; + y[i+1] *= x[i+1]; + y[i+2] *= x[i+2]; + y[i+3] *= x[i+3]; + } + + for(; i < n; i++) + y[i] *= x[i]; } static void THFloatVector_add_NEON(float *y, const float *x, const float c, const ptrdiff_t n) { - float ctemp = c; - float * caddr = &ctemp; - __asm__ __volatile__ ( - "mov r0, %0 @ \n\t" - "mov r1, %1 @ \n\t" - "mov r2, r0 @ \n\t" - "ldr r5, [%2] @ \n\t" - "vdup.32 q14, r5 @ \n\t" - "lsrs r5, %3, #4 @ \n\t" - "beq 3f @ \n\t" - "vld1.32 {d16-d19}, [r1]! @ \n\t" - "vld1.32 {d0-d3}, [r0]! @ \n\t" - "vld1.32 {d20-d23}, [r1]! @ \n\t" - "vld1.32 {d4-d7}, [r0]! @ \n\t" - "1: @ \n\t" - "vmla.f32 q0, q8, q14 @ \n\t" - "vmla.f32 q1, q9, q14 @ \n\t" - "vmla.f32 q2, q10, q14 @ \n\t" - "vmla.f32 q3, q11, q14 @ \n\t" - "subs r5, r5, #1 @ \n\t" - "beq 2f @ \n\t" - "vld1.32 {d16-d19}, [r1]! @ \n\t" - "vld1.32 {d20-d23}, [r1]! @ \n\t" - "vst1.32 {d0-d3}, [r2]! @ \n\t" - "vld1.32 {d0-d3}, [r0]! @ \n\t" - "vst1.32 {d4-d7}, [r2]! @ \n\t" - "vld1.32 {d4-d7}, [r0]! @ \n\t" - "b 1b @ \n\t" - "2: @ \n\t" - "vst1.32 {d0-d3}, [r2]! @ \n\t" - "vst1.32 {d4-d7}, [r2]! @ \n\t" - "3: @ \n\t" - "lsrs r5, %3, #3 @ \n\t" - "ands r5, #1 @ \n\t" - "beq 4f @ \n\t" - "vld1.32 {d16-d19}, [r1]! @ \n\t" - "vld1.32 {d0-d3}, [r0]! @ \n\t" - "vmla.f32 q0, q8, q14 @ \n\t" - "vmla.f32 q1, q9, q14 @ \n\t" - "vst1.32 {d0-d3}, [r2]! @ \n\t" - "4: @ \n\t" - "ands r5, %3, #7 @ \n\t" - "beq 6f @ \n\t" - "5: @ \n\t" - "subs r5, r5, #1 @ \n\t" - "vld1.32 {d16[0]}, [r1]! @ \n\t" - "vld1.32 {d0[0]}, [r0]! @ \n\t" - "vmla.f32 d0, d16, d28 @ \n\t" - "vst1.32 d0[0], [r2]! @ \n\t" - "bne 5b @ \n\t" - "6: @ " - : - :"r" (y),"r" (x), "r"(caddr),"r"(n) - : "cc", "r0", "r1", "r2", "r5", "memory", - "q0", "q1", "q2", "q3", "q14", - "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", - "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d28", "d29" - ); + long i = 0; + + for(;i < n-4; i += 4) + { + y[i] += c * x[i]; + y[i+1] += c * x[i+1]; + y[i+2] += c * x[i+2]; + y[i+3] += c * x[i+3]; + } + + for(; i < n; i++) + y[i] += c * x[i]; } |