Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/torch/torch7.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Soumith Chintala <soumith@gmail.com>, 2016-11-15 23:57:57 +0300
committer: GitHub <noreply@github.com>, 2016-11-15 23:57:57 +0300
commit: 97341e27ae35fb3bcfabbad0796dc1a60e0c2cde (patch)
tree: 01a184fb843f8c4051959877f2c76f7412e00455
parent: a7d9af35d034a9cc6c1ff4e58e4154a1cfe8464a (diff)
parent: 01b0f5ad33e36e0e74c86a8c44aa3e30dfee617a (diff)
Merge pull request #839 from Atcold/fix_ASIMD
Fix compilation for ASIMD, fix #766
-rw-r--r-- lib/TH/CMakeLists.txt | 7
-rw-r--r-- lib/TH/cmake/FindARM.cmake | 9
-rw-r--r-- lib/TH/vector/NEON.c | 296
3 files changed, 75 insertions, 237 deletions
diff --git a/lib/TH/CMakeLists.txt b/lib/TH/CMakeLists.txt
index 29343c7..e6cf91d 100644
--- a/lib/TH/CMakeLists.txt
+++ b/lib/TH/CMakeLists.txt
@@ -62,10 +62,13 @@ ENDIF (WITH_OPENMP)
# ARM specific flags
FIND_PACKAGE(ARM)
-IF (NEON_FOUND)
+IF (ASIMD_FOUND)
+ MESSAGE(STATUS "asimd/Neon found with compiler flag : -D__NEON__")
+ SET(CMAKE_C_FLAGS "-D__NEON__ ${CMAKE_C_FLAGS}")
+ELSEIF (NEON_FOUND)
MESSAGE(STATUS "Neon found with compiler flag : -mfpu=neon -D__NEON__")
SET(CMAKE_C_FLAGS "-mfpu=neon -D__NEON__ ${CMAKE_C_FLAGS}")
-ENDIF (NEON_FOUND)
+ENDIF (ASIMD_FOUND)
IF (CORTEXA8_FOUND)
MESSAGE(STATUS "Cortex-A8 Found with compiler flag : -mcpu=cortex-a8")
SET(CMAKE_C_FLAGS "-mcpu=cortex-a8 -fprefetch-loop-arrays ${CMAKE_C_FLAGS}")
diff --git a/lib/TH/cmake/FindARM.cmake b/lib/TH/cmake/FindARM.cmake
index cf1f8fd..59c78d8 100644
--- a/lib/TH/cmake/FindARM.cmake
+++ b/lib/TH/cmake/FindARM.cmake
@@ -13,6 +13,15 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux")
set(NEON_FOUND false CACHE BOOL "NEON available on host")
ENDIF (NEON_TRUE)
+ # on ARMv8, neon is inherit and instead listed as 'asimd' in /proc/cpuinfo
+ STRING(REGEX REPLACE "^.*(asimd).*$" "\\1" ASIMD_THERE ${CPUINFO})
+ STRING(COMPARE EQUAL "asimd" "${ASIMD_THERE}" ASIMD_TRUE)
+ IF (ASIMD_TRUE)
+ set(ASIMD_FOUND true CACHE BOOL "ASIMD/NEON available on host")
+ ELSE (ASIMD_TRUE)
+ set(ASIMD_FOUND false CACHE BOOL "ASIMD/NEON available on host")
+ ENDIF (ASIMD_TRUE)
+
#Find the processor type (for now OMAP3 or OMAP4)
STRING(REGEX REPLACE "^.*(OMAP3).*$" "\\1" OMAP3_THERE ${CPUINFO})
STRING(COMPARE EQUAL "OMAP3" "${OMAP3_THERE}" OMAP3_TRUE)
diff --git a/lib/TH/vector/NEON.c b/lib/TH/vector/NEON.c
index bc7cb2b..327b006 100644
--- a/lib/TH/vector/NEON.c
+++ b/lib/TH/vector/NEON.c
@@ -1,252 +1,78 @@
static void THFloatVector_fill_NEON(float *x, const float c, const ptrdiff_t n) {
- float ctemp = c;
- float * caddr = &ctemp;
- __asm__ __volatile__ (
- "mov r0, %0 @ \n\t"
- "ldr r4, [%1] @ \n\t"
- "vdup.32 q12, r4 @ \n\t"
- "vdup.32 q13, r4 @ \n\t"
- "lsrs r4, %2, #3 @ \n\t"
- "beq 3f @ \n\t"
- "1: @ \n\t"
- "vst1.32 {d24-d27}, [r0]! @ \n\t"
- "subs r4, r4, #1 @ \n\t"
- "bne 1b @ \n\t"
- "3: @ \n\t"
- "ands r4, %2, #7 @ \n\t"
- "beq 5f @ \n\t"
- "4: @ \n\t"
- "subs r4, r4, #1 @ \n\t"
- "vst1.32 {d24[0]}, [r0]! @ \n\t"
- "bne 4b @ \n\t"
- "5: @ "
- :
- :"r" (x), "r"(caddr),"r"(n)
- : "cc", "r0", "r4", "memory",
- "q12",
- "d24", "d25", "d26", "d27"
- );
+ long i = 0;
+
+ for(; i < n-4; i += 4)
+ {
+ x[i] = c;
+ x[i+1] = c;
+ x[i+2] = c;
+ x[i+3] = c;
+ }
+
+ for(; i < n; i++)
+ x[i] = c;
+
}
static void THFloatVector_diff_NEON(float *z, const float *x, const float *y, const ptrdiff_t n) {
- __asm__ __volatile__ (
- "mov r0, %2 @ \n\t"
- "mov r1, %1 @ \n\t"
- "mov r2, %0 @ \n\t"
- "lsrs r4, %3, #3 @ \n\t"
- "beq 3f @ \n\t"
- "vld1.32 {d16-d19}, [r1]! @ \n\t"
- "vld1.32 {d0-d3}, [r0]! @ \n\t"
- "1: @ \n\t"
- "vsub.f32 q12, q8, q0 @ \n\t"
- "vsub.f32 q13, q9, q1 @ \n\t"
- "subs r4, r4, #1 @ \n\t"
- "beq 2f @ \n\t"
- "vld1.32 {d16-d19}, [r1]! @ \n\t"
- "vld1.32 {d0-d3}, [r0]! @ \n\t"
- "vst1.32 {d24-d27}, [r2]! @ \n\t"
- "b 1b @ \n\t"
- "2: @ \n\t"
- "vst1.32 {d24-d27}, [r2]! @ \n\t"
- "3: @ \n\t"
- "ands r4, %3, #7 @ \n\t"
- "beq 5f @ \n\t"
- "4: @ \n\t"
- "subs r4, r4, #1 @ \n\t"
- "vld1.32 {d16[0]}, [r1]! @ \n\t"
- "vld1.32 {d0[0]}, [r0]! @ \n\t"
- "vsub.f32 d24, d16, d0 @ \n\t"
- "vst1.32 {d24[0]}, [r2]! @ \n\t"
- "bne 4b @ \n\t"
- "5: @ "
- :
- :"r" (z), "r" (x),"r" (y), "r"(n)
- : "cc", "r0", "r1", "r2", "r4", "memory",
- "q0", "q1", "q8", "q9", "q12", "q13",
- "d0", "d1", "d2", "d3",
- "d16", "d17", "d18", "d19", "d24", "d25", "d26", "d27"
- );
+ long i = 0;
+
+ for(; i < n-4; i += 4)
+ {
+ z[i] = x[i] - y[i];
+ z[i+1] = x[i+1] - y[i+1];
+ z[i+2] = x[i+2] - y[i+2];
+ z[i+3] = x[i+3] - y[i+3];
+ }
+
+ for(; i < n; i++)
+ z[i] = x[i] - y[i];
+
}
static void THFloatVector_scale_NEON(float *y, const float c, const ptrdiff_t n) {
- float ctemp = c;
- float * caddr = &ctemp;
- __asm__ __volatile__ (
- "mov r0, %0 @ \n\t"
- "mov r2, r0 @ \n\t"
- "ldr r5, [%1] @ \n\t"
- "vdup.32 q14, r5 @ \n\t"
- "lsrs r5, %2, #5 @ \n\t"
- "beq 3f @ \n\t"
- "vld1.32 {d0-d3}, [r0]! @ \n\t"
- "vld1.32 {d4-d7}, [r0]! @ \n\t"
- "vld1.32 {d8-d11}, [r0]! @ \n\t"
- "vld1.32 {d12-d15}, [r0]! @ \n\t"
- "1: @ \n\t"
- "vmul.f32 q0, q0, q14 @ \n\t"
- "vmul.f32 q1, q1, q14 @ \n\t"
- "vmul.f32 q2, q2, q14 @ \n\t"
- "vmul.f32 q3, q3, q14 @ \n\t"
- "vmul.f32 q4, q4, q14 @ \n\t"
- "vmul.f32 q5, q5, q14 @ \n\t"
- "vmul.f32 q6, q6, q14 @ \n\t"
- "vmul.f32 q7, q7, q14 @ \n\t"
- "subs r5, r5, #1 @ \n\t"
- "beq 2f @ \n\t"
- "vst1.32 {d0-d3}, [r2]! @ \n\t"
- "vld1.32 {d0-d3}, [r0]! @ \n\t"
- "vst1.32 {d4-d7}, [r2]! @ \n\t"
- "vld1.32 {d4-d7}, [r0]! @ \n\t"
- "vst1.32 {d8-d11}, [r2]! @ \n\t"
- "vld1.32 {d8-d11}, [r0]! @ \n\t"
- "vst1.32 {d12-d15}, [r2]! @ \n\t"
- "vld1.32 {d12-d15}, [r0]! @ \n\t"
- "b 1b @ \n\t"
- "2: @ \n\t"
- "vst1.32 {d0-d3}, [r2]! @ \n\t"
- "vst1.32 {d4-d7}, [r2]! @ \n\t"
- "vst1.32 {d8-d11}, [r2]! @ \n\t"
- "vst1.32 {d12-d15}, [r2]! @ \n\t"
- "3: @ \n\t"
- "lsrs r5, %2, #4 @ \n\t"
- "ands r5, r5, #1 @ \n\t"
- "beq 4f @ \n\t"
- "vld1.32 {d0-d3}, [r0]! @ \n\t"
- "vld1.32 {d4-d7}, [r0]! @ \n\t"
- "vmul.f32 q0, q0, q14 @ \n\t"
- "vmul.f32 q1, q1, q14 @ \n\t"
- "vmul.f32 q2, q2, q14 @ \n\t"
- "vmul.f32 q3, q3, q14 @ \n\t"
- "vst1.32 {d0-d3}, [r2]! @ \n\t"
- "vst1.32 {d4-d7}, [r2]! @ \n\t"
- "4: @ \n\t"
- "lsrs r5, %2, #3 @ \n\t"
- "ands r5, r5, #1 @ \n\t"
- "beq 5f @ \n\t"
- "vld1.32 {d0-d3}, [r0]! @ \n\t"
- "vmul.f32 q0, q0, q14 @ \n\t"
- "vmul.f32 q1, q1, q14 @ \n\t"
- "vst1.32 {d0-d3}, [r2]! @ \n\t"
- "5: @ \n\t"
- "ands r5, %2, #7 @ \n\t"
- "beq 7f @ \n\t"
- "6: @ \n\t"
- "subs r5, r5, #1 @ \n\t"
- "vld1.32 d0[0], [r0]! @ \n\t"
- "vmul.f32 d0, d0, d28 @ \n\t"
- "vst1.32 d0[0], [r2]! @ \n\t"
- "bne 6b @ \n\t"
- "7: @ "
- :
- :"r" (y), "r"(caddr),"r"(n)
- : "cc", "r0", "r2", "r5", "memory",
- "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q14",
- "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
- "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
- "d28", "d29"
- );
+ long i = 0;
+ for(; i < n-4; i +=4)
+ {
+ y[i] *= c;
+ y[i+1] *= c;
+ y[i+2] *= c;
+ y[i+3] *= c;
+ }
+
+ for(; i < n; i++)
+ y[i] *= c;
}
static void THFloatVector_mul_NEON(float *y, const float *x, const ptrdiff_t n) {
- __asm__ __volatile__ (
- "mov r0, %0 @ \n\t"
- "mov r1, %1 @ \n\t"
- "mov r2, r0 @ \n\t"
- "lsrs r4, %2, #3 @ \n\t"
- "beq 3f @ \n\t"
- "vld1.32 {d16-d19}, [r1]! @ \n\t"
- "vld1.32 {d0-d3}, [r0]! @ \n\t"
- "1: @ \n\t"
- "vmul.f32 q12, q8, q0 @ \n\t"
- "vmul.f32 q13, q9, q1 @ \n\t"
- "subs r4, r4, #1 @ \n\t"
- "beq 2f @ \n\t"
- "vld1.32 {d16-d19}, [r1]! @ \n\t"
- "vld1.32 {d0-d3}, [r0]! @ \n\t"
- "vst1.32 {d24-d27}, [r2]! @ \n\t"
- "b 1b @ \n\t"
- "2: @ \n\t"
- "vst1.32 {d24-d27}, [r2]! @ \n\t"
- "3: @ \n\t"
- "ands r4, %2, #7 @ \n\t"
- "beq 5f @ \n\t"
- "4: @ \n\t"
- "subs r4, r4, #1 @ \n\t"
- "vld1.32 {d16[0]}, [r1]! @ \n\t"
- "vld1.32 {d0[0]}, [r0]! @ \n\t"
- "vmul.f32 q12, q8, q0 @ \n\t"
- "vst1.32 {d24[0]}, [r2]! @ \n\t"
- "bne 4b @ \n\t"
- "5: @ "
- :
- :"r" (y),"r" (x),"r"(n)
- : "cc", "r0", "r1", "r2", "r4", "memory",
- "q0", "q1", "q8", "q9", "q12", "q13",
- "d0", "d1", "d2", "d3",
- "d16", "d17", "d18", "d19", "d24", "d25", "d26", "d27"
- );
+ long i = 0;
+
+ for(; i < n-4; i += 4)
+ {
+ y[i] *= x[i];
+ y[i+1] *= x[i+1];
+ y[i+2] *= x[i+2];
+ y[i+3] *= x[i+3];
+ }
+
+ for(; i < n; i++)
+ y[i] *= x[i];
}
static void THFloatVector_add_NEON(float *y, const float *x, const float c, const ptrdiff_t n) {
- float ctemp = c;
- float * caddr = &ctemp;
- __asm__ __volatile__ (
- "mov r0, %0 @ \n\t"
- "mov r1, %1 @ \n\t"
- "mov r2, r0 @ \n\t"
- "ldr r5, [%2] @ \n\t"
- "vdup.32 q14, r5 @ \n\t"
- "lsrs r5, %3, #4 @ \n\t"
- "beq 3f @ \n\t"
- "vld1.32 {d16-d19}, [r1]! @ \n\t"
- "vld1.32 {d0-d3}, [r0]! @ \n\t"
- "vld1.32 {d20-d23}, [r1]! @ \n\t"
- "vld1.32 {d4-d7}, [r0]! @ \n\t"
- "1: @ \n\t"
- "vmla.f32 q0, q8, q14 @ \n\t"
- "vmla.f32 q1, q9, q14 @ \n\t"
- "vmla.f32 q2, q10, q14 @ \n\t"
- "vmla.f32 q3, q11, q14 @ \n\t"
- "subs r5, r5, #1 @ \n\t"
- "beq 2f @ \n\t"
- "vld1.32 {d16-d19}, [r1]! @ \n\t"
- "vld1.32 {d20-d23}, [r1]! @ \n\t"
- "vst1.32 {d0-d3}, [r2]! @ \n\t"
- "vld1.32 {d0-d3}, [r0]! @ \n\t"
- "vst1.32 {d4-d7}, [r2]! @ \n\t"
- "vld1.32 {d4-d7}, [r0]! @ \n\t"
- "b 1b @ \n\t"
- "2: @ \n\t"
- "vst1.32 {d0-d3}, [r2]! @ \n\t"
- "vst1.32 {d4-d7}, [r2]! @ \n\t"
- "3: @ \n\t"
- "lsrs r5, %3, #3 @ \n\t"
- "ands r5, #1 @ \n\t"
- "beq 4f @ \n\t"
- "vld1.32 {d16-d19}, [r1]! @ \n\t"
- "vld1.32 {d0-d3}, [r0]! @ \n\t"
- "vmla.f32 q0, q8, q14 @ \n\t"
- "vmla.f32 q1, q9, q14 @ \n\t"
- "vst1.32 {d0-d3}, [r2]! @ \n\t"
- "4: @ \n\t"
- "ands r5, %3, #7 @ \n\t"
- "beq 6f @ \n\t"
- "5: @ \n\t"
- "subs r5, r5, #1 @ \n\t"
- "vld1.32 {d16[0]}, [r1]! @ \n\t"
- "vld1.32 {d0[0]}, [r0]! @ \n\t"
- "vmla.f32 d0, d16, d28 @ \n\t"
- "vst1.32 d0[0], [r2]! @ \n\t"
- "bne 5b @ \n\t"
- "6: @ "
- :
- :"r" (y),"r" (x), "r"(caddr),"r"(n)
- : "cc", "r0", "r1", "r2", "r5", "memory",
- "q0", "q1", "q2", "q3", "q14",
- "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
- "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d28", "d29"
- );
+ long i = 0;
+
+ for(;i < n-4; i += 4)
+ {
+ y[i] += c * x[i];
+ y[i+1] += c * x[i+1];
+ y[i+2] += c * x[i+2];
+ y[i+3] += c * x[i+3];
+ }
+
+ for(; i < n; i++)
+ y[i] += c * x[i];
}