diff options
author | Martin Storsjö <martin@martin.st> | 2018-11-15 17:15:30 +0300 |
---|---|---|
committer | Martin Storsjö <martin@martin.st> | 2018-11-26 00:25:28 +0300 |
commit | 513dfa990804496780a7fa9ee0707b84e1976c13 (patch) | |
tree | 59b9bc3940ed0781e8d0d221907cfbe5bb0510cf /src/looprestoration_tmpl.c | |
parent | b6bb8536ad299d52a5ff49a4f0317b923ce6b8bb (diff) |
arm64: looprestoration: NEON optimized wiener filter
The relative speedup compared to C code is around 4.2x on a Cortex A53
and 5.1x on a Snapdragon 835 (compared to GCC's autovectorized code),
6-7x compared to GCC's output without autovectorization, and ~8x
compared to clang's output (which doesn't seem to try to vectorize
this function).
Diffstat (limited to 'src/looprestoration_tmpl.c')
-rw-r--r-- | src/looprestoration_tmpl.c | 6 |
1 files changed, 5 insertions, 1 deletions
```diff
diff --git a/src/looprestoration_tmpl.c b/src/looprestoration_tmpl.c
index c88e425..393ba45 100644
--- a/src/looprestoration_tmpl.c
+++ b/src/looprestoration_tmpl.c
@@ -573,7 +573,11 @@ void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *cons
     c->wiener = wiener_c;
     c->selfguided = selfguided_c;
 
-#if HAVE_ASM && ARCH_X86 && BITDEPTH == 8
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+    bitfn(dav1d_loop_restoration_dsp_init_arm)(c);
+#elif ARCH_X86
     bitfn(dav1d_loop_restoration_dsp_init_x86)(c);
 #endif
+#endif
 }
```