Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/videolan/dav1d.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Henrik Gramner <gramner@twoorioles.com> 2022-07-06 15:43:44 +0300
committer Henrik Gramner <henrik@gramner.com> 2022-07-06 16:05:47 +0300
commit bd0466350d20e2c6aab4c47668cd5486dc7a3d94 (patch)
tree a49202fdff9fe3f560aa4eadde6f54f4d033ed21
parent 820bf5156322ea6f9d1fc180ac579743347b9c5b (diff)
Eliminate unused C DSP functions at compile time
When compiling with asm enabled, there's no point in compiling C versions of DSP functions that have asm implementations using instruction sets that the compiler can unconditionally use. E.g., when compiling with -mssse3, we can remove the C versions of all functions with SSSE3 implementations. This is accomplished using the compiler's dead code elimination functionality. It can be configured using the new 'trim_dsp' meson option, which by default is enabled when compiling in release mode.
-rw-r--r-- .gitlab-ci.yml 13
-rw-r--r-- meson.build 3
-rw-r--r-- meson_options.txt 6
-rw-r--r-- src/arm/cdef.h (renamed from src/arm/cdef_init_tmpl.c) 3
-rw-r--r-- src/arm/filmgrain.h (renamed from src/arm/filmgrain_init_tmpl.c) 54
-rw-r--r-- src/arm/ipred.h (renamed from src/arm/ipred_init_tmpl.c) 2
-rw-r--r-- src/arm/itx.h (renamed from src/arm/itx_init_tmpl.c) 4
-rw-r--r-- src/arm/loopfilter.h (renamed from src/arm/loopfilter_init_tmpl.c) 2
-rw-r--r-- src/arm/looprestoration.h (renamed from src/arm/looprestoration_init_tmpl.c) 4
-rw-r--r-- src/arm/mc.h (renamed from src/arm/mc_init_tmpl.c) 2
-rw-r--r-- src/arm/refmvs.h (renamed from src/arm/refmvs_init.c) 2
-rw-r--r-- src/cdef.h 3
-rw-r--r-- src/cdef_tmpl.c 16
-rw-r--r-- src/cpu.h 51
-rw-r--r-- src/filmgrain.h 2
-rw-r--r-- src/filmgrain_tmpl.c 12
-rw-r--r-- src/ipred.h 2
-rw-r--r-- src/ipred_tmpl.c 12
-rw-r--r-- src/itx.h 2
-rw-r--r-- src/itx_tmpl.c 12
-rw-r--r-- src/loopfilter.h 2
-rw-r--r-- src/loopfilter_tmpl.c 12
-rw-r--r-- src/looprestoration.h 3
-rw-r--r-- src/looprestoration_tmpl.c 16
-rw-r--r-- src/mc.h 2
-rw-r--r-- src/mc_tmpl.c 12
-rw-r--r-- src/meson.build 26
-rw-r--r-- src/msac.c 2
-rw-r--r-- src/ppc/cdef.h 61
-rw-r--r-- src/ppc/cdef_tmpl.c (renamed from src/ppc/cdef_init_tmpl.c) 43
-rw-r--r-- src/ppc/looprestoration.h (renamed from src/x86/msac_init.c) 29
-rw-r--r-- src/ppc/looprestoration_tmpl.c (renamed from src/ppc/looprestoration_init_tmpl.c) 30
-rw-r--r-- src/refmvs.c 12
-rw-r--r-- src/x86/cdef.h (renamed from src/x86/cdef_init_tmpl.c) 2
-rw-r--r-- src/x86/filmgrain.h (renamed from src/x86/filmgrain_init_tmpl.c) 2
-rw-r--r-- src/x86/ipred.h (renamed from src/x86/ipred_init_tmpl.c) 2
-rw-r--r-- src/x86/itx.h (renamed from src/x86/itx_init_tmpl.c) 75
-rw-r--r-- src/x86/loopfilter.h (renamed from src/x86/loopfilter_init_tmpl.c) 2
-rw-r--r-- src/x86/looprestoration.h (renamed from src/x86/looprestoration_init_tmpl.c) 10
-rw-r--r-- src/x86/mc.h (renamed from src/x86/mc_init_tmpl.c) 2
-rw-r--r-- src/x86/msac.h 23
-rw-r--r-- src/x86/refmvs.h (renamed from src/x86/refmvs_init.c) 2
-rw-r--r-- tests/checkasm/checkasm.c 5
43 files changed, 352 insertions, 230 deletions
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index ce2128f..585d9f4 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -117,6 +117,7 @@ build-debian:
- amd64
script:
- meson build --buildtype release
+ -Dtrim_dsp=false
--werror
- ninja -C build
- cd build && meson test -v
@@ -259,6 +260,7 @@ build-win32:
--prefix "$(pwd)/build/dav1d_install"
--cross-file package/crossfiles/i686-w64-mingw32.meson
-Ddefault_library=both
+ -Dtrim_dsp=false
- ninja -C build
- ninja -C build install
- cd build && meson test -v
@@ -277,6 +279,7 @@ build-win32-unaligned-stack:
--werror
--cross-file package/crossfiles/i686-w64-mingw32.meson
-Dstack_alignment=4
+ -Dtrim_dsp=false
- ninja -C build
- cd build && meson test -v
@@ -290,6 +293,7 @@ build-win64:
--prefix "$(pwd)/build/dav1d_install"
--cross-file package/crossfiles/x86_64-w64-mingw32.meson
-Ddefault_library=both
+ -Dtrim_dsp=false
- ninja -C build
- ninja -C build install
- cd build && meson test -v
@@ -309,6 +313,7 @@ build-win-arm32:
--prefix "$(pwd)/build/dav1d_install"
--cross-file /opt/crossfiles/armv7-w64-mingw32.meson
-Ddefault_library=both
+ -Dtrim_dsp=false
- ninja -C build
- armv7-w64-mingw32-nm -A -g build/src/libdav1d.a | grep " [ABCDGRST] " | (! grep -E -v " \.| _*dav1d_")
@@ -321,6 +326,7 @@ build-win-arm64:
--prefix "$(pwd)/build/dav1d_install"
--cross-file /opt/crossfiles/aarch64-w64-mingw32.meson
-Ddefault_library=both
+ -Dtrim_dsp=false
- ninja -C build
- ninja -C build install
- aarch64-w64-mingw32-nm -A -g build/src/libdav1d.a | grep " [ABCDGRST] " | (! grep -E -v " \.| _*dav1d_")
@@ -339,6 +345,7 @@ build-win-arm64:
--prefix "$(pwd)/build/dav1d_install"
--cross-file $CROSSFILE
-Ddefault_library=both
+ -Dtrim_dsp=false
- ninja -C build
- ninja -C build install
@@ -406,6 +413,7 @@ build-macos:
script:
- meson build --buildtype release
-Ddefault_library=both
+ -Dtrim_dsp=false
--werror
- ninja -C build
- cd build && meson test -v
@@ -461,6 +469,7 @@ build-debian-ppc64le:
extends: .debian-ppc64le-common
script:
- meson build --buildtype release
+ -Dtrim_dsp=false
--werror
- ninja -C build
- cd build && meson test -v
@@ -514,6 +523,7 @@ test-debian:
-Dtestdata_tests=true
-Dlogging=false
-Db_coverage=true
+ -Dtrim_dsp=false
- ninja -C build
- cd build && time meson test -v
- ninja coverage-html
@@ -559,6 +569,7 @@ test-debian-avx512:
script:
- meson build --buildtype release
-Dtestdata_tests=true
+ -Dtrim_dsp=false
- ninja -C build
- cd build && time meson test --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask avx512icl"
- time meson test --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--threads 2 --framedelay 2 --cpumask avx512icl"
@@ -577,6 +588,7 @@ test-debian-unaligned-stack:
-Dtestdata_tests=true
-Dlogging=false
-Dstack_alignment=16
+ -Dtrim_dsp=false
- ninja -C build
- cd build && time meson test -v
@@ -674,6 +686,7 @@ test-win64:
- meson build --buildtype release
-Dtestdata_tests=true
-Dlogging=false
+ -Dtrim_dsp=false
--cross-file package/crossfiles/x86_64-w64-mingw32.meson
- ninja -C build
- cd build && time meson test -v
diff --git a/meson.build b/meson.build
index 372a55d..0501ef2 100644
--- a/meson.build
+++ b/meson.build
@@ -73,6 +73,9 @@ if is_asm_enabled and get_option('b_sanitize') == 'memory'
error('asm causes false positive with memory sanitizer. Use \'-Denable_asm=false\'.')
endif
+cdata.set10('TRIM_DSP_FUNCTIONS', get_option('trim_dsp') == 'true' or
+ (get_option('trim_dsp') == 'if-release' and get_option('buildtype') == 'release'))
+
# Logging option
cdata.set10('CONFIG_LOG', get_option('logging'))
diff --git a/meson_options.txt b/meson_options.txt
index 94f3704..91a0f6c 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -57,3 +57,9 @@ option('stack_alignment',
option('xxhash_muxer',
type : 'feature',
value : 'auto')
+
+option('trim_dsp',
+ type: 'combo',
+ choices: ['true', 'false', 'if-release'],
+ value: 'if-release',
+ description: 'Eliminate redundant DSP functions where possible')
diff --git a/src/arm/cdef_init_tmpl.c b/src/arm/cdef.h
index 33bd348..2e8c8ab 100644
--- a/src/arm/cdef_init_tmpl.c
+++ b/src/arm/cdef.h
@@ -76,8 +76,7 @@ DEFINE_FILTER(8, 8, 16)
DEFINE_FILTER(4, 8, 8)
DEFINE_FILTER(4, 4, 8)
-
-COLD void bitfn(dav1d_cdef_dsp_init_arm)(Dav1dCdefDSPContext *const c) {
+static ALWAYS_INLINE void cdef_dsp_init_arm(Dav1dCdefDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
diff --git a/src/arm/filmgrain_init_tmpl.c b/src/arm/filmgrain.h
index 2156047..118ce30 100644
--- a/src/arm/filmgrain_init_tmpl.c
+++ b/src/arm/filmgrain.h
@@ -72,35 +72,6 @@ void BF(dav1d_fgy_32x32, neon)(pixel *const dst,
const ptrdiff_t type
HIGHBD_DECL_SUFFIX);
-// Use ptrdiff_t instead of int for the last few parameters, to get the
-// parameters on the stack with the same layout across platforms.
-#define FGUV(suff) \
-void BF(dav1d_fguv_32x32_ ## suff, neon)(pixel *const dst, \
- const pixel *const src, \
- const ptrdiff_t stride, \
- const uint8_t scaling[SCALING_SIZE], \
- const Dav1dFilmGrainData *const data, \
- const entry grain_lut[][GRAIN_WIDTH], \
- const pixel *const luma_row, \
- const ptrdiff_t luma_stride, \
- const int offsets[][2], \
- const ptrdiff_t h, const ptrdiff_t uv, \
- const ptrdiff_t is_id, \
- const ptrdiff_t type \
- HIGHBD_DECL_SUFFIX)
-
-FGUV(420);
-FGUV(422);
-FGUV(444);
-
-static inline int get_random_number(const int bits, unsigned *const state) {
- const int r = *state;
- unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1;
- *state = (r >> 1) | (bit << 15);
-
- return (*state >> (16 - bits)) & ((1 << bits) - 1);
-}
-
static void fgy_32x32xn_neon(pixel *const dst_row, const pixel *const src_row,
const ptrdiff_t stride,
const Dav1dFilmGrainData *const data, const size_t pw,
@@ -147,7 +118,22 @@ static void fgy_32x32xn_neon(pixel *const dst_row, const pixel *const src_row,
}
}
-#define fguv_ss_fn(nm, sx, sy) \
+// Use ptrdiff_t instead of int for the last few parameters, to get the
+// parameters on the stack with the same layout across platforms.
+#define FGUV(nm, sx, sy) \
+void BF(dav1d_fguv_32x32_##nm, neon)(pixel *const dst, \
+ const pixel *const src, \
+ const ptrdiff_t stride, \
+ const uint8_t scaling[SCALING_SIZE], \
+ const Dav1dFilmGrainData *const data, \
+ const entry grain_lut[][GRAIN_WIDTH], \
+ const pixel *const luma_row, \
+ const ptrdiff_t luma_stride, \
+ const int offsets[][2], \
+ const ptrdiff_t h, const ptrdiff_t uv, \
+ const ptrdiff_t is_id, \
+ const ptrdiff_t type \
+ HIGHBD_DECL_SUFFIX); \
static void \
fguv_32x32xn_##nm##_neon(pixel *const dst_row, const pixel *const src_row, \
const ptrdiff_t stride, const Dav1dFilmGrainData *const data, \
@@ -197,11 +183,11 @@ fguv_32x32xn_##nm##_neon(pixel *const dst_row, const pixel *const src_row, \
} \
}
-fguv_ss_fn(420, 1, 1);
-fguv_ss_fn(422, 1, 0);
-fguv_ss_fn(444, 0, 0);
+FGUV(420, 1, 1);
+FGUV(422, 1, 0);
+FGUV(444, 0, 0);
-COLD void bitfn(dav1d_film_grain_dsp_init_arm)(Dav1dFilmGrainDSPContext *const c) {
+static ALWAYS_INLINE void film_grain_dsp_init_arm(Dav1dFilmGrainDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
diff --git a/src/arm/ipred_init_tmpl.c b/src/arm/ipred.h
index 463481f..aef4dae 100644
--- a/src/arm/ipred_init_tmpl.c
+++ b/src/arm/ipred.h
@@ -50,7 +50,7 @@ decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_444, neon));
decl_pal_pred_fn(BF(dav1d_pal_pred, neon));
-COLD void bitfn(dav1d_intra_pred_dsp_init_arm)(Dav1dIntraPredDSPContext *const c) {
+static ALWAYS_INLINE void intra_pred_dsp_init_arm(Dav1dIntraPredDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
diff --git a/src/arm/itx_init_tmpl.c b/src/arm/itx.h
index d089a6f..2ecd086 100644
--- a/src/arm/itx_init_tmpl.c
+++ b/src/arm/itx.h
@@ -77,7 +77,7 @@ decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, neon));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, neon));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, neon));
-COLD void bitfn(dav1d_itx_dsp_init_arm)(Dav1dInvTxfmDSPContext *const c, int bpc) {
+static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int bpc) {
#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
@@ -117,7 +117,7 @@ COLD void bitfn(dav1d_itx_dsp_init_arm)(Dav1dInvTxfmDSPContext *const c, int bpc
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
- if (bpc > 10) return;
+ if (BITDEPTH == 16 && bpc != 10) return;
assign_itx17_fn( , 4, 4, neon);
assign_itx16_fn(R, 4, 8, neon);
diff --git a/src/arm/loopfilter_init_tmpl.c b/src/arm/loopfilter.h
index 671545d..9ac08d9 100644
--- a/src/arm/loopfilter_init_tmpl.c
+++ b/src/arm/loopfilter.h
@@ -33,7 +33,7 @@ decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, neon));
decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, neon));
decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, neon));
-COLD void bitfn(dav1d_loop_filter_dsp_init_arm)(Dav1dLoopFilterDSPContext *const c) {
+static ALWAYS_INLINE void loop_filter_dsp_init_arm(Dav1dLoopFilterDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
diff --git a/src/arm/looprestoration_init_tmpl.c b/src/arm/looprestoration.h
index 5ba4bce..7993dbf 100644
--- a/src/arm/looprestoration_init_tmpl.c
+++ b/src/arm/looprestoration.h
@@ -246,7 +246,7 @@ static void sgr_filter_mix_neon(pixel *const dst, const ptrdiff_t stride,
tmp1, tmp2, w, h, wt HIGHBD_TAIL_SUFFIX);
}
-COLD void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *const c, int bpc) {
+static ALWAYS_INLINE void loop_restoration_dsp_init_arm(Dav1dLoopRestorationDSPContext *const c, int bpc) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
@@ -257,7 +257,7 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPCont
#else
c->wiener[0] = c->wiener[1] = wiener_filter_neon;
#endif
- if (bpc <= 10) {
+ if (BITDEPTH == 8 || bpc == 10) {
c->sgr[0] = sgr_filter_5x5_neon;
c->sgr[1] = sgr_filter_3x3_neon;
c->sgr[2] = sgr_filter_mix_neon;
diff --git a/src/arm/mc_init_tmpl.c b/src/arm/mc.h
index 3423020..06cd533 100644
--- a/src/arm/mc_init_tmpl.c
+++ b/src/arm/mc.h
@@ -68,7 +68,7 @@ decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, neon));
decl_emu_edge_fn(BF(dav1d_emu_edge, neon));
-void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
+static ALWAYS_INLINE void mc_dsp_init_arm(Dav1dMCDSPContext *const c) {
#define init_mc_fn(type, name, suffix) \
c->mc[type] = BF(dav1d_put_##name, suffix)
#define init_mct_fn(type, name, suffix) \
diff --git a/src/arm/refmvs_init.c b/src/arm/refmvs.h
index acde030..4c96fc5 100644
--- a/src/arm/refmvs_init.c
+++ b/src/arm/refmvs.h
@@ -30,7 +30,7 @@
decl_splat_mv_fn(dav1d_splat_mv_neon);
-COLD void dav1d_refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *const c) {
+static ALWAYS_INLINE void refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
diff --git a/src/cdef.h b/src/cdef.h
index 2a933d5..07c84d9 100644
--- a/src/cdef.h
+++ b/src/cdef.h
@@ -67,8 +67,5 @@ typedef struct Dav1dCdefDSPContext {
} Dav1dCdefDSPContext;
bitfn_decls(void dav1d_cdef_dsp_init, Dav1dCdefDSPContext *c);
-bitfn_decls(void dav1d_cdef_dsp_init_arm, Dav1dCdefDSPContext *c);
-bitfn_decls(void dav1d_cdef_dsp_init_ppc, Dav1dCdefDSPContext *c);
-bitfn_decls(void dav1d_cdef_dsp_init_x86, Dav1dCdefDSPContext *c);
#endif /* DAV1D_SRC_CDEF_H */
diff --git a/src/cdef_tmpl.c b/src/cdef_tmpl.c
index 1c95dbf..5943945 100644
--- a/src/cdef_tmpl.c
+++ b/src/cdef_tmpl.c
@@ -303,6 +303,16 @@ static int cdef_find_dir_c(const pixel *img, const ptrdiff_t stride,
return best_dir;
}
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/cdef.h"
+#elif ARCH_PPC64LE
+#include "src/ppc/cdef.h"
+#elif ARCH_X86
+#include "src/x86/cdef.h"
+#endif
+#endif
+
COLD void bitfn(dav1d_cdef_dsp_init)(Dav1dCdefDSPContext *const c) {
c->dir = cdef_find_dir_c;
c->fb[0] = cdef_filter_block_8x8_c;
@@ -311,11 +321,11 @@ COLD void bitfn(dav1d_cdef_dsp_init)(Dav1dCdefDSPContext *const c) {
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
- bitfn(dav1d_cdef_dsp_init_arm)(c);
+ cdef_dsp_init_arm(c);
#elif ARCH_PPC64LE
- bitfn(dav1d_cdef_dsp_init_ppc)(c);
+ cdef_dsp_init_ppc(c);
#elif ARCH_X86
- bitfn(dav1d_cdef_dsp_init_x86)(c);
+ cdef_dsp_init_x86(c);
#endif
#endif
}
diff --git a/src/cpu.h b/src/cpu.h
index 7616c0a..68fbaec 100644
--- a/src/cpu.h
+++ b/src/cpu.h
@@ -1,6 +1,6 @@
/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
+ * Copyright © 2018-2022, VideoLAN and dav1d authors
+ * Copyright © 2018-2022, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -51,7 +51,52 @@ DAV1D_API void dav1d_set_cpu_flags_mask(unsigned mask);
int dav1d_num_logical_processors(Dav1dContext *c);
static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) {
- return dav1d_cpu_flags & dav1d_cpu_flags_mask;
+ unsigned flags = dav1d_cpu_flags & dav1d_cpu_flags_mask;
+
+#if TRIM_DSP_FUNCTIONS
+/* Since this function is inlined, unconditionally setting a flag here will
+ * enable dead code elimination in the calling function. */
+#if ARCH_AARCH64 || ARCH_ARM
+#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
+ flags |= DAV1D_ARM_CPU_FLAG_NEON;
+#endif
+#elif ARCH_PPC64LE
+#if defined(__VSX__)
+ flags |= DAV1D_PPC_CPU_FLAG_VSX;
+#endif
+#elif ARCH_X86
+#if defined(__AVX512F__) && defined(__AVX512CD__) && \
+ defined(__AVX512BW__) && defined(__AVX512DQ__) && \
+ defined(__AVX512VL__) && defined(__AVX512VNNI__) && \
+ defined(__AVX512IFMA__) && defined(__AVX512VBMI__) && \
+ defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__) && \
+ defined(__AVX512BITALG__) && defined(__GFNI__) && \
+ defined(__VAES__) && defined(__VPCLMULQDQ__)
+ flags |= DAV1D_X86_CPU_FLAG_AVX512ICL |
+ DAV1D_X86_CPU_FLAG_AVX2 |
+ DAV1D_X86_CPU_FLAG_SSE41 |
+ DAV1D_X86_CPU_FLAG_SSSE3 |
+ DAV1D_X86_CPU_FLAG_SSE2;
+#elif defined(__AVX2__)
+ flags |= DAV1D_X86_CPU_FLAG_AVX2 |
+ DAV1D_X86_CPU_FLAG_SSE41 |
+ DAV1D_X86_CPU_FLAG_SSSE3 |
+ DAV1D_X86_CPU_FLAG_SSE2;
+#elif defined(__SSE4_1__) || defined(__AVX__)
+ flags |= DAV1D_X86_CPU_FLAG_SSE41 |
+ DAV1D_X86_CPU_FLAG_SSSE3 |
+ DAV1D_X86_CPU_FLAG_SSE2;
+#elif defined(__SSSE3__)
+ flags |= DAV1D_X86_CPU_FLAG_SSSE3 |
+ DAV1D_X86_CPU_FLAG_SSE2;
+#elif ARCH_X86_64 || defined(__SSE2__) || \
+ (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
+ flags |= DAV1D_X86_CPU_FLAG_SSE2;
+#endif
+#endif
+#endif
+
+ return flags;
}
#endif /* DAV1D_SRC_CPU_H */
diff --git a/src/filmgrain.h b/src/filmgrain.h
index d953542..0ffded6 100644
--- a/src/filmgrain.h
+++ b/src/filmgrain.h
@@ -80,7 +80,5 @@ typedef struct Dav1dFilmGrainDSPContext {
} Dav1dFilmGrainDSPContext;
bitfn_decls(void dav1d_film_grain_dsp_init, Dav1dFilmGrainDSPContext *c);
-bitfn_decls(void dav1d_film_grain_dsp_init_arm, Dav1dFilmGrainDSPContext *c);
-bitfn_decls(void dav1d_film_grain_dsp_init_x86, Dav1dFilmGrainDSPContext *c);
#endif /* DAV1D_SRC_FILM_GRAIN_H */
diff --git a/src/filmgrain_tmpl.c b/src/filmgrain_tmpl.c
index 883c5cb..b772614 100644
--- a/src/filmgrain_tmpl.c
+++ b/src/filmgrain_tmpl.c
@@ -412,6 +412,14 @@ fguv_ss_fn(420, 1, 1);
fguv_ss_fn(422, 1, 0);
fguv_ss_fn(444, 0, 0);
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/filmgrain.h"
+#elif ARCH_X86
+#include "src/x86/filmgrain.h"
+#endif
+#endif
+
COLD void bitfn(dav1d_film_grain_dsp_init)(Dav1dFilmGrainDSPContext *const c) {
c->generate_grain_y = generate_grain_y_c;
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = generate_grain_uv_420_c;
@@ -425,9 +433,9 @@ COLD void bitfn(dav1d_film_grain_dsp_init)(Dav1dFilmGrainDSPContext *const c) {
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
- bitfn(dav1d_film_grain_dsp_init_arm)(c);
+ film_grain_dsp_init_arm(c);
#elif ARCH_X86
- bitfn(dav1d_film_grain_dsp_init_x86)(c);
+ film_grain_dsp_init_x86(c);
#endif
#endif
}
diff --git a/src/ipred.h b/src/ipred.h
index 8664f3f..739ef1a 100644
--- a/src/ipred.h
+++ b/src/ipred.h
@@ -90,7 +90,5 @@ typedef struct Dav1dIntraPredDSPContext {
} Dav1dIntraPredDSPContext;
bitfn_decls(void dav1d_intra_pred_dsp_init, Dav1dIntraPredDSPContext *c);
-bitfn_decls(void dav1d_intra_pred_dsp_init_arm, Dav1dIntraPredDSPContext *c);
-bitfn_decls(void dav1d_intra_pred_dsp_init_x86, Dav1dIntraPredDSPContext *c);
#endif /* DAV1D_SRC_IPRED_H */
diff --git a/src/ipred_tmpl.c b/src/ipred_tmpl.c
index 50c7a3c..151d484 100644
--- a/src/ipred_tmpl.c
+++ b/src/ipred_tmpl.c
@@ -726,6 +726,14 @@ static void pal_pred_c(pixel *dst, const ptrdiff_t stride,
}
}
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/ipred.h"
+#elif ARCH_X86
+#include "src/x86/ipred.h"
+#endif
+#endif
+
COLD void bitfn(dav1d_intra_pred_dsp_init)(Dav1dIntraPredDSPContext *const c) {
c->intra_pred[DC_PRED ] = ipred_dc_c;
c->intra_pred[DC_128_PRED ] = ipred_dc_128_c;
@@ -755,9 +763,9 @@ COLD void bitfn(dav1d_intra_pred_dsp_init)(Dav1dIntraPredDSPContext *const c) {
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
- bitfn(dav1d_intra_pred_dsp_init_arm)(c);
+ intra_pred_dsp_init_arm(c);
#elif ARCH_X86
- bitfn(dav1d_intra_pred_dsp_init_x86)(c);
+ intra_pred_dsp_init_x86(c);
#endif
#endif
}
diff --git a/src/itx.h b/src/itx.h
index 08f5e21..d522079 100644
--- a/src/itx.h
+++ b/src/itx.h
@@ -44,7 +44,5 @@ typedef struct Dav1dInvTxfmDSPContext {
} Dav1dInvTxfmDSPContext;
bitfn_decls(void dav1d_itx_dsp_init, Dav1dInvTxfmDSPContext *c, int bpc);
-bitfn_decls(void dav1d_itx_dsp_init_arm, Dav1dInvTxfmDSPContext *c, int bpc);
-bitfn_decls(void dav1d_itx_dsp_init_x86, Dav1dInvTxfmDSPContext *c, int bpc);
#endif /* DAV1D_SRC_ITX_H */
diff --git a/src/itx_tmpl.c b/src/itx_tmpl.c
index 2f97a9c..d385989 100644
--- a/src/itx_tmpl.c
+++ b/src/itx_tmpl.c
@@ -180,6 +180,14 @@ static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
dst[x] = iclip_pixel(dst[x] + *c++);
}
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/itx.h"
+#elif ARCH_X86
+#include "src/x86/itx.h"
+#endif
+#endif
+
COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) {
#define assign_itx_all_fn64(w, h, pfx) \
c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT ] = \
@@ -247,10 +255,10 @@ COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) {
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
- bitfn(dav1d_itx_dsp_init_arm)(c, bpc);
+ itx_dsp_init_arm(c, bpc);
#endif
#if ARCH_X86
- bitfn(dav1d_itx_dsp_init_x86)(c, bpc);
+ itx_dsp_init_x86(c, bpc);
#endif
#endif
}
diff --git a/src/loopfilter.h b/src/loopfilter.h
index c159050..a0f78c9 100644
--- a/src/loopfilter.h
+++ b/src/loopfilter.h
@@ -53,7 +53,5 @@ typedef struct Dav1dLoopFilterDSPContext {
} Dav1dLoopFilterDSPContext;
bitfn_decls(void dav1d_loop_filter_dsp_init, Dav1dLoopFilterDSPContext *c);
-bitfn_decls(void dav1d_loop_filter_dsp_init_arm, Dav1dLoopFilterDSPContext *c);
-bitfn_decls(void dav1d_loop_filter_dsp_init_x86, Dav1dLoopFilterDSPContext *c);
#endif /* DAV1D_SRC_LOOPFILTER_H */
diff --git a/src/loopfilter_tmpl.c b/src/loopfilter_tmpl.c
index 6ea744f..cacf258 100644
--- a/src/loopfilter_tmpl.c
+++ b/src/loopfilter_tmpl.c
@@ -244,6 +244,14 @@ static void loop_filter_v_sb128uv_c(pixel *dst, const ptrdiff_t stride,
}
}
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/loopfilter.h"
+#elif ARCH_X86
+#include "src/x86/loopfilter.h"
+#endif
+#endif
+
COLD void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) {
c->loop_filter_sb[0][0] = loop_filter_h_sb128y_c;
c->loop_filter_sb[0][1] = loop_filter_v_sb128y_c;
@@ -252,9 +260,9 @@ COLD void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c)
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
- bitfn(dav1d_loop_filter_dsp_init_arm)(c);
+ loop_filter_dsp_init_arm(c);
#elif ARCH_X86
- bitfn(dav1d_loop_filter_dsp_init_x86)(c);
+ loop_filter_dsp_init_x86(c);
#endif
#endif
}
diff --git a/src/looprestoration.h b/src/looprestoration.h
index d0ab811..f55dd31 100644
--- a/src/looprestoration.h
+++ b/src/looprestoration.h
@@ -75,8 +75,5 @@ typedef struct Dav1dLoopRestorationDSPContext {
} Dav1dLoopRestorationDSPContext;
bitfn_decls(void dav1d_loop_restoration_dsp_init, Dav1dLoopRestorationDSPContext *c, int bpc);
-bitfn_decls(void dav1d_loop_restoration_dsp_init_arm, Dav1dLoopRestorationDSPContext *c, int bpc);
-bitfn_decls(void dav1d_loop_restoration_dsp_init_x86, Dav1dLoopRestorationDSPContext *c, int bpc);
-bitfn_decls(void dav1d_loop_restoration_dsp_init_ppc, Dav1dLoopRestorationDSPContext *c, int bpc);
#endif /* DAV1D_SRC_LOOPRESTORATION_H */
diff --git a/src/looprestoration_tmpl.c b/src/looprestoration_tmpl.c
index 254c25d..d4d7867 100644
--- a/src/looprestoration_tmpl.c
+++ b/src/looprestoration_tmpl.c
@@ -524,6 +524,16 @@ static void sgr_mix_c(pixel *p, const ptrdiff_t stride,
}
}
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/looprestoration.h"
+#elif ARCH_PPC64LE
+#include "src/ppc/looprestoration.h"
+#elif ARCH_X86
+#include "src/x86/looprestoration.h"
+#endif
+#endif
+
COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c,
const int bpc)
{
@@ -534,11 +544,11 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
- bitfn(dav1d_loop_restoration_dsp_init_arm)(c, bpc);
+ loop_restoration_dsp_init_arm(c, bpc);
#elif ARCH_PPC64LE
- bitfn(dav1d_loop_restoration_dsp_init_ppc)(c, bpc);
+ loop_restoration_dsp_init_ppc(c, bpc);
#elif ARCH_X86
- bitfn(dav1d_loop_restoration_dsp_init_x86)(c, bpc);
+ loop_restoration_dsp_init_x86(c, bpc);
#endif
#endif
}
diff --git a/src/mc.h b/src/mc.h
index 784b58d..59ba2d9 100644
--- a/src/mc.h
+++ b/src/mc.h
@@ -132,7 +132,5 @@ typedef struct Dav1dMCDSPContext {
} Dav1dMCDSPContext;
bitfn_decls(void dav1d_mc_dsp_init, Dav1dMCDSPContext *c);
-bitfn_decls(void dav1d_mc_dsp_init_arm, Dav1dMCDSPContext *c);
-bitfn_decls(void dav1d_mc_dsp_init_x86, Dav1dMCDSPContext *c);
#endif /* DAV1D_SRC_MC_H */
diff --git a/src/mc_tmpl.c b/src/mc_tmpl.c
index f8d3e3b..20226d8 100644
--- a/src/mc_tmpl.c
+++ b/src/mc_tmpl.c
@@ -902,6 +902,14 @@ static void resize_c(pixel *dst, const ptrdiff_t dst_stride,
} while (--h);
}
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/mc.h"
+#elif ARCH_X86
+#include "src/x86/mc.h"
+#endif
+#endif
+
COLD void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) {
#define init_mc_fns(type, name) do { \
c->mc [type] = put_##name##_c; \
@@ -937,9 +945,9 @@ COLD void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) {
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
- bitfn(dav1d_mc_dsp_init_arm)(c);
+ mc_dsp_init_arm(c);
#elif ARCH_X86
- bitfn(dav1d_mc_dsp_init_x86)(c);
+ mc_dsp_init_x86(c);
#endif
#endif
}
diff --git a/src/meson.build b/src/meson.build
index 5c41e37..d19b8d9 100644
--- a/src/meson.build
+++ b/src/meson.build
@@ -92,16 +92,6 @@ if is_asm_enabled
libdav1d_sources += files(
'arm/cpu.c',
- 'arm/refmvs_init.c',
- )
- libdav1d_tmpl_sources += files(
- 'arm/cdef_init_tmpl.c',
- 'arm/filmgrain_init_tmpl.c',
- 'arm/ipred_init_tmpl.c',
- 'arm/itx_init_tmpl.c',
- 'arm/loopfilter_init_tmpl.c',
- 'arm/looprestoration_init_tmpl.c',
- 'arm/mc_init_tmpl.c',
)
if (host_machine.cpu_family() == 'aarch64' or
host_machine.cpu() == 'arm64')
@@ -177,18 +167,6 @@ if is_asm_enabled
libdav1d_sources += files(
'x86/cpu.c',
- 'x86/msac_init.c',
- 'x86/refmvs_init.c',
- )
-
- libdav1d_tmpl_sources += files(
- 'x86/cdef_init_tmpl.c',
- 'x86/filmgrain_init_tmpl.c',
- 'x86/ipred_init_tmpl.c',
- 'x86/itx_init_tmpl.c',
- 'x86/loopfilter_init_tmpl.c',
- 'x86/looprestoration_init_tmpl.c',
- 'x86/mc_init_tmpl.c',
)
# NASM source files
@@ -257,8 +235,8 @@ if is_asm_enabled
'ppc/cpu.c',
)
libdav1d_arch_tmpl_sources += files(
- 'ppc/cdef_init_tmpl.c',
- 'ppc/looprestoration_init_tmpl.c',
+ 'ppc/cdef_tmpl.c',
+ 'ppc/looprestoration_tmpl.c',
)
endif
endif
diff --git a/src/msac.c b/src/msac.c
index d5f3207..43d8ae5 100644
--- a/src/msac.c
+++ b/src/msac.c
@@ -203,6 +203,6 @@ void dav1d_msac_init(MsacContext *const s, const uint8_t *const data,
#if ARCH_X86_64 && HAVE_ASM
s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt_c;
- dav1d_msac_init_x86(s);
+ msac_init_x86(s);
#endif
}
diff --git a/src/ppc/cdef.h b/src/ppc/cdef.h
new file mode 100644
index 0000000..b794ba5
--- /dev/null
+++ b/src/ppc/cdef.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright © 2019, Luca Barbato
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdlib.h>
+
+#include "common/bitdepth.h"
+#include "common/intops.h"
+
+#include "src/cdef.h"
+#include "src/cpu.h"
+
+#define cdef_vsx_fn(w, h) \
+void dav1d_cdef_filter_##w##x##h##_vsx(pixel *const dst, \
+ const ptrdiff_t dst_stride, \
+ const pixel (*left)[2], \
+ const pixel *const top, \
+ const pixel *const bottom, \
+ const int pri_strength, \
+ const int sec_strength, \
+ const int dir, \
+ const int damping, \
+ const enum CdefEdgeFlags edges)
+
+cdef_vsx_fn(4, 4);
+cdef_vsx_fn(4, 8);
+cdef_vsx_fn(8, 8);
+
+static ALWAYS_INLINE void cdef_dsp_init_ppc(Dav1dCdefDSPContext *const c) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (!(flags & DAV1D_PPC_CPU_FLAG_VSX)) return;
+
+#if BITDEPTH == 8
+ c->fb[0] = dav1d_cdef_filter_8x8_vsx;
+ c->fb[1] = dav1d_cdef_filter_4x8_vsx;
+ c->fb[2] = dav1d_cdef_filter_4x4_vsx;
+#endif
+}
diff --git a/src/ppc/cdef_init_tmpl.c b/src/ppc/cdef_tmpl.c
index 12e4a66..020e17b 100644
--- a/src/ppc/cdef_init_tmpl.c
+++ b/src/ppc/cdef_tmpl.c
@@ -24,15 +24,8 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#include <stdlib.h>
-
-#include "common/bitdepth.h"
-#include "common/intops.h"
-
-#include "src/cdef.h"
-#include "src/cpu.h"
-
#include "src/ppc/dav1d_types.h"
+#include "src/ppc/cdef.h"
#if BITDEPTH == 8
static inline i16x8 vconstrain(const i16x8 diff, const int16_t threshold,
@@ -451,18 +444,17 @@ filter_8xN(pixel *dst, const ptrdiff_t dst_stride,
}
-
#define cdef_fn(w, h, tmp_stride) \
-static void cdef_filter_##w##x##h##_vsx(pixel *const dst, \
- const ptrdiff_t dst_stride, \
- const pixel (*left)[2], \
- const pixel *const top, \
- const pixel *const bottom, \
- const int pri_strength, \
- const int sec_strength, \
- const int dir, \
- const int damping, \
- const enum CdefEdgeFlags edges) \
+void dav1d_cdef_filter_##w##x##h##_vsx(pixel *const dst, \
+ const ptrdiff_t dst_stride, \
+ const pixel (*left)[2], \
+ const pixel *const top, \
+ const pixel *const bottom, \
+ const int pri_strength, \
+ const int sec_strength, \
+ const int dir, \
+ const int damping, \
+ const enum CdefEdgeFlags edges) \
{ \
ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride,); \
uint16_t *tmp = tmp_buf + 2 * tmp_stride + 2; \
@@ -474,16 +466,3 @@ cdef_fn(4, 4, 8);
cdef_fn(4, 8, 8);
cdef_fn(8, 8, 16);
#endif
-
-COLD void bitfn(dav1d_cdef_dsp_init_ppc)(Dav1dCdefDSPContext *const c) {
- const unsigned flags = dav1d_get_cpu_flags();
-
- if (!(flags & DAV1D_PPC_CPU_FLAG_VSX)) return;
-
-#if BITDEPTH == 8
- // c->dir = dav1d_cdef_find_dir_vsx;
- c->fb[0] = cdef_filter_8x8_vsx;
- c->fb[1] = cdef_filter_4x8_vsx;
- c->fb[2] = cdef_filter_4x4_vsx;
-#endif
-}
diff --git a/src/x86/msac_init.c b/src/ppc/looprestoration.h
index a634da2..3fe1631 100644
--- a/src/x86/msac_init.c
+++ b/src/ppc/looprestoration.h
@@ -1,5 +1,6 @@
/*
- * Copyright © 2020, VideoLAN and dav1d authors
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Michail Alvanos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -24,20 +25,24 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+#include "common/intops.h"
+
#include "src/cpu.h"
-#include "src/msac.h"
-#include "src/x86/msac.h"
+#include "src/looprestoration.h"
+
+void dav1d_wiener_filter_vsx(uint8_t *p, const ptrdiff_t stride,
+ const uint8_t (*const left)[4],
+ const uint8_t *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);
-#if ARCH_X86_64
-void dav1d_msac_init_x86(MsacContext *const s) {
+static ALWAYS_INLINE void loop_restoration_dsp_init_ppc(Dav1dLoopRestorationDSPContext *const c, const int bpc) {
const unsigned flags = dav1d_get_cpu_flags();
- if (flags & DAV1D_X86_CPU_FLAG_SSE2) {
- s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_sse2;
- }
+ if (!(flags & DAV1D_PPC_CPU_FLAG_VSX)) return;
- if (flags & DAV1D_X86_CPU_FLAG_AVX2) {
- s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_avx2;
- }
-}
+#if BITDEPTH == 8
+ c->wiener[0] = c->wiener[1] = dav1d_wiener_filter_vsx;
#endif
+}
diff --git a/src/ppc/looprestoration_init_tmpl.c b/src/ppc/looprestoration_tmpl.c
index e9bc622..f64a963 100644
--- a/src/ppc/looprestoration_init_tmpl.c
+++ b/src/ppc/looprestoration_tmpl.c
@@ -25,10 +25,8 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#include "common/intops.h"
#include "src/ppc/dav1d_types.h"
-#include "src/cpu.h"
-#include "src/looprestoration.h"
+#include "src/ppc/looprestoration.h"
#if BITDEPTH == 8
@@ -302,12 +300,12 @@ static inline void padding(uint8_t *dst, const uint8_t *p,
// (since first and last tops are always 0 for chroma)
// FIXME Could implement a version that requires less temporary memory
// (should be possible to implement with only 6 rows of temp storage)
-static void wiener_filter_vsx(uint8_t *p, const ptrdiff_t stride,
- const uint8_t (*const left)[4],
- const uint8_t *lpf,
- const int w, const int h,
- const LooprestorationParams *const params,
- const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
+void dav1d_wiener_filter_vsx(uint8_t *p, const ptrdiff_t stride,
+ const uint8_t (*const left)[4],
+ const uint8_t *lpf,
+ const int w, const int h,
+ const LooprestorationParams *const params,
+ const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
const int16_t (*const filter)[8] = params->filter;
@@ -321,17 +319,3 @@ static void wiener_filter_vsx(uint8_t *p, const ptrdiff_t stride,
wiener_filter_v_vsx(p, stride, hor, filter[1], w, h);
}
#endif
-
-COLD void bitfn(dav1d_loop_restoration_dsp_init_ppc)(Dav1dLoopRestorationDSPContext *const c,
- const int bpc)
-{
- const unsigned flags = dav1d_get_cpu_flags();
-
- if (!(flags & DAV1D_PPC_CPU_FLAG_VSX)) return;
-
-#if BITDEPTH == 8
- c->wiener[0] = c->wiener[1] = wiener_filter_vsx;
-#endif
-}
-
-
diff --git a/src/refmvs.c b/src/refmvs.c
index d49ebae..c7ed9db 100644
--- a/src/refmvs.c
+++ b/src/refmvs.c
@@ -922,15 +922,23 @@ static void splat_mv_c(refmvs_block **rr, const refmvs_block *const rmv,
} while (--bh4);
}
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+#include "src/arm/refmvs.h"
+#elif ARCH_X86
+#include "src/x86/refmvs.h"
+#endif
+#endif
+
COLD void dav1d_refmvs_dsp_init(Dav1dRefmvsDSPContext *const c)
{
c->splat_mv = splat_mv_c;
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
- dav1d_refmvs_dsp_init_arm(c);
+ refmvs_dsp_init_arm(c);
#elif ARCH_X86
- dav1d_refmvs_dsp_init_x86(c);
+ refmvs_dsp_init_x86(c);
#endif
#endif
}
diff --git a/src/x86/cdef_init_tmpl.c b/src/x86/cdef.h
index 441dfe8..553d650 100644
--- a/src/x86/cdef_init_tmpl.c
+++ b/src/x86/cdef.h
@@ -43,7 +43,7 @@ decl_cdef_dir_fn(BF(dav1d_cdef_dir, avx2));
decl_cdef_dir_fn(BF(dav1d_cdef_dir, sse4));
decl_cdef_dir_fn(BF(dav1d_cdef_dir, ssse3));
-COLD void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
+static ALWAYS_INLINE void cdef_dsp_init_x86(Dav1dCdefDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
#if BITDEPTH == 8
diff --git a/src/x86/filmgrain_init_tmpl.c b/src/x86/filmgrain.h
index 1c91d2a..eeaa328 100644
--- a/src/x86/filmgrain_init_tmpl.c
+++ b/src/x86/filmgrain.h
@@ -42,7 +42,7 @@ decl_fg_fns(ssse3);
decl_fg_fns(avx2);
decl_fg_fns(avx512icl);
-COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c) {
+static ALWAYS_INLINE void film_grain_dsp_init_x86(Dav1dFilmGrainDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
diff --git a/src/x86/ipred_init_tmpl.c b/src/x86/ipred.h
index 0ba0a41..7df563f 100644
--- a/src/x86/ipred_init_tmpl.c
+++ b/src/x86/ipred.h
@@ -68,7 +68,7 @@ decl_fn(cfl_ac, ipred_cfl_ac_444);
decl_fn(pal_pred, pal_pred);
-COLD void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) {
+static ALWAYS_INLINE void intra_pred_dsp_init_x86(Dav1dIntraPredDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
diff --git a/src/x86/itx_init_tmpl.c b/src/x86/itx.h
index d643592..95c1e87 100644
--- a/src/x86/itx_init_tmpl.c
+++ b/src/x86/itx.h
@@ -134,9 +134,7 @@ decl_itx_fns(ssse3);
decl_itx_fn(dav1d_inv_txfm_add_wht_wht_4x4_16bpc_avx2);
decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_4x4, sse2));
-COLD void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c,
- const int bpc)
-{
+static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, const int bpc) {
#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
@@ -237,7 +235,7 @@ COLD void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c,
if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;
#if BITDEPTH == 16
- if (bpc <= 10) {
+ if (bpc == 10) {
assign_itx16_fn(, 4, 4, sse4);
assign_itx16_fn(R, 4, 8, sse4);
assign_itx16_fn(R, 4, 16, sse4);
@@ -264,21 +262,6 @@ COLD void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c,
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, avx2);
-#if BITDEPTH == 16
- assign_itx16_bpc_fn( , 4, 4, 12, avx2);
- assign_itx16_bpc_fn(R, 4, 8, 12, avx2);
- assign_itx16_bpc_fn(R, 4, 16, 12, avx2);
- assign_itx16_bpc_fn(R, 8, 4, 12, avx2);
- assign_itx16_bpc_fn( , 8, 8, 12, avx2);
- assign_itx16_bpc_fn(R, 8, 16, 12, avx2);
- assign_itx2_bpc_fn (R, 8, 32, 12, avx2);
- assign_itx16_bpc_fn(R, 16, 4, 12, avx2);
- assign_itx16_bpc_fn(R, 16, 8, 12, avx2);
- assign_itx12_bpc_fn( , 16, 16, 12, avx2);
- assign_itx2_bpc_fn (R, 32, 8, 12, avx2);
-#endif
-
- if (bpc > 10) return;
#if BITDEPTH == 8
assign_itx16_fn( , 4, 4, avx2);
@@ -300,26 +283,40 @@ COLD void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c,
assign_itx1_fn (R, 64, 16, avx2);
assign_itx1_fn (R, 64, 32, avx2);
assign_itx1_fn ( , 64, 64, avx2);
-#elif BITDEPTH == 16
- assign_itx16_bpc_fn( , 4, 4, 10, avx2);
- assign_itx16_bpc_fn(R, 4, 8, 10, avx2);
- assign_itx16_bpc_fn(R, 4, 16, 10, avx2);
- assign_itx16_bpc_fn(R, 8, 4, 10, avx2);
- assign_itx16_bpc_fn( , 8, 8, 10, avx2);
- assign_itx16_bpc_fn(R, 8, 16, 10, avx2);
- assign_itx2_bpc_fn (R, 8, 32, 10, avx2);
- assign_itx16_bpc_fn(R, 16, 4, 10, avx2);
- assign_itx16_bpc_fn(R, 16, 8, 10, avx2);
- assign_itx12_bpc_fn( , 16, 16, 10, avx2);
- assign_itx2_bpc_fn (R, 16, 32, 10, avx2);
- assign_itx1_bpc_fn (R, 16, 64, 10, avx2);
- assign_itx2_bpc_fn (R, 32, 8, 10, avx2);
- assign_itx2_bpc_fn (R, 32, 16, 10, avx2);
- assign_itx2_bpc_fn ( , 32, 32, 10, avx2);
- assign_itx1_bpc_fn (R, 32, 64, 10, avx2);
- assign_itx1_bpc_fn (R, 64, 16, 10, avx2);
- assign_itx1_bpc_fn (R, 64, 32, 10, avx2);
- assign_itx1_bpc_fn ( , 64, 64, 10, avx2);
+#else
+ if (bpc == 10) {
+ assign_itx16_bpc_fn( , 4, 4, 10, avx2);
+ assign_itx16_bpc_fn(R, 4, 8, 10, avx2);
+ assign_itx16_bpc_fn(R, 4, 16, 10, avx2);
+ assign_itx16_bpc_fn(R, 8, 4, 10, avx2);
+ assign_itx16_bpc_fn( , 8, 8, 10, avx2);
+ assign_itx16_bpc_fn(R, 8, 16, 10, avx2);
+ assign_itx2_bpc_fn (R, 8, 32, 10, avx2);
+ assign_itx16_bpc_fn(R, 16, 4, 10, avx2);
+ assign_itx16_bpc_fn(R, 16, 8, 10, avx2);
+ assign_itx12_bpc_fn( , 16, 16, 10, avx2);
+ assign_itx2_bpc_fn (R, 16, 32, 10, avx2);
+ assign_itx1_bpc_fn (R, 16, 64, 10, avx2);
+ assign_itx2_bpc_fn (R, 32, 8, 10, avx2);
+ assign_itx2_bpc_fn (R, 32, 16, 10, avx2);
+ assign_itx2_bpc_fn ( , 32, 32, 10, avx2);
+ assign_itx1_bpc_fn (R, 32, 64, 10, avx2);
+ assign_itx1_bpc_fn (R, 64, 16, 10, avx2);
+ assign_itx1_bpc_fn (R, 64, 32, 10, avx2);
+ assign_itx1_bpc_fn ( , 64, 64, 10, avx2);
+ } else {
+ assign_itx16_bpc_fn( , 4, 4, 12, avx2);
+ assign_itx16_bpc_fn(R, 4, 8, 12, avx2);
+ assign_itx16_bpc_fn(R, 4, 16, 12, avx2);
+ assign_itx16_bpc_fn(R, 8, 4, 12, avx2);
+ assign_itx16_bpc_fn( , 8, 8, 12, avx2);
+ assign_itx16_bpc_fn(R, 8, 16, 12, avx2);
+ assign_itx2_bpc_fn (R, 8, 32, 12, avx2);
+ assign_itx16_bpc_fn(R, 16, 4, 12, avx2);
+ assign_itx16_bpc_fn(R, 16, 8, 12, avx2);
+ assign_itx12_bpc_fn( , 16, 16, 12, avx2);
+ assign_itx2_bpc_fn (R, 32, 8, 12, avx2);
+ }
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
diff --git a/src/x86/loopfilter_init_tmpl.c b/src/x86/loopfilter.h
index 1c085d9..33c842a 100644
--- a/src/x86/loopfilter_init_tmpl.c
+++ b/src/x86/loopfilter.h
@@ -38,7 +38,7 @@ decl_loopfilter_sb_fns(ssse3);
decl_loopfilter_sb_fns(avx2);
decl_loopfilter_sb_fns(avx512icl);
-COLD void bitfn(dav1d_loop_filter_dsp_init_x86)(Dav1dLoopFilterDSPContext *const c) {
+static ALWAYS_INLINE void loop_filter_dsp_init_x86(Dav1dLoopFilterDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
diff --git a/src/x86/looprestoration_init_tmpl.c b/src/x86/looprestoration.h
index dfd36e6..de23be8 100644
--- a/src/x86/looprestoration_init_tmpl.c
+++ b/src/x86/looprestoration.h
@@ -47,9 +47,7 @@ decl_sgr_filter_fns(ssse3);
decl_sgr_filter_fns(avx2);
decl_sgr_filter_fns(avx512icl);
-COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPContext *const c,
- const int bpc)
-{
+static ALWAYS_INLINE void loop_restoration_dsp_init_x86(Dav1dLoopRestorationDSPContext *const c, const int bpc) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
@@ -61,7 +59,7 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPCont
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
c->wiener[0] = BF(dav1d_wiener_filter7, ssse3);
c->wiener[1] = BF(dav1d_wiener_filter5, ssse3);
- if (bpc <= 10) {
+ if (BITDEPTH == 8 || bpc == 10) {
c->sgr[0] = BF(dav1d_sgr_filter_5x5, ssse3);
c->sgr[1] = BF(dav1d_sgr_filter_3x3, ssse3);
c->sgr[2] = BF(dav1d_sgr_filter_mix, ssse3);
@@ -72,7 +70,7 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPCont
c->wiener[0] = BF(dav1d_wiener_filter7, avx2);
c->wiener[1] = BF(dav1d_wiener_filter5, avx2);
- if (bpc <= 10) {
+ if (BITDEPTH == 8 || bpc == 10) {
c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx2);
c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx2);
c->sgr[2] = BF(dav1d_sgr_filter_mix, avx2);
@@ -87,7 +85,7 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPCont
#else
c->wiener[1] = BF(dav1d_wiener_filter5, avx512icl);
#endif
- if (bpc <= 10) {
+ if (BITDEPTH == 8 || bpc == 10) {
c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx512icl);
c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx512icl);
c->sgr[2] = BF(dav1d_sgr_filter_mix, avx512icl);
diff --git a/src/x86/mc_init_tmpl.c b/src/x86/mc.h
index 57680ea..65c607e 100644
--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc.h
@@ -105,7 +105,7 @@ decl_fn(emu_edge, dav1d_emu_edge);
decl_fn(resize, dav1d_resize);
-COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
+static ALWAYS_INLINE void mc_dsp_init_x86(Dav1dMCDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if(!(flags & DAV1D_X86_CPU_FLAG_SSE2))
diff --git a/src/x86/msac.h b/src/x86/msac.h
index e11cd08..0bb632f 100644
--- a/src/x86/msac.h
+++ b/src/x86/msac.h
@@ -28,21 +28,21 @@
#ifndef DAV1D_SRC_X86_MSAC_H
#define DAV1D_SRC_X86_MSAC_H
+#include "src/cpu.h"
+
unsigned dav1d_msac_decode_symbol_adapt4_sse2(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
unsigned dav1d_msac_decode_symbol_adapt8_sse2(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
unsigned dav1d_msac_decode_symbol_adapt16_sse2(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
+unsigned dav1d_msac_decode_symbol_adapt16_avx2(MsacContext *s, uint16_t *cdf,
+ size_t n_symbols);
unsigned dav1d_msac_decode_bool_adapt_sse2(MsacContext *s, uint16_t *cdf);
unsigned dav1d_msac_decode_bool_equi_sse2(MsacContext *s);
unsigned dav1d_msac_decode_bool_sse2(MsacContext *s, unsigned f);
unsigned dav1d_msac_decode_hi_tok_sse2(MsacContext *s, uint16_t *cdf);
-/* Needed for checkasm */
-unsigned dav1d_msac_decode_symbol_adapt16_avx2(MsacContext *s, uint16_t *cdf,
- size_t n_symbols);
-
#if ARCH_X86_64 || defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_sse2
#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_sse2
@@ -55,10 +55,21 @@ unsigned dav1d_msac_decode_symbol_adapt16_avx2(MsacContext *s, uint16_t *cdf,
#if ARCH_X86_64
#define dav1d_msac_decode_symbol_adapt16(ctx, cdf, symb) ((ctx)->symbol_adapt16(ctx, cdf, symb))
+
+static ALWAYS_INLINE void msac_init_x86(MsacContext *const s) {
+ const unsigned flags = dav1d_get_cpu_flags();
+
+ if (flags & DAV1D_X86_CPU_FLAG_SSE2) {
+ s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_sse2;
+ }
+
+ if (flags & DAV1D_X86_CPU_FLAG_AVX2) {
+ s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_avx2;
+ }
+}
+
#elif defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_sse2
#endif
-void dav1d_msac_init_x86(MsacContext *const s);
-
#endif /* DAV1D_SRC_X86_MSAC_H */
diff --git a/src/x86/refmvs_init.c b/src/x86/refmvs.h
index e3575ba..de4124c 100644
--- a/src/x86/refmvs_init.c
+++ b/src/x86/refmvs.h
@@ -32,7 +32,7 @@ decl_splat_mv_fn(dav1d_splat_mv_sse2);
decl_splat_mv_fn(dav1d_splat_mv_avx2);
decl_splat_mv_fn(dav1d_splat_mv_avx512icl);
-COLD void dav1d_refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *const c) {
+static ALWAYS_INLINE void refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 0d6d8d7..20245c7 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -579,6 +579,11 @@ int main(int argc, char *argv[]) {
argv++;
}
+#if TRIM_DSP_FUNCTIONS
+ fprintf(stderr, "checkasm: reference functions unavailable\n");
+ return 0;
+#endif
+
dav1d_init_cpu();
#ifdef _WIN32