diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/arm/cdef.h (renamed from src/arm/cdef_init_tmpl.c) | 3 | ||||
-rw-r--r-- | src/arm/filmgrain.h (renamed from src/arm/filmgrain_init_tmpl.c) | 54 | ||||
-rw-r--r-- | src/arm/ipred.h (renamed from src/arm/ipred_init_tmpl.c) | 2 | ||||
-rw-r--r-- | src/arm/itx.h (renamed from src/arm/itx_init_tmpl.c) | 4 | ||||
-rw-r--r-- | src/arm/loopfilter.h (renamed from src/arm/loopfilter_init_tmpl.c) | 2 | ||||
-rw-r--r-- | src/arm/looprestoration.h (renamed from src/arm/looprestoration_init_tmpl.c) | 4 | ||||
-rw-r--r-- | src/arm/mc.h (renamed from src/arm/mc_init_tmpl.c) | 2 | ||||
-rw-r--r-- | src/arm/refmvs.h (renamed from src/arm/refmvs_init.c) | 2 | ||||
-rw-r--r-- | src/cdef.h | 3 | ||||
-rw-r--r-- | src/cdef_tmpl.c | 16 | ||||
-rw-r--r-- | src/cpu.h | 51 | ||||
-rw-r--r-- | src/filmgrain.h | 2 | ||||
-rw-r--r-- | src/filmgrain_tmpl.c | 12 | ||||
-rw-r--r-- | src/ipred.h | 2 | ||||
-rw-r--r-- | src/ipred_tmpl.c | 12 | ||||
-rw-r--r-- | src/itx.h | 2 | ||||
-rw-r--r-- | src/itx_tmpl.c | 12 | ||||
-rw-r--r-- | src/loopfilter.h | 2 | ||||
-rw-r--r-- | src/loopfilter_tmpl.c | 12 | ||||
-rw-r--r-- | src/looprestoration.h | 3 | ||||
-rw-r--r-- | src/looprestoration_tmpl.c | 16 | ||||
-rw-r--r-- | src/mc.h | 2 | ||||
-rw-r--r-- | src/mc_tmpl.c | 12 | ||||
-rw-r--r-- | src/meson.build | 26 | ||||
-rw-r--r-- | src/msac.c | 2 | ||||
-rw-r--r-- | src/ppc/cdef.h | 61 | ||||
-rw-r--r-- | src/ppc/cdef_tmpl.c (renamed from src/ppc/cdef_init_tmpl.c) | 43 | ||||
-rw-r--r-- | src/ppc/looprestoration.h (renamed from src/x86/msac_init.c) | 29 | ||||
-rw-r--r-- | src/ppc/looprestoration_tmpl.c (renamed from src/ppc/looprestoration_init_tmpl.c) | 30 | ||||
-rw-r--r-- | src/refmvs.c | 12 | ||||
-rw-r--r-- | src/x86/cdef.h (renamed from src/x86/cdef_init_tmpl.c) | 2 | ||||
-rw-r--r-- | src/x86/filmgrain.h (renamed from src/x86/filmgrain_init_tmpl.c) | 2 | ||||
-rw-r--r-- | src/x86/ipred.h (renamed from src/x86/ipred_init_tmpl.c) | 2 | ||||
-rw-r--r-- | src/x86/itx.h (renamed from src/x86/itx_init_tmpl.c) | 75 | ||||
-rw-r--r-- | src/x86/loopfilter.h (renamed from src/x86/loopfilter_init_tmpl.c) | 2 | ||||
-rw-r--r-- | src/x86/looprestoration.h (renamed from src/x86/looprestoration_init_tmpl.c) | 10 | ||||
-rw-r--r-- | src/x86/mc.h (renamed from src/x86/mc_init_tmpl.c) | 2 | ||||
-rw-r--r-- | src/x86/msac.h | 23 | ||||
-rw-r--r-- | src/x86/refmvs.h (renamed from src/x86/refmvs_init.c) | 2 |
39 files changed, 325 insertions, 230 deletions
diff --git a/src/arm/cdef_init_tmpl.c b/src/arm/cdef.h index 33bd348..2e8c8ab 100644 --- a/src/arm/cdef_init_tmpl.c +++ b/src/arm/cdef.h @@ -76,8 +76,7 @@ DEFINE_FILTER(8, 8, 16) DEFINE_FILTER(4, 8, 8) DEFINE_FILTER(4, 4, 8) - -COLD void bitfn(dav1d_cdef_dsp_init_arm)(Dav1dCdefDSPContext *const c) { +static ALWAYS_INLINE void cdef_dsp_init_arm(Dav1dCdefDSPContext *const c) { const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; diff --git a/src/arm/filmgrain_init_tmpl.c b/src/arm/filmgrain.h index 2156047..118ce30 100644 --- a/src/arm/filmgrain_init_tmpl.c +++ b/src/arm/filmgrain.h @@ -72,35 +72,6 @@ void BF(dav1d_fgy_32x32, neon)(pixel *const dst, const ptrdiff_t type HIGHBD_DECL_SUFFIX); -// Use ptrdiff_t instead of int for the last few parameters, to get the -// parameters on the stack with the same layout across platforms. -#define FGUV(suff) \ -void BF(dav1d_fguv_32x32_ ## suff, neon)(pixel *const dst, \ - const pixel *const src, \ - const ptrdiff_t stride, \ - const uint8_t scaling[SCALING_SIZE], \ - const Dav1dFilmGrainData *const data, \ - const entry grain_lut[][GRAIN_WIDTH], \ - const pixel *const luma_row, \ - const ptrdiff_t luma_stride, \ - const int offsets[][2], \ - const ptrdiff_t h, const ptrdiff_t uv, \ - const ptrdiff_t is_id, \ - const ptrdiff_t type \ - HIGHBD_DECL_SUFFIX) - -FGUV(420); -FGUV(422); -FGUV(444); - -static inline int get_random_number(const int bits, unsigned *const state) { - const int r = *state; - unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1; - *state = (r >> 1) | (bit << 15); - - return (*state >> (16 - bits)) & ((1 << bits) - 1); -} - static void fgy_32x32xn_neon(pixel *const dst_row, const pixel *const src_row, const ptrdiff_t stride, const Dav1dFilmGrainData *const data, const size_t pw, @@ -147,7 +118,22 @@ static void fgy_32x32xn_neon(pixel *const dst_row, const pixel *const src_row, } } -#define fguv_ss_fn(nm, sx, sy) \ +// Use ptrdiff_t instead of int for the last few parameters, to get the +// parameters on the stack with the same layout across platforms. +#define FGUV(nm, sx, sy) \ +void BF(dav1d_fguv_32x32_##nm, neon)(pixel *const dst, \ + const pixel *const src, \ + const ptrdiff_t stride, \ + const uint8_t scaling[SCALING_SIZE], \ + const Dav1dFilmGrainData *const data, \ + const entry grain_lut[][GRAIN_WIDTH], \ + const pixel *const luma_row, \ + const ptrdiff_t luma_stride, \ + const int offsets[][2], \ + const ptrdiff_t h, const ptrdiff_t uv, \ + const ptrdiff_t is_id, \ + const ptrdiff_t type \ + HIGHBD_DECL_SUFFIX); \ static void \ fguv_32x32xn_##nm##_neon(pixel *const dst_row, const pixel *const src_row, \ const ptrdiff_t stride, const Dav1dFilmGrainData *const data, \ @@ -197,11 +183,11 @@ fguv_32x32xn_##nm##_neon(pixel *const dst_row, const pixel *const src_row, \ } \ } -fguv_ss_fn(420, 1, 1); -fguv_ss_fn(422, 1, 0); -fguv_ss_fn(444, 0, 0); +FGUV(420, 1, 1); +FGUV(422, 1, 0); +FGUV(444, 0, 0); -COLD void bitfn(dav1d_film_grain_dsp_init_arm)(Dav1dFilmGrainDSPContext *const c) { +static ALWAYS_INLINE void film_grain_dsp_init_arm(Dav1dFilmGrainDSPContext *const c) { const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; diff --git a/src/arm/ipred_init_tmpl.c b/src/arm/ipred.h index 463481f..aef4dae 100644 --- a/src/arm/ipred_init_tmpl.c +++ b/src/arm/ipred.h @@ -50,7 +50,7 @@ decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_444, neon)); decl_pal_pred_fn(BF(dav1d_pal_pred, neon)); -COLD void bitfn(dav1d_intra_pred_dsp_init_arm)(Dav1dIntraPredDSPContext *const c) { +static ALWAYS_INLINE void intra_pred_dsp_init_arm(Dav1dIntraPredDSPContext *const c) { const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; diff --git a/src/arm/itx_init_tmpl.c b/src/arm/itx.h index d089a6f..2ecd086 100644 --- a/src/arm/itx_init_tmpl.c +++ b/src/arm/itx.h @@ -77,7 +77,7 @@ decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, neon)); decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, neon)); decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, neon)); -COLD void bitfn(dav1d_itx_dsp_init_arm)(Dav1dInvTxfmDSPContext *const c, int bpc) { +static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int bpc) { #define assign_itx_fn(pfx, w, h, type, type_enum, ext) \ c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \ BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext) @@ -117,7 +117,7 @@ COLD void bitfn(dav1d_itx_dsp_init_arm)(Dav1dInvTxfmDSPContext *const c, int bpc if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; - if (bpc > 10) return; + if (BITDEPTH == 16 && bpc != 10) return; assign_itx17_fn( , 4, 4, neon); assign_itx16_fn(R, 4, 8, neon); diff --git a/src/arm/loopfilter_init_tmpl.c b/src/arm/loopfilter.h index 671545d..9ac08d9 100644 --- a/src/arm/loopfilter_init_tmpl.c +++ b/src/arm/loopfilter.h @@ -33,7 +33,7 @@ decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, neon)); decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, neon)); decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, neon)); -COLD void bitfn(dav1d_loop_filter_dsp_init_arm)(Dav1dLoopFilterDSPContext *const c) { +static ALWAYS_INLINE void loop_filter_dsp_init_arm(Dav1dLoopFilterDSPContext *const c) { const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; diff --git a/src/arm/looprestoration_init_tmpl.c b/src/arm/looprestoration.h index 5ba4bce..7993dbf 100644 --- a/src/arm/looprestoration_init_tmpl.c +++ b/src/arm/looprestoration.h @@ -246,7 +246,7 @@ static void sgr_filter_mix_neon(pixel *const dst, const ptrdiff_t stride, tmp1, tmp2, w, h, wt HIGHBD_TAIL_SUFFIX); } -COLD void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *const c, int bpc) { +static ALWAYS_INLINE void loop_restoration_dsp_init_arm(Dav1dLoopRestorationDSPContext *const c, int bpc) { const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; @@ -257,7 +257,7 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPCont #else c->wiener[0] = c->wiener[1] = wiener_filter_neon; #endif - if (bpc <= 10) { + if (BITDEPTH == 8 || bpc == 10) { c->sgr[0] = sgr_filter_5x5_neon; c->sgr[1] = sgr_filter_3x3_neon; c->sgr[2] = sgr_filter_mix_neon; diff --git a/src/arm/mc_init_tmpl.c b/src/arm/mc.h index 3423020..06cd533 100644 --- a/src/arm/mc_init_tmpl.c +++ b/src/arm/mc.h @@ -68,7 +68,7 @@ decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, neon)); decl_emu_edge_fn(BF(dav1d_emu_edge, neon)); -void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) { +static ALWAYS_INLINE void mc_dsp_init_arm(Dav1dMCDSPContext *const c) { #define init_mc_fn(type, name, suffix) \ c->mc[type] = BF(dav1d_put_##name, suffix) #define init_mct_fn(type, name, suffix) \ diff --git a/src/arm/refmvs_init.c b/src/arm/refmvs.h index acde030..4c96fc5 100644 --- a/src/arm/refmvs_init.c +++ b/src/arm/refmvs.h @@ -30,7 +30,7 @@ decl_splat_mv_fn(dav1d_splat_mv_neon); -COLD void dav1d_refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *const c) { +static ALWAYS_INLINE void refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *const c) { const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; @@ -67,8 +67,5 @@ typedef struct Dav1dCdefDSPContext { } Dav1dCdefDSPContext; bitfn_decls(void dav1d_cdef_dsp_init, Dav1dCdefDSPContext *c); -bitfn_decls(void dav1d_cdef_dsp_init_arm, Dav1dCdefDSPContext *c); -bitfn_decls(void dav1d_cdef_dsp_init_ppc, Dav1dCdefDSPContext *c); -bitfn_decls(void dav1d_cdef_dsp_init_x86, Dav1dCdefDSPContext *c); #endif /* DAV1D_SRC_CDEF_H */ diff --git a/src/cdef_tmpl.c b/src/cdef_tmpl.c index 1c95dbf..5943945 100644 --- a/src/cdef_tmpl.c +++ b/src/cdef_tmpl.c @@ -303,6 +303,16 @@ static int cdef_find_dir_c(const pixel *img, const ptrdiff_t stride, return best_dir; } +#if HAVE_ASM +#if ARCH_AARCH64 || ARCH_ARM +#include "src/arm/cdef.h" +#elif ARCH_PPC64LE +#include "src/ppc/cdef.h" +#elif ARCH_X86 +#include "src/x86/cdef.h" +#endif +#endif + COLD void bitfn(dav1d_cdef_dsp_init)(Dav1dCdefDSPContext *const c) { c->dir = cdef_find_dir_c; c->fb[0] = cdef_filter_block_8x8_c; @@ -311,11 +321,11 @@ COLD void bitfn(dav1d_cdef_dsp_init)(Dav1dCdefDSPContext *const c) { #if HAVE_ASM #if ARCH_AARCH64 || ARCH_ARM - bitfn(dav1d_cdef_dsp_init_arm)(c); + cdef_dsp_init_arm(c); #elif ARCH_PPC64LE - bitfn(dav1d_cdef_dsp_init_ppc)(c); + cdef_dsp_init_ppc(c); #elif ARCH_X86 - bitfn(dav1d_cdef_dsp_init_x86)(c); + cdef_dsp_init_x86(c); #endif #endif } @@ -1,6 +1,6 @@ /* - * Copyright © 2018, VideoLAN and dav1d authors - * Copyright © 2018, Two Orioles, LLC + * Copyright © 2018-2022, VideoLAN and dav1d authors + * Copyright © 2018-2022, Two Orioles, LLC * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -51,7 +51,52 @@ DAV1D_API void dav1d_set_cpu_flags_mask(unsigned mask); int dav1d_num_logical_processors(Dav1dContext *c); static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) { - return dav1d_cpu_flags & dav1d_cpu_flags_mask; + unsigned flags = dav1d_cpu_flags & dav1d_cpu_flags_mask; + +#if TRIM_DSP_FUNCTIONS +/* Since this function is inlined, unconditionally setting a flag here will + * enable dead code elimination in the calling function. */ +#if ARCH_AARCH64 || ARCH_ARM +#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64 + flags |= DAV1D_ARM_CPU_FLAG_NEON; +#endif +#elif ARCH_PPC64LE +#if defined(__VSX__) + flags |= DAV1D_PPC_CPU_FLAG_VSX; +#endif +#elif ARCH_X86 +#if defined(__AVX512F__) && defined(__AVX512CD__) && \ + defined(__AVX512BW__) && defined(__AVX512DQ__) && \ + defined(__AVX512VL__) && defined(__AVX512VNNI__) && \ + defined(__AVX512IFMA__) && defined(__AVX512VBMI__) && \ + defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__) && \ + defined(__AVX512BITALG__) && defined(__GFNI__) && \ + defined(__VAES__) && defined(__VPCLMULQDQ__) + flags |= DAV1D_X86_CPU_FLAG_AVX512ICL | + DAV1D_X86_CPU_FLAG_AVX2 | + DAV1D_X86_CPU_FLAG_SSE41 | + DAV1D_X86_CPU_FLAG_SSSE3 | + DAV1D_X86_CPU_FLAG_SSE2; +#elif defined(__AVX2__) + flags |= DAV1D_X86_CPU_FLAG_AVX2 | + DAV1D_X86_CPU_FLAG_SSE41 | + DAV1D_X86_CPU_FLAG_SSSE3 | + DAV1D_X86_CPU_FLAG_SSE2; +#elif defined(__SSE4_1__) || defined(__AVX__) + flags |= DAV1D_X86_CPU_FLAG_SSE41 | + DAV1D_X86_CPU_FLAG_SSSE3 | + DAV1D_X86_CPU_FLAG_SSE2; +#elif defined(__SSSE3__) + flags |= DAV1D_X86_CPU_FLAG_SSSE3 | + DAV1D_X86_CPU_FLAG_SSE2; +#elif ARCH_X86_64 || defined(__SSE2__) || \ + (defined(_M_IX86_FP) && _M_IX86_FP >= 2) + flags |= DAV1D_X86_CPU_FLAG_SSE2; +#endif +#endif +#endif + + return flags; } #endif /* DAV1D_SRC_CPU_H */ diff --git a/src/filmgrain.h b/src/filmgrain.h index d953542..0ffded6 100644 --- a/src/filmgrain.h +++ b/src/filmgrain.h @@ -80,7 +80,5 @@ typedef struct Dav1dFilmGrainDSPContext { } Dav1dFilmGrainDSPContext; bitfn_decls(void dav1d_film_grain_dsp_init, Dav1dFilmGrainDSPContext *c); -bitfn_decls(void dav1d_film_grain_dsp_init_arm, Dav1dFilmGrainDSPContext *c); -bitfn_decls(void dav1d_film_grain_dsp_init_x86, Dav1dFilmGrainDSPContext *c); #endif /* DAV1D_SRC_FILM_GRAIN_H */ diff --git a/src/filmgrain_tmpl.c b/src/filmgrain_tmpl.c index 883c5cb..b772614 100644 --- a/src/filmgrain_tmpl.c +++ b/src/filmgrain_tmpl.c @@ -412,6 +412,14 @@ fguv_ss_fn(420, 1, 1); fguv_ss_fn(422, 1, 0); fguv_ss_fn(444, 0, 0); +#if HAVE_ASM +#if ARCH_AARCH64 || ARCH_ARM +#include "src/arm/filmgrain.h" +#elif ARCH_X86 +#include "src/x86/filmgrain.h" +#endif +#endif + COLD void bitfn(dav1d_film_grain_dsp_init)(Dav1dFilmGrainDSPContext *const c) { c->generate_grain_y = generate_grain_y_c; c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = generate_grain_uv_420_c; @@ -425,9 +433,9 @@ COLD void bitfn(dav1d_film_grain_dsp_init)(Dav1dFilmGrainDSPContext *const c) { #if HAVE_ASM #if ARCH_AARCH64 || ARCH_ARM - bitfn(dav1d_film_grain_dsp_init_arm)(c); + film_grain_dsp_init_arm(c); #elif ARCH_X86 - bitfn(dav1d_film_grain_dsp_init_x86)(c); + film_grain_dsp_init_x86(c); #endif #endif } diff --git a/src/ipred.h b/src/ipred.h index 8664f3f..739ef1a 100644 --- a/src/ipred.h +++ b/src/ipred.h @@ -90,7 +90,5 @@ typedef struct Dav1dIntraPredDSPContext { } Dav1dIntraPredDSPContext; bitfn_decls(void dav1d_intra_pred_dsp_init, Dav1dIntraPredDSPContext *c); -bitfn_decls(void dav1d_intra_pred_dsp_init_arm, Dav1dIntraPredDSPContext *c); -bitfn_decls(void dav1d_intra_pred_dsp_init_x86, Dav1dIntraPredDSPContext *c); #endif /* DAV1D_SRC_IPRED_H */ diff --git a/src/ipred_tmpl.c b/src/ipred_tmpl.c index 50c7a3c..151d484 100644 --- a/src/ipred_tmpl.c +++ b/src/ipred_tmpl.c @@ -726,6 +726,14 @@ static void pal_pred_c(pixel *dst, const ptrdiff_t stride, } } +#if HAVE_ASM +#if ARCH_AARCH64 || ARCH_ARM +#include "src/arm/ipred.h" +#elif ARCH_X86 +#include "src/x86/ipred.h" +#endif +#endif + COLD void bitfn(dav1d_intra_pred_dsp_init)(Dav1dIntraPredDSPContext *const c) { c->intra_pred[DC_PRED ] = ipred_dc_c; c->intra_pred[DC_128_PRED ] = ipred_dc_128_c; @@ -755,9 +763,9 @@ COLD void bitfn(dav1d_intra_pred_dsp_init)(Dav1dIntraPredDSPContext *const c) { #if HAVE_ASM #if ARCH_AARCH64 || ARCH_ARM - bitfn(dav1d_intra_pred_dsp_init_arm)(c); + intra_pred_dsp_init_arm(c); #elif ARCH_X86 - bitfn(dav1d_intra_pred_dsp_init_x86)(c); + intra_pred_dsp_init_x86(c); #endif #endif } @@ -44,7 +44,5 @@ typedef struct Dav1dInvTxfmDSPContext { } Dav1dInvTxfmDSPContext; bitfn_decls(void dav1d_itx_dsp_init, Dav1dInvTxfmDSPContext *c, int bpc); -bitfn_decls(void dav1d_itx_dsp_init_arm, Dav1dInvTxfmDSPContext *c, int bpc); -bitfn_decls(void dav1d_itx_dsp_init_x86, Dav1dInvTxfmDSPContext *c, int bpc); #endif /* DAV1D_SRC_ITX_H */ diff --git a/src/itx_tmpl.c b/src/itx_tmpl.c index 2f97a9c..d385989 100644 --- a/src/itx_tmpl.c +++ b/src/itx_tmpl.c @@ -180,6 +180,14 @@ static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride, dst[x] = iclip_pixel(dst[x] + *c++); } +#if HAVE_ASM +#if ARCH_AARCH64 || ARCH_ARM +#include "src/arm/itx.h" +#elif ARCH_X86 +#include "src/x86/itx.h" +#endif +#endif + COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) { #define assign_itx_all_fn64(w, h, pfx) \ c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT ] = \ @@ -247,10 +255,10 @@ COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) { #if HAVE_ASM #if ARCH_AARCH64 || ARCH_ARM - bitfn(dav1d_itx_dsp_init_arm)(c, bpc); + itx_dsp_init_arm(c, bpc); #endif #if ARCH_X86 - bitfn(dav1d_itx_dsp_init_x86)(c, bpc); + itx_dsp_init_x86(c, bpc); #endif #endif } diff --git a/src/loopfilter.h b/src/loopfilter.h index c159050..a0f78c9 100644 --- a/src/loopfilter.h +++ b/src/loopfilter.h @@ -53,7 +53,5 @@ typedef struct Dav1dLoopFilterDSPContext { } Dav1dLoopFilterDSPContext; bitfn_decls(void dav1d_loop_filter_dsp_init, Dav1dLoopFilterDSPContext *c); -bitfn_decls(void dav1d_loop_filter_dsp_init_arm, Dav1dLoopFilterDSPContext *c); -bitfn_decls(void dav1d_loop_filter_dsp_init_x86, Dav1dLoopFilterDSPContext *c); #endif /* DAV1D_SRC_LOOPFILTER_H */ diff --git a/src/loopfilter_tmpl.c b/src/loopfilter_tmpl.c index 6ea744f..cacf258 100644 --- a/src/loopfilter_tmpl.c +++ b/src/loopfilter_tmpl.c @@ -244,6 +244,14 @@ static void loop_filter_v_sb128uv_c(pixel *dst, const ptrdiff_t stride, } } +#if HAVE_ASM +#if ARCH_AARCH64 || ARCH_ARM +#include "src/arm/loopfilter.h" +#elif ARCH_X86 +#include "src/x86/loopfilter.h" +#endif +#endif + COLD void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) { c->loop_filter_sb[0][0] = loop_filter_h_sb128y_c; c->loop_filter_sb[0][1] = loop_filter_v_sb128y_c; @@ -252,9 +260,9 @@ COLD void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) #if HAVE_ASM #if ARCH_AARCH64 || ARCH_ARM - bitfn(dav1d_loop_filter_dsp_init_arm)(c); + loop_filter_dsp_init_arm(c); #elif ARCH_X86 - bitfn(dav1d_loop_filter_dsp_init_x86)(c); + loop_filter_dsp_init_x86(c); #endif #endif } diff --git a/src/looprestoration.h b/src/looprestoration.h index d0ab811..f55dd31 100644 --- a/src/looprestoration.h +++ b/src/looprestoration.h @@ -75,8 +75,5 @@ typedef struct Dav1dLoopRestorationDSPContext { } Dav1dLoopRestorationDSPContext; bitfn_decls(void dav1d_loop_restoration_dsp_init, Dav1dLoopRestorationDSPContext *c, int bpc); -bitfn_decls(void dav1d_loop_restoration_dsp_init_arm, Dav1dLoopRestorationDSPContext *c, int bpc); -bitfn_decls(void dav1d_loop_restoration_dsp_init_x86, Dav1dLoopRestorationDSPContext *c, int bpc); -bitfn_decls(void dav1d_loop_restoration_dsp_init_ppc, Dav1dLoopRestorationDSPContext *c, int bpc); #endif /* DAV1D_SRC_LOOPRESTORATION_H */ diff --git a/src/looprestoration_tmpl.c b/src/looprestoration_tmpl.c index 254c25d..d4d7867 100644 --- a/src/looprestoration_tmpl.c +++ b/src/looprestoration_tmpl.c @@ -524,6 +524,16 @@ static void sgr_mix_c(pixel *p, const ptrdiff_t stride, } } +#if HAVE_ASM +#if ARCH_AARCH64 || ARCH_ARM +#include "src/arm/looprestoration.h" +#elif ARCH_PPC64LE +#include "src/ppc/looprestoration.h" +#elif ARCH_X86 +#include "src/x86/looprestoration.h" +#endif +#endif + COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c, const int bpc) { @@ -534,11 +544,11 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext #if HAVE_ASM #if ARCH_AARCH64 || ARCH_ARM - bitfn(dav1d_loop_restoration_dsp_init_arm)(c, bpc); + loop_restoration_dsp_init_arm(c, bpc); #elif ARCH_PPC64LE - bitfn(dav1d_loop_restoration_dsp_init_ppc)(c, bpc); + loop_restoration_dsp_init_ppc(c, bpc); #elif ARCH_X86 - bitfn(dav1d_loop_restoration_dsp_init_x86)(c, bpc); + loop_restoration_dsp_init_x86(c, bpc); #endif #endif } @@ -132,7 +132,5 @@ typedef struct Dav1dMCDSPContext { } Dav1dMCDSPContext; bitfn_decls(void dav1d_mc_dsp_init, Dav1dMCDSPContext *c); -bitfn_decls(void dav1d_mc_dsp_init_arm, Dav1dMCDSPContext *c); -bitfn_decls(void dav1d_mc_dsp_init_x86, Dav1dMCDSPContext *c); #endif /* DAV1D_SRC_MC_H */ diff --git a/src/mc_tmpl.c b/src/mc_tmpl.c index f8d3e3b..20226d8 100644 --- a/src/mc_tmpl.c +++ b/src/mc_tmpl.c @@ -902,6 +902,14 @@ static void resize_c(pixel *dst, const ptrdiff_t dst_stride, } while (--h); } +#if HAVE_ASM +#if ARCH_AARCH64 || ARCH_ARM +#include "src/arm/mc.h" +#elif ARCH_X86 +#include "src/x86/mc.h" +#endif +#endif + COLD void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) { #define init_mc_fns(type, name) do { \ c->mc [type] = put_##name##_c; \ @@ -937,9 +945,9 @@ COLD void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) { #if HAVE_ASM #if ARCH_AARCH64 || ARCH_ARM - bitfn(dav1d_mc_dsp_init_arm)(c); + mc_dsp_init_arm(c); #elif ARCH_X86 - bitfn(dav1d_mc_dsp_init_x86)(c); + mc_dsp_init_x86(c); #endif #endif } diff --git a/src/meson.build b/src/meson.build index 5c41e37..d19b8d9 100644 --- a/src/meson.build +++ b/src/meson.build @@ -92,16 +92,6 @@ if is_asm_enabled libdav1d_sources += files( 'arm/cpu.c', - 'arm/refmvs_init.c', - ) - libdav1d_tmpl_sources += files( - 'arm/cdef_init_tmpl.c', - 'arm/filmgrain_init_tmpl.c', - 'arm/ipred_init_tmpl.c', - 'arm/itx_init_tmpl.c', - 'arm/loopfilter_init_tmpl.c', - 'arm/looprestoration_init_tmpl.c', - 'arm/mc_init_tmpl.c', ) if (host_machine.cpu_family() == 'aarch64' or host_machine.cpu() == 'arm64') @@ -177,18 +167,6 @@ if is_asm_enabled libdav1d_sources += files( 'x86/cpu.c', - 'x86/msac_init.c', - 'x86/refmvs_init.c', - ) - - libdav1d_tmpl_sources += files( - 'x86/cdef_init_tmpl.c', - 'x86/filmgrain_init_tmpl.c', - 'x86/ipred_init_tmpl.c', - 'x86/itx_init_tmpl.c', - 'x86/loopfilter_init_tmpl.c', - 'x86/looprestoration_init_tmpl.c', - 'x86/mc_init_tmpl.c', ) # NASM source files @@ -257,8 +235,8 @@ if is_asm_enabled 'ppc/cpu.c', ) libdav1d_arch_tmpl_sources += files( - 'ppc/cdef_init_tmpl.c', - 'ppc/looprestoration_init_tmpl.c', + 'ppc/cdef_tmpl.c', + 'ppc/looprestoration_tmpl.c', ) endif endif @@ -203,6 +203,6 @@ void dav1d_msac_init(MsacContext *const s, const uint8_t *const data, #if ARCH_X86_64 && HAVE_ASM s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt_c; - dav1d_msac_init_x86(s); + msac_init_x86(s); #endif } diff --git a/src/ppc/cdef.h b/src/ppc/cdef.h new file mode 100644 index 0000000..b794ba5 --- /dev/null +++ b/src/ppc/cdef.h @@ -0,0 +1,61 @@ +/* + * Copyright © 2019, Luca Barbato + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <stdlib.h> + +#include "common/bitdepth.h" +#include "common/intops.h" + +#include "src/cdef.h" +#include "src/cpu.h" + +#define cdef_vsx_fn(w, h) \ +void dav1d_cdef_filter_##w##x##h##_vsx(pixel *const dst, \ + const ptrdiff_t dst_stride, \ + const pixel (*left)[2], \ + const pixel *const top, \ + const pixel *const bottom, \ + const int pri_strength, \ + const int sec_strength, \ + const int dir, \ + const int damping, \ + const enum CdefEdgeFlags edges) + +cdef_vsx_fn(4, 4); +cdef_vsx_fn(4, 8); +cdef_vsx_fn(8, 8); + +static ALWAYS_INLINE void cdef_dsp_init_ppc(Dav1dCdefDSPContext *const c) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (!(flags & DAV1D_PPC_CPU_FLAG_VSX)) return; + +#if BITDEPTH == 8 + c->fb[0] = dav1d_cdef_filter_8x8_vsx; + c->fb[1] = dav1d_cdef_filter_4x8_vsx; + c->fb[2] = dav1d_cdef_filter_4x4_vsx; +#endif +} diff --git a/src/ppc/cdef_init_tmpl.c b/src/ppc/cdef_tmpl.c index 12e4a66..020e17b 100644 --- a/src/ppc/cdef_init_tmpl.c +++ b/src/ppc/cdef_tmpl.c @@ -24,15 +24,8 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include <stdlib.h> - -#include "common/bitdepth.h" -#include "common/intops.h" - -#include "src/cdef.h" -#include "src/cpu.h" - #include "src/ppc/dav1d_types.h" +#include "src/ppc/cdef.h" #if BITDEPTH == 8 static inline i16x8 vconstrain(const i16x8 diff, const int16_t threshold, @@ -451,18 +444,17 @@ filter_8xN(pixel *dst, const ptrdiff_t dst_stride, } - #define cdef_fn(w, h, tmp_stride) \ -static void cdef_filter_##w##x##h##_vsx(pixel *const dst, \ - const ptrdiff_t dst_stride, \ - const pixel (*left)[2], \ - const pixel *const top, \ - const pixel *const bottom, \ - const int pri_strength, \ - const int sec_strength, \ - const int dir, \ - const int damping, \ - const enum CdefEdgeFlags edges) \ +void dav1d_cdef_filter_##w##x##h##_vsx(pixel *const dst, \ + const ptrdiff_t dst_stride, \ + const pixel (*left)[2], \ + const pixel *const top, \ + const pixel *const bottom, \ + const int pri_strength, \ + const int sec_strength, \ + const int dir, \ + const int damping, \ + const enum CdefEdgeFlags edges) \ { \ ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride,); \ uint16_t *tmp = tmp_buf + 2 * tmp_stride + 2; \ @@ -474,16 +466,3 @@ cdef_fn(4, 4, 8); cdef_fn(4, 8, 8); cdef_fn(8, 8, 16); #endif - -COLD void bitfn(dav1d_cdef_dsp_init_ppc)(Dav1dCdefDSPContext *const c) { - const unsigned flags = dav1d_get_cpu_flags(); - - if (!(flags & DAV1D_PPC_CPU_FLAG_VSX)) return; - -#if BITDEPTH == 8 - // c->dir = dav1d_cdef_find_dir_vsx; - c->fb[0] = cdef_filter_8x8_vsx; - c->fb[1] = cdef_filter_4x8_vsx; - c->fb[2] = cdef_filter_4x4_vsx; -#endif -} diff --git a/src/x86/msac_init.c b/src/ppc/looprestoration.h index a634da2..3fe1631 100644 --- a/src/x86/msac_init.c +++ b/src/ppc/looprestoration.h @@ -1,5 +1,6 @@ /* - * Copyright © 2020, VideoLAN and dav1d authors + * Copyright © 2019, VideoLAN and dav1d authors + * Copyright © 2019, Michail Alvanos * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -24,20 +25,24 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include "common/intops.h" + #include "src/cpu.h" -#include "src/msac.h" -#include "src/x86/msac.h" +#include "src/looprestoration.h" + +void dav1d_wiener_filter_vsx(uint8_t *p, const ptrdiff_t stride, + const uint8_t (*const left)[4], + const uint8_t *lpf, + const int w, const int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX); -#if ARCH_X86_64 -void dav1d_msac_init_x86(MsacContext *const s) { +static ALWAYS_INLINE void loop_restoration_dsp_init_ppc(Dav1dLoopRestorationDSPContext *const c, const int bpc) { const unsigned flags = dav1d_get_cpu_flags(); - if (flags & DAV1D_X86_CPU_FLAG_SSE2) { - s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_sse2; - } + if (!(flags & DAV1D_PPC_CPU_FLAG_VSX)) return; - if (flags & DAV1D_X86_CPU_FLAG_AVX2) { - s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_avx2; - } -} +#if BITDEPTH == 8 + c->wiener[0] = c->wiener[1] = dav1d_wiener_filter_vsx; #endif +} diff --git a/src/ppc/looprestoration_init_tmpl.c b/src/ppc/looprestoration_tmpl.c index e9bc622..f64a963 100644 --- a/src/ppc/looprestoration_init_tmpl.c +++ b/src/ppc/looprestoration_tmpl.c @@ -25,10 +25,8 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "common/intops.h" #include "src/ppc/dav1d_types.h" -#include "src/cpu.h" -#include "src/looprestoration.h" +#include "src/ppc/looprestoration.h" #if BITDEPTH == 8 @@ -302,12 +300,12 @@ static inline void padding(uint8_t *dst, const uint8_t *p, // (since first and last tops are always 0 for chroma) // FIXME Could implement a version that requires less temporary memory // (should be possible to implement with only 6 rows of temp storage) -static void wiener_filter_vsx(uint8_t *p, const ptrdiff_t stride, - const uint8_t (*const left)[4], - const uint8_t *lpf, - const int w, const int h, - const LooprestorationParams *const params, - const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) +void dav1d_wiener_filter_vsx(uint8_t *p, const ptrdiff_t stride, + const uint8_t (*const left)[4], + const uint8_t *lpf, + const int w, const int h, + const LooprestorationParams *const params, + const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) { const int16_t (*const filter)[8] = params->filter; @@ -321,17 +319,3 @@ static void wiener_filter_vsx(uint8_t *p, const ptrdiff_t stride, wiener_filter_v_vsx(p, stride, hor, filter[1], w, h); } #endif - -COLD void bitfn(dav1d_loop_restoration_dsp_init_ppc)(Dav1dLoopRestorationDSPContext *const c, - const int bpc) -{ - const unsigned flags = dav1d_get_cpu_flags(); - - if (!(flags & DAV1D_PPC_CPU_FLAG_VSX)) return; - -#if BITDEPTH == 8 - c->wiener[0] = c->wiener[1] = wiener_filter_vsx; -#endif -} - - diff --git a/src/refmvs.c b/src/refmvs.c index d49ebae..c7ed9db 100644 --- a/src/refmvs.c +++ b/src/refmvs.c @@ -922,15 +922,23 @@ static void splat_mv_c(refmvs_block **rr, const refmvs_block *const rmv, } while (--bh4); } +#if HAVE_ASM +#if ARCH_AARCH64 || ARCH_ARM +#include "src/arm/refmvs.h" +#elif ARCH_X86 +#include "src/x86/refmvs.h" +#endif +#endif + COLD void dav1d_refmvs_dsp_init(Dav1dRefmvsDSPContext *const c) { c->splat_mv = splat_mv_c; #if HAVE_ASM #if ARCH_AARCH64 || ARCH_ARM - dav1d_refmvs_dsp_init_arm(c); + refmvs_dsp_init_arm(c); #elif ARCH_X86 - dav1d_refmvs_dsp_init_x86(c); + refmvs_dsp_init_x86(c); #endif #endif } diff --git a/src/x86/cdef_init_tmpl.c b/src/x86/cdef.h index 441dfe8..553d650 100644 --- a/src/x86/cdef_init_tmpl.c +++ b/src/x86/cdef.h @@ -43,7 +43,7 @@ decl_cdef_dir_fn(BF(dav1d_cdef_dir, avx2)); decl_cdef_dir_fn(BF(dav1d_cdef_dir, sse4)); decl_cdef_dir_fn(BF(dav1d_cdef_dir, ssse3)); -COLD void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) { +static ALWAYS_INLINE void cdef_dsp_init_x86(Dav1dCdefDSPContext *const c) { const unsigned flags = dav1d_get_cpu_flags(); #if BITDEPTH == 8 diff --git a/src/x86/filmgrain_init_tmpl.c b/src/x86/filmgrain.h index 1c91d2a..eeaa328 100644 --- a/src/x86/filmgrain_init_tmpl.c +++ b/src/x86/filmgrain.h @@ -42,7 +42,7 @@ decl_fg_fns(ssse3); decl_fg_fns(avx2); decl_fg_fns(avx512icl); -COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c) { +static ALWAYS_INLINE void film_grain_dsp_init_x86(Dav1dFilmGrainDSPContext *const c) { const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; diff --git a/src/x86/ipred_init_tmpl.c b/src/x86/ipred.h index 0ba0a41..7df563f 100644 --- a/src/x86/ipred_init_tmpl.c +++ b/src/x86/ipred.h @@ -68,7 +68,7 @@ decl_fn(cfl_ac, ipred_cfl_ac_444); decl_fn(pal_pred, pal_pred); -COLD void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) { +static ALWAYS_INLINE void intra_pred_dsp_init_x86(Dav1dIntraPredDSPContext *const c) { const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; diff --git a/src/x86/itx_init_tmpl.c b/src/x86/itx.h index d643592..95c1e87 100644 --- a/src/x86/itx_init_tmpl.c +++ b/src/x86/itx.h @@ -134,9 +134,7 @@ decl_itx_fns(ssse3); decl_itx_fn(dav1d_inv_txfm_add_wht_wht_4x4_16bpc_avx2); decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_4x4, sse2)); -COLD void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c, - const int bpc) -{ +static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, const int bpc) { #define assign_itx_fn(pfx, w, h, type, type_enum, ext) \ c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \ BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext) @@ -237,7 +235,7 @@ COLD void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c, if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return; #if BITDEPTH == 16 - if (bpc <= 10) { + if (bpc == 10) { assign_itx16_fn(, 4, 4, sse4); assign_itx16_fn(R, 4, 8, sse4); assign_itx16_fn(R, 4, 16, sse4); @@ -264,21 +262,6 @@ COLD void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c, if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; assign_itx_fn(, 4, 4, wht_wht, WHT_WHT, avx2); -#if BITDEPTH == 16 - assign_itx16_bpc_fn( , 4, 4, 12, avx2); - assign_itx16_bpc_fn(R, 4, 8, 12, avx2); - assign_itx16_bpc_fn(R, 4, 16, 12, avx2); - assign_itx16_bpc_fn(R, 8, 4, 12, avx2); - assign_itx16_bpc_fn( , 8, 8, 12, avx2); - assign_itx16_bpc_fn(R, 8, 16, 12, avx2); - assign_itx2_bpc_fn (R, 8, 32, 12, avx2); - assign_itx16_bpc_fn(R, 16, 4, 12, avx2); - assign_itx16_bpc_fn(R, 16, 8, 12, avx2); - assign_itx12_bpc_fn( , 16, 16, 12, avx2); - assign_itx2_bpc_fn (R, 32, 8, 12, avx2); -#endif - - if (bpc > 10) return; #if BITDEPTH == 8 assign_itx16_fn( , 4, 4, avx2); @@ -300,26 +283,40 @@ COLD void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c, assign_itx1_fn (R, 64, 16, avx2); assign_itx1_fn (R, 64, 32, avx2); assign_itx1_fn ( , 64, 64, avx2); -#elif BITDEPTH == 16 - assign_itx16_bpc_fn( , 4, 4, 10, avx2); - assign_itx16_bpc_fn(R, 4, 8, 10, avx2); - assign_itx16_bpc_fn(R, 4, 16, 10, avx2); - assign_itx16_bpc_fn(R, 8, 4, 10, avx2); - assign_itx16_bpc_fn( , 8, 8, 10, avx2); - assign_itx16_bpc_fn(R, 8, 16, 10, avx2); - assign_itx2_bpc_fn (R, 8, 32, 10, avx2); - assign_itx16_bpc_fn(R, 16, 4, 10, avx2); - assign_itx16_bpc_fn(R, 16, 8, 10, avx2); - assign_itx12_bpc_fn( , 16, 16, 10, avx2); - assign_itx2_bpc_fn (R, 16, 32, 10, avx2); - assign_itx1_bpc_fn (R, 16, 64, 10, avx2); - assign_itx2_bpc_fn (R, 32, 8, 10, avx2); - assign_itx2_bpc_fn (R, 32, 16, 10, avx2); - assign_itx2_bpc_fn ( , 32, 32, 10, avx2); - assign_itx1_bpc_fn (R, 32, 64, 10, avx2); - assign_itx1_bpc_fn (R, 64, 16, 10, avx2); - assign_itx1_bpc_fn (R, 64, 32, 10, avx2); - assign_itx1_bpc_fn ( , 64, 64, 10, avx2); +#else + if (bpc == 10) { + assign_itx16_bpc_fn( , 4, 4, 10, avx2); + assign_itx16_bpc_fn(R, 4, 8, 10, avx2); + assign_itx16_bpc_fn(R, 4, 16, 10, avx2); + assign_itx16_bpc_fn(R, 8, 4, 10, avx2); + assign_itx16_bpc_fn( , 8, 8, 10, avx2); + assign_itx16_bpc_fn(R, 8, 16, 10, avx2); + assign_itx2_bpc_fn (R, 8, 32, 10, avx2); + assign_itx16_bpc_fn(R, 16, 4, 10, avx2); + assign_itx16_bpc_fn(R, 16, 8, 10, avx2); + assign_itx12_bpc_fn( , 16, 16, 10, avx2); + assign_itx2_bpc_fn (R, 16, 32, 10, avx2); + assign_itx1_bpc_fn (R, 16, 64, 10, avx2); + assign_itx2_bpc_fn (R, 32, 8, 10, avx2); + assign_itx2_bpc_fn (R, 32, 16, 10, avx2); + assign_itx2_bpc_fn ( , 32, 32, 10, avx2); + assign_itx1_bpc_fn (R, 32, 64, 10, avx2); + assign_itx1_bpc_fn (R, 64, 16, 10, avx2); + assign_itx1_bpc_fn (R, 64, 32, 10, avx2); + assign_itx1_bpc_fn ( , 64, 64, 10, avx2); + } else { + assign_itx16_bpc_fn( , 4, 4, 12, avx2); + assign_itx16_bpc_fn(R, 4, 8, 12, avx2); + assign_itx16_bpc_fn(R, 4, 16, 12, avx2); + assign_itx16_bpc_fn(R, 8, 4, 12, avx2); + assign_itx16_bpc_fn( , 8, 8, 12, avx2); + assign_itx16_bpc_fn(R, 8, 16, 12, avx2); + assign_itx2_bpc_fn (R, 8, 32, 12, avx2); + assign_itx16_bpc_fn(R, 16, 4, 12, avx2); + assign_itx16_bpc_fn(R, 16, 8, 12, avx2); + assign_itx12_bpc_fn( , 16, 16, 12, avx2); + assign_itx2_bpc_fn (R, 32, 8, 12, avx2); + } #endif if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; diff --git a/src/x86/loopfilter_init_tmpl.c b/src/x86/loopfilter.h index 1c085d9..33c842a 100644 --- a/src/x86/loopfilter_init_tmpl.c +++ b/src/x86/loopfilter.h @@ -38,7 +38,7 @@ decl_loopfilter_sb_fns(ssse3); decl_loopfilter_sb_fns(avx2); decl_loopfilter_sb_fns(avx512icl); -COLD void bitfn(dav1d_loop_filter_dsp_init_x86)(Dav1dLoopFilterDSPContext *const c) { +static ALWAYS_INLINE void loop_filter_dsp_init_x86(Dav1dLoopFilterDSPContext *const c) { const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; diff --git a/src/x86/looprestoration_init_tmpl.c b/src/x86/looprestoration.h index dfd36e6..de23be8 100644 --- a/src/x86/looprestoration_init_tmpl.c +++ b/src/x86/looprestoration.h @@ -47,9 +47,7 @@ decl_sgr_filter_fns(ssse3); decl_sgr_filter_fns(avx2); decl_sgr_filter_fns(avx512icl); -COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPContext *const c, - const int bpc) -{ +static ALWAYS_INLINE void loop_restoration_dsp_init_x86(Dav1dLoopRestorationDSPContext *const c, const int bpc) { const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return; @@ -61,7 +59,7 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPCont if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; c->wiener[0] = BF(dav1d_wiener_filter7, ssse3); c->wiener[1] = BF(dav1d_wiener_filter5, ssse3); - if (bpc <= 10) { + if (BITDEPTH == 8 || bpc == 10) { c->sgr[0] = BF(dav1d_sgr_filter_5x5, ssse3); c->sgr[1] = BF(dav1d_sgr_filter_3x3, ssse3); c->sgr[2] = BF(dav1d_sgr_filter_mix, ssse3); @@ -72,7 +70,7 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPCont c->wiener[0] = BF(dav1d_wiener_filter7, avx2); c->wiener[1] = BF(dav1d_wiener_filter5, avx2); - if (bpc <= 10) { + if (BITDEPTH == 8 || bpc == 10) { c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx2); c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx2); c->sgr[2] = BF(dav1d_sgr_filter_mix, avx2); @@ -87,7 +85,7 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPCont #else c->wiener[1] = BF(dav1d_wiener_filter5, avx512icl); #endif - if (bpc <= 10) { + if (BITDEPTH == 8 || bpc == 10) { c->sgr[0] = BF(dav1d_sgr_filter_5x5, avx512icl); c->sgr[1] = BF(dav1d_sgr_filter_3x3, avx512icl); c->sgr[2] = BF(dav1d_sgr_filter_mix, avx512icl); diff --git a/src/x86/mc_init_tmpl.c b/src/x86/mc.h index 57680ea..65c607e 100644 --- a/src/x86/mc_init_tmpl.c +++ b/src/x86/mc.h @@ -105,7 +105,7 @@ decl_fn(emu_edge, dav1d_emu_edge); decl_fn(resize, dav1d_resize); -COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) { +static ALWAYS_INLINE void mc_dsp_init_x86(Dav1dMCDSPContext *const c) { const unsigned flags = dav1d_get_cpu_flags(); if(!(flags & DAV1D_X86_CPU_FLAG_SSE2)) diff --git a/src/x86/msac.h b/src/x86/msac.h index e11cd08..0bb632f 100644 --- a/src/x86/msac.h +++ b/src/x86/msac.h @@ -28,21 +28,21 @@ #ifndef DAV1D_SRC_X86_MSAC_H #define DAV1D_SRC_X86_MSAC_H +#include "src/cpu.h" + unsigned dav1d_msac_decode_symbol_adapt4_sse2(MsacContext *s, uint16_t *cdf, size_t n_symbols); unsigned dav1d_msac_decode_symbol_adapt8_sse2(MsacContext *s, uint16_t *cdf, size_t n_symbols); unsigned dav1d_msac_decode_symbol_adapt16_sse2(MsacContext *s, uint16_t *cdf, size_t n_symbols); +unsigned dav1d_msac_decode_symbol_adapt16_avx2(MsacContext *s, uint16_t *cdf, + size_t n_symbols); unsigned dav1d_msac_decode_bool_adapt_sse2(MsacContext *s, uint16_t *cdf); unsigned dav1d_msac_decode_bool_equi_sse2(MsacContext *s); unsigned dav1d_msac_decode_bool_sse2(MsacContext *s, unsigned f); unsigned dav1d_msac_decode_hi_tok_sse2(MsacContext *s, uint16_t *cdf); -/* Needed for checkasm */ -unsigned dav1d_msac_decode_symbol_adapt16_avx2(MsacContext *s, uint16_t *cdf, - size_t n_symbols); - #if ARCH_X86_64 || defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2) #define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_sse2 #define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_sse2 @@ -55,10 +55,21 @@ unsigned dav1d_msac_decode_symbol_adapt16_avx2(MsacContext *s, uint16_t *cdf, #if ARCH_X86_64 #define dav1d_msac_decode_symbol_adapt16(ctx, cdf, symb) ((ctx)->symbol_adapt16(ctx, cdf, symb)) + +static ALWAYS_INLINE void msac_init_x86(MsacContext *const s) { + const unsigned flags = dav1d_get_cpu_flags(); + + if (flags & DAV1D_X86_CPU_FLAG_SSE2) { + s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_sse2; + } + + if (flags & DAV1D_X86_CPU_FLAG_AVX2) { + s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_avx2; + } +} + #elif defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2) #define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_sse2 #endif -void dav1d_msac_init_x86(MsacContext *const s); - #endif /* DAV1D_SRC_X86_MSAC_H */ diff --git a/src/x86/refmvs_init.c b/src/x86/refmvs.h index e3575ba..de4124c 100644 --- a/src/x86/refmvs_init.c +++ b/src/x86/refmvs.h @@ -32,7 +32,7 @@ decl_splat_mv_fn(dav1d_splat_mv_sse2); decl_splat_mv_fn(dav1d_splat_mv_avx2); decl_splat_mv_fn(dav1d_splat_mv_avx512icl); -COLD void dav1d_refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *const c) { +static ALWAYS_INLINE void refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *const c) { const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return; |